Diffstat (limited to 'contrib/arm-optimized-routines/math/aarch64/advsimd')
82 files changed, 8641 insertions, 0 deletions
diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/acos.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/acos.c new file mode 100644 index 000000000000..7873a07e6f56 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acos.c @@ -0,0 +1,122 @@ +/* + * Double-precision vector acos(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "v_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t poly[12]; + float64x2_t pi, pi_over_2; + uint64x2_t abs_mask; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ + .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), + V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), + V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), + V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), + V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), + V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, + .pi = V2 (0x1.921fb54442d18p+1), + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .abs_mask = V2 (0x7fffffffffffffff), +}; + +#define AllMask v_u64 (0xffffffffffffffff) +#define Oneu 0x3ff0000000000000 +#define Small 0x3e50000000000000 /* 2^-53. */ + +#if WANT_SIMD_EXCEPT +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (acos, x, y, special); +} +#endif + +/* Double-precision implementation of vector acos(x). + + For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-53 for correct + rounding. + If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following + approximation. + + For |x| in [Small, 0.5], use an order 11 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.18 ulps, + _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0 + want 0x1.0d54d1985c069p+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.52 ulps, + _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1 + want 0x1.edbbedf8a7d6cp-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + +#if WANT_SIMD_EXCEPT + /* A single comparison for One, Small and QNaN. */ + uint64x2_t special + = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)), + v_u64 (Oneu - Small)); + if (unlikely (v_any_u64 (special))) + return special_case (x, x, AllMask); +#endif + + uint64x2_t a_le_half = vcleq_f64 (ax, v_f64 (0.5)); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + float64x2_t z2 = vbslq_f64 (a_le_half, vmulq_f64 (x, x), + vfmaq_f64 (v_f64 (0.5), v_f64 (-0.5), ax)); + float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); + float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); + + /* Finalize polynomial: z + z * z2 * P(z2). 
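For reference, the scheme described in the comments above can be written out as a scalar sketch, which may make the two intervals and the final reconstruction easier to follow. poly_P and acos_ref are illustrative names only; the coefficients are the ones listed in .poly, evaluated here with Horner for brevity where the vector code uses an Estrin scheme.

#include <math.h>

static double
poly_P (double z2)
{
  /* Degree-11 minimax coefficients, copied from .poly above.  */
  static const double c[] = {
    0x1.555555555554ep-3,  0x1.3333333337233p-4,  0x1.6db6db67f6d9fp-5,
    0x1.f1c71fbd29fbbp-6,  0x1.6e8b264d467d6p-6,  0x1.1c5997c357e9dp-6,
    0x1.c86a22cd9389dp-7,  0x1.856073c22ebbep-7,  0x1.fd1151acb6bedp-8,
    0x1.087182f799c1dp-6,  -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6,
  };
  double p = c[11];
  for (int i = 10; i >= 0; i--)
    p = c[i] + z2 * p;
  return p;
}

static double
acos_ref (double x)
{
  double ax = fabs (x);
  double z2 = ax < 0.5 ? x * x : (1.0 - ax) / 2.0;    /* z2 as above.  */
  double z = ax < 0.5 ? ax : sqrt (z2);
  double q = z + z * z2 * poly_P (z2);                /* Q(|x|).  */
  if (ax < 0.5)
    return 0x1.921fb54442d18p+0 - copysign (q, x);    /* pi/2 - sign(x) * Q.  */
  return x > 0.0 ? 2.0 * q                            /* 2 Q(|x|).  */
		 : 0x1.921fb54442d18p+1 - 2.0 * q;    /* pi - 2 Q(|x|).  */
}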
*/ + p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */ + float64x2_t y = vbslq_f64 (d->abs_mask, p, x); + + uint64x2_t is_neg = vcltzq_f64 (x); + float64x2_t off = vreinterpretq_f64_u64 ( + vandq_u64 (is_neg, vreinterpretq_u64_f64 (d->pi))); + float64x2_t mul = vbslq_f64 (a_le_half, v_f64 (-1.0), v_f64 (2.0)); + float64x2_t add = vbslq_f64 (a_le_half, d->pi_over_2, off); + + return vfmaq_f64 (add, mul, y); +} + +TEST_SIG (V, D, 1, acos, -1.0, 1.0) +TEST_ULP (V_NAME_D1 (acos), 1.02) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acos), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000) +TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000) +TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000) +TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000) +TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/acosf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosf.c new file mode 100644 index 000000000000..e200f792c764 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosf.c @@ -0,0 +1,115 @@ +/* + * Single-precision vector acos(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "v_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t pi_over_2f, pif; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ + .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5), + V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) }, + .pi_over_2f = V4 (0x1.921fb6p+0f), + .pif = V4 (0x1.921fb6p+1f), +}; + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Small 0x32800000 /* 2^-26. */ + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (acosf, x, y, special); +} +#endif + +/* Single-precision implementation of vector acos(x). + + For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct + rounding. + If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following + approximation. + + For |x| in [Small, 0.5], use order 4 polynomial P such that the final + approximation of asin is an odd polynomial: + + acos(x) ~ pi/2 - (x + x^3 P(x^2)). + + The largest observed error in this region is 1.26 ulps, + _ZGVnN4v_acosf (0x1.843bfcp-2) got 0x1.2e934cp+0 want 0x1.2e934ap+0. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 1.32 ulps, + _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1 + want 0x1.feb32ep-1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask)); + +#if WANT_SIMD_EXCEPT + /* A single comparison for One, Small and QNaN. 
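The "single comparison" mentioned above relies on unsigned wrap-around: after subtracting Small, every lane that is either below Small or above One (including infinities and NaNs, whose absolute bit patterns compare high) falls outside [0, One - Small]. A scalar sketch of the same test, with asuint as an assumed helper and the constants taken from acosf.c above:

#include <stdint.h>
#include <string.h>

static inline uint32_t
asuint (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  return u;
}

static inline int
needs_fallback (float x)
{
  uint32_t ia = asuint (x) & 0x7fffffff;   /* AbsMask.  */
  /* True when ia < Small (the subtraction wraps to a huge value) or
     ia > One, i.e. |x| > 1, inf or NaN.  One compare covers all three.  */
  return (uint32_t) (ia - 0x32800000) > (0x3f800000 - 0x32800000);
}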
*/ + uint32x4_t special + = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small)); + if (unlikely (v_any_u32 (special))) + return special_case (x, x, v_u32 (0xffffffff)); +#endif + + float32x4_t ax = vreinterpretq_f32_u32 (ia); + uint32x4_t a_le_half = vcleq_u32 (ia, v_u32 (Half)); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + float32x4_t z2 = vbslq_f32 (a_le_half, vmulq_f32 (x, x), + vfmsq_n_f32 (v_f32 (0.5), ax, 0.5)); + float32x4_t z = vbslq_f32 (a_le_half, ax, vsqrtq_f32 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float32x4_t p = v_horner_4_f32 (z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f32 (z, vmulq_f32 (z, z2), p); + + /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */ + float32x4_t y = vbslq_f32 (v_u32 (AbsMask), p, x); + + uint32x4_t is_neg = vcltzq_f32 (x); + float32x4_t off = vreinterpretq_f32_u32 ( + vandq_u32 (vreinterpretq_u32_f32 (d->pif), is_neg)); + float32x4_t mul = vbslq_f32 (a_le_half, v_f32 (-1.0), v_f32 (2.0)); + float32x4_t add = vbslq_f32 (a_le_half, d->pi_over_2f, off); + + return vfmaq_f32 (add, mul, y); +} + +HALF_WIDTH_ALIAS_F1 (acos) + +TEST_SIG (V, F, 1, acos, -1.0, 1.0) +TEST_ULP (V_NAME_F1 (acos), 0.82) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acos), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000) +TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000) +TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000) +TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000) +TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000) +TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/acosh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosh.c new file mode 100644 index 000000000000..55d8ed5a421e --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acosh.c @@ -0,0 +1,65 @@ +/* + * Double-precision vector acosh(x) function. + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define WANT_V_LOG1P_K0_SHORTCUT 1 +#include "v_log1p_inline.h" + +const static struct data +{ + struct v_log1p_data log1p_consts; + uint64x2_t one, thresh; +} data = { + .log1p_consts = V_LOG1P_CONSTANTS_TABLE, + .one = V2 (0x3ff0000000000000), + .thresh = V2 (0x1ff0000000000000) /* asuint64(0x1p511) - asuint64(1). */ +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t special, + const struct v_log1p_data *d) +{ + return v_call_f64 (acosh, x, log1p_inline (y, d), special); +} + +/* Vector approximation for double-precision acosh, based on log1p. + The largest observed error is 3.02 ULP in the region where the + argument to log1p falls in the k=0 interval, i.e. x close to 1: + _ZGVnN2v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5 + want 0x1.f2d6d823bc9e2p-5. 
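The body below applies acosh(x) = log(x + sqrt(x^2 - 1)) rewritten as log1p((x - 1) + sqrt((x - 1)(x + 1))), so the argument of log1p vanishes together with x - 1 and the result stays accurate as x approaches 1. A scalar sketch with libm log1p standing in for the inlined vector helper:

#include <math.h>

static double
acosh_ref (double x)                   /* intended for 1 <= x < 0x1p511.  */
{
  double xm1 = x - 1.0;
  return log1p (xm1 + sqrt (xm1 * (x + 1.0)));
}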
*/ +VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t special + = vcgeq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (x), d->one), d->thresh); + float64x2_t special_arg = x; + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (special))) + x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x); +#endif + + float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0)); + float64x2_t y = vaddq_f64 (x, v_f64 (1.0)); + y = vmulq_f64 (y, xm1); + y = vsqrtq_f64 (y); + y = vaddq_f64 (xm1, y); + + if (unlikely (v_any_u64 (special))) + return special_case (special_arg, y, special, &d->log1p_consts); + return log1p_inline (y, &d->log1p_consts); +} + +TEST_SIG (V, D, 1, acosh, 1.0, 10.0) +TEST_ULP (V_NAME_D1 (acosh), 2.53) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000) +TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000) +TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000) +TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/acoshf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/acoshf.c new file mode 100644 index 000000000000..029d457cfa8a --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/acoshf.c @@ -0,0 +1,78 @@ +/* + * Single-precision vector acosh(x) function. + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_log1pf_inline.h" + +#define SquareLim 0x1p64 + +const static struct data +{ + struct v_log1pf_data log1pf_consts; + uint32x4_t one; +} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) }; + +#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */ + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint16x4_t special, + const struct v_log1pf_data *d) +{ + return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special)); +} + +/* Vector approximation for single-precision acosh, based on log1p. Maximum + error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it + is 3.00 ULP: + _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4 + want 0x1.ef0a7cp-4. + With exceptions disabled, we can compute u with a shorter dependency chain, + which gives maximum error of 3.22 ULP: + _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5 + want 0x1.fdcdd2p-5. */ + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acosh) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh); + +#if WANT_SIMD_EXCEPT + /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use + only xm1 to calculate u, as operating on x will trigger invalid for NaN. + Widening sign-extend special predicate in order to mask with it. 
*/ + uint32x4_t p + = vreinterpretq_u32_s32 (vmovl_s16 (vreinterpret_s16_u16 (special))); + float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p); + float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1); +#else + float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one)); + float32x4_t u + = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one))); +#endif + + float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u)); + + if (unlikely (v_any_u16h (special))) + return special_case (x, y, special, &d->log1pf_consts); + return log1pf_inline (y, &d->log1pf_consts); +} + +HALF_WIDTH_ALIAS_F1 (acosh) + +TEST_SIG (V, F, 1, acosh, 1.0, 10.0) +#if WANT_SIMD_EXCEPT +TEST_ULP (V_NAME_F1 (acosh), 2.50) +#else +TEST_ULP (V_NAME_F1 (acosh), 2.78) +#endif +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500) +TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000) +TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000) +TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/asin.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/asin.c new file mode 100644 index 000000000000..c751d9264a12 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asin.c @@ -0,0 +1,130 @@ +/* + * Double-precision vector asin(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t c0, c2, c4, c6, c8, c10; + float64x2_t pi_over_2; + uint64x2_t abs_mask; + double c1, c3, c5, c7, c9, c11; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ + .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, + .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, + .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, + .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, + .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff), +}; + +#define AllMask v_u64 (0xffffffffffffffff) +#define One 0x3ff0000000000000 +#define Small 0x3e50000000000000 /* 2^-12. */ + +#if WANT_SIMD_EXCEPT +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (asin, x, y, special); +} +#endif + +/* Double-precision implementation of vector asin(x). + + For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct + rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the + following approximation. + + For |x| in [Small, 0.5], use an order 11 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 1.01 ulps, + _ZGVnN2v_asin (0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2 + want 0x1.ed78525a927eep-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.69 ulps, + _ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1 + want 0x1.1111dd54ddf99p-1. 
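As with acos above, the two branches can be spelled out as a scalar sketch; poly_P is the same degree-11 polynomial used in the acos sketch earlier (the coefficient tables in acos.c and asin.c are identical), and asin_ref is an illustrative name only.

#include <math.h>

static double poly_P (double z2);   /* as defined in the acos sketch above.  */

static double
asin_ref (double x)
{
  double ax = fabs (x);
  double z2 = ax < 0.5 ? x * x : (1.0 - ax) / 2.0;
  double z = ax < 0.5 ? ax : sqrt (z2);
  double q = z + z * z2 * poly_P (z2);                      /* Q(|x|).  */
  double y = ax < 0.5 ? q : 0x1.921fb54442d18p+0 - 2.0 * q; /* pi/2 - 2 Q(|x|).  */
  return copysign (y, x);                                   /* asin is odd.  */
}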
*/ +float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t ax = vabsq_f64 (x); + +#if WANT_SIMD_EXCEPT + /* Special values need to be computed with scalar fallbacks so + that appropriate exceptions are raised. */ + uint64x2_t special + = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)), + v_u64 (One - Small)); + if (unlikely (v_any_u64 (special))) + return special_case (x, x, AllMask); +#endif + + uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5)); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + float64x2_t z2 = vbslq_f64 (a_lt_half, vmulq_f64 (x, x), + vfmsq_n_f64 (v_f64 (0.5), ax, 0.5)); + float64x2_t z = vbslq_f64 (a_lt_half, ax, vsqrtq_f64 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); + + /* order-11 estrin. */ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p07 = vfmaq_f64 (p03, z8, p47); + float64x2_t p = vfmaq_f64 (p07, z16, p811); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + float64x2_t y = vbslq_f64 (a_lt_half, p, vfmsq_n_f64 (d->pi_over_2, p, 2.0)); + + /* Copy sign. */ + return vbslq_f64 (d->abs_mask, y, x); +} + +TEST_SIG (V, D, 1, asin, -1.0, 1.0) +TEST_ULP (V_NAME_D1 (asin), 2.20) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asin), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000) +TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000) +TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000) +TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000) +TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/asinf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinf.c new file mode 100644 index 000000000000..970feb37e1d5 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinf.c @@ -0,0 +1,106 @@ +/* + * Single-precision vector asin(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "v_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t pi_over_2f; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . 
*/ + .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5), + V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) }, + .pi_over_2f = V4 (0x1.921fb6p+0f), +}; + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Small 0x39800000 /* 2^-12. */ + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (asinf, x, y, special); +} +#endif + +/* Single-precision implementation of vector asin(x). + + For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct + rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the + following approximation. + + For |x| in [Small, 0.5], use order 4 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 0.83 ulps, + _ZGVnN4v_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.41 ulps, + _ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask)); + +#if WANT_SIMD_EXCEPT + /* Special values need to be computed with scalar fallbacks so + that appropriate fp exceptions are raised. */ + uint32x4_t special + = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small)); + if (unlikely (v_any_u32 (special))) + return special_case (x, x, v_u32 (0xffffffff)); +#endif + + float32x4_t ax = vreinterpretq_f32_u32 (ia); + uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half)); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x), + vfmsq_n_f32 (v_f32 (0.5), ax, 0.5)); + float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float32x4_t p = v_horner_4_f32 (z2, d->poly); + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f32 (z, vmulq_f32 (z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + float32x4_t y + = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0)); + + /* Copy sign. */ + return vbslq_f32 (v_u32 (AbsMask), y, x); +} + +HALF_WIDTH_ALIAS_F1 (asin) + +TEST_SIG (V, F, 1, asin, -1.0, 1.0) +TEST_ULP (V_NAME_F1 (asin), 1.91) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asin), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000) +TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000) +TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000) +TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000) +TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000) +TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c new file mode 100644 index 000000000000..550302826bd9 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinh.c @@ -0,0 +1,242 @@ +/* + * Double-precision vector asinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "test_defs.h" +#include "test_sig.h" +#include "v_math.h" + +const static struct data +{ + uint64x2_t huge_bound, abs_mask, off, mask; +#if WANT_SIMD_EXCEPT + float64x2_t tiny_bound; +#endif + float64x2_t lc0, lc2; + double lc1, lc3, ln2, lc4; + + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17; + double c1, c3, c5, c7, c9, c11, c13, c15; + +} data = { + +#if WANT_SIMD_EXCEPT + .tiny_bound = V2 (0x1p-26), +#endif + /* Even terms of polynomial s.t. asinh(x) is approximated by + asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...). + Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */ + + .c0 = V2 (-0x1.55555555554a7p-3), + .c1 = 0x1.3333333326c7p-4, + .c2 = V2 (-0x1.6db6db68332e6p-5), + .c3 = 0x1.f1c71b26fb40dp-6, + .c4 = V2 (-0x1.6e8b8b654a621p-6), + .c5 = 0x1.1c4daa9e67871p-6, + .c6 = V2 (-0x1.c9871d10885afp-7), + .c7 = 0x1.7a16e8d9d2ecfp-7, + .c8 = V2 (-0x1.3ddca533e9f54p-7), + .c9 = 0x1.0becef748dafcp-7, + .c10 = V2 (-0x1.b90c7099dd397p-8), + .c11 = 0x1.541f2bb1ffe51p-8, + .c12 = V2 (-0x1.d217026a669ecp-9), + .c13 = 0x1.0b5c7977aaf7p-9, + .c14 = V2 (-0x1.e0f37daef9127p-11), + .c15 = 0x1.388b5fe542a6p-12, + .c16 = V2 (-0x1.021a48685e287p-14), + .c17 = V2 (0x1.93d4ba83d34dap-18), + + .lc0 = V2 (-0x1.ffffffffffff7p-2), + .lc1 = 0x1.55555555170d4p-2, + .lc2 = V2 (-0x1.0000000399c27p-2), + .lc3 = 0x1.999b2e90e94cap-3, + .lc4 = -0x1.554e550bd501ep-3, + .ln2 = 0x1.62e42fefa39efp-1, + + .off = V2 (0x3fe6900900000000), + .huge_bound = V2 (0x5fe0000000000000), + .abs_mask = V2 (0x7fffffffffffffff), + .mask = V2 (0xfffULL << 52), +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask, + uint64x2_t special) +{ + /* Copy sign. */ + y = vbslq_f64 (abs_mask, y, x); + return v_call_f64 (asinh, x, y, special); +} + +#define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t logc; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + /* Since N is a power of 2, n % N = n & (N - 1). */ + struct entry e; + uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; +} + +static inline float64x2_t +log_inline (float64x2_t xm, const struct data *d) +{ + + uint64x2_t u = vreinterpretq_u64_f64 (xm); + uint64x2_t u_off = vsubq_u64 (u, d->off); + + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (u_off); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2); + float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
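The decomposition implemented by log_inline can be checked in scalar form: split x = 2^k * z, pick a tabulated c close to z (the real table stores 1/c and log(c), indexed from the top mantissa bits), and rebuild log(x) = k*ln2 + log(c) + log1p(z/c - 1), where r = z/c - 1 is small enough for the short polynomial above. A sketch under those assumptions, with log_via_table as a hypothetical helper:

#include <math.h>

static double
log_via_table (double x, double c, double logc)
{
  /* c and logc = log(c) stand in for one table entry.  */
  int k;
  double z = frexp (x, &k);               /* x = z * 2^k, z in [0.5, 1);
					     the vector code centres z near 1
					     instead.  */
  double r = z / c - 1.0;                 /* small reduced argument.  */
  return k * 0x1.62e42fefa39efp-1 + logc + log1p (r);
}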
*/ + float64x2_t odd_coeffs = vld1q_f64 (&d->lc1); + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0); + y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1); + y = vfmaq_f64 (p, r2, y); + return vfmaq_f64 (hi, y, r2); +} + +/* Double-precision implementation of vector asinh(x). + asinh is very sensitive around 1, so it is impractical to devise a single + low-cost algorithm which is sufficiently accurate on a wide range of input. + Instead we use two different algorithms: + asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1 + = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise + where log(x) is an optimized log approximation, and P(x) is a polynomial + shared with the scalar routine. The greatest observed error 2.79 ULP, in + |x| >= 1: + _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1 + want 0x1.ffffd003219ddp-1. */ +VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t ax = vabsq_f64 (x); + + uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1)); + +#if WANT_SIMD_EXCEPT + uint64x2_t iax = vreinterpretq_u64_f64 (ax); + uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound)); + uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound); + special = vorrq_u64 (special, tiny); +#else + uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound)); +#endif + + /* Option 1: |x| >= 1. + Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). + If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will + overflow, by setting special lanes to 1. These will be fixed later. */ + float64x2_t option_1 = v_f64 (0); + if (likely (v_any_u64 (gt1))) + { +#if WANT_SIMD_EXCEPT + float64x2_t xm = v_zerofy_f64 (ax, special); +#else + float64x2_t xm = ax; +#endif + option_1 = log_inline ( + vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d); + } + + /* Option 2: |x| < 1. + Compute asinh(x) using a polynomial. + If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will + overflow, and tiny lanes, which will underflow, by setting them to 0. They + will be fixed later, either by selecting x or falling back to the scalar + special-case. The largest observed error in this region is 1.47 ULPs: + _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 + want 0x1.c1d6bf874019cp-1. */ + float64x2_t option_2 = v_f64 (0); + + if (likely (v_any_u64 (vceqzq_u64 (gt1)))) + { + +#if WANT_SIMD_EXCEPT + ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1)); +#endif + float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2); + /* Order-17 Pairwise Horner scheme. 
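The layout of c0..c17 above (even coefficients stored as vectors, odd ones packed in pairs) lets the code load the odd coefficients with vld1q_f64 and select them with vfmaq_laneq_f64. Stripped of the intrinsics, the evaluation order is an ordinary pairwise Horner scheme, sketched here with an assumed flat coefficient array:

static double
pairwise_horner (double w, const double *c, int npairs)
{
  /* Evaluates c[0] + c[1]*w + ... + c[2*npairs-1]*w^(2*npairs-1):
     form the pairs c[2i] + w*c[2i+1], then run Horner on them in w^2.  */
  double w2 = w * w;
  double p = c[2 * (npairs - 1)] + w * c[2 * (npairs - 1) + 1];
  for (int i = npairs - 2; i >= 0; i--)
    p = (c[2 * i] + w * c[2 * i + 1]) + w2 * p;
  return p;
}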
*/ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1); + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1); + float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17); + + float64x2_t p = vfmaq_f64 (p1415, z2, p1617); + p = vfmaq_f64 (p1213, z2, p); + p = vfmaq_f64 (p1011, z2, p); + p = vfmaq_f64 (p89, z2, p); + + p = vfmaq_f64 (p67, z2, p); + p = vfmaq_f64 (p45, z2, p); + + p = vfmaq_f64 (p23, z2, p); + + p = vfmaq_f64 (p01, z2, p); + option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2)); +#if WANT_SIMD_EXCEPT + option_2 = vbslq_f64 (tiny, x, option_2); +#endif + } + + /* Choose the right option for each lane. */ + float64x2_t y = vbslq_f64 (gt1, option_1, option_2); + if (unlikely (v_any_u64 (special))) + { + return special_case (x, y, d->abs_mask, special); + } + /* Copy sign. */ + return vbslq_f64 (d->abs_mask, y, x); +} + +TEST_SIG (V, D, 1, asinh, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (asinh), 2.29) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0, 0x1p-26, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p-26, 1, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 1, 0x1p511, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (asinh), 0x1p511, inf, 40000) +/* Test vector asinh 3 times, with control lane < 1, > 1 and special. + Ensures the v_sel is choosing the right option in all cases. */ +TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0.5) +TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 2) +TEST_CONTROL_VALUE (V_NAME_D1 (asinh), 0x1p600) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c new file mode 100644 index 000000000000..6a96f6ee9f4b --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/asinhf.c @@ -0,0 +1,89 @@ +/* + * Single-precision vector asinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_log1pf_inline.h" + +const static struct data +{ + struct v_log1pf_data log1pf_consts; + float32x4_t one; + uint32x4_t big_bound; +#if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound; +#endif +} data = { + .one = V4 (1), + .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, + .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */ +#if WANT_SIMD_EXCEPT + .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */ +#endif +}; + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, uint32x4_t sign, float32x4_t y, + uint32x4_t special, const struct data *d) +{ + return v_call_f32 ( + asinhf, x, + vreinterpretq_f32_u32 (veorq_u32 ( + sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))), + special); +} + +/* Single-precision implementation of vector asinh(x), using vector log1p. + Worst-case error is 2.59 ULP: + _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3 + want 0x1.d449c4p-3. 
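The positive-x rewrite quoted in the comment above follows from x + sqrt(x^2 + 1) = 1 + x + x^2 / (1 + sqrt(x^2 + 1)), which keeps the log1p argument proportional to x when x is small. A scalar sketch with libm log1pf in place of the inlined vector log1p:

#include <math.h>

static float
asinhf_ref (float x)
{
  float ax = fabsf (x);
  float d = 1.0f + sqrtf (ax * ax + 1.0f);
  float y = log1pf (ax + ax * ax / d);
  return copysignf (y, x);          /* asinh is odd.  */
}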
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asinh) (float32x4_t x) +{ + const struct data *dat = ptr_barrier (&data); + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t special = vcgeq_u32 (iax, dat->big_bound); + uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax); + float32x4_t special_arg = x; + +#if WANT_SIMD_EXCEPT + /* Sidestep tiny and large values to avoid inadvertently triggering + under/overflow. */ + special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound)); + if (unlikely (v_any_u32 (special))) + { + ax = v_zerofy_f32 (ax, special); + x = v_zerofy_f32 (x, special); + } +#endif + + /* asinh(x) = log(x + sqrt(x * x + 1)). + For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ + float32x4_t d + = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax))); + float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)); + + if (unlikely (v_any_u32 (special))) + return special_case (special_arg, sign, y, special, dat); + return vreinterpretq_f32_u32 (veorq_u32 ( + sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts)))); +} + +HALF_WIDTH_ALIAS_F1 (asinh) + +TEST_SIG (V, F, 1, asinh, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (asinh), 2.10) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000) +TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000) +TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000) +TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000) +TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000) +TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000) +TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000) +TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atan.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan.c new file mode 100644 index 000000000000..26d264321068 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan.c @@ -0,0 +1,135 @@ +/* + * Double-precision vector atan(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64x2_t pi_over_2; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ + .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3, + .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4, + .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4, + .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5, + .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5, + .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5, + .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6, + .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7, + .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10, + .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), +}; + +#define SignMask v_u64 (0x8000000000000000) +#define TinyBound 0x3e10000000000000 /* asuint64(0x1p-30). */ +#define BigBound 0x4340000000000000 /* asuint64(0x1p53). */ + +/* Fast implementation of vector atan. 
+ Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: + _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1719 = vld1q_f64 (&d->c17); + + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need + fenv. */ + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t sign = vandq_u64 (ix, SignMask); + +#if WANT_SIMD_EXCEPT + uint64x2_t ia12 = vandq_u64 (ix, v_u64 (0x7ff0000000000000)); + uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia12, v_u64 (TinyBound)), + v_u64 (BigBound - TinyBound)); + /* If any lane is special, fall back to the scalar routine for all lanes. */ + if (unlikely (v_any_u64 (special))) + return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1)); +#endif + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ + float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x); + float64x2_t shift = vreinterpretq_f64_u64 ( + vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2))); + /* Use absolute value only when needed (odd powers of z). */ + float64x2_t az = vbslq_f64 ( + SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z); + + /* Calculate the polynomial approximation. + Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. + The order 19 polynomial P approximates + (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + float64x2_t z2 = vmulq_f64 (z, z); + float64x2_t x2 = vmulq_f64 (z2, z2); + float64x2_t x4 = vmulq_f64 (x2, x2); + float64x2_t x8 = vmulq_f64 (x4, x4); + + /* estrin_7. */ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + + float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + + /* estrin_11. */ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); + float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); + float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + + float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); + float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + + float64x2_t y = vfmaq_f64 (p07, p819, x8); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); + y = vaddq_f64 (y, shift); + + /* y = atan(x) if x>0, -atan(-x) otherwise. 
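The reduction described at the top of this function means the polynomial only ever sees |z| <= 1; for larger inputs the identity atan(x) = sign(x)*pi/2 - atan(1/x) is used, e.g. atan(2) = pi/2 - atan(1/2). A scalar sketch of just the reduction, with libm atan standing in for the polynomial core:

#include <math.h>

static double
atan_via_reduction (double x)
{
  if (fabs (x) > 1.0)
    /* atan(x) = sign(x)*pi/2 - atan(1/x) for |x| > 1, so the polynomial
       only has to cover [-1, 1].  */
    return copysign (0x1.921fb54442d18p+0, x) - atan (1.0 / x);
  return atan (x);
}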
*/ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign)); + return y; +} + +TEST_SIG (V, D, 1, atan, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (atan), 1.78) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atan), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000) +TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000) +TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000) +TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000) +TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000) +TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c new file mode 100644 index 000000000000..18c4b70b92f6 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2.c @@ -0,0 +1,171 @@ +/* + * Double-precision vector atan2(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64x2_t pi_over_2; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + uint64x2_t zeroinfnan, minustwo; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ + .c0 = V2 (-0x1.5555555555555p-2), + .c1 = 0x1.99999999996c1p-3, + .c2 = V2 (-0x1.2492492478f88p-3), + .c3 = 0x1.c71c71bc3951cp-4, + .c4 = V2 (-0x1.745d160a7e368p-4), + .c5 = 0x1.3b139b6a88ba1p-4, + .c6 = V2 (-0x1.11100ee084227p-4), + .c7 = 0x1.e1d0f9696f63bp-5, + .c8 = V2 (-0x1.aebfe7b418581p-5), + .c9 = 0x1.842dbe9b0d916p-5, + .c10 = V2 (-0x1.5d30140ae5e99p-5), + .c11 = 0x1.338e31eb2fbbcp-5, + .c12 = V2 (-0x1.00e6eece7de8p-5), + .c13 = 0x1.860897b29e5efp-6, + .c14 = V2 (-0x1.0051381722a59p-6), + .c15 = 0x1.14e9dc19a4a4ep-7, + .c16 = V2 (-0x1.d0062b42fe3bfp-9), + .c17 = 0x1.17739e210171ap-10, + .c18 = V2 (-0x1.ab24da7be7402p-13), + .c19 = 0x1.358851160a528p-16, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), + .minustwo = V2 (0xc000000000000000), +}; + +#define SignMask v_u64 (0x8000000000000000) + +/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t y, float64x2_t x, float64x2_t ret, + uint64x2_t sign_xy, uint64x2_t cmp) +{ + /* Account for the sign of x and y. */ + ret = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); + return v_call2_f64 (atan2, y, x, ret, cmp); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline uint64x2_t +zeroinfnan (uint64x2_t i, const struct data *d) +{ + /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */ + return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan); +} + +/* Fast implementation of vector atan2. + Maximum observed error is 2.8 ulps: + _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) + got 0x1.92d628ab678ccp-1 + want 0x1.92d628ab678cfp-1. 
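The shift and sign_xy bookkeeping in the function below folds the usual quadrant corrections into branch-free selects; for finite, non-zero inputs the net effect matches the textbook definition, sketched here in scalar form (the 0/inf/NaN cases go to the scalar fallback instead):

#include <math.h>

static double
atan2_ref (double y, double x)
{
  if (x > 0.0)
    return atan (y / x);                              /* quadrants I and IV.  */
  if (x < 0.0)
    /* Fold quadrants II and III back in by +/- pi.  */
    return atan (y / x) + copysign (0x1.921fb54442d18p+1, y);
  return copysign (0x1.921fb54442d18p+0, y);          /* x == 0, y != 0.  */
}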
*/ +float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t iy = vreinterpretq_u64_f64 (y); + + uint64x2_t special_cases + = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d)); + + uint64x2_t sign_x = vandq_u64 (ix, SignMask); + uint64x2_t sign_y = vandq_u64 (iy, SignMask); + uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y); + + float64x2_t ax = vabsq_f64 (x); + float64x2_t ay = vabsq_f64 (y); + + uint64x2_t pred_xlt0 = vcltzq_f64 (x); + uint64x2_t pred_aygtax = vcagtq_f64 (y, x); + + /* Set up z for call to atan. */ + float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); + float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax); + float64x2_t z = vdivq_f64 (n, q); + + /* Work out the correct shift. */ + float64x2_t shift + = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo)); + shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); + shift = vmulq_f64 (shift, d->pi_over_2); + + /* Calculate the polynomial approximation. + Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. + The order 19 polynomial P approximates + (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + float64x2_t z2 = vmulq_f64 (z, z); + float64x2_t x2 = vmulq_f64 (z2, z2); + float64x2_t x4 = vmulq_f64 (x2, x2); + float64x2_t x8 = vmulq_f64 (x4, x4); + + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1719 = vld1q_f64 (&d->c17); + + /* estrin_7. */ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + + float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + + /* estrin_11. */ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); + float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); + float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + + float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); + float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + + float64x2_t ret = vfmaq_f64 (p07, p819, x8); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); + ret = vaddq_f64 (ret, shift); + + if (unlikely (v_any_u64 (special_cases))) + return special_case (y, x, ret, sign_xy, special_cases); + + /* Account for the sign of x and y. */ + ret = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); + + return ret; +} + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ +TEST_SIG (V, D, 2, atan2) +// TODO tighten this once __v_atan2 is fixed +TEST_ULP (V_NAME_D2 (atan2), 2.9) +TEST_DISABLE_FENV (V_NAME_D2 (atan2)) +TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000) +TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000) +TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000) +TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000) +TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2f.c new file mode 100644 index 000000000000..632014249ab0 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atan2f.c @@ -0,0 +1,127 @@ +/* + * Single-precision vector atan2(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t c0, pi_over_2, c4, c6, c2; + float c1, c3, c5, c7; + uint32x4_t comp_const; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. */ + .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f, + .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f, + .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f, + .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f, + .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1), +}; + +#define SignMask v_u32 (0x80000000) + +/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */ +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t y, float32x4_t x, float32x4_t ret, + uint32x4_t sign_xy, uint32x4_t cmp) +{ + /* Account for the sign of y. */ + ret = vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); + return v_call2_f32 (atan2f, y, x, ret, cmp); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline uint32x4_t +zeroinfnan (uint32x4_t i, const struct data *d) +{ + /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ + return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const); +} + +/* Fast implementation of vector atan2f. Maximum observed error is + 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: + _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t iy = vreinterpretq_u32_f32 (y); + + uint32x4_t special_cases + = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d)); + + uint32x4_t sign_x = vandq_u32 (ix, SignMask); + uint32x4_t sign_y = vandq_u32 (iy, SignMask); + uint32x4_t sign_xy = veorq_u32 (sign_x, sign_y); + + float32x4_t ax = vabsq_f32 (x); + float32x4_t ay = vabsq_f32 (y); + + uint32x4_t pred_xlt0 = vcltzq_f32 (x); + uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax); + + /* Set up z for call to atanf. */ + float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); + float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); + float32x4_t z = vdivq_f32 (n, q); + + /* Work out the correct shift. 
*/ + float32x4_t shift = vreinterpretq_f32_u32 ( + vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); + shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); + shift = vmulq_f32 (shift, d->pi_over_2); + + /* Calculate the polynomial approximation. + Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, + a standard implementation using z8 creates spurious underflow + in the very last fma (when z^8 is small enough). + Therefore, we split the last fma into a mul and an fma. + Horner and single-level Estrin have higher errors that exceed + threshold. */ + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z4 = vmulq_f32 (z2, z2); + + float32x4_t c1357 = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3); + float32x4_t p03 = vfmaq_f32 (p01, z4, p23); + float32x4_t p47 = vfmaq_f32 (p45, z4, p67); + + float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47)); + + /* y = shift + z * P(z^2). */ + ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); + + if (unlikely (v_any_u32 (special_cases))) + { + return special_case (y, x, ret, sign_xy, special_cases); + } + + /* Account for the sign of y. */ + return vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); +} + +HALF_WIDTH_ALIAS_F2 (atan2) + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +TEST_SIG (V, F, 2, atan2) +TEST_DISABLE_FENV (V_NAME_F2 (atan2)) +TEST_ULP (V_NAME_F2 (atan2), 2.46) +TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000) +TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000) +TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000) +TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000) +TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atanf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanf.c new file mode 100644 index 000000000000..61927c9b261a --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanf.c @@ -0,0 +1,109 @@ +/* + * Single-precision vector atan(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_poly_f32.h" + +static const struct data +{ + float32x4_t poly[8]; + float32x4_t pi_over_2; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. */ + .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), + V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), + V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, + .pi_over_2 = V4 (0x1.921fb6p+0f), +}; + +#define SignMask v_u32 (0x80000000) + +#define P(i) d->poly[i] + +#define TinyBound 0x30800000 /* asuint(0x1p-30). */ +#define BigBound 0x4e800000 /* asuint(0x1p30). */ + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (atanf, x, y, special); +} +#endif + +/* Fast implementation of vector atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] + using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps: + _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. 
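Both atan2f above and atanf below evaluate the same degree-7 polynomial in z^2 with a two-level Estrin scheme, splitting the last step into a multiply plus an fma so that z^8 is never formed on its own (which could raise a spurious underflow). A scalar sketch of that ordering, taking the coefficients as a flat array:

static float
estrin_7 (float t, const float *c)   /* t plays the role of z^2.  */
{
  float t2 = t * t;
  float t4 = t2 * t2;
  float p01 = c[0] + t * c[1];
  float p23 = c[2] + t * c[3];
  float p45 = c[4] + t * c[5];
  float p67 = c[6] + t * c[7];
  float p03 = p01 + t2 * p23;
  float p47 = p45 + t2 * p67;
  /* Finishing as p03 + t4 * (t4 * p47), rather than forming t8 = t4 * t4
     and using it directly, mirrors the split mul + fma above.  */
  return p03 + t4 * (t4 * p47);
}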
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need + fenv. */ + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t sign = vandq_u32 (ix, SignMask); + +#if WANT_SIMD_EXCEPT + uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000)); + uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)), + v_u32 (BigBound - TinyBound)); + /* If any lane is special, fall back to the scalar routine for all lanes. */ + if (unlikely (v_any_u32 (special))) + return special_case (x, x, v_u32 (-1)); +#endif + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ + float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x); + float32x4_t shift = vreinterpretq_f32_u32 ( + vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2))); + /* Use absolute value only when needed (odd powers of z). */ + float32x4_t az = vbslq_f32 ( + SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z); + + /* Calculate the polynomial approximation. + Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, + a standard implementation using z8 creates spurious underflow + in the very last fma (when z^8 is small enough). + Therefore, we split the last fma into a mul and an fma. + Horner and single-level Estrin have higher errors that exceed + threshold. */ + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z4 = vmulq_f32 (z2, z2); + + float32x4_t y = vfmaq_f32 ( + v_pairwise_poly_3_f32 (z2, z4, d->poly), z4, + vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4))); + + /* y = shift + z * P(z^2). */ + y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign)); + + return y; +} + +HALF_WIDTH_ALIAS_F1 (atan) + +TEST_SIG (V, F, 1, atan, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (atan), 2.5) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atan), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000) +TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000) +TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atanh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanh.c new file mode 100644 index 000000000000..c2f9585dd29b --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanh.c @@ -0,0 +1,75 @@ +/* + * Double-precision vector atanh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define WANT_V_LOG1P_K0_SHORTCUT 0 +#include "v_log1p_inline.h" + +const static struct data +{ + struct v_log1p_data log1p_consts; + uint64x2_t one; + uint64x2_t sign_mask; +} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE, + .one = V2 (0x3ff0000000000000), + .sign_mask = V2 (0x8000000000000000) }; + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y, + uint64x2_t special, const struct data *d) +{ + y = log1p_inline (y, &d->log1p_consts); + return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x), + vmulq_f64 (halfsign, y), special); +} + +/* Approximation for vector double-precision atanh(x) using modified log1p. + The greatest observed error is 3.31 ULP: + _ZGVnN2v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 + want 0x1.ffd8ff31b501cp-6. */ +VPCS_ATTR +float64x2_t V_NAME_D1 (atanh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5)); + float64x2_t ax = vabsq_f64 (x); + uint64x2_t ia = vreinterpretq_u64_f64 (ax); + uint64x2_t special = vcgeq_u64 (ia, d->one); + +#if WANT_SIMD_EXCEPT + ax = v_zerofy_f64 (ax, special); +#endif + + float64x2_t y; + y = vaddq_f64 (ax, ax); + y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax)); + + if (unlikely (v_any_u64 (special))) +#if WANT_SIMD_EXCEPT + return special_case (x, halfsign, y, special, d); +#else + return special_case (ax, halfsign, y, special, d); +#endif + + y = log1p_inline (y, &d->log1p_consts); + return vmulq_f64 (y, halfsign); +} + +TEST_SIG (V, D, 1, atanh, -1.0, 1.0) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT) +TEST_ULP (V_NAME_D1 (atanh), 3.32) +TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0, 0x1p-23, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 0x1p-23, 1, 90000) +TEST_SYM_INTERVAL (V_NAME_D1 (atanh), 1, inf, 100) +/* atanh is asymptotic at 1, which is the default control value - have to set + -c 0 specially to ensure fp exceptions are triggered correctly (choice of + control lane is irrelevant if fp exceptions are disabled). */ +TEST_CONTROL_VALUE (V_NAME_D1 (atanh), 0) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/atanhf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanhf.c new file mode 100644 index 000000000000..313d15ca6391 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/atanhf.c @@ -0,0 +1,90 @@ +/* + * Single-precision vector atanh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_log1pf_inline.h" + +const static struct data +{ + struct v_log1pf_data log1pf_consts; + uint32x4_t one; +#if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound; +#endif +} data = { + .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, + .one = V4 (0x3f800000), +#if WANT_SIMD_EXCEPT + /* 0x1p-12, below which atanhf(x) rounds to x. 
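Both atanh routines here use atanh(x) = 0.5 * log((1 + x)/(1 - x)) rewritten as 0.5 * log1p(2x / (1 - x)), computed on |x| with the sign and the factor 0.5 folded back in through halfsign. A scalar sketch with libm log1p in place of the inlined vector helper:

#include <math.h>

static double
atanh_ref (double x)                  /* intended for |x| < 1.  */
{
  double ax = fabs (x);
  double y = log1p (2.0 * ax / (1.0 - ax));
  return copysign (0.5 * y, x);       /* atanh is odd.  */
}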
*/ + .tiny_bound = V4 (0x39800000), +#endif +}; + +#define AbsMask v_u32 (0x7fffffff) +#define Half v_u32 (0x3f000000) + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y, + uint32x4_t special) +{ + return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign), + vmulq_f32 (halfsign, y), special); +} + +/* Approximation for vector single-precision atanh(x) using modified log1p. + The maximum error is 2.93 ULP: + _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5 + want 0x1.f4dcf8p-5. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atanh) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + float32x4_t halfsign = vbslq_f32 (AbsMask, v_f32 (0.5), x); + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + +#if WANT_SIMD_EXCEPT + uint32x4_t special + = vorrq_u32 (vcgeq_u32 (iax, d->one), vcltq_u32 (iax, d->tiny_bound)); + /* Side-step special cases by setting those lanes to 0, which will trigger no + exceptions. These will be fixed up later. */ + if (unlikely (v_any_u32 (special))) + ax = v_zerofy_f32 (ax, special); +#else + uint32x4_t special = vcgeq_u32 (iax, d->one); +#endif + + float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), + vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax)); + y = log1pf_inline (y, &d->log1pf_consts); + + /* If exceptions not required, pass ax to special-case for shorter dependency + chain. If exceptions are required ax will have been zerofied, so have to + pass x. */ + if (unlikely (v_any_u32 (special))) +#if WANT_SIMD_EXCEPT + return special_case (x, halfsign, y, special); +#else + return special_case (ax, halfsign, y, special); +#endif + return vmulq_f32 (halfsign, y); +} + +HALF_WIDTH_ALIAS_F1 (atanh) + +TEST_SIG (V, F, 1, atanh, -1.0, 1.0) +TEST_ULP (V_NAME_F1 (atanh), 2.44) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0, 0x1p-12, 500) +TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 0x1p-12, 1, 200000) +TEST_SYM_INTERVAL (V_NAME_F1 (atanh), 1, inf, 1000) +/* atanh is asymptotic at 1, which is the default control value - have to set + -c 0 specially to ensure fp exceptions are triggered correctly (choice of + control lane is irrelevant if fp exceptions are disabled). */ +TEST_CONTROL_VALUE (V_NAME_F1 (atanh), 0) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrt.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrt.c new file mode 100644 index 000000000000..8e72e5b566fc --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrt.c @@ -0,0 +1,127 @@ +/* + * Double-precision vector cbrt(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_poly_f64.h" + +const static struct data +{ + float64x2_t poly[4], one_third, shift; + int64x2_t exp_bias; + uint64x2_t abs_mask, tiny_bound; + uint32x4_t thresh; + double table[5]; +} data = { + .shift = V2 (0x1.8p52), + .poly = { /* Generated with fpminimax in [0.5, 1]. */ + V2 (0x1.c14e8ee44767p-2), V2 (0x1.dd2d3f99e4c0ep-1), + V2 (-0x1.08e83026b7e74p-1), V2 (0x1.2c74eaa3ba428p-3) }, + .exp_bias = V2 (1022), + .abs_mask = V2(0x7fffffffffffffff), + .tiny_bound = V2(0x0010000000000000), /* Smallest normal. */ + .thresh = V4(0x7fe00000), /* asuint64 (infinity) - tiny_bound. */ + .one_third = V2(0x1.5555555555555p-2), + .table = { /* table[i] = 2^((i - 2) / 3). 
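With i running from 0 to 4, the entries are 2^(-2/3), 2^(-1/3), 1, 2^(1/3) and 2^(2/3).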
*/ + 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, + 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0 } +}; + +#define MantissaMask v_u64 (0x000fffffffffffff) + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint32x2_t special) +{ + return v_call_f64 (cbrt, x, y, vmovl_u32 (special)); +} + +/* Approximation for double-precision vector cbrt(x), using low-order + polynomial and two Newton iterations. + + The vector version of frexp does not handle subnormals + correctly. As a result these need to be handled by the scalar + fallback, where accuracy may be worse than that of the vector code + path. + + Greatest observed error in the normal range is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value + m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an + integer. + _ZGVnN2v_cbrt (0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 + want 0x1.965fe72821e99p+0. */ +VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + + /* Subnormal, +/-0 and special values. */ + uint32x2_t special + = vcge_u32 (vsubhn_u64 (iax, d->tiny_bound), vget_low_u32 (d->thresh)); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexp, which gets subnormal values wrong - these have to be + special-cased as a result. */ + float64x2_t m = vbslq_f64 (MantissaMask, x, v_f64 (0.5)); + int64x2_t exp_bias = d->exp_bias; + uint64x2_t ia12 = vshrq_n_u64 (iax, 52); + int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias); + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point + for Newton iterations. */ + float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly); + float64x2_t one_third = d->one_third; + /* Two iterations of Newton's method for iteratively approximating cbrt. */ + float64x2_t m_by_3 = vmulq_f64 (m, one_third); + float64x2_t two_thirds = vaddq_f64 (one_third, one_third); + float64x2_t a + = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (p, p)), two_thirds, p); + a = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (a, a)), two_thirds, a); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + + float64x2_t ef = vcvtq_f64_s64 (e); + float64x2_t eb3f = vrndnq_f64 (vmulq_f64 (ef, one_third)); + int64x2_t em3 = vcvtq_s64_f64 (vfmsq_f64 (ef, eb3f, v_f64 (3))); + int64x2_t ey = vcvtq_s64_f64 (eb3f); + + float64x2_t my = (float64x2_t){ d->table[em3[0] + 2], d->table[em3[1] + 2] }; + my = vmulq_f64 (my, a); + + /* Vector version of ldexp. */ + float64x2_t y = vreinterpretq_f64_s64 ( + vshlq_n_s64 (vaddq_s64 (ey, vaddq_s64 (exp_bias, v_s64 (1))), 52)); + y = vmulq_f64 (y, my); + + if (unlikely (v_any_u32h (special))) + return special_case (x, vbslq_f64 (d->abs_mask, y, x), special); + + /* Copy sign. */ + return vbslq_f64 (d->abs_mask, y, x); +} + +/* Worse-case ULP error assumes that scalar fallback is GLIBC 2.40 cbrt, which + has ULP error of 3.67 at 0x1.7a337e1ba1ec2p-257 [1]. 
Largest observed error + in the vector path is 1.79 ULP. + [1] Innocente, V., & Zimmermann, P. (2024). Accuracy of Mathematical + Functions in Single, Double, Double Extended, and Quadruple Precision. */ +TEST_ULP (V_NAME_D1 (cbrt), 3.17) +TEST_SIG (V, D, 1, cbrt, -10.0, 10.0) +TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrtf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrtf.c new file mode 100644 index 000000000000..4e76feb2dd8b --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cbrtf.c @@ -0,0 +1,117 @@ +/* + * Single-precision vector cbrt(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_poly_f32.h" + +const static struct data +{ + float32x4_t poly[4], one_third; + float table[5]; +} data = { + .poly = { /* Very rough approximation of cbrt(x) in [0.5, 1], generated with + FPMinimax. */ + V4 (0x1.c14e96p-2), V4 (0x1.dd2d3p-1), V4 (-0x1.08e81ap-1), + V4 (0x1.2c74c2p-3) }, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 }, + .one_third = V4 (0x1.555556p-2f), +}; + +#define SignMask v_u32 (0x80000000) +#define SmallestNormal v_u32 (0x00800000) +#define Thresh vdup_n_u16 (0x7f00) /* asuint(INFINITY) - SmallestNormal. */ +#define MantissaMask v_u32 (0x007fffff) +#define HalfExp v_u32 (0x3f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint16x4_t special) +{ + return v_call_f32 (cbrtf, x, y, vmovl_u16 (special)); +} + +static inline float32x4_t +shifted_lookup (const float *table, int32x4_t i) +{ + return (float32x4_t){ table[i[0] + 2], table[i[1] + 2], table[i[2] + 2], + table[i[3] + 2] }; +} + +/* Approximation for vector single-precision cbrt(x) using Newton iteration + with initial guess obtained by a low-order polynomial. Greatest error + is 1.64 ULP. This is observed for every value where the mantissa is + 0x1.85a2aa and the exponent is a multiple of 3, for example: + _ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1 + want 0x1.267932p+1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cbrt) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x)); + + /* Subnormal, +/-0 and special values. */ + uint16x4_t special = vcge_u16 (vsubhn_u32 (iax, SmallestNormal), Thresh); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexpf, which gets subnormal values wrong - these have to be + special-cased as a result. */ + float32x4_t m = vbslq_f32 (MantissaMask, x, v_f32 (0.5)); + int32x4_t e + = vsubq_s32 (vreinterpretq_s32_u32 (vshrq_n_u32 (iax, 23)), v_s32 (126)); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + float32x4_t p = v_pairwise_poly_3_f32 (m, vmulq_f32 (m, m), d->poly); + + float32x4_t one_third = d->one_third; + float32x4_t two_thirds = vaddq_f32 (one_third, one_third); + + /* One iteration of Newton's method for iteratively approximating cbrt. 
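The update a = 2/3 * p + (m/3) / p^2 is the Newton step for f(a) = a^3 - m, starting from the polynomial guess p.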
*/ + float32x4_t m_by_3 = vmulq_f32 (m, one_third); + float32x4_t a + = vfmaq_f32 (vdivq_f32 (m_by_3, vmulq_f32 (p, p)), two_thirds, p); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which + is an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + float32x4_t ef = vmulq_f32 (vcvtq_f32_s32 (e), one_third); + int32x4_t ey = vcvtq_s32_f32 (ef); + int32x4_t em3 = vsubq_s32 (e, vmulq_s32 (ey, v_s32 (3))); + + float32x4_t my = shifted_lookup (d->table, em3); + my = vmulq_f32 (my, a); + + /* Vector version of ldexpf. */ + float32x4_t y + = vreinterpretq_f32_s32 (vshlq_n_s32 (vaddq_s32 (ey, v_s32 (127)), 23)); + y = vmulq_f32 (y, my); + + if (unlikely (v_any_u16h (special))) + return special_case (x, vbslq_f32 (SignMask, x, y), special); + + /* Copy sign. */ + return vbslq_f32 (SignMask, x, y); +} + +HALF_WIDTH_ALIAS_F1 (cbrt) + +TEST_SIG (V, F, 1, cbrt, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (cbrt), 1.15) +TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpi.c new file mode 100644 index 000000000000..40ba5ff31f20 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpi.c @@ -0,0 +1,47 @@ +/* + * Double-precision vector sincos function - return-by-value interface. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_sincos_common.h" +#include "v_math.h" +#include "test_defs.h" + +static float64x2x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, uint64x2_t special, float64x2x2_t y) +{ + return (float64x2x2_t){ v_call_f64 (sin, x, y.val[0], special), + v_call_f64 (cos, x, y.val[1], special) }; +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. 
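Results are returned by value in a float64x2x2_t, with val[0] holding sin(x) and val[1] holding cos(x) for each lane.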
*/ +VPCS_ATTR float64x2x2_t +_ZGVnN2v_cexpi (float64x2_t x) +{ + const struct v_sincos_data *d = ptr_barrier (&v_sincos_data); + uint64x2_t special = check_ge_rangeval (x, d); + + float64x2x2_t sc = v_sincos_inline (x, d); + + if (unlikely (v_any_u64 (special))) + return special_case (x, special, sc); + return sc; +} + +TEST_DISABLE_FENV (_ZGVnN2v_cexpi_cos) +TEST_DISABLE_FENV (_ZGVnN2v_cexpi_sin) +TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73) +TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73) +#define V_CEXPI_INTERVAL(lo, hi, n) \ + TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n) +V_CEXPI_INTERVAL (0, 0x1p23, 500000) +V_CEXPI_INTERVAL (-0, -0x1p23, 500000) +V_CEXPI_INTERVAL (0x1p23, inf, 10000) +V_CEXPI_INTERVAL (-0x1p23, -inf, 10000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpif.c new file mode 100644 index 000000000000..e55d99653a66 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cexpif.c @@ -0,0 +1,49 @@ +/* + * Single-precision vector cexpi function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_sincosf_common.h" +#include "v_math.h" +#include "test_defs.h" + +static float32x4x2_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, uint32x4_t special, float32x4x2_t y) +{ + return (float32x4x2_t){ v_call_f32 (sinf, x, y.val[0], special), + v_call_f32 (cosf, x, y.val[1], special) }; +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + v_cexpif_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + v_cexpif_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ +VPCS_ATTR float32x4x2_t +_ZGVnN4v_cexpif (float32x4_t x) +{ + const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data); + uint32x4_t special = check_ge_rangeval (x, d); + + float32x4x2_t sc = v_sincosf_inline (x, d); + + if (unlikely (v_any_u32 (special))) + return special_case (x, special, sc); + return sc; +} + +TEST_DISABLE_FENV (_ZGVnN4v_cexpif_sin) +TEST_DISABLE_FENV (_ZGVnN4v_cexpif_cos) +TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17) +TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31) +#define V_CEXPIF_INTERVAL(lo, hi, n) \ + TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n) +V_CEXPIF_INTERVAL (0, 0x1p20, 500000) +V_CEXPIF_INTERVAL (-0, -0x1p20, 500000) +V_CEXPIF_INTERVAL (0x1p20, inf, 10000) +V_CEXPIF_INTERVAL (-0x1p20, -inf, 10000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cos.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cos.c new file mode 100644 index 000000000000..9f3de4dd5c36 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cos.c @@ -0,0 +1,92 @@ +/* + * Double-precision vector cos function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +static const struct data +{ + float64x2_t poly[7]; + float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; +} data = { + /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. 
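These are the coefficients C0..C6 of an odd polynomial for sin on the reduced interval, evaluated as sin(r) ~ r + C0 r^3 + C1 r^5 + ... + C6 r^15.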
*/ + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + .inv_pi = V2 (0x1.45f306dc9c883p-2), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), + .range_val = V2 (0x1p23) +}; + +#define C(i) d->poly[i] + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (cos, x, y, cmp); +} + +float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t n, r, r2, r3, r4, t1, t2, t3, y; + uint64x2_t odd, cmp; + +#if WANT_SIMD_EXCEPT + r = vabsq_f64 (x); + cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r), + vreinterpretq_u64_f64 (d->range_val)); + if (unlikely (v_any_u64 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f64 (cmp, v_f64 (1.0), r); +#else + cmp = vcageq_f64 (x, d->range_val); + r = x; +#endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); + n = vsubq_f64 (n, v_f64 (0.5f)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (r, d->pi_2, n); + r = vfmsq_f64 (r, d->pi_3, n); + + /* sin(r) poly approx. */ + r2 = vmulq_f64 (r, r); + r3 = vmulq_f64 (r2, r); + r4 = vmulq_f64 (r2, r2); + + t1 = vfmaq_f64 (C (4), C (5), r2); + t2 = vfmaq_f64 (C (2), C (3), r2); + t3 = vfmaq_f64 (C (0), C (1), r2); + + y = vfmaq_f64 (t1, C (6), r4); + y = vfmaq_f64 (t2, y, r4); + y = vfmaq_f64 (t3, y, r4); + y = vfmaq_f64 (r, y, r3); + + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} + +TEST_SIG (V, D, 1, cos, -3.1, 3.1) +TEST_ULP (V_NAME_D1 (cos), 3.0) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cos), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0, 0x1p23, 500000) +TEST_SYM_INTERVAL (V_NAME_D1 (cos), 0x1p23, inf, 10000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cosf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosf.c new file mode 100644 index 000000000000..d2844e44e196 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosf.c @@ -0,0 +1,89 @@ +/* + * Single-precision vector cos function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +static const struct data +{ + float32x4_t poly[4]; + float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; +} data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), + V4 (0x1.5b2e76p-19f) }, + + .pi_1 = V4 (0x1.921fb6p+1f), + .pi_2 = V4 (-0x1.777a5cp-24f), + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), + .range_val = V4 (0x1p20f) +}; + +#define C(i) d->poly[i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. 
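The quadrant sign flip is applied before the scalar call so that lanes which are not special keep the correct vector result; only the lanes flagged in cmp are recomputed with scalar cosf.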
*/ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (cosf, x, y, cmp); +} + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, r3, y; + uint32x4_t odd, cmp; + +#if WANT_SIMD_EXCEPT + r = vabsq_f32 (x); + cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r), + vreinterpretq_u32_f32 (d->range_val)); + if (unlikely (v_any_u32 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f32 (cmp, v_f32 (1.0f), r); +#else + cmp = vcageq_f32 (x, d->range_val); + r = x; +#endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi)); + odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); + n = vsubq_f32 (n, v_f32 (0.5f)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (r, d->pi_2, n); + r = vfmsq_f32 (r, d->pi_3, n); + + /* y = sin(r). */ + r2 = vmulq_f32 (r, r); + r3 = vmulq_f32 (r2, r); + y = vfmaq_f32 (C (2), C (3), r2); + y = vfmaq_f32 (C (1), y, r2); + y = vfmaq_f32 (C (0), y, r2); + y = vfmaq_f32 (r, y, r3); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} + +HALF_WIDTH_ALIAS_F1 (cos) + +TEST_SIG (V, F, 1, cos, -3.1, 3.1) +TEST_ULP (V_NAME_F1 (cos), 1.4) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cos), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0, 0x1p20, 500000) +TEST_SYM_INTERVAL (V_NAME_F1 (cos), 0x1p20, inf, 10000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cosh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosh.c new file mode 100644 index 000000000000..54407b23aa9d --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cosh.c @@ -0,0 +1,107 @@ +/* + * Double-precision vector cosh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t poly[3]; + float64x2_t inv_ln2; + double ln2[2]; + float64x2_t shift, thres; + uint64x2_t index_mask, special_bound; +} data = { + .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3), + V2 (0x1.5555576a59599p-5), }, + + .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */ + /* -ln2/N. */ + .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64}, + .shift = V2 (0x1.8p+52), + .thres = V2 (704.0), + + .index_mask = V2 (0xff), + /* 0x1.6p9, above which exp overflows. */ + .special_bound = V2 (0x4086000000000000), +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (cosh, x, y, special); +} + +/* Helper for approximating exp(x). Copied from v_exp_tail, with no + special-case handling or tail. */ +static inline float64x2_t +exp_inline (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* n = round(x/(ln2/N)). */ + float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + /* r = x - n*ln2/N. 
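ln2/N is stored negated as a hi/lo pair, so the reduction is carried out with two fused multiply-adds for extra precision.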
*/ + float64x2_t ln2 = vld1q_f64 (d->ln2); + float64x2_t r = vfmaq_laneq_f64 (x, n, ln2, 0); + r = vfmaq_laneq_f64 (r, n, ln2, 1); + + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS); + uint64x2_t i = vandq_u64 (u, d->index_mask); + + /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ + float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r); + y = vfmaq_f64 (d->poly[0], y, r); + y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r); + + /* s = 2^(n/N). */ + u = v_lookup_u64 (__v_exp_tail_data, i); + float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + return vfmaq_f64 (s, y, s); +} + +/* Approximation for vector double-precision cosh(x) using exp_inline. + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the scalar fall-back region, so is the + same as the scalar routine, 1.93 ULP: + _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 + want 0x1.fdf28623ef923p+1021. + + The greatest observed error in the non-special region is 1.54 ULP: + _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7 + want 0x1.f711dcb0c77b1p+7. */ +float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + uint64x2_t special + = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound); + + /* Up to the point that exp overflows, we can use it to calculate cosh by + exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ + float64x2_t t = exp_inline (ax); + float64x2_t half_t = vmulq_n_f64 (t, 0.5); + float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t); + + /* Fall back to scalar for any special cases. */ + if (unlikely (v_any_u64 (special))) + return special_case (x, vaddq_f64 (half_t, half_over_t), special); + + return vaddq_f64 (half_t, half_over_t); +} + +TEST_SIG (V, D, 1, cosh, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (cosh), 1.43) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cosh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000) +TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/coshf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/coshf.c new file mode 100644 index 000000000000..f1ed3e5161fd --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/coshf.c @@ -0,0 +1,92 @@ +/* + * Single-precision vector cosh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_expf_inline.h" +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + struct v_expf_data expf_consts; + uint32x4_t tiny_bound; + float32x4_t bound; +#if WANT_SIMD_EXCEPT + uint32x4_t special_bound; +#endif +} data = { + .expf_consts = V_EXPF_DATA, + .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */ + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ + .bound = V4 (0x1.5a92d8p+6), +#if WANT_SIMD_EXCEPT + .special_bound = V4 (0x42ad496c), +#endif +}; + +#if !WANT_SIMD_EXCEPT +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t, + uint32x4_t special) +{ + return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special); +} +#endif + +/* Single-precision vector cosh, using vector expf. + Maximum error is 2.38 ULP: + _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4 + want 0x1.6a4922p+4. 
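cosh is evaluated as t / 2 + (1/2) / t with t = expf(x) (expf(|x|) on the fenv path); both forms are equivalent since cosh is even.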
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cosh) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, fall back to the scalar + variant for all inputs if any input is a special value or above the bound + at which expf overflows. */ + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t special = vcgeq_u32 (iax, d->special_bound); + if (unlikely (v_any_u32 (special))) + return v_call_f32 (coshf, x, x, v_u32 (-1)); + + uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound); + /* If any input is tiny, avoid underflow exception by fixing tiny lanes of + input to 0, which will generate no exceptions. */ + if (unlikely (v_any_u32 (tiny))) + ax = v_zerofy_f32 (ax, tiny); + float32x4_t t = v_expf_inline (ax, &d->expf_consts); +#else + uint32x4_t special = vcageq_f32 (x, d->bound); + float32x4_t t = v_expf_inline (x, &d->expf_consts); +#endif + + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */ + float32x4_t half_t = vmulq_n_f32 (t, 0.5); + float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (tiny))) + return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t)); +#else + if (unlikely (v_any_u32 (special))) + return special_case (x, half_t, half_over_t, special); +#endif + + return vaddq_f32 (half_t, half_over_t); +} + +HALF_WIDTH_ALIAS_F1 (cosh) + +TEST_SIG (V, F, 1, cosh, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (cosh), 1.89) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100) +TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1p-63, 1, 1000) +TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 1, 0x1.5a92d8p+6, 80000) +TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cospi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospi.c new file mode 100644 index 000000000000..e63201a55786 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospi.c @@ -0,0 +1,87 @@ +/* + * Double-precision vector cospi function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "v_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t poly[10]; + float64x2_t range_val; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2), + V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1), + V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8), + V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16), + V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) }, + .range_val = V2 (0x1p63), +}; + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (arm_math_cospi, x, y, cmp); +} + +/* Approximation for vector double-precision cospi(x). + Maximum Error 3.06 ULP: + _ZGVnN2v_cospi(0x1.7dd4c0b03cc66p-5) got 0x1.fa854babfb6bep-1 + want 0x1.fa854babfb6c1p-1. 
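The input is reduced to r = x - rnda(x) in [-1/2, 1/2]; then cospi(x) = +/-sinpi(1/2 - |r|), negated for inputs that round to an odd integer.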
*/ +float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + float64x2_t r = vabsq_f64 (x); + uint64x2_t cmp = vcaleq_f64 (v_f64 (0x1p64), x); + + /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd + to avoid them overflowing and throwing exceptions. */ + r = v_zerofy_f64 (r, cmp); + uint64x2_t odd = vshlq_n_u64 (vcvtnq_u64_f64 (r), 63); + +#else + float64x2_t r = x; + uint64x2_t cmp = vcageq_f64 (r, d->range_val); + uint64x2_t odd + = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63); + +#endif + + r = vsubq_f64 (r, vrndaq_f64 (r)); + + /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */ + r = vsubq_f64 (v_f64 (0.5), vabsq_f64 (r)); + + /* y = sin(r). */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t r4 = vmulq_f64 (r2, r2); + float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r); + + /* Fallback to scalar. */ + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + + /* Reintroduce the sign bit for inputs which round to odd. */ + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} + +#if WANT_TRIGPI_TESTS +TEST_ULP (V_NAME_D1 (cospi), 2.56) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000) +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/cospif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospif.c new file mode 100644 index 000000000000..62f4b8122b2c --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/cospif.c @@ -0,0 +1,86 @@ +/* + * Single-precision vector cospi function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "v_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t poly[6]; + float32x4_t range_val; +} data = { + /* Taylor series coefficents for sin(pi * x). */ + .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f), + V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) }, + .range_val = V4 (0x1p31f), +}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (arm_math_cospif, x, y, cmp); +} + +/* Approximation for vector single-precision cospi(x) + Maximum Error: 3.17 ULP: + _ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1 + want 0x1.f7cd5p-1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cospi) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + float32x4_t r = vabsq_f32 (x); + uint32x4_t cmp = vcaleq_f32 (v_f32 (0x1p32f), x); + + /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd + to avoid them overflowing and throwing exceptions. */ + r = v_zerofy_f32 (r, cmp); + uint32x4_t odd = vshlq_n_u32 (vcvtnq_u32_f32 (r), 31); + +#else + float32x4_t r = x; + uint32x4_t cmp = vcageq_f32 (r, d->range_val); + + uint32x4_t odd + = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31); + +#endif + + /* r = x - rint(x). */ + r = vsubq_f32 (r, vrndaq_f32 (r)); + + /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 
1/2. */ + r = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (r)); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r); + + /* Fallback to scalar. */ + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + + /* Reintroduce the sign bit for inputs which round to odd. */ + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} + +HALF_WIDTH_ALIAS_F1 (cospi) + +#if WANT_TRIGPI_TESTS +TEST_ULP (V_NAME_F1 (cospi), 2.67) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000) +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/erf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/erf.c new file mode 100644 index 000000000000..40717a660ce2 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erf.c @@ -0,0 +1,166 @@ +/* + * Double-precision vector erf(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t third; + float64x2_t tenth, two_over_five, two_over_nine; + double two_over_fifteen, two_over_fortyfive; + float64x2_t max, shift; + uint64x2_t max_idx; +#if WANT_SIMD_EXCEPT + float64x2_t tiny_bound, huge_bound, scale_minus_one; +#endif +} data = { + .max_idx = V2 (768), + .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */ + .two_over_fifteen = 0x1.1111111111111p-3, + .tenth = V2 (-0x1.999999999999ap-4), + .two_over_five = V2 (-0x1.999999999999ap-2), + .two_over_nine = V2 (-0x1.c71c71c71c71cp-3), + .two_over_fortyfive = 0x1.6c16c16c16c17p-5, + .max = V2 (5.9921875), /* 6 - 1/128. */ + .shift = V2 (0x1p45), +#if WANT_SIMD_EXCEPT + .huge_bound = V2 (0x1p205), + .tiny_bound = V2 (0x1p-226), + .scale_minus_one = V2 (0x1.06eba8214db69p-3), /* 2/sqrt(pi) - 1.0. */ +#endif +}; + +#define AbsMask 0x7fffffffffffffff + +struct entry +{ + float64x2_t erf; + float64x2_t scale; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + struct entry e; + float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf), + e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf); + e.erf = vuzp1q_f64 (e1, e2); + e.scale = vuzp2q_f64 (e1, e2); + return e; +} + +/* Double-precision implementation of vector erf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erf(x) ~ erf(r) + scale * d * [ + + 1 + - r d + + 1/3 (2 r^2 - 1) d^2 + - 1/6 (r (2 r^2 - 3)) d^3 + + 1/30 (4 r^4 - 12 r^2 + 3) d^4 + - 1/90 (4 r^4 - 20 r^2 + 15) d^5 + ] + + Maximum measure error: 2.29 ULP + V_NAME_D1 (erf)(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8 + want -0x1.20dd59132ebafp-8. */ +float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) +{ + const struct data *dat = ptr_barrier (&data); + + float64x2_t a = vabsq_f64 (x); + /* Reciprocal conditions that do not catch NaNs so they can be used in BSLs + to return expected results. 
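For NaN inputs both comparisons are false, so the table index is clamped to max_idx (avoiding out-of-range loads) while the NaN itself propagates through the polynomial to the result.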
*/ + uint64x2_t a_le_max = vcaleq_f64 (x, dat->max); + uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max); + +#if WANT_SIMD_EXCEPT + /* |x| huge or tiny. */ + uint64x2_t cmp1 = vcgtq_f64 (a, dat->huge_bound); + uint64x2_t cmp2 = vcltq_f64 (a, dat->tiny_bound); + uint64x2_t cmp = vorrq_u64 (cmp1, cmp2); + /* If any lanes are special, mask them with 1 for small x or 8 for large + values and retain a copy of a to allow special case handler to fix special + lanes later. This is only necessary if fenv exceptions are to be triggered + correctly. */ + if (unlikely (v_any_u64 (cmp))) + { + a = vbslq_f64 (cmp1, v_f64 (8.0), a); + a = vbslq_f64 (cmp2, v_f64 (1.0), a); + } +#endif + + /* Set r to multiple of 1/128 nearest to |x|. */ + float64x2_t shift = dat->shift; + float64x2_t z = vaddq_f64 (a, shift); + + /* Lookup erf(r) and scale(r) in table, without shortcut for small values, + but with saturated indices for large values and NaNs in order to avoid + segfault. */ + uint64x2_t i + = vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift)); + i = vbslq_u64 (a_le_max, i, dat->max_idx); + struct entry e = lookup (i); + + float64x2_t r = vsubq_f64 (z, shift); + + /* erf(x) ~ erf(r) + scale * d * poly (r, d). */ + float64x2_t d = vsubq_f64 (a, r); + float64x2_t d2 = vmulq_f64 (d, d); + float64x2_t r2 = vmulq_f64 (r, r); + + float64x2_t two_over_fifteen_and_fortyfive + = vld1q_f64 (&dat->two_over_fifteen); + + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */ + float64x2_t p1 = r; + float64x2_t p2 + = vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third)); + float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third)); + float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2, + two_over_fifteen_and_fortyfive, 0); + p4 = vfmsq_f64 (dat->tenth, r2, p4); + float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2, + two_over_fifteen_and_fortyfive, 1); + p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5)); + + float64x2_t p34 = vfmaq_f64 (p3, d, p4); + float64x2_t p12 = vfmaq_f64 (p1, d, p2); + float64x2_t y = vfmaq_f64 (p34, d2, p5); + y = vfmaq_f64 (p12, d2, y); + + y = vfmaq_f64 (e.erf, e.scale, vfmsq_f64 (d, d2, y)); + + /* Solves the |x| = inf and NaN cases. */ + y = vbslq_f64 (a_gt_max, v_f64 (1.0), y); + + /* Copy sign. */ + y = vbslq_f64 (v_u64 (AbsMask), y, x); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (cmp2))) + { + /* Neutralise huge values of x before fixing small values. */ + x = vbslq_f64 (cmp1, v_f64 (1.0), x); + /* Fix tiny values that trigger spurious underflow. */ + return vbslq_f64 (cmp2, vfmaq_f64 (x, dat->scale_minus_one, x), y); + } +#endif + return y; +} + +TEST_SIG (V, D, 1, erf, -6.0, 6.0) +TEST_ULP (V_NAME_D1 (erf), 1.79) +/* WANT_SIMD_EXCEPT blocks miss some cases. */ +TEST_DISABLE_FENV (V_NAME_D1 (erf)) +TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000) +TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000) +TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/erfc.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfc.c new file mode 100644 index 000000000000..97ef09ecc113 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfc.c @@ -0,0 +1,205 @@ +/* + * Double-precision vector erfc(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + uint64x2_t offset, table_scale; + float64x2_t max, shift; + float64x2_t p20, p40, p41, p51; + double p42, p52; + double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2]; +#if WANT_SIMD_EXCEPT + float64x2_t uflow_bound; +#endif +} data = { + /* Set an offset so the range of the index used for lookup is 3487, and it + can be clamped using a saturated add on an offset index. + Index offset is 0xffffffffffffffff - asuint64(shift) - 3487. */ + .offset = V2 (0xbd3ffffffffff260), + .table_scale = V2 (0x37f0000000000000 << 1), /* asuint64 (2^-128) << 1. */ + .max = V2 (0x1.b3ep+4), /* 3487/128. */ + .shift = V2 (0x1p45), + .p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */ + .p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */ + .p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */ + .p42 = 0x1.1111111111111p-3, /* 2/15. */ + .p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */ + .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */ + /* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */ + .qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 }, + .qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 }, + .qr7 = { 0x1.2492492492492p0, -0x1.8e38e38e38e39p-3 }, + .qr8 = { 0x1.2p0, -0x1.6c16c16c16c17p-3 }, + .qr9 = { 0x1.1c71c71c71c72p0, -0x1.4f2094f2094f2p-3 }, +#if WANT_SIMD_EXCEPT + .uflow_bound = V2 (0x1.a8b12fc6e4892p+4), +#endif +}; + +#define TinyBound 0x4000000000000000 /* 0x1p-511 << 1. */ +#define Off 0xfffffffffffff260 /* 0xffffffffffffffff - 3487. */ + +struct entry +{ + float64x2_t erfc; + float64x2_t scale; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + struct entry e; + float64x2_t e1 + = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc); + float64x2_t e2 + = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc); + e.erfc = vuzp1q_f64 (e1, e2); + e.scale = vuzp2q_f64 (e1, e2); + return e; +} + +#if WANT_SIMD_EXCEPT +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) +{ + return v_call_f64 (erfc, x, y, cmp); +} +#endif + +/* Optimized double-precision vector erfc(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5 + + p6(r) d^6 + ... + p10(r) d^10 + + Polynomials p6(r) to p10(r) are computed using recurrence relation + + 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0, + with p0 = 1, and p1(r) = -r. + + Values of erfc(r) and scale are read from lookup tables. Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + + Maximum measured error: 1.71 ULP + V_NAME_D1 (erfc)(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608 + want 0x1.e15fcbea3e7adp-608. */ +VPCS_ATTR +float64x2_t V_NAME_D1 (erfc) (float64x2_t x) +{ + const struct data *dat = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* |x| < 2^-511. Avoid fabs by left-shifting by 1. */ + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t cmp = vcltq_u64 (vaddq_u64 (ix, ix), v_u64 (TinyBound)); + /* x >= ~26.54 (into subnormal case and uflow case). 
Comparison is done in + integer domain to avoid raising exceptions in presence of nans. */ + uint64x2_t uflow = vcgeq_s64 (vreinterpretq_s64_f64 (x), + vreinterpretq_s64_f64 (dat->uflow_bound)); + cmp = vorrq_u64 (cmp, uflow); + float64x2_t xm = x; + /* If any lanes are special, mask them with 0 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u64 (cmp))) + x = v_zerofy_f64 (x, cmp); +#endif + + float64x2_t a = vabsq_f64 (x); + a = vminq_f64 (a, dat->max); + + /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 0 and scale to + 2/sqrt(pi), when x reduced to r = 0. */ + float64x2_t shift = dat->shift; + float64x2_t z = vaddq_f64 (a, shift); + + /* Clamp index to a range of 3487. A naive approach would use a subtract and + min. Instead we offset the table address and the index, then use a + saturating add. */ + uint64x2_t i = vqaddq_u64 (vreinterpretq_u64_f64 (z), dat->offset); + + struct entry e = lookup (i); + + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */ + float64x2_t r = vsubq_f64 (z, shift); + float64x2_t d = vsubq_f64 (a, r); + float64x2_t d2 = vmulq_f64 (d, d); + float64x2_t r2 = vmulq_f64 (r, r); + + float64x2_t p1 = r; + float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20)); + float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20)); + float64x2_t p42_p52 = vld1q_f64 (&dat->p42); + float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0); + p4 = vfmsq_f64 (dat->p40, r2, p4); + float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1); + p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5)); + /* Compute p_i using recurrence relation: + p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */ + float64x2_t qr5 = vld1q_f64 (dat->qr5), qr6 = vld1q_f64 (dat->qr6), + qr7 = vld1q_f64 (dat->qr7), qr8 = vld1q_f64 (dat->qr8), + qr9 = vld1q_f64 (dat->qr9); + float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, qr5, 0)); + p6 = vmulq_laneq_f64 (p6, qr5, 1); + float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, qr6, 0)); + p7 = vmulq_laneq_f64 (p7, qr6, 1); + float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, qr7, 0)); + p8 = vmulq_laneq_f64 (p8, qr7, 1); + float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, qr8, 0)); + p9 = vmulq_laneq_f64 (p9, qr8, 1); + float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, qr9, 0)); + p10 = vmulq_laneq_f64 (p10, qr9, 1); + /* Compute polynomial in d using pairwise Horner scheme. */ + float64x2_t p90 = vfmaq_f64 (p9, d, p10); + float64x2_t p78 = vfmaq_f64 (p7, d, p8); + float64x2_t p56 = vfmaq_f64 (p5, d, p6); + float64x2_t p34 = vfmaq_f64 (p3, d, p4); + float64x2_t p12 = vfmaq_f64 (p1, d, p2); + float64x2_t y = vfmaq_f64 (p78, d2, p90); + y = vfmaq_f64 (p56, d2, y); + y = vfmaq_f64 (p34, d2, y); + y = vfmaq_f64 (p12, d2, y); + + y = vfmsq_f64 (e.erfc, e.scale, vfmsq_f64 (d, d2, y)); + + /* Offset equals 2.0 if sign, else 0.0. */ + uint64x2_t sign = vshrq_n_u64 (vreinterpretq_u64_f64 (x), 63); + float64x2_t off = vreinterpretq_f64_u64 (vshlq_n_u64 (sign, 62)); + /* Copy sign and scale back in a single fma. Since the bit patterns do not + overlap, then logical or and addition are equivalent here. 
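The resulting fac is +/-2^-128: the sign bit of x combined with table_scale, which undoes the scaling applied to the stored table entries.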
*/ + float64x2_t fac = vreinterpretq_f64_u64 ( + vsraq_n_u64 (vshlq_n_u64 (sign, 63), dat->table_scale, 1)); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (cmp))) + return special_case (xm, vfmaq_f64 (off, fac, y), cmp); +#endif + + return vfmaq_f64 (off, fac, y); +} + +TEST_SIG (V, D, 1, erfc, -6.0, 28.0) +TEST_ULP (V_NAME_D1 (erfc), 1.21) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (erfc), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000) +TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000) +TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000) +TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000) +TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/erfcf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfcf.c new file mode 100644 index 000000000000..f420439ef8a3 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erfcf.c @@ -0,0 +1,174 @@ +/* + * Single-precision vector erfc(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + uint32x4_t offset, table_scale; + float32x4_t max, shift; + float coeffs[4]; + float32x4_t third, two_over_five, tenth; +#if WANT_SIMD_EXCEPT + float32x4_t uflow_bound; +#endif + +} data = { + /* Set an offset so the range of the index used for lookup is 644, and it can + be clamped using a saturated add. */ + .offset = V4 (0xb7fffd7b), /* 0xffffffff - asuint(shift) - 644. */ + .table_scale = V4 (0x28000000 << 1), /* asuint (2^-47) << 1. */ + .max = V4 (10.0625f), /* 10 + 1/16 = 644/64. */ + .shift = V4 (0x1p17f), + /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and + fmas. */ + .coeffs = { 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 }, + .third = V4 (0x1.555556p-2f), + .two_over_five = V4 (-0x1.99999ap-2f), + .tenth = V4 (-0x1.99999ap-4f), +#if WANT_SIMD_EXCEPT + .uflow_bound = V4 (0x1.2639cp+3f), +#endif +}; + +#define TinyBound 0x41000000 /* 0x1p-62f << 1. */ +#define Thres 0xbe000000 /* asuint(infinity) << 1 - TinyBound. */ +#define Off 0xfffffd7b /* 0xffffffff - 644. */ + +struct entry +{ + float32x4_t erfc; + float32x4_t scale; +}; + +static inline struct entry +lookup (uint32x4_t i) +{ + struct entry e; + float32x2_t t0 + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc); + float32x2_t t1 + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc); + float32x2_t t2 + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc); + float32x2_t t3 + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc); + float32x4_t e1 = vcombine_f32 (t0, t1); + float32x4_t e2 = vcombine_f32 (t2, t3); + e.erfc = vuzp1q_f32 (e1, e2); + e.scale = vuzp2q_f32 (e1, e2); + return e; +} + +#if WANT_SIMD_EXCEPT +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + return v_call_f32 (erfcf, x, y, cmp); +} +#endif + +/* Optimized single-precision vector erfcf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/64. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erfc(x) ~ erfc(r) - scale * d * poly(r, d), with + + poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3 + + (2/15 r^4 - 2/5 r^2 + 1/10) d^4 + + Values of erfc(r) and scale are read from lookup tables. 
Stored values + are scaled to avoid hitting the subnormal range. + + Note that for x < 0, erfc(x) = 2.0 - erfc(-x). + Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0). + _ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120 + want 0x1.f51216p-120. */ +NOINLINE VPCS_ATTR float32x4_t V_NAME_F1 (erfc) (float32x4_t x) +{ + const struct data *dat = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* |x| < 2^-62. Avoid fabs by left-shifting by 1. */ + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t cmp = vcltq_u32 (vaddq_u32 (ix, ix), v_u32 (TinyBound)); + /* x >= ~9.19 (into subnormal case and uflow case). Comparison is done in + integer domain to avoid raising exceptions in presence of nans. */ + uint32x4_t uflow = vcgeq_s32 (vreinterpretq_s32_f32 (x), + vreinterpretq_s32_f32 (dat->uflow_bound)); + cmp = vorrq_u32 (cmp, uflow); + float32x4_t xm = x; + /* If any lanes are special, mask them with 0 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = v_zerofy_f32 (x, cmp); +#endif + + float32x4_t a = vabsq_f32 (x); + a = vminq_f32 (a, dat->max); + + /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 0 and scale to + 2/sqrt(pi), when x reduced to r = 0. */ + float32x4_t shift = dat->shift; + float32x4_t z = vaddq_f32 (a, shift); + + /* Clamp index to a range of 644. A naive approach would use a subtract and + min. Instead we offset the table address and the index, then use a + saturating add. */ + uint32x4_t i = vqaddq_u32 (vreinterpretq_u32_f32 (z), dat->offset); + + struct entry e = lookup (i); + + /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */ + float32x4_t r = vsubq_f32 (z, shift); + float32x4_t d = vsubq_f32 (a, r); + float32x4_t d2 = vmulq_f32 (d, d); + float32x4_t r2 = vmulq_f32 (r, r); + + float32x4_t p1 = r; + float32x4_t coeffs = vld1q_f32 (dat->coeffs); + float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, coeffs, 1); + float32x4_t p3 + = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, coeffs, 0)); + float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, coeffs, 2); + p4 = vfmsq_f32 (dat->tenth, r2, p4); + + float32x4_t y = vfmaq_f32 (p3, d, p4); + y = vfmaq_f32 (p2, d, y); + y = vfmaq_f32 (p1, d, y); + y = vfmsq_f32 (e.erfc, e.scale, vfmsq_f32 (d, d2, y)); + + /* Offset equals 2.0f if sign, else 0.0f. */ + uint32x4_t sign = vshrq_n_u32 (vreinterpretq_u32_f32 (x), 31); + float32x4_t off = vreinterpretq_f32_u32 (vshlq_n_u32 (sign, 30)); + /* Copy sign and scale back in a single fma. Since the bit patterns do not + overlap, then logical or and addition are equivalent here. 
*/ + float32x4_t fac = vreinterpretq_f32_u32 ( + vsraq_n_u32 (vshlq_n_u32 (sign, 31), dat->table_scale, 1)); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + return special_case (xm, vfmaq_f32 (off, fac, y), cmp); +#endif + + return vfmaq_f32 (off, fac, y); +} + +HALF_WIDTH_ALIAS_F1 (erfc) + +TEST_SIG (V, F, 1, erfc, -4.0, 10.0) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erfc), WANT_SIMD_EXCEPT) +TEST_ULP (V_NAME_F1 (erfc), 1.14) +TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000) +TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000) +TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000) +TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000) +TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/erff.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/erff.c new file mode 100644 index 000000000000..508bc4c2f5e2 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/erff.c @@ -0,0 +1,120 @@ +/* + * Single-precision vector erf(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t max, shift, third; +#if WANT_SIMD_EXCEPT + float32x4_t tiny_bound, scale_minus_one; +#endif +} data = { + .max = V4 (3.9375), /* 4 - 8/128. */ + .shift = V4 (0x1p16f), + .third = V4 (0x1.555556p-2f), /* 1/3. */ +#if WANT_SIMD_EXCEPT + .tiny_bound = V4 (0x1p-62f), + .scale_minus_one = V4 (0x1.06eba8p-3f), /* scale - 1.0. */ +#endif +}; + +#define AbsMask 0x7fffffff + +struct entry +{ + float32x4_t erf; + float32x4_t scale; +}; + +static inline struct entry +lookup (uint32x4_t i) +{ + struct entry e; + float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf); + float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf); + float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf); + float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf); + float32x4_t e1 = vcombine_f32 (t0, t1); + float32x4_t e2 = vcombine_f32 (t2, t3); + e.erf = vuzp1q_f32 (e1, e2); + e.scale = vuzp2q_f32 (e1, e2); + return e; +} + +/* Single-precision implementation of vector erf(x). + Approximation based on series expansion near x rounded to + nearest multiple of 1/128. + Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r, + + erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2] + + Values of erf(r) and scale are read from lookup tables. + For |x| > 3.9375, erf(|x|) rounds to 1.0f. + + Maximum error: 1.93 ULP + _ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9 + want 0x1.fd6868p-9. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x) +{ + const struct data *dat = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* |x| < 2^-62. */ + uint32x4_t cmp = vcaltq_f32 (x, dat->tiny_bound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + float32x4_t a = vabsq_f32 (x); + uint32x4_t a_gt_max = vcgtq_f32 (a, dat->max); + + /* Lookup erf(r) and scale(r) in tables, e.g. set erf(r) to 0 and scale to + 2/sqrt(pi), when x reduced to r = 0. 
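Adding the large shift 0x1p16f rounds a onto a grid of 1/128 (the ulp at 2^16); asuint (z) - asuint (shift) is then the table index round (128 * a), and r = z - shift is the rounded value.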
*/ + float32x4_t shift = dat->shift; + float32x4_t z = vaddq_f32 (a, shift); + + uint32x4_t i + = vsubq_u32 (vreinterpretq_u32_f32 (z), vreinterpretq_u32_f32 (shift)); + i = vminq_u32 (i, v_u32 (512)); + struct entry e = lookup (i); + + float32x4_t r = vsubq_f32 (z, shift); + + /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */ + float32x4_t d = vsubq_f32 (a, r); + float32x4_t d2 = vmulq_f32 (d, d); + float32x4_t y = vfmaq_f32 (r, dat->third, d); + y = vfmaq_f32 (e.erf, e.scale, vfmsq_f32 (d, d2, y)); + + /* Solves the |x| = inf case. */ + y = vbslq_f32 (a_gt_max, v_f32 (1.0f), y); + + /* Copy sign. */ + y = vbslq_f32 (v_u32 (AbsMask), y, x); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + return vbslq_f32 (cmp, vfmaq_f32 (xm, dat->scale_minus_one, xm), y); +#endif + return y; +} + +HALF_WIDTH_ALIAS_F1 (erf) + +TEST_SIG (V, F, 1, erf, -4.0, 4.0) +TEST_ULP (V_NAME_F1 (erf), 1.43) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (erf), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000) +TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000) +TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp.c new file mode 100644 index 000000000000..a928c35c9418 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp.c @@ -0,0 +1,134 @@ +/* + * Double-precision vector e^x function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask (N - 1) + +const static volatile struct +{ + float64x2_t poly[3]; + float64x2_t inv_ln2, ln2_hi, ln2_lo, shift; +#if !WANT_SIMD_EXCEPT + float64x2_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.88 +0.5 ulp + rel error: 1.4337*2^-53 + abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ + .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3), + V2 (0x1.55555da646206p-5) }, +#if !WANT_SIMD_EXCEPT + .scale_thresh = V2 (163840.0), /* 1280.0 * N. */ + .special_bound = V2 (704.0), +#endif + .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */ + .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */ + .ln2_lo = V2 (0x1.abc9e3b39803f3p-63), + .shift = V2 (0x1.8p+52) +}; + +#define C(i) data.poly[i] +#define Tab __v_exp_data + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */ +# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */ +# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */ + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f64 (exp, x, y, cmp); +} + +#else + +# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ +# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */ + +static inline float64x2_t VPCS_ATTR +special_case (float64x2_t s, float64x2_t y, float64x2_t n) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. 
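s1 is a large (or, for negative n, small) power of two and s2 absorbs the remaining scale. Once |n| exceeds scale_thresh the true result has already over- or underflowed, so returning s1 * s1 yields the appropriate inf or 0.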
*/ + uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); + float64x2_t s2 = vreinterpretq_f64_u64 ( + vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); + uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); + return vbslq_f64 (cmp, r1, r0); +} + +#endif + +float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x) +{ + float64x2_t n, r, r2, s, y, z; + uint64x2_t cmp, u, e; + +#if WANT_SIMD_EXCEPT + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + float64x2_t xm = x; + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound); + if (unlikely (v_any_u64 (cmp))) + x = vbslq_f64 (cmp, v_f64 (1), x); +#else + cmp = vcagtq_f64 (x, data.special_bound); +#endif + + /* n = round(x/(ln2/N)). */ + z = vfmaq_f64 (data.shift, x, data.inv_ln2); + u = vreinterpretq_u64_f64 (z); + n = vsubq_f64 (z, data.shift); + + /* r = x - n*ln2/N. */ + r = x; + r = vfmsq_f64 (r, data.ln2_hi, n); + r = vfmsq_f64 (r, data.ln2_lo, n); + + e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */ + r2 = vmulq_f64 (r, r); + y = vfmaq_f64 (C (0), C (1), r); + y = vfmaq_f64 (y, C (2), r2); + y = vfmaq_f64 (r, y, r2); + + /* s = 2^(n/N). */ + u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] }; + s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + if (unlikely (v_any_u64 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f64 (s, y, s), cmp); +#else + return special_case (s, y, n); +#endif + + return vfmaq_f64 (s, y, s); +} + +TEST_SIG (V, D, 1, exp, -9.9, 9.9) +TEST_ULP (V_NAME_D1 (exp), 1.9) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (exp), 0, 0xffff000000000000, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp), 0x1p-6, 0x1p6, 400000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp), 633.3, 733.3, 10000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10.c new file mode 100644 index 000000000000..24fdd1c7d257 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10.c @@ -0,0 +1,147 @@ +/* + * Double-precision vector 10^x function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _GNU_SOURCE +#include "mathlib.h" +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */ +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 163840.0 /* 1280.0 * N. */ + +const static struct data +{ + float64x2_t poly[4]; + float64x2_t log10_2, log2_10_hi, log2_10_lo, shift; +#if !WANT_SIMD_EXCEPT + float64x2_t special_bound, scale_thresh; +#endif +} data = { + /* Coefficients generated using Remez algorithm. + rel error: 0x1.5ddf8f28p-54 + abs error: 0x1.5ed266c8p-54 in [ -log10(2)/256, log10(2)/256 ] + maxerr: 1.14432 +0.5 ulp. 
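+     The leading coefficients are close to ln(10)^k / k! (poly[0] ~ ln 10,
+     poly[1] ~ ln(10)^2 / 2, ...), as expected from 10^r = e^(r * ln 10).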
*/ + .poly = { V2 (0x1.26bb1bbb5524p1), V2 (0x1.53524c73cecdap1), + V2 (0x1.047060efb781cp1), V2 (0x1.2bd76040f0d16p0) }, + .log10_2 = V2 (0x1.a934f0979a371p8), /* N/log2(10). */ + .log2_10_hi = V2 (0x1.34413509f79ffp-9), /* log2(10)/N. */ + .log2_10_lo = V2 (-0x1.9dc1da994fd21p-66), + .shift = V2 (0x1.8p+52), +#if !WANT_SIMD_EXCEPT + .scale_thresh = V2 (ScaleBound), + .special_bound = V2 (SpecialBound), +#endif +}; + +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask v_u64 (N - 1) + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */ +# define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */ +# define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */ + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine for special lanes. */ + return v_call_f64 (exp10, x, y, cmp); +} + +#else + +# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ +# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */ + +static inline float64x2_t VPCS_ATTR +special_case (float64x2_t s, float64x2_t y, float64x2_t n, + const struct data *d) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); + float64x2_t s2 = vreinterpretq_f64_u64 ( + vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); + uint64x2_t cmp = vcagtq_f64 (n, d->scale_thresh); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); + return vbslq_f64 (cmp, r1, r0); +} + +#endif + +/* Fast vector implementation of exp10. + Maximum measured error is 1.64 ulp. + _ZGVnN2v_exp10(0x1.ccd1c9d82cc8cp+0) got 0x1.f8dab6d7fed0cp+5 + want 0x1.f8dab6d7fed0ap+5. */ +float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t cmp; +#if WANT_SIMD_EXCEPT + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + float64x2_t xm = x; + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), Thres); + if (unlikely (v_any_u64 (cmp))) + x = vbslq_f64 (cmp, v_f64 (1), x); +#else + cmp = vcageq_f64 (x, d->special_bound); +#endif + + /* n = round(x/(log10(2)/N)). */ + float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + /* r = x - n*log10(2)/N. */ + float64x2_t r = x; + r = vfmsq_f64 (r, d->log2_10_hi, n); + r = vfmsq_f64 (r, d->log2_10_lo, n); + + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + uint64x2_t i = vandq_u64 (u, IndexMask); + + /* y = exp10(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t p = vfmaq_f64 (d->poly[0], r, d->poly[1]); + float64x2_t y = vfmaq_f64 (d->poly[2], r, d->poly[3]); + p = vfmaq_f64 (p, y, r2); + y = vmulq_f64 (r, p); + + /* s = 2^(n/N). 
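+     Writing n = N*m + i with 0 <= i < N gives 2^(n/N) = 2^m * 2^(i/N): the
+     lookup supplies the 2^(i/N) part and the integer add of e folds the
+     2^m factor into the exponent field.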
*/ + u = v_lookup_u64 (__v_exp_data, i); + float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + if (unlikely (v_any_u64 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f64 (s, y, s), cmp); +#else + return special_case (s, y, n, d); +#endif + + return vfmaq_f64 (s, y, s); +} + +#if WANT_EXP10_TESTS +TEST_SIG (S, D, 1, exp10, -9.9, 9.9) +TEST_SIG (V, D, 1, exp10, -9.9, 9.9) +TEST_ULP (V_NAME_D1 (exp10), 1.15) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000) +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10f.c new file mode 100644 index 000000000000..eb0d5dd0d57c --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp10f.c @@ -0,0 +1,147 @@ +/* + * Single-precision vector 10^x function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _GNU_SOURCE +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_poly_f32.h" + +#define ScaleBound 192.0f + +static const struct data +{ + float32x4_t c0, c1, c3; + float log10_2_high, log10_2_low, c2, c4; + float32x4_t inv_log10_2, special_bound; + uint32x4_t exponent_bias, special_offset, special_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t scale_thresh; +#endif +} data = { + /* Coefficients generated using Remez algorithm with minimisation of relative + error. + rel error: 0x1.89dafa3p-24 + abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] + maxerr: 1.85943 +0.5 ulp. */ + .c0 = V4 (0x1.26bb16p+1f), + .c1 = V4 (0x1.5350d2p+1f), + .c2 = 0x1.04744ap+1f, + .c3 = V4 (0x1.2d8176p+0f), + .c4 = 0x1.12b41ap-1f, + .inv_log10_2 = V4 (0x1.a934fp+1), + .log10_2_high = 0x1.344136p-2, + .log10_2_low = 0x1.ec10cp-27, + /* rint (log2 (2^127 / (1 + sqrt (2)))). */ + .special_bound = V4 (126.0f), + .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), +#if !WANT_SIMD_EXCEPT + .scale_thresh = V4 (ScaleBound) +#endif +}; + +#if WANT_SIMD_EXCEPT + +# define SpecialBound 38.0f /* rint(log10(2^127)). */ +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42180000) /* asuint (SpecialBound). */ +# define Thres v_u32 (0x22180000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f32 (exp10f, x, y, cmp); +} + +#else + +# define SpecialBound 126.0f + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. 
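+     Computing r0 with one fma into scale rounds once, whereas r1 rounds the
+     fma result and again on the multiply by s1, which can double-round when
+     the product is subnormal.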
*/ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +/* Fast vector implementation of single-precision exp10. + Algorithm is accurate to 2.36 ULP. + _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11 + want 0x1.7e79cp+11. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); +#if WANT_SIMD_EXCEPT + /* asuint(x) - TinyBound >= BigBound - TinyBound. */ + uint32x4_t cmp = vcgeq_u32 ( + vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = v_zerofy_f32 (x, cmp); +#endif + + /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */ + float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high); + float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2)); + float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0); + r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23); + + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); +#endif + + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2); + float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3); + float32x4_t p14 = vfmaq_f32 (p12, r2, p34); + float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} + +HALF_WIDTH_ALIAS_F1 (exp10) + +#if WANT_EXP10_TESTS +TEST_SIG (S, F, 1, exp10, -9.9, 9.9) +TEST_SIG (V, F, 1, exp10, -9.9, 9.9) +TEST_ULP (V_NAME_F1 (exp10), 1.86) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000) +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2.c new file mode 100644 index 000000000000..63448d806b82 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2.c @@ -0,0 +1,128 @@ +/* + * Double-precision vector 2^x function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "v_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask (N - 1) +#define BigBound 1022.0 +#define UOFlowBound 1280.0 +#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */ + +static const struct data +{ + float64x2_t poly[4]; + float64x2_t shift, scale_big_bound, scale_uoflow_bound; +} data = { + /* Coefficients are computed using Remez algorithm with + minimisation of the absolute error. 
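+     They approximate (2^r - 1) / r, so the leading coefficient is close to
+     ln 2 = 0x1.62e42fefa39efp-1.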
*/ + .poly = { V2 (0x1.62e42fefa3686p-1), V2 (0x1.ebfbdff82c241p-3), + V2 (0x1.c6b09b16de99ap-5), V2 (0x1.3b2abf5571ad8p-7) }, + .shift = V2 (0x1.8p52 / N), + .scale_big_bound = V2 (BigBound), + .scale_uoflow_bound = V2 (UOFlowBound), +}; + +static inline uint64x2_t +lookup_sbits (uint64x2_t i) +{ + return (uint64x2_t){ __v_exp_data[i[0] & IndexMask], + __v_exp_data[i[1] & IndexMask] }; +} + +#if WANT_SIMD_EXCEPT + +# define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */ + +/* Call scalar exp2 as a fallback. */ +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special) +{ + return v_call_f64 (exp2, x, y, is_special); +} + +#else + +# define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +# define SpecialBias1 0x7000000000000000 /* 0x1p769. */ +# define SpecialBias2 0x3010000000000000 /* 0x1p-254. */ + +static inline float64x2_t VPCS_ATTR +special_case (float64x2_t s, float64x2_t y, float64x2_t n, + const struct data *d) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vclezq_f64 (n), v_u64 (SpecialOffset)); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (v_u64 (SpecialBias1), b)); + float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 ( + vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b)); + uint64x2_t cmp = vcagtq_f64 (n, d->scale_uoflow_bound); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, s2, y), s1); + return vbslq_f64 (cmp, r1, r0); +} + +#endif + +/* Fast vector implementation of exp2. + Maximum measured error is 1.65 ulp. + _ZGVnN2v_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1 + want 0x1.f8db0d4df721dp-1. */ +VPCS_ATTR +float64x2_t V_NAME_D1 (exp2) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t cmp; +#if WANT_SIMD_EXCEPT + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres)); + /* Mask special lanes and retain a copy of x for passing to special-case + handler. */ + float64x2_t xc = x; + x = v_zerofy_f64 (x, cmp); +#else + cmp = vcagtq_f64 (x, d->scale_big_bound); +#endif + + /* n = round(x/N). */ + float64x2_t z = vaddq_f64 (d->shift, x); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + /* r = x - n/N. */ + float64x2_t r = vsubq_f64 (x, n); + + /* s = 2^(n/N). */ + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + u = lookup_sbits (u); + float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + /* y ~ exp2(r) - 1. 
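+     = r * P(r), with P the cubic held in poly, so the final s + s * y is
+     approximately s * 2^r = 2^x.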
*/ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = v_pairwise_poly_3_f64 (r, r2, d->poly); + y = vmulq_f64 (r, y); + + if (unlikely (v_any_u64 (cmp))) +#if !WANT_SIMD_EXCEPT + return special_case (s, y, n, d); +#else + return special_case (xc, vfmaq_f64 (s, s, y), cmp); +#endif + return vfmaq_f64 (s, s, y); +} + +TEST_SIG (V, D, 1, exp2, -9.9, 9.9) +TEST_ULP (V_NAME_D1 (exp2), 1.15) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f.c new file mode 100644 index 000000000000..40f6170d3702 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f.c @@ -0,0 +1,122 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +static const struct data +{ + float32x4_t c1, c3; + uint32x4_t exponent_bias, special_offset, special_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t scale_thresh, special_bound; +#endif + float c0, c2, c4, zero; +} data = { + /* maxerr: 1.962 ulp. */ + .c0 = 0x1.59977ap-10f, + .c1 = V4 (0x1.3ce9e4p-7f), + .c2 = 0x1.c6bd32p-5f, + .c3 = V4 (0x1.ebf9bcp-3f), + .c4 = 0x1.62e422p-1f, + .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), +#if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +#endif +}; + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ +# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine for special lanes. */ + return v_call_f32 (exp2f, x, y, cmp); +} + +#else + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + /* asuint(|x|) - TinyBound >= BigBound - TinyBound. 
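+     In unsigned arithmetic this single compare is true exactly when
+     |x| < 0x1p-63 (the subtraction wraps), when |x| >= 0x1p6 (including
+     Inf), or when x is NaN, so one test flags every lane that needs the
+     scalar fallback.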
*/ + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ + float32x4_t n = vrndaq_f32 (x); + float32x4_t r = vsubq_f32 (x, n); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); +#endif + + float32x4_t c024 = vld1q_f32 (&d->c0); + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1); + q = vfmaq_f32 (q, p, r2); + p = vmulq_laneq_f32 (r, c024, 2); + float32x4_t poly = vfmaq_f32 (p, q, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} + +HALF_WIDTH_ALIAS_F1 (exp2) + +TEST_SIG (V, F, 1, exp2, -9.9, 9.9) +TEST_ULP (V_NAME_F1 (exp2), 1.49) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp2), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (exp2), 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (exp2), 0x1p-14, 0x1p8, 500000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f_1u.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f_1u.c new file mode 100644 index 000000000000..1f8e89ab658f --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/exp2f_1u.c @@ -0,0 +1,73 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t c0, c1, c2, c3, c4, c5, shift; + uint32x4_t exponent_bias; + float32x4_t special_bound, scale_thresh; + uint32x4_t special_offset, special_bias; +} data = { + .shift = V4 (0x1.8p23f), + .exponent_bias = V4 (0x3f800000), + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), + /* maxerr: 0.878 ulp. */ + .c0 = V4 (0x1.416b5ep-13f), + .c1 = V4 (0x1.5f082ep-10f), + .c2 = V4 (0x1.3b2dep-7f), + .c3 = V4 (0x1.c6af7cp-5f), + .c4 = V4 (0x1.ebfbdcp-3f), + .c5 = V4 (0x1.62e43p-1f), +}; + +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. 
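+   When |n| also exceeds scale_thresh the result reduces to s1 * s1, which
+   saturates to +Inf (s1 = 0x1p127 for positive n) or flushes to zero
+   (s1 = 0x1p-125 for n <= 0, after the biased add wraps to 0x01000000),
+   raising the expected overflow or underflow exception.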
*/ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r1 = vmulq_f32 (s1, s1); + float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2); + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); +} + +float32x4_t VPCS_ATTR +_ZGVnN4v_exp2f_1u (float32x4_t x) +{ + /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ + const struct data *d = ptr_barrier (&data); + float32x4_t n = vrndaq_f32 (x); + float32x4_t r = x - n; + uint32x4_t e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23; + float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); + + float32x4_t p = vfmaq_f32 (d->c1, d->c0, r); + p = vfmaq_f32 (d->c2, p, r); + p = vfmaq_f32 (d->c3, p, r); + p = vfmaq_f32 (d->c4, p, r); + p = vfmaq_f32 (d->c5, p, r); + p = vfmaq_f32 (v_f32 (1.0f), p, r); + if (unlikely (v_any_u32 (cmp))) + return specialcase (p, n, e, d); + return scale * p; +} + +TEST_ULP (_ZGVnN4v_exp2f_1u, 0.4) +TEST_DISABLE_FENV (_ZGVnN4v_exp2f_1u) +TEST_INTERVAL (_ZGVnN4v_exp2f_1u, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (_ZGVnN4v_exp2f_1u, 0x1p-14, 0x1p8, 500000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/expf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf.c new file mode 100644 index 000000000000..e5b1f020d1a0 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf.c @@ -0,0 +1,130 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +static const struct data +{ + float32x4_t c1, c3, c4, inv_ln2; + float ln2_hi, ln2_lo, c0, c2; + uint32x4_t exponent_bias, special_offset, special_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.45358 +0.5 ulp. */ + .c0 = 0x1.0e4020p-7f, + .c1 = V4 (0x1.573e2ep-5f), + .c2 = 0x1.555e66p-3f, + .c3 = V4 (0x1.fffdb6p-2f), + .c4 = V4 (0x1.ffffecp-1f), + .inv_ln2 = V4 (0x1.715476p+0f), + .ln2_hi = 0x1.62e4p-1f, + .ln2_lo = 0x1.7f7d1cp-20f, + .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), +#if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +#endif +}; + +#define C(i) d->poly[i] + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ +# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f32 (expf, x, y, cmp); +} + +#else + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. 
*/ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + // (s2 + p*s2)*s1 = s2(p+1)s1 + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); + +#if WANT_SIMD_EXCEPT + /* asuint(x) - TinyBound >= BigBound - TinyBound. */ + uint32x4_t cmp = vcgeq_u32 ( + vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), + TinyBound), + SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2)); + float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0); + r = vfmsq_laneq_f32 (r, n, ln2_c02, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); +#endif + + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (d->c4, r); + float32x4_t poly = vfmaq_f32 (p, q, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} + +HALF_WIDTH_ALIAS_F1 (exp) + +TEST_SIG (V, F, 1, exp, -9.9, 9.9) +TEST_ULP (V_NAME_F1 (exp), 1.49) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (exp), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (exp), 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (exp), 0x1p-14, 0x1p8, 500000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/expf_1u.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf_1u.c new file mode 100644 index 000000000000..4e114d810e08 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expf_1u.c @@ -0,0 +1,79 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_math.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t shift, inv_ln2; + uint32x4_t exponent_bias; + float32x4_t c1, c2, c3, c4; + float32x4_t special_bound, scale_thresh; + uint32x4_t special_offset, special_bias; + float ln2_hi, ln2_lo, c0, nothing; +} data = { + .ln2_hi = 0x1.62e4p-1f, + .ln2_lo = 0x1.7f7d1cp-20f, + .shift = V4 (0x1.8p23f), + .inv_ln2 = V4 (0x1.715476p+0f), + .exponent_bias = V4 (0x3f800000), + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), + .special_offset = V4 (0x83000000), + .special_bias = V4 (0x7f000000), + /* maxerr: 0.36565 +0.5 ulp. */ + .c0 = 0x1.6a6000p-10f, + .c1 = V4 (0x1.12718ep-7f), + .c2 = V4 (0x1.555af0p-5f), + .c3 = V4 (0x1.555430p-3f), + .c4 = V4 (0x1.fffff4p-2f), +}; + +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r1 = vmulq_f32 (s1, s1); + float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2); + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); +} + +float32x4_t VPCS_ATTR +_ZGVnN4v_expf_1u (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t ln2_c0 = vld1q_f32 (&d->ln2_hi); + + /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + float32x4_t z = vmulq_f32 (x, d->inv_ln2); + float32x4_t n = vrndaq_f32 (z); + float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c0, 0); + r = vfmsq_laneq_f32 (r, n, ln2_c0, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (e + d->exponent_bias); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c0, 2); + p = vfmaq_f32 (d->c2, p, r); + p = vfmaq_f32 (d->c3, p, r); + p = vfmaq_f32 (d->c4, p, r); + p = vfmaq_f32 (v_f32 (1.0f), p, r); + p = vfmaq_f32 (v_f32 (1.0f), p, r); + if (unlikely (v_any_u32 (cmp))) + return specialcase (p, n, e, d); + return scale * p; +} + +TEST_ULP (_ZGVnN4v_expf_1u, 0.4) +TEST_DISABLE_FENV (_ZGVnN4v_expf_1u) +TEST_INTERVAL (_ZGVnN4v_expf_1u, 0, 0xffff0000, 10000) +TEST_SYM_INTERVAL (_ZGVnN4v_expf_1u, 0x1p-14, 0x1p8, 500000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c new file mode 100644 index 000000000000..7535a1830427 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1.c @@ -0,0 +1,77 @@ +/* + * Double-precision vector exp(x) - 1 function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1_inline.h" + +static const struct data +{ + struct v_expm1_data d; +#if WANT_SIMD_EXCEPT + uint64x2_t thresh, tiny_bound; +#else + float64x2_t oflow_bound; +#endif +} data = { + .d = V_EXPM1_DATA, +#if WANT_SIMD_EXCEPT + /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs + compare. */ + .thresh = V2 (0x78c56fa6d34b552), + /* asuint64(0x1p-51) << 1. 
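+     Doubling a bit pattern shifts out the sign bit: for example both 3.0
+     (0x4008000000000000) and -3.0 (0xc008000000000000) double to
+     0x8010000000000000, which is why ix + ix below behaves as an absolute
+     value for the range check.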
*/ + .tiny_bound = V2 (0x3cc0000000000000 << 1), +#else + /* Value above which expm1(x) should overflow. Absolute value of the + underflow bound is greater than this, so it catches both cases - there is + a small window where fallbacks are triggered unnecessarily. */ + .oflow_bound = V2 (0x1.62b7d369a5aa9p+9), +#endif +}; + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, uint64x2_t special, const struct data *d) +{ + return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d), + special); +} + +/* Double-precision vector exp(x) - 1 function. + The maximum error observed error is 2.05 ULP: + _ZGVnN2v_expm1(0x1.6329669eb8c87p-2) got 0x1.a8897eef87b34p-2 + want 0x1.a8897eef87b32p-2. */ +float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + uint64x2_t ix = vreinterpretq_u64_f64 (x); + /* If fp exceptions are to be triggered correctly, fall back to scalar for + |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for + shift-left by 1, and compare with thresh which was left-shifted offline - + this is effectively an absolute compare. */ + uint64x2_t special + = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh); +#else + /* Large input, NaNs and Infs. */ + uint64x2_t special = vcageq_f64 (x, d->oflow_bound); +#endif + + if (unlikely (v_any_u64 (special))) + return special_case (x, special, d); + + /* expm1(x) ~= p * t + (t - 1). */ + return expm1_inline (x, &d->d); +} + +TEST_SIG (V, D, 1, expm1, -9.9, 9.9) +TEST_ULP (V_NAME_D1 (expm1), 1.56) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000) +TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000) +TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c new file mode 100644 index 000000000000..6d4431dcd8a5 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/expm1f.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector exp(x) - 1 function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1f_inline.h" + +static const struct data +{ + struct v_expm1f_data d; +#if WANT_SIMD_EXCEPT + uint32x4_t thresh; +#else + float32x4_t oflow_bound; +#endif +} data = { + .d = V_EXPM1F_DATA, +#if !WANT_SIMD_EXCEPT + /* Value above which expm1f(x) should overflow. Absolute value of the + underflow bound is greater than this, so it catches both cases - there is + a small window where fallbacks are triggered unnecessarily. */ + .oflow_bound = V4 (0x1.5ebc4p+6), +#else + /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute + compare. */ + .thresh = V4 (0x1d5ebc40), +#endif +}; + +/* asuint(0x1p-23), shifted by 1 for abs compare. */ +#define TinyBound v_u32 (0x34000000 << 1) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, uint32x4_t special, const struct data *d) +{ + return v_call_f32 ( + expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special); +} + +/* Single-precision vector exp(x) - 1 function. + The maximum error is 1.62 ULP: + _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2 + want 0x1.da9f44p-2. 
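+   The p * t + (t - 1) form used below follows from writing x = k*ln2 + r
+   with t = 2^k: e^x - 1 = t*e^r - 1 = t*(e^r - 1) + (t - 1), where p
+   approximates e^r - 1.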
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + uint32x4_t ix = vreinterpretq_u32_f32 (x); + /* If fp exceptions are to be triggered correctly, fall back to scalar for + |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for + shift-left by 1, and compare with thresh which was left-shifted offline - + this is effectively an absolute compare. */ + uint32x4_t special + = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh); +#else + /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */ + uint32x4_t special = vcagtq_f32 (x, d->oflow_bound); +#endif + + if (unlikely (v_any_u32 (special))) + return special_case (x, special, d); + + /* expm1(x) ~= p * t + (t - 1). */ + return expm1f_inline (x, &d->d); +} + +HALF_WIDTH_ALIAS_F1 (expm1) + +TEST_SIG (V, F, 1, expm1, -9.9, 9.9) +TEST_ULP (V_NAME_F1 (expm1), 1.13) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000) +TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000) +TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000) +TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000) +TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h new file mode 100644 index 000000000000..0c8350a1a77b --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/finite_pow.h @@ -0,0 +1,361 @@ +/* + * Double-precision x^y function. + * + * Copyright (c) 2018-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Scalar version of pow used for fallbacks in vector implementations. */ + +/* Data is defined in v_pow_log_data.c. */ +#define N_LOG (1 << V_POW_LOG_TABLE_BITS) +#define Off 0x3fe6955500000000 +#define As __v_pow_log_data.poly + +/* Data is defined in v_pow_exp_data.c. */ +#define N_EXP (1 << V_POW_EXP_TABLE_BITS) +#define SignBias (0x800 << V_POW_EXP_TABLE_BITS) +#define SmallExp 0x3c9 /* top12(0x1p-54). */ +#define BigExp 0x408 /* top12(512.0). */ +#define ThresExp 0x03f /* BigExp - SmallExp. */ +#define InvLn2N __v_pow_exp_data.n_over_ln2 +#define Ln2HiN __v_pow_exp_data.ln2_over_n_hi +#define Ln2LoN __v_pow_exp_data.ln2_over_n_lo +#define SBits __v_pow_exp_data.sbits +#define Cs __v_pow_exp_data.poly + +/* Constants associated with pow. */ +#define SmallPowX 0x001 /* top12(0x1p-126). */ +#define BigPowX 0x7ff /* top12(INFINITY). */ +#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */ +#define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */ +#define BigPowY 0x43e /* top12(0x1.749p62). */ +#define ThresPowY 0x080 /* BigPowY - SmallPowY. */ + +/* Top 12 bits of a double (sign and exponent bits). */ +static inline uint32_t +top12 (double x) +{ + return asuint64 (x) >> 52; +} + +/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about + additional 15 bits precision. IX is the bit representation of x, but + normalized in the subnormal range using the sign bit for the exponent. */ +static inline double +log_inline (uint64_t ix, double *tail) +{ + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. 
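+   For example x = 10.0 splits as 2^3 * 1.25, so k = 3 and z = 1.25, and i is
+   read from the top mantissa bits of tmp = ix - Off below.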
*/ + uint64_t tmp = ix - Off; + int i = (tmp >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1); + int k = (int64_t) tmp >> 52; /* arithmetic shift. */ + uint64_t iz = ix - (tmp & 0xfffULL << 52); + double z = asdouble (iz); + double kd = (double) k; + + /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ + double invc = __v_pow_log_data.invc[i]; + double logc = __v_pow_log_data.logc[i]; + double logctail = __v_pow_log_data.logctail[i]; + + /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ + double r = fma (z, invc, -1.0); + + /* k*Ln2 + log(c) + r. */ + double t1 = kd * __v_pow_log_data.ln2_hi + logc; + double t2 = t1 + r; + double lo1 = kd * __v_pow_log_data.ln2_lo + logctail; + double lo2 = t1 - t2 + r; + + /* Evaluation is optimized assuming superscalar pipelined execution. */ + double ar = As[0] * r; + double ar2 = r * ar; + double ar3 = r * ar2; + /* k*Ln2 + log(c) + r + A[0]*r*r. */ + double hi = t2 + ar2; + double lo3 = fma (ar, r, -ar2); + double lo4 = t2 - hi + ar2; + /* p = log1p(r) - r - A[0]*r*r. */ + double p = (ar3 + * (As[1] + r * As[2] + + ar2 * (As[3] + r * As[4] + ar2 * (As[5] + r * As[6])))); + double lo = lo1 + lo2 + lo3 + lo4 + p; + double y = hi + lo; + *tail = hi - y + lo; + return y; +} + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflown into the sign bit so that needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale, positive k here means the result may overflow and + negative k means the result may underflow. */ +static inline double +special_case (double tmp, uint64_t sbits, uint64_t ki) +{ + double scale, y; + + if ((ki & 0x80000000) == 0) + { + /* k > 0, the exponent of scale might have overflowed by <= 460. */ + sbits -= 1009ull << 52; + scale = asdouble (sbits); + y = 0x1p1009 * (scale + scale * tmp); + return y; + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + /* Note: sbits is signed scale. */ + scale = asdouble (sbits); + y = scale + scale * tmp; +#if WANT_SIMD_EXCEPT + if (fabs (y) < 1.0) + { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double hi, lo, one = 1.0; + if (y < 0.0) + one = -1.0; + lo = scale - y + scale * tmp; + hi = one + y; + lo = one - hi + y + lo; + y = (hi + lo) - one; + /* Fix the sign of 0. */ + if (y == 0.0) + y = asdouble (sbits & 0x8000000000000000); + /* The underflow exception needs to be signaled explicitly. */ + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + } +#endif + y = 0x1p-1022 * y; + return y; +} + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */ +static inline double +exp_inline (double x, double xtail, uint32_t sign_bias) +{ + uint32_t abstop = top12 (x) & 0x7ff; + if (unlikely (abstop - SmallExp >= ThresExp)) + { + if (abstop - SmallExp >= 0x80000000) + { + /* Avoid spurious underflow for tiny x. */ + /* Note: 0 is common input. */ + return sign_bias ? 
-1.0 : 1.0; + } + if (abstop >= top12 (1024.0)) + { + /* Note: inf and nan are already handled. */ + /* Skip errno handling. */ +#if WANT_SIMD_EXCEPT + return asuint64 (x) >> 63 ? __math_uflow (sign_bias) + : __math_oflow (sign_bias); +#else + double res_uoflow = asuint64 (x) >> 63 ? 0.0 : INFINITY; + return sign_bias ? -res_uoflow : res_uoflow; +#endif + } + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + double z = InvLn2N * x; + double kd = round (z); + uint64_t ki = lround (z); + double r = x - kd * Ln2HiN - kd * Ln2LoN; + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r += xtail; + /* 2^(k/N) ~= scale. */ + uint64_t idx = ki & (N_EXP - 1); + uint64_t top = (ki + sign_bias) << (52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + uint64_t sbits = SBits[idx] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + /* Evaluation is optimized assuming superscalar pipelined execution. */ + double r2 = r * r; + double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]); + if (unlikely (abstop == 0)) + return special_case (tmp, sbits, ki); + double scale = asdouble (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return scale + scale * tmp; +} + +/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + A version of exp_inline that is not inlined and for which sign_bias is + equal to 0. */ +static double NOINLINE +exp_nosignbias (double x, double xtail) +{ + uint32_t abstop = top12 (x) & 0x7ff; + if (unlikely (abstop - SmallExp >= ThresExp)) + { + /* Avoid spurious underflow for tiny x. */ + if (abstop - SmallExp >= 0x80000000) + return 1.0; + /* Note: inf and nan are already handled. */ + if (abstop >= top12 (1024.0)) +#if WANT_SIMD_EXCEPT + return asuint64 (x) >> 63 ? __math_uflow (0) : __math_oflow (0); +#else + return asuint64 (x) >> 63 ? 0.0 : INFINITY; +#endif + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */ + double z = InvLn2N * x; + double kd = round (z); + uint64_t ki = lround (z); + double r = x - kd * Ln2HiN - kd * Ln2LoN; + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r += xtail; + /* 2^(k/N) ~= scale. */ + uint64_t idx = ki & (N_EXP - 1); + uint64_t top = ki << (52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + uint64_t sbits = SBits[idx] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */ + double r2 = r * r; + double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]); + if (unlikely (abstop == 0)) + return special_case (tmp, sbits, ki); + double scale = asdouble (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return scale + scale * tmp; +} + +/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is + the bit representation of a non-zero finite floating-point value. 
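+   For example checkint (asuint64 (2.5)) = 0, checkint (asuint64 (3.0)) = 1
+   and checkint (asuint64 (2.0)) = 2.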
*/ +static inline int +checkint (uint64_t iy) +{ + int e = iy >> 52 & 0x7ff; + if (e < 0x3ff) + return 0; + if (e > 0x3ff + 52) + return 2; + if (iy & ((1ULL << (0x3ff + 52 - e)) - 1)) + return 0; + if (iy & (1ULL << (0x3ff + 52 - e))) + return 1; + return 2; +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline int +zeroinfnan (uint64_t i) +{ + return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1; +} + +static double NOINLINE +pow_scalar_special_case (double x, double y) +{ + uint32_t sign_bias = 0; + uint64_t ix, iy; + uint32_t topx, topy; + + ix = asuint64 (x); + iy = asuint64 (y); + topx = top12 (x); + topy = top12 (y); + if (unlikely (topx - SmallPowX >= ThresPowX + || (topy & 0x7ff) - SmallPowY >= ThresPowY)) + { + /* Note: if |y| > 1075 * ln2 * 2^53 ~= 0x1.749p62 then pow(x,y) = inf/0 + and if |y| < 2^-54 / 1075 ~= 0x1.e7b6p-65 then pow(x,y) = +-1. */ + /* Special cases: (x < 0x1p-126 or inf or nan) or + (|y| < 0x1p-65 or |y| >= 0x1p63 or nan). */ + if (unlikely (zeroinfnan (iy))) + { + if (2 * iy == 0) + return issignaling_inline (x) ? x + y : 1.0; + if (ix == asuint64 (1.0)) + return issignaling_inline (y) ? x + y : 1.0; + if (2 * ix > 2 * asuint64 (INFINITY) + || 2 * iy > 2 * asuint64 (INFINITY)) + return x + y; + if (2 * ix == 2 * asuint64 (1.0)) + return 1.0; + if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63)) + return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */ + return y * y; + } + if (unlikely (zeroinfnan (ix))) + { + double x2 = x * x; + if (ix >> 63 && checkint (iy) == 1) + { + x2 = -x2; + sign_bias = 1; + } +#if WANT_SIMD_EXCEPT + if (2 * ix == 0 && iy >> 63) + return __math_divzero (sign_bias); +#endif + return iy >> 63 ? 1 / x2 : x2; + } + /* Here x and y are non-zero finite. */ + if (ix >> 63) + { + /* Finite x < 0. */ + int yint = checkint (iy); + if (yint == 0) +#if WANT_SIMD_EXCEPT + return __math_invalid (x); +#else + return __builtin_nan (""); +#endif + if (yint == 1) + sign_bias = SignBias; + ix &= 0x7fffffffffffffff; + topx &= 0x7ff; + } + if ((topy & 0x7ff) - SmallPowY >= ThresPowY) + { + /* Note: sign_bias == 0 here because y is not odd. */ + if (ix == asuint64 (1.0)) + return 1.0; + /* |y| < 2^-65, x^y ~= 1 + y*log(x). */ + if ((topy & 0x7ff) < SmallPowY) + return 1.0; +#if WANT_SIMD_EXCEPT + return (ix > asuint64 (1.0)) == (topy < 0x800) ? __math_oflow (0) + : __math_uflow (0); +#else + return (ix > asuint64 (1.0)) == (topy < 0x800) ? INFINITY : 0; +#endif + } + if (topx == 0) + { + /* Normalize subnormal x so exponent becomes negative. */ + ix = asuint64 (x * 0x1p52); + ix &= 0x7fffffffffffffff; + ix -= 52ULL << 52; + } + } + + double lo; + double hi = log_inline (ix, &lo); + double ehi = y * hi; + double elo = y * lo + fma (y, hi, -ehi); + return exp_inline (ehi, elo, sign_bias); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/hypot.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypot.c new file mode 100644 index 000000000000..dc01ed5bac93 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypot.c @@ -0,0 +1,95 @@ +/* + * Double-precision vector hypot(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#if WANT_SIMD_EXCEPT +static const struct data +{ + uint64x2_t tiny_bound, thres; +} data = { + .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). 
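+     Together with thres below, lanes where |x| or |y| lies outside
+     [0x1p-511, 0x1p511), or is NaN, take the scalar fallback so that
+     squaring cannot spuriously underflow or overflow.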
*/ + .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */ +}; +#else +static const struct data +{ + uint64x2_t tiny_bound; + uint32x4_t thres; +} data = { + .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */ + .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */ +}; +#endif + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum, + uint32x2_t special) +{ + return v_call2_f64 (hypot, x, y, vsqrtq_f64 (sqsum), vmovl_u32 (special)); +} + +/* Vector implementation of double-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222) + got 0x1.6a1b19400964ep-204 + want 0x1.6a1b19400964dp-204. */ +#if WANT_SIMD_EXCEPT + +float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + float64x2_t ay = vabsq_f64 (y); + + uint64x2_t ix = vreinterpretq_u64_f64 (ax); + uint64x2_t iy = vreinterpretq_u64_f64 (ay); + + /* Extreme values, NaNs, and infinities should be handled by the scalar + fallback for correct flag handling. */ + uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres); + uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres); + ax = v_zerofy_f64 (ax, specialx); + ay = v_zerofy_f64 (ay, specialy); + uint32x2_t special = vaddhn_u64 (specialx, specialy); + + float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay); + + if (unlikely (v_any_u32h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f64 (sqsum); +} +#else + +float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y); + + uint32x2_t special + = vcge_u32 (vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound), + vget_low_u32 (d->thres)); + + if (unlikely (v_any_u32h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f64 (sqsum); +} +#endif + +TEST_SIG (V, D, 2, hypot, -10.0, 10.0) +TEST_ULP (V_NAME_D2 (hypot), 1.21) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT) +TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000) +TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000) +TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000) +TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/hypotf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypotf.c new file mode 100644 index 000000000000..69634875be5a --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/hypotf.c @@ -0,0 +1,96 @@ +/* + * Single-precision vector hypot(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#if WANT_SIMD_EXCEPT +static const struct data +{ + uint32x4_t tiny_bound, thres; +} data = { + .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */ + .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */ +}; +#else +static const struct data +{ + uint32x4_t tiny_bound; + uint16x8_t thres; +} data = { + .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */ + .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. 
*/ +}; +#endif + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum, + uint16x4_t special) +{ + return v_call2_f32 (hypotf, x, y, vsqrtq_f32 (sqsum), vmovl_u16 (special)); +} + +/* Vector implementation of single-precision hypot. + Maximum error observed is 1.21 ULP: + _ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13 + want 0x1.6a41dp-13. */ +#if WANT_SIMD_EXCEPT + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) +{ + const struct data *d = ptr_barrier (&data); + + float32x4_t ax = vabsq_f32 (x); + float32x4_t ay = vabsq_f32 (y); + + uint32x4_t ix = vreinterpretq_u32_f32 (ax); + uint32x4_t iy = vreinterpretq_u32_f32 (ay); + + /* Extreme values, NaNs, and infinities should be handled by the scalar + fallback for correct flag handling. */ + uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres); + uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres); + ax = v_zerofy_f32 (ax, specialx); + ay = v_zerofy_f32 (ay, specialy); + uint16x4_t special = vaddhn_u32 (specialx, specialy); + + float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay); + + if (unlikely (v_any_u16h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f32 (sqsum); +} +#else + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) +{ + const struct data *d = ptr_barrier (&data); + + float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y); + + uint16x4_t special + = vcge_u16 (vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound), + vget_low_u16 (d->thres)); + + if (unlikely (v_any_u16h (special))) + return special_case (x, y, sqsum, special); + + return vsqrtq_f32 (sqsum); +} +#endif + +HALF_WIDTH_ALIAS_F2 (hypot) + +TEST_SIG (V, F, 2, hypot, -10.0, 10.0) +TEST_ULP (V_NAME_F2 (hypot), 1.21) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT) +TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000) +TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000) +TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000) +TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log.c new file mode 100644 index 000000000000..94e3f4482079 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log.c @@ -0,0 +1,118 @@ +/* + * Double-precision vector log(x) function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +static const struct data +{ + uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; + float64x2_t c0, c2; + double c1, c3, ln2, c4; +} data = { + /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ + .c0 = V2 (-0x1.ffffffffffff7p-2), + .c1 = 0x1.55555555170d4p-2, + .c2 = V2 (-0x1.0000000399c27p-2), + .c3 = 0x1.999b2e90e94cap-3, + .c4 = -0x1.554e550bd501ep-3, + .ln2 = 0x1.62e42fefa39efp-1, + .sign_exp_mask = V2 (0xfff0000000000000), + .off = V2 (0x3fe6900900000000), + /* Lower bound is 0x0010000000000000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound - offset (which wraps around). */ + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-126). 
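+     (the compare uses only the top 32 bits, and for this double-precision
+     routine the lower bound is the smallest normal 0x1p-1022, giving
+     0x7ff00000 - 0x00100000).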
*/ +}; + +#define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t logc; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + /* Since N is a power of 2, n % N = n & (N - 1). */ + struct entry e; + uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; +} + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, + uint32x2_t special, const struct data *d) +{ + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); + return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); +} + +/* Double-precision vector log routine. + The maximum observed error is 2.17 ULP: + _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 + want 0x1.ffffff1cca045p-2. */ +float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint64x2_t u = vreinterpretq_u64_f64 (x); + uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (u_off); + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), + vget_low_u32 (d->special_bound)); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2); + float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); + y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1); + y = vfmaq_f64 (p, r2, y); + + if (unlikely (v_any_u32h (special))) + return special_case (hi, u_off, y, r2, special, d); + return vfmaq_f64 (hi, y, r2); +} + +TEST_SIG (V, D, 1, log, 0.01, 11.1) +TEST_ULP (V_NAME_D1 (log), 1.67) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_D1 (log), 0, 0xffff000000000000, 10000) +TEST_INTERVAL (V_NAME_D1 (log), 0x1p-4, 0x1p4, 400000) +TEST_INTERVAL (V_NAME_D1 (log), 0, inf, 400000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c new file mode 100644 index 000000000000..c2b8f1c54f0e --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10.c @@ -0,0 +1,132 @@ +/* + * Double-precision vector log10(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; + double invln10, log10_2; + double c1, c3; + float64x2_t c0, c2, c4; +} data = { + /* Computed from log coefficients divided by log(10) then rounded to double + precision. */ + .c0 = V2 (-0x1.bcb7b1526e506p-3), + .c1 = 0x1.287a7636be1d1p-3, + .c2 = V2 (-0x1.bcb7b158af938p-4), + .c3 = 0x1.63c78734e6d07p-4, + .c4 = V2 (-0x1.287461742fee4p-4), + .invln10 = 0x1.bcb7b1526e50ep-2, + .log10_2 = 0x1.34413509f79ffp-2, + .off = V2 (0x3fe6900900000000), + .sign_exp_mask = V2 (0xfff0000000000000), + /* Lower bound is 0x0010000000000000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound - offset (which wraps around). */ + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */ +}; + +#define N (1 << V_LOG10_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t log10c; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + struct entry e; + uint64_t i0 + = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; + uint64_t i1 + = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.log10c = vuzp2q_f64 (e0, e1); + return e; +} + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, + uint32x2_t special, const struct data *d) +{ + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); + return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); +} + +/* Fast implementation of double-precision vector log10 + is a slight modification of double-precision vector log. + Max ULP error: < 2.5 ulp (nearest rounding.) + Maximum measured at 2.46 ulp for x in [0.96, 0.97] + _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6 + want 0x1.fff6be3cae4b9p-6. */ +float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint64x2_t u = vreinterpretq_u64_f64 (x); + uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (u_off); + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), + vget_low_u32 (d->special_bound)); + + /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + + /* hi = r / log(10) + log10(c) + k*log10(2). + Constants in v_log10_data.c are computed (in extended precision) as + e.log10c := e.logc * invln10. 
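+ Storing invln10 and log10_2 adjacently in the data struct means the single
+ vld1q_f64 below yields both constants as lanes of one vector, so each term
+ can be folded in with vfmaq_laneq_f64 instead of keeping two separate
+ constant registers live.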
*/ + float64x2_t cte = vld1q_f64 (&d->invln10); + float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0); + + /* y = log10(1+r) + n * log10(2). */ + hi = vfmaq_laneq_f64 (hi, kd, cte, 1); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); + y = vfmaq_f64 (y, d->c4, r2); + y = vfmaq_f64 (p, y, r2); + + if (unlikely (v_any_u32h (special))) + return special_case (hi, u_off, y, r2, special, d); + return vfmaq_f64 (hi, y, r2); +} + +TEST_SIG (V, D, 1, log10, 0.01, 11.1) +TEST_ULP (V_NAME_D1 (log10), 1.97) +TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000) +TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000) +TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000) +TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c new file mode 100644 index 000000000000..907c1051e086 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log10f.c @@ -0,0 +1,106 @@ +/* + * Single-precision vector log10 function. + * + * Copyright (c) 2020-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t c0, c2, c4, c6, inv_ln10, ln2; + uint32x4_t off, offset_lower_bound; + uint16x8_t special_bound; + uint32x4_t mantissa_mask; + float c1, c3, c5, c7; +} data = { + /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in + [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ + .c0 = V4 (-0x1.bcb79cp-3f), + .c1 = 0x1.2879c8p-3f, + .c2 = V4 (-0x1.bcd472p-4f), + .c3 = 0x1.6408f8p-4f, + .c4 = V4 (-0x1.246f8p-4f), + .c5 = 0x1.f0e514p-5f, + .c6 = V4 (-0x1.0fc92cp-4f), + .c7 = 0x1.f5f76ap-5f, + .ln2 = V4 (0x1.62e43p-1f), + .inv_ln10 = V4 (0x1.bcb7b2p-2f), + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff), +}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2, + uint16x4_t cmp, const struct data *d) +{ + /* Fall back to scalar code. */ + return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), + vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); +} + +/* Fast implementation of AdvSIMD log10f, + uses a similar approach as AdvSIMD logf with the same offset (i.e., 2/3) and + an order 9 polynomial. + Maximum error: 3.305ulps (nearest rounding.) + _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4. 
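+ The argument reduction below can be modelled in scalar code as
+ (illustrative only, with asuint/asfloat denoting bit casts):
+     uint32_t u = asuint (x) - 0x3f2aaaab;
+     int32_t n = (int32_t) u >> 23;
+     float r = asfloat ((u & 0x007fffff) + 0x3f2aaaab) - 1.0f;
+ so that x = 2^n * (1 + r) with 2/3 < 1 + r < 4/3.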
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t c1357 = vld1q_f32 (&d->c1); + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint32x4_t u_off = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u_off = vsubq_u32 (u_off, d->off); + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ + + uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log10(1+r) + n * log10(2). */ + float32x4_t r2 = vmulq_f32 (r, r); + + float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); + float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); + float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); + float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); + + float32x4_t p47 = vfmaq_f32 (c45, r2, c67); + float32x4_t p27 = vfmaq_f32 (c23, r2, p47); + float32x4_t poly = vfmaq_f32 (c01, r2, p27); + + /* y = Log10(2) * n + poly * InvLn(10). */ + float32x4_t y = vfmaq_f32 (r, d->ln2, n); + y = vmulq_f32 (y, d->inv_ln10); + + if (unlikely (v_any_u16h (special))) + return special_case (y, u_off, poly, r2, special, d); + return vfmaq_f32 (y, poly, r2); +} + +HALF_WIDTH_ALIAS_F1 (log10) + +TEST_SIG (V, F, 1, log10, 0.01, 11.1) +TEST_ULP (V_NAME_F1 (log10), 2.81) +TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100) +TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100) +TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000) +TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c new file mode 100644 index 000000000000..42a0c5793920 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1p.c @@ -0,0 +1,61 @@ +/* + * Double-precision vector log(1+x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +#define WANT_V_LOG1P_K0_SHORTCUT 0 +#include "v_log1p_inline.h" + +const static struct data +{ + struct v_log1p_data d; + uint64x2_t inf, minus_one; +} data = { .d = V_LOG1P_CONSTANTS_TABLE, + .inf = V2 (0x7ff0000000000000), + .minus_one = V2 (0xbff0000000000000) }; + +#define BottomMask v_u64 (0xffffffff) + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, uint64x2_t cmp, const struct data *d) +{ + /* Side-step special lanes so fenv exceptions are not triggered + inadvertently. */ + float64x2_t x_nospecial = v_zerofy_f64 (x, cmp); + return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp); +} + +/* Vector log1p approximation using polynomial on reduced interval. Routine is + a modification of the algorithm used in scalar log1p, with no shortcut for + k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP: + _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2 + want 0x1.fd61d0727429fp+2 . 
*/ +VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); + + uint64x2_t special_cases + = vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one)); + + if (unlikely (v_any_u64 (special_cases))) + return special_case (x, special_cases, d); + + return log1p_inline (x, &d->d); +} + +TEST_SIG (V, D, 1, log1p, -0.9, 10.0) +TEST_ULP (V_NAME_D1 (log1p), 1.95) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000) +TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c new file mode 100644 index 000000000000..94b90249128f --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log1pf.c @@ -0,0 +1,92 @@ +/* + * Single-precision vector log(1+x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_log1pf_inline.h" + +#if WANT_SIMD_EXCEPT + +const static struct data +{ + uint32x4_t minus_one, thresh; + struct v_log1pf_data d; +} data = { + .d = V_LOG1PF_CONSTANTS_TABLE, + .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */ + .minus_one = V4 (0xbf800000), +}; + +/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ +# define TinyBound v_u32 (0x34000000) + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, uint32x4_t cmp, const struct data *d) +{ + /* Side-step special lanes so fenv exceptions are not triggered + inadvertently. */ + float32x4_t x_nospecial = v_zerofy_f32 (x, cmp); + return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp); +} + +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case + error is 1.69 ULP: + _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3 + want 0x1.cfcbdcp-3. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); + + uint32x4_t special_cases + = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh), + vcgeq_u32 (ix, d->minus_one)); + + if (unlikely (v_any_u32 (special_cases))) + return special_case (x, special_cases, d); + + return log1pf_inline (x, &d->d); +} + +#else + +const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE; + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, uint32x4_t cmp) +{ + return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp); +} + +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case + error is 1.63 ULP: + _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3 + want 0x1.fdcb16p-3. 
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log1p) (float32x4_t x) +{ + uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)), + vcaleq_f32 (x, v_f32 (0x1p127f))); + + if (unlikely (v_any_u32 (special_cases))) + return special_case (x, special_cases); + + return log1pf_inline (x, ptr_barrier (&data)); +} + +#endif + +HALF_WIDTH_ALIAS_F1 (log1p) + +TEST_SIG (V, F, 1, log1p, -0.9, 10.0) +TEST_ULP (V_NAME_F1 (log1p), 1.20) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000) +TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000) +TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000) +TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c new file mode 100644 index 000000000000..7d2e44dad2c9 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2.c @@ -0,0 +1,123 @@ +/* + * Double-precision vector log2 function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + uint64x2_t off, sign_exp_mask, offset_lower_bound; + uint32x4_t special_bound; + float64x2_t c0, c2; + double c1, c3, invln2, c4; +} data = { + /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9 + and N = 128, then scaled by log2(e) in extended precision and rounded back + to double precision. */ + .c0 = V2 (-0x1.71547652b8300p-1), + .c1 = 0x1.ec709dc340953p-2, + .c2 = V2 (-0x1.71547651c8f35p-2), + .c3 = 0x1.2777ebe12dda5p-2, + .c4 = -0x1.ec738d616fe26p-3, + .invln2 = 0x1.71547652b82fep0, + .off = V2 (0x3fe6900900000000), + .sign_exp_mask = V2 (0xfff0000000000000), + /* Lower bound is 0x0010000000000000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound - offset (which wraps around). */ + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */ +}; + +#define N (1 << V_LOG2_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t log2c; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + struct entry e; + uint64_t i0 + = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; + uint64_t i1 + = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG2_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.log2c = vuzp2q_f64 (e0, e1); + return e; +} + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, + uint32x2_t special, const struct data *d) +{ + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); + return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); +} + +/* Double-precision vector log2 routine. Implements the same algorithm as + vector log10, with coefficients and table entries scaled in extended + precision. The maximum observed error is 2.58 ULP: + _ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. 
*/ +float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint64x2_t u = vreinterpretq_u64_f64 (x); + uint64x2_t u_off = vsubq_u64 (u, d->off); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (u_off); + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), + vget_low_u32 (d->special_bound)); + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); + + float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2); + float64x2_t hi + = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0); + + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); + y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1); + y = vfmaq_f64 (p, r2, y); + + if (unlikely (v_any_u32h (special))) + return special_case (hi, u_off, y, r2, special, d); + return vfmaq_f64 (hi, y, r2); +} + +TEST_SIG (V, D, 1, log2, 0.01, 11.1) +TEST_ULP (V_NAME_D1 (log2), 2.09) +TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100) +TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000) +TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c new file mode 100644 index 000000000000..3053c64bc552 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/log2f.c @@ -0,0 +1,102 @@ +/* + * Single-precision vector log2 function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t c0, c2, c4, c6, c8; + uint32x4_t off, offset_lower_bound; + uint16x8_t special_bound; + uint32x4_t mantissa_mask; + float c1, c3, c5, c7; +} data = { + /* Coefficients generated using Remez algorithm approximate + log2(1+r)/r for r in [ -1/3, 1/3 ]. + rel error: 0x1.c4c4b0cp-26. */ + .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */ + .c1 = -0x1.715458p-1f, + .c2 = V4 (0x1.ec701cp-2f), + .c3 = -0x1.7171a4p-2f, + .c4 = V4 (0x1.27a0b8p-2f), + .c5 = -0x1.e5143ep-3f, + .c6 = V4 (0x1.9d8ecap-3f), + .c7 = -0x1.c675bp-3f, + .c8 = V4 (0x1.9e495p-3f), + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. 
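+ This is 2/3 rounded to single precision; reducing with an offset near 2/3
+ keeps r in [-1/3, 1/3], matching the interval quoted for the polynomial
+ above.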
*/ + .mantissa_mask = V4 (0x007fffff), +}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r, + uint16x4_t cmp, const struct data *d) +{ + /* Fall back to scalar code. */ + return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), + vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); +} + +/* Fast implementation for single precision AdvSIMD log2, + relies on same argument reduction as AdvSIMD logf. + Maximum error: 2.48 ULPs + _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint32x4_t u_off = vreinterpretq_u32_f32 (x); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u_off = vsubq_u32 (u_off, d->off); + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ + + uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log2(1+r) + n. */ + float32x4_t r2 = vmulq_f32 (r, r); + + float32x4_t c1357 = vld1q_f32 (&d->c1); + float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); + float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); + float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); + float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); + float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8); + float32x4_t p48 = vfmaq_f32 (c45, r2, p68); + float32x4_t p28 = vfmaq_f32 (c23, r2, p48); + float32x4_t p = vfmaq_f32 (c01, r2, p28); + + if (unlikely (v_any_u16h (special))) + return special_case (n, u_off, p, r, special, d); + return vfmaq_f32 (n, p, r); +} + +HALF_WIDTH_ALIAS_F1 (log2) + +TEST_SIG (V, F, 1, log2, 0.01, 11.1) +TEST_ULP (V_NAME_F1 (log2), 1.99) +TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100) +TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000) +TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000) +TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000) +TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000) +TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c new file mode 100644 index 000000000000..84705fad05ee --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/logf.c @@ -0,0 +1,88 @@ +/* + * Single-precision vector log function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +static const struct data +{ + float32x4_t c2, c4, c6, ln2; + uint32x4_t off, offset_lower_bound, mantissa_mask; + uint16x8_t special_bound; + float c1, c3, c5, c0; +} data = { + /* 3.34 ulp error. */ + .c0 = -0x1.3e737cp-3f, + .c1 = 0x1.5a9aa2p-3f, + .c2 = V4 (-0x1.4f9934p-3f), + .c3 = 0x1.961348p-3f, + .c4 = V4 (-0x1.00187cp-2f), + .c5 = 0x1.555d7cp-2f, + .c6 = V4 (-0x1.ffffc8p-2f), + .ln2 = V4 (0x1.62e43p-1f), + /* Lower bound is the smallest positive normal float 0x00800000. 
For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff) +}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, + uint16x4_t cmp, const struct data *d) +{ + /* Fall back to scalar code. */ + return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), + vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); +} + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t c1350 = vld1q_f32 (&d->c1); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ + uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log(1+r) + n*ln2. */ + float32x4_t r2 = vmulq_f32 (r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ + float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0); + float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1); + float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2); + p = vfmaq_laneq_f32 (p, r2, c1350, 3); + + q = vfmaq_f32 (q, p, r2); + y = vfmaq_f32 (y, q, r2); + p = vfmaq_f32 (r, d->ln2, n); + + if (unlikely (v_any_u16h (cmp))) + return special_case (p, u_off, y, r2, cmp, d); + return vfmaq_f32 (p, y, r2); +} + +HALF_WIDTH_ALIAS_F1 (log) + +TEST_SIG (V, F, 1, log, 0.01, 11.1) +TEST_ULP (V_NAME_F1 (log), 2.9) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (log), WANT_SIMD_EXCEPT) +TEST_INTERVAL (V_NAME_F1 (log), 0, 0xffff0000, 10000) +TEST_INTERVAL (V_NAME_F1 (log), 0x1p-4, 0x1p4, 500000) +TEST_INTERVAL (V_NAME_F1 (log), 0, inf, 50000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c new file mode 100644 index 000000000000..da2fcbff8514 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/modf.c @@ -0,0 +1,33 @@ +/* + * Double-precision vector modf(x, *y) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Modf algorithm. Produces exact values in all rounding modes. */ +float64x2_t VPCS_ATTR V_NAME_D1_L1 (modf) (float64x2_t x, double *out_int) +{ + /* Get integer component of x. */ + float64x2_t rounded = vrndq_f64 (x); + vst1q_f64 (out_int, rounded); + + /* Subtract integer component from input. */ + uint64x2_t remaining = vreinterpretq_u64_f64 (vsubq_f64 (x, rounded)); + + /* Return +0 for integer x. 
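+ vbicq_u64 clears every bit of the selected lanes, so integral inputs
+ produce the all-zeros pattern, i.e. exactly +0, regardless of the rounding
+ mode in effect for the subtraction above.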
*/ + uint64x2_t is_integer = vceqq_f64 (x, rounded); + return vreinterpretq_f64_u64 (vbicq_u64 (remaining, is_integer)); +} + +TEST_ULP (_ZGVnN2vl8_modf_frac, 0.0) +TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_frac, 1, inf, 20000) + +TEST_ULP (_ZGVnN2vl8_modf_int, 0.0) +TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVnN2vl8_modf_int, 1, inf, 20000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c new file mode 100644 index 000000000000..0a646b24cb1a --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/modff.c @@ -0,0 +1,34 @@ +/* + * Single-precision vector modf(x, *y) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Modff algorithm. Produces exact values in all rounding modes. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1_L1 (modf) (float32x4_t x, + float *out_int) +{ + /* Get integer component of x. */ + float32x4_t rounded = vrndq_f32 (x); + vst1q_f32 (out_int, rounded); + + /* Subtract integer component from input. */ + uint32x4_t remaining = vreinterpretq_u32_f32 (vsubq_f32 (x, rounded)); + + /* Return +0 for integer x. */ + uint32x4_t is_integer = vceqq_f32 (x, rounded); + return vreinterpretq_f32_u32 (vbicq_u32 (remaining, is_integer)); +} + +TEST_ULP (_ZGVnN4vl4_modff_frac, 0.0) +TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_frac, 1, inf, 20000) + +TEST_ULP (_ZGVnN4vl4_modff_int, 0.0) +TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 0, 1, 20000) +TEST_SYM_INTERVAL (_ZGVnN4vl4_modff_int, 1, inf, 20000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/pow.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/pow.c new file mode 100644 index 000000000000..db9d6e9ba14b --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/pow.c @@ -0,0 +1,284 @@ +/* + * Double-precision vector pow function. + * + * Copyright (c) 2020-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +/* Defines parameters of the approximation and scalar fallback. */ +#include "finite_pow.h" + +#define VecSmallPowX v_u64 (SmallPowX) +#define VecThresPowX v_u64 (ThresPowX) +#define VecSmallPowY v_u64 (SmallPowY) +#define VecThresPowY v_u64 (ThresPowY) + +static const struct data +{ + uint64x2_t inf; + float64x2_t small_powx; + uint64x2_t offset, mask; + uint64x2_t mask_sub_0, mask_sub_1; + float64x2_t log_c0, log_c2, log_c4, log_c5; + double log_c1, log_c3; + double ln2_lo, ln2_hi; + uint64x2_t small_exp, thres_exp; + double ln2_lo_n, ln2_hi_n; + double inv_ln2_n, exp_c2; + float64x2_t exp_c0, exp_c1; +} data = { + /* Power threshold. */ + .inf = V2 (0x7ff0000000000000), + .small_powx = V2 (0x1p-126), + .offset = V2 (Off), + .mask = V2 (0xfffULL << 52), + .mask_sub_0 = V2 (1ULL << 52), + .mask_sub_1 = V2 (52ULL << 52), + /* Coefficients copied from v_pow_log_data.c + relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8] + Coefficients are scaled to match the scaling during evaluation. 
*/ + .log_c0 = V2 (0x1.555555555556p-2 * -2), + .log_c1 = -0x1.0000000000006p-2 * -2, + .log_c2 = V2 (0x1.999999959554ep-3 * 4), + .log_c3 = -0x1.555555529a47ap-3 * 4, + .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8), + .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8), + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549 + (0.550 without fma) if |x| < ln2/512. */ + .exp_c0 = V2 (0x1.fffffffffffd4p-2), + .exp_c1 = V2 (0x1.5555571d6ef9p-3), + .exp_c2 = 0x1.5555576a5adcep-5, + .small_exp = V2 (0x3c90000000000000), + .thres_exp = V2 (0x03f0000000000000), + .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */ + .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. */ + .ln2_lo_n = -0x1.c610ca86c3899p-45, +}; + +/* This version implements an algorithm close to scalar pow but + - does not implement the trick in the exp's specialcase subroutine to avoid + double-rounding, + - does not use a tail in the exponential core computation, + - and pow's exp polynomial order and table bits might differ. + + Maximum measured error is 1.04 ULPs: + _ZGVnN2vv_pow(0x1.024a3e56b3c3p-136, 0x1.87910248b58acp-13) + got 0x1.f71162f473251p-1 + want 0x1.f71162f473252p-1. */ + +static inline float64x2_t +v_masked_lookup_f64 (const double *table, uint64x2_t i) +{ + return (float64x2_t){ + table[(i[0] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)], + table[(i[1] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1)] + }; +} + +/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about + additional 15 bits precision. IX is the bit representation of x, but + normalized in the subnormal range using the sign bit for the exponent. */ +static inline float64x2_t +v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) +{ + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + uint64x2_t tmp = vsubq_u64 (ix, d->offset); + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); + uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + float64x2_t kd = vcvtq_f64_s64 (k); + /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ + float64x2_t invc = v_masked_lookup_f64 (__v_pow_log_data.invc, tmp); + float64x2_t logc = v_masked_lookup_f64 (__v_pow_log_data.logc, tmp); + float64x2_t logctail = v_masked_lookup_f64 (__v_pow_log_data.logctail, tmp); + /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc); + /* k*Ln2 + log(c) + r. */ + float64x2_t ln2 = vld1q_f64 (&d->ln2_lo); + float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1); + float64x2_t t2 = vaddq_f64 (t1, r); + float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0); + float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r); + /* Evaluation is optimized assuming superscalar pipelined execution. */ + float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r); + float64x2_t ar2 = vmulq_f64 (r, ar); + float64x2_t ar3 = vmulq_f64 (r, ar2); + /* k*Ln2 + log(c) + r + A[0]*r*r. */ + float64x2_t hi = vaddq_f64 (t2, ar2); + float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r); + float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2); + /* p = log1p(r) - r - A[0]*r*r. 
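+ The *-2, *4 and *-8 factors applied to the stored coefficients cancel the
+ powers of ar2 = -r^2/2 used as the evaluation variable below, so p expands
+ to c0*r^3 + c1*r^4 + ... + c5*r^8 in the unscaled coefficients.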
*/ + float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1); + float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5); + float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1); + float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0); + float64x2_t p = vfmaq_f64 (a34, ar2, a56); + p = vfmaq_f64 (a12, ar2, p); + p = vmulq_f64 (ar3, p); + float64x2_t lo + = vaddq_f64 (vaddq_f64 (vaddq_f64 (vaddq_f64 (lo1, lo2), lo3), lo4), p); + float64x2_t y = vaddq_f64 (hi, lo); + *tail = vaddq_f64 (vsubq_f64 (hi, y), lo); + return y; +} + +static float64x2_t VPCS_ATTR NOINLINE +exp_special_case (float64x2_t x, float64x2_t xtail) +{ + return (float64x2_t){ exp_nosignbias (x[0], xtail[0]), + exp_nosignbias (x[1], xtail[1]) }; +} + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */ +static inline float64x2_t +v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d) +{ + /* Fallback to scalar exp_inline for all lanes if any lane + contains value of x s.t. |x| <= 2^-54 or >= 512. */ + uint64x2_t uoflowx = vcgeq_u64 ( + vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp), + d->thres_exp); + if (unlikely (v_any_u64 (uoflowx))) + return exp_special_case (x, vnegq_f64 (neg_xtail)); + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */ + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n); + float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0); + float64x2_t kd = vrndnq_f64 (z); + uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z)); + float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n); + float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1); + r = vfmsq_laneq_f64 (r, kd, ln2_n, 0); + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r = vsubq_f64 (r, neg_xtail); + /* 2^(k/N) ~= scale. */ + uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1)); + uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + uint64x2_t sbits = v_lookup_u64 (SBits, idx); + sbits = vaddq_u64 (sbits, top); + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1); + tmp = vfmaq_f64 (d->exp_c0, r, tmp); + tmp = vfmaq_f64 (r, r2, tmp); + float64x2_t scale = vreinterpretq_f64_u64 (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return vfmaq_f64 (scale, scale, tmp); +} + +static float64x2_t NOINLINE VPCS_ATTR +scalar_fallback (float64x2_t x, float64x2_t y) +{ + return (float64x2_t){ pow_scalar_special_case (x[0], y[0]), + pow_scalar_special_case (x[1], y[1]) }; +} + +float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) +{ + const struct data *d = ptr_barrier (&data); + /* Case of x <= 0 is too complicated to be vectorised efficiently here, + fallback to scalar pow for all lanes if any x < 0 detected. */ + if (v_any_u64 (vclezq_s64 (vreinterpretq_s64_f64 (x)))) + return scalar_fallback (x, y); + + uint64x2_t vix = vreinterpretq_u64_f64 (x); + uint64x2_t viy = vreinterpretq_u64_f64 (y); + uint64x2_t iay = vandq_u64 (viy, d->inf); + + /* Special cases of x or y. */ +#if WANT_SIMD_EXCEPT + /* Small or large. 
*/ + uint64x2_t vtopx = vshrq_n_u64 (vix, 52); + uint64x2_t vabstopy = vshrq_n_u64 (iay, 52); + uint64x2_t specialx + = vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX); + uint64x2_t specialy + = vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY); +#else + /* The case y==0 does not trigger a special case, since in this case it is + necessary to fix the result only if x is a signalling nan, which already + triggers a special case. We test y==0 directly in the scalar fallback. */ + uint64x2_t iax = vandq_u64 (vix, d->inf); + uint64x2_t specialx = vcgeq_u64 (iax, d->inf); + uint64x2_t specialy = vcgeq_u64 (iay, d->inf); +#endif + uint64x2_t special = vorrq_u64 (specialx, specialy); + /* Fallback to scalar on all lanes if any lane is inf or nan. */ + if (unlikely (v_any_u64 (special))) + return scalar_fallback (x, y); + + /* Small cases of x: |x| < 0x1p-126. */ + uint64x2_t smallx = vcaltq_f64 (x, d->small_powx); + if (unlikely (v_any_u64 (smallx))) + { + /* Update ix if top 12 bits of x are 0. */ + uint64x2_t sub_x = vceqzq_u64 (vshrq_n_u64 (vix, 52)); + if (unlikely (v_any_u64 (sub_x))) + { + /* Normalize subnormal x so exponent becomes negative. */ + uint64x2_t vix_norm = vreinterpretq_u64_f64 ( + vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0)))); + vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1); + vix = vbslq_u64 (sub_x, vix_norm, vix); + } + } + + /* Vector Log(ix, &lo). */ + float64x2_t vlo; + float64x2_t vhi = v_log_inline (vix, &vlo, d); + + /* Vector Exp(y_loghi, y_loglo). */ + float64x2_t vehi = vmulq_f64 (y, vhi); + float64x2_t vemi = vfmsq_f64 (vehi, y, vhi); + float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo); + return v_exp_inline (vehi, neg_velo, d); +} + +TEST_SIG (V, D, 2, pow) +TEST_ULP (V_NAME_D2 (pow), 0.55) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D2 (pow), WANT_SIMD_EXCEPT) +/* Wide intervals spanning the whole domain but shared between x and y. */ +#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \ + TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \ + TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n) +#define EXPAND(str) str##000000000 +#define SHL52(str) EXPAND (str) +V_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000) +V_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000) +V_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000) +V_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000) +V_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000) +V_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000) +V_POW_INTERVAL2 (0, inf, 0, inf, 1000) +/* x~1 or y~1. */ +V_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000) +V_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000) +V_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000) +/* around argmaxs of ULP error. */ +V_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000) +V_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000) +/* x is negative, y is odd or even integer, or y is real not integer. */ +TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000) +TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000) +TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000) +TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000) +/* 1.0^y. 
*/ +TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000) +TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000) +TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000) +TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c new file mode 100644 index 000000000000..47f74cf38ab0 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/powf.c @@ -0,0 +1,209 @@ +/* + * Single-precision vector powf function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Thresh v_u32 (0x7f000000) /* Max - Min. */ +#define MantissaMask v_u32 (0x007fffff) + +#define A d->log2_poly +#define C d->exp2f_poly + +/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */ +#define Off v_u32 (0x3f35d000) + +#define V_POWF_LOG2_TABLE_BITS 5 +#define V_EXP2F_TABLE_BITS 5 +#define Log2IdxMask ((1 << V_POWF_LOG2_TABLE_BITS) - 1) +#define Scale ((double) (1 << V_EXP2F_TABLE_BITS)) + +static const struct data +{ + struct + { + double invc, logc; + } log2_tab[1 << V_POWF_LOG2_TABLE_BITS]; + float64x2_t log2_poly[4]; + uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; + float64x2_t exp2f_poly[3]; +} data = { + .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale}, + {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale}, + {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale}, + {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale}, + {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale}, + {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale}, + {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale}, + {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale}, + {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale}, + {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale}, + {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale}, + {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale}, + {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale}, + {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale}, + {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale}, + {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale}, + {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale}, + {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale}, + {0x1p+0, 0x0p+0 * Scale}, + {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale}, + {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale}, + {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale}, + {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale}, + {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale}, + {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale}, + {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale}, + {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale}, + {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale}, + {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale}, + {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale}, + {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale}, + {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},}, + .log2_poly = { /* rel err: 1.5 * 2^-30. 
*/ + V2 (-0x1.6ff5daa3b3d7cp-2 * Scale), + V2 (0x1.ec81d03c01aebp-2 * Scale), + V2 (-0x1.71547bb43f101p-1 * Scale), + V2 (0x1.7154764a815cbp0 * Scale)}, + .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, + 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, + 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, + 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, + 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, + 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, + 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, + 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, + 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, + 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, + 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,}, + .exp2f_poly = { /* rel err: 1.69 * 2^-34. */ + V2 (0x1.c6af84b912394p-5 / Scale / Scale / Scale), + V2 (0x1.ebfce50fac4f3p-3 / Scale / Scale), + V2 (0x1.62e42ff0c52d6p-1 / Scale)}}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp) +{ + return v_call2_f32 (powf, x, y, ret, cmp); +} + +static inline float64x2_t +ylogx_core (const struct data *d, float64x2_t iz, float64x2_t k, + float64x2_t invc, float64x2_t logc, float64x2_t y) +{ + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), iz, invc); + float64x2_t y0 = vaddq_f64 (logc, k); + + /* Polynomial to approximate log1p(r)/ln2. */ + float64x2_t logx = vfmaq_f64 (A[1], r, A[0]); + logx = vfmaq_f64 (A[2], logx, r); + logx = vfmaq_f64 (A[3], logx, r); + logx = vfmaq_f64 (y0, logx, r); + + return vmulq_f64 (logx, y); +} + +static inline float64x2_t +log2_lookup (const struct data *d, uint32_t i) +{ + return vld1q_f64 ( + &d->log2_tab[(i >> (23 - V_POWF_LOG2_TABLE_BITS)) & Log2IdxMask].invc); +} + +static inline uint64x1_t +exp2f_lookup (const struct data *d, uint64_t i) +{ + return vld1_u64 (&d->exp2f_tab[i % (1 << V_EXP2F_TABLE_BITS)]); +} + +static inline float32x2_t +powf_core (const struct data *d, float64x2_t ylogx) +{ + /* N*x = k + r with r in [-1/2, 1/2]. */ + float64x2_t kd = vrndnq_f64 (ylogx); + int64x2_t ki = vcvtaq_s64_f64 (ylogx); + float64x2_t r = vsubq_f64 (ylogx, kd); + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ + uint64x2_t t = vcombine_u64 (exp2f_lookup (d, vgetq_lane_s64 (ki, 0)), + exp2f_lookup (d, vgetq_lane_s64 (ki, 1))); + t = vaddq_u64 ( + t, vreinterpretq_u64_s64 (vshlq_n_s64 (ki, 52 - V_EXP2F_TABLE_BITS))); + float64x2_t s = vreinterpretq_f64_u64 (t); + float64x2_t p = vfmaq_f64 (C[1], r, C[0]); + p = vfmaq_f64 (C[2], r, p); + p = vfmaq_f64 (s, p, vmulq_f64 (s, r)); + return vcvt_f32_f64 (p); +} + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (pow) (float32x4_t x, float32x4_t y) +{ + const struct data *d = ptr_barrier (&data); + uint32x4_t u = vreinterpretq_u32_f32 (x); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh); + uint32x4_t tmp = vsubq_u32 (u, Off); + uint32x4_t top = vbicq_u32 (tmp, MantissaMask); + float32x4_t iz = vreinterpretq_f32_u32 (vsubq_u32 (u, top)); + int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top), + 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */ + + /* Use double precision for each lane: split input vectors into lo and hi + halves and promote. 
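+ The 2.6 ulp budget quoted above needs the relative error of the log2 and
+ exp2 steps to sit well below 2^-24, which single precision cannot deliver,
+ so both cores run in double precision and the result is only narrowed back
+ to float at the end of powf_core.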
*/ + float64x2_t tab0 = log2_lookup (d, vgetq_lane_u32 (tmp, 0)), + tab1 = log2_lookup (d, vgetq_lane_u32 (tmp, 1)), + tab2 = log2_lookup (d, vgetq_lane_u32 (tmp, 2)), + tab3 = log2_lookup (d, vgetq_lane_u32 (tmp, 3)); + + float64x2_t iz_lo = vcvt_f64_f32 (vget_low_f32 (iz)), + iz_hi = vcvt_high_f64_f32 (iz); + + float64x2_t k_lo = vcvtq_f64_s64 (vmovl_s32 (vget_low_s32 (k))), + k_hi = vcvtq_f64_s64 (vmovl_high_s32 (k)); + + float64x2_t invc_lo = vzip1q_f64 (tab0, tab1), + invc_hi = vzip1q_f64 (tab2, tab3), + logc_lo = vzip2q_f64 (tab0, tab1), + logc_hi = vzip2q_f64 (tab2, tab3); + + float64x2_t y_lo = vcvt_f64_f32 (vget_low_f32 (y)), + y_hi = vcvt_high_f64_f32 (y); + + float64x2_t ylogx_lo = ylogx_core (d, iz_lo, k_lo, invc_lo, logc_lo, y_lo); + float64x2_t ylogx_hi = ylogx_core (d, iz_hi, k_hi, invc_hi, logc_hi, y_hi); + + uint32x4_t ylogx_top = vuzp2q_u32 (vreinterpretq_u32_f64 (ylogx_lo), + vreinterpretq_u32_f64 (ylogx_hi)); + + cmp = vorrq_u32 ( + cmp, vcgeq_u32 (vandq_u32 (vshrq_n_u32 (ylogx_top, 15), v_u32 (0xffff)), + vdupq_n_u32 (asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) + >> 47))); + + float32x2_t p_lo = powf_core (d, ylogx_lo); + float32x2_t p_hi = powf_core (d, ylogx_hi); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, vcombine_f32 (p_lo, p_hi), cmp); + return vcombine_f32 (p_lo, p_hi); +} + +HALF_WIDTH_ALIAS_F2 (pow) + +TEST_SIG (V, F, 2, pow) +TEST_ULP (V_NAME_F2 (pow), 2.1) +TEST_DISABLE_FENV (V_NAME_F2 (pow)) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, 0x1p-7, 0x1p7, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-1, 0x1p1, -0x1p-7, -0x1p7, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, 0x1p-1, 0x1p1, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1p-70, 0x1p70, -0x1p-1, -0x1p1, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p14, 50000) +TEST_INTERVAL2 (V_NAME_F2 (pow), 0x1.ep-1, 0x1.1p0, -0x1p8, -0x1p14, 50000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sin.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sin.c new file mode 100644 index 000000000000..0461bbb99405 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sin.c @@ -0,0 +1,105 @@ +/* + * Double-precision vector sin function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "test_defs.h" +#include "test_sig.h" +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float64x2_t poly[7]; + float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; +} data = { + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + + .range_val = V2 (0x1p23), + .inv_pi = V2 (0x1.45f306dc9c883p-2), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), +}; + +#if WANT_SIMD_EXCEPT +/* asuint64(0x1p-253)), below which multiply by inv_pi underflows. */ +# define TinyBound v_u64 (0x3020000000000000) +/* RangeVal - TinyBound. */ +# define Thresh v_u64 (0x1160000000000000) +#endif + +#define C(i) d->poly[i] + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (sin, x, y, cmp); +} + +/* Vector (AdvSIMD) sin approximation. 
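+ The reduction uses a three-part pi (pi_1 + pi_2 + pi_3) so that
+ r = |x| - n*pi stays accurate up to the 2^23 range limit, and the low bit
+ of n is XORed into the sign bit of the result to restore the sign for odd
+ multiples of pi.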
+ Maximum observed error in [-pi/2, pi/2], where argument is not reduced, + is 2.87 ULP: + _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1 + want 0x1.fffffffa7dc05p-1 + Maximum observed error in the entire non-special domain ([-2^23, 2^23]) + is 3.22 ULP: + _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3 + want 0x1.ffdcd125c84f8p-3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t n, r, r2, r3, r4, y, t1, t2, t3; + uint64x2_t odd, cmp; + +#if WANT_SIMD_EXCEPT + /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be + triggered correctly, set any special lanes to 1 (which is neutral w.r.t. + fenv). These lanes will be fixed by special-case handler later. */ + uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); + r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp)); +#else + r = x; + cmp = vcageq_f64 (x, d->range_val); +#endif + + /* n = rint(|x|/pi). */ + n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (r, d->pi_2, n); + r = vfmsq_f64 (r, d->pi_3, n); + + /* sin(r) poly approx. */ + r2 = vmulq_f64 (r, r); + r3 = vmulq_f64 (r2, r); + r4 = vmulq_f64 (r2, r2); + + t1 = vfmaq_f64 (C (4), C (5), r2); + t2 = vfmaq_f64 (C (2), C (3), r2); + t3 = vfmaq_f64 (C (0), C (1), r2); + + y = vfmaq_f64 (t1, C (6), r4); + y = vfmaq_f64 (t2, y, r4); + y = vfmaq_f64 (t3, y, r4); + y = vfmaq_f64 (r, y, r3); + + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} + +TEST_SIG (V, D, 1, sin, -3.1, 3.1) +TEST_ULP (V_NAME_D1 (sin), 3.0) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sin), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0, 0x1p23, 500000) +TEST_SYM_INTERVAL (V_NAME_D1 (sin), 0x1p23, inf, 10000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincos.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincos.c new file mode 100644 index 000000000000..83bfa45efa98 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincos.c @@ -0,0 +1,67 @@ +/* + * Double-precision vector sincos function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Define _GNU_SOURCE in order to include sincos declaration. If building + pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to + be linked against the scalar sincosf from math/. */ +#define _GNU_SOURCE +#include <math.h> + +#include "v_math.h" +#include "test_defs.h" +#include "v_sincos_common.h" + +/* sincos not available for all scalar libm implementations. */ +#if defined(_MSC_VER) || !defined(__GLIBC__) +static void +sincos (double x, double *out_sin, double *out_cos) +{ + *out_sin = sin (x); + *out_cos = cos (x); +} +#endif + +static void VPCS_ATTR NOINLINE +special_case (float64x2_t x, uint64x2_t special, double *out_sin, + double *out_cos) +{ + if (special[0]) + sincos (x[0], out_sin, out_cos); + if (special[1]) + sincos (x[1], out_sin + 1, out_cos + 1); +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. 
+ Largest observed error is for sin, 3.22 ULP: + v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. */ +VPCS_ATTR void +_ZGVnN2vl8l8_sincos (float64x2_t x, double *out_sin, double *out_cos) +{ + const struct v_sincos_data *d = ptr_barrier (&v_sincos_data); + uint64x2_t special = check_ge_rangeval (x, d); + + float64x2x2_t sc = v_sincos_inline (x, d); + + vst1q_f64 (out_sin, sc.val[0]); + vst1q_f64 (out_cos, sc.val[1]); + + if (unlikely (v_any_u64 (special))) + special_case (x, special, out_sin, out_cos); +} + +TEST_DISABLE_FENV (_ZGVnN2v_sincos_cos) +TEST_DISABLE_FENV (_ZGVnN2v_sincos_sin) +TEST_ULP (_ZGVnN2v_sincos_sin, 2.73) +TEST_ULP (_ZGVnN2v_sincos_cos, 2.73) +#define V_SINCOS_INTERVAL(lo, hi, n) \ + TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n) +V_SINCOS_INTERVAL (0, 0x1p-31, 50000) +V_SINCOS_INTERVAL (0x1p-31, 0x1p23, 500000) +V_SINCOS_INTERVAL (0x1p23, inf, 10000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincosf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincosf.c new file mode 100644 index 000000000000..cd482f38d5f6 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincosf.c @@ -0,0 +1,68 @@ +/* + * Single-precision vector sincos function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* Define _GNU_SOURCE in order to include sincosf declaration. If building + pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to + be linked against the scalar sincosf from math/. */ +#define _GNU_SOURCE +#include <math.h> + +#include "v_sincosf_common.h" +#include "v_math.h" +#include "test_defs.h" + +/* sincos not available for all scalar libm implementations. */ +#if defined(_MSC_VER) || !defined(__GLIBC__) +static void +sincosf (float x, float *out_sin, float *out_cos) +{ + *out_sin = sinf (x); + *out_cos = cosf (x); +} +#endif + +static void VPCS_ATTR NOINLINE +special_case (float32x4_t x, uint32x4_t special, float *out_sin, + float *out_cos) +{ + for (int i = 0; i < 4; i++) + if (special[i]) + sincosf (x[i], out_sin + i, out_cos + i); +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. 
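+ A typical call loads four inputs and writes both result vectors straight
+ to memory, e.g. (illustrative, with in, s and c being caller-provided
+ float[4] arrays):
+     _ZGVnN4vl4l4_sincosf (vld1q_f32 (in), s, c);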
*/ +VPCS_ATTR void +_ZGVnN4vl4l4_sincosf (float32x4_t x, float *out_sin, float *out_cos) +{ + const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data); + uint32x4_t special = check_ge_rangeval (x, d); + + float32x4x2_t sc = v_sincosf_inline (x, d); + + vst1q_f32 (out_sin, sc.val[0]); + vst1q_f32 (out_cos, sc.val[1]); + + if (unlikely (v_any_u32 (special))) + special_case (x, special, out_sin, out_cos); +} + +TEST_DISABLE_FENV (_ZGVnN4v_sincosf_sin) +TEST_DISABLE_FENV (_ZGVnN4v_sincosf_cos) +TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17) +TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31) +#define V_SINCOSF_INTERVAL(lo, hi, n) \ + TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \ + TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n) +V_SINCOSF_INTERVAL (0, 0x1p-31, 50000) +V_SINCOSF_INTERVAL (0x1p-31, 0x1p20, 500000) +V_SINCOSF_INTERVAL (0x1p20, inf, 10000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospi.c new file mode 100644 index 000000000000..fd425202ce67 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospi.c @@ -0,0 +1,44 @@ +/* + * Double-precision vector sincospi function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_sincospi_common.h" +#include "v_math.h" +#include "test_defs.h" + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using separate argument reduction and shared low-order + polynomials. + Approximation for vector double-precision sincospi(x). + Maximum Error 3.09 ULP: + _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1 + want 0x1.fd54d0b327cf4p-1 + Maximum Error 3.16 ULP: + _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1 + want 0x1.fd2da484ff402p-1. */ +VPCS_ATTR void +_ZGVnN2vl8l8_sincospi (float64x2_t x, double *out_sin, double *out_cos) +{ + const struct v_sincospi_data *d = ptr_barrier (&v_sincospi_data); + + float64x2x2_t sc = v_sincospi_inline (x, d); + + vst1q_f64 (out_sin, sc.val[0]); + vst1q_f64 (out_cos, sc.val[1]); +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (_ZGVnN2v_sincospi_cos) +TEST_DISABLE_FENV (_ZGVnN2v_sincospi_sin) +TEST_ULP (_ZGVnN2v_sincospi_sin, 2.59) +TEST_ULP (_ZGVnN2v_sincospi_cos, 2.66) +# define V_SINCOSPI_INTERVAL(lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVnN2v_sincospi_cos, lo, hi, n) +V_SINCOSPI_INTERVAL (0, 0x1p-63, 10000) +V_SINCOSPI_INTERVAL (0x1p-63, 0.5, 50000) +V_SINCOSPI_INTERVAL (0.5, 0x1p63, 50000) +V_SINCOSPI_INTERVAL (0x1p63, inf, 10000) +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospif.c new file mode 100644 index 000000000000..760ea3d4f5e1 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sincospif.c @@ -0,0 +1,43 @@ +/* + * Single-precision vector sincospi function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_sincospif_common.h" +#include "v_math.h" +#include "test_defs.h" +#include "mathlib.h" + +/* Single-precision vector function allowing calculation of both sinpi and + cospi in one function call, using shared argument reduction and polynomials. + Worst-case error for sin is 3.04 ULP: + _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. 
+ Worst-case error for cos is 3.18 ULP: + _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. + */ +VPCS_ATTR void +_ZGVnN4vl4l4_sincospif (float32x4_t x, float *out_sin, float *out_cos) +{ + const struct v_sincospif_data *d = ptr_barrier (&v_sincospif_data); + + float32x4x2_t sc = v_sincospif_inline (x, d); + + vst1q_f32 (out_sin, sc.val[0]); + vst1q_f32 (out_cos, sc.val[1]); +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (_ZGVnN4v_sincospif_sin) +TEST_DISABLE_FENV (_ZGVnN4v_sincospif_cos) +TEST_ULP (_ZGVnN4v_sincospif_sin, 2.54) +TEST_ULP (_ZGVnN4v_sincospif_cos, 2.68) +# define V_SINCOSPIF_INTERVAL(lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_sin, lo, hi, n) \ + TEST_SYM_INTERVAL (_ZGVnN4v_sincospif_cos, lo, hi, n) +V_SINCOSPIF_INTERVAL (0, 0x1p-63, 10000) +V_SINCOSPIF_INTERVAL (0x1p-63, 0.5, 50000) +V_SINCOSPIF_INTERVAL (0.5, 0x1p31, 50000) +V_SINCOSPIF_INTERVAL (0x1p31, inf, 10000) +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinf.c new file mode 100644 index 000000000000..0764434039a0 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinf.c @@ -0,0 +1,92 @@ +/* + * Single-precision vector sin function. + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "test_defs.h" +#include "test_sig.h" + +static const struct data +{ + float32x4_t poly[4]; + float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; +} data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), + V4 (0x1.5b2e76p-19f) }, + + .pi_1 = V4 (0x1.921fb6p+1f), + .pi_2 = V4 (-0x1.777a5cp-24f), + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), + .range_val = V4 (0x1p20f) +}; + +#if WANT_SIMD_EXCEPT +/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */ +# define TinyBound v_u32 (0x22000000) +/* RangeVal - TinyBound. */ +# define Thresh v_u32 (0x27800000) +#endif + +#define C(i) d->poly[i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (sinf, x, y, cmp); +} + +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, y; + uint32x4_t odd, cmp; + +#if WANT_SIMD_EXCEPT + uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x)); + cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh); + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp)); +#else + r = x; + cmp = vcageq_f32 (x, d->range_val); +#endif + + /* n = rint(|x|/pi). */ + n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi)); + odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (r, d->pi_2, n); + r = vfmsq_f32 (r, d->pi_3, n); + + /* y = sin(r). 
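+     That is, y = r + r^3 * (C0 + C1 r^2 + C2 r^4 + C3 r^6): the bracketed
+     factor is evaluated by Horner in r2, and the final fma folds the odd
+     factor of r back in.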
*/ + r2 = vmulq_f32 (r, r); + y = vfmaq_f32 (C (2), C (3), r2); + y = vfmaq_f32 (C (1), y, r2); + y = vfmaq_f32 (C (0), y, r2); + y = vfmaq_f32 (r, vmulq_f32 (y, r2), r); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} + +HALF_WIDTH_ALIAS_F1 (sin) + +TEST_SIG (V, F, 1, sin, -3.1, 3.1) +TEST_ULP (V_NAME_F1 (sin), 1.4) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sin), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0, 0x1p20, 500000) +TEST_SYM_INTERVAL (V_NAME_F1 (sin), 0x1p20, inf, 10000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c new file mode 100644 index 000000000000..f65ccd0c6270 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinh.c @@ -0,0 +1,80 @@ +/* + * Double-precision vector sinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1_inline.h" + +static const struct data +{ + struct v_expm1_data d; + uint64x2_t halff; +#if WANT_SIMD_EXCEPT + uint64x2_t tiny_bound, thresh; +#else + float64x2_t large_bound; +#endif +} data = { + .d = V_EXPM1_DATA, + .halff = V2 (0x3fe0000000000000), +#if WANT_SIMD_EXCEPT + /* 2^-26, below which sinh(x) rounds to x. */ + .tiny_bound = V2 (0x3e50000000000000), + /* asuint(large_bound) - asuint(tiny_bound). */ + .thresh = V2 (0x0230000000000000), +#else + /* 2^9. expm1 helper overflows for large input. */ + .large_bound = V2 (0x1p+9), +#endif +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x) +{ + return v_call_f64 (sinh, x, x, v_u64 (-1)); +} + +/* Approximation for vector double-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The greatest observed error is 2.52 ULP: + _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2 + want -0x1.ac2f05bb66fc9p-2. */ +float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + float64x2_t halfsign = vreinterpretq_f64_u64 ( + vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff)); + +#if WANT_SIMD_EXCEPT + uint64x2_t special = vcgeq_u64 ( + vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh); +#else + uint64x2_t special = vcageq_f64 (x, d->large_bound); +#endif + + /* Fall back to scalar variant for all lanes if any of them are special. */ + if (unlikely (v_any_u64 (special))) + return special_case (x); + + /* Up to the point that expm1 overflows, we can use it to calculate sinh + using a slight rearrangement of the definition of sinh. This allows us to + retain acceptable accuracy for very small inputs. 
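+     With t = expm1(|x|) = e^|x| - 1 we have e^-|x| = 1/(t + 1), hence
+       sinh(|x|) = (e^|x| - e^-|x|) / 2 = (t + t / (t + 1)) / 2,
+     and multiplying by halfsign (+/-0.5) restores both the factor of a half
+     and the sign of x.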
*/ + float64x2_t t = expm1_inline (ax, &d->d); + t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0)))); + return vmulq_f64 (t, halfsign); +} + +TEST_SIG (V, D, 1, sinh, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (sinh), 2.02) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000) +TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000) +TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinhf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinhf.c new file mode 100644 index 000000000000..12dbe26b425b --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinhf.c @@ -0,0 +1,84 @@ +/* + * Single-precision vector sinh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1f_inline.h" + +static const struct data +{ + struct v_expm1f_data expm1f_consts; +#if WANT_SIMD_EXCEPT + uint32x4_t tiny_bound, thresh; +#else + float32x4_t oflow_bound; +#endif +} data = { + .expm1f_consts = V_EXPM1F_DATA, +#if WANT_SIMD_EXCEPT + /* 0x1.6a09e8p-32, below which expm1f underflows. */ + .tiny_bound = V4 (0x2fb504f4), + /* asuint(oflow_bound) - asuint(tiny_bound). */ + .thresh = V4 (0x12fbbbb3), +#else + /* 0x1.61814ep+6, above which expm1f helper overflows. */ + .oflow_bound = V4 (0x1.61814ep+6), +#endif +}; + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign, + uint32x4_t special) +{ + return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special); +} + +/* Approximation for vector single-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The maximum error is 2.26 ULP: + _ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4 + want 0x1.e469e4p-4. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + float32x4_t ax = vabsq_f32 (x); + float32x4_t halfsign = vreinterpretq_f32_u32 ( + vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5)))); + +#if WANT_SIMD_EXCEPT + uint32x4_t special = vcgeq_u32 ( + vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh); + ax = v_zerofy_f32 (ax, special); +#else + uint32x4_t special = vcageq_f32 (x, d->oflow_bound); +#endif + + /* Up to the point that expm1f overflows, we can use it to calculate sinhf + using a slight rearrangement of the definition of asinh. This allows us + to retain acceptable accuracy for very small inputs. */ + float32x4_t t = expm1f_inline (ax, &d->expm1f_consts); + t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0)))); + + /* Fall back to the scalar variant for any lanes that should trigger an + exception. 
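+     Non-special lanes keep the vector result t * halfsign; only lanes
+     flagged in special are recomputed by the scalar sinhf.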
*/ + if (unlikely (v_any_u32 (special))) + return special_case (x, t, halfsign, special); + + return vmulq_f32 (t, halfsign); +} + +HALF_WIDTH_ALIAS_F1 (sinh) + +TEST_SIG (V, F, 1, sinh, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (sinh), 1.76) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000) +TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000) +TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpi.c new file mode 100644 index 000000000000..f86d167a2ac3 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpi.c @@ -0,0 +1,87 @@ +/* + * Double-precision vector sinpi function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "v_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t poly[10]; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2), + V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1), + V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8), + V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16), + V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) }, +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u64 (0x3bf0000000000000) /* asuint64(0x1p-64). */ +/* asuint64(0x1p64) - TinyBound. */ +# define Thresh v_u64 (0x07f0000000000000) + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (arm_math_sinpi, x, y, cmp); +} +#endif + +/* Approximation for vector double-precision sinpi(x). + Maximum Error 3.05 ULP: + _ZGVnN2v_sinpi(0x1.d32750db30b4ap-2) got 0x1.fb295878301c7p-1 + want 0x1.fb295878301cap-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); + uint64x2_t cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); + + /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0 + to avoid them under/overflowing and throwing exceptions. */ + float64x2_t r = v_zerofy_f64 (x, cmp); +#else + float64x2_t r = x; +#endif + + /* If r is odd, the sign of the result should be inverted. */ + uint64x2_t odd + = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63); + + /* r = x - rint(x). Range reduction to -1/2 .. 1/2. */ + r = vsubq_f64 (r, vrndaq_f64 (r)); + + /* y = sin(r). 
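+     More precisely, y ~= sin (pi * r) = r * P(r^2), where P is the
+     polynomial above (leading coefficient ~= pi), evaluated pairwise on r2
+     and r4.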
*/ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t r4 = vmulq_f64 (r2, r2); + float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); +#endif + + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} + +#if WANT_TRIGPI_TESTS +TEST_ULP (V_NAME_D1 (sinpi), 2.56) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000) +TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000) +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpif.c new file mode 100644 index 000000000000..98ba9d84d2fb --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/sinpif.c @@ -0,0 +1,84 @@ +/* + * Single-precision vector sinpi function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" +#include "v_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t poly[6]; +} data = { + /* Taylor series coefficents for sin(pi * x). */ + .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f), + V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) }, +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u32 (0x30000000) /* asuint32(0x1p-31f). */ +# define Thresh v_u32 (0x1f000000) /* asuint32(0x1p31f) - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (arm_math_sinpif, x, y, cmp); +} +#endif + +/* Approximation for vector single-precision sinpi(x) + Maximum Error 3.03 ULP: + _ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1 + want 0x1.f7cd5p-1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinpi) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x)); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh); + + /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0 + to avoid them under/overflowing and throwing exceptions. */ + float32x4_t r = v_zerofy_f32 (x, cmp); +#else + float32x4_t r = x; +#endif + + /* If r is odd, the sign of the result should be inverted. */ + uint32x4_t odd + = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31); + + /* r = x - rint(x). Range reduction to -1/2 .. 1/2. */ + r = vsubq_f32 (r, vrndaq_f32 (r)); + + /* Pairwise Horner approximation for y = sin(r * pi). 
*/ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); +#endif + + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} + +HALF_WIDTH_ALIAS_F1 (sinpi) + +#if WANT_TRIGPI_TESTS +TEST_ULP (V_NAME_F1 (sinpi), 2.54) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000) +TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000) +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tan.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tan.c new file mode 100644 index 000000000000..957f9aba3a1e --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tan.c @@ -0,0 +1,122 @@ +/* + * Double-precision vector tan(x) function. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "v_poly_f64.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float64x2_t poly[9]; + double half_pi[2]; + float64x2_t two_over_pi, shift; +#if !WANT_SIMD_EXCEPT + float64x2_t range_val; +#endif +} data = { + /* Coefficients generated using FPMinimax. */ + .poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3), + V2 (0x1.ba1ba1bb46414p-5), V2 (0x1.664f47e5b5445p-6), + V2 (0x1.226e5e5ecdfa3p-7), V2 (0x1.d6c7ddbf87047p-9), + V2 (0x1.7ea75d05b583ep-10), V2 (0x1.289f22964a03cp-11), + V2 (0x1.4e4fd14147622p-12) }, + .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 }, + .two_over_pi = V2 (0x1.45f306dc9c883p-1), + .shift = V2 (0x1.8p52), +#if !WANT_SIMD_EXCEPT + .range_val = V2 (0x1p23), +#endif +}; + +#define RangeVal 0x4160000000000000 /* asuint64(0x1p23). */ +#define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */ +#define Thresh 0x310000000000000 /* RangeVal - TinyBound. */ + +/* Special cases (fall back to scalar calls). */ +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x) +{ + return v_call_f64 (tan, x, x, v_u64 (-1)); +} + +/* Vector approximation for double-precision tan. + Maximum measured error is 3.48 ULP: + _ZGVnN2v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37 + want -0x1.f6ccd8ecf7deap+37. */ +float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x) +{ + const struct data *dat = ptr_barrier (&data); + /* Our argument reduction cannot calculate q with sufficient accuracy for + very large inputs. Fall back to scalar routine for all lanes if any are + too large, or Inf/NaN. If fenv exceptions are expected, also fall back for + tiny input to avoid underflow. */ +#if WANT_SIMD_EXCEPT + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + /* iax - tiny_bound > range_val - tiny_bound. */ + uint64x2_t special + = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh)); + if (unlikely (v_any_u64 (special))) + return special_case (x); +#endif + + /* q = nearest integer to 2 * x / pi. */ + float64x2_t q + = vsubq_f64 (vfmaq_f64 (dat->shift, x, dat->two_over_pi), dat->shift); + int64x2_t qi = vcvtq_s64_f64 (q); + + /* Use q to reduce x to r in [-pi/4, pi/4], by: + r = x - q * pi/2, in extended precision. 
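+     half_pi stores pi/2 split into high and low parts, so the two fused
+     multiply-subtracts compute r = x - q*pi/2_hi - q*pi/2_lo with extra
+     precision; r is then halved so that the double-angle identity below can
+     rebuild tan(2r).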
*/ + float64x2_t r = x; + float64x2_t half_pi = vld1q_f64 (dat->half_pi); + r = vfmsq_laneq_f64 (r, q, half_pi, 0); + r = vfmsq_laneq_f64 (r, q, half_pi, 1); + /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle + formula. */ + r = vmulq_n_f64 (r, 0.5); + + /* Approximate tan(r) using order 8 polynomial. + tan(x) is odd, so polynomial has the form: + tan(x) ~= x + C0 * x^3 + C1 * x^5 + C3 * x^7 + ... + Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ... + Then compute the approximation by: + tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */ + float64x2_t r2 = vmulq_f64 (r, r), r4 = vmulq_f64 (r2, r2), + r8 = vmulq_f64 (r4, r4); + /* Offset coefficients to evaluate from C1 onwards. */ + float64x2_t p = v_estrin_7_f64 (r2, r4, r8, dat->poly + 1); + p = vfmaq_f64 (dat->poly[0], p, r2); + p = vfmaq_f64 (r, r2, vmulq_f64 (p, r)); + + /* Recombination uses double-angle formula: + tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) + and reciprocity around pi/2: + tan(x) = 1 / (tan(pi/2 - x)) + to assemble result using change-of-sign and conditional selection of + numerator/denominator, dependent on odd/even-ness of q (hence quadrant). + */ + float64x2_t n = vfmaq_f64 (v_f64 (-1), p, p); + float64x2_t d = vaddq_f64 (p, p); + + uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1)); + +#if !WANT_SIMD_EXCEPT + uint64x2_t special = vcageq_f64 (x, dat->range_val); + if (unlikely (v_any_u64 (special))) + return special_case (x); +#endif + + return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)), + vbslq_f64 (no_recip, d, n)); +} + +TEST_SIG (V, D, 1, tan, -3.1, 3.1) +TEST_ULP (V_NAME_D1 (tan), 2.99) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tan), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000) +TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanf.c new file mode 100644 index 000000000000..ed5448649f6c --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanf.c @@ -0,0 +1,130 @@ +/* + * Single-precision vector tan(x) function. + * + * Copyright (c) 2021-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "v_poly_f32.h" +#include "test_sig.h" +#include "test_defs.h" + +static const struct data +{ + float32x4_t poly[6]; + float pi_consts[4]; + float32x4_t shift; +#if !WANT_SIMD_EXCEPT + float32x4_t range_val; +#endif +} data = { + /* Coefficients generated using FPMinimax. */ + .poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f), + V4 (0x1.7b5756p-6f), V4 (0x1.4ef4cep-8f), V4 (0x1.0e1e74p-7f) }, + /* Stores constants: (-pi/2)_high, (-pi/2)_mid, (-pi/2)_low, and 2/pi. */ + .pi_consts + = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f }, + .shift = V4 (0x1.8p+23f), +#if !WANT_SIMD_EXCEPT + .range_val = V4 (0x1p15f), +#endif +}; + +#define RangeVal v_u32 (0x47000000) /* asuint32(0x1p15f). */ +#define TinyBound v_u32 (0x30000000) /* asuint32 (0x1p-31f). */ +#define Thresh v_u32 (0x16000000) /* asuint32(RangeVal) - TinyBound. */ + +/* Special cases (fall back to scalar calls). */ +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + return v_call_f32 (tanf, x, y, cmp); +} + +/* Use a full Estrin scheme to evaluate polynomial. 
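+   Roughly P(z) = (c0 + c1 z) + z^2 (c2 + c3 z) + z^4 (c4 + c5 z), grouped so
+   the fused multiply-adds are independent of each other and can execute in
+   parallel.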
*/ +static inline float32x4_t +eval_poly (float32x4_t z, const struct data *d) +{ + float32x4_t z2 = vmulq_f32 (z, z); +#if WANT_SIMD_EXCEPT + /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. + If fp exceptions are to be triggered correctly, + sidestep this by fixing such lanes to 0. */ + uint32x4_t will_uflow + = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound); + if (unlikely (v_any_u32 (will_uflow))) + z2 = vbslq_f32 (will_uflow, v_f32 (0), z2); +#endif + float32x4_t z4 = vmulq_f32 (z2, z2); + return v_estrin_5_f32 (z, z2, z4, d->poly); +} + +/* Fast implementation of AdvSIMD tanf. + Maximum error is 3.45 ULP: + __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1 + want 0x1.ff9850p-1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t special_arg = x; + + /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast + regression. */ +#if WANT_SIMD_EXCEPT + uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x)); + /* If fp exceptions are to be triggered correctly, also special-case tiny + input, as this will load to overflow later. Fix any special lanes to 1 to + prevent any exceptions being triggered. */ + uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, TinyBound), Thresh); + if (unlikely (v_any_u32 (special))) + x = vbslq_f32 (special, v_f32 (1.0f), x); +#else + /* Otherwise, special-case large and special values. */ + uint32x4_t special = vcageq_f32 (x, d->range_val); +#endif + + /* n = rint(x/(pi/2)). */ + float32x4_t pi_consts = vld1q_f32 (d->pi_consts); + float32x4_t q = vfmaq_laneq_f32 (d->shift, x, pi_consts, 3); + float32x4_t n = vsubq_f32 (q, d->shift); + /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ + uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1)); + + /* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */ + float32x4_t r; + r = vfmaq_laneq_f32 (x, n, pi_consts, 0); + r = vfmaq_laneq_f32 (r, n, pi_consts, 1); + r = vfmaq_laneq_f32 (r, n, pi_consts, 2); + + /* If x lives in an interval, where |tan(x)| + - is finite, then use a polynomial approximation of the form + tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). + - grows to infinity then use symmetries of tangent and the identity + tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use + the same polynomial approximation of tan as above. */ + + /* Invert sign of r if odd quadrant. */ + float32x4_t z = vmulq_f32 (r, vbslq_f32 (pred_alt, v_f32 (-1), v_f32 (1))); + + /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */ + float32x4_t z2 = vmulq_f32 (r, r); + float32x4_t p = eval_poly (z2, d); + float32x4_t y = vfmaq_f32 (z, vmulq_f32 (z, z2), p); + + /* Compute reciprocal and apply if required. 
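+     For odd quadrants tan(x) = -cot(r) = 1 / tan(-r); z already carries the
+     sign flip, so selecting 1/y where pred_alt is set completes the
+     identity.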
*/ + float32x4_t inv_y = vdivq_f32 (v_f32 (1.0f), y); + + if (unlikely (v_any_u32 (special))) + return special_case (special_arg, vbslq_f32 (pred_alt, inv_y, y), special); + return vbslq_f32 (pred_alt, inv_y, y); +} + +HALF_WIDTH_ALIAS_F1 (tan) + +TEST_SIG (V, F, 1, tan, -3.1, 3.1) +TEST_ULP (V_NAME_F1 (tan), 2.96) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tan), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000) +TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000) +TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c new file mode 100644 index 000000000000..3dc6e5527ffc --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanh.c @@ -0,0 +1,67 @@ +/* + * Double-precision vector tanh(x) function. + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1_inline.h" + +static const struct data +{ + struct v_expm1_data d; + uint64x2_t thresh, tiny_bound; +} data = { + .d = V_EXPM1_DATA, + .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */ + /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */ + .thresh = V2 (0x01f241bf835f9d5f), +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t q, float64x2_t qp2, + uint64x2_t special) +{ + return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special); +} + +/* Vector approximation for double-precision tanh(x), using a simplified + version of expm1. The greatest observed error is 2.70 ULP: + _ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3 + want -0x1.be5452a6459fbp-3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); + + float64x2_t u = x; + + /* Trigger special-cases for tiny, boring and infinity/NaN. */ + uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh); +#if WANT_SIMD_EXCEPT + /* To trigger fp exceptions correctly, set special lanes to a neutral value. + They will be fixed up later by the special-case handler. */ + if (unlikely (v_any_u64 (special))) + u = v_zerofy_f64 (u, special); +#endif + + u = vaddq_f64 (u, u); + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + float64x2_t q = expm1_inline (u, &d->d); + float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0)); + + if (unlikely (v_any_u64 (special))) + return special_case (x, q, qp2, special); + return vdivq_f64 (q, qp2); +} + +TEST_SIG (V, D, 1, tanh, -10.0, 10.0) +TEST_ULP (V_NAME_D1 (tanh), 2.21) +TEST_DISABLE_FENV_IF_NOT (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanhf.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanhf.c new file mode 100644 index 000000000000..18fe93c7e7ba --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanhf.c @@ -0,0 +1,81 @@ +/* + * Single-precision vector tanh(x) function. + * + * Copyright (c) 2022-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" +#include "v_expm1f_inline.h" + +static const struct data +{ + struct v_expm1f_data expm1f_consts; + uint32x4_t boring_bound, large_bound, onef; +} data = { + .expm1f_consts = V_EXPM1F_DATA, + /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */ + .boring_bound = V4 (0x41102cb3), + .large_bound = V4 (0x7f800000), +}; + +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring, + float32x4_t q, uint32x4_t special) +{ + return v_call_f32 ( + tanhf, x, + vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))), + special); +} + +/* Approximation for single-precision vector tanh(x), using a simplified + version of expm1f. The maximum error is 2.58 ULP: + _ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5 + want 0x1.f9ba08p-5. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t sign = veorq_u32 (ix, iax); + uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound); + /* expm1 exponent bias is 1.0f reinterpreted to int. */ + float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 ( + sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias))); + +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered properly, set all special and boring + lanes to 0, which will trigger no exceptions, and fix them up later. */ + uint32x4_t special = vorrq_u32 (vcgtq_u32 (iax, d->large_bound), + vcltq_u32 (iax, v_u32 (0x34000000))); + x = v_zerofy_f32 (x, is_boring); + if (unlikely (v_any_u32 (special))) + x = v_zerofy_f32 (x, special); +#else + uint32x4_t special = vcgtq_u32 (iax, d->large_bound); +#endif + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts); + + if (unlikely (v_any_u32 (special))) + return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q, + special); + + float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0))); + return vbslq_f32 (is_boring, boring, y); +} + +HALF_WIDTH_ALIAS_F1 (tanh) + +TEST_SIG (V, F, 1, tanh, -10.0, 10.0) +TEST_ULP (V_NAME_F1 (tanh), 2.09) +TEST_DISABLE_FENV_IF_NOT (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT) +TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000) +TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000) +TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100) diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c new file mode 100644 index 000000000000..16de00ad5556 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpi.c @@ -0,0 +1,88 @@ +/* + * Double-precision vector tanpi(x) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +const static struct v_tanpi_data +{ + float64x2_t c0, c2, c4, c6, c8, c10, c12; + double c1, c3, c5, c7, c9, c11, c13, c14; +} tanpi_data = { + /* Coefficents for tan(pi * x) computed with fpminimax + on [ 0x1p-1022 0x1p-2 ] + approx rel error: 0x1.7eap-55 + approx abs error: 0x1.7eap-55. */ + .c0 = V2 (0x1.921fb54442d18p1), /* pi. 
*/ + .c1 = 0x1.4abbce625be52p3, .c2 = V2 (0x1.466bc6775b0f9p5), + .c3 = 0x1.45fff9b426f5ep7, .c4 = V2 (0x1.45f4730dbca5cp9), + .c5 = 0x1.45f3265994f85p11, .c6 = V2 (0x1.45f4234b330cap13), + .c7 = 0x1.45dca11be79ebp15, .c8 = V2 (0x1.47283fc5eea69p17), + .c9 = 0x1.3a6d958cdefaep19, .c10 = V2 (0x1.927896baee627p21), + .c11 = -0x1.89333f6acd922p19, .c12 = V2 (0x1.5d4e912bb8456p27), + .c13 = -0x1.a854d53ab6874p29, .c14 = 0x1.1b76de7681424p32, +}; + +/* Approximation for double-precision vector tanpi(x) + The maximum error is 3.06 ULP: + _ZGVnN2v_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3 + want -0x1.fa30112702c95p+3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (tanpi) (float64x2_t x) +{ + const struct v_tanpi_data *d = ptr_barrier (&tanpi_data); + + float64x2_t n = vrndnq_f64 (x); + + /* inf produces nan that propagates. */ + float64x2_t xr = vsubq_f64 (x, n); + float64x2_t ar = vabdq_f64 (x, n); + uint64x2_t flip = vcgtq_f64 (ar, v_f64 (0.25)); + float64x2_t r = vbslq_f64 (flip, vsubq_f64 (v_f64 (0.5), ar), ar); + + /* Order-14 pairwise Horner. */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t r4 = vmulq_f64 (r2, r2); + + float64x2_t c_1_3 = vld1q_f64 (&d->c1); + float64x2_t c_5_7 = vld1q_f64 (&d->c5); + float64x2_t c_9_11 = vld1q_f64 (&d->c9); + float64x2_t c_13_14 = vld1q_f64 (&d->c13); + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, r2, c_1_3, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, r2, c_1_3, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, r2, c_5_7, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, r2, c_5_7, 1); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, r2, c_9_11, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, r2, c_9_11, 1); + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, r2, c_13_14, 0); + + float64x2_t p = vfmaq_laneq_f64 (p1213, r4, c_13_14, 1); + p = vfmaq_f64 (p1011, r4, p); + p = vfmaq_f64 (p89, r4, p); + p = vfmaq_f64 (p67, r4, p); + p = vfmaq_f64 (p45, r4, p); + p = vfmaq_f64 (p23, r4, p); + p = vfmaq_f64 (p01, r4, p); + p = vmulq_f64 (r, p); + + float64x2_t p_recip = vdivq_f64 (v_f64 (1.0), p); + float64x2_t y = vbslq_f64 (flip, p_recip, p); + + uint64x2_t sign + = veorq_u64 (vreinterpretq_u64_f64 (xr), vreinterpretq_u64_f64 (ar)); + return vreinterpretq_f64_u64 (vorrq_u64 (vreinterpretq_u64_f64 (y), sign)); +} + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (V_NAME_D1 (tanpi)) +TEST_ULP (V_NAME_D1 (tanpi), 2.57) +TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0, 0x1p-31, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p-31, 0.5, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0.5, 1.0, 200000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 1.0, 0x1p23, 50000) +TEST_SYM_INTERVAL (V_NAME_D1 (tanpi), 0x1p23, inf, 50000) +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c new file mode 100644 index 000000000000..7bd6d206819f --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c @@ -0,0 +1,70 @@ +/* + * Single-precision vector tanpi(x) function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "test_sig.h" +#include "test_defs.h" + +const static struct v_tanpif_data +{ + float32x4_t c0, c2, c4, c6; + float c1, c3, c5, c7; +} tanpif_data = { + /* Coefficents for tan(pi * x). 
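+     The reduced result is r * (c0 + c1 r^2 + ... + c7 r^14) with c0 ~= pi,
+     mirroring the double-precision variant; storing the odd coefficients as
+     scalars lets them be loaded as one vector for lane-indexed fmas.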
*/ + .c0 = V4 (0x1.921fb4p1f), .c1 = 0x1.4abbcep3f, .c2 = V4 (0x1.466b8p5f), + .c3 = 0x1.461c72p7f, .c4 = V4 (0x1.42e9d4p9f), .c5 = 0x1.69e2c4p11f, + .c6 = V4 (0x1.e85558p11f), .c7 = 0x1.a52e08p16f, +}; + +/* Approximation for single-precision vector tanpi(x) + The maximum error is 3.34 ULP: + _ZGVnN4v_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2 + want 0x1.f70aa6p+2. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanpi) (float32x4_t x) +{ + const struct v_tanpif_data *d = ptr_barrier (&tanpif_data); + + float32x4_t n = vrndnq_f32 (x); + + /* inf produces nan that propagates. */ + float32x4_t xr = vsubq_f32 (x, n); + float32x4_t ar = vabdq_f32 (x, n); + uint32x4_t flip = vcgtq_f32 (ar, v_f32 (0.25f)); + float32x4_t r = vbslq_f32 (flip, vsubq_f32 (v_f32 (0.5f), ar), ar); + + /* Order-7 pairwise Horner polynomial evaluation scheme. */ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + + float32x4_t odd_coeffs = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, r2, odd_coeffs, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, r2, odd_coeffs, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, r2, odd_coeffs, 2); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, r2, odd_coeffs, 3); + float32x4_t p = vfmaq_f32 (p45, r4, p67); + p = vfmaq_f32 (p23, r4, p); + p = vfmaq_f32 (p01, r4, p); + + p = vmulq_f32 (r, p); + float32x4_t p_recip = vdivq_f32 (v_f32 (1.0f), p); + float32x4_t y = vbslq_f32 (flip, p_recip, p); + + uint32x4_t sign + = veorq_u32 (vreinterpretq_u32_f32 (xr), vreinterpretq_u32_f32 (ar)); + return vreinterpretq_f32_u32 (vorrq_u32 (vreinterpretq_u32_f32 (y), sign)); +} + +HALF_WIDTH_ALIAS_F1 (tanpi) + +#if WANT_TRIGPI_TESTS +TEST_DISABLE_FENV (V_NAME_F1 (tanpi)) +TEST_ULP (V_NAME_F1 (tanpi), 2.84) +TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0, 0x1p-31, 50000) +TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p-31, 0.5, 100000) +TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0.5, 0x1p23f, 100000) +TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p23f, inf, 100000) +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expf_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expf_inline.h new file mode 100644 index 000000000000..797d217820c3 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expf_inline.h @@ -0,0 +1,58 @@ +/* + * Helper for single-precision routines which calculate exp(ax) and do not + * need special-case handling + * + * Copyright (c) 2019-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_V_EXPF_INLINE_H +#define MATH_V_EXPF_INLINE_H + +#include "v_math.h" + +struct v_expf_data +{ + float ln2_hi, ln2_lo, c0, c2; + float32x4_t inv_ln2, c1, c3, c4; + /* asuint(1.0f). */ + uint32x4_t exponent_bias; +}; + +/* maxerr: 1.45358 +0.5 ulp. */ +#define V_EXPF_DATA \ + { \ + .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \ + .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \ + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ + .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \ + } + +static inline float32x4_t +v_expf_inline (float32x4_t x, const struct v_expf_data *d) +{ + /* Helper routine for calculating exp(ax). + Copied from v_expf.c, with all special-case handling removed - the + calling routine should handle special values if required. */ + + /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + ax = ln2*n + r, with r in [-ln2/2, ln2/2]. 
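+     2^n is built directly in the exponent field (n << 23 added to the bias
+     of 1.0f), and the final fma returns scale + poly * scale, i.e.
+     2^n * (1 + poly(r)).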
*/ + float32x4_t ax = vabsq_f32 (x); + float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); + float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2)); + float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0); + r = vfmsq_laneq_f32 (r, n, ln2_c02, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + + /* Custom order-4 Estrin avoids building high order monomial. */ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (d->c4, r); + float32x4_t poly = vfmaq_f32 (p, q, r2); + return vfmaq_f32 (scale, poly, scale); +} + +#endif // MATH_V_EXPF_INLINE_H diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1_inline.h new file mode 100644 index 000000000000..82d2e9415d93 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1_inline.h @@ -0,0 +1,86 @@ +/* + * Helper for double-precision routines which calculate exp(x) - 1 and do not + * need special-case handling + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_V_EXPM1_INLINE_H +#define MATH_V_EXPM1_INLINE_H + +#include "v_math.h" + +struct v_expm1_data +{ + float64x2_t c2, c4, c6, c8; + float64x2_t invln2; + int64x2_t exponent_bias; + double c1, c3, c5, c7, c9, c10; + double ln2[2]; +}; + +/* Generated using fpminimax, with degree=12 in [log(2)/2, log(2)/2]. */ +#define V_EXPM1_DATA \ + { \ + .c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5), \ + .c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10), \ + .c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16), \ + .c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22), \ + .c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29, \ + .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, \ + .invln2 = V2 (0x1.71547652b82fep0), \ + .exponent_bias = V2 (0x3ff0000000000000), \ + } + +static inline float64x2_t +expm1_inline (float64x2_t x, const struct v_expm1_data *d) +{ + /* Helper routine for calculating exp(x) - 1. */ + + float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2)); + int64x2_t i = vcvtq_s64_f64 (n); + float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0); + f = vfmsq_laneq_f64 (f, n, ln2, 1); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). 
*/ + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t f4 = vmulq_f64 (f2, f2); + float64x2_t lane_consts_13 = vld1q_f64 (&d->c1); + float64x2_t lane_consts_57 = vld1q_f64 (&d->c5); + float64x2_t lane_consts_910 = vld1q_f64 (&d->c9); + float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1); + float64x2_t p03 = vfmaq_f64 (p01, f2, p23); + float64x2_t p47 = vfmaq_f64 (p45, f2, p67); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0); + float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1); + p = vfmaq_f64 (p47, f4, p); + p = vfmaq_f64 (p03, f4, p); + + p = vfmaq_f64 (f, f2, p); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias); + float64x2_t t = vreinterpretq_f64_s64 (u); + + /* expm1(x) ~= p * t + (t - 1). */ + return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); +} + +#endif // MATH_V_EXPM1_INLINE_H diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1f_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1f_inline.h new file mode 100644 index 000000000000..463b07aa7705 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_expm1f_inline.h @@ -0,0 +1,62 @@ +/* + * Helper for single-precision routines which calculate exp(x) - 1 and do not + * need special-case handling + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_V_EXPM1F_INLINE_H +#define MATH_V_EXPM1F_INLINE_H + +#include "v_math.h" + +struct v_expm1f_data +{ + float32x4_t c0, c2; + int32x4_t exponent_bias; + float c1, c3, inv_ln2, c4; + float ln2_hi, ln2_lo; +}; + +/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2, + log(2)/2]. Exponent bias is asuint(1.0f). */ +#define V_EXPM1F_DATA \ + { \ + .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \ + .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \ + .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \ + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ + } + +static inline float32x4_t +expm1f_inline (float32x4_t x, const struct v_expm1f_data *d) +{ + /* Helper routine for calculating exp(x) - 1. */ + + float32x2_t ln2 = vld1_f32 (&d->ln2_hi); + float32x4_t lane_consts = vld1q_f32 (&d->c1); + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2)); + int32x4_t i = vcvtq_s32_f32 (j); + float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0); + f = vfmsq_lane_f32 (f, j, ln2, 1); + + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */ + float32x4_t f2 = vmulq_f32 (f, f); + float32x4_t f4 = vmulq_f32 (f2, f2); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1); + float32x4_t p = vfmaq_f32 (p01, f2, p23); + p = vfmaq_laneq_f32 (p, f4, lane_consts, 3); + p = vfmaq_f32 (f, f2, p); + + /* t = 2^i. */ + int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias); + float32x4_t t = vreinterpretq_f32_s32 (u); + /* expm1(x) ~= p * t + (t - 1). 
*/ + return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t); +} + +#endif // MATH_V_EXPM1F_INLINE_H diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1p_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1p_inline.h new file mode 100644 index 000000000000..ef906ae4b603 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1p_inline.h @@ -0,0 +1,119 @@ +/* + * Helper for vector double-precision routines which calculate log(1 + x) and + * do not need special-case handling + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#ifndef MATH_V_LOG1P_INLINE_H +#define MATH_V_LOG1P_INLINE_H + +#include "v_math.h" + +struct v_log1p_data +{ + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16; + uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask; + int64x2_t one_top; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; + double ln2[2]; +}; + +/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */ +#define V_LOG1P_CONSTANTS_TABLE \ + { \ + .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \ + .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \ + .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \ + .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \ + .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \ + .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \ + .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \ + .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \ + .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \ + .c18 = -0x1.cfa7385bdb37ep-6, \ + .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \ + .hf_rt2_top = V2 (0x3fe6a09e00000000), \ + .one_m_hf_rt2_top = V2 (0x00095f6200000000), \ + .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \ + } + +#define BottomMask v_u64 (0xffffffff) + +static inline float64x2_t +eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1718 = vld1q_f64 (&d->c17); + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1); + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1); + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0); + float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1); + p = vfmaq_f64 (p1415, m2, p); + p = vfmaq_f64 (p1213, m2, p); + p = vfmaq_f64 (p1011, m2, p); + p = vfmaq_f64 (p89, m2, p); + p = vfmaq_f64 (p67, m2, p); + p = vfmaq_f64 (p45, m2, p); + p = vfmaq_f64 (p23, m2, p); + return vfmaq_f64 (p01, m2, p); +} + +static inline float64x2_t +log1p_inline (float64x2_t x, const struct v_log1p_data *d) +{ + /* Helper for calculating log(x + 1): + - No special-case handling - this should be dealt with by the caller. + - Optionally simulate the shortcut for k=0, used in the scalar routine, + using v_sel, for improved accuracy when the argument to log1p is close + to 0. 
This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 + in the source of the caller before including this file. */ + float64x2_t m = vaddq_f64 (x, v_f64 (1.0)); + uint64x2_t mi = vreinterpretq_u64_f64 (m); + uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); + + int64x2_t ki + = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top); + float64x2_t k = vcvtq_f64_s64 (ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); + uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); + float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0)); + + /* Correction term c/m. */ + float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m); + +#ifndef WANT_V_LOG1P_K0_SHORTCUT +# error \ + "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" +#elif WANT_V_LOG1P_K0_SHORTCUT + /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is + that the approximation is solely the polynomial. */ + uint64x2_t k0 = vceqzq_f64 (k); + cm = v_zerofy_f64 (cm, k0); + f = vbslq_f64 (k0, x, f); +#endif + + /* Approximate log1p(f) on the reduced input using a polynomial. */ + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t p = eval_poly (f, f2, d); + + /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */ + float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); + float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1); + float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0); + return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p); +} + +#endif // MATH_V_LOG1P_INLINE_H diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1pf_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1pf_inline.h new file mode 100644 index 000000000000..e81fa24486ae --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log1pf_inline.h @@ -0,0 +1,94 @@ +/* + * Helper for single-precision routines which calculate log(1 + x) and do not + * need special-case handling + * + * Copyright (c) 2022-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_V_LOG1PF_INLINE_H +#define MATH_V_LOG1PF_INLINE_H + +#include "v_math.h" +#include "v_poly_f32.h" + +struct v_log1pf_data +{ + uint32x4_t four; + int32x4_t three_quarters; + float c0, c3, c5, c7; + float32x4_t c4, c6, c1, c2, ln2; +}; + +/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients + (1, -0.5) are not stored as they can be generated more efficiently. */ +#define V_LOG1PF_CONSTANTS_TABLE \ + { \ + .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \ + .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \ + .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \ + .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \ + .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \ + .three_quarters = V4 (0x3f400000) \ + } + +static inline float32x4_t +eval_poly (float32x4_t m, const struct v_log1pf_data *d) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. 
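+     The leading 1 and -1/2 coefficients are generated rather than loaded,
+     so the result has the form m - m^2/2 + c0 m^3 + m^4 (c1 + c2 m + ...),
+     close to the Taylor expansion of log(1 + m).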
*/ + float32x4_t c0357 = vld1q_f32 (&d->c0); + float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0); + float32x4_t m2 = vmulq_f32 (m, m); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1); + float32x4_t p = vfmaq_f32 (p45, m2, p67); + p = vfmaq_f32 (p23, m2, p); + p = vfmaq_f32 (d->c1, m, p); + p = vmulq_f32 (m2, p); + p = vfmaq_f32 (m, m2, p); + return vfmaq_f32 (p, m2, q); +} + +static inline float32x4_t +log1pf_inline (float32x4_t x, const struct v_log1pf_data *d) +{ + /* Helper for calculating log(x + 1). */ + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m + is in [-0.25, 0.5]): + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). + + We approximate log1p(m) with a polynomial, then scale by + k*log(2). Instead of doing this directly, we use an intermediate + scale factor s = 4*k*log(2) to ensure the scale is representable + as a normalised fp32 number. */ + float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); + + /* Choose k to scale x to the range [-1/4, 1/2]. */ + int32x4_t k + = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters), + v_s32 (0xff800000)); + uint32x4_t ku = vreinterpretq_u32_s32 (k); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number, and scale m down accordingly. */ + float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku)); + + /* Scale x by exponent manipulation. */ + float32x4_t m_scale + = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); + m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); + + /* Evaluate polynomial on the reduced interval. */ + float32x4_t p = eval_poly (m_scale, d); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. */ + float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f)); + + /* Apply the scaling back. */ + return vfmaq_f32 (p, scale_back, d->ln2); +} + +#endif // MATH_V_LOG1PF_INLINE_H diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log_inline.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log_inline.h new file mode 100644 index 000000000000..770f9e81c195 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_log_inline.h @@ -0,0 +1,104 @@ +/* + * Double-precision vector log(x) function - inline version + * + * Copyright (c) 2019-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "math_config.h" + +#ifndef V_LOG_INLINE_POLY_ORDER +# error Cannot use inline log helper without specifying poly order (options are 4 or 5) +#endif + +#if V_LOG_INLINE_POLY_ORDER == 4 +# define POLY \ + { \ + V2 (-0x1.ffffffffcbad3p-2), V2 (0x1.555555578ed68p-2), \ + V2 (-0x1.0000d3a1e7055p-2), V2 (0x1.999392d02a63ep-3) \ + } +#elif V_LOG_INLINE_POLY_ORDER == 5 +# define POLY \ + { \ + V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), \ + V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), \ + V2 (-0x1.554e550bd501ep-3) \ + } +#else +# error Can only choose order 4 or 5 for log poly +#endif + +struct v_log_inline_data +{ + float64x2_t poly[V_LOG_INLINE_POLY_ORDER]; + float64x2_t ln2; + uint64x2_t off, sign_exp_mask; +}; + +#define V_LOG_CONSTANTS \ + { \ + .poly = POLY, .ln2 = V2 (0x1.62e42fefa39efp-1), \ + .sign_exp_mask = V2 (0xfff0000000000000), .off = V2 (0x3fe6900900000000) \ + } + +#define A(i) d->poly[i] +#define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) + +struct entry +{ + float64x2_t invc; + float64x2_t logc; +}; + +static inline struct entry +log_lookup (uint64x2_t i) +{ + /* Since N is a power of 2, n % N = n & (N - 1). */ + struct entry e; + uint64_t i0 = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; +} + +static inline float64x2_t +v_log_inline (float64x2_t x, const struct v_log_inline_data *d) +{ + float64x2_t z, r, r2, p, y, kd, hi; + uint64x2_t ix, iz, tmp; + int64x2_t k; + struct entry e; + + ix = vreinterpretq_u64_f64 (x); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = vsubq_u64 (ix, d->off); + k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ + iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); + z = vreinterpretq_f64_u64 (iz); + e = log_lookup (tmp); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + r2 = vmulq_f64 (r, r); + y = vfmaq_f64 (A (2), A (3), r); + p = vfmaq_f64 (A (0), A (1), r); +#if V_LOG_POLY_ORDER == 5 + y = vfmaq_f64 (y, A (4), r2); +#endif + y = vfmaq_f64 (p, y, r2); + + return vfmaq_f64 (hi, y, r2); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h new file mode 100644 index 000000000000..75cd71cc87a7 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_math.h @@ -0,0 +1,202 @@ +/* + * Vector math abstractions. + * + * Copyright (c) 2019-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _V_MATH_H +#define _V_MATH_H + +#if !__aarch64__ +# error "Cannot build without AArch64" +#endif + +#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) + +#define V_NAME_F1(fun) _ZGVnN4v_##fun##f +#define V_NAME_D1(fun) _ZGVnN2v_##fun +#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f +#define V_NAME_D2(fun) _ZGVnN2vv_##fun +#define V_NAME_F1_L1(fun) _ZGVnN4vl4_##fun##f +#define V_NAME_D1_L1(fun) _ZGVnN2vl8_##fun + +#if USE_GLIBC_ABI + +# define HALF_WIDTH_ALIAS_F1(fun) \ + float32x2_t VPCS_ATTR _ZGVnN2v_##fun##f (float32x2_t x) \ + { \ + return vget_low_f32 (_ZGVnN4v_##fun##f (vcombine_f32 (x, x))); \ + } + +# define HALF_WIDTH_ALIAS_F2(fun) \ + float32x2_t VPCS_ATTR _ZGVnN2vv_##fun##f (float32x2_t x, float32x2_t y) \ + { \ + return vget_low_f32 ( \ + _ZGVnN4vv_##fun##f (vcombine_f32 (x, x), vcombine_f32 (y, y))); \ + } + +#else +# define HALF_WIDTH_ALIAS_F1(fun) +# define HALF_WIDTH_ALIAS_F2(fun) +#endif + +#include <stdint.h> +#include "math_config.h" +#include <arm_neon.h> + +/* Shorthand helpers for declaring constants. */ +#define V2(X) \ + { \ + X, X \ + } +#define V4(X) \ + { \ + X, X, X, X \ + } +#define V8(X) \ + { \ + X, X, X, X, X, X, X, X \ + } + +static inline int +v_any_u16h (uint16x4_t x) +{ + return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0; +} + +static inline int +v_lanes32 (void) +{ + return 4; +} + +static inline float32x4_t +v_f32 (float x) +{ + return (float32x4_t) V4 (x); +} +static inline uint32x4_t +v_u32 (uint32_t x) +{ + return (uint32x4_t) V4 (x); +} +static inline int32x4_t +v_s32 (int32_t x) +{ + return (int32x4_t) V4 (x); +} + +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u32 (uint32x4_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; +} +static inline int +v_any_u32h (uint32x2_t x) +{ + return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0; +} +static inline float32x4_t +v_lookup_f32 (const float *tab, uint32x4_t idx) +{ + return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] }; +} +static inline uint32x4_t +v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) +{ + return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] }; +} +static inline float32x4_t +v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) +{ + return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] }; +} +static inline float32x4_t +v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, + float32x4_t y, uint32x4_t p) +{ + return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0], + p[1] ? f (x1[1], x2[1]) : y[1], + p[2] ? f (x1[2], x2[2]) : y[2], + p[3] ? f (x1[3], x2[3]) : y[3] }; +} +static inline float32x4_t +v_zerofy_f32 (float32x4_t x, uint32x4_t mask) +{ + return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask)); +} + +static inline int +v_lanes64 (void) +{ + return 2; +} +static inline float64x2_t +v_f64 (double x) +{ + return (float64x2_t) V2 (x); +} +static inline uint64x2_t +v_u64 (uint64_t x) +{ + return (uint64x2_t) V2 (x); +} +static inline int64x2_t +v_s64 (int64_t x) +{ + return (int64x2_t) V2 (x); +} + +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (uint64x2_t x) +{ + /* assume elements in x are either 0 or -1u. 
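+     In that case each lane is an all-zeros or all-ones comparison mask, so
+     the pairwise add of the two 64-bit lanes (vpaddd_u64) is non-zero
+     exactly when at least one lane is set.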
*/ + return vpaddd_u64 (x) != 0; +} +static inline float64x2_t +v_lookup_f64 (const double *tab, uint64x2_t idx) +{ + return (float64x2_t){ tab[idx[0]], tab[idx[1]] }; +} +static inline uint64x2_t +v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) +{ + return (uint64x2_t){ tab[idx[0]], tab[idx[1]] }; +} +static inline float64x2_t +v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) +{ + double p1 = p[1]; + double x1 = x[1]; + if (likely (p[0])) + y[0] = f (x[0]); + if (likely (p1)) + y[1] = f (x1); + return y; +} + +static inline float64x2_t +v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2, + float64x2_t y, uint64x2_t p) +{ + double p1 = p[1]; + double x1h = x1[1]; + double x2h = x2[1]; + if (likely (p[0])) + y[0] = f (x1[0], x2[0]); + if (likely (p1)) + y[1] = f (x1h, x2h); + return y; +} +static inline float64x2_t +v_zerofy_f64 (float64x2_t x, uint64x2_t mask) +{ + return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask)); +} + +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f32.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f32.h new file mode 100644 index 000000000000..9a9c5c1ac15b --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f32.h @@ -0,0 +1,24 @@ +/* + * Helpers for evaluating polynomials on single-precision AdvSIMD input, using + * various schemes. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_POLY_ADVSIMD_F32_H +#define MATH_POLY_ADVSIMD_F32_H + +#include <arm_neon.h> + +/* Wrap AdvSIMD f32 helpers: evaluation of some scheme/order has form: + v_[scheme]_[order]_f32. */ +#define VTYPE float32x4_t +#define FMA(x, y, z) vfmaq_f32 (z, x, y) +#define VWRAP(f) v_##f##_f32 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f64.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f64.h new file mode 100644 index 000000000000..4331bfbd03b0 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_poly_f64.h @@ -0,0 +1,24 @@ +/* + * Helpers for evaluating polynomials on double-precision AdvSIMD input, using + * various schemes. + * + * Copyright (c) 2023-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef MATH_POLY_ADVSIMD_F64_H +#define MATH_POLY_ADVSIMD_F64_H + +#include <arm_neon.h> + +/* Wrap AdvSIMD f64 helpers: evaluation of some scheme/order has form: + v_[scheme]_[order]_f64. */ +#define VTYPE float64x2_t +#define FMA(x, y, z) vfmaq_f64 (z, x, y) +#define VWRAP(f) v_##f##_f64 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h new file mode 100644 index 000000000000..14227d9339a8 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h @@ -0,0 +1,86 @@ +/* + * Core approximation for double-precision vector sincos + * + * Copyright (c) 2023-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "v_poly_f64.h" + +static const struct v_sincos_data +{ + float64x2_t sin_poly[7], cos_poly[6], pio2[3]; + float64x2_t inv_pio2, shift, range_val; +} v_sincos_data = { + .inv_pio2 = V2 (0x1.45f306dc9c882p-1), + .pio2 = { V2 (0x1.921fb50000000p+0), V2 (0x1.110b460000000p-26), + V2 (0x1.1a62633145c07p-54) }, + .shift = V2 (0x1.8p52), + .sin_poly = { /* Computed using Remez in [-pi/2, pi/2]. */ + V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + .cos_poly = { /* Computed using Remez in [-pi/4, pi/4]. */ + V2 (0x1.555555555554cp-5), V2 (-0x1.6c16c16c1521fp-10), + V2 (0x1.a01a019cbf62ap-16), V2 (-0x1.27e4f812b681ep-22), + V2 (0x1.1ee9f152a57cdp-29), V2 (-0x1.8fb131098404bp-37) }, + .range_val = V2 (0x1p23), }; + +static inline uint64x2_t +check_ge_rangeval (float64x2_t x, const struct v_sincos_data *d) +{ + return vcagtq_f64 (x, d->range_val); +} + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate polynomials. + Largest observed error is for sin, 3.22 ULP: + v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. */ +static inline float64x2x2_t +v_sincos_inline (float64x2_t x, const struct v_sincos_data *d) +{ + /* q = nearest integer to 2 * x / pi. */ + float64x2_t q = vsubq_f64 (vfmaq_f64 (d->shift, x, d->inv_pio2), d->shift); + int64x2_t n = vcvtq_s64_f64 (q); + + /* Use q to reduce x to r in [-pi/4, pi/4], by: + r = x - q * pi/2, in extended precision. */ + float64x2_t r = x; + r = vfmsq_f64 (r, q, d->pio2[0]); + r = vfmsq_f64 (r, q, d->pio2[1]); + r = vfmsq_f64 (r, q, d->pio2[2]); + + float64x2_t r2 = r * r, r3 = r2 * r, r4 = r2 * r2; + + /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */ + float64x2_t s = v_pw_horner_6_f64 (r2, r4, d->sin_poly); + s = vfmaq_f64 (r, r3, s); + + /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */ + float64x2_t c = v_pw_horner_5_f64 (r2, r4, d->cos_poly); + c = vfmaq_f64 (v_f64 (-0.5), r2, c); + c = vfmaq_f64 (v_f64 (1), r2, c); + + /* If odd quadrant, swap cos and sin. */ + uint64x2_t swap = vtstq_s64 (n, v_s64 (1)); + float64x2_t ss = vbslq_f64 (swap, c, s); + float64x2_t cc = vbslq_f64 (swap, s, c); + + /* Fix signs according to quadrant. + ss = asdouble(asuint64(ss) ^ ((n & 2) << 62)) + cc = asdouble(asuint64(cc) & (((n + 1) & 2) << 62)). */ + uint64x2_t sin_sign + = vshlq_n_u64 (vandq_u64 (vreinterpretq_u64_s64 (n), v_u64 (2)), 62); + uint64x2_t cos_sign = vshlq_n_u64 ( + vandq_u64 (vreinterpretq_u64_s64 (vaddq_s64 (n, v_s64 (1))), v_u64 (2)), + 62); + ss = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ss), sin_sign)); + cc = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (cc), cos_sign)); + + return (float64x2x2_t){ ss, cc }; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincosf_common.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincosf_common.h new file mode 100644 index 000000000000..7c29eded14d6 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincosf_common.h @@ -0,0 +1,84 @@ +/* + * Core approximation for single-precision vector sincos + * + * Copyright (c) 2023-2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +const static struct v_sincosf_data +{ + float32x4_t poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val; +} v_sincosf_data = { + .poly_sin = { /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4]. */ + V4 (-0x1.555546p-3), V4 (0x1.11076p-7), V4 (-0x1.994eb4p-13) }, + .poly_cos = { /* Generated using Remez, even coeffs only, in [-pi/4, pi/4]. */ + V4 (0x1.55554ap-5), V4 (-0x1.6c0c1ap-10), V4 (0x1.99e0eep-16) }, + .pio2 = { V4 (0x1.921fb6p+0f), V4 (-0x1.777a5cp-25f), V4 (-0x1.ee59dap-50f) }, + .inv_pio2 = V4 (0x1.45f306p-1f), + .shift = V4 (0x1.8p23), + .range_val = V4 (0x1p20), +}; + +static inline uint32x4_t +check_ge_rangeval (float32x4_t x, const struct v_sincosf_data *d) +{ + return vcagtq_f32 (x, d->range_val); +} + +/* Single-precision vector function allowing calculation of both sin and cos in + one function call, using shared argument reduction and separate low-order + polynomials. + Worst-case error for sin is 1.67 ULP: + v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 + Worst-case error for cos is 1.81 ULP: + v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ +static inline float32x4x2_t +v_sincosf_inline (float32x4_t x, const struct v_sincosf_data *d) +{ + /* n = rint ( x / (pi/2) ). */ + float32x4_t shift = d->shift; + float32x4_t q = vfmaq_f32 (shift, x, d->inv_pio2); + q = vsubq_f32 (q, shift); + int32x4_t n = vcvtq_s32_f32 (q); + + /* Reduce x such that r is in [ -pi/4, pi/4 ]. */ + float32x4_t r = x; + r = vfmsq_f32 (r, q, d->pio2[0]); + r = vfmsq_f32 (r, q, d->pio2[1]); + r = vfmsq_f32 (r, q, d->pio2[2]); + + /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */ + float32x4_t r2 = vmulq_f32 (r, r), r3 = vmulq_f32 (r, r2); + float32x4_t s = vfmaq_f32 (d->poly_sin[1], r2, d->poly_sin[2]); + s = vfmaq_f32 (d->poly_sin[0], r2, s); + s = vfmaq_f32 (r, r3, s); + + /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */ + float32x4_t r4 = vmulq_f32 (r2, r2); + float32x4_t p = vfmaq_f32 (d->poly_cos[1], r2, d->poly_cos[2]); + float32x4_t c = vfmaq_f32 (v_f32 (-0.5), r2, d->poly_cos[0]); + c = vfmaq_f32 (c, r4, p); + c = vfmaq_f32 (v_f32 (1), c, r2); + + /* If odd quadrant, swap cos and sin. */ + uint32x4_t swap = vtstq_u32 (vreinterpretq_u32_s32 (n), v_u32 (1)); + float32x4_t ss = vbslq_f32 (swap, c, s); + float32x4_t cc = vbslq_f32 (swap, s, c); + + /* Fix signs according to quadrant. + ss = asfloat(asuint(ss) ^ ((n & 2) << 30)) + cc = asfloat(asuint(cc) & (((n + 1) & 2) << 30)). */ + uint32x4_t sin_sign + = vshlq_n_u32 (vandq_u32 (vreinterpretq_u32_s32 (n), v_u32 (2)), 30); + uint32x4_t cos_sign = vshlq_n_u32 ( + vandq_u32 (vreinterpretq_u32_s32 (vaddq_s32 (n, v_s32 (1))), v_u32 (2)), + 30); + ss = vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ss), sin_sign)); + cc = vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (cc), cos_sign)); + + return (float32x4x2_t){ ss, cc }; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospi_common.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospi_common.h new file mode 100644 index 000000000000..438b141b9174 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospi_common.h @@ -0,0 +1,64 @@ +/* + * Helper for Double-precision vector sincospi function. + * + * Copyright (c) 2024, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "v_math.h" +#include "v_poly_f64.h" + +static const struct v_sincospi_data +{ + float64x2_t poly[10], range_val; +} v_sincospi_data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2), + V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1), + V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8), + V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16), + V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) }, + .range_val = V2 (0x1p63), +}; + +/* Double-precision vector function allowing calculation of both sin and cos in + one function call, using separate argument reduction and shared low-order + polynomials. + Approximation for vector double-precision sincospi(x). + Maximum Error 3.09 ULP: + _ZGVnN2v_sincospi_sin(0x1.7a41deb4b21e1p+14) got 0x1.fd54d0b327cf1p-1 + want 0x1.fd54d0b327cf4p-1 + Maximum Error 3.16 ULP: + _ZGVnN2v_sincospi_cos(-0x1.11e3c7e284adep-5) got 0x1.fd2da484ff3ffp-1 + want 0x1.fd2da484ff402p-1. */ +static inline float64x2x2_t +v_sincospi_inline (float64x2_t x, const struct v_sincospi_data *d) +{ + /* If r is odd, the sign of the result should be inverted for sinpi + and reintroduced for cospi. */ + uint64x2_t cmp = vcgeq_f64 (x, d->range_val); + uint64x2_t odd = vshlq_n_u64 ( + vbicq_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (x)), cmp), 63); + + /* r = x - rint(x). */ + float64x2_t sr = vsubq_f64 (x, vrndaq_f64 (x)); + /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */ + float64x2_t cr = vsubq_f64 (v_f64 (0.5), vabsq_f64 (sr)); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + float64x2_t sr2 = vmulq_f64 (sr, sr); + float64x2_t sr4 = vmulq_f64 (sr2, sr2); + float64x2_t cr2 = vmulq_f64 (cr, cr); + float64x2_t cr4 = vmulq_f64 (cr2, cr2); + + float64x2_t ss = vmulq_f64 (v_pw_horner_9_f64 (sr2, sr4, d->poly), sr); + float64x2_t cc = vmulq_f64 (v_pw_horner_9_f64 (cr2, cr4, d->poly), cr); + + float64x2_t sinpix + = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (ss), odd)); + + float64x2_t cospix + = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (cc), odd)); + + return (float64x2x2_t){ sinpix, cospix }; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospif_common.h b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospif_common.h new file mode 100644 index 000000000000..8d4177dd871e --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincospif_common.h @@ -0,0 +1,57 @@ +/* + * Helper for Single-precision vector sincospi function. + * + * Copyright (c) 2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "mathlib.h" +#include "v_math.h" +#include "v_poly_f32.h" + +const static struct v_sincospif_data +{ + float32x4_t poly[6], range_val; +} v_sincospif_data = { + /* Taylor series coefficents for sin(pi * x). */ + .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f), + V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) }, + .range_val = V4 (0x1p31f), +}; + +/* Single-precision vector function allowing calculation of both sinpi and + cospi in one function call, using shared argument reduction and polynomials. + Worst-case error for sin is 3.04 ULP: + _ZGVnN4v_sincospif_sin(0x1.1d341ap-1) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. 
+ Worst-case error for cos is 3.18 ULP: + _ZGVnN4v_sincospif_cos(0x1.d341a8p-5) got 0x1.f7cd56p-1 want 0x1.f7cd5p-1. + */ +static inline float32x4x2_t +v_sincospif_inline (float32x4_t x, const struct v_sincospif_data *d) +{ + /* If r is odd, the sign of the result should be inverted for sinpi and + reintroduced for cospi. */ + uint32x4_t cmp = vcgeq_f32 (x, d->range_val); + uint32x4_t odd = vshlq_n_u32 ( + vbicq_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), cmp), 31); + + /* r = x - rint(x). */ + float32x4_t sr = vsubq_f32 (x, vrndaq_f32 (x)); + /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */ + float32x4_t cr = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (sr)); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + float32x4_t sr2 = vmulq_f32 (sr, sr); + float32x4_t sr4 = vmulq_f32 (sr2, sr2); + float32x4_t cr2 = vmulq_f32 (cr, cr); + float32x4_t cr4 = vmulq_f32 (cr2, cr2); + + float32x4_t ss = vmulq_f32 (v_pw_horner_5_f32 (sr2, sr4, d->poly), sr); + float32x4_t cc = vmulq_f32 (v_pw_horner_5_f32 (cr2, cr4, d->poly), cr); + + float32x4_t sinpix + = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (ss), odd)); + float32x4_t cospix + = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (cc), odd)); + + return (float32x4x2_t){ sinpix, cospix }; +} |
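For reference, below is a scalar sketch of the shared sinpi/cospi scheme used by v_sincospif_common.h above. The helper name sincospif_ref is hypothetical and not part of the library; the sketch reuses the poly[] coefficients from v_sincospif_data, evaluates them with plain Horner rather than the pairwise-Horner scheme, and omits the range_val clamp that the vector code applies before converting x to an integer.

#include <math.h>
#include <stdint.h>

/* Hypothetical scalar reference for sinpi/cospi; illustration only.  */
static void
sincospif_ref (float x, float *sinpix, float *cospix)
{
  /* Same coefficients as v_sincospif_data.poly above:
     sin (pi * r) ~= r * P (r^2) on [-1/2, 1/2].  */
  static const float poly[6]
      = { 0x1.921fb6p1f,   -0x1.4abbcep2f, 0x1.466bc6p1f,
          -0x1.32d2ccp-1f, 0x1.50783p-4f,  -0x1.e30750p-8f };

  /* roundf matches the to-nearest, ties-away rounding of vrndaq/vcvtaq.  */
  float n = roundf (x);
  float sr = x - n;             /* sr in [-1/2, 1/2].  */
  float cr = 0.5f - fabsf (sr); /* cospi(x) = sinpi(0.5 - |sr|).  */

  /* Plain Horner evaluation of P at sr^2 and cr^2.  */
  float sr2 = sr * sr, cr2 = cr * cr;
  float ps = poly[5], pc = poly[5];
  for (int i = 4; i >= 0; i--)
    {
      ps = ps * sr2 + poly[i];
      pc = pc * cr2 + poly[i];
    }
  float s = ps * sr;
  float c = pc * cr;

  /* sinpi(n + r) = (-1)^n sinpi(r), and likewise for cospi: flip both
     signs when n is odd (no large-|x| clamp in this sketch).  */
  if ((int64_t) n & 1)
    {
      s = -s;
      c = -c;
    }

  *sinpix = s;
  *cospix = c;
}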
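Similarly, a minimal caller sketch for the single-precision sincos helper. The driver name and the scalar fallback are assumptions made for illustration (the library's actual vector entry points live in separate source files): the idea is to evaluate v_sincosf_inline once for both results and recompute only the lanes flagged by check_ge_rangeval with scalar sinf/cosf.

#include <arm_neon.h>
#include <math.h>
/* Assumes v_math.h and v_sincosf_common.h (above) are included first.  */

void
vector_sincosf_sketch (float32x4_t x, float *out_sin, float *out_cos)
{
  const struct v_sincosf_data *d = &v_sincosf_data;

  /* Lanes with |x| > range_val cannot use the fast reduction.  */
  uint32x4_t special = check_ge_rangeval (x, d);

  /* One shared argument reduction yields both sin and cos.  */
  float32x4x2_t sc = v_sincosf_inline (x, d);
  vst1q_f32 (out_sin, sc.val[0]);
  vst1q_f32 (out_cos, sc.val[1]);

  /* Fall back to scalar routines for the flagged lanes only.  */
  if (v_any_u32 (special))
    {
      float xs[4];
      uint32_t sp[4];
      vst1q_f32 (xs, x);
      vst1q_u32 (sp, special);
      for (int i = 0; i < 4; i++)
        if (sp[i])
          {
            out_sin[i] = sinf (xs[i]);
            out_cos[i] = cosf (xs[i]);
          }
    }
}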