Diffstat (limited to 'contrib/arm-optimized-routines/math')
147 files changed, 3276 insertions, 3257 deletions
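Most of this import adds AdvSIMD (Neon) implementations under math/aarch64/ that export vector-PCS symbols named after the AArch64 vector function ABI scheme (see the V_NAME_* macros in v_math.h below). As a hedged sketch of how the new entry points are consumed — the declarations and link line are assumptions based on that scheme and on the Dir.mk targets, not part of the diff itself:

/* Illustration only, not part of this commit: a minimal standalone caller
   for two of the imported routines, assuming the vector-PCS symbol names
   generated by V_NAME_D1/V_NAME_F1 in math/aarch64/v_math.h and a link
   against the libmathlib built by Dir.mk, e.g.
   cc -O2 demo.c build/lib/libmathlib.a -lm  */
#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

__attribute__ ((aarch64_vector_pcs)) float64x2_t _ZGVnN2v_sin (float64x2_t);
__attribute__ ((aarch64_vector_pcs)) float32x4_t _ZGVnN4v_expf (float32x4_t);

int
main (void)
{
  /* Two double lanes through the vector sine, compared with scalar sin.  */
  float64x2_t dx = { 0.5, 1.5 };
  float64x2_t ds = _ZGVnN2v_sin (dx);
  for (int i = 0; i < 2; i++)
    printf ("sin(%g)\tvector %a\tscalar %a\n", dx[i], ds[i], sin (dx[i]));

  /* Four float lanes through the vector exp, compared with scalar expf.  */
  float32x4_t fx = { -1.0f, 0.0f, 1.0f, 10.0f };
  float32x4_t fe = _ZGVnN4v_expf (fx);
  for (int i = 0; i < 4; i++)
    printf ("expf(%g)\tvector %a\tscalar %a\n", (double) fx[i],
            (double) fe[i], (double) expf (fx[i]));
  return 0;
}

Lane subscripting (dx[i]) relies on the same GCC/Clang vector extensions the library sources themselves use.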
diff --git a/contrib/arm-optimized-routines/math/Dir.mk b/contrib/arm-optimized-routines/math/Dir.mk index 3b841ab71955..5e9494a7bd3c 100644 --- a/contrib/arm-optimized-routines/math/Dir.mk +++ b/contrib/arm-optimized-routines/math/Dir.mk @@ -1,12 +1,14 @@ # Makefile fragment - requires GNU make # -# Copyright (c) 2019, Arm Limited. -# SPDX-License-Identifier: MIT +# Copyright (c) 2019-2023, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception S := $(srcdir)/math B := build/math math-lib-srcs := $(wildcard $(S)/*.[cS]) +math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS]) + math-test-srcs := \ $(S)/test/mathtest.c \ $(S)/test/mathbench.c \ @@ -15,6 +17,7 @@ math-test-srcs := \ math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS]) math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h)) +math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h)) math-libs := \ build/lib/libmathlib.so \ @@ -42,10 +45,11 @@ math-files := \ $(math-tools) \ $(math-host-tools) \ $(math-includes) \ + $(math-test-includes) \ -all-math: $(math-libs) $(math-tools) $(math-includes) +all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes) -$(math-objs): $(math-includes) +$(math-objs): $(math-includes) $(math-test-includes) $(math-objs): CFLAGS_ALL += $(math-cflags) $(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno $(math-host-objs): CC = $(HOST_CC) @@ -63,6 +67,8 @@ build/lib/libmathlib.a: $(math-lib-objs) $(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc $(math-tools): LDLIBS += $(math-ldlibs) -lm +# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled +$(math-tools): CFLAGS_ALL += $(math-sve-cflags) build/bin/rtest: $(math-host-objs) $(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS) @@ -83,6 +89,9 @@ build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a build/include/%.h: $(S)/include/%.h cp $< $@ +build/include/test/%.h: $(S)/test/%.h + cp $< $@ + build/bin/%.sh: $(S)/test/%.sh cp $< $@ @@ -96,7 +105,7 @@ check-math-rtest: $(math-host-tools) $(math-tools) cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags) check-math-ulp: $(math-tools) - ULPFLAGS="$(math-ulpflags)" build/bin/runulp.sh $(EMULATOR) + ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR) check-math: check-math-test check-math-rtest check-math-ulp diff --git a/contrib/arm-optimized-routines/math/README.contributors b/contrib/arm-optimized-routines/math/README.contributors new file mode 100644 index 000000000000..33e7ba376e41 --- /dev/null +++ b/contrib/arm-optimized-routines/math/README.contributors @@ -0,0 +1,78 @@ +STYLE REQUIREMENTS +================== + +1. Most code in this sub-directory is expected to be upstreamed into glibc, so + the GNU Coding Standard and glibc-specific conventions should be followed + to ease upstreaming. + +2. ABI and symbols: the code should be written so it is suitable for inclusion + into a libc with minimal changes. This means, for example, that internal symbols + should be hidden and placed in the implementation-reserved namespace according to + ISO C and POSIX rules. If possible, the built shared libraries and static + library archives should be usable to override libc symbols at link time (or + at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI + (other than symbol versioning); this cannot be done reliably for static + linking, so it is a best-effort requirement. + +3. 
API: include headers should be suitable for benchmarking and testing code + and should not conflict with libc headers. + + +CONTRIBUTION GUIDELINES FOR math SUB-DIRECTORY +============================================== + +1. Math functions have quality and performance requirements. + +2. Quality: + - Worst-case ULP error should be small in the entire input domain (for most + common double-precision scalar functions the target is < 0.66 ULP error, + and < 1 ULP for single precision; even a performance-optimized function + variant should not have > 5 ULP error if the goal is to be a drop-in + replacement for a standard math function); this should be tested + statistically (or on all inputs if possible in a reasonable amount of time). + The ulp tool exists for this, and runulp.sh should be updated for new functions. + + - All standard rounding modes need to be supported, but in non-default rounding + modes the quality requirement can be relaxed. (Non-nearest rounded + computation can be slow and inaccurate but has to be correct for conformance + reasons.) + + - Special cases and error handling need to follow ISO C Annex F requirements, + POSIX requirements, IEEE 754-2008 requirements and glibc requirements: + https://www.gnu.org/software/libc/manual/html_mono/libc.html#Errors-in-Math-Functions + This should be tested by direct tests (the glibc test system may be used for it). + + - Error handling code should be decoupled from the approximation code as much + as possible. (There are helper functions; these take care of errno as well + as exception raising.) + + - Vector math code does not need to work in non-nearest rounding modes, and error + handling side effects need not happen (fenv exceptions and errno), but the + result should be correct (within the quality requirements, which are lower for + vector code than for scalar code). + + - Error bounds of the approximation should be clearly documented. + + - The code should build and pass tests on arm, aarch64 and x86_64 GNU/Linux + systems. (Routines and features can be disabled on specific targets, but + the build must complete.) On aarch64, both little- and big-endian targets + are supported, as well as valid combinations of architecture extensions. + The configurations that should be tested depend on the contribution. + +3. Performance: + - Common math code should be benchmarked on modern aarch64 microarchitectures + over typical inputs. + + - Performance improvements should be documented (relative numbers can be + published; it is enough to use the mathbench microbenchmark tool, which should + be updated for new functions). + + - Attention should be paid to the compilation flags: for aarch64, fma + contraction should be on and math errno turned off so that some builtins can be + inlined. + + - The code should be reasonably performant on x86_64 too: e.g. some rounding + instructions and fma may not be available on x86_64, and such builtins turn into + libc calls with slow code. Such a slowdown is not acceptable; a faster fallback + should be present: glibc and bionic use the same code on all targets. (This + does not apply to vector math code.) diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cos.c b/contrib/arm-optimized-routines/math/aarch64/v_cos.c new file mode 100644 index 000000000000..9a73575bce89 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_cos.c @@ -0,0 +1,87 @@ +/* + * Double-precision vector cos function. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float64x2_t poly[7]; + float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3; +} data = { + /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */ + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + .inv_pi = V2 (0x1.45f306dc9c883p-2), + .half_pi = V2 (0x1.921fb54442d18p+0), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), + .shift = V2 (0x1.8p52), + .range_val = V2 (0x1p23) +}; + +#define C(i) d->poly[i] + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (cos, x, y, cmp); +} + +float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t n, r, r2, r3, r4, t1, t2, t3, y; + uint64x2_t odd, cmp; + +#if WANT_SIMD_EXCEPT + r = vabsq_f64 (x); + cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r), + vreinterpretq_u64_f64 (d->range_val)); + if (unlikely (v_any_u64 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f64 (cmp, v_f64 (1.0), r); +#else + cmp = vcageq_f64 (x, d->range_val); + r = x; +#endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); + n = vsubq_f64 (n, d->shift); + n = vsubq_f64 (n, v_f64 (0.5)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (r, d->pi_2, n); + r = vfmsq_f64 (r, d->pi_3, n); + + /* sin(r) poly approx. */ + r2 = vmulq_f64 (r, r); + r3 = vmulq_f64 (r2, r); + r4 = vmulq_f64 (r2, r2); + + t1 = vfmaq_f64 (C (4), C (5), r2); + t2 = vfmaq_f64 (C (2), C (3), r2); + t3 = vfmaq_f64 (C (0), C (1), r2); + + y = vfmaq_f64 (t1, C (6), r4); + y = vfmaq_f64 (t2, y, r4); + y = vfmaq_f64 (t3, y, r4); + y = vfmaq_f64 (r, y, r3); + + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cosf.c b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c new file mode 100644 index 000000000000..b9890b2998ad --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector cos function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[4]; + float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; +} data = { + /* 1.886 ulp error. 
*/ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), + V4 (0x1.5b2e76p-19f) }, + + .pi_1 = V4 (0x1.921fb6p+1f), + .pi_2 = V4 (-0x1.777a5cp-24f), + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), + .shift = V4 (0x1.8p+23f), + .half_pi = V4 (0x1.921fb6p0f), + .range_val = V4 (0x1p20f) +}; + +#define C(i) d->poly[i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (cosf, x, y, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, r3, y; + uint32x4_t odd, cmp; + +#if WANT_SIMD_EXCEPT + r = vabsq_f32 (x); + cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r), + vreinterpretq_u32_f32 (d->range_val)); + if (unlikely (v_any_u32 (cmp))) + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f32 (cmp, v_f32 (1.0f), r); +#else + cmp = vcageq_f32 (x, d->range_val); + r = x; +#endif + + /* n = rint((|x|+pi/2)/pi) - 0.5. */ + n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi)); + odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); + n = vsubq_f32 (n, d->shift); + n = vsubq_f32 (n, v_f32 (0.5f)); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (r, d->pi_2, n); + r = vfmsq_f32 (r, d->pi_3, n); + + /* y = sin(r). */ + r2 = vmulq_f32 (r, r); + r3 = vmulq_f32 (r2, r); + y = vfmaq_f32 (C (2), C (3), r2); + y = vfmaq_f32 (C (1), y, r2); + y = vfmaq_f32 (C (0), y, r2); + y = vfmaq_f32 (r, y, r3); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp.c b/contrib/arm-optimized-routines/math/aarch64/v_exp.c new file mode 100644 index 000000000000..bc5609faf4fc --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp.c @@ -0,0 +1,125 @@ +/* + * Double-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask (N - 1) + +const static volatile struct +{ + float64x2_t poly[3]; + float64x2_t inv_ln2, ln2_hi, ln2_lo, shift; +#if !WANT_SIMD_EXCEPT + float64x2_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.88 +0.5 ulp + rel error: 1.4337*2^-53 + abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ + .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3), + V2 (0x1.55555da646206p-5) }, +#if !WANT_SIMD_EXCEPT + .scale_thresh = V2 (163840.0), /* 1280.0 * N. */ + .special_bound = V2 (704.0), +#endif + .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */ + .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */ + .ln2_lo = V2 (0x1.abc9e3b39803f3p-63), + .shift = V2 (0x1.8p+52) +}; + +#define C(i) data.poly[i] +#define Tab __v_exp_data + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */ +# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */ +# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. 
 */ + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine for special lanes. */ + return v_call_f64 (exp, x, y, cmp); +} + +#else + +# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ +/* SpecialBias1 - SpecialBias2 = asuint(1.0). */ +# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ +# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */ + +static inline float64x2_t VPCS_ATTR +special_case (float64x2_t s, float64x2_t y, float64x2_t n) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); + float64x2_t s2 = vreinterpretq_f64_u64 ( + vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); + uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); + return vbslq_f64 (cmp, r1, r0); +} + +#endif + +float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x) +{ + float64x2_t n, r, r2, s, y, z; + uint64x2_t cmp, u, e; + +#if WANT_SIMD_EXCEPT + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + float64x2_t xm = x; + uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound); + if (unlikely (v_any_u64 (cmp))) + x = vbslq_f64 (cmp, v_f64 (1), x); +#else + cmp = vcagtq_f64 (x, data.special_bound); +#endif + + /* n = round(x/(ln2/N)). */ + z = vfmaq_f64 (data.shift, x, data.inv_ln2); + u = vreinterpretq_u64_f64 (z); + n = vsubq_f64 (z, data.shift); + + /* r = x - n*ln2/N. */ + r = x; + r = vfmsq_f64 (r, data.ln2_hi, n); + r = vfmsq_f64 (r, data.ln2_lo, n); + + e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */ + r2 = vmulq_f64 (r, r); + y = vfmaq_f64 (C (0), C (1), r); + y = vfmaq_f64 (y, C (2), r2); + y = vfmaq_f64 (r, y, r2); + + /* s = 2^(n/N). */ + u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] }; + s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + if (unlikely (v_any_u64 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f64 (s, y, s), cmp); +#else + return special_case (s, y, n); +#endif + + return vfmaq_f64 (s, y, s); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c new file mode 100644 index 000000000000..e402205e98e6 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c @@ -0,0 +1,113 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[5]; + uint32x4_t exponent_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.962 ulp. 
*/ + .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), + V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, + .exponent_bias = V4 (0x3f800000), +#if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +#endif +}; + +#define C(i) d->poly[i] + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ +# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine for special lanes. */ + return v_call_f32 (exp2f, x, y, cmp); +} + +#else + +# define SpecialOffset v_u32 (0x82000000) +# define SpecialBias v_u32 (0x7f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, scale, p, q, poly; + uint32x4_t cmp, e; + +#if WANT_SIMD_EXCEPT + /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ + uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); + cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special_case to fix special lanes later. This is only necessary if fenv + exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ + n = vrndaq_f32 (x); + r = vsubq_f32 (x, n); + e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + cmp = vcagtq_f32 (n, d->special_bound); +#endif + + r2 = vmulq_f32 (r, r); + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c new file mode 100644 index 000000000000..ba6b02fbb4bc --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c @@ -0,0 +1,72 @@ +/* + * Single-precision vector 2^x function. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const float Poly[] = { + /* maxerr: 0.878 ulp. */ + 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) +#define C5 v_f32 (Poly[5]) + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define Ln2hi v_f32 (0x1.62e4p-1f) +#define Ln2lo v_f32 (0x1.7f7d1cp-20f) + +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); + float32x4_t s2 = vreinterpretq_f32_u32 (e - b); + uint32x4_t cmp = absn > v_f32 (192.0f); + float32x4_t r1 = s1 * s1; + float32x4_t r0 = poly * s1 * s2; + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); +} + +float32x4_t VPCS_ATTR +_ZGVnN4v_exp2f_1u (float32x4_t x) +{ + float32x4_t n, r, scale, poly, absn; + uint32x4_t cmp, e; + + /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ +#if 0 + float32x4_t z; + z = x + Shift; + n = z - Shift; + r = x - n; + e = vreinterpretq_u32_f32 (z) << 23; +#else + n = vrndaq_f32 (x); + r = x - n; + e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23; +#endif + scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); + absn = vabsq_f32 (n); + cmp = absn > v_f32 (126.0f); + poly = vfmaq_f32 (C1, C0, r); + poly = vfmaq_f32 (C2, poly, r); + poly = vfmaq_f32 (C3, poly, r); + poly = vfmaq_f32 (C4, poly, r); + poly = vfmaq_f32 (C5, poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn); + return scale * poly; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp_data.c b/contrib/arm-optimized-routines/math/aarch64/v_exp_data.c new file mode 100644 index 000000000000..45f0848cac5b --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_exp_data.c @@ -0,0 +1,146 @@ +/* + * Lookup table for double-precision e^x vector function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +# define N (1 << V_EXP_TABLE_BITS) + +/* 2^(j/N), j=0..N. 
*/ +const uint64_t __v_exp_data[] = { +# if N == 128 + 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061, + 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de, + 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f, + 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b, + 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0, + 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea, + 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa, + 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96, + 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd, + 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990, + 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715, + 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1, + 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7, + 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c, + 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d, + 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de, + 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7, + 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f, + 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429, + 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09, + 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225, + 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf, + 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74, + 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f, + 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62, + 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad, + 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db, + 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6, + 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50, + 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323, + 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d, + 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a, + 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb, + 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a, + 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c, + 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5, + 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c, + 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398, + 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f, + 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83, + 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27, + 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14, + 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1, +# elif N == 256 + 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, + 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, + 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, + 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, + 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, + 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, + 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, + 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, + 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, + 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, + 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, + 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, + 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, + 0x3fef582f95281c6b, 
0x3fef54873168b9aa, 0x3fef50e75eb44027, + 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, + 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, + 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, + 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, + 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, + 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, + 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, + 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, + 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, + 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, + 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, + 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, + 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, + 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, + 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, + 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, + 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, + 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, + 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, + 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, + 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, + 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, + 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, + 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, + 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, + 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, + 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, + 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, + 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, + 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, + 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, + 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, + 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, + 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, + 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, + 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, + 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, + 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, + 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, + 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, + 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, + 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, + 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, + 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, + 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, + 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, + 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, + 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, + 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, + 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, + 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, + 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, + 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, + 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, + 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, + 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, + 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 
0x3fef27f12e57d14b, + 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, + 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, + 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, + 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, + 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, + 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, + 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, + 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, + 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, + 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, + 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, + 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, + 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, + 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, + 0x3feff9d96b2a23d9, +# endif +}; diff --git a/contrib/arm-optimized-routines/math/aarch64/v_expf.c b/contrib/arm-optimized-routines/math/aarch64/v_expf.c new file mode 100644 index 000000000000..34e8b6081bcd --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_expf.c @@ -0,0 +1,122 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[5]; + float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; + uint32x4_t exponent_bias; +#if !WANT_SIMD_EXCEPT + float32x4_t special_bound, scale_thresh; +#endif +} data = { + /* maxerr: 1.45358 +0.5 ulp. */ + .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), + V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, + .shift = V4 (0x1.8p23f), + .inv_ln2 = V4 (0x1.715476p+0f), + .ln2_hi = V4 (0x1.62e4p-1f), + .ln2_lo = V4 (0x1.7f7d1cp-20f), + .exponent_bias = V4 (0x3f800000), +#if !WANT_SIMD_EXCEPT + .special_bound = V4 (126.0f), + .scale_thresh = V4 (192.0f), +#endif +}; + +#define C(i) d->poly[i] + +#if WANT_SIMD_EXCEPT + +# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ +# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */ +# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) +{ + /* If fenv exceptions are to be triggered correctly, fall back to the scalar + routine to special lanes. */ + return v_call_f32 (expf, x, y, cmp); +} + +#else + +# define SpecialOffset v_u32 (0x82000000) +# define SpecialBias v_u32 (0x7f000000) + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. 
*/ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vbslq_f32 (cmp2, r2, r); +} + +#endif + +float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, scale, p, q, poly, z; + uint32x4_t cmp, e; + +#if WANT_SIMD_EXCEPT + /* asuint(x) - TinyBound >= BigBound - TinyBound. */ + cmp = vcgeq_u32 ( + vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), + TinyBound), + SpecialBound); + float32x4_t xm = x; + /* If any lanes are special, mask them with 1 and retain a copy of x to allow + special case handler to fix special lanes later. This is only necessary if + fenv exceptions are to be triggered correctly. */ + if (unlikely (v_any_u32 (cmp))) + x = vbslq_f32 (cmp, v_f32 (1), x); +#endif + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + z = vfmaq_f32 (d->shift, x, d->inv_ln2); + n = vsubq_f32 (z, d->shift); + r = vfmsq_f32 (x, n, d->ln2_hi); + r = vfmsq_f32 (r, n, d->ln2_lo); + e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + +#if !WANT_SIMD_EXCEPT + cmp = vcagtq_f32 (n, d->special_bound); +#endif + + r2 = vmulq_f32 (r, r); + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + + if (unlikely (v_any_u32 (cmp))) +#if WANT_SIMD_EXCEPT + return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); +#else + return special_case (poly, n, e, cmp, scale, d); +#endif + + return vfmaq_f32 (scale, poly, scale); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c b/contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c new file mode 100644 index 000000000000..43d03fa34efa --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c @@ -0,0 +1,77 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const float Poly[] = { + /* maxerr: 0.36565 +0.5 ulp. */ + 0x1.6a6000p-10f, + 0x1.12718ep-7f, + 0x1.555af0p-5f, + 0x1.555430p-3f, + 0x1.fffff4p-2f, +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define Ln2hi v_f32 (0x1.62e4p-1f) +#define Ln2lo v_f32 (0x1.7f7d1cp-20f) + +static float32x4_t VPCS_ATTR NOINLINE +specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000); + float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b); + float32x4_t s2 = vreinterpretq_f32_u32 (e - b); + uint32x4_t cmp = absn > v_f32 (192.0f); + float32x4_t r1 = s1 * s1; + float32x4_t r0 = poly * s1 * s2; + return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1)) + | (~cmp & vreinterpretq_u32_f32 (r0))); +} + +float32x4_t VPCS_ATTR +_ZGVnN4v_expf_1u (float32x4_t x) +{ + float32x4_t n, r, scale, poly, absn, z; + uint32x4_t cmp, e; + + /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ +#if 1 + z = vfmaq_f32 (Shift, x, InvLn2); + n = z - Shift; + r = vfmaq_f32 (x, n, -Ln2hi); + r = vfmaq_f32 (r, n, -Ln2lo); + e = vreinterpretq_u32_f32 (z) << 23; +#else + z = x * InvLn2; + n = vrndaq_f32 (z); + r = vfmaq_f32 (x, n, -Ln2hi); + r = vfmaq_f32 (r, n, -Ln2lo); + e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23; +#endif + scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); + absn = vabsq_f32 (n); + cmp = absn > v_f32 (126.0f); + poly = vfmaq_f32 (C1, C0, r); + poly = vfmaq_f32 (C2, poly, r); + poly = vfmaq_f32 (C3, poly, r); + poly = vfmaq_f32 (C4, poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + poly = vfmaq_f32 (v_f32 (1.0f), poly, r); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn); + return scale * poly; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_log.c b/contrib/arm-optimized-routines/math/aarch64/v_log.c new file mode 100644 index 000000000000..1d1c1fa62c04 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_log.c @@ -0,0 +1,100 @@ +/* + * Double-precision vector log(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + uint64x2_t min_norm; + uint32x4_t special_bound; + float64x2_t poly[5]; + float64x2_t ln2; + uint64x2_t sign_exp_mask; +} data = { + /* Worst-case error: 1.17 + 0.5 ulp. + Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ + .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), + V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), + V2 (-0x1.554e550bd501ep-3) }, + .ln2 = V2 (0x1.62e42fefa39efp-1), + .min_norm = V2 (0x0010000000000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ + .sign_exp_mask = V2 (0xfff0000000000000) +}; + +#define A(i) d->poly[i] +#define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) +#define Off v_u64 (0x3fe6900900000000) + +struct entry +{ + float64x2_t invc; + float64x2_t logc; +}; + +static inline struct entry +lookup (uint64x2_t i) +{ + /* Since N is a power of 2, n % N = n & (N - 1). */ + struct entry e; + uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask; + float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc); + float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc); + e.invc = vuzp1q_f64 (e0, e1); + e.logc = vuzp2q_f64 (e0, e1); + return e; +} + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, + uint32x2_t cmp) +{ + return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp)); +} + +float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t z, r, r2, p, y, kd, hi; + uint64x2_t ix, iz, tmp; + uint32x2_t cmp; + int64x2_t k; + struct entry e; + + ix = vreinterpretq_u64_f64 (x); + cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm), + vget_low_u32 (d->special_bound)); + + /* x = 2^k z; where z is in range [Off,2*Off) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = vsubq_u64 (ix, Off); + k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ + iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); + z = vreinterpretq_f64_u64 (iz); + e = lookup (tmp); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. 
*/ + r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + kd = vcvtq_f64_s64 (k); + + /* hi = r + log(c) + k*Ln2. */ + hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + r2 = vmulq_f64 (r, r); + y = vfmaq_f64 (A (2), A (3), r); + p = vfmaq_f64 (A (0), A (1), r); + y = vfmaq_f64 (y, A (4), r2); + y = vfmaq_f64 (p, y, r2); + + if (unlikely (v_any_u32h (cmp))) + return special_case (x, y, hi, r2, cmp); + return vfmaq_f64 (hi, y, r2); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_log_data.c b/contrib/arm-optimized-routines/math/aarch64/v_log_data.c new file mode 100644 index 000000000000..82351bb14766 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_log_data.c @@ -0,0 +1,156 @@ +/* + * Lookup table for double-precision log(x) vector function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +#define N (1 << V_LOG_TABLE_BITS) + +const struct v_log_data __v_log_data = { + /* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + poly(z/c - 1) + + where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1, + N=128) and log(c) and 1/c for the ith subinterval comes from lookup tables: + + table[i].invc = 1/c + table[i].logc = (double)log(c) + + where c is near the center of the subinterval and is chosen by trying several + floating point invc candidates around 1/center and selecting one for which + the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval + that contains 1 and the previous one got tweaked to avoid cancellation. */ + .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 }, + { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 }, + { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 }, + { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 }, + { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 }, + { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 }, + { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 }, + { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 }, + { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 }, + { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 }, + { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 }, + { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 }, + { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 }, + { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 }, + { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 }, + { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 }, + { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 }, + { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 }, + { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 }, + { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 }, + { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 }, + { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 }, + { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 }, + { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 }, + { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 }, + { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 }, + { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 }, + { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 }, + { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 }, + { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 }, + { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 }, + { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 }, + { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 }, + { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 }, + { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 }, + { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 }, + { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 }, + 
{ 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 }, + { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 }, + { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 }, + { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 }, + { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 }, + { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 }, + { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 }, + { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 }, + { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 }, + { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 }, + { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 }, + { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 }, + { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 }, + { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 }, + { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 }, + { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 }, + { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 }, + { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 }, + { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 }, + { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 }, + { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 }, + { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 }, + { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 }, + { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 }, + { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 }, + { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 }, + { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 }, + { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 }, + { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 }, + { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 }, + { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 }, + { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 }, + { 0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 }, + { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 }, + { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 }, + { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 }, + { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 }, + { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 }, + { 1.0, 0.0 }, + { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 }, + { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 }, + { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 }, + { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 }, + { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 }, + { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 }, + { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 }, + { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 }, + { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 }, + { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 }, + { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 }, + { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 }, + { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 }, + { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 }, + { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 }, + { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 }, + { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 }, + { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 }, + { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 }, + { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 }, + { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 }, + { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 }, + { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 }, + { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 }, + { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 }, + { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 }, + { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 }, + { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 }, + { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 }, + { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 }, + { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 }, + { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 }, + 
{ 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 }, + { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 }, + { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 }, + { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 }, + { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 }, + { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 }, + { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 }, + { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 }, + { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 }, + { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 }, + { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 }, + { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 }, + { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 }, + { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 }, + { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 }, + { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 }, + { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 }, + { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 }, + { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 }, + { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } } +}; diff --git a/contrib/arm-optimized-routines/math/aarch64/v_logf.c b/contrib/arm-optimized-routines/math/aarch64/v_logf.c new file mode 100644 index 000000000000..66ebbbcd2b5a --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_logf.c @@ -0,0 +1,74 @@ +/* + * Single-precision vector log function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + uint32x4_t min_norm; + uint16x8_t special_bound; + float32x4_t poly[7]; + float32x4_t ln2, tiny_bound; + uint32x4_t off, mantissa_mask; +} data = { + /* 3.34 ulp error. */ + .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), + V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), + V4 (-0x1.ffffc8p-2f) }, + .ln2 = V4 (0x1.62e43p-1f), + .tiny_bound = V4 (0x1p-126), + .min_norm = V4 (0x00800000), + .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ + .off = V4 (0x3f2aaaab), /* 0.666667. */ + .mantissa_mask = V4 (0x007fffff) +}; + +#define P(i) d->poly[7 - i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p, + uint16x4_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); +} + +float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, p, q, r, r2, y; + uint32x4_t u; + uint16x4_t cmp; + + u = vreinterpretq_u32_f32 (x); + cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm), + vget_low_u16 (d->special_bound)); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = vsubq_u32 (u, d->off); + n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ + u = vandq_u32 (u, d->mantissa_mask); + u = vaddq_u32 (u, d->off); + r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + + /* y = log(1+r) + n*ln2. */ + r2 = vmulq_f32 (r, r); + /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). 
*/ + p = vfmaq_f32 (P (5), P (6), r); + q = vfmaq_f32 (P (3), P (4), r); + y = vfmaq_f32 (P (1), P (2), r); + p = vfmaq_f32 (p, P (7), r2); + q = vfmaq_f32 (q, p, r2); + y = vfmaq_f32 (y, q, r2); + p = vfmaq_f32 (r, d->ln2, n); + + if (unlikely (v_any_u16h (cmp))) + return special_case (x, y, r2, p, cmp); + return vfmaq_f32 (p, y, r2); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_math.h b/contrib/arm-optimized-routines/math/aarch64/v_math.h new file mode 100644 index 000000000000..1dc9916c6fb0 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_math.h @@ -0,0 +1,135 @@ +/* + * Vector math abstractions. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _V_MATH_H +#define _V_MATH_H + +#if !__aarch64__ +# error "Cannot build without AArch64" +#endif + +#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) + +#define V_NAME_F1(fun) _ZGVnN4v_##fun##f +#define V_NAME_D1(fun) _ZGVnN2v_##fun +#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f +#define V_NAME_D2(fun) _ZGVnN2vv_##fun + +#include <stdint.h> +#include "../math_config.h" +#include <arm_neon.h> + +/* Shorthand helpers for declaring constants. */ +# define V2(X) { X, X } +# define V4(X) { X, X, X, X } +# define V8(X) { X, X, X, X, X, X, X, X } + +static inline int +v_any_u16h (uint16x4_t x) +{ + return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0; +} + +static inline int +v_lanes32 (void) +{ + return 4; +} + +static inline float32x4_t +v_f32 (float x) +{ + return (float32x4_t) V4 (x); +} +static inline uint32x4_t +v_u32 (uint32_t x) +{ + return (uint32x4_t) V4 (x); +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u32 (uint32x4_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; +} +static inline int +v_any_u32h (uint32x2_t x) +{ + return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0; +} +static inline float32x4_t +v_lookup_f32 (const float *tab, uint32x4_t idx) +{ + return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline uint32x4_t +v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) +{ + return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline float32x4_t +v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) +{ + return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; +} +static inline float32x4_t +v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, + float32x4_t y, uint32x4_t p) +{ + return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0], + p[1] ? f (x1[1], x2[1]) : y[1], + p[2] ? f (x1[2], x2[2]) : y[2], + p[3] ? f (x1[3], x2[3]) : y[3]}; +} + +static inline int +v_lanes64 (void) +{ + return 2; +} +static inline float64x2_t +v_f64 (double x) +{ + return (float64x2_t) V2 (x); +} +static inline uint64x2_t +v_u64 (uint64_t x) +{ + return (uint64x2_t) V2 (x); +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (uint64x2_t x) +{ + /* assume elements in x are either 0 or -1u. 
*/ + return vpaddd_u64 (x) != 0; +} +static inline float64x2_t +v_lookup_f64 (const double *tab, uint64x2_t idx) +{ + return (float64x2_t){tab[idx[0]], tab[idx[1]]}; +} +static inline uint64x2_t +v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) +{ + return (uint64x2_t){tab[idx[0]], tab[idx[1]]}; +} +static inline float64x2_t +v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) +{ + double p1 = p[1]; + double x1 = x[1]; + if (likely (p[0])) + y[0] = f (x[0]); + if (likely (p1)) + y[1] = f (x1); + return y; +} + +#endif diff --git a/contrib/arm-optimized-routines/math/aarch64/v_pow.c b/contrib/arm-optimized-routines/math/aarch64/v_pow.c new file mode 100644 index 000000000000..734f1663a283 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_pow.c @@ -0,0 +1,22 @@ +/* + * Double-precision vector pow function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) +{ + float64x2_t z; + for (int lane = 0; lane < v_lanes64 (); lane++) + { + double sx = x[lane]; + double sy = y[lane]; + double sz = pow (sx, sy); + z[lane] = sz; + } + return z; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_powf.c b/contrib/arm-optimized-routines/math/aarch64/v_powf.c new file mode 100644 index 000000000000..3a4163ab0558 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_powf.c @@ -0,0 +1,148 @@ +/* + * Single-precision vector powf function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" + +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Thresh v_u32 (0x7f000000) /* Max - Min. */ +#define MantissaMask v_u32 (0x007fffff) + +#define A data.log2_poly +#define C data.exp2f_poly + +/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). 
*/ +#define Off v_u32 (0x3f35d000) + +#define V_POWF_LOG2_TABLE_BITS 5 +#define V_EXP2F_TABLE_BITS 5 +#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1) +#define Scale ((double) (1 << V_EXP2F_TABLE_BITS)) + +static const struct +{ + struct + { + double invc, logc; + } log2_tab[1 << V_POWF_LOG2_TABLE_BITS]; + double log2_poly[4]; + uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS]; + double exp2f_poly[3]; +} data = { + .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale}, + {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale}, + {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale}, + {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale}, + {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale}, + {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale}, + {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale}, + {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale}, + {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale}, + {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale}, + {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale}, + {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale}, + {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale}, + {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale}, + {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale}, + {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale}, + {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale}, + {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale}, + {0x1p+0, 0x0p+0 * Scale}, + {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale}, + {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale}, + {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale}, + {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale}, + {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale}, + {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale}, + {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale}, + {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale}, + {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale}, + {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale}, + {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale}, + {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale}, + {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},}, + .log2_poly = { /* rel err: 1.5 * 2^-30. */ + -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale, + -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,}, + .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, + 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa, + 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715, + 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, + 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, + 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, + 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db, + 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, + 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, + 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f, + 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,}, + .exp2f_poly = { /* rel err: 1.69 * 2^-34. 
*/ + 0x1.c6af84b912394p-5 / Scale / Scale / Scale, + 0x1.ebfce50fac4f3p-3 / Scale / Scale, + 0x1.62e42ff0c52d6p-1 / Scale}}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp) +{ + return v_call2_f32 (powf, x, y, ret, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y) +{ + uint32x4_t u = vreinterpretq_u32_f32 (x); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh); + uint32x4_t tmp = vsubq_u32 (u, Off); + uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)), + Log2IdxMask); + uint32x4_t top = vbicq_u32 (tmp, MantissaMask); + uint32x4_t iz = vsubq_u32 (u, top); + int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top), + 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */ + + float32x4_t ret; + for (int lane = 0; lane < 4; lane++) + { + /* Use double precision for each lane. */ + double invc = data.log2_tab[i[lane]].invc; + double logc = data.log2_tab[i[lane]].logc; + double z = (double) asfloat (iz[lane]); + + /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */ + double r = __builtin_fma (z, invc, -1.0); + double y0 = logc + (double) k[lane]; + + /* Polynomial to approximate log1p(r)/ln2. */ + double logx = A[0]; + logx = r * logx + A[1]; + logx = r * logx + A[2]; + logx = r * logx + A[3]; + logx = r * logx + y0; + double ylogx = y[lane] * logx; + cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff) + >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47 + ? 1 + : cmp[lane]; + + /* N*x = k + r with r in [-1/2, 1/2]. */ + double kd = round (ylogx); + uint64_t ki = lround (ylogx); + r = ylogx - kd; + + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */ + uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)]; + t += ki << (52 - V_EXP2F_TABLE_BITS); + double s = asdouble (t); + double p = C[0]; + p = __builtin_fma (p, r, C[1]); + p = __builtin_fma (p, r, C[2]); + p = __builtin_fma (p, s * r, s); + + ret[lane] = p; + } + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, ret, cmp); + return ret; +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_sin.c b/contrib/arm-optimized-routines/math/aarch64/v_sin.c new file mode 100644 index 000000000000..04129c31133d --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_sin.c @@ -0,0 +1,97 @@ +/* + * Double-precision vector sin function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float64x2_t poly[7]; + float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; +} data = { + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), + V2 (-0x1.9e9540300a1p-41) }, + + .range_val = V2 (0x1p23), + .inv_pi = V2 (0x1.45f306dc9c883p-2), + .pi_1 = V2 (0x1.921fb54442d18p+1), + .pi_2 = V2 (0x1.1a62633145c06p-53), + .pi_3 = V2 (0x1.c1cd129024e09p-106), + .shift = V2 (0x1.8p52), +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */ +# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. 
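   A single unsigned compare, (ir - TinyBound) >= Thresh, flags both tails
   at once: inputs below TinyBound wrap around to huge values, and inputs
   at or above RangeVal (TinyBound + Thresh = asuint64 (0x1p23)) exceed
   Thresh directly.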
*/ +#endif + +#define C(i) d->poly[i] + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (sin, x, y, cmp); +} + +/* Vector (AdvSIMD) sin approximation. + Maximum observed error in [-pi/2, pi/2], where argument is not reduced, + is 2.87 ULP: + _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1 + want 0x1.fffffffa7dc05p-1 + Maximum observed error in the entire non-special domain ([-2^23, 2^23]) + is 3.22 ULP: + _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3 + want 0x1.ffdcd125c84f8p-3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t n, r, r2, r3, r4, y, t1, t2, t3; + uint64x2_t odd, cmp; + +#if WANT_SIMD_EXCEPT + /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be + triggered correctly, set any special lanes to 1 (which is neutral w.r.t. + fenv). These lanes will be fixed by special-case handler later. */ + uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); + cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); + r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x); +#else + r = x; + cmp = vcageq_f64 (x, d->range_val); +#endif + + /* n = rint(|x|/pi). */ + n = vfmaq_f64 (d->shift, d->inv_pi, r); + odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); + n = vsubq_f64 (n, d->shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = vfmsq_f64 (r, d->pi_1, n); + r = vfmsq_f64 (r, d->pi_2, n); + r = vfmsq_f64 (r, d->pi_3, n); + + /* sin(r) poly approx. */ + r2 = vmulq_f64 (r, r); + r3 = vmulq_f64 (r2, r); + r4 = vmulq_f64 (r2, r2); + + t1 = vfmaq_f64 (C (4), C (5), r2); + t2 = vfmaq_f64 (C (2), C (3), r2); + t3 = vfmaq_f64 (C (0), C (1), r2); + + y = vfmaq_f64 (t1, C (6), r4); + y = vfmaq_f64 (t2, y, r4); + y = vfmaq_f64 (t3, y, r4); + y = vfmaq_f64 (r, y, r3); + + if (unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/aarch64/v_sinf.c b/contrib/arm-optimized-routines/math/aarch64/v_sinf.c new file mode 100644 index 000000000000..336879844459 --- /dev/null +++ b/contrib/arm-optimized-routines/math/aarch64/v_sinf.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector sin function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "mathlib.h" +#include "v_math.h" + +static const struct data +{ + float32x4_t poly[4]; + float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; +} data = { + /* 1.886 ulp error. */ + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), + V4 (0x1.5b2e76p-19f) }, + + .pi_1 = V4 (0x1.921fb6p+1f), + .pi_2 = V4 (-0x1.777a5cp-24f), + .pi_3 = V4 (-0x1.ee59dap-49f), + + .inv_pi = V4 (0x1.45f306p-2f), + .shift = V4 (0x1.8p+23f), + .range_val = V4 (0x1p20f) +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */ +# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */ +#endif + +#define C(i) d->poly[i] + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. 
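   v_call_f32 applies the scalar routine lane by lane, only where cmp is
   set -- conceptually (a sketch mirroring the v_call_f64 helper above,
   not the library's exact definition):

     for (int i = 0; i < 4; i++)
       if (cmp[i])
         y[i] = f (x[i]);

   The quadrant sign is folded back into y via the XOR with odd first, so
   lanes that are not special keep their already-correct result.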
*/ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (sinf, x, y, cmp); +} + +float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + float32x4_t n, r, r2, y; + uint32x4_t odd, cmp; + +#if WANT_SIMD_EXCEPT + uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x)); + cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh); + /* If fenv exceptions are to be triggered correctly, set any special lanes + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by + special-case handler later. */ + r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x); +#else + r = x; + cmp = vcageq_f32 (x, d->range_val); +#endif + + /* n = rint(|x|/pi) */ + n = vfmaq_f32 (d->shift, d->inv_pi, r); + odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); + n = vsubq_f32 (n, d->shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + r = vfmsq_f32 (r, d->pi_1, n); + r = vfmsq_f32 (r, d->pi_2, n); + r = vfmsq_f32 (r, d->pi_3, n); + + /* y = sin(r) */ + r2 = vmulq_f32 (r, r); + y = vfmaq_f32 (C (2), C (3), r2); + y = vfmaq_f32 (C (1), y, r2); + y = vfmaq_f32 (C (0), y, r2); + y = vfmaq_f32 (r, vmulq_f32 (y, r2), r); + + if (unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} diff --git a/contrib/arm-optimized-routines/math/cosf.c b/contrib/arm-optimized-routines/math/cosf.c index f29f19474e23..6293ce8f1b7d 100644 --- a/contrib/arm-optimized-routines/math/cosf.c +++ b/contrib/arm-optimized-routines/math/cosf.c @@ -1,8 +1,8 @@ /* * Single-precision cos function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -22,7 +22,7 @@ cosf (float y) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4)) + if (abstop12 (y) < abstop12 (pio4f)) { double x2 = x * x; diff --git a/contrib/arm-optimized-routines/math/erf.c b/contrib/arm-optimized-routines/math/erf.c index 12d7e5160df7..5f9f40dda264 100644 --- a/contrib/arm-optimized-routines/math/erf.c +++ b/contrib/arm-optimized-routines/math/erf.c @@ -2,7 +2,7 @@ * Double-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/contrib/arm-optimized-routines/math/erf_data.c b/contrib/arm-optimized-routines/math/erf_data.c index 807875bdd7f5..10cf1fae93e0 100644 --- a/contrib/arm-optimized-routines/math/erf_data.c +++ b/contrib/arm-optimized-routines/math/erf_data.c @@ -2,7 +2,7 @@ * Shared data between erf and erfc. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/contrib/arm-optimized-routines/math/erff.c b/contrib/arm-optimized-routines/math/erff.c index a58e82565dc3..9fa476dbbab2 100644 --- a/contrib/arm-optimized-routines/math/erff.c +++ b/contrib/arm-optimized-routines/math/erff.c @@ -2,7 +2,7 @@ * Single-precision erf(x) function. * * Copyright (c) 2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> diff --git a/contrib/arm-optimized-routines/math/erff_data.c b/contrib/arm-optimized-routines/math/erff_data.c index fa6b1ef4dedb..f822788d0dd8 100644 --- a/contrib/arm-optimized-routines/math/erff_data.c +++ b/contrib/arm-optimized-routines/math/erff_data.c @@ -2,7 +2,7 @@ * Data for approximation of erff. * * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/contrib/arm-optimized-routines/math/exp.c b/contrib/arm-optimized-routines/math/exp.c index 7f5024cd8792..1de500c31f3e 100644 --- a/contrib/arm-optimized-routines/math/exp.c +++ b/contrib/arm-optimized-routines/math/exp.c @@ -2,7 +2,7 @@ * Double-precision e^x function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <float.h> diff --git a/contrib/arm-optimized-routines/math/exp10.c b/contrib/arm-optimized-routines/math/exp10.c new file mode 100644 index 000000000000..0fbec4c694ca --- /dev/null +++ b/contrib/arm-optimized-routines/math/exp10.c @@ -0,0 +1,129 @@ +/* + * Double-precision 10^x function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << EXP_TABLE_BITS) +#define IndexMask (N - 1) +#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */ +#define UFlowBound -0x1.5ep+8 /* -350. */ +#define SmallTop 0x3c6 /* top12(0x1p-57). */ +#define BigTop 0x407 /* top12(0x1p8). */ +#define Thresh 0x41 /* BigTop - SmallTop. */ +#define Shift __exp_data.shift +#define C(i) __exp_data.exp10_poly[i] + +static double +special_case (uint64_t sbits, double_t tmp, uint64_t ki) +{ + double_t scale, y; + + if (ki - (1ull << 16) < 0x80000000) + { + /* The exponent of scale might have overflowed by 1. */ + sbits -= 1ull << 52; + scale = asdouble (sbits); + y = 2 * (scale + scale * tmp); + return check_oflow (eval_as_double (y)); + } + + /* n < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + scale = asdouble (sbits); + y = scale + scale * tmp; + + if (y < 1.0) + { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double_t lo = scale - y + scale * tmp; + double_t hi = 1.0 + y; + lo = 1.0 - hi + y + lo; + y = eval_as_double (hi + lo) - 1.0; + /* Avoid -0.0 with downward rounding. */ + if (WANT_ROUNDING && y == 0.0) + y = 0.0; + /* The underflow exception needs to be signaled explicitly. */ + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + } + y = 0x1p-1022 * y; + + return check_uflow (y); +} + +/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP. */ +double +exp10 (double x) +{ + uint64_t ix = asuint64 (x); + uint32_t abstop = (ix >> 52) & 0x7ff; + + if (unlikely (abstop - SmallTop >= Thresh)) + { + if (abstop - SmallTop >= 0x80000000) + /* Avoid spurious underflow for tiny x. + Note: 0 is common input. */ + return x + 1; + if (abstop == 0x7ff) + return ix == asuint64 (-INFINITY) ? 
0.0 : x + 1.0; + if (x >= OFlowBound) + return __math_oflow (0); + if (x < UFlowBound) + return __math_uflow (0); + + /* Large x is special-cased below. */ + abstop = 0; + } + + /* Reduce x: z = x * N / log10(2), k = round(z). */ + double_t z = __exp_data.invlog10_2N * x; + double_t kd; + int64_t ki; +#if TOINT_INTRINSICS + kd = roundtoint (z); + ki = converttoint (z); +#else + kd = eval_as_double (z + Shift); + kd -= Shift; + ki = kd; +#endif + + /* r = x - k * log10(2), r in [-0.5, 0.5]. */ + double_t r = x; + r = __exp_data.neglog10_2hiN * kd + r; + r = __exp_data.neglog10_2loN * kd + r; + + /* exp10(x) = 2^(k/N) * 2^(r/N). + Approximate the two components separately. */ + + /* s = 2^(k/N), using lookup table. */ + uint64_t e = ki << (52 - EXP_TABLE_BITS); + uint64_t i = (ki & IndexMask) * 2; + uint64_t u = __exp_data.tab[i + 1]; + uint64_t sbits = u + e; + + double_t tail = asdouble (__exp_data.tab[i]); + + /* 2^(r/N) ~= 1 + r * Poly(r). */ + double_t r2 = r * r; + double_t p = C (0) + r * C (1); + double_t y = C (2) + r * C (3); + y = y + r2 * C (4); + y = p + r2 * y; + y = tail + y * r; + + if (unlikely (abstop == 0)) + return special_case (sbits, y, ki); + + /* Assemble components: + y = 2^(r/N) * 2^(k/N) + ~= (y + 1) * s. */ + double_t s = asdouble (sbits); + return eval_as_double (s * y + s); +} diff --git a/contrib/arm-optimized-routines/math/exp2.c b/contrib/arm-optimized-routines/math/exp2.c index 35ab39f22ed5..a1eee44f1f48 100644 --- a/contrib/arm-optimized-routines/math/exp2.c +++ b/contrib/arm-optimized-routines/math/exp2.c @@ -2,7 +2,7 @@ * Double-precision 2^x function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <float.h> diff --git a/contrib/arm-optimized-routines/math/exp2f.c b/contrib/arm-optimized-routines/math/exp2f.c index 94b32538aa0d..776c3ddf7663 100644 --- a/contrib/arm-optimized-routines/math/exp2f.c +++ b/contrib/arm-optimized-routines/math/exp2f.c @@ -2,7 +2,7 @@ * Single-precision 2^x function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <math.h> diff --git a/contrib/arm-optimized-routines/math/exp2f_data.c b/contrib/arm-optimized-routines/math/exp2f_data.c index 3fb0ad11b15a..f0cb7fccacd1 100644 --- a/contrib/arm-optimized-routines/math/exp2f_data.c +++ b/contrib/arm-optimized-routines/math/exp2f_data.c @@ -2,7 +2,7 @@ * Shared data between expf, exp2f and powf. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/contrib/arm-optimized-routines/math/exp_data.c b/contrib/arm-optimized-routines/math/exp_data.c index cba76832566f..c20b1b2d3e06 100644 --- a/contrib/arm-optimized-routines/math/exp_data.c +++ b/contrib/arm-optimized-routines/math/exp_data.c @@ -1,8 +1,8 @@ /* * Shared data between exp, exp2 and pow. * - * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" @@ -12,6 +12,7 @@ const struct exp_data __exp_data = { // N/ln2 .invln2N = 0x1.71547652b82fep0 * N, +.invlog10_2N = 0x1.a934f0979a371p1 * N, // -ln2/N #if N == 64 .negln2hiN = -0x1.62e42fefa0000p-7, @@ -26,6 +27,8 @@ const struct exp_data __exp_data = { .negln2hiN = -0x1.62e42fef80000p-10, .negln2loN = -0x1.1cf79abc9e3b4p-45, #endif +.neglog10_2hiN = -0x1.3441350ap-2 / N, +.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N, // Used for rounding when !TOINT_INTRINSICS #if EXP_USE_TOINT_NARROW .shift = 0x1800000000.8p0, @@ -147,6 +150,24 @@ const struct exp_data __exp_data = { 0x1.3b2ab786ee1dap-7, #endif }, +.exp10_poly = { +#if EXP10_POLY_WIDE +/* Range is wider if using shift-based reduction: coeffs generated + using Remez in [-log10(2)/128, log10(2)/128 ]. */ +0x1.26bb1bbb55515p1, +0x1.53524c73cd32bp1, +0x1.0470591e1a108p1, +0x1.2bd77b12fe9a8p0, +0x1.14289fef24b78p-1 +#else +/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ]. */ +0x1.26bb1bbb55516p1, +0x1.53524c73ce9fep1, +0x1.0470591ce4b26p1, +0x1.2bd76577fe684p0, +0x1.1446eeccd0efbp-1 +#endif +}, // 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) // tab[2*k] = asuint64(T[k]) // tab[2*k+1] = asuint64(H[k]) - (k << 52)/N diff --git a/contrib/arm-optimized-routines/math/expf.c b/contrib/arm-optimized-routines/math/expf.c index 9b2f0c3d8c56..08a20d59e491 100644 --- a/contrib/arm-optimized-routines/math/expf.c +++ b/contrib/arm-optimized-routines/math/expf.c @@ -2,7 +2,7 @@ * Single-precision e^x function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <math.h> diff --git a/contrib/arm-optimized-routines/math/include/mathlib.h b/contrib/arm-optimized-routines/math/include/mathlib.h index 279d829d8ea1..64cbb9c1f850 100644 --- a/contrib/arm-optimized-routines/math/include/mathlib.h +++ b/contrib/arm-optimized-routines/math/include/mathlib.h @@ -1,8 +1,8 @@ /* * Public API. * - * Copyright (c) 2015-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2015-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATHLIB_H @@ -18,74 +18,33 @@ float cosf (float); void sincosf (float, float*, float*); double exp (double); +double exp10 (double); double exp2 (double); double log (double); double log2 (double); double pow (double, double); -/* Scalar functions using the vector algorithm with identical result. */ -float __s_sinf (float); -float __s_cosf (float); -float __s_expf (float); -float __s_expf_1u (float); -float __s_exp2f (float); -float __s_exp2f_1u (float); -float __s_logf (float); -float __s_powf (float, float); -double __s_sin (double); -double __s_cos (double); -double __s_exp (double); -double __s_log (double); -double __s_pow (double, double); - #if __aarch64__ -#if __GNUC__ >= 5 +# if __GNUC__ >= 5 typedef __Float32x4_t __f32x4_t; typedef __Float64x2_t __f64x2_t; -#elif __clang_major__*100+__clang_minor__ >= 305 +# elif __clang_major__*100+__clang_minor__ >= 305 typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; -#else -#error Unsupported compiler -#endif - -/* Vector functions following the base PCS. 
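   (The base PCS preserves only the low 64 bits of v8-v15 across calls;
   the aarch64_vector_pcs attribute used for the _ZGVnN* names below
   additionally preserves q8-q23 in full, so callers can keep vector
   state live across the call.)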
*/ -__f32x4_t __v_sinf (__f32x4_t); -__f32x4_t __v_cosf (__f32x4_t); -__f32x4_t __v_expf (__f32x4_t); -__f32x4_t __v_expf_1u (__f32x4_t); -__f32x4_t __v_exp2f (__f32x4_t); -__f32x4_t __v_exp2f_1u (__f32x4_t); -__f32x4_t __v_logf (__f32x4_t); -__f32x4_t __v_powf (__f32x4_t, __f32x4_t); -__f64x2_t __v_sin (__f64x2_t); -__f64x2_t __v_cos (__f64x2_t); -__f64x2_t __v_exp (__f64x2_t); -__f64x2_t __v_log (__f64x2_t); -__f64x2_t __v_pow (__f64x2_t, __f64x2_t); +# else +# error Unsupported compiler +# endif -#if __GNUC__ >= 9 || __clang_major__ >= 8 -#define __vpcs __attribute__((__aarch64_vector_pcs__)) - -/* Vector functions following the vector PCS. */ -__vpcs __f32x4_t __vn_sinf (__f32x4_t); -__vpcs __f32x4_t __vn_cosf (__f32x4_t); -__vpcs __f32x4_t __vn_expf (__f32x4_t); -__vpcs __f32x4_t __vn_expf_1u (__f32x4_t); -__vpcs __f32x4_t __vn_exp2f (__f32x4_t); -__vpcs __f32x4_t __vn_exp2f_1u (__f32x4_t); -__vpcs __f32x4_t __vn_logf (__f32x4_t); -__vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t); -__vpcs __f64x2_t __vn_sin (__f64x2_t); -__vpcs __f64x2_t __vn_cos (__f64x2_t); -__vpcs __f64x2_t __vn_exp (__f64x2_t); -__vpcs __f64x2_t __vn_log (__f64x2_t); -__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t); +# if __GNUC__ >= 9 || __clang_major__ >= 8 +# undef __vpcs +# define __vpcs __attribute__((__aarch64_vector_pcs__)) /* Vector functions following the vector PCS using ABI names. */ __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); @@ -94,7 +53,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); -#endif +# endif #endif #endif diff --git a/contrib/arm-optimized-routines/math/log.c b/contrib/arm-optimized-routines/math/log.c index d3b7bc60747c..43dfc2a744f0 100644 --- a/contrib/arm-optimized-routines/math/log.c +++ b/contrib/arm-optimized-routines/math/log.c @@ -2,7 +2,7 @@ * Double-precision log(x) function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <float.h> diff --git a/contrib/arm-optimized-routines/math/log2.c b/contrib/arm-optimized-routines/math/log2.c index 55102b772969..3f9c21b03962 100644 --- a/contrib/arm-optimized-routines/math/log2.c +++ b/contrib/arm-optimized-routines/math/log2.c @@ -2,7 +2,7 @@ * Double-precision log2(x) function. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <float.h> diff --git a/contrib/arm-optimized-routines/math/log2_data.c b/contrib/arm-optimized-routines/math/log2_data.c index 3fc9b47c1f03..293bd7df4118 100644 --- a/contrib/arm-optimized-routines/math/log2_data.c +++ b/contrib/arm-optimized-routines/math/log2_data.c @@ -2,7 +2,7 @@ * Data for log2. * * Copyright (c) 2018, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/contrib/arm-optimized-routines/math/log2f.c b/contrib/arm-optimized-routines/math/log2f.c index acb629e6846c..0a44fa2024f6 100644 --- a/contrib/arm-optimized-routines/math/log2f.c +++ b/contrib/arm-optimized-routines/math/log2f.c @@ -2,7 +2,7 @@ * Single-precision log2 function. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <math.h> diff --git a/contrib/arm-optimized-routines/math/log2f_data.c b/contrib/arm-optimized-routines/math/log2f_data.c index f3546d730aba..4866ef7f8171 100644 --- a/contrib/arm-optimized-routines/math/log2f_data.c +++ b/contrib/arm-optimized-routines/math/log2f_data.c @@ -2,7 +2,7 @@ * Data definition for log2f. * * Copyright (c) 2017-2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/contrib/arm-optimized-routines/math/log_data.c b/contrib/arm-optimized-routines/math/log_data.c index 96a098d42c16..3ecc1f40a822 100644 --- a/contrib/arm-optimized-routines/math/log_data.c +++ b/contrib/arm-optimized-routines/math/log_data.c @@ -2,7 +2,7 @@ * Data for log. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/contrib/arm-optimized-routines/math/logf.c b/contrib/arm-optimized-routines/math/logf.c index cfbaee12df10..820f74c3e66a 100644 --- a/contrib/arm-optimized-routines/math/logf.c +++ b/contrib/arm-optimized-routines/math/logf.c @@ -1,8 +1,8 @@ /* * Single-precision log function. * - * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <math.h> @@ -57,7 +57,7 @@ logf (float x) tmp = ix - OFF; i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; k = (int32_t) tmp >> 23; /* arithmetic shift */ - iz = ix - (tmp & 0x1ff << 23); + iz = ix - (tmp & 0xff800000); invc = T[i].invc; logc = T[i].logc; z = (double_t) asfloat (iz); diff --git a/contrib/arm-optimized-routines/math/logf_data.c b/contrib/arm-optimized-routines/math/logf_data.c index e8973ce4fedc..04247684755f 100644 --- a/contrib/arm-optimized-routines/math/logf_data.c +++ b/contrib/arm-optimized-routines/math/logf_data.c @@ -2,7 +2,7 @@ * Data definition for logf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/contrib/arm-optimized-routines/math/math_config.h b/contrib/arm-optimized-routines/math/math_config.h index e85104337048..faf77b31fc99 100644 --- a/contrib/arm-optimized-routines/math/math_config.h +++ b/contrib/arm-optimized-routines/math/math_config.h @@ -1,8 +1,8 @@ /* * Configuration for math routines. * - * Copyright (c) 2017-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef _MATH_CONFIG_H @@ -92,6 +92,46 @@ # define unlikely(x) (x) #endif +/* Return ptr but hide its value from the compiler so accesses through it + cannot be optimized based on the contents. 
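   The empty asm with a "+r" constraint makes the pointer opaque to the
   optimizer: the pointed-to tables cannot be constant-folded, and every
   field is addressed relative to a single base register instead of being
   materialized as a separate literal constant.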
*/ +#define ptr_barrier(ptr) \ + ({ \ + __typeof (ptr) __ptr = (ptr); \ + __asm("" : "+r"(__ptr)); \ + __ptr; \ + }) + +/* Symbol renames to avoid libc conflicts. */ +#define __math_oflowf arm_math_oflowf +#define __math_uflowf arm_math_uflowf +#define __math_may_uflowf arm_math_may_uflowf +#define __math_divzerof arm_math_divzerof +#define __math_oflow arm_math_oflow +#define __math_uflow arm_math_uflow +#define __math_may_uflow arm_math_may_uflow +#define __math_divzero arm_math_divzero +#define __math_invalidf arm_math_invalidf +#define __math_invalid arm_math_invalid +#define __math_check_oflow arm_math_check_oflow +#define __math_check_uflow arm_math_check_uflow +#define __math_check_oflowf arm_math_check_oflowf +#define __math_check_uflowf arm_math_check_uflowf + +#define __sincosf_table arm_math_sincosf_table +#define __inv_pio4 arm_math_inv_pio4 +#define __exp2f_data arm_math_exp2f_data +#define __logf_data arm_math_logf_data +#define __log2f_data arm_math_log2f_data +#define __powf_log2_data arm_math_powf_log2_data +#define __exp_data arm_math_exp_data +#define __log_data arm_math_log_data +#define __log2_data arm_math_log2_data +#define __pow_log_data arm_math_pow_log_data +#define __erff_data arm_math_erff_data +#define __erf_data arm_math_erf_data +#define __v_exp_data arm_math_v_exp_data +#define __v_log_data arm_math_v_log_data + #if HAVE_FAST_ROUND /* When set, the roundtoint and converttoint functions are provided with the semantics documented below. */ @@ -381,15 +421,22 @@ extern const struct powf_log2_data #define EXP_USE_TOINT_NARROW 0 #define EXP2_POLY_ORDER 5 #define EXP2_POLY_WIDE 0 +/* Wider exp10 polynomial necessary for good precision in non-nearest rounding + and !TOINT_INTRINSICS. */ +#define EXP10_POLY_WIDE 0 extern const struct exp_data { double invln2N; + double invlog10_2N; double shift; double negln2hiN; double negln2loN; + double neglog10_2hiN; + double neglog10_2loN; double poly[4]; /* Last four coefficients. */ double exp2_shift; double exp2_poly[EXP2_POLY_ORDER]; + double exp10_poly[5]; uint64_t tab[2*(1 << EXP_TABLE_BITS)]; } __exp_data HIDDEN; @@ -459,4 +506,16 @@ extern const struct erf_data double erfc_poly_F[ERFC_POLY_F_NCOEFFS]; } __erf_data HIDDEN; +#define V_EXP_TABLE_BITS 7 +extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; + +#define V_LOG_TABLE_BITS 7 +extern const struct v_log_data +{ + struct + { + double invc, logc; + } table[1 << V_LOG_TABLE_BITS]; +} __v_log_data HIDDEN; + #endif diff --git a/contrib/arm-optimized-routines/math/math_err.c b/contrib/arm-optimized-routines/math/math_err.c index 1bf9538a1ab1..cfe072809cf4 100644 --- a/contrib/arm-optimized-routines/math/math_err.c +++ b/contrib/arm-optimized-routines/math/math_err.c @@ -2,7 +2,7 @@ * Double-precision math error handling. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/contrib/arm-optimized-routines/math/math_errf.c b/contrib/arm-optimized-routines/math/math_errf.c index d5350b819ab1..4233918b1eae 100644 --- a/contrib/arm-optimized-routines/math/math_errf.c +++ b/contrib/arm-optimized-routines/math/math_errf.c @@ -2,7 +2,7 @@ * Single-precision math error handling. * * Copyright (c) 2017-2020, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/contrib/arm-optimized-routines/math/pow.c b/contrib/arm-optimized-routines/math/pow.c index 86842c6abacd..af719fe5ab10 100644 --- a/contrib/arm-optimized-routines/math/pow.c +++ b/contrib/arm-optimized-routines/math/pow.c @@ -2,7 +2,7 @@ * Double-precision x^y function. * * Copyright (c) 2018-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <float.h> diff --git a/contrib/arm-optimized-routines/math/pow_log_data.c b/contrib/arm-optimized-routines/math/pow_log_data.c index 45569c5cc064..2a4c250d85c3 100644 --- a/contrib/arm-optimized-routines/math/pow_log_data.c +++ b/contrib/arm-optimized-routines/math/pow_log_data.c @@ -2,7 +2,7 @@ * Data for the log part of pow. * * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/contrib/arm-optimized-routines/math/powf.c b/contrib/arm-optimized-routines/math/powf.c index 6ba45d3852a5..05c80bb2eb67 100644 --- a/contrib/arm-optimized-routines/math/powf.c +++ b/contrib/arm-optimized-routines/math/powf.c @@ -2,7 +2,7 @@ * Single-precision pow function. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <math.h> diff --git a/contrib/arm-optimized-routines/math/powf_log2_data.c b/contrib/arm-optimized-routines/math/powf_log2_data.c index 97e0d98cdbab..243836a549fd 100644 --- a/contrib/arm-optimized-routines/math/powf_log2_data.c +++ b/contrib/arm-optimized-routines/math/powf_log2_data.c @@ -2,7 +2,7 @@ * Data definition for powf. * * Copyright (c) 2017-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "math_config.h" diff --git a/contrib/arm-optimized-routines/math/s_cos.c b/contrib/arm-optimized-routines/math/s_cos.c deleted file mode 100644 index 53a95b0adfde..000000000000 --- a/contrib/arm-optimized-routines/math/s_cos.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_cos.c" diff --git a/contrib/arm-optimized-routines/math/s_cosf.c b/contrib/arm-optimized-routines/math/s_cosf.c deleted file mode 100644 index 914c02eba651..000000000000 --- a/contrib/arm-optimized-routines/math/s_cosf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_cosf.c" diff --git a/contrib/arm-optimized-routines/math/s_exp.c b/contrib/arm-optimized-routines/math/s_exp.c deleted file mode 100644 index ac7246b2c100..000000000000 --- a/contrib/arm-optimized-routines/math/s_exp.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_exp.c" diff --git a/contrib/arm-optimized-routines/math/s_exp2f.c b/contrib/arm-optimized-routines/math/s_exp2f.c deleted file mode 100644 index df7dfd680ff4..000000000000 --- a/contrib/arm-optimized-routines/math/s_exp2f.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_exp2f.c" diff --git a/contrib/arm-optimized-routines/math/s_exp2f_1u.c b/contrib/arm-optimized-routines/math/s_exp2f_1u.c deleted file mode 100644 index 5e3852b41d83..000000000000 --- a/contrib/arm-optimized-routines/math/s_exp2f_1u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_exp2f_1u.c" diff --git a/contrib/arm-optimized-routines/math/s_expf.c b/contrib/arm-optimized-routines/math/s_expf.c deleted file mode 100644 index 3492c460733d..000000000000 --- a/contrib/arm-optimized-routines/math/s_expf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_expf.c" diff --git a/contrib/arm-optimized-routines/math/s_expf_1u.c b/contrib/arm-optimized-routines/math/s_expf_1u.c deleted file mode 100644 index eb7bbcba5566..000000000000 --- a/contrib/arm-optimized-routines/math/s_expf_1u.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_expf_1u.c" diff --git a/contrib/arm-optimized-routines/math/s_log.c b/contrib/arm-optimized-routines/math/s_log.c deleted file mode 100644 index 23289cf948ec..000000000000 --- a/contrib/arm-optimized-routines/math/s_log.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_log.c" diff --git a/contrib/arm-optimized-routines/math/s_logf.c b/contrib/arm-optimized-routines/math/s_logf.c deleted file mode 100644 index 9399350fc1ee..000000000000 --- a/contrib/arm-optimized-routines/math/s_logf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_logf.c" diff --git a/contrib/arm-optimized-routines/math/s_pow.c b/contrib/arm-optimized-routines/math/s_pow.c deleted file mode 100644 index 2e34c9f896d6..000000000000 --- a/contrib/arm-optimized-routines/math/s_pow.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_pow.c" diff --git a/contrib/arm-optimized-routines/math/s_powf.c b/contrib/arm-optimized-routines/math/s_powf.c deleted file mode 100644 index 6d91a4a72b37..000000000000 --- a/contrib/arm-optimized-routines/math/s_powf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_powf.c" diff --git a/contrib/arm-optimized-routines/math/s_sin.c b/contrib/arm-optimized-routines/math/s_sin.c deleted file mode 100644 index 06982c2018c6..000000000000 --- a/contrib/arm-optimized-routines/math/s_sin.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_sin.c" diff --git a/contrib/arm-optimized-routines/math/s_sinf.c b/contrib/arm-optimized-routines/math/s_sinf.c deleted file mode 100644 index 68ca90853736..000000000000 --- a/contrib/arm-optimized-routines/math/s_sinf.c +++ /dev/null @@ -1,6 +0,0 @@ -/* - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT - */ -#define SCALAR 1 -#include "v_sinf.c" diff --git a/contrib/arm-optimized-routines/math/sincosf.c b/contrib/arm-optimized-routines/math/sincosf.c index 9746f1c22e6c..446f21d60faf 100644 --- a/contrib/arm-optimized-routines/math/sincosf.c +++ b/contrib/arm-optimized-routines/math/sincosf.c @@ -1,8 +1,8 @@ /* * Single-precision sin/cos function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -22,7 +22,7 @@ sincosf (float y, float *sinp, float *cosp) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4)) + if (abstop12 (y) < abstop12 (pio4f)) { double x2 = x * x; diff --git a/contrib/arm-optimized-routines/math/sincosf.h b/contrib/arm-optimized-routines/math/sincosf.h index 1e80fc9ba8e1..ec23ed7aeb26 100644 --- a/contrib/arm-optimized-routines/math/sincosf.h +++ b/contrib/arm-optimized-routines/math/sincosf.h @@ -1,8 +1,8 @@ /* * Header for sinf, cosf and sincosf. * - * Copyright (c) 2018, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> @@ -12,7 +12,7 @@ /* 2PI * 2^-64. */ static const double pi63 = 0x1.921FB54442D18p-62; /* PI / 4. */ -static const double pio4 = 0x1.921FB54442D18p-1; +static const float pio4f = 0x1.921FB6p-1f; /* The constants and polynomials for sine and cosine. */ typedef struct diff --git a/contrib/arm-optimized-routines/math/sincosf_data.c b/contrib/arm-optimized-routines/math/sincosf_data.c index ab4ac4710fef..22525290ab08 100644 --- a/contrib/arm-optimized-routines/math/sincosf_data.c +++ b/contrib/arm-optimized-routines/math/sincosf_data.c @@ -2,7 +2,7 @@ * Data definition for sinf, cosf and sincosf. * * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdint.h> diff --git a/contrib/arm-optimized-routines/math/sinf.c b/contrib/arm-optimized-routines/math/sinf.c index ddbc1daf74a9..8dd8ae458794 100644 --- a/contrib/arm-optimized-routines/math/sinf.c +++ b/contrib/arm-optimized-routines/math/sinf.c @@ -1,8 +1,8 @@ /* * Single-precision sin function. * - * Copyright (c) 2018-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2021, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <math.h> @@ -21,7 +21,7 @@ sinf (float y) int n; const sincos_t *p = &__sincosf_table[0]; - if (abstop12 (y) < abstop12 (pio4)) + if (abstop12 (y) < abstop12 (pio4f)) { s = x * x; diff --git a/contrib/arm-optimized-routines/math/test/mathbench.c b/contrib/arm-optimized-routines/math/test/mathbench.c index 0c17826e5296..ed7e89bb7710 100644 --- a/contrib/arm-optimized-routines/math/test/mathbench.c +++ b/contrib/arm-optimized-routines/math/test/mathbench.c @@ -1,8 +1,8 @@ /* * Microbenchmark for math functions. * - * Copyright (c) 2018-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #undef _GNU_SOURCE @@ -15,11 +15,6 @@ #include <math.h> #include "mathlib.h" -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif - /* Number of measurements, best result is reported. */ #define MEASURE 60 /* Array size. 
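   The A/Af arrays below hold the N benchmark inputs, drawn from each
   funtab entry's [lo, hi] interval before the timed runs.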
*/ @@ -34,8 +29,9 @@ static float Af[N]; static long measurecount = MEASURE; static long itercount = ITER; -#if __aarch64__ && WANT_VMATH -typedef __f64x2_t v_double; +#ifdef __vpcs +#include <arm_neon.h> +typedef float64x2_t v_double; #define v_double_len() 2 @@ -51,7 +47,7 @@ v_double_dup (double x) return (v_double){x, x}; } -typedef __f32x4_t v_float; +typedef float32x4_t v_float; #define v_float_len() 4 @@ -76,141 +72,91 @@ typedef float v_float; #define v_float_len(x) 1 #define v_float_load(x) (x)[0] #define v_float_dup(x) (x) -#endif - -static double -dummy (double x) -{ - return x; -} - -static float -dummyf (float x) -{ - return x; -} - -#if WANT_VMATH -#if __aarch64__ -static v_double -__v_dummy (v_double x) -{ - return x; -} -static v_float -__v_dummyf (v_float x) -{ - return x; -} - -#ifdef __vpcs -__vpcs static v_double -__vn_dummy (v_double x) -{ - return x; -} +#endif -__vpcs static v_float -__vn_dummyf (v_float x) -{ - return x; -} +#if WANT_SVE_MATH +#include <arm_sve.h> +typedef svbool_t sv_bool; +typedef svfloat64_t sv_double; -__vpcs static v_float -xy__vn_powf (v_float x) -{ - return __vn_powf (x, x); -} +#define sv_double_len() svcntd() -__vpcs static v_float -xy_Z_powf (v_float x) +static inline sv_double +sv_double_load (const double *p) { - return _ZGVnN4vv_powf (x, x); + svbool_t pg = svptrue_b64(); + return svld1(pg, p); } -__vpcs static v_double -xy__vn_pow (v_double x) +static inline sv_double +sv_double_dup (double x) { - return __vn_pow (x, x); + return svdup_n_f64(x); } -__vpcs static v_double -xy_Z_pow (v_double x) -{ - return _ZGVnN2vv_pow (x, x); -} -#endif +typedef svfloat32_t sv_float; -static v_float -xy__v_powf (v_float x) -{ - return __v_powf (x, x); -} +#define sv_float_len() svcntw() -static v_double -xy__v_pow (v_double x) +static inline sv_float +sv_float_load (const float *p) { - return __v_pow (x, x); + svbool_t pg = svptrue_b32(); + return svld1(pg, p); } -#endif -static float -xy__s_powf (float x) +static inline sv_float +sv_float_dup (float x) { - return __s_powf (x, x); -} - -static double -xy__s_pow (double x) -{ - return __s_pow (x, x); + return svdup_n_f32(x); } +#else +/* dummy definitions to make things compile. 
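   Only the length macros need stubbing: bench1 computes vlen via
   sv_double_len/sv_float_len unconditionally, while the SVE TIMEIT cases
   and the sv_* types themselves are guarded by WANT_SVE_MATH and
   compiled out in this configuration.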
*/ +#define sv_double_len(x) 1 +#define sv_float_len(x) 1 #endif static double -xypow (double x) +dummy (double x) { - return pow (x, x); + return x; } static float -xypowf (float x) +dummyf (float x) { - return powf (x, x); + return x; } - -static double -xpow (double x) +#ifdef __vpcs +__vpcs static v_double +__vn_dummy (v_double x) { - return pow (x, 23.4); + return x; } -static float -xpowf (float x) +__vpcs static v_float +__vn_dummyf (v_float x) { - return powf (x, 23.4f); + return x; } - -static double -ypow (double x) +#endif +#if WANT_SVE_MATH +static sv_double +__sv_dummy (sv_double x, sv_bool pg) { - return pow (2.34, x); + return x; } -static float -ypowf (float x) +static sv_float +__sv_dummyf (sv_float x, sv_bool pg) { - return powf (2.34f, x); + return x; } -static float -sincosf_wrap (float x) -{ - float s, c; - sincosf (x, &s, &c); - return s + c; -} +#endif + +#include "test/mathbench_wrappers.h" static const struct fun { @@ -223,127 +169,40 @@ static const struct fun { double (*d) (double); float (*f) (float); - v_double (*vd) (v_double); - v_float (*vf) (v_float); #ifdef __vpcs __vpcs v_double (*vnd) (v_double); __vpcs v_float (*vnf) (v_float); #endif +#if WANT_SVE_MATH + sv_double (*svd) (sv_double, sv_bool); + sv_float (*svf) (sv_float, sv_bool); +#endif } fun; } funtab[] = { #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, -#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}}, -#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, +#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}}, +#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, D (dummy, 1.0, 2.0) -D (exp, -9.9, 9.9) -D (exp, 0.5, 1.0) -D (exp2, -9.9, 9.9) -D (log, 0.01, 11.1) -D (log, 0.999, 1.001) -D (log2, 0.01, 11.1) -D (log2, 0.999, 1.001) -{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, -D (xpow, 0.01, 11.1) -D (ypow, -9.9, 9.9) -D (erf, -6.0, 6.0) - F (dummyf, 1.0, 2.0) -F (expf, -9.9, 9.9) -F (exp2f, -9.9, 9.9) -F (logf, 0.01, 11.1) -F (log2f, 0.01, 11.1) -{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, -F (xpowf, 0.01, 11.1) -F (ypowf, -9.9, 9.9) -{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, -F (sinf, 0.1, 0.7) -F (sinf, 0.8, 3.1) -F (sinf, -3.1, 3.1) -F (sinf, 3.3, 33.3) -F (sinf, 100, 1000) -F (sinf, 1e6, 1e32) -F (cosf, 0.1, 0.7) -F (cosf, 0.8, 3.1) -F (cosf, -3.1, 3.1) -F (cosf, 3.3, 33.3) -F (cosf, 100, 1000) -F (cosf, 1e6, 1e32) -F (erff, -4.0, 4.0) -#if WANT_VMATH -D (__s_sin, -3.1, 3.1) -D (__s_cos, -3.1, 3.1) -D (__s_exp, -9.9, 9.9) -D (__s_log, 0.01, 11.1) -{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, -F (__s_expf, -9.9, 9.9) -F (__s_expf_1u, -9.9, 9.9) -F (__s_exp2f, -9.9, 9.9) -F (__s_exp2f_1u, -9.9, 9.9) -F (__s_logf, 0.01, 11.1) -{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, -F (__s_sinf, -3.1, 3.1) -F (__s_cosf, -3.1, 3.1) -#if __aarch64__ -VD (__v_dummy, 1.0, 2.0) -VD (__v_sin, -3.1, 3.1) -VD (__v_cos, -3.1, 3.1) -VD (__v_exp, -9.9, 9.9) -VD (__v_log, 0.01, 11.1) -{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, -VF (__v_dummyf, 1.0, 2.0) -VF 
(__v_expf, -9.9, 9.9) -VF (__v_expf_1u, -9.9, 9.9) -VF (__v_exp2f, -9.9, 9.9) -VF (__v_exp2f_1u, -9.9, 9.9) -VF (__v_logf, 0.01, 11.1) -{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, -VF (__v_sinf, -3.1, 3.1) -VF (__v_cosf, -3.1, 3.1) #ifdef __vpcs VND (__vn_dummy, 1.0, 2.0) -VND (__vn_exp, -9.9, 9.9) -VND (_ZGVnN2v_exp, -9.9, 9.9) -VND (__vn_log, 0.01, 11.1) -VND (_ZGVnN2v_log, 0.01, 11.1) -{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, -{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, -VND (__vn_sin, -3.1, 3.1) -VND (_ZGVnN2v_sin, -3.1, 3.1) -VND (__vn_cos, -3.1, 3.1) -VND (_ZGVnN2v_cos, -3.1, 3.1) VNF (__vn_dummyf, 1.0, 2.0) -VNF (__vn_expf, -9.9, 9.9) -VNF (_ZGVnN4v_expf, -9.9, 9.9) -VNF (__vn_expf_1u, -9.9, 9.9) -VNF (__vn_exp2f, -9.9, 9.9) -VNF (_ZGVnN4v_exp2f, -9.9, 9.9) -VNF (__vn_exp2f_1u, -9.9, 9.9) -VNF (__vn_logf, 0.01, 11.1) -VNF (_ZGVnN4v_logf, 0.01, 11.1) -{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, -{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, -VNF (__vn_sinf, -3.1, 3.1) -VNF (_ZGVnN4v_sinf, -3.1, 3.1) -VNF (__vn_cosf, -3.1, 3.1) -VNF (_ZGVnN4v_cosf, -3.1, 3.1) -#endif #endif +#if WANT_SVE_MATH +SVD (__sv_dummy, 1.0, 2.0) +SVF (__sv_dummyf, 1.0, 2.0) #endif +#include "test/mathbench_funcs.h" {0}, #undef F #undef D -#undef VF -#undef VD #undef VNF #undef VND +#undef SVF +#undef SVD }; static void @@ -442,69 +301,75 @@ runf_latency (float f (float)) prev = f (Af[i] + prev * z); } +#ifdef __vpcs static void -run_v_thruput (v_double f (v_double)) +run_vn_thruput (__vpcs v_double f (v_double)) { for (int i = 0; i < N; i += v_double_len ()) f (v_double_load (A+i)); } static void -runf_v_thruput (v_float f (v_float)) +runf_vn_thruput (__vpcs v_float f (v_float)) { for (int i = 0; i < N; i += v_float_len ()) f (v_float_load (Af+i)); } static void -run_v_latency (v_double f (v_double)) +run_vn_latency (__vpcs v_double f (v_double)) { - v_double z = v_double_dup (zero); - v_double prev = z; + volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 }; + uint64x2_t sel = vsel; + v_double prev = v_double_dup (0); for (int i = 0; i < N; i += v_double_len ()) - prev = f (v_double_load (A+i) + prev * z); + prev = f (vbslq_f64 (sel, prev, v_double_load (A+i))); } static void -runf_v_latency (v_float f (v_float)) +runf_vn_latency (__vpcs v_float f (v_float)) { - v_float z = v_float_dup (zero); - v_float prev = z; + volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 }; + uint32x4_t sel = vsel; + v_float prev = v_float_dup (0); for (int i = 0; i < N; i += v_float_len ()) - prev = f (v_float_load (Af+i) + prev * z); + prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i))); } +#endif -#ifdef __vpcs +#if WANT_SVE_MATH static void -run_vn_thruput (__vpcs v_double f (v_double)) +run_sv_thruput (sv_double f (sv_double, sv_bool)) { - for (int i = 0; i < N; i += v_double_len ()) - f (v_double_load (A+i)); + for (int i = 0; i < N; i += sv_double_len ()) + f (sv_double_load (A+i), svptrue_b64 ()); } static void -runf_vn_thruput (__vpcs v_float f (v_float)) +runf_sv_thruput (sv_float f (sv_float, sv_bool)) { - for (int i = 0; i < N; i += v_float_len ()) - f (v_float_load (Af+i)); + for (int i = 0; i < N; i += sv_float_len ()) + f (sv_float_load (Af+i), svptrue_b32 ()); } static void -run_vn_latency (__vpcs v_double f (v_double)) +run_sv_latency (sv_double f (sv_double, sv_bool)) { - v_double z = v_double_dup (zero); - v_double prev = z; - for (int i = 0; i < N; i += v_double_len ()) - prev = f (v_double_load (A+i) + prev * z); + volatile sv_bool vsel 
= svptrue_b64 (); + sv_bool sel = vsel; + sv_double prev = sv_double_dup (0); + for (int i = 0; i < N; i += sv_double_len ()) + prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ()); } static void -runf_vn_latency (__vpcs v_float f (v_float)) +runf_sv_latency (sv_float f (sv_float, sv_bool)) { - v_float z = v_float_dup (zero); - v_float prev = z; - for (int i = 0; i < N; i += v_float_len ()) - prev = f (v_float_load (Af+i) + prev * z); + volatile sv_bool vsel = svptrue_b32 (); + sv_bool sel = vsel; + sv_float prev = sv_float_dup (0); + for (int i = 0; i < N; i += sv_float_len ()) + prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ()); } #endif @@ -539,10 +404,10 @@ bench1 (const struct fun *f, int type, double lo, double hi) const char *s = type == 't' ? "rthruput" : "latency"; int vlen = 1; - if (f->vec && f->prec == 'd') - vlen = v_double_len(); - else if (f->vec && f->prec == 'f') - vlen = v_float_len(); + if (f->vec == 'n') + vlen = f->prec == 'd' ? v_double_len() : v_float_len(); + else if (f->vec == 's') + vlen = f->prec == 'd' ? sv_double_len() : sv_float_len(); if (f->prec == 'd' && type == 't' && f->vec == 0) TIMEIT (run_thruput, f->fun.d); @@ -552,14 +417,6 @@ bench1 (const struct fun *f, int type, double lo, double hi) TIMEIT (runf_thruput, f->fun.f); else if (f->prec == 'f' && type == 'l' && f->vec == 0) TIMEIT (runf_latency, f->fun.f); - else if (f->prec == 'd' && type == 't' && f->vec == 'v') - TIMEIT (run_v_thruput, f->fun.vd); - else if (f->prec == 'd' && type == 'l' && f->vec == 'v') - TIMEIT (run_v_latency, f->fun.vd); - else if (f->prec == 'f' && type == 't' && f->vec == 'v') - TIMEIT (runf_v_thruput, f->fun.vf); - else if (f->prec == 'f' && type == 'l' && f->vec == 'v') - TIMEIT (runf_v_latency, f->fun.vf); #ifdef __vpcs else if (f->prec == 'd' && type == 't' && f->vec == 'n') TIMEIT (run_vn_thruput, f->fun.vnd); @@ -570,20 +427,32 @@ bench1 (const struct fun *f, int type, double lo, double hi) else if (f->prec == 'f' && type == 'l' && f->vec == 'n') TIMEIT (runf_vn_latency, f->fun.vnf); #endif +#if WANT_SVE_MATH + else if (f->prec == 'd' && type == 't' && f->vec == 's') + TIMEIT (run_sv_thruput, f->fun.svd); + else if (f->prec == 'd' && type == 'l' && f->vec == 's') + TIMEIT (run_sv_latency, f->fun.svd); + else if (f->prec == 'f' && type == 't' && f->vec == 's') + TIMEIT (runf_sv_thruput, f->fun.svf); + else if (f->prec == 'f' && type == 'l' && f->vec == 's') + TIMEIT (runf_sv_latency, f->fun.svf); +#endif if (type == 't') { ns100 = (100 * dt + itercount * N / 2) / (itercount * N); - printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, + printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n", + f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi); + (unsigned long long) dt, lo, hi, vlen); } else if (type == 'l') { ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); - printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s, + printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n", + f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi); + (unsigned long long) dt, lo, hi, vlen); } fflush (stdout); } diff --git a/contrib/arm-optimized-routines/math/test/mathbench_funcs.h b/contrib/arm-optimized-routines/math/test/mathbench_funcs.h new file mode 100644 index 000000000000..84c4e68650ac --- /dev/null +++ b/contrib/arm-optimized-routines/math/test/mathbench_funcs.h @@ -0,0 
+1,62 @@ +/* + * Function entries for mathbench. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +/* clang-format off */ +D (exp, -9.9, 9.9) +D (exp, 0.5, 1.0) +D (exp10, -9.9, 9.9) +D (exp2, -9.9, 9.9) +D (log, 0.01, 11.1) +D (log, 0.999, 1.001) +D (log2, 0.01, 11.1) +D (log2, 0.999, 1.001) +{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, +D (xpow, 0.01, 11.1) +D (ypow, -9.9, 9.9) +D (erf, -6.0, 6.0) + +F (expf, -9.9, 9.9) +F (exp2f, -9.9, 9.9) +F (logf, 0.01, 11.1) +F (log2f, 0.01, 11.1) +{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, +F (xpowf, 0.01, 11.1) +F (ypowf, -9.9, 9.9) +{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, +{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, +F (sinf, 0.1, 0.7) +F (sinf, 0.8, 3.1) +F (sinf, -3.1, 3.1) +F (sinf, 3.3, 33.3) +F (sinf, 100, 1000) +F (sinf, 1e6, 1e32) +F (cosf, 0.1, 0.7) +F (cosf, 0.8, 3.1) +F (cosf, -3.1, 3.1) +F (cosf, 3.3, 33.3) +F (cosf, 100, 1000) +F (cosf, 1e6, 1e32) +F (erff, -4.0, 4.0) +#ifdef __vpcs +VND (_ZGVnN2v_exp, -9.9, 9.9) +VND (_ZGVnN2v_log, 0.01, 11.1) +{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, +VND (_ZGVnN2v_sin, -3.1, 3.1) +VND (_ZGVnN2v_cos, -3.1, 3.1) +VNF (_ZGVnN4v_expf, -9.9, 9.9) +VNF (_ZGVnN4v_expf_1u, -9.9, 9.9) +VNF (_ZGVnN4v_exp2f, -9.9, 9.9) +VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9) +VNF (_ZGVnN4v_logf, 0.01, 11.1) +{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, +VNF (_ZGVnN4v_sinf, -3.1, 3.1) +VNF (_ZGVnN4v_cosf, -3.1, 3.1) +#endif + /* clang-format on */ diff --git a/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h b/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h new file mode 100644 index 000000000000..062b9db56de5 --- /dev/null +++ b/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h @@ -0,0 +1,66 @@ +/* + * Function wrappers for mathbench. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifdef __vpcs + +__vpcs static v_float +xy_Z_powf (v_float x) +{ + return _ZGVnN4vv_powf (x, x); +} + +__vpcs static v_double +xy_Z_pow (v_double x) +{ + return _ZGVnN2vv_pow (x, x); +} + +#endif + +static double +xypow (double x) +{ + return pow (x, x); +} + +static float +xypowf (float x) +{ + return powf (x, x); +} + +static double +xpow (double x) +{ + return pow (x, 23.4); +} + +static float +xpowf (float x) +{ + return powf (x, 23.4f); +} + +static double +ypow (double x) +{ + return pow (2.34, x); +} + +static float +ypowf (float x) +{ + return powf (2.34f, x); +} + +static float +sincosf_wrap (float x) +{ + float s, c; + sincosf (x, &s, &c); + return s + c; +} diff --git a/contrib/arm-optimized-routines/math/test/mathtest.c b/contrib/arm-optimized-routines/math/test/mathtest.c index 310896738e47..834233fdde9d 100644 --- a/contrib/arm-optimized-routines/math/test/mathtest.c +++ b/contrib/arm-optimized-routines/math/test/mathtest.c @@ -1,8 +1,8 @@ /* * mathtest.c - test rig for mathlib * - * Copyright (c) 1998-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 1998-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <assert.h> @@ -196,9 +196,11 @@ int is_complex_rettype(int rettype) { #define TFUNCARM(arg,ret,name,tolerance) { t_func, arg, ret, (void*)& ARM_PREFIX(name), m_none, tolerance, #name } #define MFUNC(arg,ret,name,tolerance) { t_macro, arg, ret, NULL, m_##name, tolerance, #name } +#ifndef PL /* sincosf wrappers for easier testing. */ static float sincosf_sinf(float x) { float s,c; sincosf(x, &s, &c); return s; } static float sincosf_cosf(float x) { float s,c; sincosf(x, &s, &c); return c; } +#endif test_func tfuncs[] = { /* trigonometric */ @@ -218,9 +220,10 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, tanf, 4*ULPUNIT), TFUNCARM(at_s,rt_s, sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, cosf, 3*ULPUNIT/4), +#ifndef PL TFUNCARM(at_s,rt_s, sincosf_sinf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, sincosf_cosf, 3*ULPUNIT/4), - +#endif /* hyperbolic */ TFUNC(at_d, rt_d, atanh, 4*ULPUNIT), TFUNC(at_d, rt_d, asinh, 4*ULPUNIT), @@ -251,6 +254,7 @@ test_func tfuncs[] = { TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4), TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4), TFUNC(at_s,rt_s, expm1f, ULPUNIT), + TFUNC(at_d,rt_d, exp10, ULPUNIT), /* power */ TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4), @@ -1018,6 +1022,7 @@ int runtest(testdetail t) { DO_DOP(d_arg1,op1r); DO_DOP(d_arg2,op2r); s_arg1.i = t.op1r[0]; s_arg2.i = t.op2r[0]; + s_res.i = 0; /* * Detect NaNs, infinities and denormals on input, and set a @@ -1152,22 +1157,25 @@ int runtest(testdetail t) { tresultr[0] = t.resultr[0]; tresultr[1] = t.resultr[1]; resultr[0] = d_res.i[dmsd]; resultr[1] = d_res.i[dlsd]; + resulti[0] = resulti[1] = 0; wres = 2; break; case rt_i: tresultr[0] = t.resultr[0]; resultr[0] = intres; + resulti[0] = 0; wres = 1; break; case rt_s: case rt_s2: tresultr[0] = t.resultr[0]; resultr[0] = s_res.i; + resulti[0] = 0; wres = 1; break; default: puts("unhandled rettype in runtest"); - wres = 0; + abort (); } if(t.resultc != rc_none) { int err = 0; diff --git a/contrib/arm-optimized-routines/math/test/rtest/dotest.c b/contrib/arm-optimized-routines/math/test/rtest/dotest.c index 6be79e1df0d1..5b3e9b4f18e4 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/dotest.c +++ b/contrib/arm-optimized-routines/math/test/rtest/dotest.c @@ -2,7 +2,7 @@ * dotest.c - actually generate mathlib test cases * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdio.h> diff --git a/contrib/arm-optimized-routines/math/test/rtest/intern.h b/contrib/arm-optimized-routines/math/test/rtest/intern.h index 12a9c749e18e..3ebd7ddaf85d 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/intern.h +++ b/contrib/arm-optimized-routines/math/test/rtest/intern.h @@ -2,7 +2,7 @@ * intern.h * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef mathtest_intern_h diff --git a/contrib/arm-optimized-routines/math/test/rtest/main.c b/contrib/arm-optimized-routines/math/test/rtest/main.c index 0d8ead891320..3d533c946f79 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/main.c +++ b/contrib/arm-optimized-routines/math/test/rtest/main.c @@ -2,7 +2,7 @@ * main.c * * Copyright (c) 1999-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <assert.h> diff --git a/contrib/arm-optimized-routines/math/test/rtest/random.c b/contrib/arm-optimized-routines/math/test/rtest/random.c index 56123966b8c4..1de32580b733 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/random.c +++ b/contrib/arm-optimized-routines/math/test/rtest/random.c @@ -2,7 +2,7 @@ * random.c - random number generator for producing mathlib test cases * * Copyright (c) 1998-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "types.h" diff --git a/contrib/arm-optimized-routines/math/test/rtest/random.h b/contrib/arm-optimized-routines/math/test/rtest/random.h index b4b22df82a3d..0b477d72b234 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/random.h +++ b/contrib/arm-optimized-routines/math/test/rtest/random.h @@ -2,7 +2,7 @@ * random.h - header for random.c * * Copyright (c) 2009-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include "types.h" diff --git a/contrib/arm-optimized-routines/math/test/rtest/semi.c b/contrib/arm-optimized-routines/math/test/rtest/semi.c index c9f0daf76508..70a7844a48d6 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/semi.c +++ b/contrib/arm-optimized-routines/math/test/rtest/semi.c @@ -2,7 +2,7 @@ * semi.c: test implementations of mathlib seminumerical functions * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <stdio.h> diff --git a/contrib/arm-optimized-routines/math/test/rtest/semi.h b/contrib/arm-optimized-routines/math/test/rtest/semi.h index 17dc4158fb51..7a1444e55d28 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/semi.h +++ b/contrib/arm-optimized-routines/math/test/rtest/semi.h @@ -2,7 +2,7 @@ * semi.h: header for semi.c * * Copyright (c) 1999-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef test_semi_h diff --git a/contrib/arm-optimized-routines/math/test/rtest/types.h b/contrib/arm-optimized-routines/math/test/rtest/types.h index 53cd557fa4cf..e15b4e06a0d4 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/types.h +++ b/contrib/arm-optimized-routines/math/test/rtest/types.h @@ -2,7 +2,7 @@ * types.h * * Copyright (c) 2005-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #ifndef mathtest_types_h diff --git a/contrib/arm-optimized-routines/math/test/rtest/wrappers.c b/contrib/arm-optimized-routines/math/test/rtest/wrappers.c index de45ac5768d0..441017192ab4 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/wrappers.c +++ b/contrib/arm-optimized-routines/math/test/rtest/wrappers.c @@ -2,7 +2,7 @@ * wrappers.c - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. 
- * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ #include <assert.h> diff --git a/contrib/arm-optimized-routines/math/test/rtest/wrappers.h b/contrib/arm-optimized-routines/math/test/rtest/wrappers.h index 7b09c85a59f1..0a8a58777d8a 100644 --- a/contrib/arm-optimized-routines/math/test/rtest/wrappers.h +++ b/contrib/arm-optimized-routines/math/test/rtest/wrappers.h @@ -2,7 +2,7 @@ * wrappers.h - wrappers to modify output of MPFR/MPC test functions * * Copyright (c) 2014-2019, Arm Limited. - * SPDX-License-Identifier: MIT + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ typedef struct { diff --git a/contrib/arm-optimized-routines/math/test/runulp.sh b/contrib/arm-optimized-routines/math/test/runulp.sh index 0190d9ab27fb..e2e03e3ae761 100755 --- a/contrib/arm-optimized-routines/math/test/runulp.sh +++ b/contrib/arm-optimized-routines/math/test/runulp.sh @@ -2,8 +2,8 @@ # ULP error check script. # -# Copyright (c) 2019-2020, Arm Limited. -# SPDX-License-Identifier: MIT +# Copyright (c) 2019-2023, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception #set -x set -eu @@ -72,6 +72,16 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000 t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000 t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000 +L=0.02 +t exp10 0 0x1p-47 5000 +t exp10 -0 -0x1p-47 5000 +t exp10 0x1p-47 1 50000 +t exp10 -0x1p-47 -1 50000 +t exp10 1 0x1.34413509f79ffp8 50000 +t exp10 -1 -0x1.434e6420f4374p8 50000 +t exp10 0x1.34413509f79ffp8 inf 5000 +t exp10 -0x1.434e6420f4374p8 -inf 5000 + L=1.0 Ldir=0.9 t erf 0 0xffff000000000000 10000 @@ -143,15 +153,10 @@ Ldir=0.5 done # vector functions + Ldir=0.5 r='n' -flags="${ULPFLAGS:--q} -f" -runs= -check __s_exp 1 && runs=1 -runv= -check __v_exp 1 && runv=1 -runvn= -check __vn_exp 1 && runvn=1 +flags="${ULPFLAGS:--q}" range_exp=' 0 0xffff000000000000 10000 @@ -177,9 +182,10 @@ range_pow=' ' range_sin=' - 0 0xffff000000000000 10000 - 0x1p-4 0x1p4 400000 - -0x1p-23 0x1p23 400000 + 0 0x1p23 500000 + -0 -0x1p23 500000 + 0x1p23 inf 10000 + -0x1p23 -inf 10000 ' range_cos="$range_sin" @@ -199,9 +205,10 @@ range_logf=' ' range_sinf=' - 0 0xffff0000 10000 - 0x1p-4 0x1p4 300000 --0x1p-9 -0x1p9 300000 + 0 0x1p20 500000 + -0 -0x1p20 500000 + 0x1p20 inf 10000 + -0x1p20 -inf 10000 ' range_cosf="$range_sinf" @@ -229,9 +236,8 @@ L_sinf=1.4 L_cosf=1.4 L_powf=2.1 -while read G F R +while read G F D do - [ "$R" = 1 ] || continue case "$G" in \#*) continue ;; esac eval range="\${range_$G}" eval L="\${L_$G}" @@ -239,74 +245,35 @@ do do [ -n "$X" ] || continue case "$X" in \#*) continue ;; esac - t $F $X + disable_fenv="" + if [ -z "$WANT_SIMD_EXCEPT" ] || [ $WANT_SIMD_EXCEPT -eq 0 ]; then + # If library was built with SIMD exceptions + # disabled, disable fenv checking in ulp + # tool. Otherwise, fenv checking may still be + # disabled by adding -f to the end of the run + # line. 
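+			# (So with WANT_SIMD_EXCEPT unset or 0, every run in
+			# the table below gets -f.)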
+ disable_fenv="-f" + fi + t $D $disable_fenv $F $X done << EOF $range + EOF done << EOF # group symbol run -exp __s_exp $runs -exp __v_exp $runv -exp __vn_exp $runvn -exp _ZGVnN2v_exp $runvn - -log __s_log $runs -log __v_log $runv -log __vn_log $runvn -log _ZGVnN2v_log $runvn - -pow __s_pow $runs -pow __v_pow $runv -pow __vn_pow $runvn -pow _ZGVnN2vv_pow $runvn - -sin __s_sin $runs -sin __v_sin $runv -sin __vn_sin $runvn -sin _ZGVnN2v_sin $runvn - -cos __s_cos $runs -cos __v_cos $runv -cos __vn_cos $runvn -cos _ZGVnN2v_cos $runvn - -expf __s_expf $runs -expf __v_expf $runv -expf __vn_expf $runvn -expf _ZGVnN4v_expf $runvn - -expf_1u __s_expf_1u $runs -expf_1u __v_expf_1u $runv -expf_1u __vn_expf_1u $runvn - -exp2f __s_exp2f $runs -exp2f __v_exp2f $runv -exp2f __vn_exp2f $runvn -exp2f _ZGVnN4v_exp2f $runvn - -exp2f_1u __s_exp2f_1u $runs -exp2f_1u __v_exp2f_1u $runv -exp2f_1u __vn_exp2f_1u $runvn - -logf __s_logf $runs -logf __v_logf $runv -logf __vn_logf $runvn -logf _ZGVnN4v_logf $runvn - -sinf __s_sinf $runs -sinf __v_sinf $runv -sinf __vn_sinf $runvn -sinf _ZGVnN4v_sinf $runvn - -cosf __s_cosf $runs -cosf __v_cosf $runv -cosf __vn_cosf $runvn -cosf _ZGVnN4v_cosf $runvn - -powf __s_powf $runs -powf __v_powf $runv -powf __vn_powf $runvn -powf _ZGVnN4vv_powf $runvn +exp _ZGVnN2v_exp +log _ZGVnN2v_log +pow _ZGVnN2vv_pow -f +sin _ZGVnN2v_sin -z +cos _ZGVnN2v_cos +expf _ZGVnN4v_expf +expf_1u _ZGVnN4v_expf_1u -f +exp2f _ZGVnN4v_exp2f +exp2f_1u _ZGVnN4v_exp2f_1u -f +logf _ZGVnN4v_logf +sinf _ZGVnN4v_sinf -z +cosf _ZGVnN4v_cosf +powf _ZGVnN4vv_powf -f EOF [ 0 -eq $FAIL ] || { diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/cosf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/cosf.tst index 79160443f099..7ea0d45795a3 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/cosf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/cosf.tst @@ -1,7 +1,7 @@ ; cosf.tst - Directed test cases for SP cosine ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=cosf op1=7fc00001 result=7fc00001 errno=0 func=cosf op1=ffc00001 result=7fc00001 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/erf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/erf.tst index 7fa4d1868c0e..12384cef0dd9 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/erf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/erf.tst @@ -1,7 +1,7 @@ ; erf.tst - Directed test cases for erf ; ; Copyright (c) 2007-2020, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst index d05b7b1119c4..28f8fa37f5aa 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst @@ -1,7 +1,7 @@ ; erff.tst ; ; Copyright (c) 2007-2020, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=erff op1=7fc00001 result=7fc00001 errno=0 func=erff op1=ffc00001 result=7fc00001 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/exp.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/exp.tst index 85d556cd1e00..0bb2ef4579cc 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/exp.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/exp.tst @@ -1,7 +1,7 @@ ; Directed test cases for exp ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/exp10.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/exp10.tst new file mode 100644 index 000000000000..2cf4273bd1d7 --- /dev/null +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/exp10.tst @@ -0,0 +1,15 @@ +; Directed test cases for exp10 +; +; Copyright (c) 2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=exp10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=exp10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=exp10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=exp10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=exp10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=exp10 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=exp10 op1=fff00000.00000000 result=00000000.00000000 errno=0 +func=exp10 op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux +func=exp10 op1=00000000.00000000 result=3ff00000.00000000 errno=0 +func=exp10 op1=80000000.00000000 result=3ff00000.00000000 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/exp2.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/exp2.tst index fa56c9f8be4b..7069f9010c8c 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/exp2.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/exp2.tst @@ -1,7 +1,7 @@ ; Directed test cases for exp2 ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=exp2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/exp2f.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/exp2f.tst index 38cfc3f78ac6..6ca2eeab4e12 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/exp2f.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/exp2f.tst @@ -1,7 +1,7 @@ ; exp2f.tst - Directed test cases for exp2f ; ; Copyright (c) 2017-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=exp2f op1=7fc00001 result=7fc00001 errno=0 func=exp2f op1=ffc00001 result=7fc00001 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/expf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/expf.tst index ff0f671c2656..89ae8fe78e6c 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/expf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/expf.tst @@ -1,7 +1,7 @@ ; expf.tst - Directed test cases for expf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=expf op1=7fc00001 result=7fc00001 errno=0 func=expf op1=ffc00001 result=7fc00001 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/log.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/log.tst index a0aa398cbf73..686ea835645b 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/log.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/log.tst @@ -1,7 +1,7 @@ ; Directed test cases for log ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst index ff1286cbd53e..361bddec374b 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst @@ -1,7 +1,7 @@ ; Directed test cases for log2 ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst index 5832c4f08f1e..5fce051cddba 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst @@ -1,7 +1,7 @@ ; log2f.tst - Directed test cases for log2f ; ; Copyright (c) 2017-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=log2f op1=7fc00001 result=7fc00001 errno=0 func=log2f op1=ffc00001 result=7fc00001 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/logf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/logf.tst index 6e68a36e0f6a..a6d1b9d5c51f 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/logf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/logf.tst @@ -1,7 +1,7 @@ ; logf.tst - Directed test cases for logf ; ; Copyright (c) 2007-2019, Arm Limited. 
-; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=logf op1=7fc00001 result=7fc00001 errno=0 func=logf op1=ffc00001 result=7fc00001 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/pow.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/pow.tst index 19665817153d..879d12864afe 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/pow.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/pow.tst @@ -1,7 +1,7 @@ ; Directed test cases for pow ; ; Copyright (c) 2018-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0 func=pow op1=00000000.00000000 op2=00000000.00000001 result=00000000.00000000 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/powf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/powf.tst index 3fa8b110f8bc..46d522400871 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/powf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/powf.tst @@ -1,7 +1,7 @@ ; powf.tst - Directed test cases for powf ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i func=powf op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/sincosf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/sincosf.tst index 4b33d2291c66..cddb346558ea 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/sincosf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/sincosf.tst @@ -1,7 +1,7 @@ ; Directed test cases for SP sincos ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sincosf_sinf op1=7fc00001 result=7fc00001 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/sinf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/sinf.tst index ded80b1598c6..041b13d5d6cb 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/directed/sinf.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/directed/sinf.tst @@ -1,7 +1,7 @@ ; sinf.tst - Directed test cases for SP sine ; ; Copyright (c) 2007-2019, Arm Limited. -; SPDX-License-Identifier: MIT +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception func=sinf op1=7fc00001 result=7fc00001 errno=0 diff --git a/contrib/arm-optimized-routines/math/test/testcases/random/double.tst b/contrib/arm-optimized-routines/math/test/testcases/random/double.tst index c24ff80d5d95..8e885d61722a 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/random/double.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/random/double.tst @@ -1,7 +1,7 @@ !! double.tst - Random test case specification for DP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! SPDX-License-Identifier: MIT +!! 
SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception test exp 10000 test exp2 10000 diff --git a/contrib/arm-optimized-routines/math/test/testcases/random/float.tst b/contrib/arm-optimized-routines/math/test/testcases/random/float.tst index d02a22750abe..ea4a5a015214 100644 --- a/contrib/arm-optimized-routines/math/test/testcases/random/float.tst +++ b/contrib/arm-optimized-routines/math/test/testcases/random/float.tst @@ -1,7 +1,7 @@ !! single.tst - Random test case specification for SP functions !! !! Copyright (c) 1999-2019, Arm Limited. -!! SPDX-License-Identifier: MIT +!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception test sinf 10000 test cosf 10000 diff --git a/contrib/arm-optimized-routines/math/test/ulp.c b/contrib/arm-optimized-routines/math/test/ulp.c index 51479b87a0fd..5ff29972e50e 100644 --- a/contrib/arm-optimized-routines/math/test/ulp.c +++ b/contrib/arm-optimized-routines/math/test/ulp.c @@ -1,10 +1,11 @@ /* * ULP error checking tool for math functions. * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#define _GNU_SOURCE #include <ctype.h> #include <fenv.h> #include <float.h> @@ -23,11 +24,6 @@ # include <mpfr.h> #endif -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif - static inline uint64_t asuint64 (double f) { @@ -212,73 +208,61 @@ struct conf unsigned long long n; double softlim; double errlim; + int ignore_zero_sign; }; -/* Wrappers for sincos. */ -static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} -static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} -static double sincos_sin(double x) {(void)cos(x); return sin(x);} -static double sincos_cos(double x) {(void)sin(x); return cos(x);} -#if USE_MPFR -static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } -static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } -#endif - /* A bit of a hack: call vector functions twice with the same input in lane 0 but a different value in other lanes: once with an in-range value and then with a special case value. */ static int secondcall; /* Wrappers for vector functions. */ -#if __aarch64__ && WANT_VMATH +#ifdef __vpcs typedef __f32x4_t v_float; typedef __f64x2_t v_double; -static const float fv[2] = {1.0f, -INFINITY}; -static const double dv[2] = {1.0, -INFINITY}; +/* First element of fv and dv may be changed by -c argument. 
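+   On the first call the last lane holds the in-range fv[0]/dv[0]; on the
+   second call it holds the special fv[1]/dv[1], i.e. -INFINITY.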
*/ +static float fv[2] = {1.0f, -INFINITY}; +static double dv[2] = {1.0, -INFINITY}; static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; } static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; } - -static float v_sinf(float x) { return __v_sinf(argf(x))[0]; } -static float v_cosf(float x) { return __v_cosf(argf(x))[0]; } -static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; } -static float v_expf(float x) { return __v_expf(argf(x))[0]; } -static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; } -static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; } -static float v_logf(float x) { return __v_logf(argf(x))[0]; } -static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; } -static double v_sin(double x) { return __v_sin(argd(x))[0]; } -static double v_cos(double x) { return __v_cos(argd(x))[0]; } -static double v_exp(double x) { return __v_exp(argd(x))[0]; } -static double v_log(double x) { return __v_log(argd(x))[0]; } -static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; } -#ifdef __vpcs -static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; } -static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; } -static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; } -static float vn_expf(float x) { return __vn_expf(argf(x))[0]; } -static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; } -static float vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; } -static float vn_logf(float x) { return __vn_logf(argf(x))[0]; } -static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; } -static double vn_sin(double x) { return __vn_sin(argd(x))[0]; } -static double vn_cos(double x) { return __vn_cos(argd(x))[0]; } -static double vn_exp(double x) { return __vn_exp(argd(x))[0]; } -static double vn_log(double x) { return __vn_log(argd(x))[0]; } -static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; } -static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } -static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } -static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } -static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } -static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } -static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } -static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } -static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } -static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } -static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } -static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } +#if WANT_SVE_MATH +#include <arm_sve.h> +typedef __SVFloat32_t sv_float; +typedef __SVFloat64_t sv_double; + +static inline sv_float svargf(float x) { + int n = svcntw(); + float base[n]; + for (int i=0; i<n; i++) + base[i] = (float)x; + base[n-1] = (float) fv[secondcall]; + return svld1(svptrue_b32(), base); +} +static inline sv_double svargd(double x) { + int n = svcntd(); + double base[n]; + for (int i=0; i<n; i++) + base[i] = x; + base[n-1] = dv[secondcall]; + return svld1(svptrue_b64(), base); +} +static inline float svretf(sv_float vec) { + int n = svcntw(); + float res[n]; + svst1(svptrue_b32(), res, vec); + return res[0]; +} +static inline double svretd(sv_double vec) { + int n = svcntd(); + double res[n]; + 
svst1(svptrue_b64(), res, vec); + return res[0]; +} #endif #endif +#include "test/ulp_wrappers.h" + struct fun { const char *name; @@ -322,83 +306,44 @@ static const struct fun fun[] = { #define F2(x) F (x##f, x##f, x, mpfr_##x, 2, 1, f2, 0) #define D1(x) F (x, x, x##l, mpfr_##x, 1, 0, d1, 0) #define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0) - F1 (sin) - F1 (cos) - F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) - F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0) - F1 (exp) - F1 (exp2) - F1 (log) - F1 (log2) - F2 (pow) - F1 (erf) - D1 (exp) - D1 (exp2) - D1 (log) - D1 (log2) - D2 (pow) - D1 (erf) -#if WANT_VMATH - F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0) - F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0) - F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0) - F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0) - F (__s_exp2f_1u, __s_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 0) - F (__s_exp2f, __s_exp2f, exp2, mpfr_exp2, 1, 1, f1, 0) - F (__s_powf, __s_powf, pow, mpfr_pow, 2, 1, f2, 0) - F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0) - F (__s_sin, __s_sin, sinl, mpfr_sin, 1, 0, d1, 0) - F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0) - F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0) - F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0) - F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0) -#if __aarch64__ - F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1) - F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1) - F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) - F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1) - F (__v_exp2f_1u, v_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__v_exp2f, v_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1) - F (__v_powf, v_powf, pow, mpfr_pow, 2, 1, f2, 1) - F (__v_sin, v_sin, sinl, mpfr_sin, 1, 0, d1, 1) - F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1) - F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1) - F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1) - F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1) -#ifdef __vpcs - F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1) - F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1) - F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) - F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1) - F (__vn_exp2f_1u, vn_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__vn_exp2f, vn_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) - F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1) - F (__vn_powf, vn_powf, pow, mpfr_pow, 2, 1, f2, 1) - F (__vn_sin, vn_sin, sinl, mpfr_sin, 1, 0, d1, 1) - F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1) - F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1) - F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1) - F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1) - F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) - F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) - F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) - F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) - F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1) - F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1) - F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1) - F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1) - F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) - F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) - F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) -#endif -#endif -#endif +/* Neon routines. 
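+   Each macro expands to one F (...) row of the fun table below, e.g.
+   VNF1 (sin) becomes
+     F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 0)
+   and the ZVNF*/ZVND* forms emit both the __vn_* row and the matching
+   _ZGVnN* vector-ABI row.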
 */
+#define VF1(x) F (__v_##x##f, v_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define VF2(x) F (__v_##x##f, v_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define VD1(x) F (__v_##x, v_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define VD2(x) F (__v_##x, v_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+#define VNF1(x) F (__vn_##x##f, vn_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define VNF2(x) F (__vn_##x##f, vn_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define VND1(x) F (__vn_##x, vn_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define VND2(x) F (__vn_##x, vn_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+#define ZVF1(x) F (_ZGVnN4v_##x##f, Z_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define ZVF2(x) F (_ZGVnN4vv_##x##f, Z_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define ZVD1(x) F (_ZGVnN2v_##x, Z_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define ZVD2(x) F (_ZGVnN2vv_##x, Z_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+#define ZVNF1(x) VNF1 (x) ZVF1 (x)
+#define ZVNF2(x) VNF2 (x) ZVF2 (x)
+#define ZVND1(x) VND1 (x) ZVD1 (x)
+#define ZVND2(x) VND2 (x) ZVD2 (x)
+/* SVE routines. */
+#define SVF1(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define SVF2(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define SVD1(x) F (__sv_##x, sv_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define SVD2(x) F (__sv_##x, sv_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+#define ZSVF1(x) F (_ZGVsMxv_##x##f, Z_sv_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define ZSVF2(x) F (_ZGVsMxvv_##x##f, Z_sv_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define ZSVD1(x) F (_ZGVsMxv_##x, Z_sv_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define ZSVD2(x) F (_ZGVsMxvv_##x, Z_sv_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+
+#include "test/ulp_funcs.h"
+
 #undef F
 #undef F1
 #undef F2
 #undef D1
 #undef D2
+#undef SVF1
+#undef SVF2
+#undef SVD1
+#undef SVD2
 {0}};

 /* Boilerplate for generic calls. */
@@ -639,12 +584,18 @@ call_mpfr_d2 (mpfr_t y, const struct fun *f, struct args_d2 a, mpfr_rnd_t r)
 static void
 usage (void)
 {
-  puts ("./ulp [-q] [-m] [-f] [-r nudz] [-l soft-ulplimit] [-e ulplimit] func "
+  puts ("./ulp [-q] [-m] [-f] [-r {n|u|d|z}] [-l soft-ulplimit] [-e ulplimit] func "
	"lo [hi [x lo2 hi2] [count]]");
   puts ("Compares func against a higher precision implementation in [lo; hi].");
   puts ("-q: quiet.");
   puts ("-m: use mpfr even if faster method is available.");
-  puts ("-f: disable fenv testing (rounding modes and exceptions).");
+  puts ("-f: disable fenv exceptions testing.");
+#ifdef __vpcs
+  puts ("-c: neutral 'control value' to test behaviour when one lane can affect another. \n"
+	"    This should be different from tested input in other lanes, and non-special \n"
+	"    (i.e. should not trigger fenv exceptions). Default is 1.");
+#endif
+  puts ("-z: ignore sign of 0.");
   puts ("Supported func:");
   for (const struct fun *f = fun; f->name; f++)
     printf ("\t%s\n", f->name);
@@ -768,6 +719,7 @@ main (int argc, char *argv[])
   conf.fenv = 1;
   conf.softlim = 0;
   conf.errlim = INFINITY;
+  conf.ignore_zero_sign = 0;
   for (;;)
     {
       argc--;
@@ -807,11 +759,22 @@ main (int argc, char *argv[])
	{
	  argc--;
	  argv++;
-	  if (argc < 1)
+	  if (argc < 1 || argv[0][1] != '\0')
	    usage ();
	  conf.rc = argv[0][0];
	}
	break;
+      case 'z':
+	conf.ignore_zero_sign = 1;
+	break;
+#ifdef __vpcs
+      case 'c':
+	argc--;
+	argv++;
+	fv[0] = strtof(argv[0], 0);
+	dv[0] = strtod(argv[0], 0);
+	break;
+#endif
       default:
	usage ();
      }
@@ -837,7 +800,19 @@ main (int argc, char *argv[])
     if (strcmp (argv[0], f->name) == 0)
       break;
   if (!f->name)
-    usage ();
+    {
+#ifndef __vpcs
+      /* Ignore vector math functions if vector math is not supported.
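+	 runulp.sh passes every name in its table unconditionally, so names
+	 for unsupported extensions can be skipped (exit 0) rather than
+	 reported as failures.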
 */
+      if (strncmp (argv[0], "_ZGVnN", 6) == 0)
+	exit (0);
+#endif
+#if !WANT_SVE_MATH
+      if (strncmp (argv[0], "_ZGVsMxv", 8) == 0)
+	exit (0);
+#endif
+      printf ("math function %s not supported\n", argv[0]);
+      exit (1);
+    }
   if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG)
     conf.mpfr = 1; /* Use mpfr if long double has no extra precision. */
   if (!USE_MPFR && conf.mpfr)
diff --git a/contrib/arm-optimized-routines/math/test/ulp.h b/contrib/arm-optimized-routines/math/test/ulp.h
index a0c301664321..b0bc59aeef8d 100644
--- a/contrib/arm-optimized-routines/math/test/ulp.h
+++ b/contrib/arm-optimized-routines/math/test/ulp.h
@@ -1,8 +1,8 @@
 /*
  * Generic functions for ULP error estimation.
  *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
  */

 /* For each different math function type,
@@ -37,7 +37,8 @@ static int RT(ulpscale_mpfr) (mpfr_t x, int t)
 /* Difference between exact result and closest real number that
    gets rounded to got, i.e. error before rounding, for a correctly
    rounded result the difference is 0.  */
-static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r)
+static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r,
+			   int ignore_zero_sign)
 {
   RT(float) want = p->y;
   RT(float) d;
@@ -45,10 +46,18 @@ static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r)
   if (RT(asuint) (got) == RT(asuint) (want))
     return 0.0;
+  if (isnan (got) && isnan (want))
+    /* Ignore sign of NaN.  */
+    return RT (issignaling) (got) == RT (issignaling) (want) ? 0 : INFINITY;
   if (signbit (got) != signbit (want))
-    /* May have false positives with NaN.  */
-    //return isnan(got) && isnan(want) ? 0 : INFINITY;
-    return INFINITY;
+    {
+      /* Fall through to ULP calculation if ignoring sign of zero and
+	 exactly one of want and got is non-zero.  */
+      if (ignore_zero_sign && want == got)
+	return 0.0;
+      if (!ignore_zero_sign || (want != 0 && got != 0))
+	return INFINITY;
+    }
   if (!isfinite (want) || !isfinite (got))
     {
       if (isnan (got) != isnan (want))
@@ -114,8 +123,12 @@ static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r,
 static inline void T(call_nofenv) (const struct fun *f, struct T(args) a,
				    int r, RT(float) * y, int *ex)
 {
+  if (r != FE_TONEAREST)
+    fesetround (r);
   *y = T(call) (f, a);
   *ex = 0;
+  if (r != FE_TONEAREST)
+    fesetround (FE_TONEAREST);
 }

 static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a,
@@ -155,8 +168,12 @@ static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a,
					int r, struct RT(ret) * p,
					RT(float) ygot, int exgot)
 {
+  if (r != FE_TONEAREST)
+    fesetround (r);
   RT(double) yl = T(call_long) (f, a);
   p->y = (RT(float)) yl;
+  if (r != FE_TONEAREST)
+    fesetround (FE_TONEAREST);
   if (RT(isok_nofenv) (ygot, p->y))
     return 1;
   p->ulpexp = RT(ulpscale) (p->y);
@@ -288,7 +305,7 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
   if (!ok)
     {
       int print = 0;
-      double err = RT(ulperr) (ygot, &want, r);
+      double err = RT (ulperr) (ygot, &want, r, conf->ignore_zero_sign);
       double abserr = fabs (err);
       // TODO: count errors below accuracy limit.
       if (abserr > 0)
diff --git a/contrib/arm-optimized-routines/math/test/ulp_funcs.h b/contrib/arm-optimized-routines/math/test/ulp_funcs.h
new file mode 100644
index 000000000000..84f7927d3935
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/test/ulp_funcs.h
@@ -0,0 +1,40 @@
+/*
+ * Function entries for ulp.
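+ * Each line expands, via the F/F1/F2/D1/D2 macros defined in ulp.c, into
+ * one row of its fun[] table; e.g. D1 (exp) becomes
+ *   F (exp, exp, expl, mpfr_exp, 1, 0, d1, 0).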
+ * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +/* clang-format off */ + F1 (sin) + F1 (cos) + F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0) + F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0) + F1 (exp) + F1 (exp2) + F1 (log) + F1 (log2) + F2 (pow) + F1 (erf) + D1 (exp) + D1 (exp10) + D1 (exp2) + D1 (log) + D1 (log2) + D2 (pow) + D1 (erf) +#ifdef __vpcs + F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1) + F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1) + F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1) + F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1) + F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1) + F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1) + F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1) + F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1) + F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1) + F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1) + F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1) + F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1) + F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1) +#endif +/* clang-format on */ diff --git a/contrib/arm-optimized-routines/math/test/ulp_wrappers.h b/contrib/arm-optimized-routines/math/test/ulp_wrappers.h new file mode 100644 index 000000000000..60dc3d6dd652 --- /dev/null +++ b/contrib/arm-optimized-routines/math/test/ulp_wrappers.h @@ -0,0 +1,37 @@ +/* + * Function wrappers for ulp. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* clang-format off */ + +/* Wrappers for sincos. */ +static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);} +static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);} +static double sincos_sin(double x) {(void)cos(x); return sin(x);} +static double sincos_cos(double x) {(void)sin(x); return cos(x);} +#if USE_MPFR +static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); } +static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); } +#endif + +/* Wrappers for vector functions. 
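+   Each wrapper broadcasts its scalar argument with argf/argd and returns
+   lane 0 of the vector result, so ulp's scalar test loop can drive the
+   vector routines directly.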
*/ +#ifdef __vpcs +static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; } +static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; } +static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; } +static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; } +static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; } +static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; } +static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; } +static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; } +static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; } +static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; } +static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; } +static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; } +static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; } +#endif + +/* clang-format on */ diff --git a/contrib/arm-optimized-routines/math/tgamma128.c b/contrib/arm-optimized-routines/math/tgamma128.c new file mode 100644 index 000000000000..65deacc49d99 --- /dev/null +++ b/contrib/arm-optimized-routines/math/tgamma128.c @@ -0,0 +1,356 @@ +/* + * Implementation of the true gamma function (as opposed to lgamma) + * for 128-bit long double. + * + * Copyright (c) 2006-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* + * This module implements the float128 gamma function under the name + * tgamma128. It's expected to be suitable for integration into system + * maths libraries under the standard name tgammal, if long double is + * 128-bit. Such a library will probably want to check the error + * handling and optimize the initial process of extracting the + * exponent, which is done here by simple and portable (but + * potentially slower) methods. + */ + +#include <float.h> +#include <math.h> +#include <stdbool.h> +#include <stddef.h> + +/* Only binary128 format is supported. */ +#if LDBL_MANT_DIG == 113 + +#include "tgamma128.h" + +#define lenof(x) (sizeof(x)/sizeof(*(x))) + +/* + * Helper routine to evaluate a polynomial via Horner's rule + */ +static long double poly(const long double *coeffs, size_t n, long double x) +{ + long double result = coeffs[--n]; + + while (n > 0) + result = (result * x) + coeffs[--n]; + + return result; +} + +/* + * Compute sin(pi*x) / pi, for use in the reflection formula that + * relates gamma(-x) and gamma(x). + */ +static long double sin_pi_x_over_pi(long double x) +{ + int quo; + long double fracpart = remquol(x, 0.5L, &quo); + + long double sign = 1.0L; + if (quo & 2) + sign = -sign; + quo &= 1; + + if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) { + /* For numbers this size, sin(pi*x) is so close to pi*x that + * sin(pi*x)/pi is indistinguishable from x in float128 */ + return sign * fracpart; + } + + if (quo == 0) { + return sign * sinl(pi*fracpart) / pi; + } else { + return sign * cosl(pi*fracpart) / pi; + } +} + +/* Return tgamma(x) on the assumption that x >= 8. */ +static long double tgamma_large(long double x, + bool negative, long double negadjust) +{ + /* + * In this range we compute gamma(x) as x^(x-1/2) * e^-x * K, + * where K is a correction factor computed as a polynomial in 1/x. + * + * (Vaguely inspired by the form of the Lanczos approximation, but + * I tried the Lanczos approximation itself and it suffers badly + * from big cancellation leading to loss of significance.) 
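+     *
+     * (For reference, the constant term of the fitted polynomial,
+     * coeffs_large[0], is sqrt(2*pi/e) ~= 1.5203; see the Remez fit in
+     * tools/tgamma128_gen.jl.)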
+ */ + long double t = 1/x; + long double p = poly(coeffs_large, lenof(coeffs_large), t); + + /* + * To avoid overflow in cases where x^(x-0.5) does overflow + * but gamma(x) does not, we split x^(x-0.5) in half and + * multiply back up _after_ multiplying the shrinking factor + * of exp(-(x-0.5)). + * + * Note that computing x-0.5 and (x-0.5)/2 is exact for the + * relevant range of x, so the only sources of error are pow + * and exp themselves, plus the multiplications. + */ + long double powhalf = powl(x, (x-0.5L)/2.0L); + long double expret = expl(-(x-0.5L)); + + if (!negative) { + return (expret * powhalf) * powhalf * p; + } else { + /* + * Apply the reflection formula as commented below, but + * carefully: negadjust has magnitude less than 1, so it can + * turn a case where gamma(+x) would overflow into a case + * where gamma(-x) doesn't underflow. Not only that, but the + * FP format has greater range in the tiny domain due to + * denormals. For both reasons, it's not good enough to + * compute the positive result and then adjust it. + */ + long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p); + return ret / powhalf; + } +} + +/* Return tgamma(x) on the assumption that 0 <= x < 1/32. */ +static long double tgamma_tiny(long double x, + bool negative, long double negadjust) +{ + /* + * For x near zero, we use a polynomial approximation to + * g = 1/(x*gamma(x)), and then return 1/(g*x). + */ + long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x); + if (!negative) + return 1.0L / (g*x); + else + return g / negadjust; +} + +/* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */ +static long double tgamma_ultratiny(long double x, bool negative, + long double negadjust) +{ + /* On this interval, gamma can't even be distinguished from 1/x, + * so we skip the polynomial evaluation in tgamma_tiny, partly to + * save time and partly to avoid the tiny intermediate values + * setting the underflow exception flag. */ + if (!negative) + return 1.0L / x; + else + return 1.0L / negadjust; +} + +/* Return tgamma(x) on the assumption that 1 <= x <= 2. */ +static long double tgamma_central(long double x) +{ + /* + * In this central interval, our strategy is to finding the + * difference between x and the point where gamma has a minimum, + * and approximate based on that. + */ + + /* The difference between the input x and the minimum x. The first + * subtraction is expected to be exact, since x and min_hi have + * the same exponent (unless x=2, in which case it will still be + * exact). */ + long double t = (x - min_x_hi) - min_x_lo; + + /* + * Now use two different polynomials for the intervals [1,m] and + * [m,2]. + */ + long double p; + if (t < 0) + p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t); + else + p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t); + + return (min_y_lo + p * (t*t)) + min_y_hi; +} + +long double tgamma128(long double x) +{ + /* + * Start by extracting the number's sign and exponent, and ruling + * out cases of non-normalized numbers. + * + * For an implementation integrated into a system libm, it would + * almost certainly be quicker to do this by direct bitwise access + * to the input float128 value, using whatever is the local idiom + * for knowing its endianness. + * + * Integration into a system libc may also need to worry about + * setting errno, if that's the locally preferred way to report + * math.h errors. 
+ */ + int sign = signbit(x); + int exponent; + switch (fpclassify(x)) { + case FP_NAN: + return x+x; /* propagate QNaN, make SNaN throw an exception */ + case FP_ZERO: + return 1/x; /* divide by zero on purpose to indicate a pole */ + case FP_INFINITE: + if (sign) { + return x-x; /* gamma(-inf) has indeterminate sign, so provoke an + * IEEE invalid operation exception to indicate that */ + } + return x; /* but gamma(+inf) is just +inf with no error */ + case FP_SUBNORMAL: + exponent = -16384; + break; + default: + frexpl(x, &exponent); + exponent--; + break; + } + + bool negative = false; + long double negadjust = 0.0L; + + if (sign) { + /* + * Euler's reflection formula is + * + * gamma(1-x) gamma(x) = pi/sin(pi*x) + * + * pi + * => gamma(x) = -------------------- + * gamma(1-x) sin(pi*x) + * + * But computing 1-x is going to lose a lot of accuracy when x + * is very small, so instead we transform using the recurrence + * gamma(t+1)=t gamma(t). Setting t=-x, this gives us + * gamma(1-x) = -x gamma(-x), so we now have + * + * pi + * gamma(x) = ---------------------- + * -x gamma(-x) sin(pi*x) + * + * which relates gamma(x) to gamma(-x), which is much nicer, + * since x can be turned into -x without rounding. + */ + negadjust = sin_pi_x_over_pi(x); + negative = true; + x = -x; + + /* + * Now the ultimate answer we want is + * + * 1 / (gamma(x) * x * negadjust) + * + * where x is the positive value we've just turned it into. + * + * For some of the cases below, we'll compute gamma(x) + * normally and then compute this adjusted value afterwards. + * But for others, we can implement the reciprocal operation + * in this formula by _avoiding_ an inversion that the + * sub-case was going to do anyway. + */ + + if (negadjust == 0) { + /* + * Special case for negative integers. Applying the + * reflection formula would cause division by zero, but + * standards would prefer we treat this error case as an + * invalid operation and return NaN instead. (Possibly + * because otherwise you'd have to decide which sign of + * infinity to return, and unlike the x=0 case, there's no + * sign of zero available to disambiguate.) + */ + return negadjust / negadjust; + } + } + + /* + * Split the positive domain into various cases. For cases where + * we do the negative-number adjustment the usual way, we'll leave + * the answer in 'g' and drop out of the if statement. + */ + long double g; + + if (exponent >= 11) { + /* + * gamma of any positive value this large overflows, and gamma + * of any negative value underflows. + */ + if (!negative) { + long double huge = 0x1p+12288L; + return huge * huge; /* provoke an overflow */ + } else { + long double tiny = 0x1p-12288L; + return tiny * tiny * negadjust; /* underflow, of the right sign */ + } + } else if (exponent >= 3) { + /* Negative-number adjustment happens inside here */ + return tgamma_large(x, negative, negadjust); + } else if (exponent < -113) { + /* Negative-number adjustment happens inside here */ + return tgamma_ultratiny(x, negative, negadjust); + } else if (exponent < -5) { + /* Negative-number adjustment happens inside here */ + return tgamma_tiny(x, negative, negadjust); + } else if (exponent == 0) { + g = tgamma_central(x); + } else if (exponent < 0) { + /* + * For x in [1/32,1) we range-reduce upwards to the interval + * [1,2), using the inverse of the normal recurrence formula: + * gamma(x) = gamma(x+1)/x. 
+ */ + g = tgamma_central(1+x) / x; + } else { + /* + * For x in [2,8) we range-reduce downwards to the interval + * [1,2) by repeated application of the recurrence formula. + * + * Actually multiplying (x-1) by (x-2) by (x-3) and so on + * would introduce multiple ULPs of rounding error. We can get + * better accuracy by writing x = (k+1/2) + t, where k is an + * integer and |t|<1/2, and expanding out the obvious factor + * (x-1)(x-2)...(x-k+1) as a polynomial in t. + */ + long double mult; + int i = x; + if (i == 2) { /* x in [2,3) */ + mult = (x-1); + } else { + long double t = x - (i + 0.5L); + switch (i) { + /* E.g. for x=3.5+t, we want + * (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */ + case 3: + mult = 3.75L+t*(4.0L+t); + break; + case 4: + mult = 13.125L+t*(17.75L+t*(7.5L+t)); + break; + case 5: + mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t))); + break; + case 6: + mult = 324.84375L+t*(570.5625L+t*(376.250L+t*( + 117.5L+t*(17.5L+t)))); + break; + case 7: + mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*( + 1140.0L+t*(231.25L+t*(24.0L+t))))); + break; + } + } + + g = tgamma_central(x - (i-1)) * mult; + } + + if (!negative) { + /* Positive domain: return g unmodified */ + return g; + } else { + /* Negative domain: apply the reflection formula as commented above */ + return 1.0L / (g * x * negadjust); + } +} + +#endif diff --git a/contrib/arm-optimized-routines/math/tgamma128.h b/contrib/arm-optimized-routines/math/tgamma128.h new file mode 100644 index 000000000000..90875a22dce4 --- /dev/null +++ b/contrib/arm-optimized-routines/math/tgamma128.h @@ -0,0 +1,141 @@ +/* + * Polynomial coefficients and other constants for tgamma128.c. + * + * Copyright (c) 2006-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +/* The largest positive value for which 128-bit tgamma does not overflow. 
*/ +static const long double max_x = 0x1.b6e3180cd66a5c4206f128ba77f4p+10L; + +/* Coefficients of the polynomial used in the tgamma_large() subroutine */ +static const long double coeffs_large[] = { + 0x1.8535745aa79569579b9eec0f3bbcp+0L, + 0x1.0378f83c6fb8f0e51269f2b4a973p-3L, + 0x1.59f6a05094f69686c3380f4e2783p-8L, + -0x1.0b291dee952a82764a4859b081a6p-8L, + -0x1.6dd301b2205bf936b5a3eaad0dbbp-12L, + 0x1.387a8b5f38dd77e7f139b1021e86p-10L, + 0x1.bca46637f65b13750c728cc29e40p-14L, + -0x1.d80401c00aef998c9e303151a51cp-11L, + -0x1.49cb6bb09f935a2053ccc2cf3711p-14L, + 0x1.4e950204437dcaf2be77f73a6f45p-10L, + 0x1.cb711a2d65f188bf60110934d6bep-14L, + -0x1.7d7ff4bc95dc7faefc5e767f70f1p-9L, + -0x1.0305ab9760cddb0d833e73766836p-12L, + 0x1.3ef6c84bf1cd5c3f65ac2693bb5bp-7L, + 0x1.bb4144740ad9290123fdcea684aap-11L, + -0x1.72ab4e88272a229bfafd192450f0p-5L, + 0x1.80c70ac6eb3b7a698983d25a62b8p-12L, + 0x1.e222791c6743ce3e3cae220fb236p-3L, + 0x1.1a2dca1c82a9326c52b465f7cb7ap-2L, + -0x1.9d204fa235a42cd901b123d2ad47p+1L, + 0x1.55b56d1158f77ddb1c95fc44ab02p+0L, + 0x1.37f900a11dbd892abd7dde533e2dp+5L, + -0x1.2da49f4188dd89cb958369ef2401p+7L, + 0x1.fdae5ec3ec6eb7dffc09edbe6c14p+7L, + -0x1.61433cebe649098c9611c4c7774ap+7L, +}; + +/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */ +static const long double coeffs_tiny[] = { + 0x1.0000000000000000000000000000p+0L, + 0x1.2788cfc6fb618f49a37c7f0201fep-1L, + -0x1.4fcf4026afa2dceb8490ade22796p-1L, + -0x1.5815e8fa27047c8f42b5d9217244p-5L, + 0x1.5512320b43fbe5dfa771333518f7p-3L, + -0x1.59af103c340927bffdd44f954bfcp-5L, + -0x1.3b4af28483e210479657e5543366p-7L, + 0x1.d919c527f6070bfce9b29c2ace9cp-8L, + -0x1.317112ce35337def3556a18aa178p-10L, + -0x1.c364fe77a6f27677b985b1fa2e1dp-13L, + 0x1.0c8a7a19a3fd40fe1f7e867efe7bp-13L, + -0x1.51cf9f090b5dc398ba86305e3634p-16L, + -0x1.4e80f64c04a339740de06ca9fa4ap-20L, + 0x1.241ddc2aef2ec20e58b08f2fda17p-20L, +}; + +/* The location within the interval [1,2] where gamma has a minimum. + * Specified as the sum of two 128-bit values, for extra precision. */ +static const long double min_x_hi = 0x1.762d86356be3f6e1a9c8865e0a4fp+0L; +static const long double min_x_lo = 0x1.ac54d7d218de21303a7c60f08840p-118L; + +/* The actual minimum value that gamma takes at that location. + * Again specified as the sum of two 128-bit values. 
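+ * The minimum itself is about 0.885603, attained near x = 1.461632.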
*/ +static const long double min_y_hi = 0x1.c56dc82a74aee8d8851566d40f32p-1L; +static const long double min_y_lo = 0x1.8ed98685742c353ce55e5794686fp-114L; + +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [1,min_x] */ +static const long double coeffs_central_neg[] = { + 0x1.b6c53f7377b83839c8a292e43b69p-2L, + 0x1.0bae9f40c7d09ed76e732045850ap-3L, + 0x1.4981175e14d04c3530e51d01c5fep-3L, + 0x1.79f77aaf032c948af3a9edbd2061p-4L, + 0x1.1e97bd10821095a5b79fbfdfa1a3p-4L, + 0x1.8071ce0935e4dcf0b33b0fbec7c1p-5L, + 0x1.0b44c2f92982f887b55ec36dfdb0p-5L, + 0x1.6df1de1e178ef72ca7bd63d40870p-6L, + 0x1.f63f502bde27e81c0f5e13479b43p-7L, + 0x1.57fd67d901f40ea011353ad89a0ap-7L, + 0x1.d7151376eed187eb753e2273cafcp-8L, + 0x1.427162b5c6ff1d904c71ef53e37cp-8L, + 0x1.b954b8c3a56cf93e49ef6538928ap-9L, + 0x1.2dff2ec26a3ae5cd3aaccae7a09ep-9L, + 0x1.9d35250d9b9378d9b59df734537ap-10L, + 0x1.1b2c0c48b9855a28f6dbd6fdff3cp-10L, + 0x1.7e0db39bb99cdb52b028d9359380p-11L, + 0x1.2164b5e1d364a0b5eaf97c436aa7p-11L, + 0x1.27521cf5fd24dcdf43524e6add11p-13L, + 0x1.06461d62243bf9a826b42349672fp-10L, + -0x1.2b852abead28209b4e0c756dc46ep-9L, + 0x1.be673c11a72c826115ec6d286c14p-8L, + -0x1.fd9ce330c215c31fcd3cb53c42ebp-7L, + 0x1.fa362bd2dc68f41abef2d8600acdp-6L, + -0x1.a21585b2f52f8b23855de8e452edp-5L, + 0x1.1f234431ed032052fc92e64e0493p-4L, + -0x1.40d332476ca0199c60cdae3f9132p-4L, + 0x1.1d45dc665d86012eba2eea199cefp-4L, + -0x1.8491016cdd08dc9be7ade9b5fef3p-5L, + 0x1.7e7e2fbc6d49ad484300d6add324p-6L, + -0x1.e63fe3f874a37276a8d7d8b705ecp-8L, + 0x1.30a2a73944f8c84998314d69c23fp-10L, +}; + +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [min_x,2] */ +static const long double coeffs_central_pos[] = { + 0x1.b6c53f7377b83839c8a292e22aa2p-2L, + -0x1.0bae9f40c7d09ed76e72e1c955dep-3L, + 0x1.4981175e14d04c3530ee5e1ecebcp-3L, + -0x1.79f77aaf032c948ac983d77f3e07p-4L, + 0x1.1e97bd10821095ab7dc94936cc11p-4L, + -0x1.8071ce0935e4d7edef8cbf2a1cf1p-5L, + 0x1.0b44c2f929837fafef7b5d9e80f1p-5L, + -0x1.6df1de1e175fe2a51faa25cddbb4p-6L, + 0x1.f63f502be57d11aed2cfe90843ffp-7L, + -0x1.57fd67d852f230015b9f64770273p-7L, + 0x1.d715138adc07e5fce81077070357p-8L, + -0x1.4271618e9fda8992a667adb15f4fp-8L, + 0x1.b954d15d9eb772e80fdd760672d7p-9L, + -0x1.2dfe391241d3cb79c8c15182843dp-9L, + 0x1.9d44396fcd48451c3ba924cee814p-10L, + -0x1.1ac195fb99739e341589e39803e6p-10L, + 0x1.82e46127b68f002770826e25f146p-11L, + -0x1.089dacd90d9f41493119ac178359p-11L, + 0x1.6993c007b20394a057d21f3d37f8p-12L, + -0x1.ec43a709f4446560c099dec8e31bp-13L, + 0x1.4ba36322f4074e9add9450f003cap-13L, + -0x1.b3f83a977965ca1b7937bf5b34cap-14L, + 0x1.10af346abc09cb25a6d9fe810b6ep-14L, + -0x1.38d8ea1188f242f50203edc395bdp-15L, + 0x1.39add987a948ec56f62b721a4475p-16L, + -0x1.02a4e141f286c8a967e2df9bc9adp-17L, + 0x1.433b50af22425f546e87113062d7p-19L, + -0x1.0c7b73cb0013f00aafc103e8e382p-21L, + 0x1.b852de313ec38da2297f6deaa6b4p-25L, +}; + +/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine + */ +static const long double pi = 0x1.921fb54442d18469898cc51701b8p+1L; diff --git a/contrib/arm-optimized-routines/math/tools/cos.sollya b/contrib/arm-optimized-routines/math/tools/cos.sollya index bd72d6b74820..6690adfcbb9b 100644 --- a/contrib/arm-optimized-routines/math/tools/cos.sollya +++ b/contrib/arm-optimized-routines/math/tools/cos.sollya @@ -1,7 +1,7 @@ // polynomial for approximating cos(x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 8; // polynomial degree a = -pi/4; // interval diff --git a/contrib/arm-optimized-routines/math/tools/exp.sollya b/contrib/arm-optimized-routines/math/tools/exp.sollya index b7a462cda5a4..0668bdb5b3d3 100644 --- a/contrib/arm-optimized-routines/math/tools/exp.sollya +++ b/contrib/arm-optimized-routines/math/tools/exp.sollya @@ -1,7 +1,7 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 5; // poly degree N = 128; // table entries diff --git a/contrib/arm-optimized-routines/math/tools/exp2.sollya b/contrib/arm-optimized-routines/math/tools/exp2.sollya index e760769601d4..bd0a42d6bbcb 100644 --- a/contrib/arm-optimized-routines/math/tools/exp2.sollya +++ b/contrib/arm-optimized-routines/math/tools/exp2.sollya @@ -1,7 +1,7 @@ // polynomial for approximating 2^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception // exp2f parameters deg = 3; // poly degree diff --git a/contrib/arm-optimized-routines/math/tools/log.sollya b/contrib/arm-optimized-routines/math/tools/log.sollya index 6df4db44b6f3..5288f5572925 100644 --- a/contrib/arm-optimized-routines/math/tools/log.sollya +++ b/contrib/arm-optimized-routines/math/tools/log.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 12; // poly degree // |log(1+x)| > 0x1p-4 outside the interval diff --git a/contrib/arm-optimized-routines/math/tools/log2.sollya b/contrib/arm-optimized-routines/math/tools/log2.sollya index 4a364c0f111f..85811be5d90c 100644 --- a/contrib/arm-optimized-routines/math/tools/log2.sollya +++ b/contrib/arm-optimized-routines/math/tools/log2.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 11; // poly degree // |log2(1+x)| > 0x1p-4 outside the interval diff --git a/contrib/arm-optimized-routines/math/tools/log2_abs.sollya b/contrib/arm-optimized-routines/math/tools/log2_abs.sollya index 82c4dac26fa1..d018ba0145d2 100644 --- a/contrib/arm-optimized-routines/math/tools/log2_abs.sollya +++ b/contrib/arm-optimized-routines/math/tools/log2_abs.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log2(1+x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 7; // poly degree // interval ~= 1/(2*N), where N is the table entries diff --git a/contrib/arm-optimized-routines/math/tools/log_abs.sollya b/contrib/arm-optimized-routines/math/tools/log_abs.sollya index a2ac190fc497..5f9bfe41a683 100644 --- a/contrib/arm-optimized-routines/math/tools/log_abs.sollya +++ b/contrib/arm-optimized-routines/math/tools/log_abs.sollya @@ -1,7 +1,7 @@ // polynomial for approximating log(1+x) // // Copyright (c) 2019, Arm Limited. 
-// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree // interval ~= 1/(2*N), where N is the table entries diff --git a/contrib/arm-optimized-routines/math/tools/plot.py b/contrib/arm-optimized-routines/math/tools/plot.py index 6c8b89ff284b..a0fa02322560 100755 --- a/contrib/arm-optimized-routines/math/tools/plot.py +++ b/contrib/arm-optimized-routines/math/tools/plot.py @@ -3,7 +3,7 @@ # ULP error plot tool. # # Copyright (c) 2019, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception import numpy as np import matplotlib.pyplot as plt diff --git a/contrib/arm-optimized-routines/math/tools/remez.jl b/contrib/arm-optimized-routines/math/tools/remez.jl index 2ff436f5287f..1deab67d0660 100755 --- a/contrib/arm-optimized-routines/math/tools/remez.jl +++ b/contrib/arm-optimized-routines/math/tools/remez.jl @@ -4,7 +4,7 @@ # remez.jl - implementation of the Remez algorithm for polynomial approximation # # Copyright (c) 2015-2019, Arm Limited. -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception import Base.\ diff --git a/contrib/arm-optimized-routines/math/tools/sin.sollya b/contrib/arm-optimized-routines/math/tools/sin.sollya index a6e851145c11..a19300019867 100644 --- a/contrib/arm-optimized-routines/math/tools/sin.sollya +++ b/contrib/arm-optimized-routines/math/tools/sin.sollya @@ -1,7 +1,7 @@ // polynomial for approximating sin(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 7; // polynomial degree a = -pi/4; // interval diff --git a/contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl b/contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl new file mode 100644 index 000000000000..ecec174110ea --- /dev/null +++ b/contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl @@ -0,0 +1,212 @@ +# -*- julia -*- +# +# Generate tgamma128.h, containing polynomials and constants used by +# tgamma128.c. +# +# Copyright (c) 2006-2023, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +# This Julia program depends on the 'Remez' and 'SpecialFunctions' +# library packages. To install them, run this at the interactive Julia +# prompt: +# +# import Pkg; Pkg.add(["Remez", "SpecialFunctions"]) +# +# Tested on Julia 1.4.1 (Ubuntu 20.04) and 1.9.0 (22.04). + +import Printf +import Remez +import SpecialFunctions + +# Round a BigFloat to 128-bit long double and format it as a C99 hex +# float literal. +function quadhex(x) + sign = " " + if x < 0 + sign = "-" + x = -x + end + + exponent = BigInt(floor(log2(x))) + exponent = max(exponent, -16382) + @assert(exponent <= 16383) # else overflow + + x /= BigFloat(2)^exponent + @assert(1 <= x < 2) + x *= BigFloat(2)^112 + mantissa = BigInt(round(x)) + + mantstr = string(mantissa, base=16, pad=29) + return Printf.@sprintf("%s0x%s.%sp%+dL", sign, mantstr[1], mantstr[2:end], + exponent) +end + +# Round a BigFloat to 128-bit long double and return it still as a +# BigFloat. 
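+# The round argument selects how the 113-bit significand is rounded:
+# round < 0 rounds its magnitude down, round > 0 rounds it up, and 0
+# rounds to nearest. The script always passes -1, so that a constant c
+# can be split as c = hi + lo, with hi = quadval(c, -1) and a small
+# non-negative remainder lo = c - hi.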
+function quadval(x, round=0)
+    sign = +1
+    if x.sign < 0
+        sign = -1
+        x = -x
+    end
+
+    exponent = BigInt(floor(log2(x)))
+    exponent = max(exponent, -16382)
+    @assert(exponent <= 16383) # else overflow
+
+    x /= BigFloat(2)^exponent
+    @assert(1 <= x < 2)
+    x *= BigFloat(2)^112
+    if round < 0
+        mantissa = floor(x)
+    elseif round > 0
+        mantissa = ceil(x)
+    else
+        mantissa = Base.round(x) # qualified: the round parameter shadows Base.round
+    end
+
+    return sign * mantissa * BigFloat(2)^(exponent - 112)
+end
+
+# Output an array of BigFloats as a C array declaration.
+function dumparray(a, name)
+    println("static const long double ", name, "[] = {")
+    for x in a
+        println("    ", quadhex(x), ",")
+    end
+    println("};")
+end
+
+print("/*
+ * Polynomial coefficients and other constants for tgamma128.c.
+ *
+ * Copyright (c) 2006,2009,2023 Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+")
+
+Base.MPFR.setprecision(512)
+
+e = exp(BigFloat(1))
+
+print("
+/* The largest positive value for which 128-bit tgamma does not overflow. */
+")
+lo = BigFloat("1000")
+hi = BigFloat("2000")
+while true
+    global lo
+    global hi
+    global max_x
+
+    mid = (lo + hi) / 2
+    if mid == lo || mid == hi
+        max_x = mid
+        break
+    end
+    if SpecialFunctions.logabsgamma(mid)[1] < 16384 * log(BigFloat(2))
+        lo = mid
+    else
+        hi = mid
+    end
+end
+max_x = quadval(max_x, -1)
+println("static const long double max_x = ", quadhex(max_x), ";")
+
+print("
+/* Coefficients of the polynomial used in the tgamma_large() subroutine */
+")
+N, D, E, X = Remez.ratfn_minimax(
+    x -> x==0 ? sqrt(BigFloat(2)*pi/e) :
+         exp(SpecialFunctions.logabsgamma(1/x)[1] +
+             (1/x-0.5)*(1+log(x))),
+    (0, 1/BigFloat(8)),
+    24, 0,
+    (x, y) -> 1/y
+)
+dumparray(N, "coeffs_large")
+
+print("
+/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */
+")
+N, D, E, X = Remez.ratfn_minimax(
+    x -> x==0 ? 1 : 1/(x*SpecialFunctions.gamma(x)),
+    (0, 1/BigFloat(32)),
+    13, 0,
+)
+dumparray(N, "coeffs_tiny")
+
+print("
+/* The location within the interval [1,2] where gamma has a minimum.
+ * Specified as the sum of two 128-bit values, for extra precision. */
+")
+lo = BigFloat("1.4")
+hi = BigFloat("1.5")
+while true
+    global lo
+    global hi
+    global min_x
+
+    mid = (lo + hi) / 2
+    if mid == lo || mid == hi
+        min_x = mid
+        break
+    end
+    if SpecialFunctions.digamma(mid) < 0
+        lo = mid
+    else
+        hi = mid
+    end
+end
+min_x_hi = quadval(min_x, -1)
+println("static const long double min_x_hi = ", quadhex(min_x_hi), ";")
+println("static const long double min_x_lo = ", quadhex(min_x - min_x_hi), ";")
+
+print("
+/* The actual minimum value that gamma takes at that location.
+ * Again specified as the sum of two 128-bit values. */
+")
+min_y = SpecialFunctions.gamma(min_x)
+min_y_hi = quadval(min_y, -1)
+println("static const long double min_y_hi = ", quadhex(min_y_hi), ";")
+println("static const long double min_y_lo = ", quadhex(min_y - min_y_hi), ";")
+
+function taylor_bodge(x)
+    # Taylor series generated by Wolfram Alpha for (gamma(min_x+x)-min_y)/x^2.
+    # Used in the Remez calls below for x values very near the origin, to avoid
+    # significance loss problems when trying to compute it directly via that
+    # formula (even in MPFR's extra precision).
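+    # Since gamma has its minimum at min_x, gamma'(min_x) = 0 and
+    # gamma(min_x + x) - min_y = O(x^2), so the quotient tends to the
+    # finite limit gamma''(min_x)/2 as x -> 0; the four series terms
+    # below are ample for the |x| < 2^-64 range this is used on.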
+ return BigFloat("0.428486815855585429730209907810650582960483696962660010556335457558784421896667728014324097132413696263704801646004585959298743677879606168187061990204432200")+x*(-BigFloat("0.130704158939785761928008749242671025181542078105370084716141350308119418619652583986015464395882363802104154017741656168641240436089858504560718773026275797")+x*(BigFloat("0.160890753325112844190519489594363387594505844658437718135952967735294789599989664428071656484587979507034160383271974554122934842441540146372016567834062876")+x*(-BigFloat("0.092277030213334350126864106458600575084335085690780082222880945224248438672595248111704471182201673989215223667543694847795410779036800385804729955729659506")))) +end + +print(" +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [1,min_x] */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x < BigFloat(0x1p-64) ? taylor_bodge(-x) : + (SpecialFunctions.gamma(min_x - x) - min_y) / (x*x), + (0, min_x - 1), + 31, 0, + (x, y) -> x^2, +) +dumparray(N, "coeffs_central_neg") + +print(" +/* Coefficients of the polynomial used in the tgamma_central() subroutine + * for computing gamma on the interval [min_x,2] */ +") +N, D, E, X = Remez.ratfn_minimax( + x -> x < BigFloat(0x1p-64) ? taylor_bodge(x) : + (SpecialFunctions.gamma(min_x + x) - min_y) / (x*x), + (0, 2 - min_x), + 28, 0, + (x, y) -> x^2, +) +dumparray(N, "coeffs_central_pos") + +print(" +/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine + */ +") +println("static const long double pi = ", quadhex(BigFloat(pi)), ";") diff --git a/contrib/arm-optimized-routines/math/tools/v_exp.sollya b/contrib/arm-optimized-routines/math/tools/v_exp.sollya index c0abb63fb642..5fa7de7435a9 100644 --- a/contrib/arm-optimized-routines/math/tools/v_exp.sollya +++ b/contrib/arm-optimized-routines/math/tools/v_exp.sollya @@ -1,7 +1,7 @@ // polynomial for approximating e^x // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 4; // poly degree N = 128; // table entries diff --git a/contrib/arm-optimized-routines/math/tools/v_log.sollya b/contrib/arm-optimized-routines/math/tools/v_log.sollya index cc3d2c4ae72a..d982524eb920 100644 --- a/contrib/arm-optimized-routines/math/tools/v_log.sollya +++ b/contrib/arm-optimized-routines/math/tools/v_log.sollya @@ -1,7 +1,7 @@ // polynomial used for __v_log(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 6; // poly degree a = -0x1.fc1p-9; diff --git a/contrib/arm-optimized-routines/math/tools/v_sin.sollya b/contrib/arm-optimized-routines/math/tools/v_sin.sollya index 65cc9957c624..63b9d65a1ac3 100644 --- a/contrib/arm-optimized-routines/math/tools/v_sin.sollya +++ b/contrib/arm-optimized-routines/math/tools/v_sin.sollya @@ -1,7 +1,7 @@ // polynomial for approximating sin(x) // // Copyright (c) 2019, Arm Limited. -// SPDX-License-Identifier: MIT +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception deg = 15; // polynomial degree a = -pi/2; // interval diff --git a/contrib/arm-optimized-routines/math/v_cos.c b/contrib/arm-optimized-routines/math/v_cos.c deleted file mode 100644 index 20ba6bd0d0d9..000000000000 --- a/contrib/arm-optimized-routines/math/v_cos.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Double-precision vector cos function. - * - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const double Poly[] = { -/* worst-case error is 3.5 ulp. - abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */ --0x1.9f4a9c8b21dc9p-41, - 0x1.60e88a10163f2p-33, --0x1.ae6361b7254e7p-26, - 0x1.71de382e8d62bp-19, --0x1.a01a019aeb4ffp-13, - 0x1.111111110b25ep-7, --0x1.55555555554c3p-3, -}; - -#define C7 v_f64 (Poly[0]) -#define C6 v_f64 (Poly[1]) -#define C5 v_f64 (Poly[2]) -#define C4 v_f64 (Poly[3]) -#define C3 v_f64 (Poly[4]) -#define C2 v_f64 (Poly[5]) -#define C1 v_f64 (Poly[6]) - -#define InvPi v_f64 (0x1.45f306dc9c883p-2) -#define HalfPi v_f64 (0x1.921fb54442d18p+0) -#define Pi1 v_f64 (0x1.921fb54442d18p+1) -#define Pi2 v_f64 (0x1.1a62633145c06p-53) -#define Pi3 v_f64 (0x1.c1cd129024e09p-106) -#define Shift v_f64 (0x1.8p52) -#define RangeVal v_f64 (0x1p23) -#define AbsMask v_u64 (0x7fffffffffffffff) - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (cos, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(cos) (v_f64_t x) -{ - v_f64_t n, r, r2, y; - v_u64_t odd, cmp; - - r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask); - cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal)); - - /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = v_fma_f64 (InvPi, r + HalfPi, Shift); - odd = v_as_u64_f64 (n) << 63; - n -= Shift; - n -= v_f64 (0.5); - - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ - r = v_fma_f64 (-Pi1, n, r); - r = v_fma_f64 (-Pi2, n, r); - r = v_fma_f64 (-Pi3, n, r); - - /* sin(r) poly approx. */ - r2 = r * r; - y = v_fma_f64 (C7, r2, C6); - y = v_fma_f64 (y, r2, C5); - y = v_fma_f64 (y, r2, C4); - y = v_fma_f64 (y, r2, C3); - y = v_fma_f64 (y, r2, C2); - y = v_fma_f64 (y, r2, C1); - y = v_fma_f64 (y * r2, r, r); - - /* sign. */ - y = v_as_f64_u64 (v_as_u64_f64 (y) ^ odd); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_cosf.c b/contrib/arm-optimized-routines/math/v_cosf.c deleted file mode 100644 index 150294b8845e..000000000000 --- a/contrib/arm-optimized-routines/math/v_cosf.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Single-precision vector cos function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 1.886 ulp error */ - 0x1.5b2e76p-19f, - -0x1.9f42eap-13f, - 0x1.110df4p-7f, - -0x1.555548p-3f, -}; -#define Pi1 v_f32 (0x1.921fb6p+1f) -#define Pi2 v_f32 (-0x1.777a5cp-24f) -#define Pi3 v_f32 (-0x1.ee59dap-49f) -#define A3 v_f32 (Poly[3]) -#define A5 v_f32 (Poly[2]) -#define A7 v_f32 (Poly[1]) -#define A9 v_f32 (Poly[0]) -#define RangeVal v_f32 (0x1p20f) -#define InvPi v_f32 (0x1.45f306p-2f) -#define Shift v_f32 (0x1.8p+23f) -#define AbsMask v_u32 (0x7fffffff) -#define HalfPi v_f32 (0x1.921fb6p0f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (cosf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(cosf) (v_f32_t x) -{ - v_f32_t n, r, r2, y; - v_u32_t odd, cmp; - - r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask); - cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal)); - - /* n = rint((|x|+pi/2)/pi) - 0.5 */ - n = v_fma_f32 (InvPi, r + HalfPi, Shift); - odd = v_as_u32_f32 (n) << 31; - n -= Shift; - n -= v_f32 (0.5f); - - /* r = |x| - n*pi (range reduction into -pi/2 .. 
pi/2) */ - r = v_fma_f32 (-Pi1, n, r); - r = v_fma_f32 (-Pi2, n, r); - r = v_fma_f32 (-Pi3, n, r); - - /* y = sin(r) */ - r2 = r * r; - y = v_fma_f32 (A9, r2, A7); - y = v_fma_f32 (y, r2, A5); - y = v_fma_f32 (y, r2, A3); - y = v_fma_f32 (y * r2, r, r); - - /* sign fix */ - y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_exp.c b/contrib/arm-optimized-routines/math/v_exp.c deleted file mode 100644 index e459d53fddd2..000000000000 --- a/contrib/arm-optimized-routines/math/v_exp.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Double-precision vector e^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED -#include "v_exp.h" - -#if V_EXP_TABLE_BITS == 7 -/* maxerr: 1.88 +0.5 ulp - rel error: 1.4337*2^-53 - abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */ -#define C1 v_f64 (0x1.ffffffffffd43p-2) -#define C2 v_f64 (0x1.55555c75adbb2p-3) -#define C3 v_f64 (0x1.55555da646206p-5) -#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */ -#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */ -#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63) -#elif V_EXP_TABLE_BITS == 8 -/* maxerr: 0.54 +0.5 ulp - rel error: 1.4318*2^-58 - abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. */ -#define C1 v_f64 (0x1.fffffffffffd4p-2) -#define C2 v_f64 (0x1.5555571d6b68cp-3) -#define C3 v_f64 (0x1.5555576a59599p-5) -#define InvLn2 v_f64 (0x1.71547652b82fep8) -#define Ln2hi v_f64 (0x1.62e42fefa39efp-9) -#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64) -#endif - -#define N (1 << V_EXP_TABLE_BITS) -#define Tab __v_exp_data -#define IndexMask v_u64 (N - 1) -#define Shift v_f64 (0x1.8p+52) -#define Thres v_f64 (704.0) - -VPCS_ATTR -static v_f64_t -specialcase (v_f64_t s, v_f64_t y, v_f64_t n) -{ - v_f64_t absn = v_abs_f64 (n); - - /* 2^(n/N) may overflow, break it up into s1*s2. */ - v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); - v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); - v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); - v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); - v_f64_t r1 = s1 * s1; - v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; - return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); -} - -VPCS_ATTR -v_f64_t -V_NAME(exp) (v_f64_t x) -{ - v_f64_t n, r, r2, s, y, z; - v_u64_t cmp, u, e, i; - - cmp = v_cond_u64 (v_abs_f64 (x) > Thres); - - /* n = round(x/(ln2/N)). */ - z = v_fma_f64 (x, InvLn2, Shift); - u = v_as_u64_f64 (z); - n = z - Shift; - - /* r = x - n*ln2/N. */ - r = x; - r = v_fma_f64 (-Ln2hi, n, r); - r = v_fma_f64 (-Ln2lo, n, r); - - e = u << (52 - V_EXP_TABLE_BITS); - i = u & IndexMask; - - /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ - r2 = r * r; - y = v_fma_f64 (C2, r, C1); - y = v_fma_f64 (C3, r2, y); - y = v_fma_f64 (y, r2, r); - - /* s = 2^(n/N). */ - u = v_lookup_u64 (Tab, i); - s = v_as_f64_u64 (u + e); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (s, y, n); - return v_fma_f64 (y, s, s); -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_exp.h b/contrib/arm-optimized-routines/math/v_exp.h deleted file mode 100644 index 305da19c0a53..000000000000 --- a/contrib/arm-optimized-routines/math/v_exp.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Declarations for double-precision e^x vector function. 
- * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "v_math.h" -#if WANT_VMATH - -#define V_EXP_TABLE_BITS 7 - -extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN; -#endif diff --git a/contrib/arm-optimized-routines/math/v_exp2f.c b/contrib/arm-optimized-routines/math/v_exp2f.c deleted file mode 100644 index e3ea5af3414d..000000000000 --- a/contrib/arm-optimized-routines/math/v_exp2f.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Single-precision vector 2^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 1.962 ulp. */ - 0x1.59977ap-10f, - 0x1.3ce9e4p-7f, - 0x1.c6bd32p-5f, - 0x1.ebf9bcp-3f, - 0x1.62e422p-1f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); - v_u32_t r2 = v_as_u32_f32 (s1 * s1); - v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); - /* Similar to r1 but avoids double rounding in the subnormal range. */ - v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); - return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); -} - -VPCS_ATTR -v_f32_t -V_NAME(exp2f) (v_f32_t x) -{ - v_f32_t n, r, r2, scale, p, q, poly, absn; - v_u32_t cmp, e; - - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ -#if 0 - v_f32_t z; - z = x + Shift; - n = z - Shift; - r = x - n; - e = v_as_u32_f32 (z) << 23; -#else - n = v_round_f32 (x); - r = x - n; - e = v_as_u32_s32 (v_round_s32 (x)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - r2 = r * r; - p = v_fma_f32 (C0, r, C1); - q = v_fma_f32 (C2, r, C3); - q = v_fma_f32 (p, r2, q); - p = C4 * r; - poly = v_fma_f32 (q, r2, p); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn, cmp, scale); - return v_fma_f32 (poly, scale, scale); -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_exp2f_1u.c b/contrib/arm-optimized-routines/math/v_exp2f_1u.c deleted file mode 100644 index 1caa14d9bfff..000000000000 --- a/contrib/arm-optimized-routines/math/v_exp2f_1u.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Single-precision vector 2^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 0.878 ulp. 
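-     (a degree-6 polynomial for 2^r on [-1/2, 1/2]; the leading 1 of
-     2^r = 1 + r*ln2 + ... is folded in by the final fma below.)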
*/ - 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) -#define C5 v_f32 (Poly[5]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); - v_f32_t r1 = s1 * s1; - v_f32_t r0 = poly * s1 * s2; - return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); -} - -VPCS_ATTR -v_f32_t -V_NAME(exp2f_1u) (v_f32_t x) -{ - v_f32_t n, r, scale, poly, absn; - v_u32_t cmp, e; - - /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ -#if 0 - v_f32_t z; - z = x + Shift; - n = z - Shift; - r = x - n; - e = v_as_u32_f32 (z) << 23; -#else - n = v_round_f32 (x); - r = x - n; - e = v_as_u32_s32 (v_round_s32 (x)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - poly = v_fma_f32 (C0, r, C1); - poly = v_fma_f32 (poly, r, C2); - poly = v_fma_f32 (poly, r, C3); - poly = v_fma_f32 (poly, r, C4); - poly = v_fma_f32 (poly, r, C5); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn); - return scale * poly; -} -#endif diff --git a/contrib/arm-optimized-routines/math/v_exp_data.c b/contrib/arm-optimized-routines/math/v_exp_data.c deleted file mode 100644 index 365355497e95..000000000000 --- a/contrib/arm-optimized-routines/math/v_exp_data.c +++ /dev/null @@ -1,403 +0,0 @@ -/* - * Lookup table for double-precision e^x vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "v_exp.h" -#if WANT_VMATH - -#define N (1 << V_EXP_TABLE_BITS) - -/* 2^(j/N), j=0..N. 
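-   Stored as the IEEE-754 bit patterns of the doubles 2^(j/N), all in
-   [1, 2), so that the exponent bits e computed in v_exp.c can be
-   added straight onto a table entry to form 2^(n/N).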
*/ -const u64_t __v_exp_data[] = { -#if N == 128 -0x3ff0000000000000, -0x3feff63da9fb3335, -0x3fefec9a3e778061, -0x3fefe315e86e7f85, -0x3fefd9b0d3158574, -0x3fefd06b29ddf6de, -0x3fefc74518759bc8, -0x3fefbe3ecac6f383, -0x3fefb5586cf9890f, -0x3fefac922b7247f7, -0x3fefa3ec32d3d1a2, -0x3fef9b66affed31b, -0x3fef9301d0125b51, -0x3fef8abdc06c31cc, -0x3fef829aaea92de0, -0x3fef7a98c8a58e51, -0x3fef72b83c7d517b, -0x3fef6af9388c8dea, -0x3fef635beb6fcb75, -0x3fef5be084045cd4, -0x3fef54873168b9aa, -0x3fef4d5022fcd91d, -0x3fef463b88628cd6, -0x3fef3f49917ddc96, -0x3fef387a6e756238, -0x3fef31ce4fb2a63f, -0x3fef2b4565e27cdd, -0x3fef24dfe1f56381, -0x3fef1e9df51fdee1, -0x3fef187fd0dad990, -0x3fef1285a6e4030b, -0x3fef0cafa93e2f56, -0x3fef06fe0a31b715, -0x3fef0170fc4cd831, -0x3feefc08b26416ff, -0x3feef6c55f929ff1, -0x3feef1a7373aa9cb, -0x3feeecae6d05d866, -0x3feee7db34e59ff7, -0x3feee32dc313a8e5, -0x3feedea64c123422, -0x3feeda4504ac801c, -0x3feed60a21f72e2a, -0x3feed1f5d950a897, -0x3feece086061892d, -0x3feeca41ed1d0057, -0x3feec6a2b5c13cd0, -0x3feec32af0d7d3de, -0x3feebfdad5362a27, -0x3feebcb299fddd0d, -0x3feeb9b2769d2ca7, -0x3feeb6daa2cf6642, -0x3feeb42b569d4f82, -0x3feeb1a4ca5d920f, -0x3feeaf4736b527da, -0x3feead12d497c7fd, -0x3feeab07dd485429, -0x3feea9268a5946b7, -0x3feea76f15ad2148, -0x3feea5e1b976dc09, -0x3feea47eb03a5585, -0x3feea34634ccc320, -0x3feea23882552225, -0x3feea155d44ca973, -0x3feea09e667f3bcd, -0x3feea012750bdabf, -0x3fee9fb23c651a2f, -0x3fee9f7df9519484, -0x3fee9f75e8ec5f74, -0x3fee9f9a48a58174, -0x3fee9feb564267c9, -0x3feea0694fde5d3f, -0x3feea11473eb0187, -0x3feea1ed0130c132, -0x3feea2f336cf4e62, -0x3feea427543e1a12, -0x3feea589994cce13, -0x3feea71a4623c7ad, -0x3feea8d99b4492ed, -0x3feeaac7d98a6699, -0x3feeace5422aa0db, -0x3feeaf3216b5448c, -0x3feeb1ae99157736, -0x3feeb45b0b91ffc6, -0x3feeb737b0cdc5e5, -0x3feeba44cbc8520f, -0x3feebd829fde4e50, -0x3feec0f170ca07ba, -0x3feec49182a3f090, -0x3feec86319e32323, -0x3feecc667b5de565, -0x3feed09bec4a2d33, -0x3feed503b23e255d, -0x3feed99e1330b358, -0x3feede6b5579fdbf, -0x3feee36bbfd3f37a, -0x3feee89f995ad3ad, -0x3feeee07298db666, -0x3feef3a2b84f15fb, -0x3feef9728de5593a, -0x3feeff76f2fb5e47, -0x3fef05b030a1064a, -0x3fef0c1e904bc1d2, -0x3fef12c25bd71e09, -0x3fef199bdd85529c, -0x3fef20ab5fffd07a, -0x3fef27f12e57d14b, -0x3fef2f6d9406e7b5, -0x3fef3720dcef9069, -0x3fef3f0b555dc3fa, -0x3fef472d4a07897c, -0x3fef4f87080d89f2, -0x3fef5818dcfba487, -0x3fef60e316c98398, -0x3fef69e603db3285, -0x3fef7321f301b460, -0x3fef7c97337b9b5f, -0x3fef864614f5a129, -0x3fef902ee78b3ff6, -0x3fef9a51fbc74c83, -0x3fefa4afa2a490da, -0x3fefaf482d8e67f1, -0x3fefba1bee615a27, -0x3fefc52b376bba97, -0x3fefd0765b6e4540, -0x3fefdbfdad9cbe14, -0x3fefe7c1819e90d8, -0x3feff3c22b8f71f1, -#elif N == 256 -0x3ff0000000000000, -0x3feffb1afa5abcbf, -0x3feff63da9fb3335, -0x3feff168143b0281, -0x3fefec9a3e778061, -0x3fefe7d42e11bbcc, -0x3fefe315e86e7f85, -0x3fefde5f72f654b1, -0x3fefd9b0d3158574, -0x3fefd50a0e3c1f89, -0x3fefd06b29ddf6de, -0x3fefcbd42b72a836, -0x3fefc74518759bc8, -0x3fefc2bdf66607e0, -0x3fefbe3ecac6f383, -0x3fefb9c79b1f3919, -0x3fefb5586cf9890f, -0x3fefb0f145e46c85, -0x3fefac922b7247f7, -0x3fefa83b23395dec, -0x3fefa3ec32d3d1a2, -0x3fef9fa55fdfa9c5, -0x3fef9b66affed31b, -0x3fef973028d7233e, -0x3fef9301d0125b51, -0x3fef8edbab5e2ab6, -0x3fef8abdc06c31cc, -0x3fef86a814f204ab, -0x3fef829aaea92de0, -0x3fef7e95934f312e, -0x3fef7a98c8a58e51, -0x3fef76a45471c3c2, -0x3fef72b83c7d517b, -0x3fef6ed48695bbc0, -0x3fef6af9388c8dea, -0x3fef672658375d2f, -0x3fef635beb6fcb75, -0x3fef5f99f8138a1c, 
-0x3fef5be084045cd4, -0x3fef582f95281c6b, -0x3fef54873168b9aa, -0x3fef50e75eb44027, -0x3fef4d5022fcd91d, -0x3fef49c18438ce4d, -0x3fef463b88628cd6, -0x3fef42be3578a819, -0x3fef3f49917ddc96, -0x3fef3bdda27912d1, -0x3fef387a6e756238, -0x3fef351ffb82140a, -0x3fef31ce4fb2a63f, -0x3fef2e85711ece75, -0x3fef2b4565e27cdd, -0x3fef280e341ddf29, -0x3fef24dfe1f56381, -0x3fef21ba7591bb70, -0x3fef1e9df51fdee1, -0x3fef1b8a66d10f13, -0x3fef187fd0dad990, -0x3fef157e39771b2f, -0x3fef1285a6e4030b, -0x3fef0f961f641589, -0x3fef0cafa93e2f56, -0x3fef09d24abd886b, -0x3fef06fe0a31b715, -0x3fef0432edeeb2fd, -0x3fef0170fc4cd831, -0x3feefeb83ba8ea32, -0x3feefc08b26416ff, -0x3feef96266e3fa2d, -0x3feef6c55f929ff1, -0x3feef431a2de883b, -0x3feef1a7373aa9cb, -0x3feeef26231e754a, -0x3feeecae6d05d866, -0x3feeea401b7140ef, -0x3feee7db34e59ff7, -0x3feee57fbfec6cf4, -0x3feee32dc313a8e5, -0x3feee0e544ede173, -0x3feedea64c123422, -0x3feedc70df1c5175, -0x3feeda4504ac801c, -0x3feed822c367a024, -0x3feed60a21f72e2a, -0x3feed3fb2709468a, -0x3feed1f5d950a897, -0x3feecffa3f84b9d4, -0x3feece086061892d, -0x3feecc2042a7d232, -0x3feeca41ed1d0057, -0x3feec86d668b3237, -0x3feec6a2b5c13cd0, -0x3feec4e1e192aed2, -0x3feec32af0d7d3de, -0x3feec17dea6db7d7, -0x3feebfdad5362a27, -0x3feebe41b817c114, -0x3feebcb299fddd0d, -0x3feebb2d81d8abff, -0x3feeb9b2769d2ca7, -0x3feeb8417f4531ee, -0x3feeb6daa2cf6642, -0x3feeb57de83f4eef, -0x3feeb42b569d4f82, -0x3feeb2e2f4f6ad27, -0x3feeb1a4ca5d920f, -0x3feeb070dde910d2, -0x3feeaf4736b527da, -0x3feeae27dbe2c4cf, -0x3feead12d497c7fd, -0x3feeac0827ff07cc, -0x3feeab07dd485429, -0x3feeaa11fba87a03, -0x3feea9268a5946b7, -0x3feea84590998b93, -0x3feea76f15ad2148, -0x3feea6a320dceb71, -0x3feea5e1b976dc09, -0x3feea52ae6cdf6f4, -0x3feea47eb03a5585, -0x3feea3dd1d1929fd, -0x3feea34634ccc320, -0x3feea2b9febc8fb7, -0x3feea23882552225, -0x3feea1c1c70833f6, -0x3feea155d44ca973, -0x3feea0f4b19e9538, -0x3feea09e667f3bcd, -0x3feea052fa75173e, -0x3feea012750bdabf, -0x3fee9fdcddd47645, -0x3fee9fb23c651a2f, -0x3fee9f9298593ae5, -0x3fee9f7df9519484, -0x3fee9f7466f42e87, -0x3fee9f75e8ec5f74, -0x3fee9f8286ead08a, -0x3fee9f9a48a58174, -0x3fee9fbd35d7cbfd, -0x3fee9feb564267c9, -0x3feea024b1ab6e09, -0x3feea0694fde5d3f, -0x3feea0b938ac1cf6, -0x3feea11473eb0187, -0x3feea17b0976cfdb, -0x3feea1ed0130c132, -0x3feea26a62ff86f0, -0x3feea2f336cf4e62, -0x3feea3878491c491, -0x3feea427543e1a12, -0x3feea4d2add106d9, -0x3feea589994cce13, -0x3feea64c1eb941f7, -0x3feea71a4623c7ad, -0x3feea7f4179f5b21, -0x3feea8d99b4492ed, -0x3feea9cad931a436, -0x3feeaac7d98a6699, -0x3feeabd0a478580f, -0x3feeace5422aa0db, -0x3feeae05bad61778, -0x3feeaf3216b5448c, -0x3feeb06a5e0866d9, -0x3feeb1ae99157736, -0x3feeb2fed0282c8a, -0x3feeb45b0b91ffc6, -0x3feeb5c353aa2fe2, -0x3feeb737b0cdc5e5, -0x3feeb8b82b5f98e5, -0x3feeba44cbc8520f, -0x3feebbdd9a7670b3, -0x3feebd829fde4e50, -0x3feebf33e47a22a2, -0x3feec0f170ca07ba, -0x3feec2bb4d53fe0d, -0x3feec49182a3f090, -0x3feec674194bb8d5, -0x3feec86319e32323, -0x3feeca5e8d07f29e, -0x3feecc667b5de565, -0x3feece7aed8eb8bb, -0x3feed09bec4a2d33, -0x3feed2c980460ad8, -0x3feed503b23e255d, -0x3feed74a8af46052, -0x3feed99e1330b358, -0x3feedbfe53c12e59, -0x3feede6b5579fdbf, -0x3feee0e521356eba, -0x3feee36bbfd3f37a, -0x3feee5ff3a3c2774, -0x3feee89f995ad3ad, -0x3feeeb4ce622f2ff, -0x3feeee07298db666, -0x3feef0ce6c9a8952, -0x3feef3a2b84f15fb, -0x3feef68415b749b1, -0x3feef9728de5593a, -0x3feefc6e29f1c52a, -0x3feeff76f2fb5e47, -0x3fef028cf22749e4, -0x3fef05b030a1064a, -0x3fef08e0b79a6f1f, -0x3fef0c1e904bc1d2, -0x3fef0f69c3f3a207, -0x3fef12c25bd71e09, 
-0x3fef16286141b33d, -0x3fef199bdd85529c, -0x3fef1d1cd9fa652c, -0x3fef20ab5fffd07a, -0x3fef244778fafb22, -0x3fef27f12e57d14b, -0x3fef2ba88988c933, -0x3fef2f6d9406e7b5, -0x3fef33405751c4db, -0x3fef3720dcef9069, -0x3fef3b0f2e6d1675, -0x3fef3f0b555dc3fa, -0x3fef43155b5bab74, -0x3fef472d4a07897c, -0x3fef4b532b08c968, -0x3fef4f87080d89f2, -0x3fef53c8eacaa1d6, -0x3fef5818dcfba487, -0x3fef5c76e862e6d3, -0x3fef60e316c98398, -0x3fef655d71ff6075, -0x3fef69e603db3285, -0x3fef6e7cd63a8315, -0x3fef7321f301b460, -0x3fef77d5641c0658, -0x3fef7c97337b9b5f, -0x3fef81676b197d17, -0x3fef864614f5a129, -0x3fef8b333b16ee12, -0x3fef902ee78b3ff6, -0x3fef953924676d76, -0x3fef9a51fbc74c83, -0x3fef9f7977cdb740, -0x3fefa4afa2a490da, -0x3fefa9f4867cca6e, -0x3fefaf482d8e67f1, -0x3fefb4aaa2188510, -0x3fefba1bee615a27, -0x3fefbf9c1cb6412a, -0x3fefc52b376bba97, -0x3fefcac948dd7274, -0x3fefd0765b6e4540, -0x3fefd632798844f8, -0x3fefdbfdad9cbe14, -0x3fefe1d802243c89, -0x3fefe7c1819e90d8, -0x3fefedba3692d514, -0x3feff3c22b8f71f1, -0x3feff9d96b2a23d9, -#endif -}; -#endif diff --git a/contrib/arm-optimized-routines/math/v_expf.c b/contrib/arm-optimized-routines/math/v_expf.c deleted file mode 100644 index d403e00534f0..000000000000 --- a/contrib/arm-optimized-routines/math/v_expf.c +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 1.45358 +0.5 ulp. */ - 0x1.0e4020p-7f, - 0x1.573e2ep-5f, - 0x1.555e66p-3f, - 0x1.fffdb6p-2f, - 0x1.ffffecp-1f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); - v_u32_t r2 = v_as_u32_f32 (s1 * s1); - v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); - /* Similar to r1 but avoids double rounding in the subnormal range. */ - v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); - return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); -} - -VPCS_ATTR -v_f32_t -V_NAME(expf) (v_f32_t x) -{ - v_f32_t n, r, r2, scale, p, q, poly, absn, z; - v_u32_t cmp, e; - - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
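-     The #if 1 variant below rounds x/ln2 with the shift trick: adding
-     Shift = 0x1.8p23 forces the rounded integer into the low mantissa
-     bits of z, so n = z - Shift is that integer, and the same bits,
-     shifted left by 23, give the exponent bits e.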
*/ -#if 1 - z = v_fma_f32 (x, InvLn2, Shift); - n = z - Shift; - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_f32 (z) << 23; -#else - z = x * InvLn2; - n = v_round_f32 (z); - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_s32 (v_round_s32 (z)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - r2 = r * r; - p = v_fma_f32 (C0, r, C1); - q = v_fma_f32 (C2, r, C3); - q = v_fma_f32 (p, r2, q); - p = C4 * r; - poly = v_fma_f32 (q, r2, p); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn, cmp, scale); - return v_fma_f32 (poly, scale, scale); -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_expf_1u.c b/contrib/arm-optimized-routines/math/v_expf_1u.c deleted file mode 100644 index 023bd248c9ac..000000000000 --- a/contrib/arm-optimized-routines/math/v_expf_1u.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Single-precision vector e^x function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* maxerr: 0.36565 +0.5 ulp. */ - 0x1.6a6000p-10f, - 0x1.12718ep-7f, - 0x1.555af0p-5f, - 0x1.555430p-3f, - 0x1.fffff4p-2f, -}; -#define C0 v_f32 (Poly[0]) -#define C1 v_f32 (Poly[1]) -#define C2 v_f32 (Poly[2]) -#define C3 v_f32 (Poly[3]) -#define C4 v_f32 (Poly[4]) - -#define Shift v_f32 (0x1.8p23f) -#define InvLn2 v_f32 (0x1.715476p+0f) -#define Ln2hi v_f32 (0x1.62e4p-1f) -#define Ln2lo v_f32 (0x1.7f7d1cp-20f) - -VPCS_ATTR -static v_f32_t -specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn) -{ - /* 2^n may overflow, break it up into s1*s2. */ - v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000); - v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); - v_f32_t s2 = v_as_f32_u32 (e - b); - v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f)); - v_f32_t r1 = s1 * s1; - v_f32_t r0 = poly * s1 * s2; - return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0))); -} - -VPCS_ATTR -v_f32_t -V_NAME(expf_1u) (v_f32_t x) -{ - v_f32_t n, r, scale, poly, absn, z; - v_u32_t cmp, e; - - /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ -#if 1 - z = v_fma_f32 (x, InvLn2, Shift); - n = z - Shift; - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_f32 (z) << 23; -#else - z = x * InvLn2; - n = v_round_f32 (z); - r = v_fma_f32 (n, -Ln2hi, x); - r = v_fma_f32 (n, -Ln2lo, r); - e = v_as_u32_s32 (v_round_s32 (z)) << 23; -#endif - scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); - absn = v_abs_f32 (n); - cmp = v_cond_u32 (absn > v_f32 (126.0f)); - poly = v_fma_f32 (C0, r, C1); - poly = v_fma_f32 (poly, r, C2); - poly = v_fma_f32 (poly, r, C3); - poly = v_fma_f32 (poly, r, C4); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); - poly = v_fma_f32 (poly, r, v_f32 (1.0f)); - if (unlikely (v_any_u32 (cmp))) - return specialcase (poly, n, e, absn); - return scale * poly; -} -#endif diff --git a/contrib/arm-optimized-routines/math/v_log.c b/contrib/arm-optimized-routines/math/v_log.c deleted file mode 100644 index d84c740d2b6b..000000000000 --- a/contrib/arm-optimized-routines/math/v_log.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Double-precision vector log(x) function. - * - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#include "v_log.h" -#if V_SUPPORTED - -/* Worst-case error: 1.17 + 0.5 ulp. */ - -static const f64_t Poly[] = { - /* rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ - -0x1.ffffffffffff7p-2, - 0x1.55555555170d4p-2, - -0x1.0000000399c27p-2, - 0x1.999b2e90e94cap-3, - -0x1.554e550bd501ep-3, -}; - -#define A0 v_f64 (Poly[0]) -#define A1 v_f64 (Poly[1]) -#define A2 v_f64 (Poly[2]) -#define A3 v_f64 (Poly[3]) -#define A4 v_f64 (Poly[4]) -#define Ln2 v_f64 (0x1.62e42fefa39efp-1) -#define N (1 << V_LOG_TABLE_BITS) -#define OFF v_u64 (0x3fe6900900000000) - -struct entry -{ - v_f64_t invc; - v_f64_t logc; -}; - -static inline struct entry -lookup (v_u64_t i) -{ - struct entry e; -#ifdef SCALAR - e.invc = __v_log_data[i].invc; - e.logc = __v_log_data[i].logc; -#else - e.invc[0] = __v_log_data[i[0]].invc; - e.logc[0] = __v_log_data[i[0]].logc; - e.invc[1] = __v_log_data[i[1]].invc; - e.logc[1] = __v_log_data[i[1]].logc; -#endif - return e; -} - -VPCS_ATTR -__attribute__ ((noinline)) static v_f64_t -specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) -{ - return v_call_f64 (log, x, y, cmp); -} - -VPCS_ATTR -v_f64_t -V_NAME(log) (v_f64_t x) -{ - v_f64_t z, r, r2, p, y, kd, hi; - v_u64_t ix, iz, tmp, top, i, cmp; - v_s64_t k; - struct entry e; - - ix = v_as_u64_f64 (x); - top = ix >> 48; - cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); - - /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. - The range is split into N subintervals. - The ith subinterval contains z and c is near its center. */ - tmp = ix - OFF; - i = (tmp >> (52 - V_LOG_TABLE_BITS)) % N; - k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift */ - iz = ix - (tmp & v_u64 (0xfffULL << 52)); - z = v_as_f64_u64 (iz); - e = lookup (i); - - /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); - kd = v_to_f64_s64 (k); - - /* hi = r + log(c) + k*Ln2. */ - hi = v_fma_f64 (kd, Ln2, e.logc + r); - /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ - r2 = r * r; - y = v_fma_f64 (A3, r, A2); - p = v_fma_f64 (A1, r, A0); - y = v_fma_f64 (A4, r2, y); - y = v_fma_f64 (y, r2, p); - y = v_fma_f64 (y, r2, hi); - - if (unlikely (v_any_u64 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_log.h b/contrib/arm-optimized-routines/math/v_log.h deleted file mode 100644 index bcc2fa6fa930..000000000000 --- a/contrib/arm-optimized-routines/math/v_log.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Declarations for double-precision log(x) vector function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "v_math.h" -#if WANT_VMATH - -#define V_LOG_TABLE_BITS 7 - -extern const struct v_log_data -{ - f64_t invc; - f64_t logc; -} __v_log_data[1 << V_LOG_TABLE_BITS] HIDDEN; -#endif diff --git a/contrib/arm-optimized-routines/math/v_log_data.c b/contrib/arm-optimized-routines/math/v_log_data.c deleted file mode 100644 index 97ee5b09c6a9..000000000000 --- a/contrib/arm-optimized-routines/math/v_log_data.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Lookup table for double-precision log(x) vector function. - * - * Copyright (c) 2019, Arm Limited. 
- * SPDX-License-Identifier: MIT - */ - -#include "v_log.h" -#if WANT_VMATH - -#define N (1 << V_LOG_TABLE_BITS) - -/* Algorithm: - - x = 2^k z - log(x) = k ln2 + log(c) + poly(z/c - 1) - -where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128) -and log(c) and 1/c for the ith subinterval comes from a lookup table: - - tab[i].invc = 1/c - tab[i].logc = (double)log(c) - -where c is near the center of the subinterval and is chosen by trying several -floating point invc candidates around 1/center and selecting one for which -the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval -that contains 1 and the previous one got tweaked to avoid cancellation. */ -const struct v_log_data __v_log_data[N] = { -{0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2}, -{0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2}, -{0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2}, -{0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2}, -{0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2}, -{0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2}, -{0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2}, -{0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2}, -{0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2}, -{0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2}, -{0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2}, -{0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2}, -{0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2}, -{0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2}, -{0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2}, -{0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2}, -{0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2}, -{0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2}, -{0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2}, -{0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3}, -{0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3}, -{0x1.446f12b278001p+0, -0x1.e52e160484698p-3}, -{0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3}, -{0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3}, -{0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3}, -{0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3}, -{0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3}, -{0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3}, -{0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3}, -{0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3}, -{0x1.36987540fbf53p+0, -0x1.8be843d796044p-3}, -{0x1.352166b648f61p+0, -0x1.82395ecc477edp-3}, -{0x1.33adddb3eb575p+0, -0x1.7896240966422p-3}, -{0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3}, -{0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3}, -{0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3}, -{0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3}, -{0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3}, -{0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3}, -{0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3}, -{0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3}, -{0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3}, -{0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3}, -{0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3}, -{0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3}, -{0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4}, -{0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4}, -{0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4}, -{0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4}, -{0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4}, -{0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4}, -{0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4}, -{0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4}, -{0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4}, -{0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4}, -{0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4}, -{0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4}, -{0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4}, 
-{0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4}, -{0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4}, -{0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5}, -{0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5}, -{0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5}, -{0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5}, -{0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5}, -{0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5}, -{0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5}, -{0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5}, -{0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6}, -{0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6}, -{0x1.05193497a7cc5p+0, -0x1.43183683400acp-6}, -{0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6}, -{0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7}, -{0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7}, -{0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9}, -{1.0, 0.0}, -{0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8}, -{0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7}, -{0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6}, -{0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6}, -{0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5}, -{0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5}, -{0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5}, -{0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5}, -{0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4}, -{0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4}, -{0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4}, -{0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4}, -{0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4}, -{0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4}, -{0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4}, -{0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4}, -{0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4}, -{0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3}, -{0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3}, -{0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3}, -{0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3}, -{0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3}, -{0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3}, -{0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3}, -{0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3}, -{0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3}, -{0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3}, -{0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3}, -{0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3}, -{0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3}, -{0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3}, -{0x1.9998e1480b618p-1, 0x1.c903161240163p-3}, -{0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3}, -{0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3}, -{0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3}, -{0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3}, -{0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2}, -{0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2}, -{0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2}, -{0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2}, -{0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2}, -{0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2}, -{0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2}, -{0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2}, -{0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2}, -{0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2}, -{0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2}, -{0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2}, -{0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2}, -{0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2}, -{0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2}, -{0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2}, -}; -#endif diff --git a/contrib/arm-optimized-routines/math/v_logf.c b/contrib/arm-optimized-routines/math/v_logf.c deleted file mode 100644 index 7373192f03fa..000000000000 --- a/contrib/arm-optimized-routines/math/v_logf.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Single-precision 
vector log function. - * - * Copyright (c) 2019, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#include "mathlib.h" -#include "v_math.h" -#if V_SUPPORTED - -static const float Poly[] = { - /* 3.34 ulp error */ - -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f, - -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f, -}; -#define P7 v_f32 (Poly[0]) -#define P6 v_f32 (Poly[1]) -#define P5 v_f32 (Poly[2]) -#define P4 v_f32 (Poly[3]) -#define P3 v_f32 (Poly[4]) -#define P2 v_f32 (Poly[5]) -#define P1 v_f32 (Poly[6]) - -#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ -#define Min v_u32 (0x00800000) -#define Max v_u32 (0x7f800000) -#define Mask v_u32 (0x007fffff) -#define Off v_u32 (0x3f2aaaab) /* 0.666667 */ - -VPCS_ATTR -__attribute__ ((noinline)) static v_f32_t -specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) -{ - /* Fall back to scalar code. */ - return v_call_f32 (logf, x, y, cmp); -} - -VPCS_ATTR -v_f32_t -V_NAME(logf) (v_f32_t x) -{ - v_f32_t n, p, q, r, r2, y; - v_u32_t u, cmp; - - u = v_as_u32_f32 (x); - cmp = v_cond_u32 (u - Min >= Max - Min); - - /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3 */ - u -= Off; - n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend */ - u &= Mask; - u += Off; - r = v_as_f32_u32 (u) - v_f32 (1.0f); - - /* y = log(1+r) + n*ln2. */ - r2 = r * r; - /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ - p = v_fma_f32 (P6, r, P5); - q = v_fma_f32 (P4, r, P3); - y = v_fma_f32 (P2, r, P1); - p = v_fma_f32 (P7, r2, p); - q = v_fma_f32 (p, r2, q); - y = v_fma_f32 (q, r2, y); - p = v_fma_f32 (Ln2, n, r); - y = v_fma_f32 (y, r2, p); - - if (unlikely (v_any_u32 (cmp))) - return specialcase (x, y, cmp); - return y; -} -VPCS_ALIAS -#endif diff --git a/contrib/arm-optimized-routines/math/v_math.h b/contrib/arm-optimized-routines/math/v_math.h deleted file mode 100644 index f2cc4670bb9b..000000000000 --- a/contrib/arm-optimized-routines/math/v_math.h +++ /dev/null @@ -1,641 +0,0 @@ -/* - * Vector math abstractions. - * - * Copyright (c) 2019-2020, Arm Limited. - * SPDX-License-Identifier: MIT - */ - -#ifndef _V_MATH_H -#define _V_MATH_H - -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif -#if WANT_VMATH - -/* The goal of this header is to allow vector and scalar - build of the same algorithm, the provided intrinsic - wrappers are also vector length agnostic so they can - be implemented for SVE too (or other simd architectures) - and then the code should work on those targets too. */ - -#if SCALAR -#define V_NAME(x) __s_##x -#elif VPCS && __aarch64__ -#define V_NAME(x) __vn_##x -#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) -#else -#define V_NAME(x) __v_##x -#endif - -#ifndef VPCS_ATTR -#define VPCS_ATTR -#endif -#ifndef VPCS_ALIAS -#define VPCS_ALIAS -#endif - -#include <stdint.h> -#include "math_config.h" - -typedef float f32_t; -typedef uint32_t u32_t; -typedef int32_t s32_t; -typedef double f64_t; -typedef uint64_t u64_t; -typedef int64_t s64_t; - -/* reinterpret as type1 from type2. 
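-   These do a bit-for-bit reinterpretation through a union, the usual
-   well-defined type-punning idiom in C; e.g. as_u32_f32 (1.0f)
-   returns 0x3f800000.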
*/ -static inline u32_t -as_u32_f32 (f32_t x) -{ - union { f32_t f; u32_t u; } r = {x}; - return r.u; -} -static inline f32_t -as_f32_u32 (u32_t x) -{ - union { u32_t u; f32_t f; } r = {x}; - return r.f; -} -static inline s32_t -as_s32_u32 (u32_t x) -{ - union { u32_t u; s32_t i; } r = {x}; - return r.i; -} -static inline u32_t -as_u32_s32 (s32_t x) -{ - union { s32_t i; u32_t u; } r = {x}; - return r.u; -} -static inline u64_t -as_u64_f64 (f64_t x) -{ - union { f64_t f; u64_t u; } r = {x}; - return r.u; -} -static inline f64_t -as_f64_u64 (u64_t x) -{ - union { u64_t u; f64_t f; } r = {x}; - return r.f; -} -static inline s64_t -as_s64_u64 (u64_t x) -{ - union { u64_t u; s64_t i; } r = {x}; - return r.i; -} -static inline u64_t -as_u64_s64 (s64_t x) -{ - union { s64_t i; u64_t u; } r = {x}; - return r.u; -} - -#if SCALAR -#define V_SUPPORTED 1 -typedef f32_t v_f32_t; -typedef u32_t v_u32_t; -typedef s32_t v_s32_t; -typedef f64_t v_f64_t; -typedef u64_t v_u64_t; -typedef s64_t v_s64_t; - -static inline int -v_lanes32 (void) -{ - return 1; -} - -static inline v_f32_t -v_f32 (f32_t x) -{ - return x; -} -static inline v_u32_t -v_u32 (u32_t x) -{ - return x; -} -static inline v_s32_t -v_s32 (s32_t x) -{ - return x; -} - -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) -{ - return x; -} - -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) -{ - *x = v; -} -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) -{ - *x = v; -} -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) -{ - *x = v; -} - -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (v_u32_t x) -{ - return x != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) -{ - return x ? -1 : 0; -} -static inline v_f32_t -v_abs_f32 (v_f32_t x) -{ - return __builtin_fabsf (x); -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return __builtin_fmaf (x, y, z); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return __builtin_roundf (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return __builtin_lroundf (x); /* relies on -fno-math-errno. */ -} -/* convert to type1 from type2. */ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return x; -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return x; -} -/* reinterpret as type1 from type2. 
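-   (in the scalar build these wrappers are the same union punning over
-   the plain scalar types, so the vector algorithms compile unchanged
-   for a single lane.)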
*/ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; -} -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) -{ - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) -{ - return tab[idx]; -} -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) -{ - return f (x); -} -static inline v_f32_t -v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, - v_u32_t p) -{ - return f (x1, x2); -} - -static inline int -v_lanes64 (void) -{ - return 1; -} -static inline v_f64_t -v_f64 (f64_t x) -{ - return x; -} -static inline v_u64_t -v_u64 (u64_t x) -{ - return x; -} -static inline v_s64_t -v_s64 (s64_t x) -{ - return x; -} -static inline f64_t -v_get_f64 (v_f64_t x, int i) -{ - return x; -} -static inline void -v_set_f64 (v_f64_t *x, int i, f64_t v) -{ - *x = v; -} -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u64 (v_u64_t x) -{ - return x != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u64_t -v_cond_u64 (v_u64_t x) -{ - return x ? -1 : 0; -} -static inline v_f64_t -v_abs_f64 (v_f64_t x) -{ - return __builtin_fabs (x); -} -static inline v_f64_t -v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) -{ - return __builtin_fma (x, y, z); -} -static inline v_f64_t -v_round_f64 (v_f64_t x) -{ - return __builtin_round (x); -} -static inline v_s64_t -v_round_s64 (v_f64_t x) -{ - return __builtin_lround (x); /* relies on -fno-math-errno. */ -} -/* convert to type1 from type2. */ -static inline v_f64_t -v_to_f64_s64 (v_s64_t x) -{ - return x; -} -static inline v_f64_t -v_to_f64_u64 (v_u64_t x) -{ - return x; -} -/* reinterpret as type1 from type2. 
*/ -static inline v_u64_t -v_as_u64_f64 (v_f64_t x) -{ - union { v_f64_t f; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_as_f64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_f64_t f; } r = {x}; - return r.f; -} -static inline v_s64_t -v_as_s64_u64 (v_u64_t x) -{ - union { v_u64_t u; v_s64_t i; } r = {x}; - return r.i; -} -static inline v_u64_t -v_as_u64_s64 (v_s64_t x) -{ - union { v_s64_t i; v_u64_t u; } r = {x}; - return r.u; -} -static inline v_f64_t -v_lookup_f64 (const f64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_u64_t -v_lookup_u64 (const u64_t *tab, v_u64_t idx) -{ - return tab[idx]; -} -static inline v_f64_t -v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) -{ - return f (x); -} - -#elif __aarch64__ -#define V_SUPPORTED 1 -#include <arm_neon.h> -typedef float32x4_t v_f32_t; -typedef uint32x4_t v_u32_t; -typedef int32x4_t v_s32_t; -typedef float64x2_t v_f64_t; -typedef uint64x2_t v_u64_t; -typedef int64x2_t v_s64_t; - -static inline int -v_lanes32 (void) -{ - return 4; -} - -static inline v_f32_t -v_f32 (f32_t x) -{ - return (v_f32_t){x, x, x, x}; -} -static inline v_u32_t -v_u32 (u32_t x) -{ - return (v_u32_t){x, x, x, x}; -} -static inline v_s32_t -v_s32 (s32_t x) -{ - return (v_s32_t){x, x, x, x}; -} - -static inline f32_t -v_get_f32 (v_f32_t x, int i) -{ - return x[i]; -} -static inline u32_t -v_get_u32 (v_u32_t x, int i) -{ - return x[i]; -} -static inline s32_t -v_get_s32 (v_s32_t x, int i) -{ - return x[i]; -} - -static inline void -v_set_f32 (v_f32_t *x, int i, f32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_u32 (v_u32_t *x, int i, u32_t v) -{ - (*x)[i] = v; -} -static inline void -v_set_s32 (v_s32_t *x, int i, s32_t v) -{ - (*x)[i] = v; -} - -/* true if any elements of a v_cond result is non-zero. */ -static inline int -v_any_u32 (v_u32_t x) -{ - /* assume elements in x are either 0 or -1u. */ - return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; -} -/* to wrap the result of relational operators. */ -static inline v_u32_t -v_cond_u32 (v_u32_t x) -{ - return x; -} -static inline v_f32_t -v_abs_f32 (v_f32_t x) -{ - return vabsq_f32 (x); -} -static inline v_f32_t -v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) -{ - return vfmaq_f32 (z, x, y); -} -static inline v_f32_t -v_round_f32 (v_f32_t x) -{ - return vrndaq_f32 (x); -} -static inline v_s32_t -v_round_s32 (v_f32_t x) -{ - return vcvtaq_s32_f32 (x); -} -/* convert to type1 from type2. */ -static inline v_f32_t -v_to_f32_s32 (v_s32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -static inline v_f32_t -v_to_f32_u32 (v_u32_t x) -{ - return (v_f32_t){x[0], x[1], x[2], x[3]}; -} -/* reinterpret as type1 from type2. */ -static inline v_u32_t -v_as_u32_f32 (v_f32_t x) -{ - union { v_f32_t f; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_as_f32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_f32_t f; } r = {x}; - return r.f; -} -static inline v_s32_t -v_as_s32_u32 (v_u32_t x) -{ - union { v_u32_t u; v_s32_t i; } r = {x}; - return r.i; -} -static inline v_u32_t -v_as_u32_s32 (v_s32_t x) -{ - union { v_s32_t i; v_u32_t u; } r = {x}; - return r.u; -} -static inline v_f32_t -v_lookup_f32 (const f32_t *tab, v_u32_t idx) -{ - return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline v_u32_t -v_lookup_u32 (const u32_t *tab, v_u32_t idx) -{ - return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; -} -static inline v_f32_t -v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) -{ - return (v_f32_t){p[0] ? 
-
-static inline int
-v_lanes64 (void)
-{
-  return 2;
-}
-static inline v_f64_t
-v_f64 (f64_t x)
-{
-  return (v_f64_t){x, x};
-}
-static inline v_u64_t
-v_u64 (u64_t x)
-{
-  return (v_u64_t){x, x};
-}
-static inline v_s64_t
-v_s64 (s64_t x)
-{
-  return (v_s64_t){x, x};
-}
-static inline f64_t
-v_get_f64 (v_f64_t x, int i)
-{
-  return x[i];
-}
-static inline void
-v_set_f64 (v_f64_t *x, int i, f64_t v)
-{
-  (*x)[i] = v;
-}
-/* true if any elements of a v_cond result is non-zero.  */
-static inline int
-v_any_u64 (v_u64_t x)
-{
-  /* assume elements in x are either 0 or -1u.  */
-  return vpaddd_u64 (x) != 0;
-}
-/* to wrap the result of relational operators.  */
-static inline v_u64_t
-v_cond_u64 (v_u64_t x)
-{
-  return x;
-}
-static inline v_f64_t
-v_abs_f64 (v_f64_t x)
-{
-  return vabsq_f64 (x);
-}
-static inline v_f64_t
-v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
-{
-  return vfmaq_f64 (z, x, y);
-}
-static inline v_f64_t
-v_round_f64 (v_f64_t x)
-{
-  return vrndaq_f64 (x);
-}
-static inline v_s64_t
-v_round_s64 (v_f64_t x)
-{
-  return vcvtaq_s64_f64 (x);
-}
-/* convert to type1 from type2.  */
-static inline v_f64_t
-v_to_f64_s64 (v_s64_t x)
-{
-  return (v_f64_t){x[0], x[1]};
-}
-static inline v_f64_t
-v_to_f64_u64 (v_u64_t x)
-{
-  return (v_f64_t){x[0], x[1]};
-}
-/* reinterpret as type1 from type2.  */
-static inline v_u64_t
-v_as_u64_f64 (v_f64_t x)
-{
-  union { v_f64_t f; v_u64_t u; } r = {x};
-  return r.u;
-}
-static inline v_f64_t
-v_as_f64_u64 (v_u64_t x)
-{
-  union { v_u64_t u; v_f64_t f; } r = {x};
-  return r.f;
-}
-static inline v_s64_t
-v_as_s64_u64 (v_u64_t x)
-{
-  union { v_u64_t u; v_s64_t i; } r = {x};
-  return r.i;
-}
-static inline v_u64_t
-v_as_u64_s64 (v_s64_t x)
-{
-  union { v_s64_t i; v_u64_t u; } r = {x};
-  return r.u;
-}
-static inline v_f64_t
-v_lookup_f64 (const f64_t *tab, v_u64_t idx)
-{
-  return (v_f64_t){tab[idx[0]], tab[idx[1]]};
-}
-static inline v_u64_t
-v_lookup_u64 (const u64_t *tab, v_u64_t idx)
-{
-  return (v_u64_t){tab[idx[0]], tab[idx[1]]};
-}
-static inline v_f64_t
-v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
-{
-  return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
-}
-#endif
-
-#endif
-#endif
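That ends the removed generic header: the same helper names compile either to the scalar fallbacks or to the NEON definitions, so one kernel source serves both builds. A hypothetical kernel written against this API, to show the usual shape (demo only, not a library routine; cube_scalar and the 0x1p342 threshold are made up for illustration):

#include "v_math.h"  /* the header removed above; assumed for illustration */
#if V_SUPPORTED

static f64_t
cube_scalar (f64_t x)
{
  return x * x * x;
}

/* Fast path for all lanes, then a scalar fallback for the lanes flagged
   in cmp -- the pattern the real kernels below follow.  */
static v_f64_t
v_cube_demo (v_f64_t x)
{
  /* Flag lanes whose cube would overflow (|x| >= 2^342).  */
  v_u64_t cmp = v_cond_u64 (v_as_u64_f64 (v_abs_f64 (x))
                            >= v_as_u64_f64 (v_f64 (0x1p342)));
  v_f64_t y = x * x * x;
  if (v_any_u64 (cmp))
    return v_call_f64 (cube_scalar, x, y, cmp);
  return y;
}
#endif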
diff --git a/contrib/arm-optimized-routines/math/v_pow.c b/contrib/arm-optimized-routines/math/v_pow.c
deleted file mode 100644
index a209d57f41ce..000000000000
--- a/contrib/arm-optimized-routines/math/v_pow.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Double-precision vector pow function.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-VPCS_ATTR
-v_f64_t
-V_NAME(pow) (v_f64_t x, v_f64_t y)
-{
-  v_f64_t z;
-  for (int lane = 0; lane < v_lanes64 (); lane++)
-    {
-      f64_t sx = v_get_f64 (x, lane);
-      f64_t sy = v_get_f64 (y, lane);
-      f64_t sz = pow (sx, sy);
-      v_set_f64 (&z, lane, sz);
-    }
-  return z;
-}
-VPCS_ALIAS
-#endif
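The removed v_pow.c is deliberately the simplest possible vectorisation: each lane round-trips through the scalar pow, so accuracy and special-case behaviour are inherited wholesale and only call overhead is amortised. The same shape written directly against NEON types (illustrative only; relies on the GCC/Clang vector subscript extension):

#include <arm_neon.h>
#include <math.h>

static float64x2_t
pow_lanewise (float64x2_t x, float64x2_t y)
{
  float64x2_t z;
  for (int lane = 0; lane < 2; lane++)
    z[lane] = pow (x[lane], y[lane]);  /* scalar pow per lane */
  return z;
}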
diff --git a/contrib/arm-optimized-routines/math/v_powf.c b/contrib/arm-optimized-routines/math/v_powf.c
deleted file mode 100644
index fb80fa6f1846..000000000000
--- a/contrib/arm-optimized-routines/math/v_powf.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * Single-precision vector powf function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-#define Min v_u32 (0x00800000)
-#define Max v_u32 (0x7f800000)
-#define SBITS 5
-#define Tlog v__powf_log2_data.tab
-#define Texp v__exp2f_data.tab
-#define A v__powf_log2_data.poly
-#define C v__exp2f_data.poly
-#define LOGDEG 4
-
-#if LOGDEG == 5
-/* 1.01 ulp */
-#define OFF v_u32 (0x3f330000)
-#define TBITS 4
-#elif LOGDEG == 4
-/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */
-#define OFF v_u32 (0x3f35d000)
-#define TBITS 5
-#endif
-
-#define V_EXP2F_TABLE_BITS SBITS
-#define V_EXP2F_POLY_ORDER 3
-struct v_exp2f_data
-{
-  uint64_t tab[1 << V_EXP2F_TABLE_BITS];
-  double poly[V_EXP2F_POLY_ORDER];
-};
-
-#define V_POWF_LOG2_TABLE_BITS TBITS
-#define V_POWF_LOG2_POLY_ORDER LOGDEG
-#define SCALE ((double) (1 << SBITS))
-struct v_powf_log2_data
-{
-  struct
-  {
-    double invc, logc;
-  } tab[1 << V_POWF_LOG2_TABLE_BITS];
-  double poly[V_POWF_LOG2_POLY_ORDER];
-};
-
-static const struct v_powf_log2_data v__powf_log2_data = {
-#if LOGDEG == 5
-  .tab = {
-{ 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE },
-{ 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE },
-{ 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE },
-{ 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE },
-{ 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE },
-{ 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE },
-{ 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE },
-{ 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE },
-{ 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE },
-{ 0x1p+0, 0x0p+0 * SCALE },
-{ 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE },
-{ 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE },
-{ 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE },
-{ 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE },
-{ 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE },
-{ 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE },
-  },
-/* rel err: 1.46 * 2^-32 */
-  .poly = {
-0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE,
-0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE,
-0x1.71547652ab82bp0 * SCALE,
-  }
-#elif LOGDEG == 4
-  .tab = {
-{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE},
-{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE},
-{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE},
-{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE},
-{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE},
-{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE},
-{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE},
-{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE},
-{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE},
-{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE},
-{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE},
-{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE},
-{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE},
-{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE},
-{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE},
-{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE},
-{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE},
-{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE},
-{0x1p+0, 0x0p+0 * SCALE},
-{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE},
-{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE},
-{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE},
-{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE},
-{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE},
-{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE},
-{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE},
-{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE},
-{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE},
-{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE},
-{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE},
-{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE},
-{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE},
-  },
-/* rel err: 1.5 * 2^-30 */
-  .poly = {
-    -0x1.6ff5daa3b3d7cp-2 * SCALE,
-    0x1.ec81d03c01aebp-2 * SCALE,
-    -0x1.71547bb43f101p-1 * SCALE,
-    0x1.7154764a815cbp0 * SCALE,
-  }
-#endif
-};
-
-static const struct v_exp2f_data v__exp2f_data = {
-  .tab = {
-0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
-0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
-0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
-0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
-0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
-0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
-0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
-0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
-  },
-/* rel err: 1.69 * 2^-34 */
-  .poly = {
-0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE
-  },
-};
-
-VPCS_ATTR
-__attribute__ ((noinline)) static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp)
-{
-  return v_call2_f32 (powf, x, y, ret, cmp);
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(powf) (v_f32_t x, v_f32_t y)
-{
-  v_u32_t u, tmp, cmp, i, top, iz;
-  v_s32_t k;
-  v_f32_t ret;
-
-  u = v_as_u32_f32 (x);
-  cmp = v_cond_u32 (u - Min >= Max - Min);
-  tmp = u - OFF;
-  i = (tmp >> (23 - TBITS)) % (1 << TBITS);
-  top = tmp & 0xff800000;
-  iz = u - top;
-  k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */
-
-  for (int lane = 0; lane < v_lanes32 (); lane++)
-    {
-      uint32_t si, siz;
-      int32_t sk;
-      float sy;
-
-      /* Use double precision for each lane.  */
-      double invc, logc, z, r, p, y0, logx, ylogx, kd, s;
-      uint64_t ki, t;
-
-      si = v_get_u32 (i, lane);
-      siz = v_get_u32 (iz, lane);
-      sk = v_get_s32 (k, lane);
-      sy = v_get_f32 (y, lane);
-
-      invc = Tlog[si].invc;
-      logc = Tlog[si].logc;
-      z = (double) as_f32_u32 (siz);
-
-      /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */
-      r = __builtin_fma (z, invc, -1.0);
-      y0 = logc + (double) sk;
-
-      /* Polynomial to approximate log1p(r)/ln2.  */
-#if LOGDEG == 5
-      logx = A[0];
-      logx = r * logx + A[1];
-      logx = r * logx + A[2];
-      logx = r * logx + A[3];
-      logx = r * logx + A[4];
-      logx = r * logx + y0;
-#elif LOGDEG == 4
-      logx = A[0];
-      logx = r * logx + A[1];
-      logx = r * logx + A[2];
-      logx = r * logx + A[3];
-      logx = r * logx + y0;
-#endif
-      ylogx = sy * logx;
-      v_set_u32 (&cmp, lane,
-                 (as_u64_f64 (ylogx) >> 47 & 0xffff)
-                     >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47
-                   ? 1
-                   : v_get_u32 (cmp, lane));
-
-      /* N*x = k + r with r in [-1/2, 1/2] */
-#if TOINT_INTRINSICS
-      kd = roundtoint (ylogx); /* k */
-      ki = converttoint (ylogx);
-#else
-# define SHIFT 0x1.8p52
-      kd = eval_as_double (ylogx + SHIFT);
-      ki = asuint64 (kd);
-      kd -= SHIFT;
-#endif
-      r = ylogx - kd;
-
-      /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
-      t = Texp[ki % (1 << SBITS)];
-      t += ki << (52 - SBITS);
-      s = as_f64_u64 (t);
-      p = C[0];
-      p = __builtin_fma (p, r, C[1]);
-      p = __builtin_fma (p, r, C[2]);
-      p = __builtin_fma (p, s * r, s);
-
-      v_set_f32 (&ret, lane, p);
-    }
-  if (unlikely (v_any_u32 (cmp)))
-    return specialcase (x, y, ret, cmp);
-  return ret;
-}
-VPCS_ALIAS
-#endif
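The non-TOINT_INTRINSICS path above rounds with the classic shift trick: adding 0x1.8p52 forces rounding to the nearest integer, and that integer appears in the low mantissa bits of the sum. A scalar sketch (assumes the default round-to-nearest mode; the library wraps the add in eval_as_double to keep compilers from optimising it away):

#include <stdint.h>
#include <string.h>

/* Valid for |x| < 2^51.  The low mantissa bits of kd hold the nearest
   integer, which the library reads for its table index.  */
static inline double
round_via_shift (double x, uint64_t *lowbits)
{
  double kd = x + 0x1.8p52;               /* rounds x to the nearest integer */
  memcpy (lowbits, &kd, sizeof *lowbits); /* integer sits in the low bits */
  return kd - 0x1.8p52;                   /* the rounded value itself */
}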
diff --git a/contrib/arm-optimized-routines/math/v_sin.c b/contrib/arm-optimized-routines/math/v_sin.c
deleted file mode 100644
index 2b9ed059189c..000000000000
--- a/contrib/arm-optimized-routines/math/v_sin.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Double-precision vector sin function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const double Poly[] = {
-/* worst-case error is 3.5 ulp.
-   abs error: 0x1.be222a58p-53 in [-pi/2, pi/2].  */
--0x1.9f4a9c8b21dc9p-41,
- 0x1.60e88a10163f2p-33,
--0x1.ae6361b7254e7p-26,
- 0x1.71de382e8d62bp-19,
--0x1.a01a019aeb4ffp-13,
- 0x1.111111110b25ep-7,
--0x1.55555555554c3p-3,
-};
-
-#define C7 v_f64 (Poly[0])
-#define C6 v_f64 (Poly[1])
-#define C5 v_f64 (Poly[2])
-#define C4 v_f64 (Poly[3])
-#define C3 v_f64 (Poly[4])
-#define C2 v_f64 (Poly[5])
-#define C1 v_f64 (Poly[6])
-
-#define InvPi v_f64 (0x1.45f306dc9c883p-2)
-#define Pi1 v_f64 (0x1.921fb54442d18p+1)
-#define Pi2 v_f64 (0x1.1a62633145c06p-53)
-#define Pi3 v_f64 (0x1.c1cd129024e09p-106)
-#define Shift v_f64 (0x1.8p52)
-#define RangeVal v_f64 (0x1p23)
-#define AbsMask v_u64 (0x7fffffffffffffff)
-
-VPCS_ATTR
-__attribute__ ((noinline)) static v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
-{
-  return v_call_f64 (sin, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f64_t
-V_NAME(sin) (v_f64_t x)
-{
-  v_f64_t n, r, r2, y;
-  v_u64_t sign, odd, cmp;
-
-  r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask);
-  sign = v_as_u64_f64 (x) & ~AbsMask;
-  cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal));
-
-  /* n = rint(|x|/pi).  */
-  n = v_fma_f64 (InvPi, r, Shift);
-  odd = v_as_u64_f64 (n) << 63;
-  n -= Shift;
-
-  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
-  r = v_fma_f64 (-Pi1, n, r);
-  r = v_fma_f64 (-Pi2, n, r);
-  r = v_fma_f64 (-Pi3, n, r);
-
-  /* sin(r) poly approx.  */
-  r2 = r * r;
-  y = v_fma_f64 (C7, r2, C6);
-  y = v_fma_f64 (y, r2, C5);
-  y = v_fma_f64 (y, r2, C4);
-  y = v_fma_f64 (y, r2, C3);
-  y = v_fma_f64 (y, r2, C2);
-  y = v_fma_f64 (y, r2, C1);
-  y = v_fma_f64 (y * r2, r, r);
-
-  /* sign.  */
-  y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign ^ odd);
-
-  if (unlikely (v_any_u64 (cmp)))
-    return specialcase (x, y, cmp);
-  return y;
-}
-VPCS_ALIAS
-#endif
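The three-part reduction in the removed v_sin.c is a Cody-Waite style scheme: pi is stored as Pi1 + Pi2 + Pi3 with enough trailing zeros in each part that every fma cancels the leading bits of |x| - n*pi exactly. A scalar sketch of the same steps (valid only below the RangeVal cutoff used above, 2^23):

#include <math.h>

static double
reduce_pi (double x, double *n_out)
{
  const double InvPi = 0x1.45f306dc9c883p-2;
  const double Pi1 = 0x1.921fb54442d18p+1;
  const double Pi2 = 0x1.1a62633145c06p-53;
  const double Pi3 = 0x1.c1cd129024e09p-106;
  const double Shift = 0x1.8p52;

  double n = fma (InvPi, x, Shift) - Shift; /* n = rint(x/pi) */
  double r = fma (-Pi1, n, x);              /* subtract n*pi piecewise */
  r = fma (-Pi2, n, r);
  r = fma (-Pi3, n, r);
  *n_out = n;
  return r;                                 /* in [-pi/2, pi/2] */
}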
diff --git a/contrib/arm-optimized-routines/math/v_sinf.c b/contrib/arm-optimized-routines/math/v_sinf.c
deleted file mode 100644
index e66bfce6d8aa..000000000000
--- a/contrib/arm-optimized-routines/math/v_sinf.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Single-precision vector sin function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
-  /* 1.886 ulp error */
-  0x1.5b2e76p-19f,
-  -0x1.9f42eap-13f,
-  0x1.110df4p-7f,
-  -0x1.555548p-3f,
-};
-#define Pi1 v_f32 (0x1.921fb6p+1f)
-#define Pi2 v_f32 (-0x1.777a5cp-24f)
-#define Pi3 v_f32 (-0x1.ee59dap-49f)
-#define A3 v_f32 (Poly[3])
-#define A5 v_f32 (Poly[2])
-#define A7 v_f32 (Poly[1])
-#define A9 v_f32 (Poly[0])
-#define RangeVal v_f32 (0x1p20f)
-#define InvPi v_f32 (0x1.45f306p-2f)
-#define Shift v_f32 (0x1.8p+23f)
-#define AbsMask v_u32 (0x7fffffff)
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
-{
-  /* Fall back to scalar code.  */
-  return v_call_f32 (sinf, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(sinf) (v_f32_t x)
-{
-  v_f32_t n, r, r2, y;
-  v_u32_t sign, odd, cmp;
-
-  r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask);
-  sign = v_as_u32_f32 (x) & ~AbsMask;
-  cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal));
-
-  /* n = rint(|x|/pi) */
-  n = v_fma_f32 (InvPi, r, Shift);
-  odd = v_as_u32_f32 (n) << 31;
-  n -= Shift;
-
-  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2) */
-  r = v_fma_f32 (-Pi1, n, r);
-  r = v_fma_f32 (-Pi2, n, r);
-  r = v_fma_f32 (-Pi3, n, r);
-
-  /* y = sin(r) */
-  r2 = r * r;
-  y = v_fma_f32 (A9, r2, A7);
-  y = v_fma_f32 (y, r2, A5);
-  y = v_fma_f32 (y, r2, A3);
-  y = v_fma_f32 (y * r2, r, r);
-
-  /* sign fix */
-  y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd);
-
-  if (unlikely (v_any_u32 (cmp)))
-    return specialcase (x, y, cmp);
-  return y;
-}
-VPCS_ALIAS
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_cos.c b/contrib/arm-optimized-routines/math/vn_cos.c
deleted file mode 100644
index b57a549eba68..000000000000
--- a/contrib/arm-optimized-routines/math/vn_cos.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_cos.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_cos, _ZGVnN2v_cos)
-#include "v_cos.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_cosf.c b/contrib/arm-optimized-routines/math/vn_cosf.c
deleted file mode 100644
index 6321d4620fa7..000000000000
--- a/contrib/arm-optimized-routines/math/vn_cosf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_cosf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf)
-#include "v_cosf.c"
-#endif
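Each removed vn_* wrapper rebuilds the corresponding v_* kernel with the vector PCS enabled and binds it to its AArch64 vector-function-ABI name: in _ZGVnN4v_cosf, 'n' selects AdvSIMD, 'N' means unmasked, '4' is the lane count, and 'v' marks one vector argument. strong_alias is defined elsewhere in the library; a typical glibc-style definition (assumed here for illustration) looks like:

/* Make ALIASNAME an additional exported symbol for NAME.  */
#define strong_alias(name, aliasname) \
  extern __typeof (name) aliasname __attribute__ ((alias (#name)));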
diff --git a/contrib/arm-optimized-routines/math/vn_exp.c b/contrib/arm-optimized-routines/math/vn_exp.c
deleted file mode 100644
index 06e269d41766..000000000000
--- a/contrib/arm-optimized-routines/math/vn_exp.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_exp.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp)
-#include "v_exp.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_exp2f.c b/contrib/arm-optimized-routines/math/vn_exp2f.c
deleted file mode 100644
index db9707e86f16..000000000000
--- a/contrib/arm-optimized-routines/math/vn_exp2f.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_exp2f.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_exp2f, _ZGVnN4v_exp2f)
-#include "v_exp2f.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_exp2f_1u.c b/contrib/arm-optimized-routines/math/vn_exp2f_1u.c
deleted file mode 100644
index 17bd0abd7a60..000000000000
--- a/contrib/arm-optimized-routines/math/vn_exp2f_1u.c
+++ /dev/null
@@ -1,11 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_exp2f_1u.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#include "v_exp2f_1u.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_expf.c b/contrib/arm-optimized-routines/math/vn_expf.c
deleted file mode 100644
index 0652907225d9..000000000000
--- a/contrib/arm-optimized-routines/math/vn_expf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_expf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf)
-#include "v_expf.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_expf_1u.c b/contrib/arm-optimized-routines/math/vn_expf_1u.c
deleted file mode 100644
index 3be776814822..000000000000
--- a/contrib/arm-optimized-routines/math/vn_expf_1u.c
+++ /dev/null
@@ -1,11 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_expf_1u.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#include "v_expf_1u.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_log.c b/contrib/arm-optimized-routines/math/vn_log.c
deleted file mode 100644
index b58fe8ff820a..000000000000
--- a/contrib/arm-optimized-routines/math/vn_log.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_log.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_log, _ZGVnN2v_log)
-#include "v_log.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_logf.c b/contrib/arm-optimized-routines/math/vn_logf.c
deleted file mode 100644
index cc5b8ae3ed55..000000000000
--- a/contrib/arm-optimized-routines/math/vn_logf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_logf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_logf, _ZGVnN4v_logf)
-#include "v_logf.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_pow.c b/contrib/arm-optimized-routines/math/vn_pow.c
deleted file mode 100644
index 260950113b04..000000000000
--- a/contrib/arm-optimized-routines/math/vn_pow.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_pow.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow)
-#include "v_pow.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_powf.c b/contrib/arm-optimized-routines/math/vn_powf.c
deleted file mode 100644
index 095d07e337ad..000000000000
--- a/contrib/arm-optimized-routines/math/vn_powf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_powf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_powf, _ZGVnN4vv_powf)
-#include "v_powf.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_sin.c b/contrib/arm-optimized-routines/math/vn_sin.c
deleted file mode 100644
index 905c79623350..000000000000
--- a/contrib/arm-optimized-routines/math/vn_sin.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_sin.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_sin, _ZGVnN2v_sin)
-#include "v_sin.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_sinf.c b/contrib/arm-optimized-routines/math/vn_sinf.c
deleted file mode 100644
index 1214e1a55638..000000000000
--- a/contrib/arm-optimized-routines/math/vn_sinf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_sinf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf)
-#include "v_sinf.c"
-#endif
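With the aliases in place, these kernels are callable under their ABI names from AArch64 code, provided the prototype carries the vector PCS attribute. A hand-written sketch (mathlib.h normally supplies such declarations):

#include <arm_neon.h>

__attribute__ ((aarch64_vector_pcs)) float32x4_t _ZGVnN4v_sinf (float32x4_t);

float32x4_t
sinf_4lanes (float32x4_t x)
{
  return _ZGVnN4v_sinf (x);  /* sinf on all four lanes at once */
}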