Diffstat (limited to 'contrib/arm-optimized-routines/math')
-rw-r--r-- contrib/arm-optimized-routines/math/Dir.mk | 19
-rw-r--r-- contrib/arm-optimized-routines/math/README.contributors | 78
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_cos.c | 87
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_cosf.c | 82
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_exp.c | 125
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_exp2f.c | 113
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c | 72
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_exp_data.c | 146
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_expf.c | 122
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c | 77
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_log.c | 100
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_log_data.c | 156
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_logf.c | 74
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_math.h | 135
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_pow.c | 22
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_powf.c | 148
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_sin.c | 97
-rw-r--r-- contrib/arm-optimized-routines/math/aarch64/v_sinf.c | 82
-rw-r--r-- contrib/arm-optimized-routines/math/cosf.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/erf.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/erf_data.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/erff.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/erff_data.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/exp.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/exp10.c | 129
-rw-r--r-- contrib/arm-optimized-routines/math/exp2.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/exp2f.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/exp2f_data.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/exp_data.c | 25
-rw-r--r-- contrib/arm-optimized-routines/math/expf.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/include/mathlib.h | 69
-rw-r--r-- contrib/arm-optimized-routines/math/log.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/log2.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/log2_data.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/log2f.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/log2f_data.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/log_data.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/logf.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/logf_data.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/math_config.h | 63
-rw-r--r-- contrib/arm-optimized-routines/math/math_err.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/math_errf.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/pow.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/pow_log_data.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/powf.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/powf_log2_data.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/s_cos.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/s_cosf.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/s_exp.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/s_exp2f.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/s_exp2f_1u.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/s_expf.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/s_expf_1u.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/s_log.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/s_logf.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/s_pow.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/s_powf.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/s_sin.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/s_sinf.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/sincosf.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/sincosf.h | 6
-rw-r--r-- contrib/arm-optimized-routines/math/sincosf_data.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/sinf.c | 6
-rw-r--r-- contrib/arm-optimized-routines/math/test/mathbench.c | 369
-rw-r--r-- contrib/arm-optimized-routines/math/test/mathbench_funcs.h | 62
-rw-r--r-- contrib/arm-optimized-routines/math/test/mathbench_wrappers.h | 66
-rw-r--r-- contrib/arm-optimized-routines/math/test/mathtest.c | 16
-rw-r--r-- contrib/arm-optimized-routines/math/test/rtest/dotest.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/rtest/intern.h | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/rtest/main.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/rtest/random.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/rtest/random.h | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/rtest/semi.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/rtest/semi.h | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/rtest/types.h | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/rtest/wrappers.c | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/rtest/wrappers.h | 2
-rwxr-xr-x contrib/arm-optimized-routines/math/test/runulp.sh | 127
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/cosf.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/erf.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/exp.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/exp10.tst | 15
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/exp2.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/exp2f.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/expf.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/log.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/logf.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/pow.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/powf.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/sincosf.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/directed/sinf.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/random/double.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/testcases/random/float.tst | 2
-rw-r--r-- contrib/arm-optimized-routines/math/test/ulp.c | 245
-rw-r--r-- contrib/arm-optimized-routines/math/test/ulp.h | 31
-rw-r--r-- contrib/arm-optimized-routines/math/test/ulp_funcs.h | 40
-rw-r--r-- contrib/arm-optimized-routines/math/test/ulp_wrappers.h | 37
-rw-r--r-- contrib/arm-optimized-routines/math/tgamma128.c | 356
-rw-r--r-- contrib/arm-optimized-routines/math/tgamma128.h | 141
-rw-r--r-- contrib/arm-optimized-routines/math/tools/cos.sollya | 2
-rw-r--r-- contrib/arm-optimized-routines/math/tools/exp.sollya | 2
-rw-r--r-- contrib/arm-optimized-routines/math/tools/exp2.sollya | 2
-rw-r--r-- contrib/arm-optimized-routines/math/tools/log.sollya | 2
-rw-r--r-- contrib/arm-optimized-routines/math/tools/log2.sollya | 2
-rw-r--r-- contrib/arm-optimized-routines/math/tools/log2_abs.sollya | 2
-rw-r--r-- contrib/arm-optimized-routines/math/tools/log_abs.sollya | 2
-rwxr-xr-x contrib/arm-optimized-routines/math/tools/plot.py | 2
-rwxr-xr-x contrib/arm-optimized-routines/math/tools/remez.jl | 2
-rw-r--r-- contrib/arm-optimized-routines/math/tools/sin.sollya | 2
-rw-r--r-- contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl | 212
-rw-r--r-- contrib/arm-optimized-routines/math/tools/v_exp.sollya | 2
-rw-r--r-- contrib/arm-optimized-routines/math/tools/v_log.sollya | 2
-rw-r--r-- contrib/arm-optimized-routines/math/tools/v_sin.sollya | 2
-rw-r--r-- contrib/arm-optimized-routines/math/v_cos.c | 87
-rw-r--r-- contrib/arm-optimized-routines/math/v_cosf.c | 76
-rw-r--r-- contrib/arm-optimized-routines/math/v_exp.c | 94
-rw-r--r-- contrib/arm-optimized-routines/math/v_exp.h | 14
-rw-r--r-- contrib/arm-optimized-routines/math/v_exp2f.c | 78
-rw-r--r-- contrib/arm-optimized-routines/math/v_exp2f_1u.c | 75
-rw-r--r-- contrib/arm-optimized-routines/math/v_exp_data.c | 403
-rw-r--r-- contrib/arm-optimized-routines/math/v_expf.c | 83
-rw-r--r-- contrib/arm-optimized-routines/math/v_expf_1u.c | 80
-rw-r--r-- contrib/arm-optimized-routines/math/v_log.c | 104
-rw-r--r-- contrib/arm-optimized-routines/math/v_log.h | 18
-rw-r--r-- contrib/arm-optimized-routines/math/v_log_data.c | 158
-rw-r--r-- contrib/arm-optimized-routines/math/v_logf.c | 73
-rw-r--r-- contrib/arm-optimized-routines/math/v_math.h | 641
-rw-r--r-- contrib/arm-optimized-routines/math/v_pow.c | 27
-rw-r--r-- contrib/arm-optimized-routines/math/v_powf.c | 235
-rw-r--r-- contrib/arm-optimized-routines/math/v_sin.c | 86
-rw-r--r-- contrib/arm-optimized-routines/math/v_sinf.c | 75
-rw-r--r-- contrib/arm-optimized-routines/math/vn_cos.c | 12
-rw-r--r-- contrib/arm-optimized-routines/math/vn_cosf.c | 12
-rw-r--r-- contrib/arm-optimized-routines/math/vn_exp.c | 12
-rw-r--r-- contrib/arm-optimized-routines/math/vn_exp2f.c | 12
-rw-r--r-- contrib/arm-optimized-routines/math/vn_exp2f_1u.c | 11
-rw-r--r-- contrib/arm-optimized-routines/math/vn_expf.c | 12
-rw-r--r-- contrib/arm-optimized-routines/math/vn_expf_1u.c | 11
-rw-r--r-- contrib/arm-optimized-routines/math/vn_log.c | 12
-rw-r--r-- contrib/arm-optimized-routines/math/vn_logf.c | 12
-rw-r--r-- contrib/arm-optimized-routines/math/vn_pow.c | 12
-rw-r--r-- contrib/arm-optimized-routines/math/vn_powf.c | 12
-rw-r--r-- contrib/arm-optimized-routines/math/vn_sin.c | 12
-rw-r--r-- contrib/arm-optimized-routines/math/vn_sinf.c | 12
147 files changed, 3276 insertions, 3257 deletions
diff --git a/contrib/arm-optimized-routines/math/Dir.mk b/contrib/arm-optimized-routines/math/Dir.mk
index 3b841ab71955..5e9494a7bd3c 100644
--- a/contrib/arm-optimized-routines/math/Dir.mk
+++ b/contrib/arm-optimized-routines/math/Dir.mk
@@ -1,12 +1,14 @@
# Makefile fragment - requires GNU make
#
-# Copyright (c) 2019, Arm Limited.
-# SPDX-License-Identifier: MIT
+# Copyright (c) 2019-2023, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
S := $(srcdir)/math
B := build/math
math-lib-srcs := $(wildcard $(S)/*.[cS])
+math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
+
math-test-srcs := \
$(S)/test/mathtest.c \
$(S)/test/mathbench.c \
@@ -15,6 +17,7 @@ math-test-srcs := \
math-test-host-srcs := $(wildcard $(S)/test/rtest/*.[cS])
math-includes := $(patsubst $(S)/%,build/%,$(wildcard $(S)/include/*.h))
+math-test-includes := $(patsubst $(S)/%,build/include/%,$(wildcard $(S)/test/*.h))
math-libs := \
build/lib/libmathlib.so \
@@ -42,10 +45,11 @@ math-files := \
$(math-tools) \
$(math-host-tools) \
$(math-includes) \
+ $(math-test-includes) \
-all-math: $(math-libs) $(math-tools) $(math-includes)
+all-math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
-$(math-objs): $(math-includes)
+$(math-objs): $(math-includes) $(math-test-includes)
$(math-objs): CFLAGS_ALL += $(math-cflags)
$(B)/test/mathtest.o: CFLAGS_ALL += -fmath-errno
$(math-host-objs): CC = $(HOST_CC)
@@ -63,6 +67,8 @@ build/lib/libmathlib.a: $(math-lib-objs)
$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
$(math-tools): LDLIBS += $(math-ldlibs) -lm
+# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
+$(math-tools): CFLAGS_ALL += $(math-sve-cflags)
build/bin/rtest: $(math-host-objs)
$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS)
@@ -83,6 +89,9 @@ build/bin/ulp: $(B)/test/ulp.o build/lib/libmathlib.a
build/include/%.h: $(S)/include/%.h
cp $< $@
+build/include/test/%.h: $(S)/test/%.h
+ cp $< $@
+
build/bin/%.sh: $(S)/test/%.sh
cp $< $@
@@ -96,7 +105,7 @@ check-math-rtest: $(math-host-tools) $(math-tools)
cat $(math-rtests) | build/bin/rtest | $(EMULATOR) build/bin/mathtest $(math-testflags)
check-math-ulp: $(math-tools)
- ULPFLAGS="$(math-ulpflags)" build/bin/runulp.sh $(EMULATOR)
+ ULPFLAGS="$(math-ulpflags)" WANT_SIMD_EXCEPT="$(WANT_SIMD_EXCEPT)" build/bin/runulp.sh $(EMULATOR)
check-math: check-math-test check-math-rtest check-math-ulp
diff --git a/contrib/arm-optimized-routines/math/README.contributors b/contrib/arm-optimized-routines/math/README.contributors
new file mode 100644
index 000000000000..33e7ba376e41
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/README.contributors
@@ -0,0 +1,78 @@
+STYLE REQUIREMENTS
+==================
+
+1. Most code in this sub-directory is expected to be upstreamed into glibc so
+ the GNU Coding Standard and glibc specific conventions should be followed
+ to ease upstreaming.
+
+2. ABI and symbols: the code should be written so it is suitable for inclusion
+ into a libc with minimal changes. This e.g. means that internal symbols
+ should be hidden and in the implementation reserved namespace according to
+ ISO C and POSIX rules. If possible the built shared libraries and static
+ library archives should be usable to override libc symbols at link time (or
+ at runtime via LD_PRELOAD). This requires the symbols to follow the glibc ABI
+ (other than symbol versioning); this cannot be done reliably for static
+ linking, so it is a best-effort requirement.
+
+3. API: include headers should be suitable for benchmarking and testing code
+ and should not conflict with libc headers.
+
+
+CONTRIBUTION GUIDELINES FOR math SUB-DIRECTORY
+==============================================
+
+1. Math functions have quality and performance requirements.
+
+2. Quality:
+ - Worst-case ULP error should be small in the entire input domain (for most
+ common double precision scalar functions the target is < 0.66 ULP error,
+ and < 1 ULP for single precision; even a performance-optimized function
+ variant should not have > 5 ULP error if the goal is to be a drop-in
+ replacement for a standard math function), this should be tested
+ statistically (or on all inputs if possible in reasonable amount of time).
+ The ulp tool is for this and runulp.sh should be updated for new functions.
+
+ - All standard rounding modes need to be supported but in non-default rounding
+ modes the quality requirement can be relaxed. (Non-nearest rounded
+ computation can be slow and inaccurate but has to be correct for conformance
+ reasons.)
+
+ - Special cases and error handling need to follow ISO C Annex F requirements,
+ POSIX requirements, IEEE 754-2008 requirements, and glibc requirements:
+ https://www.gnu.org/software/libc/manual/html_mono/libc.html#Errors-in-Math-Functions
+ this should be tested by direct tests (glibc test system may be used for it).
+
+ - Error handling code should be decoupled from the approximation code as much
+ as possible. (There are helper functions, these take care of errno as well
+ as exception raising.)
+
+ - Vector math code does not need to work in non-nearest rounding mode and error
+ handling side effects need not happen (fenv exceptions and errno), but the
+ result should be correct (within quality requirements, which are lower for
+ vector code than for scalar code).
+
+ - Error bounds of the approximation should be clearly documented.
+
+ - The code should build and pass tests on arm, aarch64 and x86_64 GNU linux
+ systems. (Routines and features can be disabled on specific targets, but
+ the build must complete). On aarch64, both little- and big-endian targets
+ are supported as well as valid combinations of architecture extensions.
+ The configurations that should be tested depend on the contribution.
+
+3. Performance:
+ - Common math code should be benchmarked on modern aarch64 microarchitectures
+ over typical inputs.
+
+ - Performance improvements should be documented (relative numbers can be
+ published; it is enough to use the mathbench microbenchmark tool which should
+ be updated for new functions).
+
+ - Attention should be paid to the compilation flags: for aarch64 fma
+ contraction should be on and math errno turned off so some builtins can be
+ inlined.
+
+ - The code should be reasonably performant on x86_64 too, e.g. some rounding
+ instructions and fma may not be available on x86_64; such builtins turn into
+ libc calls with slow code. Such a slowdown is not acceptable; a faster fallback
+ should be present: glibc and bionic use the same code on all targets. (This
+ does not apply to vector math code).
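
The ULP requirements above are what the ulp tool measures. As a rough illustration of the metric only (the project's real harness is math/test/ulp.c, which also handles rounding modes, special cases, and exhaustive/statistical sampling), a single-precision routine can be compared against a double-precision reference like this; my_expf is a placeholder for the routine under test:

/* Minimal ULP-error estimate for a float routine against a double
   reference.  A sketch: ignores NaN/inf inputs and assumes
   round-to-nearest mode.  */
#include <math.h>
#include <stdio.h>

/* Placeholder for the routine under test.  */
static float my_expf (float x) { return expf (x); }

static double
ulp_error (float got, double want)
{
  int exp;
  frexp (want, &exp);
  /* One ULP of a float at the magnitude of the reference result;
     float carries 24 bits of precision.  */
  double one_ulp = ldexp (1.0, exp - 24);
  return fabs ((double) got - want) / one_ulp;
}

int
main (void)
{
  double max_err = 0;
  for (float x = -10.0f; x < 10.0f; x += 0x1p-12f)
    {
      double err = ulp_error (my_expf (x), exp ((double) x));
      if (err > max_err)
	max_err = err;
    }
  printf ("max ULP error: %g\n", max_err);
  return 0;
}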
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cos.c b/contrib/arm-optimized-routines/math/aarch64/v_cos.c
new file mode 100644
index 000000000000..9a73575bce89
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_cos.c
@@ -0,0 +1,87 @@
+/*
+ * Double-precision vector cos function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float64x2_t poly[7];
+ float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
+} data = {
+ /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
+ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+ V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+ V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+ V2 (-0x1.9e9540300a1p-41) },
+ .inv_pi = V2 (0x1.45f306dc9c883p-2),
+ .half_pi = V2 (0x1.921fb54442d18p+0),
+ .pi_1 = V2 (0x1.921fb54442d18p+1),
+ .pi_2 = V2 (0x1.1a62633145c06p-53),
+ .pi_3 = V2 (0x1.c1cd129024e09p-106),
+ .shift = V2 (0x1.8p52),
+ .range_val = V2 (0x1p23)
+};
+
+#define C(i) d->poly[i]
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (cos, x, y, cmp);
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t n, r, r2, r3, r4, t1, t2, t3, y;
+ uint64x2_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ r = vabsq_f64 (x);
+ cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r),
+ vreinterpretq_u64_f64 (d->range_val));
+ if (unlikely (v_any_u64 (cmp)))
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ the special-case handler later. */
+ r = vbslq_f64 (cmp, v_f64 (1.0), r);
+#else
+ cmp = vcageq_f64 (x, d->range_val);
+ r = x;
+#endif
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
+ n = vsubq_f64 (n, d->shift);
+ n = vsubq_f64 (n, v_f64 (0.5));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f64 (r, d->pi_1, n);
+ r = vfmsq_f64 (r, d->pi_2, n);
+ r = vfmsq_f64 (r, d->pi_3, n);
+
+ /* sin(r) poly approx. */
+ r2 = vmulq_f64 (r, r);
+ r3 = vmulq_f64 (r2, r);
+ r4 = vmulq_f64 (r2, r2);
+
+ t1 = vfmaq_f64 (C (4), C (5), r2);
+ t2 = vfmaq_f64 (C (2), C (3), r2);
+ t3 = vfmaq_f64 (C (0), C (1), r2);
+
+ y = vfmaq_f64 (t1, C (6), r4);
+ y = vfmaq_f64 (t2, y, r4);
+ y = vfmaq_f64 (t3, y, r4);
+ y = vfmaq_f64 (r, y, r3);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
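
The vector code above maps onto a simple scalar model: with m = rint((|x| + pi/2)/pi), cos(x) = (-1)^m sin(|x| - (m - 0.5)*pi), the sign is taken from the parity of m, and pi is split into three parts so the reduction stays accurate. A sketch using the same constants, valid only below range_val (2^23) and omitting the special-case path; the vector code uses fmas where this uses plain multiplies:

#include <math.h>
#include <stdint.h>

double
cos_model (double x)
{
  static const double c[]
      = { -0x1.555555555547bp-3, 0x1.1111111108a4dp-7,
	  -0x1.a01a019936f27p-13, 0x1.71de37a97d93ep-19,
	  -0x1.ae633919987c6p-26, 0x1.60e277ae07cecp-33,
	  -0x1.9e9540300a1p-41 };
  const double inv_pi = 0x1.45f306dc9c883p-2, half_pi = 0x1.921fb54442d18p+0;
  const double pi_1 = 0x1.921fb54442d18p+1, pi_2 = 0x1.1a62633145c06p-53,
	       pi_3 = 0x1.c1cd129024e09p-106;
  double ax = fabs (x);
  /* m = rint((|x| + pi/2)/pi); cos(x) = (-1)^m sin(|x| - (m - 0.5)*pi).  */
  double m = rint ((ax + half_pi) * inv_pi);
  int odd = (int64_t) m & 1;
  double n = m - 0.5;
  /* r = |x| - n*pi, with pi split into three pieces for accuracy.  */
  double r = ax - n * pi_1;
  r -= n * pi_2;
  r -= n * pi_3;
  /* sin(r) ~= r + r^3 * (c0 + c1 r^2 + ... + c6 r^12).  */
  double r2 = r * r, y = c[6];
  for (int i = 5; i >= 0; i--)
    y = y * r2 + c[i];
  y = r + y * (r2 * r);
  return odd ? -y : y;
}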
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_cosf.c b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c
new file mode 100644
index 000000000000..b9890b2998ad
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_cosf.c
@@ -0,0 +1,82 @@
+/*
+ * Single-precision vector cos function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float32x4_t poly[4];
+ float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
+} data = {
+ /* 1.886 ulp error. */
+ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
+ V4 (0x1.5b2e76p-19f) },
+
+ .pi_1 = V4 (0x1.921fb6p+1f),
+ .pi_2 = V4 (-0x1.777a5cp-24f),
+ .pi_3 = V4 (-0x1.ee59dap-49f),
+
+ .inv_pi = V4 (0x1.45f306p-2f),
+ .shift = V4 (0x1.8p+23f),
+ .half_pi = V4 (0x1.921fb6p0f),
+ .range_val = V4 (0x1p20f)
+};
+
+#define C(i) d->poly[i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (cosf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, r3, y;
+ uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ r = vabsq_f32 (x);
+ cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
+ vreinterpretq_u32_f32 (d->range_val));
+ if (unlikely (v_any_u32 (cmp)))
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ the special-case handler later. */
+ r = vbslq_f32 (cmp, v_f32 (1.0f), r);
+#else
+ cmp = vcageq_f32 (x, d->range_val);
+ r = x;
+#endif
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
+ n = vsubq_f32 (n, d->shift);
+ n = vsubq_f32 (n, v_f32 (0.5f));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f32 (r, d->pi_1, n);
+ r = vfmsq_f32 (r, d->pi_2, n);
+ r = vfmsq_f32 (r, d->pi_3, n);
+
+ /* y = sin(r). */
+ r2 = vmulq_f32 (r, r);
+ r3 = vmulq_f32 (r2, r);
+ y = vfmaq_f32 (C (2), C (3), r2);
+ y = vfmaq_f32 (C (1), y, r2);
+ y = vfmaq_f32 (C (0), y, r2);
+ y = vfmaq_f32 (r, y, r3);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
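
Both cos variants round to the nearest integer with a Shift constant (0x1.8p52 for doubles, 0x1.8p23f here) instead of a rounding instruction. Adding 1.5*2^23 to a float of magnitude below 2^22 leaves no mantissa bits under the units place, so the FPU's round-to-nearest performs the rint; the rounded integer lands in the low mantissa bits (in two's-complement form for negative values, thanks to the extra 0.5*2^23), and bit 0 is exactly the parity that drives the sign flip. A small demonstration, assuming round-to-nearest mode:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const float shift = 0x1.8p23f; /* 1.5 * 2^23 */
  float x = 41.7f;
  float z = x + shift; /* rounds x to the nearest integer */
  uint32_t bits;
  memcpy (&bits, &z, sizeof bits);
  /* Prints n = 42, parity bit = 0: the integer sits in the low
     mantissa bits of z, so bit 0 gives its parity directly.  */
  printf ("n = %d, parity bit = %u\n", (int) (z - shift), bits & 1);
  return 0;
}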
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp.c b/contrib/arm-optimized-routines/math/aarch64/v_exp.c
new file mode 100644
index 000000000000..bc5609faf4fc
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_exp.c
@@ -0,0 +1,125 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+
+const static volatile struct
+{
+ float64x2_t poly[3];
+ float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+#if !WANT_SIMD_EXCEPT
+ float64x2_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* maxerr: 1.88 +0.5 ulp
+ rel error: 1.4337*2^-53
+ abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
+ .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
+ V2 (0x1.55555da646206p-5) },
+#if !WANT_SIMD_EXCEPT
+ .scale_thresh = V2 (163840.0), /* 1280.0 * N. */
+ .special_bound = V2 (704.0),
+#endif
+ .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */
+ .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */
+ .ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
+ .shift = V2 (0x1.8p+52)
+};
+
+#define C(i) data.poly[i]
+#define Tab __v_exp_data
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
+# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
+# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f64 (exp, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint (1.0). */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n)
+{
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (
+ vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+ uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+ return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
+{
+ float64x2_t n, r, r2, s, y, z;
+ uint64x2_t cmp, u, e;
+
+#if WANT_SIMD_EXCEPT
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special_case to fix special lanes later. This is only necessary if fenv
+ exceptions are to be triggered correctly. */
+ float64x2_t xm = x;
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound);
+ if (unlikely (v_any_u64 (cmp)))
+ x = vbslq_f64 (cmp, v_f64 (1), x);
+#else
+ cmp = vcagtq_f64 (x, data.special_bound);
+#endif
+
+ /* n = round(x/(ln2/N)). */
+ z = vfmaq_f64 (data.shift, x, data.inv_ln2);
+ u = vreinterpretq_u64_f64 (z);
+ n = vsubq_f64 (z, data.shift);
+
+ /* r = x - n*ln2/N. */
+ r = x;
+ r = vfmsq_f64 (r, data.ln2_hi, n);
+ r = vfmsq_f64 (r, data.ln2_lo, n);
+
+ e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */
+ r2 = vmulq_f64 (r, r);
+ y = vfmaq_f64 (C (0), C (1), r);
+ y = vfmaq_f64 (y, C (2), r2);
+ y = vfmaq_f64 (r, y, r2);
+
+ /* s = 2^(n/N). */
+ u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] };
+ s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ if (unlikely (v_any_u64 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f64 (s, y, s), cmp);
+#else
+ return special_case (s, y, n);
+#endif
+
+ return vfmaq_f64 (s, y, s);
+}
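
The special_case in the !WANT_SIMD_EXCEPT path exists because the scale 2^(n/N) can overflow or underflow even when the final result s + y*s is representable. Stripped of the branch-free bias arithmetic (SpecialOffset, SpecialBias1, and SpecialBias2 implement it directly on the exponent bits), the underlying idea is just to apply the exponent in two halves; a sketch of that idea in isolation:

#include <math.h>

/* Compute a * 2^e when 2^e alone may overflow or underflow but the
   product is representable; `a` plays the role of s2 + y*s2 above.
   Note this naive version can double-round in the subnormal range,
   which the real code takes care to avoid.  */
double
scale_by_pow2 (double a, int e)
{
  int k = e / 2; /* split the exponent in half */
  return (a * ldexp (1.0, k)) * ldexp (1.0, e - k);
}

For example, scale_by_pow2 (0.75, 1024) yields 1.5*2^1023, a finite double, even though ldexp (1.0, 1024) alone overflows to infinity.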
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c
new file mode 100644
index 000000000000..e402205e98e6
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f.c
@@ -0,0 +1,113 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float32x4_t poly[5];
+ uint32x4_t exponent_bias;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* maxerr: 1.962 ulp. */
+ .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
+ V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
+ .exponent_bias = V4 (0x3f800000),
+#if !WANT_SIMD_EXCEPT
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+#endif
+};
+
+#define C(i) d->poly[i]
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
+# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
+# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f32 (exp2f, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u32 (0x82000000)
+# define SpecialBias v_u32 (0x7f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r2 = vmulq_f32 (s1, s1);
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+ return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, scale, p, q, poly;
+ uint32x4_t cmp, e;
+
+#if WANT_SIMD_EXCEPT
+ /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
+ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special_case to fix special lanes later. This is only necessary if fenv
+ exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ n = vrndaq_f32 (x);
+ r = vsubq_f32 (x, n);
+ e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
+ scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+#if !WANT_SIMD_EXCEPT
+ cmp = vcagtq_f32 (n, d->special_bound);
+#endif
+
+ r2 = vmulq_f32 (r, r);
+ p = vfmaq_f32 (C (1), C (0), r);
+ q = vfmaq_f32 (C (3), C (2), r);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_f32 (C (4), r);
+ poly = vfmaq_f32 (p, q, r2);
+
+ if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+ return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+ return vfmaq_f32 (scale, poly, scale);
+}
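
The polynomial in V_NAME_F1 (exp2) is evaluated in a pairwise, Estrin-like order rather than plain Horner: the fmas producing p and q are independent, so the critical path is roughly three fmas instead of five. Both orderings compute C0 r^5 + C1 r^4 + C2 r^3 + C3 r^2 + C4 r, up to rounding differences. A scalar model of the two schemes:

#include <math.h>

double
poly_horner (const double c[5], double r)
{
  /* c[0] multiplies the highest power, matching the vector code.  */
  double y = c[0];
  for (int i = 1; i < 5; i++)
    y = y * r + c[i];
  return y * r; /* five sequential steps */
}

double
poly_pairwise (const double c[5], double r)
{
  double r2 = r * r;
  double p = fma (c[0], r, c[1]); /* C0 r + C1   (independent) */
  double q = fma (c[2], r, c[3]); /* C2 r + C3   (independent) */
  q = fma (p, r2, q);		  /* + (C0 r + C1) r^2 */
  return fma (q, r2, c[4] * r);	  /* * r^2 + C4 r */
}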
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c
new file mode 100644
index 000000000000..ba6b02fbb4bc
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_exp2f_1u.c
@@ -0,0 +1,72 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const float Poly[] = {
+ /* maxerr: 0.878 ulp. */
+ 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f
+};
+#define C0 v_f32 (Poly[0])
+#define C1 v_f32 (Poly[1])
+#define C2 v_f32 (Poly[2])
+#define C3 v_f32 (Poly[3])
+#define C4 v_f32 (Poly[4])
+#define C5 v_f32 (Poly[5])
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define Ln2hi v_f32 (0x1.62e4p-1f)
+#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
+ float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
+ float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
+ uint32x4_t cmp = absn > v_f32 (192.0f);
+ float32x4_t r1 = s1 * s1;
+ float32x4_t r0 = poly * s1 * s2;
+ return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+ | (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_exp2f_1u (float32x4_t x)
+{
+ float32x4_t n, r, scale, poly, absn;
+ uint32x4_t cmp, e;
+
+ /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+#if 0
+ float32x4_t z;
+ z = x + Shift;
+ n = z - Shift;
+ r = x - n;
+ e = vreinterpretq_u32_f32 (z) << 23;
+#else
+ n = vrndaq_f32 (x);
+ r = x - n;
+ e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23;
+#endif
+ scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000));
+ absn = vabsq_f32 (n);
+ cmp = absn > v_f32 (126.0f);
+ poly = vfmaq_f32 (C1, C0, r);
+ poly = vfmaq_f32 (C2, poly, r);
+ poly = vfmaq_f32 (C3, poly, r);
+ poly = vfmaq_f32 (C4, poly, r);
+ poly = vfmaq_f32 (C5, poly, r);
+ poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (poly, n, e, absn);
+ return scale * poly;
+}
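
The line building e from vcvtaq_s32_f32 (x) << 23, followed by adding 0x3f800000, materializes scale = 2^n directly in the exponent field: 0x3f800000 is 127 << 23, the bit pattern of 1.0f. A scalar model of the trick, exact only while the biased exponent stays in range (-126 <= n <= 127), which is why lanes with |n| > 126 go to specialcase:

#include <stdint.h>
#include <string.h>

/* Build 2^n by placing n + 127 (the biased exponent) into bits
   23..30 of a float.  A sketch; out of range the field wraps.  */
float
pow2_bits (int n)
{
  uint32_t bits = (uint32_t) (n + 127) << 23;
  float s;
  memcpy (&s, &bits, sizeof s);
  return s;
}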
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_exp_data.c b/contrib/arm-optimized-routines/math/aarch64/v_exp_data.c
new file mode 100644
index 000000000000..45f0848cac5b
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_exp_data.c
@@ -0,0 +1,146 @@
+/*
+ * Lookup table for double-precision e^x vector function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+# define N (1 << V_EXP_TABLE_BITS)
+
+/* 2^(j/N), j=0..N-1. */
+const uint64_t __v_exp_data[] = {
+# if N == 128
+ 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061,
+ 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de,
+ 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f,
+ 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b,
+ 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0,
+ 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea,
+ 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa,
+ 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96,
+ 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd,
+ 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990,
+ 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715,
+ 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1,
+ 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7,
+ 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c,
+ 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d,
+ 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de,
+ 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7,
+ 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f,
+ 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429,
+ 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09,
+ 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225,
+ 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf,
+ 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74,
+ 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f,
+ 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62,
+ 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad,
+ 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db,
+ 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6,
+ 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50,
+ 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323,
+ 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d,
+ 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a,
+ 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb,
+ 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a,
+ 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c,
+ 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5,
+ 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c,
+ 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398,
+ 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f,
+ 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83,
+ 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27,
+ 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14,
+ 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1,
+# elif N == 256
+ 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
+ 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
+ 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
+ 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
+ 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
+ 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
+ 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
+ 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
+ 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
+ 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
+ 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
+ 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
+ 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
+ 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
+ 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
+ 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
+ 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
+ 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
+ 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
+ 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
+ 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
+ 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
+ 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
+ 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
+ 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
+ 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
+ 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
+ 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
+ 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
+ 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
+ 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
+ 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
+ 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
+ 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
+ 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
+ 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
+ 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
+ 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
+ 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
+ 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
+ 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
+ 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
+ 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
+ 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
+ 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
+ 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
+ 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
+ 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
+ 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
+ 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
+ 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
+ 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
+ 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
+ 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
+ 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
+ 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
+ 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
+ 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
+ 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
+ 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
+ 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
+ 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
+ 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
+ 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
+ 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
+ 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
+ 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
+ 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
+ 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
+ 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
+ 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
+ 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
+ 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
+ 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
+ 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
+ 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
+ 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
+ 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
+ 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
+ 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
+ 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
+ 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
+ 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
+ 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
+ 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
+ 0x3feff9d96b2a23d9,
+# endif
+};
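
The entries are not the raw bit patterns of 2^(j/N): the consumer in v_exp.c adds e = u << (52 - V_EXP_TABLE_BITS), whose low bits still contain the index j, so j << 45 (for N = 128) is pre-subtracted from each entry and is restored by that vaddq_u64. A sketch of a generator under those assumptions; it relies on the host exp2l being accurate, whereas the shipped table was produced with higher-precision tooling and may differ in the last bit:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  enum { BITS = 7, N = 1 << BITS }; /* V_EXP_TABLE_BITS = 7 */
  for (int j = 0; j < N; j++)
    {
      double v = (double) exp2l ((long double) j / N);
      uint64_t u;
      memcpy (&u, &v, sizeof u);
      /* Pre-subtract the index bits that the consumer adds back.  */
      u -= (uint64_t) j << (52 - BITS);
      printf ("0x%016llx,\n", (unsigned long long) u);
    }
  return 0;
}

For j = 1 this prints 0x3feff63da9fb3335, matching the second entry above: asuint64 (2^(1/128)) = 0x3ff0163da9fb3335 minus 1 << 45.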
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_expf.c b/contrib/arm-optimized-routines/math/aarch64/v_expf.c
new file mode 100644
index 000000000000..34e8b6081bcd
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_expf.c
@@ -0,0 +1,122 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float32x4_t poly[5];
+ float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
+ uint32x4_t exponent_bias;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* maxerr: 1.45358 +0.5 ulp. */
+ .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
+ V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
+ .shift = V4 (0x1.8p23f),
+ .inv_ln2 = V4 (0x1.715476p+0f),
+ .ln2_hi = V4 (0x1.62e4p-1f),
+ .ln2_lo = V4 (0x1.7f7d1cp-20f),
+ .exponent_bias = V4 (0x3f800000),
+#if !WANT_SIMD_EXCEPT
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+#endif
+};
+
+#define C(i) d->poly[i]
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
+# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
+# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f32 (expf, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u32 (0x82000000)
+# define SpecialBias v_u32 (0x7f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r2 = vmulq_f32 (s1, s1);
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+ return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, scale, p, q, poly, z;
+ uint32x4_t cmp, e;
+
+#if WANT_SIMD_EXCEPT
+ /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
+ cmp = vcgeq_u32 (
+ vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
+ TinyBound),
+ SpecialBound);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ the special-case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ z = vfmaq_f32 (d->shift, x, d->inv_ln2);
+ n = vsubq_f32 (z, d->shift);
+ r = vfmsq_f32 (x, n, d->ln2_hi);
+ r = vfmsq_f32 (r, n, d->ln2_lo);
+ e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+ scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+#if !WANT_SIMD_EXCEPT
+ cmp = vcagtq_f32 (n, d->special_bound);
+#endif
+
+ r2 = vmulq_f32 (r, r);
+ p = vfmaq_f32 (C (1), C (0), r);
+ q = vfmaq_f32 (C (3), C (2), r);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_f32 (C (4), r);
+ poly = vfmaq_f32 (p, q, r2);
+
+ if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+ return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+ return vfmaq_f32 (scale, poly, scale);
+}
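
In the WANT_SIMD_EXCEPT paths above, special lanes are masked with a neutral value and later repaired by v_call_f32/v_call_f64, which re-evaluate just those lanes with the scalar routine so errno and fp exceptions appear exactly as scalar code would produce them. A plain-C model of that helper over arrays (the real one, in v_math.h, works on vector registers and masks):

#include <stdint.h>

/* For each lane flagged in cmp, replace the vector result with the
   scalar routine's result, e.g. f = expf, so error handling side
   effects happen exactly once per special input.  A sketch.  */
void
call_scalar_lanes (const float x[4], float y[4], const uint32_t cmp[4],
		   float (*f) (float))
{
  for (int i = 0; i < 4; i++)
    if (cmp[i])
      y[i] = f (x[i]);
}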
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c b/contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c
new file mode 100644
index 000000000000..43d03fa34efa
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_expf_1u.c
@@ -0,0 +1,77 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const float Poly[] = {
+ /* maxerr: 0.36565 +0.5 ulp. */
+ 0x1.6a6000p-10f,
+ 0x1.12718ep-7f,
+ 0x1.555af0p-5f,
+ 0x1.555430p-3f,
+ 0x1.fffff4p-2f,
+};
+#define C0 v_f32 (Poly[0])
+#define C1 v_f32 (Poly[1])
+#define C2 v_f32 (Poly[2])
+#define C3 v_f32 (Poly[3])
+#define C4 v_f32 (Poly[4])
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define Ln2hi v_f32 (0x1.62e4p-1f)
+#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
+ float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
+ float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
+ uint32x4_t cmp = absn > v_f32 (192.0f);
+ float32x4_t r1 = s1 * s1;
+ float32x4_t r0 = poly * s1 * s2;
+ return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+ | (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_expf_1u (float32x4_t x)
+{
+ float32x4_t n, r, scale, poly, absn, z;
+ uint32x4_t cmp, e;
+
+ /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+#if 1
+ z = vfmaq_f32 (Shift, x, InvLn2);
+ n = z - Shift;
+ r = vfmaq_f32 (x, n, -Ln2hi);
+ r = vfmaq_f32 (r, n, -Ln2lo);
+ e = vreinterpretq_u32_f32 (z) << 23;
+#else
+ z = x * InvLn2;
+ n = vrndaq_f32 (z);
+ r = vfmaq_f32 (x, n, -Ln2hi);
+ r = vfmaq_f32 (r, n, -Ln2lo);
+ e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23;
+#endif
+ scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000));
+ absn = vabsq_f32 (n);
+ cmp = absn > v_f32 (126.0f);
+ poly = vfmaq_f32 (C1, C0, r);
+ poly = vfmaq_f32 (C2, poly, r);
+ poly = vfmaq_f32 (C3, poly, r);
+ poly = vfmaq_f32 (C4, poly, r);
+ poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
+ poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (poly, n, e, absn);
+ return scale * poly;
+}
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_log.c b/contrib/arm-optimized-routines/math/aarch64/v_log.c
new file mode 100644
index 000000000000..1d1c1fa62c04
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_log.c
@@ -0,0 +1,100 @@
+/*
+ * Double-precision vector log(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ uint64x2_t min_norm;
+ uint32x4_t special_bound;
+ float64x2_t poly[5];
+ float64x2_t ln2;
+ uint64x2_t sign_exp_mask;
+} data = {
+ /* Worst-case error: 1.17 + 0.5 ulp.
+ Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
+ .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2),
+ V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3),
+ V2 (-0x1.554e550bd501ep-3) },
+ .ln2 = V2 (0x1.62e42fefa39efp-1),
+ .min_norm = V2 (0x0010000000000000),
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
+ .sign_exp_mask = V2 (0xfff0000000000000)
+};
+
+#define A(i) d->poly[i]
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+#define Off v_u64 (0x3fe6900900000000)
+
+struct entry
+{
+ float64x2_t invc;
+ float64x2_t logc;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ /* Since N is a power of 2, n % N = n & (N - 1). */
+ struct entry e;
+ uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.logc = vuzp2q_f64 (e0, e1);
+ return e;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
+ uint32x2_t cmp)
+{
+ return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp));
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t z, r, r2, p, y, kd, hi;
+ uint64x2_t ix, iz, tmp;
+ uint32x2_t cmp;
+ int64x2_t k;
+ struct entry e;
+
+ ix = vreinterpretq_u64_f64 (x);
+ cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
+ vget_low_u32 (d->special_bound));
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ tmp = vsubq_u64 (ix, Off);
+ k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
+ iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
+ z = vreinterpretq_f64_u64 (iz);
+ e = lookup (tmp);
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ kd = vcvtq_f64_s64 (k);
+
+ /* hi = r + log(c) + k*Ln2. */
+ hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ r2 = vmulq_f64 (r, r);
+ y = vfmaq_f64 (A (2), A (3), r);
+ p = vfmaq_f64 (A (0), A (1), r);
+ y = vfmaq_f64 (y, A (4), r2);
+ y = vfmaq_f64 (p, y, r2);
+
+ if (unlikely (v_any_u32h (cmp)))
+ return special_case (x, y, hi, r2, cmp);
+ return vfmaq_f64 (hi, y, r2);
+}
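
A scalar model of the whole routine, using the same table and coefficients. Assumptions: v_math.h provides V_LOG_TABLE_BITS and __v_log_data as above, the input is a positive, normal, finite double (the special-case path is omitted), and plain multiplies stand in for the vector code's fmas:

#include <stdint.h>
#include <string.h>

#include "v_math.h"

double
log_model (double x)
{
  const uint64_t Off = 0x3fe6900900000000;
  const double Ln2 = 0x1.62e42fefa39efp-1;
  static const double A[5] = { -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2,
			       -0x1.0000000399c27p-2, 0x1.999b2e90e94cap-3,
			       -0x1.554e550bd501ep-3 };
  uint64_t ix, tmp, iz;
  memcpy (&ix, &x, sizeof ix);
  tmp = ix - Off;
  int64_t k = (int64_t) tmp >> 52; /* arithmetic shift extracts k */
  iz = ix - (tmp & 0xfff0000000000000); /* z = x / 2^k, z in [Off, 2*Off) */
  double z;
  memcpy (&z, &iz, sizeof z);
  uint64_t i
      = (tmp >> (52 - V_LOG_TABLE_BITS)) & ((1 << V_LOG_TABLE_BITS) - 1);
  double invc = __v_log_data.table[i].invc;
  double logc = __v_log_data.table[i].logc;
  /* log(x) = log1p(z/c - 1) + log(c) + k*ln2.  */
  double r = z * invc - 1.0; /* the vector code uses an fma here */
  double hi = k * Ln2 + logc + r;
  double r2 = r * r;
  double y = A[0] + A[1] * r + r2 * (A[2] + A[3] * r + r2 * A[4]);
  return hi + y * r2;
}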
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_log_data.c b/contrib/arm-optimized-routines/math/aarch64/v_log_data.c
new file mode 100644
index 000000000000..82351bb14766
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_log_data.c
@@ -0,0 +1,156 @@
+/*
+ * Lookup table for double-precision log(x) vector function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+#define N (1 << V_LOG_TABLE_BITS)
+
+const struct v_log_data __v_log_data = {
+ /* Algorithm:
+
+ x = 2^k z
+ log(x) = k ln2 + log(c) + poly(z/c - 1)
+
+ where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
+ N=128) and log(c) and 1/c for the ith subinterval comes from lookup tables:
+
+ table[i].invc = 1/c
+ table[i].logc = (double)log(c)
+
+ where c is near the center of the subinterval and is chosen by trying several
+ floating point invc candidates around 1/center and selecting one for which
+ the error in (double)log(c) is minimized (< 0x1p-74), except that the
+ subinterval containing 1 and the one before it were tweaked to avoid
+ cancellation. */
+ .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 },
+ { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 },
+ { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 },
+ { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 },
+ { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 },
+ { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 },
+ { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 },
+ { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 },
+ { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 },
+ { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 },
+ { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 },
+ { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 },
+ { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 },
+ { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 },
+ { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 },
+ { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 },
+ { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 },
+ { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 },
+ { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 },
+ { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 },
+ { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 },
+ { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 },
+ { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 },
+ { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 },
+ { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 },
+ { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 },
+ { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 },
+ { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 },
+ { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 },
+ { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 },
+ { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 },
+ { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 },
+ { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 },
+ { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 },
+ { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 },
+ { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 },
+ { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 },
+ { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 },
+ { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 },
+ { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 },
+ { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 },
+ { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 },
+ { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 },
+ { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 },
+ { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 },
+ { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 },
+ { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 },
+ { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 },
+ { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 },
+ { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 },
+ { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 },
+ { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 },
+ { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 },
+ { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 },
+ { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 },
+ { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 },
+ { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 },
+ { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 },
+ { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 },
+ { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 },
+ { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 },
+ { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 },
+ { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 },
+ { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 },
+ { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 },
+ { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 },
+ { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 },
+ { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 },
+ { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 },
+ { 0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 },
+ { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 },
+ { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 },
+ { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 },
+ { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 },
+ { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 },
+ { 1.0, 0.0 },
+ { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 },
+ { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 },
+ { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 },
+ { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 },
+ { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 },
+ { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 },
+ { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 },
+ { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 },
+ { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 },
+ { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 },
+ { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 },
+ { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 },
+ { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 },
+ { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 },
+ { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 },
+ { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 },
+ { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 },
+ { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 },
+ { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 },
+ { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 },
+ { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 },
+ { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 },
+ { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 },
+ { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 },
+ { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 },
+ { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 },
+ { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 },
+ { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 },
+ { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 },
+ { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 },
+ { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 },
+ { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 },
+ { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 },
+ { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 },
+ { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 },
+ { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 },
+ { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 },
+ { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 },
+ { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 },
+ { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 },
+ { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 },
+ { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 },
+ { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 },
+ { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 },
+ { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 },
+ { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 },
+ { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 },
+ { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 },
+ { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 },
+ { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 },
+ { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 },
+ { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } }
+};
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_logf.c b/contrib/arm-optimized-routines/math/aarch64/v_logf.c
new file mode 100644
index 000000000000..66ebbbcd2b5a
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_logf.c
@@ -0,0 +1,74 @@
+/*
+ * Single-precision vector log function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ uint32x4_t min_norm;
+ uint16x8_t special_bound;
+ float32x4_t poly[7];
+ float32x4_t ln2, tiny_bound;
+ uint32x4_t off, mantissa_mask;
+} data = {
+ /* 3.34 ulp error. */
+ .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
+ V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
+ V4 (-0x1.ffffc8p-2f) },
+ .ln2 = V4 (0x1.62e43p-1f),
+ .tiny_bound = V4 (0x1p-126),
+ .min_norm = V4 (0x00800000),
+ .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .mantissa_mask = V4 (0x007fffff)
+};
+
+#define P(i) d->poly[7 - i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
+ uint16x4_t cmp)
+{
+ /* Fall back to scalar code. */
+ return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, p, q, r, r2, y;
+ uint32x4_t u;
+ uint16x4_t cmp;
+
+ u = vreinterpretq_u32_f32 (x);
+ cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
+ vget_low_u16 (d->special_bound));
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ u = vsubq_u32 (u, d->off);
+ n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
+ u = vandq_u32 (u, d->mantissa_mask);
+ u = vaddq_u32 (u, d->off);
+ r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+ /* y = log(1+r) + n*ln2. */
+ r2 = vmulq_f32 (r, r);
+ /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
+ p = vfmaq_f32 (P (5), P (6), r);
+ q = vfmaq_f32 (P (3), P (4), r);
+ y = vfmaq_f32 (P (1), P (2), r);
+ p = vfmaq_f32 (p, P (7), r2);
+ q = vfmaq_f32 (q, p, r2);
+ y = vfmaq_f32 (y, q, r2);
+ p = vfmaq_f32 (r, d->ln2, n);
+
+ if (unlikely (v_any_u16h (cmp)))
+ return special_case (x, y, r2, p, cmp);
+ return vfmaq_f32 (p, y, r2);
+}
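
As an aside on the reduction above: ignoring the special-case lanes, the vector body is equivalent to the following scalar sketch (logf_sketch is a hypothetical name, the coefficients are copied from data.poly, and plain Horner stands in for the pairwise scheme):

#include <stdint.h>
#include <string.h>

static float
logf_sketch (float x)
{
  /* data.poly above, highest order first: c[0] is P7, c[6] is P1.  */
  static const float c[7]
      = { -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f,
          -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f };
  uint32_t u;
  memcpy (&u, &x, sizeof u);         /* asuint32 (x); no special cases.  */
  u -= 0x3f2aaaab;                   /* off = asuint32 (2/3).  */
  int32_t n = (int32_t) u >> 23;     /* arithmetic shift: the exponent.  */
  u = (u & 0x007fffff) + 0x3f2aaaab; /* mantissa back into [2/3, 4/3).  */
  float r;
  memcpy (&r, &u, sizeof r);
  r -= 1.0f;                         /* r in (-1/3, 1/3).  */
  float y = c[0];
  for (int i = 1; i < 7; i++)        /* y = P1 + P2*r + ... + P7*r^6.  */
    y = y * r + c[i];
  return (float) n * 0x1.62e43p-1f + r + y * r * r;
}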
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_math.h b/contrib/arm-optimized-routines/math/aarch64/v_math.h
new file mode 100644
index 000000000000..1dc9916c6fb0
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_math.h
@@ -0,0 +1,135 @@
+/*
+ * Vector math abstractions.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+#if !__aarch64__
+# error "Cannot build without AArch64"
+#endif
+
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+
+#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
+#define V_NAME_D1(fun) _ZGVnN2v_##fun
+#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
+#define V_NAME_D2(fun) _ZGVnN2vv_##fun
+
+#include <stdint.h>
+#include "../math_config.h"
+#include <arm_neon.h>
+
+/* Shorthand helpers for declaring constants. */
+# define V2(X) { X, X }
+# define V4(X) { X, X, X, X }
+# define V8(X) { X, X, X, X, X, X, X, X }
+
+static inline int
+v_any_u16h (uint16x4_t x)
+{
+ return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
+}
+
+static inline int
+v_lanes32 (void)
+{
+ return 4;
+}
+
+static inline float32x4_t
+v_f32 (float x)
+{
+ return (float32x4_t) V4 (x);
+}
+static inline uint32x4_t
+v_u32 (uint32_t x)
+{
+ return (uint32x4_t) V4 (x);
+}
+/* True if any element of a v_cond result is non-zero. */
+static inline int
+v_any_u32 (uint32x4_t x)
+{
+ /* assume elements in x are either 0 or -1u. */
+ return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
+}
+static inline int
+v_any_u32h (uint32x2_t x)
+{
+ return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;
+}
+static inline float32x4_t
+v_lookup_f32 (const float *tab, uint32x4_t idx)
+{
+ return (float32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+}
+static inline uint32x4_t
+v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
+{
+ return (uint32x4_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+}
+static inline float32x4_t
+v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
+{
+ return (float32x4_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
+ p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
+}
+static inline float32x4_t
+v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
+ float32x4_t y, uint32x4_t p)
+{
+ return (float32x4_t){p[0] ? f (x1[0], x2[0]) : y[0],
+ p[1] ? f (x1[1], x2[1]) : y[1],
+ p[2] ? f (x1[2], x2[2]) : y[2],
+ p[3] ? f (x1[3], x2[3]) : y[3]};
+}
+
+static inline int
+v_lanes64 (void)
+{
+ return 2;
+}
+static inline float64x2_t
+v_f64 (double x)
+{
+ return (float64x2_t) V2 (x);
+}
+static inline uint64x2_t
+v_u64 (uint64_t x)
+{
+ return (uint64x2_t) V2 (x);
+}
+/* True if any element of a v_cond result is non-zero. */
+static inline int
+v_any_u64 (uint64x2_t x)
+{
+ /* assume elements in x are either 0 or -1u. */
+ return vpaddd_u64 (x) != 0;
+}
+static inline float64x2_t
+v_lookup_f64 (const double *tab, uint64x2_t idx)
+{
+ return (float64x2_t){tab[idx[0]], tab[idx[1]]};
+}
+static inline uint64x2_t
+v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
+{
+ return (uint64x2_t){tab[idx[0]], tab[idx[1]]};
+}
+static inline float64x2_t
+v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
+{
+ double p1 = p[1];
+ double x1 = x[1];
+ if (likely (p[0]))
+ y[0] = f (x[0]);
+ if (likely (p1))
+ y[1] = f (x1);
+ return y;
+}
+
+#endif
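
The v_call_f32/v_call2_f32 helpers above are the generic special-case mechanism used by every kernel in this import: compute the fast path for all lanes, then redo only the flagged lanes with the scalar routine. A toy illustration of the pattern (toy_sqrtf is hypothetical; it assumes this header is on the include path):

#include <math.h>
#include "v_math.h"

static float32x4_t
toy_sqrtf (float32x4_t x)
{
  uint32x4_t special = vcltq_f32 (x, v_f32 (0.0f)); /* all-ones if x < 0.  */
  float32x4_t y = vsqrtq_f32 (x);                   /* fast path.  */
  if (unlikely (v_any_u32 (special)))
    return v_call_f32 (sqrtf, x, y, special);       /* per-lane fallback.  */
  return y;
}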
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_pow.c b/contrib/arm-optimized-routines/math/aarch64/v_pow.c
new file mode 100644
index 000000000000..734f1663a283
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_pow.c
@@ -0,0 +1,22 @@
+/*
+ * Double-precision vector pow function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
+{
+ float64x2_t z;
+ for (int lane = 0; lane < v_lanes64 (); lane++)
+ {
+ double sx = x[lane];
+ double sy = y[lane];
+ double sz = pow (sx, sy);
+ z[lane] = sz;
+ }
+ return z;
+}
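
Per the macros in v_math.h, V_NAME_D2 (pow) expands to _ZGVnN2vv_pow, the AArch64 vector-function ABI name that mathlib.h declares further down in this patch. A caller sketch (cube2 is hypothetical):

#include <arm_neon.h>

__attribute__ ((aarch64_vector_pcs)) float64x2_t
_ZGVnN2vv_pow (float64x2_t, float64x2_t);

/* Raise both lanes to the third power through the vector entry point.  */
static float64x2_t
cube2 (float64x2_t x)
{
  return _ZGVnN2vv_pow (x, vdupq_n_f64 (3.0));
}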
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_powf.c b/contrib/arm-optimized-routines/math/aarch64/v_powf.c
new file mode 100644
index 000000000000..3a4163ab0558
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_powf.c
@@ -0,0 +1,148 @@
+/*
+ * Single-precision vector powf function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+#define Min v_u32 (0x00800000)
+#define Max v_u32 (0x7f800000)
+#define Thresh v_u32 (0x7f000000) /* Max - Min. */
+#define MantissaMask v_u32 (0x007fffff)
+
+#define A data.log2_poly
+#define C data.exp2f_poly
+
+/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
+#define Off v_u32 (0x3f35d000)
+
+#define V_POWF_LOG2_TABLE_BITS 5
+#define V_EXP2F_TABLE_BITS 5
+#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1)
+#define Scale ((double) (1 << V_EXP2F_TABLE_BITS))
+
+static const struct
+{
+ struct
+ {
+ double invc, logc;
+ } log2_tab[1 << V_POWF_LOG2_TABLE_BITS];
+ double log2_poly[4];
+ uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS];
+ double exp2f_poly[3];
+} data = {
+ .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale},
+ {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale},
+ {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale},
+ {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale},
+ {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale},
+ {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale},
+ {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale},
+ {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale},
+ {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale},
+ {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale},
+ {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale},
+ {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale},
+ {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale},
+ {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale},
+ {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale},
+ {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale},
+ {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale},
+ {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale},
+ {0x1p+0, 0x0p+0 * Scale},
+ {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale},
+ {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale},
+ {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale},
+ {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale},
+ {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale},
+ {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale},
+ {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale},
+ {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale},
+ {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale},
+ {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale},
+ {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale},
+ {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale},
+ {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},},
+ .log2_poly = { /* rel err: 1.5 * 2^-30. */
+ -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale,
+ -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,},
+ .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
+ 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
+ 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
+ 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+ 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
+ 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
+ 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
+ 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+ 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
+ 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
+ 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,},
+ .exp2f_poly = { /* rel err: 1.69 * 2^-34. */
+ 0x1.c6af84b912394p-5 / Scale / Scale / Scale,
+ 0x1.ebfce50fac4f3p-3 / Scale / Scale,
+ 0x1.62e42ff0c52d6p-1 / Scale}};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp)
+{
+ return v_call2_f32 (powf, x, y, ret, cmp);
+}
+
+float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y)
+{
+ uint32x4_t u = vreinterpretq_u32_f32 (x);
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh);
+ uint32x4_t tmp = vsubq_u32 (u, Off);
+ uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
+ Log2IdxMask);
+ uint32x4_t top = vbicq_u32 (tmp, MantissaMask);
+ uint32x4_t iz = vsubq_u32 (u, top);
+ int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top),
+ 23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */
+
+ float32x4_t ret;
+ for (int lane = 0; lane < 4; lane++)
+ {
+ /* Use double precision for each lane. */
+ double invc = data.log2_tab[i[lane]].invc;
+ double logc = data.log2_tab[i[lane]].logc;
+ double z = (double) asfloat (iz[lane]);
+
+ /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
+ double r = __builtin_fma (z, invc, -1.0);
+ double y0 = logc + (double) k[lane];
+
+ /* Polynomial to approximate log1p(r)/ln2. */
+ double logx = A[0];
+ logx = r * logx + A[1];
+ logx = r * logx + A[2];
+ logx = r * logx + A[3];
+ logx = r * logx + y0;
+ double ylogx = y[lane] * logx;
+ cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff)
+ >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47
+ ? 1
+ : cmp[lane];
+
+ /* N*x = k + r with r in [-1/2, 1/2]. */
+ double kd = round (ylogx);
+ uint64_t ki = lround (ylogx);
+ r = ylogx - kd;
+
+ /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
+ uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)];
+ t += ki << (52 - V_EXP2F_TABLE_BITS);
+ double s = asdouble (t);
+ double p = C[0];
+ p = __builtin_fma (p, r, C[1]);
+ p = __builtin_fma (p, r, C[2]);
+ p = __builtin_fma (p, s * r, s);
+
+ ret[lane] = p;
+ }
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, ret, cmp);
+ return ret;
+}
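
The exponent assembly in the loop above (t += ki << (52 - V_EXP2F_TABLE_BITS)) leans on how exp2f_tab is stored: per the exp_data.c comment later in this patch, each entry has its index contribution pre-subtracted, so a single add restores the fraction and drops floor(k/N) into the exponent field. A standalone sketch with N = 32 as above (exp2_by_table is hypothetical):

#include <stdint.h>
#include <string.h>

/* Assumes tab[j] = asuint64 (2^(j/32)) - (j << 47); then adding
   ki << 47 rebuilds the fractional part and puts floor(k/32) into the
   exponent.  Negative k relies on two's-complement wraparound, exactly
   as in the kernel.  */
static double
exp2_by_table (uint64_t ki, const uint64_t tab[32])
{
  uint64_t t = tab[ki % 32];
  t += ki << (52 - 5);
  double s;
  memcpy (&s, &t, sizeof s);
  return s;
}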
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_sin.c b/contrib/arm-optimized-routines/math/aarch64/v_sin.c
new file mode 100644
index 000000000000..04129c31133d
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_sin.c
@@ -0,0 +1,97 @@
+/*
+ * Double-precision vector sin function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float64x2_t poly[7];
+ float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+} data = {
+ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+ V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+ V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+ V2 (-0x1.9e9540300a1p-41) },
+
+ .range_val = V2 (0x1p23),
+ .inv_pi = V2 (0x1.45f306dc9c883p-2),
+ .pi_1 = V2 (0x1.921fb54442d18p+1),
+ .pi_2 = V2 (0x1.1a62633145c06p-53),
+ .pi_3 = V2 (0x1.c1cd129024e09p-106),
+ .shift = V2 (0x1.8p52),
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */
+# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */
+#endif
+
+#define C(i) d->poly[i]
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (sin, x, y, cmp);
+}
+
+/* Vector (AdvSIMD) sin approximation.
+ Maximum observed error in [-pi/2, pi/2], where argument is not reduced,
+ is 2.87 ULP:
+ _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1
+ want 0x1.fffffffa7dc05p-1
+ Maximum observed error in the entire non-special domain ([-2^23, 2^23])
+ is 3.22 ULP:
+ _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3
+ want 0x1.ffdcd125c84f8p-3. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t n, r, r2, r3, r4, y, t1, t2, t3;
+ uint64x2_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be
+ triggered correctly, set any special lanes to 1 (which is neutral w.r.t.
+ fenv). These lanes will be fixed by the special-case handler later. */
+ uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
+ r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
+#else
+ r = x;
+ cmp = vcageq_f64 (x, d->range_val);
+#endif
+
+ /* n = rint(|x|/pi). */
+ n = vfmaq_f64 (d->shift, d->inv_pi, r);
+ odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
+ n = vsubq_f64 (n, d->shift);
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f64 (r, d->pi_1, n);
+ r = vfmsq_f64 (r, d->pi_2, n);
+ r = vfmsq_f64 (r, d->pi_3, n);
+
+ /* sin(r) poly approx. */
+ r2 = vmulq_f64 (r, r);
+ r3 = vmulq_f64 (r2, r);
+ r4 = vmulq_f64 (r2, r2);
+
+ t1 = vfmaq_f64 (C (4), C (5), r2);
+ t2 = vfmaq_f64 (C (2), C (3), r2);
+ t3 = vfmaq_f64 (C (0), C (1), r2);
+
+ y = vfmaq_f64 (t1, C (6), r4);
+ y = vfmaq_f64 (t2, y, r4);
+ y = vfmaq_f64 (t3, y, r4);
+ y = vfmaq_f64 (r, y, r3);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
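
A note on the odd/XOR step above: odd is extracted while n still carries the 0x1.8p52 bias, so the integer value of n occupies the low mantissa bits and bit 0 is its parity; shifting left by 63 turns that parity into a sign mask. In isolation (apply_odd_sign is hypothetical):

#include <arm_neon.h>

/* n_biased = rint(x/pi) + 0x1.8p52, still in biased form: bit 0 is the
   parity of n.  sin(x) = (-1)^n * sin(r), so XOR the sign bit in when
   n is odd.  */
static float64x2_t
apply_odd_sign (float64x2_t y, float64x2_t n_biased)
{
  uint64x2_t odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n_biased), 63);
  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
}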
diff --git a/contrib/arm-optimized-routines/math/aarch64/v_sinf.c b/contrib/arm-optimized-routines/math/aarch64/v_sinf.c
new file mode 100644
index 000000000000..336879844459
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/aarch64/v_sinf.c
@@ -0,0 +1,82 @@
+/*
+ * Single-precision vector sin function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float32x4_t poly[4];
+ float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+} data = {
+ /* 1.886 ulp error. */
+ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
+ V4 (0x1.5b2e76p-19f) },
+
+ .pi_1 = V4 (0x1.921fb6p+1f),
+ .pi_2 = V4 (-0x1.777a5cp-24f),
+ .pi_3 = V4 (-0x1.ee59dap-49f),
+
+ .inv_pi = V4 (0x1.45f306p-2f),
+ .shift = V4 (0x1.8p+23f),
+ .range_val = V4 (0x1p20f)
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */
+# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */
+#endif
+
+#define C(i) d->poly[i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (sinf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, y;
+ uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ the special-case handler later. */
+ r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
+#else
+ r = x;
+ cmp = vcageq_f32 (x, d->range_val);
+#endif
+
+ /* n = rint(|x|/pi). */
+ n = vfmaq_f32 (d->shift, d->inv_pi, r);
+ odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
+ n = vsubq_f32 (n, d->shift);
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f32 (r, d->pi_1, n);
+ r = vfmsq_f32 (r, d->pi_2, n);
+ r = vfmsq_f32 (r, d->pi_3, n);
+
+ /* y = sin(r). */
+ r2 = vmulq_f32 (r, r);
+ y = vfmaq_f32 (C (2), C (3), r2);
+ y = vfmaq_f32 (C (1), y, r2);
+ y = vfmaq_f32 (C (0), y, r2);
+ y = vfmaq_f32 (r, vmulq_f32 (y, r2), r);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
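
The shift constant above doubles as a cheap round-to-nearest: adding 0x1.8p+23f pushes the value into a binade where only integers are representable, so the FPU's nearest rounding performs rint, and subtracting the shift recovers n. A scalar sketch (rint_by_shift is hypothetical; it assumes default rounding and no fast-math reassociation):

static inline float
rint_by_shift (float z)         /* valid for |z| well below 2^22 */
{
  const float shift = 0x1.8p+23f;       /* 1.5 * 2^23 */
  return (z + shift) - shift;
}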
diff --git a/contrib/arm-optimized-routines/math/cosf.c b/contrib/arm-optimized-routines/math/cosf.c
index f29f19474e23..6293ce8f1b7d 100644
--- a/contrib/arm-optimized-routines/math/cosf.c
+++ b/contrib/arm-optimized-routines/math/cosf.c
@@ -1,8 +1,8 @@
/*
* Single-precision cos function.
*
- * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
@@ -22,7 +22,7 @@ cosf (float y)
int n;
const sincos_t *p = &__sincosf_table[0];
- if (abstop12 (y) < abstop12 (pio4))
+ if (abstop12 (y) < abstop12 (pio4f))
{
double x2 = x * x;
diff --git a/contrib/arm-optimized-routines/math/erf.c b/contrib/arm-optimized-routines/math/erf.c
index 12d7e5160df7..5f9f40dda264 100644
--- a/contrib/arm-optimized-routines/math/erf.c
+++ b/contrib/arm-optimized-routines/math/erf.c
@@ -2,7 +2,7 @@
* Double-precision erf(x) function.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
diff --git a/contrib/arm-optimized-routines/math/erf_data.c b/contrib/arm-optimized-routines/math/erf_data.c
index 807875bdd7f5..10cf1fae93e0 100644
--- a/contrib/arm-optimized-routines/math/erf_data.c
+++ b/contrib/arm-optimized-routines/math/erf_data.c
@@ -2,7 +2,7 @@
* Shared data between erf and erfc.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
diff --git a/contrib/arm-optimized-routines/math/erff.c b/contrib/arm-optimized-routines/math/erff.c
index a58e82565dc3..9fa476dbbab2 100644
--- a/contrib/arm-optimized-routines/math/erff.c
+++ b/contrib/arm-optimized-routines/math/erff.c
@@ -2,7 +2,7 @@
* Single-precision erf(x) function.
*
* Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
diff --git a/contrib/arm-optimized-routines/math/erff_data.c b/contrib/arm-optimized-routines/math/erff_data.c
index fa6b1ef4dedb..f822788d0dd8 100644
--- a/contrib/arm-optimized-routines/math/erff_data.c
+++ b/contrib/arm-optimized-routines/math/erff_data.c
@@ -2,7 +2,7 @@
* Data for approximation of erff.
*
* Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
diff --git a/contrib/arm-optimized-routines/math/exp.c b/contrib/arm-optimized-routines/math/exp.c
index 7f5024cd8792..1de500c31f3e 100644
--- a/contrib/arm-optimized-routines/math/exp.c
+++ b/contrib/arm-optimized-routines/math/exp.c
@@ -2,7 +2,7 @@
* Double-precision e^x function.
*
* Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <float.h>
diff --git a/contrib/arm-optimized-routines/math/exp10.c b/contrib/arm-optimized-routines/math/exp10.c
new file mode 100644
index 000000000000..0fbec4c694ca
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/exp10.c
@@ -0,0 +1,129 @@
+/*
+ * Double-precision 10^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define N (1 << EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */
+#define UFlowBound -0x1.5ep+8 /* -350. */
+#define SmallTop 0x3c6 /* top12(0x1p-57). */
+#define BigTop 0x407 /* top12(0x1p8). */
+#define Thresh 0x41 /* BigTop - SmallTop. */
+#define Shift __exp_data.shift
+#define C(i) __exp_data.exp10_poly[i]
+
+static double
+special_case (uint64_t sbits, double_t tmp, uint64_t ki)
+{
+ double_t scale, y;
+
+ if (ki - (1ull << 16) < 0x80000000)
+ {
+ /* The exponent of scale might have overflowed by 1. */
+ sbits -= 1ull << 52;
+ scale = asdouble (sbits);
+ y = 2 * (scale + scale * tmp);
+ return check_oflow (eval_as_double (y));
+ }
+
+ /* n < 0, need special care in the subnormal range. */
+ sbits += 1022ull << 52;
+ scale = asdouble (sbits);
+ y = scale + scale * tmp;
+
+ if (y < 1.0)
+ {
+ /* Round y to the right precision before scaling it into the subnormal
+ range to avoid double rounding that can cause 0.5+E/2 ulp error where
+ E is the worst-case ulp error outside the subnormal range. So this
+ is only useful if the goal is better than 1 ulp worst-case error. */
+ double_t lo = scale - y + scale * tmp;
+ double_t hi = 1.0 + y;
+ lo = 1.0 - hi + y + lo;
+ y = eval_as_double (hi + lo) - 1.0;
+ /* Avoid -0.0 with downward rounding. */
+ if (WANT_ROUNDING && y == 0.0)
+ y = 0.0;
+ /* The underflow exception needs to be signaled explicitly. */
+ force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
+ }
+ y = 0x1p-1022 * y;
+
+ return check_uflow (y);
+}
+
+/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP. */
+double
+exp10 (double x)
+{
+ uint64_t ix = asuint64 (x);
+ uint32_t abstop = (ix >> 52) & 0x7ff;
+
+ if (unlikely (abstop - SmallTop >= Thresh))
+ {
+ if (abstop - SmallTop >= 0x80000000)
+ /* Avoid spurious underflow for tiny x.
+ Note: 0 is common input. */
+ return x + 1;
+ if (abstop == 0x7ff)
+ return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0;
+ if (x >= OFlowBound)
+ return __math_oflow (0);
+ if (x < UFlowBound)
+ return __math_uflow (0);
+
+ /* Large x is special-cased below. */
+ abstop = 0;
+ }
+
+ /* Reduce x: z = x * N / log10(2), k = round(z). */
+ double_t z = __exp_data.invlog10_2N * x;
+ double_t kd;
+ int64_t ki;
+#if TOINT_INTRINSICS
+ kd = roundtoint (z);
+ ki = converttoint (z);
+#else
+ kd = eval_as_double (z + Shift);
+ kd -= Shift;
+ ki = kd;
+#endif
+
+ /* r = x - k * log10(2), r in [-0.5, 0.5]. */
+ double_t r = x;
+ r = __exp_data.neglog10_2hiN * kd + r;
+ r = __exp_data.neglog10_2loN * kd + r;
+
+ /* exp10(x) = 2^(k/N) * 2^(r/N).
+ Approximate the two components separately. */
+
+ /* s = 2^(k/N), using lookup table. */
+ uint64_t e = ki << (52 - EXP_TABLE_BITS);
+ uint64_t i = (ki & IndexMask) * 2;
+ uint64_t u = __exp_data.tab[i + 1];
+ uint64_t sbits = u + e;
+
+ double_t tail = asdouble (__exp_data.tab[i]);
+
+ /* 2^(r/N) ~= 1 + r * Poly(r). */
+ double_t r2 = r * r;
+ double_t p = C (0) + r * C (1);
+ double_t y = C (2) + r * C (3);
+ y = y + r2 * C (4);
+ y = p + r2 * y;
+ y = tail + y * r;
+
+ if (unlikely (abstop == 0))
+ return special_case (sbits, y, ki);
+
+ /* Assemble components:
+ y = 2^(r/N) * 2^(k/N)
+ ~= (y + 1) * s. */
+ double_t s = asdouble (sbits);
+ return eval_as_double (s * y + s);
+}
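
Pulled out on its own, the reduction at the top of exp10 looks as follows (exp10_reduce is a hypothetical standalone form; the constants are copied from the exp_data.c hunk later in this patch, with N = 128 as upstream):

#include <math.h>
#include <stdint.h>

/* After this, 10^x = 2^(*ki / 128) * 10^(*r) with |r| <= log10(2)/256.  */
static void
exp10_reduce (double x, int64_t *ki, double *r)
{
  const double invlog10_2N = 0x1.a934f0979a371p1 * 128; /* N/log10(2).  */
  const double neglog10_2hiN = -0x1.3441350ap-2 / 128;
  const double neglog10_2loN = 0x1.0c0219dc1da99p-39 / 128;
  double kd = round (x * invlog10_2N);
  *ki = (int64_t) kd;
  /* Two-term Cody-Waite subtraction, evaluated left to right.  */
  *r = x + kd * neglog10_2hiN + kd * neglog10_2loN;
}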
diff --git a/contrib/arm-optimized-routines/math/exp2.c b/contrib/arm-optimized-routines/math/exp2.c
index 35ab39f22ed5..a1eee44f1f48 100644
--- a/contrib/arm-optimized-routines/math/exp2.c
+++ b/contrib/arm-optimized-routines/math/exp2.c
@@ -2,7 +2,7 @@
* Double-precision 2^x function.
*
* Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <float.h>
diff --git a/contrib/arm-optimized-routines/math/exp2f.c b/contrib/arm-optimized-routines/math/exp2f.c
index 94b32538aa0d..776c3ddf7663 100644
--- a/contrib/arm-optimized-routines/math/exp2f.c
+++ b/contrib/arm-optimized-routines/math/exp2f.c
@@ -2,7 +2,7 @@
* Single-precision 2^x function.
*
* Copyright (c) 2017-2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <math.h>
diff --git a/contrib/arm-optimized-routines/math/exp2f_data.c b/contrib/arm-optimized-routines/math/exp2f_data.c
index 3fb0ad11b15a..f0cb7fccacd1 100644
--- a/contrib/arm-optimized-routines/math/exp2f_data.c
+++ b/contrib/arm-optimized-routines/math/exp2f_data.c
@@ -2,7 +2,7 @@
* Shared data between expf, exp2f and powf.
*
* Copyright (c) 2017-2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
diff --git a/contrib/arm-optimized-routines/math/exp_data.c b/contrib/arm-optimized-routines/math/exp_data.c
index cba76832566f..c20b1b2d3e06 100644
--- a/contrib/arm-optimized-routines/math/exp_data.c
+++ b/contrib/arm-optimized-routines/math/exp_data.c
@@ -1,8 +1,8 @@
/*
* Shared data between exp, exp2 and pow.
*
- * Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
@@ -12,6 +12,7 @@
const struct exp_data __exp_data = {
// N/ln2
.invln2N = 0x1.71547652b82fep0 * N,
+.invlog10_2N = 0x1.a934f0979a371p1 * N,
// -ln2/N
#if N == 64
.negln2hiN = -0x1.62e42fefa0000p-7,
@@ -26,6 +27,8 @@ const struct exp_data __exp_data = {
.negln2hiN = -0x1.62e42fef80000p-10,
.negln2loN = -0x1.1cf79abc9e3b4p-45,
#endif
+.neglog10_2hiN = -0x1.3441350ap-2 / N,
+.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N,
// Used for rounding when !TOINT_INTRINSICS
#if EXP_USE_TOINT_NARROW
.shift = 0x1800000000.8p0,
@@ -147,6 +150,24 @@ const struct exp_data __exp_data = {
0x1.3b2ab786ee1dap-7,
#endif
},
+.exp10_poly = {
+#if EXP10_POLY_WIDE
+/* The range is wider when using shift-based reduction: coeffs generated
+ using Remez in [-log10(2)/128, log10(2)/128]. */
+0x1.26bb1bbb55515p1,
+0x1.53524c73cd32bp1,
+0x1.0470591e1a108p1,
+0x1.2bd77b12fe9a8p0,
+0x1.14289fef24b78p-1
+#else
+/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256]. */
+0x1.26bb1bbb55516p1,
+0x1.53524c73ce9fep1,
+0x1.0470591ce4b26p1,
+0x1.2bd76577fe684p0,
+0x1.1446eeccd0efbp-1
+#endif
+},
// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
// tab[2*k] = asuint64(T[k])
// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
diff --git a/contrib/arm-optimized-routines/math/expf.c b/contrib/arm-optimized-routines/math/expf.c
index 9b2f0c3d8c56..08a20d59e491 100644
--- a/contrib/arm-optimized-routines/math/expf.c
+++ b/contrib/arm-optimized-routines/math/expf.c
@@ -2,7 +2,7 @@
* Single-precision e^x function.
*
* Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <math.h>
diff --git a/contrib/arm-optimized-routines/math/include/mathlib.h b/contrib/arm-optimized-routines/math/include/mathlib.h
index 279d829d8ea1..64cbb9c1f850 100644
--- a/contrib/arm-optimized-routines/math/include/mathlib.h
+++ b/contrib/arm-optimized-routines/math/include/mathlib.h
@@ -1,8 +1,8 @@
/*
* Public API.
*
- * Copyright (c) 2015-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2015-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef _MATHLIB_H
@@ -18,74 +18,33 @@ float cosf (float);
void sincosf (float, float*, float*);
double exp (double);
+double exp10 (double);
double exp2 (double);
double log (double);
double log2 (double);
double pow (double, double);
-/* Scalar functions using the vector algorithm with identical result. */
-float __s_sinf (float);
-float __s_cosf (float);
-float __s_expf (float);
-float __s_expf_1u (float);
-float __s_exp2f (float);
-float __s_exp2f_1u (float);
-float __s_logf (float);
-float __s_powf (float, float);
-double __s_sin (double);
-double __s_cos (double);
-double __s_exp (double);
-double __s_log (double);
-double __s_pow (double, double);
-
#if __aarch64__
-#if __GNUC__ >= 5
+# if __GNUC__ >= 5
typedef __Float32x4_t __f32x4_t;
typedef __Float64x2_t __f64x2_t;
-#elif __clang_major__*100+__clang_minor__ >= 305
+# elif __clang_major__*100+__clang_minor__ >= 305
typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t;
typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
-#else
-#error Unsupported compiler
-#endif
-
-/* Vector functions following the base PCS. */
-__f32x4_t __v_sinf (__f32x4_t);
-__f32x4_t __v_cosf (__f32x4_t);
-__f32x4_t __v_expf (__f32x4_t);
-__f32x4_t __v_expf_1u (__f32x4_t);
-__f32x4_t __v_exp2f (__f32x4_t);
-__f32x4_t __v_exp2f_1u (__f32x4_t);
-__f32x4_t __v_logf (__f32x4_t);
-__f32x4_t __v_powf (__f32x4_t, __f32x4_t);
-__f64x2_t __v_sin (__f64x2_t);
-__f64x2_t __v_cos (__f64x2_t);
-__f64x2_t __v_exp (__f64x2_t);
-__f64x2_t __v_log (__f64x2_t);
-__f64x2_t __v_pow (__f64x2_t, __f64x2_t);
+# else
+# error Unsupported compiler
+# endif
-#if __GNUC__ >= 9 || __clang_major__ >= 8
-#define __vpcs __attribute__((__aarch64_vector_pcs__))
-
-/* Vector functions following the vector PCS. */
-__vpcs __f32x4_t __vn_sinf (__f32x4_t);
-__vpcs __f32x4_t __vn_cosf (__f32x4_t);
-__vpcs __f32x4_t __vn_expf (__f32x4_t);
-__vpcs __f32x4_t __vn_expf_1u (__f32x4_t);
-__vpcs __f32x4_t __vn_exp2f (__f32x4_t);
-__vpcs __f32x4_t __vn_exp2f_1u (__f32x4_t);
-__vpcs __f32x4_t __vn_logf (__f32x4_t);
-__vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t);
-__vpcs __f64x2_t __vn_sin (__f64x2_t);
-__vpcs __f64x2_t __vn_cos (__f64x2_t);
-__vpcs __f64x2_t __vn_exp (__f64x2_t);
-__vpcs __f64x2_t __vn_log (__f64x2_t);
-__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t);
+# if __GNUC__ >= 9 || __clang_major__ >= 8
+# undef __vpcs
+# define __vpcs __attribute__((__aarch64_vector_pcs__))
/* Vector functions following the vector PCS using ABI names. */
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t);
@@ -94,7 +53,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
-#endif
+# endif
#endif
#endif
diff --git a/contrib/arm-optimized-routines/math/log.c b/contrib/arm-optimized-routines/math/log.c
index d3b7bc60747c..43dfc2a744f0 100644
--- a/contrib/arm-optimized-routines/math/log.c
+++ b/contrib/arm-optimized-routines/math/log.c
@@ -2,7 +2,7 @@
* Double-precision log(x) function.
*
* Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <float.h>
diff --git a/contrib/arm-optimized-routines/math/log2.c b/contrib/arm-optimized-routines/math/log2.c
index 55102b772969..3f9c21b03962 100644
--- a/contrib/arm-optimized-routines/math/log2.c
+++ b/contrib/arm-optimized-routines/math/log2.c
@@ -2,7 +2,7 @@
* Double-precision log2(x) function.
*
* Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <float.h>
diff --git a/contrib/arm-optimized-routines/math/log2_data.c b/contrib/arm-optimized-routines/math/log2_data.c
index 3fc9b47c1f03..293bd7df4118 100644
--- a/contrib/arm-optimized-routines/math/log2_data.c
+++ b/contrib/arm-optimized-routines/math/log2_data.c
@@ -2,7 +2,7 @@
* Data for log2.
*
* Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
diff --git a/contrib/arm-optimized-routines/math/log2f.c b/contrib/arm-optimized-routines/math/log2f.c
index acb629e6846c..0a44fa2024f6 100644
--- a/contrib/arm-optimized-routines/math/log2f.c
+++ b/contrib/arm-optimized-routines/math/log2f.c
@@ -2,7 +2,7 @@
* Single-precision log2 function.
*
* Copyright (c) 2017-2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <math.h>
diff --git a/contrib/arm-optimized-routines/math/log2f_data.c b/contrib/arm-optimized-routines/math/log2f_data.c
index f3546d730aba..4866ef7f8171 100644
--- a/contrib/arm-optimized-routines/math/log2f_data.c
+++ b/contrib/arm-optimized-routines/math/log2f_data.c
@@ -2,7 +2,7 @@
* Data definition for log2f.
*
* Copyright (c) 2017-2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
diff --git a/contrib/arm-optimized-routines/math/log_data.c b/contrib/arm-optimized-routines/math/log_data.c
index 96a098d42c16..3ecc1f40a822 100644
--- a/contrib/arm-optimized-routines/math/log_data.c
+++ b/contrib/arm-optimized-routines/math/log_data.c
@@ -2,7 +2,7 @@
* Data for log.
*
* Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
diff --git a/contrib/arm-optimized-routines/math/logf.c b/contrib/arm-optimized-routines/math/logf.c
index cfbaee12df10..820f74c3e66a 100644
--- a/contrib/arm-optimized-routines/math/logf.c
+++ b/contrib/arm-optimized-routines/math/logf.c
@@ -1,8 +1,8 @@
/*
* Single-precision log function.
*
- * Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2017-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <math.h>
@@ -57,7 +57,7 @@ logf (float x)
tmp = ix - OFF;
i = (tmp >> (23 - LOGF_TABLE_BITS)) % N;
k = (int32_t) tmp >> 23; /* arithmetic shift */
- iz = ix - (tmp & 0x1ff << 23);
+ iz = ix - (tmp & 0xff800000);
invc = T[i].invc;
logc = T[i].logc;
z = (double_t) asfloat (iz);
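
Both forms of the mask above are the same bit pattern; the likely motivation for the change is that shifting the signed literal 0x1ff left by 23 overflows int, which is undefined behaviour in C, while the plain unsigned constant is not. A one-line check (the cast makes the shift well defined):

#include <stdint.h>

_Static_assert (0xff800000u == ((uint32_t) 0x1ff << 23),
                "sign+exponent mask forms agree");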
diff --git a/contrib/arm-optimized-routines/math/logf_data.c b/contrib/arm-optimized-routines/math/logf_data.c
index e8973ce4fedc..04247684755f 100644
--- a/contrib/arm-optimized-routines/math/logf_data.c
+++ b/contrib/arm-optimized-routines/math/logf_data.c
@@ -2,7 +2,7 @@
* Data definition for logf.
*
* Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
diff --git a/contrib/arm-optimized-routines/math/math_config.h b/contrib/arm-optimized-routines/math/math_config.h
index e85104337048..faf77b31fc99 100644
--- a/contrib/arm-optimized-routines/math/math_config.h
+++ b/contrib/arm-optimized-routines/math/math_config.h
@@ -1,8 +1,8 @@
/*
* Configuration for math routines.
*
- * Copyright (c) 2017-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2017-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef _MATH_CONFIG_H
@@ -92,6 +92,46 @@
# define unlikely(x) (x)
#endif
+/* Return ptr but hide its value from the compiler so accesses through it
+ cannot be optimized based on the contents. */
+#define ptr_barrier(ptr) \
+ ({ \
+ __typeof (ptr) __ptr = (ptr); \
+ __asm("" : "+r"(__ptr)); \
+ __ptr; \
+ })
+
+/* Symbol renames to avoid libc conflicts. */
+#define __math_oflowf arm_math_oflowf
+#define __math_uflowf arm_math_uflowf
+#define __math_may_uflowf arm_math_may_uflowf
+#define __math_divzerof arm_math_divzerof
+#define __math_oflow arm_math_oflow
+#define __math_uflow arm_math_uflow
+#define __math_may_uflow arm_math_may_uflow
+#define __math_divzero arm_math_divzero
+#define __math_invalidf arm_math_invalidf
+#define __math_invalid arm_math_invalid
+#define __math_check_oflow arm_math_check_oflow
+#define __math_check_uflow arm_math_check_uflow
+#define __math_check_oflowf arm_math_check_oflowf
+#define __math_check_uflowf arm_math_check_uflowf
+
+#define __sincosf_table arm_math_sincosf_table
+#define __inv_pio4 arm_math_inv_pio4
+#define __exp2f_data arm_math_exp2f_data
+#define __logf_data arm_math_logf_data
+#define __log2f_data arm_math_log2f_data
+#define __powf_log2_data arm_math_powf_log2_data
+#define __exp_data arm_math_exp_data
+#define __log_data arm_math_log_data
+#define __log2_data arm_math_log2_data
+#define __pow_log_data arm_math_pow_log_data
+#define __erff_data arm_math_erff_data
+#define __erf_data arm_math_erf_data
+#define __v_exp_data arm_math_v_exp_data
+#define __v_log_data arm_math_v_log_data
+
#if HAVE_FAST_ROUND
/* When set, the roundtoint and converttoint functions are provided with
the semantics documented below. */
@@ -381,15 +421,22 @@ extern const struct powf_log2_data
#define EXP_USE_TOINT_NARROW 0
#define EXP2_POLY_ORDER 5
#define EXP2_POLY_WIDE 0
+/* A wider exp10 polynomial is necessary for good precision in non-nearest
+ rounding and when !TOINT_INTRINSICS. */
+#define EXP10_POLY_WIDE 0
extern const struct exp_data
{
double invln2N;
+ double invlog10_2N;
double shift;
double negln2hiN;
double negln2loN;
+ double neglog10_2hiN;
+ double neglog10_2loN;
double poly[4]; /* Last four coefficients. */
double exp2_shift;
double exp2_poly[EXP2_POLY_ORDER];
+ double exp10_poly[5];
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
} __exp_data HIDDEN;
@@ -459,4 +506,16 @@ extern const struct erf_data
double erfc_poly_F[ERFC_POLY_F_NCOEFFS];
} __erf_data HIDDEN;
+#define V_EXP_TABLE_BITS 7
+extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
+
+#define V_LOG_TABLE_BITS 7
+extern const struct v_log_data
+{
+ struct
+ {
+ double invc, logc;
+ } table[1 << V_LOG_TABLE_BITS];
+} __v_log_data HIDDEN;
+
#endif
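
The ptr_barrier macro above is why each vector kernel opens with `const struct data *d = ptr_barrier (&data);`: the empty asm makes the pointer opaque, so the compiler cannot constant-fold loads from the table. A minimal illustration (names hypothetical):

static const double table[2] = { 1.0, 2.0 };

double
load_first (void)
{
  /* Without the barrier this may fold to `return 1.0;`; with it, the
     load from .rodata stays a real load.  */
  const double *p = ptr_barrier (&table[0]);
  return p[0];
}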
diff --git a/contrib/arm-optimized-routines/math/math_err.c b/contrib/arm-optimized-routines/math/math_err.c
index 1bf9538a1ab1..cfe072809cf4 100644
--- a/contrib/arm-optimized-routines/math/math_err.c
+++ b/contrib/arm-optimized-routines/math/math_err.c
@@ -2,7 +2,7 @@
* Double-precision math error handling.
*
* Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
diff --git a/contrib/arm-optimized-routines/math/math_errf.c b/contrib/arm-optimized-routines/math/math_errf.c
index d5350b819ab1..4233918b1eae 100644
--- a/contrib/arm-optimized-routines/math/math_errf.c
+++ b/contrib/arm-optimized-routines/math/math_errf.c
@@ -2,7 +2,7 @@
* Single-precision math error handling.
*
* Copyright (c) 2017-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
diff --git a/contrib/arm-optimized-routines/math/pow.c b/contrib/arm-optimized-routines/math/pow.c
index 86842c6abacd..af719fe5ab10 100644
--- a/contrib/arm-optimized-routines/math/pow.c
+++ b/contrib/arm-optimized-routines/math/pow.c
@@ -2,7 +2,7 @@
* Double-precision x^y function.
*
* Copyright (c) 2018-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <float.h>
diff --git a/contrib/arm-optimized-routines/math/pow_log_data.c b/contrib/arm-optimized-routines/math/pow_log_data.c
index 45569c5cc064..2a4c250d85c3 100644
--- a/contrib/arm-optimized-routines/math/pow_log_data.c
+++ b/contrib/arm-optimized-routines/math/pow_log_data.c
@@ -2,7 +2,7 @@
* Data for the log part of pow.
*
* Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
diff --git a/contrib/arm-optimized-routines/math/powf.c b/contrib/arm-optimized-routines/math/powf.c
index 6ba45d3852a5..05c80bb2eb67 100644
--- a/contrib/arm-optimized-routines/math/powf.c
+++ b/contrib/arm-optimized-routines/math/powf.c
@@ -2,7 +2,7 @@
* Single-precision pow function.
*
* Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <math.h>
diff --git a/contrib/arm-optimized-routines/math/powf_log2_data.c b/contrib/arm-optimized-routines/math/powf_log2_data.c
index 97e0d98cdbab..243836a549fd 100644
--- a/contrib/arm-optimized-routines/math/powf_log2_data.c
+++ b/contrib/arm-optimized-routines/math/powf_log2_data.c
@@ -2,7 +2,7 @@
* Data definition for powf.
*
* Copyright (c) 2017-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
diff --git a/contrib/arm-optimized-routines/math/s_cos.c b/contrib/arm-optimized-routines/math/s_cos.c
deleted file mode 100644
index 53a95b0adfde..000000000000
--- a/contrib/arm-optimized-routines/math/s_cos.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#define SCALAR 1
-#include "v_cos.c"
diff --git a/contrib/arm-optimized-routines/math/s_cosf.c b/contrib/arm-optimized-routines/math/s_cosf.c
deleted file mode 100644
index 914c02eba651..000000000000
--- a/contrib/arm-optimized-routines/math/s_cosf.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#define SCALAR 1
-#include "v_cosf.c"
diff --git a/contrib/arm-optimized-routines/math/s_exp.c b/contrib/arm-optimized-routines/math/s_exp.c
deleted file mode 100644
index ac7246b2c100..000000000000
--- a/contrib/arm-optimized-routines/math/s_exp.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#define SCALAR 1
-#include "v_exp.c"
diff --git a/contrib/arm-optimized-routines/math/s_exp2f.c b/contrib/arm-optimized-routines/math/s_exp2f.c
deleted file mode 100644
index df7dfd680ff4..000000000000
--- a/contrib/arm-optimized-routines/math/s_exp2f.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#define SCALAR 1
-#include "v_exp2f.c"
diff --git a/contrib/arm-optimized-routines/math/s_exp2f_1u.c b/contrib/arm-optimized-routines/math/s_exp2f_1u.c
deleted file mode 100644
index 5e3852b41d83..000000000000
--- a/contrib/arm-optimized-routines/math/s_exp2f_1u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#define SCALAR 1
-#include "v_exp2f_1u.c"
diff --git a/contrib/arm-optimized-routines/math/s_expf.c b/contrib/arm-optimized-routines/math/s_expf.c
deleted file mode 100644
index 3492c460733d..000000000000
--- a/contrib/arm-optimized-routines/math/s_expf.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#define SCALAR 1
-#include "v_expf.c"
diff --git a/contrib/arm-optimized-routines/math/s_expf_1u.c b/contrib/arm-optimized-routines/math/s_expf_1u.c
deleted file mode 100644
index eb7bbcba5566..000000000000
--- a/contrib/arm-optimized-routines/math/s_expf_1u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#define SCALAR 1
-#include "v_expf_1u.c"
diff --git a/contrib/arm-optimized-routines/math/s_log.c b/contrib/arm-optimized-routines/math/s_log.c
deleted file mode 100644
index 23289cf948ec..000000000000
--- a/contrib/arm-optimized-routines/math/s_log.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#define SCALAR 1
-#include "v_log.c"
diff --git a/contrib/arm-optimized-routines/math/s_logf.c b/contrib/arm-optimized-routines/math/s_logf.c
deleted file mode 100644
index 9399350fc1ee..000000000000
--- a/contrib/arm-optimized-routines/math/s_logf.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#define SCALAR 1
-#include "v_logf.c"
diff --git a/contrib/arm-optimized-routines/math/s_pow.c b/contrib/arm-optimized-routines/math/s_pow.c
deleted file mode 100644
index 2e34c9f896d6..000000000000
--- a/contrib/arm-optimized-routines/math/s_pow.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#define SCALAR 1
-#include "v_pow.c"
diff --git a/contrib/arm-optimized-routines/math/s_powf.c b/contrib/arm-optimized-routines/math/s_powf.c
deleted file mode 100644
index 6d91a4a72b37..000000000000
--- a/contrib/arm-optimized-routines/math/s_powf.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#define SCALAR 1
-#include "v_powf.c"
diff --git a/contrib/arm-optimized-routines/math/s_sin.c b/contrib/arm-optimized-routines/math/s_sin.c
deleted file mode 100644
index 06982c2018c6..000000000000
--- a/contrib/arm-optimized-routines/math/s_sin.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#define SCALAR 1
-#include "v_sin.c"
diff --git a/contrib/arm-optimized-routines/math/s_sinf.c b/contrib/arm-optimized-routines/math/s_sinf.c
deleted file mode 100644
index 68ca90853736..000000000000
--- a/contrib/arm-optimized-routines/math/s_sinf.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#define SCALAR 1
-#include "v_sinf.c"
diff --git a/contrib/arm-optimized-routines/math/sincosf.c b/contrib/arm-optimized-routines/math/sincosf.c
index 9746f1c22e6c..446f21d60faf 100644
--- a/contrib/arm-optimized-routines/math/sincosf.c
+++ b/contrib/arm-optimized-routines/math/sincosf.c
@@ -1,8 +1,8 @@
/*
* Single-precision sin/cos function.
*
- * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
@@ -22,7 +22,7 @@ sincosf (float y, float *sinp, float *cosp)
int n;
const sincos_t *p = &__sincosf_table[0];
- if (abstop12 (y) < abstop12 (pio4))
+ if (abstop12 (y) < abstop12 (pio4f))
{
double x2 = x * x;
diff --git a/contrib/arm-optimized-routines/math/sincosf.h b/contrib/arm-optimized-routines/math/sincosf.h
index 1e80fc9ba8e1..ec23ed7aeb26 100644
--- a/contrib/arm-optimized-routines/math/sincosf.h
+++ b/contrib/arm-optimized-routines/math/sincosf.h
@@ -1,8 +1,8 @@
/*
* Header for sinf, cosf and sincosf.
*
- * Copyright (c) 2018, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
@@ -12,7 +12,7 @@
/* 2PI * 2^-64. */
static const double pi63 = 0x1.921FB54442D18p-62;
/* PI / 4. */
-static const double pio4 = 0x1.921FB54442D18p-1;
+static const float pio4f = 0x1.921FB6p-1f;
/* The constants and polynomials for sine and cosine. */
typedef struct
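
abstop12, defined elsewhere in sincosf.h, compares magnitudes via the top bits of the representation; switching the constant to the float pio4f keeps that comparison within a single format. A hedged sketch of such a helper (abstop12_sketch is an assumed definition, not a quote of the header):

#include <stdint.h>
#include <string.h>

/* Top 12 bits of the float representation with the sign cleared, so
   abstop12 (y) < abstop12 (pio4f) tests |y| < pi/4 with integer
   compares only.  */
static inline uint32_t
abstop12_sketch (float x)
{
  uint32_t u;
  memcpy (&u, &x, sizeof u);
  return (u >> 20) & 0x7ff;
}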
diff --git a/contrib/arm-optimized-routines/math/sincosf_data.c b/contrib/arm-optimized-routines/math/sincosf_data.c
index ab4ac4710fef..22525290ab08 100644
--- a/contrib/arm-optimized-routines/math/sincosf_data.c
+++ b/contrib/arm-optimized-routines/math/sincosf_data.c
@@ -2,7 +2,7 @@
* Data definition for sinf, cosf and sincosf.
*
* Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdint.h>
diff --git a/contrib/arm-optimized-routines/math/sinf.c b/contrib/arm-optimized-routines/math/sinf.c
index ddbc1daf74a9..8dd8ae458794 100644
--- a/contrib/arm-optimized-routines/math/sinf.c
+++ b/contrib/arm-optimized-routines/math/sinf.c
@@ -1,8 +1,8 @@
/*
* Single-precision sin function.
*
- * Copyright (c) 2018-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2021, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <math.h>
@@ -21,7 +21,7 @@ sinf (float y)
int n;
const sincos_t *p = &__sincosf_table[0];
- if (abstop12 (y) < abstop12 (pio4))
+ if (abstop12 (y) < abstop12 (pio4f))
{
s = x * x;
diff --git a/contrib/arm-optimized-routines/math/test/mathbench.c b/contrib/arm-optimized-routines/math/test/mathbench.c
index 0c17826e5296..ed7e89bb7710 100644
--- a/contrib/arm-optimized-routines/math/test/mathbench.c
+++ b/contrib/arm-optimized-routines/math/test/mathbench.c
@@ -1,8 +1,8 @@
/*
* Microbenchmark for math functions.
*
- * Copyright (c) 2018-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2018-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#undef _GNU_SOURCE
@@ -15,11 +15,6 @@
#include <math.h>
#include "mathlib.h"
-#ifndef WANT_VMATH
-/* Enable the build of vector math code. */
-# define WANT_VMATH 1
-#endif
-
/* Number of measurements, best result is reported. */
#define MEASURE 60
/* Array size. */
@@ -34,8 +29,9 @@ static float Af[N];
static long measurecount = MEASURE;
static long itercount = ITER;
-#if __aarch64__ && WANT_VMATH
-typedef __f64x2_t v_double;
+#ifdef __vpcs
+#include <arm_neon.h>
+typedef float64x2_t v_double;
#define v_double_len() 2
@@ -51,7 +47,7 @@ v_double_dup (double x)
return (v_double){x, x};
}
-typedef __f32x4_t v_float;
+typedef float32x4_t v_float;
#define v_float_len() 4
@@ -76,141 +72,91 @@ typedef float v_float;
#define v_float_len(x) 1
#define v_float_load(x) (x)[0]
#define v_float_dup(x) (x)
-#endif
-
-static double
-dummy (double x)
-{
- return x;
-}
-
-static float
-dummyf (float x)
-{
- return x;
-}
-
-#if WANT_VMATH
-#if __aarch64__
-static v_double
-__v_dummy (v_double x)
-{
- return x;
-}
-static v_float
-__v_dummyf (v_float x)
-{
- return x;
-}
-
-#ifdef __vpcs
-__vpcs static v_double
-__vn_dummy (v_double x)
-{
- return x;
-}
+#endif
-__vpcs static v_float
-__vn_dummyf (v_float x)
-{
- return x;
-}
+#if WANT_SVE_MATH
+#include <arm_sve.h>
+typedef svbool_t sv_bool;
+typedef svfloat64_t sv_double;
-__vpcs static v_float
-xy__vn_powf (v_float x)
-{
- return __vn_powf (x, x);
-}
+#define sv_double_len() svcntd()
-__vpcs static v_float
-xy_Z_powf (v_float x)
+static inline sv_double
+sv_double_load (const double *p)
{
- return _ZGVnN4vv_powf (x, x);
+ svbool_t pg = svptrue_b64();
+ return svld1(pg, p);
}
-__vpcs static v_double
-xy__vn_pow (v_double x)
+static inline sv_double
+sv_double_dup (double x)
{
- return __vn_pow (x, x);
+ return svdup_n_f64(x);
}
-__vpcs static v_double
-xy_Z_pow (v_double x)
-{
- return _ZGVnN2vv_pow (x, x);
-}
-#endif
+typedef svfloat32_t sv_float;
-static v_float
-xy__v_powf (v_float x)
-{
- return __v_powf (x, x);
-}
+#define sv_float_len() svcntw()
-static v_double
-xy__v_pow (v_double x)
+static inline sv_float
+sv_float_load (const float *p)
{
- return __v_pow (x, x);
+ svbool_t pg = svptrue_b32();
+ return svld1(pg, p);
}
-#endif
-static float
-xy__s_powf (float x)
+static inline sv_float
+sv_float_dup (float x)
{
- return __s_powf (x, x);
-}
-
-static double
-xy__s_pow (double x)
-{
- return __s_pow (x, x);
+ return svdup_n_f32(x);
}
+#else
+/* dummy definitions to make things compile. */
+#define sv_double_len(x) 1
+#define sv_float_len(x) 1
#endif
static double
-xypow (double x)
+dummy (double x)
{
- return pow (x, x);
+ return x;
}
static float
-xypowf (float x)
+dummyf (float x)
{
- return powf (x, x);
+ return x;
}
-
-static double
-xpow (double x)
+#ifdef __vpcs
+__vpcs static v_double
+__vn_dummy (v_double x)
{
- return pow (x, 23.4);
+ return x;
}
-static float
-xpowf (float x)
+__vpcs static v_float
+__vn_dummyf (v_float x)
{
- return powf (x, 23.4f);
+ return x;
}
-
-static double
-ypow (double x)
+#endif
+#if WANT_SVE_MATH
+static sv_double
+__sv_dummy (sv_double x, sv_bool pg)
{
- return pow (2.34, x);
+ return x;
}
-static float
-ypowf (float x)
+static sv_float
+__sv_dummyf (sv_float x, sv_bool pg)
{
- return powf (2.34f, x);
+ return x;
}
-static float
-sincosf_wrap (float x)
-{
- float s, c;
- sincosf (x, &s, &c);
- return s + c;
-}
+#endif
+
+#include "test/mathbench_wrappers.h"
static const struct fun
{
@@ -223,127 +169,40 @@ static const struct fun
{
double (*d) (double);
float (*f) (float);
- v_double (*vd) (v_double);
- v_float (*vf) (v_float);
#ifdef __vpcs
__vpcs v_double (*vnd) (v_double);
__vpcs v_float (*vnf) (v_float);
#endif
+#if WANT_SVE_MATH
+ sv_double (*svd) (sv_double, sv_bool);
+ sv_float (*svf) (sv_float, sv_bool);
+#endif
} fun;
} funtab[] = {
#define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
#define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
-#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}},
-#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}},
#define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
#define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
+#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
+#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
D (dummy, 1.0, 2.0)
-D (exp, -9.9, 9.9)
-D (exp, 0.5, 1.0)
-D (exp2, -9.9, 9.9)
-D (log, 0.01, 11.1)
-D (log, 0.999, 1.001)
-D (log2, 0.01, 11.1)
-D (log2, 0.999, 1.001)
-{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
-D (xpow, 0.01, 11.1)
-D (ypow, -9.9, 9.9)
-D (erf, -6.0, 6.0)
-
F (dummyf, 1.0, 2.0)
-F (expf, -9.9, 9.9)
-F (exp2f, -9.9, 9.9)
-F (logf, 0.01, 11.1)
-F (log2f, 0.01, 11.1)
-{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}},
-F (xpowf, 0.01, 11.1)
-F (ypowf, -9.9, 9.9)
-{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}},
-{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}},
-F (sinf, 0.1, 0.7)
-F (sinf, 0.8, 3.1)
-F (sinf, -3.1, 3.1)
-F (sinf, 3.3, 33.3)
-F (sinf, 100, 1000)
-F (sinf, 1e6, 1e32)
-F (cosf, 0.1, 0.7)
-F (cosf, 0.8, 3.1)
-F (cosf, -3.1, 3.1)
-F (cosf, 3.3, 33.3)
-F (cosf, 100, 1000)
-F (cosf, 1e6, 1e32)
-F (erff, -4.0, 4.0)
-#if WANT_VMATH
-D (__s_sin, -3.1, 3.1)
-D (__s_cos, -3.1, 3.1)
-D (__s_exp, -9.9, 9.9)
-D (__s_log, 0.01, 11.1)
-{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}},
-F (__s_expf, -9.9, 9.9)
-F (__s_expf_1u, -9.9, 9.9)
-F (__s_exp2f, -9.9, 9.9)
-F (__s_exp2f_1u, -9.9, 9.9)
-F (__s_logf, 0.01, 11.1)
-{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}},
-F (__s_sinf, -3.1, 3.1)
-F (__s_cosf, -3.1, 3.1)
-#if __aarch64__
-VD (__v_dummy, 1.0, 2.0)
-VD (__v_sin, -3.1, 3.1)
-VD (__v_cos, -3.1, 3.1)
-VD (__v_exp, -9.9, 9.9)
-VD (__v_log, 0.01, 11.1)
-{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}},
-VF (__v_dummyf, 1.0, 2.0)
-VF (__v_expf, -9.9, 9.9)
-VF (__v_expf_1u, -9.9, 9.9)
-VF (__v_exp2f, -9.9, 9.9)
-VF (__v_exp2f_1u, -9.9, 9.9)
-VF (__v_logf, 0.01, 11.1)
-{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}},
-VF (__v_sinf, -3.1, 3.1)
-VF (__v_cosf, -3.1, 3.1)
#ifdef __vpcs
VND (__vn_dummy, 1.0, 2.0)
-VND (__vn_exp, -9.9, 9.9)
-VND (_ZGVnN2v_exp, -9.9, 9.9)
-VND (__vn_log, 0.01, 11.1)
-VND (_ZGVnN2v_log, 0.01, 11.1)
-{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}},
-{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}},
-VND (__vn_sin, -3.1, 3.1)
-VND (_ZGVnN2v_sin, -3.1, 3.1)
-VND (__vn_cos, -3.1, 3.1)
-VND (_ZGVnN2v_cos, -3.1, 3.1)
VNF (__vn_dummyf, 1.0, 2.0)
-VNF (__vn_expf, -9.9, 9.9)
-VNF (_ZGVnN4v_expf, -9.9, 9.9)
-VNF (__vn_expf_1u, -9.9, 9.9)
-VNF (__vn_exp2f, -9.9, 9.9)
-VNF (_ZGVnN4v_exp2f, -9.9, 9.9)
-VNF (__vn_exp2f_1u, -9.9, 9.9)
-VNF (__vn_logf, 0.01, 11.1)
-VNF (_ZGVnN4v_logf, 0.01, 11.1)
-{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}},
-{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}},
-VNF (__vn_sinf, -3.1, 3.1)
-VNF (_ZGVnN4v_sinf, -3.1, 3.1)
-VNF (__vn_cosf, -3.1, 3.1)
-VNF (_ZGVnN4v_cosf, -3.1, 3.1)
-#endif
#endif
+#if WANT_SVE_MATH
+SVD (__sv_dummy, 1.0, 2.0)
+SVF (__sv_dummyf, 1.0, 2.0)
#endif
+#include "test/mathbench_funcs.h"
{0},
#undef F
#undef D
-#undef VF
-#undef VD
#undef VNF
#undef VND
+#undef SVF
+#undef SVD
};
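The function table is now assembled with the X-macro pattern: mathbench defines the registration macros (D/F/VND/VNF/SVD/SVF), includes a shared entry list, then undefines them, so other consumers such as the PL benchmark can expand the same list under their own definitions. A minimal sketch of the idea, with hypothetical file and macro names:

    /* entries.h - the shared list; defines no macros itself. */
    ENTRY (exp, -9.9, 9.9)
    ENTRY (log, 0.01, 11.1)

    /* consumer.c - expands the list into its own table. */
    #define ENTRY(fn, lo, hi) { #fn, lo, hi, fn },
    static const struct entry table[] = {
    #include "entries.h"
      { 0 },
    };
    #undef ENTRY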
static void
@@ -442,69 +301,75 @@ runf_latency (float f (float))
prev = f (Af[i] + prev * z);
}
+#ifdef __vpcs
static void
-run_v_thruput (v_double f (v_double))
+run_vn_thruput (__vpcs v_double f (v_double))
{
for (int i = 0; i < N; i += v_double_len ())
f (v_double_load (A+i));
}
static void
-runf_v_thruput (v_float f (v_float))
+runf_vn_thruput (__vpcs v_float f (v_float))
{
for (int i = 0; i < N; i += v_float_len ())
f (v_float_load (Af+i));
}
static void
-run_v_latency (v_double f (v_double))
+run_vn_latency (__vpcs v_double f (v_double))
{
- v_double z = v_double_dup (zero);
- v_double prev = z;
+ volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 };
+ uint64x2_t sel = vsel;
+ v_double prev = v_double_dup (0);
for (int i = 0; i < N; i += v_double_len ())
- prev = f (v_double_load (A+i) + prev * z);
+ prev = f (vbslq_f64 (sel, prev, v_double_load (A+i)));
}
static void
-runf_v_latency (v_float f (v_float))
+runf_vn_latency (__vpcs v_float f (v_float))
{
- v_float z = v_float_dup (zero);
- v_float prev = z;
+ volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 };
+ uint32x4_t sel = vsel;
+ v_float prev = v_float_dup (0);
for (int i = 0; i < N; i += v_float_len ())
- prev = f (v_float_load (Af+i) + prev * z);
+ prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i)));
}
+#endif
-#ifdef __vpcs
+#if WANT_SVE_MATH
static void
-run_vn_thruput (__vpcs v_double f (v_double))
+run_sv_thruput (sv_double f (sv_double, sv_bool))
{
- for (int i = 0; i < N; i += v_double_len ())
- f (v_double_load (A+i));
+ for (int i = 0; i < N; i += sv_double_len ())
+ f (sv_double_load (A+i), svptrue_b64 ());
}
static void
-runf_vn_thruput (__vpcs v_float f (v_float))
+runf_sv_thruput (sv_float f (sv_float, sv_bool))
{
- for (int i = 0; i < N; i += v_float_len ())
- f (v_float_load (Af+i));
+ for (int i = 0; i < N; i += sv_float_len ())
+ f (sv_float_load (Af+i), svptrue_b32 ());
}
static void
-run_vn_latency (__vpcs v_double f (v_double))
+run_sv_latency (sv_double f (sv_double, sv_bool))
{
- v_double z = v_double_dup (zero);
- v_double prev = z;
- for (int i = 0; i < N; i += v_double_len ())
- prev = f (v_double_load (A+i) + prev * z);
+ volatile sv_bool vsel = svptrue_b64 ();
+ sv_bool sel = vsel;
+ sv_double prev = sv_double_dup (0);
+ for (int i = 0; i < N; i += sv_double_len ())
+ prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ());
}
static void
-runf_vn_latency (__vpcs v_float f (v_float))
+runf_sv_latency (sv_float f (sv_float, sv_bool))
{
- v_float z = v_float_dup (zero);
- v_float prev = z;
- for (int i = 0; i < N; i += v_float_len ())
- prev = f (v_float_load (Af+i) + prev * z);
+ volatile sv_bool vsel = svptrue_b32 ();
+ sv_bool sel = vsel;
+ sv_float prev = sv_float_dup (0);
+ for (int i = 0; i < N; i += sv_float_len ())
+ prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ());
}
#endif
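The rewritten latency loops chain the calls through a bit-select with a volatile all-zeros mask rather than the old x + prev * 0.0 trick: a multiply by zero produces NaN whenever prev is infinite or NaN, which would corrupt the inputs on special-case ranges, whereas the select returns the loaded input bit-exactly while still making each call data-depend on the previous result. A minimal sketch of the identity being relied on (NEON, double lanes):

    #include <arm_neon.h>

    /* With sel all zeros, every result bit comes from x, so the value
       is exactly x, yet it still depends on prev and keeps the calls
       serialized. */
    static float64x2_t
    select_identity (uint64x2_t sel, float64x2_t prev, float64x2_t x)
    {
      return vbslq_f64 (sel, prev, x);
    }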
@@ -539,10 +404,10 @@ bench1 (const struct fun *f, int type, double lo, double hi)
const char *s = type == 't' ? "rthruput" : "latency";
int vlen = 1;
- if (f->vec && f->prec == 'd')
- vlen = v_double_len();
- else if (f->vec && f->prec == 'f')
- vlen = v_float_len();
+ if (f->vec == 'n')
+ vlen = f->prec == 'd' ? v_double_len() : v_float_len();
+ else if (f->vec == 's')
+ vlen = f->prec == 'd' ? sv_double_len() : sv_float_len();
if (f->prec == 'd' && type == 't' && f->vec == 0)
TIMEIT (run_thruput, f->fun.d);
@@ -552,14 +417,6 @@ bench1 (const struct fun *f, int type, double lo, double hi)
TIMEIT (runf_thruput, f->fun.f);
else if (f->prec == 'f' && type == 'l' && f->vec == 0)
TIMEIT (runf_latency, f->fun.f);
- else if (f->prec == 'd' && type == 't' && f->vec == 'v')
- TIMEIT (run_v_thruput, f->fun.vd);
- else if (f->prec == 'd' && type == 'l' && f->vec == 'v')
- TIMEIT (run_v_latency, f->fun.vd);
- else if (f->prec == 'f' && type == 't' && f->vec == 'v')
- TIMEIT (runf_v_thruput, f->fun.vf);
- else if (f->prec == 'f' && type == 'l' && f->vec == 'v')
- TIMEIT (runf_v_latency, f->fun.vf);
#ifdef __vpcs
else if (f->prec == 'd' && type == 't' && f->vec == 'n')
TIMEIT (run_vn_thruput, f->fun.vnd);
@@ -570,20 +427,32 @@ bench1 (const struct fun *f, int type, double lo, double hi)
else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
TIMEIT (runf_vn_latency, f->fun.vnf);
#endif
+#if WANT_SVE_MATH
+ else if (f->prec == 'd' && type == 't' && f->vec == 's')
+ TIMEIT (run_sv_thruput, f->fun.svd);
+ else if (f->prec == 'd' && type == 'l' && f->vec == 's')
+ TIMEIT (run_sv_latency, f->fun.svd);
+ else if (f->prec == 'f' && type == 't' && f->vec == 's')
+ TIMEIT (runf_sv_thruput, f->fun.svf);
+ else if (f->prec == 'f' && type == 'l' && f->vec == 's')
+ TIMEIT (runf_sv_latency, f->fun.svf);
+#endif
if (type == 't')
{
ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
- printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
+ printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n",
+ f->name, s,
(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
- (unsigned long long) dt, lo, hi);
+ (unsigned long long) dt, lo, hi, vlen);
}
else if (type == 'l')
{
ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
- printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s,
+ printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n",
+ f->name, s,
(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
- (unsigned long long) dt, lo, hi);
+ (unsigned long long) dt, lo, hi, vlen);
}
fflush (stdout);
}
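The ns100 computation in bench1 keeps two decimal places without floating point: the total time is scaled by 100 before the division, and half the divisor is added so the result rounds to nearest. A sketch of the same arithmetic with hypothetical names (assuming <stdio.h> and <stdint.h>):

    static void
    print_ns (uint64_t dt, uint64_t count) /* dt: total ns; count: calls */
    {
      uint64_t ns100 = (100 * dt + count / 2) / count; /* round to nearest */
      printf ("%u.%02u ns", (unsigned) (ns100 / 100),
              (unsigned) (ns100 % 100));
    }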
diff --git a/contrib/arm-optimized-routines/math/test/mathbench_funcs.h b/contrib/arm-optimized-routines/math/test/mathbench_funcs.h
new file mode 100644
index 000000000000..84c4e68650ac
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/test/mathbench_funcs.h
@@ -0,0 +1,62 @@
+/*
+ * Function entries for mathbench.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+/* clang-format off */
+D (exp, -9.9, 9.9)
+D (exp, 0.5, 1.0)
+D (exp10, -9.9, 9.9)
+D (exp2, -9.9, 9.9)
+D (log, 0.01, 11.1)
+D (log, 0.999, 1.001)
+D (log2, 0.01, 11.1)
+D (log2, 0.999, 1.001)
+{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
+D (xpow, 0.01, 11.1)
+D (ypow, -9.9, 9.9)
+D (erf, -6.0, 6.0)
+
+F (expf, -9.9, 9.9)
+F (exp2f, -9.9, 9.9)
+F (logf, 0.01, 11.1)
+F (log2f, 0.01, 11.1)
+{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}},
+F (xpowf, 0.01, 11.1)
+F (ypowf, -9.9, 9.9)
+{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}},
+{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}},
+F (sinf, 0.1, 0.7)
+F (sinf, 0.8, 3.1)
+F (sinf, -3.1, 3.1)
+F (sinf, 3.3, 33.3)
+F (sinf, 100, 1000)
+F (sinf, 1e6, 1e32)
+F (cosf, 0.1, 0.7)
+F (cosf, 0.8, 3.1)
+F (cosf, -3.1, 3.1)
+F (cosf, 3.3, 33.3)
+F (cosf, 100, 1000)
+F (cosf, 1e6, 1e32)
+F (erff, -4.0, 4.0)
+#ifdef __vpcs
+VND (_ZGVnN2v_exp, -9.9, 9.9)
+VND (_ZGVnN2v_log, 0.01, 11.1)
+{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}},
+VND (_ZGVnN2v_sin, -3.1, 3.1)
+VND (_ZGVnN2v_cos, -3.1, 3.1)
+VNF (_ZGVnN4v_expf, -9.9, 9.9)
+VNF (_ZGVnN4v_expf_1u, -9.9, 9.9)
+VNF (_ZGVnN4v_exp2f, -9.9, 9.9)
+VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9)
+VNF (_ZGVnN4v_logf, 0.01, 11.1)
+{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}},
+VNF (_ZGVnN4v_sinf, -3.1, 3.1)
+VNF (_ZGVnN4v_cosf, -3.1, 3.1)
+#endif
+ /* clang-format on */
diff --git a/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h b/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h
new file mode 100644
index 000000000000..062b9db56de5
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/test/mathbench_wrappers.h
@@ -0,0 +1,66 @@
+/*
+ * Function wrappers for mathbench.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifdef __vpcs
+
+__vpcs static v_float
+xy_Z_powf (v_float x)
+{
+ return _ZGVnN4vv_powf (x, x);
+}
+
+__vpcs static v_double
+xy_Z_pow (v_double x)
+{
+ return _ZGVnN2vv_pow (x, x);
+}
+
+#endif
+
+static double
+xypow (double x)
+{
+ return pow (x, x);
+}
+
+static float
+xypowf (float x)
+{
+ return powf (x, x);
+}
+
+static double
+xpow (double x)
+{
+ return pow (x, 23.4);
+}
+
+static float
+xpowf (float x)
+{
+ return powf (x, 23.4f);
+}
+
+static double
+ypow (double x)
+{
+ return pow (2.34, x);
+}
+
+static float
+ypowf (float x)
+{
+ return powf (2.34f, x);
+}
+
+static float
+sincosf_wrap (float x)
+{
+ float s, c;
+ sincosf (x, &s, &c);
+ return s + c;
+}
diff --git a/contrib/arm-optimized-routines/math/test/mathtest.c b/contrib/arm-optimized-routines/math/test/mathtest.c
index 310896738e47..834233fdde9d 100644
--- a/contrib/arm-optimized-routines/math/test/mathtest.c
+++ b/contrib/arm-optimized-routines/math/test/mathtest.c
@@ -1,8 +1,8 @@
/*
* mathtest.c - test rig for mathlib
*
- * Copyright (c) 1998-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 1998-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <assert.h>
@@ -196,9 +196,11 @@ int is_complex_rettype(int rettype) {
#define TFUNCARM(arg,ret,name,tolerance) { t_func, arg, ret, (void*)& ARM_PREFIX(name), m_none, tolerance, #name }
#define MFUNC(arg,ret,name,tolerance) { t_macro, arg, ret, NULL, m_##name, tolerance, #name }
+#ifndef PL
/* sincosf wrappers for easier testing. */
static float sincosf_sinf(float x) { float s,c; sincosf(x, &s, &c); return s; }
static float sincosf_cosf(float x) { float s,c; sincosf(x, &s, &c); return c; }
+#endif
test_func tfuncs[] = {
/* trigonometric */
@@ -218,9 +220,10 @@ test_func tfuncs[] = {
TFUNCARM(at_s,rt_s, tanf, 4*ULPUNIT),
TFUNCARM(at_s,rt_s, sinf, 3*ULPUNIT/4),
TFUNCARM(at_s,rt_s, cosf, 3*ULPUNIT/4),
+#ifndef PL
TFUNCARM(at_s,rt_s, sincosf_sinf, 3*ULPUNIT/4),
TFUNCARM(at_s,rt_s, sincosf_cosf, 3*ULPUNIT/4),
-
+#endif
/* hyperbolic */
TFUNC(at_d, rt_d, atanh, 4*ULPUNIT),
TFUNC(at_d, rt_d, asinh, 4*ULPUNIT),
@@ -251,6 +254,7 @@ test_func tfuncs[] = {
TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4),
TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4),
TFUNC(at_s,rt_s, expm1f, ULPUNIT),
+ TFUNC(at_d,rt_d, exp10, ULPUNIT),
/* power */
TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4),
@@ -1018,6 +1022,7 @@ int runtest(testdetail t) {
DO_DOP(d_arg1,op1r);
DO_DOP(d_arg2,op2r);
s_arg1.i = t.op1r[0]; s_arg2.i = t.op2r[0];
+ s_res.i = 0;
/*
* Detect NaNs, infinities and denormals on input, and set a
@@ -1152,22 +1157,25 @@ int runtest(testdetail t) {
tresultr[0] = t.resultr[0];
tresultr[1] = t.resultr[1];
resultr[0] = d_res.i[dmsd]; resultr[1] = d_res.i[dlsd];
+ resulti[0] = resulti[1] = 0;
wres = 2;
break;
case rt_i:
tresultr[0] = t.resultr[0];
resultr[0] = intres;
+ resulti[0] = 0;
wres = 1;
break;
case rt_s:
case rt_s2:
tresultr[0] = t.resultr[0];
resultr[0] = s_res.i;
+ resulti[0] = 0;
wres = 1;
break;
default:
puts("unhandled rettype in runtest");
- wres = 0;
+ abort ();
}
if(t.resultc != rc_none) {
int err = 0;
diff --git a/contrib/arm-optimized-routines/math/test/rtest/dotest.c b/contrib/arm-optimized-routines/math/test/rtest/dotest.c
index 6be79e1df0d1..5b3e9b4f18e4 100644
--- a/contrib/arm-optimized-routines/math/test/rtest/dotest.c
+++ b/contrib/arm-optimized-routines/math/test/rtest/dotest.c
@@ -2,7 +2,7 @@
* dotest.c - actually generate mathlib test cases
*
* Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdio.h>
diff --git a/contrib/arm-optimized-routines/math/test/rtest/intern.h b/contrib/arm-optimized-routines/math/test/rtest/intern.h
index 12a9c749e18e..3ebd7ddaf85d 100644
--- a/contrib/arm-optimized-routines/math/test/rtest/intern.h
+++ b/contrib/arm-optimized-routines/math/test/rtest/intern.h
@@ -2,7 +2,7 @@
* intern.h
*
* Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef mathtest_intern_h
diff --git a/contrib/arm-optimized-routines/math/test/rtest/main.c b/contrib/arm-optimized-routines/math/test/rtest/main.c
index 0d8ead891320..3d533c946f79 100644
--- a/contrib/arm-optimized-routines/math/test/rtest/main.c
+++ b/contrib/arm-optimized-routines/math/test/rtest/main.c
@@ -2,7 +2,7 @@
* main.c
*
* Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <assert.h>
diff --git a/contrib/arm-optimized-routines/math/test/rtest/random.c b/contrib/arm-optimized-routines/math/test/rtest/random.c
index 56123966b8c4..1de32580b733 100644
--- a/contrib/arm-optimized-routines/math/test/rtest/random.c
+++ b/contrib/arm-optimized-routines/math/test/rtest/random.c
@@ -2,7 +2,7 @@
* random.c - random number generator for producing mathlib test cases
*
* Copyright (c) 1998-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "types.h"
diff --git a/contrib/arm-optimized-routines/math/test/rtest/random.h b/contrib/arm-optimized-routines/math/test/rtest/random.h
index b4b22df82a3d..0b477d72b234 100644
--- a/contrib/arm-optimized-routines/math/test/rtest/random.h
+++ b/contrib/arm-optimized-routines/math/test/rtest/random.h
@@ -2,7 +2,7 @@
* random.h - header for random.c
*
* Copyright (c) 2009-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "types.h"
diff --git a/contrib/arm-optimized-routines/math/test/rtest/semi.c b/contrib/arm-optimized-routines/math/test/rtest/semi.c
index c9f0daf76508..70a7844a48d6 100644
--- a/contrib/arm-optimized-routines/math/test/rtest/semi.c
+++ b/contrib/arm-optimized-routines/math/test/rtest/semi.c
@@ -2,7 +2,7 @@
* semi.c: test implementations of mathlib seminumerical functions
*
* Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <stdio.h>
diff --git a/contrib/arm-optimized-routines/math/test/rtest/semi.h b/contrib/arm-optimized-routines/math/test/rtest/semi.h
index 17dc4158fb51..7a1444e55d28 100644
--- a/contrib/arm-optimized-routines/math/test/rtest/semi.h
+++ b/contrib/arm-optimized-routines/math/test/rtest/semi.h
@@ -2,7 +2,7 @@
* semi.h: header for semi.c
*
* Copyright (c) 1999-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef test_semi_h
diff --git a/contrib/arm-optimized-routines/math/test/rtest/types.h b/contrib/arm-optimized-routines/math/test/rtest/types.h
index 53cd557fa4cf..e15b4e06a0d4 100644
--- a/contrib/arm-optimized-routines/math/test/rtest/types.h
+++ b/contrib/arm-optimized-routines/math/test/rtest/types.h
@@ -2,7 +2,7 @@
* types.h
*
* Copyright (c) 2005-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#ifndef mathtest_types_h
diff --git a/contrib/arm-optimized-routines/math/test/rtest/wrappers.c b/contrib/arm-optimized-routines/math/test/rtest/wrappers.c
index de45ac5768d0..441017192ab4 100644
--- a/contrib/arm-optimized-routines/math/test/rtest/wrappers.c
+++ b/contrib/arm-optimized-routines/math/test/rtest/wrappers.c
@@ -2,7 +2,7 @@
* wrappers.c - wrappers to modify output of MPFR/MPC test functions
*
* Copyright (c) 2014-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include <assert.h>
diff --git a/contrib/arm-optimized-routines/math/test/rtest/wrappers.h b/contrib/arm-optimized-routines/math/test/rtest/wrappers.h
index 7b09c85a59f1..0a8a58777d8a 100644
--- a/contrib/arm-optimized-routines/math/test/rtest/wrappers.h
+++ b/contrib/arm-optimized-routines/math/test/rtest/wrappers.h
@@ -2,7 +2,7 @@
* wrappers.h - wrappers to modify output of MPFR/MPC test functions
*
* Copyright (c) 2014-2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
typedef struct {
diff --git a/contrib/arm-optimized-routines/math/test/runulp.sh b/contrib/arm-optimized-routines/math/test/runulp.sh
index 0190d9ab27fb..e2e03e3ae761 100755
--- a/contrib/arm-optimized-routines/math/test/runulp.sh
+++ b/contrib/arm-optimized-routines/math/test/runulp.sh
@@ -2,8 +2,8 @@
# ULP error check script.
#
-# Copyright (c) 2019-2020, Arm Limited.
-# SPDX-License-Identifier: MIT
+# Copyright (c) 2019-2023, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
#set -x
set -eu
@@ -72,6 +72,16 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000
t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000
t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000
+L=0.02
+t exp10 0 0x1p-47 5000
+t exp10 -0 -0x1p-47 5000
+t exp10 0x1p-47 1 50000
+t exp10 -0x1p-47 -1 50000
+t exp10 1 0x1.34413509f79ffp8 50000
+t exp10 -1 -0x1.434e6420f4374p8 50000
+t exp10 0x1.34413509f79ffp8 inf 5000
+t exp10 -0x1.434e6420f4374p8 -inf 5000
+
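The hex bounds in the new exp10 runs are the double-precision saturation thresholds rather than arbitrary cut-offs:

    0x1.34413509f79ffp8 = 308.2547...  (log10 of DBL_MAX, the overflow bound)
    0x1.434e6420f4374p8 = 323.3062...  (-log10 of 0x1p-1074, the underflow bound)

so the last two lines sweep the inputs whose results saturate to inf and to 0 respectively.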
L=1.0
Ldir=0.9
t erf 0 0xffff000000000000 10000
@@ -143,15 +153,10 @@ Ldir=0.5
done
# vector functions
+
Ldir=0.5
r='n'
-flags="${ULPFLAGS:--q} -f"
-runs=
-check __s_exp 1 && runs=1
-runv=
-check __v_exp 1 && runv=1
-runvn=
-check __vn_exp 1 && runvn=1
+flags="${ULPFLAGS:--q}"
range_exp='
0 0xffff000000000000 10000
@@ -177,9 +182,10 @@ range_pow='
'
range_sin='
- 0 0xffff000000000000 10000
- 0x1p-4 0x1p4 400000
- -0x1p-23 0x1p23 400000
+ 0 0x1p23 500000
+ -0 -0x1p23 500000
+ 0x1p23 inf 10000
+ -0x1p23 -inf 10000
'
range_cos="$range_sin"
@@ -199,9 +205,10 @@ range_logf='
'
range_sinf='
- 0 0xffff0000 10000
- 0x1p-4 0x1p4 300000
--0x1p-9 -0x1p9 300000
+ 0 0x1p20 500000
+ -0 -0x1p20 500000
+ 0x1p20 inf 10000
+ -0x1p20 -inf 10000
'
range_cosf="$range_sinf"
@@ -229,9 +236,8 @@ L_sinf=1.4
L_cosf=1.4
L_powf=2.1
-while read G F R
+while read G F D
do
- [ "$R" = 1 ] || continue
case "$G" in \#*) continue ;; esac
eval range="\${range_$G}"
eval L="\${L_$G}"
@@ -239,74 +245,35 @@ do
do
[ -n "$X" ] || continue
case "$X" in \#*) continue ;; esac
- t $F $X
+ disable_fenv=""
+ if [ -z "$WANT_SIMD_EXCEPT" ] || [ $WANT_SIMD_EXCEPT -eq 0 ]; then
+ # If the library was built with SIMD
+ # exceptions disabled, disable fenv
+ # checking in the ulp tool. Otherwise,
+ # fenv checking may still be disabled
+ # by adding -f to the end of the run
+ # line.
+ disable_fenv="-f"
+ fi
+ t $D $disable_fenv $F $X
done << EOF
$range
+
EOF
done << EOF
# group symbol run
-exp __s_exp $runs
-exp __v_exp $runv
-exp __vn_exp $runvn
-exp _ZGVnN2v_exp $runvn
-
-log __s_log $runs
-log __v_log $runv
-log __vn_log $runvn
-log _ZGVnN2v_log $runvn
-
-pow __s_pow $runs
-pow __v_pow $runv
-pow __vn_pow $runvn
-pow _ZGVnN2vv_pow $runvn
-
-sin __s_sin $runs
-sin __v_sin $runv
-sin __vn_sin $runvn
-sin _ZGVnN2v_sin $runvn
-
-cos __s_cos $runs
-cos __v_cos $runv
-cos __vn_cos $runvn
-cos _ZGVnN2v_cos $runvn
-
-expf __s_expf $runs
-expf __v_expf $runv
-expf __vn_expf $runvn
-expf _ZGVnN4v_expf $runvn
-
-expf_1u __s_expf_1u $runs
-expf_1u __v_expf_1u $runv
-expf_1u __vn_expf_1u $runvn
-
-exp2f __s_exp2f $runs
-exp2f __v_exp2f $runv
-exp2f __vn_exp2f $runvn
-exp2f _ZGVnN4v_exp2f $runvn
-
-exp2f_1u __s_exp2f_1u $runs
-exp2f_1u __v_exp2f_1u $runv
-exp2f_1u __vn_exp2f_1u $runvn
-
-logf __s_logf $runs
-logf __v_logf $runv
-logf __vn_logf $runvn
-logf _ZGVnN4v_logf $runvn
-
-sinf __s_sinf $runs
-sinf __v_sinf $runv
-sinf __vn_sinf $runvn
-sinf _ZGVnN4v_sinf $runvn
-
-cosf __s_cosf $runs
-cosf __v_cosf $runv
-cosf __vn_cosf $runvn
-cosf _ZGVnN4v_cosf $runvn
-
-powf __s_powf $runs
-powf __v_powf $runv
-powf __vn_powf $runvn
-powf _ZGVnN4vv_powf $runvn
+exp _ZGVnN2v_exp
+log _ZGVnN2v_log
+pow _ZGVnN2vv_pow -f
+sin _ZGVnN2v_sin -z
+cos _ZGVnN2v_cos
+expf _ZGVnN4v_expf
+expf_1u _ZGVnN4v_expf_1u -f
+exp2f _ZGVnN4v_exp2f
+exp2f_1u _ZGVnN4v_exp2f_1u -f
+logf _ZGVnN4v_logf
+sinf _ZGVnN4v_sinf -z
+cosf _ZGVnN4v_cosf
+powf _ZGVnN4vv_powf -f
EOF
[ 0 -eq $FAIL ] || {
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/cosf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/cosf.tst
index 79160443f099..7ea0d45795a3 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/cosf.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/cosf.tst
@@ -1,7 +1,7 @@
; cosf.tst - Directed test cases for SP cosine
;
; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=cosf op1=7fc00001 result=7fc00001 errno=0
func=cosf op1=ffc00001 result=7fc00001 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/erf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/erf.tst
index 7fa4d1868c0e..12384cef0dd9 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/erf.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/erf.tst
@@ -1,7 +1,7 @@
; erf.tst - Directed test cases for erf
;
; Copyright (c) 2007-2020, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=erf op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
func=erf op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst
index d05b7b1119c4..28f8fa37f5aa 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/erff.tst
@@ -1,7 +1,7 @@
; erff.tst
;
; Copyright (c) 2007-2020, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=erff op1=7fc00001 result=7fc00001 errno=0
func=erff op1=ffc00001 result=7fc00001 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/exp.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/exp.tst
index 85d556cd1e00..0bb2ef4579cc 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/exp.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/exp.tst
@@ -1,7 +1,7 @@
; Directed test cases for exp
;
; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=exp op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
func=exp op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/exp10.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/exp10.tst
new file mode 100644
index 000000000000..2cf4273bd1d7
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/exp10.tst
@@ -0,0 +1,15 @@
+; Directed test cases for exp10
+;
+; Copyright (c) 2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=exp10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=exp10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=exp10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=exp10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=exp10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
+func=exp10 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox
+func=exp10 op1=fff00000.00000000 result=00000000.00000000 errno=0
+func=exp10 op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux
+func=exp10 op1=00000000.00000000 result=3ff00000.00000000 errno=0
+func=exp10 op1=80000000.00000000 result=3ff00000.00000000 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/exp2.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/exp2.tst
index fa56c9f8be4b..7069f9010c8c 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/exp2.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/exp2.tst
@@ -1,7 +1,7 @@
; Directed test cases for exp2
;
; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=exp2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
func=exp2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/exp2f.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/exp2f.tst
index 38cfc3f78ac6..6ca2eeab4e12 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/exp2f.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/exp2f.tst
@@ -1,7 +1,7 @@
; exp2f.tst - Directed test cases for exp2f
;
; Copyright (c) 2017-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=exp2f op1=7fc00001 result=7fc00001 errno=0
func=exp2f op1=ffc00001 result=7fc00001 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/expf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/expf.tst
index ff0f671c2656..89ae8fe78e6c 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/expf.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/expf.tst
@@ -1,7 +1,7 @@
; expf.tst - Directed test cases for expf
;
; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=expf op1=7fc00001 result=7fc00001 errno=0
func=expf op1=ffc00001 result=7fc00001 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/log.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/log.tst
index a0aa398cbf73..686ea835645b 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/log.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/log.tst
@@ -1,7 +1,7 @@
; Directed test cases for log
;
; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=log op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
func=log op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst
index ff1286cbd53e..361bddec374b 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/log2.tst
@@ -1,7 +1,7 @@
; Directed test cases for log2
;
; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst
index 5832c4f08f1e..5fce051cddba 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/log2f.tst
@@ -1,7 +1,7 @@
; log2f.tst - Directed test cases for log2f
;
; Copyright (c) 2017-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=log2f op1=7fc00001 result=7fc00001 errno=0
func=log2f op1=ffc00001 result=7fc00001 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/logf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/logf.tst
index 6e68a36e0f6a..a6d1b9d5c51f 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/logf.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/logf.tst
@@ -1,7 +1,7 @@
; logf.tst - Directed test cases for logf
;
; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=logf op1=7fc00001 result=7fc00001 errno=0
func=logf op1=ffc00001 result=7fc00001 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/pow.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/pow.tst
index 19665817153d..879d12864afe 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/pow.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/pow.tst
@@ -1,7 +1,7 @@
; Directed test cases for pow
;
; Copyright (c) 2018-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=pow op1=00000000.00000000 op2=00000000.00000000 result=3ff00000.00000000 errno=0
func=pow op1=00000000.00000000 op2=00000000.00000001 result=00000000.00000000 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/powf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/powf.tst
index 3fa8b110f8bc..46d522400871 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/powf.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/powf.tst
@@ -1,7 +1,7 @@
; powf.tst - Directed test cases for powf
;
; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=powf op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i
func=powf op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/sincosf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/sincosf.tst
index 4b33d2291c66..cddb346558ea 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/sincosf.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/sincosf.tst
@@ -1,7 +1,7 @@
; Directed test cases for SP sincos
;
; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=sincosf_sinf op1=7fc00001 result=7fc00001 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/directed/sinf.tst b/contrib/arm-optimized-routines/math/test/testcases/directed/sinf.tst
index ded80b1598c6..041b13d5d6cb 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/directed/sinf.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/directed/sinf.tst
@@ -1,7 +1,7 @@
; sinf.tst - Directed test cases for SP sine
;
; Copyright (c) 2007-2019, Arm Limited.
-; SPDX-License-Identifier: MIT
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
func=sinf op1=7fc00001 result=7fc00001 errno=0
diff --git a/contrib/arm-optimized-routines/math/test/testcases/random/double.tst b/contrib/arm-optimized-routines/math/test/testcases/random/double.tst
index c24ff80d5d95..8e885d61722a 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/random/double.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/random/double.tst
@@ -1,7 +1,7 @@
!! double.tst - Random test case specification for DP functions
!!
!! Copyright (c) 1999-2019, Arm Limited.
-!! SPDX-License-Identifier: MIT
+!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
test exp 10000
test exp2 10000
diff --git a/contrib/arm-optimized-routines/math/test/testcases/random/float.tst b/contrib/arm-optimized-routines/math/test/testcases/random/float.tst
index d02a22750abe..ea4a5a015214 100644
--- a/contrib/arm-optimized-routines/math/test/testcases/random/float.tst
+++ b/contrib/arm-optimized-routines/math/test/testcases/random/float.tst
@@ -1,7 +1,7 @@
!! single.tst - Random test case specification for SP functions
!!
!! Copyright (c) 1999-2019, Arm Limited.
-!! SPDX-License-Identifier: MIT
+!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
test sinf 10000
test cosf 10000
diff --git a/contrib/arm-optimized-routines/math/test/ulp.c b/contrib/arm-optimized-routines/math/test/ulp.c
index 51479b87a0fd..5ff29972e50e 100644
--- a/contrib/arm-optimized-routines/math/test/ulp.c
+++ b/contrib/arm-optimized-routines/math/test/ulp.c
@@ -1,10 +1,11 @@
/*
* ULP error checking tool for math functions.
*
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#define _GNU_SOURCE
#include <ctype.h>
#include <fenv.h>
#include <float.h>
@@ -23,11 +24,6 @@
# include <mpfr.h>
#endif
-#ifndef WANT_VMATH
-/* Enable the build of vector math code. */
-# define WANT_VMATH 1
-#endif
-
static inline uint64_t
asuint64 (double f)
{
@@ -212,73 +208,61 @@ struct conf
unsigned long long n;
double softlim;
double errlim;
+ int ignore_zero_sign;
};
-/* Wrappers for sincos. */
-static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);}
-static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);}
-static double sincos_sin(double x) {(void)cos(x); return sin(x);}
-static double sincos_cos(double x) {(void)sin(x); return cos(x);}
-#if USE_MPFR
-static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); }
-static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); }
-#endif
-
/* A bit of a hack: call vector functions twice with the same
input in lane 0 but a different value in other lanes: once
with an in-range value and then with a special case value. */
static int secondcall;
/* Wrappers for vector functions. */
-#if __aarch64__ && WANT_VMATH
+#ifdef __vpcs
typedef __f32x4_t v_float;
typedef __f64x2_t v_double;
-static const float fv[2] = {1.0f, -INFINITY};
-static const double dv[2] = {1.0, -INFINITY};
+/* First element of fv and dv may be changed by the -c argument. */
+static float fv[2] = {1.0f, -INFINITY};
+static double dv[2] = {1.0, -INFINITY};
static inline v_float argf(float x) { return (v_float){x,x,x,fv[secondcall]}; }
static inline v_double argd(double x) { return (v_double){x,dv[secondcall]}; }
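argf and argd place the scalar input in every lane except the last, which carries fv[secondcall] or dv[secondcall]; per the comment above, each routine is invoked twice, once with the neutral value (1.0 by default, adjustable with -c) and once with -INFINITY in that lane, to check that a special-case lane cannot perturb lane 0. For illustration, a wrapper in the style used by the harness (the vector routine name here is hypothetical):

    static float
    Z_dummyf (float x)
    {
      /* Lane 0 holds x; the last lane holds fv[secondcall]. */
      return _ZGVnN4v_dummyf (argf (x))[0];
    }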
-
-static float v_sinf(float x) { return __v_sinf(argf(x))[0]; }
-static float v_cosf(float x) { return __v_cosf(argf(x))[0]; }
-static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; }
-static float v_expf(float x) { return __v_expf(argf(x))[0]; }
-static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; }
-static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; }
-static float v_logf(float x) { return __v_logf(argf(x))[0]; }
-static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; }
-static double v_sin(double x) { return __v_sin(argd(x))[0]; }
-static double v_cos(double x) { return __v_cos(argd(x))[0]; }
-static double v_exp(double x) { return __v_exp(argd(x))[0]; }
-static double v_log(double x) { return __v_log(argd(x))[0]; }
-static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; }
-#ifdef __vpcs
-static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; }
-static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; }
-static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; }
-static float vn_expf(float x) { return __vn_expf(argf(x))[0]; }
-static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; }
-static float vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; }
-static float vn_logf(float x) { return __vn_logf(argf(x))[0]; }
-static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; }
-static double vn_sin(double x) { return __vn_sin(argd(x))[0]; }
-static double vn_cos(double x) { return __vn_cos(argd(x))[0]; }
-static double vn_exp(double x) { return __vn_exp(argd(x))[0]; }
-static double vn_log(double x) { return __vn_log(argd(x))[0]; }
-static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; }
-static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; }
-static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; }
-static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
-static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; }
-static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; }
-static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; }
-static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; }
-static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; }
-static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
-static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; }
-static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; }
+#if WANT_SVE_MATH
+#include <arm_sve.h>
+typedef __SVFloat32_t sv_float;
+typedef __SVFloat64_t sv_double;
+
+static inline sv_float svargf(float x) {
+ int n = svcntw();
+ float base[n];
+ for (int i=0; i<n; i++)
+ base[i] = (float)x;
+ base[n-1] = (float) fv[secondcall];
+ return svld1(svptrue_b32(), base);
+}
+static inline sv_double svargd(double x) {
+ int n = svcntd();
+ double base[n];
+ for (int i=0; i<n; i++)
+ base[i] = x;
+ base[n-1] = dv[secondcall];
+ return svld1(svptrue_b64(), base);
+}
+static inline float svretf(sv_float vec) {
+ int n = svcntw();
+ float res[n];
+ svst1(svptrue_b32(), res, vec);
+ return res[0];
+}
+static inline double svretd(sv_double vec) {
+ int n = svcntd();
+ double res[n];
+ svst1(svptrue_b64(), res, vec);
+ return res[0];
+}
#endif
#endif
+#include "test/ulp_wrappers.h"
+
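The SVE helpers perform the same lane injection through memory, since the vector length is only known at run time: svargf fills an svcntw()-element buffer with x, overwrites the final lane with the control value and loads the whole vector; svretf stores a result vector and returns lane 0. A sketch of how a scalar wrapper combines them (the routine name is hypothetical; the real wrappers come from test/ulp_wrappers.h):

    static float
    sv_dummyf (float x)
    {
      return svretf (__sv_dummyf (svargf (x), svptrue_b32 ()));
    }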
struct fun
{
const char *name;
@@ -322,83 +306,44 @@ static const struct fun fun[] = {
#define F2(x) F (x##f, x##f, x, mpfr_##x, 2, 1, f2, 0)
#define D1(x) F (x, x, x##l, mpfr_##x, 1, 0, d1, 0)
#define D2(x) F (x, x, x##l, mpfr_##x, 2, 0, d2, 0)
- F1 (sin)
- F1 (cos)
- F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0)
- F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0)
- F1 (exp)
- F1 (exp2)
- F1 (log)
- F1 (log2)
- F2 (pow)
- F1 (erf)
- D1 (exp)
- D1 (exp2)
- D1 (log)
- D1 (log2)
- D2 (pow)
- D1 (erf)
-#if WANT_VMATH
- F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0)
- F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0)
- F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0)
- F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0)
- F (__s_exp2f_1u, __s_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 0)
- F (__s_exp2f, __s_exp2f, exp2, mpfr_exp2, 1, 1, f1, 0)
- F (__s_powf, __s_powf, pow, mpfr_pow, 2, 1, f2, 0)
- F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0)
- F (__s_sin, __s_sin, sinl, mpfr_sin, 1, 0, d1, 0)
- F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0)
- F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0)
- F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0)
- F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0)
-#if __aarch64__
- F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1)
- F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1)
- F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
- F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1)
- F (__v_exp2f_1u, v_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (__v_exp2f, v_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1)
- F (__v_powf, v_powf, pow, mpfr_pow, 2, 1, f2, 1)
- F (__v_sin, v_sin, sinl, mpfr_sin, 1, 0, d1, 1)
- F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1)
- F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1)
- F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1)
- F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1)
-#ifdef __vpcs
- F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1)
- F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1)
- F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
- F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1)
- F (__vn_exp2f_1u, vn_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (__vn_exp2f, vn_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1)
- F (__vn_powf, vn_powf, pow, mpfr_pow, 2, 1, f2, 1)
- F (__vn_sin, vn_sin, sinl, mpfr_sin, 1, 0, d1, 1)
- F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1)
- F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1)
- F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1)
- F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1)
- F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1)
- F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1)
- F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
- F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1)
- F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1)
- F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1)
- F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1)
- F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1)
- F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1)
- F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1)
-#endif
-#endif
-#endif
+/* Neon routines. */
+#define VF1(x) F (__v_##x##f, v_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define VF2(x) F (__v_##x##f, v_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define VD1(x) F (__v_##x, v_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define VD2(x) F (__v_##x, v_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+#define VNF1(x) F (__vn_##x##f, vn_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define VNF2(x) F (__vn_##x##f, vn_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define VND1(x) F (__vn_##x, vn_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define VND2(x) F (__vn_##x, vn_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+#define ZVF1(x) F (_ZGVnN4v_##x##f, Z_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define ZVF2(x) F (_ZGVnN4vv_##x##f, Z_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define ZVD1(x) F (_ZGVnN2v_##x, Z_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define ZVD2(x) F (_ZGVnN2vv_##x, Z_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+#define ZVNF1(x) VNF1 (x) ZVF1 (x)
+#define ZVNF2(x) VNF2 (x) ZVF2 (x)
+#define ZVND1(x) VND1 (x) ZVD1 (x)
+#define ZVND2(x) VND2 (x) ZVD2 (x)
+/* SVE routines. */
+#define SVF1(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define SVF2(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define SVD1(x) F (__sv_##x, sv_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define SVD2(x) F (__sv_##x, sv_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+#define ZSVF1(x) F (_ZGVsMxv_##x##f, Z_sv_##x##f, x, mpfr_##x, 1, 1, f1, 0)
+#define ZSVF2(x) F (_ZGVsMxvv_##x##f, Z_sv_##x##f, x, mpfr_##x, 2, 1, f2, 0)
+#define ZSVD1(x) F (_ZGVsMxv_##x, Z_sv_##x, x##l, mpfr_##x, 1, 0, d1, 0)
+#define ZSVD2(x) F (_ZGVsMxvv_##x, Z_sv_##x, x##l, mpfr_##x, 2, 0, d2, 0)
+
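The composed macros register one routine under both its internal and its vector-ABI name. For illustration, ZVND1 (exp) expands via VND1 and ZVD1 above to

    F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 0)
    F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 0)

i.e. one table row per name, both checked against expl and mpfr_exp.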
+#include "test/ulp_funcs.h"
+
#undef F
#undef F1
#undef F2
#undef D1
#undef D2
+#undef SVF1
+#undef SVF2
+#undef SVD1
+#undef SVD2
{0}};
/* Boilerplate for generic calls. */
@@ -639,12 +584,18 @@ call_mpfr_d2 (mpfr_t y, const struct fun *f, struct args_d2 a, mpfr_rnd_t r)
static void
usage (void)
{
- puts ("./ulp [-q] [-m] [-f] [-r nudz] [-l soft-ulplimit] [-e ulplimit] func "
+ puts ("./ulp [-q] [-m] [-f] [-r {n|u|d|z}] [-l soft-ulplimit] [-e ulplimit] func "
"lo [hi [x lo2 hi2] [count]]");
puts ("Compares func against a higher precision implementation in [lo; hi].");
puts ("-q: quiet.");
puts ("-m: use mpfr even if faster method is available.");
- puts ("-f: disable fenv testing (rounding modes and exceptions).");
+ puts ("-f: disable fenv exceptions testing.");
+#ifdef __vpcs
+ puts ("-c: neutral 'control value' to test behaviour when one lane can affect another. \n"
+ " This should be different from the tested input in other lanes, and non-special \n"
+ " (i.e. should not trigger fenv exceptions). Default is 1.");
+#endif
+ puts ("-z: ignore sign of 0.");
puts ("Supported func:");
for (const struct fun *f = fun; f->name; f++)
printf ("\t%s\n", f->name);
@@ -768,6 +719,7 @@ main (int argc, char *argv[])
conf.fenv = 1;
conf.softlim = 0;
conf.errlim = INFINITY;
+ conf.ignore_zero_sign = 0;
for (;;)
{
argc--;
@@ -807,11 +759,22 @@ main (int argc, char *argv[])
{
argc--;
argv++;
- if (argc < 1)
+ if (argc < 1 || argv[0][1] != '\0')
usage ();
conf.rc = argv[0][0];
}
break;
+ case 'z':
+ conf.ignore_zero_sign = 1;
+ break;
+#ifdef __vpcs
+ case 'c':
+ argc--;
+ argv++;
+ fv[0] = strtof(argv[0], 0);
+ dv[0] = strtod(argv[0], 0);
+ break;
+#endif
default:
usage ();
}
@@ -837,7 +800,19 @@ main (int argc, char *argv[])
if (strcmp (argv[0], f->name) == 0)
break;
if (!f->name)
- usage ();
+ {
+#ifndef __vpcs
+ /* Ignore vector math functions if vector math is not supported. */
+ if (strncmp (argv[0], "_ZGVnN", 6) == 0)
+ exit (0);
+#endif
+#if !WANT_SVE_MATH
+ if (strncmp (argv[0], "_ZGVsMxv", 8) == 0)
+ exit (0);
+#endif
+ printf ("math function %s not supported\n", argv[0]);
+ exit (1);
+ }
if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG)
conf.mpfr = 1; /* Use mpfr if long double has no extra precision. */
if (!USE_MPFR && conf.mpfr)
diff --git a/contrib/arm-optimized-routines/math/test/ulp.h b/contrib/arm-optimized-routines/math/test/ulp.h
index a0c301664321..b0bc59aeef8d 100644
--- a/contrib/arm-optimized-routines/math/test/ulp.h
+++ b/contrib/arm-optimized-routines/math/test/ulp.h
@@ -1,8 +1,8 @@
/*
* Generic functions for ULP error estimation.
*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
/* For each different math function type,
@@ -37,7 +37,8 @@ static int RT(ulpscale_mpfr) (mpfr_t x, int t)
/* Difference between exact result and closest real number that
gets rounded to got, i.e. error before rounding, for a correctly
rounded result the difference is 0. */
-static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r)
+static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r,
+ int ignore_zero_sign)
{
RT(float) want = p->y;
RT(float) d;
@@ -45,10 +46,18 @@ static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r)
if (RT(asuint) (got) == RT(asuint) (want))
return 0.0;
+ if (isnan (got) && isnan (want))
+ /* Ignore sign of NaN. */
+ return RT (issignaling) (got) == RT (issignaling) (want) ? 0 : INFINITY;
if (signbit (got) != signbit (want))
- /* May have false positives with NaN. */
- //return isnan(got) && isnan(want) ? 0 : INFINITY;
- return INFINITY;
+ {
+ /* Fall through to ULP calculation if ignoring the sign of zero and
+ exactly one of want and got is non-zero. */
+ if (ignore_zero_sign && want == got)
+ return 0.0;
+ if (!ignore_zero_sign || (want != 0 && got != 0))
+ return INFINITY;
+ }
if (!isfinite (want) || !isfinite (got))
{
if (isnan (got) != isnan (want))
@@ -114,8 +123,12 @@ static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r,
static inline void T(call_nofenv) (const struct fun *f, struct T(args) a,
int r, RT(float) * y, int *ex)
{
+ if (r != FE_TONEAREST)
+ fesetround (r);
*y = T(call) (f, a);
*ex = 0;
+ if (r != FE_TONEAREST)
+ fesetround (FE_TONEAREST);
}
static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a,
@@ -155,8 +168,12 @@ static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a,
int r, struct RT(ret) * p,
RT(float) ygot, int exgot)
{
+ if (r != FE_TONEAREST)
+ fesetround (r);
RT(double) yl = T(call_long) (f, a);
p->y = (RT(float)) yl;
+ if (r != FE_TONEAREST)
+ fesetround (FE_TONEAREST);
if (RT(isok_nofenv) (ygot, p->y))
return 1;
p->ulpexp = RT(ulpscale) (p->y);
@@ -288,7 +305,7 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
if (!ok)
{
int print = 0;
- double err = RT(ulperr) (ygot, &want, r);
+ double err = RT (ulperr) (ygot, &want, r, conf->ignore_zero_sign);
double abserr = fabs (err);
// TODO: count errors below accuracy limit.
if (abserr > 0)
diff --git a/contrib/arm-optimized-routines/math/test/ulp_funcs.h b/contrib/arm-optimized-routines/math/test/ulp_funcs.h
new file mode 100644
index 000000000000..84f7927d3935
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/test/ulp_funcs.h
@@ -0,0 +1,40 @@
+/*
+ * Function entries for ulp.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+/* clang-format off */
+ F1 (sin)
+ F1 (cos)
+ F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0)
+ F (sincosf_cosf, sincosf_cosf, sincos_cos, sincos_mpfr_cos, 1, 1, f1, 0)
+ F1 (exp)
+ F1 (exp2)
+ F1 (log)
+ F1 (log2)
+ F2 (pow)
+ F1 (erf)
+ D1 (exp)
+ D1 (exp10)
+ D1 (exp2)
+ D1 (log)
+ D1 (log2)
+ D2 (pow)
+ D1 (erf)
+#ifdef __vpcs
+ F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1)
+ F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1)
+ F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
+ F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
+ F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
+ F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1)
+ F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1)
+ F (_ZGVnN2v_sin, Z_sin, sinl, mpfr_sin, 1, 0, d1, 1)
+ F (_ZGVnN2v_cos, Z_cos, cosl, mpfr_cos, 1, 0, d1, 1)
+ F (_ZGVnN2v_exp, Z_exp, expl, mpfr_exp, 1, 0, d1, 1)
+ F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1)
+ F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1)
+#endif
+/* clang-format on */
diff --git a/contrib/arm-optimized-routines/math/test/ulp_wrappers.h b/contrib/arm-optimized-routines/math/test/ulp_wrappers.h
new file mode 100644
index 000000000000..60dc3d6dd652
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/test/ulp_wrappers.h
@@ -0,0 +1,37 @@
+/*
+ * Function wrappers for ulp.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* clang-format off */
+
+/* Wrappers for sincos. */
+static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);}
+static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);}
+static double sincos_sin(double x) {(void)cos(x); return sin(x);}
+static double sincos_cos(double x) {(void)sin(x); return cos(x);}
+#if USE_MPFR
+static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_cos(y,x,r); return mpfr_sin(y,x,r); }
+static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,x,r); return mpfr_cos(y,x,r); }
+#endif
+
+/* Wrappers for vector functions. */
+#ifdef __vpcs
+static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; }
+static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; }
+static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; }
+static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
+static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; }
+static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; }
+static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; }
+static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; }
+static double Z_sin(double x) { return _ZGVnN2v_sin(argd(x))[0]; }
+static double Z_cos(double x) { return _ZGVnN2v_cos(argd(x))[0]; }
+static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
+static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; }
+static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; }
+#endif
+
+/* clang-format on */
diff --git a/contrib/arm-optimized-routines/math/tgamma128.c b/contrib/arm-optimized-routines/math/tgamma128.c
new file mode 100644
index 000000000000..65deacc49d99
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/tgamma128.c
@@ -0,0 +1,356 @@
+/*
+ * Implementation of the true gamma function (as opposed to lgamma)
+ * for 128-bit long double.
+ *
+ * Copyright (c) 2006-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/*
+ * This module implements the float128 gamma function under the name
+ * tgamma128. It's expected to be suitable for integration into system
+ * maths libraries under the standard name tgammal, if long double is
+ * 128-bit. Such a library will probably want to check the error
+ * handling and optimize the initial process of extracting the
+ * exponent, which is done here by simple and portable (but
+ * potentially slower) methods.
+ */
+
+#include <float.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+/* Only binary128 format is supported. */
+#if LDBL_MANT_DIG == 113
+
+#include "tgamma128.h"
+
+#define lenof(x) (sizeof(x)/sizeof(*(x)))
+
+/*
+ * Helper routine to evaluate a polynomial via Horner's rule
+ */
+static long double poly(const long double *coeffs, size_t n, long double x)
+{
+ long double result = coeffs[--n];
+
+ while (n > 0)
+ result = (result * x) + coeffs[--n];
+
+ return result;
+}
+
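poly consumes the coefficients highest-order first, so the array is ordered from the constant term upwards. For example:

    static const long double c[] = { 1.0L, 2.0L, 3.0L };
    /* poly (c, 3, x) evaluates (3*x + 2)*x + 1 == 1 + 2x + 3x^2. */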
+/*
+ * Compute sin(pi*x) / pi, for use in the reflection formula that
+ * relates gamma(-x) and gamma(x).
+ */
+static long double sin_pi_x_over_pi(long double x)
+{
+ int quo;
+ long double fracpart = remquol(x, 0.5L, &quo);
+
+ long double sign = 1.0L;
+ if (quo & 2)
+ sign = -sign;
+ quo &= 1;
+
+ if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) {
+ /* For numbers this size, sin(pi*x) is so close to pi*x that
+ * sin(pi*x)/pi is indistinguishable from x in float128 */
+ return sign * fracpart;
+ }
+
+ if (quo == 0) {
+ return sign * sinl(pi*fracpart) / pi;
+ } else {
+ return sign * cosl(pi*fracpart) / pi;
+ }
+}
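remquol writes x = quo*(1/2) + fracpart with |fracpart| <= 1/4, so with n = quo the quarter-turn shift identities give

    sin(pi*(n/2 + f)) =  sin(pi*f)   when n mod 4 == 0
                      =  cos(pi*f)   when n mod 4 == 1
                      = -sin(pi*f)   when n mod 4 == 2
                      = -cos(pi*f)   when n mod 4 == 3

which is why (quo & 2) supplies the sign and (quo & 1) chooses between the sinl and cosl branches.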
+
+/* Return tgamma(x) on the assumption that x >= 8. */
+static long double tgamma_large(long double x,
+ bool negative, long double negadjust)
+{
+ /*
+ * In this range we compute gamma(x) as x^(x-1/2) * e^-x * K,
+ * where K is a correction factor computed as a polynomial in 1/x.
+ *
+ * (Vaguely inspired by the form of the Lanczos approximation, but
+ * I tried the Lanczos approximation itself and it suffers badly
+ * from big cancellation leading to loss of significance.)
+ */
+ long double t = 1/x;
+ long double p = poly(coeffs_large, lenof(coeffs_large), t);
+
+ /*
+ * To avoid overflow in cases where x^(x-0.5) does overflow
+ * but gamma(x) does not, we split x^(x-0.5) in half and
+ * multiply back up _after_ multiplying the shrinking factor
+ * of exp(-(x-0.5)).
+ *
+ * Note that computing x-0.5 and (x-0.5)/2 is exact for the
+ * relevant range of x, so the only sources of error are pow
+ * and exp themselves, plus the multiplications.
+ */
+ long double powhalf = powl(x, (x-0.5L)/2.0L);
+ long double expret = expl(-(x-0.5L));
+
+ if (!negative) {
+ return (expret * powhalf) * powhalf * p;
+ } else {
+ /*
+ * Apply the reflection formula as commented below, but
+ * carefully: negadjust has magnitude less than 1, so it can
+ * turn a case where gamma(+x) would overflow into a case
+ * where gamma(-x) doesn't underflow. Not only that, but the
+ * FP format has greater range in the tiny domain due to
+ * denormals. For both reasons, it's not good enough to
+ * compute the positive result and then adjust it.
+ */
+ long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p);
+ return ret / powhalf;
+ }
+}
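The split matters in practice. Rough binary128 magnitudes (format maximum about 1.2e4932): at x = 1755, x^(x-1/2) is near 1e5692, far out of range, while gamma(x) = x^(x-1/2) * e^-(x-1/2) * K is near 1e4930 and representable, because e^-1754.5 is about 1e-762. Multiplying expret in before the second powhalf factor keeps every intermediate in range:

    long double powhalf = powl (x, (x - 0.5L) / 2.0L); /* ~1e2846 */
    long double expret  = expl (-(x - 0.5L));          /* ~1e-762  */
    /* (expret * powhalf) ~ 1e2084; * powhalf * p ~ 1e4930: in range. */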
+
+/* Return tgamma(x) on the assumption that 0 <= x < 1/32. */
+static long double tgamma_tiny(long double x,
+ bool negative, long double negadjust)
+{
+ /*
+ * For x near zero, we use a polynomial approximation to
+     * g = 1/(x*gamma(x)), and then return 1/(g*x). In the negative
+     * case, the desired 1/(gamma(x) * x * negadjust) simplifies to
+     * g/negadjust, so the reciprocal comes for free.
+ */
+ long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x);
+ if (!negative)
+ return 1.0L / (g*x);
+ else
+ return g / negadjust;
+}
+
+/* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */
+static long double tgamma_ultratiny(long double x, bool negative,
+ long double negadjust)
+{
+ /* On this interval, gamma can't even be distinguished from 1/x,
+ * so we skip the polynomial evaluation in tgamma_tiny, partly to
+ * save time and partly to avoid the tiny intermediate values
+ * setting the underflow exception flag. */
+ if (!negative)
+ return 1.0L / x;
+ else
+ return 1.0L / negadjust;
+}
+
+/* Return tgamma(x) on the assumption that 1 <= x <= 2. */
+static long double tgamma_central(long double x)
+{
+ /*
+     * In this central interval, our strategy is to find the
+     * difference between x and the point where gamma has its
+     * minimum, and approximate based on that difference.
+ */
+
+ /* The difference between the input x and the minimum x. The first
+     * subtraction is expected to be exact, since x and min_x_hi have
+ * the same exponent (unless x=2, in which case it will still be
+ * exact). */
+ long double t = (x - min_x_hi) - min_x_lo;
+
+ /*
+ * Now use two different polynomials for the intervals [1,m] and
+ * [m,2].
+ */
+ long double p;
+ if (t < 0)
+ p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t);
+ else
+ p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t);
+
+ return (min_y_lo + p * (t*t)) + min_y_hi;
+}
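
A double-precision sketch of the hi/lo constant trick this function
relies on, with 1/3 standing in for min_x (an arbitrary choice for
illustration; the fma residual is one standard way to build such a
split):

#include <math.h>
#include <stdio.h>

int main(void)
{
    double hi = 1.0 / 3.0;
    double lo = fma(-3.0, hi, 1.0) / 3.0;  /* residual beyond double precision */
    double x = 0.375;                      /* same binade as hi: x - hi is exact */
    printf("two-step: %a\n", (x - hi) - lo);   /* keeps the lo correction */
    printf("one-step: %a\n", x - (hi + lo));   /* hi + lo rounds lo away */
    return 0;
}
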
+
+long double tgamma128(long double x)
+{
+ /*
+ * Start by extracting the number's sign and exponent, and ruling
+ * out cases of non-normalized numbers.
+ *
+ * For an implementation integrated into a system libm, it would
+ * almost certainly be quicker to do this by direct bitwise access
+ * to the input float128 value, using whatever is the local idiom
+ * for knowing its endianness.
+ *
+ * Integration into a system libc may also need to worry about
+ * setting errno, if that's the locally preferred way to report
+ * math.h errors.
+ */
+ int sign = signbit(x);
+ int exponent;
+ switch (fpclassify(x)) {
+ case FP_NAN:
+        return x+x; /* propagate a QNaN; make an SNaN raise an invalid exception */
+ case FP_ZERO:
+ return 1/x; /* divide by zero on purpose to indicate a pole */
+ case FP_INFINITE:
+ if (sign) {
+ return x-x; /* gamma(-inf) has indeterminate sign, so provoke an
+ * IEEE invalid operation exception to indicate that */
+ }
+ return x; /* but gamma(+inf) is just +inf with no error */
+ case FP_SUBNORMAL:
+ exponent = -16384;
+ break;
+ default:
+ frexpl(x, &exponent);
+ exponent--;
+ break;
+ }
+
+ bool negative = false;
+ long double negadjust = 0.0L;
+
+ if (sign) {
+ /*
+ * Euler's reflection formula is
+ *
+ * gamma(1-x) gamma(x) = pi/sin(pi*x)
+ *
+ * pi
+ * => gamma(x) = --------------------
+ * gamma(1-x) sin(pi*x)
+ *
+ * But computing 1-x is going to lose a lot of accuracy when x
+ * is very small, so instead we transform using the recurrence
+ * gamma(t+1)=t gamma(t). Setting t=-x, this gives us
+ * gamma(1-x) = -x gamma(-x), so we now have
+ *
+ * pi
+ * gamma(x) = ----------------------
+ * -x gamma(-x) sin(pi*x)
+ *
+ * which relates gamma(x) to gamma(-x), which is much nicer,
+ * since x can be turned into -x without rounding.
+ */
+ negadjust = sin_pi_x_over_pi(x);
+ negative = true;
+ x = -x;
+
+ /*
+ * Now the ultimate answer we want is
+ *
+ * 1 / (gamma(x) * x * negadjust)
+ *
+ * where x is the positive value we've just turned it into.
+ *
+ * For some of the cases below, we'll compute gamma(x)
+ * normally and then compute this adjusted value afterwards.
+ * But for others, we can implement the reciprocal operation
+ * in this formula by _avoiding_ an inversion that the
+ * sub-case was going to do anyway.
+ */
+
+ if (negadjust == 0) {
+ /*
+ * Special case for negative integers. Applying the
+ * reflection formula would cause division by zero, but
+ * standards would prefer we treat this error case as an
+ * invalid operation and return NaN instead. (Possibly
+ * because otherwise you'd have to decide which sign of
+ * infinity to return, and unlike the x=0 case, there's no
+ * sign of zero available to disambiguate.)
+ */
+ return negadjust / negadjust;
+ }
+ }
+
+ /*
+ * Split the positive domain into various cases. For cases where
+ * we do the negative-number adjustment the usual way, we'll leave
+ * the answer in 'g' and drop out of the if statement.
+ */
+ long double g;
+
+ if (exponent >= 11) {
+ /*
+ * gamma of any positive value this large overflows, and gamma
+ * of any negative value underflows.
+ */
+ if (!negative) {
+ long double huge = 0x1p+12288L;
+ return huge * huge; /* provoke an overflow */
+ } else {
+ long double tiny = 0x1p-12288L;
+ return tiny * tiny * negadjust; /* underflow, of the right sign */
+ }
+ } else if (exponent >= 3) {
+ /* Negative-number adjustment happens inside here */
+ return tgamma_large(x, negative, negadjust);
+ } else if (exponent < -113) {
+ /* Negative-number adjustment happens inside here */
+ return tgamma_ultratiny(x, negative, negadjust);
+ } else if (exponent < -5) {
+ /* Negative-number adjustment happens inside here */
+ return tgamma_tiny(x, negative, negadjust);
+ } else if (exponent == 0) {
+ g = tgamma_central(x);
+ } else if (exponent < 0) {
+ /*
+ * For x in [1/32,1) we range-reduce upwards to the interval
+ * [1,2), using the inverse of the normal recurrence formula:
+ * gamma(x) = gamma(x+1)/x.
+ */
+ g = tgamma_central(1+x) / x;
+ } else {
+ /*
+ * For x in [2,8) we range-reduce downwards to the interval
+ * [1,2) by repeated application of the recurrence formula.
+ *
+ * Actually multiplying (x-1) by (x-2) by (x-3) and so on
+ * would introduce multiple ULPs of rounding error. We can get
+ * better accuracy by writing x = (k+1/2) + t, where k is an
+ * integer and |t|<1/2, and expanding out the obvious factor
+ * (x-1)(x-2)...(x-k+1) as a polynomial in t.
+ */
+ long double mult;
+ int i = x;
+ if (i == 2) { /* x in [2,3) */
+ mult = (x-1);
+ } else {
+ long double t = x - (i + 0.5L);
+ switch (i) {
+ /* E.g. for x=3.5+t, we want
+ * (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */
+ case 3:
+ mult = 3.75L+t*(4.0L+t);
+ break;
+ case 4:
+ mult = 13.125L+t*(17.75L+t*(7.5L+t));
+ break;
+ case 5:
+ mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t)));
+ break;
+ case 6:
+ mult = 324.84375L+t*(570.5625L+t*(376.250L+t*(
+ 117.5L+t*(17.5L+t))));
+ break;
+ case 7:
+ mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*(
+ 1140.0L+t*(231.25L+t*(24.0L+t)))));
+ break;
+ }
+ }
+
+ g = tgamma_central(x - (i-1)) * mult;
+ }
+
+ if (!negative) {
+ /* Positive domain: return g unmodified */
+ return g;
+ } else {
+ /* Negative domain: apply the reflection formula as commented above */
+ return 1.0L / (g * x * negadjust);
+ }
+}
+
+#endif
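
A quick numerical cross-check (in ordinary double) of the case-4
coefficients above: for x = 4.5 + t, (x-1)(x-2)(x-3) should match
13.125 + t*(17.75 + t*(7.5 + t)); the test points are arbitrary:

#include <stdio.h>

int main(void)
{
    for (double t = -0.5; t <= 0.5; t += 0.125) {
        double x = 4.5 + t;
        double direct = (x - 1) * (x - 2) * (x - 3);
        double horner = 13.125 + t * (17.75 + t * (7.5 + t));
        printf("t=%+.3f  direct=%.17g  horner=%.17g\n", t, direct, horner);
    }
    return 0;
}
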
diff --git a/contrib/arm-optimized-routines/math/tgamma128.h b/contrib/arm-optimized-routines/math/tgamma128.h
new file mode 100644
index 000000000000..90875a22dce4
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/tgamma128.h
@@ -0,0 +1,141 @@
+/*
+ * Polynomial coefficients and other constants for tgamma128.c.
+ *
+ * Copyright (c) 2006-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* The largest positive value for which 128-bit tgamma does not overflow. */
+static const long double max_x = 0x1.b6e3180cd66a5c4206f128ba77f4p+10L;
+
+/* Coefficients of the polynomial used in the tgamma_large() subroutine */
+static const long double coeffs_large[] = {
+ 0x1.8535745aa79569579b9eec0f3bbcp+0L,
+ 0x1.0378f83c6fb8f0e51269f2b4a973p-3L,
+ 0x1.59f6a05094f69686c3380f4e2783p-8L,
+ -0x1.0b291dee952a82764a4859b081a6p-8L,
+ -0x1.6dd301b2205bf936b5a3eaad0dbbp-12L,
+ 0x1.387a8b5f38dd77e7f139b1021e86p-10L,
+ 0x1.bca46637f65b13750c728cc29e40p-14L,
+ -0x1.d80401c00aef998c9e303151a51cp-11L,
+ -0x1.49cb6bb09f935a2053ccc2cf3711p-14L,
+ 0x1.4e950204437dcaf2be77f73a6f45p-10L,
+ 0x1.cb711a2d65f188bf60110934d6bep-14L,
+ -0x1.7d7ff4bc95dc7faefc5e767f70f1p-9L,
+ -0x1.0305ab9760cddb0d833e73766836p-12L,
+ 0x1.3ef6c84bf1cd5c3f65ac2693bb5bp-7L,
+ 0x1.bb4144740ad9290123fdcea684aap-11L,
+ -0x1.72ab4e88272a229bfafd192450f0p-5L,
+ 0x1.80c70ac6eb3b7a698983d25a62b8p-12L,
+ 0x1.e222791c6743ce3e3cae220fb236p-3L,
+ 0x1.1a2dca1c82a9326c52b465f7cb7ap-2L,
+ -0x1.9d204fa235a42cd901b123d2ad47p+1L,
+ 0x1.55b56d1158f77ddb1c95fc44ab02p+0L,
+ 0x1.37f900a11dbd892abd7dde533e2dp+5L,
+ -0x1.2da49f4188dd89cb958369ef2401p+7L,
+ 0x1.fdae5ec3ec6eb7dffc09edbe6c14p+7L,
+ -0x1.61433cebe649098c9611c4c7774ap+7L,
+};
+
+/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */
+static const long double coeffs_tiny[] = {
+ 0x1.0000000000000000000000000000p+0L,
+ 0x1.2788cfc6fb618f49a37c7f0201fep-1L,
+ -0x1.4fcf4026afa2dceb8490ade22796p-1L,
+ -0x1.5815e8fa27047c8f42b5d9217244p-5L,
+ 0x1.5512320b43fbe5dfa771333518f7p-3L,
+ -0x1.59af103c340927bffdd44f954bfcp-5L,
+ -0x1.3b4af28483e210479657e5543366p-7L,
+ 0x1.d919c527f6070bfce9b29c2ace9cp-8L,
+ -0x1.317112ce35337def3556a18aa178p-10L,
+ -0x1.c364fe77a6f27677b985b1fa2e1dp-13L,
+ 0x1.0c8a7a19a3fd40fe1f7e867efe7bp-13L,
+ -0x1.51cf9f090b5dc398ba86305e3634p-16L,
+ -0x1.4e80f64c04a339740de06ca9fa4ap-20L,
+ 0x1.241ddc2aef2ec20e58b08f2fda17p-20L,
+};
+
+/* The location within the interval [1,2] where gamma has a minimum.
+ * Specified as the sum of two 128-bit values, for extra precision. */
+static const long double min_x_hi = 0x1.762d86356be3f6e1a9c8865e0a4fp+0L;
+static const long double min_x_lo = 0x1.ac54d7d218de21303a7c60f08840p-118L;
+
+/* The actual minimum value that gamma takes at that location.
+ * Again specified as the sum of two 128-bit values. */
+static const long double min_y_hi = 0x1.c56dc82a74aee8d8851566d40f32p-1L;
+static const long double min_y_lo = 0x1.8ed98685742c353ce55e5794686fp-114L;
+
+/* Coefficients of the polynomial used in the tgamma_central() subroutine
+ * for computing gamma on the interval [1,min_x] */
+static const long double coeffs_central_neg[] = {
+ 0x1.b6c53f7377b83839c8a292e43b69p-2L,
+ 0x1.0bae9f40c7d09ed76e732045850ap-3L,
+ 0x1.4981175e14d04c3530e51d01c5fep-3L,
+ 0x1.79f77aaf032c948af3a9edbd2061p-4L,
+ 0x1.1e97bd10821095a5b79fbfdfa1a3p-4L,
+ 0x1.8071ce0935e4dcf0b33b0fbec7c1p-5L,
+ 0x1.0b44c2f92982f887b55ec36dfdb0p-5L,
+ 0x1.6df1de1e178ef72ca7bd63d40870p-6L,
+ 0x1.f63f502bde27e81c0f5e13479b43p-7L,
+ 0x1.57fd67d901f40ea011353ad89a0ap-7L,
+ 0x1.d7151376eed187eb753e2273cafcp-8L,
+ 0x1.427162b5c6ff1d904c71ef53e37cp-8L,
+ 0x1.b954b8c3a56cf93e49ef6538928ap-9L,
+ 0x1.2dff2ec26a3ae5cd3aaccae7a09ep-9L,
+ 0x1.9d35250d9b9378d9b59df734537ap-10L,
+ 0x1.1b2c0c48b9855a28f6dbd6fdff3cp-10L,
+ 0x1.7e0db39bb99cdb52b028d9359380p-11L,
+ 0x1.2164b5e1d364a0b5eaf97c436aa7p-11L,
+ 0x1.27521cf5fd24dcdf43524e6add11p-13L,
+ 0x1.06461d62243bf9a826b42349672fp-10L,
+ -0x1.2b852abead28209b4e0c756dc46ep-9L,
+ 0x1.be673c11a72c826115ec6d286c14p-8L,
+ -0x1.fd9ce330c215c31fcd3cb53c42ebp-7L,
+ 0x1.fa362bd2dc68f41abef2d8600acdp-6L,
+ -0x1.a21585b2f52f8b23855de8e452edp-5L,
+ 0x1.1f234431ed032052fc92e64e0493p-4L,
+ -0x1.40d332476ca0199c60cdae3f9132p-4L,
+ 0x1.1d45dc665d86012eba2eea199cefp-4L,
+ -0x1.8491016cdd08dc9be7ade9b5fef3p-5L,
+ 0x1.7e7e2fbc6d49ad484300d6add324p-6L,
+ -0x1.e63fe3f874a37276a8d7d8b705ecp-8L,
+ 0x1.30a2a73944f8c84998314d69c23fp-10L,
+};
+
+/* Coefficients of the polynomial used in the tgamma_central() subroutine
+ * for computing gamma on the interval [min_x,2] */
+static const long double coeffs_central_pos[] = {
+ 0x1.b6c53f7377b83839c8a292e22aa2p-2L,
+ -0x1.0bae9f40c7d09ed76e72e1c955dep-3L,
+ 0x1.4981175e14d04c3530ee5e1ecebcp-3L,
+ -0x1.79f77aaf032c948ac983d77f3e07p-4L,
+ 0x1.1e97bd10821095ab7dc94936cc11p-4L,
+ -0x1.8071ce0935e4d7edef8cbf2a1cf1p-5L,
+ 0x1.0b44c2f929837fafef7b5d9e80f1p-5L,
+ -0x1.6df1de1e175fe2a51faa25cddbb4p-6L,
+ 0x1.f63f502be57d11aed2cfe90843ffp-7L,
+ -0x1.57fd67d852f230015b9f64770273p-7L,
+ 0x1.d715138adc07e5fce81077070357p-8L,
+ -0x1.4271618e9fda8992a667adb15f4fp-8L,
+ 0x1.b954d15d9eb772e80fdd760672d7p-9L,
+ -0x1.2dfe391241d3cb79c8c15182843dp-9L,
+ 0x1.9d44396fcd48451c3ba924cee814p-10L,
+ -0x1.1ac195fb99739e341589e39803e6p-10L,
+ 0x1.82e46127b68f002770826e25f146p-11L,
+ -0x1.089dacd90d9f41493119ac178359p-11L,
+ 0x1.6993c007b20394a057d21f3d37f8p-12L,
+ -0x1.ec43a709f4446560c099dec8e31bp-13L,
+ 0x1.4ba36322f4074e9add9450f003cap-13L,
+ -0x1.b3f83a977965ca1b7937bf5b34cap-14L,
+ 0x1.10af346abc09cb25a6d9fe810b6ep-14L,
+ -0x1.38d8ea1188f242f50203edc395bdp-15L,
+ 0x1.39add987a948ec56f62b721a4475p-16L,
+ -0x1.02a4e141f286c8a967e2df9bc9adp-17L,
+ 0x1.433b50af22425f546e87113062d7p-19L,
+ -0x1.0c7b73cb0013f00aafc103e8e382p-21L,
+ 0x1.b852de313ec38da2297f6deaa6b4p-25L,
+};
+
+/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine
+ */
+static const long double pi = 0x1.921fb54442d18469898cc51701b8p+1L;
diff --git a/contrib/arm-optimized-routines/math/tools/cos.sollya b/contrib/arm-optimized-routines/math/tools/cos.sollya
index bd72d6b74820..6690adfcbb9b 100644
--- a/contrib/arm-optimized-routines/math/tools/cos.sollya
+++ b/contrib/arm-optimized-routines/math/tools/cos.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating cos(x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 8; // polynomial degree
a = -pi/4; // interval
diff --git a/contrib/arm-optimized-routines/math/tools/exp.sollya b/contrib/arm-optimized-routines/math/tools/exp.sollya
index b7a462cda5a4..0668bdb5b3d3 100644
--- a/contrib/arm-optimized-routines/math/tools/exp.sollya
+++ b/contrib/arm-optimized-routines/math/tools/exp.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating e^x
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 5; // poly degree
N = 128; // table entries
diff --git a/contrib/arm-optimized-routines/math/tools/exp2.sollya b/contrib/arm-optimized-routines/math/tools/exp2.sollya
index e760769601d4..bd0a42d6bbcb 100644
--- a/contrib/arm-optimized-routines/math/tools/exp2.sollya
+++ b/contrib/arm-optimized-routines/math/tools/exp2.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating 2^x
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
// exp2f parameters
deg = 3; // poly degree
diff --git a/contrib/arm-optimized-routines/math/tools/log.sollya b/contrib/arm-optimized-routines/math/tools/log.sollya
index 6df4db44b6f3..5288f5572925 100644
--- a/contrib/arm-optimized-routines/math/tools/log.sollya
+++ b/contrib/arm-optimized-routines/math/tools/log.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating log(1+x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 12; // poly degree
// |log(1+x)| > 0x1p-4 outside the interval
diff --git a/contrib/arm-optimized-routines/math/tools/log2.sollya b/contrib/arm-optimized-routines/math/tools/log2.sollya
index 4a364c0f111f..85811be5d90c 100644
--- a/contrib/arm-optimized-routines/math/tools/log2.sollya
+++ b/contrib/arm-optimized-routines/math/tools/log2.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating log2(1+x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 11; // poly degree
// |log2(1+x)| > 0x1p-4 outside the interval
diff --git a/contrib/arm-optimized-routines/math/tools/log2_abs.sollya b/contrib/arm-optimized-routines/math/tools/log2_abs.sollya
index 82c4dac26fa1..d018ba0145d2 100644
--- a/contrib/arm-optimized-routines/math/tools/log2_abs.sollya
+++ b/contrib/arm-optimized-routines/math/tools/log2_abs.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating log2(1+x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 7; // poly degree
// interval ~= 1/(2*N), where N is the table entries
diff --git a/contrib/arm-optimized-routines/math/tools/log_abs.sollya b/contrib/arm-optimized-routines/math/tools/log_abs.sollya
index a2ac190fc497..5f9bfe41a683 100644
--- a/contrib/arm-optimized-routines/math/tools/log_abs.sollya
+++ b/contrib/arm-optimized-routines/math/tools/log_abs.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating log(1+x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 6; // poly degree
// interval ~= 1/(2*N), where N is the table entries
diff --git a/contrib/arm-optimized-routines/math/tools/plot.py b/contrib/arm-optimized-routines/math/tools/plot.py
index 6c8b89ff284b..a0fa02322560 100755
--- a/contrib/arm-optimized-routines/math/tools/plot.py
+++ b/contrib/arm-optimized-routines/math/tools/plot.py
@@ -3,7 +3,7 @@
# ULP error plot tool.
#
# Copyright (c) 2019, Arm Limited.
-# SPDX-License-Identifier: MIT
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
import numpy as np
import matplotlib.pyplot as plt
diff --git a/contrib/arm-optimized-routines/math/tools/remez.jl b/contrib/arm-optimized-routines/math/tools/remez.jl
index 2ff436f5287f..1deab67d0660 100755
--- a/contrib/arm-optimized-routines/math/tools/remez.jl
+++ b/contrib/arm-optimized-routines/math/tools/remez.jl
@@ -4,7 +4,7 @@
# remez.jl - implementation of the Remez algorithm for polynomial approximation
#
# Copyright (c) 2015-2019, Arm Limited.
-# SPDX-License-Identifier: MIT
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
import Base.\
diff --git a/contrib/arm-optimized-routines/math/tools/sin.sollya b/contrib/arm-optimized-routines/math/tools/sin.sollya
index a6e851145c11..a19300019867 100644
--- a/contrib/arm-optimized-routines/math/tools/sin.sollya
+++ b/contrib/arm-optimized-routines/math/tools/sin.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating sin(x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 7; // polynomial degree
a = -pi/4; // interval
diff --git a/contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl b/contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl
new file mode 100644
index 000000000000..ecec174110ea
--- /dev/null
+++ b/contrib/arm-optimized-routines/math/tools/tgamma128_gen.jl
@@ -0,0 +1,212 @@
+# -*- julia -*-
+#
+# Generate tgamma128.h, containing polynomials and constants used by
+# tgamma128.c.
+#
+# Copyright (c) 2006-2023, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+# This Julia program depends on the 'Remez' and 'SpecialFunctions'
+# library packages. To install them, run this at the interactive Julia
+# prompt:
+#
+# import Pkg; Pkg.add(["Remez", "SpecialFunctions"])
+#
+# Tested on Julia 1.4.1 (Ubuntu 20.04) and 1.9.0 (22.04).
+
+import Printf
+import Remez
+import SpecialFunctions
+
+# Round a BigFloat to 128-bit long double and format it as a C99 hex
+# float literal.
+function quadhex(x)
+ sign = " "
+ if x < 0
+ sign = "-"
+ x = -x
+ end
+
+ exponent = BigInt(floor(log2(x)))
+ exponent = max(exponent, -16382)
+ @assert(exponent <= 16383) # else overflow
+
+ x /= BigFloat(2)^exponent
+ @assert(1 <= x < 2)
+ x *= BigFloat(2)^112
+ mantissa = BigInt(round(x))
+
+ mantstr = string(mantissa, base=16, pad=29)
+ return Printf.@sprintf("%s0x%s.%sp%+dL", sign, mantstr[1], mantstr[2:end],
+ exponent)
+end
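
One way to spot-check the literals quadhex() emits, assuming a target
where long double is binary128 (e.g. AArch64) and the libc's %La uses
the same 1.x hex normalization:

#include <stdio.h>

int main(void)
{
    const long double pi = 0x1.921fb54442d18469898cc51701b8p+1L;
    printf("%La\n", pi);   /* should round-trip the literal above */
    return 0;
}
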
+
+# Round a BigFloat to 128-bit long double and return it still as a
+# BigFloat.
+function quadval(x, round=0)
+ sign = +1
+ if x.sign < 0
+ sign = -1
+ x = -x
+ end
+
+ exponent = BigInt(floor(log2(x)))
+ exponent = max(exponent, -16382)
+ @assert(exponent <= 16383) # else overflow
+
+ x /= BigFloat(2)^exponent
+ @assert(1 <= x < 2)
+ x *= BigFloat(2)^112
+ if round < 0
+ mantissa = floor(x)
+ elseif round > 0
+ mantissa = ceil(x)
+ else
+        mantissa = Base.round(x)  # the parameter name shadows Base.round
+ end
+
+ return sign * mantissa * BigFloat(2)^(exponent - 112)
+end
+
+# Output an array of BigFloats as a C array declaration.
+function dumparray(a, name)
+ println("static const long double ", name, "[] = {")
+    for x in a
+ println(" ", quadhex(x), ",")
+ end
+ println("};")
+end
+
+print("/*
+ * Polynomial coefficients and other constants for tgamma128.c.
+ *
+ * Copyright (c) 2006,2009,2023 Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+")
+
+Base.MPFR.setprecision(512)
+
+e = exp(BigFloat(1))
+
+print("
+/* The largest positive value for which 128-bit tgamma does not overflow. */
+")
+lo = BigFloat("1000")
+hi = BigFloat("2000")
+while true
+ global lo
+ global hi
+ global max_x
+
+ mid = (lo + hi) / 2
+ if mid == lo || mid == hi
+ max_x = mid
+ break
+ end
+ if SpecialFunctions.logabsgamma(mid)[1] < 16384 * log(BigFloat(2))
+ lo = mid
+ else
+ hi = mid
+ end
+end
+max_x = quadval(max_x, -1)
+println("static const long double max_x = ", quadhex(max_x), ";")
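
A rough double-precision analogue of this bisection (the float128
version above compares lgamma against log(2^16384); here the threshold
is log(DBL_MAX) and the answer comes out near 171.62):

#include <float.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    double lo = 100.0, hi = 200.0;
    while (nextafter(lo, hi) < hi) {       /* stop once lo, hi are adjacent */
        double mid = 0.5 * (lo + hi);
        if (lgamma(mid) < log(DBL_MAX))
            lo = mid;
        else
            hi = mid;
    }
    printf("largest x with finite tgamma(x): %.17g\n", lo);
    return 0;
}
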
+
+print("
+/* Coefficients of the polynomial used in the tgamma_large() subroutine */
+")
+N, D, E, X = Remez.ratfn_minimax(
+ x -> x==0 ? sqrt(BigFloat(2)*pi/e) :
+ exp(SpecialFunctions.logabsgamma(1/x)[1] +
+ (1/x-0.5)*(1+log(x))),
+ (0, 1/BigFloat(8)),
+ 24, 0,
+ (x, y) -> 1/y
+)
+dumparray(N, "coeffs_large")
+
+print("
+/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */
+")
+N, D, E, X = Remez.ratfn_minimax(
+ x -> x==0 ? 1 : 1/(x*SpecialFunctions.gamma(x)),
+ (0, 1/BigFloat(32)),
+ 13, 0,
+)
+dumparray(N, "coeffs_tiny")
+
+print("
+/* The location within the interval [1,2] where gamma has a minimum.
+ * Specified as the sum of two 128-bit values, for extra precision. */
+")
+lo = BigFloat("1.4")
+hi = BigFloat("1.5")
+while true
+ global lo
+ global hi
+ global min_x
+
+ mid = (lo + hi) / 2
+ if mid == lo || mid == hi
+ min_x = mid
+ break
+ end
+ if SpecialFunctions.digamma(mid) < 0
+ lo = mid
+ else
+ hi = mid
+ end
+end
+min_x_hi = quadval(min_x, -1)
+println("static const long double min_x_hi = ", quadhex(min_x_hi), ";")
+println("static const long double min_x_lo = ", quadhex(min_x - min_x_hi), ";")
+
+print("
+/* The actual minimum value that gamma takes at that location.
+ * Again specified as the sum of two 128-bit values. */
+")
+min_y = SpecialFunctions.gamma(min_x)
+min_y_hi = quadval(min_y, -1)
+println("static const long double min_y_hi = ", quadhex(min_y_hi), ";")
+println("static const long double min_y_lo = ", quadhex(min_y - min_y_hi), ";")
+
+function taylor_bodge(x)
+ # Taylor series generated by Wolfram Alpha for (gamma(min_x+x)-min_y)/x^2.
+ # Used in the Remez calls below for x values very near the origin, to avoid
+ # significance loss problems when trying to compute it directly via that
+ # formula (even in MPFR's extra precision).
+    c0 = BigFloat("0.428486815855585429730209907810650582960483696962660010556335457558784421896667728014324097132413696263704801646004585959298743677879606168187061990204432200")
+    c1 = BigFloat("0.130704158939785761928008749242671025181542078105370084716141350308119418619652583986015464395882363802104154017741656168641240436089858504560718773026275797")
+    c2 = BigFloat("0.160890753325112844190519489594363387594505844658437718135952967735294789599989664428071656484587979507034160383271974554122934842441540146372016567834062876")
+    c3 = BigFloat("0.092277030213334350126864106458600575084335085690780082222880945224248438672595248111704471182201673989215223667543694847795410779036800385804729955729659506")
+    return c0 + x*(-c1 + x*(c2 + x*(-c3)))
+end
+
+print("
+/* Coefficients of the polynomial used in the tgamma_central() subroutine
+ * for computing gamma on the interval [1,min_x] */
+")
+N, D, E, X = Remez.ratfn_minimax(
+ x -> x < BigFloat(0x1p-64) ? taylor_bodge(-x) :
+ (SpecialFunctions.gamma(min_x - x) - min_y) / (x*x),
+ (0, min_x - 1),
+ 31, 0,
+ (x, y) -> x^2,
+)
+dumparray(N, "coeffs_central_neg")
+
+print("
+/* Coefficients of the polynomial used in the tgamma_central() subroutine
+ * for computing gamma on the interval [min_x,2] */
+")
+N, D, E, X = Remez.ratfn_minimax(
+ x -> x < BigFloat(0x1p-64) ? taylor_bodge(x) :
+ (SpecialFunctions.gamma(min_x + x) - min_y) / (x*x),
+ (0, 2 - min_x),
+ 28, 0,
+ (x, y) -> x^2,
+)
+dumparray(N, "coeffs_central_pos")
+
+print("
+/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine
+ */
+")
+println("static const long double pi = ", quadhex(BigFloat(pi)), ";")
diff --git a/contrib/arm-optimized-routines/math/tools/v_exp.sollya b/contrib/arm-optimized-routines/math/tools/v_exp.sollya
index c0abb63fb642..5fa7de7435a9 100644
--- a/contrib/arm-optimized-routines/math/tools/v_exp.sollya
+++ b/contrib/arm-optimized-routines/math/tools/v_exp.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating e^x
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 4; // poly degree
N = 128; // table entries
diff --git a/contrib/arm-optimized-routines/math/tools/v_log.sollya b/contrib/arm-optimized-routines/math/tools/v_log.sollya
index cc3d2c4ae72a..d982524eb920 100644
--- a/contrib/arm-optimized-routines/math/tools/v_log.sollya
+++ b/contrib/arm-optimized-routines/math/tools/v_log.sollya
@@ -1,7 +1,7 @@
// polynomial used for __v_log(x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 6; // poly degree
a = -0x1.fc1p-9;
diff --git a/contrib/arm-optimized-routines/math/tools/v_sin.sollya b/contrib/arm-optimized-routines/math/tools/v_sin.sollya
index 65cc9957c624..63b9d65a1ac3 100644
--- a/contrib/arm-optimized-routines/math/tools/v_sin.sollya
+++ b/contrib/arm-optimized-routines/math/tools/v_sin.sollya
@@ -1,7 +1,7 @@
// polynomial for approximating sin(x)
//
// Copyright (c) 2019, Arm Limited.
-// SPDX-License-Identifier: MIT
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
deg = 15; // polynomial degree
a = -pi/2; // interval
diff --git a/contrib/arm-optimized-routines/math/v_cos.c b/contrib/arm-optimized-routines/math/v_cos.c
deleted file mode 100644
index 20ba6bd0d0d9..000000000000
--- a/contrib/arm-optimized-routines/math/v_cos.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Double-precision vector cos function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const double Poly[] = {
-/* worst-case error is 3.5 ulp.
- abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */
--0x1.9f4a9c8b21dc9p-41,
- 0x1.60e88a10163f2p-33,
--0x1.ae6361b7254e7p-26,
- 0x1.71de382e8d62bp-19,
--0x1.a01a019aeb4ffp-13,
- 0x1.111111110b25ep-7,
--0x1.55555555554c3p-3,
-};
-
-#define C7 v_f64 (Poly[0])
-#define C6 v_f64 (Poly[1])
-#define C5 v_f64 (Poly[2])
-#define C4 v_f64 (Poly[3])
-#define C3 v_f64 (Poly[4])
-#define C2 v_f64 (Poly[5])
-#define C1 v_f64 (Poly[6])
-
-#define InvPi v_f64 (0x1.45f306dc9c883p-2)
-#define HalfPi v_f64 (0x1.921fb54442d18p+0)
-#define Pi1 v_f64 (0x1.921fb54442d18p+1)
-#define Pi2 v_f64 (0x1.1a62633145c06p-53)
-#define Pi3 v_f64 (0x1.c1cd129024e09p-106)
-#define Shift v_f64 (0x1.8p52)
-#define RangeVal v_f64 (0x1p23)
-#define AbsMask v_u64 (0x7fffffffffffffff)
-
-VPCS_ATTR
-__attribute__ ((noinline)) static v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
-{
- return v_call_f64 (cos, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f64_t
-V_NAME(cos) (v_f64_t x)
-{
- v_f64_t n, r, r2, y;
- v_u64_t odd, cmp;
-
- r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask);
- cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal));
-
- /* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = v_fma_f64 (InvPi, r + HalfPi, Shift);
- odd = v_as_u64_f64 (n) << 63;
- n -= Shift;
- n -= v_f64 (0.5);
-
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = v_fma_f64 (-Pi1, n, r);
- r = v_fma_f64 (-Pi2, n, r);
- r = v_fma_f64 (-Pi3, n, r);
-
- /* sin(r) poly approx. */
- r2 = r * r;
- y = v_fma_f64 (C7, r2, C6);
- y = v_fma_f64 (y, r2, C5);
- y = v_fma_f64 (y, r2, C4);
- y = v_fma_f64 (y, r2, C3);
- y = v_fma_f64 (y, r2, C2);
- y = v_fma_f64 (y, r2, C1);
- y = v_fma_f64 (y * r2, r, r);
-
- /* sign. */
- y = v_as_f64_u64 (v_as_u64_f64 (y) ^ odd);
-
- if (unlikely (v_any_u64 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-#endif
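
A scalar sketch of the Shift trick the deleted routine used, assuming
the default round-to-nearest mode and no excess precision: adding
0x1.8p52 rounds the fraction away, so subtracting it back yields
rint(x), and the low bits of the biased sum hold the integer (whose
low bit the vector code shifts up to become the sign mask):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const double Shift = 0x1.8p52;
    double x = 2.7;
    double biased = x + Shift;
    double n = biased - Shift;             /* rint(x) without calling rint */
    uint64_t u; memcpy(&u, &biased, sizeof u);
    printf("n=%g  rint=%g  low bits=%llu\n", n, rint(x),
           (unsigned long long)(u & 0xff));
    return 0;
}
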
diff --git a/contrib/arm-optimized-routines/math/v_cosf.c b/contrib/arm-optimized-routines/math/v_cosf.c
deleted file mode 100644
index 150294b8845e..000000000000
--- a/contrib/arm-optimized-routines/math/v_cosf.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Single-precision vector cos function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* 1.886 ulp error */
- 0x1.5b2e76p-19f,
- -0x1.9f42eap-13f,
- 0x1.110df4p-7f,
- -0x1.555548p-3f,
-};
-#define Pi1 v_f32 (0x1.921fb6p+1f)
-#define Pi2 v_f32 (-0x1.777a5cp-24f)
-#define Pi3 v_f32 (-0x1.ee59dap-49f)
-#define A3 v_f32 (Poly[3])
-#define A5 v_f32 (Poly[2])
-#define A7 v_f32 (Poly[1])
-#define A9 v_f32 (Poly[0])
-#define RangeVal v_f32 (0x1p20f)
-#define InvPi v_f32 (0x1.45f306p-2f)
-#define Shift v_f32 (0x1.8p+23f)
-#define AbsMask v_u32 (0x7fffffff)
-#define HalfPi v_f32 (0x1.921fb6p0f)
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
-{
- /* Fall back to scalar code. */
- return v_call_f32 (cosf, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(cosf) (v_f32_t x)
-{
- v_f32_t n, r, r2, y;
- v_u32_t odd, cmp;
-
- r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask);
- cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal));
-
- /* n = rint((|x|+pi/2)/pi) - 0.5 */
- n = v_fma_f32 (InvPi, r + HalfPi, Shift);
- odd = v_as_u32_f32 (n) << 31;
- n -= Shift;
- n -= v_f32 (0.5f);
-
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
- r = v_fma_f32 (-Pi1, n, r);
- r = v_fma_f32 (-Pi2, n, r);
- r = v_fma_f32 (-Pi3, n, r);
-
- /* y = sin(r) */
- r2 = r * r;
- y = v_fma_f32 (A9, r2, A7);
- y = v_fma_f32 (y, r2, A5);
- y = v_fma_f32 (y, r2, A3);
- y = v_fma_f32 (y * r2, r, r);
-
- /* sign fix */
- y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd);
-
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-#endif
diff --git a/contrib/arm-optimized-routines/math/v_exp.c b/contrib/arm-optimized-routines/math/v_exp.c
deleted file mode 100644
index e459d53fddd2..000000000000
--- a/contrib/arm-optimized-routines/math/v_exp.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Double-precision vector e^x function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-#include "v_exp.h"
-
-#if V_EXP_TABLE_BITS == 7
-/* maxerr: 1.88 +0.5 ulp
- rel error: 1.4337*2^-53
- abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
-#define C1 v_f64 (0x1.ffffffffffd43p-2)
-#define C2 v_f64 (0x1.55555c75adbb2p-3)
-#define C3 v_f64 (0x1.55555da646206p-5)
-#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */
-#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */
-#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63)
-#elif V_EXP_TABLE_BITS == 8
-/* maxerr: 0.54 +0.5 ulp
- rel error: 1.4318*2^-58
- abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. */
-#define C1 v_f64 (0x1.fffffffffffd4p-2)
-#define C2 v_f64 (0x1.5555571d6b68cp-3)
-#define C3 v_f64 (0x1.5555576a59599p-5)
-#define InvLn2 v_f64 (0x1.71547652b82fep8)
-#define Ln2hi v_f64 (0x1.62e42fefa39efp-9)
-#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64)
-#endif
-
-#define N (1 << V_EXP_TABLE_BITS)
-#define Tab __v_exp_data
-#define IndexMask v_u64 (N - 1)
-#define Shift v_f64 (0x1.8p+52)
-#define Thres v_f64 (704.0)
-
-VPCS_ATTR
-static v_f64_t
-specialcase (v_f64_t s, v_f64_t y, v_f64_t n)
-{
- v_f64_t absn = v_abs_f64 (n);
-
- /* 2^(n/N) may overflow, break it up into s1*s2. */
- v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000);
- v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b);
- v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b);
- v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N));
- v_f64_t r1 = s1 * s1;
- v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1;
- return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0)));
-}
-
-VPCS_ATTR
-v_f64_t
-V_NAME(exp) (v_f64_t x)
-{
- v_f64_t n, r, r2, s, y, z;
- v_u64_t cmp, u, e, i;
-
- cmp = v_cond_u64 (v_abs_f64 (x) > Thres);
-
- /* n = round(x/(ln2/N)). */
- z = v_fma_f64 (x, InvLn2, Shift);
- u = v_as_u64_f64 (z);
- n = z - Shift;
-
- /* r = x - n*ln2/N. */
- r = x;
- r = v_fma_f64 (-Ln2hi, n, r);
- r = v_fma_f64 (-Ln2lo, n, r);
-
- e = u << (52 - V_EXP_TABLE_BITS);
- i = u & IndexMask;
-
- /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
- r2 = r * r;
- y = v_fma_f64 (C2, r, C1);
- y = v_fma_f64 (C3, r2, y);
- y = v_fma_f64 (y, r2, r);
-
- /* s = 2^(n/N). */
- u = v_lookup_u64 (Tab, i);
- s = v_as_f64_u64 (u + e);
-
- if (unlikely (v_any_u64 (cmp)))
- return specialcase (s, y, n);
- return v_fma_f64 (y, s, s);
-}
-VPCS_ALIAS
-#endif
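
A scalar sketch of the index/exponent extraction above (N = 128,
round-to-nearest assumed): the low 7 bits of the biased sum give
n mod N, the table index, and the bits above them give floor(n/N),
which the left shift moves into the exponent field:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const double Shift = 0x1.8p52, InvLn2 = 0x1.71547652b82fep7; /* N/ln2 */
    double x = 1.0;
    double z = x * InvLn2 + Shift;
    uint64_t u; memcpy(&u, &z, sizeof u);
    uint64_t i = u & 127;                  /* index into __v_exp_data */
    uint64_t e = u << (52 - 7);            /* exponent-field adjustment */
    printf("n=%g  index=%llu  exponent bump=%llu\n", z - Shift,
           (unsigned long long)i, (unsigned long long)(e >> 52));
    return 0;
}
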
diff --git a/contrib/arm-optimized-routines/math/v_exp.h b/contrib/arm-optimized-routines/math/v_exp.h
deleted file mode 100644
index 305da19c0a53..000000000000
--- a/contrib/arm-optimized-routines/math/v_exp.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Declarations for double-precision e^x vector function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "v_math.h"
-#if WANT_VMATH
-
-#define V_EXP_TABLE_BITS 7
-
-extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
-#endif
diff --git a/contrib/arm-optimized-routines/math/v_exp2f.c b/contrib/arm-optimized-routines/math/v_exp2f.c
deleted file mode 100644
index e3ea5af3414d..000000000000
--- a/contrib/arm-optimized-routines/math/v_exp2f.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Single-precision vector 2^x function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* maxerr: 1.962 ulp. */
- 0x1.59977ap-10f,
- 0x1.3ce9e4p-7f,
- 0x1.c6bd32p-5f,
- 0x1.ebf9bcp-3f,
- 0x1.62e422p-1f,
-};
-#define C0 v_f32 (Poly[0])
-#define C1 v_f32 (Poly[1])
-#define C2 v_f32 (Poly[2])
-#define C3 v_f32 (Poly[3])
-#define C4 v_f32 (Poly[4])
-
-#define Shift v_f32 (0x1.8p23f)
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
-{
- /* 2^n may overflow, break it up into s1*s2. */
- v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
- v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
- v_f32_t s2 = v_as_f32_u32 (e - b);
- v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
- v_u32_t r2 = v_as_u32_f32 (s1 * s1);
- v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
- /* Similar to r1 but avoids double rounding in the subnormal range. */
- v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
- return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(exp2f) (v_f32_t x)
-{
- v_f32_t n, r, r2, scale, p, q, poly, absn;
- v_u32_t cmp, e;
-
- /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = n + r, with r in [-1/2, 1/2]. */
-#if 0
- v_f32_t z;
- z = x + Shift;
- n = z - Shift;
- r = x - n;
- e = v_as_u32_f32 (z) << 23;
-#else
- n = v_round_f32 (x);
- r = x - n;
- e = v_as_u32_s32 (v_round_s32 (x)) << 23;
-#endif
- scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
- absn = v_abs_f32 (n);
- cmp = v_cond_u32 (absn > v_f32 (126.0f));
- r2 = r * r;
- p = v_fma_f32 (C0, r, C1);
- q = v_fma_f32 (C2, r, C3);
- q = v_fma_f32 (p, r2, q);
- p = C4 * r;
- poly = v_fma_f32 (q, r2, p);
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (poly, n, e, absn, cmp, scale);
- return v_fma_f32 (poly, scale, scale);
-}
-VPCS_ALIAS
-#endif
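
A scalar sketch of the scale construction above: n = rint(x), and the
float 2^n is assembled by adding n to the exponent field of 1.0f;
valid for |n| <= 126, which is what the cmp check guards:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    float x = 5.3f;
    int32_t n = (int32_t)lrintf(x);
    uint32_t bits = ((uint32_t)n << 23) + 0x3f800000u;  /* 2^n */
    float scale; memcpy(&scale, &bits, sizeof scale);
    printf("n=%d  scale=%g  exp2f(x)=%g\n", (int)n, (double)scale,
           (double)exp2f(x));
    return 0;
}
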
diff --git a/contrib/arm-optimized-routines/math/v_exp2f_1u.c b/contrib/arm-optimized-routines/math/v_exp2f_1u.c
deleted file mode 100644
index 1caa14d9bfff..000000000000
--- a/contrib/arm-optimized-routines/math/v_exp2f_1u.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Single-precision vector 2^x function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* maxerr: 0.878 ulp. */
- 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f
-};
-#define C0 v_f32 (Poly[0])
-#define C1 v_f32 (Poly[1])
-#define C2 v_f32 (Poly[2])
-#define C3 v_f32 (Poly[3])
-#define C4 v_f32 (Poly[4])
-#define C5 v_f32 (Poly[5])
-
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define Ln2hi v_f32 (0x1.62e4p-1f)
-#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn)
-{
- /* 2^n may overflow, break it up into s1*s2. */
- v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
- v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
- v_f32_t s2 = v_as_f32_u32 (e - b);
- v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f));
- v_f32_t r1 = s1 * s1;
- v_f32_t r0 = poly * s1 * s2;
- return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0)));
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(exp2f_1u) (v_f32_t x)
-{
- v_f32_t n, r, scale, poly, absn;
- v_u32_t cmp, e;
-
- /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
- x = n + r, with r in [-1/2, 1/2]. */
-#if 0
- v_f32_t z;
- z = x + Shift;
- n = z - Shift;
- r = x - n;
- e = v_as_u32_f32 (z) << 23;
-#else
- n = v_round_f32 (x);
- r = x - n;
- e = v_as_u32_s32 (v_round_s32 (x)) << 23;
-#endif
- scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
- absn = v_abs_f32 (n);
- cmp = v_cond_u32 (absn > v_f32 (126.0f));
- poly = v_fma_f32 (C0, r, C1);
- poly = v_fma_f32 (poly, r, C2);
- poly = v_fma_f32 (poly, r, C3);
- poly = v_fma_f32 (poly, r, C4);
- poly = v_fma_f32 (poly, r, C5);
- poly = v_fma_f32 (poly, r, v_f32 (1.0f));
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (poly, n, e, absn);
- return scale * poly;
-}
-#endif
diff --git a/contrib/arm-optimized-routines/math/v_exp_data.c b/contrib/arm-optimized-routines/math/v_exp_data.c
deleted file mode 100644
index 365355497e95..000000000000
--- a/contrib/arm-optimized-routines/math/v_exp_data.c
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- * Lookup table for double-precision e^x vector function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "v_exp.h"
-#if WANT_VMATH
-
-#define N (1 << V_EXP_TABLE_BITS)
-
-/* 2^(j/N), j=0..N. */
-const u64_t __v_exp_data[] = {
-#if N == 128
-0x3ff0000000000000,
-0x3feff63da9fb3335,
-0x3fefec9a3e778061,
-0x3fefe315e86e7f85,
-0x3fefd9b0d3158574,
-0x3fefd06b29ddf6de,
-0x3fefc74518759bc8,
-0x3fefbe3ecac6f383,
-0x3fefb5586cf9890f,
-0x3fefac922b7247f7,
-0x3fefa3ec32d3d1a2,
-0x3fef9b66affed31b,
-0x3fef9301d0125b51,
-0x3fef8abdc06c31cc,
-0x3fef829aaea92de0,
-0x3fef7a98c8a58e51,
-0x3fef72b83c7d517b,
-0x3fef6af9388c8dea,
-0x3fef635beb6fcb75,
-0x3fef5be084045cd4,
-0x3fef54873168b9aa,
-0x3fef4d5022fcd91d,
-0x3fef463b88628cd6,
-0x3fef3f49917ddc96,
-0x3fef387a6e756238,
-0x3fef31ce4fb2a63f,
-0x3fef2b4565e27cdd,
-0x3fef24dfe1f56381,
-0x3fef1e9df51fdee1,
-0x3fef187fd0dad990,
-0x3fef1285a6e4030b,
-0x3fef0cafa93e2f56,
-0x3fef06fe0a31b715,
-0x3fef0170fc4cd831,
-0x3feefc08b26416ff,
-0x3feef6c55f929ff1,
-0x3feef1a7373aa9cb,
-0x3feeecae6d05d866,
-0x3feee7db34e59ff7,
-0x3feee32dc313a8e5,
-0x3feedea64c123422,
-0x3feeda4504ac801c,
-0x3feed60a21f72e2a,
-0x3feed1f5d950a897,
-0x3feece086061892d,
-0x3feeca41ed1d0057,
-0x3feec6a2b5c13cd0,
-0x3feec32af0d7d3de,
-0x3feebfdad5362a27,
-0x3feebcb299fddd0d,
-0x3feeb9b2769d2ca7,
-0x3feeb6daa2cf6642,
-0x3feeb42b569d4f82,
-0x3feeb1a4ca5d920f,
-0x3feeaf4736b527da,
-0x3feead12d497c7fd,
-0x3feeab07dd485429,
-0x3feea9268a5946b7,
-0x3feea76f15ad2148,
-0x3feea5e1b976dc09,
-0x3feea47eb03a5585,
-0x3feea34634ccc320,
-0x3feea23882552225,
-0x3feea155d44ca973,
-0x3feea09e667f3bcd,
-0x3feea012750bdabf,
-0x3fee9fb23c651a2f,
-0x3fee9f7df9519484,
-0x3fee9f75e8ec5f74,
-0x3fee9f9a48a58174,
-0x3fee9feb564267c9,
-0x3feea0694fde5d3f,
-0x3feea11473eb0187,
-0x3feea1ed0130c132,
-0x3feea2f336cf4e62,
-0x3feea427543e1a12,
-0x3feea589994cce13,
-0x3feea71a4623c7ad,
-0x3feea8d99b4492ed,
-0x3feeaac7d98a6699,
-0x3feeace5422aa0db,
-0x3feeaf3216b5448c,
-0x3feeb1ae99157736,
-0x3feeb45b0b91ffc6,
-0x3feeb737b0cdc5e5,
-0x3feeba44cbc8520f,
-0x3feebd829fde4e50,
-0x3feec0f170ca07ba,
-0x3feec49182a3f090,
-0x3feec86319e32323,
-0x3feecc667b5de565,
-0x3feed09bec4a2d33,
-0x3feed503b23e255d,
-0x3feed99e1330b358,
-0x3feede6b5579fdbf,
-0x3feee36bbfd3f37a,
-0x3feee89f995ad3ad,
-0x3feeee07298db666,
-0x3feef3a2b84f15fb,
-0x3feef9728de5593a,
-0x3feeff76f2fb5e47,
-0x3fef05b030a1064a,
-0x3fef0c1e904bc1d2,
-0x3fef12c25bd71e09,
-0x3fef199bdd85529c,
-0x3fef20ab5fffd07a,
-0x3fef27f12e57d14b,
-0x3fef2f6d9406e7b5,
-0x3fef3720dcef9069,
-0x3fef3f0b555dc3fa,
-0x3fef472d4a07897c,
-0x3fef4f87080d89f2,
-0x3fef5818dcfba487,
-0x3fef60e316c98398,
-0x3fef69e603db3285,
-0x3fef7321f301b460,
-0x3fef7c97337b9b5f,
-0x3fef864614f5a129,
-0x3fef902ee78b3ff6,
-0x3fef9a51fbc74c83,
-0x3fefa4afa2a490da,
-0x3fefaf482d8e67f1,
-0x3fefba1bee615a27,
-0x3fefc52b376bba97,
-0x3fefd0765b6e4540,
-0x3fefdbfdad9cbe14,
-0x3fefe7c1819e90d8,
-0x3feff3c22b8f71f1,
-#elif N == 256
-0x3ff0000000000000,
-0x3feffb1afa5abcbf,
-0x3feff63da9fb3335,
-0x3feff168143b0281,
-0x3fefec9a3e778061,
-0x3fefe7d42e11bbcc,
-0x3fefe315e86e7f85,
-0x3fefde5f72f654b1,
-0x3fefd9b0d3158574,
-0x3fefd50a0e3c1f89,
-0x3fefd06b29ddf6de,
-0x3fefcbd42b72a836,
-0x3fefc74518759bc8,
-0x3fefc2bdf66607e0,
-0x3fefbe3ecac6f383,
-0x3fefb9c79b1f3919,
-0x3fefb5586cf9890f,
-0x3fefb0f145e46c85,
-0x3fefac922b7247f7,
-0x3fefa83b23395dec,
-0x3fefa3ec32d3d1a2,
-0x3fef9fa55fdfa9c5,
-0x3fef9b66affed31b,
-0x3fef973028d7233e,
-0x3fef9301d0125b51,
-0x3fef8edbab5e2ab6,
-0x3fef8abdc06c31cc,
-0x3fef86a814f204ab,
-0x3fef829aaea92de0,
-0x3fef7e95934f312e,
-0x3fef7a98c8a58e51,
-0x3fef76a45471c3c2,
-0x3fef72b83c7d517b,
-0x3fef6ed48695bbc0,
-0x3fef6af9388c8dea,
-0x3fef672658375d2f,
-0x3fef635beb6fcb75,
-0x3fef5f99f8138a1c,
-0x3fef5be084045cd4,
-0x3fef582f95281c6b,
-0x3fef54873168b9aa,
-0x3fef50e75eb44027,
-0x3fef4d5022fcd91d,
-0x3fef49c18438ce4d,
-0x3fef463b88628cd6,
-0x3fef42be3578a819,
-0x3fef3f49917ddc96,
-0x3fef3bdda27912d1,
-0x3fef387a6e756238,
-0x3fef351ffb82140a,
-0x3fef31ce4fb2a63f,
-0x3fef2e85711ece75,
-0x3fef2b4565e27cdd,
-0x3fef280e341ddf29,
-0x3fef24dfe1f56381,
-0x3fef21ba7591bb70,
-0x3fef1e9df51fdee1,
-0x3fef1b8a66d10f13,
-0x3fef187fd0dad990,
-0x3fef157e39771b2f,
-0x3fef1285a6e4030b,
-0x3fef0f961f641589,
-0x3fef0cafa93e2f56,
-0x3fef09d24abd886b,
-0x3fef06fe0a31b715,
-0x3fef0432edeeb2fd,
-0x3fef0170fc4cd831,
-0x3feefeb83ba8ea32,
-0x3feefc08b26416ff,
-0x3feef96266e3fa2d,
-0x3feef6c55f929ff1,
-0x3feef431a2de883b,
-0x3feef1a7373aa9cb,
-0x3feeef26231e754a,
-0x3feeecae6d05d866,
-0x3feeea401b7140ef,
-0x3feee7db34e59ff7,
-0x3feee57fbfec6cf4,
-0x3feee32dc313a8e5,
-0x3feee0e544ede173,
-0x3feedea64c123422,
-0x3feedc70df1c5175,
-0x3feeda4504ac801c,
-0x3feed822c367a024,
-0x3feed60a21f72e2a,
-0x3feed3fb2709468a,
-0x3feed1f5d950a897,
-0x3feecffa3f84b9d4,
-0x3feece086061892d,
-0x3feecc2042a7d232,
-0x3feeca41ed1d0057,
-0x3feec86d668b3237,
-0x3feec6a2b5c13cd0,
-0x3feec4e1e192aed2,
-0x3feec32af0d7d3de,
-0x3feec17dea6db7d7,
-0x3feebfdad5362a27,
-0x3feebe41b817c114,
-0x3feebcb299fddd0d,
-0x3feebb2d81d8abff,
-0x3feeb9b2769d2ca7,
-0x3feeb8417f4531ee,
-0x3feeb6daa2cf6642,
-0x3feeb57de83f4eef,
-0x3feeb42b569d4f82,
-0x3feeb2e2f4f6ad27,
-0x3feeb1a4ca5d920f,
-0x3feeb070dde910d2,
-0x3feeaf4736b527da,
-0x3feeae27dbe2c4cf,
-0x3feead12d497c7fd,
-0x3feeac0827ff07cc,
-0x3feeab07dd485429,
-0x3feeaa11fba87a03,
-0x3feea9268a5946b7,
-0x3feea84590998b93,
-0x3feea76f15ad2148,
-0x3feea6a320dceb71,
-0x3feea5e1b976dc09,
-0x3feea52ae6cdf6f4,
-0x3feea47eb03a5585,
-0x3feea3dd1d1929fd,
-0x3feea34634ccc320,
-0x3feea2b9febc8fb7,
-0x3feea23882552225,
-0x3feea1c1c70833f6,
-0x3feea155d44ca973,
-0x3feea0f4b19e9538,
-0x3feea09e667f3bcd,
-0x3feea052fa75173e,
-0x3feea012750bdabf,
-0x3fee9fdcddd47645,
-0x3fee9fb23c651a2f,
-0x3fee9f9298593ae5,
-0x3fee9f7df9519484,
-0x3fee9f7466f42e87,
-0x3fee9f75e8ec5f74,
-0x3fee9f8286ead08a,
-0x3fee9f9a48a58174,
-0x3fee9fbd35d7cbfd,
-0x3fee9feb564267c9,
-0x3feea024b1ab6e09,
-0x3feea0694fde5d3f,
-0x3feea0b938ac1cf6,
-0x3feea11473eb0187,
-0x3feea17b0976cfdb,
-0x3feea1ed0130c132,
-0x3feea26a62ff86f0,
-0x3feea2f336cf4e62,
-0x3feea3878491c491,
-0x3feea427543e1a12,
-0x3feea4d2add106d9,
-0x3feea589994cce13,
-0x3feea64c1eb941f7,
-0x3feea71a4623c7ad,
-0x3feea7f4179f5b21,
-0x3feea8d99b4492ed,
-0x3feea9cad931a436,
-0x3feeaac7d98a6699,
-0x3feeabd0a478580f,
-0x3feeace5422aa0db,
-0x3feeae05bad61778,
-0x3feeaf3216b5448c,
-0x3feeb06a5e0866d9,
-0x3feeb1ae99157736,
-0x3feeb2fed0282c8a,
-0x3feeb45b0b91ffc6,
-0x3feeb5c353aa2fe2,
-0x3feeb737b0cdc5e5,
-0x3feeb8b82b5f98e5,
-0x3feeba44cbc8520f,
-0x3feebbdd9a7670b3,
-0x3feebd829fde4e50,
-0x3feebf33e47a22a2,
-0x3feec0f170ca07ba,
-0x3feec2bb4d53fe0d,
-0x3feec49182a3f090,
-0x3feec674194bb8d5,
-0x3feec86319e32323,
-0x3feeca5e8d07f29e,
-0x3feecc667b5de565,
-0x3feece7aed8eb8bb,
-0x3feed09bec4a2d33,
-0x3feed2c980460ad8,
-0x3feed503b23e255d,
-0x3feed74a8af46052,
-0x3feed99e1330b358,
-0x3feedbfe53c12e59,
-0x3feede6b5579fdbf,
-0x3feee0e521356eba,
-0x3feee36bbfd3f37a,
-0x3feee5ff3a3c2774,
-0x3feee89f995ad3ad,
-0x3feeeb4ce622f2ff,
-0x3feeee07298db666,
-0x3feef0ce6c9a8952,
-0x3feef3a2b84f15fb,
-0x3feef68415b749b1,
-0x3feef9728de5593a,
-0x3feefc6e29f1c52a,
-0x3feeff76f2fb5e47,
-0x3fef028cf22749e4,
-0x3fef05b030a1064a,
-0x3fef08e0b79a6f1f,
-0x3fef0c1e904bc1d2,
-0x3fef0f69c3f3a207,
-0x3fef12c25bd71e09,
-0x3fef16286141b33d,
-0x3fef199bdd85529c,
-0x3fef1d1cd9fa652c,
-0x3fef20ab5fffd07a,
-0x3fef244778fafb22,
-0x3fef27f12e57d14b,
-0x3fef2ba88988c933,
-0x3fef2f6d9406e7b5,
-0x3fef33405751c4db,
-0x3fef3720dcef9069,
-0x3fef3b0f2e6d1675,
-0x3fef3f0b555dc3fa,
-0x3fef43155b5bab74,
-0x3fef472d4a07897c,
-0x3fef4b532b08c968,
-0x3fef4f87080d89f2,
-0x3fef53c8eacaa1d6,
-0x3fef5818dcfba487,
-0x3fef5c76e862e6d3,
-0x3fef60e316c98398,
-0x3fef655d71ff6075,
-0x3fef69e603db3285,
-0x3fef6e7cd63a8315,
-0x3fef7321f301b460,
-0x3fef77d5641c0658,
-0x3fef7c97337b9b5f,
-0x3fef81676b197d17,
-0x3fef864614f5a129,
-0x3fef8b333b16ee12,
-0x3fef902ee78b3ff6,
-0x3fef953924676d76,
-0x3fef9a51fbc74c83,
-0x3fef9f7977cdb740,
-0x3fefa4afa2a490da,
-0x3fefa9f4867cca6e,
-0x3fefaf482d8e67f1,
-0x3fefb4aaa2188510,
-0x3fefba1bee615a27,
-0x3fefbf9c1cb6412a,
-0x3fefc52b376bba97,
-0x3fefcac948dd7274,
-0x3fefd0765b6e4540,
-0x3fefd632798844f8,
-0x3fefdbfdad9cbe14,
-0x3fefe1d802243c89,
-0x3fefe7c1819e90d8,
-0x3fefedba3692d514,
-0x3feff3c22b8f71f1,
-0x3feff9d96b2a23d9,
-#endif
-};
-#endif
diff --git a/contrib/arm-optimized-routines/math/v_expf.c b/contrib/arm-optimized-routines/math/v_expf.c
deleted file mode 100644
index d403e00534f0..000000000000
--- a/contrib/arm-optimized-routines/math/v_expf.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Single-precision vector e^x function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* maxerr: 1.45358 +0.5 ulp. */
- 0x1.0e4020p-7f,
- 0x1.573e2ep-5f,
- 0x1.555e66p-3f,
- 0x1.fffdb6p-2f,
- 0x1.ffffecp-1f,
-};
-#define C0 v_f32 (Poly[0])
-#define C1 v_f32 (Poly[1])
-#define C2 v_f32 (Poly[2])
-#define C3 v_f32 (Poly[3])
-#define C4 v_f32 (Poly[4])
-
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define Ln2hi v_f32 (0x1.62e4p-1f)
-#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
-{
- /* 2^n may overflow, break it up into s1*s2. */
- v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
- v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
- v_f32_t s2 = v_as_f32_u32 (e - b);
- v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
- v_u32_t r2 = v_as_u32_f32 (s1 * s1);
- v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
- /* Similar to r1 but avoids double rounding in the subnormal range. */
- v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
- return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(expf) (v_f32_t x)
-{
- v_f32_t n, r, r2, scale, p, q, poly, absn, z;
- v_u32_t cmp, e;
-
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
-#if 1
- z = v_fma_f32 (x, InvLn2, Shift);
- n = z - Shift;
- r = v_fma_f32 (n, -Ln2hi, x);
- r = v_fma_f32 (n, -Ln2lo, r);
- e = v_as_u32_f32 (z) << 23;
-#else
- z = x * InvLn2;
- n = v_round_f32 (z);
- r = v_fma_f32 (n, -Ln2hi, x);
- r = v_fma_f32 (n, -Ln2lo, r);
- e = v_as_u32_s32 (v_round_s32 (z)) << 23;
-#endif
- scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
- absn = v_abs_f32 (n);
- cmp = v_cond_u32 (absn > v_f32 (126.0f));
- r2 = r * r;
- p = v_fma_f32 (C0, r, C1);
- q = v_fma_f32 (C2, r, C3);
- q = v_fma_f32 (p, r2, q);
- p = C4 * r;
- poly = v_fma_f32 (q, r2, p);
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (poly, n, e, absn, cmp, scale);
- return v_fma_f32 (poly, scale, scale);
-}
-VPCS_ALIAS
-#endif
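
A float illustration of the s1*s2 split in specialcase above: for
n = 128 the scale 2^n is not representable (the exponent field
saturates), yet 0.75 * 2^128 is a perfectly good float, so the small
factor is multiplied in before the second power of two:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float from_bits(uint32_t u) { float f; memcpy(&f, &u, sizeof f); return f; }

int main(void)
{
    float frac = 0.75f;                    /* stands in for 1 + poly */
    float s1 = from_bits(0x7f000000u);     /* 2^127 */
    float s2 = 2.0f;                       /* 2^(128-127) */
    printf("split:  %g\n", (double)((frac * s2) * s1));  /* finite */
    printf("direct: %g\n", (double)(frac * (s1 * s2)));  /* s1*s2 is inf */
    return 0;
}
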
diff --git a/contrib/arm-optimized-routines/math/v_expf_1u.c b/contrib/arm-optimized-routines/math/v_expf_1u.c
deleted file mode 100644
index 023bd248c9ac..000000000000
--- a/contrib/arm-optimized-routines/math/v_expf_1u.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Single-precision vector e^x function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* maxerr: 0.36565 +0.5 ulp. */
- 0x1.6a6000p-10f,
- 0x1.12718ep-7f,
- 0x1.555af0p-5f,
- 0x1.555430p-3f,
- 0x1.fffff4p-2f,
-};
-#define C0 v_f32 (Poly[0])
-#define C1 v_f32 (Poly[1])
-#define C2 v_f32 (Poly[2])
-#define C3 v_f32 (Poly[3])
-#define C4 v_f32 (Poly[4])
-
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define Ln2hi v_f32 (0x1.62e4p-1f)
-#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn)
-{
- /* 2^n may overflow, break it up into s1*s2. */
- v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
- v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
- v_f32_t s2 = v_as_f32_u32 (e - b);
- v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f));
- v_f32_t r1 = s1 * s1;
- v_f32_t r0 = poly * s1 * s2;
- return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0)));
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(expf_1u) (v_f32_t x)
-{
- v_f32_t n, r, scale, poly, absn, z;
- v_u32_t cmp, e;
-
- /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
-#if 1
- z = v_fma_f32 (x, InvLn2, Shift);
- n = z - Shift;
- r = v_fma_f32 (n, -Ln2hi, x);
- r = v_fma_f32 (n, -Ln2lo, r);
- e = v_as_u32_f32 (z) << 23;
-#else
- z = x * InvLn2;
- n = v_round_f32 (z);
- r = v_fma_f32 (n, -Ln2hi, x);
- r = v_fma_f32 (n, -Ln2lo, r);
- e = v_as_u32_s32 (v_round_s32 (z)) << 23;
-#endif
- scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
- absn = v_abs_f32 (n);
- cmp = v_cond_u32 (absn > v_f32 (126.0f));
- poly = v_fma_f32 (C0, r, C1);
- poly = v_fma_f32 (poly, r, C2);
- poly = v_fma_f32 (poly, r, C3);
- poly = v_fma_f32 (poly, r, C4);
- poly = v_fma_f32 (poly, r, v_f32 (1.0f));
- poly = v_fma_f32 (poly, r, v_f32 (1.0f));
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (poly, n, e, absn);
- return scale * poly;
-}
-#endif
diff --git a/contrib/arm-optimized-routines/math/v_log.c b/contrib/arm-optimized-routines/math/v_log.c
deleted file mode 100644
index d84c740d2b6b..000000000000
--- a/contrib/arm-optimized-routines/math/v_log.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Double-precision vector log(x) function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#include "v_log.h"
-#if V_SUPPORTED
-
-/* Worst-case error: 1.17 + 0.5 ulp. */
-
-static const f64_t Poly[] = {
- /* rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
- -0x1.ffffffffffff7p-2,
- 0x1.55555555170d4p-2,
- -0x1.0000000399c27p-2,
- 0x1.999b2e90e94cap-3,
- -0x1.554e550bd501ep-3,
-};
-
-#define A0 v_f64 (Poly[0])
-#define A1 v_f64 (Poly[1])
-#define A2 v_f64 (Poly[2])
-#define A3 v_f64 (Poly[3])
-#define A4 v_f64 (Poly[4])
-#define Ln2 v_f64 (0x1.62e42fefa39efp-1)
-#define N (1 << V_LOG_TABLE_BITS)
-#define OFF v_u64 (0x3fe6900900000000)
-
-struct entry
-{
- v_f64_t invc;
- v_f64_t logc;
-};
-
-static inline struct entry
-lookup (v_u64_t i)
-{
- struct entry e;
-#ifdef SCALAR
- e.invc = __v_log_data[i].invc;
- e.logc = __v_log_data[i].logc;
-#else
- e.invc[0] = __v_log_data[i[0]].invc;
- e.logc[0] = __v_log_data[i[0]].logc;
- e.invc[1] = __v_log_data[i[1]].invc;
- e.logc[1] = __v_log_data[i[1]].logc;
-#endif
- return e;
-}
-
-VPCS_ATTR
-__attribute__ ((noinline)) static v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
-{
- return v_call_f64 (log, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f64_t
-V_NAME(log) (v_f64_t x)
-{
- v_f64_t z, r, r2, p, y, kd, hi;
- v_u64_t ix, iz, tmp, top, i, cmp;
- v_s64_t k;
- struct entry e;
-
- ix = v_as_u64_f64 (x);
- top = ix >> 48;
- cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010));
-
- /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
- The range is split into N subintervals.
- The ith subinterval contains z and c is near its center. */
- tmp = ix - OFF;
- i = (tmp >> (52 - V_LOG_TABLE_BITS)) % N;
- k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift */
- iz = ix - (tmp & v_u64 (0xfffULL << 52));
- z = v_as_f64_u64 (iz);
- e = lookup (i);
-
- /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
- r = v_fma_f64 (z, e.invc, v_f64 (-1.0));
- kd = v_to_f64_s64 (k);
-
- /* hi = r + log(c) + k*Ln2. */
- hi = v_fma_f64 (kd, Ln2, e.logc + r);
- /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- r2 = r * r;
- y = v_fma_f64 (A3, r, A2);
- p = v_fma_f64 (A1, r, A0);
- y = v_fma_f64 (A4, r2, y);
- y = v_fma_f64 (y, r2, p);
- y = v_fma_f64 (y, r2, hi);
-
- if (unlikely (v_any_u64 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-#endif
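
A scalar sketch of the decomposition above: subtracting OFF re-biases
the bit pattern so that the top bits of tmp give k, the next 7 bits
give the table index, and z = x / 2^k lands near [0.70, 1.41):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const uint64_t OFF = 0x3fe6900900000000ULL;
    double x = 10.0;
    uint64_t ix; memcpy(&ix, &x, sizeof ix);
    uint64_t tmp = ix - OFF;
    uint64_t i = (tmp >> (52 - 7)) % 128;     /* table index */
    int64_t k = (int64_t)tmp >> 52;           /* arithmetic shift */
    uint64_t iz = ix - (tmp & (0xfffULL << 52));
    double z; memcpy(&z, &iz, sizeof z);
    printf("k=%lld  i=%llu  z=%.17g  z*2^k=%.17g\n",
           (long long)k, (unsigned long long)i, z, ldexp(z, (int)k));
    return 0;
}
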
diff --git a/contrib/arm-optimized-routines/math/v_log.h b/contrib/arm-optimized-routines/math/v_log.h
deleted file mode 100644
index bcc2fa6fa930..000000000000
--- a/contrib/arm-optimized-routines/math/v_log.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Declarations for double-precision log(x) vector function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "v_math.h"
-#if WANT_VMATH
-
-#define V_LOG_TABLE_BITS 7
-
-extern const struct v_log_data
-{
- f64_t invc;
- f64_t logc;
-} __v_log_data[1 << V_LOG_TABLE_BITS] HIDDEN;
-#endif
diff --git a/contrib/arm-optimized-routines/math/v_log_data.c b/contrib/arm-optimized-routines/math/v_log_data.c
deleted file mode 100644
index 97ee5b09c6a9..000000000000
--- a/contrib/arm-optimized-routines/math/v_log_data.c
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Lookup table for double-precision log(x) vector function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "v_log.h"
-#if WANT_VMATH
-
-#define N (1 << V_LOG_TABLE_BITS)
-
-/* Algorithm:
-
- x = 2^k z
- log(x) = k ln2 + log(c) + poly(z/c - 1)
-
-where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128)
-and log(c) and 1/c for the ith subinterval comes from a lookup table:
-
- tab[i].invc = 1/c
- tab[i].logc = (double)log(c)
-
-where c is near the center of the subinterval and is chosen by trying several
-floating point invc candidates around 1/center and selecting one for which
-the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval
-that contains 1 and the previous one got tweaked to avoid cancellation. */
-const struct v_log_data __v_log_data[N] = {
-{0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2},
-{0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2},
-{0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2},
-{0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2},
-{0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2},
-{0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2},
-{0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2},
-{0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2},
-{0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2},
-{0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2},
-{0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2},
-{0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2},
-{0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2},
-{0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2},
-{0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2},
-{0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2},
-{0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2},
-{0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2},
-{0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2},
-{0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3},
-{0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3},
-{0x1.446f12b278001p+0, -0x1.e52e160484698p-3},
-{0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3},
-{0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3},
-{0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3},
-{0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3},
-{0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3},
-{0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3},
-{0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3},
-{0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3},
-{0x1.36987540fbf53p+0, -0x1.8be843d796044p-3},
-{0x1.352166b648f61p+0, -0x1.82395ecc477edp-3},
-{0x1.33adddb3eb575p+0, -0x1.7896240966422p-3},
-{0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3},
-{0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3},
-{0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3},
-{0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3},
-{0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3},
-{0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3},
-{0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3},
-{0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3},
-{0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3},
-{0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3},
-{0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3},
-{0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3},
-{0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4},
-{0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4},
-{0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4},
-{0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4},
-{0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4},
-{0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4},
-{0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4},
-{0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4},
-{0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4},
-{0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4},
-{0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4},
-{0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4},
-{0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4},
-{0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4},
-{0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4},
-{0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5},
-{0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5},
-{0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5},
-{0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5},
-{0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5},
-{0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5},
-{0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5},
-{0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5},
-{0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6},
-{0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6},
-{0x1.05193497a7cc5p+0, -0x1.43183683400acp-6},
-{0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6},
-{0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7},
-{0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7},
-{0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9},
-{1.0, 0.0},
-{0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8},
-{0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7},
-{0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6},
-{0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6},
-{0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5},
-{0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5},
-{0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5},
-{0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5},
-{0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4},
-{0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4},
-{0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4},
-{0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4},
-{0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4},
-{0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4},
-{0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4},
-{0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4},
-{0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4},
-{0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3},
-{0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3},
-{0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3},
-{0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3},
-{0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3},
-{0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3},
-{0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3},
-{0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3},
-{0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3},
-{0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3},
-{0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3},
-{0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3},
-{0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3},
-{0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3},
-{0x1.9998e1480b618p-1, 0x1.c903161240163p-3},
-{0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3},
-{0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3},
-{0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3},
-{0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3},
-{0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2},
-{0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2},
-{0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2},
-{0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2},
-{0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2},
-{0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2},
-{0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2},
-{0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2},
-{0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2},
-{0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2},
-{0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2},
-{0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2},
-{0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2},
-{0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2},
-{0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2},
-{0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2},
-};
-#endif
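
Since invc ~= 1/c and logc = (double) log (c), each entry should satisfy logc ~= -log (invc) to within a few ulps of libm's log. A quick consistency check of the first entry, as a hypothetical stand-alone program (not part of the deleted file):

    /* Check one table entry: logc + log (invc) should be tiny. */
    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double invc = 0x1.6a133d0dec120p+0;  /* __v_log_data[0].invc */
      double logc = -0x1.62fe995eb963ap-2; /* __v_log_data[0].logc */
      printf ("logc + log(invc) = %a\n", logc + log (invc));
      return 0;
    }
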
diff --git a/contrib/arm-optimized-routines/math/v_logf.c b/contrib/arm-optimized-routines/math/v_logf.c
deleted file mode 100644
index 7373192f03fa..000000000000
--- a/contrib/arm-optimized-routines/math/v_logf.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Single-precision vector log function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* 3.34 ulp error */
- -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f,
- -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f,
-};
-#define P7 v_f32 (Poly[0])
-#define P6 v_f32 (Poly[1])
-#define P5 v_f32 (Poly[2])
-#define P4 v_f32 (Poly[3])
-#define P3 v_f32 (Poly[4])
-#define P2 v_f32 (Poly[5])
-#define P1 v_f32 (Poly[6])
-
-#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */
-#define Min v_u32 (0x00800000)
-#define Max v_u32 (0x7f800000)
-#define Mask v_u32 (0x007fffff)
-#define Off v_u32 (0x3f2aaaab) /* 0.666667 */
-
-VPCS_ATTR
-__attribute__ ((noinline)) static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
-{
- /* Fall back to scalar code. */
- return v_call_f32 (logf, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(logf) (v_f32_t x)
-{
- v_f32_t n, p, q, r, r2, y;
- v_u32_t u, cmp;
-
- u = v_as_u32_f32 (x);
- cmp = v_cond_u32 (u - Min >= Max - Min);
-
- /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3 */
- u -= Off;
- n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend */
- u &= Mask;
- u += Off;
- r = v_as_f32_u32 (u) - v_f32 (1.0f);
-
- /* y = log(1+r) + n*ln2. */
- r2 = r * r;
- /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
- p = v_fma_f32 (P6, r, P5);
- q = v_fma_f32 (P4, r, P3);
- y = v_fma_f32 (P2, r, P1);
- p = v_fma_f32 (P7, r2, p);
- q = v_fma_f32 (p, r2, q);
- y = v_fma_f32 (q, r2, y);
- p = v_fma_f32 (Ln2, n, r);
- y = v_fma_f32 (y, r2, p);
-
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-#endif
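
The evaluation above is ordered Estrin-style: the pairs P5..P7, P3..P4 and P1..P2 are combined through independent fmas on r and r2 so the latency chain is shorter than plain Horner, but the polynomial itself is unchanged. A scalar Horner sketch of the identical polynomial, hypothetical and for comparison only:

    /* Same degree-6 polynomial in r, evaluated by plain Horner:
       result = n*ln2 + r + r2*(P1 + P2*r + ... + P7*r^6). */
    #include <math.h>
    #include <stdio.h>

    static const float Poly[] = {  /* P7 ... P1, as above */
      -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f,
      -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f,
    };

    static float
    logf_poly (float r, float n)
    {
      float r2 = r * r;
      float p = Poly[0];
      for (int i = 1; i < 7; i++)
        p = fmaf (p, r, Poly[i]);                        /* Horner on r */
      return fmaf (p, r2, fmaf (0x1.62e43p-1f, n, r));   /* n*ln2 + r + r2*p */
    }

    int
    main (void)
    {
      /* r = 0.1f, n = 0 approximates logf (1.1f). */
      printf ("%a vs %a\n", logf_poly (0.1f, 0.0f), logf (1.1f));
      return 0;
    }
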
diff --git a/contrib/arm-optimized-routines/math/v_math.h b/contrib/arm-optimized-routines/math/v_math.h
deleted file mode 100644
index f2cc4670bb9b..000000000000
--- a/contrib/arm-optimized-routines/math/v_math.h
+++ /dev/null
@@ -1,641 +0,0 @@
-/*
- * Vector math abstractions.
- *
- * Copyright (c) 2019-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#ifndef _V_MATH_H
-#define _V_MATH_H
-
-#ifndef WANT_VMATH
-/* Enable the build of vector math code. */
-# define WANT_VMATH 1
-#endif
-#if WANT_VMATH
-
-/* The goal of this header is to allow vector and scalar
-   builds of the same algorithm. The provided intrinsic
-   wrappers are also vector-length agnostic, so they can
-   be implemented for SVE too (or other SIMD architectures),
-   and then the code should work on those targets too. */
-
-#if SCALAR
-#define V_NAME(x) __s_##x
-#elif VPCS && __aarch64__
-#define V_NAME(x) __vn_##x
-#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
-#else
-#define V_NAME(x) __v_##x
-#endif
-
-#ifndef VPCS_ATTR
-#define VPCS_ATTR
-#endif
-#ifndef VPCS_ALIAS
-#define VPCS_ALIAS
-#endif
-
-#include <stdint.h>
-#include "math_config.h"
-
-typedef float f32_t;
-typedef uint32_t u32_t;
-typedef int32_t s32_t;
-typedef double f64_t;
-typedef uint64_t u64_t;
-typedef int64_t s64_t;
-
-/* reinterpret as type1 from type2. */
-static inline u32_t
-as_u32_f32 (f32_t x)
-{
- union { f32_t f; u32_t u; } r = {x};
- return r.u;
-}
-static inline f32_t
-as_f32_u32 (u32_t x)
-{
- union { u32_t u; f32_t f; } r = {x};
- return r.f;
-}
-static inline s32_t
-as_s32_u32 (u32_t x)
-{
- union { u32_t u; s32_t i; } r = {x};
- return r.i;
-}
-static inline u32_t
-as_u32_s32 (s32_t x)
-{
- union { s32_t i; u32_t u; } r = {x};
- return r.u;
-}
-static inline u64_t
-as_u64_f64 (f64_t x)
-{
- union { f64_t f; u64_t u; } r = {x};
- return r.u;
-}
-static inline f64_t
-as_f64_u64 (u64_t x)
-{
- union { u64_t u; f64_t f; } r = {x};
- return r.f;
-}
-static inline s64_t
-as_s64_u64 (u64_t x)
-{
- union { u64_t u; s64_t i; } r = {x};
- return r.i;
-}
-static inline u64_t
-as_u64_s64 (s64_t x)
-{
- union { s64_t i; u64_t u; } r = {x};
- return r.u;
-}
-
-#if SCALAR
-#define V_SUPPORTED 1
-typedef f32_t v_f32_t;
-typedef u32_t v_u32_t;
-typedef s32_t v_s32_t;
-typedef f64_t v_f64_t;
-typedef u64_t v_u64_t;
-typedef s64_t v_s64_t;
-
-static inline int
-v_lanes32 (void)
-{
- return 1;
-}
-
-static inline v_f32_t
-v_f32 (f32_t x)
-{
- return x;
-}
-static inline v_u32_t
-v_u32 (u32_t x)
-{
- return x;
-}
-static inline v_s32_t
-v_s32 (s32_t x)
-{
- return x;
-}
-
-static inline f32_t
-v_get_f32 (v_f32_t x, int i)
-{
- return x;
-}
-static inline u32_t
-v_get_u32 (v_u32_t x, int i)
-{
- return x;
-}
-static inline s32_t
-v_get_s32 (v_s32_t x, int i)
-{
- return x;
-}
-
-static inline void
-v_set_f32 (v_f32_t *x, int i, f32_t v)
-{
- *x = v;
-}
-static inline void
-v_set_u32 (v_u32_t *x, int i, u32_t v)
-{
- *x = v;
-}
-static inline void
-v_set_s32 (v_s32_t *x, int i, s32_t v)
-{
- *x = v;
-}
-
-/* true if any element of a v_cond result is non-zero. */
-static inline int
-v_any_u32 (v_u32_t x)
-{
- return x != 0;
-}
-/* to wrap the result of relational operators. */
-static inline v_u32_t
-v_cond_u32 (v_u32_t x)
-{
- return x ? -1 : 0;
-}
-static inline v_f32_t
-v_abs_f32 (v_f32_t x)
-{
- return __builtin_fabsf (x);
-}
-static inline v_f32_t
-v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
-{
- return __builtin_fmaf (x, y, z);
-}
-static inline v_f32_t
-v_round_f32 (v_f32_t x)
-{
- return __builtin_roundf (x);
-}
-static inline v_s32_t
-v_round_s32 (v_f32_t x)
-{
- return __builtin_lroundf (x); /* relies on -fno-math-errno. */
-}
-/* convert to type1 from type2. */
-static inline v_f32_t
-v_to_f32_s32 (v_s32_t x)
-{
- return x;
-}
-static inline v_f32_t
-v_to_f32_u32 (v_u32_t x)
-{
- return x;
-}
-/* reinterpret as type1 from type2. */
-static inline v_u32_t
-v_as_u32_f32 (v_f32_t x)
-{
- union { v_f32_t f; v_u32_t u; } r = {x};
- return r.u;
-}
-static inline v_f32_t
-v_as_f32_u32 (v_u32_t x)
-{
- union { v_u32_t u; v_f32_t f; } r = {x};
- return r.f;
-}
-static inline v_s32_t
-v_as_s32_u32 (v_u32_t x)
-{
- union { v_u32_t u; v_s32_t i; } r = {x};
- return r.i;
-}
-static inline v_u32_t
-v_as_u32_s32 (v_s32_t x)
-{
- union { v_s32_t i; v_u32_t u; } r = {x};
- return r.u;
-}
-static inline v_f32_t
-v_lookup_f32 (const f32_t *tab, v_u32_t idx)
-{
- return tab[idx];
-}
-static inline v_u32_t
-v_lookup_u32 (const u32_t *tab, v_u32_t idx)
-{
- return tab[idx];
-}
-static inline v_f32_t
-v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
-{
- return f (x);
-}
-static inline v_f32_t
-v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
- v_u32_t p)
-{
- return f (x1, x2);
-}
-
-static inline int
-v_lanes64 (void)
-{
- return 1;
-}
-static inline v_f64_t
-v_f64 (f64_t x)
-{
- return x;
-}
-static inline v_u64_t
-v_u64 (u64_t x)
-{
- return x;
-}
-static inline v_s64_t
-v_s64 (s64_t x)
-{
- return x;
-}
-static inline f64_t
-v_get_f64 (v_f64_t x, int i)
-{
- return x;
-}
-static inline void
-v_set_f64 (v_f64_t *x, int i, f64_t v)
-{
- *x = v;
-}
-/* true if any element of a v_cond result is non-zero. */
-static inline int
-v_any_u64 (v_u64_t x)
-{
- return x != 0;
-}
-/* to wrap the result of relational operators. */
-static inline v_u64_t
-v_cond_u64 (v_u64_t x)
-{
- return x ? -1 : 0;
-}
-static inline v_f64_t
-v_abs_f64 (v_f64_t x)
-{
- return __builtin_fabs (x);
-}
-static inline v_f64_t
-v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
-{
- return __builtin_fma (x, y, z);
-}
-static inline v_f64_t
-v_round_f64 (v_f64_t x)
-{
- return __builtin_round (x);
-}
-static inline v_s64_t
-v_round_s64 (v_f64_t x)
-{
- return __builtin_lround (x); /* relies on -fno-math-errno. */
-}
-/* convert to type1 from type2. */
-static inline v_f64_t
-v_to_f64_s64 (v_s64_t x)
-{
- return x;
-}
-static inline v_f64_t
-v_to_f64_u64 (v_u64_t x)
-{
- return x;
-}
-/* reinterpret as type1 from type2. */
-static inline v_u64_t
-v_as_u64_f64 (v_f64_t x)
-{
- union { v_f64_t f; v_u64_t u; } r = {x};
- return r.u;
-}
-static inline v_f64_t
-v_as_f64_u64 (v_u64_t x)
-{
- union { v_u64_t u; v_f64_t f; } r = {x};
- return r.f;
-}
-static inline v_s64_t
-v_as_s64_u64 (v_u64_t x)
-{
- union { v_u64_t u; v_s64_t i; } r = {x};
- return r.i;
-}
-static inline v_u64_t
-v_as_u64_s64 (v_s64_t x)
-{
- union { v_s64_t i; v_u64_t u; } r = {x};
- return r.u;
-}
-static inline v_f64_t
-v_lookup_f64 (const f64_t *tab, v_u64_t idx)
-{
- return tab[idx];
-}
-static inline v_u64_t
-v_lookup_u64 (const u64_t *tab, v_u64_t idx)
-{
- return tab[idx];
-}
-static inline v_f64_t
-v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
-{
- return f (x);
-}
-
-#elif __aarch64__
-#define V_SUPPORTED 1
-#include <arm_neon.h>
-typedef float32x4_t v_f32_t;
-typedef uint32x4_t v_u32_t;
-typedef int32x4_t v_s32_t;
-typedef float64x2_t v_f64_t;
-typedef uint64x2_t v_u64_t;
-typedef int64x2_t v_s64_t;
-
-static inline int
-v_lanes32 (void)
-{
- return 4;
-}
-
-static inline v_f32_t
-v_f32 (f32_t x)
-{
- return (v_f32_t){x, x, x, x};
-}
-static inline v_u32_t
-v_u32 (u32_t x)
-{
- return (v_u32_t){x, x, x, x};
-}
-static inline v_s32_t
-v_s32 (s32_t x)
-{
- return (v_s32_t){x, x, x, x};
-}
-
-static inline f32_t
-v_get_f32 (v_f32_t x, int i)
-{
- return x[i];
-}
-static inline u32_t
-v_get_u32 (v_u32_t x, int i)
-{
- return x[i];
-}
-static inline s32_t
-v_get_s32 (v_s32_t x, int i)
-{
- return x[i];
-}
-
-static inline void
-v_set_f32 (v_f32_t *x, int i, f32_t v)
-{
- (*x)[i] = v;
-}
-static inline void
-v_set_u32 (v_u32_t *x, int i, u32_t v)
-{
- (*x)[i] = v;
-}
-static inline void
-v_set_s32 (v_s32_t *x, int i, s32_t v)
-{
- (*x)[i] = v;
-}
-
-/* true if any element of a v_cond result is non-zero. */
-static inline int
-v_any_u32 (v_u32_t x)
-{
- /* assume elements in x are either 0 or -1u. */
- return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
-}
-/* to wrap the result of relational operators. */
-static inline v_u32_t
-v_cond_u32 (v_u32_t x)
-{
- return x;
-}
-static inline v_f32_t
-v_abs_f32 (v_f32_t x)
-{
- return vabsq_f32 (x);
-}
-static inline v_f32_t
-v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
-{
- return vfmaq_f32 (z, x, y);
-}
-static inline v_f32_t
-v_round_f32 (v_f32_t x)
-{
- return vrndaq_f32 (x);
-}
-static inline v_s32_t
-v_round_s32 (v_f32_t x)
-{
- return vcvtaq_s32_f32 (x);
-}
-/* convert to type1 from type2. */
-static inline v_f32_t
-v_to_f32_s32 (v_s32_t x)
-{
- return (v_f32_t){x[0], x[1], x[2], x[3]};
-}
-static inline v_f32_t
-v_to_f32_u32 (v_u32_t x)
-{
- return (v_f32_t){x[0], x[1], x[2], x[3]};
-}
-/* reinterpret as type1 from type2. */
-static inline v_u32_t
-v_as_u32_f32 (v_f32_t x)
-{
- union { v_f32_t f; v_u32_t u; } r = {x};
- return r.u;
-}
-static inline v_f32_t
-v_as_f32_u32 (v_u32_t x)
-{
- union { v_u32_t u; v_f32_t f; } r = {x};
- return r.f;
-}
-static inline v_s32_t
-v_as_s32_u32 (v_u32_t x)
-{
- union { v_u32_t u; v_s32_t i; } r = {x};
- return r.i;
-}
-static inline v_u32_t
-v_as_u32_s32 (v_s32_t x)
-{
- union { v_s32_t i; v_u32_t u; } r = {x};
- return r.u;
-}
-static inline v_f32_t
-v_lookup_f32 (const f32_t *tab, v_u32_t idx)
-{
- return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
-}
-static inline v_u32_t
-v_lookup_u32 (const u32_t *tab, v_u32_t idx)
-{
- return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
-}
-static inline v_f32_t
-v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
-{
- return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
- p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
-}
-static inline v_f32_t
-v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
- v_u32_t p)
-{
- return (
- v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1],
- p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
-}
-
-static inline int
-v_lanes64 (void)
-{
- return 2;
-}
-static inline v_f64_t
-v_f64 (f64_t x)
-{
- return (v_f64_t){x, x};
-}
-static inline v_u64_t
-v_u64 (u64_t x)
-{
- return (v_u64_t){x, x};
-}
-static inline v_s64_t
-v_s64 (s64_t x)
-{
- return (v_s64_t){x, x};
-}
-static inline f64_t
-v_get_f64 (v_f64_t x, int i)
-{
- return x[i];
-}
-static inline void
-v_set_f64 (v_f64_t *x, int i, f64_t v)
-{
- (*x)[i] = v;
-}
-/* true if any element of a v_cond result is non-zero. */
-static inline int
-v_any_u64 (v_u64_t x)
-{
- /* assume elements in x are either 0 or -1u. */
- return vpaddd_u64 (x) != 0;
-}
-/* to wrap the result of relational operators. */
-static inline v_u64_t
-v_cond_u64 (v_u64_t x)
-{
- return x;
-}
-static inline v_f64_t
-v_abs_f64 (v_f64_t x)
-{
- return vabsq_f64 (x);
-}
-static inline v_f64_t
-v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
-{
- return vfmaq_f64 (z, x, y);
-}
-static inline v_f64_t
-v_round_f64 (v_f64_t x)
-{
- return vrndaq_f64 (x);
-}
-static inline v_s64_t
-v_round_s64 (v_f64_t x)
-{
- return vcvtaq_s64_f64 (x);
-}
-/* convert to type1 from type2. */
-static inline v_f64_t
-v_to_f64_s64 (v_s64_t x)
-{
- return (v_f64_t){x[0], x[1]};
-}
-static inline v_f64_t
-v_to_f64_u64 (v_u64_t x)
-{
- return (v_f64_t){x[0], x[1]};
-}
-/* reinterpret as type1 from type2. */
-static inline v_u64_t
-v_as_u64_f64 (v_f64_t x)
-{
- union { v_f64_t f; v_u64_t u; } r = {x};
- return r.u;
-}
-static inline v_f64_t
-v_as_f64_u64 (v_u64_t x)
-{
- union { v_u64_t u; v_f64_t f; } r = {x};
- return r.f;
-}
-static inline v_s64_t
-v_as_s64_u64 (v_u64_t x)
-{
- union { v_u64_t u; v_s64_t i; } r = {x};
- return r.i;
-}
-static inline v_u64_t
-v_as_u64_s64 (v_s64_t x)
-{
- union { v_s64_t i; v_u64_t u; } r = {x};
- return r.u;
-}
-static inline v_f64_t
-v_lookup_f64 (const f64_t *tab, v_u64_t idx)
-{
- return (v_f64_t){tab[idx[0]], tab[idx[1]]};
-}
-static inline v_u64_t
-v_lookup_u64 (const u64_t *tab, v_u64_t idx)
-{
- return (v_u64_t){tab[idx[0]], tab[idx[1]]};
-}
-static inline v_f64_t
-v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
-{
- return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
-}
-#endif
-
-#endif
-#endif
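
The net effect of the macros above is that one algorithm source compiles three ways: __s_* scalar, __v_* plain AdvSIMD, and __vn_* with the vector PCS (see the vn_*.c wrappers later in this diff). A reduced sketch of the pattern, with a hypothetical function name:

    /* Hypothetical sketch of the V_NAME dispatch in v_math.h: the
       same body becomes __s_twice (scalar, -DSCALAR=1) or
       __v_twice (AdvSIMD) depending on how it is compiled. */
    #if SCALAR
    # define V_NAME(x) __s_##x
    typedef double v_f64_t;        /* one lane */
    #elif __aarch64__
    # define V_NAME(x) __v_##x
    # include <arm_neon.h>
    typedef float64x2_t v_f64_t;   /* two lanes */
    #endif

    v_f64_t
    V_NAME (twice) (v_f64_t x)
    {
      return x + x;  /* identical source for both builds */
    }
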
diff --git a/contrib/arm-optimized-routines/math/v_pow.c b/contrib/arm-optimized-routines/math/v_pow.c
deleted file mode 100644
index a209d57f41ce..000000000000
--- a/contrib/arm-optimized-routines/math/v_pow.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Double-precision vector pow function.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-VPCS_ATTR
-v_f64_t
-V_NAME(pow) (v_f64_t x, v_f64_t y)
-{
- v_f64_t z;
- for (int lane = 0; lane < v_lanes64 (); lane++)
- {
- f64_t sx = v_get_f64 (x, lane);
- f64_t sy = v_get_f64 (y, lane);
- f64_t sz = pow (sx, sy);
- v_set_f64 (&z, lane, sz);
- }
- return z;
-}
-VPCS_ALIAS
-#endif
diff --git a/contrib/arm-optimized-routines/math/v_powf.c b/contrib/arm-optimized-routines/math/v_powf.c
deleted file mode 100644
index fb80fa6f1846..000000000000
--- a/contrib/arm-optimized-routines/math/v_powf.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * Single-precision vector powf function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-#define Min v_u32 (0x00800000)
-#define Max v_u32 (0x7f800000)
-#define SBITS 5
-#define Tlog v__powf_log2_data.tab
-#define Texp v__exp2f_data.tab
-#define A v__powf_log2_data.poly
-#define C v__exp2f_data.poly
-#define LOGDEG 4
-
-#if LOGDEG == 5
-/* 1.01 ulp */
-#define OFF v_u32 (0x3f330000)
-#define TBITS 4
-#elif LOGDEG == 4
-/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */
-#define OFF v_u32 (0x3f35d000)
-#define TBITS 5
-#endif
-
-#define V_EXP2F_TABLE_BITS SBITS
-#define V_EXP2F_POLY_ORDER 3
-struct v_exp2f_data
-{
- uint64_t tab[1 << V_EXP2F_TABLE_BITS];
- double poly[V_EXP2F_POLY_ORDER];
-};
-
-#define V_POWF_LOG2_TABLE_BITS TBITS
-#define V_POWF_LOG2_POLY_ORDER LOGDEG
-#define SCALE ((double) (1 << SBITS))
-struct v_powf_log2_data
-{
- struct
- {
- double invc, logc;
- } tab[1 << V_POWF_LOG2_TABLE_BITS];
- double poly[V_POWF_LOG2_POLY_ORDER];
-};
-
-static const struct v_powf_log2_data v__powf_log2_data = {
-#if LOGDEG == 5
- .tab = {
-{ 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE },
-{ 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE },
-{ 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE },
-{ 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE },
-{ 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE },
-{ 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE },
-{ 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE },
-{ 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE },
-{ 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE },
-{ 0x1p+0, 0x0p+0 * SCALE },
-{ 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE },
-{ 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE },
-{ 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE },
-{ 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE },
-{ 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE },
-{ 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE },
- },
-/* rel err: 1.46 * 2^-32 */
- .poly = {
-0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE,
-0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE,
-0x1.71547652ab82bp0 * SCALE,
- }
-#elif LOGDEG == 4
- .tab = {
-{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE},
-{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE},
-{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE},
-{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE},
-{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE},
-{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE},
-{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE},
-{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE},
-{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE},
-{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE},
-{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE},
-{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE},
-{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE},
-{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE},
-{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE},
-{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE},
-{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE},
-{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE},
-{0x1p+0, 0x0p+0 * SCALE},
-{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE},
-{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE},
-{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE},
-{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE},
-{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE},
-{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE},
-{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE},
-{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE},
-{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE},
-{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE},
-{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE},
-{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE},
-{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE},
- },
-/* rel err: 1.5 * 2^-30 */
- .poly = {
- -0x1.6ff5daa3b3d7cp-2 * SCALE,
- 0x1.ec81d03c01aebp-2 * SCALE,
- -0x1.71547bb43f101p-1 * SCALE,
- 0x1.7154764a815cbp0 * SCALE,
- }
-#endif
-};
-
-static const struct v_exp2f_data v__exp2f_data = {
- .tab = {
-0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
-0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
-0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
-0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
-0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
-0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
-0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
-0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
- },
-/* rel err: 1.69 * 2^-34 */
- .poly = {
-0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE
- },
-};
-
-VPCS_ATTR
-__attribute__ ((noinline)) static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp)
-{
- return v_call2_f32 (powf, x, y, ret, cmp);
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(powf) (v_f32_t x, v_f32_t y)
-{
- v_u32_t u, tmp, cmp, i, top, iz;
- v_s32_t k;
- v_f32_t ret;
-
- u = v_as_u32_f32 (x);
- cmp = v_cond_u32 (u - Min >= Max - Min);
- tmp = u - OFF;
- i = (tmp >> (23 - TBITS)) % (1 << TBITS);
- top = tmp & 0xff800000;
- iz = u - top;
- k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */
-
- for (int lane = 0; lane < v_lanes32 (); lane++)
- {
- uint32_t si, siz;
- int32_t sk;
- float sy;
-
- /* Use double precision for each lane. */
- double invc, logc, z, r, p, y0, logx, ylogx, kd, s;
- uint64_t ki, t;
-
- si = v_get_u32 (i, lane);
- siz = v_get_u32 (iz, lane);
- sk = v_get_s32 (k, lane);
- sy = v_get_f32 (y, lane);
-
- invc = Tlog[si].invc;
- logc = Tlog[si].logc;
- z = (double) as_f32_u32 (siz);
-
- /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */
- r = __builtin_fma (z, invc, -1.0);
- y0 = logc + (double) sk;
-
- /* Polynomial to approximate log1p(r)/ln2. */
-#if LOGDEG == 5
- logx = A[0];
- logx = r * logx + A[1];
- logx = r * logx + A[2];
- logx = r * logx + A[3];
- logx = r * logx + A[4];
- logx = r * logx + y0;
-#elif LOGDEG == 4
- logx = A[0];
- logx = r * logx + A[1];
- logx = r * logx + A[2];
- logx = r * logx + A[3];
- logx = r * logx + y0;
-#endif
- ylogx = sy * logx;
- v_set_u32 (&cmp, lane,
- (as_u64_f64 (ylogx) >> 47 & 0xffff)
- >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47
- ? 1
- : v_get_u32 (cmp, lane));
-
- /* N*x = k + r with r in [-1/2, 1/2] */
-#if TOINT_INTRINSICS
- kd = roundtoint (ylogx); /* k */
- ki = converttoint (ylogx);
-#else
-# define SHIFT 0x1.8p52
- kd = eval_as_double (ylogx + SHIFT);
- ki = asuint64 (kd);
- kd -= SHIFT;
-#endif
- r = ylogx - kd;
-
- /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
- t = Texp[ki % (1 << SBITS)];
- t += ki << (52 - SBITS);
- s = as_f64_u64 (t);
- p = C[0];
- p = __builtin_fma (p, r, C[1]);
- p = __builtin_fma (p, r, C[2]);
- p = __builtin_fma (p, s * r, s);
-
- v_set_f32 (&ret, lane, p);
- }
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (x, y, ret, cmp);
- return ret;
-}
-VPCS_ALIAS
-#endif
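
Besides the table-driven log2/exp2 split, the non-TOINT path above relies on the classic SHIFT = 0x1.8p52 trick: adding 1.5*2^52 to a value of modest magnitude forces rounding at the units place, so kd - SHIFT equals rint(ylogx) and the low bits of kd's representation already hold the integer used for the table index. A stand-alone sketch, assuming round-to-nearest and that the add is not optimized away (which the original guards with eval_as_double):

    /* Sketch of the SHIFT = 0x1.8p52 rint trick used in v_powf.c. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      double v = 2.7;
      volatile double kd = v + 0x1.8p52;  /* volatile: keep the rounding */
      double t = kd;                      /* re-read after rounding */
      uint64_t ki;
      memcpy (&ki, &t, sizeof ki);
      printf ("rint(%g) = %g, low bits = %llu\n",
              v, t - 0x1.8p52, (unsigned long long) (ki & 0xffff));
      return 0;
    }
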
diff --git a/contrib/arm-optimized-routines/math/v_sin.c b/contrib/arm-optimized-routines/math/v_sin.c
deleted file mode 100644
index 2b9ed059189c..000000000000
--- a/contrib/arm-optimized-routines/math/v_sin.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Double-precision vector sin function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const double Poly[] = {
-/* worst-case error is 3.5 ulp.
- abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */
--0x1.9f4a9c8b21dc9p-41,
- 0x1.60e88a10163f2p-33,
--0x1.ae6361b7254e7p-26,
- 0x1.71de382e8d62bp-19,
--0x1.a01a019aeb4ffp-13,
- 0x1.111111110b25ep-7,
--0x1.55555555554c3p-3,
-};
-
-#define C7 v_f64 (Poly[0])
-#define C6 v_f64 (Poly[1])
-#define C5 v_f64 (Poly[2])
-#define C4 v_f64 (Poly[3])
-#define C3 v_f64 (Poly[4])
-#define C2 v_f64 (Poly[5])
-#define C1 v_f64 (Poly[6])
-
-#define InvPi v_f64 (0x1.45f306dc9c883p-2)
-#define Pi1 v_f64 (0x1.921fb54442d18p+1)
-#define Pi2 v_f64 (0x1.1a62633145c06p-53)
-#define Pi3 v_f64 (0x1.c1cd129024e09p-106)
-#define Shift v_f64 (0x1.8p52)
-#define RangeVal v_f64 (0x1p23)
-#define AbsMask v_u64 (0x7fffffffffffffff)
-
-VPCS_ATTR
-__attribute__ ((noinline)) static v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
-{
- return v_call_f64 (sin, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f64_t
-V_NAME(sin) (v_f64_t x)
-{
- v_f64_t n, r, r2, y;
- v_u64_t sign, odd, cmp;
-
- r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask);
- sign = v_as_u64_f64 (x) & ~AbsMask;
- cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal));
-
- /* n = rint(|x|/pi). */
- n = v_fma_f64 (InvPi, r, Shift);
- odd = v_as_u64_f64 (n) << 63;
- n -= Shift;
-
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = v_fma_f64 (-Pi1, n, r);
- r = v_fma_f64 (-Pi2, n, r);
- r = v_fma_f64 (-Pi3, n, r);
-
- /* sin(r) poly approx. */
- r2 = r * r;
- y = v_fma_f64 (C7, r2, C6);
- y = v_fma_f64 (y, r2, C5);
- y = v_fma_f64 (y, r2, C4);
- y = v_fma_f64 (y, r2, C3);
- y = v_fma_f64 (y, r2, C2);
- y = v_fma_f64 (y, r2, C1);
- y = v_fma_f64 (y * r2, r, r);
-
- /* sign. */
- y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign ^ odd);
-
- if (unlikely (v_any_u64 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-#endif
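
The reduction above is a three-term Cody-Waite scheme: pi is split into Pi1 + Pi2 + Pi3 so each fma subtracts a nearly exact multiple and the rounding error stays far below the reduced argument. A scalar sketch, hypothetical and using libm's round where the vector code uses the Shift trick:

    /* Scalar sketch of the 3-term pi reduction: sin(x) = (-1)^n sin(r). */
    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      double x = 100.0;
      double n = round (x * 0x1.45f306dc9c883p-2);  /* ~ rint(x/pi) */
      double r = fma (-0x1.921fb54442d18p+1, n, x); /* minus n*Pi1 */
      r = fma (-0x1.1a62633145c06p-53, n, r);       /* minus n*Pi2 */
      r = fma (-0x1.c1cd129024e09p-106, n, r);      /* minus n*Pi3 */
      double s = ((long long) n & 1) ? -sin (r) : sin (r);
      printf ("reduced sin = %.17g, libm sin = %.17g\n", s, sin (x));
      return 0;
    }
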
diff --git a/contrib/arm-optimized-routines/math/v_sinf.c b/contrib/arm-optimized-routines/math/v_sinf.c
deleted file mode 100644
index e66bfce6d8aa..000000000000
--- a/contrib/arm-optimized-routines/math/v_sinf.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Single-precision vector sin function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* 1.886 ulp error */
- 0x1.5b2e76p-19f,
- -0x1.9f42eap-13f,
- 0x1.110df4p-7f,
- -0x1.555548p-3f,
-};
-#define Pi1 v_f32 (0x1.921fb6p+1f)
-#define Pi2 v_f32 (-0x1.777a5cp-24f)
-#define Pi3 v_f32 (-0x1.ee59dap-49f)
-#define A3 v_f32 (Poly[3])
-#define A5 v_f32 (Poly[2])
-#define A7 v_f32 (Poly[1])
-#define A9 v_f32 (Poly[0])
-#define RangeVal v_f32 (0x1p20f)
-#define InvPi v_f32 (0x1.45f306p-2f)
-#define Shift v_f32 (0x1.8p+23f)
-#define AbsMask v_u32 (0x7fffffff)
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
-{
- /* Fall back to scalar code. */
- return v_call_f32 (sinf, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(sinf) (v_f32_t x)
-{
- v_f32_t n, r, r2, y;
- v_u32_t sign, odd, cmp;
-
- r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask);
- sign = v_as_u32_f32 (x) & ~AbsMask;
- cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal));
-
- /* n = rint(|x|/pi) */
- n = v_fma_f32 (InvPi, r, Shift);
- odd = v_as_u32_f32 (n) << 31;
- n -= Shift;
-
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
- r = v_fma_f32 (-Pi1, n, r);
- r = v_fma_f32 (-Pi2, n, r);
- r = v_fma_f32 (-Pi3, n, r);
-
- /* y = sin(r) */
- r2 = r * r;
- y = v_fma_f32 (A9, r2, A7);
- y = v_fma_f32 (y, r2, A5);
- y = v_fma_f32 (y, r2, A3);
- y = v_fma_f32 (y * r2, r, r);
-
- /* sign fix */
- y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd);
-
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-#endif
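
One detail above: odd is read from n while Shift is still added, because at that point the integer value of n sits in the low mantissa bits of the float, so bit 0 is its parity and shifting it to bit 31 yields a ready-made sign mask for the later xor. A minimal sketch of that extraction (hypothetical):

    /* With Shift = 0x1.8p23 added, n's low mantissa bit is its parity. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      float n = 5.0f + 0x1.8p23f;  /* integer n, still biased by Shift */
      uint32_t u, odd;
      memcpy (&u, &n, sizeof u);
      odd = u << 31;               /* 0x80000000 iff n is odd */
      printf ("odd mask = %#x\n", odd);
      return 0;
    }
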
diff --git a/contrib/arm-optimized-routines/math/vn_cos.c b/contrib/arm-optimized-routines/math/vn_cos.c
deleted file mode 100644
index b57a549eba68..000000000000
--- a/contrib/arm-optimized-routines/math/vn_cos.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_cos.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_cos, _ZGVnN2v_cos)
-#include "v_cos.c"
-#endif
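
The alias name encodes the AArch64 vector function ABI: _ZGVnN2v_cos means an AdvSIMD ('n'), unmasked ('N'), 2-lane variant of cos taking one vector argument ('v'), which lets compilers emit calls to this symbol when vectorizing loops over cos. It can also be called directly, as in this hypothetical caller (assumes an AArch64 toolchain with the library linked in):

    /* Hypothetical direct call to the vector-PCS alias defined above. */
    #include <arm_neon.h>
    #include <stdio.h>

    __attribute__ ((aarch64_vector_pcs))
    float64x2_t _ZGVnN2v_cos (float64x2_t);

    int
    main (void)
    {
      float64x2_t x = {0.0, 1.0};
      float64x2_t y = _ZGVnN2v_cos (x);
      printf ("cos(0)=%g cos(1)=%g\n", y[0], y[1]);
      return 0;
    }
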
diff --git a/contrib/arm-optimized-routines/math/vn_cosf.c b/contrib/arm-optimized-routines/math/vn_cosf.c
deleted file mode 100644
index 6321d4620fa7..000000000000
--- a/contrib/arm-optimized-routines/math/vn_cosf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_cosf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf)
-#include "v_cosf.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_exp.c b/contrib/arm-optimized-routines/math/vn_exp.c
deleted file mode 100644
index 06e269d41766..000000000000
--- a/contrib/arm-optimized-routines/math/vn_exp.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_exp.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp)
-#include "v_exp.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_exp2f.c b/contrib/arm-optimized-routines/math/vn_exp2f.c
deleted file mode 100644
index db9707e86f16..000000000000
--- a/contrib/arm-optimized-routines/math/vn_exp2f.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_exp2f.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_exp2f, _ZGVnN4v_exp2f)
-#include "v_exp2f.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_exp2f_1u.c b/contrib/arm-optimized-routines/math/vn_exp2f_1u.c
deleted file mode 100644
index 17bd0abd7a60..000000000000
--- a/contrib/arm-optimized-routines/math/vn_exp2f_1u.c
+++ /dev/null
@@ -1,11 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_exp2f_1u.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#include "v_exp2f_1u.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_expf.c b/contrib/arm-optimized-routines/math/vn_expf.c
deleted file mode 100644
index 0652907225d9..000000000000
--- a/contrib/arm-optimized-routines/math/vn_expf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_expf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf)
-#include "v_expf.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_expf_1u.c b/contrib/arm-optimized-routines/math/vn_expf_1u.c
deleted file mode 100644
index 3be776814822..000000000000
--- a/contrib/arm-optimized-routines/math/vn_expf_1u.c
+++ /dev/null
@@ -1,11 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_expf_1u.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#include "v_expf_1u.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_log.c b/contrib/arm-optimized-routines/math/vn_log.c
deleted file mode 100644
index b58fe8ff820a..000000000000
--- a/contrib/arm-optimized-routines/math/vn_log.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_log.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_log, _ZGVnN2v_log)
-#include "v_log.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_logf.c b/contrib/arm-optimized-routines/math/vn_logf.c
deleted file mode 100644
index cc5b8ae3ed55..000000000000
--- a/contrib/arm-optimized-routines/math/vn_logf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_logf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_logf, _ZGVnN4v_logf)
-#include "v_logf.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_pow.c b/contrib/arm-optimized-routines/math/vn_pow.c
deleted file mode 100644
index 260950113b04..000000000000
--- a/contrib/arm-optimized-routines/math/vn_pow.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_pow.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow)
-#include "v_pow.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_powf.c b/contrib/arm-optimized-routines/math/vn_powf.c
deleted file mode 100644
index 095d07e337ad..000000000000
--- a/contrib/arm-optimized-routines/math/vn_powf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_powf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_powf, _ZGVnN4vv_powf)
-#include "v_powf.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_sin.c b/contrib/arm-optimized-routines/math/vn_sin.c
deleted file mode 100644
index 905c79623350..000000000000
--- a/contrib/arm-optimized-routines/math/vn_sin.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_sin.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_sin, _ZGVnN2v_sin)
-#include "v_sin.c"
-#endif
diff --git a/contrib/arm-optimized-routines/math/vn_sinf.c b/contrib/arm-optimized-routines/math/vn_sinf.c
deleted file mode 100644
index 1214e1a55638..000000000000
--- a/contrib/arm-optimized-routines/math/vn_sinf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_sinf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf)
-#include "v_sinf.c"
-#endif