diff options
author | Andrew Turner <andrew@FreeBSD.org> | 2023-06-09 16:56:02 +0000 |
---|---|---|
committer | Andrew Turner <andrew@FreeBSD.org> | 2023-06-09 16:56:02 +0000 |
commit | 072a4ba82a01476eaee33781ccd241033eefcf0b (patch) | |
tree | caa6144dde0c7923c17942f573ee0d548a33a609 /contrib/arm-optimized-routines/pl/math | |
parent | 1dd169af7143db4df613f273e565919c1c2b53f5 (diff) | |
parent | 29866ecb89620f1c798b7f5ff6710255f13aa52e (diff) | |
download | src-072a4ba82a01476eaee33781ccd241033eefcf0b.tar.gz src-072a4ba82a01476eaee33781ccd241033eefcf0b.zip |
Update the Arm Optimized Routine library to v23.01
Sponsored by: Arm Ltd
Diffstat (limited to 'contrib/arm-optimized-routines/pl/math')
273 files changed, 17790 insertions, 0 deletions
diff --git a/contrib/arm-optimized-routines/pl/math/Dir.mk b/contrib/arm-optimized-routines/pl/math/Dir.mk new file mode 100644 index 000000000000..be65344572a8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/Dir.mk @@ -0,0 +1,229 @@ +# Makefile fragment - requires GNU make +# +# Copyright (c) 2019-2023, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +PLM := $(srcdir)/pl/math +AOR := $(srcdir)/math +B := build/pl/math + +math-lib-srcs := $(wildcard $(PLM)/*.[cS]) +math-test-srcs := \ + $(AOR)/test/mathtest.c \ + $(AOR)/test/mathbench.c \ + $(AOR)/test/ulp.c \ + +math-test-host-srcs := $(wildcard $(AOR)/test/rtest/*.[cS]) + +math-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h)) +math-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h)) + +math-libs := \ + build/pl/lib/libmathlib.so \ + build/pl/lib/libmathlib.a \ + +math-tools := \ + build/pl/bin/mathtest \ + build/pl/bin/mathbench \ + build/pl/bin/mathbench_libc \ + build/pl/bin/runulp.sh \ + build/pl/bin/ulp \ + +math-host-tools := \ + build/pl/bin/rtest \ + +math-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(math-lib-srcs))) +math-test-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-srcs))) +math-host-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-host-srcs))) +math-target-objs := $(math-lib-objs) $(math-test-objs) +math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs) + +pl/math-files := \ + $(math-objs) \ + $(math-libs) \ + $(math-tools) \ + $(math-host-tools) \ + $(math-includes) \ + $(math-test-includes) \ + +all-pl/math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes) + +$(math-objs): $(math-includes) $(math-test-includes) +$(math-objs): CFLAGS_PL += $(math-cflags) +$(B)/test/mathtest.o: CFLAGS_PL += -fmath-errno +$(math-host-objs): CC = $(HOST_CC) +$(math-host-objs): CFLAGS_PL = $(HOST_CFLAGS) + 
+build/pl/include/test/ulp_funcs_gen.h: $(math-lib-srcs) + # Replace PL_SIG + cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)" -P > $@ + +build/pl/include/test/mathbench_funcs_gen.h: $(math-lib-srcs) + # Replace PL_SIG macros with mathbench func entries + cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f, ##__VA_ARGS__)" -P > $@ + +build/pl/include/test/ulp_wrappers_gen.h: $(math-lib-srcs) + # Replace PL_SIG macros with ULP wrapper declarations + cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=Z##v##N##t##a##_WRAP(f)" -P > $@ + +$(B)/test/ulp.o: $(AOR)/test/ulp.h build/pl/include/test/ulp_funcs_gen.h build/pl/include/test/ulp_wrappers_gen.h +$(B)/test/ulp.o: CFLAGS_PL += -I build/pl/include/test + +$(B)/test/mathbench.o: build/pl/include/test/mathbench_funcs_gen.h +$(B)/test/mathbench.o: CFLAGS_PL += -I build/pl/include/test + +build/pl/lib/libmathlib.so: $(math-lib-objs:%.o=%.os) + $(CC) $(CFLAGS_PL) $(LDFLAGS) -shared -o $@ $^ + +build/pl/lib/libmathlib.a: $(math-lib-objs) + rm -f $@ + $(AR) rc $@ $^ + $(RANLIB) $@ + +$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc +$(math-tools): LDLIBS += $(math-ldlibs) -lm + +# Some targets to build pl/math/test from math/test sources +build/pl/math/test/%.o: $(srcdir)/math/test/%.S + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/math/test/%.o: $(srcdir)/math/test/%.c + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/math/test/%.os: $(srcdir)/math/test/%.S + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/math/test/%.os: $(srcdir)/math/test/%.c + $(CC) $(CFLAGS_PL) -c -o $@ $< + +# Some targets to build pl/ sources using appropriate flags +build/pl/%.o: $(srcdir)/pl/%.S + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/%.o: $(srcdir)/pl/%.c + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/%.os: $(srcdir)/pl/%.S + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/%.os: $(srcdir)/pl/%.c + $(CC) $(CFLAGS_PL) -c -o $@ $< + +build/pl/bin/rtest: $(math-host-objs) + 
$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS) + +build/pl/bin/mathtest: $(B)/test/mathtest.o build/pl/lib/libmathlib.a + $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) + +build/pl/bin/mathbench: $(B)/test/mathbench.o build/pl/lib/libmathlib.a + $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) + +# This is not ideal, but allows custom symbols in mathbench to get resolved. +build/pl/bin/mathbench_libc: $(B)/test/mathbench.o build/pl/lib/libmathlib.a + $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $< $(LDLIBS) -lc build/pl/lib/libmathlib.a -lm + +build/pl/bin/ulp: $(B)/test/ulp.o build/pl/lib/libmathlib.a + $(CC) $(CFLAGS_PL) $(LDFLAGS) -static -o $@ $^ $(LDLIBS) + +build/pl/include/%.h: $(PLM)/include/%.h + cp $< $@ + +build/pl/include/test/%.h: $(PLM)/test/%.h + cp $< $@ + +build/pl/bin/%.sh: $(PLM)/test/%.sh + cp $< $@ + +pl-math-tests := $(wildcard $(PLM)/test/testcases/directed/*.tst) +pl-math-rtests := $(wildcard $(PLM)/test/testcases/random/*.tst) + +check-pl/math-test: $(math-tools) + cat $(pl-math-tests) | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) + +check-pl/math-rtest: $(math-host-tools) $(math-tools) + cat $(pl-math-rtests) | build/pl/bin/rtest | $(EMULATOR) build/pl/bin/mathtest $(math-testflags) + +ulp-input-dir=$(B)/test/inputs + +math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(math-lib-srcs))) +math-lib-aliases = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.alias,$(basename $(math-lib-srcs))) +math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(math-lib-srcs))) +math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(math-lib-srcs))) + +ulp-inputs = $(math-lib-lims) $(math-lib-aliases) $(math-lib-fenvs) $(math-lib-itvs) + +$(ulp-inputs): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags) + +$(ulp-input-dir)/%.ulp: $(PLM)/%.c + mkdir -p $(@D) + $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_ULP [^ ]* [^ ]*" || true; } > $@ + 
+$(ulp-input-dir)/%.alias: $(PLM)/%.c + mkdir -p $(@D) + $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_ALIAS" || true; } | sed "s/_x / /g"> $@ + +$(ulp-input-dir)/%.fenv: $(PLM)/%.c + mkdir -p $(@D) + $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_EXPECT_FENV_ENABLED [^ ]*" || true; } > $@ + +$(ulp-input-dir)/%.itv: $(PLM)/%.c + mkdir -p $(dir $@) + $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_INTERVAL " || true; } | sed "s/ PL_TEST_INTERVAL/\nPL_TEST_INTERVAL/g" > $@ + +ulp-lims := $(ulp-input-dir)/limits +$(ulp-lims): $(math-lib-lims) + cat $^ | sed "s/PL_TEST_ULP //g;s/^ *//g" > $@ + +ulp-aliases := $(ulp-input-dir)/aliases +$(ulp-aliases): $(math-lib-aliases) + cat $^ | sed "s/PL_TEST_ALIAS //g;s/^ *//g" > $@ + +fenv-exps := $(ulp-input-dir)/fenv +$(fenv-exps): $(math-lib-fenvs) + cat $^ | sed "s/PL_TEST_EXPECT_FENV_ENABLED //g;s/^ *//g" > $@ + +ulp-itvs-noalias := $(ulp-input-dir)/itvs_noalias +$(ulp-itvs-noalias): $(math-lib-itvs) + cat $^ > $@ + +rename-aliases := $(ulp-input-dir)/rename_alias.sed +$(rename-aliases): $(ulp-aliases) + # Build sed script for replacing aliases from generated alias file + cat $< | awk '{ print "s/ " $$1 " / " $$2 " /g" }' > $@ + +ulp-itvs-alias := $(ulp-input-dir)/itvs_alias +$(ulp-itvs-alias): $(ulp-itvs-noalias) $(rename-aliases) + cat $< | sed -f $(rename-aliases) > $@ + +ulp-itvs := $(ulp-input-dir)/intervals +$(ulp-itvs): $(ulp-itvs-alias) $(ulp-itvs-noalias) + cat $^ | sort -u | sed "s/PL_TEST_INTERVAL //g" > $@ + +check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) $(fenv-exps) $(ulp-itvs) + WANT_SVE_MATH=$(WANT_SVE_MATH) \ + ULPFLAGS="$(math-ulpflags)" \ + LIMITS=../../../$(ulp-lims) \ + ALIASES=../../../$(ulp-aliases) \ + INTERVALS=../../../$(ulp-itvs) \ + FENV=../../../$(fenv-exps) \ + build/pl/bin/runulp.sh $(EMULATOR) + +check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp + +$(DESTDIR)$(libdir)/pl/%.so: build/pl/lib/%.so + $(INSTALL) -D $< 
$@ + +$(DESTDIR)$(libdir)/pl/%: build/pl/lib/% + $(INSTALL) -m 644 -D $< $@ + +$(DESTDIR)$(includedir)/pl/%: build/pl/include/% + $(INSTALL) -m 644 -D $< $@ + +install-pl/math: \ + $(math-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \ + $(math-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%) + +clean-pl/math: + rm -f $(pl/math-files) + +.PHONY: all-pl/math check-pl/math-test check-pl/math-rtest check-pl/math-ulp check-pl/math install-pl/math clean-pl/math diff --git a/contrib/arm-optimized-routines/pl/math/acosh_3u.c b/contrib/arm-optimized-routines/pl/math/acosh_3u.c new file mode 100644 index 000000000000..4e2cb6737ba8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/acosh_3u.c @@ -0,0 +1,66 @@ +/* + * Double-precision acosh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Ln2 (0x1.62e42fefa39efp-1) +#define MinusZero (0x8000000000000000) +#define SquareLim (0x5fe0000000000000) /* asuint64(0x1.0p511). */ +#define Two (0x4000000000000000) /* asuint64(2.0). */ + +double +optr_aor_log_f64 (double); + +double +log1p (double); + +/* acosh approximation using a variety of approaches on different intervals: + + acosh(x) = ln(x + sqrt(x * x - 1)). + + x >= 2^511: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is + close enough to x that we can calculate the result by ln(2x) == ln(x) + + ln(2). The greatest observed error in this region is 0.98 ULP: + acosh(0x1.1b9bf42923d1dp+853) got 0x1.28066a11a7c7fp+9 + want 0x1.28066a11a7c8p+9. + + x > 2: Calculate the result directly using definition of acosh(x). Greatest + observed error in this region is 1.33 ULP: + acosh(0x1.1e45d14bfcfa2p+1) got 0x1.71a06f50c34b5p+0 + want 0x1.71a06f50c34b6p+0. + + 0 <= x <= 2: Calculate the result using log1p. For x < 1, acosh(x) is + undefined. 
For 1 <= x <= 2, the largest observed error is 2.69 ULP: + acosh(0x1.073528248093p+0) got 0x1.e4d9bd20684f3p-3 + want 0x1.e4d9bd20684f6p-3. */ +double +acosh (double x) +{ + uint64_t ix = asuint64 (x); + + if (unlikely (ix >= MinusZero)) + return __math_invalid (x); + + if (unlikely (ix >= SquareLim)) + return optr_aor_log_f64 (x) + Ln2; + + if (ix >= Two) + return optr_aor_log_f64 (x + sqrt (x * x - 1)); + + double xm1 = x - 1; + return log1p (xm1 + sqrt (2 * xm1 + xm1 * xm1)); +} + +PL_SIG (S, D, 1, acosh, 1.0, 10.0) +PL_TEST_ULP (acosh, 2.19) +PL_TEST_INTERVAL (acosh, 0, 1, 10000) +PL_TEST_INTERVAL (acosh, 1, 2, 100000) +PL_TEST_INTERVAL (acosh, 2, 0x1p511, 100000) +PL_TEST_INTERVAL (acosh, 0x1p511, inf, 100000) +PL_TEST_INTERVAL (acosh, -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/acoshf_2u8.c b/contrib/arm-optimized-routines/pl/math/acoshf_2u8.c new file mode 100644 index 000000000000..c9cded7fd2ff --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/acoshf_2u8.c @@ -0,0 +1,63 @@ +/* + * Single-precision acosh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Ln2 (0x1.62e4p-1f) +#define MinusZero 0x80000000 +#define SquareLim 0x5f800000 /* asuint(0x1p64). */ +#define Two 0x40000000 + +/* Single-precision log from math/. */ +float +optr_aor_log_f32 (float); + +/* Single-precision log(1+x) from pl/math. */ +float +log1pf (float); + +/* acoshf approximation using a variety of approaches on different intervals: + + x >= 2^64: We cannot square x without overflow. For huge x, sqrt(x*x - 1) is + close enough to x that we can calculate the result by ln(2x) == ln(x) + + ln(2). The greatest error in the region is 0.94 ULP: + acoshf(0x1.15f706p+92) got 0x1.022e14p+6 want 0x1.022e16p+6. + + x > 2: Calculate the result directly using definition of asinh(x) = ln(x + + sqrt(x*x - 1)). 
Greatest error in this region is 1.30 ULP: + acoshf(0x1.249d8p+1) got 0x1.77e1aep+0 want 0x1.77e1bp+0. + + 0 <= x <= 2: Calculate the result using log1p. For x < 1, acosh(x) is + undefined. For 1 <= x <= 2, the greatest error is 2.78 ULP: + acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3 want 0x1.ef9ea2p-3. */ +float +acoshf (float x) +{ + uint32_t ix = asuint (x); + + if (unlikely (ix >= MinusZero)) + return __math_invalidf (x); + + if (unlikely (ix >= SquareLim)) + return optr_aor_log_f32 (x) + Ln2; + + if (ix > Two) + return optr_aor_log_f32 (x + sqrtf (x * x - 1)); + + float xm1 = x - 1; + return log1pf (xm1 + sqrtf (2 * xm1 + xm1 * xm1)); +} + +PL_SIG (S, F, 1, acosh, 1.0, 10.0) +PL_TEST_ULP (acoshf, 2.30) +PL_TEST_INTERVAL (acoshf, 0, 1, 100) +PL_TEST_INTERVAL (acoshf, 1, 2, 10000) +PL_TEST_INTERVAL (acoshf, 2, 0x1p64, 100000) +PL_TEST_INTERVAL (acoshf, 0x1p64, inf, 100000) +PL_TEST_INTERVAL (acoshf, -0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/asinh_2u5.c b/contrib/arm-optimized-routines/pl/math/asinh_2u5.c new file mode 100644 index 000000000000..f1679556d5f8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/asinh_2u5.c @@ -0,0 +1,86 @@ +/* + * Double-precision asinh(x) function + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "estrin.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define ExpM26 0x3e50000000000000 /* asuint64(0x1.0p-26). */ +#define One 0x3ff0000000000000 /* asuint64(1.0). */ +#define Exp511 0x5fe0000000000000 /* asuint64(0x1.0p511). */ +#define Ln2 0x1.62e42fefa39efp-1 + +double +optr_aor_log_f64 (double); + +/* Scalar double-precision asinh implementation. This routine uses different + approaches on different intervals: + + |x| < 2^-26: Return x. Function is exact in this region. + + |x| < 1: Use custom order-17 polynomial. This is least accurate close to 1. 
+ The largest observed error in this region is 1.47 ULPs: + asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 + want 0x1.c1d6bf874019cp-1. + + |x| < 2^511: Upper bound of this region is close to sqrt(DBL_MAX). Calculate + the result directly using the definition asinh(x) = ln(x + sqrt(x*x + 1)). + The largest observed error in this region is 2.03 ULPs: + asinh(-0x1.00094e0f39574p+0) got -0x1.c3508eb6a681ep-1 + want -0x1.c3508eb6a682p-1. + + |x| >= 2^511: We cannot square x without overflow at a low + cost. At very large x, asinh(x) ~= ln(2x). At huge x we cannot + even double x without overflow, so calculate this as ln(x) + + ln(2). The largest observed error in this region is 0.98 ULPs at many + values, for instance: + asinh(0x1.5255a4cf10319p+975) got 0x1.52652f4cb26cbp+9 + want 0x1.52652f4cb26ccp+9. */ +double +asinh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + double ax = asdouble (ia); + uint64_t sign = ix & ~AbsMask; + + if (ia < ExpM26) + { + return x; + } + + if (ia < One) + { + double x2 = x * x; + double z2 = x2 * x2; + double z4 = z2 * z2; + double z8 = z4 * z4; +#define C(i) __asinh_data.poly[i] + double p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); + double y = fma (p, x2 * ax, ax); + return asdouble (asuint64 (y) | sign); + } + + if (unlikely (ia >= Exp511)) + { + return asdouble (asuint64 (optr_aor_log_f64 (ax) + Ln2) | sign); + } + + return asdouble (asuint64 (optr_aor_log_f64 (ax + sqrt (ax * ax + 1))) + | sign); +} + +PL_SIG (S, D, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (asinh, 1.54) +PL_TEST_INTERVAL (asinh, -0x1p-26, 0x1p-26, 50000) +PL_TEST_INTERVAL (asinh, 0x1p-26, 1.0, 40000) +PL_TEST_INTERVAL (asinh, -0x1p-26, -1.0, 10000) +PL_TEST_INTERVAL (asinh, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (asinh, -1.0, -100.0, 10000) +PL_TEST_INTERVAL (asinh, 100.0, inf, 50000) +PL_TEST_INTERVAL (asinh, -100.0, -inf, 10000) diff --git a/contrib/arm-optimized-routines/pl/math/asinh_data.c 
b/contrib/arm-optimized-routines/pl/math/asinh_data.c new file mode 100644 index 000000000000..073b19799bda --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/asinh_data.c @@ -0,0 +1,22 @@ +/* + * Double-precision polynomial coefficients for scalar asinh(x) + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* asinh(x) is odd, and the first term of the Taylor expansion is x, so we can + approximate the function by x + x^3 * P(x^2), where P(z) has the form: + C0 + C1 * z + C2 * z^2 + C3 * z^3 + ... + Note P is evaluated on even powers of x only. See tools/asinh.sollya for the + algorithm used to generate these coefficients. */ +const struct asinh_data __asinh_data + = {.poly + = {-0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5, + 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6, + -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7, + 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8, + -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11, + 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18}}; diff --git a/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c b/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c new file mode 100644 index 000000000000..2b2c55db56dc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/asinhf_3u5.c @@ -0,0 +1,78 @@ +/* + * Single-precision asinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "estrinf.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask (0x7fffffff) +#define SqrtFltMax (0x1.749e96p+10f) +#define Ln2 (0x1.62e4p-1f) +#define One (0x3f8) +#define ExpM12 (0x398) + +#define C(i) __asinhf_data.coeffs[i] + +float +optr_aor_log_f32 (float); + +/* asinhf approximation using a variety of approaches on different intervals: + + |x| < 2^-12: Return x. Function is exactly rounded in this region. + + |x| < 1.0: Use custom order-8 polynomial. The largest observed + error in this region is 1.3ulps: + asinhf(0x1.f0f74cp-1) got 0x1.b88de4p-1 want 0x1.b88de2p-1. + + |x| <= SqrtFltMax: Calculate the result directly using the + definition of asinh(x) = ln(x + sqrt(x*x + 1)). The largest + observed error in this region is 1.99ulps. + asinhf(0x1.00e358p+0) got 0x1.c4849ep-1 want 0x1.c484a2p-1. + + |x| > SqrtFltMax: We cannot square x without overflow at a low + cost. At very large x, asinh(x) ~= ln(2x). At huge x we cannot + even double x without overflow, so calculate this as ln(x) + + ln(2). This largest observed error in this region is 3.39ulps. + asinhf(0x1.749e9ep+10) got 0x1.fffff8p+2 want 0x1.fffffep+2. 
*/ +float +asinhf (float x) +{ + uint32_t ix = asuint (x); + uint32_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 20; + float ax = asfloat (ia); + uint32_t sign = ix & ~AbsMask; + + if (unlikely (ia12 < ExpM12 || ia == 0x7f800000)) + return x; + + if (unlikely (ia12 >= 0x7f8)) + return __math_invalidf (x); + + if (ia12 < One) + { + float x2 = ax * ax; + float p = ESTRIN_7 (ax, x2, x2 * x2, C); + float y = fmaf (x2, p, ax); + return asfloat (asuint (y) | sign); + } + + if (unlikely (ax > SqrtFltMax)) + { + return asfloat (asuint (optr_aor_log_f32 (ax) + Ln2) | sign); + } + + return asfloat (asuint (optr_aor_log_f32 (ax + sqrtf (ax * ax + 1))) | sign); +} + +PL_SIG (S, F, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (asinhf, 2.9) +PL_TEST_INTERVAL (asinhf, 0, 0x1p-12, 5000) +PL_TEST_INTERVAL (asinhf, 0x1p-12, 1.0, 50000) +PL_TEST_INTERVAL (asinhf, 1.0, 0x1p11, 50000) +PL_TEST_INTERVAL (asinhf, 0x1p11, 0x1p127, 20000) diff --git a/contrib/arm-optimized-routines/pl/math/asinhf_data.c b/contrib/arm-optimized-routines/pl/math/asinhf_data.c new file mode 100644 index 000000000000..cd1ef16b3b6a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/asinhf_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients for single-precision asinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Approximate asinhf(x) directly in [2^-12, 1]. See for tools/asinhf.sollya for + these coeffs were generated. 
*/ +const struct asinhf_data __asinhf_data + = {.coeffs + = {-0x1.9b16fap-19f, -0x1.552baap-3f, -0x1.4e572ap-11f, 0x1.3a81dcp-4f, + 0x1.65bbaap-10f, -0x1.057f1p-4f, 0x1.6c1d46p-5f, -0x1.4cafe8p-7f}}; diff --git a/contrib/arm-optimized-routines/pl/math/atan2_2u5.c b/contrib/arm-optimized-routines/pl/math/atan2_2u5.c new file mode 100644 index 000000000000..c909ac99fa22 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atan2_2u5.c @@ -0,0 +1,159 @@ +/* + * Double-precision scalar atan2(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include <stdbool.h> + +#include "atan_common.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Pi (0x1.921fb54442d18p+1) +#define PiOver2 (0x1.921fb54442d18p+0) +#define PiOver4 (0x1.921fb54442d18p-1) +#define SignMask (0x8000000000000000) +#define ExpMask (0x7ff0000000000000) + +/* We calculate atan2 by P(n/d), where n and d are similar to the input + arguments, and P is a polynomial. Evaluating P(x) requires calculating x^8, + which may underflow if n and d have very different magnitude. + POW8_EXP_UFLOW_BOUND is the lower bound of the difference in exponents of n + and d for which P underflows, and is used to special-case such inputs. */ +#define POW8_EXP_UFLOW_BOUND 62 + +static inline int64_t +biased_exponent (double f) +{ + uint64_t fi = asuint64 (f); + return (fi & ExpMask) >> 52; +} + +/* Fast implementation of scalar atan2. Largest errors are when y and x are + close together. The greatest observed error is 2.28 ULP: + atan2(-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) + got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. 
*/ +double +atan2 (double y, double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iy = asuint64 (y); + + uint64_t sign_x = ix & SignMask; + uint64_t sign_y = iy & SignMask; + + uint64_t iax = ix & ~SignMask; + uint64_t iay = iy & ~SignMask; + + bool xisnan = isnan (x); + if (unlikely (isnan (y) && !xisnan)) + return __math_invalid (y); + if (unlikely (xisnan)) + return __math_invalid (x); + + /* m = 2 * sign(x) + sign(y). */ + uint32_t m = ((iy >> 63) & 1) | ((ix >> 62) & 2); + + int64_t exp_diff = biased_exponent (x) - biased_exponent (y); + + /* y = 0. */ + if (iay == 0) + { + switch (m) + { + case 0: + case 1: + return y; /* atan(+-0,+anything)=+-0. */ + case 2: + return Pi; /* atan(+0,-anything) = pi. */ + case 3: + return -Pi; /* atan(-0,-anything) =-pi. */ + } + } + /* Special case for (x, y) either on or very close to the y axis. Either x = + 0, or y is much larger than x (difference in exponents >= + POW8_EXP_UFLOW_BOUND). */ + if (unlikely (iax == 0 || exp_diff <= -POW8_EXP_UFLOW_BOUND)) + return sign_y ? -PiOver2 : PiOver2; + + /* Special case for either x is INF or (x, y) is very close to x axis and x is + negative. */ + if (unlikely (iax == 0x7ff0000000000000 + || (exp_diff >= POW8_EXP_UFLOW_BOUND && m >= 2))) + { + if (iay == 0x7ff0000000000000) + { + switch (m) + { + case 0: + return PiOver4; /* atan(+INF,+INF). */ + case 1: + return -PiOver4; /* atan(-INF,+INF). */ + case 2: + return 3.0 * PiOver4; /* atan(+INF,-INF). */ + case 3: + return -3.0 * PiOver4; /* atan(-INF,-INF). */ + } + } + else + { + switch (m) + { + case 0: + return 0.0; /* atan(+...,+INF). */ + case 1: + return -0.0; /* atan(-...,+INF). */ + case 2: + return Pi; /* atan(+...,-INF). */ + case 3: + return -Pi; /* atan(-...,-INF). */ + } + } + } + /* y is INF. */ + if (iay == 0x7ff0000000000000) + return sign_y ? 
-PiOver2 : PiOver2; + + uint64_t sign_xy = sign_x ^ sign_y; + + double ax = asdouble (iax); + double ay = asdouble (iay); + uint64_t pred_aygtax = (ay > ax); + + /* Set up z for call to atan. */ + double n = pred_aygtax ? -ax : ay; + double d = pred_aygtax ? ay : ax; + double z = n / d; + + double ret; + if (unlikely (m < 2 && exp_diff >= POW8_EXP_UFLOW_BOUND)) + { + /* If (x, y) is very close to x axis and x is positive, the polynomial + will underflow and evaluate to z. */ + ret = z; + } + else + { + /* Work out the correct shift. */ + double shift = sign_x ? -2.0 : 0.0; + shift = pred_aygtax ? shift + 1.0 : shift; + shift *= PiOver2; + + ret = eval_poly (z, z, shift); + } + + /* Account for the sign of x and y. */ + return asdouble (asuint64 (ret) ^ sign_xy); +} + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +PL_SIG (S, D, 2, atan2) +PL_TEST_ULP (atan2, 1.78) +PL_TEST_INTERVAL (atan2, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (atan2, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (atan2, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (atan2, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (atan2, 1e6, 1e32, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/atan2f_3u.c new file mode 100644 index 000000000000..38e1df59c102 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atan2f_3u.c @@ -0,0 +1,167 @@ +/* + * Single-precision scalar atan2(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include <stdbool.h> + +#include "atanf_common.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Pi (0x1.921fb6p+1f) +#define PiOver2 (0x1.921fb6p+0f) +#define PiOver4 (0x1.921fb6p-1f) +#define SignMask (0x80000000) + +/* We calculate atan2f by P(n/d), where n and d are similar to the input + arguments, and P is a polynomial. The polynomial may underflow. 
+ POLY_UFLOW_BOUND is the lower bound of the difference in exponents of n and d + for which P underflows, and is used to special-case such inputs. */ +#define POLY_UFLOW_BOUND 24 + +static inline int32_t +biased_exponent (float f) +{ + uint32_t fi = asuint (f); + int32_t ex = (int32_t) ((fi & 0x7f800000) >> 23); + if (unlikely (ex == 0)) + { + /* Subnormal case - we still need to get the exponent right for subnormal + numbers as division may take us back inside the normal range. */ + return ex - __builtin_clz (fi << 9); + } + return ex; +} + +/* Fast implementation of scalar atan2f. Largest observed error is + 2.88ulps in [99.0, 101.0] x [99.0, 101.0]: + atan2f(0x1.9332d8p+6, 0x1.8cb6c4p+6) got 0x1.964646p-1 + want 0x1.964640p-1. */ +float +atan2f (float y, float x) +{ + uint32_t ix = asuint (x); + uint32_t iy = asuint (y); + + uint32_t sign_x = ix & SignMask; + uint32_t sign_y = iy & SignMask; + + uint32_t iax = ix & ~SignMask; + uint32_t iay = iy & ~SignMask; + + /* x or y is NaN. */ + if ((iax > 0x7f800000) || (iay > 0x7f800000)) + return x + y; + + /* m = 2 * sign(x) + sign(y). */ + uint32_t m = ((iy >> 31) & 1) | ((ix >> 30) & 2); + + /* The following follows glibc ieee754 implementation, except + that we do not use +-tiny shifts (non-nearest rounding mode). */ + + int32_t exp_diff = biased_exponent (x) - biased_exponent (y); + + /* Special case for (x, y) either on or very close to the x axis. Either y = + 0, or y is tiny and x is huge (difference in exponents >= + POLY_UFLOW_BOUND). In the second case, we only want to use this special + case when x is negative (i.e. quadrants 2 or 3). */ + if (unlikely (iay == 0 || (exp_diff >= POLY_UFLOW_BOUND && m >= 2))) + { + switch (m) + { + case 0: + case 1: + return y; /* atan(+-0,+anything)=+-0. */ + case 2: + return Pi; /* atan(+0,-anything) = pi. */ + case 3: + return -Pi; /* atan(-0,-anything) =-pi. */ + } + } + /* Special case for (x, y) either on or very close to the y axis. 
Either x = + 0, or x is tiny and y is huge (difference in exponents >= + POLY_UFLOW_BOUND). */ + if (unlikely (iax == 0 || exp_diff <= -POLY_UFLOW_BOUND)) + return sign_y ? -PiOver2 : PiOver2; + + /* x is INF. */ + if (iax == 0x7f800000) + { + if (iay == 0x7f800000) + { + switch (m) + { + case 0: + return PiOver4; /* atan(+INF,+INF). */ + case 1: + return -PiOver4; /* atan(-INF,+INF). */ + case 2: + return 3.0f * PiOver4; /* atan(+INF,-INF). */ + case 3: + return -3.0f * PiOver4; /* atan(-INF,-INF). */ + } + } + else + { + switch (m) + { + case 0: + return 0.0f; /* atan(+...,+INF). */ + case 1: + return -0.0f; /* atan(-...,+INF). */ + case 2: + return Pi; /* atan(+...,-INF). */ + case 3: + return -Pi; /* atan(-...,-INF). */ + } + } + } + /* y is INF. */ + if (iay == 0x7f800000) + return sign_y ? -PiOver2 : PiOver2; + + uint32_t sign_xy = sign_x ^ sign_y; + + float ax = asfloat (iax); + float ay = asfloat (iay); + + bool pred_aygtax = (ay > ax); + + /* Set up z for call to atanf. */ + float n = pred_aygtax ? -ax : ay; + float d = pred_aygtax ? ay : ax; + float z = n / d; + + float ret; + if (unlikely (m < 2 && exp_diff >= POLY_UFLOW_BOUND)) + { + /* If (x, y) is very close to x axis and x is positive, the polynomial + will underflow and evaluate to z. */ + ret = z; + } + else + { + /* Work out the correct shift. */ + float shift = sign_x ? -2.0f : 0.0f; + shift = pred_aygtax ? shift + 1.0f : shift; + shift *= PiOver2; + + ret = eval_poly (z, z, shift); + } + + /* Account for the sign of x and y. */ + return asfloat (asuint (ret) ^ sign_xy); +} + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ +PL_SIG (S, F, 2, atan2) +PL_TEST_ULP (atan2f, 2.4) +PL_TEST_INTERVAL (atan2f, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (atan2f, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (atan2f, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (atan2f, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (atan2f, 1e6, 1e32, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/atan_2u5.c b/contrib/arm-optimized-routines/pl/math/atan_2u5.c new file mode 100644 index 000000000000..ee4770101758 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atan_2u5.c @@ -0,0 +1,73 @@ +/* + * Double-precision atan(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "pl_sig.h" +#include "pl_test.h" +#include "atan_common.h" + +#define AbsMask 0x7fffffffffffffff +#define PiOver2 0x1.921fb54442d18p+0 +#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)). */ +#define BigBound 0x434 /* top12(asuint64(0x1p53)). */ +#define OneTop 0x3ff + +/* Fast implementation of double-precision atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: + atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +double +atan (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t sign = ix & ~AbsMask; + uint64_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 52; + + if (unlikely (ia12 >= BigBound || ia12 < TinyBound)) + { + if (ia12 < TinyBound) + /* Avoid underflow by returning x. */ + return x; + if (ia > 0x7ff0000000000000) + /* Propagate NaN. */ + return __math_invalid (x); + /* atan(x) rounds to PiOver2 for large x. */ + return asdouble (asuint64 (PiOver2) ^ sign); + } + + double z, az, shift; + if (ia12 >= OneTop) + { + /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x). */ + z = -1.0 / x; + shift = PiOver2; + /* Use absolute value only when needed (odd powers of z). 
*/ + az = -fabs (z); + } + else + { + /* For x < 1, approximate atan(x) directly. */ + z = x; + shift = 0; + az = asdouble (ia); + } + + /* Calculate polynomial, shift + z + z^3 * P(z^2). */ + double y = eval_poly (z, az, shift); + /* Copy sign. */ + return asdouble (asuint64 (y) ^ sign); +} + +PL_SIG (S, D, 1, atan, -10.0, 10.0) +PL_TEST_ULP (atan, 1.78) +PL_TEST_INTERVAL (atan, 0, 0x1p-30, 10000) +PL_TEST_INTERVAL (atan, -0, -0x1p-30, 1000) +PL_TEST_INTERVAL (atan, 0x1p-30, 0x1p53, 900000) +PL_TEST_INTERVAL (atan, -0x1p-30, -0x1p53, 90000) +PL_TEST_INTERVAL (atan, 0x1p53, inf, 10000) +PL_TEST_INTERVAL (atan, -0x1p53, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/atan_common.h b/contrib/arm-optimized-routines/pl/math/atan_common.h new file mode 100644 index 000000000000..da0da6436854 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atan_common.h @@ -0,0 +1,49 @@ +/* + * Double-precision polynomial evaluation function for scalar and vector atan(x) + * and atan2(y,x). + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "estrin.h" + +#if V_SUPPORTED + +#include "v_math.h" + +#define DBL_T v_f64_t +#define P(i) v_f64 (__atan_poly_data.poly[i]) + +#else + +#define DBL_T double +#define P(i) __atan_poly_data.poly[i] + +#endif + +/* Polynomial used in fast atan(x) and atan2(y,x) implementations + The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ +static inline DBL_T +eval_poly (DBL_T z, DBL_T az, DBL_T shift) +{ + /* Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of + full scheme to avoid underflow in x^16. */ + DBL_T z2 = z * z; + DBL_T x2 = z2 * z2; + DBL_T x4 = x2 * x2; + DBL_T x8 = x4 * x4; + DBL_T y + = FMA (ESTRIN_11_ (z2, x2, x4, x8, P, 8), x8, ESTRIN_7 (z2, x2, x4, P)); + + /* Finalize. y = shift + z + z^3 * P(z^2). 
*/ + y = FMA (y, z2 * az, az); + y = y + shift; + + return y; +} + +#undef DBL_T +#undef FMA +#undef P diff --git a/contrib/arm-optimized-routines/pl/math/atan_data.c b/contrib/arm-optimized-routines/pl/math/atan_data.c new file mode 100644 index 000000000000..91d0f61d2eaf --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atan_data.c @@ -0,0 +1,20 @@ +/* + * Double-precision polynomial coefficients for vector atan(x) and atan2(y,x). + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct atan_poly_data __atan_poly_data = { + .poly = {/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. See atan.sollya for details of how these were + generated. */ + -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, + 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, + -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, + 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, + -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, + 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, + -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16}}; diff --git a/contrib/arm-optimized-routines/pl/math/atanf_2u9.c b/contrib/arm-optimized-routines/pl/math/atanf_2u9.c new file mode 100644 index 000000000000..9d17f252b8b9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atanf_2u9.c @@ -0,0 +1,76 @@ +/* + * Single-precision atan(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "atanf_common.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define PiOver2 0x1.921fb6p+0f +#define AbsMask 0x7fffffff +#define TinyBound 0x30800000 /* asuint(0x1p-30). */ +#define BigBound 0x4e800000 /* asuint(0x1p30). 
*/ +#define One 0x3f800000 + +/* Approximation of single-precision atan(x) based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] + using z=-1/x and shift = pi/2. + Maximum error is 2.88 ulps: + atanf(0x1.0565ccp+0) got 0x1.97771p-1 + want 0x1.97770ap-1. */ +float +atanf (float x) +{ + uint32_t ix = asuint (x); + uint32_t sign = ix & ~AbsMask; + uint32_t ia = ix & AbsMask; + + if (unlikely (ia < TinyBound)) + /* Avoid underflow by returning x. */ + return x; + + if (unlikely (ia > BigBound)) + { + if (ia > 0x7f800000) + /* Propagate NaN. */ + return __math_invalidf (x); + /* atan(x) rounds to PiOver2 for large x. */ + return asfloat (asuint (PiOver2) ^ sign); + } + + float z, az, shift; + if (ia > One) + { + /* For x > 1, use atan(x) = pi / 2 + atan(-1 / x). */ + z = -1.0f / x; + shift = PiOver2; + /* Use absolute value only when needed (odd powers of z). */ + az = -fabsf (z); + } + else + { + /* For x < 1, approximate atan(x) directly. */ + z = x; + az = asfloat (ia); + shift = 0; + } + + /* Calculate polynomial, shift + z + z^3 * P(z^2). */ + float y = eval_poly (z, az, shift); + /* Copy sign. */ + return asfloat (asuint (y) ^ sign); +} + +PL_SIG (S, F, 1, atan, -10.0, 10.0) +PL_TEST_ULP (atanf, 2.38) +PL_TEST_INTERVAL (atanf, 0, 0x1p-30, 5000) +PL_TEST_INTERVAL (atanf, -0, -0x1p-30, 5000) +PL_TEST_INTERVAL (atanf, 0x1p-30, 1, 40000) +PL_TEST_INTERVAL (atanf, -0x1p-30, -1, 40000) +PL_TEST_INTERVAL (atanf, 1, 0x1p30, 40000) +PL_TEST_INTERVAL (atanf, -1, -0x1p30, 40000) +PL_TEST_INTERVAL (atanf, 0x1p30, inf, 1000) +PL_TEST_INTERVAL (atanf, -0x1p30, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/atanf_common.h b/contrib/arm-optimized-routines/pl/math/atanf_common.h new file mode 100644 index 000000000000..37ca76dee2f7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atanf_common.h @@ -0,0 +1,51 @@ +/* + * Single-precision polynomial evaluation function for scalar and vector + * atan(x) and atan2(y,x). 
+ * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_ATANF_COMMON_H +#define PL_MATH_ATANF_COMMON_H + +#include "math_config.h" +#include "estrinf.h" + +#if V_SUPPORTED + +#include "v_math.h" + +#define FLT_T v_f32_t +#define P(i) v_f32 (__atanf_poly_data.poly[i]) + +#else + +#define FLT_T float +#define P(i) __atanf_poly_data.poly[i] + +#endif + +/* Polynomial used in fast atanf(x) and atan2f(y,x) implementations + The order 7 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ +static inline FLT_T +eval_poly (FLT_T z, FLT_T az, FLT_T shift) +{ + /* Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, + a standard implementation using z8 creates spurious underflow + in the very last fma (when z^8 is small enough). + Therefore, we split the last fma into a mul and and an fma. + Horner and single-level Estrin have higher errors that exceed + threshold. */ + FLT_T z2 = z * z; + FLT_T z4 = z2 * z2; + + /* Then assemble polynomial. */ + FLT_T y = FMA (z4, z4 * ESTRIN_3_ (z2, z4, P, 4), ESTRIN_3 (z2, z4, P)); + + /* Finalize: + y = shift + z * P(z^2). */ + return FMA (y, z2 * az, az) + shift; +} + +#endif // PL_MATH_ATANF_COMMON_H diff --git a/contrib/arm-optimized-routines/pl/math/atanf_data.c b/contrib/arm-optimized-routines/pl/math/atanf_data.c new file mode 100644 index 000000000000..c4cba2378cea --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atanf_data.c @@ -0,0 +1,15 @@ +/* + * Single-precision polynomial coefficients for vector atan(x) and atan2(y,x). + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. + */ +const struct atanf_poly_data __atanf_poly_data = { + .poly = {/* See atanf.sollya for details of how these were generated. 
*/ + -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, + -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f}}; diff --git a/contrib/arm-optimized-routines/pl/math/atanh_3u.c b/contrib/arm-optimized-routines/pl/math/atanh_3u.c new file mode 100644 index 000000000000..a168cd555ff6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atanh_3u.c @@ -0,0 +1,86 @@ +/* + * Double-precision atanh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define One 0x3ff0000000000000 +#define Ln2Hi 0x1.62e42fefa3800p-1 +#define Ln2Lo 0x1.ef35793c76730p-45 +#define OneMHfRt2Top \ + 0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */ +#define OneTop12 0x3ff +#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */ +#define BottomMask 0xffffffff +#define C(i) __log1p_data.coeffs[i] + +static inline double +log1p_inline (double x) +{ + /* Helper for calculating log(1 + x) using order-18 polynomial on a reduced + interval. Copied from log1p_2u.c, with no special-case handling. See that + file for details of the algorithm. */ + double m = x + 1; + uint64_t mi = asuint64 (m); + + /* Decompose x + 1 into (f + 1) * 2^k, with k chosen such that f is in + [sqrt(2)/2, sqrt(2)]. */ + uint32_t u = (mi >> 32) + OneMHfRt2Top; + int32_t k = (int32_t) (u >> 20) - OneTop12; + uint32_t utop = (u & 0x000fffff) + HfRt2Top; + uint64_t u_red = ((uint64_t) utop << 32) | (mi & BottomMask); + double f = asdouble (u_red) - 1; + + /* Correction term for round-off in f. */ + double cm = (x - (m - 1)) / m; + + /* Approximate log1p(f) with polynomial. 
*/ + double f2 = f * f; + double f4 = f2 * f2; + double f8 = f4 * f4; + double p = fma (f, ESTRIN_18 (f, f2, f4, f8, f8 * f8, C) * f, f); + + /* Recombine log1p(x) = k*log2 + log1p(f) + c/m. */ + double kd = k; + double y = fma (Ln2Lo, kd, cm); + return y + fma (Ln2Hi, kd, p); +} + +/* Approximation for double-precision inverse tanh(x), using a simplified + version of log1p. Greatest observed error is 3.00 ULP: + atanh(0x1.e58f3c108d714p-4) got 0x1.e7da77672a647p-4 + want 0x1.e7da77672a64ap-4. */ +double +atanh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t sign = ix & ~AbsMask; + uint64_t ia = ix & AbsMask; + + if (unlikely (ia == One)) + return __math_divzero (sign >> 32); + + if (unlikely (ia > One)) + return __math_invalid (x); + + double halfsign = asdouble (Half | sign); + double ax = asdouble (ia); + return halfsign * log1p_inline ((2 * ax) / (1 - ax)); +} + +PL_SIG (S, D, 1, atanh, -1.0, 1.0) +PL_TEST_ULP (atanh, 3.00) +PL_TEST_INTERVAL (atanh, 0, 0x1p-23, 10000) +PL_TEST_INTERVAL (atanh, -0, -0x1p-23, 10000) +PL_TEST_INTERVAL (atanh, 0x1p-23, 1, 90000) +PL_TEST_INTERVAL (atanh, -0x1p-23, -1, 90000) +PL_TEST_INTERVAL (atanh, 1, inf, 100) +PL_TEST_INTERVAL (atanh, -1, -inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c new file mode 100644 index 000000000000..fb90aa29c7a3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/atanhf_3u1.c @@ -0,0 +1,88 @@ +/* + * Single-precision atanh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define Four 0x40800000 +#define Ln2 0x1.62e43p-1f +#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. 
*/ + +#define C(i) __log1pf_data.coeffs[i] + +static inline float +eval_poly (float m) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */ + float p_12 = fmaf (m, C (1), C (0)); + float p_34 = fmaf (m, C (3), C (2)); + float p_56 = fmaf (m, C (5), C (4)); + float p_78 = fmaf (m, C (7), C (6)); + + float m2 = m * m; + float p_02 = fmaf (m2, p_12, m); + float p_36 = fmaf (m2, p_56, p_34); + float p_79 = fmaf (m2, C (8), p_78); + + float m4 = m2 * m2; + float p_06 = fmaf (m4, p_36, p_02); + + return fmaf (m4 * p_79, m4, p_06); +} + +static inline float +log1pf_inline (float x) +{ + /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no + special-case handling. See that file for details of the algorithm. */ + float m = x + 1.0f; + int k = (asuint (m) - 0x3f400000) & 0xff800000; + float s = asfloat (Four - k); + float m_scale = asfloat (asuint (x) - k) + fmaf (0.25f, s, -1.0f); + float p = eval_poly (m_scale); + float scale_back = (float) k * 0x1.0p-23f; + return fmaf (scale_back, Ln2, p); +} + +/* Approximation for single-precision inverse tanh(x), using a simplified + version of log1p. Maximum error is 3.08 ULP: + atanhf(0x1.ff0d5p-5) got 0x1.ffb768p-5 + want 0x1.ffb76ep-5. 
*/ +float +atanhf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & ~AbsMask; + + if (unlikely (iax < TinyBound)) + return x; + + if (iax == One) + return __math_divzero (sign); + + if (unlikely (iax > One)) + return __math_invalidf (x); + + float halfsign = asfloat (Half | sign); + float ax = asfloat (iax); + return halfsign * log1pf_inline ((2 * ax) / (1 - ax)); +} + +PL_SIG (S, F, 1, atanh, -1.0, 1.0) +PL_TEST_ULP (atanhf, 2.59) +PL_TEST_INTERVAL (atanhf, 0, 0x1p-12, 500) +PL_TEST_INTERVAL (atanhf, 0x1p-12, 1, 200000) +PL_TEST_INTERVAL (atanhf, 1, inf, 1000) +PL_TEST_INTERVAL (atanhf, -0, -0x1p-12, 500) +PL_TEST_INTERVAL (atanhf, -0x1p-12, -1, 200000) +PL_TEST_INTERVAL (atanhf, -1, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/cbrt_2u.c new file mode 100644 index 000000000000..83715dd18a3e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cbrt_2u.c @@ -0,0 +1,70 @@ +/* + * Double-precision cbrt(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +PL_SIG (S, D, 1, cbrt, -10.0, 10.0) + +#define AbsMask 0x7fffffffffffffff +#define TwoThirds 0x1.5555555555555p-1 + +#define C(i) __cbrt_data.poly[i] +#define T(i) __cbrt_data.table[i] + +/* Approximation for double-precision cbrt(x), using low-order polynomial and + two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value + m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an + integer. + cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 + want 0x1.965fe72821e99p+0. 
*/ +double +cbrt (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iax = ix & AbsMask; + uint64_t sign = ix & ~AbsMask; + + if (unlikely (iax == 0 || iax == 0x7f80000000000000)) + return x; + + /* |x| = m * 2^e, where m is in [0.5, 1.0]. + We can easily decompose x into m and e using frexp. */ + int e; + double m = frexp (asdouble (iax), &e); + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for + Newton iterations. */ + double p_01 = fma (C (1), m, C (0)); + double p_23 = fma (C (3), m, C (2)); + double p = fma (p_23, m * m, p_01); + + /* Two iterations of Newton's method for iteratively approximating cbrt. */ + double m_by_3 = m / 3; + double a = fma (TwoThirds, p, m_by_3 / (p * p)); + a = fma (TwoThirds, a, m_by_3 / (a * a)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)). + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3. + i is an integer in [-2, 2], so t can be looked up in the table T. + Hence the result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. + Which can be done easily using ldexp. */ + return asdouble (asuint64 (ldexp (a * T (2 + e % 3), e / 3)) | sign); +} + +PL_TEST_ULP (cbrt, 1.30) +PL_TEST_INTERVAL (cbrt, 0, inf, 1000000) +PL_TEST_INTERVAL (cbrt, -0, -inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/cbrt_data.c b/contrib/arm-optimized-routines/pl/math/cbrt_data.c new file mode 100644 index 000000000000..3d484c2779e2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cbrt_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients and table entries for double-precision cbrt(x). + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct cbrt_data __cbrt_data + = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1]. 
+ See cbrt.sollya for details of generation. */ + 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1, 0x1.2c74eaa3ba428p-3}, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0, 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0}}; diff --git a/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c new file mode 100644 index 000000000000..adc591786a6a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cbrtf_1u5.c @@ -0,0 +1,67 @@ +/* + * Single-precision cbrt(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "estrinf.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffff +#define SignMask 0x80000000 +#define TwoThirds 0x1.555556p-1f + +#define C(i) __cbrtf_data.poly[i] +#define T(i) __cbrtf_data.table[i] + +/* Approximation for single-precision cbrt(x), using low-order polynomial and + one Newton iteration on a reduced interval. Greatest error is 1.5 ULP. This + is observed for every value where the mantissa is 0x1.81410e and the exponent + is a multiple of 3, for example: + cbrtf(0x1.81410ep+30) got 0x1.255d96p+10 + want 0x1.255d92p+10. */ +float +cbrtf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & SignMask; + + if (unlikely (iax == 0 || iax == 0x7f800000)) + return x; + + /* |x| = m * 2^e, where m is in [0.5, 1.0]. + We can easily decompose x into m and e using frexpf. */ + int e; + float m = frexpf (asfloat (iax), &e); + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + float p = ESTRIN_3 (m, m * m, C); + /* One iteration of Newton's method for iteratively approximating cbrt. 
*/ + float m_by_3 = m / 3; + float a = fmaf (TwoThirds, p, m_by_3 / (p * p)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + Let t = (2 ^ (e / 3)) / (2 ^ round(e / 3)). + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3. + i is an integer in [-2, 2], so t can be looked up in the table T. + Hence the result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. + Which can be done easily using ldexpf. */ + return asfloat (asuint (ldexpf (a * T (2 + e % 3), e / 3)) | sign); +} + +PL_SIG (S, F, 1, cbrt, -10.0, 10.0) +PL_TEST_ULP (cbrtf, 1.03) +PL_TEST_INTERVAL (cbrtf, 0, inf, 1000000) +PL_TEST_INTERVAL (cbrtf, -0, -inf, 1000000) diff --git a/contrib/arm-optimized-routines/pl/math/cbrtf_data.c b/contrib/arm-optimized-routines/pl/math/cbrtf_data.c new file mode 100644 index 000000000000..c6cdb4de0d65 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cbrtf_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients and table entries for single-precision cbrt(x). + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct cbrtf_data __cbrtf_data + = {.poly = { /* Coefficients for very rough approximation of cbrt(x) in [0.5, 1]. + See cbrtf.sollya for details of generation. */ + 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1, 0x1.2c74c2p-3}, + .table = { /* table[i] = 2^((i - 2) / 3). */ + 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0}}; diff --git a/contrib/arm-optimized-routines/pl/math/cosh_2u.c b/contrib/arm-optimized-routines/pl/math/cosh_2u.c new file mode 100644 index 000000000000..5d1df0717453 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/cosh_2u.c @@ -0,0 +1,66 @@ +/* + * Double-precision cosh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define SpecialBound \ + 0x40861da04cbafe44 /* 0x1.61da04cbafe44p+9, above which exp overflows. */ + +double +__exp_dd (double, double); + +static double +specialcase (double x, uint64_t iax) +{ + if (iax == 0x7ff0000000000000) + return INFINITY; + if (iax > 0x7ff0000000000000) + return __math_invalid (x); + /* exp overflows above SpecialBound. At this magnitude cosh(x) is dominated by + exp(x), so we can approximate cosh(x) by (exp(|x|/2)) ^ 2 / 2. */ + double t = __exp_dd (asdouble (iax) / 2, 0); + return (0.5 * t) * t; +} + +/* Approximation for double-precision cosh(x). + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the special region, 1.93 ULP: + cosh(0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 + want 0x1.fdf28623ef923p+1021. + + The greatest observed error in the non-special region is 1.03 ULP: + cosh(0x1.502cd8e56ab3bp+0) got 0x1.fe54962842d0ep+0 + want 0x1.fe54962842d0fp+0. */ +double +cosh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iax = ix & AbsMask; + + /* exp overflows a little bit before cosh, so use special-case handler for the + gap, as well as special values. */ + if (unlikely (iax >= SpecialBound)) + return specialcase (x, iax); + + double ax = asdouble (iax); + /* Use double-precision exp helper to calculate exp(x), then: + cosh(x) = exp(|x|) / 2 + 1 / (exp(|x| * 2). 
*/ + double t = __exp_dd (ax, 0); + return 0.5 * t + 0.5 / t; +} + +PL_SIG (S, D, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (cosh, 1.43) +PL_TEST_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000) +PL_TEST_INTERVAL (cosh, -0, -0x1.61da04cbafe44p+9, 100000) +PL_TEST_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000) +PL_TEST_INTERVAL (cosh, -0x1.61da04cbafe44p+9, -0x1p10, 1000) +PL_TEST_INTERVAL (cosh, 0x1p10, inf, 100) +PL_TEST_INTERVAL (cosh, -0x1p10, -inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/coshf_1u9.c b/contrib/arm-optimized-routines/pl/math/coshf_1u9.c new file mode 100644 index 000000000000..c125c929aa77 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/coshf_1u9.c @@ -0,0 +1,71 @@ +/* + * Single-precision cosh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffff +#define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */ +#define SpecialBound \ + 0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use \ + special case. */ + +float +optr_aor_exp_f32 (float); + +static NOINLINE float +specialcase (float x, uint32_t iax) +{ + if (iax == 0x7f800000) + return INFINITY; + if (iax > 0x7f800000) + return __math_invalidf (x); + if (iax <= TinyBound) + /* For tiny x, avoid underflow by just returning 1. */ + return 1; + /* Otherwise SpecialBound <= |x| < Inf. x is too large to calculate exp(x) + without overflow, so use exp(|x|/2) instead. For large x cosh(x) is + dominated by exp(x), so return: + cosh(x) ~= (exp(|x|/2))^2 / 2. */ + float t = optr_aor_exp_f32 (asfloat (iax) / 2); + return (0.5 * t) * t; +} + +/* Approximation for single-precision cosh(x) using exp. + cosh(x) = (exp(x) + exp(-x)) / 2. + The maximum error is 1.89 ULP, observed for |x| > SpecialBound: + coshf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127. 
+ The maximum error observed for TinyBound < |x| < SpecialBound is 1.02 ULP: + coshf(0x1.50a3cp+0) got 0x1.ff21dcp+0 want 0x1.ff21dap+0. */ +float +coshf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + float ax = asfloat (iax); + + if (unlikely (iax <= TinyBound || iax >= SpecialBound)) + { + /* x is tiny, large or special. */ + return specialcase (x, iax); + } + + /* Compute cosh using the definition: + coshf(x) = exp(x) / 2 + exp(-x) / 2. */ + float t = optr_aor_exp_f32 (ax); + return 0.5f * t + 0.5f / t; +} + +PL_SIG (S, F, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (coshf, 1.89) +PL_TEST_INTERVAL (coshf, 0, 0x1p-63, 100) +PL_TEST_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000) +PL_TEST_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000) +PL_TEST_INTERVAL (coshf, -0, -0x1p-63, 100) +PL_TEST_INTERVAL (coshf, -0, -0x1.5a92d8p+6, 80000) +PL_TEST_INTERVAL (coshf, -0x1.5a92d8p+6, -inf, 2000) diff --git a/contrib/arm-optimized-routines/pl/math/erfc_4u5.c b/contrib/arm-optimized-routines/pl/math/erfc_4u5.c new file mode 100644 index 000000000000..e9af9d3bcdb4 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfc_4u5.c @@ -0,0 +1,155 @@ +/* + * Double-precision erfc(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pairwise_horner.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask (0x7fffffffffffffff) + +#define xint __erfc_data.interval_bounds +#define PX __erfc_data.poly + +/* Accurate exponential from optimized routines. */ +double +__exp_dd (double x, double xtail); + +static inline double +eval_poly_horner (double z, int i) +{ + double z2 = z * z; +#define C(j) PX[i][j] + return PAIRWISE_HORNER_12 (z, z2, C); +} + +/* Accurate evaluation of exp(x^2) + using compensated product (x^2 ~ x*x + e2) + and the __exp_dd(y,d) routine, that is the + computation of exp(y+d) with a small correction d<<y. 
*/ +static inline double +eval_accurate_gaussian (double a) +{ + double e2; + double a2 = a * a; + double aa1 = -fma (0x1.0000002p27, a, -a); + aa1 = fma (0x1.0000002p27, a, aa1); + double aa2 = a - aa1; + e2 = fma (-aa1, aa1, a2); + e2 = fma (-aa1, aa2, e2); + e2 = fma (-aa2, aa1, e2); + e2 = fma (-aa2, aa2, e2); + return __exp_dd (-a2, e2); +} + +/* Approximation of erfc for |x| > 6.0. */ +static inline double +approx_erfc_hi (double x, int i) +{ + double a = fabs (x); + double z = a - xint[i]; + double p = eval_poly_horner (z, i); + double e_mx2 = eval_accurate_gaussian (a); + return p * e_mx2; +} + +static inline int +get_itv_idx (double x) +{ + /* Interval bounds are a logarithmic scale, i.e. interval n has + lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain + the interval index. */ + double a = asdouble (asuint64 (x) & AbsMask); + double z = a + 1.0; + z = z * z; + z = z * z; + return (asuint64 (z) >> 52) - 1023; +} + +/* Approximation of erfc for |x| < 6.0. */ +static inline double +approx_erfc_lo (double x, uint32_t sign, int i) +{ + double a = fabs (x); + double z = a - xint[i]; + double p = eval_poly_horner (z, i); + double e_mx2 = eval_accurate_gaussian (a); + if (sign) + return fma (-p, e_mx2, 2.0); + else + return p * e_mx2; +} + +/* Top 12 bits of a double (sign and exponent bits). */ +static inline uint32_t +abstop12 (double x) +{ + return (asuint64 (x) >> 52) & 0x7ff; +} + +/* Top 32 bits of a double. */ +static inline uint32_t +top32 (double x) +{ + return asuint64 (x) >> 32; +} + +/* Fast erfc implementation. + The approximation uses polynomial approximation of + exp(x^2) * erfc(x) with fixed orders on 20 intervals. + Maximum measured error is 4.05 ULPs:. + erfc(0x1.e8ebf6a2b0801p-2) got 0x1.ff84036f8f0b3p-2 + want 0x1.ff84036f8f0b7p-2. */ +double +erfc (double x) +{ + /* Get top words. */ + uint32_t ix = top32 (x); /* We need to compare at most 32 bits. 
*/ + uint32_t ia = ix & 0x7fffffff; + uint32_t sign = ix >> 31; + + /* Handle special cases and small values with a single comparison: + abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small) + Special cases erfc(nan)=nan, erfc(+inf)=0 and erfc(-inf)=2 + Errno EDOM does not have to be set in case of erfc(nan). + Only ERANGE may be set in case of underflow. + Small values (|x|<small) + |x|<0x1.0p-56 => accurate up to 0.5 ULP (top12(0x1p-50) = 0x3c7) + |x|<0x1.0p-50 => accurate up to 1.0 ULP (top12(0x1p-50) = 0x3cd). */ + if (unlikely (abstop12 (x) - 0x3cd >= (abstop12 (INFINITY) & 0x7ff) - 0x3cd)) + { + if (abstop12 (x) >= 0x7ff) + return (double) (sign << 1) + 1.0 / x; /* special cases. */ + else + return 1.0 - x; /* small case. */ + } + else if (ia < 0x40180000) + { /* |x| < 6.0. */ + return approx_erfc_lo (x, sign, get_itv_idx (x)); + } + else if (sign) + { /* x <= -6.0. */ + return 2.0; + } + else if (ia < 0x403c0000) + { /* 6.0 <= x < 28. */ + return approx_erfc_hi (x, get_itv_idx (x)); + } + else + { /* x > 28. */ + return __math_uflow (0); + } +} + +PL_SIG (S, D, 1, erfc, -6.0, 28.0) +PL_TEST_ULP (erfc, 3.56) +PL_TEST_INTERVAL (erfc, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (erfc, 0x1p-1022, 0x1p-26, 40000) +PL_TEST_INTERVAL (erfc, -0x1p-1022, -0x1p-26, 40000) +PL_TEST_INTERVAL (erfc, 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (erfc, -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (erfc, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erfc_data.c b/contrib/arm-optimized-routines/pl/math/erfc_data.c new file mode 100644 index 000000000000..fa7184fcc871 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfc_data.c @@ -0,0 +1,145 @@ +/* + * Data used in double-precision erfc(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double + precision. 
Generated using the Remez algorithm on each interval separately + (see erfc.sollya for more detail). */ +const struct erfc_data __erfc_data = { + +/* Bounds for 20 intervals spanning [0x1.0p-50., 31.]. Interval bounds are a + logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the + exception of the first interval. */ +.interval_bounds = { + 0x1.0p-50, /* Tiny boundary. */ + 0x1.837f05c490126p-3, /* 0.189. */ + 0x1.a827997709f7ap-2, /* 0.414. */ + 0x1.5d13f326fe9c8p-1, /* 0.682. */ + 0x1.0p0, /* 1.000. */ + 0x1.60dfc14636e2ap0, /* 1.378. */ + 0x1.d413cccfe779ap0, /* 1.828. */ + 0x1.2e89f995ad3adp1, /* 2.364. */ + 0x1.8p1, /* 3.000. */ + 0x1.e0dfc14636e2ap1, /* 3.757. */ + 0x1.2a09e667f3bcdp2, /* 4.657. */ + 0x1.6e89f995ad3adp2, /* 5.727. */ + 0x1.cp2, /* 7.000. */ + 0x1.106fe0a31b715p3, /* 8.514. */ + 0x1.4a09e667f3bcdp3, /* 10.31. */ + 0x1.8e89f995ad3adp3, /* 12.45. */ + 0x1.ep3, /* 15.00. */ + 0x1.206fe0a31b715p4, /* 18.03. */ + 0x1.5a09e667f3bcdp4, /* 21.63. */ + 0x1.9e89f995ad3adp4, /* 25.91. */ + 0x1.fp4 /* 31.00. */ +}, + +/* Coefficients for each order 12 polynomial on each of the 20 intervals. 
*/ +.poly = { + {0x1.ffffffffffff6p-1, -0x1.20dd750429b66p0, 0x1.fffffffffffdcp-1, + -0x1.812746b03713ap-1, 0x1.ffffffffbe94cp-2, -0x1.341f6bb6ec9a6p-2, + 0x1.555553a70ec2ep-3, -0x1.6023b4617a388p-4, 0x1.5550f0e40bfbap-5, + -0x1.38c290c0c8de8p-6, 0x1.0e84002c6274ep-7, -0x1.a599eb0ac5d04p-9, + 0x1.c9bfafa73899cp-11}, + {0x1.a2b43dbd503c8p-1, -0x1.a3495b7c9e6a4p-1, 0x1.535f3fb8cb92ap-1, + -0x1.d96ee9c714f44p-2, 0x1.26956676d2c64p-2, -0x1.4e2820da90c08p-3, + 0x1.5ea0cffac775ap-4, -0x1.57fb82ca373e8p-5, 0x1.3e0e8f48ba0f8p-6, + -0x1.16a695af1bbd4p-7, 0x1.cc836241a87d4p-9, -0x1.531de41264fdap-10, + 0x1.526a8a14e9bfcp-12}, + {0x1.532e75821ed48p-1, -0x1.28be350460782p-1, 0x1.b08873adbf108p-2, + -0x1.14377569249e2p-2, 0x1.3e1ece8cd10dap-3, -0x1.5087e2e6dc2e8p-4, + 0x1.4b3adb3bb335ap-5, -0x1.32342d711a4f4p-6, 0x1.0bc4f6ce2b656p-7, + -0x1.bcdaa331f2144p-9, 0x1.5c21c9e0ca954p-10, -0x1.dfdc9b3b5c402p-12, + 0x1.b451af7dd52fep-14}, + {0x1.10f9745a4f44ap-1, -0x1.9b03213e6963ap-2, 0x1.09b942bc8de66p-2, + -0x1.32755394481e4p-3, 0x1.42819b18af0e4p-4, -0x1.3a6d643aaa572p-5, + 0x1.1f17897603eaep-6, -0x1.eefb8d3f89d42p-8, 0x1.95559544f2fbp-9, + -0x1.3c2a67c33338p-10, 0x1.cffa784efe6cp-12, -0x1.282646774689cp-13, + 0x1.e654e67532b44p-16}, + {0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c04dp-2, 0x1.3c27283c328dbp-3, + -0x1.44837f88ea4bdp-4, 0x1.33cad0e887482p-5, -0x1.10fcf0bc8963cp-6, + 0x1.c8cb68153ec42p-8, -0x1.6aef9a9842c54p-9, 0x1.1334345d6467cp-10, + -0x1.8ebe8763a2a8cp-12, 0x1.0f457219dec0dp-13, -0x1.3d2501dcd2a0fp-15, + 0x1.d213a128a75c9p-18}, + {0x1.5ee444130b7dbp-2, -0x1.78396ab208478p-3, 0x1.6e617ec5c0cc3p-4, + -0x1.49e60f63656b5p-5, 0x1.16064fddbbcb9p-6, -0x1.ba80af6a31018p-8, + 0x1.4ec374269d4ecp-9, -0x1.e40be960703a4p-11, 0x1.4fb029f35a144p-12, + -0x1.be45fd71a60eap-14, 0x1.161235cd2a3e7p-15, -0x1.264890eb1b5ebp-17, + 0x1.7f90154bde15dp-20}, + {0x1.19a22c064d4eap-2, -0x1.f645498cae217p-4, 0x1.a0565950e3f08p-5, + -0x1.446605c21c178p-6, 0x1.df1231d75622fp-8, -0x1.515167553de25p-9, + 
0x1.c72c1b4a2a57fp-11, -0x1.276ae9394ecf1p-12, 0x1.71d2696d6c8c3p-14, + -0x1.bd4152984ce1dp-16, 0x1.f5afd2b450df7p-18, -0x1.dafdaddc7f943p-20, + 0x1.1020f4741f79ep-22}, + {0x1.c57f0542a7637p-3, -0x1.4e5535c17afc8p-4, 0x1.d312725242824p-6, + -0x1.3727cbc12a4bbp-7, 0x1.8d6730fc45b6bp-9, -0x1.e8855055c9b53p-11, + 0x1.21f73b70cc792p-12, -0x1.4d4fe06f13831p-14, 0x1.73867a82f7484p-16, + -0x1.8fab204d1d75ep-18, 0x1.91d9ba10367f4p-20, -0x1.5077ce4b334ddp-22, + 0x1.501716d098f14p-25}, + {0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b135p-5, 0x1.043fe1a989f11p-6, + -0x1.259061b98cf96p-8, 0x1.409cc2b1c4fc2p-10, -0x1.53dec152f6abfp-12, + 0x1.5e72cb4cc919fp-14, -0x1.6018b68100642p-16, 0x1.58d859380fb24p-18, + -0x1.471723286dad5p-20, 0x1.21c1a0f7a6593p-22, -0x1.a872678d91154p-25, + 0x1.6eb74e2e99662p-28}, + {0x1.29a8a4e95063ep-3, -0x1.29a8a316d3318p-5, 0x1.21876b3fe4f84p-7, + -0x1.1276f2d8ee36cp-9, 0x1.fbff52181a454p-12, -0x1.cb9ce9bde195ep-14, + 0x1.9710786fa90c5p-16, -0x1.6145ad5b471dcp-18, 0x1.2c52fac57009cp-20, + -0x1.f02a8711f07cfp-23, 0x1.7eb574960398cp-25, -0x1.e58ce325343aap-28, + 0x1.68510d1c32842p-31}, + {0x1.e583024e2bc8p-4, -0x1.8fb458acb5b0fp-6, 0x1.42b9dffac2531p-8, + -0x1.ff9fe9a553dddp-11, 0x1.8e7e86883ba0bp-13, -0x1.313af0bb12375p-15, + 0x1.cc29ccb17372ep-18, -0x1.55895fbb1ae42p-20, 0x1.f2bd2d6c7fd07p-23, + -0x1.62ec031844613p-25, 0x1.d7d69ce7c1847p-28, -0x1.0106b95e4db03p-30, + 0x1.45aabbe505f6ap-34}, + {0x1.8d9cbafa30408p-4, -0x1.0dd14614ed20fp-6, 0x1.6943976ea9dcap-9, + -0x1.dd6f05f4d7ce8p-12, 0x1.37891334aa621p-14, -0x1.91a8207766e1ep-17, + 0x1.ffcb0c613d75cp-20, -0x1.425116a6c88dfp-22, 0x1.90cb7c902d428p-25, + -0x1.e70fc740c3b6dp-28, 0x1.14a09ae5851ep-30, -0x1.00f9e03eae993p-33, + 0x1.14989aac741c2p-37}, + {0x1.46dc6bf900f68p-4, -0x1.6e4b45246f8dp-7, 0x1.96a3de47cfdb5p-10, + -0x1.bf5070eb6823bp-13, 0x1.e7af6e4aa8ef8p-16, -0x1.078bf26142831p-18, + 0x1.1a6e547aa40bep-21, -0x1.2c1c68f62f614p-24, 0x1.3bb8b473dd9e7p-27, + -0x1.45576cacb45a1p-30, 0x1.39ab71899b44ep-33, 
-0x1.ee307d46e2866p-37, + 0x1.c21ba1b404f5ap-41}, + {0x1.0d9a17e032288p-4, -0x1.f3e942ff4e097p-8, 0x1.cc77f09db5af8p-11, + -0x1.a56e8bffaab5cp-14, 0x1.7f49e36974e03p-17, -0x1.5a73fc0025d2fp-20, + 0x1.3742ae06a8be6p-23, -0x1.15ecf5317789bp-26, 0x1.ec74dd2b109fp-30, + -0x1.ac28325f88dc1p-33, 0x1.5ca9e8d7841b2p-36, -0x1.cfef04667185fp-40, + 0x1.6487c50052867p-44}, + {0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cb33p-8, 0x1.0645980ec8568p-11, + -0x1.8f86f88695a8cp-15, 0x1.2ef80cb1dca7cp-18, -0x1.c97ff7c599a6dp-22, + 0x1.57f0ac907d436p-25, -0x1.016be8d812c69p-28, 0x1.7ef6d33c73b75p-32, + -0x1.17f9784eda0d4p-35, 0x1.7fd8662b486f1p-39, -0x1.ae21758156d89p-43, + 0x1.165732f1ae138p-47}, + {0x1.71eafbd9f5877p-5, -0x1.d83714d904525p-9, 0x1.2c74dbaccea28p-12, + -0x1.7d27f3cdea565p-16, 0x1.e20b13581fcf8p-20, -0x1.2fe336f089679p-23, + 0x1.7dfce36129db3p-27, -0x1.dea026ee03f14p-31, 0x1.2a6019f7c64b1p-34, + -0x1.6e0eeb9f98eeap-38, 0x1.a58b4ed07d741p-42, -0x1.8d12c77071e4cp-46, + 0x1.b0241c6d5b761p-51}, + {0x1.33714a024097ep-5, -0x1.467f441a50cbdp-9, 0x1.59fa2994d0e65p-13, + -0x1.6dd369d9306cap-17, 0x1.81fb2b2af9413p-21, -0x1.96604d3c1bb6ep-25, + 0x1.aaef2da14243p-29, -0x1.bf7f1b935d3ebp-33, 0x1.d3261ebcd2061p-37, + -0x1.e04c803bbd875p-41, 0x1.cff98a43bacdep-45, -0x1.6ef39a63cf675p-49, + 0x1.4f8abb4398a0dp-54}, + {0x1.fff97acd75487p-6, -0x1.c502e8e46ec0cp-10, 0x1.903b0650672eap-14, + -0x1.6110aa5fb096fp-18, 0x1.36fd4c3e4040cp-22, -0x1.118489fe28728p-26, + 0x1.e06601208ac47p-31, -0x1.a52b90c21650ap-35, 0x1.6ffc42c05429bp-39, + -0x1.3ce3322a6972ep-43, 0x1.009d8ef37ff8cp-47, -0x1.5498d2cc51c99p-52, + 0x1.058cd4ea9bf04p-57}, + {0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf97dp-10, 0x1.d0ddfb8593f4p-15, + -0x1.5673f4aa86542p-19, 0x1.f8048954325f6p-24, -0x1.72839959ab3e9p-28, + 0x1.101597113be2ap-32, -0x1.8f1cf0ff4adeep-37, 0x1.23dca407fd66p-41, + -0x1.a4f387e57a6a5p-46, 0x1.1dafd753f65e9p-50, -0x1.3e15343c973d6p-55, + 0x1.9a2af47d77e44p-61}, + {0x1.64839d636f92bp-6, -0x1.b7adf7536232dp-11, 
0x1.0eec0b6357148p-15, + -0x1.4da09b7f2c52bp-20, 0x1.9a8b146de838ep-25, -0x1.f8d1f145e7b6fp-30, + 0x1.3624435b3ba11p-34, -0x1.7cba19b4af977p-39, 0x1.d2282481ba91ep-44, + -0x1.198c1e91f9564p-48, 0x1.4046224f8ccp-53, -0x1.2b1dc676c096fp-58, + 0x1.43d3358c64dafp-64} +} +}; diff --git a/contrib/arm-optimized-routines/pl/math/erfcf.h b/contrib/arm-optimized-routines/pl/math/erfcf.h new file mode 100644 index 000000000000..8f1e5f4226e3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfcf.h @@ -0,0 +1,38 @@ +/* + * Shared functions for scalar and vector single-precision erfc(x) functions. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_ERFCF_H +#define PL_MATH_ERFCF_H + +#include "math_config.h" + +#define FMA fma +#include "estrin_wrap.h" + +/* Accurate exponential from optimized-routines. */ +double +__exp_dd (double x, double xtail); + +static inline double +eval_poly (double z, const double *coeff) +{ + double z2 = z * z; + double z4 = z2 * z2; + double z8 = z4 * z4; +#define C(i) coeff[i] + return ESTRIN_15 (z, z2, z4, z8, C); +#undef C +} + +static inline double +eval_exp_mx2 (double x) +{ + return __exp_dd (-(x * x), 0.0); +} + +#undef FMA +#endif // PL_MATH_ERFCF_H diff --git a/contrib/arm-optimized-routines/pl/math/erfcf_2u.c b/contrib/arm-optimized-routines/pl/math/erfcf_2u.c new file mode 100644 index 000000000000..5a3f9b00aa5c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfcf_2u.c @@ -0,0 +1,133 @@ +/* + * Single-precision erfc(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "erfcf.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define P(i) __erfcf_poly_data.poly[i] + +/* Approximation of erfcf for |x| > 4.0. 
*/ +static inline float +approx_erfcf_hi (float x, uint32_t sign, const double *coeff) +{ + if (sign) + { + return 2.0f; + } + + /* Polynomial contribution. */ + double z = (double) fabs (x); + float p = (float) eval_poly (z, coeff); + /* Gaussian contribution. */ + float e_mx2 = (float) eval_exp_mx2 (z); + + return p * e_mx2; +} + +/* Approximation of erfcf for |x| < 4.0. */ +static inline float +approx_erfcf_lo (float x, uint32_t sign, const double *coeff) +{ + /* Polynomial contribution. */ + double z = (double) fabs (x); + float p = (float) eval_poly (z, coeff); + /* Gaussian contribution. */ + float e_mx2 = (float) eval_exp_mx2 (z); + + if (sign) + return fmaf (-p, e_mx2, 2.0f); + else + return p * e_mx2; +} + +/* Top 12 bits of a float (sign and exponent bits). */ +static inline uint32_t +abstop12 (float x) +{ + return (asuint (x) >> 20) & 0x7ff; +} + +/* Top 12 bits of a float. */ +static inline uint32_t +top12 (float x) +{ + return asuint (x) >> 20; +} + +/* Fast erfcf approximation using polynomial approximation + multiplied by gaussian. + Most of the computation is carried out in double precision, + and is very sensitive to accuracy of polynomial and exp + evaluation. + Worst-case error is 1.968ulps, obtained for x = 2.0412941. + erfcf(0x1.05492p+1) got 0x1.fe10f6p-9 want 0x1.fe10f2p-9 ulp + err 1.46788. */ +float +erfcf (float x) +{ + /* Get top words and sign. */ + uint32_t ix = asuint (x); /* We need to compare at most 32 bits. */ + uint32_t sign = ix >> 31; + uint32_t ia12 = top12 (x) & 0x7ff; + + /* Handle special cases and small values with a single comparison: + abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small) + + Special cases + erfcf(nan)=nan, erfcf(+inf)=0 and erfcf(-inf)=2 + + Errno + EDOM does not have to be set in case of erfcf(nan). + Only ERANGE may be set in case of underflow. + + Small values (|x|<small) + |x|<0x1.0p-26 => accurate to 0.5 ULP (top12(0x1p-26) = 0x328). 
*/ + if (unlikely (abstop12 (x) - 0x328 >= (abstop12 (INFINITY) & 0x7f8) - 0x328)) + { + if (abstop12 (x) >= 0x7f8) + return (float) (sign << 1) + 1.0f / x; /* Special cases. */ + else + return 1.0f - x; /* Small case. */ + } + + /* Normalized numbers divided in 4 intervals + with bounds: 2.0, 4.0, 8.0 and 10.0. 10 was chosen as the upper bound for + the interesting region as it is the smallest value, representable as a + 12-bit integer, for which returning 0 gives <1.5 ULP. */ + if (ia12 < 0x400) + { + return approx_erfcf_lo (x, sign, P (0)); + } + if (ia12 < 0x408) + { + return approx_erfcf_lo (x, sign, P (1)); + } + if (ia12 < 0x410) + { + return approx_erfcf_hi (x, sign, P (2)); + } + if (ia12 < 0x412) + { + return approx_erfcf_hi (x, sign, P (3)); + } + if (sign) + { + return 2.0f; + } + return __math_uflowf (0); +} + +PL_SIG (S, F, 1, erfc, -4.0, 10.0) +PL_TEST_ULP (erfcf, 1.5) +PL_TEST_INTERVAL (erfcf, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (erfcf, 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (erfcf, -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (erfcf, 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (erfcf, -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (erfcf, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erfcf_data.c b/contrib/arm-optimized-routines/pl/math/erfcf_data.c new file mode 100644 index 000000000000..2e018c8c6710 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erfcf_data.c @@ -0,0 +1,57 @@ +/* + * Data used in single-precision erfc(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double + precision. Generated using the Remez algorithm on each interval separately + (see erfcf.sollya for more detail). 
*/ +const struct erfcf_poly_data __erfcf_poly_data + = {.poly + = {{ +#if ERFCF_POLY_NCOEFFS == 16 + 0x1.ffffffffe7c59p-1, -0x1.20dd74f8cecc5p0, 0x1.fffffc67a0fbdp-1, + -0x1.81270c3ced2d6p-1, 0x1.fffc0c6606e45p-2, -0x1.340a779e8a8e3p-2, + 0x1.54c1663fc5a01p-3, -0x1.5d468c9269dafp-4, 0x1.4afe6b00df9d5p-5, + -0x1.1d22d2720cb91p-6, 0x1.afa399a5761b1p-8, -0x1.113851b5858adp-9, + 0x1.0f992e4d5c6a4p-11, -0x1.86534d558052ap-14, 0x1.63e537bfb7cd5p-17, + -0x1.32712a6275c4dp-21 +#endif + }, + + { +#if ERFCF_POLY_NCOEFFS == 16 + 0x1.fea5663f75cd1p-1, -0x1.1cb5a82adf1c4p0, 0x1.e7c8da942d86fp-1, + -0x1.547ba0456bac7p-1, 0x1.8a6fc0f4421a4p-2, -0x1.7c14f9301ee58p-3, + 0x1.2f67c8351577p-4, -0x1.8e733f6d159d9p-6, 0x1.aa6a0ec249067p-8, + -0x1.6f4ec45b11f3fp-10, 0x1.f4c00c4b33ba8p-13, -0x1.0795faf7846d2p-15, + 0x1.9cef9031810ddp-19, -0x1.c4d60c3fecdb6p-23, 0x1.360547ec2229dp-27, + -0x1.8ec1581647f9fp-33 +#endif + }, + + { +#if ERFCF_POLY_NCOEFFS == 16 + 0x1.dae421147c591p-1, -0x1.c211957a0abfcp-1, 0x1.28a8d87aa1b12p-1, + -0x1.224d2a58cbef4p-2, 0x1.b3d45dcaef898p-4, -0x1.ff99d8b33e7a9p-6, + 0x1.dac66375b99f6p-8, -0x1.5e1786f0f91ap-10, 0x1.9a2588deaec4fp-13, + -0x1.7b886b183b235p-16, 0x1.1209e7da8ff82p-19, -0x1.2e5c870c6ed8p-23, + 0x1.ec6a89422928ep-28, -0x1.16e7d837b61bcp-32, 0x1.88868a73e4b43p-38, + -0x1.027034672f11cp-44 +#endif + }, + + { +#if ERFCF_POLY_NCOEFFS == 16 + 0x1.8ae320c1bad5ap-1, -0x1.1cdd6aa6929aap-1, 0x1.0e39a7b285f58p-2, + -0x1.6fb12a95e351dp-4, 0x1.77dd0649e352cp-6, -0x1.28a9e9560c461p-8, + 0x1.6f7d7778e9433p-11, -0x1.68363698afe4ap-14, 0x1.17e94cdf35d82p-17, + -0x1.5766a817bd3ffp-21, 0x1.48d892094a2c1p-25, -0x1.e1b6511ab6d0bp-30, + 0x1.04c7b8143f6a4p-34, -0x1.898831961065bp-40, 0x1.71ae8a56142a6p-46, + -0x1.45abac612344bp-53 +#endif + }}}; diff --git a/contrib/arm-optimized-routines/pl/math/erff_1u5.c b/contrib/arm-optimized-routines/pl/math/erff_1u5.c new file mode 100644 index 000000000000..1a69872c43e5 --- /dev/null +++ 
b/contrib/arm-optimized-routines/pl/math/erff_1u5.c @@ -0,0 +1,108 @@ +/* + * Single-precision erf(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "estrinf.h" +#include "hornerf.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f +#define A __erff_data.erff_poly_A +#define B __erff_data.erff_poly_B + +/* Top 12 bits of a float. */ +static inline uint32_t +top12 (float x) +{ + return asuint (x) >> 20; +} + +/* Efficient implementation of erff using either a pure polynomial approximation + or the exponential of a polynomial. Worst-case error is 1.09ulps at + 0x1.c111acp-1. */ +float +erff (float x) +{ + float r, x2; + + /* Get top word. */ + uint32_t ix = asuint (x); + uint32_t sign = ix >> 31; + uint32_t ia12 = top12 (x) & 0x7ff; + + /* Limit of both intervals is 0.875 for performance reasons but coefficients + computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy + from 0.94 to 1.1ulps. */ + if (ia12 < 0x3f6) + { /* a = |x| < 0.875. */ + + /* Tiny and subnormal cases. */ + if (unlikely (ia12 < 0x318)) + { /* |x| < 2^(-28). */ + if (unlikely (ia12 < 0x040)) + { /* |x| < 2^(-119). */ + float y = fmaf (TwoOverSqrtPiMinusOne, x, x); + return check_uflowf (y); + } + return x + TwoOverSqrtPiMinusOne * x; + } + + x2 = x * x; + + /* Normalized cases (|x| < 0.921875) - Use Horner scheme for x+x*P(x^2). + */ +#define C(i) A[i] + r = fmaf (HORNER_5 (x2, C), x, x); +#undef C + } + else if (ia12 < 0x408) + { /* |x| < 4.0 - Use a custom Estrin scheme. */ + + float a = fabsf (x); + /* Use Estrin scheme on high order (small magnitude) coefficients. */ +#define C(i) B[i] + r = ESTRIN_3_ (a, x * x, C, 3); +#undef C + /* Then switch to pure Horner scheme. 
*/ + r = fmaf (r, a, B[2]); + r = fmaf (r, a, B[1]); + r = fmaf (r, a, B[0]); + r = fmaf (r, a, a); + /* Single precision exponential with ~0.5ulps ensures erff has maximum + relative error below 1ulp on [0.921875, 4.0] and below 1.1ulps on + [0.875, 4.0]. */ + r = expf (-r); + /* Explicit copysign (calling copysignf increases latency). */ + if (sign) + r = -1.0f + r; + else + r = 1.0f - r; + } + else + { /* |x| >= 4.0. */ + + /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */ + if (unlikely (ia12 >= 0x7f8)) + return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x; + + /* Explicit copysign (calling copysignf increases latency). */ + if (sign) + r = -1.0f; + else + r = 1.0f; + } + return r; +} + +PL_SIG (S, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (erff, 0.6) +PL_TEST_INTERVAL (erff, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (erff, -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (erff, 0x1p-26, 0x1p3, 40000) +PL_TEST_INTERVAL (erff, -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (erff, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/erff_data.c b/contrib/arm-optimized-routines/pl/math/erff_data.c new file mode 100644 index 000000000000..2352baefd35f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/erff_data.c @@ -0,0 +1,16 @@ +/* + * Data for approximation of erff. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Minimax approximation of erff. 
*/ +const struct erff_data __erff_data + = {.erff_poly_A = {0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f, + -0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f}, + .erff_poly_B + = {0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f, -0x1.8d6300p-6f, + 0x1.fd1336p-9f, -0x1.91d2ccp-12f, 0x1.222900p-16f}}; diff --git a/contrib/arm-optimized-routines/pl/math/estrin.h b/contrib/arm-optimized-routines/pl/math/estrin.h new file mode 100644 index 000000000000..f967fb0475b0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/estrin.h @@ -0,0 +1,16 @@ +/* + * Helper macros for double-precision Estrin polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#if V_SUPPORTED +#define FMA v_fma_f64 +#else +#define FMA fma +#endif + +#include "estrin_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/estrin_wrap.h b/contrib/arm-optimized-routines/pl/math/estrin_wrap.h new file mode 100644 index 000000000000..2ae07001f2cf --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/estrin_wrap.h @@ -0,0 +1,48 @@ +/* + * Helper macros for double-precision Estrin polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +// clang-format off +#define ESTRIN_1_(x, c, i) FMA(x, c(1 + i), c(i)) +#define ESTRIN_2_(x, x2, c, i) FMA(x2, c(2 + i), ESTRIN_1_(x, c, i)) +#define ESTRIN_3_(x, x2, c, i) FMA(x2, ESTRIN_1_(x, c, 2 + i), ESTRIN_1_(x, c, i)) +#define ESTRIN_4_(x, x2, x4, c, i) FMA(x4, c(4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_5_(x, x2, x4, c, i) FMA(x4, ESTRIN_1_(x, c, 4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_6_(x, x2, x4, c, i) FMA(x4, ESTRIN_2_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_7_(x, x2, x4, c, i) FMA(x4, ESTRIN_3_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i)) +#define ESTRIN_8_(x, x2, x4, x8, c, i) FMA(x8, c(8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_9_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_1_(x, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_10_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_2_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_11_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_3_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_12_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_4_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_13_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_5_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_14_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_6_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_15_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_7_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i)) +#define ESTRIN_16_(x, x2, x4, x8, x16, c, i) FMA(x16, c(16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) +#define ESTRIN_17_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_1_(x, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) +#define ESTRIN_18_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_2_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) +#define ESTRIN_19_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_3_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i)) + +#define ESTRIN_1(x, c) ESTRIN_1_(x, c, 0) +#define 
ESTRIN_2(x, x2, c) ESTRIN_2_(x, x2, c, 0) +#define ESTRIN_3(x, x2, c) ESTRIN_3_(x, x2, c, 0) +#define ESTRIN_4(x, x2, x4, c) ESTRIN_4_(x, x2, x4, c, 0) +#define ESTRIN_5(x, x2, x4, c) ESTRIN_5_(x, x2, x4, c, 0) +#define ESTRIN_6(x, x2, x4, c) ESTRIN_6_(x, x2, x4, c, 0) +#define ESTRIN_7(x, x2, x4, c) ESTRIN_7_(x, x2, x4, c, 0) +#define ESTRIN_8(x, x2, x4, x8, c) ESTRIN_8_(x, x2, x4, x8, c, 0) +#define ESTRIN_9(x, x2, x4, x8, c) ESTRIN_9_(x, x2, x4, x8, c, 0) +#define ESTRIN_10(x, x2, x4, x8, c) ESTRIN_10_(x, x2, x4, x8, c, 0) +#define ESTRIN_11(x, x2, x4, x8, c) ESTRIN_11_(x, x2, x4, x8, c, 0) +#define ESTRIN_12(x, x2, x4, x8, c) ESTRIN_12_(x, x2, x4, x8, c, 0) +#define ESTRIN_13(x, x2, x4, x8, c) ESTRIN_13_(x, x2, x4, x8, c, 0) +#define ESTRIN_14(x, x2, x4, x8, c) ESTRIN_14_(x, x2, x4, x8, c, 0) +#define ESTRIN_15(x, x2, x4, x8, c) ESTRIN_15_(x, x2, x4, x8, c, 0) +#define ESTRIN_16(x, x2, x4, x8, x16, c) ESTRIN_16_(x, x2, x4, x8, x16, c, 0) +#define ESTRIN_17(x, x2, x4, x8, x16, c) ESTRIN_17_(x, x2, x4, x8, x16, c, 0) +#define ESTRIN_18(x, x2, x4, x8, x16, c) ESTRIN_18_(x, x2, x4, x8, x16, c, 0) +#define ESTRIN_19(x, x2, x4, x8, x16, c) ESTRIN_19_(x, x2, x4, x8, x16, c, 0) +// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/estrinf.h b/contrib/arm-optimized-routines/pl/math/estrinf.h new file mode 100644 index 000000000000..175233c6c799 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/estrinf.h @@ -0,0 +1,14 @@ +/* + * Helper macros for single-precision Estrin polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f32 +#else +#define FMA fmaf +#endif + +#include "estrin_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/exp.c b/contrib/arm-optimized-routines/pl/math/exp.c new file mode 100644 index 000000000000..90253b68875d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/exp.c @@ -0,0 +1,163 @@ +/* + * Double-precision e^x function. + * + * Copyright (c) 2018-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include <float.h> +#include <math.h> +#include <stdint.h> +#include "math_config.h" + +#define N (1 << EXP_TABLE_BITS) +#define InvLn2N __exp_data.invln2N +#define NegLn2hiN __exp_data.negln2hiN +#define NegLn2loN __exp_data.negln2loN +#define Shift __exp_data.shift +#define T __exp_data.tab +#define C2 __exp_data.poly[5 - EXP_POLY_ORDER] +#define C3 __exp_data.poly[6 - EXP_POLY_ORDER] +#define C4 __exp_data.poly[7 - EXP_POLY_ORDER] +#define C5 __exp_data.poly[8 - EXP_POLY_ORDER] +#define C6 __exp_data.poly[9 - EXP_POLY_ORDER] + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflown into the sign bit so that needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale, positive k here means the result may overflow and + negative k means the result may underflow. */ +static inline double +specialcase (double_t tmp, uint64_t sbits, uint64_t ki) +{ + double_t scale, y; + + if ((ki & 0x80000000) == 0) + { + /* k > 0, the exponent of scale might have overflowed by <= 460. 
*/ + sbits -= 1009ull << 52; + scale = asdouble (sbits); + y = 0x1p1009 * (scale + scale * tmp); + return check_oflow (eval_as_double (y)); + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + scale = asdouble (sbits); + y = scale + scale * tmp; + if (y < 1.0) + { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double_t hi, lo; + lo = scale - y + scale * tmp; + hi = 1.0 + y; + lo = 1.0 - hi + y + lo; + y = eval_as_double (hi + lo) - 1.0; + /* Avoid -0.0 with downward rounding. */ + if (WANT_ROUNDING && y == 0.0) + y = 0.0; + /* The underflow exception needs to be signaled explicitly. */ + force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022); + } + y = 0x1p-1022 * y; + return check_uflow (eval_as_double (y)); +} + +/* Top 12 bits of a double (sign and exponent bits). */ +static inline uint32_t +top12 (double x) +{ + return asuint64 (x) >> 52; +} + +/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + If hastail is 0 then xtail is assumed to be 0 too. */ +static inline double +exp_inline (double x, double xtail, int hastail) +{ + uint32_t abstop; + uint64_t ki, idx, top, sbits; + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t kd, z, r, r2, scale, tail, tmp; + + abstop = top12 (x) & 0x7ff; + if (unlikely (abstop - top12 (0x1p-54) >= top12 (512.0) - top12 (0x1p-54))) + { + if (abstop - top12 (0x1p-54) >= 0x80000000) + /* Avoid spurious underflow for tiny x. */ + /* Note: 0 is common input. */ + return WANT_ROUNDING ? 
1.0 + x : 1.0; + if (abstop >= top12 (1024.0)) + { + if (asuint64 (x) == asuint64 (-INFINITY)) + return 0.0; + if (abstop >= top12 (INFINITY)) + return 1.0 + x; + if (asuint64 (x) >> 63) + return __math_uflow (0); + else + return __math_oflow (0); + } + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + z = InvLn2N * x; +#if TOINT_INTRINSICS + kd = roundtoint (z); + ki = converttoint (z); +#elif EXP_USE_TOINT_NARROW + /* z - kd is in [-0.5-2^-16, 0.5] in all rounding modes. */ + kd = eval_as_double (z + Shift); + ki = asuint64 (kd) >> 16; + kd = (double_t) (int32_t) ki; +#else + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + kd = eval_as_double (z + Shift); + ki = asuint64 (kd); + kd -= Shift; +#endif + r = x + kd * NegLn2hiN + kd * NegLn2loN; + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + if (hastail) + r += xtail; + /* 2^(k/N) ~= scale * (1 + tail). */ + idx = 2 * (ki % N); + top = ki << (52 - EXP_TABLE_BITS); + tail = asdouble (T[idx]); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + sbits = T[idx + 1] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */ + /* Evaluation is optimized assuming superscalar pipelined execution. */ + r2 = r * r; + /* Without fma the worst case error is 0.25/N ulp larger. */ + /* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp. 
*/ +#if EXP_POLY_ORDER == 4 + tmp = tail + r + r2 * C2 + r * r2 * (C3 + r * C4); +#elif EXP_POLY_ORDER == 5 + tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5); +#elif EXP_POLY_ORDER == 6 + tmp = tail + r + r2 * (0.5 + r * C3) + r2 * r2 * (C4 + r * C5 + r2 * C6); +#endif + if (unlikely (abstop == 0)) + return specialcase (tmp, sbits, ki); + scale = asdouble (sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return eval_as_double (scale + scale * tmp); +} + +/* May be useful for implementing pow where more than double + precision input is needed. */ +double +__exp_dd (double x, double xtail) +{ + return exp_inline (x, xtail, 1); +} + diff --git a/contrib/arm-optimized-routines/pl/math/exp_data.c b/contrib/arm-optimized-routines/pl/math/exp_data.c new file mode 100644 index 000000000000..2354be76cfab --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/exp_data.c @@ -0,0 +1,1120 @@ +/* + * Shared data between exp, exp2 and pow. + * + * Copyright (c) 2018-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << EXP_TABLE_BITS) + +const struct exp_data __exp_data = { +// N/ln2 +.invln2N = 0x1.71547652b82fep0 * N, +// -ln2/N +#if N == 64 +.negln2hiN = -0x1.62e42fefa0000p-7, +.negln2loN = -0x1.cf79abc9e3b3ap-46, +#elif N == 128 +.negln2hiN = -0x1.62e42fefa0000p-8, +.negln2loN = -0x1.cf79abc9e3b3ap-47, +#elif N == 256 +.negln2hiN = -0x1.62e42fefc0000p-9, +.negln2loN = 0x1.c610ca86c3899p-45, +#elif N == 512 +.negln2hiN = -0x1.62e42fef80000p-10, +.negln2loN = -0x1.1cf79abc9e3b4p-45, +#endif +// Used for rounding when !TOINT_INTRINSICS +#if EXP_USE_TOINT_NARROW +.shift = 0x1800000000.8p0, +#else +.shift = 0x1.8p52, +#endif +// exp polynomial coefficients. 
+.poly = { +#if N == 64 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE +// abs error: 1.5543*2^-60 +// ulp error: 0.529 (0.533 without fma) +// if |x| < ln2/128+eps +// abs error if |x| < ln2/64: 1.7157*2^-50 +0x1.fffffffffdbcdp-2, +0x1.555555555444cp-3, +0x1.555573c6a9f7dp-5, +0x1.1111266d28935p-7, +#elif N == 64 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE +// abs error: 1.6735*2^-64 +// ulp error: 0.518 (0.522 without fma) +// if |x| < ln2/64 +0x1.5555555548f9ap-3, +0x1.555555554bf5dp-5, +0x1.11115b75f0f4dp-7, +0x1.6c171a6b6303ep-10, +#elif N == 128 && EXP_POLY_ORDER == 5 && !EXP_POLY_WIDE +// abs error: 1.555*2^-66 +// ulp error: 0.509 (0.511 without fma) +// if |x| < ln2/256+eps +// abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65 +// abs error if |x| < ln2/128: 1.7145*2^-56 +0x1.ffffffffffdbdp-2, +0x1.555555555543cp-3, +0x1.55555cf172b91p-5, +0x1.1111167a4d017p-7, +#elif N == 128 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE +// abs error: 1.5542*2^-60 +// ulp error: 0.521 (0.523 without fma) +// if |x| < ln2/128 +0x1.fffffffffdbcep-2, +0x1.55555555543c2p-3, +0x1.555573c64f2e3p-5, +0x1.111126b4eff73p-7, +#elif N == 128 && EXP_POLY_ORDER == 6 && EXP_POLY_WIDE +// abs error: 1.6861*2^-71 +// ulp error: 0.509 (0.511 without fma) +// if |x| < ln2/128 +0x1.55555555548fdp-3, +0x1.555555555658fp-5, +0x1.111123a859bb6p-7, +0x1.6c16ba6920cabp-10, +#elif N == 256 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE +// abs error: 1.43*2^-58 +// ulp error: 0.549 (0.550 without fma) +// if |x| < ln2/512 +0x1p0, // unused +0x1.fffffffffffd4p-2, +0x1.5555571d6ef9p-3, +0x1.5555576a5adcep-5, +#elif N == 256 && EXP_POLY_ORDER == 5 && EXP_POLY_WIDE +// abs error: 1.5547*2^-66 +// ulp error: 0.505 (0.506 without fma) +// if |x| < ln2/256 +0x1.ffffffffffdbdp-2, +0x1.555555555543cp-3, +0x1.55555cf16e1edp-5, +0x1.1111167a4b553p-7, +#elif N == 512 && EXP_POLY_ORDER == 4 && !EXP_POLY_WIDE +// abs error: 1.4300*2^-63 +// ulp error: 0.504 +// if |x| < ln2/1024 +// abs error if |x| < ln2/512: 1.0689*2^-55 +0x1p0, // 
unused +0x1.ffffffffffffdp-2, +0x1.555555c75bb6p-3, +0x1.555555dec04a8p-5, +#endif +}, +.exp2_shift = 0x1.8p52 / N, +// exp2 polynomial coefficients. +.exp2_poly = { +#if N == 64 && EXP2_POLY_ORDER == 6 && EXP2_POLY_WIDE +// abs error: 1.3054*2^-63 +// ulp error: 0.515 +// if |x| < 1/64 +0x1.62e42fefa39efp-1, +0x1.ebfbdff82c58fp-3, +0x1.c6b08d7045cf1p-5, +0x1.3b2ab6fb8fd0ep-7, +0x1.5d884afec48d7p-10, +0x1.43097dc684ae1p-13, +#elif N == 128 && EXP2_POLY_ORDER == 5 && !EXP2_POLY_WIDE +// abs error: 1.2195*2^-65 +// ulp error: 0.507 (0.511 without fma) +// if |x| < 1/256 +// abs error if |x| < 1/128: 1.9941*2^-56 +0x1.62e42fefa39efp-1, +0x1.ebfbdff82c424p-3, +0x1.c6b08d70cf4b5p-5, +0x1.3b2abd24650ccp-7, +0x1.5d7e09b4e3a84p-10, +#elif N == 256 && EXP2_POLY_ORDER == 5 && EXP2_POLY_WIDE +// abs error: 1.2195*2^-65 +// ulp error: 0.504 (0.508 without fma) +// if |x| < 1/256 +0x1.62e42fefa39efp-1, +0x1.ebfbdff82c424p-3, +0x1.c6b08d70cf4b5p-5, +0x1.3b2abd24650ccp-7, +0x1.5d7e09b4e3a84p-10, +#elif N == 512 && EXP2_POLY_ORDER == 4 && !EXP2_POLY_WIDE +// abs error: 1.4411*2^-64 +// ulp error: 0.5024 (0.5063 without fma) +// if |x| < 1/1024 +// abs error if |x| < 1/512: 1.9430*2^-56 +0x1.62e42fefa39ecp-1, +0x1.ebfbdff82c58bp-3, +0x1.c6b08e46de41fp-5, +0x1.3b2ab786ee1dap-7, +#endif +}, +// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) +// tab[2*k] = asuint64(T[k]) +// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N +.tab = { +#if N == 64 +0x0, 0x3ff0000000000000, +0xbc7160139cd8dc5d, 0x3fefec9a3e778061, +0x3c8cd2523567f613, 0x3fefd9b0d3158574, +0x3c60f74e61e6c861, 0x3fefc74518759bc8, +0x3c979aa65d837b6d, 0x3fefb5586cf9890f, +0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, +0xbc9556522a2fbd0e, 0x3fef9301d0125b51, +0xbc91c923b9d5f416, 0x3fef829aaea92de0, +0xbc801b15eaa59348, 0x3fef72b83c7d517b, +0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, +0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, +0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, +0x3c968efde3a8a894, 0x3fef387a6e756238, +0x3c80472b981fe7f2, 
0x3fef2b4565e27cdd, +0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, +0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, +0x3c834d754db0abb6, 0x3fef06fe0a31b715, +0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, +0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, +0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, +0x3c859f48a72a4c6d, 0x3feedea64c123422, +0xbc58a78f4817895b, 0x3feed60a21f72e2a, +0x3c4363ed60c2ac11, 0x3feece086061892d, +0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, +0x3c7690cebb7aafb0, 0x3feebfdad5362a27, +0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, +0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, +0x3c93350518fdd78e, 0x3feeaf4736b527da, +0x3c9063e1e21c5409, 0x3feeab07dd485429, +0x3c9432e62b64c035, 0x3feea76f15ad2148, +0xbc8c33c53bef4da8, 0x3feea47eb03a5585, +0xbc93cedd78565858, 0x3feea23882552225, +0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, +0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, +0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, +0xbc8619321e55e68a, 0x3fee9feb564267c9, +0xbc7b32dcb94da51d, 0x3feea11473eb0187, +0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, +0xbc9369b6f13b3734, 0x3feea589994cce13, +0xbc94d450d872576e, 0x3feea8d99b4492ed, +0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, +0x3c7bf68359f35f44, 0x3feeb1ae99157736, +0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, +0xbc92434322f4f9aa, 0x3feebd829fde4e50, +0x3c71affc2b91ce27, 0x3feec49182a3f090, +0xbc87c50422622263, 0x3feecc667b5de565, +0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, +0x3c8469846e735ab3, 0x3feede6b5579fdbf, +0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, +0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, +0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, +0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, +0x3c736eae30af0cb3, 0x3fef199bdd85529c, +0x3c84e08fd10959ac, 0x3fef27f12e57d14b, +0x3c676b2c6c921968, 0x3fef3720dcef9069, +0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, +0x3c74a385a63d07a7, 0x3fef5818dcfba487, +0x3c8e5a50d5c192ac, 0x3fef69e603db3285, +0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, +0x3c74b604603a88d3, 0x3fef902ee78b3ff6, +0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, +0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, 
+0x3c8a64a931d185ee, 0x3fefd0765b6e4540, +0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, +#elif N == 128 +0x0, 0x3ff0000000000000, +0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, +0xbc7160139cd8dc5d, 0x3fefec9a3e778061, +0xbc905e7a108766d1, 0x3fefe315e86e7f85, +0x3c8cd2523567f613, 0x3fefd9b0d3158574, +0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, +0x3c60f74e61e6c861, 0x3fefc74518759bc8, +0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, +0x3c979aa65d837b6d, 0x3fefb5586cf9890f, +0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, +0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, +0xbc6a033489906e0b, 0x3fef9b66affed31b, +0xbc9556522a2fbd0e, 0x3fef9301d0125b51, +0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, +0xbc91c923b9d5f416, 0x3fef829aaea92de0, +0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, +0xbc801b15eaa59348, 0x3fef72b83c7d517b, +0xbc8f1ff055de323d, 0x3fef6af9388c8dea, +0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, +0xbc96d99c7611eb26, 0x3fef5be084045cd4, +0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, +0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, +0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, +0x3c807a05b0e4047d, 0x3fef3f49917ddc96, +0x3c968efde3a8a894, 0x3fef387a6e756238, +0x3c875e18f274487d, 0x3fef31ce4fb2a63f, +0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, +0xbc96b87b3f71085e, 0x3fef24dfe1f56381, +0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, +0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, +0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, +0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, +0x3c834d754db0abb6, 0x3fef06fe0a31b715, +0x3c864201e2ac744c, 0x3fef0170fc4cd831, +0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, +0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, +0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, +0xbc9907f81b512d8e, 0x3feeecae6d05d866, +0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, +0xbc991919b3ce1b15, 0x3feee32dc313a8e5, +0x3c859f48a72a4c6d, 0x3feedea64c123422, +0xbc9312607a28698a, 0x3feeda4504ac801c, +0xbc58a78f4817895b, 0x3feed60a21f72e2a, +0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, +0x3c4363ed60c2ac11, 0x3feece086061892d, +0x3c9666093b0664ef, 0x3feeca41ed1d0057, +0x3c6ecce1daa10379, 
0x3feec6a2b5c13cd0, +0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, +0x3c7690cebb7aafb0, 0x3feebfdad5362a27, +0x3c931dbdeb54e077, 0x3feebcb299fddd0d, +0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, +0xbc87deccdc93a349, 0x3feeb6daa2cf6642, +0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, +0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, +0x3c93350518fdd78e, 0x3feeaf4736b527da, +0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, +0x3c9063e1e21c5409, 0x3feeab07dd485429, +0x3c34c7855019c6ea, 0x3feea9268a5946b7, +0x3c9432e62b64c035, 0x3feea76f15ad2148, +0xbc8ce44a6199769f, 0x3feea5e1b976dc09, +0xbc8c33c53bef4da8, 0x3feea47eb03a5585, +0xbc845378892be9ae, 0x3feea34634ccc320, +0xbc93cedd78565858, 0x3feea23882552225, +0x3c5710aa807e1964, 0x3feea155d44ca973, +0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, +0xbc6a12ad8734b982, 0x3feea012750bdabf, +0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, +0xbc80dc3d54e08851, 0x3fee9f7df9519484, +0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, +0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, +0xbc8619321e55e68a, 0x3fee9feb564267c9, +0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, +0xbc7b32dcb94da51d, 0x3feea11473eb0187, +0x3c94ecfd5467c06b, 0x3feea1ed0130c132, +0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, +0xbc88a1c52fb3cf42, 0x3feea427543e1a12, +0xbc9369b6f13b3734, 0x3feea589994cce13, +0xbc805e843a19ff1e, 0x3feea71a4623c7ad, +0xbc94d450d872576e, 0x3feea8d99b4492ed, +0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, +0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, +0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, +0x3c7bf68359f35f44, 0x3feeb1ae99157736, +0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, +0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, +0xbc6c23f97c90b959, 0x3feeba44cbc8520f, +0xbc92434322f4f9aa, 0x3feebd829fde4e50, +0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, +0x3c71affc2b91ce27, 0x3feec49182a3f090, +0x3c6dd235e10a73bb, 0x3feec86319e32323, +0xbc87c50422622263, 0x3feecc667b5de565, +0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, +0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, +0x3c90cc319cee31d2, 0x3feed99e1330b358, +0x3c8469846e735ab3, 0x3feede6b5579fdbf, 
+0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, +0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, +0xbc907b8f4ad1d9fa, 0x3feeee07298db666, +0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, +0xbc90a40e3da6f640, 0x3feef9728de5593a, +0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, +0xbc91eee26b588a35, 0x3fef05b030a1064a, +0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, +0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, +0x3c736eae30af0cb3, 0x3fef199bdd85529c, +0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, +0x3c84e08fd10959ac, 0x3fef27f12e57d14b, +0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, +0x3c676b2c6c921968, 0x3fef3720dcef9069, +0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, +0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, +0xbc900dae3875a949, 0x3fef4f87080d89f2, +0x3c74a385a63d07a7, 0x3fef5818dcfba487, +0xbc82919e2040220f, 0x3fef60e316c98398, +0x3c8e5a50d5c192ac, 0x3fef69e603db3285, +0x3c843a59ac016b4b, 0x3fef7321f301b460, +0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, +0xbc892ab93b470dc9, 0x3fef864614f5a129, +0x3c74b604603a88d3, 0x3fef902ee78b3ff6, +0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, +0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, +0xbc8dae98e223747d, 0x3fefaf482d8e67f1, +0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, +0x3c842b94c3a9eb32, 0x3fefc52b376bba97, +0x3c8a64a931d185ee, 0x3fefd0765b6e4540, +0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, +0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, +0x3c5305c14160cc89, 0x3feff3c22b8f71f1, +#elif N == 256 +0x0, 0x3ff0000000000000, +0xbc84e82fc61851ac, 0x3feffb1afa5abcbf, +0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, +0xbc82985dd8521d32, 0x3feff168143b0281, +0xbc7160139cd8dc5d, 0x3fefec9a3e778061, +0x3c651e617061bfbd, 0x3fefe7d42e11bbcc, +0xbc905e7a108766d1, 0x3fefe315e86e7f85, +0x3c845fad437fa426, 0x3fefde5f72f654b1, +0x3c8cd2523567f613, 0x3fefd9b0d3158574, +0xbc954529642b232f, 0x3fefd50a0e3c1f89, +0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, +0x3c8293708ef5c32e, 0x3fefcbd42b72a836, +0x3c60f74e61e6c861, 0x3fefc74518759bc8, +0xbc95b9280905b2a4, 0x3fefc2bdf66607e0, +0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, +0x3c84f31f32c4b7e7, 
0x3fefb9c79b1f3919, +0x3c979aa65d837b6d, 0x3fefb5586cf9890f, +0x3c9407fb30d06420, 0x3fefb0f145e46c85, +0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, +0xbc9a5d04b3b9911b, 0x3fefa83b23395dec, +0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, +0xbc937a01f0739546, 0x3fef9fa55fdfa9c5, +0xbc6a033489906e0b, 0x3fef9b66affed31b, +0x3c8b8268b04ef0a5, 0x3fef973028d7233e, +0xbc9556522a2fbd0e, 0x3fef9301d0125b51, +0xbc9ac46e44a2ebcc, 0x3fef8edbab5e2ab6, +0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, +0xbc65704e90c9f860, 0x3fef86a814f204ab, +0xbc91c923b9d5f416, 0x3fef829aaea92de0, +0xbc897cea57e46280, 0x3fef7e95934f312e, +0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, +0x3c56f01429e2b9d2, 0x3fef76a45471c3c2, +0xbc801b15eaa59348, 0x3fef72b83c7d517b, +0x3c6e653b2459034b, 0x3fef6ed48695bbc0, +0xbc8f1ff055de323d, 0x3fef6af9388c8dea, +0x3c92cc7ea345b7dc, 0x3fef672658375d2f, +0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, +0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c, +0xbc96d99c7611eb26, 0x3fef5be084045cd4, +0x3c8cdc1873af2155, 0x3fef582f95281c6b, +0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, +0xbc9493684653a131, 0x3fef50e75eb44027, +0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, +0xbc98e2899077520a, 0x3fef49c18438ce4d, +0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, +0x3c9120fcd4f59273, 0x3fef42be3578a819, +0x3c807a05b0e4047d, 0x3fef3f49917ddc96, +0x3c89b788c188c9b8, 0x3fef3bdda27912d1, +0x3c968efde3a8a894, 0x3fef387a6e756238, +0x3c877afbca90ef84, 0x3fef351ffb82140a, +0x3c875e18f274487d, 0x3fef31ce4fb2a63f, +0x3c91512f082876ee, 0x3fef2e85711ece75, +0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, +0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29, +0xbc96b87b3f71085e, 0x3fef24dfe1f56381, +0xbc803297e78260bf, 0x3fef21ba7591bb70, +0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, +0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13, +0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, +0xbc91e75c40b4251e, 0x3fef157e39771b2f, +0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, +0x3c98a911f1f7785a, 0x3fef0f961f641589, +0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, +0xbc61e7c998db7dbb, 0x3fef09d24abd886b, 
+0x3c834d754db0abb6, 0x3fef06fe0a31b715, +0x3c85425c11faadf4, 0x3fef0432edeeb2fd, +0x3c864201e2ac744c, 0x3fef0170fc4cd831, +0xbc979517a03e2847, 0x3feefeb83ba8ea32, +0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, +0xbc800e2a46da4bee, 0x3feef96266e3fa2d, +0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, +0xbc87430803972b34, 0x3feef431a2de883b, +0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, +0xbc954de30ae02d94, 0x3feeef26231e754a, +0xbc9907f81b512d8e, 0x3feeecae6d05d866, +0xbc94f2487e1c03ec, 0x3feeea401b7140ef, +0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, +0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4, +0xbc991919b3ce1b15, 0x3feee32dc313a8e5, +0x3c79c3bba5562a2f, 0x3feee0e544ede173, +0x3c859f48a72a4c6d, 0x3feedea64c123422, +0xbc85a71612e21658, 0x3feedc70df1c5175, +0xbc9312607a28698a, 0x3feeda4504ac801c, +0x3c86421f6f1d24d6, 0x3feed822c367a024, +0xbc58a78f4817895b, 0x3feed60a21f72e2a, +0xbc9348a6815fce65, 0x3feed3fb2709468a, +0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, +0x3c835c43984d9871, 0x3feecffa3f84b9d4, +0x3c4363ed60c2ac11, 0x3feece086061892d, +0xbc632afc8d9473a0, 0x3feecc2042a7d232, +0x3c9666093b0664ef, 0x3feeca41ed1d0057, +0xbc95fc5e44de020e, 0x3feec86d668b3237, +0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, +0xbc7ea0148327c42f, 0x3feec4e1e192aed2, +0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, +0xbc7a843ad1a88022, 0x3feec17dea6db7d7, +0x3c7690cebb7aafb0, 0x3feebfdad5362a27, +0x3c892ca3bf144e63, 0x3feebe41b817c114, +0x3c931dbdeb54e077, 0x3feebcb299fddd0d, +0xbc902c99b04aa8b0, 0x3feebb2d81d8abff, +0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, +0x3c73e34f67e67118, 0x3feeb8417f4531ee, +0xbc87deccdc93a349, 0x3feeb6daa2cf6642, +0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef, +0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, +0x3c81bd2888075068, 0x3feeb2e2f4f6ad27, +0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, +0xbc896be8ae89ef8f, 0x3feeb070dde910d2, +0x3c93350518fdd78e, 0x3feeaf4736b527da, +0xbc88e6ac90348602, 0x3feeae27dbe2c4cf, +0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, +0xbc91af7f1365c3ac, 0x3feeac0827ff07cc, +0x3c9063e1e21c5409, 
0x3feeab07dd485429, +0xbc943a3540d1898a, 0x3feeaa11fba87a03, +0x3c34c7855019c6ea, 0x3feea9268a5946b7, +0xbc951f58ddaa8090, 0x3feea84590998b93, +0x3c9432e62b64c035, 0x3feea76f15ad2148, +0xbc82e1648e50a17c, 0x3feea6a320dceb71, +0xbc8ce44a6199769f, 0x3feea5e1b976dc09, +0x3c95f30eda98a575, 0x3feea52ae6cdf6f4, +0xbc8c33c53bef4da8, 0x3feea47eb03a5585, +0x3c917ecda8a72159, 0x3feea3dd1d1929fd, +0xbc845378892be9ae, 0x3feea34634ccc320, +0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7, +0xbc93cedd78565858, 0x3feea23882552225, +0xbc85c33fdf910406, 0x3feea1c1c70833f6, +0x3c5710aa807e1964, 0x3feea155d44ca973, +0x3c81079ab5789604, 0x3feea0f4b19e9538, +0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, +0x3c727df161cd7778, 0x3feea052fa75173e, +0xbc6a12ad8734b982, 0x3feea012750bdabf, +0x3c93f9924a05b767, 0x3fee9fdcddd47645, +0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, +0xbc87557939a8b5ef, 0x3fee9f9298593ae5, +0xbc80dc3d54e08851, 0x3fee9f7df9519484, +0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87, +0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, +0xbc88e67a9006c909, 0x3fee9f8286ead08a, +0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, +0x3c86597566977ac8, 0x3fee9fbd35d7cbfd, +0xbc8619321e55e68a, 0x3fee9feb564267c9, +0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09, +0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, +0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6, +0xbc7b32dcb94da51d, 0x3feea11473eb0187, +0xbc92dad3519d7b5b, 0x3feea17b0976cfdb, +0x3c94ecfd5467c06b, 0x3feea1ed0130c132, +0x3c87d51410fd15c2, 0x3feea26a62ff86f0, +0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, +0xbc760a3629969871, 0x3feea3878491c491, +0xbc88a1c52fb3cf42, 0x3feea427543e1a12, +0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9, +0xbc9369b6f13b3734, 0x3feea589994cce13, +0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7, +0xbc805e843a19ff1e, 0x3feea71a4623c7ad, +0xbc522cea4f3afa1e, 0x3feea7f4179f5b21, +0xbc94d450d872576e, 0x3feea8d99b4492ed, +0x3c7c88549b958471, 0x3feea9cad931a436, +0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, +0x3c931143962f7877, 0x3feeabd0a478580f, +0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, 
+0x3c93e9e96f112479, 0x3feeae05bad61778, +0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, +0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9, +0x3c7bf68359f35f44, 0x3feeb1ae99157736, +0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a, +0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, +0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2, +0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, +0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5, +0xbc6c23f97c90b959, 0x3feeba44cbc8520f, +0xbc51669428996971, 0x3feebbdd9a7670b3, +0xbc92434322f4f9aa, 0x3feebd829fde4e50, +0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2, +0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, +0xbc9294f304f166b6, 0x3feec2bb4d53fe0d, +0x3c71affc2b91ce27, 0x3feec49182a3f090, +0xbc8a1e58414c07d3, 0x3feec674194bb8d5, +0x3c6dd235e10a73bb, 0x3feec86319e32323, +0xbc79740b58a20091, 0x3feeca5e8d07f29e, +0xbc87c50422622263, 0x3feecc667b5de565, +0x3c9165830a2b96c2, 0x3feece7aed8eb8bb, +0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, +0xbc903d5cbe27874b, 0x3feed2c980460ad8, +0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, +0x3c5986178980fce0, 0x3feed74a8af46052, +0x3c90cc319cee31d2, 0x3feed99e1330b358, +0xbc89472975b1f2a5, 0x3feedbfe53c12e59, +0x3c8469846e735ab3, 0x3feede6b5579fdbf, +0x3c7d8157a34b7e7f, 0x3feee0e521356eba, +0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, +0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774, +0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, +0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff, +0xbc907b8f4ad1d9fa, 0x3feeee07298db666, +0x3c889c2ea41433c7, 0x3feef0ce6c9a8952, +0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, +0xbc7274aedac8ff80, 0x3feef68415b749b1, +0xbc90a40e3da6f640, 0x3feef9728de5593a, +0x3c85c620ce76df06, 0x3feefc6e29f1c52a, +0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, +0xbc8fda52e1b51e41, 0x3fef028cf22749e4, +0xbc91eee26b588a35, 0x3fef05b030a1064a, +0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f, +0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, +0xbc302899507554e5, 0x3fef0f69c3f3a207, +0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, +0xbc80dda2d4c0010c, 0x3fef16286141b33d, +0x3c736eae30af0cb3, 0x3fef199bdd85529c, +0xbc8a007daadf8d68, 
0x3fef1d1cd9fa652c, +0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, +0x3c836909391181d3, 0x3fef244778fafb22, +0x3c84e08fd10959ac, 0x3fef27f12e57d14b, +0xbc811cd7dbdf9547, 0x3fef2ba88988c933, +0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, +0xbc7ac28b7bef6621, 0x3fef33405751c4db, +0x3c676b2c6c921968, 0x3fef3720dcef9069, +0xbc7030587207b9e1, 0x3fef3b0f2e6d1675, +0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, +0xbc8cc734592af7fc, 0x3fef43155b5bab74, +0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, +0x3c87752a44f587e8, 0x3fef4b532b08c968, +0xbc900dae3875a949, 0x3fef4f87080d89f2, +0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6, +0x3c74a385a63d07a7, 0x3fef5818dcfba487, +0x3c5159d9d908a96e, 0x3fef5c76e862e6d3, +0xbc82919e2040220f, 0x3fef60e316c98398, +0x3c8c254d16117a68, 0x3fef655d71ff6075, +0x3c8e5a50d5c192ac, 0x3fef69e603db3285, +0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315, +0x3c843a59ac016b4b, 0x3fef7321f301b460, +0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658, +0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, +0xbc63e8e3eab2cbb4, 0x3fef81676b197d17, +0xbc892ab93b470dc9, 0x3fef864614f5a129, +0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12, +0x3c74b604603a88d3, 0x3fef902ee78b3ff6, +0xbc776caa4c2ff1cf, 0x3fef953924676d76, +0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, +0xbc81d5fc525d9940, 0x3fef9f7977cdb740, +0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, +0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e, +0xbc8dae98e223747d, 0x3fefaf482d8e67f1, +0x3c8269947c2bed4a, 0x3fefb4aaa2188510, +0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, +0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a, +0x3c842b94c3a9eb32, 0x3fefc52b376bba97, +0xbc69fa74878ba7c7, 0x3fefcac948dd7274, +0x3c8a64a931d185ee, 0x3fefd0765b6e4540, +0x3c901f3a75ee0efe, 0x3fefd632798844f8, +0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, +0xbc516a9ce6ed84fa, 0x3fefe1d802243c89, +0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, +0xbc699c7db2effc76, 0x3fefedba3692d514, +0x3c5305c14160cc89, 0x3feff3c22b8f71f1, +0x3c64b458677f9840, 0x3feff9d96b2a23d9, +#elif N == 512 +0x0, 0x3ff0000000000000, +0xbc75d87ade1f60d5, 0x3feffd8c86da1c0a, 
+0xbc84e82fc61851ac, 0x3feffb1afa5abcbf, +0x3c9bffdaa7ac4bac, 0x3feff8ab5b2cbd11, +0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, +0x3c75c18e5ae0563a, 0x3feff3d1e77170b4, +0xbc82985dd8521d32, 0x3feff168143b0281, +0xbc705b1125cf49a5, 0x3fefef003103b10e, +0xbc7160139cd8dc5d, 0x3fefec9a3e778061, +0x3c9f879abbff3f87, 0x3fefea363d42b027, +0x3c651e617061bfbd, 0x3fefe7d42e11bbcc, +0x3c9b14003824712a, 0x3fefe57411915a8a, +0xbc905e7a108766d1, 0x3fefe315e86e7f85, +0x3c61cbf0f38af658, 0x3fefe0b9b35659d8, +0x3c845fad437fa426, 0x3fefde5f72f654b1, +0xbc9a3316383dcbc5, 0x3fefdc0727fc1762, +0x3c8cd2523567f613, 0x3fefd9b0d3158574, +0x3c9901c9e0e797fd, 0x3fefd75c74f0bec2, +0xbc954529642b232f, 0x3fefd50a0e3c1f89, +0xbc89b3236d111646, 0x3fefd2b99fa6407c, +0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, +0xbc8cb191be99b1b0, 0x3fefce1ead925493, +0x3c8293708ef5c32e, 0x3fefcbd42b72a836, +0xbc9acb71e83765b7, 0x3fefc98ba42e7d30, +0x3c60f74e61e6c861, 0x3fefc74518759bc8, +0x3c5cd3e58b03697e, 0x3fefc50088f8093f, +0xbc95b9280905b2a4, 0x3fefc2bdf66607e0, +0xbc8bfb07d4755452, 0x3fefc07d61701716, +0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, +0x3c8aedeb3e7b14cd, 0x3fefbc02331b9715, +0x3c84f31f32c4b7e7, 0x3fefb9c79b1f3919, +0x3c9a8eb1f3d914b4, 0x3fefb78f03834e52, +0x3c979aa65d837b6d, 0x3fefb5586cf9890f, +0xbc85b9eb0402507b, 0x3fefb323d833d93f, +0x3c9407fb30d06420, 0x3fefb0f145e46c85, +0xbc93f0f225bbf3ee, 0x3fefaec0b6bdae53, +0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, +0xbc9c3fe7282d1784, 0x3fefaa65a4b520ba, +0xbc9a5d04b3b9911b, 0x3fefa83b23395dec, +0x3c9c8be44bf4cde8, 0x3fefa612a7b26300, +0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, +0x3c820c5444c93c44, 0x3fefa1c7c55189c6, +0xbc937a01f0739546, 0x3fef9fa55fdfa9c5, +0xbc84c6baeb580d7a, 0x3fef9d8503328e6d, +0xbc6a033489906e0b, 0x3fef9b66affed31b, +0x3c8657aa1b0d9f83, 0x3fef994a66f951ce, +0x3c8b8268b04ef0a5, 0x3fef973028d7233e, +0x3c62f2c7fd6ee145, 0x3fef9517f64d9ef1, +0xbc9556522a2fbd0e, 0x3fef9301d0125b51, +0xbc6b0b2789925e90, 0x3fef90edb6db2dc1, +0xbc9ac46e44a2ebcc, 
0x3fef8edbab5e2ab6, +0xbc93aad17d197fae, 0x3fef8ccbae51a5c8, +0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, +0xbc989c464a07ad70, 0x3fef88b1e264a0e9, +0xbc65704e90c9f860, 0x3fef86a814f204ab, +0xbc72c338fce197f4, 0x3fef84a058cbae1e, +0xbc91c923b9d5f416, 0x3fef829aaea92de0, +0xbc6dca724cea0eb6, 0x3fef809717425438, +0xbc897cea57e46280, 0x3fef7e95934f312e, +0x3c464770b955d34d, 0x3fef7c962388149e, +0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, +0xbc962811c114424f, 0x3fef789d83606e12, +0x3c56f01429e2b9d2, 0x3fef76a45471c3c2, +0x3c8ec58e74904dd4, 0x3fef74ad3c92df73, +0xbc801b15eaa59348, 0x3fef72b83c7d517b, +0x3c8d63b0ab2d5bbf, 0x3fef70c554eaea89, +0x3c6e653b2459034b, 0x3fef6ed48695bbc0, +0xbc9ca9effbeeac92, 0x3fef6ce5d23816c9, +0xbc8f1ff055de323d, 0x3fef6af9388c8dea, +0x3c8bda920de0f6e2, 0x3fef690eba4df41f, +0x3c92cc7ea345b7dc, 0x3fef672658375d2f, +0xbc9a597f9a5ff71c, 0x3fef654013041dc2, +0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, +0x3c50835b125aa573, 0x3fef6179e2363cf8, +0x3c957bfb2876ea9e, 0x3fef5f99f8138a1c, +0x3c8aaa13d61aec1f, 0x3fef5dbc2dc40bf0, +0xbc96d99c7611eb26, 0x3fef5be084045cd4, +0x3c8a4f81aa7110bd, 0x3fef5a06fb91588f, +0x3c8cdc1873af2155, 0x3fef582f95281c6b, +0xbc6817fd6a313e3e, 0x3fef565a51860746, +0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, +0xbc96236af85fd26a, 0x3fef52b6358e15e8, +0xbc9493684653a131, 0x3fef50e75eb44027, +0x3c7795eb4523abe7, 0x3fef4f1aad999e82, +0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, +0x3c8fe58b91b40095, 0x3fef4b87bf9cda38, +0xbc98e2899077520a, 0x3fef49c18438ce4d, +0x3c91ecaa860c614a, 0x3fef47fd7190241e, +0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, +0xbc3e45c83ba0bbcb, 0x3fef447bc96ffc18, +0x3c9120fcd4f59273, 0x3fef42be3578a819, +0xbc29fd3bea07b4ee, 0x3fef4102cd3d09b9, +0x3c807a05b0e4047d, 0x3fef3f49917ddc96, +0x3c87f1c7350e256d, 0x3fef3d9282fc1f27, +0x3c89b788c188c9b8, 0x3fef3bdda27912d1, +0x3c420dac6c124f4f, 0x3fef3a2af0b63bff, +0x3c968efde3a8a894, 0x3fef387a6e756238, +0xbc99501d09bc09fd, 0x3fef36cc1c78903a, +0x3c877afbca90ef84, 0x3fef351ffb82140a, 
+0x3c73baf864dc8675, 0x3fef33760c547f15, +0x3c875e18f274487d, 0x3fef31ce4fb2a63f, +0x3c91b0575c1eaf54, 0x3fef3028c65fa1ff, +0x3c91512f082876ee, 0x3fef2e85711ece75, +0xbc90364bc9ce33ab, 0x3fef2ce450b3cb82, +0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, +0xbc7548165d85ed32, 0x3fef29a8b16f0a30, +0x3c9a02f0c7d75ec6, 0x3fef280e341ddf29, +0x3c7c3b977a68e32c, 0x3fef2675eeb3ab98, +0xbc96b87b3f71085e, 0x3fef24dfe1f56381, +0xbc93a255f697ecfe, 0x3fef234c0ea83f36, +0xbc803297e78260bf, 0x3fef21ba7591bb70, +0x3c8d2d19edc1e550, 0x3fef202b17779965, +0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, +0xbc76b2173113dd8c, 0x3fef1d130f50d65c, +0xbc95b77e5ccd9fbf, 0x3fef1b8a66d10f13, +0x3c811aa5f853590b, 0x3fef1a03fc675d1f, +0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, +0x3c61d61a34c8aa02, 0x3fef16fde4f2e280, +0xbc91e75c40b4251e, 0x3fef157e39771b2f, +0xbc91f892bf6b286d, 0x3fef1400cf2f6c18, +0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, +0x3c7590c65c20e680, 0x3fef110cc15d5346, +0x3c98a911f1f7785a, 0x3fef0f961f641589, +0x3c86fe320b5c1e9d, 0x3fef0e21c1c14833, +0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, +0xbc903cd8b2f25790, 0x3fef0b3fd6a454d2, +0xbc61e7c998db7dbb, 0x3fef09d24abd886b, +0x3c7b3bf786a54a87, 0x3fef08670653dfe4, +0x3c834d754db0abb6, 0x3fef06fe0a31b715, +0x3c74bb6c41732885, 0x3fef05975721b004, +0x3c85425c11faadf4, 0x3fef0432edeeb2fd, +0xbc99d7399abb9a8b, 0x3fef02d0cf63eeac, +0x3c864201e2ac744c, 0x3fef0170fc4cd831, +0xbc5451d60c6ac9eb, 0x3fef001375752b40, +0xbc979517a03e2847, 0x3feefeb83ba8ea32, +0x3c8787a210ceafd9, 0x3feefd5f4fb45e20, +0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, +0xbc888d1e4629943d, 0x3feefab46484ebb4, +0xbc800e2a46da4bee, 0x3feef96266e3fa2d, +0xbc93369c544088b6, 0x3feef812ba4ea77d, +0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, +0x3c85373ce4eb6dfb, 0x3feef57a577dd72b, +0xbc87430803972b34, 0x3feef431a2de883b, +0x3c83adec8265a67f, 0x3feef2eb428335b4, +0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, +0xbc835388bcac6bc5, 0x3feef06581d3f669, +0xbc954de30ae02d94, 0x3feeef26231e754a, +0x3c727cdb4e4b6640, 
0x3feeede91be9c811, +0xbc9907f81b512d8e, 0x3feeecae6d05d866, +0x3c86c2696a26af35, 0x3feeeb761742d808, +0xbc94f2487e1c03ec, 0x3feeea401b7140ef, +0x3c888f6ff06b979a, 0x3feee90c7a61d55b, +0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, +0xbc89d5efaabc2030, 0x3feee6ac4bcdf3ea, +0x3c914a5432fcb2f4, 0x3feee57fbfec6cf4, +0xbc76b8867f91c9d6, 0x3feee4559212ef89, +0xbc991919b3ce1b15, 0x3feee32dc313a8e5, +0x3c94c9c0b5157fe6, 0x3feee20853c10f28, +0x3c79c3bba5562a2f, 0x3feee0e544ede173, +0xbc62455345b51c8e, 0x3feedfc4976d27fa, +0x3c859f48a72a4c6d, 0x3feedea64c123422, +0xbc93331de45477d0, 0x3feedd8a63b0a09b, +0xbc85a71612e21658, 0x3feedc70df1c5175, +0xbc95f84d39b39b16, 0x3feedb59bf29743f, +0xbc9312607a28698a, 0x3feeda4504ac801c, +0xbc72ba4dc7c4d562, 0x3feed932b07a35df, +0x3c86421f6f1d24d6, 0x3feed822c367a024, +0xbc844f25dc02691f, 0x3feed7153e4a136a, +0xbc58a78f4817895b, 0x3feed60a21f72e2a, +0xbc888d328eb9b501, 0x3feed5016f44d8f5, +0xbc9348a6815fce65, 0x3feed3fb2709468a, +0x3c7f0bec42ddb15a, 0x3feed2f74a1af3f1, +0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, +0xbc615f0a2b9cd452, 0x3feed0f6d5817663, +0x3c835c43984d9871, 0x3feecffa3f84b9d4, +0xbc8c2e465a919e1d, 0x3feecf0018321a1a, +0x3c4363ed60c2ac11, 0x3feece086061892d, +0xbc865dfd02bd08f1, 0x3feecd1318eb43ec, +0xbc632afc8d9473a0, 0x3feecc2042a7d232, +0xbc8e68cec89b1762, 0x3feecb2fde7006f4, +0x3c9666093b0664ef, 0x3feeca41ed1d0057, +0xbc48ae858eb682ca, 0x3feec9566f8827d0, +0xbc95fc5e44de020e, 0x3feec86d668b3237, +0x3c5dd71277c0915f, 0x3feec786d3001fe5, +0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, +0x3c92001325ecd7fb, 0x3feec5c10fa920a1, +0xbc7ea0148327c42f, 0x3feec4e1e192aed2, +0x3c65ace6e2870332, 0x3feec4052c5916c4, +0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, +0xbc9595c55690ffaf, 0x3feec2532feaada6, +0xbc7a843ad1a88022, 0x3feec17dea6db7d7, +0xbc8b401ba9fb5199, 0x3feec0ab213d5283, +0x3c7690cebb7aafb0, 0x3feebfdad5362a27, +0x3c6df82bf324cc57, 0x3feebf0d073537ca, +0x3c892ca3bf144e63, 0x3feebe41b817c114, +0x3c97cae38641c7bb, 0x3feebd78e8bb586b, 
+0x3c931dbdeb54e077, 0x3feebcb299fddd0d, +0x3c62d80c5c4a2b67, 0x3feebbeeccbd7b2a, +0xbc902c99b04aa8b0, 0x3feebb2d81d8abff, +0x3c8f39c10d12eaf0, 0x3feeba6eba2e35f0, +0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, +0xbc80b582d74a55d9, 0x3feeb8f8b804f127, +0x3c73e34f67e67118, 0x3feeb8417f4531ee, +0xbc6b4e327ff434ca, 0x3feeb78ccd3deb0d, +0xbc87deccdc93a349, 0x3feeb6daa2cf6642, +0xbc592dca38593e20, 0x3feeb62b00da3b14, +0xbc75a3b1197ba0f0, 0x3feeb57de83f4eef, +0xbc85daca9994833e, 0x3feeb4d359dfd53d, +0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, +0xbc980b4321bc6dae, 0x3feeb385df598d78, +0x3c81bd2888075068, 0x3feeb2e2f4f6ad27, +0xbc8390afec5241c5, 0x3feeb24298571b06, +0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, +0x3c8f15cdafe7d586, 0x3feeb1098bed1bdf, +0xbc896be8ae89ef8f, 0x3feeb070dde910d2, +0xbc910aa91ae9b67f, 0x3feeafdac1351819, +0x3c93350518fdd78e, 0x3feeaf4736b527da, +0x3c957e1b67462375, 0x3feeaeb63f4d854c, +0xbc88e6ac90348602, 0x3feeae27dbe2c4cf, +0x3c8124d5051552a7, 0x3feead9c0d59ca07, +0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, +0xbc3ca103952ecf1f, 0x3feeac8c32824135, +0xbc91af7f1365c3ac, 0x3feeac0827ff07cc, +0x3c773345c02a4fd6, 0x3feeab86b5f43d92, +0x3c9063e1e21c5409, 0x3feeab07dd485429, +0xbc909d2a0fce20f2, 0x3feeaa8b9ee20d1e, +0xbc943a3540d1898a, 0x3feeaa11fba87a03, +0xbc924f2cb4f81746, 0x3feea99af482fc8f, +0x3c34c7855019c6ea, 0x3feea9268a5946b7, +0xbc943592a0a9846b, 0x3feea8b4be135acc, +0xbc951f58ddaa8090, 0x3feea84590998b93, +0xbc956bc85d444f4f, 0x3feea7d902d47c65, +0x3c9432e62b64c035, 0x3feea76f15ad2148, +0x3c914d1e4218319f, 0x3feea707ca0cbf0f, +0xbc82e1648e50a17c, 0x3feea6a320dceb71, +0x3c971c93709313f4, 0x3feea6411b078d26, +0xbc8ce44a6199769f, 0x3feea5e1b976dc09, +0x3c7f88303b60d222, 0x3feea584fd15612a, +0x3c95f30eda98a575, 0x3feea52ae6cdf6f4, +0x3c70125ca18d4b5b, 0x3feea4d3778bc944, +0xbc8c33c53bef4da8, 0x3feea47eb03a5585, +0x3c9592ea73798b11, 0x3feea42c91c56acd, +0x3c917ecda8a72159, 0x3feea3dd1d1929fd, +0xbc9371d6d7d75739, 0x3feea390532205d8, +0xbc845378892be9ae, 
0x3feea34634ccc320, +0xbc8ac05fd996f807, 0x3feea2fec30678b7, +0xbc9345f3cee1ae6e, 0x3feea2b9febc8fb7, +0xbc91f5067d03653a, 0x3feea277e8dcc390, +0xbc93cedd78565858, 0x3feea23882552225, +0x3c917339c86ce3ad, 0x3feea1fbcc140be7, +0xbc85c33fdf910406, 0x3feea1c1c70833f6, +0xbc77e66065ba2500, 0x3feea18a7420a036, +0x3c5710aa807e1964, 0x3feea155d44ca973, +0x3c964c827ee6b49a, 0x3feea123e87bfb7a, +0x3c81079ab5789604, 0x3feea0f4b19e9538, +0xbc928311a3c73480, 0x3feea0c830a4c8d4, +0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, +0x3c882c79e185e981, 0x3feea077541ee718, +0x3c727df161cd7778, 0x3feea052fa75173e, +0xbc8b48cea80b043b, 0x3feea0315a736c75, +0xbc6a12ad8734b982, 0x3feea012750bdabf, +0xbc4f4863bc8e5180, 0x3fee9ff64b30aa09, +0x3c93f9924a05b767, 0x3fee9fdcddd47645, +0x3c954835dd4b7548, 0x3fee9fc62dea2f8a, +0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, +0xbc8bf41f59b59f8a, 0x3fee9fa10a38cee8, +0xbc87557939a8b5ef, 0x3fee9f9298593ae5, +0xbc8f652fde52775c, 0x3fee9f86e7ba9fef, +0xbc80dc3d54e08851, 0x3fee9f7df9519484, +0xbc7b0300defbcf98, 0x3fee9f77ce1303f6, +0x3c51ed2f56fa9d1a, 0x3fee9f7466f42e87, +0xbc89dab646035dc0, 0x3fee9f73c4eaa988, +0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, +0xbc91f0c230588dde, 0x3fee9f7ad3ef9011, +0xbc88e67a9006c909, 0x3fee9f8286ead08a, +0x3c9106450507a28c, 0x3fee9f8d02d50b8f, +0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, +0xbc9129729a10f3a0, 0x3fee9faa5953c849, +0x3c86597566977ac8, 0x3fee9fbd35d7cbfd, +0x3c781a70a5124f67, 0x3fee9fd2df29ce7c, +0xbc8619321e55e68a, 0x3fee9feb564267c9, +0x3c941626ea62646d, 0x3feea0069c1a861d, +0x3c92c0b7028a5c3a, 0x3feea024b1ab6e09, +0xbc940b9f54365b7c, 0x3feea04597eeba8f, +0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, +0x3c873455e0e826c1, 0x3feea08fda749e5d, +0x3c8a30faf49cc78c, 0x3feea0b938ac1cf6, +0x3c94f006ad874e3e, 0x3feea0e56b7fcf03, +0xbc7b32dcb94da51d, 0x3feea11473eb0187, +0xbc8f6d693d0973bb, 0x3feea14652e958aa, +0xbc92dad3519d7b5b, 0x3feea17b0976cfdb, +0x3c58c5ee2b7e7848, 0x3feea1b2988fb9ec, +0x3c94ecfd5467c06b, 0x3feea1ed0130c132, 
+0xbc88b25e045d207b, 0x3feea22a4456e7a3, +0x3c87d51410fd15c2, 0x3feea26a62ff86f0, +0xbc69cb3314060ca7, 0x3feea2ad5e2850ac, +0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, +0x3c87a0b15d19e0bb, 0x3feea33bedf2e1b9, +0xbc760a3629969871, 0x3feea3878491c491, +0x3c94aa7212bfa73c, 0x3feea3d5fbab091f, +0xbc88a1c52fb3cf42, 0x3feea427543e1a12, +0xbc81e688272a8a12, 0x3feea47b8f4abaa9, +0x3c8b18c6e3fdef5d, 0x3feea4d2add106d9, +0x3c4ab7b7112ec9d5, 0x3feea52cb0d1736a, +0xbc9369b6f13b3734, 0x3feea589994cce13, +0x3c8a1e274eed4476, 0x3feea5e968443d9a, +0x3c90ec1ddcb1390a, 0x3feea64c1eb941f7, +0x3c94a533a59324da, 0x3feea6b1bdadb46d, +0xbc805e843a19ff1e, 0x3feea71a4623c7ad, +0x3c7a56d2760d087d, 0x3feea785b91e07f1, +0xbc522cea4f3afa1e, 0x3feea7f4179f5b21, +0x3c91682c1c6e8b05, 0x3feea86562ab00ec, +0xbc94d450d872576e, 0x3feea8d99b4492ed, +0x3c89ea99cf7a9591, 0x3feea950c27004c2, +0x3c7c88549b958471, 0x3feea9cad931a436, +0xbc59e57d8f92ff8e, 0x3feeaa47e08e1957, +0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, +0x3c909b176e05a9cd, 0x3feeab4ac52be8f7, +0x3c931143962f7877, 0x3feeabd0a478580f, +0x3c711607f1952c95, 0x3feeac597875c644, +0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, +0x3c869608f0f86431, 0x3feead74029db01e, +0x3c93e9e96f112479, 0x3feeae05bad61778, +0xbc7f1ced15c5c5c0, 0x3feeae9a6bdb5598, +0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, +0x3c614b97be3f7b4e, 0x3feeafccbc6c19e6, +0xbc8dac42a4a38df0, 0x3feeb06a5e0866d9, +0x3c81c1701c359530, 0x3feeb10afc931857, +0x3c7bf68359f35f44, 0x3feeb1ae99157736, +0xbc8edb1bf6809287, 0x3feeb2553499284b, +0x3c8b99dd98b1ed84, 0x3feeb2fed0282c8a, +0xbc8ba58ce7a736d3, 0x3feeb3ab6ccce12c, +0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, +0xbc93fc025e1db9ce, 0x3feeb50dad829e70, +0xbc7885ad50cbb750, 0x3feeb5c353aa2fe2, +0xbc8d737c7d71382e, 0x3feeb67bff148396, +0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, +0x3c6ae88c43905293, 0x3feeb7f669e2802b, +0xbc82d5e85f3e0301, 0x3feeb8b82b5f98e5, +0xbc93d1f7661fe51b, 0x3feeb97cf65253d1, +0xbc6c23f97c90b959, 0x3feeba44cbc8520f, +0x3c651b68797ffc1c, 
0x3feebb0faccf9243, +0xbc51669428996971, 0x3feebbdd9a7670b3, +0x3c54579c5ceed70b, 0x3feebcae95cba768, +0xbc92434322f4f9aa, 0x3feebd829fde4e50, +0x3c87298413381667, 0x3feebe59b9bddb5b, +0x3c71f2b2c1c4c014, 0x3feebf33e47a22a2, +0xbc905000be64e965, 0x3feec01121235681, +0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, +0xbc89fb12e3454b73, 0x3feec1d4d47f2598, +0xbc9294f304f166b6, 0x3feec2bb4d53fe0d, +0x3c7be2a03697693b, 0x3feec3a4dc5a3dd3, +0x3c71affc2b91ce27, 0x3feec49182a3f090, +0x3c90622b15810eea, 0x3feec581414380f2, +0xbc8a1e58414c07d3, 0x3feec674194bb8d5, +0x3be9a5ecc875d327, 0x3feec76a0bcfc15e, +0x3c6dd235e10a73bb, 0x3feec86319e32323, +0x3c88ea486a3350ef, 0x3feec95f4499c647, +0xbc79740b58a20091, 0x3feeca5e8d07f29e, +0xbc7a2ee551d4c40f, 0x3feecb60f4424fcb, +0xbc87c50422622263, 0x3feecc667b5de565, +0x3c89c31f7e38028b, 0x3feecd6f23701b15, +0x3c9165830a2b96c2, 0x3feece7aed8eb8bb, +0xbc5fac13f4e005a3, 0x3feecf89dacfe68c, +0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, +0x3c7d8aced7162e89, 0x3feed1b1231475f7, +0xbc903d5cbe27874b, 0x3feed2c980460ad8, +0xbc848f50cea7269f, 0x3feed3e504f696b1, +0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, +0x3c821eb9a08a0542, 0x3feed625893523d4, +0x3c5986178980fce0, 0x3feed74a8af46052, +0xbc6133a953131cfd, 0x3feed872b8950a73, +0x3c90cc319cee31d2, 0x3feed99e1330b358, +0x3c89e95e6f4a0ae4, 0x3feedacc9be14dca, +0xbc89472975b1f2a5, 0x3feedbfe53c12e59, +0xbc90260cf07cb311, 0x3feedd333beb0b7e, +0x3c8469846e735ab3, 0x3feede6b5579fdbf, +0x3c1bca400a7b939d, 0x3feedfa6a1897fd2, +0x3c7d8157a34b7e7f, 0x3feee0e521356eba, +0x3c9140bc34dfc19f, 0x3feee226d59a09ee, +0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, +0xbc8c9b1da461ab87, 0x3feee4b3e100301e, +0x3c8c8a4e231ebb7d, 0x3feee5ff3a3c2774, +0x3c8c115f23ebea8e, 0x3feee74dcca5a413, +0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, +0xbc6dcab99f23f84e, 0x3feee9f4a17a4735, +0xbc888c8d11a142e5, 0x3feeeb4ce622f2ff, +0x3c60a43e8b7e4bfe, 0x3feeeca868742ee4, +0xbc907b8f4ad1d9fa, 0x3feeee07298db666, +0x3c915b1397075f04, 0x3feeef692a8fa8cd, 
+0x3c889c2ea41433c7, 0x3feef0ce6c9a8952, +0xbc839f7a1f04d2b0, 0x3feef236f0cf3f3a, +0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, +0xbc86a510f31e13e6, 0x3feef511c43bbd62, +0xbc7274aedac8ff80, 0x3feef68415b749b1, +0xbc92887ea88e7340, 0x3feef7f9ade433c6, +0xbc90a40e3da6f640, 0x3feef9728de5593a, +0xbc6e57ac604759ba, 0x3feefaeeb6ddfc87, +0x3c85c620ce76df06, 0x3feefc6e29f1c52a, +0x3c8e6c6db4f83226, 0x3feefdf0e844bfc6, +0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, +0xbc8d1bf10460dba0, 0x3fef01004b3a7804, +0xbc8fda52e1b51e41, 0x3fef028cf22749e4, +0x3c8e5d80813dddfc, 0x3fef041ce8e77680, +0xbc91eee26b588a35, 0x3fef05b030a1064a, +0x3c8caff9640f2dcb, 0x3fef0746ca7a67a7, +0xbc32141a7b3e2cd8, 0x3fef08e0b79a6f1f, +0x3c7a77557fd62db3, 0x3fef0a7df9285775, +0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, +0xbc651ba6128db749, 0x3fef0dc27e2cb5e5, +0xbc302899507554e5, 0x3fef0f69c3f3a207, +0xbc7c0ffefdc5e251, 0x3fef111462c95b60, +0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, +0xbc8b6cd058bfd6fa, 0x3fef1473b0468d30, +0xbc80dda2d4c0010c, 0x3fef16286141b33d, +0x3c923759b8aca76d, 0x3fef17e06ff301f4, +0x3c736eae30af0cb3, 0x3fef199bdd85529c, +0xbc895498a73dac7d, 0x3fef1b5aab23e61e, +0xbc8a007daadf8d68, 0x3fef1d1cd9fa652c, +0x3c851de924583108, 0x3fef1ee26b34e065, +0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, +0xbc8c5fe4051ba06c, 0x3fef2277b9881650, +0x3c836909391181d3, 0x3fef244778fafb22, +0xbc6d1816c0a9ac07, 0x3fef261a9f8630ad, +0x3c84e08fd10959ac, 0x3fef27f12e57d14b, +0xbc7af5c67c4e8235, 0x3fef29cb269e601f, +0xbc811cd7dbdf9547, 0x3fef2ba88988c933, +0xbc8304ef0045d575, 0x3fef2d89584661a1, +0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, +0x3c8725f94f910375, 0x3fef31553dfa8313, +0xbc7ac28b7bef6621, 0x3fef33405751c4db, +0x3c7b53e99f9191e8, 0x3fef352ee13da7cb, +0x3c676b2c6c921968, 0x3fef3720dcef9069, +0xbc810a79e6d7e2b8, 0x3fef39164b994d23, +0xbc7030587207b9e1, 0x3fef3b0f2e6d1675, +0x3c840635f6d2a9c0, 0x3fef3d0b869d8f0f, +0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, +0x3c549eeef9ec910c, 0x3fef410e9be12cb9, +0xbc8cc734592af7fc, 
0x3fef43155b5bab74, +0xbc8335827ffb9dce, 0x3fef451f95018d17, +0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, +0x3c645563980ef762, 0x3fef493e7ba2c38c, +0x3c87752a44f587e8, 0x3fef4b532b08c968, +0xbc8cd0205eb2aab2, 0x3fef4d6b596f948c, +0xbc900dae3875a949, 0x3fef4f87080d89f2, +0xbc8aab80ceab2b4a, 0x3fef51a638197a3c, +0x3c85b66fefeef52e, 0x3fef53c8eacaa1d6, +0xbc8f870f40a8ba1b, 0x3fef55ef2158a91f, +0x3c74a385a63d07a7, 0x3fef5818dcfba487, +0x3c83c119f18464c5, 0x3fef5a461eec14be, +0x3c5159d9d908a96e, 0x3fef5c76e862e6d3, +0xbc5a628c2be4e7c7, 0x3fef5eab3a99745b, +0xbc82919e2040220f, 0x3fef60e316c98398, +0xbc72550d76be719a, 0x3fef631e7e2d479d, +0x3c8c254d16117a68, 0x3fef655d71ff6075, +0xbc82090274667d12, 0x3fef679ff37adb4a, +0x3c8e5a50d5c192ac, 0x3fef69e603db3285, +0x3c75f7d28150cac4, 0x3fef6c2fa45c4dfd, +0xbc8d8c329fbd0e03, 0x3fef6e7cd63a8315, +0x3c890de9296f4cd1, 0x3fef70cd9ab294e4, +0x3c843a59ac016b4b, 0x3fef7321f301b460, +0x3c832ff9978b34bc, 0x3fef7579e065807d, +0xbc8ea6e6fbd5f2a6, 0x3fef77d5641c0658, +0xbc7303b63dda1980, 0x3fef7a347f63c159, +0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, +0xbc81f2ba385f2f95, 0x3fef7efd81a2ece1, +0xbc63e8e3eab2cbb4, 0x3fef81676b197d17, +0x3c768d9144ae12fc, 0x3fef83d4f11f8220, +0xbc892ab93b470dc9, 0x3fef864614f5a129, +0x3c853687f542403b, 0x3fef88bad7dcee90, +0xbc8b7966cd0d2cd9, 0x3fef8b333b16ee12, +0xbc736ed2de40b407, 0x3fef8daf3fe592e8, +0x3c74b604603a88d3, 0x3fef902ee78b3ff6, +0xbc614ef56c770f3b, 0x3fef92b2334ac7ee, +0xbc776caa4c2ff1cf, 0x3fef953924676d76, +0x3c8df7d1353d8e88, 0x3fef97c3bc24e350, +0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, +0xbc850bed64091b8a, 0x3fef9ce3e4933c7e, +0xbc81d5fc525d9940, 0x3fef9f7977cdb740, +0x3c89d852381c317f, 0x3fefa212b6bc3181, +0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, +0x3c68a00e3cca04c4, 0x3fefa7503ccd2be5, +0x3c855cd8aaea3d21, 0x3fefa9f4867cca6e, +0xbc5a1f25ce94cae7, 0x3fefac9c80faa594, +0xbc8dae98e223747d, 0x3fefaf482d8e67f1, +0xbc6fb5f3ee307976, 0x3fefb1f78d802dc2, +0x3c8269947c2bed4a, 0x3fefb4aaa2188510, 
+0x3c737e8ae802b851, 0x3fefb7616ca06dd6, +0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, +0x3c875119560e34af, 0x3fefbcda28a52e59, +0xbc83b6137e9afe9e, 0x3fefbf9c1cb6412a, +0xbc7431c3840929c6, 0x3fefc261cbdf5be7, +0x3c842b94c3a9eb32, 0x3fefc52b376bba97, +0xbc8cb472d2e86b99, 0x3fefc7f860a70c22, +0xbc69fa74878ba7c7, 0x3fefcac948dd7274, +0x3c83f5df2fde16a8, 0x3fefcd9df15b82ac, +0x3c8a64a931d185ee, 0x3fefd0765b6e4540, +0x3c8eef18336b62e3, 0x3fefd35288633625, +0x3c901f3a75ee0efe, 0x3fefd632798844f8, +0x3c80d23f87b50a2a, 0x3fefd916302bd526, +0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, +0x3c8302dee657c8e6, 0x3fefdee8f32a4b45, +0xbc516a9ce6ed84fa, 0x3fefe1d802243c89, +0xbc7b0caa080df170, 0x3fefe4cadbdac61d, +0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, +0x3c7617a9f2fd24e5, 0x3fefeabbf4c0ba54, +0xbc699c7db2effc76, 0x3fefedba3692d514, +0x3c75f103b8fd5ca7, 0x3feff0bc4866e8ad, +0x3c5305c14160cc89, 0x3feff3c22b8f71f1, +0x3c8e70b094fa075a, 0x3feff6cbe15f6314, +0x3c64b458677f9840, 0x3feff9d96b2a23d9, +0xbc72ec9a3e5d680a, 0x3feffceaca4391b6, +#endif +}, +}; diff --git a/contrib/arm-optimized-routines/pl/math/expf.c b/contrib/arm-optimized-routines/pl/math/expf.c new file mode 100644 index 000000000000..c325e45d5cc6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/expf.c @@ -0,0 +1,76 @@ +/* + * Single-precision e^x function. + * + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include <math.h> +#include <stdint.h> +#include "math_config.h" + +/* +EXPF_TABLE_BITS = 5 +EXPF_POLY_ORDER = 3 + +ULP error: 0.502 (nearest rounding.) +Relative error: 1.69 * 2^-34 in [-ln2/64, ln2/64] (before rounding.) +Wrong count: 170635 (all nearest rounding wrong results with fma.) 
+Non-nearest ULP error: 1 (rounded ULP error) +*/ + +#define N (1 << EXPF_TABLE_BITS) +#define InvLn2N __expf_data.invln2_scaled +#define T __expf_data.tab +#define C __expf_data.poly_scaled + +static inline uint32_t +top12 (float x) +{ + return asuint (x) >> 20; +} + +float +optr_aor_exp_f32 (float x) +{ + uint32_t abstop; + uint64_t ki, t; + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t kd, xd, z, r, r2, y, s; + + xd = (double_t) x; + abstop = top12 (x) & 0x7ff; + if (unlikely (abstop >= top12 (88.0f))) + { + /* |x| >= 88 or x is nan. */ + if (asuint (x) == asuint (-INFINITY)) + return 0.0f; + if (abstop >= top12 (INFINITY)) + return x + x; + if (x > 0x1.62e42ep6f) /* x > log(0x1p128) ~= 88.72 */ + return __math_oflowf (0); + if (x < -0x1.9fe368p6f) /* x < log(0x1p-150) ~= -103.97 */ + return __math_uflowf (0); + } + + /* x*N/Ln2 = k + r with r in [-1/2, 1/2] and int k. */ + z = InvLn2N * xd; + + /* Round and convert z to int, the result is in [-150*N, 128*N] and + ideally nearest int is used, otherwise the magnitude of r can be + bigger which gives larger approximation error. */ + kd = roundtoint (z); + ki = converttoint (z); + r = z - kd; + + /* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ + t = T[ki % N]; + t += ki << (52 - EXPF_TABLE_BITS); + s = asdouble (t); + z = C[0] * r + C[1]; + r2 = r * r; + y = C[2] * r + 1; + y = z * r2 + y; + y = y * s; + return eval_as_float (y); +} diff --git a/contrib/arm-optimized-routines/pl/math/expf_data.c b/contrib/arm-optimized-routines/pl/math/expf_data.c new file mode 100644 index 000000000000..474ad57a29a0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/expf_data.c @@ -0,0 +1,31 @@ +/* + * Coeffs and table entries for single-precision exp. Copied from + * math/exp2f_data.c, with EXP2F_TABLE_BITS == 32. + * + * Copyright (c) 2017-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << EXPF_TABLE_BITS) + +const struct expf_data __expf_data = { + /* tab[i] = uint(2^(i/N)) - (i << 52-BITS) + used for computing 2^(k/N) for an int |k| < 150 N as + double(tab[k%N] + (k << 52-BITS)) */ + .tab = { +0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, +0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, +0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, +0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, +0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, +0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, +0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, +0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, + }, + .invln2_scaled = 0x1.71547652b82fep+0 * N, + .poly_scaled = { + 0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N, + }, +}; diff --git a/contrib/arm-optimized-routines/pl/math/expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/expm1_2u5.c new file mode 100644 index 000000000000..a3faff70cb62 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/expm1_2u5.c @@ -0,0 +1,86 @@ +/* + * Double-precision e^x - 1 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "estrin.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define InvLn2 0x1.71547652b82fep0 +#define Ln2hi 0x1.62e42fefa39efp-1 +#define Ln2lo 0x1.abc9e3b39803fp-56 +#define Shift 0x1.8p52 +#define TinyBound \ + 0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */ +#define BigBound 0x1.63108c75a1937p+9 /* Above which expm1(x) overflows. 
*/ +#define NegBound -0x1.740bf7c0d927dp+9 /* Below which expm1(x) rounds to 1. */ +#define AbsMask 0x7fffffffffffffff + +#define C(i) __expm1_poly[i] + +/* Approximation for exp(x) - 1 using polynomial on a reduced interval. + The maximum error observed error is 2.17 ULP: + expm1(0x1.63f90a866748dp-2) got 0x1.a9af56603878ap-2 + want 0x1.a9af566038788p-2. */ +double +expm1 (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ax = ix & AbsMask; + + /* Tiny, +Infinity. */ + if (ax <= TinyBound || ix == 0x7ff0000000000000) + return x; + + /* +/-NaN. */ + if (ax > 0x7ff0000000000000) + return __math_invalid (x); + + /* Result is too large to be represented as a double. */ + if (x >= 0x1.63108c75a1937p+9) + return __math_oflow (0); + + /* Result rounds to -1 in double precision. */ + if (x <= NegBound) + return -1; + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + double j = fma (InvLn2, x, Shift) - Shift; + int64_t i = j; + double f = fma (j, -Ln2hi, x); + f = fma (j, -Ln2lo, f); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + double f2 = f * f; + double f4 = f2 * f2; + double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + + /* Assemble the result, using a slight rearrangement to achieve acceptable + accuracy. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^(i - 1). */ + double t = ldexp (0.5, i); + /* expm1(x) ~= 2 * (p * t + (t - 1/2)). 
*/ + return 2 * fma (p, t, t - 0.5); +} + +PL_SIG (S, D, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (expm1, 1.68) +PL_TEST_INTERVAL (expm1, 0, 0x1p-51, 1000) +PL_TEST_INTERVAL (expm1, -0, -0x1p-51, 1000) +PL_TEST_INTERVAL (expm1, 0x1p-51, 0x1.63108c75a1937p+9, 100000) +PL_TEST_INTERVAL (expm1, -0x1p-51, -0x1.740bf7c0d927dp+9, 100000) +PL_TEST_INTERVAL (expm1, 0x1.63108c75a1937p+9, inf, 100) +PL_TEST_INTERVAL (expm1, -0x1.740bf7c0d927dp+9, -inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/expm1_data.c b/contrib/arm-optimized-routines/pl/math/expm1_data.c new file mode 100644 index 000000000000..ff7426b90135 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/expm1_data.c @@ -0,0 +1,21 @@ +/* + * Coefficients for double-precision e^x - 1 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Generated using fpminimax, see tools/expm1.sollya for details. */ +const double __expm1_poly[] = {0x1p-1, + 0x1.5555555555559p-3, + 0x1.555555555554bp-5, + 0x1.111111110f663p-7, + 0x1.6c16c16c1b5f3p-10, + 0x1.a01a01affa35dp-13, + 0x1.a01a018b4ecbbp-16, + 0x1.71ddf82db5bb4p-19, + 0x1.27e517fc0d54bp-22, + 0x1.af5eedae67435p-26, + 0x1.1f143d060a28ap-29}; diff --git a/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c new file mode 100644 index 000000000000..70b14e48519d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/expm1f_1u6.c @@ -0,0 +1,80 @@ +/* + * Single-precision e^x - 1 function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "hornerf.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Shift (0x1.8p23f) +#define InvLn2 (0x1.715476p+0f) +#define Ln2hi (0x1.62e4p-1f) +#define Ln2lo (0x1.7f7d1cp-20f) +#define AbsMask (0x7fffffff) +#define InfLimit \ + (0x1.644716p6) /* Smallest value of x for which expm1(x) overflows. */ +#define NegLimit \ + (-0x1.9bbabcp+6) /* Largest value of x for which expm1(x) rounds to 1. */ + +#define C(i) __expm1f_poly[i] + +/* Approximation for exp(x) - 1 using polynomial on a reduced interval. + The maximum error is 1.51 ULP: + expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2 + want 0x1.e2fb94p-2. */ +float +expm1f (float x) +{ + uint32_t ix = asuint (x); + uint32_t ax = ix & AbsMask; + + /* Tiny: |x| < 0x1p-23. expm1(x) is closely approximated by x. + Inf: x == +Inf => expm1(x) = x. */ + if (ax <= 0x34000000 || (ix == 0x7f800000)) + return x; + + /* +/-NaN. */ + if (ax > 0x7f800000) + return __math_invalidf (x); + + if (x >= InfLimit) + return __math_oflowf (0); + + if (x <= NegLimit || ix == 0xff800000) + return -1; + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + float j = fmaf (InvLn2, x, Shift) - Shift; + int32_t i = j; + float f = fmaf (j, -Ln2hi, x); + f = fmaf (j, -Ln2lo, f); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + float p = fmaf (f * f, HORNER_4 (f, C), f); + /* Assemble the result, using a slight rearrangement to achieve acceptable + accuracy. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^(i - 1). */ + float t = ldexpf (0.5f, i); + /* expm1(x) ~= 2 * (p * t + (t - 1/2)). 
*/ + return 2 * fmaf (p, t, t - 0.5f); +} + +PL_SIG (S, F, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (expm1f, 1.02) +PL_TEST_INTERVAL (expm1f, 0, 0x1p-23, 1000) +PL_TEST_INTERVAL (expm1f, -0, -0x1p-23, 1000) +PL_TEST_INTERVAL (expm1f, 0x1p-23, 0x1.644716p6, 100000) +PL_TEST_INTERVAL (expm1f, -0x1p-23, -0x1.9bbabcp+6, 100000) diff --git a/contrib/arm-optimized-routines/pl/math/expm1f_data.c b/contrib/arm-optimized-routines/pl/math/expm1f_data.c new file mode 100644 index 000000000000..9d02dc448ebb --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/expm1f_data.c @@ -0,0 +1,12 @@ +/* + * Coefficients for single-precision e^x - 1 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Generated using fpminimax, see tools/expm1f.sollya for details. */ +const float __expm1f_poly[] = {0x1.fffffep-2, 0x1.5554aep-3, 0x1.555736p-5, + 0x1.12287cp-7, 0x1.6b55a2p-10}; diff --git a/contrib/arm-optimized-routines/pl/math/horner.h b/contrib/arm-optimized-routines/pl/math/horner.h new file mode 100644 index 000000000000..f92ab6752110 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/horner.h @@ -0,0 +1,14 @@ +/* + * Helper macros for single-precision Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f64 +#else +#define FMA fma +#endif + +#include "horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/horner_wrap.h b/contrib/arm-optimized-routines/pl/math/horner_wrap.h new file mode 100644 index 000000000000..6478968db913 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/horner_wrap.h @@ -0,0 +1,34 @@ +/* + * Helper macros for Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +// clang-format off +#define HORNER_1_(x, c, i) FMA(c(i + 1), x, c(i)) +#define HORNER_2_(x, c, i) FMA(HORNER_1_ (x, c, i + 1), x, c(i)) +#define HORNER_3_(x, c, i) FMA(HORNER_2_ (x, c, i + 1), x, c(i)) +#define HORNER_4_(x, c, i) FMA(HORNER_3_ (x, c, i + 1), x, c(i)) +#define HORNER_5_(x, c, i) FMA(HORNER_4_ (x, c, i + 1), x, c(i)) +#define HORNER_6_(x, c, i) FMA(HORNER_5_ (x, c, i + 1), x, c(i)) +#define HORNER_7_(x, c, i) FMA(HORNER_6_ (x, c, i + 1), x, c(i)) +#define HORNER_8_(x, c, i) FMA(HORNER_7_ (x, c, i + 1), x, c(i)) +#define HORNER_9_(x, c, i) FMA(HORNER_8_ (x, c, i + 1), x, c(i)) +#define HORNER_10_(x, c, i) FMA(HORNER_9_ (x, c, i + 1), x, c(i)) +#define HORNER_11_(x, c, i) FMA(HORNER_10_(x, c, i + 1), x, c(i)) +#define HORNER_12_(x, c, i) FMA(HORNER_11_(x, c, i + 1), x, c(i)) + +#define HORNER_1(x, c) HORNER_1_ (x, c, 0) +#define HORNER_2(x, c) HORNER_2_ (x, c, 0) +#define HORNER_3(x, c) HORNER_3_ (x, c, 0) +#define HORNER_4(x, c) HORNER_4_ (x, c, 0) +#define HORNER_5(x, c) HORNER_5_ (x, c, 0) +#define HORNER_6(x, c) HORNER_6_ (x, c, 0) +#define HORNER_7(x, c) HORNER_7_ (x, c, 0) +#define HORNER_8(x, c) HORNER_8_ (x, c, 0) +#define HORNER_9(x, c) HORNER_9_ (x, c, 0) +#define HORNER_10(x, c) HORNER_10_(x, c, 0) +#define HORNER_11(x, c) HORNER_11_(x, c, 0) +#define HORNER_12(x, c) HORNER_12_(x, c, 0) +// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/hornerf.h b/contrib/arm-optimized-routines/pl/math/hornerf.h new file mode 100644 index 000000000000..0703817b0fbb --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/hornerf.h @@ -0,0 +1,14 @@ +/* + * Helper macros for double-precision Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f32 +#else +#define FMA fmaf +#endif + +#include "horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/include/mathlib.h b/contrib/arm-optimized-routines/pl/math/include/mathlib.h new file mode 100644 index 000000000000..af5f9f9c6afb --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/include/mathlib.h @@ -0,0 +1,244 @@ +// clang-format off +/* + * Public API. + * + * Copyright (c) 2015-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _MATHLIB_H +#define _MATHLIB_H + +float acoshf (float); +float asinhf (float); +float atan2f (float, float); +float atanf (float); +float atanhf (float); +float cbrtf (float); +float coshf (float); +float erfcf (float); +float erff (float); +float expm1f (float); +float log10f (float); +float log1pf (float); +float sinhf (float); +float tanf (float); +float tanhf (float); + +double acosh (double); +double asinh (double); +double atan (double); +double atan2 (double, double); +double atanh (double); +double cbrt (double); +double cosh (double); +double erfc (double); +double expm1 (double); +double log10 (double); +double log1p (double); +double sinh (double); +double tanh (double); + +float __s_acoshf (float); +float __s_asinhf (float); +float __s_atanf (float); +float __s_atan2f (float, float); +float __s_atanhf (float); +float __s_cbrtf (float); +float __s_coshf (float); +float __s_erfcf (float); +float __s_erff (float); +float __s_expm1f (float); +float __s_log10f (float); +float __s_log1pf (float); +float __s_log2f (float); +float __s_sinhf (float); +float __s_tanf (float); +float __s_tanhf (float); + +double __s_acosh (double); +double __s_asinh (double); +double __s_atan (double); +double __s_atan2 (double, double); +double __s_atanh (double); +double __s_cbrt (double); +double __s_cosh (double); +double __s_erf (double); +double __s_erfc (double); 
+double __s_expm1 (double); +double __s_log10 (double); +double __s_log1p (double); +double __s_log2 (double); +double __s_sinh (double); +double __s_tan (double); +double __s_tanh (double); + +#if __aarch64__ +#if __GNUC__ >= 5 +typedef __Float32x4_t __f32x4_t; +typedef __Float64x2_t __f64x2_t; +#elif __clang_major__*100+__clang_minor__ >= 305 +typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t; +typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t; +#else +#error Unsupported compiler +#endif + +/* Vector functions following the base PCS. */ +__f32x4_t __v_acoshf (__f32x4_t); +__f64x2_t __v_acosh (__f64x2_t); +__f32x4_t __v_asinhf (__f32x4_t); +__f64x2_t __v_asinh (__f64x2_t); +__f32x4_t __v_atanf (__f32x4_t); +__f64x2_t __v_atan (__f64x2_t); +__f32x4_t __v_atan2f (__f32x4_t, __f32x4_t); +__f64x2_t __v_atan2 (__f64x2_t, __f64x2_t); +__f32x4_t __v_atanhf (__f32x4_t); +__f64x2_t __v_atanh (__f64x2_t); +__f32x4_t __v_cbrtf (__f32x4_t); +__f64x2_t __v_cbrt (__f64x2_t); +__f32x4_t __v_coshf (__f32x4_t); +__f64x2_t __v_cosh (__f64x2_t); +__f32x4_t __v_erff (__f32x4_t); +__f64x2_t __v_erf (__f64x2_t); +__f32x4_t __v_erfcf (__f32x4_t); +__f64x2_t __v_erfc (__f64x2_t); +__f32x4_t __v_expm1f (__f32x4_t); +__f64x2_t __v_expm1 (__f64x2_t); +__f32x4_t __v_log10f (__f32x4_t); +__f64x2_t __v_log10 (__f64x2_t); +__f32x4_t __v_log1pf (__f32x4_t); +__f64x2_t __v_log1p (__f64x2_t); +__f32x4_t __v_log2f (__f32x4_t); +__f64x2_t __v_log2 (__f64x2_t); +__f32x4_t __v_sinhf (__f32x4_t); +__f64x2_t __v_sinh (__f64x2_t); +__f32x4_t __v_tanf (__f32x4_t); +__f64x2_t __v_tan (__f64x2_t); +__f32x4_t __v_tanhf (__f32x4_t); +__f64x2_t __v_tanh (__f64x2_t); + +#if __GNUC__ >= 9 || __clang_major__ >= 8 +#define __vpcs __attribute__((__aarch64_vector_pcs__)) + +/* Vector functions following the vector PCS. 
*/ +__vpcs __f32x4_t __vn_acoshf (__f32x4_t); +__vpcs __f64x2_t __vn_acosh (__f64x2_t); +__vpcs __f32x4_t __vn_asinhf (__f32x4_t); +__vpcs __f64x2_t __vn_asinh (__f64x2_t); +__vpcs __f32x4_t __vn_atanf (__f32x4_t); +__vpcs __f64x2_t __vn_atan (__f64x2_t); +__vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t); +__vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t); +__vpcs __f32x4_t __vn_atanhf (__f32x4_t); +__vpcs __f64x2_t __vn_atanh (__f64x2_t); +__vpcs __f32x4_t __vn_cbrtf (__f32x4_t); +__vpcs __f64x2_t __vn_cbrt (__f64x2_t); +__vpcs __f32x4_t __vn_coshf (__f32x4_t); +__vpcs __f64x2_t __vn_cosh (__f64x2_t); +__vpcs __f32x4_t __vn_erff (__f32x4_t); +__vpcs __f64x2_t __vn_erf (__f64x2_t); +__vpcs __f32x4_t __vn_erfcf (__f32x4_t); +__vpcs __f64x2_t __vn_erfc (__f64x2_t); +__vpcs __f32x4_t __vn_expm1f (__f32x4_t); +__vpcs __f64x2_t __vn_expm1 (__f64x2_t); +__vpcs __f32x4_t __vn_log10f (__f32x4_t); +__vpcs __f64x2_t __vn_log10 (__f64x2_t); +__vpcs __f32x4_t __vn_log1pf (__f32x4_t); +__vpcs __f64x2_t __vn_log1p (__f64x2_t); +__vpcs __f32x4_t __vn_log2f (__f32x4_t); +__vpcs __f64x2_t __vn_log2 (__f64x2_t); +__vpcs __f32x4_t __vn_sinhf (__f32x4_t); +__vpcs __f64x2_t __vn_sinh (__f64x2_t); +__vpcs __f32x4_t __vn_tanf (__f32x4_t); +__vpcs __f64x2_t __vn_tan (__f64x2_t); +__vpcs __f32x4_t __vn_tanhf (__f32x4_t); +__vpcs __f64x2_t __vn_tanh (__f64x2_t); + +/* Vector functions following the vector PCS using ABI names. 
*/ +__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); +__vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t); +__vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t); +__vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t); + +#endif + +#if WANT_SVE_MATH +#include <arm_sve.h> +svfloat32_t __sv_atan2f_x (svfloat32_t, svfloat32_t, svbool_t); +svfloat32_t __sv_atanf_x (svfloat32_t, svbool_t); +svfloat64_t __sv_atan_x (svfloat64_t, svbool_t); +svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t); +svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t); +svfloat64_t __sv_cos_x (svfloat64_t, svbool_t); +svfloat32_t __sv_erff_x (svfloat32_t, svbool_t); +svfloat64_t __sv_erf_x (svfloat64_t, svbool_t); +svfloat64_t __sv_erfc_x 
(svfloat64_t, svbool_t); +svfloat32_t __sv_expf_x (svfloat32_t, svbool_t); +svfloat32_t __sv_logf_x (svfloat32_t, svbool_t); +svfloat64_t __sv_log_x (svfloat64_t, svbool_t); +svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t); +svfloat64_t __sv_log10_x (svfloat64_t, svbool_t); +svfloat32_t __sv_log2f_x (svfloat32_t, svbool_t); +svfloat64_t __sv_log2_x (svfloat64_t, svbool_t); +svfloat32_t __sv_powif_x (svfloat32_t, svint32_t, svbool_t); +svfloat64_t __sv_powi_x (svfloat64_t, svint64_t, svbool_t); +svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t); +svfloat64_t __sv_sin_x (svfloat64_t, svbool_t); +svfloat32_t __sv_tanf_x (svfloat32_t, svbool_t); +/* SVE ABI names. */ +svfloat32_t _ZGVsMxvv_atan2f (svfloat32_t, svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_erf (svfloat64_t, svbool_t); +svfloat64_t _ZGVsMxv_erfc (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t); +svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_log2f (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_log2 (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxvv_powi(svfloat32_t, svint32_t, svbool_t); +svfloat64_t _ZGVsMxvv_powk(svfloat64_t, svint64_t, svbool_t); +svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t); +svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t); +svfloat32_t _ZGVsMxv_tanf (svfloat32_t, svbool_t); +#endif + +#endif + +#endif +// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/include/pl_test.h b/contrib/arm-optimized-routines/pl/math/include/pl_test.h new 
file mode 100644 index 000000000000..6a81360ba287 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/include/pl_test.h @@ -0,0 +1,26 @@ +/* + * PL macros to aid testing. This version of this file is used for building the + * routine, not the tests. Separate definitions are found in test/pl_test.h + * which emit test parameters. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. + */ + +/* Emit max ULP threshold - silenced for building the routine. */ +#define PL_TEST_ULP(f, l) + +/* Emit alias. The PL_TEST_ALIAS declaration is piggy-backed on top of + strong_alias. Use PL_ALIAS instead of strong_alias to make sure the alias is + also added to the test suite. */ +#define PL_ALIAS(a, b) strong_alias (a, b) + +/* Emit routine name if e == 1 and f is expected to correctly trigger fenv + exceptions. e allows declaration to be emitted conditionally upon certain + build flags - defer expansion by one pass to allow those flags to be expanded + properly. */ +#define PL_TEST_EXPECT_FENV(f, e) +#define PL_TEST_EXPECT_FENV_ALWAYS(f) + +#define PL_TEST_INTERVAL(f, lo, hi, n) +#define PL_TEST_INTERVAL_C(f, lo, hi, n, c) diff --git a/contrib/arm-optimized-routines/pl/math/log.c b/contrib/arm-optimized-routines/pl/math/log.c new file mode 100644 index 000000000000..40b0441d981d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log.c @@ -0,0 +1,161 @@ +/* + * Double-precision log(x) function. + * + * Copyright (c) 2018-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include <float.h> +#include <math.h> +#include <stdint.h> +#include "math_config.h" + +#define T __log_data.tab +#define T2 __log_data.tab2 +#define B __log_data.poly1 +#define A __log_data.poly +#define Ln2hi __log_data.ln2hi +#define Ln2lo __log_data.ln2lo +#define N (1 << LOG_TABLE_BITS) +#define OFF 0x3fe6000000000000 + +/* Top 16 bits of a double. 
*/ +static inline uint32_t +top16 (double x) +{ + return asuint64 (x) >> 48; +} + +double +optr_aor_log_f64 (double x) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo; + uint64_t ix, iz, tmp; + uint32_t top; + int k, i; + + ix = asuint64 (x); + top = top16 (x); + +#if LOG_POLY1_ORDER == 10 || LOG_POLY1_ORDER == 11 +#define LO asuint64 (1.0 - 0x1p-5) +#define HI asuint64 (1.0 + 0x1.1p-5) +#elif LOG_POLY1_ORDER == 12 +#define LO asuint64 (1.0 - 0x1p-4) +#define HI asuint64 (1.0 + 0x1.09p-4) +#endif + if (unlikely (ix - LO < HI - LO)) + { + /* Handle close to 1.0 inputs separately. */ + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) + return 0; + r = x - 1.0; + r2 = r * r; + r3 = r * r2; +#if LOG_POLY1_ORDER == 10 + /* Worst-case error is around 0.516 ULP. */ + y = r3 + * (B[1] + r * B[2] + r2 * B[3] + + r3 * (B[4] + r * B[5] + r2 * B[6] + r3 * (B[7] + r * B[8]))); + w = B[0] * r2; /* B[0] == -0.5. */ + hi = r + w; + y += r - hi + w; + y += hi; +#elif LOG_POLY1_ORDER == 11 + /* Worst-case error is around 0.516 ULP. */ + y = r3 + * (B[1] + r * B[2] + + r2 + * (B[3] + r * B[4] + r2 * B[5] + + r3 * (B[6] + r * B[7] + r2 * B[8] + r3 * B[9]))); + w = B[0] * r2; /* B[0] == -0.5. */ + hi = r + w; + y += r - hi + w; + y += hi; +#elif LOG_POLY1_ORDER == 12 + y = r3 + * (B[1] + r * B[2] + r2 * B[3] + + r3 + * (B[4] + r * B[5] + r2 * B[6] + + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); +#if N <= 64 + /* Worst-case error is around 0.532 ULP. */ + w = B[0] * r2; /* B[0] == -0.5. */ + hi = r + w; + y += r - hi + w; + y += hi; +#else + /* Worst-case error is around 0.507 ULP. */ + w = r * 0x1p27; + double_t rhi = r + w - w; + double_t rlo = r - rhi; + w = rhi * rhi * B[0]; /* B[0] == -0.5. 
*/ + hi = r + w; + lo = r - hi + w; + lo += B[0] * rlo * (rhi + r); + y += lo; + y += hi; +#endif +#endif + return eval_as_double (y); + } + if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) + { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzero (1); + if (ix == asuint64 (INFINITY)) /* log(inf) == inf. */ + return x; + if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) + return __math_invalid (x); + /* x is subnormal, normalize it. */ + ix = asuint64 (x * 0x1p52); + ix -= 52ULL << 52; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - LOG_TABLE_BITS)) % N; + k = (int64_t) tmp >> 52; /* arithmetic shift */ + iz = ix - (tmp & 0xfffULL << 52); + invc = T[i].invc; + logc = T[i].logc; + z = asdouble (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + /* r ~= z/c - 1, |r| < 1/(2*N). */ +#if HAVE_FAST_FMA + /* rounding error: 0x1p-55/N. */ + r = fma (z, invc, -1.0); +#else + /* rounding error: 0x1p-55/N + 0x1p-66. */ + r = (z - T2[i].chi - T2[i].clo) * invc; +#endif + kd = (double_t) k; + + /* hi + lo = r + log(c) + k*Ln2. */ + w = kd * Ln2hi + logc; + hi = w + r; + lo = w - hi + r + kd * Ln2lo; + + /* log(x) = lo + (log1p(r) - r) + hi. */ + r2 = r * r; /* rounding error: 0x1p-54/N^2. */ + /* Worst case error if |y| > 0x1p-5: + 0.5 + 4.13/N + abs-poly-error*2^57 ULP (+ 0.002 ULP without fma) + Worst case error if |y| > 0x1p-4: + 0.5 + 2.06/N + abs-poly-error*2^56 ULP (+ 0.001 ULP without fma). 
*/ +#if LOG_POLY_ORDER == 6 + y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi; +#elif LOG_POLY_ORDER == 7 + y = lo + + r2 + * (A[0] + r * A[1] + r2 * (A[2] + r * A[3]) + + r2 * r2 * (A[4] + r * A[5])) + + hi; +#endif + return eval_as_double (y); +} diff --git a/contrib/arm-optimized-routines/pl/math/log10_2u.c b/contrib/arm-optimized-routines/pl/math/log10_2u.c new file mode 100644 index 000000000000..74828ea9ef3c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log10_2u.c @@ -0,0 +1,150 @@ +/* + * Double-precision log10(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +/* Polynomial coefficients and lookup tables. */ +#define T __log10_data.tab +#define T2 __log10_data.tab2 +#define B __log10_data.poly1 +#define A __log10_data.poly +#define Ln2hi __log10_data.ln2hi +#define Ln2lo __log10_data.ln2lo +#define InvLn10 __log10_data.invln10 +#define N (1 << LOG10_TABLE_BITS) +#define OFF 0x3fe6000000000000 +#define LO asuint64 (1.0 - 0x1p-4) +#define HI asuint64 (1.0 + 0x1.09p-4) + +/* Top 16 bits of a double. */ +static inline uint32_t +top16 (double x) +{ + return asuint64 (x) >> 48; +} + +/* Fast and low accuracy implementation of log10. + The implementation is similar to that of math/log, except that: + - Polynomials are computed for log10(1+r) with r on same intervals as log. + - Lookup parameters are scaled (at runtime) to switch from base e to base 10. + Many errors above 1.59 ulp are observed across the whole range of doubles. + The greatest observed error is 1.61 ulp, at around 0.965: + log10(0x1.dc8710333a29bp-1) got -0x1.fee26884905a6p-6 + want -0x1.fee26884905a8p-6. */ +double +log10 (double x) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. 
*/ + double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo; + uint64_t ix, iz, tmp; + uint32_t top; + int k, i; + + ix = asuint64 (x); + top = top16 (x); + + if (unlikely (ix - LO < HI - LO)) + { + /* Handle close to 1.0 inputs separately. */ + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && unlikely (ix == asuint64 (1.0))) + return 0; + r = x - 1.0; + r2 = r * r; + r3 = r * r2; + y = r3 + * (B[1] + r * B[2] + r2 * B[3] + + r3 + * (B[4] + r * B[5] + r2 * B[6] + + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); + /* Worst-case error is around 0.507 ULP. */ + w = r * 0x1p27; + double_t rhi = r + w - w; + double_t rlo = r - rhi; + w = rhi * rhi * B[0]; + hi = r + w; + lo = r - hi + w; + lo += B[0] * rlo * (rhi + r); + y += lo; + y += hi; + /* Scale by 1/ln(10). Polynomial already contains scaling. */ + y = y * InvLn10; + + return eval_as_double (y); + } + if (unlikely (top - 0x0010 >= 0x7ff0 - 0x0010)) + { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzero (1); + if (ix == asuint64 (INFINITY)) /* log10(inf) == inf. */ + return x; + if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) + return __math_invalid (x); + /* x is subnormal, normalize it. */ + ix = asuint64 (x * 0x1p52); + ix -= 52ULL << 52; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - LOG10_TABLE_BITS)) % N; + k = (int64_t) tmp >> 52; /* arithmetic shift. */ + iz = ix - (tmp & 0xfffULL << 52); + invc = T[i].invc; + logc = T[i].logc; + z = asdouble (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + /* r ~= z/c - 1, |r| < 1/(2*N). */ +#if HAVE_FAST_FMA + /* rounding error: 0x1p-55/N. */ + r = fma (z, invc, -1.0); +#else + /* rounding error: 0x1p-55/N + 0x1p-66. */ + r = (z - T2[i].chi - T2[i].clo) * invc; +#endif + kd = (double_t) k; + + /* w = log(c) + k*Ln2hi. 
*/ + w = kd * Ln2hi + logc; + hi = w + r; + lo = w - hi + r + kd * Ln2lo; + + /* log10(x) = (w + r)/log(10) + (log10(1+r) - r/log(10)). */ + r2 = r * r; /* rounding error: 0x1p-54/N^2. */ + + /* Scale by 1/ln(10). Polynomial already contains scaling. */ + y = lo + r2 * A[0] + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi; + y = y * InvLn10; + + return eval_as_double (y); +} + +// clang-format off +#if USE_GLIBC_ABI +strong_alias (log10, __log10_finite) +hidden_alias (log10, __ieee754_log10) +#if LDBL_MANT_DIG == 53 +long double +log10l (long double x) +{ + return log10 (x); +} +#endif +#endif +// clang-format on + +PL_SIG (S, D, 1, log10, 0.01, 11.1) +PL_TEST_ULP (log10, 1.11) +PL_TEST_INTERVAL (log10, 0, 0xffff000000000000, 10000) +PL_TEST_INTERVAL (log10, 0x1p-4, 0x1p4, 40000) +PL_TEST_INTERVAL (log10, 0, inf, 40000) diff --git a/contrib/arm-optimized-routines/pl/math/log10_data.c b/contrib/arm-optimized-routines/pl/math/log10_data.c new file mode 100644 index 000000000000..9976f19cd6df --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log10_data.c @@ -0,0 +1,337 @@ +/* + * Data for log10. + * + * Copyright (c) 2020-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << LOG10_TABLE_BITS) + +const struct log10_data __log10_data = { +.ln2hi = 0x1.62e42fefa3800p-1, +.ln2lo = 0x1.ef35793c76730p-45, +.invln10 = 0x1.bcb7b1526e50ep-2, +.poly1 = { +#if LOG10_POLY1_ORDER == 12 +// relative error: 0x1.c04d76cp-63 +// in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval) +-0x1p-1, +0x1.5555555555577p-2, +-0x1.ffffffffffdcbp-3, +0x1.999999995dd0cp-3, +-0x1.55555556745a7p-3, +0x1.24924a344de3p-3, +-0x1.fffffa4423d65p-4, +0x1.c7184282ad6cap-4, +-0x1.999eb43b068ffp-4, +0x1.78182f7afd085p-4, +-0x1.5521375d145cdp-4, +#endif +}, +.poly = { +#if N == 128 && LOG10_POLY_ORDER == 6 +// relative error: 0x1.926199e8p-56 +// abs error: 0x1.882ff33p-65 +// in -0x1.fp-9 0x1.fp-9 +-0x1.0000000000001p-1, +0x1.555555551305bp-2, +-0x1.fffffffeb459p-3, +0x1.999b324f10111p-3, +-0x1.55575e506c89fp-3, +#endif +}, +/* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + log(z/c) + log(z/c) = poly(z/c - 1) + +where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls +into the ith one, then table entries are computed as + + tab[i].invc = 1/c + tab[i].logc = (double)log(c) + tab2[i].chi = (double)c + tab2[i].clo = (double)(c - (double)c) + +where c is near the center of the subinterval and is chosen by trying +-2^29 +floating point invc candidates around 1/center and selecting one for which + + 1) the rounding error in 0x1.8p9 + logc is 0, + 2) the rounding error in z - chi - clo is < 0x1p-66 and + 3) the rounding error in (double)log(c) is minimized (< 0x1p-66). 
+ +Note: 1) ensures that k*ln2hi + logc can be computed without rounding error, +2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to +a single rounding error when there is no fast fma for z*invc - 1, 3) ensures +that logc + poly(z/c - 1) has small error, however near x == 1 when +|log(x)| < 0x1p-4, this is not enough so that is special cased. */ +.tab = { +#if N == 128 +{0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2}, +{0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2}, +{0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2}, +{0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2}, +{0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2}, +{0x1.69147332f0cbap+0, -0x1.602d076180000p-2}, +{0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2}, +{0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2}, +{0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2}, +{0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2}, +{0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2}, +{0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2}, +{0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2}, +{0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2}, +{0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2}, +{0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2}, +{0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2}, +{0x1.52aff42064583p+0, -0x1.1e9e129279000p-2}, +{0x1.50f22dbb2bddfp+0, -0x1.1956d2b48f800p-2}, +{0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2}, +{0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2}, +{0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2}, +{0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2}, +{0x1.4880524d48434p+0, -0x1.feb224586f000p-3}, +{0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3}, +{0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3}, +{0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3}, +{0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3}, +{0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3}, +{0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3}, +{0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3}, +{0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3}, +{0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3}, 
+{0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3}, +{0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3}, +{0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3}, +{0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3}, +{0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3}, +{0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3}, +{0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3}, +{0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3}, +{0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3}, +{0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3}, +{0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3}, +{0x1.293726014b530p+0, -0x1.31b996b490000p-3}, +{0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3}, +{0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3}, +{0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3}, +{0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3}, +{0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3}, +{0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4}, +{0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4}, +{0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4}, +{0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4}, +{0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4}, +{0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4}, +{0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4}, +{0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4}, +{0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4}, +{0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4}, +{0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4}, +{0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4}, +{0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4}, +{0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4}, +{0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5}, +{0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5}, +{0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5}, +{0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5}, +{0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5}, +{0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5}, +{0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5}, +{0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5}, +{0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6}, +{0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6}, +{0x1.059eea5ecf92cp+0, 
-0x1.63d62cf818000p-6}, +{0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6}, +{0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7}, +{0x1.02865137932a9p+0, -0x1.419355daa0000p-7}, +{0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8}, +{0x1.008040614b195p+0, -0x1.0040979240000p-9}, +{0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9}, +{0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7}, +{0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6}, +{0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6}, +{0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5}, +{0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5}, +{0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5}, +{0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5}, +{0x1.e01e009609a56p-1, 0x1.07598e598c000p-4}, +{0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4}, +{0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4}, +{0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4}, +{0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4}, +{0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4}, +{0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4}, +{0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4}, +{0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4}, +{0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3}, +{0x1.bf583eeece73fp-1, 0x1.147858292b000p-3}, +{0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3}, +{0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3}, +{0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3}, +{0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3}, +{0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3}, +{0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3}, +{0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3}, +{0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3}, +{0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3}, +{0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3}, +{0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3}, +{0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3}, +{0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3}, +{0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3}, +{0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3}, +{0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3}, +{0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3}, +{0x1.8e6528effd79dp-1, 
0x1.00e6c121fb800p-2}, +{0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2}, +{0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2}, +{0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2}, +{0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2}, +{0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2}, +{0x1.8060195f40260p-1, 0x1.2595fd7636800p-2}, +{0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2}, +{0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2}, +{0x1.79baa679725c2p-1, 0x1.377266dec1800p-2}, +{0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2}, +{0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2}, +#endif +}, +#if !HAVE_FAST_FMA +.tab2 = { +#if N == 128 +{0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56}, +{0x1.63000034db495p-1, 0x1.dbfea48005d41p-55}, +{0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55}, +{0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57}, +{0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56}, +{0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55}, +{0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55}, +{0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56}, +{0x1.710000e86978p-1, 0x1.bff6671097952p-56}, +{0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55}, +{0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57}, +{0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57}, +{0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55}, +{0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56}, +{0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55}, +{0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55}, +{0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55}, +{0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55}, +{0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55}, +{0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55}, +{0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55}, +{0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56}, +{0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55}, +{0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55}, +{0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55}, +{0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56}, +{0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55}, +{0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56}, +{0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55}, 
+{0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55}, +{0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60}, +{0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55}, +{0x1.a10001145b006p-1, 0x1.4ff489958da56p-56}, +{0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55}, +{0x1.a500010971d79p-1, 0x1.8fecadd78793p-55}, +{0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55}, +{0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55}, +{0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57}, +{0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55}, +{0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57}, +{0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58}, +{0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56}, +{0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56}, +{0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55}, +{0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56}, +{0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57}, +{0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57}, +{0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55}, +{0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55}, +{0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57}, +{0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55}, +{0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55}, +{0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56}, +{0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57}, +{0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55}, +{0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55}, +{0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56}, +{0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55}, +{0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58}, +{0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56}, +{0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56}, +{0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55}, +{0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55}, +{0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57}, +{0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56}, +{0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56}, +{0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56}, +{0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58}, +{0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55}, +{0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56}, 
+{0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58}, +{0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55}, +{0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59}, +{0x1.f30001ee58ac7p-1, 0x1.a81acf2731155p-55}, +{0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55}, +{0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57}, +{0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56}, +{0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57}, +{0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56}, +{0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57}, +{0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55}, +{0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54}, +{0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54}, +{0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55}, +{0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57}, +{0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54}, +{0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55}, +{0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56}, +{0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55}, +{0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54}, +{0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54}, +{0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55}, +{0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54}, +{0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54}, +{0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57}, +{0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54}, +{0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54}, +{0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54}, +{0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56}, +{0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56}, +{0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56}, +{0x1.2b00014556313p+0, -0x1.2808233f21f02p-54}, +{0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55}, +{0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55}, +{0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55}, +{0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54}, +{0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54}, +{0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55}, +{0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54}, +{0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55}, +{0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56}, 
+{0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54}, +{0x1.410001532aff4p+0, 0x1.7f8375f198524p-57}, +{0x1.4300017478b29p+0, 0x1.301e672dc5143p-55}, +{0x1.44fffe795b463p+0, 0x1.9ff69b8b2895ap-55}, +{0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54}, +{0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54}, +{0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54}, +{0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54}, +{0x1.4f0000ff03ee2p+0, 0x1.77f890b85531cp-54}, +{0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57}, +{0x1.530001605277ap+0, -0x1.6bfcece233209p-54}, +{0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55}, +{0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54}, +{0x1.5900017e61012p+0, 0x1.87ec581afef9p-55}, +{0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54}, +{0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54}, +{0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54}, +#endif +}, +#endif /* !HAVE_FAST_FMA */ +}; diff --git a/contrib/arm-optimized-routines/pl/math/log10f.c b/contrib/arm-optimized-routines/pl/math/log10f.c new file mode 100644 index 000000000000..5c80008e4e57 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log10f.c @@ -0,0 +1,97 @@ +/* + * Single-precision log10 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include <math.h> +#include <stdint.h> + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +/* Data associated to logf: + + LOGF_TABLE_BITS = 4 + LOGF_POLY_ORDER = 4 + + ULP error: 0.818 (nearest rounding.) + Relative error: 1.957 * 2^-26 (before rounding.). */ + +#define T __logf_data.tab +#define A __logf_data.poly +#define Ln2 __logf_data.ln2 +#define InvLn10 __logf_data.invln10 +#define N (1 << LOGF_TABLE_BITS) +#define OFF 0x3f330000 + +/* This naive implementation of log10f mimics that of log + then simply scales the result by 1/log(10) to switch from base e to + base 10. Hence, most computations are carried out in double precision. 
+ Scaling before rounding to single precision is both faster and more accurate. + + ULP error: 0.797 ulp (nearest rounding.). */ +float +log10f (float x) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t z, r, r2, y, y0, invc, logc; + uint32_t ix, iz, tmp; + int k, i; + + ix = asuint (x); +#if WANT_ROUNDING + /* Fix sign of zero with downward rounding when x==1. */ + if (unlikely (ix == 0x3f800000)) + return 0; +#endif + if (unlikely (ix - 0x00800000 >= 0x7f800000 - 0x00800000)) + { + /* x < 0x1p-126 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzerof (1); + if (ix == 0x7f800000) /* log(inf) == inf. */ + return x; + if ((ix & 0x80000000) || ix * 2 >= 0xff000000) + return __math_invalidf (x); + /* x is subnormal, normalize it. */ + ix = asuint (x * 0x1p23f); + ix -= 23 << 23; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; + k = (int32_t) tmp >> 23; /* arithmetic shift. */ + iz = ix - (tmp & 0xff800000); + invc = T[i].invc; + logc = T[i].logc; + z = (double_t) asfloat (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + r = z * invc - 1; + y0 = logc + (double_t) k * Ln2; + + /* Pipelined polynomial evaluation to approximate log1p(r). */ + r2 = r * r; + y = A[1] * r + A[2]; + y = A[0] * r2 + y; + y = y * r2 + (y0 + r); + + /* Multiply by 1/log(10). 
*/ + y = y * InvLn10; + + return eval_as_float (y); +} + +PL_SIG (S, F, 1, log10, 0.01, 11.1) +PL_TEST_ULP (log10f, 0.30) +PL_TEST_INTERVAL (log10f, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (log10f, 0x1p-127, 0x1p-26, 50000) +PL_TEST_INTERVAL (log10f, 0x1p-26, 0x1p3, 50000) +PL_TEST_INTERVAL (log10f, 0x1p-4, 0x1p4, 50000) +PL_TEST_INTERVAL (log10f, 0, inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/log1p_2u.c b/contrib/arm-optimized-routines/pl/math/log1p_2u.c new file mode 100644 index 000000000000..23c8ed4a1914 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log1p_2u.c @@ -0,0 +1,136 @@ +/* + * Double-precision log(1+x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "estrin.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Ln2Hi 0x1.62e42fefa3800p-1 +#define Ln2Lo 0x1.ef35793c76730p-45 +#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */ +#define OneMHfRt2Top \ + 0x00095f62 /* top32(asuint64(1)) - top32(asuint64(sqrt(2)/2)). */ +#define OneTop12 0x3ff +#define BottomMask 0xffffffff +#define OneMHfRt2 0x3fd2bec333018866 +#define Rt2MOne 0x3fda827999fcef32 +#define AbsMask 0x7fffffffffffffff +#define ExpM63 0x3c00 +#define C(i) __log1p_data.coeffs[i] + +static inline double +eval_poly (double f) +{ + double f2 = f * f; + double f4 = f2 * f2; + double f8 = f4 * f4; + return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C); +} + +/* log1p approximation using polynomial on reduced interval. Largest + observed errors are near the lower boundary of the region where k + is 0. + Maximum measured error: 1.75ULP. + log1p(-0x1.2e1aea97b3e5cp-2) got -0x1.65fb8659a2f9p-2 + want -0x1.65fb8659a2f92p-2. */ +double +log1p (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + uint32_t ia16 = ia >> 48; + + /* Handle special cases first. 
*/ + if (unlikely (ia16 >= 0x7ff0 || ix >= 0xbff0000000000000 + || ix == 0x8000000000000000)) + { + if (ix == 0x8000000000000000 || ix == 0x7ff0000000000000) + { + /* x == -0 => log1p(x) = -0. + x == Inf => log1p(x) = Inf. */ + return x; + } + if (ix == 0xbff0000000000000) + { + /* x == -1 => log1p(x) = -Inf. */ + return __math_divzero (-1); + ; + } + if (ia16 >= 0x7ff0) + { + /* x == +/-NaN => log1p(x) = NaN. */ + return __math_invalid (asdouble (ia)); + } + /* x < -1 => log1p(x) = NaN. + x == -Inf => log1p(x) = NaN. */ + return __math_invalid (x); + } + + /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f + is in [sqrt(2)/2, sqrt(2)]): + log1p(x) = k*log(2) + log1p(f). + + f may not be representable exactly, so we need a correction term: + let m = round(1 + x), c = (1 + x) - m. + c << m: at very small x, log1p(x) ~ x, hence: + log(1+x) - log(m) ~ c/m. + + We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ + + uint64_t sign = ix & ~AbsMask; + if (ia <= OneMHfRt2 || (!sign && ia <= Rt2MOne)) + { + if (unlikely (ia16 <= ExpM63)) + { + /* If exponent of x <= -63 then shortcut the polynomial and avoid + underflow by just returning x, which is exactly rounded in this + region. */ + return x; + } + /* If x is in [sqrt(2)/2 - 1, sqrt(2) - 1] then we can shortcut all the + logic below, as k = 0 and f = x and therefore representable exactly. + All we need is to return the polynomial. */ + return fma (x, eval_poly (x) * x, x); + } + + /* Obtain correctly scaled k by manipulation in the exponent. */ + double m = x + 1; + uint64_t mi = asuint64 (m); + uint32_t u = (mi >> 32) + OneMHfRt2Top; + int32_t k = (int32_t) (u >> 20) - OneTop12; + + /* Correction term c/m. */ + double cm = (x - (m - 1)) / m; + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. 
*/ + uint32_t utop = (u & 0x000fffff) + HfRt2Top; + uint64_t u_red = ((uint64_t) utop << 32) | (mi & BottomMask); + double f = asdouble (u_red) - 1; + + /* Approximate log1p(x) on the reduced input using a polynomial. Because + log1p(0)=0 we choose an approximation of the form: + x + C0*x^2 + C1*x^3 + C2x^4 + ... + Hence approximation has the form f + f^2 * P(f) + where P(x) = C0 + C1*x + C2x^2 + ... */ + double p = fma (f, eval_poly (f) * f, f); + + double kd = k; + double y = fma (Ln2Lo, kd, cm); + return y + fma (Ln2Hi, kd, p); +} + +PL_SIG (S, D, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (log1p, 1.26) +PL_TEST_INTERVAL (log1p, -10.0, 10.0, 10000) +PL_TEST_INTERVAL (log1p, 0.0, 0x1p-23, 50000) +PL_TEST_INTERVAL (log1p, 0x1p-23, 0.001, 50000) +PL_TEST_INTERVAL (log1p, 0.001, 1.0, 50000) +PL_TEST_INTERVAL (log1p, 0.0, -0x1p-23, 50000) +PL_TEST_INTERVAL (log1p, -0x1p-23, -0.001, 50000) +PL_TEST_INTERVAL (log1p, -0.001, -1.0, 50000) +PL_TEST_INTERVAL (log1p, -1.0, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/log1p_data.c b/contrib/arm-optimized-routines/pl/math/log1p_data.c new file mode 100644 index 000000000000..6168a0c9a214 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log1p_data.c @@ -0,0 +1,19 @@ +/* + * Data used in double-precision log(1+x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Polynomial coefficients generated using Remez algorithm, see + log1p.sollya for details. 
*/ +const struct log1p_data __log1p_data = { + .coeffs = {-0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, + 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, + -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, + 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, + -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, + 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, + -0x1.cfa7385bdb37ep-6}}; diff --git a/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c new file mode 100644 index 000000000000..fcfd05a6fcb7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log1pf_2u1.c @@ -0,0 +1,165 @@ +/* + * Single-precision log(1+x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "hornerf.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define Ln2 (0x1.62e43p-1f) +#define SignMask (0x80000000) + +/* Biased exponent of the largest float m for which m^8 underflows. */ +#define M8UFLOW_BOUND_BEXP 112 +/* Biased exponent of the largest float for which we just return x. */ +#define TINY_BOUND_BEXP 103 + +#define C(i) __log1pf_data.coeffs[i] + +static inline float +eval_poly (float m, uint32_t e) +{ +#ifdef LOG1PF_2U5 + + /* 2.5 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using + slightly modified Estrin scheme (no x^0 term, and x term is just x). 
*/ + float p_12 = fmaf (m, C (1), C (0)); + float p_34 = fmaf (m, C (3), C (2)); + float p_56 = fmaf (m, C (5), C (4)); + float p_78 = fmaf (m, C (7), C (6)); + + float m2 = m * m; + float p_02 = fmaf (m2, p_12, m); + float p_36 = fmaf (m2, p_56, p_34); + float p_79 = fmaf (m2, C (8), p_78); + + float m4 = m2 * m2; + float p_06 = fmaf (m4, p_36, p_02); + + if (unlikely (e < M8UFLOW_BOUND_BEXP)) + return p_06; + + float m8 = m4 * m4; + return fmaf (m8, p_79, p_06); + +#elif defined(LOG1PF_1U3) + + /* 1.3 ulp variant. Approximate log(1+m) on [-0.25, 0.5] using Horner + scheme. Our polynomial approximation for log1p has the form + x + C1 * x^2 + C2 * x^3 + C3 * x^4 + ... + Hence approximation has the form m + m^2 * P(m) + where P(x) = C1 + C2 * x + C3 * x^2 + ... . */ + return fmaf (m, m * HORNER_8 (m, C), m); + +#else +#error No log1pf approximation exists with the requested precision. Options are 13 or 25. +#endif +} + +static inline uint32_t +biased_exponent (uint32_t ix) +{ + return (ix & 0x7f800000) >> 23; +} + +/* log1pf approximation using polynomial on reduced interval. Worst-case error + when using Estrin is roughly 2.02 ULP: + log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */ +float +log1pf (float x) +{ + uint32_t ix = asuint (x); + uint32_t ia = ix & ~SignMask; + uint32_t ia12 = ia >> 20; + uint32_t e = biased_exponent (ix); + + /* Handle special cases first. */ + if (unlikely (ia12 >= 0x7f8 || ix >= 0xbf800000 || ix == 0x80000000 + || e <= TINY_BOUND_BEXP)) + { + if (ix == 0xff800000) + { + /* x == -Inf => log1pf(x) = NaN. */ + return NAN; + } + if ((ix == 0x7f800000 || e <= TINY_BOUND_BEXP) && ia12 <= 0x7f8) + { + /* |x| < TinyBound => log1p(x) = x. + x == Inf => log1pf(x) = Inf. */ + return x; + } + if (ix == 0xbf800000) + { + /* x == -1.0 => log1pf(x) = -Inf. */ + return __math_divzerof (-1); + } + if (ia12 >= 0x7f8) + { + /* x == +/-NaN => log1pf(x) = NaN. */ + return __math_invalidf (asfloat (ia)); + } + /* x < -1.0 => log1pf(x) = NaN. 
*/ + return __math_invalidf (x); + } + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m + is in [-0.25, 0.5]): + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). + + We approximate log1p(m) with a polynomial, then scale by + k*log(2). Instead of doing this directly, we use an intermediate + scale factor s = 4*k*log(2) to ensure the scale is representable + as a normalised fp32 number. */ + + if (ix <= 0x3f000000 || ia <= 0x3e800000) + { + /* If x is in [-0.25, 0.5] then we can shortcut all the logic + below, as k = 0 and m = x. All we need is to return the + polynomial. */ + return eval_poly (x, e); + } + + float m = x + 1.0f; + + /* k is used scale the input. 0x3f400000 is chosen as we are trying to + reduce x to the range [-0.25, 0.5]. Inside this range, k is 0. + Outside this range, if k is reinterpreted as (NOT CONVERTED TO) float: + let k = sign * 2^p where sign = -1 if x < 0 + 1 otherwise + and p is a negative integer whose magnitude increases with the + magnitude of x. */ + int k = (asuint (m) - 0x3f400000) & 0xff800000; + + /* By using integer arithmetic, we obtain the necessary scaling by + subtracting the unbiased exponent of k from the exponent of x. */ + float m_scale = asfloat (asuint (x) - k); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number (s in [2**-126,2**26]), and scale m down accordingly. */ + float s = asfloat (asuint (4.0f) - k); + m_scale = m_scale + fmaf (0.25f, s, -1.0f); + + float p = eval_poly (m_scale, biased_exponent (asuint (m_scale))); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. */ + float scale_back = (float) k * 0x1.0p-23f; + + /* Apply the scaling back. 
*/ + return fmaf (scale_back, Ln2, p); +} + +PL_SIG (S, F, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (log1pf, 1.52) +PL_TEST_INTERVAL (log1pf, -10.0, 10.0, 10000) +PL_TEST_INTERVAL (log1pf, 0.0, 0x1p-23, 50000) +PL_TEST_INTERVAL (log1pf, 0x1p-23, 0.001, 50000) +PL_TEST_INTERVAL (log1pf, 0.001, 1.0, 50000) +PL_TEST_INTERVAL (log1pf, 0.0, -0x1p-23, 50000) +PL_TEST_INTERVAL (log1pf, -0x1p-23, -0.001, 50000) +PL_TEST_INTERVAL (log1pf, -0.001, -1.0, 50000) +PL_TEST_INTERVAL (log1pf, -1.0, inf, 5000) diff --git a/contrib/arm-optimized-routines/pl/math/log1pf_data.c b/contrib/arm-optimized-routines/pl/math/log1pf_data.c new file mode 100644 index 000000000000..8c92d5738fe8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log1pf_data.c @@ -0,0 +1,14 @@ +/* + * Data used in single-precision log1p(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +/* Polynomial coefficients generated using floating-point minimax + algorithm, see tools/log1pf.sollya for details. */ +const struct log1pf_data __log1pf_data + = {.coeffs = {-0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, + -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f, + -0x1.6f0d5ep-5f}}; diff --git a/contrib/arm-optimized-routines/pl/math/log_data.c b/contrib/arm-optimized-routines/pl/math/log_data.c new file mode 100644 index 000000000000..34715e5036a3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/log_data.c @@ -0,0 +1,511 @@ +/* + * Data for log. + * + * Copyright (c) 2018-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << LOG_TABLE_BITS) + +const struct log_data __log_data = { +.ln2hi = 0x1.62e42fefa3800p-1, +.ln2lo = 0x1.ef35793c76730p-45, +.poly1 = { +#if LOG_POLY1_ORDER == 10 +// relative error: 0x1.32eccc6p-62 +// in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval) +-0x1p-1, +0x1.55555555554e5p-2, +-0x1.0000000000af2p-2, +0x1.9999999bbe436p-3, +-0x1.55555537f9cdep-3, +0x1.24922fc8127cfp-3, +-0x1.0000b7d6bb612p-3, +0x1.c806ee1ddbcafp-4, +-0x1.972335a9c2d6ep-4, +#elif LOG_POLY1_ORDER == 11 +// relative error: 0x1.52c8b708p-68 +// in -0x1p-5 0x1.1p-5 (|log(1+x)| > 0x1p-5 outside this interval) +-0x1p-1, +0x1.5555555555555p-2, +-0x1.ffffffffffea9p-3, +0x1.999999999c4d4p-3, +-0x1.55555557f5541p-3, +0x1.249248fbe33e4p-3, +-0x1.ffffc9a3c825bp-4, +0x1.c71e1f204435dp-4, +-0x1.9a7f26377d06ep-4, +0x1.71c30cf8f7364p-4, +#elif LOG_POLY1_ORDER == 12 +// relative error: 0x1.c04d76cp-63 +// in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval) +-0x1p-1, +0x1.5555555555577p-2, +-0x1.ffffffffffdcbp-3, +0x1.999999995dd0cp-3, +-0x1.55555556745a7p-3, +0x1.24924a344de3p-3, +-0x1.fffffa4423d65p-4, +0x1.c7184282ad6cap-4, +-0x1.999eb43b068ffp-4, +0x1.78182f7afd085p-4, +-0x1.5521375d145cdp-4, +#endif +}, +.poly = { +#if N == 64 && LOG_POLY_ORDER == 7 +// relative error: 0x1.906eb8ap-58 +// abs error: 0x1.d2cad5a8p-67 +// in -0x1.fp-8 0x1.fp-8 +-0x1.0000000000027p-1, +0x1.555555555556ap-2, +-0x1.fffffff0440bap-3, +0x1.99999991906c3p-3, +-0x1.555c8d7e8201ep-3, +0x1.24978c59151fap-3, +#elif N == 128 && LOG_POLY_ORDER == 6 +// relative error: 0x1.926199e8p-56 +// abs error: 0x1.882ff33p-65 +// in -0x1.fp-9 0x1.fp-9 +-0x1.0000000000001p-1, +0x1.555555551305bp-2, +-0x1.fffffffeb459p-3, +0x1.999b324f10111p-3, +-0x1.55575e506c89fp-3, +#elif N == 128 && LOG_POLY_ORDER == 7 +// relative error: 0x1.649fc4bp-64 +// abs error: 0x1.c3b5769p-74 +// in -0x1.fp-9 0x1.fp-9 
+-0x1.0000000000001p-1, +0x1.5555555555556p-2, +-0x1.fffffffea1a8p-3, +0x1.99999998e9139p-3, +-0x1.555776801b968p-3, +0x1.2493c29331a5cp-3, +#endif +}, +/* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + log(z/c) + log(z/c) = poly(z/c - 1) + +where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls +into the ith one, then table entries are computed as + + tab[i].invc = 1/c + tab[i].logc = (double)log(c) + tab2[i].chi = (double)c + tab2[i].clo = (double)(c - (double)c) + +where c is near the center of the subinterval and is chosen by trying +-2^29 +floating point invc candidates around 1/center and selecting one for which + + 1) the rounding error in 0x1.8p9 + logc is 0, + 2) the rounding error in z - chi - clo is < 0x1p-66 and + 3) the rounding error in (double)log(c) is minimized (< 0x1p-66). + +Note: 1) ensures that k*ln2hi + logc can be computed without rounding error, +2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to +a single rounding error when there is no fast fma for z*invc - 1, 3) ensures +that logc + poly(z/c - 1) has small error, however near x == 1 when +|log(x)| < 0x1p-4, this is not enough so that is special cased. 
*/ +.tab = { +#if N == 64 +{0x1.7242886495cd8p+0, -0x1.79e267bdfe000p-2}, +{0x1.6e1f769340dc9p+0, -0x1.6e60ee0ecb000p-2}, +{0x1.6a13ccc8f195cp+0, -0x1.63002fdbf6000p-2}, +{0x1.661ec72e86f3ap+0, -0x1.57bf76c597000p-2}, +{0x1.623fa6c447b16p+0, -0x1.4c9e07f0d2000p-2}, +{0x1.5e75bbca31702p+0, -0x1.419b42f027000p-2}, +{0x1.5ac05655adb10p+0, -0x1.36b67660e6000p-2}, +{0x1.571ed3e940191p+0, -0x1.2bef0839e4800p-2}, +{0x1.539094ac0fbbfp+0, -0x1.21445727cb000p-2}, +{0x1.5015007e7fc42p+0, -0x1.16b5ca3c3d000p-2}, +{0x1.4cab877c31cf9p+0, -0x1.0c42d3805f800p-2}, +{0x1.49539e76a88d3p+0, -0x1.01eae61b60800p-2}, +{0x1.460cbc12211dap+0, -0x1.ef5adb9fb0000p-3}, +{0x1.42d6624debe3ap+0, -0x1.db13daab99000p-3}, +{0x1.3fb0144f0d462p+0, -0x1.c6ffbe896e000p-3}, +{0x1.3c995a1f9a9b4p+0, -0x1.b31d84722d000p-3}, +{0x1.3991c23952500p+0, -0x1.9f6c3cf6eb000p-3}, +{0x1.3698df35eaa14p+0, -0x1.8beafe7f13000p-3}, +{0x1.33ae463091760p+0, -0x1.7898db878d000p-3}, +{0x1.30d190aae3d72p+0, -0x1.6574efe4ec000p-3}, +{0x1.2e025c9203c89p+0, -0x1.527e620845000p-3}, +{0x1.2b404a7244988p+0, -0x1.3fb457d798000p-3}, +{0x1.288b01dc19544p+0, -0x1.2d1615a077000p-3}, +{0x1.25e2268085f69p+0, -0x1.1aa2b431e5000p-3}, +{0x1.23456812abb74p+0, -0x1.08598f1d2b000p-3}, +{0x1.20b4703174157p+0, -0x1.ec738fee40000p-4}, +{0x1.1e2ef308b4e9bp+0, -0x1.c885768862000p-4}, +{0x1.1bb4a36b70a3fp+0, -0x1.a4e75b6a46000p-4}, +{0x1.194538e960658p+0, -0x1.8197efba9a000p-4}, +{0x1.16e0692a10ac8p+0, -0x1.5e95ad734e000p-4}, +{0x1.1485f1ba1568bp+0, -0x1.3bdf67117c000p-4}, +{0x1.12358e123ed6fp+0, -0x1.1973b744f0000p-4}, +{0x1.0fef01de37c8dp+0, -0x1.eea33446bc000p-5}, +{0x1.0db20b82be414p+0, -0x1.aaef4ab304000p-5}, +{0x1.0b7e6f67f69b3p+0, -0x1.67c962fd2c000p-5}, +{0x1.0953f342fc108p+0, -0x1.252f29acf8000p-5}, +{0x1.0732604ec956bp+0, -0x1.c63d19e9c0000p-6}, +{0x1.051980117f9b0p+0, -0x1.432ab6a388000p-6}, +{0x1.03091aa6810f1p+0, -0x1.8244357f50000p-7}, +{0x1.01010152cf066p+0, -0x1.0080a711c0000p-8}, +{0x1.fc07ef6b6e30bp-1, 0x1.fe03018e80000p-8}, 
+{0x1.f4465aa1024afp-1, 0x1.7b91986450000p-6}, +{0x1.ecc07a8fd3f5ep-1, 0x1.39e88608c8000p-5}, +{0x1.e573ad856b537p-1, 0x1.b42dc6e624000p-5}, +{0x1.de5d6dc7b8057p-1, 0x1.165372ec20000p-4}, +{0x1.d77b6498bddf7p-1, 0x1.51b07a0170000p-4}, +{0x1.d0cb580315c0fp-1, 0x1.8c3465c7ea000p-4}, +{0x1.ca4b30d1cf449p-1, 0x1.c5e544a290000p-4}, +{0x1.c3f8ef4810d8ep-1, 0x1.fec91aa0a6000p-4}, +{0x1.bdd2b8b311f44p-1, 0x1.1b72acdc5c000p-3}, +{0x1.b7d6c2eeac054p-1, 0x1.371fc65a98000p-3}, +{0x1.b20363474c8f5p-1, 0x1.526e61c1aa000p-3}, +{0x1.ac570165eeab1p-1, 0x1.6d60ffc240000p-3}, +{0x1.a6d019f331df4p-1, 0x1.87fa08a013000p-3}, +{0x1.a16d3ebc9e3c3p-1, 0x1.a23bc630c3000p-3}, +{0x1.9c2d14567ef45p-1, 0x1.bc286a3512000p-3}, +{0x1.970e4efae9169p-1, 0x1.d5c2195697000p-3}, +{0x1.920fb3bd0b802p-1, 0x1.ef0ae132d3000p-3}, +{0x1.8d3018b58699ap-1, 0x1.040259974e000p-2}, +{0x1.886e5ff170ee6p-1, 0x1.1058bd40e2000p-2}, +{0x1.83c977ad35d27p-1, 0x1.1c898c1137800p-2}, +{0x1.7f405ed16c520p-1, 0x1.2895a3e65b000p-2}, +{0x1.7ad220d0335c4p-1, 0x1.347dd8f6bd000p-2}, +{0x1.767dce53474fdp-1, 0x1.4043083cb3800p-2}, +#elif N == 128 +{0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2}, +{0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2}, +{0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2}, +{0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2}, +{0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2}, +{0x1.69147332f0cbap+0, -0x1.602d076180000p-2}, +{0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2}, +{0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2}, +{0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2}, +{0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2}, +{0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2}, +{0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2}, +{0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2}, +{0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2}, +{0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2}, +{0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2}, +{0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2}, +{0x1.52aff42064583p+0, -0x1.1e9e129279000p-2}, +{0x1.50f22dbb2bddfp+0, 
-0x1.1956d2b48f800p-2}, +{0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2}, +{0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2}, +{0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2}, +{0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2}, +{0x1.4880524d48434p+0, -0x1.feb224586f000p-3}, +{0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3}, +{0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3}, +{0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3}, +{0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3}, +{0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3}, +{0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3}, +{0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3}, +{0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3}, +{0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3}, +{0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3}, +{0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3}, +{0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3}, +{0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3}, +{0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3}, +{0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3}, +{0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3}, +{0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3}, +{0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3}, +{0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3}, +{0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3}, +{0x1.293726014b530p+0, -0x1.31b996b490000p-3}, +{0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3}, +{0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3}, +{0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3}, +{0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3}, +{0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3}, +{0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4}, +{0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4}, +{0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4}, +{0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4}, +{0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4}, +{0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4}, +{0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4}, +{0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4}, +{0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4}, +{0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4}, 
+{0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4}, +{0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4}, +{0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4}, +{0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4}, +{0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5}, +{0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5}, +{0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5}, +{0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5}, +{0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5}, +{0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5}, +{0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5}, +{0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5}, +{0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6}, +{0x1.06ab58c358f19p+0, -0x1.a55f152528000p-6}, +{0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6}, +{0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6}, +{0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7}, +{0x1.02865137932a9p+0, -0x1.419355daa0000p-7}, +{0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8}, +{0x1.008040614b195p+0, -0x1.0040979240000p-9}, +{0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9}, +{0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7}, +{0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6}, +{0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6}, +{0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5}, +{0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5}, +{0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5}, +{0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5}, +{0x1.e01e009609a56p-1, 0x1.07598e598c000p-4}, +{0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4}, +{0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4}, +{0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4}, +{0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4}, +{0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4}, +{0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4}, +{0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4}, +{0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4}, +{0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3}, +{0x1.bf583eeece73fp-1, 0x1.147858292b000p-3}, +{0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3}, +{0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3}, +{0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3}, 
+{0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3}, +{0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3}, +{0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3}, +{0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3}, +{0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3}, +{0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3}, +{0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3}, +{0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3}, +{0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3}, +{0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3}, +{0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3}, +{0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3}, +{0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3}, +{0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3}, +{0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2}, +{0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2}, +{0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2}, +{0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2}, +{0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2}, +{0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2}, +{0x1.8060195f40260p-1, 0x1.2595fd7636800p-2}, +{0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2}, +{0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2}, +{0x1.79baa679725c2p-1, 0x1.377266dec1800p-2}, +{0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2}, +{0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2}, +#endif +}, +#if !HAVE_FAST_FMA +.tab2 = { +#if N == 64 +{0x1.61ffff94c4fecp-1, -0x1.9fe4fc998f325p-56}, +{0x1.66000020377ddp-1, 0x1.e804c7a9519f2p-55}, +{0x1.6a00004c41678p-1, 0x1.902c675d9ecfep-55}, +{0x1.6dffff7384f87p-1, -0x1.2fd6b95e55043p-56}, +{0x1.720000b37216ep-1, 0x1.802bc8d437043p-55}, +{0x1.75ffffbeb3c9dp-1, 0x1.6047ad0a0d4e4p-57}, +{0x1.7a0000628daep-1, -0x1.e00434b49313dp-56}, +{0x1.7dffffd7abd1ap-1, -0x1.6015f8a083576p-56}, +{0x1.81ffffdf40c54p-1, 0x1.7f54bf76a42c9p-57}, +{0x1.860000f334e11p-1, 0x1.60054cb5344d7p-56}, +{0x1.8a0001238aca7p-1, 0x1.c03c9bd132f55p-57}, +{0x1.8dffffb81d212p-1, -0x1.001e519f2764fp-55}, +{0x1.92000086adc7cp-1, 0x1.1fe40f88f49c6p-55}, +{0x1.960000135d8eap-1, -0x1.f832268dc3095p-55}, +{0x1.99ffff9435acp-1, 
0x1.7031d8b835edcp-56}, +{0x1.9e00003478565p-1, -0x1.0030b221ce3eep-58}, +{0x1.a20000b592948p-1, 0x1.8fd2f1dbd4639p-55}, +{0x1.a600000ad0bcfp-1, 0x1.901d6a974e6bep-55}, +{0x1.a9ffff55953a5p-1, 0x1.a07556192db98p-57}, +{0x1.adffff29ce03dp-1, -0x1.fff0717ec71c2p-56}, +{0x1.b1ffff34f3ac8p-1, 0x1.8005573de89d1p-57}, +{0x1.b60000894c55bp-1, -0x1.ff2fb51b044c7p-57}, +{0x1.b9fffef45ec7dp-1, -0x1.9ff7c4e8730fp-56}, +{0x1.be0000cda7b2ap-1, 0x1.57d058dbf3c1dp-55}, +{0x1.c1ffff2c57917p-1, 0x1.7e66d7e48dbc9p-58}, +{0x1.c60000ea5b82ap-1, -0x1.47f5e132ed4bep-55}, +{0x1.ca0001121ae98p-1, -0x1.40958c8d5e00ap-58}, +{0x1.ce0000f9241cbp-1, -0x1.7da063caa81c8p-59}, +{0x1.d1fffe8be95a4p-1, -0x1.82e3a411afcd9p-59}, +{0x1.d5ffff035932bp-1, -0x1.00f901b3fe87dp-58}, +{0x1.d9fffe8b54ba7p-1, 0x1.ffef55d6e3a4p-55}, +{0x1.de0000ad95d19p-1, 0x1.5feb2efd4c7c7p-55}, +{0x1.e1fffe925ce47p-1, 0x1.c8085484eaf08p-55}, +{0x1.e5fffe3ddf853p-1, -0x1.fd5ed02c5cadp-60}, +{0x1.e9fffed0a0e5fp-1, -0x1.a80aaef411586p-55}, +{0x1.ee00008f82eep-1, -0x1.b000aeaf97276p-55}, +{0x1.f20000a22d2f4p-1, -0x1.8f8906e13eba3p-56}, +{0x1.f5fffee35b57dp-1, 0x1.1fdd33b2d3714p-57}, +{0x1.fa00014eec3a6p-1, -0x1.3ee0b7a18c1a5p-58}, +{0x1.fdffff5daa89fp-1, -0x1.c1e24c8e3b503p-58}, +{0x1.0200005b93349p+0, -0x1.50197fe6bedcap-54}, +{0x1.05ffff9d597acp+0, 0x1.20160d062d0dcp-55}, +{0x1.0a00005687a63p+0, -0x1.27f3f9307696ep-54}, +{0x1.0dffff779164ep+0, 0x1.b7eb40bb9c4f4p-54}, +{0x1.12000044a0aa8p+0, 0x1.efbc914d512c4p-55}, +{0x1.16000069685bcp+0, -0x1.c0bea3eb2d82cp-57}, +{0x1.1a000093f0d78p+0, 0x1.1fecbf1e8c52p-54}, +{0x1.1dffffb2b1457p+0, -0x1.3fc91365637d6p-55}, +{0x1.2200008824a1p+0, -0x1.dff7e9feb578ap-54}, +{0x1.25ffffeef953p+0, -0x1.b00a61ec912f7p-55}, +{0x1.2a0000a1e7783p+0, 0x1.60048318b0483p-56}, +{0x1.2e0000853d4c7p+0, -0x1.77fbedf2c8cf3p-54}, +{0x1.320000324c55bp+0, 0x1.f81983997354fp-54}, +{0x1.360000594f796p+0, -0x1.cfe4beff900a9p-54}, +{0x1.3a0000a4c1c0fp+0, 0x1.07dbb2e268d0ep-54}, +{0x1.3e0000751c61bp+0, 
0x1.80583ed1c566ep-56}, +{0x1.42000069e8a9fp+0, 0x1.f01f1edf82045p-54}, +{0x1.460000b5a1e34p+0, -0x1.dfdf0cf45c14ap-55}, +{0x1.4a0000187e513p+0, 0x1.401306b83a98dp-55}, +{0x1.4dffff3ba420bp+0, 0x1.9fc6539a6454ep-56}, +{0x1.51fffffe391c9p+0, -0x1.601ef3353ac83p-54}, +{0x1.560000e342455p+0, 0x1.3fb7fac8ac151p-55}, +{0x1.59ffffc39676fp+0, 0x1.4fe7dd6659cc2p-55}, +{0x1.5dfffff10ef42p+0, -0x1.48154cb592bcbp-54}, +#elif N == 128 +{0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56}, +{0x1.63000034db495p-1, 0x1.dbfea48005d41p-55}, +{0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55}, +{0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57}, +{0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56}, +{0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55}, +{0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55}, +{0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56}, +{0x1.710000e86978p-1, 0x1.bff6671097952p-56}, +{0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55}, +{0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57}, +{0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57}, +{0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55}, +{0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56}, +{0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55}, +{0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55}, +{0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55}, +{0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55}, +{0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55}, +{0x1.87000204289c6p-1, -0x1.bbf65f3117b75p-55}, +{0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55}, +{0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56}, +{0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55}, +{0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55}, +{0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55}, +{0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56}, +{0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55}, +{0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56}, +{0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55}, +{0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55}, +{0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60}, +{0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55}, +{0x1.a10001145b006p-1, 
0x1.4ff489958da56p-56}, +{0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55}, +{0x1.a500010971d79p-1, 0x1.8fecadd78793p-55}, +{0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55}, +{0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55}, +{0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57}, +{0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55}, +{0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57}, +{0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58}, +{0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56}, +{0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56}, +{0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55}, +{0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56}, +{0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57}, +{0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57}, +{0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55}, +{0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55}, +{0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57}, +{0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55}, +{0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55}, +{0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56}, +{0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57}, +{0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55}, +{0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55}, +{0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56}, +{0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55}, +{0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58}, +{0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56}, +{0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56}, +{0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55}, +{0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55}, +{0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57}, +{0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56}, +{0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56}, +{0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56}, +{0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58}, +{0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55}, +{0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56}, +{0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58}, +{0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55}, +{0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59}, +{0x1.f30001ee58ac7p-1, 
0x1.a81acf2731155p-55}, +{0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55}, +{0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57}, +{0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56}, +{0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57}, +{0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56}, +{0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57}, +{0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55}, +{0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54}, +{0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54}, +{0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55}, +{0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57}, +{0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54}, +{0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55}, +{0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56}, +{0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55}, +{0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54}, +{0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54}, +{0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55}, +{0x1.18ffffc275426p+0, 0x1.10030dc3b7273p-54}, +{0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54}, +{0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57}, +{0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54}, +{0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54}, +{0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54}, +{0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56}, +{0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56}, +{0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56}, +{0x1.2b00014556313p+0, -0x1.2808233f21f02p-54}, +{0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55}, +{0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55}, +{0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55}, +{0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54}, +{0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54}, +{0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55}, +{0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54}, +{0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55}, +{0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56}, +{0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54}, +{0x1.410001532aff4p+0, 0x1.7f8375f198524p-57}, +{0x1.4300017478b29p+0, 0x1.301e672dc5143p-55}, +{0x1.44fffe795b463p+0, 
0x1.9ff69b8b2895ap-55}, +{0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54}, +{0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54}, +{0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54}, +{0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54}, +{0x1.4f0000ff03ee2p+0, 0x1.77f890b85531cp-54}, +{0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57}, +{0x1.530001605277ap+0, -0x1.6bfcece233209p-54}, +{0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55}, +{0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54}, +{0x1.5900017e61012p+0, 0x1.87ec581afef9p-55}, +{0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54}, +{0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54}, +{0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54}, +#endif +}, +#endif /* !HAVE_FAST_FMA */ +}; diff --git a/contrib/arm-optimized-routines/pl/math/logf.c b/contrib/arm-optimized-routines/pl/math/logf.c new file mode 100644 index 000000000000..17a74ed6d28f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/logf.c @@ -0,0 +1,75 @@ +/* + * Single-precision log function. + * + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include <math.h> +#include <stdint.h> +#include "math_config.h" + +/* +LOGF_TABLE_BITS = 4 +LOGF_POLY_ORDER = 4 + +ULP error: 0.818 (nearest rounding.) +Relative error: 1.957 * 2^-26 (before rounding.) +*/ + +#define T __logf_data.tab +#define A __logf_data.poly +#define Ln2 __logf_data.ln2 +#define N (1 << LOGF_TABLE_BITS) +#define OFF 0x3f330000 + +float +optr_aor_log_f32 (float x) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t z, r, r2, y, y0, invc, logc; + uint32_t ix, iz, tmp; + int k, i; + + ix = asuint (x); +#if WANT_ROUNDING + /* Fix sign of zero with downward rounding when x==1. */ + if (unlikely (ix == 0x3f800000)) + return 0; +#endif + if (unlikely (ix - 0x00800000 >= 0x7f800000 - 0x00800000)) + { + /* x < 0x1p-126 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzerof (1); + if (ix == 0x7f800000) /* log(inf) == inf. 
*/ + return x; + if ((ix & 0x80000000) || ix * 2 >= 0xff000000) + return __math_invalidf (x); + /* x is subnormal, normalize it. */ + ix = asuint (x * 0x1p23f); + ix -= 23 << 23; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF] and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (23 - LOGF_TABLE_BITS)) % N; + k = (int32_t) tmp >> 23; /* arithmetic shift */ + iz = ix - (tmp & 0x1ff << 23); + invc = T[i].invc; + logc = T[i].logc; + z = (double_t) asfloat (iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2 */ + r = z * invc - 1; + y0 = logc + (double_t) k * Ln2; + + /* Pipelined polynomial evaluation to approximate log1p(r). */ + r2 = r * r; + y = A[1] * r + A[2]; + y = A[0] * r2 + y; + y = y * r2 + (y0 + r); + return eval_as_float (y); +} diff --git a/contrib/arm-optimized-routines/pl/math/logf_data.c b/contrib/arm-optimized-routines/pl/math/logf_data.c new file mode 100644 index 000000000000..97d9eb8d0097 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/logf_data.c @@ -0,0 +1,36 @@ +/* + * Data definition for logf and log10f. + * + * Copyright (c) 2017-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct logf_data __logf_data = { + .tab = + { + {0x1.661ec79f8f3bep+0, -0x1.57bf7808caadep-2}, + {0x1.571ed4aaf883dp+0, -0x1.2bef0a7c06ddbp-2}, + {0x1.49539f0f010bp+0, -0x1.01eae7f513a67p-2}, + {0x1.3c995b0b80385p+0, -0x1.b31d8a68224e9p-3}, + {0x1.30d190c8864a5p+0, -0x1.6574f0ac07758p-3}, + {0x1.25e227b0b8eap+0, -0x1.1aa2bc79c81p-3}, + {0x1.1bb4a4a1a343fp+0, -0x1.a4e76ce8c0e5ep-4}, + {0x1.12358f08ae5bap+0, -0x1.1973c5a611cccp-4}, + {0x1.0953f419900a7p+0, -0x1.252f438e10c1ep-5}, + {0x1p+0, 0x0p+0}, + {0x1.e608cfd9a47acp-1, 0x1.aa5aa5df25984p-5}, + {0x1.ca4b31f026aap-1, 0x1.c5e53aa362eb4p-4}, + {0x1.b2036576afce6p-1, 0x1.526e57720db08p-3}, + {0x1.9c2d163a1aa2dp-1, 0x1.bc2860d22477p-3}, + {0x1.886e6037841edp-1, 0x1.1058bc8a07ee1p-2}, + {0x1.767dcf5534862p-1, 0x1.4043057b6ee09p-2}, + }, + .ln2 = 0x1.62e42fefa39efp-1, + .invln10 = 0x1.bcb7b1526e50ep-2, + .poly = { + -0x1.00ea348b88334p-2, + 0x1.5575b0be00b6ap-2, + -0x1.ffffef20a4123p-2, + }}; diff --git a/contrib/arm-optimized-routines/pl/math/math_config.h b/contrib/arm-optimized-routines/pl/math/math_config.h new file mode 100644 index 000000000000..dccb3ce4c775 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/math_config.h @@ -0,0 +1,572 @@ +/* + * Configuration for math routines. + * + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _MATH_CONFIG_H +#define _MATH_CONFIG_H + +#include <math.h> +#include <stdint.h> + +#ifndef WANT_ROUNDING +/* If defined to 1, return correct results for special cases in non-nearest + rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f). + This may be set to 0 if there is no fenv support or if math functions only + get called in round to nearest mode. 
*/ +# define WANT_ROUNDING 1 +#endif +#ifndef WANT_ERRNO +/* If defined to 1, set errno in math functions according to ISO C. Many math + libraries do not set errno, so this is 0 by default. It may need to be + set to 1 if math.h has (math_errhandling & MATH_ERRNO) != 0. */ +# define WANT_ERRNO 0 +#endif +#ifndef WANT_SIMD_EXCEPT +/* If defined to 1, trigger fp exceptions in vector routines, consistently with + behaviour expected from the corresponding scalar routine. */ +#define WANT_SIMD_EXCEPT 0 +#endif + +/* Compiler can inline round as a single instruction. */ +#ifndef HAVE_FAST_ROUND +# if __aarch64__ +# define HAVE_FAST_ROUND 1 +# else +# define HAVE_FAST_ROUND 0 +# endif +#endif + +/* Compiler can inline lround, but not (long)round(x). */ +#ifndef HAVE_FAST_LROUND +# if __aarch64__ && (100*__GNUC__ + __GNUC_MINOR__) >= 408 && __NO_MATH_ERRNO__ +# define HAVE_FAST_LROUND 1 +# else +# define HAVE_FAST_LROUND 0 +# endif +#endif + +/* Compiler can inline fma as a single instruction. */ +#ifndef HAVE_FAST_FMA +# if defined FP_FAST_FMA || __aarch64__ +# define HAVE_FAST_FMA 1 +# else +# define HAVE_FAST_FMA 0 +# endif +#endif + +/* Provide *_finite symbols and some of the glibc hidden symbols + so libmathlib can be used with binaries compiled against glibc + to interpose math functions with both static and dynamic linking. */ +#ifndef USE_GLIBC_ABI +# if __GNUC__ +# define USE_GLIBC_ABI 1 +# else +# define USE_GLIBC_ABI 0 +# endif +#endif + +/* Optionally used extensions. 
*/ +#ifdef __GNUC__ +# define HIDDEN __attribute__ ((__visibility__ ("hidden"))) +# define NOINLINE __attribute__ ((noinline)) +# define UNUSED __attribute__ ((unused)) +# define likely(x) __builtin_expect (!!(x), 1) +# define unlikely(x) __builtin_expect (x, 0) +# if __GNUC__ >= 9 +# define attribute_copy(f) __attribute__ ((copy (f))) +# else +# define attribute_copy(f) +# endif +# define strong_alias(f, a) \ + extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f); +# define hidden_alias(f, a) \ + extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \ + attribute_copy (f); +#else +# define HIDDEN +# define NOINLINE +# define UNUSED +# define likely(x) (x) +# define unlikely(x) (x) +#endif + +#if HAVE_FAST_ROUND +/* When set, the roundtoint and converttoint functions are provided with + the semantics documented below. */ +# define TOINT_INTRINSICS 1 + +/* Round x to nearest int in all rounding modes, ties have to be rounded + consistently with converttoint so the results match. If the result + would be outside of [-2^31, 2^31-1] then the semantics is unspecified. */ +static inline double_t +roundtoint (double_t x) +{ + return round (x); +} + +/* Convert x to nearest int in all rounding modes, ties have to be rounded + consistently with roundtoint. If the result is not representible in an + int32_t then the semantics is unspecified. 
*/ +static inline int32_t +converttoint (double_t x) +{ +# if HAVE_FAST_LROUND + return lround (x); +# else + return (long) round (x); +# endif +} +#endif + +static inline uint32_t +asuint (float f) +{ + union + { + float f; + uint32_t i; + } u = {f}; + return u.i; +} + +static inline float +asfloat (uint32_t i) +{ + union + { + uint32_t i; + float f; + } u = {i}; + return u.f; +} + +static inline uint64_t +asuint64 (double f) +{ + union + { + double f; + uint64_t i; + } u = {f}; + return u.i; +} + +static inline double +asdouble (uint64_t i) +{ + union + { + uint64_t i; + double f; + } u = {i}; + return u.f; +} + +#ifndef IEEE_754_2008_SNAN +# define IEEE_754_2008_SNAN 1 +#endif +static inline int +issignalingf_inline (float x) +{ + uint32_t ix = asuint (x); + if (!IEEE_754_2008_SNAN) + return (ix & 0x7fc00000) == 0x7fc00000; + return 2 * (ix ^ 0x00400000) > 2u * 0x7fc00000; +} + +static inline int +issignaling_inline (double x) +{ + uint64_t ix = asuint64 (x); + if (!IEEE_754_2008_SNAN) + return (ix & 0x7ff8000000000000) == 0x7ff8000000000000; + return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL; +} + +#if __aarch64__ && __GNUC__ +/* Prevent the optimization of a floating-point expression. */ +static inline float +opt_barrier_float (float x) +{ + __asm__ __volatile__ ("" : "+w" (x)); + return x; +} +static inline double +opt_barrier_double (double x) +{ + __asm__ __volatile__ ("" : "+w" (x)); + return x; +} +/* Force the evaluation of a floating-point expression for its side-effect. 
*/ +static inline void +force_eval_float (float x) +{ + __asm__ __volatile__ ("" : "+w" (x)); +} +static inline void +force_eval_double (double x) +{ + __asm__ __volatile__ ("" : "+w" (x)); +} +#else +static inline float +opt_barrier_float (float x) +{ + volatile float y = x; + return y; +} +static inline double +opt_barrier_double (double x) +{ + volatile double y = x; + return y; +} +static inline void +force_eval_float (float x) +{ + volatile float y UNUSED = x; +} +static inline void +force_eval_double (double x) +{ + volatile double y UNUSED = x; +} +#endif + +/* Evaluate an expression as the specified type, normally a type + cast should be enough, but compilers implement non-standard + excess-precision handling, so when FLT_EVAL_METHOD != 0 then + these functions may need to be customized. */ +static inline float +eval_as_float (float x) +{ + return x; +} +static inline double +eval_as_double (double x) +{ + return x; +} + +/* Error handling tail calls for special cases, with a sign argument. + The sign of the return value is set if the argument is non-zero. */ + +/* The result overflows. */ +HIDDEN float __math_oflowf (uint32_t); +/* The result underflows to 0 in nearest rounding mode. */ +HIDDEN float __math_uflowf (uint32_t); +/* The result underflows to 0 in some directed rounding mode only. */ +HIDDEN float __math_may_uflowf (uint32_t); +/* Division by zero. */ +HIDDEN float __math_divzerof (uint32_t); +/* The result overflows. */ +HIDDEN double __math_oflow (uint32_t); +/* The result underflows to 0 in nearest rounding mode. */ +HIDDEN double __math_uflow (uint32_t); +/* The result underflows to 0 in some directed rounding mode only. */ +HIDDEN double __math_may_uflow (uint32_t); +/* Division by zero. */ +HIDDEN double __math_divzero (uint32_t); + +/* Error handling using input checking. */ + +/* Invalid input unless it is a quiet NaN. */ +HIDDEN float __math_invalidf (float); +/* Invalid input unless it is a quiet NaN. 
*/ +HIDDEN double __math_invalid (double); + +/* Error handling using output checking, only for errno setting. */ + +/* Check if the result overflowed to infinity. */ +HIDDEN double __math_check_oflow (double); +/* Check if the result underflowed to 0. */ +HIDDEN double __math_check_uflow (double); + +/* Check if the result overflowed to infinity. */ +static inline double +check_oflow (double x) +{ + return WANT_ERRNO ? __math_check_oflow (x) : x; +} + +/* Check if the result underflowed to 0. */ +static inline double +check_uflow (double x) +{ + return WANT_ERRNO ? __math_check_uflow (x) : x; +} + +/* Check if the result overflowed to infinity. */ +HIDDEN float __math_check_oflowf (float); +/* Check if the result underflowed to 0. */ +HIDDEN float __math_check_uflowf (float); + +/* Check if the result overflowed to infinity. */ +static inline float +check_oflowf (float x) +{ + return WANT_ERRNO ? __math_check_oflowf (x) : x; +} + +/* Check if the result underflowed to 0. */ +static inline float +check_uflowf (float x) +{ + return WANT_ERRNO ? __math_check_uflowf (x) : x; +} + +extern const struct erff_data +{ + float erff_poly_A[6]; + float erff_poly_B[7]; +} __erff_data HIDDEN; + +/* Data for logf and log10f. */ +#define LOGF_TABLE_BITS 4 +#define LOGF_POLY_ORDER 4 +extern const struct logf_data +{ + struct + { + double invc, logc; + } tab[1 << LOGF_TABLE_BITS]; + double ln2; + double invln10; + double poly[LOGF_POLY_ORDER - 1]; /* First order coefficient is 1. */ +} __logf_data HIDDEN; + +/* Data for low accuracy log10 (with 1/ln(10) included in coefficients). */ +#define LOG10_TABLE_BITS 7 +#define LOG10_POLY_ORDER 6 +#define LOG10_POLY1_ORDER 12 +extern const struct log10_data +{ + double ln2hi; + double ln2lo; + double invln10; + double poly[LOG10_POLY_ORDER - 1]; /* First coefficient is 1/log(10). 
*/ + double poly1[LOG10_POLY1_ORDER - 1]; + struct {double invc, logc;} tab[1 << LOG10_TABLE_BITS]; +#if !HAVE_FAST_FMA + struct {double chi, clo;} tab2[1 << LOG10_TABLE_BITS]; +#endif +} __log10_data HIDDEN; + +#define EXP_TABLE_BITS 7 +#define EXP_POLY_ORDER 5 +/* Use polynomial that is optimized for a wider input range. This may be + needed for good precision in non-nearest rounding and !TOINT_INTRINSICS. */ +#define EXP_POLY_WIDE 0 +/* Use close to nearest rounding toint when !TOINT_INTRINSICS. This may be + needed for good precision in non-nearest rouning and !EXP_POLY_WIDE. */ +#define EXP_USE_TOINT_NARROW 0 +#define EXP2_POLY_ORDER 5 +#define EXP2_POLY_WIDE 0 +extern const struct exp_data +{ + double invln2N; + double shift; + double negln2hiN; + double negln2loN; + double poly[4]; /* Last four coefficients. */ + double exp2_shift; + double exp2_poly[EXP2_POLY_ORDER]; + uint64_t tab[2*(1 << EXP_TABLE_BITS)]; +} __exp_data HIDDEN; + +#define ERFC_NUM_INTERVALS 20 +#define ERFC_POLY_ORDER 12 +extern const struct erfc_data +{ + double interval_bounds[ERFC_NUM_INTERVALS + 1]; + double poly[ERFC_NUM_INTERVALS][ERFC_POLY_ORDER + 1]; +} __erfc_data HIDDEN; +extern const struct v_erfc_data +{ + double interval_bounds[ERFC_NUM_INTERVALS + 1]; + double poly[ERFC_NUM_INTERVALS + 1][ERFC_POLY_ORDER + 1]; +} __v_erfc_data HIDDEN; + +#define ERFCF_POLY_NCOEFFS 16 +extern const struct erfcf_poly_data +{ + double poly[4][ERFCF_POLY_NCOEFFS]; +} __erfcf_poly_data HIDDEN; + +#define V_EXP_TAIL_TABLE_BITS 8 +extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] HIDDEN; + +#define V_ERF_NINTS 49 +#define V_ERF_NCOEFFS 10 +extern const struct v_erf_data +{ + double shifts[V_ERF_NINTS]; + double coeffs[V_ERF_NCOEFFS][V_ERF_NINTS]; +} __v_erf_data HIDDEN; + +#define V_ERFF_NCOEFFS 7 +extern const struct v_erff_data +{ + float coeffs[V_ERFF_NCOEFFS][2]; +} __v_erff_data HIDDEN; + +#define ATAN_POLY_NCOEFFS 20 +extern const struct atan_poly_data +{ + double 
poly[ATAN_POLY_NCOEFFS]; +} __atan_poly_data HIDDEN; + +#define ATANF_POLY_NCOEFFS 8 +extern const struct atanf_poly_data +{ + float poly[ATANF_POLY_NCOEFFS]; +} __atanf_poly_data HIDDEN; + +#define ASINHF_NCOEFFS 8 +extern const struct asinhf_data +{ + float coeffs[ASINHF_NCOEFFS]; +} __asinhf_data HIDDEN; + +#define LOG_TABLE_BITS 7 +#define LOG_POLY_ORDER 6 +#define LOG_POLY1_ORDER 12 +extern const struct log_data +{ + double ln2hi; + double ln2lo; + double poly[LOG_POLY_ORDER - 1]; /* First coefficient is 1. */ + double poly1[LOG_POLY1_ORDER - 1]; + struct + { + double invc, logc; + } tab[1 << LOG_TABLE_BITS]; +#if !HAVE_FAST_FMA + struct + { + double chi, clo; + } tab2[1 << LOG_TABLE_BITS]; +#endif +} __log_data HIDDEN; + +#define ASINH_NCOEFFS 18 +extern const struct asinh_data +{ + double poly[ASINH_NCOEFFS]; +} __asinh_data HIDDEN; + +#define LOG1P_NCOEFFS 19 +extern const struct log1p_data +{ + double coeffs[LOG1P_NCOEFFS]; +} __log1p_data HIDDEN; + +#define LOG1PF_2U5 +#define V_LOG1PF_2U5 +#define LOG1PF_NCOEFFS 9 +extern const struct log1pf_data +{ + float coeffs[LOG1PF_NCOEFFS]; +} __log1pf_data HIDDEN; + +#define TANF_P_POLY_NCOEFFS 6 +/* cotan approach needs order 3 on [0, pi/4] to reach <3.5ulps. 
*/ +#define TANF_Q_POLY_NCOEFFS 4 +extern const struct tanf_poly_data +{ + float poly_tan[TANF_P_POLY_NCOEFFS]; + float poly_cotan[TANF_Q_POLY_NCOEFFS]; +} __tanf_poly_data HIDDEN; + +#define V_LOG2F_POLY_NCOEFFS 9 +extern const struct v_log2f_data +{ + float poly[V_LOG2F_POLY_NCOEFFS]; +} __v_log2f_data HIDDEN; + +#define V_LOG2_TABLE_BITS 7 +#define V_LOG2_POLY_ORDER 6 +extern const struct v_log2_data +{ + double poly[V_LOG2_POLY_ORDER - 1]; + struct + { + double invc, log2c; + } tab[1 << V_LOG2_TABLE_BITS]; +} __v_log2_data HIDDEN; + +#define V_SINF_NCOEFFS 4 +extern const struct sv_sinf_data +{ + float coeffs[V_SINF_NCOEFFS]; +} __sv_sinf_data HIDDEN; + +#define V_LOG10_TABLE_BITS 7 +#define V_LOG10_POLY_ORDER 6 +extern const struct v_log10_data +{ + struct + { + double invc, log10c; + } tab[1 << V_LOG10_TABLE_BITS]; + double poly[V_LOG10_POLY_ORDER - 1]; + double invln10, log10_2; +} __v_log10_data HIDDEN; + +#define V_LOG10F_POLY_ORDER 9 +extern const float __v_log10f_poly[V_LOG10F_POLY_ORDER - 1] HIDDEN; + +#define SV_LOGF_POLY_ORDER 8 +extern const float __sv_logf_poly[SV_LOGF_POLY_ORDER - 1] HIDDEN; + +#define SV_LOG_POLY_ORDER 6 +#define SV_LOG_TABLE_BITS 7 +extern const struct sv_log_data +{ + double invc[1 << SV_LOG_TABLE_BITS]; + double logc[1 << SV_LOG_TABLE_BITS]; + double poly[SV_LOG_POLY_ORDER - 1]; +} __sv_log_data HIDDEN; + +#ifndef SV_EXPF_USE_FEXPA +#define SV_EXPF_USE_FEXPA 0 +#endif +#define SV_EXPF_POLY_ORDER 6 +extern const float __sv_expf_poly[SV_EXPF_POLY_ORDER - 1] HIDDEN; + +#define EXPM1F_POLY_ORDER 5 +extern const float __expm1f_poly[EXPM1F_POLY_ORDER] HIDDEN; + +#define EXPF_TABLE_BITS 5 +#define EXPF_POLY_ORDER 3 +extern const struct expf_data +{ + uint64_t tab[1 << EXPF_TABLE_BITS]; + double invln2_scaled; + double poly_scaled[EXPF_POLY_ORDER]; +} __expf_data HIDDEN; + +#define EXPM1_POLY_ORDER 11 +extern const double __expm1_poly[EXPM1_POLY_ORDER] HIDDEN; + +extern const struct cbrtf_data +{ + float poly[4]; + float table[5]; +} 
__cbrtf_data HIDDEN; + +extern const struct cbrt_data +{ + double poly[4]; + double table[5]; +} __cbrt_data HIDDEN; + +extern const struct v_tan_data +{ + double neg_half_pi_hi, neg_half_pi_lo; + double poly[9]; +} __v_tan_data HIDDEN; +#endif diff --git a/contrib/arm-optimized-routines/pl/math/math_err.c b/contrib/arm-optimized-routines/pl/math/math_err.c new file mode 100644 index 000000000000..d246a89982de --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/math_err.c @@ -0,0 +1,78 @@ +/* + * Double-precision math error handling. + * + * Copyright (c) 2018-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#if WANT_ERRNO +#include <errno.h> +/* NOINLINE reduces code size and avoids making math functions non-leaf + when the error handling is inlined. */ +NOINLINE static double +with_errno (double y, int e) +{ + errno = e; + return y; +} +#else +#define with_errno(x, e) (x) +#endif + +/* NOINLINE reduces code size. */ +NOINLINE static double +xflow (uint32_t sign, double y) +{ + y = eval_as_double (opt_barrier_double (sign ? -y : y) * y); + return with_errno (y, ERANGE); +} + +HIDDEN double +__math_uflow (uint32_t sign) +{ + return xflow (sign, 0x1p-767); +} + +/* Underflows to zero in some non-nearest rounding mode, setting errno + is valid even if the result is non-zero, but in the subnormal range. */ +HIDDEN double +__math_may_uflow (uint32_t sign) +{ + return xflow (sign, 0x1.8p-538); +} + +HIDDEN double +__math_oflow (uint32_t sign) +{ + return xflow (sign, 0x1p769); +} + +HIDDEN double +__math_divzero (uint32_t sign) +{ + double y = opt_barrier_double (sign ? -1.0 : 1.0) / 0.0; + return with_errno (y, ERANGE); +} + +HIDDEN double +__math_invalid (double x) +{ + double y = (x - x) / (x - x); + return isnan (x) ? y : with_errno (y, EDOM); +} + +/* Check result and set errno if necessary. */ + +HIDDEN double +__math_check_uflow (double y) +{ + return y == 0.0 ? 
with_errno (y, ERANGE) : y; +} + +HIDDEN double +__math_check_oflow (double y) +{ + return isinf (y) ? with_errno (y, ERANGE) : y; +} diff --git a/contrib/arm-optimized-routines/pl/math/math_errf.c b/contrib/arm-optimized-routines/pl/math/math_errf.c new file mode 100644 index 000000000000..96271ff18bc1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/math_errf.c @@ -0,0 +1,78 @@ +/* + * Single-precision math error handling. + * + * Copyright (c) 2017-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#if WANT_ERRNO +#include <errno.h> +/* NOINLINE reduces code size and avoids making math functions non-leaf + when the error handling is inlined. */ +NOINLINE static float +with_errnof (float y, int e) +{ + errno = e; + return y; +} +#else +#define with_errnof(x, e) (x) +#endif + +/* NOINLINE reduces code size. */ +NOINLINE static float +xflowf (uint32_t sign, float y) +{ + y = eval_as_float (opt_barrier_float (sign ? -y : y) * y); + return with_errnof (y, ERANGE); +} + +HIDDEN float +__math_uflowf (uint32_t sign) +{ + return xflowf (sign, 0x1p-95f); +} + +/* Underflows to zero in some non-nearest rounding mode, setting errno + is valid even if the result is non-zero, but in the subnormal range. */ +HIDDEN float +__math_may_uflowf (uint32_t sign) +{ + return xflowf (sign, 0x1.4p-75f); +} + +HIDDEN float +__math_oflowf (uint32_t sign) +{ + return xflowf (sign, 0x1p97f); +} + +HIDDEN float +__math_divzerof (uint32_t sign) +{ + float y = opt_barrier_float (sign ? -1.0f : 1.0f) / 0.0f; + return with_errnof (y, ERANGE); +} + +HIDDEN float +__math_invalidf (float x) +{ + float y = (x - x) / (x - x); + return isnan (x) ? y : with_errnof (y, EDOM); +} + +/* Check result and set errno if necessary. */ + +HIDDEN float +__math_check_uflowf (float y) +{ + return y == 0.0f ? with_errnof (y, ERANGE) : y; +} + +HIDDEN float +__math_check_oflowf (float y) +{ + return isinf (y) ? 
with_errnof (y, ERANGE) : y; +} diff --git a/contrib/arm-optimized-routines/pl/math/pairwise_horner.h b/contrib/arm-optimized-routines/pl/math/pairwise_horner.h new file mode 100644 index 000000000000..6ad98dccd6aa --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/pairwise_horner.h @@ -0,0 +1,14 @@ +/* + * Helper macros for double-precision pairwise Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f64 +#else +#define FMA fma +#endif + +#include "pairwise_horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/pairwise_horner_wrap.h b/contrib/arm-optimized-routines/pl/math/pairwise_horner_wrap.h new file mode 100644 index 000000000000..e56f059514ad --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/pairwise_horner_wrap.h @@ -0,0 +1,48 @@ +/* + * Helper macros for pairwise Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +// clang-format off +#define PW_HORNER_1_(x, c, i) FMA(x, c(i + 1), c(i)) +#define PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_ (x, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_9_(x, x2, c, i) FMA(x2, PW_HORNER_7_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_11_(x, x2, c, i) FMA(x2, PW_HORNER_9_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_13_(x, x2, c, i) FMA(x2, PW_HORNER_11_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_15_(x, x2, c, i) FMA(x2, PW_HORNER_13_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_17_(x, x2, c, i) FMA(x2, PW_HORNER_15_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) + +#define PAIRWISE_HORNER_1(x, c) PW_HORNER_1_ (x, c, 0) +#define PAIRWISE_HORNER_3(x, x2, c) PW_HORNER_3_ (x, x2, c, 0) +#define PAIRWISE_HORNER_5(x, x2, c) PW_HORNER_5_ (x, x2, c, 0) +#define PAIRWISE_HORNER_7(x, x2, c) PW_HORNER_7_ (x, x2, c, 0) +#define PAIRWISE_HORNER_9(x, x2, c) PW_HORNER_9_ (x, x2, c, 0) +#define PAIRWISE_HORNER_11(x, x2, c) PW_HORNER_11_(x, x2, c, 0) +#define PAIRWISE_HORNER_13(x, x2, c) PW_HORNER_13_(x, x2, c, 0) +#define PAIRWISE_HORNER_15(x, x2, c) PW_HORNER_15_(x, x2, c, 0) +#define PAIRWISE_HORNER_17(x, x2, c) PW_HORNER_17_(x, x2, c, 0) + +#define PW_HORNER_2_(x, x2, c, i) FMA(x2, c(i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_4_(x, x2, c, i) FMA(x2, PW_HORNER_2_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_6_(x, x2, c, i) FMA(x2, PW_HORNER_4_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_8_(x, x2, c, i) FMA(x2, PW_HORNER_6_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_10_(x, x2, c, i) FMA(x2, PW_HORNER_8_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define 
PW_HORNER_12_(x, x2, c, i) FMA(x2, PW_HORNER_10_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_14_(x, x2, c, i) FMA(x2, PW_HORNER_12_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_16_(x, x2, c, i) FMA(x2, PW_HORNER_14_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) +#define PW_HORNER_18_(x, x2, c, i) FMA(x2, PW_HORNER_16_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i)) + +#define PAIRWISE_HORNER_2(x, x2, c) PW_HORNER_2_ (x, x2, c, 0) +#define PAIRWISE_HORNER_4(x, x2, c) PW_HORNER_4_ (x, x2, c, 0) +#define PAIRWISE_HORNER_6(x, x2, c) PW_HORNER_6_ (x, x2, c, 0) +#define PAIRWISE_HORNER_8(x, x2, c) PW_HORNER_8_(x, x2, c, 0) +#define PAIRWISE_HORNER_10(x, x2, c) PW_HORNER_10_(x, x2, c, 0) +#define PAIRWISE_HORNER_12(x, x2, c) PW_HORNER_12_(x, x2, c, 0) +#define PAIRWISE_HORNER_14(x, x2, c) PW_HORNER_14_(x, x2, c, 0) +#define PAIRWISE_HORNER_16(x, x2, c) PW_HORNER_16_(x, x2, c, 0) +#define PAIRWISE_HORNER_18(x, x2, c) PW_HORNER_18_(x, x2, c, 0) +// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/pairwise_hornerf.h b/contrib/arm-optimized-routines/pl/math/pairwise_hornerf.h new file mode 100644 index 000000000000..784750cde0b6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/pairwise_hornerf.h @@ -0,0 +1,14 @@ +/* + * Helper macros for single-precision pairwise Horner polynomial evaluation. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#if V_SUPPORTED +#define FMA v_fma_f32 +#else +#define FMA fmaf +#endif + +#include "pairwise_horner_wrap.h" diff --git a/contrib/arm-optimized-routines/pl/math/pl_sig.h b/contrib/arm-optimized-routines/pl/math/pl_sig.h new file mode 100644 index 000000000000..686d24f0d9a5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/pl_sig.h @@ -0,0 +1,43 @@ +/* + * PL macros for emitting various ulp/bench entries based on function signature + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. + */ +#define PL_DECL_SF1(fun) float fun##f (float); +#define PL_DECL_SF2(fun) float fun##f (float, float); +#define PL_DECL_SD1(fun) double fun (double); +#define PL_DECL_SD2(fun) double fun (double, double); + +#if V_SUPPORTED +#define PL_DECL_VF1(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t); +#define PL_DECL_VF2(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t, v_f32_t); +#define PL_DECL_VD1(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t); +#define PL_DECL_VD2(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t, v_f64_t); +#else +#define PL_DECL_VF1(fun) +#define PL_DECL_VF2(fun) +#define PL_DECL_VD1(fun) +#define PL_DECL_VD2(fun) +#endif + +#if SV_SUPPORTED +#define PL_DECL_SVF1(fun) sv_f32_t __sv_##fun##f_x (sv_f32_t, svbool_t); +#define PL_DECL_SVF2(fun) \ + sv_f32_t __sv_##fun##f_x (sv_f32_t, sv_f32_t, svbool_t); +#define PL_DECL_SVD1(fun) sv_f64_t __sv_##fun##_x (sv_f64_t, svbool_t); +#define PL_DECL_SVD2(fun) \ + sv_f64_t __sv_##fun##_x (sv_f64_t, sv_f64_t, svbool_t); +#else +#define PL_DECL_SVF1(fun) +#define PL_DECL_SVF2(fun) +#define PL_DECL_SVD1(fun) +#define PL_DECL_SVD2(fun) +#endif + +/* For building the routines, emit function prototype from PL_SIG. This + ensures that the correct signature has been chosen (wrong one will be a + compile error). PL_SIG is defined differently by various components of the + build system to emit entries in the wrappers and entries for mathbench and + ulp. */ +#define PL_SIG(v, t, a, f, ...) PL_DECL_##v##t##a (f) diff --git a/contrib/arm-optimized-routines/pl/math/s_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/s_acosh_3u5.c new file mode 100644 index 000000000000..f62cbd6b53f0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_acosh_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_acosh_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_acoshf_3u1.c b/contrib/arm-optimized-routines/pl/math/s_acoshf_3u1.c new file mode 100644 index 000000000000..374066622a0f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_acoshf_3u1.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_acoshf_3u1.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_asinh_3u5.c b/contrib/arm-optimized-routines/pl/math/s_asinh_3u5.c new file mode 100644 index 000000000000..ab8fbd9c3d69 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_asinh_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_asinh_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_asinhf_2u7.c b/contrib/arm-optimized-routines/pl/math/s_asinhf_2u7.c new file mode 100644 index 000000000000..13e1a5fd314a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_asinhf_2u7.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_asinhf_2u7.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atan2_3u.c b/contrib/arm-optimized-routines/pl/math/s_atan2_3u.c new file mode 100644 index 000000000000..4603e5f72615 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_atan2_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atan2_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/s_atan2f_3u.c new file mode 100644 index 000000000000..894d843273ea --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_atan2f_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atan2f_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/s_atan_2u5.c new file mode 100644 index 000000000000..4b61bc4d1460 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_atan_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atan_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atanf_3u.c b/contrib/arm-optimized-routines/pl/math/s_atanf_3u.c new file mode 100644 index 000000000000..6b6571927195 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_atanf_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atanf_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atanh_3u5.c b/contrib/arm-optimized-routines/pl/math/s_atanh_3u5.c new file mode 100644 index 000000000000..f6a5f75b1779 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_atanh_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atanh_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/s_atanhf_3u1.c new file mode 100644 index 000000000000..e7e5c6197406 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_atanhf_3u1.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_atanhf_3u1.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c new file mode 100644 index 000000000000..435e74a546c6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_cbrt_2u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_cbrt_2u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/s_cbrtf_1u5.c new file mode 100644 index 000000000000..5c793704b62a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_cbrtf_1u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_cbrtf_1u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/s_cosh_2u.c new file mode 100644 index 000000000000..cdf352cf5793 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_cosh_2u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_cosh_2u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_coshf_2u4.c b/contrib/arm-optimized-routines/pl/math/s_coshf_2u4.c new file mode 100644 index 000000000000..8f7d5da6e6ef --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_coshf_2u4.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_coshf_2u4.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erf_2u.c b/contrib/arm-optimized-routines/pl/math/s_erf_2u.c new file mode 100644 index 000000000000..839535c3897f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_erf_2u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_erf_2u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/s_erfc_4u.c new file mode 100644 index 000000000000..bf9e3e62bd31 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_erfc_4u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_erfc_4u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erfcf_1u.c b/contrib/arm-optimized-routines/pl/math/s_erfcf_1u.c new file mode 100644 index 000000000000..024d22498ff5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_erfcf_1u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_erfcf_1u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_erff_1u5.c b/contrib/arm-optimized-routines/pl/math/s_erff_1u5.c new file mode 100644 index 000000000000..a5b9bf9afa72 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_erff_1u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_erff_1u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_exp_tail.c b/contrib/arm-optimized-routines/pl/math/s_exp_tail.c new file mode 100644 index 000000000000..20b1b41a9689 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_exp_tail.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_exp_tail.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_expf.c b/contrib/arm-optimized-routines/pl/math/s_expf.c new file mode 100644 index 000000000000..557a2e3d36af --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_expf.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_expf.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/s_expm1_2u5.c new file mode 100644 index 000000000000..da2d6e7ebf82 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_expm1_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_expm1_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/s_expm1f_1u6.c new file mode 100644 index 000000000000..eea8089da989 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_expm1f_1u6.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_expm1f_1u6.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/s_log10_2u5.c new file mode 100644 index 000000000000..2480e5aa2cf1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_log10_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log10_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/s_log10f_3u5.c new file mode 100644 index 000000000000..173e0fdc3400 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_log10f_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log10f_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/s_log1p_2u5.c new file mode 100644 index 000000000000..20b395a5a2d0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_log1p_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log1p_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/s_log1pf_2u1.c new file mode 100644 index 000000000000..013ec4c1d903 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_log1pf_2u1.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log1pf_2u1.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log2_3u.c b/contrib/arm-optimized-routines/pl/math/s_log2_3u.c new file mode 100644 index 000000000000..d46f3f998190 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_log2_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log2_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/s_log2f_2u5.c new file mode 100644 index 000000000000..e76c67dceb62 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_log2f_2u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_log2f_2u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/s_sinh_3u.c new file mode 100644 index 000000000000..27e5e65db178 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_sinh_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_sinh_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/s_sinhf_2u3.c new file mode 100644 index 000000000000..607f94298a79 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_sinhf_2u3.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_sinhf_2u3.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/s_tan_3u5.c new file mode 100644 index 000000000000..adb807c5beb8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_tan_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tan_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/s_tanf_3u5.c new file mode 100644 index 000000000000..fa64c8aef697 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_tanf_3u5.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tanf_3u5.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/s_tanh_3u.c new file mode 100644 index 000000000000..a4d7bce649f1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_tanh_3u.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tanh_3u.c" diff --git a/contrib/arm-optimized-routines/pl/math/s_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/s_tanhf_2u6.c new file mode 100644 index 000000000000..896fc62ebe9b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/s_tanhf_2u6.c @@ -0,0 +1,6 @@ +/* + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#define SCALAR 1 +#include "v_tanhf_2u6.c" diff --git a/contrib/arm-optimized-routines/pl/math/sinh_3u.c b/contrib/arm-optimized-routines/pl/math/sinh_3u.c new file mode 100644 index 000000000000..f534815c6674 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sinh_3u.c @@ -0,0 +1,66 @@ +/* + * Double-precision sinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define OFlowBound \ + 0x40862e42fefa39f0 /* 0x1.62e42fefa39fp+9, above which using expm1 results \ + in NaN. */ + +double +__exp_dd (double, double); + +/* Approximation for double-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The greatest observed error is 2.57 ULP: + __v_sinh(0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2 + want 0x1.ab34e59d678d9p-2. */ +double +sinh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t iax = ix & AbsMask; + double ax = asdouble (iax); + uint64_t sign = ix & ~AbsMask; + double halfsign = asdouble (Half | sign); + + if (unlikely (iax >= OFlowBound)) + { + /* Special values and overflow. */ + if (unlikely (iax > 0x7ff0000000000000)) + return __math_invalidf (x); + /* expm1 overflows a little before sinh. We have to fill this + gap by using a different algorithm, in this case we use a + double-precision exp helper. 
For large x sinh(x) is dominated + by exp(x), however we cannot compute exp without overflow + either. We use the identity: exp(a) = (exp(a / 2)) ^ 2 + to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2 for x > 0 + ~= (exp(|x| / 2)) ^ 2 / -2 for x < 0. */ + double e = __exp_dd (ax / 2, 0); + return (e * halfsign) * e; + } + + /* Use expm1f to retain acceptable precision for small numbers. + Let t = e^(|x|) - 1. */ + double t = expm1 (ax); + /* Then sinh(x) = (t + t / (t + 1)) / 2 for x > 0 + (t + t / (t + 1)) / -2 for x < 0. */ + return (t + t / (t + 1)) * halfsign; +} + +PL_SIG (S, D, 1, sinh, -10.0, 10.0) +PL_TEST_ULP (sinh, 2.08) +PL_TEST_INTERVAL (sinh, 0, 0x1p-51, 100) +PL_TEST_INTERVAL (sinh, -0, -0x1p-51, 100) +PL_TEST_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000) +PL_TEST_INTERVAL (sinh, -0x1p-51, -0x1.62e42fefa39fp+9, 100000) +PL_TEST_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000) +PL_TEST_INTERVAL (sinh, -0x1.62e42fefa39fp+9, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c new file mode 100644 index 000000000000..de944288a02b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sinhf_2u3.c @@ -0,0 +1,76 @@ +/* + * Single-precision sinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define Expm1OFlowLimit \ + 0x42b17218 /* 0x1.62e43p+6, 2^7*ln2, minimum value for which expm1f \ + overflows. */ +#define OFlowLimit \ + 0x42b2d4fd /* 0x1.65a9fap+6, minimum positive value for which sinhf should \ + overflow. */ + +float +optr_aor_exp_f32 (float); + +/* Approximation for single-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The maximum error is 2.26 ULP: + sinhf(0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4. 
*/ +float +sinhf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + float ax = asfloat (iax); + uint32_t sign = ix & ~AbsMask; + float halfsign = asfloat (Half | sign); + + if (unlikely (iax >= Expm1OFlowLimit)) + { + /* Special values and overflow. */ + if (iax >= 0x7fc00001 || iax == 0x7f800000) + return x; + if (iax >= 0x7f800000) + return __math_invalidf (x); + if (iax >= OFlowLimit) + return __math_oflowf (sign); + + /* expm1f overflows a little before sinhf, (~88.7 vs ~89.4). We have to + fill this gap by using a different algorithm, in this case we use a + double-precision exp helper. For large x sinh(x) dominated by exp(x), + however we cannot compute exp without overflow either. We use the + identity: + exp(a) = (exp(a / 2)) ^ 2. + to compute sinh(x) ~= (exp(|x| / 2)) ^ 2 / 2 for x > 0 + ~= (exp(|x| / 2)) ^ 2 / -2 for x < 0. + Greatest error in this region is 1.89 ULP: + sinhf(0x1.65898cp+6) got 0x1.f00aep+127 want 0x1.f00adcp+127. */ + float e = optr_aor_exp_f32 (ax / 2); + return (e * halfsign) * e; + } + + /* Use expm1f to retain acceptable precision for small numbers. + Let t = e^(|x|) - 1. */ + float t = expm1f (ax); + /* Then sinh(x) = (t + t / (t + 1)) / 2 for x > 0 + (t + t / (t + 1)) / -2 for x < 0. 
*/ + return (t + t / (t + 1)) * halfsign; +} + +PL_SIG (S, F, 1, sinh, -10.0, 10.0) +PL_TEST_ULP (sinhf, 1.76) +PL_TEST_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000) +PL_TEST_INTERVAL (sinhf, -0, -0x1.62e43p+6, 100000) +PL_TEST_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100) +PL_TEST_INTERVAL (sinhf, -0x1.62e43p+6, -0x1.65a9fap+6, 100) +PL_TEST_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100) +PL_TEST_INTERVAL (sinhf, -0x1.65a9fap+6, -inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c new file mode 100644 index 000000000000..a4bea1dcba09 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atan2_2u5.c @@ -0,0 +1,93 @@ +/* + * Double-precision vector atan2(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#include "sv_atan_common.h" + +/* Useful constants. */ +#define PiOver2 sv_f64 (0x1.921fb54442d18p+0) +#define SignMask sv_u64 (0x8000000000000000) + +/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ +__attribute__ ((noinline)) static sv_f64_t +specialcase (sv_f64_t y, sv_f64_t x, sv_f64_t ret, const svbool_t cmp) +{ + return sv_call2_f64 (atan2, y, x, ret, cmp); +} + +/* Returns a predicate indicating true if the input is the bit representation of + 0, infinity or nan. */ +static inline svbool_t +zeroinfnan (sv_u64_t i, const svbool_t pg) +{ + return svcmpge_u64 (pg, svsub_n_u64_x (pg, svlsl_n_u64_x (pg, i, 1), 1), + sv_u64 (2 * asuint64 (INFINITY) - 1)); +} + +/* Fast implementation of SVE atan2. Errors are greatest when y and + x are reasonably close together. The greatest observed error is 2.28 ULP: + sv_atan2(-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) + got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. 
*/ +sv_f64_t +__sv_atan2_x (sv_f64_t y, sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t iy = sv_as_u64_f64 (y); + + svbool_t cmp_x = zeroinfnan (ix, pg); + svbool_t cmp_y = zeroinfnan (iy, pg); + svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y); + + sv_u64_t sign_x = svand_u64_x (pg, ix, SignMask); + sv_u64_t sign_y = svand_u64_x (pg, iy, SignMask); + sv_u64_t sign_xy = sveor_u64_x (pg, sign_x, sign_y); + + sv_f64_t ax = svabs_f64_x (pg, x); + sv_f64_t ay = svabs_f64_x (pg, y); + + svbool_t pred_xlt0 = svcmplt_f64 (pg, x, sv_f64 (0.0)); + svbool_t pred_aygtax = svcmpgt_f64 (pg, ay, ax); + + /* Set up z for call to atan. */ + sv_f64_t n = svsel_f64 (pred_aygtax, svneg_f64_x (pg, ax), ay); + sv_f64_t d = svsel_f64 (pred_aygtax, ay, ax); + sv_f64_t z = svdiv_f64_x (pg, n, d); + + /* Work out the correct shift. */ + sv_f64_t shift = svsel_f64 (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0)); + shift = svsel_f64 (pred_aygtax, svadd_n_f64_x (pg, shift, 1.0), shift); + shift = svmul_f64_x (pg, shift, PiOver2); + + sv_f64_t ret = __sv_atan_common (pg, pg, z, z, shift); + + /* Account for the sign of x and y. */ + ret = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (ret), sign_xy)); + + if (unlikely (svptest_any (pg, cmp_xy))) + { + return specialcase (y, x, ret, cmp_xy); + } + + return ret; +} + +PL_ALIAS (__sv_atan2_x, _ZGVsMxvv_atan2) + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ +PL_SIG (SV, D, 2, atan2) +PL_TEST_ULP (__sv_atan2, 1.78) +PL_TEST_INTERVAL (__sv_atan2, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (__sv_atan2, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan2, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan2, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (__sv_atan2, 1e6, 1e32, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c new file mode 100644 index 000000000000..f7674c441f2f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atan2f_3u.c @@ -0,0 +1,94 @@ +/* + * Single-precision vector atan2f(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#include "sv_atanf_common.h" + +/* Useful constants. */ +#define PiOver2 sv_f32 (0x1.921fb6p+0f) +#define SignMask sv_u32 (0x80000000) + +/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ +static inline sv_f32_t +specialcase (sv_f32_t y, sv_f32_t x, sv_f32_t ret, const svbool_t cmp) +{ + return sv_call2_f32 (atan2f, y, x, ret, cmp); +} + +/* Returns a predicate indicating true if the input is the bit representation of + 0, infinity or nan. */ +static inline svbool_t +zeroinfnan (sv_u32_t i, const svbool_t pg) +{ + return svcmpge_u32 (pg, svsub_n_u32_x (pg, svlsl_n_u32_x (pg, i, 1), 1), + sv_u32 (2 * 0x7f800000lu - 1)); +} + +/* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * P(z^2) + with reduction to [0,1] using z=1/x and shift = pi/2. + Maximum observed error is 2.95 ULP: + __sv_atan2f(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. 
*/ +sv_f32_t +__sv_atan2f_x (sv_f32_t y, sv_f32_t x, const svbool_t pg) +{ + sv_u32_t ix = sv_as_u32_f32 (x); + sv_u32_t iy = sv_as_u32_f32 (y); + + svbool_t cmp_x = zeroinfnan (ix, pg); + svbool_t cmp_y = zeroinfnan (iy, pg); + svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y); + + sv_u32_t sign_x = svand_u32_x (pg, ix, SignMask); + sv_u32_t sign_y = svand_u32_x (pg, iy, SignMask); + sv_u32_t sign_xy = sveor_u32_x (pg, sign_x, sign_y); + + sv_f32_t ax = svabs_f32_x (pg, x); + sv_f32_t ay = svabs_f32_x (pg, y); + + svbool_t pred_xlt0 = svcmplt_f32 (pg, x, sv_f32 (0.0)); + svbool_t pred_aygtax = svcmpgt_f32 (pg, ay, ax); + + /* Set up z for call to atan. */ + sv_f32_t n = svsel_f32 (pred_aygtax, svneg_f32_x (pg, ax), ay); + sv_f32_t d = svsel_f32 (pred_aygtax, ay, ax); + sv_f32_t z = svdiv_f32_x (pg, n, d); + + /* Work out the correct shift. */ + sv_f32_t shift = svsel_f32 (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0)); + shift = svsel_f32 (pred_aygtax, svadd_n_f32_x (pg, shift, 1.0), shift); + shift = svmul_f32_x (pg, shift, PiOver2); + + sv_f32_t ret = __sv_atanf_common (pg, pg, z, z, shift); + + /* Account for the sign of x and y. */ + ret = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (ret), sign_xy)); + + if (unlikely (svptest_any (pg, cmp_xy))) + { + return specialcase (y, x, ret, cmp_xy); + } + + return ret; +} + +PL_ALIAS (__sv_atan2f_x, _ZGVsMxvv_atan2f) + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. 
*/ +PL_SIG (SV, F, 2, atan2) +PL_TEST_ULP (__sv_atan2f, 2.45) +PL_TEST_INTERVAL (__sv_atan2f, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (__sv_atan2f, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan2f, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan2f, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (__sv_atan2f, 1e6, 1e32, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c new file mode 100644 index 000000000000..02ac331970c9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atan_2u5.c @@ -0,0 +1,62 @@ +/* + * Double-precision vector atan(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#include "sv_atan_common.h" + +/* Useful constants. */ +#define PiOver2 sv_f64 (0x1.921fb54442d18p+0) +#define AbsMask (0x7fffffffffffffff) + +/* Fast implementation of SVE atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed + error is 2.27 ulps: + __sv_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +sv_f64_t +__sv_atan_x (sv_f64_t x, const svbool_t pg) +{ + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. */ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + svbool_t red = svacgt_n_f64 (pg, x, 1.0); + /* Avoid dependency in abs(x) in division (and comparison). */ + sv_f64_t z = svsel_f64 (red, svdiv_f64_x (pg, sv_f64 (-1.0), x), x); + /* Use absolute value only when needed (odd powers of z). 
*/ + sv_f64_t az = svabs_f64_x (pg, z); + az = svneg_f64_m (az, red, az); + + sv_f64_t y = __sv_atan_common (pg, red, z, az, PiOver2); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); + + return y; +} + +PL_ALIAS (__sv_atan_x, _ZGVsMxv_atan) + +PL_SIG (SV, D, 1, atan, -3.1, 3.1) +PL_TEST_ULP (__sv_atan, 1.78) +PL_TEST_INTERVAL (__sv_atan, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (__sv_atan, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atan, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (__sv_atan, 1e6, 1e32, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_atan_common.h b/contrib/arm-optimized-routines/pl/math/sv_atan_common.h new file mode 100644 index 000000000000..bfe6998d2416 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atan_common.h @@ -0,0 +1,61 @@ +/* + * Double-precision polynomial evaluation function for SVE atan(x) and + * atan2(y,x). + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" +#include "sv_math.h" + +#define P(i) sv_f64 (__atan_poly_data.poly[i]) + +/* Polynomial used in fast SVE atan(x) and atan2(y,x) implementations + The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */ +static inline sv_f64_t +__sv_atan_common (svbool_t pg, svbool_t red, sv_f64_t z, sv_f64_t az, + sv_f64_t shift) +{ + /* Use full Estrin scheme for P(z^2) with deg(P)=19. */ + sv_f64_t z2 = svmul_f64_x (pg, z, z); + + /* Level 1. 
*/ + sv_f64_t P_1_0 = sv_fma_f64_x (pg, P (1), z2, P (0)); + sv_f64_t P_3_2 = sv_fma_f64_x (pg, P (3), z2, P (2)); + sv_f64_t P_5_4 = sv_fma_f64_x (pg, P (5), z2, P (4)); + sv_f64_t P_7_6 = sv_fma_f64_x (pg, P (7), z2, P (6)); + sv_f64_t P_9_8 = sv_fma_f64_x (pg, P (9), z2, P (8)); + sv_f64_t P_11_10 = sv_fma_f64_x (pg, P (11), z2, P (10)); + sv_f64_t P_13_12 = sv_fma_f64_x (pg, P (13), z2, P (12)); + sv_f64_t P_15_14 = sv_fma_f64_x (pg, P (15), z2, P (14)); + sv_f64_t P_17_16 = sv_fma_f64_x (pg, P (17), z2, P (16)); + sv_f64_t P_19_18 = sv_fma_f64_x (pg, P (19), z2, P (18)); + + /* Level 2. */ + sv_f64_t x2 = svmul_f64_x (pg, z2, z2); + sv_f64_t P_3_0 = sv_fma_f64_x (pg, P_3_2, x2, P_1_0); + sv_f64_t P_7_4 = sv_fma_f64_x (pg, P_7_6, x2, P_5_4); + sv_f64_t P_11_8 = sv_fma_f64_x (pg, P_11_10, x2, P_9_8); + sv_f64_t P_15_12 = sv_fma_f64_x (pg, P_15_14, x2, P_13_12); + sv_f64_t P_19_16 = sv_fma_f64_x (pg, P_19_18, x2, P_17_16); + + /* Level 3. */ + sv_f64_t x4 = svmul_f64_x (pg, x2, x2); + sv_f64_t P_7_0 = sv_fma_f64_x (pg, P_7_4, x4, P_3_0); + sv_f64_t P_15_8 = sv_fma_f64_x (pg, P_15_12, x4, P_11_8); + + /* Level 4. */ + sv_f64_t x8 = svmul_f64_x (pg, x4, x4); + sv_f64_t y = sv_fma_f64_x (pg, P_19_16, x8, P_15_8); + y = sv_fma_f64_x (pg, y, x8, P_7_0); + + /* Finalize. y = shift + z + z^3 * P(z^2). */ + sv_f64_t z3 = svmul_f64_x (pg, z2, az); + y = sv_fma_f64_x (pg, y, z3, az); + + /* Apply shift as indicated by `red` predicate. */ + y = svadd_f64_m (red, y, shift); + + return y; +} diff --git a/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c b/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c new file mode 100644 index 000000000000..8d38e42b2290 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atanf_2u9.c @@ -0,0 +1,59 @@ +/* + * Single-precision vector atan(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#include "sv_atanf_common.h" + +#define PiOver2 sv_f32 (0x1.921fb6p+0f) +#define AbsMask (0x7fffffff) + +/* Fast implementation of SVE atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=-1/x and shift = pi/2. + Largest observed error is 2.9 ULP, close to +/-1.0: + __sv_atanf(0x1.0468f6p+0) got -0x1.967f06p-1 + want -0x1.967fp-1. */ +sv_f32_t +__sv_atanf_x (sv_f32_t x, const svbool_t pg) +{ + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. */ + sv_u32_t ix = sv_as_u32_f32 (x); + sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask); + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + svbool_t red = svacgt_n_f32 (pg, x, 1.0f); + /* Avoid dependency in abs(x) in division (and comparison). */ + sv_f32_t z = svsel_f32 (red, svdiv_f32_x (pg, sv_f32 (-1.0f), x), x); + /* Use absolute value only when needed (odd powers of z). */ + sv_f32_t az = svabs_f32_x (pg, z); + az = svneg_f32_m (az, red, az); + + sv_f32_t y = __sv_atanf_common (pg, red, z, az, PiOver2); + + /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ + return sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign)); +} + +PL_ALIAS (__sv_atanf_x, _ZGVsMxv_atanf) + +PL_SIG (SV, F, 1, atan, -3.1, 3.1) +PL_TEST_ULP (__sv_atanf, 2.9) +PL_TEST_INTERVAL (__sv_atanf, -10.0, 10.0, 50000) +PL_TEST_INTERVAL (__sv_atanf, -1.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atanf, 0.0, 1.0, 40000) +PL_TEST_INTERVAL (__sv_atanf, 1.0, 100.0, 40000) +PL_TEST_INTERVAL (__sv_atanf, 1e6, 1e32, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_atanf_common.h b/contrib/arm-optimized-routines/pl/math/sv_atanf_common.h new file mode 100644 index 000000000000..dc45effec1cd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_atanf_common.h @@ -0,0 +1,47 @@ +/* + * Single-precision polynomial evaluation function for SVE atan(x) and + * atan2(y,x). + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_SV_ATANF_COMMON_H +#define PL_MATH_SV_ATANF_COMMON_H + +#include "math_config.h" +#include "sv_math.h" + +#define P(i) sv_f32 (__atanf_poly_data.poly[i]) + +/* Polynomial used in fast SVE atanf(x) and atan2f(y,x) implementations + The order 7 polynomial P approximates (f(sqrt(x))-sqrt(x))/x^(3/2). */ +static inline sv_f32_t +__sv_atanf_common (svbool_t pg, svbool_t red, sv_f32_t z, sv_f32_t az, + sv_f32_t shift) +{ + /* Use full Estrin scheme for P(z^2) with deg(P)=7. */ + + /* First compute square powers of z. */ + sv_f32_t z2 = svmul_f32_x (pg, z, z); + sv_f32_t z4 = svmul_f32_x (pg, z2, z2); + sv_f32_t z8 = svmul_f32_x (pg, z4, z4); + + /* Then assemble polynomial. */ + sv_f32_t p_4_7 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (7), P (6))), + (sv_fma_f32_x (pg, z2, P (5), P (4)))); + sv_f32_t p_0_3 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (3), P (2))), + (sv_fma_f32_x (pg, z2, P (1), P (0)))); + sv_f32_t y = sv_fma_f32_x (pg, z8, p_4_7, p_0_3); + + /* Finalize. y = shift + z + z^3 * P(z^2). 
*/ + sv_f32_t z3 = svmul_f32_x (pg, z2, az); + y = sv_fma_f32_x (pg, y, z3, az); + + /* Apply shift as indicated by 'red' predicate. */ + y = svadd_f32_m (red, y, shift); + + return y; +} + +#endif // PL_MATH_SV_ATANF_COMMON_H diff --git a/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c new file mode 100644 index 000000000000..194034802452 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cos_2u5.c @@ -0,0 +1,84 @@ +/* + * Double-precision SVE cos(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1)) +#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0)) +#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26)) +#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54)) +/* Original shift used in Neon cos, + plus a contribution to set the bit #0 of q + as expected by trigonometric instructions. */ +#define Shift (sv_f64 (0x1.8000000000001p52)) +#define RangeVal (sv_f64 (0x1p23)) +#define AbsMask (0x7fffffffffffffff) + +static NOINLINE sv_f64_t +__sv_cos_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (cos, x, y, cmp); +} + +/* A fast SVE implementation of cos based on trigonometric + instructions (FTMAD, FTSSEL, FTSMUL). + Maximum measured error: 2.108 ULPs. + __sv_cos(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3 + want -0x1.fddd4c65c7f05p-3. */ +sv_f64_t +__sv_cos_x (sv_f64_t x, const svbool_t pg) +{ + sv_f64_t n, r, r2, y; + svbool_t cmp; + + r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask)); + cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal)); + + /* n = rint(|x|/(pi/2)). */ + sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift); + n = svsub_f64_x (pg, q, Shift); + + /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). 
*/ + r = sv_fma_f64_x (pg, NegPio2_1, n, r); + r = sv_fma_f64_x (pg, NegPio2_2, n, r); + r = sv_fma_f64_x (pg, NegPio2_3, n, r); + + /* cos(r) poly approx. */ + r2 = svtsmul_f64 (r, sv_as_u64_f64 (q)); + y = sv_f64 (0.0); + y = svtmad_f64 (y, r2, 7); + y = svtmad_f64 (y, r2, 6); + y = svtmad_f64 (y, r2, 5); + y = svtmad_f64 (y, r2, 4); + y = svtmad_f64 (y, r2, 3); + y = svtmad_f64 (y, r2, 2); + y = svtmad_f64 (y, r2, 1); + y = svtmad_f64 (y, r2, 0); + + /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ + sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q)); + /* Apply factor. */ + y = svmul_f64_x (pg, f, y); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. */ + if (unlikely (svptest_any (pg, cmp))) + return __sv_cos_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_cos_x, _ZGVsMxv_cos) + +PL_SIG (SV, D, 1, cos, -3.1, 3.1) +PL_TEST_ULP (__sv_cos, 1.61) +PL_TEST_INTERVAL (__sv_cos, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_cos, 0x1p-4, 0x1p4, 500000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c b/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c new file mode 100644 index 000000000000..8f138bcba7af --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_cosf_2u1.c @@ -0,0 +1,82 @@ +/* + * Single-precision SVE cos(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f)) +#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f)) +#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f)) +#define RangeVal (sv_f32 (0x1p20f)) +#define InvPio2 (sv_f32 (0x1.45f306p-1f)) +/* Original shift used in Neon cosf, + plus a contribution to set the bit #0 of q + as expected by trigonometric instructions. 
*/ +#define Shift (sv_f32 (0x1.800002p+23f)) +#define AbsMask (0x7fffffff) + +static NOINLINE sv_f32_t +__sv_cosf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (cosf, x, y, cmp); +} + +/* A fast SVE implementation of cosf based on trigonometric + instructions (FTMAD, FTSSEL, FTSMUL). + Maximum measured error: 2.06 ULPs. + __sv_cosf(0x1.dea2f2p+19) got 0x1.fffe7ap-6 + want 0x1.fffe76p-6. */ +sv_f32_t +__sv_cosf_x (sv_f32_t x, const svbool_t pg) +{ + sv_f32_t n, r, r2, y; + svbool_t cmp; + + r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask)); + cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal)); + + /* n = rint(|x|/(pi/2)). */ + sv_f32_t q = sv_fma_f32_x (pg, InvPio2, r, Shift); + n = svsub_f32_x (pg, q, Shift); + + /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ + r = sv_fma_f32_x (pg, NegPio2_1, n, r); + r = sv_fma_f32_x (pg, NegPio2_2, n, r); + r = sv_fma_f32_x (pg, NegPio2_3, n, r); + + /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ + sv_f32_t f = svtssel_f32 (r, sv_as_u32_f32 (q)); + + /* cos(r) poly approx. */ + r2 = svtsmul_f32 (r, sv_as_u32_f32 (q)); + y = sv_f32 (0.0f); + y = svtmad_f32 (y, r2, 4); + y = svtmad_f32 (y, r2, 3); + y = svtmad_f32 (y, r2, 2); + y = svtmad_f32 (y, r2, 1); + y = svtmad_f32 (y, r2, 0); + + /* Apply factor. */ + y = svmul_f32_x (pg, f, y); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. 
*/ + if (unlikely (svptest_any (pg, cmp))) + return __sv_cosf_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_cosf_x, _ZGVsMxv_cosf) + +PL_SIG (SV, F, 1, cos, -3.1, 3.1) +PL_TEST_ULP (__sv_cosf, 1.57) +PL_TEST_INTERVAL (__sv_cosf, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_cosf, 0x1p-4, 0x1p4, 500000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_erf_3u.c b/contrib/arm-optimized-routines/pl/math/sv_erf_3u.c new file mode 100644 index 000000000000..bec7f8a819d2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_erf_3u.c @@ -0,0 +1,103 @@ +/* + * Double-precision SVE erf(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define Scale (8.0) +#define AbsMask (0x7fffffffffffffff) + +static NOINLINE sv_f64_t +__sv_erf_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (erf, x, y, cmp); +} + +/* Optimized double precision SVE error function erf. + Maximum observed error is 2.62 ULP: + __sv_erf(0x1.79cab7e3078fap+2) got 0x1.0000000000001p+0 + want 0x1.fffffffffffffp-1. */ +sv_f64_t +__sv_erf_x (sv_f64_t x, const svbool_t pg) +{ + /* Use top 16 bits to test for special cases and small values. */ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t atop = svand_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 48), 0x7fff); + + /* Handle both inf/nan as well as small values (|x|<2^-28). */ + svbool_t cmp + = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3e30), 0x7ff0 - 0x3e30); + + /* Get sign and absolute value. */ + sv_f64_t a = sv_as_f64_u64 (svand_n_u64_x (pg, ix, AbsMask)); + sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask); + + /* i = trunc(Scale*x). */ + sv_f64_t a_scale = svmul_n_f64_x (pg, a, Scale); + /* Saturate index of intervals. 
*/ + svbool_t a_lt_6 = svcmplt_n_u64 (pg, atop, 0x4018); + sv_u64_t i = svcvt_u64_f64_m (sv_u64 (V_ERF_NINTS - 1), a_lt_6, a_scale); + + /* Load polynomial coefficients. */ + sv_f64_t P_0 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[0], i); + sv_f64_t P_1 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[1], i); + sv_f64_t P_2 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[2], i); + sv_f64_t P_3 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[3], i); + sv_f64_t P_4 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[4], i); + sv_f64_t P_5 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[5], i); + sv_f64_t P_6 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[6], i); + sv_f64_t P_7 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[7], i); + sv_f64_t P_8 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[8], i); + sv_f64_t P_9 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[9], i); + + /* Get shift and scale. */ + sv_f64_t shift = sv_lookup_f64_x (pg, __v_erf_data.shifts, i); + + /* Transform polynomial variable. + Set z = 0 in the boring domain to avoid overflow. */ + sv_f64_t z = svmla_f64_m (a_lt_6, shift, sv_f64 (Scale), a); + + /* Evaluate polynomial P(z) using level-2 Estrin. */ + sv_f64_t r1 = sv_fma_f64_x (pg, z, P_1, P_0); + sv_f64_t r2 = sv_fma_f64_x (pg, z, P_3, P_2); + sv_f64_t r3 = sv_fma_f64_x (pg, z, P_5, P_4); + sv_f64_t r4 = sv_fma_f64_x (pg, z, P_7, P_6); + sv_f64_t r5 = sv_fma_f64_x (pg, z, P_9, P_8); + + sv_f64_t z2 = svmul_f64_x (pg, z, z); + sv_f64_t z4 = svmul_f64_x (pg, z2, z2); + + sv_f64_t q2 = sv_fma_f64_x (pg, r4, z2, r3); + sv_f64_t q1 = sv_fma_f64_x (pg, r2, z2, r1); + + sv_f64_t y = sv_fma_f64_x (pg, z4, r5, q2); + y = sv_fma_f64_x (pg, z4, y, q1); + + /* y = erf(x) if x > 0, -erf(-x) otherwise. 
*/ + y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); + + if (unlikely (svptest_any (pg, cmp))) + return __sv_erf_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_erf_x, _ZGVsMxv_erf) + +PL_SIG (SV, D, 1, erf, -4.0, 4.0) +PL_TEST_ULP (__sv_erf, 2.13) +PL_TEST_INTERVAL (__sv_erf, 0, 0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erf, 0x1p-28, 1, 60000) +PL_TEST_INTERVAL (__sv_erf, 1, 0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erf, 0x1p28, inf, 20000) +PL_TEST_INTERVAL (__sv_erf, -0, -0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erf, -0x1p-28, -1, 60000) +PL_TEST_INTERVAL (__sv_erf, -1, -0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erf, -0x1p28, -inf, 20000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/sv_erfc_4u.c new file mode 100644 index 000000000000..076b47129862 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_erfc_4u.c @@ -0,0 +1,146 @@ +/* + * Double-precision SVE erfc(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED +#include "sv_exp_tail.h" + +sv_f64_t __sv_exp_x (sv_f64_t, svbool_t); + +static NOINLINE sv_f64_t +specialcase (sv_f64_t x, sv_f64_t y, svbool_t special) +{ + return sv_call_f64 (erfc, x, y, special); +} + +static inline sv_u64_t +lookup_interval_idx (const svbool_t pg, sv_f64_t abs_x) +{ + /* Interval index is calculated by (((abs(x) + 1)^4) >> 53) - 1023, bounded by + the number of polynomials. 
*/ + sv_f64_t xp1 = svadd_n_f64_x (pg, abs_x, 1); + xp1 = svmul_f64_x (pg, xp1, xp1); + xp1 = svmul_f64_x (pg, xp1, xp1); + sv_u64_t interval_idx + = svsub_n_u64_x (pg, svlsr_n_u64_x (pg, sv_as_u64_f64 (xp1), 52), 1023); + return svsel_u64 (svcmple_n_u64 (pg, interval_idx, ERFC_NUM_INTERVALS), + interval_idx, sv_u64 (ERFC_NUM_INTERVALS)); +} + +static inline sv_f64_t +sv_eval_poly (const svbool_t pg, sv_f64_t z, sv_u64_t idx) +{ + sv_u64_t offset = svmul_n_u64_x (pg, idx, ERFC_POLY_ORDER + 1); + const double *base = &__v_erfc_data.poly[0][12]; + sv_f64_t r = sv_lookup_f64_x (pg, base, offset); + for (int i = 0; i < ERFC_POLY_ORDER; i++) + { + base--; + sv_f64_t c = sv_lookup_f64_x (pg, base, offset); + r = sv_fma_f64_x (pg, z, r, c); + } + return r; +} + +static inline sv_f64_t +sv_eval_gauss (const svbool_t pg, sv_f64_t abs_x) +{ + /* Accurate evaluation of exp(-x^2). This operation is sensitive to rounding + errors in x^2, so we compute an estimate for the error and use a custom exp + helper which corrects for the calculated error estimate. */ + sv_f64_t a2 = svmul_f64_x (pg, abs_x, abs_x); + + /* Split abs_x into (a_hi + a_lo), where a_hi is the 'large' component and + a_lo is the 'small' component. */ + const sv_f64_t scale = sv_f64 (0x1.0000002p27); + sv_f64_t a_hi = svneg_f64_x (pg, sv_fma_f64_x (pg, scale, abs_x, + svneg_f64_x (pg, abs_x))); + a_hi = sv_fma_f64_x (pg, scale, abs_x, a_hi); + sv_f64_t a_lo = svsub_f64_x (pg, abs_x, a_hi); + + sv_f64_t a_hi_neg = svneg_f64_x (pg, a_hi); + sv_f64_t a_lo_neg = svneg_f64_x (pg, a_lo); + + /* We can then estimate the error in abs_x^2 by computing (abs_x * abs_x) - + (a_hi + a_lo) * (a_hi + a_lo). 
*/ + sv_f64_t e2 = sv_fma_f64_x (pg, a_hi_neg, a_hi, a2); + e2 = sv_fma_f64_x (pg, a_hi_neg, a_lo, e2); + e2 = sv_fma_f64_x (pg, a_lo_neg, a_hi, e2); + e2 = sv_fma_f64_x (pg, a_lo_neg, a_lo, e2); + + return sv_exp_tail (pg, svneg_f64_x (pg, a2), e2); +} + +/* Optimized double precision vector complementary error function erfc. + Maximum measured error is 3.64 ULP: + __sv_erfc(0x1.4792573ee6cc7p+2) got 0x1.ff3f4c8e200d5p-42 + want 0x1.ff3f4c8e200d9p-42. */ +sv_f64_t +__sv_erfc_x (sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_f64_t abs_x = svabs_f64_x (pg, x); + sv_u64_t atop = svlsr_n_u64_x (pg, sv_as_u64_f64 (abs_x), 52); + + /* Outside of the 'interesting' bounds, [-6, 28], +ve goes to 0, -ve goes + to 2. As long as the polynomial is 0 in the boring zone, we can assemble + the result correctly. This is dealt with in two ways: + + The 'coarse approach' is that the approximation algorithm is + zero-predicated on in_bounds = |x| < 32, which saves the need to do + coefficient lookup etc for |x| >= 32. + + The coarse approach misses [-32, -6] and [28, 32], which are dealt with in + the polynomial and index calculation, such that the polynomial evaluates to + 0 in these regions. */ + /* in_bounds is true for lanes where |x| < 32. */ + svbool_t in_bounds = svcmplt_n_u64 (pg, atop, 0x404); + /* boring_zone = 2 for x < 0, 0 otherwise. */ + sv_f64_t boring_zone + = sv_as_f64_u64 (svlsl_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 63), 62)); + /* Very small, nan and inf. */ + svbool_t special_cases + = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3cd), 0x432); + + /* erfc(|x|) ~= P_i(|x|-x_i)*exp(-x^2) + + Where P_i is a polynomial and x_i is an offset, both defined in + v_erfc_data.c. i is chosen based on which interval x falls in. 
*/ + sv_u64_t i = lookup_interval_idx (in_bounds, abs_x); + sv_f64_t x_i = sv_lookup_f64_x (in_bounds, __v_erfc_data.interval_bounds, i); + sv_f64_t p = sv_eval_poly (in_bounds, svsub_f64_x (pg, abs_x, x_i), i); + /* 'copy' sign of x to p, i.e. negate p if x is negative. */ + sv_u64_t sign = svbic_n_u64_z (in_bounds, ix, 0x7fffffffffffffff); + p = sv_as_f64_u64 (sveor_u64_z (in_bounds, sv_as_u64_f64 (p), sign)); + + sv_f64_t e = sv_eval_gauss (in_bounds, abs_x); + + /* Assemble result: 2-p*e if x<0, p*e otherwise. No need to conditionally + select boring_zone because P[V_ERFC_NINTS-1]=0. */ + sv_f64_t y = sv_fma_f64_x (pg, p, e, boring_zone); + + if (unlikely (svptest_any (pg, special_cases))) + { + return specialcase (x, y, special_cases); + } + return y; +} + +PL_ALIAS (__sv_erfc_x, _ZGVsMxv_erfc) + +PL_SIG (SV, D, 1, erfc, -4.0, 10.0) +PL_TEST_ULP (__sv_erfc, 3.15) +PL_TEST_INTERVAL (__sv_erfc, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_erfc, 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (__sv_erfc, -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (__sv_erfc, 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (__sv_erfc, -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (__sv_erfc, 0, inf, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_erff_1u3.c b/contrib/arm-optimized-routines/pl/math/sv_erff_1u3.c new file mode 100644 index 000000000000..c7a738c55f7b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_erff_1u3.c @@ -0,0 +1,104 @@ +/* + * Single-precision vector erf(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define AbsMask (0x7fffffff) + +static NOINLINE sv_f32_t +__sv_erff_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (erff, x, y, cmp); +} + +sv_f32_t __sv_expf_x (svbool_t, sv_f32_t); + +/* Optimized single precision vector erf. 
Worst-case error is 1.25 ULP: + __sv_erff(0x1.dc59fap-1) got 0x1.9f9c88p-1 + want 0x1.9f9c8ap-1. */ +sv_f32_t +__sv_erff_x (sv_f32_t x, const svbool_t pg) +{ + sv_u32_t ix = sv_as_u32_f32 (x); + sv_u32_t atop = svand_n_u32_x (pg, svlsr_n_u32_x (pg, ix, 16), 0x7fff); + /* Handle both inf/nan as well as small values (|x|<2^-28). */ + svbool_t cmp + = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, atop, 0x3180), 0x7ff0 - 0x3180); + + sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask); + /* |x| < 0.921875. */ + svbool_t red = svaclt_n_f32 (pg, x, 0.921875f); + /* |x| > 4.0. */ + svbool_t bor = svacgt_n_f32 (pg, x, 4.0f); + + /* Load polynomial coefficients. */ + sv_u32_t idx_lo = svsel (red, sv_u32 (0), sv_u32 (1)); + sv_u32_t idx_hi = svadd_n_u32_x (pg, idx_lo, 2); + + const float *base = (float *) __v_erff_data.coeffs; + sv_f32_t c_2_5 = svld1rq (svptrue_b32 (), base + 2); + sv_f32_t c_6_9 = svld1rq (svptrue_b32 (), base + 6); + sv_f32_t c_10_13 = svld1rq (svptrue_b32 (), base + 10); + + /* Do not need to store elem 0 of __v_erff_data as it is not used. */ + sv_f32_t p1 = svtbl (c_2_5, idx_lo); + sv_f32_t p2 = svtbl (c_2_5, idx_hi); + sv_f32_t p3 = svtbl (c_6_9, idx_lo); + sv_f32_t p4 = svtbl (c_6_9, idx_hi); + sv_f32_t p5 = svtbl (c_10_13, idx_lo); + sv_f32_t p6 = svtbl (c_10_13, idx_hi); + + sv_f32_t a = svabs_f32_x (pg, x); + /* Square with merging mul - z is x^2 for reduced, |x| otherwise. */ + sv_f32_t z = svmul_f32_m (red, a, a); + + /* Evaluate polynomial on |x| or x^2. */ + sv_f32_t r = sv_fma_f32_x (pg, z, p6, p5); + r = sv_fma_f32_x (pg, z, r, p4); + r = sv_fma_f32_x (pg, z, r, p3); + r = sv_fma_f32_x (pg, z, r, p2); + r = sv_fma_f32_x (pg, z, r, p1); + /* Use merging svmad for last operation - apply first coefficient if not + reduced, otherwise r is propagated unchanged. This is because the reduced + polynomial has lower order than the non-reduced. 
*/ + r = svmad_n_f32_m (svnot_b_z (pg, red), r, z, base[1]); + r = sv_fma_f32_x (pg, a, r, a); + + /* y = |x| + |x| * P(x^2) if |x| < 0.921875 + y = 1 - exp (-(|x| + |x| * P(|x|))) otherwise. */ + sv_f32_t y = __sv_expf_x (pg, svneg_f32_x (pg, r)); + y = svsel_f32 (red, r, svsubr_n_f32_x (pg, y, 1.0)); + + /* Boring domain (absolute value is required to get the sign of erf(-nan) + right). */ + y = svsel_f32 (bor, sv_f32 (1.0f), svabs_f32_x (pg, y)); + + /* y = erf(x) if x>0, -erf(-x) otherwise. */ + y = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign)); + + if (unlikely (svptest_any (pg, cmp))) + return __sv_erff_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_erff_x, _ZGVsMxv_erff) + +PL_SIG (SV, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (__sv_erff, 0.76) +PL_TEST_INTERVAL (__sv_erff, 0, 0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erff, 0x1p-28, 1, 60000) +PL_TEST_INTERVAL (__sv_erff, 1, 0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erff, 0x1p28, inf, 20000) +PL_TEST_INTERVAL (__sv_erff, -0, -0x1p-28, 20000) +PL_TEST_INTERVAL (__sv_erff, -0x1p-28, -1, 60000) +PL_TEST_INTERVAL (__sv_erff, -1, -0x1p28, 60000) +PL_TEST_INTERVAL (__sv_erff, -0x1p28, -inf, 20000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_exp_tail.h b/contrib/arm-optimized-routines/pl/math/sv_exp_tail.h new file mode 100644 index 000000000000..9b739da9d82a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_exp_tail.h @@ -0,0 +1,79 @@ +/* + * Double-precision SVE e^(x+tail) function. + * + * Copyright (c) 2021-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef SV_EXP_TAIL_H +#define SV_EXP_TAIL_H + +#include "sv_math.h" +#if SV_SUPPORTED + +#include "v_exp_tail.h" + +#define C1 sv_f64 (C1_scal) +#define C2 sv_f64 (C2_scal) +#define C3 sv_f64 (C3_scal) +#define MinusLn2hi (-Ln2hi_scal) +#define MinusLn2lo (-Ln2lo_scal) + +#define N (1 << V_EXP_TAIL_TABLE_BITS) +#define Tab __v_exp_tail_data +#define IndexMask (N - 1) +#define Shift sv_f64 (0x1.8p+52) +#define Thres 704.0 + +static inline sv_f64_t +sv_exp_tail_special_case (svbool_t pg, sv_f64_t s, sv_f64_t y, sv_f64_t n) +{ + sv_f64_t absn = svabs_f64_x (pg, n); + + /* 2^(n/N) may overflow, break it up into s1*s2. */ + sv_u64_t b = svsel_u64 (svcmple_n_f64 (pg, n, 0), sv_u64 (0x6000000000000000), + sv_u64 (0)); + sv_f64_t s1 = sv_as_f64_u64 (svsubr_n_u64_x (pg, b, 0x7000000000000000)); + sv_f64_t s2 = sv_as_f64_u64 ( + svadd_u64_x (pg, svsub_n_u64_x (pg, sv_as_u64_f64 (s), 0x3010000000000000), + b)); + + svbool_t cmp = svcmpgt_n_f64 (pg, absn, 1280.0 * N); + sv_f64_t r1 = svmul_f64_x (pg, s1, s1); + sv_f64_t r0 = svmul_f64_x (pg, sv_fma_f64_x (pg, y, s2, s2), s1); + return svsel_f64 (cmp, r1, r0); +} + +static inline sv_f64_t +sv_exp_tail (const svbool_t pg, sv_f64_t x, sv_f64_t xtail) +{ + /* Calculate exp(x + xtail). */ + sv_f64_t z = sv_fma_n_f64_x (pg, InvLn2_scal, x, Shift); + sv_f64_t n = svsub_f64_x (pg, z, Shift); + + sv_f64_t r = sv_fma_n_f64_x (pg, MinusLn2hi, n, x); + r = sv_fma_n_f64_x (pg, MinusLn2lo, n, r); + + sv_u64_t u = sv_as_u64_f64 (z); + sv_u64_t e = svlsl_n_u64_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); + sv_u64_t i = svand_n_u64_x (pg, u, IndexMask); + + sv_f64_t y = sv_fma_f64_x (pg, C3, r, C2); + y = sv_fma_f64_x (pg, y, r, C1); + y = sv_fma_f64_x (pg, y, r, sv_f64 (1.0)); + y = sv_fma_f64_x (pg, y, r, xtail); + + /* s = 2^(n/N). 
*/ + u = sv_lookup_u64_x (pg, Tab, i); + sv_f64_t s = sv_as_f64_u64 (svadd_u64_x (pg, u, e)); + + svbool_t cmp = svcmpgt_n_f64 (pg, svabs_f64_x (pg, x), Thres); + if (unlikely (svptest_any (pg, cmp))) + { + return sv_exp_tail_special_case (pg, s, y, n); + } + return sv_fma_f64_x (pg, y, s, s); +} + +#endif +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c b/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c new file mode 100644 index 000000000000..87fbe45df5fd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_expf_2u.c @@ -0,0 +1,156 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define C(i) __sv_expf_poly[i] + +#define InvLn2 (0x1.715476p+0f) +#define Ln2hi (0x1.62e4p-1f) +#define Ln2lo (0x1.7f7d1cp-20f) + +#if SV_EXPF_USE_FEXPA + +#define Shift (0x1.903f8p17f) /* 1.5*2^17 + 127. */ +#define Thres \ + (0x1.5d5e2ap+6f) /* Roughly 87.3. For x < -Thres, the result is subnormal \ + and not handled correctly by FEXPA. */ + +static NOINLINE sv_f32_t +special_case (sv_f32_t x, sv_f32_t y, svbool_t special) +{ + /* The special-case handler from the Neon routine does not handle subnormals + in a way that is compatible with FEXPA. For the FEXPA variant we just fall + back to scalar expf. */ + return sv_call_f32 (expf, x, y, special); +} + +#else + +#define Shift (0x1.8p23f) /* 1.5 * 2^23. */ +#define Thres (126.0f) + +/* Special-case handler adapted from Neon variant. Uses s, y and n to produce + the final result (normal cases included). It performs an update of all lanes! + Therefore: + - all previous computation need to be done on all lanes indicated by input + pg + - we cannot simply apply the special case to the special-case-activated + lanes. Besides it is likely that this would not increase performance (no + scatter/gather). 
*/ +static inline sv_f32_t +specialcase (svbool_t pg, sv_f32_t poly, sv_f32_t n, sv_u32_t e, + svbool_t p_cmp1, sv_f32_t scale) +{ + /* s=2^(n/N) may overflow, break it up into s=s1*s2, + such that exp = s + s*y can be computed as s1*(s2+s2*y) + and s1*s1 overflows only if n>0. */ + + /* If n<=0 then set b to 0x820...0, 0 otherwise. */ + svbool_t p_sign = svcmple_n_f32 (pg, n, 0.0f); /* n <= 0. */ + sv_u32_t b + = svdup_n_u32_z (p_sign, 0x82000000); /* Inactive lanes set to 0. */ + + /* Set s1 to generate overflow depending on sign of exponent n. */ + sv_f32_t s1 + = sv_as_f32_u32 (svadd_n_u32_x (pg, b, 0x7f000000)); /* b + 0x7f000000. */ + /* Offset s to avoid overflow in final result if n is below threshold. */ + sv_f32_t s2 = sv_as_f32_u32 ( + svsub_u32_x (pg, e, b)); /* as_u32 (s) - 0x3010...0 + b. */ + + /* |n| > 192 => 2^(n/N) overflows. */ + svbool_t p_cmp2 = svacgt_n_f32 (pg, n, 192.0f); + + sv_f32_t r2 = svmul_f32_x (pg, s1, s1); + sv_f32_t r1 = sv_fma_f32_x (pg, poly, s2, s2); + r1 = svmul_f32_x (pg, r1, s1); + sv_f32_t r0 = sv_fma_f32_x (pg, poly, scale, scale); + + /* Apply condition 1 then 2. + Returns r2 if cond2 is true, otherwise + if cond1 is true then return r1, otherwise return r0. */ + sv_f32_t r = svsel_f32 (p_cmp1, r1, r0); + + return svsel_f32 (p_cmp2, r2, r); +} + +#endif + +/* Optimised single-precision SVE exp function. By default this is an SVE port + of the Neon algorithm from math/. Alternatively, enable a modification of + that algorithm that looks up scale using SVE FEXPA instruction with + SV_EXPF_USE_FEXPA. + + Worst-case error of the default algorithm is 1.95 ulp: + __sv_expf(-0x1.4cb74ap+2) got 0x1.6a022cp-8 + want 0x1.6a023p-8. + + Worst-case error when using FEXPA is 1.04 ulp: + __sv_expf(0x1.a8eda4p+1) got 0x1.ba74bcp+4 + want 0x1.ba74bap+4. */ +sv_f32_t +__sv_expf_x (sv_f32_t x, const svbool_t pg) +{ + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ + + /* n = round(x/(ln2/N)). */ + sv_f32_t z = sv_fma_n_f32_x (pg, InvLn2, x, sv_f32 (Shift)); + sv_f32_t n = svsub_n_f32_x (pg, z, Shift); + + /* r = x - n*ln2/N. */ + sv_f32_t r = sv_fma_n_f32_x (pg, -Ln2hi, n, x); + r = sv_fma_n_f32_x (pg, -Ln2lo, n, r); + +/* scale = 2^(n/N). */ +#if SV_EXPF_USE_FEXPA + /* NaNs also need special handling with FEXPA. */ + svbool_t is_special_case + = svorr_b_z (pg, svacgt_n_f32 (pg, x, Thres), svcmpne_f32 (pg, x, x)); + sv_f32_t scale = svexpa_f32 (sv_as_u32_f32 (z)); +#else + sv_u32_t e = svlsl_n_u32_x (pg, sv_as_u32_f32 (z), 23); + svbool_t is_special_case = svacgt_n_f32 (pg, n, Thres); + sv_f32_t scale = sv_as_f32_u32 (svadd_n_u32_x (pg, e, 0x3f800000)); +#endif + + /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ + sv_f32_t r2 = svmul_f32_x (pg, r, r); + sv_f32_t p = sv_fma_n_f32_x (pg, C (0), r, sv_f32 (C (1))); + sv_f32_t q = sv_fma_n_f32_x (pg, C (2), r, sv_f32 (C (3))); + q = sv_fma_f32_x (pg, p, r2, q); + p = svmul_n_f32_x (pg, r, C (4)); + sv_f32_t poly = sv_fma_f32_x (pg, q, r2, p); + + if (unlikely (svptest_any (pg, is_special_case))) +#if SV_EXPF_USE_FEXPA + return special_case (x, sv_fma_f32_x (pg, poly, scale, scale), + is_special_case); +#else + return specialcase (pg, poly, n, e, is_special_case, scale); +#endif + + return sv_fma_f32_x (pg, poly, scale, scale); +} + +PL_ALIAS (__sv_expf_x, _ZGVsMxv_expf) + +PL_SIG (SV, F, 1, exp, -9.9, 9.9) +PL_TEST_ULP (__sv_expf, 1.46) +PL_TEST_INTERVAL (__sv_expf, 0, 0x1p-23, 40000) +PL_TEST_INTERVAL (__sv_expf, 0x1p-23, 1, 50000) +PL_TEST_INTERVAL (__sv_expf, 1, 0x1p23, 50000) +PL_TEST_INTERVAL (__sv_expf, 0x1p23, inf, 50000) +PL_TEST_INTERVAL (__sv_expf, -0, -0x1p-23, 40000) +PL_TEST_INTERVAL (__sv_expf, -0x1p-23, -1, 50000) +PL_TEST_INTERVAL (__sv_expf, -1, -0x1p23, 50000) +PL_TEST_INTERVAL (__sv_expf, -0x1p23, -inf, 50000) +#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_expf_data.c 
b/contrib/arm-optimized-routines/pl/math/sv_expf_data.c new file mode 100644 index 000000000000..6875adf857b6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_expf_data.c @@ -0,0 +1,12 @@ +/* + * Coefficients for single-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Coefficients copied from the polynomial in math/v_expf.c. */ +const float __sv_expf_poly[] = {0x1.0e4020p-7f, 0x1.573e2ep-5f, 0x1.555e66p-3f, + 0x1.fffdb6p-2f, 0x1.ffffecp-1f}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c new file mode 100644 index 000000000000..884e2011d2f8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log10_2u5.c @@ -0,0 +1,89 @@ +/* + * Double-precision SVE log10(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define OFF 0x3fe6900900000000 +#define N (1 << V_LOG10_TABLE_BITS) + +#define A(i) __v_log10_data.poly[i] + +static inline sv_f64_t +specialcase (sv_f64_t x, sv_f64_t y, svbool_t special) +{ + return sv_call_f64 (log10, x, y, special); +} + +/* SVE log10 algorithm. Maximum measured error is 2.46 ulps. + __sv_log10(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 + want 0x1.fffbdf6eaa667p-6. */ +sv_f64_t +__sv_log10_x (sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); + + svbool_t is_special_case + = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x07ff0 - 0x0010); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. 
*/ + sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); + sv_u64_t i + = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG10_TABLE_BITS), N); + sv_f64_t k + = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52)); + sv_f64_t z = sv_as_f64_u64 ( + svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52))); + + /* log(x) = k*log(2) + log(c) + log(z/c). */ + + sv_u64_t idx = svmul_n_u64_x (pg, i, 2); + sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].invc, idx); + sv_f64_t logc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].log10c, idx); + + /* We approximate log(z/c) with a polynomial P(x) ~= log(x + 1): + r = z/c - 1 (we look up precomputed 1/c) + log(z/c) ~= P(r). */ + sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); + + /* hi = log(c) + k*log(2). */ + sv_f64_t w = sv_fma_n_f64_x (pg, __v_log10_data.invln10, r, logc); + sv_f64_t hi = sv_fma_n_f64_x (pg, __v_log10_data.log10_2, k, w); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + sv_f64_t r2 = svmul_f64_x (pg, r, r); + sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2))); + sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0))); + y = sv_fma_n_f64_x (pg, A (4), r2, y); + y = sv_fma_f64_x (pg, y, r2, p); + y = sv_fma_f64_x (pg, y, r2, hi); + + if (unlikely (svptest_any (pg, is_special_case))) + { + return specialcase (x, y, is_special_case); + } + return y; +} + +PL_ALIAS (__sv_log10_x, _ZGVsMxv_log10) + +PL_SIG (SV, D, 1, log10, 0.01, 11.1) +PL_TEST_ULP (__sv_log10, 1.97) +PL_TEST_INTERVAL (__sv_log10, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_log10, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log10, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log10, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log10, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log10, 100, inf, 50000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c new file mode 100644 index 
000000000000..e7b1e9801fa9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log10f_3u5.c @@ -0,0 +1,88 @@ +/* + * Single-precision SVE log10 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define SpecialCaseMin 0x00800000 +#define SpecialCaseMax 0x7f800000 +#define Offset 0x3f2aaaab /* 0.666667. */ +#define Mask 0x007fffff +#define Ln2 0x1.62e43p-1f /* 0x3f317218. */ +#define InvLn10 0x1.bcb7b2p-2f + +#define P(i) __v_log10f_poly[i] + +static NOINLINE sv_f32_t +special_case (sv_f32_t x, sv_f32_t y, svbool_t special) +{ + return sv_call_f32 (log10f, x, y, special); +} + +/* Optimised implementation of SVE log10f using the same algorithm and + polynomial as v_log10f. Maximum error is 3.31ulps: + __sv_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4. */ +sv_f32_t +__sv_log10f_x (sv_f32_t x, const svbool_t pg) +{ + sv_u32_t ix = sv_as_u32_f32 (x); + svbool_t special_cases + = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, ix, SpecialCaseMin), + SpecialCaseMax - SpecialCaseMin); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + ix = svsub_n_u32_x (pg, ix, Offset); + sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (ix), + 23)); /* signextend. */ + ix = svand_n_u32_x (pg, ix, Mask); + ix = svadd_n_u32_x (pg, ix, Offset); + sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (ix), 1.0f); + + /* y = log10(1+r) + n*log10(2) + log10(1+r) ~ r * InvLn(10) + P(r) + where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for + log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3) + + P(r) = r2 * (Q01 + r2 * (Q23 + r2 * (Q45 + r2 * Q67))) + and Qij = Pi + r * Pj. 
*/ + sv_f32_t q12 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0))); + sv_f32_t q34 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2))); + sv_f32_t q56 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4))); + sv_f32_t q78 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6))); + + sv_f32_t r2 = svmul_f32_x (pg, r, r); + sv_f32_t y = sv_fma_f32_x (pg, q78, r2, q56); + y = sv_fma_f32_x (pg, y, r2, q34); + y = sv_fma_f32_x (pg, y, r2, q12); + + /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster but less + accurate. */ + sv_f32_t p = sv_fma_n_f32_x (pg, Ln2, n, r); + y = sv_fma_f32_x (pg, y, r2, svmul_n_f32_x (pg, p, InvLn10)); + + if (unlikely (svptest_any (pg, special_cases))) + { + return special_case (x, y, special_cases); + } + return y; +} + +PL_ALIAS (__sv_log10f_x, _ZGVsMxv_log10f) + +PL_SIG (SV, F, 1, log10, 0.01, 11.1) +PL_TEST_ULP (__sv_log10f, 2.82) +PL_TEST_INTERVAL (__sv_log10f, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_log10f, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log10f, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log10f, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log10f, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log10f, 100, inf, 50000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c b/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c new file mode 100644 index 000000000000..a0815bb5646f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log2_3u.c @@ -0,0 +1,85 @@ +/* + * Double-precision SVE log2 function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define InvLn2 sv_f64 (0x1.71547652b82fep0) +#define N (1 << V_LOG2_TABLE_BITS) +#define OFF 0x3fe6900900000000 +#define P(i) sv_f64 (__v_log2_data.poly[i]) + +NOINLINE static sv_f64_t +specialcase (sv_f64_t x, sv_f64_t y, const svbool_t cmp) +{ + return sv_call_f64 (log2, x, y, cmp); +} + +/* Double-precision SVE log2 routine. Implements the same algorithm as vector + log10, with coefficients and table entries scaled in extended precision. + The maximum observed error is 2.58 ULP: + __v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ +sv_f64_t +__sv_log2_x (sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); + + svbool_t special + = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x7ff0 - 0x0010); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); + sv_u64_t i + = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG2_TABLE_BITS), N); + sv_f64_t k + = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52)); + sv_f64_t z = sv_as_f64_u64 ( + svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52))); + + sv_u64_t idx = svmul_n_u64_x (pg, i, 2); + sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].invc, idx); + sv_f64_t log2c = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].log2c, idx); + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
*/ + + sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); + sv_f64_t w = sv_fma_f64_x (pg, r, InvLn2, log2c); + + sv_f64_t r2 = svmul_f64_x (pg, r, r); + sv_f64_t p_23 = sv_fma_f64_x (pg, P (3), r, P (2)); + sv_f64_t p_01 = sv_fma_f64_x (pg, P (1), r, P (0)); + sv_f64_t y = sv_fma_f64_x (pg, P (4), r2, p_23); + y = sv_fma_f64_x (pg, y, r2, p_01); + y = sv_fma_f64_x (pg, y, r2, svadd_f64_x (pg, k, w)); + + if (unlikely (svptest_any (pg, special))) + { + return specialcase (x, y, special); + } + return y; +} + +PL_ALIAS (__sv_log2_x, _ZGVsMxv_log2) + +PL_SIG (SV, D, 1, log2, 0.01, 11.1) +PL_TEST_ULP (__sv_log2, 2.09) +PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2) +PL_TEST_INTERVAL (__sv_log2, -0.0, -0x1p126, 1000) +PL_TEST_INTERVAL (__sv_log2, 0.0, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log2, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log2, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log2, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log2, 100, inf, 50000) + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c new file mode 100644 index 000000000000..fe2ab16b90b7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log2f_2u5.c @@ -0,0 +1,79 @@ +/* + * Single-precision vector/SVE log2 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define P(i) __v_log2f_data.poly[i] + +#define Ln2 (0x1.62e43p-1f) /* 0x3f317218. */ +#define Min (0x00800000) +#define Max (0x7f800000) +#define Mask (0x007fffff) +#define Off (0x3f2aaaab) /* 0.666667. */ + +static NOINLINE sv_f32_t +specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (log2f, x, y, cmp); +} + +/* Optimised implementation of SVE log2f, using the same algorithm + and polynomial as Neon log2f. 
Maximum error is 2.48 ULPs: + __sv_log2f(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. */ +sv_f32_t +__sv_log2f_x (sv_f32_t x, const svbool_t pg) +{ + sv_u32_t u = sv_as_u32_f32 (x); + svbool_t special + = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min)); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = svsub_n_u32_x (pg, u, Off); + sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u), + 23)); /* Sign-extend. */ + u = svand_n_u32_x (pg, u, Mask); + u = svadd_n_u32_x (pg, u, Off); + sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f); + + /* y = log2(1+r) + n. */ + sv_f32_t r2 = svmul_f32_x (pg, r, r); + + /* Evaluate polynomial using pairwise Horner scheme. */ + sv_f32_t p67 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6))); + sv_f32_t p45 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4))); + sv_f32_t p23 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2))); + sv_f32_t p01 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0))); + sv_f32_t y; + y = sv_fma_n_f32_x (pg, P (8), r2, p67); + y = sv_fma_f32_x (pg, y, r2, p45); + y = sv_fma_f32_x (pg, y, r2, p23); + y = sv_fma_f32_x (pg, y, r2, p01); + y = sv_fma_f32_x (pg, y, r, n); + + if (unlikely (svptest_any (pg, special))) + return specialcase (x, y, special); + return y; +} + +PL_ALIAS (__sv_log2f_x, _ZGVsMxv_log2f) + +PL_SIG (SV, F, 1, log2, 0.01, 11.1) +PL_TEST_ULP (__sv_log2f, 1.99) +PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2f) +PL_TEST_INTERVAL (__sv_log2f, -0.0, -0x1p126, 4000) +PL_TEST_INTERVAL (__sv_log2f, 0.0, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log2f, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log2f, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log2f, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log2f, 100, inf, 50000) + +#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c b/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c new file mode 100644 index 000000000000..7f06fd31ebf1 --- /dev/null +++ 
b/contrib/arm-optimized-routines/pl/math/sv_log_2u5.c @@ -0,0 +1,85 @@ +/* + * Double-precision SVE log(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define A(i) __sv_log_data.poly[i] +#define Ln2 (0x1.62e42fefa39efp-1) +#define N (1 << SV_LOG_TABLE_BITS) +#define OFF (0x3fe6900900000000) + +double +optr_aor_log_f64 (double); + +static NOINLINE sv_f64_t +__sv_log_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (optr_aor_log_f64, x, y, cmp); +} + +/* SVE port of Neon log algorithm from math/. + Maximum measured error is 2.17 ulp: + __sv_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 + want 0x1.ffffff1cca045p-2. */ +sv_f64_t +__sv_log_x (sv_f64_t x, const svbool_t pg) +{ + sv_u64_t ix = sv_as_u64_f64 (x); + sv_u64_t top = svlsr_n_u64_x (pg, ix, 48); + svbool_t cmp = svcmpge_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), + sv_u64 (0x7ff0 - 0x0010)); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF); + /* Equivalent to (tmp >> (52 - SV_LOG_TABLE_BITS)) % N, since N is a power + of 2. */ + sv_u64_t i + = svand_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, (52 - SV_LOG_TABLE_BITS)), + N - 1); + sv_s64_t k + = svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52); /* Arithmetic shift. */ + sv_u64_t iz = svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52)); + sv_f64_t z = sv_as_f64_u64 (iz); + /* Lookup in 2 global lists (length N). */ + sv_f64_t invc = sv_lookup_f64_x (pg, __sv_log_data.invc, i); + sv_f64_t logc = sv_lookup_f64_x (pg, __sv_log_data.logc, i); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. 
*/ + sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0)); + sv_f64_t kd = sv_to_f64_s64_x (pg, k); + /* hi = r + log(c) + k*Ln2. */ + sv_f64_t hi = sv_fma_n_f64_x (pg, Ln2, kd, svadd_f64_x (pg, logc, r)); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ + sv_f64_t r2 = svmul_f64_x (pg, r, r); + sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2))); + sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0))); + y = sv_fma_n_f64_x (pg, A (4), r2, y); + y = sv_fma_f64_x (pg, y, r2, p); + y = sv_fma_f64_x (pg, y, r2, hi); + + if (unlikely (svptest_any (pg, cmp))) + return __sv_log_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_log_x, _ZGVsMxv_log) + +PL_SIG (SV, D, 1, log, 0.01, 11.1) +PL_TEST_ULP (__sv_log, 1.68) +PL_TEST_INTERVAL (__sv_log, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_log, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_log, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_log, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_log, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_log, 100, inf, 50000) +#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_log_data.c b/contrib/arm-optimized-routines/pl/math/sv_log_data.c new file mode 100644 index 000000000000..77f9989444f5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_log_data.c @@ -0,0 +1,146 @@ +/* + * Coefficients for double-precision SVE log(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct sv_log_data __sv_log_data = { + /* All coefficients and table entries are copied from the Neon routine in + math/. See math/v_log_data.c for an explanation of the algorithm. 
*/ + + .invc = {0x1.6a133d0dec120p+0, 0x1.6815f2f3e42edp+0, + 0x1.661e39be1ac9ep+0, 0x1.642bfa30ac371p+0, + 0x1.623f1d916f323p+0, 0x1.60578da220f65p+0, + 0x1.5e75349dea571p+0, 0x1.5c97fd387a75ap+0, + 0x1.5abfd2981f200p+0, 0x1.58eca051dc99cp+0, + 0x1.571e526d9df12p+0, 0x1.5554d555b3fcbp+0, + 0x1.539015e2a20cdp+0, 0x1.51d0014ee0164p+0, + 0x1.50148538cd9eep+0, 0x1.4e5d8f9f698a1p+0, + 0x1.4cab0edca66bep+0, 0x1.4afcf1a9db874p+0, + 0x1.495327136e16fp+0, 0x1.47ad9e84af28fp+0, + 0x1.460c47b39ae15p+0, 0x1.446f12b278001p+0, + 0x1.42d5efdd720ecp+0, 0x1.4140cfe001a0fp+0, + 0x1.3fafa3b421f69p+0, 0x1.3e225c9c8ece5p+0, + 0x1.3c98ec29a211ap+0, 0x1.3b13442a413fep+0, + 0x1.399156baa3c54p+0, 0x1.38131639b4cdbp+0, + 0x1.36987540fbf53p+0, 0x1.352166b648f61p+0, + 0x1.33adddb3eb575p+0, 0x1.323dcd99fc1d3p+0, + 0x1.30d129fefc7d2p+0, 0x1.2f67e6b72fe7dp+0, + 0x1.2e01f7cf8b187p+0, 0x1.2c9f518ddc86ep+0, + 0x1.2b3fe86e5f413p+0, 0x1.29e3b1211b25cp+0, + 0x1.288aa08b373cfp+0, 0x1.2734abcaa8467p+0, + 0x1.25e1c82459b81p+0, 0x1.2491eb1ad59c5p+0, + 0x1.23450a54048b5p+0, 0x1.21fb1bb09e578p+0, + 0x1.20b415346d8f7p+0, 0x1.1f6fed179a1acp+0, + 0x1.1e2e99b93c7b3p+0, 0x1.1cf011a7a882ap+0, + 0x1.1bb44b97dba5ap+0, 0x1.1a7b3e66cdd4fp+0, + 0x1.1944e11dc56cdp+0, 0x1.18112aebb1a6ep+0, + 0x1.16e013231b7e9p+0, 0x1.15b1913f156cfp+0, + 0x1.14859cdedde13p+0, 0x1.135c2dc68cfa4p+0, + 0x1.12353bdb01684p+0, 0x1.1110bf25b85b4p+0, + 0x1.0feeafd2f8577p+0, 0x1.0ecf062c51c3bp+0, + 0x1.0db1baa076c8bp+0, 0x1.0c96c5bb3048ep+0, + 0x1.0b7e20263e070p+0, 0x1.0a67c2acd0ce3p+0, + 0x1.0953a6391e982p+0, 0x1.0841c3caea380p+0, + 0x1.07321489b13eap+0, 0x1.062491aee9904p+0, + 0x1.05193497a7cc5p+0, 0x1.040ff6b5f5e9fp+0, + 0x1.0308d19aa6127p+0, 0x1.0203beedb0c67p+0, + 0x1.010037d38bcc2p+0, 1.0, + 0x1.fc06d493cca10p-1, 0x1.f81e6ac3b918fp-1, + 0x1.f44546ef18996p-1, 0x1.f07b10382c84bp-1, + 0x1.ecbf7070e59d4p-1, 0x1.e91213f715939p-1, + 0x1.e572a9a75f7b7p-1, 0x1.e1e0e2c530207p-1, + 0x1.de5c72d8a8be3p-1, 0x1.dae50fa5658ccp-1, + 0x1.d77a71145a2dap-1, 
0x1.d41c51166623ep-1, + 0x1.d0ca6ba0bb29fp-1, 0x1.cd847e8e59681p-1, + 0x1.ca4a499693e00p-1, 0x1.c71b8e399e821p-1, + 0x1.c3f80faf19077p-1, 0x1.c0df92dc2b0ecp-1, + 0x1.bdd1de3cbb542p-1, 0x1.baceb9e1007a3p-1, + 0x1.b7d5ef543e55ep-1, 0x1.b4e749977d953p-1, + 0x1.b20295155478ep-1, 0x1.af279f8e82be2p-1, + 0x1.ac5638197fdf3p-1, 0x1.a98e2f102e087p-1, + 0x1.a6cf5606d05c1p-1, 0x1.a4197fc04d746p-1, + 0x1.a16c80293dc01p-1, 0x1.9ec82c4dc5bc9p-1, + 0x1.9c2c5a491f534p-1, 0x1.9998e1480b618p-1, + 0x1.970d9977c6c2dp-1, 0x1.948a5c023d212p-1, + 0x1.920f0303d6809p-1, 0x1.8f9b698a98b45p-1, + 0x1.8d2f6b81726f6p-1, 0x1.8acae5bb55badp-1, + 0x1.886db5d9275b8p-1, 0x1.8617ba567c13cp-1, + 0x1.83c8d27487800p-1, 0x1.8180de3c5dbe7p-1, + 0x1.7f3fbe71cdb71p-1, 0x1.7d055498071c1p-1, + 0x1.7ad182e54f65ap-1, 0x1.78a42c3c90125p-1, + 0x1.767d342f76944p-1, 0x1.745c7ef26b00ap-1, + 0x1.7241f15769d0fp-1, 0x1.702d70d396e41p-1, + 0x1.6e1ee3700cd11p-1, 0x1.6c162fc9cbe02p-1}, + + .logc = {-0x1.62fe995eb963ap-2, -0x1.5d5a48dad6b67p-2, + -0x1.57bde257d2769p-2, -0x1.52294fbf2af55p-2, + -0x1.4c9c7b598aa38p-2, -0x1.47174fc5ff560p-2, + -0x1.4199b7fa7b5cap-2, -0x1.3c239f48cfb99p-2, + -0x1.36b4f154d2aebp-2, -0x1.314d9a0ff32fbp-2, + -0x1.2bed85cca3cffp-2, -0x1.2694a11421af9p-2, + -0x1.2142d8d014fb2p-2, -0x1.1bf81a2c77776p-2, + -0x1.16b452a39c6a4p-2, -0x1.11776ffa6c67ep-2, + -0x1.0c416035020e0p-2, -0x1.071211aa10fdap-2, + -0x1.01e972e293b1bp-2, -0x1.f98ee587fd434p-3, + -0x1.ef5800ad716fbp-3, -0x1.e52e160484698p-3, + -0x1.db1104b19352ep-3, -0x1.d100ac59e0bd6p-3, + -0x1.c6fced287c3bdp-3, -0x1.bd05a7b317c29p-3, + -0x1.b31abd229164fp-3, -0x1.a93c0edadb0a3p-3, + -0x1.9f697ee30d7ddp-3, -0x1.95a2efa9aa40ap-3, + -0x1.8be843d796044p-3, -0x1.82395ecc477edp-3, + -0x1.7896240966422p-3, -0x1.6efe77aca8c55p-3, + -0x1.65723e117ec5cp-3, -0x1.5bf15c0955706p-3, + -0x1.527bb6c111da1p-3, -0x1.491133c939f8fp-3, + -0x1.3fb1b90c7fc58p-3, -0x1.365d2cc485f8dp-3, + -0x1.2d13758970de7p-3, -0x1.23d47a721fd47p-3, + -0x1.1aa0229f25ec2p-3, 
-0x1.117655ddebc3bp-3, + -0x1.0856fbf83ab6bp-3, -0x1.fe83fabbaa106p-4, + -0x1.ec6e8507a56cdp-4, -0x1.da6d68c7cc2eap-4, + -0x1.c88078462be0cp-4, -0x1.b6a786a423565p-4, + -0x1.a4e2676ac7f85p-4, -0x1.9330eea777e76p-4, + -0x1.8192f134d5ad9p-4, -0x1.70084464f0538p-4, + -0x1.5e90bdec5cb1fp-4, -0x1.4d2c3433c5536p-4, + -0x1.3bda7e219879ap-4, -0x1.2a9b732d27194p-4, + -0x1.196eeb2b10807p-4, -0x1.0854be8ef8a7ep-4, + -0x1.ee998cb277432p-5, -0x1.ccadb79919fb9p-5, + -0x1.aae5b1d8618b0p-5, -0x1.89413015d7442p-5, + -0x1.67bfe7bf158dep-5, -0x1.46618f83941bep-5, + -0x1.2525df1b0618ap-5, -0x1.040c8e2f77c6ap-5, + -0x1.c62aad39f738ap-6, -0x1.847fe3bdead9cp-6, + -0x1.43183683400acp-6, -0x1.01f31c4e1d544p-6, + -0x1.82201d1e6b69ap-7, -0x1.00dd0f3e1bfd6p-7, + -0x1.ff6fe1feb4e53p-9, 0.0, + 0x1.fe91885ec8e20p-8, 0x1.fc516f716296dp-7, + 0x1.7bb4dd70a015bp-6, 0x1.f84c99b34b674p-6, + 0x1.39f9ce4fb2d71p-5, 0x1.7756c0fd22e78p-5, + 0x1.b43ee82db8f3ap-5, 0x1.f0b3fced60034p-5, + 0x1.165bd78d4878ep-4, 0x1.3425d2715ebe6p-4, + 0x1.51b8bd91b7915p-4, 0x1.6f15632c76a47p-4, + 0x1.8c3c88ecbe503p-4, 0x1.a92ef077625dap-4, + 0x1.c5ed5745fa006p-4, 0x1.e27876de1c993p-4, + 0x1.fed104fce4cdcp-4, 0x1.0d7bd9c17d78bp-3, + 0x1.1b76986cef97bp-3, 0x1.295913d24f750p-3, + 0x1.37239fa295d17p-3, 0x1.44d68dd78714bp-3, + 0x1.52722ebe5d780p-3, 0x1.5ff6d12671f98p-3, + 0x1.6d64c2389484bp-3, 0x1.7abc4da40fddap-3, + 0x1.87fdbda1e8452p-3, 0x1.95295b06a5f37p-3, + 0x1.a23f6d34abbc5p-3, 0x1.af403a28e04f2p-3, + 0x1.bc2c06a85721ap-3, 0x1.c903161240163p-3, + 0x1.d5c5aa93287ebp-3, 0x1.e274051823fa9p-3, + 0x1.ef0e656300c16p-3, 0x1.fb9509f05aa2ap-3, + 0x1.04041821f37afp-2, 0x1.0a340a49b3029p-2, + 0x1.105a7918a126dp-2, 0x1.1677819812b84p-2, + 0x1.1c8b405b40c0ep-2, 0x1.2295d16cfa6b1p-2, + 0x1.28975066318a2p-2, 0x1.2e8fd855d86fcp-2, + 0x1.347f83d605e59p-2, 0x1.3a666d1244588p-2, + 0x1.4044adb6f8ec4p-2, 0x1.461a5f077558cp-2, + 0x1.4be799e20b9c8p-2, 0x1.51ac76a6b79dfp-2, + 0x1.57690d5744a45p-2, 0x1.5d1d758e45217p-2}, + + .poly = 
{-0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2, + 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3}, +}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c b/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c new file mode 100644 index 000000000000..11f0b8aa12c5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_logf_3u4.c @@ -0,0 +1,77 @@ +/* + * Single-precision vector log function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define P(i) __sv_logf_poly[i] + +#define Ln2 (0x1.62e43p-1f) /* 0x3f317218 */ +#define Min (0x00800000) +#define Max (0x7f800000) +#define Mask (0x007fffff) +#define Off (0x3f2aaaab) /* 0.666667 */ + +float +optr_aor_log_f32 (float); + +static NOINLINE sv_f32_t +__sv_logf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (optr_aor_log_f32, x, y, cmp); +} + +/* Optimised implementation of SVE logf, using the same algorithm and polynomial + as the Neon routine in math/. Maximum error is 3.34 ULPs: + __sv_logf(0x1.557298p+0) got 0x1.26edecp-2 + want 0x1.26ede6p-2. */ +sv_f32_t +__sv_logf_x (sv_f32_t x, const svbool_t pg) +{ + sv_u32_t u = sv_as_u32_f32 (x); + svbool_t cmp + = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min)); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u = svsub_n_u32_x (pg, u, Off); + sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u), + 23)); /* Sign-extend. */ + u = svand_n_u32_x (pg, u, Mask); + u = svadd_n_u32_x (pg, u, Off); + sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f); + + /* y = log(1+r) + n*ln2. */ + sv_f32_t r2 = svmul_f32_x (pg, r, r); + /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). 
*/ + sv_f32_t p = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (2))); + sv_f32_t q = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (4))); + sv_f32_t y = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (6))); + p = sv_fma_n_f32_x (pg, P (0), r2, p); + q = sv_fma_f32_x (pg, p, r2, q); + y = sv_fma_f32_x (pg, q, r2, y); + p = sv_fma_n_f32_x (pg, Ln2, n, r); + y = sv_fma_f32_x (pg, y, r2, p); + + if (unlikely (svptest_any (pg, cmp))) + return __sv_logf_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_logf_x, _ZGVsMxv_logf) + +PL_SIG (SV, F, 1, log, 0.01, 11.1) +PL_TEST_ULP (__sv_logf, 2.85) +PL_TEST_INTERVAL (__sv_logf, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_logf, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_logf, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_logf, 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (__sv_logf, 1.0, 100, 50000) +PL_TEST_INTERVAL (__sv_logf, 100, inf, 50000) +#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_logf_data.c b/contrib/arm-optimized-routines/pl/math/sv_logf_data.c new file mode 100644 index 000000000000..51dd7a7eeb37 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_logf_data.c @@ -0,0 +1,12 @@ +/* + * Coefficients for single-precision SVE log function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +const float __sv_logf_poly[] = { + /* Copied from coeffs for the Neon routine in math/. */ + -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f, + -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f, +}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_math.h b/contrib/arm-optimized-routines/pl/math/sv_math.h new file mode 100644 index 000000000000..5ef0ad3bd5e0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_math.h @@ -0,0 +1,245 @@ +/* + * Wrapper functions for SVE ACLE. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef SV_MATH_H +#define SV_MATH_H + +#ifndef WANT_VMATH +/* Enable the build of vector math code. */ +#define WANT_VMATH 1 +#endif +#if WANT_VMATH + +#if WANT_SVE_MATH +#define SV_SUPPORTED 1 + +#include <arm_sve.h> +#include <stdbool.h> + +#include "math_config.h" + +typedef float f32_t; +typedef uint32_t u32_t; +typedef int32_t s32_t; +typedef double f64_t; +typedef uint64_t u64_t; +typedef int64_t s64_t; + +typedef svfloat64_t sv_f64_t; +typedef svuint64_t sv_u64_t; +typedef svint64_t sv_s64_t; + +typedef svfloat32_t sv_f32_t; +typedef svuint32_t sv_u32_t; +typedef svint32_t sv_s32_t; + +/* Double precision. */ +static inline sv_s64_t +sv_s64 (s64_t x) +{ + return svdup_n_s64 (x); +} + +static inline sv_u64_t +sv_u64 (u64_t x) +{ + return svdup_n_u64 (x); +} + +static inline sv_f64_t +sv_f64 (f64_t x) +{ + return svdup_n_f64 (x); +} + +static inline sv_f64_t +sv_fma_f64_x (svbool_t pg, sv_f64_t x, sv_f64_t y, sv_f64_t z) +{ + return svmla_f64_x (pg, z, x, y); +} + +/* res = z + x * y with x scalar. 
*/ +static inline sv_f64_t +sv_fma_n_f64_x (svbool_t pg, f64_t x, sv_f64_t y, sv_f64_t z) +{ + return svmla_n_f64_x (pg, z, y, x); +} + +static inline sv_s64_t +sv_as_s64_u64 (sv_u64_t x) +{ + return svreinterpret_s64_u64 (x); +} + +static inline sv_u64_t +sv_as_u64_f64 (sv_f64_t x) +{ + return svreinterpret_u64_f64 (x); +} + +static inline sv_f64_t +sv_as_f64_u64 (sv_u64_t x) +{ + return svreinterpret_f64_u64 (x); +} + +static inline sv_f64_t +sv_to_f64_s64_x (svbool_t pg, sv_s64_t s) +{ + return svcvt_f64_x (pg, s); +} + +static inline sv_f64_t +sv_call_f64 (f64_t (*f) (f64_t), sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + f64_t elem = svclastb_n_f64 (p, 0, x); + elem = (*f) (elem); + sv_f64_t y2 = svdup_n_f64 (elem); + y = svsel_f64 (p, y2, y); + p = svpnext_b64 (cmp, p); + } + return y; +} + +static inline sv_f64_t +sv_call2_f64 (f64_t (*f) (f64_t, f64_t), sv_f64_t x1, sv_f64_t x2, sv_f64_t y, + svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + f64_t elem1 = svclastb_n_f64 (p, 0, x1); + f64_t elem2 = svclastb_n_f64 (p, 0, x2); + f64_t ret = (*f) (elem1, elem2); + sv_f64_t y2 = svdup_n_f64 (ret); + y = svsel_f64 (p, y2, y); + p = svpnext_b64 (cmp, p); + } + return y; +} + +/* Load array of uint64_t into svuint64_t. */ +static inline sv_u64_t +sv_lookup_u64_x (svbool_t pg, const u64_t *tab, sv_u64_t idx) +{ + return svld1_gather_u64index_u64 (pg, tab, idx); +} + +/* Load array of double into svfloat64_t. */ +static inline sv_f64_t +sv_lookup_f64_x (svbool_t pg, const f64_t *tab, sv_u64_t idx) +{ + return svld1_gather_u64index_f64 (pg, tab, idx); +} + +static inline sv_u64_t +sv_mod_n_u64_x (svbool_t pg, sv_u64_t x, u64_t y) +{ + sv_u64_t q = svdiv_n_u64_x (pg, x, y); + return svmls_n_u64_x (pg, x, q, y); +} + +/* Single precision. 
*/ +static inline sv_s32_t +sv_s32 (s32_t x) +{ + return svdup_n_s32 (x); +} + +static inline sv_u32_t +sv_u32 (u32_t x) +{ + return svdup_n_u32 (x); +} + +static inline sv_f32_t +sv_f32 (f32_t x) +{ + return svdup_n_f32 (x); +} + +static inline sv_f32_t +sv_fma_f32_x (svbool_t pg, sv_f32_t x, sv_f32_t y, sv_f32_t z) +{ + return svmla_f32_x (pg, z, x, y); +} + +/* res = z + x * y with x scalar. */ +static inline sv_f32_t +sv_fma_n_f32_x (svbool_t pg, f32_t x, sv_f32_t y, sv_f32_t z) +{ + return svmla_n_f32_x (pg, z, y, x); +} + +static inline sv_u32_t +sv_as_u32_f32 (sv_f32_t x) +{ + return svreinterpret_u32_f32 (x); +} + +static inline sv_f32_t +sv_as_f32_u32 (sv_u32_t x) +{ + return svreinterpret_f32_u32 (x); +} + +static inline sv_s32_t +sv_as_s32_u32 (sv_u32_t x) +{ + return svreinterpret_s32_u32 (x); +} + +static inline sv_f32_t +sv_to_f32_s32_x (svbool_t pg, sv_s32_t s) +{ + return svcvt_f32_x (pg, s); +} + +static inline sv_s32_t +sv_to_s32_f32_x (svbool_t pg, sv_f32_t x) +{ + return svcvt_s32_f32_x (pg, x); +} + +static inline sv_f32_t +sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + f32_t elem = svclastb_n_f32 (p, 0, x); + elem = (*f) (elem); + sv_f32_t y2 = svdup_n_f32 (elem); + y = svsel_f32 (p, y2, y); + p = svpnext_b32 (cmp, p); + } + return y; +} + +static inline sv_f32_t +sv_call2_f32 (f32_t (*f) (f32_t, f32_t), sv_f32_t x1, sv_f32_t x2, sv_f32_t y, + svbool_t cmp) +{ + svbool_t p = svpfirst (cmp, svpfalse ()); + while (svptest_any (cmp, p)) + { + f32_t elem1 = svclastb_n_f32 (p, 0, x1); + f32_t elem2 = svclastb_n_f32 (p, 0, x2); + f32_t ret = (*f) (elem1, elem2); + sv_f32_t y2 = svdup_n_f32 (ret); + y = svsel_f32 (p, y2, y); + p = svpnext_b32 (cmp, p); + } + return y; +} + +#endif +#endif +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_powi.c b/contrib/arm-optimized-routines/pl/math/sv_powi.c new file mode 100644 index 
000000000000..1bb0eb3d3498 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_powi.c @@ -0,0 +1,53 @@ +/* + * Double-precision SVE powi(x, n) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +/* Optimized double-precision vector powi (double base, long integer power). + powi is developed for environments in which accuracy is of much less + importance than performance, hence we provide no estimate for worst-case + error. */ +svfloat64_t +__sv_powi_x (svfloat64_t as, svint64_t ns, svbool_t p) +{ + /* Compute powi by successive squaring, right to left. */ + svfloat64_t acc = svdup_n_f64 (1.0); + svbool_t want_recip = svcmplt_n_s64 (p, ns, 0); + svuint64_t ns_abs = svreinterpret_u64_s64 (svabs_s64_x (p, ns)); + + /* We use a max to avoid needing to check whether any lane != 0 on each + iteration. */ + uint64_t max_n = svmaxv_u64 (p, ns_abs); + + svfloat64_t c = as; + /* Successively square c, and use merging predication (_m) to determine + whether or not to perform the multiplication or keep the previous + iteration. */ + while (true) + { + svbool_t px = svcmpeq_n_u64 (p, svand_n_u64_x (p, ns_abs, 1ull), 1ull); + acc = svmul_f64_m (px, acc, c); + max_n >>= 1; + if (max_n == 0) + break; + + ns_abs = svlsr_n_u64_x (p, ns_abs, 1); + c = svmul_f64_x (p, c, c); + } + + /* Negative powers are handled by computing the abs(n) version and then + taking the reciprocal. 
*/ + if (svptest_any (want_recip, want_recip)) + acc = svdivr_n_f64_m (want_recip, acc, 1.0); + + return acc; +} + +strong_alias (__sv_powi_x, _ZGVsMxvv_powk) + +#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_powif.c b/contrib/arm-optimized-routines/pl/math/sv_powif.c new file mode 100644 index 000000000000..d0567e393927 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_powif.c @@ -0,0 +1,54 @@ +/* + * Single-precision SVE powi(x, n) function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#if SV_SUPPORTED + +/* Optimized single-precision vector powi (float base, integer power). + powi is developed for environments in which accuracy is of much less + importance than performance, hence we provide no estimate for worst-case + error. */ +svfloat32_t +__sv_powif_x (svfloat32_t as, svint32_t ns, svbool_t p) +{ + /* Compute powi by successive squaring, right to left. */ + svfloat32_t acc = svdup_n_f32 (1.f); + svbool_t want_recip = svcmplt_n_s32 (p, ns, 0); + svuint32_t ns_abs = svreinterpret_u32_s32 (svabs_s32_x (p, ns)); + + /* We use a max to avoid needing to check whether any lane != 0 on each + iteration. */ + uint32_t max_n = svmaxv_u32 (p, ns_abs); + + svfloat32_t c = as; + /* Successively square c, and use merging predication (_m) to determine + whether or not to perform the multiplication or keep the previous + iteration. */ + while (true) + { + svbool_t px = svcmpeq_n_u32 (p, svand_n_u32_x (p, ns_abs, 1), 1); + acc = svmul_f32_m (px, acc, c); + max_n >>= 1; + if (max_n == 0) + break; + + ns_abs = svlsr_n_u32_x (p, ns_abs, 1); + c = svmul_f32_x (p, c, c); + } + + /* Negative powers are handled by computing the abs(n) version and then + taking the reciprocal. */ + if (svptest_any (want_recip, want_recip)) + acc = svdivr_n_f32_m (want_recip, acc, 1.0f); + + return acc; +} + +/* Note no trailing f for ZGV... 
name - 64-bit integer version is powk. */ +strong_alias (__sv_powif_x, _ZGVsMxvv_powi) + +#endif // SV_SUPPORTED diff --git a/contrib/arm-optimized-routines/pl/math/sv_sin_3u.c b/contrib/arm-optimized-routines/pl/math/sv_sin_3u.c new file mode 100644 index 000000000000..3fee08061918 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sin_3u.c @@ -0,0 +1,89 @@ +/* + * Double-precision SVE sin(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define InvPi (sv_f64 (0x1.45f306dc9c883p-2)) +#define HalfPi (sv_f64 (0x1.921fb54442d18p+0)) +#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1)) +#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0)) +#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26)) +#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54)) +#define Shift (sv_f64 (0x1.8p52)) +#define RangeVal (sv_f64 (0x1p23)) +#define AbsMask (0x7fffffffffffffff) + +static NOINLINE sv_f64_t +__sv_sin_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp) +{ + return sv_call_f64 (sin, x, y, cmp); +} + +/* A fast SVE implementation of sin based on trigonometric + instructions (FTMAD, FTSSEL, FTSMUL). + Maximum observed error in 2.52 ULP: + __sv_sin(0x1.2d2b00df69661p+19) got 0x1.10ace8f3e786bp-40 + want 0x1.10ace8f3e7868p-40. */ +sv_f64_t +__sv_sin_x (sv_f64_t x, const svbool_t pg) +{ + sv_f64_t n, r, r2, y; + sv_u64_t sign; + svbool_t cmp; + + r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask)); + sign = svand_n_u64_x (pg, sv_as_u64_f64 (x), ~AbsMask); + cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal)); + + /* n = rint(|x|/(pi/2)). */ + sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift); + n = svsub_f64_x (pg, q, Shift); + + /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). 
*/ + r = sv_fma_f64_x (pg, NegPio2_1, n, r); + r = sv_fma_f64_x (pg, NegPio2_2, n, r); + r = sv_fma_f64_x (pg, NegPio2_3, n, r); + + /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ + sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q)); + + /* sin(r) poly approx. */ + r2 = svtsmul_f64 (r, sv_as_u64_f64 (q)); + y = sv_f64 (0.0); + y = svtmad_f64 (y, r2, 7); + y = svtmad_f64 (y, r2, 6); + y = svtmad_f64 (y, r2, 5); + y = svtmad_f64 (y, r2, 4); + y = svtmad_f64 (y, r2, 3); + y = svtmad_f64 (y, r2, 2); + y = svtmad_f64 (y, r2, 1); + y = svtmad_f64 (y, r2, 0); + + /* Apply factor. */ + y = svmul_f64_x (pg, f, y); + + /* sign = y^sign. */ + y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign)); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. */ + if (unlikely (svptest_any (pg, cmp))) + return __sv_sin_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_sin_x, _ZGVsMxv_sin) + +PL_SIG (SV, D, 1, sin, -3.1, 3.1) +PL_TEST_ULP (__sv_sin, 2.03) +PL_TEST_INTERVAL (__sv_sin, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_sin, 0x1p-4, 0x1p4, 500000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c b/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c new file mode 100644 index 000000000000..9184ccd3cf0c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sinf_1u9.c @@ -0,0 +1,84 @@ +/* + * Single-precision SVE sin(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +#define A3 (sv_f32 (__sv_sinf_data.coeffs[3])) +#define A5 (sv_f32 (__sv_sinf_data.coeffs[2])) +#define A7 (sv_f32 (__sv_sinf_data.coeffs[1])) +#define A9 (sv_f32 (__sv_sinf_data.coeffs[0])) + +#define NegPi1 (sv_f32 (-0x1.921fb6p+1f)) +#define NegPi2 (sv_f32 (0x1.777a5cp-24f)) +#define NegPi3 (sv_f32 (0x1.ee59dap-49f)) +#define RangeVal (sv_f32 (0x1p20f)) +#define InvPi (sv_f32 (0x1.45f306p-2f)) +#define Shift (sv_f32 (0x1.8p+23f)) +#define AbsMask (0x7fffffff) + +static NOINLINE sv_f32_t +__sv_sinf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (sinf, x, y, cmp); +} + +/* A fast SVE implementation of sinf. + Maximum error: 1.89 ULPs. + This maximum error is achieved at multiple values in [-2^18, 2^18] + but one example is: + __sv_sinf(0x1.9247a4p+0) got 0x1.fffff6p-1 want 0x1.fffffap-1. */ +sv_f32_t +__sv_sinf_x (sv_f32_t x, const svbool_t pg) +{ + sv_f32_t n, r, r2, y; + sv_u32_t sign, odd; + svbool_t cmp; + + r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask)); + sign = svand_n_u32_x (pg, sv_as_u32_f32 (x), ~AbsMask); + cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal)); + + /* n = rint(|x|/pi). */ + n = sv_fma_f32_x (pg, InvPi, r, Shift); + odd = svlsl_n_u32_x (pg, sv_as_u32_f32 (n), 31); + n = svsub_f32_x (pg, n, Shift); + + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ + r = sv_fma_f32_x (pg, NegPi1, n, r); + r = sv_fma_f32_x (pg, NegPi2, n, r); + r = sv_fma_f32_x (pg, NegPi3, n, r); + + /* sin(r) approx using a degree 9 polynomial from the Taylor series + expansion. Note that only the odd terms of this are non-zero. 
*/ + r2 = svmul_f32_x (pg, r, r); + y = sv_fma_f32_x (pg, A9, r2, A7); + y = sv_fma_f32_x (pg, y, r2, A5); + y = sv_fma_f32_x (pg, y, r2, A3); + y = sv_fma_f32_x (pg, svmul_f32_x (pg, y, r2), r, r); + + /* sign = y^sign^odd. */ + y = sv_as_f32_u32 ( + sveor_u32_x (pg, sv_as_u32_f32 (y), sveor_u32_x (pg, sign, odd))); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. */ + if (unlikely (svptest_any (pg, cmp))) + return __sv_sinf_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_sinf_x, _ZGVsMxv_sinf) + +PL_SIG (SV, F, 1, sin, -3.1, 3.1) +PL_TEST_ULP (__sv_sinf, 1.40) +PL_TEST_INTERVAL (__sv_sinf, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (__sv_sinf, 0x1p-4, 0x1p4, 500000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/sv_sinf_poly_data.c b/contrib/arm-optimized-routines/pl/math/sv_sinf_poly_data.c new file mode 100644 index 000000000000..1e1ab5e48df1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_sinf_poly_data.c @@ -0,0 +1,19 @@ +/* + * Data used in single-precision sin(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Polynomial coefficients for approximating sin(x) in single + precision. These are the non-zero coefficients from the + degree 9 Taylor series expansion of sin. */ + +const struct sv_sinf_data __sv_sinf_data = {.coeffs = { + 0x1.5b2e76p-19f, + -0x1.9f42eap-13f, + 0x1.110df4p-7f, + -0x1.555548p-3f, + }}; diff --git a/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c new file mode 100644 index 000000000000..cca43bd886fd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/sv_tanf_3u5.c @@ -0,0 +1,112 @@ +/* + * Single-precision vector tan(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "sv_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if SV_SUPPORTED + +/* Constants. */ +#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f)) +#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f)) +#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f)) +#define InvPio2 (sv_f32 (0x1.45f306p-1f)) +#define RangeVal (sv_f32 (0x1p15f)) +#define Shift (sv_f32 (0x1.8p+23f)) + +#define poly(i) sv_f32 (__tanf_poly_data.poly_tan[i]) + +/* Use full Estrin's scheme to evaluate polynomial. */ +static inline sv_f32_t +eval_poly (svbool_t pg, sv_f32_t z) +{ + sv_f32_t z2 = svmul_f32_x (pg, z, z); + sv_f32_t z4 = svmul_f32_x (pg, z2, z2); + sv_f32_t y_10 = sv_fma_f32_x (pg, z, poly (1), poly (0)); + sv_f32_t y_32 = sv_fma_f32_x (pg, z, poly (3), poly (2)); + sv_f32_t y_54 = sv_fma_f32_x (pg, z, poly (5), poly (4)); + sv_f32_t y_32_10 = sv_fma_f32_x (pg, z2, y_32, y_10); + sv_f32_t y = sv_fma_f32_x (pg, z4, y_54, y_32_10); + return y; +} + +static NOINLINE sv_f32_t +__sv_tanf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp) +{ + return sv_call_f32 (tanf, x, y, cmp); +} + +/* Fast implementation of SVE tanf. + Maximum error is 3.45 ULP: + __sv_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1 + want 0x1.ff9850p-1. */ +sv_f32_t +__sv_tanf_x (sv_f32_t x, const svbool_t pg) +{ + /* Determine whether input is too large to perform fast regression. */ + svbool_t cmp = svacge_f32 (pg, x, RangeVal); + svbool_t pred_minuszero = svcmpeq_f32 (pg, x, sv_f32 (-0.0)); + + /* n = rint(x/(pi/2)). */ + sv_f32_t q = sv_fma_f32_x (pg, InvPio2, x, Shift); + sv_f32_t n = svsub_f32_x (pg, q, Shift); + /* n is already a signed integer, simply convert it. */ + sv_s32_t in = sv_to_s32_f32_x (pg, n); + /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ + sv_s32_t alt = svand_s32_x (pg, in, sv_s32 (1)); + svbool_t pred_alt = svcmpne_s32 (pg, alt, sv_s32 (0)); + + /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). 
*/ + sv_f32_t r; + r = sv_fma_f32_x (pg, NegPio2_1, n, x); + r = sv_fma_f32_x (pg, NegPio2_2, n, r); + r = sv_fma_f32_x (pg, NegPio2_3, n, r); + + /* If x lives in an interval, where |tan(x)| + - is finite, then use a polynomial approximation of the form + tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). + - grows to infinity then use symmetries of tangent and the identity + tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use + the same polynomial approximation of tan as above. */ + + /* Perform additional reduction if required. */ + sv_f32_t z = svneg_f32_m (r, pred_alt, r); + + /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */ + sv_f32_t z2 = svmul_f32_x (pg, z, z); + sv_f32_t p = eval_poly (pg, z2); + sv_f32_t y = sv_fma_f32_x (pg, svmul_f32_x (pg, z, z2), p, z); + + /* Transform result back, if necessary. */ + sv_f32_t inv_y = svdiv_f32_x (pg, sv_f32 (1.0f), y); + y = svsel_f32 (pred_alt, inv_y, y); + + /* Fast reduction does not handle the x = -0.0 case well, + therefore it is fixed here. */ + y = svsel_f32 (pred_minuszero, x, y); + + /* No need to pass pg to specialcase here since cmp is a strict subset, + guaranteed by the cmpge above. 
*/ + if (unlikely (svptest_any (pg, cmp))) + return __sv_tanf_specialcase (x, y, cmp); + return y; +} + +PL_ALIAS (__sv_tanf_x, _ZGVsMxv_tanf) + +PL_SIG (SV, F, 1, tan, -3.1, 3.1) +PL_TEST_ULP (__sv_tanf, 2.96) +PL_TEST_INTERVAL (__sv_tanf, -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (__sv_tanf, 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (__sv_tanf, 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (__sv_tanf, 0x1p-23, 0.7, 50000) +PL_TEST_INTERVAL (__sv_tanf, 0.7, 1.5, 50000) +PL_TEST_INTERVAL (__sv_tanf, 1.5, 100, 50000) +PL_TEST_INTERVAL (__sv_tanf, 100, 0x1p17, 50000) +PL_TEST_INTERVAL (__sv_tanf, 0x1p17, inf, 50000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/tanf_3u3.c b/contrib/arm-optimized-routines/pl/math/tanf_3u3.c new file mode 100644 index 000000000000..ec006dc04c4c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tanf_3u3.c @@ -0,0 +1,202 @@ +/* + * Single-precision scalar tan(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "pairwise_hornerf.h" + +/* Useful constants. */ +#define NegPio2_1 (-0x1.921fb6p+0f) +#define NegPio2_2 (0x1.777a5cp-25f) +#define NegPio2_3 (0x1.ee59dap-50f) +/* Reduced from 0x1p20 to 0x1p17 to ensure 3.5ulps. */ +#define RangeVal (0x1p17f) +#define InvPio2 ((0x1.45f306p-1f)) +#define Shift (0x1.8p+23f) +#define AbsMask (0x7fffffff) +#define Pio4 (0x1.921fb6p-1) +/* 2PI * 2^-64. */ +#define Pio2p63 (0x1.921FB54442D18p-62) + +#define P(i) __tanf_poly_data.poly_tan[i] +#define Q(i) __tanf_poly_data.poly_cotan[i] + +static inline float +eval_P (float z) +{ + return PAIRWISE_HORNER_5 (z, z * z, P); +} + +static inline float +eval_Q (float z) +{ + return PAIRWISE_HORNER_3 (z, z * z, Q); +} + +/* Reduction of the input argument x using Cody-Waite approach, such that x = r + + n * pi/2 with r lives in [-pi/4, pi/4] and n is a signed integer. 
*/ +static inline float +reduce (float x, int32_t *in) +{ + /* n = rint(x/(pi/2)). */ + float r = x; + float q = fmaf (InvPio2, r, Shift); + float n = q - Shift; + /* There is no rounding here, n is representable by a signed integer. */ + *in = (int32_t) n; + /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */ + r = fmaf (NegPio2_1, n, r); + r = fmaf (NegPio2_2, n, r); + r = fmaf (NegPio2_3, n, r); + return r; +} + +/* Table with 4/PI to 192 bit precision. To avoid unaligned accesses + only 8 new bits are added per entry, making the table 4 times larger. */ +static const uint32_t __inv_pio4[24] + = {0x000000a2, 0x0000a2f9, 0x00a2f983, 0xa2f9836e, 0xf9836e4e, 0x836e4e44, + 0x6e4e4415, 0x4e441529, 0x441529fc, 0x1529fc27, 0x29fc2757, 0xfc2757d1, + 0x2757d1f5, 0x57d1f534, 0xd1f534dd, 0xf534ddc0, 0x34ddc0db, 0xddc0db62, + 0xc0db6295, 0xdb629599, 0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041}; + +/* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic. + XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored). + Return the modulo between -PI/4 and PI/4 and store the quadrant in NP. + Reduction uses a table of 4/PI with 192 bits of precision. A 32x96->128 bit + multiply computes the exact 2.62-bit fixed-point modulo. Since the result + can have at most 29 leading zeros after the binary point, the double + precision result is accurate to 33 bits. */ +static inline double +reduce_large (uint32_t xi, int *np) +{ + const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15]; + int shift = (xi >> 23) & 7; + uint64_t n, res0, res1, res2; + + xi = (xi & 0xffffff) | 0x800000; + xi <<= shift; + + res0 = xi * arr[0]; + res1 = (uint64_t) xi * arr[4]; + res2 = (uint64_t) xi * arr[8]; + res0 = (res2 >> 32) | (res0 << 32); + res0 += res1; + + n = (res0 + (1ULL << 61)) >> 62; + res0 -= n << 62; + double x = (int64_t) res0; + *np = n; + return x * Pio2p63; +} + +/* Top 12 bits of the float representation with the sign bit cleared. 
*/ +static inline uint32_t +top12 (float x) +{ + return (asuint (x) >> 20); +} + +/* Fast single-precision tan implementation. + Maximum ULP error: 3.293ulps. + tanf(0x1.c849eap+16) got -0x1.fe8d98p-1 want -0x1.fe8d9ep-1. */ +float +tanf (float x) +{ + /* Get top words. */ + uint32_t ix = asuint (x); + uint32_t ia = ix & AbsMask; + uint32_t ia12 = ia >> 20; + + /* Dispatch between no reduction (small numbers), fast reduction and + slow large numbers reduction. The reduction step determines r float + (|r| < pi/4) and n signed integer such that x = r + n * pi/2. */ + int32_t n; + float r; + if (ia12 < top12 (Pio4)) + { + /* Optimize small values. */ + if (unlikely (ia12 < top12 (0x1p-12f))) + { + if (unlikely (ia12 < top12 (0x1p-126f))) + /* Force underflow for tiny x. */ + force_eval_float (x * x); + return x; + } + + /* tan (x) ~= x + x^3 * P(x^2). */ + float x2 = x * x; + float y = eval_P (x2); + return fmaf (x2, x * y, x); + } + /* Similar to other trigonometric routines, fast inaccurate reduction is + performed for values of x from pi/4 up to RangeVal. In order to keep errors + below 3.5ulps, we set the value of RangeVal to 2^17. This might differ for + other trigonometric routines. Above this value more advanced but slower + reduction techniques need to be implemented to reach a similar accuracy. + */ + else if (ia12 < top12 (RangeVal)) + { + /* Fast inaccurate reduction. */ + r = reduce (x, &n); + } + else if (ia12 < 0x7f8) + { + /* Slow accurate reduction. */ + uint32_t sign = ix & ~AbsMask; + double dar = reduce_large (ia, &n); + float ar = (float) dar; + r = asfloat (asuint (ar) ^ sign); + } + else + { + /* tan(Inf or NaN) is NaN. */ + return __math_invalidf (x); + } + + /* If x lives in an interval where |tan(x)| + - is finite then use an approximation of tangent in the form + tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). 
+ - grows to infinity then use an approximation of cotangent in the form + cotan(z) ~ 1/z + z * Q(z^2), where the reciprocal can be computed early. + Using symmetries of tangent and the identity tan(r) = cotan(pi/2 - r), + we only need to change the sign of r to obtain tan(x) from cotan(r). + This 2-interval approach requires 2 different sets of coefficients P and + Q, where Q is a lower order polynomial than P. */ + + /* Determine if x lives in an interval where |tan(x)| grows to infinity. */ + uint32_t alt = (uint32_t) n & 1; + + /* Perform additional reduction if required. */ + float z = alt ? -r : r; + + /* Prepare backward transformation. */ + float z2 = r * r; + float offset = alt ? 1.0f / z : z; + float scale = alt ? z : z * z2; + + /* Evaluate polynomial approximation of tan or cotan. */ + float p = alt ? eval_Q (z2) : eval_P (z2); + + /* A unified way of assembling the result on both interval types. */ + return fmaf (scale, p, offset); +} + +PL_SIG (S, F, 1, tan, -3.1, 3.1) +PL_TEST_ULP (tanf, 2.80) +PL_TEST_INTERVAL (tanf, 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000) +PL_TEST_INTERVAL (tanf, -0x1p-127, -0x1p-14, 50000) +PL_TEST_INTERVAL (tanf, 0x1p-14, 0.7, 50000) +PL_TEST_INTERVAL (tanf, -0x1p-14, -0.7, 50000) +PL_TEST_INTERVAL (tanf, 0.7, 1.5, 50000) +PL_TEST_INTERVAL (tanf, -0.7, -1.5, 50000) +PL_TEST_INTERVAL (tanf, 1.5, 0x1p17, 50000) +PL_TEST_INTERVAL (tanf, -1.5, -0x1p17, 50000) +PL_TEST_INTERVAL (tanf, 0x1p17, 0x1p54, 50000) +PL_TEST_INTERVAL (tanf, -0x1p17, -0x1p54, 50000) +PL_TEST_INTERVAL (tanf, 0x1p54, inf, 50000) +PL_TEST_INTERVAL (tanf, -0x1p54, -inf, 50000) diff --git a/contrib/arm-optimized-routines/pl/math/tanf_data.c b/contrib/arm-optimized-routines/pl/math/tanf_data.c new file mode 100644 index 000000000000..a6b9d512eed2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tanf_data.c @@ -0,0 +1,45 @@ +/* + * Data used in single-precision tan(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +const struct tanf_poly_data __tanf_poly_data = { +.poly_tan = { +/* Coefficients generated using: + poly = fpminimax((tan(sqrt(x))-sqrt(x))/x^(3/2), deg, [|single ...|], [a*a;b*b]); + optimize relative error + final prec : 23 bits + deg : 5 + a : 0x1p-126 ^ 2 + b : ((pi) / 0x1p2) ^ 2 + dirty rel error: 0x1.f7c2e4p-25 + dirty abs error: 0x1.f7c2ecp-25. */ +0x1.55555p-2, +0x1.11166p-3, +0x1.b88a78p-5, +0x1.7b5756p-6, +0x1.4ef4cep-8, +0x1.0e1e74p-7 +}, +.poly_cotan = { +/* Coefficients generated using: + fpminimax(f(x) = (0x1p0 / tan(sqrt(x)) - 0x1p0 / sqrt(x)) / sqrt(x), deg, [|dtype ...|], [a;b]) + optimize a single polynomial + optimize absolute error + final prec : 23 bits + working prec : 128 bits + deg : 3 + a : 0x1p-126 + b : (pi) / 0x1p2 + dirty rel error : 0x1.81298cp-25 + dirty abs error : 0x1.a8acf4p-25. */ +-0x1.55555p-2, /* -0.33333325. */ +-0x1.6c23e4p-6, /* -2.2225354e-2. */ +-0x1.12dbap-9, /* -2.0969994e-3. */ +-0x1.05a1c2p-12, /* -2.495116e-4. */ +} +}; diff --git a/contrib/arm-optimized-routines/pl/math/tanh_3u.c b/contrib/arm-optimized-routines/pl/math/tanh_3u.c new file mode 100644 index 000000000000..46d9fb3fd7e1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tanh_3u.c @@ -0,0 +1,82 @@ +/* + * Double-precision tanh(x) function. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define InvLn2 0x1.71547652b82fep0 +#define Ln2hi 0x1.62e42fefa39efp-1 +#define Ln2lo 0x1.abc9e3b39803fp-56 +#define Shift 0x1.8p52 +#define C(i) __expm1_poly[i] + +#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */ +#define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). 
*/ +#define One 0x3ff0000000000000 + +static inline double +expm1_inline (double x) +{ + /* Helper routine for calculating exp(x) - 1. Copied from expm1_2u5.c, with + several simplifications: + - No special-case handling for tiny or special values. + - Simpler combination of p and t in final stage of the algorithm. + - Use shift-and-add instead of ldexp to calculate t. */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + double j = fma (InvLn2, x, Shift) - Shift; + int64_t i = j; + double f = fma (j, -Ln2hi, x); + f = fma (j, -Ln2lo, f); + + /* Approximate expm1(f) using polynomial. */ + double f2 = f * f; + double f4 = f2 * f2; + double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + + /* t = 2 ^ i. */ + double t = asdouble ((uint64_t) (i + 1023) << 52); + /* expm1(x) = p * t + (t - 1). */ + return fma (p, t, t - 1); +} + +/* Approximation for double-precision tanh(x), using a simplified version of + expm1. The greatest observed error is 2.75 ULP: + tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3 + want -0x1.ba31ba4691ab4p-3. */ +double +tanh (double x) +{ + uint64_t ix = asuint64 (x); + uint64_t ia = ix & AbsMask; + uint64_t sign = ix & ~AbsMask; + + if (unlikely (ia > BoringBound)) + { + if (ia > 0x7ff0000000000000) + return __math_invalid (x); + return asdouble (One | sign); + } + + if (unlikely (ia < TinyBound)) + return x; + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
*/ + double q = expm1_inline (2 * x); + return q / (q + 2); +} + +PL_SIG (S, D, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (tanh, 2.26) +PL_TEST_INTERVAL (tanh, 0, TinyBound, 1000) +PL_TEST_INTERVAL (tanh, -0, -TinyBound, 1000) +PL_TEST_INTERVAL (tanh, TinyBound, BoringBound, 100000) +PL_TEST_INTERVAL (tanh, -TinyBound, -BoringBound, 100000) +PL_TEST_INTERVAL (tanh, BoringBound, inf, 1000) +PL_TEST_INTERVAL (tanh, -BoringBound, -inf, 1000) diff --git a/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c new file mode 100644 index 000000000000..76e54a438e57 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tanhf_2u6.c @@ -0,0 +1,91 @@ +/* + * Single-precision tanh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define BoringBound \ + 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ + negative). */ +#define AbsMask 0x7fffffff +#define One 0x3f800000 + +#define Shift (0x1.8p23f) +#define InvLn2 (0x1.715476p+0f) +#define Ln2hi (0x1.62e4p-1f) +#define Ln2lo (0x1.7f7d1cp-20f) + +#define C(i) __expm1f_poly[i] + +static inline float +expm1f_inline (float x) +{ + /* Helper routine for calculating exp(x) - 1. + Copied from expm1f_1u6.c, with several simplifications: + - No special-case handling for tiny or special values, instead return early + from the main routine. + - No special handling for large values: + - No early return for infinity. + - Simpler combination of p and t in final stage of algorithm. + - |i| < 27, so can calculate t by simpler shift-and-add, instead of + ldexpf (same as vector algorithm). */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. 
*/ + float j = fmaf (InvLn2, x, Shift) - Shift; + int32_t i = j; + float f = fmaf (j, -Ln2hi, x); + f = fmaf (j, -Ln2lo, f); + + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). + Uses Estrin scheme, where the main expm1f routine uses Horner. */ + float f2 = f * f; + float p_01 = fmaf (f, C (1), C (0)); + float p_23 = fmaf (f, C (3), C (2)); + float p = fmaf (f2, p_23, p_01); + p = fmaf (f2 * f2, C (4), p); + p = fmaf (f2, p, f); + + /* t = 2^i. */ + float t = asfloat ((uint32_t) (i + 127) << 23); + /* expm1(x) ~= p * t + (t - 1). */ + return fmaf (p, t, t - 1); +} + +/* Approximation for single-precision tanh(x), using a simplified version of + expm1f. The maximum error is 2.58 ULP: + tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5 + want 0x1.f9ba08p-5. */ +float +tanhf (float x) +{ + uint32_t ix = asuint (x); + uint32_t iax = ix & AbsMask; + uint32_t sign = ix & ~AbsMask; + + if (unlikely (iax > BoringBound)) + { + if (iax > 0x7f800000) + return __math_invalidf (x); + return asfloat (One | sign); + } + + if (unlikely (iax < 0x34000000)) + return x; + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + float q = expm1f_inline (2 * x); + return q / (q + 2); +} + +PL_SIG (S, F, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (tanhf, 2.09) +PL_TEST_INTERVAL (tanhf, 0, 0x1p-23, 1000) +PL_TEST_INTERVAL (tanhf, -0, -0x1p-23, 1000) +PL_TEST_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000) +PL_TEST_INTERVAL (tanhf, -0x1p-23, -0x1.205966p+3, 100000) +PL_TEST_INTERVAL (tanhf, 0x1.205966p+3, inf, 100) +PL_TEST_INTERVAL (tanhf, -0x1.205966p+3, -inf, 100) diff --git a/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h b/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h new file mode 100644 index 000000000000..e0f6ac70912c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/mathbench_funcs.h @@ -0,0 +1,86 @@ +// clang-format off +/* + * Function entries for mathbench. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#define _ZSF1(fun, a, b) F(fun##f, a, b) +#define _ZSD1(f, a, b) D(f, a, b) + +#ifdef __vpcs + +#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b) VNF(__vn_##fun##f, a, b) VNF(_ZGVnN4v_##fun##f, a, b) +#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b) VND(__vn_##f, a, b) VND(_ZGVnN2v_##f, a, b) + +#elif __aarch64__ + +#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b) +#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b) + +#elif WANT_VMATH + +#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) +#define _ZVD1(f, a, b) D(__s_##f, a, b) + +#else + +#define _ZVF1(f, a, b) +#define _ZVD1(f, a, b) + +#endif + +#if WANT_SVE_MATH + +#define _ZSVF1(fun, a, b) SVF(__sv_##fun##f_x, a, b) SVF(_ZGVsMxv_##fun##f, a, b) +#define _ZSVD1(f, a, b) SVD(__sv_##f##_x, a, b) SVD(_ZGVsMxv_##f, a, b) + +#else + +#define _ZSVF1(f, a, b) +#define _ZSVD1(f, a, b) + +#endif + +/* No auto-generated wrappers for binary functions - they have be + manually defined in mathbench_wrappers.h. We have to define silent + macros for them anyway as they will be emitted by PL_SIG. */ +#define _ZSF2(...) +#define _ZSD2(...) +#define _ZVF2(...) +#define _ZVD2(...) +#define _ZSVF2(...) +#define _ZSVD2(...) + +#include "mathbench_funcs_gen.h" + +/* PL_SIG only emits entries for unary functions, since if a function + needs to be wrapped in mathbench there is no way for it to know the + same of the wrapper. Add entries for binary functions, or any other + exotic signatures that need wrapping, below. 
*/ + +{"atan2f", 'f', 0, -10.0, 10.0, {.f = atan2f_wrap}}, +{"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}}, +{"powi", 'd', 0, 0.01, 11.1, {.d = powi_wrap}}, + +{"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}}, +{"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}}, +{"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}}, +{"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}}, +{"__vn_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = __vn_atan2f_wrap}}, +{"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}}, +{"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}}, +{"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}}, + +#if WANT_SVE_MATH +{"__sv_atan2f_x", 'f', 's', -10.0, 10.0, {.svf = __sv_atan2f_wrap}}, +{"_ZGVsMxvv_atan2f", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}}, +{"__sv_atan2_x", 'd', 's', -10.0, 10.0, {.svd = __sv_atan2_wrap}}, +{"_ZGVsM2vv_atan2", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}}, +{"__sv_powif_x", 'f', 's', -10.0, 10.0, {.svf = __sv_powif_wrap}}, +{"_ZGVsMxvv_powi", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}}, +{"__sv_powi_x", 'd', 's', -10.0, 10.0, {.svd = __sv_powi_wrap}}, +{"_ZGVsMxvv_powk", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}}, +#endif + // clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h b/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h new file mode 100644 index 000000000000..eba960eb96ac --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/mathbench_wrappers.h @@ -0,0 +1,133 @@ +/* + * Function wrappers for mathbench. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +static double +atan2_wrap (double x) +{ + return atan2 (5.0, x); +} + +static float +atan2f_wrap (float x) +{ + return atan2f (5.0f, x); +} + +static double +powi_wrap (double x) +{ + return __builtin_powi (x, (int) round (x)); +} + +#if WANT_VMATH +#if __aarch64__ + +static double +__s_atan2_wrap (double x) +{ + return __s_atan2 (5.0, x); +} + +static float +__s_atan2f_wrap (float x) +{ + return __s_atan2f (5.0f, x); +} + +static v_double +__v_atan2_wrap (v_double x) +{ + return __v_atan2 (v_double_dup (5.0), x); +} + +static v_float +__v_atan2f_wrap (v_float x) +{ + return __v_atan2f (v_float_dup (5.0f), x); +} + +#ifdef __vpcs + +__vpcs static v_double +__vn_atan2_wrap (v_double x) +{ + return __vn_atan2 (v_double_dup (5.0), x); +} + +__vpcs static v_float +__vn_atan2f_wrap (v_float x) +{ + return __vn_atan2f (v_float_dup (5.0f), x); +} + +__vpcs static v_double +_Z_atan2_wrap (v_double x) +{ + return _ZGVnN2vv_atan2 (v_double_dup (5.0), x); +} + +__vpcs static v_float +_Z_atan2f_wrap (v_float x) +{ + return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x); +} + +#endif // __vpcs +#endif // __arch64__ +#endif // WANT_VMATH + +#if WANT_SVE_MATH + +static sv_float +__sv_atan2f_wrap (sv_float x, sv_bool pg) +{ + return __sv_atan2f_x (x, svdup_n_f32 (5.0f), pg); +} + +static sv_float +_Z_sv_atan2f_wrap (sv_float x, sv_bool pg) +{ + return _ZGVsMxvv_atan2f (x, svdup_n_f32 (5.0f), pg); +} + +static sv_double +__sv_atan2_wrap (sv_double x, sv_bool pg) +{ + return __sv_atan2_x (x, svdup_n_f64 (5.0), pg); +} + +static sv_double +_Z_sv_atan2_wrap (sv_double x, sv_bool pg) +{ + return _ZGVsMxvv_atan2 (x, svdup_n_f64 (5.0), pg); +} + +static sv_float +_Z_sv_powi_wrap (sv_float x, sv_bool pg) +{ + return _ZGVsMxvv_powi (x, svcvt_s32_f32_x (pg, x), pg); +} + +static sv_float +__sv_powif_wrap (sv_float x, sv_bool pg) +{ + return __sv_powif_x (x, svcvt_s32_f32_x (pg, x), pg); +} + +static sv_double 
+_Z_sv_powk_wrap (sv_double x, sv_bool pg) +{ + return _ZGVsMxvv_powk (x, svcvt_s64_f64_x (pg, x), pg); +} + +static sv_double +__sv_powi_wrap (sv_double x, sv_bool pg) +{ + return __sv_powi_x (x, svcvt_s64_f64_x (pg, x), pg); +} + +#endif // WANT_SVE_MATH diff --git a/contrib/arm-optimized-routines/pl/math/test/pl_test.h b/contrib/arm-optimized-routines/pl/math/test/pl_test.h new file mode 100644 index 000000000000..467d1cac0c36 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/pl_test.h @@ -0,0 +1,33 @@ +/* + * PL macros for emitting various details about routines for consumption by + * runulp.sh. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception. + */ + +/* Emit the max ULP threshold, l, for routine f. Piggy-back PL_TEST_EXPECT_FENV + on PL_TEST_ULP to add EXPECT_FENV to all scalar routines. */ +#if !(V_SUPPORTED || SV_SUPPORTED) +#define PL_TEST_ULP(f, l) \ + PL_TEST_EXPECT_FENV_ALWAYS (f) \ + PL_TEST_ULP f l +#else +#define PL_TEST_ULP(f, l) PL_TEST_ULP f l +#endif + +/* Emit aliases to allow test params to be mapped from aliases back to their + aliasees. */ +#define PL_ALIAS(a, b) PL_TEST_ALIAS a b + +/* Emit routine name if e == 1 and f is expected to correctly trigger fenv + exceptions. e allows declaration to be emitted conditionally upon certain + build flags - defer expansion by one pass to allow those flags to be expanded + properly. 
*/ +#define PL_TEST_EXPECT_FENV(f, e) PL_TEST_EXPECT_FENV_ (f, e) +#define PL_TEST_EXPECT_FENV_(f, e) PL_TEST_EXPECT_FENV_##e (f) +#define PL_TEST_EXPECT_FENV_1(f) PL_TEST_EXPECT_FENV_ENABLED f +#define PL_TEST_EXPECT_FENV_ALWAYS(f) PL_TEST_EXPECT_FENV (f, 1) + +#define PL_TEST_INTERVAL(f, lo, hi, n) PL_TEST_INTERVAL f lo hi n +#define PL_TEST_INTERVAL_C(f, lo, hi, n, c) PL_TEST_INTERVAL f lo hi n c diff --git a/contrib/arm-optimized-routines/pl/math/test/runulp.sh b/contrib/arm-optimized-routines/pl/math/test/runulp.sh new file mode 100755 index 000000000000..4d02530d44b1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/runulp.sh @@ -0,0 +1,78 @@ +#!/bin/bash + +# ULP error check script. +# +# Copyright (c) 2019-2023, Arm Limited. +# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +#set -x +set -eu + +# cd to bin directory. +cd "${0%/*}" + +flags="${ULPFLAGS:--q}" +emu="$@" + +# Enable SVE testing +WANT_SVE_MATH=${WANT_SVE_MATH:-0} + +FAIL=0 +PASS=0 + +t() { + key=$(cat $ALIASES | { grep " $1$" || echo $1; } | awk '{print $1}') + L=$(cat $LIMITS | grep "^$key " | awk '{print $2}') + [[ $L =~ ^[0-9]+\.[0-9]+$ ]] + extra_flags="" + [[ -z "${5:-}" ]] || extra_flags="$extra_flags -c $5" + grep -q "^$key$" $FENV || extra_flags="$extra_flags -f" + $emu ./ulp -e $L $flags ${extra_flags} $1 $2 $3 $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1)) +} + +check() { + $emu ./ulp -f -q "$@" #>/dev/null +} + +# Regression-test for correct NaN handling in atan2 +check atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000 +check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan +check atan2 nan nan x -nan -nan + +# vector functions +flags="${ULPFLAGS:--q}" +runs= +check __s_log10f 1 && runs=1 +runv= +check __v_log10f 1 && runv=1 +runvn= +check __vn_log10f 1 && runvn=1 +runsv= +if [ $WANT_SVE_MATH -eq 1 ]; then +check __sv_cosf 0 && runsv=1 +check __sv_cos 0 && runsv=1 +check __sv_sinf 0 && runsv=1 +check __sv_sin 0 && runsv=1 +# No guarantees about 
powi accuracy, so regression-test for exactness +# w.r.t. the custom reference impl in ulp_wrappers.h +check -q -f -e 0 __sv_powif 0 inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 __sv_powif -0 -inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 __sv_powif 0 inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 __sv_powif -0 -inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 __sv_powi 0 inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 __sv_powi -0 -inf x 0 1000 100000 && runsv=1 +check -q -f -e 0 __sv_powi 0 inf x -0 -1000 100000 && runsv=1 +check -q -f -e 0 __sv_powi -0 -inf x -0 -1000 100000 && runsv=1 +fi + +while read F LO HI N C +do + t $F $LO $HI $N $C +done << EOF +$(cat $INTERVALS) +EOF + +[ 0 -eq $FAIL ] || { + echo "FAILED $FAIL PASSED $PASS" + exit 1 +} diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acosh.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acosh.tst new file mode 100644 index 000000000000..dd962bd391da --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acosh.tst @@ -0,0 +1,19 @@ +; acosh.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=acosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=acosh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=acosh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=acosh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=acosh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=acosh op1=3ff00000.00000000 result=00000000.00000000 errno=0 +func=acosh op1=3fefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=00000000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=80000000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=bfefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=bff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=acosh op1=7fe01ac0.7f03a83e result=40862e50.541778f1.8cc error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acoshf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acoshf.tst new file mode 100644 index 000000000000..606c615f9b74 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/acoshf.tst @@ -0,0 +1,19 @@ +; acoshf.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=acoshf op1=7fc00001 result=7fc00001 errno=0 +func=acoshf op1=ffc00001 result=7fc00001 errno=0 +func=acoshf op1=7f800001 result=7fc00001 errno=0 status=i +func=acoshf op1=ff800001 result=7fc00001 errno=0 status=i +func=acoshf op1=7f800000 result=7f800000 errno=0 +func=acoshf op1=3f800000 result=00000000 errno=0 +func=acoshf op1=3f7fffff result=7fc00001 errno=EDOM status=i +func=acoshf op1=00000000 result=7fc00001 errno=EDOM status=i +func=acoshf op1=80000000 result=7fc00001 errno=EDOM status=i +func=acoshf op1=bf7fffff result=7fc00001 errno=EDOM status=i +func=acoshf op1=bf800000 result=7fc00001 errno=EDOM status=i +func=acoshf op1=bf800001 result=7fc00001 errno=EDOM status=i +func=acoshf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=acoshf op1=7f767efe result=42b2c19d.83e error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinh.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinh.tst new file mode 100644 index 000000000000..1485dfeffecf --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinh.tst @@ -0,0 +1,18 @@ +; asinh.tst +; +; Copyright (c) 2022-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=asinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=asinh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=asinh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=asinh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=asinh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=asinh op1=fff00000.00000000 result=fff00000.00000000 errno=0 +func=asinh op1=00000000.00000000 result=00000000.00000000 errno=0 +func=asinh op1=80000000.00000000 result=80000000.00000000 errno=0 +; No exception is raised with certain versions of glibc. 
Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=asinh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=asinh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinhf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinhf.tst new file mode 100644 index 000000000000..eb76a5892a70 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/asinhf.tst @@ -0,0 +1,18 @@ +; asinhf.tst +; +; Copyright (c) 2007-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=asinhf op1=7fc00001 result=7fc00001 errno=0 +func=asinhf op1=ffc00001 result=7fc00001 errno=0 +func=asinhf op1=7f800001 result=7fc00001 errno=0 status=i +func=asinhf op1=ff800001 result=7fc00001 errno=0 status=i +func=asinhf op1=7f800000 result=7f800000 errno=0 +func=asinhf op1=ff800000 result=ff800000 errno=0 +func=asinhf op1=00000000 result=00000000 errno=0 +func=asinhf op1=80000000 result=80000000 errno=0 +; No exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=asinhf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=asinhf op1=80000001 result=80000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan.tst new file mode 100644 index 000000000000..4c670553d58f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan.tst @@ -0,0 +1,22 @@ +; atan.tst +; +; Copyright (c) 1999-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atan op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan op1=7ff00000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan op1=fff00000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan op1=00000000.00000000 result=00000000.00000000 errno=0 +func=atan op1=80000000.00000000 result=80000000.00000000 errno=0 +; Inconsistent behavior was detected for the following 2 cases. +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=atan op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=atan op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux + +func=atan op1=3ff00000.00000000 result=3fe921fb.54442d18.469 errno=0 +func=atan op1=bff00000.00000000 result=bfe921fb.54442d18.469 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan2.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan2.tst new file mode 100644 index 000000000000..647b3764072c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan2.tst @@ -0,0 +1,110 @@ +; atan2.tst +; +; Copyright (c) 1999-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atan2 op1=7ff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff80000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff80000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff80000.00000001 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=fff80000.00000001 
result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff80000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff80000.00000001 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff80000.00000001 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=7ff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=fff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=00000000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=80000000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=3ff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff80000.00000001 op2=bff00000.00000000 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=7ff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=7ff00000.00000000 op2=7ff00000.00000000 result=3fe921fb.54442d18.469 errno=0 +func=atan2 op1=7ff00000.00000000 op2=fff00000.00000000 
result=4002d97c.7f3321d2.34f errno=0 +func=atan2 op1=7ff00000.00000000 op2=00000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=7ff00000.00000000 op2=80000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=7ff00000.00000000 op2=3ff00000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=7ff00000.00000000 op2=bff00000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=fff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=fff00000.00000000 op2=7ff00000.00000000 result=bfe921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=fff00000.00000000 result=c002d97c.7f3321d2.34f errno=0 +func=atan2 op1=fff00000.00000000 op2=00000000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=80000000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=3ff00000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=fff00000.00000000 op2=bff00000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=00000000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=00000000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=00000000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=00000000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=00000000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0 +func=atan2 op1=00000000.00000000 op2=fff00000.00000000 result=400921fb.54442d18.469 errno=0 +func=atan2 op1=00000000.00000000 op2=00000000.00000000 result=00000000.00000000 errno=0 +func=atan2 
op1=00000000.00000000 op2=80000000.00000000 result=400921fb.54442d18.469 errno=0 +func=atan2 op1=00000000.00000000 op2=3ff00000.00000000 result=00000000.00000000 errno=0 +func=atan2 op1=00000000.00000000 op2=bff00000.00000000 result=400921fb.54442d18.469 errno=0 +; No exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=atan2 op1=00000000.00000001 op2=3ff00000.00000000 result=00000000.00000001 errno=0 maybestatus=ux +func=atan2 op1=80000000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=80000000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=80000000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=80000000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=80000000.00000000 op2=7ff00000.00000000 result=80000000.00000000 errno=0 +func=atan2 op1=80000000.00000000 op2=fff00000.00000000 result=c00921fb.54442d18.469 errno=0 +func=atan2 op1=80000000.00000000 op2=00000000.00000000 result=80000000.00000000 errno=0 +func=atan2 op1=80000000.00000000 op2=80000000.00000000 result=c00921fb.54442d18.469 errno=0 +func=atan2 op1=80000000.00000000 op2=3ff00000.00000000 result=80000000.00000000 errno=0 +func=atan2 op1=80000000.00000000 op2=bff00000.00000000 result=c00921fb.54442d18.469 errno=0 +; No exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=atan2 op1=80000000.00000001 op2=3ff00000.00000000 result=80000000.00000001 errno=0 maybestatus=ux +func=atan2 op1=3ff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=3ff00000.00000000 op2=fff00000.00000001 
result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=3ff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=3ff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=3ff00000.00000000 op2=7ff00000.00000000 result=00000000.00000000 errno=0 +func=atan2 op1=3ff00000.00000000 op2=fff00000.00000000 result=400921fb.54442d18.469 errno=0 +func=atan2 op1=3ff00000.00000000 op2=00000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=3ff00000.00000000 op2=80000000.00000000 result=3ff921fb.54442d18.469 errno=0 +func=atan2 op1=3ff00000.00000000 op2=3ff00000.00000000 result=3fe921fb.54442d18.469 errno=0 +func=atan2 op1=3ff00000.00000000 op2=bff00000.00000000 result=4002d97c.7f3321d2.34f errno=0 +func=atan2 op1=bff00000.00000000 op2=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=bff00000.00000000 op2=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atan2 op1=bff00000.00000000 op2=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=bff00000.00000000 op2=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atan2 op1=bff00000.00000000 op2=7ff00000.00000000 result=80000000.00000000 errno=0 +func=atan2 op1=bff00000.00000000 op2=fff00000.00000000 result=c00921fb.54442d18.469 errno=0 +func=atan2 op1=bff00000.00000000 op2=00000000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=bff00000.00000000 op2=80000000.00000000 result=bff921fb.54442d18.469 errno=0 +func=atan2 op1=bff00000.00000000 op2=3ff00000.00000000 result=bfe921fb.54442d18.469 errno=0 +func=atan2 op1=bff00000.00000000 op2=bff00000.00000000 result=c002d97c.7f3321d2.34f errno=0 +func=atan2 op1=3ff00000.00000000 op2=3ff00000.00000000 result=3fe921fb.54442d18 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan2f.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan2f.tst new file mode 100644 index 
000000000000..85c5c5d47e10 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atan2f.tst @@ -0,0 +1,121 @@ +; atan2f.tst +; +; Copyright (c) 1999-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atan2f op1=7f800001 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=7fc00001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=ffc00001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=7f800000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=ff800000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=00000000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=80000000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=3f800000 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800001 op2=bf800000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=7fc00001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=ffc00001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=7f800000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=ff800000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=00000000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=80000000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=3f800000 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800001 op2=bf800000 result=7fc00001 errno=0 status=i +func=atan2f op1=7fc00001 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7fc00001 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7fc00001 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=ffc00001 result=7fc00001 errno=0 +func=atan2f 
op1=7fc00001 op2=7f800000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=ff800000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=00000000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=80000000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=3f800000 result=7fc00001 errno=0 +func=atan2f op1=7fc00001 op2=bf800000 result=7fc00001 errno=0 +func=atan2f op1=ffc00001 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ffc00001 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ffc00001 op2=7fc00001 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=7f800000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=ff800000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=00000000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=80000000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=3f800000 result=ffc00001 errno=0 +func=atan2f op1=ffc00001 op2=bf800000 result=ffc00001 errno=0 +func=atan2f op1=7f800000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=7f800000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=7f800000 op2=ffc00001 result=7fc00001 errno=0 +func=atan2f op1=7f800000 op2=7f800000 result=3f490fda.a22 errno=0 +func=atan2f op1=7f800000 op2=ff800000 result=4016cbe3.f99 errno=0 +func=atan2f op1=7f800000 op2=00000000 result=3fc90fda.a22 errno=0 +func=atan2f op1=7f800000 op2=80000000 result=3fc90fda.a22 errno=0 +func=atan2f op1=7f800000 op2=3f800000 result=3fc90fda.a22 errno=0 +func=atan2f op1=7f800000 op2=bf800000 result=3fc90fda.a22 errno=0 +func=atan2f op1=ff800000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=ff800000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=ff800000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=ff800000 op2=7f800000 
result=bf490fda.a22 errno=0 +func=atan2f op1=ff800000 op2=ff800000 result=c016cbe3.f99 errno=0 +func=atan2f op1=ff800000 op2=00000000 result=bfc90fda.a22 errno=0 +func=atan2f op1=ff800000 op2=80000000 result=bfc90fda.a22 errno=0 +func=atan2f op1=ff800000 op2=3f800000 result=bfc90fda.a22 errno=0 +func=atan2f op1=ff800000 op2=bf800000 result=bfc90fda.a22 errno=0 +func=atan2f op1=00000000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=00000000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=00000000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=00000000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=00000000 op2=7f800000 result=00000000 errno=0 +func=atan2f op1=00000000 op2=ff800000 result=40490fda.a22 errno=0 +func=atan2f op1=00000000 op2=00000000 result=00000000 errno=0 +func=atan2f op1=00000000 op2=80000000 result=40490fda.a22 errno=0 +func=atan2f op1=00000000 op2=3f800000 result=00000000 errno=0 +func=atan2f op1=00000000 op2=bf800000 result=40490fda.a22 errno=0 +; No exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=atan2f op1=00000001 op2=3f800000 result=00000001 errno=0 maybestatus=ux + +func=atan2f op1=80000000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=80000000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=80000000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=80000000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=80000000 op2=7f800000 result=80000000 errno=0 +func=atan2f op1=80000000 op2=ff800000 result=c0490fda.a22 errno=0 +func=atan2f op1=80000000 op2=00000000 result=80000000 errno=0 +func=atan2f op1=80000000 op2=80000000 result=c0490fda.a22 errno=0 +func=atan2f op1=80000000 op2=3f800000 result=80000000 errno=0 +func=atan2f op1=80000000 op2=bf800000 result=c0490fda.a22 errno=0 +; No 
exception is raised on certain machines (different version of glibc) +; Same issue encountered with other function similar to x close to 0 +; Could be due to function so boring no flop is involved in some implementations +func=atan2f op1=80000001 op2=3f800000 result=80000001 errno=0 maybestatus=ux + +func=atan2f op1=3f800000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=3f800000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=3f800000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=3f800000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=3f800000 op2=7f800000 result=00000000 errno=0 +func=atan2f op1=3f800000 op2=ff800000 result=40490fda.a22 errno=0 +func=atan2f op1=3f800000 op2=00000000 result=3fc90fda.a22 errno=0 +func=atan2f op1=3f800000 op2=80000000 result=3fc90fda.a22 errno=0 +func=atan2f op1=3f800000 op2=3f800000 result=3f490fda.a22 errno=0 +func=atan2f op1=3f800000 op2=bf800000 result=4016cbe3.f99 errno=0 +func=atan2f op1=bf800000 op2=7f800001 result=7fc00001 errno=0 status=i +func=atan2f op1=bf800000 op2=ff800001 result=7fc00001 errno=0 status=i +func=atan2f op1=bf800000 op2=7fc00001 result=7fc00001 errno=0 +func=atan2f op1=bf800000 op2=ffc00001 result=ffc00001 errno=0 +func=atan2f op1=bf800000 op2=7f800000 result=80000000 errno=0 +func=atan2f op1=bf800000 op2=ff800000 result=c0490fda.a22 errno=0 +func=atan2f op1=bf800000 op2=00000000 result=bfc90fda.a22 errno=0 +func=atan2f op1=bf800000 op2=80000000 result=bfc90fda.a22 errno=0 +func=atan2f op1=bf800000 op2=3f800000 result=bf490fda.a22 errno=0 +func=atan2f op1=bf800000 op2=bf800000 result=c016cbe3.f99 errno=0 +func=atan2f op1=8005f16d op2=002bb601 result=be0a60a5.d88 error=0 +func=atan2f op1=80818ec8 op2=80ba5db9 result=c0222eda.f42 error=0 + +func=atan2f op1=ff7fffff op2=ff7fffff result=c016cbe3.f99 errno=0 +func=atan2f op1=bfc00001 op2=7f7fffff result=80300000.700 errno=0 status=u +func=atan2f op1=80800001 op2=40000000 result=80400000.800 errno=0 status=u diff 
--git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanf.tst new file mode 100644 index 000000000000..0a0bfc24c605 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanf.tst @@ -0,0 +1,22 @@ +; atanf.tst +; +; Copyright (c) 2007-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atanf op1=7fc00001 result=7fc00001 errno=0 +func=atanf op1=ffc00001 result=7fc00001 errno=0 +func=atanf op1=7f800001 result=7fc00001 errno=0 status=i +func=atanf op1=ff800001 result=7fc00001 errno=0 status=i +func=atanf op1=7f800000 result=3fc90fda.a22 errno=0 +func=atanf op1=ff800000 result=bfc90fda.a22 errno=0 +func=atanf op1=00000000 result=00000000 errno=0 +func=atanf op1=80000000 result=80000000 errno=0 +; Inconsistent behavior was detected for the following 2 cases. +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=atanf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=atanf op1=80000001 result=80000001 errno=0 maybestatus=ux + +func=atanf op1=3f800000 result=3f490fda.a22 errno=0 +func=atanf op1=bf800000 result=bf490fda.a22 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanh.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanh.tst new file mode 100644 index 000000000000..d96ff327fcd9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanh.tst @@ -0,0 +1,22 @@ +; atanh.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=atanh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=atanh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atanh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=atanh op1=7ff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=atanh op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=atanh op1=3ff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=atanh op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=atanh op1=3ff00000.00000000 result=7ff00000.00000000 errno=ERANGE status=z +func=atanh op1=bff00000.00000000 result=fff00000.00000000 errno=ERANGE status=z +func=atanh op1=00000000.00000000 result=00000000.00000000 errno=0 +func=atanh op1=80000000.00000000 result=80000000.00000000 errno=0 +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=atanh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=atanh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanhf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanhf.tst new file mode 100644 index 000000000000..21a68a661a11 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/atanhf.tst @@ -0,0 +1,23 @@ +; atanhf.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=atanhf op1=7fc00001 result=7fc00001 errno=0 +func=atanhf op1=ffc00001 result=7fc00001 errno=0 +func=atanhf op1=7f800001 result=7fc00001 errno=0 status=i +func=atanhf op1=ff800001 result=7fc00001 errno=0 status=i +func=atanhf op1=7f800000 result=7fc00001 errno=EDOM status=i +func=atanhf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=atanhf op1=3f800001 result=7fc00001 errno=EDOM status=i +func=atanhf op1=bf800001 result=7fc00001 errno=EDOM status=i +func=atanhf op1=3f800000 result=7f800000 errno=ERANGE status=z +func=atanhf op1=bf800000 result=ff800000 errno=ERANGE status=z +func=atanhf op1=00000000 result=00000000 errno=0 +func=atanhf op1=80000000 result=80000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=atanhf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=atanhf op1=80000001 result=80000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/cbrtf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/cbrtf.tst new file mode 100644 index 000000000000..0dd8d09f1d4f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/cbrtf.tst @@ -0,0 +1,29 @@ +; cbrtf.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=cbrtf op1=7f800000 result=7f800000 errno=0 +func=cbrtf op1=ff800000 result=ff800000 errno=0 +func=cbrtf op1=7f800001 result=7fc00001 errno=0 status=i +func=cbrtf op1=7fc00001 result=7fc00001 errno=0 +func=cbrtf op1=00000000 result=00000000 errno=0 +func=cbrtf op1=00000001 result=26a14517.cc7 errno=0 +func=cbrtf op1=00000002 result=26cb2ff5.29f errno=0 +func=cbrtf op1=00000003 result=26e89768.579 errno=0 +func=cbrtf op1=00000004 result=27000000.000 errno=0 +func=cbrtf op1=00400000 result=2a4b2ff5.29f errno=0 +func=cbrtf op1=00800000 result=2a800000.000 errno=0 +func=cbrtf op1=3f800000 result=3f800000.000 errno=0 +func=cbrtf op1=40000000 result=3fa14517.cc7 errno=0 +func=cbrtf op1=7f7fffff result=54cb2ff4.e63 errno=0 +func=cbrtf op1=80000000 result=80000000 errno=0 +func=cbrtf op1=80000001 result=a6a14517.cc7 errno=0 +func=cbrtf op1=80000002 result=a6cb2ff5.29f errno=0 +func=cbrtf op1=80000003 result=a6e89768.579 errno=0 +func=cbrtf op1=80000004 result=a7000000.000 errno=0 +func=cbrtf op1=80400000 result=aa4b2ff5.29f errno=0 +func=cbrtf op1=80800000 result=aa800000.000 errno=0 +func=cbrtf op1=bf800000 result=bf800000.000 errno=0 +func=cbrtf op1=c0000000 result=bfa14517.cc7 errno=0 +func=cbrtf op1=ff7fffff result=d4cb2ff4.e63 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/cosh.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/cosh.tst new file mode 100644 index 000000000000..c4efacb7272d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/cosh.tst @@ -0,0 +1,15 @@ +; cosh.tst +; +; Copyright (c) 1999-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=cosh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=cosh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=cosh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=cosh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=cosh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=cosh op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=cosh op1=fff00000.00000000 result=7ff00000.00000000 errno=0 +func=cosh op1=ffefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=cosh op1=00000000.00000000 result=3ff00000.00000000 errno=0 +func=cosh op1=80000000.00000000 result=3ff00000.00000000 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/coshf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/coshf.tst new file mode 100644 index 000000000000..2b967e78f4b4 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/coshf.tst @@ -0,0 +1,15 @@ +; coshf.tst +; +; Copyright (c) 2007-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=coshf op1=7fc00001 result=7fc00001 errno=0 +func=coshf op1=ffc00001 result=7fc00001 errno=0 +func=coshf op1=7f800001 result=7fc00001 errno=0 status=i +func=coshf op1=ff800001 result=7fc00001 errno=0 status=i +func=coshf op1=7f800000 result=7f800000 errno=0 +func=coshf op1=7f7fffff result=7f800000 errno=ERANGE status=ox +func=coshf op1=ff800000 result=7f800000 errno=0 +func=coshf op1=ff7fffff result=7f800000 errno=ERANGE status=ox +func=coshf op1=00000000 result=3f800000 errno=0 +func=coshf op1=80000000 result=3f800000 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erfc.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erfc.tst new file mode 100644 index 000000000000..c03fc591da47 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erfc.tst @@ -0,0 +1,23 @@ +; erfc.tst - Directed test cases for erfc +; +; Copyright (c) 2022-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=erfc op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=erfc op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=erfc op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=erfc op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=erfc op1=7ff00000.00000000 result=00000000.00000000 errno=0 +func=erfc op1=7fefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux +; We deliberately turned off errno setting in erf, as standard simply +; state that errno `may` be set to ERANGE in case of underflow. +; As a result the following condition on errno cannot be satisfied. 
+; +; func=erfc op1=403b44af.48b01531 result=00000000.00000000 errno=ERANGE status=ux +; +func=erfc op1=c03b44af.48b01531 result=40000000.00000000 errno=0 +func=erfc op1=403bffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux +func=erfc op1=c03bffff.ffffffff result=40000000.00000000 errno=0 +func=erfc op1=fff00000.00000000 result=40000000.00000000 errno=0 +func=erfc op1=00000000.00000000 result=3ff00000.00000000 errno=0 +func=erfc op1=80000000.00000000 result=3ff00000.00000000 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erfcf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erfcf.tst new file mode 100644 index 000000000000..719baccb2e45 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erfcf.tst @@ -0,0 +1,14 @@ +; erfcf.tst - Directed test cases for erfcf +; +; Copyright (c) 2007-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=erfcf op1=7fc00001 result=7fc00001 errno=0 +func=erfcf op1=ffc00001 result=7fc00001 errno=0 +func=erfcf op1=7f800001 result=7fc00001 errno=0 status=i +func=erfcf op1=ff800001 result=7fc00001 errno=0 status=i +func=erfcf op1=7f800000 result=00000000 errno=0 +func=erfcf op1=7f7fffff result=00000000 errno=ERANGE status=ux +func=erfcf op1=ff800000 result=40000000 errno=0 +func=erfcf op1=00000000 result=3f800000 errno=0 +func=erfcf op1=80000000 result=3f800000 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erff.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erff.tst new file mode 100644 index 000000000000..9b1d3d5114ae --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/erff.tst @@ -0,0 +1,17 @@ +; erff.tst +; +; Copyright (c) 2007-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=erff op1=7fc00001 result=7fc00001 errno=0 +func=erff op1=ffc00001 result=7fc00001 errno=0 +func=erff op1=7f800001 result=7fc00001 errno=0 status=i +func=erff op1=ff800001 result=7fc00001 errno=0 status=i +func=erff op1=7f800000 result=3f800000 errno=0 +func=erff op1=ff800000 result=bf800000 errno=0 +func=erff op1=00000000 result=00000000 errno=ERANGE +func=erff op1=80000000 result=80000000 errno=ERANGE +func=erff op1=00000001 result=00000001 errno=0 status=ux +func=erff op1=80000001 result=80000001 errno=0 status=ux +func=erff op1=3f800000 result=3f57bb3d.3a0 errno=0 +func=erff op1=bf800000 result=bf57bb3d.3a0 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/expm1.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/expm1.tst new file mode 100644 index 000000000000..609d6f479721 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/expm1.tst @@ -0,0 +1,21 @@ +; expm1.tst +; +; Copyright (c) 2009-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=expm1 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=expm1 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=expm1 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=expm1 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=expm1 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=expm1 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=expm1 op1=fff00000.00000000 result=bff00000.00000000 errno=0 +func=expm1 op1=ffefffff.ffffffff result=bff00000.00000000 errno=0 +func=expm1 op1=00000000.00000000 result=00000000.00000000 errno=0 +func=expm1 op1=80000000.00000000 result=80000000.00000000 errno=0 +; Inconsistent behavior was detected for the following 2 cases. +; No exception is raised with certain versions of glibc. 
Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=expm1 op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=expm1 op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/expm1f.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/expm1f.tst new file mode 100644 index 000000000000..44c38420a617 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/expm1f.tst @@ -0,0 +1,57 @@ +; expm1f.tst +; +; Copyright (c) 2009-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=expm1f op1=7fc00001 result=7fc00001 errno=0 +func=expm1f op1=ffc00001 result=7fc00001 errno=0 +func=expm1f op1=7f800001 result=7fc00001 errno=0 status=i +func=expm1f op1=ff800001 result=7fc00001 errno=0 status=i +func=expm1f op1=7f800000 result=7f800000 errno=0 +func=expm1f op1=7f7fffff result=7f800000 errno=ERANGE status=ox +func=expm1f op1=ff800000 result=bf800000 errno=0 +func=expm1f op1=ff7fffff result=bf800000 errno=0 +func=expm1f op1=00000000 result=00000000 errno=0 +func=expm1f op1=80000000 result=80000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. + +func=expm1f op1=00000001 result=00000001 errno=0 maybestatus=ux +func=expm1f op1=80000001 result=80000001 errno=0 maybestatus=ux + +func=expm1f op1=42b145c0 result=7f6ac2dd.9b8 errno=0 + +; Check both sides of the over/underflow thresholds in the code. 
+func=expm1f op1=c2000000 result=bf7fffff.fff error=0 +func=expm1f op1=c2000001 result=bf7fffff.fff error=0 +func=expm1f op1=43000000 result=7f800000 error=overflow +func=expm1f op1=43000001 result=7f800000 error=overflow +func=expm1f op1=c2a80000 result=bf800000.000 error=0 +func=expm1f op1=c2a80001 result=bf800000.000 error=0 + +; Check values for which exp goes denormal. expm1f should not report +; spurious overflow. +func=expm1f op1=c2b00f34 result=bf800000.000 error=0 +func=expm1f op1=c2ce8ed0 result=bf800000.000 error=0 +func=expm1f op1=c2dc6bba result=bf800000.000 error=0 + +; Regression tests for significance loss when the two components of +; the result have opposite sign but similar magnitude +func=expm1f op1=be8516c1 result=be6a652b.0dc error=0 +func=expm1f op1=be851714 result=be6a65ab.0e5 error=0 +func=expm1f op1=be851cc7 result=be6a6e75.111 error=0 +func=expm1f op1=be851d1a result=be6a6ef5.102 error=0 +func=expm1f op1=be851d6d result=be6a6f75.0f2 error=0 +func=expm1f op1=be852065 result=be6a7409.0e4 error=0 +func=expm1f op1=be8520b8 result=be6a7489.0c7 error=0 +func=expm1f op1=be85210b result=be6a7509.0a8 error=0 +func=expm1f op1=be855401 result=be6ac39b.0d5 error=0 +func=expm1f op1=be933307 result=be7fdbf0.d8d error=0 +func=expm1f op1=be92ed6b result=be7f737a.d81 error=0 +func=expm1f op1=be933b90 result=be7fe8be.d76 error=0 +func=expm1f op1=3eb11364 result=3ed38deb.0c0 error=0 +func=expm1f op1=3f28e830 result=3f6f344b.0da error=0 +func=expm1f op1=3eb1578f result=3ed3ee47.13b error=0 +func=expm1f op1=3f50176a result=3fa08e36.fea error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log10.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log10.tst new file mode 100644 index 000000000000..34831436234a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log10.tst @@ -0,0 +1,16 @@ +; log10.tst +; +; Copyright (c) 2007-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=log10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=log10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log10 op1=fff02000.00000000 result=7ff80000.00000001 errno=0 status=i +func=log10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=log10 op1=3ff00000.00000000 result=00000000.00000000 errno=0 +func=log10 op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=log10 op1=00000000.00000000 result=fff00000.00000000 errno=ERANGE status=z +func=log10 op1=80000000.00000000 result=fff00000.00000000 errno=ERANGE status=z +func=log10 op1=80000000.00000001 result=7ff80000.00000001 errno=EDOM status=i diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log10f.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log10f.tst new file mode 100644 index 000000000000..d5744a66f092 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log10f.tst @@ -0,0 +1,69 @@ +; log10f.tst +; +; Copyright (c) 2007-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log10f op1=7fc00001 result=7fc00001 errno=0 +func=log10f op1=ffc00001 result=7fc00001 errno=0 +func=log10f op1=7f800001 result=7fc00001 errno=0 status=i +func=log10f op1=ff800001 result=7fc00001 errno=0 status=i +func=log10f op1=ff810000 result=7fc00001 errno=0 status=i +func=log10f op1=7f800000 result=7f800000 errno=0 +func=log10f op1=3f800000 result=00000000 errno=0 +func=log10f op1=ff800000 result=7fc00001 errno=EDOM status=i +func=log10f op1=00000000 result=ff800000 errno=ERANGE status=z +func=log10f op1=80000000 result=ff800000 errno=ERANGE status=z +func=log10f op1=80000001 result=7fc00001 errno=EDOM status=i + +; Directed tests for the special-case handling of log10 of things +; very near 1 +func=log10f op1=3f81a618 result=3bb62472.b92 error=0 +func=log10f op1=3f876783 result=3cc811f4.26c error=0 +func=log10f op1=3f816af8 result=3b9cc4c7.057 error=0 +func=log10f op1=3f7bed7d result=bbe432cb.e23 error=0 +func=log10f op1=3f803ece result=3a59ff3a.a84 error=0 +func=log10f op1=3f80089f result=38ef9728.aa6 error=0 +func=log10f op1=3f86ab72 result=3cb4b711.457 error=0 +func=log10f op1=3f780854 result=bc60f953.904 error=0 +func=log10f op1=3f7c6d76 result=bbc7fd01.01c error=0 +func=log10f op1=3f85dff6 result=3c9fa76f.81f error=0 +func=log10f op1=3f7b87f4 result=bbfa9edc.be4 error=0 +func=log10f op1=3f81c710 result=3bc4457b.745 error=0 +func=log10f op1=3f80946d result=3b00a140.c06 error=0 +func=log10f op1=3f7e87ea result=bb23cd70.828 error=0 +func=log10f op1=3f811437 result=3b6ee960.b40 error=0 +func=log10f op1=3f858dcf result=3c971d9b.2ea error=0 +func=log10f op1=3f7f61a3 result=ba89b814.4e0 error=0 +func=log10f op1=3f82d642 result=3c1bfb8d.517 error=0 +func=log10f op1=3f80f3bc result=3b52ebe8.c75 error=0 +func=log10f op1=3f85eff9 result=3ca150d9.7e8 error=0 +func=log10f op1=3f843eb8 result=3c68263f.771 error=0 +func=log10f op1=3f78e691 result=bc481cf4.50a error=0 +func=log10f op1=3f87c56f 
result=3cd1b268.5e6 error=0 +func=log10f op1=3f83b711 result=3c4b94c5.918 error=0 +func=log10f op1=3f823b2b result=3bf5eb02.e2a error=0 +func=log10f op1=3f7f2c4e result=bab82c80.519 error=0 +func=log10f op1=3f83fc92 result=3c5a3ba1.543 error=0 +func=log10f op1=3f793956 result=bc3ee04e.03c error=0 +func=log10f op1=3f839ba5 result=3c45caca.92a error=0 +func=log10f op1=3f862f30 result=3ca7de76.16f error=0 +func=log10f op1=3f832a20 result=3c2dc6e9.afd error=0 +func=log10f op1=3f810296 result=3b5fb92a.429 error=0 +func=log10f op1=3f7e58c9 result=bb38655a.0a4 error=0 +func=log10f op1=3f8362e7 result=3c39cc65.d15 error=0 +func=log10f op1=3f7fdb85 result=b97d9016.40b error=0 +func=log10f op1=3f84484e result=3c6a29f2.f74 error=0 +func=log10f op1=3f861862 result=3ca5819e.f2d error=0 +func=log10f op1=3f7c027b result=bbdf912d.440 error=0 +func=log10f op1=3f867803 result=3caf6744.34d error=0 +func=log10f op1=3f789a89 result=bc509bce.458 error=0 +func=log10f op1=3f8361d9 result=3c399347.379 error=0 +func=log10f op1=3f7d3ac3 result=bb9ad93a.93d error=0 +func=log10f op1=3f7ee241 result=baf8bd12.a62 error=0 +func=log10f op1=3f83a1fd result=3c4721bd.0a4 error=0 +func=log10f op1=3f840da3 result=3c5dd375.675 error=0 +func=log10f op1=3f79c2fe result=bc2f8a60.8c5 error=0 +func=log10f op1=3f854a93 result=3c901cc9.add error=0 +func=log10f op1=3f87a50a result=3cce6125.cd6 error=0 +func=log10f op1=3f818bf5 result=3baaee68.a55 error=0 +func=log10f op1=3f830a44 result=3c2705c4.d87 error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log1p.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log1p.tst new file mode 100644 index 000000000000..9ee8c62fc9c0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log1p.tst @@ -0,0 +1,22 @@ +; log1p.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log1p op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=log1p op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=log1p op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log1p op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log1p op1=fff02000.00000000 result=7ff80000.00000001 errno=0 status=i +func=log1p op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +; Cases 6, 9 , 10, 11, 12 fail with certain versions of GLIBC and not others. +; The main reason seems to be the handling of errno and exceptions. + +func=log1p op1=00000000.00000000 result=00000000.00000000 errno=0 +func=log1p op1=80000000.00000000 result=80000000.00000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=log1p op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=log1p op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log1pf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log1pf.tst new file mode 100644 index 000000000000..aaa01d67c2b3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log1pf.tst @@ -0,0 +1,130 @@ +; log1pf.tst +; +; Copyright (c) 2009-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log1pf op1=7fc00001 result=7fc00001 errno=0 +func=log1pf op1=ffc00001 result=7fc00001 errno=0 +func=log1pf op1=7f800001 result=7fc00001 errno=0 status=i +func=log1pf op1=ff800001 result=7fc00001 errno=0 status=i +func=log1pf op1=ff810000 result=7fc00001 errno=0 status=i +func=log1pf op1=7f800000 result=7f800000 errno=0 + +; Cases 6, 9 , 10, 11, 12 fail with certain versions of GLIBC and not others. 
+; The main reason seems to be the handling of errno and exceptions. + +func=log1pf op1=00000000 result=00000000 errno=0 +func=log1pf op1=80000000 result=80000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=log1pf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=log1pf op1=80000001 result=80000001 errno=0 maybestatus=ux + +func=log1pf op1=3f1e91ee result=3ef6d127.fdb errno=0 +func=log1pf op1=3f201046 result=3ef8a881.fba errno=0 +func=log1pf op1=3f21b916 result=3efab23b.f9f errno=0 +func=log1pf op1=3f21bde6 result=3efab821.fee errno=0 +func=log1pf op1=3f22a5ee result=3efbd435.ff2 errno=0 +func=log1pf op1=3f231b56 result=3efc63b7.e26 errno=0 +func=log1pf op1=3f23ce96 result=3efd3e83.fc8 errno=0 +func=log1pf op1=3eee18c6 result=3ec38576.02e errno=0 +func=log1pf op1=3eee2f41 result=3ec394ce.057 errno=0 +func=log1pf op1=3eee770d result=3ec3c5cc.00c errno=0 +func=log1pf op1=3eee7fed result=3ec3cbda.065 errno=0 +func=log1pf op1=3eee8fb2 result=3ec3d69c.008 errno=0 +func=log1pf op1=3eeeb8eb result=3ec3f2ba.061 errno=0 +func=log1pf op1=3eeeccfd result=3ec4006a.01d errno=0 +func=log1pf op1=3eeef5f0 result=3ec41c56.020 errno=0 +func=log1pf op1=3eeeff12 result=3ec42290.00c errno=0 +func=log1pf op1=3eef05cf result=3ec42728.052 errno=0 +func=log1pf op1=3eef13d3 result=3ec430b6.00e errno=0 +func=log1pf op1=3eef2e70 result=3ec442da.04a errno=0 +func=log1pf op1=3eef3fbf result=3ec44ea6.055 errno=0 +func=log1pf op1=3eef3feb result=3ec44ec4.021 errno=0 +func=log1pf op1=3eef4399 result=3ec45146.011 errno=0 +func=log1pf op1=3eef452e result=3ec4525a.049 errno=0 +func=log1pf op1=3eef4ea9 result=3ec458d0.020 errno=0 +func=log1pf op1=3eef7365 result=3ec471d8.05e errno=0 +func=log1pf op1=3eefa38f result=3ec492a8.003 errno=0 +func=log1pf op1=3eefb1f1 result=3ec49c74.015 errno=0 +func=log1pf op1=3eefb334 result=3ec49d50.023 errno=0 +func=log1pf 
op1=3eefb3c1 result=3ec49db0.0bf errno=0 +func=log1pf op1=3eefb591 result=3ec49eec.15d errno=0 +func=log1pf op1=3eefd736 result=3ec4b5d6.02d errno=0 +func=log1pf op1=3eefd797 result=3ec4b618.114 errno=0 +func=log1pf op1=3eefee5d result=3ec4c59a.071 errno=0 +func=log1pf op1=3eeffff4 result=3ec4d194.0a7 errno=0 +func=log1pf op1=3ef00cd1 result=3ec4da56.025 errno=0 +func=log1pf op1=3ef0163a result=3ec4e0be.07a errno=0 +func=log1pf op1=3ef01e89 result=3ec4e666.007 errno=0 +func=log1pf op1=3ef02004 result=3ec4e768.00a errno=0 +func=log1pf op1=3ef02c40 result=3ec4efbc.017 errno=0 +func=log1pf op1=3ef05b50 result=3ec50fc4.031 errno=0 +func=log1pf op1=3ef05bb1 result=3ec51006.05f errno=0 +func=log1pf op1=3ef0651b result=3ec5166e.0d9 errno=0 +func=log1pf op1=3ef06609 result=3ec51710.02a errno=0 +func=log1pf op1=3ef0666a result=3ec51752.049 errno=0 +func=log1pf op1=3ef0791e result=3ec5240c.0a8 errno=0 +func=log1pf op1=3ef07d46 result=3ec526e0.00e errno=0 +func=log1pf op1=3ef091fd result=3ec534f8.03c errno=0 +func=log1pf op1=3ef09602 result=3ec537b4.128 errno=0 +func=log1pf op1=3ef09848 result=3ec53940.044 errno=0 +func=log1pf op1=3ef0a04f result=3ec53eb6.07d errno=0 +func=log1pf op1=3ef0ab6a result=3ec54644.062 errno=0 +func=log1pf op1=3ef0ae49 result=3ec54838.002 errno=0 +func=log1pf op1=3ef0c1b8 result=3ec55570.000 errno=0 +func=log1pf op1=3ef0ca06 result=3ec55b16.00d errno=0 +func=log1pf op1=3ef0cc29 result=3ec55c8a.095 errno=0 +func=log1pf op1=3ef0d228 result=3ec5609e.04f errno=0 +func=log1pf op1=3ef0d8c0 result=3ec5651a.05e errno=0 +func=log1pf op1=3ef0dc0c result=3ec56758.029 errno=0 +func=log1pf op1=3ef0e0e8 result=3ec56aa6.02e errno=0 +func=log1pf op1=3ef0e502 result=3ec56d70.102 errno=0 +func=log1pf op1=3ef0e754 result=3ec56f04.017 errno=0 +func=log1pf op1=3ef0efe9 result=3ec574da.01c errno=0 +func=log1pf op1=3ef0f309 result=3ec576fa.016 errno=0 +func=log1pf op1=3ef0f499 result=3ec5780a.005 errno=0 +func=log1pf op1=3ef0f6c2 result=3ec57982.083 errno=0 +func=log1pf 
op1=3ef0f852 result=3ec57a92.05d errno=0 +func=log1pf op1=3ef0f9e2 result=3ec57ba2.02e errno=0 +func=log1pf op1=3ef119ee result=3ec5916c.024 errno=0 +func=log1pf op1=3ef11edf result=3ec594c8.03d errno=0 +func=log1pf op1=3ef128c4 result=3ec59b82.001 errno=0 +func=log1pf op1=3ef12ac1 result=3ec59cdc.04b errno=0 +func=log1pf op1=3ef12fea result=3ec5a05e.045 errno=0 +func=log1pf op1=3ef131e7 result=3ec5a1b8.05a errno=0 +func=log1pf op1=3ef134e1 result=3ec5a3be.00e errno=0 +func=log1pf op1=3ef1397a result=3ec5a6de.127 errno=0 +func=log1pf op1=3ef13ade result=3ec5a7d0.0f6 errno=0 +func=log1pf op1=3ef13c0d result=3ec5a89e.054 errno=0 +func=log1pf op1=3ef13d71 result=3ec5a990.016 errno=0 +func=log1pf op1=3ef14074 result=3ec5ab9c.12c errno=0 +func=log1pf op1=3ef146a0 result=3ec5afce.035 errno=0 +func=log1pf op1=3ef14a39 result=3ec5b240.024 errno=0 +func=log1pf op1=3ef14d39 result=3ec5b44a.00c errno=0 +func=log1pf op1=3ef152a3 result=3ec5b7f8.04d errno=0 +func=log1pf op1=3ef170a1 result=3ec5cc5a.021 errno=0 +func=log1pf op1=3ef17855 result=3ec5d196.0dc errno=0 +func=log1pf op1=3ef17ece result=3ec5d5fc.010 errno=0 +func=log1pf op1=3ef1810c result=3ec5d782.08e errno=0 +func=log1pf op1=3ef18da9 result=3ec5e014.0ae errno=0 +func=log1pf op1=3ef19054 result=3ec5e1e4.1a2 errno=0 +func=log1pf op1=3ef190ea result=3ec5e24a.048 errno=0 +func=log1pf op1=3ef1a739 result=3ec5f172.0d8 errno=0 +func=log1pf op1=3ef1a83c result=3ec5f222.018 errno=0 +func=log1pf op1=3ef1bbcc result=3ec5ff6c.09d errno=0 +func=log1pf op1=3ef1bd3c result=3ec60066.03a errno=0 +func=log1pf op1=3ef1d6ee result=3ec611da.056 errno=0 +func=log1pf op1=3ef1de36 result=3ec616cc.01b errno=0 +func=log1pf op1=3ef1e623 result=3ec61c2e.008 errno=0 +func=log1pf op1=3ef1e9b1 result=3ec61e98.029 errno=0 +func=log1pf op1=3ef1ee19 result=3ec62196.0d8 errno=0 +func=log1pf op1=3ef1f13a result=3ec623b6.039 errno=0 +func=log1pf op1=3ef1f1a7 result=3ec62400.091 errno=0 +func=log1pf op1=3ef1f214 result=3ec6244a.0e8 errno=0 +func=log1pf 
op1=3ef206e1 result=3ec6326a.09b errno=0 +func=log1pf op1=3ef21245 result=3ec63a26.012 errno=0 +func=log1pf op1=3ef217fd result=3ec63e08.048 errno=0 +func=log1pf op1=3ef2186a result=3ec63e52.063 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log2.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log2.tst new file mode 100644 index 000000000000..5d1eb9b877e8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log2.tst @@ -0,0 +1,21 @@ +; Directed test cases for log2 +; +; Copyright (c) 2018-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log2 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=log2 op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=log2 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log2 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=log2 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=log2 op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=log2 op1=7fefffff.ffffffff result=408fffff.ffffffff.ffa errno=0 +func=log2 op1=ffefffff.ffffffff result=7ff80000.00000001 errno=EDOM status=i +func=log2 op1=3ff00000.00000000 result=00000000.00000000 errno=0 +func=log2 op1=bff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i +func=log2 op1=00000000.00000000 result=fff00000.00000000 errno=ERANGE status=z +func=log2 op1=80000000.00000000 result=fff00000.00000000 errno=ERANGE status=z +func=log2 op1=00000000.00000001 result=c090c800.00000000 errno=0 +func=log2 op1=80000000.00000001 result=7ff80000.00000001 errno=EDOM status=i +func=log2 op1=40000000.00000000 result=3ff00000.00000000 errno=0 +func=log2 op1=3fe00000.00000000 result=bff00000.00000000 errno=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log2f.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log2f.tst new file mode 
100644 index 000000000000..4e08110878d6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/log2f.tst @@ -0,0 +1,27 @@ +; log2f.tst - Directed test cases for log2f +; +; Copyright (c) 2017-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=log2f op1=7fc00001 result=7fc00001 errno=0 +func=log2f op1=ffc00001 result=7fc00001 errno=0 +func=log2f op1=7f800001 result=7fc00001 errno=0 status=i +func=log2f op1=ff800001 result=7fc00001 errno=0 status=i +func=log2f op1=ff810000 result=7fc00001 errno=0 status=i +func=log2f op1=7f800000 result=7f800000 errno=0 +func=log2f op1=ff800000 result=7fc00001 errno=EDOM status=i +func=log2f op1=3f800000 result=00000000 errno=0 +func=log2f op1=00000000 result=ff800000 errno=ERANGE status=z +func=log2f op1=80000000 result=ff800000 errno=ERANGE status=z +func=log2f op1=80000001 result=7fc00001 errno=EDOM status=i + +func=log2f op1=3f7d70a4 result=bc6d8f8b.7d4 error=0 +func=log2f op1=3f604189 result=be4394c8.395 error=0 +func=log2f op1=3f278034 result=bf1caa73.88e error=0 +func=log2f op1=3edd3c36 result=bf9af3b9.619 error=0 +func=log2f op1=3e61259a result=c00bdb95.650 error=0 +func=log2f op1=3f8147ae result=3c6b3267.d6a error=0 +func=log2f op1=3f8fbe77 result=3e2b5fe2.a1c error=0 +func=log2f op1=3fac3eea result=3edb4d5e.1fc error=0 +func=log2f op1=3fd6e632 result=3f3f5d3a.827 error=0 +func=log2f op1=40070838 result=3f89e055.a0a error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/sinh.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/sinh.tst new file mode 100644 index 000000000000..d6a3da896693 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/sinh.tst @@ -0,0 +1,21 @@ +; sinh.tst +; +; Copyright (c) 1999-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=sinh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=sinh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=sinh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=sinh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=sinh op1=7ff00000.00000000 result=7ff00000.00000000 errno=0 +func=sinh op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox +func=sinh op1=fff00000.00000000 result=fff00000.00000000 errno=0 +func=sinh op1=ffefffff.ffffffff result=fff00000.00000000 errno=ERANGE status=ox +func=sinh op1=00000000.00000000 result=00000000.00000000 errno=0 +func=sinh op1=80000000.00000000 result=80000000.00000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=sinh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=sinh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/sinhf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/sinhf.tst new file mode 100644 index 000000000000..5f7bd1b04137 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/sinhf.tst @@ -0,0 +1,21 @@ +; sinhf.tst +; +; Copyright (c) 2009-2023, Arm Limited. 
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=sinhf op1=7fc00001 result=7fc00001 errno=0 +func=sinhf op1=ffc00001 result=7fc00001 errno=0 +func=sinhf op1=7f800001 result=7fc00001 errno=0 status=i +func=sinhf op1=ff800001 result=7fc00001 errno=0 status=i +func=sinhf op1=7f800000 result=7f800000 errno=0 +func=sinhf op1=7f7fffff result=7f800000 errno=ERANGE status=ox +func=sinhf op1=ff800000 result=ff800000 errno=0 +func=sinhf op1=ff7fffff result=ff800000 errno=ERANGE status=ox +func=sinhf op1=00000000 result=00000000 errno=0 +func=sinhf op1=80000000 result=80000000 errno=0 + +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +func=sinhf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=sinhf op1=80000001 result=80000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanf.tst new file mode 100644 index 000000000000..3161f70f4361 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanf.tst @@ -0,0 +1,25 @@ +; tanf.tst +; +; Copyright (c) 2022-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=tanf op1=7fc00001 result=7fc00001 errno=0 +func=tanf op1=ffc00001 result=7fc00001 errno=0 +func=tanf op1=7f800001 result=7fc00001 errno=0 status=i +func=tanf op1=ff800001 result=7fc00001 errno=0 status=i +func=tanf op1=7f800000 result=7fc00001 errno=EDOM status=i +func=tanf op1=ff800000 result=7fc00001 errno=EDOM status=i +func=tanf op1=00000000 result=00000000 errno=0 +func=tanf op1=80000000 result=80000000 errno=0 +; SDCOMP-26094: check tanf in the cases for which the range reducer +; returns values furthest beyond its nominal upper bound of pi/4. 
+func=tanf op1=46427f1b result=3f80396d.599 error=0 +func=tanf op1=4647e568 result=3f8039a6.c9f error=0 +func=tanf op1=46428bac result=3f803a03.148 error=0 +func=tanf op1=4647f1f9 result=3f803a3c.852 error=0 +func=tanf op1=4647fe8a result=3f803ad2.410 error=0 +func=tanf op1=45d8d7f1 result=bf800669.901 error=0 +func=tanf op1=45d371a4 result=bf800686.3cd error=0 +func=tanf op1=45ce0b57 result=bf8006a2.e9a error=0 +func=tanf op1=45d35882 result=bf80071b.bc4 error=0 +func=tanf op1=45cdf235 result=bf800738.693 error=0 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanh.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanh.tst new file mode 100644 index 000000000000..78776e6f3924 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanh.tst @@ -0,0 +1,18 @@ +; tanh.tst +; +; Copyright (c) 1999-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=tanh op1=7ff80000.00000001 result=7ff80000.00000001 errno=0 +func=tanh op1=fff80000.00000001 result=7ff80000.00000001 errno=0 +func=tanh op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=tanh op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i +func=tanh op1=7ff00000.00000000 result=3ff00000.00000000 errno=0 +func=tanh op1=fff00000.00000000 result=bff00000.00000000 errno=0 +func=tanh op1=00000000.00000000 result=00000000.00000000 errno=0 +func=tanh op1=80000000.00000000 result=80000000.00000000 errno=0 +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. 
+func=tanh op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux +func=tanh op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanhf.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanhf.tst new file mode 100644 index 000000000000..603e3107e44f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/directed/tanhf.tst @@ -0,0 +1,20 @@ +; tanhf.tst +; +; Copyright (c) 2007-2023, Arm Limited. +; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +func=tanhf op1=7fc00001 result=7fc00001 errno=0 +func=tanhf op1=ffc00001 result=7fc00001 errno=0 +func=tanhf op1=7f800001 result=7fc00001 errno=0 status=i +func=tanhf op1=ff800001 result=7fc00001 errno=0 status=i +func=tanhf op1=7f800000 result=3f800000 errno=0 +func=tanhf op1=ff800000 result=bf800000 errno=0 +func=tanhf op1=00000000 result=00000000 errno=0 +func=tanhf op1=80000000 result=80000000 errno=0 +; No exception is raised with certain versions of glibc. Functions +; approximated by x near zero may not generate/implement flops and +; thus may not raise exceptions. +; func=tanhf op1=00000001 result=00000001 errno=0 maybestatus=ux +; func=tanhf op1=80000001 result=80000001 errno=0 maybestatus=ux +func=tanhf op1=00000001 result=00000001 errno=0 maybestatus=ux +func=tanhf op1=80000001 result=80000001 errno=0 maybestatus=ux diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/random/double.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/random/double.tst new file mode 100644 index 000000000000..d83283ef7864 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/random/double.tst @@ -0,0 +1,6 @@ +!! double.tst - Random test case specification for DP functions +!! +!! Copyright (c) 1999-2023, Arm Limited. +!! 
SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +test log10 10000 diff --git a/contrib/arm-optimized-routines/pl/math/test/testcases/random/float.tst b/contrib/arm-optimized-routines/pl/math/test/testcases/random/float.tst new file mode 100644 index 000000000000..fa77efecfabb --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/testcases/random/float.tst @@ -0,0 +1,8 @@ +!! float.tst - Random test case specification for SP functions +!! +!! Copyright (c) 2022-2023, Arm Limited. +!! SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +test erff 10000 +test log10f 10000 +test tanf 10000 diff --git a/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h b/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h new file mode 100644 index 000000000000..5e3133e1db4c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/ulp_funcs.h @@ -0,0 +1,66 @@ +/* + * Function entries for ulp. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifdef __vpcs + +#define _ZVF1(f) SF1 (f) VF1 (f) ZVNF1 (f) +#define _ZVD1(f) SD1 (f) VD1 (f) ZVND1 (f) +#define _ZVF2(f) SF2 (f) VF2 (f) ZVNF2 (f) +#define _ZVD2(f) SD2 (f) VD2 (f) ZVND2 (f) + +#elif __aarch64 + +#define _ZVF1(f) SF1 (f) VF1 (f) +#define _ZVD1(f) SD1 (f) VD1 (f) +#define _ZVF2(f) SF2 (f) VF2 (f) +#define _ZVD2(f) SD2 (f) VD2 (f) + +#elif WANT_VMATH + +#define _ZVF1(f) SF1 (f) +#define _ZVD1(f) SD1 (f) +#define _ZVF2(f) SF2 (f) +#define _ZVD2(f) SD2 (f) + +#else + +#define _ZVF1(f) +#define _ZVD1(f) +#define _ZVF2(f) +#define _ZVD2(f) + +#endif + +#if WANT_SVE_MATH + +#define _ZSVF1(f) SVF1 (f) ZSVF1 (f) +#define _ZSVF2(f) SVF2 (f) ZSVF2 (f) +#define _ZSVD1(f) SVD1 (f) ZSVD1 (f) +#define _ZSVD2(f) SVD2 (f) ZSVD2 (f) + +#else + +#define _ZSVF1(f) +#define _ZSVF2(f) +#define _ZSVD1(f) +#define _ZSVD2(f) + +#endif + +#define _ZSF1(f) F1 (f) +#define _ZSF2(f) F2 (f) +#define _ZSD1(f) D1 (f) +#define _ZSD2(f) D2 
(f) + +#include "ulp_funcs_gen.h" + +#if WANT_SVE_MATH +F (__sv_powi, sv_powi, ref_powi, mpfr_powi, 2, 0, d2, 0) +F (_ZGVsMxvv_powk, Z_sv_powk, ref_powi, mpfr_powi, 2, 0, d2, 0) +F (__sv_powif, sv_powif, ref_powif, mpfr_powi, 2, 1, f2, 0) +F (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h b/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h new file mode 100644 index 000000000000..b682e939054a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/test/ulp_wrappers.h @@ -0,0 +1,148 @@ +// clang-format off +/* + * Function wrappers for ulp. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include <stdbool.h> + +#if USE_MPFR +static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { + mpfr_cos(y, x, r); + return mpfr_sin(y, x, r); +} +static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { + mpfr_sin(y, x, r); + return mpfr_cos(y, x, r); +} +static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t rnd) { + mpfr_t y2; + mpfr_init(y2); + mpfr_trunc(y2, y); + return mpfr_pow(ret, x, y2, rnd); +} +#endif + +/* Our implementations of powi/powk are too imprecise to verify + against any established pow implementation. Instead we have the + following simple implementation, against which it is enough to + maintain bitwise reproducibility. Note the test framework expects + the reference impl to be of higher precision than the function + under test. For instance this means that the reference for + double-precision powi will be passed a long double, so to check + bitwise reproducibility we have to cast it back down to + double. This is fine since a round-trip to higher precision and + back down is correctly rounded. 
*/ +#define DECL_POW_INT_REF(NAME, DBL_T, FLT_T, INT_T) \ + static DBL_T NAME (DBL_T in_val, DBL_T y) \ + { \ + INT_T n = (INT_T) round (y); \ + FLT_T acc = 1.0; \ + bool want_recip = n < 0; \ + n = n < 0 ? -n : n; \ + \ + for (FLT_T c = in_val; n; c *= c, n >>= 1) \ + { \ + if (n & 0x1) \ + { \ + acc *= c; \ + } \ + } \ + if (want_recip) \ + { \ + acc = 1.0 / acc; \ + } \ + return acc; \ + } + +DECL_POW_INT_REF(ref_powif, double, float, int) +DECL_POW_INT_REF(ref_powi, long double, double, int) + +#define VF1_WRAP(func) static float v_##func##f(float x) { return __v_##func##f(argf(x))[0]; } +#define VF2_WRAP(func) static float v_##func##f(float x, float y) { return __v_##func##f(argf(x), argf(y))[0]; } +#define VD1_WRAP(func) static double v_##func(double x) { return __v_##func(argd(x))[0]; } +#define VD2_WRAP(func) static double v_##func(double x, double y) { return __v_##func(argd(x), argd(y))[0]; } + +#define VNF1_WRAP(func) static float vn_##func##f(float x) { return __vn_##func##f(argf(x))[0]; } +#define VNF2_WRAP(func) static float vn_##func##f(float x, float y) { return __vn_##func##f(argf(x), argf(y))[0]; } +#define VND1_WRAP(func) static double vn_##func(double x) { return __vn_##func(argd(x))[0]; } +#define VND2_WRAP(func) static double vn_##func(double x, double y) { return __vn_##func(argd(x), argd(y))[0]; } + +#define ZVF1_WRAP(func) static float Z_##func##f(float x) { return _ZGVnN4v_##func##f(argf(x))[0]; } +#define ZVF2_WRAP(func) static float Z_##func##f(float x, float y) { return _ZGVnN4vv_##func##f(argf(x), argf(y))[0]; } +#define ZVD1_WRAP(func) static double Z_##func(double x) { return _ZGVnN2v_##func(argd(x))[0]; } +#define ZVD2_WRAP(func) static double Z_##func(double x, double y) { return _ZGVnN2vv_##func(argd(x), argd(y))[0]; } + +#ifdef __vpcs + +#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func) ZVF1_WRAP(func) +#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func) ZVF2_WRAP(func) +#define ZVND1_WRAP(func) VD1_WRAP(func) 
VND1_WRAP(func) ZVD1_WRAP(func) +#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func) ZVD2_WRAP(func) + +#elif __aarch64__ + +#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func) +#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func) +#define ZVND1_WRAP(func) VD1_WRAP(func) VND1_WRAP(func) +#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func) + +#elif WANT_VMATH + +#define ZVNF1_WRAP(func) VF1_WRAP(func) +#define ZVNF2_WRAP(func) VF2_WRAP(func) +#define ZVND1_WRAP(func) VD1_WRAP(func) +#define ZVND2_WRAP(func) VD2_WRAP(func) + +#else + +#define ZVNF1_WRAP(func) +#define ZVNF2_WRAP(func) +#define ZVND1_WRAP(func) +#define ZVND2_WRAP(func) + +#endif + +#define SVF1_WRAP(func) static float sv_##func##f(float x) { return svretf(__sv_##func##f_x(svargf(x), svptrue_b32())); } +#define SVF2_WRAP(func) static float sv_##func##f(float x, float y) { return svretf(__sv_##func##f_x(svargf(x), svargf(y), svptrue_b32())); } +#define SVD1_WRAP(func) static double sv_##func(double x) { return svretd(__sv_##func##_x(svargd(x), svptrue_b64())); } +#define SVD2_WRAP(func) static double sv_##func(double x, double y) { return svretd(__sv_##func##_x(svargd(x), svargd(y), svptrue_b64())); } + +#define ZSVF1_WRAP(func) static float Z_sv_##func##f(float x) { return svretf(_ZGVsMxv_##func##f(svargf(x), svptrue_b32())); } +#define ZSVF2_WRAP(func) static float Z_sv_##func##f(float x, float y) { return svretf(_ZGVsMxvv_##func##f(svargf(x), svargf(y), svptrue_b32())); } +#define ZSVD1_WRAP(func) static double Z_sv_##func(double x) { return svretd(_ZGVsMxv_##func(svargd(x), svptrue_b64())); } +#define ZSVD2_WRAP(func) static double Z_sv_##func(double x, double y) { return svretd(_ZGVsMxvv_##func(svargd(x), svargd(y), svptrue_b64())); } + +#if WANT_SVE_MATH + +#define ZSVNF1_WRAP(func) SVF1_WRAP(func) ZSVF1_WRAP(func) +#define ZSVNF2_WRAP(func) SVF2_WRAP(func) ZSVF2_WRAP(func) +#define ZSVND1_WRAP(func) SVD1_WRAP(func) ZSVD1_WRAP(func) +#define ZSVND2_WRAP(func) SVD2_WRAP(func) 
ZSVD2_WRAP(func) + +#else + +#define ZSVNF1_WRAP(func) +#define ZSVNF2_WRAP(func) +#define ZSVND1_WRAP(func) +#define ZSVND2_WRAP(func) + +#endif + +/* No wrappers for scalar routines, but PL_SIG will emit them. */ +#define ZSNF1_WRAP(func) +#define ZSNF2_WRAP(func) +#define ZSND1_WRAP(func) +#define ZSND2_WRAP(func) + +#include "ulp_wrappers_gen.h" + +#if WANT_SVE_MATH +static float Z_sv_powi(float x, float y) { return svretf(_ZGVsMxvv_powi(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); } +static float sv_powif(float x, float y) { return svretf(__sv_powif_x(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); } +static double Z_sv_powk(double x, double y) { return svretd(_ZGVsMxvv_powk(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); } +static double sv_powi(double x, double y) { return svretd(__sv_powi_x(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); } +#endif +// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/tools/asinh.sollya b/contrib/arm-optimized-routines/pl/math/tools/asinh.sollya new file mode 100644 index 000000000000..663ee92f3f34 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/asinh.sollya @@ -0,0 +1,28 @@ +// polynomial for approximating asinh(x) +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// Polynomial is used in [2^-26, 1]. However it is least accurate close to 1, so +// we use 2^-6 as the lower bound for coeff generation, which yields sufficiently +// accurate results in [2^-26, 2^-6]. 
+a = 0x1p-6; +b = 1.0; + +f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2); + +approx = proc(poly, d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +poly = 0; +for i from 0 to deg do { + i; + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); +}; + + +display = hexadecimal; +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/asinhf.sollya b/contrib/arm-optimized-routines/pl/math/tools/asinhf.sollya new file mode 100644 index 000000000000..ab115b53b8dc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/asinhf.sollya @@ -0,0 +1,29 @@ +// polynomial for approximating asinh(x) +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 9; + +a = 0x1.0p-12; +b = 1.0; + +f = proc(y) { + return asinh(x); +}; + +approx = proc(poly, d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +poly = x; +for i from 2 to deg do { + p = roundcoefficients(approx(poly,i), [|SG ...|]); + poly = poly + x^i*coeff(p,0); +}; + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 2 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/atan.sollya b/contrib/arm-optimized-routines/pl/math/tools/atan.sollya new file mode 100644 index 000000000000..ad4f33b8516a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/atan.sollya @@ -0,0 +1,23 @@ +// polynomial for approximating atan(x) and atan2(y, x) +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// atan is odd, so approximate with an odd polynomial: +// x + ax^3 + bx^5 + cx^7 + ... +// We generate a, b, c, ... such that we can approximate atan(x) by: +// x + x^3 * (a + bx^2 + cx^4 + ...) 
+ +// Assemble monomials +deg = 20; +mons = [|1,...,deg|]; +for i from 0 to deg-1 do mons[i] = mons[i] * 2 + 1; + +a = 0x1.0p-1022; +b = 1; + +poly = fpminimax(atan(x)-x, mons, [|double ...|], [a;b]); + +display = hexadecimal; +print("coeffs:"); +for i from 0 to deg-1 do coeff(poly,mons[i]); diff --git a/contrib/arm-optimized-routines/pl/math/tools/atanf.sollya b/contrib/arm-optimized-routines/pl/math/tools/atanf.sollya new file mode 100644 index 000000000000..ed88d0ba90f9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/atanf.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating atanf(x) +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// Generate list of monomials: +// Taylor series of atan is of the form x + ax^3 + bx^5 + cx^7 + ... +// So generate a, b, c, ... such that we can approximate atan(x) by: +// x + x^3 * (a + bx^2 + cx^4 + ...) + +deg = 7; + +a = 1.1754943508222875e-38; +b = 1; + +poly = fpminimax((atan(sqrt(x))-sqrt(x))/x^(3/2), deg, [|single ...|], [a;b]); + +display = hexadecimal; +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/cbrt.sollya b/contrib/arm-optimized-routines/pl/math/tools/cbrt.sollya new file mode 100644 index 000000000000..1d43dc73d8cd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/cbrt.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating cbrt(x) in double precision +// +// Copyright (c) 2022-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 3; + +a = 0.5; +b = 1; + + +f = x^(1/3); + +poly = fpminimax(f, deg, [|double ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do round(coeff(poly,i), D, RN); diff --git a/contrib/arm-optimized-routines/pl/math/tools/cbrtf.sollya b/contrib/arm-optimized-routines/pl/math/tools/cbrtf.sollya new file mode 100644 index 000000000000..4e0cc69b46a5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/cbrtf.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating cbrt(x) in single precision +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 3; + +a = 0.5; +b = 1; + + +f = x^(1/3); + +poly = fpminimax(f, deg, [|single ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do round(coeff(poly,i), SG, RN); diff --git a/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya b/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya new file mode 100644 index 000000000000..8c40b4b5db6b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/erfc.sollya @@ -0,0 +1,23 @@ +// polynomial for approximating erfc(x)*exp(x*x) +// +// Copyright (c) 2022-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 12; // poly degree + +// interval bounds +a = 0x1.60dfc14636e2ap0; +b = 0x1.d413cccfe779ap0; + +f = proc(y) { + t = y + a; + return erfc(t) * exp(t*t); +}; + +poly = remez(f(x), deg, [0;b-a], 1, 1e-16); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do round(coeff(poly,i), 52, RN); diff --git a/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya b/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya new file mode 100644 index 000000000000..69c683647af7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/erfcf.sollya @@ -0,0 +1,31 @@ +// polynomial for approximating erfc(x)*exp(x*x) +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 15; // poly degree + +// interval bounds +a = 0x1.0p-26; +b = 2; + +f = proc(y) { + return erfc(y) * exp(y*y); +}; + +approx = proc(poly, d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +poly = 0; +for i from 0 to deg do { + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); + print(i); +}; + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/expm1.sollya b/contrib/arm-optimized-routines/pl/math/tools/expm1.sollya new file mode 100644 index 000000000000..7b6f324eb247 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/expm1.sollya @@ -0,0 +1,21 @@ +// polynomial for approximating exp(x)-1 in double precision +// +// Copyright (c) 2022-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 12; + +a = -log(2)/2; +b = log(2)/2; + +f = proc(y) { + return exp(y)-1; +}; + +poly = fpminimax(f(x), deg, [|double ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 2 to deg do round(coeff(poly,i), D, RN); diff --git a/contrib/arm-optimized-routines/pl/math/tools/expm1f.sollya b/contrib/arm-optimized-routines/pl/math/tools/expm1f.sollya new file mode 100644 index 000000000000..efdf1bd301e0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/expm1f.sollya @@ -0,0 +1,21 @@ +// polynomial for approximating exp(x)-1 in single precision +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 5; + +a = -log(2)/2; +b = log(2)/2; + +f = proc(y) { + return exp(y)-1; +}; + +poly = fpminimax(f(x), deg, [|single ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 2 to deg do round(coeff(poly,i), SG, RN); diff --git a/contrib/arm-optimized-routines/pl/math/tools/log10.sollya b/contrib/arm-optimized-routines/pl/math/tools/log10.sollya new file mode 100644 index 000000000000..85d1d15c1698 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/log10.sollya @@ -0,0 +1,44 @@ +// polynomial for approximating log10(1+x) +// +// Copyright (c) 2019-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 6; // poly degree +// |log10(1+x)| > 0x1p-5 outside the interval +a = -0x1.p-5; +b = 0x1.p-5; + +ln10 = evaluate(log(10),0); +invln10hi = double(1/ln10 + 0x1p21) - 0x1p21; // round away last 21 bits +invln10lo = double(1/ln10 - invln10hi); + +// find log10(1+x)/x polynomial with minimal relative error +// (minimal relative error polynomial for log10(1+x) is the same * x) +deg = deg-1; // because of /x + +// f = log(1+x)/x; using taylor series +f = 0; +for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; +f = f/ln10; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| +approx = proc(poly,d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = invln10hi + invln10lo; +for i from 1 to deg do { + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); +}; +display = hexadecimal; +print("invln10hi:", invln10hi); +print("invln10lo:", invln10lo); +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); + +display = decimal; +print("in [",a,b,"]"); diff --git a/contrib/arm-optimized-routines/pl/math/tools/log10f.sollya b/contrib/arm-optimized-routines/pl/math/tools/log10f.sollya new file mode 100644 index 000000000000..94bf32f2c449 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/log10f.sollya @@ -0,0 +1,37 @@ +// polynomial for approximating log10f(1+x) +// +// Copyright (c) 2019-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +// Computation of log10f(1+x) will be carried out in double precision + +deg = 4; // poly degree +// [OFF; 2*OFF] is divided in 2^4 intervals with OFF~0.7 +a = -0.04375; +b = 0.04375; + +// find log(1+x)/x polynomial with minimal relative error +// (minimal relative error polynomial for log(1+x) is the same * x) +deg = deg-1; // because of /x + +// f = log(1+x)/x; using taylor series +f = 0; +for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| +approx = proc(poly,d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = 1; +for i from 1 to deg do { + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); +}; + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do double(coeff(poly,i)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/log1p.sollya b/contrib/arm-optimized-routines/pl/math/tools/log1p.sollya new file mode 100644 index 000000000000..598a36af0339 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/log1p.sollya @@ -0,0 +1,30 @@ +// polynomial for approximating log(1+x) in double precision +// +// Copyright (c) 2022-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 20; + +a = sqrt(2)/2-1; +b = sqrt(2)-1; + +f = proc(y) { + return log(1+y); +}; + +approx = proc(poly, d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +poly = x; +for i from 2 to deg do { + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); +}; + + +print("coeffs:"); +display = hexadecimal; +for i from 2 to deg do coeff(poly,i); +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); diff --git a/contrib/arm-optimized-routines/pl/math/tools/log1pf.sollya b/contrib/arm-optimized-routines/pl/math/tools/log1pf.sollya new file mode 100644 index 000000000000..cc1db10e4c0c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/log1pf.sollya @@ -0,0 +1,21 @@ +// polynomial for approximating log(1+x) in single precision +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 10; + +a = -0.25; +b = 0.5; + +f = proc(y) { + return log(1+y); +}; + +poly = fpminimax(f(x), deg, [|single ...|], [a;b]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 2 to deg do round(coeff(poly,i), SG, RN); diff --git a/contrib/arm-optimized-routines/pl/math/tools/tan.sollya b/contrib/arm-optimized-routines/pl/math/tools/tan.sollya new file mode 100644 index 000000000000..bb0bb28270e3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/tan.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating double precision tan(x) +// +// Copyright (c) 2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 8; + +// interval bounds +a = 0x1.0p-126; +b = pi / 8; + +display = hexadecimal; + +f = (tan(sqrt(x))-sqrt(x))/x^(3/2); +poly = fpminimax(f, deg, [|double ...|], [a*a;b*b]); + +//print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/tanf.sollya b/contrib/arm-optimized-routines/pl/math/tools/tanf.sollya new file mode 100644 index 000000000000..f4b49b40ae64 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/tanf.sollya @@ -0,0 +1,78 @@ +// polynomial for approximating single precision tan(x) +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +dtype = single; + +mthd = 0; // approximate tan +deg = 5; // poly degree + +// // Uncomment for cotan +// mthd = 1; // approximate cotan +// deg = 3; // poly degree + +// interval bounds +a = 0x1.0p-126; +b = pi / 4; + +print("Print some useful constants"); +display = hexadecimal!; +if (dtype==double) then { prec = 53!; } +else if (dtype==single) then { prec = 23!; }; + +print("pi/4"); +pi/4; + +// Setup precisions (display and computation) +display = decimal!; +prec=128!; +save_prec=prec; + +// +// Select function to approximate with Sollya +// +if(mthd==0) then { + s = "x + x^3 * P(x^2)"; + g = tan(x); + F = proc(P) { return x + x^3 * P(x^2); }; + f = (g(sqrt(x))-sqrt(x))/(x*sqrt(x)); + init_poly = 0; + // Display info + print("Approximate g(x) =", g, "as F(x)=", s, "."); + poly = fpminimax(f, deg, [|dtype ...|], [a*a;b*b]); +} +else if (mthd==1) then { + s = "1/x + x * P(x^2)"; + g = 1 / tan(x); + F = proc(P) { return 1/x + x * P(x^2); }; + f = (g(sqrt(x))-1/sqrt(x))/(sqrt(x)); + init_poly = 0; + deg_init_poly = -1; // a value such that we actually start by building constant coefficient + // Display info + print("Approximate 
g(x) =", g, "as F(x)=", s, "."); + // Fpminimax used to minimise absolute error + approx_fpminimax = proc(func, poly, d) { + return fpminimax(func - poly / x^-(deg-d), 0, [|dtype|], [a;b], absolute, floating); + }; + // Optimise all coefficients at once + poly = fpminimax(f, [|0,...,deg|], [|dtype ...|], [a;b], absolute, floating); +}; + + +// +// Display coefficients in Sollya +// +display = hexadecimal!; +if (dtype==double) then { prec = 53!; } +else if (dtype==single) then { prec = 23!; }; +print("_coeffs :_ hex"); +for i from 0 to deg do coeff(poly, i); + +// Compute errors +display = hexadecimal!; +d_rel_err = dirtyinfnorm(1-F(poly)/g(x), [a;b]); +d_abs_err = dirtyinfnorm(g(x)-F(poly), [a;b]); +print("dirty rel error:", d_rel_err); +print("dirty abs error:", d_abs_err); +print("in [",a,b,"]"); diff --git a/contrib/arm-optimized-routines/pl/math/tools/v_erf.sollya b/contrib/arm-optimized-routines/pl/math/tools/v_erf.sollya new file mode 100644 index 000000000000..394ba377df12 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/v_erf.sollya @@ -0,0 +1,20 @@ +// polynomial for approximating erf(x). +// To generate coefficients for interval i (0 to 47) do: +// $ sollya v_erf.sollya $i +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +scale = 1/8; +deg = 9; + +itv = parse(__argv[0]); +if (itv == 0) then { a = 0x1p-1022; } +else { a = itv * scale; }; + +prec=256; + +poly = fpminimax(erf(scale*x+a), deg, [|D ...|], [0; 1]); + +display = hexadecimal; +for i from 0 to deg do coeff(poly, i);
\ No newline at end of file diff --git a/contrib/arm-optimized-routines/pl/math/tools/v_erfc.sollya b/contrib/arm-optimized-routines/pl/math/tools/v_erfc.sollya new file mode 100644 index 000000000000..3b03ba07863d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/v_erfc.sollya @@ -0,0 +1,46 @@ +// polynomial for approximating erfc(x)*exp(x*x) +// +// Copyright (c) 2022-2023, Arm Limited. +// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 12; // poly degree + +itv = parse(__argv[0]); + +bounds = [|3.725290298461914e-9, + 0.18920711500272103, + 0.41421356237309515, + 0.681792830507429, + 1, + 1.378414230005442, + 1.8284271247461903, + 2.363585661014858, + 3, + 3.756828460010884, + 4.656854249492381, + 5.727171322029716, + 7, + 8.513656920021768, + 10.313708498984761, + 12.454342644059432, + 15, + 18.027313840043536, + 21.627416997969522, + 25.908685288118864, + 31|]; + +a = bounds[itv]; +b = bounds[itv + 1]; + +f = proc(y) { + t = y + a; + return erfc(t) * exp(t*t); +}; + +poly = fpminimax(f(x), deg, [|double ...|], [0;b-a]); + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly, i); diff --git a/contrib/arm-optimized-routines/pl/math/tools/v_log10.sollya b/contrib/arm-optimized-routines/pl/math/tools/v_log10.sollya new file mode 100644 index 000000000000..e2df4364ada0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/v_log10.sollya @@ -0,0 +1,38 @@ +// polynomial used for __v_log10(x) +// +// Copyright (c) 2019-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 6; // poly degree +a = -0x1.fc1p-9; +b = 0x1.009p-8; + +// find log(1+x)/x polynomial with minimal relative error +// (minimal relative error polynomial for log(1+x) is the same * x) +deg = deg-1; // because of /x + +// f = log(1+x)/x; using taylor series +f = 0; +for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| +approx = proc(poly,d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = 1; +for i from 1 to deg do { + p = roundcoefficients(approx(poly,i), [|D ...|]); + poly = poly + x^i*coeff(p,0); +}; + +// scale coefficients by 1/ln(10) +ln10 = evaluate(log(10),0); +poly = poly/ln10; + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do double(coeff(poly,i)); diff --git a/contrib/arm-optimized-routines/pl/math/tools/v_log10f.sollya b/contrib/arm-optimized-routines/pl/math/tools/v_log10f.sollya new file mode 100644 index 000000000000..396d5a92302b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/v_log10f.sollya @@ -0,0 +1,45 @@ +// polynomial for approximating v_log10f(1+x) +// +// Copyright (c) 2019-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 9; // poly degree +// |log10(1+x)| > 0x1p-4 outside the interval +a = -1/3; +b = 1/3; + +display = hexadecimal; +print("log10(2) = ", single(log10(2))); + +ln10 = evaluate(log(10),0); +invln10 = single(1/ln10); + +// find log10(1+x)/x polynomial with minimal relative error +// (minimal relative error polynomial for log10(1+x) is the same * x) +deg = deg-1; // because of /x + +// f = log(1+x)/x; using taylor series +f = 0; +for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; +f = f/ln10; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| +approx = proc(poly,d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = invln10; +for i from 1 to deg do { + p = roundcoefficients(approx(poly,i), [|SG ...|]); + poly = poly + x^i*coeff(p,0); +}; +display = hexadecimal; +print("invln10:", invln10); +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do single(coeff(poly,i)); + +display = decimal; +print("in [",a,b,"]"); diff --git a/contrib/arm-optimized-routines/pl/math/tools/v_log2f.sollya b/contrib/arm-optimized-routines/pl/math/tools/v_log2f.sollya new file mode 100644 index 000000000000..99e050c91b03 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/tools/v_log2f.sollya @@ -0,0 +1,38 @@ +// polynomial used for __v_log2f(x) +// +// Copyright (c) 2022-2023, Arm Limited. 
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + +deg = 9; // poly degree +a = -1/3; +b = 1/3; + +ln2 = evaluate(log(2),0); +invln2 = single(1/ln2); + +// find log2(1+x)/x polynomial with minimal relative error +// (minimal relative error polynomial for log2(1+x) is the same * x) +deg = deg-1; // because of /x + +// f = log2(1+x)/x; using taylor series +f = 0; +for i from 0 to 60 do { f = f + (-x)^i/(i+1); }; +f = f * invln2; + +// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)| +approx = proc(poly,d) { + return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10); +}; + +// first coeff is fixed, iteratively find optimal double prec coeffs +poly = invln2; +for i from 1 to deg do { + p = roundcoefficients(approx(poly,i), [|SG ...|]); + poly = poly + x^i*coeff(p,0); +}; + +display = hexadecimal; +print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30)); +print("in [",a,b,"]"); +print("coeffs:"); +for i from 0 to deg do coeff(poly,i); diff --git a/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c new file mode 100644 index 000000000000..22f69d7636e4 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_acosh_3u5.c @@ -0,0 +1,51 @@ +/* + * Single-precision vector acosh(x) function. + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define WANT_V_LOG1P_K0_SHORTCUT 1 +#include "v_log1p_inline.h" + +#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). */ + +#if V_SUPPORTED + +static NOINLINE VPCS_ATTR v_f64_t +special_case (v_f64_t x) +{ + return v_call_f64 (acosh, x, x, v_u64 (-1)); +} + +/* Vector approximation for double-precision acosh, based on log1p. + The largest observed error is 3.02 ULP in the region where the + argument to log1p falls in the k=0 interval, i.e. 
x close to 1: + __v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5 + want 0x1.f2d6d823bc9e2p-5. */ +VPCS_ATTR v_f64_t V_NAME (acosh) (v_f64_t x) +{ + v_u64_t itop = v_as_u64_f64 (x) >> 52; + v_u64_t special = v_cond_u64 ((itop - OneTop) >= (BigBoundTop - OneTop)); + + /* Fall back to scalar routine for all lanes if any of them are special. */ + if (unlikely (v_any_u64 (special))) + return special_case (x); + + v_f64_t xm1 = x - 1; + v_f64_t u = xm1 * (x + 1); + return log1p_inline (xm1 + v_sqrt_f64 (u)); +} +VPCS_ALIAS + +PL_SIG (V, D, 1, acosh, 1.0, 10.0) +PL_TEST_ULP (V_NAME (acosh), 2.53) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (acosh)) +PL_TEST_INTERVAL (V_NAME (acosh), 1, 0x1p511, 90000) +PL_TEST_INTERVAL (V_NAME (acosh), 0x1p511, inf, 10000) +PL_TEST_INTERVAL (V_NAME (acosh), 0, 1, 1000) +PL_TEST_INTERVAL (V_NAME (acosh), -0, -inf, 10000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c b/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c new file mode 100644 index 000000000000..2b5aff591a74 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_acoshf_3u1.c @@ -0,0 +1,68 @@ +/* + * Single-precision vector acosh(x) function. + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define SignMask 0x80000000 +#define One 0x3f800000 +#define SquareLim 0x5f800000 /* asuint(0x1p64). */ + +#if V_SUPPORTED + +#include "v_log1pf_inline.h" + +static NOINLINE VPCS_ATTR v_f32_t +special_case (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (acoshf, x, y, special); +} + +/* Vector approximation for single-precision acosh, based on log1p. Maximum + error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it + is 2.78 ULP: + __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3 + want 0x1.ef9ea2p-3. 
+ With exceptions disabled, we can compute u with a shorter dependency chain, + which gives maximum error of 3.07 ULP: + __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4 + want 0x1.fbc7f4p-4. */ + +VPCS_ATTR v_f32_t V_NAME (acoshf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t special = v_cond_u32 ((ix - One) >= (SquareLim - One)); + +#if WANT_SIMD_EXCEPT + /* Mask special lanes with 1 to side-step spurious invalid or overflow. Use + only xm1 to calculate u, as operating on x will trigger invalid for NaN. */ + v_f32_t xm1 = v_sel_f32 (special, v_f32 (1), x - 1); + v_f32_t u = v_fma_f32 (xm1, xm1, 2 * xm1); +#else + v_f32_t xm1 = x - 1; + v_f32_t u = xm1 * (x + 1.0f); +#endif + v_f32_t y = log1pf_inline (xm1 + v_sqrt_f32 (u)); + + if (unlikely (v_any_u32 (special))) + return special_case (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, acosh, 1.0, 10.0) +#if WANT_SIMD_EXCEPT +PL_TEST_ULP (V_NAME (acoshf), 2.29) +#else +PL_TEST_ULP (V_NAME (acoshf), 2.58) +#endif +PL_TEST_EXPECT_FENV (V_NAME (acoshf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (acoshf), 0, 1, 500) +PL_TEST_INTERVAL (V_NAME (acoshf), 1, SquareLim, 100000) +PL_TEST_INTERVAL (V_NAME (acoshf), SquareLim, inf, 1000) +PL_TEST_INTERVAL (V_NAME (acoshf), -0, -inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c b/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c new file mode 100644 index 000000000000..fd329b6b7f69 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_asinh_3u5.c @@ -0,0 +1,175 @@ +/* + * Double-precision vector asinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define OneTop 0x3ff /* top12(asuint64(1.0f)). */ +#define HugeBound 0x5fe /* top12(asuint64(0x1p511)). */ +#define TinyBound 0x3e5 /* top12(asuint64(0x1p-26)). 
*/ +#define AbsMask v_u64 (0x7fffffffffffffff) +#define C(i) v_f64 (__asinh_data.poly[i]) + +/* Constants & data for log. */ +#define OFF 0x3fe6000000000000 +#define Ln2 v_f64 (0x1.62e42fefa39efp-1) +#define A(i) v_f64 (__sv_log_data.poly[i]) +#define T(i) __log_data.tab[i] +#define N (1 << LOG_TABLE_BITS) + +static NOINLINE v_f64_t +special_case (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (asinh, x, y, special); +} + +struct entry +{ + v_f64_t invc; + v_f64_t logc; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + e.invc = T (i).invc; + e.logc = T (i).logc; +#else + e.invc[0] = T (i[0]).invc; + e.logc[0] = T (i[0]).logc; + e.invc[1] = T (i[1]).invc; + e.logc[1] = T (i[1]).logc; +#endif + return e; +} + +static inline v_f64_t +log_inline (v_f64_t x) +{ + /* Double-precision vector log, copied from math/v_log.c with some cosmetic + modification and special-cases removed. See that file for details of the + algorithm used. */ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t tmp = ix - OFF; + v_u64_t i = (tmp >> (52 - LOG_TABLE_BITS)) % N; + v_s64_t k = v_as_s64_u64 (tmp) >> 52; + v_u64_t iz = ix - (tmp & 0xfffULL << 52); + v_f64_t z = v_as_f64_u64 (iz); + struct entry e = lookup (i); + v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); + v_f64_t kd = v_to_f64_s64 (k); + v_f64_t hi = v_fma_f64 (kd, Ln2, e.logc + r); + v_f64_t r2 = r * r; + v_f64_t y = v_fma_f64 (A (3), r, A (2)); + v_f64_t p = v_fma_f64 (A (1), r, A (0)); + y = v_fma_f64 (A (4), r2, y); + y = v_fma_f64 (y, r2, p); + y = v_fma_f64 (y, r2, hi); + return y; +} + +/* Double-precision implementation of vector asinh(x). + asinh is very sensitive around 1, so it is impractical to devise a single + low-cost algorithm which is sufficiently accurate on a wide range of input. 
+ Instead we use two different algorithms: + asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1 + = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise + where log(x) is an optimized log approximation, and P(x) is a polynomial + shared with the scalar routine. The greatest observed error 3.29 ULP, in + |x| >= 1: + __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1 + want 0x1.ffffcfd0e2352p-1. */ +VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iax = ix & AbsMask; + v_f64_t ax = v_as_f64_u64 (iax); + v_u64_t top12 = iax >> 52; + + v_u64_t gt1 = v_cond_u64 (top12 >= OneTop); + v_u64_t special = v_cond_u64 (top12 >= HugeBound); + +#if WANT_SIMD_EXCEPT + v_u64_t tiny = v_cond_u64 (top12 < TinyBound); + special |= tiny; +#endif + + /* Option 1: |x| >= 1. + Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)). + If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will + overflow, by setting special lanes to 1. These will be fixed later. */ + v_f64_t option_1 = v_f64 (0); + if (likely (v_any_u64 (gt1))) + { +#if WANT_SIMD_EXCEPT + v_f64_t xm = v_sel_f64 (special, v_f64 (1), ax); +#else + v_f64_t xm = ax; +#endif + option_1 = log_inline (xm + v_sqrt_f64 (xm * xm + 1)); + } + + /* Option 2: |x| < 1. + Compute asinh(x) using a polynomial. + If WANT_SIMD_EXCEPT is enabled, sidestep special lanes, which will + overflow, and tiny lanes, which will underflow, by setting them to 0. They + will be fixed later, either by selecting x or falling back to the scalar + special-case. The largest observed error in this region is 1.47 ULPs: + __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 + want 0x1.c1d6bf874019cp-1. 
*/ + v_f64_t option_2 = v_f64 (0); + if (likely (v_any_u64 (~gt1))) + { +#if WANT_SIMD_EXCEPT + ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax); +#endif + v_f64_t x2 = ax * ax; + v_f64_t z2 = x2 * x2; + v_f64_t z4 = z2 * z2; + v_f64_t z8 = z4 * z4; + v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C); + option_2 = v_fma_f64 (p, x2 * ax, ax); +#if WANT_SIMD_EXCEPT + option_2 = v_sel_f64 (tiny, x, option_2); +#endif + } + + /* Choose the right option for each lane. */ + v_f64_t y = v_sel_f64 (gt1, option_1, option_2); + /* Copy sign. */ + y = v_as_f64_u64 (v_bsl_u64 (AbsMask, v_as_u64_f64 (y), ix)); + + if (unlikely (v_any_u64 (special))) + return special_case (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (asinh), 2.80) +PL_TEST_EXPECT_FENV (V_NAME (asinh), WANT_SIMD_EXCEPT) +/* Test vector asinh 3 times, with control lane < 1, > 1 and special. + Ensures the v_sel is choosing the right option in all cases. */ +#define V_ASINH_INTERVAL(lo, hi, n) \ + PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0.5) \ + PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 2) \ + PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0x1p600) +V_ASINH_INTERVAL (0, 0x1p-26, 50000) +V_ASINH_INTERVAL (0x1p-26, 1, 50000) +V_ASINH_INTERVAL (1, 0x1p511, 50000) +V_ASINH_INTERVAL (0x1p511, inf, 40000) +V_ASINH_INTERVAL (-0, -0x1p-26, 50000) +V_ASINH_INTERVAL (-0x1p-26, -1, 50000) +V_ASINH_INTERVAL (-1, -0x1p511, 50000) +V_ASINH_INTERVAL (-0x1p511, -inf, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c b/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c new file mode 100644 index 000000000000..9d8c8a936ae3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_asinhf_2u7.c @@ -0,0 +1,70 @@ +/* + * Single-precision vector asinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "include/mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define SignMask v_u32 (0x80000000) +#define One v_f32 (1.0f) +#define BigBound v_u32 (0x5f800000) /* asuint(0x1p64). */ +#define TinyBound v_u32 (0x30800000) /* asuint(0x1p-30). */ + +#include "v_log1pf_inline.h" + +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (asinhf, x, y, special); +} + +/* Single-precision implementation of vector asinh(x), using vector log1p. + Worst-case error is 2.66 ULP, at roughly +/-0.25: + __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */ +VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & ~SignMask; + v_u32_t sign = ix & SignMask; + v_f32_t ax = v_as_f32_u32 (iax); + v_u32_t special = v_cond_u32 (iax >= BigBound); + +#if WANT_SIMD_EXCEPT + /* Sidestep tiny and large values to avoid inadvertently triggering + under/overflow. */ + special |= v_cond_u32 (iax < TinyBound); + if (unlikely (v_any_u32 (special))) + ax = v_sel_f32 (special, One, ax); +#endif + + /* asinh(x) = log(x + sqrt(x * x + 1)). + For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). 
*/ + v_f32_t d = One + v_sqrt_f32 (ax * ax + One); + v_f32_t y = log1pf_inline (ax + ax * ax / d); + y = v_as_f32_u32 (sign | v_as_u32_f32 (y)); + + if (unlikely (v_any_u32 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, asinh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (asinhf), 2.17) +PL_TEST_EXPECT_FENV (V_NAME (asinhf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (asinhf), 0, 0x1p-12, 40000) +PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p-12, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (asinhf), 1.0, 0x1p11, 40000) +PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p11, inf, 40000) +PL_TEST_INTERVAL (V_NAME (asinhf), 0, -0x1p-12, 20000) +PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p-12, -1.0, 20000) +PL_TEST_INTERVAL (V_NAME (asinhf), -1.0, -0x1p11, 20000) +PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p11, -inf, 20000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c b/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c new file mode 100644 index 000000000000..6327fea8eb2c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_atan2_3u.c @@ -0,0 +1,90 @@ +/* + * Double-precision vector atan2(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "atan_common.h" + +#define PiOver2 v_f64 (0x1.921fb54442d18p+0) +#define SignMask v_u64 (0x8000000000000000) + +/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t y, v_f64_t x, v_f64_t ret, v_u64_t cmp) +{ + return v_call2_f64 (atan2, y, x, ret, cmp); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline v_u64_t +zeroinfnan (v_u64_t i) +{ + return v_cond_u64 (2 * i - 1 >= v_u64 (2 * asuint64 (INFINITY) - 1)); +} + +/* Fast implementation of vector atan2. 
+ Maximum observed error is 2.8 ulps: + v_atan2(0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) + got 0x1.92d628ab678ccp-1 + want 0x1.92d628ab678cfp-1. */ +VPCS_ATTR +v_f64_t V_NAME (atan2) (v_f64_t y, v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iy = v_as_u64_f64 (y); + + v_u64_t special_cases = zeroinfnan (ix) | zeroinfnan (iy); + + v_u64_t sign_x = ix & SignMask; + v_u64_t sign_y = iy & SignMask; + v_u64_t sign_xy = sign_x ^ sign_y; + + v_f64_t ax = v_abs_f64 (x); + v_f64_t ay = v_abs_f64 (y); + + v_u64_t pred_xlt0 = x < 0.0; + v_u64_t pred_aygtax = ay > ax; + + /* Set up z for call to atan. */ + v_f64_t n = v_sel_f64 (pred_aygtax, -ax, ay); + v_f64_t d = v_sel_f64 (pred_aygtax, ay, ax); + v_f64_t z = v_div_f64 (n, d); + + /* Work out the correct shift. */ + v_f64_t shift = v_sel_f64 (pred_xlt0, v_f64 (-2.0), v_f64 (0.0)); + shift = v_sel_f64 (pred_aygtax, shift + 1.0, shift); + shift *= PiOver2; + + v_f64_t ret = eval_poly (z, z, shift); + + /* Account for the sign of x and y. */ + ret = v_as_f64_u64 (v_as_u64_f64 (ret) ^ sign_xy); + + if (unlikely (v_any_u64 (special_cases))) + { + return specialcase (y, x, ret, special_cases); + } + + return ret; +} +VPCS_ALIAS + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +PL_SIG (V, D, 2, atan2) +// TODO tighten this once __v_atan2 is fixed +PL_TEST_ULP (V_NAME (atan2), 2.9) +PL_TEST_INTERVAL (V_NAME (atan2), -10.0, 10.0, 50000) +PL_TEST_INTERVAL (V_NAME (atan2), -1.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2), 1e6, 1e32, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c new file mode 100644 index 000000000000..5d1e6ca4488e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_atan2f_3u.c @@ -0,0 +1,89 @@ +/* + * Single-precision vector atan2(x) function. 
+ * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "atanf_common.h" + +/* Useful constants. */ +#define PiOver2 v_f32 (0x1.921fb6p+0f) +#define SignMask v_u32 (0x80000000) + +/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f32_t +specialcase (v_f32_t y, v_f32_t x, v_f32_t ret, v_u32_t cmp) +{ + return v_call2_f32 (atan2f, y, x, ret, cmp); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline v_u32_t +zeroinfnan (v_u32_t i) +{ + return v_cond_u32 (2 * i - 1 >= v_u32 (2 * 0x7f800000lu - 1)); +} + +/* Fast implementation of vector atan2f. Maximum observed error is + 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: + v_atan2(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 + want 0x1.967f00p-1. */ +VPCS_ATTR +v_f32_t V_NAME (atan2f) (v_f32_t y, v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iy = v_as_u32_f32 (y); + + v_u32_t special_cases = zeroinfnan (ix) | zeroinfnan (iy); + + v_u32_t sign_x = ix & SignMask; + v_u32_t sign_y = iy & SignMask; + v_u32_t sign_xy = sign_x ^ sign_y; + + v_f32_t ax = v_abs_f32 (x); + v_f32_t ay = v_abs_f32 (y); + + v_u32_t pred_xlt0 = x < 0.0f; + v_u32_t pred_aygtax = ay > ax; + + /* Set up z for call to atanf. */ + v_f32_t n = v_sel_f32 (pred_aygtax, -ax, ay); + v_f32_t d = v_sel_f32 (pred_aygtax, ay, ax); + v_f32_t z = v_div_f32 (n, d); + + /* Work out the correct shift. */ + v_f32_t shift = v_sel_f32 (pred_xlt0, v_f32 (-2.0f), v_f32 (0.0f)); + shift = v_sel_f32 (pred_aygtax, shift + 1.0f, shift); + shift *= PiOver2; + + v_f32_t ret = eval_poly (z, z, shift); + + /* Account for the sign of y. 
*/ + ret = v_as_f32_u32 (v_as_u32_f32 (ret) ^ sign_xy); + + if (unlikely (v_any_u32 (special_cases))) + { + return specialcase (y, x, ret, special_cases); + } + + return ret; +} +VPCS_ALIAS + +/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */ +PL_SIG (V, F, 2, atan2) +PL_TEST_ULP (V_NAME (atan2f), 2.46) +PL_TEST_INTERVAL (V_NAME (atan2f), -10.0, 10.0, 50000) +PL_TEST_INTERVAL (V_NAME (atan2f), -1.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2f), 0.0, 1.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2f), 1.0, 100.0, 40000) +PL_TEST_INTERVAL (V_NAME (atan2f), 1e6, 1e32, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c new file mode 100644 index 000000000000..0f3c2ccf2606 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_atan_2u5.c @@ -0,0 +1,74 @@ +/* + * Double-precision vector atan(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "atan_common.h" + +#define PiOver2 v_f64 (0x1.921fb54442d18p+0) +#define AbsMask v_u64 (0x7fffffffffffffff) +#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)). */ +#define BigBound 0x434 /* top12(asuint64(0x1p53)). */ + +/* Fast implementation of vector atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using + z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: + __v_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 + want 0x1.9225645bdd7c3p-1. */ +VPCS_ATTR +v_f64_t V_NAME (atan) (v_f64_t x) +{ + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need + fenv. 
*/ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t sign = ix & ~AbsMask; + +#if WANT_SIMD_EXCEPT + v_u64_t ia12 = (ix >> 52) & 0x7ff; + v_u64_t special = v_cond_u64 (ia12 - TinyBound > BigBound - TinyBound); + /* If any lane is special, fall back to the scalar routine for all lanes. */ + if (unlikely (v_any_u64 (special))) + return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1)); +#endif + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + v_u64_t red = v_cagt_f64 (x, v_f64 (1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ + v_f64_t z = v_sel_f64 (red, v_div_f64 (v_f64 (-1.0), x), x); + v_f64_t shift = v_sel_f64 (red, PiOver2, v_f64 (0.0)); + /* Use absolute value only when needed (odd powers of z). */ + v_f64_t az = v_abs_f64 (z); + az = v_sel_f64 (red, -az, az); + + /* Calculate the polynomial approximation. */ + v_f64_t y = eval_poly (z, az, shift); + + /* y = atan(x) if x>0, -atan(-x) otherwise. */ + y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, atan, -10.0, 10.0) +PL_TEST_ULP (V_NAME (atan), 1.78) +PL_TEST_EXPECT_FENV (V_NAME (atan), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (atan), 0, 0x1p-30, 10000) +PL_TEST_INTERVAL (V_NAME (atan), -0, -0x1p-30, 1000) +PL_TEST_INTERVAL (V_NAME (atan), 0x1p-30, 0x1p53, 900000) +PL_TEST_INTERVAL (V_NAME (atan), -0x1p-30, -0x1p53, 90000) +PL_TEST_INTERVAL (V_NAME (atan), 0x1p53, inf, 10000) +PL_TEST_INTERVAL (V_NAME (atan), -0x1p53, -inf, 1000) + +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c b/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c new file mode 100644 index 000000000000..67d90b94f5d3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_atanf_3u.c @@ -0,0 +1,83 @@ +/* + * Single-precision vector atan(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "atanf_common.h" + +#define PiOver2 v_f32 (0x1.921fb6p+0f) +#define AbsMask v_u32 (0x7fffffff) +#define TinyBound 0x308 /* top12(asuint(0x1p-30)). */ +#define BigBound 0x4e8 /* top12(asuint(0x1p30)). */ + +#if WANT_SIMD_EXCEPT +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (atanf, x, y, special); +} +#endif + +/* Fast implementation of vector atanf based on + atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] + using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps: + v_atanf(0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ +VPCS_ATTR +v_f32_t V_NAME (atanf) (v_f32_t x) +{ + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need + fenv. */ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t sign = ix & ~AbsMask; + +#if WANT_SIMD_EXCEPT + v_u32_t ia12 = (ix >> 20) & 0x7ff; + v_u32_t special = v_cond_u32 (ia12 - TinyBound > BigBound - TinyBound); + /* If any lane is special, fall back to the scalar routine for all lanes. */ + if (unlikely (v_any_u32 (special))) + return specialcase (x, x, v_u32 (-1)); +#endif + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + v_u32_t red = v_cagt_f32 (x, v_f32 (1.0)); + /* Avoid dependency in abs(x) in division (and comparison). */ + v_f32_t z = v_sel_f32 (red, v_div_f32 (v_f32 (-1.0f), x), x); + v_f32_t shift = v_sel_f32 (red, PiOver2, v_f32 (0.0f)); + /* Use absolute value only when needed (odd powers of z). */ + v_f32_t az = v_abs_f32 (z); + az = v_sel_f32 (red, -az, az); + + /* Calculate the polynomial approximation. */ + v_f32_t y = eval_poly (z, az, shift); + + /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ + y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign); + + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, atan, -10.0, 10.0) +PL_TEST_ULP (V_NAME (atanf), 2.5) +PL_TEST_EXPECT_FENV (V_NAME (atanf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (atanf), 0, 0x1p-30, 5000) +PL_TEST_INTERVAL (V_NAME (atanf), -0, -0x1p-30, 5000) +PL_TEST_INTERVAL (V_NAME (atanf), 0x1p-30, 1, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), -0x1p-30, -1, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), 1, 0x1p30, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), -1, -0x1p30, 40000) +PL_TEST_INTERVAL (V_NAME (atanf), 0x1p30, inf, 1000) +PL_TEST_INTERVAL (V_NAME (atanf), -0x1p30, -inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c b/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c new file mode 100644 index 000000000000..bfaf5c2b917f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_atanh_3u5.c @@ -0,0 +1,61 @@ +/* + * Double-precision vector atanh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pairwise_horner.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define WANT_V_LOG1P_K0_SHORTCUT 0 +#include "v_log1p_inline.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define One 0x3ff0000000000000 + +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (atanh, x, y, special); +} + +/* Approximation for vector double-precision atanh(x) using modified log1p. + The greatest observed error is 3.31 ULP: + __v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 + want 0x1.ffd8ff31b501cp-6. 
*/ +VPCS_ATTR +v_f64_t V_NAME (atanh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t sign = ix & ~AbsMask; + v_u64_t ia = ix & AbsMask; + v_u64_t special = v_cond_u64 (ia >= One); + v_f64_t halfsign = v_as_f64_u64 (sign | Half); + + /* Mask special lanes with 0 to prevent spurious underflow. */ + v_f64_t ax = v_sel_f64 (special, v_f64 (0), v_as_f64_u64 (ia)); + v_f64_t y = halfsign * log1p_inline ((2 * ax) / (1 - ax)); + + if (unlikely (v_any_u64 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, atanh, -1.0, 1.0) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (atanh)) +PL_TEST_ULP (V_NAME (atanh), 3.32) +PL_TEST_INTERVAL_C (V_NAME (atanh), 0, 0x1p-23, 10000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), -0, -0x1p-23, 10000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), 0x1p-23, 1, 90000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), -0x1p-23, -1, 90000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), 1, inf, 100, 0) +PL_TEST_INTERVAL_C (V_NAME (atanh), -1, -inf, 100, 0) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c new file mode 100644 index 000000000000..cd3069661142 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_atanhf_3u1.c @@ -0,0 +1,62 @@ +/* + * Single-precision vector atanh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "v_log1pf_inline.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define One 0x3f800000 +#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */ + +/* Approximation for vector single-precision atanh(x) using modified log1p. + The maximum error is 3.08 ULP: + __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5 + want 0x1.ffcb82p-5. 
*/ +VPCS_ATTR v_f32_t V_NAME (atanhf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_f32_t halfsign + = v_as_f32_u32 (v_bsl_u32 (v_u32 (AbsMask), v_u32 (Half), ix)); + v_u32_t iax = ix & AbsMask; + + v_f32_t ax = v_as_f32_u32 (iax); + +#if WANT_SIMD_EXCEPT + v_u32_t special = v_cond_u32 ((iax >= One) | (iax <= TinyBound)); + /* Side-step special cases by setting those lanes to 0, which will trigger no + exceptions. These will be fixed up later. */ + if (unlikely (v_any_u32 (special))) + ax = v_sel_f32 (special, v_f32 (0), ax); +#else + v_u32_t special = v_cond_u32 (iax >= One); +#endif + + v_f32_t y = halfsign * log1pf_inline ((2 * ax) / (1 - ax)); + + if (unlikely (v_any_u32 (special))) + return v_call_f32 (atanhf, x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, atanh, -1.0, 1.0) +PL_TEST_ULP (V_NAME (atanhf), 2.59) +PL_TEST_EXPECT_FENV (V_NAME (atanhf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL_C (V_NAME (atanhf), 0, 0x1p-12, 500, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), 0x1p-12, 1, 200000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), 1, inf, 1000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), -0, -0x1p-12, 500, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), -0x1p-12, -1, 200000, 0) +PL_TEST_INTERVAL_C (V_NAME (atanhf), -1, -inf, 1000, 0) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c new file mode 100644 index 000000000000..d5abe41024bc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cbrt_2u.c @@ -0,0 +1,98 @@ +/* + * Double-precision vector cbrt(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define AbsMask 0x7fffffffffffffff +#define TwoThirds v_f64 (0x1.5555555555555p-1) +#define TinyBound 0x001 /* top12 (smallest_normal). */ +#define BigBound 0x7ff /* top12 (infinity). 
*/ +#define MantissaMask v_u64 (0x000fffffffffffff) +#define HalfExp v_u64 (0x3fe0000000000000) + +#define C(i) v_f64 (__cbrt_data.poly[i]) +#define T(i) v_lookup_f64 (__cbrt_data.table, i) + +static NOINLINE v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (cbrt, x, y, special); +} + +/* Approximation for double-precision vector cbrt(x), using low-order polynomial + and two Newton iterations. Greatest observed error is 1.79 ULP. Errors repeat + according to the exponent, for instance an error observed for double value + m * 2^e will be observed for any input m * 2^(e + 3*i), where i is an + integer. + __v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0 + want 0x1.965fe72821e99p+0. */ +VPCS_ATTR v_f64_t V_NAME (cbrt) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iax = ix & AbsMask; + v_u64_t ia12 = iax >> 52; + + /* Subnormal, +/-0 and special values. */ + v_u64_t special = v_cond_u64 ((ia12 < TinyBound) | (ia12 >= BigBound)); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexp, which gets subnormal values wrong - these have to be + special-cased as a result. */ + v_f64_t m = v_as_f64_u64 (v_bsl_u64 (MantissaMask, iax, HalfExp)); + v_s64_t e = v_as_s64_u64 (iax >> 52) - 1022; + + /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for + Newton iterations. */ + v_f64_t p_01 = v_fma_f64 (C (1), m, C (0)); + v_f64_t p_23 = v_fma_f64 (C (3), m, C (2)); + v_f64_t p = v_fma_f64 (m * m, p_23, p_01); + + /* Two iterations of Newton's method for iteratively approximating cbrt. */ + v_f64_t m_by_3 = m / 3; + v_f64_t a = v_fma_f64 (TwoThirds, p, m_by_3 / (p * p)); + a = v_fma_f64 (TwoThirds, a, m_by_3 / (a * a)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. 
+ + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is + an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + + v_s64_t ey = e / 3; + v_f64_t my = a * T (v_as_u64_s64 (e % 3 + 2)); + + /* Vector version of ldexp. */ + v_f64_t y = v_as_f64_u64 ((v_as_u64_s64 (ey + 1023) << 52)) * my; + /* Copy sign. */ + y = v_as_f64_u64 (v_bsl_u64 (v_u64 (AbsMask), v_as_u64_f64 (y), ix)); + + if (unlikely (v_any_u64 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +PL_TEST_ULP (V_NAME (cbrt), 1.30) +PL_SIG (V, D, 1, cbrt, -10.0, 10.0) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrt)) +PL_TEST_INTERVAL (V_NAME (cbrt), 0, inf, 1000000) +PL_TEST_INTERVAL (V_NAME (cbrt), -0, -inf, 1000000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u5.c new file mode 100644 index 000000000000..62fa37505834 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cbrtf_1u5.c @@ -0,0 +1,96 @@ +/* + * Single-precision vector cbrt(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define AbsMask 0x7fffffff +#define SignMask v_u32 (0x80000000) +#define TwoThirds v_f32 (0x1.555556p-1f) +#define SmallestNormal 0x00800000 +#define MantissaMask 0x007fffff +#define HalfExp 0x3f000000 + +#define C(i) v_f32 (__cbrtf_data.poly[i]) +#define T(i) v_lookup_f32 (__cbrtf_data.table, i) + +static NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (cbrtf, x, y, special); +} + +/* Approximation for vector single-precision cbrt(x) using Newton iteration with + initial guess obtained by a low-order polynomial. 
Greatest error is 1.5 ULP. + This is observed for every value where the mantissa is 0x1.81410e and the + exponent is a multiple of 3, for example: + __v_cbrtf(0x1.81410ep+30) got 0x1.255d96p+10 + want 0x1.255d92p+10. */ +VPCS_ATTR v_f32_t V_NAME (cbrtf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + + /* Subnormal, +/-0 and special values. */ + v_u32_t special = v_cond_u32 ((iax < SmallestNormal) | (iax >= 0x7f800000)); + + /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector + version of frexpf, which gets subnormal values wrong - these have to be + special-cased as a result. */ + v_f32_t m = v_as_f32_u32 ((iax & MantissaMask) | HalfExp); + v_s32_t e = v_as_s32_u32 (iax >> 23) - 126; + + /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is, + the less accurate the next stage of the algorithm needs to be. An order-4 + polynomial is enough for one Newton iteration. */ + v_f32_t p_01 = v_fma_f32 (C (1), m, C (0)); + v_f32_t p_23 = v_fma_f32 (C (3), m, C (2)); + v_f32_t p = v_fma_f32 (m * m, p_23, p_01); + + /* One iteration of Newton's method for iteratively approximating cbrt. */ + v_f32_t m_by_3 = m / 3; + v_f32_t a = v_fma_f32 (TwoThirds, p, m_by_3 / (p * p)); + + /* Assemble the result by the following: + + cbrt(x) = cbrt(m) * 2 ^ (e / 3). + + We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is + not necessarily a multiple of 3 we lose some information. + + Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q. + + Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is + an integer in [-2, 2], and can be looked up in the table T. Hence the + result is assembled as: + + cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */ + + v_s32_t ey = e / 3; + v_f32_t my = a * T (v_as_u32_s32 (e % 3 + 2)); + + /* Vector version of ldexpf. */ + v_f32_t y = v_as_f32_u32 ((v_as_u32_s32 (ey + 127) << 23)) * my; + /* Copy sign. 
*/ + y = v_as_f32_u32 (v_bsl_u32 (SignMask, ix, v_as_u32_f32 (y))); + + if (unlikely (v_any_u32 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, cbrt, -10.0, 10.0) +PL_TEST_ULP (V_NAME (cbrtf), 1.03) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrtf)) +PL_TEST_INTERVAL (V_NAME (cbrtf), 0, inf, 1000000) +PL_TEST_INTERVAL (V_NAME (cbrtf), -0, -inf, 1000000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c new file mode 100644 index 000000000000..0a9fbf817a10 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_cosh_2u.c @@ -0,0 +1,96 @@ +/* + * Double-precision vector cosh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" +#include "v_exp_tail.h" + +#define C1 v_f64 (C1_scal) +#define C2 v_f64 (C2_scal) +#define C3 v_f64 (C3_scal) +#define InvLn2 v_f64 (InvLn2_scal) +#define Ln2hi v_f64 (Ln2hi_scal) +#define Ln2lo v_f64 (Ln2lo_scal) +#define IndexMask v_u64 (IndexMask_scal) +#define Shift v_f64 (Shift_scal) +#define Thres v_f64 (Thres_scal) + +#define AbsMask 0x7fffffffffffffff +#define Half v_f64 (0.5) +#define SpecialBound \ + 0x4086000000000000 /* 0x1.6p9, above which exp overflows. */ + +#if V_SUPPORTED + +static inline v_f64_t +exp_inline (v_f64_t x) +{ + /* Helper for approximating exp(x). Copied from v_exp_tail, with no + special-case handling or tail. */ + + /* n = round(x/(ln2/N)). */ + v_f64_t z = v_fma_f64 (x, InvLn2, Shift); + v_u64_t u = v_as_u64_f64 (z); + v_f64_t n = z - Shift; + + /* r = x - n*ln2/N. */ + v_f64_t r = x; + r = v_fma_f64 (-Ln2hi, n, r); + r = v_fma_f64 (-Ln2lo, n, r); + + v_u64_t e = u << (52 - V_EXP_TAIL_TABLE_BITS); + v_u64_t i = u & IndexMask; + + /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. 
*/ + v_f64_t y = v_fma_f64 (C3, r, C2); + y = v_fma_f64 (y, r, C1); + y = v_fma_f64 (y, r, v_f64 (1)) * r; + + /* s = 2^(n/N). */ + u = v_lookup_u64 (Tab, i); + v_f64_t s = v_as_f64_u64 (u + e); + + return v_fma_f64 (y, s, s); +} + +/* Approximation for vector double-precision cosh(x) using exp_inline. + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the scalar fall-back region, so is the same + as the scalar routine, 1.93 ULP: + __v_cosh(0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 + want 0x1.fdf28623ef923p+1021. + + The greatest observed error in the non-special region is 1.54 ULP: + __v_cosh(0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7 + want 0x1.f711dcb0c77b1p+7. */ +VPCS_ATTR v_f64_t V_NAME (cosh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iax = ix & AbsMask; + v_u64_t special = v_cond_u64 (iax > SpecialBound); + + /* If any inputs are special, fall back to scalar for all lanes. */ + if (unlikely (v_any_u64 (special))) + return v_call_f64 (cosh, x, x, v_u64 (-1)); + + v_f64_t ax = v_as_f64_u64 (iax); + /* Up to the point that exp overflows, we can use it to calculate cosh by + exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ + v_f64_t t = exp_inline (ax); + return t * Half + Half / t; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (cosh), 1.43) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cosh)) +PL_TEST_INTERVAL (V_NAME (cosh), 0, 0x1.6p9, 100000) +PL_TEST_INTERVAL (V_NAME (cosh), -0, -0x1.6p9, 100000) +PL_TEST_INTERVAL (V_NAME (cosh), 0x1.6p9, inf, 1000) +PL_TEST_INTERVAL (V_NAME (cosh), -0x1.6p9, -inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c b/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c new file mode 100644 index 000000000000..1422d4d12b31 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_coshf_2u4.c @@ -0,0 +1,74 @@ +/* + * Single-precision vector cosh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffff +#define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */ +#define SpecialBound \ + 0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use \ + special case. */ +#define Half v_f32 (0.5) + +#if V_SUPPORTED + +v_f32_t V_NAME (expf) (v_f32_t); + +/* Single-precision vector cosh, using vector expf. + Maximum error is 2.38 ULP: + __v_coshf(0x1.e8001ep+1) got 0x1.6a491ep+4 want 0x1.6a4922p+4. */ +VPCS_ATTR v_f32_t V_NAME (coshf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + v_f32_t ax = v_as_f32_u32 (iax); + v_u32_t special = v_cond_u32 (iax >= SpecialBound); + +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, fall back to the scalar + variant for all inputs if any input is a special value or above the bound + at which expf overflows. */ + if (unlikely (v_any_u32 (special))) + return v_call_f32 (coshf, x, x, v_u32 (-1)); + + v_u32_t tiny = v_cond_u32 (iax <= TinyBound); + /* If any input is tiny, avoid underflow exception by fixing tiny lanes of + input to 1, which will generate no exceptions, and then also fixing tiny + lanes of output to 1 just before return. */ + if (unlikely (v_any_u32 (tiny))) + ax = v_sel_f32 (tiny, v_f32 (1), ax); +#endif + + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. 
*/ + v_f32_t t = V_NAME (expf) (ax); + v_f32_t y = t * Half + Half / t; + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (tiny))) + return v_sel_f32 (tiny, v_f32 (1), y); +#else + if (unlikely (v_any_u32 (special))) + return v_call_f32 (coshf, x, y, special); +#endif + + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, cosh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (coshf), 1.89) +PL_TEST_EXPECT_FENV (V_NAME (coshf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1p-63, 100) +PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1.5a92d8p+6, 80000) +PL_TEST_INTERVAL (V_NAME (coshf), 0x1.5a92d8p+6, inf, 2000) +PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1p-63, 100) +PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1.5a92d8p+6, 80000) +PL_TEST_INTERVAL (V_NAME (coshf), -0x1.5a92d8p+6, -inf, 2000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erf_2u.c b/contrib/arm-optimized-routines/pl/math/v_erf_2u.c new file mode 100644 index 000000000000..1d7ddbb1ee3e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erf_2u.c @@ -0,0 +1,116 @@ +/* + * Double-precision vector erf(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "include/mathlib.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define AbsMask v_u64 (0x7fffffffffffffff) +#define AbsXMax v_f64 (0x1.8p+2) +#define Scale v_f64 (0x1p+3) + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (erf, x, y, cmp); +} + +/* A structure to perform look-up in coeffs and other parameter tables. 
*/ +struct entry +{ + v_f64_t P[V_ERF_NCOEFFS]; + v_f64_t shift; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + for (int j = 0; j < V_ERF_NCOEFFS; ++j) + e.P[j] = __v_erf_data.coeffs[j][i]; + e.shift = __v_erf_data.shifts[i]; +#else + for (int j = 0; j < V_ERF_NCOEFFS; ++j) + { + e.P[j][0] = __v_erf_data.coeffs[j][i[0]]; + e.P[j][1] = __v_erf_data.coeffs[j][i[1]]; + } + e.shift[0] = __v_erf_data.shifts[i[0]]; + e.shift[1] = __v_erf_data.shifts[i[1]]; +#endif + return e; +} + +/* Optimized double precision vector error function erf. Maximum + observed error is 1.75 ULP, in [0.110, 0.111]: + verf(0x1.c5e0c2d5d0543p-4) got 0x1.fe0ed62a54987p-4 + want 0x1.fe0ed62a54985p-4. */ +VPCS_ATTR +v_f64_t V_NAME (erf) (v_f64_t x) +{ + /* Handle both inf/nan as well as small values (|x|<2^-28) + If any condition in the lane is true then a loop over + scalar calls will be performed. */ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t atop = (ix >> 48) & v_u64 (0x7fff); + v_u64_t special_case + = v_cond_u64 (atop - v_u64 (0x3e30) >= v_u64 (0x7ff0 - 0x3e30)); + + /* Get sign and absolute value. */ + v_u64_t sign = v_as_u64_f64 (x) & ~AbsMask; + v_f64_t a = v_min_f64 (v_abs_f64 (x), AbsXMax); + + /* Compute index by truncating 8 * a with a=|x| saturated to 6.0. */ + +#ifdef SCALAR + v_u64_t i = v_trunc_u64 (a * Scale); +#else + v_u64_t i = vcvtq_n_u64_f64 (a, 3); +#endif + /* Get polynomial coefficients and shift parameter using lookup. */ + struct entry dat = lookup (i); + + /* Evaluate polynomial on transformed argument. 
*/ + v_f64_t z = v_fma_f64 (a, Scale, dat.shift); + + v_f64_t r1 = v_fma_f64 (z, dat.P[1], dat.P[0]); + v_f64_t r2 = v_fma_f64 (z, dat.P[3], dat.P[2]); + v_f64_t r3 = v_fma_f64 (z, dat.P[5], dat.P[4]); + v_f64_t r4 = v_fma_f64 (z, dat.P[7], dat.P[6]); + v_f64_t r5 = v_fma_f64 (z, dat.P[9], dat.P[8]); + + v_f64_t z2 = z * z; + v_f64_t y = v_fma_f64 (z2, r5, r4); + y = v_fma_f64 (z2, y, r3); + y = v_fma_f64 (z2, y, r2); + y = v_fma_f64 (z2, y, r1); + + /* y=erf(x) if x>0, -erf(-x) otherwise. */ + y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign); + + if (unlikely (v_any_u64 (special_case))) + return specialcase (x, y, special_case); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, erf, -6.0, 6.0) +PL_TEST_ULP (V_NAME (erf), 1.26) +PL_TEST_INTERVAL (V_NAME (erf), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (erf), 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erf), -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erf), 0x1p-26, 0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erf), -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erf), 0, inf, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erf_data.c b/contrib/arm-optimized-routines/pl/math/v_erf_data.c new file mode 100644 index 000000000000..7bbb281ad912 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erf_data.c @@ -0,0 +1,119 @@ +/* + * Polynomial coefficients and shifts for double-precision erf(x) vector + * function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* 48 intervals of the form [x_i, x_{i+1}] with x_i = i / 8 for + i=1,...,47 (x_0 = 2^-1022). There is an extra dummy interval for + [6, +inf] with all coeffs = 0 except for P_0 = 1.0, as erf(x) == 1 + above 6. + + Coefficients for each interval generated using fpminimax algorithm. See + v_erf.sollya for details. 
Note the array is transposed, so for a set of + coefficients C generated on interval i, C[j] is at coeffs[j][i]. */ + +const struct v_erf_data __v_erf_data + = {.shifts + = {-0x1p-1019, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, + -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, + -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, + -39, -40, -41, -42, -43, -44, -45, -46, -47, 0}, + .coeffs = { + // clang-format off + +{0x1.20dd750429b6dp-1022, 0x1.1f5e1a35c3b8ap-3, 0x1.1af54e232d609p-2, 0x1.9dd0d2b721f39p-2, 0x1.0a7ef5c18edd2p-1, 0x1.3f196dcd0f135p-1, + 0x1.6c1c9759d0e5fp-1, 0x1.91724951b8fc6p-1, 0x1.af767a741088bp-1, 0x1.c6dad2829ec62p-1, 0x1.d8865d98abe00p-1, 0x1.e5768c3b4a3fcp-1, + 0x1.eea5557137ae0p-1, 0x1.f4f693b67bd77p-1, 0x1.f92d077f8d56dp-1, 0x1.fbe61eef4cf6ap-1, 0x1.fd9ae142795e3p-1, 0x1.fea4218d6594ap-1, + 0x1.ff404760319b4p-1, 0x1.ff9960f3eb327p-1, 0x1.ffcaa8f4c9beap-1, 0x1.ffe514bbdc197p-1, 0x1.fff2cfb0453d9p-1, 0x1.fff9ba420e834p-1, + 0x1.fffd1ac4135f9p-1, 0x1.fffeb3ebb267bp-1, 0x1.ffff6f9f67e55p-1, 0x1.ffffc316d9ed0p-1, 0x1.ffffe710d565ep-1, 0x1.fffff618c3da6p-1, + 0x1.fffffc2f171e3p-1, 0x1.fffffe92ced93p-1, 0x1.ffffff7b91176p-1, 0x1.ffffffd169d0cp-1, 0x1.fffffff01a8b6p-1, 0x1.fffffffabd229p-1, + 0x1.fffffffe4fa30p-1, 0x1.ffffffff79626p-1, 0x1.ffffffffd759dp-1, 0x1.fffffffff4188p-1, 0x1.fffffffffc9e8p-1, 0x1.ffffffffff11ap-1, + 0x1.ffffffffffc05p-1, 0x1.ffffffffffef8p-1, 0x1.fffffffffffbep-1, 0x1.ffffffffffff0p-1, 0x1.ffffffffffffcp-1, 0x1.fffffffffffffp-1, 1.0}, + +{0x1.20dd750429b6dp-3, 0x1.1c62fa1e86989p-3, 0x1.0f5d1602f7dfbp-3, 0x1.f5f0cdaf152b2p-4, 0x1.c1efca49a5051p-4, 0x1.86e9694134b22p-4, + 0x1.492e42d78d39cp-4, 0x1.0cab61f084b1bp-4, 0x1.a911f096fbb79p-5, 0x1.45e99bcbb78d4p-5, 0x1.e4652fadcbaa3p-6, 0x1.5ce595c455bccp-6, + 0x1.e723726b81ff1p-7, 0x1.499d478bca4acp-7, 0x1.b055303221566p-8, 0x1.12ceb37ffa389p-8, 0x1.529b9e8cfa59fp-9, 0x1.94624e78e084fp-10, + 0x1.d4143a9e023f5p-11, 0x1.06918b63537c2p-11, 
0x1.1d83170fcc34bp-12, 0x1.2ce898808f08ep-13, 0x1.3360ccd26e06ap-14, 0x1.30538fbb986fbp-15, + 0x1.2408e9bb1b657p-16, 0x1.0f9e1b4e4baaep-17, 0x1.e9b5e8d71b5e3p-19, 0x1.abe09e85af38ap-20, 0x1.6a5972347c568p-21, 0x1.296a70eff1bd9p-22, + 0x1.d9371ee6bfc07p-24, 0x1.6ce1a88a01b3ap-25, 0x1.10b14985663f9p-26, 0x1.8b0d07ade43d8p-28, 0x1.155a098eceb0fp-29, 0x1.7974d3b397e7cp-31, + 0x1.f1e3bf5a6493ap-33, 0x1.3e47781d91b97p-34, 0x1.8a7038368986cp-36, 0x1.d9d4d7be5992cp-38, 0x1.137dabebc1319p-39, 0x1.367541123e46cp-41, + 0x1.58007ab162c1dp-43, 0x1.709f0d280b3f5p-45, 0x1.30a3dcf531ebfp-47, 0x1.d2707c055dedcp-50, 0x1.0d97f61945387p-49, 0x1.1dbc3ab728933p-50, 0}, + +{0x1.2411381609db0p-51, -0x1.1c62fa1e75c0ap-9, -0x1.0f5d1602eb436p-8, -0x1.78749a4346714p-8, -0x1.c1efca49a7b15p-8, -0x1.e8a3c39178d95p-8, + -0x1.edc5644363883p-8, -0x1.d62beb64e19eep-8, -0x1.a911f096f7a87p-8, -0x1.6ea6cf452dca3p-8, -0x1.2ebf3dccb166cp-8, -0x1.dfbbadedfcde6p-9, + -0x1.6d5a95d08c346p-9, -0x1.0bcfca21880c9p-9, -0x1.7a4a8a2bf1a0bp-10, -0x1.01a1c8481a466p-10, -0x1.529b9e8d29ddap-11, -0x1.ada873604cf20p-12, + -0x1.074b60f960c25p-12, -0x1.37ccd585732c6p-13, -0x1.64e3dcd73a1d3p-14, -0x1.8af14827e93bap-15, -0x1.a6a519ae712fbp-16, -0x1.b5781ea681265p-17, + -0x1.b60d5ed744563p-18, -0x1.a8670acc75c29p-19, -0x1.8de3ce2154088p-20, -0x1.690584329096ap-21, -0x1.3d0e478659a54p-22, -0x1.0d8875cb088d0p-23, + -0x1.bba3c56e56d69p-25, -0x1.617a60b4bcd87p-26, -0x1.10b16afb9ce08p-27, -0x1.9766e11f62828p-29, -0x1.26afbc55ef33cp-30, -0x1.9cd52c0e709a9p-32, + -0x1.18175f6758766p-33, -0x1.705a68dde7f3ap-35, -0x1.d65ba6d52556dp-37, -0x1.23af5c3865987p-38, -0x1.51c72cd64a6bcp-40, -0x1.79f63bbc02f5ap-42, + -0x1.2346f2840d7bfp-43, -0x1.8110f614395a8p-45, 0x1.c3309f1fe85a4p-46, 0x1.09e6fb6ee0b85p-46, -0x1.959834938224fp-46, -0x1.0e9a684ecee47p-46, 0}, + +{-0x1.812746b057b58p-11, -0x1.6f552dbf96b31p-11, -0x1.3c97445cee1b0p-11, -0x1.e106c523a966dp-12, -0x1.2bf5318638e21p-12, -0x1.c8105034ea92fp-14, + 0x1.b6e85963275c5p-15, 
0x1.7c9d756585d29p-13, 0x1.1b614b0e78122p-12, 0x1.4cb3cf0b42031p-12, 0x1.571d01cf7eeb3p-12, 0x1.4374d82fe7f2ep-12, + 0x1.1c2a02b9199a0p-12, 0x1.d6631e131dabap-13, 0x1.7148c3d9d22bap-13, 0x1.143d1c76ae7c6p-13, 0x1.8b0ae3afc07e6p-14, 0x1.0ea475d5b3822p-14, + 0x1.63ef6208bd4adp-15, 0x1.c1ec100ec3e71p-16, 0x1.119da13709716p-16, 0x1.407fbd00318a5p-17, 0x1.69cf481b4666cp-18, 0x1.89e17d2b19c42p-19, + 0x1.9db7531fa76f6p-20, 0x1.a37382bd61dc8p-21, 0x1.9aa4a8e8fe8dfp-22, 0x1.8451fcde36f23p-23, 0x1.62cd605193fe9p-24, 0x1.394b0d46af85cp-25, + 0x1.0b6c0d1191ec9p-26, 0x1.b9581bcc8f4ebp-28, 0x1.603ea0f602119p-29, 0x1.0ff28bc88022cp-30, 0x1.95ecc71a0b4bep-32, 0x1.24ffe516534d4p-33, + 0x1.9aa89abeffd90p-35, 0x1.1ab57210158fap-36, 0x1.8b0c503eafbcbp-38, 0x1.166413b8ba611p-39, 0x1.5848fad1e38e9p-42, 0x1.3573cc6d6d4e6p-49, + 0x1.404c0dc8b5ffcp-42, 0x1.38779160f5f11p-43, -0x1.1dc84293acf27p-42, -0x1.2892755467252p-43, 0x1.8e40aed4a9e02p-43, 0x1.0cef3bce98bedp-43, 0}, + +{0x1.4ade8e6d47ef0p-43, 0x1.196c9ee6491cfp-16, 0x1.040e8be6a9625p-15, 0x1.5529ad049b967p-15, 0x1.76f27e1744b44p-15, 0x1.6963c95cd8395p-15, + 0x1.349b5d6ae76a6p-15, 0x1.cc6056b95eed3p-16, 0x1.1b614adacb10dp-16, 0x1.ca5080f4ec9b9p-18, -0x1.93a9d54fb750bp-20, -0x1.f3b8d7695d38cp-18, + -0x1.6d5a929bfde5fp-17, -0x1.974c013452be9p-17, -0x1.8a0da620ab60fp-17, -0x1.5a3166e1f5682p-17, -0x1.1a2c5ad80a584p-17, -0x1.afe552a6507eep-18, + -0x1.38a9879a760b8p-18, -0x1.ae595d5041755p-19, -0x1.1a89c93c4b9c8p-19, -0x1.62d4c3dc10fdbp-20, -0x1.ab0c620cf63d1p-21, -0x1.ed4aeff35fd90p-22, + -0x1.11c8e63fae76dp-22, -0x1.2454a1fb4749ap-23, -0x1.2c7f7846b0e7bp-24, -0x1.298c17acfd63ap-25, -0x1.1c0f6cc5baa18p-26, -0x1.0574c9f0e63fap-27, + -0x1.d0a5c4232f4cep-29, -0x1.8d9d301253af8p-30, -0x1.49cb78be34c81p-31, -0x1.08fc30eb50526p-32, -0x1.96e2f50cad458p-34, -0x1.2c888ddad994bp-35, + -0x1.c5dd3068e7fcap-37, -0x1.935b876ed56ffp-38, -0x1.e74a7c256ba0dp-39, -0x1.1681c73733b50p-39, 0x1.855ab0b8664dep-41, 0x1.4aebdf7fb67e5p-41, + -0x1.2aef07c393759p-40, 
-0x1.37e52b17505e6p-41, 0x1.394b997da7ed5p-40, 0x1.4345440ea9876p-41, -0x1.af227669dca68p-41, -0x1.23589e4f3cc49p-41, 0}, + +{0x1.ce2f1b1646d4bp-19, 0x1.aaba29a029bd5p-19, 0x1.47e57fbf662a0p-19, 0x1.74882f55f1bd4p-20, 0x1.dfed759bd9091p-23, -0x1.c124b2acb3ee8p-21, + -0x1.b429a82901889p-20, -0x1.1350ee93fbfb3p-19, -0x1.1b613a5e1e196p-19, -0x1.f65ceb61aa63ap-20, -0x1.82814da1daaa1p-20, -0x1.f5729185c040ep-21, + -0x1.e72489bfea503p-22, -0x1.17d784c065f21p-24, 0x1.b2229e5122850p-23, 0x1.779b916c44358p-22, 0x1.ace7a08f66cb0p-22, 0x1.9973788b8f181p-22, + 0x1.5d3bceb9c39d5p-22, 0x1.11da976499339p-22, 0x1.90eaa0d25df91p-23, 0x1.146c19a9f0ae8p-23, 0x1.693a52f5ccd0bp-24, 0x1.c122683fc1404p-25, + 0x1.0a866e311e50ap-25, 0x1.2e85588e08741p-26, 0x1.493501a3ee15cp-27, 0x1.572eec204dc18p-28, 0x1.590e0157d4dabp-29, 0x1.4c0619d7359e8p-30, + 0x1.36608b7b22d22p-31, 0x1.0e3f514a0d7fep-32, 0x1.e04d29135056ep-34, 0x1.aa936eb977e33p-35, 0x1.3ce1ec4a299b6p-36, 0x1.aba42bc751130p-38, + 0x1.0861b5dc819e3p-38, 0x1.3bc7b1f0f8afbp-38, 0x1.7d6c896bf3579p-38, 0x1.14f24be91338cp-38, -0x1.2896024cf2ca9p-39, -0x1.c2e8399d1e8e7p-40, + 0x1.7836a61cc0f4bp-39, 0x1.8a98e07f8cdfcp-40, -0x1.8f332379c6ce4p-39, -0x1.9bbec3ab83755p-40, 0x1.126c9c6d24bd6p-39, 0x1.72eaeac065cc2p-40, 0}, + +{0x1.240b25b9a9823p-39, -0x1.733f879c52150p-24, -0x1.4c00873f3742fp-23, -0x1.9a6fe48163775p-23, -0x1.99ed7481d2399p-23, -0x1.52aea61425cf7p-23, + -0x1.b853c3ad1c781p-24, -0x1.53c3e486c1845p-25, 0x1.2e2a4e7a0286dp-26, 0x1.fd0e266132929p-25, 0x1.5cf1d8fe5611fp-24, 0x1.6b140ba72ac56p-24, + 0x1.3cab2fa73a9c4p-24, 0x1.d864967df5009p-25, 0x1.25b4551256078p-25, 0x1.0d029bc50b0cdp-26, 0x1.e126485c5dceep-30, -0x1.dd5e4bed818c0p-28, + -0x1.7cd1b44dbfdc3p-27, -0x1.981def704f39ep-27, -0x1.6f0e87a0f3e35p-27, -0x1.267c0dc9b6e95p-27, -0x1.b2ec3078bf153p-28, -0x1.2b066605239f5p-28, + -0x1.840473ed3d070p-29, -0x1.daf9b9b8c06cap-30, -0x1.1661520cf8a32p-30, -0x1.2fa49c29e30b5p-31, -0x1.4ddfd9d6a7cf4p-32, -0x1.4a55b8564425ap-33, + 
-0x1.5df1ca746f291p-34, -0x1.dd6b8d1ec2e4fp-36, -0x1.34c63d902f888p-36, -0x1.b55b65a1655c0p-37, -0x1.9c1cfd1e2142cp-39, 0x1.98f2b73f288c4p-43, + -0x1.3baba91a10af8p-39, -0x1.8cb03e5359e2bp-38, -0x1.16063ce2129afp-37, -0x1.9fd74120d8e00p-38, 0x1.cf0caf7defe71p-39, 0x1.5d029f324f3a7p-39, + -0x1.21268c2290cb5p-38, -0x1.2f6de12d74afdp-39, 0x1.332ead763d55ap-38, 0x1.3cd3a7103e138p-39, -0x1.a64e5d1cdb028p-39, -0x1.1d674b3db2a42p-39, 0}, + +{-0x1.b84a0abf33534p-27, -0x1.89c6cd0cf2b65p-27, -0x1.09bb37091d4aep-27, -0x1.68f777b72ca95p-29, 0x1.60a5240c5ece1p-29, 0x1.c7421c28ef551p-28, + 0x1.2e75b6acb2116p-27, 0x1.30f14412b258cp-27, 0x1.f153992d28a09p-28, 0x1.3b80153a3c97bp-28, 0x1.df36fe4b5094cp-30, -0x1.724a2b185f507p-31, + -0x1.37cb36ce4237dp-29, -0x1.963d70f677f90p-29, -0x1.8d5c135b0af66p-29, -0x1.42fbc01c11a3bp-29, -0x1.baba060b7adb1p-30, -0x1.eaf481fbc6feap-31, + -0x1.5b5d0a354e49cp-32, 0x1.fb57bbdb6f854p-35, 0x1.2423823b5dcaep-32, 0x1.64e9c7f44ececp-32, 0x1.59b6fb115bcefp-32, 0x1.179a1737c24d9p-32, + 0x1.a9515bcf95bb0p-33, 0x1.1ca83baba64bdp-33, 0x1.826e7ef89b3cap-34, 0x1.7ab5cb5ca2db0p-35, 0x1.2ce997226e82dp-35, 0x1.fdd14ca5a6d38p-37, + 0x1.d35252de2a363p-37, -0x1.8dd5e799b3695p-39, 0x1.047fd46786432p-38, 0x1.aa8639c65a4a4p-38, 0x1.10495d2cdaee5p-41, -0x1.24b2b7e751230p-40, + 0x1.e2ec0b9e9b211p-40, 0x1.6203cc50754ffp-38, 0x1.f95c0def7238bp-38, 0x1.7b31a463405b9p-38, -0x1.a826fa90b3c96p-39, -0x1.3f6315812b719p-39, + 0x1.0862d42832ac6p-38, 0x1.1575d5fa4614cp-39, -0x1.18eb527929cedp-38, -0x1.21bd844e0e3b8p-39, 0x1.8233e415548a0p-39, 0x1.0501b16f5819bp-39, 0}, + +{0x1.9b4497171a29dp-39, 0x1.7f9c0bcd4b3e7p-32, 0x1.4928133bccac3p-31, 0x1.7b5a70f49485bp-31, 0x1.4f71ee2c4aff3p-31, 0x1.bca22e6a9cd38p-32, + 0x1.1c93a34970852p-33, -0x1.03d86c164d20cp-33, -0x1.448222383eb95p-32, -0x1.95aa76b3417ddp-32, -0x1.80448ecd34689p-32, -0x1.19d3f547d1f1fp-32, + -0x1.2c65995a6a63fp-33, -0x1.01b5832823cc6p-35, 0x1.97d70f56a4524p-35, 0x1.7d57df58d20a9p-34, 0x1.a3d6fe32773b9p-34, 
0x1.6ff53581ac827p-34, + 0x1.faff84d277a6fp-35, 0x1.39ff19e23455bp-35, 0x1.9b1e383b8e03dp-37, 0x1.fd37bce839816p-40, -0x1.31b58a910d109p-37, -0x1.480a28743a67fp-37, + -0x1.9a8b926ca51b4p-37, -0x1.14d6b0b9c8256p-37, -0x1.227dfd10a7f51p-37, -0x1.d1d5ba9e5676cp-42, -0x1.71c57d72b90eap-38, -0x1.018922e3bb1eap-40, + -0x1.e0970faab38e6p-39, 0x1.a442b8ab5ed33p-39, -0x1.3a6f0acbd7293p-40, -0x1.7c53be7062a3ap-39, -0x1.c562622693573p-44, 0x1.458e668db57cdp-41, + -0x1.d5f41a61e90a0p-41, -0x1.60d1f7c57cb11p-39, -0x1.f8fa4c98324fep-39, -0x1.7b178840b90e3p-39, 0x1.a8558cdf5220ap-40, 0x1.3f7acb241cdbbp-40, + -0x1.086dc81118428p-39, -0x1.15828db8b2da6p-40, 0x1.18f9d5a5099c3p-39, 0x1.21cd05249b8c9p-40, -0x1.82493a2d7a1fep-40, -0x1.0510a8a58c1abp-40, 0}, + +{0x1.4c0cf8eccd2e0p-35, 0x1.de696ed8004cbp-36, 0x1.62392d5363e58p-37, -0x1.21d68e1a8e4c7p-37, -0x1.867b57075ec9dp-36, -0x1.058af4c30abafp-35, + -0x1.dbb6594ed5127p-36, -0x1.6006d1f354794p-36, -0x1.311e96adfec96p-37, 0x1.2c82e5ef56703p-39, 0x1.6f2c1413cbe8ep-37, 0x1.c46886dd6c5d6p-37, + 0x1.92e273bf63d54p-37, 0x1.2982faf5df034p-37, 0x1.5ad37b1dc30c4p-38, 0x1.97104fd2630f8p-40, -0x1.38bcd955ecbb9p-40, -0x1.7779727d36c91p-39, + -0x1.4862c13c3ccf5p-39, -0x1.53facd6319433p-39, -0x1.de2f6e88b0926p-41, -0x1.fb0967f0fa611p-41, 0x1.5fadb405af344p-42, 0x1.e90319ef64411p-43, + 0x1.fc013fac4d3d7p-41, 0x1.0546d08a05cacp-41, 0x1.fa1b10c35012ep-41, -0x1.000d4354b8049p-41, 0x1.b68ee44b2b84bp-41, 0x1.cfa36d83ea2afp-48, + 0x1.5c41a6c8aaf3ap-41, -0x1.7edb2342ceb28p-41, 0x1.d9211942a37d9p-43, 0x1.39b815d399ba2p-41, 0x1.1fc46969db91bp-46, -0x1.1736507c25bafp-43, + 0x1.89bbcfdb5c677p-43, 0x1.28f22b295bc86p-41, 0x1.a9396e0b45a3bp-41, 0x1.3f409ac2dbfafp-41, -0x1.65682520f07a7p-42, -0x1.0d1586492d3b1p-42, + 0x1.bd6c9f236abc3p-42, 0x1.d376a4bd795bep-43, -0x1.d94e87dd31275p-42, -0x1.e82d04ff5649fp-43, 0x1.455b18d5d810fp-42, 0x1.b7c6a4ab711bdp-43, 0} + // clang-format on + }}; diff --git a/contrib/arm-optimized-routines/pl/math/v_erfc_4u.c 
b/contrib/arm-optimized-routines/pl/math/v_erfc_4u.c new file mode 100644 index 000000000000..c30635153a20 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfc_4u.c @@ -0,0 +1,168 @@ +/* + * Double-precision vector erfc(x) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "horner.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +/* Accurate exponential (vector variant of exp_dd). */ +v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t); + +#define One v_f64 (1.0) +#define AbsMask v_u64 (0x7fffffffffffffff) +#define Scale v_f64 (0x1.0000002p27) + +/* Coeffs for polynomial approximation on [0x1.0p-28., 31.]. */ +#define PX __v_erfc_data.poly +#define xint __v_erfc_data.interval_bounds + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (erfc, x, y, cmp); +} + +/* A structure to perform look-up in coeffs and other parameter + tables. */ +struct entry +{ + v_f64_t P[ERFC_POLY_ORDER + 1]; + v_f64_t xi; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + for (int j = 0; j <= ERFC_POLY_ORDER; ++j) + e.P[j] = PX[i][j]; + e.xi = xint[i]; +#else + for (int j = 0; j <= ERFC_POLY_ORDER; ++j) + { + e.P[j][0] = PX[i[0]][j]; + e.P[j][1] = PX[i[1]][j]; + } + e.xi[0] = xint[i[0]]; + e.xi[1] = xint[i[1]]; +#endif + return e; +} + +/* Accurate evaluation of exp(x^2) using compensated product + (x^2 ~ x*x + e2) and custom exp(y+d) routine for small + corrections d<<y. */ +static inline v_f64_t +v_eval_gauss (v_f64_t a) +{ + v_f64_t e2; + v_f64_t a2 = a * a; + + /* TwoProduct (Dekker) applied to a * a. */ + v_f64_t a_hi = -v_fma_f64 (Scale, a, -a); + a_hi = v_fma_f64 (Scale, a, a_hi); + v_f64_t a_lo = a - a_hi; + + /* Now assemble error term. 
*/ + e2 = v_fma_f64 (-a_hi, a_hi, a2); + e2 = v_fma_f64 (-a_hi, a_lo, e2); + e2 = v_fma_f64 (-a_lo, a_hi, e2); + e2 = v_fma_f64 (-a_lo, a_lo, e2); + + /* Fast and accurate evaluation of exp(-a2 + e2) where e2 << a2. */ + return V_NAME (exp_tail) (-a2, e2); +} + +/* Optimized double precision vector complementary error function erfc. + Maximum measured error is 3.64 ULP: + __v_erfc(0x1.4792573ee6cc7p+2) got 0x1.ff3f4c8e200d5p-42 + want 0x1.ff3f4c8e200d9p-42. */ +VPCS_ATTR +v_f64_t V_NAME (erfc) (v_f64_t x) +{ + v_f64_t z, p, y; + v_u64_t ix, atop, sign, i, cmp; + + ix = v_as_u64_f64 (x); + /* Compute fac as early as possible in order to get best performance. */ + v_f64_t fac = v_as_f64_u64 ((ix >> 63) << 62); + /* Use 12-bit for small, nan and inf case detection. */ + atop = (ix >> 52) & 0x7ff; + cmp = v_cond_u64 (atop - v_u64 (0x3cd) >= v_u64 (0x7ff - 0x3cd)); + + struct entry dat; + + /* All entries of the vector are out of bounds, take a short path. + Use smallest possible number above 28 representable in 12 bits. */ + v_u64_t out_of_bounds = v_cond_u64 (atop >= v_u64 (0x404)); + + /* Use sign to produce either 0 if x > 0, 2 otherwise. */ + if (v_all_u64 (out_of_bounds) && likely (v_any_u64 (~cmp))) + return fac; + + /* erfc(|x|) = P(|x|-x_i)*exp(-x^2). */ + + v_f64_t a = v_abs_f64 (x); + + /* Interval bounds are a logarithmic scale, i.e. interval n has + lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain + the interval index. */ + v_f64_t xp1 = a + v_f64 (1.0); + xp1 = xp1 * xp1; + xp1 = xp1 * xp1; + v_u64_t ixp1 = v_as_u64_f64 (xp1); + i = (ixp1 >> 52) - v_u64 (1023); + + /* Index cannot exceed number of polynomials. */ +#ifdef SCALAR + i = i <= (ERFC_NUM_INTERVALS) ? i : ERFC_NUM_INTERVALS; +#else + i = (v_u64_t){i[0] <= ERFC_NUM_INTERVALS ? i[0] : ERFC_NUM_INTERVALS, + i[1] <= ERFC_NUM_INTERVALS ? i[1] : ERFC_NUM_INTERVALS}; +#endif + /* Get coeffs of i-th polynomial. */ + dat = lookup (i); + + /* Evaluate Polynomial: P(|x|-x_i). 
*/ + z = a - dat.xi; +#define C(i) dat.P[i] + p = HORNER_12 (z, C); + + /* Evaluate Gaussian: exp(-x^2). */ + v_f64_t e = v_eval_gauss (a); + + /* Copy sign. */ + sign = v_as_u64_f64 (x) & ~AbsMask; + p = v_as_f64_u64 (v_as_u64_f64 (p) ^ sign); + + /* Assemble result as 2.0 - p * e if x < 0, p * e otherwise. */ + y = v_fma_f64 (p, e, fac); + + /* No need to fix value of y if x is out of bound, as + P[ERFC_NUM_INTERVALS]=0. */ + if (unlikely (v_any_u64 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, erfc, -6.0, 28.0) +PL_TEST_ULP (V_NAME (erfc), 3.15) +PL_TEST_INTERVAL (V_NAME (erfc), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-1022, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-1022, -0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erfc), 0, inf, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erfc_data.c b/contrib/arm-optimized-routines/pl/math/v_erfc_data.c new file mode 100644 index 000000000000..3c47033c1170 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfc_data.c @@ -0,0 +1,96 @@ +/* + * Polynomial coefficients for double-precision erfc(x) vector function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Coefficients for 20 order-12 polynomials used in v_erfc. The intervals have + the same bounds as the scalar algorithm, with the exception of the lower + bound of the first interval which is larger. This is because the vector + variants fall back to the scalar for tiny arguments, meaning that we can use + a slightly different approach which is more precise for larger inputs but + unacceptably imprecise for tiny inputs. */ + +const struct v_erfc_data __v_erfc_data = { + +/* Bounds for 20 intervals spanning [0x1.0p-28., 31.]. 
Interval bounds are a + logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the + exception of the first interval. */ +.interval_bounds = { + 0x1p-28, /* If xmin=2^-28, 0 otherwise. */ + 0x1.837f0518db8a9p-3, /* 0.189. */ + 0x1.a827999fcef32p-2, /* 0.414. */ + 0x1.5d13f32b5a75bp-1, /* 0.682. */ + 0x1.0p0, /* 1.000. */ + 0x1.60dfc14636e2ap0, /* 1.378. */ + 0x1.d413cccfe779ap0, /* 1.828. */ + 0x1.2e89f995ad3adp1, /* 2.364. */ + 0x1.8p1, /* 3.000. */ + 0x1.e0dfc14636e2ap1, /* 3.757. */ + 0x1.2a09e667f3bcdp2, /* 4.657. */ + 0x1.6e89f995ad3adp2, /* 5.727. */ + 0x1.cp2, /* 7.000. */ + 0x1.106fe0a31b715p3, /* 8.514. */ + 0x1.4a09e667f3bcdp3, /* 10.31. */ + 0x1.8e89f995ad3adp3, /* 12.45. */ + 0x1.ep3, /* 15.00. */ + 0x1.206fe0a31b715p4, /* 18.03. */ + 0x1.5a09e667f3bcdp4, /* 21.63. */ + 0x1.9e89f995ad3adp4, /* 25.91. */ + 0x1.fp4 /* 31.00. */ +}, + +/* Generated using fpminimax algorithm on each interval separately. The + polynomial approximates erfc(x + a) * exp((x + a) ^ 2) in the interval + [0;b-a], where [a;b] is the interval in which the input lies. Note this is + slightly different from the scalar polynomial, which approximates + erfc(x + a) * exp(x ^ 2). See v_erfc.sollya for more details. */ +.poly = { +/* 3.725290298461914e-9 < x < 0.18920711500272103. */ +{0x1.ffffffdbe4516p-1, -0x1.20dd74e429b54p0, 0x1.ffffffb7c6a67p-1, -0x1.8127466fa2ec9p-1, 0x1.ffffff6eeff5ap-2, -0x1.341f668c90dccp-2, 0x1.5554aca74e5d6p-3, -0x1.6014d9d3fed0dp-4, 0x1.546b5f2c85127p-5, -0x1.2f7ec79acc129p-6, 0x1.a27e53703b7abp-8, 0x1.7b18bce311fa3p-12, -0x1.1897cda04df3ap-9}, +/* 0.18920711500272103 < x < 0.41421356237309515. */ +{0x1.a2b43de077724p-1, -0x1.a3495bb58664cp-1, 0x1.535f3ff4547e6p-1, -0x1.d96eea2951a7cp-2, 0x1.269566a956371p-2, -0x1.4e281de026b47p-3, 0x1.5ea071b652a2fp-4, -0x1.57f46cfca7024p-5, 0x1.3db28243f06abp-6, -0x1.138745eef6f26p-7, 0x1.a9cd70bad344p-9, -0x1.c6e4fda8920c4p-11, 0x1.624709ca2bc71p-16}, +/* 0.41421356237309515 < x < 0.681792830507429. 
*/ +{0x1.532e75764e513p-1, -0x1.28be34f327f9dp-1, 0x1.b088738cca84cp-2, -0x1.14377551bd5c8p-2, 0x1.3e1ecedd64246p-3, -0x1.5087f3110eb57p-4, 0x1.4b3c61efcb562p-5, -0x1.324cc70a4f459p-6, 0x1.0cd19a96af21bp-7, -0x1.cc2ccc725d07p-9, 0x1.a3ba67a7d02b4p-10, -0x1.b1943295882abp-11, 0x1.53a1c5fdf8e67p-12}, +/* 0.681792830507429 < x < 1. */ +{0x1.10f974588f63dp-1, -0x1.9b032139e3367p-2, 0x1.09b942b8a951dp-2, -0x1.327553909cb88p-3, 0x1.42819b6c9a14p-4, -0x1.3a6d6f1924825p-5, 0x1.1f1864dd6f28fp-6, -0x1.ef12c5e9f3232p-8, 0x1.962ac63d55aa1p-9, -0x1.4146d9206419cp-10, 0x1.f823f62268229p-12, -0x1.837ab488d5ed8p-13, 0x1.aa021ae16edfep-15}, +/* 1 < x < 1.378414230005442. */ +{0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c034p-2, 0x1.3c27283c31939p-3, -0x1.44837f88a0ecdp-4, 0x1.33cad0dc779c8p-5, -0x1.10fcef8294e8dp-6, 0x1.c8cb3e5a6a5a6p-8, -0x1.6aedbd3a05f1cp-9, 0x1.1325c0bf9a0cap-10, -0x1.8e28d61a0f646p-12, 0x1.0d554e2ab3652p-13, -0x1.35b5f9ac296ebp-15, 0x1.b8faf07e2527dp-18}, +/* 1.378414230005442 < x < 1.8284271247461903. */ +{0x1.5ee444130b7dbp-2, -0x1.78396ab2083e8p-3, 0x1.6e617ec5bc039p-4, -0x1.49e60f6238765p-5, 0x1.16064fb4428c9p-6, -0x1.ba80a8575a434p-8, 0x1.4ec30f2efeb8p-9, -0x1.e40456c735f09p-11, 0x1.4f7ee6b7885b7p-12, -0x1.bc9997995fdecp-14, 0x1.1169f7327ff2p-15, -0x1.174826d000852p-17, 0x1.5506a7433e925p-20}, +/* 1.8284271247461903 < x < 2.363585661014858. */ +{0x1.19a22c064d4eap-2, -0x1.f645498cae1b3p-4, 0x1.a0565950e1256p-5, -0x1.446605c186f6dp-6, 0x1.df1231b47ff04p-8, -0x1.515164d13dfafp-9, 0x1.c72bde869ad61p-11, -0x1.2768fbf9b1d6ep-12, 0x1.71bd3a1b851e9p-14, -0x1.bca5b5942017cp-16, 0x1.f2d480b3a2e63p-18, -0x1.d339662d53467p-20, 0x1.06d67ebf792bp-22}, +/* 2.363585661014858 < x < 3. 
*/ +{0x1.c57f0542a7637p-3, -0x1.4e5535c17af25p-4, 0x1.d31272523acfep-6, -0x1.3727cbbfd1bfcp-7, 0x1.8d6730b8c5a4cp-9, -0x1.e88548286036fp-11, 0x1.21f6e89456853p-12, -0x1.4d4b7787bd3c2p-14, 0x1.735dc84e7ff16p-16, -0x1.8eb02db832048p-18, 0x1.8dfb8add3b86ep-20, -0x1.47a340d76c72bp-22, 0x1.3e5925ffebe6bp-25}, +/* 3 < x < 3.756828460010884. */ +{0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b1adp-5, 0x1.043fe1a98c3b9p-6, -0x1.259061ba34453p-8, 0x1.409cc2cc96bedp-10, -0x1.53dec3fd6c443p-12, 0x1.5e72f7baf3554p-14, -0x1.601aa94bf21eep-16, 0x1.58e730ceaa91dp-18, -0x1.4762cbd256163p-20, 0x1.22b8bea5d4a5ap-22, -0x1.ac197af37fcadp-25, 0x1.74cdf138a0b73p-28}, +/* 3.756828460010884 < x < 4.656854249492381. */ +{0x1.29a8a4e95063ep-3, -0x1.29a8a316d331dp-5, 0x1.21876b3fe50cfp-7, -0x1.1276f2d8eefd9p-9, 0x1.fbff521741e5cp-12, -0x1.cb9ce996b9601p-14, 0x1.971075371ef81p-16, -0x1.61458571e4738p-18, 0x1.2c51c21b7ab9ep-20, -0x1.f01e444a666c3p-23, 0x1.7e8f2979b67f1p-25, -0x1.e505367843027p-28, 0x1.67809d68de49cp-31}, +/* 4.656854249492381 < x < 5.727171322029716. */ +{0x1.e583024e2bc7fp-4, -0x1.8fb458acb5acep-6, 0x1.42b9dffac075cp-8, -0x1.ff9fe9a48522p-11, 0x1.8e7e866f4f073p-13, -0x1.313aeee1c2d45p-15, 0x1.cc299efd7374cp-18, -0x1.5587e53442d66p-20, 0x1.f2aca160f159bp-23, -0x1.62ae4834dcda7p-25, 0x1.d6b070147cb37p-28, -0x1.fee399e7be1bfp-31, 0x1.41d6f9fbc9515p-34}, +/* 5.727171322029716 < x < 7. */ +{0x1.8d9cbafa30408p-4, -0x1.0dd14614ed1cfp-6, 0x1.6943976ea6bf4p-9, -0x1.dd6f05f3b914cp-12, 0x1.37891317e7bcfp-14, -0x1.91a81ce9014a2p-17, 0x1.ffcac303208b9p-20, -0x1.424f1af78feb3p-22, 0x1.90b8edbca12a5p-25, -0x1.e69bea0338c7fp-28, 0x1.13b974a710373p-30, -0x1.fdc9aa9359794p-34, 0x1.105fc772b5a66p-37}, +/* 7 < x < 8.513656920021768. 
*/ +{0x1.46dc6bf900f68p-4, -0x1.6e4b45246f95p-7, 0x1.96a3de47d4bd7p-10, -0x1.bf5070eccb409p-13, 0x1.e7af6e83607a2p-16, -0x1.078bf5306f9eep-18, 0x1.1a6e8327243adp-21, -0x1.2c1e7368c7809p-24, 0x1.3bc83557dac43p-27, -0x1.45a6405b2e649p-30, 0x1.3aac4888689ebp-33, -0x1.f1fa23448a168p-37, 0x1.c868668755778p-41}, +/* 8.513656920021768 < x < 10.313708498984761. */ +{0x1.0d9a17e032288p-4, -0x1.f3e942ff4df7p-8, 0x1.cc77f09dabc5cp-11, -0x1.a56e8bfd32da8p-14, 0x1.7f49e31164409p-17, -0x1.5a73f46a6afc9p-20, 0x1.374240ce973d2p-23, -0x1.15e8d473b728cp-26, 0x1.ec3ec79699378p-30, -0x1.ab3b8aba63362p-33, 0x1.5a1381cfe2866p-36, -0x1.c78e252ce77ccp-40, 0x1.589857ceaaaeep-44}, +/* 10.313708498984761 < x < 12.454342644059432. */ +{0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cbb1p-8, 0x1.0645980ecbbfcp-11, -0x1.8f86f887f6598p-15, 0x1.2ef80cd9e00b1p-18, -0x1.c97ffd66720e4p-22, 0x1.57f0eeecf030ap-25, -0x1.016df7d5e28d9p-28, 0x1.7f0d022922f1dp-32, -0x1.1849731f004aep-35, 0x1.8149e7ca0fb3cp-39, -0x1.b1fe4abe62d81p-43, 0x1.1ae4d60247651p-47}, +/* 12.454342644059432 < x < 15. */ +{0x1.71eafbd9f5877p-5, -0x1.d83714d90461fp-9, 0x1.2c74dbacd45fdp-12, -0x1.7d27f3cfe160ep-16, 0x1.e20b13b8d32e3p-20, -0x1.2fe33cb2bce33p-23, 0x1.7dfd564d69a07p-27, -0x1.dea62ef0f7d7ep-31, 0x1.2a7b946273ea5p-34, -0x1.6eb665bad5b72p-38, 0x1.a8191750e8bf9p-42, -0x1.92d8a86cbd0fcp-46, 0x1.bba272feef841p-51}, +/* 15 < x < 18.027313840043536. */ +{0x1.33714a024097ep-5, -0x1.467f441a50bc3p-9, 0x1.59fa2994c6f7ap-13, -0x1.6dd369d642b7dp-17, 0x1.81fb2aaf2e37p-21, -0x1.966040990b623p-25, 0x1.aaee55e15a079p-29, -0x1.bf756fc8ef04p-33, 0x1.d2daf554e0157p-37, -0x1.dec63e10d317p-41, 0x1.cae915bab7704p-45, -0x1.6537fbb62a8edp-49, 0x1.3f14bd5531da8p-54}, +/* 18.027313840043536 < x < 21.627416997969522. 
*/ +{0x1.fff97acd75487p-6, -0x1.c502e8e46eb81p-10, 0x1.903b065062756p-14, -0x1.6110aa5e81885p-18, 0x1.36fd4c13c4f1fp-22, -0x1.11848650be987p-26, 0x1.e06596bf6a27p-31, -0x1.a527876771d55p-35, 0x1.6fe1b92a40eb8p-39, -0x1.3c6eb50b23bc6p-43, 0x1.fead2230125dp-48, -0x1.5073427c5207dp-52, 0x1.ff420973fa51dp-58}, +/* 21.627416997969522 < x < 25.908685288118864. */ +{0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf8e5p-10, 0x1.d0ddfb858b60ap-15, -0x1.5673f4a8bb08ep-19, 0x1.f80488e89ddb9p-24, -0x1.728391905fcf3p-28, 0x1.101538d7e30bap-32, -0x1.8f16f49d0fa3bp-37, 0x1.23bbaea534034p-41, -0x1.a40119533ee1p-46, 0x1.1b75770e435fdp-50, -0x1.3804bdeb33efdp-55, 0x1.8ba4e7838a4dp-61}, +/* 25.908685288118864 < x < 31. */ +{0x1.64839d636f92bp-6, -0x1.b7adf753623afp-11, 0x1.0eec0b635a0c4p-15, -0x1.4da09b802ef48p-20, 0x1.9a8b149f5ddf1p-25, -0x1.f8d1f722c65bap-30, 0x1.36247d9a20e19p-34, -0x1.7cbd25180c1d3p-39, 0x1.d243c7a5c8331p-44, -0x1.19e00cc6b1e08p-48, 0x1.418cb6823f2d9p-53, -0x1.2dfdc526c43acp-58, 0x1.49885a987486fp-64}, +/* Dummy interval for x>31 */ +{0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, + 0x0p0, 0x0p0, 0x0p0} +} +}; diff --git a/contrib/arm-optimized-routines/pl/math/v_erfcf_1u.c b/contrib/arm-optimized-routines/pl/math/v_erfcf_1u.c new file mode 100644 index 000000000000..963490d789bd --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erfcf_1u.c @@ -0,0 +1,183 @@ +/* + * Single-precision vector erfc(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "erfcf.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)] + +VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t); + +static VPCS_ATTR NOINLINE v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (erfcf, x, y, special); +} + +static inline uint32_t +interval_index (uint32_t ia12) +{ + // clang-format off + return (ia12 < 0x400 ? 0 : + (ia12 < 0x408 ? 1 : + (ia12 < 0x410 ? 2 : + 3))); + // clang-format on +} + +/* The C macro wraps the coeffs argument in order to make the + poynomial evaluation more readable. In the scalarised variant the + second pointer is ignored. */ +#ifdef SCALAR +#define C(i) coeff1[i] +#else +#define C(i) ((v_f64_t){coeff1[i], coeff2[i]}) +#endif + +static inline v_f64_t +v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1, + const double *coeff2) +{ + v_f64_t x2 = x * x; + v_f64_t x4 = x2 * x2; + v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C); + v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0)); + return poly * gauss; +} + +static inline float +approx_poly_gauss (float abs_x, const double *coeff) +{ + return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x)); +} + +static v_f32_t +v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes) +{ +#ifdef SCALAR + float y = approx_poly_gauss (abs_x, P (ia12)); + return sign ? 2 - y : y; +#else + float32x2_t lo32 = {0, 0}; + float32x2_t hi32 = {0, 0}; + /* The polynomial and Gaussian components must be calculated in + double precision in order to meet the required ULP error. This + means we have to promote low and high halves of the + single-precision input vector to two separate double-precision + input vectors. 
This incurs some overhead, and there is also + overhead to loading the polynomial coefficients as this cannot be + done in a vector fashion. This would be wasted effort for + elements which lie in the 'boring' zone, as they will be + overwritten later. Hence we use the lanes parameter to only do + the promotion on a pair of lanes if both of those lanes are + interesting and not special cases. If one lane is inactive, we + use a scalar routine which is shared with the scalar variant. */ + if (lanes[0] & lanes[1]) + { + lo32 = vcvt_f32_f64 ( + v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)), + P (ia12[0]), P (ia12[1]))); + } + else if (lanes[0]) + { + lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0])); + } + else if (lanes[1]) + { + lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1])); + } + + if (lanes[2] & lanes[3]) + { + hi32 + = vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x), + P (ia12[2]), P (ia12[3]))); + } + else if (lanes[2]) + { + hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2])); + } + else if (lanes[3]) + { + hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3])); + } + + v_f32_t y = vcombine_f32 (lo32, hi32); + + if (v_any_u32 (sign)) + { + y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y); + } + + return y; +#endif +} + +/* Optimized single-precision vector complementary error function + erfcf. Max measured error: 0.750092 at various values between + -0x1.06521p-20 and -0x1.add1dap-17. For example: + __v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0 + +0.249908 ulp err 0.250092. 
*/ +VPCS_ATTR +v_f32_t V_NAME (erfcf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t ia = ix & 0x7fffffff; + v_u32_t ia12 = ia >> 20; + v_u32_t sign = ix >> 31; + v_u32_t inf_ia12 = v_u32 (0x7f8); + + v_u32_t special_cases + = v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328)); + v_u32_t in_bounds + = v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3))); + v_f32_t boring_zone = v_as_f32_u32 (sign << 30); + +#ifdef SCALAR + if (unlikely (special_cases)) + { + if (ia12 >= 0x7f8) + return (float) (sign << 1) + 1.0f / x; /* Special cases. */ + else + return 1.0f - x; /* Small case. */ + } + else if (likely (!in_bounds)) + { + return sign ? boring_zone : __math_uflowf (boring_zone); + } +#endif + + v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12, + in_bounds & ~special_cases); + +#ifndef SCALAR + y = vbslq_f32 (~in_bounds, boring_zone, y); + + if (unlikely (v_any_u32 (special_cases))) + { + return specialcase (x, y, special_cases); + } +#endif + + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, erfc, -6.0, 28.0) +PL_TEST_ULP (V_NAME (erfcf), 0.26) +PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erff_1u5.c b/contrib/arm-optimized-routines/pl/math/v_erff_1u5.c new file mode 100644 index 000000000000..3a25cc8751d1 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erff_1u5.c @@ -0,0 +1,116 @@ +/* + * Single-precision vector erf(x) function. + * + * Copyright (c) 2020-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "include/mathlib.h" +#include "math_config.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +VPCS_ATTR v_f32_t V_NAME (expf) (v_f32_t); + +#define AbsMask v_u32 (0x7fffffff) + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + return v_call_f32 (erff, x, y, cmp); +} + +/* A structure to perform look-up in coeffs and other parameter tables. */ +struct entry +{ + v_f32_t P[V_ERFF_NCOEFFS]; +}; + +static inline struct entry +lookup (v_u32_t i) +{ + struct entry e; +#ifdef SCALAR + for (int j = 0; j < V_ERFF_NCOEFFS; ++j) + e.P[j] = __v_erff_data.coeffs[j][i]; +#else + for (int j = 0; j < V_ERFF_NCOEFFS; ++j) + { + e.P[j][0] = __v_erff_data.coeffs[j][i[0]]; + e.P[j][1] = __v_erff_data.coeffs[j][i[1]]; + e.P[j][2] = __v_erff_data.coeffs[j][i[2]]; + e.P[j][3] = __v_erff_data.coeffs[j][i[3]]; + } +#endif + return e; +} + +/* Optimized single precision vector error function erf. + Maximum measured at +/- 0.931, 1.25ULP: + v_erff(-0x1.dc59fap-1) got -0x1.9f9c88p-1 + want -0x1.9f9c8ap-1. */ +VPCS_ATTR +v_f32_t V_NAME (erff) (v_f32_t x) +{ + /* Handle both inf/nan as well as small values (|x|<2^-28). If any condition + in the lane is true then a loop over scalar calls will be performed. */ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t atop = (ix >> 16) & v_u32 (0x7fff); + v_u32_t cmp = v_cond_u32 (atop - v_u32 (0x3180) >= v_u32 (0x7ff0 - 0x3180)); + + /* Get sign and absolute value. */ + v_u32_t sign = ix & ~AbsMask; + /* |x| < 0.921875. */ + v_u32_t red = v_calt_f32 (x, v_f32 (0.921875f)); + /* |x| > 4.0. */ + v_u32_t bor = v_cagt_f32 (x, v_f32 (4.0f)); + /* Avoid dependency in abs(x) in division (and comparison). */ + v_u32_t i = v_sel_u32 (red, v_u32 (0), v_u32 (1)); + + /* Get polynomial coefficients. 
*/ + struct entry dat = lookup (i); + + v_f32_t a = v_abs_f32 (x); + v_f32_t z = v_sel_f32 (red, x * x, a); + + /* Evaluate Polynomial of |x| or x^2. */ + v_f32_t r = dat.P[6]; + r = v_fma_f32 (z, r, dat.P[5]); + r = v_fma_f32 (z, r, dat.P[4]); + r = v_fma_f32 (z, r, dat.P[3]); + r = v_fma_f32 (z, r, dat.P[2]); + r = v_fma_f32 (z, r, dat.P[1]); + r = v_sel_f32 (red, r, v_fma_f32 (z, r, dat.P[0])); + r = v_fma_f32 (a, r, a); + + /* y = |x| + |x|*P(|x|) if |x| < 0.921875 + 1 - exp (-(|x|+|x|*P(x^2))) otherwise. */ + v_f32_t y = v_sel_f32 (red, r, v_f32 (1.0f) - V_NAME (expf) (-r)); + + /* Boring domain (absolute value is required to get the sign of erf(-nan) + right). */ + y = v_sel_f32 (bor, v_f32 (1.0f), v_abs_f32 (y)); + + /* y=erf(x) if x>0, -erf(-x) otherwise. */ + y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, erf, -4.0, 4.0) +PL_TEST_ULP (V_NAME (erff), 0.76) +PL_TEST_INTERVAL (V_NAME (erff), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (erff), 0x1p-127, 0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erff), -0x1p-127, -0x1p-26, 40000) +PL_TEST_INTERVAL (V_NAME (erff), 0x1p-26, 0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erff), -0x1p-26, -0x1p3, 40000) +PL_TEST_INTERVAL (V_NAME (erff), 0, inf, 40000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_erff_data.c b/contrib/arm-optimized-routines/pl/math/v_erff_data.c new file mode 100644 index 000000000000..73ccb5cbcfa8 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_erff_data.c @@ -0,0 +1,18 @@ +/* + * Data for approximation of vector erff. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* Minimax approximation of erff. 
*/ +const struct v_erff_data __v_erff_data + = {.coeffs = {{0x0p0f, 0x1.079d0cp-3f}, + {0x1.06eba6p-03f, 0x1.450aa0p-1}, + {-0x1.8126e0p-02f, 0x1.b55cb0p-4f}, + {0x1.ce1a46p-04f, -0x1.8d6300p-6f}, + {-0x1.b68bd2p-06f, 0x1.fd1336p-9f}, + {0x1.473f48p-08f, -0x1.91d2ccp-12f}, + {-0x1.3a1a82p-11f, 0x1.222900p-16f}}}; diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_tail.c b/contrib/arm-optimized-routines/pl/math/v_exp_tail.c new file mode 100644 index 000000000000..fd38aa8ae6ea --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp_tail.c @@ -0,0 +1,75 @@ +/* + * Double-precision vector e^(x+tail) function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "math_config.h" +#if V_SUPPORTED +#include "v_exp_tail.h" + +#define C1 v_f64 (C1_scal) +#define C2 v_f64 (C2_scal) +#define C3 v_f64 (C3_scal) +#define InvLn2 v_f64 (InvLn2_scal) +#define Ln2hi v_f64 (Ln2hi_scal) +#define Ln2lo v_f64 (Ln2lo_scal) + +#define IndexMask v_u64 (IndexMask_scal) +#define Shift v_f64 (Shift_scal) +#define Thres v_f64 (Thres_scal) + +VPCS_ATTR +static v_f64_t +specialcase (v_f64_t s, v_f64_t y, v_f64_t n) +{ + v_f64_t absn = v_abs_f64 (n); + + /* 2^(n/N) may overflow, break it up into s1*s2. */ + v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000); + v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b); + v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b); + v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N)); + v_f64_t r1 = s1 * s1; + v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1; + return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0))); +} + +VPCS_ATTR +v_f64_t V_NAME (exp_tail) (v_f64_t x, v_f64_t xtail) +{ + v_f64_t n, r, s, y, z; + v_u64_t cmp, u, e, i; + + cmp = v_cond_u64 (v_abs_f64 (x) > Thres); + + /* n = round(x/(ln2/N)). 
*/ + z = v_fma_f64 (x, InvLn2, Shift); + u = v_as_u64_f64 (z); + n = z - Shift; + + /* r = x - n*ln2/N. */ + r = x; + r = v_fma_f64 (-Ln2hi, n, r); + r = v_fma_f64 (-Ln2lo, n, r); + + e = u << (52 - V_EXP_TAIL_TABLE_BITS); + i = u & IndexMask; + + /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ + y = v_fma_f64 (C3, r, C2); + y = v_fma_f64 (y, r, C1); + y = v_fma_f64 (y, r, v_f64 (1.0)); + y = v_fma_f64 (y, r, xtail); + + /* s = 2^(n/N). */ + u = v_lookup_u64 (Tab, i); + s = v_as_f64_u64 (u + e); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (s, y, n); + return v_fma_f64 (y, s, s); +} +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_tail.h b/contrib/arm-optimized-routines/pl/math/v_exp_tail.h new file mode 100644 index 000000000000..903f1fd95717 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp_tail.h @@ -0,0 +1,21 @@ +/* + * Constants for double-precision e^(x+tail) vector function. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define C1_scal 0x1.fffffffffffd4p-2 +#define C2_scal 0x1.5555571d6b68cp-3 +#define C3_scal 0x1.5555576a59599p-5 +#define InvLn2_scal 0x1.71547652b82fep8 /* N/ln2. */ +#define Ln2hi_scal 0x1.62e42fefa39efp-9 /* ln2/N. */ +#define Ln2lo_scal 0x1.abc9e3b39803f3p-64 + +#define N (1 << V_EXP_TAIL_TABLE_BITS) +#define Tab __v_exp_tail_data +#define IndexMask_scal (N - 1) +#define Shift_scal 0x1.8p+52 +#define Thres_scal 704.0 diff --git a/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c b/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c new file mode 100644 index 000000000000..675eb769bf07 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_exp_tail_data.c @@ -0,0 +1,97 @@ +/* + * Lookup table for double-precision e^(x+tail) vector function. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* 2^(j/N), j=0..N (where N = 256). */ +const uint64_t __v_exp_tail_data[] + = {0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, + 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, + 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, + 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, + 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, + 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, + 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, + 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, + 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, + 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, + 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, + 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, + 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, + 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, + 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, + 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, + 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, + 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, + 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, + 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, + 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, + 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, + 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, + 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, + 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, + 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, + 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, + 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, + 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, + 
0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, + 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, + 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, + 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, + 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, + 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, + 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, + 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, + 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, + 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, + 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, + 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, + 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, + 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, + 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, + 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, + 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, + 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, + 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, + 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, + 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, + 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, + 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, + 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, + 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, + 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, + 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, + 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, + 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, + 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, + 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, + 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, + 
0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, + 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, + 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, + 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, + 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, + 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, + 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, + 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, + 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, + 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b, + 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, + 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, + 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, + 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, + 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, + 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, + 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, + 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, + 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, + 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, + 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, + 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, + 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, + 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, + 0x3feff9d96b2a23d9}; diff --git a/contrib/arm-optimized-routines/pl/math/v_expf.c b/contrib/arm-optimized-routines/pl/math/v_expf.c new file mode 100644 index 000000000000..a422e69feb62 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_expf.c @@ -0,0 +1,83 @@ +/* + * Single-precision vector e^x function. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" +#if V_SUPPORTED + +static const float Poly[] = { + /* maxerr: 1.45358 +0.5 ulp. */ + 0x1.0e4020p-7f, + 0x1.573e2ep-5f, + 0x1.555e66p-3f, + 0x1.fffdb6p-2f, + 0x1.ffffecp-1f, +}; +#define C0 v_f32 (Poly[0]) +#define C1 v_f32 (Poly[1]) +#define C2 v_f32 (Poly[2]) +#define C3 v_f32 (Poly[3]) +#define C4 v_f32 (Poly[4]) + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define Ln2hi v_f32 (0x1.62e4p-1f) +#define Ln2lo v_f32 (0x1.7f7d1cp-20f) + +VPCS_ATTR +static v_f32_t +specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale) +{ + /* 2^n may overflow, break it up into s1*s2. */ + v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000); + v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b); + v_f32_t s2 = v_as_f32_u32 (e - b); + v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f)); + v_u32_t r2 = v_as_u32_f32 (s1 * s1); + v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale)); + return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0)); +} + +VPCS_ATTR +v_f32_t +V_NAME(expf) (v_f32_t x) +{ + v_f32_t n, r, r2, scale, p, q, poly, absn, z; + v_u32_t cmp, e; + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ +#if 1 + z = v_fma_f32 (x, InvLn2, Shift); + n = z - Shift; + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_f32 (z) << 23; +#else + z = x * InvLn2; + n = v_round_f32 (z); + r = v_fma_f32 (n, -Ln2hi, x); + r = v_fma_f32 (n, -Ln2lo, r); + e = v_as_u32_s32 (v_round_s32 (z)) << 23; +#endif + scale = v_as_f32_u32 (e + v_u32 (0x3f800000)); + absn = v_abs_f32 (n); + cmp = v_cond_u32 (absn > v_f32 (126.0f)); + r2 = r * r; + p = v_fma_f32 (C0, r, C1); + q = v_fma_f32 (C2, r, C3); + q = v_fma_f32 (p, r2, q); + p = C4 * r; + poly = v_fma_f32 (q, r2, p); + if (unlikely (v_any_u32 (cmp))) + return specialcase (poly, n, e, absn, cmp, scale); + return v_fma_f32 (poly, scale, scale); +} +VPCS_ALIAS +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c new file mode 100644 index 000000000000..4b491d17feef --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_expm1_2u5.c @@ -0,0 +1,113 @@ +/* + * Double-precision vector exp(x) - 1 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define InvLn2 v_f64 (0x1.71547652b82fep0) +#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) +#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) +#define Shift v_f64 (0x1.8p52) +#define TinyBound \ + 0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */ +#define SpecialBound \ + 0x40862b7d369a5aa9 /* 0x1.62b7d369a5aa9p+9. For |x| > SpecialBound, the \ + final stage of the algorithm overflows so fall back to \ + scalar. */ +#define AbsMask 0x7fffffffffffffff +#define One 0x3ff0000000000000 + +#define C(i) v_f64 (__expm1_poly[i]) + +static inline v_f64_t +eval_poly (v_f64_t f, v_f64_t f2) +{ + /* Evaluate custom polynomial using Estrin scheme. 
*/ + v_f64_t p_01 = v_fma_f64 (f, C (1), C (0)); + v_f64_t p_23 = v_fma_f64 (f, C (3), C (2)); + v_f64_t p_45 = v_fma_f64 (f, C (5), C (4)); + v_f64_t p_67 = v_fma_f64 (f, C (7), C (6)); + v_f64_t p_89 = v_fma_f64 (f, C (9), C (8)); + + v_f64_t p_03 = v_fma_f64 (f2, p_23, p_01); + v_f64_t p_47 = v_fma_f64 (f2, p_67, p_45); + v_f64_t p_8a = v_fma_f64 (f2, C (10), p_89); + + v_f64_t f4 = f2 * f2; + v_f64_t p_07 = v_fma_f64 (f4, p_47, p_03); + return v_fma_f64 (f4 * f4, p_8a, p_07); +} + +/* Double-precision vector exp(x) - 1 function. + The maximum error observed error is 2.18 ULP: + __v_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2 + want 0x1.a8b9ea8d66e2p-2. */ +VPCS_ATTR +v_f64_t V_NAME (expm1) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t ax = ix & AbsMask; + +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, fall back to the scalar + variant for all lanes if any of them should trigger an exception. */ + v_u64_t special = v_cond_u64 ((ax >= SpecialBound) | (ax <= TinyBound)); + if (unlikely (v_any_u64 (special))) + return v_call_f64 (expm1, x, x, v_u64 (-1)); +#else + /* Large input, NaNs and Infs. */ + v_u64_t special + = v_cond_u64 ((ax >= SpecialBound) | (ix == 0x8000000000000000)); +#endif + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift; + v_s64_t i = v_to_s64_f64 (j); + v_f64_t f = v_fma_f64 (j, MLn2hi, x); + f = v_fma_f64 (j, MLn2lo, f); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + v_f64_t f2 = f * f; + v_f64_t p = v_fma_f64 (f2, eval_poly (f, f2), f); + + /* Assemble the result. 
+ expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One); + /* expm1(x) ~= p * t + (t - 1). */ + v_f64_t y = v_fma_f64 (p, t, t - 1); + +#if !WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (special))) + return v_call_f64 (expm1, x, y, special); +#endif + + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (V_NAME (expm1), 1.68) +PL_TEST_EXPECT_FENV (V_NAME (expm1), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (expm1), 0, 0x1p-51, 1000) +PL_TEST_INTERVAL (V_NAME (expm1), -0, -0x1p-51, 1000) +PL_TEST_INTERVAL (V_NAME (expm1), 0x1p-51, 0x1.63108c75a1937p+9, 100000) +PL_TEST_INTERVAL (V_NAME (expm1), -0x1p-51, -0x1.740bf7c0d927dp+9, 100000) +PL_TEST_INTERVAL (V_NAME (expm1), 0x1.63108c75a1937p+9, inf, 100) +PL_TEST_INTERVAL (V_NAME (expm1), -0x1.740bf7c0d927dp+9, -inf, 100) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c new file mode 100644 index 000000000000..ab132427e58d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_expm1f_1u6.c @@ -0,0 +1,94 @@ +/* + * Single-precision vector exp(x) - 1 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define MLn2hi v_f32 (-0x1.62e4p-1f) +#define MLn2lo v_f32 (-0x1.7f7d1cp-20f) +#define AbsMask (0x7fffffff) +#define One (0x3f800000) +#define SpecialBound \ + (0x42af5e20) /* asuint(0x1.5ebc4p+6). Largest value of x for which expm1(x) \ + should round to -1. */ +#define TinyBound (0x34000000) /* asuint(0x1p-23). */ + +#define C(i) v_f32 (__expm1f_poly[i]) + +/* Single-precision vector exp(x) - 1 function. + The maximum error is 1.51 ULP: + expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2 + want 0x1.e2fb94p-2. 
*/ +VPCS_ATTR +v_f32_t V_NAME (expm1f) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t ax = ix & AbsMask; + +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, fall back to the scalar + variant for all lanes if any of them should trigger an exception. */ + v_u32_t special + = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000) | (ax < TinyBound)); + if (unlikely (v_any_u32 (special))) + return v_call_f32 (expm1f, x, x, v_u32 (0xffffffff)); +#else + /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf and -0. */ + v_u32_t special = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000)); +#endif + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift; + v_s32_t i = v_to_s32_f32 (j); + v_f32_t f = v_fma_f32 (j, MLn2hi, x); + f = v_fma_f32 (j, MLn2lo, f); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + + v_f32_t p = v_fma_f32 (C (4), f, C (3)); + p = v_fma_f32 (p, f, C (2)); + p = v_fma_f32 (p, f, C (1)); + p = v_fma_f32 (p, f, C (0)); + p = v_fma_f32 (f * f, p, f); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One); + /* expm1(x) ~= p * t + (t - 1). 
*/ + v_f32_t y = v_fma_f32 (p, t, t - 1); + +#if !WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (special))) + return v_call_f32 (expm1f, x, y, special); +#endif + + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, expm1, -9.9, 9.9) +PL_TEST_ULP (V_NAME (expm1f), 1.02) +PL_TEST_EXPECT_FENV (V_NAME (expm1f), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (expm1f), 0, 0x1p-23, 1000) +PL_TEST_INTERVAL (V_NAME (expm1f), -0, -0x1p-23, 1000) +PL_TEST_INTERVAL (V_NAME (expm1f), 0x1p-23, 0x1.644716p6, 1000000) +PL_TEST_INTERVAL (V_NAME (expm1f), -0x1p-23, -0x1.9bbabcp+6, 1000000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h b/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h new file mode 100644 index 000000000000..c261941ebed6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_expm1f_inline.h @@ -0,0 +1,49 @@ +/* + * Helper for single-precision routines which calculate exp(x) - 1 and do not + * need special-case handling + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_V_EXPM1F_INLINE_H +#define PL_MATH_V_EXPM1F_INLINE_H + +#include "v_math.h" +#include "math_config.h" +#include "estrinf.h" + +#define One 0x3f800000 +#define Shift v_f32 (0x1.8p23f) +#define InvLn2 v_f32 (0x1.715476p+0f) +#define MLn2hi v_f32 (-0x1.62e4p-1f) +#define MLn2lo v_f32 (-0x1.7f7d1cp-20f) + +#define C(i) v_f32 (__expm1f_poly[i]) + +static inline v_f32_t +expm1f_inline (v_f32_t x) +{ + /* Helper routine for calculating exp(x) - 1. + Copied from v_expm1f_1u6.c, with all special-case handling removed - the + calling routine should handle special values if required. */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift; + v_s32_t i = v_to_s32_f32 (j); + v_f32_t f = v_fma_f32 (j, MLn2hi, x); + f = v_fma_f32 (j, MLn2lo, f); + + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). 
+ Uses Estrin scheme, where the main __v_expm1f routine uses Horner. */ + v_f32_t f2 = f * f; + v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C); + p = v_fma_f32 (f2, p, f); + + /* t = 2^i. */ + v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One); + /* expm1(x) ~= p * t + (t - 1). */ + return v_fma_f32 (p, t, t - 1); +} + +#endif // PL_MATH_V_EXPM1F_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c new file mode 100644 index 000000000000..86d398ca13a9 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log10_2u5.c @@ -0,0 +1,110 @@ +/* + * Double-precision vector log10(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "include/mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define A(i) v_f64 (__v_log10_data.poly[i]) +#define T(s, i) __v_log10_data.tab[i].s +#define Ln2 v_f64 (0x1.62e42fefa39efp-1) +#define N (1 << V_LOG10_TABLE_BITS) +#define OFF v_u64 (0x3fe6900900000000) + +struct entry +{ + v_f64_t invc; + v_f64_t log10c; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + e.invc = T (invc, i); + e.log10c = T (log10c, i); +#else + e.invc[0] = T (invc, i[0]); + e.log10c[0] = T (log10c, i[0]); + e.invc[1] = T (invc, i[1]); + e.log10c[1] = T (log10c, i[1]); +#endif + return e; +} + +VPCS_ATTR +inline static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (log10, x, y, cmp); +} + +/* Our implementation of v_log10 is a slight modification of v_log (1.660ulps). + Max ULP error: < 2.5 ulp (nearest rounding.) + Maximum measured at 2.46 ulp for x in [0.96, 0.97] + __v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6 + want 0x1.fff6be3cae4b9p-6 + -0.459999 ulp err 1.96. 
*/ +VPCS_ATTR +v_f64_t V_NAME (log10) (v_f64_t x) +{ + v_f64_t z, r, r2, p, y, kd, hi; + v_u64_t ix, iz, tmp, top, i, cmp; + v_s64_t k; + struct entry e; + + ix = v_as_u64_f64 (x); + top = ix >> 48; + cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - V_LOG10_TABLE_BITS)) % N; + k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */ + iz = ix - (tmp & v_u64 (0xfffULL << 52)); + z = v_as_f64_u64 (iz); + e = lookup (i); + + /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */ + r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); + kd = v_to_f64_s64 (k); + + /* hi = r / log(10) + log10(c) + k*log10(2). + Constants in `v_log10_data.c` are computed (in extended precision) as + e.log10c := e.logc * ivln10. */ + v_f64_t w = v_fma_f64 (r, v_f64 (__v_log10_data.invln10), e.log10c); + + /* y = log10(1+r) + n * log10(2). */ + hi = v_fma_f64 (kd, v_f64 (__v_log10_data.log10_2), w); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ + r2 = r * r; + y = v_fma_f64 (A (3), r, A (2)); + p = v_fma_f64 (A (1), r, A (0)); + y = v_fma_f64 (A (4), r2, y); + y = v_fma_f64 (y, r2, p); + y = v_fma_f64 (y, r2, hi); + + if (unlikely (v_any_u64 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, log10, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log10), 1.97) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10)) +PL_TEST_INTERVAL (V_NAME (log10), 0, 0xffff000000000000, 10000) +PL_TEST_INTERVAL (V_NAME (log10), 0x1p-4, 0x1p4, 400000) +PL_TEST_INTERVAL (V_NAME (log10), 0, inf, 400000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_log10_data.c b/contrib/arm-optimized-routines/pl/math/v_log10_data.c new file mode 100644 index 000000000000..fda85c886963 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log10_data.c @@ -0,0 +1,167 @@ +/* + * Lookup table for double-precision log10(x) vector function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << V_LOG10_TABLE_BITS) + +/* Algorithm: + + x = 2^k z + log10(x) = k log10(2) + log10(c) + poly(z/c - 1) / log(10) + +where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128) +and log(c) and 1/c for the ith subinterval comes from a lookup table: + + tab[i].invc = 1/c + tab[i].log10c = (double)log10(c) + +where c is near the center of the subinterval and is chosen by trying several +floating point invc candidates around 1/center and selecting one for which +the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval +that contains 1 and the previous one got tweaked to avoid cancellation. +NB: invc should be optimized to minimize error in (double)log10(c) instead. 
*/ +const struct v_log10_data __v_log10_data + = {.tab = {{0x1.6a133d0dec120p+0, -0x1.345825f221684p-3}, + {0x1.6815f2f3e42edp+0, -0x1.2f71a1f0c554ep-3}, + {0x1.661e39be1ac9ep+0, -0x1.2a91fdb30b1f4p-3}, + {0x1.642bfa30ac371p+0, -0x1.25b9260981a04p-3}, + {0x1.623f1d916f323p+0, -0x1.20e7081762193p-3}, + {0x1.60578da220f65p+0, -0x1.1c1b914aeefacp-3}, + {0x1.5e75349dea571p+0, -0x1.1756af5de404dp-3}, + {0x1.5c97fd387a75ap+0, -0x1.12985059c90bfp-3}, + {0x1.5abfd2981f200p+0, -0x1.0de0628f63df4p-3}, + {0x1.58eca051dc99cp+0, -0x1.092ed492e08eep-3}, + {0x1.571e526d9df12p+0, -0x1.0483954caf1dfp-3}, + {0x1.5554d555b3fcbp+0, -0x1.ffbd27a9adbcp-4}, + {0x1.539015e2a20cdp+0, -0x1.f67f7f2e3d1ap-4}, + {0x1.51d0014ee0164p+0, -0x1.ed4e1071ceebep-4}, + {0x1.50148538cd9eep+0, -0x1.e428bb47413c4p-4}, + {0x1.4e5d8f9f698a1p+0, -0x1.db0f6003028d6p-4}, + {0x1.4cab0edca66bep+0, -0x1.d201df6749831p-4}, + {0x1.4afcf1a9db874p+0, -0x1.c9001ac5c9672p-4}, + {0x1.495327136e16fp+0, -0x1.c009f3c78c79p-4}, + {0x1.47ad9e84af28fp+0, -0x1.b71f4cb642e53p-4}, + {0x1.460c47b39ae15p+0, -0x1.ae400818526b2p-4}, + {0x1.446f12b278001p+0, -0x1.a56c091954f87p-4}, + {0x1.42d5efdd720ecp+0, -0x1.9ca3332f096eep-4}, + {0x1.4140cfe001a0fp+0, -0x1.93e56a3f23e55p-4}, + {0x1.3fafa3b421f69p+0, -0x1.8b3292a3903bp-4}, + {0x1.3e225c9c8ece5p+0, -0x1.828a9112d9618p-4}, + {0x1.3c98ec29a211ap+0, -0x1.79ed4ac35f5acp-4}, + {0x1.3b13442a413fep+0, -0x1.715aa51ed28c4p-4}, + {0x1.399156baa3c54p+0, -0x1.68d2861c999e9p-4}, + {0x1.38131639b4cdbp+0, -0x1.6054d40ded21p-4}, + {0x1.36987540fbf53p+0, -0x1.57e17576bc9a2p-4}, + {0x1.352166b648f61p+0, -0x1.4f7851798bb0bp-4}, + {0x1.33adddb3eb575p+0, -0x1.47194f5690ae3p-4}, + {0x1.323dcd99fc1d3p+0, -0x1.3ec456d58ec47p-4}, + {0x1.30d129fefc7d2p+0, -0x1.36794ff3e5f55p-4}, + {0x1.2f67e6b72fe7dp+0, -0x1.2e382315725e4p-4}, + {0x1.2e01f7cf8b187p+0, -0x1.2600b8ed82e91p-4}, + {0x1.2c9f518ddc86ep+0, -0x1.1dd2fa85efc12p-4}, + {0x1.2b3fe86e5f413p+0, -0x1.15aed136e3961p-4}, + {0x1.29e3b1211b25cp+0, 
-0x1.0d94269d1a30dp-4}, + {0x1.288aa08b373cfp+0, -0x1.0582e4a7659f5p-4}, + {0x1.2734abcaa8467p+0, -0x1.faf5eb655742dp-5}, + {0x1.25e1c82459b81p+0, -0x1.eaf888487e8eep-5}, + {0x1.2491eb1ad59c5p+0, -0x1.db0d75ef25a82p-5}, + {0x1.23450a54048b5p+0, -0x1.cb348a49e6431p-5}, + {0x1.21fb1bb09e578p+0, -0x1.bb6d9c69acdd8p-5}, + {0x1.20b415346d8f7p+0, -0x1.abb88368aa7ap-5}, + {0x1.1f6fed179a1acp+0, -0x1.9c1517476af14p-5}, + {0x1.1e2e99b93c7b3p+0, -0x1.8c833051bfa4dp-5}, + {0x1.1cf011a7a882ap+0, -0x1.7d02a78e7fb31p-5}, + {0x1.1bb44b97dba5ap+0, -0x1.6d93565e97c5fp-5}, + {0x1.1a7b3e66cdd4fp+0, -0x1.5e351695db0c5p-5}, + {0x1.1944e11dc56cdp+0, -0x1.4ee7c2ba67adcp-5}, + {0x1.18112aebb1a6ep+0, -0x1.3fab35ba16c01p-5}, + {0x1.16e013231b7e9p+0, -0x1.307f4ad854bc9p-5}, + {0x1.15b1913f156cfp+0, -0x1.2163ddf4f988cp-5}, + {0x1.14859cdedde13p+0, -0x1.1258cb5d19e22p-5}, + {0x1.135c2dc68cfa4p+0, -0x1.035defdba3188p-5}, + {0x1.12353bdb01684p+0, -0x1.e8e651191bce4p-6}, + {0x1.1110bf25b85b4p+0, -0x1.cb30a62be444cp-6}, + {0x1.0feeafd2f8577p+0, -0x1.ad9a9b3043823p-6}, + {0x1.0ecf062c51c3bp+0, -0x1.9023ecda1ccdep-6}, + {0x1.0db1baa076c8bp+0, -0x1.72cc592bd82dp-6}, + {0x1.0c96c5bb3048ep+0, -0x1.55939eb1f9c6ep-6}, + {0x1.0b7e20263e070p+0, -0x1.38797ca6cc5ap-6}, + {0x1.0a67c2acd0ce3p+0, -0x1.1b7db35c2c072p-6}, + {0x1.0953a6391e982p+0, -0x1.fd400812ee9a2p-7}, + {0x1.0841c3caea380p+0, -0x1.c3c05fb4620f1p-7}, + {0x1.07321489b13eap+0, -0x1.8a7bf3c40e2e3p-7}, + {0x1.062491aee9904p+0, -0x1.517249c15a75cp-7}, + {0x1.05193497a7cc5p+0, -0x1.18a2ea5330c91p-7}, + {0x1.040ff6b5f5e9fp+0, -0x1.c01abc8cdc4e2p-8}, + {0x1.0308d19aa6127p+0, -0x1.4f6261750dec9p-8}, + {0x1.0203beedb0c67p+0, -0x1.be37b6612afa7p-9}, + {0x1.010037d38bcc2p+0, -0x1.bc3a8398ac26p-10}, + {1.0, 0.0}, + {0x1.fc06d493cca10p-1, 0x1.bb796219f30a5p-9}, + {0x1.f81e6ac3b918fp-1, 0x1.b984fdcba61cep-8}, + {0x1.f44546ef18996p-1, 0x1.49cf12adf8e8cp-7}, + {0x1.f07b10382c84bp-1, 0x1.b6075b5217083p-7}, + {0x1.ecbf7070e59d4p-1, 0x1.10b7466fc30ddp-6}, + 
{0x1.e91213f715939p-1, 0x1.4603e4db6a3a1p-6}, + {0x1.e572a9a75f7b7p-1, 0x1.7aeb10e99e105p-6}, + {0x1.e1e0e2c530207p-1, 0x1.af6e49b0f0e36p-6}, + {0x1.de5c72d8a8be3p-1, 0x1.e38f064f41179p-6}, + {0x1.dae50fa5658ccp-1, 0x1.0ba75abbb7623p-5}, + {0x1.d77a71145a2dap-1, 0x1.25575ee2dba86p-5}, + {0x1.d41c51166623ep-1, 0x1.3ed83f477f946p-5}, + {0x1.d0ca6ba0bb29fp-1, 0x1.582aa79af60efp-5}, + {0x1.cd847e8e59681p-1, 0x1.714f400fa83aep-5}, + {0x1.ca4a499693e00p-1, 0x1.8a46ad3901cb9p-5}, + {0x1.c71b8e399e821p-1, 0x1.a311903b6b87p-5}, + {0x1.c3f80faf19077p-1, 0x1.bbb086f216911p-5}, + {0x1.c0df92dc2b0ecp-1, 0x1.d4242bdda648ep-5}, + {0x1.bdd1de3cbb542p-1, 0x1.ec6d167c2af1p-5}, + {0x1.baceb9e1007a3p-1, 0x1.0245ed8221426p-4}, + {0x1.b7d5ef543e55ep-1, 0x1.0e40856c74f64p-4}, + {0x1.b4e749977d953p-1, 0x1.1a269a31120fep-4}, + {0x1.b20295155478ep-1, 0x1.25f8718fc076cp-4}, + {0x1.af279f8e82be2p-1, 0x1.31b64ffc95bfp-4}, + {0x1.ac5638197fdf3p-1, 0x1.3d60787ca5063p-4}, + {0x1.a98e2f102e087p-1, 0x1.48f72ccd187fdp-4}, + {0x1.a6cf5606d05c1p-1, 0x1.547aad6602f1cp-4}, + {0x1.a4197fc04d746p-1, 0x1.5feb3989d3acbp-4}, + {0x1.a16c80293dc01p-1, 0x1.6b490f3978c79p-4}, + {0x1.9ec82c4dc5bc9p-1, 0x1.76946b3f5e703p-4}, + {0x1.9c2c5a491f534p-1, 0x1.81cd895717c83p-4}, + {0x1.9998e1480b618p-1, 0x1.8cf4a4055c30ep-4}, + {0x1.970d9977c6c2dp-1, 0x1.9809f4c48c0ebp-4}, + {0x1.948a5c023d212p-1, 0x1.a30db3f9899efp-4}, + {0x1.920f0303d6809p-1, 0x1.ae001905458fcp-4}, + {0x1.8f9b698a98b45p-1, 0x1.b8e15a2e3a2cdp-4}, + {0x1.8d2f6b81726f6p-1, 0x1.c3b1ace2b0996p-4}, + {0x1.8acae5bb55badp-1, 0x1.ce71456edfa62p-4}, + {0x1.886db5d9275b8p-1, 0x1.d9205759882c4p-4}, + {0x1.8617ba567c13cp-1, 0x1.e3bf1513af0dfp-4}, + {0x1.83c8d27487800p-1, 0x1.ee4db0412c414p-4}, + {0x1.8180de3c5dbe7p-1, 0x1.f8cc5998de3a5p-4}, + {0x1.7f3fbe71cdb71p-1, 0x1.019da085eaeb1p-3}, + {0x1.7d055498071c1p-1, 0x1.06cd4acdb4e3dp-3}, + {0x1.7ad182e54f65ap-1, 0x1.0bf542bef813fp-3}, + {0x1.78a42c3c90125p-1, 0x1.11159f14da262p-3}, + {0x1.767d342f76944p-1, 
0x1.162e761c10d1cp-3}, + {0x1.745c7ef26b00ap-1, 0x1.1b3fddc60d43ep-3}, + {0x1.7241f15769d0fp-1, 0x1.2049ebac86aa6p-3}, + {0x1.702d70d396e41p-1, 0x1.254cb4fb7836ap-3}, + {0x1.6e1ee3700cd11p-1, 0x1.2a484e8d0d252p-3}, + {0x1.6c162fc9cbe02p-1, 0x1.2f3ccce1c860bp-3}}, + + /* Computed from log coeffs div by log(10) then rounded to double + precision. */ + .poly + = {-0x1.bcb7b1526e506p-3, 0x1.287a7636be1d1p-3, -0x1.bcb7b158af938p-4, + 0x1.63c78734e6d07p-4, -0x1.287461742fee4p-4}, + + .invln10 = 0x1.bcb7b1526e50ep-2, + .log10_2 = 0x1.34413509f79ffp-2 + +}; diff --git a/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c new file mode 100644 index 000000000000..e9f7f0346ca2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log10f_3u5.c @@ -0,0 +1,82 @@ +/* + * Single-precision vector log10 function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define P(i) v_f32 (__v_log10f_poly[i]) + +#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218. */ +#define InvLn10 v_f32 (0x1.bcb7b2p-2f) +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Mask v_u32 (0x007fffff) +#define Off v_u32 (0x3f2aaaab) /* 0.666667. */ + +VPCS_ATTR +NOINLINE static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (log10f, x, y, cmp); +} + +/* Our fast implementation of v_log10f uses a similar approach as v_logf. + With the same offset as v_logf (i.e., 2/3) it delivers about 3.3ulps with + order 9. This is more efficient than using a low order polynomial computed in + double precision. + Maximum error: 3.305ulps (nearest rounding.) + __v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4 + want 0x1.ffe2f4p-4 -0.304916 ulp err 2.80492. 
*/ +VPCS_ATTR +v_f32_t V_NAME (log10f) (v_f32_t x) +{ + v_f32_t n, o, p, q, r, r2, y; + v_u32_t u, cmp; + + u = v_as_u32_f32 (x); + cmp = v_cond_u32 (u - Min >= Max - Min); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u -= Off; + n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend. */ + u &= Mask; + u += Off; + r = v_as_f32_u32 (u) - v_f32 (1.0f); + + /* y = log10(1+r) + n*log10(2). */ + r2 = r * r; + /* (n*ln2 + r)*InvLn10 + r2*(P0 + r*P1 + r2*(P2 + r*P3 + r2*(P4 + r*P5 + + r2*(P6+r*P7))). */ + o = v_fma_f32 (P (7), r, P (6)); + p = v_fma_f32 (P (5), r, P (4)); + q = v_fma_f32 (P (3), r, P (2)); + y = v_fma_f32 (P (1), r, P (0)); + p = v_fma_f32 (o, r2, p); + q = v_fma_f32 (p, r2, q); + y = v_fma_f32 (q, r2, y); + /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster + but less accurate. */ + p = v_fma_f32 (Ln2, n, r); + y = v_fma_f32 (y, r2, p * InvLn10); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, log10, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log10f), 2.81) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10f)) +PL_TEST_INTERVAL (V_NAME (log10f), 0, 0xffff0000, 10000) +PL_TEST_INTERVAL (V_NAME (log10f), 0x1p-4, 0x1p4, 500000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_log10f_data.c b/contrib/arm-optimized-routines/pl/math/v_log10f_data.c new file mode 100644 index 000000000000..537482a92017 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log10f_data.c @@ -0,0 +1,13 @@ +/* + * Coefficients for single-precision vector log10 function. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +const float __v_log10f_poly[] = { + /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in + [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. 
*/ + -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f, + -0x1.246f8p-4f, 0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f}; diff --git a/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c new file mode 100644 index 000000000000..e48291081ab3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log1p_2u5.c @@ -0,0 +1,120 @@ +/* + * Double-precision vector log(1+x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1) +#define Ln2Lo v_f64 (0x1.ef35793c76730p-45) +#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */ +#define OneMHfRt2Top \ + 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \ + << 32. */ +#define OneTop12 0x3ff +#define BottomMask 0xffffffff +#define AbsMask 0x7fffffffffffffff +#define C(i) v_f64 (__log1p_data.coeffs[i]) + +static inline v_f64_t +eval_poly (v_f64_t f) +{ + v_f64_t f2 = f * f; + v_f64_t f4 = f2 * f2; + v_f64_t f8 = f4 * f4; + return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C); +} + +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (log1p, x, y, special); +} + +/* Vector log1p approximation using polynomial on reduced interval. Routine is a + modification of the algorithm used in scalar log1p, with no shortcut for k=0 + and no narrowing for f and k. Maximum observed error is 2.46 ULP: + __v_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2 + want 0x1.fd5565fb590f6p+2 . 
*/ +VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t ia = ix & AbsMask; + v_u64_t special + = v_cond_u64 ((ia >= v_u64 (0x7ff0000000000000)) + | (ix >= 0xbff0000000000000) | (ix == 0x8000000000000000)); + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (special))) + x = v_sel_f64 (special, v_f64 (0), x); +#endif + + /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f + is in [sqrt(2)/2, sqrt(2)]): + log1p(x) = k*log(2) + log1p(f). + + f may not be representable exactly, so we need a correction term: + let m = round(1 + x), c = (1 + x) - m. + c << m: at very small x, log1p(x) ~ x, hence: + log(1+x) - log(m) ~ c/m. + + We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ + + /* Obtain correctly scaled k by manipulation in the exponent. + The scalar algorithm casts down to 32-bit at this point to calculate k and + u_red. We stay in double-width to obtain f and k, using the same constants + as the scalar algorithm but shifted left by 32. */ + v_f64_t m = x + 1; + v_u64_t mi = v_as_u64_f64 (m); + v_u64_t u = mi + OneMHfRt2Top; + + v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop12; + v_f64_t k = v_to_f64_s64 (ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top; + v_u64_t u_red = utop | (mi & BottomMask); + v_f64_t f = v_as_f64_u64 (u_red) - 1; + + /* Correction term c/m. */ + v_f64_t cm = (x - (m - 1)) / m; + + /* Approximate log1p(x) on the reduced input using a polynomial. Because + log1p(0)=0 we choose an approximation of the form: + x + C0*x^2 + C1*x^3 + C2x^4 + ... + Hence approximation has the form f + f^2 * P(f) + where P(x) = C0 + C1*x + C2x^2 + ... + Assembling this all correctly is dealt with at the final step. 
*/ + v_f64_t p = eval_poly (f); + + v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm); + v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f); + v_f64_t y = v_fma_f64 (f * f, p, ylo + yhi); + + if (unlikely (v_any_u64 (special))) + return specialcase (v_as_f64_u64 (ix), y, special); + + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (V_NAME (log1p), 1.97) +PL_TEST_EXPECT_FENV (V_NAME (log1p), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (log1p), -10.0, 10.0, 10000) +PL_TEST_INTERVAL (V_NAME (log1p), 0.0, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), 0x1p-23, 0.001, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), 0.001, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), 0.0, -0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), -0x1p-23, -0.001, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), -0.001, -1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log1p), -1.0, inf, 5000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h b/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h new file mode 100644 index 000000000000..e5c733964bc0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log1p_inline.h @@ -0,0 +1,77 @@ +/* + * Helper for vector double-precision routines which calculate log(1 + x) and do + * not need special-case handling + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#ifndef PL_MATH_V_LOG1P_INLINE_H +#define PL_MATH_V_LOG1P_INLINE_H + +#include "v_math.h" +#include "pairwise_horner.h" + +#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1) +#define Ln2Lo v_f64 (0x1.ef35793c76730p-45) +#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */ +#define OneMHfRt2Top \ + 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \ + << 32. */ +#define OneTop 0x3ff +#define BottomMask 0xffffffff +#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). 
*/ + +#define C(i) v_f64 (__log1p_data.coeffs[i]) + +static inline v_f64_t +log1p_inline (v_f64_t x) +{ + /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several + modifications: + - No special-case handling - this should be dealt with by the caller. + - Pairwise Horner polynomial evaluation for improved accuracy. + - Optionally simulate the shortcut for k=0, used in the scalar routine, + using v_sel, for improved accuracy when the argument to log1p is close to + 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in + the source of the caller before including this file. + See v_log1pf_2u1.c for details of the algorithm. */ + v_f64_t m = x + 1; + v_u64_t mi = v_as_u64_f64 (m); + v_u64_t u = mi + OneMHfRt2Top; + + v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop; + v_f64_t k = v_to_f64_s64 (ki); + + /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ + v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top; + v_u64_t u_red = utop | (mi & BottomMask); + v_f64_t f = v_as_f64_u64 (u_red) - 1; + + /* Correction term c/m. */ + v_f64_t cm = (x - (m - 1)) / m; + +#ifndef WANT_V_LOG1P_K0_SHORTCUT +#error \ + "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" +#elif WANT_V_LOG1P_K0_SHORTCUT + /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is + that the approximation is solely the polynomial. */ + v_u64_t k0 = k == 0; + if (unlikely (v_any_u64 (k0))) + { + cm = v_sel_f64 (k0, v_f64 (0), cm); + f = v_sel_f64 (k0, x, f); + } +#endif + + /* Approximate log1p(f) on the reduced input using a polynomial. */ + v_f64_t f2 = f * f; + v_f64_t p = PAIRWISE_HORNER_18 (f, f2, C); + + /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. 
*/ + v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm); + v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f); + return v_fma_f64 (f2, p, ylo + yhi); +} + +#endif // PL_MATH_V_LOG1P_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c new file mode 100644 index 000000000000..4a7732b403ec --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log1pf_2u1.c @@ -0,0 +1,160 @@ +/* + * Single-precision vector log(1+x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define AbsMask 0x7fffffff +#define TinyBound 0x340 /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ +#define MinusOne 0xbf800000 +#define Ln2 (0x1.62e43p-1f) +#define Four 0x40800000 +#define ThreeQuarters v_u32 (0x3f400000) + +#define C(i) v_f32 (__log1pf_data.coeffs[i]) + +static inline v_f32_t +eval_poly (v_f32_t m) +{ +#ifdef V_LOG1PF_1U3 + + /* Approximate log(1+m) on [-0.25, 0.5] using Horner scheme. */ + v_f32_t p = v_fma_f32 (C (8), m, C (7)); + p = v_fma_f32 (p, m, C (6)); + p = v_fma_f32 (p, m, C (5)); + p = v_fma_f32 (p, m, C (4)); + p = v_fma_f32 (p, m, C (3)); + p = v_fma_f32 (p, m, C (2)); + p = v_fma_f32 (p, m, C (1)); + p = v_fma_f32 (p, m, C (0)); + return v_fma_f32 (m, m * p, m); + +#elif defined(V_LOG1PF_2U5) + + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. 
*/ + v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); + v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); + v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); + v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); + + v_f32_t m2 = m * m; + v_f32_t p_02 = v_fma_f32 (m2, p_12, m); + v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); + v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); + + v_f32_t m4 = m2 * m2; + v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); + + return v_fma_f32 (m4, m4 * p_79, p_06); + +#else +#error No precision specified for v_log1pf +#endif +} + +static inline float +handle_special (float x) +{ + uint32_t ix = asuint (x); + uint32_t ia = ix & AbsMask; + if (ix == 0xff800000 || ia > 0x7f800000 || ix > 0xbf800000) + { + /* x == -Inf => log1pf(x) = NaN. + x < -1.0 => log1pf(x) = NaN. + x == +/-NaN => log1pf(x) = NaN. */ +#if WANT_SIMD_EXCEPT + return __math_invalidf (asfloat (ia)); +#else + return NAN; +#endif + } + if (ix == 0xbf800000) + { + /* x == -1.0 => log1pf(x) = -Inf. */ +#if WANT_SIMD_EXCEPT + return __math_divzerof (ix); +#else + return -INFINITY; +#endif + } + /* |x| < TinyBound => log1p(x) = x. */ + return x; +} + +/* Vector log1pf approximation using polynomial on reduced interval. Accuracy is + the same as for the scalar algorithm, i.e. worst-case error when using Estrin + is roughly 2.02 ULP: + log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */ +VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t ia12 = (ix >> 20) & v_u32 (0x7f8); + v_u32_t special_cases + = v_cond_u32 (ia12 - v_u32 (TinyBound) >= (0x7f8 - TinyBound)) + | v_cond_u32 (ix >= MinusOne); + v_f32_t special_arg = x; + +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u32 (special_cases))) + /* Side-step special lanes so fenv exceptions are not triggered + inadvertently. 
*/ + x = v_sel_f32 (special_cases, v_f32 (1), x); +#endif + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m + is in [-0.25, 0.5]): + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). + + We approximate log1p(m) with a polynomial, then scale by + k*log(2). Instead of doing this directly, we use an intermediate + scale factor s = 4*k*log(2) to ensure the scale is representable + as a normalised fp32 number. */ + + v_f32_t m = x + v_f32 (1.0f); + + /* Choose k to scale x to the range [-1/4, 1/2]. */ + v_s32_t k = (v_as_s32_f32 (m) - ThreeQuarters) & v_u32 (0xff800000); + + /* Scale x by exponent manipulation. */ + v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - v_as_u32_s32 (k)); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number, and scale m down accordingly. */ + v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k); + m_scale = m_scale + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f)); + + /* Evaluate polynomial on the reduced interval. */ + v_f32_t p = eval_poly (m_scale); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. */ + v_f32_t scale_back = v_to_f32_s32 (k) * v_f32 (0x1p-23f); + + /* Apply the scaling back. 
*/ + v_f32_t y = v_fma_f32 (scale_back, v_f32 (Ln2), p); + + if (unlikely (v_any_u32 (special_cases))) + return v_call_f32 (handle_special, special_arg, y, special_cases); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, log1p, -0.9, 10.0) +PL_TEST_ULP (V_NAME (log1pf), 1.53) +PL_TEST_EXPECT_FENV (V_NAME (log1pf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (log1pf), -10.0, 10.0, 10000) +PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, 0x1p-23, 30000) +PL_TEST_INTERVAL (V_NAME (log1pf), 0x1p-23, 0.001, 50000) +PL_TEST_INTERVAL (V_NAME (log1pf), 0.001, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, -0x1p-23, 30000) +PL_TEST_INTERVAL (V_NAME (log1pf), -0x1p-23, -0.001, 30000) +PL_TEST_INTERVAL (V_NAME (log1pf), -0.001, -1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log1pf), -1.0, inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h b/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h new file mode 100644 index 000000000000..e3048e667c26 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log1pf_inline.h @@ -0,0 +1,55 @@ +/* + * Helper for single-precision routines which calculate log(1 + x) and do not + * need special-case handling + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef PL_MATH_V_LOG1PF_INLINE_H +#define PL_MATH_V_LOG1PF_INLINE_H + +#include "v_math.h" +#include "math_config.h" + +#define Four 0x40800000 +#define Ln2 v_f32 (0x1.62e43p-1f) + +#define C(i) v_f32 (__log1pf_data.coeffs[i]) + +static inline v_f32_t +eval_poly (v_f32_t m) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. 
*/ + v_f32_t p_12 = v_fma_f32 (m, C (1), C (0)); + v_f32_t p_34 = v_fma_f32 (m, C (3), C (2)); + v_f32_t p_56 = v_fma_f32 (m, C (5), C (4)); + v_f32_t p_78 = v_fma_f32 (m, C (7), C (6)); + + v_f32_t m2 = m * m; + v_f32_t p_02 = v_fma_f32 (m2, p_12, m); + v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34); + v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78); + + v_f32_t m4 = m2 * m2; + v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02); + + return v_fma_f32 (m4, m4 * p_79, p_06); +} + +static inline v_f32_t +log1pf_inline (v_f32_t x) +{ + /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no + special-case handling. See that file for details of the algorithm. */ + v_f32_t m = x + 1.0f; + v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000; + v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k); + v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k) + + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f)); + v_f32_t p = eval_poly (m_scale); + v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f; + return v_fma_f32 (scale_back, Ln2, p); +} + +#endif // PL_MATH_V_LOG1PF_INLINE_H diff --git a/contrib/arm-optimized-routines/pl/math/v_log2_3u.c b/contrib/arm-optimized-routines/pl/math/v_log2_3u.c new file mode 100644 index 000000000000..fac73f60c600 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log2_3u.c @@ -0,0 +1,100 @@ +/* + * Double-precision vector log2 function. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "include/mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define InvLn2 v_f64 (0x1.71547652b82fep0) +#define N (1 << V_LOG2_TABLE_BITS) +#define OFF v_u64 (0x3fe6900900000000) +#define P(i) v_f64 (__v_log2_data.poly[i]) + +struct entry +{ + v_f64_t invc; + v_f64_t log2c; +}; + +static inline struct entry +lookup (v_u64_t i) +{ + struct entry e; +#ifdef SCALAR + e.invc = __v_log2_data.tab[i].invc; + e.log2c = __v_log2_data.tab[i].log2c; +#else + e.invc[0] = __v_log2_data.tab[i[0]].invc; + e.log2c[0] = __v_log2_data.tab[i[0]].log2c; + e.invc[1] = __v_log2_data.tab[i[1]].invc; + e.log2c[1] = __v_log2_data.tab[i[1]].log2c; +#endif + return e; +} + +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp) +{ + return v_call_f64 (log2, x, y, cmp); +} + +/* Double-precision vector log2 routine. Implements the same algorithm as vector + log10, with coefficients and table entries scaled in extended precision. + The maximum observed error is 2.58 ULP: + __v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5 + want 0x1.fffb34198d9ddp-5. */ +VPCS_ATTR +v_f64_t V_NAME (log2) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t top = ix >> 48; + v_u64_t special + = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010)); + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + v_u64_t tmp = ix - OFF; + v_u64_t i = (tmp >> (52 - V_LOG2_TABLE_BITS)) % N; + v_s64_t k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */ + v_u64_t iz = ix - (tmp & v_u64 (0xfffULL << 52)); + v_f64_t z = v_as_f64_u64 (iz); + struct entry e = lookup (i); + + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
*/ + + v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0)); + v_f64_t kd = v_to_f64_s64 (k); + v_f64_t w = v_fma_f64 (r, InvLn2, e.log2c); + + v_f64_t r2 = r * r; + v_f64_t p_23 = v_fma_f64 (P (3), r, P (2)); + v_f64_t p_01 = v_fma_f64 (P (1), r, P (0)); + v_f64_t y = v_fma_f64 (P (4), r2, p_23); + y = v_fma_f64 (r2, y, p_01); + y = v_fma_f64 (r2, y, kd + w); + + if (unlikely (v_any_u64 (special))) + return specialcase (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, log2, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log2), 2.09) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2)) +PL_TEST_INTERVAL (V_NAME (log2), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME (log2), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME (log2), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log2), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log2), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME (log2), 100, inf, 50000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_log2_data.c b/contrib/arm-optimized-routines/pl/math/v_log2_data.c new file mode 100644 index 000000000000..2a1da6823fbc --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log2_data.c @@ -0,0 +1,155 @@ +/* + * Coefficients and table entries for vector log2 + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +#define N (1 << V_LOG2_TABLE_BITS) + +// clang-format off + +const struct v_log2_data __v_log2_data = { + +/* Derived from the coefficients in log_data.c for N == 128 && LOG_POLY_ORDER == 6. + Each coefficient was scaled by log2(e) in extended precision and rounded back to + double. */ +.poly = { -0x1.71547652b83p-1, 0x1.ec709dc340953p-2, -0x1.71547651c8f35p-2, + 0x1.2777ebe12dda5p-2, -0x1.ec738d616fe26p-3 }, + +/* Derived from the table in v_log10_data.c. invc is unchanged. log2(c) was + calculated by scaling log10(c) by log2(10) in extended precision and rounding + back. 
*/ +.tab = { +{ 0x1.6a133d0dec120p+0, -0x1.00130d57f5fadp-1 }, +{ 0x1.6815f2f3e42edp+0, -0x1.f802661bd725ep-2 }, +{ 0x1.661e39be1ac9ep+0, -0x1.efea1c6f73a5bp-2 }, +{ 0x1.642bfa30ac371p+0, -0x1.e7dd1dcd06f05p-2 }, +{ 0x1.623f1d916f323p+0, -0x1.dfdb4ae024809p-2 }, +{ 0x1.60578da220f65p+0, -0x1.d7e484d101958p-2 }, +{ 0x1.5e75349dea571p+0, -0x1.cff8ad452f6ep-2 }, +{ 0x1.5c97fd387a75ap+0, -0x1.c817a666c997fp-2 }, +{ 0x1.5abfd2981f200p+0, -0x1.c04152d640419p-2 }, +{ 0x1.58eca051dc99cp+0, -0x1.b87595a3f64b2p-2 }, +{ 0x1.571e526d9df12p+0, -0x1.b0b4526c44d07p-2 }, +{ 0x1.5554d555b3fcbp+0, -0x1.a8fd6d1a90f5ep-2 }, +{ 0x1.539015e2a20cdp+0, -0x1.a150ca2559fc6p-2 }, +{ 0x1.51d0014ee0164p+0, -0x1.99ae4e62cca29p-2 }, +{ 0x1.50148538cd9eep+0, -0x1.9215df1a1e842p-2 }, +{ 0x1.4e5d8f9f698a1p+0, -0x1.8a8761fe1f0d9p-2 }, +{ 0x1.4cab0edca66bep+0, -0x1.8302bd1cc9a54p-2 }, +{ 0x1.4afcf1a9db874p+0, -0x1.7b87d6fb437f6p-2 }, +{ 0x1.495327136e16fp+0, -0x1.741696673a86dp-2 }, +{ 0x1.47ad9e84af28fp+0, -0x1.6caee2b3c6fe4p-2 }, +{ 0x1.460c47b39ae15p+0, -0x1.6550a3666c27ap-2 }, +{ 0x1.446f12b278001p+0, -0x1.5dfbc08de02a4p-2 }, +{ 0x1.42d5efdd720ecp+0, -0x1.56b022766c84ap-2 }, +{ 0x1.4140cfe001a0fp+0, -0x1.4f6db1c955536p-2 }, +{ 0x1.3fafa3b421f69p+0, -0x1.4834579063054p-2 }, +{ 0x1.3e225c9c8ece5p+0, -0x1.4103fd2249a76p-2 }, +{ 0x1.3c98ec29a211ap+0, -0x1.39dc8c3fe6dabp-2 }, +{ 0x1.3b13442a413fep+0, -0x1.32bdeed4b5c8fp-2 }, +{ 0x1.399156baa3c54p+0, -0x1.2ba80f41e20ddp-2 }, +{ 0x1.38131639b4cdbp+0, -0x1.249ad8332f4a7p-2 }, +{ 0x1.36987540fbf53p+0, -0x1.1d96347e7f3ebp-2 }, +{ 0x1.352166b648f61p+0, -0x1.169a0f7d6604ap-2 }, +{ 0x1.33adddb3eb575p+0, -0x1.0fa654a221909p-2 }, +{ 0x1.323dcd99fc1d3p+0, -0x1.08baefcf8251ap-2 }, +{ 0x1.30d129fefc7d2p+0, -0x1.01d7cd14deecdp-2 }, +{ 0x1.2f67e6b72fe7dp+0, -0x1.f5f9b1ad55495p-3 }, +{ 0x1.2e01f7cf8b187p+0, -0x1.e853ff76a77afp-3 }, +{ 0x1.2c9f518ddc86ep+0, -0x1.dabe5d624cba1p-3 }, +{ 0x1.2b3fe86e5f413p+0, -0x1.cd38a5cef4822p-3 }, +{ 0x1.29e3b1211b25cp+0, 
-0x1.bfc2b38d315f9p-3 }, +{ 0x1.288aa08b373cfp+0, -0x1.b25c61f5edd0fp-3 }, +{ 0x1.2734abcaa8467p+0, -0x1.a5058d18e9cacp-3 }, +{ 0x1.25e1c82459b81p+0, -0x1.97be1113e47a3p-3 }, +{ 0x1.2491eb1ad59c5p+0, -0x1.8a85cafdf5e27p-3 }, +{ 0x1.23450a54048b5p+0, -0x1.7d5c97e8fc45bp-3 }, +{ 0x1.21fb1bb09e578p+0, -0x1.704255d6486e4p-3 }, +{ 0x1.20b415346d8f7p+0, -0x1.6336e2cedd7bfp-3 }, +{ 0x1.1f6fed179a1acp+0, -0x1.563a1d9b0cc6ap-3 }, +{ 0x1.1e2e99b93c7b3p+0, -0x1.494be541aaa6fp-3 }, +{ 0x1.1cf011a7a882ap+0, -0x1.3c6c1964dd0f2p-3 }, +{ 0x1.1bb44b97dba5ap+0, -0x1.2f9a99f19a243p-3 }, +{ 0x1.1a7b3e66cdd4fp+0, -0x1.22d747344446p-3 }, +{ 0x1.1944e11dc56cdp+0, -0x1.1622020d4f7f5p-3 }, +{ 0x1.18112aebb1a6ep+0, -0x1.097aabb3553f3p-3 }, +{ 0x1.16e013231b7e9p+0, -0x1.f9c24b48014c5p-4 }, +{ 0x1.15b1913f156cfp+0, -0x1.e0aaa3bdc858ap-4 }, +{ 0x1.14859cdedde13p+0, -0x1.c7ae257c952d6p-4 }, +{ 0x1.135c2dc68cfa4p+0, -0x1.aecc960a03e58p-4 }, +{ 0x1.12353bdb01684p+0, -0x1.9605bb724d541p-4 }, +{ 0x1.1110bf25b85b4p+0, -0x1.7d595ca7147cep-4 }, +{ 0x1.0feeafd2f8577p+0, -0x1.64c74165002d9p-4 }, +{ 0x1.0ecf062c51c3bp+0, -0x1.4c4f31c86d344p-4 }, +{ 0x1.0db1baa076c8bp+0, -0x1.33f0f70388258p-4 }, +{ 0x1.0c96c5bb3048ep+0, -0x1.1bac5abb3037dp-4 }, +{ 0x1.0b7e20263e070p+0, -0x1.0381272495f21p-4 }, +{ 0x1.0a67c2acd0ce3p+0, -0x1.d6de4eba2de2ap-5 }, +{ 0x1.0953a6391e982p+0, -0x1.a6ec4e8156898p-5 }, +{ 0x1.0841c3caea380p+0, -0x1.772be542e3e1bp-5 }, +{ 0x1.07321489b13eap+0, -0x1.479cadcde852dp-5 }, +{ 0x1.062491aee9904p+0, -0x1.183e4265faa5p-5 }, +{ 0x1.05193497a7cc5p+0, -0x1.d2207fdaa1b85p-6 }, +{ 0x1.040ff6b5f5e9fp+0, -0x1.742486cb4a6a2p-6 }, +{ 0x1.0308d19aa6127p+0, -0x1.1687d77cfc299p-6 }, +{ 0x1.0203beedb0c67p+0, -0x1.7293623a6b5dep-7 }, +{ 0x1.010037d38bcc2p+0, -0x1.70ec80ec8f25dp-8 }, +{ 1.0, 0.0 }, +{ 0x1.fc06d493cca10p-1, 0x1.704c1ca6b6bc9p-7 }, +{ 0x1.f81e6ac3b918fp-1, 0x1.6eac8ba664beap-6 }, +{ 0x1.f44546ef18996p-1, 0x1.11e67d040772dp-5 }, +{ 0x1.f07b10382c84bp-1, 0x1.6bc665e2105dep-5 }, +{ 
0x1.ecbf7070e59d4p-1, 0x1.c4f8a9772bf1dp-5 }, +{ 0x1.e91213f715939p-1, 0x1.0ebff10fbb951p-4 }, +{ 0x1.e572a9a75f7b7p-1, 0x1.3aaf4d7805d11p-4 }, +{ 0x1.e1e0e2c530207p-1, 0x1.664ba81a4d717p-4 }, +{ 0x1.de5c72d8a8be3p-1, 0x1.9196387da6de4p-4 }, +{ 0x1.dae50fa5658ccp-1, 0x1.bc902f2b7796p-4 }, +{ 0x1.d77a71145a2dap-1, 0x1.e73ab5f584f28p-4 }, +{ 0x1.d41c51166623ep-1, 0x1.08cb78510d232p-3 }, +{ 0x1.d0ca6ba0bb29fp-1, 0x1.1dd2fe2f0dcb5p-3 }, +{ 0x1.cd847e8e59681p-1, 0x1.32b4784400df4p-3 }, +{ 0x1.ca4a499693e00p-1, 0x1.47706f3d49942p-3 }, +{ 0x1.c71b8e399e821p-1, 0x1.5c0768ee4a4dcp-3 }, +{ 0x1.c3f80faf19077p-1, 0x1.7079e86fc7c6dp-3 }, +{ 0x1.c0df92dc2b0ecp-1, 0x1.84c86e1183467p-3 }, +{ 0x1.bdd1de3cbb542p-1, 0x1.98f377a34b499p-3 }, +{ 0x1.baceb9e1007a3p-1, 0x1.acfb803bc924bp-3 }, +{ 0x1.b7d5ef543e55ep-1, 0x1.c0e10098b025fp-3 }, +{ 0x1.b4e749977d953p-1, 0x1.d4a46efe103efp-3 }, +{ 0x1.b20295155478ep-1, 0x1.e8463f45b8d0bp-3 }, +{ 0x1.af279f8e82be2p-1, 0x1.fbc6e3228997fp-3 }, +{ 0x1.ac5638197fdf3p-1, 0x1.079364f2e5aa8p-2 }, +{ 0x1.a98e2f102e087p-1, 0x1.1133306010a63p-2 }, +{ 0x1.a6cf5606d05c1p-1, 0x1.1ac309631bd17p-2 }, +{ 0x1.a4197fc04d746p-1, 0x1.24432485370c1p-2 }, +{ 0x1.a16c80293dc01p-1, 0x1.2db3b5449132fp-2 }, +{ 0x1.9ec82c4dc5bc9p-1, 0x1.3714ee1d7a32p-2 }, +{ 0x1.9c2c5a491f534p-1, 0x1.406700ab52c94p-2 }, +{ 0x1.9998e1480b618p-1, 0x1.49aa1d87522b2p-2 }, +{ 0x1.970d9977c6c2dp-1, 0x1.52de746d7ecb2p-2 }, +{ 0x1.948a5c023d212p-1, 0x1.5c0434336b343p-2 }, +{ 0x1.920f0303d6809p-1, 0x1.651b8ad6c90d1p-2 }, +{ 0x1.8f9b698a98b45p-1, 0x1.6e24a56ab5831p-2 }, +{ 0x1.8d2f6b81726f6p-1, 0x1.771fb04ec29b1p-2 }, +{ 0x1.8acae5bb55badp-1, 0x1.800cd6f19c25ep-2 }, +{ 0x1.886db5d9275b8p-1, 0x1.88ec441df11dfp-2 }, +{ 0x1.8617ba567c13cp-1, 0x1.91be21b7c93f5p-2 }, +{ 0x1.83c8d27487800p-1, 0x1.9a8298f8c7454p-2 }, +{ 0x1.8180de3c5dbe7p-1, 0x1.a339d255c04ddp-2 }, +{ 0x1.7f3fbe71cdb71p-1, 0x1.abe3f59f43db7p-2 }, +{ 0x1.7d055498071c1p-1, 0x1.b48129deca9efp-2 }, +{ 0x1.7ad182e54f65ap-1, 
0x1.bd119575364c1p-2 }, +{ 0x1.78a42c3c90125p-1, 0x1.c5955e23ebcbcp-2 }, +{ 0x1.767d342f76944p-1, 0x1.ce0ca8f4e1557p-2 }, +{ 0x1.745c7ef26b00ap-1, 0x1.d6779a5a75774p-2 }, +{ 0x1.7241f15769d0fp-1, 0x1.ded6563550d27p-2 }, +{ 0x1.702d70d396e41p-1, 0x1.e728ffafd840ep-2 }, +{ 0x1.6e1ee3700cd11p-1, 0x1.ef6fb96c8d739p-2 }, +{ 0x1.6c162fc9cbe02p-1, 0x1.f7aaa57907219p-2 }} +}; +// clang-format on diff --git a/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c new file mode 100644 index 000000000000..8f9241bed8e6 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log2f_2u5.c @@ -0,0 +1,68 @@ +/* + * Single-precision vector log2 function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pairwise_hornerf.h" +#include "pl_sig.h" +#include "pl_test.h" +#if V_SUPPORTED + +#define C(i) v_f32 (__v_log2f_data.poly[i]) + +#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */ +#define Min v_u32 (0x00800000) +#define Max v_u32 (0x7f800000) +#define Mask v_u32 (0x007fffff) +#define Off v_u32 (0x3f2aaaab) /* 0.666667 */ + +VPCS_ATTR +NOINLINE static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + /* Fall back to scalar code. */ + return v_call_f32 (log2f, x, y, cmp); +} + +/* Fast implementation for single precision log2, + relies on same argument reduction as Neon logf. + Maximum error: 2.48 ULPs + __v_log2f(0x1.558174p+0) got 0x1.a9be84p-2 + want 0x1.a9be8p-2. */ +VPCS_ATTR +v_f32_t V_NAME (log2f) (v_f32_t x) +{ + v_u32_t u = v_as_u32_f32 (x); + v_u32_t cmp = v_cond_u32 (u - Min >= Max - Min); + + /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ + u -= Off; + v_f32_t n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend. */ + u &= Mask; + u += Off; + v_f32_t r = v_as_f32_u32 (u) - v_f32 (1.0f); + + /* y = log2(1+r) + n. 
*/ + v_f32_t r2 = r * r; + v_f32_t p = PAIRWISE_HORNER_8 (r, r2, C); + v_f32_t y = v_fma_f32 (p, r, n); + + if (unlikely (v_any_u32 (cmp))) + return specialcase (x, y, cmp); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, log2, 0.01, 11.1) +PL_TEST_ULP (V_NAME (log2f), 1.99) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2f)) +PL_TEST_INTERVAL (V_NAME (log2f), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-23, 1.0, 50000) +PL_TEST_INTERVAL (V_NAME (log2f), 1.0, 100, 50000) +PL_TEST_INTERVAL (V_NAME (log2f), 100, inf, 50000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_log2f_data.c b/contrib/arm-optimized-routines/pl/math/v_log2f_data.c new file mode 100644 index 000000000000..b144e8f4992d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_log2f_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients for vector log2f + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "math_config.h" + +/* See tools/v_log2f.sollya for the algorithm used to generate these + coefficients. */ +const struct v_log2f_data __v_log2f_data + = {.poly = {0x1.715476p0f, /* (float)(1 / ln(2)). */ + -0x1.715458p-1f, 0x1.ec701cp-2f, -0x1.7171a4p-2f, 0x1.27a0b8p-2f, + -0x1.e5143ep-3f, 0x1.9d8ecap-3f, -0x1.c675bp-3f, 0x1.9e495p-3f}}; diff --git a/contrib/arm-optimized-routines/pl/math/v_math.h b/contrib/arm-optimized-routines/pl/math/v_math.h new file mode 100644 index 000000000000..a8fa091a7cbf --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_math.h @@ -0,0 +1,855 @@ +/* + * Vector math abstractions. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#ifndef _V_MATH_H +#define _V_MATH_H + +#ifndef WANT_VMATH +/* Enable the build of vector math code. 
*/ +# define WANT_VMATH 1 +#endif +#if WANT_VMATH + +/* The goal of this header is to allow vector (only Neon for now) + and scalar build of the same algorithm. */ + +#if SCALAR +#define V_NAME(x) __s_##x +#elif VPCS && __aarch64__ +#define V_NAME(x) __vn_##x +#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) +#else +#define V_NAME(x) __v_##x +#endif + +#ifndef VPCS_ATTR +#define VPCS_ATTR +#endif +#ifndef VPCS_ALIAS +#define VPCS_ALIAS +#endif + +#include <stdint.h> +#include "math_config.h" + +typedef float f32_t; +typedef uint32_t u32_t; +typedef int32_t s32_t; +typedef double f64_t; +typedef uint64_t u64_t; +typedef int64_t s64_t; + +/* reinterpret as type1 from type2. */ +static inline u32_t +as_u32_f32 (f32_t x) +{ + union { f32_t f; u32_t u; } r = {x}; + return r.u; +} +static inline f32_t +as_f32_u32 (u32_t x) +{ + union { u32_t u; f32_t f; } r = {x}; + return r.f; +} +static inline s32_t +as_s32_u32 (u32_t x) +{ + union { u32_t u; s32_t i; } r = {x}; + return r.i; +} +static inline u32_t +as_u32_s32 (s32_t x) +{ + union { s32_t i; u32_t u; } r = {x}; + return r.u; +} +static inline u64_t +as_u64_f64 (f64_t x) +{ + union { f64_t f; u64_t u; } r = {x}; + return r.u; +} +static inline f64_t +as_f64_u64 (u64_t x) +{ + union { u64_t u; f64_t f; } r = {x}; + return r.f; +} +static inline s64_t +as_s64_u64 (u64_t x) +{ + union { u64_t u; s64_t i; } r = {x}; + return r.i; +} +static inline u64_t +as_u64_s64 (s64_t x) +{ + union { s64_t i; u64_t u; } r = {x}; + return r.u; +} + +#if SCALAR +#define V_SUPPORTED 1 +typedef f32_t v_f32_t; +typedef u32_t v_u32_t; +typedef s32_t v_s32_t; +typedef f64_t v_f64_t; +typedef u64_t v_u64_t; +typedef s64_t v_s64_t; + +static inline int +v_lanes32 (void) +{ + return 1; +} + +static inline v_f32_t +v_f32 (f32_t x) +{ + return x; +} +static inline v_u32_t +v_u32 (u32_t x) +{ + return x; +} +static inline v_s32_t +v_s32 (s32_t x) +{ + return x; +} + +static inline f32_t +v_get_f32 (v_f32_t x, int i) +{ + return x; +} +static 
inline u32_t +v_get_u32 (v_u32_t x, int i) +{ + return x; +} +static inline s32_t +v_get_s32 (v_s32_t x, int i) +{ + return x; +} + +static inline void +v_set_f32 (v_f32_t *x, int i, f32_t v) +{ + *x = v; +} +static inline void +v_set_u32 (v_u32_t *x, int i, u32_t v) +{ + *x = v; +} +static inline void +v_set_s32 (v_s32_t *x, int i, s32_t v) +{ + *x = v; +} + +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u32 (v_u32_t x) +{ + return x != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u32_t +v_cond_u32 (v_u32_t x) +{ + return x ? -1 : 0; +} +static inline v_f32_t +v_abs_f32 (v_f32_t x) +{ + return __builtin_fabsf (x); +} +static inline v_u32_t +v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y) +{ + return (y & ~m) | (x & m); +} +static inline v_u32_t +v_cagt_f32 (v_f32_t x, v_f32_t y) +{ + return fabsf (x) > fabsf (y); +} +/* to wrap |x| >= |y|. */ +static inline v_u32_t +v_cage_f32 (v_f32_t x, v_f32_t y) +{ + return fabsf (x) >= fabsf (y); +} +static inline v_u32_t +v_calt_f32 (v_f32_t x, v_f32_t y) +{ + return fabsf (x) < fabsf (y); +} +static inline v_f32_t +v_div_f32 (v_f32_t x, v_f32_t y) +{ + return x / y; +} +static inline v_f32_t +v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) +{ + return __builtin_fmaf (x, y, z); +} +static inline v_f32_t +v_round_f32 (v_f32_t x) +{ + return __builtin_roundf (x); +} +static inline v_s32_t +v_round_s32 (v_f32_t x) +{ + return __builtin_lroundf (x); /* relies on -fno-math-errno. */ +} +static inline v_f32_t +v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) +{ + return p ? x : y; +} +static inline v_u32_t +v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y) +{ + return p ? x : y; +} +static inline v_f32_t +v_sqrt_f32 (v_f32_t x) +{ + return __builtin_sqrtf (x); +} +/* convert to type1 from type2. 
*/ +static inline v_f32_t +v_to_f32_s32 (v_s32_t x) +{ + return x; +} +static inline v_s32_t +v_to_s32_f32 (v_f32_t x) +{ + return x; +} +static inline v_f32_t +v_to_f32_u32 (v_u32_t x) +{ + return x; +} +/* reinterpret as type1 from type2. */ +static inline v_u32_t +v_as_u32_f32 (v_f32_t x) +{ + union { v_f32_t f; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_s32_t +v_as_s32_f32 (v_f32_t x) +{ + union + { + v_f32_t f; + v_s32_t u; + } r = {x}; + return r.u; +} +static inline v_f32_t +v_as_f32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_f32_t f; } r = {x}; + return r.f; +} +static inline v_s32_t +v_as_s32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_s32_t i; } r = {x}; + return r.i; +} +static inline v_u32_t +v_as_u32_s32 (v_s32_t x) +{ + union { v_s32_t i; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_lookup_f32 (const f32_t *tab, v_u32_t idx) +{ + return tab[idx]; +} +static inline v_u32_t +v_lookup_u32 (const u32_t *tab, v_u32_t idx) +{ + return tab[idx]; +} +static inline v_f32_t +v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) +{ + return f (x); +} +static inline v_f32_t +v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, + v_u32_t p) +{ + return f (x1, x2); +} + +static inline int +v_lanes64 (void) +{ + return 1; +} +static inline v_f64_t +v_f64 (f64_t x) +{ + return x; +} +static inline v_u64_t +v_u64 (u64_t x) +{ + return x; +} +static inline v_s64_t +v_s64 (s64_t x) +{ + return x; +} +static inline f64_t +v_get_f64 (v_f64_t x, int i) +{ + return x; +} +static inline void +v_set_f64 (v_f64_t *x, int i, f64_t v) +{ + *x = v; +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (v_u64_t x) +{ + return x != 0; +} +/* true if all elements of a v_cond result is non-zero. */ +static inline int +v_all_u64 (v_u64_t x) +{ + return x; +} +/* to wrap the result of relational operators. */ +static inline v_u64_t +v_cond_u64 (v_u64_t x) +{ + return x ? 
-1 : 0; +} +static inline v_f64_t +v_abs_f64 (v_f64_t x) +{ + return __builtin_fabs (x); +} +static inline v_u64_t +v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y) +{ + return (y & ~m) | (x & m); +} +static inline v_u64_t +v_cagt_f64 (v_f64_t x, v_f64_t y) +{ + return fabs (x) > fabs (y); +} +static inline v_f64_t +v_div_f64 (v_f64_t x, v_f64_t y) +{ + return x / y; +} +static inline v_f64_t +v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) +{ + return __builtin_fma (x, y, z); +} +static inline v_f64_t +v_min_f64(v_f64_t x, v_f64_t y) { + return x < y ? x : y; +} +static inline v_f64_t +v_round_f64 (v_f64_t x) +{ + return __builtin_round (x); +} +static inline v_f64_t +v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) +{ + return p ? x : y; +} +static inline v_f64_t +v_sqrt_f64 (v_f64_t x) +{ + return __builtin_sqrt (x); +} +static inline v_s64_t +v_round_s64 (v_f64_t x) +{ + return __builtin_lround (x); /* relies on -fno-math-errno. */ +} +static inline v_u64_t +v_trunc_u64 (v_f64_t x) +{ + return __builtin_trunc (x); +} +/* convert to type1 from type2. */ +static inline v_f64_t +v_to_f64_s64 (v_s64_t x) +{ + return x; +} +static inline v_f64_t +v_to_f64_u64 (v_u64_t x) +{ + return x; +} + +static inline v_s64_t +v_to_s64_f64 (v_f64_t x) +{ + return x; +} +/* reinterpret as type1 from type2. 
*/ +static inline v_u64_t +v_as_u64_f64 (v_f64_t x) +{ + union { v_f64_t f; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_as_f64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_f64_t f; } r = {x}; + return r.f; +} +static inline v_s64_t +v_as_s64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_s64_t i; } r = {x}; + return r.i; +} +static inline v_u64_t +v_as_u64_s64 (v_s64_t x) +{ + union { v_s64_t i; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_lookup_f64 (const f64_t *tab, v_u64_t idx) +{ + return tab[idx]; +} +static inline v_u64_t +v_lookup_u64 (const u64_t *tab, v_u64_t idx) +{ + return tab[idx]; +} +static inline v_f64_t +v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) +{ + return f (x); +} +static inline v_f64_t +v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y, + v_u64_t p) +{ + return f (x1, x2); +} + +#elif __aarch64__ +#define V_SUPPORTED 1 +#include <arm_neon.h> +typedef float32x4_t v_f32_t; +typedef uint32x4_t v_u32_t; +typedef int32x4_t v_s32_t; +typedef float64x2_t v_f64_t; +typedef uint64x2_t v_u64_t; +typedef int64x2_t v_s64_t; + +static inline int +v_lanes32 (void) +{ + return 4; +} + +static inline v_f32_t +v_f32 (f32_t x) +{ + return (v_f32_t){x, x, x, x}; +} +static inline v_u32_t +v_u32 (u32_t x) +{ + return (v_u32_t){x, x, x, x}; +} +static inline v_s32_t +v_s32 (s32_t x) +{ + return (v_s32_t){x, x, x, x}; +} + +static inline f32_t +v_get_f32 (v_f32_t x, int i) +{ + return x[i]; +} +static inline u32_t +v_get_u32 (v_u32_t x, int i) +{ + return x[i]; +} +static inline s32_t +v_get_s32 (v_s32_t x, int i) +{ + return x[i]; +} + +static inline void +v_set_f32 (v_f32_t *x, int i, f32_t v) +{ + (*x)[i] = v; +} +static inline void +v_set_u32 (v_u32_t *x, int i, u32_t v) +{ + (*x)[i] = v; +} +static inline void +v_set_s32 (v_s32_t *x, int i, s32_t v) +{ + (*x)[i] = v; +} + +/* true if any elements of a v_cond result is non-zero. 
*/ +static inline int +v_any_u32 (v_u32_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; +} +/* to wrap the result of relational operators. */ +static inline v_u32_t +v_cond_u32 (v_u32_t x) +{ + return x; +} +static inline v_f32_t +v_abs_f32 (v_f32_t x) +{ + return vabsq_f32 (x); +} +static inline v_u32_t +v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y) +{ + return vbslq_u32 (m, x, y); +} +static inline v_u32_t +v_cagt_f32 (v_f32_t x, v_f32_t y) +{ + return vcagtq_f32 (x, y); +} +/* to wrap |x| >= |y|. */ +static inline v_u32_t +v_cage_f32 (v_f32_t x, v_f32_t y) +{ + return vcageq_f32 (x, y); +} +static inline v_u32_t +v_calt_f32 (v_f32_t x, v_f32_t y) +{ + return vcaltq_f32 (x, y); +} +static inline v_f32_t +v_div_f32 (v_f32_t x, v_f32_t y) +{ + return vdivq_f32 (x, y); +} +static inline v_f32_t +v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z) +{ + return vfmaq_f32 (z, x, y); +} +static inline v_f32_t +v_round_f32 (v_f32_t x) +{ + return vrndaq_f32 (x); +} +static inline v_s32_t +v_round_s32 (v_f32_t x) +{ + return vcvtaq_s32_f32 (x); +} +static inline v_f32_t +v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y) +{ + return vbslq_f32 (p, x, y); +} +static inline v_u32_t +v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y) +{ + return vbslq_u32 (p, x, y); +} +static inline v_f32_t +v_sqrt_f32 (v_f32_t x) +{ + return vsqrtq_f32 (x); +} +/* convert to type1 from type2. */ +static inline v_f32_t +v_to_f32_s32 (v_s32_t x) +{ + return (v_f32_t){x[0], x[1], x[2], x[3]}; +} +static inline v_s32_t +v_to_s32_f32 (v_f32_t x) +{ + return vcvtq_s32_f32 (x); +} +static inline v_f32_t +v_to_f32_u32 (v_u32_t x) +{ + return (v_f32_t){x[0], x[1], x[2], x[3]}; +} +/* reinterpret as type1 from type2. 
*/ +static inline v_u32_t +v_as_u32_f32 (v_f32_t x) +{ + union { v_f32_t f; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_s32_t +v_as_s32_f32 (v_f32_t x) +{ + union + { + v_f32_t f; + v_s32_t u; + } r = {x}; + return r.u; +} +static inline v_f32_t +v_as_f32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_f32_t f; } r = {x}; + return r.f; +} +static inline v_s32_t +v_as_s32_u32 (v_u32_t x) +{ + union { v_u32_t u; v_s32_t i; } r = {x}; + return r.i; +} +static inline v_u32_t +v_as_u32_s32 (v_s32_t x) +{ + union { v_s32_t i; v_u32_t u; } r = {x}; + return r.u; +} +static inline v_f32_t +v_lookup_f32 (const f32_t *tab, v_u32_t idx) +{ + return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline v_u32_t +v_lookup_u32 (const u32_t *tab, v_u32_t idx) +{ + return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]}; +} +static inline v_f32_t +v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p) +{ + return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]}; +} +static inline v_f32_t +v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y, + v_u32_t p) +{ + return ( + v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1], + p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]}; +} + +static inline int +v_lanes64 (void) +{ + return 2; +} +static inline v_f64_t +v_f64 (f64_t x) +{ + return (v_f64_t){x, x}; +} +static inline v_u64_t +v_u64 (u64_t x) +{ + return (v_u64_t){x, x}; +} +static inline v_s64_t +v_s64 (s64_t x) +{ + return (v_s64_t){x, x}; +} +static inline f64_t +v_get_f64 (v_f64_t x, int i) +{ + return x[i]; +} +static inline void +v_set_f64 (v_f64_t *x, int i, f64_t v) +{ + (*x)[i] = v; +} +/* true if any elements of a v_cond result is non-zero. */ +static inline int +v_any_u64 (v_u64_t x) +{ + /* assume elements in x are either 0 or -1u. 
*/ + return vpaddd_u64 (x) != 0; +} +/* true if all elements of a v_cond result is 1. */ +static inline int +v_all_u64 (v_u64_t x) +{ + /* assume elements in x are either 0 or -1u. */ + return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2; +} +/* to wrap the result of relational operators. */ +static inline v_u64_t +v_cond_u64 (v_u64_t x) +{ + return x; +} +static inline v_f64_t +v_abs_f64 (v_f64_t x) +{ + return vabsq_f64 (x); +} +static inline v_u64_t +v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y) +{ + return vbslq_u64 (m, x, y); +} +static inline v_u64_t +v_cagt_f64 (v_f64_t x, v_f64_t y) +{ + return vcagtq_f64 (x, y); +} +static inline v_f64_t +v_div_f64 (v_f64_t x, v_f64_t y) +{ + return vdivq_f64 (x, y); +} +static inline v_f64_t +v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z) +{ + return vfmaq_f64 (z, x, y); +} +static inline v_f64_t +v_min_f64(v_f64_t x, v_f64_t y) { + return vminq_f64(x, y); +} +static inline v_f64_t +v_round_f64 (v_f64_t x) +{ + return vrndaq_f64 (x); +} +static inline v_f64_t +v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y) +{ + return vbslq_f64 (p, x, y); +} +static inline v_f64_t +v_sqrt_f64 (v_f64_t x) +{ + return vsqrtq_f64 (x); +} +static inline v_s64_t +v_round_s64 (v_f64_t x) +{ + return vcvtaq_s64_f64 (x); +} +static inline v_u64_t +v_trunc_u64 (v_f64_t x) +{ + return vcvtq_u64_f64 (x); +} +/* convert to type1 from type2. */ +static inline v_f64_t +v_to_f64_s64 (v_s64_t x) +{ + return (v_f64_t){x[0], x[1]}; +} +static inline v_f64_t +v_to_f64_u64 (v_u64_t x) +{ + return (v_f64_t){x[0], x[1]}; +} +static inline v_s64_t +v_to_s64_f64 (v_f64_t x) +{ + return vcvtq_s64_f64 (x); +} +/* reinterpret as type1 from type2. 
*/ +static inline v_u64_t +v_as_u64_f64 (v_f64_t x) +{ + union { v_f64_t f; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_as_f64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_f64_t f; } r = {x}; + return r.f; +} +static inline v_s64_t +v_as_s64_u64 (v_u64_t x) +{ + union { v_u64_t u; v_s64_t i; } r = {x}; + return r.i; +} +static inline v_u64_t +v_as_u64_s64 (v_s64_t x) +{ + union { v_s64_t i; v_u64_t u; } r = {x}; + return r.u; +} +static inline v_f64_t +v_lookup_f64 (const f64_t *tab, v_u64_t idx) +{ + return (v_f64_t){tab[idx[0]], tab[idx[1]]}; +} +static inline v_u64_t +v_lookup_u64 (const u64_t *tab, v_u64_t idx) +{ + return (v_u64_t){tab[idx[0]], tab[idx[1]]}; +} +static inline v_f64_t +v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p) +{ + return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]}; +} +static inline v_f64_t +v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y, + v_u64_t p) +{ + return (v_f64_t){p[0] ? f (x1[0], x2[0]) : y[0], + p[1] ? f (x1[1], x2[1]) : y[1]}; +} +#endif + +#endif +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c new file mode 100644 index 000000000000..57ec66ecc282 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_sinh_3u.c @@ -0,0 +1,94 @@ +/* + * Double-precision vector sinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#define AbsMask 0x7fffffffffffffff +#define Half 0x3fe0000000000000 +#define BigBound \ + 0x4080000000000000 /* 2^9. expm1 helper overflows for large input. */ +#define TinyBound \ + 0x3e50000000000000 /* 2^-26, below which sinh(x) rounds to x. 
*/ +#define InvLn2 v_f64 (0x1.71547652b82fep0) +#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) +#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) +#define Shift v_f64 (0x1.8p52) +#define One 0x3ff0000000000000 +#define C(i) v_f64 (__expm1_poly[i]) + +#if V_SUPPORTED + +static inline v_f64_t +expm1_inline (v_f64_t x) +{ + /* Reduce argument: + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where i = round(x / ln2) + and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ + v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift; + v_s64_t i = v_to_s64_f64 (j); + v_f64_t f = v_fma_f64 (j, MLn2hi, x); + f = v_fma_f64 (j, MLn2lo, f); + /* Approximate expm1(f) using polynomial. */ + v_f64_t f2 = f * f, f4 = f2 * f2, f8 = f4 * f4; + v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f8, C), f); + /* t = 2^i. */ + v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One); + /* expm1(x) ~= p * t + (t - 1). */ + return v_fma_f64 (p, t, t - 1); +} + +static NOINLINE VPCS_ATTR v_f64_t +special_case (v_f64_t x) +{ + return v_call_f64 (sinh, x, x, v_u64 (-1)); +} + +/* Approximation for vector double-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The greatest observed error is 2.57 ULP: + sinh(0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2 + want 0x1.ab34e59d678d9p-2. */ +VPCS_ATTR v_f64_t V_NAME (sinh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t iax = ix & AbsMask; + v_f64_t ax = v_as_f64_u64 (iax); + v_u64_t sign = ix & ~AbsMask; + v_f64_t halfsign = v_as_f64_u64 (sign | Half); + +#if WANT_SIMD_EXCEPT + v_u64_t special = v_cond_u64 ((iax - TinyBound) >= (BigBound - TinyBound)); +#else + v_u64_t special = v_cond_u64 (iax >= BigBound); +#endif + + /* Fall back to scalar variant for all lanes if any of them are special. */ + if (unlikely (v_any_u64 (special))) + return special_case (x); + + /* Up to the point that expm1 overflows, we can use it to calculate sinh + using a slight rearrangement of the definition of sinh. 
This allows us to + retain acceptable accuracy for very small inputs. */ + v_f64_t t = expm1_inline (ax); + return (t + t / (t + 1)) * halfsign; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, sinh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (sinh), 2.08) +PL_TEST_EXPECT_FENV (V_NAME (sinh), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (sinh), 0, TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (sinh), -0, -TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (sinh), TinyBound, BigBound, 500000) +PL_TEST_INTERVAL (V_NAME (sinh), -TinyBound, -BigBound, 500000) +PL_TEST_INTERVAL (V_NAME (sinh), BigBound, inf, 1000) +PL_TEST_INTERVAL (V_NAME (sinh), -BigBound, -inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c new file mode 100644 index 000000000000..49cf078d0651 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_sinhf_2u3.c @@ -0,0 +1,69 @@ +/* + * Single-precision vector sinh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "v_expm1f_inline.h" + +#define AbsMask 0x7fffffff +#define Half 0x3f000000 +#define BigBound \ + 0x42b0c0a7 /* 0x1.61814ep+6, above which expm1f helper overflows. */ +#define TinyBound \ + 0x2fb504f4 /* 0x1.6a09e8p-32, below which expm1f underflows. */ + +static NOINLINE VPCS_ATTR v_f32_t +special_case (v_f32_t x) +{ + return v_call_f32 (sinhf, x, x, v_u32 (-1)); +} + +/* Approximation for vector single-precision sinh(x) using expm1. + sinh(x) = (exp(x) - exp(-x)) / 2. + The maximum error is 2.26 ULP: + __v_sinhf(0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4. 
*/ +VPCS_ATTR v_f32_t V_NAME (sinhf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + v_f32_t ax = v_as_f32_u32 (iax); + v_u32_t sign = ix & ~AbsMask; + v_f32_t halfsign = v_as_f32_u32 (sign | Half); + +#if WANT_SIMD_EXCEPT + v_u32_t special = v_cond_u32 ((iax - TinyBound) >= (BigBound - TinyBound)); +#else + v_u32_t special = v_cond_u32 (iax >= BigBound); +#endif + + /* Fall back to the scalar variant for all lanes if any of them should trigger + an exception. */ + if (unlikely (v_any_u32 (special))) + return special_case (x); + + /* Up to the point that expm1f overflows, we can use it to calculate sinhf + using a slight rearrangement of the definition of asinh. This allows us to + retain acceptable accuracy for very small inputs. */ + v_f32_t t = expm1f_inline (ax); + return (t + t / (t + 1)) * halfsign; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, sinh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (sinhf), 1.76) +PL_TEST_EXPECT_FENV (V_NAME (sinhf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (sinhf), 0, TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (sinhf), -0, -TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (sinhf), TinyBound, BigBound, 100000) +PL_TEST_INTERVAL (V_NAME (sinhf), -TinyBound, -BigBound, 100000) +PL_TEST_INTERVAL (V_NAME (sinhf), BigBound, inf, 1000) +PL_TEST_INTERVAL (V_NAME (sinhf), -BigBound, -inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c new file mode 100644 index 000000000000..f87baccc4fd7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_tan_3u5.c @@ -0,0 +1,102 @@ +/* + * Double-precision vector tan(x) function. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrin.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define MHalfPiHi v_f64 (__v_tan_data.neg_half_pi_hi) +#define MHalfPiLo v_f64 (__v_tan_data.neg_half_pi_lo) +#define TwoOverPi v_f64 (0x1.45f306dc9c883p-1) +#define Shift v_f64 (0x1.8p52) +#define AbsMask 0x7fffffffffffffff +#define RangeVal 0x4160000000000000 /* asuint64(2^23). */ +#define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */ +#define C(i) v_f64 (__v_tan_data.poly[i]) + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f64_t +specialcase (v_f64_t x) +{ + return v_call_f64 (tan, x, x, v_u64 (-1)); +} + +/* Vector approximation for double-precision tan. + Maximum measured error is 3.48 ULP: + __v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37 + want -0x1.f6ccd8ecf7deap+37. */ +VPCS_ATTR +v_f64_t V_NAME (tan) (v_f64_t x) +{ + v_u64_t iax = v_as_u64_f64 (x) & AbsMask; + + /* Our argument reduction cannot calculate q with sufficient accuracy for very + large inputs. Fall back to scalar routine for all lanes if any are too + large, or Inf/NaN. If fenv exceptions are expected, also fall back for tiny + input to avoid underflow. Note pl does not supply a scalar double-precision + tan, so the fallback will be statically linked from the system libm. */ +#if WANT_SIMD_EXCEPT + if (unlikely (v_any_u64 (iax - TinyBound > RangeVal - TinyBound))) +#else + if (unlikely (v_any_u64 (iax > RangeVal))) +#endif + return specialcase (x); + + /* q = nearest integer to 2 * x / pi. */ + v_f64_t q = v_fma_f64 (x, TwoOverPi, Shift) - Shift; + v_s64_t qi = v_to_s64_f64 (q); + + /* Use q to reduce x to r in [-pi/4, pi/4], by: + r = x - q * pi/2, in extended precision. */ + v_f64_t r = x; + r = v_fma_f64 (q, MHalfPiHi, r); + r = v_fma_f64 (q, MHalfPiLo, r); + /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle + formula. 
*/ + r = r * 0.5; + + /* Approximate tan(r) using order 8 polynomial. + tan(x) is odd, so polynomial has the form: + tan(x) ~= x + C0 * x^3 + C1 * x^5 + C3 * x^7 + ... + Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ... + Then compute the approximation by: + tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */ + v_f64_t r2 = r * r, r4 = r2 * r2, r8 = r4 * r4; + /* Use offset version of Estrin wrapper to evaluate from C1 onwards. */ + v_f64_t p = ESTRIN_7_ (r2, r4, r8, C, 1); + p = v_fma_f64 (p, r2, C (0)); + p = v_fma_f64 (r2, p * r, r); + + /* Recombination uses double-angle formula: + tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) + and reciprocity around pi/2: + tan(x) = 1 / (tan(pi/2 - x)) + to assemble result using change-of-sign and conditional selection of + numerator/denominator, dependent on odd/even-ness of q (hence quadrant). */ + v_f64_t n = v_fma_f64 (p, p, v_f64 (-1)); + v_f64_t d = p * 2; + + v_u64_t use_recip = v_cond_u64 ((v_as_u64_s64 (qi) & 1) == 0); + + return v_sel_f64 (use_recip, -d, n) / v_sel_f64 (use_recip, n, d); +} +VPCS_ALIAS + +PL_SIG (V, D, 1, tan, -3.1, 3.1) +PL_TEST_ULP (V_NAME (tan), 2.99) +PL_TEST_EXPECT_FENV (V_NAME (tan), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (tan), 0, TinyBound, 5000) +PL_TEST_INTERVAL (V_NAME (tan), TinyBound, RangeVal, 100000) +PL_TEST_INTERVAL (V_NAME (tan), RangeVal, inf, 5000) +PL_TEST_INTERVAL (V_NAME (tan), -0, -TinyBound, 5000) +PL_TEST_INTERVAL (V_NAME (tan), -TinyBound, -RangeVal, 100000) +PL_TEST_INTERVAL (V_NAME (tan), -RangeVal, -inf, 5000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_tan_data.c b/contrib/arm-optimized-routines/pl/math/v_tan_data.c new file mode 100644 index 000000000000..04e25169bd88 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_tan_data.c @@ -0,0 +1,15 @@ +/* + * Coefficients and helpers for double-precision vector tan(x) function. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "math_config.h" + +const struct v_tan_data __v_tan_data + = {.neg_half_pi_hi = -0x1.921fb54442d18p0, + .neg_half_pi_lo = -0x1.1a62633145c07p-54, + .poly + = {0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5, + 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9, + 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11, 0x1.4e4fd14147622p-12}}; diff --git a/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c new file mode 100644 index 000000000000..828466b03182 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_tanf_3u5.c @@ -0,0 +1,131 @@ +/* + * Single-precision vector tan(x) function. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrinf.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +/* Constants. */ +#define NegPio2_1 (v_f32 (-0x1.921fb6p+0f)) +#define NegPio2_2 (v_f32 (0x1.777a5cp-25f)) +#define NegPio2_3 (v_f32 (0x1.ee59dap-50f)) +#define InvPio2 (v_f32 (0x1.45f306p-1f)) +#define RangeVal (0x47000000) /* asuint32(0x1p15f). */ +#define TinyBound (0x30000000) /* asuint32 (0x1p-31). */ +#define Shift (v_f32 (0x1.8p+23f)) +#define AbsMask (v_u32 (0x7fffffff)) + +#define poly(i) v_f32 (__tanf_poly_data.poly_tan[i]) + +/* Special cases (fall back to scalar calls). */ +VPCS_ATTR +NOINLINE static v_f32_t +specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp) +{ + return v_call_f32 (tanf, x, y, cmp); +} + +/* Use a full Estrin scheme to evaluate polynomial. */ +static inline v_f32_t +eval_poly (v_f32_t z) +{ + v_f32_t z2 = z * z; +#if WANT_SIMD_EXCEPT + /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions + are to be triggered correctly, sidestep this by fixing such lanes to 0. 
*/ + v_u32_t will_uflow = v_cond_u32 ((v_as_u32_f32 (z) & AbsMask) <= TinyBound); + if (unlikely (v_any_u32 (will_uflow))) + z2 = v_sel_f32 (will_uflow, v_f32 (0), z2); +#endif + v_f32_t z4 = z2 * z2; + return ESTRIN_5 (z, z2, z4, poly); +} + +/* Fast implementation of Neon tanf. + Maximum error is 3.45 ULP: + __v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1 + want 0x1.ff9850p-1. */ +VPCS_ATTR +v_f32_t V_NAME (tanf) (v_f32_t x) +{ + v_f32_t special_arg = x; + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + + /* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast + regression. */ +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, also special-case tiny + input, as this will load to overflow later. Fix any special lanes to 1 to + prevent any exceptions being triggered. */ + v_u32_t special = v_cond_u32 (iax - TinyBound >= RangeVal - TinyBound); + if (unlikely (v_any_u32 (special))) + x = v_sel_f32 (special, v_f32 (1.0f), x); +#else + /* Otherwise, special-case large and special values. */ + v_u32_t special = v_cond_u32 (iax >= RangeVal); +#endif + + /* n = rint(x/(pi/2)). */ + v_f32_t q = v_fma_f32 (InvPio2, x, Shift); + v_f32_t n = q - Shift; + /* n is representable as a signed integer, simply convert it. */ + v_s32_t in = v_round_s32 (n); + /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ + v_s32_t alt = in & 1; + v_u32_t pred_alt = (alt != 0); + + /* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */ + v_f32_t r; + r = v_fma_f32 (NegPio2_1, n, x); + r = v_fma_f32 (NegPio2_2, n, r); + r = v_fma_f32 (NegPio2_3, n, r); + + /* If x lives in an interval, where |tan(x)| + - is finite, then use a polynomial approximation of the form + tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). + - grows to infinity then use symmetries of tangent and the identity + tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use + the same polynomial approximation of tan as above. 
*/ + + /* Perform additional reduction if required. */ + v_f32_t z = v_sel_f32 (pred_alt, -r, r); + + /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */ + v_f32_t z2 = r * r; + v_f32_t p = eval_poly (z2); + v_f32_t y = v_fma_f32 (z * z2, p, z); + + /* Compute reciprocal and apply if required. */ + v_f32_t inv_y = v_div_f32 (v_f32 (1.0f), y); + y = v_sel_f32 (pred_alt, inv_y, y); + + /* Fast reduction does not handle the x = -0.0 case well, + therefore it is fixed here. */ + y = v_sel_f32 (x == v_f32 (-0.0), x, y); + + if (unlikely (v_any_u32 (special))) + return specialcase (special_arg, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, tan, -3.1, 3.1) +PL_TEST_ULP (V_NAME (tanf), 2.96) +PL_TEST_EXPECT_FENV (V_NAME (tanf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (tanf), -0.0, -0x1p126, 100) +PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-149, 0x1p-126, 4000) +PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-126, 0x1p-23, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-23, 0.7, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 0.7, 1.5, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 1.5, 100, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 100, 0x1p17, 50000) +PL_TEST_INTERVAL (V_NAME (tanf), 0x1p17, inf, 50000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c new file mode 100644 index 000000000000..c8b6c251d453 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_tanh_3u.c @@ -0,0 +1,94 @@ +/* + * Double-precision vector tanh(x) function. + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "estrin.h" +#include "mathlib.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#define AbsMask v_u64 (0x7fffffffffffffff) +#define InvLn2 v_f64 (0x1.71547652b82fep0) +#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1) +#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56) +#define Shift v_f64 (0x1.8p52) +#define C(i) v_f64 (__expm1_poly[i]) + +#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */ +#define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). */ +#define One v_u64 (0x3ff0000000000000) + +static inline v_f64_t +expm1_inline (v_f64_t x) +{ + /* Helper routine for calculating exp(x) - 1. Vector port of the helper from + the scalar variant of tanh. */ + + /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ + v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift; + v_s64_t i = v_to_s64_f64 (j); + v_f64_t f = v_fma_f64 (j, MLn2hi, x); + f = v_fma_f64 (j, MLn2lo, f); + + /* Approximate expm1(f) using polynomial. */ + v_f64_t f2 = f * f; + v_f64_t f4 = f2 * f2; + v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f); + + /* t = 2 ^ i. */ + v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One); + /* expm1(x) = p * t + (t - 1). */ + return v_fma_f64 (p, t, t - 1); +} + +static NOINLINE v_f64_t +special_case (v_f64_t x, v_f64_t y, v_u64_t special) +{ + return v_call_f64 (tanh, x, y, special); +} + +/* Vector approximation for double-precision tanh(x), using a simplified + version of expm1. The greatest observed error is 2.75 ULP: + __v_tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3 + want -0x1.ba31ba4691ab4p-3. */ +VPCS_ATTR v_f64_t V_NAME (tanh) (v_f64_t x) +{ + v_u64_t ix = v_as_u64_f64 (x); + v_u64_t ia = ix & AbsMask; + + /* Trigger special-cases for tiny, boring and infinity/NaN. 
*/ + v_u64_t special = v_cond_u64 ((ia - TinyBound) > (BoringBound - TinyBound)); + v_f64_t u; + + /* To trigger fp exceptions correctly, set special lanes to a neutral value. + They will be fixed up later by the special-case handler. */ + if (unlikely (v_any_u64 (special))) + u = v_sel_f64 (special, v_f64 (1), x) * 2; + else + u = x * 2; + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + v_f64_t q = expm1_inline (u); + v_f64_t y = q / (q + 2); + + if (unlikely (v_any_u64 (special))) + return special_case (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, D, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (tanh), 2.26) +PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (tanh)) +PL_TEST_INTERVAL (V_NAME (tanh), 0, TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (tanh), -0, -TinyBound, 1000) +PL_TEST_INTERVAL (V_NAME (tanh), TinyBound, BoringBound, 100000) +PL_TEST_INTERVAL (V_NAME (tanh), -TinyBound, -BoringBound, 100000) +PL_TEST_INTERVAL (V_NAME (tanh), BoringBound, inf, 1000) +PL_TEST_INTERVAL (V_NAME (tanh), -BoringBound, -inf, 1000) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c new file mode 100644 index 000000000000..36166118c0f0 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/v_tanhf_2u6.c @@ -0,0 +1,69 @@ +/* + * Single-precision vector tanh(x) function. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ + +#include "v_math.h" +#include "pl_sig.h" +#include "pl_test.h" + +#if V_SUPPORTED + +#include "v_expm1f_inline.h" + +#define BoringBound \ + 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \ + negative). */ +#define AbsMask 0x7fffffff + +static NOINLINE v_f32_t +special_case (v_f32_t x, v_f32_t y, v_u32_t special) +{ + return v_call_f32 (tanhf, x, y, special); +} + +/* Approximation for single-precision vector tanh(x), using a simplified version + of expm1f. 
The maximum error is 2.58 ULP: + __v_tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5 + want 0x1.f9ba08p-5. */ +VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x) +{ + v_u32_t ix = v_as_u32_f32 (x); + v_u32_t iax = ix & AbsMask; + v_u32_t sign = ix & ~AbsMask; + v_u32_t is_boring = v_cond_u32 (iax > BoringBound); + v_f32_t boring = v_as_f32_u32 (sign | One); + +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered properly, set all special and boring + lanes to 1, which will trigger no exceptions, and fix them up later. */ + v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax < 0x34000000)); + ix = v_sel_u32 (is_boring, v_u32 (One), ix); + if (unlikely (v_any_u32 (special))) + ix = v_sel_u32 (special, v_u32 (One), ix); +#else + v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax == 0)); +#endif + + /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ + v_f32_t q = expm1f_inline (2 * v_as_f32_u32 (ix)); + v_f32_t y = q / (q + 2); + y = v_sel_f32 (is_boring, boring, y); + if (unlikely (v_any_u32 (special))) + return special_case (x, y, special); + return y; +} +VPCS_ALIAS + +PL_SIG (V, F, 1, tanh, -10.0, 10.0) +PL_TEST_ULP (V_NAME (tanhf), 2.09) +PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_SIMD_EXCEPT) +PL_TEST_INTERVAL (V_NAME (tanhf), 0, 0x1p-23, 1000) +PL_TEST_INTERVAL (V_NAME (tanhf), -0, -0x1p-23, 1000) +PL_TEST_INTERVAL (V_NAME (tanhf), 0x1p-23, 0x1.205966p+3, 100000) +PL_TEST_INTERVAL (V_NAME (tanhf), -0x1p-23, -0x1.205966p+3, 100000) +PL_TEST_INTERVAL (V_NAME (tanhf), 0x1.205966p+3, inf, 100) +PL_TEST_INTERVAL (V_NAME (tanhf), -0x1.205966p+3, -inf, 100) +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_acosh_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_acosh_3u5.c new file mode 100644 index 000000000000..649735b140f3 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_acosh_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_acosh. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_acosh, _ZGVnN2v_acosh) +#include "v_acosh_3u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_acoshf_3u1.c b/contrib/arm-optimized-routines/pl/math/vn_acoshf_3u1.c new file mode 100644 index 000000000000..8c5f106992a7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_acoshf_3u1.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_acoshf. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_acoshf, _ZGVnN4v_acoshf) +#include "v_acoshf_3u1.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_asinh_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_asinh_3u5.c new file mode 100644 index 000000000000..0d2373b5e4b2 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_asinh_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_asinh. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_asinh, _ZGVnN2v_asinh) +#include "v_asinh_3u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_asinhf_2u7.c b/contrib/arm-optimized-routines/pl/math/vn_asinhf_2u7.c new file mode 100644 index 000000000000..6c8927f0875b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_asinhf_2u7.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_asinhf. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_asinhf, _ZGVnN4v_asinhf) +#include "v_asinhf_2u7.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atan2_3u.c b/contrib/arm-optimized-routines/pl/math/vn_atan2_3u.c new file mode 100644 index 000000000000..925b5b4ef324 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_atan2_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atan2. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_atan2, _ZGVnN2vv_atan2) +#include "v_atan2_3u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atan2f_3u.c b/contrib/arm-optimized-routines/pl/math/vn_atan2f_3u.c new file mode 100644 index 000000000000..51d33d50f6ef --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_atan2f_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atan2f. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_atan2f, _ZGVnN4vv_atan2f) +#include "v_atan2f_3u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atan_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_atan_2u5.c new file mode 100644 index 000000000000..ccebce2dc2ed --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_atan_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atan. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_atan, _ZGVnN2v_atan) +#include "v_atan_2u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atanf_3u.c b/contrib/arm-optimized-routines/pl/math/vn_atanf_3u.c new file mode 100644 index 000000000000..b8797276d981 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_atanf_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atanf. + * + * Copyright (c) 2021-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_atanf, _ZGVnN4v_atanf) +#include "v_atanf_3u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atanh_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_atanh_3u5.c new file mode 100644 index 000000000000..19429b209b3a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_atanh_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atanh. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_atanh, _ZGVnN2v_atanh) +#include "v_atanh_3u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_atanhf_3u1.c b/contrib/arm-optimized-routines/pl/math/vn_atanhf_3u1.c new file mode 100644 index 000000000000..7de226dda054 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_atanhf_3u1.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_atanhf. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_atanhf, _ZGVnN4v_atanhf) +#include "v_atanhf_3u1.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_cbrt_2u.c b/contrib/arm-optimized-routines/pl/math/vn_cbrt_2u.c new file mode 100644 index 000000000000..4cb0dc8cefb5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_cbrt_2u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cbrt. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_cbrt, _ZGVnN2v_cbrt) +#include "v_cbrt_2u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_cbrtf_1u5.c b/contrib/arm-optimized-routines/pl/math/vn_cbrtf_1u5.c new file mode 100644 index 000000000000..40a72d8c301e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_cbrtf_1u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cbrtf. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_cbrtf, _ZGVnN4v_cbrtf) +#include "v_cbrtf_1u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_cosh_2u.c b/contrib/arm-optimized-routines/pl/math/vn_cosh_2u.c new file mode 100644 index 000000000000..9bf7f026447a --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_cosh_2u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_cosh. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_cosh, _ZGVnN2v_cosh) +#include "v_cosh_2u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_coshf_2u4.c b/contrib/arm-optimized-routines/pl/math/vn_coshf_2u4.c new file mode 100644 index 000000000000..b149cb34df61 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_coshf_2u4.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_coshf. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_coshf, _ZGVnN4v_coshf) +#include "v_coshf_2u4.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erf_2u.c b/contrib/arm-optimized-routines/pl/math/vn_erf_2u.c new file mode 100644 index 000000000000..95bd141554e4 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_erf_2u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_erf. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_erf, _ZGVnN2v_erf) +#include "v_erf_2u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erfc_4u.c b/contrib/arm-optimized-routines/pl/math/vn_erfc_4u.c new file mode 100644 index 000000000000..1cf6546ce715 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_erfc_4u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_erfc. + * + * Copyright (c) 2019-2023, Arm Limited. 
 + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_erfc, _ZGVnN2v_erfc) +#include "v_erfc_4u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erfcf_1u.c b/contrib/arm-optimized-routines/pl/math/vn_erfcf_1u.c new file mode 100644 index 000000000000..ef5a21d6336c --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_erfcf_1u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_erfcf. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_erfcf, _ZGVnN4v_erfcf) +#include "v_erfcf_1u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_erff_1u5.c b/contrib/arm-optimized-routines/pl/math/vn_erff_1u5.c new file mode 100644 index 000000000000..ee8848ee24ed --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_erff_1u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_erff. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_erff, _ZGVnN4v_erff) +#include "v_erff_1u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_exp_tail.c b/contrib/arm-optimized-routines/pl/math/vn_exp_tail.c new file mode 100644 index 000000000000..52a57feefbff --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_exp_tail.c @@ -0,0 +1,11 @@ +/* + * AdvSIMD vector PCS variant of __v_exp_tail. + * + * Copyright (c) 2019-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#include "v_exp_tail.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_expf.c b/contrib/arm-optimized-routines/pl/math/vn_expf.c new file mode 100644 index 000000000000..83e7f0a2070b --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_expf.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_expf. + * + * Copyright (c) 2019-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf) +#include "v_expf.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_expm1_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_expm1_2u5.c new file mode 100644 index 000000000000..35111e2fc221 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_expm1_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_expm1. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_expm1, _ZGVnN2v_expm1) +#include "v_expm1_2u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_expm1f_1u6.c b/contrib/arm-optimized-routines/pl/math/vn_expm1f_1u6.c new file mode 100644 index 000000000000..bea491f4898e --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_expm1f_1u6.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_expm1f. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_expm1f, _ZGVnN4v_expm1f) +#include "v_expm1f_1u6.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log10_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_log10_2u5.c new file mode 100644 index 000000000000..5f32c33e059f --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_log10_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log10. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_log10, _ZGVnN2v_log10) +#include "v_log10_2u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log10f_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_log10f_3u5.c new file mode 100644 index 000000000000..2673ef515df7 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_log10f_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log10f. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_log10f, _ZGVnN4v_log10f) +#include "v_log10f_3u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log1p_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_log1p_2u5.c new file mode 100644 index 000000000000..3f4f8d1bd297 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_log1p_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log1p. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_log1p, _ZGVnN2v_log1p) +#include "v_log1p_2u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log1pf_2u1.c b/contrib/arm-optimized-routines/pl/math/vn_log1pf_2u1.c new file mode 100644 index 000000000000..a319bc98f491 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_log1pf_2u1.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log1pf. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_log1pf, _ZGVnN4v_log1pf) +#include "v_log1pf_2u1.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log2_3u.c b/contrib/arm-optimized-routines/pl/math/vn_log2_3u.c new file mode 100644 index 000000000000..a87039204439 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_log2_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log2. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_log2, _ZGVnN2v_log2) +#include "v_log2_3u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_log2f_2u5.c b/contrib/arm-optimized-routines/pl/math/vn_log2f_2u5.c new file mode 100644 index 000000000000..b4a9cb708bae --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_log2f_2u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_log2f. + * + * Copyright (c) 2022-2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS strong_alias (__vn_log2f, _ZGVnN4v_log2f) +#include "v_log2f_2u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_sinh_3u.c b/contrib/arm-optimized-routines/pl/math/vn_sinh_3u.c new file mode 100644 index 000000000000..7c881de21688 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_sinh_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_sinh. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_sinh, _ZGVnN2v_sinh) +#include "v_sinh_3u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_sinhf_2u3.c b/contrib/arm-optimized-routines/pl/math/vn_sinhf_2u3.c new file mode 100644 index 000000000000..251e73232d01 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_sinhf_2u3.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_sinhf. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_sinhf, _ZGVnN4v_sinhf) +#include "v_sinhf_2u3.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tan_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_tan_3u5.c new file mode 100644 index 000000000000..a4efb065bc08 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_tan_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_tan. + * + * Copyright (c) 2023, Arm Limited. 
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_tan, _ZGVnN2v_tan) +#include "v_tan_3u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tanf_3u5.c b/contrib/arm-optimized-routines/pl/math/vn_tanf_3u5.c new file mode 100644 index 000000000000..a88cb4077b3d --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_tanf_3u5.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_tanf. + * + * Copyright (c) 2020-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_tanf, _ZGVnN4v_tanf) +#include "v_tanf_3u5.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tanh_3u.c b/contrib/arm-optimized-routines/pl/math/vn_tanh_3u.c new file mode 100644 index 000000000000..cb2746cf22a5 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_tanh_3u.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_tanh. + * + * Copyright (c) 2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_tanh, _ZGVnN2v_tanh) +#include "v_tanh_3u.c" +#endif diff --git a/contrib/arm-optimized-routines/pl/math/vn_tanhf_2u6.c b/contrib/arm-optimized-routines/pl/math/vn_tanhf_2u6.c new file mode 100644 index 000000000000..47f0a7f57d05 --- /dev/null +++ b/contrib/arm-optimized-routines/pl/math/vn_tanhf_2u6.c @@ -0,0 +1,12 @@ +/* + * AdvSIMD vector PCS variant of __v_tanhf. + * + * Copyright (c) 2022-2023, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception + */ +#include "include/mathlib.h" +#ifdef __vpcs +#define VPCS 1 +#define VPCS_ALIAS PL_ALIAS (__vn_tanhf, _ZGVnN4v_tanhf) +#include "v_tanhf_2u6.c" +#endif |