aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Turner <andrew@FreeBSD.org>2024-02-20 09:02:15 +0000
committerAndrew Turner <andrew@FreeBSD.org>2024-02-20 09:02:15 +0000
commitedc5c0de794f521eb620d2b6cbaee2434442a8f3 (patch)
tree64dfc547c0b6398e9cf94bd8175b21db8a74c814
parent29866ecb89620f1c798b7f5ff6710255f13aa52e (diff)
downloadsrc-vendor/arm-optimized-routines.tar.gz
src-vendor/arm-optimized-routines.zip
Import the v24.01 release of the Arm Optimized Routines [1]. [1] https://github.com/ARM-software/optimized-routines/tree/v24.01 Sponsored by: Arm Ltd
-rw-r--r--README2
-rw-r--r--config.mk.dist13
-rw-r--r--math/Dir.mk6
-rw-r--r--math/aarch64/v_cos.c87
-rw-r--r--math/aarch64/v_cosf.c82
-rw-r--r--math/aarch64/v_exp.c125
-rw-r--r--math/aarch64/v_exp2f.c113
-rw-r--r--math/aarch64/v_exp2f_1u.c72
-rw-r--r--math/aarch64/v_exp_data.c146
-rw-r--r--math/aarch64/v_expf.c122
-rw-r--r--math/aarch64/v_expf_1u.c77
-rw-r--r--math/aarch64/v_log.c100
-rw-r--r--math/aarch64/v_log_data.c156
-rw-r--r--math/aarch64/v_logf.c74
-rw-r--r--math/aarch64/v_math.h135
-rw-r--r--math/aarch64/v_pow.c22
-rw-r--r--math/aarch64/v_powf.c148
-rw-r--r--math/aarch64/v_sin.c97
-rw-r--r--math/aarch64/v_sinf.c82
-rw-r--r--math/exp10.c129
-rw-r--r--math/exp_data.c23
-rw-r--r--math/include/mathlib.h67
-rw-r--r--math/math_config.h61
-rw-r--r--math/s_cos.c6
-rw-r--r--math/s_cosf.c6
-rw-r--r--math/s_exp.c6
-rw-r--r--math/s_exp2f.c6
-rw-r--r--math/s_exp2f_1u.c6
-rw-r--r--math/s_expf.c6
-rw-r--r--math/s_expf_1u.c6
-rw-r--r--math/s_log.c6
-rw-r--r--math/s_logf.c6
-rw-r--r--math/s_pow.c6
-rw-r--r--math/s_powf.c6
-rw-r--r--math/s_sin.c6
-rw-r--r--math/s_sinf.c6
-rw-r--r--math/test/mathbench.c152
-rw-r--r--math/test/mathbench_funcs.h50
-rw-r--r--math/test/mathbench_wrappers.h42
-rw-r--r--math/test/mathtest.c9
-rwxr-xr-xmath/test/runulp.sh112
-rw-r--r--math/test/testcases/directed/exp10.tst15
-rw-r--r--math/test/ulp.c81
-rw-r--r--math/test/ulp.h29
-rw-r--r--math/test/ulp_funcs.h50
-rw-r--r--math/test/ulp_wrappers.h36
-rw-r--r--math/tgamma128.c356
-rw-r--r--math/tgamma128.h141
-rw-r--r--math/tools/tgamma128_gen.jl212
-rw-r--r--math/v_cos.c95
-rw-r--r--math/v_cosf.c84
-rw-r--r--math/v_exp.c128
-rw-r--r--math/v_exp.h14
-rw-r--r--math/v_exp2f.c117
-rw-r--r--math/v_exp2f_1u.c75
-rw-r--r--math/v_expf.c122
-rw-r--r--math/v_expf_1u.c80
-rw-r--r--math/v_log.c104
-rw-r--r--math/v_log.h18
-rw-r--r--math/v_log_data.c158
-rw-r--r--math/v_logf.c73
-rw-r--r--math/v_math.h661
-rw-r--r--math/v_pow.c27
-rw-r--r--math/v_powf.c235
-rw-r--r--math/v_sin.c103
-rw-r--r--math/v_sinf.c88
-rw-r--r--math/vn_cos.c12
-rw-r--r--math/vn_cosf.c12
-rw-r--r--math/vn_exp.c12
-rw-r--r--math/vn_exp2f.c12
-rw-r--r--math/vn_exp2f_1u.c11
-rw-r--r--math/vn_expf.c12
-rw-r--r--math/vn_expf_1u.c11
-rw-r--r--math/vn_log.c12
-rw-r--r--math/vn_logf.c12
-rw-r--r--math/vn_pow.c12
-rw-r--r--math/vn_powf.c12
-rw-r--r--math/vn_sin.c12
-rw-r--r--math/vn_sinf.c12
-rw-r--r--pl/math/Dir.mk89
-rw-r--r--pl/math/acos_2u.c100
-rw-r--r--pl/math/acosf_1u4.c99
-rw-r--r--pl/math/asin_3u.c106
-rw-r--r--pl/math/asin_data.c19
-rw-r--r--pl/math/asinf_2u5.c100
-rw-r--r--pl/math/asinf_data.c16
-rw-r--r--pl/math/asinh_2u5.c5
-rw-r--r--pl/math/asinhf_3u5.c6
-rw-r--r--pl/math/atan_common.h40
-rw-r--r--pl/math/atanf_2u9.c12
-rw-r--r--pl/math/atanf_common.h33
-rw-r--r--pl/math/atanh_3u.c15
-rw-r--r--pl/math/atanhf_3u1.c12
-rw-r--r--pl/math/cbrt_2u.c5
-rw-r--r--pl/math/cbrtf_1u5.c9
-rw-r--r--pl/math/cosh_2u.c9
-rw-r--r--pl/math/coshf_1u9.c9
-rw-r--r--pl/math/cospi_3u1.c89
-rw-r--r--pl/math/cospif_2u6.c84
-rw-r--r--pl/math/erf_2u5.c102
-rw-r--r--pl/math/erf_data.c788
-rw-r--r--pl/math/erfc_1u8.c153
-rw-r--r--pl/math/erfc_4u5.c155
-rw-r--r--pl/math/erfc_data.c3628
-rw-r--r--pl/math/erfcf.h38
-rw-r--r--pl/math/erfcf_1u7.c103
-rw-r--r--pl/math/erfcf_2u.c133
-rw-r--r--pl/math/erfcf_data.c703
-rw-r--r--pl/math/erff_1u5.c108
-rw-r--r--pl/math/erff_2u.c82
-rw-r--r--pl/math/erff_data.c532
-rw-r--r--pl/math/erfinv_24u5.c81
-rw-r--r--pl/math/erfinvf_4u7.c74
-rw-r--r--pl/math/erfinvl.c114
-rw-r--r--pl/math/estrin.h16
-rw-r--r--pl/math/estrin_wrap.h48
-rw-r--r--pl/math/estrinf.h14
-rw-r--r--pl/math/expf.c4
-rw-r--r--pl/math/expm1_2u5.c19
-rw-r--r--pl/math/expm1f_1u6.c11
-rw-r--r--pl/math/finite_pow.h365
-rw-r--r--pl/math/horner.h14
-rw-r--r--pl/math/horner_wrap.h34
-rw-r--r--pl/math/hornerf.h14
-rw-r--r--pl/math/include/mathlib.h238
-rw-r--r--pl/math/include/pl_test.h8
-rw-r--r--pl/math/log1p_2u.c17
-rw-r--r--pl/math/log1pf_2u1.c16
-rw-r--r--pl/math/math_config.h252
-rw-r--r--pl/math/math_err.c4
-rw-r--r--pl/math/math_errf.c4
-rw-r--r--pl/math/pairwise_horner.h14
-rw-r--r--pl/math/pairwise_horner_wrap.h48
-rw-r--r--pl/math/pairwise_hornerf.h14
-rw-r--r--pl/math/pl_sig.h56
-rw-r--r--pl/math/poly_advsimd_f32.h24
-rw-r--r--pl/math/poly_advsimd_f64.h24
-rw-r--r--pl/math/poly_generic.h277
-rw-r--r--pl/math/poly_scalar_f32.h24
-rw-r--r--pl/math/poly_scalar_f64.h24
-rw-r--r--pl/math/poly_sve_f32.h26
-rw-r--r--pl/math/poly_sve_f64.h26
-rw-r--r--pl/math/poly_sve_generic.h301
-rw-r--r--pl/math/s_acosh_3u5.c6
-rw-r--r--pl/math/s_acoshf_3u1.c6
-rw-r--r--pl/math/s_asinh_3u5.c6
-rw-r--r--pl/math/s_asinhf_2u7.c6
-rw-r--r--pl/math/s_atan2_3u.c6
-rw-r--r--pl/math/s_atan2f_3u.c6
-rw-r--r--pl/math/s_atan_2u5.c6
-rw-r--r--pl/math/s_atanf_3u.c6
-rw-r--r--pl/math/s_atanh_3u5.c6
-rw-r--r--pl/math/s_atanhf_3u1.c6
-rw-r--r--pl/math/s_cbrt_2u.c6
-rw-r--r--pl/math/s_cbrtf_1u5.c6
-rw-r--r--pl/math/s_cosh_2u.c6
-rw-r--r--pl/math/s_coshf_2u4.c6
-rw-r--r--pl/math/s_erf_2u.c6
-rw-r--r--pl/math/s_erfc_4u.c6
-rw-r--r--pl/math/s_erfcf_1u.c6
-rw-r--r--pl/math/s_erff_1u5.c6
-rw-r--r--pl/math/s_exp_tail.c6
-rw-r--r--pl/math/s_expf.c6
-rw-r--r--pl/math/s_expm1_2u5.c6
-rw-r--r--pl/math/s_expm1f_1u6.c6
-rw-r--r--pl/math/s_log10_2u5.c6
-rw-r--r--pl/math/s_log10f_3u5.c6
-rw-r--r--pl/math/s_log1p_2u5.c6
-rw-r--r--pl/math/s_log1pf_2u1.c6
-rw-r--r--pl/math/s_log2_3u.c6
-rw-r--r--pl/math/s_log2f_2u5.c6
-rw-r--r--pl/math/s_sinh_3u.c6
-rw-r--r--pl/math/s_sinhf_2u3.c6
-rw-r--r--pl/math/s_tan_3u5.c6
-rw-r--r--pl/math/s_tanf_3u5.c6
-rw-r--r--pl/math/s_tanh_3u.c6
-rw-r--r--pl/math/s_tanhf_2u6.c6
-rw-r--r--pl/math/sinh_3u.c9
-rw-r--r--pl/math/sinhf_2u3.c9
-rw-r--r--pl/math/sinpi_3u.c90
-rw-r--r--pl/math/sinpif_2u5.c83
-rw-r--r--pl/math/sv_acos_2u.c91
-rw-r--r--pl/math/sv_acosf_1u4.c84
-rw-r--r--pl/math/sv_acosh_3u5.c50
-rw-r--r--pl/math/sv_acoshf_2u8.c47
-rw-r--r--pl/math/sv_asin_3u.c84
-rw-r--r--pl/math/sv_asinf_2u5.c76
-rw-r--r--pl/math/sv_asinh_3u0.c129
-rw-r--r--pl/math/sv_asinhf_2u5.c55
-rw-r--r--pl/math/sv_atan2_2u5.c111
-rw-r--r--pl/math/sv_atan2f_3u.c112
-rw-r--r--pl/math/sv_atan_2u5.c77
-rw-r--r--pl/math/sv_atan_common.h61
-rw-r--r--pl/math/sv_atanf_2u9.c69
-rw-r--r--pl/math/sv_atanf_common.h47
-rw-r--r--pl/math/sv_atanh_3u3.c60
-rw-r--r--pl/math/sv_atanhf_2u8.c56
-rw-r--r--pl/math/sv_cbrt_2u.c122
-rw-r--r--pl/math/sv_cbrtf_1u7.c116
-rw-r--r--pl/math/sv_cexpi_3u5.c45
-rw-r--r--pl/math/sv_cexpif_1u8.c47
-rw-r--r--pl/math/sv_cos_2u5.c104
-rw-r--r--pl/math/sv_cosf_2u1.c94
-rw-r--r--pl/math/sv_cosh_2u.c100
-rw-r--r--pl/math/sv_coshf_2u.c56
-rw-r--r--pl/math/sv_cospi_3u2.c63
-rw-r--r--pl/math/sv_cospif_2u6.c59
-rw-r--r--pl/math/sv_erf_2u5.c111
-rw-r--r--pl/math/sv_erf_3u.c103
-rw-r--r--pl/math/sv_erf_data.c1558
-rw-r--r--pl/math/sv_erfc_1u8.c164
-rw-r--r--pl/math/sv_erfc_4u.c146
-rw-r--r--pl/math/sv_erfcf_1u7.c111
-rw-r--r--pl/math/sv_erff_1u3.c104
-rw-r--r--pl/math/sv_erff_2u.c90
-rw-r--r--pl/math/sv_erff_data.c1046
-rw-r--r--pl/math/sv_exp10_1u5.c122
-rw-r--r--pl/math/sv_exp10f_1u5.c87
-rw-r--r--pl/math/sv_exp2_2u.c107
-rw-r--r--pl/math/sv_exp2f_1u6.c80
-rw-r--r--pl/math/sv_exp_1u5.c137
-rw-r--r--pl/math/sv_exp_tail.h79
-rw-r--r--pl/math/sv_expf_2u.c180
-rw-r--r--pl/math/sv_expf_data.c12
-rw-r--r--pl/math/sv_expf_inline.h66
-rw-r--r--pl/math/sv_expm1_2u5.c95
-rw-r--r--pl/math/sv_expm1f_1u6.c93
-rw-r--r--pl/math/sv_expm1f_inline.h73
-rw-r--r--pl/math/sv_hypot_1u5.c51
-rw-r--r--pl/math/sv_hypotf_1u5.c45
-rw-r--r--pl/math/sv_log10_2u5.c94
-rw-r--r--pl/math/sv_log10f_3u5.c119
-rw-r--r--pl/math/sv_log1p_2u5.c116
-rw-r--r--pl/math/sv_log1p_inline.h96
-rw-r--r--pl/math/sv_log1pf_1u3.c97
-rw-r--r--pl/math/sv_log1pf_inline.h65
-rw-r--r--pl/math/sv_log2_3u.c94
-rw-r--r--pl/math/sv_log2f_2u5.c99
-rw-r--r--pl/math/sv_log_2u5.c101
-rw-r--r--pl/math/sv_log_data.c146
-rw-r--r--pl/math/sv_logf_3u4.c99
-rw-r--r--pl/math/sv_logf_data.c12
-rw-r--r--pl/math/sv_math.h220
-rw-r--r--pl/math/sv_pow_1u5.c444
-rw-r--r--pl/math/sv_powf_2u6.c360
-rw-r--r--pl/math/sv_powi.c25
-rw-r--r--pl/math/sv_powif.c26
-rw-r--r--pl/math/sv_sin_3u.c89
-rw-r--r--pl/math/sv_sin_3u5.c96
-rw-r--r--pl/math/sv_sincos_3u5.c61
-rw-r--r--pl/math/sv_sincos_common.h85
-rw-r--r--pl/math/sv_sincosf_1u8.c62
-rw-r--r--pl/math/sv_sincosf_common.h81
-rw-r--r--pl/math/sv_sinf_1u9.c103
-rw-r--r--pl/math/sv_sinf_poly_data.c19
-rw-r--r--pl/math/sv_sinh_3u.c103
-rw-r--r--pl/math/sv_sinhf_2u3.c64
-rw-r--r--pl/math/sv_sinpi_3u1.c57
-rw-r--r--pl/math/sv_sinpif_2u5.c53
-rw-r--r--pl/math/sv_tan_3u5.c99
-rw-r--r--pl/math/sv_tanf_3u5.c141
-rw-r--r--pl/math/sv_tanh_3u.c96
-rw-r--r--pl/math/sv_tanhf_2u6.c59
-rw-r--r--pl/math/tanf_3u3.c27
-rw-r--r--pl/math/tanh_3u.c22
-rw-r--r--pl/math/tanhf_2u6.c9
-rw-r--r--pl/math/test/mathbench_funcs.h55
-rw-r--r--pl/math/test/mathbench_wrappers.h159
-rw-r--r--pl/math/test/pl_test.h24
-rwxr-xr-xpl/math/test/runulp.sh56
-rw-r--r--pl/math/test/testcases/directed/acos.tst17
-rw-r--r--pl/math/test/testcases/directed/acosf.tst21
-rw-r--r--pl/math/test/testcases/directed/asin.tst24
-rw-r--r--pl/math/test/testcases/directed/asinf.tst24
-rw-r--r--pl/math/test/ulp_funcs.h54
-rw-r--r--pl/math/test/ulp_wrappers.h78
-rw-r--r--pl/math/tools/asin.sollya29
-rw-r--r--pl/math/tools/asinf.sollya36
-rw-r--r--pl/math/tools/erf.sollya25
-rw-r--r--pl/math/tools/erfc.sollya60
-rw-r--r--pl/math/tools/erfcf.sollya41
-rw-r--r--pl/math/tools/erff.sollya20
-rw-r--r--pl/math/tools/exp10.sollya55
-rw-r--r--pl/math/tools/sincos.sollya33
-rw-r--r--pl/math/tools/sincosf.sollya33
-rw-r--r--pl/math/tools/sinpi.sollya33
-rw-r--r--pl/math/trigpi_references.c57
-rw-r--r--pl/math/v_acos_2u.c122
-rw-r--r--pl/math/v_acosf_1u4.c113
-rw-r--r--pl/math/v_acosh_3u5.c63
-rw-r--r--pl/math/v_acoshf_3u1.c70
-rw-r--r--pl/math/v_asin_3u.c113
-rw-r--r--pl/math/v_asinf_2u5.c104
-rw-r--r--pl/math/v_asinh_3u5.c176
-rw-r--r--pl/math/v_asinhf_2u7.c78
-rw-r--r--pl/math/v_atan2_3u.c117
-rw-r--r--pl/math/v_atan2f_3u.c112
-rw-r--r--pl/math/v_atan_2u5.c98
-rw-r--r--pl/math/v_atanf_3u.c96
-rw-r--r--pl/math/v_atanh_3u5.c69
-rw-r--r--pl/math/v_atanhf_3u1.c73
-rw-r--r--pl/math/v_cbrt_2u.c100
-rw-r--r--pl/math/v_cbrtf_1u5.c96
-rw-r--r--pl/math/v_cbrtf_1u7.c116
-rw-r--r--pl/math/v_cexpi_3u5.c45
-rw-r--r--pl/math/v_cexpif_1u8.c47
-rw-r--r--pl/math/v_cosh_2u.c130
-rw-r--r--pl/math/v_coshf_2u4.c76
-rw-r--r--pl/math/v_cospi_3u1.c86
-rw-r--r--pl/math/v_cospif_3u2.c83
-rw-r--r--pl/math/v_erf_2u.c116
-rw-r--r--pl/math/v_erf_2u5.c158
-rw-r--r--pl/math/v_erf_data.c119
-rw-r--r--pl/math/v_erfc_1u8.c198
-rw-r--r--pl/math/v_erfc_4u.c168
-rw-r--r--pl/math/v_erfc_data.c96
-rw-r--r--pl/math/v_erfcf_1u.c183
-rw-r--r--pl/math/v_erfcf_1u7.c166
-rw-r--r--pl/math/v_erff_1u5.c116
-rw-r--r--pl/math/v_erff_2u.c118
-rw-r--r--pl/math/v_erff_data.c18
-rw-r--r--pl/math/v_erfinv_25u.c161
-rw-r--r--pl/math/v_erfinvf_5u.c163
-rw-r--r--pl/math/v_exp10_2u.c144
-rw-r--r--pl/math/v_exp10f_2u4.c138
-rw-r--r--pl/math/v_exp2_2u.c128
-rw-r--r--pl/math/v_exp_data.c55
-rw-r--r--pl/math/v_exp_tail.c75
-rw-r--r--pl/math/v_exp_tail_data.c179
-rw-r--r--pl/math/v_exp_tail_inline.h102
-rw-r--r--pl/math/v_expf.c83
-rw-r--r--pl/math/v_expf_inline.h60
-rw-r--r--pl/math/v_expm1_2u5.c139
-rw-r--r--pl/math/v_expm1f_1u6.c123
-rw-r--r--pl/math/v_expm1f_inline.h56
-rw-r--r--pl/math/v_hypot_1u5.c95
-rw-r--r--pl/math/v_hypotf_1u5.c94
-rw-r--r--pl/math/v_log10_2u5.c140
-rw-r--r--pl/math/v_log10_data.c298
-rw-r--r--pl/math/v_log10f_3u5.c114
-rw-r--r--pl/math/v_log10f_data.c13
-rw-r--r--pl/math/v_log1p_2u5.c144
-rw-r--r--pl/math/v_log1p_inline.h82
-rw-r--r--pl/math/v_log1pf_2u1.c174
-rw-r--r--pl/math/v_log1pf_inline.h74
-rw-r--r--pl/math/v_log2_3u.c133
-rw-r--r--pl/math/v_log2_data.c278
-rw-r--r--pl/math/v_log2f_2u5.c93
-rw-r--r--pl/math/v_log2f_data.c15
-rw-r--r--pl/math/v_log_data.c161
-rw-r--r--pl/math/v_log_inline.h104
-rw-r--r--pl/math/v_logf_inline.h59
-rw-r--r--pl/math/v_math.h874
-rw-r--r--pl/math/v_pow_1u5.c259
-rw-r--r--pl/math/v_pow_exp_data.c (renamed from math/v_exp_data.c)164
-rw-r--r--pl/math/v_pow_log_data.c174
-rw-r--r--pl/math/v_powf_data.c89
-rw-r--r--pl/math/v_sincos_3u5.c57
-rw-r--r--pl/math/v_sincos_common.h86
-rw-r--r--pl/math/v_sincosf_1u8.c58
-rw-r--r--pl/math/v_sincosf_common.h84
-rw-r--r--pl/math/v_sinh_3u.c120
-rw-r--r--pl/math/v_sinhf_2u3.c91
-rw-r--r--pl/math/v_sinpi_3u1.c86
-rw-r--r--pl/math/v_sinpif_3u.c81
-rw-r--r--pl/math/v_tan_3u5.c124
-rw-r--r--pl/math/v_tan_data.c15
-rw-r--r--pl/math/v_tanf_3u5.c134
-rw-r--r--pl/math/v_tanh_3u.c112
-rw-r--r--pl/math/v_tanhf_2u6.c80
-rw-r--r--pl/math/vn_acosh_3u5.c12
-rw-r--r--pl/math/vn_acoshf_3u1.c12
-rw-r--r--pl/math/vn_asinh_3u5.c12
-rw-r--r--pl/math/vn_asinhf_2u7.c12
-rw-r--r--pl/math/vn_atan2_3u.c12
-rw-r--r--pl/math/vn_atan2f_3u.c12
-rw-r--r--pl/math/vn_atan_2u5.c12
-rw-r--r--pl/math/vn_atanf_3u.c12
-rw-r--r--pl/math/vn_atanh_3u5.c12
-rw-r--r--pl/math/vn_atanhf_3u1.c12
-rw-r--r--pl/math/vn_cbrt_2u.c12
-rw-r--r--pl/math/vn_cbrtf_1u5.c12
-rw-r--r--pl/math/vn_cosh_2u.c12
-rw-r--r--pl/math/vn_coshf_2u4.c12
-rw-r--r--pl/math/vn_erf_2u.c12
-rw-r--r--pl/math/vn_erfc_4u.c12
-rw-r--r--pl/math/vn_erfcf_1u.c12
-rw-r--r--pl/math/vn_erff_1u5.c12
-rw-r--r--pl/math/vn_exp_tail.c11
-rw-r--r--pl/math/vn_expf.c12
-rw-r--r--pl/math/vn_expm1_2u5.c12
-rw-r--r--pl/math/vn_expm1f_1u6.c12
-rw-r--r--pl/math/vn_log10_2u5.c12
-rw-r--r--pl/math/vn_log10f_3u5.c12
-rw-r--r--pl/math/vn_log1p_2u5.c12
-rw-r--r--pl/math/vn_log1pf_2u1.c12
-rw-r--r--pl/math/vn_log2_3u.c12
-rw-r--r--pl/math/vn_log2f_2u5.c12
-rw-r--r--pl/math/vn_sinh_3u.c12
-rw-r--r--pl/math/vn_sinhf_2u3.c12
-rw-r--r--pl/math/vn_tan_3u5.c12
-rw-r--r--pl/math/vn_tanf_3u5.c12
-rw-r--r--pl/math/vn_tanh_3u.c12
-rw-r--r--pl/math/vn_tanhf_2u6.c12
-rw-r--r--string/aarch64/asmdefs.h14
-rw-r--r--string/aarch64/memcpy-advsimd.S62
-rw-r--r--string/aarch64/memcpy-mops.S21
-rw-r--r--string/aarch64/memmove-mops.S21
-rw-r--r--string/aarch64/memset-mops.S20
-rw-r--r--string/bench/memcpy.c5
-rw-r--r--string/include/stringlib.h7
-rw-r--r--string/test/memcpy.c5
-rw-r--r--string/test/memmove.c5
-rw-r--r--string/test/memset.c5
414 files changed, 26613 insertions, 10731 deletions
diff --git a/README b/README
index a2143a28488a..651ebdc84bc8 100644
--- a/README
+++ b/README
@@ -12,7 +12,7 @@ contribution requirements are documented in README.contributors of
the appropriate subdirectory.
Regular quarterly releases are tagged as vYY.MM, the latest
-release is v23.01.
+release is v24.01.
Source code layout:
diff --git a/config.mk.dist b/config.mk.dist
index 7a8497507a81..03fb54db52fa 100644
--- a/config.mk.dist
+++ b/config.mk.dist
@@ -1,6 +1,6 @@
# Example config.mk
#
-# Copyright (c) 2018-2022, Arm Limited.
+# Copyright (c) 2018-2023, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
# Subprojects to build
@@ -59,13 +59,14 @@ math-cflags += -ffp-contract=fast -fno-math-errno
# Use with clang.
#math-cflags += -ffp-contract=fast
-# Disable vector math code
-#math-cflags += -DWANT_VMATH=0
-
-# Disable/enable SVE vector math code and tests
+# Disable/enable SVE vector math code and tests.
+# If WANT_SVE_MATH is enabled, math-sve-cflags is added for SVE
+# routines only so that SVE code does not leak into scalar
+# routines. It is also necessary to add it for tools (e.g. ulp,
+# mathbench)
WANT_SVE_MATH = 0
ifeq ($(WANT_SVE_MATH), 1)
- math-cflags += -march=armv8.2-a+sve
+ math-sve-cflags = -march=armv8-a+sve
endif
math-cflags += -DWANT_SVE_MATH=$(WANT_SVE_MATH)
diff --git a/math/Dir.mk b/math/Dir.mk
index 2a9cad10d96a..5e9494a7bd3c 100644
--- a/math/Dir.mk
+++ b/math/Dir.mk
@@ -1,12 +1,14 @@
# Makefile fragment - requires GNU make
#
-# Copyright (c) 2019-2022, Arm Limited.
+# Copyright (c) 2019-2023, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
S := $(srcdir)/math
B := build/math
math-lib-srcs := $(wildcard $(S)/*.[cS])
+math-lib-srcs += $(wildcard $(S)/$(ARCH)/*.[cS])
+
math-test-srcs := \
$(S)/test/mathtest.c \
$(S)/test/mathbench.c \
@@ -65,6 +67,8 @@ build/lib/libmathlib.a: $(math-lib-objs)
$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
$(math-tools): LDLIBS += $(math-ldlibs) -lm
+# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
+$(math-tools): CFLAGS_ALL += $(math-sve-cflags)
build/bin/rtest: $(math-host-objs)
$(HOST_CC) $(HOST_CFLAGS) $(HOST_LDFLAGS) -o $@ $^ $(HOST_LDLIBS)
diff --git a/math/aarch64/v_cos.c b/math/aarch64/v_cos.c
new file mode 100644
index 000000000000..9a73575bce89
--- /dev/null
+++ b/math/aarch64/v_cos.c
@@ -0,0 +1,87 @@
+/*
+ * Double-precision vector cos function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float64x2_t poly[7];
+ float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
+} data = {
+ /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
+ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+ V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+ V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+ V2 (-0x1.9e9540300a1p-41) },
+ .inv_pi = V2 (0x1.45f306dc9c883p-2),
+ .half_pi = V2 (0x1.921fb54442d18p+0),
+ .pi_1 = V2 (0x1.921fb54442d18p+1),
+ .pi_2 = V2 (0x1.1a62633145c06p-53),
+ .pi_3 = V2 (0x1.c1cd129024e09p-106),
+ .shift = V2 (0x1.8p52),
+ .range_val = V2 (0x1p23)
+};
+
+#define C(i) d->poly[i]
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (cos, x, y, cmp);
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t n, r, r2, r3, r4, t1, t2, t3, y;
+ uint64x2_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ r = vabsq_f64 (x);
+ cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r),
+ vreinterpretq_u64_f64 (d->range_val));
+ if (unlikely (v_any_u64 (cmp)))
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ special-case handler later. */
+ r = vbslq_f64 (cmp, v_f64 (1.0), r);
+#else
+ cmp = vcageq_f64 (x, d->range_val);
+ r = x;
+#endif
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
+ n = vsubq_f64 (n, d->shift);
+ n = vsubq_f64 (n, v_f64 (0.5));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f64 (r, d->pi_1, n);
+ r = vfmsq_f64 (r, d->pi_2, n);
+ r = vfmsq_f64 (r, d->pi_3, n);
+
+ /* sin(r) poly approx. */
+ r2 = vmulq_f64 (r, r);
+ r3 = vmulq_f64 (r2, r);
+ r4 = vmulq_f64 (r2, r2);
+
+ t1 = vfmaq_f64 (C (4), C (5), r2);
+ t2 = vfmaq_f64 (C (2), C (3), r2);
+ t3 = vfmaq_f64 (C (0), C (1), r2);
+
+ y = vfmaq_f64 (t1, C (6), r4);
+ y = vfmaq_f64 (t2, y, r4);
+ y = vfmaq_f64 (t3, y, r4);
+ y = vfmaq_f64 (r, y, r3);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
diff --git a/math/aarch64/v_cosf.c b/math/aarch64/v_cosf.c
new file mode 100644
index 000000000000..b9890b2998ad
--- /dev/null
+++ b/math/aarch64/v_cosf.c
@@ -0,0 +1,82 @@
+/*
+ * Single-precision vector cos function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float32x4_t poly[4];
+ float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
+} data = {
+ /* 1.886 ulp error. */
+ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
+ V4 (0x1.5b2e76p-19f) },
+
+ .pi_1 = V4 (0x1.921fb6p+1f),
+ .pi_2 = V4 (-0x1.777a5cp-24f),
+ .pi_3 = V4 (-0x1.ee59dap-49f),
+
+ .inv_pi = V4 (0x1.45f306p-2f),
+ .shift = V4 (0x1.8p+23f),
+ .half_pi = V4 (0x1.921fb6p0f),
+ .range_val = V4 (0x1p20f)
+};
+
+#define C(i) d->poly[i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (cosf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, r3, y;
+ uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ r = vabsq_f32 (x);
+ cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r),
+ vreinterpretq_u32_f32 (d->range_val));
+ if (unlikely (v_any_u32 (cmp)))
+ /* If fenv exceptions are to be triggered correctly, set any special lanes
+ to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+ special-case handler later. */
+ r = vbslq_f32 (cmp, v_f32 (1.0f), r);
+#else
+ cmp = vcageq_f32 (x, d->range_val);
+ r = x;
+#endif
+
+ /* n = rint((|x|+pi/2)/pi) - 0.5. */
+ n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
+ n = vsubq_f32 (n, d->shift);
+ n = vsubq_f32 (n, v_f32 (0.5f));
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f32 (r, d->pi_1, n);
+ r = vfmsq_f32 (r, d->pi_2, n);
+ r = vfmsq_f32 (r, d->pi_3, n);
+
+ /* y = sin(r). */
+ r2 = vmulq_f32 (r, r);
+ r3 = vmulq_f32 (r2, r);
+ y = vfmaq_f32 (C (2), C (3), r2);
+ y = vfmaq_f32 (C (1), y, r2);
+ y = vfmaq_f32 (C (0), y, r2);
+ y = vfmaq_f32 (r, y, r3);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
diff --git a/math/aarch64/v_exp.c b/math/aarch64/v_exp.c
new file mode 100644
index 000000000000..bc5609faf4fc
--- /dev/null
+++ b/math/aarch64/v_exp.c
@@ -0,0 +1,125 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+
+const static volatile struct
+{
+ float64x2_t poly[3];
+ float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+#if !WANT_SIMD_EXCEPT
+ float64x2_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* maxerr: 1.88 +0.5 ulp
+ rel error: 1.4337*2^-53
+ abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
+ .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
+ V2 (0x1.55555da646206p-5) },
+#if !WANT_SIMD_EXCEPT
+ .scale_thresh = V2 (163840.0), /* 1280.0 * N. */
+ .special_bound = V2 (704.0),
+#endif
+ .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2. */
+ .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N. */
+ .ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
+ .shift = V2 (0x1.8p+52)
+};
+
+#define C(i) data.poly[i]
+#define Tab __v_exp_data
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
+# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
+# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f64 (exp, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n)
+{
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (
+ vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+ uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+ return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
+{
+ float64x2_t n, r, r2, s, y, z;
+ uint64x2_t cmp, u, e;
+
+#if WANT_SIMD_EXCEPT
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special_case to fix special lanes later. This is only necessary if fenv
+ exceptions are to be triggered correctly. */
+ float64x2_t xm = x;
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound);
+ if (unlikely (v_any_u64 (cmp)))
+ x = vbslq_f64 (cmp, v_f64 (1), x);
+#else
+ cmp = vcagtq_f64 (x, data.special_bound);
+#endif
+
+ /* n = round(x/(ln2/N)). */
+ z = vfmaq_f64 (data.shift, x, data.inv_ln2);
+ u = vreinterpretq_u64_f64 (z);
+ n = vsubq_f64 (z, data.shift);
+
+ /* r = x - n*ln2/N. */
+ r = x;
+ r = vfmsq_f64 (r, data.ln2_hi, n);
+ r = vfmsq_f64 (r, data.ln2_lo, n);
+
+ e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4. */
+ r2 = vmulq_f64 (r, r);
+ y = vfmaq_f64 (C (0), C (1), r);
+ y = vfmaq_f64 (y, C (2), r2);
+ y = vfmaq_f64 (r, y, r2);
+
+ /* s = 2^(n/N). */
+ u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] };
+ s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ if (unlikely (v_any_u64 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f64 (s, y, s), cmp);
+#else
+ return special_case (s, y, n);
+#endif
+
+ return vfmaq_f64 (s, y, s);
+}
diff --git a/math/aarch64/v_exp2f.c b/math/aarch64/v_exp2f.c
new file mode 100644
index 000000000000..e402205e98e6
--- /dev/null
+++ b/math/aarch64/v_exp2f.c
@@ -0,0 +1,113 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float32x4_t poly[5];
+ uint32x4_t exponent_bias;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* maxerr: 1.962 ulp. */
+ .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
+ V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
+ .exponent_bias = V4 (0x3f800000),
+#if !WANT_SIMD_EXCEPT
+ .special_bound = V4 (126.0f),
+ .scale_thresh = V4 (192.0f),
+#endif
+};
+
+#define C(i) d->poly[i]
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
+# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
+# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f32 (exp2f, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u32 (0x82000000)
+# define SpecialBias v_u32 (0x7f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r2 = vmulq_f32 (s1, s1);
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+ return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+float32x4_t VPCS_ATTR V_NAME_F1 (exp2) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, scale, p, q, poly;
+ uint32x4_t cmp, e;
+
+#if WANT_SIMD_EXCEPT
+ /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
+ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special_case to fix special lanes later. This is only necessary if fenv
+ exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ n = vrndaq_f32 (x);
+ r = vsubq_f32 (x, n);
+ e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
+ scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+#if !WANT_SIMD_EXCEPT
+ cmp = vcagtq_f32 (n, d->special_bound);
+#endif
+
+ r2 = vmulq_f32 (r, r);
+ p = vfmaq_f32 (C (1), C (0), r);
+ q = vfmaq_f32 (C (3), C (2), r);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_f32 (C (4), r);
+ poly = vfmaq_f32 (p, q, r2);
+
+ if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+ return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+ return vfmaq_f32 (scale, poly, scale);
+}
diff --git a/math/aarch64/v_exp2f_1u.c b/math/aarch64/v_exp2f_1u.c
new file mode 100644
index 000000000000..ba6b02fbb4bc
--- /dev/null
+++ b/math/aarch64/v_exp2f_1u.c
@@ -0,0 +1,72 @@
+/*
+ * Single-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const float Poly[] = {
+ /* maxerr: 0.878 ulp. */
+ 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f
+};
+#define C0 v_f32 (Poly[0])
+#define C1 v_f32 (Poly[1])
+#define C2 v_f32 (Poly[2])
+#define C3 v_f32 (Poly[3])
+#define C4 v_f32 (Poly[4])
+#define C5 v_f32 (Poly[5])
+
+#define Shift v_f32 (0x1.8p23f)
+#define InvLn2 v_f32 (0x1.715476p+0f)
+#define Ln2hi v_f32 (0x1.62e4p-1f)
+#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
+ float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
+ float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
+ uint32x4_t cmp = absn > v_f32 (192.0f);
+ float32x4_t r1 = s1 * s1;
+ float32x4_t r0 = poly * s1 * s2;
+ return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+ | (~cmp & vreinterpretq_u32_f32 (r0)));
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_exp2f_1u (float32x4_t x)
+{
+ float32x4_t n, r, scale, poly, absn;
+ uint32x4_t cmp, e;
+
+ /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+#if 0
+ float32x4_t z;
+ z = x + Shift;
+ n = z - Shift;
+ r = x - n;
+ e = vreinterpretq_u32_f32 (z) << 23;
+#else
+ n = vrndaq_f32 (x);
+ r = x - n;
+ e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)) << 23;
+#endif
+ scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000));
+ absn = vabsq_f32 (n);
+ cmp = absn > v_f32 (126.0f);
+ poly = vfmaq_f32 (C1, C0, r);
+ poly = vfmaq_f32 (C2, poly, r);
+ poly = vfmaq_f32 (C3, poly, r);
+ poly = vfmaq_f32 (C4, poly, r);
+ poly = vfmaq_f32 (C5, poly, r);
+ poly = vfmaq_f32 (v_f32 (1.0f), poly, r);
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (poly, n, e, absn);
+ return scale * poly;
+}
diff --git a/math/aarch64/v_exp_data.c b/math/aarch64/v_exp_data.c
new file mode 100644
index 000000000000..45f0848cac5b
--- /dev/null
+++ b/math/aarch64/v_exp_data.c
@@ -0,0 +1,146 @@
+/*
+ * Lookup table for double-precision e^x vector function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+# define N (1 << V_EXP_TABLE_BITS)
+
+/* asuint64 (2^(j/N)) - (j << (52 - V_EXP_TABLE_BITS)), j=0..N-1. */
+const uint64_t __v_exp_data[] = {
+# if N == 128
+ 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061,
+ 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de,
+ 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f,
+ 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b,
+ 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0,
+ 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea,
+ 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa,
+ 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96,
+ 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd,
+ 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990,
+ 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715,
+ 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1,
+ 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7,
+ 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c,
+ 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d,
+ 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de,
+ 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7,
+ 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f,
+ 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429,
+ 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09,
+ 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225,
+ 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf,
+ 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74,
+ 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f,
+ 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62,
+ 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad,
+ 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db,
+ 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6,
+ 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50,
+ 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323,
+ 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d,
+ 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a,
+ 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb,
+ 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a,
+ 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c,
+ 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5,
+ 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c,
+ 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398,
+ 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f,
+ 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83,
+ 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27,
+ 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14,
+ 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1,
+# elif N == 256
+ 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
+ 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
+ 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
+ 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
+ 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
+ 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
+ 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
+ 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
+ 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
+ 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
+ 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
+ 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
+ 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
+ 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
+ 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
+ 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
+ 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
+ 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
+ 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
+ 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
+ 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
+ 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
+ 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
+ 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
+ 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
+ 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
+ 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
+ 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
+ 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
+ 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
+ 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
+ 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
+ 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
+ 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
+ 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
+ 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
+ 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
+ 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
+ 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
+ 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
+ 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
+ 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
+ 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
+ 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
+ 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
+ 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
+ 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
+ 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
+ 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
+ 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
+ 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
+ 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
+ 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
+ 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
+ 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
+ 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
+ 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
+ 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
+ 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
+ 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
+ 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
+ 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
+ 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
+ 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
+ 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
+ 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
+ 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
+ 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
+ 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
+ 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
+ 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
+ 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
+ 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
+ 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
+ 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
+ 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
+ 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
+ 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
+ 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
+ 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
+ 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
+ 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
+ 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
+ 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
+ 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
+ 0x3feff9d96b2a23d9,
+# endif
+};
diff --git a/math/aarch64/v_expf.c b/math/aarch64/v_expf.c
new file mode 100644
index 000000000000..34e8b6081bcd
--- /dev/null
+++ b/math/aarch64/v_expf.c
@@ -0,0 +1,122 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ float32x4_t poly[5]; /* Coefficients of poly(r), highest power of r first. */
+ float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
+ uint32x4_t exponent_bias;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* maxerr: 1.45358 +0.5 ulp. */
+ .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
+ V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
+ .shift = V4 (0x1.8p23f), /* 1.5 * 2^23, added then subtracted to obtain n. */
+ .inv_ln2 = V4 (0x1.715476p+0f),
+ .ln2_hi = V4 (0x1.62e4p-1f), /* ln2 is split in two parts for the reduction r = x - n*ln2. */
+ .ln2_lo = V4 (0x1.7f7d1cp-20f),
+ .exponent_bias = V4 (0x3f800000), /* asuint (1.0f). */
+#if !WANT_SIMD_EXCEPT
+ .special_bound = V4 (126.0f), /* |n| above this takes the special_case path. */
+ .scale_thresh = V4 (192.0f), /* |n| above this yields s1*s1 in special_case. */
+#endif
+};
+
+#define C(i) d->poly[i]
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
+# define BigBound v_u32 (0x42800000) /* asuint (0x1p6). */
+# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes: lanes flagged in cmp are recomputed as
+ expf (x), every other lane keeps the vector result y. */
+ return v_call_f32 (expf, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u32 (0x82000000) /* Added to the bias and removed from e where n <= 0. */
+# define SpecialBias v_u32 (0x7f000000) /* asuint (0x1p127). */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2.
+ Lanes with n > 0 use s1 = 2^127 and s2 = 2^(n-127); lanes with n <= 0
+ use s1 = 2^-125 (the 32-bit add wraps) and s2 = 2^(n+125). */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); /* |n| > scale_thresh. */
+ float32x4_t r2 = vmulq_f32 (s1, s1); /* Overflows for n > 0, underflows to zero otherwise. */
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); /* (s2 + poly*s2) * s1. */
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0); /* r1 in lanes flagged by cmp1, r0 elsewhere. */
+ return vbslq_f32 (cmp2, r2, r); /* r2 overrides where |n| > scale_thresh. */
+}
+
+#endif
+
+float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, scale, p, q, poly, z;
+ uint32x4_t cmp, e;
+
+#if WANT_SIMD_EXCEPT
+ /* asuint(|x|) - TinyBound >= BigBound - TinyBound.
+ The unsigned subtraction wraps for |x| below TinyBound, so a single
+ compare flags both |x| < 0x1p-63 and |x| >= 0x1p6 (inf and NaN
+ included). */
+ cmp = vcgeq_u32 (
+ vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
+ TinyBound),
+ SpecialBound);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ z = vfmaq_f32 (d->shift, x, d->inv_ln2); /* shift + x/ln2. */
+ n = vsubq_f32 (z, d->shift);
+ r = vfmsq_f32 (x, n, d->ln2_hi); /* x - n*ln2_hi ... */
+ r = vfmsq_f32 (r, n, d->ln2_lo); /* ... - n*ln2_lo. */
+ e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); /* Low bits of z moved to the exponent field. */
+ scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); /* asuint (1.0f) + e. */
+
+#if !WANT_SIMD_EXCEPT
+ cmp = vcagtq_f32 (n, d->special_bound); /* |n| > special_bound. */
+#endif
+
+ r2 = vmulq_f32 (r, r);
+ p = vfmaq_f32 (C (1), C (0), r);
+ q = vfmaq_f32 (C (3), C (2), r);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_f32 (C (4), r);
+ poly = vfmaq_f32 (p, q, r2); /* C4*r + C3*r^2 + C2*r^3 + C1*r^4 + C0*r^5. */
+
+ if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp); /* xm still holds the unmasked inputs. */
+#else
+ return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+ return vfmaq_f32 (scale, poly, scale); /* scale * (1 + poly). */
+}
diff --git a/math/aarch64/v_expf_1u.c b/math/aarch64/v_expf_1u.c
new file mode 100644
index 000000000000..43d03fa34efa
--- /dev/null
+++ b/math/aarch64/v_expf_1u.c
@@ -0,0 +1,77 @@
+/*
+ * Single-precision vector e^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const float Poly[] = {
+ /* maxerr: 0.36565 +0.5 ulp.
+ Stored highest power first; the two lowest terms (r and the
+ constant 1) are applied as literal 1.0f by the kernel below. */
+ 0x1.6a6000p-10f,
+ 0x1.12718ep-7f,
+ 0x1.555af0p-5f,
+ 0x1.555430p-3f,
+ 0x1.fffff4p-2f, /* Quadratic coefficient, close to 1/2. */
+};
+#define C0 v_f32 (Poly[0])
+#define C1 v_f32 (Poly[1])
+#define C2 v_f32 (Poly[2])
+#define C3 v_f32 (Poly[3])
+#define C4 v_f32 (Poly[4])
+
+#define Shift v_f32 (0x1.8p23f) /* 1.5 * 2^23, added then subtracted to obtain n. */
+#define InvLn2 v_f32 (0x1.715476p+0f) /* ~1/ln2. */
+#define Ln2hi v_f32 (0x1.62e4p-1f) /* High part of ln2. */
+#define Ln2lo v_f32 (0x1.7f7d1cp-20f) /* Low part of ln2. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+specialcase (float32x4_t poly, float32x4_t n, uint32x4_t e, float32x4_t absn)
+{
+ /* 2^n may overflow, break it up into s1*s2.
+ With e = n << 23 as passed by the caller: lanes with n > 0 use
+ s1 = 2^127 and s2 = 2^(n-127); lanes with n <= 0 use s1 = 2^-123
+ (the 32-bit add wraps) and s2 = 2^(n+123). */
+ uint32x4_t b = (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
+ float32x4_t s1 = vreinterpretq_f32_u32 (v_u32 (0x7f000000) + b);
+ float32x4_t s2 = vreinterpretq_f32_u32 (e - b);
+ uint32x4_t cmp = absn > v_f32 (192.0f);
+ float32x4_t r1 = s1 * s1; /* Overflows for n > 0, underflows to zero otherwise. */
+ float32x4_t r0 = poly * s1 * s2;
+ return vreinterpretq_f32_u32 ((cmp & vreinterpretq_u32_f32 (r1))
+ | (~cmp & vreinterpretq_u32_f32 (r0))); /* r1 where |n| > 192, r0 elsewhere. */
+}
+
+float32x4_t VPCS_ATTR
+_ZGVnN4v_expf_1u (float32x4_t x)
+{
+ float32x4_t n, r, scale, poly, absn, z;
+ uint32x4_t cmp, e;
+
+ /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2].
+ The #if 1 branch obtains n by adding and subtracting Shift; the #else
+ branch is a disabled variant based on rounding intrinsics. */
+#if 1
+ z = vfmaq_f32 (Shift, x, InvLn2); /* Shift + x/ln2. */
+ n = z - Shift;
+ r = vfmaq_f32 (x, n, -Ln2hi); /* x - n*Ln2hi ... */
+ r = vfmaq_f32 (r, n, -Ln2lo); /* ... - n*Ln2lo. */
+ e = vreinterpretq_u32_f32 (z) << 23; /* Low bits of z moved to the exponent field. */
+#else
+ z = x * InvLn2;
+ n = vrndaq_f32 (z);
+ r = vfmaq_f32 (x, n, -Ln2hi);
+ r = vfmaq_f32 (r, n, -Ln2lo);
+ e = vreinterpretq_u32_s32 (vcvtaq_s32_f32 (z)) << 23;
+#endif
+ scale = vreinterpretq_f32_u32 (e + v_u32 (0x3f800000)); /* asuint (1.0f) + e, i.e. 2^n when n is in range. */
+ absn = vabsq_f32 (n);
+ cmp = absn > v_f32 (126.0f); /* Lanes that need the split scaling of specialcase. */
+ poly = vfmaq_f32 (C1, C0, r); /* Horner evaluation, highest power first. */
+ poly = vfmaq_f32 (C2, poly, r);
+ poly = vfmaq_f32 (C3, poly, r);
+ poly = vfmaq_f32 (C4, poly, r);
+ poly = vfmaq_f32 (v_f32 (1.0f), poly, r); /* Linear coefficient 1. */
+ poly = vfmaq_f32 (v_f32 (1.0f), poly, r); /* Constant term 1. */
+ if (unlikely (v_any_u32 (cmp)))
+ return specialcase (poly, n, e, absn); /* One flagged lane sends the whole vector here. */
+ return scale * poly;
+}
diff --git a/math/aarch64/v_log.c b/math/aarch64/v_log.c
new file mode 100644
index 000000000000..1d1c1fa62c04
--- /dev/null
+++ b/math/aarch64/v_log.c
@@ -0,0 +1,100 @@
+/*
+ * Double-precision vector log(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ uint64x2_t min_norm;
+ uint32x4_t special_bound;
+ float64x2_t poly[5]; /* A(0)..A(4): coefficients of r^2..r^6 in the kernel below. */
+ float64x2_t ln2;
+ uint64x2_t sign_exp_mask;
+} data = {
+ /* Worst-case error: 1.17 + 0.5 ulp.
+ Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
+ .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2),
+ V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3),
+ V2 (-0x1.554e550bd501ep-3) },
+ .ln2 = V2 (0x1.62e42fefa39efp-1),
+ .min_norm = V2 (0x0010000000000000), /* asuint64 of the smallest normal double. */
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm; only the
+ high 32 bits are kept because the kernel compares the narrowed high
+ half of the difference. */
+ .sign_exp_mask = V2 (0xfff0000000000000) /* Sign and exponent bits of a double. */
+};
+
+#define A(i) d->poly[i]
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+#define Off v_u64 (0x3fe6900900000000) /* asuint64 (0x1.69009p-1), lower end of the range of z. */
+
+struct entry
+{
+ float64x2_t invc; /* Per-lane invc value gathered from __v_log_data.table. */
+ float64x2_t logc; /* Per-lane logc value gathered from __v_log_data.table. */
+};
+
+/* Gather the table rows selected by the two lanes of i and return them
+   transposed: all invc values in one vector, all logc values in the other.  */
+static inline struct entry
+lookup (uint64x2_t i)
+{
+  /* N is a power of 2, so masking with N - 1 reduces the index modulo N.  */
+  const int shift = 52 - V_LOG_TABLE_BITS;
+  uint64_t idx_lo = (i[0] >> shift) & IndexMask;
+  uint64_t idx_hi = (i[1] >> shift) & IndexMask;
+  /* Each load reads two doubles starting at invc, i.e. it relies on logc
+     directly following invc in a table row.  */
+  float64x2_t row_lo = vld1q_f64 (&__v_log_data.table[idx_lo].invc);
+  float64x2_t row_hi = vld1q_f64 (&__v_log_data.table[idx_hi].invc);
+  struct entry res;
+  res.invc = vuzp1q_f64 (row_lo, row_hi);
+  res.logc = vuzp2q_f64 (row_lo, row_hi);
+  return res;
+}
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
+ uint32x2_t cmp)
+{
+ return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp)); /* Finish hi + y*r2, then redo the lanes flagged in cmp with scalar log. */
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t z, r, r2, p, y, kd, hi;
+ uint64x2_t ix, iz, tmp;
+ uint32x2_t cmp;
+ int64x2_t k;
+ struct entry e;
+
+ ix = vreinterpretq_u64_f64 (x);
+ cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
+ vget_low_u32 (d->special_bound)); /* Flags every lane that is not a positive, finite, normal number. */
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ tmp = vsubq_u64 (ix, Off);
+ k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
+ iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); /* Removes k from the exponent of x. */
+ z = vreinterpretq_f64_u64 (iz);
+ e = lookup (tmp); /* invc and logc for the subinterval of each lane. */
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); /* r = z*invc - 1. */
+ kd = vcvtq_f64_s64 (k);
+
+ /* hi = r + log(c) + k*Ln2. */
+ hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ r2 = vmulq_f64 (r, r);
+ y = vfmaq_f64 (A (2), A (3), r);
+ p = vfmaq_f64 (A (0), A (1), r);
+ y = vfmaq_f64 (y, A (4), r2);
+ y = vfmaq_f64 (p, y, r2); /* y now holds the bracketed polynomial; r2 and hi are applied on return. */
+
+ if (unlikely (v_any_u32h (cmp)))
+ return special_case (x, y, hi, r2, cmp);
+ return vfmaq_f64 (hi, y, r2);
+}
diff --git a/math/aarch64/v_log_data.c b/math/aarch64/v_log_data.c
new file mode 100644
index 000000000000..82351bb14766
--- /dev/null
+++ b/math/aarch64/v_log_data.c
@@ -0,0 +1,156 @@
+/*
+ * Lookup table for double-precision log(x) vector function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+#define N (1 << V_LOG_TABLE_BITS)
+
+const struct v_log_data __v_log_data = {
+ /* Algorithm:
+
+ x = 2^k z
+ log(x) = k ln2 + log(c) + poly(z/c - 1)
+
+ where z is in [a;2a), which is split into N subintervals (a=0x1.69009p-1,
+ N=128), and log(c) and 1/c for the ith subinterval come from lookup tables:
+
+ table[i].invc = 1/c
+ table[i].logc = (double)log(c)
+
+ where c is near the center of the subinterval and is chosen by trying several
+ floating point invc candidates around 1/center and selecting one for which
+ the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval
+ that contains 1 and the previous one got tweaked to avoid cancellation. */
+ .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 },
+ { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 },
+ { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 },
+ { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 },
+ { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 },
+ { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 },
+ { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 },
+ { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 },
+ { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 },
+ { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 },
+ { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 },
+ { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 },
+ { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 },
+ { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 },
+ { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 },
+ { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 },
+ { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 },
+ { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 },
+ { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 },
+ { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 },
+ { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 },
+ { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 },
+ { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 },
+ { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 },
+ { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 },
+ { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 },
+ { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 },
+ { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 },
+ { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 },
+ { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 },
+ { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 },
+ { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 },
+ { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 },
+ { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 },
+ { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 },
+ { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 },
+ { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 },
+ { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 },
+ { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 },
+ { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 },
+ { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 },
+ { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 },
+ { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 },
+ { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 },
+ { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 },
+ { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 },
+ { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 },
+ { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 },
+ { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 },
+ { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 },
+ { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 },
+ { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 },
+ { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 },
+ { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 },
+ { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 },
+ { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 },
+ { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 },
+ { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 },
+ { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 },
+ { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 },
+ { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 },
+ { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 },
+ { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 },
+ { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 },
+ { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 },
+ { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 },
+ { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 },
+ { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 },
+ { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 },
+ { 0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 },
+ { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 },
+ { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 },
+ { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 },
+ { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 },
+ { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 },
+ { 1.0, 0.0 },
+ { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 },
+ { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 },
+ { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 },
+ { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 },
+ { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 },
+ { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 },
+ { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 },
+ { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 },
+ { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 },
+ { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 },
+ { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 },
+ { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 },
+ { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 },
+ { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 },
+ { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 },
+ { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 },
+ { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 },
+ { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 },
+ { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 },
+ { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 },
+ { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 },
+ { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 },
+ { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 },
+ { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 },
+ { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 },
+ { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 },
+ { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 },
+ { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 },
+ { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 },
+ { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 },
+ { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 },
+ { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 },
+ { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 },
+ { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 },
+ { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 },
+ { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 },
+ { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 },
+ { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 },
+ { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 },
+ { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 },
+ { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 },
+ { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 },
+ { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 },
+ { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 },
+ { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 },
+ { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 },
+ { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 },
+ { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 },
+ { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 },
+ { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 },
+ { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 },
+ { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } }
+};
diff --git a/math/aarch64/v_logf.c b/math/aarch64/v_logf.c
new file mode 100644
index 000000000000..66ebbbcd2b5a
--- /dev/null
+++ b/math/aarch64/v_logf.c
@@ -0,0 +1,74 @@
+/*
+ * Single-precision vector log function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+static const struct data
+{
+ uint32x4_t min_norm;
+ uint16x8_t special_bound;
+ float32x4_t poly[7]; /* Stored highest power first; accessed through P (i). */
+ float32x4_t ln2, tiny_bound;
+ uint32x4_t off, mantissa_mask;
+} data = {
+ /* 3.34 ulp error. */
+ .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
+ V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
+ V4 (-0x1.ffffc8p-2f) },
+ .ln2 = V4 (0x1.62e43p-1f),
+ .tiny_bound = V4 (0x1p-126), /* Not referenced by the kernel in this file. */
+ .min_norm = V4 (0x00800000), /* asuint32 of the smallest normal float. */
+ .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm; only the
+ high 16 bits are kept because the kernel compares the narrowed high
+ half of the difference. */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .mantissa_mask = V4 (0x007fffff)
+};
+
+/* P (k) is the coefficient of r^(k+1); poly[] is stored highest power first,
+   so P (1) is poly[6] and P (7) is poly[0].  The argument is parenthesized
+   so that an expression argument cannot change the subtraction.  */
+#define P(i) d->poly[7 - (i)]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
+ uint16x4_t cmp)
+{
+ /* Fall back to scalar code: finish the vector result p + y*r2, then
+ recompute the lanes flagged in cmp with logf. */
+ return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, p, q, r, r2, y;
+ uint32x4_t u;
+ uint16x4_t cmp;
+
+ u = vreinterpretq_u32_f32 (x);
+ cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
+ vget_low_u16 (d->special_bound)); /* Flags every lane that is not a positive, finite, normal number. */
+
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ u = vsubq_u32 (u, d->off);
+ n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
+ u = vandq_u32 (u, d->mantissa_mask);
+ u = vaddq_u32 (u, d->off); /* Bit pattern of 1+r. */
+ r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+
+ /* y = log(1+r) + n*ln2. */
+ r2 = vmulq_f32 (r, r);
+ /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
+ p = vfmaq_f32 (P (5), P (6), r);
+ q = vfmaq_f32 (P (3), P (4), r);
+ y = vfmaq_f32 (P (1), P (2), r);
+ p = vfmaq_f32 (p, P (7), r2);
+ q = vfmaq_f32 (q, p, r2);
+ y = vfmaq_f32 (y, q, r2); /* y is the bracketed polynomial P1 + r*P2 + ... */
+ p = vfmaq_f32 (r, d->ln2, n); /* p is reused for r + n*ln2. */
+
+ if (unlikely (v_any_u16h (cmp)))
+ return special_case (x, y, r2, p, cmp);
+ return vfmaq_f32 (p, y, r2);
+}
diff --git a/math/aarch64/v_math.h b/math/aarch64/v_math.h
new file mode 100644
index 000000000000..1dc9916c6fb0
--- /dev/null
+++ b/math/aarch64/v_math.h
@@ -0,0 +1,135 @@
+/*
+ * Vector math abstractions.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef _V_MATH_H
+#define _V_MATH_H
+
+#if !__aarch64__
+# error "Cannot build without AArch64"
+#endif
+
+#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) /* Applied to the vector entry points and their special-case helpers. */
+
+#define V_NAME_F1(fun) _ZGVnN4v_##fun##f /* e.g. V_NAME_F1 (exp) is _ZGVnN4v_expf. */
+#define V_NAME_D1(fun) _ZGVnN2v_##fun /* e.g. V_NAME_D1 (log) is _ZGVnN2v_log. */
+#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f /* Two-argument single-precision variant. */
+#define V_NAME_D2(fun) _ZGVnN2vv_##fun /* e.g. V_NAME_D2 (pow) is _ZGVnN2vv_pow. */
+
+#include <stdint.h>
+#include "../math_config.h"
+#include <arm_neon.h>
+
+/* Shorthand helpers for declaring constants. */
+# define V2(X) { X, X }
+# define V4(X) { X, X, X, X }
+# define V8(X) { X, X, X, X, X, X, X, X }
+
+/* Nonzero when any of the four 16-bit lanes of x has a bit set.  */
+static inline int
+v_any_u16h (uint16x4_t x)
+{
+  /* The whole 64-bit vector is inspected as a single word.  */
+  uint64_t packed = vget_lane_u64 (vreinterpret_u64_u16 (x), 0);
+  return packed != 0;
+}
+
+static inline int
+v_lanes32 (void)
+{
+ return 4; /* Lane count of the 4 x 32-bit vector types used in this header. */
+}
+
+/* Broadcast the scalar x to all four lanes.  */
+static inline float32x4_t
+v_f32 (float x)
+{
+  return vdupq_n_f32 (x);
+}
+/* Broadcast the scalar x to all four lanes.  */
+static inline uint32x4_t
+v_u32 (uint32_t x)
+{
+  return vdupq_n_u32 (x);
+}
+/* Nonzero when at least one lane of a comparison mask is set.  */
+static inline int
+v_any_u32 (uint32x4_t x)
+{
+  /* Lanes are assumed to be 0 or -1u, so the sum of the two 64-bit halves
+     is zero only when every lane is clear.  */
+  uint64x2_t halves = vreinterpretq_u64_u32 (x);
+  uint64_t sum = vpaddd_u64 (halves);
+  return sum != 0;
+}
+/* Nonzero when either of the two 32-bit lanes of x has a bit set.  */
+static inline int
+v_any_u32h (uint32x2_t x)
+{
+  /* The whole 64-bit vector is inspected as a single word.  */
+  uint64_t packed = vget_lane_u64 (vreinterpret_u64_u32 (x), 0);
+  return packed != 0;
+}
+/* Gather tab[idx[k]] into lane k, for each of the four lanes.  */
+static inline float32x4_t
+v_lookup_f32 (const float *tab, uint32x4_t idx)
+{
+  float32x4_t gathered
+    = { tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
+  return gathered;
+}
+/* Gather tab[idx[k]] into lane k, for each of the four lanes.  */
+static inline uint32x4_t
+v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
+{
+  uint32x4_t gathered
+    = { tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
+  return gathered;
+}
+/* Lanes with a nonzero entry in p are replaced by f applied to the matching
+   lane of x; every other lane keeps the value already in y.  */
+static inline float32x4_t
+v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
+{
+  for (int lane = 0; lane < 4; lane++)
+    if (p[lane])
+      y[lane] = f (x[lane]);
+  return y;
+}
+/* Two-argument counterpart of the scalar fallback: lanes with a nonzero
+   entry in p are replaced by f (x1, x2) of the matching lanes; every other
+   lane keeps the value already in y.  */
+static inline float32x4_t
+v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
+             float32x4_t y, uint32x4_t p)
+{
+  for (int lane = 0; lane < 4; lane++)
+    if (p[lane])
+      y[lane] = f (x1[lane], x2[lane]);
+  return y;
+}
+
+static inline int
+v_lanes64 (void)
+{
+ return 2; /* Lane count of the 2 x 64-bit vector types used in this header. */
+}
+/* Broadcast the scalar x to both lanes.  */
+static inline float64x2_t
+v_f64 (double x)
+{
+  return vdupq_n_f64 (x);
+}
+/* Broadcast the scalar x to both lanes.  */
+static inline uint64x2_t
+v_u64 (uint64_t x)
+{
+  return vdupq_n_u64 (x);
+}
+/* Nonzero when at least one lane of a comparison mask is set.  */
+static inline int
+v_any_u64 (uint64x2_t x)
+{
+  /* Lanes are assumed to be 0 or -1u, so their sum is zero only when both
+     lanes are clear.  */
+  uint64_t sum = vpaddd_u64 (x);
+  return sum != 0;
+}
+/* Gather tab[idx[k]] into lane k, for both lanes.  */
+static inline float64x2_t
+v_lookup_f64 (const double *tab, uint64x2_t idx)
+{
+  float64x2_t gathered = { tab[idx[0]], tab[idx[1]] };
+  return gathered;
+}
+/* Gather tab[idx[k]] into lane k, for both lanes.  */
+static inline uint64x2_t
+v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
+{
+  uint64x2_t gathered = { tab[idx[0]], tab[idx[1]] };
+  return gathered;
+}
+/* Lanes with a nonzero entry in p are replaced by f applied to the matching
+   lane of x; every other lane keeps the value already in y.  */
+static inline float64x2_t
+v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
+{
+  /* Lane 1 of the mask and of the input are copied out before f is called.
+     The mask stays an integer: an all-ones 64-bit mask is not exactly
+     representable as a double, so converting it would be inexact.  */
+  uint64_t p1 = p[1];
+  double x1 = x[1];
+  if (likely (p[0]))
+    y[0] = f (x[0]);
+  if (likely (p1))
+    y[1] = f (x1);
+  return y;
+}
+
+#endif
diff --git a/math/aarch64/v_pow.c b/math/aarch64/v_pow.c
new file mode 100644
index 000000000000..734f1663a283
--- /dev/null
+++ b/math/aarch64/v_pow.c
@@ -0,0 +1,22 @@
+/*
+ * Double-precision vector pow function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+/* No vector algorithm is used here: each lane is computed by the scalar
+   pow and the results are packed back into a vector.  */
+float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
+{
+  float64x2_t res;
+  int i = 0;
+  while (i < v_lanes64 ())
+    {
+      res[i] = pow (x[i], y[i]);
+      i++;
+    }
+  return res;
+}
diff --git a/math/aarch64/v_powf.c b/math/aarch64/v_powf.c
new file mode 100644
index 000000000000..3a4163ab0558
--- /dev/null
+++ b/math/aarch64/v_powf.c
@@ -0,0 +1,148 @@
+/*
+ * Single-precision vector powf function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+/* Bit-pattern bounds for the input range check: the vector routine flags a
+   lane as special when asuint (x) - Min >= Thresh (unsigned compare).  */
+#define Min v_u32 (0x00800000)
+#define Max v_u32 (0x7f800000)
+#define Thresh v_u32 (0x7f000000) /* Max - Min. */
+/* Low 23 bits of a single-precision bit pattern.  */
+#define MantissaMask v_u32 (0x007fffff)
+
+/* Shorthands for the polynomial coefficient arrays in the data table.  */
+#define A data.log2_poly
+#define C data.exp2f_poly
+
+/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
+/* Off is subtracted from the bit pattern of x before the table index and
+   exponent are extracted.  */
+#define Off v_u32 (0x3f35d000)
+
+#define V_POWF_LOG2_TABLE_BITS 5
+#define V_EXP2F_TABLE_BITS 5
+#define Log2IdxMask v_u32 ((1 << V_POWF_LOG2_TABLE_BITS) - 1)
+/* 2^V_EXP2F_TABLE_BITS as a double; folded into the table constants.  */
+#define Scale ((double) (1 << V_EXP2F_TABLE_BITS))
+
+/* Lookup tables and polynomial coefficients used by the vector powf routine.
+   The log2 constants are multiplied by Scale and the exp2 polynomial
+   coefficients divided by powers of Scale, so the log2 result feeds the exp2
+   step without a separate scaling multiply.  */
+static const struct
+{
+ /* Per-interval constants of the log2 step: invc multiplies the reduced
+    input (r = z * invc - 1) and logc is added to the exponent term.  */
+ struct
+ {
+ double invc, logc;
+ } log2_tab[1 << V_POWF_LOG2_TABLE_BITS];
+ double log2_poly[4];
+ /* Bit patterns that, after ki << (52 - V_EXP2F_TABLE_BITS) is added, are
+    reinterpreted as the double scale factor of the exp2 step.  */
+ uint64_t exp2f_tab[1 << V_EXP2F_TABLE_BITS];
+ double exp2f_poly[3];
+} data = {
+ .log2_tab = {{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * Scale},
+ {0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * Scale},
+ {0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * Scale},
+ {0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * Scale},
+ {0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * Scale},
+ {0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * Scale},
+ {0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * Scale},
+ {0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * Scale},
+ {0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * Scale},
+ {0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * Scale},
+ {0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * Scale},
+ {0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * Scale},
+ {0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * Scale},
+ {0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * Scale},
+ {0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * Scale},
+ {0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * Scale},
+ {0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * Scale},
+ {0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * Scale},
+ {0x1p+0, 0x0p+0 * Scale},
+ {0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * Scale},
+ {0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * Scale},
+ {0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * Scale},
+ {0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * Scale},
+ {0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * Scale},
+ {0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * Scale},
+ {0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * Scale},
+ {0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * Scale},
+ {0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * Scale},
+ {0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * Scale},
+ {0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * Scale},
+ {0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * Scale},
+ {0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * Scale},},
+ .log2_poly = { /* rel err: 1.5 * 2^-30. */
+ -0x1.6ff5daa3b3d7cp-2 * Scale, 0x1.ec81d03c01aebp-2 * Scale,
+ -0x1.71547bb43f101p-1 * Scale, 0x1.7154764a815cbp0 * Scale,},
+ .exp2f_tab = {0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
+ 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
+ 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
+ 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+ 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
+ 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
+ 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
+ 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+ 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
+ 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
+ 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,},
+ .exp2f_poly = { /* rel err: 1.69 * 2^-34. */
+ 0x1.c6af84b912394p-5 / Scale / Scale / Scale,
+ 0x1.ebfce50fac4f3p-3 / Scale / Scale,
+ 0x1.62e42ff0c52d6p-1 / Scale}};
+
+/* Slow path: every lane flagged in CMP is recomputed with scalar powf; the
+   other lanes keep the vector result RET.  */
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t ret, uint32x4_t cmp)
+{
+  float32x4_t fixed = v_call2_f32 (powf, x, y, ret, cmp);
+  return fixed;
+}
+
+/* Single-precision vector pow.  The exponent/mantissa split of x is done
+   with vector integer operations; log2 (x), y * log2 (x) and exp2 are then
+   evaluated per lane in double precision.  Lanes whose input is outside
+   [Min, Max) as a bit pattern, or whose y * log2 (x) is too large in
+   magnitude, are recomputed with scalar powf in special_case.  */
+float32x4_t VPCS_ATTR V_NAME_F2 (pow) (float32x4_t x, float32x4_t y)
+{
+  uint32x4_t u = vreinterpretq_u32_f32 (x);
+  /* Unsigned range check: true (all-ones) when u - Min >= Thresh.  */
+  uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (u, Min), Thresh);
+  uint32x4_t tmp = vsubq_u32 (u, Off);
+  uint32x4_t i = vandq_u32 (vshrq_n_u32 (tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
+                            Log2IdxMask);
+  uint32x4_t top = vbicq_u32 (tmp, MantissaMask);
+  uint32x4_t iz = vsubq_u32 (u, top);
+  int32x4_t k = vshrq_n_s32 (vreinterpretq_s32_u32 (top),
+                             23 - V_EXP2F_TABLE_BITS); /* arithmetic shift. */
+
+  float32x4_t ret;
+  for (int lane = 0; lane < 4; lane++)
+    {
+      /* Use double precision for each lane. */
+      double invc = data.log2_tab[i[lane]].invc;
+      double logc = data.log2_tab[i[lane]].logc;
+      double z = (double) asfloat (iz[lane]);
+
+      /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
+      double r = __builtin_fma (z, invc, -1.0);
+      double y0 = logc + (double) k[lane];
+
+      /* Polynomial to approximate log1p(r)/ln2. */
+      double logx = A[0];
+      logx = r * logx + A[1];
+      logx = r * logx + A[2];
+      logx = r * logx + A[3];
+      logx = r * logx + y0;
+      double ylogx = y[lane] * logx;
+      /* Flag the lane when |ylogx| is too large (or NaN) for the exp2 step.
+         The lane is set to all-ones rather than 1: the any-lane test sums
+         lanes and relies on each lane being 0 or -1u, so a mixture of 1 and
+         -1u lanes could cancel to zero and skip the scalar fallback.  */
+      cmp[lane] = (asuint64 (ylogx) >> 47 & 0xffff)
+                      >= asuint64 (126.0 * (1 << V_EXP2F_TABLE_BITS)) >> 47
+                    ? 0xffffffff
+                    : cmp[lane];
+
+      /* N*x = k + r with r in [-1/2, 1/2]. */
+      double kd = round (ylogx);
+      uint64_t ki = lround (ylogx);
+      r = ylogx - kd;
+
+      /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
+      uint64_t t = data.exp2f_tab[ki % (1 << V_EXP2F_TABLE_BITS)];
+      t += ki << (52 - V_EXP2F_TABLE_BITS);
+      double s = asdouble (t);
+      double p = C[0];
+      p = __builtin_fma (p, r, C[1]);
+      p = __builtin_fma (p, r, C[2]);
+      p = __builtin_fma (p, s * r, s);
+
+      ret[lane] = p;
+    }
+  if (unlikely (v_any_u32 (cmp)))
+    return special_case (x, y, ret, cmp);
+  return ret;
+}
diff --git a/math/aarch64/v_sin.c b/math/aarch64/v_sin.c
new file mode 100644
index 000000000000..04129c31133d
--- /dev/null
+++ b/math/aarch64/v_sin.c
@@ -0,0 +1,97 @@
+/*
+ * Double-precision vector sin function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+/* Constants for the double-precision vector sin, each replicated across
+   both lanes.  Initialisers are listed in declaration order.  */
+static const struct data
+{
+  float64x2_t poly[7];
+  float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+} data = {
+  /* Coefficients of the sin(r) polynomial.  */
+  .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+            V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+            V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+            V2 (-0x1.9e9540300a1p-41) },
+
+  /* Inputs with |x| >= range_val take the scalar fallback.  */
+  .range_val = V2 (0x1p23),
+  .inv_pi = V2 (0x1.45f306dc9c883p-2),
+  /* Added and then subtracted to round x/pi to an integer.  */
+  .shift = V2 (0x1.8p52),
+  /* pi split into three parts for the range reduction.  */
+  .pi_1 = V2 (0x1.921fb54442d18p+1),
+  .pi_2 = V2 (0x1.1a62633145c06p-53),
+  .pi_3 = V2 (0x1.c1cd129024e09p-106),
+};
+
+/* Bit-pattern bounds used only when fenv exceptions must be exact: a lane is
+   special when asuint64 (|x|) - TinyBound >= Thresh (unsigned compare).  */
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */
+# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */
+#endif
+
+/* i-th polynomial coefficient; expects a local pointer d to the data
+   table.  */
+#define C(i) d->poly[i]
+
+/* Slow path: apply the pending sign flip (ODD) to the vector result, then
+   let v_call_f64 substitute scalar sin for every lane flagged in CMP.  */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+  uint64x2_t y_bits = vreinterpretq_u64_f64 (y);
+  float64x2_t signed_y = vreinterpretq_f64_u64 (veorq_u64 (y_bits, odd));
+  return v_call_f64 (sin, x, signed_y, cmp);
+}
+
+/* Vector (AdvSIMD) sin approximation.
+ Maximum observed error in [-pi/2, pi/2], where argument is not reduced,
+ is 2.87 ULP:
+ _ZGVnN2v_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1
+ want 0x1.fffffffa7dc05p-1
+ Maximum observed error in the entire non-special domain ([-2^23, 2^23])
+ is 3.22 ULP:
+ _ZGVnN2v_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3
+ want 0x1.ffdcd125c84f8p-3. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float64x2_t n, r, r2, r3, r4, y, t1, t2, t3;
+ uint64x2_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ /* Detect |x| < TinyBound or |x| >= RangeVal. If fenv exceptions are to be
+ triggered correctly, overwrite any special lanes with the all-ones compare
+ mask (a quiet-NaN bit pattern, which is neutral w.r.t. fenv). These lanes
+ will be fixed by special-case handler later. */
+ uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
+ r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
+#else
+ r = x;
+ /* Only large |x| is special here; compares absolute values. */
+ cmp = vcageq_f64 (x, d->range_val);
+#endif
+
+ /* n = rint(|x|/pi). */
+ n = vfmaq_f64 (d->shift, d->inv_pi, r);
+ /* Move the lowest bit of n (its parity) into the sign-bit position. */
+ odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
+ n = vsubq_f64 (n, d->shift);
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
+ r = vfmsq_f64 (r, d->pi_1, n);
+ r = vfmsq_f64 (r, d->pi_2, n);
+ r = vfmsq_f64 (r, d->pi_3, n);
+
+ /* sin(r) poly approx. */
+ r2 = vmulq_f64 (r, r);
+ r3 = vmulq_f64 (r2, r);
+ r4 = vmulq_f64 (r2, r2);
+
+ t1 = vfmaq_f64 (C (4), C (5), r2);
+ t2 = vfmaq_f64 (C (2), C (3), r2);
+ t3 = vfmaq_f64 (C (0), C (1), r2);
+
+ y = vfmaq_f64 (t1, C (6), r4);
+ y = vfmaq_f64 (t2, y, r4);
+ y = vfmaq_f64 (t3, y, r4);
+ y = vfmaq_f64 (r, y, r3);
+
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+ /* Flip the sign of y in lanes where n is odd. */
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
diff --git a/math/aarch64/v_sinf.c b/math/aarch64/v_sinf.c
new file mode 100644
index 000000000000..336879844459
--- /dev/null
+++ b/math/aarch64/v_sinf.c
@@ -0,0 +1,82 @@
+/*
+ * Single-precision vector sin function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+
+/* Constants for the single-precision vector sin, each replicated across all
+   four lanes.  Initialisers are listed in declaration order.  */
+static const struct data
+{
+  float32x4_t poly[4];
+  float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+} data = {
+  /* 1.886 ulp error. */
+  .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
+            V4 (0x1.5b2e76p-19f) },
+
+  /* Inputs with |x| >= range_val take the scalar fallback.  */
+  .range_val = V4 (0x1p20f),
+  .inv_pi = V4 (0x1.45f306p-2f),
+  /* Added and then subtracted to round x/pi to an integer.  */
+  .shift = V4 (0x1.8p+23f),
+
+  /* pi split into three parts for the range reduction.  */
+  .pi_1 = V4 (0x1.921fb6p+1f),
+  .pi_2 = V4 (-0x1.777a5cp-24f),
+  .pi_3 = V4 (-0x1.ee59dap-49f)
+};
+
+/* Bit-pattern bounds used only when fenv exceptions must be exact: a lane is
+   special when asuint32 (|x|) - TinyBound >= Thresh (unsigned compare).  */
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */
+# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */
+#endif
+
+/* i-th polynomial coefficient; expects a local pointer d to the data
+   table.  */
+#define C(i) d->poly[i]
+
+/* Slow path: apply the pending sign flip (ODD) to the vector result, then
+   fall back to scalar sinf for every lane flagged in CMP.  */
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+  uint32x4_t y_bits = vreinterpretq_u32_f32 (y);
+  float32x4_t signed_y = vreinterpretq_f32_u32 (veorq_u32 (y_bits, odd));
+  return v_call_f32 (sinf, x, signed_y, cmp);
+}
+
+/* Single-precision vector (AdvSIMD) sin: reduce x by the nearest multiple of
+   pi, evaluate an odd polynomial on the remainder and flip the sign when the
+   multiple is odd.  Lanes flagged in cmp are recomputed with scalar sinf. */
+float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t n, r, r2, y;
+ uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+ /* Special lanes: |x| < TinyBound or |x| >= RangeVal (unsigned compare on
+ the bit pattern of |x|). */
+ uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
+ /* If fenv exceptions are to be triggered correctly, overwrite any special
+ lanes with the all-ones compare mask (a quiet-NaN bit pattern, which is
+ neutral w.r.t. fenv). These lanes will be fixed by special-case handler
+ later. */
+ r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
+#else
+ r = x;
+ /* Only large |x| is special here; compares absolute values. */
+ cmp = vcageq_f32 (x, d->range_val);
+#endif
+
+ /* n = rint(|x|/pi) */
+ n = vfmaq_f32 (d->shift, d->inv_pi, r);
+ /* Move the lowest bit of n (its parity) into the sign-bit position. */
+ odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
+ n = vsubq_f32 (n, d->shift);
+
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
+ r = vfmsq_f32 (r, d->pi_1, n);
+ r = vfmsq_f32 (r, d->pi_2, n);
+ r = vfmsq_f32 (r, d->pi_3, n);
+
+ /* y = sin(r) */
+ r2 = vmulq_f32 (r, r);
+ y = vfmaq_f32 (C (2), C (3), r2);
+ y = vfmaq_f32 (C (1), y, r2);
+ y = vfmaq_f32 (C (0), y, r2);
+ y = vfmaq_f32 (r, vmulq_f32 (y, r2), r);
+
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+ /* Flip the sign of y in lanes where n is odd. */
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
diff --git a/math/exp10.c b/math/exp10.c
new file mode 100644
index 000000000000..0fbec4c694ca
--- /dev/null
+++ b/math/exp10.c
@@ -0,0 +1,129 @@
+/*
+ * Double-precision 10^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Number of entries in the 2^(k/N) lookup table and the mask selecting a
+   table index from k.  */
+#define N (1 << EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */
+#define UFlowBound -0x1.5ep+8 /* -350. */
+/* Bounds on the 11-bit biased exponent of x: exp10 takes its slow path when
+   abstop - SmallTop >= Thresh (unsigned compare).  */
+#define SmallTop 0x3c6 /* top12(0x1p-57). */
+#define BigTop 0x407 /* top12(0x1p8). */
+#define Thresh 0x41 /* BigTop - SmallTop. */
+/* Shorthands for fields of the shared exp data table.  */
+#define Shift __exp_data.shift
+#define C(i) __exp_data.exp10_poly[i]
+
+/* Final assembly for large |x|, where the scale factor may overflow or the
+   result may land in the subnormal range.  sbits is the bit pattern of the
+   scale, tmp the polynomial value computed by exp10 and ki the rounded
+   integer k.  Returns scale + scale * tmp after rescaling, passed through
+   check_oflow or check_uflow. */
+static double
+special_case (uint64_t sbits, double_t tmp, uint64_t ki)
+{
+ double_t scale, y;
+
+ /* Unsigned range test on k: true for 2^16 <= ki < 2^16 + 2^31, false for
+ negative k, which arrives here as a huge unsigned value. */
+ if (ki - (1ull << 16) < 0x80000000)
+ {
+ /* The exponent of scale might have overflowed by 1. */
+ sbits -= 1ull << 52;
+ scale = asdouble (sbits);
+ y = 2 * (scale + scale * tmp);
+ return check_oflow (eval_as_double (y));
+ }
+
+ /* n < 0, need special care in the subnormal range. */
+ /* Raise the exponent field by 1022; the matching 0x1p-1022 factor is
+ applied at the end. */
+ sbits += 1022ull << 52;
+ scale = asdouble (sbits);
+ y = scale + scale * tmp;
+
+ if (y < 1.0)
+ {
+ /* Round y to the right precision before scaling it into the subnormal
+ range to avoid double rounding that can cause 0.5+E/2 ulp error where
+ E is the worst-case ulp error outside the subnormal range. So this
+ is only useful if the goal is better than 1 ulp worst-case error. */
+ double_t lo = scale - y + scale * tmp;
+ double_t hi = 1.0 + y;
+ lo = 1.0 - hi + y + lo;
+ y = eval_as_double (hi + lo) - 1.0;
+ /* Avoid -0.0 with downward rounding. */
+ if (WANT_ROUNDING && y == 0.0)
+ y = 0.0;
+ /* The underflow exception needs to be signaled explicitly. */
+ force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
+ }
+ y = 0x1p-1022 * y;
+
+ return check_uflow (y);
+}
+
+/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP.
+   x is reduced to x = k * log10(2)/N + r; 2^(k/N) comes from the shared exp
+   lookup table and the remaining factor from a polynomial in r.  Tiny,
+   non-finite and out-of-range inputs return early; large finite inputs are
+   assembled by special_case.  */
+double
+exp10 (double x)
+{
+  uint64_t ix = asuint64 (x);
+  uint32_t abstop = (ix >> 52) & 0x7ff;
+
+  if (unlikely (abstop - SmallTop >= Thresh))
+    {
+      /* Unsigned wrap-around: true exactly when abstop < SmallTop.  */
+      if (abstop - SmallTop >= 0x80000000)
+        /* Avoid spurious underflow for tiny x.
+           Note: 0 is common input.  */
+        return x + 1;
+      if (abstop == 0x7ff)
+        return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0;
+      if (x >= OFlowBound)
+        return __math_oflow (0);
+      if (x < UFlowBound)
+        return __math_uflow (0);
+
+      /* Large x is special-cased below.  */
+      abstop = 0;
+    }
+
+  /* Reduce x: z = x * N / log10(2), k = round(z).  */
+  double_t z = __exp_data.invlog10_2N * x;
+  double_t kd;
+  int64_t ki;
+#if TOINT_INTRINSICS
+  kd = roundtoint (z);
+  ki = converttoint (z);
+#else
+  kd = eval_as_double (z + Shift);
+  kd -= Shift;
+  ki = kd;
+#endif
+
+  /* r = x - k * log10(2), r in [-0.5, 0.5].  */
+  double_t r = x;
+  r = __exp_data.neglog10_2hiN * kd + r;
+  r = __exp_data.neglog10_2loN * kd + r;
+
+  /* exp10(x) = 2^(k/N) * 2^(r/N).
+     Approximate the two components separately.  */
+
+  /* s = 2^(k/N), using lookup table.  */
+  /* ki is negative for negative x, and left-shifting a negative signed value
+     is undefined behaviour, so do the shift on the unsigned bit pattern.  */
+  uint64_t e = (uint64_t) ki << (52 - EXP_TABLE_BITS);
+  uint64_t i = (ki & IndexMask) * 2;
+  uint64_t u = __exp_data.tab[i + 1];
+  uint64_t sbits = u + e;
+
+  double_t tail = asdouble (__exp_data.tab[i]);
+
+  /* 2^(r/N) ~= 1 + r * Poly(r).  */
+  double_t r2 = r * r;
+  double_t p = C (0) + r * C (1);
+  double_t y = C (2) + r * C (3);
+  y = y + r2 * C (4);
+  y = p + r2 * y;
+  y = tail + y * r;
+
+  if (unlikely (abstop == 0))
+    return special_case (sbits, y, ki);
+
+  /* Assemble components:
+     y = 2^(r/N) * 2^(k/N)
+     ~= (y + 1) * s.  */
+  double_t s = asdouble (sbits);
+  return eval_as_double (s * y + s);
+}
diff --git a/math/exp_data.c b/math/exp_data.c
index 714c845709aa..c20b1b2d3e06 100644
--- a/math/exp_data.c
+++ b/math/exp_data.c
@@ -1,7 +1,7 @@
/*
* Shared data between exp, exp2 and pow.
*
- * Copyright (c) 2018, Arm Limited.
+ * Copyright (c) 2018-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -12,6 +12,7 @@
const struct exp_data __exp_data = {
// N/ln2
.invln2N = 0x1.71547652b82fep0 * N,
+.invlog10_2N = 0x1.a934f0979a371p1 * N,
// -ln2/N
#if N == 64
.negln2hiN = -0x1.62e42fefa0000p-7,
@@ -26,6 +27,8 @@ const struct exp_data __exp_data = {
.negln2hiN = -0x1.62e42fef80000p-10,
.negln2loN = -0x1.1cf79abc9e3b4p-45,
#endif
+.neglog10_2hiN = -0x1.3441350ap-2 / N,
+.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N,
// Used for rounding when !TOINT_INTRINSICS
#if EXP_USE_TOINT_NARROW
.shift = 0x1800000000.8p0,
@@ -147,6 +150,24 @@ const struct exp_data __exp_data = {
0x1.3b2ab786ee1dap-7,
#endif
},
+.exp10_poly = {
+#if EXP10_POLY_WIDE
+/* Range is wider if using shift-based reduction: coeffs generated
+ using Remez in [-log10(2)/128, log10(2)/128 ]. */
+0x1.26bb1bbb55515p1,
+0x1.53524c73cd32bp1,
+0x1.0470591e1a108p1,
+0x1.2bd77b12fe9a8p0,
+0x1.14289fef24b78p-1
+#else
+/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ]. */
+0x1.26bb1bbb55516p1,
+0x1.53524c73ce9fep1,
+0x1.0470591ce4b26p1,
+0x1.2bd76577fe684p0,
+0x1.1446eeccd0efbp-1
+#endif
+},
// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
// tab[2*k] = asuint64(T[k])
// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
diff --git a/math/include/mathlib.h b/math/include/mathlib.h
index c520c3772f7f..64cbb9c1f850 100644
--- a/math/include/mathlib.h
+++ b/math/include/mathlib.h
@@ -1,7 +1,7 @@
/*
* Public API.
*
- * Copyright (c) 2015-2020, Arm Limited.
+ * Copyright (c) 2015-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -18,74 +18,33 @@ float cosf (float);
void sincosf (float, float*, float*);
double exp (double);
+double exp10 (double);
double exp2 (double);
double log (double);
double log2 (double);
double pow (double, double);
-/* Scalar functions using the vector algorithm with identical result. */
-float __s_sinf (float);
-float __s_cosf (float);
-float __s_expf (float);
-float __s_expf_1u (float);
-float __s_exp2f (float);
-float __s_exp2f_1u (float);
-float __s_logf (float);
-float __s_powf (float, float);
-double __s_sin (double);
-double __s_cos (double);
-double __s_exp (double);
-double __s_log (double);
-double __s_pow (double, double);
-
#if __aarch64__
-#if __GNUC__ >= 5
+# if __GNUC__ >= 5
typedef __Float32x4_t __f32x4_t;
typedef __Float64x2_t __f64x2_t;
-#elif __clang_major__*100+__clang_minor__ >= 305
+# elif __clang_major__*100+__clang_minor__ >= 305
typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t;
typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
-#else
-#error Unsupported compiler
-#endif
-
-/* Vector functions following the base PCS. */
-__f32x4_t __v_sinf (__f32x4_t);
-__f32x4_t __v_cosf (__f32x4_t);
-__f32x4_t __v_expf (__f32x4_t);
-__f32x4_t __v_expf_1u (__f32x4_t);
-__f32x4_t __v_exp2f (__f32x4_t);
-__f32x4_t __v_exp2f_1u (__f32x4_t);
-__f32x4_t __v_logf (__f32x4_t);
-__f32x4_t __v_powf (__f32x4_t, __f32x4_t);
-__f64x2_t __v_sin (__f64x2_t);
-__f64x2_t __v_cos (__f64x2_t);
-__f64x2_t __v_exp (__f64x2_t);
-__f64x2_t __v_log (__f64x2_t);
-__f64x2_t __v_pow (__f64x2_t, __f64x2_t);
+# else
+# error Unsupported compiler
+# endif
-#if __GNUC__ >= 9 || __clang_major__ >= 8
-#define __vpcs __attribute__((__aarch64_vector_pcs__))
-
-/* Vector functions following the vector PCS. */
-__vpcs __f32x4_t __vn_sinf (__f32x4_t);
-__vpcs __f32x4_t __vn_cosf (__f32x4_t);
-__vpcs __f32x4_t __vn_expf (__f32x4_t);
-__vpcs __f32x4_t __vn_expf_1u (__f32x4_t);
-__vpcs __f32x4_t __vn_exp2f (__f32x4_t);
-__vpcs __f32x4_t __vn_exp2f_1u (__f32x4_t);
-__vpcs __f32x4_t __vn_logf (__f32x4_t);
-__vpcs __f32x4_t __vn_powf (__f32x4_t, __f32x4_t);
-__vpcs __f64x2_t __vn_sin (__f64x2_t);
-__vpcs __f64x2_t __vn_cos (__f64x2_t);
-__vpcs __f64x2_t __vn_exp (__f64x2_t);
-__vpcs __f64x2_t __vn_log (__f64x2_t);
-__vpcs __f64x2_t __vn_pow (__f64x2_t, __f64x2_t);
+# if __GNUC__ >= 9 || __clang_major__ >= 8
+# undef __vpcs
+# define __vpcs __attribute__((__aarch64_vector_pcs__))
/* Vector functions following the vector PCS using ABI names. */
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_expf_1u (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_exp2f_1u (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t);
@@ -94,7 +53,7 @@ __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
-#endif
+# endif
#endif
#endif
diff --git a/math/math_config.h b/math/math_config.h
index 7ffc0cd2796a..faf77b31fc99 100644
--- a/math/math_config.h
+++ b/math/math_config.h
@@ -1,7 +1,7 @@
/*
* Configuration for math routines.
*
- * Copyright (c) 2017-2020, Arm Limited.
+ * Copyright (c) 2017-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -92,6 +92,46 @@
# define unlikely(x) (x)
#endif
+/* Return ptr but hide its value from the compiler so accesses through it
+ cannot be optimized based on the contents. */
+#define ptr_barrier(ptr) \
+ ({ \
+ __typeof (ptr) __ptr = (ptr); \
+ __asm("" : "+r"(__ptr)); \
+ __ptr; \
+ })
+
+/* Symbol renames to avoid libc conflicts. */
+#define __math_oflowf arm_math_oflowf
+#define __math_uflowf arm_math_uflowf
+#define __math_may_uflowf arm_math_may_uflowf
+#define __math_divzerof arm_math_divzerof
+#define __math_oflow arm_math_oflow
+#define __math_uflow arm_math_uflow
+#define __math_may_uflow arm_math_may_uflow
+#define __math_divzero arm_math_divzero
+#define __math_invalidf arm_math_invalidf
+#define __math_invalid arm_math_invalid
+#define __math_check_oflow arm_math_check_oflow
+#define __math_check_uflow arm_math_check_uflow
+#define __math_check_oflowf arm_math_check_oflowf
+#define __math_check_uflowf arm_math_check_uflowf
+
+#define __sincosf_table arm_math_sincosf_table
+#define __inv_pio4 arm_math_inv_pio4
+#define __exp2f_data arm_math_exp2f_data
+#define __logf_data arm_math_logf_data
+#define __log2f_data arm_math_log2f_data
+#define __powf_log2_data arm_math_powf_log2_data
+#define __exp_data arm_math_exp_data
+#define __log_data arm_math_log_data
+#define __log2_data arm_math_log2_data
+#define __pow_log_data arm_math_pow_log_data
+#define __erff_data arm_math_erff_data
+#define __erf_data arm_math_erf_data
+#define __v_exp_data arm_math_v_exp_data
+#define __v_log_data arm_math_v_log_data
+
#if HAVE_FAST_ROUND
/* When set, the roundtoint and converttoint functions are provided with
the semantics documented below. */
@@ -381,15 +421,22 @@ extern const struct powf_log2_data
#define EXP_USE_TOINT_NARROW 0
#define EXP2_POLY_ORDER 5
#define EXP2_POLY_WIDE 0
+/* Wider exp10 polynomial necessary for good precision in non-nearest rounding
+ and !TOINT_INTRINSICS. */
+#define EXP10_POLY_WIDE 0
extern const struct exp_data
{
double invln2N;
+ double invlog10_2N;
double shift;
double negln2hiN;
double negln2loN;
+ double neglog10_2hiN;
+ double neglog10_2loN;
double poly[4]; /* Last four coefficients. */
double exp2_shift;
double exp2_poly[EXP2_POLY_ORDER];
+ double exp10_poly[5];
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
} __exp_data HIDDEN;
@@ -459,4 +506,16 @@ extern const struct erf_data
double erfc_poly_F[ERFC_POLY_F_NCOEFFS];
} __erf_data HIDDEN;
+#define V_EXP_TABLE_BITS 7
+extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
+
+#define V_LOG_TABLE_BITS 7
+extern const struct v_log_data
+{
+ struct
+ {
+ double invc, logc;
+ } table[1 << V_LOG_TABLE_BITS];
+} __v_log_data HIDDEN;
+
#endif
diff --git a/math/s_cos.c b/math/s_cos.c
deleted file mode 100644
index e66d563d15b5..000000000000
--- a/math/s_cos.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_cos.c"
diff --git a/math/s_cosf.c b/math/s_cosf.c
deleted file mode 100644
index f615d260b39b..000000000000
--- a/math/s_cosf.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_cosf.c"
diff --git a/math/s_exp.c b/math/s_exp.c
deleted file mode 100644
index 5da0099e3c65..000000000000
--- a/math/s_exp.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_exp.c"
diff --git a/math/s_exp2f.c b/math/s_exp2f.c
deleted file mode 100644
index dcbfea9e1e79..000000000000
--- a/math/s_exp2f.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_exp2f.c"
diff --git a/math/s_exp2f_1u.c b/math/s_exp2f_1u.c
deleted file mode 100644
index bf387e44cfb2..000000000000
--- a/math/s_exp2f_1u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_exp2f_1u.c"
diff --git a/math/s_expf.c b/math/s_expf.c
deleted file mode 100644
index dacda7fb4fd5..000000000000
--- a/math/s_expf.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_expf.c"
diff --git a/math/s_expf_1u.c b/math/s_expf_1u.c
deleted file mode 100644
index 00096449f7a5..000000000000
--- a/math/s_expf_1u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_expf_1u.c"
diff --git a/math/s_log.c b/math/s_log.c
deleted file mode 100644
index 27d2eb290f56..000000000000
--- a/math/s_log.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_log.c"
diff --git a/math/s_logf.c b/math/s_logf.c
deleted file mode 100644
index 7d98b2ba15c4..000000000000
--- a/math/s_logf.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_logf.c"
diff --git a/math/s_pow.c b/math/s_pow.c
deleted file mode 100644
index 6eca2b2b17f1..000000000000
--- a/math/s_pow.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_pow.c"
diff --git a/math/s_powf.c b/math/s_powf.c
deleted file mode 100644
index 1d55d90df7b2..000000000000
--- a/math/s_powf.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_powf.c"
diff --git a/math/s_sin.c b/math/s_sin.c
deleted file mode 100644
index 0c6171259c0c..000000000000
--- a/math/s_sin.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_sin.c"
diff --git a/math/s_sinf.c b/math/s_sinf.c
deleted file mode 100644
index 3aae61149618..000000000000
--- a/math/s_sinf.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_sinf.c"
diff --git a/math/test/mathbench.c b/math/test/mathbench.c
index 6e18e36fbcb2..ed7e89bb7710 100644
--- a/math/test/mathbench.c
+++ b/math/test/mathbench.c
@@ -1,7 +1,7 @@
/*
* Microbenchmark for math functions.
*
- * Copyright (c) 2018-2022, Arm Limited.
+ * Copyright (c) 2018-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -15,11 +15,6 @@
#include <math.h>
#include "mathlib.h"
-#ifndef WANT_VMATH
-/* Enable the build of vector math code. */
-# define WANT_VMATH 1
-#endif
-
/* Number of measurements, best result is reported. */
#define MEASURE 60
/* Array size. */
@@ -34,8 +29,9 @@ static float Af[N];
static long measurecount = MEASURE;
static long itercount = ITER;
-#if __aarch64__ && WANT_VMATH
-typedef __f64x2_t v_double;
+#ifdef __vpcs
+#include <arm_neon.h>
+typedef float64x2_t v_double;
#define v_double_len() 2
@@ -51,7 +47,7 @@ v_double_dup (double x)
return (v_double){x, x};
}
-typedef __f32x4_t v_float;
+typedef float32x4_t v_float;
#define v_float_len() 4
@@ -66,6 +62,19 @@ v_float_dup (float x)
{
return (v_float){x, x, x, x};
}
+#else
+/* dummy definitions to make things compile. */
+typedef double v_double;
+typedef float v_float;
+#define v_double_len(x) 1
+#define v_double_load(x) (x)[0]
+#define v_double_dup(x) (x)
+#define v_float_len(x) 1
+#define v_float_load(x) (x)[0]
+#define v_float_dup(x) (x)
+
+#endif
+
#if WANT_SVE_MATH
#include <arm_sve.h>
typedef svbool_t sv_bool;
@@ -102,17 +111,10 @@ sv_float_dup (float x)
{
return svdup_n_f32(x);
}
-#endif
#else
/* dummy definitions to make things compile. */
-typedef double v_double;
-typedef float v_float;
-#define v_double_len(x) 1
-#define v_double_load(x) (x)[0]
-#define v_double_dup(x) (x)
-#define v_float_len(x) 1
-#define v_float_load(x) (x)[0]
-#define v_float_dup(x) (x)
+#define sv_double_len(x) 1
+#define sv_float_len(x) 1
#endif
static double
@@ -126,20 +128,6 @@ dummyf (float x)
{
return x;
}
-#if WANT_VMATH
-#if __aarch64__
-static v_double
-__v_dummy (v_double x)
-{
- return x;
-}
-
-static v_float
-__v_dummyf (v_float x)
-{
- return x;
-}
-
#ifdef __vpcs
__vpcs static v_double
__vn_dummy (v_double x)
@@ -167,8 +155,6 @@ __sv_dummyf (sv_float x, sv_bool pg)
}
#endif
-#endif
-#endif
#include "test/mathbench_wrappers.h"
@@ -183,8 +169,6 @@ static const struct fun
{
double (*d) (double);
float (*f) (float);
- v_double (*vd) (v_double);
- v_float (*vf) (v_float);
#ifdef __vpcs
__vpcs v_double (*vnd) (v_double);
__vpcs v_float (*vnf) (v_float);
@@ -197,18 +181,12 @@ static const struct fun
} funtab[] = {
#define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
#define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
-#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}},
-#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}},
#define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
#define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}},
#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}},
D (dummy, 1.0, 2.0)
F (dummyf, 1.0, 2.0)
-#if WANT_VMATH
-#if __aarch64__
-VD (__v_dummy, 1.0, 2.0)
-VF (__v_dummyf, 1.0, 2.0)
#ifdef __vpcs
VND (__vn_dummy, 1.0, 2.0)
VNF (__vn_dummyf, 1.0, 2.0)
@@ -217,14 +195,10 @@ VNF (__vn_dummyf, 1.0, 2.0)
SVD (__sv_dummy, 1.0, 2.0)
SVF (__sv_dummyf, 1.0, 2.0)
#endif
-#endif
-#endif
#include "test/mathbench_funcs.h"
{0},
#undef F
#undef D
-#undef VF
-#undef VD
#undef VNF
#undef VND
#undef SVF
@@ -327,38 +301,6 @@ runf_latency (float f (float))
prev = f (Af[i] + prev * z);
}
-static void
-run_v_thruput (v_double f (v_double))
-{
- for (int i = 0; i < N; i += v_double_len ())
- f (v_double_load (A+i));
-}
-
-static void
-runf_v_thruput (v_float f (v_float))
-{
- for (int i = 0; i < N; i += v_float_len ())
- f (v_float_load (Af+i));
-}
-
-static void
-run_v_latency (v_double f (v_double))
-{
- v_double z = v_double_dup (zero);
- v_double prev = z;
- for (int i = 0; i < N; i += v_double_len ())
- prev = f (v_double_load (A+i) + prev * z);
-}
-
-static void
-runf_v_latency (v_float f (v_float))
-{
- v_float z = v_float_dup (zero);
- v_float prev = z;
- for (int i = 0; i < N; i += v_float_len ())
- prev = f (v_float_load (Af+i) + prev * z);
-}
-
#ifdef __vpcs
static void
run_vn_thruput (__vpcs v_double f (v_double))
@@ -377,19 +319,21 @@ runf_vn_thruput (__vpcs v_float f (v_float))
static void
run_vn_latency (__vpcs v_double f (v_double))
{
- v_double z = v_double_dup (zero);
- v_double prev = z;
+ volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 };
+ uint64x2_t sel = vsel;
+ v_double prev = v_double_dup (0);
for (int i = 0; i < N; i += v_double_len ())
- prev = f (v_double_load (A+i) + prev * z);
+ prev = f (vbslq_f64 (sel, prev, v_double_load (A+i)));
}
static void
runf_vn_latency (__vpcs v_float f (v_float))
{
- v_float z = v_float_dup (zero);
- v_float prev = z;
+ volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 };
+ uint32x4_t sel = vsel;
+ v_float prev = v_float_dup (0);
for (int i = 0; i < N; i += v_float_len ())
- prev = f (v_float_load (Af+i) + prev * z);
+ prev = f (vbslq_f32 (sel, prev, v_float_load (Af+i)));
}
#endif
@@ -411,19 +355,21 @@ runf_sv_thruput (sv_float f (sv_float, sv_bool))
static void
run_sv_latency (sv_double f (sv_double, sv_bool))
{
- sv_double z = sv_double_dup (zero);
- sv_double prev = z;
+ volatile sv_bool vsel = svptrue_b64 ();
+ sv_bool sel = vsel;
+ sv_double prev = sv_double_dup (0);
for (int i = 0; i < N; i += sv_double_len ())
- prev = f (svmad_f64_x (svptrue_b64 (), prev, z, sv_double_load (A+i)), svptrue_b64 ());
+ prev = f (svsel_f64 (sel, sv_double_load (A+i), prev), svptrue_b64 ());
}
static void
runf_sv_latency (sv_float f (sv_float, sv_bool))
{
- sv_float z = sv_float_dup (zero);
- sv_float prev = z;
+ volatile sv_bool vsel = svptrue_b32 ();
+ sv_bool sel = vsel;
+ sv_float prev = sv_float_dup (0);
for (int i = 0; i < N; i += sv_float_len ())
- prev = f (svmad_f32_x (svptrue_b32 (), prev, z, sv_float_load (Af+i)), svptrue_b32 ());
+ prev = f (svsel_f32 (sel, sv_float_load (Af+i), prev), svptrue_b32 ());
}
#endif
@@ -458,10 +404,10 @@ bench1 (const struct fun *f, int type, double lo, double hi)
const char *s = type == 't' ? "rthruput" : "latency";
int vlen = 1;
- if (f->vec && f->prec == 'd')
- vlen = v_double_len();
- else if (f->vec && f->prec == 'f')
- vlen = v_float_len();
+ if (f->vec == 'n')
+ vlen = f->prec == 'd' ? v_double_len() : v_float_len();
+ else if (f->vec == 's')
+ vlen = f->prec == 'd' ? sv_double_len() : sv_float_len();
if (f->prec == 'd' && type == 't' && f->vec == 0)
TIMEIT (run_thruput, f->fun.d);
@@ -471,14 +417,6 @@ bench1 (const struct fun *f, int type, double lo, double hi)
TIMEIT (runf_thruput, f->fun.f);
else if (f->prec == 'f' && type == 'l' && f->vec == 0)
TIMEIT (runf_latency, f->fun.f);
- else if (f->prec == 'd' && type == 't' && f->vec == 'v')
- TIMEIT (run_v_thruput, f->fun.vd);
- else if (f->prec == 'd' && type == 'l' && f->vec == 'v')
- TIMEIT (run_v_latency, f->fun.vd);
- else if (f->prec == 'f' && type == 't' && f->vec == 'v')
- TIMEIT (runf_v_thruput, f->fun.vf);
- else if (f->prec == 'f' && type == 'l' && f->vec == 'v')
- TIMEIT (runf_v_latency, f->fun.vf);
#ifdef __vpcs
else if (f->prec == 'd' && type == 't' && f->vec == 'n')
TIMEIT (run_vn_thruput, f->fun.vnd);
@@ -503,16 +441,18 @@ bench1 (const struct fun *f, int type, double lo, double hi)
if (type == 't')
{
ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
- printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
+ printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n",
+ f->name, s,
(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
- (unsigned long long) dt, lo, hi);
+ (unsigned long long) dt, lo, hi, vlen);
}
else if (type == 'l')
{
ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
- printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s,
+ printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n",
+ f->name, s,
(unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
- (unsigned long long) dt, lo, hi);
+ (unsigned long long) dt, lo, hi, vlen);
}
fflush (stdout);
}
diff --git a/math/test/mathbench_funcs.h b/math/test/mathbench_funcs.h
index ad6dd2a2313d..84c4e68650ac 100644
--- a/math/test/mathbench_funcs.h
+++ b/math/test/mathbench_funcs.h
@@ -1,11 +1,13 @@
/*
* Function entries for mathbench.
*
- * Copyright (c) 2022, Arm Limited.
+ * Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+/* clang-format off */
D (exp, -9.9, 9.9)
D (exp, 0.5, 1.0)
+D (exp10, -9.9, 9.9)
D (exp2, -9.9, 9.9)
D (log, 0.01, 11.1)
D (log, 0.999, 1.001)
@@ -42,59 +44,19 @@ F (cosf, 3.3, 33.3)
F (cosf, 100, 1000)
F (cosf, 1e6, 1e32)
F (erff, -4.0, 4.0)
-#if WANT_VMATH
-D (__s_sin, -3.1, 3.1)
-D (__s_cos, -3.1, 3.1)
-D (__s_exp, -9.9, 9.9)
-D (__s_log, 0.01, 11.1)
-{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}},
-F (__s_expf, -9.9, 9.9)
-F (__s_expf_1u, -9.9, 9.9)
-F (__s_exp2f, -9.9, 9.9)
-F (__s_exp2f_1u, -9.9, 9.9)
-F (__s_logf, 0.01, 11.1)
-{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}},
-F (__s_sinf, -3.1, 3.1)
-F (__s_cosf, -3.1, 3.1)
-#if __aarch64__
-VD (__v_sin, -3.1, 3.1)
-VD (__v_cos, -3.1, 3.1)
-VD (__v_exp, -9.9, 9.9)
-VD (__v_log, 0.01, 11.1)
-{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}},
-VF (__v_expf, -9.9, 9.9)
-VF (__v_expf_1u, -9.9, 9.9)
-VF (__v_exp2f, -9.9, 9.9)
-VF (__v_exp2f_1u, -9.9, 9.9)
-VF (__v_logf, 0.01, 11.1)
-{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}},
-VF (__v_sinf, -3.1, 3.1)
-VF (__v_cosf, -3.1, 3.1)
#ifdef __vpcs
-VND (__vn_exp, -9.9, 9.9)
VND (_ZGVnN2v_exp, -9.9, 9.9)
-VND (__vn_log, 0.01, 11.1)
VND (_ZGVnN2v_log, 0.01, 11.1)
-{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}},
{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}},
-VND (__vn_sin, -3.1, 3.1)
VND (_ZGVnN2v_sin, -3.1, 3.1)
-VND (__vn_cos, -3.1, 3.1)
VND (_ZGVnN2v_cos, -3.1, 3.1)
-VNF (__vn_expf, -9.9, 9.9)
VNF (_ZGVnN4v_expf, -9.9, 9.9)
-VNF (__vn_expf_1u, -9.9, 9.9)
-VNF (__vn_exp2f, -9.9, 9.9)
+VNF (_ZGVnN4v_expf_1u, -9.9, 9.9)
VNF (_ZGVnN4v_exp2f, -9.9, 9.9)
-VNF (__vn_exp2f_1u, -9.9, 9.9)
-VNF (__vn_logf, 0.01, 11.1)
+VNF (_ZGVnN4v_exp2f_1u, -9.9, 9.9)
VNF (_ZGVnN4v_logf, 0.01, 11.1)
-{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}},
{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}},
-VNF (__vn_sinf, -3.1, 3.1)
VNF (_ZGVnN4v_sinf, -3.1, 3.1)
-VNF (__vn_cosf, -3.1, 3.1)
VNF (_ZGVnN4v_cosf, -3.1, 3.1)
#endif
-#endif
-#endif
+ /* clang-format on */
diff --git a/math/test/mathbench_wrappers.h b/math/test/mathbench_wrappers.h
index 8311f0f4e173..062b9db56de5 100644
--- a/math/test/mathbench_wrappers.h
+++ b/math/test/mathbench_wrappers.h
@@ -1,18 +1,11 @@
/*
* Function wrappers for mathbench.
*
- * Copyright (c) 2022, Arm Limited.
+ * Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#if WANT_VMATH
-#if __aarch64__
#ifdef __vpcs
-__vpcs static v_float
-xy__vn_powf (v_float x)
-{
- return __vn_powf (x, x);
-}
__vpcs static v_float
xy_Z_powf (v_float x)
@@ -21,43 +14,12 @@ xy_Z_powf (v_float x)
}
__vpcs static v_double
-xy__vn_pow (v_double x)
-{
- return __vn_pow (x, x);
-}
-
-__vpcs static v_double
xy_Z_pow (v_double x)
{
return _ZGVnN2vv_pow (x, x);
}
-#endif // __vpcs
-
-static v_float
-xy__v_powf (v_float x)
-{
- return __v_powf (x, x);
-}
-static v_double
-xy__v_pow (v_double x)
-{
- return __v_pow (x, x);
-}
-#endif // __aarch64__
-
-static float
-xy__s_powf (float x)
-{
- return __s_powf (x, x);
-}
-
-static double
-xy__s_pow (double x)
-{
- return __s_pow (x, x);
-}
-#endif // WANT_VMATH
+#endif
static double
xypow (double x)
diff --git a/math/test/mathtest.c b/math/test/mathtest.c
index 3168da43b01d..834233fdde9d 100644
--- a/math/test/mathtest.c
+++ b/math/test/mathtest.c
@@ -1,7 +1,7 @@
/*
* mathtest.c - test rig for mathlib
*
- * Copyright (c) 1998-2022, Arm Limited.
+ * Copyright (c) 1998-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -254,6 +254,7 @@ test_func tfuncs[] = {
TFUNCARM(at_s,rt_s, expf, 3*ULPUNIT/4),
TFUNCARM(at_s,rt_s, exp2f, 3*ULPUNIT/4),
TFUNC(at_s,rt_s, expm1f, ULPUNIT),
+ TFUNC(at_d,rt_d, exp10, ULPUNIT),
/* power */
TFUNC(at_d2,rt_d, pow, 3*ULPUNIT/4),
@@ -1021,6 +1022,7 @@ int runtest(testdetail t) {
DO_DOP(d_arg1,op1r);
DO_DOP(d_arg2,op2r);
s_arg1.i = t.op1r[0]; s_arg2.i = t.op2r[0];
+ s_res.i = 0;
/*
* Detect NaNs, infinities and denormals on input, and set a
@@ -1155,22 +1157,25 @@ int runtest(testdetail t) {
tresultr[0] = t.resultr[0];
tresultr[1] = t.resultr[1];
resultr[0] = d_res.i[dmsd]; resultr[1] = d_res.i[dlsd];
+ resulti[0] = resulti[1] = 0;
wres = 2;
break;
case rt_i:
tresultr[0] = t.resultr[0];
resultr[0] = intres;
+ resulti[0] = 0;
wres = 1;
break;
case rt_s:
case rt_s2:
tresultr[0] = t.resultr[0];
resultr[0] = s_res.i;
+ resulti[0] = 0;
wres = 1;
break;
default:
puts("unhandled rettype in runtest");
- wres = 0;
+ abort ();
}
if(t.resultc != rc_none) {
int err = 0;
diff --git a/math/test/runulp.sh b/math/test/runulp.sh
index b4000f6ea01b..e2e03e3ae761 100755
--- a/math/test/runulp.sh
+++ b/math/test/runulp.sh
@@ -2,7 +2,7 @@
# ULP error check script.
#
-# Copyright (c) 2019-2022, Arm Limited.
+# Copyright (c) 2019-2023, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
#set -x
@@ -72,6 +72,16 @@ t pow 0x1.ffffffffffff0p-1 0x1.0000000000008p0 x 0x1p60 0x1p68 50000
t pow 0x1.ffffffffff000p-1 0x1p0 x 0x1p50 0x1p52 50000
t pow -0x1.ffffffffff000p-1 -0x1p0 x 0x1p50 0x1p52 50000
+L=0.02
+t exp10 0 0x1p-47 5000
+t exp10 -0 -0x1p-47 5000
+t exp10 0x1p-47 1 50000
+t exp10 -0x1p-47 -1 50000
+t exp10 1 0x1.34413509f79ffp8 50000
+t exp10 -1 -0x1.434e6420f4374p8 50000
+t exp10 0x1.34413509f79ffp8 inf 5000
+t exp10 -0x1.434e6420f4374p8 -inf 5000
+
L=1.0
Ldir=0.9
t erf 0 0xffff000000000000 10000
@@ -143,15 +153,10 @@ Ldir=0.5
done
# vector functions
+
Ldir=0.5
r='n'
flags="${ULPFLAGS:--q}"
-runs=
-check __s_exp 1 && runs=1
-runv=
-check __v_exp 1 && runv=1
-runvn=
-check __vn_exp 1 && runvn=1
range_exp='
0 0xffff000000000000 10000
@@ -177,9 +182,10 @@ range_pow='
'
range_sin='
- 0 0xffff000000000000 10000
- 0x1p-4 0x1p4 400000
- -0x1p-23 0x1p23 400000
+ 0 0x1p23 500000
+ -0 -0x1p23 500000
+ 0x1p23 inf 10000
+ -0x1p23 -inf 10000
'
range_cos="$range_sin"
@@ -199,9 +205,10 @@ range_logf='
'
range_sinf='
- 0 0xffff0000 10000
- 0x1p-4 0x1p4 300000
--0x1p-9 -0x1p9 300000
+ 0 0x1p20 500000
+ -0 -0x1p20 500000
+ 0x1p20 inf 10000
+ -0x1p20 -inf 10000
'
range_cosf="$range_sinf"
@@ -229,9 +236,8 @@ L_sinf=1.4
L_cosf=1.4
L_powf=2.1
-while read G F R D
+while read G F D
do
- [ "$R" = 1 ] || continue
case "$G" in \#*) continue ;; esac
eval range="\${range_$G}"
eval L="\${L_$G}"
@@ -251,71 +257,23 @@ do
t $D $disable_fenv $F $X
done << EOF
$range
+
EOF
done << EOF
# group symbol run
-exp __s_exp $runs
-exp __v_exp $runv
-exp __vn_exp $runvn
-exp _ZGVnN2v_exp $runvn
-
-log __s_log $runs
-log __v_log $runv
-log __vn_log $runvn
-log _ZGVnN2v_log $runvn
-
-pow __s_pow $runs -f
-pow __v_pow $runv -f
-pow __vn_pow $runvn -f
-pow _ZGVnN2vv_pow $runvn -f
-
-sin __s_sin $runs
-sin __v_sin $runv
-sin __vn_sin $runvn
-sin _ZGVnN2v_sin $runvn
-
-cos __s_cos $runs
-cos __v_cos $runv
-cos __vn_cos $runvn
-cos _ZGVnN2v_cos $runvn
-
-expf __s_expf $runs
-expf __v_expf $runv
-expf __vn_expf $runvn
-expf _ZGVnN4v_expf $runvn
-
-expf_1u __s_expf_1u $runs -f
-expf_1u __v_expf_1u $runv -f
-expf_1u __vn_expf_1u $runvn -f
-
-exp2f __s_exp2f $runs
-exp2f __v_exp2f $runv
-exp2f __vn_exp2f $runvn
-exp2f _ZGVnN4v_exp2f $runvn
-
-exp2f_1u __s_exp2f_1u $runs -f
-exp2f_1u __v_exp2f_1u $runv -f
-exp2f_1u __vn_exp2f_1u $runvn -f
-
-logf __s_logf $runs
-logf __v_logf $runv
-logf __vn_logf $runvn
-logf _ZGVnN4v_logf $runvn
-
-sinf __s_sinf $runs
-sinf __v_sinf $runv
-sinf __vn_sinf $runvn
-sinf _ZGVnN4v_sinf $runvn
-
-cosf __s_cosf $runs
-cosf __v_cosf $runv
-cosf __vn_cosf $runvn
-cosf _ZGVnN4v_cosf $runvn
-
-powf __s_powf $runs -f
-powf __v_powf $runv -f
-powf __vn_powf $runvn -f
-powf _ZGVnN4vv_powf $runvn -f
+exp _ZGVnN2v_exp
+log _ZGVnN2v_log
+pow _ZGVnN2vv_pow -f
+sin _ZGVnN2v_sin -z
+cos _ZGVnN2v_cos
+expf _ZGVnN4v_expf
+expf_1u _ZGVnN4v_expf_1u -f
+exp2f _ZGVnN4v_exp2f
+exp2f_1u _ZGVnN4v_exp2f_1u -f
+logf _ZGVnN4v_logf
+sinf _ZGVnN4v_sinf -z
+cosf _ZGVnN4v_cosf
+powf _ZGVnN4vv_powf -f
EOF
[ 0 -eq $FAIL ] || {
diff --git a/math/test/testcases/directed/exp10.tst b/math/test/testcases/directed/exp10.tst
new file mode 100644
index 000000000000..2cf4273bd1d7
--- /dev/null
+++ b/math/test/testcases/directed/exp10.tst
@@ -0,0 +1,15 @@
+; Directed test cases for exp10
+;
+; Copyright (c) 2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=exp10 op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=exp10 op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=exp10 op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=exp10 op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=exp10 op1=7ff00000.00000000 result=7ff00000.00000000 errno=0
+func=exp10 op1=7fefffff.ffffffff result=7ff00000.00000000 errno=ERANGE status=ox
+func=exp10 op1=fff00000.00000000 result=00000000.00000000 errno=0
+func=exp10 op1=ffefffff.ffffffff result=00000000.00000000 errno=ERANGE status=ux
+func=exp10 op1=00000000.00000000 result=3ff00000.00000000 errno=0
+func=exp10 op1=80000000.00000000 result=3ff00000.00000000 errno=0
diff --git a/math/test/ulp.c b/math/test/ulp.c
index bb8c3ad69900..5ff29972e50e 100644
--- a/math/test/ulp.c
+++ b/math/test/ulp.c
@@ -1,10 +1,11 @@
/*
* ULP error checking tool for math functions.
*
- * Copyright (c) 2019-2022, Arm Limited.
+ * Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#define _GNU_SOURCE
#include <ctype.h>
#include <fenv.h>
#include <float.h>
@@ -23,11 +24,6 @@
# include <mpfr.h>
#endif
-#ifndef WANT_VMATH
-/* Enable the build of vector math code. */
-# define WANT_VMATH 1
-#endif
-
static inline uint64_t
asuint64 (double f)
{
@@ -212,6 +208,7 @@ struct conf
unsigned long long n;
double softlim;
double errlim;
+ int ignore_zero_sign;
};
/* A bit of a hack: call vector functions twice with the same
@@ -220,7 +217,7 @@ struct conf
static int secondcall;
/* Wrappers for vector functions. */
-#if __aarch64__ && WANT_VMATH
+#ifdef __vpcs
typedef __f32x4_t v_float;
typedef __f64x2_t v_double;
/* First element of fv and dv may be changed by -c argument. */
@@ -264,40 +261,8 @@ static inline double svretd(sv_double vec) {
#endif
#endif
-#if WANT_SVE_MATH
-long double
-dummyl (long double x)
-{
- return x;
-}
-
-double
-dummy (double x)
-{
- return x;
-}
-
-static sv_double
-__sv_dummy (sv_double x)
-{
- return x;
-}
-
-static sv_float
-__sv_dummyf (sv_float x)
-{
- return x;
-}
-#endif
-
#include "test/ulp_wrappers.h"
-/* Wrappers for SVE functions. */
-#if WANT_SVE_MATH
-static double sv_dummy (double x) { return svretd (__sv_dummy (svargd (x))); }
-static float sv_dummyf (float x) { return svretf (__sv_dummyf (svargf (x))); }
-#endif
-
struct fun
{
const char *name;
@@ -358,10 +323,6 @@ static const struct fun fun[] = {
#define ZVNF2(x) VNF2 (x) ZVF2 (x)
#define ZVND1(x) VND1 (x) ZVD1 (x)
#define ZVND2(x) VND2 (x) ZVD2 (x)
-#define SF1(x) F (__s_##x##f, __s_##x##f, x, mpfr_##x, 1, 1, f1, 0)
-#define SF2(x) F (__s_##x##f, __s_##x##f, x, mpfr_##x, 2, 1, f2, 0)
-#define SD1(x) F (__s_##x, __s_##x, x##l, mpfr_##x, 1, 0, d1, 0)
-#define SD2(x) F (__s_##x, __s_##x, x##l, mpfr_##x, 2, 0, d2, 0)
/* SVE routines. */
#define SVF1(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 1, 1, f1, 0)
#define SVF2(x) F (__sv_##x##f, sv_##x##f, x, mpfr_##x, 2, 1, f2, 0)
@@ -374,11 +335,6 @@ static const struct fun fun[] = {
#include "test/ulp_funcs.h"
-#if WANT_SVE_MATH
- SVD1 (dummy)
- SVF1 (dummy)
-#endif
-
#undef F
#undef F1
#undef F2
@@ -628,17 +584,18 @@ call_mpfr_d2 (mpfr_t y, const struct fun *f, struct args_d2 a, mpfr_rnd_t r)
static void
usage (void)
{
- puts ("./ulp [-q] [-m] [-f] [-r nudz] [-l soft-ulplimit] [-e ulplimit] func "
+ puts ("./ulp [-q] [-m] [-f] [-r {n|u|d|z}] [-l soft-ulplimit] [-e ulplimit] func "
"lo [hi [x lo2 hi2] [count]]");
puts ("Compares func against a higher precision implementation in [lo; hi].");
puts ("-q: quiet.");
puts ("-m: use mpfr even if faster method is available.");
- puts ("-f: disable fenv testing (rounding modes and exceptions).");
-#if __aarch64__ && WANT_VMATH
+ puts ("-f: disable fenv exceptions testing.");
+/* -c is parsed under #ifdef __vpcs in main; document it under the same guard. */
+#ifdef __vpcs
puts ("-c: neutral 'control value' to test behaviour when one lane can affect another. \n"
" This should be different from tested input in other lanes, and non-special \n"
" (i.e. should not trigger fenv exceptions). Default is 1.");
#endif
+ puts ("-z: ignore sign of 0.");
puts ("Supported func:");
for (const struct fun *f = fun; f->name; f++)
printf ("\t%s\n", f->name);
@@ -762,6 +719,7 @@ main (int argc, char *argv[])
conf.fenv = 1;
conf.softlim = 0;
conf.errlim = INFINITY;
+ conf.ignore_zero_sign = 0;
for (;;)
{
argc--;
@@ -801,12 +759,15 @@ main (int argc, char *argv[])
{
argc--;
argv++;
- if (argc < 1)
+ if (argc < 1 || argv[0][1] != '\0')
usage ();
conf.rc = argv[0][0];
}
break;
-#if __aarch64__ && WANT_VMATH
+ case 'z':
+ conf.ignore_zero_sign = 1;
+ break;
+#ifdef __vpcs
case 'c':
argc--;
argv++;
@@ -839,7 +800,19 @@ main (int argc, char *argv[])
if (strcmp (argv[0], f->name) == 0)
break;
if (!f->name)
- usage ();
+ {
+#ifndef __vpcs
+ /* Ignore vector math functions if vector math is not supported. */
+ if (strncmp (argv[0], "_ZGVnN", 6) == 0)
+ exit (0);
+#endif
+#if !WANT_SVE_MATH
+ if (strncmp (argv[0], "_ZGVsMxv", 8) == 0)
+ exit (0);
+#endif
+ printf ("math function %s not supported\n", argv[0]);
+ exit (1);
+ }
if (!f->singleprec && LDBL_MANT_DIG == DBL_MANT_DIG)
conf.mpfr = 1; /* Use mpfr if long double has no extra precision. */
if (!USE_MPFR && conf.mpfr)
diff --git a/math/test/ulp.h b/math/test/ulp.h
index 327b4bd0fd06..b0bc59aeef8d 100644
--- a/math/test/ulp.h
+++ b/math/test/ulp.h
@@ -1,7 +1,7 @@
/*
* Generic functions for ULP error estimation.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -37,7 +37,8 @@ static int RT(ulpscale_mpfr) (mpfr_t x, int t)
/* Difference between exact result and closest real number that
gets rounded to got, i.e. error before rounding, for a correctly
rounded result the difference is 0. */
-static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r)
+static double RT (ulperr) (RT (float) got, const struct RT (ret) * p, int r,
+ int ignore_zero_sign)
{
RT(float) want = p->y;
RT(float) d;
@@ -45,10 +46,18 @@ static double RT(ulperr) (RT(float) got, const struct RT(ret) * p, int r)
if (RT(asuint) (got) == RT(asuint) (want))
return 0.0;
+ if (isnan (got) && isnan (want))
+ /* Ignore sign of NaN. */
+ return RT (issignaling) (got) == RT (issignaling) (want) ? 0 : INFINITY;
if (signbit (got) != signbit (want))
- /* May have false positives with NaN. */
- //return isnan(got) && isnan(want) ? 0 : INFINITY;
- return INFINITY;
+ {
+ /* Fall through to ULP calculation if ignoring sign of zero and
+ exactly one of want and got is non-zero. */
+ if (ignore_zero_sign && want == got)
+ return 0.0;
+ if (!ignore_zero_sign || (want != 0 && got != 0))
+ return INFINITY;
+ }
if (!isfinite (want) || !isfinite (got))
{
if (isnan (got) != isnan (want))
@@ -114,8 +123,12 @@ static inline void T(call_fenv) (const struct fun *f, struct T(args) a, int r,
static inline void T(call_nofenv) (const struct fun *f, struct T(args) a,
int r, RT(float) * y, int *ex)
{
+ if (r != FE_TONEAREST)
+ fesetround (r);
*y = T(call) (f, a);
*ex = 0;
+ if (r != FE_TONEAREST)
+ fesetround (FE_TONEAREST);
}
static inline int T(call_long_fenv) (const struct fun *f, struct T(args) a,
@@ -155,8 +168,12 @@ static inline int T(call_long_nofenv) (const struct fun *f, struct T(args) a,
int r, struct RT(ret) * p,
RT(float) ygot, int exgot)
{
+ if (r != FE_TONEAREST)
+ fesetround (r);
RT(double) yl = T(call_long) (f, a);
p->y = (RT(float)) yl;
+ if (r != FE_TONEAREST)
+ fesetround (FE_TONEAREST);
if (RT(isok_nofenv) (ygot, p->y))
return 1;
p->ulpexp = RT(ulpscale) (p->y);
@@ -288,7 +305,7 @@ static int T(cmp) (const struct fun *f, struct gen *gen,
if (!ok)
{
int print = 0;
- double err = RT(ulperr) (ygot, &want, r);
+ double err = RT (ulperr) (ygot, &want, r, conf->ignore_zero_sign);
double abserr = fabs (err);
// TODO: count errors below accuracy limit.
if (abserr > 0)
diff --git a/math/test/ulp_funcs.h b/math/test/ulp_funcs.h
index f5cea4d6d14c..84f7927d3935 100644
--- a/math/test/ulp_funcs.h
+++ b/math/test/ulp_funcs.h
@@ -1,9 +1,10 @@
/*
* Function entries for ulp.
*
- * Copyright (c) 2022, Arm Limited.
+ * Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+/* clang-format off */
F1 (sin)
F1 (cos)
F (sincosf_sinf, sincosf_sinf, sincos_sin, sincos_mpfr_sin, 1, 1, f1, 0)
@@ -15,56 +16,18 @@
F2 (pow)
F1 (erf)
D1 (exp)
+ D1 (exp10)
D1 (exp2)
D1 (log)
D1 (log2)
D2 (pow)
D1 (erf)
-#if WANT_VMATH
- F (__s_sinf, __s_sinf, sin, mpfr_sin, 1, 1, f1, 0)
- F (__s_cosf, __s_cosf, cos, mpfr_cos, 1, 1, f1, 0)
- F (__s_expf_1u, __s_expf_1u, exp, mpfr_exp, 1, 1, f1, 0)
- F (__s_expf, __s_expf, exp, mpfr_exp, 1, 1, f1, 0)
- F (__s_exp2f_1u, __s_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 0)
- F (__s_exp2f, __s_exp2f, exp2, mpfr_exp2, 1, 1, f1, 0)
- F (__s_powf, __s_powf, pow, mpfr_pow, 2, 1, f2, 0)
- F (__s_logf, __s_logf, log, mpfr_log, 1, 1, f1, 0)
- F (__s_sin, __s_sin, sinl, mpfr_sin, 1, 0, d1, 0)
- F (__s_cos, __s_cos, cosl, mpfr_cos, 1, 0, d1, 0)
- F (__s_exp, __s_exp, expl, mpfr_exp, 1, 0, d1, 0)
- F (__s_log, __s_log, logl, mpfr_log, 1, 0, d1, 0)
- F (__s_pow, __s_pow, powl, mpfr_pow, 2, 0, d2, 0)
-#if __aarch64__
- F (__v_sinf, v_sinf, sin, mpfr_sin, 1, 1, f1, 1)
- F (__v_cosf, v_cosf, cos, mpfr_cos, 1, 1, f1, 1)
- F (__v_expf_1u, v_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
- F (__v_expf, v_expf, exp, mpfr_exp, 1, 1, f1, 1)
- F (__v_exp2f_1u, v_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (__v_exp2f, v_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (__v_logf, v_logf, log, mpfr_log, 1, 1, f1, 1)
- F (__v_powf, v_powf, pow, mpfr_pow, 2, 1, f2, 1)
- F (__v_sin, v_sin, sinl, mpfr_sin, 1, 0, d1, 1)
- F (__v_cos, v_cos, cosl, mpfr_cos, 1, 0, d1, 1)
- F (__v_exp, v_exp, expl, mpfr_exp, 1, 0, d1, 1)
- F (__v_log, v_log, logl, mpfr_log, 1, 0, d1, 1)
- F (__v_pow, v_pow, powl, mpfr_pow, 2, 0, d2, 1)
#ifdef __vpcs
- F (__vn_sinf, vn_sinf, sin, mpfr_sin, 1, 1, f1, 1)
- F (__vn_cosf, vn_cosf, cos, mpfr_cos, 1, 1, f1, 1)
- F (__vn_expf_1u, vn_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
- F (__vn_expf, vn_expf, exp, mpfr_exp, 1, 1, f1, 1)
- F (__vn_exp2f_1u, vn_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (__vn_exp2f, vn_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
- F (__vn_logf, vn_logf, log, mpfr_log, 1, 1, f1, 1)
- F (__vn_powf, vn_powf, pow, mpfr_pow, 2, 1, f2, 1)
- F (__vn_sin, vn_sin, sinl, mpfr_sin, 1, 0, d1, 1)
- F (__vn_cos, vn_cos, cosl, mpfr_cos, 1, 0, d1, 1)
- F (__vn_exp, vn_exp, expl, mpfr_exp, 1, 0, d1, 1)
- F (__vn_log, vn_log, logl, mpfr_log, 1, 0, d1, 1)
- F (__vn_pow, vn_pow, powl, mpfr_pow, 2, 0, d2, 1)
F (_ZGVnN4v_sinf, Z_sinf, sin, mpfr_sin, 1, 1, f1, 1)
F (_ZGVnN4v_cosf, Z_cosf, cos, mpfr_cos, 1, 1, f1, 1)
+ F (_ZGVnN4v_expf_1u, Z_expf_1u, exp, mpfr_exp, 1, 1, f1, 1)
F (_ZGVnN4v_expf, Z_expf, exp, mpfr_exp, 1, 1, f1, 1)
+ F (_ZGVnN4v_exp2f_1u, Z_exp2f_1u, exp2, mpfr_exp2, 1, 1, f1, 1)
F (_ZGVnN4v_exp2f, Z_exp2f, exp2, mpfr_exp2, 1, 1, f1, 1)
F (_ZGVnN4v_logf, Z_logf, log, mpfr_log, 1, 1, f1, 1)
F (_ZGVnN4vv_powf, Z_powf, pow, mpfr_pow, 2, 1, f2, 1)
@@ -74,5 +37,4 @@
F (_ZGVnN2v_log, Z_log, logl, mpfr_log, 1, 0, d1, 1)
F (_ZGVnN2vv_pow, Z_pow, powl, mpfr_pow, 2, 0, d2, 1)
#endif
-#endif
-#endif
+/* clang-format on */
diff --git a/math/test/ulp_wrappers.h b/math/test/ulp_wrappers.h
index fd9e00c0310f..60dc3d6dd652 100644
--- a/math/test/ulp_wrappers.h
+++ b/math/test/ulp_wrappers.h
@@ -1,10 +1,12 @@
/*
* Function wrappers for ulp.
*
- * Copyright (c) 2022, Arm Limited.
+ * Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+/* clang-format off */
+
/* Wrappers for sincos. */
static float sincosf_sinf(float x) {(void)cosf(x); return sinf(x);}
static float sincosf_cosf(float x) {(void)sinf(x); return cosf(x);}
@@ -16,37 +18,12 @@ static int sincos_mpfr_cos(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) { mpfr_sin(y,
#endif
/* Wrappers for vector functions. */
-#if __aarch64__ && WANT_VMATH
-static float v_sinf(float x) { return __v_sinf(argf(x))[0]; }
-static float v_cosf(float x) { return __v_cosf(argf(x))[0]; }
-static float v_expf_1u(float x) { return __v_expf_1u(argf(x))[0]; }
-static float v_expf(float x) { return __v_expf(argf(x))[0]; }
-static float v_exp2f_1u(float x) { return __v_exp2f_1u(argf(x))[0]; }
-static float v_exp2f(float x) { return __v_exp2f(argf(x))[0]; }
-static float v_logf(float x) { return __v_logf(argf(x))[0]; }
-static float v_powf(float x, float y) { return __v_powf(argf(x),argf(y))[0]; }
-static double v_sin(double x) { return __v_sin(argd(x))[0]; }
-static double v_cos(double x) { return __v_cos(argd(x))[0]; }
-static double v_exp(double x) { return __v_exp(argd(x))[0]; }
-static double v_log(double x) { return __v_log(argd(x))[0]; }
-static double v_pow(double x, double y) { return __v_pow(argd(x),argd(y))[0]; }
#ifdef __vpcs
-static float vn_sinf(float x) { return __vn_sinf(argf(x))[0]; }
-static float vn_cosf(float x) { return __vn_cosf(argf(x))[0]; }
-static float vn_expf_1u(float x) { return __vn_expf_1u(argf(x))[0]; }
-static float vn_expf(float x) { return __vn_expf(argf(x))[0]; }
-static float vn_exp2f_1u(float x) { return __vn_exp2f_1u(argf(x))[0]; }
-static float vn_exp2f(float x) { return __vn_exp2f(argf(x))[0]; }
-static float vn_logf(float x) { return __vn_logf(argf(x))[0]; }
-static float vn_powf(float x, float y) { return __vn_powf(argf(x),argf(y))[0]; }
-static double vn_sin(double x) { return __vn_sin(argd(x))[0]; }
-static double vn_cos(double x) { return __vn_cos(argd(x))[0]; }
-static double vn_exp(double x) { return __vn_exp(argd(x))[0]; }
-static double vn_log(double x) { return __vn_log(argd(x))[0]; }
-static double vn_pow(double x, double y) { return __vn_pow(argd(x),argd(y))[0]; }
static float Z_sinf(float x) { return _ZGVnN4v_sinf(argf(x))[0]; }
static float Z_cosf(float x) { return _ZGVnN4v_cosf(argf(x))[0]; }
+static float Z_expf_1u(float x) { return _ZGVnN4v_expf_1u(argf(x))[0]; }
static float Z_expf(float x) { return _ZGVnN4v_expf(argf(x))[0]; }
+static float Z_exp2f_1u(float x) { return _ZGVnN4v_exp2f_1u(argf(x))[0]; }
static float Z_exp2f(float x) { return _ZGVnN4v_exp2f(argf(x))[0]; }
static float Z_logf(float x) { return _ZGVnN4v_logf(argf(x))[0]; }
static float Z_powf(float x, float y) { return _ZGVnN4vv_powf(argf(x),argf(y))[0]; }
@@ -56,4 +33,5 @@ static double Z_exp(double x) { return _ZGVnN2v_exp(argd(x))[0]; }
static double Z_log(double x) { return _ZGVnN2v_log(argd(x))[0]; }
static double Z_pow(double x, double y) { return _ZGVnN2vv_pow(argd(x),argd(y))[0]; }
#endif
-#endif
+
+/* clang-format on */
diff --git a/math/tgamma128.c b/math/tgamma128.c
new file mode 100644
index 000000000000..65deacc49d99
--- /dev/null
+++ b/math/tgamma128.c
@@ -0,0 +1,356 @@
+/*
+ * Implementation of the true gamma function (as opposed to lgamma)
+ * for 128-bit long double.
+ *
+ * Copyright (c) 2006-2024, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/*
+ * This module implements the float128 gamma function under the name
+ * tgamma128. It's expected to be suitable for integration into system
+ * maths libraries under the standard name tgammal, if long double is
+ * 128-bit. Such a library will probably want to check the error
+ * handling and optimize the initial process of extracting the
+ * exponent, which is done here by simple and portable (but
+ * potentially slower) methods.
+ */
+
+#include <float.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+/* Only binary128 format is supported. */
+#if LDBL_MANT_DIG == 113
+
+#include "tgamma128.h"
+
+#define lenof(x) (sizeof(x)/sizeof(*(x)))
+
+/*
+ * Helper routine to evaluate a polynomial via Horner's rule.
+ *
+ * coeffs holds the n coefficients in ascending order of power, so the
+ * value returned is coeffs[0] + coeffs[1]*x + ... + coeffs[n-1]*x^(n-1).
+ * n must be at least 1: the first statement reads coeffs[n-1]
+ * unconditionally.
+ */
+static long double poly(const long double *coeffs, size_t n, long double x)
+{
+ long double result = coeffs[--n];
+
+ while (n > 0)
+ result = (result * x) + coeffs[--n];
+
+ return result;
+}
+
+/*
+ * Compute sin(pi*x) / pi, for use in the reflection formula that
+ * relates gamma(-x) and gamma(x).
+ *
+ * The argument is reduced with remquol against 0.5, which yields a
+ * remainder (fracpart) and the low bits of the quotient (quo). Bit 1
+ * of quo decides whether the result is negated, and bit 0 decides
+ * whether the remainder is passed to sinl or to cosl.
+ *
+ * The constant pi is not defined in this function; presumably it is
+ * provided by tgamma128.h.
+ */
+static long double sin_pi_x_over_pi(long double x)
+{
+ int quo;
+ long double fracpart = remquol(x, 0.5L, &quo);
+
+ long double sign = 1.0L;
+ if (quo & 2)
+ sign = -sign;
+ quo &= 1;
+
+ if (quo == 0 && fabsl(fracpart) < 0x1.p-58L) {
+ /* For numbers this size, sin(pi*x) is so close to pi*x that
+ * sin(pi*x)/pi is indistinguishable from x in float128 */
+ return sign * fracpart;
+ }
+
+ if (quo == 0) {
+ return sign * sinl(pi*fracpart) / pi;
+ } else {
+ return sign * cosl(pi*fracpart) / pi;
+ }
+}
+
+/*
+ * Return tgamma(x) on the assumption that x >= 8.
+ *
+ * If negative is true, the value returned is instead
+ * 1 / (gamma(x) * x * negadjust), which is the quantity the
+ * reflection formula in tgamma128 needs. negadjust is not used when
+ * negative is false.
+ */
+static long double tgamma_large(long double x,
+ bool negative, long double negadjust)
+{
+ /*
+ * In this range we compute gamma(x) as x^(x-1/2) * e^-x * K,
+ * where K is a correction factor computed as a polynomial in 1/x.
+ *
+ * (Vaguely inspired by the form of the Lanczos approximation, but
+ * I tried the Lanczos approximation itself and it suffers badly
+ * from big cancellation leading to loss of significance.)
+ */
+ long double t = 1/x;
+ long double p = poly(coeffs_large, lenof(coeffs_large), t);
+
+ /*
+ * To avoid overflow in cases where x^(x-0.5) does overflow
+ * but gamma(x) does not, we split x^(x-0.5) in half and
+ * multiply back up _after_ multiplying by the shrinking factor
+ * of exp(-(x-0.5)).
+ *
+ * Note that computing x-0.5 and (x-0.5)/2 is exact for the
+ * relevant range of x, so the only sources of error are pow
+ * and exp themselves, plus the multiplications.
+ */
+ long double powhalf = powl(x, (x-0.5L)/2.0L);
+ long double expret = expl(-(x-0.5L));
+
+ if (!negative) {
+ return (expret * powhalf) * powhalf * p;
+ } else {
+ /*
+ * Apply the reflection formula as commented below, but
+ * carefully: negadjust has magnitude less than 1, so it can
+ * turn a case where gamma(+x) would overflow into a case
+ * where gamma(-x) doesn't underflow. Not only that, but the
+ * FP format has greater range in the tiny domain due to
+ * denormals. For both reasons, it's not good enough to
+ * compute the positive result and then adjust it.
+ */
+ long double ret = 1 / ((expret * powhalf) * (x * negadjust) * p);
+ return ret / powhalf;
+ }
+}
+
+/* Return tgamma(x) on the assumption that 0 <= x < 1/32. */
+static long double tgamma_tiny(long double x,
+ bool negative, long double negadjust)
+{
+ /*
+ * For x near zero, we use a polynomial approximation to
+ * g = 1/(x*gamma(x)), and then return 1/(g*x).
+ *
+ * In the negative (reflection) case the result wanted is
+ * 1/(gamma(x)*x*negadjust), which is simply g/negadjust, so no
+ * reciprocal of g is needed.
+ */
+ long double g = poly(coeffs_tiny, lenof(coeffs_tiny), x);
+ if (!negative)
+ return 1.0L / (g*x);
+ else
+ return g / negadjust;
+}
+
+/* Return tgamma(x) on the assumption that 0 <= x < 2^-113. */
+static long double tgamma_ultratiny(long double x, bool negative,
+ long double negadjust)
+{
+ /* On this interval, gamma can't even be distinguished from 1/x,
+ * so we skip the polynomial evaluation in tgamma_tiny, partly to
+ * save time and partly to avoid the tiny intermediate values
+ * setting the underflow exception flag.
+ *
+ * In the negative (reflection) case, substituting gamma(x) = 1/x
+ * into 1/(gamma(x)*x*negadjust) leaves just 1/negadjust. */
+ if (!negative)
+ return 1.0L / x;
+ else
+ return 1.0L / negadjust;
+}
+
+/* Return tgamma(x) on the assumption that 1 <= x <= 2. */
+static long double tgamma_central(long double x)
+{
+ /*
+ * In this central interval, our strategy is to find the
+ * difference between x and the point where gamma has a minimum,
+ * and approximate based on that.
+ */
+
+ /* The difference between the input x and the minimum x. The first
+ * subtraction is expected to be exact, since x and min_x_hi have
+ * the same exponent (unless x=2, in which case it will still be
+ * exact). */
+ long double t = (x - min_x_hi) - min_x_lo;
+
+ /*
+ * Now use two different polynomials for the intervals [1,m] and
+ * [m,2], where m is the location of the minimum.
+ */
+ long double p;
+ if (t < 0)
+ p = poly(coeffs_central_neg, lenof(coeffs_central_neg), -t);
+ else
+ p = poly(coeffs_central_pos, lenof(coeffs_central_pos), t);
+
+ return (min_y_lo + p * (t*t)) + min_y_hi;
+}
+
+long double tgamma128(long double x)
+{
+ /*
+ * Start by extracting the number's sign and exponent, and ruling
+ * out cases of non-normalized numbers.
+ *
+ * For an implementation integrated into a system libm, it would
+ * almost certainly be quicker to do this by direct bitwise access
+ * to the input float128 value, using whatever is the local idiom
+ * for knowing its endianness.
+ *
+ * Integration into a system libc may also need to worry about
+ * setting errno, if that's the locally preferred way to report
+ * math.h errors.
+ */
+ int sign = signbit(x);
+ int exponent;
+ switch (fpclassify(x)) {
+ case FP_NAN:
+ return x+x; /* propagate QNaN, make SNaN throw an exception */
+ case FP_ZERO:
+ return 1/x; /* divide by zero on purpose to indicate a pole */
+ case FP_INFINITE:
+ if (sign) {
+ return x-x; /* gamma(-inf) has indeterminate sign, so provoke an
+ * IEEE invalid operation exception to indicate that */
+ }
+ return x; /* but gamma(+inf) is just +inf with no error */
+ case FP_SUBNORMAL:
+ exponent = -16384;
+ break;
+ default:
+ frexpl(x, &exponent);
+ exponent--;
+ break;
+ }
+
+ bool negative = false;
+ long double negadjust = 0.0L;
+
+ if (sign) {
+ /*
+ * Euler's reflection formula is
+ *
+ * gamma(1-x) gamma(x) = pi/sin(pi*x)
+ *
+ * pi
+ * => gamma(x) = --------------------
+ * gamma(1-x) sin(pi*x)
+ *
+ * But computing 1-x is going to lose a lot of accuracy when x
+ * is very small, so instead we transform using the recurrence
+ * gamma(t+1)=t gamma(t). Setting t=-x, this gives us
+ * gamma(1-x) = -x gamma(-x), so we now have
+ *
+ * pi
+ * gamma(x) = ----------------------
+ * -x gamma(-x) sin(pi*x)
+ *
+ * which relates gamma(x) to gamma(-x), which is much nicer,
+ * since x can be turned into -x without rounding.
+ */
+ negadjust = sin_pi_x_over_pi(x);
+ negative = true;
+ x = -x;
+
+ /*
+ * Now the ultimate answer we want is
+ *
+ * 1 / (gamma(x) * x * negadjust)
+ *
+ * where x is the positive value we've just turned it into.
+ *
+ * For some of the cases below, we'll compute gamma(x)
+ * normally and then compute this adjusted value afterwards.
+ * But for others, we can implement the reciprocal operation
+ * in this formula by _avoiding_ an inversion that the
+ * sub-case was going to do anyway.
+ */
+
+ if (negadjust == 0) {
+ /*
+ * Special case for negative integers. Applying the
+ * reflection formula would cause division by zero, but
+ * standards would prefer we treat this error case as an
+ * invalid operation and return NaN instead. (Possibly
+ * because otherwise you'd have to decide which sign of
+ * infinity to return, and unlike the x=0 case, there's no
+ * sign of zero available to disambiguate.)
+ */
+ return negadjust / negadjust;
+ }
+ }
+
+ /*
+ * Split the positive domain into various cases. For cases where
+ * we do the negative-number adjustment the usual way, we'll leave
+ * the answer in 'g' and drop out of the if statement.
+ */
+ long double g;
+
+ if (exponent >= 11) {
+ /*
+ * gamma of any positive value this large overflows, and gamma
+ * of any negative value underflows.
+ */
+ if (!negative) {
+ long double huge = 0x1p+12288L;
+ return huge * huge; /* provoke an overflow */
+ } else {
+ long double tiny = 0x1p-12288L;
+ return tiny * tiny * negadjust; /* underflow, of the right sign */
+ }
+ } else if (exponent >= 3) {
+ /* Negative-number adjustment happens inside here */
+ return tgamma_large(x, negative, negadjust);
+ } else if (exponent < -113) {
+ /* Negative-number adjustment happens inside here */
+ return tgamma_ultratiny(x, negative, negadjust);
+ } else if (exponent < -5) {
+ /* Negative-number adjustment happens inside here */
+ return tgamma_tiny(x, negative, negadjust);
+ } else if (exponent == 0) {
+ g = tgamma_central(x);
+ } else if (exponent < 0) {
+ /*
+ * For x in [1/32,1) we range-reduce upwards to the interval
+ * [1,2), using the inverse of the normal recurrence formula:
+ * gamma(x) = gamma(x+1)/x.
+ */
+ g = tgamma_central(1+x) / x;
+ } else {
+ /*
+ * For x in [2,8) we range-reduce downwards to the interval
+ * [1,2) by repeated application of the recurrence formula.
+ *
+ * Actually multiplying (x-1) by (x-2) by (x-3) and so on
+ * would introduce multiple ULPs of rounding error. We can get
+ * better accuracy by writing x = (k+1/2) + t, where k is an
+ * integer and |t|<1/2, and expanding out the obvious factor
+ * (x-1)(x-2)...(x-k+1) as a polynomial in t.
+ */
+ long double mult;
+ int i = x;
+ if (i == 2) { /* x in [2,3) */
+ mult = (x-1);
+ } else {
+ long double t = x - (i + 0.5L);
+ switch (i) {
+ /* E.g. for x=3.5+t, we want
+ * (x-1)(x-2) = (2.5+t)(1.5+t) = 3.75 + 4t + t^2 */
+ case 3:
+ mult = 3.75L+t*(4.0L+t);
+ break;
+ case 4:
+ mult = 13.125L+t*(17.75L+t*(7.5L+t));
+ break;
+ case 5:
+ mult = 59.0625L+t*(93.0L+t*(51.50L+t*(12.0L+t)));
+ break;
+ case 6:
+ mult = 324.84375L+t*(570.5625L+t*(376.250L+t*(
+ 117.5L+t*(17.5L+t))));
+ break;
+ case 7:
+ mult = 2111.484375L+t*(4033.5L+t*(3016.1875L+t*(
+ 1140.0L+t*(231.25L+t*(24.0L+t)))));
+ break;
+ }
+ }
+
+ g = tgamma_central(x - (i-1)) * mult;
+ }
+
+ if (!negative) {
+ /* Positive domain: return g unmodified */
+ return g;
+ } else {
+ /* Negative domain: apply the reflection formula as commented above */
+ return 1.0L / (g * x * negadjust);
+ }
+}
+
+#endif
diff --git a/math/tgamma128.h b/math/tgamma128.h
new file mode 100644
index 000000000000..90875a22dce4
--- /dev/null
+++ b/math/tgamma128.h
@@ -0,0 +1,141 @@
+/*
+ * Polynomial coefficients and other constants for tgamma128.c.
+ *
+ * Copyright (c) 2006-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* The largest positive value for which 128-bit tgamma does not overflow. */
+static const long double max_x = 0x1.b6e3180cd66a5c4206f128ba77f4p+10L;
+
+/* Coefficients of the polynomial used in the tgamma_large() subroutine */
+static const long double coeffs_large[] = {
+ 0x1.8535745aa79569579b9eec0f3bbcp+0L,
+ 0x1.0378f83c6fb8f0e51269f2b4a973p-3L,
+ 0x1.59f6a05094f69686c3380f4e2783p-8L,
+ -0x1.0b291dee952a82764a4859b081a6p-8L,
+ -0x1.6dd301b2205bf936b5a3eaad0dbbp-12L,
+ 0x1.387a8b5f38dd77e7f139b1021e86p-10L,
+ 0x1.bca46637f65b13750c728cc29e40p-14L,
+ -0x1.d80401c00aef998c9e303151a51cp-11L,
+ -0x1.49cb6bb09f935a2053ccc2cf3711p-14L,
+ 0x1.4e950204437dcaf2be77f73a6f45p-10L,
+ 0x1.cb711a2d65f188bf60110934d6bep-14L,
+ -0x1.7d7ff4bc95dc7faefc5e767f70f1p-9L,
+ -0x1.0305ab9760cddb0d833e73766836p-12L,
+ 0x1.3ef6c84bf1cd5c3f65ac2693bb5bp-7L,
+ 0x1.bb4144740ad9290123fdcea684aap-11L,
+ -0x1.72ab4e88272a229bfafd192450f0p-5L,
+ 0x1.80c70ac6eb3b7a698983d25a62b8p-12L,
+ 0x1.e222791c6743ce3e3cae220fb236p-3L,
+ 0x1.1a2dca1c82a9326c52b465f7cb7ap-2L,
+ -0x1.9d204fa235a42cd901b123d2ad47p+1L,
+ 0x1.55b56d1158f77ddb1c95fc44ab02p+0L,
+ 0x1.37f900a11dbd892abd7dde533e2dp+5L,
+ -0x1.2da49f4188dd89cb958369ef2401p+7L,
+ 0x1.fdae5ec3ec6eb7dffc09edbe6c14p+7L,
+ -0x1.61433cebe649098c9611c4c7774ap+7L,
+};
+
+/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */
+static const long double coeffs_tiny[] = {
+ 0x1.0000000000000000000000000000p+0L,
+ 0x1.2788cfc6fb618f49a37c7f0201fep-1L,
+ -0x1.4fcf4026afa2dceb8490ade22796p-1L,
+ -0x1.5815e8fa27047c8f42b5d9217244p-5L,
+ 0x1.5512320b43fbe5dfa771333518f7p-3L,
+ -0x1.59af103c340927bffdd44f954bfcp-5L,
+ -0x1.3b4af28483e210479657e5543366p-7L,
+ 0x1.d919c527f6070bfce9b29c2ace9cp-8L,
+ -0x1.317112ce35337def3556a18aa178p-10L,
+ -0x1.c364fe77a6f27677b985b1fa2e1dp-13L,
+ 0x1.0c8a7a19a3fd40fe1f7e867efe7bp-13L,
+ -0x1.51cf9f090b5dc398ba86305e3634p-16L,
+ -0x1.4e80f64c04a339740de06ca9fa4ap-20L,
+ 0x1.241ddc2aef2ec20e58b08f2fda17p-20L,
+};
+
+/* The location within the interval [1,2] where gamma has a minimum.
+ * Specified as the sum of two 128-bit values, for extra precision. */
+static const long double min_x_hi = 0x1.762d86356be3f6e1a9c8865e0a4fp+0L;
+static const long double min_x_lo = 0x1.ac54d7d218de21303a7c60f08840p-118L;
+
+/* The actual minimum value that gamma takes at that location.
+ * Again specified as the sum of two 128-bit values. */
+static const long double min_y_hi = 0x1.c56dc82a74aee8d8851566d40f32p-1L;
+static const long double min_y_lo = 0x1.8ed98685742c353ce55e5794686fp-114L;
+
+/* Coefficients of the polynomial used in the tgamma_central() subroutine
+ * for computing gamma on the interval [1,min_x] */
+static const long double coeffs_central_neg[] = {
+ 0x1.b6c53f7377b83839c8a292e43b69p-2L,
+ 0x1.0bae9f40c7d09ed76e732045850ap-3L,
+ 0x1.4981175e14d04c3530e51d01c5fep-3L,
+ 0x1.79f77aaf032c948af3a9edbd2061p-4L,
+ 0x1.1e97bd10821095a5b79fbfdfa1a3p-4L,
+ 0x1.8071ce0935e4dcf0b33b0fbec7c1p-5L,
+ 0x1.0b44c2f92982f887b55ec36dfdb0p-5L,
+ 0x1.6df1de1e178ef72ca7bd63d40870p-6L,
+ 0x1.f63f502bde27e81c0f5e13479b43p-7L,
+ 0x1.57fd67d901f40ea011353ad89a0ap-7L,
+ 0x1.d7151376eed187eb753e2273cafcp-8L,
+ 0x1.427162b5c6ff1d904c71ef53e37cp-8L,
+ 0x1.b954b8c3a56cf93e49ef6538928ap-9L,
+ 0x1.2dff2ec26a3ae5cd3aaccae7a09ep-9L,
+ 0x1.9d35250d9b9378d9b59df734537ap-10L,
+ 0x1.1b2c0c48b9855a28f6dbd6fdff3cp-10L,
+ 0x1.7e0db39bb99cdb52b028d9359380p-11L,
+ 0x1.2164b5e1d364a0b5eaf97c436aa7p-11L,
+ 0x1.27521cf5fd24dcdf43524e6add11p-13L,
+ 0x1.06461d62243bf9a826b42349672fp-10L,
+ -0x1.2b852abead28209b4e0c756dc46ep-9L,
+ 0x1.be673c11a72c826115ec6d286c14p-8L,
+ -0x1.fd9ce330c215c31fcd3cb53c42ebp-7L,
+ 0x1.fa362bd2dc68f41abef2d8600acdp-6L,
+ -0x1.a21585b2f52f8b23855de8e452edp-5L,
+ 0x1.1f234431ed032052fc92e64e0493p-4L,
+ -0x1.40d332476ca0199c60cdae3f9132p-4L,
+ 0x1.1d45dc665d86012eba2eea199cefp-4L,
+ -0x1.8491016cdd08dc9be7ade9b5fef3p-5L,
+ 0x1.7e7e2fbc6d49ad484300d6add324p-6L,
+ -0x1.e63fe3f874a37276a8d7d8b705ecp-8L,
+ 0x1.30a2a73944f8c84998314d69c23fp-10L,
+};
+
+/* Coefficients of the polynomial used in the tgamma_central() subroutine
+ * for computing gamma on the interval [min_x,2] */
+static const long double coeffs_central_pos[] = {
+ 0x1.b6c53f7377b83839c8a292e22aa2p-2L,
+ -0x1.0bae9f40c7d09ed76e72e1c955dep-3L,
+ 0x1.4981175e14d04c3530ee5e1ecebcp-3L,
+ -0x1.79f77aaf032c948ac983d77f3e07p-4L,
+ 0x1.1e97bd10821095ab7dc94936cc11p-4L,
+ -0x1.8071ce0935e4d7edef8cbf2a1cf1p-5L,
+ 0x1.0b44c2f929837fafef7b5d9e80f1p-5L,
+ -0x1.6df1de1e175fe2a51faa25cddbb4p-6L,
+ 0x1.f63f502be57d11aed2cfe90843ffp-7L,
+ -0x1.57fd67d852f230015b9f64770273p-7L,
+ 0x1.d715138adc07e5fce81077070357p-8L,
+ -0x1.4271618e9fda8992a667adb15f4fp-8L,
+ 0x1.b954d15d9eb772e80fdd760672d7p-9L,
+ -0x1.2dfe391241d3cb79c8c15182843dp-9L,
+ 0x1.9d44396fcd48451c3ba924cee814p-10L,
+ -0x1.1ac195fb99739e341589e39803e6p-10L,
+ 0x1.82e46127b68f002770826e25f146p-11L,
+ -0x1.089dacd90d9f41493119ac178359p-11L,
+ 0x1.6993c007b20394a057d21f3d37f8p-12L,
+ -0x1.ec43a709f4446560c099dec8e31bp-13L,
+ 0x1.4ba36322f4074e9add9450f003cap-13L,
+ -0x1.b3f83a977965ca1b7937bf5b34cap-14L,
+ 0x1.10af346abc09cb25a6d9fe810b6ep-14L,
+ -0x1.38d8ea1188f242f50203edc395bdp-15L,
+ 0x1.39add987a948ec56f62b721a4475p-16L,
+ -0x1.02a4e141f286c8a967e2df9bc9adp-17L,
+ 0x1.433b50af22425f546e87113062d7p-19L,
+ -0x1.0c7b73cb0013f00aafc103e8e382p-21L,
+ 0x1.b852de313ec38da2297f6deaa6b4p-25L,
+};
+
+/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine
+ */
+static const long double pi = 0x1.921fb54442d18469898cc51701b8p+1L;
diff --git a/math/tools/tgamma128_gen.jl b/math/tools/tgamma128_gen.jl
new file mode 100644
index 000000000000..ecec174110ea
--- /dev/null
+++ b/math/tools/tgamma128_gen.jl
@@ -0,0 +1,212 @@
+# -*- julia -*-
+#
+# Generate tgamma128.h, containing polynomials and constants used by
+# tgamma128.c.
+#
+# Copyright (c) 2006-2023, Arm Limited.
+# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+# This Julia program depends on the 'Remez' and 'SpecialFunctions'
+# library packages. To install them, run this at the interactive Julia
+# prompt:
+#
+# import Pkg; Pkg.add(["Remez", "SpecialFunctions"])
+#
+# Tested on Julia 1.4.1 (Ubuntu 20.04) and 1.9.0 (22.04).
+
+import Printf
+import Remez
+import SpecialFunctions
+
+# Round a BigFloat to 128-bit long double and format it as a C99 hex
+# float literal.
+function quadhex(x)
+ sign = " "
+ if x < 0
+ sign = "-"
+ x = -x
+ end
+
+ exponent = BigInt(floor(log2(x)))
+ exponent = max(exponent, -16382)
+ @assert(exponent <= 16383) # else overflow
+
+ x /= BigFloat(2)^exponent
+ @assert(1 <= x < 2)
+ x *= BigFloat(2)^112
+ mantissa = BigInt(round(x))
+
+ mantstr = string(mantissa, base=16, pad=29)
+ return Printf.@sprintf("%s0x%s.%sp%+dL", sign, mantstr[1], mantstr[2:end],
+ exponent)
+end
+
+# Round a BigFloat to 128-bit long double precision, returned as a BigFloat.
+# round<0 truncates the magnitude, round>0 rounds it up, round==0 to nearest.
+function quadval(x, round=0)
+ sign = +1
+ if x.sign < 0
+ sign = -1
+ x = -x
+ end
+
+ exponent = BigInt(floor(log2(x)))
+ exponent = max(exponent, -16382)
+ @assert(exponent <= 16383) # else overflow
+
+ x /= BigFloat(2)^exponent
+ @assert(1 <= x < 2)
+ x *= BigFloat(2)^112
+ if round < 0
+ mantissa = floor(x)
+ elseif round > 0
+ mantissa = ceil(x)
+ else
+ mantissa = Base.round(x) # the 'round' argument shadows Base.round here
+ end
+
+ return sign * mantissa * BigFloat(2)^(exponent - 112)
+end
+
+# Print the BigFloat array 'a' as a C 'static const long double name[]' array.
+function dumparray(a, name)
+ println("static const long double ", name, "[] = {")
+ for x in a # iterate the argument, not the global N it is usually called with
+ println(" ", quadhex(x), ",")
+ end
+ println("};")
+end
+
+print("/*
+ * Polynomial coefficients and other constants for tgamma128.c.
+ *
+ * Copyright (c) 2006,2009,2023 Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+")
+
+Base.MPFR.setprecision(512)
+
+e = exp(BigFloat(1))
+
+print("
+/* The largest positive value for which 128-bit tgamma does not overflow. */
+")
+lo = BigFloat("1000")
+hi = BigFloat("2000")
+while true
+ global lo
+ global hi
+ global max_x
+
+ mid = (lo + hi) / 2
+ if mid == lo || mid == hi
+ max_x = mid
+ break
+ end
+ if SpecialFunctions.logabsgamma(mid)[1] < 16384 * log(BigFloat(2))
+ lo = mid
+ else
+ hi = mid
+ end
+end
+max_x = quadval(max_x, -1)
+println("static const long double max_x = ", quadhex(max_x), ";")
+
+print("
+/* Coefficients of the polynomial used in the tgamma_large() subroutine */
+")
+N, D, E, X = Remez.ratfn_minimax(
+ x -> x==0 ? sqrt(BigFloat(2)*pi/e) :
+ exp(SpecialFunctions.logabsgamma(1/x)[1] +
+ (1/x-0.5)*(1+log(x))),
+ (0, 1/BigFloat(8)),
+ 24, 0,
+ (x, y) -> 1/y
+)
+dumparray(N, "coeffs_large")
+
+print("
+/* Coefficients of the polynomial used in the tgamma_tiny() subroutine */
+")
+N, D, E, X = Remez.ratfn_minimax(
+ x -> x==0 ? 1 : 1/(x*SpecialFunctions.gamma(x)),
+ (0, 1/BigFloat(32)),
+ 13, 0,
+)
+dumparray(N, "coeffs_tiny")
+
+print("
+/* The location within the interval [1,2] where gamma has a minimum.
+ * Specified as the sum of two 128-bit values, for extra precision. */
+")
+lo = BigFloat("1.4")
+hi = BigFloat("1.5")
+while true
+ global lo
+ global hi
+ global min_x
+
+ mid = (lo + hi) / 2
+ if mid == lo || mid == hi
+ min_x = mid
+ break
+ end
+ if SpecialFunctions.digamma(mid) < 0
+ lo = mid
+ else
+ hi = mid
+ end
+end
+min_x_hi = quadval(min_x, -1)
+println("static const long double min_x_hi = ", quadhex(min_x_hi), ";")
+println("static const long double min_x_lo = ", quadhex(min_x - min_x_hi), ";")
+
+print("
+/* The actual minimum value that gamma takes at that location.
+ * Again specified as the sum of two 128-bit values. */
+")
+min_y = SpecialFunctions.gamma(min_x)
+min_y_hi = quadval(min_y, -1)
+println("static const long double min_y_hi = ", quadhex(min_y_hi), ";")
+println("static const long double min_y_lo = ", quadhex(min_y - min_y_hi), ";")
+
+function taylor_bodge(x)
+ # Taylor series generated by Wolfram Alpha for (gamma(min_x+x)-min_y)/x^2.
+ # Used in the Remez calls below for x values very near the origin, to avoid
+ # significance loss problems when trying to compute it directly via that
+ # formula (even in MPFR's extra precision).
+ return BigFloat("0.428486815855585429730209907810650582960483696962660010556335457558784421896667728014324097132413696263704801646004585959298743677879606168187061990204432200")+x*(-BigFloat("0.130704158939785761928008749242671025181542078105370084716141350308119418619652583986015464395882363802104154017741656168641240436089858504560718773026275797")+x*(BigFloat("0.160890753325112844190519489594363387594505844658437718135952967735294789599989664428071656484587979507034160383271974554122934842441540146372016567834062876")+x*(-BigFloat("0.092277030213334350126864106458600575084335085690780082222880945224248438672595248111704471182201673989215223667543694847795410779036800385804729955729659506"))))
+end
+
+print("
+/* Coefficients of the polynomial used in the tgamma_central() subroutine
+ * for computing gamma on the interval [1,min_x] */
+")
+N, D, E, X = Remez.ratfn_minimax(
+ x -> x < BigFloat(0x1p-64) ? taylor_bodge(-x) :
+ (SpecialFunctions.gamma(min_x - x) - min_y) / (x*x),
+ (0, min_x - 1),
+ 31, 0,
+ (x, y) -> x^2,
+)
+dumparray(N, "coeffs_central_neg")
+
+print("
+/* Coefficients of the polynomial used in the tgamma_central() subroutine
+ * for computing gamma on the interval [min_x,2] */
+")
+N, D, E, X = Remez.ratfn_minimax(
+ x -> x < BigFloat(0x1p-64) ? taylor_bodge(x) :
+ (SpecialFunctions.gamma(min_x + x) - min_y) / (x*x),
+ (0, 2 - min_x),
+ 28, 0,
+ (x, y) -> x^2,
+)
+dumparray(N, "coeffs_central_pos")
+
+print("
+/* 128-bit float value of pi, used by the sin_pi_x_over_pi subroutine
+ */
+")
+println("static const long double pi = ", quadhex(BigFloat(pi)), ";")
diff --git a/math/v_cos.c b/math/v_cos.c
deleted file mode 100644
index 4c8787e66c41..000000000000
--- a/math/v_cos.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Double-precision vector cos function.
- *
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const double Poly[] = {
-/* worst-case error is 3.5 ulp.
- abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */
--0x1.9f4a9c8b21dc9p-41,
- 0x1.60e88a10163f2p-33,
--0x1.ae6361b7254e7p-26,
- 0x1.71de382e8d62bp-19,
--0x1.a01a019aeb4ffp-13,
- 0x1.111111110b25ep-7,
--0x1.55555555554c3p-3,
-};
-
-#define C7 v_f64 (Poly[0])
-#define C6 v_f64 (Poly[1])
-#define C5 v_f64 (Poly[2])
-#define C4 v_f64 (Poly[3])
-#define C3 v_f64 (Poly[4])
-#define C2 v_f64 (Poly[5])
-#define C1 v_f64 (Poly[6])
-
-#define InvPi v_f64 (0x1.45f306dc9c883p-2)
-#define HalfPi v_f64 (0x1.921fb54442d18p+0)
-#define Pi1 v_f64 (0x1.921fb54442d18p+1)
-#define Pi2 v_f64 (0x1.1a62633145c06p-53)
-#define Pi3 v_f64 (0x1.c1cd129024e09p-106)
-#define Shift v_f64 (0x1.8p52)
-#define RangeVal v_f64 (0x1p23)
-#define AbsMask v_u64 (0x7fffffffffffffff)
-
-VPCS_ATTR
-__attribute__ ((noinline)) static v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
-{
- return v_call_f64 (cos, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f64_t
-V_NAME(cos) (v_f64_t x)
-{
- v_f64_t n, r, r2, y;
- v_u64_t odd, cmp;
-
- r = v_as_f64_u64 (v_as_u64_f64 (x) & AbsMask);
- cmp = v_cond_u64 (v_as_u64_f64 (r) >= v_as_u64_f64 (RangeVal));
-
-#if WANT_SIMD_EXCEPT
- if (unlikely (v_any_u64 (cmp)))
- /* If fenv exceptions are to be triggered correctly, set any special lanes
- to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
- specialcase later. */
- r = v_sel_f64 (cmp, v_f64 (1.0), r);
-#endif
-
- /* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = v_fma_f64 (InvPi, r + HalfPi, Shift);
- odd = v_as_u64_f64 (n) << 63;
- n -= Shift;
- n -= v_f64 (0.5);
-
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = v_fma_f64 (-Pi1, n, r);
- r = v_fma_f64 (-Pi2, n, r);
- r = v_fma_f64 (-Pi3, n, r);
-
- /* sin(r) poly approx. */
- r2 = r * r;
- y = v_fma_f64 (C7, r2, C6);
- y = v_fma_f64 (y, r2, C5);
- y = v_fma_f64 (y, r2, C4);
- y = v_fma_f64 (y, r2, C3);
- y = v_fma_f64 (y, r2, C2);
- y = v_fma_f64 (y, r2, C1);
- y = v_fma_f64 (y * r2, r, r);
-
- /* sign. */
- y = v_as_f64_u64 (v_as_u64_f64 (y) ^ odd);
-
- if (unlikely (v_any_u64 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-#endif
diff --git a/math/v_cosf.c b/math/v_cosf.c
deleted file mode 100644
index bd677c3ae173..000000000000
--- a/math/v_cosf.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Single-precision vector cos function.
- *
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* 1.886 ulp error */
- 0x1.5b2e76p-19f,
- -0x1.9f42eap-13f,
- 0x1.110df4p-7f,
- -0x1.555548p-3f,
-};
-#define Pi1 v_f32 (0x1.921fb6p+1f)
-#define Pi2 v_f32 (-0x1.777a5cp-24f)
-#define Pi3 v_f32 (-0x1.ee59dap-49f)
-#define A3 v_f32 (Poly[3])
-#define A5 v_f32 (Poly[2])
-#define A7 v_f32 (Poly[1])
-#define A9 v_f32 (Poly[0])
-#define RangeVal v_f32 (0x1p20f)
-#define InvPi v_f32 (0x1.45f306p-2f)
-#define Shift v_f32 (0x1.8p+23f)
-#define AbsMask v_u32 (0x7fffffff)
-#define HalfPi v_f32 (0x1.921fb6p0f)
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
-{
- /* Fall back to scalar code. */
- return v_call_f32 (cosf, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(cosf) (v_f32_t x)
-{
- v_f32_t n, r, r2, y;
- v_u32_t odd, cmp;
-
- r = v_as_f32_u32 (v_as_u32_f32 (x) & AbsMask);
- cmp = v_cond_u32 (v_as_u32_f32 (r) >= v_as_u32_f32 (RangeVal));
-
-#if WANT_SIMD_EXCEPT
- if (unlikely (v_any_u32 (cmp)))
- /* If fenv exceptions are to be triggered correctly, set any special lanes
- to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
- specialcase later. */
- r = v_sel_f32 (cmp, v_f32 (1.0f), r);
-#endif
-
- /* n = rint((|x|+pi/2)/pi) - 0.5 */
- n = v_fma_f32 (InvPi, r + HalfPi, Shift);
- odd = v_as_u32_f32 (n) << 31;
- n -= Shift;
- n -= v_f32 (0.5f);
-
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
- r = v_fma_f32 (-Pi1, n, r);
- r = v_fma_f32 (-Pi2, n, r);
- r = v_fma_f32 (-Pi3, n, r);
-
- /* y = sin(r) */
- r2 = r * r;
- y = v_fma_f32 (A9, r2, A7);
- y = v_fma_f32 (y, r2, A5);
- y = v_fma_f32 (y, r2, A3);
- y = v_fma_f32 (y * r2, r, r);
-
- /* sign fix */
- y = v_as_f32_u32 (v_as_u32_f32 (y) ^ odd);
-
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-#endif
diff --git a/math/v_exp.c b/math/v_exp.c
deleted file mode 100644
index da23fd1c5f46..000000000000
--- a/math/v_exp.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Double-precision vector e^x function.
- *
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-#include "v_exp.h"
-
-#if V_EXP_TABLE_BITS == 7
-/* maxerr: 1.88 +0.5 ulp
- rel error: 1.4337*2^-53
- abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ]. */
-#define C1 v_f64 (0x1.ffffffffffd43p-2)
-#define C2 v_f64 (0x1.55555c75adbb2p-3)
-#define C3 v_f64 (0x1.55555da646206p-5)
-#define InvLn2 v_f64 (0x1.71547652b82fep7) /* N/ln2. */
-#define Ln2hi v_f64 (0x1.62e42fefa39efp-8) /* ln2/N. */
-#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-63)
-#elif V_EXP_TABLE_BITS == 8
-/* maxerr: 0.54 +0.5 ulp
- rel error: 1.4318*2^-58
- abs error: 1.4299*2^-58 in [ -ln2/512, ln2/512 ]. */
-#define C1 v_f64 (0x1.fffffffffffd4p-2)
-#define C2 v_f64 (0x1.5555571d6b68cp-3)
-#define C3 v_f64 (0x1.5555576a59599p-5)
-#define InvLn2 v_f64 (0x1.71547652b82fep8)
-#define Ln2hi v_f64 (0x1.62e42fefa39efp-9)
-#define Ln2lo v_f64 (0x1.abc9e3b39803f3p-64)
-#endif
-
-#define N (1 << V_EXP_TABLE_BITS)
-#define Tab __v_exp_data
-#define IndexMask v_u64 (N - 1)
-#define Shift v_f64 (0x1.8p+52)
-
-#if WANT_SIMD_EXCEPT
-
-#define TinyBound 0x200 /* top12 (asuint64 (0x1p-511)). */
-#define BigBound 0x408 /* top12 (asuint64 (0x1p9)). */
-
-VPCS_ATTR static NOINLINE v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine to special lanes. */
- return v_call_f64 (exp, x, y, cmp);
-}
-
-#else
-
-#define Thres v_f64 (704.0)
-
-VPCS_ATTR
-static v_f64_t
-specialcase (v_f64_t s, v_f64_t y, v_f64_t n)
-{
- v_f64_t absn = v_abs_f64 (n);
-
- /* 2^(n/N) may overflow, break it up into s1*s2. */
- v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000);
- v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b);
- v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b);
- v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N));
- v_f64_t r1 = s1 * s1;
- v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1;
- return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0)));
-}
-
-#endif
-
-VPCS_ATTR
-v_f64_t
-V_NAME(exp) (v_f64_t x)
-{
- v_f64_t n, r, r2, s, y, z;
- v_u64_t cmp, u, e, i;
-
-#if WANT_SIMD_EXCEPT
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- specialcase to fix special lanes later. This is only necessary if fenv
- exceptions are to be triggered correctly. */
- v_f64_t xm = x;
- cmp = v_cond_u64 ((v_as_u64_f64 (v_abs_f64 (x)) >> 52) - TinyBound
- >= BigBound - TinyBound);
- if (unlikely (v_any_u64 (cmp)))
- x = v_sel_f64 (cmp, v_f64 (1), x);
-#else
- cmp = v_cond_u64 (v_abs_f64 (x) > Thres);
-#endif
-
- /* n = round(x/(ln2/N)). */
- z = v_fma_f64 (x, InvLn2, Shift);
- u = v_as_u64_f64 (z);
- n = z - Shift;
-
- /* r = x - n*ln2/N. */
- r = x;
- r = v_fma_f64 (-Ln2hi, n, r);
- r = v_fma_f64 (-Ln2lo, n, r);
-
- e = u << (52 - V_EXP_TABLE_BITS);
- i = u & IndexMask;
-
- /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
- r2 = r * r;
- y = v_fma_f64 (C2, r, C1);
- y = v_fma_f64 (C3, r2, y);
- y = v_fma_f64 (y, r2, r);
-
- /* s = 2^(n/N). */
- u = v_lookup_u64 (Tab, i);
- s = v_as_f64_u64 (u + e);
-
- if (unlikely (v_any_u64 (cmp)))
-#if WANT_SIMD_EXCEPT
- return specialcase (xm, v_fma_f64 (y, s, s), cmp);
-#else
- return specialcase (s, y, n);
-#endif
-
- return v_fma_f64 (y, s, s);
-}
-VPCS_ALIAS
-#endif
diff --git a/math/v_exp.h b/math/v_exp.h
deleted file mode 100644
index 1e7f7f3b927d..000000000000
--- a/math/v_exp.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Declarations for double-precision e^x vector function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#if WANT_VMATH
-
-#define V_EXP_TABLE_BITS 7
-
-extern const u64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
-#endif
diff --git a/math/v_exp2f.c b/math/v_exp2f.c
deleted file mode 100644
index 7f40dbaa6679..000000000000
--- a/math/v_exp2f.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Single-precision vector 2^x function.
- *
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* maxerr: 1.962 ulp. */
- 0x1.59977ap-10f,
- 0x1.3ce9e4p-7f,
- 0x1.c6bd32p-5f,
- 0x1.ebf9bcp-3f,
- 0x1.62e422p-1f,
-};
-#define C0 v_f32 (Poly[0])
-#define C1 v_f32 (Poly[1])
-#define C2 v_f32 (Poly[2])
-#define C3 v_f32 (Poly[3])
-#define C4 v_f32 (Poly[4])
-
-#define Shift v_f32 (0x1.8p23f)
-
-#if WANT_SIMD_EXCEPT
-
-#define TinyBound 0x20000000 /* asuint (0x1p-63). */
-#define BigBound 0x42800000 /* asuint (0x1p6). */
-
-VPCS_ATTR
-static NOINLINE v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine to special lanes. */
- return v_call_f32 (exp2f, x, y, cmp);
-}
-
-#else
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
-{
- /* 2^n may overflow, break it up into s1*s2. */
- v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
- v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
- v_f32_t s2 = v_as_f32_u32 (e - b);
- v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
- v_u32_t r2 = v_as_u32_f32 (s1 * s1);
- v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
- /* Similar to r1 but avoids double rounding in the subnormal range. */
- v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
- return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
-}
-
-#endif
-
-VPCS_ATTR
-v_f32_t
-V_NAME(exp2f) (v_f32_t x)
-{
- v_f32_t n, r, r2, scale, p, q, poly;
- v_u32_t cmp, e;
-
-#if WANT_SIMD_EXCEPT
- cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound
- >= BigBound - TinyBound);
- v_f32_t xm = x;
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- specialcase to fix special lanes later. This is only necessary if fenv
- exceptions are to be triggered correctly. */
- if (unlikely (v_any_u32 (cmp)))
- x = v_sel_f32 (cmp, v_f32 (1), x);
-#endif
-
- /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = n + r, with r in [-1/2, 1/2]. */
-#if 0
- v_f32_t z;
- z = x + Shift;
- n = z - Shift;
- r = x - n;
- e = v_as_u32_f32 (z) << 23;
-#else
- n = v_round_f32 (x);
- r = x - n;
- e = v_as_u32_s32 (v_round_s32 (x)) << 23;
-#endif
- scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
-
-#if !WANT_SIMD_EXCEPT
- v_f32_t absn = v_abs_f32 (n);
- cmp = v_cond_u32 (absn > v_f32 (126.0f));
-#endif
-
- r2 = r * r;
- p = v_fma_f32 (C0, r, C1);
- q = v_fma_f32 (C2, r, C3);
- q = v_fma_f32 (p, r2, q);
- p = C4 * r;
- poly = v_fma_f32 (q, r2, p);
-
- if (unlikely (v_any_u32 (cmp)))
-#if WANT_SIMD_EXCEPT
- return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp);
-#else
- return specialcase (poly, n, e, absn, cmp, scale);
-#endif
-
- return v_fma_f32 (poly, scale, scale);
-}
-VPCS_ALIAS
-#endif
diff --git a/math/v_exp2f_1u.c b/math/v_exp2f_1u.c
deleted file mode 100644
index de1a32d54139..000000000000
--- a/math/v_exp2f_1u.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Single-precision vector 2^x function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* maxerr: 0.878 ulp. */
- 0x1.416b5ep-13f, 0x1.5f082ep-10f, 0x1.3b2dep-7f, 0x1.c6af7cp-5f, 0x1.ebfbdcp-3f, 0x1.62e43p-1f
-};
-#define C0 v_f32 (Poly[0])
-#define C1 v_f32 (Poly[1])
-#define C2 v_f32 (Poly[2])
-#define C3 v_f32 (Poly[3])
-#define C4 v_f32 (Poly[4])
-#define C5 v_f32 (Poly[5])
-
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define Ln2hi v_f32 (0x1.62e4p-1f)
-#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn)
-{
- /* 2^n may overflow, break it up into s1*s2. */
- v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
- v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
- v_f32_t s2 = v_as_f32_u32 (e - b);
- v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f));
- v_f32_t r1 = s1 * s1;
- v_f32_t r0 = poly * s1 * s2;
- return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0)));
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(exp2f_1u) (v_f32_t x)
-{
- v_f32_t n, r, scale, poly, absn;
- v_u32_t cmp, e;
-
- /* exp2(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
- x = n + r, with r in [-1/2, 1/2]. */
-#if 0
- v_f32_t z;
- z = x + Shift;
- n = z - Shift;
- r = x - n;
- e = v_as_u32_f32 (z) << 23;
-#else
- n = v_round_f32 (x);
- r = x - n;
- e = v_as_u32_s32 (v_round_s32 (x)) << 23;
-#endif
- scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
- absn = v_abs_f32 (n);
- cmp = v_cond_u32 (absn > v_f32 (126.0f));
- poly = v_fma_f32 (C0, r, C1);
- poly = v_fma_f32 (poly, r, C2);
- poly = v_fma_f32 (poly, r, C3);
- poly = v_fma_f32 (poly, r, C4);
- poly = v_fma_f32 (poly, r, C5);
- poly = v_fma_f32 (poly, r, v_f32 (1.0f));
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (poly, n, e, absn);
- return scale * poly;
-}
-#endif
diff --git a/math/v_expf.c b/math/v_expf.c
deleted file mode 100644
index ade23b2416aa..000000000000
--- a/math/v_expf.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Single-precision vector e^x function.
- *
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* maxerr: 1.45358 +0.5 ulp. */
- 0x1.0e4020p-7f,
- 0x1.573e2ep-5f,
- 0x1.555e66p-3f,
- 0x1.fffdb6p-2f,
- 0x1.ffffecp-1f,
-};
-#define C0 v_f32 (Poly[0])
-#define C1 v_f32 (Poly[1])
-#define C2 v_f32 (Poly[2])
-#define C3 v_f32 (Poly[3])
-#define C4 v_f32 (Poly[4])
-
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define Ln2hi v_f32 (0x1.62e4p-1f)
-#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
-
-#if WANT_SIMD_EXCEPT
-
-#define TinyBound 0x20000000 /* asuint (0x1p-63). */
-#define BigBound 0x42800000 /* asuint (0x1p6). */
-
-VPCS_ATTR
-static NOINLINE v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
-{
- /* If fenv exceptions are to be triggered correctly, fall back to the scalar
- routine to special lanes. */
- return v_call_f32 (expf, x, y, cmp);
-}
-
-#else
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
-{
- /* 2^n may overflow, break it up into s1*s2. */
- v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
- v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
- v_f32_t s2 = v_as_f32_u32 (e - b);
- v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
- v_u32_t r2 = v_as_u32_f32 (s1 * s1);
- v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
- /* Similar to r1 but avoids double rounding in the subnormal range. */
- v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
- return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
-}
-
-#endif
-
-VPCS_ATTR
-v_f32_t
-V_NAME(expf) (v_f32_t x)
-{
- v_f32_t n, r, r2, scale, p, q, poly, z;
- v_u32_t cmp, e;
-
-#if WANT_SIMD_EXCEPT
- cmp = v_cond_u32 ((v_as_u32_f32 (x) & 0x7fffffff) - TinyBound
- >= BigBound - TinyBound);
- v_f32_t xm = x;
- /* If any lanes are special, mask them with 1 and retain a copy of x to allow
- specialcase to fix special lanes later. This is only necessary if fenv
- exceptions are to be triggered correctly. */
- if (unlikely (v_any_u32 (cmp)))
- x = v_sel_f32 (cmp, v_f32 (1), x);
-#endif
-
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
-#if 1
- z = v_fma_f32 (x, InvLn2, Shift);
- n = z - Shift;
- r = v_fma_f32 (n, -Ln2hi, x);
- r = v_fma_f32 (n, -Ln2lo, r);
- e = v_as_u32_f32 (z) << 23;
-#else
- z = x * InvLn2;
- n = v_round_f32 (z);
- r = v_fma_f32 (n, -Ln2hi, x);
- r = v_fma_f32 (n, -Ln2lo, r);
- e = v_as_u32_s32 (v_round_s32 (z)) << 23;
-#endif
- scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
-
-#if !WANT_SIMD_EXCEPT
- v_f32_t absn = v_abs_f32 (n);
- cmp = v_cond_u32 (absn > v_f32 (126.0f));
-#endif
-
- r2 = r * r;
- p = v_fma_f32 (C0, r, C1);
- q = v_fma_f32 (C2, r, C3);
- q = v_fma_f32 (p, r2, q);
- p = C4 * r;
- poly = v_fma_f32 (q, r2, p);
-
- if (unlikely (v_any_u32 (cmp)))
-#if WANT_SIMD_EXCEPT
- return specialcase (xm, v_fma_f32 (poly, scale, scale), cmp);
-#else
- return specialcase (poly, n, e, absn, cmp, scale);
-#endif
-
- return v_fma_f32 (poly, scale, scale);
-}
-VPCS_ALIAS
-#endif
diff --git a/math/v_expf_1u.c b/math/v_expf_1u.c
deleted file mode 100644
index 8f0ae91c582a..000000000000
--- a/math/v_expf_1u.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Single-precision vector e^x function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* maxerr: 0.36565 +0.5 ulp. */
- 0x1.6a6000p-10f,
- 0x1.12718ep-7f,
- 0x1.555af0p-5f,
- 0x1.555430p-3f,
- 0x1.fffff4p-2f,
-};
-#define C0 v_f32 (Poly[0])
-#define C1 v_f32 (Poly[1])
-#define C2 v_f32 (Poly[2])
-#define C3 v_f32 (Poly[3])
-#define C4 v_f32 (Poly[4])
-
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define Ln2hi v_f32 (0x1.62e4p-1f)
-#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn)
-{
- /* 2^n may overflow, break it up into s1*s2. */
- v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x83000000);
- v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
- v_f32_t s2 = v_as_f32_u32 (e - b);
- v_u32_t cmp = v_cond_u32 (absn > v_f32 (192.0f));
- v_f32_t r1 = s1 * s1;
- v_f32_t r0 = poly * s1 * s2;
- return v_as_f32_u32 ((cmp & v_as_u32_f32 (r1)) | (~cmp & v_as_u32_f32 (r0)));
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(expf_1u) (v_f32_t x)
-{
- v_f32_t n, r, scale, poly, absn, z;
- v_u32_t cmp, e;
-
- /* exp(x) = 2^n * poly(r), with poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
-#if 1
- z = v_fma_f32 (x, InvLn2, Shift);
- n = z - Shift;
- r = v_fma_f32 (n, -Ln2hi, x);
- r = v_fma_f32 (n, -Ln2lo, r);
- e = v_as_u32_f32 (z) << 23;
-#else
- z = x * InvLn2;
- n = v_round_f32 (z);
- r = v_fma_f32 (n, -Ln2hi, x);
- r = v_fma_f32 (n, -Ln2lo, r);
- e = v_as_u32_s32 (v_round_s32 (z)) << 23;
-#endif
- scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
- absn = v_abs_f32 (n);
- cmp = v_cond_u32 (absn > v_f32 (126.0f));
- poly = v_fma_f32 (C0, r, C1);
- poly = v_fma_f32 (poly, r, C2);
- poly = v_fma_f32 (poly, r, C3);
- poly = v_fma_f32 (poly, r, C4);
- poly = v_fma_f32 (poly, r, v_f32 (1.0f));
- poly = v_fma_f32 (poly, r, v_f32 (1.0f));
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (poly, n, e, absn);
- return scale * poly;
-}
-#endif
diff --git a/math/v_log.c b/math/v_log.c
deleted file mode 100644
index 47a829119b3c..000000000000
--- a/math/v_log.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Double-precision vector log(x) function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#include "v_log.h"
-#if V_SUPPORTED
-
-/* Worst-case error: 1.17 + 0.5 ulp. */
-
-static const f64_t Poly[] = {
- /* rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
- -0x1.ffffffffffff7p-2,
- 0x1.55555555170d4p-2,
- -0x1.0000000399c27p-2,
- 0x1.999b2e90e94cap-3,
- -0x1.554e550bd501ep-3,
-};
-
-#define A0 v_f64 (Poly[0])
-#define A1 v_f64 (Poly[1])
-#define A2 v_f64 (Poly[2])
-#define A3 v_f64 (Poly[3])
-#define A4 v_f64 (Poly[4])
-#define Ln2 v_f64 (0x1.62e42fefa39efp-1)
-#define N (1 << V_LOG_TABLE_BITS)
-#define OFF v_u64 (0x3fe6900900000000)
-
-struct entry
-{
- v_f64_t invc;
- v_f64_t logc;
-};
-
-static inline struct entry
-lookup (v_u64_t i)
-{
- struct entry e;
-#ifdef SCALAR
- e.invc = __v_log_data[i].invc;
- e.logc = __v_log_data[i].logc;
-#else
- e.invc[0] = __v_log_data[i[0]].invc;
- e.logc[0] = __v_log_data[i[0]].logc;
- e.invc[1] = __v_log_data[i[1]].invc;
- e.logc[1] = __v_log_data[i[1]].logc;
-#endif
- return e;
-}
-
-VPCS_ATTR
-__attribute__ ((noinline)) static v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
-{
- return v_call_f64 (log, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f64_t
-V_NAME(log) (v_f64_t x)
-{
- v_f64_t z, r, r2, p, y, kd, hi;
- v_u64_t ix, iz, tmp, top, i, cmp;
- v_s64_t k;
- struct entry e;
-
- ix = v_as_u64_f64 (x);
- top = ix >> 48;
- cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010));
-
- /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
- The range is split into N subintervals.
- The ith subinterval contains z and c is near its center. */
- tmp = ix - OFF;
- i = (tmp >> (52 - V_LOG_TABLE_BITS)) % N;
- k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift */
- iz = ix - (tmp & v_u64 (0xfffULL << 52));
- z = v_as_f64_u64 (iz);
- e = lookup (i);
-
- /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
- r = v_fma_f64 (z, e.invc, v_f64 (-1.0));
- kd = v_to_f64_s64 (k);
-
- /* hi = r + log(c) + k*Ln2. */
- hi = v_fma_f64 (kd, Ln2, e.logc + r);
- /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- r2 = r * r;
- y = v_fma_f64 (A3, r, A2);
- p = v_fma_f64 (A1, r, A0);
- y = v_fma_f64 (A4, r2, y);
- y = v_fma_f64 (y, r2, p);
- y = v_fma_f64 (y, r2, hi);
-
- if (unlikely (v_any_u64 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-#endif
diff --git a/math/v_log.h b/math/v_log.h
deleted file mode 100644
index a37bbc2bd6b6..000000000000
--- a/math/v_log.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Declarations for double-precision log(x) vector function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#if WANT_VMATH
-
-#define V_LOG_TABLE_BITS 7
-
-extern const struct v_log_data
-{
- f64_t invc;
- f64_t logc;
-} __v_log_data[1 << V_LOG_TABLE_BITS] HIDDEN;
-#endif
diff --git a/math/v_log_data.c b/math/v_log_data.c
deleted file mode 100644
index ec1c8e5e16b2..000000000000
--- a/math/v_log_data.c
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Lookup table for double-precision log(x) vector function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_log.h"
-#if WANT_VMATH
-
-#define N (1 << V_LOG_TABLE_BITS)
-
-/* Algorithm:
-
- x = 2^k z
- log(x) = k ln2 + log(c) + poly(z/c - 1)
-
-where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128)
-and log(c) and 1/c for the ith subinterval comes from a lookup table:
-
- tab[i].invc = 1/c
- tab[i].logc = (double)log(c)
-
-where c is near the center of the subinterval and is chosen by trying several
-floating point invc candidates around 1/center and selecting one for which
-the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval
-that contains 1 and the previous one got tweaked to avoid cancellation. */
-const struct v_log_data __v_log_data[N] = {
-{0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2},
-{0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2},
-{0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2},
-{0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2},
-{0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2},
-{0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2},
-{0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2},
-{0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2},
-{0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2},
-{0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2},
-{0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2},
-{0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2},
-{0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2},
-{0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2},
-{0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2},
-{0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2},
-{0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2},
-{0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2},
-{0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2},
-{0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3},
-{0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3},
-{0x1.446f12b278001p+0, -0x1.e52e160484698p-3},
-{0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3},
-{0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3},
-{0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3},
-{0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3},
-{0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3},
-{0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3},
-{0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3},
-{0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3},
-{0x1.36987540fbf53p+0, -0x1.8be843d796044p-3},
-{0x1.352166b648f61p+0, -0x1.82395ecc477edp-3},
-{0x1.33adddb3eb575p+0, -0x1.7896240966422p-3},
-{0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3},
-{0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3},
-{0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3},
-{0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3},
-{0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3},
-{0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3},
-{0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3},
-{0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3},
-{0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3},
-{0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3},
-{0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3},
-{0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3},
-{0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4},
-{0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4},
-{0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4},
-{0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4},
-{0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4},
-{0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4},
-{0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4},
-{0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4},
-{0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4},
-{0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4},
-{0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4},
-{0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4},
-{0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4},
-{0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4},
-{0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4},
-{0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5},
-{0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5},
-{0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5},
-{0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5},
-{0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5},
-{0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5},
-{0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5},
-{0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5},
-{0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6},
-{0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6},
-{0x1.05193497a7cc5p+0, -0x1.43183683400acp-6},
-{0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6},
-{0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7},
-{0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7},
-{0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9},
-{1.0, 0.0},
-{0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8},
-{0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7},
-{0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6},
-{0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6},
-{0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5},
-{0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5},
-{0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5},
-{0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5},
-{0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4},
-{0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4},
-{0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4},
-{0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4},
-{0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4},
-{0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4},
-{0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4},
-{0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4},
-{0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4},
-{0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3},
-{0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3},
-{0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3},
-{0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3},
-{0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3},
-{0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3},
-{0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3},
-{0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3},
-{0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3},
-{0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3},
-{0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3},
-{0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3},
-{0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3},
-{0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3},
-{0x1.9998e1480b618p-1, 0x1.c903161240163p-3},
-{0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3},
-{0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3},
-{0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3},
-{0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3},
-{0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2},
-{0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2},
-{0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2},
-{0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2},
-{0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2},
-{0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2},
-{0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2},
-{0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2},
-{0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2},
-{0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2},
-{0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2},
-{0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2},
-{0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2},
-{0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2},
-{0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2},
-{0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2},
-};
-#endif
diff --git a/math/v_logf.c b/math/v_logf.c
deleted file mode 100644
index 93a53758bff7..000000000000
--- a/math/v_logf.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Single-precision vector log function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* 3.34 ulp error */
- -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f,
- -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f,
-};
-#define P7 v_f32 (Poly[0])
-#define P6 v_f32 (Poly[1])
-#define P5 v_f32 (Poly[2])
-#define P4 v_f32 (Poly[3])
-#define P3 v_f32 (Poly[4])
-#define P2 v_f32 (Poly[5])
-#define P1 v_f32 (Poly[6])
-
-#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */
-#define Min v_u32 (0x00800000)
-#define Max v_u32 (0x7f800000)
-#define Mask v_u32 (0x007fffff)
-#define Off v_u32 (0x3f2aaaab) /* 0.666667 */
-
-VPCS_ATTR
-__attribute__ ((noinline)) static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
-{
- /* Fall back to scalar code. */
- return v_call_f32 (logf, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(logf) (v_f32_t x)
-{
- v_f32_t n, p, q, r, r2, y;
- v_u32_t u, cmp;
-
- u = v_as_u32_f32 (x);
- cmp = v_cond_u32 (u - Min >= Max - Min);
-
- /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3 */
- u -= Off;
- n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend */
- u &= Mask;
- u += Off;
- r = v_as_f32_u32 (u) - v_f32 (1.0f);
-
- /* y = log(1+r) + n*ln2. */
- r2 = r * r;
- /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
- p = v_fma_f32 (P6, r, P5);
- q = v_fma_f32 (P4, r, P3);
- y = v_fma_f32 (P2, r, P1);
- p = v_fma_f32 (P7, r2, p);
- q = v_fma_f32 (p, r2, q);
- y = v_fma_f32 (q, r2, y);
- p = v_fma_f32 (Ln2, n, r);
- y = v_fma_f32 (y, r2, p);
-
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-#endif
diff --git a/math/v_math.h b/math/v_math.h
deleted file mode 100644
index 3289916187d2..000000000000
--- a/math/v_math.h
+++ /dev/null
@@ -1,661 +0,0 @@
-/*
- * Vector math abstractions.
- *
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef _V_MATH_H
-#define _V_MATH_H
-
-#ifndef WANT_VMATH
-/* Enable the build of vector math code. */
-# define WANT_VMATH 1
-#endif
-#if WANT_VMATH
-
-/* The goal of this header is to allow vector and scalar
- build of the same algorithm, the provided intrinsic
- wrappers are also vector length agnostic so they can
- be implemented for SVE too (or other simd architectures)
- and then the code should work on those targets too. */
-
-#if SCALAR
-#define V_NAME(x) __s_##x
-#elif VPCS && __aarch64__
-#define V_NAME(x) __vn_##x
-#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
-#else
-#define V_NAME(x) __v_##x
-#endif
-
-#ifndef VPCS_ATTR
-#define VPCS_ATTR
-#endif
-#ifndef VPCS_ALIAS
-#define VPCS_ALIAS
-#endif
-
-#include <stdint.h>
-#include "math_config.h"
-
-typedef float f32_t;
-typedef uint32_t u32_t;
-typedef int32_t s32_t;
-typedef double f64_t;
-typedef uint64_t u64_t;
-typedef int64_t s64_t;
-
-/* reinterpret as type1 from type2. */
-static inline u32_t
-as_u32_f32 (f32_t x)
-{
- union { f32_t f; u32_t u; } r = {x};
- return r.u;
-}
-static inline f32_t
-as_f32_u32 (u32_t x)
-{
- union { u32_t u; f32_t f; } r = {x};
- return r.f;
-}
-static inline s32_t
-as_s32_u32 (u32_t x)
-{
- union { u32_t u; s32_t i; } r = {x};
- return r.i;
-}
-static inline u32_t
-as_u32_s32 (s32_t x)
-{
- union { s32_t i; u32_t u; } r = {x};
- return r.u;
-}
-static inline u64_t
-as_u64_f64 (f64_t x)
-{
- union { f64_t f; u64_t u; } r = {x};
- return r.u;
-}
-static inline f64_t
-as_f64_u64 (u64_t x)
-{
- union { u64_t u; f64_t f; } r = {x};
- return r.f;
-}
-static inline s64_t
-as_s64_u64 (u64_t x)
-{
- union { u64_t u; s64_t i; } r = {x};
- return r.i;
-}
-static inline u64_t
-as_u64_s64 (s64_t x)
-{
- union { s64_t i; u64_t u; } r = {x};
- return r.u;
-}
-
-#if SCALAR
-#define V_SUPPORTED 1
-typedef f32_t v_f32_t;
-typedef u32_t v_u32_t;
-typedef s32_t v_s32_t;
-typedef f64_t v_f64_t;
-typedef u64_t v_u64_t;
-typedef s64_t v_s64_t;
-
-static inline int
-v_lanes32 (void)
-{
- return 1;
-}
-
-static inline v_f32_t
-v_f32 (f32_t x)
-{
- return x;
-}
-static inline v_u32_t
-v_u32 (u32_t x)
-{
- return x;
-}
-static inline v_s32_t
-v_s32 (s32_t x)
-{
- return x;
-}
-
-static inline f32_t
-v_get_f32 (v_f32_t x, int i)
-{
- return x;
-}
-static inline u32_t
-v_get_u32 (v_u32_t x, int i)
-{
- return x;
-}
-static inline s32_t
-v_get_s32 (v_s32_t x, int i)
-{
- return x;
-}
-
-static inline void
-v_set_f32 (v_f32_t *x, int i, f32_t v)
-{
- *x = v;
-}
-static inline void
-v_set_u32 (v_u32_t *x, int i, u32_t v)
-{
- *x = v;
-}
-static inline void
-v_set_s32 (v_s32_t *x, int i, s32_t v)
-{
- *x = v;
-}
-
-/* true if any elements of a v_cond result is non-zero. */
-static inline int
-v_any_u32 (v_u32_t x)
-{
- return x != 0;
-}
-/* to wrap the result of relational operators. */
-static inline v_u32_t
-v_cond_u32 (v_u32_t x)
-{
- return x ? -1 : 0;
-}
-static inline v_f32_t
-v_abs_f32 (v_f32_t x)
-{
- return __builtin_fabsf (x);
-}
-static inline v_f32_t
-v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
-{
- return __builtin_fmaf (x, y, z);
-}
-static inline v_f32_t
-v_round_f32 (v_f32_t x)
-{
- return __builtin_roundf (x);
-}
-static inline v_s32_t
-v_round_s32 (v_f32_t x)
-{
- return __builtin_lroundf (x); /* relies on -fno-math-errno. */
-}
-static inline v_f32_t
-v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
-{
- return p ? x : y;
-}
-/* convert to type1 from type2. */
-static inline v_f32_t
-v_to_f32_s32 (v_s32_t x)
-{
- return x;
-}
-static inline v_f32_t
-v_to_f32_u32 (v_u32_t x)
-{
- return x;
-}
-/* reinterpret as type1 from type2. */
-static inline v_u32_t
-v_as_u32_f32 (v_f32_t x)
-{
- union { v_f32_t f; v_u32_t u; } r = {x};
- return r.u;
-}
-static inline v_f32_t
-v_as_f32_u32 (v_u32_t x)
-{
- union { v_u32_t u; v_f32_t f; } r = {x};
- return r.f;
-}
-static inline v_s32_t
-v_as_s32_u32 (v_u32_t x)
-{
- union { v_u32_t u; v_s32_t i; } r = {x};
- return r.i;
-}
-static inline v_u32_t
-v_as_u32_s32 (v_s32_t x)
-{
- union { v_s32_t i; v_u32_t u; } r = {x};
- return r.u;
-}
-static inline v_f32_t
-v_lookup_f32 (const f32_t *tab, v_u32_t idx)
-{
- return tab[idx];
-}
-static inline v_u32_t
-v_lookup_u32 (const u32_t *tab, v_u32_t idx)
-{
- return tab[idx];
-}
-static inline v_f32_t
-v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
-{
- return f (x);
-}
-static inline v_f32_t
-v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
- v_u32_t p)
-{
- return f (x1, x2);
-}
-
-static inline int
-v_lanes64 (void)
-{
- return 1;
-}
-static inline v_f64_t
-v_f64 (f64_t x)
-{
- return x;
-}
-static inline v_u64_t
-v_u64 (u64_t x)
-{
- return x;
-}
-static inline v_s64_t
-v_s64 (s64_t x)
-{
- return x;
-}
-static inline f64_t
-v_get_f64 (v_f64_t x, int i)
-{
- return x;
-}
-static inline void
-v_set_f64 (v_f64_t *x, int i, f64_t v)
-{
- *x = v;
-}
-/* true if any elements of a v_cond result is non-zero. */
-static inline int
-v_any_u64 (v_u64_t x)
-{
- return x != 0;
-}
-/* to wrap the result of relational operators. */
-static inline v_u64_t
-v_cond_u64 (v_u64_t x)
-{
- return x ? -1 : 0;
-}
-static inline v_f64_t
-v_abs_f64 (v_f64_t x)
-{
- return __builtin_fabs (x);
-}
-static inline v_f64_t
-v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
-{
- return __builtin_fma (x, y, z);
-}
-static inline v_f64_t
-v_round_f64 (v_f64_t x)
-{
- return __builtin_round (x);
-}
-static inline v_s64_t
-v_round_s64 (v_f64_t x)
-{
- return __builtin_lround (x); /* relies on -fno-math-errno. */
-}
-static inline v_f64_t
-v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
-{
- return p ? x : y;
-}
-/* convert to type1 from type2. */
-static inline v_f64_t
-v_to_f64_s64 (v_s64_t x)
-{
- return x;
-}
-static inline v_f64_t
-v_to_f64_u64 (v_u64_t x)
-{
- return x;
-}
-/* reinterpret as type1 from type2. */
-static inline v_u64_t
-v_as_u64_f64 (v_f64_t x)
-{
- union { v_f64_t f; v_u64_t u; } r = {x};
- return r.u;
-}
-static inline v_f64_t
-v_as_f64_u64 (v_u64_t x)
-{
- union { v_u64_t u; v_f64_t f; } r = {x};
- return r.f;
-}
-static inline v_s64_t
-v_as_s64_u64 (v_u64_t x)
-{
- union { v_u64_t u; v_s64_t i; } r = {x};
- return r.i;
-}
-static inline v_u64_t
-v_as_u64_s64 (v_s64_t x)
-{
- union { v_s64_t i; v_u64_t u; } r = {x};
- return r.u;
-}
-static inline v_f64_t
-v_lookup_f64 (const f64_t *tab, v_u64_t idx)
-{
- return tab[idx];
-}
-static inline v_u64_t
-v_lookup_u64 (const u64_t *tab, v_u64_t idx)
-{
- return tab[idx];
-}
-static inline v_f64_t
-v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
-{
- return f (x);
-}
-
-#elif __aarch64__
-#define V_SUPPORTED 1
-#include <arm_neon.h>
-typedef float32x4_t v_f32_t;
-typedef uint32x4_t v_u32_t;
-typedef int32x4_t v_s32_t;
-typedef float64x2_t v_f64_t;
-typedef uint64x2_t v_u64_t;
-typedef int64x2_t v_s64_t;
-
-static inline int
-v_lanes32 (void)
-{
- return 4;
-}
-
-static inline v_f32_t
-v_f32 (f32_t x)
-{
- return (v_f32_t){x, x, x, x};
-}
-static inline v_u32_t
-v_u32 (u32_t x)
-{
- return (v_u32_t){x, x, x, x};
-}
-static inline v_s32_t
-v_s32 (s32_t x)
-{
- return (v_s32_t){x, x, x, x};
-}
-
-static inline f32_t
-v_get_f32 (v_f32_t x, int i)
-{
- return x[i];
-}
-static inline u32_t
-v_get_u32 (v_u32_t x, int i)
-{
- return x[i];
-}
-static inline s32_t
-v_get_s32 (v_s32_t x, int i)
-{
- return x[i];
-}
-
-static inline void
-v_set_f32 (v_f32_t *x, int i, f32_t v)
-{
- (*x)[i] = v;
-}
-static inline void
-v_set_u32 (v_u32_t *x, int i, u32_t v)
-{
- (*x)[i] = v;
-}
-static inline void
-v_set_s32 (v_s32_t *x, int i, s32_t v)
-{
- (*x)[i] = v;
-}
-
-/* true if any elements of a v_cond result is non-zero. */
-static inline int
-v_any_u32 (v_u32_t x)
-{
- /* assume elements in x are either 0 or -1u. */
- return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
-}
-/* to wrap the result of relational operators. */
-static inline v_u32_t
-v_cond_u32 (v_u32_t x)
-{
- return x;
-}
-static inline v_f32_t
-v_abs_f32 (v_f32_t x)
-{
- return vabsq_f32 (x);
-}
-static inline v_f32_t
-v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
-{
- return vfmaq_f32 (z, x, y);
-}
-static inline v_f32_t
-v_round_f32 (v_f32_t x)
-{
- return vrndaq_f32 (x);
-}
-static inline v_s32_t
-v_round_s32 (v_f32_t x)
-{
- return vcvtaq_s32_f32 (x);
-}
-static inline v_f32_t
-v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
-{
- return vbslq_f32 (p, x, y);
-}
-/* convert to type1 from type2. */
-static inline v_f32_t
-v_to_f32_s32 (v_s32_t x)
-{
- return (v_f32_t){x[0], x[1], x[2], x[3]};
-}
-static inline v_f32_t
-v_to_f32_u32 (v_u32_t x)
-{
- return (v_f32_t){x[0], x[1], x[2], x[3]};
-}
-/* reinterpret as type1 from type2. */
-static inline v_u32_t
-v_as_u32_f32 (v_f32_t x)
-{
- union { v_f32_t f; v_u32_t u; } r = {x};
- return r.u;
-}
-static inline v_f32_t
-v_as_f32_u32 (v_u32_t x)
-{
- union { v_u32_t u; v_f32_t f; } r = {x};
- return r.f;
-}
-static inline v_s32_t
-v_as_s32_u32 (v_u32_t x)
-{
- union { v_u32_t u; v_s32_t i; } r = {x};
- return r.i;
-}
-static inline v_u32_t
-v_as_u32_s32 (v_s32_t x)
-{
- union { v_s32_t i; v_u32_t u; } r = {x};
- return r.u;
-}
-static inline v_f32_t
-v_lookup_f32 (const f32_t *tab, v_u32_t idx)
-{
- return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
-}
-static inline v_u32_t
-v_lookup_u32 (const u32_t *tab, v_u32_t idx)
-{
- return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
-}
-static inline v_f32_t
-v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
-{
- return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
- p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
-}
-static inline v_f32_t
-v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
- v_u32_t p)
-{
- return (
- v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1],
- p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
-}
-
-static inline int
-v_lanes64 (void)
-{
- return 2;
-}
-static inline v_f64_t
-v_f64 (f64_t x)
-{
- return (v_f64_t){x, x};
-}
-static inline v_u64_t
-v_u64 (u64_t x)
-{
- return (v_u64_t){x, x};
-}
-static inline v_s64_t
-v_s64 (s64_t x)
-{
- return (v_s64_t){x, x};
-}
-static inline f64_t
-v_get_f64 (v_f64_t x, int i)
-{
- return x[i];
-}
-static inline void
-v_set_f64 (v_f64_t *x, int i, f64_t v)
-{
- (*x)[i] = v;
-}
-/* true if any elements of a v_cond result is non-zero. */
-static inline int
-v_any_u64 (v_u64_t x)
-{
- /* assume elements in x are either 0 or -1u. */
- return vpaddd_u64 (x) != 0;
-}
-/* to wrap the result of relational operators. */
-static inline v_u64_t
-v_cond_u64 (v_u64_t x)
-{
- return x;
-}
-static inline v_f64_t
-v_abs_f64 (v_f64_t x)
-{
- return vabsq_f64 (x);
-}
-static inline v_f64_t
-v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
-{
- return vfmaq_f64 (z, x, y);
-}
-static inline v_f64_t
-v_round_f64 (v_f64_t x)
-{
- return vrndaq_f64 (x);
-}
-static inline v_s64_t
-v_round_s64 (v_f64_t x)
-{
- return vcvtaq_s64_f64 (x);
-}
-static inline v_f64_t
-v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
-{
- return vbslq_f64 (p, x, y);
-}
-/* convert to type1 from type2. */
-static inline v_f64_t
-v_to_f64_s64 (v_s64_t x)
-{
- return (v_f64_t){x[0], x[1]};
-}
-static inline v_f64_t
-v_to_f64_u64 (v_u64_t x)
-{
- return (v_f64_t){x[0], x[1]};
-}
-/* reinterpret as type1 from type2. */
-static inline v_u64_t
-v_as_u64_f64 (v_f64_t x)
-{
- union { v_f64_t f; v_u64_t u; } r = {x};
- return r.u;
-}
-static inline v_f64_t
-v_as_f64_u64 (v_u64_t x)
-{
- union { v_u64_t u; v_f64_t f; } r = {x};
- return r.f;
-}
-static inline v_s64_t
-v_as_s64_u64 (v_u64_t x)
-{
- union { v_u64_t u; v_s64_t i; } r = {x};
- return r.i;
-}
-static inline v_u64_t
-v_as_u64_s64 (v_s64_t x)
-{
- union { v_s64_t i; v_u64_t u; } r = {x};
- return r.u;
-}
-static inline v_f64_t
-v_lookup_f64 (const f64_t *tab, v_u64_t idx)
-{
- return (v_f64_t){tab[idx[0]], tab[idx[1]]};
-}
-static inline v_u64_t
-v_lookup_u64 (const u64_t *tab, v_u64_t idx)
-{
- return (v_u64_t){tab[idx[0]], tab[idx[1]]};
-}
-static inline v_f64_t
-v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
-{
- return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
-}
-#endif
-
-#endif
-#endif
diff --git a/math/v_pow.c b/math/v_pow.c
deleted file mode 100644
index 05a83aaa8c0a..000000000000
--- a/math/v_pow.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Double-precision vector pow function.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-VPCS_ATTR
-v_f64_t
-V_NAME(pow) (v_f64_t x, v_f64_t y)
-{
- v_f64_t z;
- for (int lane = 0; lane < v_lanes64 (); lane++)
- {
- f64_t sx = v_get_f64 (x, lane);
- f64_t sy = v_get_f64 (y, lane);
- f64_t sz = pow (sx, sy);
- v_set_f64 (&z, lane, sz);
- }
- return z;
-}
-VPCS_ALIAS
-#endif
diff --git a/math/v_powf.c b/math/v_powf.c
deleted file mode 100644
index ad8ab8d4f00d..000000000000
--- a/math/v_powf.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * Single-precision vector powf function.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-#define Min v_u32 (0x00800000)
-#define Max v_u32 (0x7f800000)
-#define SBITS 5
-#define Tlog v__powf_log2_data.tab
-#define Texp v__exp2f_data.tab
-#define A v__powf_log2_data.poly
-#define C v__exp2f_data.poly
-#define LOGDEG 4
-
-#if LOGDEG == 5
-/* 1.01 ulp */
-#define OFF v_u32 (0x3f330000)
-#define TBITS 4
-#elif LOGDEG == 4
-/* 2.6 ulp ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2) */
-#define OFF v_u32 (0x3f35d000)
-#define TBITS 5
-#endif
-
-#define V_EXP2F_TABLE_BITS SBITS
-#define V_EXP2F_POLY_ORDER 3
-struct v_exp2f_data
-{
- uint64_t tab[1 << V_EXP2F_TABLE_BITS];
- double poly[V_EXP2F_POLY_ORDER];
-};
-
-#define V_POWF_LOG2_TABLE_BITS TBITS
-#define V_POWF_LOG2_POLY_ORDER LOGDEG
-#define SCALE ((double) (1 << SBITS))
-struct v_powf_log2_data
-{
- struct
- {
- double invc, logc;
- } tab[1 << V_POWF_LOG2_TABLE_BITS];
- double poly[V_POWF_LOG2_POLY_ORDER];
-};
-
-static const struct v_powf_log2_data v__powf_log2_data = {
-#if LOGDEG == 5
- .tab = {
-{ 0x1.661ec79f8f3bep+0, -0x1.efec65b963019p-2 * SCALE },
-{ 0x1.571ed4aaf883dp+0, -0x1.b0b6832d4fca4p-2 * SCALE },
-{ 0x1.49539f0f010bp+0, -0x1.7418b0a1fb77bp-2 * SCALE },
-{ 0x1.3c995b0b80385p+0, -0x1.39de91a6dcf7bp-2 * SCALE },
-{ 0x1.30d190c8864a5p+0, -0x1.01d9bf3f2b631p-2 * SCALE },
-{ 0x1.25e227b0b8eap+0, -0x1.97c1d1b3b7afp-3 * SCALE },
-{ 0x1.1bb4a4a1a343fp+0, -0x1.2f9e393af3c9fp-3 * SCALE },
-{ 0x1.12358f08ae5bap+0, -0x1.960cbbf788d5cp-4 * SCALE },
-{ 0x1.0953f419900a7p+0, -0x1.a6f9db6475fcep-5 * SCALE },
-{ 0x1p+0, 0x0p+0 * SCALE },
-{ 0x1.e608cfd9a47acp-1, 0x1.338ca9f24f53dp-4 * SCALE },
-{ 0x1.ca4b31f026aap-1, 0x1.476a9543891bap-3 * SCALE },
-{ 0x1.b2036576afce6p-1, 0x1.e840b4ac4e4d2p-3 * SCALE },
-{ 0x1.9c2d163a1aa2dp-1, 0x1.40645f0c6651cp-2 * SCALE },
-{ 0x1.886e6037841edp-1, 0x1.88e9c2c1b9ff8p-2 * SCALE },
-{ 0x1.767dcf5534862p-1, 0x1.ce0a44eb17bccp-2 * SCALE },
- },
-/* rel err: 1.46 * 2^-32 */
- .poly = {
-0x1.27616c9496e0bp-2 * SCALE, -0x1.71969a075c67ap-2 * SCALE,
-0x1.ec70a6ca7baddp-2 * SCALE, -0x1.7154748bef6c8p-1 * SCALE,
-0x1.71547652ab82bp0 * SCALE,
- }
-#elif LOGDEG == 4
- .tab = {
-{0x1.6489890582816p+0, -0x1.e960f97b22702p-2 * SCALE},
-{0x1.5cf19b35e3472p+0, -0x1.c993406cd4db6p-2 * SCALE},
-{0x1.55aac0e956d65p+0, -0x1.aa711d9a7d0f3p-2 * SCALE},
-{0x1.4eb0022977e01p+0, -0x1.8bf37bacdce9bp-2 * SCALE},
-{0x1.47fcccda1dd1fp+0, -0x1.6e13b3519946ep-2 * SCALE},
-{0x1.418ceabab68c1p+0, -0x1.50cb8281e4089p-2 * SCALE},
-{0x1.3b5c788f1edb3p+0, -0x1.341504a237e2bp-2 * SCALE},
-{0x1.3567de48e9c9ap+0, -0x1.17eaab624ffbbp-2 * SCALE},
-{0x1.2fabc80fd19bap+0, -0x1.f88e708f8c853p-3 * SCALE},
-{0x1.2a25200ce536bp+0, -0x1.c24b6da113914p-3 * SCALE},
-{0x1.24d108e0152e3p+0, -0x1.8d02ee397cb1dp-3 * SCALE},
-{0x1.1facd8ab2fbe1p+0, -0x1.58ac1223408b3p-3 * SCALE},
-{0x1.1ab614a03efdfp+0, -0x1.253e6fd190e89p-3 * SCALE},
-{0x1.15ea6d03af9ffp+0, -0x1.e5641882c12ffp-4 * SCALE},
-{0x1.1147b994bb776p+0, -0x1.81fea712926f7p-4 * SCALE},
-{0x1.0ccbf650593aap+0, -0x1.203e240de64a3p-4 * SCALE},
-{0x1.0875408477302p+0, -0x1.8029b86a78281p-5 * SCALE},
-{0x1.0441d42a93328p+0, -0x1.85d713190fb9p-6 * SCALE},
-{0x1p+0, 0x0p+0 * SCALE},
-{0x1.f1d006c855e86p-1, 0x1.4c1cc07312997p-5 * SCALE},
-{0x1.e28c3341aa301p-1, 0x1.5e1848ccec948p-4 * SCALE},
-{0x1.d4bdf9aa64747p-1, 0x1.04cfcb7f1196fp-3 * SCALE},
-{0x1.c7b45a24e5803p-1, 0x1.582813d463c21p-3 * SCALE},
-{0x1.bb5f5eb2ed60ap-1, 0x1.a936fa68760ccp-3 * SCALE},
-{0x1.afb0bff8fe6b4p-1, 0x1.f81bc31d6cc4ep-3 * SCALE},
-{0x1.a49badf7ab1f5p-1, 0x1.2279a09fae6b1p-2 * SCALE},
-{0x1.9a14a111fc4c9p-1, 0x1.47ec0b6df5526p-2 * SCALE},
-{0x1.901131f5b2fdcp-1, 0x1.6c71762280f1p-2 * SCALE},
-{0x1.8687f73f6d865p-1, 0x1.90155070798dap-2 * SCALE},
-{0x1.7d7067eb77986p-1, 0x1.b2e23b1d3068cp-2 * SCALE},
-{0x1.74c2c1cf97b65p-1, 0x1.d4e21b0daa86ap-2 * SCALE},
-{0x1.6c77f37cff2a1p-1, 0x1.f61e2a2f67f3fp-2 * SCALE},
- },
-/* rel err: 1.5 * 2^-30 */
- .poly = {
- -0x1.6ff5daa3b3d7cp-2 * SCALE,
- 0x1.ec81d03c01aebp-2 * SCALE,
- -0x1.71547bb43f101p-1 * SCALE,
- 0x1.7154764a815cbp0 * SCALE,
- }
-#endif
-};
-
-static const struct v_exp2f_data v__exp2f_data = {
- .tab = {
-0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51,
-0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1,
-0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
-0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585,
-0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13,
-0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
-0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069,
-0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
- },
-/* rel err: 1.69 * 2^-34 */
- .poly = {
-0x1.c6af84b912394p-5/SCALE/SCALE/SCALE, 0x1.ebfce50fac4f3p-3/SCALE/SCALE, 0x1.62e42ff0c52d6p-1/SCALE
- },
-};
-
-VPCS_ATTR
-__attribute__ ((noinline)) static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_f32_t ret, v_u32_t cmp)
-{
- return v_call2_f32 (powf, x, y, ret, cmp);
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(powf) (v_f32_t x, v_f32_t y)
-{
- v_u32_t u, tmp, cmp, i, top, iz;
- v_s32_t k;
- v_f32_t ret;
-
- u = v_as_u32_f32 (x);
- cmp = v_cond_u32 (u - Min >= Max - Min);
- tmp = u - OFF;
- i = (tmp >> (23 - TBITS)) % (1 << TBITS);
- top = tmp & 0xff800000;
- iz = u - top;
- k = v_as_s32_u32 (top) >> (23 - SBITS); /* arithmetic shift */
-
- for (int lane = 0; lane < v_lanes32 (); lane++)
- {
- uint32_t si, siz;
- int32_t sk;
- float sy;
-
- /* Use double precision for each lane. */
- double invc, logc, z, r, p, y0, logx, ylogx, kd, s;
- uint64_t ki, t;
-
- si = v_get_u32 (i, lane);
- siz = v_get_u32 (iz, lane);
- sk = v_get_s32 (k, lane);
- sy = v_get_f32 (y, lane);
-
- invc = Tlog[si].invc;
- logc = Tlog[si].logc;
- z = (double) as_f32_u32 (siz);
-
- /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k */
- r = __builtin_fma (z, invc, -1.0);
- y0 = logc + (double) sk;
-
- /* Polynomial to approximate log1p(r)/ln2. */
-#if LOGDEG == 5
- logx = A[0];
- logx = r * logx + A[1];
- logx = r * logx + A[2];
- logx = r * logx + A[3];
- logx = r * logx + A[4];
- logx = r * logx + y0;
-#elif LOGDEG == 4
- logx = A[0];
- logx = r * logx + A[1];
- logx = r * logx + A[2];
- logx = r * logx + A[3];
- logx = r * logx + y0;
-#endif
- ylogx = sy * logx;
- v_set_u32 (&cmp, lane,
- (as_u64_f64 (ylogx) >> 47 & 0xffff)
- >= as_u64_f64 (126.0 * (1 << SBITS)) >> 47
- ? 1
- : v_get_u32 (cmp, lane));
-
- /* N*x = k + r with r in [-1/2, 1/2] */
-#if TOINT_INTRINSICS
- kd = roundtoint (ylogx); /* k */
- ki = converttoint (ylogx);
-#else
-# define SHIFT 0x1.8p52
- kd = eval_as_double (ylogx + SHIFT);
- ki = asuint64 (kd);
- kd -= SHIFT;
-#endif
- r = ylogx - kd;
-
- /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
- t = Texp[ki % (1 << SBITS)];
- t += ki << (52 - SBITS);
- s = as_f64_u64 (t);
- p = C[0];
- p = __builtin_fma (p, r, C[1]);
- p = __builtin_fma (p, r, C[2]);
- p = __builtin_fma (p, s * r, s);
-
- v_set_f32 (&ret, lane, p);
- }
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (x, y, ret, cmp);
- return ret;
-}
-VPCS_ALIAS
-#endif
diff --git a/math/v_sin.c b/math/v_sin.c
deleted file mode 100644
index 9dbb9dec04de..000000000000
--- a/math/v_sin.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Double-precision vector sin function.
- *
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const double Poly[] = {
-/* worst-case error is 3.5 ulp.
- abs error: 0x1.be222a58p-53 in [-pi/2, pi/2]. */
--0x1.9f4a9c8b21dc9p-41,
- 0x1.60e88a10163f2p-33,
--0x1.ae6361b7254e7p-26,
- 0x1.71de382e8d62bp-19,
--0x1.a01a019aeb4ffp-13,
- 0x1.111111110b25ep-7,
--0x1.55555555554c3p-3,
-};
-
-#define C7 v_f64 (Poly[0])
-#define C6 v_f64 (Poly[1])
-#define C5 v_f64 (Poly[2])
-#define C4 v_f64 (Poly[3])
-#define C3 v_f64 (Poly[4])
-#define C2 v_f64 (Poly[5])
-#define C1 v_f64 (Poly[6])
-
-#define InvPi v_f64 (0x1.45f306dc9c883p-2)
-#define Pi1 v_f64 (0x1.921fb54442d18p+1)
-#define Pi2 v_f64 (0x1.1a62633145c06p-53)
-#define Pi3 v_f64 (0x1.c1cd129024e09p-106)
-#define Shift v_f64 (0x1.8p52)
-#define AbsMask v_u64 (0x7fffffffffffffff)
-
-#if WANT_SIMD_EXCEPT
-#define TinyBound 0x202 /* top12 (asuint64 (0x1p-509)). */
-#define Thresh 0x214 /* top12 (asuint64 (RangeVal)) - TinyBound. */
-#else
-#define RangeVal v_f64 (0x1p23)
-#endif
-
-VPCS_ATTR
-__attribute__ ((noinline)) static v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
-{
- return v_call_f64 (sin, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f64_t
-V_NAME(sin) (v_f64_t x)
-{
- v_f64_t n, r, r2, y;
- v_u64_t sign, odd, cmp, ir;
-
- ir = v_as_u64_f64 (x) & AbsMask;
- r = v_as_f64_u64 (ir);
- sign = v_as_u64_f64 (x) & ~AbsMask;
-
-#if WANT_SIMD_EXCEPT
- /* Detect |x| <= 0x1p-509 or |x| >= RangeVal. If fenv exceptions are to be
- triggered correctly, set any special lanes to 1 (which is neutral w.r.t.
- fenv). These lanes will be fixed by specialcase later. */
- cmp = v_cond_u64 ((ir >> 52) - TinyBound >= Thresh);
- if (unlikely (v_any_u64 (cmp)))
- r = v_sel_f64 (cmp, v_f64 (1), r);
-#else
- cmp = v_cond_u64 (ir >= v_as_u64_f64 (RangeVal));
-#endif
-
- /* n = rint(|x|/pi). */
- n = v_fma_f64 (InvPi, r, Shift);
- odd = v_as_u64_f64 (n) << 63;
- n -= Shift;
-
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = v_fma_f64 (-Pi1, n, r);
- r = v_fma_f64 (-Pi2, n, r);
- r = v_fma_f64 (-Pi3, n, r);
-
- /* sin(r) poly approx. */
- r2 = r * r;
- y = v_fma_f64 (C7, r2, C6);
- y = v_fma_f64 (y, r2, C5);
- y = v_fma_f64 (y, r2, C4);
- y = v_fma_f64 (y, r2, C3);
- y = v_fma_f64 (y, r2, C2);
- y = v_fma_f64 (y, r2, C1);
- y = v_fma_f64 (y * r2, r, r);
-
- /* sign. */
- y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign ^ odd);
-
- if (unlikely (v_any_u64 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-#endif
diff --git a/math/v_sinf.c b/math/v_sinf.c
deleted file mode 100644
index ce35dacc65cf..000000000000
--- a/math/v_sinf.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Single-precision vector sin function.
- *
- * Copyright (c) 2019-2022, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "mathlib.h"
-#include "v_math.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* 1.886 ulp error */
- 0x1.5b2e76p-19f,
- -0x1.9f42eap-13f,
- 0x1.110df4p-7f,
- -0x1.555548p-3f,
-};
-#define Pi1 v_f32 (0x1.921fb6p+1f)
-#define Pi2 v_f32 (-0x1.777a5cp-24f)
-#define Pi3 v_f32 (-0x1.ee59dap-49f)
-#define A3 v_f32 (Poly[3])
-#define A5 v_f32 (Poly[2])
-#define A7 v_f32 (Poly[1])
-#define A9 v_f32 (Poly[0])
-#define RangeVal v_f32 (0x1p20f)
-#define TinyBound v_f32 (0x1p-61f)
-#define InvPi v_f32 (0x1.45f306p-2f)
-#define Shift v_f32 (0x1.8p+23f)
-#define AbsMask v_u32 (0x7fffffff)
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
-{
- /* Fall back to scalar code. */
- return v_call_f32 (sinf, x, y, cmp);
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(sinf) (v_f32_t x)
-{
- v_f32_t n, r, r2, y;
- v_u32_t sign, odd, cmp, ir;
-
- ir = v_as_u32_f32 (x) & AbsMask;
- r = v_as_f32_u32 (ir);
- sign = v_as_u32_f32 (x) & ~AbsMask;
-
-#if WANT_SIMD_EXCEPT
- cmp = v_cond_u32 ((ir - v_as_u32_f32 (TinyBound)
- >= v_as_u32_f32 (RangeVal) - v_as_u32_f32 (TinyBound)));
- if (unlikely (v_any_u32 (cmp)))
- /* If fenv exceptions are to be triggered correctly, set any special lanes
- to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
- specialcase later. */
- r = v_sel_f32 (cmp, v_f32 (1), r);
-#else
- cmp = v_cond_u32 (ir >= v_as_u32_f32 (RangeVal));
-#endif
-
- /* n = rint(|x|/pi) */
- n = v_fma_f32 (InvPi, r, Shift);
- odd = v_as_u32_f32 (n) << 31;
- n -= Shift;
-
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
- r = v_fma_f32 (-Pi1, n, r);
- r = v_fma_f32 (-Pi2, n, r);
- r = v_fma_f32 (-Pi3, n, r);
-
- /* y = sin(r) */
- r2 = r * r;
- y = v_fma_f32 (A9, r2, A7);
- y = v_fma_f32 (y, r2, A5);
- y = v_fma_f32 (y, r2, A3);
- y = v_fma_f32 (y * r2, r, r);
-
- /* sign fix */
- y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign ^ odd);
-
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-#endif
diff --git a/math/vn_cos.c b/math/vn_cos.c
deleted file mode 100644
index 4b5b23718a8b..000000000000
--- a/math/vn_cos.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_cos.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_cos, _ZGVnN2v_cos)
-#include "v_cos.c"
-#endif
diff --git a/math/vn_cosf.c b/math/vn_cosf.c
deleted file mode 100644
index 86dd26ecb3e7..000000000000
--- a/math/vn_cosf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_cosf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_cosf, _ZGVnN4v_cosf)
-#include "v_cosf.c"
-#endif
diff --git a/math/vn_exp.c b/math/vn_exp.c
deleted file mode 100644
index 0d85b17de05a..000000000000
--- a/math/vn_exp.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_exp.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_exp, _ZGVnN2v_exp)
-#include "v_exp.c"
-#endif
diff --git a/math/vn_exp2f.c b/math/vn_exp2f.c
deleted file mode 100644
index da3bb40ae93f..000000000000
--- a/math/vn_exp2f.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_exp2f.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_exp2f, _ZGVnN4v_exp2f)
-#include "v_exp2f.c"
-#endif
diff --git a/math/vn_exp2f_1u.c b/math/vn_exp2f_1u.c
deleted file mode 100644
index 3e3a24705614..000000000000
--- a/math/vn_exp2f_1u.c
+++ /dev/null
@@ -1,11 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_exp2f_1u.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#include "v_exp2f_1u.c"
-#endif
diff --git a/math/vn_expf.c b/math/vn_expf.c
deleted file mode 100644
index 6e91a940bbf4..000000000000
--- a/math/vn_expf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_expf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf)
-#include "v_expf.c"
-#endif
diff --git a/math/vn_expf_1u.c b/math/vn_expf_1u.c
deleted file mode 100644
index 57ae6a315b9b..000000000000
--- a/math/vn_expf_1u.c
+++ /dev/null
@@ -1,11 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_expf_1u.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#include "v_expf_1u.c"
-#endif
diff --git a/math/vn_log.c b/math/vn_log.c
deleted file mode 100644
index 902bff1fcd4e..000000000000
--- a/math/vn_log.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_log.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_log, _ZGVnN2v_log)
-#include "v_log.c"
-#endif
diff --git a/math/vn_logf.c b/math/vn_logf.c
deleted file mode 100644
index 07e493685b4d..000000000000
--- a/math/vn_logf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_logf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_logf, _ZGVnN4v_logf)
-#include "v_logf.c"
-#endif
diff --git a/math/vn_pow.c b/math/vn_pow.c
deleted file mode 100644
index 1a980ff6bf2f..000000000000
--- a/math/vn_pow.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_pow.
- *
- * Copyright (c) 2020, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_pow, _ZGVnN2vv_pow)
-#include "v_pow.c"
-#endif
diff --git a/math/vn_powf.c b/math/vn_powf.c
deleted file mode 100644
index a42ade371adc..000000000000
--- a/math/vn_powf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_powf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_powf, _ZGVnN4vv_powf)
-#include "v_powf.c"
-#endif
diff --git a/math/vn_sin.c b/math/vn_sin.c
deleted file mode 100644
index 64b05c8ca0eb..000000000000
--- a/math/vn_sin.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_sin.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_sin, _ZGVnN2v_sin)
-#include "v_sin.c"
-#endif
diff --git a/math/vn_sinf.c b/math/vn_sinf.c
deleted file mode 100644
index 6e880c60dc39..000000000000
--- a/math/vn_sinf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_sinf.
- *
- * Copyright (c) 2019, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_sinf, _ZGVnN4v_sinf)
-#include "v_sinf.c"
-#endif
diff --git a/pl/math/Dir.mk b/pl/math/Dir.mk
index be65344572a8..94b26cf3309c 100644
--- a/pl/math/Dir.mk
+++ b/pl/math/Dir.mk
@@ -1,13 +1,18 @@
# Makefile fragment - requires GNU make
#
-# Copyright (c) 2019-2023, Arm Limited.
+# Copyright (c) 2019-2024, Arm Limited.
# SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
PLM := $(srcdir)/pl/math
AOR := $(srcdir)/math
B := build/pl/math
-math-lib-srcs := $(wildcard $(PLM)/*.[cS])
+pl-lib-srcs := $(wildcard $(PLM)/*.[cS])
+
+ifeq ($(WANT_SVE_MATH), 0)
+pl-lib-srcs := $(filter-out $(PLM)/sv_%, $(pl-lib-srcs))
+endif
+
math-test-srcs := \
$(AOR)/test/mathtest.c \
$(AOR)/test/mathbench.c \
@@ -15,10 +20,10 @@ math-test-srcs := \
math-test-host-srcs := $(wildcard $(AOR)/test/rtest/*.[cS])
-math-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h))
-math-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h))
+pl-includes := $(patsubst $(PLM)/%,build/pl/%,$(wildcard $(PLM)/include/*.h))
+pl-test-includes := $(patsubst $(PLM)/%,build/pl/include/%,$(wildcard $(PLM)/test/*.h))
-math-libs := \
+pl-libs := \
build/pl/lib/libmathlib.so \
build/pl/lib/libmathlib.a \
@@ -32,37 +37,39 @@ math-tools := \
math-host-tools := \
build/pl/bin/rtest \
-math-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(math-lib-srcs)))
+pl-lib-objs := $(patsubst $(PLM)/%,$(B)/%.o,$(basename $(pl-lib-srcs)))
math-test-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-srcs)))
math-host-objs := $(patsubst $(AOR)/%,$(B)/%.o,$(basename $(math-test-host-srcs)))
-math-target-objs := $(math-lib-objs) $(math-test-objs)
-math-objs := $(math-target-objs) $(math-target-objs:%.o=%.os) $(math-host-objs)
+pl-target-objs := $(pl-lib-objs) $(math-test-objs)
+pl-objs := $(pl-target-objs) $(pl-target-objs:%.o=%.os) $(math-host-objs)
pl/math-files := \
- $(math-objs) \
- $(math-libs) \
+ $(pl-objs) \
+ $(pl-libs) \
$(math-tools) \
$(math-host-tools) \
- $(math-includes) \
- $(math-test-includes) \
+ $(pl-includes) \
+ $(pl-test-includes) \
-all-pl/math: $(math-libs) $(math-tools) $(math-includes) $(math-test-includes)
+all-pl/math: $(pl-libs) $(math-tools) $(pl-includes) $(pl-test-includes)
-$(math-objs): $(math-includes) $(math-test-includes)
-$(math-objs): CFLAGS_PL += $(math-cflags)
+$(pl-objs): $(pl-includes) $(pl-test-includes)
+$(pl-objs): CFLAGS_PL += $(math-cflags)
$(B)/test/mathtest.o: CFLAGS_PL += -fmath-errno
$(math-host-objs): CC = $(HOST_CC)
$(math-host-objs): CFLAGS_PL = $(HOST_CFLAGS)
-build/pl/include/test/ulp_funcs_gen.h: $(math-lib-srcs)
+$(B)/sv_%: CFLAGS_PL += $(math-sve-cflags)
+
+build/pl/include/test/ulp_funcs_gen.h: $(pl-lib-srcs)
# Replace PL_SIG
cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f)" -P > $@
-build/pl/include/test/mathbench_funcs_gen.h: $(math-lib-srcs)
+build/pl/include/test/mathbench_funcs_gen.h: $(pl-lib-srcs)
# Replace PL_SIG macros with mathbench func entries
cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=_Z##v##t##a(f, ##__VA_ARGS__)" -P > $@
-build/pl/include/test/ulp_wrappers_gen.h: $(math-lib-srcs)
+build/pl/include/test/ulp_wrappers_gen.h: $(pl-lib-srcs)
# Replace PL_SIG macros with ULP wrapper declarations
cat $^ | grep PL_SIG | $(CC) -xc - -o - -E "-DPL_SIG(v, t, a, f, ...)=Z##v##N##t##a##_WRAP(f)" -P > $@
@@ -72,16 +79,18 @@ $(B)/test/ulp.o: CFLAGS_PL += -I build/pl/include/test
$(B)/test/mathbench.o: build/pl/include/test/mathbench_funcs_gen.h
$(B)/test/mathbench.o: CFLAGS_PL += -I build/pl/include/test
-build/pl/lib/libmathlib.so: $(math-lib-objs:%.o=%.os)
+build/pl/lib/libmathlib.so: $(pl-lib-objs:%.o=%.os)
$(CC) $(CFLAGS_PL) $(LDFLAGS) -shared -o $@ $^
-build/pl/lib/libmathlib.a: $(math-lib-objs)
+build/pl/lib/libmathlib.a: $(pl-lib-objs)
rm -f $@
$(AR) rc $@ $^
$(RANLIB) $@
$(math-host-tools): HOST_LDLIBS += -lm -lmpfr -lmpc
$(math-tools): LDLIBS += $(math-ldlibs) -lm
+# math-sve-cflags should be empty if WANT_SVE_MATH is not enabled
+$(math-tools): CFLAGS_PL += $(math-sve-cflags)
# Some targets to build pl/math/test from math/test sources
build/pl/math/test/%.o: $(srcdir)/math/test/%.S
@@ -145,12 +154,11 @@ check-pl/math-rtest: $(math-host-tools) $(math-tools)
ulp-input-dir=$(B)/test/inputs
-math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(math-lib-srcs)))
-math-lib-aliases = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.alias,$(basename $(math-lib-srcs)))
-math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(math-lib-srcs)))
-math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(math-lib-srcs)))
+math-lib-lims = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.ulp,$(basename $(pl-lib-srcs)))
+math-lib-fenvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.fenv,$(basename $(pl-lib-srcs)))
+math-lib-itvs = $(patsubst $(PLM)/%,$(ulp-input-dir)/%.itv,$(basename $(pl-lib-srcs)))
-ulp-inputs = $(math-lib-lims) $(math-lib-aliases) $(math-lib-fenvs) $(math-lib-itvs)
+ulp-inputs = $(math-lib-lims) $(math-lib-fenvs) $(math-lib-itvs)
$(ulp-inputs): CFLAGS_PL += -I$(PLM) -I$(PLM)/include $(math-cflags)
@@ -158,10 +166,6 @@ $(ulp-input-dir)/%.ulp: $(PLM)/%.c
mkdir -p $(@D)
$(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_ULP [^ ]* [^ ]*" || true; } > $@
-$(ulp-input-dir)/%.alias: $(PLM)/%.c
- mkdir -p $(@D)
- $(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep "PL_TEST_ALIAS" || true; } | sed "s/_x / /g"> $@
-
$(ulp-input-dir)/%.fenv: $(PLM)/%.c
mkdir -p $(@D)
$(CC) -I$(PLM)/test $(CFLAGS_PL) $< -o - -E | { grep -o "PL_TEST_EXPECT_FENV_ENABLED [^ ]*" || true; } > $@
@@ -174,38 +178,21 @@ ulp-lims := $(ulp-input-dir)/limits
$(ulp-lims): $(math-lib-lims)
cat $^ | sed "s/PL_TEST_ULP //g;s/^ *//g" > $@
-ulp-aliases := $(ulp-input-dir)/aliases
-$(ulp-aliases): $(math-lib-aliases)
- cat $^ | sed "s/PL_TEST_ALIAS //g;s/^ *//g" > $@
-
fenv-exps := $(ulp-input-dir)/fenv
$(fenv-exps): $(math-lib-fenvs)
cat $^ | sed "s/PL_TEST_EXPECT_FENV_ENABLED //g;s/^ *//g" > $@
-ulp-itvs-noalias := $(ulp-input-dir)/itvs_noalias
-$(ulp-itvs-noalias): $(math-lib-itvs)
- cat $^ > $@
-
-rename-aliases := $(ulp-input-dir)/rename_alias.sed
-$(rename-aliases): $(ulp-aliases)
- # Build sed script for replacing aliases from generated alias file
- cat $< | awk '{ print "s/ " $$1 " / " $$2 " /g" }' > $@
-
-ulp-itvs-alias := $(ulp-input-dir)/itvs_alias
-$(ulp-itvs-alias): $(ulp-itvs-noalias) $(rename-aliases)
- cat $< | sed -f $(rename-aliases) > $@
-
ulp-itvs := $(ulp-input-dir)/intervals
-$(ulp-itvs): $(ulp-itvs-alias) $(ulp-itvs-noalias)
+$(ulp-itvs): $(math-lib-itvs)
cat $^ | sort -u | sed "s/PL_TEST_INTERVAL //g" > $@
-check-pl/math-ulp: $(math-tools) $(ulp-lims) $(ulp-aliases) $(fenv-exps) $(ulp-itvs)
+check-pl/math-ulp: $(math-tools) $(ulp-lims) $(fenv-exps) $(ulp-itvs)
WANT_SVE_MATH=$(WANT_SVE_MATH) \
ULPFLAGS="$(math-ulpflags)" \
LIMITS=../../../$(ulp-lims) \
- ALIASES=../../../$(ulp-aliases) \
INTERVALS=../../../$(ulp-itvs) \
FENV=../../../$(fenv-exps) \
+ FUNC=$(func) \
build/pl/bin/runulp.sh $(EMULATOR)
check-pl/math: check-pl/math-test check-pl/math-rtest check-pl/math-ulp
@@ -220,8 +207,8 @@ $(DESTDIR)$(includedir)/pl/%: build/pl/include/%
$(INSTALL) -m 644 -D $< $@
install-pl/math: \
- $(math-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \
- $(math-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%)
+ $(pl-libs:build/pl/lib/%=$(DESTDIR)$(libdir)/pl/%) \
+ $(pl-includes:build/pl/include/%=$(DESTDIR)$(includedir)/pl/%)
clean-pl/math:
rm -f $(pl/math-files)
diff --git a/pl/math/acos_2u.c b/pl/math/acos_2u.c
new file mode 100644
index 000000000000..9ec6894f1d81
--- /dev/null
+++ b/pl/math/acos_2u.c
@@ -0,0 +1,100 @@
+/*
+ * Double-precision acos(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "poly_scalar_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask (0x7fffffffffffffff)
+#define Half (0x3fe0000000000000)
+#define One (0x3ff0000000000000)
+#define PiOver2 (0x1.921fb54442d18p+0)
+#define Pi (0x1.921fb54442d18p+1)
+#define Small (0x3c90000000000000) /* 2^-54. */
+#define Small16 (0x3c90) /* Top 16 bits of Small. */
+#define QNaN (0x7ff8)
+
+/* Fast implementation of double-precision acos(x) based on polynomial
+ approximation of double-precision asin(x).
+
+ For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-54 for correct
+ rounding.
+
+ For |x| in [Small, 0.5], use the trigonometric identity
+
+ acos(x) = pi/2 - asin(x)
+
+ and use an order 11 polynomial P such that the final approximation of asin is
+ an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
+
+ The largest observed error in this region is 1.18 ulps,
+ acos(0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
+ want 0x1.0d54d1985c069p+0.
+
+ For |x| in [0.5, 1.0], use the following development of acos(x) near x = 1
+
+ acos(x) ~ 2 * sqrt(z) (1 + z * P(z))
+
+ where z = (1-x)/2, z is near 0 when x approaches 1, and P contributes to the
+ approximation of asin near 0.
+
+ The largest observed error in this region is 1.52 ulps,
+ acos(0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1
+ want 0x1.edbbedf8a7d6cp-1.
+
+ For x in [-1.0, -0.5], use this other identity to deduce the negative inputs
+ from their absolute value: acos(x) = pi - acos(-x). */
+double
+acos (double x)
+{
+ uint64_t ix = asuint64 (x);
+ uint64_t ia = ix & AbsMask;
+ uint64_t ia16 = ia >> 48;
+ double ax = asdouble (ia);
+ uint64_t sign = ix & ~AbsMask;
+
+ /* Special values and invalid range. */
+ if (unlikely (ia16 == QNaN))
+ return x;
+ if (ia > One)
+ return __math_invalid (x);
+ if (ia16 < Small16)
+ return PiOver2 - x;
+
+ /* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ double z2 = ax < 0.5 ? x * x : fma (-0.5, ax, 0.5);
+ double z = ax < 0.5 ? ax : sqrt (z2);
+
+ /* Use a single polynomial approximation P for both intervals. */
+ double z4 = z2 * z2;
+ double z8 = z4 * z4;
+ double z16 = z8 * z8;
+ double p = estrin_11_f64 (z2, z4, z8, z16, __asin_poly);
+
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = fma (z * z2, p, z);
+
+ /* acos(x) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+ = pi - 2 Q(|x|), for -1.0 <= x <= -0.5
+ = 2 Q(|x|) , for 0.5 <= x <= 1.0. */
+ if (ax < 0.5)
+ return PiOver2 - asdouble (asuint64 (p) | sign);
+
+ return (x <= -0.5) ? fma (-2.0, p, Pi) : 2.0 * p;
+}
+
+PL_SIG (S, D, 1, acos, -1.0, 1.0)
+PL_TEST_ULP (acos, 1.02)
+PL_TEST_INTERVAL (acos, 0, Small, 5000)
+PL_TEST_INTERVAL (acos, Small, 0.5, 50000)
+PL_TEST_INTERVAL (acos, 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (acos, 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (acos, 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (acos, -0, -inf, 20000)
diff --git a/pl/math/acosf_1u4.c b/pl/math/acosf_1u4.c
new file mode 100644
index 000000000000..6dde422ef85a
--- /dev/null
+++ b/pl/math/acosf_1u4.c
@@ -0,0 +1,99 @@
+/*
+ * Single-precision acos(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "poly_scalar_f32.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask (0x7fffffff)
+#define Half (0x3f000000)
+#define One (0x3f800000)
+#define PiOver2f (0x1.921fb6p+0f)
+#define Pif (0x1.921fb6p+1f)
+#define Small (0x32800000) /* 2^-26. */
+#define Small12 (0x328)
+#define QNaN (0x7fc)
+
+/* Fast implementation of single-precision acos(x) based on polynomial
+ approximation of single-precision asin(x).
+
+ For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
+ rounding.
+
+ For |x| in [Small, 0.5], use the trigonometric identity
+
+ acos(x) = pi/2 - asin(x)
+
+ and use an order 4 polynomial P such that the final approximation of asin is
+ an odd polynomial: asin(x) ~ x + x^3 * P(x^2).
+
+ The largest observed error in this region is 1.16 ulps,
+ acosf(0x1.ffbeccp-2) got 0x1.0c27f8p+0 want 0x1.0c27f6p+0.
+
+ For |x| in [0.5, 1.0], use the following development of acos(x) near x = 1
+
+ acos(x) ~ 2 * sqrt(z) (1 + z * P(z))
+
+ where z = (1-x)/2, z is near 0 when x approaches 1, and P contributes to the
+ approximation of asin near 0.
+
+ The largest observed error in this region is 1.32 ulps,
+ acosf(0x1.15ba56p-1) got 0x1.feb33p-1 want 0x1.feb32ep-1.
+
+ For x in [-1.0, -0.5], use this other identity to deduce the negative inputs
+ from their absolute value.
+
+ acos(x) = pi - acos(-x)
+
+ The largest observed error in this region is 1.28 ulps,
+ acosf(-0x1.002072p-1) got 0x1.0c1e84p+1 want 0x1.0c1e82p+1. */
+float
+acosf (float x)
+{
+ uint32_t ix = asuint (x);
+ uint32_t ia = ix & AbsMask;
+ uint32_t ia12 = ia >> 20;
+ float ax = asfloat (ia);
+ uint32_t sign = ix & ~AbsMask;
+
+ /* Special values and invalid range. */
+ if (unlikely (ia12 == QNaN))
+ return x;
+ if (ia > One)
+ return __math_invalidf (x);
+ if (ia12 < Small12)
+ return PiOver2f - x;
+
+ /* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ float z2 = ax < 0.5 ? x * x : fmaf (-0.5f, ax, 0.5f);
+ float z = ax < 0.5 ? ax : sqrtf (z2);
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float p = horner_4_f32 (z2, __asinf_poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = fmaf (z * z2, p, z);
+
+ /* acos(x) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+ = pi - 2 Q(|x|), for -1.0 <= x <= -0.5
+ = 2 Q(|x|) , for 0.5 <= x <= 1.0. */
+ if (ax < 0.5)
+ return PiOver2f - asfloat (asuint (p) | sign);
+
+ return (x <= -0.5) ? fmaf (-2.0f, p, Pif) : 2.0f * p;
+}
+
+PL_SIG (S, F, 1, acos, -1.0, 1.0)
+PL_TEST_ULP (acosf, 0.82)
+PL_TEST_INTERVAL (acosf, 0, Small, 5000)
+PL_TEST_INTERVAL (acosf, Small, 0.5, 50000)
+PL_TEST_INTERVAL (acosf, 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (acosf, 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (acosf, 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (acosf, -0, -inf, 20000)
diff --git a/pl/math/asin_3u.c b/pl/math/asin_3u.c
new file mode 100644
index 000000000000..0b50995449ce
--- /dev/null
+++ b/pl/math/asin_3u.c
@@ -0,0 +1,106 @@
+/*
+ * Double-precision asin(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "poly_scalar_f64.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask (0x7fffffffffffffff)
+#define Half (0x3fe0000000000000)
+#define One (0x3ff0000000000000)
+#define PiOver2 (0x1.921fb54442d18p+0)
+#define Small (0x3e50000000000000) /* 2^-26. */
+#define Small16 (0x3e50)
+#define QNaN (0x7ff8)
+
+/* Fast implementation of double-precision asin(x) based on polynomial
+ approximation.
+
+ For x < Small, approximate asin(x) by x. Small = 2^-26 for correct rounding.
+
+ For x in [Small, 0.5], use an order 11 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 1.01 ulps,
+ asin(0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2
+ want 0x1.ed78525a927eep-2.
+
+ No cheap approximation can be obtained near x = 1, since the function is not
+ continuously differentiable on 1.
+
+ For x in [0.5, 1.0], we use a method based on a trigonometric identity
+
+ asin(x) = pi/2 - acos(x)
+
+ and a generalized power series expansion of acos(y) near y=1, that reads as
+
+ acos(y)/sqrt(2y) ~ 1 + 1/12 * y + 3/160 * y^2 + ... (1)
+
+ The Taylor series of asin(z) near z = 0, reads as
+
+ asin(z) ~ z + z^3 P(z^2) = z + z^3 * (1/6 + 3/40 z^2 + ...).
+
+ Therefore, (1) can be written in terms of P(y/2) or even asin(y/2)
+
+ acos(y) ~ sqrt(2y) (1 + y/2 * P(y/2)) = 2 * sqrt(y/2) (1 + y/2 * P(y/2))
+
+ Hence, if we write z = (1-x)/2, z is near 0 when x approaches 1 and
+
+ asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)).
+
+ The largest observed error in this region is 2.69 ulps,
+ asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
+ want 0x1.110d7e85fdd53p-1. */
+double
+asin (double x)
+{
+ uint64_t ix = asuint64 (x);
+ uint64_t ia = ix & AbsMask;
+ uint64_t ia16 = ia >> 48;
+ double ax = asdouble (ia);
+ uint64_t sign = ix & ~AbsMask;
+
+ /* Special values and invalid range. */
+ if (unlikely (ia16 == QNaN))
+ return x;
+ if (ia > One)
+ return __math_invalid (x);
+ if (ia16 < Small16)
+ return x;
+
+ /* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ double z2 = ax < 0.5 ? x * x : fma (-0.5, ax, 0.5);
+ double z = ax < 0.5 ? ax : sqrt (z2);
+
+ /* Use a single polynomial approximation P for both intervals. */
+ double z4 = z2 * z2;
+ double z8 = z4 * z4;
+ double z16 = z8 * z8;
+ double p = estrin_11_f64 (z2, z4, z8, z16, __asin_poly);
+
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = fma (z * z2, p, z);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ double y = ax < 0.5 ? p : fma (-2.0, p, PiOver2);
+
+ /* Copy sign. */
+ return asdouble (asuint64 (y) | sign);
+}
+
+PL_SIG (S, D, 1, asin, -1.0, 1.0)
+PL_TEST_ULP (asin, 2.19)
+PL_TEST_INTERVAL (asin, 0, Small, 5000)
+PL_TEST_INTERVAL (asin, Small, 0.5, 50000)
+PL_TEST_INTERVAL (asin, 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (asin, 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (asin, 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (asin, -0, -inf, 20000)
diff --git a/pl/math/asin_data.c b/pl/math/asin_data.c
new file mode 100644
index 000000000000..b5517731c7f4
--- /dev/null
+++ b/pl/math/asin_data.c
@@ -0,0 +1,19 @@
+/*
+ * Coefficients for double-precision asin(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Approximate asin(x) directly in [0x1p-106, 0.25]. See tools/asin.sollya
+ for how these coefficients were generated. */
+const double __asin_poly[] = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+ on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
+ 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5,
+ 0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6,
+ 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8,
+ 0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6,
+};
diff --git a/pl/math/asinf_2u5.c b/pl/math/asinf_2u5.c
new file mode 100644
index 000000000000..ec608146ff66
--- /dev/null
+++ b/pl/math/asinf_2u5.c
@@ -0,0 +1,100 @@
+/*
+ * Single-precision asin(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "poly_scalar_f32.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define AbsMask (0x7fffffff)
+#define Half (0x3f000000)
+#define One (0x3f800000)
+#define PiOver2f (0x1.921fb6p+0f)
+#define Small (0x39800000) /* 2^-12. */
+#define Small12 (0x398)
+#define QNaN (0x7fc)
+
+/* Fast implementation of single-precision asin(x) based on polynomial
+ approximation.
+
+ For x < Small, approximate asin(x) by x. Small = 2^-12 for correct rounding.
+
+ For x in [Small, 0.5], use order 4 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 0.83 ulps,
+ asinf(0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2.
+
+ No cheap approximation can be obtained near x = 1, since the function is not
+ continuously differentiable on 1.
+
+ For x in [0.5, 1.0], we use a method based on a trigonometric identity
+
+ asin(x) = pi/2 - acos(x)
+
+ and a generalized power series expansion of acos(y) near y=1, that reads as
+
+ acos(y)/sqrt(2y) ~ 1 + 1/12 * y + 3/160 * y^2 + ... (1)
+
+ The Taylor series of asin(z) near z = 0, reads as
+
+ asin(z) ~ z + z^3 P(z^2) = z + z^3 * (1/6 + 3/40 z^2 + ...).
+
+ Therefore, (1) can be written in terms of P(y/2) or even asin(y/2)
+
+ acos(y) ~ sqrt(2y) (1 + y/2 * P(y/2)) = 2 * sqrt(y/2) (1 + y/2 * P(y/2))
+
+ Hence, if we write z = (1-x)/2, z is near 0 when x approaches 1 and
+
+ asin(x) ~ pi/2 - acos(x) ~ pi/2 - 2 * sqrt(z) (1 + z * P(z)).
+
+ The largest observed error in this region is 2.41 ulps,
+ asinf(0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */
+float
+asinf (float x)
+{
+ uint32_t ix = asuint (x);
+ uint32_t ia = ix & AbsMask;
+ uint32_t ia12 = ia >> 20;
+ float ax = asfloat (ia);
+ uint32_t sign = ix & ~AbsMask;
+
+ /* Special values and invalid range. */
+ if (unlikely (ia12 == QNaN))
+ return x;
+ if (ia > One)
+ return __math_invalidf (x);
+ if (ia12 < Small12)
+ return x;
+
+ /* Evaluate polynomial Q(|x|) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ float z2 = ax < 0.5 ? x * x : fmaf (-0.5f, ax, 0.5f);
+ float z = ax < 0.5 ? ax : sqrtf (z2);
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float p = horner_4_f32 (z2, __asinf_poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = fmaf (z * z2, p, z);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ float y = ax < 0.5 ? p : fmaf (-2.0f, p, PiOver2f);
+
+ /* Copy sign. */
+ return asfloat (asuint (y) | sign);
+}
+
+PL_SIG (S, F, 1, asin, -1.0, 1.0)
+PL_TEST_ULP (asinf, 1.91)
+PL_TEST_INTERVAL (asinf, 0, Small, 5000)
+PL_TEST_INTERVAL (asinf, Small, 0.5, 50000)
+PL_TEST_INTERVAL (asinf, 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (asinf, 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (asinf, 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (asinf, -0, -inf, 20000)
diff --git a/pl/math/asinf_data.c b/pl/math/asinf_data.c
new file mode 100644
index 000000000000..1652025e2920
--- /dev/null
+++ b/pl/math/asinf_data.c
@@ -0,0 +1,16 @@
+/*
+ * Coefficients for single-precision asin(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Approximate asinf(x) directly in [0x1p-24, 0.25]. See tools/asinf.sollya
+ for how these coeffs were generated. */
+const float __asinf_poly[] = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+ [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
+ 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6, 0x1.3af7d8p-5,
+};
diff --git a/pl/math/asinh_2u5.c b/pl/math/asinh_2u5.c
index f1679556d5f8..b7fc81a2b94f 100644
--- a/pl/math/asinh_2u5.c
+++ b/pl/math/asinh_2u5.c
@@ -4,7 +4,7 @@
* Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "estrin.h"
+#include "poly_scalar_f64.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
@@ -60,8 +60,7 @@ asinh (double x)
double z2 = x2 * x2;
double z4 = z2 * z2;
double z8 = z4 * z4;
-#define C(i) __asinh_data.poly[i]
- double p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C);
+ double p = estrin_17_f64 (x2, z2, z4, z8, z8 * z8, __asinh_data.poly);
double y = fma (p, x2 * ax, ax);
return asdouble (asuint64 (y) | sign);
}
diff --git a/pl/math/asinhf_3u5.c b/pl/math/asinhf_3u5.c
index 2b2c55db56dc..ec26b80ec2ec 100644
--- a/pl/math/asinhf_3u5.c
+++ b/pl/math/asinhf_3u5.c
@@ -5,7 +5,7 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "estrinf.h"
+#include "poly_scalar_f32.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
@@ -16,8 +16,6 @@
#define One (0x3f8)
#define ExpM12 (0x398)
-#define C(i) __asinhf_data.coeffs[i]
-
float
optr_aor_log_f32 (float);
@@ -57,7 +55,7 @@ asinhf (float x)
if (ia12 < One)
{
float x2 = ax * ax;
- float p = ESTRIN_7 (ax, x2, x2 * x2, C);
+ float p = estrin_7_f32 (ax, x2, x2 * x2, __asinhf_data.coeffs);
float y = fmaf (x2, p, ax);
return asfloat (asuint (y) | sign);
}
diff --git a/pl/math/atan_common.h b/pl/math/atan_common.h
index da0da6436854..798cc22cc40a 100644
--- a/pl/math/atan_common.h
+++ b/pl/math/atan_common.h
@@ -1,49 +1,33 @@
/*
- * Double-precision polynomial evaluation function for scalar and vector atan(x)
- * and atan2(y,x).
+ * Double-precision polynomial evaluation function for scalar
+ * atan(x) and atan2(y,x).
*
* Copyright (c) 2021-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-#include "estrin.h"
-
-#if V_SUPPORTED
-
-#include "v_math.h"
-
-#define DBL_T v_f64_t
-#define P(i) v_f64 (__atan_poly_data.poly[i])
-
-#else
-
-#define DBL_T double
-#define P(i) __atan_poly_data.poly[i]
-
-#endif
+#include "poly_scalar_f64.h"
/* Polynomial used in fast atan(x) and atan2(y,x) implementations
The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */
-static inline DBL_T
-eval_poly (DBL_T z, DBL_T az, DBL_T shift)
+static inline double
+eval_poly (double z, double az, double shift)
{
/* Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
full scheme to avoid underflow in x^16. */
- DBL_T z2 = z * z;
- DBL_T x2 = z2 * z2;
- DBL_T x4 = x2 * x2;
- DBL_T x8 = x4 * x4;
- DBL_T y
- = FMA (ESTRIN_11_ (z2, x2, x4, x8, P, 8), x8, ESTRIN_7 (z2, x2, x4, P));
+ double z2 = z * z;
+ double x2 = z2 * z2;
+ double x4 = x2 * x2;
+ double x8 = x4 * x4;
+ double y = fma (estrin_11_f64 (z2, x2, x4, x8, __atan_poly_data.poly + 8),
+ x8, estrin_7_f64 (z2, x2, x4, __atan_poly_data.poly));
/* Finalize. y = shift + z + z^3 * P(z^2). */
- y = FMA (y, z2 * az, az);
+ y = fma (y, z2 * az, az);
y = y + shift;
return y;
}
-#undef DBL_T
-#undef FMA
#undef P
diff --git a/pl/math/atanf_2u9.c b/pl/math/atanf_2u9.c
index 9d17f252b8b9..ba6f68089de1 100644
--- a/pl/math/atanf_2u9.c
+++ b/pl/math/atanf_2u9.c
@@ -66,11 +66,7 @@ atanf (float x)
PL_SIG (S, F, 1, atan, -10.0, 10.0)
PL_TEST_ULP (atanf, 2.38)
-PL_TEST_INTERVAL (atanf, 0, 0x1p-30, 5000)
-PL_TEST_INTERVAL (atanf, -0, -0x1p-30, 5000)
-PL_TEST_INTERVAL (atanf, 0x1p-30, 1, 40000)
-PL_TEST_INTERVAL (atanf, -0x1p-30, -1, 40000)
-PL_TEST_INTERVAL (atanf, 1, 0x1p30, 40000)
-PL_TEST_INTERVAL (atanf, -1, -0x1p30, 40000)
-PL_TEST_INTERVAL (atanf, 0x1p30, inf, 1000)
-PL_TEST_INTERVAL (atanf, -0x1p30, -inf, 1000)
+PL_TEST_SYM_INTERVAL (atanf, 0, 0x1p-30, 5000)
+PL_TEST_SYM_INTERVAL (atanf, 0x1p-30, 1, 40000)
+PL_TEST_SYM_INTERVAL (atanf, 1, 0x1p30, 40000)
+PL_TEST_SYM_INTERVAL (atanf, 0x1p30, inf, 1000)
diff --git a/pl/math/atanf_common.h b/pl/math/atanf_common.h
index 37ca76dee2f7..8952e7e0078b 100644
--- a/pl/math/atanf_common.h
+++ b/pl/math/atanf_common.h
@@ -1,5 +1,5 @@
/*
- * Single-precision polynomial evaluation function for scalar and vector
+ * Single-precision polynomial evaluation function for scalar
* atan(x) and atan2(y,x).
*
* Copyright (c) 2021-2023, Arm Limited.
@@ -10,26 +10,12 @@
#define PL_MATH_ATANF_COMMON_H
#include "math_config.h"
-#include "estrinf.h"
-
-#if V_SUPPORTED
-
-#include "v_math.h"
-
-#define FLT_T v_f32_t
-#define P(i) v_f32 (__atanf_poly_data.poly[i])
-
-#else
-
-#define FLT_T float
-#define P(i) __atanf_poly_data.poly[i]
-
-#endif
+#include "poly_scalar_f32.h"
/* Polynomial used in fast atanf(x) and atan2f(y,x) implementations
The order 7 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */
-static inline FLT_T
-eval_poly (FLT_T z, FLT_T az, FLT_T shift)
+static inline float
+eval_poly (float z, float az, float shift)
{
/* Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
a standard implementation using z8 creates spurious underflow
@@ -37,15 +23,16 @@ eval_poly (FLT_T z, FLT_T az, FLT_T shift)
Therefore, we split the last fma into a mul and and an fma.
Horner and single-level Estrin have higher errors that exceed
threshold. */
- FLT_T z2 = z * z;
- FLT_T z4 = z2 * z2;
+ float z2 = z * z;
+ float z4 = z2 * z2;
/* Then assemble polynomial. */
- FLT_T y = FMA (z4, z4 * ESTRIN_3_ (z2, z4, P, 4), ESTRIN_3 (z2, z4, P));
-
+ float y = fmaf (
+ z4, z4 * pairwise_poly_3_f32 (z2, z4, __atanf_poly_data.poly + 4),
+ pairwise_poly_3_f32 (z2, z4, __atanf_poly_data.poly));
/* Finalize:
y = shift + z * P(z^2). */
- return FMA (y, z2 * az, az) + shift;
+ return fmaf (y, z2 * az, az) + shift;
}
#endif // PL_MATH_ATANF_COMMON_H
diff --git a/pl/math/atanh_3u.c b/pl/math/atanh_3u.c
index a168cd555ff6..dcfbe8192a22 100644
--- a/pl/math/atanh_3u.c
+++ b/pl/math/atanh_3u.c
@@ -6,7 +6,7 @@
*/
#include "math_config.h"
-#include "estrin.h"
+#include "poly_scalar_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
@@ -20,7 +20,6 @@
#define OneTop12 0x3ff
#define HfRt2Top 0x3fe6a09e /* top32(asuint64(sqrt(2)/2)). */
#define BottomMask 0xffffffff
-#define C(i) __log1p_data.coeffs[i]
static inline double
log1p_inline (double x)
@@ -46,7 +45,8 @@ log1p_inline (double x)
double f2 = f * f;
double f4 = f2 * f2;
double f8 = f4 * f4;
- double p = fma (f, ESTRIN_18 (f, f2, f4, f8, f8 * f8, C) * f, f);
+ double p = fma (
+ f, estrin_18_f64 (f, f2, f4, f8, f8 * f8, __log1p_data.coeffs) * f, f);
/* Recombine log1p(x) = k*log2 + log1p(f) + c/m. */
double kd = k;
@@ -78,9 +78,6 @@ atanh (double x)
PL_SIG (S, D, 1, atanh, -1.0, 1.0)
PL_TEST_ULP (atanh, 3.00)
-PL_TEST_INTERVAL (atanh, 0, 0x1p-23, 10000)
-PL_TEST_INTERVAL (atanh, -0, -0x1p-23, 10000)
-PL_TEST_INTERVAL (atanh, 0x1p-23, 1, 90000)
-PL_TEST_INTERVAL (atanh, -0x1p-23, -1, 90000)
-PL_TEST_INTERVAL (atanh, 1, inf, 100)
-PL_TEST_INTERVAL (atanh, -1, -inf, 100)
+PL_TEST_SYM_INTERVAL (atanh, 0, 0x1p-23, 10000)
+PL_TEST_SYM_INTERVAL (atanh, 0x1p-23, 1, 90000)
+PL_TEST_SYM_INTERVAL (atanh, 1, inf, 100)
diff --git a/pl/math/atanhf_3u1.c b/pl/math/atanhf_3u1.c
index fb90aa29c7a3..e99d5a9900a9 100644
--- a/pl/math/atanhf_3u1.c
+++ b/pl/math/atanhf_3u1.c
@@ -15,7 +15,8 @@
#define One 0x3f800000
#define Four 0x40800000
#define Ln2 0x1.62e43p-1f
-#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */
+/* asuint(0x1p-12), below which atanhf(x) rounds to x. */
+#define TinyBound 0x39800000
#define C(i) __log1pf_data.coeffs[i]
@@ -80,9 +81,6 @@ atanhf (float x)
PL_SIG (S, F, 1, atanh, -1.0, 1.0)
PL_TEST_ULP (atanhf, 2.59)
-PL_TEST_INTERVAL (atanhf, 0, 0x1p-12, 500)
-PL_TEST_INTERVAL (atanhf, 0x1p-12, 1, 200000)
-PL_TEST_INTERVAL (atanhf, 1, inf, 1000)
-PL_TEST_INTERVAL (atanhf, -0, -0x1p-12, 500)
-PL_TEST_INTERVAL (atanhf, -0x1p-12, -1, 200000)
-PL_TEST_INTERVAL (atanhf, -1, -inf, 1000)
+PL_TEST_SYM_INTERVAL (atanhf, 0, 0x1p-12, 500)
+PL_TEST_SYM_INTERVAL (atanhf, 0x1p-12, 1, 200000)
+PL_TEST_SYM_INTERVAL (atanhf, 1, inf, 1000)
diff --git a/pl/math/cbrt_2u.c b/pl/math/cbrt_2u.c
index 83715dd18a3e..80be83c4470c 100644
--- a/pl/math/cbrt_2u.c
+++ b/pl/math/cbrt_2u.c
@@ -31,7 +31,7 @@ cbrt (double x)
uint64_t iax = ix & AbsMask;
uint64_t sign = ix & ~AbsMask;
- if (unlikely (iax == 0 || iax == 0x7f80000000000000))
+ if (unlikely (iax == 0 || iax == 0x7ff0000000000000))
return x;
/* |x| = m * 2^e, where m is in [0.5, 1.0].
@@ -66,5 +66,4 @@ cbrt (double x)
}
PL_TEST_ULP (cbrt, 1.30)
-PL_TEST_INTERVAL (cbrt, 0, inf, 1000000)
-PL_TEST_INTERVAL (cbrt, -0, -inf, 1000000)
+PL_TEST_SYM_INTERVAL (cbrt, 0, inf, 1000000)
diff --git a/pl/math/cbrtf_1u5.c b/pl/math/cbrtf_1u5.c
index adc591786a6a..88fcb7162ef6 100644
--- a/pl/math/cbrtf_1u5.c
+++ b/pl/math/cbrtf_1u5.c
@@ -5,7 +5,7 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "estrinf.h"
+#include "poly_scalar_f32.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
@@ -14,7 +14,6 @@
#define SignMask 0x80000000
#define TwoThirds 0x1.555556p-1f
-#define C(i) __cbrtf_data.poly[i]
#define T(i) __cbrtf_data.table[i]
/* Approximation for single-precision cbrt(x), using low-order polynomial and
@@ -41,7 +40,8 @@ cbrtf (float x)
/* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
the less accurate the next stage of the algorithm needs to be. An order-4
polynomial is enough for one Newton iteration. */
- float p = ESTRIN_3 (m, m * m, C);
+ float p = pairwise_poly_3_f32 (m, m * m, __cbrtf_data.poly);
+
/* One iteration of Newton's method for iteratively approximating cbrt. */
float m_by_3 = m / 3;
float a = fmaf (TwoThirds, p, m_by_3 / (p * p));
@@ -63,5 +63,4 @@ cbrtf (float x)
PL_SIG (S, F, 1, cbrt, -10.0, 10.0)
PL_TEST_ULP (cbrtf, 1.03)
-PL_TEST_INTERVAL (cbrtf, 0, inf, 1000000)
-PL_TEST_INTERVAL (cbrtf, -0, -inf, 1000000)
+PL_TEST_SYM_INTERVAL (cbrtf, 0, inf, 1000000)
diff --git a/pl/math/cosh_2u.c b/pl/math/cosh_2u.c
index 5d1df0717453..2240a9c56f15 100644
--- a/pl/math/cosh_2u.c
+++ b/pl/math/cosh_2u.c
@@ -58,9 +58,6 @@ cosh (double x)
PL_SIG (S, D, 1, cosh, -10.0, 10.0)
PL_TEST_ULP (cosh, 1.43)
-PL_TEST_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000)
-PL_TEST_INTERVAL (cosh, -0, -0x1.61da04cbafe44p+9, 100000)
-PL_TEST_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000)
-PL_TEST_INTERVAL (cosh, -0x1.61da04cbafe44p+9, -0x1p10, 1000)
-PL_TEST_INTERVAL (cosh, 0x1p10, inf, 100)
-PL_TEST_INTERVAL (cosh, -0x1p10, -inf, 100)
+PL_TEST_SYM_INTERVAL (cosh, 0, 0x1.61da04cbafe44p+9, 100000)
+PL_TEST_SYM_INTERVAL (cosh, 0x1.61da04cbafe44p+9, 0x1p10, 1000)
+PL_TEST_SYM_INTERVAL (cosh, 0x1p10, inf, 100)
diff --git a/pl/math/coshf_1u9.c b/pl/math/coshf_1u9.c
index c125c929aa77..cf737840e0d6 100644
--- a/pl/math/coshf_1u9.c
+++ b/pl/math/coshf_1u9.c
@@ -63,9 +63,6 @@ coshf (float x)
PL_SIG (S, F, 1, cosh, -10.0, 10.0)
PL_TEST_ULP (coshf, 1.89)
-PL_TEST_INTERVAL (coshf, 0, 0x1p-63, 100)
-PL_TEST_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000)
-PL_TEST_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000)
-PL_TEST_INTERVAL (coshf, -0, -0x1p-63, 100)
-PL_TEST_INTERVAL (coshf, -0, -0x1.5a92d8p+6, 80000)
-PL_TEST_INTERVAL (coshf, -0x1.5a92d8p+6, -inf, 2000)
+PL_TEST_SYM_INTERVAL (coshf, 0, 0x1p-63, 100)
+PL_TEST_SYM_INTERVAL (coshf, 0, 0x1.5a92d8p+6, 80000)
+PL_TEST_SYM_INTERVAL (coshf, 0x1.5a92d8p+6, inf, 2000)
diff --git a/pl/math/cospi_3u1.c b/pl/math/cospi_3u1.c
new file mode 100644
index 000000000000..4a688a076829
--- /dev/null
+++ b/pl/math/cospi_3u1.c
@@ -0,0 +1,89 @@
+/*
+ * Double-precision scalar cospi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_scalar_f64.h"
+
+/* Taylor series coefficients for sin(pi * x).
+ C2 coefficient (originally ~=5.16771278) has been split into two parts:
+ C2_hi = 4, C2_lo = C2 - C2_hi (~=1.16771278)
+ This change in magnitude reduces floating point rounding errors.
+ C2_hi is then reintroduced after the polynomial approximation. */
+static const double poly[]
+ = { 0x1.921fb54442d184p1, -0x1.2aef39896f94bp0, 0x1.466bc6775ab16p1,
+ -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
+ 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, 0x1.af86ae521260bp-21,
+ -0x1.012a9870eeb7dp-25 };
+
+#define Shift 0x1.8p+52
+
+/* Approximation for scalar double-precision cospi(x).
+ Maximum error: 3.13 ULP:
+ cospi(0x1.160b129300112p-21) got 0x1.fffffffffd16bp-1
+ want 0x1.fffffffffd16ep-1. */
+double
+cospi (double x)
+{
+ if (isinf (x))
+ return __math_invalid (x);
+
+ double ax = asdouble (asuint64 (x) & ~0x8000000000000000);
+
+ /* Edge cases for when cospi should be exactly 1. (Even integers)
+ From 0x1p53 upwards every double-precision value is an even integer. */
+ if (ax >= 0x1p53)
+ return 1;
+
+ /* If x is an integer, return +- 1, based upon if x is odd. */
+ uint64_t m = (uint64_t) ax;
+ if (m == ax)
+ return (m & 1) ? -1 : 1;
+
+ /* For very small inputs, squaring r causes underflow.
+ Values below this threshold can be approximated via
+ cospi(x) ~= 1. */
+ if (ax < 0x1p-63)
+ return 1;
+
+ /* Any non-integer values >= 0x1p51 will be int +0.5.
+ These values should return exactly 0. */
+ if (ax >= 0x1p51)
+ return 0;
+
+ /* n = rint(|x|). */
+ double n = ax + Shift;
+ uint64_t sign = asuint64 (n) << 63;
+ n = n - Shift;
+
+ /* We know that cospi(x) = sinpi(0.5 - x)
+ range reduction and offset into sinpi range -1/2 .. 1/2
+ r = 0.5 - |x - rint(x)|. */
+ double r = 0.5 - fabs (ax - n);
+
+ /* y = sin(pi * r). */
+ double r2 = r * r;
+ double y = horner_9_f64 (r2, poly);
+ y = y * r;
+
+ /* Reintroduce C2_hi. */
+ y = fma (-4 * r2, r, y);
+
+ /* As all values are reduced to -1/2 .. 1/2, the result of cos(x) will always
+ be positive, therefore, the sign must be introduced based upon if x rounds
+ to odd or even. */
+ return asdouble (asuint64 (y) ^ sign);
+}
+
+PL_SIG (S, D, 1, cospi, -0.9, 0.9)
+PL_TEST_ULP (cospi, 2.63)
+PL_TEST_SYM_INTERVAL (cospi, 0, 0x1p-63, 5000)
+PL_TEST_SYM_INTERVAL (cospi, 0x1p-63, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (cospi, 0.5, 0x1p51f, 10000)
+PL_TEST_SYM_INTERVAL (cospi, 0x1p51f, inf, 10000)
diff --git a/pl/math/cospif_2u6.c b/pl/math/cospif_2u6.c
new file mode 100644
index 000000000000..d78a98ed8b2d
--- /dev/null
+++ b/pl/math/cospif_2u6.c
@@ -0,0 +1,84 @@
+/*
+ * Single-precision scalar cospi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* Taylor series coefficents for sin(pi * x). */
+#define C0 0x1.921fb6p1f
+#define C1 -0x1.4abbcep2f
+#define C2 0x1.466bc6p1f
+#define C3 -0x1.32d2ccp-1f
+#define C4 0x1.50783p-4f
+#define C5 -0x1.e30750p-8f
+
+#define Shift 0x1.0p+23f
+
+/* Approximation for scalar single-precision cospi(x) - cospif.
+ Maximum error: 2.64 ULP:
+ cospif(0x1.37e844p-4) got 0x1.f16b3p-1
+ want 0x1.f16b2ap-1. */
+float
+cospif (float x)
+{
+ if (isinf (x))
+ return __math_invalidf (x);
+
+ float ax = asfloat (asuint (x) & ~0x80000000);
+
+ /* Edge cases for when cospif should be exactly 1. (Even integers)
+ From 0x1p24 upwards every single-precision value is an even integer. */
+ if (ax >= 0x1p24f)
+ return 1;
+
+ uint32_t m = roundf (ax);
+ if (m == ax)
+ return (m & 1) ? -1 : 1;
+
+ /* Any non-integer values >= 0x1p22f will be int +0.5.
+ These values should return exactly 0. */
+ if (ax >= 0x1p22f)
+ return 0;
+
+ /* For very small inputs, squaring r causes underflow.
+ Values below this threshold can be approximated via cospi(x) ~= 1 -
+ (pi*x). */
+ if (ax < 0x1p-31f)
+ return 1 - (C0 * x);
+
+ /* n = rint(|x|). */
+ float n = ax + Shift;
+ uint32_t sign = asuint (n) << 31;
+ n = n - Shift;
+
+ /* We know that cospi(x) = sinpi(0.5 - x)
+ range reduction and offset into sinpi range -1/2 .. 1/2
+ r = 0.5 - |x - rint(x)|. */
+ float r = 0.5f - fabs (ax - n);
+
+ /* y = sin(pi * r). */
+ float r2 = r * r;
+ float y = fmaf (C5, r2, C4);
+ y = fmaf (y, r2, C3);
+ y = fmaf (y, r2, C2);
+ y = fmaf (y, r2, C1);
+ y = fmaf (y, r2, C0);
+
+ /* As all values are reduced to -1/2 .. 1/2, the result of cos(x) will always
+ be positive, therefore, the sign must be introduced based upon if x rounds
+ to odd or even. */
+ return asfloat (asuint (y * r) ^ sign);
+}
+
+PL_SIG (S, F, 1, cospi, -0.9, 0.9)
+PL_TEST_ULP (cospif, 2.15)
+PL_TEST_SYM_INTERVAL (cospif, 0, 0x1p-31, 5000)
+PL_TEST_SYM_INTERVAL (cospif, 0x1p-31, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (cospif, 0.5, 0x1p22f, 10000)
+PL_TEST_SYM_INTERVAL (cospif, 0x1p22f, inf, 10000)
diff --git a/pl/math/erf_2u5.c b/pl/math/erf_2u5.c
new file mode 100644
index 000000000000..3ca2a1332c1f
--- /dev/null
+++ b/pl/math/erf_2u5.c
@@ -0,0 +1,102 @@
+/*
+ * Double-precision erf(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3
+#define Shift 0x1p45
+
+/* Polynomial coefficients. */
+#define OneThird 0x1.5555555555555p-2
+#define TwoThird 0x1.5555555555555p-1
+
+#define TwoOverFifteen 0x1.1111111111111p-3
+#define TwoOverFive 0x1.999999999999ap-2
+#define Tenth 0x1.999999999999ap-4
+
+#define TwoOverNine 0x1.c71c71c71c71cp-3
+#define TwoOverFortyFive 0x1.6c16c16c16c17p-5
+#define Sixth 0x1.555555555555p-3
+
+/* Fast erf approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erf(x) ~ erf(r)
+ + scale * d * [
+ + 1
+ - r d
+ + 1/3 (2 r^2 - 1) d^2
+ - 1/6 (r (2 r^2 - 3)) d^3
+ + 1/30 (4 r^4 - 12 r^2 + 3) d^4
+ - 1/90 (r (4 r^4 - 20 r^2 + 15)) d^5
+ ]
+
+ Maximum measured error: 2.29 ULP
+ erf(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8
+ want -0x1.20dd59132ebafp-8. */
+double
+erf (double x)
+{
+ /* Get absolute value and sign. */
+ uint64_t ix = asuint64 (x);
+ uint64_t ia = ix & 0x7fffffffffffffff;
+ uint64_t sign = ix & ~0x7fffffffffffffff;
+
+ /* |x| < 0x1p-508: return x + (2/sqrt(pi) - 1) * x in one fma. Triggers exceptions. */
+ if (unlikely (ia < 0x2030000000000000))
+ return fma (TwoOverSqrtPiMinusOne, x, x);
+
+ if (ia < 0x4017f80000000000) /* |x| < 6 - 1 / 128 = 5.9921875. */
+ {
+ /* Set r to multiple of 1/128 nearest to |x|. */
+ double a = asdouble (ia);
+ double z = a + Shift;
+ uint64_t i = asuint64 (z) - asuint64 (Shift);
+ double r = z - Shift;
+ /* Lookup erf(r) and scale(r) in table.
+ Set erf(r) to 0 and scale to 2/sqrt(pi) for |x| <= 0x1.cp-9. */
+ double erfr = __erf_data.tab[i].erf;
+ double scale = __erf_data.tab[i].scale;
+
+ /* erf(x) ~ erf(r) + scale * d * poly (d, r). */
+ double d = a - r;
+ double r2 = r * r;
+ double d2 = d * d;
+
+ /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
+ double p1 = -r;
+ double p2 = fma (TwoThird, r2, -OneThird);
+ double p3 = -r * fma (OneThird, r2, -0.5);
+ double p4 = fma (fma (TwoOverFifteen, r2, -TwoOverFive), r2, Tenth);
+ double p5
+ = -r * fma (fma (TwoOverFortyFive, r2, -TwoOverNine), r2, Sixth);
+
+ double p34 = fma (p4, d, p3);
+ double p12 = fma (p2, d, p1);
+ double y = fma (p5, d2, p34);
+ y = fma (y, d2, p12);
+
+ y = fma (fma (y, d2, d), scale, erfr);
+ return asdouble (asuint64 (y) | sign);
+ }
+
+ /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1 (sign >> 62 is 0 or 2; 1/x is +-0 or nan). */
+ if (unlikely (ia >= 0x7ff0000000000000))
+ return (1.0 - (double) (sign >> 62)) + 1.0 / x;
+
+ /* Boring domain (finite |x| >= 5.9921875): return 1.0 with the sign of x. */
+ return asdouble (sign | asuint64 (1.0));
+}
+
+PL_SIG (S, D, 1, erf, -6.0, 6.0)
+PL_TEST_ULP (erf, 1.79)
+PL_TEST_SYM_INTERVAL (erf, 0, 5.9921875, 40000)
+PL_TEST_SYM_INTERVAL (erf, 5.9921875, inf, 40000)
+PL_TEST_SYM_INTERVAL (erf, 0, inf, 40000)
diff --git a/pl/math/erf_data.c b/pl/math/erf_data.c
new file mode 100644
index 000000000000..138e03578e77
--- /dev/null
+++ b/pl/math/erf_data.c
@@ -0,0 +1,788 @@
+/*
+ * Data for approximation of erf.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Lookup table used in erf.
+ For each possible rounded input r (multiples of 1/128), between
+ r = 0.0 and r = 6.0 (769 values):
+ - the first entry __erf_data.tab.erf contains the values of erf(r),
+ - the second entry __erf_data.tab.scale contains the values of
+ 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
+ algorithm, since lookup is performed only for x >= 1/64-1/512. */
+const struct erf_data __erf_data = {
+ .tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 },
+ { 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 },
+ { 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 },
+ { 0x1.b137e0cf584dcp-6, 0x1.20b4d8bac36c1p+0 },
+ { 0x1.20c5645dd2538p-5, 0x1.209546ad13ccfp+0 },
+ { 0x1.68e5d3bbc9526p-5, 0x1.206cb4897b148p+0 },
+ { 0x1.b0fafef135745p-5, 0x1.203b261cd0052p+0 },
+ { 0x1.f902a77bd3821p-5, 0x1.2000a00ae3804p+0 },
+ { 0x1.207d480e90658p-4, 0x1.1fbd27cdc72d3p+0 },
+ { 0x1.44703e87e8593p-4, 0x1.1f70c3b4f2cc7p+0 },
+ { 0x1.68591a1e83b5dp-4, 0x1.1f1b7ae44867fp+0 },
+ { 0x1.8c36beb8a8d23p-4, 0x1.1ebd5552f795bp+0 },
+ { 0x1.b0081148a873ap-4, 0x1.1e565bca400d4p+0 },
+ { 0x1.d3cbf7e70a4b3p-4, 0x1.1de697e413d28p+0 },
+ { 0x1.f78159ec8bb50p-4, 0x1.1d6e14099944ap+0 },
+ { 0x1.0d939005f65e5p-3, 0x1.1cecdb718d61cp+0 },
+ { 0x1.1f5e1a35c3b89p-3, 0x1.1c62fa1e869b6p+0 },
+ { 0x1.311fc15f56d14p-3, 0x1.1bd07cdd189acp+0 },
+ { 0x1.42d7fc2f64959p-3, 0x1.1b357141d95d5p+0 },
+ { 0x1.548642321d7c6p-3, 0x1.1a91e5a748165p+0 },
+ { 0x1.662a0bdf7a89fp-3, 0x1.19e5e92b964abp+0 },
+ { 0x1.77c2d2a765f9ep-3, 0x1.19318bae53a04p+0 },
+ { 0x1.895010fdbdbfdp-3, 0x1.1874ddcdfce24p+0 },
+ { 0x1.9ad142662e14dp-3, 0x1.17aff0e56ec10p+0 },
+ { 0x1.ac45e37fe2526p-3, 0x1.16e2d7093cd8cp+0 },
+ { 0x1.bdad72110a648p-3, 0x1.160da304ed92fp+0 },
+ { 0x1.cf076d1233237p-3, 0x1.153068581b781p+0 },
+ { 0x1.e05354b96ff36p-3, 0x1.144b3b337c90cp+0 },
+ { 0x1.f190aa85540e2p-3, 0x1.135e3075d076bp+0 },
+ { 0x1.015f78a3dcf3dp-2, 0x1.12695da8b5bdep+0 },
+ { 0x1.09eed6982b948p-2, 0x1.116cd8fd67618p+0 },
+ { 0x1.127631eb8de32p-2, 0x1.1068b94962e5ep+0 },
+ { 0x1.1af54e232d609p-2, 0x1.0f5d1602f7e41p+0 },
+ { 0x1.236bef825d9a2p-2, 0x1.0e4a073dc1b91p+0 },
+ { 0x1.2bd9db0f7827fp-2, 0x1.0d2fa5a70c168p+0 },
+ { 0x1.343ed6989b7d9p-2, 0x1.0c0e0a8223359p+0 },
+ { 0x1.3c9aa8b84bedap-2, 0x1.0ae54fa490722p+0 },
+ { 0x1.44ed18d9f6462p-2, 0x1.09b58f724416bp+0 },
+ { 0x1.4d35ef3e5372ep-2, 0x1.087ee4d9ad247p+0 },
+ { 0x1.5574f4ffac98ep-2, 0x1.07416b4fbfe7cp+0 },
+ { 0x1.5da9f415ff23fp-2, 0x1.05fd3ecbec297p+0 },
+ { 0x1.65d4b75b00471p-2, 0x1.04b27bc403d30p+0 },
+ { 0x1.6df50a8dff772p-2, 0x1.03613f2812dafp+0 },
+ { 0x1.760aba57a76bfp-2, 0x1.0209a65e29545p+0 },
+ { 0x1.7e15944d9d3e4p-2, 0x1.00abcf3e187a9p+0 },
+ { 0x1.861566f5fd3c0p-2, 0x1.fe8fb01a47307p-1 },
+ { 0x1.8e0a01cab516bp-2, 0x1.fbbbbef34b4b2p-1 },
+ { 0x1.95f3353cbb146p-2, 0x1.f8dc092d58ff8p-1 },
+ { 0x1.9dd0d2b721f39p-2, 0x1.f5f0cdaf15313p-1 },
+ { 0x1.a5a2aca209394p-2, 0x1.f2fa4c16c0019p-1 },
+ { 0x1.ad68966569a87p-2, 0x1.eff8c4b1375dbp-1 },
+ { 0x1.b522646bbda68p-2, 0x1.ecec7870ebca7p-1 },
+ { 0x1.bccfec24855b8p-2, 0x1.e9d5a8e4c934ep-1 },
+ { 0x1.c4710406a65fcp-2, 0x1.e6b4982f158b9p-1 },
+ { 0x1.cc058392a6d2dp-2, 0x1.e38988fc46e72p-1 },
+ { 0x1.d38d4354c3bd0p-2, 0x1.e054be79d3042p-1 },
+ { 0x1.db081ce6e2a48p-2, 0x1.dd167c4cf9d2ap-1 },
+ { 0x1.e275eaf25e458p-2, 0x1.d9cf06898cdafp-1 },
+ { 0x1.e9d68931ae650p-2, 0x1.d67ea1a8b5368p-1 },
+ { 0x1.f129d471eabb1p-2, 0x1.d325927fb9d89p-1 },
+ { 0x1.f86faa9428f9dp-2, 0x1.cfc41e36c7df9p-1 },
+ { 0x1.ffa7ea8eb5fd0p-2, 0x1.cc5a8a3fbea40p-1 },
+ { 0x1.03693a371519cp-1, 0x1.c8e91c4d01368p-1 },
+ { 0x1.06f794ab2cae7p-1, 0x1.c5701a484ef9dp-1 },
+ { 0x1.0a7ef5c18edd2p-1, 0x1.c1efca49a5011p-1 },
+ { 0x1.0dff4f247f6c6p-1, 0x1.be68728e29d5dp-1 },
+ { 0x1.1178930ada115p-1, 0x1.bada596f25436p-1 },
+ { 0x1.14eab43841b55p-1, 0x1.b745c55905bf8p-1 },
+ { 0x1.1855a5fd3dd50p-1, 0x1.b3aafcc27502ep-1 },
+ { 0x1.1bb95c3746199p-1, 0x1.b00a46237d5bep-1 },
+ { 0x1.1f15cb50bc4dep-1, 0x1.ac63e7ecc1411p-1 },
+ { 0x1.226ae840d4d70p-1, 0x1.a8b8287ec6a09p-1 },
+ { 0x1.25b8a88b6dd7fp-1, 0x1.a5074e2157620p-1 },
+ { 0x1.28ff0240d52cdp-1, 0x1.a1519efaf889ep-1 },
+ { 0x1.2c3debfd7d6c1p-1, 0x1.9d97610879642p-1 },
+ { 0x1.2f755ce9a21f4p-1, 0x1.99d8da149c13fp-1 },
+ { 0x1.32a54cb8db67bp-1, 0x1.96164fafd8de3p-1 },
+ { 0x1.35cdb3a9a144dp-1, 0x1.925007283d7aap-1 },
+ { 0x1.38ee8a84beb71p-1, 0x1.8e86458169af8p-1 },
+ { 0x1.3c07ca9cb4f9ep-1, 0x1.8ab94f6caa71dp-1 },
+ { 0x1.3f196dcd0f135p-1, 0x1.86e9694134b9ep-1 },
+ { 0x1.42236e79a5fa6p-1, 0x1.8316d6f48133dp-1 },
+ { 0x1.4525c78dd5966p-1, 0x1.7f41dc12c9e89p-1 },
+ { 0x1.4820747ba2dc2p-1, 0x1.7b6abbb7aaf19p-1 },
+ { 0x1.4b13713ad3513p-1, 0x1.7791b886e7403p-1 },
+ { 0x1.4dfeba47f63ccp-1, 0x1.73b714a552763p-1 },
+ { 0x1.50e24ca35fd2cp-1, 0x1.6fdb11b1e0c34p-1 },
+ { 0x1.53be25d016a4fp-1, 0x1.6bfdf0beddaf5p-1 },
+ { 0x1.569243d2b3a9bp-1, 0x1.681ff24b4ab04p-1 },
+ { 0x1.595ea53035283p-1, 0x1.6441563c665d4p-1 },
+ { 0x1.5c2348ecc4dc3p-1, 0x1.60625bd75d07bp-1 },
+ { 0x1.5ee02e8a71a53p-1, 0x1.5c8341bb23767p-1 },
+ { 0x1.61955607dd15dp-1, 0x1.58a445da7c74cp-1 },
+ { 0x1.6442bfdedd397p-1, 0x1.54c5a57629db0p-1 },
+ { 0x1.66e86d0312e82p-1, 0x1.50e79d1749ac9p-1 },
+ { 0x1.69865ee075011p-1, 0x1.4d0a6889dfd9fp-1 },
+ { 0x1.6c1c9759d0e5fp-1, 0x1.492e42d78d2c5p-1 },
+ { 0x1.6eab18c74091bp-1, 0x1.4553664273d24p-1 },
+ { 0x1.7131e5f496a5ap-1, 0x1.417a0c4049fd0p-1 },
+ { 0x1.73b1021fc0cb8p-1, 0x1.3da26d759aef5p-1 },
+ { 0x1.762870f720c6fp-1, 0x1.39ccc1b136d5ap-1 },
+ { 0x1.78983697dc96fp-1, 0x1.35f93fe7d1b3dp-1 },
+ { 0x1.7b00578c26037p-1, 0x1.32281e2fd1a92p-1 },
+ { 0x1.7d60d8c979f7bp-1, 0x1.2e5991bd4cbfcp-1 },
+ { 0x1.7fb9bfaed8078p-1, 0x1.2a8dcede3673bp-1 },
+ { 0x1.820b1202f27fbp-1, 0x1.26c508f6bd0ffp-1 },
+ { 0x1.8454d5f25760dp-1, 0x1.22ff727dd6f7bp-1 },
+ { 0x1.8697120d92a4ap-1, 0x1.1f3d3cf9ffe5ap-1 },
+ { 0x1.88d1cd474a2e0p-1, 0x1.1b7e98fe26217p-1 },
+ { 0x1.8b050ef253c37p-1, 0x1.17c3b626c7a11p-1 },
+ { 0x1.8d30debfc572ep-1, 0x1.140cc3173f007p-1 },
+ { 0x1.8f5544bd00c04p-1, 0x1.1059ed7740313p-1 },
+ { 0x1.91724951b8fc6p-1, 0x1.0cab61f084b93p-1 },
+ { 0x1.9387f53df5238p-1, 0x1.09014c2ca74dap-1 },
+ { 0x1.959651980da31p-1, 0x1.055bd6d32e8d7p-1 },
+ { 0x1.979d67caa6631p-1, 0x1.01bb2b87c6968p-1 },
+ { 0x1.999d4192a5715p-1, 0x1.fc3ee5d1524b0p-2 },
+ { 0x1.9b95e8fd26abap-1, 0x1.f511a91a67d2ap-2 },
+ { 0x1.9d8768656cc42p-1, 0x1.edeeee0959518p-2 },
+ { 0x1.9f71ca72cffb6p-1, 0x1.e6d6ffaa65a25p-2 },
+ { 0x1.a1551a16aaeafp-1, 0x1.dfca26f5bbf88p-2 },
+ { 0x1.a331628a45b92p-1, 0x1.d8c8aace11e63p-2 },
+ { 0x1.a506af4cc00f4p-1, 0x1.d1d2cfff91594p-2 },
+ { 0x1.a6d50c20fa293p-1, 0x1.cae8d93f1d7b6p-2 },
+ { 0x1.a89c850b7d54dp-1, 0x1.c40b0729ed547p-2 },
+ { 0x1.aa5d265064366p-1, 0x1.bd3998457afdap-2 },
+ { 0x1.ac16fc7143263p-1, 0x1.b674c8ffc6283p-2 },
+ { 0x1.adca142b10f98p-1, 0x1.afbcd3afe8ab6p-2 },
+ { 0x1.af767a741088bp-1, 0x1.a911f096fbc26p-2 },
+ { 0x1.b11c3c79bb424p-1, 0x1.a27455e14c93cp-2 },
+ { 0x1.b2bb679ead19cp-1, 0x1.9be437a7de946p-2 },
+ { 0x1.b4540978921eep-1, 0x1.9561c7f23a47bp-2 },
+ { 0x1.b5e62fce16095p-1, 0x1.8eed36b886d93p-2 },
+ { 0x1.b771e894d602ep-1, 0x1.8886b1e5ecfd1p-2 },
+ { 0x1.b8f741ef54f83p-1, 0x1.822e655b417e6p-2 },
+ { 0x1.ba764a2af2b78p-1, 0x1.7be47af1f5d89p-2 },
+ { 0x1.bbef0fbde6221p-1, 0x1.75a91a7f4d2edp-2 },
+ { 0x1.bd61a1453ab44p-1, 0x1.6f7c69d7d3ef8p-2 },
+ { 0x1.bece0d82d1a5cp-1, 0x1.695e8cd31867ep-2 },
+ { 0x1.c034635b66e23p-1, 0x1.634fa54fa285fp-2 },
+ { 0x1.c194b1d49a184p-1, 0x1.5d4fd33729015p-2 },
+ { 0x1.c2ef0812fc1bdp-1, 0x1.575f3483021c3p-2 },
+ { 0x1.c443755820d64p-1, 0x1.517de540ce2a3p-2 },
+ { 0x1.c5920900b5fd1p-1, 0x1.4babff975a04cp-2 },
+ { 0x1.c6dad2829ec62p-1, 0x1.45e99bcbb7915p-2 },
+ { 0x1.c81de16b14cefp-1, 0x1.4036d0468a7a2p-2 },
+ { 0x1.c95b455cce69dp-1, 0x1.3a93b1998736cp-2 },
+ { 0x1.ca930e0e2a825p-1, 0x1.35005285227f1p-2 },
+ { 0x1.cbc54b476248dp-1, 0x1.2f7cc3fe6f423p-2 },
+ { 0x1.ccf20ce0c0d27p-1, 0x1.2a09153529381p-2 },
+ { 0x1.ce1962c0e0d8bp-1, 0x1.24a55399ea239p-2 },
+ { 0x1.cf3b5cdaf0c39p-1, 0x1.1f518ae487dc8p-2 },
+ { 0x1.d0580b2cfd249p-1, 0x1.1a0dc51a9934dp-2 },
+ { 0x1.d16f7dbe41ca0p-1, 0x1.14da0a961fd14p-2 },
+ { 0x1.d281c49d818d0p-1, 0x1.0fb6620c550afp-2 },
+ { 0x1.d38eefdf64fddp-1, 0x1.0aa2d09497f2bp-2 },
+ { 0x1.d4970f9ce00d9p-1, 0x1.059f59af7a906p-2 },
+ { 0x1.d59a33f19ed42p-1, 0x1.00abff4dec7a3p-2 },
+ { 0x1.d6986cfa798e7p-1, 0x1.f79183b101c5bp-3 },
+ { 0x1.d791cad3eff01p-1, 0x1.edeb406d9c824p-3 },
+ { 0x1.d8865d98abe01p-1, 0x1.e4652fadcb6b2p-3 },
+ { 0x1.d97635600bb89p-1, 0x1.daff4969c0b04p-3 },
+ { 0x1.da61623cb41e0p-1, 0x1.d1b982c501370p-3 },
+ { 0x1.db47f43b2980dp-1, 0x1.c893ce1dcbef7p-3 },
+ { 0x1.dc29fb60715afp-1, 0x1.bf8e1b1ca2279p-3 },
+ { 0x1.dd0787a8bb39dp-1, 0x1.b6a856c3ed54fp-3 },
+ { 0x1.dde0a90611a0dp-1, 0x1.ade26b7fbed95p-3 },
+ { 0x1.deb56f5f12d28p-1, 0x1.a53c4135a6526p-3 },
+ { 0x1.df85ea8db188ep-1, 0x1.9cb5bd549b111p-3 },
+ { 0x1.e0522a5dfda73p-1, 0x1.944ec2e4f5630p-3 },
+ { 0x1.e11a3e8cf4eb8p-1, 0x1.8c07329874652p-3 },
+ { 0x1.e1de36c75ba58p-1, 0x1.83deeada4d25ap-3 },
+ { 0x1.e29e22a89d766p-1, 0x1.7bd5c7df3fe9cp-3 },
+ { 0x1.e35a11b9b61cep-1, 0x1.73eba3b5b07b7p-3 },
+ { 0x1.e4121370224ccp-1, 0x1.6c205655be71fp-3 },
+ { 0x1.e4c6372cd8927p-1, 0x1.6473b5b15a7a1p-3 },
+ { 0x1.e5768c3b4a3fcp-1, 0x1.5ce595c455b0ap-3 },
+ { 0x1.e62321d06c5e0p-1, 0x1.5575c8a468361p-3 },
+ { 0x1.e6cc0709c8a0dp-1, 0x1.4e241e912c305p-3 },
+ { 0x1.e7714aec96534p-1, 0x1.46f066040a832p-3 },
+ { 0x1.e812fc64db369p-1, 0x1.3fda6bc016994p-3 },
+ { 0x1.e8b12a44944a8p-1, 0x1.38e1fae1d6a9dp-3 },
+ { 0x1.e94be342e6743p-1, 0x1.3206dceef5f87p-3 },
+ { 0x1.e9e335fb56f87p-1, 0x1.2b48d9e5dea1cp-3 },
+ { 0x1.ea7730ed0bbb9p-1, 0x1.24a7b84d38971p-3 },
+ { 0x1.eb07e27a133aap-1, 0x1.1e233d434b813p-3 },
+ { 0x1.eb9558e6b42cep-1, 0x1.17bb2c8d41535p-3 },
+ { 0x1.ec1fa258c4beap-1, 0x1.116f48a6476ccp-3 },
+ { 0x1.eca6ccd709544p-1, 0x1.0b3f52ce8c383p-3 },
+ { 0x1.ed2ae6489ac1ep-1, 0x1.052b0b1a174eap-3 },
+ { 0x1.edabfc7453e63p-1, 0x1.fe6460fef4680p-4 },
+ { 0x1.ee2a1d004692cp-1, 0x1.f2a901ccafb37p-4 },
+ { 0x1.eea5557137ae0p-1, 0x1.e723726b824a9p-4 },
+ { 0x1.ef1db32a2277cp-1, 0x1.dbd32ac4c99b0p-4 },
+ { 0x1.ef93436bc2daap-1, 0x1.d0b7a0f921e7cp-4 },
+ { 0x1.f006135426b26p-1, 0x1.c5d0497c09e74p-4 },
+ { 0x1.f0762fde45ee6p-1, 0x1.bb1c972f23e50p-4 },
+ { 0x1.f0e3a5e1a1788p-1, 0x1.b09bfb7d11a83p-4 },
+ { 0x1.f14e8211e8c55p-1, 0x1.a64de673e8837p-4 },
+ { 0x1.f1b6d0fea5f4dp-1, 0x1.9c31c6df3b1b8p-4 },
+ { 0x1.f21c9f12f0677p-1, 0x1.92470a61b6965p-4 },
+ { 0x1.f27ff89525acfp-1, 0x1.888d1d8e510a3p-4 },
+ { 0x1.f2e0e9a6a8b09p-1, 0x1.7f036c0107294p-4 },
+ { 0x1.f33f7e43a706bp-1, 0x1.75a96077274bap-4 },
+ { 0x1.f39bc242e43e6p-1, 0x1.6c7e64e7281cbp-4 },
+ { 0x1.f3f5c1558b19ep-1, 0x1.6381e2980956bp-4 },
+ { 0x1.f44d870704911p-1, 0x1.5ab342383d177p-4 },
+ { 0x1.f4a31ebcd47dfp-1, 0x1.5211ebf41880bp-4 },
+ { 0x1.f4f693b67bd77p-1, 0x1.499d478bca735p-4 },
+ { 0x1.f547f10d60597p-1, 0x1.4154bc68d75c3p-4 },
+ { 0x1.f59741b4b97cfp-1, 0x1.3937b1b319259p-4 },
+ { 0x1.f5e4907982a07p-1, 0x1.31458e6542847p-4 },
+ { 0x1.f62fe80272419p-1, 0x1.297db960e4f63p-4 },
+ { 0x1.f67952cff6282p-1, 0x1.21df9981f8e53p-4 },
+ { 0x1.f6c0db3c34641p-1, 0x1.1a6a95b1e786fp-4 },
+ { 0x1.f7068b7b10fd9p-1, 0x1.131e14fa1625dp-4 },
+ { 0x1.f74a6d9a38383p-1, 0x1.0bf97e95f2a64p-4 },
+ { 0x1.f78c8b812d498p-1, 0x1.04fc3a0481321p-4 },
+ { 0x1.f7cceef15d631p-1, 0x1.fc4b5e32d6259p-5 },
+ { 0x1.f80ba18636f07p-1, 0x1.eeea8c1b1db93p-5 },
+ { 0x1.f848acb544e95p-1, 0x1.e1d4cf1e2450ap-5 },
+ { 0x1.f88419ce4e184p-1, 0x1.d508f9a1ea64ep-5 },
+ { 0x1.f8bdf1fb78370p-1, 0x1.c885df3451a07p-5 },
+ { 0x1.f8f63e416ebffp-1, 0x1.bc4a54a84e834p-5 },
+ { 0x1.f92d077f8d56dp-1, 0x1.b055303221015p-5 },
+ { 0x1.f96256700da8ep-1, 0x1.a4a549829587ep-5 },
+ { 0x1.f99633a838a57p-1, 0x1.993979e14fffdp-5 },
+ { 0x1.f9c8a7989af0dp-1, 0x1.8e109c4622913p-5 },
+ { 0x1.f9f9ba8d3c733p-1, 0x1.83298d717210ep-5 },
+ { 0x1.fa2974addae45p-1, 0x1.78832c03aa2b1p-5 },
+ { 0x1.fa57ddfe27376p-1, 0x1.6e1c5893c380bp-5 },
+ { 0x1.fa84fe5e05c8dp-1, 0x1.63f3f5c4de13bp-5 },
+ { 0x1.fab0dd89d1309p-1, 0x1.5a08e85af27e0p-5 },
+ { 0x1.fadb831a9f9c3p-1, 0x1.505a174e9c929p-5 },
+ { 0x1.fb04f6868a944p-1, 0x1.46e66be002240p-5 },
+ { 0x1.fb2d3f20f9101p-1, 0x1.3dacd1a8d8ccdp-5 },
+ { 0x1.fb54641aebbc9p-1, 0x1.34ac36ad8dafep-5 },
+ { 0x1.fb7a6c834b5a2p-1, 0x1.2be38b6d92415p-5 },
+ { 0x1.fb9f5f4739170p-1, 0x1.2351c2f2d1449p-5 },
+ { 0x1.fbc3433260ca5p-1, 0x1.1af5d2e04f3f6p-5 },
+ { 0x1.fbe61eef4cf6ap-1, 0x1.12ceb37ff9bc3p-5 },
+ { 0x1.fc07f907bc794p-1, 0x1.0adb5fcfa8c75p-5 },
+ { 0x1.fc28d7e4f9cd0p-1, 0x1.031ad58d56279p-5 },
+ { 0x1.fc48c1d033c7ap-1, 0x1.f7182a851bca2p-6 },
+ { 0x1.fc67bcf2d7b8fp-1, 0x1.e85c449e377f2p-6 },
+ { 0x1.fc85cf56ecd38p-1, 0x1.da0005e5f28dfp-6 },
+ { 0x1.fca2fee770c79p-1, 0x1.cc0180af00a8bp-6 },
+ { 0x1.fcbf5170b578bp-1, 0x1.be5ecd2fcb5f9p-6 },
+ { 0x1.fcdacca0bfb73p-1, 0x1.b1160991ff737p-6 },
+ { 0x1.fcf57607a6e7cp-1, 0x1.a4255a00b9f03p-6 },
+ { 0x1.fd0f5317f582fp-1, 0x1.978ae8b55ce1bp-6 },
+ { 0x1.fd2869270a56fp-1, 0x1.8b44e6031383ep-6 },
+ { 0x1.fd40bd6d7a785p-1, 0x1.7f5188610ddc8p-6 },
+ { 0x1.fd58550773cb5p-1, 0x1.73af0c737bb45p-6 },
+ { 0x1.fd6f34f52013ap-1, 0x1.685bb5134ef13p-6 },
+ { 0x1.fd85621b0876dp-1, 0x1.5d55cb54cd53ap-6 },
+ { 0x1.fd9ae142795e3p-1, 0x1.529b9e8cf9a1ep-6 },
+ { 0x1.fdafb719e6a69p-1, 0x1.482b8455dc491p-6 },
+ { 0x1.fdc3e835500b3p-1, 0x1.3e03d891b37dep-6 },
+ { 0x1.fdd7790ea5bc0p-1, 0x1.3422fd6d12e2bp-6 },
+ { 0x1.fdea6e062d0c9p-1, 0x1.2a875b5ffab56p-6 },
+ { 0x1.fdfccb62e52d3p-1, 0x1.212f612dee7fbp-6 },
+ { 0x1.fe0e9552ebdd6p-1, 0x1.181983e5133ddp-6 },
+ { 0x1.fe1fcfebe2083p-1, 0x1.0f443edc5ce49p-6 },
+ { 0x1.fe307f2b503d0p-1, 0x1.06ae13b0d3255p-6 },
+ { 0x1.fe40a6f70af4bp-1, 0x1.fcab1483ea7fcp-7 },
+ { 0x1.fe504b1d9696cp-1, 0x1.ec72615a894c4p-7 },
+ { 0x1.fe5f6f568b301p-1, 0x1.dcaf3691fc448p-7 },
+ { 0x1.fe6e1742f7cf6p-1, 0x1.cd5ec93c12431p-7 },
+ { 0x1.fe7c466dc57a1p-1, 0x1.be7e5ac24963bp-7 },
+ { 0x1.fe8a004c19ae6p-1, 0x1.b00b38d6b3575p-7 },
+ { 0x1.fe97483db8670p-1, 0x1.a202bd6372dcep-7 },
+ { 0x1.fea4218d6594ap-1, 0x1.94624e78e0fafp-7 },
+ { 0x1.feb08f7146046p-1, 0x1.87275e3a6869dp-7 },
+ { 0x1.febc950b3fa75p-1, 0x1.7a4f6aca256cbp-7 },
+ { 0x1.fec835695932ep-1, 0x1.6dd7fe3358230p-7 },
+ { 0x1.fed37386190fbp-1, 0x1.61beae53b72b7p-7 },
+ { 0x1.fede5248e38f4p-1, 0x1.56011cc3b036dp-7 },
+ { 0x1.fee8d486585eep-1, 0x1.4a9cf6bda3f4cp-7 },
+ { 0x1.fef2fd00af31ap-1, 0x1.3f8ff5042a88ep-7 },
+ { 0x1.fefcce6813974p-1, 0x1.34d7dbc76d7e5p-7 },
+ { 0x1.ff064b5afffbep-1, 0x1.2a727a89a3f14p-7 },
+ { 0x1.ff0f766697c76p-1, 0x1.205dac02bd6b9p-7 },
+ { 0x1.ff18520700971p-1, 0x1.1697560347b25p-7 },
+ { 0x1.ff20e0a7ba8c2p-1, 0x1.0d1d69569b82dp-7 },
+ { 0x1.ff2924a3f7a83p-1, 0x1.03ede1a45bfeep-7 },
+ { 0x1.ff312046f2339p-1, 0x1.f60d8aa2a88f2p-8 },
+ { 0x1.ff38d5cc4227fp-1, 0x1.e4cc4abf7d065p-8 },
+ { 0x1.ff404760319b4p-1, 0x1.d4143a9dfe965p-8 },
+ { 0x1.ff47772010262p-1, 0x1.c3e1a5f5c077cp-8 },
+ { 0x1.ff4e671a85425p-1, 0x1.b430ecf4a83a8p-8 },
+ { 0x1.ff55194fe19dfp-1, 0x1.a4fe83fb9db25p-8 },
+ { 0x1.ff5b8fb26f5f6p-1, 0x1.9646f35a76623p-8 },
+ { 0x1.ff61cc26c1578p-1, 0x1.8806d70b2fc36p-8 },
+ { 0x1.ff67d08401202p-1, 0x1.7a3ade6c8b3e4p-8 },
+ { 0x1.ff6d9e943c231p-1, 0x1.6cdfcbfc1e263p-8 },
+ { 0x1.ff733814af88cp-1, 0x1.5ff2750fe7820p-8 },
+ { 0x1.ff789eb6130c9p-1, 0x1.536fc18f7ce5cp-8 },
+ { 0x1.ff7dd41ce2b4dp-1, 0x1.4754abacdf1dcp-8 },
+ { 0x1.ff82d9e1a76d8p-1, 0x1.3b9e3f9d06e3fp-8 },
+ { 0x1.ff87b1913e853p-1, 0x1.30499b503957fp-8 },
+ { 0x1.ff8c5cad200a5p-1, 0x1.2553ee2a336bfp-8 },
+ { 0x1.ff90dcaba4096p-1, 0x1.1aba78ba3af89p-8 },
+ { 0x1.ff9532f846ab0p-1, 0x1.107a8c7323a6ep-8 },
+ { 0x1.ff9960f3eb327p-1, 0x1.06918b6355624p-8 },
+ { 0x1.ff9d67f51ddbap-1, 0x1.f9f9cfd9c3035p-9 },
+ { 0x1.ffa14948549a7p-1, 0x1.e77448fb66bb9p-9 },
+ { 0x1.ffa506302ebaep-1, 0x1.d58da68fd1170p-9 },
+ { 0x1.ffa89fe5b3625p-1, 0x1.c4412bf4b8f0bp-9 },
+ { 0x1.ffac17988ef4bp-1, 0x1.b38a3af2e55b4p-9 },
+ { 0x1.ffaf6e6f4f5c0p-1, 0x1.a3645330550ffp-9 },
+ { 0x1.ffb2a5879f35ep-1, 0x1.93cb11a30d765p-9 },
+ { 0x1.ffb5bdf67fe6fp-1, 0x1.84ba3004a50d0p-9 },
+ { 0x1.ffb8b8c88295fp-1, 0x1.762d84469c18fp-9 },
+ { 0x1.ffbb970200110p-1, 0x1.6821000795a03p-9 },
+ { 0x1.ffbe599f4f9d9p-1, 0x1.5a90b00981d93p-9 },
+ { 0x1.ffc10194fcb64p-1, 0x1.4d78bba8ca5fdp-9 },
+ { 0x1.ffc38fcffbb7cp-1, 0x1.40d564548fad7p-9 },
+ { 0x1.ffc60535dd7f5p-1, 0x1.34a305080681fp-9 },
+ { 0x1.ffc862a501fd7p-1, 0x1.28de11c5031ebp-9 },
+ { 0x1.ffcaa8f4c9beap-1, 0x1.1d83170fbf6fbp-9 },
+ { 0x1.ffccd8f5c66d1p-1, 0x1.128eb96be8798p-9 },
+ { 0x1.ffcef371ea4d7p-1, 0x1.07fdb4dafea5fp-9 },
+ { 0x1.ffd0f92cb6ba7p-1, 0x1.fb99b8b8279e1p-10 },
+ { 0x1.ffd2eae369a07p-1, 0x1.e7f232d9e2630p-10 },
+ { 0x1.ffd4c94d29fdbp-1, 0x1.d4fed7195d7e8p-10 },
+ { 0x1.ffd6951b33686p-1, 0x1.c2b9cf7f893bfp-10 },
+ { 0x1.ffd84ef9009eep-1, 0x1.b11d702b3deb1p-10 },
+ { 0x1.ffd9f78c7524ap-1, 0x1.a024365f771bdp-10 },
+ { 0x1.ffdb8f7605ee7p-1, 0x1.8fc8c794b03b5p-10 },
+ { 0x1.ffdd1750e1220p-1, 0x1.8005f08d6f1efp-10 },
+ { 0x1.ffde8fb314ebfp-1, 0x1.70d6a46e07ddap-10 },
+ { 0x1.ffdff92db56e5p-1, 0x1.6235fbd7a4345p-10 },
+ { 0x1.ffe1544d01ccbp-1, 0x1.541f340697987p-10 },
+ { 0x1.ffe2a1988857cp-1, 0x1.468dadf4080abp-10 },
+ { 0x1.ffe3e19349dc7p-1, 0x1.397ced7af2b15p-10 },
+ { 0x1.ffe514bbdc197p-1, 0x1.2ce898809244ep-10 },
+ { 0x1.ffe63b8c8b5f7p-1, 0x1.20cc76202c5fap-10 },
+ { 0x1.ffe7567b7b5e1p-1, 0x1.15246dda49d47p-10 },
+ { 0x1.ffe865fac722bp-1, 0x1.09ec86c75d497p-10 },
+ { 0x1.ffe96a78a04a9p-1, 0x1.fe41cd9bb4eeep-11 },
+ { 0x1.ffea645f6d6dap-1, 0x1.e97ba3b77f306p-11 },
+ { 0x1.ffeb5415e7c44p-1, 0x1.d57f524723822p-11 },
+ { 0x1.ffec39ff380b9p-1, 0x1.c245d4b998479p-11 },
+ { 0x1.ffed167b12ac2p-1, 0x1.afc85e0f82e12p-11 },
+ { 0x1.ffede9e5d3262p-1, 0x1.9e005769dbc1dp-11 },
+ { 0x1.ffeeb49896c6dp-1, 0x1.8ce75e9f6f8a0p-11 },
+ { 0x1.ffef76e956a9fp-1, 0x1.7c7744d9378f7p-11 },
+ { 0x1.fff0312b010b5p-1, 0x1.6caa0d3582fe9p-11 },
+ { 0x1.fff0e3ad91ec2p-1, 0x1.5d79eb71e893bp-11 },
+ { 0x1.fff18ebe2b0e1p-1, 0x1.4ee1429bf7cc0p-11 },
+ { 0x1.fff232a72b48ep-1, 0x1.40daa3c89f5b6p-11 },
+ { 0x1.fff2cfb0453d9p-1, 0x1.3360ccd23db3ap-11 },
+ { 0x1.fff3661e9569dp-1, 0x1.266ea71d4f71ap-11 },
+ { 0x1.fff3f634b79f9p-1, 0x1.19ff4663ae9dfp-11 },
+ { 0x1.fff48032dbe40p-1, 0x1.0e0de78654d1ep-11 },
+ { 0x1.fff50456dab8cp-1, 0x1.0295ef6591848p-11 },
+ { 0x1.fff582dc48d30p-1, 0x1.ef25d37f49fe1p-12 },
+ { 0x1.fff5fbfc8a439p-1, 0x1.da01102b5f851p-12 },
+ { 0x1.fff66feee5129p-1, 0x1.c5b5412dcafadp-12 },
+ { 0x1.fff6dee89352ep-1, 0x1.b23a5a23e4210p-12 },
+ { 0x1.fff7491cd4af6p-1, 0x1.9f8893d8fd1c1p-12 },
+ { 0x1.fff7aebcff755p-1, 0x1.8d986a4187285p-12 },
+ { 0x1.fff80ff8911fdp-1, 0x1.7c629a822bc9ep-12 },
+ { 0x1.fff86cfd3e657p-1, 0x1.6be02102b3520p-12 },
+ { 0x1.fff8c5f702ccfp-1, 0x1.5c0a378c90bcap-12 },
+ { 0x1.fff91b102fca8p-1, 0x1.4cda5374ea275p-12 },
+ { 0x1.fff96c717b695p-1, 0x1.3e4a23d1f4702p-12 },
+ { 0x1.fff9ba420e834p-1, 0x1.30538fbb77ecdp-12 },
+ { 0x1.fffa04a7928b1p-1, 0x1.22f0b496539bdp-12 },
+ { 0x1.fffa4bc63ee9ap-1, 0x1.161be46ad3b50p-12 },
+ { 0x1.fffa8fc0e5f33p-1, 0x1.09cfa445b00ffp-12 },
+ { 0x1.fffad0b901755p-1, 0x1.fc0d55470cf51p-13 },
+ { 0x1.fffb0ecebee1bp-1, 0x1.e577bbcd49935p-13 },
+ { 0x1.fffb4a210b172p-1, 0x1.cfd4a5adec5bfp-13 },
+ { 0x1.fffb82cd9dcbfp-1, 0x1.bb1a9657ce465p-13 },
+ { 0x1.fffbb8f1049c6p-1, 0x1.a740684026555p-13 },
+ { 0x1.fffbeca6adbe9p-1, 0x1.943d4a1d1ed39p-13 },
+ { 0x1.fffc1e08f25f5p-1, 0x1.8208bc334a6a5p-13 },
+ { 0x1.fffc4d3120aa1p-1, 0x1.709a8db59f25cp-13 },
+ { 0x1.fffc7a37857d2p-1, 0x1.5feada379d8b7p-13 },
+ { 0x1.fffca53375ce3p-1, 0x1.4ff207314a102p-13 },
+ { 0x1.fffcce3b57bffp-1, 0x1.40a8c1949f75ep-13 },
+ { 0x1.fffcf564ab6b7p-1, 0x1.3207fb7420eb9p-13 },
+ { 0x1.fffd1ac4135f9p-1, 0x1.2408e9ba3327fp-13 },
+ { 0x1.fffd3e6d5cd87p-1, 0x1.16a501f0e42cap-13 },
+ { 0x1.fffd607387b07p-1, 0x1.09d5f819c9e29p-13 },
+ { 0x1.fffd80e8ce0dap-1, 0x1.fb2b792b40a22p-14 },
+ { 0x1.fffd9fdeabccep-1, 0x1.e3bcf436a1a95p-14 },
+ { 0x1.fffdbd65e5ad0p-1, 0x1.cd55277c18d05p-14 },
+ { 0x1.fffdd98e903b2p-1, 0x1.b7e94604479dcp-14 },
+ { 0x1.fffdf46816833p-1, 0x1.a36eec00926ddp-14 },
+ { 0x1.fffe0e0140857p-1, 0x1.8fdc1b2dcf7b9p-14 },
+ { 0x1.fffe26683972ap-1, 0x1.7d2737527c3f9p-14 },
+ { 0x1.fffe3daa95b18p-1, 0x1.6b4702d7d5849p-14 },
+ { 0x1.fffe53d558ae9p-1, 0x1.5a329b7d30748p-14 },
+ { 0x1.fffe68f4fa777p-1, 0x1.49e17724f4d41p-14 },
+ { 0x1.fffe7d156d244p-1, 0x1.3a4b60ba9aa4dp-14 },
+ { 0x1.fffe904222101p-1, 0x1.2b6875310f785p-14 },
+ { 0x1.fffea2860ee1ep-1, 0x1.1d312098e9dbap-14 },
+ { 0x1.fffeb3ebb267bp-1, 0x1.0f9e1b4dd36dfp-14 },
+ { 0x1.fffec47d19457p-1, 0x1.02a8673a94691p-14 },
+ { 0x1.fffed443e2787p-1, 0x1.ec929a665b449p-15 },
+ { 0x1.fffee34943b15p-1, 0x1.d4f4b4c8e09edp-15 },
+ { 0x1.fffef1960d85dp-1, 0x1.be6abbb10a5aap-15 },
+ { 0x1.fffeff32af7afp-1, 0x1.a8e8cc1fadef6p-15 },
+ { 0x1.ffff0c273bea2p-1, 0x1.94637d5bacfdbp-15 },
+ { 0x1.ffff187b6bc0ep-1, 0x1.80cfdc72220cfp-15 },
+ { 0x1.ffff2436a21dcp-1, 0x1.6e2367dc27f95p-15 },
+ { 0x1.ffff2f5fefcaap-1, 0x1.5c540b4936fd2p-15 },
+ { 0x1.ffff39fe16963p-1, 0x1.4b581b8d170fcp-15 },
+ { 0x1.ffff44178c8d2p-1, 0x1.3b2652b06c2b2p-15 },
+ { 0x1.ffff4db27f146p-1, 0x1.2bb5cc22e5db6p-15 },
+ { 0x1.ffff56d4d5e5ep-1, 0x1.1cfe010e2052dp-15 },
+ { 0x1.ffff5f8435efcp-1, 0x1.0ef6c4c84a0fep-15 },
+ { 0x1.ffff67c604180p-1, 0x1.01984165a5f36p-15 },
+ { 0x1.ffff6f9f67e55p-1, 0x1.e9b5e8d00ce76p-16 },
+ { 0x1.ffff77154e0d6p-1, 0x1.d16f5716c6c1ap-16 },
+ { 0x1.ffff7e2c6aea2p-1, 0x1.ba4f035d60e02p-16 },
+ { 0x1.ffff84e93cd75p-1, 0x1.a447b7b03f045p-16 },
+ { 0x1.ffff8b500e77cp-1, 0x1.8f4ccca7fc90dp-16 },
+ { 0x1.ffff9164f8e46p-1, 0x1.7b5223dac7336p-16 },
+ { 0x1.ffff972be5c59p-1, 0x1.684c227fcacefp-16 },
+ { 0x1.ffff9ca891572p-1, 0x1.562fac4329b48p-16 },
+ { 0x1.ffffa1de8c582p-1, 0x1.44f21e49054f2p-16 },
+ { 0x1.ffffa6d13de73p-1, 0x1.34894a5e24657p-16 },
+ { 0x1.ffffab83e54b8p-1, 0x1.24eb7254ccf83p-16 },
+ { 0x1.ffffaff99bac4p-1, 0x1.160f438c70913p-16 },
+ { 0x1.ffffb43555b5fp-1, 0x1.07ebd2a2d2844p-16 },
+ { 0x1.ffffb839e52f3p-1, 0x1.f4f12e9ab070ap-17 },
+ { 0x1.ffffbc09fa7cdp-1, 0x1.db5ad0b27805cp-17 },
+ { 0x1.ffffbfa82616bp-1, 0x1.c304efa2c6f4ep-17 },
+ { 0x1.ffffc316d9ed0p-1, 0x1.abe09e9144b5ep-17 },
+ { 0x1.ffffc6586abf6p-1, 0x1.95df988e76644p-17 },
+ { 0x1.ffffc96f1165ep-1, 0x1.80f439b4ee04bp-17 },
+ { 0x1.ffffcc5cec0c1p-1, 0x1.6d11788a69c64p-17 },
+ { 0x1.ffffcf23ff5fcp-1, 0x1.5a2adfa0b4bc4p-17 },
+ { 0x1.ffffd1c637b2bp-1, 0x1.4834877429b8fp-17 },
+ { 0x1.ffffd4456a10dp-1, 0x1.37231085c7d9ap-17 },
+ { 0x1.ffffd6a3554a1p-1, 0x1.26eb9daed6f7ep-17 },
+ { 0x1.ffffd8e1a2f22p-1, 0x1.1783ceac28910p-17 },
+ { 0x1.ffffdb01e8546p-1, 0x1.08e1badf0fcedp-17 },
+ { 0x1.ffffdd05a75eap-1, 0x1.f5f7d88472604p-18 },
+ { 0x1.ffffdeee4f810p-1, 0x1.db92b5212fb8dp-18 },
+ { 0x1.ffffe0bd3e852p-1, 0x1.c282cd3957edap-18 },
+ { 0x1.ffffe273c15b7p-1, 0x1.aab7abace48dcp-18 },
+ { 0x1.ffffe41314e06p-1, 0x1.94219bfcb4928p-18 },
+ { 0x1.ffffe59c6698bp-1, 0x1.7eb1a2075864dp-18 },
+ { 0x1.ffffe710d565ep-1, 0x1.6a597219a93d9p-18 },
+ { 0x1.ffffe8717232dp-1, 0x1.570b69502f313p-18 },
+ { 0x1.ffffe9bf4098cp-1, 0x1.44ba864670882p-18 },
+ { 0x1.ffffeafb377d5p-1, 0x1.335a62115bce2p-18 },
+ { 0x1.ffffec2641a9ep-1, 0x1.22df298214423p-18 },
+ { 0x1.ffffed413e5b7p-1, 0x1.133d96ae7e0ddp-18 },
+ { 0x1.ffffee4d01cd6p-1, 0x1.046aeabcfcdecp-18 },
+ { 0x1.ffffef4a55bd4p-1, 0x1.ecb9cfe1d8642p-19 },
+ { 0x1.fffff039f9e8fp-1, 0x1.d21397ead99cbp-19 },
+ { 0x1.fffff11ca4876p-1, 0x1.b8d094c86d374p-19 },
+ { 0x1.fffff1f302bc1p-1, 0x1.a0df0f0c626dcp-19 },
+ { 0x1.fffff2bdb904dp-1, 0x1.8a2e269750a39p-19 },
+ { 0x1.fffff37d63a36p-1, 0x1.74adc8f4064d3p-19 },
+ { 0x1.fffff43297019p-1, 0x1.604ea819f007cp-19 },
+ { 0x1.fffff4dde0118p-1, 0x1.4d0231928c6f9p-19 },
+ { 0x1.fffff57fc4a95p-1, 0x1.3aba85fe22e1fp-19 },
+ { 0x1.fffff618c3da6p-1, 0x1.296a70f414053p-19 },
+ { 0x1.fffff6a956450p-1, 0x1.1905613b3abf2p-19 },
+ { 0x1.fffff731ee681p-1, 0x1.097f6156f32c5p-19 },
+ { 0x1.fffff7b2f8ed6p-1, 0x1.f59a20caf6695p-20 },
+ { 0x1.fffff82cdcf1bp-1, 0x1.d9c73698fb1dcp-20 },
+ { 0x1.fffff89ffc4aap-1, 0x1.bf716c6168baep-20 },
+ { 0x1.fffff90cb3c81p-1, 0x1.a6852c6b58392p-20 },
+ { 0x1.fffff9735b73bp-1, 0x1.8eefd70594a88p-20 },
+ { 0x1.fffff9d446cccp-1, 0x1.789fb715aae95p-20 },
+ { 0x1.fffffa2fc5015p-1, 0x1.6383f726a8e04p-20 },
+ { 0x1.fffffa8621251p-1, 0x1.4f8c96f26a26ap-20 },
+ { 0x1.fffffad7a2652p-1, 0x1.3caa61607f920p-20 },
+ { 0x1.fffffb248c39dp-1, 0x1.2acee2f5ecdb8p-20 },
+ { 0x1.fffffb6d1e95dp-1, 0x1.19ec60b1242edp-20 },
+ { 0x1.fffffbb196132p-1, 0x1.09f5cf4dd2877p-20 },
+ { 0x1.fffffbf22c1e2p-1, 0x1.f5bd95d8730d8p-21 },
+ { 0x1.fffffc2f171e3p-1, 0x1.d9371e2ff7c35p-21 },
+ { 0x1.fffffc688a9cfp-1, 0x1.be41de54d155ap-21 },
+ { 0x1.fffffc9eb76acp-1, 0x1.a4c89e08ef4f3p-21 },
+ { 0x1.fffffcd1cbc28p-1, 0x1.8cb738399b12cp-21 },
+ { 0x1.fffffd01f36afp-1, 0x1.75fa8dbc84becp-21 },
+ { 0x1.fffffd2f57d68p-1, 0x1.608078a70dcbcp-21 },
+ { 0x1.fffffd5a2041fp-1, 0x1.4c37c0394d094p-21 },
+ { 0x1.fffffd8271d12p-1, 0x1.39100d5687bfep-21 },
+ { 0x1.fffffda86faa9p-1, 0x1.26f9df8519bd6p-21 },
+ { 0x1.fffffdcc3b117p-1, 0x1.15e6827001f18p-21 },
+ { 0x1.fffffdedf37edp-1, 0x1.05c803e4831c1p-21 },
+ { 0x1.fffffe0db6b91p-1, 0x1.ed22548cffd35p-22 },
+ { 0x1.fffffe2ba0ea5p-1, 0x1.d06ad6ecdf971p-22 },
+ { 0x1.fffffe47ccb60p-1, 0x1.b551c847fbc96p-22 },
+ { 0x1.fffffe62534d4p-1, 0x1.9bc09f112b494p-22 },
+ { 0x1.fffffe7b4c81ep-1, 0x1.83a1ff0aa239dp-22 },
+ { 0x1.fffffe92ced93p-1, 0x1.6ce1aa3fd7bddp-22 },
+ { 0x1.fffffea8ef9cfp-1, 0x1.576c72b514859p-22 },
+ { 0x1.fffffebdc2ec6p-1, 0x1.43302cc4a0da8p-22 },
+ { 0x1.fffffed15bcbap-1, 0x1.301ba221dc9bbp-22 },
+ { 0x1.fffffee3cc32cp-1, 0x1.1e1e857adc568p-22 },
+ { 0x1.fffffef5251c2p-1, 0x1.0d2966b1746f7p-22 },
+ { 0x1.ffffff0576917p-1, 0x1.fa5b4f49cc6b2p-23 },
+ { 0x1.ffffff14cfb92p-1, 0x1.dc3ae30b55c16p-23 },
+ { 0x1.ffffff233ee1dp-1, 0x1.bfd7555a3bd68p-23 },
+ { 0x1.ffffff30d18e8p-1, 0x1.a517d9e61628ap-23 },
+ { 0x1.ffffff3d9480fp-1, 0x1.8be4f8f6c951fp-23 },
+ { 0x1.ffffff4993c46p-1, 0x1.74287ded49339p-23 },
+ { 0x1.ffffff54dab72p-1, 0x1.5dcd669f2cd34p-23 },
+ { 0x1.ffffff5f74141p-1, 0x1.48bfd38302870p-23 },
+ { 0x1.ffffff6969fb8p-1, 0x1.34ecf8a3c124ap-23 },
+ { 0x1.ffffff72c5fb6p-1, 0x1.22430f521cbcfp-23 },
+ { 0x1.ffffff7b91176p-1, 0x1.10b1488aeb235p-23 },
+ { 0x1.ffffff83d3d07p-1, 0x1.0027c00a263a6p-23 },
+ { 0x1.ffffff8b962bep-1, 0x1.e12ee004efc37p-24 },
+ { 0x1.ffffff92dfba2p-1, 0x1.c3e44ae32b16bp-24 },
+ { 0x1.ffffff99b79d2p-1, 0x1.a854ea14102a8p-24 },
+ { 0x1.ffffffa0248e8p-1, 0x1.8e6761569f45dp-24 },
+ { 0x1.ffffffa62ce54p-1, 0x1.7603bac345f65p-24 },
+ { 0x1.ffffffabd69b4p-1, 0x1.5f1353cdad001p-24 },
+ { 0x1.ffffffb127525p-1, 0x1.4980cb3c80949p-24 },
+ { 0x1.ffffffb624592p-1, 0x1.3537f00b6ad4dp-24 },
+ { 0x1.ffffffbad2affp-1, 0x1.2225b12bffc68p-24 },
+ { 0x1.ffffffbf370cdp-1, 0x1.10380e1adb7e9p-24 },
+ { 0x1.ffffffc355dfdp-1, 0x1.febc107d5efaap-25 },
+ { 0x1.ffffffc733572p-1, 0x1.df0f2a0ee6946p-25 },
+ { 0x1.ffffffcad3626p-1, 0x1.c14b2188bcee4p-25 },
+ { 0x1.ffffffce39b67p-1, 0x1.a553644f7f07dp-25 },
+ { 0x1.ffffffd169d0cp-1, 0x1.8b0cfce0579dfp-25 },
+ { 0x1.ffffffd466fa5p-1, 0x1.725e7c5dd20f7p-25 },
+ { 0x1.ffffffd7344aap-1, 0x1.5b2fe547a1340p-25 },
+ { 0x1.ffffffd9d4aabp-1, 0x1.456a974e92e93p-25 },
+ { 0x1.ffffffdc4ad7ap-1, 0x1.30f93c3699078p-25 },
+ { 0x1.ffffffde9964ep-1, 0x1.1dc7b5b978cf8p-25 },
+ { 0x1.ffffffe0c2bf0p-1, 0x1.0bc30c5d52f15p-25 },
+ { 0x1.ffffffe2c92dbp-1, 0x1.f5b2be65a0c7fp-26 },
+ { 0x1.ffffffe4aed5ep-1, 0x1.d5f3a8dea7357p-26 },
+ { 0x1.ffffffe675bbdp-1, 0x1.b82915b03515bp-26 },
+ { 0x1.ffffffe81fc4ep-1, 0x1.9c3517e789488p-26 },
+ { 0x1.ffffffe9aeb97p-1, 0x1.81fb7df06136ep-26 },
+ { 0x1.ffffffeb24467p-1, 0x1.6961b8d641d06p-26 },
+ { 0x1.ffffffec81ff2p-1, 0x1.524ec4d916caep-26 },
+ { 0x1.ffffffedc95e7p-1, 0x1.3cab1343d18d1p-26 },
+ { 0x1.ffffffeefbc85p-1, 0x1.2860757487a01p-26 },
+ { 0x1.fffffff01a8b6p-1, 0x1.155a09065d4f7p-26 },
+ { 0x1.fffffff126e1ep-1, 0x1.0384250e4c9fcp-26 },
+ { 0x1.fffffff221f30p-1, 0x1.e59890b926c78p-27 },
+ { 0x1.fffffff30cd3fp-1, 0x1.c642116a8a9e3p-27 },
+ { 0x1.fffffff3e8892p-1, 0x1.a8e405e651ab6p-27 },
+ { 0x1.fffffff4b606fp-1, 0x1.8d5f98114f872p-27 },
+ { 0x1.fffffff57632dp-1, 0x1.7397c5a66e307p-27 },
+ { 0x1.fffffff629e44p-1, 0x1.5b71456c5a4c4p-27 },
+ { 0x1.fffffff6d1e56p-1, 0x1.44d26de513197p-27 },
+ { 0x1.fffffff76ef3fp-1, 0x1.2fa31d6371537p-27 },
+ { 0x1.fffffff801c1fp-1, 0x1.1bcca373b7b43p-27 },
+ { 0x1.fffffff88af67p-1, 0x1.0939ab853339fp-27 },
+ { 0x1.fffffff90b2e3p-1, 0x1.efac5187b2863p-28 },
+ { 0x1.fffffff982fc1p-1, 0x1.cf1e86235d0e6p-28 },
+ { 0x1.fffffff9f2e9fp-1, 0x1.b0a68a2128babp-28 },
+ { 0x1.fffffffa5b790p-1, 0x1.9423165bc4444p-28 },
+ { 0x1.fffffffabd229p-1, 0x1.7974e743dea3cp-28 },
+ { 0x1.fffffffb18582p-1, 0x1.607e9eacd1050p-28 },
+ { 0x1.fffffffb6d844p-1, 0x1.4924a74dec728p-28 },
+ { 0x1.fffffffbbd0aap-1, 0x1.334d19e0c2160p-28 },
+ { 0x1.fffffffc0748fp-1, 0x1.1edfa3c5f5ccap-28 },
+ { 0x1.fffffffc4c96cp-1, 0x1.0bc56f1b54701p-28 },
+ { 0x1.fffffffc8d462p-1, 0x1.f3d2185e047d9p-29 },
+ { 0x1.fffffffcc9a41p-1, 0x1.d26cb87945e87p-29 },
+ { 0x1.fffffffd01f89p-1, 0x1.b334fac4b9f99p-29 },
+ { 0x1.fffffffd36871p-1, 0x1.96076f7918d1cp-29 },
+ { 0x1.fffffffd678edp-1, 0x1.7ac2d72fc2c63p-29 },
+ { 0x1.fffffffd954aep-1, 0x1.614801550319ep-29 },
+ { 0x1.fffffffdbff2ap-1, 0x1.4979ac8b28926p-29 },
+ { 0x1.fffffffde7ba0p-1, 0x1.333c68e2d0548p-29 },
+ { 0x1.fffffffe0cd16p-1, 0x1.1e767bce37dd7p-29 },
+ { 0x1.fffffffe2f664p-1, 0x1.0b0fc5b6d05a0p-29 },
+ { 0x1.fffffffe4fa30p-1, 0x1.f1e3523b41d7dp-30 },
+ { 0x1.fffffffe6daf7p-1, 0x1.d00de6608effep-30 },
+ { 0x1.fffffffe89b0cp-1, 0x1.b0778b7b3301ap-30 },
+ { 0x1.fffffffea3c9ap-1, 0x1.92fb04ec0f6cfp-30 },
+ { 0x1.fffffffebc1a9p-1, 0x1.77756ec9f78fap-30 },
+ { 0x1.fffffffed2c21p-1, 0x1.5dc61922d5a06p-30 },
+ { 0x1.fffffffee7dc8p-1, 0x1.45ce65699ff6dp-30 },
+ { 0x1.fffffffefb847p-1, 0x1.2f71a5f159970p-30 },
+ { 0x1.ffffffff0dd2bp-1, 0x1.1a94ff571654fp-30 },
+ { 0x1.ffffffff1ede9p-1, 0x1.071f4bbea09ecp-30 },
+ { 0x1.ffffffff2ebdap-1, 0x1.e9f1ff8ddd774p-31 },
+ { 0x1.ffffffff3d843p-1, 0x1.c818223a202c7p-31 },
+ { 0x1.ffffffff4b453p-1, 0x1.a887bd2b4404dp-31 },
+ { 0x1.ffffffff58126p-1, 0x1.8b1a336c5eb6bp-31 },
+ { 0x1.ffffffff63fc3p-1, 0x1.6fab63324088ap-31 },
+ { 0x1.ffffffff6f121p-1, 0x1.56197e30205bap-31 },
+ { 0x1.ffffffff79626p-1, 0x1.3e44e45301b92p-31 },
+ { 0x1.ffffffff82fabp-1, 0x1.281000bfe4c3fp-31 },
+ { 0x1.ffffffff8be77p-1, 0x1.135f28f2d50b4p-31 },
+ { 0x1.ffffffff94346p-1, 0x1.00187dded5975p-31 },
+ { 0x1.ffffffff9bec8p-1, 0x1.dc479de0ef001p-32 },
+ { 0x1.ffffffffa319fp-1, 0x1.bad4fdad3caa1p-32 },
+ { 0x1.ffffffffa9c63p-1, 0x1.9baed3ed27ab8p-32 },
+ { 0x1.ffffffffaffa4p-1, 0x1.7ead9ce4285bbp-32 },
+ { 0x1.ffffffffb5be5p-1, 0x1.63ac6b4edc88ep-32 },
+ { 0x1.ffffffffbb1a2p-1, 0x1.4a88be2a6390cp-32 },
+ { 0x1.ffffffffc014ep-1, 0x1.332259185f1a0p-32 },
+ { 0x1.ffffffffc4b56p-1, 0x1.1d5b1f3793044p-32 },
+ { 0x1.ffffffffc901cp-1, 0x1.0916f04b6e18bp-32 },
+ { 0x1.ffffffffccfffp-1, 0x1.ec77101de6926p-33 },
+ { 0x1.ffffffffd0b56p-1, 0x1.c960bf23153e0p-33 },
+ { 0x1.ffffffffd4271p-1, 0x1.a8bd20fc65ef7p-33 },
+ { 0x1.ffffffffd759dp-1, 0x1.8a61745ec7d1dp-33 },
+ { 0x1.ffffffffda520p-1, 0x1.6e25d0e756261p-33 },
+ { 0x1.ffffffffdd13cp-1, 0x1.53e4f7d1666cbp-33 },
+ { 0x1.ffffffffdfa2dp-1, 0x1.3b7c27a7ddb0ep-33 },
+ { 0x1.ffffffffe202dp-1, 0x1.24caf2c32af14p-33 },
+ { 0x1.ffffffffe4371p-1, 0x1.0fb3186804d0fp-33 },
+ { 0x1.ffffffffe642ap-1, 0x1.f830c0bb41fd7p-34 },
+ { 0x1.ffffffffe8286p-1, 0x1.d3c0f1a91c846p-34 },
+ { 0x1.ffffffffe9eb0p-1, 0x1.b1e5acf351d87p-34 },
+ { 0x1.ffffffffeb8d0p-1, 0x1.92712d259ce66p-34 },
+ { 0x1.ffffffffed10ap-1, 0x1.7538c60a04476p-34 },
+ { 0x1.ffffffffee782p-1, 0x1.5a14b04b47879p-34 },
+ { 0x1.ffffffffefc57p-1, 0x1.40dfd87456f4cp-34 },
+ { 0x1.fffffffff0fa7p-1, 0x1.2977b1172b9d5p-34 },
+ { 0x1.fffffffff218fp-1, 0x1.13bc07e891491p-34 },
+ { 0x1.fffffffff3227p-1, 0x1.ff1dbb4300811p-35 },
+ { 0x1.fffffffff4188p-1, 0x1.d9a880f306bd8p-35 },
+ { 0x1.fffffffff4fc9p-1, 0x1.b6e45220b55e0p-35 },
+ { 0x1.fffffffff5cfdp-1, 0x1.96a0b33f2c4dap-35 },
+ { 0x1.fffffffff6939p-1, 0x1.78b07e9e924acp-35 },
+ { 0x1.fffffffff748ep-1, 0x1.5ce9ab1670dd2p-35 },
+ { 0x1.fffffffff7f0dp-1, 0x1.4325167006bb0p-35 },
+ { 0x1.fffffffff88c5p-1, 0x1.2b3e53538ff3fp-35 },
+ { 0x1.fffffffff91c6p-1, 0x1.15137a7f44864p-35 },
+ { 0x1.fffffffff9a1bp-1, 0x1.0084ff125639dp-35 },
+ { 0x1.fffffffffa1d2p-1, 0x1.daeb0b7311ec7p-36 },
+ { 0x1.fffffffffa8f6p-1, 0x1.b7937d1c40c52p-36 },
+ { 0x1.fffffffffaf92p-1, 0x1.96d082f59ab06p-36 },
+ { 0x1.fffffffffb5b0p-1, 0x1.7872d9fa10aadp-36 },
+ { 0x1.fffffffffbb58p-1, 0x1.5c4e8e37bc7d0p-36 },
+ { 0x1.fffffffffc095p-1, 0x1.423ac0df49a40p-36 },
+ { 0x1.fffffffffc56dp-1, 0x1.2a117230ad284p-36 },
+ { 0x1.fffffffffc9e8p-1, 0x1.13af4f04f9998p-36 },
+ { 0x1.fffffffffce0dp-1, 0x1.fde703724e560p-37 },
+ { 0x1.fffffffffd1e1p-1, 0x1.d77f0c82e7641p-37 },
+ { 0x1.fffffffffd56cp-1, 0x1.b3ee02611d7ddp-37 },
+ { 0x1.fffffffffd8b3p-1, 0x1.92ff33023d5bdp-37 },
+ { 0x1.fffffffffdbbap-1, 0x1.7481a9e69f53fp-37 },
+ { 0x1.fffffffffde86p-1, 0x1.5847eda620959p-37 },
+ { 0x1.fffffffffe11dp-1, 0x1.3e27c1fcc74bdp-37 },
+ { 0x1.fffffffffe380p-1, 0x1.25f9ee0b923dcp-37 },
+ { 0x1.fffffffffe5b6p-1, 0x1.0f9a0686531ffp-37 },
+ { 0x1.fffffffffe7c0p-1, 0x1.f5cc7718082afp-38 },
+ { 0x1.fffffffffe9a2p-1, 0x1.cf7e53d6a2ca5p-38 },
+ { 0x1.fffffffffeb60p-1, 0x1.ac0f5f3229372p-38 },
+ { 0x1.fffffffffecfbp-1, 0x1.8b498644847eap-38 },
+ { 0x1.fffffffffee77p-1, 0x1.6cfa9bcca59dcp-38 },
+ { 0x1.fffffffffefd6p-1, 0x1.50f411d4fd2cdp-38 },
+ { 0x1.ffffffffff11ap-1, 0x1.370ab8327af5ep-38 },
+ { 0x1.ffffffffff245p-1, 0x1.1f167f88c6b6ep-38 },
+ { 0x1.ffffffffff359p-1, 0x1.08f24085d4597p-38 },
+ { 0x1.ffffffffff457p-1, 0x1.e8f70e181d619p-39 },
+ { 0x1.ffffffffff542p-1, 0x1.c324c20e337dcp-39 },
+ { 0x1.ffffffffff61bp-1, 0x1.a03261574b54ep-39 },
+ { 0x1.ffffffffff6e3p-1, 0x1.7fe903cdf5855p-39 },
+ { 0x1.ffffffffff79bp-1, 0x1.6215c58da3450p-39 },
+ { 0x1.ffffffffff845p-1, 0x1.46897d4b69fc6p-39 },
+ { 0x1.ffffffffff8e2p-1, 0x1.2d1877d731b7bp-39 },
+ { 0x1.ffffffffff973p-1, 0x1.159a386b11517p-39 },
+ { 0x1.ffffffffff9f8p-1, 0x1.ffd27ae9393cep-40 },
+ { 0x1.ffffffffffa73p-1, 0x1.d7c593130dd0bp-40 },
+ { 0x1.ffffffffffae4p-1, 0x1.b2cd607c79bcfp-40 },
+ { 0x1.ffffffffffb4cp-1, 0x1.90ae4d3405651p-40 },
+ { 0x1.ffffffffffbadp-1, 0x1.71312dd1759e2p-40 },
+ { 0x1.ffffffffffc05p-1, 0x1.5422ef5d8949dp-40 },
+ { 0x1.ffffffffffc57p-1, 0x1.39544b0ecc957p-40 },
+ { 0x1.ffffffffffca2p-1, 0x1.20997f73e73ddp-40 },
+ { 0x1.ffffffffffce7p-1, 0x1.09ca0eaacd277p-40 },
+ { 0x1.ffffffffffd27p-1, 0x1.e9810295890ecp-41 },
+ { 0x1.ffffffffffd62p-1, 0x1.c2b45b5aa4a1dp-41 },
+ { 0x1.ffffffffffd98p-1, 0x1.9eee068fa7596p-41 },
+ { 0x1.ffffffffffdcap-1, 0x1.7df2b399c10a8p-41 },
+ { 0x1.ffffffffffdf8p-1, 0x1.5f8b87a31bd85p-41 },
+ { 0x1.ffffffffffe22p-1, 0x1.4385c96e9a2d9p-41 },
+ { 0x1.ffffffffffe49p-1, 0x1.29b2933ef4cbcp-41 },
+ { 0x1.ffffffffffe6cp-1, 0x1.11e68a6378f8ap-41 },
+ { 0x1.ffffffffffe8dp-1, 0x1.f7f338086a86bp-42 },
+ { 0x1.ffffffffffeabp-1, 0x1.cf8d7d9ce040ap-42 },
+ { 0x1.ffffffffffec7p-1, 0x1.aa577251ae484p-42 },
+ { 0x1.ffffffffffee1p-1, 0x1.8811d739efb5ep-42 },
+ { 0x1.ffffffffffef8p-1, 0x1.68823e52970bep-42 },
+ { 0x1.fffffffffff0ep-1, 0x1.4b72ae68e8b4cp-42 },
+ { 0x1.fffffffffff22p-1, 0x1.30b14dbe876bcp-42 },
+ { 0x1.fffffffffff34p-1, 0x1.181012ef86610p-42 },
+ { 0x1.fffffffffff45p-1, 0x1.01647ba798744p-42 },
+ { 0x1.fffffffffff54p-1, 0x1.d90e917701675p-43 },
+ { 0x1.fffffffffff62p-1, 0x1.b2a87e86d0c8ap-43 },
+ { 0x1.fffffffffff6fp-1, 0x1.8f53dcb377293p-43 },
+ { 0x1.fffffffffff7bp-1, 0x1.6ed2f2515e933p-43 },
+ { 0x1.fffffffffff86p-1, 0x1.50ecc9ed47f19p-43 },
+ { 0x1.fffffffffff90p-1, 0x1.356cd5ce7799ep-43 },
+ { 0x1.fffffffffff9ap-1, 0x1.1c229a587ab78p-43 },
+ { 0x1.fffffffffffa2p-1, 0x1.04e15ecc7f3f6p-43 },
+ { 0x1.fffffffffffaap-1, 0x1.deffc7e6a6017p-44 },
+ { 0x1.fffffffffffb1p-1, 0x1.b7b040832f310p-44 },
+ { 0x1.fffffffffffb8p-1, 0x1.938e021f36d76p-44 },
+ { 0x1.fffffffffffbep-1, 0x1.7258610b3b233p-44 },
+ { 0x1.fffffffffffc3p-1, 0x1.53d3bfc82a909p-44 },
+ { 0x1.fffffffffffc8p-1, 0x1.37c92babdc2fdp-44 },
+ { 0x1.fffffffffffcdp-1, 0x1.1e06010120f6ap-44 },
+ { 0x1.fffffffffffd1p-1, 0x1.065b9616170d4p-44 },
+ { 0x1.fffffffffffd5p-1, 0x1.e13dd96b3753ap-45 },
+ { 0x1.fffffffffffd9p-1, 0x1.b950d32467392p-45 },
+ { 0x1.fffffffffffdcp-1, 0x1.94a72263259a5p-45 },
+ { 0x1.fffffffffffdfp-1, 0x1.72fd93e036cdcp-45 },
+ { 0x1.fffffffffffe2p-1, 0x1.54164576929abp-45 },
+ { 0x1.fffffffffffe4p-1, 0x1.37b83c521fe96p-45 },
+ { 0x1.fffffffffffe7p-1, 0x1.1daf033182e96p-45 },
+ { 0x1.fffffffffffe9p-1, 0x1.05ca50205d26ap-45 },
+ { 0x1.fffffffffffebp-1, 0x1.dfbb6235639fap-46 },
+ { 0x1.fffffffffffedp-1, 0x1.b7807e294781fp-46 },
+ { 0x1.fffffffffffeep-1, 0x1.9298add70a734p-46 },
+ { 0x1.ffffffffffff0p-1, 0x1.70beaf9c7ffb6p-46 },
+ { 0x1.ffffffffffff1p-1, 0x1.51b2cd6709222p-46 },
+ { 0x1.ffffffffffff3p-1, 0x1.353a6cf7f7fffp-46 },
+ { 0x1.ffffffffffff4p-1, 0x1.1b1fa8cbe84a7p-46 },
+ { 0x1.ffffffffffff5p-1, 0x1.0330f0fd69921p-46 },
+ { 0x1.ffffffffffff6p-1, 0x1.da81670f96f9bp-47 },
+ { 0x1.ffffffffffff7p-1, 0x1.b24a16b4d09aap-47 },
+ { 0x1.ffffffffffff7p-1, 0x1.8d6eeb6efdbd6p-47 },
+ { 0x1.ffffffffffff8p-1, 0x1.6ba91ac734785p-47 },
+ { 0x1.ffffffffffff9p-1, 0x1.4cb7966770ab5p-47 },
+ { 0x1.ffffffffffff9p-1, 0x1.305e9721d0981p-47 },
+ { 0x1.ffffffffffffap-1, 0x1.1667311fff70ap-47 },
+ { 0x1.ffffffffffffbp-1, 0x1.fd3de10d62855p-48 },
+ { 0x1.ffffffffffffbp-1, 0x1.d1aefbcd48d0cp-48 },
+ { 0x1.ffffffffffffbp-1, 0x1.a9cc93c25aca9p-48 },
+ { 0x1.ffffffffffffcp-1, 0x1.85487ee3ea735p-48 },
+ { 0x1.ffffffffffffcp-1, 0x1.63daf8b4b1e0cp-48 },
+ { 0x1.ffffffffffffdp-1, 0x1.45421e69a6ca1p-48 },
+ { 0x1.ffffffffffffdp-1, 0x1.294175802d99ap-48 },
+ { 0x1.ffffffffffffdp-1, 0x1.0fa17bf41068fp-48 },
+ { 0x1.ffffffffffffdp-1, 0x1.f05e82aae2bb9p-49 },
+ { 0x1.ffffffffffffep-1, 0x1.c578101b29058p-49 },
+ { 0x1.ffffffffffffep-1, 0x1.9e39dc5dd2f7cp-49 },
+ { 0x1.ffffffffffffep-1, 0x1.7a553a728bbf2p-49 },
+ { 0x1.ffffffffffffep-1, 0x1.5982008db1304p-49 },
+ { 0x1.ffffffffffffep-1, 0x1.3b7e00422e51bp-49 },
+ { 0x1.ffffffffffffep-1, 0x1.200c898d9ee3ep-49 },
+ { 0x1.fffffffffffffp-1, 0x1.06f5f7eb65a56p-49 },
+ { 0x1.fffffffffffffp-1, 0x1.e00e9148a1d25p-50 },
+ { 0x1.fffffffffffffp-1, 0x1.b623734024e92p-50 },
+ { 0x1.fffffffffffffp-1, 0x1.8fd4e01891bf8p-50 },
+ { 0x1.fffffffffffffp-1, 0x1.6cd44c7470d89p-50 },
+ { 0x1.fffffffffffffp-1, 0x1.4cd9c04158cd7p-50 },
+ { 0x1.fffffffffffffp-1, 0x1.2fa34bf5c8344p-50 },
+ { 0x1.fffffffffffffp-1, 0x1.14f4890ff2461p-50 },
+ { 0x1.fffffffffffffp-1, 0x1.f92c49dfa4df5p-51 },
+ { 0x1.fffffffffffffp-1, 0x1.ccaaea71ab0dfp-51 },
+ { 0x1.fffffffffffffp-1, 0x1.a40829f001197p-51 },
+ { 0x1.0000000000000p+0, 0x1.7eef13b59e96cp-51 },
+ { 0x1.0000000000000p+0, 0x1.5d11e1a252bf5p-51 },
+ { 0x1.0000000000000p+0, 0x1.3e296303b2297p-51 },
+ { 0x1.0000000000000p+0, 0x1.21f47009f43cep-51 },
+ { 0x1.0000000000000p+0, 0x1.083768c5e4541p-51 },
+ { 0x1.0000000000000p+0, 0x1.e1777d831265ep-52 },
+ { 0x1.0000000000000p+0, 0x1.b69f10b0191b5p-52 },
+ { 0x1.0000000000000p+0, 0x1.8f8a3a05b5b52p-52 },
+ { 0x1.0000000000000p+0, 0x1.6be573c40c8e7p-52 },
+ { 0x1.0000000000000p+0, 0x1.4b645ba991fdbp-52 },
+ { 0x1.0000000000000p+0, 0x1.2dc119095729fp-52 },
+ },
+};
diff --git a/pl/math/erfc_1u8.c b/pl/math/erfc_1u8.c
new file mode 100644
index 000000000000..7f2004e9335d
--- /dev/null
+++ b/pl/math/erfc_1u8.c
@@ -0,0 +1,153 @@
+/*
+ * Double-precision erfc(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* Adding Shift (2^45, whose ulp is 1/128 in double precision) to |x| rounds
+ |x| to a multiple of 1/128; erfc uses this for its range reduction. */
+#define Shift 0x1p45
+
+/* Coefficients of p2(r) = P21 r^2 - P20 (P20 is reused inside p3). */
+#define P20 0x1.5555555555555p-2 /* 1/3. */
+#define P21 0x1.5555555555555p-1 /* 2/3. */
+
+/* Coefficients of p4(r) = P42 r^4 - P41 r^2 + P40. */
+#define P40 0x1.999999999999ap-4 /* 1/10. */
+#define P41 0x1.999999999999ap-2 /* 2/5. */
+#define P42 0x1.1111111111111p-3 /* 2/15. */
+
+/* Coefficients of p5(r) = -r (P52 r^4 - P51 r^2 + P50). */
+#define P50 0x1.5555555555555p-3 /* 1/6. */
+#define P51 0x1.c71c71c71c71cp-3 /* 2/9. */
+#define P52 0x1.6c16c16c16c17p-5 /* 2/45. */
+
+/* Qi = (i+1) / i. */
+#define Q5 0x1.3333333333333p0
+#define Q6 0x1.2aaaaaaaaaaabp0
+#define Q7 0x1.2492492492492p0
+#define Q8 0x1.2p0
+#define Q9 0x1.1c71c71c71c72p0
+
+/* Ri = -2 * i / ((i+1)*(i+2)). Parenthesised so that the leading minus
+ cannot interact with a neighbouring operator at the point of use. */
+#define R5 (-0x1.e79e79e79e79ep-3)
+#define R6 (-0x1.b6db6db6db6dbp-3)
+#define R7 (-0x1.8e38e38e38e39p-3)
+#define R8 (-0x1.6c16c16c16c17p-3)
+#define R9 (-0x1.4f2094f2094f2p-3)
+
+/* Fast erfc approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+ poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+ - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5
+ + p6(r) d^6 + ... + p10(r) d^10
+
+ Polynomials p6(r) to p10(r) are computed using recurrence relation
+
+ 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0,
+ with p0 = 1, and p1(r) = -r.
+
+ Values of erfc(r) and scale(r) are read from lookup tables. Stored values
+ are scaled to avoid hitting the subnormal range. The table index is the
+ integer 128 * r, obtained as the difference between the bit patterns of
+ |x| + Shift and Shift.
+
+ Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+
+ Special cases: erfc(nan) = nan, erfc(+inf) = 0, erfc(-inf) = 2, and
+ erfc(x) = 2 for x < -6.
+
+ Maximum measured error: 1.71 ULP
+ erfc(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608
+ want 0x1.e15fcbea3e7adp-608. */
+double
+erfc (double x)
+{
+ /* Split x into its bit pattern, |x| and sign bit. */
+ uint64_t ix = asuint64 (x);
+ uint64_t ia = ix & 0x7fffffffffffffff;
+ double a = asdouble (ia);
+ uint64_t sign = ix & ~0x7fffffffffffffff;
+
+ /* erfc(nan)=nan, erfc(+inf)=0 and erfc(-inf)=2.
+ sign >> 1 is the bit pattern of 2.0 for negative x and of 0.0 otherwise;
+ 1.0 / x contributes a signed zero for infinities and propagates nan. */
+ if (unlikely (ia >= 0x7ff0000000000000))
+ return asdouble (sign >> 1) + 1.0 / x; /* Special cases. */
+
+ /* Return early for large enough negative values. */
+ if (x < -6.0)
+ return 2.0;
+
+ /* For |x| < 3487.0/128.0, the following approximation holds. */
+ if (likely (ia < 0x403b3e0000000000))
+ {
+ /* |x| < 0x1p-511 => accurate to 0.5 ULP. */
+ if (unlikely (ia < asuint64 (0x1p-511)))
+ return 1.0 - x;
+
+ /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale
+ to 2/sqrt(pi), when x reduced to r = 0.
+ z = a + Shift is a multiple of 1/128 in [2^45, 2^46), so its bit pattern
+ is the bit pattern of Shift plus the number of 1/128 steps. Subtracting
+ asuint64 (Shift) strips the exponent bits and leaves i = 128 * r, which
+ stays within the table because a < 3487/128 here. Indexing with the
+ raw bit pattern of z would read far outside the table. */
+ double z = a + Shift;
+ uint64_t i = asuint64 (z) - asuint64 (Shift);
+ double r = z - Shift;
+ /* These values are scaled by 2^128. */
+ double erfcr = __erfc_data.tab[i].erfc;
+ double scale = __erfc_data.tab[i].scale;
+
+ /* erfc(x) ~ erfc(r) - scale * d * poly (r, d). */
+ double d = a - r;
+ double d2 = d * d;
+ double r2 = r * r;
+ /* Compute p_i as a regular (low-order) polynomial. */
+ double p1 = -r;
+ double p2 = fma (P21, r2, -P20);
+ double p3 = -r * fma (P20, r2, -0.5);
+ double p4 = fma (fma (P42, r2, -P41), r2, P40);
+ double p5 = -r * fma (fma (P52, r2, -P51), r2, P50);
+ /* Compute p_i using recurrence relation:
+ p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
+ double p6 = fma (Q5 * r, p5, p4) * R5;
+ double p7 = fma (Q6 * r, p6, p5) * R6;
+ double p8 = fma (Q7 * r, p7, p6) * R7;
+ double p9 = fma (Q8 * r, p8, p7) * R8;
+ double p10 = fma (Q9 * r, p9, p8) * R9;
+ /* Compute polynomial in d using pairwise Horner scheme. */
+ double p90 = fma (p10, d, p9);
+ double p78 = fma (p8, d, p7);
+ double p56 = fma (p6, d, p5);
+ double p34 = fma (p4, d, p3);
+ double p12 = fma (p2, d, p1);
+ double y = fma (p90, d2, p78);
+ y = fma (y, d2, p56);
+ y = fma (y, d2, p34);
+ y = fma (y, d2, p12);
+
+ /* y * d2 + d = d * poly (r, d); subtract its scaled value from erfc(r). */
+ y = fma (-fma (y, d2, d), scale, erfcr);
+
+ /* Handle sign and scale back in a single fma: fac is +/-2^-128 and off
+ is 2.0 for negative x (0.0 otherwise), giving 2 - erfc(|x|) or
+ erfc(|x|). */
+ double off = asdouble (sign >> 1);
+ double fac = asdouble (asuint64 (0x1p-128) | sign);
+ y = fma (y, fac, off);
+
+ if (unlikely (x > 26.0))
+ {
+ /* The underflow exception needs to be signaled explicitly when
+ result gets into the subnormal range. */
+ if (unlikely (y < 0x1p-1022))
+ force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
+ /* Set errno to ERANGE if result rounds to 0. */
+ return __math_check_uflow (y);
+ }
+
+ return y;
+ }
+ /* At or above the threshold (x >= 3487.0/128.0) erfc is constant and needs
+ to raise underflow exception for positive x. Negative inputs of that
+ magnitude have already returned 2.0 above. */
+ return __math_uflow (0);
+}
+
+/* Signature and test annotations for erfc.
+ NOTE(review): the PL_* macros are not defined in this file (pl_sig.h and
+ pl_test.h are included above); confirm their exact semantics there.
+ The intervals presumably follow the branches of erfc: tiny |x| around 0,
+ the table-driven range up to 28, negative inputs down to -6, and the two
+ tails where the result saturates. */
+PL_SIG (S, D, 1, erfc, -6.0, 28.0)
+PL_TEST_ULP (erfc, 1.21)
+PL_TEST_SYM_INTERVAL (erfc, 0, 0x1p-26, 40000)
+PL_TEST_INTERVAL (erfc, 0x1p-26, 28.0, 100000)
+PL_TEST_INTERVAL (erfc, -0x1p-26, -6.0, 100000)
+PL_TEST_INTERVAL (erfc, 28.0, inf, 40000)
+PL_TEST_INTERVAL (erfc, -6.0, -inf, 40000)
diff --git a/pl/math/erfc_4u5.c b/pl/math/erfc_4u5.c
deleted file mode 100644
index e9af9d3bcdb4..000000000000
--- a/pl/math/erfc_4u5.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Double-precision erfc(x) function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-#include "pairwise_horner.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#define AbsMask (0x7fffffffffffffff)
-
-#define xint __erfc_data.interval_bounds
-#define PX __erfc_data.poly
-
-/* Accurate exponential from optimized routines. */
-double
-__exp_dd (double x, double xtail);
-
-static inline double
-eval_poly_horner (double z, int i)
-{
- double z2 = z * z;
-#define C(j) PX[i][j]
- return PAIRWISE_HORNER_12 (z, z2, C);
-}
-
-/* Accurate evaluation of exp(x^2)
- using compensated product (x^2 ~ x*x + e2)
- and the __exp_dd(y,d) routine, that is the
- computation of exp(y+d) with a small correction d<<y. */
-static inline double
-eval_accurate_gaussian (double a)
-{
- double e2;
- double a2 = a * a;
- double aa1 = -fma (0x1.0000002p27, a, -a);
- aa1 = fma (0x1.0000002p27, a, aa1);
- double aa2 = a - aa1;
- e2 = fma (-aa1, aa1, a2);
- e2 = fma (-aa1, aa2, e2);
- e2 = fma (-aa2, aa1, e2);
- e2 = fma (-aa2, aa2, e2);
- return __exp_dd (-a2, e2);
-}
-
-/* Approximation of erfc for |x| > 6.0. */
-static inline double
-approx_erfc_hi (double x, int i)
-{
- double a = fabs (x);
- double z = a - xint[i];
- double p = eval_poly_horner (z, i);
- double e_mx2 = eval_accurate_gaussian (a);
- return p * e_mx2;
-}
-
-static inline int
-get_itv_idx (double x)
-{
- /* Interval bounds are a logarithmic scale, i.e. interval n has
- lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain
- the interval index. */
- double a = asdouble (asuint64 (x) & AbsMask);
- double z = a + 1.0;
- z = z * z;
- z = z * z;
- return (asuint64 (z) >> 52) - 1023;
-}
-
-/* Approximation of erfc for |x| < 6.0. */
-static inline double
-approx_erfc_lo (double x, uint32_t sign, int i)
-{
- double a = fabs (x);
- double z = a - xint[i];
- double p = eval_poly_horner (z, i);
- double e_mx2 = eval_accurate_gaussian (a);
- if (sign)
- return fma (-p, e_mx2, 2.0);
- else
- return p * e_mx2;
-}
-
-/* Top 12 bits of a double (sign and exponent bits). */
-static inline uint32_t
-abstop12 (double x)
-{
- return (asuint64 (x) >> 52) & 0x7ff;
-}
-
-/* Top 32 bits of a double. */
-static inline uint32_t
-top32 (double x)
-{
- return asuint64 (x) >> 32;
-}
-
-/* Fast erfc implementation.
- The approximation uses polynomial approximation of
- exp(x^2) * erfc(x) with fixed orders on 20 intervals.
- Maximum measured error is 4.05 ULPs:.
- erfc(0x1.e8ebf6a2b0801p-2) got 0x1.ff84036f8f0b3p-2
- want 0x1.ff84036f8f0b7p-2. */
-double
-erfc (double x)
-{
- /* Get top words. */
- uint32_t ix = top32 (x); /* We need to compare at most 32 bits. */
- uint32_t ia = ix & 0x7fffffff;
- uint32_t sign = ix >> 31;
-
- /* Handle special cases and small values with a single comparison:
- abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small)
- Special cases erfc(nan)=nan, erfc(+inf)=0 and erfc(-inf)=2
- Errno EDOM does not have to be set in case of erfc(nan).
- Only ERANGE may be set in case of underflow.
- Small values (|x|<small)
- |x|<0x1.0p-56 => accurate up to 0.5 ULP (top12(0x1p-50) = 0x3c7)
- |x|<0x1.0p-50 => accurate up to 1.0 ULP (top12(0x1p-50) = 0x3cd). */
- if (unlikely (abstop12 (x) - 0x3cd >= (abstop12 (INFINITY) & 0x7ff) - 0x3cd))
- {
- if (abstop12 (x) >= 0x7ff)
- return (double) (sign << 1) + 1.0 / x; /* special cases. */
- else
- return 1.0 - x; /* small case. */
- }
- else if (ia < 0x40180000)
- { /* |x| < 6.0. */
- return approx_erfc_lo (x, sign, get_itv_idx (x));
- }
- else if (sign)
- { /* x <= -6.0. */
- return 2.0;
- }
- else if (ia < 0x403c0000)
- { /* 6.0 <= x < 28. */
- return approx_erfc_hi (x, get_itv_idx (x));
- }
- else
- { /* x > 28. */
- return __math_uflow (0);
- }
-}
-
-PL_SIG (S, D, 1, erfc, -6.0, 28.0)
-PL_TEST_ULP (erfc, 3.56)
-PL_TEST_INTERVAL (erfc, 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (erfc, 0x1p-1022, 0x1p-26, 40000)
-PL_TEST_INTERVAL (erfc, -0x1p-1022, -0x1p-26, 40000)
-PL_TEST_INTERVAL (erfc, 0x1p-26, 0x1p5, 40000)
-PL_TEST_INTERVAL (erfc, -0x1p-26, -0x1p3, 40000)
-PL_TEST_INTERVAL (erfc, 0, inf, 40000)
diff --git a/pl/math/erfc_data.c b/pl/math/erfc_data.c
index fa7184fcc871..40f72a4d6d5b 100644
--- a/pl/math/erfc_data.c
+++ b/pl/math/erfc_data.c
@@ -7,139 +7,3501 @@
#include "math_config.h"
-/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double
- precision. Generated using the Remez algorithm on each interval separately
- (see erfc.sollya for more detail). */
+/* Lookup table used in erfc.
+ For each possible rounded input r (multiples of 1/128), between
+ r = 0.0 and r = 3487/128 ~ 27.24 (3488 values):
+ - the first entry __erfc_data.tab.erfc contains the values of erfc(r),
+ - the second entry __erfc_data.tab.scale contains the values of
+ 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
+ they are scaled by a large enough value 2^128 (fits in 8 bits). */
const struct erfc_data __erfc_data = {
-
-/* Bounds for 20 intervals spanning [0x1.0p-50., 31.]. Interval bounds are a
- logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the
- exception of the first interval. */
-.interval_bounds = {
- 0x1.0p-50, /* Tiny boundary. */
- 0x1.837f05c490126p-3, /* 0.189. */
- 0x1.a827997709f7ap-2, /* 0.414. */
- 0x1.5d13f326fe9c8p-1, /* 0.682. */
- 0x1.0p0, /* 1.000. */
- 0x1.60dfc14636e2ap0, /* 1.378. */
- 0x1.d413cccfe779ap0, /* 1.828. */
- 0x1.2e89f995ad3adp1, /* 2.364. */
- 0x1.8p1, /* 3.000. */
- 0x1.e0dfc14636e2ap1, /* 3.757. */
- 0x1.2a09e667f3bcdp2, /* 4.657. */
- 0x1.6e89f995ad3adp2, /* 5.727. */
- 0x1.cp2, /* 7.000. */
- 0x1.106fe0a31b715p3, /* 8.514. */
- 0x1.4a09e667f3bcdp3, /* 10.31. */
- 0x1.8e89f995ad3adp3, /* 12.45. */
- 0x1.ep3, /* 15.00. */
- 0x1.206fe0a31b715p4, /* 18.03. */
- 0x1.5a09e667f3bcdp4, /* 21.63. */
- 0x1.9e89f995ad3adp4, /* 25.91. */
- 0x1.fp4 /* 31.00. */
+ .tab = { { 0x1p128, 0x1.20dd750429b6dp128 },
+ { 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 },
+ { 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 },
+ { 0x1.f27640f9853d9p127, 0x1.20b4d8bac36c1p128 },
+ { 0x1.edf3a9ba22dadp127, 0x1.209546ad13ccfp128 },
+ { 0x1.e971a2c4436aep127, 0x1.206cb4897b148p128 },
+ { 0x1.e4f05010eca8cp127, 0x1.203b261cd0053p128 },
+ { 0x1.e06fd58842c7ep127, 0x1.2000a00ae3804p128 },
+ { 0x1.dbf056fe2df35p127, 0x1.1fbd27cdc72d3p128 },
+ { 0x1.d771f82f02f4ep127, 0x1.1f70c3b4f2cc8p128 },
+ { 0x1.d2f4dcbc2f894p127, 0x1.1f1b7ae44867fp128 },
+ { 0x1.ce792828eae5cp127, 0x1.1ebd5552f795bp128 },
+ { 0x1.c9fefdd6eaf19p127, 0x1.1e565bca400d4p128 },
+ { 0x1.c58681031eb6ap127, 0x1.1de697e413d29p128 },
+ { 0x1.c10fd4c26e896p127, 0x1.1d6e14099944ap128 },
+ { 0x1.bc9b1bfe82687p127, 0x1.1cecdb718d61cp128 },
+ { 0x1.b82879728f11ep127, 0x1.1c62fa1e869b6p128 },
+ { 0x1.b3b80fa82a4bbp127, 0x1.1bd07cdd189acp128 },
+ { 0x1.af4a00f426daap127, 0x1.1b357141d95d5p128 },
+ { 0x1.aade6f7378a0ep127, 0x1.1a91e5a748165p128 },
+ { 0x1.a6757d08215d8p127, 0x1.19e5e92b964abp128 },
+ { 0x1.a20f4b5626818p127, 0x1.19318bae53a04p128 },
+ { 0x1.9dabfbc090901p127, 0x1.1874ddcdfce24p128 },
+ { 0x1.994baf66747adp127, 0x1.17aff0e56ec1p128 },
+ { 0x1.94ee8720076b6p127, 0x1.16e2d7093cd8cp128 },
+ { 0x1.9094a37bbd66ep127, 0x1.160da304ed92fp128 },
+ { 0x1.8c3e24bb73372p127, 0x1.153068581b781p128 },
+ { 0x1.87eb2ad1a4032p127, 0x1.144b3b337c90cp128 },
+ { 0x1.839bd55eaafc8p127, 0x1.135e3075d076bp128 },
+ { 0x1.7f5043ae11862p127, 0x1.12695da8b5bdep128 },
+ { 0x1.7b0894b3ea35cp127, 0x1.116cd8fd67618p128 },
+ { 0x1.76c4e70a390e7p127, 0x1.1068b94962e5ep128 },
+ { 0x1.728558ee694fcp127, 0x1.0f5d1602f7e41p128 },
+ { 0x1.6e4a083ed132fp127, 0x1.0e4a073dc1b91p128 },
+ { 0x1.6a13127843ec1p127, 0x1.0d2fa5a70c168p128 },
+ { 0x1.65e094b3b2413p127, 0x1.0c0e0a8223359p128 },
+ { 0x1.61b2aba3da093p127, 0x1.0ae54fa490723p128 },
+ { 0x1.5d89739304dcfp127, 0x1.09b58f724416bp128 },
+ { 0x1.59650860d6469p127, 0x1.087ee4d9ad247p128 },
+ { 0x1.5545858029b39p127, 0x1.07416b4fbfe7cp128 },
+ { 0x1.512b05f5006e1p127, 0x1.05fd3ecbec298p128 },
+ { 0x1.4d15a4527fdc7p127, 0x1.04b27bc403d3p128 },
+ { 0x1.49057ab900447p127, 0x1.03613f2812dafp128 },
+ { 0x1.44faa2d42c4ap127, 0x1.0209a65e29545p128 },
+ { 0x1.40f535d93160ep127, 0x1.00abcf3e187a9p128 },
+ { 0x1.3cf54c850162p127, 0x1.fe8fb01a47307p127 },
+ { 0x1.38faff1aa574ap127, 0x1.fbbbbef34b4b2p127 },
+ { 0x1.35066561a275dp127, 0x1.f8dc092d58ff8p127 },
+ { 0x1.311796a46f064p127, 0x1.f5f0cdaf15313p127 },
+ { 0x1.2d2ea9aefb636p127, 0x1.f2fa4c16c0019p127 },
+ { 0x1.294bb4cd4b2bdp127, 0x1.eff8c4b1375dbp127 },
+ { 0x1.256ecdca212ccp127, 0x1.ecec7870ebca8p127 },
+ { 0x1.219809edbd524p127, 0x1.e9d5a8e4c934ep127 },
+ { 0x1.1dc77dfcacd02p127, 0x1.e6b4982f158b9p127 },
+ { 0x1.19fd3e36ac96ap127, 0x1.e38988fc46e72p127 },
+ { 0x1.16395e559e218p127, 0x1.e054be79d3042p127 },
+ { 0x1.127bf18c8eadcp127, 0x1.dd167c4cf9d2ap127 },
+ { 0x1.0ec50a86d0dd4p127, 0x1.d9cf06898cdafp127 },
+ { 0x1.0b14bb6728cd8p127, 0x1.d67ea1a8b5368p127 },
+ { 0x1.076b15c70aa28p127, 0x1.d325927fb9d89p127 },
+ { 0x1.03c82ab5eb831p127, 0x1.cfc41e36c7df9p127 },
+ { 0x1.002c0ab8a5018p127, 0x1.cc5a8a3fbea4p127 },
+ { 0x1.f92d8b91d5cc7p126, 0x1.c8e91c4d01368p127 },
+ { 0x1.f210d6a9a6a31p126, 0x1.c5701a484ef9dp127 },
+ { 0x1.eb02147ce245cp126, 0x1.c1efca49a5011p127 },
+ { 0x1.e40161b701275p126, 0x1.be68728e29d5ep127 },
+ { 0x1.dd0ed9ea4bdd6p126, 0x1.bada596f25436p127 },
+ { 0x1.d62a978f7c957p126, 0x1.b745c55905bf8p127 },
+ { 0x1.cf54b4058455fp126, 0x1.b3aafcc27502ep127 },
+ { 0x1.c88d479173ccep126, 0x1.b00a46237d5bep127 },
+ { 0x1.c1d4695e87644p126, 0x1.ac63e7ecc1411p127 },
+ { 0x1.bb2a2f7e5652p126, 0x1.a8b8287ec6a09p127 },
+ { 0x1.b48eaee924501p126, 0x1.a5074e215762p127 },
+ { 0x1.ae01fb7e55a66p126, 0x1.a1519efaf889ep127 },
+ { 0x1.a78428050527ep126, 0x1.9d97610879642p127 },
+ { 0x1.a115462cbbc17p126, 0x1.99d8da149c13fp127 },
+ { 0x1.9ab5668e4930ap126, 0x1.96164fafd8de3p127 },
+ { 0x1.946498acbd766p126, 0x1.925007283d7aap127 },
+ { 0x1.8e22eaf68291ep126, 0x1.8e86458169af8p127 },
+ { 0x1.87f06ac6960c4p126, 0x1.8ab94f6caa71dp127 },
+ { 0x1.81cd2465e1d96p126, 0x1.86e9694134b9ep127 },
+ { 0x1.7bb9230cb40b4p126, 0x1.8316d6f48133dp127 },
+ { 0x1.75b470e454d35p126, 0x1.7f41dc12c9e89p127 },
+ { 0x1.6fbf1708ba47cp126, 0x1.7b6abbb7aaf19p127 },
+ { 0x1.69d91d8a595dap126, 0x1.7791b886e7403p127 },
+ { 0x1.64028b7013867p126, 0x1.73b714a552763p127 },
+ { 0x1.5e3b66b9405a9p126, 0x1.6fdb11b1e0c34p127 },
+ { 0x1.5883b45fd2b63p126, 0x1.6bfdf0beddaf5p127 },
+ { 0x1.52db785a98acap126, 0x1.681ff24b4ab04p127 },
+ { 0x1.4d42b59f95afap126, 0x1.6441563c665d4p127 },
+ { 0x1.47b96e267647ap126, 0x1.60625bd75d07bp127 },
+ { 0x1.423fa2eb1cb59p126, 0x1.5c8341bb23767p127 },
+ { 0x1.3cd553f045d45p126, 0x1.58a445da7c74cp127 },
+ { 0x1.377a8042458d1p126, 0x1.54c5a57629dbp127 },
+ { 0x1.322f25f9da2fdp126, 0x1.50e79d1749ac9p127 },
+ { 0x1.2cf3423f15fdfp126, 0x1.4d0a6889dfd9fp127 },
+ { 0x1.27c6d14c5e341p126, 0x1.492e42d78d2c5p127 },
+ { 0x1.22a9ce717edcbp126, 0x1.4553664273d24p127 },
+ { 0x1.1d9c3416d2b4bp126, 0x1.417a0c4049fdp127 },
+ { 0x1.189dfbc07e69p126, 0x1.3da26d759aef5p127 },
+ { 0x1.13af1e11be721p126, 0x1.39ccc1b136d5ap127 },
+ { 0x1.0ecf92d046d22p126, 0x1.35f93fe7d1b3dp127 },
+ { 0x1.09ff50e7b3f93p126, 0x1.32281e2fd1a92p127 },
+ { 0x1.053e4e6d0c10bp126, 0x1.2e5991bd4cbfcp127 },
+ { 0x1.008c80a24ff1p126, 0x1.2a8dcede3673bp127 },
+ { 0x1.f7d3b7f436013p125, 0x1.26c508f6bd0ffp127 },
+ { 0x1.eeaca836a27ccp125, 0x1.22ff727dd6f7bp127 },
+ { 0x1.e5a3b7c9b56dap125, 0x1.1f3d3cf9ffe5ap127 },
+ { 0x1.dcb8cae2d747fp125, 0x1.1b7e98fe26217p127 },
+ { 0x1.d3ebc436b0f26p125, 0x1.17c3b626c7a12p127 },
+ { 0x1.cb3c8500ea349p125, 0x1.140cc3173f007p127 },
+ { 0x1.c2aaed0bfcfeep125, 0x1.1059ed7740313p127 },
+ { 0x1.ba36dab91c0e9p125, 0x1.0cab61f084b93p127 },
+ { 0x1.b1e02b082b72p125, 0x1.09014c2ca74dap127 },
+ { 0x1.a9a6b99fc973bp125, 0x1.055bd6d32e8d7p127 },
+ { 0x1.a18a60d56673ep125, 0x1.01bb2b87c6968p127 },
+ { 0x1.998af9b56a3aep125, 0x1.fc3ee5d1524bp126 },
+ { 0x1.91a85c0b65519p125, 0x1.f511a91a67d2ap126 },
+ { 0x1.89e25e6a4cef9p125, 0x1.edeeee0959518p126 },
+ { 0x1.8238d634c0127p125, 0x1.e6d6ffaa65a25p126 },
+ { 0x1.7aab97a554544p125, 0x1.dfca26f5bbf88p126 },
+ { 0x1.733a75d6e91b8p125, 0x1.d8c8aace11e63p126 },
+ { 0x1.6be542ccffc2fp125, 0x1.d1d2cfff91594p126 },
+ { 0x1.64abcf7c175b4p125, 0x1.cae8d93f1d7b7p126 },
+ { 0x1.5d8debd20aacep125, 0x1.c40b0729ed548p126 },
+ { 0x1.568b66be6f268p125, 0x1.bd3998457afdbp126 },
+ { 0x1.4fa40e3af3674p125, 0x1.b674c8ffc6283p126 },
+ { 0x1.48d7af53bc19fp125, 0x1.afbcd3afe8ab6p126 },
+ { 0x1.4226162fbddd5p125, 0x1.a911f096fbc26p126 },
+ { 0x1.3b8f0e1912f7p125, 0x1.a27455e14c93cp126 },
+ { 0x1.351261854b991p125, 0x1.9be437a7de946p126 },
+ { 0x1.2eafda1db784ap125, 0x1.9561c7f23a47bp126 },
+ { 0x1.286740c7a7dabp125, 0x1.8eed36b886d93p126 },
+ { 0x1.22385daca7f47p125, 0x1.8886b1e5ecfd1p126 },
+ { 0x1.1c22f842ac1f2p125, 0x1.822e655b417e7p126 },
+ { 0x1.1626d7543522p125, 0x1.7be47af1f5d89p126 },
+ { 0x1.1043c1086777dp125, 0x1.75a91a7f4d2edp126 },
+ { 0x1.0a797aeb152f2p125, 0x1.6f7c69d7d3ef8p126 },
+ { 0x1.04c7c9f4b969p125, 0x1.695e8cd31867ep126 },
+ { 0x1.fe5ce524c8ee5p124, 0x1.634fa54fa285fp126 },
+ { 0x1.f35a715b2f3e1p124, 0x1.5d4fd33729015p126 },
+ { 0x1.e887bf681f218p124, 0x1.575f3483021c3p126 },
+ { 0x1.dde4553ef94dep124, 0x1.517de540ce2a3p126 },
+ { 0x1.d36fb7fa50177p124, 0x1.4babff975a04cp126 },
+ { 0x1.c9296beb09cf1p124, 0x1.45e99bcbb7915p126 },
+ { 0x1.bf10f4a759889p124, 0x1.4036d0468a7a2p126 },
+ { 0x1.b525d5198cb1cp124, 0x1.3a93b1998736cp126 },
+ { 0x1.ab678f8eabedbp124, 0x1.35005285227f1p126 },
+ { 0x1.a1d5a5c4edb96p124, 0x1.2f7cc3fe6f423p126 },
+ { 0x1.986f98f9f96c8p124, 0x1.2a09153529381p126 },
+ { 0x1.8f34e9f8f93a6p124, 0x1.24a55399ea239p126 },
+ { 0x1.8625192879e39p124, 0x1.1f518ae487dc8p126 },
+ { 0x1.7d3fa69816db5p124, 0x1.1a0dc51a9934dp126 },
+ { 0x1.7484120df1b01p124, 0x1.14da0a961fd14p126 },
+ { 0x1.6bf1db13f3983p124, 0x1.0fb6620c550afp126 },
+ { 0x1.63888104d811ap124, 0x1.0aa2d09497f2bp126 },
+ { 0x1.5b478318ff939p124, 0x1.059f59af7a906p126 },
+ { 0x1.532e6073095f2p124, 0x1.00abff4dec7a3p126 },
+ { 0x1.4b3c982c338c7p124, 0x1.f79183b101c5bp125 },
+ { 0x1.4371a960807f8p124, 0x1.edeb406d9c825p125 },
+ { 0x1.3bcd133aa0ffcp124, 0x1.e4652fadcb6b2p125 },
+ { 0x1.344e54ffa23b9p124, 0x1.daff4969c0b04p125 },
+ { 0x1.2cf4ee1a5f0fcp124, 0x1.d1b982c50137p125 },
+ { 0x1.25c05e26b3f99p124, 0x1.c893ce1dcbef7p125 },
+ { 0x1.1eb024fc75285p124, 0x1.bf8e1b1ca2279p125 },
+ { 0x1.17c3c2ba26319p124, 0x1.b6a856c3ed54fp125 },
+ { 0x1.10fab7cf72f94p124, 0x1.ade26b7fbed95p125 },
+ { 0x1.0a548507696cp124, 0x1.a53c4135a6526p125 },
+ { 0x1.03d0ab9273b94p124, 0x1.9cb5bd549b111p125 },
+ { 0x1.fadd5a20258d3p123, 0x1.944ec2e4f563p125 },
+ { 0x1.ee5c1730b147cp123, 0x1.8c07329874652p125 },
+ { 0x1.e21c938a45a83p123, 0x1.83deeada4d25ap125 },
+ { 0x1.d61dd57628999p123, 0x1.7bd5c7df3fe9cp125 },
+ { 0x1.ca5ee4649e31fp123, 0x1.73eba3b5b07b7p125 },
+ { 0x1.bedec8fddb34p123, 0x1.6c205655be72p125 },
+ { 0x1.b39c8d3276d8ap123, 0x1.6473b5b15a7a1p125 },
+ { 0x1.a8973c4b5c03ep123, 0x1.5ce595c455b0ap125 },
+ { 0x1.9dcde2f93a207p123, 0x1.5575c8a468362p125 },
+ { 0x1.933f8f6375f2cp123, 0x1.4e241e912c305p125 },
+ { 0x1.88eb51369acb9p123, 0x1.46f066040a832p125 },
+ { 0x1.7ed039b24c96bp123, 0x1.3fda6bc016994p125 },
+ { 0x1.74ed5bb6bb581p123, 0x1.38e1fae1d6a9dp125 },
+ { 0x1.6b41cbd198bc8p123, 0x1.3206dceef5f87p125 },
+ { 0x1.61cca04a90795p123, 0x1.2b48d9e5dea1cp125 },
+ { 0x1.588cf12f4446bp123, 0x1.24a7b84d38971p125 },
+ { 0x1.4f81d85ecc55bp123, 0x1.1e233d434b813p125 },
+ { 0x1.46aa7194bd324p123, 0x1.17bb2c8d41535p125 },
+ { 0x1.3e05da73b4159p123, 0x1.116f48a6476ccp125 },
+ { 0x1.3593328f6abbep123, 0x1.0b3f52ce8c383p125 },
+ { 0x1.2d519b7653e1ep123, 0x1.052b0b1a174eap125 },
+ { 0x1.254038bac19d6p123, 0x1.fe6460fef468p124 },
+ { 0x1.1d5e2ffb96d4p123, 0x1.f2a901ccafb37p124 },
+ { 0x1.15aaa8ec85205p123, 0x1.e723726b824a9p124 },
+ { 0x1.0e24cd5dd8846p123, 0x1.dbd32ac4c99bp124 },
+ { 0x1.06cbc943d255ap123, 0x1.d0b7a0f921e7cp124 },
+ { 0x1.ff3d957b29b39p122, 0x1.c5d0497c09e74p124 },
+ { 0x1.f13a043742333p122, 0x1.bb1c972f23e5p124 },
+ { 0x1.e38b43cbd0f0fp122, 0x1.b09bfb7d11a84p124 },
+ { 0x1.d62fbdc2e756bp122, 0x1.a64de673e8837p124 },
+ { 0x1.c925e02b41668p122, 0x1.9c31c6df3b1b8p124 },
+ { 0x1.bc6c1da1f3121p122, 0x1.92470a61b6965p124 },
+ { 0x1.b000ed5b4a626p122, 0x1.888d1d8e510a3p124 },
+ { 0x1.a3e2cb2ae9edbp122, 0x1.7f036c0107294p124 },
+ { 0x1.9810378b1f299p122, 0x1.75a96077274bap124 },
+ { 0x1.8c87b7a37834fp122, 0x1.6c7e64e7281cbp124 },
+ { 0x1.8147d54e9cc33p122, 0x1.6381e2980956bp124 },
+ { 0x1.764f1f1f6ddeap122, 0x1.5ab342383d178p124 },
+ { 0x1.6b9c28657041ap122, 0x1.5211ebf41880bp124 },
+ { 0x1.612d893085125p122, 0x1.499d478bca735p124 },
+ { 0x1.5701de53f4d2ep122, 0x1.4154bc68d75c3p124 },
+ { 0x1.4d17c968d062bp122, 0x1.3937b1b31925ap124 },
+ { 0x1.436df0cfabf1dp122, 0x1.31458e6542847p124 },
+ { 0x1.3a02ffb1b7ceep122, 0x1.297db960e4f63p124 },
+ { 0x1.30d5a6013afc5p122, 0x1.21df9981f8e53p124 },
+ { 0x1.27e49879737d3p122, 0x1.1a6a95b1e786fp124 },
+ { 0x1.1f2e909de04d2p122, 0x1.131e14fa1625dp124 },
+ { 0x1.16b24cb8f8f92p122, 0x1.0bf97e95f2a64p124 },
+ { 0x1.0e6e8fda56cf7p122, 0x1.04fc3a0481321p124 },
+ { 0x1.066221d4539d8p122, 0x1.fc4b5e32d6259p123 },
+ { 0x1.fd179e7243e3cp121, 0x1.eeea8c1b1db94p123 },
+ { 0x1.edd4d2aec5adbp121, 0x1.e1d4cf1e2450ap123 },
+ { 0x1.def98c6c79efap121, 0x1.d508f9a1ea64fp123 },
+ { 0x1.d0838121f2418p121, 0x1.c885df3451a07p123 },
+ { 0x1.c2706fa45005ep121, 0x1.bc4a54a84e834p123 },
+ { 0x1.b4be201caa4b4p121, 0x1.b055303221015p123 },
+ { 0x1.a76a63fc95c79p121, 0x1.a4a549829587ep123 },
+ { 0x1.9a7315f1d6a55p121, 0x1.993979e14fffep123 },
+ { 0x1.8dd619d943ca1p121, 0x1.8e109c4622913p123 },
+ { 0x1.81915cb0e3323p121, 0x1.83298d717210ep123 },
+ { 0x1.75a2d48946eb1p121, 0x1.78832c03aa2b1p123 },
+ { 0x1.6a08807632262p121, 0x1.6e1c5893c380bp123 },
+ { 0x1.5ec0687e8dcb2p121, 0x1.63f3f5c4de13bp123 },
+ { 0x1.53c89d8bb3ddbp121, 0x1.5a08e85af27ep123 },
+ { 0x1.491f395818f54p121, 0x1.505a174e9c929p123 },
+ { 0x1.3ec25e5d5af12p121, 0x1.46e66be00224p123 },
+ { 0x1.34b037c1bbfc5p121, 0x1.3dacd1a8d8ccep123 },
+ { 0x1.2ae6f94510dd8p121, 0x1.34ac36ad8dafep123 },
+ { 0x1.2164df2d29765p121, 0x1.2be38b6d92415p123 },
+ { 0x1.18282e31ba3e8p121, 0x1.2351c2f2d1449p123 },
+ { 0x1.0f2f3367cd6aap121, 0x1.1af5d2e04f3f6p123 },
+ { 0x1.0678442cc256fp121, 0x1.12ceb37ff9bc3p123 },
+ { 0x1.fc037c21c3622p120, 0x1.0adb5fcfa8c75p123 },
+ { 0x1.eb940d8319831p120, 0x1.031ad58d56279p123 },
+ { 0x1.db9f17e61c31p120, 0x1.f7182a851bca2p122 },
+ { 0x1.cc218694238a2p120, 0x1.e85c449e377f3p122 },
+ { 0x1.bd18548996419p120, 0x1.da0005e5f28dfp122 },
+ { 0x1.ae808c479c371p120, 0x1.cc0180af00a8bp122 },
+ { 0x1.a05747a543aa7p120, 0x1.be5ecd2fcb5f9p122 },
+ { 0x1.9299afa0246a6p120, 0x1.b1160991ff737p122 },
+ { 0x1.8544fc2c8c1dap120, 0x1.a4255a00b9f03p122 },
+ { 0x1.785674053e8b9p120, 0x1.978ae8b55ce1bp122 },
+ { 0x1.6bcb6c7ad4854p120, 0x1.8b44e6031383ep122 },
+ { 0x1.5fa14942c3d54p120, 0x1.7f5188610ddc8p122 },
+ { 0x1.53d57c461a5a7p120, 0x1.73af0c737bb45p122 },
+ { 0x1.4865856ff632ap120, 0x1.685bb5134ef13p122 },
+ { 0x1.3d4ef27bc49a6p120, 0x1.5d55cb54cd53ap122 },
+ { 0x1.328f5ec350e67p120, 0x1.529b9e8cf9a1ep122 },
+ { 0x1.2824730cacbb4p120, 0x1.482b8455dc491p122 },
+ { 0x1.1e0be557fa673p120, 0x1.3e03d891b37dep122 },
+ { 0x1.144378ad22027p120, 0x1.3422fd6d12e2bp122 },
+ { 0x1.0ac8fce979b96p120, 0x1.2a875b5ffab56p122 },
+ { 0x1.019a4e8d69649p120, 0x1.212f612dee7fbp122 },
+ { 0x1.f16aad1422a55p119, 0x1.181983e5133ddp122 },
+ { 0x1.e030141df7d25p119, 0x1.0f443edc5ce49p122 },
+ { 0x1.cf80d4afc3019p119, 0x1.06ae13b0d3255p122 },
+ { 0x1.bf5908f50b4ap119, 0x1.fcab1483ea7fcp121 },
+ { 0x1.afb4e269693dfp119, 0x1.ec72615a894c4p121 },
+ { 0x1.a090a974cfebep119, 0x1.dcaf3691fc448p121 },
+ { 0x1.91e8bd0830a74p119, 0x1.cd5ec93c12432p121 },
+ { 0x1.83b9923a85f7bp119, 0x1.be7e5ac24963bp121 },
+ { 0x1.75ffb3e6519ap119, 0x1.b00b38d6b3575p121 },
+ { 0x1.68b7c2479902dp119, 0x1.a202bd6372dcep121 },
+ { 0x1.5bde729a6b60fp119, 0x1.94624e78e0fafp121 },
+ { 0x1.4f708eb9fba63p119, 0x1.87275e3a6869ep121 },
+ { 0x1.436af4c058acbp119, 0x1.7a4f6aca256cbp121 },
+ { 0x1.37ca96a6cd1d4p119, 0x1.6dd7fe335823p121 },
+ { 0x1.2c8c79e6f04a3p119, 0x1.61beae53b72b7p121 },
+ { 0x1.21adb71c70c75p119, 0x1.56011cc3b036dp121 },
+ { 0x1.172b79a7a1181p119, 0x1.4a9cf6bda3f4cp121 },
+ { 0x1.0d02ff50ce651p119, 0x1.3f8ff5042a88ep121 },
+ { 0x1.033197ec68c0ep119, 0x1.34d7dbc76d7e5p121 },
+ { 0x1.f3694a0008381p118, 0x1.2a727a89a3f14p121 },
+ { 0x1.e11332d0714c5p118, 0x1.205dac02bd6b9p121 },
+ { 0x1.cf5bf1fed1e7p118, 0x1.1697560347b26p121 },
+ { 0x1.be3eb08ae7c2p118, 0x1.0d1d69569b82dp121 },
+ { 0x1.adb6b810af9e2p118, 0x1.03ede1a45bfeep121 },
+ { 0x1.9dbf721b98dfap118, 0x1.f60d8aa2a88f2p120 },
+ { 0x1.8e54677bb0151p118, 0x1.e4cc4abf7d065p120 },
+ { 0x1.7f713f9cc9784p118, 0x1.d4143a9dfe965p120 },
+ { 0x1.7111bfdfb3cep118, 0x1.c3e1a5f5c077cp120 },
+ { 0x1.6331caf57b5dbp118, 0x1.b430ecf4a83a8p120 },
+ { 0x1.55cd603cc415p118, 0x1.a4fe83fb9db25p120 },
+ { 0x1.48e09b21414bfp118, 0x1.9646f35a76624p120 },
+ { 0x1.3c67b27d50fe7p118, 0x1.8806d70b2fc36p120 },
+ { 0x1.305ef7fdbfb95p118, 0x1.7a3ade6c8b3e5p120 },
+ { 0x1.24c2d787b9e37p118, 0x1.6cdfcbfc1e263p120 },
+ { 0x1.198fd6a0ee7bdp118, 0x1.5ff2750fe782p120 },
+ { 0x1.0ec293d9e6d85p118, 0x1.536fc18f7ce5cp120 },
+ { 0x1.0457c63a9669p118, 0x1.4754abacdf1dcp120 },
+ { 0x1.f49879624a021p117, 0x1.3b9e3f9d06e3fp120 },
+ { 0x1.e139bb05eb49ep117, 0x1.30499b503957fp120 },
+ { 0x1.ce8d4b7fd6c7p117, 0x1.2553ee2a336bfp120 },
+ { 0x1.bc8d516fda8bap117, 0x1.1aba78ba3af89p120 },
+ { 0x1.ab341ee553e25p117, 0x1.107a8c7323a6ep120 },
+ { 0x1.9a7c305336484p117, 0x1.06918b6355624p120 },
+ { 0x1.8a602b88919cp117, 0x1.f9f9cfd9c3035p119 },
+ { 0x1.7adadead962edp117, 0x1.e77448fb66bb9p119 },
+ { 0x1.6be73f45149fbp117, 0x1.d58da68fd117p119 },
+ { 0x1.5d80693276a6dp117, 0x1.c4412bf4b8f0bp119 },
+ { 0x1.4fa19dc42d409p117, 0x1.b38a3af2e55b4p119 },
+ { 0x1.424642c28ff75p117, 0x1.a3645330550ffp119 },
+ { 0x1.3569e18328604p117, 0x1.93cb11a30d765p119 },
+ { 0x1.29082600643fdp117, 0x1.84ba3004a50dp119 },
+ { 0x1.1d1cddf5a82dep117, 0x1.762d84469c18fp119 },
+ { 0x1.11a3f7ffbbfeap117, 0x1.6821000795a03p119 },
+ { 0x1.069982c189a9ep117, 0x1.5a90b00981d93p119 },
+ { 0x1.f7f3581a4dc2cp116, 0x1.4d78bba8ca5fdp119 },
+ { 0x1.e381802242163p116, 0x1.40d564548fad7p119 },
+ { 0x1.cfd6511405b2dp116, 0x1.34a305080681fp119 },
+ { 0x1.bcead7f01492fp116, 0x1.28de11c5031ebp119 },
+ { 0x1.aab859b20ac9ep116, 0x1.1d83170fbf6fbp119 },
+ { 0x1.993851cc9779ap116, 0x1.128eb96be8798p119 },
+ { 0x1.886470ad946a7p116, 0x1.07fdb4dafea5fp119 },
+ { 0x1.78369a4a2cbd6p116, 0x1.fb99b8b8279e1p118 },
+ { 0x1.68a8e4b2fc8c2p116, 0x1.e7f232d9e263p118 },
+ { 0x1.59b596b012aaap116, 0x1.d4fed7195d7e8p118 },
+ { 0x1.4b572664bd2dcp116, 0x1.c2b9cf7f893bfp118 },
+ { 0x1.3d8837fb08d1dp116, 0x1.b11d702b3deb2p118 },
+ { 0x1.30439c56dadf6p116, 0x1.a024365f771bdp118 },
+ { 0x1.23844fd08cb93p116, 0x1.8fc8c794b03b5p118 },
+ { 0x1.174578f6efd5dp116, 0x1.8005f08d6f1efp118 },
+ { 0x1.0b826758a086bp116, 0x1.70d6a46e07ddap118 },
+ { 0x1.003692548d98bp116, 0x1.6235fbd7a4345p118 },
+ { 0x1.eabb2fe335196p115, 0x1.541f340697987p118 },
+ { 0x1.d5e6777a83c2ap115, 0x1.468dadf4080abp118 },
+ { 0x1.c1e6cb6239574p115, 0x1.397ced7af2b15p118 },
+ { 0x1.aeb4423e690e7p115, 0x1.2ce898809244ep118 },
+ { 0x1.9c47374a0974ep115, 0x1.20cc76202c5fbp118 },
+ { 0x1.8a98484a1e8d3p115, 0x1.15246dda49d47p118 },
+ { 0x1.79a0538dd4fc7p115, 0x1.09ec86c75d497p118 },
+ { 0x1.695875fb574ap115, 0x1.fe41cd9bb4eeep117 },
+ { 0x1.59ba0929261c5p115, 0x1.e97ba3b77f306p117 },
+ { 0x1.4abea183bc47p115, 0x1.d57f524723822p117 },
+ { 0x1.3c600c7f477c5p115, 0x1.c245d4b99847ap117 },
+ { 0x1.2e984ed53e777p115, 0x1.afc85e0f82e12p117 },
+ { 0x1.2161a2cd9d894p115, 0x1.9e005769dbc1dp117 },
+ { 0x1.14b67693928cfp115, 0x1.8ce75e9f6f8ap117 },
+ { 0x1.08916a956172p115, 0x1.7c7744d9378f7p117 },
+ { 0x1.f9da9fde95755p114, 0x1.6caa0d3582fe9p117 },
+ { 0x1.e38a4dc27b11bp114, 0x1.5d79eb71e893bp117 },
+ { 0x1.ce283a9e3e33p114, 0x1.4ee1429bf7ccp117 },
+ { 0x1.b9ab1a96e3b3ep114, 0x1.40daa3c89f5b6p117 },
+ { 0x1.a609f7584d32bp114, 0x1.3360ccd23db3ap117 },
+ { 0x1.933c2d52c56c9p114, 0x1.266ea71d4f71ap117 },
+ { 0x1.8139690c0d187p114, 0x1.19ff4663ae9dfp117 },
+ { 0x1.6ff9a4837fa43p114, 0x1.0e0de78654d1ep117 },
+ { 0x1.5f7524a8e81a2p114, 0x1.0295ef6591848p117 },
+ { 0x1.4fa476e59f668p114, 0x1.ef25d37f49fe1p116 },
+ { 0x1.40806eb78e353p114, 0x1.da01102b5f851p116 },
+ { 0x1.3202235dada5p114, 0x1.c5b5412dcafadp116 },
+ { 0x1.2422ed95a3235p114, 0x1.b23a5a23e421p116 },
+ { 0x1.16dc656a14df6p114, 0x1.9f8893d8fd1c1p116 },
+ { 0x1.0a2860115569cp114, 0x1.8d986a4187285p116 },
+ { 0x1.fc01dbb80c841p113, 0x1.7c629a822bc9ep116 },
+ { 0x1.e4c0b066a497p113, 0x1.6be02102b352p116 },
+ { 0x1.ce823f4cc4badp113, 0x1.5c0a378c90bcap116 },
+ { 0x1.b93bf40d5eccbp113, 0x1.4cda5374ea275p116 },
+ { 0x1.a4e3a125adc76p113, 0x1.3e4a23d1f4703p116 },
+ { 0x1.916f7c5f2f764p113, 0x1.30538fbb77ecdp116 },
+ { 0x1.7ed61b5d3db0ap113, 0x1.22f0b496539bep116 },
+ { 0x1.6d0e7045988cbp113, 0x1.161be46ad3b5p116 },
+ { 0x1.5c0fc68335b0cp113, 0x1.09cfa445b00ffp116 },
+ { 0x1.4bd1bfa2aba3dp113, 0x1.fc0d55470cf51p115 },
+ { 0x1.3c4c504792bf8p113, 0x1.e577bbcd49935p115 },
+ { 0x1.2d77bd3a382bcp113, 0x1.cfd4a5adec5cp115 },
+ { 0x1.1f4c988d02149p113, 0x1.bb1a9657ce465p115 },
+ { 0x1.11c3bed8e716ap113, 0x1.a740684026555p115 },
+ { 0x1.04d654905dadp113, 0x1.943d4a1d1ed39p115 },
+ { 0x1.f0fb86d056745p112, 0x1.8208bc334a6a5p115 },
+ { 0x1.d9676faafa27fp112, 0x1.709a8db59f25cp115 },
+ { 0x1.c2e43d417197bp112, 0x1.5feada379d8b7p115 },
+ { 0x1.ad664518e771bp112, 0x1.4ff207314a102p115 },
+ { 0x1.98e25420092dap112, 0x1.40a8c1949f75ep115 },
+ { 0x1.854daa4a49b0fp112, 0x1.3207fb7420eb9p115 },
+ { 0x1.729df6503422ap112, 0x1.2408e9ba3327fp115 },
+ { 0x1.60c95193c542dp112, 0x1.16a501f0e42cap115 },
+ { 0x1.4fc63c27c71aep112, 0x1.09d5f819c9e29p115 },
+ { 0x1.3f8b98f93052ap112, 0x1.fb2b792b40a22p114 },
+ { 0x1.3010aa198de78p112, 0x1.e3bcf436a1a95p114 },
+ { 0x1.214d0d298365p112, 0x1.cd55277c18d05p114 },
+ { 0x1.1338b7e273194p112, 0x1.b7e94604479dcp114 },
+ { 0x1.05cbf4be650abp112, 0x1.a36eec00926ddp114 },
+ { 0x1.f1febf7a916aap111, 0x1.8fdc1b2dcf7b9p114 },
+ { 0x1.d997c68d65936p111, 0x1.7d2737527c3f9p114 },
+ { 0x1.c2556a4e7a90fp111, 0x1.6b4702d7d5849p114 },
+ { 0x1.ac2aa7516ade4p111, 0x1.5a329b7d30748p114 },
+ { 0x1.970b05888fda2p111, 0x1.49e17724f4d41p114 },
+ { 0x1.82ea92dbc1a27p111, 0x1.3a4b60ba9aa4ep114 },
+ { 0x1.6fbdddeff308fp111, 0x1.2b6875310f785p114 },
+ { 0x1.5d79f11e27f6bp111, 0x1.1d312098e9dbap114 },
+ { 0x1.4c144d984e1b8p111, 0x1.0f9e1b4dd36dfp114 },
+ { 0x1.3b82e6ba892a4p111, 0x1.02a8673a94692p114 },
+ { 0x1.2bbc1d878d272p111, 0x1.ec929a665b449p113 },
+ { 0x1.1cb6bc4eaa678p111, 0x1.d4f4b4c8e09edp113 },
+ { 0x1.0e69f27a37df3p111, 0x1.be6abbb10a5aap113 },
+ { 0x1.00cd508511266p111, 0x1.a8e8cc1fadef6p113 },
+ { 0x1.e7b1882bccac5p110, 0x1.94637d5bacfdbp113 },
+ { 0x1.cf09287e48bb9p110, 0x1.80cfdc72220cfp113 },
+ { 0x1.b792bbc489b04p110, 0x1.6e2367dc27f95p113 },
+ { 0x1.a140206ab945p110, 0x1.5c540b4936fd2p113 },
+ { 0x1.8c03d2d39119bp110, 0x1.4b581b8d170fcp113 },
+ { 0x1.77d0e6e5bed21p110, 0x1.3b2652b06c2b2p113 },
+ { 0x1.649b01d73110ap110, 0x1.2bb5cc22e5db6p113 },
+ { 0x1.525654343aad2p110, 0x1.1cfe010e2052dp113 },
+ { 0x1.40f79420887c7p110, 0x1.0ef6c4c84a0fep113 },
+ { 0x1.3073f7cff4a85p110, 0x1.01984165a5f36p113 },
+ { 0x1.20c1303550f0ep110, 0x1.e9b5e8d00ce77p112 },
+ { 0x1.11d563e54f40ep110, 0x1.d16f5716c6c1ap112 },
+ { 0x1.03a72a2bbdc06p110, 0x1.ba4f035d60e03p112 },
+ { 0x1.ec5b0ca2b20f5p109, 0x1.a447b7b03f045p112 },
+ { 0x1.d2bfc6210880ap109, 0x1.8f4ccca7fc90dp112 },
+ { 0x1.ba6c1c6e87c4p109, 0x1.7b5223dac7336p112 },
+ { 0x1.a35068e9c89cfp109, 0x1.684c227fcacefp112 },
+ { 0x1.8d5dbaa383b98p109, 0x1.562fac4329b48p112 },
+ { 0x1.7885ce9f67cdbp109, 0x1.44f21e49054f2p112 },
+ { 0x1.64bb0863504ddp109, 0x1.34894a5e24657p112 },
+ { 0x1.51f06ad20e4c3p109, 0x1.24eb7254ccf83p112 },
+ { 0x1.4019914f0b53ap109, 0x1.160f438c70913p112 },
+ { 0x1.2f2aa92823e8p109, 0x1.07ebd2a2d2844p112 },
+ { 0x1.1f186b432c98bp109, 0x1.f4f12e9ab070ap111 },
+ { 0x1.0fd8160ca94ap109, 0x1.db5ad0b27805cp111 },
+ { 0x1.015f67a552924p109, 0x1.c304efa2c6f4ep111 },
+ { 0x1.e749309831666p108, 0x1.abe09e9144b5ep111 },
+ { 0x1.cd3caa04cdd1bp108, 0x1.95df988e76644p111 },
+ { 0x1.b48774d0f8e45p108, 0x1.80f439b4ee04bp111 },
+ { 0x1.9d189f9f85cbfp108, 0x1.6d11788a69c64p111 },
+ { 0x1.86e0050236315p108, 0x1.5a2adfa0b4bc4p111 },
+ { 0x1.71ce426a561d3p108, 0x1.4834877429b8fp111 },
+ { 0x1.5dd4af79906a9p108, 0x1.37231085c7d9ap111 },
+ { 0x1.4ae555af52cdfp108, 0x1.26eb9daed6f7ep111 },
+ { 0x1.38f2e86f38216p108, 0x1.1783ceac2891p111 },
+ { 0x1.27f0bd5d0e6b1p108, 0x1.08e1badf0fcedp111 },
+ { 0x1.17d2c50b2bfafp108, 0x1.f5f7d88472604p110 },
+ { 0x1.088d83f7e4069p108, 0x1.db92b5212fb8dp110 },
+ { 0x1.f42c17ae0ebf6p107, 0x1.c282cd3957edap110 },
+ { 0x1.d8c3ea48f2889p107, 0x1.aab7abace48dcp110 },
+ { 0x1.beceb1f9f5b3dp107, 0x1.94219bfcb4928p110 },
+ { 0x1.a6399674d366bp107, 0x1.7eb1a2075864ep110 },
+ { 0x1.8ef2a9a18d857p107, 0x1.6a597219a93dap110 },
+ { 0x1.78e8dcd2e6bfdp107, 0x1.570b69502f313p110 },
+ { 0x1.640bf6745325ep107, 0x1.44ba864670882p110 },
+ { 0x1.504c882a97424p107, 0x1.335a62115bce2p110 },
+ { 0x1.3d9be56279ee9p107, 0x1.22df298214423p110 },
+ { 0x1.2bec1a4917edbp107, 0x1.133d96ae7e0ddp110 },
+ { 0x1.1b2fe32991d5cp107, 0x1.046aeabcfcdecp110 },
+ { 0x1.0b5aa42bf5054p107, 0x1.ecb9cfe1d8642p109 },
+ { 0x1.f8c0c2e2ce8dep106, 0x1.d21397ead99cbp109 },
+ { 0x1.dc6b6f1384e18p106, 0x1.b8d094c86d374p109 },
+ { 0x1.c19fa87de37fbp106, 0x1.a0df0f0c626dcp109 },
+ { 0x1.a848df650bea7p106, 0x1.8a2e269750a39p109 },
+ { 0x1.90538b942ea7cp106, 0x1.74adc8f4064d3p109 },
+ { 0x1.79ad1fce5b3d8p106, 0x1.604ea819f007cp109 },
+ { 0x1.6443fdcf0c327p106, 0x1.4d0231928c6f9p109 },
+ { 0x1.50076ad55cc39p106, 0x1.3aba85fe22e2p109 },
+ { 0x1.3ce784b411931p106, 0x1.296a70f414053p109 },
+ { 0x1.2ad53760d7287p106, 0x1.1905613b3abf2p109 },
+ { 0x1.19c232fd50b88p106, 0x1.097f6156f32c5p109 },
+ { 0x1.09a0e254c75ep106, 0x1.f59a20caf6695p108 },
+ { 0x1.f4c8c392fb944p105, 0x1.d9c73698fb1dcp108 },
+ { 0x1.d800ed59bd026p105, 0x1.bf716c6168baep108 },
+ { 0x1.bcd30dfbd611bp105, 0x1.a6852c6b58392p108 },
+ { 0x1.a32923130213fp105, 0x1.8eefd70594a89p108 },
+ { 0x1.8aee4cd06ec1bp105, 0x1.789fb715aae95p108 },
+ { 0x1.740ebfab80eb4p105, 0x1.6383f726a8e04p108 },
+ { 0x1.5e77b6bbd2127p105, 0x1.4f8c96f26a26ap108 },
+ { 0x1.4a1766b6e5e8ap105, 0x1.3caa61607f92p108 },
+ { 0x1.36dcf18a6465cp105, 0x1.2acee2f5ecdb8p108 },
+ { 0x1.24b85a8bf0124p105, 0x1.19ec60b1242edp108 },
+ { 0x1.139a7b37f8475p105, 0x1.09f5cf4dd2877p108 },
+ { 0x1.0374f8792ca97p105, 0x1.f5bd95d8730d8p107 },
+ { 0x1.e87470e4f4246p104, 0x1.d9371e2ff7c35p107 },
+ { 0x1.cbbab18b73217p104, 0x1.be41de54d155ap107 },
+ { 0x1.b0a44aa2f067ep104, 0x1.a4c89e08ef4f3p107 },
+ { 0x1.971a1ec0f40c7p104, 0x1.8cb738399b12cp107 },
+ { 0x1.7f064a8ba8323p104, 0x1.75fa8dbc84becp107 },
+ { 0x1.685414c16188ep104, 0x1.608078a70dcbcp107 },
+ { 0x1.52efdf060cd2p104, 0x1.4c37c0394d094p107 },
+ { 0x1.3ec7176d784b5p104, 0x1.39100d5687bfep107 },
+ { 0x1.2bc82ab9d2302p104, 0x1.26f9df8519bd7p107 },
+ { 0x1.19e277461404p104, 0x1.15e6827001f18p107 },
+ { 0x1.090640946d2d5p104, 0x1.05c803e4831c1p107 },
+ { 0x1.f24946f22d5aep103, 0x1.ed22548cffd35p106 },
+ { 0x1.d45f15b49b35ep103, 0x1.d06ad6ecdf971p106 },
+ { 0x1.b83349fd05191p103, 0x1.b551c847fbc96p106 },
+ { 0x1.9dacb2c432ef4p103, 0x1.9bc09f112b494p106 },
+ { 0x1.84b37e1cbf8ebp103, 0x1.83a1ff0aa239dp106 },
+ { 0x1.6d3126d74b6ccp103, 0x1.6ce1aa3fd7bddp106 },
+ { 0x1.5710631158bffp103, 0x1.576c72b514859p106 },
+ { 0x1.423d13a3b73e1p103, 0x1.43302cc4a0da8p106 },
+ { 0x1.2ea43465e3995p103, 0x1.301ba221dc9bbp106 },
+ { 0x1.1c33cd3c37addp103, 0x1.1e1e857adc568p106 },
+ { 0x1.0adae3e73c2b5p103, 0x1.0d2966b1746f7p106 },
+ { 0x1.f512dd15b73b7p102, 0x1.fa5b4f49cc6b2p105 },
+ { 0x1.d6608dc942687p102, 0x1.dc3ae30b55c16p105 },
+ { 0x1.b9823c51276e1p102, 0x1.bfd7555a3bd68p105 },
+ { 0x1.9e5ce2f93dd76p102, 0x1.a517d9e61628ap105 },
+ { 0x1.84d6fe15b6b93p102, 0x1.8be4f8f6c951fp105 },
+ { 0x1.6cd87746bc76bp102, 0x1.74287ded49339p105 },
+ { 0x1.564a91cd221fp102, 0x1.5dcd669f2cd34p105 },
+ { 0x1.4117d7e2c667dp102, 0x1.48bfd38302871p105 },
+ { 0x1.2d2c0909ebeb9p102, 0x1.34ecf8a3c124ap105 },
+ { 0x1.1a7409475f2f9p102, 0x1.22430f521cbcfp105 },
+ { 0x1.08ddd13bd35e7p102, 0x1.10b1488aeb235p105 },
+ { 0x1.f0b0be22d18e8p101, 0x1.0027c00a263a6p105 },
+ { 0x1.d1a75065a8c74p101, 0x1.e12ee004efc37p104 },
+ { 0x1.b48117843c1c7p101, 0x1.c3e44ae32b16bp104 },
+ { 0x1.99218b8ac7f8ep101, 0x1.a854ea14102a8p104 },
+ { 0x1.7f6dc6010b4adp101, 0x1.8e6761569f45dp104 },
+ { 0x1.674c6ae60d852p101, 0x1.7603bac345f65p104 },
+ { 0x1.50a592e3c968ep101, 0x1.5f1353cdad001p104 },
+ { 0x1.3b62b6aafb0c8p101, 0x1.4980cb3c80949p104 },
+ { 0x1.276e9b681072fp101, 0x1.3537f00b6ad4dp104 },
+ { 0x1.14b54042f445bp101, 0x1.2225b12bffc68p104 },
+ { 0x1.0323ccdc1a3dcp101, 0x1.10380e1adb7e9p104 },
+ { 0x1.e5510173b9a5p100, 0x1.febc107d5efaap103 },
+ { 0x1.c6654733b86adp100, 0x1.df0f2a0ee6947p103 },
+ { 0x1.a964ed354f984p100, 0x1.c14b2188bcee4p103 },
+ { 0x1.8e324c651b064p100, 0x1.a553644f7f07dp103 },
+ { 0x1.74b179d1eba81p100, 0x1.8b0cfce0579ep103 },
+ { 0x1.5cc82d9070d95p100, 0x1.725e7c5dd20f7p103 },
+ { 0x1.465daafca8b1dp100, 0x1.5b2fe547a134p103 },
+ { 0x1.315aaa46df48ep100, 0x1.456a974e92e93p103 },
+ { 0x1.1da9433aebbcfp100, 0x1.30f93c3699078p103 },
+ { 0x1.0b34d93135fcp100, 0x1.1dc7b5b978cf8p103 },
+ { 0x1.f3d41033c44ccp99, 0x1.0bc30c5d52f15p103 },
+ { 0x1.d36d25268cd2bp99, 0x1.f5b2be65a0c7fp102 },
+ { 0x1.b512a1fb1d8fcp99, 0x1.d5f3a8dea7357p102 },
+ { 0x1.98a442fc4fc15p99, 0x1.b82915b03515bp102 },
+ { 0x1.7e03b1cc6d738p99, 0x1.9c3517e789488p102 },
+ { 0x1.651468e010b8ap99, 0x1.81fb7df06136ep102 },
+ { 0x1.4dbb989001d84p99, 0x1.6961b8d641d06p102 },
+ { 0x1.37e00dac4e8b5p99, 0x1.524ec4d916caep102 },
+ { 0x1.236a197bf0b9ap99, 0x1.3cab1343d18d1p102 },
+ { 0x1.10437b1569d7ep99, 0x1.2860757487a01p102 },
+ { 0x1.fcae93fb7323cp98, 0x1.155a09065d4f7p102 },
+ { 0x1.db23c3f816f92p98, 0x1.0384250e4c9fcp102 },
+ { 0x1.bbc1a022c14d4p98, 0x1.e59890b926c78p101 },
+ { 0x1.9e658108af2ep98, 0x1.c642116a8a9e3p101 },
+ { 0x1.82eedbe410407p98, 0x1.a8e405e651ab6p101 },
+ { 0x1.693f22ab61ce9p98, 0x1.8d5f98114f872p101 },
+ { 0x1.5139a5f3661fbp98, 0x1.7397c5a66e307p101 },
+ { 0x1.3ac3788a1b429p98, 0x1.5b71456c5a4c4p101 },
+ { 0x1.25c354b26cb4ep98, 0x1.44d26de513197p101 },
+ { 0x1.122182e9a270fp98, 0x1.2fa31d6371537p101 },
+ { 0x1.ff8f84418d51p97, 0x1.1bcca373b7b43p101 },
+ { 0x1.dd4262aac53e8p97, 0x1.0939ab853339fp101 },
+ { 0x1.bd3474ec16ca5p97, 0x1.efac5187b2863p100 },
+ { 0x1.9f40fd0082b72p97, 0x1.cf1e86235d0e7p100 },
+ { 0x1.8345858c4438dp97, 0x1.b0a68a2128babp100 },
+ { 0x1.6921be96b86b1p97, 0x1.9423165bc4444p100 },
+ { 0x1.50b75c536f927p97, 0x1.7974e743dea3dp100 },
+ { 0x1.39e9f7dcbe479p97, 0x1.607e9eacd105p100 },
+ { 0x1.249ef1c3be817p97, 0x1.4924a74dec729p100 },
+ { 0x1.10bd565b35393p97, 0x1.334d19e0c216p100 },
+ { 0x1.fc5b8748842b2p96, 0x1.1edfa3c5f5ccap100 },
+ { 0x1.d9b4a18a38642p96, 0x1.0bc56f1b54701p100 },
+ { 0x1.b95cede6d524bp96, 0x1.f3d2185e047d9p99 },
+ { 0x1.9b2df77a02225p96, 0x1.d26cb87945e87p99 },
+ { 0x1.7f03b935e8e3ap96, 0x1.b334fac4b9f99p99 },
+ { 0x1.64bc777824f0ep96, 0x1.96076f7918d1cp99 },
+ { 0x1.4c389be9acb83p96, 0x1.7ac2d72fc2c63p99 },
+ { 0x1.355a9387de78cp96, 0x1.614801550319ep99 },
+ { 0x1.2006aeb6bc768p96, 0x1.4979ac8b28927p99 },
+ { 0x1.0c23033e2a376p96, 0x1.333c68e2d0548p99 },
+ { 0x1.f32ea02b55d23p95, 0x1.1e767bce37dd7p99 },
+ { 0x1.d099c5c770f5ap95, 0x1.0b0fc5b6d05ap99 },
+ { 0x1.b05cfe2e99435p95, 0x1.f1e3523b41d7dp98 },
+ { 0x1.92508d0743fc9p95, 0x1.d00de6608effep98 },
+ { 0x1.764f46cf19f9cp95, 0x1.b0778b7b3301bp98 },
+ { 0x1.5c36679625a01p95, 0x1.92fb04ec0f6cfp98 },
+ { 0x1.43e56c3e340a7p95, 0x1.77756ec9f78fap98 },
+ { 0x1.2d3dee1869201p95, 0x1.5dc61922d5a06p98 },
+ { 0x1.182380bd2f494p95, 0x1.45ce65699ff6dp98 },
+ { 0x1.047b91fcb6491p95, 0x1.2f71a5f15997p98 },
+ { 0x1.e45a9790460c1p94, 0x1.1a94ff571654fp98 },
+ { 0x1.c242efeaca76p94, 0x1.071f4bbea09ecp98 },
+ { 0x1.a284cb82c31cep94, 0x1.e9f1ff8ddd774p97 },
+ { 0x1.84f7a1eb7f7f3p94, 0x1.c818223a202c7p97 },
+ { 0x1.697595326d7dcp94, 0x1.a887bd2b4404dp97 },
+ { 0x1.4fdb462549af1p94, 0x1.8b1a336c5eb6bp97 },
+ { 0x1.3807ab51436a8p94, 0x1.6fab63324088ap97 },
+ { 0x1.21dbea9108398p94, 0x1.56197e30205bap97 },
+ { 0x1.0d3b35021d695p94, 0x1.3e44e45301b92p97 },
+ { 0x1.f4154a787cc1bp93, 0x1.281000bfe4c3fp97 },
+ { 0x1.d0623f4f4a28fp93, 0x1.135f28f2d50b4p97 },
+ { 0x1.af2e69a26261p93, 0x1.00187dded5975p97 },
+ { 0x1.904e0b3aa82a3p93, 0x1.dc479de0ef001p96 },
+ { 0x1.73985278fa30ep93, 0x1.bad4fdad3caa1p96 },
+ { 0x1.58e7298af87d9p93, 0x1.9baed3ed27ab8p96 },
+ { 0x1.401708b7e64c6p93, 0x1.7ead9ce4285bbp96 },
+ { 0x1.2906cb94eb40dp93, 0x1.63ac6b4edc88ep96 },
+ { 0x1.139788f2dd663p93, 0x1.4a88be2a6390cp96 },
+ { 0x1.ff58dab4f2a79p92, 0x1.332259185f1ap96 },
+ { 0x1.da552fdd03043p92, 0x1.1d5b1f3793044p96 },
+ { 0x1.b7f1f31b571b6p92, 0x1.0916f04b6e18bp96 },
+ { 0x1.98006c2117e39p92, 0x1.ec77101de6926p95 },
+ { 0x1.7a550f03b145bp92, 0x1.c960bf23153ep95 },
+ { 0x1.5ec74662c5961p92, 0x1.a8bd20fc65ef7p95 },
+ { 0x1.453141082302ap92, 0x1.8a61745ec7d1dp95 },
+ { 0x1.2d6fc2c9e8bcp92, 0x1.6e25d0e756261p95 },
+ { 0x1.1761f87a6dc3dp92, 0x1.53e4f7d1666cbp95 },
+ { 0x1.02e94eb4ac8a5p92, 0x1.3b7c27a7ddb0ep95 },
+ { 0x1.dfd296adef82ap91, 0x1.24caf2c32af14p95 },
+ { 0x1.bc8ed301215ebp91, 0x1.0fb3186804d0fp95 },
+ { 0x1.9bd5efd2c0f15p91, 0x1.f830c0bb41fd7p94 },
+ { 0x1.7d79f2db2d4a5p91, 0x1.d3c0f1a91c846p94 },
+ { 0x1.61500f5293f06p91, 0x1.b1e5acf351d87p94 },
+ { 0x1.47306f04df3d6p91, 0x1.92712d259ce66p94 },
+ { 0x1.2ef5ff0323b28p91, 0x1.7538c60a04476p94 },
+ { 0x1.187e3fb74914dp91, 0x1.5a14b04b47879p94 },
+ { 0x1.03a918225a966p91, 0x1.40dfd87456f4cp94 },
+ { 0x1.e0b15822be4ep90, 0x1.2977b1172b9d5p94 },
+ { 0x1.bce26a2fb7176p90, 0x1.13bc07e891491p94 },
+ { 0x1.9bb1bc445c3c6p90, 0x1.ff1dbb4300811p93 },
+ { 0x1.7cef42e9a617dp90, 0x1.d9a880f306bd8p93 },
+ { 0x1.606e51e0a4963p90, 0x1.b6e45220b55ep93 },
+ { 0x1.460560e841d79p90, 0x1.96a0b33f2c4dap93 },
+ { 0x1.2d8dd47a40ad8p90, 0x1.78b07e9e924acp93 },
+ { 0x1.16e3ca3d4393fp90, 0x1.5ce9ab1670dd2p93 },
+ { 0x1.01e5e8edda47bp90, 0x1.4325167006bbp93 },
+ { 0x1.dcea670907819p89, 0x1.2b3e53538ff3fp93 },
+ { 0x1.b8e9bec48816dp89, 0x1.15137a7f44864p93 },
+ { 0x1.97945aa1c9c35p89, 0x1.0084ff125639dp93 },
+ { 0x1.78b88a4e7107bp89, 0x1.daeb0b7311ec7p92 },
+ { 0x1.5c2827c986b62p89, 0x1.b7937d1c40c53p92 },
+ { 0x1.41b858361b0fep89, 0x1.96d082f59ab06p92 },
+ { 0x1.294150fb19119p89, 0x1.7872d9fa10aadp92 },
+ { 0x1.129e20e732adcp89, 0x1.5c4e8e37bc7dp92 },
+ { 0x1.fb58fa290d436p88, 0x1.423ac0df49a4p92 },
+ { 0x1.d499229819bc6p88, 0x1.2a117230ad284p92 },
+ { 0x1.b0c1a759f7739p88, 0x1.13af4f04f9998p92 },
+ { 0x1.8f9bb6c075486p88, 0x1.fde703724e56p91 },
+ { 0x1.70f4744735c2bp88, 0x1.d77f0c82e7641p91 },
+ { 0x1.549cb0f7ef8e2p88, 0x1.b3ee02611d7ddp91 },
+ { 0x1.3a68a8c1234e1p88, 0x1.92ff33023d5bdp91 },
+ { 0x1.222fc469e8b8cp88, 0x1.7481a9e69f53fp91 },
+ { 0x1.0bcc5fd30f1ddp88, 0x1.5847eda620959p91 },
+ { 0x1.ee3728761897bp87, 0x1.3e27c1fcc74bdp91 },
+ { 0x1.c7fa0c7e3bac7p87, 0x1.25f9ee0b923dcp91 },
+ { 0x1.a4a56eb132a54p87, 0x1.0f9a0686532p91 },
+ { 0x1.8401b5336a8ap87, 0x1.f5cc7718082bp90 },
+ { 0x1.65db58e2358c1p87, 0x1.cf7e53d6a2ca5p90 },
+ { 0x1.4a029a7ea7cd1p87, 0x1.ac0f5f3229372p90 },
+ { 0x1.304b3d1961171p87, 0x1.8b498644847eap90 },
+ { 0x1.188c45630dc53p87, 0x1.6cfa9bcca59dcp90 },
+ { 0x1.029fbd8b92835p87, 0x1.50f411d4fd2cdp90 },
+ { 0x1.dcc4fabf32f1cp86, 0x1.370ab8327af5ep90 },
+ { 0x1.b767ecb334a7ep86, 0x1.1f167f88c6b6ep90 },
+ { 0x1.94ec06c0ff29fp86, 0x1.08f24085d4597p90 },
+ { 0x1.751977e5803d3p86, 0x1.e8f70e181d61ap89 },
+ { 0x1.57bc950253825p86, 0x1.c324c20e337dcp89 },
+ { 0x1.3ca58b816a87fp86, 0x1.a03261574b54ep89 },
+ { 0x1.23a8197d2607ep86, 0x1.7fe903cdf5855p89 },
+ { 0x1.0c9b4b0a6a16fp86, 0x1.6215c58da345p89 },
+ { 0x1.eeb27891d2bb3p85, 0x1.46897d4b69fc6p89 },
+ { 0x1.c77dbfc848866p85, 0x1.2d1877d731b7bp89 },
+ { 0x1.a357936adf17bp85, 0x1.159a386b11517p89 },
+ { 0x1.8203fa7992554p85, 0x1.ffd27ae9393cep88 },
+ { 0x1.634b7f56b0a5cp85, 0x1.d7c593130dd0bp88 },
+ { 0x1.46fada7e6a5fep85, 0x1.b2cd607c79bcfp88 },
+ { 0x1.2ce2a3690576bp85, 0x1.90ae4d3405651p88 },
+ { 0x1.14d707280e6cfp85, 0x1.71312dd1759e2p88 },
+ { 0x1.fd5f08ad2b29ap84, 0x1.5422ef5d8949dp88 },
+ { 0x1.d48d57f7718b7p84, 0x1.39544b0ecc957p88 },
+ { 0x1.aef3ce0add578p84, 0x1.20997f73e73ddp88 },
+ { 0x1.8c52800f939c8p84, 0x1.09ca0eaacd277p88 },
+ { 0x1.6c6e61e57bf9bp84, 0x1.e9810295890ecp87 },
+ { 0x1.4f10e8ebc44a9p84, 0x1.c2b45b5aa4a1dp87 },
+ { 0x1.3407b59d72a5bp84, 0x1.9eee068fa7596p87 },
+ { 0x1.1b2443858c0a1p84, 0x1.7df2b399c10a8p87 },
+ { 0x1.043b9f1621ff3p84, 0x1.5f8b87a31bd85p87 },
+ { 0x1.de4c41eb96b45p83, 0x1.4385c96e9a2d9p87 },
+ { 0x1.b77e5cbd5d147p83, 0x1.29b2933ef4cbcp87 },
+ { 0x1.93c9fc62bfb11p83, 0x1.11e68a6378f8ap87 },
+ { 0x1.72f0c4c8e9bffp83, 0x1.f7f338086a86bp86 },
+ { 0x1.54b92affb11afp83, 0x1.cf8d7d9ce040ap86 },
+ { 0x1.38ee17b150182p83, 0x1.aa577251ae485p86 },
+ { 0x1.1f5e908f70e0cp83, 0x1.8811d739efb5fp86 },
+ { 0x1.07dd6833bb38p83, 0x1.68823e52970bep86 },
+ { 0x1.e481e7f6ac4bcp82, 0x1.4b72ae68e8b4cp86 },
+ { 0x1.bcc58edad5559p82, 0x1.30b14dbe876bcp86 },
+ { 0x1.983ee9896d582p82, 0x1.181012ef8661p86 },
+ { 0x1.76aca47764427p82, 0x1.01647ba798745p86 },
+ { 0x1.57d287836bd3dp82, 0x1.d90e917701675p85 },
+ { 0x1.3b79118c097a1p82, 0x1.b2a87e86d0c8ap85 },
+ { 0x1.216d1b97279a9p82, 0x1.8f53dcb377293p85 },
+ { 0x1.097f82fc04025p82, 0x1.6ed2f2515e933p85 },
+ { 0x1.e709b415656dp81, 0x1.50ecc9ed47f19p85 },
+ { 0x1.beaa3d6c15504p81, 0x1.356cd5ce7799ep85 },
+ { 0x1.9996ed9b83967p81, 0x1.1c229a587ab78p85 },
+ { 0x1.778be2bd9795bp81, 0x1.04e15ecc7f3f6p85 },
+ { 0x1.584a99af8a842p81, 0x1.deffc7e6a6017p84 },
+ { 0x1.3b99832cbefddp81, 0x1.b7b040832f31p84 },
+ { 0x1.2143a112d0466p81, 0x1.938e021f36d76p84 },
+ { 0x1.09182b326b229p81, 0x1.7258610b3b233p84 },
+ { 0x1.e5d47637f5db5p80, 0x1.53d3bfc82a909p84 },
+ { 0x1.bd20fcc3b76d7p80, 0x1.37c92babdc2fdp84 },
+ { 0x1.97c9dda748fc7p80, 0x1.1e06010120f6ap84 },
+ { 0x1.7589207e91ad1p80, 0x1.065b9616170d4p84 },
+ { 0x1.561e669aa7fdbp80, 0x1.e13dd96b3753bp83 },
+ { 0x1.394e7a2ac9fc7p80, 0x1.b950d32467392p83 },
+ { 0x1.1ee2e61eccc99p80, 0x1.94a72263259a5p83 },
+ { 0x1.06a996198f06fp80, 0x1.72fd93e036cdcp83 },
+ { 0x1.e0e8fbad2703ep79, 0x1.54164576929abp83 },
+ { 0x1.b8328ee330ae9p79, 0x1.37b83c521fe96p83 },
+ { 0x1.92e21013a767p79, 0x1.1daf033182e96p83 },
+ { 0x1.70aff489136ebp79, 0x1.05ca50205d26ap83 },
+ { 0x1.515a7c77fab48p79, 0x1.dfbb6235639fap82 },
+ { 0x1.34a53ce0bbb6fp79, 0x1.b7807e294781fp82 },
+ { 0x1.1a58b2b09fdcbp79, 0x1.9298add70a734p82 },
+ { 0x1.0241de6c31e5bp79, 0x1.70beaf9c7ffb6p82 },
+ { 0x1.d863cf753825cp78, 0x1.51b2cd6709222p82 },
+ { 0x1.affb906d0ae09p78, 0x1.353a6cf7f7fffp82 },
+ { 0x1.8afbf9e9520c2p78, 0x1.1b1fa8cbe84a7p82 },
+ { 0x1.691c7c768becep78, 0x1.0330f0fd69921p82 },
+ { 0x1.4a1a79df39cdep78, 0x1.da81670f96f9bp81 },
+ { 0x1.2db8ca9009091p78, 0x1.b24a16b4d09aap81 },
+ { 0x1.13bf4cb384e4ap78, 0x1.8d6eeb6efdbd6p81 },
+ { 0x1.f7f4f88751db4p77, 0x1.6ba91ac734786p81 },
+ { 0x1.cc7626bced452p77, 0x1.4cb7966770ab5p81 },
+ { 0x1.a4ab6470c1c5cp77, 0x1.305e9721d0981p81 },
+ { 0x1.80451c2811052p77, 0x1.1667311fff70ap81 },
+ { 0x1.5efa4d64f59f6p77, 0x1.fd3de10d62855p80 },
+ { 0x1.40880373ed74p77, 0x1.d1aefbcd48d0cp80 },
+ { 0x1.24b0d7368076ep77, 0x1.a9cc93c25aca9p80 },
+ { 0x1.0b3c7b0d960fp77, 0x1.85487ee3ea735p80 },
+ { 0x1.e7eea02e4ed88p76, 0x1.63daf8b4b1e0cp80 },
+ { 0x1.bd6408059b696p76, 0x1.45421e69a6ca1p80 },
+ { 0x1.96826d9e90341p76, 0x1.294175802d99ap80 },
+ { 0x1.72fa4fa12d516p76, 0x1.0fa17bf41068fp80 },
+ { 0x1.5282d2d5803fep76, 0x1.f05e82aae2bb9p79 },
+ { 0x1.34d935f1be064p76, 0x1.c578101b29058p79 },
+ { 0x1.19c050c56d0d7p76, 0x1.9e39dc5dd2f7cp79 },
+ { 0x1.01001dd9c7ccep76, 0x1.7a553a728bbf2p79 },
+ { 0x1.d4ca9b634ecbap75, 0x1.5982008db1304p79 },
+ { 0x1.ab81c5c80cf39p75, 0x1.3b7e00422e51bp79 },
+ { 0x1.85cfacb7477f2p75, 0x1.200c898d9ee3ep79 },
+ { 0x1.6365862923eb9p75, 0x1.06f5f7eb65a56p79 },
+ { 0x1.43fb317b5dc37p75, 0x1.e00e9148a1d25p78 },
+ { 0x1.274ea96044bd7p75, 0x1.b623734024e92p78 },
+ { 0x1.0d23817479c67p75, 0x1.8fd4e01891bf8p78 },
+ { 0x1.ea84dd159259p74, 0x1.6cd44c7470d89p78 },
+ { 0x1.bef1b1a12823ep74, 0x1.4cd9c04158cd7p78 },
+ { 0x1.9730edfda64acp74, 0x1.2fa34bf5c8344p78 },
+ { 0x1.72ede3b7eaa25p74, 0x1.14f4890ff2461p78 },
+ { 0x1.51db1ec3a3087p74, 0x1.f92c49dfa4df5p77 },
+ { 0x1.33b1c9d1576ecp74, 0x1.ccaaea71ab0dfp77 },
+ { 0x1.18311f8a03acap74, 0x1.a40829f001197p77 },
+ { 0x1.fe3bcf4629feap73, 0x1.7eef13b59e96cp77 },
+ { 0x1.d083fda665164p73, 0x1.5d11e1a252bf5p77 },
+ { 0x1.a6d7d18831888p73, 0x1.3e296303b2297p77 },
+ { 0x1.80dcd6603df1bp73, 0x1.21f47009f43cep77 },
+ { 0x1.5e4062d5b6a4ep73, 0x1.083768c5e4542p77 },
+ { 0x1.3eb6ef47c2758p73, 0x1.e1777d831265fp76 },
+ { 0x1.21fb7a81c5444p73, 0x1.b69f10b0191b5p76 },
+ { 0x1.07cefb734d68bp73, 0x1.8f8a3a05b5b53p76 },
+ { 0x1.dfefbdb19ac7ep72, 0x1.6be573c40c8e7p76 },
+ { 0x1.b4831fb12344p72, 0x1.4b645ba991fdbp76 },
+ { 0x1.8cf81557d20b6p72, 0x1.2dc119095729fp76 },
+ { 0x1.68f6f0feb4755p72, 0x1.12bbcfa4d62dep76 },
+ { 0x1.482fa78c40635p72, 0x1.f4343c7d504b9p75 },
+ { 0x1.2a59289a484fbp72, 0x1.c74d4fe1e0e8bp75 },
+ { 0x1.0f30c4d0be5cp72, 0x1.9e614ecbf4af6p75 },
+ { 0x1.ecf3428c48d4fp71, 0x1.791716475420cp75 },
+ { 0x1.bff86d9ec8499p71, 0x1.571d34563050ap75 },
+ { 0x1.970bb87f4ae14p71, 0x1.3829407a207d8p75 },
+ { 0x1.71d0b55b79b86p71, 0x1.1bf74244aed5ap75 },
+ { 0x1.4ff315d036fbdp71, 0x1.024924c7520d1p75 },
+ { 0x1.3125f6a3d257p71, 0x1.d5cc6ba567f29p74 },
+ { 0x1.15233ae8815f2p71, 0x1.ab3560167ccaap74 },
+ { 0x1.f755ea760487dp70, 0x1.846e9dda7a163p74 },
+ { 0x1.c905bbd9ab5a6p70, 0x1.6121d7db32bddp74 },
+ { 0x1.9eebaa0589b4ep70, 0x1.410047ead6894p74 },
+ { 0x1.78a6de0f41b89p70, 0x1.23c2090cdde78p74 },
+ { 0x1.55df1790f2f61p70, 0x1.09257fca001cp74 },
+ { 0x1.3643ec463a3cfp70, 0x1.e1dd9ec677783p73 },
+ { 0x1.198c18435598dp70, 0x1.b5ceb5a13221bp73 },
+ { 0x1.fee9bab9f4e14p69, 0x1.8dbaa11de2037p73 },
+ { 0x1.cf82e0eb6196bp69, 0x1.694680a9a3ee6p73 },
+ { 0x1.a474e7029a919p69, 0x1.481f73b3778e8p73 },
+ { 0x1.7d5af6513e2bep69, 0x1.29f9e7d8fd094p73 },
+ { 0x1.59d93e1d8f57dp69, 0x1.0e90f64b5b103p73 },
+ { 0x1.399c279e4699ap69, 0x1.eb4b9e47b58c9p72 },
+ { 0x1.1c579bbca6885p69, 0x1.bdfe62f60dd7p72 },
+ { 0x1.01c659160612dp69, 0x1.94d1de5c4576fp72 },
+ { 0x1.d352b1ae2694p68, 0x1.6f66f6ab90c3cp72 },
+ { 0x1.a78e8252c204dp68, 0x1.4d67050b31c2ap72 },
+ { 0x1.7fd7c80f3410ep68, 0x1.2e8318008cf89p72 },
+ { 0x1.5bcf92cc55d86p68, 0x1.1273463a1589bp72 },
+ { 0x1.3b1f876b10da7p68, 0x1.f1ec20afad0e2p71 },
+ { 0x1.1d791bb1324a1p68, 0x1.c39fa0d4a5a2bp71 },
+ { 0x1.0294e37abcee8p68, 0x1.99946bf7e02a1p71 },
+ { 0x1.d463db5fa3c13p67, 0x1.73679b24aeb9bp71 },
+ { 0x1.a82a5f4047a5bp67, 0x1.50bf2558ab78fp71 },
+ { 0x1.8011fb05fe09p67, 0x1.314916abfa1eap71 },
+ { 0x1.5bb91decf8a58p67, 0x1.14bad9006f53bp71 },
+ { 0x1.3ac71ce35c1d3p67, 0x1.f5a1196b5bb2ep70 },
+ { 0x1.1ceb656955c59p67, 0x1.c698e001f6d3p70 },
+ { 0x1.01dcc2acf7755p67, 0x1.9beca74b0f147p70 },
+ { 0x1.d2b166911c178p66, 0x1.753637caac6d9p70 },
+ { 0x1.a6459c5b11342p66, 0x1.5218993857afcp70 },
+ { 0x1.7e086accc805dp66, 0x1.323f3f19cff3ep70 },
+ { 0x1.59962aef547b3p66, 0x1.155d47fdb9c94p70 },
+ { 0x1.3894608650edep66, 0x1.f6599b70323cap69 },
+ { 0x1.1ab0e4d284f44p66, 0x1.c6dc8a4bb3ba6p69 },
+ { 0x1.ff4248ebb8299p65, 0x1.9bcfd83a431e9p69 },
+ { 0x1.ce42dd8e4fa23p65, 0x1.74ca889bbacd5p69 },
+ { 0x1.a1e8aa1400997p65, 0x1.516d33e26c04p69 },
+ { 0x1.79c430435a7fcp65, 0x1.31612a7ef535fp69 },
+ { 0x1.557046eb39249p65, 0x1.1457ab75c2489p69 },
+ { 0x1.349127b59b217p65, 0x1.f41259c9550cp68 },
+ { 0x1.16d392dff5104p65, 0x1.c46969ca99a2ep68 },
+ { 0x1.f7d80dc993f2fp64, 0x1.993e82b76e726p68 },
+ { 0x1.c72c149cb214bp64, 0x1.72267ac1b25ap68 },
+ { 0x1.9b270c24cc8fap64, 0x1.4ec0062aeeb78p68 },
+ { 0x1.73585df7b6643p64, 0x1.2eb2d18a2081bp68 },
+ { 0x1.4f59f9910367ep64, 0x1.11aeb0b11d1a1p68 },
+ { 0x1.2ecf5b7f6abe3p64, 0x1.eed5c0bbf1061p67 },
+ { 0x1.1164ab45aa235p64, 0x1.bf4ab21b4f3fp67 },
+ { 0x1.ed9bdbc6f1b0ap63, 0x1.944462d4d5991p67 },
+ { 0x1.bd8c96533b39bp63, 0x1.6d561de54f6a1p67 },
+ { 0x1.921ec84d5860ep63, 0x1.4a1d472804fc8p67 },
+ { 0x1.6ae172414cebap63, 0x1.2a406e25fcb44p67 },
+ { 0x1.476e3b661be8cp63, 0x1.0d6e7662dda9dp67 },
+ { 0x1.276873924f0b4p63, 0x1.e6bba6770e22dp66 },
+ { 0x1.0a7c2c9322f59p63, 0x1.b797ab2ba22d2p66 },
+ { 0x1.e0bad18c4e37dp62, 0x1.8cf813910fdcdp66 },
+ { 0x1.b18eba0be4d24p62, 0x1.666f488db6e0ap66 },
+ { 0x1.86f7884e1caadp62, 0x1.4399f7770045fp66 },
+ { 0x1.608484d592328p62, 0x1.241e1ebbbf4ecp66 },
+ { 0x1.3dcfaee52a8f5p62, 0x1.07aa30ce6a5ap66 },
+ { 0x1.1e7cbac093f27p62, 0x1.dbe8969a24c6fp65 },
+ { 0x1.023827dc88ed9p62, 0x1.ad7301258d788p65 },
+ { 0x1.d16cd999791c3p61, 0x1.837a640fa9d3dp65 },
+ { 0x1.a3666de0788bp61, 0x1.5d90f358d61f6p65 },
+ { 0x1.79e17816df1e8p61, 0x1.3b5342f7be9cp65 },
+ { 0x1.546e385224d1p61, 0x1.1c674ecd152d3p65 },
+ { 0x1.32a7a483e977bp61, 0x1.007b997a0b531p65 },
+ { 0x1.1432649c86c4dp61, 0x1.ce8cc007a6432p64 },
+ { 0x1.f177ce0bd5836p60, 0x1.a109c0bccbc39p64 },
+ { 0x1.bff3166bc36eep60, 0x1.77f5624913c3ap64 },
+ { 0x1.934fc0975fb3p60, 0x1.52e251d5d3b1fp64 },
+ { 0x1.6b13ebb9a5ad4p60, 0x1.316da780bc4d9p64 },
+ { 0x1.46d17a80cc174p60, 0x1.133deb1d3526p64 },
+ { 0x1.2624f3a0a887p60, 0x1.f00460b24acf8p63 },
+ { 0x1.08b47d7733cb6p60, 0x1.bee2903d584f9p63 },
+ { 0x1.dc5de496b181p59, 0x1.92920a7c80e26p63 },
+ { 0x1.ac9615b3c9fd7p59, 0x1.6a9b25345c773p63 },
+ { 0x1.818d3a356669ep59, 0x1.4691b26b9c82fp63 },
+ { 0x1.5acbdab2ed713p59, 0x1.2613e9610f6d1p63 },
+ { 0x1.37e61fd4c0fep59, 0x1.08c969adf0beap63 },
+ { 0x1.187ab3d71db11p59, 0x1.dcc4ac4f59be5p62 },
+ { 0x1.f8637ea4e52acp58, 0x1.ad2d0a9a18288p62 },
+ { 0x1.c577fd709b099p58, 0x1.82498a7cc94b9p62 },
+ { 0x1.97a3dc62119c8p58, 0x1.5ba462dee8a02p62 },
+ { 0x1.6e66137bb7ccap58, 0x1.38d330d8806ap62 },
+ { 0x1.494a3f6a9a70ep58, 0x1.1975e0627306cp62 },
+ { 0x1.27e767bb79ea2p58, 0x1.fa6b5ee8f3088p61 },
+ { 0x1.09dee32687729p58, 0x1.c78892308bd9p61 },
+ { 0x1.ddb6ae2f39381p57, 0x1.99b5ec6741cb3p61 },
+ { 0x1.ad1f9fba4b2abp57, 0x1.7073c400e10dcp61 },
+ { 0x1.816dde4c11ca3p57, 0x1.4b4ee0b3a84d6p61 },
+ { 0x1.5a245d5e5289cp57, 0x1.29df4862ac231p61 },
+ { 0x1.36d26a686daafp57, 0x1.0bc7294e0cbafp61 },
+ { 0x1.171277cbbce9cp57, 0x1.e163bd8df864p60 },
+ { 0x1.f5120b45c00e6p56, 0x1.b0a61bce91993p60 },
+ { 0x1.c1c74b30d0bbp56, 0x1.84cbb00f925fp60 },
+ { 0x1.93b02e5cf0324p56, 0x1.5d5841ce6cb73p60 },
+ { 0x1.6a46f43f3118cp56, 0x1.39dbcd485dd07p60 },
+ { 0x1.45132973bb79bp56, 0x1.19f153b38a108p60 },
+ { 0x1.23a85891dc72bp56, 0x1.fa7b9159fc471p59 },
+ { 0x1.05a4dba466c4ep56, 0x1.c6de3429e31fap59 },
+ { 0x1.d561964307dc4p55, 0x1.98769faac8a1bp59 },
+ { 0x1.a4fa0f13737e8p55, 0x1.6ebf82977acfp59 },
+ { 0x1.7984b636ad1bep55, 0x1.4940bc89fa5aap59 },
+ { 0x1.5281628cb373ap55, 0x1.278e135bcf0a4p59 },
+ { 0x1.2f7cc38bc628dp55, 0x1.0946088b6f8edp59 },
+ { 0x1.100f1aef8eaf5p55, 0x1.dc21972b9e9f4p58 },
+ { 0x1.e7b62ce66acdep54, 0x1.ab3e8cfada51ap58 },
+ { 0x1.b5198cf325114p54, 0x1.7f5483f729c27p58 },
+ { 0x1.87b15da6677afp54, 0x1.57e33e2b1c6dap58 },
+ { 0x1.5ef5de2e68985p54, 0x1.3477480d89e25p58 },
+ { 0x1.3a6d00852a688p54, 0x1.14a8b54629fb2p58 },
+ { 0x1.19a90b14f53afp54, 0x1.f033fa073d52p57 },
+ { 0x1.f88eba04114cbp53, 0x1.bcede5acc0d4p57 },
+ { 0x1.c3dea36b87937p53, 0x1.8ee7b29d0b081p57 },
+ { 0x1.94a28136fa731p53, 0x1.659917bbb6632p57 },
+ { 0x1.6a4b2c9663fa1p53, 0x1.40877b79cd868p57 },
+ { 0x1.44580945b8452p53, 0x1.1f44979177348p57 },
+ { 0x1.22558f1aa9f03p53, 0x1.016d3f035816p57 },
+ { 0x1.03dbf8db89298p53, 0x1.cd508600d0ba8p56 },
+ { 0x1.d11c2965639f6p52, 0x1.9d4ae77a21604p56 },
+ { 0x1.a03065db54a4bp52, 0x1.723974e9529d8p56 },
+ { 0x1.745e6013d8cf3p52, 0x1.4b9a944f57915p56 },
+ { 0x1.4d1f2eb8531p52, 0x1.28f9c9b769ee3p56 },
+ { 0x1.29f9b7c4f56dfp52, 0x1.09ee66b6e99e9p56 },
+ { 0x1.0a814a1dfc5edp52, 0x1.dc34b6999ff72p55 },
+ { 0x1.dca8b63e38fa9p51, 0x1.aa5249b4cca57p55 },
+ { 0x1.aa36c9242f8bcp51, 0x1.7d9db080918bap55 },
+ { 0x1.7d0fbfa6c3c19p51, 0x1.558e88e8945efp55 },
+ { 0x1.54a6b679dd96fp51, 0x1.31aa564e92066p55 },
+ { 0x1.307d4e71272d7p51, 0x1.11831a9c3763dp55 },
+ { 0x1.1022313b11381p51, 0x1.e96c265c21fbfp54 },
+ { 0x1.e65f78e13edcdp50, 0x1.b5d52c19374fep54 },
+ { 0x1.b2959e487c93fp50, 0x1.87a2188252d5fp54 },
+ { 0x1.84436cf62b6f8p50, 0x1.5e440cc8caaf9p54 },
+ { 0x1.5ad66c67f3f63p50, 0x1.393ad199301dep54 },
+ { 0x1.35cb549c616ebp50, 0x1.18135a0647102p54 },
+ { 0x1.14ac7e9322a1ap50, 0x1.f4ccd98eab06bp53 },
+ { 0x1.ee20fae75a2c5p49, 0x1.bfaedff2748c1p53 },
+ { 0x1.b931b883c77f2p49, 0x1.9026a7e3c9538p53 },
+ { 0x1.89e1f8e1d4be6p49, 0x1.659f3419269eep53 },
+ { 0x1.5f9a24050e89fp49, 0x1.3f92e9472ca4cp53 },
+ { 0x1.39d2746cbe57fp49, 0x1.1d89fb6602df9p53 },
+ { 0x1.18115431b6c4ap49, 0x1.fe32077e095c4p52 },
+ { 0x1.f3d3ca19edf64p48, 0x1.c7bf775863df5p52 },
+ { 0x1.bdf55dd9bdcep48, 0x1.970fb0b5580dcp52 },
+ { 0x1.8dd8e25d2255dp48, 0x1.6b88087e4af9fp52 },
+ { 0x1.62e225ebca19p48, 0x1.449de67f2c6b2p52 },
+ { 0x1.3c855ef212badp48, 0x1.21d51dc348d4dp52 },
+ { 0x1.1a4576cd5cddcp48, 0x1.02be7023a443ep52 },
+ { 0x1.f765035c713d8p47, 0x1.cdec7155697e1p51 },
+ { 0x1.c0d0bdeb46ae2p47, 0x1.9c4671c1a6e3cp51 },
+ { 0x1.901afbd3819bep47, 0x1.6feb0af26f865p51 },
+ { 0x1.64a386137b955p47, 0x1.484b1e63b3be4p51 },
+ { 0x1.3ddb15521ce49p47, 0x1.24e68a1458bd7p51 },
+ { 0x1.1b418ba2217c6p47, 0x1.054a9a7c2f05ap51 },
+ { 0x1.f8c8bad8e2a2p46, 0x1.d2214ad33ca5ep50 },
+ { 0x1.c1ba4950b8f4fp46, 0x1.9fb9933adac68p50 },
+ { 0x1.90a0b40dd690cp46, 0x1.72b99eccc462ep50 },
+ { 0x1.64d860502b279p46, 0x1.4a8e4dbe3539cp50 },
+ { 0x1.3dcf1aadc099dp46, 0x1.26b4018ef81f7p50 },
+ { 0x1.1b02414a73357p46, 0x1.06b4fe82cc6aep50 },
+ { 0x1.f7fa3e4bec2aep45, 0x1.d44feffb34893p49 },
+ { 0x1.c0aee6d6b1406p45, 0x1.a15d86bb23572p49 },
+ { 0x1.8f684065398bfp45, 0x1.73ea5ac0d71a9p49 },
+ { 0x1.637ff9397e989p45, 0x1.4b5fdd0f567fap49 },
+ { 0x1.3c618d3c706ebp45, 0x1.2737769828878p49 },
+ { 0x1.1988625955723p45, 0x1.06f8da87263cep49 },
+ { 0x1.f4fc2f6d50e41p44, 0x1.d4710a9e149edp48 },
+ { 0x1.bdb204ff1cda3p44, 0x1.a12cc7b1bf616p48 },
+ { 0x1.8c75a6fa17116p44, 0x1.73793d6253bd7p48 },
+ { 0x1.609ec277b8703p44, 0x1.4abd0af44c7f8p48 },
+ { 0x1.399725d96eb63p44, 0x1.266f2e981ccfbp48 },
+ { 0x1.16d8d1241b86bp44, 0x1.06154a07d21a2p48 },
+ { 0x1.efd875a51d28dp43, 0x1.d2842b40e25fp47 },
+ { 0x1.b8cd873c4de72p43, 0x1.9f27fa465d061p47 },
+ { 0x1.87d2a89e5ac65p43, 0x1.7167c3937ded9p47 },
+ { 0x1.5c3e42539c769p43, 0x1.48a7fb96552cap47 },
+ { 0x1.35791e04cd29fp43, 0x1.245dcbaa25b1bp47 },
+ { 0x1.12fc6cdafd10dp43, 0x1.040d4ab2de626p47 },
+ { 0x1.e8a0077a1ed47p42, 0x1.ce8fcb8dadc2cp46 },
+ { 0x1.b2118f75a4eb7p42, 0x1.9b55e7c11d9e6p46 },
+ { 0x1.818e8b1c2616fp42, 0x1.6dbce02ec5c77p46 },
+ { 0x1.566cdf4525ebp42, 0x1.4527acab6dfebp46 },
+ { 0x1.3014fd204bc71p42, 0x1.210a3ddcb4706p46 },
+ { 0x1.0dffe0bfc0c74p42, 0x1.00e7aba6527c9p46 },
+ { 0x1.df6a8d5e14f11p41, 0x1.c8a12a152d814p45 },
+ { 0x1.a9942579915cdp41, 0x1.95c35893651c9p45 },
+ { 0x1.79bdc576e403ap41, 0x1.6884d52cc9914p45 },
+ { 0x1.4f3d9114d799bp41, 0x1.4047ce663f641p45 },
+ { 0x1.297c4e6eb62fcp41, 0x1.1c7f9c74f3e7cp45 },
+ { 0x1.07f35ef1a4fcp41, 0x1.f95dcee779f74p44 },
+ { 0x1.d455e0a3b0d94p40, 0x1.c0cc007cc808ep44 },
+ { 0x1.9f70bf04a77cep40, 0x1.8e82cd2a6133cp44 },
+ { 0x1.707990a8defefp40, 0x1.61d0ef76712e4p44 },
+ { 0x1.46c779ebb14aep40, 0x1.3a1882865d26ep44 },
+ { 0x1.21c4420bc9879p40, 0x1.16cce86450b2p44 },
+ { 0x1.00ea48df1e7fbp40, 0x1.eee1d41e1e516p43 },
+ { 0x1.c7856a7693627p39, 0x1.b72a1658393d4p43 },
+ { 0x1.93c7abef59a2cp39, 0x1.85ac17b553c4fp43 },
+ { 0x1.65df602b1e0ffp39, 0x1.59b72775450f3p43 },
+ { 0x1.3d256a5ee461dp39, 0x1.32ae03812fcp43 },
+ { 0x1.19053bac5f645p39, 0x1.1004b9cd4bae6p43 },
+ { 0x1.f1f58fe66e142p38, 0x1.e27d88d5289bfp42 },
+ { 0x1.b9216793da422p38, 0x1.abdab3fb224cep42 },
+ { 0x1.86bd6adace04ep38, 0x1.7b5bd9f52a89ep42 },
+ { 0x1.5a104640aeb74p38, 0x1.5051a941eb13p42 },
+ { 0x1.32755417b50ddp38, 0x1.2a20366f6a0dep42 },
+ { 0x1.0f5a5274f5c45p38, 0x1.083cdb1163405p42 },
+ { 0x1.e07ab300dc4b9p37, 0x1.d458a013d18b4p41 },
+ { 0x1.a956163a49613p37, 0x1.9f01f97b2e043p41 },
+ { 0x1.7879eb52380edp37, 0x1.6fb2eaf7d8102p41 },
+ { 0x1.4d30488394e18p37, 0x1.45be480207b14p41 },
+ { 0x1.26d7af2869fc5p37, 0x1.208a2b041836ep41 },
+ { 0x1.04e0c593552f5p37, 0x1.ff1ba8cbc9c8dp40 },
+ { 0x1.cd98a274acae3p36, 0x1.c49f8a8ec4aebp40 },
+ { 0x1.9852d44d7528bp36, 0x1.90c81ede57558p40 },
+ { 0x1.6927c2c3e497p36, 0x1.62d5a948b6358p40 },
+ { 0x1.3f65a98c177c9p36, 0x1.3a1de0952fd2bp40 },
+ { 0x1.1a6ed66936eeap36, 0x1.16098d4b94692p40 },
+ { 0x1.f36ed3084aa81p35, 0x1.ec24d6a8bc072p39 },
+ { 0x1.b986ab7ebdd54p35, 0x1.b3828ebcc128bp39 },
+ { 0x1.864933f3c0573p35, 0x1.8158a3038115ep39 },
+ { 0x1.58f359f0c4e8fp35, 0x1.54eb3e9a3e72bp39 },
+ { 0x1.30d82cb8a968cp35, 0x1.2d93b0174f61ap39 },
+ { 0x1.0d5e5f59de7c1p35, 0x1.0abe0d45fd5c2p39 },
+ { 0x1.dbfc240ab5f81p34, 0x1.d7ce33a39bd89p38 },
+ { 0x1.a47db588b15cfp34, 0x1.a134d30d655e4p38 },
+ { 0x1.736c0d0a31187p34, 0x1.70e16f315ef4p38 },
+ { 0x1.480a1879e8f57p34, 0x1.461cda38e2783p38 },
+ { 0x1.21b0591ce1cfdp34, 0x1.2044a2faebb7bp38 },
+ { 0x1.ff94e3fca1752p33, 0x1.fd91813f8cc8cp37 },
+ { 0x1.c3a9f9558ffap33, 0x1.c2530177987fep37 },
+ { 0x1.8eb738c76b2f2p33, 0x1.8deb61106f334p37 },
+ { 0x1.5fee91a43fef1p33, 0x1.5f91f55e86346p37 },
+ { 0x1.3699940a6a811p33, 0x1.3694e7b13691bp37 },
+ { 0x1.1216c07263dep33, 0x1.1256a18de488bp37 },
+ { 0x1.e3ae49fef5535p32, 0x1.e49705a5ebd5fp36 },
+ { 0x1.aab87fb8e4441p32, 0x1.abefb3186e784p36 },
+ { 0x1.786c3dca158c4p32, 0x1.79dc285401b7dp36 },
+ { 0x1.4c036b7451223p32, 0x1.4d9a4f359ba1ep36 },
+ { 0x1.24cec8453db03p32, 0x1.267e46fd85893p36 },
+ { 0x1.02334e92993b9p32, 0x1.03efdea0a0506p36 },
+ { 0x1.c74fc41217dfbp31, 0x1.cad0afbb569b1p35 },
+ { 0x1.9166837399532p31, 0x1.94e0d5e7a8744p35 },
+ { 0x1.61d46c11dd916p31, 0x1.653d077d9eefp35 },
+ { 0x1.37dbe7711fcd4p31, 0x1.3b2a639494566p35 },
+ { 0x1.12d55c1e73c65p31, 0x1.16038b4af0a0ep35 },
+ { 0x1.e4594b115943bp30, 0x1.ea6c598920c48p34 },
+ { 0x1.aabdabdb93484p30, 0x1.b081aaf25ade1p34 },
+ { 0x1.77f073eb945dfp30, 0x1.7d62079a4e4a6p34 },
+ { 0x1.4b252d0bc8bebp30, 0x1.5042e1a8664edp34 },
+ { 0x1.23a7345c57ccap30, 0x1.287117d29a9e6p34 },
+ { 0x1.00d6f8a57f06ep30, 0x1.054e44f8ee735p34 },
+ { 0x1.c44f136cf3bd8p29, 0x1.cc9cbc5fe04a8p33 },
+ { 0x1.8e38df2790b7ap29, 0x1.95eb2cb828067p33 },
+ { 0x1.5e8f828661e21p29, 0x1.65acfefcd0029p33 },
+ { 0x1.3490e7e2bc31cp29, 0x1.3b20c56ad84f5p33 },
+ { 0x1.0f91b7ff9bb2ap29, 0x1.159b917beb87ap33 },
+ { 0x1.ddf56913a541ep28, 0x1.e90cb5cac7057p32 },
+ { 0x1.a48cc1b8a7bc7p28, 0x1.aeb7659e5f7efp32 },
+ { 0x1.71fde01e2ca8cp28, 0x1.7b4b752e86e5fp32 },
+ { 0x1.4578e0b906b32p28, 0x1.4df8ace15322ep32 },
+ { 0x1.1e4659a2a2156p28, 0x1.26072a17961ap32 },
+ { 0x1.f788fc218597bp27, 0x1.02d48c75e7d9bp32 },
+ { 0x1.bac92daac0b9dp27, 0x1.c7a2ecd5f05ap31 },
+ { 0x1.85518c3484796p27, 0x1.90feaede7f2aep31 },
+ { 0x1.56441b55bfff1p27, 0x1.60dcef1cedc3ap31 },
+ { 0x1.2cdd203ab43a1p27, 0x1.36787980e7387p31 },
+ { 0x1.08700c199ad4fp27, 0x1.112346e13dd7ep31 },
+ { 0x1.d0c9857c390f3p26, 0x1.e087915129a98p30 },
+ { 0x1.986a650394095p26, 0x1.a6a5096da5b7dp30 },
+ { 0x1.66d6688315ad6p26, 0x1.73aff07c7874ep30 },
+ { 0x1.3b3d55ebd8547p26, 0x1.46d572e10e216p30 },
+ { 0x1.14e7b714e7093p26, 0x1.1f5ba17e5a90bp30 },
+ { 0x1.e667d9a8bcd9ep25, 0x1.f93d0d186fbcdp29 },
+ { 0x1.ab2733e383ad8p25, 0x1.bc1b22cec72bp29 },
+ { 0x1.7712b76c8c7f6p25, 0x1.86529e9df069cp29 },
+ { 0x1.494d8e1d4fc61p25, 0x1.5702d052bf73ap29 },
+ { 0x1.2115447c6627dp25, 0x1.2d65aee08874cp29 },
+ { 0x1.fb7d503fc65c8p24, 0x1.08ccb49580d43p29 },
+ { 0x1.bd660913b938cp24, 0x1.d13c32a98512bp28 },
+ { 0x1.86db66e158524p24, 0x1.98a4bfd5a5fadp28 },
+ { 0x1.56f3ed5aa4222p24, 0x1.66e459a7794f4p28 },
+ { 0x1.2ce2265a96befp24, 0x1.3b28bbce3c1c6p28 },
+ { 0x1.07f14a8d0c116p24, 0x1.14b8b6b67144ep28 },
+ { 0x1.cf049ebedf60dp23, 0x1.e5e26dbef0e28p27 },
+ { 0x1.96129ca292f7ep23, 0x1.aa854b5c4f131p27 },
+ { 0x1.6416763f6b3bcp23, 0x1.765d329106241p27 },
+ { 0x1.3837bf030f4a8p23, 0x1.488b9479ee1c4p27 },
+ { 0x1.11b82880134f9p23, 0x1.204c8d940530bp27 },
+ { 0x1.dfe0c1b8af1f3p22, 0x1.f9e77238e0031p26 },
+ { 0x1.a49aa1651cfcap22, 0x1.bbd2c8fd7e193p26 },
+ { 0x1.709b5a3a79128p22, 0x1.85502f16a0f8dp26 },
+ { 0x1.42ffa7e9ace3fp22, 0x1.5574ceffe3945p26 },
+ { 0x1.1affd2eccd616p22, 0x1.2b72182c97af5p26 },
+ { 0x1.efd8be43ac9a9p21, 0x1.06925da53a0fcp26 },
+ { 0x1.b2564005de7e5p21, 0x1.cc6bb6d71090dp25 },
+ { 0x1.7c694cd2b4ffdp21, 0x1.93a02d0c97221p25 },
+ { 0x1.4d23fa69bd814p21, 0x1.61cb1a027e057p25 },
+ { 0x1.23b556e6e918ep21, 0x1.361358dd1f243p25 },
+ { 0x1.fecbcf04dca9p20, 0x1.0fba0d2660d89p25 },
+ { 0x1.bf29264dcdc82p20, 0x1.dc2ef387bd0ep24 },
+ { 0x1.8767d7fc43eb6p20, 0x1.a130711aadcdap24 },
+ { 0x1.568f9937abc79p20, 0x1.6d758e1ac9659p24 },
+ { 0x1.2bc67d8c20136p20, 0x1.401abca024479p24 },
+ { 0x1.064d4616b0094p20, 0x1.185819a7f8c6ap24 },
+ { 0x1.caf8458ad2a12p19, 0x1.eafc2b00a99b1p23 },
+ { 0x1.917faff93e54p19, 0x1.ade505ba61e89p23 },
+ { 0x1.5f2e79283b1cap19, 0x1.785c00b5cb27ep23 },
+ { 0x1.33220b1da4f59p19, 0x1.4973634932c1ap23 },
+ { 0x1.0c93ac678b0ccp19, 0x1.205a7d78be568p23 },
+ { 0x1.d5aa313452daep18, 0x1.f8b4440d68221p22 },
+ { 0x1.9a9b05368c88bp18, 0x1.b9a31a7b9868cp22 },
+ { 0x1.66ede7f0c2d55p18, 0x1.826756e1a42e2p22 },
+ { 0x1.39b7fc18e5891p18, 0x1.5209676e4b424p22 },
+ { 0x1.122b662569616p18, 0x1.27b019965e362p22 },
+ { 0x1.df2779ceabfc8p17, 0x1.029ce648133fdp22 },
+ { 0x1.a2a5d2945d2b7p17, 0x1.c45161cd95fe8p21 },
+ { 0x1.6dbccf848794ap17, 0x1.8b81d680cdfc5p21 },
+ { 0x1.3f79bf21caa96p17, 0x1.59ca24a7521ddp21 },
+ { 0x1.17080ae674896p17, 0x1.2e48f266999cfp21 },
+ { 0x1.e75b024885f54p16, 0x1.0838b13324d03p21 },
+ { 0x1.a98e26924c6c8p16, 0x1.cdd86b83e679dp20 },
+ { 0x1.738bf4bc8d296p16, 0x1.93977456406ddp20 },
+ { 0x1.445a6a9a273c6p16, 0x1.60a47aca18e96p20 },
+ { 0x1.1b1eabeffc3a5p16, 0x1.341669953fe1cp20 },
+ { 0x1.ee324e1fde417p15, 0x1.0d210b765b3d6p20 },
+ { 0x1.af4465e9c5668p15, 0x1.d622fa53c02cep19 },
+ { 0x1.784e3008fb46bp15, 0x1.9a961d6383ef7p19 },
+ { 0x1.484eecd2f1383p15, 0x1.66890cd0bf55fp19 },
+ { 0x1.1e65fd1ef2701p15, 0x1.390b73f2a4fbp19 },
+ { 0x1.f39dc6baaccd7p14, 0x1.114ae59581395p19 },
+ { 0x1.b3bb863d26278p14, 0x1.dd1e5296953a3p18 },
+ { 0x1.7bf89f052b591p14, 0x1.a06dfa21b6c59p18 },
+ { 0x1.4b4e35dbe0cddp14, 0x1.6b6a7a27c9005p18 },
+ { 0x1.20d6781986167p14, 0x1.3d1cca3d4f6d8p18 },
+ { 0x1.f790f6877f51ep13, 0x1.14acc164c64fep18 },
+ { 0x1.b6e93fa7299b3p13, 0x1.e2ba80b9c3a1bp17 },
+ { 0x1.7e82cde922833p13, 0x1.a511aa3827999p17 },
+ { 0x1.4d515a14a6132p13, 0x1.6f3d9139319edp17 },
+ { 0x1.226a790f97768p13, 0x1.404113d7d18e6p17 },
+ { 0x1.fa02b8ac73416p12, 0x1.173ed60fcd6fap17 },
+ { 0x1.b8c634233722p12, 0x1.e6ea95e92c624p16 },
+ { 0x1.7fe6d7fbcef2cp12, 0x1.a8767775dd309p16 },
+ { 0x1.4e53acc7531b1p12, 0x1.71f97a2983044p16 },
+ { 0x1.231e547065724p12, 0x1.42710a88aab19p16 },
+ { 0x1.faed5c4559717p11, 0x1.18fb2ded8ebb1p16 },
+ { 0x1.b94e0bfb59934p11, 0x1.e9a4d9b21386ep15 },
+ { 0x1.80217e57d8a3fp11, 0x1.aa947efe69879p15 },
+ { 0x1.4e52d23cf50bp11, 0x1.7397d8e2bd385p15 },
+ { 0x1.22f0652094ae6p11, 0x1.43a79684f6ef6p15 },
+ { 0x1.fa4eba730bf6p10, 0x1.19ddbd8138a9p15 },
+ { 0x1.b87f86a26fad7p10, 0x1.eae2ef93df996p14 },
+ { 0x1.7f323487ff94ap10, 0x1.ab66cfccafb75p14 },
+ { 0x1.4d4ec8ea8ee67p10, 0x1.7414e5b5ca43cp14 },
+ { 0x1.21e112e39bf18p10, 0x1.43e1e22ebfdb4p14 },
+ { 0x1.f8283ec45f117p9, 0x1.19e4732be2ffp14 },
+ { 0x1.b65c7f9f1fbedp9, 0x1.eaa1efb3b003ep13 },
+ { 0x1.7d1b22b6810f6p9, 0x1.aaeb7de6855e2p13 },
+ { 0x1.4b49e984886ep9, 0x1.736f7c0d13f06p13 },
+ { 0x1.1ff2d0d5a2649p9, 0x1.431f651be2ff4p13 },
+ { 0x1.f47ee1cab73ddp8, 0x1.190f3f39e9af4p13 },
+ { 0x1.b2e9e76c8d9f9p8, 0x1.e8e2722ca46cfp12 },
+ { 0x1.79e11d635b9a7p8, 0x1.a923a9d8d5019p12 },
+ { 0x1.4848ddf7dfffep8, 0x1.71a91ee04e82cp12 },
+ { 0x1.1d2a13fdd2709p8, 0x1.4161e6298ed3ap12 },
+ { 0x1.ef5b15f73200ap7, 0x1.176014201ab17p12 },
+ { 0x1.ae2fb07705cc3p7, 0x1.e5a88cbf394e4p11 },
+ { 0x1.758b92cdfdc64p7, 0x1.a6137c537bf6dp11 },
+ { 0x1.44528f79b1b51p7, 0x1.6ec5f2d1367f4p11 },
+ { 0x1.198d422be3f8cp7, 0x1.3ead7491061afp11 },
+ { 0x1.e8c8a7276c93p6, 0x1.14dadee76975ap11 },
+ { 0x1.a838b09afcf62p6, 0x1.e0fbc2ec572b9p10 },
+ { 0x1.70246e766d2f3p6, 0x1.a1c215fcd0beap10 },
+ { 0x1.3f700c0d99876p6, 0x1.6accae115453ep10 },
+ { 0x1.1524997d01ap6, 0x1.3b08582357e32p10 },
+ { 0x1.e0d68d9047f7ap5, 0x1.118577f06b2f2p10 },
+ { 0x1.a11277ca2bd3fp5, 0x1.dae6e8d292a1ep9 },
+ { 0x1.69b7f34ec048ep5, 0x1.9c3973d4c9b08p9 },
+ { 0x1.39ac6410ceb63p5, 0x1.65c67e684d1e6p9 },
+ { 0x1.0ffa110b113fp5, 0x1.367af901b137p9 },
+ { 0x1.d796b4f7aaf7fp4, 0x1.0d678c614f535p9 },
+ { 0x1.98cd1cb38dccp4, 0x1.d377f96b9fd62p8 },
+ { 0x1.62548d6675835p4, 0x1.958648bd6035p8 },
+ { 0x1.331480815e7cdp4, 0x1.5fbee5e7590f4p8 },
+ { 0x1.0a19336cc73a1p4, 0x1.310fbf558eca2p8 },
+ { 0x1.cd1db96a6c6efp3, 0x1.088a80b837328p8 },
+ { 0x1.8f7b007e1de49p3, 0x1.cabfe10b3371ap7 },
+ { 0x1.5a0a9c047e3c7p3, 0x1.8db7ccf7600f4p7 },
+ { 0x1.2bb6f2dd8e254p3, 0x1.58c38f07b7c3bp7 },
+ { 0x1.038ef3cbdc1c7p3, 0x1.2ad2ebb6268bdp7 },
+ { 0x1.c1829acfb62b3p2, 0x1.02f94d1fb1ba4p7 },
+ { 0x1.85308ad209551p2, 0x1.c0d23d3daadadp6 },
+ { 0x1.50ec3549a202dp2, 0x1.84df8496cc3aep6 },
+ { 0x1.23a3bf963c1ebp2, 0x1.50e4191e1b76cp6 },
+ { 0x1.f8d2fce0ebb41p1, 0x1.23d2690dc7344p6 },
+ { 0x1.b4de68e608347p1, 0x1.f980a88588961p5 },
+ { 0x1.7a03df8f9f479p1, 0x1.b5c5135a44acbp5 },
+ { 0x1.470ce4924af72p1, 0x1.7b10fe1f0aeaap5 },
+ { 0x1.1aec242758b4fp1, 0x1.4831de32e25bdp5 },
+ { 0x1.e9700b697ec96p0, 0x1.1c1d98f1b1f71p5 },
+ { 0x1.a74be9568f922p0, 0x1.ebda6af103d07p4 },
+ { 0x1.6e0c8fadbb05p0, 0x1.a9b07f491a273p4 },
+ { 0x1.3c8164e42f29cp0, 0x1.70618a9c019dap4 },
+ { 0x1.11a259faba91ep0, 0x1.3ebfb36da371bp4 },
+ { 0x1.d91518c2acaf6p-1, 0x1.13c51b7852ecp4 },
+ { 0x1.98e739a118b5ep-1, 0x1.dd1d36683753bp3 },
+ { 0x1.616346ca3be0ep-1, 0x1.9cae5c1f5de61p3 },
+ { 0x1.315f58c13df9cp-1, 0x1.64e7f0a95542fp3 },
+ { 0x1.07d957435b8c4p-1, 0x1.34a1a5595e9cbp3 },
+ { 0x1.c7e35cf4db634p-2, 0x1.0ada93ac2688ep3 },
+ { 0x1.89cd6ead31b71p-2, 0x1.cd680d6a376d2p2 },
+ { 0x1.542176fe1c2b2p-2, 0x1.8ed9e84be9bacp2 },
+ { 0x1.25bd00bd97eddp-2, 0x1.58bc1beb8e117p2 },
+ { 0x1.fb491e02b7c15p-3, 0x1.29ecb15514182p2 },
+ { 0x1.b5fcd30c7e1f6p-3, 0x1.017069c4b54cfp2 },
+ { 0x1.7a1c33cc1922bp-3, 0x1.bcdb33f7b88f9p1 },
+ { 0x1.46610483f2395p-3, 0x1.804f671a7a35cp1 },
+ { 0x1.19b0f23241b88p-3, 0x1.4bf6ca87a4707p1 },
+ { 0x1.e62f62b4555dcp-4, 0x1.1eb67d8a75351p1 },
+ { 0x1.a383ca9f98a0fp-4, 0x1.ef3318a5788dep0 },
+ { 0x1.69f16aeb3677p-4, 0x1.ab97c2106c4d2p0 },
+ { 0x1.383bf2b37a037p-4, 0x1.712bc1550fb6ap0 },
+ { 0x1.0d51cf5a16254p-4, 0x1.3eb13a24821e2p0 },
+ { 0x1.d08cdac87dce6p-5, 0x1.131510c1da6adp0 },
+ { 0x1.909a7c3ac6f99p-5, 0x1.dad26311e9efp-1 },
+ { 0x1.596acfa0bcc8fp-5, 0x1.99bf36c7ef068p-1 },
+ { 0x1.29cc13bfd53ap-5, 0x1.618c26c1169a6p-1 },
+ { 0x1.00b60212cf113p-5, 0x1.3104d5f799552p-1 },
+ { 0x1.ba886ae6e40ep-6, 0x1.071e8b6003b16p-1 },
+ { 0x1.7d62a282a4851p-6, 0x1.c5e5338097f6bp-2 },
+ { 0x1.48a59e9cb1eb1p-6, 0x1.87730de08c821p-2 },
+ { 0x1.1b2abc895a771p-6, 0x1.518db221cf8bap-2 },
+ { 0x1.e7e6f4c33ededp-7, 0x1.230ae74a714aap-2 },
+ { 0x1.a4480db60fe17p-7, 0x1.f5d1c58fdc6acp-3 },
+ { 0x1.69fd19aacb90ap-7, 0x1.b091a88a72f08p-3 },
+ { 0x1.37be42e1159e7p-7, 0x1.74d459ba38afep-3 },
+ { 0x1.0c707db025298p-7, 0x1.414d114bdcde1p-3 },
+ { 0x1.ce3ee3757dbe5p-8, 0x1.14dc49cbc0c3p-3 },
+ { 0x1.8df06bfb34f6dp-8, 0x1.dd13408401cdcp-4 },
+ { 0x1.568986affafc5p-8, 0x1.9afd0eca1593dp-4 },
+ { 0x1.26d009f5af049p-8, 0x1.6203633a6814ap-4 },
+ { 0x1.fb69c5d6b524ep-9, 0x1.30e632b0008c9p-4 },
+ { 0x1.b49c67cd1611fp-9, 0x1.069124dc6eaefp-4 },
+ { 0x1.77a47ec4e9fa1p-9, 0x1.c42b48d5cfe42p-5 },
+ { 0x1.43260788f0a1fp-9, 0x1.854b792c33d4ap-5 },
+ { 0x1.15f4e018a09eep-9, 0x1.4f1f511f7b2d7p-5 },
+ { 0x1.de1c72f739a49p-10, 0x1.2073f996519cp-5 },
+ { 0x1.9b25dc6d6642ep-10, 0x1.f08155c194aadp-6 },
+ { 0x1.61853cc8eddacp-10, 0x1.ab41e011814e5p-6 },
+ { 0x1.2feeed430b87bp-10, 0x1.6f9f62ec4193ap-6 },
+ { 0x1.05451535e8102p-10, 0x1.3c45d7f9e2fbp-6 },
+ { 0x1.c122bcbda7f8ep-11, 0x1.100ffa10ff0f3p-6 },
+ { 0x1.81ff0b26f3b6ap-11, 0x1.d401bee3a7787p-7 },
+ { 0x1.4bb153d2d0728p-11, 0x1.927ce5fbbe352p-7 },
+ { 0x1.1cfe80beb05a4p-11, 0x1.5a195c6e2a08ep-7 },
+ { 0x1.e9ae566e02486p-12, 0x1.2992f3c7d2ce7p-7 },
+ { 0x1.a4a3297375461p-12, 0x1.ffa47aef63bd2p-8 },
+ { 0x1.6948e77b6c537p-12, 0x1.b7ccca35ce88ep-8 },
+ { 0x1.3644eed5b1126p-12, 0x1.79ffc3cd6bc92p-8 },
+ { 0x1.0a6cd27d913d7p-12, 0x1.44d7c3dca9cc8p-8 },
+ { 0x1.c97f5c053e775p-13, 0x1.1720abf01aa9bp-8 },
+ { 0x1.88c0c973b68fcp-13, 0x1.dfa22008cf2c8p-9 },
+ { 0x1.512157ee1d8bep-13, 0x1.9c08a63df00dcp-9 },
+ { 0x1.215988e86b086p-13, 0x1.61eb258af5a93p-9 },
+ { 0x1.f09f2b684fb31p-14, 0x1.2ff68a28f7dc4p-9 },
+ { 0x1.aa222a98ba953p-14, 0x1.0506e21782262p-9 },
+ { 0x1.6d9b06046eb66p-14, 0x1.c041afe3a1ad2p-10 },
+ { 0x1.39a30e3030664p-14, 0x1.80d8271e40929p-10 },
+ { 0x1.0d05cd2b64652p-14, 0x1.4a5cc1e67b046p-10 },
+ { 0x1.cd740d2318d4dp-15, 0x1.1b8f04bdfa1bfp-10 },
+ { 0x1.8bb7603d9828p-15, 0x1.e6b65816f0ff1p-11 },
+ { 0x1.534d810db5377p-15, 0x1.a1a7ec86c94fbp-11 },
+ { 0x1.22e56de90dc1ap-15, 0x1.665a9398034f1p-11 },
+ { 0x1.f2bb06a7069e2p-16, 0x1.336f30c8d3345p-11 },
+ { 0x1.ab79b6edb04e1p-16, 0x1.07b7cbf13abf4p-11 },
+ { 0x1.6e5b33b150249p-16, 0x1.c461717dacbd8p-12 },
+ { 0x1.39f005226a7dbp-16, 0x1.83f56253c12f1p-12 },
+ { 0x1.0cfc8192e69bdp-16, 0x1.4cab82baddd6cp-12 },
+ { 0x1.cce310b024fd4p-17, 0x1.1d39d04e50424p-12 },
+ { 0x1.8acc81455f971p-17, 0x1.e9094beff3587p-13 },
+ { 0x1.522570529739fp-17, 0x1.a3308036822dbp-13 },
+ { 0x1.219685023e1bep-17, 0x1.67464f8a36affp-13 },
+ { 0x1.eff1f945e7f7bp-18, 0x1.33e2c9c277148p-13 },
+ { 0x1.a89fa515a2b44p-18, 0x1.07d0b7bb52fc7p-13 },
+ { 0x1.6b83bb4ee4348p-18, 0x1.c40cfbd11fd1p-14 },
+ { 0x1.372982e2fde1dp-18, 0x1.833ffa698fa8bp-14 },
+ { 0x1.0a51297b20ab7p-18, 0x1.4bb29dadf3acp-14 },
+ { 0x1.c7d093fb7e463p-19, 0x1.1c147957723bdp-14 },
+ { 0x1.8607006600009p-19, 0x1.e6896f5762306p-15 },
+ { 0x1.4db1c7b733812p-19, 0x1.a096cc3260668p-15 },
+ { 0x1.1d76959a6b622p-19, 0x1.64a7647d3f88ap-15 },
+ { 0x1.e858d8b3acc8p-20, 0x1.314deba7bab37p-15 },
+ { 0x1.a1a94b14e3d7fp-20, 0x1.0550e92636252p-15 },
+ { 0x1.6529df3d1cf1cp-20, 0x1.bf46cd0f972c3p-16 },
+ { 0x1.316449a955429p-20, 0x1.7ebd49fbb30eep-16 },
+ { 0x1.0517b9e1f89dep-20, 0x1.47796af08285bp-16 },
+ { 0x1.be627dddb55d7p-21, 0x1.1827a73755ec7p-16 },
+ { 0x1.7d8a7f2a8a2dp-21, 0x1.df49a10ccc568p-17 },
+ { 0x1.4613bf000c71dp-21, 0x1.99ee7037b652bp-17 },
+ { 0x1.16a45fcb7b882p-21, 0x1.5e9197017791dp-17 },
+ { 0x1.dc283bcbe780fp-22, 0x1.2bc40c543e36bp-17 },
+ { 0x1.96ca751cac37fp-22, 0x1.004b34180a4a9p-17 },
+ { 0x1.5b7cd13179ddep-22, 0x1.b632d58444fadp-18 },
+ { 0x1.28cb2cb8b4015p-22, 0x1.768f3e13d3bdcp-18 },
+ { 0x1.faedd62dabd96p-23, 0x1.401fa7657909ep-18 },
+ { 0x1.b0de982dbf111p-23, 0x1.1190d162109abp-18 },
+ { 0x1.7195b2becea19p-23, 0x1.d3803e22a78e4p-19 },
+ { 0x1.3b8387eea3f9dp-23, 0x1.8f694ad8ac632p-19 },
+ { 0x1.0d521f8291cd6p-23, 0x1.55326d6aac6fap-19 },
+ { 0x1.cbb9be9cbac1ep-24, 0x1.236e8d3a9e0e7p-19 },
+ { 0x1.8852e54d26542p-24, 0x1.f1ca221c0b98bp-20 },
+ { 0x1.4ec36b8fdf428p-24, 0x1.a914b62872bc3p-20 },
+ { 0x1.1d9d0055d11dp-24, 0x1.6af2ae42db58p-20 },
+ { 0x1.e74cb7ebdea0ap-25, 0x1.35dbe86ed95c7p-20 },
+ { 0x1.9fa735b03463ap-25, 0x1.0880cfe68041ep-20 },
+ { 0x1.627f6220ca6a9p-25, 0x1.c3847cbf78a3bp-21 },
+ { 0x1.2e4d9d8b5b22fp-25, 0x1.81550cf271bfdp-21 },
+ { 0x1.01c325e8bb3cp-25, 0x1.48cefa0aac509p-21 },
+ { 0x1.b783bc148fcefp-26, 0x1.188ab9ce5fdddp-21 },
+ { 0x1.76aa8791eba33p-26, 0x1.dea9996bf1c0fp-22 },
+ { 0x1.3f58d390caeecp-26, 0x1.984c7bb9c53ffp-22 },
+ { 0x1.10299f255a2cap-26, 0x1.5c3c6ce5f2f75p-22 },
+ { 0x1.cfd7e08a13b2p-27, 0x1.28f8faa7c3202p-22 },
+ { 0x1.8b368e0429dacp-27, 0x1.fa7304087353p-23 },
+ { 0x1.50b2501707be6p-27, 0x1.afca3c464e1d5p-23 },
+ { 0x1.1ecf2c897b782p-27, 0x1.701780b38d71ap-23 },
+ { 0x1.e891642306feep-28, 0x1.39c08dab159ep-23 },
+ { 0x1.a013c6709bdd5p-28, 0x1.0b66dac93672bp-23 },
+ { 0x1.624c9a2f2f8fcp-28, 0x1.c7bde43ebd873p-24 },
+ { 0x1.2da83d59392f5p-28, 0x1.84520ec5eb55ap-24 },
+ { 0x1.00ce3767b77a8p-28, 0x1.4ad54236cf6b4p-24 },
+ { 0x1.b5312d520a3f4p-29, 0x1.19d258cf47194p-24 },
+ { 0x1.74191dcab90bcp-29, 0x1.e015665e4efbdp-25 },
+ { 0x1.3ca855a30dad5p-29, 0x1.98dc92b26aeap-25 },
+ { 0x1.0d71d1069e44fp-29, 0x1.5c29c3e79c162p-25 },
+ { 0x1.ca7c7b61a5357p-30, 0x1.28708aaed4d7p-25 },
+ { 0x1.86083aaabaf73p-30, 0x1.f8bd2046619b5p-26 },
+ { 0x1.4bc21b880f9dep-30, 0x1.ada636f165959p-26 },
+ { 0x1.1a28183b0e32p-30, 0x1.6dafa60f704a1p-26 },
+ { 0x1.dfe23a6ad4f8bp-31, 0x1.37351629c53cp-26 },
+ { 0x1.980956bea8ccp-31, 0x1.08cff68f5874cp-26 },
+ { 0x1.5ae767663002ep-31, 0x1.c29ce58c1fc1p-27 },
+ { 0x1.26e4fd1165b76p-31, 0x1.7f5772973d16cp-27 },
+ { 0x1.f54dde2ba8f56p-32, 0x1.4612c5674eed9p-27 },
+ { 0x1.aa0af3e698b26p-32, 0x1.15539e864d70fp-27 },
+ { 0x1.6a0956d7d1b63p-32, 0x1.d7ad5cdc3741ep-28 },
+ { 0x1.339bd6e517d44p-32, 0x1.9110bc4b50f8cp-28 },
+ { 0x1.0554f0943ba8cp-32, 0x1.54fb970dbe54ep-28 },
+ { 0x1.bbfac9007ec07p-33, 0x1.21dd98bc7de87p-28 },
+ { 0x1.791862715d02fp-33, 0x1.ecc34851c9763p-29 },
+ { 0x1.403f77382e654p-33, 0x1.a2ca34863bfcbp-29 },
+ { 0x1.0feff2a4fc49p-33, 0x1.63e0d12d4d288p-29 },
+ { 0x1.cdc5de1ae8c09p-34, 0x1.2e615f0543e41p-29 },
+ { 0x1.8804761a993c4p-34, 0x1.00e4ae934cb56p-29 },
+ { 0x1.4cc23eb3b5ffap-34, 0x1.b471c42165f4ap-30 },
+ { 0x1.1a6c6c06ea18bp-34, 0x1.72b316e47cc93p-30 },
+ { 0x1.df58ab9ae4fcbp-35, 0x1.3ad1e7143aa75p-30 },
+ { 0x1.96bd0bd6c9a31p-35, 0x1.0b54bd6a9e23fp-30 },
+ { 0x1.59163428fb3a6p-35, 0x1.c5f4a785a88d1p-31 },
+ { 0x1.24be8d0138113p-35, 0x1.8162809b8dff6p-31 },
+ { 0x1.f09f3c1618809p-36, 0x1.4721b76389525p-31 },
+ { 0x1.a53148c3fc482p-36, 0x1.15a6678e0082cp-31 },
+ { 0x1.652d1d62b45e1p-36, 0x1.d73f8da963966p-32 },
+ { 0x1.2eda549c16ee8p-36, 0x1.8fdeb6a9e8ebcp-32 },
+ { 0x1.00c2a84aed164p-36, 0x1.5342fe16e83a5p-32 },
+ { 0x1.b3501c0fdbbcfp-37, 0x1.1fcdfea216d16p-32 },
+ { 0x1.70f8998ccf075p-37, 0x1.e83eb9bce31c4p-33 },
+ { 0x1.38b3a7222dd33p-37, 0x1.9e170e2dbff8cp-33 },
+ { 0x1.08fb437656229p-37, 0x1.5f27a9aa5f66p-33 },
+ { 0x1.c1085f96d9feep-38, 0x1.29bfa42bc7b76p-33 },
+ { 0x1.7c6a3cf1c9dcfp-38, 0x1.f8de2739c95a9p-34 },
+ { 0x1.423e65b2a3a8cp-38, 0x1.abfaa7d4233fap-34 },
+ { 0x1.10ef40de709bcp-38, 0x1.6ac1833360c58p-34 },
+ { 0x1.ce48f9d9e5928p-39, 0x1.336f5ff042b88p-34 },
+ { 0x1.8773adc5703cep-39, 0x1.0484d7ff5f6bdp-34 },
+ { 0x1.4b6e86a5aa9d8p-39, 0x1.b978904649f57p-35 },
+ { 0x1.189488e2e9743p-39, 0x1.760249f31a968p-35 },
+ { 0x1.db0100ef385d3p-40, 0x1.3cd13761f1731p-35 },
+ { 0x1.9206c1ae9fb29p-40, 0x1.0c569a0b1627cp-35 },
+ { 0x1.54382e8081943p-40, 0x1.c67fe1e83e91p-36 },
+ { 0x1.1fe13002859cap-40, 0x1.80dbcff1d72cfp-36 },
+ { 0x1.e71fde0c5e218p-41, 0x1.45d945dc4844dp-36 },
+ { 0x1.9c159bbc9900ap-41, 0x1.13da615eb6c5fp-36 },
+ { 0x1.5c8fc931c6d94p-41, 0x1.d2ffe78d87996p-37 },
+ { 0x1.26cb8c1920344p-41, 0x1.8b4017551e03bp-37 },
+ { 0x1.f295714275bc3p-42, 0x1.4e7bd56b77338p-37 },
+ { 0x1.a592ca70605e5p-42, 0x1.1b06621cfb60ep-37 },
+ { 0x1.646a234bddd88p-42, 0x1.dee83fc205fc8p-38 },
+ { 0x1.2d4a498c21371p-42, 0x1.9521701d324dap-38 },
+ { 0x1.fd5235020e009p-43, 0x1.56ad77d8efe38p-38 },
+ { 0x1.ae71657ff542ep-43, 0x1.21d11201bfbcfp-38 },
+ { 0x1.6bbc82f12468ap-43, 0x1.ea290040397f4p-39 },
+ { 0x1.3354802504d9ep-43, 0x1.9e7295f29cf91p-39 },
+ { 0x1.03a3b07cf84bp-43, 0x1.5e631fb2a96dbp-39 },
+ { 0x1.b6a52af7c7202p-44, 0x1.28313d62cbf4fp-39 },
+ { 0x1.727cc024d462ap-44, 0x1.f4b2d92a8da6ap-40 },
+ { 0x1.38e1c7590edafp-44, 0x1.a726cda9c5fc4p-40 },
+ { 0x1.083385f1e344cp-44, 0x1.6592390114765p-40 },
+ { 0x1.be229b5ed10ebp-45, 0x1.2e1e1bdc1cff3p-40 },
+ { 0x1.78a15c33bf0d1p-45, 0x1.fe77379b5869ap-41 },
+ { 0x1.3dea49bdca04dp-45, 0x1.af3202215009fp-41 },
+ { 0x1.0c5225e967ce3p-45, 0x1.6c30c15ee186bp-41 },
+ { 0x1.c4df14833b32ep-46, 0x1.338f646703f05p-41 },
+ { 0x1.7e2197e99732ep-46, 0x1.03b4338f71d3bp-41 },
+ { 0x1.4266d76b7e9efp-46, 0x1.b688e02001605p-42 },
+ { 0x1.0ff9aa4df55cbp-46, 0x1.72355f261c90fp-42 },
+ { 0x1.cad0ea9847218p-47, 0x1.387d609c076c8p-42 },
+ { 0x1.82f5884a3c4ffp-47, 0x1.07bcd8d61f54dp-42 },
+ { 0x1.4650f71159187p-47, 0x1.bd20f0d88c869p-43 },
+ { 0x1.1324c9f973607p-47, 0x1.77977767b819cp-43 },
+ { 0x1.cfef7f529f1bfp-48, 0x1.3ce0fee10ae91p-43 },
+ { 0x1.8716298a66d68p-48, 0x1.0b4fbeda58aa9p-43 },
+ { 0x1.49a2f582864b8p-48, 0x1.c2f0b2bc85943p-44 },
+ { 0x1.15cee56fb8f8p-48, 0x1.7c4f426570458p-44 },
+ { 0x1.d43356b5d1bc3p-49, 0x1.40b3e347db73ap-44 },
+ { 0x1.8a7d700826ce3p-49, 0x1.0e67b4f33d066p-44 },
+ { 0x1.4c57f38808af9p-49, 0x1.c7efb04c36011p-45 },
+ { 0x1.17f41219f6e6ep-49, 0x1.8055de49eb405p-45 },
+ { 0x1.d796294cc09e7p-50, 0x1.43f076e4dac86p-45 },
+ { 0x1.8d265709c8b81p-50, 0x1.11003322f9f2ap-45 },
+ { 0x1.4e6bf1c869176p-50, 0x1.cc169496c493bp-46 },
+ { 0x1.199123dce7f7cp-50, 0x1.83a55fe01c77fp-46 },
+ { 0x1.da12f38ef6065p-51, 0x1.4691f56a0b9d1p-46 },
+ { 0x1.8f0ced10d0db4p-51, 0x1.131565242338p-46 },
+ { 0x1.4fdbda9c9106cp-51, 0x1.cf5f3d25346p-47 },
+ { 0x1.1aa3b4e8f3caap-51, 0x1.8638e1112031dp-47 },
+ { 0x1.dba6023e1257ap-52, 0x1.489478d82c425p-47 },
+ { 0x1.902e5d96b5dc7p-52, 0x1.14a433d21a4e2p-47 },
+ { 0x1.50a589affacc9p-52, 0x1.d1c4c912f9acbp-48 },
+ { 0x1.1b2a2ba958505p-52, 0x1.880c8cf6ecf16p-48 },
+ { 0x1.dc4cfb90a7ce5p-53, 0x1.49f5031dc194p-48 },
+ { 0x1.9088f811b7254p-53, 0x1.15aa4ccc2f79bp-48 },
+ { 0x1.50c7d151d73dp-53, 0x1.d343a5202c7c4p-49 },
+ { 0x1.1b23bebdcda6dp-53, 0x1.891da95a3a6f5p-49 },
+ { 0x1.dc06e50abd949p-54, 0x1.4ab18582d9df2p-49 },
+ { 0x1.901c34297491p-54, 0x1.1626283914e64p-49 },
+ { 0x1.50427d64b1c7dp-54, 0x1.d3d994938f3adp-50 },
+ { 0x1.1a9076f0d2e24p-54, 0x1.896a9d7ab89b1p-50 },
+ { 0x1.dad425efa38efp-55, 0x1.4ac8e5c7c8723p-50 },
+ { 0x1.8ee8b30ca2586p-55, 0x1.16170c969f828p-50 },
+ { 0x1.4f1653e256f41p-55, 0x1.d385b6cd88b32p-51 },
+ { 0x1.19712f23cae3dp-55, 0x1.88f2f609fe4d3p-51 },
+ { 0x1.d8b686448b5afp-56, 0x1.4a3b00e506616p-51 },
+ { 0x1.8cf03de32b406p-56, 0x1.157d10888e2f3p-51 },
+ { 0x1.4d4512f22a65dp-56, 0x1.d2488978a2f74p-52 },
+ { 0x1.17c7923127a39p-56, 0x1.87b7664b4e00cp-52 },
+ { 0x1.d5b12a674c804p-57, 0x1.4908ab62a09acp-52 },
+ { 0x1.8a35c1621f2ccp-57, 0x1.14591aa0080cap-52 },
+ { 0x1.4ad16c988b007p-57, 0x1.d023e74fea7e1p-53 },
+ { 0x1.159616cbf8a0cp-57, 0x1.85b9c65443c51p-53 },
+ { 0x1.d1c88b489c5c3p-58, 0x1.4733af4601fe1p-53 },
+ { 0x1.86bd4690c0845p-58, 0x1.12acdf1c9738cp-53 },
+ { 0x1.47bf000e37ae9p-58, 0x1.cd1b037f7490bp-54 },
+ { 0x1.12dff96b26d81p-58, 0x1.82fd0e7486194p-54 },
+ { 0x1.cd026b64a0ca8p-59, 0x1.44bec79d5416cp-54 },
+ { 0x1.828be8d7b2e74p-59, 0x1.107adbae7661dp-54 },
+ { 0x1.441250d6b8cc7p-59, 0x1.c93261af2cd0dp-55 },
+ { 0x1.0fa934555eb5ap-59, 0x1.7f854fd47e7d3p-55 },
+ { 0x1.c765c89feb632p-60, 0x1.41ad99b7fc9ebp-55 },
+ { 0x1.7da7c97c8ea4bp-60, 0x1.0dc65148f57fcp-55 },
+ { 0x1.3fd0bbb47d67cp-60, 0x1.c46fcad39a071p-56 },
+ { 0x1.0bf675e9015a3p-60, 0x1.7b57aa64c1e42p-56 },
+ { 0x1.c0facb396944ap-61, 0x1.3e04ac23c3f11p-56 },
+ { 0x1.781800b4c5862p-61, 0x1.0a933c1a65e31p-56 },
+ { 0x1.3b0069a07f02dp-61, 0x1.beda3eeb5f0a2p-57 },
+ { 0x1.07cd15415698ap-61, 0x1.767a404101f5ap-57 },
+ { 0x1.b9cab20b7b4acp-62, 0x1.39c95b8dcd835p-57 },
+ { 0x1.71e48c82b190ap-62, 0x1.06e649c54a11dp-57 },
+ { 0x1.35a840f1bb9bfp-62, 0x1.b879e3daa485dp-58 },
+ { 0x1.0333055f872d1p-62, 0x1.70f426b1f5c67p-58 },
+ { 0x1.b1dfbc5f13465p-63, 0x1.3501cdad9df5bp-58 },
+ { 0x1.6b163d96b3dd9p-63, 0x1.02c4cdfc5722cp-58 },
+ { 0x1.2fcfd4e6913cap-63, 0x1.b157f19f267eap-59 },
+ { 0x1.fc5d8e0519af3p-64, 0x1.6acd55017e4e2p-59 },
+ { 0x1.a945119b38a65p-64, 0x1.2fb4e266d3e9fp-59 },
+ { 0x1.63b6a2745bde1p-64, 0x1.fc696b5025168p-60 },
+ { 0x1.297f53c6e927fp-64, 0x1.a97e9c202c067p-60 },
+ { 0x1.f18eb2ba6357fp-65, 0x1.640e915b3f3eap-60 },
+ { 0x1.a006a7219c6a4p-65, 0x1.29ea2353deb28p-60 },
+ { 0x1.5bcff1208eb99p-65, 0x1.f278f182d5ccep-61 },
+ { 0x1.22bf73da1838dp-65, 0x1.a0f8fae51588p-61 },
+ { 0x1.e60853b8b4b65p-66, 0x1.5cc15bf9dbbbbp-61 },
+ { 0x1.963124add21cp-66, 0x1.23a9b1f0c9515p-61 },
+ { 0x1.536cefa1810b4p-66, 0x1.e7c6162103b4ep-62 },
+ { 0x1.1b995f6e584afp-66, 0x1.97d2ef035140ap-62 },
+ { 0x1.d9da06644bc9dp-67, 0x1.54efd8e5e8a15p-62 },
+ { 0x1.8bd1c79049ec2p-67, 0x1.1cfc34a10ee47p-62 },
+ { 0x1.4a98db9bff0e8p-67, 0x1.dc5f9803d5324p-63 },
+ { 0x1.1416a031bacf2p-67, 0x1.8e1907994f8d3p-63 },
+ { 0x1.cd13f7b7c3414p-68, 0x1.4ca4b88f6234cp-63 },
+ { 0x1.80f645203dff7p-68, 0x1.15eac2ce52257p-63 },
+ { 0x1.415f515af2672p-68, 0x1.d054eb8db2ad5p-64 },
+ { 0x1.0c410a1d6b3cap-68, 0x1.83d8652f7235cp-64 },
+ { 0x1.bfc6c8b2d1c95p-69, 0x1.43eb1f8cfdcf1p-64 },
+ { 0x1.75acacc068ebep-69, 0x1.0e7ed05fb3af3p-64 },
+ { 0x1.37cc328e513e5p-69, 0x1.c3b617ec3cfd6p-65 },
+ { 0x1.0422a6340a512p-69, 0x1.791e9c59e2b42p-65 },
+ { 0x1.b2036a988beadp-70, 0x1.3ace8dce03fbdp-65 },
+ { 0x1.6a0349d192d1ap-70, 0x1.06c218ca5f25ap-65 },
+ { 0x1.2deb8d0dae905p-70, 0x1.b69393c895b87p-66 },
+ { 0x1.f78b3aa5bebbep-71, 0x1.6df997f6bab1bp-66 },
+ { 0x1.a3dafb67a96cfp-71, 0x1.315ac58b7d6b7p-66 },
+ { 0x1.5e0885ebd9cc3p-71, 0x1.fd7d13f78002dp-67 },
+ { 0x1.23c981e88b022p-71, 0x1.a8fe21d205ebp-67 },
+ { 0x1.e66846a73c925p-72, 0x1.62777b62fde0cp-67 },
+ { 0x1.955ea2f392221p-72, 0x1.279bb2446baf4p-67 },
+ { 0x1.51cacbb42476ep-72, 0x1.ecfc5eb955129p-68 },
+ { 0x1.19722d0b598a4p-72, 0x1.9b06ad8cbcafbp-68 },
+ { 0x1.d4f0c5733dbc9p-73, 0x1.56a684fe99fcap-68 },
+ { 0x1.869f70ffc1fcbp-73, 0x1.1d9d500e92622p-68 },
+ { 0x1.45586a9e82938p-73, 0x1.dc163a555fefbp-69 },
+ { 0x1.0ef18dbc017ffp-73, 0x1.8cbe28ca7c426p-69 },
+ { 0x1.c338d2435fb4bp-74, 0x1.4a94f1540c9eap-69 },
+ { 0x1.77ae3cb88b469p-74, 0x1.136b93820fc76p-69 },
+ { 0x1.38bf7be87e681p-74, 0x1.cadeb8c3bba05p-70 },
+ { 0x1.0453702b9a5bbp-74, 0x1.7e356a2db5e15p-70 },
+ { 0x1.b154294e891dap-75, 0x1.3e50df3387f95p-70 },
+ { 0x1.689b85dc875b1p-75, 0x1.09125281c373ap-70 },
+ { 0x1.2c0dc90fab5bap-75, 0x1.b969aedac7779p-71 },
+ { 0x1.f346b0aa94647p-76, 0x1.6f7d0d10edd84p-71 },
+ { 0x1.9f5604d9610bp-76, 0x1.31e8350b95daep-71 },
+ { 0x1.597757e14e4e8p-76, 0x1.fd3a5c3ac18bbp-72 },
+ { 0x1.1f50b401397f7p-76, 0x1.a7ca8fa24018p-72 },
+ { 0x1.ddd8dcb76e388p-77, 0x1.60a5532471804p-72 },
+ { 0x1.8d50fcdd2a012p-77, 0x1.256887c26e498p-72 },
+ { 0x1.4a512f5483d32p-77, 0x1.e82efb884fa7p-73 },
+ { 0x1.129521372a709p-77, 0x1.961449f1f5f93p-73 },
+ { 0x1.c872d91eff745p-78, 0x1.51be080b9d49dp-73 },
+ { 0x1.7b56e9895b756p-78, 0x1.18df034ba2c47p-73 },
+ { 0x1.3b37e1b01d1bdp-78, 0x1.d31877f1753bap-74 },
+ { 0x1.05e763ef1c6e1p-78, 0x1.845928aac023dp-74 },
+ { 0x1.b3291e83a6ddap-79, 0x1.42d6673958cf7p-74 },
+ { 0x1.6978c8d7d61b8p-79, 0x1.0c58552d896bdp-74 },
+ { 0x1.2c3987ce2b431p-79, 0x1.be0be95f0126ep-75 },
+ { 0x1.f2a6593b4ee39p-80, 0x1.72aab5cc51918p-75 },
+ { 0x1.9e0f0cfd57ab4p-80, 0x1.33fd04413c4e8p-75 },
+ { 0x1.57c6a75ebbd36p-80, 0x1.ffc132424c87ap-76 },
+ { 0x1.1d636b1da2b46p-80, 0x1.a91d6af35687bp-76 },
+ { 0x1.d9c6f3705063cp-81, 0x1.6119a09e14fe5p-76 },
+ { 0x1.8936d384f421ap-81, 0x1.253fb5c838ba6p-76 },
+ { 0x1.464f8c7e074fcp-81, 0x1.e7068fdcaeb4ep-77 },
+ { 0x1.0ec1f5aebc21fp-81, 0x1.945fff2eb1b17p-77 },
+ { 0x1.c14515cb6f8fp-82, 0x1.4fb5a7146299ap-77 },
+ { 0x1.74b15b6eeceb1p-82, 0x1.16ab8334ccb0ap-77 },
+ { 0x1.352169fa33216p-82, 0x1.ce965139dad89p-78 },
+ { 0x1.0060a522d6818p-82, 0x1.7fe578074e0c8p-78 },
+ { 0x1.a933ad3e37ea3p-83, 0x1.3e8d828e807b4p-78 },
+ { 0x1.608e37fe916b7p-83, 0x1.084c9533fea9dp-78 },
+ { 0x1.24490f08ca22dp-83, 0x1.b68488148e38cp-79 },
+ { 0x1.e4940102c0a26p-84, 0x1.6bbe630bdc58cp-79 },
+ { 0x1.91a40479b1837p-84, 0x1.2daed7fd23569p-79 },
+ { 0x1.4cdb9a0d20ef7p-84, 0x1.f45c523b5ec4ep-80 },
+ { 0x1.13d21ec7ce7a5p-84, 0x1.9ee3b5d440d2p-80 },
+ { 0x1.c90f21d2d475fp-85, 0x1.57f9f997e1f52p-80 },
+ { 0x1.7aa5b8d4b4359p-85, 0x1.1d262b74c69e4p-80 },
+ { 0x1.39a647b21bed6p-85, 0x1.d8b50e711660ap-81 },
+ { 0x1.03c70a0dadb1dp-85, 0x1.87c4bc616ed3dp-81 },
+ { 0x1.ae43ba1c85bb1p-86, 0x1.44a615135e868p-81 },
+ { 0x1.6446b3db12c58p-86, 0x1.0cfed72363bb7p-81 },
+ { 0x1.26f997cdc041dp-86, 0x1.bdb5f7a82d0f4p-82 },
+ { 0x1.e86218ea3e6acp-87, 0x1.7136d3b897e11p-82 },
+ { 0x1.9440cec9f5e3ap-87, 0x1.31cf2729ac24dp-82 },
+ { 0x1.4e93295651e9bp-87, 0x1.fa860b2bf75f8p-83 },
+ { 0x1.14df714b2cc27p-87, 0x1.a36fa64c5b19fp-83 },
+ { 0x1.ca3058fde005fp-88, 0x1.5b478418ed951p-83 },
+ { 0x1.7b135dc219792p-88, 0x1.1f8035d726d41p-83 },
+ { 0x1.3995999427ba7p-88, 0x1.dbf75e60682c2p-84 },
+ { 0x1.03604de581436p-88, 0x1.89f0afa1deecap-84 },
+ { 0x1.ad067d36fa2c8p-89, 0x1.4602a49df0a52p-84 },
+ { 0x1.62c6642f5d4b9p-89, 0x1.0dc2db21eaf21p-84 },
+ { 0x1.2556d7a42568ap-89, 0x1.be61355e30a98p-85 },
+ { 0x1.e5068065139bep-90, 0x1.7145a7dd1cf8cp-85 },
+ { 0x1.90efd5cd13c3p-90, 0x1.31725e0702649p-85 },
+ { 0x1.4b62e9374c452p-90, 0x1.f93e90900fd6bp-86 },
+ { 0x1.11de133cc6916p-90, 0x1.a1d0c10ff74dfp-86 },
+ { 0x1.c49bf95c5f745p-91, 0x1.597928f3e0c7p-86 },
+ { 0x1.75f56ab48bd89p-91, 0x1.1d9f316556fccp-86 },
+ { 0x1.34f00cbd8ea42p-91, 0x1.d8389849eaf01p-87 },
+ { 0x1.fe61cbe17950dp-92, 0x1.8650e1db268ebp-87 },
+ { 0x1.a589caf82618cp-92, 0x1.4293ddcb013c1p-87 },
+ { 0x1.5c1e107375834p-92, 0x1.0a90025fd130cp-87 },
+ { 0x1.1f7319c565581p-92, 0x1.b87eb911fc5efp-88 },
+ { 0x1.daa6c6af5c17fp-93, 0x1.6bea387f6b0ap-88 },
+ { 0x1.87d63120a742cp-93, 0x1.2c9c915a28ddap-88 },
+ { 0x1.436e80df031fp-93, 0x1.f094496a5e827p-89 },
+ { 0x1.0aef9bffa708dp-93, 0x1.9a19446f657ccp-89 },
+ { 0x1.b890579385cdcp-94, 0x1.52a33b4b8094cp-89 },
+ { 0x1.6b84ffdb5d885p-94, 0x1.179841589cdp-89 },
+ { 0x1.2be9773700384p-94, 0x1.cda2d93f291abp-90 },
+ { 0x1.eecef0206652cp-95, 0x1.7d0e0e7cac5bp-90 },
+ { 0x1.9821029662ccfp-95, 0x1.3a804f20fd2f4p-90 },
+ { 0x1.5097c74b3d08ep-95, 0x1.038a34010e13fp-90 },
+ { 0x1.158fcf12f6c8ep-95, 0x1.ac508371be502p-91 },
+ { 0x1.c9b60c296975dp-96, 0x1.61608ea10db83p-91 },
+ { 0x1.7958bc88e6006p-96, 0x1.2383e3bce375p-91 },
+ { 0x1.370dfa8e149d1p-96, 0x1.e0e820ef7463p-92 },
+ { 0x1.0060a594f59c7p-96, 0x1.8c9f67fa9c048p-92 },
+ { 0x1.a6925bee98d74p-97, 0x1.471203b047e85p-92 },
+ { 0x1.5c351b499632p-97, 0x1.0dae92b93887p-92 },
+ { 0x1.1ee518d278c58p-97, 0x1.bcabf2ba981bfp-93 },
+ { 0x1.d8b2f8b0b2924p-98, 0x1.6e8f25135d13fp-93 },
+ { 0x1.855f0a34582a6p-98, 0x1.2e219acb023aep-93 },
+ { 0x1.40b1881e58e3p-98, 0x1.f1fe817902cebp-94 },
+ { 0x1.0818d80634105p-98, 0x1.9a5d5233d8e13p-94 },
+ { 0x1.b2ecbb2e8d76cp-99, 0x1.521d0766f8b85p-94 },
+ { 0x1.6614d9da549fbp-99, 0x1.168c985c93c95p-94 },
+ { 0x1.26c7736a63e7fp-99, 0x1.cae6809d7d445p-95 },
+ { 0x1.e546a107b57d5p-100, 0x1.79f71edd3cb51p-95 },
+ { 0x1.8f64020effd9cp-100, 0x1.37443c37e4835p-95 },
+ { 0x1.48aa64075b15p-100, 0x1.004e8297ce819p-95 },
+ { 0x1.0e6e891142764p-100, 0x1.a60ceba01346ap-96 },
+ { 0x1.bcfa525d16889p-101, 0x1.5b71dfbe662f9p-96 },
+ { 0x1.6e0be1ed4e4ccp-101, 0x1.1dfe04c5b884ap-96 },
+ { 0x1.2d14568fa3103p-101, 0x1.d6c299b6b03dep-97 },
+ { 0x1.ef39c9c67da7p-102, 0x1.8366f8264d161p-97 },
+ { 0x1.973b86e9a718fp-102, 0x1.3ec401194be5fp-97 },
+ { 0x1.4ed55e6d4d5dfp-102, 0x1.0641ea45be131p-97 },
+ { 0x1.1345b1de4a541p-102, 0x1.af7b06dd7c2fap-98 },
+ { 0x1.c48e8cf8e20edp-103, 0x1.62e7924beab28p-98 },
+ { 0x1.73f6cd7db5a56p-103, 0x1.23e2123cac1dcp-98 },
+ { 0x1.31afb2e91937bp-103, 0x1.e00be39adba8fp-99 },
+ { 0x1.f6600b76754fcp-104, 0x1.8ab4ee2717624p-99 },
+ { 0x1.9cc2881babafp-104, 0x1.447fa5b4e25fep-99 },
+ { 0x1.5316d5b010b17p-104, 0x1.0abf02c055867p-99 },
+ { 0x1.1688993cfebe3p-104, 0x1.b67d9f35f4de8p-100 },
+ { 0x1.c98758b0a4ebap-105, 0x1.685ccfe1e2ab5p-100 },
+ { 0x1.77baf72da4868p-105, 0x1.281e65593d67p-100 },
+ { 0x1.3484c1e2418cbp-105, 0x1.e698bd1000fd2p-101 },
+ { 0x1.fa991c211034p-106, 0x1.8fc0326c87b11p-101 },
+ { 0x1.9fe006460b912p-106, 0x1.485d5ed97243ep-101 },
+ { 0x1.555b844a27ecdp-106, 0x1.0db191585c5a2p-101 },
+ { 0x1.182875c9f3984p-106, 0x1.baf50ff65044dp-102 },
+ { 0x1.cbce2423a80acp-107, 0x1.6bb8ebe73c54ap-102 },
+ { 0x1.794741d4d28c6p-107, 0x1.2a9fd1221e357p-102 },
+ { 0x1.3586a18110b0ep-107, 0x1.ea4b746dbeae3p-103 },
+ { 0x1.fbd1c1dcb3991p-108, 0x1.9271dfe5687e7p-103 },
+ { 0x1.a085cf5d6c87ep-108, 0x1.4a4b9ae2c857dp-103 },
+ { 0x1.559911f8b7812p-108, 0x1.0f0c2d578f06ap-103 },
+ { 0x1.181ddd71c27fbp-108, 0x1.bccd0201398bap-104 },
+ { 0x1.cb5889458c00ep-109, 0x1.6cec95dfef21ap-104 },
+ { 0x1.789499da6bff1p-109, 0x1.2b5ae7721763fp-104 },
+ { 0x1.34b0b5ddf82c6p-109, 0x1.eb1327842cc63p-105 },
+ { 0x1.fa04646636ebep-110, 0x1.92bda7bca05b7p-105 },
+ { 0x1.9eb0ea42d451ep-110, 0x1.4a4186866270ap-105 },
+ { 0x1.53ce6234f7db7p-110, 0x1.0ec8a57831ec5p-105 },
+ { 0x1.1668fdbb007d5p-110, 0x1.bbfd05e1b64f3p-106 },
+ { 0x1.c8289c5fd0187p-111, 0x1.6bf24d893426cp-106 },
+ { 0x1.75a62b0407aefp-111, 0x1.2a4c4fb42b862p-106 },
+ { 0x1.3206cc37b0e4ap-111, 0x1.e8ec43d273fbap-107 },
+ { 0x1.f53937c26236ep-112, 0x1.90a22ee0d506ep-107 },
+ { 0x1.9a69ad7793258p-112, 0x1.483f4fee6553cp-107 },
+ { 0x1.50039cbf56e41p-112, 0x1.0ce82f0139653p-107 },
+ { 0x1.13119a81ee824p-112, 0x1.b888d3fea2a71p-108 },
+ { 0x1.c24cdc6a6909bp-113, 0x1.68ce8cbb7eaebp-108 },
+ { 0x1.7089487e1182ep-113, 0x1.2778e05f0f826p-108 },
+ { 0x1.2d94fe2dcd5a4p-113, 0x1.e3e0a1bcb7b9p-109 },
+ { 0x1.ed85fe218f015p-114, 0x1.8c29185861611p-109 },
+ { 0x1.93c37ffa2be3p-114, 0x1.444e2559eb861p-109 },
+ { 0x1.4a49efe08b764p-114, 0x1.09735c9244f77p-109 },
+ { 0x1.0e26d33274acdp-114, 0x1.b28030446d467p-110 },
+ { 0x1.b9dfc560135fp-115, 0x1.638fa554a9791p-110 },
+ { 0x1.6955081ac80b2p-115, 0x1.22ed7a20d2031p-110 },
+ { 0x1.276f565251c73p-115, 0x1.dc07399fb9ebdp-111 },
+ { 0x1.e30d639687648p-116, 0x1.8566bbf3afdccp-111 },
+ { 0x1.8adc46e842374p-116, 0x1.3e7fef514c8f7p-111 },
+ { 0x1.42bb0eedd3fb2p-116, 0x1.0479dd0162987p-111 },
+ { 0x1.07beb0edff1b8p-116, 0x1.a9fe7272a642bp-112 },
+ { 0x1.af070915be74ep-117, 0x1.5c4d5495043b3p-112 },
+ { 0x1.602994f04daa5p-117, 0x1.1cbea64272b5fp-112 },
+ { 0x1.1fb139d7ad13p-117, 0x1.d18375dee0b86p-113 },
+ { 0x1.d5fdfa65dd70dp-118, 0x1.7c798c690caf6p-113 },
+ { 0x1.7fdb85ec65bd4p-118, 0x1.36eec953c25e3p-113 },
+ { 0x1.39787263ebbcap-118, 0x1.fc2409fc1812ep-114 },
+ { 0x1.ffeb0495cc103p-119, 0x1.9f29b80329143p-114 },
+ { 0x1.a1f276c1aeb71p-119, 0x1.5328106ecc8f8p-114 },
+ { 0x1.552f40714fe54p-119, 0x1.1507fc4d2f4bap-114 },
+ { 0x1.167c9d827337cp-119, 0x1.c484291d11ffp-115 },
+ { 0x1.c690e28b6a9bfp-120, 0x1.7189333483e3bp-115 },
+ { 0x1.72f13b97db104p-120, 0x1.2dbc3e931f24dp-115 },
+ { 0x1.2eaa616a9b21cp-120, 0x1.ecb050b3055ap-116 },
+ { 0x1.edda16b7edc87p-121, 0x1.9231c8255bcdbp-116 },
+ { 0x1.92da9c960076ap-121, 0x1.4848161f4e509p-116 },
+ { 0x1.48955baf138afp-121, 0x1.0beb55467080ap-116 },
+ { 0x1.0bf90e157d9dap-121, 0x1.b542338309321p-117 },
+ { 0x1.b5082a5d8de09p-122, 0x1.64c56b8fb3cecp-117 },
+ { 0x1.6454856772fedp-122, 0x1.231052b5f7dd6p-117 },
+ { 0x1.227ecea87251dp-122, 0x1.dadb937ed07ebp-118 },
+ { 0x1.d99724acabf71p-123, 0x1.834eb55a1d18ep-118 },
+ { 0x1.81ff31715569ap-123, 0x1.3bdc43dd8955fp-118 },
+ { 0x1.3a90e48619574p-123, 0x1.018fd4cd15479p-118 },
+ { 0x1.005296113b586p-123, 0x1.a3fee5158c03fp-119 },
+ { 0x1.a1acf8c750894p-124, 0x1.5664a8518a142p-119 },
+ { 0x1.54421936100c1p-124, 0x1.171860917e7c8p-119 },
+ { 0x1.152813e135602p-124, 0x1.c6f152728fb8fp-120 },
+ { 0x1.c375a4cba7b23p-125, 0x1.72bf4ab4db677p-120 },
+ { 0x1.6fa5568fa20f3p-125, 0x1.2e18c95c4bfb1p-120 },
+ { 0x1.2b5b13ef0805cp-125, 0x1.ec41a3d4cf576p-121 },
+ { 0x1.e77117811a7d2p-126, 0x1.91022d83bf8f5p-121 },
+ { 0x1.8ccd934db2cbp-126, 0x1.46a292659269ep-121 },
+ { 0x1.42faa33070d2ap-126, 0x1.0a05da41d6048p-121 },
+ { 0x1.06db98d7f6125p-126, 0x1.b14375f322de2p-122 },
+ { 0x1.abcdbdfcc9f7cp-127, 0x1.60c75486158bp-122 },
+ { 0x1.5c15c23fbb403p-127, 0x1.1f35bc35fb59fp-122 },
+ { 0x1.1b2fdb7cab6dfp-127, 0x1.d39954e0a9d3dp-123 },
+ { 0x1.ccb8a64624f6cp-128, 0x1.7c98ab66270f5p-123 },
+ { 0x1.76bb52e82b59ap-128, 0x1.35be6eb898758p-123 },
+ { 0x1.30c117f001ac3p-128, 0x1.f819edd38db9cp-124 },
+ { 0x1.efa0e49e3feccp-129, 0x1.9a2821242ebdp-124 },
+ { 0x1.92fa046d58d4ep-129, 0x1.4dadd528d6ea9p-124 },
+ { 0x1.479ae4e865feep-129, 0x1.0f6d9e092345cp-124 },
+ { 0x1.0a4c603089f16p-129, 0x1.b987187720ae4p-125 },
+ { 0x1.b0e03e96a5485p-130, 0x1.6711ad9310ce1p-125 },
+ { 0x1.5fc89a9e03199p-130, 0x1.23f97aea9f29fp-125 },
+ { 0x1.1dd90a3522c75p-130, 0x1.dac6b554960ffp-126 },
+ { 0x1.d07c0b8b30398p-131, 0x1.81f77dc55f2bdp-126 },
+ { 0x1.795540ea5dda7p-131, 0x1.39bb36d1a51dap-126 },
+ { 0x1.327f191dd6247p-131, 0x1.fdf7c425dfb89p-127 },
+ { 0x1.f1db008e061d6p-132, 0x1.9e6c7f42ee3ap-127 },
+ { 0x1.944b7c8850269p-132, 0x1.50bd38f4b0e14p-127 },
+ { 0x1.4846e1e475567p-132, 0x1.11954fcd9d596p-127 },
+ { 0x1.0a8512d6deebp-132, 0x1.bc7d8a23288e1p-128 },
+ { 0x1.b0b57b848dfd5p-133, 0x1.69099571fea27p-128 },
+ { 0x1.5f385601a1095p-133, 0x1.25378a982372p-128 },
+ { 0x1.1d0aee3f21eaep-133, 0x1.dc36feecfa2bap-129 },
+ { 0x1.ce9ce0f1b56b8p-134, 0x1.82a9fb7ad076bp-129 },
+ { 0x1.775af322a6fb6p-134, 0x1.39ea243c7bf71p-129 },
+ { 0x1.3084e2fb958e5p-134, 0x1.fda4af81b306ap-130 },
+ { 0x1.ee0aaff5c7275p-135, 0x1.9da7a2c5ab52cp-130 },
+ { 0x1.90b5b261712acp-135, 0x1.4fb44aa933f5cp-130 },
+ { 0x1.44f853ca3d2a1p-135, 0x1.1068e39733d5fp-130 },
+ { 0x1.07839b24e2329p-135, 0x1.ba0b385a9673fp-131 },
+ { 0x1.ab4ef712ea53cp-136, 0x1.669cb88b98bb4p-131 },
+ { 0x1.5a6a27edc2aafp-136, 0x1.22e458ff074e2p-131 },
+ { 0x1.18ccfb2383c0dp-136, 0x1.d7dccacf16bdfp-132 },
+ { 0x1.c72c7d427b5c7p-137, 0x1.7ea9a57d9c3fdp-132 },
+ { 0x1.70debd3477d7cp-137, 0x1.364981b4fcaccp-132 },
+ { 0x1.2ae4c8505c4dcp-137, 0x1.f723b60a4c45ap-133 },
+ { 0x1.e45347f37826dp-138, 0x1.97e0b5db827a8p-133 },
+ { 0x1.8859d9d834871p-138, 0x1.4a9cae44d02aap-133 },
+ { 0x1.3dcdd6f53a761p-138, 0x1.0bf347561e06fp-133 },
+ { 0x1.0163c7a1b8ce3p-138, 0x1.b246ea577dcd5p-134 },
+ { 0x1.a0de9e4d0326ap-139, 0x1.5fe1a8f2ffd47p-134 },
+ { 0x1.518a7407eb90ep-139, 0x1.1d15869af1a46p-134 },
+ { 0x1.1146574533e59p-139, 0x1.cde08f63664fdp-135 },
+ { 0x1.ba6f77161f191p-140, 0x1.761ba88bf6eedp-135 },
+ { 0x1.661c59f17faep-140, 0x1.2efafc89163c3p-135 },
+ { 0x1.21d2894bdd4c7p-140, 0x1.eab12c8aa7e5p-136 },
+ { 0x1.d50e0eba3e44dp-141, 0x1.8d4d432dee077p-136 },
+ { 0x1.7b84a5753cf1fp-141, 0x1.41a589d11cb19p-136 },
+ { 0x1.33091416396dbp-141, 0x1.045db9ec2ba81p-136 },
+ { 0x1.f0bb3ff173143p-142, 0x1.a57861242277fp-137 },
+ { 0x1.91c3cacc75aaap-142, 0x1.551681b8d361p-137 },
+ { 0x1.44ea256a84bbp-142, 0x1.140098b38820cp-137 },
+ { 0x1.06bb841410434p-142, 0x1.be9e2feb561ep-138 },
+ { 0x1.a8d98b0d5771p-143, 0x1.694e9fdcb7be5p-138 },
+ { 0x1.57755a2313bdfp-143, 0x1.24419d9ce37ffp-138 },
+ { 0x1.15a03d39bca43p-143, 0x1.d8bf1578b3aacp-139 },
+ { 0x1.c0c4e9f387792p-144, 0x1.7e4dfe2cee6a2p-139 },
+ { 0x1.6aa9b63079411p-144, 0x1.3520b0bf08a51p-139 },
+ { 0x1.250ad98a67e4fp-144, 0x1.f3daa3dd37f3ap-140 },
+ { 0x1.d9842421f4af1p-145, 0x1.94140b3abb78ep-140 },
+ { 0x1.7e859d0226582p-145, 0x1.469d2facc66f7p-140 },
+ { 0x1.34f9e5d4c96d3p-145, 0x1.07f7c6b04c092p-140 },
+ { 0x1.f314a5f5af6d7p-146, 0x1.aa9f80ec12e52p-141 },
+ { 0x1.9306ca687d568p-146, 0x1.58b5e63278412p-141 },
+ { 0x1.456b681315dafp-146, 0x1.167dcc97a0fd3p-141 },
+ { 0x1.06b98180e66fp-146, 0x1.c1ee5bab4ede7p-142 },
+ { 0x1.a82a4c036e3f3p-147, 0x1.6b69077bfc3c7p-142 },
+ { 0x1.565cda5d05a6ap-147, 0x1.257dcc5bc2717p-142 },
+ { 0x1.144d77262f022p-147, 0x1.d9fdd2296338fp-143 },
+ { 0x1.bdec7b50a66cp-148, 0x1.7eb427b4ddd71p-143 },
+ { 0x1.67cb265d8483ap-148, 0x1.34f5aee91217p-143 },
+ { 0x1.224399b226996p-148, 0x1.f2ca4dc8ff69fp-144 },
+ { 0x1.d448f86c23d12p-149, 0x1.92943634830d2p-144 },
+ { 0x1.79b2a15ae0faap-149, 0x1.44e2d8e947442p-144 },
+ { 0x1.3098d833c2dap-149, 0x1.0627b1e47c261p-144 },
+ { 0x1.eb3aa595948f3p-150, 0x1.a705784809825p-145 },
+ { 0x1.8c0f08dff4e68p-150, 0x1.554226cd542efp-145 },
+ { 0x1.3f49a8880f6adp-150, 0x1.1343e7a202e9p-145 },
+ { 0x1.015dd1c62a082p-150, 0x1.bc0384ab3550dp-146 },
+ { 0x1.9edb80143a705p-151, 0x1.660fe966c4e28p-146 },
+ { 0x1.4e52056f2dec4p-151, 0x1.20b6b60dae611p-146 },
+ { 0x1.0d62a769875ep-151, 0x1.d1893fc15ba16p-147 },
+ { 0x1.b2128dd015485p-152, 0x1.7747e31ddd25cp-147 },
+ { 0x1.5dad6d3a16694p-152, 0x1.2e7c997078049p-147 },
+ { 0x1.19a81ef58dfc6p-152, 0x1.e790d89e8e564p-148 },
+ { 0x1.c5ae1b79c4ee8p-153, 0x1.88e545d12ba57p-148 },
+ { 0x1.6d56e11abc8a7p-153, 0x1.3c919aea9787p-148 },
+ { 0x1.262a204b39df1p-153, 0x1.fe13c6f07b6aep-149 },
+ { 0x1.d9a774b67b183p-154, 0x1.9ae2b16a9550ap-149 },
+ { 0x1.7d48e51f6d6edp-154, 0x1.4af14f857334ep-149 },
+ { 0x1.32e43016e50e4p-154, 0x1.0a8564eab8ff5p-149 },
+ { 0x1.edf747f9f14f1p-155, 0x1.ad3a33350402p-150 },
+ { 0x1.8d7d80e14b91p-155, 0x1.5996d7e13f467p-150 },
+ { 0x1.3fd1708b687cbp-155, 0x1.1636f3d76858ap-150 },
+ { 0x1.014ad3fec9ec4p-155, 0x1.bfe545fce7a55p-151 },
+ { 0x1.9dee40ecc2982p-156, 0x1.687ce08618977p-151 },
+ { 0x1.4ceca2b27454p-156, 0x1.221a377d62eb4p-151 },
+ { 0x1.0bbd071377b87p-156, 0x1.d2dcd30499eb7p-152 },
+ { 0x1.ae9438e9a5c0bp-157, 0x1.779da2df7a30cp-152 },
+ { 0x1.5a30285652adp-157, 0x1.2e2a7c1fe1c5fp-152 },
+ { 0x1.164daef1c2b15p-157, 0x1.e61933d473856p-153 },
+ { 0x1.bf6806876a635p-158, 0x1.86f2e6e7e582ap-153 },
+ { 0x1.67960688424efp-158, 0x1.3a62b4892ce6ep-153 },
+ { 0x1.20f7f47f404a7p-158, 0x1.f99234ed0089ep-154 },
+ { 0x1.d061d530972c5p-159, 0x1.9676058974913p-154 },
+ { 0x1.7517e8c57f622p-159, 0x1.46bd7c1e28efp-154 },
+ { 0x1.2bb6ba79809edp-159, 0x1.069f8cb02119fp-154 },
+ { 0x1.e17962871247p-160, 0x1.a61febb6d574dp-155 },
+ { 0x1.82af24bbe81ddp-160, 0x1.53351984f5d61p-155 },
+ { 0x1.3684a09debb18p-160, 0x1.108b4faaa8971p-155 },
+ { 0x1.f2a603a977e7cp-161, 0x1.b5e91e3ee196dp-156 },
+ { 0x1.9054beadf5a51p-161, 0x1.5fc381e001854p-156 },
+ { 0x1.415c074fc9065p-161, 0x1.1a8782bc000bep-156 },
+ { 0x1.01ef55a0092e3p-161, 0x1.c5c9be5ba37d4p-157 },
+ { 0x1.9e016e74801cbp-162, 0x1.6c625c9dd5c05p-157 },
+ { 0x1.4c3713bae315dp-162, 0x1.248f08aa2a9f5p-157 },
+ { 0x1.0a8cf82738469p-162, 0x1.d5b98efc2e8d5p-158 },
+ { 0x1.abada51b7b47ep-163, 0x1.790b07dcc17ddp-158 },
+ { 0x1.570fb47030aa8p-163, 0x1.2e9c8b4dec3dep-158 },
+ { 0x1.13270ae279a57p-163, 0x1.e5affac730013p-159 },
+ { 0x1.b951931589ad6p-164, 0x1.85b69d604d483p-159 },
+ { 0x1.61dfa678e3296p-164, 0x1.38aa7fa8655e3p-159 },
+ { 0x1.1bb88966006c4p-164, 0x1.f5a41ad29abd6p-160 },
+ { 0x1.c6e52f00f28e6p-165, 0x1.925df815332e1p-160 },
+ { 0x1.6ca07adb2cabep-165, 0x1.42b32a68b6433p-160 },
+ { 0x1.243c4de072741p-165, 0x1.02c65f05a223cp-160 },
+ { 0x1.d4603cf73627ep-166, 0x1.9ef9ba1f58105p-161 },
+ { 0x1.774b9c8b0652p-166, 0x1.4cb0a4ddc2264p-161 },
+ { 0x1.2cad15ed5f00dp-166, 0x1.0ab038a2ddd17p-161 },
+ { 0x1.e1ba565f2f2dap-167, 0x1.ab82536c08c11p-162 },
+ { 0x1.81da56c03901cp-167, 0x1.569ce24f30cadp-162 },
+ { 0x1.350587b61e2e7p-167, 0x1.128ac3f80b9acp-162 },
+ { 0x1.eeeaf2386ba73p-168, 0x1.b7f008c184953p-163 },
+ { 0x1.8c45dba9ebaffp-168, 0x1.6071b5b7d5f0bp-163 },
+ { 0x1.3d40375ab2fc9p-168, 0x1.1a5112ad78884p-163 },
+ { 0x1.fbe96dd52dd2ap-169, 0x1.c43afb43abf3ap-164 },
+ { 0x1.96874b77050b3p-169, 0x1.6a28d7dab475p-164 },
+ { 0x1.4557ac9b8a4ffp-169, 0x1.21fe234726979p-164 },
+ { 0x1.04568afbad70bp-169, 0x1.d05b30647f5b6p-165 },
+ { 0x1.a097bba9c5bbap-170, 0x1.73bbedaae952fp-165 },
+ { 0x1.4d4668bc3c638p-170, 0x1.298ce64edbc52p-165 },
+ { 0x1.0a969821c25d4p-170, 0x1.dc489a35fd89p-166 },
+ { 0x1.aa703eac27071p-171, 0x1.7d248efdebaf1p-166 },
+ { 0x1.5506ec96ce1d8p-171, 0x1.30f843b6c62b7p-166 },
+ { 0x1.10b0827e1c59fp-171, 0x1.e7fb2011e1175p-167 },
+ { 0x1.b409eb99c2287p-172, 0x1.865c4d7ebd336p-167 },
+ { 0x1.5c93bed6568e9p-172, 0x1.383b206d0bb99p-167 },
+ { 0x1.169ff47b694c6p-172, 0x1.f36aa78ac249dp-168 },
+ { 0x1.bd5de633517f7p-173, 0x1.8f5cbbd7e3bd9p-168 },
+ { 0x1.63e7724f64774p-173, 0x1.3f5064180659dp-168 },
+ { 0x1.1c60a3dd2224ep-173, 0x1.fe8f1d993bb19p-169 },
+ { 0x1.c66566ef40333p-174, 0x1.981f750955121p-169 },
+ { 0x1.6afcac6c09d1ap-174, 0x1.4632fef2669ecp-169 },
+ { 0x1.21ee56dbc8c6ap-174, 0x1.04b03ffb7174ap-169 },
+ { 0x1.cf19c31a391acp-175, 0x1.a09e23dee12dbp-170 },
+ { 0x1.71ce2ba111a68p-175, 0x1.4cddefbe00daep-170 },
+ { 0x1.2744e94597dfp-175, 0x1.09eb734c1a314p-170 },
+ { 0x1.d77474fa3c96fp-176, 0x1.a8d28a7b21f9ep-171 },
+ { 0x1.7856cde19858bp-176, 0x1.534c49c3a48ap-171 },
+ { 0x1.2c60519b06073p-176, 0x1.0ef5469afe541p-171 },
+ { 0x1.df6f23e67822ep-177, 0x1.b0b689ea896fp-172 },
+ { 0x1.7e9197060941ap-177, 0x1.59793ad60d8abp-172 },
+ { 0x1.313ca61e59763p-177, 0x1.13c9ee6b2a529p-172 },
+ { 0x1.e703ac45eb1a5p-178, 0x1.b84429b1d33d8p-173 },
+ { 0x1.8479b71b66ff2p-178, 0x1.5f60114dc317ap-173 },
+ { 0x1.35d621cd7892fp-178, 0x1.1865baa279b03p-173 },
+ { 0x1.ee2c2766d39aep-179, 0x1.bf759f4ae6481p-174 },
+ { 0x1.8a0a908fbee34p-179, 0x1.64fc41f392bcdp-174 },
+ { 0x1.3a29293d26666p-179, 0x1.1cc51b3533d1bp-174 },
+ { 0x1.f4e2f320ed2f5p-180, 0x1.c645558315ad7p-175 },
+ { 0x1.8f3fbe30bc1d8p-180, 0x1.6a496dcf4682p-175 },
+ { 0x1.3e324f4cf0981p-180, 0x1.20e4a4b8e031ep-175 },
+ { 0x1.fb22b934b993p-181, 0x1.ccadf3adb1afp-176 },
+ { 0x1.941518f17ca26p-181, 0x1.6f4367d03dbd8p-176 },
+ { 0x1.41ee59ab3f625p-181, 0x1.24c114d62226p-176 },
+ { 0x1.00733b2d2d2a7p-181, 0x1.d2aa649df6e65p-177 },
+ { 0x1.9886bd6d1085bp-182, 0x1.73e63a45afd4dp-177 },
+ { 0x1.455a452136a6p-182, 0x1.285756918be22p-177 },
+ { 0x1.0314c07978175p-182, 0x1.d835dd5ba6335p-178 },
+ { 0x1.9c91111b6c15fp-183, 0x1.782e2c1c97a81p-178 },
+ { 0x1.4873499e69a71p-183, 0x1.2ba486638ab1ep-178 },
+ { 0x1.0573c7a800f18p-183, 0x1.dd4be385e972p-179 },
+ { 0x1.a030c72f0cf33p-184, 0x1.7c17c5d99552cp-179 },
+ { 0x1.4b36ddfcc8743p-184, 0x1.2ea5f617d321fp-179 },
+ { 0x1.078e5ec28bafdp-184, 0x1.e1e853589fe15p-180 },
+ { 0x1.a362e51221b9fp-185, 0x1.7f9fd64579e1ap-180 },
+ { 0x1.4da2bb75a5c65p-185, 0x1.3159306d0abdp-180 },
+ { 0x1.0962c95c3eb5p-185, 0x1.e6076548c0765p-181 },
+ { 0x1.a624c67aa97dfp-186, 0x1.82c376c3acddfp-181 },
+ { 0x1.4fb4e0c13d49p-186, 0x1.33bbfc6dd55a6p-181 },
+ { 0x1.0aef82f484486p-186, 0x1.e9a5b32d2ef52p-182 },
+ { 0x1.a874210dbadcfp-187, 0x1.85800f4a2d262p-182 },
+ { 0x1.516b94dabb86dp-187, 0x1.35cc607ce4fd8p-182 },
+ { 0x1.0c33410fd4c56p-187, 0x1.ecc03cea2935dp-183 },
+ { 0x1.aa4f078af0321p-188, 0x1.87d359f39448ep-183 },
+ { 0x1.52c5696370c9dp-188, 0x1.3788a50e33e44p-183 },
+ { 0x1.0d2cf5025ba2dp-188, 0x1.ef546c9652b0ap-184 },
+ { 0x1.abb3ec79d594dp-189, 0x1.89bb66243bfd5p-184 },
+ { 0x1.53c13ca08d951p-189, 0x1.38ef570827673p-184 },
+ { 0x1.0ddbcd68fc943p-189, 0x1.f1601a115b514p-185 },
+ { 0x1.aca1a45423b35p-190, 0x1.8b369b3c6ec4fp-185 },
+ { 0x1.545e3b0f8838ap-190, 0x1.39ff49c7fe5e8p-185 },
+ { 0x1.0e3f374dd9d68p-190, 0x1.f2e18e05495b4p-186 },
+ { 0x1.ad1767288e013p-191, 0x1.8c43bad265564p-186 },
+ { 0x1.549be08e15927p-191, 0x1.3ab798c59d4c2p-186 },
+ { 0x1.0e56def61fbc4p-191, 0x1.f3d7844c8a592p-187 },
+ { 0x1.ad14d1b2f0b5fp-192, 0x1.8ce1e26fb8214p-187 },
+ { 0x1.5479f9137160bp-192, 0x1.3b17a8d383f04p-187 },
+ { 0x1.0e22b05782284p-192, 0x1.f4412db819edfp-188 },
+ { 0x1.ac99e5e7b9269p-193, 0x1.8d108ccedcd75p-188 },
+ { 0x1.53f8a0f98a8b8p-193, 0x1.3b1f28f8795cap-188 },
+ { 0x1.0da2d734853ffp-193, 0x1.f41e3132440dap-189 },
+ { 0x1.aba70af1767bp-194, 0x1.8ccf9296410aep-189 },
+ { 0x1.531844d58365ep-194, 0x1.3ace12e143377p-189 },
+ { 0x1.0cd7bedf59779p-194, 0x1.f36eac3bc78c2p-190 },
+ { 0x1.aa3d0ca096eedp-195, 0x1.8c1f2a8f92477p-190 },
+ { 0x1.51d9a0dfd2e93p-195, 0x1.3a24aae988ae7p-190 },
+ { 0x1.0bc211a3c2859p-195, 0x1.f23332c263066p-191 },
+ { 0x1.a85d1a4e6bedcp-196, 0x1.8affe95ac6f2ap-191 },
+ { 0x1.503dbfed30324p-196, 0x1.39237fbbcfa18p-191 },
+ { 0x1.0a62b7d92f095p-196, 0x1.f06cce511da3ep-192 },
+ { 0x1.a608c535a2ba1p-197, 0x1.8972c09d7f45cp-192 },
+ { 0x1.4e45f9fa4adffp-197, 0x1.37cb698950bdap-192 },
+ { 0x1.08bad69ed20a4p-197, 0x1.ee1cfc9be3df9p-193 },
+ { 0x1.a341fe436d2d7p-198, 0x1.8778fdb058321p-193 },
+ { 0x1.4bf3f24d273a5p-198, 0x1.361d88db2b95bp-193 },
+ { 0x1.06cbce44363ecp-198, 0x1.eb45ad695330ap-194 },
+ { 0x1.a00b13659be7cp-199, 0x1.851447ccc879bp-194 },
+ { 0x1.4949952fc2371p-199, 0x1.341b44ff4c3c6p-194 },
+ { 0x1.0497386163a39p-199, 0x1.e7e93fdecaep-195 },
+ { 0x1.9c66ac5ae65b3p-200, 0x1.82469dbf1833ep-195 },
+ { 0x1.464915486577bp-200, 0x1.31c64a141680ep-195 },
+ { 0x1.021ee5a248c7fp-200, 0x1.e40a7f340982ap-196 },
+ { 0x1.9857c70b8b2bcp-201, 0x1.7f125320f1e94p-196 },
+ { 0x1.42f4e894cc71ap-201, 0x1.2f2086b6a5cf4p-196 },
+ { 0x1.fec9b69351b7p-202, 0x1.dfac9ed4c27cep-197 },
+ { 0x1.93e1b371520a1p-202, 0x1.7b7a0d21f0262p-197 },
+ { 0x1.3f4fc50de840ap-202, 0x1.2c2c295822108p-197 },
+ { 0x1.f8d6a0e0a9508p-203, 0x1.dad335f7aacdbp-198 },
+ { 0x1.8f080f16c57cp-203, 0x1.7780bee4609a1p-198 },
+ { 0x1.3b5c9cfaada16p-203, 0x1.28eb9d3f5000ap-198 },
+ { 0x1.f269560bdbf92p-204, 0x1.d5823ab37d92ep-199 },
+ { 0x1.89cec0363502dp-204, 0x1.7329a5753ca24p-199 },
+ { 0x1.371e9af8e6ccfp-204, 0x1.2561873c1cc7ap-199 },
+ { 0x1.eb86f931c309dp-205, 0x1.cfbdfc9b64d6ep-200 },
+ { 0x1.8439f081b525ap-205, 0x1.6e7843670c8d2p-200 },
+ { 0x1.32991dc38028ep-205, 0x1.2190c2136fc76p-200 },
+ { 0x1.e434fdd743954p-206, 0x1.c98b1eed08258p-201 },
+ { 0x1.7e4e079de1a2ep-206, 0x1.69705c180d6c1p-201 },
+ { 0x1.2dcfb3be31ebdp-206, 0x1.1d7c5aaa0949p-201 },
+ { 0x1.dc7920bafc5dcp-207, 0x1.c2ee925b3e3f6p-202 },
+ { 0x1.780fa5599d558p-207, 0x1.6415eeac7f744p-202 },
+ { 0x1.28c6164ec1235p-207, 0x1.19278bf59ff34p-202 },
+ { 0x1.d459605b63623p-208, 0x1.bbed8e8100752p-203 },
+ { 0x1.71839bad6a45bp-208, 0x1.5e6d30c67b96bp-203 },
+ { 0x1.2380250c57526p-208, 0x1.1495babbc8d8ep-203 },
+ { 0x1.cbdbf53eed588p-209, 0x1.b48d8b08c37b5p-204 },
+ { 0x1.6aaee88d3a5e6p-209, 0x1.587a8905112ebp-204 },
+ { 0x1.1e01e0cda0c0ep-209, 0x1.0fca71267dd26p-204 },
+ { 0x1.c3074a0c1c67dp-210, 0x1.acd43894c1f06p-205 },
+ { 0x1.6396af97c5f7fp-210, 0x1.52428954b7c2fp-205 },
+ { 0x1.184f669e7e645p-210, 0x1.0ac95a364b406p-205 },
+ { 0x1.b9e1f37f768c9p-211, 0x1.a4c779750fb77p-206 },
+ { 0x1.5c4033ae88d94p-211, 0x1.4bc9e91b546a8p-206 },
+ { 0x1.126ceaa621095p-211, 0x1.05963d1a5105bp-206 },
+ { 0x1.b072a84d6770bp-212, 0x1.9c6d5a387a6d7p-207 },
+ { 0x1.54b0d08180ac6p-212, 0x1.45157f4a2e598p-207 },
+ { 0x1.0c5eb30658611p-212, 0x1.0034f87652744p-207 },
+ { 0x1.a6c038fdf5aedp-213, 0x1.93cc0a254a9f5p-208 },
+ { 0x1.4cedf419a9b38p-213, 0x1.3e2a3c60327aap-208 },
+ { 0x1.062912bcc23f9p-213, 0x1.f552fb3e1c70bp-209 },
+ { 0x1.9cd187cff951cp-214, 0x1.8ae9d3a6eb66fp-209 },
+ { 0x1.44fd186d008c2p-214, 0x1.370d2466d3327p-209 },
+ { 0x1.ffa0c91caab55p-215, 0x1.e9ef97aa04b46p-210 },
+ { 0x1.92ad80b12a09bp-215, 0x1.81cd14bd535bbp-210 },
+ { 0x1.3ce3bd0683046p-215, 0x1.2fc348f3a8121p-210 },
+ { 0x1.f2b20c0b002abp-216, 0x1.de47d70b3398cp-211 },
+ { 0x1.885b1157e885cp-216, 0x1.787c377ac34cdp-211 },
+ { 0x1.34a760cc47acap-216, 0x1.2851c338b22e4p-211 },
+ { 0x1.e58ea51580badp-217, 0x1.d263d33512bb6p-212 },
+ { 0x1.7de1218b19542p-217, 0x1.6efdaa9c0e45ep-212 },
+ { 0x1.2c4d7bed4d522p-217, 0x1.20bdae2cd61c6p-212 },
+ { 0x1.d83f3d3e6d15p-218, 0x1.c64ba5bdb46dep-213 },
+ { 0x1.73468ba3c29b8p-218, 0x1.6557da47246f7p-213 },
+ { 0x1.23db7a001a935p-218, 0x1.190c20d5b5808p-213 },
+ { 0x1.cacc668087b83p-219, 0x1.ba075f0192b6p-214 },
+ { 0x1.689215536317fp-219, 0x1.5b9128fb09361p-214 },
+ { 0x1.1b56b45aac06fp-219, 0x1.114228bb99133p-214 },
+ { 0x1.bd3e92f58e3aep-220, 0x1.ad9efd6e7e35p-215 },
+ { 0x1.5dca68b92a62fp-220, 0x1.51afe8bbb6b6cp-215 },
+ { 0x1.12c46cab86e91p-220, 0x1.0964c48f92b05p-215 },
+ { 0x1.af9e0c680145ap-221, 0x1.a11a652260dp-216 },
+ { 0x1.52f60dcf5b39p-221, 0x1.47ba5483b6e8fp-216 },
+ { 0x1.0a29c7db10f7p-221, 0x1.0178df0b67157p-216 },
+ { 0x1.a1f2ec5b27de2p-222, 0x1.948157e97fbd7p-217 },
+ { 0x1.481b643932becp-222, 0x1.3db68a0470a4fp-217 },
+ { 0x1.018bc93b8e2e5p-222, 0x1.f306942454ae6p-218 },
+ { 0x1.9445149305037p-223, 0x1.87db6da6dd3cap-218 },
+ { 0x1.3d409d78b6819p-223, 0x1.33aa83bd4deabp-218 },
+ { 0x1.f1de9c1ab95aap-224, 0x1.e311742f9561bp-219 },
+ { 0x1.869c2824b4b6bp-224, 0x1.7b300d303ed2cp-219 },
+ { 0x1.326bb792c8c5bp-224, 0x1.299c1370fc2d1p-219 },
+ { 0x1.e0b212b870715p-225, 0x1.d31b83aa1a53bp-220 },
+ { 0x1.78ff85165ac91p-225, 0x1.6e8665a634affp-220 },
+ { 0x1.27a27826da7a5p-225, 0x1.1f90dcff1976ep-220 },
+ { 0x1.cf9b0072f8176p-226, 0x1.c32d9c998168ap-221 },
+ { 0x1.6b763e947db08p-226, 0x1.61e5684f4d137p-221 },
+ { 0x1.1cea67fe8699cp-226, 0x1.158e51a7ac97ep-221 },
+ { 0x1.bea20cad09b1fp-227, 0x1.b350464c51c99p-222 },
+ { 0x1.5e0717c155a1cp-227, 0x1.5553c2fc66728p-222 },
+ { 0x1.1248cf18568a2p-227, 0x1.0b99abbccdbb1p-222 },
+ { 0x1.adcf760300963p-228, 0x1.a38baebfb68e4p-223 },
+ { 0x1.50b87f214792dp-228, 0x1.48d7dafad7ffep-223 },
+ { 0x1.07c2b12fe4dbap-228, 0x1.01b7eac5ea688p-223 },
+ { 0x1.9d2b0d0c4a0b1p-229, 0x1.93e7a4bb0743p-224 },
+ { 0x1.43908aa677d25p-229, 0x1.3c77c897ed254p-224 },
+ { 0x1.fab995891c153p-230, 0x1.efdba02e2ceffp-225 },
+ { 0x1.8cbc2fe600108p-230, 0x1.846b92a47c343p-225 },
+ { 0x1.3694f45c1b92fp-230, 0x1.30395337f89bbp-225 },
+ { 0x1.e6371d3dc0233p-231, 0x1.dc7fb7bbca8adp-226 },
+ { 0x1.7c89c6867890ep-231, 0x1.751e7a10e8264p-226 },
+ { 0x1.29cb17b0f706bp-231, 0x1.2421ee0211f87p-226 },
+ { 0x1.d20647a807a0cp-232, 0x1.c9649548abac7p-227 },
+ { 0x1.6c9a3fd812077p-232, 0x1.6606f00ed6d5dp-227 },
+ { 0x1.1d37ef5f490cdp-232, 0x1.1836b52067807p-227 },
+ { 0x1.be2ec88ae1479p-233, 0x1.b6922692e74d4p-228 },
+ { 0x1.5cf38f9818abfp-233, 0x1.572b1a2c0293ap-228 },
+ { 0x1.10e013ef486f7p-233, 0x1.0c7c6b93f06a1p-228 },
+ { 0x1.aab7b734b99f6p-234, 0x1.a40fcadcdd133p-229 },
+ { 0x1.4d9b2cf546b09p-234, 0x1.4890ac32b69b5p-229 },
+ { 0x1.04c7bad04b57cp-234, 0x1.00f779993bbc1p-229 },
+ { 0x1.97a78d5f1c6dbp-235, 0x1.91e450ac30542p-230 },
+ { 0x1.3e9611e8218p-235, 0x1.3a3ce69b6a143p-230 },
+ { 0x1.f1e56c0773bb7p-236, 0x1.eb57d7362f984p-231 },
+ { 0x1.850426f2df55dp-236, 0x1.8015f467ddd4p-231 },
+ { 0x1.2fe8bb3e4f4d8p-236, 0x1.2c3495adab7d8p-231 },
+ { 0x1.dac8e8a813f1fp-237, 0x1.d53ae35dbfa26p-232 },
+ { 0x1.72d2c2a7422abp-237, 0x1.6eaa5fce4af3ap-232 },
+ { 0x1.21972950f570dp-237, 0x1.1e7c114a57a33p-232 },
+ { 0x1.c44004226dc17p-238, 0x1.bf9ebf2ac34cfp-233 },
+ { 0x1.6118037139874p-238, 0x1.5da6aa3adb7a3p-233 },
+ { 0x1.13a4e15d42467p-238, 0x1.11173d5813f4dp-233 },
+ { 0x1.ae501496e23f2p-239, 0x1.aa895a750e0f6p-234 },
+ { 0x1.4fd7f2b705e64p-239, 0x1.4d0f59b16ac32p-234 },
+ { 0x1.0614ef7575b09p-239, 0x1.04098aca1b898p-234 },
+ { 0x1.98fdb1084fd1cp-240, 0x1.95ffef5a788b3p-235 },
+ { 0x1.3f16033b4da17p-240, 0x1.3ce864a4f75bbp-235 },
+ { 0x1.f1d3d20014dd3p-241, 0x1.eeabf27142ccbp-236 },
+ { 0x1.844cb59a101a9p-241, 0x1.82070510e6e91p-236 },
+ { 0x1.2ed514b22b68bp-241, 0x1.2d35346de60f3p-236 },
+ { 0x1.d84bdf7421499p-242, 0x1.d5fe3202b4d44p-237 },
+ { 0x1.7040489842ad7p-242, 0x1.6ea2738b3dbebp-237 },
+ { 0x1.1f1777f205012p-242, 0x1.1df8a8637ba9cp-237 },
+ { 0x1.bf956a62adf73p-243, 0x1.be0e1bcc5bf2bp-238 },
+ { 0x1.5cdae0381ff94p-243, 0x1.5bd567e120a1cp-238 },
+ { 0x1.0fdef3b187063p-243, 0x1.0f35198b8b7f7p-238 },
+ { 0x1.a7b2fd5556b6ap-244, 0x1.a6df243f2c6f4p-239 },
+ { 0x1.4a1e48fd99b8ep-244, 0x1.49a26968a8fd1p-239 },
+ { 0x1.012cc9c3d142ap-244, 0x1.00ec5ed2dbe3ep-239 },
+ { 0x1.90a652d08b6ecp-245, 0x1.9073f3afbdfebp-240 },
+ { 0x1.380bacb3471d9p-245, 0x1.380b5f70c487dp-240 },
+ { 0x1.e603798765b0ap-246, 0x1.e63fa380d130bp-241 },
+ { 0x1.7a705e88ab4c8p-246, 0x1.7ace6e086aab7p-241 },
+ { 0x1.26a399e180e7cp-246, 0x1.2711978a97cf7p-241 },
+ { 0x1.cabc2c3d98d7cp-247, 0x1.cba0a72ae9c08p-242 },
+ { 0x1.651157275ac6fp-247, 0x1.65efbb20adf2dp-242 },
+ { 0x1.15e60bb1a2bacp-247, 0x1.16b5cc5019368p-242 },
+ { 0x1.b08358e30e1b1p-248, 0x1.b1fca598944c3p-243 },
+ { 0x1.5088c08941b89p-248, 0x1.51d84fa353951p-243 },
+ { 0x1.05d2722aa0abep-248, 0x1.06f82c9619b9p-243 },
+ { 0x1.9757d44a0d5d1p-249, 0x1.9953a1cf16aadp-244 },
+ { 0x1.3cd5765cc7b51p-249, 0x1.3e87f66d27bbp-244 },
+ { 0x1.eccf7568ff3afp-250, 0x1.efb0c5f0312cdp-245 },
+ { 0x1.7f37a88128933p-250, 0x1.81a4d1085cfd1p-245 },
+ { 0x1.29f5b70afae6ep-250, 0x1.2bfdda4e2b20cp-245 },
+ { 0x1.cf48b1a182cb9p-251, 0x1.d2ab3b59164a6p-246 },
+ { 0x1.682022c0d8296p-251, 0x1.6aeea740e7e26p-246 },
+ { 0x1.17e72ed48d1c2p-251, 0x1.1a389017ca93cp-246 },
+ { 0x1.b30c9decefa86p-252, 0x1.b6dd2d215fccfp-247 },
+ { 0x1.520de188c8ff4p-252, 0x1.552ee415230cdp-247 },
+ { 0x1.06a7030db71fbp-252, 0x1.093620e33d9f9p-247 },
+ { 0x1.98166f02e00aap-253, 0x1.9c4336b720df7p-248 },
+ { 0x1.3cfce2d301755p-253, 0x1.40629fd47fda6p-248 },
+ { 0x1.ec63bac9af50ap-254, 0x1.f1e828f7f1e6ep-249 },
+ { 0x1.7e609b497d4bfp-254, 0x1.82d92bd0fbc5bp-249 },
+ { 0x1.28e89244647b5p-254, 0x1.2c8658b1c7fabp-249 },
+ { 0x1.cd07ee41894f6p-255, 0x1.d2def7b6139fbp-250 },
+ { 0x1.65e4eca3c47cep-255, 0x1.6a9a29142865ap-250 },
+ { 0x1.15cbd7439af48p-255, 0x1.1995fff959855p-250 },
+ { 0x1.af324889fe32ep-256, 0x1.b549f742691f7p-251 },
+ { 0x1.4e9c920d5db05p-256, 0x1.5380a4af4c2e9p-251 },
+ { 0x1.03a122e1077b7p-256, 0x1.078d07375b0bp-251 },
+ { 0x1.92d9bd168c63p-257, 0x1.9921acfd99f39p-252 },
+ { 0x1.388030ea8589cp-257, 0x1.3d867ecfb60a5p-252 },
+ { 0x1.e4c4faf832008p-258, 0x1.ecccda72dba49p-253 },
+ { 0x1.77f4a046c515ep-258, 0x1.7e5deef2de87bp-253 },
+ { 0x1.2387f5f4b712ep-258, 0x1.28a511d87ce7dp-253 },
+ { 0x1.c413282821079p-259, 0x1.cc3995b1e2c4p-254 },
+ { 0x1.5e78bc56d0fbbp-259, 0x1.64f5f80200f46p-254 },
+ { 0x1.0faba5af01355p-259, 0x1.14d5424501d7ep-254 },
+ { 0x1.a51f8a6830159p-260, 0x1.ad54bef9112dp-255 },
+ { 0x1.465b65a83bdbbp-260, 0x1.4ce07b8d50856p-255 },
+ { 0x1.f9c5589e7201fp-261, 0x1.020f8e226943ep-255 },
+ { 0x1.87dc5ad8af9ecp-261, 0x1.90123a8271991p-256 },
+ { 0x1.2f918e4d3f95cp-261, 0x1.3613b89391a8fp-256 },
+ { 0x1.d6485a170413ap-262, 0x1.e098381b76cd3p-257 },
+ { 0x1.6c3b66970be3dp-262, 0x1.7465697a54c64p-257 },
+ { 0x1.1a0fd8c3a4e6fp-262, 0x1.20858c20a1795p-257 },
+ { 0x1.b4ce217bd5e55p-263, 0x1.bf05934cfa1ccp-258 },
+ { 0x1.522e259c7017ap-263, 0x1.5a41409f84e49p-258 },
+ { 0x1.05caa9cf257c4p-263, 0x1.0c2b83023243dp-258 },
+ { 0x1.954427a430b11p-264, 0x1.9f5672cf62a4fp-259 },
+ { 0x1.39a5d07601e71p-264, 0x1.41985de8f7a14p-259 },
+ { 0x1.e56c72cc01fccp-265, 0x1.f1f5d5615d783p-260 },
+ { 0x1.7797a6e64ddc9p-265, 0x1.8179bfb69c631p-260 },
+ { 0x1.229374c83806p-265, 0x1.2a5d1d1f1ae5cp-260 },
+ { 0x1.c18d454a503aep-266, 0x1.cdd1c2bddbb9ep-261 },
+ { 0x1.5bb5b3e414ad3p-266, 0x1.655e203c78adp-261 },
+ { 0x1.0ce808921de57p-266, 0x1.1481ab5a1469ap-261 },
+ { 0x1.9fdfe587f056ap-267, 0x1.abd4ca4bd8884p-262 },
+ { 0x1.418b54bd6a895p-267, 0x1.4af20f59f283dp-262 },
+ { 0x1.f128f851039d9p-268, 0x1.fff032b2dbde7p-263 },
+ { 0x1.804c6e03f60cbp-268, 0x1.8be8c488684b4p-263 },
+ { 0x1.290596a08a94fp-268, 0x1.3223f2e5be0fp-263 },
+ { 0x1.cb1395c8187f6p-269, 0x1.d964d959533d1p-264 },
+ { 0x1.62bb1316ec5fcp-269, 0x1.6df780d5ecc43p-264 },
+ { 0x1.1211a1b47d3aep-269, 0x1.1ae2302fd4bcdp-264 },
+ { 0x1.a772150026811p-270, 0x1.b5455f4e2ce45p-265 },
+ { 0x1.47143aa78b5fep-270, 0x1.51eade2a24279p-265 },
+ { 0x1.f93996ba5e93dp-271, 0x1.051b3f15282e5p-265 },
+ { 0x1.8626f2553e204p-271, 0x1.93760037df87ap-266 },
+ { 0x1.2d4091cd12adcp-271, 0x1.37ace1ccc1a8dp-266 },
+ { 0x1.d1294db79df79p-272, 0x1.e17b7713cf17fp-267 },
+ { 0x1.6715149108678p-272, 0x1.73db39c4b278bp-267 },
+ { 0x1.1529206516167p-272, 0x1.1f27cc2724f9p-267 },
+ { 0x1.abce28a1f17f2p-273, 0x1.bb70eb3792a1cp-268 },
+ { 0x1.4a1fe3e55f964p-273, 0x1.5659e4463ddd1p-268 },
+ { 0x1.fd6eb54be7326p-274, 0x1.08462ba9624dbp-268 },
+ { 0x1.89049c51b8388p-274, 0x1.97f4ffe1284a1p-269 },
+ { 0x1.2f2b5e6789756p-274, 0x1.3ad748e88c53fp-269 },
+ { 0x1.d3aa617478594p-275, 0x1.e5e5db98318a5p-270 },
+ { 0x1.68a9e9f7b2f9ap-275, 0x1.76e6798f53e9ap-270 },
+ { 0x1.161c2a1de488ep-275, 0x1.21393590da64bp-270 },
+ { 0x1.acda38e82463bp-276, 0x1.be32dc731f12cp-271 },
+ { 0x1.4a9c33e05809ap-276, 0x1.5824d30f3fce1p-271 },
+ { 0x1.fdaf4969fc45p-277, 0x1.09660e736b8bdp-271 },
+ { 0x1.88d45a53c41c5p-277, 0x1.994b0856743cbp-272 },
+ { 0x1.2eba8f55fe897p-277, 0x1.3b9051c5e7679p-272 },
+ { 0x1.d287e1e77c85ap-278, 0x1.e689bae600601p-273 },
+ { 0x1.6770239fc87e6p-278, 0x1.77071c1633b26p-273 },
+ { 0x1.14e513c1b20dcp-278, 0x1.210a174166fcdp-273 },
+ { 0x1.aa90041143186p-279, 0x1.bd7abebe480e6p-274 },
+ { 0x1.488642c71cfa6p-279, 0x1.5740f6d4ed277p-274 },
+ { 0x1.f9f9ce5a157bbp-280, 0x1.0874302ee34fdp-274 },
+ { 0x1.85974997b931fp-280, 0x1.97701e51a6bfep-275 },
+ { 0x1.2bf0c37efc00bp-280, 0x1.39d3aac239fe2p-275 },
+ { 0x1.cdc89092e43c3p-281, 0x1.e36341a88ea0cp-276 },
+ { 0x1.636f0e2785c54p-281, 0x1.743c5e4db43f9p-276 },
+ { 0x1.118b19def65f8p-281, 0x1.1e9b8ad36fd99p-276 },
+ { 0x1.a4fd2c459c71p-282, 0x1.b94cde5e4fc3p-277 },
+ { 0x1.43ea7a73d5cfp-282, 0x1.53b3a109a94aep-277 },
+ { 0x1.f26454740b953p-283, 0x1.057635a1ed1dfp-277 },
+ { 0x1.7f60ab495565cp-283, 0x1.926f55b776f91p-278 },
+ { 0x1.26de8be09d876p-283, 0x1.35abb1f1cadefp-278 },
+ { 0x1.c5889cb51dbb9p-284, 0x1.dc853b381e5ap-279 },
+ { 0x1.5cbe6a335189cp-284, 0x1.6e96e5d005f5dp-279 },
+ { 0x1.0c22190c33c65p-284, 0x1.19fc0dba0e848p-279 },
+ { 0x1.9c42b0a7816acp-285, 0x1.b1c21d6e11086p-280 },
+ { 0x1.3ce41b9a97542p-285, 0x1.4d91f3701143cp-280 },
+ { 0x1.e71ba6efe048bp-286, 0x1.007de792cfd6ep-280 },
+ { 0x1.76552635a3b27p-286, 0x1.8a6663a0ececbp-281 },
+ { 0x1.1fa1c7f04e719p-286, 0x1.2f310e41037d6p-281 },
+ { 0x1.b9f88d1e59fb3p-287, 0x1.d2185735c5ad9p-282 },
+ { 0x1.538582347c59ep-287, 0x1.66381bdd98a02p-282 },
+ { 0x1.04c9ca3c242adp-287, 0x1.1346f1ba5a69ap-282 },
+ { 0x1.9093a8968bba5p-288, 0x1.a706fd9470fb8p-283 },
+ { 0x1.339c31e0d51b7p-288, 0x1.45000f1eec014p-283 },
+ { 0x1.d8619415342d3p-289, 0x1.f3510620184eap-284 },
+ { 0x1.6aa95f63dd017p-289, 0x1.7f84791f6fdbbp-284 },
+ { 0x1.16648113f6ec6p-289, 0x1.2689bc620188bp-284 },
+ { 0x1.ab5b65b277be7p-290, 0x1.c45998d7521aep-285 },
+ { 0x1.47f9aad3382fep-290, 0x1.5b50e4b7d6356p-285 },
+ { 0x1.f7591b1b1c875p-291, 0x1.0aa3508d5dbp-285 },
+ { 0x1.82335294ba26p-291, 0x1.9959eb6f64db6p-286 },
+ { 0x1.2848053b7dfb1p-291, 0x1.3a2fb2a16d1ccp-286 },
+ { 0x1.c68a6f5a8ef62p-292, 0x1.e23b370697cbbp-287 },
+ { 0x1.5c9ffcce7e5fdp-292, 0x1.720876851d9fbp-287 },
+ { 0x1.0b5b54d487d35p-292, 0x1.1be79c992aff6p-287 },
+ { 0x1.9a0421e5c5d71p-293, 0x1.b3980569c43a5p-288 },
+ { 0x1.3a5c4268d4e27p-293, 0x1.4e1fc4f822568p-288 },
+ { 0x1.e1fba80d34a41p-294, 0x1.0042910b94342p-288 },
+ { 0x1.7172912ec21f8p-294, 0x1.8908e30f7a1b3p-289 },
+ { 0x1.1b271db151968p-294, 0x1.2d5e5a1b8288ep-289 },
+ { 0x1.b1f9ef2d6b135p-295, 0x1.ce1b3b9ea6267p-290 },
+ { 0x1.4c872d1af92bcp-295, 0x1.623e8fb994f23p-290 },
+ { 0x1.fd87064e02a6fp-296, 0x1.0f8695160ca38p-290 },
+ { 0x1.8652a61cdcd3bp-296, 0x1.a031b186be289p-291 },
+ { 0x1.2af84a660968dp-296, 0x1.3eee8e04dc3ap-291 },
+ { 0x1.c9f07af149226p-297, 0x1.e8bd23cc416fp-292 },
+ { 0x1.5eacf76fffc0cp-297, 0x1.766e8d5583265p-292 },
+ { 0x1.0c80f3efbbf3fp-297, 0x1.1ed2fab014c43p-292 },
+ { 0x1.9b1f8ffd8f3c8p-298, 0x1.b76010ebb6c6ap-293 },
+ { 0x1.3ab5d5023fe4ap-298, 0x1.507d813502ab7p-293 },
+ { 0x1.e1c174ea2aaa6p-299, 0x1.01aa61c90eaccp-293 },
+ { 0x1.70b05029068dap-299, 0x1.8a90544ab274dp-294 },
+ { 0x1.1a1fba21de5fp-299, 0x1.2e0fb0911dd84p-294 },
+ { 0x1.afb70654af059p-300, 0x1.ce6f24739f7c7p-295 },
+ { 0x1.4a458b53b2a84p-300, 0x1.61eefc532711fp-295 },
+ { 0x1.f944d95c81983p-301, 0x1.0edb77098a96p-295 },
+ { 0x1.8272ab43f7156p-301, 0x1.9e82e04d9025fp-296 },
+ { 0x1.278886c5a4d73p-301, 0x1.3d237a2e0f859p-296 },
+ { 0x1.c3f57b512a1f2p-302, 0x1.e5385c7d0efep-297 },
+ { 0x1.598c52c5d1746p-302, 0x1.73258d0b919ebp-297 },
+ { 0x1.0828ad1da0983p-302, 0x1.1bdb57d01ceccp-297 },
+ { 0x1.93d4935512f54p-303, 0x1.b223e5e67d24ap-298 },
+ { 0x1.34a3670d3cd59p-303, 0x1.4bf43098a2ef1p-298 },
+ { 0x1.d7b67cefff216p-304, 0x1.fb93db1e39a21p-299 },
+ { 0x1.686e7356020d2p-304, 0x1.8402d3eada60ap-299 },
+ { 0x1.135e695d6d4f8p-304, 0x1.2892e3159736p-299 },
+ { 0x1.a4b6028e1ae52p-305, 0x1.c5502f868f04bp-300 },
+ { 0x1.415808da66669p-305, 0x1.5a670a5d83e0ep-300 },
+ { 0x1.ead51e60a821dp-306, 0x1.08ac71830fd4ep-300 },
+ { 0x1.76cfe88ffbfa7p-306, 0x1.9467d9d3bce7dp-301 },
+ { 0x1.1e2e61d740a91p-306, 0x1.34ea92731d6fp-301 },
+ { 0x1.b4f6c22875415p-307, 0x1.d7e402cf49a21p-302 },
+ { 0x1.4d8e03e448998p-307, 0x1.6860e96265ba8p-302 },
+ { 0x1.fd2c6816f010bp-308, 0x1.132f279000564p-302 },
+ { 0x1.8494b75728df1p-308, 0x1.a4356bd52863ep-303 },
+ { 0x1.28836b62851b4p-308, 0x1.40cac092d16a6p-303 },
+ { 0x1.c476ceb4ce0a6p-309, 0x1.e9bb8c8c45eaap-304 },
+ { 0x1.592d26553a529p-309, 0x1.75c6ad9777c96p-304 },
+ { 0x1.074be65f60432p-309, 0x1.1d3d889242361p-304 },
+ { 0x1.91a14719373e5p-310, 0x1.b34c7bf3e0108p-305 },
+ { 0x1.3248b33f78dd9p-310, 0x1.4c1bf325b5886p-305 },
+ { 0x1.d316bfa6ecf07p-311, 0x1.fab351a6d7271p-306 },
+ { 0x1.641dc398561efp-311, 0x1.827d8b273a859p-306 },
+ { 0x1.0f79d08c027e2p-311, 0x1.26c35a8453a6ep-306 },
+ { 0x1.9ddabce45ff88p-312, 0x1.c18e854f7a653p-307 },
+ { 0x1.3b6a0443345f1p-312, 0x1.56c727238c10ep-307 },
+ { 0x1.e0b830517633fp-313, 0x1.05545196af9e3p-307 },
+ { 0x1.6e4903f595976p-313, 0x1.8e6b62ae03487p-308 },
+ { 0x1.170eca4e7a4cap-313, 0x1.2facf384d3a3bp-308 },
+ { 0x1.a92756c27d93ap-314, 0x1.ceddf1e753b81p-309 },
+ { 0x1.43d40bf74392dp-314, 0x1.60b61e0028436p-309 },
+ { 0x1.ed3e286c4c0dep-315, 0x1.0cbd09b1e5e1p-309 },
+ { 0x1.77993389df313p-315, 0x1.997719e8b73a8p-310 },
+ { 0x1.1dfa945eaae99p-315, 0x1.37e77cf85ca37p-310 },
+ { 0x1.b36ec5aa0588p-316, 0x1.db1e802a6c81fp-311 },
+ { 0x1.4b749e64b35f5p-316, 0x1.69d3aa6fccfd9p-311 },
+ { 0x1.f88d823260c9ep-317, 0x1.1383f4dd09079p-311 },
+ { 0x1.7ffa0f1fabb65p-317, 0x1.a388f33976b7bp-312 },
+ { 0x1.242e12375b352p-317, 0x1.3f613589599c6p-312 },
+ { 0x1.bc9a844ffd2b5p-318, 0x1.e635a66e3ebe7p-313 },
+ { 0x1.523af73f84783p-318, 0x1.720bfb4a981d7p-313 },
+ { 0x1.0146a610e0588p-318, 0x1.199a49bcc51p-313 },
+ { 0x1.87590d6d36008p-319, 0x1.ac8ae259e160cp-314 },
+ { 0x1.299b80ea6bb7fp-319, 0x1.4609b0c4183cap-314 },
+ { 0x1.c496292aa266bp-320, 0x1.f00af26520f9dp-315 },
+ { 0x1.5817f72c95e4cp-320, 0x1.794ce31e24c7bp-315 },
+ { 0x1.059392396d038p-320, 0x1.1ef2877dbfcadp-315 },
+ { 0x1.8da5a346cbb3fp-321, 0x1.b468dc95cb829p-316 },
+ { 0x1.2e36a9eb80d32p-321, 0x1.4bd213115ac94p-316 },
+ { 0x1.cb4fb203e18ap-322, 0x1.f88862b544527p-317 },
+ { 0x1.5cfe5be9615c7p-322, 0x1.7f861b04cbe3ap-317 },
+ { 0x1.0923c6394f695p-322, 0x1.2380a7a548a2fp-317 },
+ { 0x1.92d18166ccd51p-323, 0x1.bb1122f6e5762p-318 },
+ { 0x1.31f510cb3f507p-323, 0x1.50ad48dd9b3a6p-318 },
+ { 0x1.d0b7c794af438p-324, 0x1.ff9ab8e5d6631p-319 },
+ { 0x1.60e2f23228dedp-324, 0x1.84a97f6b3e853p-319 },
+ { 0x1.0bef1906dac58p-324, 0x1.273a4b16ba84fp-319 },
+ { 0x1.96d0ca88e4fcp-325, 0x1.c07484e1da469p-320 },
+ { 0x1.34ce1af3c1b6p-325, 0x1.549037ceef1fep-320 },
+ { 0x1.d4c1f7c67dd18p-326, 0x1.0298e0fc06037p-320 },
+ { 0x1.63bcc0600e3b1p-326, 0x1.88ab45875f419p-321 },
+ { 0x1.0def17046c37ep-326, 0x1.2a16e161fa35fp-321 },
+ { 0x1.999a40ba75f42p-327, 0x1.c48699c75f345p-322 },
+ { 0x1.36bb3093bcf7fp-327, 0x1.5771e906a9978p-322 },
+ { 0x1.d764e5657aa2p-328, 0x1.04a04a1699caap-322 },
+ { 0x1.658528dc53bd5p-328, 0x1.8b822865b44e6p-323 },
+ { 0x1.0f1f1acd583cp-328, 0x1.2c0fc98ac934cp-323 },
+ { 0x1.9b2768ee2e28p-329, 0x1.c73df0b6d4334p-324 },
+ { 0x1.37b7d60833afbp-329, 0x1.594bab8ddacb1p-324 },
+ { 0x1.d89a6c43f4c1p-330, 0x1.05dee05833b3cp-324 },
+ { 0x1.663803afd90e2p-330, 0x1.8d278c9cbfc58p-325 },
+ { 0x1.0f7c5f2e4265p-330, 0x1.2d206b997c2ccp-325 },
+ { 0x1.9b74a41343d69p-331, 0x1.c89434d36542fp-326 },
+ { 0x1.37c1bd3bb9cfep-331, 0x1.5a192e33cf627p-326 },
+ { 0x1.d85fb90bdf218p-332, 0x1.0651bc0c61b2p-326 },
+ { 0x1.65d3aea4b609ep-332, 0x1.8d9799e5f2521p-327 },
+ { 0x1.0f0609e7aa674p-332, 0x1.2d464a6b30dc2p-327 },
+ { 0x1.9a813d2878f74p-333, 0x1.c88645e6c88eep-328 },
+ { 0x1.36d8ce9d2217bp-333, 0x1.59d89052b0525p-328 },
+ { 0x1.d6b5543d3c94p-334, 0x1.05f7d07f3fb02p-328 },
+ { 0x1.645913a262a36p-334, 0x1.8cd14a1185c8dp-329 },
+ { 0x1.0dbd2f003b6a5p-334, 0x1.2c810d60e767ep-329 },
+ { 0x1.984f6bfe6778p-335, 0x1.c714448c370a6p-330 },
+ { 0x1.34ff297cd534dp-335, 0x1.588a691f2cd1fp-330 },
+ { 0x1.d39f201da2255p-336, 0x1.04d1f01416963p-330 },
+ { 0x1.61cba521cabb4p-336, 0x1.8ad66d03eba59p-331 },
+ { 0x1.0ba4cc94c45b3p-336, 0x1.2ad281b8cc2ap-331 },
+ { 0x1.94e44c9a075e7p-337, 0x1.c44191b160ec2p-332 },
+ { 0x1.32391bcecdc03p-337, 0x1.5631c55b5d22cp-332 },
+ { 0x1.cf2449a3fda4bp-338, 0x1.02e2c911c7929p-332 },
+ { 0x1.5e3150cc8eda4p-338, 0x1.87aba1a7120bfp-333 },
+ { 0x1.08c1bf3c985fap-338, 0x1.283e938a586f7p-333 },
+ { 0x1.9047cb663bb8cp-339, 0x1.c014c17012593p-334 },
+ { 0x1.2e8d117dfdd44p-339, 0x1.52d41b7968429p-334 },
+ { 0x1.c94f2cb2815a8p-340, 0x1.002edb3674f27p-334 },
+ { 0x1.599268900e7bcp-340, 0x1.835843f5f0b0cp-335 },
+ { 0x1.051aaf415041dp-340, 0x1.24cb3e8b7d756p-335 },
+ { 0x1.8a84869fc8267p-341, 0x1.ba9781881c8a9p-336 },
+ { 0x1.2a037bab743e1p-341, 0x1.4e79366e7a47p-336 },
+ { 0x1.c22d2c350e306p-342, 0x1.f978cc962d426p-337 },
+ { 0x1.53f982a03a248p-342, 0x1.7de65083f0e21p-337 },
+ { 0x1.00b7f70f68972p-342, 0x1.208076f18ea3p-337 },
+ { 0x1.83a7a5a0b9d4dp-343, 0x1.b3d6740403453p-338 },
+ { 0x1.24a6b05eb3edap-343, 0x1.492b17a8d9ad4p-338 },
+ { 0x1.b9ce7efad864cp-344, 0x1.f126a42ab2a64p-339 },
+ { 0x1.4d7351162fad8p-344, 0x1.77623e1a3ca2fp-339 },
+ { 0x1.f74706d1f613cp-345, 0x1.1b680aeae0c3cp-339 },
+ { 0x1.7bc0a6e57fbc5p-345, 0x1.abe0fed214bcap-340 },
+ { 0x1.1e82c35430e3dp-345, 0x1.42f5d0cb0afebp-340 },
+ { 0x1.b045f25c98b4bp-346, 0x1.e77a20528f8f5p-341 },
+ { 0x1.460e7202036c7p-346, 0x1.6fdace394b03cp-341 },
+ { 0x1.ebd15c07c2acdp-347, 0x1.158d7d54f1681p-341 },
+ { 0x1.72e125d540295p-347, 0x1.a2c9115542385p-342 },
+ { 0x1.17a558b9c184fp-347, 0x1.3be755f8b210cp-342 },
+ { 0x1.a5a8a3f3de092p-348, 0x1.dc88f077bd369p-343 },
+ { 0x1.3ddb38ecb5b52p-348, 0x1.6760d57bb9982p-343 },
+ { 0x1.df2826b036578p-349, 0x1.0efdda755dbb3p-343 },
+ { 0x1.691c997f37f0ep-349, 0x1.98a2e123c782ep-344 },
+ { 0x1.101d72c627ff7p-349, 0x1.340f49a72211p-344 },
+ { 0x1.9a0db3d2b8dacp-350, 0x1.d06b3f65f6fdp-345 },
+ { 0x1.34eb72e63e592p-350, 0x1.5e06fcff790f4p-345 },
+ { 0x1.d166c8f34fca4p-351, 0x1.07c787991a68p-345 },
+ { 0x1.5e880d9f1fe43p-351, 0x1.8d849f54265f7p-346 },
+ { 0x1.07fb3b2ff1602p-351, 0x1.2b7ec30262d2bp-346 },
+ { 0x1.8d8df0cbffd52p-352, 0x1.c33b5a8ad639fp-347 },
+ { 0x1.2b52265317648p-352, 0x1.53e17e1a8afadp-347 },
+ { 0x1.c2aa6bd34f17bp-353, 0x1.fff41d2913dabp-348 },
+ { 0x1.5339d751ff2a1p-353, 0x1.818627da2e9e4p-348 },
+ { 0x1.fe9f93308c405p-354, 0x1.2248100f21115p-348 },
+ { 0x1.80438073219dep-354, 0x1.b515531d535ebp-349 },
+ { 0x1.21234fbc4a127p-354, 0x1.4905d9b84e0cbp-349 },
+ { 0x1.b31198aa5f8abp-355, 0x1.ef4bcc5f71a72p-350 },
+ { 0x1.474946f304456p-355, 0x1.74c0ac8d03b2bp-350 },
+ { 0x1.ec59d00f3fe38p-356, 0x1.187e74c209a91p-350 },
+ { 0x1.7249848679fa9p-356, 0x1.a6169b09c4411p-351 },
+ { 0x1.16739cec78bd4p-356, 0x1.3d8a8ccb26cd9p-351 },
+ { 0x1.a2bbd0795adeep-357, 0x1.ddb87127c2076p-352 },
+ { 0x1.3ace589cd3352p-357, 0x1.674e5d7be735cp-352 },
+ { 0x1.d949ad392f075p-358, 0x1.0e35e84d33d3fp-352 },
+ { 0x1.63bbbf78651ccp-358, 0x1.965d9f895d99cp-353 },
+ { 0x1.0b5827a3ba382p-358, 0x1.3186c3440696p-353 },
+ { 0x1.91c922f9ee4cp-359, 0x1.cb5d51a48d7d4p-354 },
+ { 0x1.2de164c74e725p-359, 0x1.594a1039f0199p-354 },
+ { 0x1.c5941f108d9d1p-360, 0x1.0382d1e479246p-354 },
+ { 0x1.54b639c219649p-360, 0x1.8609634a384ccp-355 },
+ { 0x1.ffcc62473097ap-361, 0x1.25120afe02122p-355 },
+ { 0x1.8059c757355aep-361, 0x1.b85e31314f4b4p-356 },
+ { 0x1.209ad26ca18d9p-361, 0x1.4acee7c0fcbafp-356 },
+ { 0x1.b15e18d0d2d12p-362, 0x1.f0f38c6449ad9p-357 },
+ { 0x1.4554e9983b016p-362, 0x1.753919ff4b182p-357 },
+ { 0x1.e865bf893f8f4p-363, 0x1.1844080030d76p-357 },
+ { 0x1.6e8db855aac9ap-363, 0x1.a4dede3a3eb93p-358 },
+ { 0x1.1312cc0ae5d04p-363, 0x1.3bf7fe7aa33ap-358 },
+ { 0x1.9ccc1bfbf7ecbp-364, 0x1.da5e8d4d639edp-359 },
+ { 0x1.35b35e7d0088ep-364, 0x1.640bc7176cda7p-359 },
+ { 0x1.d0a5ff60b92cfp-365, 0x1.0b342b640cc13p-359 },
+ { 0x1.5c84558f35d95p-365, 0x1.9102c47629cb9p-360 },
+ { 0x1.0560f8bafb2c7p-365, 0x1.2ce013e375d0fp-360 },
+ { 0x1.8801ce509ea26p-366, 0x1.c36f07720a932p-361 },
+ { 0x1.25ec7207b3c64p-366, 0x1.529fe13854ed9p-361 },
+ { 0x1.b8b58f7c67c36p-367, 0x1.fbf2dc269c35dp-362 },
+ { 0x1.4a5c0b3b7424dp-367, 0x1.7cec854a40ddcp-362 },
+ { 0x1.ef3874e46141bp-368, 0x1.1da13f1aaaee6p-362 },
+ { 0x1.732197e24d857p-368, 0x1.ac4c46230c45cp-363 },
+ { 0x1.1619ff0ea7ec6p-368, 0x1.4112fbeff8a1fp-363 },
+ { 0x1.a0bb46a0a2c53p-369, 0x1.e15420dda8758p-364 },
+ { 0x1.383201c8ba71ap-369, 0x1.68bd97eb5b05dp-364 },
+ { 0x1.d3b4e4b894768p-370, 0x1.0e54a78756b6bp-364 },
+ { 0x1.5e4c4aaef013p-370, 0x1.951c14f527745p-365 },
+ { 0x1.0654a030d3e7p-370, 0x1.2f8178dd14a04p-365 },
+ { 0x1.88dc03d1ca801p-371, 0x1.c6b6bf9361ee4p-366 },
+ { 0x1.2621d65152a67p-371, 0x1.5495f2949c65ep-366 },
+ { 0x1.b860981f4834ap-372, 0x1.fe24891c8ca0cp-367 },
+ { 0x1.49a0d4c97c281p-372, 0x1.7e02609a87253p-367 },
+ { 0x1.ed66ed1143993p-373, 0x1.1e064158c947bp-367 },
+ { 0x1.713a5a10cc9bp-373, 0x1.ac4304f253262p-368 },
+ { 0x1.14455cbbff469p-373, 0x1.4093bdea6e36fp-368 },
+ { 0x1.9d62205df47a6p-374, 0x1.dfe14a435c3c2p-369 },
+ { 0x1.353bfdeb15aa4p-374, 0x1.6720e3d624fdcp-369 },
+ { 0x1.ce97f23783a55p-375, 0x1.0cba8970a9d66p-369 },
+ { 0x1.59f649793ea9ap-375, 0x1.921e961b81171p-370 },
+ { 0x1.02b46c188f22dp-375, 0x1.2cd3135c626d1p-370 },
+ { 0x1.82dcfdba2d59cp-376, 0x1.c2097f7f7c953p-371 },
+ { 0x1.213830f44d648p-376, 0x1.5096e15b063dbp-371 },
+ { 0x1.b0639acae41c7p-377, 0x1.f76b39886a20dp-372 },
+ { 0x1.432d063e4cc5ap-377, 0x1.786c2636e4e2ap-372 },
+ { 0x1.e3096b161ade1p-378, 0x1.196dc712e8651p-372 },
+ { 0x1.68f1646f450ccp-378, 0x1.a4c39680abb0bp-373 },
+ { 0x1.0dad51a121c5fp-378, 0x1.3a80eb1934625p-373 },
+ { 0x1.92ed52465cf13p-379, 0x1.d6196b3830612p-374 },
+ { 0x1.2cf8cdb32b26dp-379, 0x1.5f4b3b930a91ap-374 },
+ { 0x1.c1934bb7035c1p-380, 0x1.067b3db09279ep-374 },
+ { 0x1.4fbc11c19c0b7p-380, 0x1.8832413bcb6f5p-375 },
+ { 0x1.f5613cdc1ad52p-381, 0x1.24f8b72bbd6eep-375 },
+ { 0x1.76547ab0f816ap-381, 0x1.b5a5bcacf14ddp-376 },
+ { 0x1.1770c93ef3136p-381, 0x1.46d8046ba690cp-376 },
+ { 0x1.a128a30d837ebp-382, 0x1.e8209bd7c6d4dp-377 },
+ { 0x1.375630e92b79p-382, 0x1.6c744b66f6406p-377 },
+ { 0x1.d0a93cd8add1ep-383, 0x1.1015024fefc8dp-377 },
+ { 0x1.5ab4549d6cf15p-383, 0x1.9631ba1694964p-378 },
+ { 0x1.02a8fed4a1944p-383, 0x1.2f2b3b1ae197dp-378 },
+ { 0x1.81e6d5efc2ecep-384, 0x1.c47e5b8f9de0cp-379 },
+ { 0x1.1fd54f3e20bfcp-384, 0x1.51a481761d265p-379 },
+ { 0x1.ad523512d80aep-385, 0x1.f7d2ff106229cp-380 },
+ { 0x1.4023f854f9c86p-385, 0x1.77da522f79ec5p-380 },
+ { 0x1.dd649c8fad0d5p-386, 0x1.185a192bd02b4p-380 },
+ { 0x1.63e684c4d4572p-386, 0x1.a22ed5ef67f83p-381 },
+ { 0x1.094b5ecc6e29p-386, 0x1.37d9a85948033p-381 },
+ { 0x1.8b7643330549ep-387, 0x1.d10da89b8212ap-382 },
+ { 0x1.26b65f14cd4dap-387, 0x1.5ab7d4224f7e2p-382 },
+ { 0x1.b734f53e57228p-388, 0x1.0276587fa1c2p-382 },
+ { 0x1.473b9d1931175p-388, 0x1.814bdb918424dp-383 },
+ { 0x1.e78d8c6e84fddp-389, 0x1.1f2684f2af658p-383 },
+ { 0x1.6b2a2c93cd65ap-389, 0x1.abf540fb4e1a1p-384 },
+ { 0x1.0e7a7b055d281p-389, 0x1.3eddfeeed0dd2p-384 },
+ { 0x1.92d87cacce695p-390, 0x1.db1c82f79707dp-385 },
+ { 0x1.2bf57b6e0d98dp-390, 0x1.61ea0b7eb4c3cp-385 },
+ { 0x1.bea4f9488e121p-391, 0x1.0799f1fb897d8p-385 },
+ { 0x1.4c7d8bf7bdc41p-391, 0x1.889f21fdb1d69p-386 },
+ { 0x1.eef6b8bfa9225p-392, 0x1.245c20ba28a39p-386 },
+ { 0x1.705ed2bbfd521p-392, 0x1.b3598a0d5984p-387 },
+ { 0x1.121f1b69882ebp-392, 0x1.4418fde75923ep-387 },
+ { 0x1.97ec608197c79p-393, 0x1.e27e05b6c31f9p-388 },
+ { 0x1.2f7b0edc74f1cp-393, 0x1.671af7f5d8858p-388 },
+ { 0x1.c380c41f7503p-394, 0x1.0b3d4442eda68p-388 },
+ { 0x1.4fd20f15083b3p-394, 0x1.8db341e4d4306p-389 },
+ { 0x1.f37ea8d01e9c5p-395, 0x1.27e37e3bc73c9p-389 },
+ { 0x1.736cebb19a201p-395, 0x1.b83a639f29a8p-390 },
+ { 0x1.1428c012e2c57p-395, 0x1.47730acf38edcp-390 },
+ { 0x1.9a9ae80c06018p-396, 0x1.e710d5155d028p-391 },
+ { 0x1.31371c2b63b8p-396, 0x1.6a331ab64b688p-391 },
+ { 0x1.c5b240b14f4d6p-397, 0x1.0d4fd25f7f52ep-391 },
+ { 0x1.5129ffd17a136p-397, 0x1.90712f4e38e37p-392 },
+ { 0x1.f510ba62354a5p-398, 0x1.29ac951c1e60bp-392 },
+ { 0x1.74468acd1611cp-398, 0x1.ba819d5f14678p-393 },
+ { 0x1.148e1d96c299ep-398, 0x1.48dce2dc3ecd5p-393 },
+ { 0x1.9ad7d58aaba44p-399, 0x1.e8c0193d16d55p-394 },
+ { 0x1.3121b71d77179p-399, 0x1.6b2456938b866p-394 },
+ { 0x1.c52f68dd90e64p-400, 0x1.0dc826696c76cp-394 },
+ { 0x1.507f397188496p-400, 0x1.90cc63cdbf2a2p-395 },
+ { 0x1.f3a5bdf92c388p-401, 0x1.29af3c144f8cp-395 },
+ { 0x1.72e7cbdbb95dbp-401, 0x1.ba24cc0f4c8e2p-396 },
+ { 0x1.134d638b07143p-401, 0x1.48500e815d897p-396 },
+ { 0x1.98a2111174d79p-402, 0x1.e7841c45926dp-397 },
+ { 0x1.2f3b409e1b7b6p-402, 0x1.69ea5b1b71301p-397 },
+ { 0x1.c1fa91a869695p-403, 0x1.0ca4195cda6d3p-397 },
+ { 0x1.4dd4c7d7ec9fap-403, 0x1.8ec33daf13649p-398 },
+ { 0x1.ef442d8796795p-404, 0x1.27eb66fea5e85p-398 },
+ { 0x1.6f56f0c0f22b9p-404, 0x1.b72598c77c448p-399 },
+ { 0x1.106c4a594a047p-404, 0x1.45cf12a60cb9ap-399 },
+ { 0x1.9403b0e4bd1b9p-405, 0x1.e36284e81b5ffp-400 },
+ { 0x1.2b8c63e7468c1p-405, 0x1.668ac570f2fc8p-400 },
+ { 0x1.bc22598793379p-406, 0x1.09e8e37ef2488p-400 },
+ { 0x1.4936d06178106p-406, 0x1.8a5f0c63b5c24p-401 },
+ { 0x1.e7fffb3b16a7dp-407, 0x1.2469273320bdap-401 },
+ { 0x1.69a431ed205ap-407, 0x1.b191b44e70edfp-402 },
+ { 0x1.0bf7e7cce4d07p-407, 0x1.41655d7606103p-402 },
+ { 0x1.8d11ace4d8996p-408, 0x1.dc6e2b76185d5p-403 },
+ { 0x1.2625d4b960a47p-408, 0x1.6114f58eab906p-403 },
+ { 0x1.b3c139841a735p-409, 0x1.05a2f4a403a4dp-403 },
+ { 0x1.42ba35d81be5cp-409, 0x1.83b3c9af7ee45p-404 },
+ { 0x1.ddf9fa6fc513ap-410, 0x1.1f386e3013e68p-404 },
+ { 0x1.61e943a26f542p-410, 0x1.a9826f127d04dp-405 },
+ { 0x1.06044c28d2704p-410, 0x1.3b26ef9596f74p-405 },
+ { 0x1.83eb403668f94p-411, 0x1.d2c68adc24dd3p-406 },
+ { 0x1.1f1fd15ed30fep-411, 0x1.59a199b7c8167p-406 },
+ { 0x1.a8fcbdc7eab51p-412, 0x1.ffcb2bfa5b8dap-407 },
+ { 0x1.3a7bfb4be9962p-412, 0x1.7adf828472cfdp-407 },
+ { 0x1.d15ee90987618p-413, 0x1.1870951a86a79p-407 },
+ { 0x1.584895194492p-413, 0x1.9f1bfa110cbbap-408 },
+ { 0x1.fd57d7b45b3cap-414, 0x1.332fc55367264p-408 },
+ { 0x1.78b8ffae32bfp-414, 0x1.c696d39db75f3p-409 },
+ { 0x1.16996dab0cd1ep-414, 0x1.5051f4ea04fdfp-409 },
+ { 0x1.9c046dcaa75a4p-415, 0x1.f194b2a4cb97p-410 },
+ { 0x1.30a06c462f23ep-415, 0x1.700975cbb46aap-410 },
+ { 0x1.c2662350ce7fap-416, 0x1.102fae0ec7794p-410 },
+ { 0x1.4cec5169fb931p-416, 0x1.928c588cfb6d9p-411 },
+ { 0x1.ec1db7d8e44b5p-417, 0x1.29a3060c44f3ap-411 },
+ { 0x1.6babae8929706p-417, 0x1.b814aa869e0e4p-412 },
+ { 0x1.0cb7ae5506e7ep-417, 0x1.454ee7edd0063p-412 },
+ { 0x1.8d106f7f4047ep-418, 0x1.e0e0b72e6ef2ep-413 },
+ { 0x1.255213192c405p-418, 0x1.6360f251c2f1fp-413 },
+ { 0x1.b1500fc71b69ap-419, 0x1.0699a6631f93fp-413 },
+ { 0x1.40052c8ba04b4p-419, 0x1.840a0d97bb129p-414 },
+ { 0x1.d8a3d24511c07p-420, 0x1.1eaa023d58a69p-414 },
+ { 0x1.5cfadd7b9716p-420, 0x1.a77ea01d8b821p-415 },
+ { 0x1.01a47ddad3ea8p-420, 0x1.38c7c7057a652p-415 },
+ { 0x1.7c5ff3799c35bp-421, 0x1.cdf6c504a93e5p-416 },
+ { 0x1.18c087e86a1f3p-421, 0x1.551bff88c1175p-416 },
+ { 0x1.9e64530b957f4p-422, 0x1.f7ae8590bb8p-417 },
+ { 0x1.31c908986e1a8p-422, 0x1.73d293026bc2ap-417 },
+ { 0x1.c33b25da2082ep-423, 0x1.12730a9790f69p-417 },
+ { 0x1.4ce362055227ep-423, 0x1.951a7082f394ap-418 },
+ { 0x1.eb1b0ae0a386ap-424, 0x1.2af1081b22794p-418 },
+ { 0x1.6a3779e1ff3bp-424, 0x1.b925bc48353ep-419 },
+ { 0x1.0b1f245435eeap-424, 0x1.4575deb5305a2p-419 },
+ { 0x1.89efddb97fd18p-425, 0x1.e029ff0fc8645p-420 },
+ { 0x1.227180cb0a8cap-425, 0x1.6228a92a17423p-420 },
+ { 0x1.ac39e8a7de062p-426, 0x1.05302bb5e3a1ap-420 },
+ { 0x1.3ba5b5279aa24p-426, 0x1.81331d3a2cc81p-421 },
+ { 0x1.d145ea8ff6403p-427, 0x1.1c02d69097c72p-421 },
+ { 0x1.56df011e743b9p-427, 0x1.a2c1b0ae83a64p-422 },
+ { 0x1.f94750d0f9308p-428, 0x1.34ad734ae6135p-422 },
+ { 0x1.7442e7172840ap-428, 0x1.c703bfdc748cdp-423 },
+ { 0x1.123a683e9b9d5p-428, 0x1.4f5290291de6ep-423 },
+ { 0x1.93f94a8e393e5p-429, 0x1.ee2bb5a2a447p-424 },
+ { 0x1.298449094a08p-429, 0x1.6c16f34d9525ep-424 },
+ { 0x1.b62c8f87855a8p-430, 0x1.0c379a70923bcp-424 },
+ { 0x1.42a02f59d51efp-430, 0x1.8b21b8919710fp-425 },
+ { 0x1.db09bb0ffb21fp-431, 0x1.2303a1b68b2dep-425 },
+ { 0x1.5daee76f997a8p-431, 0x1.ac9c706a79cfcp-426 },
+ { 0x1.01604a662bf4cp-431, 0x1.3b983b3f72fb5p-426 },
+ { 0x1.7ad33d50dacdp-432, 0x1.d0b33fd9b6e85p-427 },
+ { 0x1.16c1e4c8c451ap-432, 0x1.5615904c6373ap-427 },
+ { 0x1.9a32159dea0d8p-433, 0x1.f7950165d693dp-428 },
+ { 0x1.2dc48781056c9p-433, 0x1.729dc070c926ap-428 },
+ { 0x1.bbf2871addffbp-434, 0x1.10b9b38c6e833p-428 },
+ { 0x1.4684a4152d4ep-434, 0x1.9154f9f73ee5fp-429 },
+ { 0x1.e03df4eb2c204p-435, 0x1.27418ebfd96bep-429 },
+ { 0x1.6120558a89b12p-435, 0x1.b26192fa2f36ep-430 },
+ { 0x1.03a014bcb5352p-435, 0x1.3f7df7d25b3e6p-430 },
+ { 0x1.7db773a6f6623p-436, 0x1.d5ec232ba3385p-431 },
+ { 0x1.1893b9023690dp-436, 0x1.598c75ff21ea4p-431 },
+ { 0x1.9c6ba6a49465ap-437, 0x1.fc1f9e46a53e2p-432 },
+ { 0x1.2f125d64e7642p-437, 0x1.758c452444076p-432 },
+ { 0x1.bd607b51aff83p-438, 0x1.1294b791c6529p-432 },
+ { 0x1.4735d5e25dd32p-438, 0x1.939e692035be7p-433 },
+ { 0x1.e0bb7795ebab2p-439, 0x1.289cc9b3b4107p-433 },
+ { 0x1.611962fb4b008p-439, 0x1.b3e5c199dc217p-434 },
+ { 0x1.035217aa6e0adp-439, 0x1.40415be2c6028p-434 },
+ { 0x1.7cd9c096da3b3p-440, 0x1.d6871e2c76342p-435 },
+ { 0x1.17a22cd2a508fp-440, 0x1.599d2a64857abp-435 },
+ { 0x1.9a95351e8c9f1p-441, 0x1.fba952efabe51p-436 },
+ { 0x1.2d63f329a8bcbp-441, 0x1.74cc660d4897ap-436 },
+ { 0x1.ba6ba0cb47e2bp-442, 0x1.11baa6a990cd8p-436 },
+ { 0x1.44ae89d144108p-442, 0x1.91ecc31adec4ep-437 },
+ { 0x1.dc7e8d1b8f556p-443, 0x1.270b14a1f9816p-437 },
+ { 0x1.5d9a42222275cp-443, 0x1.b11d883fd3ec1p-438 },
+ { 0x1.00789e350bd1ap-443, 0x1.3ddca348b8e79p-438 },
+ { 0x1.7840aaba80c98p-444, 0x1.d27f9dd765764p-439 },
+ { 0x1.13f45ccd8c935p-444, 0x1.56472f42babf3p-439 },
+ { 0x1.94bc9a9955f26p-445, 0x1.f6359d3980ea5p-440 },
+ { 0x1.28c5f3eaf8eddp-445, 0x1.7063ccd1b83c6p-440 },
+ { 0x1.b32a3c3e46a35p-446, 0x1.0e31f012ad2b3p-440 },
+ { 0x1.3f01c91fe7f47p-446, 0x1.8c4cd2c02ec2dp-441 },
+ { 0x1.d3a718c61d154p-447, 0x1.2298481c2ca0dp-441 },
+ { 0x1.56bd3dd5a05c1p-447, 0x1.aa1de55237abcp-442 },
+ { 0x1.f65222fadfcp-448, 0x1.3861db33230bp-442 },
+ { 0x1.700eb717cfb77p-448, 0x1.c9f401331dbf6p-443 },
+ { 0x1.0da5e12700c8dp-448, 0x1.4fa3a533642f6p-443 },
+ { 0x1.8b0da54d3c71fp-449, 0x1.ebed8656f1a7bp-444 },
+ { 0x1.215aeed941b43p-449, 0x1.6873a105b43c2p-444 },
+ { 0x1.a7d28bd609e5p-450, 0x1.081521636047p-444 },
+ { 0x1.3659f3261d19p-450, 0x1.82e8d038330cap-445 },
+ { 0x1.c6770887b13f6p-451, 0x1.1b65bea6b7e6ap-445 },
+ { 0x1.4cb570f463d9dp-451, 0x1.9f1b427ce89a2p-446 },
+ { 0x1.e715dafe5cd6p-452, 0x1.2ff9fffd4f5f9p-446 },
+ { 0x1.6480ba9b1723cp-452, 0x1.bd241d06b6757p-447 },
+ { 0x1.04e575dd6f2ebp-452, 0x1.45e411382662bp-447 },
+ { 0x1.7dcff6d521467p-453, 0x1.dd1da1bc7ec85p-448 },
+ { 0x1.1759a98201ff3p-453, 0x1.5d36e9f7af39cp-448 },
+ { 0x1.98b82586ccf2dp-454, 0x1.ff233639de02ap-449 },
+ { 0x1.2af6afc0ce651p-454, 0x1.7606528b3cf28p-449 },
+ { 0x1.b54f244df93dfp-455, 0x1.11a8b54a30c34p-449 },
+ { 0x1.3fcc4e4385b18p-455, 0x1.9066e8a3084adp-450 },
+ { 0x1.d3abb2d5b9282p-456, 0x1.24e2ffedd9f78p-450 },
+ { 0x1.55eaec016b2b5p-456, 0x1.ac6e23cde6ac9p-451 },
+ { 0x1.f3e576e5bfb2cp-457, 0x1.394ff72563c26p-451 },
+ { 0x1.6d6394041cb01p-457, 0x1.ca3259bb8013ep-452 },
+ { 0x1.0b0a8012d71fbp-457, 0x1.4effb58fcce2p-452 },
+ { 0x1.8647f7f3a91dep-458, 0x1.e9cac23b8427ep-453 },
+ { 0x1.1d29e5c60946bp-458, 0x1.6602f707600f3p-453 },
+ { 0x1.a0aa72640fd47p-459, 0x1.05a7bd790a4bcp-453 },
+ { 0x1.305e23384e58ap-459, 0x1.7e6b1b23c38f4p-454 },
+ { 0x1.bc9e08de1532fp-460, 0x1.176cc55ca9b8p-454 },
+ { 0x1.44b4e89c6a35fp-460, 0x1.984a277e8539ap-455 },
+ { 0x1.da366d9d2b975p-461, 0x1.2a417253e014bp-455 },
+ { 0x1.5a3c60cb2c6b1p-461, 0x1.b3b2c9b4277c6p-456 },
+ { 0x1.f98800fc076dbp-462, 0x1.3e333559670c8p-456 },
+ { 0x1.71033226bf0afp-462, 0x1.d0b8591b88278p-457 },
+ { 0x1.0d53e944a7e18p-462, 0x1.534ff7f271b4dp-457 },
+ { 0x1.89187f3d75a14p-463, 0x1.ef6ed82d51675p-458 },
+ { 0x1.1ed5d0deddfb7p-463, 0x1.69a61d0edc9d2p-458 },
+ { 0x1.a28be72757b85p-464, 0x1.07f57aca805f1p-458 },
+ { 0x1.3154ef266983dp-464, 0x1.814481a9f253cp-459 },
+ { 0x1.bd6d859990532p-465, 0x1.1921067277b5dp-459 },
+ { 0x1.44dcd404b4fcdp-465, 0x1.9a3a7d2712f82p-460 },
+ { 0x1.d9cdf2aadd6a6p-466, 0x1.2b45137355f77p-460 },
+ { 0x1.5979672b76b96p-466, 0x1.b497e1657b91bp-461 },
+ { 0x1.f7be424410479p-467, 0x1.3e6cfcc06ed27p-461 },
+ { 0x1.6f36e7903ba4fp-467, 0x1.d06cfa865bc4ep-462 },
+ { 0x1.0ba8019bd4e86p-467, 0x1.52a47395ed2aep-462 },
+ { 0x1.8621eaa755f34p-468, 0x1.edca8e605e67ap-463 },
+ { 0x1.1c4a9efdce654p-468, 0x1.67f77ef705254p-463 },
+ { 0x1.9e475b5aaea97p-469, 0x1.0660edcde1e02p-463 },
+ { 0x1.2dd03980220acp-469, 0x1.7e727aec99554p-464 },
+ { 0x1.b7b478b8fda1cp-470, 0x1.16b24c391593bp-464 },
+ { 0x1.40424c4fd21f7p-470, 0x1.96221780dfe95p-465 },
+ { 0x1.d276d459f43c7p-471, 0x1.27e2788696d86p-465 },
+ { 0x1.53aa8c500f5dp-471, 0x1.af1357749947cp-466 },
+ { 0x1.ee9c5073f397ep-472, 0x1.39fac2bf7a531p-466 },
+ { 0x1.6812e6a2e8fcp-472, 0x1.c9538eaa71fbp-467 },
+ { 0x1.06198ecffc0ep-472, 0x1.4d04b3a802aeep-467 },
+ { 0x1.7d857ef6fe55ap-473, 0x1.e4f0604536408p-468 },
+ { 0x1.15a4dc243cc5fp-473, 0x1.610a0b4ec8401p-468 },
+ { 0x1.940cad97ee071p-474, 0x1.00fbde3ac71c6p-468 },
+ { 0x1.25f772e00c70ap-474, 0x1.7614bf61d6bfap-469 },
+ { 0x1.abb2fd3f529efp-475, 0x1.103beefa0765p-469 },
+ { 0x1.3718d87e8a0afp-475, 0x1.8c2ef94786008p-470 },
+ { 0x1.c48328a4346ebp-476, 0x1.203fa39242793p-470 },
+ { 0x1.4910b37b4de72p-476, 0x1.a36313f8e64ecp-471 },
+ { 0x1.de8817c6f33b9p-477, 0x1.310e5f6fbfd44p-471 },
+ { 0x1.5be6c950a7e6fp-477, 0x1.bbbb999bb060ap-472 },
+ { 0x1.f9ccdcf7c94fep-478, 0x1.42afa66f9fdc1p-472 },
+ { 0x1.6fa2fc442a9d3p-478, 0x1.d54340d9c375dp-473 },
+ { 0x1.0b2e58cb15f5cp-478, 0x1.552b1ae6aeaa2p-473 },
+ { 0x1.844d490056942p-479, 0x1.f004e9f45a94bp-474 },
+ { 0x1.1a217943b9ac7p-479, 0x1.68887b7750462p-474 },
+ { 0x1.99edc3fa555f4p-480, 0x1.0605cdc8a1e5ep-474 },
+ { 0x1.29c58e31af831p-480, 0x1.7ccfa0b55e3f7p-475 },
+ { 0x1.b08c96a2d341cp-481, 0x1.14b13fa04509fp-475 },
+ { 0x1.3a2063aa9bfc9p-481, 0x1.92087a96ea8f4p-476 },
+ { 0x1.c831fc61280f7p-482, 0x1.240a6edc95f53p-476 },
+ { 0x1.4b37d15842e1dp-482, 0x1.a83b0db0fa5b6p-477 },
+ { 0x1.e0e63f582488bp-483, 0x1.34170d65d2fe5p-477 },
+ { 0x1.5d11b81c3fea7p-483, 0x1.bf6f703f6c8b1p-478 },
+ { 0x1.fab1b4f400c2ep-484, 0x1.44dcd884a52dcp-478 },
+ { 0x1.6fb3ff8ccf41cp-484, 0x1.d7adc6f76430fp-479 },
+ { 0x1.0ace5d20891a2p-484, 0x1.5661968fc8c68p-479 },
+ { 0x1.8324934a763f4p-485, 0x1.f0fe41a3b588bp-480 },
+ { 0x1.18d7d8058e531p-485, 0x1.68ab147365bffp-480 },
+ { 0x1.9769602e7d2c4p-486, 0x1.05b48bc57ed71p-480 },
+ { 0x1.27797b62a04a4p-486, 0x1.7bbf2311e9661p-481 },
+ { 0x1.ac8851524d431p-487, 0x1.137b41cf9c9a4p-481 },
+ { 0x1.36b7751d5da7fp-487, 0x1.8fa3947e525d9p-482 },
+ { 0x1.c2874cefea298p-488, 0x1.21d7603b6e2ccp-482 },
+ { 0x1.4695ee8470b66p-488, 0x1.a45e3910021acp-483 },
+ { 0x1.d96c311be3eb3p-489, 0x1.30cd0207d04edp-483 },
+ { 0x1.571909f179506p-489, 0x1.b9f4dc504a668p-484 },
+ { 0x1.f13cd05945d89p-490, 0x1.40603dadb780ap-484 },
+ { 0x1.6844e0504f766p-490, 0x1.d06d41c212c13p-485 },
+ { 0x1.04ff770417c7ep-490, 0x1.509522cc01f2fp-485 },
+ { 0x1.7a1d7e8c27e5p-491, 0x1.e7cd2184183ebp-486 },
+ { 0x1.11dc1d57f7df8p-491, 0x1.616fb7b910c11p-486 },
+ { 0x1.8ca6e2e342651p-492, 0x1.000d1267395e3p-486 },
+ { 0x1.1f372812d1e14p-492, 0x1.72f3f6faafe57p-487 },
+ { 0x1.9fe4fa21e8c98p-493, 0x1.0cacf12619fe1p-487 },
+ { 0x1.2d1356c845fd1p-493, 0x1.8525cca4f244dp-488 },
+ { 0x1.b3db9cc5a58f3p-494, 0x1.19c8ed29100e2p-488 },
+ { 0x1.3b7359a6b9391p-494, 0x1.980913a0c5f1ep-489 },
+ { 0x1.c88e8c09b9bb2p-495, 0x1.2763b979d57b5p-489 },
+ { 0x1.4a59cf5958098p-495, 0x1.aba192db244fdp-490 },
+ { 0x1.de016eddfacadp-496, 0x1.357ff9fbc97f4p-490 },
+ { 0x1.59c942db45eaep-496, 0x1.bff2fa5de1e9dp-491 },
+ { 0x1.f437cec9632b8p-497, 0x1.44204156d00fcp-491 },
+ { 0x1.69c4293cefa3fp-497, 0x1.d500e0534289dp-492 },
+ { 0x1.059a8a5ce0ce7p-497, 0x1.53470ed39dd97p-492 },
+ { 0x1.7a4cdf5c8de47p-498, 0x1.eacebdf5973c2p-493 },
+ { 0x1.117e42e10afc5p-498, 0x1.62f6cc2a62dbdp-493 },
+ { 0x1.8b65a792fe14p-499, 0x1.00aff63626acfp-493 },
+ { 0x1.1dc89fe4a5f8ap-499, 0x1.7331cb44dd6ecp-494 },
+ { 0x1.9d10a7562f377p-500, 0x1.0c5bd0cbfba3p-494 },
+ { 0x1.2a7b1b1593291p-500, 0x1.83fa43f4f73d5p-495 },
+ { 0x1.af4fe4d278bf9p-501, 0x1.186c76677c8f7p-495 },
+ { 0x1.37971726a776ep-501, 0x1.955251a12574cp-496 },
+ { 0x1.c225447c48b85p-502, 0x1.24e359c6528bbp-496 },
+ { 0x1.451dde15504ecp-502, 0x1.a73bf0e7dcf7bp-497 },
+ { 0x1.d592869bae136p-503, 0x1.31c1d70a5a26cp-497 },
+ { 0x1.53109f6b70a02p-503, 0x1.b9b8fd3b82acep-498 },
+ { 0x1.e99944d35a898p-504, 0x1.3f09320694d4p-498 },
+ { 0x1.61706e7ea0b42p-504, 0x1.cccb2e7856e93p-499 },
+ { 0x1.fe3aefa4cdaa2p-505, 0x1.4cba948866255p-499 },
+ { 0x1.703e40ae0b133p-505, 0x1.e0741675f15a5p-500 },
+ { 0x1.09bc65f9b8064p-505, 0x1.5ad70c9e433d4p-500 },
+ { 0x1.7f7aeba02f7efp-506, 0x1.f4b51e95f89d5p-501 },
+ { 0x1.14a9f8443d058p-506, 0x1.695f8add0a062p-501 },
+ { 0x1.8f272381e3222p-507, 0x1.04c7c2a8ead79p-501 },
+ { 0x1.1fe6a1ccca721p-507, 0x1.7854e0a5444cfp-502 },
+ { 0x1.9f437947f2743p-508, 0x1.0f822de49bc54p-502 },
+ { 0x1.2b72bc2a1bb29p-508, 0x1.87b7be69a8c26p-503 },
+ { 0x1.afd058f4d5cb9p-509, 0x1.1a8a41a9a734p-503 },
+ { 0x1.374e8637e822fp-509, 0x1.9788b1f83908ep-504 },
+ { 0x1.c0ce07e3f5247p-510, 0x1.25e0558a5c077p-504 },
+ { 0x1.437a22e46ffc9p-510, 0x1.a7c824c7683f1p-505 },
+ { 0x1.d23ca31c0220cp-511, 0x1.3184a6ce13b46p-505 },
+ { 0x1.4ff5980398e02p-511, 0x1.b8765a48c0cf1p-506 },
+ { 0x1.e41c1da9f8a5fp-512, 0x1.3d775743f06aep-506 },
+ { 0x1.5cc0cd28b81e5p-512, 0x1.c9936e428a9d9p-507 },
+ { 0x1.f66c3f065ea05p-513, 0x1.49b86c1b194cep-507 },
+ { 0x1.69db8a882e29p-513, 0x1.db1f5331fbe71p-508 },
+ { 0x1.049650c331274p-513, 0x1.5647ccc18e717p-508 },
+ { 0x1.774577e1faf4fp-514, 0x1.ed19d0b78718cp-509 },
+ { 0x1.0e2e586d3df5cp-514, 0x1.632541cab3acp-509 },
+ { 0x1.84fe1b767669bp-515, 0x1.ff82820edeaabp-510 },
+ { 0x1.17fdd44e1dc6cp-515, 0x1.705073deb552ap-510 },
+ { 0x1.9304d9065a4b9p-516, 0x1.092c6a4a26abfp-510 },
+ { 0x1.220449767742ap-516, 0x1.7dc8eab3ed87ap-511 },
+ { 0x1.a158f0df4c356p-517, 0x1.12ce032c827cep-511 },
+ { 0x1.2c4123936432bp-517, 0x1.8b8e0c1372c25p-512 },
+ { 0x1.aff97ef6163edp-518, 0x1.1ca5926404568p-512 },
+ { 0x1.36b3b4511d82bp-518, 0x1.999f1ae9f978bp-513 },
+ { 0x1.bee57a0fbbbdcp-519, 0x1.26b285aeabdbep-513 },
+ { 0x1.415b32c89327cp-519, 0x1.a7fb366632c72p-514 },
+ { 0x1.ce1bb2fa9523ep-520, 0x1.30f431387ee69p-514 },
+ { 0x1.4c36baf8c2285p-520, 0x1.b6a15925d0c25p-515 },
+ { 0x1.dd9ad3d89a4a5p-521, 0x1.3b69cf0bd5608p-515 },
+ { 0x1.57454d4c97f21p-521, 0x1.c590587256b75p-516 },
+ { 0x1.ed615f7bfd7d2p-522, 0x1.46127e8d37ba7p-516 },
+ { 0x1.6285ce2e2e29bp-522, 0x1.d4c6e38ed7f06p-517 },
+ { 0x1.fd6db0d73348ep-523, 0x1.50ed44039bd53p-517 },
+ { 0x1.6df705a8252f7p-523, 0x1.e4438317c2a1ep-518 },
+ { 0x1.06defd40bdb09p-523, 0x1.5bf9082dc8412p-518 },
+ { 0x1.79979f15ddb0dp-524, 0x1.f4049875ce63p-519 },
+ { 0x1.0f2823287afb6p-524, 0x1.673497e5a0d03p-519 },
+ { 0x1.856628e34ac2cp-525, 0x1.02042eb28efefp-519 },
+ { 0x1.17913a85a33a7p-525, 0x1.729ea3d219a53p-520 },
+ { 0x1.9161145d0e326p-526, 0x1.0a2671c8cdbeep-520 },
+ { 0x1.20191f16dc709p-526, 0x1.7e35c0288722ep-521 },
+ { 0x1.9d86b59187f4ep-527, 0x1.12680a24c58f5p-521 },
+ { 0x1.28be97e6e9065p-527, 0x1.89f8647df9662p-522 },
+ { 0x1.a9d5434377e7bp-528, 0x1.1ac7d823a316cp-522 },
+ { 0x1.31805749922c3p-528, 0x1.95e4eba9494cap-523 },
+ { 0x1.b64ad6eec66d3p-529, 0x1.2344a7c981006p-523 },
+ { 0x1.3a5cfae5998ecp-529, 0x1.a1f993b67371dp-524 },
+ { 0x1.c2e56cdffce02p-530, 0x1.2bdd30bebc795p-524 },
+ { 0x1.43530bcc0ee3ap-530, 0x1.ae347debd307p-525 },
+ { 0x1.cfa2e45eea63dp-531, 0x1.3490165a1de5p-525 },
+ { 0x1.4c60fe9d5cbc1p-531, 0x1.ba93aee1c301fp-526 },
+ { 0x1.dc80ffece4451p-532, 0x1.3d5be7b8309a9p-526 },
+ { 0x1.558533bc564e3p-532, 0x1.c7150ead1fd0ep-527 },
+ { 0x1.e97d659702f92p-533, 0x1.463f1fe01b7dap-527 },
+ { 0x1.5ebdf78f85a03p-533, 0x1.d3b6691d169e3p-528 },
+ { 0x1.f6959f5cadd73p-534, 0x1.4f3825f642bp-528 },
+ { 0x1.680982d0eea8ap-534, 0x1.e0756e0ca137bp-529 },
+ { 0x1.01e38dd55bfc7p-534, 0x1.58454d7cf072p-529 },
+ { 0x1.7165faec70a1p-535, 0x1.ed4fb1c7fef16p-530 },
+ { 0x1.088796f5a026p-535, 0x1.6164d6a338985p-530 },
+ { 0x1.7ad1726ce2f3cp-536, 0x1.fa42ad866b6p-531 },
+ { 0x1.0f3587953aeb5p-536, 0x1.6a94eea23ecd2p-531 },
+ { 0x1.8449e977fef01p-537, 0x1.03a5dffc21d0dp-531 },
+ { 0x1.15ebef6827c9dp-537, 0x1.73d3b028fc2cfp-532 },
+ { 0x1.8dcd4e591ac76p-538, 0x1.0a3416f4dd0f1p-532 },
+ { 0x1.1ca951b79a938p-538, 0x1.7d1f23d694b62p-533 },
+ { 0x1.97597e1aad586p-539, 0x1.10ca917d13a59p-533 },
+ { 0x1.236c25d3c18a2p-539, 0x1.867540c340902p-534 },
+ { 0x1.a0ec452e85047p-540, 0x1.1767d933fa0f7p-534 },
+ { 0x1.2a32d78fe110fp-540, 0x1.8fd3ed17c059fp-535 },
+ { 0x1.aa8360248e3edp-541, 0x1.1e0a6bf884441p-535 },
+ { 0x1.30fbc7c8ab284p-541, 0x1.9938feb3469d1p-536 },
+ { 0x1.b41c7c6ff8cc6p-542, 0x1.24b0bc63cac6bp-536 },
+ { 0x1.37c54cf4ab1fcp-542, 0x1.a2a23bdfb3241p-537 },
+ { 0x1.bdb5393a7ccd2p-543, 0x1.2b59324d7fd9bp-537 },
+ { 0x1.3e8db3be9418cp-543, 0x1.ac0d5c13ef72ap-538 },
+ { 0x1.c74b284572b4cp-544, 0x1.32022b5a4d882p-538 },
+ { 0x1.45533fa93710cp-544, 0x1.b57808c42df0bp-539 },
+ { 0x1.d0dbced86364cp-545, 0x1.38a9fb93eb86p-539 },
+ { 0x1.4c142bbcdb51bp-545, 0x1.bedfde3fbf9f1p-540 },
+ { 0x1.da64a6bca7adp-546, 0x1.3f4eee0ab230dp-540 },
+ { 0x1.52ceab3daa53bp-546, 0x1.c8426c9c266d4p-541 },
+ { 0x1.e3e31f45a0a96p-547, 0x1.45ef458066425p-541 },
+ { 0x1.5980ea6ad6692p-547, 0x1.d19d38acfc932p-542 },
+ { 0x1.ed549e6504cf2p-548, 0x1.4c893d1bef1fep-542 },
+ { 0x1.60290f4619f98p-548, 0x1.daedbd083bb8ep-543 },
+ { 0x1.f6b681cab013bp-549, 0x1.531b0925a021ep-543 },
+ { 0x1.66c53a6323b06p-549, 0x1.e4316b16614afp-544 },
+ { 0x1.00031007ac3e3p-549, 0x1.59a2d7cbb3c39p-544 },
+ { 0x1.6d5387be7adf6p-550, 0x1.ed65ac2de0264p-545 },
+ { 0x1.04a064f4bdd38p-550, 0x1.601ed1ee8e719p-545 },
+ { 0x1.73d20f9b5e73bp-551, 0x1.f687e2b942e41p-546 },
+ { 0x1.0931e5b5e6c43p-551, 0x1.668d1bf455ad8p-546 },
+ { 0x1.7a3ee7681856fp-552, 0x1.ff956b675583bp-547 },
+ { 0x1.0db636a632668p-552, 0x1.6cebd6a35f863p-547 },
+ { 0x1.809822a836e1fp-553, 0x1.0445cf3250898p-547 },
+ { 0x1.122bfb19eafe7p-553, 0x1.73392002f5fc2p-548 },
+ { 0x1.86dbd3e416493p-554, 0x1.08b3e84ebc2b9p-548 },
+ { 0x1.1691d609b1ec9p-554, 0x1.79731441e1e21p-549 },
+ { 0x1.8d080d9d1c96dp-555, 0x1.0d13aa83e4b01p-549 },
+ { 0x1.1ae66ac0b0b6ap-555, 0x1.7f97cea22928bp-550 },
+ { 0x1.931ae34603f62p-556, 0x1.1163bef9eebc1p-550 },
+ { 0x1.1f285d8d6c817p-556, 0x1.85a56a6965552p-551 },
+ { 0x1.99126a3e88ca5p-557, 0x1.15a2cf3193875p-551 },
+ { 0x1.23565474c154ep-557, 0x1.8b9a03d510324p-552 },
+ { 0x1.9eecbad1cb519p-558, 0x1.19cf85b21a11fp-552 },
+ { 0x1.276ef7e686addp-558, 0x1.9173b9121e9f7p-553 },
+ { 0x1.a4a7f136af77ep-559, 0x1.1de88eb969b39p-553 },
+ { 0x1.2b70f3735b79fp-559, 0x1.9730ab373bc61p-554 },
+ { 0x1.aa422e918100dp-560, 0x1.21ec98edb9593p-554 },
+ { 0x1.2f5af68314ac2p-560, 0x1.9cceff40f1fb1p-555 },
+ { 0x1.afb999f61e5d4p-561, 0x1.25da56105b758p-555 },
+ { 0x1.332bb50b471fbp-561, 0x1.a24cdf0f0a2e7p-556 },
+ { 0x1.b50c6169e961bp-562, 0x1.29b07bb123c75p-556 },
+ { 0x1.36e1e845638bbp-562, 0x1.a7a87a6267113p-557 },
+ { 0x1.ba38bae4baa67p-563, 0x1.2d6dc3e1e1b47p-557 },
+ { 0x1.3a7c4f63d9d53p-563, 0x1.ace007da9e0c8p-558 },
+ { 0x1.bf3ce55012ad1p-564, 0x1.3110ede9680cep-558 },
+ { 0x1.3df9b045b81fcp-564, 0x1.b1f1c5f28dcc9p-559 },
+ { 0x1.c4172983c2f7ep-565, 0x1.3498bef599a58p-559 },
+ { 0x1.4158d828399aep-565, 0x1.b6dbfbfb30836p-560 },
+ { 0x1.c8c5db3f49157p-566, 0x1.380402cbf1542p-560 },
+ { 0x1.44989c55b9312p-566, 0x1.bb9cfb13e7262p-561 },
+ { 0x1.cd475a1f163eep-567, 0x1.3b518c77fb7d2p-561 },
+ { 0x1.47b7dad17cf31p-567, 0x1.c0331f1f7ac71p-562 },
+ { 0x1.d19a128cff8a4p-568, 0x1.3e8036f737914p-562 },
+ { 0x1.4ab57affd05a9p-568, 0x1.c49ccfb511d2cp-563 },
+ { 0x1.d5bc7eab14dfbp-569, 0x1.418ee5e1d890ep-563 },
+ { 0x1.4d906e49e5535p-569, 0x1.c8d8810c585d4p-564 },
+ { 0x1.d9ad27381fd3dp-570, 0x1.447c860fdcf2cp-564 },
+ { 0x1.5047b0bcf6527p-570, 0x1.cce4b4e41cdcap-565 },
+ { 0x1.dd6aa46d0f45cp-571, 0x1.47480e39f8181p-565 },
+ { 0x1.52da49a426b16p-571, 0x1.d0bffb62a59f5p-566 },
+ { 0x1.e0f39ed2991f9p-572, 0x1.49f07f95c9d66p-566 },
+ { 0x1.55474c1ca1f2bp-572, 0x1.d468f3ef07049p-567 },
+ { 0x1.e446d00e60d84p-573, 0x1.4c74e66ce3841p-567 },
+ { 0x1.578dd7a37e92bp-573, 0x1.d7de4e02c6f6fp-568 },
+ { 0x1.e76303a6f7572p-574, 0x1.4ed45aae1d60cp-568 },
+ { 0x1.59ad189ced845p-574, 0x1.db1ec9f31f5e1p-569 },
+ { 0x1.ea4717be0f8c8p-575, 0x1.510e0078c325ep-569 },
+ { 0x1.5ba448d444792p-575, 0x1.de2939b1372f7p-570 },
+ { 0x1.ecf1fdc04a7dbp-576, 0x1.532108a122ff3p-570 },
+ { 0x1.5d72aff4768dap-576, 0x1.e0fc8180b06b8p-571 },
+ { 0x1.ef62bb0a0594ap-577, 0x1.550cb12e0f1dbp-571 },
+ { 0x1.5f17a3f894e1dp-577, 0x1.e39798a3f0a89p-572 },
+ { 0x1.f19869809eb8ap-578, 0x1.56d045cee7811p-572 },
+ { 0x1.60928993f7077p-578, 0x1.e5f989fd91cadp-573 },
+ { 0x1.f392381fab056p-579, 0x1.586b2049c7737p-573 },
+ { 0x1.61e2d491b1f68p-579, 0x1.e82174a67122fp-574 },
+ { 0x1.f54f6b79a6d5fp-580, 0x1.59dca8e17880fp-574 },
+ { 0x1.6308082b0b65cp-580, 0x1.ea0e8c77dc629p-575 },
+ { 0x1.f6cf5e2bb03dcp-581, 0x1.5b2456b2d3672p-575 },
+ { 0x1.6401b7549eebbp-581, 0x1.ebc01a8965943p-576 },
+ { 0x1.f8118143e7ebp-582, 0x1.5c41b0093e8e9p-576 },
+ { 0x1.64cf8501f223bp-582, 0x1.ed357da1f18bap-577 },
+ { 0x1.f9155c9a1fbd1p-583, 0x1.5d344aaa010f1p-577 },
+ { 0x1.6571245f3d39ap-583, 0x1.ee6e2a9b9efdp-578 },
+ { 0x1.f9da8f1a8a0ccp-584, 0x1.5dfbcc1628fd2p-578 },
+ { 0x1.65e6590135ap-584, 0x1.ef69acba2f951p-579 },
+ { 0x1.fa60cf0228aadp-585, 0x1.5e97e9c2cbc7fp-579 },
+ { 0x1.662ef70ab154bp-585, 0x1.f027a5f3a7f56p-580 },
+ { 0x1.faa7ea0cc6ecbp-586, 0x1.5f0869476fb64p-580 },
+ { 0x1.664ae34801e0ep-586, 0x1.f0a7cf2ae7563p-581 },
+ { 0x1.faafc59456a8cp-587, 0x1.5f4d2082760f5p-581 },
+ { 0x1.663a133fef35p-587, 0x1.f0e9f85c03b41p-582 },
+ { 0x1.fa785ea194bf2p-588, 0x1.5f65f5b366281p-582 },
+ { 0x1.65fc8d3a43882p-588, 0x1.f0ee08ba43cd5p-583 },
+ { 0x1.fa01c9ede6a16p-589, 0x1.5f52df8b025d3p-583 },
+ { 0x1.6592683be2829p-589, 0x1.f0b3febf9cbcdp-584 },
+ { 0x1.f94c33d66f35bp-590, 0x1.5f13e53118eaap-584 },
+ { 0x1.64fbcbf86f1abp-590, 0x1.f03bf02da5a7ap-585 },
+ { 0x1.f857e040665ap-591, 0x1.5ea91e400b8afp-585 },
+ { 0x1.6438f0b98cabp-591, 0x1.ef860a0000a7ap-586 },
+ { 0x1.f7252a6ecb2bbp-592, 0x1.5e12b2b611c72p-586 },
+ { 0x1.634a1f3bd0d7ep-592, 0x1.ee92905044d53p-587 },
+ { 0x1.f5b484c995f72p-593, 0x1.5d50dadc42d9dp-587 },
+ { 0x1.622fb08184d56p-593, 0x1.ed61de2b81fc4p-588 },
+ { 0x1.f40678969b4f4p-594, 0x1.5c63df237cf4dp-588 },
+ { 0x1.60ea0d9b5d711p-594, 0x1.ebf4655983167p-589 },
+ { 0x1.f21ba5a45e2afp-595, 0x1.5b4c17f7488b1p-589 },
+ { 0x1.5f79af6759efdp-595, 0x1.ea4aae160108ap-590 },
+ { 0x1.eff4c1e71b057p-596, 0x1.5a09ed86def16p-590 },
+ { 0x1.5ddf1e460242cp-596, 0x1.e86556bc034fep-591 },
+ { 0x1.ed92990861c73p-597, 0x1.589dd784842fp-591 },
+ { 0x1.5c1af1c6454bep-597, 0x1.e6451363b8311p-592 },
+ { 0x1.eaf60be99fa59p-598, 0x1.57085cdb6c23ep-592 },
+ { 0x1.5a2dd0483fd76p-598, 0x1.e3eaad7319948p-593 },
+ { 0x1.e820101a05296p-599, 0x1.554a135c6b3d2p-593 },
+ { 0x1.58186e973c8cbp-599, 0x1.e1570321beee3p-594 },
+ { 0x1.e511af403f0e1p-600, 0x1.53639f61bab8bp-594 },
+ { 0x1.55db8f7b445c6p-600, 0x1.de8b06f0475d8p-595 },
+ { 0x1.e1cc067882b19p-601, 0x1.5155b36a1ff17p-595 },
+ { 0x1.537803429dd3dp-601, 0x1.db87bf13d1856p-596 },
+ { 0x1.de5045a77840fp-602, 0x1.4f210fabcd4fep-596 },
+ { 0x1.50eea743a03bp-602, 0x1.d84e44d6006fdp-597 },
+ { 0x1.da9faec295ac1p-603, 0x1.4cc6819f5a3a9p-597 },
+ { 0x1.4e406557456e3p-603, 0x1.d4dfc3ea1615fp-598 },
+ { 0x1.d6bb950e85a76p-604, 0x1.4a46e38335bf7p-598 },
+ { 0x1.4b6e334ceafc3p-604, 0x1.d13d79b7b4d75p-599 },
+ { 0x1.d2a55c543d97bp-605, 0x1.47a31bd7fd98ap-599 },
+ { 0x1.48791257b832ep-605, 0x1.cd68b49be13bdp-600 },
+ { 0x1.ce5e780d6c294p-606, 0x1.44dc1cd628aecp-600 },
+ { 0x1.45620e7623619p-606, 0x1.c962d320e4c77p-601 },
+ { 0x1.c9e86a88f07ffp-607, 0x1.41f2e3dd79383p-601 },
+ { 0x1.422a3dd414b5ep-607, 0x1.c52d432db963cp-602 },
+ { 0x1.c544c4080f626p-608, 0x1.3ee878deaf1c1p-602 },
+ { 0x1.3ed2c02828af5p-608, 0x1.c0c9812daaed1p-603 },
+ { 0x1.c07521d52071ep-609, 0x1.3bbdedbff743p-603 },
+ { 0x1.3b5cbe0c97302p-609, 0x1.bc391730e1bf4p-604 },
+ { 0x1.bb7b2d547171ap-610, 0x1.38745dbc97fd1p-604 },
+ { 0x1.37c9685446b6bp-610, 0x1.b77d9c068db21p-605 },
+ { 0x1.b6589b1020c3ep-611, 0x1.350cecc05d9cfp-605 },
+ { 0x1.3419f75c953bcp-611, 0x1.b298b2516cc35p-606 },
+ { 0x1.b10f29bfb2a68p-612, 0x1.3188c6bf4cd49p-606 },
+ { 0x1.304faa5c619afp-612, 0x1.ad8c07976bbcp-607 },
+ { 0x1.aba0a14c264ccp-613, 0x1.2de91f0a22435p-607 },
+ { 0x1.2c6bc6b0e1424p-613, 0x1.a859534d21642p-608 },
+ { 0x1.a60ed1d150c44p-614, 0x1.2a2f2fa027fc3p-608 },
+ { 0x1.286f9728ce321p-614, 0x1.a30255dde65bep-609 },
+ { 0x1.a05b929d439abp-615, 0x1.265c387eea954p-609 },
+ { 0x1.245c6b4e79163p-615, 0x1.9d88d7b14c6d3p-610 },
+ { 0x1.9a88c12e847c2p-616, 0x1.22717ef05792fp-610 },
+ { 0x1.203396b14a77p-616, 0x1.97eea82eb8229p-611 },
+ { 0x1.94984031d9858p-617, 0x1.1e704cd7ceb7cp-611 },
+ { 0x1.1bf6702f3caf4p-617, 0x1.92359cbfdea74p-612 },
+ { 0x1.8e8bf6806bcabp-618, 0x1.1a59effeaeef1p-612 },
+ { 0x1.17a6513ed67fap-618, 0x1.8c5f8fd2e86f6p-613 },
+ { 0x1.8865ce1efe9b6p-619, 0x1.162fb960e6361p-613 },
+ { 0x1.1344953a2bc16p-619, 0x1.866e5fdcf6e5cp-614 },
+ { 0x1.8227b33ef66f4p-620, 0x1.11f2fc7a0a0a9p-614 },
+ { 0x1.0ed298ab66e97p-620, 0x1.8063ee5dc8676p-615 },
+ { 0x1.7bd39341e60d2p-621, 0x1.0da50e937b941p-615 },
+ { 0x1.0a51b89b5ac38p-621, 0x1.7a421ee53231bp-616 },
+ { 0x1.756b5bc0538cfp-622, 0x1.0947461417eb2p-616 },
+ { 0x1.05c351e298147p-622, 0x1.740ad61b23997p-617 },
+ { 0x1.6ef0f9946142ep-623, 0x1.04daf9d1f19dp-617 },
+ { 0x1.0128c07d7eac9p-623, 0x1.6dbff8cae0f32p-618 },
+ { 0x1.686657e900799p-624, 0x1.006180668cd93p-618 },
+ { 0x1.f906bdc779cfcp-625, 0x1.67636af21f0cbp-619 },
+ { 0x1.61cd5f4e4d33cp-625, 0x1.f7b85f0c272bbp-620 },
+ { 0x1.efa90ac757637p-626, 0x1.60f70ed4a200ep-620 },
+ { 0x1.5b27f4d3aafafp-626, 0x1.ee98b6b3e4f34p-621 },
+ { 0x1.e63b1303dfbfbp-627, 0x1.5a7cc414fb8aap-621 },
+ { 0x1.5477f92833195p-627, 0x1.e566abbe94f87p-622 },
+ { 0x1.dcbf7abb88524p-628, 0x1.53f666d2fde17p-622 },
+ { 0x1.4dbf47c1fc8ap-628, 0x1.dc24dc933bf6dp-623 },
+ { 0x1.d338de3492428p-629, 0x1.4d65ced070949p-623 },
+ { 0x1.46ffb60cbd76p-629, 0x1.d2d5e0d43505p-624 },
+ { 0x1.c9a9d09a6515fp-630, 0x1.46ccce9c8cdf5p-624 },
+ { 0x1.403b12a03d499p-630, 0x1.c97c4837b573ep-625 },
+ { 0x1.c014dae645fc3p-631, 0x1.402d32c6be96dp-625 },
+ { 0x1.3973247f05596p-631, 0x1.c01a996aebdb3p-626 },
+ { 0x1.b67c7ad400b86p-632, 0x1.3988c1191e211p-626 },
+ { 0x1.32a9aa5db4bb3p-632, 0x1.b6b3510058b7ap-627 },
+ { 0x1.ace321e309c7bp-633, 0x1.32e137db0ef23p-627 },
+ { 0x1.2be059f3526f7p-633, 0x1.ad48e069f2207p-628 },
+ { 0x1.a34b346493cc3p-634, 0x1.2c384d1c64d5bp-628 },
+ { 0x1.2518df52ef492p-634, 0x1.a3ddacff96f65p-629 },
+ { 0x1.99b70897047dcp-635, 0x1.258fae0968e74p-629 },
+ { 0x1.1e54dc4edf3a3p-635, 0x1.9a740f1248851p-630 },
+ { 0x1.9028e5cf277c7p-636, 0x1.1ee8fe480d92cp-630 },
+ { 0x1.1795e7e5c7ccap-636, 0x1.910e510c93fe1p-631 },
+ { 0x1.86a303af6f699p-637, 0x1.1845d75e974c6p-631 },
+ { 0x1.10dd8db9b7b2p-637, 0x1.87aeaea087811p-632 },
+ { 0x1.7d27896d87b8ep-638, 0x1.11a7c823f5ff5p-632 },
+ { 0x1.0a2d4d917179ap-638, 0x1.7e57540380a9p-633 },
+ { 0x1.73b88d266bc5ap-639, 0x1.0b10543a01766p-633 },
+ { 0x1.03869ae409b27p-639, 0x1.750a5d3814d59p-634 },
+ { 0x1.6a58134129f18p-640, 0x1.0480f391c14fcp-634 },
+ { 0x1.f9d5b8ddde221p-641, 0x1.6bc9d56645be6p-635 },
+ { 0x1.61080de06bfbp-641, 0x1.fbf623f3bedbap-636 },
+ { 0x1.ecb6d7acd34f7p-642, 0x1.6297b642274f2p-636 },
+ { 0x1.57ca5c62d05ddp-642, 0x1.ef001d6eb49dfp-637 },
+ { 0x1.dfb32aa129cc6p-643, 0x1.5975e7810e7p-637 },
+ { 0x1.4ea0caf213789p-643, 0x1.e222785106b16p-638 },
+ { 0x1.d2cd2eb59de4cp-644, 0x1.50663e5d53392p-638 },
+ { 0x1.458d1220fa79dp-644, 0x1.d55fbee497ep-639 },
+ { 0x1.c60744f31e198p-645, 0x1.476a7d28a437bp-639 },
+ { 0x1.3c90d697e5b5dp-645, 0x1.c8ba606fb6833p-640 },
+ { 0x1.b963b20518321p-646, 0x1.3e8452ecdbe84p-640 },
+ { 0x1.33ada8cfe418fp-646, 0x1.bc34b0b8bbc6p-641 },
+ { 0x1.ace49de2283aep-647, 0x1.35b55b1b3d652p-641 },
+ { 0x1.2ae504dc15f24p-647, 0x1.afd0e79df00ebp-642 },
+ { 0x1.a08c1388db34fp-648, 0x1.2cff1d49f192cp-642 },
+ { 0x1.223852412258p-648, 0x1.a39120c175c51p-643 },
+ { 0x1.945c00d028182p-649, 0x1.24630cff92d39p-643 },
+ { 0x1.19a8e3da77fbep-649, 0x1.97775b48ec1aap-644 },
+ { 0x1.8856364b336c5p-650, 0x1.1be2898c8a8a4p-644 },
+ { 0x1.1137f7cd08642p-650, 0x1.8b8579b06ca2cp-645 },
+ { 0x1.7c7c673fe436ep-651, 0x1.137eddf1f97aep-645 },
+ { 0x1.08e6b787233bap-651, 0x1.7fbd41b078795p-646 },
+ { 0x1.70d029afc4472p-652, 0x1.0b3940d5da6fcp-646 },
+ { 0x1.00b637cd0ec0bp-652, 0x1.74205c365c73ep-647 },
+ { 0x1.6552f6729a259p-653, 0x1.0312d48405757p-647 },
+ { 0x1.f14ef1a3e4ac2p-654, 0x1.68b0556e87723p-648 },
+ { 0x1.5a06296220023p-654, 0x1.f6194df7630e5p-649 },
+ { 0x1.e176ccb941b53p-655, 0x1.5d6e9ce0425a7p-649 },
+ { 0x1.4eeb0196310cdp-655, 0x1.e64f64121563ep-650 },
+ { 0x1.d1e5afef936dap-656, 0x1.525c859a2ea9ap-650 },
+ { 0x1.4402a1b0bd9dfp-656, 0x1.d6c9b6d4d6fc5p-651 },
+ { 0x1.c29d225a230e3p-657, 0x1.477b466ee6cc1p-651 },
+ { 0x1.394e1038ce88ep-657, 0x1.c789ea0183d02p-652 },
+ { 0x1.b39e83951bdaap-658, 0x1.3ccbfa4112a58p-652 },
+ { 0x1.2ece3803d8d68p-658, 0x1.b8917a154498bp-653 },
+ { 0x1.a4eb0c6436cf4p-659, 0x1.324fa05e3adc4p-653 },
+ { 0x1.2483e8ac9d061p-659, 0x1.a9e1bcd30af1fp-654 },
+ { 0x1.9683cf6400112p-660, 0x1.28071ce79e917p-654 },
+ { 0x1.1a6fd716c7c18p-660, 0x1.9b7be1e1550cbp-655 },
+ { 0x1.8869b9cc95345p-661, 0x1.1df33948493fap-655 },
+ { 0x1.10929dfe85b79p-661, 0x1.8d60f37a227b9p-656 },
+ { 0x1.7a9d9444b613ep-662, 0x1.1414a4b7a1729p-656 },
+ { 0x1.06ecbe9338febp-662, 0x1.7f91d72bfd333p-657 },
+ { 0x1.6d2003c3fdf54p-663, 0x1.0a6bf4c7a4f95p-657 },
+ { 0x1.fafd4238f8063p-664, 0x1.720f4eaaf4bbbp-658 },
+ { 0x1.5ff18a8317f0ap-664, 0x1.00f9a5fe04069p-658 },
+ { 0x1.e8912b5139031p-665, 0x1.64d9f8b065b73p-659 },
+ { 0x1.531288f8c01c7p-665, 0x1.ef7c38ee94e41p-660 },
+ { 0x1.d695a98770e4bp-666, 0x1.57f251e86550ep-660 },
+ { 0x1.46833ee262b1p-666, 0x1.dd73492689d2p-661 },
+ { 0x1.c50b006d4e015p-667, 0x1.4b58b5eba6cc7p-661 },
+ { 0x1.3a43cc572b3d3p-667, 0x1.cbd8e7539eac7p-662 },
+ { 0x1.b3f14799b1616p-668, 0x1.3f0d6044b145dp-662 },
+ { 0x1.2e5432e458097p-668, 0x1.baad518e7426ep-663 },
+ { 0x1.a3486c40b74f1p-669, 0x1.33106d7f3cac9p-663 },
+ { 0x1.22b456b1a8db7p-669, 0x1.a9f09adee91e3p-664 },
+ { 0x1.931032d667261p-670, 0x1.2761dc408f1efp-664 },
+ { 0x1.1763ffacc46acp-670, 0x1.99a2acce5bd7fp-665 },
+ { 0x1.834838ba6fe3dp-671, 0x1.1c018e67b6eaep-665 },
+ { 0x1.0c62daba74e7cp-671, 0x1.89c349043d67ep-666 },
+ { 0x1.73eff5eb5eca5p-672, 0x1.10ef4a3481a29p-666 },
+ { 0x1.01b07aeca1f42p-672, 0x1.7a520aeb63faep-667 },
+ { 0x1.6506bebfc67bdp-673, 0x1.062abb7415c63p-667 },
+ { 0x1.ee98b577ea7cap-674, 0x1.6b4e695e9099fp-668 },
+ { 0x1.568bc5a3d72eep-674, 0x1.f766e96435041p-669 },
+ { 0x1.da6bba883d22ap-675, 0x1.5cb7b85aa6067p-669 },
+ { 0x1.487e1cd9f3e43p-675, 0x1.e311e0dabf963p-670 },
+ { 0x1.c6d89f0368fc1p-676, 0x1.4e8d2ab5187d6p-670 },
+ { 0x1.3adcb83cdccc3p-676, 0x1.cf55249e0172ap-671 },
+ { 0x1.b3ddd3216f86ep-677, 0x1.40cdd3d52967cp-671 },
+ { 0x1.2da66f0214306p-677, 0x1.bc2f50c60488ep-672 },
+ { 0x1.a1799fd5925f4p-678, 0x1.3378a96e8e29ap-672 },
+ { 0x1.20d9fd7b31257p-678, 0x1.a99ed8a2f2e6bp-673 },
+ { 0x1.8faa294857a39p-679, 0x1.268c853c2e48dp-673 },
+ { 0x1.147606d4e1ee3p-679, 0x1.97a2092e9b19dp-674 },
+ { 0x1.7e6d714d6fce7p-680, 0x1.1a0826b9b2f1ep-674 },
+ { 0x1.087916d26f37cp-680, 0x1.86370b7b69b46p-675 },
+ { 0x1.6dc159d3dbce3p-681, 0x1.0dea34dab05c3p-675 },
+ { 0x1.f9c3470942341p-682, 0x1.755be71f29feap-676 },
+ { 0x1.5da3a74ec8bc7p-682, 0x1.02313fbe40a01p-676 },
+ { 0x1.e35c1df5edf07p-683, 0x1.650e8497f58cdp-677 },
+ { 0x1.4e120315adc06p-683, 0x1.edb784bbee452p-678 },
+ { 0x1.cdb951dc67cbfp-684, 0x1.554cafa9d0c34p-678 },
+ { 0x1.3f09fdba5037ep-684, 0x1.d7d0486e476ccp-679 },
+ { 0x1.b8d760c6a3faap-685, 0x1.461419b3892c2p-679 },
+ { 0x1.308911536a23dp-685, 0x1.c2a975dad9bep-680 },
+ { 0x1.a4b2aa8c000cap-686, 0x1.37625bf981bdbp-680 },
+ { 0x1.228ca3bac6e07p-686, 0x1.ae3f97cbb25cep-681 },
+ { 0x1.914773f3bbbacp-687, 0x1.2934f9e530badp-681 },
+ { 0x1.151208bdc254ep-687, 0x1.9a8f1bb2e0d78p-682 },
+ { 0x1.7e91e9c37a26bp-688, 0x1.1b8963382a86p-682 },
+ { 0x1.0816843f2edd8p-688, 0x1.879454bd5bf1ap-683 },
+ { 0x1.6c8e23b87885fp-689, 0x1.0e5cf631ac83bp-683 },
+ { 0x1.f72e98937c4f8p-690, 0x1.754b7ed21d736p-684 },
+ { 0x1.5b38276a48eap-690, 0x1.01ad01a5b2ddp-684 },
+ { 0x1.df23162441e8bp-691, 0x1.63b0c17c2afp-685 },
+ { 0x1.4a8beb16012edp-691, 0x1.eaed8e09770edp-686 },
+ { 0x1.c804c1d0522ebp-692, 0x1.52c032be62aabp-686 },
+ { 0x1.3a855850eeeeap-692, 0x1.d36ef8a6e08fap-687 },
+ { 0x1.b1cdcc2ca0214p-693, 0x1.4275d9d00481dp-687 },
+ { 0x1.2b204ea20186ep-693, 0x1.bcd89c2310d59p-688 },
+ { 0x1.9c78595e362cep-694, 0x1.32cdb1c10f0eep-688 },
+ { 0x1.1c58a6013aaeep-694, 0x1.a724c21e93002p-689 },
+ { 0x1.87fe848fd6bffp-695, 0x1.23c3ac05a8c19p-689 },
+ { 0x1.0e2a313c94bb5p-695, 0x1.924da8624908p-690 },
+ { 0x1.745a6341bd9d3p-696, 0x1.1553b2e7eba16p-690 },
+ { 0x1.0090c041eb55fp-696, 0x1.7e4d844204d5fp-691 },
+ { 0x1.61860872f36c7p-697, 0x1.0779abdf88654p-691 },
+ { 0x1.e710449b20327p-698, 0x1.6b1e85d9cfdc3p-692 },
+ { 0x1.4f7b87a3ccd22p-698, 0x1.f462f39da55f5p-693 },
+ { 0x1.ce184ffaa0275p-699, 0x1.58badb2559681p-693 },
+ { 0x1.3e34f7b15484dp-699, 0x1.daedfe49c8a9fp-694 },
+ { 0x1.b6314a8f93441p-700, 0x1.471cb2f12adecp-694 },
+ { 0x1.2dac75898461p-700, 0x1.c28c3fc94131bp-695 },
+ { 0x1.9f52e6b0168fbp-701, 0x1.363e3fa56683p-695 },
+ { 0x1.1ddc26b854422p-701, 0x1.ab358720f461fp-696 },
+ { 0x1.8974e49b18481p-702, 0x1.2619b9e9f9276p-696 },
+ { 0x1.0ebe3bcdc6652p-702, 0x1.94e1adf5ef17ap-697 },
+ { 0x1.748f15c14a99p-703, 0x1.16a96324493c1p-697 },
+ { 0x1.004cf29d383afp-703, 0x1.7f889bf8109c7p-698 },
+ { 0x1.60995fd7916b4p-704, 0x1.07e787ce8decbp-698 },
+ { 0x1.e50530acb7a2bp-705, 0x1.6b224a16aa4ep-699 },
+ { 0x1.4d8bbfb38c98p-705, 0x1.f39d03522ee6ep-700 },
+ { 0x1.cab316f0b29dep-706, 0x1.57a6c57f8fed2p-700 },
+ { 0x1.3b5e4bf3051bbp-706, 0x1.d8b1738bdcb74p-701 },
+ { 0x1.b1987b3f62cd2p-707, 0x1.450e32693ba8dp-701 },
+ { 0x1.2a09376f26716p-707, 0x1.bf0154de94403p-702 },
+ { 0x1.99aa6a5f22416p-708, 0x1.3350cea8cd61ap-702 },
+ { 0x1.1984d37c8d151p-708, 0x1.a681c1d2f0b94p-703 },
+ { 0x1.82de1daeb9c47p-709, 0x1.2266f414ce57bp-703 },
+ { 0x1.09c991f950457p-709, 0x1.8f27fe21c9591p-704 },
+ { 0x1.6d28fdea9871ap-710, 0x1.12491ab5c17d9p-704 },
+ { 0x1.f5a00e548f085p-711, 0x1.78e979aa0c9bep-705 },
+ { 0x1.5880a5ae03598p-711, 0x1.02efdac5a4ff4p-705 },
+ { 0x1.d921d6d1c821bp-712, 0x1.63bbd32217718p-706 },
+ { 0x1.44dae3b23367bp-712, 0x1.e8a7dcff4677cp-707 },
+ { 0x1.be0a394617721p-713, 0x1.4f94da865b2a3p-707 },
+ { 0x1.322dbccd73cabp-713, 0x1.ccdc67829105bp-708 },
+ { 0x1.a44b3f5ce9c8bp-714, 0x1.3c6a934743c05p-708 },
+ { 0x1.206f6db46b93p-714, 0x1.b26f5afd4ebc9p-709 },
+ { 0x1.8bd742e227a38p-715, 0x1.2a3336386b4d7p-709 },
+ { 0x1.0f966c7fd2396p-715, 0x1.99530a15ce61ap-710 },
+ { 0x1.74a0efc06d36ep-716, 0x1.18e533433f227p-710 },
+ { 0x1.ff32d3f1c0a49p-717, 0x1.817a166d90dbdp-711 },
+ { 0x1.5e9b45aff1bep-717, 0x1.087732df4f3abp-711 },
+ { 0x1.e0dea55db81c4p-718, 0x1.6ad7728d6db01p-712 },
+ { 0x1.49b9999981d6cp-718, 0x1.f1c02ea5235f3p-713 },
+ { 0x1.c41e9fb058b1ep-719, 0x1.555e63841a093p-713 },
+ { 0x1.35ef96b0fe655p-719, 0x1.d42dfb77e321ep-714 },
+ { 0x1.a8e19002cb47fp-720, 0x1.4102823a6a0a2p-714 },
+ { 0x1.23313f4adb099p-720, 0x1.b8267dd51660dp-715 },
+ { 0x1.8f16bf19917acp-721, 0x1.2db7bc80b123ep-715 },
+ { 0x1.1172ed701cd4p-721, 0x1.9d98e007ff597p-716 },
+ { 0x1.76adf2095d808p-722, 0x1.1b7255d8af1cep-716 },
+ { 0x1.00a953345bce4p-722, 0x1.8474c5f89cf1fp-717 },
+ { 0x1.5f976a86ba7a3p-723, 0x1.0a26e7ff7c8ap-717 },
+ { 0x1.e192f5a290a0dp-724, 0x1.6caa4dc34bcc6p-718 },
+ { 0x1.49c3e6e576cf8p-724, 0x1.f394c675d5da1p-719 },
+ { 0x1.c3918d16606afp-725, 0x1.562a0ffd36fefp-719 },
+ { 0x1.3524a1ccb90cep-725, 0x1.d4a41cdb95576p-720 },
+ { 0x1.a739e0c3f00b3p-726, 0x1.40e51faa74ee4p-720 },
+ { 0x1.21ab51a49a64p-726, 0x1.b7670ded07be7p-721 },
+ { 0x1.8c781323e2b8bp-727, 0x1.2ccd09eaa341p-721 },
+ { 0x1.0f4a27c210b83p-727, 0x1.9bc980b6cd88bp-722 },
+ { 0x1.7338f3cfd4b18p-728, 0x1.19d3d560c7458p-722 },
+ { 0x1.fbe79eabbab8bp-729, 0x1.81b807901b2ddp-723 },
+ { 0x1.5b69fdd784131p-729, 0x1.07ec015b26bbfp-723 },
+ { 0x1.db36d8463b3e1p-730, 0x1.691fdebe382bep-724 },
+ { 0x1.44f955c9776f6p-730, 0x1.ee11097f70374p-725 },
+ { 0x1.bc693203fe92cp-731, 0x1.51eeeac7320bep-725 },
+ { 0x1.2fd5c7756dd24p-731, 0x1.ce39998362bf9p-726 },
+ { 0x1.9f66cc65fb2cbp-732, 0x1.3c13b67a17ff2p-726 },
+ { 0x1.1beec36eb8502p-732, 0x1.b03976c943068p-727 },
+ { 0x1.8418af0dd65edp-733, 0x1.277d70b2ebc6fp-727 },
+ { 0x1.09345c546e7cdp-733, 0x1.93f94ba2c6b6ap-728 },
+ { 0x1.6a68c4bfd764bp-734, 0x1.141be9e049453p-728 },
+ { 0x1.ef2e87ca7b717p-735, 0x1.7962a50231832p-729 },
+ { 0x1.5241d71eb6e19p-735, 0x1.01df915097b64p-729 },
+ { 0x1.ce118fc8beeeap-736, 0x1.605fee84767fp-730 },
+ { 0x1.3b8f8a28fd848p-736, 0x1.e172e498cd2fcp-731 },
+ { 0x1.aef59daa19c93p-737, 0x1.48dc6e3757e71p-731 },
+ { 0x1.263e577f574dp-737, 0x1.c1366206ca036p-732 },
+ { 0x1.91bfa9231de5cp-738, 0x1.32c440230ef3ap-732 },
+ { 0x1.123b897af1af4p-738, 0x1.a2ee0ea25a216p-733 },
+ { 0x1.7655cd85a2773p-739, 0x1.1e04519eb8f87p-733 },
+ { 0x1.feea6c3554149p-740, 0x1.867f82bdccb8fp-734 },
+ { 0x1.5c9f427a491a4p-740, 0x1.0a8a5c7678dffp-734 },
+ { 0x1.dbb4739afff2ep-741, 0x1.6bd1744d1513ep-735 },
+ { 0x1.4484548d479a3p-741, 0x1.f089c3d3d8b6fp-736 },
+ { 0x1.bab46440d8e4bp-742, 0x1.52cbafb8bc99fp-736 },
+ { 0x1.2dee5d96e696ep-742, 0x1.ce464b1286c0dp-737 },
+ { 0x1.9bcaf0aad775cp-743, 0x1.3b571085ef9dbp-737 },
+ { 0x1.18c7bd07b007fp-743, 0x1.ae2a4fedee59cp-738 },
+ { 0x1.7eda37d26ae66p-744, 0x1.255d79dbe3905p-738 },
+ { 0x1.04fbd01fd3b9ap-744, 0x1.9017432798e26p-739 },
+ { 0x1.63c5ba199716fp-745, 0x1.10c9ceee61d28p-739 },
+ { 0x1.e4edd431a7a4p-746, 0x1.73effa34f57abp-740 },
+ { 0x1.4a724e2f6eadep-746, 0x1.fb0fd6a99ec28p-741 },
+ { 0x1.c24c9890314cdp-747, 0x1.5998a4600495bp-741 },
+ { 0x1.32c615eef6a3dp-747, 0x1.d70936a92f04ap-742 },
+ { 0x1.a1f03c81340fdp-748, 0x1.40f6bfdad1f14p-742 },
+ { 0x1.1ca87340e1c39p-748, 0x1.b55b284add8c1p-743 },
+ { 0x1.83b6cbf2ba29fp-749, 0x1.29f10ece9036ep-743 },
+ { 0x1.0801fd07f7284p-749, 0x1.95e2d86ae92c8p-744 },
+ { 0x1.677ffffc31b92p-750, 0x1.146f8c6e8dc57p-744 },
+ { 0x1.e978e83ebd95dp-751, 0x1.787f26e598ebbp-745 },
+ { 0x1.4d2d2f5dd4096p-751, 0x1.005b6216a17eap-745 },
+ { 0x1.c58570e2f641dp-752, 0x1.5d10973fbab06p-746 },
+ { 0x1.34a13f272cdfap-752, 0x1.db3db8f832a58p-747 },
+ { 0x1.a4017c5ace0dep-753, 0x1.4379416dfac63p-747 },
+ { 0x1.1dc0938cfb932p-753, 0x1.b84ac1ef46255p-748 },
+ { 0x1.84c7064147f81p-754, 0x1.2b9cc2c3d6738p-748 },
+ { 0x1.087100f5e6429p-754, 0x1.97b6c5dc3637ap-749 },
+ { 0x1.67b20873fc995p-755, 0x1.15602f1227af8p-749 },
+ { 0x1.e9337a8979dap-756, 0x1.795cb2bb480b6p-750 },
+ { 0x1.4ca0667456eb8p-756, 0x1.00aa01fc8a73ep-750 },
+ { 0x1.c446a2ccade1cp-757, 0x1.5d196927cdaccp-751 },
+ { 0x1.3371d92c55c69p-757, 0x1.dac421184af19p-752 },
+ { 0x1.a1ef1650d3562p-758, 0x1.42cba823b93cbp-752 },
+ { 0x1.1c07db1df4cf6p-758, 0x1.b6e2f60b615c1p-753 },
+ { 0x1.8202debc2593cp-759, 0x1.2a53f94211ba9p-753 },
+ { 0x1.064595037ce7bp-759, 0x1.95853e0fd75adp-754 },
+ { 0x1.645a58ac6913cp-760, 0x1.13949d3b2fbd2p-754 },
+ { 0x1.e41f95cc492cep-761, 0x1.768213ee2ba9cp-755 },
+ { 0x1.48d0194e5b153p-761, 0x1.fce2f1e195a7ap-756 },
+ { 0x1.be99935f38c42p-762, 0x1.59b2d772c1b04p-756 },
+ { 0x1.2f40d4a5d287p-762, 0x1.d5a005ce1b15dp-757 },
+ { 0x1.9bc8aa74c3805p-763, 0x1.3ef3138f8ae58p-757 },
+ { 0x1.178b448b82b16p-763, 0x1.b12e626e3c8a1p-758 },
+ { 0x1.7b7f2dc7fa066p-764, 0x1.2620652c3102cp-758 },
+ { 0x1.0190106456396p-764, 0x1.8f5ecffd9c995p-759 },
+ { 0x1.5d92194746ef2p-765, 0x1.0f1a62a97a48ep-759 },
+ { 0x1.da636b2add63ap-766, 0x1.7004d0a0dd3fcp-760 },
+ { 0x1.41d8f14e2d235p-766, 0x1.f38508375a815p-761 },
+ { 0x1.b4a8e16df3a2ep-767, 0x1.52f67f4a45dbdp-761 },
+ { 0x1.282da2ee06e9fp-767, 0x1.cbf8187da97p-762 },
+ { 0x1.91bc4f0e82a1p-768, 0x1.380c6fa6ddd1bp-762 },
+ { 0x1.106c65473611bp-768, 0x1.a757e44dde4fbp-763 },
+ { 0x1.716ca73d3a1dcp-769, 0x1.1f218f165083cp-763 },
+ { 0x1.f4e737e667fe6p-770, 0x1.8571975a9ba0cp-764 },
+ { 0x1.538bdbc88035p-770, 0x1.081306aee058bp-764 },
+ { 0x1.cc4774fe05a13p-771, 0x1.661571375ee31p-765 },
+ { 0x1.37eeb586702afp-771, 0x1.e5803c9b677cp-766 },
+ { 0x1.a6be51e94d2c3p-772, 0x1.49169d29f057fp-766 },
+ { 0x1.1e6cae3cc5ce4p-772, 0x1.be144165bfdadp-767 },
+ { 0x1.841452e30c6ecp-773, 0x1.2e4b0b7596d86p-767 },
+ { 0x1.06dfcc0330324p-773, 0x1.99a8814f82396p-768 },
+ { 0x1.64157d8dbcaa1p-774, 0x1.158b4c1d7aa61p-768 },
+ { 0x1.e248fc3725278p-775, 0x1.7806fe5adc0dep-769 },
+ { 0x1.4691284199248p-775, 0x1.fd64d63539ac4p-770 },
+ { 0x1.ba32f675bcca1p-776, 0x1.58fd2560c98e3p-770 },
+ { 0x1.2b59cb5fcd07p-776, 0x1.d33b9c01b8858p-771 },
+ { 0x1.953f4278d9771p-777, 0x1.3c5b9e7be019ep-771 },
+ { 0x1.1244d4a198783p-777, 0x1.ac5a261b57bd2p-772 },
+ { 0x1.7333ac721d353p-778, 0x1.21f61f6e6a3a5p-772 },
+ { 0x1.f654f8b2c9938p-779, 0x1.8883e334bf813p-773 },
+ { 0x1.53d9d5f4e3889p-779, 0x1.09a33ffab8174p-773 },
+ { 0x1.cbcb3935e8707p-780, 0x1.678037d69a88ap-774 },
+ { 0x1.36fefd85e37f7p-780, 0x1.e678a0474dd4dp-775 },
+ { 0x1.a4a7147e53789p-781, 0x1.491a44a8cc267p-775 },
+ { 0x1.1c73c8c2f3143p-781, 0x1.bd3a60953bab8p-776 },
+ { 0x1.80a7df6e9e4abp-782, 0x1.2d20af56e98e4p-776 },
+ { 0x1.040c111171b21p-782, 0x1.9748563f2a02cp-777 },
+ { 0x1.5f9153468350dp-783, 0x1.13656dff66048p-777 },
+ { 0x1.db3d65827b6f1p-784, 0x1.7463a2ae57157p-778 },
+ { 0x1.412b4a3b0b6bbp-784, 0x1.f77b2a384d071p-779 },
+ { 0x1.b20abd232bd72p-785, 0x1.5451ae34b02aep-779 },
+ { 0x1.25417f5fe18aap-785, 0x1.cc024fa52d21ep-780 },
+ { 0x1.8c38db09c3d68p-786, 0x1.36dbe645ba702p-780 },
+ { 0x1.0ba351c6b2c44p-786, 0x1.a415d531b6e85p-781 },
+ { 0x1.69856de02317p-787, 0x1.1bcf7eeeba2f5p-781 },
+ { 0x1.e847157246bfcp-788, 0x1.7f70703ac5558p-782 },
+ { 0x1.49b2d16422141p-788, 0x1.02fd377359b1p-782 },
+ { 0x1.bd304de355d85p-789, 0x1.5dd1b0bb84b26p-783 },
+ { 0x1.2c87c2ff697dcp-789, 0x1.d87243e77ecadp-784 },
+ { 0x1.95b4456f24a66p-790, 0x1.3efdb3b369292p-784 },
+ { 0x1.11cf1a60f1d84p-790, 0x1.aeb4dc01a4631p-785 },
+ { 0x1.718a9184a8678p-791, 0x1.22bcd99dbdb06p-785 },
+ { 0x1.f2af0be1fde49p-792, 0x1.88766c06b0833p-786 },
+ { 0x1.507007917e3d9p-792, 0x1.08db80d427d79p-786 },
+ { 0x1.c5e695f15072bp-793, 0x1.65709eb54bf5ep-787 },
+ { 0x1.32266540e08c2p-793, 0x1.e253876b38acep-788 },
+ { 0x1.9cf012acb820bp-794, 0x1.45623a2f6a451p-788 },
+ { 0x1.1673fda512b46p-794, 0x1.b6f674d703273p-789 },
+ { 0x1.777d05328bd26p-795, 0x1.280eca736b4b1p-789 },
+ { 0x1.fa46d62b8e57dp-796, 0x1.8f4d804e3ad6fp-790 },
+ { 0x1.5544c8bc23e1cp-796, 0x1.0d3e50a2eecdcp-790 },
+ { 0x1.cc068b1dc8ab2p-797, 0x1.6b0c7763ce52bp-791 },
+ { 0x1.36042b906571p-797, 0x1.e979edc5b3767p-792 },
+ { 0x1.a1cbbab815b4cp-798, 0x1.49ecd657d5dd6p-792 },
+ { 0x1.197d0fe71564cp-798, 0x1.bcb59141dc715p-793 },
+ { 0x1.7b41f3bcb1869p-799, 0x1.2bad65a82bb23p-793 },
+ { 0x1.feec24eca8006p-800, 0x1.93d6de18ac6bfp-794 },
+ { 0x1.581b387627669p-800, 0x1.1011dd6dfecf6p-794 },
+ { 0x1.cf746ccaba032p-801, 0x1.6e8be31f2fe24p-795 },
+ { 0x1.380f8b864e1acp-801, 0x1.edc51c8649aaap-796 },
+ { 0x1.a4312cc2f816ap-802, 0x1.4c88f43732a1p-796 },
+ { 0x1.1adc83c96accfp-802, 0x1.bfd81ed74f1cdp-797 },
+ { 0x1.7cc835281bbf3p-803, 0x1.2d883a292df3bp-797 },
+ { 0x1.0044e6f2b903fp-803, 0x1.95fde403b5724p-798 },
+ { 0x1.58e66674c0f82p-804, 0x1.11494966870b7p-798 },
+ { 0x1.d0209514d613dp-805, 0x1.6fdef1ca550b3p-799 },
+ { 0x1.383f2f4495aedp-805, 0x1.ef217eb67d36dp-800 },
+ { 0x1.a41575f0363d6p-806, 0x1.4d2aaa5b8e28ap-800 },
+ { 0x1.1a8c12a0cae91p-806, 0x1.c04fcbf1fddd8p-801 },
+ { 0x1.7c08d08f2ccbbp-807, 0x1.2d96cdd2a30b8p-801 },
+ { 0x1.ff186c5b90604p-808, 0x1.95b8ba50a2687p-802 },
+ { 0x1.57a2b0b1c4c86p-808, 0x1.10df03cd711e3p-802 },
+ { 0x1.ce07ef98af2aep-809, 0x1.6eff939f51c8fp-803 },
+ { 0x1.36923c5eb270bp-809, 0x1.ed88d96607fb4p-804 },
+ { 0x1.a1791489717bfp-810, 0x1.4bcf1445c1d61p-804 },
+ { 0x1.188d2c2d680a3p-810, 0x1.be1a747b458c8p-805 },
+ { 0x1.7907312c7e255p-811, 0x1.2bd8dde16ba8ap-805 },
+ { 0x1.fa9e995f4c414p-812, 0x1.93089dc23e417p-806 },
+ { 0x1.5455df149c7b5p-812, 0x1.0ed4f34d6e965p-806 },
+ { 0x1.c93410e8142f8p-813, 0x1.6bf1c754a3325p-807 },
+ { 0x1.33105a5b594f7p-813, 0x1.e9027b1c5a4abp-808 },
+ { 0x1.9c67f441e11b3p-814, 0x1.487c687197597p-808 },
+ { 0x1.14e8ebae7496ep-814, 0x1.b942323a72767p-809 },
+ { 0x1.73d10c597b774p-815, 0x1.285660efb3e9ap-809 },
+ { 0x1.f330b99c7f9e7p-816, 0x1.8df9d62fb9c5ep-810 },
+ { 0x1.4f0ef77c81a6fp-816, 0x1.0b34677fe9486p-810 },
+ { 0x1.c1baedb5f2e65p-817, 0x1.66c37bb05de1ep-811 },
+ { 0x1.2dc9788ad9864p-817, 0x1.e1a30436bcde5p-812 },
+ { 0x1.94f913add4907p-818, 0x1.4341c90c553e7p-812 },
+ { 0x1.0fafd2c40ba27p-818, 0x1.b1dd0ffc5d04bp-813 },
+ { 0x1.6c7df995241d1p-819, 0x1.231f4a6757469p-813 },
+ { 0x1.e8f062cc963cep-820, 0x1.86a35930ed5e1p-814 },
+ { 0x1.47e5cbff0d92ep-820, 0x1.060dd236f49a3p-814 },
+ { 0x1.b7be34be4e18dp-821, 0x1.5f8c25cd122d7p-815 },
+ { 0x1.26d5559b935e7p-821, 0x1.d78bca82e9f37p-816 },
+ { 0x1.8b4dd6af9c05dp-822, 0x1.3c36d15093021p-816 },
+ { 0x1.08f94cfc79158p-822, 0x1.a80c62c44a65bp-817 },
+ { 0x1.632ec0e0d009cp-823, 0x1.1c4b11ed6627ap-817 },
+ { 0x1.dc0b5f2e40ea4p-824, 0x1.7d261cc2edf72p-818 },
+ { 0x1.3efa480ea698bp-824, 0x1.fef096f5252fp-819 },
+ { 0x1.ab6a5245de9e5p-825, 0x1.566c107178d1fp-819 },
+ { 0x1.1e52cde409267p-825, 0x1.cae9de8f00c0bp-820 },
+ { 0x1.7f910d0084829p-826, 0x1.337ae444bd293p-820 },
+ { 0x1.00e3012bd4171p-826, 0x1.9bfbcfe9dc1e8p-821 },
+ { 0x1.580c66bfc7cf5p-827, 0x1.13f803c0631d9p-821 },
+ { 0x1.ccba595fe34b5p-828, 0x1.71ac2109d33c9p-822 },
+ { 0x1.347383dcf4a9bp-828, 0x1.ef21caa7d80c3p-823 },
+ { 0x1.9cf52785fcd1fp-829, 0x1.4b8b6bbdb7a4fp-823 },
+ { 0x1.1466f7a4ba4b3p-829, 0x1.bbf4bcf8ca0c3p-824 },
+ { 0x1.71f5b701cb667p-830, 0x1.2934441fdae8bp-824 },
+ { 0x1.ef1fef5338f87p-831, 0x1.8de00a5d4cff3p-825 },
+ { 0x1.4b46ffc2e70ccp-831, 0x1.0a4a61359d63ap-825 },
+ { 0x1.bb3f3e667d5e5p-832, 0x1.64673b39bdd54p-826 },
+ { 0x1.287ea78b8278fp-832, 0x1.dcf3acd0cc1f4p-827 },
+ { 0x1.8c9c8347a2863p-833, 0x1.3f1926f0c2aa4p-827 },
+ { 0x1.093c166d47d9p-833, 0x1.aaecb94ca24e1p-828 },
+ { 0x1.62b5957e6b822p-834, 0x1.1d8efbbc88d6cp-828 },
+ { 0x1.da4f3c5b8c56fp-835, 0x1.7df554174928cp-829 },
+ { 0x1.3d1457a1afdaep-835, 0x1.fed6b4a9440a8p-830 },
+ { 0x1.a7e3665ffae25p-836, 0x1.558fae0fed7aap-830 },
+ { 0x1.1b4da97b89113p-836, 0x1.c8b307e047613p-831 },
+ { 0x1.7aa46b2ec675cp-837, 0x1.3149a005e5984p-831 },
+ { 0x1.fa00e080e536p-838, 0x1.9819329634547p-832 },
+ { 0x1.520f92dcad4a2p-838, 0x1.10bba52994e8ep-832 },
+ { 0x1.c3a9666328faap-839, 0x1.6c7dd2d93c0f9p-833 },
+ { 0x1.2dae795ce73b6p-839, 0x1.e70fd5d6d806dp-834 },
+ { 0x1.92f5963d343cfp-840, 0x1.45629dffe1fa7p-834 },
+ { 0x1.0d15f439254bep-840, 0x1.b2b2e959996bp-835 },
+ { 0x1.675546ac2c967p-841, 0x1.2255364dfcfd7p-835 },
+ { 0x1.dfca1ff236f02p-842, 0x1.83c6a3841fccap-836 },
+ { 0x1.4046155930cfbp-842, 0x1.02ee197efc99dp-836 },
+ { 0x1.ab8846c89a496p-843, 0x1.59bfc8bdbfffep-837 },
+ { 0x1.1d5226b496f7ep-843, 0x1.cd9f4c973304p-838 },
+ { 0x1.7cc7edd2bedd1p-844, 0x1.3420703d360eap-838 },
+ { 0x1.fc1e021531b11p-845, 0x1.9b4a6e4580455p-839 },
+ { 0x1.52f9fd29afa7bp-845, 0x1.1276cde31355ep-839 },
+ { 0x1.c439018f9e7bp-846, 0x1.6e44a0da72dedp-840 },
+ { 0x1.2d9d4a3bfacfap-846, 0x1.e8b82d35e9882p-841 },
+ { 0x1.9247c7d6b7109p-847, 0x1.4603c1a2de688p-841 },
+ { 0x1.0c3d4d5746632p-847, 0x1.b2e6fa531d555p-842 },
+ { 0x1.65add59367765p-848, 0x1.220b241172407p-842 },
+ { 0x1.dce1e8301e6efp-849, 0x1.82d28ae825549p-843 },
+ { 0x1.3dde18cb97a8dp-849, 0x1.01ea51e3f541cp-843 },
+ { 0x1.a7b31ccb0b2f4p-850, 0x1.57e3d8e31e749p-844 },
+ { 0x1.1a59798dd7aa2p-850, 0x1.ca77ce984ce61p-845 },
+ { 0x1.7843a7981f8e3p-851, 0x1.3192c63185ef2p-845 },
+ { 0x1.f55b0f3ffe463p-852, 0x1.974911a73b1a7p-846 },
+ { 0x1.4df9fe655b0fbp-852, 0x1.0f64b579273f6p-846 },
+ { 0x1.bce68ce6bcfedp-853, 0x1.69a3e1bad13dap-847 },
+ { 0x1.284bfe1cdea24p-853, 0x1.e1d6859c11527p-848 },
+ { 0x1.8a9c29acbf47dp-854, 0x1.40f425a16dca3p-848 },
+ { 0x1.06bd70b72892bp-854, 0x1.ab8633790b1e2p-849 },
+ { 0x1.5dd55c1a48477p-855, 0x1.1cb4a43b9229fp-849 },
+ { 0x1.d1bd6b173b9f2p-856, 0x1.7b25cc6523c3bp-850 },
+ { 0x1.35fc8451ff49ep-856, 0x1.f8db2dc70232bp-851 },
+ { 0x1.9c9712232f548p-857, 0x1.5014bc06e7f91p-851 },
+ { 0x1.128b47439dcd5p-857, 0x1.bf66ba3b9066cp-852 },
+ { 0x1.6d53d2be0a0b6p-858, 0x1.29c2c1dc958dbp-852 },
+ { 0x1.e6122171333dfp-859, 0x1.8c4a9d76af90fp-853 },
+ { 0x1.435229d0cc681p-859, 0x1.07ae5a7347d0bp-853 },
+ { 0x1.ae1371b74ea2dp-860, 0x1.5ed9539dfd0c9p-854 },
+ { 0x1.1e01427183001p-860, 0x1.d2c69c7599edcp-855 },
+ { 0x1.7c589442700ecp-861, 0x1.3677341a98a13p-855 },
+ { 0x1.f9be9e1d7b4e4p-862, 0x1.9cf2c5625685ep-856 },
+ { 0x1.5033c96eb757p-862, 0x1.1298aebe8af0fp-856 },
+ { 0x1.bef014f36ffa9p-863, 0x1.6d2655c8560ebp-857 },
+ { 0x1.290979be09b3bp-863, 0x1.e58166789d0bcp-858 },
+ { 0x1.8ac6ba86dcc3cp-864, 0x1.42b9e90b536b6p-858 },
+ { 0x1.064e638fb2517p-864, 0x1.acfe7e64002b1p-859 },
+ { 0x1.5c884857d8adep-865, 0x1.1d179e12ade6ep-859 },
+ { 0x1.cf0beaeb1b319p-866, 0x1.7ae01eb0f55cbp-860 },
+ { 0x1.338e29511ffcdp-866, 0x1.f772a9e0423a1p-861 },
+ { 0x1.9881a23b2ff9bp-867, 0x1.4e72e15f0f016p-861 },
+ { 0x1.0f43798c4f845p-867, 0x1.bc4e2f5a8c9afp-862 },
+ { 0x1.6836e63bd7d88p-868, 0x1.27165d875ec78p-862 },
+ { 0x1.de466f9c32fdap-869, 0x1.87eb54ae1860dp-863 },
+ { 0x1.3d79f883687bfp-869, 0x1.043b38d103ec9p-863 },
+ { 0x1.a56d48500b8a3p-870, 0x1.598a7d65e3b67p-864 },
+ { 0x1.17ac327f9b5e5p-870, 0x1.cac2d1ee89db1p-865 },
+ { 0x1.73278f241bb95p-871, 0x1.308090afcd9f3p-865 },
+ { 0x1.ec801820c3f3dp-872, 0x1.942d41e7bf2a3p-866 },
+ { 0x1.46b841565ab3ep-872, 0x1.0c34dc595f4bfp-866 },
+ { 0x1.b16ea850bfa34p-873, 0x1.63e9cb83e74b2p-867 },
+ { 0x1.1f76e44abf0ecp-873, 0x1.d83e5a3ffd7adp-868 },
+ { 0x1.7d432d7dd0ca1p-874, 0x1.39428e0fd00c5p-868 },
+ { 0x1.f99abec00b682p-875, 0x1.9f8c2eadfb109p-869 },
+ { 0x1.4f35579392d4bp-875, 0x1.13957092e7741p-869 },
+ { 0x1.bc6c19eee10e8p-876, 0x1.6d7ad6ac744f9p-870 },
+ { 0x1.2692d6adc530fp-876, 0x1.e4a41e3c393c2p-871 },
+ { 0x1.8673fad41c337p-877, 0x1.4149a31665d1ep-871 },
+ { 0x1.02bd066e6e446p-877, 0x1.a9efbad7c9909p-872 },
+ { 0x1.56dece3f159c3p-878, 0x1.1a4d14ca40e6p-872 },
+ { 0x1.c64dabfd6babdp-879, 0x1.7628f37011dc7p-873 },
+ { 0x1.2cf07ed3ac7cap-879, 0x1.efd93aae49244p-874 },
+ { 0x1.8ea5cdb1b77f8p-880, 0x1.4884565714d83p-874 },
+ { 0x1.0801f05da3babp-880, 0x1.b341347ab9d2ep-875 },
+ { 0x1.5da3ba0723cbcp-881, 0x1.204d0f497ca7dp-875 },
+ { 0x1.cefd7b19fc691p-882, 0x1.7de10a24a9be3p-876 },
+ { 0x1.3281b7ca3d771p-882, 0x1.f9c4f419d97b9p-877 },
+ { 0x1.95c663259c5d8p-883, 0x1.4ee2a6bb63f1dp-877 },
+ { 0x1.0c90568fe453bp-883, 0x1.bb6bea4d790c6p-878 },
+ { 0x1.6374ef6370a23p-884, 0x1.258802fee3a1bp-878 },
+ { 0x1.d668024e6e773p-885, 0x1.8491dcb50d65p-879 },
+ { 0x1.3739f6c74a992p-885, 0x1.012888bcf5e1bp-879 },
+ { 0x1.9bc5a2748239p-886, 0x1.5456466d99824p-880 },
+ { 0x1.105de86fb726ep-886, 0x1.c25d7813e5a28p-881 },
+ { 0x1.68453b252f9afp-887, 0x1.29f220ff323bdp-881 },
+ { 0x1.dc7c640bf856fp-888, 0x1.8a2c46b36447dp-882 },
+ { 0x1.3b0e7a2d8004dp-888, 0x1.04b5178932d9ep-882 },
+ { 0x1.a095d99893beap-889, 0x1.58d2d04dcdef9p-883 },
+ { 0x1.1361f24d04a1ep-889, 0x1.c8060b8a624d8p-884 },
+ { 0x1.6c0994513d45bp-890, 0x1.2d8154e3020f5p-884 },
+ { 0x1.e12caa0268707p-891, 0x1.8ea37661d565fp-885 },
+ { 0x1.3df6725a60cf5p-891, 0x1.078003d294269p-885 },
+ { 0x1.a42bf15180a09p-892, 0x1.5c4df6da1a5fp-886 },
+ { 0x1.15957e82800c6p-892, 0x1.cc58a0676d26ep-887 },
+ { 0x1.6eb9463d29a0dp-893, 0x1.302d6b1661efp-887 },
+ { 0x1.e46dfa81a2018p-894, 0x1.91ed1d851d1ddp-888 },
+ { 0x1.3feb236502138p-894, 0x1.0982d94421652p-888 },
+ { 0x1.a67f97b02e026p-895, 0x1.5ebfab91b4a2bp-889 },
+ { 0x1.16f37032d6085p-895, 0x1.cf4b3235443f5p-890 },
+ { 0x1.704e120e656fdp-896, 0x1.31f0304f01ddbp-890 },
+ { 0x1.e638c247f445dp-897, 0x1.940198fd0e1c2p-891 },
+ { 0x1.40e7ff18c854cp-897, 0x1.0ab8eaa8fae67p-891 },
+ { 0x1.a78b6039c7039p-898, 0x1.60223e0067b2cp-892 },
+ { 0x1.1778970df4481p-898, 0x1.d0d6e2f89dd66p-893 },
+ { 0x1.70c446e7535ccp-899, 0x1.32c589802b4bap-893 },
+ { 0x1.e688d1dc06742p-900, 0x1.94dc0e4e3bd62p-894 },
+ { 0x1.40eab69ffb357p-900, 0x1.0b1f64079cf15p-894 },
+ { 0x1.a74cd8f49285bp-901, 0x1.607271cb1c23p-895 },
+ { 0x1.1723bbb37e71p-901, 0x1.d0f815d3e30e4p-896 },
+ { 0x1.701ad03f5aba2p-902, 0x1.32ab83cb1b9aap-896 },
+ { 0x1.e55d6dd34aeb5p-903, 0x1.947a7e7d08e62p-897 },
+ { 0x1.3ff3437e5e592p-903, 0x1.0ab555a059592p-897 },
+ { 0x1.a5c493ec4b75bp-904, 0x1.5faf8b45ee11cp-898 },
+ { 0x1.15f5a46f2a8c5p-904, 0x1.cfae7d166a387p-899 },
+ { 0x1.6e533a1804da5p-905, 0x1.31a25c153692fp-899 },
+ { 0x1.e2b951ac76b4bp-906, 0x1.92ddcdd3a585ap-900 },
+ { 0x1.3e03e7aaf4a23p-906, 0x1.097bb793410b5p-900 },
+ { 0x1.a2f624fa2da41p-907, 0x1.5ddb524f58124p-901 },
+ { 0x1.13f112353b2e2p-907, 0x1.ccfd1b6b2b0d1p-902 },
+ { 0x1.6b71aaf8395acp-908, 0x1.2fac7e1ac1a55p-902 },
+ { 0x1.dea2a52e6f8d6p-909, 0x1.9009c068a7447p-903 },
+ { 0x1.3b2124c85eb7dp-909, 0x1.077566199da13p-903 },
+ { 0x1.9ee813dcc82f4p-910, 0x1.5afa0b60e30adp-904 },
+ { 0x1.111ab5ef7d9cep-910, 0x1.c8ea38207b48cp-905 },
+ { 0x1.677cd3ce598a2p-911, 0x1.2cce7b0334e93p-905 },
+ { 0x1.d922e485849dfp-912, 0x1.8c04eb792831bp-906 },
+ { 0x1.3751aaab95803p-912, 0x1.04a716678c7d9p-906 },
+ { 0x1.99a3c2eb312dfp-913, 0x1.571266fb205e7p-907 },
+ { 0x1.0d791e54efc95p-913, 0x1.c37f46c8a36cep-908 },
+ { 0x1.627dd610c1f2fp-914, 0x1.290ef7aa6784ep-908 },
+ { 0x1.d246bba093dddp-915, 0x1.86d89be61c44fp-909 },
+ { 0x1.329e3d8fc35e5p-915, 0x1.011744722e8f8p-909 },
+ { 0x1.93354aecb0f91p-916, 0x1.522d67c700dd9p-910 },
+ { 0x1.09149eae599f4p-916, 0x1.bcc8c2b79e5e6p-911 },
+ { 0x1.5c8020a89d6a7p-917, 0x1.247692feaf7c7p-911 },
+ { 0x1.ca1dd59404578p-918, 0x1.8090b25f1fb1cp-912 },
+ { 0x1.2d1194826d1d9p-918, 0x1.f99c33fa36826p-913 },
+ { 0x1.8bab4cd7bc185p-919, 0x1.4c563ff8738edp-913 },
+ { 0x1.03f72f0fa181cp-919, 0x1.b4d5ff233ee8bp-914 },
+ { 0x1.559144638d7d2p-920, 0x1.1f0fc4fe41aefp-914 },
+ { 0x1.c0baa10766979p-921, 0x1.793b75fbd2367p-915 },
+ { 0x1.26b830bbc4f33p-921, 0x1.efaa9eeaa4992p-916 },
+ { 0x1.8316ba6f8ef74p-922, 0x1.459a26ac43fcfp-916 },
+ { 0x1.fc588d5eeb3p-923, 0x1.abb8ece685efep-917 },
+ { 0x1.4dc0c0d42f863p-923, 0x1.18e6b704952c1p-917 },
+ { 0x1.b6320aea7077ap-924, 0x1.70e95e366ca95p-918 },
+ { 0x1.1fa02ebad6485p-924, 0x1.e4700e7fab75ep-919 },
+ { 0x1.798a96e59845bp-925, 0x1.3e0826243926dp-919 },
+ { 0x1.ef81624855ca5p-926, 0x1.a185d71d9ae78p-920 },
+ { 0x1.451fcaaed5e7p-926, 0x1.1209163a43d8ap-920 },
+ { 0x1.aa9b30dd7b333p-927, 0x1.67acd56555624p-921 },
+ { 0x1.17d9121b4ff43p-927, 0x1.d805487b20ec2p-922 },
+ { 0x1.6f1bb0c9eff18p-928, 0x1.35b0e3e76f72ap-922 },
+ { 0x1.e184bec96bcc5p-929, 0x1.965317fc3f8ebp-923 },
+ { 0x1.3bc10ccdff1d7p-929, 0x1.0a85e11600392p-923 },
+ { 0x1.9e0f0cdf83a76p-930, 0x1.5d99f4f4fa7a2p-924 },
+ { 0x1.0f738d3253e75p-930, 0x1.ca8538b911cc2p-925 },
+ { 0x1.63e056b37b486p-931, 0x1.2ca663e8f6c6ep-925 },
+ { 0x1.d2806afda0512p-932, 0x1.8a38c763ae5p-926 },
+ { 0x1.31b865207923bp-932, 0x1.026d30f31261ep-926 },
+ { 0x1.90a81bef15367p-933, 0x1.52c63cbe5201dp-927 },
+ { 0x1.068145905baddp-933, 0x1.bc0c903e2dd51p-928 },
+ { 0x1.57f0081c7461bp-934, 0x1.22fbc7eb40c8ep-928 },
+ { 0x1.c293abfeb81c1p-935, 0x1.7d5064d5d2e6ap-929 },
+ { 0x1.271a9ed146425p-935, 0x1.f3a001a1da12ap-930 },
+ { 0x1.8282015bfd093p-936, 0x1.474846e880b8p-930 },
+ { 0x1.fa292d1f4b615p-937, 0x1.acb96019278e3p-931 },
+ { 0x1.4b6323fa7fafcp-937, 0x1.18c50c637e437p-931 },
+ { 0x1.b1ded81f6cf48p-938, 0x1.6fb47e7243b1p-932 },
+ { 0x1.1bfd2aff12d23p-938, 0x1.e17fe4af1cdcdp-933 },
+ { 0x1.73b9288cf980bp-939, 0x1.3b3779cd081bcp-933 },
+ { 0x1.e680a6315c8f9p-940, 0x1.9caab20737c4bp-934 },
+ { 0x1.3e52969a46a03p-940, 0x1.0e16c42489121p-934 },
+ { 0x1.a082ea93d471fp-941, 0x1.618056ad2fa0dp-935 },
+ { 0x1.1075d9566cab2p-941, 0x1.ce9e247afa7efp-936 },
+ { 0x1.646a66f6fb197p-942, 0x1.2eabb9557e4c3p-936 },
+ { 0x1.d22f0f82317a8p-943, 0x1.8c0020c90fd02p-937 },
+ { 0x1.30d7883df3e07p-943, 0x1.0305d4157bdecp-937 },
+ { 0x1.8ea1187daf8b3p-944, 0x1.52cf8a69cbdeep-938 },
+ { 0x1.049a91d747c02p-944, 0x1.bb1f3a4ce848cp-939 },
+ { 0x1.54b29ff375e83p-945, 0x1.21bd19407d3a8p-939 },
+ { 0x1.bd5a7cbaf896dp-946, 0x1.7ad97206eb3e9p-940 },
+ { 0x1.230b0dec754dap-946, 0x1.ef4e6059f1fe4p-941 },
+ { 0x1.7c5a693980a4p-947, 0x1.43bdb9112e65bp-941 },
+ { 0x1.f10221f87a1cap-948, 0x1.a7278c0b2c815p-942 },
+ { 0x1.44ae6c097e3b8p-948, 0x1.148391a9b5b7p-942 },
+ { 0x1.a8288818abb4p-949, 0x1.69563388e87eep-943 },
},
-
-/* Coefficients for each order 12 polynomial on each of the 20 intervals. */
-.poly = {
- {0x1.ffffffffffff6p-1, -0x1.20dd750429b66p0, 0x1.fffffffffffdcp-1,
- -0x1.812746b03713ap-1, 0x1.ffffffffbe94cp-2, -0x1.341f6bb6ec9a6p-2,
- 0x1.555553a70ec2ep-3, -0x1.6023b4617a388p-4, 0x1.5550f0e40bfbap-5,
- -0x1.38c290c0c8de8p-6, 0x1.0e84002c6274ep-7, -0x1.a599eb0ac5d04p-9,
- 0x1.c9bfafa73899cp-11},
- {0x1.a2b43dbd503c8p-1, -0x1.a3495b7c9e6a4p-1, 0x1.535f3fb8cb92ap-1,
- -0x1.d96ee9c714f44p-2, 0x1.26956676d2c64p-2, -0x1.4e2820da90c08p-3,
- 0x1.5ea0cffac775ap-4, -0x1.57fb82ca373e8p-5, 0x1.3e0e8f48ba0f8p-6,
- -0x1.16a695af1bbd4p-7, 0x1.cc836241a87d4p-9, -0x1.531de41264fdap-10,
- 0x1.526a8a14e9bfcp-12},
- {0x1.532e75821ed48p-1, -0x1.28be350460782p-1, 0x1.b08873adbf108p-2,
- -0x1.14377569249e2p-2, 0x1.3e1ece8cd10dap-3, -0x1.5087e2e6dc2e8p-4,
- 0x1.4b3adb3bb335ap-5, -0x1.32342d711a4f4p-6, 0x1.0bc4f6ce2b656p-7,
- -0x1.bcdaa331f2144p-9, 0x1.5c21c9e0ca954p-10, -0x1.dfdc9b3b5c402p-12,
- 0x1.b451af7dd52fep-14},
- {0x1.10f9745a4f44ap-1, -0x1.9b03213e6963ap-2, 0x1.09b942bc8de66p-2,
- -0x1.32755394481e4p-3, 0x1.42819b18af0e4p-4, -0x1.3a6d643aaa572p-5,
- 0x1.1f17897603eaep-6, -0x1.eefb8d3f89d42p-8, 0x1.95559544f2fbp-9,
- -0x1.3c2a67c33338p-10, 0x1.cffa784efe6cp-12, -0x1.282646774689cp-13,
- 0x1.e654e67532b44p-16},
- {0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c04dp-2, 0x1.3c27283c328dbp-3,
- -0x1.44837f88ea4bdp-4, 0x1.33cad0e887482p-5, -0x1.10fcf0bc8963cp-6,
- 0x1.c8cb68153ec42p-8, -0x1.6aef9a9842c54p-9, 0x1.1334345d6467cp-10,
- -0x1.8ebe8763a2a8cp-12, 0x1.0f457219dec0dp-13, -0x1.3d2501dcd2a0fp-15,
- 0x1.d213a128a75c9p-18},
- {0x1.5ee444130b7dbp-2, -0x1.78396ab208478p-3, 0x1.6e617ec5c0cc3p-4,
- -0x1.49e60f63656b5p-5, 0x1.16064fddbbcb9p-6, -0x1.ba80af6a31018p-8,
- 0x1.4ec374269d4ecp-9, -0x1.e40be960703a4p-11, 0x1.4fb029f35a144p-12,
- -0x1.be45fd71a60eap-14, 0x1.161235cd2a3e7p-15, -0x1.264890eb1b5ebp-17,
- 0x1.7f90154bde15dp-20},
- {0x1.19a22c064d4eap-2, -0x1.f645498cae217p-4, 0x1.a0565950e3f08p-5,
- -0x1.446605c21c178p-6, 0x1.df1231d75622fp-8, -0x1.515167553de25p-9,
- 0x1.c72c1b4a2a57fp-11, -0x1.276ae9394ecf1p-12, 0x1.71d2696d6c8c3p-14,
- -0x1.bd4152984ce1dp-16, 0x1.f5afd2b450df7p-18, -0x1.dafdaddc7f943p-20,
- 0x1.1020f4741f79ep-22},
- {0x1.c57f0542a7637p-3, -0x1.4e5535c17afc8p-4, 0x1.d312725242824p-6,
- -0x1.3727cbc12a4bbp-7, 0x1.8d6730fc45b6bp-9, -0x1.e8855055c9b53p-11,
- 0x1.21f73b70cc792p-12, -0x1.4d4fe06f13831p-14, 0x1.73867a82f7484p-16,
- -0x1.8fab204d1d75ep-18, 0x1.91d9ba10367f4p-20, -0x1.5077ce4b334ddp-22,
- 0x1.501716d098f14p-25},
- {0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b135p-5, 0x1.043fe1a989f11p-6,
- -0x1.259061b98cf96p-8, 0x1.409cc2b1c4fc2p-10, -0x1.53dec152f6abfp-12,
- 0x1.5e72cb4cc919fp-14, -0x1.6018b68100642p-16, 0x1.58d859380fb24p-18,
- -0x1.471723286dad5p-20, 0x1.21c1a0f7a6593p-22, -0x1.a872678d91154p-25,
- 0x1.6eb74e2e99662p-28},
- {0x1.29a8a4e95063ep-3, -0x1.29a8a316d3318p-5, 0x1.21876b3fe4f84p-7,
- -0x1.1276f2d8ee36cp-9, 0x1.fbff52181a454p-12, -0x1.cb9ce9bde195ep-14,
- 0x1.9710786fa90c5p-16, -0x1.6145ad5b471dcp-18, 0x1.2c52fac57009cp-20,
- -0x1.f02a8711f07cfp-23, 0x1.7eb574960398cp-25, -0x1.e58ce325343aap-28,
- 0x1.68510d1c32842p-31},
- {0x1.e583024e2bc8p-4, -0x1.8fb458acb5b0fp-6, 0x1.42b9dffac2531p-8,
- -0x1.ff9fe9a553dddp-11, 0x1.8e7e86883ba0bp-13, -0x1.313af0bb12375p-15,
- 0x1.cc29ccb17372ep-18, -0x1.55895fbb1ae42p-20, 0x1.f2bd2d6c7fd07p-23,
- -0x1.62ec031844613p-25, 0x1.d7d69ce7c1847p-28, -0x1.0106b95e4db03p-30,
- 0x1.45aabbe505f6ap-34},
- {0x1.8d9cbafa30408p-4, -0x1.0dd14614ed20fp-6, 0x1.6943976ea9dcap-9,
- -0x1.dd6f05f4d7ce8p-12, 0x1.37891334aa621p-14, -0x1.91a8207766e1ep-17,
- 0x1.ffcb0c613d75cp-20, -0x1.425116a6c88dfp-22, 0x1.90cb7c902d428p-25,
- -0x1.e70fc740c3b6dp-28, 0x1.14a09ae5851ep-30, -0x1.00f9e03eae993p-33,
- 0x1.14989aac741c2p-37},
- {0x1.46dc6bf900f68p-4, -0x1.6e4b45246f8dp-7, 0x1.96a3de47cfdb5p-10,
- -0x1.bf5070eb6823bp-13, 0x1.e7af6e4aa8ef8p-16, -0x1.078bf26142831p-18,
- 0x1.1a6e547aa40bep-21, -0x1.2c1c68f62f614p-24, 0x1.3bb8b473dd9e7p-27,
- -0x1.45576cacb45a1p-30, 0x1.39ab71899b44ep-33, -0x1.ee307d46e2866p-37,
- 0x1.c21ba1b404f5ap-41},
- {0x1.0d9a17e032288p-4, -0x1.f3e942ff4e097p-8, 0x1.cc77f09db5af8p-11,
- -0x1.a56e8bffaab5cp-14, 0x1.7f49e36974e03p-17, -0x1.5a73fc0025d2fp-20,
- 0x1.3742ae06a8be6p-23, -0x1.15ecf5317789bp-26, 0x1.ec74dd2b109fp-30,
- -0x1.ac28325f88dc1p-33, 0x1.5ca9e8d7841b2p-36, -0x1.cfef04667185fp-40,
- 0x1.6487c50052867p-44},
- {0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cb33p-8, 0x1.0645980ec8568p-11,
- -0x1.8f86f88695a8cp-15, 0x1.2ef80cb1dca7cp-18, -0x1.c97ff7c599a6dp-22,
- 0x1.57f0ac907d436p-25, -0x1.016be8d812c69p-28, 0x1.7ef6d33c73b75p-32,
- -0x1.17f9784eda0d4p-35, 0x1.7fd8662b486f1p-39, -0x1.ae21758156d89p-43,
- 0x1.165732f1ae138p-47},
- {0x1.71eafbd9f5877p-5, -0x1.d83714d904525p-9, 0x1.2c74dbaccea28p-12,
- -0x1.7d27f3cdea565p-16, 0x1.e20b13581fcf8p-20, -0x1.2fe336f089679p-23,
- 0x1.7dfce36129db3p-27, -0x1.dea026ee03f14p-31, 0x1.2a6019f7c64b1p-34,
- -0x1.6e0eeb9f98eeap-38, 0x1.a58b4ed07d741p-42, -0x1.8d12c77071e4cp-46,
- 0x1.b0241c6d5b761p-51},
- {0x1.33714a024097ep-5, -0x1.467f441a50cbdp-9, 0x1.59fa2994d0e65p-13,
- -0x1.6dd369d9306cap-17, 0x1.81fb2b2af9413p-21, -0x1.96604d3c1bb6ep-25,
- 0x1.aaef2da14243p-29, -0x1.bf7f1b935d3ebp-33, 0x1.d3261ebcd2061p-37,
- -0x1.e04c803bbd875p-41, 0x1.cff98a43bacdep-45, -0x1.6ef39a63cf675p-49,
- 0x1.4f8abb4398a0dp-54},
- {0x1.fff97acd75487p-6, -0x1.c502e8e46ec0cp-10, 0x1.903b0650672eap-14,
- -0x1.6110aa5fb096fp-18, 0x1.36fd4c3e4040cp-22, -0x1.118489fe28728p-26,
- 0x1.e06601208ac47p-31, -0x1.a52b90c21650ap-35, 0x1.6ffc42c05429bp-39,
- -0x1.3ce3322a6972ep-43, 0x1.009d8ef37ff8cp-47, -0x1.5498d2cc51c99p-52,
- 0x1.058cd4ea9bf04p-57},
- {0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf97dp-10, 0x1.d0ddfb8593f4p-15,
- -0x1.5673f4aa86542p-19, 0x1.f8048954325f6p-24, -0x1.72839959ab3e9p-28,
- 0x1.101597113be2ap-32, -0x1.8f1cf0ff4adeep-37, 0x1.23dca407fd66p-41,
- -0x1.a4f387e57a6a5p-46, 0x1.1dafd753f65e9p-50, -0x1.3e15343c973d6p-55,
- 0x1.9a2af47d77e44p-61},
- {0x1.64839d636f92bp-6, -0x1.b7adf7536232dp-11, 0x1.0eec0b6357148p-15,
- -0x1.4da09b7f2c52bp-20, 0x1.9a8b146de838ep-25, -0x1.f8d1f145e7b6fp-30,
- 0x1.3624435b3ba11p-34, -0x1.7cba19b4af977p-39, 0x1.d2282481ba91ep-44,
- -0x1.198c1e91f9564p-48, 0x1.4046224f8ccp-53, -0x1.2b1dc676c096fp-58,
- 0x1.43d3358c64dafp-64}
-}
};
diff --git a/pl/math/erfcf.h b/pl/math/erfcf.h
deleted file mode 100644
index 8f1e5f4226e3..000000000000
--- a/pl/math/erfcf.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Shared functions for scalar and vector single-precision erfc(x) functions.
- *
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef PL_MATH_ERFCF_H
-#define PL_MATH_ERFCF_H
-
-#include "math_config.h"
-
-#define FMA fma
-#include "estrin_wrap.h"
-
-/* Accurate exponential from optimized-routines. */
-double
-__exp_dd (double x, double xtail);
-
-static inline double
-eval_poly (double z, const double *coeff)
-{
- double z2 = z * z;
- double z4 = z2 * z2;
- double z8 = z4 * z4;
-#define C(i) coeff[i]
- return ESTRIN_15 (z, z2, z4, z8, C);
-#undef C
-}
-
-static inline double
-eval_exp_mx2 (double x)
-{
- return __exp_dd (-(x * x), 0.0);
-}
-
-#undef FMA
-#endif // PL_MATH_ERFCF_H
diff --git a/pl/math/erfcf_1u7.c b/pl/math/erfcf_1u7.c
new file mode 100644
index 000000000000..c8ce95cca058
--- /dev/null
+++ b/pl/math/erfcf_1u7.c
@@ -0,0 +1,103 @@
+/*
+ * Single-precision erfc(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define Shift 0x1p17f
+#define OneThird 0x1.555556p-2f
+#define TwoThird 0x1.555556p-1f
+
+#define TwoOverFifteen 0x1.111112p-3f
+#define TwoOverFive 0x1.99999ap-2f
+#define Tenth 0x1.99999ap-4f
+
+#define SignMask 0x7fffffff /* All bits except the sign bit: (ix & SignMask) is |x|, (ix & ~SignMask) is the sign bit. */
+
+/* Fast erfcf approximation based on series expansion near x rounded to
+ nearest multiple of 1/64.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+ poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+
+ Values of erfc(r) and scale are read from lookup tables. Stored values
+ are scaled to avoid hitting the subnormal range.
+
+ Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+
+ Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
+ erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
+ want 0x1.f51216p-120. */
+float
+erfcf (float x)
+{
+ /* Split the bit pattern of x into its magnitude (ia) and sign bit (sign). */
+ uint32_t ix = asuint (x);
+ uint32_t ia = ix & SignMask;
+ uint32_t sign = ix & ~SignMask;
+
+ /* |x| < 0x1.0p-26 => accurate to 0.5 ULP (top12(0x1p-26) = 0x328). */
+ if (unlikely (ia < 0x32800000))
+ return 1.0f - x; /* Small case. */
+
+ /* For |x| < 10.0625 (bit pattern 0x41210000), the following approximation holds. */
+ if (likely (ia < 0x41210000))
+ {
+ /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale
+ to 2/sqrt(pi), when x reduced to r = 0. */
+ float a = asfloat (ia);
+ float z = a + Shift; /* Adding 2^17 rounds a to a multiple of 1/64. */
+ uint32_t i = asuint (z) - asuint (Shift); /* Table index, i.e. r * 64. */
+ float r = z - Shift;
+
+ /* Table values are stored scaled by 2^47; the 2^-47 factor below undoes it. */
+ float erfcr = __erfcf_data.tab[i].erfc;
+ float scale = __erfcf_data.tab[i].scale;
+
+ /* erfc(x) ~ erfc(r) - scale * d * poly (r, d). */
+ float d = a - r;
+ float d2 = d * d;
+ float r2 = r * r;
+ float p1 = -r;
+ float p2 = fmaf (TwoThird, r2, -OneThird);
+ float p3 = -r * fmaf (OneThird, r2, -0.5f);
+ float p4 = fmaf (fmaf (TwoOverFifteen, r2, -TwoOverFive), r2, Tenth);
+ float y = fmaf (p4, d, p3);
+ y = fmaf (y, d, p2);
+ y = fmaf (y, d, p1);
+ y = fmaf (-fmaf (y, d2, d), scale, erfcr);
+ /* Handle sign and scale back in a single fma. */
+ float off = asfloat (sign >> 1); /* 2.0f when x is negative, 0.0f otherwise. */
+ float fac = asfloat (asuint (0x1p-47f) | sign); /* 2^-47 carrying the sign of x. */
+ y = fmaf (y, fac, off);
+ /* The underflow exception needs to be signaled explicitly when
+ result gets into subnormal range. */
+ if (x >= 0x1.2639cp+3f)
+ force_eval_float (opt_barrier_float (0x1p-123f) * 0x1p-123f);
+ return y;
+ }
+
+ /* erfcf(nan)=nan, erfcf(+inf)=0 and erfcf(-inf)=2. */
+ if (unlikely (ia >= 0x7f800000))
+ return asfloat (sign >> 1) + 1.0f / x; /* Special cases. */
+
+ /* Above this threshold erfcf is constant and needs to raise underflow
+ exception for positive x. */
+ return sign ? 2.0f : __math_uflowf (0);
+}
+
+PL_SIG (S, F, 1, erfc, -4.0, 10.0)
+PL_TEST_ULP (erfcf, 1.14)
+PL_TEST_SYM_INTERVAL (erfcf, 0, 0x1p-26, 40000)
+PL_TEST_INTERVAL (erfcf, 0x1p-26, 10.0625, 40000)
+PL_TEST_INTERVAL (erfcf, -0x1p-26, -4.0, 40000)
+PL_TEST_INTERVAL (erfcf, 10.0625, inf, 40000)
+PL_TEST_INTERVAL (erfcf, -4.0, -inf, 40000)
diff --git a/pl/math/erfcf_2u.c b/pl/math/erfcf_2u.c
deleted file mode 100644
index 5a3f9b00aa5c..000000000000
--- a/pl/math/erfcf_2u.c
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Single-precision erfc(x) function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "erfcf.h"
-#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#define P(i) __erfcf_poly_data.poly[i]
-
-/* Approximation of erfcf for |x| > 4.0. */
-static inline float
-approx_erfcf_hi (float x, uint32_t sign, const double *coeff)
-{
- if (sign)
- {
- return 2.0f;
- }
-
- /* Polynomial contribution. */
- double z = (double) fabs (x);
- float p = (float) eval_poly (z, coeff);
- /* Gaussian contribution. */
- float e_mx2 = (float) eval_exp_mx2 (z);
-
- return p * e_mx2;
-}
-
-/* Approximation of erfcf for |x| < 4.0. */
-static inline float
-approx_erfcf_lo (float x, uint32_t sign, const double *coeff)
-{
- /* Polynomial contribution. */
- double z = (double) fabs (x);
- float p = (float) eval_poly (z, coeff);
- /* Gaussian contribution. */
- float e_mx2 = (float) eval_exp_mx2 (z);
-
- if (sign)
- return fmaf (-p, e_mx2, 2.0f);
- else
- return p * e_mx2;
-}
-
-/* Top 12 bits of a float (sign and exponent bits). */
-static inline uint32_t
-abstop12 (float x)
-{
- return (asuint (x) >> 20) & 0x7ff;
-}
-
-/* Top 12 bits of a float. */
-static inline uint32_t
-top12 (float x)
-{
- return asuint (x) >> 20;
-}
-
-/* Fast erfcf approximation using polynomial approximation
- multiplied by gaussian.
- Most of the computation is carried out in double precision,
- and is very sensitive to accuracy of polynomial and exp
- evaluation.
- Worst-case error is 1.968ulps, obtained for x = 2.0412941.
- erfcf(0x1.05492p+1) got 0x1.fe10f6p-9 want 0x1.fe10f2p-9 ulp
- err 1.46788. */
-float
-erfcf (float x)
-{
- /* Get top words and sign. */
- uint32_t ix = asuint (x); /* We need to compare at most 32 bits. */
- uint32_t sign = ix >> 31;
- uint32_t ia12 = top12 (x) & 0x7ff;
-
- /* Handle special cases and small values with a single comparison:
- abstop12(x)-abstop12(small) >= abstop12(INFINITY)-abstop12(small)
-
- Special cases
- erfcf(nan)=nan, erfcf(+inf)=0 and erfcf(-inf)=2
-
- Errno
- EDOM does not have to be set in case of erfcf(nan).
- Only ERANGE may be set in case of underflow.
-
- Small values (|x|<small)
- |x|<0x1.0p-26 => accurate to 0.5 ULP (top12(0x1p-26) = 0x328). */
- if (unlikely (abstop12 (x) - 0x328 >= (abstop12 (INFINITY) & 0x7f8) - 0x328))
- {
- if (abstop12 (x) >= 0x7f8)
- return (float) (sign << 1) + 1.0f / x; /* Special cases. */
- else
- return 1.0f - x; /* Small case. */
- }
-
- /* Normalized numbers divided in 4 intervals
- with bounds: 2.0, 4.0, 8.0 and 10.0. 10 was chosen as the upper bound for
- the interesting region as it is the smallest value, representable as a
- 12-bit integer, for which returning 0 gives <1.5 ULP. */
- if (ia12 < 0x400)
- {
- return approx_erfcf_lo (x, sign, P (0));
- }
- if (ia12 < 0x408)
- {
- return approx_erfcf_lo (x, sign, P (1));
- }
- if (ia12 < 0x410)
- {
- return approx_erfcf_hi (x, sign, P (2));
- }
- if (ia12 < 0x412)
- {
- return approx_erfcf_hi (x, sign, P (3));
- }
- if (sign)
- {
- return 2.0f;
- }
- return __math_uflowf (0);
-}
-
-PL_SIG (S, F, 1, erfc, -4.0, 10.0)
-PL_TEST_ULP (erfcf, 1.5)
-PL_TEST_INTERVAL (erfcf, 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (erfcf, 0x1p-127, 0x1p-26, 40000)
-PL_TEST_INTERVAL (erfcf, -0x1p-127, -0x1p-26, 40000)
-PL_TEST_INTERVAL (erfcf, 0x1p-26, 0x1p5, 40000)
-PL_TEST_INTERVAL (erfcf, -0x1p-26, -0x1p3, 40000)
-PL_TEST_INTERVAL (erfcf, 0, inf, 40000)
diff --git a/pl/math/erfcf_data.c b/pl/math/erfcf_data.c
index 2e018c8c6710..a54e11973819 100644
--- a/pl/math/erfcf_data.c
+++ b/pl/math/erfcf_data.c
@@ -1,57 +1,664 @@
/*
* Data used in single-precision erfc(x) function.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-/* Polynomial coefficients for approximating erfc(x)*exp(x*x) in double
- precision. Generated using the Remez algorithm on each interval separately
- (see erfcf.sollya for more detail). */
-const struct erfcf_poly_data __erfcf_poly_data
- = {.poly
- = {{
-#if ERFCF_POLY_NCOEFFS == 16
- 0x1.ffffffffe7c59p-1, -0x1.20dd74f8cecc5p0, 0x1.fffffc67a0fbdp-1,
- -0x1.81270c3ced2d6p-1, 0x1.fffc0c6606e45p-2, -0x1.340a779e8a8e3p-2,
- 0x1.54c1663fc5a01p-3, -0x1.5d468c9269dafp-4, 0x1.4afe6b00df9d5p-5,
- -0x1.1d22d2720cb91p-6, 0x1.afa399a5761b1p-8, -0x1.113851b5858adp-9,
- 0x1.0f992e4d5c6a4p-11, -0x1.86534d558052ap-14, 0x1.63e537bfb7cd5p-17,
- -0x1.32712a6275c4dp-21
-#endif
+/* Lookup table used in erfcf.
+ For each possible rounded input r (multiples of 1/64), between
+ r = 0.0 and r = 10.0625 (645 values):
+ - the first entry __erfcf_data.tab.erfc contains the values of erfc(r),
+ - the second entry __erfcf_data.tab.scale contains the values of
+ 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
+ they are scaled by a large enough value 2^47 (fits in 8 bits). */
+const struct erfcf_data __erfcf_data = {
+ .tab = { { 0x1p47, 0x1.20dd76p47 },
+ { 0x1.f6f944p46, 0x1.20cb68p47 },
+ { 0x1.edf3aap46, 0x1.209546p47 },
+ { 0x1.e4f05p46, 0x1.203b26p47 },
+ { 0x1.dbf056p46, 0x1.1fbd28p47 },
+ { 0x1.d2f4dcp46, 0x1.1f1b7ap47 },
+ { 0x1.c9fefep46, 0x1.1e565cp47 },
+ { 0x1.c10fd4p46, 0x1.1d6e14p47 },
+ { 0x1.b8287ap46, 0x1.1c62fap47 },
+ { 0x1.af4ap46, 0x1.1b3572p47 },
+ { 0x1.a6757ep46, 0x1.19e5eap47 },
+ { 0x1.9dabfcp46, 0x1.1874dep47 },
+ { 0x1.94ee88p46, 0x1.16e2d8p47 },
+ { 0x1.8c3e24p46, 0x1.153068p47 },
+ { 0x1.839bd6p46, 0x1.135e3p47 },
+ { 0x1.7b0894p46, 0x1.116cd8p47 },
+ { 0x1.728558p46, 0x1.0f5d16p47 },
+ { 0x1.6a1312p46, 0x1.0d2fa6p47 },
+ { 0x1.61b2acp46, 0x1.0ae55p47 },
+ { 0x1.596508p46, 0x1.087ee4p47 },
+ { 0x1.512b06p46, 0x1.05fd3ep47 },
+ { 0x1.49057ap46, 0x1.03614p47 },
+ { 0x1.40f536p46, 0x1.00abdp47 },
+ { 0x1.38fbp46, 0x1.fbbbbep46 },
+ { 0x1.311796p46, 0x1.f5f0cep46 },
+ { 0x1.294bb4p46, 0x1.eff8c4p46 },
+ { 0x1.21980ap46, 0x1.e9d5a8p46 },
+ { 0x1.19fd3ep46, 0x1.e38988p46 },
+ { 0x1.127bf2p46, 0x1.dd167cp46 },
+ { 0x1.0b14bcp46, 0x1.d67ea2p46 },
+ { 0x1.03c82ap46, 0x1.cfc41ep46 },
+ { 0x1.f92d8cp45, 0x1.c8e91cp46 },
+ { 0x1.eb0214p45, 0x1.c1efcap46 },
+ { 0x1.dd0edap45, 0x1.bada5ap46 },
+ { 0x1.cf54b4p45, 0x1.b3aafcp46 },
+ { 0x1.c1d46ap45, 0x1.ac63e8p46 },
+ { 0x1.b48eaep45, 0x1.a5074ep46 },
+ { 0x1.a78428p45, 0x1.9d9762p46 },
+ { 0x1.9ab566p45, 0x1.96165p46 },
+ { 0x1.8e22eap45, 0x1.8e8646p46 },
+ { 0x1.81cd24p45, 0x1.86e96ap46 },
+ { 0x1.75b47p45, 0x1.7f41dcp46 },
+ { 0x1.69d91ep45, 0x1.7791b8p46 },
+ { 0x1.5e3b66p45, 0x1.6fdb12p46 },
+ { 0x1.52db78p45, 0x1.681ff2p46 },
+ { 0x1.47b96ep45, 0x1.60625cp46 },
+ { 0x1.3cd554p45, 0x1.58a446p46 },
+ { 0x1.322f26p45, 0x1.50e79ep46 },
+ { 0x1.27c6d2p45, 0x1.492e42p46 },
+ { 0x1.1d9c34p45, 0x1.417a0cp46 },
+ { 0x1.13af1ep45, 0x1.39ccc2p46 },
+ { 0x1.09ff5p45, 0x1.32281ep46 },
+ { 0x1.008c8p45, 0x1.2a8dcep46 },
+ { 0x1.eeaca8p44, 0x1.22ff72p46 },
+ { 0x1.dcb8cap44, 0x1.1b7e98p46 },
+ { 0x1.cb3c86p44, 0x1.140cc4p46 },
+ { 0x1.ba36dap44, 0x1.0cab62p46 },
+ { 0x1.a9a6bap44, 0x1.055bd6p46 },
+ { 0x1.998afap44, 0x1.fc3ee6p45 },
+ { 0x1.89e25ep44, 0x1.edeeeep45 },
+ { 0x1.7aab98p44, 0x1.dfca26p45 },
+ { 0x1.6be542p44, 0x1.d1d2dp45 },
+ { 0x1.5d8decp44, 0x1.c40b08p45 },
+ { 0x1.4fa40ep44, 0x1.b674c8p45 },
+ { 0x1.422616p44, 0x1.a911fp45 },
+ { 0x1.351262p44, 0x1.9be438p45 },
+ { 0x1.28674p44, 0x1.8eed36p45 },
+ { 0x1.1c22f8p44, 0x1.822e66p45 },
+ { 0x1.1043c2p44, 0x1.75a91ap45 },
+ { 0x1.04c7cap44, 0x1.695e8cp45 },
+ { 0x1.f35a72p43, 0x1.5d4fd4p45 },
+ { 0x1.dde456p43, 0x1.517de6p45 },
+ { 0x1.c9296cp43, 0x1.45e99cp45 },
+ { 0x1.b525d6p43, 0x1.3a93b2p45 },
+ { 0x1.a1d5a6p43, 0x1.2f7cc4p45 },
+ { 0x1.8f34eap43, 0x1.24a554p45 },
+ { 0x1.7d3fa6p43, 0x1.1a0dc6p45 },
+ { 0x1.6bf1dcp43, 0x1.0fb662p45 },
+ { 0x1.5b4784p43, 0x1.059f5ap45 },
+ { 0x1.4b3c98p43, 0x1.f79184p44 },
+ { 0x1.3bcd14p43, 0x1.e4653p44 },
+ { 0x1.2cf4eep43, 0x1.d1b982p44 },
+ { 0x1.1eb024p43, 0x1.bf8e1cp44 },
+ { 0x1.10fab8p43, 0x1.ade26cp44 },
+ { 0x1.03d0acp43, 0x1.9cb5bep44 },
+ { 0x1.ee5c18p42, 0x1.8c0732p44 },
+ { 0x1.d61dd6p42, 0x1.7bd5c8p44 },
+ { 0x1.bedec8p42, 0x1.6c2056p44 },
+ { 0x1.a8973cp42, 0x1.5ce596p44 },
+ { 0x1.933f9p42, 0x1.4e241ep44 },
+ { 0x1.7ed03ap42, 0x1.3fda6cp44 },
+ { 0x1.6b41ccp42, 0x1.3206dcp44 },
+ { 0x1.588cf2p42, 0x1.24a7b8p44 },
+ { 0x1.46aa72p42, 0x1.17bb2cp44 },
+ { 0x1.359332p42, 0x1.0b3f52p44 },
+ { 0x1.254038p42, 0x1.fe646p43 },
+ { 0x1.15aaa8p42, 0x1.e72372p43 },
+ { 0x1.06cbcap42, 0x1.d0b7ap43 },
+ { 0x1.f13a04p41, 0x1.bb1c98p43 },
+ { 0x1.d62fbep41, 0x1.a64de6p43 },
+ { 0x1.bc6c1ep41, 0x1.92470ap43 },
+ { 0x1.a3e2ccp41, 0x1.7f036cp43 },
+ { 0x1.8c87b8p41, 0x1.6c7e64p43 },
+ { 0x1.764f2p41, 0x1.5ab342p43 },
+ { 0x1.612d8ap41, 0x1.499d48p43 },
+ { 0x1.4d17cap41, 0x1.3937b2p43 },
+ { 0x1.3a03p41, 0x1.297dbap43 },
+ { 0x1.27e498p41, 0x1.1a6a96p43 },
+ { 0x1.16b24cp41, 0x1.0bf97ep43 },
+ { 0x1.066222p41, 0x1.fc4b5ep42 },
+ { 0x1.edd4d2p40, 0x1.e1d4dp42 },
+ { 0x1.d08382p40, 0x1.c885ep42 },
+ { 0x1.b4be2p40, 0x1.b0553p42 },
+ { 0x1.9a7316p40, 0x1.99397ap42 },
+ { 0x1.81915cp40, 0x1.83298ep42 },
+ { 0x1.6a088p40, 0x1.6e1c58p42 },
+ { 0x1.53c89ep40, 0x1.5a08e8p42 },
+ { 0x1.3ec25ep40, 0x1.46e66cp42 },
+ { 0x1.2ae6fap40, 0x1.34ac36p42 },
+ { 0x1.18282ep40, 0x1.2351c2p42 },
+ { 0x1.067844p40, 0x1.12ceb4p42 },
+ { 0x1.eb940ep39, 0x1.031ad6p42 },
+ { 0x1.cc2186p39, 0x1.e85c44p41 },
+ { 0x1.ae808cp39, 0x1.cc018p41 },
+ { 0x1.9299bp39, 0x1.b1160ap41 },
+ { 0x1.785674p39, 0x1.978ae8p41 },
+ { 0x1.5fa14ap39, 0x1.7f5188p41 },
+ { 0x1.486586p39, 0x1.685bb6p41 },
+ { 0x1.328f5ep39, 0x1.529b9ep41 },
+ { 0x1.1e0be6p39, 0x1.3e03d8p41 },
+ { 0x1.0ac8fcp39, 0x1.2a875cp41 },
+ { 0x1.f16aaep38, 0x1.181984p41 },
+ { 0x1.cf80d4p38, 0x1.06ae14p41 },
+ { 0x1.afb4e2p38, 0x1.ec7262p40 },
+ { 0x1.91e8bep38, 0x1.cd5ecap40 },
+ { 0x1.75ffb4p38, 0x1.b00b38p40 },
+ { 0x1.5bde72p38, 0x1.94624ep40 },
+ { 0x1.436af4p38, 0x1.7a4f6ap40 },
+ { 0x1.2c8c7ap38, 0x1.61beaep40 },
+ { 0x1.172b7ap38, 0x1.4a9cf6p40 },
+ { 0x1.033198p38, 0x1.34d7dcp40 },
+ { 0x1.e11332p37, 0x1.205dacp40 },
+ { 0x1.be3ebp37, 0x1.0d1d6ap40 },
+ { 0x1.9dbf72p37, 0x1.f60d8ap39 },
+ { 0x1.7f714p37, 0x1.d4143ap39 },
+ { 0x1.6331cap37, 0x1.b430ecp39 },
+ { 0x1.48e09cp37, 0x1.9646f4p39 },
+ { 0x1.305ef8p37, 0x1.7a3adep39 },
+ { 0x1.198fd6p37, 0x1.5ff276p39 },
+ { 0x1.0457c6p37, 0x1.4754acp39 },
+ { 0x1.e139bcp36, 0x1.30499cp39 },
+ { 0x1.bc8d52p36, 0x1.1aba78p39 },
+ { 0x1.9a7c3p36, 0x1.06918cp39 },
+ { 0x1.7adadep36, 0x1.e77448p38 },
+ { 0x1.5d806ap36, 0x1.c4412cp38 },
+ { 0x1.424642p36, 0x1.a36454p38 },
+ { 0x1.290826p36, 0x1.84ba3p38 },
+ { 0x1.11a3f8p36, 0x1.6821p38 },
+ { 0x1.f7f358p35, 0x1.4d78bcp38 },
+ { 0x1.cfd652p35, 0x1.34a306p38 },
+ { 0x1.aab85ap35, 0x1.1d8318p38 },
+ { 0x1.88647p35, 0x1.07fdb4p38 },
+ { 0x1.68a8e4p35, 0x1.e7f232p37 },
+ { 0x1.4b5726p35, 0x1.c2b9dp37 },
+ { 0x1.30439cp35, 0x1.a02436p37 },
+ { 0x1.174578p35, 0x1.8005fp37 },
+ { 0x1.003692p35, 0x1.6235fcp37 },
+ { 0x1.d5e678p34, 0x1.468daep37 },
+ { 0x1.aeb442p34, 0x1.2ce898p37 },
+ { 0x1.8a9848p34, 0x1.15246ep37 },
+ { 0x1.695876p34, 0x1.fe41cep36 },
+ { 0x1.4abea2p34, 0x1.d57f52p36 },
+ { 0x1.2e984ep34, 0x1.afc85ep36 },
+ { 0x1.14b676p34, 0x1.8ce75ep36 },
+ { 0x1.f9daap33, 0x1.6caa0ep36 },
+ { 0x1.ce283ap33, 0x1.4ee142p36 },
+ { 0x1.a609f8p33, 0x1.3360ccp36 },
+ { 0x1.81396ap33, 0x1.19ff46p36 },
+ { 0x1.5f7524p33, 0x1.0295fp36 },
+ { 0x1.40806ep33, 0x1.da011p35 },
+ { 0x1.2422eep33, 0x1.b23a5ap35 },
+ { 0x1.0a286p33, 0x1.8d986ap35 },
+ { 0x1.e4c0bp32, 0x1.6be022p35 },
+ { 0x1.b93bf4p32, 0x1.4cda54p35 },
+ { 0x1.916f7cp32, 0x1.30539p35 },
+ { 0x1.6d0e7p32, 0x1.161be4p35 },
+ { 0x1.4bd1cp32, 0x1.fc0d56p34 },
+ { 0x1.2d77bep32, 0x1.cfd4a6p34 },
+ { 0x1.11c3bep32, 0x1.a74068p34 },
+ { 0x1.f0fb86p31, 0x1.8208bcp34 },
+ { 0x1.c2e43ep31, 0x1.5feadap34 },
+ { 0x1.98e254p31, 0x1.40a8c2p34 },
+ { 0x1.729df6p31, 0x1.2408eap34 },
+ { 0x1.4fc63cp31, 0x1.09d5f8p34 },
+ { 0x1.3010aap31, 0x1.e3bcf4p33 },
+ { 0x1.1338b8p31, 0x1.b7e946p33 },
+ { 0x1.f1fecp30, 0x1.8fdc1cp33 },
+ { 0x1.c2556ap30, 0x1.6b4702p33 },
+ { 0x1.970b06p30, 0x1.49e178p33 },
+ { 0x1.6fbddep30, 0x1.2b6876p33 },
+ { 0x1.4c144ep30, 0x1.0f9e1cp33 },
+ { 0x1.2bbc1ep30, 0x1.ec929ap32 },
+ { 0x1.0e69f2p30, 0x1.be6abcp32 },
+ { 0x1.e7b188p29, 0x1.94637ep32 },
+ { 0x1.b792bcp29, 0x1.6e2368p32 },
+ { 0x1.8c03d2p29, 0x1.4b581cp32 },
+ { 0x1.649b02p29, 0x1.2bb5ccp32 },
+ { 0x1.40f794p29, 0x1.0ef6c4p32 },
+ { 0x1.20c13p29, 0x1.e9b5e8p31 },
+ { 0x1.03a72ap29, 0x1.ba4f04p31 },
+ { 0x1.d2bfc6p28, 0x1.8f4cccp31 },
+ { 0x1.a35068p28, 0x1.684c22p31 },
+ { 0x1.7885cep28, 0x1.44f21ep31 },
+ { 0x1.51f06ap28, 0x1.24eb72p31 },
+ { 0x1.2f2aaap28, 0x1.07ebd2p31 },
+ { 0x1.0fd816p28, 0x1.db5adp30 },
+ { 0x1.e7493p27, 0x1.abe09ep30 },
+ { 0x1.b48774p27, 0x1.80f43ap30 },
+ { 0x1.86e006p27, 0x1.5a2aep30 },
+ { 0x1.5dd4bp27, 0x1.37231p30 },
+ { 0x1.38f2e8p27, 0x1.1783cep30 },
+ { 0x1.17d2c6p27, 0x1.f5f7d8p29 },
+ { 0x1.f42c18p26, 0x1.c282cep29 },
+ { 0x1.beceb2p26, 0x1.94219cp29 },
+ { 0x1.8ef2aap26, 0x1.6a5972p29 },
+ { 0x1.640bf6p26, 0x1.44ba86p29 },
+ { 0x1.3d9be6p26, 0x1.22df2ap29 },
+ { 0x1.1b2fe4p26, 0x1.046aeap29 },
+ { 0x1.f8c0c2p25, 0x1.d21398p28 },
+ { 0x1.c19fa8p25, 0x1.a0df1p28 },
+ { 0x1.90538cp25, 0x1.74adc8p28 },
+ { 0x1.6443fep25, 0x1.4d0232p28 },
+ { 0x1.3ce784p25, 0x1.296a7p28 },
+ { 0x1.19c232p25, 0x1.097f62p28 },
+ { 0x1.f4c8c4p24, 0x1.d9c736p27 },
+ { 0x1.bcd30ep24, 0x1.a6852cp27 },
+ { 0x1.8aee4cp24, 0x1.789fb8p27 },
+ { 0x1.5e77b6p24, 0x1.4f8c96p27 },
+ { 0x1.36dcf2p24, 0x1.2acee2p27 },
+ { 0x1.139a7cp24, 0x1.09f5dp27 },
+ { 0x1.e8747p23, 0x1.d9371ep26 },
+ { 0x1.b0a44ap23, 0x1.a4c89ep26 },
+ { 0x1.7f064ap23, 0x1.75fa8ep26 },
+ { 0x1.52efep23, 0x1.4c37cp26 },
+ { 0x1.2bc82ap23, 0x1.26f9ep26 },
+ { 0x1.09064p23, 0x1.05c804p26 },
+ { 0x1.d45f16p22, 0x1.d06ad6p25 },
+ { 0x1.9dacb2p22, 0x1.9bc0ap25 },
+ { 0x1.6d3126p22, 0x1.6ce1aap25 },
+ { 0x1.423d14p22, 0x1.43302cp25 },
+ { 0x1.1c33cep22, 0x1.1e1e86p25 },
+ { 0x1.f512dep21, 0x1.fa5b5p24 },
+ { 0x1.b9823cp21, 0x1.bfd756p24 },
+ { 0x1.84d6fep21, 0x1.8be4f8p24 },
+ { 0x1.564a92p21, 0x1.5dcd66p24 },
+ { 0x1.2d2c0ap21, 0x1.34ecf8p24 },
+ { 0x1.08ddd2p21, 0x1.10b148p24 },
+ { 0x1.d1a75p20, 0x1.e12eep23 },
+ { 0x1.99218cp20, 0x1.a854eap23 },
+ { 0x1.674c6ap20, 0x1.7603bap23 },
+ { 0x1.3b62b6p20, 0x1.4980ccp23 },
+ { 0x1.14b54p20, 0x1.2225b2p23 },
+ { 0x1.e55102p19, 0x1.febc1p22 },
+ { 0x1.a964eep19, 0x1.c14b22p22 },
+ { 0x1.74b17ap19, 0x1.8b0cfcp22 },
+ { 0x1.465daap19, 0x1.5b2fe6p22 },
+ { 0x1.1da944p19, 0x1.30f93cp22 },
+ { 0x1.f3d41p18, 0x1.0bc30cp22 },
+ { 0x1.b512a2p18, 0x1.d5f3a8p21 },
+ { 0x1.7e03b2p18, 0x1.9c3518p21 },
+ { 0x1.4dbb98p18, 0x1.6961b8p21 },
+ { 0x1.236a1ap18, 0x1.3cab14p21 },
+ { 0x1.fcae94p17, 0x1.155a0ap21 },
+ { 0x1.bbc1ap17, 0x1.e5989p20 },
+ { 0x1.82eedcp17, 0x1.a8e406p20 },
+ { 0x1.5139a6p17, 0x1.7397c6p20 },
+ { 0x1.25c354p17, 0x1.44d26ep20 },
+ { 0x1.ff8f84p16, 0x1.1bcca4p20 },
+ { 0x1.bd3474p16, 0x1.efac52p19 },
+ { 0x1.834586p16, 0x1.b0a68ap19 },
+ { 0x1.50b75cp16, 0x1.7974e8p19 },
+ { 0x1.249ef2p16, 0x1.4924a8p19 },
+ { 0x1.fc5b88p15, 0x1.1edfa4p19 },
+ { 0x1.b95ceep15, 0x1.f3d218p18 },
+ { 0x1.7f03bap15, 0x1.b334fap18 },
+ { 0x1.4c389cp15, 0x1.7ac2d8p18 },
+ { 0x1.2006aep15, 0x1.4979acp18 },
+ { 0x1.f32eap14, 0x1.1e767cp18 },
+ { 0x1.b05cfep14, 0x1.f1e352p17 },
+ { 0x1.764f46p14, 0x1.b0778cp17 },
+ { 0x1.43e56cp14, 0x1.77756ep17 },
+ { 0x1.18238p14, 0x1.45ce66p17 },
+ { 0x1.e45a98p13, 0x1.1a95p17 },
+ { 0x1.a284ccp13, 0x1.e9f2p16 },
+ { 0x1.697596p13, 0x1.a887bep16 },
+ { 0x1.3807acp13, 0x1.6fab64p16 },
+ { 0x1.0d3b36p13, 0x1.3e44e4p16 },
+ { 0x1.d0624p12, 0x1.135f28p16 },
+ { 0x1.904e0cp12, 0x1.dc479ep15 },
+ { 0x1.58e72ap12, 0x1.9baed4p15 },
+ { 0x1.2906ccp12, 0x1.63ac6cp15 },
+ { 0x1.ff58dap11, 0x1.33225ap15 },
+ { 0x1.b7f1f4p11, 0x1.0916fp15 },
+ { 0x1.7a551p11, 0x1.c960cp14 },
+ { 0x1.453142p11, 0x1.8a6174p14 },
+ { 0x1.1761f8p11, 0x1.53e4f8p14 },
+ { 0x1.dfd296p10, 0x1.24caf2p14 },
+ { 0x1.9bd5fp10, 0x1.f830cp13 },
+ { 0x1.61501p10, 0x1.b1e5acp13 },
+ { 0x1.2ef6p10, 0x1.7538c6p13 },
+ { 0x1.03a918p10, 0x1.40dfd8p13 },
+ { 0x1.bce26ap9, 0x1.13bc08p13 },
+ { 0x1.7cef42p9, 0x1.d9a88p12 },
+ { 0x1.46056p9, 0x1.96a0b4p12 },
+ { 0x1.16e3cap9, 0x1.5ce9acp12 },
+ { 0x1.dcea68p8, 0x1.2b3e54p12 },
+ { 0x1.97945ap8, 0x1.0085p12 },
+ { 0x1.5c2828p8, 0x1.b7937ep11 },
+ { 0x1.29415p8, 0x1.7872dap11 },
+ { 0x1.fb58fap7, 0x1.423acp11 },
+ { 0x1.b0c1a8p7, 0x1.13af5p11 },
+ { 0x1.70f474p7, 0x1.d77f0cp10 },
+ { 0x1.3a68a8p7, 0x1.92ff34p10 },
+ { 0x1.0bcc6p7, 0x1.5847eep10 },
+ { 0x1.c7fa0cp6, 0x1.25f9eep10 },
+ { 0x1.8401b6p6, 0x1.f5cc78p9 },
+ { 0x1.4a029ap6, 0x1.ac0f6p9 },
+ { 0x1.188c46p6, 0x1.6cfa9cp9 },
+ { 0x1.dcc4fap5, 0x1.370ab8p9 },
+ { 0x1.94ec06p5, 0x1.08f24p9 },
+ { 0x1.57bc96p5, 0x1.c324c2p8 },
+ { 0x1.23a81ap5, 0x1.7fe904p8 },
+ { 0x1.eeb278p4, 0x1.46897ep8 },
+ { 0x1.a35794p4, 0x1.159a38p8 },
+ { 0x1.634b8p4, 0x1.d7c594p7 },
+ { 0x1.2ce2a4p4, 0x1.90ae4ep7 },
+ { 0x1.fd5f08p3, 0x1.5422fp7 },
+ { 0x1.aef3cep3, 0x1.20998p7 },
+ { 0x1.6c6e62p3, 0x1.e98102p6 },
+ { 0x1.3407b6p3, 0x1.9eee06p6 },
+ { 0x1.043bap3, 0x1.5f8b88p6 },
+ { 0x1.b77e5cp2, 0x1.29b294p6 },
+ { 0x1.72f0c4p2, 0x1.f7f338p5 },
+ { 0x1.38ee18p2, 0x1.aa5772p5 },
+ { 0x1.07dd68p2, 0x1.68823ep5 },
+ { 0x1.bcc58ep1, 0x1.30b14ep5 },
+ { 0x1.76aca4p1, 0x1.01647cp5 },
+ { 0x1.3b7912p1, 0x1.b2a87ep4 },
+ { 0x1.097f82p1, 0x1.6ed2f2p4 },
+ { 0x1.beaa3ep0, 0x1.356cd6p4 },
+ { 0x1.778be2p0, 0x1.04e15ep4 },
+ { 0x1.3b9984p0, 0x1.b7b04p3 },
+ { 0x1.09182cp0, 0x1.725862p3 },
+ { 0x1.bd20fcp-1, 0x1.37c92cp3 },
+ { 0x1.75892p-1, 0x1.065b96p3 },
+ { 0x1.394e7ap-1, 0x1.b950d4p2 },
+ { 0x1.06a996p-1, 0x1.72fd94p2 },
+ { 0x1.b8328ep-2, 0x1.37b83cp2 },
+ { 0x1.70aff4p-2, 0x1.05ca5p2 },
+ { 0x1.34a53cp-2, 0x1.b7807ep1 },
+ { 0x1.0241dep-2, 0x1.70bebp1 },
+ { 0x1.affb9p-3, 0x1.353a6cp1 },
+ { 0x1.691c7cp-3, 0x1.0330fp1 },
+ { 0x1.2db8cap-3, 0x1.b24a16p0 },
+ { 0x1.f7f4f8p-4, 0x1.6ba91ap0 },
+ { 0x1.a4ab64p-4, 0x1.305e98p0 },
+ { 0x1.5efa4ep-4, 0x1.fd3de2p-1 },
+ { 0x1.24b0d8p-4, 0x1.a9cc94p-1 },
+ { 0x1.e7eeap-5, 0x1.63daf8p-1 },
+ { 0x1.96826ep-5, 0x1.294176p-1 },
+ { 0x1.5282d2p-5, 0x1.f05e82p-2 },
+ { 0x1.19c05p-5, 0x1.9e39dcp-2 },
+ { 0x1.d4ca9cp-6, 0x1.5982p-2 },
+ { 0x1.85cfacp-6, 0x1.200c8ap-2 },
+ { 0x1.43fb32p-6, 0x1.e00e92p-3 },
+ { 0x1.0d2382p-6, 0x1.8fd4ep-3 },
+ { 0x1.bef1b2p-7, 0x1.4cd9cp-3 },
+ { 0x1.72ede4p-7, 0x1.14f48ap-3 },
+ { 0x1.33b1cap-7, 0x1.ccaaeap-4 },
+ { 0x1.fe3bdp-8, 0x1.7eef14p-4 },
+ { 0x1.a6d7d2p-8, 0x1.3e2964p-4 },
+ { 0x1.5e4062p-8, 0x1.083768p-4 },
+ { 0x1.21fb7ap-8, 0x1.b69f1p-5 },
+ { 0x1.dfefbep-9, 0x1.6be574p-5 },
+ { 0x1.8cf816p-9, 0x1.2dc11ap-5 },
+ { 0x1.482fa8p-9, 0x1.f4343cp-6 },
+ { 0x1.0f30c4p-9, 0x1.9e614ep-6 },
+ { 0x1.bff86ep-10, 0x1.571d34p-6 },
+ { 0x1.71d0b6p-10, 0x1.1bf742p-6 },
+ { 0x1.3125f6p-10, 0x1.d5cc6cp-7 },
+ { 0x1.f755eap-11, 0x1.846e9ep-7 },
+ { 0x1.9eebaap-11, 0x1.410048p-7 },
+ { 0x1.55df18p-11, 0x1.09258p-7 },
+ { 0x1.198c18p-11, 0x1.b5ceb6p-8 },
+ { 0x1.cf82ep-12, 0x1.69468p-8 },
+ { 0x1.7d5af6p-12, 0x1.29f9e8p-8 },
+ { 0x1.399c28p-12, 0x1.eb4b9ep-9 },
+ { 0x1.01c65ap-12, 0x1.94d1dep-9 },
+ { 0x1.a78e82p-13, 0x1.4d6706p-9 },
+ { 0x1.5bcf92p-13, 0x1.127346p-9 },
+ { 0x1.1d791cp-13, 0x1.c39fap-10 },
+ { 0x1.d463dcp-14, 0x1.73679cp-10 },
+ { 0x1.8011fcp-14, 0x1.314916p-10 },
+ { 0x1.3ac71cp-14, 0x1.f5a11ap-11 },
+ { 0x1.01dcc2p-14, 0x1.9beca8p-11 },
+ { 0x1.a6459cp-15, 0x1.52189ap-11 },
+ { 0x1.59962ap-15, 0x1.155d48p-11 },
+ { 0x1.1ab0e4p-15, 0x1.c6dc8ap-12 },
+ { 0x1.ce42dep-16, 0x1.74ca88p-12 },
+ { 0x1.79c43p-16, 0x1.31612ap-12 },
+ { 0x1.349128p-16, 0x1.f4125ap-13 },
+ { 0x1.f7d80ep-17, 0x1.993e82p-13 },
+ { 0x1.9b270cp-17, 0x1.4ec006p-13 },
+ { 0x1.4f59fap-17, 0x1.11aebp-13 },
+ { 0x1.1164acp-17, 0x1.bf4ab2p-14 },
+ { 0x1.bd8c96p-18, 0x1.6d561ep-14 },
+ { 0x1.6ae172p-18, 0x1.2a406ep-14 },
+ { 0x1.276874p-18, 0x1.e6bba6p-15 },
+ { 0x1.e0bad2p-19, 0x1.8cf814p-15 },
+ { 0x1.86f788p-19, 0x1.4399f8p-15 },
+ { 0x1.3dcfaep-19, 0x1.07aa3p-15 },
+ { 0x1.023828p-19, 0x1.ad7302p-16 },
+ { 0x1.a3666ep-20, 0x1.5d90f4p-16 },
+ { 0x1.546e38p-20, 0x1.1c674ep-16 },
+ { 0x1.143264p-20, 0x1.ce8ccp-17 },
+ { 0x1.bff316p-21, 0x1.77f562p-17 },
+ { 0x1.6b13ecp-21, 0x1.316da8p-17 },
+ { 0x1.2624f4p-21, 0x1.f0046p-18 },
+ { 0x1.dc5de4p-22, 0x1.92920ap-18 },
+ { 0x1.818d3ap-22, 0x1.4691b2p-18 },
+ { 0x1.37e62p-22, 0x1.08c96ap-18 },
+ { 0x1.f8637ep-23, 0x1.ad2d0ap-19 },
+ { 0x1.97a3dcp-23, 0x1.5ba462p-19 },
+ { 0x1.494a4p-23, 0x1.1975ep-19 },
+ { 0x1.09dee4p-23, 0x1.c78892p-20 },
+ { 0x1.ad1fap-24, 0x1.7073c4p-20 },
+ { 0x1.5a245ep-24, 0x1.29df48p-20 },
+ { 0x1.171278p-24, 0x1.e163bep-21 },
+ { 0x1.c1c74cp-25, 0x1.84cbbp-21 },
+ { 0x1.6a46f4p-25, 0x1.39dbcep-21 },
+ { 0x1.23a858p-25, 0x1.fa7b92p-22 },
+ { 0x1.d56196p-26, 0x1.9876ap-22 },
+ { 0x1.7984b6p-26, 0x1.4940bcp-22 },
+ { 0x1.2f7cc4p-26, 0x1.094608p-22 },
+ { 0x1.e7b62cp-27, 0x1.ab3e8cp-23 },
+ { 0x1.87b15ep-27, 0x1.57e33ep-23 },
+ { 0x1.3a6dp-27, 0x1.14a8b6p-23 },
+ { 0x1.f88ebap-28, 0x1.bcede6p-24 },
+ { 0x1.94a282p-28, 0x1.659918p-24 },
+ { 0x1.44580ap-28, 0x1.1f4498p-24 },
+ { 0x1.03dbf8p-28, 0x1.cd5086p-25 },
+ { 0x1.a03066p-29, 0x1.723974p-25 },
+ { 0x1.4d1f2ep-29, 0x1.28f9cap-25 },
+ { 0x1.0a814ap-29, 0x1.dc34b6p-26 },
+ { 0x1.aa36cap-30, 0x1.7d9dbp-26 },
+ { 0x1.54a6b6p-30, 0x1.31aa56p-26 },
+ { 0x1.102232p-30, 0x1.e96c26p-27 },
+ { 0x1.b2959ep-31, 0x1.87a218p-27 },
+ { 0x1.5ad66cp-31, 0x1.393ad2p-27 },
+ { 0x1.14ac7ep-31, 0x1.f4ccdap-28 },
+ { 0x1.b931b8p-32, 0x1.9026a8p-28 },
+ { 0x1.5f9a24p-32, 0x1.3f92eap-28 },
+ { 0x1.181154p-32, 0x1.fe3208p-29 },
+ { 0x1.bdf55ep-33, 0x1.970fbp-29 },
+ { 0x1.62e226p-33, 0x1.449de6p-29 },
+ { 0x1.1a4576p-33, 0x1.02be7p-29 },
+ { 0x1.c0d0bep-34, 0x1.9c4672p-30 },
+ { 0x1.64a386p-34, 0x1.484b1ep-30 },
+ { 0x1.1b418cp-34, 0x1.054a9ap-30 },
+ { 0x1.c1ba4ap-35, 0x1.9fb994p-31 },
+ { 0x1.64d86p-35, 0x1.4a8e4ep-31 },
+ { 0x1.1b0242p-35, 0x1.06b4fep-31 },
+ { 0x1.c0aee6p-36, 0x1.a15d86p-32 },
+ { 0x1.637ffap-36, 0x1.4b5fdep-32 },
+ { 0x1.198862p-36, 0x1.06f8dap-32 },
+ { 0x1.bdb204p-37, 0x1.a12cc8p-33 },
+ { 0x1.609ec2p-37, 0x1.4abd0ap-33 },
+ { 0x1.16d8d2p-37, 0x1.06154ap-33 },
+ { 0x1.b8cd88p-38, 0x1.9f27fap-34 },
+ { 0x1.5c3e42p-38, 0x1.48a7fcp-34 },
+ { 0x1.12fc6cp-38, 0x1.040d4ap-34 },
+ { 0x1.b2119p-39, 0x1.9b55e8p-35 },
+ { 0x1.566cep-39, 0x1.4527acp-35 },
+ { 0x1.0dffep-39, 0x1.00e7acp-35 },
+ { 0x1.a99426p-40, 0x1.95c358p-36 },
+ { 0x1.4f3d92p-40, 0x1.4047cep-36 },
+ { 0x1.07f35ep-40, 0x1.f95dcep-37 },
+ { 0x1.9f70cp-41, 0x1.8e82cep-37 },
+ { 0x1.46c77ap-41, 0x1.3a1882p-37 },
+ { 0x1.00ea48p-41, 0x1.eee1d4p-38 },
+ { 0x1.93c7acp-42, 0x1.85ac18p-38 },
+ { 0x1.3d256ap-42, 0x1.32ae04p-38 },
+ { 0x1.f1f59p-43, 0x1.e27d88p-39 },
+ { 0x1.86bd6ap-43, 0x1.7b5bdap-39 },
+ { 0x1.327554p-43, 0x1.2a2036p-39 },
+ { 0x1.e07ab4p-44, 0x1.d458ap-40 },
+ { 0x1.7879ecp-44, 0x1.6fb2eap-40 },
+ { 0x1.26d7bp-44, 0x1.208a2cp-40 },
+ { 0x1.cd98a2p-45, 0x1.c49f8ap-41 },
+ { 0x1.6927c2p-45, 0x1.62d5aap-41 },
+ { 0x1.1a6ed6p-45, 0x1.16098ep-41 },
+ { 0x1.b986acp-46, 0x1.b3828ep-42 },
+ { 0x1.58f35ap-46, 0x1.54eb3ep-42 },
+ { 0x1.0d5e6p-46, 0x1.0abe0ep-42 },
+ { 0x1.a47db6p-47, 0x1.a134d4p-43 },
+ { 0x1.480a18p-47, 0x1.461cdap-43 },
+ { 0x1.ff94e4p-48, 0x1.fd9182p-44 },
+ { 0x1.8eb738p-48, 0x1.8deb62p-44 },
+ { 0x1.369994p-48, 0x1.3694e8p-44 },
+ { 0x1.e3ae4ap-49, 0x1.e49706p-45 },
+ { 0x1.786c3ep-49, 0x1.79dc28p-45 },
+ { 0x1.24cec8p-49, 0x1.267e46p-45 },
+ { 0x1.c74fc4p-50, 0x1.cad0bp-46 },
+ { 0x1.61d46cp-50, 0x1.653d08p-46 },
+ { 0x1.12d55cp-50, 0x1.16038cp-46 },
+ { 0x1.aabdacp-51, 0x1.b081aap-47 },
+ { 0x1.4b252ep-51, 0x1.5042e2p-47 },
+ { 0x1.00d6f8p-51, 0x1.054e44p-47 },
+ { 0x1.8e38ep-52, 0x1.95eb2cp-48 },
+ { 0x1.3490e8p-52, 0x1.3b20c6p-48 },
+ { 0x1.ddf56ap-53, 0x1.e90cb6p-49 },
+ { 0x1.71fdep-53, 0x1.7b4b76p-49 },
+ { 0x1.1e465ap-53, 0x1.26072ap-49 },
+ { 0x1.bac92ep-54, 0x1.c7a2ecp-50 },
+ { 0x1.56441cp-54, 0x1.60dcfp-50 },
+ { 0x1.08700cp-54, 0x1.112346p-50 },
+ { 0x1.986a66p-55, 0x1.a6a50ap-51 },
+ { 0x1.3b3d56p-55, 0x1.46d572p-51 },
+ { 0x1.e667dap-56, 0x1.f93d0ep-52 },
+ { 0x1.7712b8p-56, 0x1.86529ep-52 },
+ { 0x1.211544p-56, 0x1.2d65aep-52 },
+ { 0x1.bd660ap-57, 0x1.d13c32p-53 },
+ { 0x1.56f3eep-57, 0x1.66e45ap-53 },
+ { 0x1.07f14ap-57, 0x1.14b8b6p-53 },
+ { 0x1.96129cp-58, 0x1.aa854cp-54 },
+ { 0x1.3837cp-58, 0x1.488b94p-54 },
+ { 0x1.dfe0c2p-59, 0x1.f9e772p-55 },
+ { 0x1.709b5ap-59, 0x1.85503p-55 },
+ { 0x1.1affd2p-59, 0x1.2b7218p-55 },
+ { 0x1.b2564p-60, 0x1.cc6bb6p-56 },
+ { 0x1.4d23fap-60, 0x1.61cb1ap-56 },
+ { 0x1.fecbdp-61, 0x1.0fba0ep-56 },
+ { 0x1.8767d8p-61, 0x1.a13072p-57 },
+ { 0x1.2bc67ep-61, 0x1.401abcp-57 },
+ { 0x1.caf846p-62, 0x1.eafc2cp-58 },
+ { 0x1.5f2e7ap-62, 0x1.785cp-58 },
+ { 0x1.0c93acp-62, 0x1.205a7ep-58 },
+ { 0x1.9a9b06p-63, 0x1.b9a31ap-59 },
+ { 0x1.39b7fcp-63, 0x1.520968p-59 },
+ { 0x1.df277ap-64, 0x1.029ce6p-59 },
+ { 0x1.6dbcdp-64, 0x1.8b81d6p-60 },
+ { 0x1.17080ap-64, 0x1.2e48f2p-60 },
+ { 0x1.a98e26p-65, 0x1.cdd86cp-61 },
+ { 0x1.445a6ap-65, 0x1.60a47ap-61 },
+ { 0x1.ee324ep-66, 0x1.0d210cp-61 },
+ { 0x1.784e3p-66, 0x1.9a961ep-62 },
+ { 0x1.1e65fep-66, 0x1.390b74p-62 },
+ { 0x1.b3bb86p-67, 0x1.dd1e52p-63 },
+ { 0x1.4b4e36p-67, 0x1.6b6a7ap-63 },
+ { 0x1.f790f6p-68, 0x1.14acc2p-63 },
+ { 0x1.7e82cep-68, 0x1.a511aap-64 },
+ { 0x1.226a7ap-68, 0x1.404114p-64 },
+ { 0x1.b8c634p-69, 0x1.e6ea96p-65 },
+ { 0x1.4e53acp-69, 0x1.71f97ap-65 },
+ { 0x1.faed5cp-70, 0x1.18fb2ep-65 },
+ { 0x1.80217ep-70, 0x1.aa947ep-66 },
+ { 0x1.22f066p-70, 0x1.43a796p-66 },
+ { 0x1.b87f86p-71, 0x1.eae2fp-67 },
+ { 0x1.4d4ec8p-71, 0x1.7414e6p-67 },
+ { 0x1.f8283ep-72, 0x1.19e474p-67 },
+ { 0x1.7d1b22p-72, 0x1.aaeb7ep-68 },
+ { 0x1.1ff2dp-72, 0x1.431f66p-68 },
+ { 0x1.b2e9e8p-73, 0x1.e8e272p-69 },
+ { 0x1.4848dep-73, 0x1.71a91ep-69 },
+ { 0x1.ef5b16p-74, 0x1.176014p-69 },
+ { 0x1.758b92p-74, 0x1.a6137cp-70 },
+ { 0x1.198d42p-74, 0x1.3ead74p-70 },
+ { 0x1.a838bp-75, 0x1.e0fbc2p-71 },
+ { 0x1.3f700cp-75, 0x1.6accaep-71 },
+ { 0x1.e0d68ep-76, 0x1.118578p-71 },
+ { 0x1.69b7f4p-76, 0x1.9c3974p-72 },
+ { 0x1.0ffa12p-76, 0x1.367afap-72 },
+ { 0x1.98cd1cp-77, 0x1.d377fap-73 },
+ { 0x1.33148p-77, 0x1.5fbee6p-73 },
+ { 0x1.cd1dbap-78, 0x1.088a8p-73 },
+ { 0x1.5a0a9cp-78, 0x1.8db7ccp-74 },
+ { 0x1.038ef4p-78, 0x1.2ad2ecp-74 },
+ { 0x1.85308ap-79, 0x1.c0d23ep-75 },
+ { 0x1.23a3cp-79, 0x1.50e41ap-75 },
+ { 0x1.b4de68p-80, 0x1.f980a8p-76 },
+ { 0x1.470ce4p-80, 0x1.7b10fep-76 },
+ { 0x1.e9700cp-81, 0x1.1c1d98p-76 },
+ { 0x1.6e0c9p-81, 0x1.a9b08p-77 },
+ { 0x1.11a25ap-81, 0x1.3ebfb4p-77 },
+ { 0x1.98e73ap-82, 0x1.dd1d36p-78 },
+ { 0x1.315f58p-82, 0x1.64e7fp-78 },
+ { 0x1.c7e35cp-83, 0x1.0ada94p-78 },
+ { 0x1.542176p-83, 0x1.8ed9e8p-79 },
+ { 0x1.fb491ep-84, 0x1.29ecb2p-79 },
+ { 0x1.7a1c34p-84, 0x1.bcdb34p-80 },
+ { 0x1.19b0f2p-84, 0x1.4bf6cap-80 },
+ { 0x1.a383cap-85, 0x1.ef3318p-81 },
+ { 0x1.383bf2p-85, 0x1.712bc2p-81 },
+ { 0x1.d08cdap-86, 0x1.13151p-81 },
+ { 0x1.596adp-86, 0x1.99bf36p-82 },
+ { 0x1.00b602p-86, 0x1.3104d6p-82 },
+ { 0x1.7d62a2p-87, 0x1.c5e534p-83 },
+ { 0x1.1b2abcp-87, 0x1.518db2p-83 },
+ { 0x1.a4480ep-88, 0x1.f5d1c6p-84 },
+ { 0x1.37be42p-88, 0x1.74d45ap-84 },
+ { 0x1.ce3ee4p-89, 0x1.14dc4ap-84 },
+ { 0x1.568986p-89, 0x1.9afd0ep-85 },
+ { 0x1.fb69c6p-90, 0x1.30e632p-85 },
+ { 0x1.77a47ep-90, 0x1.c42b48p-86 },
+ { 0x1.15f4ep-90, 0x1.4f1f52p-86 },
+ { 0x1.9b25dcp-91, 0x1.f08156p-87 },
+ { 0x1.2feeeep-91, 0x1.6f9f62p-87 },
+ { 0x1.c122bcp-92, 0x1.100ffap-87 },
+ { 0x1.4bb154p-92, 0x1.927ce6p-88 },
+ { 0x1.e9ae56p-93, 0x1.2992f4p-88 },
+ { 0x1.6948e8p-93, 0x1.b7cccap-89 },
+ { 0x1.0a6cd2p-93, 0x1.44d7c4p-89 },
+ { 0x1.88c0cap-94, 0x1.dfa22p-90 },
+ { 0x1.215988p-94, 0x1.61eb26p-90 },
+ { 0x1.aa222ap-95, 0x1.0506e2p-90 },
+ { 0x1.39a30ep-95, 0x1.80d828p-91 },
+ { 0x1.cd740ep-96, 0x1.1b8f04p-91 },
+ { 0x1.534d82p-96, 0x1.a1a7ecp-92 },
+ { 0x1.f2bb06p-97, 0x1.336f3p-92 },
+ { 0x1.6e5b34p-97, 0x1.c46172p-93 },
+ { 0x1.0cfc82p-97, 0x1.4cab82p-93 },
+ { 0x1.8acc82p-98, 0x1.e9094cp-94 },
+ { 0x1.219686p-98, 0x1.67465p-94 },
+ { 0x1.a89fa6p-99, 0x1.07d0b8p-94 },
+ { 0x1.372982p-99, 0x1.833ffap-95 },
+ { 0x1.c7d094p-100, 0x1.1c147ap-95 },
+ { 0x1.4db1c8p-100, 0x1.a096ccp-96 },
+ { 0x1.e858d8p-101, 0x1.314decp-96 },
+ { 0x1.6529ep-101, 0x1.bf46cep-97 },
+ { 0x1.0517bap-101, 0x1.47796ap-97 },
+ { 0x1.7d8a8p-102, 0x1.df49a2p-98 },
+ { 0x1.16a46p-102, 0x1.5e9198p-98 },
+ { 0x1.96ca76p-103, 0x1.004b34p-98 },
+ { 0x1.28cb2cp-103, 0x1.768f3ep-99 },
+ { 0x1.b0de98p-104, 0x1.1190d2p-99 },
},
-
- {
-#if ERFCF_POLY_NCOEFFS == 16
- 0x1.fea5663f75cd1p-1, -0x1.1cb5a82adf1c4p0, 0x1.e7c8da942d86fp-1,
- -0x1.547ba0456bac7p-1, 0x1.8a6fc0f4421a4p-2, -0x1.7c14f9301ee58p-3,
- 0x1.2f67c8351577p-4, -0x1.8e733f6d159d9p-6, 0x1.aa6a0ec249067p-8,
- -0x1.6f4ec45b11f3fp-10, 0x1.f4c00c4b33ba8p-13, -0x1.0795faf7846d2p-15,
- 0x1.9cef9031810ddp-19, -0x1.c4d60c3fecdb6p-23, 0x1.360547ec2229dp-27,
- -0x1.8ec1581647f9fp-33
-#endif
- },
-
- {
-#if ERFCF_POLY_NCOEFFS == 16
- 0x1.dae421147c591p-1, -0x1.c211957a0abfcp-1, 0x1.28a8d87aa1b12p-1,
- -0x1.224d2a58cbef4p-2, 0x1.b3d45dcaef898p-4, -0x1.ff99d8b33e7a9p-6,
- 0x1.dac66375b99f6p-8, -0x1.5e1786f0f91ap-10, 0x1.9a2588deaec4fp-13,
- -0x1.7b886b183b235p-16, 0x1.1209e7da8ff82p-19, -0x1.2e5c870c6ed8p-23,
- 0x1.ec6a89422928ep-28, -0x1.16e7d837b61bcp-32, 0x1.88868a73e4b43p-38,
- -0x1.027034672f11cp-44
-#endif
- },
-
- {
-#if ERFCF_POLY_NCOEFFS == 16
- 0x1.8ae320c1bad5ap-1, -0x1.1cdd6aa6929aap-1, 0x1.0e39a7b285f58p-2,
- -0x1.6fb12a95e351dp-4, 0x1.77dd0649e352cp-6, -0x1.28a9e9560c461p-8,
- 0x1.6f7d7778e9433p-11, -0x1.68363698afe4ap-14, 0x1.17e94cdf35d82p-17,
- -0x1.5766a817bd3ffp-21, 0x1.48d892094a2c1p-25, -0x1.e1b6511ab6d0bp-30,
- 0x1.04c7b8143f6a4p-34, -0x1.898831961065bp-40, 0x1.71ae8a56142a6p-46,
- -0x1.45abac612344bp-53
-#endif
- }}};
+ };
diff --git a/pl/math/erff_1u5.c b/pl/math/erff_1u5.c
deleted file mode 100644
index 1a69872c43e5..000000000000
--- a/pl/math/erff_1u5.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Single-precision erf(x) function.
- *
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "estrinf.h"
-#include "hornerf.h"
-#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
-#define A __erff_data.erff_poly_A
-#define B __erff_data.erff_poly_B
-
-/* Top 12 bits of a float. */
-static inline uint32_t
-top12 (float x)
-{
- return asuint (x) >> 20;
-}
-
-/* Efficient implementation of erff using either a pure polynomial approximation
- or the exponential of a polynomial. Worst-case error is 1.09ulps at
- 0x1.c111acp-1. */
-float
-erff (float x)
-{
- float r, x2;
-
- /* Get top word. */
- uint32_t ix = asuint (x);
- uint32_t sign = ix >> 31;
- uint32_t ia12 = top12 (x) & 0x7ff;
-
- /* Limit of both intervals is 0.875 for performance reasons but coefficients
- computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
- from 0.94 to 1.1ulps. */
- if (ia12 < 0x3f6)
- { /* a = |x| < 0.875. */
-
- /* Tiny and subnormal cases. */
- if (unlikely (ia12 < 0x318))
- { /* |x| < 2^(-28). */
- if (unlikely (ia12 < 0x040))
- { /* |x| < 2^(-119). */
- float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
- return check_uflowf (y);
- }
- return x + TwoOverSqrtPiMinusOne * x;
- }
-
- x2 = x * x;
-
- /* Normalized cases (|x| < 0.921875) - Use Horner scheme for x+x*P(x^2).
- */
-#define C(i) A[i]
- r = fmaf (HORNER_5 (x2, C), x, x);
-#undef C
- }
- else if (ia12 < 0x408)
- { /* |x| < 4.0 - Use a custom Estrin scheme. */
-
- float a = fabsf (x);
- /* Use Estrin scheme on high order (small magnitude) coefficients. */
-#define C(i) B[i]
- r = ESTRIN_3_ (a, x * x, C, 3);
-#undef C
- /* Then switch to pure Horner scheme. */
- r = fmaf (r, a, B[2]);
- r = fmaf (r, a, B[1]);
- r = fmaf (r, a, B[0]);
- r = fmaf (r, a, a);
- /* Single precision exponential with ~0.5ulps ensures erff has maximum
- relative error below 1ulp on [0.921875, 4.0] and below 1.1ulps on
- [0.875, 4.0]. */
- r = expf (-r);
- /* Explicit copysign (calling copysignf increases latency). */
- if (sign)
- r = -1.0f + r;
- else
- r = 1.0f - r;
- }
- else
- { /* |x| >= 4.0. */
-
- /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */
- if (unlikely (ia12 >= 0x7f8))
- return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
-
- /* Explicit copysign (calling copysignf increases latency). */
- if (sign)
- r = -1.0f;
- else
- r = 1.0f;
- }
- return r;
-}
-
-PL_SIG (S, F, 1, erf, -4.0, 4.0)
-PL_TEST_ULP (erff, 0.6)
-PL_TEST_INTERVAL (erff, 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000)
-PL_TEST_INTERVAL (erff, -0x1p-127, -0x1p-26, 40000)
-PL_TEST_INTERVAL (erff, 0x1p-26, 0x1p3, 40000)
-PL_TEST_INTERVAL (erff, -0x1p-26, -0x1p3, 40000)
-PL_TEST_INTERVAL (erff, 0, inf, 40000)
diff --git a/pl/math/erff_2u.c b/pl/math/erff_2u.c
new file mode 100644
index 000000000000..f43e647072f8
--- /dev/null
+++ b/pl/math/erff_2u.c
@@ -0,0 +1,82 @@
+/*
+ * Single-precision erf(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
+#define Shift 0x1p16f
+#define OneThird 0x1.555556p-2f
+
+/* Fast erff approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erf(x) ~ erf(r)
+ + scale * d * [
+ + 1
+ - r d
+ + 1/3 (2 r^2 - 1) d^2
+ - 1/6 (r (2 r^2 - 3) ) d^3
+ + 1/30 (4 r^4 - 12 r^2 + 3) d^4
+ ]
+
+ This single precision implementation uses only the following terms:
+
+ erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2]
+
+ Values of erf(r) and scale are read from lookup tables.
+ For |x| > 3.9375, erf(|x|) rounds to 1.0f.
+
+ Maximum error: 1.93 ULP
+ erff(0x1.c373e6p-9) got 0x1.fd686cp-9
+ want 0x1.fd6868p-9. */
+float
+erff (float x)
+{
+ /* Split the bits of x into magnitude (ia) and sign bit (sign). */
+ uint32_t ix = asuint (x);
+ uint32_t ia = ix & 0x7fffffff;
+ uint32_t sign = ix & ~0x7fffffff;
+
+ /* |x| < 0x1p-62: return x + (2/sqrt(pi) - 1) * x. Triggers exceptions. */
+ if (unlikely (ia < 0x20800000))
+ return fmaf (TwoOverSqrtPiMinusOne, x, x);
+
+ if (ia < 0x407b8000) /* |x| < 3.9296875 = 4 - 9/128 (0x407c0000 is 3.9375). */
+ {
+ /* Lookup erf(r) and scale(r) in tables, e.g. erf(r) is 0 and scale is
+ 2/sqrt(pi) when x reduces to r = 0. Adding Shift rounds a to r. */
+ float a = asfloat (ia);
+ float z = a + Shift;
+ uint32_t i = asuint (z) - asuint (Shift); /* Table index, i = 128 * r. */
+ float r = z - Shift;
+ float erfr = __erff_data.tab[i].erf;
+ float scale = __erff_data.tab[i].scale;
+
+ /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2), with d = a - r. */
+ float d = a - r;
+ float d2 = d * d;
+ float y = -fmaf (OneThird, d, r);
+ y = fmaf (fmaf (y, d2, d), scale, erfr);
+ return asfloat (asuint (y) | sign); /* Reapply the sign bit of x. */
+ }
+
+ /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */
+ if (unlikely (ia >= 0x7f800000))
+ return (1.0f - (float) (sign >> 30)) + 1.0f / x; /* sign >> 30 is 0 or 2. */
+
+ /* Boring domain (finite |x| at or above the lookup bound): return +/-1. */
+ return asfloat (sign | asuint (1.0f));
+}
+
+PL_SIG (S, F, 1, erf, -4.0, 4.0)
+PL_TEST_ULP (erff, 1.43)
+PL_TEST_SYM_INTERVAL (erff, 0, 3.9375, 40000)
+PL_TEST_SYM_INTERVAL (erff, 3.9375, inf, 40000)
+PL_TEST_SYM_INTERVAL (erff, 0, inf, 40000)
diff --git a/pl/math/erff_data.c b/pl/math/erff_data.c
index 2352baefd35f..84c0d2e95463 100644
--- a/pl/math/erff_data.c
+++ b/pl/math/erff_data.c
@@ -1,16 +1,532 @@
/*
* Data for approximation of erff.
*
- * Copyright (c) 2019-2023, Arm Limited.
+ * Copyright (c) 2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-/* Minimax approximation of erff. */
-const struct erff_data __erff_data
- = {.erff_poly_A = {0x1.06eba6p-03f, -0x1.8126e0p-02f, 0x1.ce1a46p-04f,
- -0x1.b68bd2p-06f, 0x1.473f48p-08f, -0x1.3a1a82p-11f},
- .erff_poly_B
- = {0x1.079d0cp-3f, 0x1.450aa0p-1f, 0x1.b55cb0p-4f, -0x1.8d6300p-6f,
- 0x1.fd1336p-9f, -0x1.91d2ccp-12f, 0x1.222900p-16f}};
+/* Lookup table used in erff.
+ For each possible rounded input r (multiples of 1/128), between
+ r = 0.0 and r = 4.0 (513 values):
+ - the first entry __erff_data.tab.erf contains the values of erf(r),
+ - the second entry __erff_data.tab.scale contains the values of
+ 2/sqrt(pi)*exp(-r^2). Note that the scalar erff reads every index from 0
+ upwards, since it only bypasses the lookup for |x| < 0x1p-62. */
+const struct erff_data __erff_data = {
+ .tab = { { 0x0.000000p+0, 0x1.20dd76p+0 },
+ { 0x1.20dbf4p-7, 0x1.20d8f2p+0 },
+ { 0x1.20d770p-6, 0x1.20cb68p+0 },
+ { 0x1.b137e0p-6, 0x1.20b4d8p+0 },
+ { 0x1.20c564p-5, 0x1.209546p+0 },
+ { 0x1.68e5d4p-5, 0x1.206cb4p+0 },
+ { 0x1.b0fafep-5, 0x1.203b26p+0 },
+ { 0x1.f902a8p-5, 0x1.2000a0p+0 },
+ { 0x1.207d48p-4, 0x1.1fbd28p+0 },
+ { 0x1.44703ep-4, 0x1.1f70c4p+0 },
+ { 0x1.68591ap-4, 0x1.1f1b7ap+0 },
+ { 0x1.8c36bep-4, 0x1.1ebd56p+0 },
+ { 0x1.b00812p-4, 0x1.1e565cp+0 },
+ { 0x1.d3cbf8p-4, 0x1.1de698p+0 },
+ { 0x1.f7815ap-4, 0x1.1d6e14p+0 },
+ { 0x1.0d9390p-3, 0x1.1cecdcp+0 },
+ { 0x1.1f5e1ap-3, 0x1.1c62fap+0 },
+ { 0x1.311fc2p-3, 0x1.1bd07cp+0 },
+ { 0x1.42d7fcp-3, 0x1.1b3572p+0 },
+ { 0x1.548642p-3, 0x1.1a91e6p+0 },
+ { 0x1.662a0cp-3, 0x1.19e5eap+0 },
+ { 0x1.77c2d2p-3, 0x1.19318cp+0 },
+ { 0x1.895010p-3, 0x1.1874dep+0 },
+ { 0x1.9ad142p-3, 0x1.17aff0p+0 },
+ { 0x1.ac45e4p-3, 0x1.16e2d8p+0 },
+ { 0x1.bdad72p-3, 0x1.160da4p+0 },
+ { 0x1.cf076ep-3, 0x1.153068p+0 },
+ { 0x1.e05354p-3, 0x1.144b3cp+0 },
+ { 0x1.f190aap-3, 0x1.135e30p+0 },
+ { 0x1.015f78p-2, 0x1.12695ep+0 },
+ { 0x1.09eed6p-2, 0x1.116cd8p+0 },
+ { 0x1.127632p-2, 0x1.1068bap+0 },
+ { 0x1.1af54ep-2, 0x1.0f5d16p+0 },
+ { 0x1.236bf0p-2, 0x1.0e4a08p+0 },
+ { 0x1.2bd9dcp-2, 0x1.0d2fa6p+0 },
+ { 0x1.343ed6p-2, 0x1.0c0e0ap+0 },
+ { 0x1.3c9aa8p-2, 0x1.0ae550p+0 },
+ { 0x1.44ed18p-2, 0x1.09b590p+0 },
+ { 0x1.4d35f0p-2, 0x1.087ee4p+0 },
+ { 0x1.5574f4p-2, 0x1.07416cp+0 },
+ { 0x1.5da9f4p-2, 0x1.05fd3ep+0 },
+ { 0x1.65d4b8p-2, 0x1.04b27cp+0 },
+ { 0x1.6df50ap-2, 0x1.036140p+0 },
+ { 0x1.760abap-2, 0x1.0209a6p+0 },
+ { 0x1.7e1594p-2, 0x1.00abd0p+0 },
+ { 0x1.861566p-2, 0x1.fe8fb0p-1 },
+ { 0x1.8e0a02p-2, 0x1.fbbbbep-1 },
+ { 0x1.95f336p-2, 0x1.f8dc0ap-1 },
+ { 0x1.9dd0d2p-2, 0x1.f5f0cep-1 },
+ { 0x1.a5a2acp-2, 0x1.f2fa4cp-1 },
+ { 0x1.ad6896p-2, 0x1.eff8c4p-1 },
+ { 0x1.b52264p-2, 0x1.ecec78p-1 },
+ { 0x1.bccfecp-2, 0x1.e9d5a8p-1 },
+ { 0x1.c47104p-2, 0x1.e6b498p-1 },
+ { 0x1.cc0584p-2, 0x1.e38988p-1 },
+ { 0x1.d38d44p-2, 0x1.e054bep-1 },
+ { 0x1.db081cp-2, 0x1.dd167cp-1 },
+ { 0x1.e275eap-2, 0x1.d9cf06p-1 },
+ { 0x1.e9d68ap-2, 0x1.d67ea2p-1 },
+ { 0x1.f129d4p-2, 0x1.d32592p-1 },
+ { 0x1.f86faap-2, 0x1.cfc41ep-1 },
+ { 0x1.ffa7eap-2, 0x1.cc5a8ap-1 },
+ { 0x1.03693ap-1, 0x1.c8e91cp-1 },
+ { 0x1.06f794p-1, 0x1.c5701ap-1 },
+ { 0x1.0a7ef6p-1, 0x1.c1efcap-1 },
+ { 0x1.0dff50p-1, 0x1.be6872p-1 },
+ { 0x1.117894p-1, 0x1.bada5ap-1 },
+ { 0x1.14eab4p-1, 0x1.b745c6p-1 },
+ { 0x1.1855a6p-1, 0x1.b3aafcp-1 },
+ { 0x1.1bb95cp-1, 0x1.b00a46p-1 },
+ { 0x1.1f15ccp-1, 0x1.ac63e8p-1 },
+ { 0x1.226ae8p-1, 0x1.a8b828p-1 },
+ { 0x1.25b8a8p-1, 0x1.a5074ep-1 },
+ { 0x1.28ff02p-1, 0x1.a1519ep-1 },
+ { 0x1.2c3decp-1, 0x1.9d9762p-1 },
+ { 0x1.2f755cp-1, 0x1.99d8dap-1 },
+ { 0x1.32a54cp-1, 0x1.961650p-1 },
+ { 0x1.35cdb4p-1, 0x1.925008p-1 },
+ { 0x1.38ee8ap-1, 0x1.8e8646p-1 },
+ { 0x1.3c07cap-1, 0x1.8ab950p-1 },
+ { 0x1.3f196ep-1, 0x1.86e96ap-1 },
+ { 0x1.42236ep-1, 0x1.8316d6p-1 },
+ { 0x1.4525c8p-1, 0x1.7f41dcp-1 },
+ { 0x1.482074p-1, 0x1.7b6abcp-1 },
+ { 0x1.4b1372p-1, 0x1.7791b8p-1 },
+ { 0x1.4dfebap-1, 0x1.73b714p-1 },
+ { 0x1.50e24cp-1, 0x1.6fdb12p-1 },
+ { 0x1.53be26p-1, 0x1.6bfdf0p-1 },
+ { 0x1.569244p-1, 0x1.681ff2p-1 },
+ { 0x1.595ea6p-1, 0x1.644156p-1 },
+ { 0x1.5c2348p-1, 0x1.60625cp-1 },
+ { 0x1.5ee02ep-1, 0x1.5c8342p-1 },
+ { 0x1.619556p-1, 0x1.58a446p-1 },
+ { 0x1.6442c0p-1, 0x1.54c5a6p-1 },
+ { 0x1.66e86ep-1, 0x1.50e79ep-1 },
+ { 0x1.69865ep-1, 0x1.4d0a68p-1 },
+ { 0x1.6c1c98p-1, 0x1.492e42p-1 },
+ { 0x1.6eab18p-1, 0x1.455366p-1 },
+ { 0x1.7131e6p-1, 0x1.417a0cp-1 },
+ { 0x1.73b102p-1, 0x1.3da26ep-1 },
+ { 0x1.762870p-1, 0x1.39ccc2p-1 },
+ { 0x1.789836p-1, 0x1.35f940p-1 },
+ { 0x1.7b0058p-1, 0x1.32281ep-1 },
+ { 0x1.7d60d8p-1, 0x1.2e5992p-1 },
+ { 0x1.7fb9c0p-1, 0x1.2a8dcep-1 },
+ { 0x1.820b12p-1, 0x1.26c508p-1 },
+ { 0x1.8454d6p-1, 0x1.22ff72p-1 },
+ { 0x1.869712p-1, 0x1.1f3d3cp-1 },
+ { 0x1.88d1cep-1, 0x1.1b7e98p-1 },
+ { 0x1.8b050ep-1, 0x1.17c3b6p-1 },
+ { 0x1.8d30dep-1, 0x1.140cc4p-1 },
+ { 0x1.8f5544p-1, 0x1.1059eep-1 },
+ { 0x1.91724ap-1, 0x1.0cab62p-1 },
+ { 0x1.9387f6p-1, 0x1.09014cp-1 },
+ { 0x1.959652p-1, 0x1.055bd6p-1 },
+ { 0x1.979d68p-1, 0x1.01bb2cp-1 },
+ { 0x1.999d42p-1, 0x1.fc3ee6p-2 },
+ { 0x1.9b95e8p-1, 0x1.f511aap-2 },
+ { 0x1.9d8768p-1, 0x1.edeeeep-2 },
+ { 0x1.9f71cap-1, 0x1.e6d700p-2 },
+ { 0x1.a1551ap-1, 0x1.dfca26p-2 },
+ { 0x1.a33162p-1, 0x1.d8c8aap-2 },
+ { 0x1.a506b0p-1, 0x1.d1d2d0p-2 },
+ { 0x1.a6d50cp-1, 0x1.cae8dap-2 },
+ { 0x1.a89c86p-1, 0x1.c40b08p-2 },
+ { 0x1.aa5d26p-1, 0x1.bd3998p-2 },
+ { 0x1.ac16fcp-1, 0x1.b674c8p-2 },
+ { 0x1.adca14p-1, 0x1.afbcd4p-2 },
+ { 0x1.af767ap-1, 0x1.a911f0p-2 },
+ { 0x1.b11c3cp-1, 0x1.a27456p-2 },
+ { 0x1.b2bb68p-1, 0x1.9be438p-2 },
+ { 0x1.b4540ap-1, 0x1.9561c8p-2 },
+ { 0x1.b5e630p-1, 0x1.8eed36p-2 },
+ { 0x1.b771e8p-1, 0x1.8886b2p-2 },
+ { 0x1.b8f742p-1, 0x1.822e66p-2 },
+ { 0x1.ba764ap-1, 0x1.7be47ap-2 },
+ { 0x1.bbef10p-1, 0x1.75a91ap-2 },
+ { 0x1.bd61a2p-1, 0x1.6f7c6ap-2 },
+ { 0x1.bece0ep-1, 0x1.695e8cp-2 },
+ { 0x1.c03464p-1, 0x1.634fa6p-2 },
+ { 0x1.c194b2p-1, 0x1.5d4fd4p-2 },
+ { 0x1.c2ef08p-1, 0x1.575f34p-2 },
+ { 0x1.c44376p-1, 0x1.517de6p-2 },
+ { 0x1.c5920ap-1, 0x1.4bac00p-2 },
+ { 0x1.c6dad2p-1, 0x1.45e99cp-2 },
+ { 0x1.c81de2p-1, 0x1.4036d0p-2 },
+ { 0x1.c95b46p-1, 0x1.3a93b2p-2 },
+ { 0x1.ca930ep-1, 0x1.350052p-2 },
+ { 0x1.cbc54cp-1, 0x1.2f7cc4p-2 },
+ { 0x1.ccf20cp-1, 0x1.2a0916p-2 },
+ { 0x1.ce1962p-1, 0x1.24a554p-2 },
+ { 0x1.cf3b5cp-1, 0x1.1f518ap-2 },
+ { 0x1.d0580cp-1, 0x1.1a0dc6p-2 },
+ { 0x1.d16f7ep-1, 0x1.14da0ap-2 },
+ { 0x1.d281c4p-1, 0x1.0fb662p-2 },
+ { 0x1.d38ef0p-1, 0x1.0aa2d0p-2 },
+ { 0x1.d49710p-1, 0x1.059f5ap-2 },
+ { 0x1.d59a34p-1, 0x1.00ac00p-2 },
+ { 0x1.d6986cp-1, 0x1.f79184p-3 },
+ { 0x1.d791cap-1, 0x1.edeb40p-3 },
+ { 0x1.d8865ep-1, 0x1.e46530p-3 },
+ { 0x1.d97636p-1, 0x1.daff4ap-3 },
+ { 0x1.da6162p-1, 0x1.d1b982p-3 },
+ { 0x1.db47f4p-1, 0x1.c893cep-3 },
+ { 0x1.dc29fcp-1, 0x1.bf8e1cp-3 },
+ { 0x1.dd0788p-1, 0x1.b6a856p-3 },
+ { 0x1.dde0aap-1, 0x1.ade26cp-3 },
+ { 0x1.deb570p-1, 0x1.a53c42p-3 },
+ { 0x1.df85eap-1, 0x1.9cb5bep-3 },
+ { 0x1.e0522ap-1, 0x1.944ec2p-3 },
+ { 0x1.e11a3ep-1, 0x1.8c0732p-3 },
+ { 0x1.e1de36p-1, 0x1.83deeap-3 },
+ { 0x1.e29e22p-1, 0x1.7bd5c8p-3 },
+ { 0x1.e35a12p-1, 0x1.73eba4p-3 },
+ { 0x1.e41214p-1, 0x1.6c2056p-3 },
+ { 0x1.e4c638p-1, 0x1.6473b6p-3 },
+ { 0x1.e5768cp-1, 0x1.5ce596p-3 },
+ { 0x1.e62322p-1, 0x1.5575c8p-3 },
+ { 0x1.e6cc08p-1, 0x1.4e241ep-3 },
+ { 0x1.e7714ap-1, 0x1.46f066p-3 },
+ { 0x1.e812fcp-1, 0x1.3fda6cp-3 },
+ { 0x1.e8b12ap-1, 0x1.38e1fap-3 },
+ { 0x1.e94be4p-1, 0x1.3206dcp-3 },
+ { 0x1.e9e336p-1, 0x1.2b48dap-3 },
+ { 0x1.ea7730p-1, 0x1.24a7b8p-3 },
+ { 0x1.eb07e2p-1, 0x1.1e233ep-3 },
+ { 0x1.eb9558p-1, 0x1.17bb2cp-3 },
+ { 0x1.ec1fa2p-1, 0x1.116f48p-3 },
+ { 0x1.eca6ccp-1, 0x1.0b3f52p-3 },
+ { 0x1.ed2ae6p-1, 0x1.052b0cp-3 },
+ { 0x1.edabfcp-1, 0x1.fe6460p-4 },
+ { 0x1.ee2a1ep-1, 0x1.f2a902p-4 },
+ { 0x1.eea556p-1, 0x1.e72372p-4 },
+ { 0x1.ef1db4p-1, 0x1.dbd32ap-4 },
+ { 0x1.ef9344p-1, 0x1.d0b7a0p-4 },
+ { 0x1.f00614p-1, 0x1.c5d04ap-4 },
+ { 0x1.f07630p-1, 0x1.bb1c98p-4 },
+ { 0x1.f0e3a6p-1, 0x1.b09bfcp-4 },
+ { 0x1.f14e82p-1, 0x1.a64de6p-4 },
+ { 0x1.f1b6d0p-1, 0x1.9c31c6p-4 },
+ { 0x1.f21ca0p-1, 0x1.92470ap-4 },
+ { 0x1.f27ff8p-1, 0x1.888d1ep-4 },
+ { 0x1.f2e0eap-1, 0x1.7f036cp-4 },
+ { 0x1.f33f7ep-1, 0x1.75a960p-4 },
+ { 0x1.f39bc2p-1, 0x1.6c7e64p-4 },
+ { 0x1.f3f5c2p-1, 0x1.6381e2p-4 },
+ { 0x1.f44d88p-1, 0x1.5ab342p-4 },
+ { 0x1.f4a31ep-1, 0x1.5211ecp-4 },
+ { 0x1.f4f694p-1, 0x1.499d48p-4 },
+ { 0x1.f547f2p-1, 0x1.4154bcp-4 },
+ { 0x1.f59742p-1, 0x1.3937b2p-4 },
+ { 0x1.f5e490p-1, 0x1.31458ep-4 },
+ { 0x1.f62fe8p-1, 0x1.297dbap-4 },
+ { 0x1.f67952p-1, 0x1.21df9ap-4 },
+ { 0x1.f6c0dcp-1, 0x1.1a6a96p-4 },
+ { 0x1.f7068cp-1, 0x1.131e14p-4 },
+ { 0x1.f74a6ep-1, 0x1.0bf97ep-4 },
+ { 0x1.f78c8cp-1, 0x1.04fc3ap-4 },
+ { 0x1.f7cceep-1, 0x1.fc4b5ep-5 },
+ { 0x1.f80ba2p-1, 0x1.eeea8cp-5 },
+ { 0x1.f848acp-1, 0x1.e1d4d0p-5 },
+ { 0x1.f8841ap-1, 0x1.d508fap-5 },
+ { 0x1.f8bdf2p-1, 0x1.c885e0p-5 },
+ { 0x1.f8f63ep-1, 0x1.bc4a54p-5 },
+ { 0x1.f92d08p-1, 0x1.b05530p-5 },
+ { 0x1.f96256p-1, 0x1.a4a54ap-5 },
+ { 0x1.f99634p-1, 0x1.99397ap-5 },
+ { 0x1.f9c8a8p-1, 0x1.8e109cp-5 },
+ { 0x1.f9f9bap-1, 0x1.83298ep-5 },
+ { 0x1.fa2974p-1, 0x1.78832cp-5 },
+ { 0x1.fa57dep-1, 0x1.6e1c58p-5 },
+ { 0x1.fa84fep-1, 0x1.63f3f6p-5 },
+ { 0x1.fab0dep-1, 0x1.5a08e8p-5 },
+ { 0x1.fadb84p-1, 0x1.505a18p-5 },
+ { 0x1.fb04f6p-1, 0x1.46e66cp-5 },
+ { 0x1.fb2d40p-1, 0x1.3dacd2p-5 },
+ { 0x1.fb5464p-1, 0x1.34ac36p-5 },
+ { 0x1.fb7a6cp-1, 0x1.2be38cp-5 },
+ { 0x1.fb9f60p-1, 0x1.2351c2p-5 },
+ { 0x1.fbc344p-1, 0x1.1af5d2p-5 },
+ { 0x1.fbe61ep-1, 0x1.12ceb4p-5 },
+ { 0x1.fc07fap-1, 0x1.0adb60p-5 },
+ { 0x1.fc28d8p-1, 0x1.031ad6p-5 },
+ { 0x1.fc48c2p-1, 0x1.f7182ap-6 },
+ { 0x1.fc67bcp-1, 0x1.e85c44p-6 },
+ { 0x1.fc85d0p-1, 0x1.da0006p-6 },
+ { 0x1.fca2fep-1, 0x1.cc0180p-6 },
+ { 0x1.fcbf52p-1, 0x1.be5ecep-6 },
+ { 0x1.fcdaccp-1, 0x1.b1160ap-6 },
+ { 0x1.fcf576p-1, 0x1.a4255ap-6 },
+ { 0x1.fd0f54p-1, 0x1.978ae8p-6 },
+ { 0x1.fd286ap-1, 0x1.8b44e6p-6 },
+ { 0x1.fd40bep-1, 0x1.7f5188p-6 },
+ { 0x1.fd5856p-1, 0x1.73af0cp-6 },
+ { 0x1.fd6f34p-1, 0x1.685bb6p-6 },
+ { 0x1.fd8562p-1, 0x1.5d55ccp-6 },
+ { 0x1.fd9ae2p-1, 0x1.529b9ep-6 },
+ { 0x1.fdafb8p-1, 0x1.482b84p-6 },
+ { 0x1.fdc3e8p-1, 0x1.3e03d8p-6 },
+ { 0x1.fdd77ap-1, 0x1.3422fep-6 },
+ { 0x1.fdea6ep-1, 0x1.2a875cp-6 },
+ { 0x1.fdfcccp-1, 0x1.212f62p-6 },
+ { 0x1.fe0e96p-1, 0x1.181984p-6 },
+ { 0x1.fe1fd0p-1, 0x1.0f443ep-6 },
+ { 0x1.fe3080p-1, 0x1.06ae14p-6 },
+ { 0x1.fe40a6p-1, 0x1.fcab14p-7 },
+ { 0x1.fe504cp-1, 0x1.ec7262p-7 },
+ { 0x1.fe5f70p-1, 0x1.dcaf36p-7 },
+ { 0x1.fe6e18p-1, 0x1.cd5ecap-7 },
+ { 0x1.fe7c46p-1, 0x1.be7e5ap-7 },
+ { 0x1.fe8a00p-1, 0x1.b00b38p-7 },
+ { 0x1.fe9748p-1, 0x1.a202bep-7 },
+ { 0x1.fea422p-1, 0x1.94624ep-7 },
+ { 0x1.feb090p-1, 0x1.87275ep-7 },
+ { 0x1.febc96p-1, 0x1.7a4f6ap-7 },
+ { 0x1.fec836p-1, 0x1.6dd7fep-7 },
+ { 0x1.fed374p-1, 0x1.61beaep-7 },
+ { 0x1.fede52p-1, 0x1.56011cp-7 },
+ { 0x1.fee8d4p-1, 0x1.4a9cf6p-7 },
+ { 0x1.fef2fep-1, 0x1.3f8ff6p-7 },
+ { 0x1.fefccep-1, 0x1.34d7dcp-7 },
+ { 0x1.ff064cp-1, 0x1.2a727ap-7 },
+ { 0x1.ff0f76p-1, 0x1.205dacp-7 },
+ { 0x1.ff1852p-1, 0x1.169756p-7 },
+ { 0x1.ff20e0p-1, 0x1.0d1d6ap-7 },
+ { 0x1.ff2924p-1, 0x1.03ede2p-7 },
+ { 0x1.ff3120p-1, 0x1.f60d8ap-8 },
+ { 0x1.ff38d6p-1, 0x1.e4cc4ap-8 },
+ { 0x1.ff4048p-1, 0x1.d4143ap-8 },
+ { 0x1.ff4778p-1, 0x1.c3e1a6p-8 },
+ { 0x1.ff4e68p-1, 0x1.b430ecp-8 },
+ { 0x1.ff551ap-1, 0x1.a4fe84p-8 },
+ { 0x1.ff5b90p-1, 0x1.9646f4p-8 },
+ { 0x1.ff61ccp-1, 0x1.8806d8p-8 },
+ { 0x1.ff67d0p-1, 0x1.7a3adep-8 },
+ { 0x1.ff6d9ep-1, 0x1.6cdfccp-8 },
+ { 0x1.ff7338p-1, 0x1.5ff276p-8 },
+ { 0x1.ff789ep-1, 0x1.536fc2p-8 },
+ { 0x1.ff7dd4p-1, 0x1.4754acp-8 },
+ { 0x1.ff82dap-1, 0x1.3b9e40p-8 },
+ { 0x1.ff87b2p-1, 0x1.30499cp-8 },
+ { 0x1.ff8c5cp-1, 0x1.2553eep-8 },
+ { 0x1.ff90dcp-1, 0x1.1aba78p-8 },
+ { 0x1.ff9532p-1, 0x1.107a8cp-8 },
+ { 0x1.ff9960p-1, 0x1.06918cp-8 },
+ { 0x1.ff9d68p-1, 0x1.f9f9d0p-9 },
+ { 0x1.ffa14ap-1, 0x1.e77448p-9 },
+ { 0x1.ffa506p-1, 0x1.d58da6p-9 },
+ { 0x1.ffa8a0p-1, 0x1.c4412cp-9 },
+ { 0x1.ffac18p-1, 0x1.b38a3ap-9 },
+ { 0x1.ffaf6ep-1, 0x1.a36454p-9 },
+ { 0x1.ffb2a6p-1, 0x1.93cb12p-9 },
+ { 0x1.ffb5bep-1, 0x1.84ba30p-9 },
+ { 0x1.ffb8b8p-1, 0x1.762d84p-9 },
+ { 0x1.ffbb98p-1, 0x1.682100p-9 },
+ { 0x1.ffbe5ap-1, 0x1.5a90b0p-9 },
+ { 0x1.ffc102p-1, 0x1.4d78bcp-9 },
+ { 0x1.ffc390p-1, 0x1.40d564p-9 },
+ { 0x1.ffc606p-1, 0x1.34a306p-9 },
+ { 0x1.ffc862p-1, 0x1.28de12p-9 },
+ { 0x1.ffcaa8p-1, 0x1.1d8318p-9 },
+ { 0x1.ffccd8p-1, 0x1.128ebap-9 },
+ { 0x1.ffcef4p-1, 0x1.07fdb4p-9 },
+ { 0x1.ffd0fap-1, 0x1.fb99b8p-10 },
+ { 0x1.ffd2eap-1, 0x1.e7f232p-10 },
+ { 0x1.ffd4cap-1, 0x1.d4fed8p-10 },
+ { 0x1.ffd696p-1, 0x1.c2b9d0p-10 },
+ { 0x1.ffd84ep-1, 0x1.b11d70p-10 },
+ { 0x1.ffd9f8p-1, 0x1.a02436p-10 },
+ { 0x1.ffdb90p-1, 0x1.8fc8c8p-10 },
+ { 0x1.ffdd18p-1, 0x1.8005f0p-10 },
+ { 0x1.ffde90p-1, 0x1.70d6a4p-10 },
+ { 0x1.ffdffap-1, 0x1.6235fcp-10 },
+ { 0x1.ffe154p-1, 0x1.541f34p-10 },
+ { 0x1.ffe2a2p-1, 0x1.468daep-10 },
+ { 0x1.ffe3e2p-1, 0x1.397ceep-10 },
+ { 0x1.ffe514p-1, 0x1.2ce898p-10 },
+ { 0x1.ffe63cp-1, 0x1.20cc76p-10 },
+ { 0x1.ffe756p-1, 0x1.15246ep-10 },
+ { 0x1.ffe866p-1, 0x1.09ec86p-10 },
+ { 0x1.ffe96ap-1, 0x1.fe41cep-11 },
+ { 0x1.ffea64p-1, 0x1.e97ba4p-11 },
+ { 0x1.ffeb54p-1, 0x1.d57f52p-11 },
+ { 0x1.ffec3ap-1, 0x1.c245d4p-11 },
+ { 0x1.ffed16p-1, 0x1.afc85ep-11 },
+ { 0x1.ffedeap-1, 0x1.9e0058p-11 },
+ { 0x1.ffeeb4p-1, 0x1.8ce75ep-11 },
+ { 0x1.ffef76p-1, 0x1.7c7744p-11 },
+ { 0x1.fff032p-1, 0x1.6caa0ep-11 },
+ { 0x1.fff0e4p-1, 0x1.5d79ecp-11 },
+ { 0x1.fff18ep-1, 0x1.4ee142p-11 },
+ { 0x1.fff232p-1, 0x1.40daa4p-11 },
+ { 0x1.fff2d0p-1, 0x1.3360ccp-11 },
+ { 0x1.fff366p-1, 0x1.266ea8p-11 },
+ { 0x1.fff3f6p-1, 0x1.19ff46p-11 },
+ { 0x1.fff480p-1, 0x1.0e0de8p-11 },
+ { 0x1.fff504p-1, 0x1.0295f0p-11 },
+ { 0x1.fff582p-1, 0x1.ef25d4p-12 },
+ { 0x1.fff5fcp-1, 0x1.da0110p-12 },
+ { 0x1.fff670p-1, 0x1.c5b542p-12 },
+ { 0x1.fff6dep-1, 0x1.b23a5ap-12 },
+ { 0x1.fff74ap-1, 0x1.9f8894p-12 },
+ { 0x1.fff7aep-1, 0x1.8d986ap-12 },
+ { 0x1.fff810p-1, 0x1.7c629ap-12 },
+ { 0x1.fff86cp-1, 0x1.6be022p-12 },
+ { 0x1.fff8c6p-1, 0x1.5c0a38p-12 },
+ { 0x1.fff91cp-1, 0x1.4cda54p-12 },
+ { 0x1.fff96cp-1, 0x1.3e4a24p-12 },
+ { 0x1.fff9bap-1, 0x1.305390p-12 },
+ { 0x1.fffa04p-1, 0x1.22f0b4p-12 },
+ { 0x1.fffa4cp-1, 0x1.161be4p-12 },
+ { 0x1.fffa90p-1, 0x1.09cfa4p-12 },
+ { 0x1.fffad0p-1, 0x1.fc0d56p-13 },
+ { 0x1.fffb0ep-1, 0x1.e577bcp-13 },
+ { 0x1.fffb4ap-1, 0x1.cfd4a6p-13 },
+ { 0x1.fffb82p-1, 0x1.bb1a96p-13 },
+ { 0x1.fffbb8p-1, 0x1.a74068p-13 },
+ { 0x1.fffbecp-1, 0x1.943d4ap-13 },
+ { 0x1.fffc1ep-1, 0x1.8208bcp-13 },
+ { 0x1.fffc4ep-1, 0x1.709a8ep-13 },
+ { 0x1.fffc7ap-1, 0x1.5feadap-13 },
+ { 0x1.fffca6p-1, 0x1.4ff208p-13 },
+ { 0x1.fffccep-1, 0x1.40a8c2p-13 },
+ { 0x1.fffcf6p-1, 0x1.3207fcp-13 },
+ { 0x1.fffd1ap-1, 0x1.2408eap-13 },
+ { 0x1.fffd3ep-1, 0x1.16a502p-13 },
+ { 0x1.fffd60p-1, 0x1.09d5f8p-13 },
+ { 0x1.fffd80p-1, 0x1.fb2b7ap-14 },
+ { 0x1.fffda0p-1, 0x1.e3bcf4p-14 },
+ { 0x1.fffdbep-1, 0x1.cd5528p-14 },
+ { 0x1.fffddap-1, 0x1.b7e946p-14 },
+ { 0x1.fffdf4p-1, 0x1.a36eecp-14 },
+ { 0x1.fffe0ep-1, 0x1.8fdc1cp-14 },
+ { 0x1.fffe26p-1, 0x1.7d2738p-14 },
+ { 0x1.fffe3ep-1, 0x1.6b4702p-14 },
+ { 0x1.fffe54p-1, 0x1.5a329cp-14 },
+ { 0x1.fffe68p-1, 0x1.49e178p-14 },
+ { 0x1.fffe7ep-1, 0x1.3a4b60p-14 },
+ { 0x1.fffe90p-1, 0x1.2b6876p-14 },
+ { 0x1.fffea2p-1, 0x1.1d3120p-14 },
+ { 0x1.fffeb4p-1, 0x1.0f9e1cp-14 },
+ { 0x1.fffec4p-1, 0x1.02a868p-14 },
+ { 0x1.fffed4p-1, 0x1.ec929ap-15 },
+ { 0x1.fffee4p-1, 0x1.d4f4b4p-15 },
+ { 0x1.fffef2p-1, 0x1.be6abcp-15 },
+ { 0x1.ffff00p-1, 0x1.a8e8ccp-15 },
+ { 0x1.ffff0cp-1, 0x1.94637ep-15 },
+ { 0x1.ffff18p-1, 0x1.80cfdcp-15 },
+ { 0x1.ffff24p-1, 0x1.6e2368p-15 },
+ { 0x1.ffff30p-1, 0x1.5c540cp-15 },
+ { 0x1.ffff3ap-1, 0x1.4b581cp-15 },
+ { 0x1.ffff44p-1, 0x1.3b2652p-15 },
+ { 0x1.ffff4ep-1, 0x1.2bb5ccp-15 },
+ { 0x1.ffff56p-1, 0x1.1cfe02p-15 },
+ { 0x1.ffff60p-1, 0x1.0ef6c4p-15 },
+ { 0x1.ffff68p-1, 0x1.019842p-15 },
+ { 0x1.ffff70p-1, 0x1.e9b5e8p-16 },
+ { 0x1.ffff78p-1, 0x1.d16f58p-16 },
+ { 0x1.ffff7ep-1, 0x1.ba4f04p-16 },
+ { 0x1.ffff84p-1, 0x1.a447b8p-16 },
+ { 0x1.ffff8cp-1, 0x1.8f4cccp-16 },
+ { 0x1.ffff92p-1, 0x1.7b5224p-16 },
+ { 0x1.ffff98p-1, 0x1.684c22p-16 },
+ { 0x1.ffff9cp-1, 0x1.562facp-16 },
+ { 0x1.ffffa2p-1, 0x1.44f21ep-16 },
+ { 0x1.ffffa6p-1, 0x1.34894ap-16 },
+ { 0x1.ffffacp-1, 0x1.24eb72p-16 },
+ { 0x1.ffffb0p-1, 0x1.160f44p-16 },
+ { 0x1.ffffb4p-1, 0x1.07ebd2p-16 },
+ { 0x1.ffffb8p-1, 0x1.f4f12ep-17 },
+ { 0x1.ffffbcp-1, 0x1.db5ad0p-17 },
+ { 0x1.ffffc0p-1, 0x1.c304f0p-17 },
+ { 0x1.ffffc4p-1, 0x1.abe09ep-17 },
+ { 0x1.ffffc6p-1, 0x1.95df98p-17 },
+ { 0x1.ffffcap-1, 0x1.80f43ap-17 },
+ { 0x1.ffffccp-1, 0x1.6d1178p-17 },
+ { 0x1.ffffd0p-1, 0x1.5a2ae0p-17 },
+ { 0x1.ffffd2p-1, 0x1.483488p-17 },
+ { 0x1.ffffd4p-1, 0x1.372310p-17 },
+ { 0x1.ffffd6p-1, 0x1.26eb9ep-17 },
+ { 0x1.ffffd8p-1, 0x1.1783cep-17 },
+ { 0x1.ffffdcp-1, 0x1.08e1bap-17 },
+ { 0x1.ffffdep-1, 0x1.f5f7d8p-18 },
+ { 0x1.ffffdep-1, 0x1.db92b6p-18 },
+ { 0x1.ffffe0p-1, 0x1.c282cep-18 },
+ { 0x1.ffffe2p-1, 0x1.aab7acp-18 },
+ { 0x1.ffffe4p-1, 0x1.94219cp-18 },
+ { 0x1.ffffe6p-1, 0x1.7eb1a2p-18 },
+ { 0x1.ffffe8p-1, 0x1.6a5972p-18 },
+ { 0x1.ffffe8p-1, 0x1.570b6ap-18 },
+ { 0x1.ffffeap-1, 0x1.44ba86p-18 },
+ { 0x1.ffffeap-1, 0x1.335a62p-18 },
+ { 0x1.ffffecp-1, 0x1.22df2ap-18 },
+ { 0x1.ffffeep-1, 0x1.133d96p-18 },
+ { 0x1.ffffeep-1, 0x1.046aeap-18 },
+ { 0x1.fffff0p-1, 0x1.ecb9d0p-19 },
+ { 0x1.fffff0p-1, 0x1.d21398p-19 },
+ { 0x1.fffff2p-1, 0x1.b8d094p-19 },
+ { 0x1.fffff2p-1, 0x1.a0df10p-19 },
+ { 0x1.fffff2p-1, 0x1.8a2e26p-19 },
+ { 0x1.fffff4p-1, 0x1.74adc8p-19 },
+ { 0x1.fffff4p-1, 0x1.604ea8p-19 },
+ { 0x1.fffff4p-1, 0x1.4d0232p-19 },
+ { 0x1.fffff6p-1, 0x1.3aba86p-19 },
+ { 0x1.fffff6p-1, 0x1.296a70p-19 },
+ { 0x1.fffff6p-1, 0x1.190562p-19 },
+ { 0x1.fffff8p-1, 0x1.097f62p-19 },
+ { 0x1.fffff8p-1, 0x1.f59a20p-20 },
+ { 0x1.fffff8p-1, 0x1.d9c736p-20 },
+ { 0x1.fffff8p-1, 0x1.bf716cp-20 },
+ { 0x1.fffffap-1, 0x1.a6852cp-20 },
+ { 0x1.fffffap-1, 0x1.8eefd8p-20 },
+ { 0x1.fffffap-1, 0x1.789fb8p-20 },
+ { 0x1.fffffap-1, 0x1.6383f8p-20 },
+ { 0x1.fffffap-1, 0x1.4f8c96p-20 },
+ { 0x1.fffffap-1, 0x1.3caa62p-20 },
+ { 0x1.fffffcp-1, 0x1.2acee2p-20 },
+ { 0x1.fffffcp-1, 0x1.19ec60p-20 },
+ { 0x1.fffffcp-1, 0x1.09f5d0p-20 },
+ { 0x1.fffffcp-1, 0x1.f5bd96p-21 },
+ { 0x1.fffffcp-1, 0x1.d9371ep-21 },
+ { 0x1.fffffcp-1, 0x1.be41dep-21 },
+ { 0x1.fffffcp-1, 0x1.a4c89ep-21 },
+ { 0x1.fffffcp-1, 0x1.8cb738p-21 },
+ { 0x1.fffffep-1, 0x1.75fa8ep-21 },
+ { 0x1.fffffep-1, 0x1.608078p-21 },
+ { 0x1.fffffep-1, 0x1.4c37c0p-21 },
+ { 0x1.fffffep-1, 0x1.39100ep-21 },
+ { 0x1.fffffep-1, 0x1.26f9e0p-21 },
+ { 0x1.fffffep-1, 0x1.15e682p-21 },
+ { 0x1.fffffep-1, 0x1.05c804p-21 },
+ { 0x1.fffffep-1, 0x1.ed2254p-22 },
+ { 0x1.fffffep-1, 0x1.d06ad6p-22 },
+ { 0x1.fffffep-1, 0x1.b551c8p-22 },
+ { 0x1.fffffep-1, 0x1.9bc0a0p-22 },
+ { 0x1.fffffep-1, 0x1.83a200p-22 },
+ { 0x1.fffffep-1, 0x1.6ce1aap-22 },
+ { 0x1.fffffep-1, 0x1.576c72p-22 },
+ { 0x1.fffffep-1, 0x1.43302cp-22 },
+ { 0x1.fffffep-1, 0x1.301ba2p-22 },
+ { 0x1.fffffep-1, 0x1.1e1e86p-22 },
+ { 0x1.fffffep-1, 0x1.0d2966p-22 },
+ { 0x1.000000p+0, 0x1.fa5b50p-23 },
+ { 0x1.000000p+0, 0x1.dc3ae4p-23 },
+ { 0x1.000000p+0, 0x1.bfd756p-23 },
+ { 0x1.000000p+0, 0x1.a517dap-23 },
+ { 0x1.000000p+0, 0x1.8be4f8p-23 },
+ { 0x1.000000p+0, 0x1.74287ep-23 },
+ { 0x1.000000p+0, 0x1.5dcd66p-23 },
+ { 0x1.000000p+0, 0x1.48bfd4p-23 },
+ { 0x1.000000p+0, 0x1.34ecf8p-23 },
+ { 0x1.000000p+0, 0x1.224310p-23 },
+ { 0x1.000000p+0, 0x1.10b148p-23 },
+ },
+};
diff --git a/pl/math/erfinv_24u5.c b/pl/math/erfinv_24u5.c
new file mode 100644
index 000000000000..20e1e361befc
--- /dev/null
+++ b/pl/math/erfinv_24u5.c
@@ -0,0 +1,81 @@
+/*
+ * Double-precision inverse error function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "math_config.h"
+#include "poly_scalar_f64.h"
+#include "pl_sig.h"
+#define IGNORE_SCALAR_FENV
+#include "pl_test.h"
+
+const static struct
+{
+ /* P_N and Q_N hold the numerator and denominator coefficients taken from
+ table N of Blair et al. Tables 17, 37 and 57 are used by erfinv for
+ |x| <= 0.75, 0.75 < |x| <= 0.9375 and the remaining tail, respectively. */
+ double P_17[7], Q_17[7], P_37[8], Q_37[8], P_57[9], Q_57[10];
+} data = {
+ .P_17 = { 0x1.007ce8f01b2e8p+4, -0x1.6b23cc5c6c6d7p+6, 0x1.74e5f6ceb3548p+7,
+ -0x1.5200bb15cc6bbp+7, 0x1.05d193233a849p+6, -0x1.148c5474ee5e1p+3,
+ 0x1.689181bbafd0cp-3 },
+ .Q_17 = { 0x1.d8fb0f913bd7bp+3, -0x1.6d7f25a3f1c24p+6, 0x1.a450d8e7f4cbbp+7,
+ -0x1.bc3480485857p+7, 0x1.ae6b0c504ee02p+6, -0x1.499dfec1a7f5fp+4,
+ 0x1p+0 },
+ .P_37 = { -0x1.f3596123109edp-7, 0x1.60b8fe375999ep-2, -0x1.779bb9bef7c0fp+1,
+ 0x1.786ea384470a2p+3, -0x1.6a7c1453c85d3p+4, 0x1.31f0fc5613142p+4,
+ -0x1.5ea6c007d4dbbp+2, 0x1.e66f265ce9e5p-3 },
+ .Q_37 = { -0x1.636b2dcf4edbep-7, 0x1.0b5411e2acf29p-2, -0x1.3413109467a0bp+1,
+ 0x1.563e8136c554ap+3, -0x1.7b77aab1dcafbp+4, 0x1.8a3e174e05ddcp+4,
+ -0x1.4075c56404eecp+3, 0x1p+0 },
+ .P_57 = { 0x1.b874f9516f7f1p-14, 0x1.5921f2916c1c4p-7, 0x1.145ae7d5b8fa4p-2,
+ 0x1.29d6dcc3b2fb7p+1, 0x1.cabe2209a7985p+2, 0x1.11859f0745c4p+3,
+ 0x1.b7ec7bc6a2ce5p+2, 0x1.d0419e0bb42aep+1, 0x1.c5aa03eef7258p-1 },
+ .Q_57 = { 0x1.b8747e12691f1p-14, 0x1.59240d8ed1e0ap-7, 0x1.14aef2b181e2p-2,
+ 0x1.2cd181bcea52p+1, 0x1.e6e63e0b7aa4cp+2, 0x1.65cf8da94aa3ap+3,
+ 0x1.7e5c787b10a36p+3, 0x1.0626d68b6cea3p+3, 0x1.065c5f193abf6p+2,
+ 0x1p+0 }
+};
+
+/* Inverse error function, evaluated with the rational approximations of
+ J. M. Blair, C. A. Edwards, and J. H. Johnson,
+ "Rational Chebyshev approximations for the inverse of the error function",
+ Math. Comp. 30, pp. 827--830 (1976).
+ https://doi.org/10.1090/S0025-5718-1976-0421040-7
+ No explicit check for |x| >= 1 or NaN: such inputs fall through to the tail.
+ Largest observed error is 24.46 ULP, in the extreme tail:
+ erfinv(0x1.fd9504351b757p-1) got 0x1.ff72c1092917p+0
+ want 0x1.ff72c10929158p+0. */
+double
+erfinv (double x)
+{
+ double a = fabs (x);
+
+ if (a <= 0.75)
+ {
+ /* Largest observed error in this region is 6.06 ULP:
+ erfinv(0x1.1884650fd2d41p-2) got 0x1.fb65998cbd3fep-3
+ want 0x1.fb65998cbd404p-3. */
+ double t = x * x - 0.5625;
+ return x * horner_6_f64 (t, data.P_17) / horner_6_f64 (t, data.Q_17);
+ }
+
+ if (a <= 0.9375)
+ {
+ /* Largest observed error in this region is 6.95 ULP:
+ erfinv(0x1.a8d65b94d8c6p-1) got 0x1.f08325591b54p-1
+ want 0x1.f08325591b547p-1. */
+ double t = x * x - 0.87890625;
+ return x * horner_7_f64 (t, data.P_37) / horner_7_f64 (t, data.Q_37);
+ }
+
+ double t = 1.0 / (sqrt (-log (1 - a)));
+ return horner_8_f64 (t, data.P_57)
+ / (copysign (t, x) * horner_9_f64 (t, data.Q_57));
+}
+
+PL_SIG (S, D, 1, erfinv, -0.99, 0.99)
+PL_TEST_ULP (erfinv, 24.0)
+PL_TEST_INTERVAL (erfinv, 0, 1, 40000)
+PL_TEST_INTERVAL (erfinv, -0x1p-1022, -1, 40000)
diff --git a/pl/math/erfinvf_4u7.c b/pl/math/erfinvf_4u7.c
new file mode 100644
index 000000000000..40736da08be8
--- /dev/null
+++ b/pl/math/erfinvf_4u7.c
@@ -0,0 +1,74 @@
+/*
+ * Single-precision inverse error function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "poly_scalar_f32.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+const static struct
+{
+ /* P_N and Q_N hold the numerator and denominator coefficients taken from
+ table N of Blair et al. Tables 10, 29 and 50 are used by erfinvf for
+ |x| <= 0.75, 0.75 < |x| < 0.9375 and the remaining tail, respectively. */
+ float P_10[3], Q_10[4], P_29[4], Q_29[4], P_50[6], Q_50[3];
+} data = { .P_10 = { -0x1.a31268p+3, 0x1.ac9048p+4, -0x1.293ff6p+3 },
+ .Q_10 = { -0x1.8265eep+3, 0x1.ef5eaep+4, -0x1.12665p+4, 0x1p+0 },
+ .P_29
+ = { -0x1.fc0252p-4, 0x1.119d44p+0, -0x1.f59ee2p+0, 0x1.b13626p-2 },
+ .Q_29 = { -0x1.69952p-4, 0x1.c7b7d2p-1, -0x1.167d7p+1, 0x1p+0 },
+ .P_50 = { 0x1.3d8948p-3, 0x1.61f9eap+0, 0x1.61c6bcp-1,
+ -0x1.20c9f2p+0, 0x1.5c704cp-1, -0x1.50c6bep-3 },
+ .Q_50 = { 0x1.3d7dacp-3, 0x1.629e5p+0, 0x1p+0 } };
+
+/* Inverse error function, evaluated with the rational approximations of
+ J. M. Blair, C. A. Edwards, and J. H. Johnson,
+ "Rational Chebyshev approximations for the inverse of the error function",
+ Math. Comp. 30, pp. 827--830 (1976).
+ https://doi.org/10.1090/S0025-5718-1976-0421040-7
+ x == +/-1 goes to __math_oflowf and |x| > 1 to __math_invalidf.
+ Largest error is 4.71 ULP, in the tail region:
+ erfinvf(0x1.f84e9ap-1) got 0x1.b8326ap+0
+ want 0x1.b83274p+0. */
+float
+erfinvf (float x)
+{
+ if (x == 1.0f)
+ return __math_oflowf (0);
+ if (x == -1.0f)
+ return __math_oflowf (1);
+
+ float a = fabsf (x);
+ if (a > 1.0f)
+ return __math_invalidf (x);
+
+ if (a <= 0.75f)
+ {
+ /* Greatest error in this region is 4.60 ULP:
+ erfinvf(0x1.0a98bap-5) got 0x1.d8a93ep-6
+ want 0x1.d8a948p-6. */
+ float t = x * x - 0.5625f;
+ return x * horner_2_f32 (t, data.P_10) / horner_3_f32 (t, data.Q_10);
+ }
+ if (a < 0.9375f)
+ {
+ /* Greatest error in this region is 3.79 ULP:
+ erfinvf(0x1.ac82d6p-1) got 0x1.f8fc54p-1
+ want 0x1.f8fc5cp-1. */
+ float t = x * x - 0.87890625f;
+ return x * horner_3_f32 (t, data.P_29) / horner_3_f32 (t, data.Q_29);
+ }
+
+ /* Tail region, where error is greatest (and sensitive to the sqrt and log1p
+ implementations). */
+ float t = 1.0 / sqrtf (-log1pf (-a));
+ return horner_5_f32 (t, data.P_50)
+ / (copysignf (t, x) * horner_2_f32 (t, data.Q_50));
+}
+
+PL_SIG (S, F, 1, erfinv, -0.99, 0.99)
+PL_TEST_ULP (erfinvf, 4.09)
+PL_TEST_SYM_INTERVAL (erfinvf, 0, 1, 40000)
diff --git a/pl/math/erfinvl.c b/pl/math/erfinvl.c
new file mode 100644
index 000000000000..ea4aadfccd00
--- /dev/null
+++ b/pl/math/erfinvl.c
@@ -0,0 +1,114 @@
+/*
+ * Extended precision inverse error function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#define _GNU_SOURCE
+#include <math.h>
+#include <stdbool.h>
+#include <float.h>
+
+#include "math_config.h"
+#include "poly_scalar_f64.h"
+
+#define SQRT_PIl 0x1.c5bf891b4ef6aa79c3b0520d5db9p0l
+#define HF_SQRT_PIl 0x1.c5bf891b4ef6aa79c3b0520d5db9p-1l
+
+const static struct
+{
+ /* P_N and Q_N hold the numerator and denominator coefficients taken from
+ table N of Blair et al. Tables 17, 37 and 57 are used by __erfinv for
+ |x| <= 0.75, 0.75 < |x| <= 0.9375 and the remaining tail, respectively. */
+ double P_17[7], Q_17[7], P_37[8], Q_37[8], P_57[9], Q_57[10];
+} data = {
+ .P_17 = { 0x1.007ce8f01b2e8p+4, -0x1.6b23cc5c6c6d7p+6, 0x1.74e5f6ceb3548p+7,
+ -0x1.5200bb15cc6bbp+7, 0x1.05d193233a849p+6, -0x1.148c5474ee5e1p+3,
+ 0x1.689181bbafd0cp-3 },
+ .Q_17 = { 0x1.d8fb0f913bd7bp+3, -0x1.6d7f25a3f1c24p+6, 0x1.a450d8e7f4cbbp+7,
+ -0x1.bc3480485857p+7, 0x1.ae6b0c504ee02p+6, -0x1.499dfec1a7f5fp+4,
+ 0x1p+0 },
+ .P_37 = { -0x1.f3596123109edp-7, 0x1.60b8fe375999ep-2, -0x1.779bb9bef7c0fp+1,
+ 0x1.786ea384470a2p+3, -0x1.6a7c1453c85d3p+4, 0x1.31f0fc5613142p+4,
+ -0x1.5ea6c007d4dbbp+2, 0x1.e66f265ce9e5p-3 },
+ .Q_37 = { -0x1.636b2dcf4edbep-7, 0x1.0b5411e2acf29p-2, -0x1.3413109467a0bp+1,
+ 0x1.563e8136c554ap+3, -0x1.7b77aab1dcafbp+4, 0x1.8a3e174e05ddcp+4,
+ -0x1.4075c56404eecp+3, 0x1p+0 },
+ .P_57 = { 0x1.b874f9516f7f1p-14, 0x1.5921f2916c1c4p-7, 0x1.145ae7d5b8fa4p-2,
+ 0x1.29d6dcc3b2fb7p+1, 0x1.cabe2209a7985p+2, 0x1.11859f0745c4p+3,
+ 0x1.b7ec7bc6a2ce5p+2, 0x1.d0419e0bb42aep+1, 0x1.c5aa03eef7258p-1 },
+ .Q_57 = { 0x1.b8747e12691f1p-14, 0x1.59240d8ed1e0ap-7, 0x1.14aef2b181e2p-2,
+ 0x1.2cd181bcea52p+1, 0x1.e6e63e0b7aa4cp+2, 0x1.65cf8da94aa3ap+3,
+ 0x1.7e5c787b10a36p+3, 0x1.0626d68b6cea3p+3, 0x1.065c5f193abf6p+2,
+ 0x1p+0 }
+};
+
+/* Double-precision inverse error function, evaluated with the rational
+ approximations of J. M. Blair, C. A. Edwards, and J. H. Johnson,
+ "Rational Chebyshev approximations for the inverse of the error function",
+ Math. Comp. 30, pp. 827--830 (1976).
+ https://doi.org/10.1090/S0025-5718-1976-0421040-7.
+ x == +/-1 goes to __math_oflow and |x| > 1 to __math_invalid. */
+static inline double
+__erfinv (double x)
+{
+ if (x == 1.0)
+ return __math_oflow (0);
+ if (x == -1.0)
+ return __math_oflow (1);
+
+ double a = fabs (x);
+ if (a > 1)
+ return __math_invalid (x);
+
+ if (a <= 0.75)
+ {
+ double t = x * x - 0.5625;
+ return x * horner_6_f64 (t, data.P_17) / horner_6_f64 (t, data.Q_17);
+ }
+
+ if (a <= 0.9375)
+ {
+ double t = x * x - 0.87890625;
+ return x * horner_7_f64 (t, data.P_37) / horner_7_f64 (t, data.Q_37);
+ }
+
+ double t = 1.0 / (sqrtl (-log1pl (-a))); /* Long double, narrowed to double. */
+ return horner_8_f64 (t, data.P_57)
+ / (copysign (t, x) * horner_9_f64 (t, data.Q_57));
+}
+
+/* Extended-precision variant, which uses the above (or asymptotic estimate) as
+ starting point for Newton refinement. This implementation is a port to C of
+ the version in the SpecialFunctions.jl Julia package, with relaxed stopping
+ criteria for the Newton refinement. */
+long double
+erfinvl (long double x)
+{
+ if (x == 0)
+ return 0;
+
+ double yf = __erfinv (x);
+ long double y;
+ if (isfinite (yf))
+ y = yf;
+ else
+ {
+ /* Double overflowed, use asymptotic estimate instead. */
+ y = copysignl (sqrtl (-logl (1.0l - fabsl (x)) * SQRT_PIl), x);
+ if (!isfinite (y))
+ return y;
+ }
+
+ double eps = fabs (yf - nextafter (yf, 0));
+ while (true)
+ {
+ long double dy = HF_SQRT_PIl * (erfl (y) - x) * exp (y * y);
+ y -= dy;
+ /* Stopping criterion is different to Julia implementation, but is enough
+ to ensure result is accurate when rounded to double-precision. */
+ if (fabsl (dy) < eps)
+ break;
+ }
+ return y;
+}
diff --git a/pl/math/estrin.h b/pl/math/estrin.h
deleted file mode 100644
index f967fb0475b0..000000000000
--- a/pl/math/estrin.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * Helper macros for double-precision Estrin polynomial evaluation.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-#if V_SUPPORTED
-#define FMA v_fma_f64
-#else
-#define FMA fma
-#endif
-
-#include "estrin_wrap.h"
diff --git a/pl/math/estrin_wrap.h b/pl/math/estrin_wrap.h
deleted file mode 100644
index 2ae07001f2cf..000000000000
--- a/pl/math/estrin_wrap.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Helper macros for double-precision Estrin polynomial evaluation.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-// clang-format off
-#define ESTRIN_1_(x, c, i) FMA(x, c(1 + i), c(i))
-#define ESTRIN_2_(x, x2, c, i) FMA(x2, c(2 + i), ESTRIN_1_(x, c, i))
-#define ESTRIN_3_(x, x2, c, i) FMA(x2, ESTRIN_1_(x, c, 2 + i), ESTRIN_1_(x, c, i))
-#define ESTRIN_4_(x, x2, x4, c, i) FMA(x4, c(4 + i), ESTRIN_3_(x, x2, c, i))
-#define ESTRIN_5_(x, x2, x4, c, i) FMA(x4, ESTRIN_1_(x, c, 4 + i), ESTRIN_3_(x, x2, c, i))
-#define ESTRIN_6_(x, x2, x4, c, i) FMA(x4, ESTRIN_2_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i))
-#define ESTRIN_7_(x, x2, x4, c, i) FMA(x4, ESTRIN_3_(x, x2, c, 4 + i), ESTRIN_3_(x, x2, c, i))
-#define ESTRIN_8_(x, x2, x4, x8, c, i) FMA(x8, c(8 + i), ESTRIN_7_(x, x2, x4, c, i))
-#define ESTRIN_9_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_1_(x, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
-#define ESTRIN_10_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_2_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
-#define ESTRIN_11_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_3_(x, x2, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
-#define ESTRIN_12_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_4_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
-#define ESTRIN_13_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_5_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
-#define ESTRIN_14_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_6_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
-#define ESTRIN_15_(x, x2, x4, x8, c, i) FMA(x8, ESTRIN_7_(x, x2, x4, c, 8 + i), ESTRIN_7_(x, x2, x4, c, i))
-#define ESTRIN_16_(x, x2, x4, x8, x16, c, i) FMA(x16, c(16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
-#define ESTRIN_17_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_1_(x, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
-#define ESTRIN_18_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_2_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
-#define ESTRIN_19_(x, x2, x4, x8, x16, c, i) FMA(x16, ESTRIN_3_(x, x2, c, 16 + i), ESTRIN_15_(x, x2, x4, x8, c, i))
-
-#define ESTRIN_1(x, c) ESTRIN_1_(x, c, 0)
-#define ESTRIN_2(x, x2, c) ESTRIN_2_(x, x2, c, 0)
-#define ESTRIN_3(x, x2, c) ESTRIN_3_(x, x2, c, 0)
-#define ESTRIN_4(x, x2, x4, c) ESTRIN_4_(x, x2, x4, c, 0)
-#define ESTRIN_5(x, x2, x4, c) ESTRIN_5_(x, x2, x4, c, 0)
-#define ESTRIN_6(x, x2, x4, c) ESTRIN_6_(x, x2, x4, c, 0)
-#define ESTRIN_7(x, x2, x4, c) ESTRIN_7_(x, x2, x4, c, 0)
-#define ESTRIN_8(x, x2, x4, x8, c) ESTRIN_8_(x, x2, x4, x8, c, 0)
-#define ESTRIN_9(x, x2, x4, x8, c) ESTRIN_9_(x, x2, x4, x8, c, 0)
-#define ESTRIN_10(x, x2, x4, x8, c) ESTRIN_10_(x, x2, x4, x8, c, 0)
-#define ESTRIN_11(x, x2, x4, x8, c) ESTRIN_11_(x, x2, x4, x8, c, 0)
-#define ESTRIN_12(x, x2, x4, x8, c) ESTRIN_12_(x, x2, x4, x8, c, 0)
-#define ESTRIN_13(x, x2, x4, x8, c) ESTRIN_13_(x, x2, x4, x8, c, 0)
-#define ESTRIN_14(x, x2, x4, x8, c) ESTRIN_14_(x, x2, x4, x8, c, 0)
-#define ESTRIN_15(x, x2, x4, x8, c) ESTRIN_15_(x, x2, x4, x8, c, 0)
-#define ESTRIN_16(x, x2, x4, x8, x16, c) ESTRIN_16_(x, x2, x4, x8, x16, c, 0)
-#define ESTRIN_17(x, x2, x4, x8, x16, c) ESTRIN_17_(x, x2, x4, x8, x16, c, 0)
-#define ESTRIN_18(x, x2, x4, x8, x16, c) ESTRIN_18_(x, x2, x4, x8, x16, c, 0)
-#define ESTRIN_19(x, x2, x4, x8, x16, c) ESTRIN_19_(x, x2, x4, x8, x16, c, 0)
-// clang-format on
diff --git a/pl/math/estrinf.h b/pl/math/estrinf.h
deleted file mode 100644
index 175233c6c799..000000000000
--- a/pl/math/estrinf.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Helper macros for single-precision Estrin polynomial evaluation.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#if V_SUPPORTED
-#define FMA v_fma_f32
-#else
-#define FMA fmaf
-#endif
-
-#include "estrin_wrap.h"
diff --git a/pl/math/expf.c b/pl/math/expf.c
index c325e45d5cc6..cd3cfa925c64 100644
--- a/pl/math/expf.c
+++ b/pl/math/expf.c
@@ -59,8 +59,8 @@ optr_aor_exp_f32 (float x)
/* Round and convert z to int, the result is in [-150*N, 128*N] and
ideally nearest int is used, otherwise the magnitude of r can be
bigger which gives larger approximation error. */
- kd = roundtoint (z);
- ki = converttoint (z);
+ kd = round (z);
+ ki = lround (z);
r = z - kd;
/* exp(x) = 2^(k/N) * 2^(r/N) ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */
diff --git a/pl/math/expm1_2u5.c b/pl/math/expm1_2u5.c
index a3faff70cb62..f7d431198614 100644
--- a/pl/math/expm1_2u5.c
+++ b/pl/math/expm1_2u5.c
@@ -5,7 +5,7 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "estrin.h"
+#include "poly_scalar_f64.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
@@ -14,14 +14,14 @@
#define Ln2hi 0x1.62e42fefa39efp-1
#define Ln2lo 0x1.abc9e3b39803fp-56
#define Shift 0x1.8p52
-#define TinyBound \
- 0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */
-#define BigBound 0x1.63108c75a1937p+9 /* Above which expm1(x) overflows. */
-#define NegBound -0x1.740bf7c0d927dp+9 /* Below which expm1(x) rounds to 1. */
+/* 0x1p-51, below which expm1(x) is within 2 ULP of x. */
+#define TinyBound 0x3cc0000000000000
+/* Above which expm1(x) overflows. */
+#define BigBound 0x1.63108c75a1937p+9
+/* Below which expm1(x) rounds to -1. */
+#define NegBound -0x1.740bf7c0d927dp+9
#define AbsMask 0x7fffffffffffffff
-#define C(i) __expm1_poly[i]
-
/* Approximation for exp(x) - 1 using polynomial on a reduced interval.
The maximum error observed error is 2.17 ULP:
expm1(0x1.63f90a866748dp-2) got 0x1.a9af56603878ap-2
@@ -65,7 +65,7 @@ expm1 (double x)
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
double f2 = f * f;
double f4 = f2 * f2;
- double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f);
+ double p = fma (f2, estrin_10_f64 (f, f2, f4, f4 * f4, __expm1_poly), f);
/* Assemble the result, using a slight rearrangement to achieve acceptable
accuracy.
@@ -78,8 +78,7 @@ expm1 (double x)
PL_SIG (S, D, 1, expm1, -9.9, 9.9)
PL_TEST_ULP (expm1, 1.68)
-PL_TEST_INTERVAL (expm1, 0, 0x1p-51, 1000)
-PL_TEST_INTERVAL (expm1, -0, -0x1p-51, 1000)
+PL_TEST_SYM_INTERVAL (expm1, 0, 0x1p-51, 1000)
PL_TEST_INTERVAL (expm1, 0x1p-51, 0x1.63108c75a1937p+9, 100000)
PL_TEST_INTERVAL (expm1, -0x1p-51, -0x1.740bf7c0d927dp+9, 100000)
PL_TEST_INTERVAL (expm1, 0x1.63108c75a1937p+9, inf, 100)
diff --git a/pl/math/expm1f_1u6.c b/pl/math/expm1f_1u6.c
index 70b14e48519d..e12c9ba9a8a2 100644
--- a/pl/math/expm1f_1u6.c
+++ b/pl/math/expm1f_1u6.c
@@ -5,7 +5,7 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "hornerf.h"
+#include "poly_scalar_f32.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
@@ -20,8 +20,6 @@
#define NegLimit \
(-0x1.9bbabcp+6) /* Largest value of x for which expm1(x) rounds to 1. */
-#define C(i) __expm1f_poly[i]
-
/* Approximation for exp(x) - 1 using polynomial on a reduced interval.
The maximum error is 1.51 ULP:
expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2
@@ -62,7 +60,7 @@ expm1f (float x)
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- float p = fmaf (f * f, HORNER_4 (f, C), f);
+ float p = fmaf (f * f, horner_4_f32 (f, __expm1f_poly), f);
/* Assemble the result, using a slight rearrangement to achieve acceptable
accuracy.
expm1(x) ~= 2^i * (p + 1) - 1
@@ -74,7 +72,8 @@ expm1f (float x)
PL_SIG (S, F, 1, expm1, -9.9, 9.9)
PL_TEST_ULP (expm1f, 1.02)
-PL_TEST_INTERVAL (expm1f, 0, 0x1p-23, 1000)
-PL_TEST_INTERVAL (expm1f, -0, -0x1p-23, 1000)
+PL_TEST_SYM_INTERVAL (expm1f, 0, 0x1p-23, 1000)
PL_TEST_INTERVAL (expm1f, 0x1p-23, 0x1.644716p6, 100000)
+PL_TEST_INTERVAL (expm1f, 0x1.644716p6, inf, 1000)
PL_TEST_INTERVAL (expm1f, -0x1p-23, -0x1.9bbabcp+6, 100000)
+PL_TEST_INTERVAL (expm1f, -0x1.9bbabcp+6, -inf, 1000)
diff --git a/pl/math/finite_pow.h b/pl/math/finite_pow.h
new file mode 100644
index 000000000000..8944d4fae625
--- /dev/null
+++ b/pl/math/finite_pow.h
@@ -0,0 +1,365 @@
+/*
+ * Double-precision x^y function.
+ *
+ * Copyright (c) 2018-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Scalar version of pow used for fallbacks in vector implementations. */
+
+/* Data is defined in v_pow_log_data.c. */
+#define N_LOG (1 << V_POW_LOG_TABLE_BITS)
+#define Off 0x3fe6955500000000
+#define As __v_pow_log_data.poly
+
+/* Data is defined in v_pow_exp_data.c. */
+#define N_EXP (1 << V_POW_EXP_TABLE_BITS)
+#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
+#define SmallExp 0x3c9 /* top12(0x1p-54). */
+#define BigExp 0x408 /* top12(512.0). */
+#define ThresExp 0x03f /* BigExp - SmallExp. */
+#define InvLn2N __v_pow_exp_data.n_over_ln2
+#define Ln2HiN __v_pow_exp_data.ln2_over_n_hi
+#define Ln2LoN __v_pow_exp_data.ln2_over_n_lo
+#define SBits __v_pow_exp_data.sbits
+#define Cs __v_pow_exp_data.poly
+
+/* Constants associated with pow. */
+#define SmallPowX 0x001 /* top12(0x1p-1022). */
+#define BigPowX 0x7ff /* top12(INFINITY). */
+#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */
+#define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */
+#define BigPowY 0x43e /* top12(0x1.749p62). */
+#define ThresPowY 0x080 /* BigPowY - SmallPowY. */
+
+/* Top 12 bits of a double (sign and exponent bits). */
+static inline uint32_t
+top12 (double x)
+{
+ return asuint64 (x) >> 52;
+}
+
+/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
+ 15 additional bits of precision. IX is the bit representation of x, but
+ normalized in the subnormal range using the sign bit for the exponent. */
+static inline double
+log_inline (uint64_t ix, double *tail)
+{
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ uint64_t tmp = ix - Off;
+ int i = (tmp >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1);
+ int k = (int64_t) tmp >> 52; /* arithmetic shift. */
+ uint64_t iz = ix - (tmp & 0xfffULL << 52);
+ double z = asdouble (iz);
+ double kd = (double) k;
+
+ /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
+ double invc = __v_pow_log_data.invc[i];
+ double logc = __v_pow_log_data.logc[i];
+ double logctail = __v_pow_log_data.logctail[i];
+
+ /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
+ |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
+ double r = fma (z, invc, -1.0);
+
+ /* k*Ln2 + log(c) + r. */
+ double t1 = kd * __v_pow_log_data.ln2_hi + logc;
+ double t2 = t1 + r;
+ double lo1 = kd * __v_pow_log_data.ln2_lo + logctail;
+ double lo2 = t1 - t2 + r;
+
+ /* Evaluation is optimized assuming superscalar pipelined execution. */
+ double ar = As[0] * r;
+ double ar2 = r * ar;
+ double ar3 = r * ar2;
+ /* k*Ln2 + log(c) + r + A[0]*r*r. */
+ double hi = t2 + ar2;
+ double lo3 = fma (ar, r, -ar2);
+ double lo4 = t2 - hi + ar2;
+ /* p = log1p(r) - r - A[0]*r*r. */
+ double p = (ar3
+ * (As[1] + r * As[2]
+ + ar2 * (As[3] + r * As[4] + ar2 * (As[5] + r * As[6]))));
+ double lo = lo1 + lo2 + lo3 + lo4 + p;
+ double y = hi + lo;
+ *tail = hi - y + lo;
+ return y;
+}
+
+/* Handle cases that may overflow or underflow when computing the result that
+ is scale*(1+TMP) without intermediate rounding. The bit representation of
+ scale is in SBITS, however it has a computed exponent that may have
+ overflowed into the sign bit so that needs to be adjusted before using it as
+ a double. (int32_t)KI is the k used in the argument reduction and exponent
+ adjustment of scale, positive k here means the result may overflow and
+ negative k means the result may underflow. */
+static inline double
+special_case (double tmp, uint64_t sbits, uint64_t ki)
+{
+ double scale, y;
+
+ if ((ki & 0x80000000) == 0)
+ {
+ /* k > 0, the exponent of scale might have overflowed by <= 460. */
+ sbits -= 1009ull << 52;
+ scale = asdouble (sbits);
+ y = 0x1p1009 * (scale + scale * tmp);
+ return check_oflow (eval_as_double (y));
+ }
+ /* k < 0, need special care in the subnormal range. */
+ sbits += 1022ull << 52;
+ /* Note: sbits is signed scale. */
+ scale = asdouble (sbits);
+ y = scale + scale * tmp;
+#if WANT_SIMD_EXCEPT
+ if (fabs (y) < 1.0)
+ {
+ /* Round y to the right precision before scaling it into the subnormal
+ range to avoid double rounding that can cause 0.5+E/2 ulp error where
+ E is the worst-case ulp error outside the subnormal range. So this
+ is only useful if the goal is better than 1 ulp worst-case error. */
+ double hi, lo, one = 1.0;
+ if (y < 0.0)
+ one = -1.0;
+ lo = scale - y + scale * tmp;
+ hi = one + y;
+ lo = one - hi + y + lo;
+ y = eval_as_double (hi + lo) - one;
+ /* Fix the sign of 0. */
+ if (y == 0.0)
+ y = asdouble (sbits & 0x8000000000000000);
+ /* The underflow exception needs to be signaled explicitly. */
+ force_eval_double (opt_barrier_double (0x1p-1022) * 0x1p-1022);
+ }
+#endif
+ y = 0x1p-1022 * y;
+ return check_uflow (eval_as_double (y));
+}
+
+/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
+ The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */
+static inline double
+exp_inline (double x, double xtail, uint32_t sign_bias)
+{
+ uint32_t abstop = top12 (x) & 0x7ff;
+ if (unlikely (abstop - SmallExp >= ThresExp))
+ {
+ if (abstop - SmallExp >= 0x80000000)
+ {
+ /* Avoid spurious underflow for tiny x. */
+ /* Note: 0 is common input. */
+ return sign_bias ? -1.0 : 1.0;
+ }
+ if (abstop >= top12 (1024.0))
+ {
+ /* Note: inf and nan are already handled. */
+ /* Skip errno handling. */
+#if WANT_SIMD_EXCEPT
+ return asuint64 (x) >> 63 ? __math_uflow (sign_bias)
+ : __math_oflow (sign_bias);
+#else
+ double res_uoflow = asuint64 (x) >> 63 ? 0.0 : INFINITY;
+ return sign_bias ? -res_uoflow : res_uoflow;
+#endif
+ }
+ /* Large x is special cased below. */
+ abstop = 0;
+ }
+
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
+ double z = InvLn2N * x;
+ double kd = round (z);
+ uint64_t ki = lround (z);
+ double r = x - kd * Ln2HiN - kd * Ln2LoN;
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r += xtail;
+ /* 2^(k/N) ~= scale. */
+ uint64_t idx = ki & (N_EXP - 1);
+ uint64_t top = (ki + sign_bias) << (52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ uint64_t sbits = SBits[idx] + top;
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
+ /* Evaluation is optimized assuming superscalar pipelined execution. */
+ double r2 = r * r;
+ double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]);
+ if (unlikely (abstop == 0))
+ return special_case (tmp, sbits, ki);
+ double scale = asdouble (sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ return eval_as_double (scale + scale * tmp);
+}
+
+/* Computes exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
+ A version of exp_inline that is not inlined and for which sign_bias is
+ equal to 0. */
+static double NOINLINE
+exp_nosignbias (double x, double xtail)
+{
+ uint32_t abstop = top12 (x) & 0x7ff;
+ if (unlikely (abstop - SmallExp >= ThresExp))
+ {
+ /* Avoid spurious underflow for tiny x. */
+ if (abstop - SmallExp >= 0x80000000)
+ return 1.0;
+ /* Note: inf and nan are already handled. */
+ if (abstop >= top12 (1024.0))
+#if WANT_SIMD_EXCEPT
+ return asuint64 (x) >> 63 ? __math_uflow (0) : __math_oflow (0);
+#else
+ return asuint64 (x) >> 63 ? 0.0 : INFINITY;
+#endif
+ /* Large x is special cased below. */
+ abstop = 0;
+ }
+
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
+ double z = InvLn2N * x;
+ double kd = round (z);
+ uint64_t ki = lround (z);
+ double r = x - kd * Ln2HiN - kd * Ln2LoN;
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r += xtail;
+ /* 2^(k/N) ~= scale. */
+ uint64_t idx = ki & (N_EXP - 1);
+ uint64_t top = ki << (52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ uint64_t sbits = SBits[idx] + top;
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */
+ double r2 = r * r;
+ double tmp = r + r2 * Cs[0] + r * r2 * (Cs[1] + r * Cs[2]);
+ if (unlikely (abstop == 0))
+ return special_case (tmp, sbits, ki);
+ double scale = asdouble (sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ return eval_as_double (scale + scale * tmp);
+}
+
+/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
+ the bit representation of a non-zero finite floating-point value. */
+static inline int
+checkint (uint64_t iy)
+{
+ int e = iy >> 52 & 0x7ff;
+ if (e < 0x3ff)
+ return 0;
+ if (e > 0x3ff + 52)
+ return 2;
+ if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))
+ return 0;
+ if (iy & (1ULL << (0x3ff + 52 - e)))
+ return 1;
+ return 2;
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan. */
+static inline int
+zeroinfnan (uint64_t i)
+{
+ return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1;
+}
+
+static double NOINLINE
+__pl_finite_pow (double x, double y)
+{
+ uint32_t sign_bias = 0;
+ uint64_t ix, iy;
+ uint32_t topx, topy;
+
+ ix = asuint64 (x);
+ iy = asuint64 (y);
+ topx = top12 (x);
+ topy = top12 (y);
+ if (unlikely (topx - SmallPowX >= ThresPowX
+ || (topy & 0x7ff) - SmallPowY >= ThresPowY))
+ {
+ /* Note: if |y| > 1075 * ln2 * 2^53 ~= 0x1.749p62 then pow(x,y) = inf/0
+ and if |y| < 2^-54 / 1075 ~= 0x1.e7b6p-65 then pow(x,y) = +-1. */
+ /* Special cases: (x < 0x1p-1022 or inf or nan) or
+ (|y| < 0x1p-65 or |y| >= 0x1p63 or nan). */
+ if (unlikely (zeroinfnan (iy)))
+ {
+ if (2 * iy == 0)
+ return issignaling_inline (x) ? x + y : 1.0;
+ if (ix == asuint64 (1.0))
+ return issignaling_inline (y) ? x + y : 1.0;
+ if (2 * ix > 2 * asuint64 (INFINITY)
+ || 2 * iy > 2 * asuint64 (INFINITY))
+ return x + y;
+ if (2 * ix == 2 * asuint64 (1.0))
+ return 1.0;
+ if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63))
+ return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */
+ return y * y;
+ }
+ if (unlikely (zeroinfnan (ix)))
+ {
+ double x2 = x * x;
+ if (ix >> 63 && checkint (iy) == 1)
+ {
+ x2 = -x2;
+ sign_bias = 1;
+ }
+#if WANT_SIMD_EXCEPT
+ if (2 * ix == 0 && iy >> 63)
+ return __math_divzero (sign_bias);
+#endif
+ /* Without the barrier some versions of clang hoist the 1/x2 and
+ thus division by zero exception can be signaled spuriously. */
+ return iy >> 63 ? opt_barrier_double (1 / x2) : x2;
+ }
+ /* Here x and y are non-zero finite. */
+ if (ix >> 63)
+ {
+ /* Finite x < 0. */
+ int yint = checkint (iy);
+ if (yint == 0)
+#if WANT_SIMD_EXCEPT
+ return __math_invalid (x);
+#else
+ return __builtin_nan ("");
+#endif
+ if (yint == 1)
+ sign_bias = SignBias;
+ ix &= 0x7fffffffffffffff;
+ topx &= 0x7ff;
+ }
+ if ((topy & 0x7ff) - SmallPowY >= ThresPowY)
+ {
+ /* Note: sign_bias == 0 here because y is not odd. */
+ if (ix == asuint64 (1.0))
+ return 1.0;
+ /* |y| < 2^-65, x^y ~= 1 + y*log(x). */
+ if ((topy & 0x7ff) < SmallPowY)
+ return 1.0;
+#if WANT_SIMD_EXCEPT
+ return (ix > asuint64 (1.0)) == (topy < 0x800) ? __math_oflow (0)
+ : __math_uflow (0);
+#else
+ return (ix > asuint64 (1.0)) == (topy < 0x800) ? INFINITY : 0;
+#endif
+ }
+ if (topx == 0)
+ {
+ /* Normalize subnormal x so exponent becomes negative. */
+ /* Without the barrier some versions of clang evaluate the mul
+ unconditionally causing spurious overflow exceptions. */
+ ix = asuint64 (opt_barrier_double (x) * 0x1p52);
+ ix &= 0x7fffffffffffffff;
+ ix -= 52ULL << 52;
+ }
+ }
+
+ double lo;
+ double hi = log_inline (ix, &lo);
+ double ehi = y * hi;
+ double elo = y * lo + fma (y, hi, -ehi);
+ return exp_inline (ehi, elo, sign_bias);
+}
diff --git a/pl/math/horner.h b/pl/math/horner.h
deleted file mode 100644
index f92ab6752110..000000000000
--- a/pl/math/horner.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Helper macros for single-precision Horner polynomial evaluation.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#if V_SUPPORTED
-#define FMA v_fma_f64
-#else
-#define FMA fma
-#endif
-
-#include "horner_wrap.h"
diff --git a/pl/math/horner_wrap.h b/pl/math/horner_wrap.h
deleted file mode 100644
index 6478968db913..000000000000
--- a/pl/math/horner_wrap.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Helper macros for Horner polynomial evaluation.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-// clang-format off
-#define HORNER_1_(x, c, i) FMA(c(i + 1), x, c(i))
-#define HORNER_2_(x, c, i) FMA(HORNER_1_ (x, c, i + 1), x, c(i))
-#define HORNER_3_(x, c, i) FMA(HORNER_2_ (x, c, i + 1), x, c(i))
-#define HORNER_4_(x, c, i) FMA(HORNER_3_ (x, c, i + 1), x, c(i))
-#define HORNER_5_(x, c, i) FMA(HORNER_4_ (x, c, i + 1), x, c(i))
-#define HORNER_6_(x, c, i) FMA(HORNER_5_ (x, c, i + 1), x, c(i))
-#define HORNER_7_(x, c, i) FMA(HORNER_6_ (x, c, i + 1), x, c(i))
-#define HORNER_8_(x, c, i) FMA(HORNER_7_ (x, c, i + 1), x, c(i))
-#define HORNER_9_(x, c, i) FMA(HORNER_8_ (x, c, i + 1), x, c(i))
-#define HORNER_10_(x, c, i) FMA(HORNER_9_ (x, c, i + 1), x, c(i))
-#define HORNER_11_(x, c, i) FMA(HORNER_10_(x, c, i + 1), x, c(i))
-#define HORNER_12_(x, c, i) FMA(HORNER_11_(x, c, i + 1), x, c(i))
-
-#define HORNER_1(x, c) HORNER_1_ (x, c, 0)
-#define HORNER_2(x, c) HORNER_2_ (x, c, 0)
-#define HORNER_3(x, c) HORNER_3_ (x, c, 0)
-#define HORNER_4(x, c) HORNER_4_ (x, c, 0)
-#define HORNER_5(x, c) HORNER_5_ (x, c, 0)
-#define HORNER_6(x, c) HORNER_6_ (x, c, 0)
-#define HORNER_7(x, c) HORNER_7_ (x, c, 0)
-#define HORNER_8(x, c) HORNER_8_ (x, c, 0)
-#define HORNER_9(x, c) HORNER_9_ (x, c, 0)
-#define HORNER_10(x, c) HORNER_10_(x, c, 0)
-#define HORNER_11(x, c) HORNER_11_(x, c, 0)
-#define HORNER_12(x, c) HORNER_12_(x, c, 0)
-// clang-format on
diff --git a/pl/math/hornerf.h b/pl/math/hornerf.h
deleted file mode 100644
index 0703817b0fbb..000000000000
--- a/pl/math/hornerf.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Helper macros for double-precision Horner polynomial evaluation.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#if V_SUPPORTED
-#define FMA v_fma_f32
-#else
-#define FMA fmaf
-#endif
-
-#include "horner_wrap.h"
diff --git a/pl/math/include/mathlib.h b/pl/math/include/mathlib.h
index af5f9f9c6afb..f886e7f8c07a 100644
--- a/pl/math/include/mathlib.h
+++ b/pl/math/include/mathlib.h
@@ -1,4 +1,3 @@
-// clang-format off
/*
* Public API.
*
@@ -9,155 +8,84 @@
#ifndef _MATHLIB_H
#define _MATHLIB_H
+float acosf (float);
float acoshf (float);
+float asinf (float);
float asinhf (float);
float atan2f (float, float);
float atanf (float);
float atanhf (float);
float cbrtf (float);
float coshf (float);
+float cospif (float);
float erfcf (float);
float erff (float);
+float erfinvf (float);
+float exp10f (float);
float expm1f (float);
float log10f (float);
float log1pf (float);
float sinhf (float);
+float sinpif (float);
float tanf (float);
float tanhf (float);
+double acos (double);
double acosh (double);
+double asin (double);
double asinh (double);
double atan (double);
double atan2 (double, double);
double atanh (double);
double cbrt (double);
double cosh (double);
+double cospi (double);
double erfc (double);
+double erfinv (double);
+double exp10 (double);
double expm1 (double);
double log10 (double);
double log1p (double);
double sinh (double);
+double sinpi (double);
double tanh (double);
-float __s_acoshf (float);
-float __s_asinhf (float);
-float __s_atanf (float);
-float __s_atan2f (float, float);
-float __s_atanhf (float);
-float __s_cbrtf (float);
-float __s_coshf (float);
-float __s_erfcf (float);
-float __s_erff (float);
-float __s_expm1f (float);
-float __s_log10f (float);
-float __s_log1pf (float);
-float __s_log2f (float);
-float __s_sinhf (float);
-float __s_tanf (float);
-float __s_tanhf (float);
-
-double __s_acosh (double);
-double __s_asinh (double);
-double __s_atan (double);
-double __s_atan2 (double, double);
-double __s_atanh (double);
-double __s_cbrt (double);
-double __s_cosh (double);
-double __s_erf (double);
-double __s_erfc (double);
-double __s_expm1 (double);
-double __s_log10 (double);
-double __s_log1p (double);
-double __s_log2 (double);
-double __s_sinh (double);
-double __s_tan (double);
-double __s_tanh (double);
+long double cospil (long double);
+long double erfinvl (long double);
+long double exp10l (long double);
+long double sinpil (long double);
#if __aarch64__
-#if __GNUC__ >= 5
+# if __GNUC__ >= 5
typedef __Float32x4_t __f32x4_t;
typedef __Float64x2_t __f64x2_t;
-#elif __clang_major__*100+__clang_minor__ >= 305
-typedef __attribute__((__neon_vector_type__(4))) float __f32x4_t;
-typedef __attribute__((__neon_vector_type__(2))) double __f64x2_t;
-#else
-#error Unsupported compiler
-#endif
+# elif __clang_major__ * 100 + __clang_minor__ >= 305
+typedef __attribute__ ((__neon_vector_type__ (4))) float __f32x4_t;
+typedef __attribute__ ((__neon_vector_type__ (2))) double __f64x2_t;
+# else
+# error Unsupported compiler
+# endif
-/* Vector functions following the base PCS. */
-__f32x4_t __v_acoshf (__f32x4_t);
-__f64x2_t __v_acosh (__f64x2_t);
-__f32x4_t __v_asinhf (__f32x4_t);
-__f64x2_t __v_asinh (__f64x2_t);
-__f32x4_t __v_atanf (__f32x4_t);
-__f64x2_t __v_atan (__f64x2_t);
-__f32x4_t __v_atan2f (__f32x4_t, __f32x4_t);
-__f64x2_t __v_atan2 (__f64x2_t, __f64x2_t);
-__f32x4_t __v_atanhf (__f32x4_t);
-__f64x2_t __v_atanh (__f64x2_t);
-__f32x4_t __v_cbrtf (__f32x4_t);
-__f64x2_t __v_cbrt (__f64x2_t);
-__f32x4_t __v_coshf (__f32x4_t);
-__f64x2_t __v_cosh (__f64x2_t);
-__f32x4_t __v_erff (__f32x4_t);
-__f64x2_t __v_erf (__f64x2_t);
-__f32x4_t __v_erfcf (__f32x4_t);
-__f64x2_t __v_erfc (__f64x2_t);
-__f32x4_t __v_expm1f (__f32x4_t);
-__f64x2_t __v_expm1 (__f64x2_t);
-__f32x4_t __v_log10f (__f32x4_t);
-__f64x2_t __v_log10 (__f64x2_t);
-__f32x4_t __v_log1pf (__f32x4_t);
-__f64x2_t __v_log1p (__f64x2_t);
-__f32x4_t __v_log2f (__f32x4_t);
-__f64x2_t __v_log2 (__f64x2_t);
-__f32x4_t __v_sinhf (__f32x4_t);
-__f64x2_t __v_sinh (__f64x2_t);
-__f32x4_t __v_tanf (__f32x4_t);
-__f64x2_t __v_tan (__f64x2_t);
-__f32x4_t __v_tanhf (__f32x4_t);
-__f64x2_t __v_tanh (__f64x2_t);
+# if __GNUC__ >= 9 || __clang_major__ >= 8
+# define __vpcs __attribute__ ((__aarch64_vector_pcs__))
-#if __GNUC__ >= 9 || __clang_major__ >= 8
-#define __vpcs __attribute__((__aarch64_vector_pcs__))
+typedef struct __f32x4x2_t
+{
+ __f32x4_t val[2];
+} __f32x4x2_t;
-/* Vector functions following the vector PCS. */
-__vpcs __f32x4_t __vn_acoshf (__f32x4_t);
-__vpcs __f64x2_t __vn_acosh (__f64x2_t);
-__vpcs __f32x4_t __vn_asinhf (__f32x4_t);
-__vpcs __f64x2_t __vn_asinh (__f64x2_t);
-__vpcs __f32x4_t __vn_atanf (__f32x4_t);
-__vpcs __f64x2_t __vn_atan (__f64x2_t);
-__vpcs __f32x4_t __vn_atan2f (__f32x4_t, __f32x4_t);
-__vpcs __f64x2_t __vn_atan2 (__f64x2_t, __f64x2_t);
-__vpcs __f32x4_t __vn_atanhf (__f32x4_t);
-__vpcs __f64x2_t __vn_atanh (__f64x2_t);
-__vpcs __f32x4_t __vn_cbrtf (__f32x4_t);
-__vpcs __f64x2_t __vn_cbrt (__f64x2_t);
-__vpcs __f32x4_t __vn_coshf (__f32x4_t);
-__vpcs __f64x2_t __vn_cosh (__f64x2_t);
-__vpcs __f32x4_t __vn_erff (__f32x4_t);
-__vpcs __f64x2_t __vn_erf (__f64x2_t);
-__vpcs __f32x4_t __vn_erfcf (__f32x4_t);
-__vpcs __f64x2_t __vn_erfc (__f64x2_t);
-__vpcs __f32x4_t __vn_expm1f (__f32x4_t);
-__vpcs __f64x2_t __vn_expm1 (__f64x2_t);
-__vpcs __f32x4_t __vn_log10f (__f32x4_t);
-__vpcs __f64x2_t __vn_log10 (__f64x2_t);
-__vpcs __f32x4_t __vn_log1pf (__f32x4_t);
-__vpcs __f64x2_t __vn_log1p (__f64x2_t);
-__vpcs __f32x4_t __vn_log2f (__f32x4_t);
-__vpcs __f64x2_t __vn_log2 (__f64x2_t);
-__vpcs __f32x4_t __vn_sinhf (__f32x4_t);
-__vpcs __f64x2_t __vn_sinh (__f64x2_t);
-__vpcs __f32x4_t __vn_tanf (__f32x4_t);
-__vpcs __f64x2_t __vn_tan (__f64x2_t);
-__vpcs __f32x4_t __vn_tanhf (__f32x4_t);
-__vpcs __f64x2_t __vn_tanh (__f64x2_t);
+typedef struct __f64x2x2_t
+{
+ __f64x2_t val[2];
+} __f64x2x2_t;
/* Vector functions following the vector PCS using ABI names. */
__vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t);
@@ -168,77 +96,111 @@ __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t);
+__vpcs __f32x4x2_t _ZGVnN4v_cexpif (__f32x4_t);
+__vpcs __f64x2x2_t _ZGVnN2v_cexpi (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_cospif (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_cospi (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_erfinvf (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_erfinv (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t);
+__vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
+__vpcs __f32x4_t _ZGVnN4v_sinpif (__f32x4_t);
+__vpcs __f64x2_t _ZGVnN2v_sinpi (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
__vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t);
+__vpcs void _ZGVnN4vl4l4_sincosf (__f32x4_t, __f32x4_t *, __f32x4_t *);
+__vpcs void _ZGVnN2vl8l8_sincos (__f64x2_t, __f64x2_t *, __f64x2_t *);
-#endif
+# endif
-#if WANT_SVE_MATH
-#include <arm_sve.h>
-svfloat32_t __sv_atan2f_x (svfloat32_t, svfloat32_t, svbool_t);
-svfloat32_t __sv_atanf_x (svfloat32_t, svbool_t);
-svfloat64_t __sv_atan_x (svfloat64_t, svbool_t);
-svfloat64_t __sv_atan2_x (svfloat64_t, svfloat64_t, svbool_t);
-svfloat32_t __sv_cosf_x (svfloat32_t, svbool_t);
-svfloat64_t __sv_cos_x (svfloat64_t, svbool_t);
-svfloat32_t __sv_erff_x (svfloat32_t, svbool_t);
-svfloat64_t __sv_erf_x (svfloat64_t, svbool_t);
-svfloat64_t __sv_erfc_x (svfloat64_t, svbool_t);
-svfloat32_t __sv_expf_x (svfloat32_t, svbool_t);
-svfloat32_t __sv_logf_x (svfloat32_t, svbool_t);
-svfloat64_t __sv_log_x (svfloat64_t, svbool_t);
-svfloat32_t __sv_log10f_x (svfloat32_t, svbool_t);
-svfloat64_t __sv_log10_x (svfloat64_t, svbool_t);
-svfloat32_t __sv_log2f_x (svfloat32_t, svbool_t);
-svfloat64_t __sv_log2_x (svfloat64_t, svbool_t);
-svfloat32_t __sv_powif_x (svfloat32_t, svint32_t, svbool_t);
-svfloat64_t __sv_powi_x (svfloat64_t, svint64_t, svbool_t);
-svfloat32_t __sv_sinf_x (svfloat32_t, svbool_t);
-svfloat64_t __sv_sin_x (svfloat64_t, svbool_t);
-svfloat32_t __sv_tanf_x (svfloat32_t, svbool_t);
-/* SVE ABI names. */
+# if WANT_SVE_MATH
+# include <arm_sve.h>
+svfloat32_t _ZGVsMxv_acoshf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_acosh (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_acosf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_acos (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_asinhf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_asinh (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_asinf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_asin (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_atanhf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_atanh (svfloat64_t, svbool_t);
svfloat32_t _ZGVsMxvv_atan2f (svfloat32_t, svfloat32_t, svbool_t);
svfloat32_t _ZGVsMxv_atanf (svfloat32_t, svbool_t);
svfloat64_t _ZGVsMxv_atan (svfloat64_t, svbool_t);
svfloat64_t _ZGVsMxvv_atan2 (svfloat64_t, svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_cbrtf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_cbrt (svfloat64_t, svbool_t);
+svfloat32x2_t _ZGVsMxv_cexpif (svfloat32_t, svbool_t);
+svfloat64x2_t _ZGVsMxv_cexpi (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_coshf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_cosh (svfloat64_t, svbool_t);
svfloat32_t _ZGVsMxv_cosf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_cospif (svfloat32_t, svbool_t);
svfloat64_t _ZGVsMxv_cos (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_cospi (svfloat64_t, svbool_t);
svfloat32_t _ZGVsMxv_erff (svfloat32_t, svbool_t);
svfloat64_t _ZGVsMxv_erf (svfloat64_t, svbool_t);
svfloat64_t _ZGVsMxv_erfc (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_erfcf (svfloat32_t, svbool_t);
svfloat32_t _ZGVsMxv_expf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_exp (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_exp10f (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_exp10 (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_exp2f (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_exp2 (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_expm1f (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_expm1 (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxvv_hypotf (svfloat32_t, svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxvv_hypot (svfloat64_t, svfloat64_t, svbool_t);
svfloat32_t _ZGVsMxv_logf (svfloat32_t, svbool_t);
svfloat64_t _ZGVsMxv_log (svfloat64_t, svbool_t);
svfloat32_t _ZGVsMxv_log10f (svfloat32_t, svbool_t);
svfloat64_t _ZGVsMxv_log10 (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_log1pf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_log1p (svfloat64_t, svbool_t);
svfloat32_t _ZGVsMxv_log2f (svfloat32_t, svbool_t);
svfloat64_t _ZGVsMxv_log2 (svfloat64_t, svbool_t);
-svfloat32_t _ZGVsMxvv_powi(svfloat32_t, svint32_t, svbool_t);
-svfloat64_t _ZGVsMxvv_powk(svfloat64_t, svint64_t, svbool_t);
+svfloat32_t _ZGVsMxvv_powi (svfloat32_t, svint32_t, svbool_t);
+svfloat64_t _ZGVsMxvv_powk (svfloat64_t, svint64_t, svbool_t);
+svfloat32_t _ZGVsMxvv_powf (svfloat32_t, svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxvv_pow (svfloat64_t, svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_sinhf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_sinh (svfloat64_t, svbool_t);
svfloat32_t _ZGVsMxv_sinf (svfloat32_t, svbool_t);
+svfloat32_t _ZGVsMxv_sinpif (svfloat32_t, svbool_t);
svfloat64_t _ZGVsMxv_sin (svfloat64_t, svbool_t);
+svfloat64_t _ZGVsMxv_sinpi (svfloat64_t, svbool_t);
+svfloat32_t _ZGVsMxv_tanhf (svfloat32_t, svbool_t);
+svfloat64_t _ZGVsMxv_tanh (svfloat64_t, svbool_t);
svfloat32_t _ZGVsMxv_tanf (svfloat32_t, svbool_t);
-#endif
+svfloat64_t _ZGVsMxv_tan (svfloat64_t, svbool_t);
+void _ZGVsMxvl4l4_sincosf (svfloat32_t, float *, float *, svbool_t);
+void _ZGVsMxvl8l8_sincos (svfloat64_t, double *, double *, svbool_t);
+# endif
#endif
#endif
-// clang-format on
diff --git a/pl/math/include/pl_test.h b/pl/math/include/pl_test.h
index 6a81360ba287..3a3407e337b8 100644
--- a/pl/math/include/pl_test.h
+++ b/pl/math/include/pl_test.h
@@ -10,11 +10,6 @@
/* Emit max ULP threshold - silenced for building the routine. */
#define PL_TEST_ULP(f, l)
-/* Emit alias. The PL_TEST_ALIAS declaration is piggy-backed on top of
- strong_alias. Use PL_ALIAS instead of strong_alias to make sure the alias is
- also added to the test suite. */
-#define PL_ALIAS(a, b) strong_alias (a, b)
-
/* Emit routine name if e == 1 and f is expected to correctly trigger fenv
exceptions. e allows declaration to be emitted conditionally upon certain
build flags - defer expansion by one pass to allow those flags to be expanded
@@ -23,4 +18,7 @@
#define PL_TEST_EXPECT_FENV_ALWAYS(f)
#define PL_TEST_INTERVAL(f, lo, hi, n)
+#define PL_TEST_SYM_INTERVAL(f, lo, hi, n)
#define PL_TEST_INTERVAL_C(f, lo, hi, n, c)
+#define PL_TEST_SYM_INTERVAL_C(f, lo, hi, n, c)
+#define PL_TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n)
diff --git a/pl/math/log1p_2u.c b/pl/math/log1p_2u.c
index 23c8ed4a1914..f9491ce52b44 100644
--- a/pl/math/log1p_2u.c
+++ b/pl/math/log1p_2u.c
@@ -5,7 +5,7 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "estrin.h"
+#include "poly_scalar_f64.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
@@ -21,7 +21,6 @@
#define Rt2MOne 0x3fda827999fcef32
#define AbsMask 0x7fffffffffffffff
#define ExpM63 0x3c00
-#define C(i) __log1p_data.coeffs[i]
static inline double
eval_poly (double f)
@@ -29,7 +28,7 @@ eval_poly (double f)
double f2 = f * f;
double f4 = f2 * f2;
double f8 = f4 * f4;
- return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C);
+ return estrin_18_f64 (f, f2, f4, f8, f8 * f8, __log1p_data.coeffs);
}
/* log1p approximation using polynomial on reduced interval. Largest
@@ -126,11 +125,7 @@ log1p (double x)
PL_SIG (S, D, 1, log1p, -0.9, 10.0)
PL_TEST_ULP (log1p, 1.26)
-PL_TEST_INTERVAL (log1p, -10.0, 10.0, 10000)
-PL_TEST_INTERVAL (log1p, 0.0, 0x1p-23, 50000)
-PL_TEST_INTERVAL (log1p, 0x1p-23, 0.001, 50000)
-PL_TEST_INTERVAL (log1p, 0.001, 1.0, 50000)
-PL_TEST_INTERVAL (log1p, 0.0, -0x1p-23, 50000)
-PL_TEST_INTERVAL (log1p, -0x1p-23, -0.001, 50000)
-PL_TEST_INTERVAL (log1p, -0.001, -1.0, 50000)
-PL_TEST_INTERVAL (log1p, -1.0, inf, 5000)
+PL_TEST_SYM_INTERVAL (log1p, 0.0, 0x1p-23, 50000)
+PL_TEST_SYM_INTERVAL (log1p, 0x1p-23, 0.001, 50000)
+PL_TEST_SYM_INTERVAL (log1p, 0.001, 1.0, 50000)
+PL_TEST_SYM_INTERVAL (log1p, 1.0, inf, 5000)
diff --git a/pl/math/log1pf_2u1.c b/pl/math/log1pf_2u1.c
index fcfd05a6fcb7..e99174853720 100644
--- a/pl/math/log1pf_2u1.c
+++ b/pl/math/log1pf_2u1.c
@@ -5,7 +5,7 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "hornerf.h"
+#include "poly_scalar_f32.h"
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
@@ -53,7 +53,7 @@ eval_poly (float m, uint32_t e)
x + C1 * x^2 + C2 * x^3 + C3 * x^4 + ...
Hence approximation has the form m + m^2 * P(m)
where P(x) = C1 + C2 * x + C3 * x^2 + ... . */
- return fmaf (m, m * HORNER_8 (m, C), m);
+ return fmaf (m, m * horner_8_f32 (m, __log1pf_data.coeffs), m);
#else
#error No log1pf approximation exists with the requested precision. Options are 13 or 25.
@@ -155,11 +155,7 @@ log1pf (float x)
PL_SIG (S, F, 1, log1p, -0.9, 10.0)
PL_TEST_ULP (log1pf, 1.52)
-PL_TEST_INTERVAL (log1pf, -10.0, 10.0, 10000)
-PL_TEST_INTERVAL (log1pf, 0.0, 0x1p-23, 50000)
-PL_TEST_INTERVAL (log1pf, 0x1p-23, 0.001, 50000)
-PL_TEST_INTERVAL (log1pf, 0.001, 1.0, 50000)
-PL_TEST_INTERVAL (log1pf, 0.0, -0x1p-23, 50000)
-PL_TEST_INTERVAL (log1pf, -0x1p-23, -0.001, 50000)
-PL_TEST_INTERVAL (log1pf, -0.001, -1.0, 50000)
-PL_TEST_INTERVAL (log1pf, -1.0, inf, 5000)
+PL_TEST_SYM_INTERVAL (log1pf, 0.0, 0x1p-23, 50000)
+PL_TEST_SYM_INTERVAL (log1pf, 0x1p-23, 0.001, 50000)
+PL_TEST_SYM_INTERVAL (log1pf, 0.001, 1.0, 50000)
+PL_TEST_SYM_INTERVAL (log1pf, 1.0, inf, 5000)
diff --git a/pl/math/math_config.h b/pl/math/math_config.h
index dccb3ce4c775..c3dd8f2db8c7 100644
--- a/pl/math/math_config.h
+++ b/pl/math/math_config.h
@@ -13,9 +13,9 @@
#ifndef WANT_ROUNDING
/* If defined to 1, return correct results for special cases in non-nearest
- rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than -0.0f).
- This may be set to 0 if there is no fenv support or if math functions only
- get called in round to nearest mode. */
+ rounding modes (logf (1.0f) returns 0.0f with FE_DOWNWARD rather than
+ -0.0f). This may be set to 0 if there is no fenv support or if math
+ functions only get called in round to nearest mode. */
# define WANT_ROUNDING 1
#endif
#ifndef WANT_ERRNO
@@ -27,33 +27,34 @@
#ifndef WANT_SIMD_EXCEPT
/* If defined to 1, trigger fp exceptions in vector routines, consistently with
behaviour expected from the corresponding scalar routine. */
-#define WANT_SIMD_EXCEPT 0
+# define WANT_SIMD_EXCEPT 0
#endif
/* Compiler can inline round as a single instruction. */
#ifndef HAVE_FAST_ROUND
# if __aarch64__
-# define HAVE_FAST_ROUND 1
+# define HAVE_FAST_ROUND 1
# else
-# define HAVE_FAST_ROUND 0
+# define HAVE_FAST_ROUND 0
# endif
#endif
/* Compiler can inline lround, but not (long)round(x). */
#ifndef HAVE_FAST_LROUND
-# if __aarch64__ && (100*__GNUC__ + __GNUC_MINOR__) >= 408 && __NO_MATH_ERRNO__
-# define HAVE_FAST_LROUND 1
+# if __aarch64__ && (100 * __GNUC__ + __GNUC_MINOR__) >= 408 \
+ && __NO_MATH_ERRNO__
+# define HAVE_FAST_LROUND 1
# else
-# define HAVE_FAST_LROUND 0
+# define HAVE_FAST_LROUND 0
# endif
#endif
/* Compiler can inline fma as a single instruction. */
#ifndef HAVE_FAST_FMA
# if defined FP_FAST_FMA || __aarch64__
-# define HAVE_FAST_FMA 1
+# define HAVE_FAST_FMA 1
# else
-# define HAVE_FAST_FMA 0
+# define HAVE_FAST_FMA 0
# endif
#endif
@@ -62,9 +63,9 @@
to interpose math functions with both static and dynamic linking. */
#ifndef USE_GLIBC_ABI
# if __GNUC__
-# define USE_GLIBC_ABI 1
+# define USE_GLIBC_ABI 1
# else
-# define USE_GLIBC_ABI 0
+# define USE_GLIBC_ABI 0
# endif
#endif
@@ -76,15 +77,15 @@
# define likely(x) __builtin_expect (!!(x), 1)
# define unlikely(x) __builtin_expect (x, 0)
# if __GNUC__ >= 9
-# define attribute_copy(f) __attribute__ ((copy (f)))
+# define attribute_copy(f) __attribute__ ((copy (f)))
# else
-# define attribute_copy(f)
+# define attribute_copy(f)
# endif
-# define strong_alias(f, a) \
- extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f);
-# define hidden_alias(f, a) \
- extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \
- attribute_copy (f);
+# define strong_alias(f, a) \
+ extern __typeof (f) a __attribute__ ((alias (#f))) attribute_copy (f);
+# define hidden_alias(f, a) \
+ extern __typeof (f) a __attribute__ ((alias (#f), visibility ("hidden"))) \
+ attribute_copy (f);
#else
# define HIDDEN
# define NOINLINE
@@ -93,6 +94,31 @@
# define unlikely(x) (x)
#endif
+/* Return ptr but hide its value from the compiler so accesses through it
+ cannot be optimized based on the contents. */
+#define ptr_barrier(ptr) \
+ ({ \
+ __typeof (ptr) __ptr = (ptr); \
+ __asm("" : "+r"(__ptr)); \
+ __ptr; \
+ })
+
+/* Symbol renames to avoid libc conflicts. */
+#define __math_oflowf arm_math_oflowf
+#define __math_uflowf arm_math_uflowf
+#define __math_may_uflowf arm_math_may_uflowf
+#define __math_divzerof arm_math_divzerof
+#define __math_oflow arm_math_oflow
+#define __math_uflow arm_math_uflow
+#define __math_may_uflow arm_math_may_uflow
+#define __math_divzero arm_math_divzero
+#define __math_invalidf arm_math_invalidf
+#define __math_invalid arm_math_invalid
+#define __math_check_oflow arm_math_check_oflow
+#define __math_check_uflow arm_math_check_uflow
+#define __math_check_oflowf arm_math_check_oflowf
+#define __math_check_uflowf arm_math_check_uflowf
+
#if HAVE_FAST_ROUND
/* When set, the roundtoint and converttoint functions are provided with
the semantics documented below. */
@@ -128,7 +154,7 @@ asuint (float f)
{
float f;
uint32_t i;
- } u = {f};
+ } u = { f };
return u.i;
}
@@ -139,7 +165,7 @@ asfloat (uint32_t i)
{
uint32_t i;
float f;
- } u = {i};
+ } u = { i };
return u.f;
}
@@ -150,7 +176,7 @@ asuint64 (double f)
{
double f;
uint64_t i;
- } u = {f};
+ } u = { f };
return u.i;
}
@@ -161,7 +187,7 @@ asdouble (uint64_t i)
{
uint64_t i;
double f;
- } u = {i};
+ } u = { i };
return u.f;
}
@@ -320,10 +346,26 @@ check_uflowf (float x)
extern const struct erff_data
{
- float erff_poly_A[6];
- float erff_poly_B[7];
+ struct
+ {
+ float erf, scale;
+ } tab[513];
} __erff_data HIDDEN;
+extern const struct sv_erff_data
+{
+ float erf[513];
+ float scale[513];
+} __sv_erff_data HIDDEN;
+
+extern const struct erfcf_data
+{
+ struct
+ {
+ float erfc, scale;
+ } tab[645];
+} __erfcf_data HIDDEN;
+
/* Data for logf and log10f. */
#define LOGF_TABLE_BITS 4
#define LOGF_POLY_ORDER 4
@@ -349,9 +391,15 @@ extern const struct log10_data
double invln10;
double poly[LOG10_POLY_ORDER - 1]; /* First coefficient is 1/log(10). */
double poly1[LOG10_POLY1_ORDER - 1];
- struct {double invc, logc;} tab[1 << LOG10_TABLE_BITS];
+ struct
+ {
+ double invc, logc;
+ } tab[1 << LOG10_TABLE_BITS];
#if !HAVE_FAST_FMA
- struct {double chi, clo;} tab2[1 << LOG10_TABLE_BITS];
+ struct
+ {
+ double chi, clo;
+ } tab2[1 << LOG10_TABLE_BITS];
#endif
} __log10_data HIDDEN;
@@ -374,44 +422,38 @@ extern const struct exp_data
double poly[4]; /* Last four coefficients. */
double exp2_shift;
double exp2_poly[EXP2_POLY_ORDER];
- uint64_t tab[2*(1 << EXP_TABLE_BITS)];
+ uint64_t tab[2 * (1 << EXP_TABLE_BITS)];
} __exp_data HIDDEN;
-#define ERFC_NUM_INTERVALS 20
-#define ERFC_POLY_ORDER 12
-extern const struct erfc_data
-{
- double interval_bounds[ERFC_NUM_INTERVALS + 1];
- double poly[ERFC_NUM_INTERVALS][ERFC_POLY_ORDER + 1];
-} __erfc_data HIDDEN;
-extern const struct v_erfc_data
-{
- double interval_bounds[ERFC_NUM_INTERVALS + 1];
- double poly[ERFC_NUM_INTERVALS + 1][ERFC_POLY_ORDER + 1];
-} __v_erfc_data HIDDEN;
-
-#define ERFCF_POLY_NCOEFFS 16
-extern const struct erfcf_poly_data
-{
- double poly[4][ERFCF_POLY_NCOEFFS];
-} __erfcf_poly_data HIDDEN;
-
+/* Copied from math/v_exp.h for use in vector exp_tail. */
#define V_EXP_TAIL_TABLE_BITS 8
extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] HIDDEN;
-#define V_ERF_NINTS 49
-#define V_ERF_NCOEFFS 10
-extern const struct v_erf_data
+/* Copied from math/v_exp.h for use in vector exp2. */
+#define V_EXP_TABLE_BITS 7
+extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] HIDDEN;
+
+extern const struct erf_data
+{
+ struct
+ {
+ double erf, scale;
+ } tab[769];
+} __erf_data HIDDEN;
+
+extern const struct sv_erf_data
{
- double shifts[V_ERF_NINTS];
- double coeffs[V_ERF_NCOEFFS][V_ERF_NINTS];
-} __v_erf_data HIDDEN;
+ double erf[769];
+ double scale[769];
+} __sv_erf_data HIDDEN;
-#define V_ERFF_NCOEFFS 7
-extern const struct v_erff_data
+extern const struct erfc_data
{
- float coeffs[V_ERFF_NCOEFFS][2];
-} __v_erff_data HIDDEN;
+ struct
+ {
+ double erfc, scale;
+ } tab[3488];
+} __erfc_data HIDDEN;
#define ATAN_POLY_NCOEFFS 20
extern const struct atan_poly_data
@@ -465,7 +507,6 @@ extern const struct log1p_data
} __log1p_data HIDDEN;
#define LOG1PF_2U5
-#define V_LOG1PF_2U5
#define LOG1PF_NCOEFFS 9
extern const struct log1pf_data
{
@@ -481,61 +522,52 @@ extern const struct tanf_poly_data
float poly_cotan[TANF_Q_POLY_NCOEFFS];
} __tanf_poly_data HIDDEN;
-#define V_LOG2F_POLY_NCOEFFS 9
-extern const struct v_log2f_data
-{
- float poly[V_LOG2F_POLY_NCOEFFS];
-} __v_log2f_data HIDDEN;
-
#define V_LOG2_TABLE_BITS 7
-#define V_LOG2_POLY_ORDER 6
extern const struct v_log2_data
{
- double poly[V_LOG2_POLY_ORDER - 1];
+ double poly[5];
+ double invln2;
struct
{
double invc, log2c;
- } tab[1 << V_LOG2_TABLE_BITS];
+ } table[1 << V_LOG2_TABLE_BITS];
} __v_log2_data HIDDEN;
-#define V_SINF_NCOEFFS 4
-extern const struct sv_sinf_data
-{
- float coeffs[V_SINF_NCOEFFS];
-} __sv_sinf_data HIDDEN;
-
#define V_LOG10_TABLE_BITS 7
-#define V_LOG10_POLY_ORDER 6
extern const struct v_log10_data
{
+ double poly[5];
+ double invln10, log10_2;
struct
{
double invc, log10c;
- } tab[1 << V_LOG10_TABLE_BITS];
- double poly[V_LOG10_POLY_ORDER - 1];
- double invln10, log10_2;
+ } table[1 << V_LOG10_TABLE_BITS];
} __v_log10_data HIDDEN;
-#define V_LOG10F_POLY_ORDER 9
-extern const float __v_log10f_poly[V_LOG10F_POLY_ORDER - 1] HIDDEN;
-
-#define SV_LOGF_POLY_ORDER 8
-extern const float __sv_logf_poly[SV_LOGF_POLY_ORDER - 1] HIDDEN;
-
-#define SV_LOG_POLY_ORDER 6
-#define SV_LOG_TABLE_BITS 7
-extern const struct sv_log_data
+/* Some data for SVE powf's internal exp and log. */
+#define V_POWF_EXP2_TABLE_BITS 5
+#define V_POWF_EXP2_N (1 << V_POWF_EXP2_TABLE_BITS)
+#define V_POWF_LOG2_TABLE_BITS 5
+#define V_POWF_LOG2_N (1 << V_POWF_LOG2_TABLE_BITS)
+extern const struct v_powf_data
{
- double invc[1 << SV_LOG_TABLE_BITS];
- double logc[1 << SV_LOG_TABLE_BITS];
- double poly[SV_LOG_POLY_ORDER - 1];
-} __sv_log_data HIDDEN;
+ double invc[V_POWF_LOG2_N];
+ double logc[V_POWF_LOG2_N];
+ uint64_t scale[V_POWF_EXP2_N];
+} __v_powf_data HIDDEN;
-#ifndef SV_EXPF_USE_FEXPA
-#define SV_EXPF_USE_FEXPA 0
-#endif
-#define SV_EXPF_POLY_ORDER 6
-extern const float __sv_expf_poly[SV_EXPF_POLY_ORDER - 1] HIDDEN;
+#define V_LOG_POLY_ORDER 6
+#define V_LOG_TABLE_BITS 7
+extern const struct v_log_data
+{
+ /* Shared data for vector log and log-derived routines (e.g. asinh). */
+ double poly[V_LOG_POLY_ORDER - 1];
+ double ln2;
+ struct
+ {
+ double invc, logc;
+ } table[1 << V_LOG_TABLE_BITS];
+} __v_log_data HIDDEN;
#define EXPM1F_POLY_ORDER 5
extern const float __expm1f_poly[EXPM1F_POLY_ORDER] HIDDEN;
@@ -564,9 +596,29 @@ extern const struct cbrt_data
double table[5];
} __cbrt_data HIDDEN;
-extern const struct v_tan_data
+#define ASINF_POLY_ORDER 4
+extern const float __asinf_poly[ASINF_POLY_ORDER + 1] HIDDEN;
+
+#define ASIN_POLY_ORDER 11
+extern const double __asin_poly[ASIN_POLY_ORDER + 1] HIDDEN;
+
+/* Some data for AdvSIMD and SVE pow's internal exp and log. */
+#define V_POW_EXP_TABLE_BITS 8
+extern const struct v_pow_exp_data
{
- double neg_half_pi_hi, neg_half_pi_lo;
- double poly[9];
-} __v_tan_data HIDDEN;
+ double poly[3];
+ double n_over_ln2, ln2_over_n_hi, ln2_over_n_lo, shift;
+ uint64_t sbits[1 << V_POW_EXP_TABLE_BITS];
+} __v_pow_exp_data HIDDEN;
+
+#define V_POW_LOG_TABLE_BITS 7
+extern const struct v_pow_log_data
+{
+ double poly[7]; /* First coefficient is 1. */
+ double ln2_hi, ln2_lo;
+ double invc[1 << V_POW_LOG_TABLE_BITS];
+ double logc[1 << V_POW_LOG_TABLE_BITS];
+ double logctail[1 << V_POW_LOG_TABLE_BITS];
+} __v_pow_log_data HIDDEN;
+
#endif
diff --git a/pl/math/math_err.c b/pl/math/math_err.c
index d246a89982de..74db54a5b2cd 100644
--- a/pl/math/math_err.c
+++ b/pl/math/math_err.c
@@ -8,7 +8,7 @@
#include "math_config.h"
#if WANT_ERRNO
-#include <errno.h>
+# include <errno.h>
/* NOINLINE reduces code size and avoids making math functions non-leaf
when the error handling is inlined. */
NOINLINE static double
@@ -18,7 +18,7 @@ with_errno (double y, int e)
return y;
}
#else
-#define with_errno(x, e) (x)
+# define with_errno(x, e) (x)
#endif
/* NOINLINE reduces code size. */
diff --git a/pl/math/math_errf.c b/pl/math/math_errf.c
index 96271ff18bc1..2b8c6bd25753 100644
--- a/pl/math/math_errf.c
+++ b/pl/math/math_errf.c
@@ -8,7 +8,7 @@
#include "math_config.h"
#if WANT_ERRNO
-#include <errno.h>
+# include <errno.h>
/* NOINLINE reduces code size and avoids making math functions non-leaf
when the error handling is inlined. */
NOINLINE static float
@@ -18,7 +18,7 @@ with_errnof (float y, int e)
return y;
}
#else
-#define with_errnof(x, e) (x)
+# define with_errnof(x, e) (x)
#endif
/* NOINLINE reduces code size. */
diff --git a/pl/math/pairwise_horner.h b/pl/math/pairwise_horner.h
deleted file mode 100644
index 6ad98dccd6aa..000000000000
--- a/pl/math/pairwise_horner.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Helper macros for double-precision pairwise Horner polynomial evaluation.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#if V_SUPPORTED
-#define FMA v_fma_f64
-#else
-#define FMA fma
-#endif
-
-#include "pairwise_horner_wrap.h"
diff --git a/pl/math/pairwise_horner_wrap.h b/pl/math/pairwise_horner_wrap.h
deleted file mode 100644
index e56f059514ad..000000000000
--- a/pl/math/pairwise_horner_wrap.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Helper macros for pairwise Horner polynomial evaluation.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-// clang-format off
-#define PW_HORNER_1_(x, c, i) FMA(x, c(i + 1), c(i))
-#define PW_HORNER_3_(x, x2, c, i) FMA(x2, PW_HORNER_1_ (x, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_5_(x, x2, c, i) FMA(x2, PW_HORNER_3_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_7_(x, x2, c, i) FMA(x2, PW_HORNER_5_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_9_(x, x2, c, i) FMA(x2, PW_HORNER_7_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_11_(x, x2, c, i) FMA(x2, PW_HORNER_9_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_13_(x, x2, c, i) FMA(x2, PW_HORNER_11_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_15_(x, x2, c, i) FMA(x2, PW_HORNER_13_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_17_(x, x2, c, i) FMA(x2, PW_HORNER_15_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-
-#define PAIRWISE_HORNER_1(x, c) PW_HORNER_1_ (x, c, 0)
-#define PAIRWISE_HORNER_3(x, x2, c) PW_HORNER_3_ (x, x2, c, 0)
-#define PAIRWISE_HORNER_5(x, x2, c) PW_HORNER_5_ (x, x2, c, 0)
-#define PAIRWISE_HORNER_7(x, x2, c) PW_HORNER_7_ (x, x2, c, 0)
-#define PAIRWISE_HORNER_9(x, x2, c) PW_HORNER_9_ (x, x2, c, 0)
-#define PAIRWISE_HORNER_11(x, x2, c) PW_HORNER_11_(x, x2, c, 0)
-#define PAIRWISE_HORNER_13(x, x2, c) PW_HORNER_13_(x, x2, c, 0)
-#define PAIRWISE_HORNER_15(x, x2, c) PW_HORNER_15_(x, x2, c, 0)
-#define PAIRWISE_HORNER_17(x, x2, c) PW_HORNER_17_(x, x2, c, 0)
-
-#define PW_HORNER_2_(x, x2, c, i) FMA(x2, c(i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_4_(x, x2, c, i) FMA(x2, PW_HORNER_2_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_6_(x, x2, c, i) FMA(x2, PW_HORNER_4_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_8_(x, x2, c, i) FMA(x2, PW_HORNER_6_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_10_(x, x2, c, i) FMA(x2, PW_HORNER_8_ (x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_12_(x, x2, c, i) FMA(x2, PW_HORNER_10_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_14_(x, x2, c, i) FMA(x2, PW_HORNER_12_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_16_(x, x2, c, i) FMA(x2, PW_HORNER_14_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-#define PW_HORNER_18_(x, x2, c, i) FMA(x2, PW_HORNER_16_(x, x2, c, i + 2), PW_HORNER_1_(x, c, i))
-
-#define PAIRWISE_HORNER_2(x, x2, c) PW_HORNER_2_ (x, x2, c, 0)
-#define PAIRWISE_HORNER_4(x, x2, c) PW_HORNER_4_ (x, x2, c, 0)
-#define PAIRWISE_HORNER_6(x, x2, c) PW_HORNER_6_ (x, x2, c, 0)
-#define PAIRWISE_HORNER_8(x, x2, c) PW_HORNER_8_(x, x2, c, 0)
-#define PAIRWISE_HORNER_10(x, x2, c) PW_HORNER_10_(x, x2, c, 0)
-#define PAIRWISE_HORNER_12(x, x2, c) PW_HORNER_12_(x, x2, c, 0)
-#define PAIRWISE_HORNER_14(x, x2, c) PW_HORNER_14_(x, x2, c, 0)
-#define PAIRWISE_HORNER_16(x, x2, c) PW_HORNER_16_(x, x2, c, 0)
-#define PAIRWISE_HORNER_18(x, x2, c) PW_HORNER_18_(x, x2, c, 0)
-// clang-format on
diff --git a/pl/math/pairwise_hornerf.h b/pl/math/pairwise_hornerf.h
deleted file mode 100644
index 784750cde0b6..000000000000
--- a/pl/math/pairwise_hornerf.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Helper macros for single-precision pairwise Horner polynomial evaluation.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#if V_SUPPORTED
-#define FMA v_fma_f32
-#else
-#define FMA fmaf
-#endif
-
-#include "pairwise_horner_wrap.h"
diff --git a/pl/math/pl_sig.h b/pl/math/pl_sig.h
index 686d24f0d9a5..52d988f0e1ce 100644
--- a/pl/math/pl_sig.h
+++ b/pl/math/pl_sig.h
@@ -4,35 +4,51 @@
* Copyright (c) 2022-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception.
*/
+
+#define V_NAME_F1(fun) _ZGVnN4v_##fun##f
+#define V_NAME_D1(fun) _ZGVnN2v_##fun
+#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f
+#define V_NAME_D2(fun) _ZGVnN2vv_##fun
+
+#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f
+#define SV_NAME_D1(fun) _ZGVsMxv_##fun
+#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f
+#define SV_NAME_D2(fun) _ZGVsMxvv_##fun
+
#define PL_DECL_SF1(fun) float fun##f (float);
#define PL_DECL_SF2(fun) float fun##f (float, float);
#define PL_DECL_SD1(fun) double fun (double);
#define PL_DECL_SD2(fun) double fun (double, double);
-#if V_SUPPORTED
-#define PL_DECL_VF1(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t);
-#define PL_DECL_VF2(fun) VPCS_ATTR v_f32_t V_NAME (fun##f) (v_f32_t, v_f32_t);
-#define PL_DECL_VD1(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t);
-#define PL_DECL_VD2(fun) VPCS_ATTR v_f64_t V_NAME (fun) (v_f64_t, v_f64_t);
+#if WANT_VMATH
+# define PL_DECL_VF1(fun) \
+ VPCS_ATTR float32x4_t V_NAME_F1 (fun) (float32x4_t); /* V_NAME_F1 appends the f suffix. */
+# define PL_DECL_VF2(fun) \
+ VPCS_ATTR float32x4_t V_NAME_F2 (fun) (float32x4_t, float32x4_t); /* V_NAME_F2 appends the f suffix. */
+# define PL_DECL_VD1(fun) VPCS_ATTR float64x2_t V_NAME_D1 (fun) (float64x2_t);
+# define PL_DECL_VD2(fun) \
+ VPCS_ATTR float64x2_t V_NAME_D2 (fun) (float64x2_t, float64x2_t);
#else
-#define PL_DECL_VF1(fun)
-#define PL_DECL_VF2(fun)
-#define PL_DECL_VD1(fun)
-#define PL_DECL_VD2(fun)
+# define PL_DECL_VF1(fun)
+# define PL_DECL_VF2(fun)
+# define PL_DECL_VD1(fun)
+# define PL_DECL_VD2(fun)
#endif
-#if SV_SUPPORTED
-#define PL_DECL_SVF1(fun) sv_f32_t __sv_##fun##f_x (sv_f32_t, svbool_t);
-#define PL_DECL_SVF2(fun) \
- sv_f32_t __sv_##fun##f_x (sv_f32_t, sv_f32_t, svbool_t);
-#define PL_DECL_SVD1(fun) sv_f64_t __sv_##fun##_x (sv_f64_t, svbool_t);
-#define PL_DECL_SVD2(fun) \
- sv_f64_t __sv_##fun##_x (sv_f64_t, sv_f64_t, svbool_t);
+#if WANT_SVE_MATH
+# define PL_DECL_SVF1(fun) \
+ svfloat32_t SV_NAME_F1 (fun) (svfloat32_t, svbool_t);
+# define PL_DECL_SVF2(fun) \
+ svfloat32_t SV_NAME_F2 (fun) (svfloat32_t, svfloat32_t, svbool_t);
+# define PL_DECL_SVD1(fun) \
+ svfloat64_t SV_NAME_D1 (fun) (svfloat64_t, svbool_t);
+# define PL_DECL_SVD2(fun) \
+ svfloat64_t SV_NAME_D2 (fun) (svfloat64_t, svfloat64_t, svbool_t);
#else
-#define PL_DECL_SVF1(fun)
-#define PL_DECL_SVF2(fun)
-#define PL_DECL_SVD1(fun)
-#define PL_DECL_SVD2(fun)
+# define PL_DECL_SVF1(fun)
+# define PL_DECL_SVF2(fun)
+# define PL_DECL_SVD1(fun)
+# define PL_DECL_SVD2(fun)
#endif
/* For building the routines, emit function prototype from PL_SIG. This
diff --git a/pl/math/poly_advsimd_f32.h b/pl/math/poly_advsimd_f32.h
new file mode 100644
index 000000000000..438e153dff90
--- /dev/null
+++ b/pl/math/poly_advsimd_f32.h
@@ -0,0 +1,24 @@
+/*
+ * Helpers for evaluating polynomials on single-precision AdvSIMD input, using
+ * various schemes.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_POLY_ADVSIMD_F32_H
+#define PL_MATH_POLY_ADVSIMD_F32_H
+
+#include <arm_neon.h>
+
+/* Wrap AdvSIMD f32 helpers: evaluation of some scheme/order has form:
+ v_[scheme]_[order]_f32. */
+#define VTYPE float32x4_t /* Vector type of x and of the coefficients. */
+#define FMA(x, y, z) vfmaq_f32 (z, x, y) /* x * y + z; vfmaq takes the addend first. */
+#define VWRAP(f) v_##f##_f32 /* E.g. horner_4 becomes v_horner_4_f32. */
+#include "poly_generic.h" /* Emits the helpers using the three macros above. */
+#undef VWRAP
+#undef FMA
+#undef VTYPE
+
+#endif
diff --git a/pl/math/poly_advsimd_f64.h b/pl/math/poly_advsimd_f64.h
new file mode 100644
index 000000000000..7ea249a91225
--- /dev/null
+++ b/pl/math/poly_advsimd_f64.h
@@ -0,0 +1,24 @@
+/*
+ * Helpers for evaluating polynomials on double-precision AdvSIMD input, using
+ * various schemes.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_POLY_ADVSIMD_F64_H
+#define PL_MATH_POLY_ADVSIMD_F64_H
+
+#include <arm_neon.h>
+
+/* Wrap AdvSIMD f64 helpers: evaluation of some scheme/order has form:
+ v_[scheme]_[order]_f64. */
+#define VTYPE float64x2_t /* Vector type of x and of the coefficients. */
+#define FMA(x, y, z) vfmaq_f64 (z, x, y) /* x * y + z; vfmaq takes the addend first. */
+#define VWRAP(f) v_##f##_f64 /* E.g. horner_4 becomes v_horner_4_f64. */
+#include "poly_generic.h" /* Emits the helpers using the three macros above. */
+#undef VWRAP
+#undef FMA
+#undef VTYPE
+
+#endif
diff --git a/pl/math/poly_generic.h b/pl/math/poly_generic.h
new file mode 100644
index 000000000000..3fc25f8762f2
--- /dev/null
+++ b/pl/math/poly_generic.h
@@ -0,0 +1,277 @@
+/*
+ * Generic helpers for evaluating polynomials with various schemes.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef VTYPE
+# error Cannot use poly_generic without defining VTYPE
+#endif
+#ifndef VWRAP
+# error Cannot use poly_generic without defining VWRAP
+#endif
+#ifndef FMA
+# error Cannot use poly_generic without defining FMA
+#endif
+
+static inline VTYPE VWRAP (pairwise_poly_3) (VTYPE x, VTYPE x2,
+ const VTYPE *poly)
+{ /* Degree-3 evaluation over poly[0..3]; assumes the caller passes x2 = x * x. */
+ /* At order 3, Estrin and Pairwise Horner are identical. */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ VTYPE p23 = FMA (poly[3], x, poly[2]); /* poly[2] + poly[3] * x */
+ return FMA (p23, x2, p01); /* p01 + p23 * x2 */
+}
+
+static inline VTYPE VWRAP (estrin_4) (VTYPE x, VTYPE x2, VTYPE x4,
+ const VTYPE *poly)
+{ /* Degree-4 Estrin scheme over poly[0..4]; assumes x2 = x^2, x4 = x^4. */
+ VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); /* terms 0..3 */
+ return FMA (poly[4], x4, p03); /* p03 + poly[4] * x4 */
+}
+static inline VTYPE VWRAP (estrin_5) (VTYPE x, VTYPE x2, VTYPE x4,
+ const VTYPE *poly)
+{ /* Degree-5 Estrin scheme over poly[0..5]; assumes x2 = x^2, x4 = x^4. */
+ VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); /* terms 0..3 */
+ VTYPE p45 = FMA (poly[5], x, poly[4]); /* poly[4] + poly[5] * x */
+ return FMA (p45, x4, p03); /* p03 + p45 * x4 */
+}
+static inline VTYPE VWRAP (estrin_6) (VTYPE x, VTYPE x2, VTYPE x4,
+ const VTYPE *poly)
+{ /* Degree-6 Estrin scheme over poly[0..6]; assumes x2 = x^2, x4 = x^4. */
+ VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); /* terms 0..3 */
+ VTYPE p45 = FMA (poly[5], x, poly[4]); /* poly[4] + poly[5] * x */
+ VTYPE p46 = FMA (poly[6], x2, p45); /* p45 + poly[6] * x2 */
+ return FMA (p46, x4, p03); /* p03 + p46 * x4 */
+}
+static inline VTYPE VWRAP (estrin_7) (VTYPE x, VTYPE x2, VTYPE x4,
+ const VTYPE *poly)
+{ /* Degree-7 Estrin scheme over poly[0..7]; assumes x2 = x^2, x4 = x^4. */
+ VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); /* terms 0..3 */
+ VTYPE p47 = VWRAP (pairwise_poly_3) (x, x2, poly + 4); /* terms 4..7, not yet scaled by x4 */
+ return FMA (p47, x4, p03); /* p03 + p47 * x4 */
+}
+static inline VTYPE VWRAP (estrin_8) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{ /* Degree-8 Estrin scheme over poly[0..8]; assumes xN = x^N for N = 2, 4, 8. */
+ return FMA (poly[8], x8, VWRAP (estrin_7) (x, x2, x4, poly)); /* terms 0..7 + poly[8] * x8 */
+}
+static inline VTYPE VWRAP (estrin_9) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{ /* Degree-9 Estrin scheme over poly[0..9]; assumes xN = x^N for N = 2, 4, 8. */
+ VTYPE p89 = FMA (poly[9], x, poly[8]); /* poly[8] + poly[9] * x */
+ return FMA (p89, x8, VWRAP (estrin_7) (x, x2, x4, poly)); /* terms 0..7 + p89 * x8 */
+}
+static inline VTYPE VWRAP (estrin_10) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{ /* Degree-10 Estrin scheme over poly[0..10]; assumes xN = x^N for N = 2, 4, 8. */
+ VTYPE p89 = FMA (poly[9], x, poly[8]); /* poly[8] + poly[9] * x */
+ VTYPE p8_10 = FMA (poly[10], x2, p89); /* p89 + poly[10] * x2 */
+ return FMA (p8_10, x8, VWRAP (estrin_7) (x, x2, x4, poly)); /* terms 0..7 + p8_10 * x8 */
+}
+static inline VTYPE VWRAP (estrin_11) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{ /* Degree-11 Estrin scheme over poly[0..11]; assumes xN = x^N for N = 2, 4, 8. */
+ VTYPE p8_11 = VWRAP (pairwise_poly_3) (x, x2, poly + 8); /* terms 8..11, not yet scaled by x8 */
+ return FMA (p8_11, x8, VWRAP (estrin_7) (x, x2, x4, poly)); /* terms 0..7 + p8_11 * x8 */
+}
+static inline VTYPE VWRAP (estrin_12) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{ /* Degree-12 Estrin scheme over poly[0..12]; assumes xN = x^N for N = 2, 4, 8. */
+ return FMA (VWRAP (estrin_4) (x, x2, x4, poly + 8), x8, /* terms 8..12, scaled by x8 */
+ VWRAP (estrin_7) (x, x2, x4, poly)); /* plus terms 0..7 */
+}
+static inline VTYPE VWRAP (estrin_13) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{ /* Degree-13 Estrin scheme over poly[0..13]; assumes xN = x^N for N = 2, 4, 8. */
+ return FMA (VWRAP (estrin_5) (x, x2, x4, poly + 8), x8, /* terms 8..13, scaled by x8 */
+ VWRAP (estrin_7) (x, x2, x4, poly)); /* plus terms 0..7 */
+}
+static inline VTYPE VWRAP (estrin_14) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{ /* Degree-14 Estrin scheme over poly[0..14]; assumes xN = x^N for N = 2, 4, 8. */
+ return FMA (VWRAP (estrin_6) (x, x2, x4, poly + 8), x8, /* terms 8..14, scaled by x8 */
+ VWRAP (estrin_7) (x, x2, x4, poly)); /* plus terms 0..7 */
+}
+static inline VTYPE VWRAP (estrin_15) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{ /* Degree-15 Estrin scheme over poly[0..15]; assumes xN = x^N for N = 2, 4, 8. */
+ return FMA (VWRAP (estrin_7) (x, x2, x4, poly + 8), x8, /* terms 8..15, scaled by x8 */
+ VWRAP (estrin_7) (x, x2, x4, poly)); /* plus terms 0..7 */
+}
+static inline VTYPE VWRAP (estrin_16) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ VTYPE x16, const VTYPE *poly)
+{ /* Degree-16 Estrin scheme over poly[0..16]; assumes xN = x^N for N = 2, 4, 8, 16. */
+ return FMA (poly[16], x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); /* terms 0..15 + poly[16] * x16 */
+}
+static inline VTYPE VWRAP (estrin_17) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ VTYPE x16, const VTYPE *poly)
+{ /* Degree-17 Estrin scheme over poly[0..17]; assumes xN = x^N for N = 2, 4, 8, 16. */
+ VTYPE p16_17 = FMA (poly[17], x, poly[16]); /* poly[16] + poly[17] * x */
+ return FMA (p16_17, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); /* terms 0..15 + p16_17 * x16 */
+}
+static inline VTYPE VWRAP (estrin_18) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ VTYPE x16, const VTYPE *poly)
+{ /* Degree-18 Estrin scheme over poly[0..18]; assumes xN = x^N for N = 2, 4, 8, 16. */
+ VTYPE p16_17 = FMA (poly[17], x, poly[16]); /* poly[16] + poly[17] * x */
+ VTYPE p16_18 = FMA (poly[18], x2, p16_17); /* p16_17 + poly[18] * x2 */
+ return FMA (p16_18, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); /* terms 0..15 + p16_18 * x16 */
+}
+static inline VTYPE VWRAP (estrin_19) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ VTYPE x16, const VTYPE *poly)
+{ /* Degree-19 Estrin scheme over poly[0..19]; assumes xN = x^N for N = 2, 4, 8, 16. */
+ VTYPE p16_19 = VWRAP (pairwise_poly_3) (x, x2, poly + 16); /* terms 16..19, not yet scaled by x16 */
+ return FMA (p16_19, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); /* terms 0..15 + p16_19 * x16 */
+}
+
+static inline VTYPE VWRAP (horner_2) (VTYPE x, const VTYPE *poly)
+{ /* Degree-2 Horner evaluation over poly[0..2]. */
+ VTYPE p = FMA (poly[2], x, poly[1]); /* poly[1] + poly[2] * x */
+ return FMA (x, p, poly[0]); /* poly[0] + x * p */
+}
+static inline VTYPE VWRAP (horner_3) (VTYPE x, const VTYPE *poly)
+{ /* Degree-3 Horner evaluation over poly[0..3], highest coefficient first. */
+ VTYPE p = FMA (poly[3], x, poly[2]); /* poly[2] + poly[3] * x */
+ p = FMA (x, p, poly[1]); /* fold in poly[1] */
+ p = FMA (x, p, poly[0]); /* fold in poly[0] */
+ return p;
+}
+static inline VTYPE VWRAP (horner_4) (VTYPE x, const VTYPE *poly)
+{ /* Degree-4 Horner evaluation over poly[0..4], highest coefficient first. */
+ VTYPE p = FMA (poly[4], x, poly[3]); /* poly[3] + poly[4] * x */
+ p = FMA (x, p, poly[2]); /* fold in poly[2] */
+ p = FMA (x, p, poly[1]); /* fold in poly[1] */
+ p = FMA (x, p, poly[0]); /* fold in poly[0] */
+ return p;
+}
+static inline VTYPE VWRAP (horner_5) (VTYPE x, const VTYPE *poly)
+{ /* Degree-5 Horner over poly[0..5], built on horner_4 of poly[1..5]. */
+ return FMA (x, VWRAP (horner_4) (x, poly + 1), poly[0]); /* poly[0] + x * inner */
+}
+static inline VTYPE VWRAP (horner_6) (VTYPE x, const VTYPE *poly)
+{ /* Degree-6 Horner over poly[0..6], built on horner_5 of poly[1..6]. */
+ return FMA (x, VWRAP (horner_5) (x, poly + 1), poly[0]); /* poly[0] + x * inner */
+}
+static inline VTYPE VWRAP (horner_7) (VTYPE x, const VTYPE *poly)
+{ /* Degree-7 Horner over poly[0..7], built on horner_6 of poly[1..7]. */
+ return FMA (x, VWRAP (horner_6) (x, poly + 1), poly[0]); /* poly[0] + x * inner */
+}
+static inline VTYPE VWRAP (horner_8) (VTYPE x, const VTYPE *poly)
+{ /* Degree-8 Horner over poly[0..8], built on horner_7 of poly[1..8]. */
+ return FMA (x, VWRAP (horner_7) (x, poly + 1), poly[0]); /* poly[0] + x * inner */
+}
+static inline VTYPE VWRAP (horner_9) (VTYPE x, const VTYPE *poly)
+{ /* Degree-9 Horner over poly[0..9], built on horner_8 of poly[1..9]. */
+ return FMA (x, VWRAP (horner_8) (x, poly + 1), poly[0]); /* poly[0] + x * inner */
+}
+static inline VTYPE VWRAP (horner_10) (VTYPE x, const VTYPE *poly)
+{ /* Degree-10 Horner over poly[0..10], built on horner_9 of poly[1..10]. */
+ return FMA (x, VWRAP (horner_9) (x, poly + 1), poly[0]); /* poly[0] + x * inner */
+}
+static inline VTYPE VWRAP (horner_11) (VTYPE x, const VTYPE *poly)
+{ /* Degree-11 Horner over poly[0..11], built on horner_10 of poly[1..11]. */
+ return FMA (x, VWRAP (horner_10) (x, poly + 1), poly[0]); /* poly[0] + x * inner */
+}
+static inline VTYPE VWRAP (horner_12) (VTYPE x, const VTYPE *poly)
+{ /* Degree-12 Horner over poly[0..12], built on horner_11 of poly[1..12]. */
+ return FMA (x, VWRAP (horner_11) (x, poly + 1), poly[0]); /* poly[0] + x * inner */
+}
+
+static inline VTYPE VWRAP (pw_horner_4) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-4 pairwise Horner over poly[0..4]; assumes the caller passes x2 = x * x. */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ VTYPE p23 = FMA (poly[3], x, poly[2]); /* poly[2] + poly[3] * x */
+ VTYPE p;
+ p = FMA (x2, poly[4], p23); /* p23 + x2 * poly[4] */
+ p = FMA (x2, p, p01); /* p01 + x2 * p */
+ return p;
+}
+static inline VTYPE VWRAP (pw_horner_5) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-5 pairwise Horner over poly[0..5]; assumes the caller passes x2 = x * x. */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ VTYPE p23 = FMA (poly[3], x, poly[2]); /* poly[2] + poly[3] * x */
+ VTYPE p45 = FMA (poly[5], x, poly[4]); /* poly[4] + poly[5] * x */
+ VTYPE p;
+ p = FMA (x2, p45, p23); /* p23 + x2 * p45 */
+ p = FMA (x2, p, p01); /* p01 + x2 * p */
+ return p;
+}
+static inline VTYPE VWRAP (pw_horner_6) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-6 pairwise Horner over poly[0..6]; assumes the caller passes x2 = x * x. */
+ VTYPE p26 = VWRAP (pw_horner_4) (x, x2, poly + 2); /* terms 2..6, not yet scaled by x2 */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ return FMA (x2, p26, p01); /* p01 + x2 * p26 */
+}
+static inline VTYPE VWRAP (pw_horner_7) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-7 pairwise Horner over poly[0..7]; assumes the caller passes x2 = x * x. */
+ VTYPE p27 = VWRAP (pw_horner_5) (x, x2, poly + 2); /* terms 2..7, not yet scaled by x2 */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ return FMA (x2, p27, p01); /* p01 + x2 * p27 */
+}
+static inline VTYPE VWRAP (pw_horner_8) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-8 pairwise Horner over poly[0..8]; assumes the caller passes x2 = x * x. */
+ VTYPE p28 = VWRAP (pw_horner_6) (x, x2, poly + 2); /* terms 2..8, not yet scaled by x2 */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ return FMA (x2, p28, p01); /* p01 + x2 * p28 */
+}
+static inline VTYPE VWRAP (pw_horner_9) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-9 pairwise Horner over poly[0..9]; assumes the caller passes x2 = x * x. */
+ VTYPE p29 = VWRAP (pw_horner_7) (x, x2, poly + 2); /* terms 2..9, not yet scaled by x2 */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ return FMA (x2, p29, p01); /* p01 + x2 * p29 */
+}
+static inline VTYPE VWRAP (pw_horner_10) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-10 pairwise Horner over poly[0..10]; assumes the caller passes x2 = x * x. */
+ VTYPE p2_10 = VWRAP (pw_horner_8) (x, x2, poly + 2); /* terms 2..10, not yet scaled by x2 */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ return FMA (x2, p2_10, p01); /* p01 + x2 * p2_10 */
+}
+static inline VTYPE VWRAP (pw_horner_11) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-11 pairwise Horner over poly[0..11]; assumes the caller passes x2 = x * x. */
+ VTYPE p2_11 = VWRAP (pw_horner_9) (x, x2, poly + 2); /* terms 2..11, not yet scaled by x2 */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ return FMA (x2, p2_11, p01); /* p01 + x2 * p2_11 */
+}
+static inline VTYPE VWRAP (pw_horner_12) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-12 pairwise Horner over poly[0..12]; assumes the caller passes x2 = x * x. */
+ VTYPE p2_12 = VWRAP (pw_horner_10) (x, x2, poly + 2); /* terms 2..12, not yet scaled by x2 */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ return FMA (x2, p2_12, p01); /* p01 + x2 * p2_12 */
+}
+static inline VTYPE VWRAP (pw_horner_13) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-13 pairwise Horner over poly[0..13]; assumes the caller passes x2 = x * x. */
+ VTYPE p2_13 = VWRAP (pw_horner_11) (x, x2, poly + 2); /* terms 2..13, not yet scaled by x2 */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ return FMA (x2, p2_13, p01); /* p01 + x2 * p2_13 */
+}
+static inline VTYPE VWRAP (pw_horner_14) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-14 pairwise Horner over poly[0..14]; assumes the caller passes x2 = x * x. */
+ VTYPE p2_14 = VWRAP (pw_horner_12) (x, x2, poly + 2); /* terms 2..14, not yet scaled by x2 */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ return FMA (x2, p2_14, p01); /* p01 + x2 * p2_14 */
+}
+static inline VTYPE VWRAP (pw_horner_15) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-15 pairwise Horner over poly[0..15]; assumes the caller passes x2 = x * x. */
+ VTYPE p2_15 = VWRAP (pw_horner_13) (x, x2, poly + 2); /* terms 2..15, not yet scaled by x2 */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ return FMA (x2, p2_15, p01); /* p01 + x2 * p2_15 */
+}
+static inline VTYPE VWRAP (pw_horner_16) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-16 pairwise Horner over poly[0..16]; assumes the caller passes x2 = x * x. */
+ VTYPE p2_16 = VWRAP (pw_horner_14) (x, x2, poly + 2); /* terms 2..16, not yet scaled by x2 */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ return FMA (x2, p2_16, p01); /* p01 + x2 * p2_16 */
+}
+static inline VTYPE VWRAP (pw_horner_17) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-17 pairwise Horner over poly[0..17]; assumes the caller passes x2 = x * x. */
+ VTYPE p2_17 = VWRAP (pw_horner_15) (x, x2, poly + 2); /* terms 2..17, not yet scaled by x2 */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ return FMA (x2, p2_17, p01); /* p01 + x2 * p2_17 */
+}
+static inline VTYPE VWRAP (pw_horner_18) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{ /* Degree-18 pairwise Horner over poly[0..18]; assumes the caller passes x2 = x * x. */
+ VTYPE p2_18 = VWRAP (pw_horner_16) (x, x2, poly + 2); /* terms 2..18, not yet scaled by x2 */
+ VTYPE p01 = FMA (poly[1], x, poly[0]); /* poly[0] + poly[1] * x */
+ return FMA (x2, p2_18, p01); /* p01 + x2 * p2_18 */
+}
diff --git a/pl/math/poly_scalar_f32.h b/pl/math/poly_scalar_f32.h
new file mode 100644
index 000000000000..a9b1c5544494
--- /dev/null
+++ b/pl/math/poly_scalar_f32.h
@@ -0,0 +1,24 @@
+/*
+ * Helpers for evaluating polynomials on single-precision scalar input, using
+ * various schemes.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_POLY_SCALAR_F32_H
+#define PL_MATH_POLY_SCALAR_F32_H
+
+#include <math.h>
+
+/* Wrap scalar f32 helpers: evaluation of some scheme/order has form:
+ [scheme]_[order]_f32. */
+#define VTYPE float /* Type of x and of the coefficients. */
+#define FMA fmaf /* FMA (x, y, z) is x * y + z. */
+#define VWRAP(f) f##_f32 /* E.g. horner_4 becomes horner_4_f32. */
+#include "poly_generic.h" /* Emits the helpers using the three macros above. */
+#undef VWRAP
+#undef FMA
+#undef VTYPE
+
+#endif
diff --git a/pl/math/poly_scalar_f64.h b/pl/math/poly_scalar_f64.h
new file mode 100644
index 000000000000..207dccee30ad
--- /dev/null
+++ b/pl/math/poly_scalar_f64.h
@@ -0,0 +1,24 @@
+/*
+ * Helpers for evaluating polynomials on double-precision scalar input, using
+ * various schemes.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_POLY_SCALAR_F64_H
+#define PL_MATH_POLY_SCALAR_F64_H
+
+#include <math.h>
+
+/* Wrap scalar f64 helpers: evaluation of some scheme/order has form:
+ [scheme]_[order]_f64. */
+#define VTYPE double /* Type of x and of the coefficients. */
+#define FMA fma /* FMA (x, y, z) is x * y + z. */
+#define VWRAP(f) f##_f64 /* E.g. horner_4 becomes horner_4_f64. */
+#include "poly_generic.h" /* Emits the helpers using the three macros above. */
+#undef VWRAP
+#undef FMA
+#undef VTYPE
+
+#endif
diff --git a/pl/math/poly_sve_f32.h b/pl/math/poly_sve_f32.h
new file mode 100644
index 000000000000..a97e2ced027a
--- /dev/null
+++ b/pl/math/poly_sve_f32.h
@@ -0,0 +1,26 @@
+/*
+ * Helpers for evaluating polynomials on single-precision SVE input, using
+ * various schemes.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_POLY_SVE_F32_H
+#define PL_MATH_POLY_SVE_F32_H
+
+#include <arm_sve.h>
+
+/* Wrap SVE f32 helpers: evaluation of some scheme/order has form:
+ sv_[scheme]_[order]_f32_x. */
+#define VTYPE svfloat32_t /* Vector type of x and of the results. */
+#define STYPE float /* Scalar type of the coefficient array. */
+#define VWRAP(f) sv_##f##_f32_x /* E.g. estrin_4 becomes sv_estrin_4_f32_x. */
+#define DUP svdup_f32 /* Turns one scalar coefficient into a VTYPE value. */
+#include "poly_sve_generic.h" /* Emits the helpers using the four macros above. */
+#undef DUP
+#undef VWRAP
+#undef STYPE
+#undef VTYPE
+
+#endif
diff --git a/pl/math/poly_sve_f64.h b/pl/math/poly_sve_f64.h
new file mode 100644
index 000000000000..5fb14b3c1700
--- /dev/null
+++ b/pl/math/poly_sve_f64.h
@@ -0,0 +1,26 @@
+/*
+ * Helpers for evaluating polynomials on double-precision SVE input, using
+ * various schemes.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_POLY_SVE_F64_H
+#define PL_MATH_POLY_SVE_F64_H
+
+#include <arm_sve.h>
+
+/* Wrap SVE f64 helpers: evaluation of some scheme/order has form:
+ sv_[scheme]_[order]_f64_x. */
+#define VTYPE svfloat64_t /* Vector type of x and of the results. */
+#define STYPE double /* Scalar type of the coefficient array. */
+#define VWRAP(f) sv_##f##_f64_x /* E.g. estrin_4 becomes sv_estrin_4_f64_x. */
+#define DUP svdup_f64 /* Turns one scalar coefficient into a VTYPE value. */
+#include "poly_sve_generic.h" /* Emits the helpers using the four macros above. */
+#undef DUP
+#undef VWRAP
+#undef STYPE
+#undef VTYPE
+
+#endif
diff --git a/pl/math/poly_sve_generic.h b/pl/math/poly_sve_generic.h
new file mode 100644
index 000000000000..b568e4cddff3
--- /dev/null
+++ b/pl/math/poly_sve_generic.h
@@ -0,0 +1,301 @@
+/*
+ * Helpers for evaluating polynomials with various schemes - specific to SVE
+ * but precision-agnostic.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef VTYPE
+# error Cannot use poly_generic without defining VTYPE
+#endif
+#ifndef STYPE
+# error Cannot use poly_generic without defining STYPE
+#endif
+#ifndef VWRAP
+# error Cannot use poly_generic without defining VWRAP
+#endif
+#ifndef DUP
+# error Cannot use poly_generic without defining DUP
+#endif
+
+/* Evaluate the cubic with coefficients poly[0..3] as
+   (poly[0] + x * poly[1]) + x2 * (poly[2] + x * poly[3]).
+   x2 is supplied by the caller and is expected to hold x * x.  */
+static inline VTYPE VWRAP (pairwise_poly_3) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ /* At order 3, Estrin and Pairwise Horner are identical. */
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
+ return svmla_x (pg, p01, p23, x2);
+}
+
+/* Order-4 Estrin scheme: the cubic on poly[0..3] plus x4 * poly[4].
+   x2 and x4 come from the caller and are expected to hold x^2 and x^4.  */
+static inline VTYPE VWRAP (estrin_4) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
+ const STYPE *poly)
+{
+ VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
+ return svmla_x (pg, p03, x4, poly[4]);
+}
+/* Order-5 Estrin scheme: the cubic on poly[0..3] plus
+   x4 * (poly[4] + x * poly[5]).  x2 and x4 are expected to be x^2 and x^4.  */
+static inline VTYPE VWRAP (estrin_5) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
+ const STYPE *poly)
+{
+ VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
+ VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
+ return svmla_x (pg, p03, p45, x4);
+}
+/* Order-6 Estrin scheme on poly[0..6]:
+     (cubic on poly[0..3]) + x4 * (poly[4] + x * poly[5] + x2 * poly[6]).
+   x2 and x4 come from the caller and are expected to hold x^2 and x^4.  */
+static inline VTYPE VWRAP (estrin_6) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
+                                      const STYPE *poly)
+{
+  VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
+  VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
+  /* poly[6] is the x^6 coefficient, so inside the x4-scaled group it has to
+     be weighted by x2, not x (compare the p8_10 and p16_18 terms of the
+     order-10 and order-18 schemes).  */
+  VTYPE p46 = svmla_x (pg, p45, x2, poly[6]);
+  return svmla_x (pg, p03, p46, x4);
+}
+/* Order-7 Estrin scheme: the cubic on poly[0..3] plus x4 times the cubic on
+   poly[4..7].  x2 and x4 are expected to be x^2 and x^4.  */
+static inline VTYPE VWRAP (estrin_7) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
+ const STYPE *poly)
+{
+ VTYPE p03 = VWRAP (pairwise_poly_3) (pg, x, x2, poly);
+ VTYPE p47 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 4);
+ return svmla_x (pg, p03, p47, x4);
+}
+/* Order-8 Estrin scheme: estrin_7 on poly[0..7] plus x8 * poly[8].
+   x2, x4 and x8 are expected to be x^2, x^4 and x^8.  */
+static inline VTYPE VWRAP (estrin_8) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
+ VTYPE x8, const STYPE *poly)
+{
+ return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), x8, poly[8]);
+}
+/* Order-9 Estrin scheme: estrin_7 on poly[0..7] plus
+   x8 * (poly[8] + x * poly[9]).  x2, x4, x8 are expected to be powers of x.  */
+static inline VTYPE VWRAP (estrin_9) (svbool_t pg, VTYPE x, VTYPE x2, VTYPE x4,
+ VTYPE x8, const STYPE *poly)
+{
+ VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]);
+ return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p89, x8);
+}
+/* Order-10 Estrin scheme: estrin_7 on poly[0..7] plus
+   x8 * (poly[8] + x * poly[9] + x2 * poly[10]).  */
+static inline VTYPE VWRAP (estrin_10) (svbool_t pg, VTYPE x, VTYPE x2,
+ VTYPE x4, VTYPE x8, const STYPE *poly)
+{
+ VTYPE p89 = svmla_x (pg, DUP (poly[8]), x, poly[9]);
+ VTYPE p8_10 = svmla_x (pg, p89, x2, poly[10]);
+ return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_10, x8);
+}
+/* Order-11 Estrin scheme: estrin_7 on poly[0..7] plus x8 times the cubic on
+   poly[8..11].  */
+static inline VTYPE VWRAP (estrin_11) (svbool_t pg, VTYPE x, VTYPE x2,
+ VTYPE x4, VTYPE x8, const STYPE *poly)
+{
+ VTYPE p8_11 = VWRAP (pairwise_poly_3) (pg, x, x2, poly + 8);
+ return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly), p8_11, x8);
+}
+/* Order-12 Estrin scheme: estrin_7 on poly[0..7] plus x8 times estrin_4 on
+   poly[8..12].  */
+static inline VTYPE VWRAP (estrin_12) (svbool_t pg, VTYPE x, VTYPE x2,
+ VTYPE x4, VTYPE x8, const STYPE *poly)
+{
+ return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
+ VWRAP (estrin_4) (pg, x, x2, x4, poly + 8), x8);
+}
+/* Order-13 Estrin scheme: estrin_7 on poly[0..7] plus x8 times estrin_5 on
+   poly[8..13].  */
+static inline VTYPE VWRAP (estrin_13) (svbool_t pg, VTYPE x, VTYPE x2,
+ VTYPE x4, VTYPE x8, const STYPE *poly)
+{
+ return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
+ VWRAP (estrin_5) (pg, x, x2, x4, poly + 8), x8);
+}
+/* Order-14 Estrin scheme: estrin_7 on poly[0..7] plus x8 times estrin_6 on
+   poly[8..14].  */
+static inline VTYPE VWRAP (estrin_14) (svbool_t pg, VTYPE x, VTYPE x2,
+ VTYPE x4, VTYPE x8, const STYPE *poly)
+{
+ return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
+ VWRAP (estrin_6) (pg, x, x2, x4, poly + 8), x8);
+}
+/* Order-15 Estrin scheme: estrin_7 on poly[0..7] plus x8 times estrin_7 on
+   poly[8..15].  */
+static inline VTYPE VWRAP (estrin_15) (svbool_t pg, VTYPE x, VTYPE x2,
+ VTYPE x4, VTYPE x8, const STYPE *poly)
+{
+ return svmla_x (pg, VWRAP (estrin_7) (pg, x, x2, x4, poly),
+ VWRAP (estrin_7) (pg, x, x2, x4, poly + 8), x8);
+}
+/* Order-16 Estrin scheme: estrin_15 on poly[0..15] plus x16 * poly[16].
+   x2 .. x16 come from the caller and are expected to be powers of x.  */
+static inline VTYPE VWRAP (estrin_16) (svbool_t pg, VTYPE x, VTYPE x2,
+ VTYPE x4, VTYPE x8, VTYPE x16,
+ const STYPE *poly)
+{
+ return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), x16,
+ poly[16]);
+}
+/* Order-17 Estrin scheme: estrin_15 on poly[0..15] plus
+   x16 * (poly[16] + x * poly[17]).  */
+static inline VTYPE VWRAP (estrin_17) (svbool_t pg, VTYPE x, VTYPE x2,
+ VTYPE x4, VTYPE x8, VTYPE x16,
+ const STYPE *poly)
+{
+ VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]);
+ return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_17,
+ x16);
+}
+/* Order-18 Estrin scheme: estrin_15 on poly[0..15] plus
+   x16 * (poly[16] + x * poly[17] + x2 * poly[18]).  */
+static inline VTYPE VWRAP (estrin_18) (svbool_t pg, VTYPE x, VTYPE x2,
+ VTYPE x4, VTYPE x8, VTYPE x16,
+ const STYPE *poly)
+{
+ VTYPE p16_17 = svmla_x (pg, DUP (poly[16]), x, poly[17]);
+ VTYPE p16_18 = svmla_x (pg, p16_17, x2, poly[18]);
+ return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly), p16_18,
+ x16);
+}
+/* Order-19 Estrin scheme: estrin_15 on poly[0..15] plus x16 times the cubic
+   on poly[16..19].  */
+static inline VTYPE VWRAP (estrin_19) (svbool_t pg, VTYPE x, VTYPE x2,
+ VTYPE x4, VTYPE x8, VTYPE x16,
+ const STYPE *poly)
+{
+ return svmla_x (pg, VWRAP (estrin_15) (pg, x, x2, x4, x8, poly),
+ VWRAP (pairwise_poly_3) (pg, x, x2, poly + 16), x16);
+}
+
+/* Horner's rule on poly[0..3]:
+   ((poly[3] * x + poly[2]) * x + poly[1]) * x + poly[0].  */
+static inline VTYPE VWRAP (horner_3) (svbool_t pg, VTYPE x, const STYPE *poly)
+{
+ VTYPE p = svmla_x (pg, DUP (poly[2]), x, poly[3]);
+ p = svmad_x (pg, x, p, poly[1]);
+ p = svmad_x (pg, x, p, poly[0]);
+ return p;
+}
+/* Horner's rule on poly[0..4]: start from poly[3] + x * poly[4], then fold in
+   poly[2], poly[1] and poly[0] with one multiply-add by x each.  */
+static inline VTYPE VWRAP (horner_4) (svbool_t pg, VTYPE x, const STYPE *poly)
+{
+ VTYPE p = svmla_x (pg, DUP (poly[3]), x, poly[4]);
+ p = svmad_x (pg, x, p, poly[2]);
+ p = svmad_x (pg, x, p, poly[1]);
+ p = svmad_x (pg, x, p, poly[0]);
+ return p;
+}
+/* Horner's rule on poly[0..5]: x * horner_4 (poly + 1) + poly[0].  */
+static inline VTYPE VWRAP (horner_5) (svbool_t pg, VTYPE x, const STYPE *poly)
+{
+ return svmad_x (pg, x, VWRAP (horner_4) (pg, x, poly + 1), poly[0]);
+}
+/* Horner's rule on poly[0..6]: x * horner_5 (poly + 1) + poly[0].  */
+static inline VTYPE VWRAP (horner_6) (svbool_t pg, VTYPE x, const STYPE *poly)
+{
+ return svmad_x (pg, x, VWRAP (horner_5) (pg, x, poly + 1), poly[0]);
+}
+/* Horner's rule on poly[0..7]: x * horner_6 (poly + 1) + poly[0].  */
+static inline VTYPE VWRAP (horner_7) (svbool_t pg, VTYPE x, const STYPE *poly)
+{
+ return svmad_x (pg, x, VWRAP (horner_6) (pg, x, poly + 1), poly[0]);
+}
+/* Horner's rule on poly[0..8]: x * horner_7 (poly + 1) + poly[0].  */
+static inline VTYPE VWRAP (horner_8) (svbool_t pg, VTYPE x, const STYPE *poly)
+{
+ return svmad_x (pg, x, VWRAP (horner_7) (pg, x, poly + 1), poly[0]);
+}
+/* Horner's rule on poly[0..9]: x * horner_8 (poly + 1) + poly[0].  */
+static inline VTYPE VWRAP (horner_9) (svbool_t pg, VTYPE x, const STYPE *poly)
+{
+ return svmad_x (pg, x, VWRAP (horner_8) (pg, x, poly + 1), poly[0]);
+}
+/* Horner's rule for orders 10, 11 and 12: each is
+   x * horner_(n-1) (poly + 1) + poly[0].
+   The names go through VWRAP like every other helper in this header, so each
+   including wrapper gets its own precision suffix rather than a fixed _f32
+   name regardless of VTYPE.  */
+static inline VTYPE
+VWRAP (horner_10) (svbool_t pg, VTYPE x, const STYPE *poly)
+{
+  return svmad_x (pg, x, VWRAP (horner_9) (pg, x, poly + 1), poly[0]);
+}
+static inline VTYPE
+VWRAP (horner_11) (svbool_t pg, VTYPE x, const STYPE *poly)
+{
+  return svmad_x (pg, x, VWRAP (horner_10) (pg, x, poly + 1), poly[0]);
+}
+static inline VTYPE
+VWRAP (horner_12) (svbool_t pg, VTYPE x, const STYPE *poly)
+{
+  return svmad_x (pg, x, VWRAP (horner_11) (pg, x, poly + 1), poly[0]);
+}
+
+/* Pairwise Horner on poly[0..4]: p01 + x2 * (p23 + x2 * poly[4]), where
+   pNM = poly[N] + x * poly[M].  x2 is expected to hold x * x.  */
+static inline VTYPE VWRAP (pw_horner_4) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
+ VTYPE p;
+ p = svmla_x (pg, p23, x2, poly[4]);
+ p = svmla_x (pg, p01, x2, p);
+ return p;
+}
+/* Pairwise Horner on poly[0..5]: p01 + x2 * (p23 + x2 * p45), where
+   pNM = poly[N] + x * poly[M].  x2 is expected to hold x * x.  */
+static inline VTYPE VWRAP (pw_horner_5) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ VTYPE p23 = svmla_x (pg, DUP (poly[2]), x, poly[3]);
+ VTYPE p45 = svmla_x (pg, DUP (poly[4]), x, poly[5]);
+ VTYPE p;
+ p = svmla_x (pg, p23, x2, p45);
+ p = svmla_x (pg, p01, x2, p);
+ return p;
+}
+/* Pairwise Horner on poly[0..6]:
+   (poly[0] + x * poly[1]) + x2 * pw_horner_4 (poly + 2).  */
+static inline VTYPE VWRAP (pw_horner_6) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p26 = VWRAP (pw_horner_4) (pg, x, x2, poly + 2);
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ return svmla_x (pg, p01, x2, p26);
+}
+/* Pairwise Horner on poly[0..7]:
+   (poly[0] + x * poly[1]) + x2 * pw_horner_5 (poly + 2).  */
+static inline VTYPE VWRAP (pw_horner_7) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p27 = VWRAP (pw_horner_5) (pg, x, x2, poly + 2);
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ return svmla_x (pg, p01, x2, p27);
+}
+/* Pairwise Horner on poly[0..8]:
+   (poly[0] + x * poly[1]) + x2 * pw_horner_6 (poly + 2).  */
+static inline VTYPE VWRAP (pw_horner_8) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p28 = VWRAP (pw_horner_6) (pg, x, x2, poly + 2);
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ return svmla_x (pg, p01, x2, p28);
+}
+/* Pairwise Horner on poly[0..9]:
+   (poly[0] + x * poly[1]) + x2 * pw_horner_7 (poly + 2).  */
+static inline VTYPE VWRAP (pw_horner_9) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p29 = VWRAP (pw_horner_7) (pg, x, x2, poly + 2);
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ return svmla_x (pg, p01, x2, p29);
+}
+/* Pairwise Horner on poly[0..10]:
+   (poly[0] + x * poly[1]) + x2 * pw_horner_8 (poly + 2).  */
+static inline VTYPE VWRAP (pw_horner_10) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p2_10 = VWRAP (pw_horner_8) (pg, x, x2, poly + 2);
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ return svmla_x (pg, p01, x2, p2_10);
+}
+/* Pairwise Horner on poly[0..11]:
+   (poly[0] + x * poly[1]) + x2 * pw_horner_9 (poly + 2).  */
+static inline VTYPE VWRAP (pw_horner_11) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p2_11 = VWRAP (pw_horner_9) (pg, x, x2, poly + 2);
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ return svmla_x (pg, p01, x2, p2_11);
+}
+/* Pairwise Horner on poly[0..12]:
+   (poly[0] + x * poly[1]) + x2 * pw_horner_10 (poly + 2).  */
+static inline VTYPE VWRAP (pw_horner_12) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p2_12 = VWRAP (pw_horner_10) (pg, x, x2, poly + 2);
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ return svmla_x (pg, p01, x2, p2_12);
+}
+/* Pairwise Horner on poly[0..13]:
+   (poly[0] + x * poly[1]) + x2 * pw_horner_11 (poly + 2).  */
+static inline VTYPE VWRAP (pw_horner_13) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p2_13 = VWRAP (pw_horner_11) (pg, x, x2, poly + 2);
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ return svmla_x (pg, p01, x2, p2_13);
+}
+/* Pairwise Horner on poly[0..14]:
+   (poly[0] + x * poly[1]) + x2 * pw_horner_12 (poly + 2).  */
+static inline VTYPE VWRAP (pw_horner_14) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p2_14 = VWRAP (pw_horner_12) (pg, x, x2, poly + 2);
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ return svmla_x (pg, p01, x2, p2_14);
+}
+/* Pairwise Horner on poly[0..15]:
+   (poly[0] + x * poly[1]) + x2 * pw_horner_13 (poly + 2).  */
+static inline VTYPE VWRAP (pw_horner_15) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p2_15 = VWRAP (pw_horner_13) (pg, x, x2, poly + 2);
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ return svmla_x (pg, p01, x2, p2_15);
+}
+/* Pairwise Horner on poly[0..16]:
+   (poly[0] + x * poly[1]) + x2 * pw_horner_14 (poly + 2).  */
+static inline VTYPE VWRAP (pw_horner_16) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p2_16 = VWRAP (pw_horner_14) (pg, x, x2, poly + 2);
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ return svmla_x (pg, p01, x2, p2_16);
+}
+/* Pairwise Horner on poly[0..17]:
+   (poly[0] + x * poly[1]) + x2 * pw_horner_15 (poly + 2).  */
+static inline VTYPE VWRAP (pw_horner_17) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p2_17 = VWRAP (pw_horner_15) (pg, x, x2, poly + 2);
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ return svmla_x (pg, p01, x2, p2_17);
+}
+/* Pairwise Horner on poly[0..18]:
+   (poly[0] + x * poly[1]) + x2 * pw_horner_16 (poly + 2).  */
+static inline VTYPE VWRAP (pw_horner_18) (svbool_t pg, VTYPE x, VTYPE x2,
+ const STYPE *poly)
+{
+ VTYPE p2_18 = VWRAP (pw_horner_16) (pg, x, x2, poly + 2);
+ VTYPE p01 = svmla_x (pg, DUP (poly[0]), x, poly[1]);
+ return svmla_x (pg, p01, x2, p2_18);
+}
diff --git a/pl/math/s_acosh_3u5.c b/pl/math/s_acosh_3u5.c
deleted file mode 100644
index f62cbd6b53f0..000000000000
--- a/pl/math/s_acosh_3u5.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_acosh_3u5.c"
diff --git a/pl/math/s_acoshf_3u1.c b/pl/math/s_acoshf_3u1.c
deleted file mode 100644
index 374066622a0f..000000000000
--- a/pl/math/s_acoshf_3u1.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_acoshf_3u1.c"
diff --git a/pl/math/s_asinh_3u5.c b/pl/math/s_asinh_3u5.c
deleted file mode 100644
index ab8fbd9c3d69..000000000000
--- a/pl/math/s_asinh_3u5.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_asinh_3u5.c"
diff --git a/pl/math/s_asinhf_2u7.c b/pl/math/s_asinhf_2u7.c
deleted file mode 100644
index 13e1a5fd314a..000000000000
--- a/pl/math/s_asinhf_2u7.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_asinhf_2u7.c"
diff --git a/pl/math/s_atan2_3u.c b/pl/math/s_atan2_3u.c
deleted file mode 100644
index 4603e5f72615..000000000000
--- a/pl/math/s_atan2_3u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_atan2_3u.c"
diff --git a/pl/math/s_atan2f_3u.c b/pl/math/s_atan2f_3u.c
deleted file mode 100644
index 894d843273ea..000000000000
--- a/pl/math/s_atan2f_3u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_atan2f_3u.c"
diff --git a/pl/math/s_atan_2u5.c b/pl/math/s_atan_2u5.c
deleted file mode 100644
index 4b61bc4d1460..000000000000
--- a/pl/math/s_atan_2u5.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_atan_2u5.c"
diff --git a/pl/math/s_atanf_3u.c b/pl/math/s_atanf_3u.c
deleted file mode 100644
index 6b6571927195..000000000000
--- a/pl/math/s_atanf_3u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_atanf_3u.c"
diff --git a/pl/math/s_atanh_3u5.c b/pl/math/s_atanh_3u5.c
deleted file mode 100644
index f6a5f75b1779..000000000000
--- a/pl/math/s_atanh_3u5.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_atanh_3u5.c"
diff --git a/pl/math/s_atanhf_3u1.c b/pl/math/s_atanhf_3u1.c
deleted file mode 100644
index e7e5c6197406..000000000000
--- a/pl/math/s_atanhf_3u1.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_atanhf_3u1.c"
diff --git a/pl/math/s_cbrt_2u.c b/pl/math/s_cbrt_2u.c
deleted file mode 100644
index 435e74a546c6..000000000000
--- a/pl/math/s_cbrt_2u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_cbrt_2u.c"
diff --git a/pl/math/s_cbrtf_1u5.c b/pl/math/s_cbrtf_1u5.c
deleted file mode 100644
index 5c793704b62a..000000000000
--- a/pl/math/s_cbrtf_1u5.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_cbrtf_1u5.c"
diff --git a/pl/math/s_cosh_2u.c b/pl/math/s_cosh_2u.c
deleted file mode 100644
index cdf352cf5793..000000000000
--- a/pl/math/s_cosh_2u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_cosh_2u.c"
diff --git a/pl/math/s_coshf_2u4.c b/pl/math/s_coshf_2u4.c
deleted file mode 100644
index 8f7d5da6e6ef..000000000000
--- a/pl/math/s_coshf_2u4.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_coshf_2u4.c"
diff --git a/pl/math/s_erf_2u.c b/pl/math/s_erf_2u.c
deleted file mode 100644
index 839535c3897f..000000000000
--- a/pl/math/s_erf_2u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_erf_2u.c"
diff --git a/pl/math/s_erfc_4u.c b/pl/math/s_erfc_4u.c
deleted file mode 100644
index bf9e3e62bd31..000000000000
--- a/pl/math/s_erfc_4u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_erfc_4u.c"
diff --git a/pl/math/s_erfcf_1u.c b/pl/math/s_erfcf_1u.c
deleted file mode 100644
index 024d22498ff5..000000000000
--- a/pl/math/s_erfcf_1u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_erfcf_1u.c"
diff --git a/pl/math/s_erff_1u5.c b/pl/math/s_erff_1u5.c
deleted file mode 100644
index a5b9bf9afa72..000000000000
--- a/pl/math/s_erff_1u5.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_erff_1u5.c"
diff --git a/pl/math/s_exp_tail.c b/pl/math/s_exp_tail.c
deleted file mode 100644
index 20b1b41a9689..000000000000
--- a/pl/math/s_exp_tail.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_exp_tail.c"
diff --git a/pl/math/s_expf.c b/pl/math/s_expf.c
deleted file mode 100644
index 557a2e3d36af..000000000000
--- a/pl/math/s_expf.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_expf.c"
diff --git a/pl/math/s_expm1_2u5.c b/pl/math/s_expm1_2u5.c
deleted file mode 100644
index da2d6e7ebf82..000000000000
--- a/pl/math/s_expm1_2u5.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_expm1_2u5.c"
diff --git a/pl/math/s_expm1f_1u6.c b/pl/math/s_expm1f_1u6.c
deleted file mode 100644
index eea8089da989..000000000000
--- a/pl/math/s_expm1f_1u6.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_expm1f_1u6.c"
diff --git a/pl/math/s_log10_2u5.c b/pl/math/s_log10_2u5.c
deleted file mode 100644
index 2480e5aa2cf1..000000000000
--- a/pl/math/s_log10_2u5.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_log10_2u5.c"
diff --git a/pl/math/s_log10f_3u5.c b/pl/math/s_log10f_3u5.c
deleted file mode 100644
index 173e0fdc3400..000000000000
--- a/pl/math/s_log10f_3u5.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_log10f_3u5.c"
diff --git a/pl/math/s_log1p_2u5.c b/pl/math/s_log1p_2u5.c
deleted file mode 100644
index 20b395a5a2d0..000000000000
--- a/pl/math/s_log1p_2u5.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_log1p_2u5.c"
diff --git a/pl/math/s_log1pf_2u1.c b/pl/math/s_log1pf_2u1.c
deleted file mode 100644
index 013ec4c1d903..000000000000
--- a/pl/math/s_log1pf_2u1.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_log1pf_2u1.c"
diff --git a/pl/math/s_log2_3u.c b/pl/math/s_log2_3u.c
deleted file mode 100644
index d46f3f998190..000000000000
--- a/pl/math/s_log2_3u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_log2_3u.c"
diff --git a/pl/math/s_log2f_2u5.c b/pl/math/s_log2f_2u5.c
deleted file mode 100644
index e76c67dceb62..000000000000
--- a/pl/math/s_log2f_2u5.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_log2f_2u5.c"
diff --git a/pl/math/s_sinh_3u.c b/pl/math/s_sinh_3u.c
deleted file mode 100644
index 27e5e65db178..000000000000
--- a/pl/math/s_sinh_3u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_sinh_3u.c"
diff --git a/pl/math/s_sinhf_2u3.c b/pl/math/s_sinhf_2u3.c
deleted file mode 100644
index 607f94298a79..000000000000
--- a/pl/math/s_sinhf_2u3.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_sinhf_2u3.c"
diff --git a/pl/math/s_tan_3u5.c b/pl/math/s_tan_3u5.c
deleted file mode 100644
index adb807c5beb8..000000000000
--- a/pl/math/s_tan_3u5.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_tan_3u5.c"
diff --git a/pl/math/s_tanf_3u5.c b/pl/math/s_tanf_3u5.c
deleted file mode 100644
index fa64c8aef697..000000000000
--- a/pl/math/s_tanf_3u5.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_tanf_3u5.c"
diff --git a/pl/math/s_tanh_3u.c b/pl/math/s_tanh_3u.c
deleted file mode 100644
index a4d7bce649f1..000000000000
--- a/pl/math/s_tanh_3u.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_tanh_3u.c"
diff --git a/pl/math/s_tanhf_2u6.c b/pl/math/s_tanhf_2u6.c
deleted file mode 100644
index 896fc62ebe9b..000000000000
--- a/pl/math/s_tanhf_2u6.c
+++ /dev/null
@@ -1,6 +0,0 @@
-/*
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#define SCALAR 1
-#include "v_tanhf_2u6.c"
diff --git a/pl/math/sinh_3u.c b/pl/math/sinh_3u.c
index f534815c6674..1d86629ee2a3 100644
--- a/pl/math/sinh_3u.c
+++ b/pl/math/sinh_3u.c
@@ -58,9 +58,6 @@ sinh (double x)
PL_SIG (S, D, 1, sinh, -10.0, 10.0)
PL_TEST_ULP (sinh, 2.08)
-PL_TEST_INTERVAL (sinh, 0, 0x1p-51, 100)
-PL_TEST_INTERVAL (sinh, -0, -0x1p-51, 100)
-PL_TEST_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000)
-PL_TEST_INTERVAL (sinh, -0x1p-51, -0x1.62e42fefa39fp+9, 100000)
-PL_TEST_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000)
-PL_TEST_INTERVAL (sinh, -0x1.62e42fefa39fp+9, -inf, 1000)
+PL_TEST_SYM_INTERVAL (sinh, 0, 0x1p-51, 100)
+PL_TEST_SYM_INTERVAL (sinh, 0x1p-51, 0x1.62e42fefa39fp+9, 100000)
+PL_TEST_SYM_INTERVAL (sinh, 0x1.62e42fefa39fp+9, inf, 1000)
diff --git a/pl/math/sinhf_2u3.c b/pl/math/sinhf_2u3.c
index de944288a02b..aa7aadcf67c5 100644
--- a/pl/math/sinhf_2u3.c
+++ b/pl/math/sinhf_2u3.c
@@ -68,9 +68,6 @@ sinhf (float x)
PL_SIG (S, F, 1, sinh, -10.0, 10.0)
PL_TEST_ULP (sinhf, 1.76)
-PL_TEST_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000)
-PL_TEST_INTERVAL (sinhf, -0, -0x1.62e43p+6, 100000)
-PL_TEST_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100)
-PL_TEST_INTERVAL (sinhf, -0x1.62e43p+6, -0x1.65a9fap+6, 100)
-PL_TEST_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100)
-PL_TEST_INTERVAL (sinhf, -0x1.65a9fap+6, -inf, 100)
+PL_TEST_SYM_INTERVAL (sinhf, 0, 0x1.62e43p+6, 100000)
+PL_TEST_SYM_INTERVAL (sinhf, 0x1.62e43p+6, 0x1.65a9fap+6, 100)
+PL_TEST_SYM_INTERVAL (sinhf, 0x1.65a9fap+6, inf, 100)
diff --git a/pl/math/sinpi_3u.c b/pl/math/sinpi_3u.c
new file mode 100644
index 000000000000..a04a352a62e6
--- /dev/null
+++ b/pl/math/sinpi_3u.c
@@ -0,0 +1,90 @@
+/*
+ * Double-precision scalar sinpi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#define _GNU_SOURCE
+#include <math.h>
+#include "mathlib.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_scalar_f64.h"
+
+/* Taylor series coefficients for sin(pi * x).
+ C2 coefficient (originally ~=-5.16771278) has been split into two parts:
+ C2_hi = -4, C2_lo = C2 - C2_hi (~=-1.16771278)
+ This change in magnitude reduces floating point rounding errors.
+ C2_hi is then reintroduced after the polynomial approximation. */
+static const double poly[]
+ = { 0x1.921fb54442d184p1, -0x1.2aef39896f94bp0, 0x1.466bc6775ab16p1,
+ -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
+ 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, 0x1.af86ae521260bp-21,
+ -0x1.012a9870eeb7dp-25 };
+
+#define Shift 0x1.8p+52
+
+/* Approximation for scalar double-precision sinpi(x).
+ Maximum error: 3.03 ULP:
+ sinpi(0x1.a90da2818f8b5p+7) got 0x1.fe358f255a4b3p-1
+ want 0x1.fe358f255a4b6p-1. */
+double
+sinpi (double x)
+{
+ if (isinf (x))
+ return __math_invalid (x);
+
+ double r = asdouble (asuint64 (x) & ~0x8000000000000000);
+ uint64_t sign = asuint64 (x) & 0x8000000000000000;
+
+ /* Edge cases for when sinpi should be exactly 0. (Integers)
+ Every double with magnitude >= 0x1p53 is an even integer. */
+ if (r >= 0x1p53)
+ return 0;
+
+ /* If x is an integer, return 0. */
+ uint64_t m = (uint64_t) r;
+ if (r == m)
+ return 0;
+
+ /* For very small inputs, squaring r causes underflow.
+ Values below this threshold can be approximated via sinpi(x) ≈ pi*x. */
+ if (r < 0x1p-63)
+ return M_PI * x;
+
+ /* Any non-integer values >= 0x1p51 will be int + 0.5.
+ These values should return exactly 1 or -1. */
+ if (r >= 0x1p51)
+ {
+ uint64_t iy = ((m & 1) << 63) ^ asuint64 (1.0);
+ return asdouble (sign ^ iy);
+ }
+
+ /* n = rint(|x|). */
+ double n = r + Shift;
+ sign ^= (asuint64 (n) << 63);
+ n = n - Shift;
+
+ /* r = |x| - n (range reduction into -1/2 .. 1/2). */
+ r = r - n;
+
+ /* y = sin(pi * r), still missing the C2_hi * r^3 term. */
+ double r2 = r * r;
+ double y = horner_9_f64 (r2, poly);
+ y = y * r;
+
+ /* Reintroduce C2_hi: adds -4 * r^3. */
+ y = fma (-4 * r2, r, y);
+
+ /* Copy sign of x to sin(|x|). */
+ return asdouble (asuint64 (y) ^ sign);
+}
+
+PL_SIG (S, D, 1, sinpi, -0.9, 0.9)
+PL_TEST_ULP (sinpi, 2.53)
+PL_TEST_SYM_INTERVAL (sinpi, 0, 0x1p-63, 5000)
+PL_TEST_SYM_INTERVAL (sinpi, 0x1p-63, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (sinpi, 0.5, 0x1p51, 10000)
+PL_TEST_SYM_INTERVAL (sinpi, 0x1p51, inf, 10000)
diff --git a/pl/math/sinpif_2u5.c b/pl/math/sinpif_2u5.c
new file mode 100644
index 000000000000..af9ca0573b37
--- /dev/null
+++ b/pl/math/sinpif_2u5.c
@@ -0,0 +1,83 @@
+/*
+ * Single-precision scalar sinpi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* Taylor series coefficients for sin(pi * x). */
+#define C0 0x1.921fb6p1f
+#define C1 -0x1.4abbcep2f
+#define C2 0x1.466bc6p1f
+#define C3 -0x1.32d2ccp-1f
+#define C4 0x1.50783p-4f
+#define C5 -0x1.e30750p-8f
+
+#define Shift 0x1.0p+23f
+
+/* Approximation for scalar single-precision sinpi(x) - sinpif.
+ Maximum error: 2.48 ULP:
+ sinpif(0x1.d062b6p-2) got 0x1.fa8c06p-1
+ want 0x1.fa8c02p-1. */
+float
+sinpif (float x)
+{
+  if (isinf (x))
+    return __math_invalidf (x);
+
+  /* Split x into magnitude r and sign bit.  */
+  float r = asfloat (asuint (x) & ~0x80000000);
+  uint32_t sign = asuint (x) & 0x80000000;
+
+  /* Edge cases for when sinpif should be exactly 0. (Integers)
+     0x1p23 is the limit for single precision to store any decimal places.  */
+  if (r >= 0x1p23f)
+    return 0;
+
+  /* NOTE(review): a NaN input is not caught by the isinf check above and
+     reaches this float-to-int32_t conversion, which ISO C leaves undefined
+     for NaN - confirm the intended NaN path.  */
+  int32_t m = roundf (r);
+  if (m == r)
+    return 0;
+
+  /* For very small inputs, squaring r causes underflow.
+     Values below this threshold can be approximated via sinpi(x) ~= pi*x.  */
+  if (r < 0x1p-31f)
+    return C0 * x;
+
+  /* Any non-integer values >= 0x1p22f will be int + 0.5.
+     These values should return exactly 1 or -1.  */
+  if (r >= 0x1p22f)
+    {
+      /* Shift an unsigned value: (m & 1) has type int, and moving a signed 1
+         into bit 31 is undefined behaviour in ISO C.  m was rounded away from
+         zero, so an odd m flips -1.0f to +1.0f.  */
+      uint32_t iy = (((uint32_t) m & 1) << 31) ^ asuint (-1.0f);
+      return asfloat (sign ^ iy);
+    }
+
+  /* n = rint(|x|).  The low bit of n + Shift is the parity of n and is folded
+     into the sign.  */
+  float n = r + Shift;
+  sign ^= (asuint (n) << 31);
+  n = n - Shift;
+
+  /* r = |x| - n (range reduction into -1/2 .. 1/2).  */
+  r = r - n;
+
+  /* y = sin(pi * r) / r, by Horner's rule in r^2.  */
+  float r2 = r * r;
+  float y = fmaf (C5, r2, C4);
+  y = fmaf (y, r2, C3);
+  y = fmaf (y, r2, C2);
+  y = fmaf (y, r2, C1);
+  y = fmaf (y, r2, C0);
+
+  /* Copy sign of x to sin(|x|).  */
+  return asfloat (asuint (y * r) ^ sign);
+}
+
+PL_SIG (S, F, 1, sinpi, -0.9, 0.9)
+PL_TEST_ULP (sinpif, 1.99)
+PL_TEST_SYM_INTERVAL (sinpif, 0, 0x1p-31, 5000)
+PL_TEST_SYM_INTERVAL (sinpif, 0x1p-31, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (sinpif, 0.5, 0x1p22f, 10000)
+PL_TEST_SYM_INTERVAL (sinpif, 0x1p22f, inf, 10000)
diff --git a/pl/math/sv_acos_2u.c b/pl/math/sv_acos_2u.c
new file mode 100644
index 000000000000..e06db6cae6af
--- /dev/null
+++ b/pl/math/sv_acos_2u.c
@@ -0,0 +1,91 @@
+/*
+ * Double-precision SVE acos(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float64_t poly[12];
+ float64_t pi, pi_over_2;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+ on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
+ .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5,
+ 0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6,
+ 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8,
+ 0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, },
+ .pi = 0x1.921fb54442d18p+1,
+ .pi_over_2 = 0x1.921fb54442d18p+0,
+};
+
+/* Double-precision SVE implementation of vector acos(x).
+
+ For |x| in [0, 0.5], use an order 11 polynomial P such that the final
+ approximation of asin is an odd polynomial:
+
+ acos(x) ~ pi/2 - (x + x^3 P(x^2)).
+
+ The largest observed error in this region is 1.18 ulps,
+ _ZGVsMxv_acos (0x1.fbc5fe28ee9e3p-2) got 0x1.0d4d0f55667f6p+0
+ want 0x1.0d4d0f55667f7p+0.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ acos(|x|) = 2 * (y + y * z * P(z)), with z = (1-|x|)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 1.52 ulps,
+ _ZGVsMxv_acos (0x1.24024271a500ap-1) got 0x1.ed82df4243f0dp-1
+ want 0x1.ed82df4243f0bp-1. */
+svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000);
+ svfloat64_t ax = svabs_x (pg, x);
+
+ /* Strict comparison: lanes with |x| == 0.5 take the small-argument path. */
+ svbool_t a_gt_half = svacgt (pg, x, 0.5);
+
+ /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| <= 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| > 0.5. */
+ svfloat64_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5),
+ svmul_x (pg, x, x));
+ svfloat64_t z = svsqrt_m (ax, a_gt_half, z2);
+
+ /* Use a single polynomial approximation P for both intervals. */
+ svfloat64_t z4 = svmul_x (pg, z2, z2);
+ svfloat64_t z8 = svmul_x (pg, z4, z4);
+ svfloat64_t z16 = svmul_x (pg, z8, z8);
+ svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly);
+
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
+
+ /* acos(x) = pi/2 - sign(x) * Q(|x|), for |x| <= 0.5
+ = 2 Q(|x|) , for 0.5 < x < 1.0
+ = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
+ svfloat64_t y
+ = svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (p), sign));
+
+ /* Assemble add + mul * y: off is pi on negative lanes and 0 elsewhere. */
+ svbool_t is_neg = svcmplt (pg, x, 0.0);
+ svfloat64_t off = svdup_f64_z (is_neg, d->pi);
+ svfloat64_t mul = svsel (a_gt_half, sv_f64 (2.0), sv_f64 (-1.0));
+ svfloat64_t add = svsel (a_gt_half, off, sv_f64 (d->pi_over_2));
+
+ return svmla_x (pg, add, mul, y);
+}
+
+PL_SIG (SV, D, 1, acos, -1.0, 1.0)
+PL_TEST_ULP (SV_NAME_D1 (acos), 1.02)
+PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0, 0.5, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (acos), 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (acos), 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (SV_NAME_D1 (acos), -0, -inf, 20000)
diff --git a/pl/math/sv_acosf_1u4.c b/pl/math/sv_acosf_1u4.c
new file mode 100644
index 000000000000..7ac59ceedfbd
--- /dev/null
+++ b/pl/math/sv_acosf_1u4.c
@@ -0,0 +1,84 @@
+/*
+ * Single-precision SVE acos(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f32.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float32_t poly[5];
+ float32_t pi, pi_over_2;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+ [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
+ .poly = { 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6,
+ 0x1.3af7d8p-5, },
+ .pi = 0x1.921fb6p+1f,
+ .pi_over_2 = 0x1.921fb6p+0f,
+};
+
+/* Single-precision SVE implementation of vector acos(x).
+
+ For |x| in [0, 0.5], use order 4 polynomial P such that the final
+ approximation of asin is an odd polynomial:
+
+ acos(x) ~ pi/2 - (x + x^3 P(x^2)).
+
+ The largest observed error in this region is 1.16 ulps,
+ _ZGVsMxv_acosf(0x1.ffbeccp-2) got 0x1.0c27f8p+0
+ want 0x1.0c27f6p+0.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ acos(x) = 2 * (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 1.32 ulps,
+ _ZGVsMxv_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
+ want 0x1.feb32ep-1. */
+svfloat32_t SV_NAME_F1 (acos) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000);
+ svfloat32_t ax = svabs_x (pg, x);
+ svbool_t a_gt_half = svacgt (pg, x, 0.5);
+
+ /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| <= 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| > 0.5. */
+ svfloat32_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f32 (0.5), ax, 0.5),
+ svmul_x (pg, x, x));
+ svfloat32_t z = svsqrt_m (ax, a_gt_half, z2);
+
+ /* Use a single polynomial approximation P for both intervals. */
+ svfloat32_t p = sv_horner_4_f32_x (pg, z2, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
+
+ /* acos(x) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+ = 2 Q(|x|) , for 0.5 < x < 1.0
+ = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
+ svfloat32_t y
+ = svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (p), sign));
+
+ svbool_t is_neg = svcmplt (pg, x, 0.0);
+ svfloat32_t off = svdup_f32_z (is_neg, d->pi);
+ svfloat32_t mul = svsel (a_gt_half, sv_f32 (2.0), sv_f32 (-1.0));
+ svfloat32_t add = svsel (a_gt_half, off, sv_f32 (d->pi_over_2));
+
+ return svmla_x (pg, add, mul, y);
+}
+
+PL_SIG (SV, F, 1, acos, -1.0, 1.0)
+PL_TEST_ULP (SV_NAME_F1 (acos), 0.82)
+PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0, 0.5, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (acos), 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (acos), 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (SV_NAME_F1 (acos), -0, -inf, 20000)
diff --git a/pl/math/sv_acosh_3u5.c b/pl/math/sv_acosh_3u5.c
new file mode 100644
index 000000000000..faf351331464
--- /dev/null
+++ b/pl/math/sv_acosh_3u5.c
@@ -0,0 +1,50 @@
+/*
+ * Double-precision SVE acosh(x) function.
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define WANT_SV_LOG1P_K0_SHORTCUT 1
+#include "sv_log1p_inline.h"
+
+#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). */
+#define OneTop 0x3ff
+
+static NOINLINE svfloat64_t
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (acosh, x, y, special);
+}
+
+/* SVE approximation for double-precision acosh, based on log1p.
+ The largest observed error is 3.19 ULP in the region where the
+ argument to log1p falls in the k=0 interval, i.e. x close to 1:
+ SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2
+ want 0x1.ed23399f51373p-2. */
+svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg)
+{
+ svuint64_t itop = svlsr_x (pg, svreinterpret_u64 (x), 52);
+ /* (itop - OneTop) >= (BigBoundTop - OneTop). */
+ svbool_t special = svcmpge (pg, svsub_x (pg, itop, OneTop), sv_u64 (0x1ff));
+
+ svfloat64_t xm1 = svsub_x (pg, x, 1);
+ svfloat64_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1));
+ svfloat64_t y = sv_log1p_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg);
+
+ /* Fall back to scalar routine for special lanes. */
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, y, special);
+
+ return y;
+}
+
+PL_SIG (SV, D, 1, acosh, 1.0, 10.0)
+PL_TEST_ULP (SV_NAME_D1 (acosh), 2.69)
+PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 1, 0x1p511, 90000)
+PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 0x1p511, inf, 10000)
+PL_TEST_INTERVAL (SV_NAME_D1 (acosh), 0, 1, 1000)
+PL_TEST_INTERVAL (SV_NAME_D1 (acosh), -0, -inf, 10000)
diff --git a/pl/math/sv_acoshf_2u8.c b/pl/math/sv_acoshf_2u8.c
new file mode 100644
index 000000000000..f527083af40a
--- /dev/null
+++ b/pl/math/sv_acoshf_2u8.c
@@ -0,0 +1,47 @@
+/*
+ * Single-precision SVE acosh(x) function.
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define One 0x3f800000
+#define Thres 0x20000000 /* asuint(0x1p64) - One. */
+
+#include "sv_log1pf_inline.h"
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (acoshf, x, y, special);
+}
+
+/* Single-precision SVE acosh(x) routine. Implements the same algorithm as
+ vector acoshf and log1p.
+
+ Maximum error is 2.78 ULPs:
+ SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4
+ want 0x1.f45b3cp-4. */
+svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg)
+{
+ svuint32_t ix = svreinterpret_u32 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres);
+
+ svfloat32_t xm1 = svsub_x (pg, x, 1.0f);
+ svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f));
+ svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, y, special);
+ return y;
+}
+
+PL_SIG (SV, F, 1, acosh, 1.0, 10.0)
+PL_TEST_ULP (SV_NAME_F1 (acosh), 2.29)
+PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 0, 1, 500)
+PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 1, 0x1p64, 100000)
+PL_TEST_INTERVAL (SV_NAME_F1 (acosh), 0x1p64, inf, 1000)
+PL_TEST_INTERVAL (SV_NAME_F1 (acosh), -0, -inf, 1000)
diff --git a/pl/math/sv_asin_3u.c b/pl/math/sv_asin_3u.c
new file mode 100644
index 000000000000..c3dd37b145ae
--- /dev/null
+++ b/pl/math/sv_asin_3u.c
@@ -0,0 +1,84 @@
+/*
+ * Double-precision SVE asin(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float64_t poly[12];
+ float64_t pi_over_2f;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+ on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
+ .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4,
+ 0x1.6db6db67f6d9fp-5, 0x1.f1c71fbd29fbbp-6,
+ 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6,
+ 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7,
+ 0x1.fd1151acb6bedp-8, 0x1.087182f799c1dp-6,
+ -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, },
+ .pi_over_2f = 0x1.921fb54442d18p+0,
+};
+
+#define P(i) sv_f64 (d->poly[i])
+
+/* Double-precision SVE implementation of vector asin(x).
+
+ For |x| in [0, 0.5], use an order 11 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 0.52 ulps,
+ _ZGVsMxv_asin(0x1.d95ae04998b6cp-2) got 0x1.ec13757305f27p-2
+ want 0x1.ec13757305f26p-2.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ asin(x) = pi/2 - 2 * (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 2.69 ulps,
+ _ZGVsMxv_asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
+ want 0x1.110d7e85fdd53p-1. */
+svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000);
+ svfloat64_t ax = svabs_x (pg, x);
+ svbool_t a_ge_half = svacge (pg, x, 0.5);
+
+ /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+ z = x ^ 2 and y = |x| , if |x| < 0.5
+ z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
+ svfloat64_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5),
+ svmul_x (pg, x, x));
+ svfloat64_t z = svsqrt_m (ax, a_ge_half, z2);
+
+ /* Use a single polynomial approximation P for both intervals. */
+ svfloat64_t z4 = svmul_x (pg, z2, z2);
+ svfloat64_t z8 = svmul_x (pg, z4, z4);
+ svfloat64_t z16 = svmul_x (pg, z8, z8);
+ svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2f);
+
+ /* Copy sign. */
+ return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign));
+}
+
+PL_SIG (SV, D, 1, asin, -1.0, 1.0)
+PL_TEST_ULP (SV_NAME_D1 (asin), 2.19)
+PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0, 0.5, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (asin), 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (asin), 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (SV_NAME_D1 (asin), -0, -inf, 20000)
diff --git a/pl/math/sv_asinf_2u5.c b/pl/math/sv_asinf_2u5.c
new file mode 100644
index 000000000000..8e9edc2439f5
--- /dev/null
+++ b/pl/math/sv_asinf_2u5.c
@@ -0,0 +1,76 @@
+/*
+ * Single-precision SVE asin(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f32.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float32_t poly[5];
+ float32_t pi_over_2f;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+ [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
+ .poly = { 0x1.55555ep-3, 0x1.33261ap-4, 0x1.70d7dcp-5, 0x1.b059dp-6,
+ 0x1.3af7d8p-5, },
+ .pi_over_2f = 0x1.921fb6p+0f,
+};
+
+/* Single-precision SVE implementation of vector asin(x).
+
+ For |x| in [0, 0.5], use order 4 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 0.83 ulps,
+ _ZGVsMxv_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2
+ want 0x1.fef15cp-2.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ asin(x) = pi/2 - 2 * (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 2.41 ulps,
+ _ZGVsMxv_asinf (-0x1.00203ep-1) got -0x1.0c3a64p-1
+ want -0x1.0c3a6p-1. */
+svfloat32_t SV_NAME_F1 (asin) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000);
+
+ svfloat32_t ax = svabs_x (pg, x);
+ svbool_t a_ge_half = svacge (pg, x, 0.5);
+
+ /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+ z = x ^ 2 and y = |x| , if |x| < 0.5
+ z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
+ svfloat32_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f32 (0.5), ax, 0.5),
+ svmul_x (pg, x, x));
+ svfloat32_t z = svsqrt_m (ax, a_ge_half, z2);
+
+ /* Use a single polynomial approximation P for both intervals. */
+ svfloat32_t p = sv_horner_4_f32_x (pg, z2, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ svfloat32_t y = svmad_m (a_ge_half, p, sv_f32 (-2.0), d->pi_over_2f);
+
+ /* Copy sign. */
+ return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign));
+}
+
+PL_SIG (SV, F, 1, asin, -1.0, 1.0)
+PL_TEST_ULP (SV_NAME_F1 (asin), 1.91)
+PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0, 0.5, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (asin), 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (asin), 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (SV_NAME_F1 (asin), -0, -inf, 20000)
\ No newline at end of file
diff --git a/pl/math/sv_asinh_3u0.c b/pl/math/sv_asinh_3u0.c
new file mode 100644
index 000000000000..711f0dfdbedc
--- /dev/null
+++ b/pl/math/sv_asinh_3u0.c
@@ -0,0 +1,129 @@
+/*
+ * Double-precision SVE asinh(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define OneTop sv_u64 (0x3ff) /* top12(asuint64(1.0f)). */
+#define HugeBound sv_u64 (0x5fe) /* top12(asuint64(0x1p511)). */
+#define TinyBound (0x3e5) /* top12(asuint64(0x1p-26)). */
+#define SignMask (0x8000000000000000)
+
+/* Constants & data for log. */
+#define A(i) __v_log_data.poly[i]
+#define Ln2 (0x1.62e42fefa39efp-1)
+#define N (1 << V_LOG_TABLE_BITS)
+#define OFF (0x3fe6900900000000)
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (asinh, x, y, special);
+}
+
+static inline svfloat64_t
+__sv_log_inline (svfloat64_t x, const svbool_t pg)
+{
+ /* Double-precision SVE log, copied from pl/math/sv_log_2u5.c with some
+ cosmetic modification and special-cases removed. See that file for details
+ of the algorithm used. */
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t tmp = svsub_x (pg, ix, OFF);
+ svuint64_t i
+ = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1);
+ svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
+ svfloat64_t z = svreinterpret_f64 (iz);
+ svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
+ svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
+ svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z);
+ svfloat64_t kd = svcvt_f64_x (pg, k);
+ svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, Ln2);
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t y = svmla_x (pg, sv_f64 (A (2)), r, A (3));
+ svfloat64_t p = svmla_x (pg, sv_f64 (A (0)), r, A (1));
+ y = svmla_x (pg, y, r2, A (4));
+ y = svmla_x (pg, p, r2, y);
+ y = svmla_x (pg, hi, r2, y);
+ return y;
+}
+
+/* Double-precision implementation of SVE asinh(x).
+ asinh is very sensitive around 1, so it is impractical to devise a single
+ low-cost algorithm which is sufficiently accurate on a wide range of input.
+ Instead we use two different algorithms:
+ asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)) if |x| >= 1
+ = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
+ where log(x) is an optimized log approximation, and P(x) is a polynomial
+ shared with the scalar routine. The greatest observed error is 2.51 ULP, in
+ |x| >= 1:
+ _ZGVsMxv_asinh(0x1.170469d024505p+0) got 0x1.e3181c43b0f36p-1
+ want 0x1.e3181c43b0f39p-1. */
+svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
+{
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t iax = svbic_x (pg, ix, SignMask);
+ svuint64_t sign = svand_x (pg, ix, SignMask);
+ svfloat64_t ax = svreinterpret_f64 (iax);
+ svuint64_t top12 = svlsr_x (pg, iax, 52);
+
+ svbool_t ge1 = svcmpge (pg, top12, OneTop);
+ svbool_t special = svcmpge (pg, top12, HugeBound);
+
+ /* Option 1: |x| >= 1.
+ Compute asinh(x) according to asinh(x) = log(x + sqrt(x^2 + 1)). */
+ svfloat64_t option_1 = sv_f64 (0);
+ if (likely (svptest_any (pg, ge1)))
+ {
+ svfloat64_t axax = svmul_x (pg, ax, ax);
+ option_1 = __sv_log_inline (
+ svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, axax, 1))), pg);
+ }
+
+ /* Option 2: |x| < 1.
+ Compute asinh(x) using a polynomial.
+ The largest observed error in this region is 1.51 ULPs:
+ _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1
+ want 0x1.c1e649ee2681dp-1. */
+ svfloat64_t option_2 = sv_f64 (0);
+ if (likely (svptest_any (pg, svnot_z (pg, ge1))))
+ {
+ svfloat64_t x2 = svmul_x (pg, ax, ax);
+ svfloat64_t z2 = svmul_x (pg, x2, x2);
+ svfloat64_t z4 = svmul_x (pg, z2, z2);
+ svfloat64_t z8 = svmul_x (pg, z4, z4);
+ svfloat64_t z16 = svmul_x (pg, z8, z8);
+ svfloat64_t p
+ = sv_estrin_17_f64_x (pg, x2, z2, z4, z8, z16, __asinh_data.poly);
+ option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax));
+ }
+
+ /* Choose the right option for each lane. */
+ svfloat64_t y = svsel (ge1, option_1, option_2);
+
+ /* Apply sign of x to y. */
+ y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, y, special);
+ return y;
+}
+
+PL_SIG (SV, D, 1, asinh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_D1 (asinh), 2.52)
+/* Test vector asinh 3 times, with control lane < 1, > 1 and special.
+ Ensures the svsel is choosing the right option in all cases. */
+#define SV_ASINH_INTERVAL(lo, hi, n) \
+ PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 0.5) \
+ PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 2) \
+ PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (asinh), lo, hi, n, 0x1p600)
+SV_ASINH_INTERVAL (0, 0x1p-26, 50000)
+SV_ASINH_INTERVAL (0x1p-26, 1, 50000)
+SV_ASINH_INTERVAL (1, 0x1p511, 50000)
+SV_ASINH_INTERVAL (0x1p511, inf, 40000)
diff --git a/pl/math/sv_asinhf_2u5.c b/pl/math/sv_asinhf_2u5.c
new file mode 100644
index 000000000000..1f1f6e5c846f
--- /dev/null
+++ b/pl/math/sv_asinhf_2u5.c
@@ -0,0 +1,55 @@
+/*
+ * Single-precision SVE asinh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "include/mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#include "sv_log1pf_inline.h"
+
+#define BigBound (0x5f800000) /* asuint(0x1p64). */
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (asinhf, x, y, special);
+}
+
+/* Single-precision SVE asinh(x) routine. Implements the same algorithm as
+ vector asinhf and log1p.
+
+ Maximum error is 2.48 ULPs:
+ SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4
+ want 0x1.ffbbb8p-4. */
+svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg)
+{
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+ svbool_t special = svcmpge (pg, iax, BigBound);
+
+ /* asinh(x) = log(x + sqrt(x * x + 1)).
+ For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
+ svfloat32_t ax2 = svmul_x (pg, ax, ax);
+ svfloat32_t d = svadd_x (pg, svsqrt_x (pg, svadd_x (pg, ax2, 1.0f)), 1.0f);
+ svfloat32_t y
+ = sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (
+ x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))),
+ special);
+ return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y)));
+}
+
+PL_SIG (SV, F, 1, asinh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_F1 (asinh), 1.98)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0, 0x1p-12, 4000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p-12, 1.0, 20000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 1.0, 0x1p64, 20000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (asinh), 0x1p64, inf, 4000)
diff --git a/pl/math/sv_atan2_2u5.c b/pl/math/sv_atan2_2u5.c
index a4bea1dcba09..00530a324a76 100644
--- a/pl/math/sv_atan2_2u5.c
+++ b/pl/math/sv_atan2_2u5.c
@@ -8,86 +8,109 @@
#include "sv_math.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_sve_f64.h"
-#if SV_SUPPORTED
-
-#include "sv_atan_common.h"
+static const struct data
+{
+ float64_t poly[20];
+ float64_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x^3*P(x^2) on
+ [2**-1022, 1.0]. */
+ .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3,
+ 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4,
+ -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5,
+ 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5,
+ -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6,
+ 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10,
+ -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, },
+ .pi_over_2 = 0x1.921fb54442d18p+0,
+};
/* Useful constants. */
-#define PiOver2 sv_f64 (0x1.921fb54442d18p+0)
#define SignMask sv_u64 (0x8000000000000000)
/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
-__attribute__ ((noinline)) static sv_f64_t
-specialcase (sv_f64_t y, sv_f64_t x, sv_f64_t ret, const svbool_t cmp)
+static svfloat64_t NOINLINE
+special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret,
+ const svbool_t cmp)
{
return sv_call2_f64 (atan2, y, x, ret, cmp);
}
-/* Returns a predicate indicating true if the input is the bit representation of
- 0, infinity or nan. */
+/* Returns a predicate indicating true if the input is the bit representation
+ of 0, infinity or nan. */
static inline svbool_t
-zeroinfnan (sv_u64_t i, const svbool_t pg)
+zeroinfnan (svuint64_t i, const svbool_t pg)
{
- return svcmpge_u64 (pg, svsub_n_u64_x (pg, svlsl_n_u64_x (pg, i, 1), 1),
- sv_u64 (2 * asuint64 (INFINITY) - 1));
+ return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1),
+ sv_u64 (2 * asuint64 (INFINITY) - 1));
}
/* Fast implementation of SVE atan2. Errors are greatest when y and
x are reasonably close together. The greatest observed error is 2.28 ULP:
- sv_atan2(-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732)
+ _ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732)
got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */
-sv_f64_t
-__sv_atan2_x (sv_f64_t y, sv_f64_t x, const svbool_t pg)
+svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
{
- sv_u64_t ix = sv_as_u64_f64 (x);
- sv_u64_t iy = sv_as_u64_f64 (y);
+ const struct data *data_ptr = ptr_barrier (&data);
+
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t iy = svreinterpret_u64 (y);
svbool_t cmp_x = zeroinfnan (ix, pg);
svbool_t cmp_y = zeroinfnan (iy, pg);
- svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y);
+ svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
- sv_u64_t sign_x = svand_u64_x (pg, ix, SignMask);
- sv_u64_t sign_y = svand_u64_x (pg, iy, SignMask);
- sv_u64_t sign_xy = sveor_u64_x (pg, sign_x, sign_y);
+ svuint64_t sign_x = svand_x (pg, ix, SignMask);
+ svuint64_t sign_y = svand_x (pg, iy, SignMask);
+ svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
- sv_f64_t ax = svabs_f64_x (pg, x);
- sv_f64_t ay = svabs_f64_x (pg, y);
+ svfloat64_t ax = svabs_x (pg, x);
+ svfloat64_t ay = svabs_x (pg, y);
- svbool_t pred_xlt0 = svcmplt_f64 (pg, x, sv_f64 (0.0));
- svbool_t pred_aygtax = svcmpgt_f64 (pg, ay, ax);
+ svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
+ svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
/* Set up z for call to atan. */
- sv_f64_t n = svsel_f64 (pred_aygtax, svneg_f64_x (pg, ax), ay);
- sv_f64_t d = svsel_f64 (pred_aygtax, ay, ax);
- sv_f64_t z = svdiv_f64_x (pg, n, d);
+ svfloat64_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay);
+ svfloat64_t d = svsel (pred_aygtax, ay, ax);
+ svfloat64_t z = svdiv_x (pg, n, d);
/* Work out the correct shift. */
- sv_f64_t shift = svsel_f64 (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0));
- shift = svsel_f64 (pred_aygtax, svadd_n_f64_x (pg, shift, 1.0), shift);
- shift = svmul_f64_x (pg, shift, PiOver2);
+ svfloat64_t shift = svsel (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0));
+ shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
+ shift = svmul_x (pg, shift, data_ptr->pi_over_2);
+
+ /* Use split Estrin scheme for P(z^2) with deg(P)=19. */
+ svfloat64_t z2 = svmul_x (pg, z, z);
+ svfloat64_t x2 = svmul_x (pg, z2, z2);
+ svfloat64_t x4 = svmul_x (pg, x2, x2);
+ svfloat64_t x8 = svmul_x (pg, x4, x4);
- sv_f64_t ret = __sv_atan_common (pg, pg, z, z, shift);
+ svfloat64_t ret = svmla_x (
+ pg, sv_estrin_7_f64_x (pg, z2, x2, x4, data_ptr->poly),
+ sv_estrin_11_f64_x (pg, z2, x2, x4, x8, data_ptr->poly + 8), x8);
+
+ /* y = shift + z + z^3 * P(z^2). */
+ svfloat64_t z3 = svmul_x (pg, z2, z);
+ ret = svmla_x (pg, z, z3, ret);
+
+ ret = svadd_m (pg, ret, shift);
/* Account for the sign of x and y. */
- ret = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (ret), sign_xy));
+ ret = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
if (unlikely (svptest_any (pg, cmp_xy)))
- {
- return specialcase (y, x, ret, cmp_xy);
- }
+ return special_case (y, x, ret, cmp_xy);
return ret;
}
-PL_ALIAS (__sv_atan2_x, _ZGVsMxvv_atan2)
-
/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
PL_SIG (SV, D, 2, atan2)
-PL_TEST_ULP (__sv_atan2, 1.78)
-PL_TEST_INTERVAL (__sv_atan2, -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (__sv_atan2, -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (__sv_atan2, 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (__sv_atan2, 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (__sv_atan2, 1e6, 1e32, 40000)
-#endif
+PL_TEST_ULP (SV_NAME_D2 (atan2), 1.78)
+PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_D2 (atan2), 100, inf, 40000)
+PL_TEST_INTERVAL (SV_NAME_D2 (atan2), -0, -inf, 40000)
diff --git a/pl/math/sv_atan2f_3u.c b/pl/math/sv_atan2f_3u.c
index f7674c441f2f..9ff73ecb74ba 100644
--- a/pl/math/sv_atan2f_3u.c
+++ b/pl/math/sv_atan2f_3u.c
@@ -8,87 +8,101 @@
#include "sv_math.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_sve_f32.h"
-#if SV_SUPPORTED
-
-#include "sv_atanf_common.h"
+static const struct data
+{
+ float32_t poly[8];
+ float32_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x^3*P(x^2) on
+ [2**-128, 1.0]. */
+ .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f,
+ -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f },
+ .pi_over_2 = 0x1.921fb6p+0f,
+};
-/* Useful constants. */
-#define PiOver2 sv_f32 (0x1.921fb6p+0f)
#define SignMask sv_u32 (0x80000000)
/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
-static inline sv_f32_t
-specialcase (sv_f32_t y, sv_f32_t x, sv_f32_t ret, const svbool_t cmp)
+static inline svfloat32_t
+special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret,
+ const svbool_t cmp)
{
return sv_call2_f32 (atan2f, y, x, ret, cmp);
}
-/* Returns a predicate indicating true if the input is the bit representation of
- 0, infinity or nan. */
+/* Returns a predicate indicating true if the input is the bit representation
+ of 0, infinity or nan. */
static inline svbool_t
-zeroinfnan (sv_u32_t i, const svbool_t pg)
+zeroinfnan (svuint32_t i, const svbool_t pg)
{
- return svcmpge_u32 (pg, svsub_n_u32_x (pg, svlsl_n_u32_x (pg, i, 1), 1),
- sv_u32 (2 * 0x7f800000lu - 1));
+ return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1),
+ sv_u32 (2 * 0x7f800000lu - 1));
}
-/* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * P(z^2)
- with reduction to [0,1] using z=1/x and shift = pi/2.
- Maximum observed error is 2.95 ULP:
- __sv_atan2f(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
- want 0x1.967f00p-1. */
-sv_f32_t
-__sv_atan2f_x (sv_f32_t y, sv_f32_t x, const svbool_t pg)
+/* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 *
+ P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum
+ observed error is 2.95 ULP:
+ _ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
+ want 0x1.967f00p-1. */
+svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
{
- sv_u32_t ix = sv_as_u32_f32 (x);
- sv_u32_t iy = sv_as_u32_f32 (y);
+ const struct data *data_ptr = ptr_barrier (&data);
+
+ svuint32_t ix = svreinterpret_u32 (x);
+ svuint32_t iy = svreinterpret_u32 (y);
svbool_t cmp_x = zeroinfnan (ix, pg);
svbool_t cmp_y = zeroinfnan (iy, pg);
- svbool_t cmp_xy = svorr_b_z (pg, cmp_x, cmp_y);
+ svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
- sv_u32_t sign_x = svand_u32_x (pg, ix, SignMask);
- sv_u32_t sign_y = svand_u32_x (pg, iy, SignMask);
- sv_u32_t sign_xy = sveor_u32_x (pg, sign_x, sign_y);
+ svuint32_t sign_x = svand_x (pg, ix, SignMask);
+ svuint32_t sign_y = svand_x (pg, iy, SignMask);
+ svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
- sv_f32_t ax = svabs_f32_x (pg, x);
- sv_f32_t ay = svabs_f32_x (pg, y);
+ svfloat32_t ax = svabs_x (pg, x);
+ svfloat32_t ay = svabs_x (pg, y);
- svbool_t pred_xlt0 = svcmplt_f32 (pg, x, sv_f32 (0.0));
- svbool_t pred_aygtax = svcmpgt_f32 (pg, ay, ax);
+ svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
+ svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
/* Set up z for call to atan. */
- sv_f32_t n = svsel_f32 (pred_aygtax, svneg_f32_x (pg, ax), ay);
- sv_f32_t d = svsel_f32 (pred_aygtax, ay, ax);
- sv_f32_t z = svdiv_f32_x (pg, n, d);
+ svfloat32_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay);
+ svfloat32_t d = svsel (pred_aygtax, ay, ax);
+ svfloat32_t z = svdiv_x (pg, n, d);
/* Work out the correct shift. */
- sv_f32_t shift = svsel_f32 (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0));
- shift = svsel_f32 (pred_aygtax, svadd_n_f32_x (pg, shift, 1.0), shift);
- shift = svmul_f32_x (pg, shift, PiOver2);
+ svfloat32_t shift = svsel (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0));
+ shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
+ shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2));
+
+ /* Use split Estrin scheme for P(z^2) with deg(P)=7. */
+ svfloat32_t z2 = svmul_x (pg, z, z);
+ svfloat32_t z4 = svmul_x (pg, z2, z2);
+ svfloat32_t z8 = svmul_x (pg, z4, z4);
- sv_f32_t ret = __sv_atanf_common (pg, pg, z, z, shift);
+ svfloat32_t ret = sv_estrin_7_f32_x (pg, z2, z4, z8, data_ptr->poly);
+
+ /* ret = shift + z + z^3 * P(z^2). */
+ svfloat32_t z3 = svmul_x (pg, z2, z);
+ ret = svmla_x (pg, z, z3, ret);
+
+ ret = svadd_m (pg, ret, shift);
/* Account for the sign of x and y. */
- ret = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (ret), sign_xy));
+ ret = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
if (unlikely (svptest_any (pg, cmp_xy)))
- {
- return specialcase (y, x, ret, cmp_xy);
- }
+ return special_case (y, x, ret, cmp_xy);
return ret;
}
-PL_ALIAS (__sv_atan2f_x, _ZGVsMxvv_atan2f)
-
/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
PL_SIG (SV, F, 2, atan2)
-PL_TEST_ULP (__sv_atan2f, 2.45)
-PL_TEST_INTERVAL (__sv_atan2f, -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (__sv_atan2f, -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (__sv_atan2f, 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (__sv_atan2f, 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (__sv_atan2f, 1e6, 1e32, 40000)
-#endif
+PL_TEST_ULP (SV_NAME_F2 (atan2), 2.45)
+PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_F2 (atan2), 100, inf, 40000)
+PL_TEST_INTERVAL (SV_NAME_F2 (atan2), -0, -inf, 40000)
diff --git a/pl/math/sv_atan_2u5.c b/pl/math/sv_atan_2u5.c
index 02ac331970c9..7ab486a4c9d2 100644
--- a/pl/math/sv_atan_2u5.c
+++ b/pl/math/sv_atan_2u5.c
@@ -8,55 +8,80 @@
#include "sv_math.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_sve_f64.h"
-#if SV_SUPPORTED
-
-#include "sv_atan_common.h"
+static const struct data
+{
+ float64_t poly[20];
+ float64_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x^3*P(x^2) on
+ [2**-1022, 1.0]. */
+ .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3,
+ 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4,
+ -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5,
+ 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5,
+ -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6,
+ 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10,
+ -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, },
+ .pi_over_2 = 0x1.921fb54442d18p+0,
+};
/* Useful constants. */
-#define PiOver2 sv_f64 (0x1.921fb54442d18p+0)
-#define AbsMask (0x7fffffffffffffff)
+#define SignMask (0x8000000000000000)
/* Fast implementation of SVE atan.
Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed
error is 2.27 ulps:
- __sv_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
- want 0x1.9225645bdd7c3p-1. */
-sv_f64_t
-__sv_atan_x (sv_f64_t x, const svbool_t pg)
+ _ZGVsMxv_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
+ want 0x1.9225645bdd7c3p-1. */
+svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg)
{
+ const struct data *d = ptr_barrier (&data);
+
/* No need to trigger special case. Small cases, infs and nans
are supported by our approximation technique. */
- sv_u64_t ix = sv_as_u64_f64 (x);
- sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask);
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t sign = svand_x (pg, ix, SignMask);
/* Argument reduction:
y := arctan(x) for x < 1
y := pi/2 + arctan(-1/x) for x > 1
Hence, use z=-1/a if x>=1, otherwise z=a. */
- svbool_t red = svacgt_n_f64 (pg, x, 1.0);
+ svbool_t red = svacgt (pg, x, 1.0);
/* Avoid dependency in abs(x) in division (and comparison). */
- sv_f64_t z = svsel_f64 (red, svdiv_f64_x (pg, sv_f64 (-1.0), x), x);
+ svfloat64_t z = svsel (red, svdivr_x (pg, x, 1.0), x);
/* Use absolute value only when needed (odd powers of z). */
- sv_f64_t az = svabs_f64_x (pg, z);
- az = svneg_f64_m (az, red, az);
+ svfloat64_t az = svabs_x (pg, z);
+ az = svneg_m (az, red, az);
+
+ /* Use split Estrin scheme for P(z^2) with deg(P)=19. */
+ svfloat64_t z2 = svmul_x (pg, z, z);
+ svfloat64_t x2 = svmul_x (pg, z2, z2);
+ svfloat64_t x4 = svmul_x (pg, x2, x2);
+ svfloat64_t x8 = svmul_x (pg, x4, x4);
- sv_f64_t y = __sv_atan_common (pg, red, z, az, PiOver2);
+ svfloat64_t y
+ = svmla_x (pg, sv_estrin_7_f64_x (pg, z2, x2, x4, d->poly),
+ sv_estrin_11_f64_x (pg, z2, x2, x4, x8, d->poly + 8), x8);
+
+ /* y = shift + z + z^3 * P(z^2). */
+ svfloat64_t z3 = svmul_x (pg, z2, az);
+ y = svmla_x (pg, az, z3, y);
+
+ /* Apply shift as indicated by `red` predicate. */
+ y = svadd_m (red, y, d->pi_over_2);
/* y = atan(x) if x>0, -atan(-x) otherwise. */
- y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign));
+ y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
return y;
}
-PL_ALIAS (__sv_atan_x, _ZGVsMxv_atan)
-
PL_SIG (SV, D, 1, atan, -3.1, 3.1)
-PL_TEST_ULP (__sv_atan, 1.78)
-PL_TEST_INTERVAL (__sv_atan, -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (__sv_atan, -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (__sv_atan, 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (__sv_atan, 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (__sv_atan, 1e6, 1e32, 40000)
-#endif
+PL_TEST_ULP (SV_NAME_D1 (atan), 1.78)
+PL_TEST_INTERVAL (SV_NAME_D1 (atan), 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_D1 (atan), 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_D1 (atan), 100, inf, 40000)
+PL_TEST_INTERVAL (SV_NAME_D1 (atan), -0, -inf, 40000)
diff --git a/pl/math/sv_atan_common.h b/pl/math/sv_atan_common.h
deleted file mode 100644
index bfe6998d2416..000000000000
--- a/pl/math/sv_atan_common.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Double-precision polynomial evaluation function for SVE atan(x) and
- * atan2(y,x).
- *
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-#include "sv_math.h"
-
-#define P(i) sv_f64 (__atan_poly_data.poly[i])
-
-/* Polynomial used in fast SVE atan(x) and atan2(y,x) implementations
- The order 19 polynomial P approximates (atan(sqrt(x))-sqrt(x))/x^(3/2). */
-static inline sv_f64_t
-__sv_atan_common (svbool_t pg, svbool_t red, sv_f64_t z, sv_f64_t az,
- sv_f64_t shift)
-{
- /* Use full Estrin scheme for P(z^2) with deg(P)=19. */
- sv_f64_t z2 = svmul_f64_x (pg, z, z);
-
- /* Level 1. */
- sv_f64_t P_1_0 = sv_fma_f64_x (pg, P (1), z2, P (0));
- sv_f64_t P_3_2 = sv_fma_f64_x (pg, P (3), z2, P (2));
- sv_f64_t P_5_4 = sv_fma_f64_x (pg, P (5), z2, P (4));
- sv_f64_t P_7_6 = sv_fma_f64_x (pg, P (7), z2, P (6));
- sv_f64_t P_9_8 = sv_fma_f64_x (pg, P (9), z2, P (8));
- sv_f64_t P_11_10 = sv_fma_f64_x (pg, P (11), z2, P (10));
- sv_f64_t P_13_12 = sv_fma_f64_x (pg, P (13), z2, P (12));
- sv_f64_t P_15_14 = sv_fma_f64_x (pg, P (15), z2, P (14));
- sv_f64_t P_17_16 = sv_fma_f64_x (pg, P (17), z2, P (16));
- sv_f64_t P_19_18 = sv_fma_f64_x (pg, P (19), z2, P (18));
-
- /* Level 2. */
- sv_f64_t x2 = svmul_f64_x (pg, z2, z2);
- sv_f64_t P_3_0 = sv_fma_f64_x (pg, P_3_2, x2, P_1_0);
- sv_f64_t P_7_4 = sv_fma_f64_x (pg, P_7_6, x2, P_5_4);
- sv_f64_t P_11_8 = sv_fma_f64_x (pg, P_11_10, x2, P_9_8);
- sv_f64_t P_15_12 = sv_fma_f64_x (pg, P_15_14, x2, P_13_12);
- sv_f64_t P_19_16 = sv_fma_f64_x (pg, P_19_18, x2, P_17_16);
-
- /* Level 3. */
- sv_f64_t x4 = svmul_f64_x (pg, x2, x2);
- sv_f64_t P_7_0 = sv_fma_f64_x (pg, P_7_4, x4, P_3_0);
- sv_f64_t P_15_8 = sv_fma_f64_x (pg, P_15_12, x4, P_11_8);
-
- /* Level 4. */
- sv_f64_t x8 = svmul_f64_x (pg, x4, x4);
- sv_f64_t y = sv_fma_f64_x (pg, P_19_16, x8, P_15_8);
- y = sv_fma_f64_x (pg, y, x8, P_7_0);
-
- /* Finalize. y = shift + z + z^3 * P(z^2). */
- sv_f64_t z3 = svmul_f64_x (pg, z2, az);
- y = sv_fma_f64_x (pg, y, z3, az);
-
- /* Apply shift as indicated by `red` predicate. */
- y = svadd_f64_m (red, y, shift);
-
- return y;
-}
diff --git a/pl/math/sv_atanf_2u9.c b/pl/math/sv_atanf_2u9.c
index 8d38e42b2290..4defb356e7f9 100644
--- a/pl/math/sv_atanf_2u9.c
+++ b/pl/math/sv_atanf_2u9.c
@@ -8,52 +8,69 @@
#include "sv_math.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_sve_f32.h"
-#if SV_SUPPORTED
-
-#include "sv_atanf_common.h"
+static const struct data
+{
+ float32_t poly[8];
+ float32_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x^3*P(x^2) on
+ [2**-128, 1.0]. */
+ .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f,
+ -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f },
+ .pi_over_2 = 0x1.921fb6p+0f,
+};
-#define PiOver2 sv_f32 (0x1.921fb6p+0f)
-#define AbsMask (0x7fffffff)
+#define SignMask (0x80000000)
/* Fast implementation of SVE atanf based on
atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
z=-1/x and shift = pi/2.
Largest observed error is 2.9 ULP, close to +/-1.0:
- __sv_atanf(0x1.0468f6p+0) got -0x1.967f06p-1
- want -0x1.967fp-1. */
-sv_f32_t
-__sv_atanf_x (sv_f32_t x, const svbool_t pg)
+ _ZGVsMxv_atanf (0x1.0468f6p+0) got -0x1.967f06p-1
+ want -0x1.967fp-1. */
+svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg)
{
+ const struct data *d = ptr_barrier (&data);
+
/* No need to trigger special case. Small cases, infs and nans
are supported by our approximation technique. */
- sv_u32_t ix = sv_as_u32_f32 (x);
- sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask);
+ svuint32_t ix = svreinterpret_u32 (x);
+ svuint32_t sign = svand_x (pg, ix, SignMask);
/* Argument reduction:
y := arctan(x) for x < 1
y := pi/2 + arctan(-1/x) for x > 1
Hence, use z=-1/a if x>=1, otherwise z=a. */
- svbool_t red = svacgt_n_f32 (pg, x, 1.0f);
+ svbool_t red = svacgt (pg, x, 1.0f);
/* Avoid dependency in abs(x) in division (and comparison). */
- sv_f32_t z = svsel_f32 (red, svdiv_f32_x (pg, sv_f32 (-1.0f), x), x);
+ svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (1.0f), x), x);
/* Use absolute value only when needed (odd powers of z). */
- sv_f32_t az = svabs_f32_x (pg, z);
- az = svneg_f32_m (az, red, az);
+ svfloat32_t az = svabs_x (pg, z);
+ az = svneg_m (az, red, az);
+
+ /* Use split Estrin scheme for P(z^2) with deg(P)=7. */
+ svfloat32_t z2 = svmul_x (pg, z, z);
+ svfloat32_t z4 = svmul_x (pg, z2, z2);
+ svfloat32_t z8 = svmul_x (pg, z4, z4);
- sv_f32_t y = __sv_atanf_common (pg, red, z, az, PiOver2);
+ svfloat32_t y = sv_estrin_7_f32_x (pg, z2, z4, z8, d->poly);
+
+ /* y = shift + z + z^3 * P(z^2). */
+ svfloat32_t z3 = svmul_x (pg, z2, az);
+ y = svmla_x (pg, az, z3, y);
+
+ /* Apply shift as indicated by 'red' predicate. */
+ y = svadd_m (red, y, sv_f32 (d->pi_over_2));
/* y = atan(x) if x>0, -atan(-x) otherwise. */
- return sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign));
+ return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
}
-PL_ALIAS (__sv_atanf_x, _ZGVsMxv_atanf)
-
PL_SIG (SV, F, 1, atan, -3.1, 3.1)
-PL_TEST_ULP (__sv_atanf, 2.9)
-PL_TEST_INTERVAL (__sv_atanf, -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (__sv_atanf, -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (__sv_atanf, 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (__sv_atanf, 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (__sv_atanf, 1e6, 1e32, 40000)
-#endif
+PL_TEST_ULP (SV_NAME_F1 (atan), 2.9)
+PL_TEST_INTERVAL (SV_NAME_F1 (atan), 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (atan), 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (atan), 100, inf, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (atan), -0, -inf, 40000)
diff --git a/pl/math/sv_atanf_common.h b/pl/math/sv_atanf_common.h
deleted file mode 100644
index dc45effec1cd..000000000000
--- a/pl/math/sv_atanf_common.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Single-precision polynomial evaluation function for SVE atan(x) and
- * atan2(y,x).
- *
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef PL_MATH_SV_ATANF_COMMON_H
-#define PL_MATH_SV_ATANF_COMMON_H
-
-#include "math_config.h"
-#include "sv_math.h"
-
-#define P(i) sv_f32 (__atanf_poly_data.poly[i])
-
-/* Polynomial used in fast SVE atanf(x) and atan2f(y,x) implementations
- The order 7 polynomial P approximates (f(sqrt(x))-sqrt(x))/x^(3/2). */
-static inline sv_f32_t
-__sv_atanf_common (svbool_t pg, svbool_t red, sv_f32_t z, sv_f32_t az,
- sv_f32_t shift)
-{
- /* Use full Estrin scheme for P(z^2) with deg(P)=7. */
-
- /* First compute square powers of z. */
- sv_f32_t z2 = svmul_f32_x (pg, z, z);
- sv_f32_t z4 = svmul_f32_x (pg, z2, z2);
- sv_f32_t z8 = svmul_f32_x (pg, z4, z4);
-
- /* Then assemble polynomial. */
- sv_f32_t p_4_7 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (7), P (6))),
- (sv_fma_f32_x (pg, z2, P (5), P (4))));
- sv_f32_t p_0_3 = sv_fma_f32_x (pg, z4, (sv_fma_f32_x (pg, z2, P (3), P (2))),
- (sv_fma_f32_x (pg, z2, P (1), P (0))));
- sv_f32_t y = sv_fma_f32_x (pg, z8, p_4_7, p_0_3);
-
- /* Finalize. y = shift + z + z^3 * P(z^2). */
- sv_f32_t z3 = svmul_f32_x (pg, z2, az);
- y = sv_fma_f32_x (pg, y, z3, az);
-
- /* Apply shift as indicated by 'red' predicate. */
- y = svadd_f32_m (red, y, shift);
-
- return y;
-}
-
-#endif // PL_MATH_SV_ATANF_COMMON_H
diff --git a/pl/math/sv_atanh_3u3.c b/pl/math/sv_atanh_3u3.c
new file mode 100644
index 000000000000..dcc9350b4962
--- /dev/null
+++ b/pl/math/sv_atanh_3u3.c
@@ -0,0 +1,60 @@
+/*
+ * Double-precision SVE atanh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define WANT_SV_LOG1P_K0_SHORTCUT 0
+#include "sv_log1p_inline.h"
+
+#define One (0x3ff0000000000000)
+#define Half (0x3fe0000000000000)
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (atanh, x, y, special);
+}
+
+/* SVE approximation for double-precision atanh, based on log1p.
+ The greatest observed error is 2.81 ULP:
+ _ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
+ want 0x1.ffd8ff31b501cp-6. */
+svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
+{
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t iax = svreinterpret_u64 (ax);
+ svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax);
+ svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));
+
+ /* Lanes with |x| >= 1 are special. This is tested with an absolute
+    floating-point compare on x rather than an integer compare on iax. */
+ svbool_t special = svacge (pg, x, 1.0);
+
+ /* Computation is based on the following identity:
+ (1+x)/(1-x) = 1 + 2x/(1-x). */
+ svfloat64_t y;
+ y = svadd_x (pg, ax, ax);
+ y = svdiv_x (pg, y, svsub_x (pg, sv_f64 (1), ax));
+ /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y). */
+ y = sv_log1p_inline (y, pg);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, svmul_x (pg, halfsign, y), special);
+ return svmul_x (pg, halfsign, y);
+}
+
+PL_SIG (SV, D, 1, atanh, -1.0, 1.0)
+PL_TEST_ULP (SV_NAME_D1 (atanh), 3.32)
+/* atanh is asymptotic at 1, which is the default control value - have to set
+ -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+ control lane is irrelevant if fp exceptions are disabled). */
+PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 0, 0x1p-23, 10000, 0)
+PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 0x1p-23, 1, 90000, 0)
+PL_TEST_SYM_INTERVAL_C (SV_NAME_D1 (atanh), 1, inf, 100, 0)
diff --git a/pl/math/sv_atanhf_2u8.c b/pl/math/sv_atanhf_2u8.c
new file mode 100644
index 000000000000..413c60ce05da
--- /dev/null
+++ b/pl/math/sv_atanhf_2u8.c
@@ -0,0 +1,56 @@
+/*
+ * Single-precision vector atanh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#include "sv_log1pf_inline.h"
+
+#define One (0x3f800000)
+#define Half (0x3f000000)
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (atanhf, x, y, special);
+}
+
+/* Approximation for vector single-precision atanh(x) using modified log1p.
+ The maximum error is 2.8 ULP:
+ _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5
+ want 0x1.ffbbb6p-5. */
+svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg)
+{
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+ svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, Half));
+ svbool_t special = svcmpge (pg, iax, One);
+
+ /* Computation is based on the following identity:
+ * (1+x)/(1-x) = 1 + 2x/(1-x). */
+ svfloat32_t y = svadd_x (pg, ax, ax);
+ y = svdiv_x (pg, y, svsub_x (pg, sv_f32 (1), ax));
+ /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y). */
+ y = sv_log1pf_inline (y, pg);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, svmul_x (pg, halfsign, y), special);
+
+ return svmul_x (pg, halfsign, y);
+}
+
+PL_SIG (SV, F, 1, atanh, -1.0, 1.0)
+PL_TEST_ULP (SV_NAME_F1 (atanh), 2.59)
+/* atanh is asymptotic at 1, which is the default control value - have to set
+ -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+ control lane is irrelevant if fp exceptions are disabled). */
+PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 0, 0x1p-12, 1000, 0)
+PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 0x1p-12, 1, 20000, 0)
+PL_TEST_SYM_INTERVAL_C (SV_NAME_F1 (atanh), 1, inf, 1000, 0)
diff --git a/pl/math/sv_cbrt_2u.c b/pl/math/sv_cbrt_2u.c
new file mode 100644
index 000000000000..192f1cd80d59
--- /dev/null
+++ b/pl/math/sv_cbrt_2u.c
@@ -0,0 +1,122 @@
+/*
+ * Double-precision SVE cbrt(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f64.h"
+
+const static struct data
+{
+ float64_t poly[4];
+ float64_t table[5];
+ float64_t one_third, two_thirds, shift;
+ int64_t exp_bias;
+ uint64_t tiny_bound, thresh;
+} data = {
+ /* Generated with FPMinimax in [0.5, 1]. */
+ .poly = { 0x1.c14e8ee44767p-2, 0x1.dd2d3f99e4c0ep-1, -0x1.08e83026b7e74p-1,
+ 0x1.2c74eaa3ba428p-3, },
+ /* table[i] = 2^((i - 2) / 3). */
+ .table = { 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
+ 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0, },
+ .one_third = 0x1.5555555555555p-2,
+ .two_thirds = 0x1.5555555555555p-1,
+ .shift = 0x1.8p52,
+ .exp_bias = 1022,
+ .tiny_bound = 0x0010000000000000, /* Smallest normal. */
+ .thresh = 0x7fe0000000000000, /* asuint64 (infinity) - tiny_bound. */
+};
+
+#define MantissaMask 0x000fffffffffffff
+#define HalfExp 0x3fe0000000000000
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (cbrt, x, y, special);
+}
+
+static inline svfloat64_t
+shifted_lookup (const svbool_t pg, const float64_t *table, svint64_t i)
+{
+ return svld1_gather_index (pg, table, svadd_x (pg, i, 2));
+}
+
+/* Approximation for double-precision vector cbrt(x), using low-order
+ polynomial and two Newton iterations. Greatest observed error is 1.79 ULP.
+ Errors repeat according to the exponent, for instance an error observed for
+ double value m * 2^e will be observed for any input m * 2^(e + 3*i), where i
+ is an integer.
+ _ZGVsMxv_cbrt (0x0.3fffb8d4413f3p-1022) got 0x1.965f53b0e5d97p-342
+ want 0x1.965f53b0e5d95p-342. */
+svfloat64_t SV_NAME_D1 (cbrt) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t iax = svreinterpret_u64 (ax);
+ svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax);
+
+ /* Subnormal, +/-0 and special values. */
+ svbool_t special = svcmpge (pg, svsub_x (pg, iax, d->tiny_bound), d->thresh);
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexp, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ svfloat64_t m = svreinterpret_f64 (svorr_x (
+ pg, svand_x (pg, svreinterpret_u64 (x), MantissaMask), HalfExp));
+ svint64_t e
+ = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, iax, 52)), d->exp_bias);
+
+ /* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point
+ for Newton iterations. */
+ svfloat64_t p
+ = sv_pairwise_poly_3_f64_x (pg, m, svmul_x (pg, m, m), d->poly);
+
+ /* Two iterations of Newton's method for iteratively approximating cbrt. */
+ svfloat64_t m_by_3 = svmul_x (pg, m, d->one_third);
+ svfloat64_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p,
+ d->two_thirds);
+ a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, a, a)), a, d->two_thirds);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+ svfloat64_t eb3f = svmul_x (pg, svcvt_f64_x (pg, e), d->one_third);
+ svint64_t ey = svcvt_s64_x (pg, eb3f);
+ svint64_t em3 = svmls_x (pg, e, ey, 3);
+
+ svfloat64_t my = shifted_lookup (pg, d->table, em3);
+ my = svmul_x (pg, my, a);
+
+ /* Vector version of ldexp. */
+ svfloat64_t y = svscale_x (pg, my, ey);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (
+ x, svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)),
+ special);
+
+ /* Copy sign. */
+ return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign));
+}
+
+PL_SIG (SV, D, 1, cbrt, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_D1 (cbrt), 1.30)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cbrt), 0, inf, 1000000)
diff --git a/pl/math/sv_cbrtf_1u7.c b/pl/math/sv_cbrtf_1u7.c
new file mode 100644
index 000000000000..5b625f308827
--- /dev/null
+++ b/pl/math/sv_cbrtf_1u7.c
@@ -0,0 +1,116 @@
+/*
+ * Single-precision SVE cbrt(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f32.h"
+
+const static struct data
+{
+ float32_t poly[4];
+ float32_t table[5];
+ float32_t one_third, two_thirds;
+} data = {
+ /* Very rough approximation of cbrt(x) in [0.5, 1], generated with FPMinimax.
+ */
+ .poly = { 0x1.c14e96p-2, 0x1.dd2d3p-1, -0x1.08e81ap-1,
+ 0x1.2c74c2p-3, },
+ /* table[i] = 2^((i - 2) / 3). */
+ .table = { 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 },
+ .one_third = 0x1.555556p-2f,
+ .two_thirds = 0x1.555556p-1f,
+};
+
+#define SmallestNormal 0x00800000
+#define Thresh 0x7f000000 /* asuint(INFINITY) - SmallestNormal. */
+#define MantissaMask 0x007fffff
+#define HalfExp 0x3f000000
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (cbrtf, x, y, special);
+}
+
+static inline svfloat32_t
+shifted_lookup (const svbool_t pg, const float32_t *table, svint32_t i)
+{
+ return svld1_gather_index (pg, table, svadd_x (pg, i, 2));
+}
+
+/* Approximation for vector single-precision cbrt(x) using Newton iteration
+ with initial guess obtained by a low-order polynomial. Greatest error
+ is 1.64 ULP. This is observed for every value where the mantissa is
+ 0x1.85a2aa and the exponent is a multiple of 3, for example:
+ _ZGVsMxv_cbrtf (0x1.85a2aap+3) got 0x1.267936p+1
+ want 0x1.267932p+1. */
+svfloat32_t SV_NAME_F1 (cbrt) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+
+ /* Subnormal, +/-0 and special values. */
+ svbool_t special = svcmpge (pg, svsub_x (pg, iax, SmallestNormal), Thresh);
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexpf, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ svfloat32_t m = svreinterpret_f32 (svorr_x (
+ pg, svand_x (pg, svreinterpret_u32 (x), MantissaMask), HalfExp));
+ svint32_t e = svsub_x (pg, svreinterpret_s32 (svlsr_x (pg, iax, 23)), 126);
+
+ /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+ the less accurate the next stage of the algorithm needs to be. A degree-3
+ polynomial (4 coefficients) is enough for one Newton iteration. */
+ svfloat32_t p
+ = sv_pairwise_poly_3_f32_x (pg, m, svmul_x (pg, m, m), d->poly);
+
+ /* One iteration of Newton's method for iteratively approximating cbrt. */
+ svfloat32_t m_by_3 = svmul_x (pg, m, d->one_third);
+ svfloat32_t a = svmla_x (pg, svdiv_x (pg, m_by_3, svmul_x (pg, p, p)), p,
+ d->two_thirds);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in the table T. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+ svfloat32_t ef = svmul_x (pg, svcvt_f32_x (pg, e), d->one_third);
+ svint32_t ey = svcvt_s32_x (pg, ef);
+ svint32_t em3 = svmls_x (pg, e, ey, 3);
+
+ svfloat32_t my = shifted_lookup (pg, d->table, em3);
+ my = svmul_x (pg, my, a);
+
+ /* Vector version of ldexpf. */
+ svfloat32_t y = svscale_x (pg, my, ey);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (
+ x, svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)),
+ special);
+
+ /* Copy sign. */
+ return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign));
+}
+
+PL_SIG (SV, F, 1, cbrt, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_F1 (cbrt), 1.15)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cbrt), 0, inf, 1000000)
diff --git a/pl/math/sv_cexpi_3u5.c b/pl/math/sv_cexpi_3u5.c
new file mode 100644
index 000000000000..920acfea5da0
--- /dev/null
+++ b/pl/math/sv_cexpi_3u5.c
@@ -0,0 +1,45 @@
+/*
+ * Double-precision vector cexpi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_sincos_common.h"
+#include "sv_math.h"
+#include "pl_test.h"
+
+static svfloat64x2_t NOINLINE
+special_case (svfloat64_t x, svbool_t special, svfloat64x2_t y)
+{
+ return svcreate2 (sv_call_f64 (sin, x, svget2 (y, 0), special),
+ sv_call_f64 (cos, x, svget2 (y, 1), special));
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate polynomials.
+ Largest observed error is for sin, 3.22 ULP:
+ sv_cexpi_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+ want -0x1.ffe9537d5dbb4p-3. */
+svfloat64x2_t
+_ZGVsMxv_cexpi (svfloat64_t x, svbool_t pg)
+{
+ const struct sv_sincos_data *d = ptr_barrier (&sv_sincos_data);
+ svbool_t special = check_ge_rangeval (pg, x, d);
+
+ svfloat64x2_t sc = sv_sincos_inline (pg, x, d);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, special, sc);
+ return sc;
+}
+
+PL_TEST_ULP (_ZGVsMxv_cexpi_sin, 2.73)
+PL_TEST_ULP (_ZGVsMxv_cexpi_cos, 2.73)
+#define SV_CEXPI_INTERVAL(lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVsMxv_cexpi_sin, lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVsMxv_cexpi_cos, lo, hi, n)
+SV_CEXPI_INTERVAL (0, 0x1p23, 500000)
+SV_CEXPI_INTERVAL (-0, -0x1p23, 500000)
+SV_CEXPI_INTERVAL (0x1p23, inf, 10000)
+SV_CEXPI_INTERVAL (-0x1p23, -inf, 10000)
diff --git a/pl/math/sv_cexpif_1u8.c b/pl/math/sv_cexpif_1u8.c
new file mode 100644
index 000000000000..93f2f998cb38
--- /dev/null
+++ b/pl/math/sv_cexpif_1u8.c
@@ -0,0 +1,47 @@
+/*
+ * Single-precision vector cexpi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_sincosf_common.h"
+#include "sv_math.h"
+#include "pl_test.h"
+
+static svfloat32x2_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, svfloat32x2_t y)
+{
+ return svcreate2 (sv_call_f32 (sinf, x, svget2 (y, 0), special),
+ sv_call_f32 (cosf, x, svget2 (y, 1), special));
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate low-order
+ polynomials.
+ Worst-case error for sin is 1.67 ULP:
+ sv_cexpif_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+ Worst-case error for cos is 1.81 ULP:
+ sv_cexpif_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */
+svfloat32x2_t
+_ZGVsMxv_cexpif (svfloat32_t x, svbool_t pg)
+{
+ const struct sv_sincosf_data *d = ptr_barrier (&sv_sincosf_data);
+ svbool_t special = check_ge_rangeval (pg, x, d);
+
+ svfloat32x2_t sc = sv_sincosf_inline (pg, x, d);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, special, sc);
+ return sc;
+}
+
+PL_TEST_ULP (_ZGVsMxv_cexpif_sin, 1.17)
+PL_TEST_ULP (_ZGVsMxv_cexpif_cos, 1.31)
+#define SV_CEXPIF_INTERVAL(lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVsMxv_cexpif_sin, lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVsMxv_cexpif_cos, lo, hi, n)
+SV_CEXPIF_INTERVAL (0, 0x1p20, 500000)
+SV_CEXPIF_INTERVAL (-0, -0x1p20, 500000)
+SV_CEXPIF_INTERVAL (0x1p20, inf, 10000)
+SV_CEXPIF_INTERVAL (-0x1p20, -inf, 10000)
diff --git a/pl/math/sv_cos_2u5.c b/pl/math/sv_cos_2u5.c
index 194034802452..76af3459b3f2 100644
--- a/pl/math/sv_cos_2u5.c
+++ b/pl/math/sv_cos_2u5.c
@@ -9,76 +9,78 @@
#include "pl_sig.h"
#include "pl_test.h"
-#if SV_SUPPORTED
+static const struct data
+{
+ double inv_pio2, pio2_1, pio2_2, pio2_3, shift;
+} data = {
+ /* Polynomial coefficients are hardwired in FTMAD instructions. */
+ .inv_pio2 = 0x1.45f306dc9c882p-1,
+ .pio2_1 = 0x1.921fb50000000p+0,
+ .pio2_2 = 0x1.110b460000000p-26,
+ .pio2_3 = 0x1.1a62633145c07p-54,
+ /* Original shift used in AdvSIMD cos,
+ plus a contribution to set the bit #0 of q
+ as expected by trigonometric instructions. */
+ .shift = 0x1.8000000000001p52
+};
-#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1))
-#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0))
-#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26))
-#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54))
-/* Original shift used in Neon cos,
- plus a contribution to set the bit #0 of q
- as expected by trigonometric instructions. */
-#define Shift (sv_f64 (0x1.8000000000001p52))
-#define RangeVal (sv_f64 (0x1p23))
-#define AbsMask (0x7fffffffffffffff)
+#define RangeVal 0x4160000000000000 /* asuint64 (0x1p23). */
-static NOINLINE sv_f64_t
-__sv_cos_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp)
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t oob)
{
- return sv_call_f64 (cos, x, y, cmp);
+ return sv_call_f64 (cos, x, y, oob);
}
/* A fast SVE implementation of cos based on trigonometric
instructions (FTMAD, FTSSEL, FTSMUL).
Maximum measured error: 2.108 ULPs.
- __sv_cos(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3
- want -0x1.fddd4c65c7f05p-3. */
-sv_f64_t
-__sv_cos_x (sv_f64_t x, const svbool_t pg)
+ SV_NAME_D1 (cos)(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3
+ want -0x1.fddd4c65c7f05p-3. */
+svfloat64_t SV_NAME_D1 (cos) (svfloat64_t x, const svbool_t pg)
{
- sv_f64_t n, r, r2, y;
- svbool_t cmp;
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t r = svabs_x (pg, x);
+ svbool_t oob = svcmpge (pg, svreinterpret_u64 (r), RangeVal);
- r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask));
- cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal));
+ /* Load some constants in quad-word chunks to minimise memory access. */
+ svbool_t ptrue = svptrue_b64 ();
+ svfloat64_t invpio2_and_pio2_1 = svld1rq (ptrue, &d->inv_pio2);
+ svfloat64_t pio2_23 = svld1rq (ptrue, &d->pio2_2);
/* n = rint(|x|/(pi/2)). */
- sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift);
- n = svsub_f64_x (pg, q, Shift);
+ svfloat64_t q = svmla_lane (sv_f64 (d->shift), r, invpio2_and_pio2_1, 0);
+ svfloat64_t n = svsub_x (pg, q, d->shift);
/* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */
- r = sv_fma_f64_x (pg, NegPio2_1, n, r);
- r = sv_fma_f64_x (pg, NegPio2_2, n, r);
- r = sv_fma_f64_x (pg, NegPio2_3, n, r);
+ r = svmls_lane (r, n, invpio2_and_pio2_1, 1);
+ r = svmls_lane (r, n, pio2_23, 0);
+ r = svmls_lane (r, n, pio2_23, 1);
/* cos(r) poly approx. */
- r2 = svtsmul_f64 (r, sv_as_u64_f64 (q));
- y = sv_f64 (0.0);
- y = svtmad_f64 (y, r2, 7);
- y = svtmad_f64 (y, r2, 6);
- y = svtmad_f64 (y, r2, 5);
- y = svtmad_f64 (y, r2, 4);
- y = svtmad_f64 (y, r2, 3);
- y = svtmad_f64 (y, r2, 2);
- y = svtmad_f64 (y, r2, 1);
- y = svtmad_f64 (y, r2, 0);
+ svfloat64_t r2 = svtsmul (r, svreinterpret_u64 (q));
+ svfloat64_t y = sv_f64 (0.0);
+ y = svtmad (y, r2, 7);
+ y = svtmad (y, r2, 6);
+ y = svtmad (y, r2, 5);
+ y = svtmad (y, r2, 4);
+ y = svtmad (y, r2, 3);
+ y = svtmad (y, r2, 2);
+ y = svtmad (y, r2, 1);
+ y = svtmad (y, r2, 0);
/* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */
- sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q));
- /* Apply factor. */
- y = svmul_f64_x (pg, f, y);
+ svfloat64_t f = svtssel (r, svreinterpret_u64 (q));
- /* No need to pass pg to specialcase here since cmp is a strict subset,
- guaranteed by the cmpge above. */
- if (unlikely (svptest_any (pg, cmp)))
- return __sv_cos_specialcase (x, y, cmp);
- return y;
-}
+ if (unlikely (svptest_any (pg, oob)))
+ return special_case (x, svmul_x (svnot_z (pg, oob), y, f), oob);
-PL_ALIAS (__sv_cos_x, _ZGVsMxv_cos)
+ /* Apply factor. */
+ return svmul_x (pg, f, y);
+}
PL_SIG (SV, D, 1, cos, -3.1, 3.1)
-PL_TEST_ULP (__sv_cos, 1.61)
-PL_TEST_INTERVAL (__sv_cos, 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (__sv_cos, 0x1p-4, 0x1p4, 500000)
-#endif
+PL_TEST_ULP (SV_NAME_D1 (cos), 1.61)
+PL_TEST_INTERVAL (SV_NAME_D1 (cos), 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (SV_NAME_D1 (cos), 0x1p-4, 0x1p4, 500000)
diff --git a/pl/math/sv_cosf_2u1.c b/pl/math/sv_cosf_2u1.c
index 8f138bcba7af..4bdb0dd146bb 100644
--- a/pl/math/sv_cosf_2u1.c
+++ b/pl/math/sv_cosf_2u1.c
@@ -9,74 +9,72 @@
#include "pl_sig.h"
#include "pl_test.h"
-#if SV_SUPPORTED
+static const struct data
+{
+ float neg_pio2_1, neg_pio2_2, neg_pio2_3, inv_pio2, shift;
+} data = {
+ /* Polynomial coefficients are hard-wired in FTMAD instructions. */
+ .neg_pio2_1 = -0x1.921fb6p+0f,
+ .neg_pio2_2 = 0x1.777a5cp-25f,
+ .neg_pio2_3 = 0x1.ee59dap-50f,
+ .inv_pio2 = 0x1.45f306p-1f,
+ /* Original shift used in AdvSIMD cosf,
+ plus a contribution to set the bit #0 of q
+ as expected by trigonometric instructions. */
+ .shift = 0x1.800002p+23f
+};
-#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f))
-#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f))
-#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f))
-#define RangeVal (sv_f32 (0x1p20f))
-#define InvPio2 (sv_f32 (0x1.45f306p-1f))
-/* Original shift used in Neon cosf,
- plus a contribution to set the bit #0 of q
- as expected by trigonometric instructions. */
-#define Shift (sv_f32 (0x1.800002p+23f))
-#define AbsMask (0x7fffffff)
+#define RangeVal 0x49800000 /* asuint32(0x1p20f). */
-static NOINLINE sv_f32_t
-__sv_cosf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t oob)
{
- return sv_call_f32 (cosf, x, y, cmp);
+ return sv_call_f32 (cosf, x, y, oob);
}
/* A fast SVE implementation of cosf based on trigonometric
instructions (FTMAD, FTSSEL, FTSMUL).
Maximum measured error: 2.06 ULPs.
- __sv_cosf(0x1.dea2f2p+19) got 0x1.fffe7ap-6
- want 0x1.fffe76p-6. */
-sv_f32_t
-__sv_cosf_x (sv_f32_t x, const svbool_t pg)
+ SV_NAME_F1 (cos)(0x1.dea2f2p+19) got 0x1.fffe7ap-6
+ want 0x1.fffe76p-6. */
+svfloat32_t SV_NAME_F1 (cos) (svfloat32_t x, const svbool_t pg)
{
- sv_f32_t n, r, r2, y;
- svbool_t cmp;
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat32_t r = svabs_x (pg, x);
+ svbool_t oob = svcmpge (pg, svreinterpret_u32 (r), RangeVal);
- r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask));
- cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal));
+ /* Load some constants in quad-word chunks to minimise memory access. */
+ svfloat32_t negpio2_and_invpio2 = svld1rq (svptrue_b32 (), &d->neg_pio2_1);
/* n = rint(|x|/(pi/2)). */
- sv_f32_t q = sv_fma_f32_x (pg, InvPio2, r, Shift);
- n = svsub_f32_x (pg, q, Shift);
+ svfloat32_t q = svmla_lane (sv_f32 (d->shift), r, negpio2_and_invpio2, 3);
+ svfloat32_t n = svsub_x (pg, q, d->shift);
/* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */
- r = sv_fma_f32_x (pg, NegPio2_1, n, r);
- r = sv_fma_f32_x (pg, NegPio2_2, n, r);
- r = sv_fma_f32_x (pg, NegPio2_3, n, r);
+ r = svmla_lane (r, n, negpio2_and_invpio2, 0);
+ r = svmla_lane (r, n, negpio2_and_invpio2, 1);
+ r = svmla_lane (r, n, negpio2_and_invpio2, 2);
/* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */
- sv_f32_t f = svtssel_f32 (r, sv_as_u32_f32 (q));
+ svfloat32_t f = svtssel (r, svreinterpret_u32 (q));
/* cos(r) poly approx. */
- r2 = svtsmul_f32 (r, sv_as_u32_f32 (q));
- y = sv_f32 (0.0f);
- y = svtmad_f32 (y, r2, 4);
- y = svtmad_f32 (y, r2, 3);
- y = svtmad_f32 (y, r2, 2);
- y = svtmad_f32 (y, r2, 1);
- y = svtmad_f32 (y, r2, 0);
+ svfloat32_t r2 = svtsmul (r, svreinterpret_u32 (q));
+ svfloat32_t y = sv_f32 (0.0f);
+ y = svtmad (y, r2, 4);
+ y = svtmad (y, r2, 3);
+ y = svtmad (y, r2, 2);
+ y = svtmad (y, r2, 1);
+ y = svtmad (y, r2, 0);
+ if (unlikely (svptest_any (pg, oob)))
+ return special_case (x, svmul_x (svnot_z (pg, oob), f, y), oob);
/* Apply factor. */
- y = svmul_f32_x (pg, f, y);
-
- /* No need to pass pg to specialcase here since cmp is a strict subset,
- guaranteed by the cmpge above. */
- if (unlikely (svptest_any (pg, cmp)))
- return __sv_cosf_specialcase (x, y, cmp);
- return y;
+ return svmul_x (pg, f, y);
}
-PL_ALIAS (__sv_cosf_x, _ZGVsMxv_cosf)
-
PL_SIG (SV, F, 1, cos, -3.1, 3.1)
-PL_TEST_ULP (__sv_cosf, 1.57)
-PL_TEST_INTERVAL (__sv_cosf, 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (__sv_cosf, 0x1p-4, 0x1p4, 500000)
-#endif
+PL_TEST_ULP (SV_NAME_F1 (cos), 1.57)
+PL_TEST_INTERVAL (SV_NAME_F1 (cos), 0, 0xffff0000, 10000)
+PL_TEST_INTERVAL (SV_NAME_F1 (cos), 0x1p-4, 0x1p4, 500000)
diff --git a/pl/math/sv_cosh_2u.c b/pl/math/sv_cosh_2u.c
new file mode 100644
index 000000000000..a6d743fb9b96
--- /dev/null
+++ b/pl/math/sv_cosh_2u.c
@@ -0,0 +1,100 @@
+/*
+ * Double-precision SVE cosh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float64_t poly[3];
+ float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
+ uint64_t index_mask, special_bound;
+} data = {
+ .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
+ 0x1.5555576a59599p-5, },
+
+ .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */
+ /* -ln2/N. */
+ .ln2_hi = -0x1.62e42fefa39efp-9,
+ .ln2_lo = -0x1.abc9e3b39803f3p-64,
+ .shift = 0x1.8p+52,
+ .thres = 704.0,
+
+ .index_mask = 0xff,
+ /* 0x1.6p9, above which exp overflows. */
+ .special_bound = 0x4086000000000000,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (cosh, x, y, special);
+}
+
+/* Helper for approximating exp(x). Copied from sv_exp_tail, with no
+ special-case handling or tail. */
+static inline svfloat64_t
+exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+{
+ /* Calculate exp(x). */
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi);
+ r = svmla_x (pg, r, n, d->ln2_lo);
+
+ svuint64_t u = svreinterpret_u64 (z);
+ svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
+ svuint64_t i = svand_x (pg, u, d->index_mask);
+
+ svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
+ y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
+ y = svmla_x (pg, sv_f64 (1.0), r, y);
+ y = svmul_x (pg, r, y);
+
+ /* s = 2^(n/N). */
+ u = svld1_gather_index (pg, __v_exp_tail_data, i);
+ svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e));
+
+ return svmla_x (pg, s, s, y);
+}
+
+/* Approximation for SVE double-precision cosh(x) using exp_inline.
+ cosh(x) = (exp(x) + exp(-x)) / 2.
+ The greatest observed error is in the scalar fall-back region, so is the
+ same as the scalar routine, 1.93 ULP:
+ _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021
+ want 0x1.fd774e958236fp+1021.
+
+ The greatest observed error in the non-special region is 1.54 ULP:
+ _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8
+ want 0x1.f5e2bb8d5c991p+8. */
+svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);
+
+ /* Up to the point that exp overflows, we can use it to calculate cosh by
+ exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
+ svfloat64_t t = exp_inline (ax, pg, d);
+ svfloat64_t half_t = svmul_x (pg, t, 0.5);
+ svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
+
+ /* Fall back to scalar for any special cases. */
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+
+ return svadd_x (pg, half_t, half_over_t);
+}
+
+PL_SIG (SV, D, 1, cosh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_D1 (cosh), 1.43)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cosh), 0x1.6p9, inf, 1000)
diff --git a/pl/math/sv_coshf_2u.c b/pl/math/sv_coshf_2u.c
new file mode 100644
index 000000000000..81680fef318e
--- /dev/null
+++ b/pl/math/sv_coshf_2u.c
@@ -0,0 +1,56 @@
+/*
+ * Single-precision SVE cosh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#include "sv_expf_inline.h"
+
+static const struct data
+{
+ struct sv_expf_data expf_consts;
+ uint32_t special_bound;
+} data = {
+ .expf_consts = SV_EXPF_DATA,
+ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .special_bound = 0x42ad496c,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+{
+ return sv_call_f32 (coshf, x, y, pg);
+}
+
+/* Single-precision vector cosh, using vector expf.
+ Maximum error is 1.89 ULP:
+ _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127
+ want 0x1.f00adcp+127. */
+svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat32_t ax = svabs_x (pg, x);
+ svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound);
+
+ /* Calculate cosh by exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
+ svfloat32_t t = expf_inline (ax, pg, &d->expf_consts);
+ svfloat32_t half_t = svmul_x (pg, t, 0.5);
+ svfloat32_t half_over_t = svdivr_x (pg, t, 0.5);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+
+ return svadd_x (pg, half_t, half_over_t);
+}
+
+PL_SIG (SV, F, 1, cosh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_F1 (cosh), 1.39)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1p-63, 100)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)
diff --git a/pl/math/sv_cospi_3u2.c b/pl/math/sv_cospi_3u2.c
new file mode 100644
index 000000000000..d80f899c41e4
--- /dev/null
+++ b/pl/math/sv_cospi_3u2.c
@@ -0,0 +1,63 @@
+/*
+ * Double-precision SVE cospi(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+ double poly[10];
+ double range_val;
+} data = {
+ /* Polynomial coefficients generated using Remez algorithm,
+ see sinpi.sollya for details. */
+ .poly = { 0x1.921fb54442d184p1, -0x1.4abbce625be53p2, 0x1.466bc6775ab16p1,
+ -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
+ 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16,
+ 0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 },
+ .range_val = 0x1p53,
+};
+
+/* A fast SVE implementation of cospi.
+ Maximum error 3.20 ULP:
+ _ZGVsMxv_cospi(0x1.f18ba32c63159p-6) got 0x1.fdabf595f9763p-1
+ want 0x1.fdabf595f9766p-1. */
+svfloat64_t SV_NAME_D1 (cospi) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Using cospi(x) = sinpi(0.5 - x)
+ range reduction and offset into sinpi range -1/2 .. 1/2
+ r = 0.5 - |x - rint(x)|. */
+ svfloat64_t n = svrinta_x (pg, x);
+ svfloat64_t r = svsub_x (pg, x, n);
+ r = svsub_x (pg, sv_f64 (0.5), svabs_x (pg, r));
+
+ /* Result should be negated based on if n is odd or not.
+ If ax >= 2^53, the result will always be positive. */
+ svbool_t cmp = svaclt (pg, x, d->range_val);
+ svuint64_t intn = svreinterpret_u64 (svcvt_s64_z (pg, n));
+ svuint64_t sign = svlsl_z (cmp, intn, 63);
+
+ /* y = sinpi(r). */
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t r4 = svmul_x (pg, r2, r2);
+ svfloat64_t y = sv_pw_horner_9_f64_x (pg, r2, r4, d->poly);
+ y = svmul_x (pg, y, r);
+
+ return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
+}
+
+PL_SIG (SV, D, 1, cospi, -0.9, 0.9)
+PL_TEST_ULP (SV_NAME_D1 (cospi), 2.71)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0, 0x1p-63, 5000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (cospi), 0x1p51, inf, 100000)
diff --git a/pl/math/sv_cospif_2u6.c b/pl/math/sv_cospif_2u6.c
new file mode 100644
index 000000000000..fb2922d0533a
--- /dev/null
+++ b/pl/math/sv_cospif_2u6.c
@@ -0,0 +1,59 @@
+/*
+ * Single-precision SVE cospi(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f32.h"
+
+static const struct data
+{
+ float poly[6];
+ float range_val;
+} data = {
+ /* Taylor series coefficients for sin(pi * x). */
+ .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f,
+ 0x1.50783p-4f, -0x1.e30750p-8f },
+ .range_val = 0x1p31f,
+};
+
+/* A fast SVE implementation of cospif.
+ Maximum error: 2.60 ULP:
+ _ZGVsMxv_cospif(+/-0x1.cae664p-4) got 0x1.e09c9ep-1
+ want 0x1.e09c98p-1. */
+svfloat32_t SV_NAME_F1 (cospi) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Using cospi(x) = sinpi(0.5 - x)
+ range reduction and offset into sinpi range -1/2 .. 1/2
+ r = 0.5 - |x - rint(x)|. */
+ svfloat32_t n = svrinta_x (pg, x);
+ svfloat32_t r = svsub_x (pg, x, n);
+ r = svsub_x (pg, sv_f32 (0.5f), svabs_x (pg, r));
+
+ /* Result should be negated based on if n is odd or not.
+ If ax >= 2^31, the result will always be positive. */
+ svbool_t cmp = svaclt (pg, x, d->range_val);
+ svuint32_t intn = svreinterpret_u32 (svcvt_s32_x (pg, n));
+ svuint32_t sign = svlsl_z (cmp, intn, 31);
+
+ /* y = sinpi(r). */
+ svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t y = sv_horner_5_f32_x (pg, r2, d->poly);
+ y = svmul_x (pg, y, r);
+
+ return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
+}
+
+PL_SIG (SV, F, 1, cospi, -0.9, 0.9)
+PL_TEST_ULP (SV_NAME_F1 (cospi), 2.08)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0, 0x1p-31, 5000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0.5, 0x1p31f, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (cospi), 0x1p31f, inf, 10000)
diff --git a/pl/math/sv_erf_2u5.c b/pl/math/sv_erf_2u5.c
new file mode 100644
index 000000000000..cbf9718e5bb0
--- /dev/null
+++ b/pl/math/sv_erf_2u5.c
@@ -0,0 +1,111 @@
+/*
+ * Double-precision vector erf(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ double third;
+ double tenth, two_over_five, two_over_fifteen;
+ double two_over_nine, two_over_fortyfive;
+ double max, shift;
+} data = {
+ .third = 0x1.5555555555556p-2, /* used to compute 2/3 and 1/6 too. */
+ .two_over_fifteen = 0x1.1111111111111p-3,
+ .tenth = -0x1.999999999999ap-4,
+ .two_over_five = -0x1.999999999999ap-2,
+ .two_over_nine = -0x1.c71c71c71c71cp-3,
+ .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
+ .max = 5.9921875, /* 6 - 1/128. */
+ .shift = 0x1p45,
+};
+
+#define SignMask (0x8000000000000000)
+
+/* Double-precision implementation of vector erf(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+ erf(x) ~ erf(r) + scale * d * [
+ + 1
+ - r d
+ + 1/3 (2 r^2 - 1) d^2
+ - 1/6 (r (2 r^2 - 3)) d^3
+ + 1/30 (4 r^4 - 12 r^2 + 3) d^4
+ - 1/90 (r (4 r^4 - 20 r^2 + 15)) d^5
+ ]
+
+ Maximum measured error: 2.29 ULP
+ _ZGVsMxv_erf(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8
+ want -0x1.20dd59132ebafp-8. */
+svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+ /* |x| >= 6.0 - 1/128. Opposite conditions except none of them catch NaNs so
+ they can be used in lookup and BSLs to yield the expected results. */
+ svbool_t a_ge_max = svacge (pg, x, dat->max);
+ svbool_t a_lt_max = svaclt (pg, x, dat->max);
+
+ /* Set r to multiple of 1/128 nearest to |x|. */
+ svfloat64_t a = svabs_x (pg, x);
+ svfloat64_t shift = sv_f64 (dat->shift);
+ svfloat64_t z = svadd_x (pg, a, shift);
+ svuint64_t i
+ = svsub_x (pg, svreinterpret_u64 (z), svreinterpret_u64 (shift));
+
+ /* Lookup without shortcut for small values but with predicate to avoid
+ segfault for large values and NaNs. */
+ svfloat64_t r = svsub_x (pg, z, shift);
+ svfloat64_t erfr = svld1_gather_index (a_lt_max, __sv_erf_data.erf, i);
+ svfloat64_t scale = svld1_gather_index (a_lt_max, __sv_erf_data.scale, i);
+
+ /* erf(x) ~ erf(r) + scale * d * poly (r, d). */
+ svfloat64_t d = svsub_x (pg, a, r);
+ svfloat64_t d2 = svmul_x (pg, d, d);
+ svfloat64_t r2 = svmul_x (pg, r, r);
+
+ /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
+ svfloat64_t p1 = r;
+ svfloat64_t third = sv_f64 (dat->third);
+ svfloat64_t twothird = svmul_x (pg, third, 2.0);
+ svfloat64_t sixth = svmul_x (pg, third, 0.5);
+ svfloat64_t p2 = svmls_x (pg, third, r2, twothird);
+ svfloat64_t p3 = svmad_x (pg, r2, third, -0.5);
+ p3 = svmul_x (pg, r, p3);
+ svfloat64_t p4
+ = svmla_x (pg, sv_f64 (dat->two_over_five), r2, dat->two_over_fifteen);
+ p4 = svmls_x (pg, sv_f64 (dat->tenth), r2, p4);
+ svfloat64_t p5
+ = svmla_x (pg, sv_f64 (dat->two_over_nine), r2, dat->two_over_fortyfive);
+ p5 = svmla_x (pg, sixth, r2, p5);
+ p5 = svmul_x (pg, r, p5);
+
+ svfloat64_t p34 = svmla_x (pg, p3, d, p4);
+ svfloat64_t p12 = svmla_x (pg, p1, d, p2);
+ svfloat64_t y = svmla_x (pg, p34, d2, p5);
+ y = svmla_x (pg, p12, d2, y);
+
+ y = svmla_x (pg, erfr, scale, svmls_x (pg, d, d2, y));
+
+ /* Solves the |x| = inf and NaN cases. */
+ y = svsel (a_ge_max, sv_f64 (1.0), y);
+
+ /* Copy sign. */
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t iy = svreinterpret_u64 (y);
+ svuint64_t sign = svand_x (pg, ix, SignMask);
+ return svreinterpret_f64 (svorr_x (pg, sign, iy));
+}
+
+PL_SIG (SV, D, 1, erf, -6.0, 6.0)
+PL_TEST_ULP (SV_NAME_D1 (erf), 1.79)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, 5.9921875, 40000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 5.9921875, inf, 40000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erf), 0, inf, 4000)
diff --git a/pl/math/sv_erf_3u.c b/pl/math/sv_erf_3u.c
deleted file mode 100644
index bec7f8a819d2..000000000000
--- a/pl/math/sv_erf_3u.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Double-precision SVE erf(x) function.
- *
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#if SV_SUPPORTED
-
-#define Scale (8.0)
-#define AbsMask (0x7fffffffffffffff)
-
-static NOINLINE sv_f64_t
-__sv_erf_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp)
-{
- return sv_call_f64 (erf, x, y, cmp);
-}
-
-/* Optimized double precision SVE error function erf.
- Maximum observed error is 2.62 ULP:
- __sv_erf(0x1.79cab7e3078fap+2) got 0x1.0000000000001p+0
- want 0x1.fffffffffffffp-1. */
-sv_f64_t
-__sv_erf_x (sv_f64_t x, const svbool_t pg)
-{
- /* Use top 16 bits to test for special cases and small values. */
- sv_u64_t ix = sv_as_u64_f64 (x);
- sv_u64_t atop = svand_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 48), 0x7fff);
-
- /* Handle both inf/nan as well as small values (|x|<2^-28). */
- svbool_t cmp
- = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3e30), 0x7ff0 - 0x3e30);
-
- /* Get sign and absolute value. */
- sv_f64_t a = sv_as_f64_u64 (svand_n_u64_x (pg, ix, AbsMask));
- sv_u64_t sign = svand_n_u64_x (pg, ix, ~AbsMask);
-
- /* i = trunc(Scale*x). */
- sv_f64_t a_scale = svmul_n_f64_x (pg, a, Scale);
- /* Saturate index of intervals. */
- svbool_t a_lt_6 = svcmplt_n_u64 (pg, atop, 0x4018);
- sv_u64_t i = svcvt_u64_f64_m (sv_u64 (V_ERF_NINTS - 1), a_lt_6, a_scale);
-
- /* Load polynomial coefficients. */
- sv_f64_t P_0 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[0], i);
- sv_f64_t P_1 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[1], i);
- sv_f64_t P_2 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[2], i);
- sv_f64_t P_3 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[3], i);
- sv_f64_t P_4 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[4], i);
- sv_f64_t P_5 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[5], i);
- sv_f64_t P_6 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[6], i);
- sv_f64_t P_7 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[7], i);
- sv_f64_t P_8 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[8], i);
- sv_f64_t P_9 = sv_lookup_f64_x (pg, __v_erf_data.coeffs[9], i);
-
- /* Get shift and scale. */
- sv_f64_t shift = sv_lookup_f64_x (pg, __v_erf_data.shifts, i);
-
- /* Transform polynomial variable.
- Set z = 0 in the boring domain to avoid overflow. */
- sv_f64_t z = svmla_f64_m (a_lt_6, shift, sv_f64 (Scale), a);
-
- /* Evaluate polynomial P(z) using level-2 Estrin. */
- sv_f64_t r1 = sv_fma_f64_x (pg, z, P_1, P_0);
- sv_f64_t r2 = sv_fma_f64_x (pg, z, P_3, P_2);
- sv_f64_t r3 = sv_fma_f64_x (pg, z, P_5, P_4);
- sv_f64_t r4 = sv_fma_f64_x (pg, z, P_7, P_6);
- sv_f64_t r5 = sv_fma_f64_x (pg, z, P_9, P_8);
-
- sv_f64_t z2 = svmul_f64_x (pg, z, z);
- sv_f64_t z4 = svmul_f64_x (pg, z2, z2);
-
- sv_f64_t q2 = sv_fma_f64_x (pg, r4, z2, r3);
- sv_f64_t q1 = sv_fma_f64_x (pg, r2, z2, r1);
-
- sv_f64_t y = sv_fma_f64_x (pg, z4, r5, q2);
- y = sv_fma_f64_x (pg, z4, y, q1);
-
- /* y = erf(x) if x > 0, -erf(-x) otherwise. */
- y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign));
-
- if (unlikely (svptest_any (pg, cmp)))
- return __sv_erf_specialcase (x, y, cmp);
- return y;
-}
-
-PL_ALIAS (__sv_erf_x, _ZGVsMxv_erf)
-
-PL_SIG (SV, D, 1, erf, -4.0, 4.0)
-PL_TEST_ULP (__sv_erf, 2.13)
-PL_TEST_INTERVAL (__sv_erf, 0, 0x1p-28, 20000)
-PL_TEST_INTERVAL (__sv_erf, 0x1p-28, 1, 60000)
-PL_TEST_INTERVAL (__sv_erf, 1, 0x1p28, 60000)
-PL_TEST_INTERVAL (__sv_erf, 0x1p28, inf, 20000)
-PL_TEST_INTERVAL (__sv_erf, -0, -0x1p-28, 20000)
-PL_TEST_INTERVAL (__sv_erf, -0x1p-28, -1, 60000)
-PL_TEST_INTERVAL (__sv_erf, -1, -0x1p28, 60000)
-PL_TEST_INTERVAL (__sv_erf, -0x1p28, -inf, 20000)
-#endif
diff --git a/pl/math/sv_erf_data.c b/pl/math/sv_erf_data.c
new file mode 100644
index 000000000000..7244aceda5a5
--- /dev/null
+++ b/pl/math/sv_erf_data.c
@@ -0,0 +1,1558 @@
+/*
+ * Data for approximation of erf.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Lookup table used in vector erf.
+ For each possible rounded input r (multiples of 1/128), between
+ r = 0.0 and r = 6.0 (769 values):
+ - the first entry __sv_erf_data.erf contains the values of erf(r),
+ - the second entry __sv_erf_data.scale contains the values of
+ 2/sqrt(pi)*exp(-r^2). SVE erf performs the lookup for every |x| below
+ its bound (no small-value shortcut), so indices 0 and 1 are hit too. */
+const struct sv_erf_data __sv_erf_data = {
+ .erf = { 0x0.0000000000000p+0,
+ 0x1.20dbf3deb1340p-7,
+ 0x1.20d77083f17a0p-6,
+ 0x1.b137e0cf584dcp-6,
+ 0x1.20c5645dd2538p-5,
+ 0x1.68e5d3bbc9526p-5,
+ 0x1.b0fafef135745p-5,
+ 0x1.f902a77bd3821p-5,
+ 0x1.207d480e90658p-4,
+ 0x1.44703e87e8593p-4,
+ 0x1.68591a1e83b5dp-4,
+ 0x1.8c36beb8a8d23p-4,
+ 0x1.b0081148a873ap-4,
+ 0x1.d3cbf7e70a4b3p-4,
+ 0x1.f78159ec8bb50p-4,
+ 0x1.0d939005f65e5p-3,
+ 0x1.1f5e1a35c3b89p-3,
+ 0x1.311fc15f56d14p-3,
+ 0x1.42d7fc2f64959p-3,
+ 0x1.548642321d7c6p-3,
+ 0x1.662a0bdf7a89fp-3,
+ 0x1.77c2d2a765f9ep-3,
+ 0x1.895010fdbdbfdp-3,
+ 0x1.9ad142662e14dp-3,
+ 0x1.ac45e37fe2526p-3,
+ 0x1.bdad72110a648p-3,
+ 0x1.cf076d1233237p-3,
+ 0x1.e05354b96ff36p-3,
+ 0x1.f190aa85540e2p-3,
+ 0x1.015f78a3dcf3dp-2,
+ 0x1.09eed6982b948p-2,
+ 0x1.127631eb8de32p-2,
+ 0x1.1af54e232d609p-2,
+ 0x1.236bef825d9a2p-2,
+ 0x1.2bd9db0f7827fp-2,
+ 0x1.343ed6989b7d9p-2,
+ 0x1.3c9aa8b84bedap-2,
+ 0x1.44ed18d9f6462p-2,
+ 0x1.4d35ef3e5372ep-2,
+ 0x1.5574f4ffac98ep-2,
+ 0x1.5da9f415ff23fp-2,
+ 0x1.65d4b75b00471p-2,
+ 0x1.6df50a8dff772p-2,
+ 0x1.760aba57a76bfp-2,
+ 0x1.7e15944d9d3e4p-2,
+ 0x1.861566f5fd3c0p-2,
+ 0x1.8e0a01cab516bp-2,
+ 0x1.95f3353cbb146p-2,
+ 0x1.9dd0d2b721f39p-2,
+ 0x1.a5a2aca209394p-2,
+ 0x1.ad68966569a87p-2,
+ 0x1.b522646bbda68p-2,
+ 0x1.bccfec24855b8p-2,
+ 0x1.c4710406a65fcp-2,
+ 0x1.cc058392a6d2dp-2,
+ 0x1.d38d4354c3bd0p-2,
+ 0x1.db081ce6e2a48p-2,
+ 0x1.e275eaf25e458p-2,
+ 0x1.e9d68931ae650p-2,
+ 0x1.f129d471eabb1p-2,
+ 0x1.f86faa9428f9dp-2,
+ 0x1.ffa7ea8eb5fd0p-2,
+ 0x1.03693a371519cp-1,
+ 0x1.06f794ab2cae7p-1,
+ 0x1.0a7ef5c18edd2p-1,
+ 0x1.0dff4f247f6c6p-1,
+ 0x1.1178930ada115p-1,
+ 0x1.14eab43841b55p-1,
+ 0x1.1855a5fd3dd50p-1,
+ 0x1.1bb95c3746199p-1,
+ 0x1.1f15cb50bc4dep-1,
+ 0x1.226ae840d4d70p-1,
+ 0x1.25b8a88b6dd7fp-1,
+ 0x1.28ff0240d52cdp-1,
+ 0x1.2c3debfd7d6c1p-1,
+ 0x1.2f755ce9a21f4p-1,
+ 0x1.32a54cb8db67bp-1,
+ 0x1.35cdb3a9a144dp-1,
+ 0x1.38ee8a84beb71p-1,
+ 0x1.3c07ca9cb4f9ep-1,
+ 0x1.3f196dcd0f135p-1,
+ 0x1.42236e79a5fa6p-1,
+ 0x1.4525c78dd5966p-1,
+ 0x1.4820747ba2dc2p-1,
+ 0x1.4b13713ad3513p-1,
+ 0x1.4dfeba47f63ccp-1,
+ 0x1.50e24ca35fd2cp-1,
+ 0x1.53be25d016a4fp-1,
+ 0x1.569243d2b3a9bp-1,
+ 0x1.595ea53035283p-1,
+ 0x1.5c2348ecc4dc3p-1,
+ 0x1.5ee02e8a71a53p-1,
+ 0x1.61955607dd15dp-1,
+ 0x1.6442bfdedd397p-1,
+ 0x1.66e86d0312e82p-1,
+ 0x1.69865ee075011p-1,
+ 0x1.6c1c9759d0e5fp-1,
+ 0x1.6eab18c74091bp-1,
+ 0x1.7131e5f496a5ap-1,
+ 0x1.73b1021fc0cb8p-1,
+ 0x1.762870f720c6fp-1,
+ 0x1.78983697dc96fp-1,
+ 0x1.7b00578c26037p-1,
+ 0x1.7d60d8c979f7bp-1,
+ 0x1.7fb9bfaed8078p-1,
+ 0x1.820b1202f27fbp-1,
+ 0x1.8454d5f25760dp-1,
+ 0x1.8697120d92a4ap-1,
+ 0x1.88d1cd474a2e0p-1,
+ 0x1.8b050ef253c37p-1,
+ 0x1.8d30debfc572ep-1,
+ 0x1.8f5544bd00c04p-1,
+ 0x1.91724951b8fc6p-1,
+ 0x1.9387f53df5238p-1,
+ 0x1.959651980da31p-1,
+ 0x1.979d67caa6631p-1,
+ 0x1.999d4192a5715p-1,
+ 0x1.9b95e8fd26abap-1,
+ 0x1.9d8768656cc42p-1,
+ 0x1.9f71ca72cffb6p-1,
+ 0x1.a1551a16aaeafp-1,
+ 0x1.a331628a45b92p-1,
+ 0x1.a506af4cc00f4p-1,
+ 0x1.a6d50c20fa293p-1,
+ 0x1.a89c850b7d54dp-1,
+ 0x1.aa5d265064366p-1,
+ 0x1.ac16fc7143263p-1,
+ 0x1.adca142b10f98p-1,
+ 0x1.af767a741088bp-1,
+ 0x1.b11c3c79bb424p-1,
+ 0x1.b2bb679ead19cp-1,
+ 0x1.b4540978921eep-1,
+ 0x1.b5e62fce16095p-1,
+ 0x1.b771e894d602ep-1,
+ 0x1.b8f741ef54f83p-1,
+ 0x1.ba764a2af2b78p-1,
+ 0x1.bbef0fbde6221p-1,
+ 0x1.bd61a1453ab44p-1,
+ 0x1.bece0d82d1a5cp-1,
+ 0x1.c034635b66e23p-1,
+ 0x1.c194b1d49a184p-1,
+ 0x1.c2ef0812fc1bdp-1,
+ 0x1.c443755820d64p-1,
+ 0x1.c5920900b5fd1p-1,
+ 0x1.c6dad2829ec62p-1,
+ 0x1.c81de16b14cefp-1,
+ 0x1.c95b455cce69dp-1,
+ 0x1.ca930e0e2a825p-1,
+ 0x1.cbc54b476248dp-1,
+ 0x1.ccf20ce0c0d27p-1,
+ 0x1.ce1962c0e0d8bp-1,
+ 0x1.cf3b5cdaf0c39p-1,
+ 0x1.d0580b2cfd249p-1,
+ 0x1.d16f7dbe41ca0p-1,
+ 0x1.d281c49d818d0p-1,
+ 0x1.d38eefdf64fddp-1,
+ 0x1.d4970f9ce00d9p-1,
+ 0x1.d59a33f19ed42p-1,
+ 0x1.d6986cfa798e7p-1,
+ 0x1.d791cad3eff01p-1,
+ 0x1.d8865d98abe01p-1,
+ 0x1.d97635600bb89p-1,
+ 0x1.da61623cb41e0p-1,
+ 0x1.db47f43b2980dp-1,
+ 0x1.dc29fb60715afp-1,
+ 0x1.dd0787a8bb39dp-1,
+ 0x1.dde0a90611a0dp-1,
+ 0x1.deb56f5f12d28p-1,
+ 0x1.df85ea8db188ep-1,
+ 0x1.e0522a5dfda73p-1,
+ 0x1.e11a3e8cf4eb8p-1,
+ 0x1.e1de36c75ba58p-1,
+ 0x1.e29e22a89d766p-1,
+ 0x1.e35a11b9b61cep-1,
+ 0x1.e4121370224ccp-1,
+ 0x1.e4c6372cd8927p-1,
+ 0x1.e5768c3b4a3fcp-1,
+ 0x1.e62321d06c5e0p-1,
+ 0x1.e6cc0709c8a0dp-1,
+ 0x1.e7714aec96534p-1,
+ 0x1.e812fc64db369p-1,
+ 0x1.e8b12a44944a8p-1,
+ 0x1.e94be342e6743p-1,
+ 0x1.e9e335fb56f87p-1,
+ 0x1.ea7730ed0bbb9p-1,
+ 0x1.eb07e27a133aap-1,
+ 0x1.eb9558e6b42cep-1,
+ 0x1.ec1fa258c4beap-1,
+ 0x1.eca6ccd709544p-1,
+ 0x1.ed2ae6489ac1ep-1,
+ 0x1.edabfc7453e63p-1,
+ 0x1.ee2a1d004692cp-1,
+ 0x1.eea5557137ae0p-1,
+ 0x1.ef1db32a2277cp-1,
+ 0x1.ef93436bc2daap-1,
+ 0x1.f006135426b26p-1,
+ 0x1.f0762fde45ee6p-1,
+ 0x1.f0e3a5e1a1788p-1,
+ 0x1.f14e8211e8c55p-1,
+ 0x1.f1b6d0fea5f4dp-1,
+ 0x1.f21c9f12f0677p-1,
+ 0x1.f27ff89525acfp-1,
+ 0x1.f2e0e9a6a8b09p-1,
+ 0x1.f33f7e43a706bp-1,
+ 0x1.f39bc242e43e6p-1,
+ 0x1.f3f5c1558b19ep-1,
+ 0x1.f44d870704911p-1,
+ 0x1.f4a31ebcd47dfp-1,
+ 0x1.f4f693b67bd77p-1,
+ 0x1.f547f10d60597p-1,
+ 0x1.f59741b4b97cfp-1,
+ 0x1.f5e4907982a07p-1,
+ 0x1.f62fe80272419p-1,
+ 0x1.f67952cff6282p-1,
+ 0x1.f6c0db3c34641p-1,
+ 0x1.f7068b7b10fd9p-1,
+ 0x1.f74a6d9a38383p-1,
+ 0x1.f78c8b812d498p-1,
+ 0x1.f7cceef15d631p-1,
+ 0x1.f80ba18636f07p-1,
+ 0x1.f848acb544e95p-1,
+ 0x1.f88419ce4e184p-1,
+ 0x1.f8bdf1fb78370p-1,
+ 0x1.f8f63e416ebffp-1,
+ 0x1.f92d077f8d56dp-1,
+ 0x1.f96256700da8ep-1,
+ 0x1.f99633a838a57p-1,
+ 0x1.f9c8a7989af0dp-1,
+ 0x1.f9f9ba8d3c733p-1,
+ 0x1.fa2974addae45p-1,
+ 0x1.fa57ddfe27376p-1,
+ 0x1.fa84fe5e05c8dp-1,
+ 0x1.fab0dd89d1309p-1,
+ 0x1.fadb831a9f9c3p-1,
+ 0x1.fb04f6868a944p-1,
+ 0x1.fb2d3f20f9101p-1,
+ 0x1.fb54641aebbc9p-1,
+ 0x1.fb7a6c834b5a2p-1,
+ 0x1.fb9f5f4739170p-1,
+ 0x1.fbc3433260ca5p-1,
+ 0x1.fbe61eef4cf6ap-1,
+ 0x1.fc07f907bc794p-1,
+ 0x1.fc28d7e4f9cd0p-1,
+ 0x1.fc48c1d033c7ap-1,
+ 0x1.fc67bcf2d7b8fp-1,
+ 0x1.fc85cf56ecd38p-1,
+ 0x1.fca2fee770c79p-1,
+ 0x1.fcbf5170b578bp-1,
+ 0x1.fcdacca0bfb73p-1,
+ 0x1.fcf57607a6e7cp-1,
+ 0x1.fd0f5317f582fp-1,
+ 0x1.fd2869270a56fp-1,
+ 0x1.fd40bd6d7a785p-1,
+ 0x1.fd58550773cb5p-1,
+ 0x1.fd6f34f52013ap-1,
+ 0x1.fd85621b0876dp-1,
+ 0x1.fd9ae142795e3p-1,
+ 0x1.fdafb719e6a69p-1,
+ 0x1.fdc3e835500b3p-1,
+ 0x1.fdd7790ea5bc0p-1,
+ 0x1.fdea6e062d0c9p-1,
+ 0x1.fdfccb62e52d3p-1,
+ 0x1.fe0e9552ebdd6p-1,
+ 0x1.fe1fcfebe2083p-1,
+ 0x1.fe307f2b503d0p-1,
+ 0x1.fe40a6f70af4bp-1,
+ 0x1.fe504b1d9696cp-1,
+ 0x1.fe5f6f568b301p-1,
+ 0x1.fe6e1742f7cf6p-1,
+ 0x1.fe7c466dc57a1p-1,
+ 0x1.fe8a004c19ae6p-1,
+ 0x1.fe97483db8670p-1,
+ 0x1.fea4218d6594ap-1,
+ 0x1.feb08f7146046p-1,
+ 0x1.febc950b3fa75p-1,
+ 0x1.fec835695932ep-1,
+ 0x1.fed37386190fbp-1,
+ 0x1.fede5248e38f4p-1,
+ 0x1.fee8d486585eep-1,
+ 0x1.fef2fd00af31ap-1,
+ 0x1.fefcce6813974p-1,
+ 0x1.ff064b5afffbep-1,
+ 0x1.ff0f766697c76p-1,
+ 0x1.ff18520700971p-1,
+ 0x1.ff20e0a7ba8c2p-1,
+ 0x1.ff2924a3f7a83p-1,
+ 0x1.ff312046f2339p-1,
+ 0x1.ff38d5cc4227fp-1,
+ 0x1.ff404760319b4p-1,
+ 0x1.ff47772010262p-1,
+ 0x1.ff4e671a85425p-1,
+ 0x1.ff55194fe19dfp-1,
+ 0x1.ff5b8fb26f5f6p-1,
+ 0x1.ff61cc26c1578p-1,
+ 0x1.ff67d08401202p-1,
+ 0x1.ff6d9e943c231p-1,
+ 0x1.ff733814af88cp-1,
+ 0x1.ff789eb6130c9p-1,
+ 0x1.ff7dd41ce2b4dp-1,
+ 0x1.ff82d9e1a76d8p-1,
+ 0x1.ff87b1913e853p-1,
+ 0x1.ff8c5cad200a5p-1,
+ 0x1.ff90dcaba4096p-1,
+ 0x1.ff9532f846ab0p-1,
+ 0x1.ff9960f3eb327p-1,
+ 0x1.ff9d67f51ddbap-1,
+ 0x1.ffa14948549a7p-1,
+ 0x1.ffa506302ebaep-1,
+ 0x1.ffa89fe5b3625p-1,
+ 0x1.ffac17988ef4bp-1,
+ 0x1.ffaf6e6f4f5c0p-1,
+ 0x1.ffb2a5879f35ep-1,
+ 0x1.ffb5bdf67fe6fp-1,
+ 0x1.ffb8b8c88295fp-1,
+ 0x1.ffbb970200110p-1,
+ 0x1.ffbe599f4f9d9p-1,
+ 0x1.ffc10194fcb64p-1,
+ 0x1.ffc38fcffbb7cp-1,
+ 0x1.ffc60535dd7f5p-1,
+ 0x1.ffc862a501fd7p-1,
+ 0x1.ffcaa8f4c9beap-1,
+ 0x1.ffccd8f5c66d1p-1,
+ 0x1.ffcef371ea4d7p-1,
+ 0x1.ffd0f92cb6ba7p-1,
+ 0x1.ffd2eae369a07p-1,
+ 0x1.ffd4c94d29fdbp-1,
+ 0x1.ffd6951b33686p-1,
+ 0x1.ffd84ef9009eep-1,
+ 0x1.ffd9f78c7524ap-1,
+ 0x1.ffdb8f7605ee7p-1,
+ 0x1.ffdd1750e1220p-1,
+ 0x1.ffde8fb314ebfp-1,
+ 0x1.ffdff92db56e5p-1,
+ 0x1.ffe1544d01ccbp-1,
+ 0x1.ffe2a1988857cp-1,
+ 0x1.ffe3e19349dc7p-1,
+ 0x1.ffe514bbdc197p-1,
+ 0x1.ffe63b8c8b5f7p-1,
+ 0x1.ffe7567b7b5e1p-1,
+ 0x1.ffe865fac722bp-1,
+ 0x1.ffe96a78a04a9p-1,
+ 0x1.ffea645f6d6dap-1,
+ 0x1.ffeb5415e7c44p-1,
+ 0x1.ffec39ff380b9p-1,
+ 0x1.ffed167b12ac2p-1,
+ 0x1.ffede9e5d3262p-1,
+ 0x1.ffeeb49896c6dp-1,
+ 0x1.ffef76e956a9fp-1,
+ 0x1.fff0312b010b5p-1,
+ 0x1.fff0e3ad91ec2p-1,
+ 0x1.fff18ebe2b0e1p-1,
+ 0x1.fff232a72b48ep-1,
+ 0x1.fff2cfb0453d9p-1,
+ 0x1.fff3661e9569dp-1,
+ 0x1.fff3f634b79f9p-1,
+ 0x1.fff48032dbe40p-1,
+ 0x1.fff50456dab8cp-1,
+ 0x1.fff582dc48d30p-1,
+ 0x1.fff5fbfc8a439p-1,
+ 0x1.fff66feee5129p-1,
+ 0x1.fff6dee89352ep-1,
+ 0x1.fff7491cd4af6p-1,
+ 0x1.fff7aebcff755p-1,
+ 0x1.fff80ff8911fdp-1,
+ 0x1.fff86cfd3e657p-1,
+ 0x1.fff8c5f702ccfp-1,
+ 0x1.fff91b102fca8p-1,
+ 0x1.fff96c717b695p-1,
+ 0x1.fff9ba420e834p-1,
+ 0x1.fffa04a7928b1p-1,
+ 0x1.fffa4bc63ee9ap-1,
+ 0x1.fffa8fc0e5f33p-1,
+ 0x1.fffad0b901755p-1,
+ 0x1.fffb0ecebee1bp-1,
+ 0x1.fffb4a210b172p-1,
+ 0x1.fffb82cd9dcbfp-1,
+ 0x1.fffbb8f1049c6p-1,
+ 0x1.fffbeca6adbe9p-1,
+ 0x1.fffc1e08f25f5p-1,
+ 0x1.fffc4d3120aa1p-1,
+ 0x1.fffc7a37857d2p-1,
+ 0x1.fffca53375ce3p-1,
+ 0x1.fffcce3b57bffp-1,
+ 0x1.fffcf564ab6b7p-1,
+ 0x1.fffd1ac4135f9p-1,
+ 0x1.fffd3e6d5cd87p-1,
+ 0x1.fffd607387b07p-1,
+ 0x1.fffd80e8ce0dap-1,
+ 0x1.fffd9fdeabccep-1,
+ 0x1.fffdbd65e5ad0p-1,
+ 0x1.fffdd98e903b2p-1,
+ 0x1.fffdf46816833p-1,
+ 0x1.fffe0e0140857p-1,
+ 0x1.fffe26683972ap-1,
+ 0x1.fffe3daa95b18p-1,
+ 0x1.fffe53d558ae9p-1,
+ 0x1.fffe68f4fa777p-1,
+ 0x1.fffe7d156d244p-1,
+ 0x1.fffe904222101p-1,
+ 0x1.fffea2860ee1ep-1,
+ 0x1.fffeb3ebb267bp-1,
+ 0x1.fffec47d19457p-1,
+ 0x1.fffed443e2787p-1,
+ 0x1.fffee34943b15p-1,
+ 0x1.fffef1960d85dp-1,
+ 0x1.fffeff32af7afp-1,
+ 0x1.ffff0c273bea2p-1,
+ 0x1.ffff187b6bc0ep-1,
+ 0x1.ffff2436a21dcp-1,
+ 0x1.ffff2f5fefcaap-1,
+ 0x1.ffff39fe16963p-1,
+ 0x1.ffff44178c8d2p-1,
+ 0x1.ffff4db27f146p-1,
+ 0x1.ffff56d4d5e5ep-1,
+ 0x1.ffff5f8435efcp-1,
+ 0x1.ffff67c604180p-1,
+ 0x1.ffff6f9f67e55p-1,
+ 0x1.ffff77154e0d6p-1,
+ 0x1.ffff7e2c6aea2p-1,
+ 0x1.ffff84e93cd75p-1,
+ 0x1.ffff8b500e77cp-1,
+ 0x1.ffff9164f8e46p-1,
+ 0x1.ffff972be5c59p-1,
+ 0x1.ffff9ca891572p-1,
+ 0x1.ffffa1de8c582p-1,
+ 0x1.ffffa6d13de73p-1,
+ 0x1.ffffab83e54b8p-1,
+ 0x1.ffffaff99bac4p-1,
+ 0x1.ffffb43555b5fp-1,
+ 0x1.ffffb839e52f3p-1,
+ 0x1.ffffbc09fa7cdp-1,
+ 0x1.ffffbfa82616bp-1,
+ 0x1.ffffc316d9ed0p-1,
+ 0x1.ffffc6586abf6p-1,
+ 0x1.ffffc96f1165ep-1,
+ 0x1.ffffcc5cec0c1p-1,
+ 0x1.ffffcf23ff5fcp-1,
+ 0x1.ffffd1c637b2bp-1,
+ 0x1.ffffd4456a10dp-1,
+ 0x1.ffffd6a3554a1p-1,
+ 0x1.ffffd8e1a2f22p-1,
+ 0x1.ffffdb01e8546p-1,
+ 0x1.ffffdd05a75eap-1,
+ 0x1.ffffdeee4f810p-1,
+ 0x1.ffffe0bd3e852p-1,
+ 0x1.ffffe273c15b7p-1,
+ 0x1.ffffe41314e06p-1,
+ 0x1.ffffe59c6698bp-1,
+ 0x1.ffffe710d565ep-1,
+ 0x1.ffffe8717232dp-1,
+ 0x1.ffffe9bf4098cp-1,
+ 0x1.ffffeafb377d5p-1,
+ 0x1.ffffec2641a9ep-1,
+ 0x1.ffffed413e5b7p-1,
+ 0x1.ffffee4d01cd6p-1,
+ 0x1.ffffef4a55bd4p-1,
+ 0x1.fffff039f9e8fp-1,
+ 0x1.fffff11ca4876p-1,
+ 0x1.fffff1f302bc1p-1,
+ 0x1.fffff2bdb904dp-1,
+ 0x1.fffff37d63a36p-1,
+ 0x1.fffff43297019p-1,
+ 0x1.fffff4dde0118p-1,
+ 0x1.fffff57fc4a95p-1,
+ 0x1.fffff618c3da6p-1,
+ 0x1.fffff6a956450p-1,
+ 0x1.fffff731ee681p-1,
+ 0x1.fffff7b2f8ed6p-1,
+ 0x1.fffff82cdcf1bp-1,
+ 0x1.fffff89ffc4aap-1,
+ 0x1.fffff90cb3c81p-1,
+ 0x1.fffff9735b73bp-1,
+ 0x1.fffff9d446cccp-1,
+ 0x1.fffffa2fc5015p-1,
+ 0x1.fffffa8621251p-1,
+ 0x1.fffffad7a2652p-1,
+ 0x1.fffffb248c39dp-1,
+ 0x1.fffffb6d1e95dp-1,
+ 0x1.fffffbb196132p-1,
+ 0x1.fffffbf22c1e2p-1,
+ 0x1.fffffc2f171e3p-1,
+ 0x1.fffffc688a9cfp-1,
+ 0x1.fffffc9eb76acp-1,
+ 0x1.fffffcd1cbc28p-1,
+ 0x1.fffffd01f36afp-1,
+ 0x1.fffffd2f57d68p-1,
+ 0x1.fffffd5a2041fp-1,
+ 0x1.fffffd8271d12p-1,
+ 0x1.fffffda86faa9p-1,
+ 0x1.fffffdcc3b117p-1,
+ 0x1.fffffdedf37edp-1,
+ 0x1.fffffe0db6b91p-1,
+ 0x1.fffffe2ba0ea5p-1,
+ 0x1.fffffe47ccb60p-1,
+ 0x1.fffffe62534d4p-1,
+ 0x1.fffffe7b4c81ep-1,
+ 0x1.fffffe92ced93p-1,
+ 0x1.fffffea8ef9cfp-1,
+ 0x1.fffffebdc2ec6p-1,
+ 0x1.fffffed15bcbap-1,
+ 0x1.fffffee3cc32cp-1,
+ 0x1.fffffef5251c2p-1,
+ 0x1.ffffff0576917p-1,
+ 0x1.ffffff14cfb92p-1,
+ 0x1.ffffff233ee1dp-1,
+ 0x1.ffffff30d18e8p-1,
+ 0x1.ffffff3d9480fp-1,
+ 0x1.ffffff4993c46p-1,
+ 0x1.ffffff54dab72p-1,
+ 0x1.ffffff5f74141p-1,
+ 0x1.ffffff6969fb8p-1,
+ 0x1.ffffff72c5fb6p-1,
+ 0x1.ffffff7b91176p-1,
+ 0x1.ffffff83d3d07p-1,
+ 0x1.ffffff8b962bep-1,
+ 0x1.ffffff92dfba2p-1,
+ 0x1.ffffff99b79d2p-1,
+ 0x1.ffffffa0248e8p-1,
+ 0x1.ffffffa62ce54p-1,
+ 0x1.ffffffabd69b4p-1,
+ 0x1.ffffffb127525p-1,
+ 0x1.ffffffb624592p-1,
+ 0x1.ffffffbad2affp-1,
+ 0x1.ffffffbf370cdp-1,
+ 0x1.ffffffc355dfdp-1,
+ 0x1.ffffffc733572p-1,
+ 0x1.ffffffcad3626p-1,
+ 0x1.ffffffce39b67p-1,
+ 0x1.ffffffd169d0cp-1,
+ 0x1.ffffffd466fa5p-1,
+ 0x1.ffffffd7344aap-1,
+ 0x1.ffffffd9d4aabp-1,
+ 0x1.ffffffdc4ad7ap-1,
+ 0x1.ffffffde9964ep-1,
+ 0x1.ffffffe0c2bf0p-1,
+ 0x1.ffffffe2c92dbp-1,
+ 0x1.ffffffe4aed5ep-1,
+ 0x1.ffffffe675bbdp-1,
+ 0x1.ffffffe81fc4ep-1,
+ 0x1.ffffffe9aeb97p-1,
+ 0x1.ffffffeb24467p-1,
+ 0x1.ffffffec81ff2p-1,
+ 0x1.ffffffedc95e7p-1,
+ 0x1.ffffffeefbc85p-1,
+ 0x1.fffffff01a8b6p-1,
+ 0x1.fffffff126e1ep-1,
+ 0x1.fffffff221f30p-1,
+ 0x1.fffffff30cd3fp-1,
+ 0x1.fffffff3e8892p-1,
+ 0x1.fffffff4b606fp-1,
+ 0x1.fffffff57632dp-1,
+ 0x1.fffffff629e44p-1,
+ 0x1.fffffff6d1e56p-1,
+ 0x1.fffffff76ef3fp-1,
+ 0x1.fffffff801c1fp-1,
+ 0x1.fffffff88af67p-1,
+ 0x1.fffffff90b2e3p-1,
+ 0x1.fffffff982fc1p-1,
+ 0x1.fffffff9f2e9fp-1,
+ 0x1.fffffffa5b790p-1,
+ 0x1.fffffffabd229p-1,
+ 0x1.fffffffb18582p-1,
+ 0x1.fffffffb6d844p-1,
+ 0x1.fffffffbbd0aap-1,
+ 0x1.fffffffc0748fp-1,
+ 0x1.fffffffc4c96cp-1,
+ 0x1.fffffffc8d462p-1,
+ 0x1.fffffffcc9a41p-1,
+ 0x1.fffffffd01f89p-1,
+ 0x1.fffffffd36871p-1,
+ 0x1.fffffffd678edp-1,
+ 0x1.fffffffd954aep-1,
+ 0x1.fffffffdbff2ap-1,
+ 0x1.fffffffde7ba0p-1,
+ 0x1.fffffffe0cd16p-1,
+ 0x1.fffffffe2f664p-1,
+ 0x1.fffffffe4fa30p-1,
+ 0x1.fffffffe6daf7p-1,
+ 0x1.fffffffe89b0cp-1,
+ 0x1.fffffffea3c9ap-1,
+ 0x1.fffffffebc1a9p-1,
+ 0x1.fffffffed2c21p-1,
+ 0x1.fffffffee7dc8p-1,
+ 0x1.fffffffefb847p-1,
+ 0x1.ffffffff0dd2bp-1,
+ 0x1.ffffffff1ede9p-1,
+ 0x1.ffffffff2ebdap-1,
+ 0x1.ffffffff3d843p-1,
+ 0x1.ffffffff4b453p-1,
+ 0x1.ffffffff58126p-1,
+ 0x1.ffffffff63fc3p-1,
+ 0x1.ffffffff6f121p-1,
+ 0x1.ffffffff79626p-1,
+ 0x1.ffffffff82fabp-1,
+ 0x1.ffffffff8be77p-1,
+ 0x1.ffffffff94346p-1,
+ 0x1.ffffffff9bec8p-1,
+ 0x1.ffffffffa319fp-1,
+ 0x1.ffffffffa9c63p-1,
+ 0x1.ffffffffaffa4p-1,
+ 0x1.ffffffffb5be5p-1,
+ 0x1.ffffffffbb1a2p-1,
+ 0x1.ffffffffc014ep-1,
+ 0x1.ffffffffc4b56p-1,
+ 0x1.ffffffffc901cp-1,
+ 0x1.ffffffffccfffp-1,
+ 0x1.ffffffffd0b56p-1,
+ 0x1.ffffffffd4271p-1,
+ 0x1.ffffffffd759dp-1,
+ 0x1.ffffffffda520p-1,
+ 0x1.ffffffffdd13cp-1,
+ 0x1.ffffffffdfa2dp-1,
+ 0x1.ffffffffe202dp-1,
+ 0x1.ffffffffe4371p-1,
+ 0x1.ffffffffe642ap-1,
+ 0x1.ffffffffe8286p-1,
+ 0x1.ffffffffe9eb0p-1,
+ 0x1.ffffffffeb8d0p-1,
+ 0x1.ffffffffed10ap-1,
+ 0x1.ffffffffee782p-1,
+ 0x1.ffffffffefc57p-1,
+ 0x1.fffffffff0fa7p-1,
+ 0x1.fffffffff218fp-1,
+ 0x1.fffffffff3227p-1,
+ 0x1.fffffffff4188p-1,
+ 0x1.fffffffff4fc9p-1,
+ 0x1.fffffffff5cfdp-1,
+ 0x1.fffffffff6939p-1,
+ 0x1.fffffffff748ep-1,
+ 0x1.fffffffff7f0dp-1,
+ 0x1.fffffffff88c5p-1,
+ 0x1.fffffffff91c6p-1,
+ 0x1.fffffffff9a1bp-1,
+ 0x1.fffffffffa1d2p-1,
+ 0x1.fffffffffa8f6p-1,
+ 0x1.fffffffffaf92p-1,
+ 0x1.fffffffffb5b0p-1,
+ 0x1.fffffffffbb58p-1,
+ 0x1.fffffffffc095p-1,
+ 0x1.fffffffffc56dp-1,
+ 0x1.fffffffffc9e8p-1,
+ 0x1.fffffffffce0dp-1,
+ 0x1.fffffffffd1e1p-1,
+ 0x1.fffffffffd56cp-1,
+ 0x1.fffffffffd8b3p-1,
+ 0x1.fffffffffdbbap-1,
+ 0x1.fffffffffde86p-1,
+ 0x1.fffffffffe11dp-1,
+ 0x1.fffffffffe380p-1,
+ 0x1.fffffffffe5b6p-1,
+ 0x1.fffffffffe7c0p-1,
+ 0x1.fffffffffe9a2p-1,
+ 0x1.fffffffffeb60p-1,
+ 0x1.fffffffffecfbp-1,
+ 0x1.fffffffffee77p-1,
+ 0x1.fffffffffefd6p-1,
+ 0x1.ffffffffff11ap-1,
+ 0x1.ffffffffff245p-1,
+ 0x1.ffffffffff359p-1,
+ 0x1.ffffffffff457p-1,
+ 0x1.ffffffffff542p-1,
+ 0x1.ffffffffff61bp-1,
+ 0x1.ffffffffff6e3p-1,
+ 0x1.ffffffffff79bp-1,
+ 0x1.ffffffffff845p-1,
+ 0x1.ffffffffff8e2p-1,
+ 0x1.ffffffffff973p-1,
+ 0x1.ffffffffff9f8p-1,
+ 0x1.ffffffffffa73p-1,
+ 0x1.ffffffffffae4p-1,
+ 0x1.ffffffffffb4cp-1,
+ 0x1.ffffffffffbadp-1,
+ 0x1.ffffffffffc05p-1,
+ 0x1.ffffffffffc57p-1,
+ 0x1.ffffffffffca2p-1,
+ 0x1.ffffffffffce7p-1,
+ 0x1.ffffffffffd27p-1,
+ 0x1.ffffffffffd62p-1,
+ 0x1.ffffffffffd98p-1,
+ 0x1.ffffffffffdcap-1,
+ 0x1.ffffffffffdf8p-1,
+ 0x1.ffffffffffe22p-1,
+ 0x1.ffffffffffe49p-1,
+ 0x1.ffffffffffe6cp-1,
+ 0x1.ffffffffffe8dp-1,
+ 0x1.ffffffffffeabp-1,
+ 0x1.ffffffffffec7p-1,
+ 0x1.ffffffffffee1p-1,
+ 0x1.ffffffffffef8p-1,
+ 0x1.fffffffffff0ep-1,
+ 0x1.fffffffffff22p-1,
+ 0x1.fffffffffff34p-1,
+ 0x1.fffffffffff45p-1,
+ 0x1.fffffffffff54p-1,
+ 0x1.fffffffffff62p-1,
+ 0x1.fffffffffff6fp-1,
+ 0x1.fffffffffff7bp-1,
+ 0x1.fffffffffff86p-1,
+ 0x1.fffffffffff90p-1,
+ 0x1.fffffffffff9ap-1,
+ 0x1.fffffffffffa2p-1,
+ 0x1.fffffffffffaap-1,
+ 0x1.fffffffffffb1p-1,
+ 0x1.fffffffffffb8p-1,
+ 0x1.fffffffffffbep-1,
+ 0x1.fffffffffffc3p-1,
+ 0x1.fffffffffffc8p-1,
+ 0x1.fffffffffffcdp-1,
+ 0x1.fffffffffffd1p-1,
+ 0x1.fffffffffffd5p-1,
+ 0x1.fffffffffffd9p-1,
+ 0x1.fffffffffffdcp-1,
+ 0x1.fffffffffffdfp-1,
+ 0x1.fffffffffffe2p-1,
+ 0x1.fffffffffffe4p-1,
+ 0x1.fffffffffffe7p-1,
+ 0x1.fffffffffffe9p-1,
+ 0x1.fffffffffffebp-1,
+ 0x1.fffffffffffedp-1,
+ 0x1.fffffffffffeep-1,
+ 0x1.ffffffffffff0p-1,
+ 0x1.ffffffffffff1p-1,
+ 0x1.ffffffffffff3p-1,
+ 0x1.ffffffffffff4p-1,
+ 0x1.ffffffffffff5p-1,
+ 0x1.ffffffffffff6p-1,
+ 0x1.ffffffffffff7p-1,
+ 0x1.ffffffffffff7p-1,
+ 0x1.ffffffffffff8p-1,
+ 0x1.ffffffffffff9p-1,
+ 0x1.ffffffffffff9p-1,
+ 0x1.ffffffffffffap-1,
+ 0x1.ffffffffffffbp-1,
+ 0x1.ffffffffffffbp-1,
+ 0x1.ffffffffffffbp-1,
+ 0x1.ffffffffffffcp-1,
+ 0x1.ffffffffffffcp-1,
+ 0x1.ffffffffffffdp-1,
+ 0x1.ffffffffffffdp-1,
+ 0x1.ffffffffffffdp-1,
+ 0x1.ffffffffffffdp-1,
+ 0x1.ffffffffffffep-1,
+ 0x1.ffffffffffffep-1,
+ 0x1.ffffffffffffep-1,
+ 0x1.ffffffffffffep-1,
+ 0x1.ffffffffffffep-1,
+ 0x1.ffffffffffffep-1,
+ 0x1.fffffffffffffp-1,
+ 0x1.fffffffffffffp-1,
+ 0x1.fffffffffffffp-1,
+ 0x1.fffffffffffffp-1,
+ 0x1.fffffffffffffp-1,
+ 0x1.fffffffffffffp-1,
+ 0x1.fffffffffffffp-1,
+ 0x1.fffffffffffffp-1,
+ 0x1.fffffffffffffp-1,
+ 0x1.fffffffffffffp-1,
+ 0x1.fffffffffffffp-1,
+ 0x1.0000000000000p+0,
+ 0x1.0000000000000p+0,
+ 0x1.0000000000000p+0,
+ 0x1.0000000000000p+0,
+ 0x1.0000000000000p+0,
+ 0x1.0000000000000p+0,
+ 0x1.0000000000000p+0,
+ 0x1.0000000000000p+0,
+ 0x1.0000000000000p+0,
+ 0x1.0000000000000p+0,
+ 0x1.0000000000000p+0,
+ },
+ .scale = { 0x1.20dd750429b6dp+0,
+ 0x1.20d8f1975c85dp+0,
+ 0x1.20cb67bd452c7p+0,
+ 0x1.20b4d8bac36c1p+0,
+ 0x1.209546ad13ccfp+0,
+ 0x1.206cb4897b148p+0,
+ 0x1.203b261cd0052p+0,
+ 0x1.2000a00ae3804p+0,
+ 0x1.1fbd27cdc72d3p+0,
+ 0x1.1f70c3b4f2cc7p+0,
+ 0x1.1f1b7ae44867fp+0,
+ 0x1.1ebd5552f795bp+0,
+ 0x1.1e565bca400d4p+0,
+ 0x1.1de697e413d28p+0,
+ 0x1.1d6e14099944ap+0,
+ 0x1.1cecdb718d61cp+0,
+ 0x1.1c62fa1e869b6p+0,
+ 0x1.1bd07cdd189acp+0,
+ 0x1.1b357141d95d5p+0,
+ 0x1.1a91e5a748165p+0,
+ 0x1.19e5e92b964abp+0,
+ 0x1.19318bae53a04p+0,
+ 0x1.1874ddcdfce24p+0,
+ 0x1.17aff0e56ec10p+0,
+ 0x1.16e2d7093cd8cp+0,
+ 0x1.160da304ed92fp+0,
+ 0x1.153068581b781p+0,
+ 0x1.144b3b337c90cp+0,
+ 0x1.135e3075d076bp+0,
+ 0x1.12695da8b5bdep+0,
+ 0x1.116cd8fd67618p+0,
+ 0x1.1068b94962e5ep+0,
+ 0x1.0f5d1602f7e41p+0,
+ 0x1.0e4a073dc1b91p+0,
+ 0x1.0d2fa5a70c168p+0,
+ 0x1.0c0e0a8223359p+0,
+ 0x1.0ae54fa490722p+0,
+ 0x1.09b58f724416bp+0,
+ 0x1.087ee4d9ad247p+0,
+ 0x1.07416b4fbfe7cp+0,
+ 0x1.05fd3ecbec297p+0,
+ 0x1.04b27bc403d30p+0,
+ 0x1.03613f2812dafp+0,
+ 0x1.0209a65e29545p+0,
+ 0x1.00abcf3e187a9p+0,
+ 0x1.fe8fb01a47307p-1,
+ 0x1.fbbbbef34b4b2p-1,
+ 0x1.f8dc092d58ff8p-1,
+ 0x1.f5f0cdaf15313p-1,
+ 0x1.f2fa4c16c0019p-1,
+ 0x1.eff8c4b1375dbp-1,
+ 0x1.ecec7870ebca7p-1,
+ 0x1.e9d5a8e4c934ep-1,
+ 0x1.e6b4982f158b9p-1,
+ 0x1.e38988fc46e72p-1,
+ 0x1.e054be79d3042p-1,
+ 0x1.dd167c4cf9d2ap-1,
+ 0x1.d9cf06898cdafp-1,
+ 0x1.d67ea1a8b5368p-1,
+ 0x1.d325927fb9d89p-1,
+ 0x1.cfc41e36c7df9p-1,
+ 0x1.cc5a8a3fbea40p-1,
+ 0x1.c8e91c4d01368p-1,
+ 0x1.c5701a484ef9dp-1,
+ 0x1.c1efca49a5011p-1,
+ 0x1.be68728e29d5dp-1,
+ 0x1.bada596f25436p-1,
+ 0x1.b745c55905bf8p-1,
+ 0x1.b3aafcc27502ep-1,
+ 0x1.b00a46237d5bep-1,
+ 0x1.ac63e7ecc1411p-1,
+ 0x1.a8b8287ec6a09p-1,
+ 0x1.a5074e2157620p-1,
+ 0x1.a1519efaf889ep-1,
+ 0x1.9d97610879642p-1,
+ 0x1.99d8da149c13fp-1,
+ 0x1.96164fafd8de3p-1,
+ 0x1.925007283d7aap-1,
+ 0x1.8e86458169af8p-1,
+ 0x1.8ab94f6caa71dp-1,
+ 0x1.86e9694134b9ep-1,
+ 0x1.8316d6f48133dp-1,
+ 0x1.7f41dc12c9e89p-1,
+ 0x1.7b6abbb7aaf19p-1,
+ 0x1.7791b886e7403p-1,
+ 0x1.73b714a552763p-1,
+ 0x1.6fdb11b1e0c34p-1,
+ 0x1.6bfdf0beddaf5p-1,
+ 0x1.681ff24b4ab04p-1,
+ 0x1.6441563c665d4p-1,
+ 0x1.60625bd75d07bp-1,
+ 0x1.5c8341bb23767p-1,
+ 0x1.58a445da7c74cp-1,
+ 0x1.54c5a57629db0p-1,
+ 0x1.50e79d1749ac9p-1,
+ 0x1.4d0a6889dfd9fp-1,
+ 0x1.492e42d78d2c5p-1,
+ 0x1.4553664273d24p-1,
+ 0x1.417a0c4049fd0p-1,
+ 0x1.3da26d759aef5p-1,
+ 0x1.39ccc1b136d5ap-1,
+ 0x1.35f93fe7d1b3dp-1,
+ 0x1.32281e2fd1a92p-1,
+ 0x1.2e5991bd4cbfcp-1,
+ 0x1.2a8dcede3673bp-1,
+ 0x1.26c508f6bd0ffp-1,
+ 0x1.22ff727dd6f7bp-1,
+ 0x1.1f3d3cf9ffe5ap-1,
+ 0x1.1b7e98fe26217p-1,
+ 0x1.17c3b626c7a11p-1,
+ 0x1.140cc3173f007p-1,
+ 0x1.1059ed7740313p-1,
+ 0x1.0cab61f084b93p-1,
+ 0x1.09014c2ca74dap-1,
+ 0x1.055bd6d32e8d7p-1,
+ 0x1.01bb2b87c6968p-1,
+ 0x1.fc3ee5d1524b0p-2,
+ 0x1.f511a91a67d2ap-2,
+ 0x1.edeeee0959518p-2,
+ 0x1.e6d6ffaa65a25p-2,
+ 0x1.dfca26f5bbf88p-2,
+ 0x1.d8c8aace11e63p-2,
+ 0x1.d1d2cfff91594p-2,
+ 0x1.cae8d93f1d7b6p-2,
+ 0x1.c40b0729ed547p-2,
+ 0x1.bd3998457afdap-2,
+ 0x1.b674c8ffc6283p-2,
+ 0x1.afbcd3afe8ab6p-2,
+ 0x1.a911f096fbc26p-2,
+ 0x1.a27455e14c93cp-2,
+ 0x1.9be437a7de946p-2,
+ 0x1.9561c7f23a47bp-2,
+ 0x1.8eed36b886d93p-2,
+ 0x1.8886b1e5ecfd1p-2,
+ 0x1.822e655b417e6p-2,
+ 0x1.7be47af1f5d89p-2,
+ 0x1.75a91a7f4d2edp-2,
+ 0x1.6f7c69d7d3ef8p-2,
+ 0x1.695e8cd31867ep-2,
+ 0x1.634fa54fa285fp-2,
+ 0x1.5d4fd33729015p-2,
+ 0x1.575f3483021c3p-2,
+ 0x1.517de540ce2a3p-2,
+ 0x1.4babff975a04cp-2,
+ 0x1.45e99bcbb7915p-2,
+ 0x1.4036d0468a7a2p-2,
+ 0x1.3a93b1998736cp-2,
+ 0x1.35005285227f1p-2,
+ 0x1.2f7cc3fe6f423p-2,
+ 0x1.2a09153529381p-2,
+ 0x1.24a55399ea239p-2,
+ 0x1.1f518ae487dc8p-2,
+ 0x1.1a0dc51a9934dp-2,
+ 0x1.14da0a961fd14p-2,
+ 0x1.0fb6620c550afp-2,
+ 0x1.0aa2d09497f2bp-2,
+ 0x1.059f59af7a906p-2,
+ 0x1.00abff4dec7a3p-2,
+ 0x1.f79183b101c5bp-3,
+ 0x1.edeb406d9c824p-3,
+ 0x1.e4652fadcb6b2p-3,
+ 0x1.daff4969c0b04p-3,
+ 0x1.d1b982c501370p-3,
+ 0x1.c893ce1dcbef7p-3,
+ 0x1.bf8e1b1ca2279p-3,
+ 0x1.b6a856c3ed54fp-3,
+ 0x1.ade26b7fbed95p-3,
+ 0x1.a53c4135a6526p-3,
+ 0x1.9cb5bd549b111p-3,
+ 0x1.944ec2e4f5630p-3,
+ 0x1.8c07329874652p-3,
+ 0x1.83deeada4d25ap-3,
+ 0x1.7bd5c7df3fe9cp-3,
+ 0x1.73eba3b5b07b7p-3,
+ 0x1.6c205655be71fp-3,
+ 0x1.6473b5b15a7a1p-3,
+ 0x1.5ce595c455b0ap-3,
+ 0x1.5575c8a468361p-3,
+ 0x1.4e241e912c305p-3,
+ 0x1.46f066040a832p-3,
+ 0x1.3fda6bc016994p-3,
+ 0x1.38e1fae1d6a9dp-3,
+ 0x1.3206dceef5f87p-3,
+ 0x1.2b48d9e5dea1cp-3,
+ 0x1.24a7b84d38971p-3,
+ 0x1.1e233d434b813p-3,
+ 0x1.17bb2c8d41535p-3,
+ 0x1.116f48a6476ccp-3,
+ 0x1.0b3f52ce8c383p-3,
+ 0x1.052b0b1a174eap-3,
+ 0x1.fe6460fef4680p-4,
+ 0x1.f2a901ccafb37p-4,
+ 0x1.e723726b824a9p-4,
+ 0x1.dbd32ac4c99b0p-4,
+ 0x1.d0b7a0f921e7cp-4,
+ 0x1.c5d0497c09e74p-4,
+ 0x1.bb1c972f23e50p-4,
+ 0x1.b09bfb7d11a83p-4,
+ 0x1.a64de673e8837p-4,
+ 0x1.9c31c6df3b1b8p-4,
+ 0x1.92470a61b6965p-4,
+ 0x1.888d1d8e510a3p-4,
+ 0x1.7f036c0107294p-4,
+ 0x1.75a96077274bap-4,
+ 0x1.6c7e64e7281cbp-4,
+ 0x1.6381e2980956bp-4,
+ 0x1.5ab342383d177p-4,
+ 0x1.5211ebf41880bp-4,
+ 0x1.499d478bca735p-4,
+ 0x1.4154bc68d75c3p-4,
+ 0x1.3937b1b319259p-4,
+ 0x1.31458e6542847p-4,
+ 0x1.297db960e4f63p-4,
+ 0x1.21df9981f8e53p-4,
+ 0x1.1a6a95b1e786fp-4,
+ 0x1.131e14fa1625dp-4,
+ 0x1.0bf97e95f2a64p-4,
+ 0x1.04fc3a0481321p-4,
+ 0x1.fc4b5e32d6259p-5,
+ 0x1.eeea8c1b1db93p-5,
+ 0x1.e1d4cf1e2450ap-5,
+ 0x1.d508f9a1ea64ep-5,
+ 0x1.c885df3451a07p-5,
+ 0x1.bc4a54a84e834p-5,
+ 0x1.b055303221015p-5,
+ 0x1.a4a549829587ep-5,
+ 0x1.993979e14fffdp-5,
+ 0x1.8e109c4622913p-5,
+ 0x1.83298d717210ep-5,
+ 0x1.78832c03aa2b1p-5,
+ 0x1.6e1c5893c380bp-5,
+ 0x1.63f3f5c4de13bp-5,
+ 0x1.5a08e85af27e0p-5,
+ 0x1.505a174e9c929p-5,
+ 0x1.46e66be002240p-5,
+ 0x1.3dacd1a8d8ccdp-5,
+ 0x1.34ac36ad8dafep-5,
+ 0x1.2be38b6d92415p-5,
+ 0x1.2351c2f2d1449p-5,
+ 0x1.1af5d2e04f3f6p-5,
+ 0x1.12ceb37ff9bc3p-5,
+ 0x1.0adb5fcfa8c75p-5,
+ 0x1.031ad58d56279p-5,
+ 0x1.f7182a851bca2p-6,
+ 0x1.e85c449e377f2p-6,
+ 0x1.da0005e5f28dfp-6,
+ 0x1.cc0180af00a8bp-6,
+ 0x1.be5ecd2fcb5f9p-6,
+ 0x1.b1160991ff737p-6,
+ 0x1.a4255a00b9f03p-6,
+ 0x1.978ae8b55ce1bp-6,
+ 0x1.8b44e6031383ep-6,
+ 0x1.7f5188610ddc8p-6,
+ 0x1.73af0c737bb45p-6,
+ 0x1.685bb5134ef13p-6,
+ 0x1.5d55cb54cd53ap-6,
+ 0x1.529b9e8cf9a1ep-6,
+ 0x1.482b8455dc491p-6,
+ 0x1.3e03d891b37dep-6,
+ 0x1.3422fd6d12e2bp-6,
+ 0x1.2a875b5ffab56p-6,
+ 0x1.212f612dee7fbp-6,
+ 0x1.181983e5133ddp-6,
+ 0x1.0f443edc5ce49p-6,
+ 0x1.06ae13b0d3255p-6,
+ 0x1.fcab1483ea7fcp-7,
+ 0x1.ec72615a894c4p-7,
+ 0x1.dcaf3691fc448p-7,
+ 0x1.cd5ec93c12431p-7,
+ 0x1.be7e5ac24963bp-7,
+ 0x1.b00b38d6b3575p-7,
+ 0x1.a202bd6372dcep-7,
+ 0x1.94624e78e0fafp-7,
+ 0x1.87275e3a6869dp-7,
+ 0x1.7a4f6aca256cbp-7,
+ 0x1.6dd7fe3358230p-7,
+ 0x1.61beae53b72b7p-7,
+ 0x1.56011cc3b036dp-7,
+ 0x1.4a9cf6bda3f4cp-7,
+ 0x1.3f8ff5042a88ep-7,
+ 0x1.34d7dbc76d7e5p-7,
+ 0x1.2a727a89a3f14p-7,
+ 0x1.205dac02bd6b9p-7,
+ 0x1.1697560347b25p-7,
+ 0x1.0d1d69569b82dp-7,
+ 0x1.03ede1a45bfeep-7,
+ 0x1.f60d8aa2a88f2p-8,
+ 0x1.e4cc4abf7d065p-8,
+ 0x1.d4143a9dfe965p-8,
+ 0x1.c3e1a5f5c077cp-8,
+ 0x1.b430ecf4a83a8p-8,
+ 0x1.a4fe83fb9db25p-8,
+ 0x1.9646f35a76623p-8,
+ 0x1.8806d70b2fc36p-8,
+ 0x1.7a3ade6c8b3e4p-8,
+ 0x1.6cdfcbfc1e263p-8,
+ 0x1.5ff2750fe7820p-8,
+ 0x1.536fc18f7ce5cp-8,
+ 0x1.4754abacdf1dcp-8,
+ 0x1.3b9e3f9d06e3fp-8,
+ 0x1.30499b503957fp-8,
+ 0x1.2553ee2a336bfp-8,
+ 0x1.1aba78ba3af89p-8,
+ 0x1.107a8c7323a6ep-8,
+ 0x1.06918b6355624p-8,
+ 0x1.f9f9cfd9c3035p-9,
+ 0x1.e77448fb66bb9p-9,
+ 0x1.d58da68fd1170p-9,
+ 0x1.c4412bf4b8f0bp-9,
+ 0x1.b38a3af2e55b4p-9,
+ 0x1.a3645330550ffp-9,
+ 0x1.93cb11a30d765p-9,
+ 0x1.84ba3004a50d0p-9,
+ 0x1.762d84469c18fp-9,
+ 0x1.6821000795a03p-9,
+ 0x1.5a90b00981d93p-9,
+ 0x1.4d78bba8ca5fdp-9,
+ 0x1.40d564548fad7p-9,
+ 0x1.34a305080681fp-9,
+ 0x1.28de11c5031ebp-9,
+ 0x1.1d83170fbf6fbp-9,
+ 0x1.128eb96be8798p-9,
+ 0x1.07fdb4dafea5fp-9,
+ 0x1.fb99b8b8279e1p-10,
+ 0x1.e7f232d9e2630p-10,
+ 0x1.d4fed7195d7e8p-10,
+ 0x1.c2b9cf7f893bfp-10,
+ 0x1.b11d702b3deb1p-10,
+ 0x1.a024365f771bdp-10,
+ 0x1.8fc8c794b03b5p-10,
+ 0x1.8005f08d6f1efp-10,
+ 0x1.70d6a46e07ddap-10,
+ 0x1.6235fbd7a4345p-10,
+ 0x1.541f340697987p-10,
+ 0x1.468dadf4080abp-10,
+ 0x1.397ced7af2b15p-10,
+ 0x1.2ce898809244ep-10,
+ 0x1.20cc76202c5fap-10,
+ 0x1.15246dda49d47p-10,
+ 0x1.09ec86c75d497p-10,
+ 0x1.fe41cd9bb4eeep-11,
+ 0x1.e97ba3b77f306p-11,
+ 0x1.d57f524723822p-11,
+ 0x1.c245d4b998479p-11,
+ 0x1.afc85e0f82e12p-11,
+ 0x1.9e005769dbc1dp-11,
+ 0x1.8ce75e9f6f8a0p-11,
+ 0x1.7c7744d9378f7p-11,
+ 0x1.6caa0d3582fe9p-11,
+ 0x1.5d79eb71e893bp-11,
+ 0x1.4ee1429bf7cc0p-11,
+ 0x1.40daa3c89f5b6p-11,
+ 0x1.3360ccd23db3ap-11,
+ 0x1.266ea71d4f71ap-11,
+ 0x1.19ff4663ae9dfp-11,
+ 0x1.0e0de78654d1ep-11,
+ 0x1.0295ef6591848p-11,
+ 0x1.ef25d37f49fe1p-12,
+ 0x1.da01102b5f851p-12,
+ 0x1.c5b5412dcafadp-12,
+ 0x1.b23a5a23e4210p-12,
+ 0x1.9f8893d8fd1c1p-12,
+ 0x1.8d986a4187285p-12,
+ 0x1.7c629a822bc9ep-12,
+ 0x1.6be02102b3520p-12,
+ 0x1.5c0a378c90bcap-12,
+ 0x1.4cda5374ea275p-12,
+ 0x1.3e4a23d1f4702p-12,
+ 0x1.30538fbb77ecdp-12,
+ 0x1.22f0b496539bdp-12,
+ 0x1.161be46ad3b50p-12,
+ 0x1.09cfa445b00ffp-12,
+ 0x1.fc0d55470cf51p-13,
+ 0x1.e577bbcd49935p-13,
+ 0x1.cfd4a5adec5bfp-13,
+ 0x1.bb1a9657ce465p-13,
+ 0x1.a740684026555p-13,
+ 0x1.943d4a1d1ed39p-13,
+ 0x1.8208bc334a6a5p-13,
+ 0x1.709a8db59f25cp-13,
+ 0x1.5feada379d8b7p-13,
+ 0x1.4ff207314a102p-13,
+ 0x1.40a8c1949f75ep-13,
+ 0x1.3207fb7420eb9p-13,
+ 0x1.2408e9ba3327fp-13,
+ 0x1.16a501f0e42cap-13,
+ 0x1.09d5f819c9e29p-13,
+ 0x1.fb2b792b40a22p-14,
+ 0x1.e3bcf436a1a95p-14,
+ 0x1.cd55277c18d05p-14,
+ 0x1.b7e94604479dcp-14,
+ 0x1.a36eec00926ddp-14,
+ 0x1.8fdc1b2dcf7b9p-14,
+ 0x1.7d2737527c3f9p-14,
+ 0x1.6b4702d7d5849p-14,
+ 0x1.5a329b7d30748p-14,
+ 0x1.49e17724f4d41p-14,
+ 0x1.3a4b60ba9aa4dp-14,
+ 0x1.2b6875310f785p-14,
+ 0x1.1d312098e9dbap-14,
+ 0x1.0f9e1b4dd36dfp-14,
+ 0x1.02a8673a94691p-14,
+ 0x1.ec929a665b449p-15,
+ 0x1.d4f4b4c8e09edp-15,
+ 0x1.be6abbb10a5aap-15,
+ 0x1.a8e8cc1fadef6p-15,
+ 0x1.94637d5bacfdbp-15,
+ 0x1.80cfdc72220cfp-15,
+ 0x1.6e2367dc27f95p-15,
+ 0x1.5c540b4936fd2p-15,
+ 0x1.4b581b8d170fcp-15,
+ 0x1.3b2652b06c2b2p-15,
+ 0x1.2bb5cc22e5db6p-15,
+ 0x1.1cfe010e2052dp-15,
+ 0x1.0ef6c4c84a0fep-15,
+ 0x1.01984165a5f36p-15,
+ 0x1.e9b5e8d00ce76p-16,
+ 0x1.d16f5716c6c1ap-16,
+ 0x1.ba4f035d60e02p-16,
+ 0x1.a447b7b03f045p-16,
+ 0x1.8f4ccca7fc90dp-16,
+ 0x1.7b5223dac7336p-16,
+ 0x1.684c227fcacefp-16,
+ 0x1.562fac4329b48p-16,
+ 0x1.44f21e49054f2p-16,
+ 0x1.34894a5e24657p-16,
+ 0x1.24eb7254ccf83p-16,
+ 0x1.160f438c70913p-16,
+ 0x1.07ebd2a2d2844p-16,
+ 0x1.f4f12e9ab070ap-17,
+ 0x1.db5ad0b27805cp-17,
+ 0x1.c304efa2c6f4ep-17,
+ 0x1.abe09e9144b5ep-17,
+ 0x1.95df988e76644p-17,
+ 0x1.80f439b4ee04bp-17,
+ 0x1.6d11788a69c64p-17,
+ 0x1.5a2adfa0b4bc4p-17,
+ 0x1.4834877429b8fp-17,
+ 0x1.37231085c7d9ap-17,
+ 0x1.26eb9daed6f7ep-17,
+ 0x1.1783ceac28910p-17,
+ 0x1.08e1badf0fcedp-17,
+ 0x1.f5f7d88472604p-18,
+ 0x1.db92b5212fb8dp-18,
+ 0x1.c282cd3957edap-18,
+ 0x1.aab7abace48dcp-18,
+ 0x1.94219bfcb4928p-18,
+ 0x1.7eb1a2075864dp-18,
+ 0x1.6a597219a93d9p-18,
+ 0x1.570b69502f313p-18,
+ 0x1.44ba864670882p-18,
+ 0x1.335a62115bce2p-18,
+ 0x1.22df298214423p-18,
+ 0x1.133d96ae7e0ddp-18,
+ 0x1.046aeabcfcdecp-18,
+ 0x1.ecb9cfe1d8642p-19,
+ 0x1.d21397ead99cbp-19,
+ 0x1.b8d094c86d374p-19,
+ 0x1.a0df0f0c626dcp-19,
+ 0x1.8a2e269750a39p-19,
+ 0x1.74adc8f4064d3p-19,
+ 0x1.604ea819f007cp-19,
+ 0x1.4d0231928c6f9p-19,
+ 0x1.3aba85fe22e1fp-19,
+ 0x1.296a70f414053p-19,
+ 0x1.1905613b3abf2p-19,
+ 0x1.097f6156f32c5p-19,
+ 0x1.f59a20caf6695p-20,
+ 0x1.d9c73698fb1dcp-20,
+ 0x1.bf716c6168baep-20,
+ 0x1.a6852c6b58392p-20,
+ 0x1.8eefd70594a88p-20,
+ 0x1.789fb715aae95p-20,
+ 0x1.6383f726a8e04p-20,
+ 0x1.4f8c96f26a26ap-20,
+ 0x1.3caa61607f920p-20,
+ 0x1.2acee2f5ecdb8p-20,
+ 0x1.19ec60b1242edp-20,
+ 0x1.09f5cf4dd2877p-20,
+ 0x1.f5bd95d8730d8p-21,
+ 0x1.d9371e2ff7c35p-21,
+ 0x1.be41de54d155ap-21,
+ 0x1.a4c89e08ef4f3p-21,
+ 0x1.8cb738399b12cp-21,
+ 0x1.75fa8dbc84becp-21,
+ 0x1.608078a70dcbcp-21,
+ 0x1.4c37c0394d094p-21,
+ 0x1.39100d5687bfep-21,
+ 0x1.26f9df8519bd6p-21,
+ 0x1.15e6827001f18p-21,
+ 0x1.05c803e4831c1p-21,
+ 0x1.ed22548cffd35p-22,
+ 0x1.d06ad6ecdf971p-22,
+ 0x1.b551c847fbc96p-22,
+ 0x1.9bc09f112b494p-22,
+ 0x1.83a1ff0aa239dp-22,
+ 0x1.6ce1aa3fd7bddp-22,
+ 0x1.576c72b514859p-22,
+ 0x1.43302cc4a0da8p-22,
+ 0x1.301ba221dc9bbp-22,
+ 0x1.1e1e857adc568p-22,
+ 0x1.0d2966b1746f7p-22,
+ 0x1.fa5b4f49cc6b2p-23,
+ 0x1.dc3ae30b55c16p-23,
+ 0x1.bfd7555a3bd68p-23,
+ 0x1.a517d9e61628ap-23,
+ 0x1.8be4f8f6c951fp-23,
+ 0x1.74287ded49339p-23,
+ 0x1.5dcd669f2cd34p-23,
+ 0x1.48bfd38302870p-23,
+ 0x1.34ecf8a3c124ap-23,
+ 0x1.22430f521cbcfp-23,
+ 0x1.10b1488aeb235p-23,
+ 0x1.0027c00a263a6p-23,
+ 0x1.e12ee004efc37p-24,
+ 0x1.c3e44ae32b16bp-24,
+ 0x1.a854ea14102a8p-24,
+ 0x1.8e6761569f45dp-24,
+ 0x1.7603bac345f65p-24,
+ 0x1.5f1353cdad001p-24,
+ 0x1.4980cb3c80949p-24,
+ 0x1.3537f00b6ad4dp-24,
+ 0x1.2225b12bffc68p-24,
+ 0x1.10380e1adb7e9p-24,
+ 0x1.febc107d5efaap-25,
+ 0x1.df0f2a0ee6946p-25,
+ 0x1.c14b2188bcee4p-25,
+ 0x1.a553644f7f07dp-25,
+ 0x1.8b0cfce0579dfp-25,
+ 0x1.725e7c5dd20f7p-25,
+ 0x1.5b2fe547a1340p-25,
+ 0x1.456a974e92e93p-25,
+ 0x1.30f93c3699078p-25,
+ 0x1.1dc7b5b978cf8p-25,
+ 0x1.0bc30c5d52f15p-25,
+ 0x1.f5b2be65a0c7fp-26,
+ 0x1.d5f3a8dea7357p-26,
+ 0x1.b82915b03515bp-26,
+ 0x1.9c3517e789488p-26,
+ 0x1.81fb7df06136ep-26,
+ 0x1.6961b8d641d06p-26,
+ 0x1.524ec4d916caep-26,
+ 0x1.3cab1343d18d1p-26,
+ 0x1.2860757487a01p-26,
+ 0x1.155a09065d4f7p-26,
+ 0x1.0384250e4c9fcp-26,
+ 0x1.e59890b926c78p-27,
+ 0x1.c642116a8a9e3p-27,
+ 0x1.a8e405e651ab6p-27,
+ 0x1.8d5f98114f872p-27,
+ 0x1.7397c5a66e307p-27,
+ 0x1.5b71456c5a4c4p-27,
+ 0x1.44d26de513197p-27,
+ 0x1.2fa31d6371537p-27,
+ 0x1.1bcca373b7b43p-27,
+ 0x1.0939ab853339fp-27,
+ 0x1.efac5187b2863p-28,
+ 0x1.cf1e86235d0e6p-28,
+ 0x1.b0a68a2128babp-28,
+ 0x1.9423165bc4444p-28,
+ 0x1.7974e743dea3cp-28,
+ 0x1.607e9eacd1050p-28,
+ 0x1.4924a74dec728p-28,
+ 0x1.334d19e0c2160p-28,
+ 0x1.1edfa3c5f5ccap-28,
+ 0x1.0bc56f1b54701p-28,
+ 0x1.f3d2185e047d9p-29,
+ 0x1.d26cb87945e87p-29,
+ 0x1.b334fac4b9f99p-29,
+ 0x1.96076f7918d1cp-29,
+ 0x1.7ac2d72fc2c63p-29,
+ 0x1.614801550319ep-29,
+ 0x1.4979ac8b28926p-29,
+ 0x1.333c68e2d0548p-29,
+ 0x1.1e767bce37dd7p-29,
+ 0x1.0b0fc5b6d05a0p-29,
+ 0x1.f1e3523b41d7dp-30,
+ 0x1.d00de6608effep-30,
+ 0x1.b0778b7b3301ap-30,
+ 0x1.92fb04ec0f6cfp-30,
+ 0x1.77756ec9f78fap-30,
+ 0x1.5dc61922d5a06p-30,
+ 0x1.45ce65699ff6dp-30,
+ 0x1.2f71a5f159970p-30,
+ 0x1.1a94ff571654fp-30,
+ 0x1.071f4bbea09ecp-30,
+ 0x1.e9f1ff8ddd774p-31,
+ 0x1.c818223a202c7p-31,
+ 0x1.a887bd2b4404dp-31,
+ 0x1.8b1a336c5eb6bp-31,
+ 0x1.6fab63324088ap-31,
+ 0x1.56197e30205bap-31,
+ 0x1.3e44e45301b92p-31,
+ 0x1.281000bfe4c3fp-31,
+ 0x1.135f28f2d50b4p-31,
+ 0x1.00187dded5975p-31,
+ 0x1.dc479de0ef001p-32,
+ 0x1.bad4fdad3caa1p-32,
+ 0x1.9baed3ed27ab8p-32,
+ 0x1.7ead9ce4285bbp-32,
+ 0x1.63ac6b4edc88ep-32,
+ 0x1.4a88be2a6390cp-32,
+ 0x1.332259185f1a0p-32,
+ 0x1.1d5b1f3793044p-32,
+ 0x1.0916f04b6e18bp-32,
+ 0x1.ec77101de6926p-33,
+ 0x1.c960bf23153e0p-33,
+ 0x1.a8bd20fc65ef7p-33,
+ 0x1.8a61745ec7d1dp-33,
+ 0x1.6e25d0e756261p-33,
+ 0x1.53e4f7d1666cbp-33,
+ 0x1.3b7c27a7ddb0ep-33,
+ 0x1.24caf2c32af14p-33,
+ 0x1.0fb3186804d0fp-33,
+ 0x1.f830c0bb41fd7p-34,
+ 0x1.d3c0f1a91c846p-34,
+ 0x1.b1e5acf351d87p-34,
+ 0x1.92712d259ce66p-34,
+ 0x1.7538c60a04476p-34,
+ 0x1.5a14b04b47879p-34,
+ 0x1.40dfd87456f4cp-34,
+ 0x1.2977b1172b9d5p-34,
+ 0x1.13bc07e891491p-34,
+ 0x1.ff1dbb4300811p-35,
+ 0x1.d9a880f306bd8p-35,
+ 0x1.b6e45220b55e0p-35,
+ 0x1.96a0b33f2c4dap-35,
+ 0x1.78b07e9e924acp-35,
+ 0x1.5ce9ab1670dd2p-35,
+ 0x1.4325167006bb0p-35,
+ 0x1.2b3e53538ff3fp-35,
+ 0x1.15137a7f44864p-35,
+ 0x1.0084ff125639dp-35,
+ 0x1.daeb0b7311ec7p-36,
+ 0x1.b7937d1c40c52p-36,
+ 0x1.96d082f59ab06p-36,
+ 0x1.7872d9fa10aadp-36,
+ 0x1.5c4e8e37bc7d0p-36,
+ 0x1.423ac0df49a40p-36,
+ 0x1.2a117230ad284p-36,
+ 0x1.13af4f04f9998p-36,
+ 0x1.fde703724e560p-37,
+ 0x1.d77f0c82e7641p-37,
+ 0x1.b3ee02611d7ddp-37,
+ 0x1.92ff33023d5bdp-37,
+ 0x1.7481a9e69f53fp-37,
+ 0x1.5847eda620959p-37,
+ 0x1.3e27c1fcc74bdp-37,
+ 0x1.25f9ee0b923dcp-37,
+ 0x1.0f9a0686531ffp-37,
+ 0x1.f5cc7718082afp-38,
+ 0x1.cf7e53d6a2ca5p-38,
+ 0x1.ac0f5f3229372p-38,
+ 0x1.8b498644847eap-38,
+ 0x1.6cfa9bcca59dcp-38,
+ 0x1.50f411d4fd2cdp-38,
+ 0x1.370ab8327af5ep-38,
+ 0x1.1f167f88c6b6ep-38,
+ 0x1.08f24085d4597p-38,
+ 0x1.e8f70e181d619p-39,
+ 0x1.c324c20e337dcp-39,
+ 0x1.a03261574b54ep-39,
+ 0x1.7fe903cdf5855p-39,
+ 0x1.6215c58da3450p-39,
+ 0x1.46897d4b69fc6p-39,
+ 0x1.2d1877d731b7bp-39,
+ 0x1.159a386b11517p-39,
+ 0x1.ffd27ae9393cep-40,
+ 0x1.d7c593130dd0bp-40,
+ 0x1.b2cd607c79bcfp-40,
+ 0x1.90ae4d3405651p-40,
+ 0x1.71312dd1759e2p-40,
+ 0x1.5422ef5d8949dp-40,
+ 0x1.39544b0ecc957p-40,
+ 0x1.20997f73e73ddp-40,
+ 0x1.09ca0eaacd277p-40,
+ 0x1.e9810295890ecp-41,
+ 0x1.c2b45b5aa4a1dp-41,
+ 0x1.9eee068fa7596p-41,
+ 0x1.7df2b399c10a8p-41,
+ 0x1.5f8b87a31bd85p-41,
+ 0x1.4385c96e9a2d9p-41,
+ 0x1.29b2933ef4cbcp-41,
+ 0x1.11e68a6378f8ap-41,
+ 0x1.f7f338086a86bp-42,
+ 0x1.cf8d7d9ce040ap-42,
+ 0x1.aa577251ae484p-42,
+ 0x1.8811d739efb5ep-42,
+ 0x1.68823e52970bep-42,
+ 0x1.4b72ae68e8b4cp-42,
+ 0x1.30b14dbe876bcp-42,
+ 0x1.181012ef86610p-42,
+ 0x1.01647ba798744p-42,
+ 0x1.d90e917701675p-43,
+ 0x1.b2a87e86d0c8ap-43,
+ 0x1.8f53dcb377293p-43,
+ 0x1.6ed2f2515e933p-43,
+ 0x1.50ecc9ed47f19p-43,
+ 0x1.356cd5ce7799ep-43,
+ 0x1.1c229a587ab78p-43,
+ 0x1.04e15ecc7f3f6p-43,
+ 0x1.deffc7e6a6017p-44,
+ 0x1.b7b040832f310p-44,
+ 0x1.938e021f36d76p-44,
+ 0x1.7258610b3b233p-44,
+ 0x1.53d3bfc82a909p-44,
+ 0x1.37c92babdc2fdp-44,
+ 0x1.1e06010120f6ap-44,
+ 0x1.065b9616170d4p-44,
+ 0x1.e13dd96b3753ap-45,
+ 0x1.b950d32467392p-45,
+ 0x1.94a72263259a5p-45,
+ 0x1.72fd93e036cdcp-45,
+ 0x1.54164576929abp-45,
+ 0x1.37b83c521fe96p-45,
+ 0x1.1daf033182e96p-45,
+ 0x1.05ca50205d26ap-45,
+ 0x1.dfbb6235639fap-46,
+ 0x1.b7807e294781fp-46,
+ 0x1.9298add70a734p-46,
+ 0x1.70beaf9c7ffb6p-46,
+ 0x1.51b2cd6709222p-46,
+ 0x1.353a6cf7f7fffp-46,
+ 0x1.1b1fa8cbe84a7p-46,
+ 0x1.0330f0fd69921p-46,
+ 0x1.da81670f96f9bp-47,
+ 0x1.b24a16b4d09aap-47,
+ 0x1.8d6eeb6efdbd6p-47,
+ 0x1.6ba91ac734785p-47,
+ 0x1.4cb7966770ab5p-47,
+ 0x1.305e9721d0981p-47,
+ 0x1.1667311fff70ap-47,
+ 0x1.fd3de10d62855p-48,
+ 0x1.d1aefbcd48d0cp-48,
+ 0x1.a9cc93c25aca9p-48,
+ 0x1.85487ee3ea735p-48,
+ 0x1.63daf8b4b1e0cp-48,
+ 0x1.45421e69a6ca1p-48,
+ 0x1.294175802d99ap-48,
+ 0x1.0fa17bf41068fp-48,
+ 0x1.f05e82aae2bb9p-49,
+ 0x1.c578101b29058p-49,
+ 0x1.9e39dc5dd2f7cp-49,
+ 0x1.7a553a728bbf2p-49,
+ 0x1.5982008db1304p-49,
+ 0x1.3b7e00422e51bp-49,
+ 0x1.200c898d9ee3ep-49,
+ 0x1.06f5f7eb65a56p-49,
+ 0x1.e00e9148a1d25p-50,
+ 0x1.b623734024e92p-50,
+ 0x1.8fd4e01891bf8p-50,
+ 0x1.6cd44c7470d89p-50,
+ 0x1.4cd9c04158cd7p-50,
+ 0x1.2fa34bf5c8344p-50,
+ 0x1.14f4890ff2461p-50,
+ 0x1.f92c49dfa4df5p-51,
+ 0x1.ccaaea71ab0dfp-51,
+ 0x1.a40829f001197p-51,
+ 0x1.7eef13b59e96cp-51,
+ 0x1.5d11e1a252bf5p-51,
+ 0x1.3e296303b2297p-51,
+ 0x1.21f47009f43cep-51,
+ 0x1.083768c5e4541p-51,
+ 0x1.e1777d831265ep-52,
+ 0x1.b69f10b0191b5p-52,
+ 0x1.8f8a3a05b5b52p-52,
+ 0x1.6be573c40c8e7p-52,
+ 0x1.4b645ba991fdbp-52,
+ 0x1.2dc119095729fp-52,
+ },
+};
diff --git a/pl/math/sv_erfc_1u8.c b/pl/math/sv_erfc_1u8.c
new file mode 100644
index 000000000000..a91bef96f2e7
--- /dev/null
+++ b/pl/math/sv_erfc_1u8.c
@@ -0,0 +1,164 @@
+/*
+ * Double-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ uint64_t off_idx, off_arr; /* Offsets applied to the lookup index and to the table base pointer. */
+ double max, shift; /* Clamp bound for |x|, and the shift added to round to a table point. */
+ double p20, p40, p41, p42;
+ double p51, p52;
+ double q5, r5; /* Each (q_i, r_i) pair is loaded together by svld1rq and used as lanes 0 and 1: keep adjacent. */
+ double q6, r6;
+ double q7, r7;
+ double q8, r8;
+ double q9, r9;
+ uint64_t table_scale; /* Bit pattern OR-ed with the sign of x to form the final multiplier. */
+} data = {
+ /* Set an offset so the range of the index used for lookup is 3487, and it
+ can be clamped using a saturated add on an offset index.
+ Index offset is 0xffffffffffffffff - asuint64(shift) - 3487. */
+ .off_idx = 0xbd3ffffffffff260,
+ .off_arr = 0xfffffffffffff260, /* 0xffffffffffffffff - 3487. */
+ .max = 0x1.b3ep+4, /* 3487/128. */
+ .shift = 0x1p45, /* ulp(0x1p45) = 1/128, so adding it rounds to a multiple of 1/128. */
+ .table_scale = 0x37f0000000000000, /* asuint64(0x1p-128). */
+ .p20 = 0x1.5555555555555p-2, /* 1/3, used to compute 2/3 and 1/6. */
+ .p40 = -0x1.999999999999ap-4, /* -1/10. */
+ .p41 = -0x1.999999999999ap-2, /* -2/5. */
+ .p42 = 0x1.1111111111111p-3, /* 2/15. */
+ .p51 = -0x1.c71c71c71c71cp-3, /* -2/9. */
+ .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
+ /* Qi = (i+1) / i, for i = 5, ..., 9. */
+ .q5 = 0x1.3333333333333p0,
+ .q6 = 0x1.2aaaaaaaaaaabp0,
+ .q7 = 0x1.2492492492492p0,
+ .q8 = 0x1.2p0,
+ .q9 = 0x1.1c71c71c71c72p0,
+ /* Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
+ .r5 = -0x1.e79e79e79e79ep-3,
+ .r6 = -0x1.b6db6db6db6dbp-3,
+ .r7 = -0x1.8e38e38e38e39p-3,
+ .r8 = -0x1.6c16c16c16c17p-3,
+ .r9 = -0x1.4f2094f2094f2p-3,
+};
+
+/* Optimized double-precision vector erfc(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+ poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+ - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5
+ + p6(r) d^6 + ... + p10(r) d^10
+
+ Polynomials p6(r) to p10(r) are computed using recurrence relation
+
+ 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0,
+ with p0 = 1, and p1(r) = -r.
+
+ Values of erfc(r) and scale are read from lookup tables. Stored values
+ are scaled to avoid hitting the subnormal range; the final fma scales back by 0x1p-128.
+
+ Note that for x < 0, erfc(x) = 2.0 - erfc(-x), so the series is evaluated at |x|.
+
+ Maximum measured error: 1.71 ULP
+ _ZGVsMxv_erfc(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608
+ want 0x1.e15fcbea3e7adp-608. */
+svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+ svfloat64_t a = svabs_x (pg, x);
+
+ /* Clamp input at |x| <= 3487/128. */
+ a = svmin_x (pg, a, dat->max);
+
+ /* Reduce x to the nearest multiple of 1/128. */
+ svfloat64_t shift = sv_f64 (dat->shift);
+ svfloat64_t z = svadd_x (pg, a, shift);
+
+ /* Saturate index for the NaN case. */
+ svuint64_t i = svqadd (svreinterpret_u64 (z), dat->off_idx);
+
+ /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables; index is doubled (presumably two doubles per tab entry - confirm). */
+ i = svadd_x (pg, i, i);
+ const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr;
+ svfloat64_t erfcr = svld1_gather_index (pg, p, i);
+ svfloat64_t scale = svld1_gather_index (pg, p + 1, i);
+
+ /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
+ svfloat64_t r = svsub_x (pg, z, shift);
+ svfloat64_t d = svsub_x (pg, a, r);
+ svfloat64_t d2 = svmul_x (pg, d, d);
+ svfloat64_t r2 = svmul_x (pg, r, r);
+
+ /* poly (r, d) = 1 - (p1 * d + p2 * d^2 + ... + p10 * d^10): the p_i below are negated, e.g. p1 = r. */
+ svfloat64_t p1 = r;
+ svfloat64_t third = sv_f64 (dat->p20);
+ svfloat64_t twothird = svmul_x (pg, third, 2.0);
+ svfloat64_t sixth = svmul_x (pg, third, 0.5);
+ svfloat64_t p2 = svmls_x (pg, third, r2, twothird);
+ svfloat64_t p3 = svmad_x (pg, r2, third, -0.5);
+ p3 = svmul_x (pg, r, p3);
+ svfloat64_t p4 = svmla_x (pg, sv_f64 (dat->p41), r2, dat->p42);
+ p4 = svmls_x (pg, sv_f64 (dat->p40), r2, p4);
+ svfloat64_t p5 = svmla_x (pg, sv_f64 (dat->p51), r2, dat->p52);
+ p5 = svmla_x (pg, sixth, r2, p5);
+ p5 = svmul_x (pg, r, p5);
+ /* Compute p_i using recurrence relation:
+ p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. Lane 0 of qr_i holds Q_i, lane 1 holds R_i. */
+ svfloat64_t qr5 = svld1rq (svptrue_b64 (), &dat->q5);
+ svfloat64_t qr6 = svld1rq (svptrue_b64 (), &dat->q6);
+ svfloat64_t qr7 = svld1rq (svptrue_b64 (), &dat->q7);
+ svfloat64_t qr8 = svld1rq (svptrue_b64 (), &dat->q8);
+ svfloat64_t qr9 = svld1rq (svptrue_b64 (), &dat->q9);
+ svfloat64_t p6 = svmla_x (pg, p4, p5, svmul_lane (r, qr5, 0));
+ p6 = svmul_lane (p6, qr5, 1);
+ svfloat64_t p7 = svmla_x (pg, p5, p6, svmul_lane (r, qr6, 0));
+ p7 = svmul_lane (p7, qr6, 1);
+ svfloat64_t p8 = svmla_x (pg, p6, p7, svmul_lane (r, qr7, 0));
+ p8 = svmul_lane (p8, qr7, 1);
+ svfloat64_t p9 = svmla_x (pg, p7, p8, svmul_lane (r, qr8, 0));
+ p9 = svmul_lane (p9, qr8, 1);
+ svfloat64_t p10 = svmla_x (pg, p8, p9, svmul_lane (r, qr9, 0));
+ p10 = svmul_lane (p10, qr9, 1);
+ /* Compute polynomial in d using pairwise Horner scheme. */
+ svfloat64_t p90 = svmla_x (pg, p9, d, p10);
+ svfloat64_t p78 = svmla_x (pg, p7, d, p8);
+ svfloat64_t p56 = svmla_x (pg, p5, d, p6);
+ svfloat64_t p34 = svmla_x (pg, p3, d, p4);
+ svfloat64_t p12 = svmla_x (pg, p1, d, p2);
+ svfloat64_t y = svmla_x (pg, p78, d2, p90);
+ y = svmla_x (pg, p56, d2, y);
+ y = svmla_x (pg, p34, d2, y);
+ y = svmla_x (pg, p12, d2, y);
+
+ y = svmls_x (pg, erfcr, scale, svmls_x (pg, d, d2, y)); /* erfcr - scale * (d - d2 * y). */
+
+ /* Offset equals 2.0 if sign, else 0.0. */
+ svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000);
+ svfloat64_t off = svreinterpret_f64 (svlsr_x (pg, sign, 1));
+ /* Handle sign and scale back in a single fma. */
+ svfloat64_t fac = svreinterpret_f64 (svorr_x (pg, sign, dat->table_scale));
+
+ return svmla_x (pg, off, fac, y);
+}
+
+PL_SIG (SV, D, 1, erfc, -6.0, 28.0)
+PL_TEST_ULP (SV_NAME_D1 (erfc), 1.21)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (erfc), 0.0, 0x1p-26, 40000)
+PL_TEST_INTERVAL (SV_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_D1 (erfc), 28.0, inf, 40000)
+PL_TEST_INTERVAL (SV_NAME_D1 (erfc), -6.0, -inf, 40000)
diff --git a/pl/math/sv_erfc_4u.c b/pl/math/sv_erfc_4u.c
deleted file mode 100644
index 076b47129862..000000000000
--- a/pl/math/sv_erfc_4u.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Double-precision SVE erfc(x) function.
- *
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#if SV_SUPPORTED
-#include "sv_exp_tail.h"
-
-sv_f64_t __sv_exp_x (sv_f64_t, svbool_t);
-
-static NOINLINE sv_f64_t
-specialcase (sv_f64_t x, sv_f64_t y, svbool_t special)
-{
- return sv_call_f64 (erfc, x, y, special);
-}
-
-static inline sv_u64_t
-lookup_interval_idx (const svbool_t pg, sv_f64_t abs_x)
-{
- /* Interval index is calculated by (((abs(x) + 1)^4) >> 53) - 1023, bounded by
- the number of polynomials. */
- sv_f64_t xp1 = svadd_n_f64_x (pg, abs_x, 1);
- xp1 = svmul_f64_x (pg, xp1, xp1);
- xp1 = svmul_f64_x (pg, xp1, xp1);
- sv_u64_t interval_idx
- = svsub_n_u64_x (pg, svlsr_n_u64_x (pg, sv_as_u64_f64 (xp1), 52), 1023);
- return svsel_u64 (svcmple_n_u64 (pg, interval_idx, ERFC_NUM_INTERVALS),
- interval_idx, sv_u64 (ERFC_NUM_INTERVALS));
-}
-
-static inline sv_f64_t
-sv_eval_poly (const svbool_t pg, sv_f64_t z, sv_u64_t idx)
-{
- sv_u64_t offset = svmul_n_u64_x (pg, idx, ERFC_POLY_ORDER + 1);
- const double *base = &__v_erfc_data.poly[0][12];
- sv_f64_t r = sv_lookup_f64_x (pg, base, offset);
- for (int i = 0; i < ERFC_POLY_ORDER; i++)
- {
- base--;
- sv_f64_t c = sv_lookup_f64_x (pg, base, offset);
- r = sv_fma_f64_x (pg, z, r, c);
- }
- return r;
-}
-
-static inline sv_f64_t
-sv_eval_gauss (const svbool_t pg, sv_f64_t abs_x)
-{
- /* Accurate evaluation of exp(-x^2). This operation is sensitive to rounding
- errors in x^2, so we compute an estimate for the error and use a custom exp
- helper which corrects for the calculated error estimate. */
- sv_f64_t a2 = svmul_f64_x (pg, abs_x, abs_x);
-
- /* Split abs_x into (a_hi + a_lo), where a_hi is the 'large' component and
- a_lo is the 'small' component. */
- const sv_f64_t scale = sv_f64 (0x1.0000002p27);
- sv_f64_t a_hi = svneg_f64_x (pg, sv_fma_f64_x (pg, scale, abs_x,
- svneg_f64_x (pg, abs_x)));
- a_hi = sv_fma_f64_x (pg, scale, abs_x, a_hi);
- sv_f64_t a_lo = svsub_f64_x (pg, abs_x, a_hi);
-
- sv_f64_t a_hi_neg = svneg_f64_x (pg, a_hi);
- sv_f64_t a_lo_neg = svneg_f64_x (pg, a_lo);
-
- /* We can then estimate the error in abs_x^2 by computing (abs_x * abs_x) -
- (a_hi + a_lo) * (a_hi + a_lo). */
- sv_f64_t e2 = sv_fma_f64_x (pg, a_hi_neg, a_hi, a2);
- e2 = sv_fma_f64_x (pg, a_hi_neg, a_lo, e2);
- e2 = sv_fma_f64_x (pg, a_lo_neg, a_hi, e2);
- e2 = sv_fma_f64_x (pg, a_lo_neg, a_lo, e2);
-
- return sv_exp_tail (pg, svneg_f64_x (pg, a2), e2);
-}
-
-/* Optimized double precision vector complementary error function erfc.
- Maximum measured error is 3.64 ULP:
- __sv_erfc(0x1.4792573ee6cc7p+2) got 0x1.ff3f4c8e200d5p-42
- want 0x1.ff3f4c8e200d9p-42. */
-sv_f64_t
-__sv_erfc_x (sv_f64_t x, const svbool_t pg)
-{
- sv_u64_t ix = sv_as_u64_f64 (x);
- sv_f64_t abs_x = svabs_f64_x (pg, x);
- sv_u64_t atop = svlsr_n_u64_x (pg, sv_as_u64_f64 (abs_x), 52);
-
- /* Outside of the 'interesting' bounds, [-6, 28], +ve goes to 0, -ve goes
- to 2. As long as the polynomial is 0 in the boring zone, we can assemble
- the result correctly. This is dealt with in two ways:
-
- The 'coarse approach' is that the approximation algorithm is
- zero-predicated on in_bounds = |x| < 32, which saves the need to do
- coefficient lookup etc for |x| >= 32.
-
- The coarse approach misses [-32, -6] and [28, 32], which are dealt with in
- the polynomial and index calculation, such that the polynomial evaluates to
- 0 in these regions. */
- /* in_bounds is true for lanes where |x| < 32. */
- svbool_t in_bounds = svcmplt_n_u64 (pg, atop, 0x404);
- /* boring_zone = 2 for x < 0, 0 otherwise. */
- sv_f64_t boring_zone
- = sv_as_f64_u64 (svlsl_n_u64_x (pg, svlsr_n_u64_x (pg, ix, 63), 62));
- /* Very small, nan and inf. */
- svbool_t special_cases
- = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, atop, 0x3cd), 0x432);
-
- /* erfc(|x|) ~= P_i(|x|-x_i)*exp(-x^2)
-
- Where P_i is a polynomial and x_i is an offset, both defined in
- v_erfc_data.c. i is chosen based on which interval x falls in. */
- sv_u64_t i = lookup_interval_idx (in_bounds, abs_x);
- sv_f64_t x_i = sv_lookup_f64_x (in_bounds, __v_erfc_data.interval_bounds, i);
- sv_f64_t p = sv_eval_poly (in_bounds, svsub_f64_x (pg, abs_x, x_i), i);
- /* 'copy' sign of x to p, i.e. negate p if x is negative. */
- sv_u64_t sign = svbic_n_u64_z (in_bounds, ix, 0x7fffffffffffffff);
- p = sv_as_f64_u64 (sveor_u64_z (in_bounds, sv_as_u64_f64 (p), sign));
-
- sv_f64_t e = sv_eval_gauss (in_bounds, abs_x);
-
- /* Assemble result: 2-p*e if x<0, p*e otherwise. No need to conditionally
- select boring_zone because P[V_ERFC_NINTS-1]=0. */
- sv_f64_t y = sv_fma_f64_x (pg, p, e, boring_zone);
-
- if (unlikely (svptest_any (pg, special_cases)))
- {
- return specialcase (x, y, special_cases);
- }
- return y;
-}
-
-PL_ALIAS (__sv_erfc_x, _ZGVsMxv_erfc)
-
-PL_SIG (SV, D, 1, erfc, -4.0, 10.0)
-PL_TEST_ULP (__sv_erfc, 3.15)
-PL_TEST_INTERVAL (__sv_erfc, 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (__sv_erfc, 0x1p-127, 0x1p-26, 40000)
-PL_TEST_INTERVAL (__sv_erfc, -0x1p-127, -0x1p-26, 40000)
-PL_TEST_INTERVAL (__sv_erfc, 0x1p-26, 0x1p5, 40000)
-PL_TEST_INTERVAL (__sv_erfc, -0x1p-26, -0x1p3, 40000)
-PL_TEST_INTERVAL (__sv_erfc, 0, inf, 40000)
-#endif
diff --git a/pl/math/sv_erfcf_1u7.c b/pl/math/sv_erfcf_1u7.c
new file mode 100644
index 000000000000..cda8f0b3752e
--- /dev/null
+++ b/pl/math/sv_erfcf_1u7.c
@@ -0,0 +1,111 @@
+/*
+ * Single-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ uint32_t off_idx, off_arr;
+ float max, shift;
+ float third, two_thirds, two_over_fifteen, two_over_five, tenth;
+} data = {
+ /* Set an offset so the range of the index used for lookup is 644, and it can
+ be clamped using a saturated add. */
+ .off_idx = 0xb7fffd7b, /* 0xffffffff - asuint(shift) - 644. */
+ .off_arr = 0xfffffd7b, /* 0xffffffff - 644. */
+ .max = 10.0625f, /* 644/64. */
+ .shift = 0x1p17f, /* Float spacing at 2^17 is 1/64. */
+ .third = 0x1.555556p-2f, /* 1/3. Lane 0 of the svld1rq at &dat->third. */
+ .two_thirds = 0x1.555556p-1f, /* 2/3. Lane 1; field order matters. */
+ .two_over_fifteen = 0x1.111112p-3f, /* 2/15. Lane 2. */
+ .two_over_five = -0x1.99999ap-2f, /* -2/5: stored negated. */
+ .tenth = -0x1.99999ap-4f, /* -1/10: stored negated. */
+};
+
+#define SignMask 0x80000000
+#define TableScale 0x28000000 /* 0x1p-47. */
+
+/* Optimized single-precision vector erfcf(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/64.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+ poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+
+ Values of erfc(r) and scale are read from lookup tables. Stored values
+ are scaled to avoid hitting the subnormal range.
+
+ Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+
+ Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
+ _ZGVsMxv_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
+ want 0x1.f51216p-120. */
+svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+ svfloat32_t a = svabs_x (pg, x);
+
+ /* Clamp input at |x| <= 10.0 + 4/64. */
+ a = svmin_x (pg, a, dat->max);
+
+ /* Round the clamped |x| to a multiple of 1/64 by adding shift. */
+ svfloat32_t shift = sv_f32 (dat->shift);
+ svfloat32_t z = svadd_x (pg, a, shift);
+
+ /* Saturate index for the NaN case. */
+ svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx);
+
+ /* Gather erfc(r) from p and 2/sqrt(pi)*exp(-r^2) from p + 1, stride 2. */
+ i = svmul_x (pg, i, 2);
+ const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr;
+ svfloat32_t erfcr = svld1_gather_index (pg, p, i);
+ svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
+
+ /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
+ svfloat32_t r = svsub_x (pg, z, shift);
+ svfloat32_t d = svsub_x (pg, a, r);
+ svfloat32_t d2 = svmul_x (pg, d, d);
+ svfloat32_t r2 = svmul_x (pg, r, r);
+
+ svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third);
+ svfloat32_t third = svdup_lane (coeffs, 0);
+
+ svfloat32_t p1 = r; /* poly = 1 - d * (p1 + p2 d + p3 d^2 + p4 d^3). */
+ svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1);
+ svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
+ svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2);
+ p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4);
+
+ svfloat32_t y = svmla_x (pg, p3, d, p4);
+ y = svmla_x (pg, p2, d, y);
+ y = svmla_x (pg, p1, d, y);
+
+ /* Solves the |x| = inf/nan case. */
+ y = svmls_x (pg, erfcr, scale, svmls_x (pg, d, d2, y));
+
+ /* Offset equals 2.0f if sign, else 0.0f. */
+ svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), SignMask);
+ svfloat32_t off = svreinterpret_f32 (svlsr_x (pg, sign, 1));
+ /* fac = +/-TableScale: apply sign and undo table scaling in a single fma. */
+ svfloat32_t fac = svreinterpret_f32 (svorr_x (pg, sign, TableScale));
+
+ return svmla_x (pg, off, fac, y);
+}
+
+PL_SIG (SV, F, 1, erfc, -4.0, 10.0)
+PL_TEST_ULP (SV_NAME_F1 (erfc), 1.14)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erfc), 0.0, 0x1p-26, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (erfc), 10.0625, inf, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (erfc), -4.0, -inf, 40000)
diff --git a/pl/math/sv_erff_1u3.c b/pl/math/sv_erff_1u3.c
deleted file mode 100644
index c7a738c55f7b..000000000000
--- a/pl/math/sv_erff_1u3.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Single-precision vector erf(x) function.
- *
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#if SV_SUPPORTED
-
-#define AbsMask (0x7fffffff)
-
-static NOINLINE sv_f32_t
-__sv_erff_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
-{
- return sv_call_f32 (erff, x, y, cmp);
-}
-
-sv_f32_t __sv_expf_x (svbool_t, sv_f32_t);
-
-/* Optimized single precision vector erf. Worst-case error is 1.25 ULP:
- __sv_erff(0x1.dc59fap-1) got 0x1.9f9c88p-1
- want 0x1.9f9c8ap-1. */
-sv_f32_t
-__sv_erff_x (sv_f32_t x, const svbool_t pg)
-{
- sv_u32_t ix = sv_as_u32_f32 (x);
- sv_u32_t atop = svand_n_u32_x (pg, svlsr_n_u32_x (pg, ix, 16), 0x7fff);
- /* Handle both inf/nan as well as small values (|x|<2^-28). */
- svbool_t cmp
- = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, atop, 0x3180), 0x7ff0 - 0x3180);
-
- sv_u32_t sign = svand_n_u32_x (pg, ix, ~AbsMask);
- /* |x| < 0.921875. */
- svbool_t red = svaclt_n_f32 (pg, x, 0.921875f);
- /* |x| > 4.0. */
- svbool_t bor = svacgt_n_f32 (pg, x, 4.0f);
-
- /* Load polynomial coefficients. */
- sv_u32_t idx_lo = svsel (red, sv_u32 (0), sv_u32 (1));
- sv_u32_t idx_hi = svadd_n_u32_x (pg, idx_lo, 2);
-
- const float *base = (float *) __v_erff_data.coeffs;
- sv_f32_t c_2_5 = svld1rq (svptrue_b32 (), base + 2);
- sv_f32_t c_6_9 = svld1rq (svptrue_b32 (), base + 6);
- sv_f32_t c_10_13 = svld1rq (svptrue_b32 (), base + 10);
-
- /* Do not need to store elem 0 of __v_erff_data as it is not used. */
- sv_f32_t p1 = svtbl (c_2_5, idx_lo);
- sv_f32_t p2 = svtbl (c_2_5, idx_hi);
- sv_f32_t p3 = svtbl (c_6_9, idx_lo);
- sv_f32_t p4 = svtbl (c_6_9, idx_hi);
- sv_f32_t p5 = svtbl (c_10_13, idx_lo);
- sv_f32_t p6 = svtbl (c_10_13, idx_hi);
-
- sv_f32_t a = svabs_f32_x (pg, x);
- /* Square with merging mul - z is x^2 for reduced, |x| otherwise. */
- sv_f32_t z = svmul_f32_m (red, a, a);
-
- /* Evaluate polynomial on |x| or x^2. */
- sv_f32_t r = sv_fma_f32_x (pg, z, p6, p5);
- r = sv_fma_f32_x (pg, z, r, p4);
- r = sv_fma_f32_x (pg, z, r, p3);
- r = sv_fma_f32_x (pg, z, r, p2);
- r = sv_fma_f32_x (pg, z, r, p1);
- /* Use merging svmad for last operation - apply first coefficient if not
- reduced, otherwise r is propagated unchanged. This is because the reduced
- polynomial has lower order than the non-reduced. */
- r = svmad_n_f32_m (svnot_b_z (pg, red), r, z, base[1]);
- r = sv_fma_f32_x (pg, a, r, a);
-
- /* y = |x| + |x| * P(x^2) if |x| < 0.921875
- y = 1 - exp (-(|x| + |x| * P(|x|))) otherwise. */
- sv_f32_t y = __sv_expf_x (pg, svneg_f32_x (pg, r));
- y = svsel_f32 (red, r, svsubr_n_f32_x (pg, y, 1.0));
-
- /* Boring domain (absolute value is required to get the sign of erf(-nan)
- right). */
- y = svsel_f32 (bor, sv_f32 (1.0f), svabs_f32_x (pg, y));
-
- /* y = erf(x) if x>0, -erf(-x) otherwise. */
- y = sv_as_f32_u32 (sveor_u32_x (pg, sv_as_u32_f32 (y), sign));
-
- if (unlikely (svptest_any (pg, cmp)))
- return __sv_erff_specialcase (x, y, cmp);
- return y;
-}
-
-PL_ALIAS (__sv_erff_x, _ZGVsMxv_erff)
-
-PL_SIG (SV, F, 1, erf, -4.0, 4.0)
-PL_TEST_ULP (__sv_erff, 0.76)
-PL_TEST_INTERVAL (__sv_erff, 0, 0x1p-28, 20000)
-PL_TEST_INTERVAL (__sv_erff, 0x1p-28, 1, 60000)
-PL_TEST_INTERVAL (__sv_erff, 1, 0x1p28, 60000)
-PL_TEST_INTERVAL (__sv_erff, 0x1p28, inf, 20000)
-PL_TEST_INTERVAL (__sv_erff, -0, -0x1p-28, 20000)
-PL_TEST_INTERVAL (__sv_erff, -0x1p-28, -1, 60000)
-PL_TEST_INTERVAL (__sv_erff, -1, -0x1p28, 60000)
-PL_TEST_INTERVAL (__sv_erff, -0x1p28, -inf, 20000)
-#endif
diff --git a/pl/math/sv_erff_2u.c b/pl/math/sv_erff_2u.c
new file mode 100644
index 000000000000..adeee798ee2e
--- /dev/null
+++ b/pl/math/sv_erff_2u.c
@@ -0,0 +1,90 @@
+/*
+ * Single-precision vector erf(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float min, max, scale, shift, third;
+} data = {
+ .min = 0x1.cp-7f, /* 1/64 - 1/512. */
+ .max = 3.9375, /* 4 - 8/128. */
+ .scale = 0x1.20dd76p+0f, /* 2/sqrt(pi). */
+ .shift = 0x1p16f, /* Float spacing at 2^16 is 1/128. */
+ .third = 0x1.555556p-2f, /* 1/3. */
+};
+
+#define SignMask (0x80000000)
+
+/* Single-precision implementation of vector erf(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2]
+
+ Values of erf(r) and scale are read from lookup tables.
+ For |x| < 0x1.cp-7, the algorithm sets r = 0, erf(r) = 0, and scale = 2 /
+ sqrt(pi), so it simply boils down to a Taylor series expansion near 0. For
+ |x| > 3.9375, erf(|x|) rounds to 1.0f.
+
+ Maximum error on each interval:
+ - [0, 0x1.cp-7]: 1.93 ULP
+ _ZGVsMxv_erff(0x1.c373e6p-9) got 0x1.fd686cp-9 want 0x1.fd6868p-9
+ - [0x1.cp-7, 4.0]: 1.26 ULP
+ _ZGVsMxv_erff(0x1.1d002ep+0) got 0x1.c4eb9ap-1 want 0x1.c4eb98p-1. */
+svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+ /* |x| > 1/64 - 1/512. */
+ svbool_t a_gt_min = svacgt (pg, x, dat->min);
+
+ /* |x| >= 4.0 - 8/128. */
+ svbool_t a_ge_max = svacge (pg, x, dat->max);
+ svfloat32_t a = svabs_x (pg, x);
+
+ svfloat32_t shift = sv_f32 (dat->shift);
+ svfloat32_t z = svadd_x (pg, a, shift);
+ svuint32_t i
+ = svsub_x (pg, svreinterpret_u32 (z), svreinterpret_u32 (shift));
+
+ /* Saturate lookup index at 512 for |x| >= max. */
+ i = svsel (a_ge_max, sv_u32 (512), i);
+
+ /* r and erf(r) set to 0 for |x| below min. */
+ svfloat32_t r = svsub_z (a_gt_min, z, shift);
+ svfloat32_t erfr = svld1_gather_index (a_gt_min, __sv_erff_data.erf, i);
+
+ /* scale set to 2/sqrt(pi) for |x| below min. */
+ svfloat32_t scale = svld1_gather_index (a_gt_min, __sv_erff_data.scale, i);
+ scale = svsel (a_gt_min, scale, sv_f32 (dat->scale));
+
+ /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */
+ svfloat32_t d = svsub_x (pg, a, r);
+ svfloat32_t d2 = svmul_x (pg, d, d);
+ svfloat32_t y = svmla_x (pg, r, d, dat->third); /* y = r + d/3. */
+ y = svmla_x (pg, erfr, scale, svmls_x (pg, d, d2, y));
+
+ /* Solves the |x| = inf case. */
+ y = svsel (a_ge_max, sv_f32 (1.0f), y);
+
+ /* Copy sign. */
+ svuint32_t ix = svreinterpret_u32 (x);
+ svuint32_t iy = svreinterpret_u32 (y);
+ svuint32_t sign = svand_x (pg, ix, SignMask);
+ return svreinterpret_f32 (svorr_x (pg, sign, iy));
+}
+
+PL_SIG (SV, F, 1, erf, -4.0, 4.0)
+PL_TEST_ULP (SV_NAME_F1 (erf), 1.43)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, 0x1.cp-7, 40000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0x1.cp-7, 3.9375, 40000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 3.9375, inf, 40000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (erf), 0, inf, 4000)
diff --git a/pl/math/sv_erff_data.c b/pl/math/sv_erff_data.c
new file mode 100644
index 000000000000..154d3c188874
--- /dev/null
+++ b/pl/math/sv_erff_data.c
@@ -0,0 +1,1046 @@
+/*
+ * Data for approximation of vector erff.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* Lookup table used in SVE erff.
+ For each possible rounded input r (multiples of 1/128), between
+ r = 0.0 and r = 4.0 (513 values):
+ - __sv_erff_data.erf contains the values of erf(r),
+ - __sv_erff_data.scale contains the values of 2/sqrt(pi)*exp(-r^2).
+ Note that indices 0 and 1 are never hit by the algorithm, since lookup is
+ performed only for |x| > 1/64 - 1/512. */
+const struct sv_erff_data __sv_erff_data = {
+ .erf = { 0x0.000000p+0,
+ 0x1.20dbf4p-7,
+ 0x1.20d770p-6,
+ 0x1.b137e0p-6,
+ 0x1.20c564p-5,
+ 0x1.68e5d4p-5,
+ 0x1.b0fafep-5,
+ 0x1.f902a8p-5,
+ 0x1.207d48p-4,
+ 0x1.44703ep-4,
+ 0x1.68591ap-4,
+ 0x1.8c36bep-4,
+ 0x1.b00812p-4,
+ 0x1.d3cbf8p-4,
+ 0x1.f7815ap-4,
+ 0x1.0d9390p-3,
+ 0x1.1f5e1ap-3,
+ 0x1.311fc2p-3,
+ 0x1.42d7fcp-3,
+ 0x1.548642p-3,
+ 0x1.662a0cp-3,
+ 0x1.77c2d2p-3,
+ 0x1.895010p-3,
+ 0x1.9ad142p-3,
+ 0x1.ac45e4p-3,
+ 0x1.bdad72p-3,
+ 0x1.cf076ep-3,
+ 0x1.e05354p-3,
+ 0x1.f190aap-3,
+ 0x1.015f78p-2,
+ 0x1.09eed6p-2,
+ 0x1.127632p-2,
+ 0x1.1af54ep-2,
+ 0x1.236bf0p-2,
+ 0x1.2bd9dcp-2,
+ 0x1.343ed6p-2,
+ 0x1.3c9aa8p-2,
+ 0x1.44ed18p-2,
+ 0x1.4d35f0p-2,
+ 0x1.5574f4p-2,
+ 0x1.5da9f4p-2,
+ 0x1.65d4b8p-2,
+ 0x1.6df50ap-2,
+ 0x1.760abap-2,
+ 0x1.7e1594p-2,
+ 0x1.861566p-2,
+ 0x1.8e0a02p-2,
+ 0x1.95f336p-2,
+ 0x1.9dd0d2p-2,
+ 0x1.a5a2acp-2,
+ 0x1.ad6896p-2,
+ 0x1.b52264p-2,
+ 0x1.bccfecp-2,
+ 0x1.c47104p-2,
+ 0x1.cc0584p-2,
+ 0x1.d38d44p-2,
+ 0x1.db081cp-2,
+ 0x1.e275eap-2,
+ 0x1.e9d68ap-2,
+ 0x1.f129d4p-2,
+ 0x1.f86faap-2,
+ 0x1.ffa7eap-2,
+ 0x1.03693ap-1,
+ 0x1.06f794p-1,
+ 0x1.0a7ef6p-1,
+ 0x1.0dff50p-1,
+ 0x1.117894p-1,
+ 0x1.14eab4p-1,
+ 0x1.1855a6p-1,
+ 0x1.1bb95cp-1,
+ 0x1.1f15ccp-1,
+ 0x1.226ae8p-1,
+ 0x1.25b8a8p-1,
+ 0x1.28ff02p-1,
+ 0x1.2c3decp-1,
+ 0x1.2f755cp-1,
+ 0x1.32a54cp-1,
+ 0x1.35cdb4p-1,
+ 0x1.38ee8ap-1,
+ 0x1.3c07cap-1,
+ 0x1.3f196ep-1,
+ 0x1.42236ep-1,
+ 0x1.4525c8p-1,
+ 0x1.482074p-1,
+ 0x1.4b1372p-1,
+ 0x1.4dfebap-1,
+ 0x1.50e24cp-1,
+ 0x1.53be26p-1,
+ 0x1.569244p-1,
+ 0x1.595ea6p-1,
+ 0x1.5c2348p-1,
+ 0x1.5ee02ep-1,
+ 0x1.619556p-1,
+ 0x1.6442c0p-1,
+ 0x1.66e86ep-1,
+ 0x1.69865ep-1,
+ 0x1.6c1c98p-1,
+ 0x1.6eab18p-1,
+ 0x1.7131e6p-1,
+ 0x1.73b102p-1,
+ 0x1.762870p-1,
+ 0x1.789836p-1,
+ 0x1.7b0058p-1,
+ 0x1.7d60d8p-1,
+ 0x1.7fb9c0p-1,
+ 0x1.820b12p-1,
+ 0x1.8454d6p-1,
+ 0x1.869712p-1,
+ 0x1.88d1cep-1,
+ 0x1.8b050ep-1,
+ 0x1.8d30dep-1,
+ 0x1.8f5544p-1,
+ 0x1.91724ap-1,
+ 0x1.9387f6p-1,
+ 0x1.959652p-1,
+ 0x1.979d68p-1,
+ 0x1.999d42p-1,
+ 0x1.9b95e8p-1,
+ 0x1.9d8768p-1,
+ 0x1.9f71cap-1,
+ 0x1.a1551ap-1,
+ 0x1.a33162p-1,
+ 0x1.a506b0p-1,
+ 0x1.a6d50cp-1,
+ 0x1.a89c86p-1,
+ 0x1.aa5d26p-1,
+ 0x1.ac16fcp-1,
+ 0x1.adca14p-1,
+ 0x1.af767ap-1,
+ 0x1.b11c3cp-1,
+ 0x1.b2bb68p-1,
+ 0x1.b4540ap-1,
+ 0x1.b5e630p-1,
+ 0x1.b771e8p-1,
+ 0x1.b8f742p-1,
+ 0x1.ba764ap-1,
+ 0x1.bbef10p-1,
+ 0x1.bd61a2p-1,
+ 0x1.bece0ep-1,
+ 0x1.c03464p-1,
+ 0x1.c194b2p-1,
+ 0x1.c2ef08p-1,
+ 0x1.c44376p-1,
+ 0x1.c5920ap-1,
+ 0x1.c6dad2p-1,
+ 0x1.c81de2p-1,
+ 0x1.c95b46p-1,
+ 0x1.ca930ep-1,
+ 0x1.cbc54cp-1,
+ 0x1.ccf20cp-1,
+ 0x1.ce1962p-1,
+ 0x1.cf3b5cp-1,
+ 0x1.d0580cp-1,
+ 0x1.d16f7ep-1,
+ 0x1.d281c4p-1,
+ 0x1.d38ef0p-1,
+ 0x1.d49710p-1,
+ 0x1.d59a34p-1,
+ 0x1.d6986cp-1,
+ 0x1.d791cap-1,
+ 0x1.d8865ep-1,
+ 0x1.d97636p-1,
+ 0x1.da6162p-1,
+ 0x1.db47f4p-1,
+ 0x1.dc29fcp-1,
+ 0x1.dd0788p-1,
+ 0x1.dde0aap-1,
+ 0x1.deb570p-1,
+ 0x1.df85eap-1,
+ 0x1.e0522ap-1,
+ 0x1.e11a3ep-1,
+ 0x1.e1de36p-1,
+ 0x1.e29e22p-1,
+ 0x1.e35a12p-1,
+ 0x1.e41214p-1,
+ 0x1.e4c638p-1,
+ 0x1.e5768cp-1,
+ 0x1.e62322p-1,
+ 0x1.e6cc08p-1,
+ 0x1.e7714ap-1,
+ 0x1.e812fcp-1,
+ 0x1.e8b12ap-1,
+ 0x1.e94be4p-1,
+ 0x1.e9e336p-1,
+ 0x1.ea7730p-1,
+ 0x1.eb07e2p-1,
+ 0x1.eb9558p-1,
+ 0x1.ec1fa2p-1,
+ 0x1.eca6ccp-1,
+ 0x1.ed2ae6p-1,
+ 0x1.edabfcp-1,
+ 0x1.ee2a1ep-1,
+ 0x1.eea556p-1,
+ 0x1.ef1db4p-1,
+ 0x1.ef9344p-1,
+ 0x1.f00614p-1,
+ 0x1.f07630p-1,
+ 0x1.f0e3a6p-1,
+ 0x1.f14e82p-1,
+ 0x1.f1b6d0p-1,
+ 0x1.f21ca0p-1,
+ 0x1.f27ff8p-1,
+ 0x1.f2e0eap-1,
+ 0x1.f33f7ep-1,
+ 0x1.f39bc2p-1,
+ 0x1.f3f5c2p-1,
+ 0x1.f44d88p-1,
+ 0x1.f4a31ep-1,
+ 0x1.f4f694p-1,
+ 0x1.f547f2p-1,
+ 0x1.f59742p-1,
+ 0x1.f5e490p-1,
+ 0x1.f62fe8p-1,
+ 0x1.f67952p-1,
+ 0x1.f6c0dcp-1,
+ 0x1.f7068cp-1,
+ 0x1.f74a6ep-1,
+ 0x1.f78c8cp-1,
+ 0x1.f7cceep-1,
+ 0x1.f80ba2p-1,
+ 0x1.f848acp-1,
+ 0x1.f8841ap-1,
+ 0x1.f8bdf2p-1,
+ 0x1.f8f63ep-1,
+ 0x1.f92d08p-1,
+ 0x1.f96256p-1,
+ 0x1.f99634p-1,
+ 0x1.f9c8a8p-1,
+ 0x1.f9f9bap-1,
+ 0x1.fa2974p-1,
+ 0x1.fa57dep-1,
+ 0x1.fa84fep-1,
+ 0x1.fab0dep-1,
+ 0x1.fadb84p-1,
+ 0x1.fb04f6p-1,
+ 0x1.fb2d40p-1,
+ 0x1.fb5464p-1,
+ 0x1.fb7a6cp-1,
+ 0x1.fb9f60p-1,
+ 0x1.fbc344p-1,
+ 0x1.fbe61ep-1,
+ 0x1.fc07fap-1,
+ 0x1.fc28d8p-1,
+ 0x1.fc48c2p-1,
+ 0x1.fc67bcp-1,
+ 0x1.fc85d0p-1,
+ 0x1.fca2fep-1,
+ 0x1.fcbf52p-1,
+ 0x1.fcdaccp-1,
+ 0x1.fcf576p-1,
+ 0x1.fd0f54p-1,
+ 0x1.fd286ap-1,
+ 0x1.fd40bep-1,
+ 0x1.fd5856p-1,
+ 0x1.fd6f34p-1,
+ 0x1.fd8562p-1,
+ 0x1.fd9ae2p-1,
+ 0x1.fdafb8p-1,
+ 0x1.fdc3e8p-1,
+ 0x1.fdd77ap-1,
+ 0x1.fdea6ep-1,
+ 0x1.fdfcccp-1,
+ 0x1.fe0e96p-1,
+ 0x1.fe1fd0p-1,
+ 0x1.fe3080p-1,
+ 0x1.fe40a6p-1,
+ 0x1.fe504cp-1,
+ 0x1.fe5f70p-1,
+ 0x1.fe6e18p-1,
+ 0x1.fe7c46p-1,
+ 0x1.fe8a00p-1,
+ 0x1.fe9748p-1,
+ 0x1.fea422p-1,
+ 0x1.feb090p-1,
+ 0x1.febc96p-1,
+ 0x1.fec836p-1,
+ 0x1.fed374p-1,
+ 0x1.fede52p-1,
+ 0x1.fee8d4p-1,
+ 0x1.fef2fep-1,
+ 0x1.fefccep-1,
+ 0x1.ff064cp-1,
+ 0x1.ff0f76p-1,
+ 0x1.ff1852p-1,
+ 0x1.ff20e0p-1,
+ 0x1.ff2924p-1,
+ 0x1.ff3120p-1,
+ 0x1.ff38d6p-1,
+ 0x1.ff4048p-1,
+ 0x1.ff4778p-1,
+ 0x1.ff4e68p-1,
+ 0x1.ff551ap-1,
+ 0x1.ff5b90p-1,
+ 0x1.ff61ccp-1,
+ 0x1.ff67d0p-1,
+ 0x1.ff6d9ep-1,
+ 0x1.ff7338p-1,
+ 0x1.ff789ep-1,
+ 0x1.ff7dd4p-1,
+ 0x1.ff82dap-1,
+ 0x1.ff87b2p-1,
+ 0x1.ff8c5cp-1,
+ 0x1.ff90dcp-1,
+ 0x1.ff9532p-1,
+ 0x1.ff9960p-1,
+ 0x1.ff9d68p-1,
+ 0x1.ffa14ap-1,
+ 0x1.ffa506p-1,
+ 0x1.ffa8a0p-1,
+ 0x1.ffac18p-1,
+ 0x1.ffaf6ep-1,
+ 0x1.ffb2a6p-1,
+ 0x1.ffb5bep-1,
+ 0x1.ffb8b8p-1,
+ 0x1.ffbb98p-1,
+ 0x1.ffbe5ap-1,
+ 0x1.ffc102p-1,
+ 0x1.ffc390p-1,
+ 0x1.ffc606p-1,
+ 0x1.ffc862p-1,
+ 0x1.ffcaa8p-1,
+ 0x1.ffccd8p-1,
+ 0x1.ffcef4p-1,
+ 0x1.ffd0fap-1,
+ 0x1.ffd2eap-1,
+ 0x1.ffd4cap-1,
+ 0x1.ffd696p-1,
+ 0x1.ffd84ep-1,
+ 0x1.ffd9f8p-1,
+ 0x1.ffdb90p-1,
+ 0x1.ffdd18p-1,
+ 0x1.ffde90p-1,
+ 0x1.ffdffap-1,
+ 0x1.ffe154p-1,
+ 0x1.ffe2a2p-1,
+ 0x1.ffe3e2p-1,
+ 0x1.ffe514p-1,
+ 0x1.ffe63cp-1,
+ 0x1.ffe756p-1,
+ 0x1.ffe866p-1,
+ 0x1.ffe96ap-1,
+ 0x1.ffea64p-1,
+ 0x1.ffeb54p-1,
+ 0x1.ffec3ap-1,
+ 0x1.ffed16p-1,
+ 0x1.ffedeap-1,
+ 0x1.ffeeb4p-1,
+ 0x1.ffef76p-1,
+ 0x1.fff032p-1,
+ 0x1.fff0e4p-1,
+ 0x1.fff18ep-1,
+ 0x1.fff232p-1,
+ 0x1.fff2d0p-1,
+ 0x1.fff366p-1,
+ 0x1.fff3f6p-1,
+ 0x1.fff480p-1,
+ 0x1.fff504p-1,
+ 0x1.fff582p-1,
+ 0x1.fff5fcp-1,
+ 0x1.fff670p-1,
+ 0x1.fff6dep-1,
+ 0x1.fff74ap-1,
+ 0x1.fff7aep-1,
+ 0x1.fff810p-1,
+ 0x1.fff86cp-1,
+ 0x1.fff8c6p-1,
+ 0x1.fff91cp-1,
+ 0x1.fff96cp-1,
+ 0x1.fff9bap-1,
+ 0x1.fffa04p-1,
+ 0x1.fffa4cp-1,
+ 0x1.fffa90p-1,
+ 0x1.fffad0p-1,
+ 0x1.fffb0ep-1,
+ 0x1.fffb4ap-1,
+ 0x1.fffb82p-1,
+ 0x1.fffbb8p-1,
+ 0x1.fffbecp-1,
+ 0x1.fffc1ep-1,
+ 0x1.fffc4ep-1,
+ 0x1.fffc7ap-1,
+ 0x1.fffca6p-1,
+ 0x1.fffccep-1,
+ 0x1.fffcf6p-1,
+ 0x1.fffd1ap-1,
+ 0x1.fffd3ep-1,
+ 0x1.fffd60p-1,
+ 0x1.fffd80p-1,
+ 0x1.fffda0p-1,
+ 0x1.fffdbep-1,
+ 0x1.fffddap-1,
+ 0x1.fffdf4p-1,
+ 0x1.fffe0ep-1,
+ 0x1.fffe26p-1,
+ 0x1.fffe3ep-1,
+ 0x1.fffe54p-1,
+ 0x1.fffe68p-1,
+ 0x1.fffe7ep-1,
+ 0x1.fffe90p-1,
+ 0x1.fffea2p-1,
+ 0x1.fffeb4p-1,
+ 0x1.fffec4p-1,
+ 0x1.fffed4p-1,
+ 0x1.fffee4p-1,
+ 0x1.fffef2p-1,
+ 0x1.ffff00p-1,
+ 0x1.ffff0cp-1,
+ 0x1.ffff18p-1,
+ 0x1.ffff24p-1,
+ 0x1.ffff30p-1,
+ 0x1.ffff3ap-1,
+ 0x1.ffff44p-1,
+ 0x1.ffff4ep-1,
+ 0x1.ffff56p-1,
+ 0x1.ffff60p-1,
+ 0x1.ffff68p-1,
+ 0x1.ffff70p-1,
+ 0x1.ffff78p-1,
+ 0x1.ffff7ep-1,
+ 0x1.ffff84p-1,
+ 0x1.ffff8cp-1,
+ 0x1.ffff92p-1,
+ 0x1.ffff98p-1,
+ 0x1.ffff9cp-1,
+ 0x1.ffffa2p-1,
+ 0x1.ffffa6p-1,
+ 0x1.ffffacp-1,
+ 0x1.ffffb0p-1,
+ 0x1.ffffb4p-1,
+ 0x1.ffffb8p-1,
+ 0x1.ffffbcp-1,
+ 0x1.ffffc0p-1,
+ 0x1.ffffc4p-1,
+ 0x1.ffffc6p-1,
+ 0x1.ffffcap-1,
+ 0x1.ffffccp-1,
+ 0x1.ffffd0p-1,
+ 0x1.ffffd2p-1,
+ 0x1.ffffd4p-1,
+ 0x1.ffffd6p-1,
+ 0x1.ffffd8p-1,
+ 0x1.ffffdcp-1,
+ 0x1.ffffdep-1,
+ 0x1.ffffdep-1,
+ 0x1.ffffe0p-1,
+ 0x1.ffffe2p-1,
+ 0x1.ffffe4p-1,
+ 0x1.ffffe6p-1,
+ 0x1.ffffe8p-1,
+ 0x1.ffffe8p-1,
+ 0x1.ffffeap-1,
+ 0x1.ffffeap-1,
+ 0x1.ffffecp-1,
+ 0x1.ffffeep-1,
+ 0x1.ffffeep-1,
+ 0x1.fffff0p-1,
+ 0x1.fffff0p-1,
+ 0x1.fffff2p-1,
+ 0x1.fffff2p-1,
+ 0x1.fffff2p-1,
+ 0x1.fffff4p-1,
+ 0x1.fffff4p-1,
+ 0x1.fffff4p-1,
+ 0x1.fffff6p-1,
+ 0x1.fffff6p-1,
+ 0x1.fffff6p-1,
+ 0x1.fffff8p-1,
+ 0x1.fffff8p-1,
+ 0x1.fffff8p-1,
+ 0x1.fffff8p-1,
+ 0x1.fffffap-1,
+ 0x1.fffffap-1,
+ 0x1.fffffap-1,
+ 0x1.fffffap-1,
+ 0x1.fffffap-1,
+ 0x1.fffffap-1,
+ 0x1.fffffcp-1,
+ 0x1.fffffcp-1,
+ 0x1.fffffcp-1,
+ 0x1.fffffcp-1,
+ 0x1.fffffcp-1,
+ 0x1.fffffcp-1,
+ 0x1.fffffcp-1,
+ 0x1.fffffcp-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.fffffep-1,
+ 0x1.000000p+0,
+ 0x1.000000p+0,
+ 0x1.000000p+0,
+ 0x1.000000p+0,
+ 0x1.000000p+0,
+ 0x1.000000p+0,
+ 0x1.000000p+0,
+ 0x1.000000p+0,
+ 0x1.000000p+0,
+ 0x1.000000p+0,
+ 0x1.000000p+0,
+ },
+ .scale = { 0x1.20dd76p+0,
+ 0x1.20d8f2p+0,
+ 0x1.20cb68p+0,
+ 0x1.20b4d8p+0,
+ 0x1.209546p+0,
+ 0x1.206cb4p+0,
+ 0x1.203b26p+0,
+ 0x1.2000a0p+0,
+ 0x1.1fbd28p+0,
+ 0x1.1f70c4p+0,
+ 0x1.1f1b7ap+0,
+ 0x1.1ebd56p+0,
+ 0x1.1e565cp+0,
+ 0x1.1de698p+0,
+ 0x1.1d6e14p+0,
+ 0x1.1cecdcp+0,
+ 0x1.1c62fap+0,
+ 0x1.1bd07cp+0,
+ 0x1.1b3572p+0,
+ 0x1.1a91e6p+0,
+ 0x1.19e5eap+0,
+ 0x1.19318cp+0,
+ 0x1.1874dep+0,
+ 0x1.17aff0p+0,
+ 0x1.16e2d8p+0,
+ 0x1.160da4p+0,
+ 0x1.153068p+0,
+ 0x1.144b3cp+0,
+ 0x1.135e30p+0,
+ 0x1.12695ep+0,
+ 0x1.116cd8p+0,
+ 0x1.1068bap+0,
+ 0x1.0f5d16p+0,
+ 0x1.0e4a08p+0,
+ 0x1.0d2fa6p+0,
+ 0x1.0c0e0ap+0,
+ 0x1.0ae550p+0,
+ 0x1.09b590p+0,
+ 0x1.087ee4p+0,
+ 0x1.07416cp+0,
+ 0x1.05fd3ep+0,
+ 0x1.04b27cp+0,
+ 0x1.036140p+0,
+ 0x1.0209a6p+0,
+ 0x1.00abd0p+0,
+ 0x1.fe8fb0p-1,
+ 0x1.fbbbbep-1,
+ 0x1.f8dc0ap-1,
+ 0x1.f5f0cep-1,
+ 0x1.f2fa4cp-1,
+ 0x1.eff8c4p-1,
+ 0x1.ecec78p-1,
+ 0x1.e9d5a8p-1,
+ 0x1.e6b498p-1,
+ 0x1.e38988p-1,
+ 0x1.e054bep-1,
+ 0x1.dd167cp-1,
+ 0x1.d9cf06p-1,
+ 0x1.d67ea2p-1,
+ 0x1.d32592p-1,
+ 0x1.cfc41ep-1,
+ 0x1.cc5a8ap-1,
+ 0x1.c8e91cp-1,
+ 0x1.c5701ap-1,
+ 0x1.c1efcap-1,
+ 0x1.be6872p-1,
+ 0x1.bada5ap-1,
+ 0x1.b745c6p-1,
+ 0x1.b3aafcp-1,
+ 0x1.b00a46p-1,
+ 0x1.ac63e8p-1,
+ 0x1.a8b828p-1,
+ 0x1.a5074ep-1,
+ 0x1.a1519ep-1,
+ 0x1.9d9762p-1,
+ 0x1.99d8dap-1,
+ 0x1.961650p-1,
+ 0x1.925008p-1,
+ 0x1.8e8646p-1,
+ 0x1.8ab950p-1,
+ 0x1.86e96ap-1,
+ 0x1.8316d6p-1,
+ 0x1.7f41dcp-1,
+ 0x1.7b6abcp-1,
+ 0x1.7791b8p-1,
+ 0x1.73b714p-1,
+ 0x1.6fdb12p-1,
+ 0x1.6bfdf0p-1,
+ 0x1.681ff2p-1,
+ 0x1.644156p-1,
+ 0x1.60625cp-1,
+ 0x1.5c8342p-1,
+ 0x1.58a446p-1,
+ 0x1.54c5a6p-1,
+ 0x1.50e79ep-1,
+ 0x1.4d0a68p-1,
+ 0x1.492e42p-1,
+ 0x1.455366p-1,
+ 0x1.417a0cp-1,
+ 0x1.3da26ep-1,
+ 0x1.39ccc2p-1,
+ 0x1.35f940p-1,
+ 0x1.32281ep-1,
+ 0x1.2e5992p-1,
+ 0x1.2a8dcep-1,
+ 0x1.26c508p-1,
+ 0x1.22ff72p-1,
+ 0x1.1f3d3cp-1,
+ 0x1.1b7e98p-1,
+ 0x1.17c3b6p-1,
+ 0x1.140cc4p-1,
+ 0x1.1059eep-1,
+ 0x1.0cab62p-1,
+ 0x1.09014cp-1,
+ 0x1.055bd6p-1,
+ 0x1.01bb2cp-1,
+ 0x1.fc3ee6p-2,
+ 0x1.f511aap-2,
+ 0x1.edeeeep-2,
+ 0x1.e6d700p-2,
+ 0x1.dfca26p-2,
+ 0x1.d8c8aap-2,
+ 0x1.d1d2d0p-2,
+ 0x1.cae8dap-2,
+ 0x1.c40b08p-2,
+ 0x1.bd3998p-2,
+ 0x1.b674c8p-2,
+ 0x1.afbcd4p-2,
+ 0x1.a911f0p-2,
+ 0x1.a27456p-2,
+ 0x1.9be438p-2,
+ 0x1.9561c8p-2,
+ 0x1.8eed36p-2,
+ 0x1.8886b2p-2,
+ 0x1.822e66p-2,
+ 0x1.7be47ap-2,
+ 0x1.75a91ap-2,
+ 0x1.6f7c6ap-2,
+ 0x1.695e8cp-2,
+ 0x1.634fa6p-2,
+ 0x1.5d4fd4p-2,
+ 0x1.575f34p-2,
+ 0x1.517de6p-2,
+ 0x1.4bac00p-2,
+ 0x1.45e99cp-2,
+ 0x1.4036d0p-2,
+ 0x1.3a93b2p-2,
+ 0x1.350052p-2,
+ 0x1.2f7cc4p-2,
+ 0x1.2a0916p-2,
+ 0x1.24a554p-2,
+ 0x1.1f518ap-2,
+ 0x1.1a0dc6p-2,
+ 0x1.14da0ap-2,
+ 0x1.0fb662p-2,
+ 0x1.0aa2d0p-2,
+ 0x1.059f5ap-2,
+ 0x1.00ac00p-2,
+ 0x1.f79184p-3,
+ 0x1.edeb40p-3,
+ 0x1.e46530p-3,
+ 0x1.daff4ap-3,
+ 0x1.d1b982p-3,
+ 0x1.c893cep-3,
+ 0x1.bf8e1cp-3,
+ 0x1.b6a856p-3,
+ 0x1.ade26cp-3,
+ 0x1.a53c42p-3,
+ 0x1.9cb5bep-3,
+ 0x1.944ec2p-3,
+ 0x1.8c0732p-3,
+ 0x1.83deeap-3,
+ 0x1.7bd5c8p-3,
+ 0x1.73eba4p-3,
+ 0x1.6c2056p-3,
+ 0x1.6473b6p-3,
+ 0x1.5ce596p-3,
+ 0x1.5575c8p-3,
+ 0x1.4e241ep-3,
+ 0x1.46f066p-3,
+ 0x1.3fda6cp-3,
+ 0x1.38e1fap-3,
+ 0x1.3206dcp-3,
+ 0x1.2b48dap-3,
+ 0x1.24a7b8p-3,
+ 0x1.1e233ep-3,
+ 0x1.17bb2cp-3,
+ 0x1.116f48p-3,
+ 0x1.0b3f52p-3,
+ 0x1.052b0cp-3,
+ 0x1.fe6460p-4,
+ 0x1.f2a902p-4,
+ 0x1.e72372p-4,
+ 0x1.dbd32ap-4,
+ 0x1.d0b7a0p-4,
+ 0x1.c5d04ap-4,
+ 0x1.bb1c98p-4,
+ 0x1.b09bfcp-4,
+ 0x1.a64de6p-4,
+ 0x1.9c31c6p-4,
+ 0x1.92470ap-4,
+ 0x1.888d1ep-4,
+ 0x1.7f036cp-4,
+ 0x1.75a960p-4,
+ 0x1.6c7e64p-4,
+ 0x1.6381e2p-4,
+ 0x1.5ab342p-4,
+ 0x1.5211ecp-4,
+ 0x1.499d48p-4,
+ 0x1.4154bcp-4,
+ 0x1.3937b2p-4,
+ 0x1.31458ep-4,
+ 0x1.297dbap-4,
+ 0x1.21df9ap-4,
+ 0x1.1a6a96p-4,
+ 0x1.131e14p-4,
+ 0x1.0bf97ep-4,
+ 0x1.04fc3ap-4,
+ 0x1.fc4b5ep-5,
+ 0x1.eeea8cp-5,
+ 0x1.e1d4d0p-5,
+ 0x1.d508fap-5,
+ 0x1.c885e0p-5,
+ 0x1.bc4a54p-5,
+ 0x1.b05530p-5,
+ 0x1.a4a54ap-5,
+ 0x1.99397ap-5,
+ 0x1.8e109cp-5,
+ 0x1.83298ep-5,
+ 0x1.78832cp-5,
+ 0x1.6e1c58p-5,
+ 0x1.63f3f6p-5,
+ 0x1.5a08e8p-5,
+ 0x1.505a18p-5,
+ 0x1.46e66cp-5,
+ 0x1.3dacd2p-5,
+ 0x1.34ac36p-5,
+ 0x1.2be38cp-5,
+ 0x1.2351c2p-5,
+ 0x1.1af5d2p-5,
+ 0x1.12ceb4p-5,
+ 0x1.0adb60p-5,
+ 0x1.031ad6p-5,
+ 0x1.f7182ap-6,
+ 0x1.e85c44p-6,
+ 0x1.da0006p-6,
+ 0x1.cc0180p-6,
+ 0x1.be5ecep-6,
+ 0x1.b1160ap-6,
+ 0x1.a4255ap-6,
+ 0x1.978ae8p-6,
+ 0x1.8b44e6p-6,
+ 0x1.7f5188p-6,
+ 0x1.73af0cp-6,
+ 0x1.685bb6p-6,
+ 0x1.5d55ccp-6,
+ 0x1.529b9ep-6,
+ 0x1.482b84p-6,
+ 0x1.3e03d8p-6,
+ 0x1.3422fep-6,
+ 0x1.2a875cp-6,
+ 0x1.212f62p-6,
+ 0x1.181984p-6,
+ 0x1.0f443ep-6,
+ 0x1.06ae14p-6,
+ 0x1.fcab14p-7,
+ 0x1.ec7262p-7,
+ 0x1.dcaf36p-7,
+ 0x1.cd5ecap-7,
+ 0x1.be7e5ap-7,
+ 0x1.b00b38p-7,
+ 0x1.a202bep-7,
+ 0x1.94624ep-7,
+ 0x1.87275ep-7,
+ 0x1.7a4f6ap-7,
+ 0x1.6dd7fep-7,
+ 0x1.61beaep-7,
+ 0x1.56011cp-7,
+ 0x1.4a9cf6p-7,
+ 0x1.3f8ff6p-7,
+ 0x1.34d7dcp-7,
+ 0x1.2a727ap-7,
+ 0x1.205dacp-7,
+ 0x1.169756p-7,
+ 0x1.0d1d6ap-7,
+ 0x1.03ede2p-7,
+ 0x1.f60d8ap-8,
+ 0x1.e4cc4ap-8,
+ 0x1.d4143ap-8,
+ 0x1.c3e1a6p-8,
+ 0x1.b430ecp-8,
+ 0x1.a4fe84p-8,
+ 0x1.9646f4p-8,
+ 0x1.8806d8p-8,
+ 0x1.7a3adep-8,
+ 0x1.6cdfccp-8,
+ 0x1.5ff276p-8,
+ 0x1.536fc2p-8,
+ 0x1.4754acp-8,
+ 0x1.3b9e40p-8,
+ 0x1.30499cp-8,
+ 0x1.2553eep-8,
+ 0x1.1aba78p-8,
+ 0x1.107a8cp-8,
+ 0x1.06918cp-8,
+ 0x1.f9f9d0p-9,
+ 0x1.e77448p-9,
+ 0x1.d58da6p-9,
+ 0x1.c4412cp-9,
+ 0x1.b38a3ap-9,
+ 0x1.a36454p-9,
+ 0x1.93cb12p-9,
+ 0x1.84ba30p-9,
+ 0x1.762d84p-9,
+ 0x1.682100p-9,
+ 0x1.5a90b0p-9,
+ 0x1.4d78bcp-9,
+ 0x1.40d564p-9,
+ 0x1.34a306p-9,
+ 0x1.28de12p-9,
+ 0x1.1d8318p-9,
+ 0x1.128ebap-9,
+ 0x1.07fdb4p-9,
+ 0x1.fb99b8p-10,
+ 0x1.e7f232p-10,
+ 0x1.d4fed8p-10,
+ 0x1.c2b9d0p-10,
+ 0x1.b11d70p-10,
+ 0x1.a02436p-10,
+ 0x1.8fc8c8p-10,
+ 0x1.8005f0p-10,
+ 0x1.70d6a4p-10,
+ 0x1.6235fcp-10,
+ 0x1.541f34p-10,
+ 0x1.468daep-10,
+ 0x1.397ceep-10,
+ 0x1.2ce898p-10,
+ 0x1.20cc76p-10,
+ 0x1.15246ep-10,
+ 0x1.09ec86p-10,
+ 0x1.fe41cep-11,
+ 0x1.e97ba4p-11,
+ 0x1.d57f52p-11,
+ 0x1.c245d4p-11,
+ 0x1.afc85ep-11,
+ 0x1.9e0058p-11,
+ 0x1.8ce75ep-11,
+ 0x1.7c7744p-11,
+ 0x1.6caa0ep-11,
+ 0x1.5d79ecp-11,
+ 0x1.4ee142p-11,
+ 0x1.40daa4p-11,
+ 0x1.3360ccp-11,
+ 0x1.266ea8p-11,
+ 0x1.19ff46p-11,
+ 0x1.0e0de8p-11,
+ 0x1.0295f0p-11,
+ 0x1.ef25d4p-12,
+ 0x1.da0110p-12,
+ 0x1.c5b542p-12,
+ 0x1.b23a5ap-12,
+ 0x1.9f8894p-12,
+ 0x1.8d986ap-12,
+ 0x1.7c629ap-12,
+ 0x1.6be022p-12,
+ 0x1.5c0a38p-12,
+ 0x1.4cda54p-12,
+ 0x1.3e4a24p-12,
+ 0x1.305390p-12,
+ 0x1.22f0b4p-12,
+ 0x1.161be4p-12,
+ 0x1.09cfa4p-12,
+ 0x1.fc0d56p-13,
+ 0x1.e577bcp-13,
+ 0x1.cfd4a6p-13,
+ 0x1.bb1a96p-13,
+ 0x1.a74068p-13,
+ 0x1.943d4ap-13,
+ 0x1.8208bcp-13,
+ 0x1.709a8ep-13,
+ 0x1.5feadap-13,
+ 0x1.4ff208p-13,
+ 0x1.40a8c2p-13,
+ 0x1.3207fcp-13,
+ 0x1.2408eap-13,
+ 0x1.16a502p-13,
+ 0x1.09d5f8p-13,
+ 0x1.fb2b7ap-14,
+ 0x1.e3bcf4p-14,
+ 0x1.cd5528p-14,
+ 0x1.b7e946p-14,
+ 0x1.a36eecp-14,
+ 0x1.8fdc1cp-14,
+ 0x1.7d2738p-14,
+ 0x1.6b4702p-14,
+ 0x1.5a329cp-14,
+ 0x1.49e178p-14,
+ 0x1.3a4b60p-14,
+ 0x1.2b6876p-14,
+ 0x1.1d3120p-14,
+ 0x1.0f9e1cp-14,
+ 0x1.02a868p-14,
+ 0x1.ec929ap-15,
+ 0x1.d4f4b4p-15,
+ 0x1.be6abcp-15,
+ 0x1.a8e8ccp-15,
+ 0x1.94637ep-15,
+ 0x1.80cfdcp-15,
+ 0x1.6e2368p-15,
+ 0x1.5c540cp-15,
+ 0x1.4b581cp-15,
+ 0x1.3b2652p-15,
+ 0x1.2bb5ccp-15,
+ 0x1.1cfe02p-15,
+ 0x1.0ef6c4p-15,
+ 0x1.019842p-15,
+ 0x1.e9b5e8p-16,
+ 0x1.d16f58p-16,
+ 0x1.ba4f04p-16,
+ 0x1.a447b8p-16,
+ 0x1.8f4cccp-16,
+ 0x1.7b5224p-16,
+ 0x1.684c22p-16,
+ 0x1.562facp-16,
+ 0x1.44f21ep-16,
+ 0x1.34894ap-16,
+ 0x1.24eb72p-16,
+ 0x1.160f44p-16,
+ 0x1.07ebd2p-16,
+ 0x1.f4f12ep-17,
+ 0x1.db5ad0p-17,
+ 0x1.c304f0p-17,
+ 0x1.abe09ep-17,
+ 0x1.95df98p-17,
+ 0x1.80f43ap-17,
+ 0x1.6d1178p-17,
+ 0x1.5a2ae0p-17,
+ 0x1.483488p-17,
+ 0x1.372310p-17,
+ 0x1.26eb9ep-17,
+ 0x1.1783cep-17,
+ 0x1.08e1bap-17,
+ 0x1.f5f7d8p-18,
+ 0x1.db92b6p-18,
+ 0x1.c282cep-18,
+ 0x1.aab7acp-18,
+ 0x1.94219cp-18,
+ 0x1.7eb1a2p-18,
+ 0x1.6a5972p-18,
+ 0x1.570b6ap-18,
+ 0x1.44ba86p-18,
+ 0x1.335a62p-18,
+ 0x1.22df2ap-18,
+ 0x1.133d96p-18,
+ 0x1.046aeap-18,
+ 0x1.ecb9d0p-19,
+ 0x1.d21398p-19,
+ 0x1.b8d094p-19,
+ 0x1.a0df10p-19,
+ 0x1.8a2e26p-19,
+ 0x1.74adc8p-19,
+ 0x1.604ea8p-19,
+ 0x1.4d0232p-19,
+ 0x1.3aba86p-19,
+ 0x1.296a70p-19,
+ 0x1.190562p-19,
+ 0x1.097f62p-19,
+ 0x1.f59a20p-20,
+ 0x1.d9c736p-20,
+ 0x1.bf716cp-20,
+ 0x1.a6852cp-20,
+ 0x1.8eefd8p-20,
+ 0x1.789fb8p-20,
+ 0x1.6383f8p-20,
+ 0x1.4f8c96p-20,
+ 0x1.3caa62p-20,
+ 0x1.2acee2p-20,
+ 0x1.19ec60p-20,
+ 0x1.09f5d0p-20,
+ 0x1.f5bd96p-21,
+ 0x1.d9371ep-21,
+ 0x1.be41dep-21,
+ 0x1.a4c89ep-21,
+ 0x1.8cb738p-21,
+ 0x1.75fa8ep-21,
+ 0x1.608078p-21,
+ 0x1.4c37c0p-21,
+ 0x1.39100ep-21,
+ 0x1.26f9e0p-21,
+ 0x1.15e682p-21,
+ 0x1.05c804p-21,
+ 0x1.ed2254p-22,
+ 0x1.d06ad6p-22,
+ 0x1.b551c8p-22,
+ 0x1.9bc0a0p-22,
+ 0x1.83a200p-22,
+ 0x1.6ce1aap-22,
+ 0x1.576c72p-22,
+ 0x1.43302cp-22,
+ 0x1.301ba2p-22,
+ 0x1.1e1e86p-22,
+ 0x1.0d2966p-22,
+ 0x1.fa5b50p-23,
+ 0x1.dc3ae4p-23,
+ 0x1.bfd756p-23,
+ 0x1.a517dap-23,
+ 0x1.8be4f8p-23,
+ 0x1.74287ep-23,
+ 0x1.5dcd66p-23,
+ 0x1.48bfd4p-23,
+ 0x1.34ecf8p-23,
+ 0x1.224310p-23,
+ 0x1.10b148p-23,
+ },
+};
diff --git a/pl/math/sv_exp10_1u5.c b/pl/math/sv_exp10_1u5.c
new file mode 100644
index 000000000000..519693afcab0
--- /dev/null
+++ b/pl/math/sv_exp10_1u5.c
@@ -0,0 +1,122 @@
+/*
+ * Double-precision SVE 10^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f64.h"
+
+#define SpecialBound 307.0 /* floor (log10 (2^1023)). */
+
+static const struct data
+{
+ double poly[5];
+ double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound;
+} data = {
+ /* Coefficients generated using Remez algorithm.
+ rel error: 0x1.9fcb9b3p-60
+ abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ]
+ max ulp err 0.52 +0.5. */
+ .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1,
+ 0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 },
+ /* 1.5*2^46+1023. This value is further explained below. */
+ .shift = 0x1.800000000ffc0p+46,
+ .log10_2 = 0x1.a934f0979a371p1, /* log2(10) = 1/log10(2). */
+ .log2_10_hi = 0x1.34413509f79ffp-2, /* log10(2), high part. */
+ .log2_10_lo = -0x1.9dc1da994fd21p-59,
+ .scale_thres = 1280.0,
+ .special_bound = SpecialBound,
+};
+
+#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+/* Update of both special and non-special cases, if any special case is
+ detected. */
+static inline svfloat64_t
+special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
+ const struct data *d)
+{
+ /* s=2^n may overflow, break it up into s=s1*s2,
+ such that exp = s + s*y can be computed as s1*(s2+s2*y)
+ and s1*s1 overflows only if n>0. */
+
+ /* If n<=0 then set b to SpecialOffset (0x6000000000000000), 0 otherwise. */
+ svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
+ svuint64_t b = svdup_u64_z (p_sign, SpecialOffset);
+
+ /* Set s1 to generate overflow depending on sign of exponent n. */
+ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+ /* Offset s to avoid overflow in final result if n is below threshold. */
+ svfloat64_t s2 = svreinterpret_f64 (
+ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
+
+ /* |n| > 1280 => 2^(n) overflows. */
+ svbool_t p_cmp = svacgt (pg, n, d->scale_thres);
+
+ svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r2 = svmla_x (pg, s2, s2, y);
+ svfloat64_t r0 = svmul_x (pg, r2, s1);
+
+ return svsel (p_cmp, r1, r0);
+}
+
+/* Fast vector implementation of exp10 using FEXPA instruction.
+ Maximum measured error is 1.02 ulp.
+ SV_NAME_D1 (exp10)(-0x1.2862fec805e58p+2) got 0x1.885a89551d782p-16
+ want 0x1.885a89551d781p-16. */
+svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t no_big_scale = svacle (pg, x, d->special_bound);
+ svbool_t special = svnot_z (pg, no_big_scale);
+
+ /* n = round(x/(log10(2)/N)). */
+ svfloat64_t shift = sv_f64 (d->shift);
+ svfloat64_t z = svmla_x (pg, shift, x, d->log10_2);
+ svfloat64_t n = svsub_x (pg, z, shift);
+
+ /* r = x - n*log10(2)/N. */
+ svfloat64_t log2_10 = svld1rq (svptrue_b64 (), &d->log2_10_hi);
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, log2_10, 0);
+ r = svmls_lane (r, n, log2_10, 1);
+
+ /* scale = 2^(n/N), computed using FEXPA. FEXPA does not propagate NaNs, so
+ for consistent NaN handling we have to manually propagate them. This
+ comes at significant performance cost. */
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t scale = svexpa (u);
+
+ /* Approximate exp10(r) using polynomial. */
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2,
+ sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1));
+
+ /* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound
+ multiplication may overflow, so use special case routine. */
+ if (unlikely (svptest_any (pg, special)))
+ {
+ /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+ special case function so needs to be copied.
+ e = sign bit of u << 46. */
+ svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
+ /* Copy sign to scale. */
+ scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
+ return special_case (pg, scale, y, n, d);
+ }
+
+ /* No special case. */
+ return svmla_x (pg, scale, scale, y);
+}
+
+PL_SIG (SV, D, 1, exp10, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_D1 (exp10), 0.52)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), 0, 307, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp10), 307, inf, 1000)
diff --git a/pl/math/sv_exp10f_1u5.c b/pl/math/sv_exp10f_1u5.c
new file mode 100644
index 000000000000..9ecde8f1aa52
--- /dev/null
+++ b/pl/math/sv_exp10f_1u5.c
@@ -0,0 +1,87 @@
+/*
+ * Single-precision SVE 10^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "include/mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f32.h"
+
+/* For x < -SpecialBound, the result is subnormal and not handled correctly by
+ FEXPA. */
+#define SpecialBound 37.9
+
+static const struct data
+{
+ float poly[5];
+ float shift, log10_2, log2_10_hi, log2_10_lo, special_bound;
+} data = {
+ /* Coefficients generated using Remez algorithm with minimisation of relative
+ error.
+ rel error: 0x1.89dafa3p-24
+ abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
+ maxerr: 0.52 +0.5 ulp. */
+ .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f,
+ 0x1.12b41ap-1f },
+ /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */
+ .shift = 0x1.903f8p17f,
+ .log10_2 = 0x1.a934fp+1,
+ .log2_10_hi = 0x1.344136p-2,
+ .log2_10_lo = -0x1.ec10cp-27,
+ .special_bound = SpecialBound,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (exp10f, x, y, special);
+}
+
+/* Single-precision SVE exp10f routine. Implements the same algorithm
+ as AdvSIMD exp10f.
+ Worst case error is 1.02 ULPs.
+ _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
+ want 0x1.ba5f9cp-1. */
+svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
+ with 1 + poly(r) in [1/sqrt(2), sqrt(2)] and
+ x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */
+
+ /* Load some constants in quad-word chunks to minimise memory access (last
+ lane is wasted). */
+ svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2);
+
+ /* n = round(x/(log10(2)/N)). */
+ svfloat32_t shift = sv_f32 (d->shift);
+ svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0);
+ svfloat32_t n = svsub_x (pg, z, shift);
+
+ /* r = x - n*log10(2)/N. */
+ svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1);
+ r = svmls_lane (r, n, log10_2_and_inv, 2);
+
+ svbool_t special = svacgt (pg, x, d->special_bound);
+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
+
+ /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */
+ svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t poly
+ = svmla_x (pg, svmul_x (pg, r, d->poly[0]),
+ sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, svmla_x (pg, scale, scale, poly), special);
+
+ return svmla_x (pg, scale, scale, poly);
+}
+
+PL_SIG (SV, F, 1, exp10, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_F1 (exp10), 0.52)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), 0, SpecialBound, 50000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp10), SpecialBound, inf, 50000)
diff --git a/pl/math/sv_exp2_2u.c b/pl/math/sv_exp2_2u.c
new file mode 100644
index 000000000000..dcbca8adddd1
--- /dev/null
+++ b/pl/math/sv_exp2_2u.c
@@ -0,0 +1,107 @@
+/*
+ * Double-precision SVE 2^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+
+#define BigBound 1022
+#define UOFlowBound 1280
+
+static const struct data
+{
+ double poly[4];
+ double shift, big_bound, uoflow_bound;
+} data = {
+ /* Coefficients are computed using Remez algorithm with
+ minimisation of the absolute error. */
+ .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5,
+ 0x1.3b2abf5571ad8p-7 },
+ .shift = 0x1.8p52 / N,
+ .uoflow_bound = UOFlowBound,
+ .big_bound = BigBound,
+};
+
+#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+/* Update of both special and non-special cases, if any special case is
+ detected. */
+static inline svfloat64_t
+special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
+ const struct data *d)
+{
+ /* s=2^n may overflow, break it up into s=s1*s2,
+ such that exp = s + s*y can be computed as s1*(s2+s2*y)
+ and s1*s1 overflows only if n>0. */
+
+ /* If n<=0 then set b to SpecialOffset (0x6000000000000000), 0 otherwise. */
+ svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
+ svuint64_t b = svdup_u64_z (p_sign, SpecialOffset);
+
+ /* Set s1 to generate overflow depending on sign of exponent n. */
+ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+ /* Offset s to avoid overflow in final result if n is below threshold. */
+ svfloat64_t s2 = svreinterpret_f64 (
+ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
+
+ /* |n| > 1280 => 2^(n) overflows. */
+ svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound);
+
+ svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r2 = svmla_x (pg, s2, s2, y);
+ svfloat64_t r0 = svmul_x (pg, r2, s1);
+
+ return svsel (p_cmp, r1, r0);
+}
+
+/* Fast vector implementation of exp2.
+ Maximum measured error is 1.65 ulp.
+ _ZGVsMxv_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1
+ want 0x1.f8db0d4df721dp-1. */
+svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t no_big_scale = svacle (pg, x, d->big_bound);
+ svbool_t special = svnot_z (pg, no_big_scale);
+
+ /* Reduce x to k/N + r, where k is integer and r in [-1/2N, 1/2N]. */
+ svfloat64_t shift = sv_f64 (d->shift);
+ svfloat64_t kd = svadd_x (pg, x, shift);
+ svuint64_t ki = svreinterpret_u64 (kd);
+ /* kd = k/N. */
+ kd = svsub_x (pg, kd, shift);
+ svfloat64_t r = svsub_x (pg, x, kd);
+
+ /* scale ~= 2^(k/N). */
+ svuint64_t idx = svand_x (pg, ki, N - 1);
+ svuint64_t sbits = svld1_gather_index (pg, __v_exp_data, idx);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS);
+ svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top));
+
+ /* Approximate exp2(r) using polynomial. */
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly);
+ svfloat64_t y = svmul_x (pg, r, p);
+
+ /* Assemble exp2(x) = exp2(r) * scale. */
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (pg, scale, y, kd, d);
+ return svmla_x (pg, scale, scale, y);
+}
+
+PL_SIG (SV, D, 1, exp2, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_D1 (exp2), 1.15)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), 0, BigBound, 1000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), BigBound, UOFlowBound, 100000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp2), UOFlowBound, inf, 1000)
diff --git a/pl/math/sv_exp2f_1u6.c b/pl/math/sv_exp2f_1u6.c
new file mode 100644
index 000000000000..9698ff6f0682
--- /dev/null
+++ b/pl/math/sv_exp2f_1u6.c
@@ -0,0 +1,80 @@
+/*
+ * Single-precision SVE 2^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f32.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float poly[5];
+ float shift, thres;
+} data = {
+ /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
+ compatibility with polynomial helpers. */
+ .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f,
+ 0x1.59977ap-10f },
+ /* 1.5*2^17 + 127. */
+ .shift = 0x1.903f8p17f,
+ /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
+ correctly by FEXPA. */
+ .thres = 0x1.5d5e2ap+6f,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (exp2f, x, y, special);
+}
+
+/* Single-precision SVE exp2f routine. Implements the same algorithm
+ as AdvSIMD exp2f.
+ Worst case error is 1.04 ULPs.
+ SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0
+ want 0x1.ba7ebp+0. */
+svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ svfloat32_t shift = sv_f32 (d->shift);
+ svfloat32_t z = svadd_x (pg, x, shift);
+ svfloat32_t n = svsub_x (pg, z, shift);
+ svfloat32_t r = svsub_x (pg, x, n);
+
+ svbool_t special = svacgt (pg, x, d->thres);
+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
+
+ /* Polynomial evaluation: poly(r) ~ exp2(r)-1.
+ Evaluate polynomial using a hybrid scheme - offset ESTRIN by 1 for
+ coefficients 1 to 4, and apply most significant coefficient directly. */
+ svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1);
+ svfloat32_t p0 = svmul_x (pg, r, d->poly[0]);
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, svmla_x (pg, scale, scale, poly), special);
+
+ return svmla_x (pg, scale, scale, poly);
+}
+
+PL_SIG (SV, F, 1, exp2, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_F1 (exp2), 0.55)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 0, Thres, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), Thres, 1, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), 1, Thres, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), Thres, inf, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0, -0x1p-23, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0x1p-23, -1, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -1, -0x1p23, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0x1p23, -inf, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -0, ScaleThres, 40000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), ScaleThres, -1, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), -1, ScaleThres, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (exp2), ScaleThres, -inf, 50000)
diff --git a/pl/math/sv_exp_1u5.c b/pl/math/sv_exp_1u5.c
new file mode 100644
index 000000000000..c187def9e625
--- /dev/null
+++ b/pl/math/sv_exp_1u5.c
@@ -0,0 +1,137 @@
+/*
+ * Double-precision vector e^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ double poly[4];
+ double ln2_hi, ln2_lo, inv_ln2, shift, thres;
+} data = {
+ .poly = { /* ulp error: 0.53. */
+ 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5,
+ 0x1.1111266d28935p-7 },
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ /* 1/ln2. */
+ .inv_ln2 = 0x1.71547652b82fep+0,
+ /* 1.5*2^46+1023. This value is further explained below. */
+ .shift = 0x1.800000000ffc0p+46,
+ .thres = 704.0,
+};
+
+#define C(i) sv_f64 (d->poly[i])
+#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+/* Update of both special and non-special cases, if any special case is
+ detected. */
+static inline svfloat64_t
+special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n)
+{
+ /* s=2^n may overflow, break it up into s=s1*s2,
+ such that exp = s + s*y can be computed as s1*(s2+s2*y)
+ and s1*s1 overflows only if n>0. */
+
+ /* If n<=0 then set b to SpecialOffset (0x6000000000000000), 0 otherwise. */
+ svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
+ svuint64_t b
+ = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
+
+ /* Set s1 to generate overflow depending on sign of exponent n. */
+ svfloat64_t s1 = svreinterpret_f64 (
+ svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */
+ /* Offset s to avoid overflow in final result if n is below threshold. */
+ svfloat64_t s2 = svreinterpret_f64 (
+ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2),
+ b)); /* as_u64 (s) - 0x3010...0 + b. */
+
+ /* |n| > 1280 => 2^(n) overflows. */
+ svbool_t p_cmp = svacgt (pg, n, 1280.0);
+
+ svfloat64_t r1 = svmul_x (pg, s1, s1);
+ svfloat64_t r2 = svmla_x (pg, s2, s2, y);
+ svfloat64_t r0 = svmul_x (pg, r2, s1);
+
+ return svsel (p_cmp, r1, r0);
+}
+
+/* SVE exp algorithm. Maximum measured error is 1.01ulps:
+ SV_NAME_D1 (exp)(0x1.4619d7b04da41p+6) got 0x1.885d9acc41da7p+117
+ want 0x1.885d9acc41da6p+117. */
+svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svbool_t special = svacgt (pg, x, d->thres);
+
+ /* Use a modified version of the shift used for flooring, such that x/ln2 is
+ rounded to a multiple of 2^-6=1/64, shift = 1.5 * 2^52 * 2^-6 = 1.5 *
+ 2^46.
+
+ n is not an integer but can be written as n = m + i/64, with i and m
+ integer, 0 <= i < 64 and m <= n.
+
+ Bits 5:0 of z will be null every time x/ln2 reaches a new integer value
+ (n=m, i=0), and is incremented every time z (or n) is incremented by 1/64.
+ FEXPA expects i in bits 5:0 of the input so it can be used as index into
+ FEXPA hardwired table T[i] = 2^(i/64) for i = 0:63, that will in turn
+ populate the mantissa of the output. Therefore, we use u=asuint(z) as
+ input to FEXPA.
+
+ We add 1023 to the modified shift value in order to set bits 16:6 of u to
+ 1, such that once these bits are moved to the exponent of the output of
+ FEXPA, we get the exponent of 2^n right, i.e. we get 2^m. */
+ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ /* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t r = svmls_lane (x, n, ln2, 0);
+ r = svmls_lane (r, n, ln2, 1);
+
+ /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t p01 = svmla_x (pg, C (0), C (1), r);
+ svfloat64_t p23 = svmla_x (pg, C (2), C (3), r);
+ svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
+ svfloat64_t y = svmla_x (pg, r, p04, r2);
+
+ /* s = 2^n, computed using FEXPA. FEXPA does not propagate NaNs, so for
+ consistent NaN handling we have to manually propagate them. This comes at
+ significant performance cost. */
+ svfloat64_t s = svexpa (u);
+
+ /* Assemble result as exp(x) = 2^n * exp(r). If |x| > Thresh the
+ multiplication may overflow, so use special case routine. */
+
+ if (unlikely (svptest_any (pg, special)))
+ {
+ /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+ special case function so needs to be copied.
+ e = sign bit of u << 46. */
+ svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
+ /* Copy sign to s. */
+ s = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (s)));
+ return special_case (pg, s, y, n);
+ }
+
+ /* No special case. */
+ return svmla_x (pg, s, s, y);
+}
+
+PL_SIG (SV, D, 1, exp, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_D1 (exp), 1.46)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0, 0x1p-23, 40000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p-23, 1, 50000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 1, 0x1p23, 50000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (exp), 0x1p23, inf, 50000)
diff --git a/pl/math/sv_exp_tail.h b/pl/math/sv_exp_tail.h
deleted file mode 100644
index 9b739da9d82a..000000000000
--- a/pl/math/sv_exp_tail.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Double-precision SVE e^(x+tail) function.
- *
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#ifndef SV_EXP_TAIL_H
-#define SV_EXP_TAIL_H
-
-#include "sv_math.h"
-#if SV_SUPPORTED
-
-#include "v_exp_tail.h"
-
-#define C1 sv_f64 (C1_scal)
-#define C2 sv_f64 (C2_scal)
-#define C3 sv_f64 (C3_scal)
-#define MinusLn2hi (-Ln2hi_scal)
-#define MinusLn2lo (-Ln2lo_scal)
-
-#define N (1 << V_EXP_TAIL_TABLE_BITS)
-#define Tab __v_exp_tail_data
-#define IndexMask (N - 1)
-#define Shift sv_f64 (0x1.8p+52)
-#define Thres 704.0
-
-static inline sv_f64_t
-sv_exp_tail_special_case (svbool_t pg, sv_f64_t s, sv_f64_t y, sv_f64_t n)
-{
- sv_f64_t absn = svabs_f64_x (pg, n);
-
- /* 2^(n/N) may overflow, break it up into s1*s2. */
- sv_u64_t b = svsel_u64 (svcmple_n_f64 (pg, n, 0), sv_u64 (0x6000000000000000),
- sv_u64 (0));
- sv_f64_t s1 = sv_as_f64_u64 (svsubr_n_u64_x (pg, b, 0x7000000000000000));
- sv_f64_t s2 = sv_as_f64_u64 (
- svadd_u64_x (pg, svsub_n_u64_x (pg, sv_as_u64_f64 (s), 0x3010000000000000),
- b));
-
- svbool_t cmp = svcmpgt_n_f64 (pg, absn, 1280.0 * N);
- sv_f64_t r1 = svmul_f64_x (pg, s1, s1);
- sv_f64_t r0 = svmul_f64_x (pg, sv_fma_f64_x (pg, y, s2, s2), s1);
- return svsel_f64 (cmp, r1, r0);
-}
-
-static inline sv_f64_t
-sv_exp_tail (const svbool_t pg, sv_f64_t x, sv_f64_t xtail)
-{
- /* Calculate exp(x + xtail). */
- sv_f64_t z = sv_fma_n_f64_x (pg, InvLn2_scal, x, Shift);
- sv_f64_t n = svsub_f64_x (pg, z, Shift);
-
- sv_f64_t r = sv_fma_n_f64_x (pg, MinusLn2hi, n, x);
- r = sv_fma_n_f64_x (pg, MinusLn2lo, n, r);
-
- sv_u64_t u = sv_as_u64_f64 (z);
- sv_u64_t e = svlsl_n_u64_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
- sv_u64_t i = svand_n_u64_x (pg, u, IndexMask);
-
- sv_f64_t y = sv_fma_f64_x (pg, C3, r, C2);
- y = sv_fma_f64_x (pg, y, r, C1);
- y = sv_fma_f64_x (pg, y, r, sv_f64 (1.0));
- y = sv_fma_f64_x (pg, y, r, xtail);
-
- /* s = 2^(n/N). */
- u = sv_lookup_u64_x (pg, Tab, i);
- sv_f64_t s = sv_as_f64_u64 (svadd_u64_x (pg, u, e));
-
- svbool_t cmp = svcmpgt_n_f64 (pg, svabs_f64_x (pg, x), Thres);
- if (unlikely (svptest_any (pg, cmp)))
- {
- return sv_exp_tail_special_case (pg, s, y, n);
- }
- return sv_fma_f64_x (pg, y, s, s);
-}
-
-#endif
-#endif
diff --git a/pl/math/sv_expf_2u.c b/pl/math/sv_expf_2u.c
index 87fbe45df5fd..93d705ce420a 100644
--- a/pl/math/sv_expf_2u.c
+++ b/pl/math/sv_expf_2u.c
@@ -9,148 +9,78 @@
#include "pl_sig.h"
#include "pl_test.h"
-#if SV_SUPPORTED
-
-#define C(i) __sv_expf_poly[i]
-
-#define InvLn2 (0x1.715476p+0f)
-#define Ln2hi (0x1.62e4p-1f)
-#define Ln2lo (0x1.7f7d1cp-20f)
-
-#if SV_EXPF_USE_FEXPA
-
-#define Shift (0x1.903f8p17f) /* 1.5*2^17 + 127. */
-#define Thres \
- (0x1.5d5e2ap+6f) /* Roughly 87.3. For x < -Thres, the result is subnormal \
- and not handled correctly by FEXPA. */
-
-static NOINLINE sv_f32_t
-special_case (sv_f32_t x, sv_f32_t y, svbool_t special)
+static const struct data
+{
+ float poly[5];
+ float inv_ln2, ln2_hi, ln2_lo, shift, thres;
+} data = {
+ /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
+ compatibility with polynomial helpers. */
+ .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f,
+ 0x1.0e4020p-7f },
+ .inv_ln2 = 0x1.715476p+0f,
+ .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
+ /* 1.5*2^17 + 127. */
+ .shift = 0x1.903f8p17f,
+ /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
+ correctly by FEXPA. */
+ .thres = 0x1.5d5e2ap+6f,
+};
+
+#define C(i) sv_f32 (d->poly[i])
+#define ExponentBias 0x3f800000
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
{
- /* The special-case handler from the Neon routine does not handle subnormals
- in a way that is compatible with FEXPA. For the FEXPA variant we just fall
- back to scalar expf. */
return sv_call_f32 (expf, x, y, special);
}
-#else
-
-#define Shift (0x1.8p23f) /* 1.5 * 2^23. */
-#define Thres (126.0f)
-
-/* Special-case handler adapted from Neon variant. Uses s, y and n to produce
- the final result (normal cases included). It performs an update of all lanes!
- Therefore:
- - all previous computation need to be done on all lanes indicated by input
- pg
- - we cannot simply apply the special case to the special-case-activated
- lanes. Besides it is likely that this would not increase performance (no
- scatter/gather). */
-static inline sv_f32_t
-specialcase (svbool_t pg, sv_f32_t poly, sv_f32_t n, sv_u32_t e,
- svbool_t p_cmp1, sv_f32_t scale)
+/* Optimised single-precision SVE exp function.
+ Worst-case error is 1.04 ulp:
+ SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4
+ want 0x1.ba74bap+4. */
+svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
{
- /* s=2^(n/N) may overflow, break it up into s=s1*s2,
- such that exp = s + s*y can be computed as s1*(s2+s2*y)
- and s1*s1 overflows only if n>0. */
-
- /* If n<=0 then set b to 0x820...0, 0 otherwise. */
- svbool_t p_sign = svcmple_n_f32 (pg, n, 0.0f); /* n <= 0. */
- sv_u32_t b
- = svdup_n_u32_z (p_sign, 0x82000000); /* Inactive lanes set to 0. */
-
- /* Set s1 to generate overflow depending on sign of exponent n. */
- sv_f32_t s1
- = sv_as_f32_u32 (svadd_n_u32_x (pg, b, 0x7f000000)); /* b + 0x7f000000. */
- /* Offset s to avoid overflow in final result if n is below threshold. */
- sv_f32_t s2 = sv_as_f32_u32 (
- svsub_u32_x (pg, e, b)); /* as_u32 (s) - 0x3010...0 + b. */
-
- /* |n| > 192 => 2^(n/N) overflows. */
- svbool_t p_cmp2 = svacgt_n_f32 (pg, n, 192.0f);
+ const struct data *d = ptr_barrier (&data);
- sv_f32_t r2 = svmul_f32_x (pg, s1, s1);
- sv_f32_t r1 = sv_fma_f32_x (pg, poly, s2, s2);
- r1 = svmul_f32_x (pg, r1, s1);
- sv_f32_t r0 = sv_fma_f32_x (pg, poly, scale, scale);
-
- /* Apply condition 1 then 2.
- Returns r2 if cond2 is true, otherwise
- if cond1 is true then return r1, otherwise return r0. */
- sv_f32_t r = svsel_f32 (p_cmp1, r1, r0);
-
- return svsel_f32 (p_cmp2, r2, r);
-}
-
-#endif
-
-/* Optimised single-precision SVE exp function. By default this is an SVE port
- of the Neon algorithm from math/. Alternatively, enable a modification of
- that algorithm that looks up scale using SVE FEXPA instruction with
- SV_EXPF_USE_FEXPA.
-
- Worst-case error of the default algorithm is 1.95 ulp:
- __sv_expf(-0x1.4cb74ap+2) got 0x1.6a022cp-8
- want 0x1.6a023p-8.
-
- Worst-case error when using FEXPA is 1.04 ulp:
- __sv_expf(0x1.a8eda4p+1) got 0x1.ba74bcp+4
- want 0x1.ba74bap+4. */
-sv_f32_t
-__sv_expf_x (sv_f32_t x, const svbool_t pg)
-{
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ /* Load some constants in quad-word chunks to minimise memory access (last
+ lane is wasted). */
+ svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2);
+
/* n = round(x/(ln2/N)). */
- sv_f32_t z = sv_fma_n_f32_x (pg, InvLn2, x, sv_f32 (Shift));
- sv_f32_t n = svsub_n_f32_x (pg, z, Shift);
+ svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0);
+ svfloat32_t n = svsub_x (pg, z, d->shift);
/* r = x - n*ln2/N. */
- sv_f32_t r = sv_fma_n_f32_x (pg, -Ln2hi, n, x);
- r = sv_fma_n_f32_x (pg, -Ln2lo, n, r);
+ svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1);
+ r = svmls_lane (r, n, invln2_and_ln2, 2);
-/* scale = 2^(n/N). */
-#if SV_EXPF_USE_FEXPA
- /* NaNs also need special handling with FEXPA. */
- svbool_t is_special_case
- = svorr_b_z (pg, svacgt_n_f32 (pg, x, Thres), svcmpne_f32 (pg, x, x));
- sv_f32_t scale = svexpa_f32 (sv_as_u32_f32 (z));
-#else
- sv_u32_t e = svlsl_n_u32_x (pg, sv_as_u32_f32 (z), 23);
- svbool_t is_special_case = svacgt_n_f32 (pg, n, Thres);
- sv_f32_t scale = sv_as_f32_u32 (svadd_n_u32_x (pg, e, 0x3f800000));
-#endif
+ /* scale = 2^(n/N). */
+ svbool_t is_special_case = svacgt (pg, x, d->thres);
+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
- /* y = exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
- sv_f32_t r2 = svmul_f32_x (pg, r, r);
- sv_f32_t p = sv_fma_n_f32_x (pg, C (0), r, sv_f32 (C (1)));
- sv_f32_t q = sv_fma_n_f32_x (pg, C (2), r, sv_f32 (C (3)));
- q = sv_fma_f32_x (pg, p, r2, q);
- p = svmul_n_f32_x (pg, r, C (4));
- sv_f32_t poly = sv_fma_f32_x (pg, q, r2, p);
+ /* y = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */
+ svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
+ svfloat32_t p34 = svmla_x (pg, C (3), C (4), r);
+ svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
+ svfloat32_t p0 = svmul_x (pg, r, C (0));
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
if (unlikely (svptest_any (pg, is_special_case)))
-#if SV_EXPF_USE_FEXPA
- return special_case (x, sv_fma_f32_x (pg, poly, scale, scale),
- is_special_case);
-#else
- return specialcase (pg, poly, n, e, is_special_case, scale);
-#endif
+ return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case);
- return sv_fma_f32_x (pg, poly, scale, scale);
+ return svmla_x (pg, scale, scale, poly);
}
-PL_ALIAS (__sv_expf_x, _ZGVsMxv_expf)
-
PL_SIG (SV, F, 1, exp, -9.9, 9.9)
-PL_TEST_ULP (__sv_expf, 1.46)
-PL_TEST_INTERVAL (__sv_expf, 0, 0x1p-23, 40000)
-PL_TEST_INTERVAL (__sv_expf, 0x1p-23, 1, 50000)
-PL_TEST_INTERVAL (__sv_expf, 1, 0x1p23, 50000)
-PL_TEST_INTERVAL (__sv_expf, 0x1p23, inf, 50000)
-PL_TEST_INTERVAL (__sv_expf, -0, -0x1p-23, 40000)
-PL_TEST_INTERVAL (__sv_expf, -0x1p-23, -1, 50000)
-PL_TEST_INTERVAL (__sv_expf, -1, -0x1p23, 50000)
-PL_TEST_INTERVAL (__sv_expf, -0x1p23, -inf, 50000)
-#endif // SV_SUPPORTED
+PL_TEST_ULP (SV_NAME_F1 (exp), 0.55)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0, 0x1p-23, 40000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0x1p-23, 1, 50000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 1, 0x1p23, 50000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (exp), 0x1p23, inf, 50000)
diff --git a/pl/math/sv_expf_data.c b/pl/math/sv_expf_data.c
deleted file mode 100644
index 6875adf857b6..000000000000
--- a/pl/math/sv_expf_data.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Coefficients for single-precision vector e^x function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-/* Coefficients copied from the polynomial in math/v_expf.c. */
-const float __sv_expf_poly[] = {0x1.0e4020p-7f, 0x1.573e2ep-5f, 0x1.555e66p-3f,
- 0x1.fffdb6p-2f, 0x1.ffffecp-1f};
diff --git a/pl/math/sv_expf_inline.h b/pl/math/sv_expf_inline.h
new file mode 100644
index 000000000000..0ef4e0fda946
--- /dev/null
+++ b/pl/math/sv_expf_inline.h
@@ -0,0 +1,66 @@
+/*
+ * SVE helper for single-precision routines which calculate exp(x) and do
+ * not need special-case handling
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_SV_EXPF_INLINE_H
+#define PL_MATH_SV_EXPF_INLINE_H
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+struct sv_expf_data
+{
+ float poly[5];
+ float inv_ln2, ln2_hi, ln2_lo, shift;
+};
+
+/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
+ compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
+#define SV_EXPF_DATA \
+ { \
+ .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \
+ 0x1.0e4020p-7f }, \
+ \
+ .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
+ .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
+ }
+
+#define C(i) sv_f32 (d->poly[i])
+
+static inline svfloat32_t
+expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
+{
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+
+ /* Load some constants in quad-word chunks to minimise memory access. */
+ svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]);
+
+ /* n = round(x/(ln2/N)). */
+ svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1);
+ svfloat32_t n = svsub_x (pg, z, d->shift);
+
+ /* r = x - n*ln2/N. */
+ svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2);
+ r = svmls_lane (r, n, c4_invln2_and_ln2, 3);
+
+ /* scale = 2^(n/N). */
+ svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z));
+
+ /* y = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */
+ svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
+ svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0);
+ svfloat32_t r2 = svmul_f32_x (pg, r, r);
+ svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
+ svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+
+ return svmla_x (pg, scale, scale, poly);
+}
+
+#endif // PL_MATH_SV_EXPF_INLINE_H
\ No newline at end of file
diff --git a/pl/math/sv_expm1_2u5.c b/pl/math/sv_expm1_2u5.c
new file mode 100644
index 000000000000..82a31f6d9c0e
--- /dev/null
+++ b/pl/math/sv_expm1_2u5.c
@@ -0,0 +1,95 @@
+/*
+ * Double-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define SpecialBound 0x1.62b7d369a5aa9p+9
+#define ExponentBias 0x3ff0000000000000
+
+static const struct data
+{
+ double poly[11];
+ double shift, inv_ln2, special_bound;
+ /* To be loaded in one quad-word. */
+ double ln2_hi, ln2_lo;
+} data = {
+ /* Generated using fpminimax. */
+ .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
+ 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13,
+ 0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
+ 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
+
+ .special_bound = SpecialBound,
+ .inv_ln2 = 0x1.71547652b82fep0,
+ .ln2_hi = 0x1.62e42fefa39efp-1,
+ .ln2_lo = 0x1.abc9e3b39803fp-56,
+ .shift = 0x1.8p52,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t pg)
+{
+ return sv_call_f64 (expm1, x, y, pg);
+}
+
+/* Double-precision vector exp(x) - 1 function.
+ The maximum observed error is 2.18 ULP:
+ _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
+ want 0x1.a8b9ea8d66e2p-2. */
+svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Large, Nan/Inf. */
+ svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ svfloat64_t shift = sv_f64 (d->shift);
+ svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift);
+ svint64_t i = svcvt_s64_x (pg, n);
+ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+ svfloat64_t f = svmls_lane (x, n, ln2, 0);
+ f = svmls_lane (f, n, ln2, 1);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t f4 = svmul_x (pg, f2, f2);
+ svfloat64_t f8 = svmul_x (pg, f4, f4);
+ svfloat64_t p
+ = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias);
+ svfloat64_t t = svreinterpret_f64 (u);
+
+ /* expm1(x) ~= p * t + (t - 1). */
+ svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, y, special);
+
+ return y;
+}
+
+PL_SIG (SV, D, 1, expm1, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_D1 (expm1), 1.68)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0, 0x1p-23, 1000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), 0x1p-23, SpecialBound, 200000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (expm1), SpecialBound, inf, 1000)
diff --git a/pl/math/sv_expm1f_1u6.c b/pl/math/sv_expm1f_1u6.c
new file mode 100644
index 000000000000..0ec7c00f5300
--- /dev/null
+++ b/pl/math/sv_expm1f_1u6.c
@@ -0,0 +1,93 @@
+/*
+ * Single-precision vector exp(x) - 1 function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* Bound on |x|: lanes at or above it (or NaN) trigger the scalar fallback. */
+#define SpecialBound 0x1.5ebc4p+6f
+
+static const struct data
+{
+ /* These 4 are grouped together so they can be loaded as one quadword, then
+ used with _lane forms of svmla/svmls. */
+ float c2, c4, ln2_hi, ln2_lo;
+ float c0, c1, c3, inv_ln2, special_bound, shift;
+} data = {
+ /* Generated using fpminimax. */
+ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3,
+ .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7,
+ .c4 = 0x1.6b55a2p-10,
+
+ .special_bound = SpecialBound, .shift = 0x1.8p23f,
+ .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
+};
+
+#define C(i) sv_f32 (d->c##i)
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t pg)
+{
+ return sv_call_f32 (expm1f, x, x, pg);
+}
+
+/* Single-precision SVE exp(x) - 1. Maximum error is 1.52 ULP:
+ _ZGVsMxv_expm1f(0x1.8f4ebcp-2) got 0x1.e859dp-2
+ want 0x1.e859d4p-2. */
+svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Large, NaN/Inf. */
+ svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, pg);
+
+ /* This vector is reliant on layout of data - it contains constants
+ that can be used with _lane forms of svmla/svmls. Values are:
+ [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */
+ svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2);
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
+ j = svsub_x (pg, j, d->shift);
+ svint32_t i = svcvt_s32_x (pg, j);
+
+ svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
+ f = svmls_lane (f, j, lane_constants, 3);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (pg, f, f);
+ svfloat32_t p = svmla_x (pg, p12, f2, p34);
+ p = svmla_x (pg, C (0), f, p);
+ p = svmla_x (pg, f, f2, p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ svfloat32_t t = svreinterpret_f32 (
+ svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000));
+ return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+}
+
+PL_SIG (SV, F, 1, expm1, -9.9, 9.9)
+PL_TEST_ULP (SV_NAME_F1 (expm1), 1.02)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), 0, SpecialBound, 100000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (expm1), SpecialBound, inf, 1000)
diff --git a/pl/math/sv_expm1f_inline.h b/pl/math/sv_expm1f_inline.h
new file mode 100644
index 000000000000..a6e2050ff4a6
--- /dev/null
+++ b/pl/math/sv_expm1f_inline.h
@@ -0,0 +1,73 @@
+/*
+ * SVE helper for single-precision routines which calculate exp(x) - 1 and do
+ * not need special-case handling
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_SV_EXPM1F_INLINE_H
+#define PL_MATH_SV_EXPM1F_INLINE_H
+
+#include "sv_math.h"
+
+struct sv_expm1f_data
+{
+ /* These 4 are grouped together so they can be loaded as one quadword, then
+ used with _lane forms of svmla/svmls. */
+ float32_t c2, c4, ln2_hi, ln2_lo;
+ float32_t c0, c1, c3, inv_ln2, shift;
+};
+
+/* Coefficients generated using fpminimax. */
+#define SV_EXPM1F_DATA \
+ { \
+ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \
+ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ \
+ .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
+ .ln2_lo = 0x1.7f7d1cp-20f, \
+ }
+
+#define C(i) sv_f32 (d->c##i)
+
+static inline svfloat32_t
+expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d)
+{
+ /* This vector is reliant on layout of data - it contains constants
+ that can be used with _lane forms of svmla/svmls. Values are:
+ [ coeff_2, coeff_4, ln2_hi, ln2_lo ]. */
+ svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2);
+
+ /* Reduce argument to smaller range:
+ Let i = round(x / ln2)
+ and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where 2^i is exact because i is an integer. */
+ svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
+ j = svsub_x (pg, j, d->shift);
+ svint32_t i = svcvt_s32_x (pg, j);
+
+ svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
+ f = svmls_lane (f, j, lane_constants, 3);
+
+ /* Approximate expm1(f) using polynomial.
+ Taylor expansion for expm1(x) has the form:
+ x + ax^2 + bx^3 + cx^4 ....
+ So we calculate the polynomial P(f) = a + bf + cf^2 + ...
+ and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
+ svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
+ svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
+ svfloat32_t f2 = svmul_x (pg, f, f);
+ svfloat32_t p = svmla_x (pg, p12, f2, p34);
+ p = svmla_x (pg, C (0), f, p);
+ p = svmla_x (pg, f, f2, p);
+
+ /* Assemble the result.
+ expm1(x) ~= 2^i * (p + 1) - 1
+ Let t = 2^i. */
+ svfloat32_t t = svscale_x (pg, sv_f32 (1), i);
+ return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+}
+
+#endif // PL_MATH_SV_EXPM1F_INLINE_H \ No newline at end of file
diff --git a/pl/math/sv_hypot_1u5.c b/pl/math/sv_hypot_1u5.c
new file mode 100644
index 000000000000..cf1590e4b9ab
--- /dev/null
+++ b/pl/math/sv_hypot_1u5.c
@@ -0,0 +1,51 @@
+/*
+ * Double-precision SVE hypot(x, y) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ uint64_t tiny_bound, thres;
+} data = {
+ .tiny_bound = 0x0c80000000000000, /* asuint (0x1p-823). */
+ .thres = 0x7300000000000000, /* asuint (0x1p1017) - tiny_bound. */
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t sqsum, svfloat64_t x, svfloat64_t y, svbool_t pg,
+ svbool_t special)
+{
+ return sv_call2_f64 (hypot, x, y, svsqrt_x (pg, sqsum), special);
+}
+
+/* SVE implementation of double-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVsMxvv_hypot (-0x1.6a22d0412cdd3p+352, 0x1.d3d89bd66fb1ap+330)
+ got 0x1.6a22d0412cfp+352
+ want 0x1.6a22d0412cf01p+352. */
+svfloat64_t SV_NAME_D2 (hypot) (svfloat64_t x, svfloat64_t y, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y);
+
+ svbool_t special = svcmpge (
+ pg, svsub_x (pg, svreinterpret_u64 (sqsum), d->tiny_bound), d->thres);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (sqsum, x, y, pg, special);
+ return svsqrt_x (pg, sqsum);
+}
+
+PL_SIG (SV, D, 2, hypot, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_D2 (hypot), 0.71)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
diff --git a/pl/math/sv_hypotf_1u5.c b/pl/math/sv_hypotf_1u5.c
new file mode 100644
index 000000000000..f428832b3dbc
--- /dev/null
+++ b/pl/math/sv_hypotf_1u5.c
@@ -0,0 +1,45 @@
+/*
+ * Single-precision SVE hypot(x, y) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define TinyBound 0x0c800000 /* asuint (0x1p-102). */
+#define Thres 0x73000000 /* asuint (inf) - TinyBound. */
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t sqsum, svfloat32_t x, svfloat32_t y, svbool_t pg,
+ svbool_t special)
+{
+ return sv_call2_f32 (hypotf, x, y, svsqrt_x (pg, sqsum), special);
+}
+
+/* SVE implementation of single-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVsMxvv_hypotf (0x1.6a213cp-19, -0x1.32b982p-26) got 0x1.6a2346p-19
+ want 0x1.6a2344p-19. */
+svfloat32_t SV_NAME_F2 (hypot) (svfloat32_t x, svfloat32_t y,
+ const svbool_t pg)
+{
+ svfloat32_t sqsum = svmla_x (pg, svmul_x (pg, x, x), y, y);
+
+ svbool_t special = svcmpge (
+ pg, svsub_x (pg, svreinterpret_u32 (sqsum), TinyBound), Thres);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (sqsum, x, y, pg, special);
+
+ return svsqrt_x (pg, sqsum);
+}
+
+PL_SIG (SV, F, 2, hypot, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_F2 (hypot), 0.71)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
diff --git a/pl/math/sv_log10_2u5.c b/pl/math/sv_log10_2u5.c
index 884e2011d2f8..f55e068fd442 100644
--- a/pl/math/sv_log10_2u5.c
+++ b/pl/math/sv_log10_2u5.c
@@ -6,84 +6,70 @@
*/
#include "sv_math.h"
-#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_sve_f64.h"
-#if SV_SUPPORTED
-
-#define OFF 0x3fe6900900000000
+#define Min 0x0010000000000000
+#define Max 0x7ff0000000000000
+#define Thres 0x7fe0000000000000 /* Max - Min. */
+#define Off 0x3fe6900900000000
#define N (1 << V_LOG10_TABLE_BITS)
-#define A(i) __v_log10_data.poly[i]
-
-static inline sv_f64_t
-specialcase (sv_f64_t x, sv_f64_t y, svbool_t special)
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
{
return sv_call_f64 (log10, x, y, special);
}
-/* SVE log10 algorithm. Maximum measured error is 2.46 ulps.
- __sv_log10(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6
- want 0x1.fffbdf6eaa667p-6. */
-sv_f64_t
-__sv_log10_x (sv_f64_t x, const svbool_t pg)
+/* SVE log10 algorithm.
+ Maximum measured error is 2.46 ulps.
+ SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6
+ want 0x1.fffbdf6eaa667p-6. */
+svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg)
{
- sv_u64_t ix = sv_as_u64_f64 (x);
- sv_u64_t top = svlsr_n_u64_x (pg, ix, 48);
-
- svbool_t is_special_case
- = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x07ff0 - 0x0010);
+ svuint64_t ix = svreinterpret_u64 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
- /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF);
- sv_u64_t i
- = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG10_TABLE_BITS), N);
- sv_f64_t k
- = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52));
- sv_f64_t z = sv_as_f64_u64 (
- svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52)));
+ svuint64_t tmp = svsub_x (pg, ix, Off);
+ svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS);
+ i = svand_x (pg, i, (N - 1) << 1);
+ svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
+ svfloat64_t z = svreinterpret_f64 (
+ svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)));
/* log(x) = k*log(2) + log(c) + log(z/c). */
-
- sv_u64_t idx = svmul_n_u64_x (pg, i, 2);
- sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].invc, idx);
- sv_f64_t logc = sv_lookup_f64_x (pg, &__v_log10_data.tab[0].log10c, idx);
+ svfloat64_t invc = svld1_gather_index (pg, &__v_log10_data.table[0].invc, i);
+ svfloat64_t logc
+ = svld1_gather_index (pg, &__v_log10_data.table[0].log10c, i);
/* We approximate log(z/c) with a polynomial P(x) ~= log(x + 1):
r = z/c - 1 (we look up precomputed 1/c)
log(z/c) ~= P(r). */
- sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0));
+ svfloat64_t r = svmad_x (pg, invc, z, -1.0);
/* hi = log(c) + k*log(2). */
- sv_f64_t w = sv_fma_n_f64_x (pg, __v_log10_data.invln10, r, logc);
- sv_f64_t hi = sv_fma_n_f64_x (pg, __v_log10_data.log10_2, k, w);
+ svfloat64_t w = svmla_x (pg, logc, r, __v_log10_data.invln10);
+ svfloat64_t hi = svmla_x (pg, w, k, __v_log10_data.log10_2);
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- sv_f64_t r2 = svmul_f64_x (pg, r, r);
- sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2)));
- sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0)));
- y = sv_fma_n_f64_x (pg, A (4), r2, y);
- y = sv_fma_f64_x (pg, y, r2, p);
- y = sv_fma_f64_x (pg, y, r2, hi);
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log10_data.poly);
- if (unlikely (svptest_any (pg, is_special_case)))
- {
- return specialcase (x, y, is_special_case);
- }
- return y;
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
+ special);
+ return svmla_x (pg, hi, r2, y);
}
-PL_ALIAS (__sv_log10_x, _ZGVsMxv_log10)
-
PL_SIG (SV, D, 1, log10, 0.01, 11.1)
-PL_TEST_ULP (__sv_log10, 1.97)
-PL_TEST_INTERVAL (__sv_log10, -0.0, -0x1p126, 100)
-PL_TEST_INTERVAL (__sv_log10, 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (__sv_log10, 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (__sv_log10, 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (__sv_log10, 1.0, 100, 50000)
-PL_TEST_INTERVAL (__sv_log10, 100, inf, 50000)
-#endif
+PL_TEST_ULP (SV_NAME_D1 (log10), 1.97)
+PL_TEST_INTERVAL (SV_NAME_D1 (log10), -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log10), 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log10), 1.0, 100, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log10), 100, inf, 50000)
diff --git a/pl/math/sv_log10f_3u5.c b/pl/math/sv_log10f_3u5.c
index e7b1e9801fa9..a685b23e5de5 100644
--- a/pl/math/sv_log10f_3u5.c
+++ b/pl/math/sv_log10f_3u5.c
@@ -9,80 +9,85 @@
#include "pl_sig.h"
#include "pl_test.h"
-#if SV_SUPPORTED
+static const struct data
+{
+ float poly_0246[4];
+ float poly_1357[4];
+ float ln2, inv_ln10;
+} data = {
+ .poly_1357 = {
+ /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
+ 1, 3, 5 and 7 can be loaded as a single quad-word, hence used with _lane
+ variant of MLA intrinsic. */
+ 0x1.2879c8p-3f, 0x1.6408f8p-4f, 0x1.f0e514p-5f, 0x1.f5f76ap-5f
+ },
+ .poly_0246 = { -0x1.bcb79cp-3f, -0x1.bcd472p-4f, -0x1.246f8p-4f,
+ -0x1.0fc92cp-4f },
+ .ln2 = 0x1.62e43p-1f,
+ .inv_ln10 = 0x1.bcb7b2p-2f,
+};
-#define SpecialCaseMin 0x00800000
-#define SpecialCaseMax 0x7f800000
+#define Min 0x00800000
+#define Max 0x7f800000
+#define Thres 0x7f000000 /* Max - Min. */
#define Offset 0x3f2aaaab /* 0.666667. */
-#define Mask 0x007fffff
-#define Ln2 0x1.62e43p-1f /* 0x3f317218. */
-#define InvLn10 0x1.bcb7b2p-2f
-
-#define P(i) __v_log10f_poly[i]
+#define MantissaMask 0x007fffff
-static NOINLINE sv_f32_t
-special_case (sv_f32_t x, sv_f32_t y, svbool_t special)
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
{
return sv_call_f32 (log10f, x, y, special);
}
/* Optimised implementation of SVE log10f using the same algorithm and
- polynomial as v_log10f. Maximum error is 3.31ulps:
- __sv_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
- want 0x1.ffe2f4p-4. */
-sv_f32_t
-__sv_log10f_x (sv_f32_t x, const svbool_t pg)
+ polynomial as AdvSIMD log10f.
+ Maximum error is 3.31ulps:
+ SV_NAME_F1 (log10)(0x1.555c16p+0) got 0x1.ffe2fap-4
+ want 0x1.ffe2f4p-4. */
+svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
{
- sv_u32_t ix = sv_as_u32_f32 (x);
- svbool_t special_cases
- = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, ix, SpecialCaseMin),
- SpecialCaseMax - SpecialCaseMin);
+ const struct data *d = ptr_barrier (&data);
+ svuint32_t ix = svreinterpret_u32 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- ix = svsub_n_u32_x (pg, ix, Offset);
- sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (ix),
- 23)); /* signextend. */
- ix = svand_n_u32_x (pg, ix, Mask);
- ix = svadd_n_u32_x (pg, ix, Offset);
- sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (ix), 1.0f);
+ ix = svsub_x (pg, ix, Offset);
+ svfloat32_t n = svcvt_f32_x (
+ pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */
+ ix = svand_x (pg, ix, MantissaMask);
+ ix = svadd_x (pg, ix, Offset);
+ svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f);
/* y = log10(1+r) + n*log10(2)
log10(1+r) ~ r * InvLn(10) + P(r)
where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for
- log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3)
-
- P(r) = r2 * (Q01 + r2 * (Q23 + r2 * (Q45 + r2 * Q67)))
- and Qij = Pi + r * Pj. */
- sv_f32_t q12 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0)));
- sv_f32_t q34 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2)));
- sv_f32_t q56 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4)));
- sv_f32_t q78 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6)));
+ log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */
+ svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t r4 = svmul_x (pg, r2, r2);
+ svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
+ svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0);
+ svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1);
+ svfloat32_t q_45 = svmla_lane (sv_f32 (d->poly_0246[2]), r, p_1357, 2);
+ svfloat32_t q_67 = svmla_lane (sv_f32 (d->poly_0246[3]), r, p_1357, 3);
+ svfloat32_t q_47 = svmla_x (pg, q_45, r2, q_67);
+ svfloat32_t q_03 = svmla_x (pg, q_01, r2, q_23);
+ svfloat32_t y = svmla_x (pg, q_03, r4, q_47);
- sv_f32_t r2 = svmul_f32_x (pg, r, r);
- sv_f32_t y = sv_fma_f32_x (pg, q78, r2, q56);
- y = sv_fma_f32_x (pg, y, r2, q34);
- y = sv_fma_f32_x (pg, y, r2, q12);
+ /* Using hi = Log10(2)*n + r*InvLn(10) is faster but less accurate. */
+ svfloat32_t hi = svmla_x (pg, r, n, d->ln2);
+ hi = svmul_x (pg, hi, d->inv_ln10);
- /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster but less
- accurate. */
- sv_f32_t p = sv_fma_n_f32_x (pg, Ln2, n, r);
- y = sv_fma_f32_x (pg, y, r2, svmul_n_f32_x (pg, p, InvLn10));
-
- if (unlikely (svptest_any (pg, special_cases)))
- {
- return special_case (x, y, special_cases);
- }
- return y;
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
+ special);
+ return svmla_x (pg, hi, r2, y);
}
-PL_ALIAS (__sv_log10f_x, _ZGVsMxv_log10f)
-
PL_SIG (SV, F, 1, log10, 0.01, 11.1)
-PL_TEST_ULP (__sv_log10f, 2.82)
-PL_TEST_INTERVAL (__sv_log10f, -0.0, -0x1p126, 100)
-PL_TEST_INTERVAL (__sv_log10f, 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (__sv_log10f, 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (__sv_log10f, 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (__sv_log10f, 1.0, 100, 50000)
-PL_TEST_INTERVAL (__sv_log10f, 100, inf, 50000)
-#endif
+PL_TEST_ULP (SV_NAME_F1 (log10), 2.82)
+PL_TEST_INTERVAL (SV_NAME_F1 (log10), -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log10), 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log10), 1.0, 100, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log10), 100, inf, 50000)
diff --git a/pl/math/sv_log1p_2u5.c b/pl/math/sv_log1p_2u5.c
new file mode 100644
index 000000000000..f178ab16238a
--- /dev/null
+++ b/pl/math/sv_log1p_2u5.c
@@ -0,0 +1,116 @@
+/*
+ * Double-precision SVE log(1+x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ double poly[19];
+ double ln2_hi, ln2_lo;
+ uint64_t hfrt2_top, onemhfrt2_top, inf, mone;
+} data = {
+ /* Generated using Remez in [ sqrt(2)/2 - 1, sqrt(2) - 1]. Order 20
+ polynomial, however first 2 coefficients are 0 and 1 so are not stored. */
+ .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
+ 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
+ -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
+ 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
+ -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
+ 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
+ -0x1.cfa7385bdb37ep-6, },
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ /* top32(asuint64(sqrt(2)/2)) << 32. */
+ .hfrt2_top = 0x3fe6a09e00000000,
+ /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
+ .onemhfrt2_top = 0x00095f6200000000,
+ .inf = 0x7ff0000000000000,
+ .mone = 0xbff0000000000000,
+};
+
+#define AbsMask 0x7fffffffffffffff
+#define BottomMask 0xffffffff
+
+static svfloat64_t NOINLINE
+special_case (svbool_t special, svfloat64_t x, svfloat64_t y)
+{
+ return sv_call_f64 (log1p, x, y, special);
+}
+
+/* Vector approximation for log1p using polynomial on reduced interval. Maximum
+ observed error is 2.46 ULP:
+ _ZGVsMxv_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2
+ want 0x1.fd5565fb590f6p+2. */
+svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t ax = svand_x (pg, ix, AbsMask);
+ svbool_t special
+ = svorr_z (pg, svcmpge (pg, ax, d->inf), svcmpge (pg, ix, d->mone));
+
+ /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that t
+ is in [sqrt(2)/2, sqrt(2)]):
+ log1p(x) = k*log(2) + log1p(f).
+
+ f may not be representable exactly, so we need a correction term:
+ let m = round(1 + x), c = (1 + x) - m.
+ c << m: at very small x, log1p(x) ~ x, hence:
+ log(1+x) - log(m) ~ c/m.
+
+ We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */
+
+ /* Obtain correctly scaled k by manipulation in the exponent.
+ The scalar algorithm casts down to 32-bit at this point to calculate k and
+ u_red. We stay in double-width to obtain f and k, using the same constants
+ as the scalar algorithm but shifted left by 32. */
+ svfloat64_t m = svadd_x (pg, x, 1);
+ svuint64_t mi = svreinterpret_u64 (m);
+ svuint64_t u = svadd_x (pg, mi, d->onemhfrt2_top);
+
+ svint64_t ki = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), 0x3ff);
+ svfloat64_t k = svcvt_f64_x (pg, ki);
+
+ /* Reduce x to f in [sqrt(2)/2 - 1, sqrt(2) - 1]. */
+ svuint64_t utop
+ = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hfrt2_top);
+ svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, BottomMask));
+ svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1);
+
+ /* Correction term c/m. */
+ svfloat64_t cm = svdiv_x (pg, svsub_x (pg, x, svsub_x (pg, m, 1)), m);
+
+ /* Approximate log1p(x) on the reduced input using a polynomial. Because
+ log1p(0)=0 we choose an approximation of the form:
+ x + C0*x^2 + C1*x^3 + C2x^4 + ...
+ Hence approximation has the form f + f^2 * P(f)
+ where P(x) = C0 + C1*x + C2x^2 + ...
+ Assembling this all correctly is dealt with at the final step. */
+ svfloat64_t f2 = svmul_x (pg, f, f), f4 = svmul_x (pg, f2, f2),
+ f8 = svmul_x (pg, f4, f4), f16 = svmul_x (pg, f8, f8);
+ svfloat64_t p = sv_estrin_18_f64_x (pg, f, f2, f4, f8, f16, d->poly);
+
+ svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2_lo);
+ svfloat64_t yhi = svmla_x (pg, f, k, d->ln2_hi);
+ svfloat64_t y = svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (special, x, y);
+
+ return y;
+}
+
+PL_SIG (SV, D, 1, log1p, -0.9, 10.0)
+PL_TEST_ULP (SV_NAME_D1 (log1p), 1.97)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.0, 0x1p-23, 50000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0x1p-23, 0.001, 50000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (log1p), 0.001, 1.0, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log1p), 1, inf, 10000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log1p), -1, -inf, 10)
diff --git a/pl/math/sv_log1p_inline.h b/pl/math/sv_log1p_inline.h
new file mode 100644
index 000000000000..983f8e1b0413
--- /dev/null
+++ b/pl/math/sv_log1p_inline.h
@@ -0,0 +1,96 @@
+/*
+ * Helper for SVE double-precision routines which calculate log(1 + x) and do
+ * not need special-case handling
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#ifndef PL_MATH_SV_LOG1P_INLINE_H
+#define PL_MATH_SV_LOG1P_INLINE_H
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct sv_log1p_data
+{
+ double poly[19], ln2[2];
+ uint64_t hf_rt2_top;
+ uint64_t one_m_hf_rt2_top;
+ uint32_t bottom_mask;
+ int64_t one_top;
+} sv_log1p_data = {
+ /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1].
+ */
+ .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
+ 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
+ -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
+ 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
+ -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
+ 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
+ -0x1.cfa7385bdb37ep-6 },
+ .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 },
+ .hf_rt2_top = 0x3fe6a09e00000000,
+ .one_m_hf_rt2_top = 0x00095f6200000000,
+ .bottom_mask = 0xffffffff,
+ .one_top = 0x3ff
+};
+
+static inline svfloat64_t
+sv_log1p_inline (svfloat64_t x, const svbool_t pg)
+{
+ /* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which
+ differs from v_log1p_2u5.c by:
+ - No special-case handling - this should be dealt with by the caller.
+ - Pairwise Horner polynomial evaluation for improved accuracy.
+ - Optionally simulate the shortcut for k=0, used in the scalar routine,
+ using svsel, for improved accuracy when the argument to log1p is close
+ to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1
+ in the source of the caller before including this file.
+ See sv_log1p_2u5.c for details of the algorithm. */
+ const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data);
+ svfloat64_t m = svadd_x (pg, x, 1);
+ svuint64_t mi = svreinterpret_u64 (m);
+ svuint64_t u = svadd_x (pg, mi, d->one_m_hf_rt2_top);
+
+ svint64_t ki
+ = svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), d->one_top);
+ svfloat64_t k = svcvt_f64_x (pg, ki);
+
+ /* Reduce x to f in [sqrt(2)/2 - 1, sqrt(2) - 1]. */
+ svuint64_t utop
+ = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hf_rt2_top);
+ svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, d->bottom_mask));
+ svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1);
+
+ /* Correction term c/m. */
+ svfloat64_t c = svsub_x (pg, x, svsub_x (pg, m, 1));
+ svfloat64_t cm;
+
+#ifndef WANT_SV_LOG1P_K0_SHORTCUT
+#error \
+ "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
+#elif WANT_SV_LOG1P_K0_SHORTCUT
+ /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
+ that the approximation is solely the polynomial. */
+ svbool_t knot0 = svcmpne (pg, k, 0);
+ cm = svdiv_z (knot0, c, m);
+ if (likely (!svptest_any (pg, knot0)))
+ {
+ f = svsel (knot0, f, x);
+ }
+#else
+ /* No shortcut. */
+ cm = svdiv_x (pg, c, m);
+#endif
+
+ /* Approximate log1p(f) on the reduced input using a polynomial. */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly);
+
+ /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
+ svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]);
+ svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]);
+
+ return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
+}
+#endif // PL_MATH_SV_LOG1P_INLINE_H
diff --git a/pl/math/sv_log1pf_1u3.c b/pl/math/sv_log1pf_1u3.c
new file mode 100644
index 000000000000..ea1a3dbf723a
--- /dev/null
+++ b/pl/math/sv_log1pf_1u3.c
@@ -0,0 +1,97 @@
+/*
+ * Single-precision vector log(x + 1) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f32.h"
+
+static const struct data
+{
+ float poly[8]; /* Passed to sv_estrin_7_f32_x below; the -0.5 term is applied separately. */
+ float ln2, exp_bias;
+ uint32_t four, three_quarters;
+} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as
+ this can be fmov-ed directly instead of including it in
+ the main load-and-mla polynomial schedule. */
+ 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
+ -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f,
+ 0x1.abcb6p-4f, -0x1.6f0d5ep-5f},
+ .ln2 = 0x1.62e43p-1f, /* log(2) rounded to single precision. */
+ .exp_bias = 0x1p-23f, /* 2^-23: multiplied with float(k) to form the scale-back factor. */
+ .four = 0x40800000, /* Bit pattern of 4.0f. */
+ .three_quarters = 0x3f400000}; /* Bit pattern of 0.75f. */
+
+#define SignExponentMask 0xff800000
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{ /* Out-of-line slow path: forwards x, y and the special-lane predicate to sv_call_f32 with scalar log1pf. */
+ return sv_call_f32 (log1pf, x, y, special); /* NOTE(review): presumably recomputes only the flagged lanes - sv_call_f32 is defined elsewhere, confirm. */
+}
+
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.27 ULP very close to 0.5.
+ _ZGVsMxv_log1pf(0x1.fffffep-2) got 0x1.9f324p-2
+ want 0x1.9f323ep-2. */
+svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ /* Special lanes: x == +Inf, or x >= -1 is false (x < -1 or NaN). */
+ svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000);
+ special = svorn_z (pg, special, svcmpge (pg, x, -1));
+
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then add k*log(2).
+ To form m we do not multiply by 2^-k directly; we use an intermediate
+ scale factor s = 4 * 2^-k (bits of 4.0f minus k) to ensure the scale
+ is representable as a normalised fp32 number. */
+ svfloat32_t m = svadd_x (pg, x, 1);
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
+ svint32_t k
+ = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters),
+ sv_s32 (SignExponentMask));
+
+ /* Scale x by exponent manipulation. */
+ svfloat32_t m_scale = svreinterpret_f32 (
+ svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k)));
+
+ /* Scale up to ensure that the scale factor is representable as normalised
+ fp32 number, and scale m down accordingly. */
+ svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
+ m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25)); /* Adds 0.25 * s - 1. */
+
+ /* Evaluate polynomial on reduced interval. */
+ svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale),
+ ms4 = svmul_x (pg, ms2, ms2);
+ svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly);
+ p = svmad_x (pg, m_scale, p, -0.5); /* p = m_scale * p - 0.5. */
+ p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p)); /* p = m_scale + m_scale^2 * p. */
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
+ svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias);
+
+ /* Apply the scaling back. */
+ svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2); /* y = p + scale_back * ln2. */
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, y, special);
+
+ return y;
+}
+
+PL_SIG (SV, F, 1, log1p, -0.9, 10.0)
+PL_TEST_ULP (SV_NAME_F1 (log1p), 0.77)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0, 0x1p-23, 5000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (log1p), 0x1p-23, 1, 5000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log1p), 1, inf, 10000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log1p), -1, -inf, 10)
diff --git a/pl/math/sv_log1pf_inline.h b/pl/math/sv_log1pf_inline.h
new file mode 100644
index 000000000000..d13b094f6b5d
--- /dev/null
+++ b/pl/math/sv_log1pf_inline.h
@@ -0,0 +1,65 @@
+/*
+ * Helper for SVE routines which calculate log(1 + x) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_SV_LOG1PF_INLINE_H
+#define PL_MATH_SV_LOG1PF_INLINE_H
+
+#include "v_math.h"
+#include "math_config.h"
+#include "poly_sve_f32.h"
+
+static const struct sv_log1pf_data
+{
+ float32_t poly[9]; /* Read by eval_poly: c[0] and c[1] directly, c + 2 via sv_pw_horner_6_f32_x. */
+ float32_t ln2; /* log(2) rounded to single precision. */
+ float32_t scale_back; /* 2^-23: multiplied with the converted exponent field in sv_log1pf_inline. */
+} sv_log1pf_data = {
+ /* Polynomial generated using FPMinimax in [-0.25, 0.5]. */
+ .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f,
+ -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f,
+ -0x1.6f0d5ep-5f },
+ .scale_back = 0x1.0p-23f,
+ .ln2 = 0x1.62e43p-1f,
+};
+
+static inline svfloat32_t
+eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg)
+{ /* Returns m + m^2 * (c[0] + m * c[1]) + m^4 * H, where H is the pairwise-Horner value built from c + 2. */
+ svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1])); /* c[0] + m * c[1]. */
+ svfloat32_t m2 = svmul_x (pg, m, m);
+ svfloat32_t q = svmla_x (pg, m, m2, p_12); /* m + m^2 * p_12. */
+ svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2); /* NOTE(review): presumably consumes c[2]..c[8]; helper is defined outside this file - confirm. */
+ p = svmul_x (pg, m2, p);
+
+ return svmla_x (pg, q, m2, p); /* q + m^2 * (m^2 * H). */
+}
+
+static inline svfloat32_t
+sv_log1pf_inline (svfloat32_t x, svbool_t pg)
+{
+ const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data);
+ /* log(1 + x) with no special-case handling: x + 1 = t * 2^k, result = poly (t - 1) + k * ln2. */
+ svfloat32_t m = svadd_x (pg, x, 1.0f);
+ /* ks keeps only the sign and exponent bits of (bits of m) - (bits of 0.75f), i.e. k << 23. */
+ svint32_t ks = svsub_x (pg, svreinterpret_s32 (m),
+ svreinterpret_s32 (svdup_f32 (0.75f)));
+ ks = svand_x (pg, ks, 0xff800000);
+ svuint32_t k = svreinterpret_u32 (ks);
+ svfloat32_t s = svreinterpret_f32 (
+ svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k));
+ /* Scale x by exponent manipulation, then add 0.25 * s - 1 to obtain the reduced argument. */
+ svfloat32_t m_scale
+ = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k));
+ m_scale
+ = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s));
+ svfloat32_t p = eval_poly (m_scale, d->poly, pg);
+ svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, ks), d->scale_back); /* Convert the signed ks, not the unsigned k: ks < 0 when x + 1 < 0.75. */
+ return svmla_x (pg, p, scale_back, d->ln2);
+}
+
+#endif // PL_MATH_SV_LOG1PF_INLINE_H \ No newline at end of file
diff --git a/pl/math/sv_log2_3u.c b/pl/math/sv_log2_3u.c
index a0815bb5646f..0775a39cc85d 100644
--- a/pl/math/sv_log2_3u.c
+++ b/pl/math/sv_log2_3u.c
@@ -8,78 +8,66 @@
#include "sv_math.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_sve_f64.h"
-#if SV_SUPPORTED
-
-#define InvLn2 sv_f64 (0x1.71547652b82fep0)
#define N (1 << V_LOG2_TABLE_BITS)
-#define OFF 0x3fe6900900000000
-#define P(i) sv_f64 (__v_log2_data.poly[i])
+#define Off 0x3fe6900900000000
+#define Max (0x7ff0000000000000)
+#define Min (0x0010000000000000)
+#define Thresh (0x7fe0000000000000) /* Max - Min. */
-NOINLINE static sv_f64_t
-specialcase (sv_f64_t x, sv_f64_t y, const svbool_t cmp)
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
{
return sv_call_f64 (log2, x, y, cmp);
}
-/* Double-precision SVE log2 routine. Implements the same algorithm as vector
- log10, with coefficients and table entries scaled in extended precision.
+/* Double-precision SVE log2 routine.
+ Implements the same algorithm as AdvSIMD log10, with coefficients and table
+ entries scaled in extended precision.
The maximum observed error is 2.58 ULP:
- __v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
- want 0x1.fffb34198d9ddp-5. */
-sv_f64_t
-__sv_log2_x (sv_f64_t x, const svbool_t pg)
+ SV_NAME_D1 (log2)(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
+ want 0x1.fffb34198d9ddp-5. */
+svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg)
{
- sv_u64_t ix = sv_as_u64_f64 (x);
- sv_u64_t top = svlsr_n_u64_x (pg, ix, 48);
-
- svbool_t special
- = svcmpge_n_u64 (pg, svsub_n_u64_x (pg, top, 0x0010), 0x7ff0 - 0x0010);
+ svuint64_t ix = svreinterpret_u64 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh);
- /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF);
- sv_u64_t i
- = sv_mod_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, 52 - V_LOG2_TABLE_BITS), N);
- sv_f64_t k
- = sv_to_f64_s64_x (pg, svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52));
- sv_f64_t z = sv_as_f64_u64 (
- svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52)));
+ svuint64_t tmp = svsub_x (pg, ix, Off);
+ svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS);
+ i = svand_x (pg, i, (N - 1) << 1);
+ svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52));
+ svfloat64_t z = svreinterpret_f64 (
+ svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)));
- sv_u64_t idx = svmul_n_u64_x (pg, i, 2);
- sv_f64_t invc = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].invc, idx);
- sv_f64_t log2c = sv_lookup_f64_x (pg, &__v_log2_data.tab[0].log2c, idx);
+ svfloat64_t invc = svld1_gather_index (pg, &__v_log2_data.table[0].invc, i);
+ svfloat64_t log2c
+ = svld1_gather_index (pg, &__v_log2_data.table[0].log2c, i);
/* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
- sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0));
- sv_f64_t w = sv_fma_f64_x (pg, r, InvLn2, log2c);
+ svfloat64_t r = svmad_x (pg, invc, z, -1.0);
+ svfloat64_t w = svmla_x (pg, log2c, r, __v_log2_data.invln2);
- sv_f64_t r2 = svmul_f64_x (pg, r, r);
- sv_f64_t p_23 = sv_fma_f64_x (pg, P (3), r, P (2));
- sv_f64_t p_01 = sv_fma_f64_x (pg, P (1), r, P (0));
- sv_f64_t y = sv_fma_f64_x (pg, P (4), r2, p_23);
- y = sv_fma_f64_x (pg, y, r2, p_01);
- y = sv_fma_f64_x (pg, y, r2, svadd_f64_x (pg, k, w));
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log2_data.poly);
+ w = svadd_x (pg, k, w);
if (unlikely (svptest_any (pg, special)))
- {
- return specialcase (x, y, special);
- }
- return y;
+ return special_case (x, svmla_x (svnot_z (pg, special), w, r2, y),
+ special);
+ return svmla_x (pg, w, r2, y);
}
-PL_ALIAS (__sv_log2_x, _ZGVsMxv_log2)
-
PL_SIG (SV, D, 1, log2, 0.01, 11.1)
-PL_TEST_ULP (__sv_log2, 2.09)
-PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2)
-PL_TEST_INTERVAL (__sv_log2, -0.0, -0x1p126, 1000)
-PL_TEST_INTERVAL (__sv_log2, 0.0, 0x1p-126, 4000)
-PL_TEST_INTERVAL (__sv_log2, 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (__sv_log2, 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (__sv_log2, 1.0, 100, 50000)
-PL_TEST_INTERVAL (__sv_log2, 100, inf, 50000)
-
-#endif
+PL_TEST_ULP (SV_NAME_D1 (log2), 2.09)
+PL_TEST_EXPECT_FENV_ALWAYS (SV_NAME_D1 (log2))
+PL_TEST_INTERVAL (SV_NAME_D1 (log2), -0.0, -0x1p126, 1000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0.0, 0x1p-126, 4000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log2), 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log2), 1.0, 100, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log2), 100, inf, 50000)
diff --git a/pl/math/sv_log2f_2u5.c b/pl/math/sv_log2f_2u5.c
index fe2ab16b90b7..9e96c62bbcc6 100644
--- a/pl/math/sv_log2f_2u5.c
+++ b/pl/math/sv_log2f_2u5.c
@@ -9,71 +9,78 @@
#include "pl_sig.h"
#include "pl_test.h"
-#if SV_SUPPORTED
-
-#define P(i) __v_log2f_data.poly[i]
+static const struct data
+{
+ float poly_02468[5];
+ float poly_1357[4];
+} data = {
+ .poly_1357 = {
+ /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
+ 1, 3, 5 and 7 can be loaded as a single quad-word, hence used with _lane
+ variant of MLA intrinsic. */
+ -0x1.715458p-1f, -0x1.7171a4p-2f, -0x1.e5143ep-3f, -0x1.c675bp-3f
+ },
+ .poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f,
+ 0x1.9d8ecap-3f, 0x1.9e495p-3f },
+};
-#define Ln2 (0x1.62e43p-1f) /* 0x3f317218. */
#define Min (0x00800000)
#define Max (0x7f800000)
-#define Mask (0x007fffff)
+#define Thres (0x7f000000) /* Max - Min. */
+#define MantissaMask (0x007fffff)
#define Off (0x3f2aaaab) /* 0.666667. */
-static NOINLINE sv_f32_t
-specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
{
return sv_call_f32 (log2f, x, y, cmp);
}
/* Optimised implementation of SVE log2f, using the same algorithm
- and polynomial as Neon log2f. Maximum error is 2.48 ULPs:
- __sv_log2f(0x1.558174p+0) got 0x1.a9be84p-2
- want 0x1.a9be8p-2. */
-sv_f32_t
-__sv_log2f_x (sv_f32_t x, const svbool_t pg)
+ and polynomial as AdvSIMD log2f.
+ Maximum error is 2.48 ULPs:
+ SV_NAME_F1 (log2)(0x1.558174p+0) got 0x1.a9be84p-2
+ want 0x1.a9be8p-2. */
+svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
{
- sv_u32_t u = sv_as_u32_f32 (x);
- svbool_t special
- = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min));
+ const struct data *d = ptr_barrier (&data);
+
+ svuint32_t u = svreinterpret_u32 (x);
+ svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = svsub_n_u32_x (pg, u, Off);
- sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u),
- 23)); /* Sign-extend. */
- u = svand_n_u32_x (pg, u, Mask);
- u = svadd_n_u32_x (pg, u, Off);
- sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f);
+ u = svsub_x (pg, u, Off);
+ svfloat32_t n = svcvt_f32_x (
+ pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
+ u = svand_x (pg, u, MantissaMask);
+ u = svadd_x (pg, u, Off);
+ svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
/* y = log2(1+r) + n. */
- sv_f32_t r2 = svmul_f32_x (pg, r, r);
+ svfloat32_t r2 = svmul_x (pg, r, r);
/* Evaluate polynomial using pairwise Horner scheme. */
- sv_f32_t p67 = sv_fma_n_f32_x (pg, P (7), r, sv_f32 (P (6)));
- sv_f32_t p45 = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (4)));
- sv_f32_t p23 = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (2)));
- sv_f32_t p01 = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (0)));
- sv_f32_t y;
- y = sv_fma_n_f32_x (pg, P (8), r2, p67);
- y = sv_fma_f32_x (pg, y, r2, p45);
- y = sv_fma_f32_x (pg, y, r2, p23);
- y = sv_fma_f32_x (pg, y, r2, p01);
- y = sv_fma_f32_x (pg, y, r, n);
+ svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
+ svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_02468[0]), r, p_1357, 0);
+ svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_02468[1]), r, p_1357, 1);
+ svfloat32_t q_45 = svmla_lane (sv_f32 (d->poly_02468[2]), r, p_1357, 2);
+ svfloat32_t q_67 = svmla_lane (sv_f32 (d->poly_02468[3]), r, p_1357, 3);
+ svfloat32_t y = svmla_x (pg, q_67, r2, sv_f32 (d->poly_02468[4]));
+ y = svmla_x (pg, q_45, r2, y);
+ y = svmla_x (pg, q_23, r2, y);
+ y = svmla_x (pg, q_01, r2, y);
if (unlikely (svptest_any (pg, special)))
- return specialcase (x, y, special);
- return y;
+ return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special);
+ return svmla_x (pg, n, r, y);
}
-PL_ALIAS (__sv_log2f_x, _ZGVsMxv_log2f)
-
PL_SIG (SV, F, 1, log2, 0.01, 11.1)
-PL_TEST_ULP (__sv_log2f, 1.99)
-PL_TEST_EXPECT_FENV_ALWAYS (__sv_log2f)
-PL_TEST_INTERVAL (__sv_log2f, -0.0, -0x1p126, 4000)
-PL_TEST_INTERVAL (__sv_log2f, 0.0, 0x1p-126, 4000)
-PL_TEST_INTERVAL (__sv_log2f, 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (__sv_log2f, 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (__sv_log2f, 1.0, 100, 50000)
-PL_TEST_INTERVAL (__sv_log2f, 100, inf, 50000)
-
-#endif // SV_SUPPORTED
+PL_TEST_ULP (SV_NAME_F1 (log2), 1.99)
+PL_TEST_EXPECT_FENV_ALWAYS (SV_NAME_F1 (log2))
+PL_TEST_INTERVAL (SV_NAME_F1 (log2), -0.0, -0x1p126, 4000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0.0, 0x1p-126, 4000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log2), 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log2), 1.0, 100, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log2), 100, inf, 50000)
diff --git a/pl/math/sv_log_2u5.c b/pl/math/sv_log_2u5.c
index 7f06fd31ebf1..2530c9e3f62c 100644
--- a/pl/math/sv_log_2u5.c
+++ b/pl/math/sv_log_2u5.c
@@ -9,77 +9,68 @@
#include "pl_sig.h"
#include "pl_test.h"
-#if SV_SUPPORTED
+#define P(i) sv_f64 (__v_log_data.poly[i])
+#define N (1 << V_LOG_TABLE_BITS)
+#define Off (0x3fe6900900000000)
+#define MaxTop (0x7ff)
+#define MinTop (0x001)
+#define ThreshTop (0x7fe) /* MaxTop - MinTop. */
-#define A(i) __sv_log_data.poly[i]
-#define Ln2 (0x1.62e42fefa39efp-1)
-#define N (1 << SV_LOG_TABLE_BITS)
-#define OFF (0x3fe6900900000000)
-
-double
-optr_aor_log_f64 (double);
-
-static NOINLINE sv_f64_t
-__sv_log_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp)
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
{
- return sv_call_f64 (optr_aor_log_f64, x, y, cmp);
+ return sv_call_f64 (log, x, y, cmp);
}
-/* SVE port of Neon log algorithm from math/.
+/* SVE port of AdvSIMD log algorithm.
Maximum measured error is 2.17 ulp:
- __sv_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
- want 0x1.ffffff1cca045p-2. */
-sv_f64_t
-__sv_log_x (sv_f64_t x, const svbool_t pg)
+ SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2
+ want 0x1.ffffff1cca045p-2. */
+svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
{
- sv_u64_t ix = sv_as_u64_f64 (x);
- sv_u64_t top = svlsr_n_u64_x (pg, ix, 48);
- svbool_t cmp = svcmpge_u64 (pg, svsub_n_u64_x (pg, top, 0x0010),
- sv_u64 (0x7ff0 - 0x0010));
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t top = svlsr_x (pg, ix, 52);
+ svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop));
- /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- sv_u64_t tmp = svsub_n_u64_x (pg, ix, OFF);
- /* Equivalent to (tmp >> (52 - SV_LOG_TABLE_BITS)) % N, since N is a power
- of 2. */
- sv_u64_t i
- = svand_n_u64_x (pg, svlsr_n_u64_x (pg, tmp, (52 - SV_LOG_TABLE_BITS)),
- N - 1);
- sv_s64_t k
- = svasr_n_s64_x (pg, sv_as_s64_u64 (tmp), 52); /* Arithmetic shift. */
- sv_u64_t iz = svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52));
- sv_f64_t z = sv_as_f64_u64 (iz);
+ svuint64_t tmp = svsub_x (pg, ix, Off);
+ /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N.
+ The actual value of i is double this due to table layout. */
+ svuint64_t i
+ = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1);
+ svint64_t k
+ = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */
+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
+ svfloat64_t z = svreinterpret_f64 (iz);
/* Lookup in 2 global lists (length N). */
- sv_f64_t invc = sv_lookup_f64_x (pg, __sv_log_data.invc, i);
- sv_f64_t logc = sv_lookup_f64_x (pg, __sv_log_data.logc, i);
+ svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
+ svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
- sv_f64_t r = sv_fma_f64_x (pg, z, invc, sv_f64 (-1.0));
- sv_f64_t kd = sv_to_f64_s64_x (pg, k);
+ svfloat64_t r = svmad_x (pg, invc, z, -1);
+ svfloat64_t kd = svcvt_f64_x (pg, k);
/* hi = r + log(c) + k*Ln2. */
- sv_f64_t hi = sv_fma_n_f64_x (pg, Ln2, kd, svadd_f64_x (pg, logc, r));
+ svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2);
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- sv_f64_t r2 = svmul_f64_x (pg, r, r);
- sv_f64_t y = sv_fma_n_f64_x (pg, A (3), r, sv_f64 (A (2)));
- sv_f64_t p = sv_fma_n_f64_x (pg, A (1), r, sv_f64 (A (0)));
- y = sv_fma_n_f64_x (pg, A (4), r2, y);
- y = sv_fma_f64_x (pg, y, r2, p);
- y = sv_fma_f64_x (pg, y, r2, hi);
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t y = svmla_x (pg, P (2), r, P (3));
+ svfloat64_t p = svmla_x (pg, P (0), r, P (1));
+ y = svmla_x (pg, y, r2, P (4));
+ y = svmla_x (pg, p, r2, y);
if (unlikely (svptest_any (pg, cmp)))
- return __sv_log_specialcase (x, y, cmp);
- return y;
+ return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp);
+ return svmla_x (pg, hi, r2, y);
}
-PL_ALIAS (__sv_log_x, _ZGVsMxv_log)
-
PL_SIG (SV, D, 1, log, 0.01, 11.1)
-PL_TEST_ULP (__sv_log, 1.68)
-PL_TEST_INTERVAL (__sv_log, -0.0, -0x1p126, 100)
-PL_TEST_INTERVAL (__sv_log, 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (__sv_log, 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (__sv_log, 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (__sv_log, 1.0, 100, 50000)
-PL_TEST_INTERVAL (__sv_log, 100, inf, 50000)
-#endif // SV_SUPPORTED
+PL_TEST_ULP (SV_NAME_D1 (log), 1.68)
+PL_TEST_INTERVAL (SV_NAME_D1 (log), -0.0, -inf, 1000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log), 0, 0x1p-149, 1000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log), 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log), 1.0, 100, 50000)
+PL_TEST_INTERVAL (SV_NAME_D1 (log), 100, inf, 50000)
diff --git a/pl/math/sv_log_data.c b/pl/math/sv_log_data.c
deleted file mode 100644
index 77f9989444f5..000000000000
--- a/pl/math/sv_log_data.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Coefficients for double-precision SVE log(x) function.
- *
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-const struct sv_log_data __sv_log_data = {
- /* All coefficients and table entries are copied from the Neon routine in
- math/. See math/v_log_data.c for an explanation of the algorithm. */
-
- .invc = {0x1.6a133d0dec120p+0, 0x1.6815f2f3e42edp+0,
- 0x1.661e39be1ac9ep+0, 0x1.642bfa30ac371p+0,
- 0x1.623f1d916f323p+0, 0x1.60578da220f65p+0,
- 0x1.5e75349dea571p+0, 0x1.5c97fd387a75ap+0,
- 0x1.5abfd2981f200p+0, 0x1.58eca051dc99cp+0,
- 0x1.571e526d9df12p+0, 0x1.5554d555b3fcbp+0,
- 0x1.539015e2a20cdp+0, 0x1.51d0014ee0164p+0,
- 0x1.50148538cd9eep+0, 0x1.4e5d8f9f698a1p+0,
- 0x1.4cab0edca66bep+0, 0x1.4afcf1a9db874p+0,
- 0x1.495327136e16fp+0, 0x1.47ad9e84af28fp+0,
- 0x1.460c47b39ae15p+0, 0x1.446f12b278001p+0,
- 0x1.42d5efdd720ecp+0, 0x1.4140cfe001a0fp+0,
- 0x1.3fafa3b421f69p+0, 0x1.3e225c9c8ece5p+0,
- 0x1.3c98ec29a211ap+0, 0x1.3b13442a413fep+0,
- 0x1.399156baa3c54p+0, 0x1.38131639b4cdbp+0,
- 0x1.36987540fbf53p+0, 0x1.352166b648f61p+0,
- 0x1.33adddb3eb575p+0, 0x1.323dcd99fc1d3p+0,
- 0x1.30d129fefc7d2p+0, 0x1.2f67e6b72fe7dp+0,
- 0x1.2e01f7cf8b187p+0, 0x1.2c9f518ddc86ep+0,
- 0x1.2b3fe86e5f413p+0, 0x1.29e3b1211b25cp+0,
- 0x1.288aa08b373cfp+0, 0x1.2734abcaa8467p+0,
- 0x1.25e1c82459b81p+0, 0x1.2491eb1ad59c5p+0,
- 0x1.23450a54048b5p+0, 0x1.21fb1bb09e578p+0,
- 0x1.20b415346d8f7p+0, 0x1.1f6fed179a1acp+0,
- 0x1.1e2e99b93c7b3p+0, 0x1.1cf011a7a882ap+0,
- 0x1.1bb44b97dba5ap+0, 0x1.1a7b3e66cdd4fp+0,
- 0x1.1944e11dc56cdp+0, 0x1.18112aebb1a6ep+0,
- 0x1.16e013231b7e9p+0, 0x1.15b1913f156cfp+0,
- 0x1.14859cdedde13p+0, 0x1.135c2dc68cfa4p+0,
- 0x1.12353bdb01684p+0, 0x1.1110bf25b85b4p+0,
- 0x1.0feeafd2f8577p+0, 0x1.0ecf062c51c3bp+0,
- 0x1.0db1baa076c8bp+0, 0x1.0c96c5bb3048ep+0,
- 0x1.0b7e20263e070p+0, 0x1.0a67c2acd0ce3p+0,
- 0x1.0953a6391e982p+0, 0x1.0841c3caea380p+0,
- 0x1.07321489b13eap+0, 0x1.062491aee9904p+0,
- 0x1.05193497a7cc5p+0, 0x1.040ff6b5f5e9fp+0,
- 0x1.0308d19aa6127p+0, 0x1.0203beedb0c67p+0,
- 0x1.010037d38bcc2p+0, 1.0,
- 0x1.fc06d493cca10p-1, 0x1.f81e6ac3b918fp-1,
- 0x1.f44546ef18996p-1, 0x1.f07b10382c84bp-1,
- 0x1.ecbf7070e59d4p-1, 0x1.e91213f715939p-1,
- 0x1.e572a9a75f7b7p-1, 0x1.e1e0e2c530207p-1,
- 0x1.de5c72d8a8be3p-1, 0x1.dae50fa5658ccp-1,
- 0x1.d77a71145a2dap-1, 0x1.d41c51166623ep-1,
- 0x1.d0ca6ba0bb29fp-1, 0x1.cd847e8e59681p-1,
- 0x1.ca4a499693e00p-1, 0x1.c71b8e399e821p-1,
- 0x1.c3f80faf19077p-1, 0x1.c0df92dc2b0ecp-1,
- 0x1.bdd1de3cbb542p-1, 0x1.baceb9e1007a3p-1,
- 0x1.b7d5ef543e55ep-1, 0x1.b4e749977d953p-1,
- 0x1.b20295155478ep-1, 0x1.af279f8e82be2p-1,
- 0x1.ac5638197fdf3p-1, 0x1.a98e2f102e087p-1,
- 0x1.a6cf5606d05c1p-1, 0x1.a4197fc04d746p-1,
- 0x1.a16c80293dc01p-1, 0x1.9ec82c4dc5bc9p-1,
- 0x1.9c2c5a491f534p-1, 0x1.9998e1480b618p-1,
- 0x1.970d9977c6c2dp-1, 0x1.948a5c023d212p-1,
- 0x1.920f0303d6809p-1, 0x1.8f9b698a98b45p-1,
- 0x1.8d2f6b81726f6p-1, 0x1.8acae5bb55badp-1,
- 0x1.886db5d9275b8p-1, 0x1.8617ba567c13cp-1,
- 0x1.83c8d27487800p-1, 0x1.8180de3c5dbe7p-1,
- 0x1.7f3fbe71cdb71p-1, 0x1.7d055498071c1p-1,
- 0x1.7ad182e54f65ap-1, 0x1.78a42c3c90125p-1,
- 0x1.767d342f76944p-1, 0x1.745c7ef26b00ap-1,
- 0x1.7241f15769d0fp-1, 0x1.702d70d396e41p-1,
- 0x1.6e1ee3700cd11p-1, 0x1.6c162fc9cbe02p-1},
-
- .logc = {-0x1.62fe995eb963ap-2, -0x1.5d5a48dad6b67p-2,
- -0x1.57bde257d2769p-2, -0x1.52294fbf2af55p-2,
- -0x1.4c9c7b598aa38p-2, -0x1.47174fc5ff560p-2,
- -0x1.4199b7fa7b5cap-2, -0x1.3c239f48cfb99p-2,
- -0x1.36b4f154d2aebp-2, -0x1.314d9a0ff32fbp-2,
- -0x1.2bed85cca3cffp-2, -0x1.2694a11421af9p-2,
- -0x1.2142d8d014fb2p-2, -0x1.1bf81a2c77776p-2,
- -0x1.16b452a39c6a4p-2, -0x1.11776ffa6c67ep-2,
- -0x1.0c416035020e0p-2, -0x1.071211aa10fdap-2,
- -0x1.01e972e293b1bp-2, -0x1.f98ee587fd434p-3,
- -0x1.ef5800ad716fbp-3, -0x1.e52e160484698p-3,
- -0x1.db1104b19352ep-3, -0x1.d100ac59e0bd6p-3,
- -0x1.c6fced287c3bdp-3, -0x1.bd05a7b317c29p-3,
- -0x1.b31abd229164fp-3, -0x1.a93c0edadb0a3p-3,
- -0x1.9f697ee30d7ddp-3, -0x1.95a2efa9aa40ap-3,
- -0x1.8be843d796044p-3, -0x1.82395ecc477edp-3,
- -0x1.7896240966422p-3, -0x1.6efe77aca8c55p-3,
- -0x1.65723e117ec5cp-3, -0x1.5bf15c0955706p-3,
- -0x1.527bb6c111da1p-3, -0x1.491133c939f8fp-3,
- -0x1.3fb1b90c7fc58p-3, -0x1.365d2cc485f8dp-3,
- -0x1.2d13758970de7p-3, -0x1.23d47a721fd47p-3,
- -0x1.1aa0229f25ec2p-3, -0x1.117655ddebc3bp-3,
- -0x1.0856fbf83ab6bp-3, -0x1.fe83fabbaa106p-4,
- -0x1.ec6e8507a56cdp-4, -0x1.da6d68c7cc2eap-4,
- -0x1.c88078462be0cp-4, -0x1.b6a786a423565p-4,
- -0x1.a4e2676ac7f85p-4, -0x1.9330eea777e76p-4,
- -0x1.8192f134d5ad9p-4, -0x1.70084464f0538p-4,
- -0x1.5e90bdec5cb1fp-4, -0x1.4d2c3433c5536p-4,
- -0x1.3bda7e219879ap-4, -0x1.2a9b732d27194p-4,
- -0x1.196eeb2b10807p-4, -0x1.0854be8ef8a7ep-4,
- -0x1.ee998cb277432p-5, -0x1.ccadb79919fb9p-5,
- -0x1.aae5b1d8618b0p-5, -0x1.89413015d7442p-5,
- -0x1.67bfe7bf158dep-5, -0x1.46618f83941bep-5,
- -0x1.2525df1b0618ap-5, -0x1.040c8e2f77c6ap-5,
- -0x1.c62aad39f738ap-6, -0x1.847fe3bdead9cp-6,
- -0x1.43183683400acp-6, -0x1.01f31c4e1d544p-6,
- -0x1.82201d1e6b69ap-7, -0x1.00dd0f3e1bfd6p-7,
- -0x1.ff6fe1feb4e53p-9, 0.0,
- 0x1.fe91885ec8e20p-8, 0x1.fc516f716296dp-7,
- 0x1.7bb4dd70a015bp-6, 0x1.f84c99b34b674p-6,
- 0x1.39f9ce4fb2d71p-5, 0x1.7756c0fd22e78p-5,
- 0x1.b43ee82db8f3ap-5, 0x1.f0b3fced60034p-5,
- 0x1.165bd78d4878ep-4, 0x1.3425d2715ebe6p-4,
- 0x1.51b8bd91b7915p-4, 0x1.6f15632c76a47p-4,
- 0x1.8c3c88ecbe503p-4, 0x1.a92ef077625dap-4,
- 0x1.c5ed5745fa006p-4, 0x1.e27876de1c993p-4,
- 0x1.fed104fce4cdcp-4, 0x1.0d7bd9c17d78bp-3,
- 0x1.1b76986cef97bp-3, 0x1.295913d24f750p-3,
- 0x1.37239fa295d17p-3, 0x1.44d68dd78714bp-3,
- 0x1.52722ebe5d780p-3, 0x1.5ff6d12671f98p-3,
- 0x1.6d64c2389484bp-3, 0x1.7abc4da40fddap-3,
- 0x1.87fdbda1e8452p-3, 0x1.95295b06a5f37p-3,
- 0x1.a23f6d34abbc5p-3, 0x1.af403a28e04f2p-3,
- 0x1.bc2c06a85721ap-3, 0x1.c903161240163p-3,
- 0x1.d5c5aa93287ebp-3, 0x1.e274051823fa9p-3,
- 0x1.ef0e656300c16p-3, 0x1.fb9509f05aa2ap-3,
- 0x1.04041821f37afp-2, 0x1.0a340a49b3029p-2,
- 0x1.105a7918a126dp-2, 0x1.1677819812b84p-2,
- 0x1.1c8b405b40c0ep-2, 0x1.2295d16cfa6b1p-2,
- 0x1.28975066318a2p-2, 0x1.2e8fd855d86fcp-2,
- 0x1.347f83d605e59p-2, 0x1.3a666d1244588p-2,
- 0x1.4044adb6f8ec4p-2, 0x1.461a5f077558cp-2,
- 0x1.4be799e20b9c8p-2, 0x1.51ac76a6b79dfp-2,
- 0x1.57690d5744a45p-2, 0x1.5d1d758e45217p-2},
-
- .poly = {-0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2,
- 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3},
-};
diff --git a/pl/math/sv_logf_3u4.c b/pl/math/sv_logf_3u4.c
index 11f0b8aa12c5..967355247036 100644
--- a/pl/math/sv_logf_3u4.c
+++ b/pl/math/sv_logf_3u4.c
@@ -9,69 +9,78 @@
#include "pl_sig.h"
#include "pl_test.h"
-#if SV_SUPPORTED
-
-#define P(i) __sv_logf_poly[i]
+static const struct data
+{
+ float poly_0135[4];
+ float poly_246[3];
+ float ln2;
+} data = {
+ .poly_0135 = {
+ /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
+ that coeffs 0, 1, 3 and 5 can be loaded as a single quad-word, hence used
+ with _lane variant of MLA intrinsic. */
+ -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
+ },
+ .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
+ .ln2 = 0x1.62e43p-1f
+};
-#define Ln2 (0x1.62e43p-1f) /* 0x3f317218 */
#define Min (0x00800000)
#define Max (0x7f800000)
+#define Thresh (0x7f000000) /* Max - Min. */
#define Mask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667 */
+#define Off (0x3f2aaaab) /* 0.666667. */
-float
-optr_aor_log_f32 (float);
+float optr_aor_log_f32 (float);
-static NOINLINE sv_f32_t
-__sv_logf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
{
return sv_call_f32 (optr_aor_log_f32, x, y, cmp);
}
-/* Optimised implementation of SVE logf, using the same algorithm and polynomial
- as the Neon routine in math/. Maximum error is 3.34 ULPs:
- __sv_logf(0x1.557298p+0) got 0x1.26edecp-2
- want 0x1.26ede6p-2. */
-sv_f32_t
-__sv_logf_x (sv_f32_t x, const svbool_t pg)
+/* Optimised implementation of SVE logf, using the same algorithm and
+ polynomial as the AdvSIMD routine. Maximum error is 3.34 ULPs:
+ SV_NAME_F1 (log)(0x1.557298p+0) got 0x1.26edecp-2
+ want 0x1.26ede6p-2. */
+svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
{
- sv_u32_t u = sv_as_u32_f32 (x);
- svbool_t cmp
- = svcmpge_u32 (pg, svsub_n_u32_x (pg, u, Min), sv_u32 (Max - Min));
+ const struct data *d = ptr_barrier (&data);
+
+ svuint32_t u = svreinterpret_u32 (x);
+ svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = svsub_n_u32_x (pg, u, Off);
- sv_f32_t n = sv_to_f32_s32_x (pg, svasr_n_s32_x (pg, sv_as_s32_u32 (u),
- 23)); /* Sign-extend. */
- u = svand_n_u32_x (pg, u, Mask);
- u = svadd_n_u32_x (pg, u, Off);
- sv_f32_t r = svsub_n_f32_x (pg, sv_as_f32_u32 (u), 1.0f);
+ u = svsub_x (pg, u, Off);
+ svfloat32_t n = svcvt_f32_x (
+ pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
+ u = svand_x (pg, u, Mask);
+ u = svadd_x (pg, u, Off);
+ svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
/* y = log(1+r) + n*ln2. */
- sv_f32_t r2 = svmul_f32_x (pg, r, r);
+ svfloat32_t r2 = svmul_x (pg, r, r);
/* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). */
- sv_f32_t p = sv_fma_n_f32_x (pg, P (1), r, sv_f32 (P (2)));
- sv_f32_t q = sv_fma_n_f32_x (pg, P (3), r, sv_f32 (P (4)));
- sv_f32_t y = sv_fma_n_f32_x (pg, P (5), r, sv_f32 (P (6)));
- p = sv_fma_n_f32_x (pg, P (0), r2, p);
- q = sv_fma_f32_x (pg, p, r2, q);
- y = sv_fma_f32_x (pg, q, r2, y);
- p = sv_fma_n_f32_x (pg, Ln2, n, r);
- y = sv_fma_f32_x (pg, y, r2, p);
+ svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
+ svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
+ svfloat32_t q = svmla_lane (sv_f32 (d->poly_246[1]), r, p_0135, 2);
+ svfloat32_t y = svmla_lane (sv_f32 (d->poly_246[2]), r, p_0135, 3);
+ p = svmla_lane (p, r2, p_0135, 0);
+
+ q = svmla_x (pg, q, r2, p);
+ y = svmla_x (pg, y, r2, q);
+ p = svmla_x (pg, r, n, d->ln2);
if (unlikely (svptest_any (pg, cmp)))
- return __sv_logf_specialcase (x, y, cmp);
- return y;
+ return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp);
+ return svmla_x (pg, p, r2, y);
}
-PL_ALIAS (__sv_logf_x, _ZGVsMxv_logf)
-
PL_SIG (SV, F, 1, log, 0.01, 11.1)
-PL_TEST_ULP (__sv_logf, 2.85)
-PL_TEST_INTERVAL (__sv_logf, -0.0, -0x1p126, 100)
-PL_TEST_INTERVAL (__sv_logf, 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (__sv_logf, 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (__sv_logf, 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (__sv_logf, 1.0, 100, 50000)
-PL_TEST_INTERVAL (__sv_logf, 100, inf, 50000)
-#endif // SV_SUPPORTED
+PL_TEST_ULP (SV_NAME_F1 (log), 2.85)
+PL_TEST_INTERVAL (SV_NAME_F1 (log), -0.0, -inf, 100)
+PL_TEST_INTERVAL (SV_NAME_F1 (log), 0, 0x1p-126, 100)
+PL_TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log), 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log), 1.0, 100, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (log), 100, inf, 50000)
diff --git a/pl/math/sv_logf_data.c b/pl/math/sv_logf_data.c
deleted file mode 100644
index 51dd7a7eeb37..000000000000
--- a/pl/math/sv_logf_data.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Coefficients for single-precision SVE log function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-const float __sv_logf_poly[] = {
- /* Copied from coeffs for the Neon routine in math/. */
- -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, -0x1.4f9934p-3f, 0x1.961348p-3f,
- -0x1.00187cp-2f, 0x1.555d7cp-2f, -0x1.ffffc8p-2f,
-};
diff --git a/pl/math/sv_math.h b/pl/math/sv_math.h
index 5ef0ad3bd5e0..f67fe91803ba 100644
--- a/pl/math/sv_math.h
+++ b/pl/math/sv_math.h
@@ -10,236 +10,124 @@
#ifndef WANT_VMATH
/* Enable the build of vector math code. */
-#define WANT_VMATH 1
+# define WANT_VMATH 1
#endif
-#if WANT_VMATH
-
-#if WANT_SVE_MATH
-#define SV_SUPPORTED 1
-
-#include <arm_sve.h>
-#include <stdbool.h>
-
-#include "math_config.h"
-typedef float f32_t;
-typedef uint32_t u32_t;
-typedef int32_t s32_t;
-typedef double f64_t;
-typedef uint64_t u64_t;
-typedef int64_t s64_t;
+#if WANT_VMATH
-typedef svfloat64_t sv_f64_t;
-typedef svuint64_t sv_u64_t;
-typedef svint64_t sv_s64_t;
+# include <arm_sve.h>
+# include <stdbool.h>
-typedef svfloat32_t sv_f32_t;
-typedef svuint32_t sv_u32_t;
-typedef svint32_t sv_s32_t;
+# include "math_config.h"
/* Double precision. */
-static inline sv_s64_t
-sv_s64 (s64_t x)
-{
- return svdup_n_s64 (x);
-}
-
-static inline sv_u64_t
-sv_u64 (u64_t x)
-{
- return svdup_n_u64 (x);
-}
-
-static inline sv_f64_t
-sv_f64 (f64_t x)
-{
- return svdup_n_f64 (x);
-}
-
-static inline sv_f64_t
-sv_fma_f64_x (svbool_t pg, sv_f64_t x, sv_f64_t y, sv_f64_t z)
+static inline svint64_t
+sv_s64 (int64_t x)
{
- return svmla_f64_x (pg, z, x, y);
+ return svdup_s64 (x);
}
-/* res = z + x * y with x scalar. */
-static inline sv_f64_t
-sv_fma_n_f64_x (svbool_t pg, f64_t x, sv_f64_t y, sv_f64_t z)
+static inline svuint64_t
+sv_u64 (uint64_t x)
{
- return svmla_n_f64_x (pg, z, y, x);
+ return svdup_u64 (x);
}
-static inline sv_s64_t
-sv_as_s64_u64 (sv_u64_t x)
+static inline svfloat64_t
+sv_f64 (double x)
{
- return svreinterpret_s64_u64 (x);
+ return svdup_f64 (x);
}
-static inline sv_u64_t
-sv_as_u64_f64 (sv_f64_t x)
-{
- return svreinterpret_u64_f64 (x);
-}
-
-static inline sv_f64_t
-sv_as_f64_u64 (sv_u64_t x)
-{
- return svreinterpret_f64_u64 (x);
-}
-
-static inline sv_f64_t
-sv_to_f64_s64_x (svbool_t pg, sv_s64_t s)
-{
- return svcvt_f64_x (pg, s);
-}
-
-static inline sv_f64_t
-sv_call_f64 (f64_t (*f) (f64_t), sv_f64_t x, sv_f64_t y, svbool_t cmp)
+static inline svfloat64_t
+sv_call_f64 (double (*f) (double), svfloat64_t x, svfloat64_t y, svbool_t cmp)
{
svbool_t p = svpfirst (cmp, svpfalse ());
while (svptest_any (cmp, p))
{
- f64_t elem = svclastb_n_f64 (p, 0, x);
+ double elem = svclastb (p, 0, x);
elem = (*f) (elem);
- sv_f64_t y2 = svdup_n_f64 (elem);
- y = svsel_f64 (p, y2, y);
+ svfloat64_t y2 = sv_f64 (elem);
+ y = svsel (p, y2, y);
p = svpnext_b64 (cmp, p);
}
return y;
}
-static inline sv_f64_t
-sv_call2_f64 (f64_t (*f) (f64_t, f64_t), sv_f64_t x1, sv_f64_t x2, sv_f64_t y,
- svbool_t cmp)
+static inline svfloat64_t
+sv_call2_f64 (double (*f) (double, double), svfloat64_t x1, svfloat64_t x2,
+ svfloat64_t y, svbool_t cmp)
{
svbool_t p = svpfirst (cmp, svpfalse ());
while (svptest_any (cmp, p))
{
- f64_t elem1 = svclastb_n_f64 (p, 0, x1);
- f64_t elem2 = svclastb_n_f64 (p, 0, x2);
- f64_t ret = (*f) (elem1, elem2);
- sv_f64_t y2 = svdup_n_f64 (ret);
- y = svsel_f64 (p, y2, y);
+ double elem1 = svclastb (p, 0, x1);
+ double elem2 = svclastb (p, 0, x2);
+ double ret = (*f) (elem1, elem2);
+ svfloat64_t y2 = sv_f64 (ret);
+ y = svsel (p, y2, y);
p = svpnext_b64 (cmp, p);
}
return y;
}
-/* Load array of uint64_t into svuint64_t. */
-static inline sv_u64_t
-sv_lookup_u64_x (svbool_t pg, const u64_t *tab, sv_u64_t idx)
-{
- return svld1_gather_u64index_u64 (pg, tab, idx);
-}
-
-/* Load array of double into svfloat64_t. */
-static inline sv_f64_t
-sv_lookup_f64_x (svbool_t pg, const f64_t *tab, sv_u64_t idx)
+static inline svuint64_t
+sv_mod_n_u64_x (svbool_t pg, svuint64_t x, uint64_t y)
{
- return svld1_gather_u64index_f64 (pg, tab, idx);
-}
-
-static inline sv_u64_t
-sv_mod_n_u64_x (svbool_t pg, sv_u64_t x, u64_t y)
-{
- sv_u64_t q = svdiv_n_u64_x (pg, x, y);
- return svmls_n_u64_x (pg, x, q, y);
+ svuint64_t q = svdiv_x (pg, x, y);
+ return svmls_x (pg, x, q, y);
}
/* Single precision. */
-static inline sv_s32_t
-sv_s32 (s32_t x)
-{
- return svdup_n_s32 (x);
-}
-
-static inline sv_u32_t
-sv_u32 (u32_t x)
-{
- return svdup_n_u32 (x);
-}
-
-static inline sv_f32_t
-sv_f32 (f32_t x)
-{
- return svdup_n_f32 (x);
-}
-
-static inline sv_f32_t
-sv_fma_f32_x (svbool_t pg, sv_f32_t x, sv_f32_t y, sv_f32_t z)
-{
- return svmla_f32_x (pg, z, x, y);
-}
-
-/* res = z + x * y with x scalar. */
-static inline sv_f32_t
-sv_fma_n_f32_x (svbool_t pg, f32_t x, sv_f32_t y, sv_f32_t z)
-{
- return svmla_n_f32_x (pg, z, y, x);
-}
-
-static inline sv_u32_t
-sv_as_u32_f32 (sv_f32_t x)
-{
- return svreinterpret_u32_f32 (x);
-}
-
-static inline sv_f32_t
-sv_as_f32_u32 (sv_u32_t x)
+static inline svint32_t
+sv_s32 (int32_t x)
{
- return svreinterpret_f32_u32 (x);
+ return svdup_s32 (x);
}
-static inline sv_s32_t
-sv_as_s32_u32 (sv_u32_t x)
+static inline svuint32_t
+sv_u32 (uint32_t x)
{
- return svreinterpret_s32_u32 (x);
+ return svdup_u32 (x);
}
-static inline sv_f32_t
-sv_to_f32_s32_x (svbool_t pg, sv_s32_t s)
+static inline svfloat32_t
+sv_f32 (float x)
{
- return svcvt_f32_x (pg, s);
+ return svdup_f32 (x);
}
-static inline sv_s32_t
-sv_to_s32_f32_x (svbool_t pg, sv_f32_t x)
-{
- return svcvt_s32_f32_x (pg, x);
-}
-
-static inline sv_f32_t
-sv_call_f32 (f32_t (*f) (f32_t), sv_f32_t x, sv_f32_t y, svbool_t cmp)
+static inline svfloat32_t
+sv_call_f32 (float (*f) (float), svfloat32_t x, svfloat32_t y, svbool_t cmp)
{
svbool_t p = svpfirst (cmp, svpfalse ());
while (svptest_any (cmp, p))
{
- f32_t elem = svclastb_n_f32 (p, 0, x);
+ float elem = svclastb (p, 0, x);
elem = (*f) (elem);
- sv_f32_t y2 = svdup_n_f32 (elem);
- y = svsel_f32 (p, y2, y);
+ svfloat32_t y2 = sv_f32 (elem);
+ y = svsel (p, y2, y);
p = svpnext_b32 (cmp, p);
}
return y;
}
-static inline sv_f32_t
-sv_call2_f32 (f32_t (*f) (f32_t, f32_t), sv_f32_t x1, sv_f32_t x2, sv_f32_t y,
- svbool_t cmp)
+static inline svfloat32_t
+sv_call2_f32 (float (*f) (float, float), svfloat32_t x1, svfloat32_t x2,
+ svfloat32_t y, svbool_t cmp)
{
svbool_t p = svpfirst (cmp, svpfalse ());
while (svptest_any (cmp, p))
{
- f32_t elem1 = svclastb_n_f32 (p, 0, x1);
- f32_t elem2 = svclastb_n_f32 (p, 0, x2);
- f32_t ret = (*f) (elem1, elem2);
- sv_f32_t y2 = svdup_n_f32 (ret);
- y = svsel_f32 (p, y2, y);
+ float elem1 = svclastb (p, 0, x1);
+ float elem2 = svclastb (p, 0, x2);
+ float ret = (*f) (elem1, elem2);
+ svfloat32_t y2 = sv_f32 (ret);
+ y = svsel (p, y2, y);
p = svpnext_b32 (cmp, p);
}
return y;
}
-
-#endif
#endif
+
#endif
diff --git a/pl/math/sv_pow_1u5.c b/pl/math/sv_pow_1u5.c
new file mode 100644
index 000000000000..0838810206a1
--- /dev/null
+++ b/pl/math/sv_pow_1u5.c
@@ -0,0 +1,444 @@
+/*
+ * Double-precision SVE pow(x, y) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* This version shares a similar algorithm with the AOR scalar pow.
+
+ The core computation consists in computing pow(x, y) as
+
+ exp (y * log (x)).
+
+ The algorithms for exp and log are very similar to scalar exp and log.
+ The log relies on table lookup for 3 variables and an order 8 polynomial.
+ It returns a high and a low contribution that are then passed to the exp,
+ to minimise the loss of accuracy in both routines.
+ The exp is based on 8-bit table lookup for scale and order-4 polynomial.
+ The SVE algorithm drops the tail in the exp computation at the price of
+ a lower accuracy, slightly above 1ULP.
+ The SVE algorithm also drops the special treatment of small (< 2^-65) and
+ large (> 2^63) finite values of |y|, as they only affect non-round to nearest
+ modes.
+
+ Maximum measured error is 1.04 ULPs:
+ SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
+ got 0x1.f7116284221fcp-1
+ want 0x1.f7116284221fdp-1. */
+
+/* Data is defined in v_pow_log_data.c. */
+#define N_LOG (1 << V_POW_LOG_TABLE_BITS)
+#define A __v_pow_log_data.poly
+#define Off 0x3fe6955500000000
+
+/* Data is defined in v_pow_exp_data.c. */
+#define N_EXP (1 << V_POW_EXP_TABLE_BITS)
+#define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
+#define C __v_pow_exp_data.poly
+#define SmallExp 0x3c9 /* top12(0x1p-54). */
+#define BigExp 0x408 /* top12(512.). */
+#define ThresExp 0x03f /* BigExp - SmallExp. */
+#define HugeExp 0x409 /* top12(1024.). */
+
+/* Constants associated with pow. */
+#define SmallPowX 0x001 /* top12(0x1p-1022). */
+#define BigPowX 0x7ff /* top12(INFINITY). */
+#define ThresPowX 0x7fe /* BigPowX - SmallPowX. */
+#define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */
+#define BigPowY 0x43e /* top12(0x1p63). */
+#define ThresPowY 0x080 /* BigPowY - SmallPowY. */
+
+/* Check if x is an integer. */
+static inline svbool_t
+sv_isint (svbool_t pg, svfloat64_t x)
+{
+ return svcmpeq (pg, svrintz_z (pg, x), x);
+}
+
+/* Check if x is real not integer valued. */
+static inline svbool_t
+sv_isnotint (svbool_t pg, svfloat64_t x)
+{
+ return svcmpne (pg, svrintz_z (pg, x), x);
+}
+
+/* Check if x is an odd integer. */
+static inline svbool_t
+sv_isodd (svbool_t pg, svfloat64_t x)
+{
+ svfloat64_t y = svmul_x (pg, x, 0.5);
+ return sv_isnotint (pg, y);
+}
+
+/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
+ the bit representation of a non-zero finite floating-point value. */
+static inline int
+checkint (uint64_t iy)
+{
+ int e = iy >> 52 & 0x7ff;
+ if (e < 0x3ff)
+ return 0;
+ if (e > 0x3ff + 52)
+ return 2;
+ if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))
+ return 0;
+ if (iy & (1ULL << (0x3ff + 52 - e)))
+ return 1;
+ return 2;
+}
+
+/* Top 12 bits (sign and exponent of each double float lane). */
+static inline svuint64_t
+sv_top12 (svfloat64_t x)
+{
+ return svlsr_x (svptrue_b64 (), svreinterpret_u64 (x), 52);
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan. */
+static inline int
+zeroinfnan (uint64_t i)
+{
+ return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1;
+}
+
+/* Returns 1 if input is the bit representation of 0, infinity or nan. */
+static inline svbool_t
+sv_zeroinfnan (svbool_t pg, svuint64_t i)
+{
+ return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1),
+ 2 * asuint64 (INFINITY) - 1);
+}
+
+/* Handle cases that may overflow or underflow when computing the result that
+ is scale*(1+TMP) without intermediate rounding. The bit representation of
+ scale is in SBITS, however it has a computed exponent that may have
+ overflown into the sign bit so that needs to be adjusted before using it as
+ a double. (int32_t)KI is the k used in the argument reduction and exponent
+ adjustment of scale, positive k here means the result may overflow and
+ negative k means the result may underflow. */
+static inline double
+specialcase (double tmp, uint64_t sbits, uint64_t ki)
+{
+ double scale;
+ if ((ki & 0x80000000) == 0)
+ {
+ /* k > 0, the exponent of scale might have overflowed by <= 460. */
+ sbits -= 1009ull << 52;
+ scale = asdouble (sbits);
+ return 0x1p1009 * (scale + scale * tmp);
+ }
+ /* k < 0, need special care in the subnormal range. */
+ sbits += 1022ull << 52;
+ /* Note: sbits is signed scale. */
+ scale = asdouble (sbits);
+ double y = scale + scale * tmp;
+ return 0x1p-1022 * y;
+}
+
+/* Scalar fallback for special cases of SVE pow's exp. */
+static inline svfloat64_t
+sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
+ svfloat64_t y, svbool_t cmp)
+{
+ svbool_t p = svpfirst (cmp, svpfalse ());
+ while (svptest_any (cmp, p))
+ {
+ double sx1 = svclastb (p, 0, x1);
+ uint64_t su1 = svclastb (p, 0, u1);
+ uint64_t su2 = svclastb (p, 0, u2);
+ double elem = specialcase (sx1, su1, su2);
+ svfloat64_t y2 = sv_f64 (elem);
+ y = svsel (p, y2, y);
+ p = svpnext_b64 (cmp, p);
+ }
+ return y;
+}
+
+/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
+ additional 15 bits precision. IX is the bit representation of x, but
+ normalized in the subnormal range using the sign bit for the exponent. */
+static inline svfloat64_t
+sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
+{
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ svuint64_t tmp = svsub_x (pg, ix, Off);
+ svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
+ sv_u64 (N_LOG - 1));
+ svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52)));
+ svfloat64_t z = svreinterpret_f64 (iz);
+ svfloat64_t kd = svcvt_f64_x (pg, k);
+
+ /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
+ /* SVE lookup requires 3 separate lookup tables, as opposed to scalar version
+ that uses array of structures. We also do the lookup earlier in the code to
+ make sure it finishes as early as possible. */
+ svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
+ svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
+ svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);
+
+ /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
+ |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
+ svfloat64_t r = svmad_x (pg, z, invc, -1.0);
+ /* k*Ln2 + log(c) + r. */
+ svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi);
+ svfloat64_t t2 = svadd_x (pg, t1, r);
+ svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo);
+ svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);
+
+ /* Evaluation is optimized assuming superscalar pipelined execution. */
+ svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */
+ svfloat64_t ar2 = svmul_x (pg, r, ar);
+ svfloat64_t ar3 = svmul_x (pg, r, ar2);
+ /* k*Ln2 + log(c) + r + A[0]*r*r. */
+ svfloat64_t hi = svadd_x (pg, t2, ar2);
+ svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r);
+ svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
+ /* p = log1p(r) - r - A[0]*r*r. */
+ /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
+ A[6])))). */
+ svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]);
+ svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]);
+ svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]);
+ svfloat64_t p = svmla_x (pg, a34, ar2, a56);
+ p = svmla_x (pg, a12, ar2, p);
+ p = svmul_x (pg, ar3, p);
+ svfloat64_t lo = svadd_x (
+ pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
+ svfloat64_t y = svadd_x (pg, hi, lo);
+ *tail = svadd_x (pg, svsub_x (pg, hi, y), lo);
+ return y;
+}
+
+/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
+ The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */
+static inline svfloat64_t
+sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
+ svuint64_t sign_bias)
+{
+ /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow)
+ and other cases of large values of x (scale * (1 + TMP) oflow). */
+ svuint64_t abstop = svand_x (pg, sv_top12 (x), 0x7ff);
+ /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */
+ svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp);
+
+ /* Conditions special, uflow and oflow are all expressed as uoflow &&
+ something, hence do not bother computing anything if no lane in uoflow is
+ true. */
+ svbool_t special = svpfalse_b ();
+ svbool_t uflow = svpfalse_b ();
+ svbool_t oflow = svpfalse_b ();
+ if (unlikely (svptest_any (pg, uoflow)))
+ {
+ /* |x| is tiny (|x| <= 0x1p-54). */
+ uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
+ uflow = svand_z (pg, uoflow, uflow);
+ /* |x| is huge (|x| >= 1024). */
+ oflow = svcmpge (pg, abstop, HugeExp);
+ oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
+ /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
+ or underflow. */
+ special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+ }
+
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
+ svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2);
+ /* z - kd is in [-1, 1] in non-nearest rounding modes. */
+ svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift);
+ svfloat64_t kd = svadd_x (pg, z, shift);
+ svuint64_t ki = svreinterpret_u64 (kd);
+ kd = svsub_x (pg, kd, shift);
+ svfloat64_t r = x;
+ r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi);
+ r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo);
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r = svadd_x (pg, r, xtail);
+ /* 2^(k/N) ~= scale. */
+ svuint64_t idx = svand_x (pg, ki, N_EXP - 1);
+ svuint64_t top
+ = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
+ sbits = svadd_x (pg, sbits, top);
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]);
+ tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp);
+ tmp = svmla_x (pg, r, r2, tmp);
+ svfloat64_t scale = svreinterpret_f64 (sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ z = svmla_x (pg, scale, scale, tmp);
+
+ /* Update result with special and large cases. */
+ if (unlikely (svptest_any (pg, special)))
+ z = sv_call_specialcase (tmp, sbits, ki, z, special);
+
+ /* Handle underflow and overflow. */
+ svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63);
+ svbool_t x_is_neg = svcmpne (pg, sign_bit, 0);
+ svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
+ svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
+ res_uoflow = svreinterpret_f64 (
+ svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
+ z = svsel (oflow, res_uoflow, z);
+ /* Avoid spurious underflow for tiny x. */
+ svfloat64_t res_spurious_uflow
+ = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
+ z = svsel (uflow, res_spurious_uflow, z);
+
+ return z;
+}
+
+static inline double
+pow_sc (double x, double y)
+{
+ uint64_t ix = asuint64 (x);
+ uint64_t iy = asuint64 (y);
+ /* Special cases: |x| or |y| is 0, inf or nan. */
+ if (unlikely (zeroinfnan (iy)))
+ {
+ if (2 * iy == 0)
+ return issignaling_inline (x) ? x + y : 1.0;
+ if (ix == asuint64 (1.0))
+ return issignaling_inline (y) ? x + y : 1.0;
+ if (2 * ix > 2 * asuint64 (INFINITY) || 2 * iy > 2 * asuint64 (INFINITY))
+ return x + y;
+ if (2 * ix == 2 * asuint64 (1.0))
+ return 1.0;
+ if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63))
+ return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */
+ return y * y;
+ }
+ if (unlikely (zeroinfnan (ix)))
+ {
+ double_t x2 = x * x;
+ if (ix >> 63 && checkint (iy) == 1)
+ x2 = -x2;
+ /* Without the barrier some versions of clang hoist the 1/x2 and
+ thus division by zero exception can be signaled spuriously. */
+ return (iy >> 63) ? opt_barrier_double (1 / x2) : x2;
+ }
+ return x;
+}
+
+svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
+{
+ /* This preamble handles special case conditions used in the final scalar
+ fallbacks. It also updates ix and sign_bias, that are used in the core
+ computation too, i.e., exp( y * log (x) ). */
+ svuint64_t vix0 = svreinterpret_u64 (x);
+ svuint64_t viy0 = svreinterpret_u64 (y);
+ svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52);
+
+ /* Negative x cases. */
+ svuint64_t sign_bit = svlsr_m (pg, vix0, 63);
+ svbool_t xisneg = svcmpeq (pg, sign_bit, 1);
+
+ /* Set sign_bias and ix depending on sign of x and nature of y. */
+ svbool_t yisnotint_xisneg = svpfalse_b ();
+ svuint64_t sign_bias = sv_u64 (0);
+ svuint64_t vix = vix0;
+ svuint64_t vtopx1 = vtopx0;
+ if (unlikely (svptest_any (pg, xisneg)))
+ {
+ /* Determine nature of y. */
+ yisnotint_xisneg = sv_isnotint (xisneg, y);
+ svbool_t yisint_xisneg = sv_isint (xisneg, y);
+ svbool_t yisodd_xisneg = sv_isodd (xisneg, y);
+ /* ix set to abs(ix) if y is integer. */
+ vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff);
+ vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff);
+ /* Set to SignBias if x is negative and y is odd. */
+ sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0));
+ }
+
+ /* Special cases of x or y: zero, inf and nan. */
+ svbool_t xspecial = sv_zeroinfnan (pg, vix0);
+ svbool_t yspecial = sv_zeroinfnan (pg, viy0);
+ svbool_t special = svorr_z (pg, xspecial, yspecial);
+
+ /* Small cases of x: |x| < 0x1p-1022. */
+ svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff);
+ svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX);
+ if (unlikely (svptest_any (pg, xsmall)))
+ {
+ /* Normalize subnormal x so exponent becomes negative. */
+ svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0);
+
+ svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52));
+ vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff);
+ vix_norm = svsub_m (xsmall, vix_norm, 52ULL << 52);
+ vix = svsel (topx_is_null, vix_norm, vix);
+ }
+
+ /* y_hi = log(ix, &y_lo). */
+ svfloat64_t vlo;
+ svfloat64_t vhi = sv_log_inline (pg, vix, &vlo);
+
+ /* z = exp(y_hi, y_lo, sign_bias). */
+ svfloat64_t vehi = svmul_x (pg, y, vhi);
+ svfloat64_t velo = svmul_x (pg, y, vlo);
+ svfloat64_t vemi = svmls_x (pg, vehi, y, vhi);
+ velo = svsub_x (pg, velo, vemi);
+ svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias);
+
+ /* Cases of finite y and finite negative x. */
+ vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz);
+
+ /* Cases of zero/inf/nan x or y. */
+ if (unlikely (svptest_any (pg, special)))
+ vz = sv_call2_f64 (pow_sc, x, y, vz, special);
+
+ return vz;
+}
+
+PL_SIG (SV, D, 2, pow)
+PL_TEST_ULP (SV_NAME_D2 (pow), 0.55)
+/* Wide intervals spanning the whole domain but shared between x and y. */
+#define SV_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \
+ PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \
+ PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \
+ PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \
+ PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
+#define EXPAND(str) str##000000000
+#define SHL52(str) EXPAND (str)
+SV_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
+SV_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000)
+SV_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000)
+SV_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000)
+SV_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000)
+SV_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000)
+SV_POW_INTERVAL2 (0, inf, 0, inf, 1000)
+/* x~1 or y~1. */
+SV_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
+SV_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
+SV_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
+/* around estimated argmaxs of ULP error. */
+SV_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
+SV_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
+/* x is negative, y is odd or even integer, or y is real not integer. */
+PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
+/* |x| is inf, y is odd or even integer, or y is real not integer. */
+SV_POW_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
+SV_POW_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
+SV_POW_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
+SV_POW_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
+/* 0.0^y. */
+SV_POW_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
+/* 1.0^y. */
+PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
+PL_TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
diff --git a/pl/math/sv_powf_2u6.c b/pl/math/sv_powf_2u6.c
new file mode 100644
index 000000000000..2db0636aea62
--- /dev/null
+++ b/pl/math/sv_powf_2u6.c
@@ -0,0 +1,360 @@
+/*
+ * Single-precision SVE powf function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* The following data is used in the SVE pow core computation
+ and special case detection. */
+#define Tinvc __v_powf_data.invc
+#define Tlogc __v_powf_data.logc
+#define Texp __v_powf_data.scale
+#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
+#define Shift 0x1.8p52
+#define Norm 0x1p23f /* 0x4b000000. */
+
+/* Overall ULP error bound for pow is 2.6 ulp
+ ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
+static const struct data
+{
+ double log_poly[4];
+ double exp_poly[3];
+ float uflow_bound, oflow_bound, small_bound;
+ uint32_t sign_bias, sign_mask, subnormal_bias, off;
+} data = {
+ /* rel err: 1.5 * 2^-30. Each coefficient is multiplied by the value of
+ V_POWF_EXP2_N. */
+ .log_poly = { -0x1.6ff5daa3b3d7cp+3, 0x1.ec81d03c01aebp+3,
+ -0x1.71547bb43f101p+4, 0x1.7154764a815cbp+5 },
+ /* rel err: 1.69 * 2^-34. */
+ .exp_poly = {
+ 0x1.c6af84b912394p-20, /* A0 / V_POWF_EXP2_N^3. */
+ 0x1.ebfce50fac4f3p-13, /* A1 / V_POWF_EXP2_N^2. */
+ 0x1.62e42ff0c52d6p-6, /* A2 / V_POWF_EXP2_N. */
+ },
+ .uflow_bound = -0x1.2cp+12f, /* -150.0 * V_POWF_EXP2_N. */
+ .oflow_bound = 0x1p+12f, /* 128.0 * V_POWF_EXP2_N. */
+ .small_bound = 0x1p-126f,
+ .off = 0x3f35d000,
+ .sign_bias = SignBias,
+ .sign_mask = 0x80000000,
+ .subnormal_bias = 0x0b800000, /* 23 << 23. */
+};
+
+#define A(i) sv_f64 (d->log_poly[i])
+#define C(i) sv_f64 (d->exp_poly[i])
+
+/* Check if x is an integer. */
+static inline svbool_t
+svisint (svbool_t pg, svfloat32_t x)
+{
+ return svcmpeq (pg, svrintz_z (pg, x), x);
+}
+
+/* Check if x is real not integer valued. */
+static inline svbool_t
+svisnotint (svbool_t pg, svfloat32_t x)
+{
+ return svcmpne (pg, svrintz_z (pg, x), x);
+}
+
+/* Check if x is an odd integer. */
+static inline svbool_t
+svisodd (svbool_t pg, svfloat32_t x)
+{
+ svfloat32_t y = svmul_x (pg, x, 0.5f);
+ return svisnotint (pg, y);
+}
+
+/* Check if zero, inf or nan. */
+static inline svbool_t
+sv_zeroinfnan (svbool_t pg, svuint32_t i)
+{
+ return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
+ 2u * 0x7f800000 - 1);
+}
+
+/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
+ the bit representation of a non-zero finite floating-point value. */
+static inline int
+checkint (uint32_t iy)
+{
+ int e = iy >> 23 & 0xff;
+ if (e < 0x7f)
+ return 0;
+ if (e > 0x7f + 23)
+ return 2;
+ if (iy & ((1 << (0x7f + 23 - e)) - 1))
+ return 0;
+ if (iy & (1 << (0x7f + 23 - e)))
+ return 1;
+ return 2;
+}
+
+/* Check if zero, inf or nan. */
+static inline int
+zeroinfnan (uint32_t ix)
+{
+ return 2 * ix - 1 >= 2u * 0x7f800000 - 1;
+}
+
+/* A scalar subroutine used to fix main power special cases. Similar to the
+ preamble of finite_powf except that we do not update ix and sign_bias. This
+ is done in the preamble of the SVE powf. */
+static inline float
+powf_specialcase (float x, float y, float z)
+{
+ uint32_t ix = asuint (x);
+ uint32_t iy = asuint (y);
+ /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan). */
+ if (unlikely (zeroinfnan (iy)))
+ {
+ if (2 * iy == 0)
+ return issignalingf_inline (x) ? x + y : 1.0f;
+ if (ix == 0x3f800000)
+ return issignalingf_inline (y) ? x + y : 1.0f;
+ if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000)
+ return x + y;
+ if (2 * ix == 2 * 0x3f800000)
+ return 1.0f;
+ if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000))
+ return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf. */
+ return y * y;
+ }
+ if (unlikely (zeroinfnan (ix)))
+ {
+ float_t x2 = x * x;
+ if (ix & 0x80000000 && checkint (iy) == 1)
+ x2 = -x2;
+ return iy & 0x80000000 ? 1 / x2 : x2;
+ }
+ /* We need a return here in case x<0 and y is integer, but all other tests
+ need to be run. */
+ return z;
+}
+
+/* Scalar fallback for special case routines with custom signature. */
+static inline svfloat32_t
+sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
+{
+ svbool_t p = svpfirst (cmp, svpfalse ());
+ while (svptest_any (cmp, p))
+ {
+ float sx1 = svclastb (p, 0, x1);
+ float sx2 = svclastb (p, 0, x2);
+ float elem = svclastb (p, 0, y);
+ elem = powf_specialcase (sx1, sx2, elem);
+ svfloat32_t y2 = sv_f32 (elem);
+ y = svsel (p, y2, y);
+ p = svpnext_b32 (cmp, p);
+ }
+ return y;
+}
+
+/* Compute core for half of the lanes in double precision. */
+static inline svfloat64_t
+sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
+ svfloat64_t y, svuint64_t sign_bias, svfloat64_t *pylogx,
+ const struct data *d)
+{
+ svfloat64_t invc = svld1_gather_index (pg, Tinvc, i);
+ svfloat64_t logc = svld1_gather_index (pg, Tlogc, i);
+
+ /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
+ svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), z, invc);
+ svfloat64_t y0 = svadd_x (pg, logc, svcvt_f64_x (pg, k));
+
+ /* Polynomial to approximate log1p(r)/ln2. */
+ svfloat64_t logx = A (0);
+ logx = svmla_x (pg, A (1), r, logx);
+ logx = svmla_x (pg, A (2), r, logx);
+ logx = svmla_x (pg, A (3), r, logx);
+ logx = svmla_x (pg, y0, r, logx);
+ *pylogx = svmul_x (pg, y, logx);
+
+ /* z - kd is in [-1, 1] in non-nearest rounding modes. */
+ svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
+ svuint64_t ki = svreinterpret_u64 (kd);
+ kd = svsub_x (pg, kd, Shift);
+
+ r = svsub_x (pg, *pylogx, kd);
+
+ /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
+ svuint64_t t
+ = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
+ svuint64_t ski = svadd_x (pg, ki, sign_bias);
+ t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
+ svfloat64_t s = svreinterpret_f64 (t);
+
+ svfloat64_t p = C (0);
+ p = svmla_x (pg, C (1), p, r);
+ p = svmla_x (pg, C (2), p, r);
+ p = svmla_x (pg, s, p, svmul_x (pg, s, r));
+
+ return p;
+}
+
+/* Widen vector to double precision and compute core on both halves of the
+ vector. Lower cost of promotion by considering all lanes active. */
+static inline svfloat32_t
+sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
+ svfloat32_t y, svuint32_t sign_bias, svfloat32_t *pylogx,
+ const struct data *d)
+{
+ const svbool_t ptrue = svptrue_b64 ();
+
+ /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
+ order to perform core computation in double precision. */
+ const svbool_t pg_lo = svunpklo (pg);
+ const svbool_t pg_hi = svunpkhi (pg);
+ svfloat64_t y_lo = svcvt_f64_x (
+ ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
+ svfloat64_t y_hi = svcvt_f64_x (
+ ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
+ svfloat32_t z = svreinterpret_f32 (iz);
+ svfloat64_t z_lo = svcvt_f64_x (
+ ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
+ svfloat64_t z_hi = svcvt_f64_x (
+ ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
+ svuint64_t i_lo = svunpklo (i);
+ svuint64_t i_hi = svunpkhi (i);
+ svint64_t k_lo = svunpklo (k);
+ svint64_t k_hi = svunpkhi (k);
+ svuint64_t sign_bias_lo = svunpklo (sign_bias);
+ svuint64_t sign_bias_hi = svunpkhi (sign_bias);
+
+ /* Compute each part in double precision. */
+ svfloat64_t ylogx_lo, ylogx_hi;
+ svfloat64_t lo = sv_powf_core_ext (pg_lo, i_lo, z_lo, k_lo, y_lo,
+ sign_bias_lo, &ylogx_lo, d);
+ svfloat64_t hi = sv_powf_core_ext (pg_hi, i_hi, z_hi, k_hi, y_hi,
+ sign_bias_hi, &ylogx_hi, d);
+
+ /* Convert back to single-precision and interleave. */
+ svfloat32_t ylogx_lo_32 = svcvt_f32_x (ptrue, ylogx_lo);
+ svfloat32_t ylogx_hi_32 = svcvt_f32_x (ptrue, ylogx_hi);
+ *pylogx = svuzp1 (ylogx_lo_32, ylogx_hi_32);
+ svfloat32_t lo_32 = svcvt_f32_x (ptrue, lo);
+ svfloat32_t hi_32 = svcvt_f32_x (ptrue, hi);
+ return svuzp1 (lo_32, hi_32);
+}
+
+/* Implementation of SVE powf.
+ Provides the same accuracy as AdvSIMD powf, since it relies on the same
+ algorithm. The theoretical maximum error is under 2.60 ULPs.
+ Maximum measured error is 2.56 ULPs:
+ SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
+ want 0x1.fd4b06p+127.
+ Lanes where x or y is flagged by sv_zeroinfnan are passed to
+ sv_call_powf_sc after the vector path has run. */
+svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint32_t vix0 = svreinterpret_u32 (x);
+ svuint32_t viy0 = svreinterpret_u32 (y);
+
+ /* Negative x cases. */
+ svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
+ svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);
+
+ /* Set sign_bias and ix depending on sign of x and nature of y. */
+ svbool_t yisnotint_xisneg = svpfalse_b ();
+ svuint32_t sign_bias = sv_u32 (0);
+ svuint32_t vix = vix0;
+ if (unlikely (svptest_any (pg, xisneg)))
+ {
+ /* Determine nature of y. */
+ yisnotint_xisneg = svisnotint (xisneg, y);
+ svbool_t yisint_xisneg = svisint (xisneg, y);
+ svbool_t yisodd_xisneg = svisodd (xisneg, y);
+ /* ix set to abs(ix) if y is integer. */
+ vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
+ /* Set to SignBias if x is negative and y is odd. */
+ sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
+ }
+
+ /* Special cases of x or y: zero, inf and nan. */
+ svbool_t xspecial = sv_zeroinfnan (pg, vix0);
+ svbool_t yspecial = sv_zeroinfnan (pg, viy0);
+ svbool_t cmp = svorr_z (pg, xspecial, yspecial);
+
+ /* Small cases of x: |x| < 0x1p-126. */
+ svbool_t xsmall = svaclt (pg, x, d->small_bound);
+ if (unlikely (svptest_any (pg, xsmall)))
+ {
+ /* Normalize subnormal x so exponent becomes negative. */
+ svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
+ vix_norm = svand_x (xsmall, vix_norm, 0x7fffffff);
+ vix_norm = svsub_x (xsmall, vix_norm, d->subnormal_bias);
+ vix = svsel (xsmall, vix_norm, vix);
+ }
+ /* Part of core computation carried in working precision. */
+ svuint32_t tmp = svsub_x (pg, vix, d->off);
+ svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
+ V_POWF_LOG2_N - 1);
+ svuint32_t top = svand_x (pg, tmp, 0xff800000);
+ svuint32_t iz = svsub_x (pg, vix, top);
+ svint32_t k
+ = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
+
+ /* Compute core in extended precision and return intermediate ylogx results to
+ handle cases of underflow and overflow in exp. */
+ svfloat32_t ylogx;
+ svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
+
+ /* Handle exp special cases of underflow and overflow. */
+ svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
+ svfloat32_t ret_oflow
+ = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
+ svfloat32_t ret_uflow = svreinterpret_f32 (sign);
+ ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
+ ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
+
+ /* Cases of finite y and finite negative x: lanes flagged in
+ yisnotint_xisneg (negative x, non-integer y) are set to NaN. */
+ ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
+
+ if (unlikely (svptest_any (pg, cmp)))
+ return sv_call_powf_sc (x, y, ret, cmp);
+
+ return ret;
+}
+
+PL_SIG (SV, F, 2, pow)
+PL_TEST_ULP (SV_NAME_F2 (pow), 2.06)
+/* Wide intervals spanning the whole domain but shared between x and y. */
+#define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n) \
+ PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n) \
+ PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n) \
+ PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n) \
+ PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n)
+SV_POWF_INTERVAL2 (0, 0x1p-126, 0, inf, 40000)
+SV_POWF_INTERVAL2 (0x1p-126, 1, 0, inf, 50000)
+SV_POWF_INTERVAL2 (1, inf, 0, inf, 50000)
+/* x~1 or y~1. */
+SV_POWF_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
+SV_POWF_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
+SV_POWF_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
+/* around estimated argmaxs of ULP error. */
+SV_POWF_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
+SV_POWF_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
+/* x is negative, y is odd or even integer, or y is real not integer. */
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
+/* |x| is inf, y is odd or even integer, or y is real not integer. */
+SV_POWF_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
+SV_POWF_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
+SV_POWF_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
+SV_POWF_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
+/* 0.0^y. */
+SV_POWF_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
+/* 1.0^y. */
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
+PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
diff --git a/pl/math/sv_powi.c b/pl/math/sv_powi.c
index 1bb0eb3d3498..e53bf2195533 100644
--- a/pl/math/sv_powi.c
+++ b/pl/math/sv_powi.c
@@ -6,23 +6,22 @@
*/
#include "sv_math.h"
-#if SV_SUPPORTED
/* Optimized double-precision vector powi (double base, long integer power).
powi is developed for environments in which accuracy is of much less
importance than performance, hence we provide no estimate for worst-case
error. */
svfloat64_t
-__sv_powi_x (svfloat64_t as, svint64_t ns, svbool_t p)
+_ZGVsMxvv_powk (svfloat64_t as, svint64_t ns, svbool_t p)
{
/* Compute powi by successive squaring, right to left. */
- svfloat64_t acc = svdup_n_f64 (1.0);
- svbool_t want_recip = svcmplt_n_s64 (p, ns, 0);
- svuint64_t ns_abs = svreinterpret_u64_s64 (svabs_s64_x (p, ns));
+ svfloat64_t acc = sv_f64 (1.0);
+ svbool_t want_recip = svcmplt (p, ns, 0);
+ svuint64_t ns_abs = svreinterpret_u64 (svabs_x (p, ns));
/* We use a max to avoid needing to check whether any lane != 0 on each
iteration. */
- uint64_t max_n = svmaxv_u64 (p, ns_abs);
+ uint64_t max_n = svmaxv (p, ns_abs);
svfloat64_t c = as;
/* Successively square c, and use merging predication (_m) to determine
@@ -30,24 +29,20 @@ __sv_powi_x (svfloat64_t as, svint64_t ns, svbool_t p)
iteration. */
while (true)
{
- svbool_t px = svcmpeq_n_u64 (p, svand_n_u64_x (p, ns_abs, 1ull), 1ull);
- acc = svmul_f64_m (px, acc, c);
+ svbool_t px = svcmpeq (p, svand_x (p, ns_abs, 1ull), 1ull);
+ acc = svmul_m (px, acc, c);
max_n >>= 1;
if (max_n == 0)
break;
- ns_abs = svlsr_n_u64_x (p, ns_abs, 1);
- c = svmul_f64_x (p, c, c);
+ ns_abs = svlsr_x (p, ns_abs, 1);
+ c = svmul_x (p, c, c);
}
/* Negative powers are handled by computing the abs(n) version and then
taking the reciprocal. */
if (svptest_any (want_recip, want_recip))
- acc = svdivr_n_f64_m (want_recip, acc, 1.0);
+ acc = svdivr_m (want_recip, acc, 1.0);
return acc;
}
-
-strong_alias (__sv_powi_x, _ZGVsMxvv_powk)
-
-#endif // SV_SUPPORTED
diff --git a/pl/math/sv_powif.c b/pl/math/sv_powif.c
index d0567e393927..7e032fd86a20 100644
--- a/pl/math/sv_powif.c
+++ b/pl/math/sv_powif.c
@@ -6,23 +6,22 @@
*/
#include "sv_math.h"
-#if SV_SUPPORTED
/* Optimized single-precision vector powi (float base, integer power).
powi is developed for environments in which accuracy is of much less
importance than performance, hence we provide no estimate for worst-case
error. */
svfloat32_t
-__sv_powif_x (svfloat32_t as, svint32_t ns, svbool_t p)
+_ZGVsMxvv_powi (svfloat32_t as, svint32_t ns, svbool_t p)
{
/* Compute powi by successive squaring, right to left. */
- svfloat32_t acc = svdup_n_f32 (1.f);
- svbool_t want_recip = svcmplt_n_s32 (p, ns, 0);
- svuint32_t ns_abs = svreinterpret_u32_s32 (svabs_s32_x (p, ns));
+ svfloat32_t acc = sv_f32 (1.f);
+ svbool_t want_recip = svcmplt (p, ns, 0);
+ svuint32_t ns_abs = svreinterpret_u32 (svabs_x (p, ns));
/* We use a max to avoid needing to check whether any lane != 0 on each
iteration. */
- uint32_t max_n = svmaxv_u32 (p, ns_abs);
+ uint32_t max_n = svmaxv (p, ns_abs);
svfloat32_t c = as;
/* Successively square c, and use merging predication (_m) to determine
@@ -30,25 +29,20 @@ __sv_powif_x (svfloat32_t as, svint32_t ns, svbool_t p)
iteration. */
while (true)
{
- svbool_t px = svcmpeq_n_u32 (p, svand_n_u32_x (p, ns_abs, 1), 1);
- acc = svmul_f32_m (px, acc, c);
+ svbool_t px = svcmpeq (p, svand_x (p, ns_abs, 1), 1);
+ acc = svmul_m (px, acc, c);
max_n >>= 1;
if (max_n == 0)
break;
- ns_abs = svlsr_n_u32_x (p, ns_abs, 1);
- c = svmul_f32_x (p, c, c);
+ ns_abs = svlsr_x (p, ns_abs, 1);
+ c = svmul_x (p, c, c);
}
/* Negative powers are handled by computing the abs(n) version and then
taking the reciprocal. */
if (svptest_any (want_recip, want_recip))
- acc = svdivr_n_f32_m (want_recip, acc, 1.0f);
+ acc = svdivr_m (want_recip, acc, 1.0f);
return acc;
}
-
-/* Note no trailing f for ZGV... name - 64-bit integer version is powk. */
-strong_alias (__sv_powif_x, _ZGVsMxvv_powi)
-
-#endif // SV_SUPPORTED
diff --git a/pl/math/sv_sin_3u.c b/pl/math/sv_sin_3u.c
deleted file mode 100644
index 3fee08061918..000000000000
--- a/pl/math/sv_sin_3u.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Double-precision SVE sin(x) function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "sv_math.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#if SV_SUPPORTED
-
-#define InvPi (sv_f64 (0x1.45f306dc9c883p-2))
-#define HalfPi (sv_f64 (0x1.921fb54442d18p+0))
-#define InvPio2 (sv_f64 (0x1.45f306dc9c882p-1))
-#define NegPio2_1 (sv_f64 (-0x1.921fb50000000p+0))
-#define NegPio2_2 (sv_f64 (-0x1.110b460000000p-26))
-#define NegPio2_3 (sv_f64 (-0x1.1a62633145c07p-54))
-#define Shift (sv_f64 (0x1.8p52))
-#define RangeVal (sv_f64 (0x1p23))
-#define AbsMask (0x7fffffffffffffff)
-
-static NOINLINE sv_f64_t
-__sv_sin_specialcase (sv_f64_t x, sv_f64_t y, svbool_t cmp)
-{
- return sv_call_f64 (sin, x, y, cmp);
-}
-
-/* A fast SVE implementation of sin based on trigonometric
- instructions (FTMAD, FTSSEL, FTSMUL).
- Maximum observed error in 2.52 ULP:
- __sv_sin(0x1.2d2b00df69661p+19) got 0x1.10ace8f3e786bp-40
- want 0x1.10ace8f3e7868p-40. */
-sv_f64_t
-__sv_sin_x (sv_f64_t x, const svbool_t pg)
-{
- sv_f64_t n, r, r2, y;
- sv_u64_t sign;
- svbool_t cmp;
-
- r = sv_as_f64_u64 (svand_n_u64_x (pg, sv_as_u64_f64 (x), AbsMask));
- sign = svand_n_u64_x (pg, sv_as_u64_f64 (x), ~AbsMask);
- cmp = svcmpge_u64 (pg, sv_as_u64_f64 (r), sv_as_u64_f64 (RangeVal));
-
- /* n = rint(|x|/(pi/2)). */
- sv_f64_t q = sv_fma_f64_x (pg, InvPio2, r, Shift);
- n = svsub_f64_x (pg, q, Shift);
-
- /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */
- r = sv_fma_f64_x (pg, NegPio2_1, n, r);
- r = sv_fma_f64_x (pg, NegPio2_2, n, r);
- r = sv_fma_f64_x (pg, NegPio2_3, n, r);
-
- /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */
- sv_f64_t f = svtssel_f64 (r, sv_as_u64_f64 (q));
-
- /* sin(r) poly approx. */
- r2 = svtsmul_f64 (r, sv_as_u64_f64 (q));
- y = sv_f64 (0.0);
- y = svtmad_f64 (y, r2, 7);
- y = svtmad_f64 (y, r2, 6);
- y = svtmad_f64 (y, r2, 5);
- y = svtmad_f64 (y, r2, 4);
- y = svtmad_f64 (y, r2, 3);
- y = svtmad_f64 (y, r2, 2);
- y = svtmad_f64 (y, r2, 1);
- y = svtmad_f64 (y, r2, 0);
-
- /* Apply factor. */
- y = svmul_f64_x (pg, f, y);
-
- /* sign = y^sign. */
- y = sv_as_f64_u64 (sveor_u64_x (pg, sv_as_u64_f64 (y), sign));
-
- /* No need to pass pg to specialcase here since cmp is a strict subset,
- guaranteed by the cmpge above. */
- if (unlikely (svptest_any (pg, cmp)))
- return __sv_sin_specialcase (x, y, cmp);
- return y;
-}
-
-PL_ALIAS (__sv_sin_x, _ZGVsMxv_sin)
-
-PL_SIG (SV, D, 1, sin, -3.1, 3.1)
-PL_TEST_ULP (__sv_sin, 2.03)
-PL_TEST_INTERVAL (__sv_sin, 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (__sv_sin, 0x1p-4, 0x1p4, 500000)
-#endif
diff --git a/pl/math/sv_sin_3u5.c b/pl/math/sv_sin_3u5.c
new file mode 100644
index 000000000000..a81f3fc80f3d
--- /dev/null
+++ b/pl/math/sv_sin_3u5.c
@@ -0,0 +1,96 @@
+/*
+ * Double-precision SVE sin(x) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ double inv_pi, pi_1, pi_2, pi_3, shift, range_val;
+ double poly[7];
+} data = {
+ .poly = { -0x1.555555555547bp-3, 0x1.1111111108a4dp-7, -0x1.a01a019936f27p-13,
+ 0x1.71de37a97d93ep-19, -0x1.ae633919987c6p-26,
+ 0x1.60e277ae07cecp-33, -0x1.9e9540300a1p-41, },
+
+ .inv_pi = 0x1.45f306dc9c883p-2,
+ .pi_1 = 0x1.921fb54442d18p+1,
+ .pi_2 = 0x1.1a62633145c06p-53,
+ .pi_3 = 0x1.c1cd129024e09p-106,
+ .shift = 0x1.8p52,
+ .range_val = 0x1p23,
+};
+
+#define C(i) sv_f64 (d->poly[i])
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
+{
+ return sv_call_f64 (sin, x, y, cmp);
+}
+
+/* A fast SVE implementation of sin.
+ Maximum observed error in [-pi/2, pi/2], where argument is not reduced,
+ is 2.87 ULP:
+ _ZGVsMxv_sin (0x1.921d5c6a07142p+0) got 0x1.fffffffa7dc02p-1
+ want 0x1.fffffffa7dc05p-1
+ Maximum observed error in the entire non-special domain ([-2^23, 2^23])
+ is 3.22 ULP:
+ _ZGVsMxv_sin (0x1.5702447b6f17bp+22) got 0x1.ffdcd125c84fbp-3
+ want 0x1.ffdcd125c84f8p-3.
+ Lanes for which |x| <= range_val (2^23) does not hold are handed to
+ special_case together with the vector result for the remaining lanes. */
+svfloat64_t SV_NAME_D1 (sin) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Load some values in quad-word chunks to minimise memory access.
+ This relies on inv_pi/pi_1 and pi_2/pi_3 being adjacent members of
+ struct data, so each pair can be addressed by lane index 0 or 1. */
+ const svbool_t ptrue = svptrue_b64 ();
+ svfloat64_t shift = sv_f64 (d->shift);
+ svfloat64_t inv_pi_and_pi1 = svld1rq (ptrue, &d->inv_pi);
+ svfloat64_t pi2_and_pi3 = svld1rq (ptrue, &d->pi_2);
+
+ /* n = rint(x/pi), obtained by adding and then subtracting shift. The
+ lowest bit of the shifted value is moved to the sign-bit position to
+ record whether n is odd. */
+ svfloat64_t n = svmla_lane (shift, x, inv_pi_and_pi1, 0);
+ svuint64_t odd = svlsl_x (pg, svreinterpret_u64 (n), 63);
+ n = svsub_x (pg, n, shift);
+
+ /* r = x - n*pi (range reduction into -pi/2 .. pi/2), with pi split
+ into the three parts pi_1, pi_2 and pi_3. */
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, inv_pi_and_pi1, 1);
+ r = svmls_lane (r, n, pi2_and_pi3, 0);
+ r = svmls_lane (r, n, pi2_and_pi3, 1);
+
+ /* sin(r) poly approx. */
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t r3 = svmul_x (pg, r2, r);
+ svfloat64_t r4 = svmul_x (pg, r2, r2);
+
+ svfloat64_t t1 = svmla_x (pg, C (4), C (5), r2);
+ svfloat64_t t2 = svmla_x (pg, C (2), C (3), r2);
+ svfloat64_t t3 = svmla_x (pg, C (0), C (1), r2);
+
+ svfloat64_t y = svmla_x (pg, t1, C (6), r4);
+ y = svmla_x (pg, t2, y, r4);
+ y = svmla_x (pg, t3, y, r4);
+ y = svmla_x (pg, r, y, r3);
+
+ svbool_t cmp = svacle (pg, x, d->range_val);
+ cmp = svnot_z (pg, cmp);
+ if (unlikely (svptest_any (pg, cmp)))
+ return special_case (x,
+ svreinterpret_f64 (sveor_z (
+ svnot_z (pg, cmp), svreinterpret_u64 (y), odd)),
+ cmp);
+
+ /* Flip the sign of y in lanes where n is odd. */
+ return svreinterpret_f64 (sveor_z (pg, svreinterpret_u64 (y), odd));
+}
+
+PL_SIG (SV, D, 1, sin, -3.1, 3.1)
+PL_TEST_ULP (SV_NAME_D1 (sin), 2.73)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0, 0x1p23, 1000000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sin), 0x1p23, inf, 10000)
diff --git a/pl/math/sv_sincos_3u5.c b/pl/math/sv_sincos_3u5.c
new file mode 100644
index 000000000000..f73550082d5b
--- /dev/null
+++ b/pl/math/sv_sincos_3u5.c
@@ -0,0 +1,61 @@
+/*
+ * Double-precision vector sincos function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Define _GNU_SOURCE in order to include sincos declaration. If building
+ pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
+ be linked against the scalar sincosf from math/. */
+#define _GNU_SOURCE
+#include <math.h>
+#undef _GNU_SOURCE
+
+#include "sv_sincos_common.h"
+#include "sv_math.h"
+#include "pl_test.h"
+
+static void NOINLINE
+special_case (svfloat64_t x, svbool_t special, double *out_sin,
+ double *out_cos)
+{
+ svbool_t p = svptrue_pat_b64 (SV_VL1);
+ for (int i = 0; i < svcntd (); i++)
+ {
+ if (svptest_any (special, p))
+ sincos (svlastb (p, x), out_sin + i, out_cos + i);
+ p = svpnext_b64 (svptrue_b64 (), p);
+ }
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate polynomials.
+ Largest observed error is for sin, 3.22 ULP:
+ sv_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+ want -0x1.ffe9537d5dbb4p-3. */
+void
+_ZGVsMxvl8l8_sincos (svfloat64_t x, double *out_sin, double *out_cos,
+ svbool_t pg)
+{
+ const struct sv_sincos_data *d = ptr_barrier (&sv_sincos_data);
+ svbool_t special = check_ge_rangeval (pg, x, d);
+
+ svfloat64x2_t sc = sv_sincos_inline (pg, x, d);
+
+ svst1 (pg, out_sin, svget2 (sc, 0));
+ svst1 (pg, out_cos, svget2 (sc, 1));
+
+ if (unlikely (svptest_any (pg, special)))
+ special_case (x, special, out_sin, out_cos);
+}
+
+PL_TEST_ULP (_ZGVsMxv_sincos_sin, 2.73)
+PL_TEST_ULP (_ZGVsMxv_sincos_cos, 2.73)
+#define SV_SINCOS_INTERVAL(lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVsMxv_sincos_sin, lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVsMxv_sincos_cos, lo, hi, n)
+SV_SINCOS_INTERVAL (0, 0x1p23, 500000)
+SV_SINCOS_INTERVAL (-0, -0x1p23, 500000)
+SV_SINCOS_INTERVAL (0x1p23, inf, 10000)
+SV_SINCOS_INTERVAL (-0x1p23, -inf, 10000)
diff --git a/pl/math/sv_sincos_common.h b/pl/math/sv_sincos_common.h
new file mode 100644
index 000000000000..f7b58deb90bd
--- /dev/null
+++ b/pl/math/sv_sincos_common.h
@@ -0,0 +1,85 @@
+/*
+ * Core approximation for double-precision vector sincos
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+
+static const struct sv_sincos_data
+{
+ double sin_poly[7], cos_poly[6], pio2[3];
+ double inv_pio2, shift, range_val;
+} sv_sincos_data = {
+ .inv_pio2 = 0x1.45f306dc9c882p-1,
+ .pio2 = { 0x1.921fb50000000p+0, 0x1.110b460000000p-26,
+ 0x1.1a62633145c07p-54 },
+ .shift = 0x1.8p52,
+ .sin_poly = { /* Computed using Remez in [-pi/2, pi/2]. */
+ -0x1.555555555547bp-3, 0x1.1111111108a4dp-7,
+ -0x1.a01a019936f27p-13, 0x1.71de37a97d93ep-19,
+ -0x1.ae633919987c6p-26, 0x1.60e277ae07cecp-33,
+ -0x1.9e9540300a1p-41 },
+ .cos_poly = { /* Computed using Remez in [-pi/4, pi/4]. */
+ 0x1.555555555554cp-5, -0x1.6c16c16c1521fp-10,
+ 0x1.a01a019cbf62ap-16, -0x1.27e4f812b681ep-22,
+ 0x1.1ee9f152a57cdp-29, -0x1.8fb131098404bp-37 },
+ .range_val = 0x1p23, };
+
+static inline svbool_t
+check_ge_rangeval (svbool_t pg, svfloat64_t x, const struct sv_sincos_data *d)
+{
+ svbool_t in_bounds = svaclt (pg, x, d->range_val);
+ return svnot_z (pg, in_bounds);
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate polynomials.
+ Returns a tuple whose element 0 is the sin result and element 1 is the cos
+ result.
+ Largest observed error is for sin, 3.22 ULP:
+ sv_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+ want -0x1.ffe9537d5dbb4p-3. */
+static inline svfloat64x2_t
+sv_sincos_inline (svbool_t pg, svfloat64_t x, const struct sv_sincos_data *d)
+{
+ /* q = nearest integer to 2 * x / pi. */
+ svfloat64_t q = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_pio2),
+ d->shift);
+ svint64_t n = svcvt_s64_x (pg, q);
+
+ /* Reduce x such that r is in [ -pi/4, pi/4 ]. */
+ svfloat64_t r = x;
+ r = svmls_x (pg, r, q, d->pio2[0]);
+ r = svmls_x (pg, r, q, d->pio2[1]);
+ r = svmls_x (pg, r, q, d->pio2[2]);
+
+ svfloat64_t r2 = svmul_x (pg, r, r), r3 = svmul_x (pg, r2, r),
+ r4 = svmul_x (pg, r2, r2);
+
+ /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */
+ svfloat64_t s = sv_pw_horner_6_f64_x (pg, r2, r4, d->sin_poly);
+ s = svmla_x (pg, r, r3, s);
+
+ /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */
+ svfloat64_t c = sv_pw_horner_5_f64_x (pg, r2, r4, d->cos_poly);
+ c = svmad_x (pg, c, r2, -0.5);
+ c = svmad_x (pg, c, r2, 1);
+
+ svuint64_t un = svreinterpret_u64 (n);
+ /* If odd quadrant, swap cos and sin. Note the predicate below is true
+ when n is even, i.e. in the lanes where s and c keep their roles. */
+ svbool_t swap = svcmpeq (pg, svlsl_x (pg, un, 63), 0);
+ svfloat64_t ss = svsel (swap, s, c);
+ svfloat64_t cc = svsel (swap, c, s);
+
+ /* Fix signs according to quadrant.
+ ss = asdouble(asuint64(ss) ^ ((n & 2) << 62))
+ cc = asdouble(asuint64(cc) ^ (((n + 1) & 2) << 62)). */
+ svuint64_t sin_sign = svlsl_x (pg, svand_x (pg, un, 2), 62);
+ svuint64_t cos_sign = svlsl_x (
+ pg, svand_x (pg, svreinterpret_u64 (svadd_x (pg, n, 1)), 2), 62);
+ ss = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ss), sin_sign));
+ cc = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (cc), cos_sign));
+
+ return svcreate2 (ss, cc);
+}
diff --git a/pl/math/sv_sincosf_1u8.c b/pl/math/sv_sincosf_1u8.c
new file mode 100644
index 000000000000..c335de8d3dbb
--- /dev/null
+++ b/pl/math/sv_sincosf_1u8.c
@@ -0,0 +1,62 @@
+/*
+ * Single-precision vector sincos function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Define _GNU_SOURCE in order to include sincosf declaration. If building
+ pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
+ be linked against the scalar sincosf from math/. */
+#define _GNU_SOURCE
+#include <math.h>
+#undef _GNU_SOURCE
+
+#include "sv_sincosf_common.h"
+#include "sv_math.h"
+#include "pl_test.h"
+
+static void NOINLINE
+special_case (svfloat32_t x, svbool_t special, float *out_sin, float *out_cos)
+{
+ svbool_t p = svptrue_pat_b32 (SV_VL1);
+ for (int i = 0; i < svcntw (); i++)
+ {
+ if (svptest_any (special, p))
+ sincosf (svlastb (p, x), out_sin + i, out_cos + i);
+ p = svpnext_b32 (svptrue_b32 (), p);
+ }
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate low-order
+ polynomials.
+ Worst-case error for sin is 1.67 ULP:
+ sv_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+ Worst-case error for cos is 1.81 ULP:
+ sv_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */
+void
+_ZGVsMxvl4l4_sincosf (svfloat32_t x, float *out_sin, float *out_cos,
+ svbool_t pg)
+{
+ const struct sv_sincosf_data *d = ptr_barrier (&sv_sincosf_data);
+ svbool_t special = check_ge_rangeval (pg, x, d);
+
+ svfloat32x2_t sc = sv_sincosf_inline (pg, x, d);
+
+ svst1_f32 (pg, out_sin, svget2 (sc, 0));
+ svst1_f32 (pg, out_cos, svget2 (sc, 1));
+
+ if (unlikely (svptest_any (pg, special)))
+ special_case (x, special, out_sin, out_cos);
+}
+
+PL_TEST_ULP (_ZGVsMxv_sincosf_sin, 1.17)
+PL_TEST_ULP (_ZGVsMxv_sincosf_cos, 1.31)
+#define SV_SINCOSF_INTERVAL(lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVsMxv_sincosf_sin, lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVsMxv_sincosf_cos, lo, hi, n)
+SV_SINCOSF_INTERVAL (0, 0x1p20, 500000)
+SV_SINCOSF_INTERVAL (-0, -0x1p20, 500000)
+SV_SINCOSF_INTERVAL (0x1p20, inf, 10000)
+SV_SINCOSF_INTERVAL (-0x1p20, -inf, 10000)
diff --git a/pl/math/sv_sincosf_common.h b/pl/math/sv_sincosf_common.h
new file mode 100644
index 000000000000..714e996443b3
--- /dev/null
+++ b/pl/math/sv_sincosf_common.h
@@ -0,0 +1,81 @@
+/*
+ * Core approximation for single-precision vector sincos
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+
+const static struct sv_sincosf_data
+{
+ float poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val;
+} sv_sincosf_data = {
+ .poly_sin = { /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4]. */
+ -0x1.555546p-3, 0x1.11076p-7, -0x1.994eb4p-13 },
+ .poly_cos = { /* Generated using Remez, even coeffs only, in [-pi/4, pi/4]. */
+ 0x1.55554ap-5, -0x1.6c0c1ap-10, 0x1.99e0eep-16 },
+ .pio2 = { 0x1.921fb6p+0f, -0x1.777a5cp-25f, -0x1.ee59dap-50f },
+ .inv_pio2 = 0x1.45f306p-1f,
+ .shift = 0x1.8p23,
+ .range_val = 0x1p20
+};
+
+static inline svbool_t
+check_ge_rangeval (svbool_t pg, svfloat32_t x, const struct sv_sincosf_data *d)
+{
+ svbool_t in_bounds = svaclt (pg, x, d->range_val);
+ return svnot_z (pg, in_bounds);
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate low-order
+ polynomials.
+ Returns a tuple whose element 0 is the sin result and element 1 is the cos
+ result.
+ Worst-case error for sin is 1.67 ULP:
+ sv_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+ Worst-case error for cos is 1.81 ULP:
+ sv_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */
+static inline svfloat32x2_t
+sv_sincosf_inline (svbool_t pg, svfloat32_t x, const struct sv_sincosf_data *d)
+{
+ /* n = rint ( x / (pi/2) ). q holds the rounded value as a float, n the
+ same value converted to integer. */
+ svfloat32_t q = svmla_x (pg, sv_f32 (d->shift), x, d->inv_pio2);
+ q = svsub_x (pg, q, d->shift);
+ svint32_t n = svcvt_s32_x (pg, q);
+
+ /* Reduce x such that r is in [ -pi/4, pi/4 ]. */
+ svfloat32_t r = x;
+ r = svmls_x (pg, r, q, d->pio2[0]);
+ r = svmls_x (pg, r, q, d->pio2[1]);
+ r = svmls_x (pg, r, q, d->pio2[2]);
+
+ /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */
+ svfloat32_t r2 = svmul_x (pg, r, r), r3 = svmul_x (pg, r, r2);
+ svfloat32_t s = svmla_x (pg, sv_f32 (d->poly_sin[1]), r2, d->poly_sin[2]);
+ s = svmad_x (pg, r2, s, d->poly_sin[0]);
+ s = svmla_x (pg, r, r3, s);
+
+ /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */
+ svfloat32_t r4 = svmul_x (pg, r2, r2);
+ svfloat32_t p = svmla_x (pg, sv_f32 (d->poly_cos[1]), r2, d->poly_cos[2]);
+ svfloat32_t c = svmad_x (pg, sv_f32 (d->poly_cos[0]), r2, -0.5);
+ c = svmla_x (pg, c, r4, p);
+ c = svmad_x (pg, r2, c, 1);
+
+ svuint32_t un = svreinterpret_u32 (n);
+ /* If odd quadrant, swap cos and sin. Note the predicate below is true
+ when n is even, i.e. in the lanes where s and c keep their roles. */
+ svbool_t swap = svcmpeq (pg, svlsl_x (pg, un, 31), 0);
+ svfloat32_t ss = svsel (swap, s, c);
+ svfloat32_t cc = svsel (swap, c, s);
+
+ /* Fix signs according to quadrant.
+ ss = asfloat(asuint(ss) ^ ((n & 2) << 30))
+ cc = asfloat(asuint(cc) ^ (((n + 1) & 2) << 30)). */
+ svuint32_t sin_sign = svlsl_x (pg, svand_x (pg, un, 2), 30);
+ svuint32_t cos_sign = svlsl_x (
+ pg, svand_x (pg, svreinterpret_u32 (svadd_x (pg, n, 1)), 2), 30);
+ ss = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ss), sin_sign));
+ cc = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (cc), cos_sign));
+
+ return svcreate2 (ss, cc);
+}
diff --git a/pl/math/sv_sinf_1u9.c b/pl/math/sv_sinf_1u9.c
index 9184ccd3cf0c..675d7b2480f7 100644
--- a/pl/math/sv_sinf_1u9.c
+++ b/pl/math/sv_sinf_1u9.c
@@ -9,23 +9,31 @@
#include "pl_sig.h"
#include "pl_test.h"
-#if SV_SUPPORTED
-
-#define A3 (sv_f32 (__sv_sinf_data.coeffs[3]))
-#define A5 (sv_f32 (__sv_sinf_data.coeffs[2]))
-#define A7 (sv_f32 (__sv_sinf_data.coeffs[1]))
-#define A9 (sv_f32 (__sv_sinf_data.coeffs[0]))
+static const struct data
+{
+ float poly[4];
+ /* Pi-related values to be loaded as one quad-word and used with
+ svmla_lane. */
+ float negpi1, negpi2, negpi3, invpi;
+ float shift;
+} data = {
+ .poly = {
+ /* Non-zero coefficients from the degree 9 Taylor series expansion of
+ sin. */
+ -0x1.555548p-3f, 0x1.110df4p-7f, -0x1.9f42eap-13f, 0x1.5b2e76p-19f
+ },
+ .negpi1 = -0x1.921fb6p+1f,
+ .negpi2 = 0x1.777a5cp-24f,
+ .negpi3 = 0x1.ee59dap-49f,
+ .invpi = 0x1.45f306p-2f,
+ .shift = 0x1.8p+23f
+};
-#define NegPi1 (sv_f32 (-0x1.921fb6p+1f))
-#define NegPi2 (sv_f32 (0x1.777a5cp-24f))
-#define NegPi3 (sv_f32 (0x1.ee59dap-49f))
-#define RangeVal (sv_f32 (0x1p20f))
-#define InvPi (sv_f32 (0x1.45f306p-2f))
-#define Shift (sv_f32 (0x1.8p+23f))
-#define AbsMask (0x7fffffff)
+#define RangeVal 0x49800000 /* asuint32 (0x1p20f). */
+#define C(i) sv_f32 (d->poly[i])
-static NOINLINE sv_f32_t
-__sv_sinf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
{
return sv_call_f32 (sinf, x, y, cmp);
}
@@ -34,51 +42,52 @@ __sv_sinf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
Maximum error: 1.89 ULPs.
This maximum error is achieved at multiple values in [-2^18, 2^18]
but one example is:
- __sv_sinf(0x1.9247a4p+0) got 0x1.fffff6p-1 want 0x1.fffffap-1. */
-sv_f32_t
-__sv_sinf_x (sv_f32_t x, const svbool_t pg)
+ SV_NAME_F1 (sin)(0x1.9247a4p+0) got 0x1.fffff6p-1 want 0x1.fffffap-1. */
+svfloat32_t SV_NAME_F1 (sin) (svfloat32_t x, const svbool_t pg)
{
- sv_f32_t n, r, r2, y;
- sv_u32_t sign, odd;
- svbool_t cmp;
+ const struct data *d = ptr_barrier (&data);
- r = sv_as_f32_u32 (svand_n_u32_x (pg, sv_as_u32_f32 (x), AbsMask));
- sign = svand_n_u32_x (pg, sv_as_u32_f32 (x), ~AbsMask);
- cmp = svcmpge_u32 (pg, sv_as_u32_f32 (r), sv_as_u32_f32 (RangeVal));
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t sign
+ = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax));
+ svbool_t cmp = svcmpge (pg, svreinterpret_u32 (ax), RangeVal);
+
+ /* pi_vals are a quad-word of helper values - the first 3 elements contain
+ -pi in extended precision, the last contains 1 / pi. */
+ svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->negpi1);
/* n = rint(|x|/pi). */
- n = sv_fma_f32_x (pg, InvPi, r, Shift);
- odd = svlsl_n_u32_x (pg, sv_as_u32_f32 (n), 31);
- n = svsub_f32_x (pg, n, Shift);
+ svfloat32_t n = svmla_lane (sv_f32 (d->shift), ax, pi_vals, 3);
+ svuint32_t odd = svlsl_x (pg, svreinterpret_u32 (n), 31);
+ n = svsub_x (pg, n, d->shift);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
- r = sv_fma_f32_x (pg, NegPi1, n, r);
- r = sv_fma_f32_x (pg, NegPi2, n, r);
- r = sv_fma_f32_x (pg, NegPi3, n, r);
+ svfloat32_t r;
+ r = svmla_lane (ax, n, pi_vals, 0);
+ r = svmla_lane (r, n, pi_vals, 1);
+ r = svmla_lane (r, n, pi_vals, 2);
/* sin(r) approx using a degree 9 polynomial from the Taylor series
expansion. Note that only the odd terms of this are non-zero. */
- r2 = svmul_f32_x (pg, r, r);
- y = sv_fma_f32_x (pg, A9, r2, A7);
- y = sv_fma_f32_x (pg, y, r2, A5);
- y = sv_fma_f32_x (pg, y, r2, A3);
- y = sv_fma_f32_x (pg, svmul_f32_x (pg, y, r2), r, r);
+ svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t y;
+ y = svmla_x (pg, C (2), r2, C (3));
+ y = svmla_x (pg, C (1), r2, y);
+ y = svmla_x (pg, C (0), r2, y);
+ y = svmla_x (pg, r, r, svmul_x (pg, y, r2));
/* sign = y^sign^odd. */
- y = sv_as_f32_u32 (
- sveor_u32_x (pg, sv_as_u32_f32 (y), sveor_u32_x (pg, sign, odd)));
+ sign = sveor_x (pg, sign, odd);
- /* No need to pass pg to specialcase here since cmp is a strict subset,
- guaranteed by the cmpge above. */
if (unlikely (svptest_any (pg, cmp)))
- return __sv_sinf_specialcase (x, y, cmp);
- return y;
+ return special_case (x,
+ svreinterpret_f32 (sveor_x (
+ svnot_z (pg, cmp), svreinterpret_u32 (y), sign)),
+ cmp);
+ return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
}
-PL_ALIAS (__sv_sinf_x, _ZGVsMxv_sinf)
-
PL_SIG (SV, F, 1, sin, -3.1, 3.1)
-PL_TEST_ULP (__sv_sinf, 1.40)
-PL_TEST_INTERVAL (__sv_sinf, 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (__sv_sinf, 0x1p-4, 0x1p4, 500000)
-#endif
+PL_TEST_ULP (SV_NAME_F1 (sin), 1.40)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0, 0x1p23, 1000000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sin), 0x1p23, inf, 10000)
diff --git a/pl/math/sv_sinf_poly_data.c b/pl/math/sv_sinf_poly_data.c
deleted file mode 100644
index 1e1ab5e48df1..000000000000
--- a/pl/math/sv_sinf_poly_data.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Data used in single-precision sin(x) function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-/* Polynomial coefficients for approximating sin(x) in single
- precision. These are the non-zero coefficients from the
- degree 9 Taylor series expansion of sin. */
-
-const struct sv_sinf_data __sv_sinf_data = {.coeffs = {
- 0x1.5b2e76p-19f,
- -0x1.9f42eap-13f,
- 0x1.110df4p-7f,
- -0x1.555548p-3f,
- }};
diff --git a/pl/math/sv_sinh_3u.c b/pl/math/sv_sinh_3u.c
new file mode 100644
index 000000000000..a01e19caecda
--- /dev/null
+++ b/pl/math/sv_sinh_3u.c
@@ -0,0 +1,103 @@
+/*
+ * Double-precision SVE sinh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float64_t poly[11];
+ float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift;
+ uint64_t halff;
+ int64_t onef;
+ uint64_t large_bound;
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
+ 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
+ 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
+ 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
+ 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
+
+ .inv_ln2 = 0x1.71547652b82fep0,
+ .m_ln2_hi = -0x1.62e42fefa39efp-1,
+ .m_ln2_lo = -0x1.abc9e3b39803fp-56,
+ .shift = 0x1.8p52,
+
+ .halff = 0x3fe0000000000000,
+ .onef = 0x3ff0000000000000,
+ /* 2^9. expm1 helper overflows for large input. */
+ .large_bound = 0x4080000000000000,
+};
+
+static inline svfloat64_t
+expm1_inline (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Reduce argument:
+ exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
+ where i = round(x / ln2)
+ and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
+ svfloat64_t j
+ = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
+ svint64_t i = svcvt_s64_x (pg, j);
+ svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi);
+ f = svmla_x (pg, f, j, d->m_ln2_lo);
+ /* Approximate expm1(f) using polynomial. */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t f4 = svmul_x (pg, f2, f2);
+ svfloat64_t f8 = svmul_x (pg, f4, f4);
+ svfloat64_t p
+ = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
+ /* t = 2^i. */
+ svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
+ /* expm1(x) ~= p * t + (t - 1). */
+ return svmla_x (pg, svsub_x (pg, t, 1.0), p, t);
+}
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svbool_t pg)
+{
+ return sv_call_f64 (sinh, x, x, pg);
+}
+
+/* Approximation for SVE double-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The greatest observed error is 2.57 ULP:
+ _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2
+ want 0x1.ab929fc64bd63p-2. */
+svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat64_t ax = svabs_x (pg, x);
+ svuint64_t sign
+ = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
+ svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff));
+
+ svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound);
+
+ /* Fall back to scalar variant for all lanes if any are special. */
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, pg);
+
+ /* Up to the point that expm1 overflows, we can use it to calculate sinh
+ using a slight rearrangement of the definition of sinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
+ svfloat64_t t = expm1_inline (ax, pg);
+ t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+ return svmul_x (pg, t, halfsign);
+}
+
+PL_SIG (SV, D, 1, sinh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_D1 (sinh), 2.08)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0, 0x1p-26, 1000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinh), 0x1p9, inf, 1000)
diff --git a/pl/math/sv_sinhf_2u3.c b/pl/math/sv_sinhf_2u3.c
new file mode 100644
index 000000000000..e34ecf378ad3
--- /dev/null
+++ b/pl/math/sv_sinhf_2u3.c
@@ -0,0 +1,64 @@
+/*
+ * Single-precision SVE sinh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#include "sv_expm1f_inline.h"
+
+static const struct data
+{
+ struct sv_expm1f_data expm1f_consts;
+ uint32_t halff, large_bound;
+} data = {
+ .expm1f_consts = SV_EXPM1F_DATA,
+ .halff = 0x3f000000,
+ /* 0x1.61814ep+6, above which expm1f helper overflows. */
+ .large_bound = 0x42b0c0a7,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+{
+ return sv_call_f32 (sinhf, x, y, pg);
+}
+
+/* Approximation for SVE single-precision sinh(x) using expm1.
+ sinh(x) = (exp(x) - exp(-x)) / 2.
+ The maximum error is 2.26 ULP:
+ _ZGVsMxv_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+ want 0x1.e469e4p-4. */
+svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t sign
+ = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax));
+ svfloat32_t halfsign = svreinterpret_f32 (svorr_x (pg, sign, d->halff));
+
+ svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->large_bound);
+
+ /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+ using a slight rearrangement of the definition of sinh. This allows us to
+ retain acceptable accuracy for very small inputs. */
+ svfloat32_t t = expm1f_inline (ax, pg, &d->expm1f_consts);
+ t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+
+ /* Fall back to the scalar variant for any lanes which would cause
+ expm1f to overflow. */
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, svmul_x (pg, t, halfsign), special);
+
+ return svmul_x (pg, t, halfsign);
+}
+
+PL_SIG (SV, F, 1, sinh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_F1 (sinh), 1.76)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0, 0x1.6a09e8p-32, 1000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x1.6a09e8p-32, 0x42b0c0a7, 100000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
diff --git a/pl/math/sv_sinpi_3u1.c b/pl/math/sv_sinpi_3u1.c
new file mode 100644
index 000000000000..c9f23da1b19b
--- /dev/null
+++ b/pl/math/sv_sinpi_3u1.c
@@ -0,0 +1,57 @@
+/*
+ * Double-precision SVE sinpi(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f64.h"
+
+static const struct data
+{
+ double poly[10];
+} data = {
+ /* Polynomial coefficients generated using Remez algorithm,
+ see sinpi.sollya for details. */
+ .poly = { 0x1.921fb54442d184p1, -0x1.4abbce625be53p2, 0x1.466bc6775ab16p1,
+ -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8,
+ 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16,
+ 0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 },
+};
+
+/* A fast SVE implementation of sinpi.
+ Maximum error 3.10 ULP:
+ _ZGVsMxv_sinpi(0x1.df1a14f1b235p-2) got 0x1.fd64f541606cp-1
+ want 0x1.fd64f541606c3p-1. */
+svfloat64_t SV_NAME_D1 (sinpi) (svfloat64_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* range reduction into -1/2 .. 1/2
+ with n = rint(x) and r = x - n. */
+ svfloat64_t n = svrinta_x (pg, x);
+ svfloat64_t r = svsub_x (pg, x, n);
+
+ /* Result should be negated based on if n is odd or not. */
+ svuint64_t intn = svreinterpret_u64 (svcvt_s64_x (pg, n));
+ svuint64_t sign = svlsl_z (pg, intn, 63);
+
+ /* y = sin(r). */
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t r4 = svmul_x (pg, r2, r2);
+ svfloat64_t y = sv_pw_horner_9_f64_x (pg, r2, r4, d->poly);
+ y = svmul_x (pg, y, r);
+
+ return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
+}
+
+PL_SIG (SV, D, 1, sinpi, -0.9, 0.9)
+PL_TEST_ULP (SV_NAME_D1 (sinpi), 2.61)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (sinpi), 0x1p51, inf, 10000)
diff --git a/pl/math/sv_sinpif_2u5.c b/pl/math/sv_sinpif_2u5.c
new file mode 100644
index 000000000000..ac3f924bed68
--- /dev/null
+++ b/pl/math/sv_sinpif_2u5.c
@@ -0,0 +1,53 @@
+/*
+ * Single-precision SVE sinpi(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_sve_f32.h"
+
+static const struct data
+{
+ float poly[6];
+} data = {
+ /* Taylor series coefficients for sin(pi * x). */
+ .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f,
+ 0x1.50783p-4f, -0x1.e30750p-8f },
+};
+
+/* A fast SVE implementation of sinpif.
+ Maximum error 2.48 ULP:
+ _ZGVsMxv_sinpif(0x1.d062b6p-2) got 0x1.fa8c06p-1
+ want 0x1.fa8c02p-1. */
+svfloat32_t SV_NAME_F1 (sinpi) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* range reduction into -1/2 .. 1/2
+ with n = rint(x) and r = x - n. */
+ svfloat32_t n = svrinta_x (pg, x);
+ svfloat32_t r = svsub_x (pg, x, n);
+
+ /* Result should be negated based on if n is odd or not. */
+ svuint32_t intn = svreinterpret_u32 (svcvt_s32_x (pg, n));
+ svuint32_t sign = svlsl_z (pg, intn, 31);
+
+ /* y = sin(r). */
+ svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t y = sv_horner_5_f32_x (pg, r2, d->poly);
+ y = svmul_x (pg, y, r);
+
+ return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
+}
+
+PL_SIG (SV, F, 1, sinpi, -0.9, 0.9)
+PL_TEST_ULP (SV_NAME_F1 (sinpi), 1.99)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0.5, 0x1p22f, 10000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (sinpi), 0x1p22f, inf, 10000)
diff --git a/pl/math/sv_tan_3u5.c b/pl/math/sv_tan_3u5.c
new file mode 100644
index 000000000000..746396e98a10
--- /dev/null
+++ b/pl/math/sv_tan_3u5.c
@@ -0,0 +1,99 @@
+/*
+ * Double-precision SVE tan(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ double poly[9];
+ double half_pi_hi, half_pi_lo, inv_half_pi, range_val, shift;
+} data = {
+ /* Polynomial generated with FPMinimax. */
+ .poly = { 0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5,
+ 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9,
+ 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11,
+ 0x1.4e4fd14147622p-12, },
+ .half_pi_hi = 0x1.921fb54442d18p0,
+ .half_pi_lo = 0x1.1a62633145c07p-54,
+ .inv_half_pi = 0x1.45f306dc9c883p-1,
+ .range_val = 0x1p23,
+ .shift = 0x1.8p52,
+};
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (tan, x, y, special);
+}
+
+/* Vector approximation for double-precision tan.
+ Maximum measured error is 3.48 ULP:
+ _ZGVsMxv_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
+ want -0x1.f6ccd8ecf7deap+37. */
+svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+ /* Invert condition to catch NaNs and Infs as well as large values. */
+ svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val));
+
+ /* q = nearest integer to 2 * x / pi. */
+ svfloat64_t shift = sv_f64 (dat->shift);
+ svfloat64_t q = svmla_x (pg, shift, x, dat->inv_half_pi);
+ q = svsub_x (pg, q, shift);
+ svint64_t qi = svcvt_s64_x (pg, q);
+
+ /* Use q to reduce x to r in [-pi/4, pi/4], by:
+ r = x - q * pi/2, in extended precision. */
+ svfloat64_t r = x;
+ svfloat64_t half_pi = svld1rq (svptrue_b64 (), &dat->half_pi_hi);
+ r = svmls_lane (r, q, half_pi, 0);
+ r = svmls_lane (r, q, half_pi, 1);
+ /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
+ formula. */
+ r = svmul_x (pg, r, 0.5);
+
+ /* Approximate tan(r) using order 8 polynomial.
+ tan(x) is odd, so polynomial has the form:
+ tan(x) ~= x + C0 * x^3 + C1 * x^5 + C2 * x^7 + ...
+ Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
+ Then compute the approximation by:
+ tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t r4 = svmul_x (pg, r2, r2);
+ svfloat64_t r8 = svmul_x (pg, r4, r4);
+ /* Use offset version coeff array by 1 to evaluate from C1 onwards. */
+ svfloat64_t p = sv_estrin_7_f64_x (pg, r2, r4, r8, dat->poly + 1);
+ p = svmad_x (pg, p, r2, dat->poly[0]);
+ p = svmla_x (pg, r, r2, svmul_x (pg, p, r));
+
+ /* Recombination uses double-angle formula:
+ tan(2x) = 2 * tan(x) / (1 - (tan(x))^2)
+ and reciprocity around pi/2:
+ tan(x) = 1 / (tan(pi/2 - x))
+ to assemble result using change-of-sign and conditional selection of
+ numerator/denominator dependent on odd/even-ness of q (hence quadrant). */
+ svbool_t use_recip
+ = svcmpeq (pg, svand_x (pg, svreinterpret_u64 (qi), 1), 0);
+
+ svfloat64_t n = svmad_x (pg, p, p, -1);
+ svfloat64_t d = svmul_x (pg, p, 2);
+ svfloat64_t swap = n;
+ n = svneg_m (n, use_recip, d);
+ d = svsel (use_recip, swap, d);
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, svdiv_x (svnot_z (pg, special), n, d), special);
+ return svdiv_x (pg, n, d);
+}
+
+PL_SIG (SV, D, 1, tan, -3.1, 3.1)
+PL_TEST_ULP (SV_NAME_D1 (tan), 2.99)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0, 0x1p23, 500000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tan), 0x1p23, inf, 5000)
diff --git a/pl/math/sv_tanf_3u5.c b/pl/math/sv_tanf_3u5.c
index cca43bd886fd..6b8cd1e64b44 100644
--- a/pl/math/sv_tanf_3u5.c
+++ b/pl/math/sv_tanf_3u5.c
@@ -9,63 +9,67 @@
#include "pl_sig.h"
#include "pl_test.h"
-#if SV_SUPPORTED
-
-/* Constants. */
-#define NegPio2_1 (sv_f32 (-0x1.921fb6p+0f))
-#define NegPio2_2 (sv_f32 (0x1.777a5cp-25f))
-#define NegPio2_3 (sv_f32 (0x1.ee59dap-50f))
-#define InvPio2 (sv_f32 (0x1.45f306p-1f))
-#define RangeVal (sv_f32 (0x1p15f))
-#define Shift (sv_f32 (0x1.8p+23f))
-
-#define poly(i) sv_f32 (__tanf_poly_data.poly_tan[i])
-
-/* Use full Estrin's scheme to evaluate polynomial. */
-static inline sv_f32_t
-eval_poly (svbool_t pg, sv_f32_t z)
+static const struct data
{
- sv_f32_t z2 = svmul_f32_x (pg, z, z);
- sv_f32_t z4 = svmul_f32_x (pg, z2, z2);
- sv_f32_t y_10 = sv_fma_f32_x (pg, z, poly (1), poly (0));
- sv_f32_t y_32 = sv_fma_f32_x (pg, z, poly (3), poly (2));
- sv_f32_t y_54 = sv_fma_f32_x (pg, z, poly (5), poly (4));
- sv_f32_t y_32_10 = sv_fma_f32_x (pg, z2, y_32, y_10);
- sv_f32_t y = sv_fma_f32_x (pg, z4, y_54, y_32_10);
- return y;
-}
-
-static NOINLINE sv_f32_t
-__sv_tanf_specialcase (sv_f32_t x, sv_f32_t y, svbool_t cmp)
+ float pio2_1, pio2_2, pio2_3, invpio2;
+ float c1, c3, c5;
+ float c0, c2, c4, range_val, shift;
+} data = {
+ /* Coefficients generated using:
+ poly = fpminimax((tan(sqrt(x))-sqrt(x))/x^(3/2),
+ deg,
+ [|single ...|],
+ [a*a;b*b]);
+ optimize relative error
+ final prec : 23 bits
+ deg : 5
+ a : 0x1p-126 ^ 2
+ b : ((pi) / 0x1p2) ^ 2
+ dirty rel error: 0x1.f7c2e4p-25
+ dirty abs error: 0x1.f7c2ecp-25. */
+ .c0 = 0x1.55555p-2, .c1 = 0x1.11166p-3,
+ .c2 = 0x1.b88a78p-5, .c3 = 0x1.7b5756p-6,
+ .c4 = 0x1.4ef4cep-8, .c5 = 0x1.0e1e74p-7,
+
+ .pio2_1 = 0x1.921fb6p+0f, .pio2_2 = -0x1.777a5cp-25f,
+ .pio2_3 = -0x1.ee59dap-50f, .invpio2 = 0x1.45f306p-1f,
+ .range_val = 0x1p15f, .shift = 0x1.8p+23f
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
{
return sv_call_f32 (tanf, x, y, cmp);
}
/* Fast implementation of SVE tanf.
Maximum error is 3.45 ULP:
- __sv_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
- want 0x1.ff9850p-1. */
-sv_f32_t
-__sv_tanf_x (sv_f32_t x, const svbool_t pg)
+ SV_NAME_F1 (tan)(-0x1.e5f0cap+13) got 0x1.ff9856p-1
+ want 0x1.ff9850p-1. */
+svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg)
{
+ const struct data *d = ptr_barrier (&data);
+
/* Determine whether input is too large to perform fast regression. */
- svbool_t cmp = svacge_f32 (pg, x, RangeVal);
- svbool_t pred_minuszero = svcmpeq_f32 (pg, x, sv_f32 (-0.0));
+ svbool_t cmp = svacge (pg, x, d->range_val);
+
+ svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1);
+ svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1);
/* n = rint(x/(pi/2)). */
- sv_f32_t q = sv_fma_f32_x (pg, InvPio2, x, Shift);
- sv_f32_t n = svsub_f32_x (pg, q, Shift);
+ svfloat32_t q = svmla_lane (sv_f32 (d->shift), x, pi_vals, 3);
+ svfloat32_t n = svsub_x (pg, q, d->shift);
/* n is already a signed integer, simply convert it. */
- sv_s32_t in = sv_to_s32_f32_x (pg, n);
+ svint32_t in = svcvt_s32_x (pg, n);
/* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
- sv_s32_t alt = svand_s32_x (pg, in, sv_s32 (1));
- svbool_t pred_alt = svcmpne_s32 (pg, alt, sv_s32 (0));
+ svint32_t alt = svand_x (pg, in, 1);
+ svbool_t pred_alt = svcmpne (pg, alt, 0);
/* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */
- sv_f32_t r;
- r = sv_fma_f32_x (pg, NegPio2_1, n, x);
- r = sv_fma_f32_x (pg, NegPio2_2, n, r);
- r = sv_fma_f32_x (pg, NegPio2_3, n, r);
+ svfloat32_t r;
+ r = svmls_lane (x, n, pi_vals, 0);
+ r = svmls_lane (r, n, pi_vals, 1);
+ r = svmls_lane (r, n, pi_vals, 2);
/* If x lives in an interval, where |tan(x)|
- is finite, then use a polynomial approximation of the form
@@ -75,38 +79,41 @@ __sv_tanf_x (sv_f32_t x, const svbool_t pg)
the same polynomial approximation of tan as above. */
/* Perform additional reduction if required. */
- sv_f32_t z = svneg_f32_m (r, pred_alt, r);
+ svfloat32_t z = svneg_m (r, pred_alt, r);
- /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */
- sv_f32_t z2 = svmul_f32_x (pg, z, z);
- sv_f32_t p = eval_poly (pg, z2);
- sv_f32_t y = sv_fma_f32_x (pg, svmul_f32_x (pg, z, z2), p, z);
+ /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4],
+ using Estrin on z^2. */
+ svfloat32_t z2 = svmul_x (pg, z, z);
+ svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0);
+ svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1);
+ svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2);
- /* Transform result back, if necessary. */
- sv_f32_t inv_y = svdiv_f32_x (pg, sv_f32 (1.0f), y);
- y = svsel_f32 (pred_alt, inv_y, y);
+ svfloat32_t z4 = svmul_x (pg, z2, z2);
+ svfloat32_t p = svmla_x (pg, p01, z4, p23);
+
+ svfloat32_t z8 = svmul_x (pg, z4, z4);
+ p = svmla_x (pg, p, z8, p45);
+
+ svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2));
- /* Fast reduction does not handle the x = -0.0 case well,
- therefore it is fixed here. */
- y = svsel_f32 (pred_minuszero, x, y);
+ /* Transform result back, if necessary. */
+ svfloat32_t inv_y = svdivr_x (pg, y, 1.0f);
/* No need to pass pg to specialcase here since cmp is a strict subset,
guaranteed by the cmpge above. */
if (unlikely (svptest_any (pg, cmp)))
- return __sv_tanf_specialcase (x, y, cmp);
- return y;
-}
+ return special_case (x, svsel (pred_alt, inv_y, y), cmp);
-PL_ALIAS (__sv_tanf_x, _ZGVsMxv_tanf)
+ return svsel (pred_alt, inv_y, y);
+}
PL_SIG (SV, F, 1, tan, -3.1, 3.1)
-PL_TEST_ULP (__sv_tanf, 2.96)
-PL_TEST_INTERVAL (__sv_tanf, -0.0, -0x1p126, 100)
-PL_TEST_INTERVAL (__sv_tanf, 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (__sv_tanf, 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (__sv_tanf, 0x1p-23, 0.7, 50000)
-PL_TEST_INTERVAL (__sv_tanf, 0.7, 1.5, 50000)
-PL_TEST_INTERVAL (__sv_tanf, 1.5, 100, 50000)
-PL_TEST_INTERVAL (__sv_tanf, 100, 0x1p17, 50000)
-PL_TEST_INTERVAL (__sv_tanf, 0x1p17, inf, 50000)
-#endif
+PL_TEST_ULP (SV_NAME_F1 (tan), 2.96)
+PL_TEST_INTERVAL (SV_NAME_F1 (tan), -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p-23, 0.7, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0.7, 1.5, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (tan), 1.5, 100, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (tan), 100, 0x1p17, 50000)
+PL_TEST_INTERVAL (SV_NAME_F1 (tan), 0x1p17, inf, 50000)
diff --git a/pl/math/sv_tanh_3u.c b/pl/math/sv_tanh_3u.c
new file mode 100644
index 000000000000..f54139f1ddbc
--- /dev/null
+++ b/pl/math/sv_tanh_3u.c
@@ -0,0 +1,96 @@
+/*
+ * Double-precision SVE tanh(x) function.
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "poly_sve_f64.h"
+#include "mathlib.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float64_t poly[11];
+ float64_t inv_ln2, ln2_hi, ln2_lo, shift;
+ uint64_t thresh, tiny_bound;
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
+ 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
+ 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
+ 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
+ 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
+
+ .inv_ln2 = 0x1.71547652b82fep0,
+ .ln2_hi = -0x1.62e42fefa39efp-1,
+ .ln2_lo = -0x1.abc9e3b39803fp-56,
+ .shift = 0x1.8p52,
+
+ .tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */
+ /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
+ .thresh = 0x01f241bf835f9d5f,
+};
+
+static inline svfloat64_t
+expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+{
+ /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
+ the scalar variant of tanh. */
+
+ /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
+ svfloat64_t j
+ = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
+ svint64_t i = svcvt_s64_x (pg, j);
+ svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi);
+ f = svmla_x (pg, f, j, d->ln2_lo);
+
+ /* Approximate expm1(f) using polynomial. */
+ svfloat64_t f2 = svmul_x (pg, f, f);
+ svfloat64_t f4 = svmul_x (pg, f2, f2);
+ svfloat64_t p = svmla_x (
+ pg, f, f2,
+ sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly));
+
+ /* t = 2 ^ i. */
+ svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
+ /* expm1(x) = p * t + (t - 1). */
+ return svmla_x (pg, svsub_x (pg, t, 1), p, t);
+}
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+{
+ return sv_call_f64 (tanh, x, y, special);
+}
+
+/* SVE approximation for double-precision tanh(x), using a simplified
+ version of expm1. The greatest observed error is 2.77 ULP:
+ _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
+ want -0x1.bd6a21a163624p-3. */
+svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x));
+
+ /* Trigger special-cases for tiny, boring and infinity/NaN. */
+ svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh);
+
+ svfloat64_t u = svadd_x (pg, x, x);
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ svfloat64_t q = expm1_inline (u, pg, d);
+ svfloat64_t qp2 = svadd_x (pg, q, 2);
+
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, svdiv_x (pg, q, qp2), special);
+ return svdiv_x (pg, q, qp2);
+}
+
+PL_SIG (SV, D, 1, tanh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_D1 (tanh), 2.27)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0, 0x1p-27, 5000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000)
+PL_TEST_SYM_INTERVAL (SV_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000)
diff --git a/pl/math/sv_tanhf_2u6.c b/pl/math/sv_tanhf_2u6.c
new file mode 100644
index 000000000000..988a56de0b2e
--- /dev/null
+++ b/pl/math/sv_tanhf_2u6.c
@@ -0,0 +1,59 @@
+/*
+ * Single-precision SVE tanh(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "sv_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#include "sv_expm1f_inline.h"
+
+static const struct data
+{
+ struct sv_expm1f_data expm1f_consts;
+ uint32_t boring_bound, onef;
+} data = {
+ .expm1f_consts = SV_EXPM1F_DATA,
+ /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
+ .boring_bound = 0x41102cb3,
+ .onef = 0x3f800000,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+ return sv_call_f32 (tanhf, x, y, special);
+}
+
+/* Approximation for single-precision SVE tanh(x), using a simplified
+ version of expm1f. The maximum error is 2.57 ULP:
+ _ZGVsMxv_tanhf (0x1.fc1832p-5) got 0x1.fb71a4p-5
+ want 0x1.fb71aap-5. */
+svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ svfloat32_t ax = svabs_x (pg, x);
+ svuint32_t iax = svreinterpret_u32 (ax);
+ svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax);
+ svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound);
+ svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef));
+
+ svbool_t special = svcmpgt (pg, iax, 0x7f800000);
+
+ /* tanh(x) = (e^2x - 1) / (e^2x + 1). */
+ svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts);
+ svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0));
+ if (unlikely (svptest_any (pg, special)))
+ return special_case (x, svsel_f32 (is_boring, boring, y), special);
+ return svsel_f32 (is_boring, boring, y);
+}
+
+PL_SIG (SV, F, 1, tanh, -10.0, 10.0)
+PL_TEST_ULP (SV_NAME_F1 (tanh), 2.07)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0, 0x1p-23, 1000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
+PL_TEST_SYM_INTERVAL (SV_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
diff --git a/pl/math/tanf_3u3.c b/pl/math/tanf_3u3.c
index ec006dc04c4c..30c86fa89730 100644
--- a/pl/math/tanf_3u3.c
+++ b/pl/math/tanf_3u3.c
@@ -7,7 +7,7 @@
#include "math_config.h"
#include "pl_sig.h"
#include "pl_test.h"
-#include "pairwise_hornerf.h"
+#include "poly_scalar_f32.h"
/* Useful constants. */
#define NegPio2_1 (-0x1.921fb6p+0f)
@@ -22,19 +22,16 @@
/* 2PI * 2^-64. */
#define Pio2p63 (0x1.921FB54442D18p-62)
-#define P(i) __tanf_poly_data.poly_tan[i]
-#define Q(i) __tanf_poly_data.poly_cotan[i]
-
static inline float
eval_P (float z)
{
- return PAIRWISE_HORNER_5 (z, z * z, P);
+ return pw_horner_5_f32 (z, z * z, __tanf_poly_data.poly_tan);
}
static inline float
eval_Q (float z)
{
- return PAIRWISE_HORNER_3 (z, z * z, Q);
+ return pairwise_poly_3_f32 (z, z * z, __tanf_poly_data.poly_cotan);
}
/* Reduction of the input argument x using Cody-Waite approach, such that x = r
@@ -188,15 +185,9 @@ tanf (float x)
PL_SIG (S, F, 1, tan, -3.1, 3.1)
PL_TEST_ULP (tanf, 2.80)
PL_TEST_INTERVAL (tanf, 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000)
-PL_TEST_INTERVAL (tanf, -0x1p-127, -0x1p-14, 50000)
-PL_TEST_INTERVAL (tanf, 0x1p-14, 0.7, 50000)
-PL_TEST_INTERVAL (tanf, -0x1p-14, -0.7, 50000)
-PL_TEST_INTERVAL (tanf, 0.7, 1.5, 50000)
-PL_TEST_INTERVAL (tanf, -0.7, -1.5, 50000)
-PL_TEST_INTERVAL (tanf, 1.5, 0x1p17, 50000)
-PL_TEST_INTERVAL (tanf, -1.5, -0x1p17, 50000)
-PL_TEST_INTERVAL (tanf, 0x1p17, 0x1p54, 50000)
-PL_TEST_INTERVAL (tanf, -0x1p17, -0x1p54, 50000)
-PL_TEST_INTERVAL (tanf, 0x1p54, inf, 50000)
-PL_TEST_INTERVAL (tanf, -0x1p54, -inf, 50000)
+PL_TEST_SYM_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000)
+PL_TEST_SYM_INTERVAL (tanf, 0x1p-14, 0.7, 50000)
+PL_TEST_SYM_INTERVAL (tanf, 0.7, 1.5, 50000)
+PL_TEST_SYM_INTERVAL (tanf, 1.5, 0x1p17, 50000)
+PL_TEST_SYM_INTERVAL (tanf, 0x1p17, 0x1p54, 50000)
+PL_TEST_SYM_INTERVAL (tanf, 0x1p54, inf, 50000)
diff --git a/pl/math/tanh_3u.c b/pl/math/tanh_3u.c
index 46d9fb3fd7e1..86f2904afc32 100644
--- a/pl/math/tanh_3u.c
+++ b/pl/math/tanh_3u.c
@@ -5,7 +5,7 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
#include "math_config.h"
-#include "estrin.h"
+#include "poly_scalar_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
@@ -14,7 +14,6 @@
#define Ln2hi 0x1.62e42fefa39efp-1
#define Ln2lo 0x1.abc9e3b39803fp-56
#define Shift 0x1.8p52
-#define C(i) __expm1_poly[i]
#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */
#define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). */
@@ -38,7 +37,7 @@ expm1_inline (double x)
/* Approximate expm1(f) using polynomial. */
double f2 = f * f;
double f4 = f2 * f2;
- double p = fma (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f);
+ double p = fma (f2, estrin_10_f64 (f, f2, f4, f4 * f4, __expm1_poly), f);
/* t = 2 ^ i. */
double t = asdouble ((uint64_t) (i + 1023) << 52);
@@ -47,9 +46,9 @@ expm1_inline (double x)
}
/* Approximation for double-precision tanh(x), using a simplified version of
- expm1. The greatest observed error is 2.75 ULP:
- tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3
- want -0x1.ba31ba4691ab4p-3. */
+ expm1. The greatest observed error is 2.77 ULP:
+ tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
+ want -0x1.bd6a21a163624p-3. */
double
tanh (double x)
{
@@ -73,10 +72,7 @@ tanh (double x)
}
PL_SIG (S, D, 1, tanh, -10.0, 10.0)
-PL_TEST_ULP (tanh, 2.26)
-PL_TEST_INTERVAL (tanh, 0, TinyBound, 1000)
-PL_TEST_INTERVAL (tanh, -0, -TinyBound, 1000)
-PL_TEST_INTERVAL (tanh, TinyBound, BoringBound, 100000)
-PL_TEST_INTERVAL (tanh, -TinyBound, -BoringBound, 100000)
-PL_TEST_INTERVAL (tanh, BoringBound, inf, 1000)
-PL_TEST_INTERVAL (tanh, -BoringBound, -inf, 1000)
+PL_TEST_ULP (tanh, 2.27)
+PL_TEST_SYM_INTERVAL (tanh, 0, TinyBound, 1000)
+PL_TEST_SYM_INTERVAL (tanh, TinyBound, BoringBound, 100000)
+PL_TEST_SYM_INTERVAL (tanh, BoringBound, inf, 1000)
diff --git a/pl/math/tanhf_2u6.c b/pl/math/tanhf_2u6.c
index 76e54a438e57..93ea3cf5d865 100644
--- a/pl/math/tanhf_2u6.c
+++ b/pl/math/tanhf_2u6.c
@@ -83,9 +83,6 @@ tanhf (float x)
PL_SIG (S, F, 1, tanh, -10.0, 10.0)
PL_TEST_ULP (tanhf, 2.09)
-PL_TEST_INTERVAL (tanhf, 0, 0x1p-23, 1000)
-PL_TEST_INTERVAL (tanhf, -0, -0x1p-23, 1000)
-PL_TEST_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000)
-PL_TEST_INTERVAL (tanhf, -0x1p-23, -0x1.205966p+3, 100000)
-PL_TEST_INTERVAL (tanhf, 0x1.205966p+3, inf, 100)
-PL_TEST_INTERVAL (tanhf, -0x1.205966p+3, -inf, 100)
+PL_TEST_SYM_INTERVAL (tanhf, 0, 0x1p-23, 1000)
+PL_TEST_SYM_INTERVAL (tanhf, 0x1p-23, 0x1.205966p+3, 100000)
+PL_TEST_SYM_INTERVAL (tanhf, 0x1.205966p+3, inf, 100)
diff --git a/pl/math/test/mathbench_funcs.h b/pl/math/test/mathbench_funcs.h
index e0f6ac70912c..f2710a979d40 100644
--- a/pl/math/test/mathbench_funcs.h
+++ b/pl/math/test/mathbench_funcs.h
@@ -9,20 +9,10 @@
#define _ZSF1(fun, a, b) F(fun##f, a, b)
#define _ZSD1(f, a, b) D(f, a, b)
-#ifdef __vpcs
+#if defined(__vpcs) && __aarch64__
-#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b) VNF(__vn_##fun##f, a, b) VNF(_ZGVnN4v_##fun##f, a, b)
-#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b) VND(__vn_##f, a, b) VND(_ZGVnN2v_##f, a, b)
-
-#elif __aarch64__
-
-#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b) VF(__v_##fun##f, a, b)
-#define _ZVD1(f, a, b) D(__s_##f, a, b) VD(__v_##f, a, b)
-
-#elif WANT_VMATH
-
-#define _ZVF1(fun, a, b) F(__s_##fun##f, a, b)
-#define _ZVD1(f, a, b) D(__s_##f, a, b)
+#define _ZVF1(fun, a, b) VNF(_ZGVnN4v_##fun##f, a, b)
+#define _ZVD1(f, a, b) VND(_ZGVnN2v_##f, a, b)
#else
@@ -33,8 +23,8 @@
#if WANT_SVE_MATH
-#define _ZSVF1(fun, a, b) SVF(__sv_##fun##f_x, a, b) SVF(_ZGVsMxv_##fun##f, a, b)
-#define _ZSVD1(f, a, b) SVD(__sv_##f##_x, a, b) SVD(_ZGVsMxv_##f, a, b)
+#define _ZSVF1(fun, a, b) SVF(_ZGVsMxv_##fun##f, a, b)
+#define _ZSVD1(f, a, b) SVD(_ZGVsMxv_##f, a, b)
#else
@@ -64,23 +54,34 @@
{"atan2", 'd', 0, -10.0, 10.0, {.d = atan2_wrap}},
{"powi", 'd', 0, 0.01, 11.1, {.d = powi_wrap}},
-{"__s_atan2f", 'f', 0, -10.0, 10.0, {.f = __s_atan2f_wrap}},
-{"__s_atan2", 'd', 0, -10.0, 10.0, {.d = __s_atan2_wrap}},
-{"__v_atan2f", 'f', 'v', -10.0, 10.0, {.vf = __v_atan2f_wrap}},
-{"__v_atan2", 'd', 'v', -10.0, 10.0, {.vd = __v_atan2_wrap}},
-{"__vn_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = __vn_atan2f_wrap}},
{"_ZGVnN4vv_atan2f", 'f', 'n', -10.0, 10.0, {.vnf = _Z_atan2f_wrap}},
-{"__vn_atan2", 'd', 'n', -10.0, 10.0, {.vnd = __vn_atan2_wrap}},
{"_ZGVnN2vv_atan2", 'd', 'n', -10.0, 10.0, {.vnd = _Z_atan2_wrap}},
+{"_ZGVnN4vv_hypotf", 'f', 'n', -10.0, 10.0, {.vnf = _Z_hypotf_wrap}},
+{"_ZGVnN2vv_hypot", 'd', 'n', -10.0, 10.0, {.vnd = _Z_hypot_wrap}},
+{"_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = xy_Z_pow}},
+{"x_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = x_Z_pow}},
+{"y_ZGVnN2vv_pow", 'd', 'n', -10.0, 10.0, {.vnd = y_Z_pow}},
+{"_ZGVnN4vl4l4_sincosf", 'f', 'n', -3.1, 3.1, {.vnf = _Z_sincosf_wrap}},
+{"_ZGVnN2vl8l8_sincos", 'd', 'n', -3.1, 3.1, {.vnd = _Z_sincos_wrap}},
+{"_ZGVnN4v_cexpif", 'f', 'n', -3.1, 3.1, {.vnf = _Z_cexpif_wrap}},
+{"_ZGVnN2v_cexpi", 'd', 'n', -3.1, 3.1, {.vnd = _Z_cexpi_wrap}},
#if WANT_SVE_MATH
-{"__sv_atan2f_x", 'f', 's', -10.0, 10.0, {.svf = __sv_atan2f_wrap}},
{"_ZGVsMxvv_atan2f", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_atan2f_wrap}},
-{"__sv_atan2_x", 'd', 's', -10.0, 10.0, {.svd = __sv_atan2_wrap}},
-{"_ZGVsM2vv_atan2", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}},
-{"__sv_powif_x", 'f', 's', -10.0, 10.0, {.svf = __sv_powif_wrap}},
+{"_ZGVsMxvv_atan2", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_atan2_wrap}},
+{"_ZGVsMxvv_hypotf", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_hypotf_wrap}},
+{"_ZGVsMxvv_hypot", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_hypot_wrap}},
{"_ZGVsMxvv_powi", 'f', 's', -10.0, 10.0, {.svf = _Z_sv_powi_wrap}},
-{"__sv_powi_x", 'd', 's', -10.0, 10.0, {.svd = __sv_powi_wrap}},
{"_ZGVsMxvv_powk", 'd', 's', -10.0, 10.0, {.svd = _Z_sv_powk_wrap}},
+{"_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = xy_Z_sv_powf}},
+{"x_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = x_Z_sv_powf}},
+{"y_ZGVsMxvv_powf", 'f', 's', -10.0, 10.0, {.svf = y_Z_sv_powf}},
+{"_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = xy_Z_sv_pow}},
+{"x_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = x_Z_sv_pow}},
+{"y_ZGVsMxvv_pow", 'd', 's', -10.0, 10.0, {.svd = y_Z_sv_pow}},
+{"_ZGVsMxvl4l4_sincosf", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_sincosf_wrap}},
+{"_ZGVsMxvl8l8_sincos", 'd', 's', -3.1, 3.1, {.svd = _Z_sv_sincos_wrap}},
+{"_ZGVsMxv_cexpif", 'f', 's', -3.1, 3.1, {.svf = _Z_sv_cexpif_wrap}},
+{"_ZGVsMxv_cexpi", 'd', 's', -3.1, 3.1, {.svd = _Z_sv_cexpi_wrap}},
#endif
- // clang-format on
+ // clang-format on
diff --git a/pl/math/test/mathbench_wrappers.h b/pl/math/test/mathbench_wrappers.h
index eba960eb96ac..fe7f8963cdee 100644
--- a/pl/math/test/mathbench_wrappers.h
+++ b/pl/math/test/mathbench_wrappers.h
@@ -23,87 +23,106 @@ powi_wrap (double x)
return __builtin_powi (x, (int) round (x));
}
-#if WANT_VMATH
-#if __aarch64__
+#if __aarch64__ && defined(__vpcs)
-static double
-__s_atan2_wrap (double x)
+__vpcs static v_double
+_Z_atan2_wrap (v_double x)
{
- return __s_atan2 (5.0, x);
+ return _ZGVnN2vv_atan2 (v_double_dup (5.0), x);
}
-static float
-__s_atan2f_wrap (float x)
+__vpcs static v_float
+_Z_atan2f_wrap (v_float x)
{
- return __s_atan2f (5.0f, x);
+ return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x);
}
-static v_double
-__v_atan2_wrap (v_double x)
+__vpcs static v_float
+_Z_hypotf_wrap (v_float x)
{
- return __v_atan2 (v_double_dup (5.0), x);
+ return _ZGVnN4vv_hypotf (v_float_dup (5.0f), x);
}
-static v_float
-__v_atan2f_wrap (v_float x)
+__vpcs static v_double
+_Z_hypot_wrap (v_double x)
{
- return __v_atan2f (v_float_dup (5.0f), x);
+ return _ZGVnN2vv_hypot (v_double_dup (5.0), x);
}
-#ifdef __vpcs
-
__vpcs static v_double
-__vn_atan2_wrap (v_double x)
+xy_Z_pow (v_double x)
{
- return __vn_atan2 (v_double_dup (5.0), x);
+ return _ZGVnN2vv_pow (x, x);
}
-__vpcs static v_float
-__vn_atan2f_wrap (v_float x)
+__vpcs static v_double
+x_Z_pow (v_double x)
{
- return __vn_atan2f (v_float_dup (5.0f), x);
+ return _ZGVnN2vv_pow (x, v_double_dup (23.4));
}
__vpcs static v_double
-_Z_atan2_wrap (v_double x)
+y_Z_pow (v_double x)
{
- return _ZGVnN2vv_atan2 (v_double_dup (5.0), x);
+ return _ZGVnN2vv_pow (v_double_dup (2.34), x);
}
__vpcs static v_float
-_Z_atan2f_wrap (v_float x)
+_Z_sincosf_wrap (v_float x)
{
- return _ZGVnN4vv_atan2f (v_float_dup (5.0f), x);
+ v_float s, c;
+ _ZGVnN4vl4l4_sincosf (x, &s, &c);
+ return s + c;
}
-#endif // __vpcs
-#endif // __arch64__
-#endif // WANT_VMATH
+__vpcs static v_float
+_Z_cexpif_wrap (v_float x)
+{
+ __f32x4x2_t sc = _ZGVnN4v_cexpif (x);
+ return sc.val[0] + sc.val[1];
+}
-#if WANT_SVE_MATH
+__vpcs static v_double
+_Z_sincos_wrap (v_double x)
+{
+ v_double s, c;
+ _ZGVnN2vl8l8_sincos (x, &s, &c);
+ return s + c;
+}
-static sv_float
-__sv_atan2f_wrap (sv_float x, sv_bool pg)
+__vpcs static v_double
+_Z_cexpi_wrap (v_double x)
{
- return __sv_atan2f_x (x, svdup_n_f32 (5.0f), pg);
+ __f64x2x2_t sc = _ZGVnN2v_cexpi (x);
+ return sc.val[0] + sc.val[1];
}
+#endif // __aarch64__ && __vpcs
+
+#if WANT_SVE_MATH
+
static sv_float
_Z_sv_atan2f_wrap (sv_float x, sv_bool pg)
{
- return _ZGVsMxvv_atan2f (x, svdup_n_f32 (5.0f), pg);
+ return _ZGVsMxvv_atan2f (x, svdup_f32 (5.0f), pg);
}
static sv_double
-__sv_atan2_wrap (sv_double x, sv_bool pg)
+_Z_sv_atan2_wrap (sv_double x, sv_bool pg)
{
- return __sv_atan2_x (x, svdup_n_f64 (5.0), pg);
+ return _ZGVsMxvv_atan2 (x, svdup_f64 (5.0), pg);
+}
+
+static sv_float
+_Z_sv_hypotf_wrap (sv_float x, sv_bool pg)
+{
+  return _ZGVsMxvv_hypotf (x, svdup_f32 (5.0f), pg); /* Second operand is a fixed splat; only x varies.  */
 }
static sv_double
-_Z_sv_atan2_wrap (sv_double x, sv_bool pg)
+_Z_sv_hypot_wrap (sv_double x, sv_bool pg)
{
- return _ZGVsMxvv_atan2 (x, svdup_n_f64 (5.0), pg);
+ return _ZGVsMxvv_hypot (x, svdup_f64 (5.0), pg);
}
static sv_float
@@ -112,22 +131,76 @@ _Z_sv_powi_wrap (sv_float x, sv_bool pg)
return _ZGVsMxvv_powi (x, svcvt_s32_f32_x (pg, x), pg);
}
+static sv_double
+_Z_sv_powk_wrap (sv_double x, sv_bool pg)
+{
+ return _ZGVsMxvv_powk (x, svcvt_s64_f64_x (pg, x), pg);
+}
+
+static sv_float
+xy_Z_sv_powf (sv_float x, sv_bool pg)
+{
+ return _ZGVsMxvv_powf (x, x, pg);
+}
+
static sv_float
-__sv_powif_wrap (sv_float x, sv_bool pg)
+x_Z_sv_powf (sv_float x, sv_bool pg)
{
- return __sv_powif_x (x, svcvt_s32_f32_x (pg, x), pg);
+ return _ZGVsMxvv_powf (x, svdup_f32 (23.4f), pg);
+}
+
+static sv_float
+y_Z_sv_powf (sv_float x, sv_bool pg)
+{
+ return _ZGVsMxvv_powf (svdup_f32 (2.34f), x, pg);
}
static sv_double
-_Z_sv_powk_wrap (sv_double x, sv_bool pg)
+xy_Z_sv_pow (sv_double x, sv_bool pg)
{
- return _ZGVsMxvv_powk (x, svcvt_s64_f64_x (pg, x), pg);
+ return _ZGVsMxvv_pow (x, x, pg);
+}
+
+static sv_double
+x_Z_sv_pow (sv_double x, sv_bool pg)
+{
+ return _ZGVsMxvv_pow (x, svdup_f64 (23.4), pg);
+}
+
+static sv_double
+y_Z_sv_pow (sv_double x, sv_bool pg)
+{
+ return _ZGVsMxvv_pow (svdup_f64 (2.34), x, pg);
+}
+
+static sv_float
+_Z_sv_sincosf_wrap (sv_float x, sv_bool pg)
+{
+  float s[svcntw ()], c[svcntw ()];
+  _ZGVsMxvl4l4_sincosf (x, s, c, pg);
+  return svadd_x (pg, svld1 (pg, s), svld1 (pg, c)); /* Sum sin and cos so both output buffers are consumed.  */
+}
+
+static sv_float
+_Z_sv_cexpif_wrap (sv_float x, sv_bool pg)
+{
+ svfloat32x2_t sc = _ZGVsMxv_cexpif (x, pg);
+ return svadd_x (pg, svget2 (sc, 0), svget2 (sc, 1));
+}
+
+static sv_double
+_Z_sv_sincos_wrap (sv_double x, sv_bool pg)
+{
+  double s[svcntd ()], c[svcntd ()];
+  _ZGVsMxvl8l8_sincos (x, s, c, pg);
+  return svadd_x (pg, svld1 (pg, s), svld1 (pg, c)); /* Sum sin and cos so both output buffers are consumed.  */
 }
static sv_double
-__sv_powi_wrap (sv_double x, sv_bool pg)
+_Z_sv_cexpi_wrap (sv_double x, sv_bool pg)
{
- return __sv_powi_x (x, svcvt_s64_f64_x (pg, x), pg);
+ svfloat64x2_t sc = _ZGVsMxv_cexpi (x, pg);
+ return svadd_x (pg, svget2 (sc, 0), svget2 (sc, 1));
}
#endif // WANT_SVE_MATH
diff --git a/pl/math/test/pl_test.h b/pl/math/test/pl_test.h
index 467d1cac0c36..e7ed4eed634e 100644
--- a/pl/math/test/pl_test.h
+++ b/pl/math/test/pl_test.h
@@ -8,18 +8,14 @@
/* Emit the max ULP threshold, l, for routine f. Piggy-back PL_TEST_EXPECT_FENV
on PL_TEST_ULP to add EXPECT_FENV to all scalar routines. */
-#if !(V_SUPPORTED || SV_SUPPORTED)
-#define PL_TEST_ULP(f, l) \
- PL_TEST_EXPECT_FENV_ALWAYS (f) \
- PL_TEST_ULP f l
+#if WANT_VMATH || defined(IGNORE_SCALAR_FENV)
+# define PL_TEST_ULP(f, l) PL_TEST_ULP f l
#else
-#define PL_TEST_ULP(f, l) PL_TEST_ULP f l
+# define PL_TEST_ULP(f, l) \
+ PL_TEST_EXPECT_FENV_ALWAYS (f) \
+ PL_TEST_ULP f l
#endif
-/* Emit aliases to allow test params to be mapped from aliases back to their
- aliasees. */
-#define PL_ALIAS(a, b) PL_TEST_ALIAS a b
-
/* Emit routine name if e == 1 and f is expected to correctly trigger fenv
exceptions. e allows declaration to be emitted conditionally upon certain
build flags - defer expansion by one pass to allow those flags to be expanded
@@ -30,4 +26,14 @@
#define PL_TEST_EXPECT_FENV_ALWAYS(f) PL_TEST_EXPECT_FENV (f, 1)
#define PL_TEST_INTERVAL(f, lo, hi, n) PL_TEST_INTERVAL f lo hi n
+#define PL_TEST_SYM_INTERVAL(f, lo, hi, n) \
+ PL_TEST_INTERVAL (f, lo, hi, n) \
+ PL_TEST_INTERVAL (f, -lo, -hi, n)
#define PL_TEST_INTERVAL_C(f, lo, hi, n, c) PL_TEST_INTERVAL f lo hi n c
+#define PL_TEST_SYM_INTERVAL_C(f, lo, hi, n, c) \
+ PL_TEST_INTERVAL_C (f, lo, hi, n, c) \
+ PL_TEST_INTERVAL_C (f, -lo, -hi, n, c)
+// clang-format off
+#define PL_TEST_INTERVAL2(f, xlo, xhi, ylo, yhi, n) \
+ PL_TEST_INTERVAL f xlo,ylo xhi,yhi n
+// clang-format on
diff --git a/pl/math/test/runulp.sh b/pl/math/test/runulp.sh
index 4d02530d44b1..0f5a41f76b25 100755
--- a/pl/math/test/runulp.sh
+++ b/pl/math/test/runulp.sh
@@ -21,55 +21,55 @@ FAIL=0
PASS=0
t() {
- key=$(cat $ALIASES | { grep " $1$" || echo $1; } | awk '{print $1}')
- L=$(cat $LIMITS | grep "^$key " | awk '{print $2}')
+ routine=$1
+ L=$(cat $LIMITS | grep "^$routine " | awk '{print $2}')
[[ $L =~ ^[0-9]+\.[0-9]+$ ]]
- extra_flags=""
+ extra_flags=
[[ -z "${5:-}" ]] || extra_flags="$extra_flags -c $5"
- grep -q "^$key$" $FENV || extra_flags="$extra_flags -f"
- $emu ./ulp -e $L $flags ${extra_flags} $1 $2 $3 $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1))
+ grep -q "^$routine$" $FENV || extra_flags="$extra_flags -f"
+ IFS=',' read -ra LO <<< "$2"
+ IFS=',' read -ra HI <<< "$3"
+ ITV="${LO[0]} ${HI[0]}"
+ for i in "${!LO[@]}"; do
+ [[ "$i" -eq "0" ]] || ITV="$ITV x ${LO[$i]} ${HI[$i]}"
+ done
+ # Add -z flag to ignore zero sign for vector routines
+ { echo $routine | grep -q "ZGV"; } && extra_flags="$extra_flags -z"
+ $emu ./ulp -e $L $flags ${extra_flags} $routine $ITV $4 && PASS=$((PASS+1)) || FAIL=$((FAIL+1))
}
check() {
$emu ./ulp -f -q "$@" #>/dev/null
}
-# Regression-test for correct NaN handling in atan2
-check atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000
-check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan
-check atan2 nan nan x -nan -nan
+if [ "$FUNC" == "atan2" ] || [ -z "$FUNC" ]; then
+ # Regression-test for correct NaN handling in atan2
+ check atan2 0x1p-1022 0x1p-1000 x 0 0x1p-1022 40000
+ check atan2 0x1.7887a0a717aefp+1017 0x1.7887a0a717aefp+1017 x -nan -nan
+ check atan2 nan nan x -nan -nan
+fi
# vector functions
flags="${ULPFLAGS:--q}"
-runs=
-check __s_log10f 1 && runs=1
-runv=
-check __v_log10f 1 && runv=1
-runvn=
-check __vn_log10f 1 && runvn=1
runsv=
if [ $WANT_SVE_MATH -eq 1 ]; then
-check __sv_cosf 0 && runsv=1
-check __sv_cos 0 && runsv=1
-check __sv_sinf 0 && runsv=1
-check __sv_sin 0 && runsv=1
# No guarantees about powi accuracy, so regression-test for exactness
# w.r.t. the custom reference impl in ulp_wrappers.h
-check -q -f -e 0 __sv_powif 0 inf x 0 1000 100000 && runsv=1
-check -q -f -e 0 __sv_powif -0 -inf x 0 1000 100000 && runsv=1
-check -q -f -e 0 __sv_powif 0 inf x -0 -1000 100000 && runsv=1
-check -q -f -e 0 __sv_powif -0 -inf x -0 -1000 100000 && runsv=1
-check -q -f -e 0 __sv_powi 0 inf x 0 1000 100000 && runsv=1
-check -q -f -e 0 __sv_powi -0 -inf x 0 1000 100000 && runsv=1
-check -q -f -e 0 __sv_powi 0 inf x -0 -1000 100000 && runsv=1
-check -q -f -e 0 __sv_powi -0 -inf x -0 -1000 100000 && runsv=1
+check -q -f -e 0 _ZGVsMxvv_powi 0 inf x 0 1000 100000 && runsv=1
+check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x 0 1000 100000 && runsv=1
+check -q -f -e 0 _ZGVsMxvv_powi 0 inf x -0 -1000 100000 && runsv=1
+check -q -f -e 0 _ZGVsMxvv_powi -0 -inf x -0 -1000 100000 && runsv=1
+check -q -f -e 0 _ZGVsMxvv_powk 0 inf x 0 1000 100000 && runsv=1
+check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x 0 1000 100000 && runsv=1
+check -q -f -e 0 _ZGVsMxvv_powk 0 inf x -0 -1000 100000 && runsv=1
+check -q -f -e 0 _ZGVsMxvv_powk -0 -inf x -0 -1000 100000 && runsv=1
fi
while read F LO HI N C
do
t $F $LO $HI $N $C
done << EOF
-$(cat $INTERVALS)
+$(cat $INTERVALS | grep "\b$FUNC\b")
EOF
[ 0 -eq $FAIL ] || {
diff --git a/pl/math/test/testcases/directed/acos.tst b/pl/math/test/testcases/directed/acos.tst
new file mode 100644
index 000000000000..a73dcd25965b
--- /dev/null
+++ b/pl/math/test/testcases/directed/acos.tst
@@ -0,0 +1,17 @@
+; acos.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=acos op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=acos op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=acos op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=acos op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=acos op1=7ff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i
+func=acos op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i
+func=acos op1=00000000.00000000 result=3ff921fb.54442d18.469 errno=0
+func=acos op1=80000000.00000000 result=3ff921fb.54442d18.469 errno=0
+func=acos op1=3ff00000.00000000 result=00000000.00000000 errno=0
+func=acos op1=bff00000.00000000 result=400921fb.54442d18.469 errno=0
+func=acos op1=3ff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i
+func=acos op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i
diff --git a/pl/math/test/testcases/directed/acosf.tst b/pl/math/test/testcases/directed/acosf.tst
new file mode 100644
index 000000000000..9e453e3bff5e
--- /dev/null
+++ b/pl/math/test/testcases/directed/acosf.tst
@@ -0,0 +1,21 @@
+; acosf.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=acosf op1=7fc00001 result=7fc00001 errno=0
+func=acosf op1=ffc00001 result=7fc00001 errno=0
+func=acosf op1=7f800001 result=7fc00001 errno=0 status=i
+func=acosf op1=ff800001 result=7fc00001 errno=0 status=i
+func=acosf op1=7f800000 result=7fc00001 errno=EDOM status=i
+func=acosf op1=ff800000 result=7fc00001 errno=EDOM status=i
+func=acosf op1=00000000 result=3fc90fda.a22 errno=0
+func=acosf op1=80000000 result=3fc90fda.a22 errno=0
+func=acosf op1=3f800000 result=00000000 errno=0
+func=acosf op1=bf800000 result=40490fda.a22 errno=0
+func=acosf op1=3f800001 result=7fc00001 errno=EDOM status=i
+func=acosf op1=bf800001 result=7fc00001 errno=EDOM status=i
+func=acosf op1=33000000 result=3fc90fda.622 error=0
+func=acosf op1=30000000 result=3fc90fda.a12 error=0
+func=acosf op1=2d000000 result=3fc90fda.a21 error=0
+func=acosf op1=2a000000 result=3fc90fda.a22 error=0
diff --git a/pl/math/test/testcases/directed/asin.tst b/pl/math/test/testcases/directed/asin.tst
new file mode 100644
index 000000000000..6180d7849d90
--- /dev/null
+++ b/pl/math/test/testcases/directed/asin.tst
@@ -0,0 +1,24 @@
+; asin.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=asin op1=7ff80000.00000001 result=7ff80000.00000001 errno=0
+func=asin op1=fff80000.00000001 result=7ff80000.00000001 errno=0
+func=asin op1=7ff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=asin op1=fff00000.00000001 result=7ff80000.00000001 errno=0 status=i
+func=asin op1=7ff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i
+func=asin op1=fff00000.00000000 result=7ff80000.00000001 errno=EDOM status=i
+func=asin op1=00000000.00000000 result=00000000.00000000 errno=0
+func=asin op1=80000000.00000000 result=80000000.00000000 errno=0
+; Inconsistent behavior was detected for the following 2 cases.
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=asin op1=00000000.00000001 result=00000000.00000001 errno=0 maybestatus=ux
+func=asin op1=80000000.00000001 result=80000000.00000001 errno=0 maybestatus=ux
+
+func=asin op1=3ff00000.00000000 result=3ff921fb.54442d18.469 errno=0
+func=asin op1=bff00000.00000000 result=bff921fb.54442d18.469 errno=0
+func=asin op1=3ff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i
+func=asin op1=bff00000.00000001 result=7ff80000.00000001 errno=EDOM status=i
diff --git a/pl/math/test/testcases/directed/asinf.tst b/pl/math/test/testcases/directed/asinf.tst
new file mode 100644
index 000000000000..a85b2593768d
--- /dev/null
+++ b/pl/math/test/testcases/directed/asinf.tst
@@ -0,0 +1,24 @@
+; asinf.tst
+;
+; Copyright (c) 2009-2023, Arm Limited.
+; SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+func=asinf op1=7fc00001 result=7fc00001 errno=0
+func=asinf op1=ffc00001 result=7fc00001 errno=0
+func=asinf op1=7f800001 result=7fc00001 errno=0 status=i
+func=asinf op1=ff800001 result=7fc00001 errno=0 status=i
+func=asinf op1=7f800000 result=7fc00001 errno=EDOM status=i
+func=asinf op1=ff800000 result=7fc00001 errno=EDOM status=i
+func=asinf op1=00000000 result=00000000 errno=0
+func=asinf op1=80000000 result=80000000 errno=0
+; Inconsistent behavior was detected for the following 2 cases.
+; No exception is raised with certain versions of glibc. Functions
+; approximated by x near zero may not generate/implement flops and
+; thus may not raise exceptions.
+func=asinf op1=00000001 result=00000001 errno=0 maybestatus=ux
+func=asinf op1=80000001 result=80000001 errno=0 maybestatus=ux
+
+func=asinf op1=3f800000 result=3fc90fda.a22 errno=0
+func=asinf op1=bf800000 result=bfc90fda.a22 errno=0
+func=asinf op1=3f800001 result=7fc00001 errno=EDOM status=i
+func=asinf op1=bf800001 result=7fc00001 errno=EDOM status=i
diff --git a/pl/math/test/ulp_funcs.h b/pl/math/test/ulp_funcs.h
index 5e3133e1db4c..4929b481ffe1 100644
--- a/pl/math/test/ulp_funcs.h
+++ b/pl/math/test/ulp_funcs.h
@@ -5,26 +5,12 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#ifdef __vpcs
+#if defined(__vpcs) && __aarch64__
-#define _ZVF1(f) SF1 (f) VF1 (f) ZVNF1 (f)
-#define _ZVD1(f) SD1 (f) VD1 (f) ZVND1 (f)
-#define _ZVF2(f) SF2 (f) VF2 (f) ZVNF2 (f)
-#define _ZVD2(f) SD2 (f) VD2 (f) ZVND2 (f)
-
-#elif __aarch64
-
-#define _ZVF1(f) SF1 (f) VF1 (f)
-#define _ZVD1(f) SD1 (f) VD1 (f)
-#define _ZVF2(f) SF2 (f) VF2 (f)
-#define _ZVD2(f) SD2 (f) VD2 (f)
-
-#elif WANT_VMATH
-
-#define _ZVF1(f) SF1 (f)
-#define _ZVD1(f) SD1 (f)
-#define _ZVF2(f) SF2 (f)
-#define _ZVD2(f) SD2 (f)
+#define _ZVF1(f) ZVF1 (f)
+#define _ZVD1(f) ZVD1 (f)
+#define _ZVF2(f) ZVF2 (f)
+#define _ZVD2(f) ZVD2 (f)
#else
@@ -37,10 +23,10 @@
#if WANT_SVE_MATH
-#define _ZSVF1(f) SVF1 (f) ZSVF1 (f)
-#define _ZSVF2(f) SVF2 (f) ZSVF2 (f)
-#define _ZSVD1(f) SVD1 (f) ZSVD1 (f)
-#define _ZSVD2(f) SVD2 (f) ZSVD2 (f)
+#define _ZSVF1(f) ZSVF1 (f)
+#define _ZSVF2(f) ZSVF2 (f)
+#define _ZSVD1(f) ZSVD1 (f)
+#define _ZSVD2(f) ZSVD2 (f)
#else
@@ -58,9 +44,27 @@
#include "ulp_funcs_gen.h"
+F (_ZGVnN4v_sincosf_sin, v_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0)
+F (_ZGVnN4v_sincosf_cos, v_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0)
+F (_ZGVnN4v_cexpif_sin, v_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0)
+F (_ZGVnN4v_cexpif_cos, v_cexpif_cos, cos, mpfr_cos, 1, 1, f1, 0)
+
+F (_ZGVnN2v_sincos_sin, v_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0)
+F (_ZGVnN2v_sincos_cos, v_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0)
+F (_ZGVnN2v_cexpi_sin, v_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0)
+F (_ZGVnN2v_cexpi_cos, v_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0)
+
#if WANT_SVE_MATH
-F (__sv_powi, sv_powi, ref_powi, mpfr_powi, 2, 0, d2, 0)
F (_ZGVsMxvv_powk, Z_sv_powk, ref_powi, mpfr_powi, 2, 0, d2, 0)
-F (__sv_powif, sv_powif, ref_powif, mpfr_powi, 2, 1, f2, 0)
F (_ZGVsMxvv_powi, Z_sv_powi, ref_powif, mpfr_powi, 2, 1, f2, 0)
+
+F (_ZGVsMxv_sincosf_sin, sv_sincosf_sin, sin, mpfr_sin, 1, 1, f1, 0)
+F (_ZGVsMxv_sincosf_cos, sv_sincosf_cos, cos, mpfr_cos, 1, 1, f1, 0)
+F (_ZGVsMxv_cexpif_sin, sv_cexpif_sin, sin, mpfr_sin, 1, 1, f1, 0)
+F (_ZGVsMxv_cexpif_cos, sv_cexpif_cos, cos, mpfr_cos, 1, 1, f1, 0)
+
+F (_ZGVsMxv_sincos_sin, sv_sincos_sin, sinl, mpfr_sin, 1, 0, d1, 0)
+F (_ZGVsMxv_sincos_cos, sv_sincos_cos, cosl, mpfr_cos, 1, 0, d1, 0)
+F (_ZGVsMxv_cexpi_sin, sv_cexpi_sin, sinl, mpfr_sin, 1, 0, d1, 0)
+F (_ZGVsMxv_cexpi_cos, sv_cexpi_cos, cosl, mpfr_cos, 1, 0, d1, 0)
#endif
diff --git a/pl/math/test/ulp_wrappers.h b/pl/math/test/ulp_wrappers.h
index b682e939054a..0f7b68949c7b 100644
--- a/pl/math/test/ulp_wrappers.h
+++ b/pl/math/test/ulp_wrappers.h
@@ -6,7 +6,9 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#define _GNU_SOURCE
#include <stdbool.h>
+#include <arm_neon.h>
#if USE_MPFR
static int sincos_mpfr_sin(mpfr_t y, const mpfr_t x, mpfr_rnd_t r) {
@@ -36,7 +38,7 @@ static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t
double. This is fine since a round-trip to higher precision and
back down is correctly rounded. */
#define DECL_POW_INT_REF(NAME, DBL_T, FLT_T, INT_T) \
- static DBL_T NAME (DBL_T in_val, DBL_T y) \
+ static DBL_T __attribute__((unused)) NAME (DBL_T in_val, DBL_T y) \
{ \
INT_T n = (INT_T) round (y); \
FLT_T acc = 1.0; \
@@ -60,41 +62,17 @@ static int wrap_mpfr_powi(mpfr_t ret, const mpfr_t x, const mpfr_t y, mpfr_rnd_t
DECL_POW_INT_REF(ref_powif, double, float, int)
DECL_POW_INT_REF(ref_powi, long double, double, int)
-#define VF1_WRAP(func) static float v_##func##f(float x) { return __v_##func##f(argf(x))[0]; }
-#define VF2_WRAP(func) static float v_##func##f(float x, float y) { return __v_##func##f(argf(x), argf(y))[0]; }
-#define VD1_WRAP(func) static double v_##func(double x) { return __v_##func(argd(x))[0]; }
-#define VD2_WRAP(func) static double v_##func(double x, double y) { return __v_##func(argd(x), argd(y))[0]; }
-
-#define VNF1_WRAP(func) static float vn_##func##f(float x) { return __vn_##func##f(argf(x))[0]; }
-#define VNF2_WRAP(func) static float vn_##func##f(float x, float y) { return __vn_##func##f(argf(x), argf(y))[0]; }
-#define VND1_WRAP(func) static double vn_##func(double x) { return __vn_##func(argd(x))[0]; }
-#define VND2_WRAP(func) static double vn_##func(double x, double y) { return __vn_##func(argd(x), argd(y))[0]; }
-
#define ZVF1_WRAP(func) static float Z_##func##f(float x) { return _ZGVnN4v_##func##f(argf(x))[0]; }
#define ZVF2_WRAP(func) static float Z_##func##f(float x, float y) { return _ZGVnN4vv_##func##f(argf(x), argf(y))[0]; }
#define ZVD1_WRAP(func) static double Z_##func(double x) { return _ZGVnN2v_##func(argd(x))[0]; }
#define ZVD2_WRAP(func) static double Z_##func(double x, double y) { return _ZGVnN2vv_##func(argd(x), argd(y))[0]; }
-#ifdef __vpcs
-
-#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func) ZVF1_WRAP(func)
-#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func) ZVF2_WRAP(func)
-#define ZVND1_WRAP(func) VD1_WRAP(func) VND1_WRAP(func) ZVD1_WRAP(func)
-#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func) ZVD2_WRAP(func)
-
-#elif __aarch64__
-
-#define ZVNF1_WRAP(func) VF1_WRAP(func) VNF1_WRAP(func)
-#define ZVNF2_WRAP(func) VF2_WRAP(func) VNF2_WRAP(func)
-#define ZVND1_WRAP(func) VD1_WRAP(func) VND1_WRAP(func)
-#define ZVND2_WRAP(func) VD2_WRAP(func) VND2_WRAP(func)
-
-#elif WANT_VMATH
+#if defined(__vpcs) && __aarch64__
-#define ZVNF1_WRAP(func) VF1_WRAP(func)
-#define ZVNF2_WRAP(func) VF2_WRAP(func)
-#define ZVND1_WRAP(func) VD1_WRAP(func)
-#define ZVND2_WRAP(func) VD2_WRAP(func)
+#define ZVNF1_WRAP(func) ZVF1_WRAP(func)
+#define ZVNF2_WRAP(func) ZVF2_WRAP(func)
+#define ZVND1_WRAP(func) ZVD1_WRAP(func)
+#define ZVND2_WRAP(func) ZVD2_WRAP(func)
#else
@@ -105,11 +83,6 @@ DECL_POW_INT_REF(ref_powi, long double, double, int)
#endif
-#define SVF1_WRAP(func) static float sv_##func##f(float x) { return svretf(__sv_##func##f_x(svargf(x), svptrue_b32())); }
-#define SVF2_WRAP(func) static float sv_##func##f(float x, float y) { return svretf(__sv_##func##f_x(svargf(x), svargf(y), svptrue_b32())); }
-#define SVD1_WRAP(func) static double sv_##func(double x) { return svretd(__sv_##func##_x(svargd(x), svptrue_b64())); }
-#define SVD2_WRAP(func) static double sv_##func(double x, double y) { return svretd(__sv_##func##_x(svargd(x), svargd(y), svptrue_b64())); }
-
#define ZSVF1_WRAP(func) static float Z_sv_##func##f(float x) { return svretf(_ZGVsMxv_##func##f(svargf(x), svptrue_b32())); }
#define ZSVF2_WRAP(func) static float Z_sv_##func##f(float x, float y) { return svretf(_ZGVsMxvv_##func##f(svargf(x), svargf(y), svptrue_b32())); }
#define ZSVD1_WRAP(func) static double Z_sv_##func(double x) { return svretd(_ZGVsMxv_##func(svargd(x), svptrue_b64())); }
@@ -117,10 +90,10 @@ DECL_POW_INT_REF(ref_powi, long double, double, int)
#if WANT_SVE_MATH
-#define ZSVNF1_WRAP(func) SVF1_WRAP(func) ZSVF1_WRAP(func)
-#define ZSVNF2_WRAP(func) SVF2_WRAP(func) ZSVF2_WRAP(func)
-#define ZSVND1_WRAP(func) SVD1_WRAP(func) ZSVD1_WRAP(func)
-#define ZSVND2_WRAP(func) SVD2_WRAP(func) ZSVD2_WRAP(func)
+#define ZSVNF1_WRAP(func) ZSVF1_WRAP(func)
+#define ZSVNF2_WRAP(func) ZSVF2_WRAP(func)
+#define ZSVND1_WRAP(func) ZSVD1_WRAP(func)
+#define ZSVND2_WRAP(func) ZSVD2_WRAP(func)
#else
@@ -139,10 +112,29 @@ DECL_POW_INT_REF(ref_powi, long double, double, int)
#include "ulp_wrappers_gen.h"
+float v_sincosf_sin(float x) { float32x4_t s, c; _ZGVnN4vl4l4_sincosf(vdupq_n_f32(x), &s, &c); return s[0]; }
+float v_sincosf_cos(float x) { float32x4_t s, c; _ZGVnN4vl4l4_sincosf(vdupq_n_f32(x), &s, &c); return c[0]; }
+float v_cexpif_sin(float x) { return _ZGVnN4v_cexpif(vdupq_n_f32(x)).val[0][0]; }
+float v_cexpif_cos(float x) { return _ZGVnN4v_cexpif(vdupq_n_f32(x)).val[1][0]; }
+
+double v_sincos_sin(double x) { float64x2_t s, c; _ZGVnN2vl8l8_sincos(vdupq_n_f64(x), &s, &c); return s[0]; }
+double v_sincos_cos(double x) { float64x2_t s, c; _ZGVnN2vl8l8_sincos(vdupq_n_f64(x), &s, &c); return c[0]; }
+double v_cexpi_sin(double x) { return _ZGVnN2v_cexpi(vdupq_n_f64(x)).val[0][0]; }
+double v_cexpi_cos(double x) { return _ZGVnN2v_cexpi(vdupq_n_f64(x)).val[1][0]; }
+
#if WANT_SVE_MATH
-static float Z_sv_powi(float x, float y) { return svretf(_ZGVsMxvv_powi(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); }
-static float sv_powif(float x, float y) { return svretf(__sv_powif_x(svargf(x), svdup_n_s32((int)round(y)), svptrue_b32())); }
-static double Z_sv_powk(double x, double y) { return svretd(_ZGVsMxvv_powk(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); }
-static double sv_powi(double x, double y) { return svretd(__sv_powi_x(svargd(x), svdup_n_s64((long)round(y)), svptrue_b64())); }
+static float Z_sv_powi(float x, float y) { return svretf(_ZGVsMxvv_powi(svargf(x), svdup_s32((int)round(y)), svptrue_b32())); }
+static double Z_sv_powk(double x, double y) { return svretd(_ZGVsMxvv_powk(svargd(x), svdup_s64((long)round(y)), svptrue_b64())); }
+
+float sv_sincosf_sin(float x) { float s[svcntw()], c[svcntw()]; _ZGVsMxvl4l4_sincosf(svdup_f32(x), s, c, svptrue_b32()); return s[0]; }
+float sv_sincosf_cos(float x) { float s[svcntw()], c[svcntw()]; _ZGVsMxvl4l4_sincosf(svdup_f32(x), s, c, svptrue_b32()); return c[0]; }
+float sv_cexpif_sin(float x) { return svretf(svget2(_ZGVsMxv_cexpif(svdup_f32(x), svptrue_b32()), 0)); }
+float sv_cexpif_cos(float x) { return svretf(svget2(_ZGVsMxv_cexpif(svdup_f32(x), svptrue_b32()), 1)); }
+
+double sv_sincos_sin(double x) { double s[svcntd()], c[svcntd()]; _ZGVsMxvl8l8_sincos(svdup_f64(x), s, c, svptrue_b64()); return s[0]; }
+double sv_sincos_cos(double x) { double s[svcntd()], c[svcntd()]; _ZGVsMxvl8l8_sincos(svdup_f64(x), s, c, svptrue_b64()); return c[0]; }
+double sv_cexpi_sin(double x) { return svretd(svget2(_ZGVsMxv_cexpi(svdup_f64(x), svptrue_b64()), 0)); }
+double sv_cexpi_cos(double x) { return svretd(svget2(_ZGVsMxv_cexpi(svdup_f64(x), svptrue_b64()), 1)); }
+
#endif
// clang-format on
diff --git a/pl/math/tools/asin.sollya b/pl/math/tools/asin.sollya
new file mode 100644
index 000000000000..8ef861d0898b
--- /dev/null
+++ b/pl/math/tools/asin.sollya
@@ -0,0 +1,29 @@
+// polynomial for approximating asin(x)
+//
+// Copyright (c) 2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+f = asin(x);
+dtype = double;
+
+prec=256;
+
+a = 0x1p-106;
+b = 0.25;
+
+deg = 11;
+
+backward = proc(poly, d) {
+ return d + d ^ 3 * poly(d * d);
+};
+
+forward = proc(f, d) {
+ return (f(sqrt(d))-sqrt(d))/(d*sqrt(d));
+};
+
+poly = fpminimax(forward(f, x), [|0,...,deg|], [|dtype ...|], [a;b], relative, floating);
+
+display = hexadecimal!;
+print("rel error:", dirtyinfnorm(1-backward(poly, x)/f(x), [a;b]));
+print("in [", a, b, "]");
+for i from 0 to deg do print(coeff(poly, i));
diff --git a/pl/math/tools/asinf.sollya b/pl/math/tools/asinf.sollya
new file mode 100644
index 000000000000..5b627e546c73
--- /dev/null
+++ b/pl/math/tools/asinf.sollya
@@ -0,0 +1,36 @@
+// polynomial for approximating asinf(x)
+//
+// Copyright (c) 2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+f = asin(x);
+dtype = single;
+
+a = 0x1p-24;
+b = 0.25;
+
+deg = 4;
+
+backward = proc(poly, d) {
+ return d + d ^ 3 * poly(d * d);
+};
+
+forward = proc(f, d) {
+ return (f(sqrt(d))-sqrt(d))/(d*sqrt(d));
+};
+
+approx = proc(poly, d) {
+ return remez(1 - poly(x) / forward(f, x), deg - d, [a;b], x^d/forward(f, x), 1e-16);
+};
+
+poly = 0;
+for i from 0 to deg do {
+ i;
+ p = roundcoefficients(approx(poly,i), [|dtype ...|]);
+ poly = poly + x^i*coeff(p,0);
+};
+
+display = hexadecimal!;
+print("rel error:", accurateinfnorm(1-backward(poly, x)/f(x), [a;b], 30));
+print("in [", a, b, "]");
+for i from 0 to deg do print(coeff(poly, i));
diff --git a/pl/math/tools/erf.sollya b/pl/math/tools/erf.sollya
new file mode 100644
index 000000000000..b2fc559b511e
--- /dev/null
+++ b/pl/math/tools/erf.sollya
@@ -0,0 +1,25 @@
+// tables and constants for approximating erf(x).
+//
+// Copyright (c) 2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+display = hexadecimal;
+prec=128;
+
+// Tables
+print("{ i, r, erf(r), 2/sqrt(pi) * exp(-r^2)}");
+for i from 0 to 768 do {
+ r = i / 128;
+ t0 = double(erf(r));
+ t1 = double(2/sqrt(pi) * exp(-r * r));
+ print("{ " @ i @ ",\t" @ r @ ",\t" @ t0 @ ",\t" @ t1 @ " },");
+};
+
+// Constants
+double(1/3);
+double(1/10);
+double(2/15);
+double(2/9);
+double(2/45);
+double(2/sqrt(pi));
+
diff --git a/pl/math/tools/erfc.sollya b/pl/math/tools/erfc.sollya
index 8c40b4b5db6b..1e2791291ebb 100644
--- a/pl/math/tools/erfc.sollya
+++ b/pl/math/tools/erfc.sollya
@@ -1,23 +1,51 @@
-// polynomial for approximating erfc(x)*exp(x*x)
+// tables and constants for approximating erfc(x).
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2023, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-deg = 12; // poly degree
-
-// interval bounds
-a = 0x1.60dfc14636e2ap0;
-b = 0x1.d413cccfe779ap0;
+display = hexadecimal;
+prec=128;
-f = proc(y) {
- t = y + a;
- return erfc(t) * exp(t*t);
+// Tables
+print("{ i, r, erfc(r), 2/sqrt(pi) * exp(-r^2) }");
+for i from 0 to 3787 do {
+ r = 0.0 + i / 128;
+ t0 = double(erfc(r) * 2^128);
+ t1 = double(2/sqrt(pi) * exp(-r * r) * 2^128);
+ print("{ " @ t0 @ ",\t" @ t1 @ " },");
};
-poly = remez(f(x), deg, [0;b-a], 1, 1e-16);
+// Constants
+print("> 2/sqrt(pi)");
+double(2/sqrt(pi));
-display = hexadecimal;
-print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
-print("in [",a,b,"]");
-print("coeffs:");
-for i from 0 to deg do round(coeff(poly,i), 52, RN);
+print("> 1/3");
+double(1/3);
+
+print("> P5");
+double(2/15);
+double(1/10);
+double(2/9);
+double(2/45);
+
+print("> P6");
+double(1/42);
+double(1/7);
+double(2/21);
+double(4/315);
+
+print("> Q");
+double( 5.0 / 4.0);
+double( 6.0 / 5.0);
+double( 7.0 / 6.0);
+double( 8.0 / 7.0);
+double( 9.0 / 8.0);
+double(10.0 / 9.0);
+
+print("> R");
+double(-2.0 * 4.0 / (5.0 * 6.0));
+double(-2.0 * 5.0 / (6.0 * 7.0));
+double(-2.0 * 6.0 / (7.0 * 8.0));
+double(-2.0 * 7.0 / (8.0 * 9.0));
+double(-2.0 * 8.0 / (9.0 * 10.0));
+double(-2.0 * 9.0 / (10.0 * 11.0));
diff --git a/pl/math/tools/erfcf.sollya b/pl/math/tools/erfcf.sollya
index 69c683647af7..1d7fc264d99d 100644
--- a/pl/math/tools/erfcf.sollya
+++ b/pl/math/tools/erfcf.sollya
@@ -1,31 +1,22 @@
-// polynomial for approximating erfc(x)*exp(x*x)
+// tables and constants for approximating erfcf(x).
//
-// Copyright (c) 2022-2023, Arm Limited.
+// Copyright (c) 2023, Arm Limited.
// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
-deg = 15; // poly degree
-
-// interval bounds
-a = 0x1.0p-26;
-b = 2;
-
-f = proc(y) {
- return erfc(y) * exp(y*y);
-};
-
-approx = proc(poly, d) {
- return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10);
-};
+display = hexadecimal;
+prec=128;
-poly = 0;
-for i from 0 to deg do {
- p = roundcoefficients(approx(poly,i), [|D ...|]);
- poly = poly + x^i*coeff(p,0);
- print(i);
+// Tables
+print("{ i, r, erfc(r), 2/sqrt(pi) * exp(-r^2) }");
+for i from 0 to 644 do {
+ r = 0.0 + i / 64;
+ t0 = single(erfc(r) * 2^47);
+ t1 = single(2/sqrt(pi) * exp(-r * r) * 2^47);
+ print("{ " @ t0 @ ",\t" @ t1 @ " },");
};
-display = hexadecimal;
-print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
-print("in [",a,b,"]");
-print("coeffs:");
-for i from 0 to deg do coeff(poly,i);
+// Constants
+single(1/3);
+single(2/15);
+single(1/10);
+single(2/sqrt(pi));
diff --git a/pl/math/tools/erff.sollya b/pl/math/tools/erff.sollya
new file mode 100644
index 000000000000..59b23ef021f0
--- /dev/null
+++ b/pl/math/tools/erff.sollya
@@ -0,0 +1,20 @@
+// tables and constants for approximating erff(x).
+//
+// Copyright (c) 2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+display = hexadecimal;
+prec=128;
+
+// Tables
+print("{ i, r, erf(r), 2/sqrt(pi) * exp(-r^2)}");
+for i from 0 to 512 do {
+ r = i / 128;
+ t0 = single(erf(r));
+ t1 = single(2/sqrt(pi) * exp(-r * r));
+ print("{ " @ i @ ",\t" @ r @ ",\t" @ t0 @ ",\t" @ t1 @ " },");
+};
+
+// Constants
+single(1/3);
+single(2/sqrt(pi));
diff --git a/pl/math/tools/exp10.sollya b/pl/math/tools/exp10.sollya
new file mode 100644
index 000000000000..9f30b4018209
--- /dev/null
+++ b/pl/math/tools/exp10.sollya
@@ -0,0 +1,55 @@
+// polynomial for approximating 10^x
+//
+// Copyright (c) 2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+// exp10f parameters
+deg = 5; // poly degree
+N = 1; // Neon 1, SVE 64
+b = log(2)/(2 * N * log(10)); // interval
+a = -b;
+wp = single;
+
+// exp10 parameters
+//deg = 4; // poly degree - bump to 5 for ~1 ULP
+//N = 128; // table size
+//b = log(2)/(2 * N * log(10)); // interval
+//a = -b;
+//wp = D;
+
+
+// find polynomial with minimal relative error
+
+f = 10^x;
+
+// return p that minimizes |f(x) - poly(x) - x^d*p(x)|/|f(x)|
+approx = proc(poly,d) {
+ return remez(1 - poly(x)/f(x), deg-d, [a;b], x^d/f(x), 1e-10);
+};
+// return p that minimizes |f(x) - poly(x) - x^d*p(x)|
+approx_abs = proc(poly,d) {
+ return remez(f(x) - poly(x), deg-d, [a;b], x^d, 1e-10);
+};
+
+// first coeff is fixed, iteratively find optimal coeffs in working precision wp
+poly = 1;
+for i from 1 to deg do {
+ p = roundcoefficients(approx(poly,i), [|wp ...|]);
+// p = roundcoefficients(approx_abs(poly,i), [|wp ...|]);
+ poly = poly + x^i*coeff(p,0);
+};
+
+display = hexadecimal;
+print("rel error:", accurateinfnorm(1-poly(x)/10^x, [a;b], 30));
+print("abs error:", accurateinfnorm(10^x-poly(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do coeff(poly,i);
+
+log10_2 = round(N * log(10) / log(2), wp, RN);
+log2_10 = log(2) / (N * log(10));
+log2_10_hi = round(log2_10, wp, RN);
+log2_10_lo = round(log2_10 - log2_10_hi, wp, RN);
+print(log10_2);
+print(log2_10_hi);
+print(log2_10_lo);
diff --git a/pl/math/tools/sincos.sollya b/pl/math/tools/sincos.sollya
new file mode 100644
index 000000000000..7d36266b446b
--- /dev/null
+++ b/pl/math/tools/sincos.sollya
@@ -0,0 +1,33 @@
+// polynomial for approximating cos(x)
+//
+// Copyright (c) 2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+// This script only finds the coeffs for cos - see math/aarch64/v_sin.c for sin coeffs
+
+deg = 14; // polynomial degree
+a = -pi/4; // interval
+b = pi/4;
+
+// find even polynomial with minimal abs error compared to cos(x)
+
+f = cos(x);
+
+// return p that minimizes |f(x) - poly(x) - x^d*p(x)|
+approx = proc(poly,d) {
+ return remez(f(x)-poly(x), deg-d, [a;b], x^d, 1e-10);
+};
+
+// first coeff is fixed, iteratively find optimal double prec coeffs
+poly = 1;
+for i from 1 to deg/2 do {
+ p = roundcoefficients(approx(poly,2*i), [|double ...|]);
+ poly = poly + x^(2*i)*coeff(p,0);
+};
+
+display = hexadecimal;
+//print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+//print("abs error:", accurateinfnorm(f(x)-poly(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do coeff(poly,i);
diff --git a/pl/math/tools/sincosf.sollya b/pl/math/tools/sincosf.sollya
new file mode 100644
index 000000000000..178ee83ac196
--- /dev/null
+++ b/pl/math/tools/sincosf.sollya
@@ -0,0 +1,33 @@
+// polynomial for approximating cos(x)
+//
+// Copyright (c) 2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+// This script only finds the coeffs for cos - see math/tools/sin.sollya for sin coeffs.
+
+deg = 8; // polynomial degree
+a = -pi/4; // interval
+b = pi/4;
+
+// find even polynomial with minimal abs error compared to cos(x)
+
+f = cos(x);
+
+// return p that minimizes |f(x) - poly(x) - x^d*p(x)|
+approx = proc(poly,d) {
+ return remez(f(x)-poly(x), deg-d, [a;b], x^d, 1e-10);
+};
+
+// first coeff is fixed, iteratively find optimal single prec coeffs
+poly = 1;
+for i from 1 to deg/2 do {
+ p = roundcoefficients(approx(poly,2*i), [|single ...|]);
+ poly = poly + x^(2*i)*coeff(p,0);
+};
+
+display = hexadecimal;
+//print("rel error:", accurateinfnorm(1-poly(x)/f(x), [a;b], 30));
+//print("abs error:", accurateinfnorm(f(x)-poly(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do coeff(poly,i);
diff --git a/pl/math/tools/sinpi.sollya b/pl/math/tools/sinpi.sollya
new file mode 100644
index 000000000000..62cc87e7697d
--- /dev/null
+++ b/pl/math/tools/sinpi.sollya
@@ -0,0 +1,33 @@
+// polynomial for approximating sinpi(x)
+//
+// Copyright (c) 2023, Arm Limited.
+// SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+
+deg = 19; // polynomial degree
+a = -1/2; // interval
+b = 1/2;
+
+// find odd polynomial with minimal abs error compared to sinpi(x)
+
+// f = sin(pi* x);
+f = pi*x;
+c = 1;
+for i from 1 to 80 do { c = 2*i*(2*i + 1)*c; f = f + (-1)^i*(pi*x)^(2*i+1)/c; };
+
+// return p that minimizes |f(x) - poly(x) - x^d*p(x)|
+approx = proc(poly,d) {
+ return remez(f(x)-poly(x), deg-d, [a;b], x^d, 1e-10);
+};
+
+// first coeff is predefined, iteratively find optimal double prec coeffs
+poly = pi*x;
+for i from 0 to (deg-1)/2 do {
+ p = roundcoefficients(approx(poly,2*i+1), [|D ...|]);
+ poly = poly + x^(2*i+1)*coeff(p,0);
+};
+
+display = hexadecimal;
+print("abs error:", accurateinfnorm(sin(pi*x)-poly(x), [a;b], 30));
+print("in [",a,b,"]");
+print("coeffs:");
+for i from 0 to deg do coeff(poly,i);
diff --git a/pl/math/trigpi_references.c b/pl/math/trigpi_references.c
new file mode 100644
index 000000000000..4b0514b6766a
--- /dev/null
+++ b/pl/math/trigpi_references.c
@@ -0,0 +1,57 @@
+/*
+ * Extended precision scalar reference functions for trigpi.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#define _GNU_SOURCE
+#include "math_config.h"
+#include "mathlib.h"
+
+long double
+sinpil (long double x)
+{
+ /* sinpi(inf) should return nan, as defined by C23. */
+ if (isinf (x))
+ return __math_invalid (x);
+
+ long double ax = fabsl (x);
+
+ /* Return 0 for all values above 2^64 to prevent
+ overflow when casting to uint64_t. */
+ if (ax >= 0x1p64)
+ return 0;
+
+ /* All integer cases should return 0. */
+ if (ax == (uint64_t) ax)
+ return 0;
+
+ return sinl (x * M_PIl);
+}
+
+long double
+cospil (long double x)
+{
+ /* cospi(inf) should return nan, as defined by C23. */
+ if (isinf (x))
+ return __math_invalid (x);
+
+ long double ax = fabsl (x);
+
+ if (ax >= 0x1p64)
+ return 1;
+
+ uint64_t m = (uint64_t) ax;
+
+ /* Integer values of cospi(x) should return +/-1.
+ The sign depends on if x is odd or even. */
+ if (m == ax)
+ return (m & 1) ? -1 : 1;
+
+ /* Values of Integer + 0.5 should always return 0. */
+ if (ax - 0.5 == m || ax + 0.5 == m)
+ return 0;
+
+ return cosl (ax * M_PIl);
+} \ No newline at end of file
diff --git a/pl/math/v_acos_2u.c b/pl/math/v_acos_2u.c
new file mode 100644
index 000000000000..581f8506c0d6
--- /dev/null
+++ b/pl/math/v_acos_2u.c
@@ -0,0 +1,122 @@
+/*
+ * Double-precision vector acos(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float64x2_t poly[12];
+ float64x2_t pi, pi_over_2;
+ uint64x2_t abs_mask;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+ on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
+ .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
+ V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
+ V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
+ V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
+ V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
+ V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
+ .pi = V2 (0x1.921fb54442d18p+1),
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+ .abs_mask = V2 (0x7fffffffffffffff),
+};
+
+#define AllMask v_u64 (0xffffffffffffffff)
+#define Oneu (0x3ff0000000000000)
+#define Small (0x3e50000000000000) /* 2^-26. */
+
+#if WANT_SIMD_EXCEPT
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (acos, x, y, special);
+}
+#endif
+
+/* Double-precision implementation of vector acos(x).
+
+ For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
+ rounding.
+ If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
+ approximation.
+
+ For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
+ approximation of asin is an odd polynomial:
+
+ acos(x) ~ pi/2 - (x + x^3 P(x^2)).
+
+ The largest observed error in this region is 1.18 ulps,
+ _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
+ want 0x1.0d54d1985c069p+0.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ acos(x) = 2 * (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 1.52 ulps,
+ _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1
+ want 0x1.edbbedf8a7d6cp-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+
+#if WANT_SIMD_EXCEPT
+ /* A single comparison for One, Small and QNaN. */
+ uint64x2_t special
+ = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
+ v_u64 (Oneu - Small));
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, x, AllMask);
+#endif
+
+ uint64x2_t a_le_half = vcleq_f64 (ax, v_f64 (0.5));
+
+ /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ float64x2_t z2 = vbslq_f64 (a_le_half, vmulq_f64 (x, x),
+ vfmaq_f64 (v_f64 (0.5), v_f64 (-0.5), ax));
+ float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float64x2_t z4 = vmulq_f64 (z2, z2);
+ float64x2_t z8 = vmulq_f64 (z4, z4);
+ float64x2_t z16 = vmulq_f64 (z8, z8);
+ float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
+
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
+
+ /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+ = 2 Q(|x|) , for 0.5 < x < 1.0
+ = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
+ float64x2_t y = vbslq_f64 (d->abs_mask, p, x);
+
+ uint64x2_t is_neg = vcltzq_f64 (x);
+ float64x2_t off = vreinterpretq_f64_u64 (
+ vandq_u64 (is_neg, vreinterpretq_u64_f64 (d->pi)));
+ float64x2_t mul = vbslq_f64 (a_le_half, v_f64 (-1.0), v_f64 (2.0));
+ float64x2_t add = vbslq_f64 (a_le_half, d->pi_over_2, off);
+
+ return vfmaq_f64 (add, mul, y);
+}
+
+PL_SIG (V, D, 1, acos, -1.0, 1.0)
+PL_TEST_ULP (V_NAME_D1 (acos), 1.02)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (acos), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME_D1 (acos), 0, Small, 5000)
+PL_TEST_INTERVAL (V_NAME_D1 (acos), Small, 0.5, 50000)
+PL_TEST_INTERVAL (V_NAME_D1 (acos), 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (V_NAME_D1 (acos), 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (V_NAME_D1 (acos), 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (V_NAME_D1 (acos), -0, -inf, 20000)
diff --git a/pl/math/v_acosf_1u4.c b/pl/math/v_acosf_1u4.c
new file mode 100644
index 000000000000..bb17b1df18f3
--- /dev/null
+++ b/pl/math/v_acosf_1u4.c
@@ -0,0 +1,113 @@
+/*
+ * Single-precision vector acos(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float32x4_t poly[5];
+ float32x4_t pi_over_2f, pif;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+ [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
+ .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5),
+ V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) },
+ .pi_over_2f = V4 (0x1.921fb6p+0f),
+ .pif = V4 (0x1.921fb6p+1f),
+};
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define Small 0x32800000 /* 2^-26. */
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (acosf, x, y, special);
+}
+#endif
+
+/* Single-precision implementation of vector acos(x).
+
+ For |x| < Small, approximate acos(x) by pi/2 - x. Small = 2^-26 for correct
+ rounding.
+ If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the following
+ approximation.
+
+ For |x| in [Small, 0.5], use order 4 polynomial P such that the final
+ approximation of asin is an odd polynomial:
+
+ acos(x) ~ pi/2 - (x + x^3 P(x^2)).
+
+ The largest observed error in this region is 1.26 ulps,
+ _ZGVnN4v_acosf (0x1.843bfcp-2) got 0x1.2e934cp+0 want 0x1.2e934ap+0.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ acos(x) = 2 * (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 1.32 ulps,
+ _ZGVnN4v_acosf (0x1.15ba56p-1) got 0x1.feb33p-1
+ want 0x1.feb32ep-1. */
+float32x4_t VPCS_ATTR V_NAME_F1 (acos) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
+
+#if WANT_SIMD_EXCEPT
+ /* A single comparison for One, Small and QNaN. */
+ uint32x4_t special
+ = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, x, v_u32 (0xffffffff));
+#endif
+
+ float32x4_t ax = vreinterpretq_f32_u32 (ia);
+ uint32x4_t a_le_half = vcleq_u32 (ia, v_u32 (Half));
+
+ /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with
+ z2 = x ^ 2 and z = |x| , if |x| < 0.5
+ z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */
+ float32x4_t z2 = vbslq_f32 (a_le_half, vmulq_f32 (x, x),
+ vfmsq_n_f32 (v_f32 (0.5), ax, 0.5));
+ float32x4_t z = vbslq_f32 (a_le_half, ax, vsqrtq_f32 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float32x4_t p = v_horner_4_f32 (z2, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
+
+ /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5
+ = 2 Q(|x|) , for 0.5 < x < 1.0
+ = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */
+ float32x4_t y = vbslq_f32 (v_u32 (AbsMask), p, x);
+
+ uint32x4_t is_neg = vcltzq_f32 (x);
+ float32x4_t off = vreinterpretq_f32_u32 (
+ vandq_u32 (vreinterpretq_u32_f32 (d->pif), is_neg));
+ float32x4_t mul = vbslq_f32 (a_le_half, v_f32 (-1.0), v_f32 (2.0));
+ float32x4_t add = vbslq_f32 (a_le_half, d->pi_over_2f, off);
+
+ return vfmaq_f32 (add, mul, y);
+}
+
+PL_SIG (V, F, 1, acos, -1.0, 1.0)
+PL_TEST_ULP (V_NAME_F1 (acos), 0.82)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (acos), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME_F1 (acos), 0, 0x1p-26, 5000)
+PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p-26, 0.5, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (acos), 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (acos), 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (acos), 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (V_NAME_F1 (acos), -0, -inf, 20000)
diff --git a/pl/math/v_acosh_3u5.c b/pl/math/v_acosh_3u5.c
index 22f69d7636e4..42fa2616d562 100644
--- a/pl/math/v_acosh_3u5.c
+++ b/pl/math/v_acosh_3u5.c
@@ -11,41 +11,56 @@
#define WANT_V_LOG1P_K0_SHORTCUT 1
#include "v_log1p_inline.h"
-#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). */
-
-#if V_SUPPORTED
+const static struct data
+{
+ struct v_log1p_data log1p_consts;
+ uint64x2_t one, thresh;
+} data = {
+ .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
+ .one = V2 (0x3ff0000000000000),
+ .thresh = V2 (0x1ff0000000000000) /* asuint64(0x1p511) - asuint64(1). */
+};
-static NOINLINE VPCS_ATTR v_f64_t
-special_case (v_f64_t x)
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special,
+ const struct v_log1p_data *d)
{
- return v_call_f64 (acosh, x, x, v_u64 (-1));
+ return v_call_f64 (acosh, x, log1p_inline (y, d), special);
}
/* Vector approximation for double-precision acosh, based on log1p.
The largest observed error is 3.02 ULP in the region where the
argument to log1p falls in the k=0 interval, i.e. x close to 1:
- __v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5
- want 0x1.f2d6d823bc9e2p-5. */
-VPCS_ATTR v_f64_t V_NAME (acosh) (v_f64_t x)
+ _ZGVnN2v_acosh(0x1.00798aaf80739p+0) got 0x1.f2d6d823bc9dfp-5
+ want 0x1.f2d6d823bc9e2p-5. */
+VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x)
{
- v_u64_t itop = v_as_u64_f64 (x) >> 52;
- v_u64_t special = v_cond_u64 ((itop - OneTop) >= (BigBoundTop - OneTop));
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t special
+ = vcgeq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (x), d->one), d->thresh);
+ float64x2_t special_arg = x;
- /* Fall back to scalar routine for all lanes if any of them are special. */
+#if WANT_SIMD_EXCEPT
if (unlikely (v_any_u64 (special)))
- return special_case (x);
+ x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x);
+#endif
- v_f64_t xm1 = x - 1;
- v_f64_t u = xm1 * (x + 1);
- return log1p_inline (xm1 + v_sqrt_f64 (u));
+ float64x2_t xm1 = vsubq_f64 (x, v_f64 (1));
+ float64x2_t y;
+ y = vaddq_f64 (x, v_f64 (1));
+ y = vmulq_f64 (y, xm1);
+ y = vsqrtq_f64 (y);
+ y = vaddq_f64 (xm1, y);
+
+ if (unlikely (v_any_u64 (special)))
+ return special_case (special_arg, y, special, &d->log1p_consts);
+ return log1p_inline (y, &d->log1p_consts);
}
-VPCS_ALIAS
PL_SIG (V, D, 1, acosh, 1.0, 10.0)
-PL_TEST_ULP (V_NAME (acosh), 2.53)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (acosh))
-PL_TEST_INTERVAL (V_NAME (acosh), 1, 0x1p511, 90000)
-PL_TEST_INTERVAL (V_NAME (acosh), 0x1p511, inf, 10000)
-PL_TEST_INTERVAL (V_NAME (acosh), 0, 1, 1000)
-PL_TEST_INTERVAL (V_NAME (acosh), -0, -inf, 10000)
-#endif
+PL_TEST_ULP (V_NAME_D1 (acosh), 2.53)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (acosh), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME_D1 (acosh), 1, 0x1p511, 90000)
+PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0x1p511, inf, 10000)
+PL_TEST_INTERVAL (V_NAME_D1 (acosh), 0, 1, 1000)
+PL_TEST_INTERVAL (V_NAME_D1 (acosh), -0, -inf, 10000)
diff --git a/pl/math/v_acoshf_3u1.c b/pl/math/v_acoshf_3u1.c
index 2b5aff591a74..a2ff0f02635b 100644
--- a/pl/math/v_acoshf_3u1.c
+++ b/pl/math/v_acoshf_3u1.c
@@ -7,19 +7,26 @@
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "v_log1pf_inline.h"
-#define SignMask 0x80000000
-#define One 0x3f800000
-#define SquareLim 0x5f800000 /* asuint(0x1p64). */
-
-#if V_SUPPORTED
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t one;
+ uint16x4_t thresh;
+} data = {
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .one = V4 (0x3f800000),
+ .thresh = V4 (0x2000) /* top16 (asuint(0x1p64) - asuint(1)). */
+};
-#include "v_log1pf_inline.h"
+#define SignMask 0x80000000
-static NOINLINE VPCS_ATTR v_f32_t
-special_case (v_f32_t x, v_f32_t y, v_u32_t special)
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
+ const struct v_log1pf_data d)
{
- return v_call_f32 (acoshf, x, y, special);
+ return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
}
/* Vector approximation for single-precision acosh, based on log1p. Maximum
@@ -32,37 +39,40 @@ special_case (v_f32_t x, v_f32_t y, v_u32_t special)
__v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
want 0x1.fbc7f4p-4. */
-VPCS_ATTR v_f32_t V_NAME (acoshf) (v_f32_t x)
+VPCS_ATTR float32x4_t V_NAME_F1 (acosh) (float32x4_t x)
{
- v_u32_t ix = v_as_u32_f32 (x);
- v_u32_t special = v_cond_u32 ((ix - One) >= (SquareLim - One));
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
#if WANT_SIMD_EXCEPT
/* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
- only xm1 to calculate u, as operating on x will trigger invalid for NaN. */
- v_f32_t xm1 = v_sel_f32 (special, v_f32 (1), x - 1);
- v_f32_t u = v_fma_f32 (xm1, xm1, 2 * xm1);
+ only xm1 to calculate u, as operating on x will trigger invalid for NaN.
+ Widening sign-extend special predicate in order to mask with it. */
+ uint32x4_t p
+ = vreinterpretq_u32_s32 (vmovl_s16 (vreinterpret_s16_u16 (special)));
+ float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
+ float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
#else
- v_f32_t xm1 = x - 1;
- v_f32_t u = xm1 * (x + 1.0f);
+ float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
+ float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
#endif
- v_f32_t y = log1pf_inline (xm1 + v_sqrt_f32 (u));
- if (unlikely (v_any_u32 (special)))
- return special_case (x, y, special);
- return y;
+ float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, y, special, d->log1pf_consts);
+ return log1pf_inline (y, d->log1pf_consts);
}
-VPCS_ALIAS
PL_SIG (V, F, 1, acosh, 1.0, 10.0)
#if WANT_SIMD_EXCEPT
-PL_TEST_ULP (V_NAME (acoshf), 2.29)
+PL_TEST_ULP (V_NAME_F1 (acosh), 2.29)
#else
-PL_TEST_ULP (V_NAME (acoshf), 2.58)
-#endif
-PL_TEST_EXPECT_FENV (V_NAME (acoshf), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (acoshf), 0, 1, 500)
-PL_TEST_INTERVAL (V_NAME (acoshf), 1, SquareLim, 100000)
-PL_TEST_INTERVAL (V_NAME (acoshf), SquareLim, inf, 1000)
-PL_TEST_INTERVAL (V_NAME (acoshf), -0, -inf, 1000)
+PL_TEST_ULP (V_NAME_F1 (acosh), 2.58)
#endif
+PL_TEST_EXPECT_FENV (V_NAME_F1 (acosh), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME_F1 (acosh), 0, 1, 500)
+PL_TEST_INTERVAL (V_NAME_F1 (acosh), 1, SquareLim, 100000)
+PL_TEST_INTERVAL (V_NAME_F1 (acosh), SquareLim, inf, 1000)
+PL_TEST_INTERVAL (V_NAME_F1 (acosh), -0, -inf, 1000)
diff --git a/pl/math/v_asin_3u.c b/pl/math/v_asin_3u.c
new file mode 100644
index 000000000000..756443c6b320
--- /dev/null
+++ b/pl/math/v_asin_3u.c
@@ -0,0 +1,113 @@
+/*
+ * Double-precision vector asin(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float64x2_t poly[12];
+ float64x2_t pi_over_2;
+ uint64x2_t abs_mask;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
+ on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */
+ .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
+ V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
+ V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
+ V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
+ V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
+ V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+ .abs_mask = V2 (0x7fffffffffffffff),
+};
+
+#define AllMask v_u64 (0xffffffffffffffff)
+#define One (0x3ff0000000000000)
+#define Small (0x3e50000000000000) /* 2^-26. */
+
+#if WANT_SIMD_EXCEPT
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (asin, x, y, special);
+}
+#endif
+
+/* Double-precision implementation of vector asin(x).
+
+ For |x| < Small, approximate asin(x) by x. Small = 2^-26 for correct
+ rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the
+ following approximation.
+
+ For |x| in [Small, 0.5], use an order 11 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 1.01 ulps,
+ _ZGVnN2v_asin (0x1.da9735b5a9277p-2) got 0x1.ed78525a927efp-2
+ want 0x1.ed78525a927eep-2.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ asin(x) = pi/2 - 2 * (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 2.69 ulps,
+ _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
+ want 0x1.110d7e85fdd53p-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+
+#if WANT_SIMD_EXCEPT
+ /* Special values need to be computed with scalar fallbacks so
+ that appropriate exceptions are raised. */
+ uint64x2_t special
+ = vcgtq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (ax), v_u64 (Small)),
+ v_u64 (One - Small));
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, x, AllMask);
+#endif
+
+ uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5));
+
+ /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+ z = x ^ 2 and y = |x| , if |x| < 0.5
+ z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
+ float64x2_t z2 = vbslq_f64 (a_lt_half, vmulq_f64 (x, x),
+ vfmsq_n_f64 (v_f64 (0.5), ax, 0.5));
+ float64x2_t z = vbslq_f64 (a_lt_half, ax, vsqrtq_f64 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float64x2_t z4 = vmulq_f64 (z2, z2);
+ float64x2_t z8 = vmulq_f64 (z4, z4);
+ float64x2_t z16 = vmulq_f64 (z8, z8);
+ float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
+
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ float64x2_t y = vbslq_f64 (a_lt_half, p, vfmsq_n_f64 (d->pi_over_2, p, 2.0));
+
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
+}
+
+PL_SIG (V, D, 1, asin, -1.0, 1.0)
+PL_TEST_ULP (V_NAME_D1 (asin), 2.19)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (asin), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME_D1 (asin), 0, Small, 5000)
+PL_TEST_INTERVAL (V_NAME_D1 (asin), Small, 0.5, 50000)
+PL_TEST_INTERVAL (V_NAME_D1 (asin), 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (V_NAME_D1 (asin), 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (V_NAME_D1 (asin), 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (V_NAME_D1 (asin), -0, -inf, 20000)
diff --git a/pl/math/v_asinf_2u5.c b/pl/math/v_asinf_2u5.c
new file mode 100644
index 000000000000..eb978cd956ab
--- /dev/null
+++ b/pl/math/v_asinf_2u5.c
@@ -0,0 +1,104 @@
+/*
+ * Single-precision vector asin(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float32x4_t poly[5];
+ float32x4_t pi_over_2f;
+} data = {
+ /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on
+ [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */
+ .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5),
+ V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) },
+ .pi_over_2f = V4 (0x1.921fb6p+0f),
+};
+
+#define AbsMask 0x7fffffff
+#define Half 0x3f000000
+#define One 0x3f800000
+#define Small 0x39800000 /* 2^-12. */
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (asinf, x, y, special);
+}
+#endif
+
+/* Single-precision implementation of vector asin(x).
+
+ For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct
+ rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the
+ following approximation.
+
+ For |x| in [Small, 0.5], use order 4 polynomial P such that the final
+ approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
+
+ The largest observed error in this region is 0.83 ulps,
+ _ZGVnN4v_asinf (0x1.ea00f4p-2) got 0x1.fef15ep-2 want 0x1.fef15cp-2.
+
+ For |x| in [0.5, 1.0], use same approximation with a change of variable
+
+ asin(x) = pi/2 - 2 * (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z).
+
+ The largest observed error in this region is 2.41 ulps,
+ _ZGVnN4v_asinf (0x1.00203ep-1) got 0x1.0c3a64p-1 want 0x1.0c3a6p-1. */
+float32x4_t VPCS_ATTR V_NAME_F1 (asin) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask));
+
+#if WANT_SIMD_EXCEPT
+ /* Special values need to be computed with scalar fallbacks so
+ that appropriate fp exceptions are raised. */
+ uint32x4_t special
+ = vcgtq_u32 (vsubq_u32 (ia, v_u32 (Small)), v_u32 (One - Small));
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, x, v_u32 (0xffffffff));
+#endif
+
+ float32x4_t ax = vreinterpretq_f32_u32 (ia);
+ uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half));
+
+ /* Evaluate polynomial Q(x) = y + y * z * P(z) with
+ z = x ^ 2 and y = |x| , if |x| < 0.5
+ z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */
+ float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x),
+ vfmsq_n_f32 (v_f32 (0.5), ax, 0.5));
+ float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2));
+
+ /* Use a single polynomial approximation P for both intervals. */
+ float32x4_t p = v_horner_4_f32 (z2, d->poly);
+ /* Finalize polynomial: z + z * z2 * P(z2). */
+ p = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
+
+ /* asin(|x|) = Q(|x|) , for |x| < 0.5
+ = pi/2 - 2 Q(|x|), for |x| >= 0.5. */
+ float32x4_t y
+ = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0));
+
+ /* Copy sign. */
+ return vbslq_f32 (v_u32 (AbsMask), y, x);
+}
+
+PL_SIG (V, F, 1, asin, -1.0, 1.0)
+PL_TEST_ULP (V_NAME_F1 (asin), 1.91)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (asin), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME_F1 (asin), 0, 0x1p-12, 5000)
+PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p-12, 0.5, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (asin), 0.5, 1.0, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (asin), 1.0, 0x1p11, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (asin), 0x1p11, inf, 20000)
+PL_TEST_INTERVAL (V_NAME_F1 (asin), -0, -inf, 20000)
diff --git a/pl/math/v_asinh_3u5.c b/pl/math/v_asinh_3u5.c
index fd329b6b7f69..4862bef94861 100644
--- a/pl/math/v_asinh_3u5.c
+++ b/pl/math/v_asinh_3u5.c
@@ -6,75 +6,81 @@
*/
#include "v_math.h"
-#include "estrin.h"
+#include "poly_advsimd_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
-#if V_SUPPORTED
+#define A(i) v_f64 (__v_log_data.poly[i])
+#define N (1 << V_LOG_TABLE_BITS)
-#define OneTop 0x3ff /* top12(asuint64(1.0f)). */
-#define HugeBound 0x5fe /* top12(asuint64(0x1p511)). */
-#define TinyBound 0x3e5 /* top12(asuint64(0x1p-26)). */
-#define AbsMask v_u64 (0x7fffffffffffffff)
-#define C(i) v_f64 (__asinh_data.poly[i])
-
-/* Constants & data for log. */
-#define OFF 0x3fe6000000000000
-#define Ln2 v_f64 (0x1.62e42fefa39efp-1)
-#define A(i) v_f64 (__sv_log_data.poly[i])
-#define T(i) __log_data.tab[i]
-#define N (1 << LOG_TABLE_BITS)
+const static struct data
+{
+ float64x2_t poly[18];
+ uint64x2_t off, huge_bound, abs_mask;
+ float64x2_t ln2, tiny_bound;
+} data = {
+ .off = V2 (0x3fe6900900000000),
+ .ln2 = V2 (0x1.62e42fefa39efp-1),
+ .huge_bound = V2 (0x5fe0000000000000),
+ .tiny_bound = V2 (0x1p-26),
+ .abs_mask = V2 (0x7fffffffffffffff),
+ /* Even terms of polynomial s.t. asinh(x) is approximated by
+ asinh(x) ~= x + x^3 * (C0 + C1 * x^2 + C2 * x^4 + C3 * x^6 + ...).
+ Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
+ .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4),
+ V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6),
+ V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6),
+ V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7),
+ V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7),
+ V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8),
+ V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9),
+ V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12),
+ V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) },
+};
-static NOINLINE v_f64_t
-special_case (v_f64_t x, v_f64_t y, v_u64_t special)
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
{
return v_call_f64 (asinh, x, y, special);
}
struct entry
{
- v_f64_t invc;
- v_f64_t logc;
+ float64x2_t invc;
+ float64x2_t logc;
};
static inline struct entry
-lookup (v_u64_t i)
+lookup (uint64x2_t i)
{
- struct entry e;
-#ifdef SCALAR
- e.invc = T (i).invc;
- e.logc = T (i).logc;
-#else
- e.invc[0] = T (i[0]).invc;
- e.logc[0] = T (i[0]).logc;
- e.invc[1] = T (i[1]).invc;
- e.logc[1] = T (i[1]).logc;
-#endif
- return e;
+ float64x2_t e0 = vld1q_f64 (
+ &__v_log_data.table[(i[0] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
+ float64x2_t e1 = vld1q_f64 (
+ &__v_log_data.table[(i[1] >> (52 - V_LOG_TABLE_BITS)) & (N - 1)].invc);
+ return (struct entry){ vuzp1q_f64 (e0, e1), vuzp2q_f64 (e0, e1) };
}
-static inline v_f64_t
-log_inline (v_f64_t x)
+static inline float64x2_t
+log_inline (float64x2_t x, const struct data *d)
{
- /* Double-precision vector log, copied from math/v_log.c with some cosmetic
- modification and special-cases removed. See that file for details of the
- algorithm used. */
- v_u64_t ix = v_as_u64_f64 (x);
- v_u64_t tmp = ix - OFF;
- v_u64_t i = (tmp >> (52 - LOG_TABLE_BITS)) % N;
- v_s64_t k = v_as_s64_u64 (tmp) >> 52;
- v_u64_t iz = ix - (tmp & 0xfffULL << 52);
- v_f64_t z = v_as_f64_u64 (iz);
- struct entry e = lookup (i);
- v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0));
- v_f64_t kd = v_to_f64_s64 (k);
- v_f64_t hi = v_fma_f64 (kd, Ln2, e.logc + r);
- v_f64_t r2 = r * r;
- v_f64_t y = v_fma_f64 (A (3), r, A (2));
- v_f64_t p = v_fma_f64 (A (1), r, A (0));
- y = v_fma_f64 (A (4), r2, y);
- y = v_fma_f64 (y, r2, p);
- y = v_fma_f64 (y, r2, hi);
+ /* Double-precision vector log, copied from ordinary vector log with some
+ cosmetic modification and special-cases removed. */
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t tmp = vsubq_u64 (ix, d->off);
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
+ uint64x2_t iz
+ = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52)));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+ struct entry e = lookup (tmp);
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+ float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = vfmaq_f64 (A (2), A (3), r);
+ float64x2_t p = vfmaq_f64 (A (0), A (1), r);
+ y = vfmaq_f64 (y, A (4), r2);
+ y = vfmaq_f64 (p, y, r2);
+ y = vfmaq_f64 (hi, y, r2);
return y;
}
@@ -89,34 +95,35 @@ log_inline (v_f64_t x)
|x| >= 1:
__v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1
want 0x1.ffffcfd0e2352p-1. */
-VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x)
+VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
{
- v_u64_t ix = v_as_u64_f64 (x);
- v_u64_t iax = ix & AbsMask;
- v_f64_t ax = v_as_f64_u64 (iax);
- v_u64_t top12 = iax >> 52;
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t iax = vreinterpretq_u64_f64 (ax);
- v_u64_t gt1 = v_cond_u64 (top12 >= OneTop);
- v_u64_t special = v_cond_u64 (top12 >= HugeBound);
+ uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1));
+ uint64x2_t special = vcgeq_u64 (iax, d->huge_bound);
#if WANT_SIMD_EXCEPT
- v_u64_t tiny = v_cond_u64 (top12 < TinyBound);
- special |= tiny;
+ uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound);
+ special = vorrq_u64 (special, tiny);
#endif
/* Option 1: |x| >= 1.
Compute asinh(x) according by asinh(x) = log(x + sqrt(x^2 + 1)).
If WANT_SIMD_EXCEPT is enabled, sidestep special values, which will
overflow, by setting special lanes to 1. These will be fixed later. */
- v_f64_t option_1 = v_f64 (0);
+ float64x2_t option_1 = v_f64 (0);
if (likely (v_any_u64 (gt1)))
{
#if WANT_SIMD_EXCEPT
- v_f64_t xm = v_sel_f64 (special, v_f64 (1), ax);
+ float64x2_t xm = v_zerofy_f64 (ax, special);
#else
- v_f64_t xm = ax;
+ float64x2_t xm = ax;
#endif
- option_1 = log_inline (xm + v_sqrt_f64 (xm * xm + 1));
+ option_1 = log_inline (
+ vaddq_f64 (xm, vsqrtq_f64 (vfmaq_f64 (v_f64 (1), xm, xm))), d);
}
/* Option 2: |x| < 1.
@@ -127,49 +134,42 @@ VPCS_ATTR v_f64_t V_NAME (asinh) (v_f64_t x)
special-case. The largest observed error in this region is 1.47 ULPs:
__v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1
want 0x1.c1d6bf874019cp-1. */
- v_f64_t option_2 = v_f64 (0);
- if (likely (v_any_u64 (~gt1)))
+ float64x2_t option_2 = v_f64 (0);
+ if (likely (v_any_u64 (vceqzq_u64 (gt1))))
{
#if WANT_SIMD_EXCEPT
- ax = v_sel_f64 (tiny | gt1, v_f64 (0), ax);
+ ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1));
#endif
- v_f64_t x2 = ax * ax;
- v_f64_t z2 = x2 * x2;
- v_f64_t z4 = z2 * z2;
- v_f64_t z8 = z4 * z4;
- v_f64_t p = ESTRIN_17 (x2, z2, z4, z8, z8 * z8, C);
- option_2 = v_fma_f64 (p, x2 * ax, ax);
+ float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2),
+ z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2),
+ z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8);
+ float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly);
+ option_2 = vfmaq_f64 (ax, p, x3);
#if WANT_SIMD_EXCEPT
- option_2 = v_sel_f64 (tiny, x, option_2);
+ option_2 = vbslq_f64 (tiny, x, option_2);
#endif
}
/* Choose the right option for each lane. */
- v_f64_t y = v_sel_f64 (gt1, option_1, option_2);
+ float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
/* Copy sign. */
- y = v_as_f64_u64 (v_bsl_u64 (AbsMask, v_as_u64_f64 (y), ix));
+ y = vbslq_f64 (d->abs_mask, y, x);
if (unlikely (v_any_u64 (special)))
return special_case (x, y, special);
return y;
}
-VPCS_ALIAS
PL_SIG (V, D, 1, asinh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME (asinh), 2.80)
-PL_TEST_EXPECT_FENV (V_NAME (asinh), WANT_SIMD_EXCEPT)
+PL_TEST_ULP (V_NAME_D1 (asinh), 2.80)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (asinh), WANT_SIMD_EXCEPT)
/* Test vector asinh 3 times, with control lane < 1, > 1 and special.
Ensures the v_sel is choosing the right option in all cases. */
-#define V_ASINH_INTERVAL(lo, hi, n) \
- PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0.5) \
- PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 2) \
- PL_TEST_INTERVAL_C (V_NAME (asinh), lo, hi, n, 0x1p600)
+#define V_ASINH_INTERVAL(lo, hi, n) \
+ PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 0.5) \
+ PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 2) \
+ PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (asinh), lo, hi, n, 0x1p600)
V_ASINH_INTERVAL (0, 0x1p-26, 50000)
V_ASINH_INTERVAL (0x1p-26, 1, 50000)
V_ASINH_INTERVAL (1, 0x1p511, 50000)
V_ASINH_INTERVAL (0x1p511, inf, 40000)
-V_ASINH_INTERVAL (-0, -0x1p-26, 50000)
-V_ASINH_INTERVAL (-0x1p-26, -1, 50000)
-V_ASINH_INTERVAL (-1, -0x1p511, 50000)
-V_ASINH_INTERVAL (-0x1p511, -inf, 40000)
-#endif
diff --git a/pl/math/v_asinhf_2u7.c b/pl/math/v_asinhf_2u7.c
index 9d8c8a936ae3..1723ba90d2f3 100644
--- a/pl/math/v_asinhf_2u7.c
+++ b/pl/math/v_asinhf_2u7.c
@@ -6,21 +6,29 @@
*/
#include "v_math.h"
-#include "include/mathlib.h"
#include "pl_sig.h"
#include "pl_test.h"
-
-#if V_SUPPORTED
+#include "v_log1pf_inline.h"
#define SignMask v_u32 (0x80000000)
-#define One v_f32 (1.0f)
-#define BigBound v_u32 (0x5f800000) /* asuint(0x1p64). */
-#define TinyBound v_u32 (0x30800000) /* asuint(0x1p-30). */
-#include "v_log1pf_inline.h"
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t big_bound;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound;
+#endif
+} data = {
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
+#if WANT_SIMD_EXCEPT
+ .tiny_bound = V4 (0x30800000) /* asuint(0x1p-30). */
+#endif
+};
-static NOINLINE v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
{
return v_call_f32 (asinhf, x, y, special);
}
@@ -28,43 +36,45 @@ specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
/* Single-precision implementation of vector asinh(x), using vector log1p.
Worst-case error is 2.66 ULP, at roughly +/-0.25:
__v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */
-VPCS_ATTR v_f32_t V_NAME (asinhf) (v_f32_t x)
+VPCS_ATTR float32x4_t V_NAME_F1 (asinh) (float32x4_t x)
{
- v_u32_t ix = v_as_u32_f32 (x);
- v_u32_t iax = ix & ~SignMask;
- v_u32_t sign = ix & SignMask;
- v_f32_t ax = v_as_f32_u32 (iax);
- v_u32_t special = v_cond_u32 (iax >= BigBound);
+ const struct data *dat = ptr_barrier (&data);
+ uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask);
+ float32x4_t ax = vreinterpretq_f32_u32 (iax);
+ uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
+ float32x4_t special_arg = x;
#if WANT_SIMD_EXCEPT
/* Sidestep tiny and large values to avoid inadvertently triggering
under/overflow. */
- special |= v_cond_u32 (iax < TinyBound);
+ special = vorrq_u32 (special, vcltq_u32 (iax, dat->tiny_bound));
if (unlikely (v_any_u32 (special)))
- ax = v_sel_f32 (special, One, ax);
+ {
+ ax = v_zerofy_f32 (ax, special);
+ x = v_zerofy_f32 (x, special);
+ }
#endif
/* asinh(x) = log(x + sqrt(x * x + 1)).
For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
- v_f32_t d = One + v_sqrt_f32 (ax * ax + One);
- v_f32_t y = log1pf_inline (ax + ax * ax / d);
- y = v_as_f32_u32 (sign | v_as_u32_f32 (y));
+ float32x4_t d
+ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x)));
+ float32x4_t y = log1pf_inline (
+ vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts);
if (unlikely (v_any_u32 (special)))
- return specialcase (x, y, special);
- return y;
+ return special_case (special_arg, vbslq_f32 (SignMask, x, y), special);
+ return vbslq_f32 (SignMask, x, y);
}
-VPCS_ALIAS
PL_SIG (V, F, 1, asinh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME (asinhf), 2.17)
-PL_TEST_EXPECT_FENV (V_NAME (asinhf), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (asinhf), 0, 0x1p-12, 40000)
-PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p-12, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME (asinhf), 1.0, 0x1p11, 40000)
-PL_TEST_INTERVAL (V_NAME (asinhf), 0x1p11, inf, 40000)
-PL_TEST_INTERVAL (V_NAME (asinhf), 0, -0x1p-12, 20000)
-PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p-12, -1.0, 20000)
-PL_TEST_INTERVAL (V_NAME (asinhf), -1.0, -0x1p11, 20000)
-PL_TEST_INTERVAL (V_NAME (asinhf), -0x1p11, -inf, 20000)
-#endif
+PL_TEST_ULP (V_NAME_F1 (asinh), 2.17)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (asinh), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0, 0x1p-12, 40000)
+PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p-12, 1.0, 40000)
+PL_TEST_INTERVAL (V_NAME_F1 (asinh), 1.0, 0x1p11, 40000)
+PL_TEST_INTERVAL (V_NAME_F1 (asinh), 0x1p11, inf, 40000)
+PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0, -0x1p-12, 20000)
+PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p-12, -1.0, 20000)
+PL_TEST_INTERVAL (V_NAME_F1 (asinh), -1.0, -0x1p11, 20000)
+PL_TEST_INTERVAL (V_NAME_F1 (asinh), -0x1p11, -inf, 20000)
diff --git a/pl/math/v_atan2_3u.c b/pl/math/v_atan2_3u.c
index 6327fea8eb2c..f24667682dec 100644
--- a/pl/math/v_atan2_3u.c
+++ b/pl/math/v_atan2_3u.c
@@ -8,83 +8,114 @@
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_advsimd_f64.h"
-#if V_SUPPORTED
-
-#include "atan_common.h"
+static const struct data
+{
+ float64x2_t pi_over_2;
+ float64x2_t poly[20];
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ the interval [2**-1022, 1.0]. */
+ .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
+ V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
+ V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
+ V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
+ V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
+ V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
+ V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
+ V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
+ V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
+ V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+};
-#define PiOver2 v_f64 (0x1.921fb54442d18p+0)
#define SignMask v_u64 (0x8000000000000000)
/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */
-VPCS_ATTR
-NOINLINE static v_f64_t
-specialcase (v_f64_t y, v_f64_t x, v_f64_t ret, v_u64_t cmp)
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp)
{
return v_call2_f64 (atan2, y, x, ret, cmp);
}
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
-static inline v_u64_t
-zeroinfnan (v_u64_t i)
+static inline uint64x2_t
+zeroinfnan (uint64x2_t i)
{
- return v_cond_u64 (2 * i - 1 >= v_u64 (2 * asuint64 (INFINITY) - 1));
+ /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
+ return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)),
+ v_u64 (2 * asuint64 (INFINITY) - 1));
}
/* Fast implementation of vector atan2.
Maximum observed error is 2.8 ulps:
- v_atan2(0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5)
+ _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5)
got 0x1.92d628ab678ccp-1
want 0x1.92d628ab678cfp-1. */
-VPCS_ATTR
-v_f64_t V_NAME (atan2) (v_f64_t y, v_f64_t x)
+float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
{
- v_u64_t ix = v_as_u64_f64 (x);
- v_u64_t iy = v_as_u64_f64 (y);
+ const struct data *data_ptr = ptr_barrier (&data);
- v_u64_t special_cases = zeroinfnan (ix) | zeroinfnan (iy);
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t iy = vreinterpretq_u64_f64 (y);
- v_u64_t sign_x = ix & SignMask;
- v_u64_t sign_y = iy & SignMask;
- v_u64_t sign_xy = sign_x ^ sign_y;
+ uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy));
- v_f64_t ax = v_abs_f64 (x);
- v_f64_t ay = v_abs_f64 (y);
+ uint64x2_t sign_x = vandq_u64 (ix, SignMask);
+ uint64x2_t sign_y = vandq_u64 (iy, SignMask);
+ uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y);
- v_u64_t pred_xlt0 = x < 0.0;
- v_u64_t pred_aygtax = ay > ax;
+ float64x2_t ax = vabsq_f64 (x);
+ float64x2_t ay = vabsq_f64 (y);
+
+ uint64x2_t pred_xlt0 = vcltzq_f64 (x);
+ uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax);
/* Set up z for call to atan. */
- v_f64_t n = v_sel_f64 (pred_aygtax, -ax, ay);
- v_f64_t d = v_sel_f64 (pred_aygtax, ay, ax);
- v_f64_t z = v_div_f64 (n, d);
+ float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
+ float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax);
+ float64x2_t z = vdivq_f64 (n, d);
/* Work out the correct shift. */
- v_f64_t shift = v_sel_f64 (pred_xlt0, v_f64 (-2.0), v_f64 (0.0));
- shift = v_sel_f64 (pred_aygtax, shift + 1.0, shift);
- shift *= PiOver2;
-
- v_f64_t ret = eval_poly (z, z, shift);
+ float64x2_t shift = vreinterpretq_f64_u64 (
+ vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0))));
+ shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
+ shift = vmulq_f64 (shift, data_ptr->pi_over_2);
+
+ /* Calculate the polynomial approximation.
+ Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
+ full scheme to avoid underflow in x^16.
+ The order 19 polynomial P approximates
+ (atan(sqrt(x))-sqrt(x))/x^(3/2). */
+ float64x2_t z2 = vmulq_f64 (z, z);
+ float64x2_t x2 = vmulq_f64 (z2, z2);
+ float64x2_t x4 = vmulq_f64 (x2, x2);
+ float64x2_t x8 = vmulq_f64 (x4, x4);
+ float64x2_t ret
+ = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly),
+ v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8);
+
+ /* Finalize. y = shift + z + z^3 * P(z^2). */
+ ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
+ ret = vaddq_f64 (ret, shift);
/* Account for the sign of x and y. */
- ret = v_as_f64_u64 (v_as_u64_f64 (ret) ^ sign_xy);
+ ret = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
if (unlikely (v_any_u64 (special_cases)))
- {
- return specialcase (y, x, ret, special_cases);
- }
+ return special_case (y, x, ret, special_cases);
return ret;
}
-VPCS_ALIAS
/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
PL_SIG (V, D, 2, atan2)
// TODO tighten this once __v_atan2 is fixed
-PL_TEST_ULP (V_NAME (atan2), 2.9)
-PL_TEST_INTERVAL (V_NAME (atan2), -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (V_NAME (atan2), -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atan2), 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atan2), 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atan2), 1e6, 1e32, 40000)
-#endif
+PL_TEST_ULP (V_NAME_D2 (atan2), 2.9)
+PL_TEST_INTERVAL (V_NAME_D2 (atan2), -10.0, 10.0, 50000)
+PL_TEST_INTERVAL (V_NAME_D2 (atan2), -1.0, 1.0, 40000)
+PL_TEST_INTERVAL (V_NAME_D2 (atan2), 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (V_NAME_D2 (atan2), 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (V_NAME_D2 (atan2), 1e6, 1e32, 40000)
diff --git a/pl/math/v_atan2f_3u.c b/pl/math/v_atan2f_3u.c
index 5d1e6ca4488e..bbfc3cb552f6 100644
--- a/pl/math/v_atan2f_3u.c
+++ b/pl/math/v_atan2f_3u.c
@@ -8,82 +8,108 @@
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_advsimd_f32.h"
-#if V_SUPPORTED
-
-#include "atanf_common.h"
+static const struct data
+{
+ float32x4_t poly[8];
+ float32x4_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-128, 1.0].
+ Generated using fpminimax between FLT_MIN and 1. */
+ .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
+ V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
+ V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
+ .pi_over_2 = V4 (0x1.921fb6p+0f),
+};
-/* Useful constants. */
-#define PiOver2 v_f32 (0x1.921fb6p+0f)
#define SignMask v_u32 (0x80000000)
/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
-VPCS_ATTR
-NOINLINE static v_f32_t
-specialcase (v_f32_t y, v_f32_t x, v_f32_t ret, v_u32_t cmp)
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp)
{
return v_call2_f32 (atan2f, y, x, ret, cmp);
}
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
-static inline v_u32_t
-zeroinfnan (v_u32_t i)
+static inline uint32x4_t
+zeroinfnan (uint32x4_t i)
{
- return v_cond_u32 (2 * i - 1 >= v_u32 (2 * 0x7f800000lu - 1));
+ /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */
+ return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)),
+ v_u32 (2 * 0x7f800000lu - 1));
}
/* Fast implementation of vector atan2f. Maximum observed error is
2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
- v_atan2(0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
- want 0x1.967f00p-1. */
-VPCS_ATTR
-v_f32_t V_NAME (atan2f) (v_f32_t y, v_f32_t x)
+ _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
+ want 0x1.967f00p-1. */
+float32x4_t VPCS_ATTR V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
{
- v_u32_t ix = v_as_u32_f32 (x);
- v_u32_t iy = v_as_u32_f32 (y);
+ const struct data *data_ptr = ptr_barrier (&data);
- v_u32_t special_cases = zeroinfnan (ix) | zeroinfnan (iy);
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t iy = vreinterpretq_u32_f32 (y);
- v_u32_t sign_x = ix & SignMask;
- v_u32_t sign_y = iy & SignMask;
- v_u32_t sign_xy = sign_x ^ sign_y;
+ uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy));
- v_f32_t ax = v_abs_f32 (x);
- v_f32_t ay = v_abs_f32 (y);
+ uint32x4_t sign_x = vandq_u32 (ix, SignMask);
+ uint32x4_t sign_y = vandq_u32 (iy, SignMask);
+ uint32x4_t sign_xy = veorq_u32 (sign_x, sign_y);
- v_u32_t pred_xlt0 = x < 0.0f;
- v_u32_t pred_aygtax = ay > ax;
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ay = vabsq_f32 (y);
+
+ uint32x4_t pred_xlt0 = vcltzq_f32 (x);
+ uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax);
/* Set up z for call to atanf. */
- v_f32_t n = v_sel_f32 (pred_aygtax, -ax, ay);
- v_f32_t d = v_sel_f32 (pred_aygtax, ay, ax);
- v_f32_t z = v_div_f32 (n, d);
+ float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
+ float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax);
+ float32x4_t z = vdivq_f32 (n, d);
/* Work out the correct shift. */
- v_f32_t shift = v_sel_f32 (pred_xlt0, v_f32 (-2.0f), v_f32 (0.0f));
- shift = v_sel_f32 (pred_aygtax, shift + 1.0f, shift);
- shift *= PiOver2;
-
- v_f32_t ret = eval_poly (z, z, shift);
+ float32x4_t shift = vreinterpretq_f32_u32 (
+ vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
+ shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
+ shift = vmulq_f32 (shift, data_ptr->pi_over_2);
+
+ /* Calculate the polynomial approximation.
+ Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
+ a standard implementation using z8 creates spurious underflow
+ in the very last fma (when z^8 is small enough).
+ Therefore, we split the last fma into a mul and an fma.
+ Horner and single-level Estrin have higher errors that exceed
+ threshold. */
+ float32x4_t z2 = vmulq_f32 (z, z);
+ float32x4_t z4 = vmulq_f32 (z2, z2);
+
+ float32x4_t ret = vfmaq_f32 (
+ v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4,
+ vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4)));
+
+ /* y = shift + z + z^3 * P(z^2). */
+ ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);
/* Account for the sign of y. */
- ret = v_as_f32_u32 (v_as_u32_f32 (ret) ^ sign_xy);
+ ret = vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
if (unlikely (v_any_u32 (special_cases)))
{
- return specialcase (y, x, ret, special_cases);
+ return special_case (y, x, ret, special_cases);
}
return ret;
}
-VPCS_ALIAS
/* Arity of 2 means no mathbench entry emitted. See test/mathbench_funcs.h. */
PL_SIG (V, F, 2, atan2)
-PL_TEST_ULP (V_NAME (atan2f), 2.46)
-PL_TEST_INTERVAL (V_NAME (atan2f), -10.0, 10.0, 50000)
-PL_TEST_INTERVAL (V_NAME (atan2f), -1.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atan2f), 0.0, 1.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atan2f), 1.0, 100.0, 40000)
-PL_TEST_INTERVAL (V_NAME (atan2f), 1e6, 1e32, 40000)
-#endif
+PL_TEST_ULP (V_NAME_F2 (atan2), 2.46)
+PL_TEST_INTERVAL (V_NAME_F2 (atan2), -10.0, 10.0, 50000)
+PL_TEST_INTERVAL (V_NAME_F2 (atan2), -1.0, 1.0, 40000)
+PL_TEST_INTERVAL (V_NAME_F2 (atan2), 0.0, 1.0, 40000)
+PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1.0, 100.0, 40000)
+PL_TEST_INTERVAL (V_NAME_F2 (atan2), 1e6, 1e32, 40000)
diff --git a/pl/math/v_atan_2u5.c b/pl/math/v_atan_2u5.c
index 0f3c2ccf2606..ba68cc3cc720 100644
--- a/pl/math/v_atan_2u5.c
+++ b/pl/math/v_atan_2u5.c
@@ -8,33 +8,51 @@
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_advsimd_f64.h"
-#if V_SUPPORTED
-
-#include "atan_common.h"
+static const struct data
+{
+ float64x2_t pi_over_2;
+ float64x2_t poly[20];
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-1022, 1.0]. */
+ .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
+ V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
+ V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
+ V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
+ V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
+ V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
+ V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
+ V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
+ V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
+ V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
+ .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+};
-#define PiOver2 v_f64 (0x1.921fb54442d18p+0)
-#define AbsMask v_u64 (0x7fffffffffffffff)
-#define TinyBound 0x3e1 /* top12(asuint64(0x1p-30)). */
-#define BigBound 0x434 /* top12(asuint64(0x1p53)). */
+#define SignMask v_u64 (0x8000000000000000)
+#define TinyBound 0x3e10000000000000 /* asuint64(0x1p-30). */
+#define BigBound 0x4340000000000000 /* asuint64(0x1p53). */
/* Fast implementation of vector atan.
Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps:
- __v_atan(0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
- want 0x1.9225645bdd7c3p-1. */
-VPCS_ATTR
-v_f64_t V_NAME (atan) (v_f64_t x)
+ _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
+ want 0x1.9225645bdd7c3p-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
{
+ const struct data *d = ptr_barrier (&data);
+
/* Small cases, infs and nans are supported by our approximation technique,
but do not set fenv flags correctly. Only trigger special case if we need
fenv. */
- v_u64_t ix = v_as_u64_f64 (x);
- v_u64_t sign = ix & ~AbsMask;
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t sign = vandq_u64 (ix, SignMask);
#if WANT_SIMD_EXCEPT
- v_u64_t ia12 = (ix >> 52) & 0x7ff;
- v_u64_t special = v_cond_u64 (ia12 - TinyBound > BigBound - TinyBound);
+ uint64x2_t ia12 = vandq_u64 (ix, v_u64 (0x7ff0000000000000));
+ uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia12, v_u64 (TinyBound)),
+ v_u64 (BigBound - TinyBound));
/* If any lane is special, fall back to the scalar routine for all lanes. */
if (unlikely (v_any_u64 (special)))
return v_call_f64 (atan, x, v_f64 (0), v_u64 (-1));
@@ -44,31 +62,43 @@ v_f64_t V_NAME (atan) (v_f64_t x)
y := arctan(x) for x < 1
y := pi/2 + arctan(-1/x) for x > 1
Hence, use z=-1/a if x>=1, otherwise z=a. */
- v_u64_t red = v_cagt_f64 (x, v_f64 (1.0));
+ uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0));
/* Avoid dependency in abs(x) in division (and comparison). */
- v_f64_t z = v_sel_f64 (red, v_div_f64 (v_f64 (-1.0), x), x);
- v_f64_t shift = v_sel_f64 (red, PiOver2, v_f64 (0.0));
+ float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x);
+ float64x2_t shift = vreinterpretq_f64_u64 (
+ vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2)));
/* Use absolute value only when needed (odd powers of z). */
- v_f64_t az = v_abs_f64 (z);
- az = v_sel_f64 (red, -az, az);
+ float64x2_t az = vbslq_f64 (
+ SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z);
- /* Calculate the polynomial approximation. */
- v_f64_t y = eval_poly (z, az, shift);
+ /* Calculate the polynomial approximation.
+ Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
+ full scheme to avoid underflow in x^16.
+ The order 19 polynomial P approximates
+ (atan(sqrt(x))-sqrt(x))/x^(3/2). */
+ float64x2_t z2 = vmulq_f64 (z, z);
+ float64x2_t x2 = vmulq_f64 (z2, z2);
+ float64x2_t x4 = vmulq_f64 (x2, x2);
+ float64x2_t x8 = vmulq_f64 (x4, x4);
+ float64x2_t y
+ = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly),
+ v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8);
+
+ /* Finalize. y = shift + z + z^3 * P(z^2). */
+ y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
+ y = vaddq_f64 (y, shift);
/* y = atan(x) if x>0, -atan(-x) otherwise. */
- y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign);
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign));
return y;
}
-VPCS_ALIAS
PL_SIG (V, D, 1, atan, -10.0, 10.0)
-PL_TEST_ULP (V_NAME (atan), 1.78)
-PL_TEST_EXPECT_FENV (V_NAME (atan), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (atan), 0, 0x1p-30, 10000)
-PL_TEST_INTERVAL (V_NAME (atan), -0, -0x1p-30, 1000)
-PL_TEST_INTERVAL (V_NAME (atan), 0x1p-30, 0x1p53, 900000)
-PL_TEST_INTERVAL (V_NAME (atan), -0x1p-30, -0x1p53, 90000)
-PL_TEST_INTERVAL (V_NAME (atan), 0x1p53, inf, 10000)
-PL_TEST_INTERVAL (V_NAME (atan), -0x1p53, -inf, 1000)
-
-#endif
+PL_TEST_ULP (V_NAME_D1 (atan), 1.78)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (atan), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL (V_NAME_D1 (atan), 0, 0x1p-30, 10000)
+PL_TEST_INTERVAL (V_NAME_D1 (atan), -0, -0x1p-30, 1000)
+PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p-30, 0x1p53, 900000)
+PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p-30, -0x1p53, 90000)
+PL_TEST_INTERVAL (V_NAME_D1 (atan), 0x1p53, inf, 10000)
+PL_TEST_INTERVAL (V_NAME_D1 (atan), -0x1p53, -inf, 1000)
diff --git a/pl/math/v_atanf_3u.c b/pl/math/v_atanf_3u.c
index 67d90b94f5d3..f522d957c1cc 100644
--- a/pl/math/v_atanf_3u.c
+++ b/pl/math/v_atanf_3u.c
@@ -8,19 +8,32 @@
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_advsimd_f32.h"
-#if V_SUPPORTED
+static const struct data
+{
+ float32x4_t poly[8];
+ float32x4_t pi_over_2;
+} data = {
+ /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
+ [2**-128, 1.0].
+ Generated using fpminimax between FLT_MIN and 1. */
+ .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
+ V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
+ V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
+ .pi_over_2 = V4 (0x1.921fb6p+0f),
+};
+
+#define SignMask v_u32 (0x80000000)
-#include "atanf_common.h"
+#define P(i) d->poly[i]
-#define PiOver2 v_f32 (0x1.921fb6p+0f)
-#define AbsMask v_u32 (0x7fffffff)
-#define TinyBound 0x308 /* top12(asuint(0x1p-30)). */
-#define BigBound 0x4e8 /* top12(asuint(0x1p30)). */
+#define TinyBound 0x30800000 /* asuint(0x1p-30). */
+#define BigBound 0x4e800000 /* asuint(0x1p30). */
#if WANT_SIMD_EXCEPT
-static NOINLINE v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
{
return v_call_f32 (atanf, x, y, special);
}
@@ -29,55 +42,66 @@ specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
/* Fast implementation of vector atanf based on
atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps:
- v_atanf(0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */
-VPCS_ATTR
-v_f32_t V_NAME (atanf) (v_f32_t x)
+ _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */
+float32x4_t VPCS_ATTR V_NAME_F1 (atan) (float32x4_t x)
{
+ const struct data *d = ptr_barrier (&data);
+
/* Small cases, infs and nans are supported by our approximation technique,
but do not set fenv flags correctly. Only trigger special case if we need
fenv. */
- v_u32_t ix = v_as_u32_f32 (x);
- v_u32_t sign = ix & ~AbsMask;
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t sign = vandq_u32 (ix, SignMask);
#if WANT_SIMD_EXCEPT
- v_u32_t ia12 = (ix >> 20) & 0x7ff;
- v_u32_t special = v_cond_u32 (ia12 - TinyBound > BigBound - TinyBound);
+ uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000));
+ uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)),
+ v_u32 (BigBound - TinyBound));
/* If any lane is special, fall back to the scalar routine for all lanes. */
if (unlikely (v_any_u32 (special)))
- return specialcase (x, x, v_u32 (-1));
+ return special_case (x, x, v_u32 (-1));
#endif
/* Argument reduction:
y := arctan(x) for x < 1
y := pi/2 + arctan(-1/x) for x > 1
Hence, use z=-1/a if x>=1, otherwise z=a. */
- v_u32_t red = v_cagt_f32 (x, v_f32 (1.0));
+ uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0));
/* Avoid dependency in abs(x) in division (and comparison). */
- v_f32_t z = v_sel_f32 (red, v_div_f32 (v_f32 (-1.0f), x), x);
- v_f32_t shift = v_sel_f32 (red, PiOver2, v_f32 (0.0f));
+ float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x);
+ float32x4_t shift = vreinterpretq_f32_u32 (
+ vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2)));
/* Use absolute value only when needed (odd powers of z). */
- v_f32_t az = v_abs_f32 (z);
- az = v_sel_f32 (red, -az, az);
+ float32x4_t az = vbslq_f32 (
+ SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z);
- /* Calculate the polynomial approximation. */
- v_f32_t y = eval_poly (z, az, shift);
+ /* Calculate the polynomial approximation.
+ Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
+ a standard implementation using z8 creates spurious underflow
+ in the very last fma (when z^8 is small enough).
+ Therefore, we split the last fma into a mul and an fma.
+ Horner and single-level Estrin have higher errors that exceed
+ threshold. */
+ float32x4_t z2 = vmulq_f32 (z, z);
+ float32x4_t z4 = vmulq_f32 (z2, z2);
+
+ float32x4_t y = vfmaq_f32 (
+ v_pairwise_poly_3_f32 (z2, z4, d->poly), z4,
+ vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4)));
+
+ /* y = shift + az + az * z^2 * P(z^2). */
+ y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift);
/* y = atan(x) if x>0, -atan(-x) otherwise. */
- y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign);
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign));
return y;
}
-VPCS_ALIAS
PL_SIG (V, F, 1, atan, -10.0, 10.0)
-PL_TEST_ULP (V_NAME (atanf), 2.5)
-PL_TEST_EXPECT_FENV (V_NAME (atanf), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (atanf), 0, 0x1p-30, 5000)
-PL_TEST_INTERVAL (V_NAME (atanf), -0, -0x1p-30, 5000)
-PL_TEST_INTERVAL (V_NAME (atanf), 0x1p-30, 1, 40000)
-PL_TEST_INTERVAL (V_NAME (atanf), -0x1p-30, -1, 40000)
-PL_TEST_INTERVAL (V_NAME (atanf), 1, 0x1p30, 40000)
-PL_TEST_INTERVAL (V_NAME (atanf), -1, -0x1p30, 40000)
-PL_TEST_INTERVAL (V_NAME (atanf), 0x1p30, inf, 1000)
-PL_TEST_INTERVAL (V_NAME (atanf), -0x1p30, -inf, 1000)
-#endif
+PL_TEST_ULP (V_NAME_F1 (atan), 2.5)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (atan), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0, 0x1p-30, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p-30, 1, 40000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 1, 0x1p30, 40000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (atan), 0x1p30, inf, 1000)
diff --git a/pl/math/v_atanh_3u5.c b/pl/math/v_atanh_3u5.c
index bfaf5c2b917f..f282826a3f32 100644
--- a/pl/math/v_atanh_3u5.c
+++ b/pl/math/v_atanh_3u5.c
@@ -6,56 +6,61 @@
*/
#include "v_math.h"
-#include "pairwise_horner.h"
#include "pl_sig.h"
#include "pl_test.h"
-#if V_SUPPORTED
-
#define WANT_V_LOG1P_K0_SHORTCUT 0
#include "v_log1p_inline.h"
-#define AbsMask 0x7fffffffffffffff
-#define Half 0x3fe0000000000000
-#define One 0x3ff0000000000000
+const static struct data
+{
+ struct v_log1p_data log1p_consts;
+ uint64x2_t one, half;
+} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
+ .one = V2 (0x3ff0000000000000),
+ .half = V2 (0x3fe0000000000000) };
-VPCS_ATTR
-NOINLINE static v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t special)
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
{
return v_call_f64 (atanh, x, y, special);
}
/* Approximation for vector double-precision atanh(x) using modified log1p.
The greatest observed error is 3.31 ULP:
- __v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
- want 0x1.ffd8ff31b501cp-6. */
+ _ZGVnN2v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
+ want 0x1.ffd8ff31b501cp-6. */
VPCS_ATTR
-v_f64_t V_NAME (atanh) (v_f64_t x)
+float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
{
- v_u64_t ix = v_as_u64_f64 (x);
- v_u64_t sign = ix & ~AbsMask;
- v_u64_t ia = ix & AbsMask;
- v_u64_t special = v_cond_u64 (ia >= One);
- v_f64_t halfsign = v_as_f64_u64 (sign | Half);
+ const struct data *d = ptr_barrier (&data);
- /* Mask special lanes with 0 to prevent spurious underflow. */
- v_f64_t ax = v_sel_f64 (special, v_f64 (0), v_as_f64_u64 (ia));
- v_f64_t y = halfsign * log1p_inline ((2 * ax) / (1 - ax));
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t ia = vreinterpretq_u64_f64 (ax);
+ uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia);
+ uint64x2_t special = vcgeq_u64 (ia, d->one);
+ float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half));
+
+#if WANT_SIMD_EXCEPT
+ ax = v_zerofy_f64 (ax, special);
+#endif
+
+ float64x2_t y;
+ y = vaddq_f64 (ax, ax);
+ y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax));
+ y = log1p_inline (y, &d->log1p_consts);
if (unlikely (v_any_u64 (special)))
- return specialcase (x, y, special);
- return y;
+ return special_case (x, vmulq_f64 (y, halfsign), special);
+ return vmulq_f64 (y, halfsign);
}
-VPCS_ALIAS
PL_SIG (V, D, 1, atanh, -1.0, 1.0)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (atanh))
-PL_TEST_ULP (V_NAME (atanh), 3.32)
-PL_TEST_INTERVAL_C (V_NAME (atanh), 0, 0x1p-23, 10000, 0)
-PL_TEST_INTERVAL_C (V_NAME (atanh), -0, -0x1p-23, 10000, 0)
-PL_TEST_INTERVAL_C (V_NAME (atanh), 0x1p-23, 1, 90000, 0)
-PL_TEST_INTERVAL_C (V_NAME (atanh), -0x1p-23, -1, 90000, 0)
-PL_TEST_INTERVAL_C (V_NAME (atanh), 1, inf, 100, 0)
-PL_TEST_INTERVAL_C (V_NAME (atanh), -1, -inf, 100, 0)
-#endif
+PL_TEST_EXPECT_FENV (V_NAME_D1 (atanh), WANT_SIMD_EXCEPT)
+PL_TEST_ULP (V_NAME_D1 (atanh), 3.32)
+/* atanh is asymptotic at 1, which is the default control value - have to set
+ -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+ control lane is irrelevant if fp exceptions are disabled). */
+PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0, 0x1p-23, 10000, 0)
+PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 0x1p-23, 1, 90000, 0)
+PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (atanh), 1, inf, 100, 0)
diff --git a/pl/math/v_atanhf_3u1.c b/pl/math/v_atanhf_3u1.c
index cd3069661142..f6a5f25eca9a 100644
--- a/pl/math/v_atanhf_3u1.c
+++ b/pl/math/v_atanhf_3u1.c
@@ -6,57 +6,72 @@
*/
#include "v_math.h"
-#include "mathlib.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "v_log1pf_inline.h"
-#if V_SUPPORTED
+const static struct data
+{
+ struct v_log1pf_data log1pf_consts;
+ uint32x4_t one;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound;
+#endif
+} data = {
+ .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
+ .one = V4 (0x3f800000),
+#if WANT_SIMD_EXCEPT
+ /* 0x1p-12, below which atanhf(x) rounds to x. */
+ .tiny_bound = V4 (0x39800000),
+#endif
+};
-#include "v_log1pf_inline.h"
+#define AbsMask v_u32 (0x7fffffff)
+#define Half v_u32 (0x3f000000)
-#define AbsMask 0x7fffffff
-#define Half 0x3f000000
-#define One 0x3f800000
-#define TinyBound 0x39800000 /* 0x1p-12, below which atanhf(x) rounds to x. */
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (atanhf, x, y, special);
+}
/* Approximation for vector single-precision atanh(x) using modified log1p.
The maximum error is 3.08 ULP:
__v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
want 0x1.ffcb82p-5. */
-VPCS_ATTR v_f32_t V_NAME (atanhf) (v_f32_t x)
+VPCS_ATTR float32x4_t V_NAME_F1 (atanh) (float32x4_t x)
{
- v_u32_t ix = v_as_u32_f32 (x);
- v_f32_t halfsign
- = v_as_f32_u32 (v_bsl_u32 (v_u32 (AbsMask), v_u32 (Half), ix));
- v_u32_t iax = ix & AbsMask;
+ const struct data *d = ptr_barrier (&data);
- v_f32_t ax = v_as_f32_u32 (iax);
+ float32x4_t halfsign = vbslq_f32 (AbsMask, v_f32 (0.5), x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
#if WANT_SIMD_EXCEPT
- v_u32_t special = v_cond_u32 ((iax >= One) | (iax <= TinyBound));
+ uint32x4_t special
+ = vorrq_u32 (vcgeq_u32 (iax, d->one), vcltq_u32 (iax, d->tiny_bound));
/* Side-step special cases by setting those lanes to 0, which will trigger no
exceptions. These will be fixed up later. */
if (unlikely (v_any_u32 (special)))
- ax = v_sel_f32 (special, v_f32 (0), ax);
+ ax = v_zerofy_f32 (ax, special);
#else
- v_u32_t special = v_cond_u32 (iax >= One);
+ uint32x4_t special = vcgeq_u32 (iax, d->one);
#endif
- v_f32_t y = halfsign * log1pf_inline ((2 * ax) / (1 - ax));
+ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
+ y = log1pf_inline (y, d->log1pf_consts);
if (unlikely (v_any_u32 (special)))
- return v_call_f32 (atanhf, x, y, special);
- return y;
+ return special_case (x, vmulq_f32 (halfsign, y), special);
+ return vmulq_f32 (halfsign, y);
}
-VPCS_ALIAS
PL_SIG (V, F, 1, atanh, -1.0, 1.0)
-PL_TEST_ULP (V_NAME (atanhf), 2.59)
-PL_TEST_EXPECT_FENV (V_NAME (atanhf), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL_C (V_NAME (atanhf), 0, 0x1p-12, 500, 0)
-PL_TEST_INTERVAL_C (V_NAME (atanhf), 0x1p-12, 1, 200000, 0)
-PL_TEST_INTERVAL_C (V_NAME (atanhf), 1, inf, 1000, 0)
-PL_TEST_INTERVAL_C (V_NAME (atanhf), -0, -0x1p-12, 500, 0)
-PL_TEST_INTERVAL_C (V_NAME (atanhf), -0x1p-12, -1, 200000, 0)
-PL_TEST_INTERVAL_C (V_NAME (atanhf), -1, -inf, 1000, 0)
-#endif
+PL_TEST_ULP (V_NAME_F1 (atanh), 2.59)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (atanh), WANT_SIMD_EXCEPT)
+/* atanh is asymptotic at 1, which is the default control value - have to set
+ -c 0 specially to ensure fp exceptions are triggered correctly (choice of
+ control lane is irrelevant if fp exceptions are disabled). */
+PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0, 0x1p-12, 500, 0)
+PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 0x1p-12, 1, 200000, 0)
+PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (atanh), 1, inf, 1000, 0)
diff --git a/pl/math/v_cbrt_2u.c b/pl/math/v_cbrt_2u.c
index d5abe41024bc..cc7cff15dc0f 100644
--- a/pl/math/v_cbrt_2u.c
+++ b/pl/math/v_cbrt_2u.c
@@ -6,26 +6,38 @@
*/
#include "v_math.h"
-#include "mathlib.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_advsimd_f64.h"
-#if V_SUPPORTED
+const static struct data
+{
+ float64x2_t poly[4], one_third, shift;
+ int64x2_t exp_bias;
+ uint64x2_t abs_mask, tiny_bound;
+ uint32x4_t thresh;
+ double table[5];
+} data = {
+ .shift = V2 (0x1.8p52),
+ .poly = { /* Generated with fpminimax in [0.5, 1]. */
+ V2 (0x1.c14e8ee44767p-2), V2 (0x1.dd2d3f99e4c0ep-1),
+ V2 (-0x1.08e83026b7e74p-1), V2 (0x1.2c74eaa3ba428p-3) },
+ .exp_bias = V2 (1022),
+ .abs_mask = V2(0x7fffffffffffffff),
+ .tiny_bound = V2(0x0010000000000000), /* Smallest normal. */
+ .thresh = V4(0x7fe00000), /* asuint64 (infinity) - tiny_bound. */
+ .one_third = V2(0x1.5555555555555p-2),
+ .table = { /* table[i] = 2^((i - 2) / 3). */
+ 0x1.428a2f98d728bp-1, 0x1.965fea53d6e3dp-1, 0x1p0,
+ 0x1.428a2f98d728bp0, 0x1.965fea53d6e3dp0 }
+};
-#define AbsMask 0x7fffffffffffffff
-#define TwoThirds v_f64 (0x1.5555555555555p-1)
-#define TinyBound 0x001 /* top12 (smallest_normal). */
-#define BigBound 0x7ff /* top12 (infinity). */
#define MantissaMask v_u64 (0x000fffffffffffff)
-#define HalfExp v_u64 (0x3fe0000000000000)
-
-#define C(i) v_f64 (__cbrt_data.poly[i])
-#define T(i) v_lookup_f64 (__cbrt_data.table, i)
-static NOINLINE v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t special)
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint32x2_t special)
{
- return v_call_f64 (cbrt, x, y, special);
+ return v_call_f64 (cbrt, x, y, vmovl_u32 (special));
}
/* Approximation for double-precision vector cbrt(x), using low-order polynomial
@@ -35,31 +47,33 @@ specialcase (v_f64_t x, v_f64_t y, v_u64_t special)
integer.
__v_cbrt(0x1.fffff403f0bc6p+1) got 0x1.965fe72821e9bp+0
want 0x1.965fe72821e99p+0. */
-VPCS_ATTR v_f64_t V_NAME (cbrt) (v_f64_t x)
+VPCS_ATTR float64x2_t V_NAME_D1 (cbrt) (float64x2_t x)
{
- v_u64_t ix = v_as_u64_f64 (x);
- v_u64_t iax = ix & AbsMask;
- v_u64_t ia12 = iax >> 52;
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
/* Subnormal, +/-0 and special values. */
- v_u64_t special = v_cond_u64 ((ia12 < TinyBound) | (ia12 >= BigBound));
+ uint32x2_t special
+ = vcge_u32 (vsubhn_u64 (iax, d->tiny_bound), vget_low_u32 (d->thresh));
/* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
version of frexp, which gets subnormal values wrong - these have to be
special-cased as a result. */
- v_f64_t m = v_as_f64_u64 (v_bsl_u64 (MantissaMask, iax, HalfExp));
- v_s64_t e = v_as_s64_u64 (iax >> 52) - 1022;
+ float64x2_t m = vbslq_f64 (MantissaMask, x, v_f64 (0.5));
+ int64x2_t exp_bias = d->exp_bias;
+ uint64x2_t ia12 = vshrq_n_u64 (iax, 52);
+ int64x2_t e = vsubq_s64 (vreinterpretq_s64_u64 (ia12), exp_bias);
/* Calculate rough approximation for cbrt(m) in [0.5, 1.0], starting point for
Newton iterations. */
- v_f64_t p_01 = v_fma_f64 (C (1), m, C (0));
- v_f64_t p_23 = v_fma_f64 (C (3), m, C (2));
- v_f64_t p = v_fma_f64 (m * m, p_23, p_01);
-
+ float64x2_t p = v_pairwise_poly_3_f64 (m, vmulq_f64 (m, m), d->poly);
+ float64x2_t one_third = d->one_third;
/* Two iterations of Newton's method for iteratively approximating cbrt. */
- v_f64_t m_by_3 = m / 3;
- v_f64_t a = v_fma_f64 (TwoThirds, p, m_by_3 / (p * p));
- a = v_fma_f64 (TwoThirds, a, m_by_3 / (a * a));
+ float64x2_t m_by_3 = vmulq_f64 (m, one_third);
+ float64x2_t two_thirds = vaddq_f64 (one_third, one_third);
+ float64x2_t a
+ = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (p, p)), two_thirds, p);
+ a = vfmaq_f64 (vdivq_f64 (m_by_3, vmulq_f64 (a, a)), two_thirds, a);
/* Assemble the result by the following:
@@ -76,23 +90,27 @@ VPCS_ATTR v_f64_t V_NAME (cbrt) (v_f64_t x)
cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
- v_s64_t ey = e / 3;
- v_f64_t my = a * T (v_as_u64_s64 (e % 3 + 2));
+ float64x2_t ef = vcvtq_f64_s64 (e);
+ float64x2_t eb3f = vrndnq_f64 (vmulq_f64 (ef, one_third));
+ int64x2_t em3 = vcvtq_s64_f64 (vfmsq_f64 (ef, eb3f, v_f64 (3)));
+ int64x2_t ey = vcvtq_s64_f64 (eb3f);
+
+ float64x2_t my = (float64x2_t){ d->table[em3[0] + 2], d->table[em3[1] + 2] };
+ my = vmulq_f64 (my, a);
/* Vector version of ldexp. */
- v_f64_t y = v_as_f64_u64 ((v_as_u64_s64 (ey + 1023) << 52)) * my;
- /* Copy sign. */
- y = v_as_f64_u64 (v_bsl_u64 (v_u64 (AbsMask), v_as_u64_f64 (y), ix));
+ float64x2_t y = vreinterpretq_f64_s64 (
+ vshlq_n_s64 (vaddq_s64 (ey, vaddq_s64 (exp_bias, v_s64 (1))), 52));
+ y = vmulq_f64 (y, my);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (x, vbslq_f64 (d->abs_mask, y, x), special);
- if (unlikely (v_any_u64 (special)))
- return specialcase (x, y, special);
- return y;
+ /* Copy sign. */
+ return vbslq_f64 (d->abs_mask, y, x);
}
-VPCS_ALIAS
-PL_TEST_ULP (V_NAME (cbrt), 1.30)
+PL_TEST_ULP (V_NAME_D1 (cbrt), 1.30)
PL_SIG (V, D, 1, cbrt, -10.0, 10.0)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrt))
-PL_TEST_INTERVAL (V_NAME (cbrt), 0, inf, 1000000)
-PL_TEST_INTERVAL (V_NAME (cbrt), -0, -inf, 1000000)
-#endif
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cbrt))
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (cbrt), 0, inf, 1000000)
diff --git a/pl/math/v_cbrtf_1u5.c b/pl/math/v_cbrtf_1u5.c
deleted file mode 100644
index 62fa37505834..000000000000
--- a/pl/math/v_cbrtf_1u5.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Single-precision vector cbrt(x) function.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "mathlib.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#if V_SUPPORTED
-
-#define AbsMask 0x7fffffff
-#define SignMask v_u32 (0x80000000)
-#define TwoThirds v_f32 (0x1.555556p-1f)
-#define SmallestNormal 0x00800000
-#define MantissaMask 0x007fffff
-#define HalfExp 0x3f000000
-
-#define C(i) v_f32 (__cbrtf_data.poly[i])
-#define T(i) v_lookup_f32 (__cbrtf_data.table, i)
-
-static NOINLINE v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
-{
- return v_call_f32 (cbrtf, x, y, special);
-}
-
-/* Approximation for vector single-precision cbrt(x) using Newton iteration with
- initial guess obtained by a low-order polynomial. Greatest error is 1.5 ULP.
- This is observed for every value where the mantissa is 0x1.81410e and the
- exponent is a multiple of 3, for example:
- __v_cbrtf(0x1.81410ep+30) got 0x1.255d96p+10
- want 0x1.255d92p+10. */
-VPCS_ATTR v_f32_t V_NAME (cbrtf) (v_f32_t x)
-{
- v_u32_t ix = v_as_u32_f32 (x);
- v_u32_t iax = ix & AbsMask;
-
- /* Subnormal, +/-0 and special values. */
- v_u32_t special = v_cond_u32 ((iax < SmallestNormal) | (iax >= 0x7f800000));
-
- /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
- version of frexpf, which gets subnormal values wrong - these have to be
- special-cased as a result. */
- v_f32_t m = v_as_f32_u32 ((iax & MantissaMask) | HalfExp);
- v_s32_t e = v_as_s32_u32 (iax >> 23) - 126;
-
- /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
- the less accurate the next stage of the algorithm needs to be. An order-4
- polynomial is enough for one Newton iteration. */
- v_f32_t p_01 = v_fma_f32 (C (1), m, C (0));
- v_f32_t p_23 = v_fma_f32 (C (3), m, C (2));
- v_f32_t p = v_fma_f32 (m * m, p_23, p_01);
-
- /* One iteration of Newton's method for iteratively approximating cbrt. */
- v_f32_t m_by_3 = m / 3;
- v_f32_t a = v_fma_f32 (TwoThirds, p, m_by_3 / (p * p));
-
- /* Assemble the result by the following:
-
- cbrt(x) = cbrt(m) * 2 ^ (e / 3).
-
- We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
- not necessarily a multiple of 3 we lose some information.
-
- Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
-
- Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which is
- an integer in [-2, 2], and can be looked up in the table T. Hence the
- result is assembled as:
-
- cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
-
- v_s32_t ey = e / 3;
- v_f32_t my = a * T (v_as_u32_s32 (e % 3 + 2));
-
- /* Vector version of ldexpf. */
- v_f32_t y = v_as_f32_u32 ((v_as_u32_s32 (ey + 127) << 23)) * my;
- /* Copy sign. */
- y = v_as_f32_u32 (v_bsl_u32 (SignMask, ix, v_as_u32_f32 (y)));
-
- if (unlikely (v_any_u32 (special)))
- return specialcase (x, y, special);
- return y;
-}
-VPCS_ALIAS
-
-PL_SIG (V, F, 1, cbrt, -10.0, 10.0)
-PL_TEST_ULP (V_NAME (cbrtf), 1.03)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cbrtf))
-PL_TEST_INTERVAL (V_NAME (cbrtf), 0, inf, 1000000)
-PL_TEST_INTERVAL (V_NAME (cbrtf), -0, -inf, 1000000)
-#endif
diff --git a/pl/math/v_cbrtf_1u7.c b/pl/math/v_cbrtf_1u7.c
new file mode 100644
index 000000000000..74918765209f
--- /dev/null
+++ b/pl/math/v_cbrtf_1u7.c
@@ -0,0 +1,116 @@
+/*
+ * Single-precision vector cbrt(x) function.
+ *
+ * Copyright (c) 2022-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_advsimd_f32.h"
+
+const static struct data
+{
+ float32x4_t poly[4], one_third;
+ float table[5];
+} data = {
+ .poly = { /* Very rough approximation of cbrt(x) in [0.5, 1], generated with
+ FPMinimax. */
+ V4 (0x1.c14e96p-2), V4 (0x1.dd2d3p-1), V4 (-0x1.08e81ap-1),
+ V4 (0x1.2c74c2p-3) },
+ .table = { /* table[i] = 2^((i - 2) / 3). */
+ 0x1.428a3p-1, 0x1.965feap-1, 0x1p0, 0x1.428a3p0, 0x1.965feap0 },
+ .one_third = V4 (0x1.555556p-2f),
+};
+
+#define SignMask v_u32 (0x80000000)
+#define SmallestNormal v_u32 (0x00800000)
+#define Thresh vdup_n_u16 (0x7f00) /* asuint(INFINITY) - SmallestNormal. */
+#define MantissaMask v_u32 (0x007fffff)
+#define HalfExp v_u32 (0x3f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint16x4_t special)
+{
+ return v_call_f32 (cbrtf, x, y, vmovl_u16 (special));
+}
+
+static inline float32x4_t
+shifted_lookup (const float *table, int32x4_t i)
+{
+ return (float32x4_t){ table[i[0] + 2], table[i[1] + 2], table[i[2] + 2],
+ table[i[3] + 2] };
+}
+
+/* Approximation for vector single-precision cbrt(x) using Newton iteration
+ with initial guess obtained by a low-order polynomial. Greatest error
+ is 1.64 ULP. This is observed for every value where the mantissa is
+ 0x1.85a2aa and the exponent is a multiple of 3, for example:
+ _ZGVnN4v_cbrtf(0x1.85a2aap+3) got 0x1.267936p+1
+ want 0x1.267932p+1. */
+VPCS_ATTR float32x4_t V_NAME_F1 (cbrt) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
+ /* Subnormal, +/-0 and special values. */
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (iax, SmallestNormal), Thresh);
+
+ /* Decompose |x| into m * 2^e, where m is in [0.5, 1.0]. This is a vector
+ version of frexpf, which gets subnormal values wrong - these have to be
+ special-cased as a result. */
+ float32x4_t m = vbslq_f32 (MantissaMask, x, v_f32 (0.5));
+ int32x4_t e
+ = vsubq_s32 (vreinterpretq_s32_u32 (vshrq_n_u32 (iax, 23)), v_s32 (126));
+
+ /* p is a rough approximation for cbrt(m) in [0.5, 1.0]. The better this is,
+ the less accurate the next stage of the algorithm needs to be. A degree-3
+ polynomial (4 coefficients) is enough for one Newton iteration. */
+ float32x4_t p = v_pairwise_poly_3_f32 (m, vmulq_f32 (m, m), d->poly);
+
+ float32x4_t one_third = d->one_third;
+ float32x4_t two_thirds = vaddq_f32 (one_third, one_third);
+
+ /* One iteration of Newton's method for iteratively approximating cbrt. */
+ float32x4_t m_by_3 = vmulq_f32 (m, one_third);
+ float32x4_t a
+ = vfmaq_f32 (vdivq_f32 (m_by_3, vmulq_f32 (p, p)), two_thirds, p);
+
+ /* Assemble the result by the following:
+
+ cbrt(x) = cbrt(m) * 2 ^ (e / 3).
+
+ We can get 2 ^ round(e / 3) using ldexp and integer divide, but since e is
+ not necessarily a multiple of 3 we lose some information.
+
+ Let q = 2 ^ round(e / 3), then t = 2 ^ (e / 3) / q.
+
+ Then we know t = 2 ^ (i / 3), where i is the remainder from e / 3, which
+ is an integer in [-2, 2], and can be looked up in d->table. Hence the
+ result is assembled as:
+
+ cbrt(x) = cbrt(m) * t * 2 ^ round(e / 3) * sign. */
+ float32x4_t ef = vmulq_f32 (vcvtq_f32_s32 (e), one_third);
+ int32x4_t ey = vcvtq_s32_f32 (ef);
+ int32x4_t em3 = vsubq_s32 (e, vmulq_s32 (ey, v_s32 (3)));
+
+ float32x4_t my = shifted_lookup (d->table, em3);
+ my = vmulq_f32 (my, a);
+
+ /* Vector version of ldexpf. */
+ float32x4_t y
+ = vreinterpretq_f32_s32 (vshlq_n_s32 (vaddq_s32 (ey, v_s32 (127)), 23));
+ y = vmulq_f32 (y, my);
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, vbslq_f32 (SignMask, x, y), special);
+
+ /* Copy sign. */
+ return vbslq_f32 (SignMask, x, y);
+}
+
+PL_SIG (V, F, 1, cbrt, -10.0, 10.0)
+PL_TEST_ULP (V_NAME_F1 (cbrt), 1.15)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (cbrt))
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (cbrt), 0, inf, 1000000)
diff --git a/pl/math/v_cexpi_3u5.c b/pl/math/v_cexpi_3u5.c
new file mode 100644
index 000000000000..5163b15926b8
--- /dev/null
+++ b/pl/math/v_cexpi_3u5.c
@@ -0,0 +1,45 @@
+/*
+ * Double-precision vector sincos function - return-by-value interface.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_sincos_common.h"
+#include "v_math.h"
+#include "pl_test.h"
+
+static float64x2x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, uint64x2_t special, float64x2x2_t y)
+{
+ return (float64x2x2_t){ v_call_f64 (sin, x, y.val[0], special),
+ v_call_f64 (cos, x, y.val[1], special) };
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate polynomials.
+ Largest observed error is for sin, 3.22 ULP:
+ v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+ want -0x1.ffe9537d5dbb4p-3. */
+VPCS_ATTR float64x2x2_t
+_ZGVnN2v_cexpi (float64x2_t x)
+{
+ const struct v_sincos_data *d = ptr_barrier (&v_sincos_data);
+ uint64x2_t special = check_ge_rangeval (x, d);
+
+ float64x2x2_t sc = v_sincos_inline (x, d);
+
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, special, sc);
+ return sc;
+}
+
+PL_TEST_ULP (_ZGVnN2v_cexpi_sin, 2.73)
+PL_TEST_ULP (_ZGVnN2v_cexpi_cos, 2.73)
+#define V_CEXPI_INTERVAL(lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVnN2v_cexpi_sin, lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVnN2v_cexpi_cos, lo, hi, n)
+V_CEXPI_INTERVAL (0, 0x1p23, 500000)
+V_CEXPI_INTERVAL (-0, -0x1p23, 500000)
+V_CEXPI_INTERVAL (0x1p23, inf, 10000)
+V_CEXPI_INTERVAL (-0x1p23, -inf, 10000)
diff --git a/pl/math/v_cexpif_1u8.c b/pl/math/v_cexpif_1u8.c
new file mode 100644
index 000000000000..4897018d3090
--- /dev/null
+++ b/pl/math/v_cexpif_1u8.c
@@ -0,0 +1,47 @@
+/*
+ * Single-precision vector cexpi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_sincosf_common.h"
+#include "v_math.h"
+#include "pl_test.h"
+
+static float32x4x2_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, uint32x4_t special, float32x4x2_t y)
+{
+ return (float32x4x2_t){ v_call_f32 (sinf, x, y.val[0], special),
+ v_call_f32 (cosf, x, y.val[1], special) };
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate low-order
+ polynomials.
+ Worst-case error for sin is 1.67 ULP:
+ v_cexpif_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+ Worst-case error for cos is 1.81 ULP:
+ v_cexpif_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */
+VPCS_ATTR float32x4x2_t
+_ZGVnN4v_cexpif (float32x4_t x)
+{
+ const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data);
+ uint32x4_t special = check_ge_rangeval (x, d);
+
+ float32x4x2_t sc = v_sincosf_inline (x, d);
+
+ if (unlikely (v_any_u32 (special)))
+ return special_case (x, special, sc);
+ return sc;
+}
+
+PL_TEST_ULP (_ZGVnN4v_cexpif_sin, 1.17)
+PL_TEST_ULP (_ZGVnN4v_cexpif_cos, 1.31)
+#define V_CEXPIF_INTERVAL(lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVnN4v_cexpif_sin, lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVnN4v_cexpif_cos, lo, hi, n)
+V_CEXPIF_INTERVAL (0, 0x1p20, 500000)
+V_CEXPIF_INTERVAL (-0, -0x1p20, 500000)
+V_CEXPIF_INTERVAL (0x1p20, inf, 10000)
+V_CEXPIF_INTERVAL (-0x1p20, -inf, 10000)
diff --git a/pl/math/v_cosh_2u.c b/pl/math/v_cosh_2u.c
index 0a9fbf817a10..649c390f4622 100644
--- a/pl/math/v_cosh_2u.c
+++ b/pl/math/v_cosh_2u.c
@@ -8,89 +8,97 @@
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
-#include "v_exp_tail.h"
-
-#define C1 v_f64 (C1_scal)
-#define C2 v_f64 (C2_scal)
-#define C3 v_f64 (C3_scal)
-#define InvLn2 v_f64 (InvLn2_scal)
-#define Ln2hi v_f64 (Ln2hi_scal)
-#define Ln2lo v_f64 (Ln2lo_scal)
-#define IndexMask v_u64 (IndexMask_scal)
-#define Shift v_f64 (Shift_scal)
-#define Thres v_f64 (Thres_scal)
-
-#define AbsMask 0x7fffffffffffffff
-#define Half v_f64 (0.5)
-#define SpecialBound \
- 0x4086000000000000 /* 0x1.6p9, above which exp overflows. */
-
-#if V_SUPPORTED
-
-static inline v_f64_t
-exp_inline (v_f64_t x)
+
+static const struct data
+{
+ float64x2_t poly[3];
+ float64x2_t inv_ln2, ln2, shift, thres;
+ uint64x2_t index_mask, special_bound;
+} data = {
+ .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
+ V2 (0x1.5555576a59599p-5), },
+
+ .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */
+ /* -ln2/N. */
+ .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64},
+ .shift = V2 (0x1.8p+52),
+ .thres = V2 (704.0),
+
+ .index_mask = V2 (0xff),
+ /* 0x1.6p9, above which exp overflows. */
+ .special_bound = V2 (0x4086000000000000),
+};
+
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (cosh, x, y, special);
+}
+
+/* Helper for approximating exp(x). Copied from v_exp_tail, with no
+ special-case handling or tail. */
+static inline float64x2_t
+exp_inline (float64x2_t x)
{
- /* Helper for approximating exp(x). Copied from v_exp_tail, with no
- special-case handling or tail. */
+ const struct data *d = ptr_barrier (&data);
/* n = round(x/(ln2/N)). */
- v_f64_t z = v_fma_f64 (x, InvLn2, Shift);
- v_u64_t u = v_as_u64_f64 (z);
- v_f64_t n = z - Shift;
+ float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
/* r = x - n*ln2/N. */
- v_f64_t r = x;
- r = v_fma_f64 (-Ln2hi, n, r);
- r = v_fma_f64 (-Ln2lo, n, r);
+ float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0);
+ r = vfmaq_laneq_f64 (r, n, d->ln2, 1);
- v_u64_t e = u << (52 - V_EXP_TAIL_TABLE_BITS);
- v_u64_t i = u & IndexMask;
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
+ uint64x2_t i = vandq_u64 (u, d->index_mask);
/* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
- v_f64_t y = v_fma_f64 (C3, r, C2);
- y = v_fma_f64 (y, r, C1);
- y = v_fma_f64 (y, r, v_f64 (1)) * r;
+ float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r);
+ y = vfmaq_f64 (d->poly[0], y, r);
+ y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r);
/* s = 2^(n/N). */
- u = v_lookup_u64 (Tab, i);
- v_f64_t s = v_as_f64_u64 (u + e);
+ u = v_lookup_u64 (__v_exp_tail_data, i);
+ float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
- return v_fma_f64 (y, s, s);
+ return vfmaq_f64 (s, y, s);
}
/* Approximation for vector double-precision cosh(x) using exp_inline.
cosh(x) = (exp(x) + exp(-x)) / 2.
- The greatest observed error is in the scalar fall-back region, so is the same
- as the scalar routine, 1.93 ULP:
- __v_cosh(0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021
- want 0x1.fdf28623ef923p+1021.
+ The greatest observed error is in the scalar fall-back region, so is the
+ same as the scalar routine, 1.93 ULP:
+ _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021
+ want 0x1.fdf28623ef923p+1021.
The greatest observed error in the non-special region is 1.54 ULP:
- __v_cosh(0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7
- want 0x1.f711dcb0c77b1p+7. */
-VPCS_ATTR v_f64_t V_NAME (cosh) (v_f64_t x)
+ _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7
+ want 0x1.f711dcb0c77b1p+7. */
+float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x)
{
- v_u64_t ix = v_as_u64_f64 (x);
- v_u64_t iax = ix & AbsMask;
- v_u64_t special = v_cond_u64 (iax > SpecialBound);
+ const struct data *d = ptr_barrier (&data);
- /* If any inputs are special, fall back to scalar for all lanes. */
- if (unlikely (v_any_u64 (special)))
- return v_call_f64 (cosh, x, x, v_u64 (-1));
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t special
+ = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound);
- v_f64_t ax = v_as_f64_u64 (iax);
/* Up to the point that exp overflows, we can use it to calculate cosh by
exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
- v_f64_t t = exp_inline (ax);
- return t * Half + Half / t;
+ float64x2_t t = exp_inline (ax);
+ float64x2_t half_t = vmulq_n_f64 (t, 0.5);
+ float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t);
+
+ /* Fall back to scalar for any special cases. */
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x, vaddq_f64 (half_t, half_over_t), special);
+
+ return vaddq_f64 (half_t, half_over_t);
}
-VPCS_ALIAS
PL_SIG (V, D, 1, cosh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME (cosh), 1.43)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (cosh))
-PL_TEST_INTERVAL (V_NAME (cosh), 0, 0x1.6p9, 100000)
-PL_TEST_INTERVAL (V_NAME (cosh), -0, -0x1.6p9, 100000)
-PL_TEST_INTERVAL (V_NAME (cosh), 0x1.6p9, inf, 1000)
-PL_TEST_INTERVAL (V_NAME (cosh), -0x1.6p9, -inf, 1000)
-#endif
+PL_TEST_ULP (V_NAME_D1 (cosh), 1.43)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (cosh))
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0, 0x1.6p9, 100000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (cosh), 0x1.6p9, inf, 1000)
diff --git a/pl/math/v_coshf_2u4.c b/pl/math/v_coshf_2u4.c
index 1422d4d12b31..c622b0b183f1 100644
--- a/pl/math/v_coshf_2u4.c
+++ b/pl/math/v_coshf_2u4.c
@@ -5,70 +5,76 @@
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
+#include "v_expf_inline.h"
#include "v_math.h"
#include "mathlib.h"
#include "pl_sig.h"
#include "pl_test.h"
-#define AbsMask 0x7fffffff
-#define TinyBound 0x20000000 /* 0x1p-63: Round to 1 below this. */
-#define SpecialBound \
- 0x42ad496c /* 0x1.5a92d8p+6: expf overflows above this, so have to use \
- special case. */
-#define Half v_f32 (0.5)
-
-#if V_SUPPORTED
+static const struct data
+{
+ struct v_expf_data expf_consts;
+ uint32x4_t tiny_bound, special_bound;
+} data = {
+ .expf_consts = V_EXPF_DATA,
+ .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
+ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .special_bound = V4 (0x42ad496c),
+};
-v_f32_t V_NAME (expf) (v_f32_t);
+#if !WANT_SIMD_EXCEPT
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (coshf, x, y, special);
+}
+#endif
/* Single-precision vector cosh, using vector expf.
Maximum error is 2.38 ULP:
- __v_coshf(0x1.e8001ep+1) got 0x1.6a491ep+4 want 0x1.6a4922p+4. */
-VPCS_ATTR v_f32_t V_NAME (coshf) (v_f32_t x)
+ _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4
+ want 0x1.6a4922p+4. */
+float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
{
- v_u32_t ix = v_as_u32_f32 (x);
- v_u32_t iax = ix & AbsMask;
- v_f32_t ax = v_as_f32_u32 (iax);
- v_u32_t special = v_cond_u32 (iax >= SpecialBound);
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered correctly, fall back to the scalar
variant for all inputs if any input is a special value or above the bound
- at which expf overflows. */
+ at which expf overflows. */
if (unlikely (v_any_u32 (special)))
return v_call_f32 (coshf, x, x, v_u32 (-1));
- v_u32_t tiny = v_cond_u32 (iax <= TinyBound);
+ uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound);
/* If any input is tiny, avoid underflow exception by fixing tiny lanes of
- input to 1, which will generate no exceptions, and then also fixing tiny
- lanes of output to 1 just before return. */
+ input to 0, which will generate no exceptions. */
if (unlikely (v_any_u32 (tiny)))
- ax = v_sel_f32 (tiny, v_f32 (1), ax);
+ ax = v_zerofy_f32 (ax, tiny);
#endif
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
- v_f32_t t = V_NAME (expf) (ax);
- v_f32_t y = t * Half + Half / t;
+ float32x4_t t = v_expf_inline (ax, &d->expf_consts);
+ float32x4_t half_t = vmulq_n_f32 (t, 0.5);
+ float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
#if WANT_SIMD_EXCEPT
if (unlikely (v_any_u32 (tiny)))
- return v_sel_f32 (tiny, v_f32 (1), y);
+ return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
#else
if (unlikely (v_any_u32 (special)))
- return v_call_f32 (coshf, x, y, special);
+ return special_case (x, vaddq_f32 (half_t, half_over_t), special);
#endif
- return y;
+ return vaddq_f32 (half_t, half_over_t);
}
-VPCS_ALIAS
PL_SIG (V, F, 1, cosh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME (coshf), 1.89)
-PL_TEST_EXPECT_FENV (V_NAME (coshf), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1p-63, 100)
-PL_TEST_INTERVAL (V_NAME (coshf), 0, 0x1.5a92d8p+6, 80000)
-PL_TEST_INTERVAL (V_NAME (coshf), 0x1.5a92d8p+6, inf, 2000)
-PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1p-63, 100)
-PL_TEST_INTERVAL (V_NAME (coshf), -0, -0x1.5a92d8p+6, 80000)
-PL_TEST_INTERVAL (V_NAME (coshf), -0x1.5a92d8p+6, -inf, 2000)
-#endif
+PL_TEST_ULP (V_NAME_F1 (cosh), 1.89)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (cosh), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1p-63, 100)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0, 0x1.5a92d8p+6, 80000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (cosh), 0x1.5a92d8p+6, inf, 2000)
diff --git a/pl/math/v_cospi_3u1.c b/pl/math/v_cospi_3u1.c
new file mode 100644
index 000000000000..3c2ee0b74c8e
--- /dev/null
+++ b/pl/math/v_cospi_3u1.c
@@ -0,0 +1,86 @@
+/*
+ * Double-precision vector cospi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float64x2_t poly[10];
+ float64x2_t range_val;
+} data = {
+ /* Polynomial coefficients generated using Remez algorithm,
+ see sinpi.sollya for details. */
+ .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
+ V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
+ V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
+ V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
+ V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
+ .range_val = V2 (0x1p63),
+};
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (cospi, x, y, cmp);
+}
+
+/* Approximation for vector double-precision cospi(x).
+ Maximum Error 3.06 ULP:
+ _ZGVnN2v_cospi(0x1.7dd4c0b03cc66p-5) got 0x1.fa854babfb6bep-1
+ want 0x1.fa854babfb6c1p-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ float64x2_t r = vabsq_f64 (x);
+ uint64x2_t cmp = vcaleq_f64 (v_f64 (0x1p64), x);
+
+ /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd
+ to avoid them overflowing and throwing exceptions. */
+ r = v_zerofy_f64 (r, cmp);
+ uint64x2_t odd = vshlq_n_u64 (vcvtnq_u64_f64 (r), 63);
+
+#else
+ float64x2_t r = x;
+ uint64x2_t cmp = vcageq_f64 (r, d->range_val);
+ uint64x2_t odd
+ = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
+
+#endif
+
+ r = vsubq_f64 (r, vrndaq_f64 (r));
+
+ /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
+ r = vsubq_f64 (v_f64 (0.5), vabsq_f64 (r));
+
+ /* y = sin(pi * r). */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t r4 = vmulq_f64 (r2, r2);
+ float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
+
+ /* Fallback to scalar. */
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+
+ /* Reintroduce the sign bit for inputs which round to odd. */
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+PL_SIG (V, D, 1, cospi, -0.9, 0.9)
+PL_TEST_ULP (V_NAME_D1 (cospi), 2.56)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (cospi), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0, 0x1p-63, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p-63, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0.5, 0x1p51, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (cospi), 0x1p51, inf, 10000)
diff --git a/pl/math/v_cospif_3u2.c b/pl/math/v_cospif_3u2.c
new file mode 100644
index 000000000000..d88aa828439d
--- /dev/null
+++ b/pl/math/v_cospif_3u2.c
@@ -0,0 +1,83 @@
+/*
+ * Single-precision vector cospi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float32x4_t poly[6];
+ float32x4_t range_val;
+} data = {
+ /* Taylor series coefficients for sin(pi * x). */
+ .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+ V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+ .range_val = V4 (0x1p31f),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (cospif, x, y, cmp);
+}
+
+/* Approximation for vector single-precision cospi(x)
+ Maximum Error: 3.17 ULP:
+ _ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1
+ want 0x1.f7cd5p-1. */
+float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ float32x4_t r = vabsq_f32 (x);
+ uint32x4_t cmp = vcaleq_f32 (v_f32 (0x1p32f), x);
+
+ /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd
+ to avoid them overflowing and throwing exceptions. */
+ r = v_zerofy_f32 (r, cmp);
+ uint32x4_t odd = vshlq_n_u32 (vcvtnq_u32_f32 (r), 31);
+
+#else
+ float32x4_t r = x;
+ uint32x4_t cmp = vcageq_f32 (r, d->range_val);
+
+ uint32x4_t odd
+ = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
+
+#endif
+
+ /* r = x - rint(x). */
+ r = vsubq_f32 (r, vrndaq_f32 (r));
+
+ /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */
+ r = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (r));
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t r4 = vmulq_f32 (r2, r2);
+ float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r);
+
+ /* Fallback to scalar. */
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+
+ /* Reintroduce the sign bit for inputs which round to odd. */
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
+
+PL_SIG (V, F, 1, cospi, -0.9, 0.9)
+PL_TEST_ULP (V_NAME_F1 (cospi), 2.67)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (cospi), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0, 0x1p-31, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p-31, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0.5, 0x1p32f, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (cospi), 0x1p32f, inf, 10000)
diff --git a/pl/math/v_erf_2u.c b/pl/math/v_erf_2u.c
deleted file mode 100644
index 1d7ddbb1ee3e..000000000000
--- a/pl/math/v_erf_2u.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Double-precision vector erf(x) function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "include/mathlib.h"
-#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#if V_SUPPORTED
-
-#define AbsMask v_u64 (0x7fffffffffffffff)
-#define AbsXMax v_f64 (0x1.8p+2)
-#define Scale v_f64 (0x1p+3)
-
-/* Special cases (fall back to scalar calls). */
-VPCS_ATTR
-NOINLINE static v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
-{
- return v_call_f64 (erf, x, y, cmp);
-}
-
-/* A structure to perform look-up in coeffs and other parameter tables. */
-struct entry
-{
- v_f64_t P[V_ERF_NCOEFFS];
- v_f64_t shift;
-};
-
-static inline struct entry
-lookup (v_u64_t i)
-{
- struct entry e;
-#ifdef SCALAR
- for (int j = 0; j < V_ERF_NCOEFFS; ++j)
- e.P[j] = __v_erf_data.coeffs[j][i];
- e.shift = __v_erf_data.shifts[i];
-#else
- for (int j = 0; j < V_ERF_NCOEFFS; ++j)
- {
- e.P[j][0] = __v_erf_data.coeffs[j][i[0]];
- e.P[j][1] = __v_erf_data.coeffs[j][i[1]];
- }
- e.shift[0] = __v_erf_data.shifts[i[0]];
- e.shift[1] = __v_erf_data.shifts[i[1]];
-#endif
- return e;
-}
-
-/* Optimized double precision vector error function erf. Maximum
- observed error is 1.75 ULP, in [0.110, 0.111]:
- verf(0x1.c5e0c2d5d0543p-4) got 0x1.fe0ed62a54987p-4
- want 0x1.fe0ed62a54985p-4. */
-VPCS_ATTR
-v_f64_t V_NAME (erf) (v_f64_t x)
-{
- /* Handle both inf/nan as well as small values (|x|<2^-28)
- If any condition in the lane is true then a loop over
- scalar calls will be performed. */
- v_u64_t ix = v_as_u64_f64 (x);
- v_u64_t atop = (ix >> 48) & v_u64 (0x7fff);
- v_u64_t special_case
- = v_cond_u64 (atop - v_u64 (0x3e30) >= v_u64 (0x7ff0 - 0x3e30));
-
- /* Get sign and absolute value. */
- v_u64_t sign = v_as_u64_f64 (x) & ~AbsMask;
- v_f64_t a = v_min_f64 (v_abs_f64 (x), AbsXMax);
-
- /* Compute index by truncating 8 * a with a=|x| saturated to 6.0. */
-
-#ifdef SCALAR
- v_u64_t i = v_trunc_u64 (a * Scale);
-#else
- v_u64_t i = vcvtq_n_u64_f64 (a, 3);
-#endif
- /* Get polynomial coefficients and shift parameter using lookup. */
- struct entry dat = lookup (i);
-
- /* Evaluate polynomial on transformed argument. */
- v_f64_t z = v_fma_f64 (a, Scale, dat.shift);
-
- v_f64_t r1 = v_fma_f64 (z, dat.P[1], dat.P[0]);
- v_f64_t r2 = v_fma_f64 (z, dat.P[3], dat.P[2]);
- v_f64_t r3 = v_fma_f64 (z, dat.P[5], dat.P[4]);
- v_f64_t r4 = v_fma_f64 (z, dat.P[7], dat.P[6]);
- v_f64_t r5 = v_fma_f64 (z, dat.P[9], dat.P[8]);
-
- v_f64_t z2 = z * z;
- v_f64_t y = v_fma_f64 (z2, r5, r4);
- y = v_fma_f64 (z2, y, r3);
- y = v_fma_f64 (z2, y, r2);
- y = v_fma_f64 (z2, y, r1);
-
- /* y=erf(x) if x>0, -erf(-x) otherwise. */
- y = v_as_f64_u64 (v_as_u64_f64 (y) ^ sign);
-
- if (unlikely (v_any_u64 (special_case)))
- return specialcase (x, y, special_case);
- return y;
-}
-VPCS_ALIAS
-
-PL_SIG (V, D, 1, erf, -6.0, 6.0)
-PL_TEST_ULP (V_NAME (erf), 1.26)
-PL_TEST_INTERVAL (V_NAME (erf), 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (V_NAME (erf), 0x1p-127, 0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME (erf), -0x1p-127, -0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME (erf), 0x1p-26, 0x1p3, 40000)
-PL_TEST_INTERVAL (V_NAME (erf), -0x1p-26, -0x1p3, 40000)
-PL_TEST_INTERVAL (V_NAME (erf), 0, inf, 40000)
-#endif
diff --git a/pl/math/v_erf_2u5.c b/pl/math/v_erf_2u5.c
new file mode 100644
index 000000000000..e581ec5bb8a7
--- /dev/null
+++ b/pl/math/v_erf_2u5.c
@@ -0,0 +1,158 @@
+/*
+ * Double-precision vector erf(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float64x2_t third;
+ float64x2_t tenth, two_over_five, two_over_fifteen;
+ float64x2_t two_over_nine, two_over_fortyfive;
+ float64x2_t max, shift;
+#if WANT_SIMD_EXCEPT
+ float64x2_t tiny_bound, huge_bound, scale_minus_one;
+#endif
+} data = {
+ .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
+ .two_over_fifteen = V2 (0x1.1111111111111p-3),
+ .tenth = V2 (-0x1.999999999999ap-4),
+ .two_over_five = V2 (-0x1.999999999999ap-2),
+ .two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
+ .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5),
+ .max = V2 (5.9921875), /* 6 - 1/128. */
+ .shift = V2 (0x1p45),
+#if WANT_SIMD_EXCEPT
+ .huge_bound = V2 (0x1p205),
+ .tiny_bound = V2 (0x1p-226),
+ .scale_minus_one = V2 (0x1.06eba8214db69p-3), /* 2/sqrt(pi) - 1.0. */
+#endif
+};
+
+#define AbsMask 0x7fffffffffffffff
+
+struct entry
+{
+ float64x2_t erf;
+ float64x2_t scale;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ struct entry e;
+ float64x2_t e1 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[0])),
+ e2 = vld1q_f64 ((float64_t *) (__erf_data.tab + i[1]));
+ e.erf = vuzp1q_f64 (e1, e2);
+ e.scale = vuzp2q_f64 (e1, e2);
+ return e;
+}
+
+/* Double-precision implementation of vector erf(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erf(x) ~ erf(r) + scale * d * [
+ + 1
+ - r d
+ + 1/3 (2 r^2 - 1) d^2
+ - 1/6 (r (2 r^2 - 3)) d^3
+ + 1/30 (4 r^4 - 12 r^2 + 3) d^4
+ - 1/90 (r (4 r^4 - 20 r^2 + 15)) d^5
+ ]
+
+ Maximum measured error: 2.29 ULP
+ V_NAME_D1 (erf)(-0x1.00003c924e5d1p-8) got -0x1.20dd59132ebadp-8
+ want -0x1.20dd59132ebafp-8. */
+float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+ float64x2_t a = vabsq_f64 (x);
+ /* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
+ to return expected results. */
+ uint64x2_t a_le_max = vcleq_f64 (a, dat->max);
+ uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max);
+
+#if WANT_SIMD_EXCEPT
+ /* |x| huge or tiny. */
+ uint64x2_t cmp1 = vcgtq_f64 (a, dat->huge_bound);
+ uint64x2_t cmp2 = vcltq_f64 (a, dat->tiny_bound);
+ uint64x2_t cmp = vorrq_u64 (cmp1, cmp2);
+ /* If any lanes are special, mask them with 1 for small x or 8 for large
+ values and retain a copy of a to allow special case handler to fix special
+ lanes later. This is only necessary if fenv exceptions are to be triggered
+ correctly. */
+ if (unlikely (v_any_u64 (cmp)))
+ {
+ a = vbslq_f64 (cmp1, v_f64 (8.0), a);
+ a = vbslq_f64 (cmp2, v_f64 (1.0), a);
+ }
+#endif
+
+ /* Set r to multiple of 1/128 nearest to |x|. */
+ float64x2_t shift = dat->shift;
+ float64x2_t z = vaddq_f64 (a, shift);
+
+ /* Lookup erf(r) and scale(r) in table, without shortcut for small values,
+ but with saturated indices for large values and NaNs in order to avoid
+ segfault. */
+ uint64x2_t i
+ = vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift));
+ i = vbslq_u64 (a_le_max, i, v_u64 (768));
+ struct entry e = lookup (i);
+
+ float64x2_t r = vsubq_f64 (z, shift);
+
+ /* erf(x) ~ erf(r) + scale * d * poly (r, d). */
+ float64x2_t d = vsubq_f64 (a, r);
+ float64x2_t d2 = vmulq_f64 (d, d);
+ float64x2_t r2 = vmulq_f64 (r, r);
+
+ /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
+ float64x2_t p1 = r;
+ float64x2_t p2
+ = vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third));
+ float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third));
+ float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen);
+ p4 = vfmsq_f64 (dat->tenth, r2, p4);
+ float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive);
+ p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5));
+
+ float64x2_t p34 = vfmaq_f64 (p3, d, p4);
+ float64x2_t p12 = vfmaq_f64 (p1, d, p2);
+ float64x2_t y = vfmaq_f64 (p34, d2, p5);
+ y = vfmaq_f64 (p12, d2, y);
+
+ y = vfmaq_f64 (e.erf, e.scale, vfmsq_f64 (d, d2, y));
+
+ /* Solves the |x| = inf and NaN cases. */
+ y = vbslq_f64 (a_gt_max, v_f64 (1.0), y);
+
+ /* Copy sign. */
+ y = vbslq_f64 (v_u64 (AbsMask), y, x);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u64 (cmp2)))
+ {
+ /* Neutralise huge values of x before fixing small values. */
+ x = vbslq_f64 (cmp1, v_f64 (1.0), x);
+ /* Fix tiny values that trigger spurious underflow. */
+ return vbslq_f64 (cmp2, vfmaq_f64 (x, dat->scale_minus_one, x), y);
+ }
+#endif
+ return y;
+}
+
+PL_SIG (V, D, 1, erf, -6.0, 6.0)
+PL_TEST_ULP (V_NAME_D1 (erf), 1.79)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (erf), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, 5.9921875, 40000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 5.9921875, inf, 40000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (erf), 0, inf, 40000)
diff --git a/pl/math/v_erf_data.c b/pl/math/v_erf_data.c
deleted file mode 100644
index 7bbb281ad912..000000000000
--- a/pl/math/v_erf_data.c
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Polynomial coefficients and shifts for double-precision erf(x) vector
- * function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-/* 48 intervals of the form [x_i, x_{i+1}] with x_i = i / 8 for
- i=1,...,47 (x_0 = 2^-1022). There is an extra dummy interval for
- [6, +inf] with all coeffs = 0 except for P_0 = 1.0, as erf(x) == 1
- above 6.
-
- Coefficients for each interval generated using fpminimax algorithm. See
- v_erf.sollya for details. Note the array is transposed, so for a set of
- coefficients C generated on interval i, C[j] is at coeffs[j][i]. */
-
-const struct v_erf_data __v_erf_data
- = {.shifts
- = {-0x1p-1019, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12,
- -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25,
- -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38,
- -39, -40, -41, -42, -43, -44, -45, -46, -47, 0},
- .coeffs = {
- // clang-format off
-
-{0x1.20dd750429b6dp-1022, 0x1.1f5e1a35c3b8ap-3, 0x1.1af54e232d609p-2, 0x1.9dd0d2b721f39p-2, 0x1.0a7ef5c18edd2p-1, 0x1.3f196dcd0f135p-1,
- 0x1.6c1c9759d0e5fp-1, 0x1.91724951b8fc6p-1, 0x1.af767a741088bp-1, 0x1.c6dad2829ec62p-1, 0x1.d8865d98abe00p-1, 0x1.e5768c3b4a3fcp-1,
- 0x1.eea5557137ae0p-1, 0x1.f4f693b67bd77p-1, 0x1.f92d077f8d56dp-1, 0x1.fbe61eef4cf6ap-1, 0x1.fd9ae142795e3p-1, 0x1.fea4218d6594ap-1,
- 0x1.ff404760319b4p-1, 0x1.ff9960f3eb327p-1, 0x1.ffcaa8f4c9beap-1, 0x1.ffe514bbdc197p-1, 0x1.fff2cfb0453d9p-1, 0x1.fff9ba420e834p-1,
- 0x1.fffd1ac4135f9p-1, 0x1.fffeb3ebb267bp-1, 0x1.ffff6f9f67e55p-1, 0x1.ffffc316d9ed0p-1, 0x1.ffffe710d565ep-1, 0x1.fffff618c3da6p-1,
- 0x1.fffffc2f171e3p-1, 0x1.fffffe92ced93p-1, 0x1.ffffff7b91176p-1, 0x1.ffffffd169d0cp-1, 0x1.fffffff01a8b6p-1, 0x1.fffffffabd229p-1,
- 0x1.fffffffe4fa30p-1, 0x1.ffffffff79626p-1, 0x1.ffffffffd759dp-1, 0x1.fffffffff4188p-1, 0x1.fffffffffc9e8p-1, 0x1.ffffffffff11ap-1,
- 0x1.ffffffffffc05p-1, 0x1.ffffffffffef8p-1, 0x1.fffffffffffbep-1, 0x1.ffffffffffff0p-1, 0x1.ffffffffffffcp-1, 0x1.fffffffffffffp-1, 1.0},
-
-{0x1.20dd750429b6dp-3, 0x1.1c62fa1e86989p-3, 0x1.0f5d1602f7dfbp-3, 0x1.f5f0cdaf152b2p-4, 0x1.c1efca49a5051p-4, 0x1.86e9694134b22p-4,
- 0x1.492e42d78d39cp-4, 0x1.0cab61f084b1bp-4, 0x1.a911f096fbb79p-5, 0x1.45e99bcbb78d4p-5, 0x1.e4652fadcbaa3p-6, 0x1.5ce595c455bccp-6,
- 0x1.e723726b81ff1p-7, 0x1.499d478bca4acp-7, 0x1.b055303221566p-8, 0x1.12ceb37ffa389p-8, 0x1.529b9e8cfa59fp-9, 0x1.94624e78e084fp-10,
- 0x1.d4143a9e023f5p-11, 0x1.06918b63537c2p-11, 0x1.1d83170fcc34bp-12, 0x1.2ce898808f08ep-13, 0x1.3360ccd26e06ap-14, 0x1.30538fbb986fbp-15,
- 0x1.2408e9bb1b657p-16, 0x1.0f9e1b4e4baaep-17, 0x1.e9b5e8d71b5e3p-19, 0x1.abe09e85af38ap-20, 0x1.6a5972347c568p-21, 0x1.296a70eff1bd9p-22,
- 0x1.d9371ee6bfc07p-24, 0x1.6ce1a88a01b3ap-25, 0x1.10b14985663f9p-26, 0x1.8b0d07ade43d8p-28, 0x1.155a098eceb0fp-29, 0x1.7974d3b397e7cp-31,
- 0x1.f1e3bf5a6493ap-33, 0x1.3e47781d91b97p-34, 0x1.8a7038368986cp-36, 0x1.d9d4d7be5992cp-38, 0x1.137dabebc1319p-39, 0x1.367541123e46cp-41,
- 0x1.58007ab162c1dp-43, 0x1.709f0d280b3f5p-45, 0x1.30a3dcf531ebfp-47, 0x1.d2707c055dedcp-50, 0x1.0d97f61945387p-49, 0x1.1dbc3ab728933p-50, 0},
-
-{0x1.2411381609db0p-51, -0x1.1c62fa1e75c0ap-9, -0x1.0f5d1602eb436p-8, -0x1.78749a4346714p-8, -0x1.c1efca49a7b15p-8, -0x1.e8a3c39178d95p-8,
- -0x1.edc5644363883p-8, -0x1.d62beb64e19eep-8, -0x1.a911f096f7a87p-8, -0x1.6ea6cf452dca3p-8, -0x1.2ebf3dccb166cp-8, -0x1.dfbbadedfcde6p-9,
- -0x1.6d5a95d08c346p-9, -0x1.0bcfca21880c9p-9, -0x1.7a4a8a2bf1a0bp-10, -0x1.01a1c8481a466p-10, -0x1.529b9e8d29ddap-11, -0x1.ada873604cf20p-12,
- -0x1.074b60f960c25p-12, -0x1.37ccd585732c6p-13, -0x1.64e3dcd73a1d3p-14, -0x1.8af14827e93bap-15, -0x1.a6a519ae712fbp-16, -0x1.b5781ea681265p-17,
- -0x1.b60d5ed744563p-18, -0x1.a8670acc75c29p-19, -0x1.8de3ce2154088p-20, -0x1.690584329096ap-21, -0x1.3d0e478659a54p-22, -0x1.0d8875cb088d0p-23,
- -0x1.bba3c56e56d69p-25, -0x1.617a60b4bcd87p-26, -0x1.10b16afb9ce08p-27, -0x1.9766e11f62828p-29, -0x1.26afbc55ef33cp-30, -0x1.9cd52c0e709a9p-32,
- -0x1.18175f6758766p-33, -0x1.705a68dde7f3ap-35, -0x1.d65ba6d52556dp-37, -0x1.23af5c3865987p-38, -0x1.51c72cd64a6bcp-40, -0x1.79f63bbc02f5ap-42,
- -0x1.2346f2840d7bfp-43, -0x1.8110f614395a8p-45, 0x1.c3309f1fe85a4p-46, 0x1.09e6fb6ee0b85p-46, -0x1.959834938224fp-46, -0x1.0e9a684ecee47p-46, 0},
-
-{-0x1.812746b057b58p-11, -0x1.6f552dbf96b31p-11, -0x1.3c97445cee1b0p-11, -0x1.e106c523a966dp-12, -0x1.2bf5318638e21p-12, -0x1.c8105034ea92fp-14,
- 0x1.b6e85963275c5p-15, 0x1.7c9d756585d29p-13, 0x1.1b614b0e78122p-12, 0x1.4cb3cf0b42031p-12, 0x1.571d01cf7eeb3p-12, 0x1.4374d82fe7f2ep-12,
- 0x1.1c2a02b9199a0p-12, 0x1.d6631e131dabap-13, 0x1.7148c3d9d22bap-13, 0x1.143d1c76ae7c6p-13, 0x1.8b0ae3afc07e6p-14, 0x1.0ea475d5b3822p-14,
- 0x1.63ef6208bd4adp-15, 0x1.c1ec100ec3e71p-16, 0x1.119da13709716p-16, 0x1.407fbd00318a5p-17, 0x1.69cf481b4666cp-18, 0x1.89e17d2b19c42p-19,
- 0x1.9db7531fa76f6p-20, 0x1.a37382bd61dc8p-21, 0x1.9aa4a8e8fe8dfp-22, 0x1.8451fcde36f23p-23, 0x1.62cd605193fe9p-24, 0x1.394b0d46af85cp-25,
- 0x1.0b6c0d1191ec9p-26, 0x1.b9581bcc8f4ebp-28, 0x1.603ea0f602119p-29, 0x1.0ff28bc88022cp-30, 0x1.95ecc71a0b4bep-32, 0x1.24ffe516534d4p-33,
- 0x1.9aa89abeffd90p-35, 0x1.1ab57210158fap-36, 0x1.8b0c503eafbcbp-38, 0x1.166413b8ba611p-39, 0x1.5848fad1e38e9p-42, 0x1.3573cc6d6d4e6p-49,
- 0x1.404c0dc8b5ffcp-42, 0x1.38779160f5f11p-43, -0x1.1dc84293acf27p-42, -0x1.2892755467252p-43, 0x1.8e40aed4a9e02p-43, 0x1.0cef3bce98bedp-43, 0},
-
-{0x1.4ade8e6d47ef0p-43, 0x1.196c9ee6491cfp-16, 0x1.040e8be6a9625p-15, 0x1.5529ad049b967p-15, 0x1.76f27e1744b44p-15, 0x1.6963c95cd8395p-15,
- 0x1.349b5d6ae76a6p-15, 0x1.cc6056b95eed3p-16, 0x1.1b614adacb10dp-16, 0x1.ca5080f4ec9b9p-18, -0x1.93a9d54fb750bp-20, -0x1.f3b8d7695d38cp-18,
- -0x1.6d5a929bfde5fp-17, -0x1.974c013452be9p-17, -0x1.8a0da620ab60fp-17, -0x1.5a3166e1f5682p-17, -0x1.1a2c5ad80a584p-17, -0x1.afe552a6507eep-18,
- -0x1.38a9879a760b8p-18, -0x1.ae595d5041755p-19, -0x1.1a89c93c4b9c8p-19, -0x1.62d4c3dc10fdbp-20, -0x1.ab0c620cf63d1p-21, -0x1.ed4aeff35fd90p-22,
- -0x1.11c8e63fae76dp-22, -0x1.2454a1fb4749ap-23, -0x1.2c7f7846b0e7bp-24, -0x1.298c17acfd63ap-25, -0x1.1c0f6cc5baa18p-26, -0x1.0574c9f0e63fap-27,
- -0x1.d0a5c4232f4cep-29, -0x1.8d9d301253af8p-30, -0x1.49cb78be34c81p-31, -0x1.08fc30eb50526p-32, -0x1.96e2f50cad458p-34, -0x1.2c888ddad994bp-35,
- -0x1.c5dd3068e7fcap-37, -0x1.935b876ed56ffp-38, -0x1.e74a7c256ba0dp-39, -0x1.1681c73733b50p-39, 0x1.855ab0b8664dep-41, 0x1.4aebdf7fb67e5p-41,
- -0x1.2aef07c393759p-40, -0x1.37e52b17505e6p-41, 0x1.394b997da7ed5p-40, 0x1.4345440ea9876p-41, -0x1.af227669dca68p-41, -0x1.23589e4f3cc49p-41, 0},
-
-{0x1.ce2f1b1646d4bp-19, 0x1.aaba29a029bd5p-19, 0x1.47e57fbf662a0p-19, 0x1.74882f55f1bd4p-20, 0x1.dfed759bd9091p-23, -0x1.c124b2acb3ee8p-21,
- -0x1.b429a82901889p-20, -0x1.1350ee93fbfb3p-19, -0x1.1b613a5e1e196p-19, -0x1.f65ceb61aa63ap-20, -0x1.82814da1daaa1p-20, -0x1.f5729185c040ep-21,
- -0x1.e72489bfea503p-22, -0x1.17d784c065f21p-24, 0x1.b2229e5122850p-23, 0x1.779b916c44358p-22, 0x1.ace7a08f66cb0p-22, 0x1.9973788b8f181p-22,
- 0x1.5d3bceb9c39d5p-22, 0x1.11da976499339p-22, 0x1.90eaa0d25df91p-23, 0x1.146c19a9f0ae8p-23, 0x1.693a52f5ccd0bp-24, 0x1.c122683fc1404p-25,
- 0x1.0a866e311e50ap-25, 0x1.2e85588e08741p-26, 0x1.493501a3ee15cp-27, 0x1.572eec204dc18p-28, 0x1.590e0157d4dabp-29, 0x1.4c0619d7359e8p-30,
- 0x1.36608b7b22d22p-31, 0x1.0e3f514a0d7fep-32, 0x1.e04d29135056ep-34, 0x1.aa936eb977e33p-35, 0x1.3ce1ec4a299b6p-36, 0x1.aba42bc751130p-38,
- 0x1.0861b5dc819e3p-38, 0x1.3bc7b1f0f8afbp-38, 0x1.7d6c896bf3579p-38, 0x1.14f24be91338cp-38, -0x1.2896024cf2ca9p-39, -0x1.c2e8399d1e8e7p-40,
- 0x1.7836a61cc0f4bp-39, 0x1.8a98e07f8cdfcp-40, -0x1.8f332379c6ce4p-39, -0x1.9bbec3ab83755p-40, 0x1.126c9c6d24bd6p-39, 0x1.72eaeac065cc2p-40, 0},
-
-{0x1.240b25b9a9823p-39, -0x1.733f879c52150p-24, -0x1.4c00873f3742fp-23, -0x1.9a6fe48163775p-23, -0x1.99ed7481d2399p-23, -0x1.52aea61425cf7p-23,
- -0x1.b853c3ad1c781p-24, -0x1.53c3e486c1845p-25, 0x1.2e2a4e7a0286dp-26, 0x1.fd0e266132929p-25, 0x1.5cf1d8fe5611fp-24, 0x1.6b140ba72ac56p-24,
- 0x1.3cab2fa73a9c4p-24, 0x1.d864967df5009p-25, 0x1.25b4551256078p-25, 0x1.0d029bc50b0cdp-26, 0x1.e126485c5dceep-30, -0x1.dd5e4bed818c0p-28,
- -0x1.7cd1b44dbfdc3p-27, -0x1.981def704f39ep-27, -0x1.6f0e87a0f3e35p-27, -0x1.267c0dc9b6e95p-27, -0x1.b2ec3078bf153p-28, -0x1.2b066605239f5p-28,
- -0x1.840473ed3d070p-29, -0x1.daf9b9b8c06cap-30, -0x1.1661520cf8a32p-30, -0x1.2fa49c29e30b5p-31, -0x1.4ddfd9d6a7cf4p-32, -0x1.4a55b8564425ap-33,
- -0x1.5df1ca746f291p-34, -0x1.dd6b8d1ec2e4fp-36, -0x1.34c63d902f888p-36, -0x1.b55b65a1655c0p-37, -0x1.9c1cfd1e2142cp-39, 0x1.98f2b73f288c4p-43,
- -0x1.3baba91a10af8p-39, -0x1.8cb03e5359e2bp-38, -0x1.16063ce2129afp-37, -0x1.9fd74120d8e00p-38, 0x1.cf0caf7defe71p-39, 0x1.5d029f324f3a7p-39,
- -0x1.21268c2290cb5p-38, -0x1.2f6de12d74afdp-39, 0x1.332ead763d55ap-38, 0x1.3cd3a7103e138p-39, -0x1.a64e5d1cdb028p-39, -0x1.1d674b3db2a42p-39, 0},
-
-{-0x1.b84a0abf33534p-27, -0x1.89c6cd0cf2b65p-27, -0x1.09bb37091d4aep-27, -0x1.68f777b72ca95p-29, 0x1.60a5240c5ece1p-29, 0x1.c7421c28ef551p-28,
- 0x1.2e75b6acb2116p-27, 0x1.30f14412b258cp-27, 0x1.f153992d28a09p-28, 0x1.3b80153a3c97bp-28, 0x1.df36fe4b5094cp-30, -0x1.724a2b185f507p-31,
- -0x1.37cb36ce4237dp-29, -0x1.963d70f677f90p-29, -0x1.8d5c135b0af66p-29, -0x1.42fbc01c11a3bp-29, -0x1.baba060b7adb1p-30, -0x1.eaf481fbc6feap-31,
- -0x1.5b5d0a354e49cp-32, 0x1.fb57bbdb6f854p-35, 0x1.2423823b5dcaep-32, 0x1.64e9c7f44ececp-32, 0x1.59b6fb115bcefp-32, 0x1.179a1737c24d9p-32,
- 0x1.a9515bcf95bb0p-33, 0x1.1ca83baba64bdp-33, 0x1.826e7ef89b3cap-34, 0x1.7ab5cb5ca2db0p-35, 0x1.2ce997226e82dp-35, 0x1.fdd14ca5a6d38p-37,
- 0x1.d35252de2a363p-37, -0x1.8dd5e799b3695p-39, 0x1.047fd46786432p-38, 0x1.aa8639c65a4a4p-38, 0x1.10495d2cdaee5p-41, -0x1.24b2b7e751230p-40,
- 0x1.e2ec0b9e9b211p-40, 0x1.6203cc50754ffp-38, 0x1.f95c0def7238bp-38, 0x1.7b31a463405b9p-38, -0x1.a826fa90b3c96p-39, -0x1.3f6315812b719p-39,
- 0x1.0862d42832ac6p-38, 0x1.1575d5fa4614cp-39, -0x1.18eb527929cedp-38, -0x1.21bd844e0e3b8p-39, 0x1.8233e415548a0p-39, 0x1.0501b16f5819bp-39, 0},
-
-{0x1.9b4497171a29dp-39, 0x1.7f9c0bcd4b3e7p-32, 0x1.4928133bccac3p-31, 0x1.7b5a70f49485bp-31, 0x1.4f71ee2c4aff3p-31, 0x1.bca22e6a9cd38p-32,
- 0x1.1c93a34970852p-33, -0x1.03d86c164d20cp-33, -0x1.448222383eb95p-32, -0x1.95aa76b3417ddp-32, -0x1.80448ecd34689p-32, -0x1.19d3f547d1f1fp-32,
- -0x1.2c65995a6a63fp-33, -0x1.01b5832823cc6p-35, 0x1.97d70f56a4524p-35, 0x1.7d57df58d20a9p-34, 0x1.a3d6fe32773b9p-34, 0x1.6ff53581ac827p-34,
- 0x1.faff84d277a6fp-35, 0x1.39ff19e23455bp-35, 0x1.9b1e383b8e03dp-37, 0x1.fd37bce839816p-40, -0x1.31b58a910d109p-37, -0x1.480a28743a67fp-37,
- -0x1.9a8b926ca51b4p-37, -0x1.14d6b0b9c8256p-37, -0x1.227dfd10a7f51p-37, -0x1.d1d5ba9e5676cp-42, -0x1.71c57d72b90eap-38, -0x1.018922e3bb1eap-40,
- -0x1.e0970faab38e6p-39, 0x1.a442b8ab5ed33p-39, -0x1.3a6f0acbd7293p-40, -0x1.7c53be7062a3ap-39, -0x1.c562622693573p-44, 0x1.458e668db57cdp-41,
- -0x1.d5f41a61e90a0p-41, -0x1.60d1f7c57cb11p-39, -0x1.f8fa4c98324fep-39, -0x1.7b178840b90e3p-39, 0x1.a8558cdf5220ap-40, 0x1.3f7acb241cdbbp-40,
- -0x1.086dc81118428p-39, -0x1.15828db8b2da6p-40, 0x1.18f9d5a5099c3p-39, 0x1.21cd05249b8c9p-40, -0x1.82493a2d7a1fep-40, -0x1.0510a8a58c1abp-40, 0},
-
-{0x1.4c0cf8eccd2e0p-35, 0x1.de696ed8004cbp-36, 0x1.62392d5363e58p-37, -0x1.21d68e1a8e4c7p-37, -0x1.867b57075ec9dp-36, -0x1.058af4c30abafp-35,
- -0x1.dbb6594ed5127p-36, -0x1.6006d1f354794p-36, -0x1.311e96adfec96p-37, 0x1.2c82e5ef56703p-39, 0x1.6f2c1413cbe8ep-37, 0x1.c46886dd6c5d6p-37,
- 0x1.92e273bf63d54p-37, 0x1.2982faf5df034p-37, 0x1.5ad37b1dc30c4p-38, 0x1.97104fd2630f8p-40, -0x1.38bcd955ecbb9p-40, -0x1.7779727d36c91p-39,
- -0x1.4862c13c3ccf5p-39, -0x1.53facd6319433p-39, -0x1.de2f6e88b0926p-41, -0x1.fb0967f0fa611p-41, 0x1.5fadb405af344p-42, 0x1.e90319ef64411p-43,
- 0x1.fc013fac4d3d7p-41, 0x1.0546d08a05cacp-41, 0x1.fa1b10c35012ep-41, -0x1.000d4354b8049p-41, 0x1.b68ee44b2b84bp-41, 0x1.cfa36d83ea2afp-48,
- 0x1.5c41a6c8aaf3ap-41, -0x1.7edb2342ceb28p-41, 0x1.d9211942a37d9p-43, 0x1.39b815d399ba2p-41, 0x1.1fc46969db91bp-46, -0x1.1736507c25bafp-43,
- 0x1.89bbcfdb5c677p-43, 0x1.28f22b295bc86p-41, 0x1.a9396e0b45a3bp-41, 0x1.3f409ac2dbfafp-41, -0x1.65682520f07a7p-42, -0x1.0d1586492d3b1p-42,
- 0x1.bd6c9f236abc3p-42, 0x1.d376a4bd795bep-43, -0x1.d94e87dd31275p-42, -0x1.e82d04ff5649fp-43, 0x1.455b18d5d810fp-42, 0x1.b7c6a4ab711bdp-43, 0}
- // clang-format on
- }};
diff --git a/pl/math/v_erfc_1u8.c b/pl/math/v_erfc_1u8.c
new file mode 100644
index 000000000000..10ef7e6a3c34
--- /dev/null
+++ b/pl/math/v_erfc_1u8.c
@@ -0,0 +1,198 @@
+/*
+ * Double-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ uint64x2_t offset, table_scale;
+ float64x2_t max, shift;
+ float64x2_t p20, p40, p41, p42;
+ float64x2_t p51, p52;
+ float64x2_t qr5, qr6, qr7, qr8, qr9;
+#if WANT_SIMD_EXCEPT
+ float64x2_t uflow_bound;
+#endif
+} data = {
+ /* Set an offset so the range of the index used for lookup is 3487, and it
+ can be clamped using a saturated add on an offset index.
+ Index offset is 0xffffffffffffffff - asuint64(shift) - 3487. */
+ .offset = V2 (0xbd3ffffffffff260),
+ .table_scale = V2 (0x37f0000000000000 << 1), /* asuint64 (2^-128) << 1. */
+ .max = V2 (0x1.b3ep+4), /* 3487/128. */
+ .shift = V2 (0x1p45),
+ .p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
+ .p40 = V2 (-0x1.999999999999ap-4), /* -1/10. */
+ .p41 = V2 (-0x1.999999999999ap-2), /* -2/5. */
+ .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */
+ .p51 = V2 (-0x1.c71c71c71c71cp-3), /* -2/9. */
+ .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */
+ /* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
+ .qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
+ .qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
+ .qr7 = { 0x1.2492492492492p0, -0x1.8e38e38e38e39p-3 },
+ .qr8 = { 0x1.2p0, -0x1.6c16c16c16c17p-3 },
+ .qr9 = { 0x1.1c71c71c71c72p0, -0x1.4f2094f2094f2p-3 },
+#if WANT_SIMD_EXCEPT
+ .uflow_bound = V2 (0x1.a8b12fc6e4892p+4),
+#endif
+};
+
+#define TinyBound 0x4000000000000000 /* asuint64 (0x1p-511) << 1. */
+#define Off 0xfffffffffffff260 /* 0xffffffffffffffff - 3487. */
+
+struct entry
+{
+ float64x2_t erfc;
+ float64x2_t scale;
+};
+
+static inline struct entry
+lookup (uint64x2_t i)
+{
+ struct entry e;
+ float64x2_t e1 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[0])),
+ e2 = vld1q_f64 ((float64_t *) (__erfc_data.tab - Off + i[1]));
+ e.erfc = vuzp1q_f64 (e1, e2);
+ e.scale = vuzp2q_f64 (e1, e2);
+ return e;
+}
+
+#if WANT_SIMD_EXCEPT
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+ return v_call_f64 (erfc, x, y, cmp);
+}
+#endif
+
+/* Optimized double-precision vector erfc(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+ poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+ - r * (2/45 r^4 - 2/9 r^2 + 1/6) d^5
+ + p6(r) d^6 + ... + p10(r) d^10
+
+ Polynomials p6(r) to p10(r) are computed using recurrence relation
+
+ 2(i+1)p_i + 2r(i+2)p_{i+1} + (i+2)(i+3)p_{i+2} = 0,
+ with p0 = 1, and p1(r) = -r.
+
+ Values of erfc(r) and scale are read from lookup tables. Stored values
+ are scaled to avoid hitting the subnormal range.
+
+ Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+
+ Maximum measured error: 1.71 ULP
+ V_NAME_D1 (erfc)(0x1.46cfe976733p+4) got 0x1.e15fcbea3e7afp-608
+ want 0x1.e15fcbea3e7adp-608. */
+VPCS_ATTR
+float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ /* |x| < 2^-511. Avoid fabs by left-shifting by 1. */
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t cmp = vcltq_u64 (vaddq_u64 (ix, ix), v_u64 (TinyBound));
+ /* x >= ~26.54 (into subnormal case and uflow case). Comparison is done in
+ integer domain to avoid raising exceptions in presence of nans. */
+ uint64x2_t uflow = vcgeq_s64 (vreinterpretq_s64_f64 (x),
+ vreinterpretq_s64_f64 (dat->uflow_bound));
+ cmp = vorrq_u64 (cmp, uflow);
+ float64x2_t xm = x;
+ /* If any lanes are special, mask them with 0 and retain a copy of x to allow
+ special case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u64 (cmp)))
+ x = v_zerofy_f64 (x, cmp);
+#endif
+
+ float64x2_t a = vabsq_f64 (x);
+ a = vminq_f64 (a, dat->max);
+
+ /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale to
+ 2/sqrt(pi), when x reduced to r = 0. */
+ float64x2_t shift = dat->shift;
+ float64x2_t z = vaddq_f64 (a, shift);
+
+ /* Clamp index to a range of 3487. A naive approach would use a subtract and
+ min. Instead we offset the table address and the index, then use a
+ saturating add. */
+ uint64x2_t i = vqaddq_u64 (vreinterpretq_u64_f64 (z), dat->offset);
+
+ struct entry e = lookup (i);
+
+ /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
+ float64x2_t r = vsubq_f64 (z, shift);
+ float64x2_t d = vsubq_f64 (a, r);
+ float64x2_t d2 = vmulq_f64 (d, d);
+ float64x2_t r2 = vmulq_f64 (r, r);
+
+ float64x2_t p1 = r;
+ float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
+ float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
+ float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42);
+ p4 = vfmsq_f64 (dat->p40, r2, p4);
+ float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52);
+ p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
+ /* Compute p_i using recurrence relation:
+ p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
+ float64x2_t p6 = vfmaq_f64 (p4, p5, vmulq_laneq_f64 (r, dat->qr5, 0));
+ p6 = vmulq_laneq_f64 (p6, dat->qr5, 1);
+ float64x2_t p7 = vfmaq_f64 (p5, p6, vmulq_laneq_f64 (r, dat->qr6, 0));
+ p7 = vmulq_laneq_f64 (p7, dat->qr6, 1);
+ float64x2_t p8 = vfmaq_f64 (p6, p7, vmulq_laneq_f64 (r, dat->qr7, 0));
+ p8 = vmulq_laneq_f64 (p8, dat->qr7, 1);
+ float64x2_t p9 = vfmaq_f64 (p7, p8, vmulq_laneq_f64 (r, dat->qr8, 0));
+ p9 = vmulq_laneq_f64 (p9, dat->qr8, 1);
+ float64x2_t p10 = vfmaq_f64 (p8, p9, vmulq_laneq_f64 (r, dat->qr9, 0));
+ p10 = vmulq_laneq_f64 (p10, dat->qr9, 1);
+ /* Compute polynomial in d using pairwise Horner scheme. */
+ float64x2_t p90 = vfmaq_f64 (p9, d, p10);
+ float64x2_t p78 = vfmaq_f64 (p7, d, p8);
+ float64x2_t p56 = vfmaq_f64 (p5, d, p6);
+ float64x2_t p34 = vfmaq_f64 (p3, d, p4);
+ float64x2_t p12 = vfmaq_f64 (p1, d, p2);
+ float64x2_t y = vfmaq_f64 (p78, d2, p90);
+ y = vfmaq_f64 (p56, d2, y);
+ y = vfmaq_f64 (p34, d2, y);
+ y = vfmaq_f64 (p12, d2, y);
+
+ y = vfmsq_f64 (e.erfc, e.scale, vfmsq_f64 (d, d2, y));
+
+ /* Offset equals 2.0 if sign, else 0.0. */
+ uint64x2_t sign = vshrq_n_u64 (vreinterpretq_u64_f64 (x), 63);
+ float64x2_t off = vreinterpretq_f64_u64 (vshlq_n_u64 (sign, 62));
+ /* Copy sign and scale back in a single fma. Since the bit patterns do not
+ overlap, then logical or and addition are equivalent here. */
+ float64x2_t fac = vreinterpretq_f64_u64 (
+ vsraq_n_u64 (vshlq_n_u64 (sign, 63), dat->table_scale, 1));
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (xm, vfmaq_f64 (off, fac, y), cmp);
+#endif
+
+ return vfmaq_f64 (off, fac, y);
+}
+
+PL_SIG (V, D, 1, erfc, -6.0, 28.0)
+PL_TEST_ULP (V_NAME_D1 (erfc), 1.21)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (erfc), 0, 0x1p-26, 40000)
+PL_TEST_INTERVAL (V_NAME_D1 (erfc), 0x1p-26, 28.0, 40000)
+PL_TEST_INTERVAL (V_NAME_D1 (erfc), -0x1p-26, -6.0, 40000)
+PL_TEST_INTERVAL (V_NAME_D1 (erfc), 28.0, inf, 40000)
+PL_TEST_INTERVAL (V_NAME_D1 (erfc), -6.0, -inf, 40000)
diff --git a/pl/math/v_erfc_4u.c b/pl/math/v_erfc_4u.c
deleted file mode 100644
index c30635153a20..000000000000
--- a/pl/math/v_erfc_4u.c
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Double-precision vector erfc(x) function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "horner.h"
-#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#if V_SUPPORTED
-
-/* Accurate exponential (vector variant of exp_dd). */
-v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);
-
-#define One v_f64 (1.0)
-#define AbsMask v_u64 (0x7fffffffffffffff)
-#define Scale v_f64 (0x1.0000002p27)
-
-/* Coeffs for polynomial approximation on [0x1.0p-28., 31.]. */
-#define PX __v_erfc_data.poly
-#define xint __v_erfc_data.interval_bounds
-
-/* Special cases (fall back to scalar calls). */
-VPCS_ATTR
-NOINLINE static v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
-{
- return v_call_f64 (erfc, x, y, cmp);
-}
-
-/* A structure to perform look-up in coeffs and other parameter
- tables. */
-struct entry
-{
- v_f64_t P[ERFC_POLY_ORDER + 1];
- v_f64_t xi;
-};
-
-static inline struct entry
-lookup (v_u64_t i)
-{
- struct entry e;
-#ifdef SCALAR
- for (int j = 0; j <= ERFC_POLY_ORDER; ++j)
- e.P[j] = PX[i][j];
- e.xi = xint[i];
-#else
- for (int j = 0; j <= ERFC_POLY_ORDER; ++j)
- {
- e.P[j][0] = PX[i[0]][j];
- e.P[j][1] = PX[i[1]][j];
- }
- e.xi[0] = xint[i[0]];
- e.xi[1] = xint[i[1]];
-#endif
- return e;
-}
-
-/* Accurate evaluation of exp(x^2) using compensated product
- (x^2 ~ x*x + e2) and custom exp(y+d) routine for small
- corrections d<<y. */
-static inline v_f64_t
-v_eval_gauss (v_f64_t a)
-{
- v_f64_t e2;
- v_f64_t a2 = a * a;
-
- /* TwoProduct (Dekker) applied to a * a. */
- v_f64_t a_hi = -v_fma_f64 (Scale, a, -a);
- a_hi = v_fma_f64 (Scale, a, a_hi);
- v_f64_t a_lo = a - a_hi;
-
- /* Now assemble error term. */
- e2 = v_fma_f64 (-a_hi, a_hi, a2);
- e2 = v_fma_f64 (-a_hi, a_lo, e2);
- e2 = v_fma_f64 (-a_lo, a_hi, e2);
- e2 = v_fma_f64 (-a_lo, a_lo, e2);
-
- /* Fast and accurate evaluation of exp(-a2 + e2) where e2 << a2. */
- return V_NAME (exp_tail) (-a2, e2);
-}
-
-/* Optimized double precision vector complementary error function erfc.
- Maximum measured error is 3.64 ULP:
- __v_erfc(0x1.4792573ee6cc7p+2) got 0x1.ff3f4c8e200d5p-42
- want 0x1.ff3f4c8e200d9p-42. */
-VPCS_ATTR
-v_f64_t V_NAME (erfc) (v_f64_t x)
-{
- v_f64_t z, p, y;
- v_u64_t ix, atop, sign, i, cmp;
-
- ix = v_as_u64_f64 (x);
- /* Compute fac as early as possible in order to get best performance. */
- v_f64_t fac = v_as_f64_u64 ((ix >> 63) << 62);
- /* Use 12-bit for small, nan and inf case detection. */
- atop = (ix >> 52) & 0x7ff;
- cmp = v_cond_u64 (atop - v_u64 (0x3cd) >= v_u64 (0x7ff - 0x3cd));
-
- struct entry dat;
-
- /* All entries of the vector are out of bounds, take a short path.
- Use smallest possible number above 28 representable in 12 bits. */
- v_u64_t out_of_bounds = v_cond_u64 (atop >= v_u64 (0x404));
-
- /* Use sign to produce either 0 if x > 0, 2 otherwise. */
- if (v_all_u64 (out_of_bounds) && likely (v_any_u64 (~cmp)))
- return fac;
-
- /* erfc(|x|) = P(|x|-x_i)*exp(-x^2). */
-
- v_f64_t a = v_abs_f64 (x);
-
- /* Interval bounds are a logarithmic scale, i.e. interval n has
- lower bound 2^(n/4) - 1. Use the exponent of (|x|+1)^4 to obtain
- the interval index. */
- v_f64_t xp1 = a + v_f64 (1.0);
- xp1 = xp1 * xp1;
- xp1 = xp1 * xp1;
- v_u64_t ixp1 = v_as_u64_f64 (xp1);
- i = (ixp1 >> 52) - v_u64 (1023);
-
- /* Index cannot exceed number of polynomials. */
-#ifdef SCALAR
- i = i <= (ERFC_NUM_INTERVALS) ? i : ERFC_NUM_INTERVALS;
-#else
- i = (v_u64_t){i[0] <= ERFC_NUM_INTERVALS ? i[0] : ERFC_NUM_INTERVALS,
- i[1] <= ERFC_NUM_INTERVALS ? i[1] : ERFC_NUM_INTERVALS};
-#endif
- /* Get coeffs of i-th polynomial. */
- dat = lookup (i);
-
- /* Evaluate Polynomial: P(|x|-x_i). */
- z = a - dat.xi;
-#define C(i) dat.P[i]
- p = HORNER_12 (z, C);
-
- /* Evaluate Gaussian: exp(-x^2). */
- v_f64_t e = v_eval_gauss (a);
-
- /* Copy sign. */
- sign = v_as_u64_f64 (x) & ~AbsMask;
- p = v_as_f64_u64 (v_as_u64_f64 (p) ^ sign);
-
- /* Assemble result as 2.0 - p * e if x < 0, p * e otherwise. */
- y = v_fma_f64 (p, e, fac);
-
- /* No need to fix value of y if x is out of bound, as
- P[ERFC_NUM_INTERVALS]=0. */
- if (unlikely (v_any_u64 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-
-PL_SIG (V, D, 1, erfc, -6.0, 28.0)
-PL_TEST_ULP (V_NAME (erfc), 3.15)
-PL_TEST_INTERVAL (V_NAME (erfc), 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-1022, 0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-1022, -0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME (erfc), 0x1p-26, 0x1p5, 40000)
-PL_TEST_INTERVAL (V_NAME (erfc), -0x1p-26, -0x1p3, 40000)
-PL_TEST_INTERVAL (V_NAME (erfc), 0, inf, 40000)
-#endif
diff --git a/pl/math/v_erfc_data.c b/pl/math/v_erfc_data.c
deleted file mode 100644
index 3c47033c1170..000000000000
--- a/pl/math/v_erfc_data.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Polynomial coefficients for double-precision erfc(x) vector function.
- *
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-/* Coefficients for 20 order-12 polynomials used in v_erfc. The intervals have
- the same bounds as the scalar algorithm, with the exception of the lower
- bound of the first interval which is larger. This is because the vector
- variants fall back to the scalar for tiny arguments, meaning that we can use
- a slightly different approach which is more precise for larger inputs but
- unacceptably imprecise for tiny inputs. */
-
-const struct v_erfc_data __v_erfc_data = {
-
-/* Bounds for 20 intervals spanning [0x1.0p-28., 31.]. Interval bounds are a
- logarithmic scale, i.e. interval n has lower bound 2^(n/4) - 1, with the
- exception of the first interval. */
-.interval_bounds = {
- 0x1p-28, /* If xmin=2^-28, 0 otherwise. */
- 0x1.837f0518db8a9p-3, /* 0.189. */
- 0x1.a827999fcef32p-2, /* 0.414. */
- 0x1.5d13f32b5a75bp-1, /* 0.682. */
- 0x1.0p0, /* 1.000. */
- 0x1.60dfc14636e2ap0, /* 1.378. */
- 0x1.d413cccfe779ap0, /* 1.828. */
- 0x1.2e89f995ad3adp1, /* 2.364. */
- 0x1.8p1, /* 3.000. */
- 0x1.e0dfc14636e2ap1, /* 3.757. */
- 0x1.2a09e667f3bcdp2, /* 4.657. */
- 0x1.6e89f995ad3adp2, /* 5.727. */
- 0x1.cp2, /* 7.000. */
- 0x1.106fe0a31b715p3, /* 8.514. */
- 0x1.4a09e667f3bcdp3, /* 10.31. */
- 0x1.8e89f995ad3adp3, /* 12.45. */
- 0x1.ep3, /* 15.00. */
- 0x1.206fe0a31b715p4, /* 18.03. */
- 0x1.5a09e667f3bcdp4, /* 21.63. */
- 0x1.9e89f995ad3adp4, /* 25.91. */
- 0x1.fp4 /* 31.00. */
-},
-
-/* Generated using fpminimax algorithm on each interval separately. The
- polynomial approximates erfc(x + a) * exp((x + a) ^ 2) in the interval
- [0;b-a], where [a;b] is the interval in which the input lies. Note this is
- slightly different from the scalar polynomial, which approximates
- erfc(x + a) * exp(x ^ 2). See v_erfc.sollya for more details. */
-.poly = {
-/* 3.725290298461914e-9 < x < 0.18920711500272103. */
-{0x1.ffffffdbe4516p-1, -0x1.20dd74e429b54p0, 0x1.ffffffb7c6a67p-1, -0x1.8127466fa2ec9p-1, 0x1.ffffff6eeff5ap-2, -0x1.341f668c90dccp-2, 0x1.5554aca74e5d6p-3, -0x1.6014d9d3fed0dp-4, 0x1.546b5f2c85127p-5, -0x1.2f7ec79acc129p-6, 0x1.a27e53703b7abp-8, 0x1.7b18bce311fa3p-12, -0x1.1897cda04df3ap-9},
-/* 0.18920711500272103 < x < 0.41421356237309515. */
-{0x1.a2b43de077724p-1, -0x1.a3495bb58664cp-1, 0x1.535f3ff4547e6p-1, -0x1.d96eea2951a7cp-2, 0x1.269566a956371p-2, -0x1.4e281de026b47p-3, 0x1.5ea071b652a2fp-4, -0x1.57f46cfca7024p-5, 0x1.3db28243f06abp-6, -0x1.138745eef6f26p-7, 0x1.a9cd70bad344p-9, -0x1.c6e4fda8920c4p-11, 0x1.624709ca2bc71p-16},
-/* 0.41421356237309515 < x < 0.681792830507429. */
-{0x1.532e75764e513p-1, -0x1.28be34f327f9dp-1, 0x1.b088738cca84cp-2, -0x1.14377551bd5c8p-2, 0x1.3e1ecedd64246p-3, -0x1.5087f3110eb57p-4, 0x1.4b3c61efcb562p-5, -0x1.324cc70a4f459p-6, 0x1.0cd19a96af21bp-7, -0x1.cc2ccc725d07p-9, 0x1.a3ba67a7d02b4p-10, -0x1.b1943295882abp-11, 0x1.53a1c5fdf8e67p-12},
-/* 0.681792830507429 < x < 1. */
-{0x1.10f974588f63dp-1, -0x1.9b032139e3367p-2, 0x1.09b942b8a951dp-2, -0x1.327553909cb88p-3, 0x1.42819b6c9a14p-4, -0x1.3a6d6f1924825p-5, 0x1.1f1864dd6f28fp-6, -0x1.ef12c5e9f3232p-8, 0x1.962ac63d55aa1p-9, -0x1.4146d9206419cp-10, 0x1.f823f62268229p-12, -0x1.837ab488d5ed8p-13, 0x1.aa021ae16edfep-15},
-/* 1 < x < 1.378414230005442. */
-{0x1.b5d8780f956b2p-2, -0x1.17c4e3f17c034p-2, 0x1.3c27283c31939p-3, -0x1.44837f88a0ecdp-4, 0x1.33cad0dc779c8p-5, -0x1.10fcef8294e8dp-6, 0x1.c8cb3e5a6a5a6p-8, -0x1.6aedbd3a05f1cp-9, 0x1.1325c0bf9a0cap-10, -0x1.8e28d61a0f646p-12, 0x1.0d554e2ab3652p-13, -0x1.35b5f9ac296ebp-15, 0x1.b8faf07e2527dp-18},
-/* 1.378414230005442 < x < 1.8284271247461903. */
-{0x1.5ee444130b7dbp-2, -0x1.78396ab2083e8p-3, 0x1.6e617ec5bc039p-4, -0x1.49e60f6238765p-5, 0x1.16064fb4428c9p-6, -0x1.ba80a8575a434p-8, 0x1.4ec30f2efeb8p-9, -0x1.e40456c735f09p-11, 0x1.4f7ee6b7885b7p-12, -0x1.bc9997995fdecp-14, 0x1.1169f7327ff2p-15, -0x1.174826d000852p-17, 0x1.5506a7433e925p-20},
-/* 1.8284271247461903 < x < 2.363585661014858. */
-{0x1.19a22c064d4eap-2, -0x1.f645498cae1b3p-4, 0x1.a0565950e1256p-5, -0x1.446605c186f6dp-6, 0x1.df1231b47ff04p-8, -0x1.515164d13dfafp-9, 0x1.c72bde869ad61p-11, -0x1.2768fbf9b1d6ep-12, 0x1.71bd3a1b851e9p-14, -0x1.bca5b5942017cp-16, 0x1.f2d480b3a2e63p-18, -0x1.d339662d53467p-20, 0x1.06d67ebf792bp-22},
-/* 2.363585661014858 < x < 3. */
-{0x1.c57f0542a7637p-3, -0x1.4e5535c17af25p-4, 0x1.d31272523acfep-6, -0x1.3727cbbfd1bfcp-7, 0x1.8d6730b8c5a4cp-9, -0x1.e88548286036fp-11, 0x1.21f6e89456853p-12, -0x1.4d4b7787bd3c2p-14, 0x1.735dc84e7ff16p-16, -0x1.8eb02db832048p-18, 0x1.8dfb8add3b86ep-20, -0x1.47a340d76c72bp-22, 0x1.3e5925ffebe6bp-25},
-/* 3 < x < 3.756828460010884. */
-{0x1.6e9827d229d2dp-3, -0x1.bd6ae4d14b1adp-5, 0x1.043fe1a98c3b9p-6, -0x1.259061ba34453p-8, 0x1.409cc2cc96bedp-10, -0x1.53dec3fd6c443p-12, 0x1.5e72f7baf3554p-14, -0x1.601aa94bf21eep-16, 0x1.58e730ceaa91dp-18, -0x1.4762cbd256163p-20, 0x1.22b8bea5d4a5ap-22, -0x1.ac197af37fcadp-25, 0x1.74cdf138a0b73p-28},
-/* 3.756828460010884 < x < 4.656854249492381. */
-{0x1.29a8a4e95063ep-3, -0x1.29a8a316d331dp-5, 0x1.21876b3fe50cfp-7, -0x1.1276f2d8eefd9p-9, 0x1.fbff521741e5cp-12, -0x1.cb9ce996b9601p-14, 0x1.971075371ef81p-16, -0x1.61458571e4738p-18, 0x1.2c51c21b7ab9ep-20, -0x1.f01e444a666c3p-23, 0x1.7e8f2979b67f1p-25, -0x1.e505367843027p-28, 0x1.67809d68de49cp-31},
-/* 4.656854249492381 < x < 5.727171322029716. */
-{0x1.e583024e2bc7fp-4, -0x1.8fb458acb5acep-6, 0x1.42b9dffac075cp-8, -0x1.ff9fe9a48522p-11, 0x1.8e7e866f4f073p-13, -0x1.313aeee1c2d45p-15, 0x1.cc299efd7374cp-18, -0x1.5587e53442d66p-20, 0x1.f2aca160f159bp-23, -0x1.62ae4834dcda7p-25, 0x1.d6b070147cb37p-28, -0x1.fee399e7be1bfp-31, 0x1.41d6f9fbc9515p-34},
-/* 5.727171322029716 < x < 7. */
-{0x1.8d9cbafa30408p-4, -0x1.0dd14614ed1cfp-6, 0x1.6943976ea6bf4p-9, -0x1.dd6f05f3b914cp-12, 0x1.37891317e7bcfp-14, -0x1.91a81ce9014a2p-17, 0x1.ffcac303208b9p-20, -0x1.424f1af78feb3p-22, 0x1.90b8edbca12a5p-25, -0x1.e69bea0338c7fp-28, 0x1.13b974a710373p-30, -0x1.fdc9aa9359794p-34, 0x1.105fc772b5a66p-37},
-/* 7 < x < 8.513656920021768. */
-{0x1.46dc6bf900f68p-4, -0x1.6e4b45246f95p-7, 0x1.96a3de47d4bd7p-10, -0x1.bf5070eccb409p-13, 0x1.e7af6e83607a2p-16, -0x1.078bf5306f9eep-18, 0x1.1a6e8327243adp-21, -0x1.2c1e7368c7809p-24, 0x1.3bc83557dac43p-27, -0x1.45a6405b2e649p-30, 0x1.3aac4888689ebp-33, -0x1.f1fa23448a168p-37, 0x1.c868668755778p-41},
-/* 8.513656920021768 < x < 10.313708498984761. */
-{0x1.0d9a17e032288p-4, -0x1.f3e942ff4df7p-8, 0x1.cc77f09dabc5cp-11, -0x1.a56e8bfd32da8p-14, 0x1.7f49e31164409p-17, -0x1.5a73f46a6afc9p-20, 0x1.374240ce973d2p-23, -0x1.15e8d473b728cp-26, 0x1.ec3ec79699378p-30, -0x1.ab3b8aba63362p-33, 0x1.5a1381cfe2866p-36, -0x1.c78e252ce77ccp-40, 0x1.589857ceaaaeep-44},
-/* 10.313708498984761 < x < 12.454342644059432. */
-{0x1.be0c73cc19eddp-5, -0x1.56ce6f6c0cbb1p-8, 0x1.0645980ecbbfcp-11, -0x1.8f86f887f6598p-15, 0x1.2ef80cd9e00b1p-18, -0x1.c97ffd66720e4p-22, 0x1.57f0eeecf030ap-25, -0x1.016df7d5e28d9p-28, 0x1.7f0d022922f1dp-32, -0x1.1849731f004aep-35, 0x1.8149e7ca0fb3cp-39, -0x1.b1fe4abe62d81p-43, 0x1.1ae4d60247651p-47},
-/* 12.454342644059432 < x < 15. */
-{0x1.71eafbd9f5877p-5, -0x1.d83714d90461fp-9, 0x1.2c74dbacd45fdp-12, -0x1.7d27f3cfe160ep-16, 0x1.e20b13b8d32e3p-20, -0x1.2fe33cb2bce33p-23, 0x1.7dfd564d69a07p-27, -0x1.dea62ef0f7d7ep-31, 0x1.2a7b946273ea5p-34, -0x1.6eb665bad5b72p-38, 0x1.a8191750e8bf9p-42, -0x1.92d8a86cbd0fcp-46, 0x1.bba272feef841p-51},
-/* 15 < x < 18.027313840043536. */
-{0x1.33714a024097ep-5, -0x1.467f441a50bc3p-9, 0x1.59fa2994c6f7ap-13, -0x1.6dd369d642b7dp-17, 0x1.81fb2aaf2e37p-21, -0x1.966040990b623p-25, 0x1.aaee55e15a079p-29, -0x1.bf756fc8ef04p-33, 0x1.d2daf554e0157p-37, -0x1.dec63e10d317p-41, 0x1.cae915bab7704p-45, -0x1.6537fbb62a8edp-49, 0x1.3f14bd5531da8p-54},
-/* 18.027313840043536 < x < 21.627416997969522. */
-{0x1.fff97acd75487p-6, -0x1.c502e8e46eb81p-10, 0x1.903b065062756p-14, -0x1.6110aa5e81885p-18, 0x1.36fd4c13c4f1fp-22, -0x1.11848650be987p-26, 0x1.e06596bf6a27p-31, -0x1.a527876771d55p-35, 0x1.6fe1b92a40eb8p-39, -0x1.3c6eb50b23bc6p-43, 0x1.fead2230125dp-48, -0x1.5073427c5207dp-52, 0x1.ff420973fa51dp-58},
-/* 21.627416997969522 < x < 25.908685288118864. */
-{0x1.aaf347fc8c45bp-6, -0x1.3b2fd709cf8e5p-10, 0x1.d0ddfb858b60ap-15, -0x1.5673f4a8bb08ep-19, 0x1.f80488e89ddb9p-24, -0x1.728391905fcf3p-28, 0x1.101538d7e30bap-32, -0x1.8f16f49d0fa3bp-37, 0x1.23bbaea534034p-41, -0x1.a40119533ee1p-46, 0x1.1b75770e435fdp-50, -0x1.3804bdeb33efdp-55, 0x1.8ba4e7838a4dp-61},
-/* 25.908685288118864 < x < 31. */
-{0x1.64839d636f92bp-6, -0x1.b7adf753623afp-11, 0x1.0eec0b635a0c4p-15, -0x1.4da09b802ef48p-20, 0x1.9a8b149f5ddf1p-25, -0x1.f8d1f722c65bap-30, 0x1.36247d9a20e19p-34, -0x1.7cbd25180c1d3p-39, 0x1.d243c7a5c8331p-44, -0x1.19e00cc6b1e08p-48, 0x1.418cb6823f2d9p-53, -0x1.2dfdc526c43acp-58, 0x1.49885a987486fp-64},
-/* Dummy interval for x>31 */
-{0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0, 0x0p0,
- 0x0p0, 0x0p0, 0x0p0}
-}
-};
diff --git a/pl/math/v_erfcf_1u.c b/pl/math/v_erfcf_1u.c
deleted file mode 100644
index 963490d789bd..000000000000
--- a/pl/math/v_erfcf_1u.c
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Single-precision vector erfc(x) function.
- *
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "erfcf.h"
-#include "estrin.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#if V_SUPPORTED
-
-#define P(ia12) __erfcf_poly_data.poly[interval_index (ia12)]
-
-VPCS_ATTR v_f64_t V_NAME (exp_tail) (v_f64_t, v_f64_t);
-
-static VPCS_ATTR NOINLINE v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t special)
-{
- return v_call_f32 (erfcf, x, y, special);
-}
-
-static inline uint32_t
-interval_index (uint32_t ia12)
-{
- // clang-format off
- return (ia12 < 0x400 ? 0 :
- (ia12 < 0x408 ? 1 :
- (ia12 < 0x410 ? 2 :
- 3)));
- // clang-format on
-}
-
-/* The C macro wraps the coeffs argument in order to make the
- poynomial evaluation more readable. In the scalarised variant the
- second pointer is ignored. */
-#ifdef SCALAR
-#define C(i) coeff1[i]
-#else
-#define C(i) ((v_f64_t){coeff1[i], coeff2[i]})
-#endif
-
-static inline v_f64_t
-v_approx_erfcf_poly_gauss (v_f64_t x, const double *coeff1,
- const double *coeff2)
-{
- v_f64_t x2 = x * x;
- v_f64_t x4 = x2 * x2;
- v_f64_t poly = ESTRIN_15 (x, x2, x4, x4 * x4, C);
- v_f64_t gauss = V_NAME (exp_tail) (-(x * x), v_f64 (0.0));
- return poly * gauss;
-}
-
-static inline float
-approx_poly_gauss (float abs_x, const double *coeff)
-{
- return (float) (eval_poly (abs_x, coeff) * eval_exp_mx2 (abs_x));
-}
-
-static v_f32_t
-v_approx_erfcf (v_f32_t abs_x, v_u32_t sign, v_u32_t ia12, v_u32_t lanes)
-{
-#ifdef SCALAR
- float y = approx_poly_gauss (abs_x, P (ia12));
- return sign ? 2 - y : y;
-#else
- float32x2_t lo32 = {0, 0};
- float32x2_t hi32 = {0, 0};
- /* The polynomial and Gaussian components must be calculated in
- double precision in order to meet the required ULP error. This
- means we have to promote low and high halves of the
- single-precision input vector to two separate double-precision
- input vectors. This incurs some overhead, and there is also
- overhead to loading the polynomial coefficients as this cannot be
- done in a vector fashion. This would be wasted effort for
- elements which lie in the 'boring' zone, as they will be
- overwritten later. Hence we use the lanes parameter to only do
- the promotion on a pair of lanes if both of those lanes are
- interesting and not special cases. If one lane is inactive, we
- use a scalar routine which is shared with the scalar variant. */
- if (lanes[0] & lanes[1])
- {
- lo32 = vcvt_f32_f64 (
- v_approx_erfcf_poly_gauss (vcvt_f64_f32 (vget_low_f32 (abs_x)),
- P (ia12[0]), P (ia12[1])));
- }
- else if (lanes[0])
- {
- lo32[0] = approx_poly_gauss (abs_x[0], P (ia12[0]));
- }
- else if (lanes[1])
- {
- lo32[1] = approx_poly_gauss (abs_x[1], P (ia12[1]));
- }
-
- if (lanes[2] & lanes[3])
- {
- hi32
- = vcvt_f32_f64 (v_approx_erfcf_poly_gauss (vcvt_high_f64_f32 (abs_x),
- P (ia12[2]), P (ia12[3])));
- }
- else if (lanes[2])
- {
- hi32[0] = approx_poly_gauss (abs_x[2], P (ia12[2]));
- }
- else if (lanes[3])
- {
- hi32[1] = approx_poly_gauss (abs_x[3], P (ia12[3]));
- }
-
- v_f32_t y = vcombine_f32 (lo32, hi32);
-
- if (v_any_u32 (sign))
- {
- y = vbslq_f32 (vceqzq_u32 (sign), y, 2 - y);
- }
-
- return y;
-#endif
-}
-
-/* Optimized single-precision vector complementary error function
- erfcf. Max measured error: 0.750092 at various values between
- -0x1.06521p-20 and -0x1.add1dap-17. For example:
- __v_erfc(-0x1.08185p-18) got 0x1.00004cp+0 want 0x1.00004ap+0
- +0.249908 ulp err 0.250092. */
-VPCS_ATTR
-v_f32_t V_NAME (erfcf) (v_f32_t x)
-{
- v_u32_t ix = v_as_u32_f32 (x);
- v_u32_t ia = ix & 0x7fffffff;
- v_u32_t ia12 = ia >> 20;
- v_u32_t sign = ix >> 31;
- v_u32_t inf_ia12 = v_u32 (0x7f8);
-
- v_u32_t special_cases
- = v_cond_u32 ((ia12 - 0x328) >= ((inf_ia12 & 0x7f8) - 0x328));
- v_u32_t in_bounds
- = v_cond_u32 ((ia < 0x408ccccd) | (~sign & (ix < 0x4120f5c3)));
- v_f32_t boring_zone = v_as_f32_u32 (sign << 30);
-
-#ifdef SCALAR
- if (unlikely (special_cases))
- {
- if (ia12 >= 0x7f8)
- return (float) (sign << 1) + 1.0f / x; /* Special cases. */
- else
- return 1.0f - x; /* Small case. */
- }
- else if (likely (!in_bounds))
- {
- return sign ? boring_zone : __math_uflowf (boring_zone);
- }
-#endif
-
- v_f32_t y = v_approx_erfcf (v_as_f32_u32 (ia), sign, ia12,
- in_bounds & ~special_cases);
-
-#ifndef SCALAR
- y = vbslq_f32 (~in_bounds, boring_zone, y);
-
- if (unlikely (v_any_u32 (special_cases)))
- {
- return specialcase (x, y, special_cases);
- }
-#endif
-
- return y;
-}
-VPCS_ALIAS
-
-PL_SIG (V, F, 1, erfc, -6.0, 28.0)
-PL_TEST_ULP (V_NAME (erfcf), 0.26)
-PL_TEST_INTERVAL (V_NAME (erfcf), 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-127, 0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-127, -0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME (erfcf), 0x1p-26, 0x1p5, 40000)
-PL_TEST_INTERVAL (V_NAME (erfcf), -0x1p-26, -0x1p3, 40000)
-PL_TEST_INTERVAL (V_NAME (erfcf), 0, inf, 40000)
-#endif
diff --git a/pl/math/v_erfcf_1u7.c b/pl/math/v_erfcf_1u7.c
new file mode 100644
index 000000000000..c361d0704438
--- /dev/null
+++ b/pl/math/v_erfcf_1u7.c
@@ -0,0 +1,166 @@
+/*
+ * Single-precision vector erfc(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ uint32x4_t offset, table_scale;
+ float32x4_t max, shift;
+ float32x4_t coeffs, third, two_over_five, tenth;
+#if WANT_SIMD_EXCEPT
+ float32x4_t uflow_bound;
+#endif
+
+} data = {
+ /* Set an offset so the range of the index used for lookup is 644, and it can
+ be clamped using a saturated add. */
+ .offset = V4 (0xb7fffd7b), /* 0xffffffff - asuint(shift) - 644. */
+ .table_scale = V4 (0x28000000 << 1), /* asuint (2^-47) << 1. */
+ .max = V4 (10.0625f), /* 10 + 1/16 = 644/64. */
+ .shift = V4 (0x1p17f),
+ /* Store 1/3, 2/3 and 2/15 in a single register for use with indexed muls and
+ fmas. */
+ .coeffs = (float32x4_t){ 0x1.555556p-2f, 0x1.555556p-1f, 0x1.111112p-3f, 0 },
+ .third = V4 (0x1.555556p-2f),
+ .two_over_five = V4 (-0x1.99999ap-2f),
+ .tenth = V4 (-0x1.99999ap-4f),
+#if WANT_SIMD_EXCEPT
+ .uflow_bound = V4 (0x1.2639cp+3f),
+#endif
+};
+
+#define TinyBound 0x41000000 /* asuint (0x1p-62f) << 1. */
+#define Thres 0xbe000000 /* (asuint (infinity) << 1) - TinyBound. */
+#define Off 0xfffffd7b /* 0xffffffff - 644. */
+
+struct entry
+{
+ float32x4_t erfc;
+ float32x4_t scale;
+};
+
+static inline struct entry
+lookup (uint32x4_t i)
+{
+ struct entry e;
+ float64_t t0 = *((float64_t *) (__erfcf_data.tab - Off + i[0]));
+ float64_t t1 = *((float64_t *) (__erfcf_data.tab - Off + i[1]));
+ float64_t t2 = *((float64_t *) (__erfcf_data.tab - Off + i[2]));
+ float64_t t3 = *((float64_t *) (__erfcf_data.tab - Off + i[3]));
+ float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
+ float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+ e.erfc = vuzp1q_f32 (e1, e2);
+ e.scale = vuzp2q_f32 (e1, e2);
+ return e;
+}
+
+#if WANT_SIMD_EXCEPT
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ return v_call_f32 (erfcf, x, y, cmp);
+}
+#endif
+
+/* Optimized single-precision vector erfcf(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/64.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erfc(x) ~ erfc(r) - scale * d * poly(r, d), with
+
+ poly(r, d) = 1 - r d + (2/3 r^2 - 1/3) d^2 - r (1/3 r^2 - 1/2) d^3
+ + (2/15 r^4 - 2/5 r^2 + 1/10) d^4
+
+ Values of erfc(r) and scale are read from lookup tables. Stored values
+ are scaled to avoid hitting the subnormal range.
+
+ Note that for x < 0, erfc(x) = 2.0 - erfc(-x).
+ Maximum error: 1.63 ULP (~1.0 ULP for x < 0.0).
+ _ZGVnN4v_erfcf(0x1.1dbf7ap+3) got 0x1.f51212p-120
+ want 0x1.f51216p-120. */
+VPCS_ATTR
+float32x4_t V_NAME_F1 (erfc) (float32x4_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ /* |x| < 2^-62. Avoid fabs by left-shifting by 1. */
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t cmp = vcltq_u32 (vaddq_u32 (ix, ix), v_u32 (TinyBound));
+ /* x >= ~9.19 (into subnormal case and uflow case). Comparison is done in
+ integer domain to avoid raising exceptions in presence of nans. */
+ uint32x4_t uflow = vcgeq_s32 (vreinterpretq_s32_f32 (x),
+ vreinterpretq_s32_f32 (dat->uflow_bound));
+ cmp = vorrq_u32 (cmp, uflow);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 0 and retain a copy of x to allow
+ special case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = v_zerofy_f32 (x, cmp);
+#endif
+
+ float32x4_t a = vabsq_f32 (x);
+ a = vminq_f32 (a, dat->max);
+
+ /* Lookup erfc(r) and scale(r) in tables, e.g. set erfc(r) to 1 and scale to
+ 2/sqrt(pi), when x reduced to r = 0. */
+ float32x4_t shift = dat->shift;
+ float32x4_t z = vaddq_f32 (a, shift);
+
+ /* Clamp index to a range of 644. A naive approach would use a subtract and
+ min. Instead we offset the table address and the index, then use a
+ saturating add. */
+ uint32x4_t i = vqaddq_u32 (vreinterpretq_u32_f32 (z), dat->offset);
+
+ struct entry e = lookup (i);
+
+ /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). */
+ float32x4_t r = vsubq_f32 (z, shift);
+ float32x4_t d = vsubq_f32 (a, r);
+ float32x4_t d2 = vmulq_f32 (d, d);
+ float32x4_t r2 = vmulq_f32 (r, r);
+
+ float32x4_t p1 = r;
+ float32x4_t p2 = vfmsq_laneq_f32 (dat->third, r2, dat->coeffs, 1);
+ float32x4_t p3
+ = vmulq_f32 (r, vfmaq_laneq_f32 (v_f32 (-0.5), r2, dat->coeffs, 0));
+ float32x4_t p4 = vfmaq_laneq_f32 (dat->two_over_five, r2, dat->coeffs, 2);
+ p4 = vfmsq_f32 (dat->tenth, r2, p4);
+
+ float32x4_t y = vfmaq_f32 (p3, d, p4);
+ y = vfmaq_f32 (p2, d, y);
+ y = vfmaq_f32 (p1, d, y);
+ y = vfmsq_f32 (e.erfc, e.scale, vfmsq_f32 (d, d2, y));
+
+ /* Offset equals 2.0f if sign, else 0.0f. */
+ uint32x4_t sign = vshrq_n_u32 (vreinterpretq_u32_f32 (x), 31);
+ float32x4_t off = vreinterpretq_f32_u32 (vshlq_n_u32 (sign, 30));
+ /* Copy sign and scale back in a single fma. Since the bit patterns do not
+ overlap, then logical or and addition are equivalent here. */
+ float32x4_t fac = vreinterpretq_f32_u32 (
+ vsraq_n_u32 (vshlq_n_u32 (sign, 31), dat->table_scale, 1));
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (xm, vfmaq_f32 (off, fac, y), cmp);
+#endif
+
+ return vfmaq_f32 (off, fac, y);
+}
+
+PL_SIG (V, F, 1, erfc, -4.0, 10.0)
+PL_TEST_ULP (V_NAME_F1 (erfc), 1.14)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (erfc), 0, 0x1p-26, 40000)
+PL_TEST_INTERVAL (V_NAME_F1 (erfc), 0x1p-26, 10.0625, 40000)
+PL_TEST_INTERVAL (V_NAME_F1 (erfc), -0x1p-26, -4.0, 40000)
+PL_TEST_INTERVAL (V_NAME_F1 (erfc), 10.0625, inf, 40000)
+PL_TEST_INTERVAL (V_NAME_F1 (erfc), -4.0, -inf, 40000)
diff --git a/pl/math/v_erff_1u5.c b/pl/math/v_erff_1u5.c
deleted file mode 100644
index 3a25cc8751d1..000000000000
--- a/pl/math/v_erff_1u5.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Single-precision vector erf(x) function.
- *
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "include/mathlib.h"
-#include "math_config.h"
-#include "pl_sig.h"
-#include "pl_test.h"
-
-#if V_SUPPORTED
-
-VPCS_ATTR v_f32_t V_NAME (expf) (v_f32_t);
-
-#define AbsMask v_u32 (0x7fffffff)
-
-/* Special cases (fall back to scalar calls). */
-VPCS_ATTR
-NOINLINE static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
-{
- return v_call_f32 (erff, x, y, cmp);
-}
-
-/* A structure to perform look-up in coeffs and other parameter tables. */
-struct entry
-{
- v_f32_t P[V_ERFF_NCOEFFS];
-};
-
-static inline struct entry
-lookup (v_u32_t i)
-{
- struct entry e;
-#ifdef SCALAR
- for (int j = 0; j < V_ERFF_NCOEFFS; ++j)
- e.P[j] = __v_erff_data.coeffs[j][i];
-#else
- for (int j = 0; j < V_ERFF_NCOEFFS; ++j)
- {
- e.P[j][0] = __v_erff_data.coeffs[j][i[0]];
- e.P[j][1] = __v_erff_data.coeffs[j][i[1]];
- e.P[j][2] = __v_erff_data.coeffs[j][i[2]];
- e.P[j][3] = __v_erff_data.coeffs[j][i[3]];
- }
-#endif
- return e;
-}
-
-/* Optimized single precision vector error function erf.
- Maximum measured at +/- 0.931, 1.25ULP:
- v_erff(-0x1.dc59fap-1) got -0x1.9f9c88p-1
- want -0x1.9f9c8ap-1. */
-VPCS_ATTR
-v_f32_t V_NAME (erff) (v_f32_t x)
-{
- /* Handle both inf/nan as well as small values (|x|<2^-28). If any condition
- in the lane is true then a loop over scalar calls will be performed. */
- v_u32_t ix = v_as_u32_f32 (x);
- v_u32_t atop = (ix >> 16) & v_u32 (0x7fff);
- v_u32_t cmp = v_cond_u32 (atop - v_u32 (0x3180) >= v_u32 (0x7ff0 - 0x3180));
-
- /* Get sign and absolute value. */
- v_u32_t sign = ix & ~AbsMask;
- /* |x| < 0.921875. */
- v_u32_t red = v_calt_f32 (x, v_f32 (0.921875f));
- /* |x| > 4.0. */
- v_u32_t bor = v_cagt_f32 (x, v_f32 (4.0f));
- /* Avoid dependency in abs(x) in division (and comparison). */
- v_u32_t i = v_sel_u32 (red, v_u32 (0), v_u32 (1));
-
- /* Get polynomial coefficients. */
- struct entry dat = lookup (i);
-
- v_f32_t a = v_abs_f32 (x);
- v_f32_t z = v_sel_f32 (red, x * x, a);
-
- /* Evaluate Polynomial of |x| or x^2. */
- v_f32_t r = dat.P[6];
- r = v_fma_f32 (z, r, dat.P[5]);
- r = v_fma_f32 (z, r, dat.P[4]);
- r = v_fma_f32 (z, r, dat.P[3]);
- r = v_fma_f32 (z, r, dat.P[2]);
- r = v_fma_f32 (z, r, dat.P[1]);
- r = v_sel_f32 (red, r, v_fma_f32 (z, r, dat.P[0]));
- r = v_fma_f32 (a, r, a);
-
- /* y = |x| + |x|*P(|x|) if |x| < 0.921875
- 1 - exp (-(|x|+|x|*P(x^2))) otherwise. */
- v_f32_t y = v_sel_f32 (red, r, v_f32 (1.0f) - V_NAME (expf) (-r));
-
- /* Boring domain (absolute value is required to get the sign of erf(-nan)
- right). */
- y = v_sel_f32 (bor, v_f32 (1.0f), v_abs_f32 (y));
-
- /* y=erf(x) if x>0, -erf(-x) otherwise. */
- y = v_as_f32_u32 (v_as_u32_f32 (y) ^ sign);
-
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (x, y, cmp);
- return y;
-}
-VPCS_ALIAS
-
-PL_SIG (V, F, 1, erf, -4.0, 4.0)
-PL_TEST_ULP (V_NAME (erff), 0.76)
-PL_TEST_INTERVAL (V_NAME (erff), 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (V_NAME (erff), 0x1p-127, 0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME (erff), -0x1p-127, -0x1p-26, 40000)
-PL_TEST_INTERVAL (V_NAME (erff), 0x1p-26, 0x1p3, 40000)
-PL_TEST_INTERVAL (V_NAME (erff), -0x1p-26, -0x1p3, 40000)
-PL_TEST_INTERVAL (V_NAME (erff), 0, inf, 40000)
-#endif
diff --git a/pl/math/v_erff_2u.c b/pl/math/v_erff_2u.c
new file mode 100644
index 000000000000..502526407df2
--- /dev/null
+++ b/pl/math/v_erff_2u.c
@@ -0,0 +1,118 @@
+/*
+ * Single-precision vector erf(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float32x4_t max, shift, third;
+#if WANT_SIMD_EXCEPT
+ float32x4_t tiny_bound, scale_minus_one;
+#endif
+} data = {
+ .max = V4 (3.9375), /* 4 - 8/128. */
+ .shift = V4 (0x1p16f),
+ .third = V4 (0x1.555556p-2f), /* 1/3. */
+#if WANT_SIMD_EXCEPT
+ .tiny_bound = V4 (0x1p-62f),
+ .scale_minus_one = V4 (0x1.06eba8p-3f), /* scale - 1.0. */
+#endif
+};
+
+#define AbsMask 0x7fffffff
+
+struct entry
+{
+ float32x4_t erf;
+ float32x4_t scale;
+};
+
+static inline struct entry
+lookup (uint32x4_t i)
+{
+ struct entry e;
+ float64_t t0 = *((float64_t *) (__erff_data.tab + i[0]));
+ float64_t t1 = *((float64_t *) (__erff_data.tab + i[1]));
+ float64_t t2 = *((float64_t *) (__erff_data.tab + i[2]));
+ float64_t t3 = *((float64_t *) (__erff_data.tab + i[3]));
+ float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
+ float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
+ e.erf = vuzp1q_f32 (e1, e2);
+ e.scale = vuzp2q_f32 (e1, e2);
+ return e;
+}
+
+/* Single-precision implementation of vector erf(x).
+ Approximation based on series expansion near x rounded to
+ nearest multiple of 1/128.
+ Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
+
+ erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2]
+
+ Values of erf(r) and scale are read from lookup tables.
+ For |x| > 3.9375, erf(|x|) rounds to 1.0f.
+
+ Maximum error: 1.93 ULP
+ _ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9
+ want 0x1.fd6868p-9. */
+float32x4_t VPCS_ATTR V_NAME_F1 (erf) (float32x4_t x)
+{
+ const struct data *dat = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ /* |x| < 2^-62. */
+ uint32x4_t cmp = vcaltq_f32 (x, dat->tiny_bound);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+ float32x4_t a = vabsq_f32 (x);
+ uint32x4_t a_gt_max = vcgtq_f32 (a, dat->max);
+
+ /* Lookup erf(r) and scale(r) in tables, e.g. set erf(r) to 0 and scale to
+ 2/sqrt(pi), when x reduced to r = 0. */
+ float32x4_t shift = dat->shift;
+ float32x4_t z = vaddq_f32 (a, shift);
+
+ uint32x4_t i
+ = vsubq_u32 (vreinterpretq_u32_f32 (z), vreinterpretq_u32_f32 (shift));
+ i = vminq_u32 (i, v_u32 (512));
+ struct entry e = lookup (i);
+
+ float32x4_t r = vsubq_f32 (z, shift);
+
+ /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */
+ float32x4_t d = vsubq_f32 (a, r);
+ float32x4_t d2 = vmulq_f32 (d, d);
+ float32x4_t y = vfmaq_f32 (r, dat->third, d);
+ y = vfmaq_f32 (e.erf, e.scale, vfmsq_f32 (d, d2, y));
+
+ /* Solves the |x| = inf case. */
+ y = vbslq_f32 (a_gt_max, v_f32 (1.0f), y);
+
+ /* Copy sign. */
+ y = vbslq_f32 (v_u32 (AbsMask), y, x);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u32 (cmp)))
+ return vbslq_f32 (cmp, vfmaq_f32 (xm, dat->scale_minus_one, xm), y);
+#endif
+ return y;
+}
+
+PL_SIG (V, F, 1, erf, -4.0, 4.0)
+PL_TEST_ULP (V_NAME_F1 (erf), 1.43)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (erf), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, 3.9375, 40000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 3.9375, inf, 40000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (erf), 0, inf, 40000)
diff --git a/pl/math/v_erff_data.c b/pl/math/v_erff_data.c
deleted file mode 100644
index 73ccb5cbcfa8..000000000000
--- a/pl/math/v_erff_data.c
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Data for approximation of vector erff.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-/* Minimax approximation of erff. */
-const struct v_erff_data __v_erff_data
- = {.coeffs = {{0x0p0f, 0x1.079d0cp-3f},
- {0x1.06eba6p-03f, 0x1.450aa0p-1},
- {-0x1.8126e0p-02f, 0x1.b55cb0p-4f},
- {0x1.ce1a46p-04f, -0x1.8d6300p-6f},
- {-0x1.b68bd2p-06f, 0x1.fd1336p-9f},
- {0x1.473f48p-08f, -0x1.91d2ccp-12f},
- {-0x1.3a1a82p-11f, 0x1.222900p-16f}}};
diff --git a/pl/math/v_erfinv_25u.c b/pl/math/v_erfinv_25u.c
new file mode 100644
index 000000000000..654a7336e85b
--- /dev/null
+++ b/pl/math/v_erfinv_25u.c
@@ -0,0 +1,161 @@
+/*
+ * Double-precision inverse error function (AdvSIMD variant).
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "pl_test.h"
+#include "mathlib.h"
+#include "math_config.h"
+#include "pl_sig.h"
+#include "poly_advsimd_f64.h"
+#define V_LOG_INLINE_POLY_ORDER 4
+#include "v_log_inline.h"
+
+const static struct data
+{
+ /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the
+ coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs
+ of the denominator. P is interleaved P_17 and P_37, similar for Q. P17
+ and Q17 are provided as homogenous vectors as well for when the shortcut
+ can be taken. */
+ double P[8][2], Q[7][2];
+ float64x2_t tailshift;
+ uint8x16_t idx;
+ struct v_log_inline_data log_tbl;
+ float64x2_t P_57[9], Q_57[10], P_17[7], Q_17[6];
+} data = { .P = { { 0x1.007ce8f01b2e8p+4, -0x1.f3596123109edp-7 },
+ { -0x1.6b23cc5c6c6d7p+6, 0x1.60b8fe375999ep-2 },
+ { 0x1.74e5f6ceb3548p+7, -0x1.779bb9bef7c0fp+1 },
+ { -0x1.5200bb15cc6bbp+7, 0x1.786ea384470a2p+3 },
+ { 0x1.05d193233a849p+6, -0x1.6a7c1453c85d3p+4 },
+ { -0x1.148c5474ee5e1p+3, 0x1.31f0fc5613142p+4 },
+ { 0x1.689181bbafd0cp-3, -0x1.5ea6c007d4dbbp+2 },
+ { 0, 0x1.e66f265ce9e5p-3 } },
+ .Q = { { 0x1.d8fb0f913bd7bp+3, -0x1.636b2dcf4edbep-7 },
+ { -0x1.6d7f25a3f1c24p+6, 0x1.0b5411e2acf29p-2 },
+ { 0x1.a450d8e7f4cbbp+7, -0x1.3413109467a0bp+1 },
+ { -0x1.bc3480485857p+7, 0x1.563e8136c554ap+3 },
+ { 0x1.ae6b0c504ee02p+6, -0x1.7b77aab1dcafbp+4 },
+ { -0x1.499dfec1a7f5fp+4, 0x1.8a3e174e05ddcp+4 },
+ { 0x1p+0, -0x1.4075c56404eecp+3 } },
+ .P_57 = { V2 (0x1.b874f9516f7f1p-14), V2 (0x1.5921f2916c1c4p-7),
+ V2 (0x1.145ae7d5b8fa4p-2), V2 (0x1.29d6dcc3b2fb7p+1),
+ V2 (0x1.cabe2209a7985p+2), V2 (0x1.11859f0745c4p+3),
+ V2 (0x1.b7ec7bc6a2ce5p+2), V2 (0x1.d0419e0bb42aep+1),
+ V2 (0x1.c5aa03eef7258p-1) },
+ .Q_57 = { V2 (0x1.b8747e12691f1p-14), V2 (0x1.59240d8ed1e0ap-7),
+ V2 (0x1.14aef2b181e2p-2), V2 (0x1.2cd181bcea52p+1),
+ V2 (0x1.e6e63e0b7aa4cp+2), V2 (0x1.65cf8da94aa3ap+3),
+ V2 (0x1.7e5c787b10a36p+3), V2 (0x1.0626d68b6cea3p+3),
+ V2 (0x1.065c5f193abf6p+2), V2 (0x1p+0) },
+ .P_17 = { V2 (0x1.007ce8f01b2e8p+4), V2 (-0x1.6b23cc5c6c6d7p+6),
+ V2 (0x1.74e5f6ceb3548p+7), V2 (-0x1.5200bb15cc6bbp+7),
+ V2 (0x1.05d193233a849p+6), V2 (-0x1.148c5474ee5e1p+3),
+ V2 (0x1.689181bbafd0cp-3) },
+ .Q_17 = { V2 (0x1.d8fb0f913bd7bp+3), V2 (-0x1.6d7f25a3f1c24p+6),
+ V2 (0x1.a450d8e7f4cbbp+7), V2 (-0x1.bc3480485857p+7),
+ V2 (0x1.ae6b0c504ee02p+6), V2 (-0x1.499dfec1a7f5fp+4) },
+ .tailshift = V2 (-0.87890625),
+ .idx = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ .log_tbl = V_LOG_CONSTANTS };
+
+static inline float64x2_t
+special (float64x2_t x, const struct data *d)
+{
+ /* Note erfinv(inf) should return NaN, and erfinv(1) should return Inf.
+ By using log here, instead of log1p, we return finite values for both
+ these inputs, and values outside [-1, 1]. This is non-compliant, but is an
+ acceptable optimisation at Ofast. To get correct behaviour for all finite
+ values use the log1p_inline helper on -abs(x) - note that erfinv(inf)
+ will still be finite. */
+ float64x2_t t = vnegq_f64 (
+ v_log_inline (vsubq_f64 (v_f64 (1), vabsq_f64 (x)), &d->log_tbl));
+ t = vdivq_f64 (v_f64 (1), vsqrtq_f64 (t));
+ float64x2_t ts = vbslq_f64 (v_u64 (0x7fffffffffffffff), t, x);
+ return vdivq_f64 (v_horner_8_f64 (t, d->P_57),
+ vmulq_f64 (ts, v_horner_9_f64 (t, d->Q_57)));
+}
+
+static inline float64x2_t
+lookup (const double *c, uint8x16_t idx)
+{
+ float64x2_t x = vld1q_f64 (c);
+ return vreinterpretq_f64_u8 (vqtbl1q_u8 (vreinterpretq_u8_f64 (x), idx));
+}
+
+static inline float64x2_t VPCS_ATTR
+notails (float64x2_t x, const struct data *d)
+{
+ /* Shortcut when no input is in a tail region - no need to gather shift or
+ coefficients. */
+ float64x2_t t = vfmaq_f64 (v_f64 (-0.5625), x, x);
+ float64x2_t p = vmulq_f64 (v_horner_6_f64 (t, d->P_17), x);
+ float64x2_t q = vaddq_f64 (d->Q_17[5], t);
+ for (int i = 4; i >= 0; i--)
+ q = vfmaq_f64 (d->Q_17[i], q, t);
+ return vdivq_f64 (p, q);
+}
+
+/* Vector implementation of Blair et al's rational approximation to inverse
+ error function in double-precision. Largest observed error is 24.75 ULP:
+ _ZGVnN2v_erfinv(0x1.fc861d81c2ba8p-1) got 0x1.ea05472686625p+0
+ want 0x1.ea0547268660cp+0. */
+float64x2_t VPCS_ATTR V_NAME_D1 (erfinv) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ /* Calculate inverse error using algorithm described in
+ J. M. Blair, C. A. Edwards, and J. H. Johnson,
+ "Rational Chebyshev approximations for the inverse of the error function",
+ Math. Comp. 30, pp. 827--830 (1976).
+ https://doi.org/10.1090/S0025-5718-1976-0421040-7.
+
+ Algorithm has 3 intervals:
+ - 'Normal' region [-0.75, 0.75]
+ - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
+ - Extreme tail [-1, -0.9375] U [0.9375, 1]
+ Normal and tail are both rational approximation of similar order on
+ shifted input - these are typically performed in parallel using gather
+ loads to obtain correct coefficients depending on interval. */
+ uint64x2_t is_tail = vcagtq_f64 (x, v_f64 (0.75));
+
+ if (unlikely (!v_any_u64 (is_tail)))
+ /* If input is uniform in [-1, 1] then likelihood of this is
+ 0.75^2 ~= 0.56. */
+ return notails (x, d);
+
+ uint64x2_t extreme_tail = vcagtq_f64 (x, v_f64 (0.9375));
+
+ uint8x16_t off = vandq_u8 (vreinterpretq_u8_u64 (is_tail), vdupq_n_u8 (8));
+ uint8x16_t idx = vaddq_u8 (d->idx, off);
+
+ float64x2_t t = vbslq_f64 (is_tail, d->tailshift, v_f64 (-0.5625));
+ t = vfmaq_f64 (t, x, x);
+
+ float64x2_t p = lookup (&d->P[7][0], idx);
+ /* Last coeff of q is either 0 or 1 - use mask instead of load. */
+ float64x2_t q = vreinterpretq_f64_u64 (
+ vandq_u64 (is_tail, vreinterpretq_u64_f64 (v_f64 (1))));
+ for (int i = 6; i >= 0; i--)
+ {
+ p = vfmaq_f64 (lookup (&d->P[i][0], idx), p, t);
+ q = vfmaq_f64 (lookup (&d->Q[i][0], idx), q, t);
+ }
+ p = vmulq_f64 (p, x);
+
+ if (unlikely (v_any_u64 (extreme_tail)))
+ return vbslq_f64 (extreme_tail, special (x, d), vdivq_f64 (p, q));
+
+ return vdivq_f64 (p, q);
+}
+
+PL_SIG (V, D, 1, erfinv, -0.99, 0.99)
+PL_TEST_ULP (V_NAME_D1 (erfinv), 24.8)
+/* Test with control lane in each interval. */
+PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
+ 0.5)
+PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
+ 0.8)
+PL_TEST_SYM_INTERVAL_C (V_NAME_D1 (erfinv), 0, 0x1.fffffffffffffp-1, 100000,
+ 0.95)
diff --git a/pl/math/v_erfinvf_5u.c b/pl/math/v_erfinvf_5u.c
new file mode 100644
index 000000000000..5a6800b86ae9
--- /dev/null
+++ b/pl/math/v_erfinvf_5u.c
@@ -0,0 +1,163 @@
+/*
+ * Single-precision inverse error function (AdvSIMD variant).
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_advsimd_f32.h"
+#include "v_logf_inline.h"
+
+const static struct data
+{
+ /* We use P_N and Q_N to refer to arrays of coefficients, where P_N is the
+ coeffs of the numerator in table N of Blair et al, and Q_N is the coeffs
+ of the denominator. Coefficients are stored in various interleaved
+ formats to allow for table-based (vector-to-vector) lookup.
+
+ Plo is first two coefficients of P_10 and P_29 interleaved.
+ PQ is third coeff of P_10 and first of Q_29 interleaved.
+ Qhi is second and third coeffs of Q_29 interleaved.
+ P29_3 is a homogenous vector with fourth coeff of P_29.
+
+ P_10 and Q_10 are also stored in homogenous vectors to allow better
+ memory access when no lanes are in a tail region. */
+ float32x4_t Plo, PQ, Qhi, P29_3, tailshift;
+ float32x4_t P_50[6], Q_50[2];
+ float32x4_t P_10[3], Q_10[3];
+ uint8x16_t idxhi, idxlo;
+ struct v_logf_data logf_tbl;
+} data = {
+ .idxlo = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ .idxhi = { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 },
+ .P29_3 = V4 (0x1.b13626p-2),
+ .tailshift = V4 (-0.87890625),
+ .Plo = { -0x1.a31268p+3, -0x1.fc0252p-4, 0x1.ac9048p+4, 0x1.119d44p+0 },
+ .PQ = { -0x1.293ff6p+3, -0x1.f59ee2p+0, -0x1.8265eep+3, -0x1.69952p-4 },
+ .Qhi = { 0x1.ef5eaep+4, 0x1.c7b7d2p-1, -0x1.12665p+4, -0x1.167d7p+1 },
+ .P_50 = { V4 (0x1.3d8948p-3), V4 (0x1.61f9eap+0), V4 (0x1.61c6bcp-1),
+ V4 (-0x1.20c9f2p+0), V4 (0x1.5c704cp-1), V4 (-0x1.50c6bep-3) },
+ .Q_50 = { V4 (0x1.3d7dacp-3), V4 (0x1.629e5p+0) },
+ .P_10 = { V4 (-0x1.a31268p+3), V4 (0x1.ac9048p+4), V4 (-0x1.293ff6p+3) },
+ .Q_10 = { V4 (-0x1.8265eep+3), V4 (0x1.ef5eaep+4), V4 (-0x1.12665p+4) },
+ .logf_tbl = V_LOGF_CONSTANTS
+};
+
+static inline float32x4_t
+special (float32x4_t x, const struct data *d)
+{
+ /* Note erfinvf(inf) should return NaN, and erfinvf(1) should return Inf.
+ By using log here, instead of log1p, we return finite values for both
+ these inputs, and values outside [-1, 1]. This is non-compliant, but is an
+ acceptable optimisation at Ofast. To get correct behaviour for all finite
+ values use the log1pf_inline helper on -abs(x) - note that erfinvf(inf)
+ will still be finite. */
+ float32x4_t t = vdivq_f32 (
+ v_f32 (1), vsqrtq_f32 (vnegq_f32 (v_logf_inline (
+ vsubq_f32 (v_f32 (1), vabsq_f32 (x)), &d->logf_tbl))));
+ float32x4_t ts = vbslq_f32 (v_u32 (0x7fffffff), t, x);
+ float32x4_t q = vfmaq_f32 (d->Q_50[0], vaddq_f32 (t, d->Q_50[1]), t);
+ return vdivq_f32 (v_horner_5_f32 (t, d->P_50), vmulq_f32 (ts, q));
+}
+
+static inline float32x4_t
+notails (float32x4_t x, const struct data *d)
+{
+ /* Shortcut when no input is in a tail region - no need to gather shift or
+ coefficients. */
+ float32x4_t t = vfmaq_f32 (v_f32 (-0.5625), x, x);
+ float32x4_t q = vaddq_f32 (t, d->Q_10[2]);
+ q = vfmaq_f32 (d->Q_10[1], t, q);
+ q = vfmaq_f32 (d->Q_10[0], t, q);
+
+ return vdivq_f32 (vmulq_f32 (x, v_horner_2_f32 (t, d->P_10)), q);
+}
+
+static inline float32x4_t
+lookup (float32x4_t tbl, uint8x16_t idx)
+{
+ return vreinterpretq_f32_u8 (vqtbl1q_u8 (vreinterpretq_u8_f32 (tbl), idx));
+}
+
+/* Vector implementation of Blair et al's rational approximation to inverse
+ error function in single-precision. Worst-case error is 4.98 ULP, in the
+ tail region:
+ _ZGVnN4v_erfinvf(0x1.f7dbeep-1) got 0x1.b4793p+0
+ want 0x1.b4793ap+0 . */
+float32x4_t VPCS_ATTR V_NAME_F1 (erfinv) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* Calculate inverse error using algorithm described in
+ J. M. Blair, C. A. Edwards, and J. H. Johnson,
+ "Rational Chebyshev approximations for the inverse of the error
+ function", Math. Comp. 30, pp. 827--830 (1976).
+ https://doi.org/10.1090/S0025-5718-1976-0421040-7.
+
+ Algorithm has 3 intervals:
+ - 'Normal' region [-0.75, 0.75]
+ - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
+ - Extreme tail [-1, -0.9375] U [0.9375, 1]
+ Normal and tail are both rational approximation of similar order on
+ shifted input - these are typically performed in parallel using gather
+ loads to obtain correct coefficients depending on interval. */
+ uint32x4_t is_tail = vcageq_f32 (x, v_f32 (0.75));
+ uint32x4_t extreme_tail = vcageq_f32 (x, v_f32 (0.9375));
+
+ if (unlikely (!v_any_u32 (is_tail)))
+ /* Shortcut for if all lanes are in [-0.75, 0.75] - can avoid having to
+ gather coefficients. If input is uniform in [-1, 1] then likelihood of
+ this is 0.75^4 ~= 0.31. */
+ return notails (x, d);
+
+ /* Select requisite shift depending on interval: polynomial is evaluated on
+ x * x - shift.
+ Normal shift = 0.5625
+ Tail shift = 0.87890625. */
+ float32x4_t t
+ = vfmaq_f32 (vbslq_f32 (is_tail, d->tailshift, v_f32 (-0.5625)), x, x);
+
+ /* Calculate indexes for tbl: tbl is byte-wise, so:
+ [0, 1, 2, 3, 4, 5, 6, ....] copies the vector
+ Add 4 * i to a group of 4 lanes to copy 32-bit lane i. Each vector stores
+ two pairs of coeffs, so we need two idx vectors - one for each pair. */
+ uint8x16_t off = vandq_u8 (vreinterpretq_u8_u32 (is_tail), vdupq_n_u8 (4));
+ uint8x16_t idx_lo = vaddq_u8 (d->idxlo, off);
+ uint8x16_t idx_hi = vaddq_u8 (d->idxhi, off);
+
+ /* Load the tables. */
+ float32x4_t p_lo = d->Plo;
+ float32x4_t pq = d->PQ;
+ float32x4_t qhi = d->Qhi;
+
+ /* Do the lookup (and calculate p3 by masking non-tail lanes). */
+ float32x4_t p3 = vreinterpretq_f32_u32 (
+ vandq_u32 (is_tail, vreinterpretq_u32_f32 (d->P29_3)));
+ float32x4_t p0 = lookup (p_lo, idx_lo), p1 = lookup (p_lo, idx_hi),
+ p2 = lookup (pq, idx_lo), q0 = lookup (pq, idx_hi),
+ q1 = lookup (qhi, idx_lo), q2 = lookup (qhi, idx_hi);
+
+ float32x4_t p = vfmaq_f32 (p2, p3, t);
+ p = vfmaq_f32 (p1, p, t);
+ p = vfmaq_f32 (p0, p, t);
+ p = vmulq_f32 (x, p);
+
+ float32x4_t q = vfmaq_f32 (q1, vaddq_f32 (q2, t), t);
+ q = vfmaq_f32 (q0, q, t);
+
+ if (unlikely (v_any_u32 (extreme_tail)))
+ /* At least one lane is in the extreme tail - if input is uniform in
+ [-1, 1] the likelihood of this is ~0.23. */
+ return vbslq_f32 (extreme_tail, special (x, d), vdivq_f32 (p, q));
+
+ return vdivq_f32 (p, q);
+}
+
+PL_SIG (V, F, 1, erfinv, -0.99, 0.99)
+PL_TEST_ULP (V_NAME_F1 (erfinv), 4.49)
+/* Test with control lane in each interval. */
+PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.5)
+PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.8)
+PL_TEST_SYM_INTERVAL_C (V_NAME_F1 (erfinv), 0, 0x1.fffffep-1, 40000, 0.95)
diff --git a/pl/math/v_exp10_2u.c b/pl/math/v_exp10_2u.c
new file mode 100644
index 000000000000..29072a60fb3a
--- /dev/null
+++ b/pl/math/v_exp10_2u.c
@@ -0,0 +1,144 @@
+/*
+ * Double-precision vector 10^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* Value of |x| above which scale overflows without special treatment. */
+#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */
+/* Value of n above which scale overflows even with special treatment. */
+#define ScaleBound 163840.0 /* 1280.0 * N. */
+
+const static struct data
+{
+ float64x2_t poly[4];
+ float64x2_t log10_2, log2_10_hi, log2_10_lo, shift;
+#if !WANT_SIMD_EXCEPT
+ float64x2_t special_bound, scale_thresh;
+#endif
+} data = {
+ /* Coefficients generated using Remez algorithm.
+ rel error: 0x1.5ddf8f28p-54
+ abs error: 0x1.5ed266c8p-54 in [ -log10(2)/256, log10(2)/256 ]
+ maxerr: 1.14432 +0.5 ulp. */
+ .poly = { V2 (0x1.26bb1bbb5524p1), V2 (0x1.53524c73cecdap1),
+ V2 (0x1.047060efb781cp1), V2 (0x1.2bd76040f0d16p0) },
+ .log10_2 = V2 (0x1.a934f0979a371p8), /* N/log10(2). */
+ .log2_10_hi = V2 (0x1.34413509f79ffp-9), /* log10(2)/N. */
+ .log2_10_lo = V2 (-0x1.9dc1da994fd21p-66),
+ .shift = V2 (0x1.8p+52),
+#if !WANT_SIMD_EXCEPT
+ .scale_thresh = V2 (ScaleBound),
+ .special_bound = V2 (SpecialBound),
+#endif
+};
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask v_u64 (N - 1)
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
+# define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */
+# define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f64 (exp10, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0). */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n,
+ const struct data *d)
+{
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (
+ vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+ uint64x2_t cmp = vcagtq_f64 (n, d->scale_thresh);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+ return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+/* Fast vector implementation of exp10.
+ Maximum measured error is 1.64 ulp.
+ _ZGVnN2v_exp10(0x1.ccd1c9d82cc8cp+0) got 0x1.f8dab6d7fed0cp+5
+ want 0x1.f8dab6d7fed0ap+5. */
+float64x2_t VPCS_ATTR V_NAME_D1 (exp10) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t cmp;
+#if WANT_SIMD_EXCEPT
+ /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+ special_case to fix special lanes later. This is only necessary if fenv
+ exceptions are to be triggered correctly. */
+ float64x2_t xm = x;
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), Thres);
+ if (unlikely (v_any_u64 (cmp)))
+ x = vbslq_f64 (cmp, v_f64 (1), x);
+#else
+ cmp = vcageq_f64 (x, d->special_bound);
+#endif
+
+ /* n = round(x/(log10(2)/N)). */
+ float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ /* r = x - n*log10(2)/N. */
+ float64x2_t r = x;
+ r = vfmsq_f64 (r, d->log2_10_hi, n);
+ r = vfmsq_f64 (r, d->log2_10_lo, n);
+
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+ uint64x2_t i = vandq_u64 (u, IndexMask);
+
+ /* y = exp10(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t p = vfmaq_f64 (d->poly[0], r, d->poly[1]);
+ float64x2_t y = vfmaq_f64 (d->poly[2], r, d->poly[3]);
+ p = vfmaq_f64 (p, y, r2);
+ y = vmulq_f64 (r, p);
+
+ /* s = 2^(n/N). */
+ u = v_lookup_u64 (__v_exp_data, i);
+ float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ if (unlikely (v_any_u64 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f64 (s, y, s), cmp);
+#else
+ return special_case (s, y, n, d);
+#endif
+
+ return vfmaq_f64 (s, y, s);
+}
+
+PL_SIG (S, D, 1, exp10, -9.9, 9.9)
+PL_SIG (V, D, 1, exp10, -9.9, 9.9)
+PL_TEST_ULP (V_NAME_D1 (exp10), 1.15)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (exp10), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), 0, SpecialBound, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), SpecialBound, ScaleBound, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp10), ScaleBound, inf, 10000)
diff --git a/pl/math/v_exp10f_2u4.c b/pl/math/v_exp10f_2u4.c
new file mode 100644
index 000000000000..0e91becfa612
--- /dev/null
+++ b/pl/math/v_exp10f_2u4.c
@@ -0,0 +1,138 @@
+/*
+ * Single-precision vector 10^x function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+#include "poly_advsimd_f32.h"
+
+#define ScaleBound 192.0f
+
+static const struct data
+{
+ float32x4_t poly[5];
+ float32x4_t log10_2_and_inv, shift;
+
+#if !WANT_SIMD_EXCEPT
+ float32x4_t scale_thresh;
+#endif
+} data = {
+ /* Coefficients generated using Remez algorithm with minimisation of relative
+ error.
+ rel error: 0x1.89dafa3p-24
+ abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
+ maxerr: 1.85943 +0.5 ulp. */
+ .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
+ V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
+ .shift = V4 (0x1.8p23f),
+
+ /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
+ .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
+#if !WANT_SIMD_EXCEPT
+ .scale_thresh = V4 (ScaleBound)
+#endif
+};
+
+#define ExponentBias v_u32 (0x3f800000)
+
+#if WANT_SIMD_EXCEPT
+
+# define SpecialBound 38.0f /* rint(log10(2^127)). */
+# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
+# define BigBound v_u32 (0x42180000) /* asuint (SpecialBound). */
+# define Thres v_u32 (0x22180000) /* BigBound - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+ /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+ routine for special lanes. */
+ return v_call_f32 (exp10f, x, y, cmp);
+}
+
+#else
+
+# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
+# define SpecialOffset v_u32 (0x82000000)
+# define SpecialBias v_u32 (0x7f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r2 = vmulq_f32 (s1, s1);
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+ return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+/* Fast vector implementation of single-precision exp10.
+ Algorithm is accurate to 2.36 ULP.
+ _ZGVnN4v_exp10f(0x1.be2b36p+1) got 0x1.7e79c4p+11
+ want 0x1.7e79cp+11. */
+float32x4_t VPCS_ATTR V_NAME_F1 (exp10) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+#if WANT_SIMD_EXCEPT
+ /* asuint(x) - TinyBound >= BigBound - TinyBound. */
+ uint32x4_t cmp = vcgeq_u32 (
+ vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres);
+ float32x4_t xm = x;
+ /* If any lanes are special, mask them with 0 and retain a copy of x to allow
+ special case handler to fix special lanes later. This is only necessary if
+ fenv exceptions are to be triggered correctly. */
+ if (unlikely (v_any_u32 (cmp)))
+ x = v_zerofy_f32 (x, cmp);
+#endif
+
+ /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
+ with 1 + poly(r) in [1/sqrt(2), sqrt(2)] and
+ x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
+ float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
+ float32x4_t n = vsubq_f32 (z, d->shift);
+ float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
+ r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+
+#if !WANT_SIMD_EXCEPT
+ uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound));
+#endif
+
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t poly
+ = vfmaq_f32 (vmulq_f32 (r, d->poly[0]),
+ v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2);
+
+ if (unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+ return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+ return special_case (poly, n, e, cmp, scale, d);
+#endif
+
+ return vfmaq_f32 (scale, poly, scale);
+}
+
+PL_SIG (S, F, 1, exp10, -9.9, 9.9)
+PL_SIG (V, F, 1, exp10, -9.9, 9.9)
+PL_TEST_ULP (V_NAME_F1 (exp10), 1.86)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (exp10), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), 0, SpecialBound, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), SpecialBound, ScaleBound, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (exp10), ScaleBound, inf, 10000)
diff --git a/pl/math/v_exp2_2u.c b/pl/math/v_exp2_2u.c
new file mode 100644
index 000000000000..de59779689f5
--- /dev/null
+++ b/pl/math/v_exp2_2u.c
@@ -0,0 +1,128 @@
+/*
+ * Double-precision vector 2^x function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+#define BigBound 1022.0
+#define UOFlowBound 1280.0
+
+static const struct data
+{
+ float64x2_t poly[4];
+ float64x2_t shift, scale_big_bound, scale_uoflow_bound;
+} data = {
+ /* Coefficients are computed using Remez algorithm with
+ minimisation of the absolute error. */
+ .poly = { V2 (0x1.62e42fefa3686p-1), V2 (0x1.ebfbdff82c241p-3),
+ V2 (0x1.c6b09b16de99ap-5), V2 (0x1.3b2abf5571ad8p-7) },
+ .shift = V2 (0x1.8p52 / N),
+ .scale_big_bound = V2 (BigBound),
+ .scale_uoflow_bound = V2 (UOFlowBound),
+};
+
+static inline uint64x2_t
+lookup_sbits (uint64x2_t i)
+{
+ return (uint64x2_t){ __v_exp_data[i[0] & IndexMask],
+ __v_exp_data[i[1] & IndexMask] };
+}
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
+# define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */
+
+/* Call scalar exp2 as a fallback. */
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special)
+{
+ return v_call_f64 (exp2, x, y, is_special);
+}
+
+#else
+
+# define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint64(1.0). */
+# define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+# define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n,
+ const struct data *d)
+{
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vclezq_f64 (n), v_u64 (SpecialOffset));
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (v_u64 (SpecialBias1), b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (
+ vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), v_u64 (SpecialBias2)), b));
+ uint64x2_t cmp = vcagtq_f64 (n, d->scale_uoflow_bound);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, s2, y), s1);
+ return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+/* Fast vector implementation of exp2.
+ Maximum measured error is 1.65 ulp.
+ _ZGVnN2v_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1
+ want 0x1.f8db0d4df721dp-1. */
+VPCS_ATTR
+float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t cmp;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres));
+ /* Mask special lanes and retain a copy of x for passing to special-case
+ handler. */
+ float64x2_t xc = x;
+ x = v_zerofy_f64 (x, cmp);
+#else
+ cmp = vcagtq_f64 (x, d->scale_big_bound);
+#endif
+
+ /* n = round(x*N)/N: adding shift (0x1.8p52/N) rounds x to a multiple of 1/N. */
+ float64x2_t z = vaddq_f64 (d->shift, x);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ /* r = x - n. */
+ float64x2_t r = vsubq_f64 (x, n);
+
+ /* s = 2^n: table entry picked by the low bits of z, plus those bits shifted up. */
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+ u = lookup_sbits (u);
+ float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+ /* y ~ exp2(r) - 1. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = v_pairwise_poly_3_f64 (r, r2, d->poly);
+ y = vmulq_f64 (r, y);
+
+ if (unlikely (v_any_u64 (cmp)))
+#if !WANT_SIMD_EXCEPT
+ return special_case (s, y, n, d);
+#else
+ return special_case (xc, vfmaq_f64 (s, s, y), cmp);
+#endif
+ return vfmaq_f64 (s, s, y);
+}
+
+PL_SIG (V, D, 1, exp2, -9.9, 9.9)
+PL_TEST_ULP (V_NAME_D1 (exp2), 1.15)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (exp2), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), 0, TinyBound, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), TinyBound, BigBound, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), BigBound, UOFlowBound, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (exp2), UOFlowBound, inf, 10000)
diff --git a/pl/math/v_exp_data.c b/pl/math/v_exp_data.c
new file mode 100644
index 000000000000..fd01cf27606f
--- /dev/null
+++ b/pl/math/v_exp_data.c
@@ -0,0 +1,55 @@
+/*
+ * Scale values for vector exp and exp2
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+/* 2^(j/N), j=0..N-1, N=2^7=128. Copied from math/v_exp_data.c. */
+const uint64_t __v_exp_data[] = {
+ 0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061,
+ 0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de,
+ 0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f,
+ 0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b,
+ 0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0,
+ 0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea,
+ 0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa,
+ 0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96,
+ 0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd,
+ 0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990,
+ 0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715,
+ 0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1,
+ 0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7,
+ 0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c,
+ 0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d,
+ 0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de,
+ 0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7,
+ 0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f,
+ 0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429,
+ 0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09,
+ 0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225,
+ 0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf,
+ 0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74,
+ 0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f,
+ 0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62,
+ 0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad,
+ 0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db,
+ 0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6,
+ 0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50,
+ 0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323,
+ 0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d,
+ 0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a,
+ 0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb,
+ 0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a,
+ 0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c,
+ 0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5,
+ 0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c,
+ 0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398,
+ 0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f,
+ 0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83,
+ 0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27,
+ 0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14,
+ 0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1,
+};
diff --git a/pl/math/v_exp_tail.c b/pl/math/v_exp_tail.c
deleted file mode 100644
index fd38aa8ae6ea..000000000000
--- a/pl/math/v_exp_tail.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Double-precision vector e^(x+tail) function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "math_config.h"
-#if V_SUPPORTED
-#include "v_exp_tail.h"
-
-#define C1 v_f64 (C1_scal)
-#define C2 v_f64 (C2_scal)
-#define C3 v_f64 (C3_scal)
-#define InvLn2 v_f64 (InvLn2_scal)
-#define Ln2hi v_f64 (Ln2hi_scal)
-#define Ln2lo v_f64 (Ln2lo_scal)
-
-#define IndexMask v_u64 (IndexMask_scal)
-#define Shift v_f64 (Shift_scal)
-#define Thres v_f64 (Thres_scal)
-
-VPCS_ATTR
-static v_f64_t
-specialcase (v_f64_t s, v_f64_t y, v_f64_t n)
-{
- v_f64_t absn = v_abs_f64 (n);
-
- /* 2^(n/N) may overflow, break it up into s1*s2. */
- v_u64_t b = v_cond_u64 (n <= v_f64 (0.0)) & v_u64 (0x6000000000000000);
- v_f64_t s1 = v_as_f64_u64 (v_u64 (0x7000000000000000) - b);
- v_f64_t s2 = v_as_f64_u64 (v_as_u64_f64 (s) - v_u64 (0x3010000000000000) + b);
- v_u64_t cmp = v_cond_u64 (absn > v_f64 (1280.0 * N));
- v_f64_t r1 = s1 * s1;
- v_f64_t r0 = v_fma_f64 (y, s2, s2) * s1;
- return v_as_f64_u64 ((cmp & v_as_u64_f64 (r1)) | (~cmp & v_as_u64_f64 (r0)));
-}
-
-VPCS_ATTR
-v_f64_t V_NAME (exp_tail) (v_f64_t x, v_f64_t xtail)
-{
- v_f64_t n, r, s, y, z;
- v_u64_t cmp, u, e, i;
-
- cmp = v_cond_u64 (v_abs_f64 (x) > Thres);
-
- /* n = round(x/(ln2/N)). */
- z = v_fma_f64 (x, InvLn2, Shift);
- u = v_as_u64_f64 (z);
- n = z - Shift;
-
- /* r = x - n*ln2/N. */
- r = x;
- r = v_fma_f64 (-Ln2hi, n, r);
- r = v_fma_f64 (-Ln2lo, n, r);
-
- e = u << (52 - V_EXP_TAIL_TABLE_BITS);
- i = u & IndexMask;
-
- /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */
- y = v_fma_f64 (C3, r, C2);
- y = v_fma_f64 (y, r, C1);
- y = v_fma_f64 (y, r, v_f64 (1.0));
- y = v_fma_f64 (y, r, xtail);
-
- /* s = 2^(n/N). */
- u = v_lookup_u64 (Tab, i);
- s = v_as_f64_u64 (u + e);
-
- if (unlikely (v_any_u64 (cmp)))
- return specialcase (s, y, n);
- return v_fma_f64 (y, s, s);
-}
-#endif
diff --git a/pl/math/v_exp_tail_data.c b/pl/math/v_exp_tail_data.c
index 675eb769bf07..989dd41d949a 100644
--- a/pl/math/v_exp_tail_data.c
+++ b/pl/math/v_exp_tail_data.c
@@ -1,5 +1,5 @@
/*
- * Lookup table for double-precision e^(x+tail) vector function.
+ * Lookup table for double-precision e^x vector function.
*
* Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
@@ -7,91 +7,92 @@
#include "math_config.h"
-/* 2^(j/N), j=0..N (where N = 256). */
-const uint64_t __v_exp_tail_data[]
- = {0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
- 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
- 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
- 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
- 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
- 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
- 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
- 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
- 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
- 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
- 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
- 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
- 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
- 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
- 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
- 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
- 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
- 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
- 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
- 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
- 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
- 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
- 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
- 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
- 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
- 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
- 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
- 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
- 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
- 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
- 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
- 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
- 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
- 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
- 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
- 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
- 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
- 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
- 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
- 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
- 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
- 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
- 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
- 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
- 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
- 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
- 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
- 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
- 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
- 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
- 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
- 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
- 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
- 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
- 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
- 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
- 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
- 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
- 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
- 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
- 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
- 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
- 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
- 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
- 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
- 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
- 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
- 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
- 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
- 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
- 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
- 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
- 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
- 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
- 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
- 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
- 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
- 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
- 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
- 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
- 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
- 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
- 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
- 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
- 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
- 0x3feff9d96b2a23d9};
+/* 2^(j/N), j=0..N-1, N=2^8=256. Copied from math/v_exp_data.c. */
+const uint64_t __v_exp_tail_data[] = {
+ 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335,
+ 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc,
+ 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574,
+ 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836,
+ 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383,
+ 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85,
+ 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2,
+ 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e,
+ 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc,
+ 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e,
+ 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b,
+ 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f,
+ 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4,
+ 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027,
+ 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6,
+ 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1,
+ 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f,
+ 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29,
+ 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1,
+ 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f,
+ 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56,
+ 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd,
+ 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff,
+ 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b,
+ 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866,
+ 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4,
+ 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422,
+ 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024,
+ 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897,
+ 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232,
+ 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0,
+ 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7,
+ 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d,
+ 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee,
+ 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82,
+ 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2,
+ 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd,
+ 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03,
+ 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148,
+ 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4,
+ 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320,
+ 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6,
+ 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd,
+ 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645,
+ 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484,
+ 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a,
+ 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9,
+ 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6,
+ 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132,
+ 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491,
+ 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13,
+ 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21,
+ 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699,
+ 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778,
+ 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736,
+ 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2,
+ 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f,
+ 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2,
+ 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090,
+ 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e,
+ 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33,
+ 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052,
+ 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf,
+ 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774,
+ 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666,
+ 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1,
+ 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47,
+ 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f,
+ 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09,
+ 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c,
+ 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b,
+ 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db,
+ 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa,
+ 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968,
+ 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487,
+ 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075,
+ 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460,
+ 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17,
+ 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6,
+ 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740,
+ 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1,
+ 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a,
+ 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540,
+ 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89,
+ 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1,
+ 0x3feff9d96b2a23d9,
+};
diff --git a/pl/math/v_exp_tail_inline.h b/pl/math/v_exp_tail_inline.h
new file mode 100644
index 000000000000..76ecc6b0a33a
--- /dev/null
+++ b/pl/math/v_exp_tail_inline.h
@@ -0,0 +1,102 @@
+/*
+ * Double-precision vector e^(x+tail) function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+#ifndef PL_MATH_V_EXP_TAIL_INLINE_H
+#define PL_MATH_V_EXP_TAIL_INLINE_H
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+#ifndef WANT_V_EXP_TAIL_SPECIALCASE
+#error \
+ "Cannot use v_exp_tail_inline.h without specifying whether you need the special case computation."
+#endif
+
+#define N (1 << V_EXP_TAIL_TABLE_BITS)
+
+static const struct data
+{
+ float64x2_t poly[4];
+#if WANT_V_EXP_TAIL_SPECIALCASE
+ float64x2_t big_bound, huge_bound;
+#endif
+ float64x2_t shift, invln2, ln2_hi, ln2_lo;
+} data = {
+#if WANT_V_EXP_TAIL_SPECIALCASE
+ .big_bound = V2 (704.0),
+ .huge_bound = V2 (1280.0 * N),
+#endif
+ .shift = V2 (0x1.8p52),
+ .invln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */
+ .ln2_hi = V2 (0x1.62e42fefa39efp-9), /* ln2/N. */
+ .ln2_lo = V2 (0x1.abc9e3b39803f3p-64),
+ .poly = { V2 (1.0), V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3),
+ V2 (0x1.5555576a59599p-5) },
+};
+
+static inline uint64x2_t
+lookup_sbits (uint64x2_t i)
+{
+ return (uint64x2_t){__v_exp_tail_data[i[0]], __v_exp_tail_data[i[1]]};
+}
+
+#if WANT_V_EXP_TAIL_SPECIALCASE
+#define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* The following two biases, when combined, form the exponent bias:
+ SpecialBias1 - SpecialBias2 = asuint64(1.0). */
+#define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
+#define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
+static float64x2_t VPCS_ATTR
+v_exp_tail_special_case (float64x2_t s, float64x2_t y, float64x2_t n,
+ const struct data *d)
+{
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vclezq_f64 (n), SpecialOffset);
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (
+ vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+ uint64x2_t oflow = vcagtq_f64 (n, d->huge_bound);
+ float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ return vbslq_f64 (oflow, r1, r0);
+}
+#endif
+
+static inline float64x2_t VPCS_ATTR
+v_exp_tail_inline (float64x2_t x, float64x2_t xtail)
+{
+ const struct data *d = ptr_barrier (&data);
+#if WANT_V_EXP_TAIL_SPECIALCASE
+ uint64x2_t special = vcgtq_f64 (vabsq_f64 (x), d->big_bound);
+#endif
+ /* n = round(x/(ln2/N)). */
+ float64x2_t z = vfmaq_f64 (d->shift, x, d->invln2);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ /* r = x - n*ln2/N. */
+ float64x2_t r = x;
+ r = vfmsq_f64 (r, d->ln2_hi, n);
+ r = vfmsq_f64 (r, d->ln2_lo, n);
+
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS);
+ uint64x2_t i = vandq_u64 (u, v_u64 (N - 1));
+
+ /* y = tail + exp(r) - 1 ~= tail + r + C1 r^2 + C2 r^3 + C3 r^4, using Horner. */
+ float64x2_t y = v_horner_3_f64 (r, d->poly);
+ y = vfmaq_f64 (xtail, y, r);
+
+ /* s = 2^(n/N). */
+ u = lookup_sbits (i);
+ float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+#if WANT_V_EXP_TAIL_SPECIALCASE
+ if (unlikely (v_any_u64 (special)))
+ return v_exp_tail_special_case (s, y, n, d);
+#endif
+ return vfmaq_f64 (s, y, s);
+}
+#endif // PL_MATH_V_EXP_TAIL_INLINE_H
diff --git a/pl/math/v_expf.c b/pl/math/v_expf.c
deleted file mode 100644
index a422e69feb62..000000000000
--- a/pl/math/v_expf.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Single-precision vector e^x function.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "v_math.h"
-#include "mathlib.h"
-#if V_SUPPORTED
-
-static const float Poly[] = {
- /* maxerr: 1.45358 +0.5 ulp. */
- 0x1.0e4020p-7f,
- 0x1.573e2ep-5f,
- 0x1.555e66p-3f,
- 0x1.fffdb6p-2f,
- 0x1.ffffecp-1f,
-};
-#define C0 v_f32 (Poly[0])
-#define C1 v_f32 (Poly[1])
-#define C2 v_f32 (Poly[2])
-#define C3 v_f32 (Poly[3])
-#define C4 v_f32 (Poly[4])
-
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define Ln2hi v_f32 (0x1.62e4p-1f)
-#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
-
-VPCS_ATTR
-static v_f32_t
-specialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
-{
- /* 2^n may overflow, break it up into s1*s2. */
- v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
- v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
- v_f32_t s2 = v_as_f32_u32 (e - b);
- v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
- v_u32_t r2 = v_as_u32_f32 (s1 * s1);
- v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
- /* Similar to r1 but avoids double rounding in the subnormal range. */
- v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
- return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
-}
-
-VPCS_ATTR
-v_f32_t
-V_NAME(expf) (v_f32_t x)
-{
- v_f32_t n, r, r2, scale, p, q, poly, absn, z;
- v_u32_t cmp, e;
-
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
-#if 1
- z = v_fma_f32 (x, InvLn2, Shift);
- n = z - Shift;
- r = v_fma_f32 (n, -Ln2hi, x);
- r = v_fma_f32 (n, -Ln2lo, r);
- e = v_as_u32_f32 (z) << 23;
-#else
- z = x * InvLn2;
- n = v_round_f32 (z);
- r = v_fma_f32 (n, -Ln2hi, x);
- r = v_fma_f32 (n, -Ln2lo, r);
- e = v_as_u32_s32 (v_round_s32 (z)) << 23;
-#endif
- scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
- absn = v_abs_f32 (n);
- cmp = v_cond_u32 (absn > v_f32 (126.0f));
- r2 = r * r;
- p = v_fma_f32 (C0, r, C1);
- q = v_fma_f32 (C2, r, C3);
- q = v_fma_f32 (p, r2, q);
- p = C4 * r;
- poly = v_fma_f32 (q, r2, p);
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (poly, n, e, absn, cmp, scale);
- return v_fma_f32 (poly, scale, scale);
-}
-VPCS_ALIAS
-#endif
diff --git a/pl/math/v_expf_inline.h b/pl/math/v_expf_inline.h
new file mode 100644
index 000000000000..166683726b4d
--- /dev/null
+++ b/pl/math/v_expf_inline.h
@@ -0,0 +1,60 @@
+/*
+ * Helper for single-precision routines which calculate exp(x) and do not
+ * need special-case handling
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#ifndef PL_MATH_V_EXPF_INLINE_H
+#define PL_MATH_V_EXPF_INLINE_H
+
+#include "v_math.h"
+
+struct v_expf_data
+{
+ float32x4_t poly[5];
+ float32x4_t shift, invln2_and_ln2;
+};
+
+/* maxerr: 1.45358 +0.5 ulp. */
+#define V_EXPF_DATA \
+ { \
+ .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \
+ V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \
+ .shift = V4 (0x1.8p23f), \
+ .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ }
+
+#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */
+#define C(i) d->poly[i]
+
+static inline float32x4_t
+v_expf_inline (float32x4_t x, const struct v_expf_data *d)
+{
+ /* Helper routine for calculating exp(x).
+ Copied from v_expf.c, with all special-case handling removed - the
+ calling routine should handle special values if required. */
+
+ /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t n, r, z;
+ z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0);
+ n = vsubq_f32 (z, d->shift);
+ r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1);
+ r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+
+ /* Custom order-4 Estrin avoids building high order monomial. */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p, q, poly;
+ p = vfmaq_f32 (C (1), C (0), r);
+ q = vfmaq_f32 (C (3), C (2), r);
+ q = vfmaq_f32 (q, p, r2);
+ p = vmulq_f32 (C (4), r);
+ poly = vfmaq_f32 (p, q, r2);
+ return vfmaq_f32 (scale, poly, scale);
+}
+
+#endif // PL_MATH_V_EXPF_INLINE_H
diff --git a/pl/math/v_expm1_2u5.c b/pl/math/v_expm1_2u5.c
index 4b491d17feef..dd255472cec0 100644
--- a/pl/math/v_expm1_2u5.c
+++ b/pl/math/v_expm1_2u5.c
@@ -6,65 +6,73 @@
*/
#include "v_math.h"
+#include "poly_advsimd_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
-#if V_SUPPORTED
-
-#define InvLn2 v_f64 (0x1.71547652b82fep0)
-#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1)
-#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56)
-#define Shift v_f64 (0x1.8p52)
-#define TinyBound \
- 0x3cc0000000000000 /* 0x1p-51, below which expm1(x) is within 2 ULP of x. */
-#define SpecialBound \
- 0x40862b7d369a5aa9 /* 0x1.62b7d369a5aa9p+9. For |x| > SpecialBound, the \
- final stage of the algorithm overflows so fall back to \
- scalar. */
-#define AbsMask 0x7fffffffffffffff
-#define One 0x3ff0000000000000
-
-#define C(i) v_f64 (__expm1_poly[i])
-
-static inline v_f64_t
-eval_poly (v_f64_t f, v_f64_t f2)
+static const struct data
{
- /* Evaluate custom polynomial using Estrin scheme. */
- v_f64_t p_01 = v_fma_f64 (f, C (1), C (0));
- v_f64_t p_23 = v_fma_f64 (f, C (3), C (2));
- v_f64_t p_45 = v_fma_f64 (f, C (5), C (4));
- v_f64_t p_67 = v_fma_f64 (f, C (7), C (6));
- v_f64_t p_89 = v_fma_f64 (f, C (9), C (8));
-
- v_f64_t p_03 = v_fma_f64 (f2, p_23, p_01);
- v_f64_t p_47 = v_fma_f64 (f2, p_67, p_45);
- v_f64_t p_8a = v_fma_f64 (f2, C (10), p_89);
+ float64x2_t poly[11];
+ float64x2_t invln2, ln2, shift;
+ int64x2_t exponent_bias;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t thresh, tiny_bound;
+#else
+ float64x2_t oflow_bound;
+#endif
+} data = {
+ /* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
+ V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
+ V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
+ V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
+ V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
+ .invln2 = V2 (0x1.71547652b82fep0),
+ .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },
+ .shift = V2 (0x1.8p52),
+ .exponent_bias = V2 (0x3ff0000000000000),
+#if WANT_SIMD_EXCEPT
+ /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs
+ compare. */
+ .thresh = V2 (0x78c56fa6d34b552),
+ /* asuint64(0x1p-51) << 1. */
+ .tiny_bound = V2 (0x3cc0000000000000 << 1),
+#else
+ /* Value above which expm1(x) should overflow. Absolute value of the
+ underflow bound is greater than this, so it catches both cases - there is
+ a small window where fallbacks are triggered unnecessarily. */
+ .oflow_bound = V2 (0x1.62b7d369a5aa9p+9),
+#endif
+};
- v_f64_t f4 = f2 * f2;
- v_f64_t p_07 = v_fma_f64 (f4, p_47, p_03);
- return v_fma_f64 (f4 * f4, p_8a, p_07);
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
+{
+ return v_call_f64 (expm1, x, y, special);
}
/* Double-precision vector exp(x) - 1 function.
 The maximum observed error is 2.18 ULP:
- __v_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
- want 0x1.a8b9ea8d66e2p-2. */
-VPCS_ATTR
-v_f64_t V_NAME (expm1) (v_f64_t x)
+ _ZGVnN2v_expm1 (0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
+ want 0x1.a8b9ea8d66e2p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
{
- v_u64_t ix = v_as_u64_f64 (x);
- v_u64_t ax = ix & AbsMask;
+ const struct data *d = ptr_barrier (&data);
+
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
#if WANT_SIMD_EXCEPT
- /* If fp exceptions are to be triggered correctly, fall back to the scalar
- variant for all lanes if any of them should trigger an exception. */
- v_u64_t special = v_cond_u64 ((ax >= SpecialBound) | (ax <= TinyBound));
+ /* If fp exceptions are to be triggered correctly, fall back to scalar for
+ |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for
+ shift-left by 1, and compare with thresh which was left-shifted offline -
+ this is effectively an absolute compare. */
+ uint64x2_t special
+ = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh);
if (unlikely (v_any_u64 (special)))
- return v_call_f64 (expm1, x, x, v_u64 (-1));
+ x = v_zerofy_f64 (x, special);
#else
/* Large input, NaNs and Infs. */
- v_u64_t special
- = v_cond_u64 ((ax >= SpecialBound) | (ix == 0x8000000000000000));
+ uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
#endif
/* Reduce argument to smaller range:
@@ -72,42 +80,39 @@ v_f64_t V_NAME (expm1) (v_f64_t x)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
- v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift;
- v_s64_t i = v_to_s64_f64 (j);
- v_f64_t f = v_fma_f64 (j, MLn2hi, x);
- f = v_fma_f64 (j, MLn2lo, f);
+ float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
+ int64x2_t i = vcvtq_s64_f64 (n);
+ float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0);
+ f = vfmsq_laneq_f64 (f, n, d->ln2, 1);
/* Approximate expm1(f) using polynomial.
Taylor expansion for expm1(x) has the form:
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- v_f64_t f2 = f * f;
- v_f64_t p = v_fma_f64 (f2, eval_poly (f, f2), f);
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t f8 = vmulq_f64 (f4, f4);
+ float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
/* Assemble the result.
expm1(x) ~= 2^i * (p + 1) - 1
Let t = 2^i. */
- v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One);
- /* expm1(x) ~= p * t + (t - 1). */
- v_f64_t y = v_fma_f64 (p, t, t - 1);
+ int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias);
+ float64x2_t t = vreinterpretq_f64_s64 (u);
-#if !WANT_SIMD_EXCEPT
if (unlikely (v_any_u64 (special)))
- return v_call_f64 (expm1, x, y, special);
-#endif
+ return special_case (vreinterpretq_f64_u64 (ix),
+ vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t),
+ special);
- return y;
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
}
-VPCS_ALIAS
PL_SIG (V, D, 1, expm1, -9.9, 9.9)
-PL_TEST_ULP (V_NAME (expm1), 1.68)
-PL_TEST_EXPECT_FENV (V_NAME (expm1), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (expm1), 0, 0x1p-51, 1000)
-PL_TEST_INTERVAL (V_NAME (expm1), -0, -0x1p-51, 1000)
-PL_TEST_INTERVAL (V_NAME (expm1), 0x1p-51, 0x1.63108c75a1937p+9, 100000)
-PL_TEST_INTERVAL (V_NAME (expm1), -0x1p-51, -0x1.740bf7c0d927dp+9, 100000)
-PL_TEST_INTERVAL (V_NAME (expm1), 0x1.63108c75a1937p+9, inf, 100)
-PL_TEST_INTERVAL (V_NAME (expm1), -0x1.740bf7c0d927dp+9, -inf, 100)
-#endif
+PL_TEST_ULP (V_NAME_D1 (expm1), 1.68)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (expm1), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0, 0x1p-51, 1000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1p-51, 0x1.62b7d369a5aa9p+9, 100000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (expm1), 0x1.62b7d369a5aa9p+9, inf, 100)
diff --git a/pl/math/v_expm1f_1u6.c b/pl/math/v_expm1f_1u6.c
index ab132427e58d..6b282d0cc00f 100644
--- a/pl/math/v_expm1f_1u6.c
+++ b/pl/math/v_expm1f_1u6.c
@@ -6,44 +6,71 @@
*/
#include "v_math.h"
+#include "poly_advsimd_f32.h"
#include "pl_sig.h"
#include "pl_test.h"
-#if V_SUPPORTED
+static const struct data
+{
+ float32x4_t poly[5];
+ float32x4_t invln2_and_ln2;
+ float32x4_t shift;
+ int32x4_t exponent_bias;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t thresh;
+#else
+ float32x4_t oflow_bound;
+#endif
+} data = {
+ /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
+ .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
+ V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
+ /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
+ .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
+ .shift = V4 (0x1.8p23f),
+ .exponent_bias = V4 (0x3f800000),
+#if !WANT_SIMD_EXCEPT
+ /* Value above which expm1f(x) should overflow. Absolute value of the
+ underflow bound is greater than this, so it catches both cases - there is
+ a small window where fallbacks are triggered unnecessarily. */
+ .oflow_bound = V4 (0x1.5ebc4p+6),
+#else
+ /* asuint(oflow_bound) - asuint(0x1p-23), shifted left by 1 for absolute
+ compare. */
+ .thresh = V4 (0x1d5ebc40),
+#endif
+};
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define MLn2hi v_f32 (-0x1.62e4p-1f)
-#define MLn2lo v_f32 (-0x1.7f7d1cp-20f)
-#define AbsMask (0x7fffffff)
-#define One (0x3f800000)
-#define SpecialBound \
- (0x42af5e20) /* asuint(0x1.5ebc4p+6). Largest value of x for which expm1(x) \
- should round to -1. */
-#define TinyBound (0x34000000) /* asuint(0x1p-23). */
+/* asuint(0x1p-23), shifted by 1 for abs compare. */
+#define TinyBound v_u32 (0x34000000 << 1)
-#define C(i) v_f32 (__expm1f_poly[i])
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+{
+ return v_call_f32 (expm1f, x, y, special);
+}
/* Single-precision vector exp(x) - 1 function.
The maximum error is 1.51 ULP:
- expm1f(0x1.8baa96p-2) got 0x1.e2fb9p-2
- want 0x1.e2fb94p-2. */
-VPCS_ATTR
-v_f32_t V_NAME (expm1f) (v_f32_t x)
+ _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
+ want 0x1.e2fb94p-2. */
+float32x4_t VPCS_ATTR V_NAME_F1 (expm1) (float32x4_t x)
{
- v_u32_t ix = v_as_u32_f32 (x);
- v_u32_t ax = ix & AbsMask;
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
#if WANT_SIMD_EXCEPT
- /* If fp exceptions are to be triggered correctly, fall back to the scalar
- variant for all lanes if any of them should trigger an exception. */
- v_u32_t special
- = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000) | (ax < TinyBound));
+ /* If fp exceptions are to be triggered correctly, fall back to scalar for
+ |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
+ shift-left by 1, and compare with thresh which was left-shifted offline -
+ this is effectively an absolute compare. */
+ uint32x4_t special
+ = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
if (unlikely (v_any_u32 (special)))
- return v_call_f32 (expm1f, x, x, v_u32 (0xffffffff));
+ x = v_zerofy_f32 (x, special);
#else
- /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf and -0. */
- v_u32_t special = v_cond_u32 ((ax >= SpecialBound) | (ix == 0x80000000));
+ /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
+ uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
#endif
/* Reduce argument to smaller range:
@@ -51,44 +78,40 @@ v_f32_t V_NAME (expm1f) (v_f32_t x)
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where 2^i is exact because i is an integer. */
- v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift;
- v_s32_t i = v_to_s32_f32 (j);
- v_f32_t f = v_fma_f32 (j, MLn2hi, x);
- f = v_fma_f32 (j, MLn2lo, f);
+ float32x4_t j = vsubq_f32 (
+ vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
+ int32x4_t i = vcvtq_s32_f32 (j);
+ float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
+ f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
/* Approximate expm1(f) using polynomial.
Taylor expansion for expm1(x) has the form:
x + ax^2 + bx^3 + cx^4 ....
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
-
- v_f32_t p = v_fma_f32 (C (4), f, C (3));
- p = v_fma_f32 (p, f, C (2));
- p = v_fma_f32 (p, f, C (1));
- p = v_fma_f32 (p, f, C (0));
- p = v_fma_f32 (f * f, p, f);
+ float32x4_t p = v_horner_4_f32 (f, d->poly);
+ p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
/* Assemble the result.
expm1(x) ~= 2^i * (p + 1) - 1
Let t = 2^i. */
- v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One);
- /* expm1(x) ~= p * t + (t - 1). */
- v_f32_t y = v_fma_f32 (p, t, t - 1);
+ int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
+ float32x4_t t = vreinterpretq_f32_s32 (u);
-#if !WANT_SIMD_EXCEPT
if (unlikely (v_any_u32 (special)))
- return v_call_f32 (expm1f, x, y, special);
-#endif
+ return special_case (vreinterpretq_f32_u32 (ix),
+ vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
+ special);
- return y;
+ /* expm1(x) ~= p * t + (t - 1). */
+ return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
}
-VPCS_ALIAS
PL_SIG (V, F, 1, expm1, -9.9, 9.9)
-PL_TEST_ULP (V_NAME (expm1f), 1.02)
-PL_TEST_EXPECT_FENV (V_NAME (expm1f), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (expm1f), 0, 0x1p-23, 1000)
-PL_TEST_INTERVAL (V_NAME (expm1f), -0, -0x1p-23, 1000)
-PL_TEST_INTERVAL (V_NAME (expm1f), 0x1p-23, 0x1.644716p6, 1000000)
-PL_TEST_INTERVAL (V_NAME (expm1f), -0x1p-23, -0x1.9bbabcp+6, 1000000)
-#endif
+PL_TEST_ULP (V_NAME_F1 (expm1), 1.02)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (expm1), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (expm1), 0, 0x1p-23, 1000)
+PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, 0x1.5ebc4p+6, 1000000)
+PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1p-23, -0x1.9bbabcp+6, 1000000)
+PL_TEST_INTERVAL (V_NAME_F1 (expm1), 0x1.5ebc4p+6, inf, 1000)
+PL_TEST_INTERVAL (V_NAME_F1 (expm1), -0x1.9bbabcp+6, -inf, 1000)
diff --git a/pl/math/v_expm1f_inline.h b/pl/math/v_expm1f_inline.h
index c261941ebed6..6ae94c452de2 100644
--- a/pl/math/v_expm1f_inline.h
+++ b/pl/math/v_expm1f_inline.h
@@ -11,39 +11,53 @@
#include "v_math.h"
#include "math_config.h"
-#include "estrinf.h"
+#include "poly_advsimd_f32.h"
-#define One 0x3f800000
-#define Shift v_f32 (0x1.8p23f)
-#define InvLn2 v_f32 (0x1.715476p+0f)
-#define MLn2hi v_f32 (-0x1.62e4p-1f)
-#define MLn2lo v_f32 (-0x1.7f7d1cp-20f)
-
-#define C(i) v_f32 (__expm1f_poly[i])
-
-static inline v_f32_t
-expm1f_inline (v_f32_t x)
+struct v_expm1f_data
+{
+ float32x4_t poly[5];
+ float32x4_t invln2_and_ln2, shift;
+ int32x4_t exponent_bias;
+};
+
+/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
+ log(2)/2]. Exponent bias is asuint(1.0f).
+ invln2_and_ln2 stores constants: invln2, ln2_hi, ln2_lo, 0. */
+#define V_EXPM1F_DATA \
+ { \
+ .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
+ V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
+ .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
+ .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ }
+
+static inline float32x4_t
+expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
{
/* Helper routine for calculating exp(x) - 1.
Copied from v_expm1f_1u6.c, with all special-case handling removed - the
calling routine should handle special values if required. */
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- v_f32_t j = v_fma_f32 (InvLn2, x, Shift) - Shift;
- v_s32_t i = v_to_s32_f32 (j);
- v_f32_t f = v_fma_f32 (j, MLn2hi, x);
- f = v_fma_f32 (j, MLn2lo, f);
+ float32x4_t j = vsubq_f32 (
+ vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
+ int32x4_t i = vcvtq_s32_f32 (j);
+ float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
+ f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
- Uses Estrin scheme, where the main __v_expm1f routine uses Horner. */
- v_f32_t f2 = f * f;
- v_f32_t p = ESTRIN_4 (f, f2, f2 * f2, C);
- p = v_fma_f32 (f2, p, f);
+ Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
+ Horner. */
+ float32x4_t f2 = vmulq_f32 (f, f);
+ float32x4_t f4 = vmulq_f32 (f2, f2);
+ float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
+ p = vfmaq_f32 (f, f2, p);
/* t = 2^i. */
- v_f32_t t = v_as_f32_u32 (v_as_u32_s32 (i << 23) + One);
+ int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
+ float32x4_t t = vreinterpretq_f32_s32 (u);
/* expm1(x) ~= p * t + (t - 1). */
- return v_fma_f32 (p, t, t - 1);
+ return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
}
#endif // PL_MATH_V_EXPM1F_INLINE_H
diff --git a/pl/math/v_hypot_1u5.c b/pl/math/v_hypot_1u5.c
new file mode 100644
index 000000000000..d4ff7be89a8f
--- /dev/null
+++ b/pl/math/v_hypot_1u5.c
@@ -0,0 +1,95 @@
+/*
+ * Double-precision vector hypot(x, y) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if WANT_SIMD_EXCEPT
+static const struct data
+{
+ uint64x2_t tiny_bound, thres;
+} data = {
+ .tiny_bound = V2 (0x2000000000000000), /* asuint (0x1p-511). */
+ .thres = V2 (0x3fe0000000000000), /* asuint (0x1p511) - tiny_bound. */
+};
+#else
+static const struct data
+{
+ uint64x2_t tiny_bound;
+ uint32x4_t thres;
+} data = {
+ .tiny_bound = V2 (0x0360000000000000), /* asuint (0x1p-969). */
+ .thres = V4 (0x7c900000), /* asuint (inf) - tiny_bound. */
+};
+#endif
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, float64x2_t sqsum,
+ uint32x2_t special)
+{
+ return v_call2_f64 (hypot, x, y, vsqrtq_f64 (sqsum), vmovl_u32 (special));
+}
+
+/* Vector implementation of double-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVnN2vv_hypot (0x1.6a1b193ff85b5p-204, 0x1.bc50676c2a447p-222)
+ got 0x1.6a1b19400964ep-204
+ want 0x1.6a1b19400964dp-204. */
+#if WANT_SIMD_EXCEPT
+
+float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ float64x2_t ay = vabsq_f64 (y);
+
+ uint64x2_t ix = vreinterpretq_u64_f64 (ax);
+ uint64x2_t iy = vreinterpretq_u64_f64 (ay);
+
+ /* Extreme values, NaNs, and infinities should be handled by the scalar
+ fallback for correct flag handling. */
+ uint64x2_t specialx = vcgeq_u64 (vsubq_u64 (ix, d->tiny_bound), d->thres);
+ uint64x2_t specialy = vcgeq_u64 (vsubq_u64 (iy, d->tiny_bound), d->thres);
+ ax = v_zerofy_f64 (ax, specialx);
+ ay = v_zerofy_f64 (ay, specialy);
+ uint32x2_t special = vaddhn_u64 (specialx, specialy);
+
+ float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (ax, ax), ay, ay);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f64 (sqsum);
+}
+#else
+
+float64x2_t VPCS_ATTR V_NAME_D2 (hypot) (float64x2_t x, float64x2_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t sqsum = vfmaq_f64 (vmulq_f64 (x, x), y, y);
+
+ uint32x2_t special = vcge_u32 (
+ vsubhn_u64 (vreinterpretq_u64_f64 (sqsum), d->tiny_bound),
+ vget_low_u32 (d->thres));
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f64 (sqsum);
+}
+#endif
+
+PL_SIG (V, D, 2, hypot, -10.0, 10.0)
+PL_TEST_ULP (V_NAME_D2 (hypot), 1.21)
+PL_TEST_EXPECT_FENV (V_NAME_D2 (hypot), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, 0, inf, 10000)
+PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), 0, inf, -0, -inf, 10000)
+PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, 0, inf, 10000)
+PL_TEST_INTERVAL2 (V_NAME_D2 (hypot), -0, -inf, -0, -inf, 10000)
diff --git a/pl/math/v_hypotf_1u5.c b/pl/math/v_hypotf_1u5.c
new file mode 100644
index 000000000000..3227b0a3fd8b
--- /dev/null
+++ b/pl/math/v_hypotf_1u5.c
@@ -0,0 +1,94 @@
+/*
+ * Single-precision vector hypot(x, y) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+#if WANT_SIMD_EXCEPT
+static const struct data
+{
+ uint32x4_t tiny_bound, thres;
+} data = {
+ .tiny_bound = V4 (0x20000000), /* asuint (0x1p-63). */
+ .thres = V4 (0x3f000000), /* asuint (0x1p63) - tiny_bound. */
+};
+#else
+static const struct data
+{
+ uint32x4_t tiny_bound;
+ uint16x8_t thres;
+} data = {
+ .tiny_bound = V4 (0x0C800000), /* asuint (0x1p-102). */
+ .thres = V8 (0x7300), /* asuint (inf) - tiny_bound. */
+};
+#endif
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum,
+ uint16x4_t special)
+{
+ return v_call2_f32 (hypotf, x, y, vsqrtq_f32 (sqsum), vmovl_u16 (special));
+}
+
+/* Vector implementation of single-precision hypot.
+ Maximum error observed is 1.21 ULP:
+ _ZGVnN4vv_hypotf (0x1.6a419cp-13, 0x1.82a852p-22) got 0x1.6a41d2p-13
+ want 0x1.6a41dp-13. */
+#if WANT_SIMD_EXCEPT
+
+float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ay = vabsq_f32 (y);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (ax);
+ uint32x4_t iy = vreinterpretq_u32_f32 (ay);
+
+ /* Extreme values, NaNs, and infinities should be handled by the scalar
+ fallback for correct flag handling. */
+ uint32x4_t specialx = vcgeq_u32 (vsubq_u32 (ix, d->tiny_bound), d->thres);
+ uint32x4_t specialy = vcgeq_u32 (vsubq_u32 (iy, d->tiny_bound), d->thres);
+ ax = v_zerofy_f32 (ax, specialx);
+ ay = v_zerofy_f32 (ay, specialy);
+ uint16x4_t special = vaddhn_u32 (specialx, specialy);
+
+ float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay);
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f32 (sqsum);
+}
+#else
+
+float32x4_t VPCS_ATTR V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y);
+
+ uint16x4_t special = vcge_u16 (
+ vsubhn_u32 (vreinterpretq_u32_f32 (sqsum), d->tiny_bound),
+ vget_low_u16 (d->thres));
+
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, y, sqsum, special);
+
+ return vsqrtq_f32 (sqsum);
+}
+#endif
+
+PL_SIG (V, F, 2, hypot, -10.0, 10.0)
+PL_TEST_ULP (V_NAME_F2 (hypot), 1.21)
+PL_TEST_EXPECT_FENV (V_NAME_F2 (hypot), WANT_SIMD_EXCEPT)
+PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, 0, inf, 10000)
+PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), 0, inf, -0, -inf, 10000)
+PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, 0, inf, 10000)
+PL_TEST_INTERVAL2 (V_NAME_F2 (hypot), -0, -inf, -0, -inf, 10000)
diff --git a/pl/math/v_log10_2u5.c b/pl/math/v_log10_2u5.c
index 86d398ca13a9..35dd62fe5e3e 100644
--- a/pl/math/v_log10_2u5.c
+++ b/pl/math/v_log10_2u5.c
@@ -6,105 +6,115 @@
*/
#include "v_math.h"
-#include "include/mathlib.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_advsimd_f64.h"
-#if V_SUPPORTED
-
-#define A(i) v_f64 (__v_log10_data.poly[i])
-#define T(s, i) __v_log10_data.tab[i].s
-#define Ln2 v_f64 (0x1.62e42fefa39efp-1)
#define N (1 << V_LOG10_TABLE_BITS)
-#define OFF v_u64 (0x3fe6900900000000)
+
+static const struct data
+{
+ uint64x2_t min_norm;
+ uint32x4_t special_bound;
+ float64x2_t poly[5];
+ float64x2_t invln10, log10_2, ln2;
+ uint64x2_t sign_exp_mask;
+} data = {
+ /* Computed from log coefficients divided by log(10) then rounded to double
+ precision. */
+ .poly = { V2 (-0x1.bcb7b1526e506p-3), V2 (0x1.287a7636be1d1p-3),
+ V2 (-0x1.bcb7b158af938p-4), V2 (0x1.63c78734e6d07p-4),
+ V2 (-0x1.287461742fee4p-4) },
+ .ln2 = V2 (0x1.62e42fefa39efp-1),
+ .invln10 = V2 (0x1.bcb7b1526e50ep-2),
+ .log10_2 = V2 (0x1.34413509f79ffp-2),
+ .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
+ .sign_exp_mask = V2 (0xfff0000000000000),
+};
+
+#define Off v_u64 (0x3fe6900900000000)
+#define IndexMask (N - 1)
+
+#define T(s, i) __v_log10_data.s[i]
struct entry
{
- v_f64_t invc;
- v_f64_t log10c;
+ float64x2_t invc;
+ float64x2_t log10c;
};
static inline struct entry
-lookup (v_u64_t i)
+lookup (uint64x2_t i)
{
struct entry e;
-#ifdef SCALAR
- e.invc = T (invc, i);
- e.log10c = T (log10c, i);
-#else
- e.invc[0] = T (invc, i[0]);
- e.log10c[0] = T (log10c, i[0]);
- e.invc[1] = T (invc, i[1]);
- e.log10c[1] = T (log10c, i[1]);
-#endif
+ uint64_t i0 = (i[0] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (i[1] >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.log10c = vuzp2q_f64 (e0, e1);
return e;
}
-VPCS_ATTR
-inline static v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
+ uint32x2_t special)
{
- return v_call_f64 (log10, x, y, cmp);
+ return v_call_f64 (log10, x, vfmaq_f64 (hi, r2, y), vmovl_u32 (special));
}
-/* Our implementation of v_log10 is a slight modification of v_log (1.660ulps).
+/* Fast implementation of double-precision vector log10
+ is a slight modification of double-precision vector log.
Max ULP error: < 2.5 ulp (nearest rounding.)
Maximum measured at 2.46 ulp for x in [0.96, 0.97]
- __v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
- want 0x1.fff6be3cae4b9p-6
- -0.459999 ulp err 1.96. */
-VPCS_ATTR
-v_f64_t V_NAME (log10) (v_f64_t x)
+ _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
+ want 0x1.fff6be3cae4b9p-6. */
+float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
{
- v_f64_t z, r, r2, p, y, kd, hi;
- v_u64_t ix, iz, tmp, top, i, cmp;
- v_s64_t k;
- struct entry e;
-
- ix = v_as_u64_f64 (x);
- top = ix >> 48;
- cmp = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010));
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
+ vget_low_u32 (d->special_bound));
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- tmp = ix - OFF;
- i = (tmp >> (52 - V_LOG10_TABLE_BITS)) % N;
- k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */
- iz = ix - (tmp & v_u64 (0xfffULL << 52));
- z = v_as_f64_u64 (iz);
- e = lookup (i);
+ uint64x2_t tmp = vsubq_u64 (ix, Off);
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
+ uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (tmp);
/* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */
- r = v_fma_f64 (z, e.invc, v_f64 (-1.0));
- kd = v_to_f64_s64 (k);
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
/* hi = r / log(10) + log10(c) + k*log10(2).
- Constants in `v_log10_data.c` are computed (in extended precision) as
+ Constants in v_log10_data.c are computed (in extended precision) as
e.log10c := e.logc * ivln10. */
- v_f64_t w = v_fma_f64 (r, v_f64 (__v_log10_data.invln10), e.log10c);
+ float64x2_t w = vfmaq_f64 (e.log10c, r, d->invln10);
/* y = log10(1+r) + n * log10(2). */
- hi = v_fma_f64 (kd, v_f64 (__v_log10_data.log10_2), w);
+ float64x2_t hi = vfmaq_f64 (w, kd, d->log10_2);
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- r2 = r * r;
- y = v_fma_f64 (A (3), r, A (2));
- p = v_fma_f64 (A (1), r, A (0));
- y = v_fma_f64 (A (4), r2, y);
- y = v_fma_f64 (y, r2, p);
- y = v_fma_f64 (y, r2, hi);
-
- if (unlikely (v_any_u64 (cmp)))
- return specialcase (x, y, cmp);
- return y;
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly);
+
+ if (unlikely (v_any_u32h (special)))
+ return special_case (x, y, hi, r2, special);
+ return vfmaq_f64 (hi, r2, y);
}
-VPCS_ALIAS
PL_SIG (V, D, 1, log10, 0.01, 11.1)
-PL_TEST_ULP (V_NAME (log10), 1.97)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10))
-PL_TEST_INTERVAL (V_NAME (log10), 0, 0xffff000000000000, 10000)
-PL_TEST_INTERVAL (V_NAME (log10), 0x1p-4, 0x1p4, 400000)
-PL_TEST_INTERVAL (V_NAME (log10), 0, inf, 400000)
-#endif
+PL_TEST_ULP (V_NAME_D1 (log10), 1.97)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (log10))
+PL_TEST_INTERVAL (V_NAME_D1 (log10), -0.0, -inf, 1000)
+PL_TEST_INTERVAL (V_NAME_D1 (log10), 0, 0x1p-149, 1000)
+PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (V_NAME_D1 (log10), 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (V_NAME_D1 (log10), 1.0, 100, 50000)
+PL_TEST_INTERVAL (V_NAME_D1 (log10), 100, inf, 50000)
diff --git a/pl/math/v_log10_data.c b/pl/math/v_log10_data.c
index fda85c886963..d9a624dab9ce 100644
--- a/pl/math/v_log10_data.c
+++ b/pl/math/v_log10_data.c
@@ -7,161 +7,157 @@
#include "math_config.h"
-#define N (1 << V_LOG10_TABLE_BITS)
-
-/* Algorithm:
+const struct v_log10_data __v_log10_data = {
+ /* Computed from log's coefficients div by log(10) then rounded to double
+ precision. */
+ .poly = { -0x1.bcb7b1526e506p-3, 0x1.287a7636be1d1p-3, -0x1.bcb7b158af938p-4,
+ 0x1.63c78734e6d07p-4, -0x1.287461742fee4p-4 },
+ .invln10 = 0x1.bcb7b1526e50ep-2,
+ .log10_2 = 0x1.34413509f79ffp-2,
+ /* Algorithm:
x = 2^k z
log10(x) = k log10(2) + log10(c) + poly(z/c - 1) / log(10)
-where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,N=128)
-and log(c) and 1/c for the ith subinterval comes from a lookup table:
-
- tab[i].invc = 1/c
- tab[i].log10c = (double)log10(c)
-
-where c is near the center of the subinterval and is chosen by trying several
-floating point invc candidates around 1/center and selecting one for which
-the error in (double)log(c) is minimized (< 0x1p-74), except the subinterval
-that contains 1 and the previous one got tweaked to avoid cancellation.
-NB: invc should be optimized to minimize error in (double)log10(c) instead. */
-const struct v_log10_data __v_log10_data
- = {.tab = {{0x1.6a133d0dec120p+0, -0x1.345825f221684p-3},
- {0x1.6815f2f3e42edp+0, -0x1.2f71a1f0c554ep-3},
- {0x1.661e39be1ac9ep+0, -0x1.2a91fdb30b1f4p-3},
- {0x1.642bfa30ac371p+0, -0x1.25b9260981a04p-3},
- {0x1.623f1d916f323p+0, -0x1.20e7081762193p-3},
- {0x1.60578da220f65p+0, -0x1.1c1b914aeefacp-3},
- {0x1.5e75349dea571p+0, -0x1.1756af5de404dp-3},
- {0x1.5c97fd387a75ap+0, -0x1.12985059c90bfp-3},
- {0x1.5abfd2981f200p+0, -0x1.0de0628f63df4p-3},
- {0x1.58eca051dc99cp+0, -0x1.092ed492e08eep-3},
- {0x1.571e526d9df12p+0, -0x1.0483954caf1dfp-3},
- {0x1.5554d555b3fcbp+0, -0x1.ffbd27a9adbcp-4},
- {0x1.539015e2a20cdp+0, -0x1.f67f7f2e3d1ap-4},
- {0x1.51d0014ee0164p+0, -0x1.ed4e1071ceebep-4},
- {0x1.50148538cd9eep+0, -0x1.e428bb47413c4p-4},
- {0x1.4e5d8f9f698a1p+0, -0x1.db0f6003028d6p-4},
- {0x1.4cab0edca66bep+0, -0x1.d201df6749831p-4},
- {0x1.4afcf1a9db874p+0, -0x1.c9001ac5c9672p-4},
- {0x1.495327136e16fp+0, -0x1.c009f3c78c79p-4},
- {0x1.47ad9e84af28fp+0, -0x1.b71f4cb642e53p-4},
- {0x1.460c47b39ae15p+0, -0x1.ae400818526b2p-4},
- {0x1.446f12b278001p+0, -0x1.a56c091954f87p-4},
- {0x1.42d5efdd720ecp+0, -0x1.9ca3332f096eep-4},
- {0x1.4140cfe001a0fp+0, -0x1.93e56a3f23e55p-4},
- {0x1.3fafa3b421f69p+0, -0x1.8b3292a3903bp-4},
- {0x1.3e225c9c8ece5p+0, -0x1.828a9112d9618p-4},
- {0x1.3c98ec29a211ap+0, -0x1.79ed4ac35f5acp-4},
- {0x1.3b13442a413fep+0, -0x1.715aa51ed28c4p-4},
- {0x1.399156baa3c54p+0, -0x1.68d2861c999e9p-4},
- {0x1.38131639b4cdbp+0, -0x1.6054d40ded21p-4},
- {0x1.36987540fbf53p+0, -0x1.57e17576bc9a2p-4},
- {0x1.352166b648f61p+0, -0x1.4f7851798bb0bp-4},
- {0x1.33adddb3eb575p+0, -0x1.47194f5690ae3p-4},
- {0x1.323dcd99fc1d3p+0, -0x1.3ec456d58ec47p-4},
- {0x1.30d129fefc7d2p+0, -0x1.36794ff3e5f55p-4},
- {0x1.2f67e6b72fe7dp+0, -0x1.2e382315725e4p-4},
- {0x1.2e01f7cf8b187p+0, -0x1.2600b8ed82e91p-4},
- {0x1.2c9f518ddc86ep+0, -0x1.1dd2fa85efc12p-4},
- {0x1.2b3fe86e5f413p+0, -0x1.15aed136e3961p-4},
- {0x1.29e3b1211b25cp+0, -0x1.0d94269d1a30dp-4},
- {0x1.288aa08b373cfp+0, -0x1.0582e4a7659f5p-4},
- {0x1.2734abcaa8467p+0, -0x1.faf5eb655742dp-5},
- {0x1.25e1c82459b81p+0, -0x1.eaf888487e8eep-5},
- {0x1.2491eb1ad59c5p+0, -0x1.db0d75ef25a82p-5},
- {0x1.23450a54048b5p+0, -0x1.cb348a49e6431p-5},
- {0x1.21fb1bb09e578p+0, -0x1.bb6d9c69acdd8p-5},
- {0x1.20b415346d8f7p+0, -0x1.abb88368aa7ap-5},
- {0x1.1f6fed179a1acp+0, -0x1.9c1517476af14p-5},
- {0x1.1e2e99b93c7b3p+0, -0x1.8c833051bfa4dp-5},
- {0x1.1cf011a7a882ap+0, -0x1.7d02a78e7fb31p-5},
- {0x1.1bb44b97dba5ap+0, -0x1.6d93565e97c5fp-5},
- {0x1.1a7b3e66cdd4fp+0, -0x1.5e351695db0c5p-5},
- {0x1.1944e11dc56cdp+0, -0x1.4ee7c2ba67adcp-5},
- {0x1.18112aebb1a6ep+0, -0x1.3fab35ba16c01p-5},
- {0x1.16e013231b7e9p+0, -0x1.307f4ad854bc9p-5},
- {0x1.15b1913f156cfp+0, -0x1.2163ddf4f988cp-5},
- {0x1.14859cdedde13p+0, -0x1.1258cb5d19e22p-5},
- {0x1.135c2dc68cfa4p+0, -0x1.035defdba3188p-5},
- {0x1.12353bdb01684p+0, -0x1.e8e651191bce4p-6},
- {0x1.1110bf25b85b4p+0, -0x1.cb30a62be444cp-6},
- {0x1.0feeafd2f8577p+0, -0x1.ad9a9b3043823p-6},
- {0x1.0ecf062c51c3bp+0, -0x1.9023ecda1ccdep-6},
- {0x1.0db1baa076c8bp+0, -0x1.72cc592bd82dp-6},
- {0x1.0c96c5bb3048ep+0, -0x1.55939eb1f9c6ep-6},
- {0x1.0b7e20263e070p+0, -0x1.38797ca6cc5ap-6},
- {0x1.0a67c2acd0ce3p+0, -0x1.1b7db35c2c072p-6},
- {0x1.0953a6391e982p+0, -0x1.fd400812ee9a2p-7},
- {0x1.0841c3caea380p+0, -0x1.c3c05fb4620f1p-7},
- {0x1.07321489b13eap+0, -0x1.8a7bf3c40e2e3p-7},
- {0x1.062491aee9904p+0, -0x1.517249c15a75cp-7},
- {0x1.05193497a7cc5p+0, -0x1.18a2ea5330c91p-7},
- {0x1.040ff6b5f5e9fp+0, -0x1.c01abc8cdc4e2p-8},
- {0x1.0308d19aa6127p+0, -0x1.4f6261750dec9p-8},
- {0x1.0203beedb0c67p+0, -0x1.be37b6612afa7p-9},
- {0x1.010037d38bcc2p+0, -0x1.bc3a8398ac26p-10},
- {1.0, 0.0},
- {0x1.fc06d493cca10p-1, 0x1.bb796219f30a5p-9},
- {0x1.f81e6ac3b918fp-1, 0x1.b984fdcba61cep-8},
- {0x1.f44546ef18996p-1, 0x1.49cf12adf8e8cp-7},
- {0x1.f07b10382c84bp-1, 0x1.b6075b5217083p-7},
- {0x1.ecbf7070e59d4p-1, 0x1.10b7466fc30ddp-6},
- {0x1.e91213f715939p-1, 0x1.4603e4db6a3a1p-6},
- {0x1.e572a9a75f7b7p-1, 0x1.7aeb10e99e105p-6},
- {0x1.e1e0e2c530207p-1, 0x1.af6e49b0f0e36p-6},
- {0x1.de5c72d8a8be3p-1, 0x1.e38f064f41179p-6},
- {0x1.dae50fa5658ccp-1, 0x1.0ba75abbb7623p-5},
- {0x1.d77a71145a2dap-1, 0x1.25575ee2dba86p-5},
- {0x1.d41c51166623ep-1, 0x1.3ed83f477f946p-5},
- {0x1.d0ca6ba0bb29fp-1, 0x1.582aa79af60efp-5},
- {0x1.cd847e8e59681p-1, 0x1.714f400fa83aep-5},
- {0x1.ca4a499693e00p-1, 0x1.8a46ad3901cb9p-5},
- {0x1.c71b8e399e821p-1, 0x1.a311903b6b87p-5},
- {0x1.c3f80faf19077p-1, 0x1.bbb086f216911p-5},
- {0x1.c0df92dc2b0ecp-1, 0x1.d4242bdda648ep-5},
- {0x1.bdd1de3cbb542p-1, 0x1.ec6d167c2af1p-5},
- {0x1.baceb9e1007a3p-1, 0x1.0245ed8221426p-4},
- {0x1.b7d5ef543e55ep-1, 0x1.0e40856c74f64p-4},
- {0x1.b4e749977d953p-1, 0x1.1a269a31120fep-4},
- {0x1.b20295155478ep-1, 0x1.25f8718fc076cp-4},
- {0x1.af279f8e82be2p-1, 0x1.31b64ffc95bfp-4},
- {0x1.ac5638197fdf3p-1, 0x1.3d60787ca5063p-4},
- {0x1.a98e2f102e087p-1, 0x1.48f72ccd187fdp-4},
- {0x1.a6cf5606d05c1p-1, 0x1.547aad6602f1cp-4},
- {0x1.a4197fc04d746p-1, 0x1.5feb3989d3acbp-4},
- {0x1.a16c80293dc01p-1, 0x1.6b490f3978c79p-4},
- {0x1.9ec82c4dc5bc9p-1, 0x1.76946b3f5e703p-4},
- {0x1.9c2c5a491f534p-1, 0x1.81cd895717c83p-4},
- {0x1.9998e1480b618p-1, 0x1.8cf4a4055c30ep-4},
- {0x1.970d9977c6c2dp-1, 0x1.9809f4c48c0ebp-4},
- {0x1.948a5c023d212p-1, 0x1.a30db3f9899efp-4},
- {0x1.920f0303d6809p-1, 0x1.ae001905458fcp-4},
- {0x1.8f9b698a98b45p-1, 0x1.b8e15a2e3a2cdp-4},
- {0x1.8d2f6b81726f6p-1, 0x1.c3b1ace2b0996p-4},
- {0x1.8acae5bb55badp-1, 0x1.ce71456edfa62p-4},
- {0x1.886db5d9275b8p-1, 0x1.d9205759882c4p-4},
- {0x1.8617ba567c13cp-1, 0x1.e3bf1513af0dfp-4},
- {0x1.83c8d27487800p-1, 0x1.ee4db0412c414p-4},
- {0x1.8180de3c5dbe7p-1, 0x1.f8cc5998de3a5p-4},
- {0x1.7f3fbe71cdb71p-1, 0x1.019da085eaeb1p-3},
- {0x1.7d055498071c1p-1, 0x1.06cd4acdb4e3dp-3},
- {0x1.7ad182e54f65ap-1, 0x1.0bf542bef813fp-3},
- {0x1.78a42c3c90125p-1, 0x1.11159f14da262p-3},
- {0x1.767d342f76944p-1, 0x1.162e761c10d1cp-3},
- {0x1.745c7ef26b00ap-1, 0x1.1b3fddc60d43ep-3},
- {0x1.7241f15769d0fp-1, 0x1.2049ebac86aa6p-3},
- {0x1.702d70d396e41p-1, 0x1.254cb4fb7836ap-3},
- {0x1.6e1ee3700cd11p-1, 0x1.2a484e8d0d252p-3},
- {0x1.6c162fc9cbe02p-1, 0x1.2f3ccce1c860bp-3}},
-
- /* Computed from log coeffs div by log(10) then rounded to double
- precision. */
- .poly
- = {-0x1.bcb7b1526e506p-3, 0x1.287a7636be1d1p-3, -0x1.bcb7b158af938p-4,
- 0x1.63c78734e6d07p-4, -0x1.287461742fee4p-4},
+ where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
+ N=128) and log(c) and 1/c for the ith subinterval comes from lookup
+ tables:
- .invln10 = 0x1.bcb7b1526e50ep-2,
- .log10_2 = 0x1.34413509f79ffp-2
+ table[i].invc = 1/c
+ table[i].log10c = (double)log10(c)
+ where c is near the center of the subinterval and is chosen by trying
+ several floating point invc candidates around 1/center and selecting one
+ for which the error in (double)log(c) is minimized (< 0x1p-74), except the
+ subinterval that contains 1 and the previous one got tweaked to avoid
+ cancellation. NB: invc should be optimized to minimize error in
+ (double)log10(c) instead. */
+ .table = { { 0x1.6a133d0dec120p+0, -0x1.345825f221684p-3 },
+ { 0x1.6815f2f3e42edp+0, -0x1.2f71a1f0c554ep-3 },
+ { 0x1.661e39be1ac9ep+0, -0x1.2a91fdb30b1f4p-3 },
+ { 0x1.642bfa30ac371p+0, -0x1.25b9260981a04p-3 },
+ { 0x1.623f1d916f323p+0, -0x1.20e7081762193p-3 },
+ { 0x1.60578da220f65p+0, -0x1.1c1b914aeefacp-3 },
+ { 0x1.5e75349dea571p+0, -0x1.1756af5de404dp-3 },
+ { 0x1.5c97fd387a75ap+0, -0x1.12985059c90bfp-3 },
+ { 0x1.5abfd2981f200p+0, -0x1.0de0628f63df4p-3 },
+ { 0x1.58eca051dc99cp+0, -0x1.092ed492e08eep-3 },
+ { 0x1.571e526d9df12p+0, -0x1.0483954caf1dfp-3 },
+ { 0x1.5554d555b3fcbp+0, -0x1.ffbd27a9adbcp-4 },
+ { 0x1.539015e2a20cdp+0, -0x1.f67f7f2e3d1ap-4 },
+ { 0x1.51d0014ee0164p+0, -0x1.ed4e1071ceebep-4 },
+ { 0x1.50148538cd9eep+0, -0x1.e428bb47413c4p-4 },
+ { 0x1.4e5d8f9f698a1p+0, -0x1.db0f6003028d6p-4 },
+ { 0x1.4cab0edca66bep+0, -0x1.d201df6749831p-4 },
+ { 0x1.4afcf1a9db874p+0, -0x1.c9001ac5c9672p-4 },
+ { 0x1.495327136e16fp+0, -0x1.c009f3c78c79p-4 },
+ { 0x1.47ad9e84af28fp+0, -0x1.b71f4cb642e53p-4 },
+ { 0x1.460c47b39ae15p+0, -0x1.ae400818526b2p-4 },
+ { 0x1.446f12b278001p+0, -0x1.a56c091954f87p-4 },
+ { 0x1.42d5efdd720ecp+0, -0x1.9ca3332f096eep-4 },
+ { 0x1.4140cfe001a0fp+0, -0x1.93e56a3f23e55p-4 },
+ { 0x1.3fafa3b421f69p+0, -0x1.8b3292a3903bp-4 },
+ { 0x1.3e225c9c8ece5p+0, -0x1.828a9112d9618p-4 },
+ { 0x1.3c98ec29a211ap+0, -0x1.79ed4ac35f5acp-4 },
+ { 0x1.3b13442a413fep+0, -0x1.715aa51ed28c4p-4 },
+ { 0x1.399156baa3c54p+0, -0x1.68d2861c999e9p-4 },
+ { 0x1.38131639b4cdbp+0, -0x1.6054d40ded21p-4 },
+ { 0x1.36987540fbf53p+0, -0x1.57e17576bc9a2p-4 },
+ { 0x1.352166b648f61p+0, -0x1.4f7851798bb0bp-4 },
+ { 0x1.33adddb3eb575p+0, -0x1.47194f5690ae3p-4 },
+ { 0x1.323dcd99fc1d3p+0, -0x1.3ec456d58ec47p-4 },
+ { 0x1.30d129fefc7d2p+0, -0x1.36794ff3e5f55p-4 },
+ { 0x1.2f67e6b72fe7dp+0, -0x1.2e382315725e4p-4 },
+ { 0x1.2e01f7cf8b187p+0, -0x1.2600b8ed82e91p-4 },
+ { 0x1.2c9f518ddc86ep+0, -0x1.1dd2fa85efc12p-4 },
+ { 0x1.2b3fe86e5f413p+0, -0x1.15aed136e3961p-4 },
+ { 0x1.29e3b1211b25cp+0, -0x1.0d94269d1a30dp-4 },
+ { 0x1.288aa08b373cfp+0, -0x1.0582e4a7659f5p-4 },
+ { 0x1.2734abcaa8467p+0, -0x1.faf5eb655742dp-5 },
+ { 0x1.25e1c82459b81p+0, -0x1.eaf888487e8eep-5 },
+ { 0x1.2491eb1ad59c5p+0, -0x1.db0d75ef25a82p-5 },
+ { 0x1.23450a54048b5p+0, -0x1.cb348a49e6431p-5 },
+ { 0x1.21fb1bb09e578p+0, -0x1.bb6d9c69acdd8p-5 },
+ { 0x1.20b415346d8f7p+0, -0x1.abb88368aa7ap-5 },
+ { 0x1.1f6fed179a1acp+0, -0x1.9c1517476af14p-5 },
+ { 0x1.1e2e99b93c7b3p+0, -0x1.8c833051bfa4dp-5 },
+ { 0x1.1cf011a7a882ap+0, -0x1.7d02a78e7fb31p-5 },
+ { 0x1.1bb44b97dba5ap+0, -0x1.6d93565e97c5fp-5 },
+ { 0x1.1a7b3e66cdd4fp+0, -0x1.5e351695db0c5p-5 },
+ { 0x1.1944e11dc56cdp+0, -0x1.4ee7c2ba67adcp-5 },
+ { 0x1.18112aebb1a6ep+0, -0x1.3fab35ba16c01p-5 },
+ { 0x1.16e013231b7e9p+0, -0x1.307f4ad854bc9p-5 },
+ { 0x1.15b1913f156cfp+0, -0x1.2163ddf4f988cp-5 },
+ { 0x1.14859cdedde13p+0, -0x1.1258cb5d19e22p-5 },
+ { 0x1.135c2dc68cfa4p+0, -0x1.035defdba3188p-5 },
+ { 0x1.12353bdb01684p+0, -0x1.e8e651191bce4p-6 },
+ { 0x1.1110bf25b85b4p+0, -0x1.cb30a62be444cp-6 },
+ { 0x1.0feeafd2f8577p+0, -0x1.ad9a9b3043823p-6 },
+ { 0x1.0ecf062c51c3bp+0, -0x1.9023ecda1ccdep-6 },
+ { 0x1.0db1baa076c8bp+0, -0x1.72cc592bd82dp-6 },
+ { 0x1.0c96c5bb3048ep+0, -0x1.55939eb1f9c6ep-6 },
+ { 0x1.0b7e20263e070p+0, -0x1.38797ca6cc5ap-6 },
+ { 0x1.0a67c2acd0ce3p+0, -0x1.1b7db35c2c072p-6 },
+ { 0x1.0953a6391e982p+0, -0x1.fd400812ee9a2p-7 },
+ { 0x1.0841c3caea380p+0, -0x1.c3c05fb4620f1p-7 },
+ { 0x1.07321489b13eap+0, -0x1.8a7bf3c40e2e3p-7 },
+ { 0x1.062491aee9904p+0, -0x1.517249c15a75cp-7 },
+ { 0x1.05193497a7cc5p+0, -0x1.18a2ea5330c91p-7 },
+ { 0x1.040ff6b5f5e9fp+0, -0x1.c01abc8cdc4e2p-8 },
+ { 0x1.0308d19aa6127p+0, -0x1.4f6261750dec9p-8 },
+ { 0x1.0203beedb0c67p+0, -0x1.be37b6612afa7p-9 },
+ { 0x1.010037d38bcc2p+0, -0x1.bc3a8398ac26p-10 },
+ { 1.0, 0.0 },
+ { 0x1.fc06d493cca10p-1, 0x1.bb796219f30a5p-9 },
+ { 0x1.f81e6ac3b918fp-1, 0x1.b984fdcba61cep-8 },
+ { 0x1.f44546ef18996p-1, 0x1.49cf12adf8e8cp-7 },
+ { 0x1.f07b10382c84bp-1, 0x1.b6075b5217083p-7 },
+ { 0x1.ecbf7070e59d4p-1, 0x1.10b7466fc30ddp-6 },
+ { 0x1.e91213f715939p-1, 0x1.4603e4db6a3a1p-6 },
+ { 0x1.e572a9a75f7b7p-1, 0x1.7aeb10e99e105p-6 },
+ { 0x1.e1e0e2c530207p-1, 0x1.af6e49b0f0e36p-6 },
+ { 0x1.de5c72d8a8be3p-1, 0x1.e38f064f41179p-6 },
+ { 0x1.dae50fa5658ccp-1, 0x1.0ba75abbb7623p-5 },
+ { 0x1.d77a71145a2dap-1, 0x1.25575ee2dba86p-5 },
+ { 0x1.d41c51166623ep-1, 0x1.3ed83f477f946p-5 },
+ { 0x1.d0ca6ba0bb29fp-1, 0x1.582aa79af60efp-5 },
+ { 0x1.cd847e8e59681p-1, 0x1.714f400fa83aep-5 },
+ { 0x1.ca4a499693e00p-1, 0x1.8a46ad3901cb9p-5 },
+ { 0x1.c71b8e399e821p-1, 0x1.a311903b6b87p-5 },
+ { 0x1.c3f80faf19077p-1, 0x1.bbb086f216911p-5 },
+ { 0x1.c0df92dc2b0ecp-1, 0x1.d4242bdda648ep-5 },
+ { 0x1.bdd1de3cbb542p-1, 0x1.ec6d167c2af1p-5 },
+ { 0x1.baceb9e1007a3p-1, 0x1.0245ed8221426p-4 },
+ { 0x1.b7d5ef543e55ep-1, 0x1.0e40856c74f64p-4 },
+ { 0x1.b4e749977d953p-1, 0x1.1a269a31120fep-4 },
+ { 0x1.b20295155478ep-1, 0x1.25f8718fc076cp-4 },
+ { 0x1.af279f8e82be2p-1, 0x1.31b64ffc95bfp-4 },
+ { 0x1.ac5638197fdf3p-1, 0x1.3d60787ca5063p-4 },
+ { 0x1.a98e2f102e087p-1, 0x1.48f72ccd187fdp-4 },
+ { 0x1.a6cf5606d05c1p-1, 0x1.547aad6602f1cp-4 },
+ { 0x1.a4197fc04d746p-1, 0x1.5feb3989d3acbp-4 },
+ { 0x1.a16c80293dc01p-1, 0x1.6b490f3978c79p-4 },
+ { 0x1.9ec82c4dc5bc9p-1, 0x1.76946b3f5e703p-4 },
+ { 0x1.9c2c5a491f534p-1, 0x1.81cd895717c83p-4 },
+ { 0x1.9998e1480b618p-1, 0x1.8cf4a4055c30ep-4 },
+ { 0x1.970d9977c6c2dp-1, 0x1.9809f4c48c0ebp-4 },
+ { 0x1.948a5c023d212p-1, 0x1.a30db3f9899efp-4 },
+ { 0x1.920f0303d6809p-1, 0x1.ae001905458fcp-4 },
+ { 0x1.8f9b698a98b45p-1, 0x1.b8e15a2e3a2cdp-4 },
+ { 0x1.8d2f6b81726f6p-1, 0x1.c3b1ace2b0996p-4 },
+ { 0x1.8acae5bb55badp-1, 0x1.ce71456edfa62p-4 },
+ { 0x1.886db5d9275b8p-1, 0x1.d9205759882c4p-4 },
+ { 0x1.8617ba567c13cp-1, 0x1.e3bf1513af0dfp-4 },
+ { 0x1.83c8d27487800p-1, 0x1.ee4db0412c414p-4 },
+ { 0x1.8180de3c5dbe7p-1, 0x1.f8cc5998de3a5p-4 },
+ { 0x1.7f3fbe71cdb71p-1, 0x1.019da085eaeb1p-3 },
+ { 0x1.7d055498071c1p-1, 0x1.06cd4acdb4e3dp-3 },
+ { 0x1.7ad182e54f65ap-1, 0x1.0bf542bef813fp-3 },
+ { 0x1.78a42c3c90125p-1, 0x1.11159f14da262p-3 },
+ { 0x1.767d342f76944p-1, 0x1.162e761c10d1cp-3 },
+ { 0x1.745c7ef26b00ap-1, 0x1.1b3fddc60d43ep-3 },
+ { 0x1.7241f15769d0fp-1, 0x1.2049ebac86aa6p-3 },
+ { 0x1.702d70d396e41p-1, 0x1.254cb4fb7836ap-3 },
+ { 0x1.6e1ee3700cd11p-1, 0x1.2a484e8d0d252p-3 },
+ { 0x1.6c162fc9cbe02p-1, 0x1.2f3ccce1c860bp-3 } }
};
diff --git a/pl/math/v_log10f_3u5.c b/pl/math/v_log10f_3u5.c
index e9f7f0346ca2..92bc50ba5bd9 100644
--- a/pl/math/v_log10f_3u5.c
+++ b/pl/math/v_log10f_3u5.c
@@ -6,77 +6,77 @@
*/
#include "v_math.h"
-#include "mathlib.h"
+#include "poly_advsimd_f32.h"
#include "pl_sig.h"
#include "pl_test.h"
-#if V_SUPPORTED
-
-#define P(i) v_f32 (__v_log10f_poly[i])
-
-#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218. */
-#define InvLn10 v_f32 (0x1.bcb7b2p-2f)
-#define Min v_u32 (0x00800000)
-#define Max v_u32 (0x7f800000)
-#define Mask v_u32 (0x007fffff)
-#define Off v_u32 (0x3f2aaaab) /* 0.666667. */
+static const struct data
+{
+ uint32x4_t min_norm;
+ uint16x8_t special_bound;
+ float32x4_t poly[8];
+ float32x4_t inv_ln10, ln2;
+ uint32x4_t off, mantissa_mask;
+} data = {
+ /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
+ [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
+ .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f),
+ V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f),
+ V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
+ .ln2 = V4 (0x1.62e43p-1f),
+ .inv_ln10 = V4 (0x1.bcb7b2p-2f),
+ .min_norm = V4 (0x00800000),
+ .special_bound = V8 (0x7f00), /* top16 (asuint32 (inf) - min_norm). */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .mantissa_mask = V4 (0x007fffff),
+};
-VPCS_ATTR
-NOINLINE static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
+ uint16x4_t cmp)
{
/* Fall back to scalar code. */
- return v_call_f32 (log10f, x, y, cmp);
+ return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
}
-/* Our fast implementation of v_log10f uses a similar approach as v_logf.
- With the same offset as v_logf (i.e., 2/3) it delivers about 3.3ulps with
- order 9. This is more efficient than using a low order polynomial computed in
- double precision.
+/* Fast implementation of AdvSIMD log10f,
+ uses a similar approach as AdvSIMD logf with the same offset (i.e., 2/3) and
+ an order 9 polynomial.
Maximum error: 3.305ulps (nearest rounding.)
- __v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
- want 0x1.ffe2f4p-4 -0.304916 ulp err 2.80492. */
-VPCS_ATTR
-v_f32_t V_NAME (log10f) (v_f32_t x)
+ _ZGVnN4v_log10f(0x1.555c16p+0) got 0x1.ffe2fap-4
+ want 0x1.ffe2f4p-4. */
+float32x4_t VPCS_ATTR V_NAME_F1 (log10) (float32x4_t x)
{
- v_f32_t n, o, p, q, r, r2, y;
- v_u32_t u, cmp;
-
- u = v_as_u32_f32 (x);
- cmp = v_cond_u32 (u - Min >= Max - Min);
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t u = vreinterpretq_u32_f32 (x);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
+ vget_low_u16 (d->special_bound));
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u -= Off;
- n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend. */
- u &= Mask;
- u += Off;
- r = v_as_f32_u32 (u) - v_f32 (1.0f);
+ u = vsubq_u32 (u, d->off);
+ float32x4_t n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
+ u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
- /* y = log10(1+r) + n*log10(2). */
- r2 = r * r;
- /* (n*ln2 + r)*InvLn10 + r2*(P0 + r*P1 + r2*(P2 + r*P3 + r2*(P4 + r*P5 +
- r2*(P6+r*P7))). */
- o = v_fma_f32 (P (7), r, P (6));
- p = v_fma_f32 (P (5), r, P (4));
- q = v_fma_f32 (P (3), r, P (2));
- y = v_fma_f32 (P (1), r, P (0));
- p = v_fma_f32 (o, r2, p);
- q = v_fma_f32 (p, r2, q);
- y = v_fma_f32 (q, r2, y);
- /* Using p = Log10(2)*n + r*InvLn(10) is slightly faster
- but less accurate. */
- p = v_fma_f32 (Ln2, n, r);
- y = v_fma_f32 (y, r2, p * InvLn10);
+ /* y = log10(1+r) + n * log10(2). */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly);
+ /* y = (n * ln2 + r) * InvLn(10); poly * r2 is added in the final step. */
+ float32x4_t y = vfmaq_f32 (r, d->ln2, n);
+ y = vmulq_f32 (y, d->inv_ln10);
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (x, y, cmp);
- return y;
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, y, poly, r2, special);
+ return vfmaq_f32 (y, poly, r2);
}
-VPCS_ALIAS
PL_SIG (V, F, 1, log10, 0.01, 11.1)
-PL_TEST_ULP (V_NAME (log10f), 2.81)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log10f))
-PL_TEST_INTERVAL (V_NAME (log10f), 0, 0xffff0000, 10000)
-PL_TEST_INTERVAL (V_NAME (log10f), 0x1p-4, 0x1p4, 500000)
-#endif
+PL_TEST_ULP (V_NAME_F1 (log10), 2.81)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (log10))
+PL_TEST_INTERVAL (V_NAME_F1 (log10), -0.0, -inf, 100)
+PL_TEST_INTERVAL (V_NAME_F1 (log10), 0, 0x1p-126, 100)
+PL_TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (log10), 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (log10), 1.0, 100, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (log10), 100, inf, 50000)
diff --git a/pl/math/v_log10f_data.c b/pl/math/v_log10f_data.c
deleted file mode 100644
index 537482a92017..000000000000
--- a/pl/math/v_log10f_data.c
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * Coefficients for single-precision vector log10 function.
- *
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "math_config.h"
-
-const float __v_log10f_poly[] = {
- /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
- [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
- -0x1.bcb79cp-3f, 0x1.2879c8p-3f, -0x1.bcd472p-4f, 0x1.6408f8p-4f,
- -0x1.246f8p-4f, 0x1.f0e514p-5f, -0x1.0fc92cp-4f, 0x1.f5f76ap-5f};
diff --git a/pl/math/v_log1p_2u5.c b/pl/math/v_log1p_2u5.c
index e48291081ab3..face02ddc6c3 100644
--- a/pl/math/v_log1p_2u5.c
+++ b/pl/math/v_log1p_2u5.c
@@ -6,55 +6,65 @@
*/
#include "v_math.h"
-#include "estrin.h"
+#include "poly_advsimd_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
-#if V_SUPPORTED
-
-#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1)
-#define Ln2Lo v_f64 (0x1.ef35793c76730p-45)
-#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */
-#define OneMHfRt2Top \
- 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \
- << 32. */
-#define OneTop12 0x3ff
-#define BottomMask 0xffffffff
-#define AbsMask 0x7fffffffffffffff
-#define C(i) v_f64 (__log1p_data.coeffs[i])
-
-static inline v_f64_t
-eval_poly (v_f64_t f)
+const static struct data
{
- v_f64_t f2 = f * f;
- v_f64_t f4 = f2 * f2;
- v_f64_t f8 = f4 * f4;
- return ESTRIN_18 (f, f2, f4, f8, f8 * f8, C);
-}
-
-VPCS_ATTR
-NOINLINE static v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t special)
+ float64x2_t poly[19], ln2[2];
+ uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one;
+ int64x2_t one_top;
+} data = {
+ /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
+ .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2),
+ V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3),
+ V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3),
+ V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4),
+ V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4),
+ V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4),
+ V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4),
+ V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5),
+ V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4),
+ V2 (-0x1.cfa7385bdb37ep-6) },
+ .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) },
+ /* top32(asuint64(sqrt(2)/2)) << 32. */
+ .hf_rt2_top = V2 (0x3fe6a09e00000000),
+ /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
+ .one_m_hf_rt2_top = V2 (0x00095f6200000000),
+ .umask = V2 (0x000fffff00000000),
+ .one_top = V2 (0x3ff),
+ .inf = V2 (0x7ff0000000000000),
+ .minus_one = V2 (0xbff0000000000000)
+};
+
+#define BottomMask v_u64 (0xffffffff)
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
{
return v_call_f64 (log1p, x, y, special);
}
-/* Vector log1p approximation using polynomial on reduced interval. Routine is a
- modification of the algorithm used in scalar log1p, with no shortcut for k=0
- and no narrowing for f and k. Maximum observed error is 2.46 ULP:
- __v_log1p(0x1.654a1307242a4p+11) got 0x1.fd5565fb590f4p+2
- want 0x1.fd5565fb590f6p+2 . */
-VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x)
+/* Vector log1p approximation using polynomial on reduced interval. Routine is
+ a modification of the algorithm used in scalar log1p, with no shortcut for
+ k=0 and no narrowing for f and k. Maximum observed error is 2.45 ULP:
+ _ZGVnN2v_log1p(0x1.658f7035c4014p+11) got 0x1.fd61d0727429dp+2
+ want 0x1.fd61d0727429fp+2 . */
+VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
{
- v_u64_t ix = v_as_u64_f64 (x);
- v_u64_t ia = ix & AbsMask;
- v_u64_t special
- = v_cond_u64 ((ia >= v_u64 (0x7ff0000000000000))
- | (ix >= 0xbff0000000000000) | (ix == 0x8000000000000000));
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ uint64x2_t special = vcgeq_u64 (ia, d->inf);
#if WANT_SIMD_EXCEPT
+ special = vorrq_u64 (special,
+ vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1))));
if (unlikely (v_any_u64 (special)))
- x = v_sel_f64 (special, v_f64 (0), x);
+ x = v_zerofy_f64 (x, special);
+#else
+ special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1)));
#endif
/* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f
@@ -72,49 +82,47 @@ VPCS_ATTR v_f64_t V_NAME (log1p) (v_f64_t x)
The scalar algorithm casts down to 32-bit at this point to calculate k and
u_red. We stay in double-width to obtain f and k, using the same constants
as the scalar algorithm but shifted left by 32. */
- v_f64_t m = x + 1;
- v_u64_t mi = v_as_u64_f64 (m);
- v_u64_t u = mi + OneMHfRt2Top;
+ float64x2_t m = vaddq_f64 (x, v_f64 (1));
+ uint64x2_t mi = vreinterpretq_u64_f64 (m);
+ uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
- v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop12;
- v_f64_t k = v_to_f64_s64 (ki);
+ int64x2_t ki
+ = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
+ float64x2_t k = vcvtq_f64_s64 (ki);
/* Reduce x + 1 to t in [sqrt(2)/2, sqrt(2)]; f = t - 1. */
- v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top;
- v_u64_t u_red = utop | (mi & BottomMask);
- v_f64_t f = v_as_f64_u64 (u_red) - 1;
+ uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
+ uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
+ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
/* Correction term c/m. */
- v_f64_t cm = (x - (m - 1)) / m;
+ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
/* Approximate log1p(x) on the reduced input using a polynomial. Because
- log1p(0)=0 we choose an approximation of the form:
- x + C0*x^2 + C1*x^3 + C2x^4 + ...
- Hence approximation has the form f + f^2 * P(f)
+ log1p(0)=0 we choose an approximation of the form:
+ x + C0*x^2 + C1*x^3 + C2x^4 + ...
+ Hence approximation has the form f + f^2 * P(f)
where P(x) = C0 + C1*x + C2x^2 + ...
- Assembling this all correctly is dealt with at the final step. */
- v_f64_t p = eval_poly (f);
+ Assembling this all correctly is dealt with at the final step. */
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
- v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm);
- v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f);
- v_f64_t y = v_fma_f64 (f * f, p, ylo + yhi);
+ float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
+ float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
+ float64x2_t y = vaddq_f64 (ylo, yhi);
if (unlikely (v_any_u64 (special)))
- return specialcase (v_as_f64_u64 (ix), y, special);
+ return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p),
+ special);
- return y;
+ return vfmaq_f64 (y, f2, p);
}
-VPCS_ALIAS
PL_SIG (V, D, 1, log1p, -0.9, 10.0)
-PL_TEST_ULP (V_NAME (log1p), 1.97)
-PL_TEST_EXPECT_FENV (V_NAME (log1p), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (log1p), -10.0, 10.0, 10000)
-PL_TEST_INTERVAL (V_NAME (log1p), 0.0, 0x1p-23, 50000)
-PL_TEST_INTERVAL (V_NAME (log1p), 0x1p-23, 0.001, 50000)
-PL_TEST_INTERVAL (V_NAME (log1p), 0.001, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME (log1p), 0.0, -0x1p-23, 50000)
-PL_TEST_INTERVAL (V_NAME (log1p), -0x1p-23, -0.001, 50000)
-PL_TEST_INTERVAL (V_NAME (log1p), -0.001, -1.0, 50000)
-PL_TEST_INTERVAL (V_NAME (log1p), -1.0, inf, 5000)
-#endif
+PL_TEST_ULP (V_NAME_D1 (log1p), 1.97)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (log1p), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.0, 0x1p-23, 50000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0x1p-23, 0.001, 50000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (log1p), 0.001, 1.0, 50000)
+PL_TEST_INTERVAL (V_NAME_D1 (log1p), 1, inf, 40000)
+PL_TEST_INTERVAL (V_NAME_D1 (log1p), -1.0, -inf, 500)
diff --git a/pl/math/v_log1p_inline.h b/pl/math/v_log1p_inline.h
index e5c733964bc0..bd57bfc6fe6e 100644
--- a/pl/math/v_log1p_inline.h
+++ b/pl/math/v_log1p_inline.h
@@ -9,22 +9,38 @@
#define PL_MATH_V_LOG1P_INLINE_H
#include "v_math.h"
-#include "pairwise_horner.h"
+#include "poly_advsimd_f64.h"
-#define Ln2Hi v_f64 (0x1.62e42fefa3800p-1)
-#define Ln2Lo v_f64 (0x1.ef35793c76730p-45)
-#define HfRt2Top 0x3fe6a09e00000000 /* top32(asuint64(sqrt(2)/2)) << 32. */
-#define OneMHfRt2Top \
- 0x00095f6200000000 /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) \
- << 32. */
-#define OneTop 0x3ff
-#define BottomMask 0xffffffff
-#define BigBoundTop 0x5fe /* top12 (asuint64 (0x1p511)). */
+struct v_log1p_data
+{
+ float64x2_t poly[19], ln2[2];
+ uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
+ int64x2_t one_top;
+};
+
+/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */
+#define V_LOG1P_CONSTANTS_TABLE \
+ { \
+ .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \
+ V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \
+ V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \
+ V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \
+ V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \
+ V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \
+ V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \
+ V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \
+ V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \
+ V2 (-0x1.cfa7385bdb37ep-6) }, \
+ .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \
+ .hf_rt2_top = V2 (0x3fe6a09e00000000), \
+ .one_m_hf_rt2_top = V2 (0x00095f6200000000), \
+ .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \
+ }
-#define C(i) v_f64 (__log1p_data.coeffs[i])
+#define BottomMask v_u64 (0xffffffff)
-static inline v_f64_t
-log1p_inline (v_f64_t x)
+static inline float64x2_t
+log1p_inline (float64x2_t x, const struct v_log1p_data *d)
{
/* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several
modifications:
@@ -35,43 +51,41 @@ log1p_inline (v_f64_t x)
0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in
the source of the caller before including this file.
See v_log1pf_2u1.c for details of the algorithm. */
- v_f64_t m = x + 1;
- v_u64_t mi = v_as_u64_f64 (m);
- v_u64_t u = mi + OneMHfRt2Top;
+ float64x2_t m = vaddq_f64 (x, v_f64 (1));
+ uint64x2_t mi = vreinterpretq_u64_f64 (m);
+ uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
- v_s64_t ki = v_as_s64_u64 (u >> 52) - OneTop;
- v_f64_t k = v_to_f64_s64 (ki);
+ int64x2_t ki
+ = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
+ float64x2_t k = vcvtq_f64_s64 (ki);
/* Reduce x + 1 to t in [sqrt(2)/2, sqrt(2)]; f = t - 1. */
- v_u64_t utop = (u & 0x000fffff00000000) + HfRt2Top;
- v_u64_t u_red = utop | (mi & BottomMask);
- v_f64_t f = v_as_f64_u64 (u_red) - 1;
+ uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
+ uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
+ float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1));
/* Correction term c/m. */
- v_f64_t cm = (x - (m - 1)) / m;
+ float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m);
#ifndef WANT_V_LOG1P_K0_SHORTCUT
#error \
"Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
#elif WANT_V_LOG1P_K0_SHORTCUT
/* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
- that the approximation is solely the polynomial. */
- v_u64_t k0 = k == 0;
- if (unlikely (v_any_u64 (k0)))
- {
- cm = v_sel_f64 (k0, v_f64 (0), cm);
- f = v_sel_f64 (k0, x, f);
- }
+ that the approximation is solely the polynomial. */
+ uint64x2_t k0 = vceqzq_f64 (k);
+ cm = v_zerofy_f64 (cm, k0);
+ f = vbslq_f64 (k0, x, f);
#endif
/* Approximate log1p(f) on the reduced input using a polynomial. */
- v_f64_t f2 = f * f;
- v_f64_t p = PAIRWISE_HORNER_18 (f, f2, C);
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly);
/* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
- v_f64_t ylo = v_fma_f64 (k, Ln2Lo, cm);
- v_f64_t yhi = v_fma_f64 (k, Ln2Hi, f);
- return v_fma_f64 (f2, p, ylo + yhi);
+ float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]);
+ float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]);
+ return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p);
}
#endif // PL_MATH_V_LOG1P_INLINE_H
diff --git a/pl/math/v_log1pf_2u1.c b/pl/math/v_log1pf_2u1.c
index 4a7732b403ec..153c88da9c88 100644
--- a/pl/math/v_log1pf_2u1.c
+++ b/pl/math/v_log1pf_2u1.c
@@ -8,104 +8,72 @@
#include "v_math.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_advsimd_f32.h"
-#if V_SUPPORTED
-
-#define AbsMask 0x7fffffff
-#define TinyBound 0x340 /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
-#define MinusOne 0xbf800000
-#define Ln2 (0x1.62e43p-1f)
-#define Four 0x40800000
-#define ThreeQuarters v_u32 (0x3f400000)
-
-#define C(i) v_f32 (__log1pf_data.coeffs[i])
-
-static inline v_f32_t
-eval_poly (v_f32_t m)
+const static struct data
{
-#ifdef V_LOG1PF_1U3
-
- /* Approximate log(1+m) on [-0.25, 0.5] using Horner scheme. */
- v_f32_t p = v_fma_f32 (C (8), m, C (7));
- p = v_fma_f32 (p, m, C (6));
- p = v_fma_f32 (p, m, C (5));
- p = v_fma_f32 (p, m, C (4));
- p = v_fma_f32 (p, m, C (3));
- p = v_fma_f32 (p, m, C (2));
- p = v_fma_f32 (p, m, C (1));
- p = v_fma_f32 (p, m, C (0));
- return v_fma_f32 (m, m * p, m);
-
-#elif defined(V_LOG1PF_2U5)
-
- /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */
- v_f32_t p_12 = v_fma_f32 (m, C (1), C (0));
- v_f32_t p_34 = v_fma_f32 (m, C (3), C (2));
- v_f32_t p_56 = v_fma_f32 (m, C (5), C (4));
- v_f32_t p_78 = v_fma_f32 (m, C (7), C (6));
-
- v_f32_t m2 = m * m;
- v_f32_t p_02 = v_fma_f32 (m2, p_12, m);
- v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34);
- v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78);
-
- v_f32_t m4 = m2 * m2;
- v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02);
-
- return v_fma_f32 (m4, m4 * p_79, p_06);
-
-#else
-#error No precision specified for v_log1pf
-#endif
+ float32x4_t poly[8], ln2;
+ uint32x4_t tiny_bound, minus_one, four, thresh;
+ int32x4_t three_quarters;
+} data = {
+ .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients
+ (1, -0.5) are not stored as they can be generated more
+ efficiently. */
+ V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f),
+ V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f),
+ V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) },
+ .ln2 = V4 (0x1.62e43p-1f),
+ .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
+ .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */
+ .minus_one = V4 (0xbf800000),
+ .four = V4 (0x40800000),
+ .three_quarters = V4 (0x3f400000)
+};
+
+static inline float32x4_t
+eval_poly (float32x4_t m, const float32x4_t *p)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */
+ float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]);
+ float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]);
+ float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]);
+ float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]);
+
+ float32x4_t m2 = vmulq_f32 (m, m);
+ float32x4_t p_02 = vfmaq_f32 (m, m2, p_12);
+ float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56);
+ float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]);
+
+ float32x4_t m4 = vmulq_f32 (m2, m2);
+ float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36);
+ return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79));
}
-static inline float
-handle_special (float x)
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
{
- uint32_t ix = asuint (x);
- uint32_t ia = ix & AbsMask;
- if (ix == 0xff800000 || ia > 0x7f800000 || ix > 0xbf800000)
- {
- /* x == -Inf => log1pf(x) = NaN.
- x < -1.0 => log1pf(x) = NaN.
- x == +/-NaN => log1pf(x) = NaN. */
-#if WANT_SIMD_EXCEPT
- return __math_invalidf (asfloat (ia));
-#else
- return NAN;
-#endif
- }
- if (ix == 0xbf800000)
- {
- /* x == -1.0 => log1pf(x) = -Inf. */
-#if WANT_SIMD_EXCEPT
- return __math_divzerof (ix);
-#else
- return -INFINITY;
-#endif
- }
- /* |x| < TinyBound => log1p(x) = x. */
- return x;
+ return v_call_f32 (log1pf, x, y, special);
}
-/* Vector log1pf approximation using polynomial on reduced interval. Accuracy is
- the same as for the scalar algorithm, i.e. worst-case error when using Estrin
+/* Vector log1pf approximation using polynomial on reduced interval. Accuracy
is roughly 2.02 ULP:
log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */
-VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x)
+VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
{
- v_u32_t ix = v_as_u32_f32 (x);
- v_u32_t ia12 = (ix >> 20) & v_u32 (0x7f8);
- v_u32_t special_cases
- = v_cond_u32 (ia12 - v_u32 (TinyBound) >= (0x7f8 - TinyBound))
- | v_cond_u32 (ix >= MinusOne);
- v_f32_t special_arg = x;
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ uint32x4_t special_cases
+ = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh),
+ vcgeq_u32 (ix, d->minus_one));
+ float32x4_t special_arg = x;
#if WANT_SIMD_EXCEPT
if (unlikely (v_any_u32 (special_cases)))
/* Side-step special lanes so fenv exceptions are not triggered
inadvertently. */
- x = v_sel_f32 (special_cases, v_f32 (1), x);
+ x = v_zerofy_f32 (x, special_cases);
#endif
/* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
@@ -117,44 +85,42 @@ VPCS_ATTR v_f32_t V_NAME (log1pf) (v_f32_t x)
scale factor s = 4*k*log(2) to ensure the scale is representable
as a normalised fp32 number. */
- v_f32_t m = x + v_f32 (1.0f);
+ float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
/* Choose k to scale x to the range [-1/4, 1/2]. */
- v_s32_t k = (v_as_s32_f32 (m) - ThreeQuarters) & v_u32 (0xff800000);
+ int32x4_t k
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
+ v_s32 (0xff800000));
+ uint32x4_t ku = vreinterpretq_u32_s32 (k);
/* Scale x by exponent manipulation. */
- v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - v_as_u32_s32 (k));
+ float32x4_t m_scale
+ = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
/* Scale up to ensure that the scale factor is representable as normalised
fp32 number, and scale m down accordingly. */
- v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k);
- m_scale = m_scale + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f));
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
+ m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
/* Evaluate polynomial on the reduced interval. */
- v_f32_t p = eval_poly (m_scale);
+ float32x4_t p = eval_poly (m_scale, d->poly);
/* The scale factor to be applied back at the end - an arithmetic right shift
of k by 23 gives its unbiased exponent, which is then converted to float. */
- v_f32_t scale_back = v_to_f32_s32 (k) * v_f32 (0x1p-23f);
+ float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23));
/* Apply the scaling back. */
- v_f32_t y = v_fma_f32 (scale_back, v_f32 (Ln2), p);
+ float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2);
if (unlikely (v_any_u32 (special_cases)))
- return v_call_f32 (handle_special, special_arg, y, special_cases);
+ return special_case (special_arg, y, special_cases);
return y;
}
-VPCS_ALIAS
PL_SIG (V, F, 1, log1p, -0.9, 10.0)
-PL_TEST_ULP (V_NAME (log1pf), 1.53)
-PL_TEST_EXPECT_FENV (V_NAME (log1pf), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (log1pf), -10.0, 10.0, 10000)
-PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, 0x1p-23, 30000)
-PL_TEST_INTERVAL (V_NAME (log1pf), 0x1p-23, 0.001, 50000)
-PL_TEST_INTERVAL (V_NAME (log1pf), 0.001, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME (log1pf), 0.0, -0x1p-23, 30000)
-PL_TEST_INTERVAL (V_NAME (log1pf), -0x1p-23, -0.001, 30000)
-PL_TEST_INTERVAL (V_NAME (log1pf), -0.001, -1.0, 50000)
-PL_TEST_INTERVAL (V_NAME (log1pf), -1.0, inf, 1000)
-#endif
+PL_TEST_ULP (V_NAME_F1 (log1p), 1.53)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (log1p), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0.0, 0x1p-23, 30000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (log1p), 0x1p-23, 1, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (log1p), 1, inf, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (log1p), -1.0, -inf, 1000)
diff --git a/pl/math/v_log1pf_inline.h b/pl/math/v_log1pf_inline.h
index e3048e667c26..c654c6bad08f 100644
--- a/pl/math/v_log1pf_inline.h
+++ b/pl/math/v_log1pf_inline.h
@@ -10,46 +10,58 @@
#define PL_MATH_V_LOG1PF_INLINE_H
#include "v_math.h"
-#include "math_config.h"
+#include "poly_advsimd_f32.h"
-#define Four 0x40800000
-#define Ln2 v_f32 (0x1.62e43p-1f)
-
-#define C(i) v_f32 (__log1pf_data.coeffs[i])
-
-static inline v_f32_t
-eval_poly (v_f32_t m)
+struct v_log1pf_data
{
- /* Approximate log(1+m) on [-0.25, 0.5] using Estrin scheme. */
- v_f32_t p_12 = v_fma_f32 (m, C (1), C (0));
- v_f32_t p_34 = v_fma_f32 (m, C (3), C (2));
- v_f32_t p_56 = v_fma_f32 (m, C (5), C (4));
- v_f32_t p_78 = v_fma_f32 (m, C (7), C (6));
+ float32x4_t poly[8], ln2;
+ uint32x4_t four;
+ int32x4_t three_quarters;
+};
- v_f32_t m2 = m * m;
- v_f32_t p_02 = v_fma_f32 (m2, p_12, m);
- v_f32_t p_36 = v_fma_f32 (m2, p_56, p_34);
- v_f32_t p_79 = v_fma_f32 (m2, C (8), p_78);
+/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
+ (1, -0.5) are not stored as they can be generated more efficiently. */
+#define V_LOG1PF_CONSTANTS_TABLE \
+ { \
+ .poly \
+ = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \
+ V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \
+ V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \
+ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
+ .three_quarters = V4 (0x3f400000) \
+ }
- v_f32_t m4 = m2 * m2;
- v_f32_t p_06 = v_fma_f32 (m4, p_36, p_02);
-
- return v_fma_f32 (m4, m4 * p_79, p_06);
+static inline float32x4_t
+eval_poly (float32x4_t m, const float32x4_t *c)
+{
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine
+ uses split Estrin, but this way reduces register pressure in the calling
+ routine). */
+ float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]);
+ float32x4_t m2 = vmulq_f32 (m, m);
+ q = vfmaq_f32 (m, m2, q);
+ float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1);
+ p = vmulq_f32 (m2, p);
+ return vfmaq_f32 (q, m2, p);
}
-static inline v_f32_t
-log1pf_inline (v_f32_t x)
+static inline float32x4_t
+log1pf_inline (float32x4_t x, const struct v_log1pf_data d)
{
/* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
special-case handling. See that file for details of the algorithm. */
- v_f32_t m = x + 1.0f;
- v_u32_t k = (v_as_u32_f32 (m) - 0x3f400000) & 0xff800000;
- v_f32_t s = v_as_f32_u32 (v_u32 (Four) - k);
- v_f32_t m_scale = v_as_f32_u32 (v_as_u32_f32 (x) - k)
- + v_fma_f32 (v_f32 (0.25f), s, v_f32 (-1.0f));
- v_f32_t p = eval_poly (m_scale);
- v_f32_t scale_back = v_to_f32_u32 (k) * 0x1.0p-23f;
- return v_fma_f32 (scale_back, Ln2, p);
+ float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+ int32x4_t k
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters),
+ v_s32 (0xff800000));
+ uint32x4_t ku = vreinterpretq_u32_s32 (k);
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku));
+ float32x4_t m_scale
+ = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+ m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+ float32x4_t p = eval_poly (m_scale, d.poly);
+ float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
+ return vfmaq_f32 (p, scale_back, d.ln2);
}
#endif // PL_MATH_V_LOG1PF_INLINE_H
diff --git a/pl/math/v_log2_3u.c b/pl/math/v_log2_3u.c
index fac73f60c600..2dd2c34b7c97 100644
--- a/pl/math/v_log2_3u.c
+++ b/pl/math/v_log2_3u.c
@@ -6,95 +6,104 @@
*/
#include "v_math.h"
-#include "include/mathlib.h"
#include "pl_sig.h"
#include "pl_test.h"
+#include "poly_advsimd_f64.h"
-#if V_SUPPORTED
-
-#define InvLn2 v_f64 (0x1.71547652b82fep0)
#define N (1 << V_LOG2_TABLE_BITS)
-#define OFF v_u64 (0x3fe6900900000000)
-#define P(i) v_f64 (__v_log2_data.poly[i])
+
+static const struct data
+{
+ uint64x2_t min_norm;
+ uint32x4_t special_bound;
+ float64x2_t poly[5];
+ float64x2_t invln2;
+ uint64x2_t sign_exp_mask;
+} data = {
+ /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9
+ and N = 128, then scaled by log2(e) in extended precision and rounded back
+ to double precision. */
+ .poly = { V2 (-0x1.71547652b83p-1), V2 (0x1.ec709dc340953p-2),
+ V2 (-0x1.71547651c8f35p-2), V2 (0x1.2777ebe12dda5p-2),
+ V2 (-0x1.ec738d616fe26p-3) },
+ .invln2 = V2 (0x1.71547652b82fep0),
+ .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
+ .sign_exp_mask = V2 (0xfff0000000000000),
+};
+
+#define Off v_u64 (0x3fe6900900000000)
+#define IndexMask (N - 1)
struct entry
{
- v_f64_t invc;
- v_f64_t log2c;
+ float64x2_t invc;
+ float64x2_t log2c;
};
static inline struct entry
-lookup (v_u64_t i)
+lookup (uint64x2_t i)
{
struct entry e;
-#ifdef SCALAR
- e.invc = __v_log2_data.tab[i].invc;
- e.log2c = __v_log2_data.tab[i].log2c;
-#else
- e.invc[0] = __v_log2_data.tab[i[0]].invc;
- e.log2c[0] = __v_log2_data.tab[i[0]].log2c;
- e.invc[1] = __v_log2_data.tab[i[1]].invc;
- e.log2c[1] = __v_log2_data.tab[i[1]].log2c;
-#endif
+ uint64_t i0 = (i[0] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (i[1] >> (52 - V_LOG2_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log2_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log2_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.log2c = vuzp2q_f64 (e0, e1);
return e;
}
-VPCS_ATTR
-NOINLINE static v_f64_t
-specialcase (v_f64_t x, v_f64_t y, v_u64_t cmp)
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2,
+ uint32x2_t special)
{
- return v_call_f64 (log2, x, y, cmp);
+ return v_call_f64 (log2, x, vfmaq_f64 (w, r2, y), vmovl_u32 (special));
}
-/* Double-precision vector log2 routine. Implements the same algorithm as vector
- log10, with coefficients and table entries scaled in extended precision.
- The maximum observed error is 2.58 ULP:
- __v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
- want 0x1.fffb34198d9ddp-5. */
-VPCS_ATTR
-v_f64_t V_NAME (log2) (v_f64_t x)
+/* Double-precision vector log2 routine. Implements the same algorithm as
+ vector log10, with coefficients and table entries scaled in extended
+ precision. The maximum observed error is 2.58 ULP:
+ _ZGVnN2v_log2(0x1.0b556b093869bp+0) got 0x1.fffb34198d9dap-5
+ want 0x1.fffb34198d9ddp-5. */
+float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x)
{
- v_u64_t ix = v_as_u64_f64 (x);
- v_u64_t top = ix >> 48;
- v_u64_t special
- = v_cond_u64 (top - v_u64 (0x0010) >= v_u64 (0x7ff0 - 0x0010));
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+ uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
+ vget_low_u32 (d->special_bound));
- /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- v_u64_t tmp = ix - OFF;
- v_u64_t i = (tmp >> (52 - V_LOG2_TABLE_BITS)) % N;
- v_s64_t k = v_as_s64_u64 (tmp) >> 52; /* arithmetic shift. */
- v_u64_t iz = ix - (tmp & v_u64 (0xfffULL << 52));
- v_f64_t z = v_as_f64_u64 (iz);
- struct entry e = lookup (i);
+ uint64x2_t tmp = vsubq_u64 (ix, Off);
+ int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52);
+ uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+
+ struct entry e = lookup (tmp);
/* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */
- v_f64_t r = v_fma_f64 (z, e.invc, v_f64 (-1.0));
- v_f64_t kd = v_to_f64_s64 (k);
- v_f64_t w = v_fma_f64 (r, InvLn2, e.log2c);
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+ float64x2_t w = vfmaq_f64 (e.log2c, r, d->invln2);
- v_f64_t r2 = r * r;
- v_f64_t p_23 = v_fma_f64 (P (3), r, P (2));
- v_f64_t p_01 = v_fma_f64 (P (1), r, P (0));
- v_f64_t y = v_fma_f64 (P (4), r2, p_23);
- y = v_fma_f64 (r2, y, p_01);
- y = v_fma_f64 (r2, y, kd + w);
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly);
+ w = vaddq_f64 (kd, w);
- if (unlikely (v_any_u64 (special)))
- return specialcase (x, y, special);
- return y;
+ if (unlikely (v_any_u32h (special)))
+ return special_case (x, y, w, r2, special);
+ return vfmaq_f64 (w, r2, y);
}
-VPCS_ALIAS
PL_SIG (V, D, 1, log2, 0.01, 11.1)
-PL_TEST_ULP (V_NAME (log2), 2.09)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2))
-PL_TEST_INTERVAL (V_NAME (log2), -0.0, -0x1p126, 100)
-PL_TEST_INTERVAL (V_NAME (log2), 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (V_NAME (log2), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (V_NAME (log2), 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME (log2), 1.0, 100, 50000)
-PL_TEST_INTERVAL (V_NAME (log2), 100, inf, 50000)
-#endif
+PL_TEST_ULP (V_NAME_D1 (log2), 2.09)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_D1 (log2))
+PL_TEST_INTERVAL (V_NAME_D1 (log2), -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (V_NAME_D1 (log2), 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (V_NAME_D1 (log2), 1.0, 100, 50000)
+PL_TEST_INTERVAL (V_NAME_D1 (log2), 100, inf, 50000)
diff --git a/pl/math/v_log2_data.c b/pl/math/v_log2_data.c
index 2a1da6823fbc..50697daff925 100644
--- a/pl/math/v_log2_data.c
+++ b/pl/math/v_log2_data.c
@@ -9,147 +9,145 @@
#define N (1 << V_LOG2_TABLE_BITS)
-// clang-format off
-
const struct v_log2_data __v_log2_data = {
-/* Derived from the coefficients in log_data.c for N == 128 && LOG_POLY_ORDER == 6.
- Each coefficient was scaled by log2(e) in extended precision and rounded back to
- double. */
-.poly = { -0x1.71547652b83p-1, 0x1.ec709dc340953p-2, -0x1.71547651c8f35p-2,
- 0x1.2777ebe12dda5p-2, -0x1.ec738d616fe26p-3 },
+ /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9
+ and N = 128, then scaled by log2(e) in extended precision and rounded back
+ to double precision. */
+ .poly = { -0x1.71547652b83p-1, 0x1.ec709dc340953p-2, -0x1.71547651c8f35p-2,
+ 0x1.2777ebe12dda5p-2, -0x1.ec738d616fe26p-3 },
+
+ .invln2 = 0x1.71547652b82fep0,
-/* Derived from the table in v_log10_data.c. invc is unchanged. log2(c) was
- calculated by scaling log10(c) by log2(10) in extended precision and rounding
- back. */
-.tab = {
-{ 0x1.6a133d0dec120p+0, -0x1.00130d57f5fadp-1 },
-{ 0x1.6815f2f3e42edp+0, -0x1.f802661bd725ep-2 },
-{ 0x1.661e39be1ac9ep+0, -0x1.efea1c6f73a5bp-2 },
-{ 0x1.642bfa30ac371p+0, -0x1.e7dd1dcd06f05p-2 },
-{ 0x1.623f1d916f323p+0, -0x1.dfdb4ae024809p-2 },
-{ 0x1.60578da220f65p+0, -0x1.d7e484d101958p-2 },
-{ 0x1.5e75349dea571p+0, -0x1.cff8ad452f6ep-2 },
-{ 0x1.5c97fd387a75ap+0, -0x1.c817a666c997fp-2 },
-{ 0x1.5abfd2981f200p+0, -0x1.c04152d640419p-2 },
-{ 0x1.58eca051dc99cp+0, -0x1.b87595a3f64b2p-2 },
-{ 0x1.571e526d9df12p+0, -0x1.b0b4526c44d07p-2 },
-{ 0x1.5554d555b3fcbp+0, -0x1.a8fd6d1a90f5ep-2 },
-{ 0x1.539015e2a20cdp+0, -0x1.a150ca2559fc6p-2 },
-{ 0x1.51d0014ee0164p+0, -0x1.99ae4e62cca29p-2 },
-{ 0x1.50148538cd9eep+0, -0x1.9215df1a1e842p-2 },
-{ 0x1.4e5d8f9f698a1p+0, -0x1.8a8761fe1f0d9p-2 },
-{ 0x1.4cab0edca66bep+0, -0x1.8302bd1cc9a54p-2 },
-{ 0x1.4afcf1a9db874p+0, -0x1.7b87d6fb437f6p-2 },
-{ 0x1.495327136e16fp+0, -0x1.741696673a86dp-2 },
-{ 0x1.47ad9e84af28fp+0, -0x1.6caee2b3c6fe4p-2 },
-{ 0x1.460c47b39ae15p+0, -0x1.6550a3666c27ap-2 },
-{ 0x1.446f12b278001p+0, -0x1.5dfbc08de02a4p-2 },
-{ 0x1.42d5efdd720ecp+0, -0x1.56b022766c84ap-2 },
-{ 0x1.4140cfe001a0fp+0, -0x1.4f6db1c955536p-2 },
-{ 0x1.3fafa3b421f69p+0, -0x1.4834579063054p-2 },
-{ 0x1.3e225c9c8ece5p+0, -0x1.4103fd2249a76p-2 },
-{ 0x1.3c98ec29a211ap+0, -0x1.39dc8c3fe6dabp-2 },
-{ 0x1.3b13442a413fep+0, -0x1.32bdeed4b5c8fp-2 },
-{ 0x1.399156baa3c54p+0, -0x1.2ba80f41e20ddp-2 },
-{ 0x1.38131639b4cdbp+0, -0x1.249ad8332f4a7p-2 },
-{ 0x1.36987540fbf53p+0, -0x1.1d96347e7f3ebp-2 },
-{ 0x1.352166b648f61p+0, -0x1.169a0f7d6604ap-2 },
-{ 0x1.33adddb3eb575p+0, -0x1.0fa654a221909p-2 },
-{ 0x1.323dcd99fc1d3p+0, -0x1.08baefcf8251ap-2 },
-{ 0x1.30d129fefc7d2p+0, -0x1.01d7cd14deecdp-2 },
-{ 0x1.2f67e6b72fe7dp+0, -0x1.f5f9b1ad55495p-3 },
-{ 0x1.2e01f7cf8b187p+0, -0x1.e853ff76a77afp-3 },
-{ 0x1.2c9f518ddc86ep+0, -0x1.dabe5d624cba1p-3 },
-{ 0x1.2b3fe86e5f413p+0, -0x1.cd38a5cef4822p-3 },
-{ 0x1.29e3b1211b25cp+0, -0x1.bfc2b38d315f9p-3 },
-{ 0x1.288aa08b373cfp+0, -0x1.b25c61f5edd0fp-3 },
-{ 0x1.2734abcaa8467p+0, -0x1.a5058d18e9cacp-3 },
-{ 0x1.25e1c82459b81p+0, -0x1.97be1113e47a3p-3 },
-{ 0x1.2491eb1ad59c5p+0, -0x1.8a85cafdf5e27p-3 },
-{ 0x1.23450a54048b5p+0, -0x1.7d5c97e8fc45bp-3 },
-{ 0x1.21fb1bb09e578p+0, -0x1.704255d6486e4p-3 },
-{ 0x1.20b415346d8f7p+0, -0x1.6336e2cedd7bfp-3 },
-{ 0x1.1f6fed179a1acp+0, -0x1.563a1d9b0cc6ap-3 },
-{ 0x1.1e2e99b93c7b3p+0, -0x1.494be541aaa6fp-3 },
-{ 0x1.1cf011a7a882ap+0, -0x1.3c6c1964dd0f2p-3 },
-{ 0x1.1bb44b97dba5ap+0, -0x1.2f9a99f19a243p-3 },
-{ 0x1.1a7b3e66cdd4fp+0, -0x1.22d747344446p-3 },
-{ 0x1.1944e11dc56cdp+0, -0x1.1622020d4f7f5p-3 },
-{ 0x1.18112aebb1a6ep+0, -0x1.097aabb3553f3p-3 },
-{ 0x1.16e013231b7e9p+0, -0x1.f9c24b48014c5p-4 },
-{ 0x1.15b1913f156cfp+0, -0x1.e0aaa3bdc858ap-4 },
-{ 0x1.14859cdedde13p+0, -0x1.c7ae257c952d6p-4 },
-{ 0x1.135c2dc68cfa4p+0, -0x1.aecc960a03e58p-4 },
-{ 0x1.12353bdb01684p+0, -0x1.9605bb724d541p-4 },
-{ 0x1.1110bf25b85b4p+0, -0x1.7d595ca7147cep-4 },
-{ 0x1.0feeafd2f8577p+0, -0x1.64c74165002d9p-4 },
-{ 0x1.0ecf062c51c3bp+0, -0x1.4c4f31c86d344p-4 },
-{ 0x1.0db1baa076c8bp+0, -0x1.33f0f70388258p-4 },
-{ 0x1.0c96c5bb3048ep+0, -0x1.1bac5abb3037dp-4 },
-{ 0x1.0b7e20263e070p+0, -0x1.0381272495f21p-4 },
-{ 0x1.0a67c2acd0ce3p+0, -0x1.d6de4eba2de2ap-5 },
-{ 0x1.0953a6391e982p+0, -0x1.a6ec4e8156898p-5 },
-{ 0x1.0841c3caea380p+0, -0x1.772be542e3e1bp-5 },
-{ 0x1.07321489b13eap+0, -0x1.479cadcde852dp-5 },
-{ 0x1.062491aee9904p+0, -0x1.183e4265faa5p-5 },
-{ 0x1.05193497a7cc5p+0, -0x1.d2207fdaa1b85p-6 },
-{ 0x1.040ff6b5f5e9fp+0, -0x1.742486cb4a6a2p-6 },
-{ 0x1.0308d19aa6127p+0, -0x1.1687d77cfc299p-6 },
-{ 0x1.0203beedb0c67p+0, -0x1.7293623a6b5dep-7 },
-{ 0x1.010037d38bcc2p+0, -0x1.70ec80ec8f25dp-8 },
-{ 1.0, 0.0 },
-{ 0x1.fc06d493cca10p-1, 0x1.704c1ca6b6bc9p-7 },
-{ 0x1.f81e6ac3b918fp-1, 0x1.6eac8ba664beap-6 },
-{ 0x1.f44546ef18996p-1, 0x1.11e67d040772dp-5 },
-{ 0x1.f07b10382c84bp-1, 0x1.6bc665e2105dep-5 },
-{ 0x1.ecbf7070e59d4p-1, 0x1.c4f8a9772bf1dp-5 },
-{ 0x1.e91213f715939p-1, 0x1.0ebff10fbb951p-4 },
-{ 0x1.e572a9a75f7b7p-1, 0x1.3aaf4d7805d11p-4 },
-{ 0x1.e1e0e2c530207p-1, 0x1.664ba81a4d717p-4 },
-{ 0x1.de5c72d8a8be3p-1, 0x1.9196387da6de4p-4 },
-{ 0x1.dae50fa5658ccp-1, 0x1.bc902f2b7796p-4 },
-{ 0x1.d77a71145a2dap-1, 0x1.e73ab5f584f28p-4 },
-{ 0x1.d41c51166623ep-1, 0x1.08cb78510d232p-3 },
-{ 0x1.d0ca6ba0bb29fp-1, 0x1.1dd2fe2f0dcb5p-3 },
-{ 0x1.cd847e8e59681p-1, 0x1.32b4784400df4p-3 },
-{ 0x1.ca4a499693e00p-1, 0x1.47706f3d49942p-3 },
-{ 0x1.c71b8e399e821p-1, 0x1.5c0768ee4a4dcp-3 },
-{ 0x1.c3f80faf19077p-1, 0x1.7079e86fc7c6dp-3 },
-{ 0x1.c0df92dc2b0ecp-1, 0x1.84c86e1183467p-3 },
-{ 0x1.bdd1de3cbb542p-1, 0x1.98f377a34b499p-3 },
-{ 0x1.baceb9e1007a3p-1, 0x1.acfb803bc924bp-3 },
-{ 0x1.b7d5ef543e55ep-1, 0x1.c0e10098b025fp-3 },
-{ 0x1.b4e749977d953p-1, 0x1.d4a46efe103efp-3 },
-{ 0x1.b20295155478ep-1, 0x1.e8463f45b8d0bp-3 },
-{ 0x1.af279f8e82be2p-1, 0x1.fbc6e3228997fp-3 },
-{ 0x1.ac5638197fdf3p-1, 0x1.079364f2e5aa8p-2 },
-{ 0x1.a98e2f102e087p-1, 0x1.1133306010a63p-2 },
-{ 0x1.a6cf5606d05c1p-1, 0x1.1ac309631bd17p-2 },
-{ 0x1.a4197fc04d746p-1, 0x1.24432485370c1p-2 },
-{ 0x1.a16c80293dc01p-1, 0x1.2db3b5449132fp-2 },
-{ 0x1.9ec82c4dc5bc9p-1, 0x1.3714ee1d7a32p-2 },
-{ 0x1.9c2c5a491f534p-1, 0x1.406700ab52c94p-2 },
-{ 0x1.9998e1480b618p-1, 0x1.49aa1d87522b2p-2 },
-{ 0x1.970d9977c6c2dp-1, 0x1.52de746d7ecb2p-2 },
-{ 0x1.948a5c023d212p-1, 0x1.5c0434336b343p-2 },
-{ 0x1.920f0303d6809p-1, 0x1.651b8ad6c90d1p-2 },
-{ 0x1.8f9b698a98b45p-1, 0x1.6e24a56ab5831p-2 },
-{ 0x1.8d2f6b81726f6p-1, 0x1.771fb04ec29b1p-2 },
-{ 0x1.8acae5bb55badp-1, 0x1.800cd6f19c25ep-2 },
-{ 0x1.886db5d9275b8p-1, 0x1.88ec441df11dfp-2 },
-{ 0x1.8617ba567c13cp-1, 0x1.91be21b7c93f5p-2 },
-{ 0x1.83c8d27487800p-1, 0x1.9a8298f8c7454p-2 },
-{ 0x1.8180de3c5dbe7p-1, 0x1.a339d255c04ddp-2 },
-{ 0x1.7f3fbe71cdb71p-1, 0x1.abe3f59f43db7p-2 },
-{ 0x1.7d055498071c1p-1, 0x1.b48129deca9efp-2 },
-{ 0x1.7ad182e54f65ap-1, 0x1.bd119575364c1p-2 },
-{ 0x1.78a42c3c90125p-1, 0x1.c5955e23ebcbcp-2 },
-{ 0x1.767d342f76944p-1, 0x1.ce0ca8f4e1557p-2 },
-{ 0x1.745c7ef26b00ap-1, 0x1.d6779a5a75774p-2 },
-{ 0x1.7241f15769d0fp-1, 0x1.ded6563550d27p-2 },
-{ 0x1.702d70d396e41p-1, 0x1.e728ffafd840ep-2 },
-{ 0x1.6e1ee3700cd11p-1, 0x1.ef6fb96c8d739p-2 },
-{ 0x1.6c162fc9cbe02p-1, 0x1.f7aaa57907219p-2 }}
+ /* Derived from tables in v_log_data.c in a similar way as v_log10_data.c.
+ This means invc is unchanged and log2c was calculated by scaling log(c) by
+ log2(e) in extended precision and rounding back to double precision. */
+ .table = { { 0x1.6a133d0dec120p+0, -0x1.00130d57f5fadp-1 },
+ { 0x1.6815f2f3e42edp+0, -0x1.f802661bd725ep-2 },
+ { 0x1.661e39be1ac9ep+0, -0x1.efea1c6f73a5bp-2 },
+ { 0x1.642bfa30ac371p+0, -0x1.e7dd1dcd06f05p-2 },
+ { 0x1.623f1d916f323p+0, -0x1.dfdb4ae024809p-2 },
+ { 0x1.60578da220f65p+0, -0x1.d7e484d101958p-2 },
+ { 0x1.5e75349dea571p+0, -0x1.cff8ad452f6ep-2 },
+ { 0x1.5c97fd387a75ap+0, -0x1.c817a666c997fp-2 },
+ { 0x1.5abfd2981f200p+0, -0x1.c04152d640419p-2 },
+ { 0x1.58eca051dc99cp+0, -0x1.b87595a3f64b2p-2 },
+ { 0x1.571e526d9df12p+0, -0x1.b0b4526c44d07p-2 },
+ { 0x1.5554d555b3fcbp+0, -0x1.a8fd6d1a90f5ep-2 },
+ { 0x1.539015e2a20cdp+0, -0x1.a150ca2559fc6p-2 },
+ { 0x1.51d0014ee0164p+0, -0x1.99ae4e62cca29p-2 },
+ { 0x1.50148538cd9eep+0, -0x1.9215df1a1e842p-2 },
+ { 0x1.4e5d8f9f698a1p+0, -0x1.8a8761fe1f0d9p-2 },
+ { 0x1.4cab0edca66bep+0, -0x1.8302bd1cc9a54p-2 },
+ { 0x1.4afcf1a9db874p+0, -0x1.7b87d6fb437f6p-2 },
+ { 0x1.495327136e16fp+0, -0x1.741696673a86dp-2 },
+ { 0x1.47ad9e84af28fp+0, -0x1.6caee2b3c6fe4p-2 },
+ { 0x1.460c47b39ae15p+0, -0x1.6550a3666c27ap-2 },
+ { 0x1.446f12b278001p+0, -0x1.5dfbc08de02a4p-2 },
+ { 0x1.42d5efdd720ecp+0, -0x1.56b022766c84ap-2 },
+ { 0x1.4140cfe001a0fp+0, -0x1.4f6db1c955536p-2 },
+ { 0x1.3fafa3b421f69p+0, -0x1.4834579063054p-2 },
+ { 0x1.3e225c9c8ece5p+0, -0x1.4103fd2249a76p-2 },
+ { 0x1.3c98ec29a211ap+0, -0x1.39dc8c3fe6dabp-2 },
+ { 0x1.3b13442a413fep+0, -0x1.32bdeed4b5c8fp-2 },
+ { 0x1.399156baa3c54p+0, -0x1.2ba80f41e20ddp-2 },
+ { 0x1.38131639b4cdbp+0, -0x1.249ad8332f4a7p-2 },
+ { 0x1.36987540fbf53p+0, -0x1.1d96347e7f3ebp-2 },
+ { 0x1.352166b648f61p+0, -0x1.169a0f7d6604ap-2 },
+ { 0x1.33adddb3eb575p+0, -0x1.0fa654a221909p-2 },
+ { 0x1.323dcd99fc1d3p+0, -0x1.08baefcf8251ap-2 },
+ { 0x1.30d129fefc7d2p+0, -0x1.01d7cd14deecdp-2 },
+ { 0x1.2f67e6b72fe7dp+0, -0x1.f5f9b1ad55495p-3 },
+ { 0x1.2e01f7cf8b187p+0, -0x1.e853ff76a77afp-3 },
+ { 0x1.2c9f518ddc86ep+0, -0x1.dabe5d624cba1p-3 },
+ { 0x1.2b3fe86e5f413p+0, -0x1.cd38a5cef4822p-3 },
+ { 0x1.29e3b1211b25cp+0, -0x1.bfc2b38d315f9p-3 },
+ { 0x1.288aa08b373cfp+0, -0x1.b25c61f5edd0fp-3 },
+ { 0x1.2734abcaa8467p+0, -0x1.a5058d18e9cacp-3 },
+ { 0x1.25e1c82459b81p+0, -0x1.97be1113e47a3p-3 },
+ { 0x1.2491eb1ad59c5p+0, -0x1.8a85cafdf5e27p-3 },
+ { 0x1.23450a54048b5p+0, -0x1.7d5c97e8fc45bp-3 },
+ { 0x1.21fb1bb09e578p+0, -0x1.704255d6486e4p-3 },
+ { 0x1.20b415346d8f7p+0, -0x1.6336e2cedd7bfp-3 },
+ { 0x1.1f6fed179a1acp+0, -0x1.563a1d9b0cc6ap-3 },
+ { 0x1.1e2e99b93c7b3p+0, -0x1.494be541aaa6fp-3 },
+ { 0x1.1cf011a7a882ap+0, -0x1.3c6c1964dd0f2p-3 },
+ { 0x1.1bb44b97dba5ap+0, -0x1.2f9a99f19a243p-3 },
+ { 0x1.1a7b3e66cdd4fp+0, -0x1.22d747344446p-3 },
+ { 0x1.1944e11dc56cdp+0, -0x1.1622020d4f7f5p-3 },
+ { 0x1.18112aebb1a6ep+0, -0x1.097aabb3553f3p-3 },
+ { 0x1.16e013231b7e9p+0, -0x1.f9c24b48014c5p-4 },
+ { 0x1.15b1913f156cfp+0, -0x1.e0aaa3bdc858ap-4 },
+ { 0x1.14859cdedde13p+0, -0x1.c7ae257c952d6p-4 },
+ { 0x1.135c2dc68cfa4p+0, -0x1.aecc960a03e58p-4 },
+ { 0x1.12353bdb01684p+0, -0x1.9605bb724d541p-4 },
+ { 0x1.1110bf25b85b4p+0, -0x1.7d595ca7147cep-4 },
+ { 0x1.0feeafd2f8577p+0, -0x1.64c74165002d9p-4 },
+ { 0x1.0ecf062c51c3bp+0, -0x1.4c4f31c86d344p-4 },
+ { 0x1.0db1baa076c8bp+0, -0x1.33f0f70388258p-4 },
+ { 0x1.0c96c5bb3048ep+0, -0x1.1bac5abb3037dp-4 },
+ { 0x1.0b7e20263e070p+0, -0x1.0381272495f21p-4 },
+ { 0x1.0a67c2acd0ce3p+0, -0x1.d6de4eba2de2ap-5 },
+ { 0x1.0953a6391e982p+0, -0x1.a6ec4e8156898p-5 },
+ { 0x1.0841c3caea380p+0, -0x1.772be542e3e1bp-5 },
+ { 0x1.07321489b13eap+0, -0x1.479cadcde852dp-5 },
+ { 0x1.062491aee9904p+0, -0x1.183e4265faa5p-5 },
+ { 0x1.05193497a7cc5p+0, -0x1.d2207fdaa1b85p-6 },
+ { 0x1.040ff6b5f5e9fp+0, -0x1.742486cb4a6a2p-6 },
+ { 0x1.0308d19aa6127p+0, -0x1.1687d77cfc299p-6 },
+ { 0x1.0203beedb0c67p+0, -0x1.7293623a6b5dep-7 },
+ { 0x1.010037d38bcc2p+0, -0x1.70ec80ec8f25dp-8 },
+ { 1.0, 0.0 },
+ { 0x1.fc06d493cca10p-1, 0x1.704c1ca6b6bc9p-7 },
+ { 0x1.f81e6ac3b918fp-1, 0x1.6eac8ba664beap-6 },
+ { 0x1.f44546ef18996p-1, 0x1.11e67d040772dp-5 },
+ { 0x1.f07b10382c84bp-1, 0x1.6bc665e2105dep-5 },
+ { 0x1.ecbf7070e59d4p-1, 0x1.c4f8a9772bf1dp-5 },
+ { 0x1.e91213f715939p-1, 0x1.0ebff10fbb951p-4 },
+ { 0x1.e572a9a75f7b7p-1, 0x1.3aaf4d7805d11p-4 },
+ { 0x1.e1e0e2c530207p-1, 0x1.664ba81a4d717p-4 },
+ { 0x1.de5c72d8a8be3p-1, 0x1.9196387da6de4p-4 },
+ { 0x1.dae50fa5658ccp-1, 0x1.bc902f2b7796p-4 },
+ { 0x1.d77a71145a2dap-1, 0x1.e73ab5f584f28p-4 },
+ { 0x1.d41c51166623ep-1, 0x1.08cb78510d232p-3 },
+ { 0x1.d0ca6ba0bb29fp-1, 0x1.1dd2fe2f0dcb5p-3 },
+ { 0x1.cd847e8e59681p-1, 0x1.32b4784400df4p-3 },
+ { 0x1.ca4a499693e00p-1, 0x1.47706f3d49942p-3 },
+ { 0x1.c71b8e399e821p-1, 0x1.5c0768ee4a4dcp-3 },
+ { 0x1.c3f80faf19077p-1, 0x1.7079e86fc7c6dp-3 },
+ { 0x1.c0df92dc2b0ecp-1, 0x1.84c86e1183467p-3 },
+ { 0x1.bdd1de3cbb542p-1, 0x1.98f377a34b499p-3 },
+ { 0x1.baceb9e1007a3p-1, 0x1.acfb803bc924bp-3 },
+ { 0x1.b7d5ef543e55ep-1, 0x1.c0e10098b025fp-3 },
+ { 0x1.b4e749977d953p-1, 0x1.d4a46efe103efp-3 },
+ { 0x1.b20295155478ep-1, 0x1.e8463f45b8d0bp-3 },
+ { 0x1.af279f8e82be2p-1, 0x1.fbc6e3228997fp-3 },
+ { 0x1.ac5638197fdf3p-1, 0x1.079364f2e5aa8p-2 },
+ { 0x1.a98e2f102e087p-1, 0x1.1133306010a63p-2 },
+ { 0x1.a6cf5606d05c1p-1, 0x1.1ac309631bd17p-2 },
+ { 0x1.a4197fc04d746p-1, 0x1.24432485370c1p-2 },
+ { 0x1.a16c80293dc01p-1, 0x1.2db3b5449132fp-2 },
+ { 0x1.9ec82c4dc5bc9p-1, 0x1.3714ee1d7a32p-2 },
+ { 0x1.9c2c5a491f534p-1, 0x1.406700ab52c94p-2 },
+ { 0x1.9998e1480b618p-1, 0x1.49aa1d87522b2p-2 },
+ { 0x1.970d9977c6c2dp-1, 0x1.52de746d7ecb2p-2 },
+ { 0x1.948a5c023d212p-1, 0x1.5c0434336b343p-2 },
+ { 0x1.920f0303d6809p-1, 0x1.651b8ad6c90d1p-2 },
+ { 0x1.8f9b698a98b45p-1, 0x1.6e24a56ab5831p-2 },
+ { 0x1.8d2f6b81726f6p-1, 0x1.771fb04ec29b1p-2 },
+ { 0x1.8acae5bb55badp-1, 0x1.800cd6f19c25ep-2 },
+ { 0x1.886db5d9275b8p-1, 0x1.88ec441df11dfp-2 },
+ { 0x1.8617ba567c13cp-1, 0x1.91be21b7c93f5p-2 },
+ { 0x1.83c8d27487800p-1, 0x1.9a8298f8c7454p-2 },
+ { 0x1.8180de3c5dbe7p-1, 0x1.a339d255c04ddp-2 },
+ { 0x1.7f3fbe71cdb71p-1, 0x1.abe3f59f43db7p-2 },
+ { 0x1.7d055498071c1p-1, 0x1.b48129deca9efp-2 },
+ { 0x1.7ad182e54f65ap-1, 0x1.bd119575364c1p-2 },
+ { 0x1.78a42c3c90125p-1, 0x1.c5955e23ebcbcp-2 },
+ { 0x1.767d342f76944p-1, 0x1.ce0ca8f4e1557p-2 },
+ { 0x1.745c7ef26b00ap-1, 0x1.d6779a5a75774p-2 },
+ { 0x1.7241f15769d0fp-1, 0x1.ded6563550d27p-2 },
+ { 0x1.702d70d396e41p-1, 0x1.e728ffafd840ep-2 },
+ { 0x1.6e1ee3700cd11p-1, 0x1.ef6fb96c8d739p-2 },
+ { 0x1.6c162fc9cbe02p-1, 0x1.f7aaa57907219p-2 } }
};
-// clang-format on
diff --git a/pl/math/v_log2f_2u5.c b/pl/math/v_log2f_2u5.c
index 8f9241bed8e6..c64d88742136 100644
--- a/pl/math/v_log2f_2u5.c
+++ b/pl/math/v_log2f_2u5.c
@@ -6,63 +6,72 @@
*/
#include "v_math.h"
-#include "pairwise_hornerf.h"
+#include "poly_advsimd_f32.h"
#include "pl_sig.h"
#include "pl_test.h"
-#if V_SUPPORTED
-#define C(i) v_f32 (__v_log2f_data.poly[i])
-
-#define Ln2 v_f32 (0x1.62e43p-1f) /* 0x3f317218 */
-#define Min v_u32 (0x00800000)
-#define Max v_u32 (0x7f800000)
-#define Mask v_u32 (0x007fffff)
-#define Off v_u32 (0x3f2aaaab) /* 0.666667 */
+static const struct data
+{
+ uint32x4_t min_norm;
+ uint16x8_t special_bound;
+ uint32x4_t off, mantissa_mask;
+ float32x4_t poly[9];
+} data = {
+ /* Coefficients generated using Remez algorithm approximate
+ log2(1+r)/r for r in [ -1/3, 1/3 ].
+ rel error: 0x1.c4c4b0cp-26. */
+ .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
+ V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
+ V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
+ V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
+ .min_norm = V4 (0x00800000),
+ .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .mantissa_mask = V4 (0x007fffff),
+};
-VPCS_ATTR
-NOINLINE static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
+ uint16x4_t cmp)
{
/* Fall back to scalar code. */
- return v_call_f32 (log2f, x, y, cmp);
+ return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
}
-/* Fast implementation for single precision log2,
- relies on same argument reduction as Neon logf.
+/* Fast implementation for single precision AdvSIMD log2,
+ relies on same argument reduction as AdvSIMD logf.
Maximum error: 2.48 ULPs
- __v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
- want 0x1.a9be8p-2. */
-VPCS_ATTR
-v_f32_t V_NAME (log2f) (v_f32_t x)
+ _ZGVnN4v_log2f(0x1.558174p+0) got 0x1.a9be84p-2
+ want 0x1.a9be8p-2. */
+float32x4_t VPCS_ATTR V_NAME_F1 (log2) (float32x4_t x)
{
- v_u32_t u = v_as_u32_f32 (x);
- v_u32_t cmp = v_cond_u32 (u - Min >= Max - Min);
+ const struct data *d = ptr_barrier (&data);
+ uint32x4_t u = vreinterpretq_u32_f32 (x);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
+ vget_low_u16 (d->special_bound));
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u -= Off;
- v_f32_t n = v_to_f32_s32 (v_as_s32_u32 (u) >> 23); /* signextend. */
- u &= Mask;
- u += Off;
- v_f32_t r = v_as_f32_u32 (u) - v_f32 (1.0f);
+ u = vsubq_u32 (u, d->off);
+ float32x4_t n = vcvtq_f32_s32 (
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
+ u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
/* y = log2(1+r) + n. */
- v_f32_t r2 = r * r;
- v_f32_t p = PAIRWISE_HORNER_8 (r, r2, C);
- v_f32_t y = v_fma_f32 (p, r, n);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly);
- if (unlikely (v_any_u32 (cmp)))
- return specialcase (x, y, cmp);
- return y;
+ if (unlikely (v_any_u16h (special)))
+ return special_case (x, n, p, r, special);
+ return vfmaq_f32 (n, p, r);
}
-VPCS_ALIAS
PL_SIG (V, F, 1, log2, 0.01, 11.1)
-PL_TEST_ULP (V_NAME (log2f), 1.99)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (log2f))
-PL_TEST_INTERVAL (V_NAME (log2f), -0.0, -0x1p126, 100)
-PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (V_NAME (log2f), 0x1p-23, 1.0, 50000)
-PL_TEST_INTERVAL (V_NAME (log2f), 1.0, 100, 50000)
-PL_TEST_INTERVAL (V_NAME (log2f), 100, inf, 50000)
-#endif
+PL_TEST_ULP (V_NAME_F1 (log2), 1.99)
+PL_TEST_EXPECT_FENV_ALWAYS (V_NAME_F1 (log2))
+PL_TEST_INTERVAL (V_NAME_F1 (log2), -0.0, -0x1p126, 100)
+PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-149, 0x1p-126, 4000)
+PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-126, 0x1p-23, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (log2), 0x1p-23, 1.0, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (log2), 1.0, 100, 50000)
+PL_TEST_INTERVAL (V_NAME_F1 (log2), 100, inf, 50000)
diff --git a/pl/math/v_log2f_data.c b/pl/math/v_log2f_data.c
deleted file mode 100644
index b144e8f4992d..000000000000
--- a/pl/math/v_log2f_data.c
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Coefficients for vector log2f
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-
-#include "math_config.h"
-
-/* See tools/v_log2f.sollya for the algorithm used to generate these
- coefficients. */
-const struct v_log2f_data __v_log2f_data
- = {.poly = {0x1.715476p0f, /* (float)(1 / ln(2)). */
- -0x1.715458p-1f, 0x1.ec701cp-2f, -0x1.7171a4p-2f, 0x1.27a0b8p-2f,
- -0x1.e5143ep-3f, 0x1.9d8ecap-3f, -0x1.c675bp-3f, 0x1.9e495p-3f}};
diff --git a/pl/math/v_log_data.c b/pl/math/v_log_data.c
new file mode 100644
index 000000000000..a26e8a051d97
--- /dev/null
+++ b/pl/math/v_log_data.c
@@ -0,0 +1,161 @@
+/*
+ * Lookup table for double-precision log(x) vector function.
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct v_log_data __v_log_data = {
+ /* Worst-case error: 1.17 + 0.5 ulp.
+ Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
+ .poly = { -0x1.ffffffffffff7p-2, 0x1.55555555170d4p-2, -0x1.0000000399c27p-2,
+ 0x1.999b2e90e94cap-3, -0x1.554e550bd501ep-3 },
+ .ln2 = 0x1.62e42fefa39efp-1,
+ /* Algorithm:
+
+ x = 2^k z
+ log(x) = k ln2 + log(c) + poly(z/c - 1)
+
+ where z is in [a;2a) which is split into N subintervals (a=0x1.69009p-1,
+ N=128) and log(c) and 1/c for the ith subinterval comes from two lookup
+ tables:
+
+ table[i].invc = 1/c
+ table[i].logc = (double)log(c)
+
+ where c is near the center of the subinterval and is chosen by trying
+ several floating point invc candidates around 1/center and selecting one
+ for which the error in (double)log(c) is minimized (< 0x1p-74), except the
+ subinterval that contains 1 and the previous one got tweaked to avoid
+ cancellation. */
+ .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 },
+ { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 },
+ { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 },
+ { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 },
+ { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 },
+ { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 },
+ { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 },
+ { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 },
+ { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 },
+ { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 },
+ { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 },
+ { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 },
+ { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 },
+ { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 },
+ { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 },
+ { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 },
+ { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 },
+ { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 },
+ { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 },
+ { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 },
+ { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 },
+ { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 },
+ { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 },
+ { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 },
+ { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 },
+ { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 },
+ { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 },
+ { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 },
+ { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 },
+ { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 },
+ { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 },
+ { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 },
+ { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 },
+ { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 },
+ { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 },
+ { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 },
+ { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 },
+ { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 },
+ { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 },
+ { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 },
+ { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 },
+ { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 },
+ { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 },
+ { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 },
+ { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 },
+ { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 },
+ { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 },
+ { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 },
+ { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 },
+ { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 },
+ { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 },
+ { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 },
+ { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 },
+ { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 },
+ { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 },
+ { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 },
+ { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 },
+ { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 },
+ { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 },
+ { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 },
+ { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 },
+ { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 },
+ { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 },
+ { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 },
+ { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 },
+ { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 },
+ { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 },
+ { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 },
+ { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 },
+ { 0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 },
+ { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 },
+ { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 },
+ { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 },
+ { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 },
+ { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 },
+ { 1.0, 0.0 },
+ { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 },
+ { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 },
+ { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 },
+ { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 },
+ { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 },
+ { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 },
+ { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 },
+ { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 },
+ { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 },
+ { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 },
+ { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 },
+ { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 },
+ { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 },
+ { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 },
+ { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 },
+ { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 },
+ { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 },
+ { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 },
+ { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 },
+ { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 },
+ { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 },
+ { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 },
+ { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 },
+ { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 },
+ { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 },
+ { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 },
+ { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 },
+ { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 },
+ { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 },
+ { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 },
+ { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 },
+ { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 },
+ { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 },
+ { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 },
+ { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 },
+ { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 },
+ { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 },
+ { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 },
+ { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 },
+ { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 },
+ { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 },
+ { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 },
+ { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 },
+ { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 },
+ { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 },
+ { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 },
+ { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 },
+ { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 },
+ { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 },
+ { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 },
+ { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 },
+ { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } }
+};
diff --git a/pl/math/v_log_inline.h b/pl/math/v_log_inline.h
new file mode 100644
index 000000000000..2df00cf4ddf4
--- /dev/null
+++ b/pl/math/v_log_inline.h
@@ -0,0 +1,104 @@
+/*
+ * Double-precision vector log(x) function - inline version
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "math_config.h"
+
+#ifndef V_LOG_INLINE_POLY_ORDER
+# error Cannot use inline log helper without specifying poly order (options are 4 or 5)
+#endif
+
+#if V_LOG_INLINE_POLY_ORDER == 4
+# define POLY \
+ { \
+ V2 (-0x1.ffffffffcbad3p-2), V2 (0x1.555555578ed68p-2), \
+ V2 (-0x1.0000d3a1e7055p-2), V2 (0x1.999392d02a63ep-3) \
+ }
+#elif V_LOG_INLINE_POLY_ORDER == 5
+# define POLY \
+ { \
+ V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), \
+ V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), \
+ V2 (-0x1.554e550bd501ep-3) \
+ }
+#else
+# error Can only choose order 4 or 5 for log poly
+#endif
+
+struct v_log_inline_data /* Constants read by v_log_inline; see V_LOG_CONSTANTS. */
+{
+ float64x2_t poly[V_LOG_INLINE_POLY_ORDER]; /* Coefficients accessed as A(i). */
+ float64x2_t ln2; /* Multiplied by the exponent k. */
+ uint64x2_t off, sign_exp_mask; /* off is subtracted from the bits of x; the mask is applied to that difference. */
+};
+
+#define V_LOG_CONSTANTS \
+ { \
+ .poly = POLY, .ln2 = V2 (0x1.62e42fefa39efp-1), \
+ .sign_exp_mask = V2 (0xfff0000000000000), .off = V2 (0x3fe6900900000000) \
+ }
+
+#define A(i) d->poly[i]
+#define N (1 << V_LOG_TABLE_BITS)
+#define IndexMask (N - 1)
+
+struct entry /* Per-lane table values returned by log_lookup. */
+{
+ float64x2_t invc; /* Multiplied with z to form r = z*invc - 1. */
+ float64x2_t logc; /* Added to r when forming hi. */
+};
+
+static inline struct entry
+log_lookup (uint64x2_t i)
+{
+ /* N is a power of 2, so n % N == n & (N - 1) gives the table index. */
+ const int shift = 52 - V_LOG_TABLE_BITS;
+ uint64_t lane0 = (i[0] >> shift) & IndexMask;
+ uint64_t lane1 = (i[1] >> shift) & IndexMask;
+ /* Each load reads two consecutive doubles starting at the entry's invc;
+ the second one is what ends up in logc below. */
+ float64x2_t pair0 = vld1q_f64 (&__v_log_data.table[lane0].invc);
+ float64x2_t pair1 = vld1q_f64 (&__v_log_data.table[lane1].invc);
+ return (struct entry){ .invc = vuzp1q_f64 (pair0, pair1), .logc = vuzp2q_f64 (pair0, pair1) };
+}
+
+/* Per-lane log(x), using the constants in D and the table values fetched by
+ log_lookup. There is no branch for special-case inputs in this helper;
+ the polynomial has V_LOG_INLINE_POLY_ORDER coefficients. */
+static inline float64x2_t
+v_log_inline (float64x2_t x, const struct v_log_inline_data *d)
+{
+ uint64x2_t ix = vreinterpretq_u64_f64 (x);
+
+ /* x = 2^k z; where z is in range [Off,2*Off) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ uint64x2_t tmp = vsubq_u64 (ix, d->off);
+ int64x2_t k
+ = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
+ uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+ struct entry e = log_lookup (tmp);
+
+ /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+
+ /* hi = r + log(c) + k*Ln2. */
+ float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2);
+ /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t y = vfmaq_f64 (A (2), A (3), r);
+ float64x2_t p = vfmaq_f64 (A (0), A (1), r);
+ /* The A4 term exists only for order 5; test the macro that sizes d->poly. */
+#if V_LOG_INLINE_POLY_ORDER == 5
+ y = vfmaq_f64 (y, A (4), r2);
+#endif
+ y = vfmaq_f64 (p, y, r2);
+
+ return vfmaq_f64 (hi, y, r2);
+}
diff --git a/pl/math/v_logf_inline.h b/pl/math/v_logf_inline.h
new file mode 100644
index 000000000000..c00fe0909afc
--- /dev/null
+++ b/pl/math/v_logf_inline.h
@@ -0,0 +1,59 @@
+/*
+ * Single-precision vector log function - inline version
+ *
+ * Copyright (c) 2019-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+struct v_logf_data /* Constants read by v_logf_inline; see V_LOGF_CONSTANTS. */
+{
+ float32x4_t poly[7]; /* Accessed in reverse order through P(i). */
+ float32x4_t ln2; /* Multiplied by n. */
+ uint32x4_t off, mantissa_mask; /* off is subtracted before, and re-added after, applying the mask. */
+};
+
+#define V_LOGF_CONSTANTS \
+ { \
+ .poly \
+ = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), \
+ V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), \
+ V4 (-0x1.ffffc8p-2f) }, \
+ .ln2 = V4 (0x1.62e43p-1f), .off = V4 (0x3f2aaaab), \
+ .mantissa_mask = V4 (0x007fffff) \
+ }
+
+#define P(i) d->poly[7 - (i)] /* P(1)..P(7) map to poly[6]..poly[0]. */
+
+/* Per-lane single-precision log(x), using the constants in D. There is no
+ branch for special-case inputs in this helper. */
+static inline float32x4_t
+v_logf_inline (float32x4_t x, const struct v_logf_data *d)
+{
+ /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
+ uint32x4_t shifted = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off);
+ int32x4_t exponent
+ = vshrq_n_s32 (vreinterpretq_s32_u32 (shifted), 23); /* sign-extends. */
+ float32x4_t n = vcvtq_f32_s32 (exponent);
+
+ uint32x4_t frac = vandq_u32 (shifted, d->mantissa_mask);
+ float32x4_t m = vreinterpretq_f32_u32 (vaddq_u32 (frac, d->off));
+ float32x4_t r = vsubq_f32 (m, v_f32 (1.0f));
+
+ /* y = log(1+r) + n*ln2, evaluated as
+ n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
+ float32x4_t r2 = vmulq_f32 (r, r);
+
+ float32x4_t p56 = vfmaq_f32 (P (5), P (6), r);
+ float32x4_t p34 = vfmaq_f32 (P (3), P (4), r);
+ float32x4_t p12 = vfmaq_f32 (P (1), P (2), r);
+ float32x4_t p567 = vfmaq_f32 (p56, P (7), r2);
+ float32x4_t p3_7 = vfmaq_f32 (p34, p567, r2);
+ float32x4_t series = vfmaq_f32 (p12, p3_7, r2);
+ float32x4_t linear = vfmaq_f32 (r, d->ln2, n);
+
+ return vfmaq_f32 (linear, series, r2);
+}
+
+#undef P
diff --git a/pl/math/v_math.h b/pl/math/v_math.h
index a8fa091a7cbf..1b10929faccc 100644
--- a/pl/math/v_math.h
+++ b/pl/math/v_math.h
@@ -12,844 +12,164 @@
/* Enable the build of vector math code. */
# define WANT_VMATH 1
#endif
-#if WANT_VMATH
-
-/* The goal of this header is to allow vector (only Neon for now)
- and scalar build of the same algorithm. */
-#if SCALAR
-#define V_NAME(x) __s_##x
-#elif VPCS && __aarch64__
-#define V_NAME(x) __vn_##x
-#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
-#else
-#define V_NAME(x) __v_##x
-#endif
-
-#ifndef VPCS_ATTR
-#define VPCS_ATTR
-#endif
-#ifndef VPCS_ALIAS
-#define VPCS_ALIAS
-#endif
+#if WANT_VMATH
-#include <stdint.h>
-#include "math_config.h"
+# if __aarch64__
+# define VPCS_ATTR __attribute__ ((aarch64_vector_pcs))
+# else
+# error "Cannot build without AArch64"
+# endif
-typedef float f32_t;
-typedef uint32_t u32_t;
-typedef int32_t s32_t;
-typedef double f64_t;
-typedef uint64_t u64_t;
-typedef int64_t s64_t;
+# include <stdint.h>
+# include "math_config.h"
+# if __aarch64__
-/* reinterpret as type1 from type2. */
-static inline u32_t
-as_u32_f32 (f32_t x)
-{
- union { f32_t f; u32_t u; } r = {x};
- return r.u;
-}
-static inline f32_t
-as_f32_u32 (u32_t x)
-{
- union { u32_t u; f32_t f; } r = {x};
- return r.f;
-}
-static inline s32_t
-as_s32_u32 (u32_t x)
-{
- union { u32_t u; s32_t i; } r = {x};
- return r.i;
-}
-static inline u32_t
-as_u32_s32 (s32_t x)
-{
- union { s32_t i; u32_t u; } r = {x};
- return r.u;
-}
-static inline u64_t
-as_u64_f64 (f64_t x)
-{
- union { f64_t f; u64_t u; } r = {x};
- return r.u;
-}
-static inline f64_t
-as_f64_u64 (u64_t x)
-{
- union { u64_t u; f64_t f; } r = {x};
- return r.f;
-}
-static inline s64_t
-as_s64_u64 (u64_t x)
-{
- union { u64_t u; s64_t i; } r = {x};
- return r.i;
-}
-static inline u64_t
-as_u64_s64 (s64_t x)
-{
- union { s64_t i; u64_t u; } r = {x};
- return r.u;
-}
+# include <arm_neon.h>
-#if SCALAR
-#define V_SUPPORTED 1
-typedef f32_t v_f32_t;
-typedef u32_t v_u32_t;
-typedef s32_t v_s32_t;
-typedef f64_t v_f64_t;
-typedef u64_t v_u64_t;
-typedef s64_t v_s64_t;
+/* Shorthand helpers for declaring constants. */
+# define V2(X) { X, X }
+# define V4(X) { X, X, X, X }
+# define V8(X) { X, X, X, X, X, X, X, X }
static inline int
-v_lanes32 (void)
-{
- return 1;
-}
-
-static inline v_f32_t
-v_f32 (f32_t x)
-{
- return x;
-}
-static inline v_u32_t
-v_u32 (u32_t x)
-{
- return x;
-}
-static inline v_s32_t
-v_s32 (s32_t x)
-{
- return x;
-}
-
-static inline f32_t
-v_get_f32 (v_f32_t x, int i)
-{
- return x;
-}
-static inline u32_t
-v_get_u32 (v_u32_t x, int i)
-{
- return x;
-}
-static inline s32_t
-v_get_s32 (v_s32_t x, int i)
+v_any_u16h (uint16x4_t x)
{
- return x;
+ return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
}
-static inline void
-v_set_f32 (v_f32_t *x, int i, f32_t v)
+static inline float32x4_t
+v_f32 (float x)
{
- *x = v;
+ return (float32x4_t) V4 (x);
}
-static inline void
-v_set_u32 (v_u32_t *x, int i, u32_t v)
+static inline uint32x4_t
+v_u32 (uint32_t x)
{
- *x = v;
+ return (uint32x4_t) V4 (x);
}
-static inline void
-v_set_s32 (v_s32_t *x, int i, s32_t v)
+static inline int32x4_t
+v_s32 (int32_t x)
{
- *x = v;
+ return (int32x4_t) V4 (x);
}
-/* true if any elements of a v_cond result is non-zero. */
+/* true if any element of a vector compare result is non-zero. */
static inline int
-v_any_u32 (v_u32_t x)
-{
- return x != 0;
-}
-/* to wrap the result of relational operators. */
-static inline v_u32_t
-v_cond_u32 (v_u32_t x)
-{
- return x ? -1 : 0;
-}
-static inline v_f32_t
-v_abs_f32 (v_f32_t x)
-{
- return __builtin_fabsf (x);
-}
-static inline v_u32_t
-v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y)
-{
- return (y & ~m) | (x & m);
-}
-static inline v_u32_t
-v_cagt_f32 (v_f32_t x, v_f32_t y)
-{
- return fabsf (x) > fabsf (y);
-}
-/* to wrap |x| >= |y|. */
-static inline v_u32_t
-v_cage_f32 (v_f32_t x, v_f32_t y)
-{
- return fabsf (x) >= fabsf (y);
-}
-static inline v_u32_t
-v_calt_f32 (v_f32_t x, v_f32_t y)
-{
- return fabsf (x) < fabsf (y);
-}
-static inline v_f32_t
-v_div_f32 (v_f32_t x, v_f32_t y)
-{
- return x / y;
-}
-static inline v_f32_t
-v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
-{
- return __builtin_fmaf (x, y, z);
-}
-static inline v_f32_t
-v_round_f32 (v_f32_t x)
-{
- return __builtin_roundf (x);
-}
-static inline v_s32_t
-v_round_s32 (v_f32_t x)
-{
- return __builtin_lroundf (x); /* relies on -fno-math-errno. */
-}
-static inline v_f32_t
-v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
-{
- return p ? x : y;
-}
-static inline v_u32_t
-v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y)
-{
- return p ? x : y;
-}
-static inline v_f32_t
-v_sqrt_f32 (v_f32_t x)
-{
- return __builtin_sqrtf (x);
-}
-/* convert to type1 from type2. */
-static inline v_f32_t
-v_to_f32_s32 (v_s32_t x)
-{
- return x;
-}
-static inline v_s32_t
-v_to_s32_f32 (v_f32_t x)
-{
- return x;
-}
-static inline v_f32_t
-v_to_f32_u32 (v_u32_t x)
-{
- return x;
-}
-/* reinterpret as type1 from type2. */
-static inline v_u32_t
-v_as_u32_f32 (v_f32_t x)
-{
- union { v_f32_t f; v_u32_t u; } r = {x};
- return r.u;
-}
-static inline v_s32_t
-v_as_s32_f32 (v_f32_t x)
-{
- union
- {
- v_f32_t f;
- v_s32_t u;
- } r = {x};
- return r.u;
-}
-static inline v_f32_t
-v_as_f32_u32 (v_u32_t x)
-{
- union { v_u32_t u; v_f32_t f; } r = {x};
- return r.f;
-}
-static inline v_s32_t
-v_as_s32_u32 (v_u32_t x)
-{
- union { v_u32_t u; v_s32_t i; } r = {x};
- return r.i;
-}
-static inline v_u32_t
-v_as_u32_s32 (v_s32_t x)
-{
- union { v_s32_t i; v_u32_t u; } r = {x};
- return r.u;
-}
-static inline v_f32_t
-v_lookup_f32 (const f32_t *tab, v_u32_t idx)
-{
- return tab[idx];
-}
-static inline v_u32_t
-v_lookup_u32 (const u32_t *tab, v_u32_t idx)
-{
- return tab[idx];
-}
-static inline v_f32_t
-v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
-{
- return f (x);
-}
-static inline v_f32_t
-v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
- v_u32_t p)
-{
- return f (x1, x2);
-}
-
-static inline int
-v_lanes64 (void)
-{
- return 1;
-}
-static inline v_f64_t
-v_f64 (f64_t x)
-{
- return x;
-}
-static inline v_u64_t
-v_u64 (u64_t x)
-{
- return x;
-}
-static inline v_s64_t
-v_s64 (s64_t x)
-{
- return x;
-}
-static inline f64_t
-v_get_f64 (v_f64_t x, int i)
-{
- return x;
-}
-static inline void
-v_set_f64 (v_f64_t *x, int i, f64_t v)
-{
- *x = v;
-}
-/* true if any elements of a v_cond result is non-zero. */
-static inline int
-v_any_u64 (v_u64_t x)
-{
- return x != 0;
-}
-/* true if all elements of a v_cond result is non-zero. */
-static inline int
-v_all_u64 (v_u64_t x)
-{
- return x;
-}
-/* to wrap the result of relational operators. */
-static inline v_u64_t
-v_cond_u64 (v_u64_t x)
-{
- return x ? -1 : 0;
-}
-static inline v_f64_t
-v_abs_f64 (v_f64_t x)
-{
- return __builtin_fabs (x);
-}
-static inline v_u64_t
-v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y)
-{
- return (y & ~m) | (x & m);
-}
-static inline v_u64_t
-v_cagt_f64 (v_f64_t x, v_f64_t y)
-{
- return fabs (x) > fabs (y);
-}
-static inline v_f64_t
-v_div_f64 (v_f64_t x, v_f64_t y)
-{
- return x / y;
-}
-static inline v_f64_t
-v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
-{
- return __builtin_fma (x, y, z);
-}
-static inline v_f64_t
-v_min_f64(v_f64_t x, v_f64_t y) {
- return x < y ? x : y;
-}
-static inline v_f64_t
-v_round_f64 (v_f64_t x)
-{
- return __builtin_round (x);
-}
-static inline v_f64_t
-v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
-{
- return p ? x : y;
-}
-static inline v_f64_t
-v_sqrt_f64 (v_f64_t x)
-{
- return __builtin_sqrt (x);
-}
-static inline v_s64_t
-v_round_s64 (v_f64_t x)
-{
- return __builtin_lround (x); /* relies on -fno-math-errno. */
-}
-static inline v_u64_t
-v_trunc_u64 (v_f64_t x)
-{
- return __builtin_trunc (x);
-}
-/* convert to type1 from type2. */
-static inline v_f64_t
-v_to_f64_s64 (v_s64_t x)
-{
- return x;
-}
-static inline v_f64_t
-v_to_f64_u64 (v_u64_t x)
-{
- return x;
-}
-
-static inline v_s64_t
-v_to_s64_f64 (v_f64_t x)
-{
- return x;
-}
-/* reinterpret as type1 from type2. */
-static inline v_u64_t
-v_as_u64_f64 (v_f64_t x)
-{
- union { v_f64_t f; v_u64_t u; } r = {x};
- return r.u;
-}
-static inline v_f64_t
-v_as_f64_u64 (v_u64_t x)
-{
- union { v_u64_t u; v_f64_t f; } r = {x};
- return r.f;
-}
-static inline v_s64_t
-v_as_s64_u64 (v_u64_t x)
-{
- union { v_u64_t u; v_s64_t i; } r = {x};
- return r.i;
-}
-static inline v_u64_t
-v_as_u64_s64 (v_s64_t x)
-{
- union { v_s64_t i; v_u64_t u; } r = {x};
- return r.u;
-}
-static inline v_f64_t
-v_lookup_f64 (const f64_t *tab, v_u64_t idx)
-{
- return tab[idx];
-}
-static inline v_u64_t
-v_lookup_u64 (const u64_t *tab, v_u64_t idx)
-{
- return tab[idx];
-}
-static inline v_f64_t
-v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
-{
- return f (x);
-}
-static inline v_f64_t
-v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y,
- v_u64_t p)
-{
- return f (x1, x2);
-}
-
-#elif __aarch64__
-#define V_SUPPORTED 1
-#include <arm_neon.h>
-typedef float32x4_t v_f32_t;
-typedef uint32x4_t v_u32_t;
-typedef int32x4_t v_s32_t;
-typedef float64x2_t v_f64_t;
-typedef uint64x2_t v_u64_t;
-typedef int64x2_t v_s64_t;
-
-static inline int
-v_lanes32 (void)
-{
- return 4;
-}
-
-static inline v_f32_t
-v_f32 (f32_t x)
-{
- return (v_f32_t){x, x, x, x};
-}
-static inline v_u32_t
-v_u32 (u32_t x)
-{
- return (v_u32_t){x, x, x, x};
-}
-static inline v_s32_t
-v_s32 (s32_t x)
-{
- return (v_s32_t){x, x, x, x};
-}
-
-static inline f32_t
-v_get_f32 (v_f32_t x, int i)
-{
- return x[i];
-}
-static inline u32_t
-v_get_u32 (v_u32_t x, int i)
-{
- return x[i];
-}
-static inline s32_t
-v_get_s32 (v_s32_t x, int i)
-{
- return x[i];
-}
-
-static inline void
-v_set_f32 (v_f32_t *x, int i, f32_t v)
-{
- (*x)[i] = v;
-}
-static inline void
-v_set_u32 (v_u32_t *x, int i, u32_t v)
-{
- (*x)[i] = v;
-}
-static inline void
-v_set_s32 (v_s32_t *x, int i, s32_t v)
-{
- (*x)[i] = v;
-}
-
-/* true if any elements of a v_cond result is non-zero. */
-static inline int
-v_any_u32 (v_u32_t x)
+v_any_u32 (uint32x4_t x)
{
/* assume elements in x are either 0 or -1u. */
return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
}
-/* to wrap the result of relational operators. */
-static inline v_u32_t
-v_cond_u32 (v_u32_t x)
-{
- return x;
-}
-static inline v_f32_t
-v_abs_f32 (v_f32_t x)
-{
- return vabsq_f32 (x);
-}
-static inline v_u32_t
-v_bsl_u32 (v_u32_t m, v_u32_t x, v_u32_t y)
-{
- return vbslq_u32 (m, x, y);
-}
-static inline v_u32_t
-v_cagt_f32 (v_f32_t x, v_f32_t y)
-{
- return vcagtq_f32 (x, y);
-}
-/* to wrap |x| >= |y|. */
-static inline v_u32_t
-v_cage_f32 (v_f32_t x, v_f32_t y)
-{
- return vcageq_f32 (x, y);
-}
-static inline v_u32_t
-v_calt_f32 (v_f32_t x, v_f32_t y)
-{
- return vcaltq_f32 (x, y);
-}
-static inline v_f32_t
-v_div_f32 (v_f32_t x, v_f32_t y)
-{
- return vdivq_f32 (x, y);
-}
-static inline v_f32_t
-v_fma_f32 (v_f32_t x, v_f32_t y, v_f32_t z)
-{
- return vfmaq_f32 (z, x, y);
-}
-static inline v_f32_t
-v_round_f32 (v_f32_t x)
-{
- return vrndaq_f32 (x);
-}
-static inline v_s32_t
-v_round_s32 (v_f32_t x)
-{
- return vcvtaq_s32_f32 (x);
-}
-static inline v_f32_t
-v_sel_f32 (v_u32_t p, v_f32_t x, v_f32_t y)
-{
- return vbslq_f32 (p, x, y);
-}
-static inline v_u32_t
-v_sel_u32 (v_u32_t p, v_u32_t x, v_u32_t y)
-{
- return vbslq_u32 (p, x, y);
-}
-static inline v_f32_t
-v_sqrt_f32 (v_f32_t x)
-{
- return vsqrtq_f32 (x);
-}
-/* convert to type1 from type2. */
-static inline v_f32_t
-v_to_f32_s32 (v_s32_t x)
-{
- return (v_f32_t){x[0], x[1], x[2], x[3]};
-}
-static inline v_s32_t
-v_to_s32_f32 (v_f32_t x)
-{
- return vcvtq_s32_f32 (x);
-}
-static inline v_f32_t
-v_to_f32_u32 (v_u32_t x)
-{
- return (v_f32_t){x[0], x[1], x[2], x[3]};
-}
-/* reinterpret as type1 from type2. */
-static inline v_u32_t
-v_as_u32_f32 (v_f32_t x)
-{
- union { v_f32_t f; v_u32_t u; } r = {x};
- return r.u;
-}
-static inline v_s32_t
-v_as_s32_f32 (v_f32_t x)
-{
- union
- {
- v_f32_t f;
- v_s32_t u;
- } r = {x};
- return r.u;
-}
-static inline v_f32_t
-v_as_f32_u32 (v_u32_t x)
-{
- union { v_u32_t u; v_f32_t f; } r = {x};
- return r.f;
-}
-static inline v_s32_t
-v_as_s32_u32 (v_u32_t x)
+static inline int
+v_any_u32h (uint32x2_t x)
{
- union { v_u32_t u; v_s32_t i; } r = {x};
- return r.i;
+ return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;
}
-static inline v_u32_t
-v_as_u32_s32 (v_s32_t x)
+static inline float32x4_t
+v_lookup_f32 (const float *tab, uint32x4_t idx)
{
- union { v_s32_t i; v_u32_t u; } r = {x};
- return r.u;
+ return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
}
-static inline v_f32_t
-v_lookup_f32 (const f32_t *tab, v_u32_t idx)
+static inline uint32x4_t
+v_lookup_u32 (const uint32_t *tab, uint32x4_t idx)
{
- return (v_f32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+ return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] };
}
-static inline v_u32_t
-v_lookup_u32 (const u32_t *tab, v_u32_t idx)
+static inline float32x4_t
+v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p)
{
- return (v_u32_t){tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]};
+ return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
+ p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] };
}
-static inline v_f32_t
-v_call_f32 (f32_t (*f) (f32_t), v_f32_t x, v_f32_t y, v_u32_t p)
+static inline float32x4_t
+v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2,
+ float32x4_t y, uint32x4_t p)
{
- return (v_f32_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1],
- p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3]};
+ return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0],
+ p[1] ? f (x1[1], x2[1]) : y[1],
+ p[2] ? f (x1[2], x2[2]) : y[2],
+ p[3] ? f (x1[3], x2[3]) : y[3] };
}
-static inline v_f32_t
-v_call2_f32 (f32_t (*f) (f32_t, f32_t), v_f32_t x1, v_f32_t x2, v_f32_t y,
- v_u32_t p)
+static inline float32x4_t
+v_zerofy_f32 (float32x4_t x, uint32x4_t mask)
{
- return (
- v_f32_t){p[0] ? f (x1[0], x2[0]) : y[0], p[1] ? f (x1[1], x2[1]) : y[1],
- p[2] ? f (x1[2], x2[2]) : y[2], p[3] ? f (x1[3], x2[3]) : y[3]};
+ return vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), mask));
}
-static inline int
-v_lanes64 (void)
+static inline float64x2_t
+v_f64 (double x)
{
- return 2;
+ return (float64x2_t) V2 (x);
}
-static inline v_f64_t
-v_f64 (f64_t x)
+static inline uint64x2_t
+v_u64 (uint64_t x)
{
- return (v_f64_t){x, x};
+ return (uint64x2_t) V2 (x);
}
-static inline v_u64_t
-v_u64 (u64_t x)
+static inline int64x2_t
+v_s64 (int64_t x)
{
- return (v_u64_t){x, x};
+ return (int64x2_t) V2 (x);
}
-static inline v_s64_t
-v_s64 (s64_t x)
-{
- return (v_s64_t){x, x};
-}
-static inline f64_t
-v_get_f64 (v_f64_t x, int i)
-{
- return x[i];
-}
-static inline void
-v_set_f64 (v_f64_t *x, int i, f64_t v)
-{
- (*x)[i] = v;
-}
-/* true if any elements of a v_cond result is non-zero. */
+
+/* true if any element of a vector compare result is non-zero. */
static inline int
-v_any_u64 (v_u64_t x)
+v_any_u64 (uint64x2_t x)
{
/* assume elements in x are either 0 or -1u. */
return vpaddd_u64 (x) != 0;
}
-/* true if all elements of a v_cond result is 1. */
+/* true if all elements of a vector compare result are set (-1u). */
static inline int
-v_all_u64 (v_u64_t x)
+v_all_u64 (uint64x2_t x)
{
/* assume elements in x are either 0 or -1u. */
return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2;
}
-/* to wrap the result of relational operators. */
-static inline v_u64_t
-v_cond_u64 (v_u64_t x)
-{
- return x;
-}
-static inline v_f64_t
-v_abs_f64 (v_f64_t x)
-{
- return vabsq_f64 (x);
-}
-static inline v_u64_t
-v_bsl_u64 (v_u64_t m, v_u64_t x, v_u64_t y)
-{
- return vbslq_u64 (m, x, y);
-}
-static inline v_u64_t
-v_cagt_f64 (v_f64_t x, v_f64_t y)
+static inline float64x2_t
+v_lookup_f64 (const double *tab, uint64x2_t idx)
{
- return vcagtq_f64 (x, y);
+ return (float64x2_t){ tab[idx[0]], tab[idx[1]] };
}
-static inline v_f64_t
-v_div_f64 (v_f64_t x, v_f64_t y)
+static inline uint64x2_t
+v_lookup_u64 (const uint64_t *tab, uint64x2_t idx)
{
- return vdivq_f64 (x, y);
+ return (uint64x2_t){ tab[idx[0]], tab[idx[1]] };
}
-static inline v_f64_t
-v_fma_f64 (v_f64_t x, v_f64_t y, v_f64_t z)
-{
- return vfmaq_f64 (z, x, y);
-}
-static inline v_f64_t
-v_min_f64(v_f64_t x, v_f64_t y) {
- return vminq_f64(x, y);
-}
-static inline v_f64_t
-v_round_f64 (v_f64_t x)
-{
- return vrndaq_f64 (x);
-}
-static inline v_f64_t
-v_sel_f64 (v_u64_t p, v_f64_t x, v_f64_t y)
-{
- return vbslq_f64 (p, x, y);
-}
-static inline v_f64_t
-v_sqrt_f64 (v_f64_t x)
-{
- return vsqrtq_f64 (x);
-}
-static inline v_s64_t
-v_round_s64 (v_f64_t x)
-{
- return vcvtaq_s64_f64 (x);
-}
-static inline v_u64_t
-v_trunc_u64 (v_f64_t x)
-{
- return vcvtq_u64_f64 (x);
-}
-/* convert to type1 from type2. */
-static inline v_f64_t
-v_to_f64_s64 (v_s64_t x)
-{
- return (v_f64_t){x[0], x[1]};
-}
-static inline v_f64_t
-v_to_f64_u64 (v_u64_t x)
-{
- return (v_f64_t){x[0], x[1]};
-}
-static inline v_s64_t
-v_to_s64_f64 (v_f64_t x)
-{
- return vcvtq_s64_f64 (x);
-}
-/* reinterpret as type1 from type2. */
-static inline v_u64_t
-v_as_u64_f64 (v_f64_t x)
-{
- union { v_f64_t f; v_u64_t u; } r = {x};
- return r.u;
-}
-static inline v_f64_t
-v_as_f64_u64 (v_u64_t x)
-{
- union { v_u64_t u; v_f64_t f; } r = {x};
- return r.f;
-}
-static inline v_s64_t
-v_as_s64_u64 (v_u64_t x)
-{
- union { v_u64_t u; v_s64_t i; } r = {x};
- return r.i;
-}
-static inline v_u64_t
-v_as_u64_s64 (v_s64_t x)
-{
- union { v_s64_t i; v_u64_t u; } r = {x};
- return r.u;
-}
-static inline v_f64_t
-v_lookup_f64 (const f64_t *tab, v_u64_t idx)
-{
- return (v_f64_t){tab[idx[0]], tab[idx[1]]};
-}
-static inline v_u64_t
-v_lookup_u64 (const u64_t *tab, v_u64_t idx)
+
+static inline float64x2_t
+v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p)
{
- return (v_u64_t){tab[idx[0]], tab[idx[1]]};
+ double p1 = p[1];
+ double x1 = x[1];
+ if (likely (p[0]))
+ y[0] = f (x[0]);
+ if (likely (p1))
+ y[1] = f (x1);
+ return y;
}
-static inline v_f64_t
-v_call_f64 (f64_t (*f) (f64_t), v_f64_t x, v_f64_t y, v_u64_t p)
+
+static inline float64x2_t
+v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2,
+ float64x2_t y, uint64x2_t p)
{
- return (v_f64_t){p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1]};
+ double p1 = p[1];
+ double x1h = x1[1];
+ double x2h = x2[1];
+ if (likely (p[0]))
+ y[0] = f (x1[0], x2[0]);
+ if (likely (p1))
+ y[1] = f (x1h, x2h);
+ return y;
}
-static inline v_f64_t
-v_call2_f64 (f64_t (*f) (f64_t, f64_t), v_f64_t x1, v_f64_t x2, v_f64_t y,
- v_u64_t p)
+static inline float64x2_t
+v_zerofy_f64 (float64x2_t x, uint64x2_t mask)
{
- return (v_f64_t){p[0] ? f (x1[0], x2[0]) : y[0],
- p[1] ? f (x1[1], x2[1]) : y[1]};
+ return vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), mask));
}
-#endif
+# endif
#endif
+
#endif
diff --git a/pl/math/v_pow_1u5.c b/pl/math/v_pow_1u5.c
new file mode 100644
index 000000000000..9053347d4e35
--- /dev/null
+++ b/pl/math/v_pow_1u5.c
@@ -0,0 +1,259 @@
+/*
+ * Double-precision vector pow function.
+ *
+ * Copyright (c) 2020-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+/* Defines parameters of the approximation and scalar fallback. */
+#include "finite_pow.h"
+
+#define VecSmallExp v_u64 (SmallExp)
+#define VecThresExp v_u64 (ThresExp)
+
+#define VecSmallPowX v_u64 (SmallPowX)
+#define VecThresPowX v_u64 (ThresPowX)
+#define VecSmallPowY v_u64 (SmallPowY)
+#define VecThresPowY v_u64 (ThresPowY)
+
+static const struct data
+{
+ float64x2_t log_poly[7];
+ float64x2_t exp_poly[3];
+ float64x2_t ln2_hi, ln2_lo;
+ float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n;
+} data = {
+ /* Coefficients copied from v_pow_log_data.c
+ relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8]
+ Coefficients are scaled to match the scaling during evaluation. */
+ .log_poly = { V2 (-0x1p-1), V2 (0x1.555555555556p-2 * -2),
+ V2 (-0x1.0000000000006p-2 * -2), V2 (0x1.999999959554ep-3 * 4),
+ V2 (-0x1.555555529a47ap-3 * 4), V2 (0x1.2495b9b4845e9p-3 * -8),
+ V2 (-0x1.0002b8b263fc3p-3 * -8) },
+ .ln2_hi = V2 (0x1.62e42fefa3800p-1),
+ .ln2_lo = V2 (0x1.ef35793c76730p-45),
+ /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549
+ (0.550 without fma) if |x| < ln2/512. */
+ .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3),
+ V2 (0x1.5555576a5adcep-5) },
+ .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. */
+ .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */
+ .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */
+ .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45),
+};
+
+#define A(i) data.log_poly[i]
+#define C(i) data.exp_poly[i]
+
+/* This version implements an algorithm close to AOR scalar pow but
+ - does not implement the trick in the exp's specialcase subroutine to avoid
+ double-rounding,
+ - does not use a tail in the exponential core computation,
+ - and pow's exp polynomial order and table bits might differ.
+
+ Maximum measured error is 1.04 ULPs:
+ _ZGVnN2vv_pow(0x1.024a3e56b3c3p-136, 0x1.87910248b58acp-13)
+ got 0x1.f71162f473251p-1
+ want 0x1.f71162f473252p-1. */
+
+/* Per-lane table[] gather; the index is taken from the bits of I. */
+static inline float64x2_t
+v_masked_lookup_f64 (const double *table, uint64x2_t i)
+{
+ uint64_t j0 = (i[0] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1);
+ uint64_t j1 = (i[1] >> (52 - V_POW_LOG_TABLE_BITS)) & (N_LOG - 1);
+ return (float64x2_t){ table[j0], table[j1] };
+}
+
+/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
+ additional 15 bits precision. IX is the bit representation of x, but
+ normalized in the subnormal range using the sign bit for the exponent. */
+static inline float64x2_t
+v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d)
+{
+ /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
+ The range is split into N subintervals.
+ The ith subinterval contains z and c is near its center. */
+ uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off));
+ int64x2_t k
+ = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
+ uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52)));
+ float64x2_t z = vreinterpretq_f64_u64 (iz);
+ float64x2_t kd = vcvtq_f64_s64 (k);
+ /* log(x) = k*Ln2 + log(c) + log1p(z/c-1); all three lookups share tmp. */
+ float64x2_t invc = v_masked_lookup_f64 (__v_pow_log_data.invc, tmp);
+ float64x2_t logc = v_masked_lookup_f64 (__v_pow_log_data.logc, tmp);
+ float64x2_t logctail = v_masked_lookup_f64 (__v_pow_log_data.logctail, tmp);
+ /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
+ |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
+ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc);
+ /* k*Ln2 + log(c) + r, with low-order terms collected in lo1 and lo2. */
+ float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi);
+ float64x2_t t2 = vaddq_f64 (t1, r);
+ float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo);
+ float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r); /* (t1 - t2) + r. */
+ /* Evaluation is optimized assuming superscalar pipelined execution. */
+ float64x2_t ar = vmulq_f64 (A (0), r);
+ float64x2_t ar2 = vmulq_f64 (r, ar);
+ float64x2_t ar3 = vmulq_f64 (r, ar2);
+ /* k*Ln2 + log(c) + r + A[0]*r*r. */
+ float64x2_t hi = vaddq_f64 (t2, ar2);
+ float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r); /* ar*r - ar2, fused. */
+ float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2); /* (t2 - hi) + ar2. */
+ /* p = log1p(r) - r - A[0]*r*r. */
+ float64x2_t a56 = vfmaq_f64 (A (5), r, A (6));
+ float64x2_t a34 = vfmaq_f64 (A (3), r, A (4));
+ float64x2_t a12 = vfmaq_f64 (A (1), r, A (2));
+ float64x2_t p = vfmaq_f64 (a34, ar2, a56);
+ p = vfmaq_f64 (a12, ar2, p);
+ p = vmulq_f64 (ar3, p);
+ float64x2_t lo
+ = vaddq_f64 (vaddq_f64 (vaddq_f64 (vaddq_f64 (lo1, lo2), lo3), lo4), p);
+ float64x2_t y = vaddq_f64 (hi, lo);
+ *tail = vaddq_f64 (vsubq_f64 (hi, y), lo); /* (hi - y) + lo is handed back via TAIL. */
+ return y;
+}
+
+/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */
+static inline float64x2_t
+v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d)
+{
+ /* Fallback to the scalar exp_nosignbias for all lanes if any lane
+ contains value of x s.t. |x| <= 2^-54 or >= 512. */
+ uint64x2_t abstop
+ = vandq_u64 (vshrq_n_u64 (vreinterpretq_u64_f64 (x), 52), v_u64 (0x7ff));
+ uint64x2_t uoflowx
+ = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp);
+ if (unlikely (v_any_u64 (uoflowx)))
+ return v_call2_f64 (exp_nosignbias, x, xtail, x, v_u64 (-1));
+ /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
+ /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */
+ float64x2_t z = vmulq_f64 (d->inv_ln2_n, x);
+ /* z - kd is in [-1, 1] in non-nearest rounding modes. */
+ float64x2_t kd = vaddq_f64 (z, d->shift);
+ uint64x2_t ki = vreinterpretq_u64_f64 (kd); /* bits taken before the shift is removed. */
+ kd = vsubq_f64 (kd, d->shift);
+ float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n);
+ r = vfmsq_f64 (r, kd, d->ln2_lo_n);
+ /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
+ r = vaddq_f64 (r, xtail);
+ /* 2^(k/N) ~= scale: low bits of ki index SBits, the rest is shifted into TOP. */
+ uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1));
+ uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS);
+ /* This is only a valid scale when -1023*N < k < 1024*N. */
+ uint64x2_t sbits = v_lookup_u64 (SBits, idx);
+ sbits = vaddq_u64 (sbits, top);
+ /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t tmp = vfmaq_f64 (C (1), r, C (2));
+ tmp = vfmaq_f64 (C (0), r, tmp);
+ tmp = vfmaq_f64 (r, r2, tmp);
+ float64x2_t scale = vreinterpretq_f64_u64 (sbits);
+ /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+ is no spurious underflow here even without fma. */
+ return vfmaq_f64 (scale, scale, tmp);
+}
+
+float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y)
+{
+ const struct data *d = ptr_barrier (&data);
+ /* Case of x <= 0 is too complicated to be vectorised efficiently here,
+ fallback to scalar pow for all lanes if any lane has x <= 0 as a signed int. */
+ if (v_any_u64 (vclezq_s64 (vreinterpretq_s64_f64 (x))))
+ return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1));
+
+ uint64x2_t vix = vreinterpretq_u64_f64 (x);
+ uint64x2_t viy = vreinterpretq_u64_f64 (y);
+ uint64x2_t vtopx = vshrq_n_u64 (vix, 52);
+ uint64x2_t vtopy = vshrq_n_u64 (viy, 52);
+ uint64x2_t vabstopx = vandq_u64 (vtopx, v_u64 (0x7ff));
+ uint64x2_t vabstopy = vandq_u64 (vtopy, v_u64 (0x7ff));
+
+ /* Special cases of x or y. */
+#if WANT_SIMD_EXCEPT
+ /* Small or large. */
+ uint64x2_t specialx
+ = vcgeq_u64 (vsubq_u64 (vtopx, VecSmallPowX), VecThresPowX);
+ uint64x2_t specialy
+ = vcgeq_u64 (vsubq_u64 (vabstopy, VecSmallPowY), VecThresPowY);
+#else
+ /* Inf or nan. */
+ uint64x2_t specialx = vcgeq_u64 (vabstopx, v_u64 (0x7ff));
+ uint64x2_t specialy = vcgeq_u64 (vabstopy, v_u64 (0x7ff));
+ /* The case y==0 does not trigger a special case, since in this case it is
+ necessary to fix the result only if x is a signalling nan, which already
+ triggers a special case. We test y==0 directly in the scalar fallback. */
+#endif
+ uint64x2_t special = vorrq_u64 (specialx, specialy);
+ /* Fallback to scalar on all lanes if any lane was flagged special above. */
+ if (unlikely (v_any_u64 (special)))
+ return v_call2_f64 (__pl_finite_pow, x, y, x, v_u64 (-1));
+
+ /* Small cases of x: |x| < 0x1p-126. */
+ uint64x2_t smallx = vcltq_u64 (vabstopx, VecSmallPowX);
+ if (unlikely (v_any_u64 (smallx)))
+ {
+ /* Update ix if top 12 bits of x are 0. */
+ uint64x2_t sub_x = vceqzq_u64 (vtopx);
+ if (unlikely (v_any_u64 (sub_x)))
+ {
+ /* Normalize subnormal x so exponent becomes negative. */
+ uint64x2_t vix_norm
+ = vreinterpretq_u64_f64 (vmulq_f64 (x, v_f64 (0x1p52)));
+ vix_norm = vandq_u64 (vix_norm, v_u64 (0x7fffffffffffffff));
+ vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52));
+ vix = vbslq_u64 (sub_x, vix_norm, vix); /* only the sub_x lanes are replaced. */
+ }
+ }
+
+ /* Vector Log(ix, &lo): vhi is the returned value, vlo the tail. */
+ float64x2_t vlo;
+ float64x2_t vhi = v_log_inline (vix, &vlo, d);
+
+ /* Vector Exp(y_loghi, y_loglo). */
+ float64x2_t vehi = vmulq_f64 (y, vhi);
+ float64x2_t velo = vmulq_f64 (y, vlo);
+ float64x2_t vemi = vfmsq_f64 (vehi, y, vhi); /* vehi - y*vhi, fused. */
+ velo = vsubq_f64 (velo, vemi);
+ return v_exp_inline (vehi, velo, d);
+}
+
+PL_SIG (V, D, 2, pow)
+PL_TEST_ULP (V_NAME_D2 (pow), 0.55)
+PL_TEST_EXPECT_FENV (V_NAME_D2 (pow), WANT_SIMD_EXCEPT)
+/* Wide intervals spanning the whole domain but shared between x and y. */
+#define V_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \
+ PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \
+ PL_TEST_INTERVAL2 (V_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \
+ PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \
+ PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
+#define EXPAND(str) str##000000000
+#define SHL52(str) EXPAND (str)
+V_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
+V_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000)
+V_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000)
+V_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000)
+V_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000)
+V_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000)
+V_POW_INTERVAL2 (0, inf, 0, inf, 1000)
+/* x~1 or y~1. */
+V_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
+V_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
+V_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
+/* around argmaxs of ULP error. */
+V_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
+V_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
+/* x is negative, y is odd or even integer, or y is real not integer. */
+PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
+PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
+PL_TEST_INTERVAL2 (V_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
+PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
+/* 1.0^y. */
+PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
+PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
+PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
+PL_TEST_INTERVAL2 (V_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
diff --git a/math/v_exp_data.c b/pl/math/v_pow_exp_data.c
index 30421da81429..5d921ef648a4 100644
--- a/math/v_exp_data.c
+++ b/pl/math/v_pow_exp_data.c
@@ -1,147 +1,34 @@
/*
- * Lookup table for double-precision e^x vector function.
+ * Shared data between exp, exp2 and pow.
*
- * Copyright (c) 2019, Arm Limited.
+ * Copyright (c) 2018-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
-#include "v_exp.h"
-#if WANT_VMATH
+#include "math_config.h"
-#define N (1 << V_EXP_TABLE_BITS)
+#define N (1 << V_POW_EXP_TABLE_BITS)
-/* 2^(j/N), j=0..N. */
-const u64_t __v_exp_data[] = {
-#if N == 128
-0x3ff0000000000000,
-0x3feff63da9fb3335,
-0x3fefec9a3e778061,
-0x3fefe315e86e7f85,
-0x3fefd9b0d3158574,
-0x3fefd06b29ddf6de,
-0x3fefc74518759bc8,
-0x3fefbe3ecac6f383,
-0x3fefb5586cf9890f,
-0x3fefac922b7247f7,
-0x3fefa3ec32d3d1a2,
-0x3fef9b66affed31b,
-0x3fef9301d0125b51,
-0x3fef8abdc06c31cc,
-0x3fef829aaea92de0,
-0x3fef7a98c8a58e51,
-0x3fef72b83c7d517b,
-0x3fef6af9388c8dea,
-0x3fef635beb6fcb75,
-0x3fef5be084045cd4,
-0x3fef54873168b9aa,
-0x3fef4d5022fcd91d,
-0x3fef463b88628cd6,
-0x3fef3f49917ddc96,
-0x3fef387a6e756238,
-0x3fef31ce4fb2a63f,
-0x3fef2b4565e27cdd,
-0x3fef24dfe1f56381,
-0x3fef1e9df51fdee1,
-0x3fef187fd0dad990,
-0x3fef1285a6e4030b,
-0x3fef0cafa93e2f56,
-0x3fef06fe0a31b715,
-0x3fef0170fc4cd831,
-0x3feefc08b26416ff,
-0x3feef6c55f929ff1,
-0x3feef1a7373aa9cb,
-0x3feeecae6d05d866,
-0x3feee7db34e59ff7,
-0x3feee32dc313a8e5,
-0x3feedea64c123422,
-0x3feeda4504ac801c,
-0x3feed60a21f72e2a,
-0x3feed1f5d950a897,
-0x3feece086061892d,
-0x3feeca41ed1d0057,
-0x3feec6a2b5c13cd0,
-0x3feec32af0d7d3de,
-0x3feebfdad5362a27,
-0x3feebcb299fddd0d,
-0x3feeb9b2769d2ca7,
-0x3feeb6daa2cf6642,
-0x3feeb42b569d4f82,
-0x3feeb1a4ca5d920f,
-0x3feeaf4736b527da,
-0x3feead12d497c7fd,
-0x3feeab07dd485429,
-0x3feea9268a5946b7,
-0x3feea76f15ad2148,
-0x3feea5e1b976dc09,
-0x3feea47eb03a5585,
-0x3feea34634ccc320,
-0x3feea23882552225,
-0x3feea155d44ca973,
-0x3feea09e667f3bcd,
-0x3feea012750bdabf,
-0x3fee9fb23c651a2f,
-0x3fee9f7df9519484,
-0x3fee9f75e8ec5f74,
-0x3fee9f9a48a58174,
-0x3fee9feb564267c9,
-0x3feea0694fde5d3f,
-0x3feea11473eb0187,
-0x3feea1ed0130c132,
-0x3feea2f336cf4e62,
-0x3feea427543e1a12,
-0x3feea589994cce13,
-0x3feea71a4623c7ad,
-0x3feea8d99b4492ed,
-0x3feeaac7d98a6699,
-0x3feeace5422aa0db,
-0x3feeaf3216b5448c,
-0x3feeb1ae99157736,
-0x3feeb45b0b91ffc6,
-0x3feeb737b0cdc5e5,
-0x3feeba44cbc8520f,
-0x3feebd829fde4e50,
-0x3feec0f170ca07ba,
-0x3feec49182a3f090,
-0x3feec86319e32323,
-0x3feecc667b5de565,
-0x3feed09bec4a2d33,
-0x3feed503b23e255d,
-0x3feed99e1330b358,
-0x3feede6b5579fdbf,
-0x3feee36bbfd3f37a,
-0x3feee89f995ad3ad,
-0x3feeee07298db666,
-0x3feef3a2b84f15fb,
-0x3feef9728de5593a,
-0x3feeff76f2fb5e47,
-0x3fef05b030a1064a,
-0x3fef0c1e904bc1d2,
-0x3fef12c25bd71e09,
-0x3fef199bdd85529c,
-0x3fef20ab5fffd07a,
-0x3fef27f12e57d14b,
-0x3fef2f6d9406e7b5,
-0x3fef3720dcef9069,
-0x3fef3f0b555dc3fa,
-0x3fef472d4a07897c,
-0x3fef4f87080d89f2,
-0x3fef5818dcfba487,
-0x3fef60e316c98398,
-0x3fef69e603db3285,
-0x3fef7321f301b460,
-0x3fef7c97337b9b5f,
-0x3fef864614f5a129,
-0x3fef902ee78b3ff6,
-0x3fef9a51fbc74c83,
-0x3fefa4afa2a490da,
-0x3fefaf482d8e67f1,
-0x3fefba1bee615a27,
-0x3fefc52b376bba97,
-0x3fefd0765b6e4540,
-0x3fefdbfdad9cbe14,
-0x3fefe7c1819e90d8,
-0x3feff3c22b8f71f1,
-#elif N == 256
+const struct v_pow_exp_data __v_pow_exp_data = {
+// exp polynomial coefficients.
+.poly = {
+// abs error: 1.43*2^-58
+// ulp error: 0.549 (0.550 without fma)
+// if |x| < ln2/512
+0x1.fffffffffffd4p-2,
+0x1.5555571d6ef9p-3,
+0x1.5555576a5adcep-5,
+},
+// N/ln2
+.n_over_ln2 = 0x1.71547652b82fep0 * N,
+// ln2/N
+.ln2_over_n_hi = 0x1.62e42fefc0000p-9,
+.ln2_over_n_lo = -0x1.c610ca86c3899p-45,
+// Used for rounding to nearest integer without using intrinsics.
+.shift = 0x1.8p52,
+// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
+// sbits[k] = asuint64(H[k]) - (k << 52)/N
+.sbits = {
0x3ff0000000000000,
0x3feffb1afa5abcbf,
0x3feff63da9fb3335,
@@ -398,6 +285,5 @@ const u64_t __v_exp_data[] = {
0x3fefedba3692d514,
0x3feff3c22b8f71f1,
0x3feff9d96b2a23d9,
-#endif
+},
};
-#endif
diff --git a/pl/math/v_pow_log_data.c b/pl/math/v_pow_log_data.c
new file mode 100644
index 000000000000..036faa5c97c1
--- /dev/null
+++ b/pl/math/v_pow_log_data.c
@@ -0,0 +1,174 @@
+/*
+ * Data for the log part of pow.
+ *
+ * Copyright (c) 2018-2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+#define N (1 << V_POW_LOG_TABLE_BITS)
+
+/* Algorithm:
+
+ x = 2^k z
+ log(x) = k ln2 + log(c) + log(z/c)
+ log(z/c) = poly(z/c - 1)
+
+ where z is in [0x1.69555p-1; 0x1.69555p0] which is split into N subintervals
+ and z falls into the ith one, then table entries are computed as
+
+ tab[i].invc = 1/c
+ tab[i].logc = round(0x1p43*log(c))/0x1p43
+ tab[i].logctail = (double)(log(c) - logc)
+
+ where c is chosen near the center of the subinterval such that 1/c has only
+ a few precision bits so z/c - 1 is exactly representible as double:
+
+ 1/c = center < 1 ? round(N/center)/N : round(2*N/center)/N/2
+
+ Note: |z/c - 1| < 1/N for the chosen c, |log(c) - logc - logctail| <
+ 0x1p-97, the last few bits of logc are rounded away so k*ln2hi + logc has no
+ rounding error and the interval for z is selected such that near x == 1,
+ where log(x) is tiny, large cancellation error is avoided in
+ logc + poly(z/c - 1). */
+const struct v_pow_log_data __v_pow_log_data = {
+ /* relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8]
+ Coefficients are scaled to match the scaling during evaluation. */
+ .poly = { -0x1p-1, -0x1.555555555556p-1, 0x1.0000000000006p-1,
+ 0x1.999999959554ep-1, -0x1.555555529a47ap-1, -0x1.2495b9b4845e9p0,
+ 0x1.0002b8b263fc3p0, },
+ .ln2_hi = 0x1.62e42fefa3800p-1,
+ .ln2_lo = 0x1.ef35793c76730p-45,
+ .invc = { 0x1.6a00000000000p+0, 0x1.6800000000000p+0, 0x1.6600000000000p+0,
+ 0x1.6400000000000p+0, 0x1.6200000000000p+0, 0x1.6000000000000p+0,
+ 0x1.5e00000000000p+0, 0x1.5c00000000000p+0, 0x1.5a00000000000p+0,
+ 0x1.5800000000000p+0, 0x1.5600000000000p+0, 0x1.5600000000000p+0,
+ 0x1.5400000000000p+0, 0x1.5200000000000p+0, 0x1.5000000000000p+0,
+ 0x1.4e00000000000p+0, 0x1.4c00000000000p+0, 0x1.4a00000000000p+0,
+ 0x1.4a00000000000p+0, 0x1.4800000000000p+0, 0x1.4600000000000p+0,
+ 0x1.4400000000000p+0, 0x1.4200000000000p+0, 0x1.4000000000000p+0,
+ 0x1.4000000000000p+0, 0x1.3e00000000000p+0, 0x1.3c00000000000p+0,
+ 0x1.3a00000000000p+0, 0x1.3a00000000000p+0, 0x1.3800000000000p+0,
+ 0x1.3600000000000p+0, 0x1.3400000000000p+0, 0x1.3400000000000p+0,
+ 0x1.3200000000000p+0, 0x1.3000000000000p+0, 0x1.3000000000000p+0,
+ 0x1.2e00000000000p+0, 0x1.2c00000000000p+0, 0x1.2c00000000000p+0,
+ 0x1.2a00000000000p+0, 0x1.2800000000000p+0, 0x1.2600000000000p+0,
+ 0x1.2600000000000p+0, 0x1.2400000000000p+0, 0x1.2400000000000p+0,
+ 0x1.2200000000000p+0, 0x1.2000000000000p+0, 0x1.2000000000000p+0,
+ 0x1.1e00000000000p+0, 0x1.1c00000000000p+0, 0x1.1c00000000000p+0,
+ 0x1.1a00000000000p+0, 0x1.1a00000000000p+0, 0x1.1800000000000p+0,
+ 0x1.1600000000000p+0, 0x1.1600000000000p+0, 0x1.1400000000000p+0,
+ 0x1.1400000000000p+0, 0x1.1200000000000p+0, 0x1.1000000000000p+0,
+ 0x1.1000000000000p+0, 0x1.0e00000000000p+0, 0x1.0e00000000000p+0,
+ 0x1.0c00000000000p+0, 0x1.0c00000000000p+0, 0x1.0a00000000000p+0,
+ 0x1.0a00000000000p+0, 0x1.0800000000000p+0, 0x1.0800000000000p+0,
+ 0x1.0600000000000p+0, 0x1.0400000000000p+0, 0x1.0400000000000p+0,
+ 0x1.0200000000000p+0, 0x1.0200000000000p+0, 0x1.0000000000000p+0,
+ 0x1.0000000000000p+0, 0x1.fc00000000000p-1, 0x1.f800000000000p-1,
+ 0x1.f400000000000p-1, 0x1.f000000000000p-1, 0x1.ec00000000000p-1,
+ 0x1.e800000000000p-1, 0x1.e400000000000p-1, 0x1.e200000000000p-1,
+ 0x1.de00000000000p-1, 0x1.da00000000000p-1, 0x1.d600000000000p-1,
+ 0x1.d400000000000p-1, 0x1.d000000000000p-1, 0x1.cc00000000000p-1,
+ 0x1.ca00000000000p-1, 0x1.c600000000000p-1, 0x1.c400000000000p-1,
+ 0x1.c000000000000p-1, 0x1.be00000000000p-1, 0x1.ba00000000000p-1,
+ 0x1.b800000000000p-1, 0x1.b400000000000p-1, 0x1.b200000000000p-1,
+ 0x1.ae00000000000p-1, 0x1.ac00000000000p-1, 0x1.aa00000000000p-1,
+ 0x1.a600000000000p-1, 0x1.a400000000000p-1, 0x1.a000000000000p-1,
+ 0x1.9e00000000000p-1, 0x1.9c00000000000p-1, 0x1.9a00000000000p-1,
+ 0x1.9600000000000p-1, 0x1.9400000000000p-1, 0x1.9200000000000p-1,
+ 0x1.9000000000000p-1, 0x1.8c00000000000p-1, 0x1.8a00000000000p-1,
+ 0x1.8800000000000p-1, 0x1.8600000000000p-1, 0x1.8400000000000p-1,
+ 0x1.8200000000000p-1, 0x1.7e00000000000p-1, 0x1.7c00000000000p-1,
+ 0x1.7a00000000000p-1, 0x1.7800000000000p-1, 0x1.7600000000000p-1,
+ 0x1.7400000000000p-1, 0x1.7200000000000p-1, 0x1.7000000000000p-1,
+ 0x1.6e00000000000p-1, 0x1.6c00000000000p-1, },
+ .logc
+ = { -0x1.62c82f2b9c800p-2, -0x1.5d1bdbf580800p-2, -0x1.5767717455800p-2,
+ -0x1.51aad872df800p-2, -0x1.4be5f95777800p-2, -0x1.4618bc21c6000p-2,
+ -0x1.404308686a800p-2, -0x1.3a64c55694800p-2, -0x1.347dd9a988000p-2,
+ -0x1.2e8e2bae12000p-2, -0x1.2895a13de8800p-2, -0x1.2895a13de8800p-2,
+ -0x1.22941fbcf7800p-2, -0x1.1c898c1699800p-2, -0x1.1675cababa800p-2,
+ -0x1.1058bf9ae4800p-2, -0x1.0a324e2739000p-2, -0x1.0402594b4d000p-2,
+ -0x1.0402594b4d000p-2, -0x1.fb9186d5e4000p-3, -0x1.ef0adcbdc6000p-3,
+ -0x1.e27076e2af000p-3, -0x1.d5c216b4fc000p-3, -0x1.c8ff7c79aa000p-3,
+ -0x1.c8ff7c79aa000p-3, -0x1.bc286742d9000p-3, -0x1.af3c94e80c000p-3,
+ -0x1.a23bc1fe2b000p-3, -0x1.a23bc1fe2b000p-3, -0x1.9525a9cf45000p-3,
+ -0x1.87fa06520d000p-3, -0x1.7ab890210e000p-3, -0x1.7ab890210e000p-3,
+ -0x1.6d60fe719d000p-3, -0x1.5ff3070a79000p-3, -0x1.5ff3070a79000p-3,
+ -0x1.526e5e3a1b000p-3, -0x1.44d2b6ccb8000p-3, -0x1.44d2b6ccb8000p-3,
+ -0x1.371fc201e9000p-3, -0x1.29552f81ff000p-3, -0x1.1b72ad52f6000p-3,
+ -0x1.1b72ad52f6000p-3, -0x1.0d77e7cd09000p-3, -0x1.0d77e7cd09000p-3,
+ -0x1.fec9131dbe000p-4, -0x1.e27076e2b0000p-4, -0x1.e27076e2b0000p-4,
+ -0x1.c5e548f5bc000p-4, -0x1.a926d3a4ae000p-4, -0x1.a926d3a4ae000p-4,
+ -0x1.8c345d631a000p-4, -0x1.8c345d631a000p-4, -0x1.6f0d28ae56000p-4,
+ -0x1.51b073f062000p-4, -0x1.51b073f062000p-4, -0x1.341d7961be000p-4,
+ -0x1.341d7961be000p-4, -0x1.16536eea38000p-4, -0x1.f0a30c0118000p-5,
+ -0x1.f0a30c0118000p-5, -0x1.b42dd71198000p-5, -0x1.b42dd71198000p-5,
+ -0x1.77458f632c000p-5, -0x1.77458f632c000p-5, -0x1.39e87b9fec000p-5,
+ -0x1.39e87b9fec000p-5, -0x1.f829b0e780000p-6, -0x1.f829b0e780000p-6,
+ -0x1.7b91b07d58000p-6, -0x1.fc0a8b0fc0000p-7, -0x1.fc0a8b0fc0000p-7,
+ -0x1.fe02a6b100000p-8, -0x1.fe02a6b100000p-8, 0x0.0000000000000p+0,
+ 0x0.0000000000000p+0, 0x1.0101575890000p-7, 0x1.0205658938000p-6,
+ 0x1.8492528c90000p-6, 0x1.0415d89e74000p-5, 0x1.466aed42e0000p-5,
+ 0x1.894aa149fc000p-5, 0x1.ccb73cdddc000p-5, 0x1.eea31c006c000p-5,
+ 0x1.1973bd1466000p-4, 0x1.3bdf5a7d1e000p-4, 0x1.5e95a4d97a000p-4,
+ 0x1.700d30aeac000p-4, 0x1.9335e5d594000p-4, 0x1.b6ac88dad6000p-4,
+ 0x1.c885801bc4000p-4, 0x1.ec739830a2000p-4, 0x1.fe89139dbe000p-4,
+ 0x1.1178e8227e000p-3, 0x1.1aa2b7e23f000p-3, 0x1.2d1610c868000p-3,
+ 0x1.365fcb0159000p-3, 0x1.4913d8333b000p-3, 0x1.527e5e4a1b000p-3,
+ 0x1.6574ebe8c1000p-3, 0x1.6f0128b757000p-3, 0x1.7898d85445000p-3,
+ 0x1.8beafeb390000p-3, 0x1.95a5adcf70000p-3, 0x1.a93ed3c8ae000p-3,
+ 0x1.b31d8575bd000p-3, 0x1.bd087383be000p-3, 0x1.c6ffbc6f01000p-3,
+ 0x1.db13db0d49000p-3, 0x1.e530effe71000p-3, 0x1.ef5ade4dd0000p-3,
+ 0x1.f991c6cb3b000p-3, 0x1.07138604d5800p-2, 0x1.0c42d67616000p-2,
+ 0x1.1178e8227e800p-2, 0x1.16b5ccbacf800p-2, 0x1.1bf99635a6800p-2,
+ 0x1.214456d0eb800p-2, 0x1.2bef07cdc9000p-2, 0x1.314f1e1d36000p-2,
+ 0x1.36b6776be1000p-2, 0x1.3c25277333000p-2, 0x1.419b423d5e800p-2,
+ 0x1.4718dc271c800p-2, 0x1.4c9e09e173000p-2, 0x1.522ae0738a000p-2,
+ 0x1.57bf753c8d000p-2, 0x1.5d5bddf596000p-2, },
+ .logctail
+ = { 0x1.ab42428375680p-48, -0x1.ca508d8e0f720p-46, -0x1.362a4d5b6506dp-45,
+ -0x1.684e49eb067d5p-49, -0x1.41b6993293ee0p-47, 0x1.3d82f484c84ccp-46,
+ 0x1.c42f3ed820b3ap-50, 0x1.0b1c686519460p-45, 0x1.5594dd4c58092p-45,
+ 0x1.67b1e99b72bd8p-45, 0x1.5ca14b6cfb03fp-46, 0x1.5ca14b6cfb03fp-46,
+ -0x1.65a242853da76p-46, -0x1.fafbc68e75404p-46, 0x1.f1fc63382a8f0p-46,
+ -0x1.6a8c4fd055a66p-45, -0x1.c6bee7ef4030ep-47, -0x1.036b89ef42d7fp-48,
+ -0x1.036b89ef42d7fp-48, 0x1.d572aab993c87p-47, 0x1.b26b79c86af24p-45,
+ -0x1.72f4f543fff10p-46, 0x1.1ba91bbca681bp-45, 0x1.7794f689f8434p-45,
+ 0x1.7794f689f8434p-45, 0x1.94eb0318bb78fp-46, 0x1.a4e633fcd9066p-52,
+ -0x1.58c64dc46c1eap-45, -0x1.58c64dc46c1eap-45, -0x1.ad1d904c1d4e3p-45,
+ 0x1.bbdbf7fdbfa09p-45, 0x1.bdb9072534a58p-45, 0x1.bdb9072534a58p-45,
+ -0x1.0e46aa3b2e266p-46, -0x1.e9e439f105039p-46, -0x1.e9e439f105039p-46,
+ -0x1.0de8b90075b8fp-45, 0x1.70cc16135783cp-46, 0x1.70cc16135783cp-46,
+ 0x1.178864d27543ap-48, -0x1.48d301771c408p-45, -0x1.e80a41811a396p-45,
+ -0x1.e80a41811a396p-45, 0x1.a699688e85bf4p-47, 0x1.a699688e85bf4p-47,
+ -0x1.575545ca333f2p-45, 0x1.a342c2af0003cp-45, 0x1.a342c2af0003cp-45,
+ -0x1.d0c57585fbe06p-46, 0x1.53935e85baac8p-45, 0x1.53935e85baac8p-45,
+ 0x1.37c294d2f5668p-46, 0x1.37c294d2f5668p-46, -0x1.69737c93373dap-45,
+ 0x1.f025b61c65e57p-46, 0x1.f025b61c65e57p-46, 0x1.c5edaccf913dfp-45,
+ 0x1.c5edaccf913dfp-45, 0x1.47c5e768fa309p-46, 0x1.d599e83368e91p-45,
+ 0x1.d599e83368e91p-45, 0x1.c827ae5d6704cp-46, 0x1.c827ae5d6704cp-46,
+ -0x1.cfc4634f2a1eep-45, -0x1.cfc4634f2a1eep-45, 0x1.502b7f526feaap-48,
+ 0x1.502b7f526feaap-48, -0x1.980267c7e09e4p-45, -0x1.980267c7e09e4p-45,
+ -0x1.88d5493faa639p-45, -0x1.f1e7cf6d3a69cp-50, -0x1.f1e7cf6d3a69cp-50,
+ -0x1.9e23f0dda40e4p-46, -0x1.9e23f0dda40e4p-46, 0x0.0000000000000p+0,
+ 0x0.0000000000000p+0, -0x1.0c76b999d2be8p-46, -0x1.3dc5b06e2f7d2p-45,
+ -0x1.aa0ba325a0c34p-45, 0x1.111c05cf1d753p-47, -0x1.c167375bdfd28p-45,
+ -0x1.97995d05a267dp-46, -0x1.a68f247d82807p-46, -0x1.e113e4fc93b7bp-47,
+ -0x1.5325d560d9e9bp-45, 0x1.cc85ea5db4ed7p-45, -0x1.c69063c5d1d1ep-45,
+ 0x1.c1e8da99ded32p-49, 0x1.3115c3abd47dap-45, -0x1.390802bf768e5p-46,
+ 0x1.646d1c65aacd3p-45, -0x1.dc068afe645e0p-45, -0x1.534d64fa10afdp-45,
+ 0x1.1ef78ce2d07f2p-45, 0x1.ca78e44389934p-45, 0x1.39d6ccb81b4a1p-47,
+ 0x1.62fa8234b7289p-51, 0x1.5837954fdb678p-45, 0x1.633e8e5697dc7p-45,
+ 0x1.9cf8b2c3c2e78p-46, -0x1.5118de59c21e1p-45, -0x1.c661070914305p-46,
+ -0x1.73d54aae92cd1p-47, 0x1.7f22858a0ff6fp-47, -0x1.8724350562169p-45,
+ -0x1.c358d4eace1aap-47, -0x1.d4bc4595412b6p-45, -0x1.1ec72c5962bd2p-48,
+ -0x1.aff2af715b035p-45, 0x1.212276041f430p-51, -0x1.a211565bb8e11p-51,
+ 0x1.bcbecca0cdf30p-46, 0x1.89cdb16ed4e91p-48, 0x1.7188b163ceae9p-45,
+ -0x1.c210e63a5f01cp-45, 0x1.b9acdf7a51681p-45, 0x1.ca6ed5147bdb7p-45,
+ 0x1.a87deba46baeap-47, 0x1.a9cfa4a5004f4p-45, -0x1.8e27ad3213cb8p-45,
+ 0x1.16ecdb0f177c8p-46, 0x1.83b54b606bd5cp-46, 0x1.8e436ec90e09dp-47,
+ -0x1.f27ce0967d675p-45, -0x1.e20891b0ad8a4p-45, 0x1.ebe708164c759p-45,
+ 0x1.fadedee5d40efp-46, -0x1.a0b2a08a465dcp-47, },
+};
diff --git a/pl/math/v_powf_data.c b/pl/math/v_powf_data.c
new file mode 100644
index 000000000000..ded211924b80
--- /dev/null
+++ b/pl/math/v_powf_data.c
@@ -0,0 +1,89 @@
+/*
+ * Coefficients for single-precision SVE pow(x) function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "math_config.h"
+
+const struct v_powf_data __v_powf_data = {
+ .invc = { 0x1.6489890582816p+0,
+ 0x1.5cf19b35e3472p+0,
+ 0x1.55aac0e956d65p+0,
+ 0x1.4eb0022977e01p+0,
+ 0x1.47fcccda1dd1fp+0,
+ 0x1.418ceabab68c1p+0,
+ 0x1.3b5c788f1edb3p+0,
+ 0x1.3567de48e9c9ap+0,
+ 0x1.2fabc80fd19bap+0,
+ 0x1.2a25200ce536bp+0,
+ 0x1.24d108e0152e3p+0,
+ 0x1.1facd8ab2fbe1p+0,
+ 0x1.1ab614a03efdfp+0,
+ 0x1.15ea6d03af9ffp+0,
+ 0x1.1147b994bb776p+0,
+ 0x1.0ccbf650593aap+0,
+ 0x1.0875408477302p+0,
+ 0x1.0441d42a93328p+0,
+ 0x1p+0,
+ 0x1.f1d006c855e86p-1,
+ 0x1.e28c3341aa301p-1,
+ 0x1.d4bdf9aa64747p-1,
+ 0x1.c7b45a24e5803p-1,
+ 0x1.bb5f5eb2ed60ap-1,
+ 0x1.afb0bff8fe6b4p-1,
+ 0x1.a49badf7ab1f5p-1,
+ 0x1.9a14a111fc4c9p-1,
+ 0x1.901131f5b2fdcp-1,
+ 0x1.8687f73f6d865p-1,
+ 0x1.7d7067eb77986p-1,
+ 0x1.74c2c1cf97b65p-1,
+ 0x1.6c77f37cff2a1p-1
+ },
+ .logc = { -0x1.e960f97b22702p+3,
+ -0x1.c993406cd4db6p+3,
+ -0x1.aa711d9a7d0f3p+3,
+ -0x1.8bf37bacdce9bp+3,
+ -0x1.6e13b3519946ep+3,
+ -0x1.50cb8281e4089p+3,
+ -0x1.341504a237e2bp+3,
+ -0x1.17eaab624ffbbp+3,
+ -0x1.f88e708f8c853p+2,
+ -0x1.c24b6da113914p+2,
+ -0x1.8d02ee397cb1dp+2,
+ -0x1.58ac1223408b3p+2,
+ -0x1.253e6fd190e89p+2,
+ -0x1.e5641882c12ffp+1,
+ -0x1.81fea712926f7p+1,
+ -0x1.203e240de64a3p+1,
+ -0x1.8029b86a78281p0,
+ -0x1.85d713190fb9p-1,
+ 0x0p+0,
+ 0x1.4c1cc07312997p0,
+ 0x1.5e1848ccec948p+1,
+ 0x1.04cfcb7f1196fp+2,
+ 0x1.582813d463c21p+2,
+ 0x1.a936fa68760ccp+2,
+ 0x1.f81bc31d6cc4ep+2,
+ 0x1.2279a09fae6b1p+3,
+ 0x1.47ec0b6df5526p+3,
+ 0x1.6c71762280f1p+3,
+ 0x1.90155070798dap+3,
+ 0x1.b2e23b1d3068cp+3,
+ 0x1.d4e21b0daa86ap+3,
+ 0x1.f61e2a2f67f3fp+3
+ },
+ .scale = { 0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f,
+ 0x3fef9301d0125b51, 0x3fef72b83c7d517b, 0x3fef54873168b9aa,
+ 0x3fef387a6e756238, 0x3fef1e9df51fdee1, 0x3fef06fe0a31b715,
+ 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d,
+ 0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429,
+ 0x3feea47eb03a5585, 0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74,
+ 0x3feea11473eb0187, 0x3feea589994cce13, 0x3feeace5422aa0db,
+ 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d,
+ 0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c,
+ 0x3fef3720dcef9069, 0x3fef5818dcfba487, 0x3fef7c97337b9b5f,
+ 0x3fefa4afa2a490da, 0x3fefd0765b6e4540,
+ },
+};
diff --git a/pl/math/v_sincos_3u5.c b/pl/math/v_sincos_3u5.c
new file mode 100644
index 000000000000..6fc014c120b8
--- /dev/null
+++ b/pl/math/v_sincos_3u5.c
@@ -0,0 +1,57 @@
+/*
+ * Double-precision vector sincos function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Define _GNU_SOURCE in order to include the sincos declaration. If building
+ pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
+ be linked against a scalar sincos, which special_case calls (not sincosf). */
+#define _GNU_SOURCE
+#include <math.h>
+#undef _GNU_SOURCE
+
+#include "v_math.h"
+#include "pl_test.h"
+#include "v_sincos_common.h"
+
+static void VPCS_ATTR NOINLINE
+special_case (float64x2_t x, uint64x2_t special, double *out_sin,
+ double *out_cos)
+{
+ if (special[0])
+ sincos (x[0], out_sin, out_cos);
+ if (special[1])
+ sincos (x[1], out_sin + 1, out_cos + 1);
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate polynomials.
+ Largest observed error is for sin, 3.22 ULP:
+ v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+ want -0x1.ffe9537d5dbb4p-3. */
+VPCS_ATTR void
+_ZGVnN2vl8l8_sincos (float64x2_t x, double *out_sin, double *out_cos)
+{
+ const struct v_sincos_data *d = ptr_barrier (&v_sincos_data);
+ uint64x2_t special = check_ge_rangeval (x, d);
+
+ float64x2x2_t sc = v_sincos_inline (x, d);
+
+ vst1q_f64 (out_sin, sc.val[0]);
+ vst1q_f64 (out_cos, sc.val[1]);
+
+ if (unlikely (v_any_u64 (special)))
+ special_case (x, special, out_sin, out_cos);
+}
+
+PL_TEST_ULP (_ZGVnN2v_sincos_sin, 2.73)
+PL_TEST_ULP (_ZGVnN2v_sincos_cos, 2.73)
+#define V_SINCOS_INTERVAL(lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVnN2v_sincos_sin, lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVnN2v_sincos_cos, lo, hi, n)
+V_SINCOS_INTERVAL (0, 0x1p23, 500000)
+V_SINCOS_INTERVAL (-0, -0x1p23, 500000)
+V_SINCOS_INTERVAL (0x1p23, inf, 10000)
+V_SINCOS_INTERVAL (-0x1p23, -inf, 10000)
diff --git a/pl/math/v_sincos_common.h b/pl/math/v_sincos_common.h
new file mode 100644
index 000000000000..ee7937e0785a
--- /dev/null
+++ b/pl/math/v_sincos_common.h
@@ -0,0 +1,86 @@
+/*
+ * Core approximation for double-precision vector sincos
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+
+static const struct v_sincos_data
+{
+ float64x2_t sin_poly[7], cos_poly[6], pio2[3];
+ float64x2_t inv_pio2, shift, range_val;
+} v_sincos_data = {
+ .inv_pio2 = V2 (0x1.45f306dc9c882p-1),
+ .pio2 = { V2 (0x1.921fb50000000p+0), V2 (0x1.110b460000000p-26),
+ V2 (0x1.1a62633145c07p-54) },
+ .shift = V2 (0x1.8p52),
+ .sin_poly = { /* Computed using Remez in [-pi/2, pi/2]. */
+ V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+ V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+ V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+ V2 (-0x1.9e9540300a1p-41) },
+ .cos_poly = { /* Computed using Remez in [-pi/4, pi/4]. */
+ V2 (0x1.555555555554cp-5), V2 (-0x1.6c16c16c1521fp-10),
+ V2 (0x1.a01a019cbf62ap-16), V2 (-0x1.27e4f812b681ep-22),
+ V2 (0x1.1ee9f152a57cdp-29), V2 (-0x1.8fb131098404bp-37) },
+ .range_val = V2 (0x1p23), };
+
+static inline uint64x2_t
+check_ge_rangeval (float64x2_t x, const struct v_sincos_data *d)
+{
+ return vcagtq_f64 (x, d->range_val);
+}
+
+/* Double-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate polynomials.
+ Largest observed error is for sin, 3.22 ULP:
+ v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
+ want -0x1.ffe9537d5dbb4p-3. */
+static inline float64x2x2_t
+v_sincos_inline (float64x2_t x, const struct v_sincos_data *d)
+{
+ /* q = nearest integer to 2 * x / pi. */
+ float64x2_t q = vsubq_f64 (vfmaq_f64 (d->shift, x, d->inv_pio2), d->shift);
+ int64x2_t n = vcvtq_s64_f64 (q);
+
+ /* Use q to reduce x to r in [-pi/4, pi/4], by:
+ r = x - q * pi/2, in extended precision. */
+ float64x2_t r = x;
+ r = vfmsq_f64 (r, q, d->pio2[0]);
+ r = vfmsq_f64 (r, q, d->pio2[1]);
+ r = vfmsq_f64 (r, q, d->pio2[2]);
+
+ float64x2_t r2 = r * r, r3 = r2 * r, r4 = r2 * r2;
+
+ /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */
+ float64x2_t s = v_pw_horner_6_f64 (r2, r4, d->sin_poly);
+ s = vfmaq_f64 (r, r3, s);
+
+ /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */
+ float64x2_t c = v_pw_horner_5_f64 (r2, r4, d->cos_poly);
+ c = vfmaq_f64 (v_f64 (-0.5), r2, c);
+ c = vfmaq_f64 (v_f64 (1), r2, c);
+
+ /* If odd quadrant, swap cos and sin. */
+ uint64x2_t swap = vtstq_s64 (n, v_s64 (1));
+ float64x2_t ss = vbslq_f64 (swap, c, s);
+ float64x2_t cc = vbslq_f64 (swap, s, c);
+
+ /* Fix signs according to quadrant (XOR flips the sign bit of each result).
+ ss = asdouble(asuint64(ss) ^ ((n & 2) << 62))
+ cc = asdouble(asuint64(cc) ^ (((n + 1) & 2) << 62)). */
+ uint64x2_t sin_sign
+ = vshlq_n_u64 (vandq_u64 (vreinterpretq_u64_s64 (n), v_u64 (2)), 62);
+ uint64x2_t cos_sign = vshlq_n_u64 (
+ vandq_u64 (vreinterpretq_u64_s64 (vaddq_s64 (n, v_s64 (1))), v_u64 (2)),
+ 62);
+ ss = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (ss), sin_sign));
+ cc = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (cc), cos_sign));
+
+ return (float64x2x2_t){ ss, cc };
+}
diff --git a/pl/math/v_sincosf_1u8.c b/pl/math/v_sincosf_1u8.c
new file mode 100644
index 000000000000..bf77afaa14db
--- /dev/null
+++ b/pl/math/v_sincosf_1u8.c
@@ -0,0 +1,58 @@
+/*
+ * Single-precision vector sincos function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+/* Define _GNU_SOURCE in order to include sincosf declaration. If building
+ pre-GLIBC 2.1, or on a non-GNU conforming system, this routine will need to
+ be linked against the scalar sincosf from math/. */
+#define _GNU_SOURCE
+#include <math.h>
+#undef _GNU_SOURCE
+
+#include "v_sincosf_common.h"
+#include "v_math.h"
+#include "pl_test.h"
+
+static void VPCS_ATTR NOINLINE
+special_case (float32x4_t x, uint32x4_t special, float *out_sin,
+ float *out_cos)
+{
+ for (int i = 0; i < 4; i++)
+ if (special[i])
+ sincosf (x[i], out_sin + i, out_cos + i);
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate low-order
+ polynomials.
+ Worst-case error for sin is 1.67 ULP:
+ v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+ Worst-case error for cos is 1.81 ULP:
+ v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */
+VPCS_ATTR void
+_ZGVnN4vl4l4_sincosf (float32x4_t x, float *out_sin, float *out_cos)
+{
+ const struct v_sincosf_data *d = ptr_barrier (&v_sincosf_data);
+ uint32x4_t special = check_ge_rangeval (x, d);
+
+ float32x4x2_t sc = v_sincosf_inline (x, d);
+
+ vst1q_f32 (out_sin, sc.val[0]);
+ vst1q_f32 (out_cos, sc.val[1]);
+
+ if (unlikely (v_any_u32 (special)))
+ special_case (x, special, out_sin, out_cos);
+}
+
+PL_TEST_ULP (_ZGVnN4v_sincosf_sin, 1.17)
+PL_TEST_ULP (_ZGVnN4v_sincosf_cos, 1.31)
+#define V_SINCOSF_INTERVAL(lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVnN4v_sincosf_sin, lo, hi, n) \
+ PL_TEST_INTERVAL (_ZGVnN4v_sincosf_cos, lo, hi, n)
+V_SINCOSF_INTERVAL (0, 0x1p20, 500000)
+V_SINCOSF_INTERVAL (-0, -0x1p20, 500000)
+V_SINCOSF_INTERVAL (0x1p20, inf, 10000)
+V_SINCOSF_INTERVAL (-0x1p20, -inf, 10000)
diff --git a/pl/math/v_sincosf_common.h b/pl/math/v_sincosf_common.h
new file mode 100644
index 000000000000..8239bd9f0176
--- /dev/null
+++ b/pl/math/v_sincosf_common.h
@@ -0,0 +1,84 @@
+/*
+ * Core approximation for single-precision vector sincos
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "v_math.h"
+
+static const struct v_sincosf_data /* Constants for v_sincosf_inline. */
+{
+ float32x4_t poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val;
+} v_sincosf_data = {
+ .poly_sin = { /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4]. */
+ V4 (-0x1.555546p-3), V4 (0x1.11076p-7), V4 (-0x1.994eb4p-13) },
+ .poly_cos = { /* Generated using Remez, even coeffs only, in [-pi/4, pi/4]. */
+ V4 (0x1.55554ap-5), V4 (-0x1.6c0c1ap-10), V4 (0x1.99e0eep-16) },
+ .pio2 = { V4 (0x1.921fb6p+0f), V4 (-0x1.777a5cp-25f), V4 (-0x1.ee59dap-50f) },
+ .inv_pio2 = V4 (0x1.45f306p-1f), /* ~2/pi; pio2[] is pi/2 in 3 parts. */
+ .shift = V4 (0x1.8p23), /* Added then subtracted to round to an integer. */
+ .range_val = V4 (0x1p20), /* Lanes with |x| above this are flagged special. */
+};
+
+static inline uint32x4_t
+check_ge_rangeval (float32x4_t x, const struct v_sincosf_data *d)
+{
+ return vcagtq_f32 (x, d->range_val);
+}
+
+/* Single-precision vector function allowing calculation of both sin and cos in
+ one function call, using shared argument reduction and separate low-order
+ polynomials.
+ Worst-case error for sin is 1.67 ULP:
+ v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5
+ Worst-case error for cos is 1.81 ULP:
+ v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */
+static inline float32x4x2_t
+v_sincosf_inline (float32x4_t x, const struct v_sincosf_data *d)
+{
+ /* n = rint ( x / (pi/2) ). */
+ float32x4_t shift = d->shift;
+ float32x4_t q = vfmaq_f32 (shift, x, d->inv_pio2);
+ q = vsubq_f32 (q, shift);
+ int32x4_t n = vcvtq_s32_f32 (q);
+
+ /* Reduce x such that r is in [ -pi/4, pi/4 ]. */
+ float32x4_t r = x;
+ r = vfmsq_f32 (r, q, d->pio2[0]);
+ r = vfmsq_f32 (r, q, d->pio2[1]);
+ r = vfmsq_f32 (r, q, d->pio2[2]);
+
+ /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */
+ float32x4_t r2 = vmulq_f32 (r, r), r3 = vmulq_f32 (r, r2);
+ float32x4_t s = vfmaq_f32 (d->poly_sin[1], r2, d->poly_sin[2]);
+ s = vfmaq_f32 (d->poly_sin[0], r2, s);
+ s = vfmaq_f32 (r, r3, s);
+
+ /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */
+ float32x4_t r4 = vmulq_f32 (r2, r2);
+ float32x4_t p = vfmaq_f32 (d->poly_cos[1], r2, d->poly_cos[2]);
+ float32x4_t c = vfmaq_f32 (v_f32 (-0.5), r2, d->poly_cos[0]);
+ c = vfmaq_f32 (c, r4, p);
+ c = vfmaq_f32 (v_f32 (1), c, r2);
+
+ /* If odd quadrant, swap cos and sin. */
+ uint32x4_t swap = vtstq_u32 (vreinterpretq_u32_s32 (n), v_u32 (1));
+ float32x4_t ss = vbslq_f32 (swap, c, s);
+ float32x4_t cc = vbslq_f32 (swap, s, c);
+
+ /* Fix signs according to quadrant (XOR flips the sign bit of each result).
+ ss = asfloat(asuint(ss) ^ ((n & 2) << 30))
+ cc = asfloat(asuint(cc) ^ (((n + 1) & 2) << 30)). */
+ uint32x4_t sin_sign
+ = vshlq_n_u32 (vandq_u32 (vreinterpretq_u32_s32 (n), v_u32 (2)), 30);
+ uint32x4_t cos_sign = vshlq_n_u32 (
+ vandq_u32 (vreinterpretq_u32_s32 (vaddq_s32 (n, v_s32 (1))), v_u32 (2)),
+ 30);
+ ss = vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ss), sin_sign));
+ cc = vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (cc), cos_sign));
+
+ return (float32x4x2_t){ ss, cc };
+}
diff --git a/pl/math/v_sinh_3u.c b/pl/math/v_sinh_3u.c
index 57ec66ecc282..a644f54b4a0f 100644
--- a/pl/math/v_sinh_3u.c
+++ b/pl/math/v_sinh_3u.c
@@ -6,47 +6,73 @@
*/
#include "v_math.h"
-#include "estrin.h"
+#include "poly_advsimd_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
-#define AbsMask 0x7fffffffffffffff
-#define Half 0x3fe0000000000000
-#define BigBound \
- 0x4080000000000000 /* 2^9. expm1 helper overflows for large input. */
-#define TinyBound \
- 0x3e50000000000000 /* 2^-26, below which sinh(x) rounds to x. */
-#define InvLn2 v_f64 (0x1.71547652b82fep0)
-#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1)
-#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56)
-#define Shift v_f64 (0x1.8p52)
-#define One 0x3ff0000000000000
-#define C(i) v_f64 (__expm1_poly[i])
+static const struct data
+{
+ float64x2_t poly[11];
+ float64x2_t inv_ln2, m_ln2, shift;
+ uint64x2_t halff;
+ int64x2_t onef;
+#if WANT_SIMD_EXCEPT
+ uint64x2_t tiny_bound, thresh;
+#else
+ uint64x2_t large_bound;
+#endif
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
+ V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
+ V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
+ V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
+ V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
-#if V_SUPPORTED
+ .inv_ln2 = V2 (0x1.71547652b82fep0),
+ .m_ln2 = (float64x2_t) {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56},
+ .shift = V2 (0x1.8p52),
-static inline v_f64_t
-expm1_inline (v_f64_t x)
+ .halff = V2 (0x3fe0000000000000),
+ .onef = V2 (0x3ff0000000000000),
+#if WANT_SIMD_EXCEPT
+ /* 2^-26, below which sinh(x) rounds to x. */
+ .tiny_bound = V2 (0x3e50000000000000),
+ /* asuint(large_bound) - asuint(tiny_bound). */
+ .thresh = V2 (0x0230000000000000),
+#else
+/* 2^9. expm1 helper overflows for large input. */
+ .large_bound = V2 (0x4080000000000000),
+#endif
+};
+
+static inline float64x2_t
+expm1_inline (float64x2_t x)
{
+ const struct data *d = ptr_barrier (&data);
+
/* Reduce argument:
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
where i = round(x / ln2)
and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
- v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift;
- v_s64_t i = v_to_s64_f64 (j);
- v_f64_t f = v_fma_f64 (j, MLn2hi, x);
- f = v_fma_f64 (j, MLn2lo, f);
+ float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
+ int64x2_t i = vcvtq_s64_f64 (j);
+ float64x2_t f = vfmaq_laneq_f64 (x, j, d->m_ln2, 0);
+ f = vfmaq_laneq_f64 (f, j, d->m_ln2, 1);
/* Approximate expm1(f) using polynomial. */
- v_f64_t f2 = f * f, f4 = f2 * f2, f8 = f4 * f4;
- v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f8, C), f);
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t f8 = vmulq_f64 (f4, f4);
+ float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly));
/* t = 2^i. */
- v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One);
+ float64x2_t t = vreinterpretq_f64_u64 (
+ vreinterpretq_u64_s64 (vaddq_s64 (vshlq_n_s64 (i, 52), d->onef)));
/* expm1(x) ~= p * t + (t - 1). */
- return v_fma_f64 (p, t, t - 1);
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t);
}
-static NOINLINE VPCS_ATTR v_f64_t
-special_case (v_f64_t x)
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x)
{
return v_call_f64 (sinh, x, x, v_u64 (-1));
}
@@ -54,20 +80,22 @@ special_case (v_f64_t x)
/* Approximation for vector double-precision sinh(x) using expm1.
sinh(x) = (exp(x) - exp(-x)) / 2.
The greatest observed error is 2.57 ULP:
- sinh(0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
- want 0x1.ab34e59d678d9p-2. */
-VPCS_ATTR v_f64_t V_NAME (sinh) (v_f64_t x)
+ _ZGVnN2v_sinh (0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2
+ want 0x1.ab34e59d678d9p-2. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x)
{
- v_u64_t ix = v_as_u64_f64 (x);
- v_u64_t iax = ix & AbsMask;
- v_f64_t ax = v_as_f64_u64 (iax);
- v_u64_t sign = ix & ~AbsMask;
- v_f64_t halfsign = v_as_f64_u64 (sign | Half);
+ const struct data *d = ptr_barrier (&data);
+
+ float64x2_t ax = vabsq_f64 (x);
+ uint64x2_t sign
+ = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax));
+ float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->halff));
#if WANT_SIMD_EXCEPT
- v_u64_t special = v_cond_u64 ((iax - TinyBound) >= (BigBound - TinyBound));
+ uint64x2_t special = vcgeq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh);
#else
- v_u64_t special = v_cond_u64 (iax >= BigBound);
+ uint64x2_t special = vcgeq_u64 (vreinterpretq_u64_f64 (ax), d->large_bound);
#endif
/* Fall back to scalar variant for all lanes if any of them are special. */
@@ -77,18 +105,14 @@ VPCS_ATTR v_f64_t V_NAME (sinh) (v_f64_t x)
/* Up to the point that expm1 overflows, we can use it to calculate sinh
using a slight rearrangement of the definition of sinh. This allows us to
retain acceptable accuracy for very small inputs. */
- v_f64_t t = expm1_inline (ax);
- return (t + t / (t + 1)) * halfsign;
+ float64x2_t t = expm1_inline (ax);
+ t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0))));
+ return vmulq_f64 (t, halfsign);
}
-VPCS_ALIAS
PL_SIG (V, D, 1, sinh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME (sinh), 2.08)
-PL_TEST_EXPECT_FENV (V_NAME (sinh), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (sinh), 0, TinyBound, 1000)
-PL_TEST_INTERVAL (V_NAME (sinh), -0, -TinyBound, 1000)
-PL_TEST_INTERVAL (V_NAME (sinh), TinyBound, BigBound, 500000)
-PL_TEST_INTERVAL (V_NAME (sinh), -TinyBound, -BigBound, 500000)
-PL_TEST_INTERVAL (V_NAME (sinh), BigBound, inf, 1000)
-PL_TEST_INTERVAL (V_NAME (sinh), -BigBound, -inf, 1000)
-#endif
+PL_TEST_ULP (V_NAME_D1 (sinh), 2.08)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (sinh), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0, 0x1p-26, 1000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p-26, 0x1p9, 500000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinh), 0x1p9, inf, 1000)
diff --git a/pl/math/v_sinhf_2u3.c b/pl/math/v_sinhf_2u3.c
index 49cf078d0651..cd8c0f08f784 100644
--- a/pl/math/v_sinhf_2u3.c
+++ b/pl/math/v_sinhf_2u3.c
@@ -9,61 +9,76 @@
#include "pl_sig.h"
#include "pl_test.h"
-#if V_SUPPORTED
-
#include "v_expm1f_inline.h"
-#define AbsMask 0x7fffffff
-#define Half 0x3f000000
-#define BigBound \
- 0x42b0c0a7 /* 0x1.61814ep+6, above which expm1f helper overflows. */
-#define TinyBound \
- 0x2fb504f4 /* 0x1.6a09e8p-32, below which expm1f underflows. */
+static const struct data
+{
+ struct v_expm1f_data expm1f_consts;
+ uint32x4_t halff;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t tiny_bound, thresh;
+#else
+ uint32x4_t oflow_bound;
+#endif
+} data = {
+ .expm1f_consts = V_EXPM1F_DATA,
+ .halff = V4 (0x3f000000),
+#if WANT_SIMD_EXCEPT
+ /* 0x1.6a09e8p-32, below which expm1f underflows. */
+ .tiny_bound = V4 (0x2fb504f4),
+ /* asuint(oflow_bound) - asuint(tiny_bound). */
+ .thresh = V4 (0x12fbbbb3),
+#else
+ /* 0x1.61814ep+6, above which expm1f helper overflows. */
+ .oflow_bound = V4 (0x42b0c0a7),
+#endif
+};
-static NOINLINE VPCS_ATTR v_f32_t
-special_case (v_f32_t x)
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
{
- return v_call_f32 (sinhf, x, x, v_u32 (-1));
+ return v_call_f32 (sinhf, x, y, special);
}
/* Approximation for vector single-precision sinh(x) using expm1.
sinh(x) = (exp(x) - exp(-x)) / 2.
The maximum error is 2.26 ULP:
- __v_sinhf(0x1.e34a9ep-4) got 0x1.e469ep-4 want 0x1.e469e4p-4. */
-VPCS_ATTR v_f32_t V_NAME (sinhf) (v_f32_t x)
+ _ZGVnN4v_sinhf (0x1.e34a9ep-4) got 0x1.e469ep-4
+ want 0x1.e469e4p-4. */
+float32x4_t VPCS_ATTR V_NAME_F1 (sinh) (float32x4_t x)
{
- v_u32_t ix = v_as_u32_f32 (x);
- v_u32_t iax = ix & AbsMask;
- v_f32_t ax = v_as_f32_u32 (iax);
- v_u32_t sign = ix & ~AbsMask;
- v_f32_t halfsign = v_as_f32_u32 (sign | Half);
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t sign = veorq_u32 (ix, iax);
+ float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
#if WANT_SIMD_EXCEPT
- v_u32_t special = v_cond_u32 ((iax - TinyBound) >= (BigBound - TinyBound));
+ uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+ ax = v_zerofy_f32 (ax, special);
#else
- v_u32_t special = v_cond_u32 (iax >= BigBound);
+ uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
#endif
- /* Fall back to the scalar variant for all lanes if any of them should trigger
- an exception. */
+ /* Up to the point that expm1f overflows, we can use it to calculate sinhf
+ using a slight rearrangement of the definition of sinh. This allows us
+ to retain acceptable accuracy for very small inputs. */
+ float32x4_t t = expm1f_inline (ax, &d->expm1f_consts);
+ t = vaddq_f32 (t, vdivq_f32 (t, vaddq_f32 (t, v_f32 (1.0))));
+
+ /* Fall back to the scalar variant for any lanes that should trigger an
+ exception. */
if (unlikely (v_any_u32 (special)))
- return special_case (x);
+ return special_case (x, vmulq_f32 (t, halfsign), special);
- /* Up to the point that expm1f overflows, we can use it to calculate sinhf
- using a slight rearrangement of the definition of asinh. This allows us to
- retain acceptable accuracy for very small inputs. */
- v_f32_t t = expm1f_inline (ax);
- return (t + t / (t + 1)) * halfsign;
+ return vmulq_f32 (t, halfsign);
}
-VPCS_ALIAS
PL_SIG (V, F, 1, sinh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME (sinhf), 1.76)
-PL_TEST_EXPECT_FENV (V_NAME (sinhf), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (sinhf), 0, TinyBound, 1000)
-PL_TEST_INTERVAL (V_NAME (sinhf), -0, -TinyBound, 1000)
-PL_TEST_INTERVAL (V_NAME (sinhf), TinyBound, BigBound, 100000)
-PL_TEST_INTERVAL (V_NAME (sinhf), -TinyBound, -BigBound, 100000)
-PL_TEST_INTERVAL (V_NAME (sinhf), BigBound, inf, 1000)
-PL_TEST_INTERVAL (V_NAME (sinhf), -BigBound, -inf, 1000)
-#endif
+PL_TEST_ULP (V_NAME_F1 (sinh), 1.76)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (sinh), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0, 0x2fb504f4, 1000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x2fb504f4, 0x42b0c0a7, 100000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinh), 0x42b0c0a7, inf, 1000)
diff --git a/pl/math/v_sinpi_3u1.c b/pl/math/v_sinpi_3u1.c
new file mode 100644
index 000000000000..8d2917ff8ecd
--- /dev/null
+++ b/pl/math/v_sinpi_3u1.c
@@ -0,0 +1,86 @@
+/*
+ * Double-precision vector sinpi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "poly_advsimd_f64.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float64x2_t poly[10];
+} data = {
+ /* Polynomial coefficients generated using Remez algorithm,
+ see sinpi.sollya for details. */
+ .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2),
+ V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1),
+ V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8),
+ V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16),
+ V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) },
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u64 (0x3bf0000000000000) /* asuint64(0x1p-64). */
+/* asuint64(0x1p63) - TinyBound. */
+# define Thresh v_u64 (0x07f0000000000000)
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+ return v_call_f64 (sinpi, x, y, cmp);
+}
+#endif
+
+/* Approximation for vector double-precision sinpi(x).
+ Maximum Error 3.05 ULP:
+ _ZGVnN2v_sinpi(0x1.d32750db30b4ap-2) got 0x1.fb295878301c7p-1
+ want 0x1.fb295878301cap-1. */
+float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ uint64x2_t cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
+
+ /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
+ to avoid them under/overflowing and throwing exceptions. */
+ float64x2_t r = v_zerofy_f64 (x, cmp);
+#else
+ float64x2_t r = x;
+#endif
+
+ /* If r is odd, the sign of the result should be inverted. */
+ uint64x2_t odd
+ = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63);
+
+ /* r = x - round(x). Range reduction to -1/2 .. 1/2. */
+ r = vsubq_f64 (r, vrndaq_f64 (r));
+
+ /* y = sin(pi * r). */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t r4 = vmulq_f64 (r2, r2);
+ float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u64 (cmp)))
+ return special_case (x, y, odd, cmp);
+#endif
+
+ return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
+
+PL_SIG (V, D, 1, sinpi, -0.9, 0.9)
+PL_TEST_ULP (V_NAME_D1 (sinpi), 3.06)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (sinpi), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0, 0x1p-63, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p-63, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0.5, 0x1p51, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (sinpi), 0x1p51, inf, 10000)
diff --git a/pl/math/v_sinpif_3u.c b/pl/math/v_sinpif_3u.c
new file mode 100644
index 000000000000..3d6eeff333f7
--- /dev/null
+++ b/pl/math/v_sinpif_3u.c
@@ -0,0 +1,81 @@
+/*
+ * Single-precision vector sinpi function.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "mathlib.h"
+#include "v_math.h"
+#include "poly_advsimd_f32.h"
+#include "pl_sig.h"
+#include "pl_test.h"
+
+static const struct data
+{
+ float32x4_t poly[6];
+} data = {
+ /* Taylor series coefficients for sin(pi * x). */
+ .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f),
+ V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) },
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u32 (0x30000000) /* asuint32(0x1p-31f). */
+# define Thresh v_u32 (0x1f000000) /* asuint32(0x1p31f) - TinyBound. */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+ /* Fall back to scalar code. */
+ y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+ return v_call_f32 (sinpif, x, y, cmp);
+}
+#endif
+
+/* Approximation for vector single-precision sinpi(x)
+ Maximum Error 3.03 ULP:
+ _ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1
+ want 0x1.f7cd5p-1. */
+float32x4_t VPCS_ATTR V_NAME_F1 (sinpi) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+#if WANT_SIMD_EXCEPT
+ uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
+
+ /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0
+ to avoid them under/overflowing and throwing exceptions. */
+ float32x4_t r = v_zerofy_f32 (x, cmp);
+#else
+ float32x4_t r = x;
+#endif
+
+ /* If r is odd, the sign of the result should be inverted. */
+ uint32x4_t odd
+ = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31);
+
+ /* r = x - round(x). Range reduction to -1/2 .. 1/2. */
+ r = vsubq_f32 (r, vrndaq_f32 (r));
+
+ /* Pairwise Horner approximation for y = sin(r * pi). */
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t r4 = vmulq_f32 (r2, r2);
+ float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r);
+
+#if WANT_SIMD_EXCEPT
+ if (unlikely (v_any_u32 (cmp)))
+ return special_case (x, y, odd, cmp);
+#endif
+
+ return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
+
+PL_SIG (V, F, 1, sinpi, -0.9, 0.9)
+PL_TEST_ULP (V_NAME_F1 (sinpi), 2.54)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (sinpi), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0, 0x1p-31, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p-31, 0.5, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0.5, 0x1p31f, 10000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (sinpi), 0x1p31f, inf, 10000)
diff --git a/pl/math/v_tan_3u5.c b/pl/math/v_tan_3u5.c
index f87baccc4fd7..c431c8c4889e 100644
--- a/pl/math/v_tan_3u5.c
+++ b/pl/math/v_tan_3u5.c
@@ -6,62 +6,76 @@
*/
#include "v_math.h"
-#include "estrin.h"
+#include "poly_advsimd_f64.h"
#include "pl_sig.h"
#include "pl_test.h"
-#if V_SUPPORTED
+static const struct data
+{
+ float64x2_t poly[9];
+ float64x2_t half_pi, two_over_pi, shift;
+#if !WANT_SIMD_EXCEPT
+ float64x2_t range_val;
+#endif
+} data = {
+ /* Coefficients generated using FPMinimax. */
+ .poly = { V2 (0x1.5555555555556p-2), V2 (0x1.1111111110a63p-3),
+ V2 (0x1.ba1ba1bb46414p-5), V2 (0x1.664f47e5b5445p-6),
+ V2 (0x1.226e5e5ecdfa3p-7), V2 (0x1.d6c7ddbf87047p-9),
+ V2 (0x1.7ea75d05b583ep-10), V2 (0x1.289f22964a03cp-11),
+ V2 (0x1.4e4fd14147622p-12) },
+ .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
+ .two_over_pi = V2 (0x1.45f306dc9c883p-1),
+ .shift = V2 (0x1.8p52),
+#if !WANT_SIMD_EXCEPT
+ .range_val = V2 (0x1p23),
+#endif
+};
-#define MHalfPiHi v_f64 (__v_tan_data.neg_half_pi_hi)
-#define MHalfPiLo v_f64 (__v_tan_data.neg_half_pi_lo)
-#define TwoOverPi v_f64 (0x1.45f306dc9c883p-1)
-#define Shift v_f64 (0x1.8p52)
-#define AbsMask 0x7fffffffffffffff
-#define RangeVal 0x4160000000000000 /* asuint64(2^23). */
+#define RangeVal 0x4160000000000000 /* asuint64(0x1p23). */
#define TinyBound 0x3e50000000000000 /* asuint64(2^-26). */
-#define C(i) v_f64 (__v_tan_data.poly[i])
+#define Thresh 0x310000000000000 /* RangeVal - TinyBound. */
/* Special cases (fall back to scalar calls). */
-VPCS_ATTR
-NOINLINE static v_f64_t
-specialcase (v_f64_t x)
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x)
{
return v_call_f64 (tan, x, x, v_u64 (-1));
}
/* Vector approximation for double-precision tan.
Maximum measured error is 3.48 ULP:
- __v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
- want -0x1.f6ccd8ecf7deap+37. */
-VPCS_ATTR
-v_f64_t V_NAME (tan) (v_f64_t x)
+ _ZGVnN2v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
+ want -0x1.f6ccd8ecf7deap+37. */
+float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
{
- v_u64_t iax = v_as_u64_f64 (x) & AbsMask;
-
- /* Our argument reduction cannot calculate q with sufficient accuracy for very
- large inputs. Fall back to scalar routine for all lanes if any are too
- large, or Inf/NaN. If fenv exceptions are expected, also fall back for tiny
- input to avoid underflow. Note pl does not supply a scalar double-precision
- tan, so the fallback will be statically linked from the system libm. */
+ const struct data *dat = ptr_barrier (&data);
+ /* Our argument reduction cannot calculate q with sufficient accuracy for
+ very large inputs. Fall back to scalar routine for all lanes if any are
+ too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
+ tiny input to avoid underflow. */
#if WANT_SIMD_EXCEPT
- if (unlikely (v_any_u64 (iax - TinyBound > RangeVal - TinyBound)))
-#else
- if (unlikely (v_any_u64 (iax > RangeVal)))
+ uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ /* iax - tiny_bound > range_val - tiny_bound. */
+ uint64x2_t special
+ = vcgtq_u64 (vsubq_u64 (iax, v_u64 (TinyBound)), v_u64 (Thresh));
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x);
#endif
- return specialcase (x);
/* q = nearest integer to 2 * x / pi. */
- v_f64_t q = v_fma_f64 (x, TwoOverPi, Shift) - Shift;
- v_s64_t qi = v_to_s64_f64 (q);
+ float64x2_t q
+ = vsubq_f64 (vfmaq_f64 (dat->shift, x, dat->two_over_pi), dat->shift);
+ int64x2_t qi = vcvtq_s64_f64 (q);
/* Use q to reduce x to r in [-pi/4, pi/4], by:
r = x - q * pi/2, in extended precision. */
- v_f64_t r = x;
- r = v_fma_f64 (q, MHalfPiHi, r);
- r = v_fma_f64 (q, MHalfPiLo, r);
+ float64x2_t r = x;
+ r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
+ r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
/* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
formula. */
- r = r * 0.5;
+ r = vmulq_n_f64 (r, 0.5);
/* Approximate tan(r) using order 8 polynomial.
tan(x) is odd, so polynomial has the form:
@@ -69,34 +83,38 @@ v_f64_t V_NAME (tan) (v_f64_t x)
Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ...
Then compute the approximation by:
tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */
- v_f64_t r2 = r * r, r4 = r2 * r2, r8 = r4 * r4;
- /* Use offset version of Estrin wrapper to evaluate from C1 onwards. */
- v_f64_t p = ESTRIN_7_ (r2, r4, r8, C, 1);
- p = v_fma_f64 (p, r2, C (0));
- p = v_fma_f64 (r2, p * r, r);
+ float64x2_t r2 = vmulq_f64 (r, r), r4 = vmulq_f64 (r2, r2),
+ r8 = vmulq_f64 (r4, r4);
+ /* Offset coefficients to evaluate from C1 onwards. */
+ float64x2_t p = v_estrin_7_f64 (r2, r4, r8, dat->poly + 1);
+ p = vfmaq_f64 (dat->poly[0], p, r2);
+ p = vfmaq_f64 (r, r2, vmulq_f64 (p, r));
/* Recombination uses double-angle formula:
tan(2x) = 2 * tan(x) / (1 - (tan(x))^2)
and reciprocity around pi/2:
tan(x) = 1 / (tan(pi/2 - x))
to assemble result using change-of-sign and conditional selection of
- numerator/denominator, dependent on odd/even-ness of q (hence quadrant). */
- v_f64_t n = v_fma_f64 (p, p, v_f64 (-1));
- v_f64_t d = p * 2;
+ numerator/denominator, dependent on odd/even-ness of q (hence quadrant).
+ */
+ float64x2_t n = vfmaq_f64 (v_f64 (-1), p, p);
+ float64x2_t d = vaddq_f64 (p, p);
- v_u64_t use_recip = v_cond_u64 ((v_as_u64_s64 (qi) & 1) == 0);
+ uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
- return v_sel_f64 (use_recip, -d, n) / v_sel_f64 (use_recip, n, d);
+#if !WANT_SIMD_EXCEPT
+ uint64x2_t special = vcageq_f64 (x, dat->range_val);
+ if (unlikely (v_any_u64 (special)))
+ return special_case (x);
+#endif
+
+ return vdivq_f64 (vbslq_f64 (no_recip, n, vnegq_f64 (d)),
+ vbslq_f64 (no_recip, d, n));
}
-VPCS_ALIAS
PL_SIG (V, D, 1, tan, -3.1, 3.1)
-PL_TEST_ULP (V_NAME (tan), 2.99)
-PL_TEST_EXPECT_FENV (V_NAME (tan), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (tan), 0, TinyBound, 5000)
-PL_TEST_INTERVAL (V_NAME (tan), TinyBound, RangeVal, 100000)
-PL_TEST_INTERVAL (V_NAME (tan), RangeVal, inf, 5000)
-PL_TEST_INTERVAL (V_NAME (tan), -0, -TinyBound, 5000)
-PL_TEST_INTERVAL (V_NAME (tan), -TinyBound, -RangeVal, 100000)
-PL_TEST_INTERVAL (V_NAME (tan), -RangeVal, -inf, 5000)
-#endif
+PL_TEST_ULP (V_NAME_D1 (tan), 2.99)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (tan), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), 0, TinyBound, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), TinyBound, RangeVal, 100000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (tan), RangeVal, inf, 5000)
diff --git a/pl/math/v_tan_data.c b/pl/math/v_tan_data.c
deleted file mode 100644
index 04e25169bd88..000000000000
--- a/pl/math/v_tan_data.c
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Coefficients and helpers for double-precision vector tan(x) function.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "math_config.h"
-
-const struct v_tan_data __v_tan_data
- = {.neg_half_pi_hi = -0x1.921fb54442d18p0,
- .neg_half_pi_lo = -0x1.1a62633145c07p-54,
- .poly
- = {0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5,
- 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9,
- 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11, 0x1.4e4fd14147622p-12}};
diff --git a/pl/math/v_tanf_3u5.c b/pl/math/v_tanf_3u5.c
index 828466b03182..98948b0a9ecf 100644
--- a/pl/math/v_tanf_3u5.c
+++ b/pl/math/v_tanf_3u5.c
@@ -6,87 +6,95 @@
*/
#include "v_math.h"
-#include "estrinf.h"
+#include "poly_advsimd_f32.h"
#include "pl_sig.h"
#include "pl_test.h"
-#if V_SUPPORTED
-
-/* Constants. */
-#define NegPio2_1 (v_f32 (-0x1.921fb6p+0f))
-#define NegPio2_2 (v_f32 (0x1.777a5cp-25f))
-#define NegPio2_3 (v_f32 (0x1.ee59dap-50f))
-#define InvPio2 (v_f32 (0x1.45f306p-1f))
-#define RangeVal (0x47000000) /* asuint32(0x1p15f). */
-#define TinyBound (0x30000000) /* asuint32 (0x1p-31). */
-#define Shift (v_f32 (0x1.8p+23f))
-#define AbsMask (v_u32 (0x7fffffff))
+static const struct data
+{
+ float32x4_t poly[6];
+ float32x4_t pi_consts;
+ float32x4_t shift;
+#if !WANT_SIMD_EXCEPT
+ float32x4_t range_val;
+#endif
+} data = {
+ /* Coefficients generated using FPMinimax. */
+ .poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f),
+ V4 (0x1.7b5756p-6f), V4 (0x1.4ef4cep-8f), V4 (0x1.0e1e74p-7f) },
+ /* Stores constants: (-pi/2)_high, (-pi/2)_mid, (-pi/2)_low, and 2/pi. */
+ .pi_consts
+ = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f },
+ .shift = V4 (0x1.8p+23f),
+#if !WANT_SIMD_EXCEPT
+ .range_val = V4 (0x1p15f),
+#endif
+};
-#define poly(i) v_f32 (__tanf_poly_data.poly_tan[i])
+#define RangeVal v_u32 (0x47000000) /* asuint32(0x1p15f). */
+#define TinyBound v_u32 (0x30000000) /* asuint32 (0x1p-31f). */
+#define Thresh v_u32 (0x16000000) /* asuint32(RangeVal) - TinyBound. */
/* Special cases (fall back to scalar calls). */
-VPCS_ATTR
-NOINLINE static v_f32_t
-specialcase (v_f32_t x, v_f32_t y, v_u32_t cmp)
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
{
return v_call_f32 (tanf, x, y, cmp);
}
/* Use a full Estrin scheme to evaluate polynomial. */
-static inline v_f32_t
-eval_poly (v_f32_t z)
+static inline float32x4_t
+eval_poly (float32x4_t z, const struct data *d)
{
- v_f32_t z2 = z * z;
+ float32x4_t z2 = vmulq_f32 (z, z);
#if WANT_SIMD_EXCEPT
- /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions
- are to be triggered correctly, sidestep this by fixing such lanes to 0. */
- v_u32_t will_uflow = v_cond_u32 ((v_as_u32_f32 (z) & AbsMask) <= TinyBound);
+ /* Tiny z (<= 0x1p-31) will underflow when calculating z^4.
+ If fp exceptions are to be triggered correctly,
+ sidestep this by fixing such lanes to 0. */
+ uint32x4_t will_uflow
+ = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
if (unlikely (v_any_u32 (will_uflow)))
- z2 = v_sel_f32 (will_uflow, v_f32 (0), z2);
+ z2 = vbslq_f32 (will_uflow, v_f32 (0), z2);
#endif
- v_f32_t z4 = z2 * z2;
- return ESTRIN_5 (z, z2, z4, poly);
+ float32x4_t z4 = vmulq_f32 (z2, z2);
+ return v_estrin_5_f32 (z, z2, z4, d->poly);
}
-/* Fast implementation of Neon tanf.
+/* Fast implementation of AdvSIMD tanf.
Maximum error is 3.45 ULP:
__v_tanf(-0x1.e5f0cap+13) got 0x1.ff9856p-1
want 0x1.ff9850p-1. */
-VPCS_ATTR
-v_f32_t V_NAME (tanf) (v_f32_t x)
+float32x4_t VPCS_ATTR V_NAME_F1 (tan) (float32x4_t x)
{
- v_f32_t special_arg = x;
- v_u32_t ix = v_as_u32_f32 (x);
- v_u32_t iax = ix & AbsMask;
+ const struct data *d = ptr_barrier (&data);
+ float32x4_t special_arg = x;
/* iax >= RangeVal means x, if not inf or NaN, is too large to perform fast
regression. */
#if WANT_SIMD_EXCEPT
+ uint32x4_t iax = vreinterpretq_u32_f32 (vabsq_f32 (x));
/* If fp exceptions are to be triggered correctly, also special-case tiny
input, as this will load to overflow later. Fix any special lanes to 1 to
prevent any exceptions being triggered. */
- v_u32_t special = v_cond_u32 (iax - TinyBound >= RangeVal - TinyBound);
+ uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, TinyBound), Thresh);
if (unlikely (v_any_u32 (special)))
- x = v_sel_f32 (special, v_f32 (1.0f), x);
+ x = vbslq_f32 (special, v_f32 (1.0f), x);
#else
/* Otherwise, special-case large and special values. */
- v_u32_t special = v_cond_u32 (iax >= RangeVal);
+ uint32x4_t special = vcageq_f32 (x, d->range_val);
#endif
/* n = rint(x/(pi/2)). */
- v_f32_t q = v_fma_f32 (InvPio2, x, Shift);
- v_f32_t n = q - Shift;
- /* n is representable as a signed integer, simply convert it. */
- v_s32_t in = v_round_s32 (n);
+ float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
+ float32x4_t n = vsubq_f32 (q, d->shift);
/* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
- v_s32_t alt = in & 1;
- v_u32_t pred_alt = (alt != 0);
+ uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
/* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */
- v_f32_t r;
- r = v_fma_f32 (NegPio2_1, n, x);
- r = v_fma_f32 (NegPio2_2, n, r);
- r = v_fma_f32 (NegPio2_3, n, r);
+ float32x4_t r;
+ r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
+ r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
+ r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
/* If x lives in an interval, where |tan(x)|
- is finite, then use a polynomial approximation of the form
@@ -95,37 +103,25 @@ v_f32_t V_NAME (tanf) (v_f32_t x)
tan(r) = cotan(pi/2 - r) to express tan(x) as 1/tan(-r). Finally, use
the same polynomial approximation of tan as above. */
- /* Perform additional reduction if required. */
- v_f32_t z = v_sel_f32 (pred_alt, -r, r);
+ /* Invert sign of r if odd quadrant. */
+ float32x4_t z = vmulq_f32 (r, vbslq_f32 (pred_alt, v_f32 (-1), v_f32 (1)));
/* Evaluate polynomial approximation of tangent on [-pi/4, pi/4]. */
- v_f32_t z2 = r * r;
- v_f32_t p = eval_poly (z2);
- v_f32_t y = v_fma_f32 (z * z2, p, z);
+ float32x4_t z2 = vmulq_f32 (r, r);
+ float32x4_t p = eval_poly (z2, d);
+ float32x4_t y = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
/* Compute reciprocal and apply if required. */
- v_f32_t inv_y = v_div_f32 (v_f32 (1.0f), y);
- y = v_sel_f32 (pred_alt, inv_y, y);
-
- /* Fast reduction does not handle the x = -0.0 case well,
- therefore it is fixed here. */
- y = v_sel_f32 (x == v_f32 (-0.0), x, y);
+ float32x4_t inv_y = vdivq_f32 (v_f32 (1.0f), y);
if (unlikely (v_any_u32 (special)))
- return specialcase (special_arg, y, special);
- return y;
+ return special_case (special_arg, vbslq_f32 (pred_alt, inv_y, y), special);
+ return vbslq_f32 (pred_alt, inv_y, y);
}
-VPCS_ALIAS
PL_SIG (V, F, 1, tan, -3.1, 3.1)
-PL_TEST_ULP (V_NAME (tanf), 2.96)
-PL_TEST_EXPECT_FENV (V_NAME (tanf), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (tanf), -0.0, -0x1p126, 100)
-PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-149, 0x1p-126, 4000)
-PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-126, 0x1p-23, 50000)
-PL_TEST_INTERVAL (V_NAME (tanf), 0x1p-23, 0.7, 50000)
-PL_TEST_INTERVAL (V_NAME (tanf), 0.7, 1.5, 50000)
-PL_TEST_INTERVAL (V_NAME (tanf), 1.5, 100, 50000)
-PL_TEST_INTERVAL (V_NAME (tanf), 100, 0x1p17, 50000)
-PL_TEST_INTERVAL (V_NAME (tanf), 0x1p17, inf, 50000)
-#endif
+PL_TEST_ULP (V_NAME_F1 (tan), 2.96)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (tan), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0, 0x1p-31, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p-31, 0x1p15, 500000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (tan), 0x1p15, inf, 5000)
diff --git a/pl/math/v_tanh_3u.c b/pl/math/v_tanh_3u.c
index c8b6c251d453..5de85c68da2c 100644
--- a/pl/math/v_tanh_3u.c
+++ b/pl/math/v_tanh_3u.c
@@ -5,90 +5,102 @@
*/
#include "v_math.h"
-#include "estrin.h"
+#include "poly_advsimd_f64.h"
#include "mathlib.h"
#include "pl_sig.h"
#include "pl_test.h"
-#if V_SUPPORTED
+static const struct data
+{
+ float64x2_t poly[11];
+ float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+ uint64x2_t onef;
+ uint64x2_t thresh, tiny_bound;
+} data = {
+ /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
+ .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5),
+ V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10),
+ V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16),
+ V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
+ V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), },
-#define AbsMask v_u64 (0x7fffffffffffffff)
-#define InvLn2 v_f64 (0x1.71547652b82fep0)
-#define MLn2hi v_f64 (-0x1.62e42fefa39efp-1)
-#define MLn2lo v_f64 (-0x1.abc9e3b39803fp-56)
-#define Shift v_f64 (0x1.8p52)
-#define C(i) v_f64 (__expm1_poly[i])
+ .inv_ln2 = V2 (0x1.71547652b82fep0),
+ .ln2_hi = V2 (-0x1.62e42fefa39efp-1),
+ .ln2_lo = V2 (-0x1.abc9e3b39803fp-56),
+ .shift = V2 (0x1.8p52),
-#define BoringBound 0x403241bf835f9d5f /* asuint64 (0x1.241bf835f9d5fp+4). */
-#define TinyBound 0x3e40000000000000 /* asuint64 (0x1p-27). */
-#define One v_u64 (0x3ff0000000000000)
+ .onef = V2 (0x3ff0000000000000),
+ .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */
+ /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
+ .thresh = V2 (0x01f241bf835f9d5f),
+};
-static inline v_f64_t
-expm1_inline (v_f64_t x)
+static inline float64x2_t
+expm1_inline (float64x2_t x, const struct data *d)
{
/* Helper routine for calculating exp(x) - 1. Vector port of the helper from
the scalar variant of tanh. */
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- v_f64_t j = v_fma_f64 (InvLn2, x, Shift) - Shift;
- v_s64_t i = v_to_s64_f64 (j);
- v_f64_t f = v_fma_f64 (j, MLn2hi, x);
- f = v_fma_f64 (j, MLn2lo, f);
+ float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift);
+ int64x2_t i = vcvtq_s64_f64 (j);
+ float64x2_t f = vfmaq_f64 (x, j, d->ln2_hi);
+ f = vfmaq_f64 (f, j, d->ln2_lo);
/* Approximate expm1(f) using polynomial. */
- v_f64_t f2 = f * f;
- v_f64_t f4 = f2 * f2;
- v_f64_t p = v_fma_f64 (f2, ESTRIN_10 (f, f2, f4, f4 * f4, C), f);
+ float64x2_t f2 = vmulq_f64 (f, f);
+ float64x2_t f4 = vmulq_f64 (f2, f2);
+ float64x2_t p = vfmaq_f64 (
+ f, f2, v_estrin_10_f64 (f, f2, f4, vmulq_f64 (f4, f4), d->poly));
/* t = 2 ^ i. */
- v_f64_t t = v_as_f64_u64 (v_as_u64_s64 (i << 52) + One);
+ float64x2_t t = vreinterpretq_f64_u64 (
+ vaddq_u64 (vreinterpretq_u64_s64 (i << 52), d->onef));
/* expm1(x) = p * t + (t - 1). */
- return v_fma_f64 (p, t, t - 1);
+ return vfmaq_f64 (vsubq_f64 (t, v_f64 (1)), p, t);
}
-static NOINLINE v_f64_t
-special_case (v_f64_t x, v_f64_t y, v_u64_t special)
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
{
return v_call_f64 (tanh, x, y, special);
}
/* Vector approximation for double-precision tanh(x), using a simplified
- version of expm1. The greatest observed error is 2.75 ULP:
- __v_tanh(-0x1.c143c3a44e087p-3) got -0x1.ba31ba4691ab7p-3
- want -0x1.ba31ba4691ab4p-3. */
-VPCS_ATTR v_f64_t V_NAME (tanh) (v_f64_t x)
+ version of expm1. The greatest observed error is 2.77 ULP:
+ _ZGVnN2v_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
+ want -0x1.bd6a21a163624p-3. */
+float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x)
{
- v_u64_t ix = v_as_u64_f64 (x);
- v_u64_t ia = ix & AbsMask;
+ const struct data *d = ptr_barrier (&data);
- /* Trigger special-cases for tiny, boring and infinity/NaN. */
- v_u64_t special = v_cond_u64 ((ia - TinyBound) > (BoringBound - TinyBound));
- v_f64_t u;
+ uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
+ float64x2_t u = x;
+
+ /* Trigger special-cases for tiny, boring and infinity/NaN. */
+ uint64x2_t special = vcgtq_u64 (vsubq_u64 (ia, d->tiny_bound), d->thresh);
+#if WANT_SIMD_EXCEPT
/* To trigger fp exceptions correctly, set special lanes to a neutral value.
They will be fixed up later by the special-case handler. */
if (unlikely (v_any_u64 (special)))
- u = v_sel_f64 (special, v_f64 (1), x) * 2;
- else
- u = x * 2;
+ u = v_zerofy_f64 (u, special);
+#endif
+
+ u = vaddq_f64 (u, u);
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- v_f64_t q = expm1_inline (u);
- v_f64_t y = q / (q + 2);
+ float64x2_t q = expm1_inline (u, d);
+ float64x2_t qp2 = vaddq_f64 (q, v_f64 (2));
if (unlikely (v_any_u64 (special)))
- return special_case (x, y, special);
- return y;
+ return special_case (x, vdivq_f64 (q, qp2), special);
+ return vdivq_f64 (q, qp2);
}
-VPCS_ALIAS
PL_SIG (V, D, 1, tanh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME (tanh), 2.26)
-PL_TEST_EXPECT_FENV_ALWAYS (V_NAME (tanh))
-PL_TEST_INTERVAL (V_NAME (tanh), 0, TinyBound, 1000)
-PL_TEST_INTERVAL (V_NAME (tanh), -0, -TinyBound, 1000)
-PL_TEST_INTERVAL (V_NAME (tanh), TinyBound, BoringBound, 100000)
-PL_TEST_INTERVAL (V_NAME (tanh), -TinyBound, -BoringBound, 100000)
-PL_TEST_INTERVAL (V_NAME (tanh), BoringBound, inf, 1000)
-PL_TEST_INTERVAL (V_NAME (tanh), -BoringBound, -inf, 1000)
-#endif
+PL_TEST_ULP (V_NAME_D1 (tanh), 2.27)
+PL_TEST_EXPECT_FENV (V_NAME_D1 (tanh), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0, 0x1p-27, 5000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1p-27, 0x1.241bf835f9d5fp+4, 50000)
+PL_TEST_SYM_INTERVAL (V_NAME_D1 (tanh), 0x1.241bf835f9d5fp+4, inf, 1000)
diff --git a/pl/math/v_tanhf_2u6.c b/pl/math/v_tanhf_2u6.c
index 36166118c0f0..d1cb9fb6eeb3 100644
--- a/pl/math/v_tanhf_2u6.c
+++ b/pl/math/v_tanhf_2u6.c
@@ -9,61 +9,65 @@
#include "pl_sig.h"
#include "pl_test.h"
-#if V_SUPPORTED
-
#include "v_expm1f_inline.h"
-#define BoringBound \
- 0x41102cb3 /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for \
- negative). */
-#define AbsMask 0x7fffffff
+static const struct data
+{
+ struct v_expm1f_data expm1f_consts;
+ uint32x4_t boring_bound, large_bound, onef;
+} data = {
+ .expm1f_consts = V_EXPM1F_DATA,
+ /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
+ .boring_bound = V4 (0x41102cb3),
+ .large_bound = V4 (0x7f800000),
+ .onef = V4 (0x3f800000),
+};
-static NOINLINE v_f32_t
-special_case (v_f32_t x, v_f32_t y, v_u32_t special)
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
{
return v_call_f32 (tanhf, x, y, special);
}
-/* Approximation for single-precision vector tanh(x), using a simplified version
- of expm1f. The maximum error is 2.58 ULP:
- __v_tanhf(0x1.fa5eep-5) got 0x1.f9ba02p-5
- want 0x1.f9ba08p-5. */
-VPCS_ATTR v_f32_t V_NAME (tanhf) (v_f32_t x)
+/* Approximation for single-precision vector tanh(x), using a simplified
+ version of expm1f. The maximum error is 2.58 ULP:
+ _ZGVnN4v_tanhf (0x1.fa5eep-5) got 0x1.f9ba02p-5
+ want 0x1.f9ba08p-5. */
+float32x4_t VPCS_ATTR V_NAME_F1 (tanh) (float32x4_t x)
{
- v_u32_t ix = v_as_u32_f32 (x);
- v_u32_t iax = ix & AbsMask;
- v_u32_t sign = ix & ~AbsMask;
- v_u32_t is_boring = v_cond_u32 (iax > BoringBound);
- v_f32_t boring = v_as_f32_u32 (sign | One);
+ const struct data *d = ptr_barrier (&data);
+
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t sign = veorq_u32 (ix, iax);
+ uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
+ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered properly, set all special and boring
- lanes to 1, which will trigger no exceptions, and fix them up later. */
- v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax < 0x34000000));
- ix = v_sel_u32 (is_boring, v_u32 (One), ix);
+ lanes to 0, which will trigger no exceptions, and fix them up later. */
+ uint32x4_t special = vorrq_u32 (vcgtq_u32 (iax, d->large_bound),
+ vcltq_u32 (iax, v_u32 (0x34000000)));
+ x = v_zerofy_f32 (x, is_boring);
if (unlikely (v_any_u32 (special)))
- ix = v_sel_u32 (special, v_u32 (One), ix);
+ x = v_zerofy_f32 (x, special);
#else
- v_u32_t special = v_cond_u32 ((iax > 0x7f800000) | (iax == 0));
+ uint32x4_t special = vcgtq_u32 (iax, d->large_bound);
#endif
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
- v_f32_t q = expm1f_inline (2 * v_as_f32_u32 (ix));
- v_f32_t y = q / (q + 2);
- y = v_sel_f32 (is_boring, boring, y);
+ float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
+ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
if (unlikely (v_any_u32 (special)))
- return special_case (x, y, special);
- return y;
+ return special_case (vreinterpretq_f32_u32 (ix),
+ vbslq_f32 (is_boring, boring, y), special);
+ return vbslq_f32 (is_boring, boring, y);
}
-VPCS_ALIAS
PL_SIG (V, F, 1, tanh, -10.0, 10.0)
-PL_TEST_ULP (V_NAME (tanhf), 2.09)
-PL_TEST_EXPECT_FENV (V_NAME (tanhf), WANT_SIMD_EXCEPT)
-PL_TEST_INTERVAL (V_NAME (tanhf), 0, 0x1p-23, 1000)
-PL_TEST_INTERVAL (V_NAME (tanhf), -0, -0x1p-23, 1000)
-PL_TEST_INTERVAL (V_NAME (tanhf), 0x1p-23, 0x1.205966p+3, 100000)
-PL_TEST_INTERVAL (V_NAME (tanhf), -0x1p-23, -0x1.205966p+3, 100000)
-PL_TEST_INTERVAL (V_NAME (tanhf), 0x1.205966p+3, inf, 100)
-PL_TEST_INTERVAL (V_NAME (tanhf), -0x1.205966p+3, -inf, 100)
-#endif
+PL_TEST_ULP (V_NAME_F1 (tanh), 2.09)
+PL_TEST_EXPECT_FENV (V_NAME_F1 (tanh), WANT_SIMD_EXCEPT)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0, 0x1p-23, 1000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1p-23, 0x1.205966p+3, 100000)
+PL_TEST_SYM_INTERVAL (V_NAME_F1 (tanh), 0x1.205966p+3, inf, 100)
diff --git a/pl/math/vn_acosh_3u5.c b/pl/math/vn_acosh_3u5.c
deleted file mode 100644
index 649735b140f3..000000000000
--- a/pl/math/vn_acosh_3u5.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_acosh.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_acosh, _ZGVnN2v_acosh)
-#include "v_acosh_3u5.c"
-#endif
diff --git a/pl/math/vn_acoshf_3u1.c b/pl/math/vn_acoshf_3u1.c
deleted file mode 100644
index 8c5f106992a7..000000000000
--- a/pl/math/vn_acoshf_3u1.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_acoshf.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_acoshf, _ZGVnN4v_acoshf)
-#include "v_acoshf_3u1.c"
-#endif
diff --git a/pl/math/vn_asinh_3u5.c b/pl/math/vn_asinh_3u5.c
deleted file mode 100644
index 0d2373b5e4b2..000000000000
--- a/pl/math/vn_asinh_3u5.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_asinh.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_asinh, _ZGVnN2v_asinh)
-#include "v_asinh_3u5.c"
-#endif
diff --git a/pl/math/vn_asinhf_2u7.c b/pl/math/vn_asinhf_2u7.c
deleted file mode 100644
index 6c8927f0875b..000000000000
--- a/pl/math/vn_asinhf_2u7.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_asinhf.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_asinhf, _ZGVnN4v_asinhf)
-#include "v_asinhf_2u7.c"
-#endif
diff --git a/pl/math/vn_atan2_3u.c b/pl/math/vn_atan2_3u.c
deleted file mode 100644
index 925b5b4ef324..000000000000
--- a/pl/math/vn_atan2_3u.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_atan2.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_atan2, _ZGVnN2vv_atan2)
-#include "v_atan2_3u.c"
-#endif
diff --git a/pl/math/vn_atan2f_3u.c b/pl/math/vn_atan2f_3u.c
deleted file mode 100644
index 51d33d50f6ef..000000000000
--- a/pl/math/vn_atan2f_3u.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_atan2f.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_atan2f, _ZGVnN4vv_atan2f)
-#include "v_atan2f_3u.c"
-#endif
diff --git a/pl/math/vn_atan_2u5.c b/pl/math/vn_atan_2u5.c
deleted file mode 100644
index ccebce2dc2ed..000000000000
--- a/pl/math/vn_atan_2u5.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_atan.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_atan, _ZGVnN2v_atan)
-#include "v_atan_2u5.c"
-#endif
diff --git a/pl/math/vn_atanf_3u.c b/pl/math/vn_atanf_3u.c
deleted file mode 100644
index b8797276d981..000000000000
--- a/pl/math/vn_atanf_3u.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_atanf.
- *
- * Copyright (c) 2021-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_atanf, _ZGVnN4v_atanf)
-#include "v_atanf_3u.c"
-#endif
diff --git a/pl/math/vn_atanh_3u5.c b/pl/math/vn_atanh_3u5.c
deleted file mode 100644
index 19429b209b3a..000000000000
--- a/pl/math/vn_atanh_3u5.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_atanh.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_atanh, _ZGVnN2v_atanh)
-#include "v_atanh_3u5.c"
-#endif
diff --git a/pl/math/vn_atanhf_3u1.c b/pl/math/vn_atanhf_3u1.c
deleted file mode 100644
index 7de226dda054..000000000000
--- a/pl/math/vn_atanhf_3u1.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_atanhf.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_atanhf, _ZGVnN4v_atanhf)
-#include "v_atanhf_3u1.c"
-#endif
diff --git a/pl/math/vn_cbrt_2u.c b/pl/math/vn_cbrt_2u.c
deleted file mode 100644
index 4cb0dc8cefb5..000000000000
--- a/pl/math/vn_cbrt_2u.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_cbrt.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_cbrt, _ZGVnN2v_cbrt)
-#include "v_cbrt_2u.c"
-#endif
diff --git a/pl/math/vn_cbrtf_1u5.c b/pl/math/vn_cbrtf_1u5.c
deleted file mode 100644
index 40a72d8c301e..000000000000
--- a/pl/math/vn_cbrtf_1u5.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_cbrtf.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_cbrtf, _ZGVnN4v_cbrtf)
-#include "v_cbrtf_1u5.c"
-#endif
diff --git a/pl/math/vn_cosh_2u.c b/pl/math/vn_cosh_2u.c
deleted file mode 100644
index 9bf7f026447a..000000000000
--- a/pl/math/vn_cosh_2u.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_cosh.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_cosh, _ZGVnN2v_cosh)
-#include "v_cosh_2u.c"
-#endif
diff --git a/pl/math/vn_coshf_2u4.c b/pl/math/vn_coshf_2u4.c
deleted file mode 100644
index b149cb34df61..000000000000
--- a/pl/math/vn_coshf_2u4.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_coshf.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_coshf, _ZGVnN4v_coshf)
-#include "v_coshf_2u4.c"
-#endif
diff --git a/pl/math/vn_erf_2u.c b/pl/math/vn_erf_2u.c
deleted file mode 100644
index 95bd141554e4..000000000000
--- a/pl/math/vn_erf_2u.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_erf.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_erf, _ZGVnN2v_erf)
-#include "v_erf_2u.c"
-#endif
diff --git a/pl/math/vn_erfc_4u.c b/pl/math/vn_erfc_4u.c
deleted file mode 100644
index 1cf6546ce715..000000000000
--- a/pl/math/vn_erfc_4u.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_erfc.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_erfc, _ZGVnN2v_erfc)
-#include "v_erfc_4u.c"
-#endif
diff --git a/pl/math/vn_erfcf_1u.c b/pl/math/vn_erfcf_1u.c
deleted file mode 100644
index ef5a21d6336c..000000000000
--- a/pl/math/vn_erfcf_1u.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_erfcf.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_erfcf, _ZGVnN4v_erfcf)
-#include "v_erfcf_1u.c"
-#endif
diff --git a/pl/math/vn_erff_1u5.c b/pl/math/vn_erff_1u5.c
deleted file mode 100644
index ee8848ee24ed..000000000000
--- a/pl/math/vn_erff_1u5.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_erff.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_erff, _ZGVnN4v_erff)
-#include "v_erff_1u5.c"
-#endif
diff --git a/pl/math/vn_exp_tail.c b/pl/math/vn_exp_tail.c
deleted file mode 100644
index 52a57feefbff..000000000000
--- a/pl/math/vn_exp_tail.c
+++ /dev/null
@@ -1,11 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_erfc.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#include "v_exp_tail.c"
-#endif
diff --git a/pl/math/vn_expf.c b/pl/math/vn_expf.c
deleted file mode 100644
index 83e7f0a2070b..000000000000
--- a/pl/math/vn_expf.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_expf.
- *
- * Copyright (c) 2019-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_expf, _ZGVnN4v_expf)
-#include "v_expf.c"
-#endif
diff --git a/pl/math/vn_expm1_2u5.c b/pl/math/vn_expm1_2u5.c
deleted file mode 100644
index 35111e2fc221..000000000000
--- a/pl/math/vn_expm1_2u5.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_expm1.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_expm1, _ZGVnN2v_expm1)
-#include "v_expm1_2u5.c"
-#endif
diff --git a/pl/math/vn_expm1f_1u6.c b/pl/math/vn_expm1f_1u6.c
deleted file mode 100644
index bea491f4898e..000000000000
--- a/pl/math/vn_expm1f_1u6.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_expm1f.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_expm1f, _ZGVnN4v_expm1f)
-#include "v_expm1f_1u6.c"
-#endif
diff --git a/pl/math/vn_log10_2u5.c b/pl/math/vn_log10_2u5.c
deleted file mode 100644
index 5f32c33e059f..000000000000
--- a/pl/math/vn_log10_2u5.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_log10.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_log10, _ZGVnN2v_log10)
-#include "v_log10_2u5.c"
-#endif
diff --git a/pl/math/vn_log10f_3u5.c b/pl/math/vn_log10f_3u5.c
deleted file mode 100644
index 2673ef515df7..000000000000
--- a/pl/math/vn_log10f_3u5.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_log10f.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_log10f, _ZGVnN4v_log10f)
-#include "v_log10f_3u5.c"
-#endif
diff --git a/pl/math/vn_log1p_2u5.c b/pl/math/vn_log1p_2u5.c
deleted file mode 100644
index 3f4f8d1bd297..000000000000
--- a/pl/math/vn_log1p_2u5.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_log1p.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_log1p, _ZGVnN2v_log1p)
-#include "v_log1p_2u5.c"
-#endif
diff --git a/pl/math/vn_log1pf_2u1.c b/pl/math/vn_log1pf_2u1.c
deleted file mode 100644
index a319bc98f491..000000000000
--- a/pl/math/vn_log1pf_2u1.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_log1pf.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_log1pf, _ZGVnN4v_log1pf)
-#include "v_log1pf_2u1.c"
-#endif
diff --git a/pl/math/vn_log2_3u.c b/pl/math/vn_log2_3u.c
deleted file mode 100644
index a87039204439..000000000000
--- a/pl/math/vn_log2_3u.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_log2.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_log2, _ZGVnN2v_log2)
-#include "v_log2_3u.c"
-#endif
diff --git a/pl/math/vn_log2f_2u5.c b/pl/math/vn_log2f_2u5.c
deleted file mode 100644
index b4a9cb708bae..000000000000
--- a/pl/math/vn_log2f_2u5.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_log2f.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS strong_alias (__vn_log2f, _ZGVnN4v_log2f)
-#include "v_log2f_2u5.c"
-#endif
diff --git a/pl/math/vn_sinh_3u.c b/pl/math/vn_sinh_3u.c
deleted file mode 100644
index 7c881de21688..000000000000
--- a/pl/math/vn_sinh_3u.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_sinh.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_sinh, _ZGVnN2v_sinh)
-#include "v_sinh_3u.c"
-#endif
diff --git a/pl/math/vn_sinhf_2u3.c b/pl/math/vn_sinhf_2u3.c
deleted file mode 100644
index 251e73232d01..000000000000
--- a/pl/math/vn_sinhf_2u3.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_sinhf.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_sinhf, _ZGVnN4v_sinhf)
-#include "v_sinhf_2u3.c"
-#endif
diff --git a/pl/math/vn_tan_3u5.c b/pl/math/vn_tan_3u5.c
deleted file mode 100644
index a4efb065bc08..000000000000
--- a/pl/math/vn_tan_3u5.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_tan.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_tan, _ZGVnN2v_tan)
-#include "v_tan_3u5.c"
-#endif
diff --git a/pl/math/vn_tanf_3u5.c b/pl/math/vn_tanf_3u5.c
deleted file mode 100644
index a88cb4077b3d..000000000000
--- a/pl/math/vn_tanf_3u5.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_tanf.
- *
- * Copyright (c) 2020-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_tanf, _ZGVnN4v_tanf)
-#include "v_tanf_3u5.c"
-#endif
diff --git a/pl/math/vn_tanh_3u.c b/pl/math/vn_tanh_3u.c
deleted file mode 100644
index cb2746cf22a5..000000000000
--- a/pl/math/vn_tanh_3u.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_tanh.
- *
- * Copyright (c) 2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_tanh, _ZGVnN2v_tanh)
-#include "v_tanh_3u.c"
-#endif
diff --git a/pl/math/vn_tanhf_2u6.c b/pl/math/vn_tanhf_2u6.c
deleted file mode 100644
index 47f0a7f57d05..000000000000
--- a/pl/math/vn_tanhf_2u6.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * AdvSIMD vector PCS variant of __v_tanhf.
- *
- * Copyright (c) 2022-2023, Arm Limited.
- * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
- */
-#include "include/mathlib.h"
-#ifdef __vpcs
-#define VPCS 1
-#define VPCS_ALIAS PL_ALIAS (__vn_tanhf, _ZGVnN4v_tanhf)
-#include "v_tanhf_2u6.c"
-#endif
diff --git a/string/aarch64/asmdefs.h b/string/aarch64/asmdefs.h
index 069b146f4a69..131b95e1fea9 100644
--- a/string/aarch64/asmdefs.h
+++ b/string/aarch64/asmdefs.h
@@ -21,6 +21,19 @@
#define FEATURE_1_PAC 2
/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#ifdef __ILP32__
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 2; /* 4-byte alignment in the ILP32 variant. */ \
+ .word 4; /* Name size: "GNU" plus its NUL terminator. */ \
+ .word 12; /* Descriptor size: the three words after the name. */ \
+ .word 5; /* Note type (NT_GNU_PROPERTY_TYPE_0). */ \
+ .asciz "GNU"; \
+ .word type; /* Property type. */ \
+ .word 4; /* Property data size in bytes. */ \
+ .word value; /* Property data. */ \
+ .text
+#else
#define GNU_PROPERTY(type, value) \
.section .note.gnu.property, "a"; \
.p2align 3; \
@@ -33,6 +46,7 @@
.word value; \
.word 0; \
.text
+#endif
/* If set then the GNU Property Note section will be added to
mark objects to support BTI and PAC-RET. */
diff --git a/string/aarch64/memcpy-advsimd.S b/string/aarch64/memcpy-advsimd.S
index e6527d0dac2c..9d3027d4d3cd 100644
--- a/string/aarch64/memcpy-advsimd.S
+++ b/string/aarch64/memcpy-advsimd.S
@@ -1,7 +1,7 @@
/*
* memcpy - copy memory area
*
- * Copyright (c) 2019-2022, Arm Limited.
+ * Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -56,11 +56,12 @@ ENTRY (__memcpy_aarch64_simd)
PTR_ARG (1)
SIZE_ARG (2)
add srcend, src, count
- add dstend, dstin, count
cmp count, 128
b.hi L(copy_long)
+ add dstend, dstin, count
cmp count, 32
b.hi L(copy32_128)
+ nop
/* Small copies: 0..32 bytes. */
cmp count, 16
@@ -71,6 +72,18 @@ ENTRY (__memcpy_aarch64_simd)
str B_q, [dstend, -16]
ret
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ .p2align 4
/* Copy 8-15 bytes. */
L(copy16):
tbz count, 3, L(copy8)
@@ -80,7 +93,6 @@ L(copy16):
str A_h, [dstend, -8]
ret
- .p2align 3
/* Copy 4-7 bytes. */
L(copy8):
tbz count, 2, L(copy4)
@@ -90,31 +102,6 @@ L(copy8):
str B_lw, [dstend, -4]
ret
- /* Copy 0..3 bytes using a branchless sequence. */
-L(copy4):
- cbz count, L(copy0)
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb C_lw, [srcend, -1]
- ldrb B_lw, [src, tmp1]
- strb A_lw, [dstin]
- strb B_lw, [dstin, tmp1]
- strb C_lw, [dstend, -1]
-L(copy0):
- ret
-
- .p2align 4
- /* Medium copies: 33..128 bytes. */
-L(copy32_128):
- ldp A_q, B_q, [src]
- ldp C_q, D_q, [srcend, -32]
- cmp count, 64
- b.hi L(copy128)
- stp A_q, B_q, [dstin]
- stp C_q, D_q, [dstend, -32]
- ret
-
- .p2align 4
/* Copy 65..128 bytes. */
L(copy128):
ldp E_q, F_q, [src, 32]
@@ -128,8 +115,24 @@ L(copy96):
stp C_q, D_q, [dstend, -32]
ret
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 3
/* Copy more than 128 bytes. */
L(copy_long):
+ add dstend, dstin, count
+
/* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src
cmp tmp1, count
@@ -166,6 +169,9 @@ L(copy64_from_end):
stp A_q, B_q, [dstend, -32]
ret
+ .p2align 4
+ nop
+
/* Large backwards copy for overlapping copies.
Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
diff --git a/string/aarch64/memcpy-mops.S b/string/aarch64/memcpy-mops.S
new file mode 100644
index 000000000000..b45c31418717
--- /dev/null
+++ b/string/aarch64/memcpy-mops.S
@@ -0,0 +1,21 @@
+/*
+ * memcpy using MOPS extension.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "asmdefs.h"
+
+ENTRY (__memcpy_aarch64_mops)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+
+ mov x3, x0 /* Work on a copy: x0 must still hold dst on return. */
+ .inst 0x19010443 /* cpyfp [x3]!, [x1]!, x2! (prologue) */
+ .inst 0x19410443 /* cpyfm [x3]!, [x1]!, x2! (main) */
+ .inst 0x19810443 /* cpyfe [x3]!, [x1]!, x2! (epilogue) */
+ ret
+
+END (__memcpy_aarch64_mops)
diff --git a/string/aarch64/memmove-mops.S b/string/aarch64/memmove-mops.S
new file mode 100644
index 000000000000..6c73017bb16f
--- /dev/null
+++ b/string/aarch64/memmove-mops.S
@@ -0,0 +1,21 @@
+/*
+ * memmove using MOPS extension.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "asmdefs.h"
+
+ENTRY (__memmove_aarch64_mops)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+
+ mov x3, x0 /* Work on a copy: x0 must still hold dst on return. */
+ .inst 0x1d010443 /* cpyp [x3]!, [x1]!, x2! (prologue) */
+ .inst 0x1d410443 /* cpym [x3]!, [x1]!, x2! (main) */
+ .inst 0x1d810443 /* cpye [x3]!, [x1]!, x2! (epilogue) */
+ ret
+
+END (__memmove_aarch64_mops)
diff --git a/string/aarch64/memset-mops.S b/string/aarch64/memset-mops.S
new file mode 100644
index 000000000000..ec791493bae9
--- /dev/null
+++ b/string/aarch64/memset-mops.S
@@ -0,0 +1,20 @@
+/*
+ * memset using MOPS extension.
+ *
+ * Copyright (c) 2023, Arm Limited.
+ * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
+ */
+
+#include "asmdefs.h"
+
+ENTRY (__memset_aarch64_mops)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+
+ mov x3, x0 /* Work on a copy: x0 must still hold dst on return. */
+ .inst 0x19c10443 /* setp [x3]!, x2!, x1 (prologue) */
+ .inst 0x19c14443 /* setm [x3]!, x2!, x1 (main) */
+ .inst 0x19c18443 /* sete [x3]!, x2!, x1 (epilogue) */
+ ret
+
+END (__memset_aarch64_mops)
diff --git a/string/bench/memcpy.c b/string/bench/memcpy.c
index 1468663e51cd..b628f9b60d96 100644
--- a/string/bench/memcpy.c
+++ b/string/bench/memcpy.c
@@ -1,7 +1,7 @@
/*
* memcpy benchmark.
*
- * Copyright (c) 2020-2022, Arm Limited.
+ * Copyright (c) 2020-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -39,6 +39,9 @@ static const struct fun
# if __ARM_FEATURE_SVE
F(__memcpy_aarch64_sve)
# endif
+# if WANT_MOPS
+ F(__memcpy_aarch64_mops)
+# endif
#elif __arm__
F(__memcpy_arm)
#endif
diff --git a/string/include/stringlib.h b/string/include/stringlib.h
index f41a46446888..01da7ebfc18d 100644
--- a/string/include/stringlib.h
+++ b/string/include/stringlib.h
@@ -1,7 +1,7 @@
/*
* Public API.
*
- * Copyright (c) 2019-2022, Arm Limited.
+ * Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -52,6 +52,11 @@ size_t __strlen_aarch64_sve (const char *);
size_t __strnlen_aarch64_sve (const char *, size_t);
int __strncmp_aarch64_sve (const char *, const char *, size_t);
# endif
+# if WANT_MOPS
+void *__memcpy_aarch64_mops (void *__restrict, const void *__restrict, size_t);
+void *__memmove_aarch64_mops (void *, const void *, size_t); /* may overlap */
+void *__memset_aarch64_mops (void *, int, size_t);
+# endif
# if __ARM_FEATURE_MEMORY_TAGGING
void *__mtag_tag_region (void *, size_t);
void *__mtag_tag_zero_region (void *, size_t);
diff --git a/string/test/memcpy.c b/string/test/memcpy.c
index fa15a95b2bda..dc95844bd45a 100644
--- a/string/test/memcpy.c
+++ b/string/test/memcpy.c
@@ -1,7 +1,7 @@
/*
* memcpy test.
*
- * Copyright (c) 2019-2022, Arm Limited.
+ * Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -31,6 +31,9 @@ static const struct fun
# if __ARM_FEATURE_SVE
F(__memcpy_aarch64_sve, 1)
# endif
+# if WANT_MOPS
+ F(__memcpy_aarch64_mops, 1)
+# endif
#elif __arm__
F(__memcpy_arm, 0)
#endif
diff --git a/string/test/memmove.c b/string/test/memmove.c
index 5d509c03affa..b85dd1e864ef 100644
--- a/string/test/memmove.c
+++ b/string/test/memmove.c
@@ -1,7 +1,7 @@
/*
* memmove test.
*
- * Copyright (c) 2019-2022, Arm Limited.
+ * Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -31,6 +31,9 @@ static const struct fun
# if __ARM_FEATURE_SVE
F(__memmove_aarch64_sve, 1)
# endif
+# if WANT_MOPS
+ F(__memmove_aarch64_mops, 1)
+# endif
#endif
{0, 0, 0}
// clang-format on
diff --git a/string/test/memset.c b/string/test/memset.c
index 5543f44bb026..7d09c267ffec 100644
--- a/string/test/memset.c
+++ b/string/test/memset.c
@@ -1,7 +1,7 @@
/*
* memset test.
*
- * Copyright (c) 2019-2020, Arm Limited.
+ * Copyright (c) 2019-2023, Arm Limited.
* SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
*/
@@ -25,6 +25,9 @@ static const struct fun
F(memset, 0)
#if __aarch64__
F(__memset_aarch64, 1)
+# if WANT_MOPS
+ F(__memset_aarch64_mops, 1)
+# endif
#elif __arm__
F(__memset_arm, 0)
#endif