diff options
Diffstat (limited to 'contrib/arm-optimized-routines/math/test/mathbench.c')
-rw-r--r-- | contrib/arm-optimized-routines/math/test/mathbench.c | 426 |
1 files changed, 117 insertions, 309 deletions
diff --git a/contrib/arm-optimized-routines/math/test/mathbench.c b/contrib/arm-optimized-routines/math/test/mathbench.c index 0c17826e5296..653c58fbc484 100644 --- a/contrib/arm-optimized-routines/math/test/mathbench.c +++ b/contrib/arm-optimized-routines/math/test/mathbench.c @@ -1,10 +1,23 @@ /* * Microbenchmark for math functions. * - * Copyright (c) 2018-2020, Arm Limited. - * SPDX-License-Identifier: MIT + * Copyright (c) 2018-2024, Arm Limited. + * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception */ +#if WANT_SVE_TESTS +# if __aarch64__ && __linux__ +# ifdef __clang__ +# pragma clang attribute push(__attribute__((target("sve"))), \ + apply_to = any(function)) +# else +# pragma GCC target("+sve") +# endif +# else +# error "SVE not supported - please disable WANT_SVE_TESTS" +# endif +#endif + #undef _GNU_SOURCE #define _GNU_SOURCE 1 #include <stdint.h> @@ -15,11 +28,6 @@ #include <math.h> #include "mathlib.h" -#ifndef WANT_VMATH -/* Enable the build of vector math code. */ -# define WANT_VMATH 1 -#endif - /* Number of measurements, best result is reported. */ #define MEASURE 60 /* Array size. */ @@ -34,50 +42,6 @@ static float Af[N]; static long measurecount = MEASURE; static long itercount = ITER; -#if __aarch64__ && WANT_VMATH -typedef __f64x2_t v_double; - -#define v_double_len() 2 - -static inline v_double -v_double_load (const double *p) -{ - return (v_double){p[0], p[1]}; -} - -static inline v_double -v_double_dup (double x) -{ - return (v_double){x, x}; -} - -typedef __f32x4_t v_float; - -#define v_float_len() 4 - -static inline v_float -v_float_load (const float *p) -{ - return (v_float){p[0], p[1], p[2], p[3]}; -} - -static inline v_float -v_float_dup (float x) -{ - return (v_float){x, x, x, x}; -} -#else -/* dummy definitions to make things compile. */ -typedef double v_double; -typedef float v_float; -#define v_double_len(x) 1 -#define v_double_load(x) (x)[0] -#define v_double_dup(x) (x) -#define v_float_len(x) 1 -#define v_float_load(x) (x)[0] -#define v_float_dup(x) (x) -#endif - static double dummy (double x) { @@ -89,128 +53,35 @@ dummyf (float x) { return x; } - -#if WANT_VMATH -#if __aarch64__ -static v_double -__v_dummy (v_double x) +#if __aarch64__ && __linux__ +__vpcs static float64x2_t +__vn_dummy (float64x2_t x) { return x; } -static v_float -__v_dummyf (v_float x) +__vpcs static float32x4_t +__vn_dummyf (float32x4_t x) { return x; } - -#ifdef __vpcs -__vpcs static v_double -__vn_dummy (v_double x) +#endif +#if WANT_SVE_TESTS +static svfloat64_t +__sv_dummy (svfloat64_t x, svbool_t pg) { return x; } -__vpcs static v_float -__vn_dummyf (v_float x) +static svfloat32_t +__sv_dummyf (svfloat32_t x, svbool_t pg) { return x; } -__vpcs static v_float -xy__vn_powf (v_float x) -{ - return __vn_powf (x, x); -} - -__vpcs static v_float -xy_Z_powf (v_float x) -{ - return _ZGVnN4vv_powf (x, x); -} - -__vpcs static v_double -xy__vn_pow (v_double x) -{ - return __vn_pow (x, x); -} - -__vpcs static v_double -xy_Z_pow (v_double x) -{ - return _ZGVnN2vv_pow (x, x); -} #endif -static v_float -xy__v_powf (v_float x) -{ - return __v_powf (x, x); -} - -static v_double -xy__v_pow (v_double x) -{ - return __v_pow (x, x); -} -#endif - -static float -xy__s_powf (float x) -{ - return __s_powf (x, x); -} - -static double -xy__s_pow (double x) -{ - return __s_pow (x, x); -} -#endif - -static double -xypow (double x) -{ - return pow (x, x); -} - -static float -xypowf (float x) -{ - return powf (x, x); -} - -static double -xpow (double x) -{ - return pow (x, 23.4); -} - -static float -xpowf (float x) -{ - return powf (x, 23.4f); -} - -static double -ypow (double x) -{ - return pow (2.34, x); -} - -static float -ypowf (float x) -{ - return powf (2.34f, x); -} - -static float -sincosf_wrap (float x) -{ - float s, c; - sincosf (x, &s, &c); - return s + c; -} +#include "test/mathbench_wrappers.h" static const struct fun { @@ -223,127 +94,42 @@ static const struct fun { double (*d) (double); float (*f) (float); - v_double (*vd) (v_double); - v_float (*vf) (v_float); -#ifdef __vpcs - __vpcs v_double (*vnd) (v_double); - __vpcs v_float (*vnf) (v_float); +#if __aarch64__ && __linux__ + __vpcs float64x2_t (*vnd) (float64x2_t); + __vpcs float32x4_t (*vnf) (float32x4_t); +#endif +#if WANT_SVE_TESTS + svfloat64_t (*svd) (svfloat64_t, svbool_t); + svfloat32_t (*svf) (svfloat32_t, svbool_t); #endif } fun; } funtab[] = { +// clang-format off #define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}}, #define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}}, -#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}}, -#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}}, #define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}}, #define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}}, +#define SVD(func, lo, hi) {#func, 'd', 's', lo, hi, {.svd = func}}, +#define SVF(func, lo, hi) {#func, 'f', 's', lo, hi, {.svf = func}}, D (dummy, 1.0, 2.0) -D (exp, -9.9, 9.9) -D (exp, 0.5, 1.0) -D (exp2, -9.9, 9.9) -D (log, 0.01, 11.1) -D (log, 0.999, 1.001) -D (log2, 0.01, 11.1) -D (log2, 0.999, 1.001) -{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}}, -D (xpow, 0.01, 11.1) -D (ypow, -9.9, 9.9) -D (erf, -6.0, 6.0) - F (dummyf, 1.0, 2.0) -F (expf, -9.9, 9.9) -F (exp2f, -9.9, 9.9) -F (logf, 0.01, 11.1) -F (log2f, 0.01, 11.1) -{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}}, -F (xpowf, 0.01, 11.1) -F (ypowf, -9.9, 9.9) -{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}}, -{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}}, -F (sinf, 0.1, 0.7) -F (sinf, 0.8, 3.1) -F (sinf, -3.1, 3.1) -F (sinf, 3.3, 33.3) -F (sinf, 100, 1000) -F (sinf, 1e6, 1e32) -F (cosf, 0.1, 0.7) -F (cosf, 0.8, 3.1) -F (cosf, -3.1, 3.1) -F (cosf, 3.3, 33.3) -F (cosf, 100, 1000) -F (cosf, 1e6, 1e32) -F (erff, -4.0, 4.0) -#if WANT_VMATH -D (__s_sin, -3.1, 3.1) -D (__s_cos, -3.1, 3.1) -D (__s_exp, -9.9, 9.9) -D (__s_log, 0.01, 11.1) -{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}}, -F (__s_expf, -9.9, 9.9) -F (__s_expf_1u, -9.9, 9.9) -F (__s_exp2f, -9.9, 9.9) -F (__s_exp2f_1u, -9.9, 9.9) -F (__s_logf, 0.01, 11.1) -{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}}, -F (__s_sinf, -3.1, 3.1) -F (__s_cosf, -3.1, 3.1) -#if __aarch64__ -VD (__v_dummy, 1.0, 2.0) -VD (__v_sin, -3.1, 3.1) -VD (__v_cos, -3.1, 3.1) -VD (__v_exp, -9.9, 9.9) -VD (__v_log, 0.01, 11.1) -{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}}, -VF (__v_dummyf, 1.0, 2.0) -VF (__v_expf, -9.9, 9.9) -VF (__v_expf_1u, -9.9, 9.9) -VF (__v_exp2f, -9.9, 9.9) -VF (__v_exp2f_1u, -9.9, 9.9) -VF (__v_logf, 0.01, 11.1) -{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}}, -VF (__v_sinf, -3.1, 3.1) -VF (__v_cosf, -3.1, 3.1) -#ifdef __vpcs +#if __aarch64__ && __linux__ VND (__vn_dummy, 1.0, 2.0) -VND (__vn_exp, -9.9, 9.9) -VND (_ZGVnN2v_exp, -9.9, 9.9) -VND (__vn_log, 0.01, 11.1) -VND (_ZGVnN2v_log, 0.01, 11.1) -{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}}, -{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}}, -VND (__vn_sin, -3.1, 3.1) -VND (_ZGVnN2v_sin, -3.1, 3.1) -VND (__vn_cos, -3.1, 3.1) -VND (_ZGVnN2v_cos, -3.1, 3.1) VNF (__vn_dummyf, 1.0, 2.0) -VNF (__vn_expf, -9.9, 9.9) -VNF (_ZGVnN4v_expf, -9.9, 9.9) -VNF (__vn_expf_1u, -9.9, 9.9) -VNF (__vn_exp2f, -9.9, 9.9) -VNF (_ZGVnN4v_exp2f, -9.9, 9.9) -VNF (__vn_exp2f_1u, -9.9, 9.9) -VNF (__vn_logf, 0.01, 11.1) -VNF (_ZGVnN4v_logf, 0.01, 11.1) -{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}}, -{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}}, -VNF (__vn_sinf, -3.1, 3.1) -VNF (_ZGVnN4v_sinf, -3.1, 3.1) -VNF (__vn_cosf, -3.1, 3.1) -VNF (_ZGVnN4v_cosf, -3.1, 3.1) -#endif #endif +#if WANT_SVE_TESTS +SVD (__sv_dummy, 1.0, 2.0) +SVF (__sv_dummyf, 1.0, 2.0) #endif +#include "test/mathbench_funcs.h" {0}, #undef F #undef D -#undef VF -#undef VD #undef VNF #undef VND +#undef SVF +#undef SVD + // clang-format on }; static void @@ -442,69 +228,77 @@ runf_latency (float f (float)) prev = f (Af[i] + prev * z); } +#if __aarch64__ && __linux__ static void -run_v_thruput (v_double f (v_double)) +run_vn_thruput (__vpcs float64x2_t f (float64x2_t)) { - for (int i = 0; i < N; i += v_double_len ()) - f (v_double_load (A+i)); + for (int i = 0; i < N; i += 2) + f (vld1q_f64 (A + i)); } static void -runf_v_thruput (v_float f (v_float)) +runf_vn_thruput (__vpcs float32x4_t f (float32x4_t)) { - for (int i = 0; i < N; i += v_float_len ()) - f (v_float_load (Af+i)); + for (int i = 0; i < N; i += 4) + f (vld1q_f32 (Af + i)); } static void -run_v_latency (v_double f (v_double)) +run_vn_latency (__vpcs float64x2_t f (float64x2_t)) { - v_double z = v_double_dup (zero); - v_double prev = z; - for (int i = 0; i < N; i += v_double_len ()) - prev = f (v_double_load (A+i) + prev * z); + volatile uint64x2_t vsel = (uint64x2_t) { 0, 0 }; + uint64x2_t sel = vsel; + float64x2_t prev = vdupq_n_f64 (0); + for (int i = 0; i < N; i += 2) + prev = f (vbslq_f64 (sel, prev, vld1q_f64 (A + i))); } static void -runf_v_latency (v_float f (v_float)) +runf_vn_latency (__vpcs float32x4_t f (float32x4_t)) { - v_float z = v_float_dup (zero); - v_float prev = z; - for (int i = 0; i < N; i += v_float_len ()) - prev = f (v_float_load (Af+i) + prev * z); + volatile uint32x4_t vsel = (uint32x4_t) { 0, 0, 0, 0 }; + uint32x4_t sel = vsel; + float32x4_t prev = vdupq_n_f32 (0); + for (int i = 0; i < N; i += 4) + prev = f (vbslq_f32 (sel, prev, vld1q_f32 (Af + i))); } +#endif -#ifdef __vpcs +#if WANT_SVE_TESTS static void -run_vn_thruput (__vpcs v_double f (v_double)) +run_sv_thruput (svfloat64_t f (svfloat64_t, svbool_t)) { - for (int i = 0; i < N; i += v_double_len ()) - f (v_double_load (A+i)); + for (int i = 0; i < N; i += svcntd ()) + f (svld1_f64 (svptrue_b64 (), A + i), svptrue_b64 ()); } static void -runf_vn_thruput (__vpcs v_float f (v_float)) +runf_sv_thruput (svfloat32_t f (svfloat32_t, svbool_t)) { - for (int i = 0; i < N; i += v_float_len ()) - f (v_float_load (Af+i)); + for (int i = 0; i < N; i += svcntw ()) + f (svld1_f32 (svptrue_b32 (), Af + i), svptrue_b32 ()); } static void -run_vn_latency (__vpcs v_double f (v_double)) +run_sv_latency (svfloat64_t f (svfloat64_t, svbool_t)) { - v_double z = v_double_dup (zero); - v_double prev = z; - for (int i = 0; i < N; i += v_double_len ()) - prev = f (v_double_load (A+i) + prev * z); + volatile svbool_t vsel = svptrue_b64 (); + svbool_t sel = vsel; + svfloat64_t prev = svdup_f64 (0); + for (int i = 0; i < N; i += svcntd ()) + prev = f (svsel_f64 (sel, svld1_f64 (svptrue_b64 (), A + i), prev), + svptrue_b64 ()); } static void -runf_vn_latency (__vpcs v_float f (v_float)) +runf_sv_latency (svfloat32_t f (svfloat32_t, svbool_t)) { - v_float z = v_float_dup (zero); - v_float prev = z; - for (int i = 0; i < N; i += v_float_len ()) - prev = f (v_float_load (Af+i) + prev * z); + volatile svbool_t vsel = svptrue_b32 (); + svbool_t sel = vsel; + svfloat32_t prev = svdup_f32 (0); + for (int i = 0; i < N; i += svcntw ()) + prev = f (svsel_f32 (sel, svld1_f32 (svptrue_b32 (), Af + i), prev), + svptrue_b32 ()); } #endif @@ -512,7 +306,11 @@ static uint64_t tic (void) { struct timespec ts; +#if defined(_MSC_VER) + if (!timespec_get (&ts, TIME_UTC)) +#else if (clock_gettime (CLOCK_REALTIME, &ts)) +#endif abort (); return ts.tv_sec * 1000000000ULL + ts.tv_nsec; } @@ -539,10 +337,12 @@ bench1 (const struct fun *f, int type, double lo, double hi) const char *s = type == 't' ? "rthruput" : "latency"; int vlen = 1; - if (f->vec && f->prec == 'd') - vlen = v_double_len(); - else if (f->vec && f->prec == 'f') - vlen = v_float_len(); + if (f->vec == 'n') + vlen = f->prec == 'd' ? 2 : 4; +#if WANT_SVE_TESTS + else if (f->vec == 's') + vlen = f->prec == 'd' ? svcntd () : svcntw (); +#endif if (f->prec == 'd' && type == 't' && f->vec == 0) TIMEIT (run_thruput, f->fun.d); @@ -552,15 +352,7 @@ bench1 (const struct fun *f, int type, double lo, double hi) TIMEIT (runf_thruput, f->fun.f); else if (f->prec == 'f' && type == 'l' && f->vec == 0) TIMEIT (runf_latency, f->fun.f); - else if (f->prec == 'd' && type == 't' && f->vec == 'v') - TIMEIT (run_v_thruput, f->fun.vd); - else if (f->prec == 'd' && type == 'l' && f->vec == 'v') - TIMEIT (run_v_latency, f->fun.vd); - else if (f->prec == 'f' && type == 't' && f->vec == 'v') - TIMEIT (runf_v_thruput, f->fun.vf); - else if (f->prec == 'f' && type == 'l' && f->vec == 'v') - TIMEIT (runf_v_latency, f->fun.vf); -#ifdef __vpcs +#if __aarch64__ && __linux__ else if (f->prec == 'd' && type == 't' && f->vec == 'n') TIMEIT (run_vn_thruput, f->fun.vnd); else if (f->prec == 'd' && type == 'l' && f->vec == 'n') @@ -570,20 +362,32 @@ bench1 (const struct fun *f, int type, double lo, double hi) else if (f->prec == 'f' && type == 'l' && f->vec == 'n') TIMEIT (runf_vn_latency, f->fun.vnf); #endif +#if WANT_SVE_TESTS + else if (f->prec == 'd' && type == 't' && f->vec == 's') + TIMEIT (run_sv_thruput, f->fun.svd); + else if (f->prec == 'd' && type == 'l' && f->vec == 's') + TIMEIT (run_sv_latency, f->fun.svd); + else if (f->prec == 'f' && type == 't' && f->vec == 's') + TIMEIT (runf_sv_thruput, f->fun.svf); + else if (f->prec == 'f' && type == 'l' && f->vec == 's') + TIMEIT (runf_sv_latency, f->fun.svf); +#endif if (type == 't') { ns100 = (100 * dt + itercount * N / 2) / (itercount * N); - printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s, + printf ("%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g] vlen %d\n", + f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi); + (unsigned long long) dt, lo, hi, vlen); } else if (type == 'l') { ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen); - printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s, + printf ("%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g] vlen %d\n", + f->name, s, (unsigned) (ns100 / 100), (unsigned) (ns100 % 100), - (unsigned long long) dt, lo, hi); + (unsigned long long) dt, lo, hi, vlen); } fflush (stdout); } @@ -771,3 +575,7 @@ main (int argc, char *argv[]) } return 0; } + +#if __aarch64__ && __linux__ && WANT_SVE_TESTS && defined(__clang__) +# pragma clang attribute pop +#endif |