diff options
Diffstat (limited to 'sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.c')
-rw-r--r-- | sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.c | 483 |
1 files changed, 303 insertions, 180 deletions
diff --git a/sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.c b/sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.c index c3809a2827be..5684b4ff1a97 100644 --- a/sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.c +++ b/sys/contrib/openzfs/module/icp/algs/blake3/blake3_impl.c @@ -6,7 +6,7 @@ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. + * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * @@ -23,185 +23,270 @@ * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de> */ +#include <sys/simd.h> #include <sys/zfs_context.h> -#include <sys/zio_checksum.h> +#include <sys/zfs_impl.h> +#include <sys/blake3.h> #include "blake3_impl.h" -static const blake3_impl_ops_t *const blake3_impls[] = { - &blake3_generic_impl, -#if defined(__aarch64__) || \ +#if !defined(OMIT_SIMD) && (defined(__aarch64__) || \ (defined(__x86_64) && defined(HAVE_SSE2)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - &blake3_sse2_impl, -#endif -#if defined(__aarch64__) || \ - (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ - (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) - &blake3_sse41_impl, -#endif -#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) - &blake3_avx2_impl, -#endif -#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) - &blake3_avx512_impl, + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))) +#define USE_SIMD #endif -}; -/* this pointer holds current ops for implementation */ -static const blake3_impl_ops_t *blake3_selected_impl = &blake3_generic_impl; +#ifdef USE_SIMD +extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, + flags); + kfpu_end(); +} -/* special implementation selections */ -#define IMPL_FASTEST (UINT32_MAX) -#define IMPL_CYCLE (UINT32_MAX-1) -#define IMPL_USER (UINT32_MAX-2) -#define IMPL_PARAM (UINT32_MAX-3) +static void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} -#define IMPL_READ(i) (*(volatile uint32_t *) &(i)) -static uint32_t icp_blake3_impl = IMPL_FASTEST; +static void blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} -#define BLAKE3_IMPL_NAME_MAX 16 +static boolean_t blake3_is_sse2_supported(void) +{ +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse2_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif +} -/* id of fastest implementation */ -static uint32_t blake3_fastest_id = 0; +const blake3_ops_t blake3_sse2_impl = { + .compress_in_place = blake3_compress_in_place_sse2, + .compress_xof = blake3_compress_xof_sse2, + .hash_many = blake3_hash_many_sse2, + .is_supported = blake3_is_sse2_supported, + .degree = 4, + .name = "sse2" +}; +#endif -/* currently used id */ -static uint32_t blake3_current_id = 0; +#ifdef USE_SIMD -/* id of module parameter (-1 == unused) */ -static int blake3_param_id = -1; +extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); -/* return number of supported implementations */ -int -blake3_get_impl_count(void) -{ - static int impls = 0; - int i; +extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); - if (impls) - return (impls); +extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); - for (i = 0; i < ARRAY_SIZE(blake3_impls); i++) { - if (!blake3_impls[i]->is_supported()) continue; - impls++; - } +static void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, + flags); + kfpu_end(); +} - return (impls); +static void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, + out); + kfpu_end(); } -/* return id of selected implementation */ -int -blake3_get_impl_id(void) -{ - return (blake3_current_id); +static void blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); } -/* return name of selected implementation */ -const char * -blake3_get_impl_name(void) +static boolean_t blake3_is_sse41_supported(void) { - return (blake3_selected_impl->name); +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse4_1_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif } -/* setup id as fastest implementation */ -void -blake3_set_impl_fastest(uint32_t id) -{ - blake3_fastest_id = id; +const blake3_ops_t blake3_sse41_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_sse41, + .is_supported = blake3_is_sse41_supported, + .degree = 4, + .name = "sse41" +}; +#endif + +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) +extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); } -/* set implementation by id */ -void -blake3_set_impl_id(uint32_t id) +static boolean_t blake3_is_avx2_supported(void) { - int i, cid; - - /* select fastest */ - if (id == IMPL_FASTEST) - id = blake3_fastest_id; - - /* select next or first */ - if (id == IMPL_CYCLE) - id = (++blake3_current_id) % blake3_get_impl_count(); - - /* 0..N for the real impl */ - for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { - if (!blake3_impls[i]->is_supported()) continue; - if (cid == id) { - blake3_current_id = cid; - blake3_selected_impl = blake3_impls[i]; - return; - } - cid++; - } + return (kfpu_allowed() && zfs_sse4_1_available() && + zfs_avx2_available()); } -/* set implementation by name */ -int -blake3_set_impl_name(const char *name) -{ - int i, cid; - - if (strcmp(name, "fastest") == 0) { - atomic_swap_32(&icp_blake3_impl, IMPL_FASTEST); - blake3_set_impl_id(IMPL_FASTEST); - return (0); - } else if (strcmp(name, "cycle") == 0) { - atomic_swap_32(&icp_blake3_impl, IMPL_CYCLE); - blake3_set_impl_id(IMPL_CYCLE); - return (0); - } +const blake3_ops_t +blake3_avx2_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_avx2, + .is_supported = blake3_is_avx2_supported, + .degree = 8, + .name = "avx2" +}; +#endif - for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { - if (!blake3_impls[i]->is_supported()) continue; - if (strcmp(name, blake3_impls[i]->name) == 0) { - if (icp_blake3_impl == IMPL_PARAM) { - blake3_param_id = cid; - return (0); - } - blake3_selected_impl = blake3_impls[i]; - blake3_current_id = cid; - return (0); - } - cid++; - } +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) +extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, + flags); + kfpu_end(); +} - return (-EINVAL); +static void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, + out); + kfpu_end(); } -/* setup implementation */ -void -blake3_setup_impl(void) -{ - switch (IMPL_READ(icp_blake3_impl)) { - case IMPL_PARAM: - blake3_set_impl_id(blake3_param_id); - atomic_swap_32(&icp_blake3_impl, IMPL_USER); - break; - case IMPL_FASTEST: - blake3_set_impl_id(IMPL_FASTEST); - break; - case IMPL_CYCLE: - blake3_set_impl_id(IMPL_CYCLE); - break; - default: - blake3_set_impl_id(blake3_current_id); - break; - } +static void blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); } -/* return selected implementation */ -const blake3_impl_ops_t * -blake3_impl_get_ops(void) +static boolean_t blake3_is_avx512_supported(void) { - /* each call to ops will cycle */ - if (icp_blake3_impl == IMPL_CYCLE) - blake3_set_impl_id(IMPL_CYCLE); - - return (blake3_selected_impl); + return (kfpu_allowed() && zfs_avx512f_available() && + zfs_avx512vl_available()); } -#if defined(_KERNEL) +const blake3_ops_t blake3_avx512_impl = { + .compress_in_place = blake3_compress_in_place_avx512, + .compress_xof = blake3_compress_xof_avx512, + .hash_many = blake3_hash_many_avx512, + .is_supported = blake3_is_avx512_supported, + .degree = 16, + .name = "avx512" +}; +#endif + +extern const blake3_ops_t blake3_generic_impl; + +static const blake3_ops_t *const blake3_impls[] = { + &blake3_generic_impl, +#ifdef USE_SIMD +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse2_impl, +#endif +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse41_impl, +#endif +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) + &blake3_avx2_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) + &blake3_avx512_impl, +#endif +#endif +}; + +/* use the generic implementation functions */ +#define IMPL_NAME "blake3" +#define IMPL_OPS_T blake3_ops_t +#define IMPL_ARRAY blake3_impls +#define IMPL_GET_OPS blake3_get_ops +#define ZFS_IMPL_OPS zfs_blake3_ops +#include <generic_impl.c> + +#ifdef _KERNEL void **blake3_per_cpu_ctx; void @@ -227,58 +312,96 @@ blake3_per_cpu_ctx_fini(void) memset(blake3_per_cpu_ctx, 0, max_ncpus * sizeof (void *)); kmem_free(blake3_per_cpu_ctx, max_ncpus * sizeof (void *)); } -#endif -#if defined(_KERNEL) && defined(__linux__) +#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") + +#if defined(__linux__) + static int -icp_blake3_impl_set(const char *name, zfs_kernel_param_t *kp) +blake3_param_get(char *buffer, zfs_kernel_param_t *unused) { - char req_name[BLAKE3_IMPL_NAME_MAX]; - size_t i; + const uint32_t impl = IMPL_READ(generic_impl_chosen); + char *fmt; + int cnt = 0; - /* sanitize input */ - i = strnlen(name, BLAKE3_IMPL_NAME_MAX); - if (i == 0 || i >= BLAKE3_IMPL_NAME_MAX) - return (-EINVAL); + /* cycling */ + fmt = IMPL_FMT(impl, IMPL_CYCLE); + cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "cycle"); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, "fastest"); + + /* list all supported implementations */ + generic_impl_init(); + for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, + blake3_impls[i]->name); + } - strlcpy(req_name, name, BLAKE3_IMPL_NAME_MAX); - while (i > 0 && isspace(req_name[i-1])) - i--; - req_name[i] = '\0'; + return (cnt); +} - atomic_swap_32(&icp_blake3_impl, IMPL_PARAM); - return (blake3_set_impl_name(req_name)); +static int +blake3_param_set(const char *val, zfs_kernel_param_t *unused) +{ + (void) unused; + return (generic_impl_setname(val)); } +#elif defined(__FreeBSD__) + +#include <sys/sbuf.h> + static int -icp_blake3_impl_get(char *buffer, zfs_kernel_param_t *kp) +blake3_param(ZFS_MODULE_PARAM_ARGS) { - int i, cid, cnt = 0; - char *fmt; + int err; - /* cycling */ - fmt = (icp_blake3_impl == IMPL_CYCLE) ? "[cycle] " : "cycle "; - cnt += sprintf(buffer + cnt, fmt); - - /* fastest one */ - fmt = (icp_blake3_impl == IMPL_FASTEST) ? "[fastest] " : "fastest "; - cnt += sprintf(buffer + cnt, fmt); - - /* user selected */ - for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { - if (!blake3_impls[i]->is_supported()) continue; - fmt = (icp_blake3_impl == IMPL_USER && - cid == blake3_current_id) ? "[%s] " : "%s "; - cnt += sprintf(buffer + cnt, fmt, blake3_impls[i]->name); - cid++; + generic_impl_init(); + if (req->newptr == NULL) { + const uint32_t impl = IMPL_READ(generic_impl_chosen); + const int init_buflen = 64; + const char *fmt; + struct sbuf *s; + + s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); + + /* cycling */ + fmt = IMPL_FMT(impl, IMPL_CYCLE); + (void) sbuf_printf(s, fmt, "cycle"); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + (void) sbuf_printf(s, fmt, "fastest"); + + /* list all supported implementations */ + for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name); + } + + err = sbuf_finish(s); + sbuf_delete(s); + + return (err); } - buffer[cnt] = 0; + char buf[16]; - return (cnt); + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) { + return (err); + } + + return (-generic_impl_setname(buf)); } +#endif + +#undef IMPL_FMT -module_param_call(icp_blake3_impl, icp_blake3_impl_set, icp_blake3_impl_get, - NULL, 0644); -MODULE_PARM_DESC(icp_blake3_impl, "Select BLAKE3 implementation."); +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, blake3_impl, + blake3_param_set, blake3_param_get, ZMOD_RW, \ + "Select BLAKE3 implementation."); #endif |