diff options
author | Martin Matuska <mm@FreeBSD.org> | 2021-02-16 00:39:34 +0000 |
---|---|---|
committer | Martin Matuska <mm@FreeBSD.org> | 2021-02-16 01:46:28 +0000 |
commit | 184c1b943937986c81e1996d999d21626ec7a4ff (patch) | |
tree | f7321df93d0bd5ffb8cf9245c84745dac7e81ce1 /sys/contrib/openzfs/module | |
parent | 10fc4c3218381fef7189a5b8d46a757cd1989dff (diff) | |
parent | 83dd4a9252fd2044038a399d7afc68259d483b8e (diff) | |
download | src-184c1b943937986c81e1996d999d21626ec7a4ff.tar.gz src-184c1b943937986c81e1996d999d21626ec7a4ff.zip |
zfs: merge OpenZFS master-436ab35a5
- speed up writing to ZFS pools without ZIL devices (aa755b3)
- speed up importing ZFS pools (2d8f72d, a0e0199, cf0977a)
...
MFC after: 2 weeks
Reviewed by: mjg (partial)
Tested by: pho
Differential Revision: https://reviews.freebsd.org/D28677
Diffstat (limited to 'sys/contrib/openzfs/module')
59 files changed, 1276 insertions, 878 deletions
diff --git a/sys/contrib/openzfs/module/Makefile.in b/sys/contrib/openzfs/module/Makefile.in index 0ee2c447221a..69caf48570e9 100644 --- a/sys/contrib/openzfs/module/Makefile.in +++ b/sys/contrib/openzfs/module/Makefile.in @@ -14,7 +14,8 @@ check: modules modules-Linux modules-FreeBSD modules-unknown \ clean clean-Linux clean-FreeBSD \ modules_install modules_install-Linux modules_install-FreeBSD \ - modules_uninstall modules_uninstall-Linux modules_uninstall-FreeBSD + modules_uninstall modules_uninstall-Linux modules_uninstall-FreeBSD \ + cppcheck cppcheck-Linux cppcheck-FreeBSD # Filter out options that FreeBSD make doesn't understand getflags = ( \ @@ -106,6 +107,25 @@ modules_uninstall-FreeBSD: modules_uninstall: modules_uninstall-@ac_system@ +cppcheck-Linux: + @CPPCHECK@ -j@CPU_COUNT@ --std=c99 --quiet --force --error-exitcode=2 \ + --inline-suppr --suppress=noValidConfiguration \ + --enable=warning,information -D_KERNEL \ + --include=@LINUX_OBJ@/include/generated/autoconf.h \ + --include=@top_srcdir@/zfs_config.h \ + --config-exclude=@LINUX_OBJ@/include \ + -I @LINUX_OBJ@/include \ + -I @top_srcdir@/include/os/linux/kernel \ + -I @top_srcdir@/include/os/linux/spl \ + -I @top_srcdir@/include/os/linux/zfs \ + -I @top_srcdir@/include \ + avl icp lua nvpair spl unicode zcommon zfs zstd os/linux + +cppcheck-FreeBSD: + @true + +cppcheck: cppcheck-@ac_system@ + distdir: (cd @srcdir@ && find $(ZFS_MODULES) os -name '*.[chS]') | \ while read path; do \ diff --git a/sys/contrib/openzfs/module/avl/avl.c b/sys/contrib/openzfs/module/avl/avl.c index 48865365d8e3..d0473d883b3d 100644 --- a/sys/contrib/openzfs/module/avl/avl.c +++ b/sys/contrib/openzfs/module/avl/avl.c @@ -492,7 +492,6 @@ avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where) int which_child = AVL_INDEX2CHILD(where); size_t off = tree->avl_offset; - ASSERT(tree); #ifdef _LP64 ASSERT(((uintptr_t)new_data & 0x7) == 0); #endif @@ -680,8 +679,6 @@ avl_remove(avl_tree_t *tree, void *data) int which_child; size_t off = tree->avl_offset; - ASSERT(tree); - delete = AVL_DATA2NODE(data, off); /* diff --git a/sys/contrib/openzfs/module/icp/algs/modes/modes.c b/sys/contrib/openzfs/module/icp/algs/modes/modes.c index faae9722bd04..59743c7d6829 100644 --- a/sys/contrib/openzfs/module/icp/algs/modes/modes.c +++ b/sys/contrib/openzfs/module/icp/algs/modes/modes.c @@ -43,11 +43,11 @@ crypto_init_ptrs(crypto_data_t *out, void **iov_or_mp, offset_t *current_offset) break; case CRYPTO_DATA_UIO: { - uio_t *uiop = out->cd_uio; + zfs_uio_t *uiop = out->cd_uio; uint_t vec_idx; offset = out->cd_offset; - offset = uio_index_at_offset(uiop, offset, &vec_idx); + offset = zfs_uio_index_at_offset(uiop, offset, &vec_idx); *current_offset = offset; *iov_or_mp = (void *)(uintptr_t)vec_idx; @@ -85,7 +85,7 @@ crypto_get_ptrs(crypto_data_t *out, void **iov_or_mp, offset_t *current_offset, } case CRYPTO_DATA_UIO: { - uio_t *uio = out->cd_uio; + zfs_uio_t *uio = out->cd_uio; offset_t offset; uint_t vec_idx; uint8_t *p; @@ -94,7 +94,7 @@ crypto_get_ptrs(crypto_data_t *out, void **iov_or_mp, offset_t *current_offset, offset = *current_offset; vec_idx = (uintptr_t)(*iov_or_mp); - uio_iov_at_index(uio, vec_idx, &iov_base, &iov_len); + zfs_uio_iov_at_index(uio, vec_idx, &iov_base, &iov_len); p = (uint8_t *)iov_base + offset; *out_data_1 = p; @@ -106,10 +106,10 @@ crypto_get_ptrs(crypto_data_t *out, void **iov_or_mp, offset_t *current_offset, } else { /* one block spans two iovecs */ *out_data_1_len = iov_len - offset; - if (vec_idx == uio_iovcnt(uio)) + if (vec_idx == zfs_uio_iovcnt(uio)) return; vec_idx++; - uio_iov_at_index(uio, vec_idx, &iov_base, &iov_len); + zfs_uio_iov_at_index(uio, vec_idx, &iov_base, &iov_len); *out_data_2 = (uint8_t *)iov_base; *current_offset = amt - *out_data_1_len; } diff --git a/sys/contrib/openzfs/module/icp/core/kcf_prov_lib.c b/sys/contrib/openzfs/module/icp/core/kcf_prov_lib.c index 905ef6657336..1b115d976232 100644 --- a/sys/contrib/openzfs/module/icp/core/kcf_prov_lib.c +++ b/sys/contrib/openzfs/module/icp/core/kcf_prov_lib.c @@ -40,7 +40,7 @@ int crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd, void *digest_ctx, void (*update)(void)) { - uio_t *uiop = data->cd_uio; + zfs_uio_t *uiop = data->cd_uio; off_t offset = data->cd_offset; size_t length = len; uint_t vec_idx; @@ -48,7 +48,7 @@ crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd, uchar_t *datap; ASSERT(data->cd_format == CRYPTO_DATA_UIO); - if (uio_segflg(uiop) != UIO_SYSSPACE) { + if (zfs_uio_segflg(uiop) != UIO_SYSSPACE) { return (CRYPTO_ARGUMENTS_BAD); } @@ -56,9 +56,9 @@ crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd, * Jump to the first iovec containing data to be * processed. */ - offset = uio_index_at_offset(uiop, offset, &vec_idx); + offset = zfs_uio_index_at_offset(uiop, offset, &vec_idx); - if (vec_idx == uio_iovcnt(uiop) && length > 0) { + if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) { /* * The caller specified an offset that is larger than * the total size of the buffers it provided. @@ -66,11 +66,11 @@ crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd, return (CRYPTO_DATA_LEN_RANGE); } - while (vec_idx < uio_iovcnt(uiop) && length > 0) { - cur_len = MIN(uio_iovlen(uiop, vec_idx) - + while (vec_idx < zfs_uio_iovcnt(uiop) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(uiop, vec_idx) - offset, length); - datap = (uchar_t *)(uio_iovbase(uiop, vec_idx) + offset); + datap = (uchar_t *)(zfs_uio_iovbase(uiop, vec_idx) + offset); switch (cmd) { case COPY_FROM_DATA: bcopy(datap, buf, cur_len); @@ -97,7 +97,7 @@ crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd, offset = 0; } - if (vec_idx == uio_iovcnt(uiop) && length > 0) { + if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed. @@ -166,7 +166,7 @@ crypto_update_uio(void *ctx, crypto_data_t *input, crypto_data_t *output, void (*copy_block)(uint8_t *, uint64_t *)) { common_ctx_t *common_ctx = ctx; - uio_t *uiop = input->cd_uio; + zfs_uio_t *uiop = input->cd_uio; off_t offset = input->cd_offset; size_t length = input->cd_length; uint_t vec_idx; @@ -178,7 +178,7 @@ crypto_update_uio(void *ctx, crypto_data_t *input, crypto_data_t *output, &common_ctx->cc_iv[0]); } - if (uio_segflg(input->cd_uio) != UIO_SYSSPACE) { + if (zfs_uio_segflg(input->cd_uio) != UIO_SYSSPACE) { return (CRYPTO_ARGUMENTS_BAD); } @@ -186,8 +186,8 @@ crypto_update_uio(void *ctx, crypto_data_t *input, crypto_data_t *output, * Jump to the first iovec containing data to be * processed. */ - offset = uio_index_at_offset(uiop, offset, &vec_idx); - if (vec_idx == uio_iovcnt(uiop) && length > 0) { + offset = zfs_uio_index_at_offset(uiop, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) { /* * The caller specified an offset that is larger than the * total size of the buffers it provided. @@ -198,11 +198,11 @@ crypto_update_uio(void *ctx, crypto_data_t *input, crypto_data_t *output, /* * Now process the iovecs. */ - while (vec_idx < uio_iovcnt(uiop) && length > 0) { - cur_len = MIN(uio_iovlen(uiop, vec_idx) - + while (vec_idx < zfs_uio_iovcnt(uiop) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(uiop, vec_idx) - offset, length); - int rv = (cipher)(ctx, uio_iovbase(uiop, vec_idx) + offset, + int rv = (cipher)(ctx, zfs_uio_iovbase(uiop, vec_idx) + offset, cur_len, output); if (rv != CRYPTO_SUCCESS) { @@ -213,7 +213,7 @@ crypto_update_uio(void *ctx, crypto_data_t *input, crypto_data_t *output, offset = 0; } - if (vec_idx == uio_iovcnt(uiop) && length > 0) { + if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. diff --git a/sys/contrib/openzfs/module/icp/io/sha1_mod.c b/sys/contrib/openzfs/module/icp/io/sha1_mod.c index ffae143cded0..6dcee6b2ecf2 100644 --- a/sys/contrib/openzfs/module/icp/io/sha1_mod.c +++ b/sys/contrib/openzfs/module/icp/io/sha1_mod.c @@ -271,15 +271,15 @@ sha1_digest_update_uio(SHA1_CTX *sha1_ctx, crypto_data_t *data) size_t cur_len; /* we support only kernel buffer */ - if (uio_segflg(data->cd_uio) != UIO_SYSSPACE) + if (zfs_uio_segflg(data->cd_uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* * Jump to the first iovec containing data to be * digested. */ - offset = uio_index_at_offset(data->cd_uio, offset, &vec_idx); - if (vec_idx == uio_iovcnt(data->cd_uio)) { + offset = zfs_uio_index_at_offset(data->cd_uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(data->cd_uio)) { /* * The caller specified an offset that is larger than the * total size of the buffers it provided. @@ -290,12 +290,12 @@ sha1_digest_update_uio(SHA1_CTX *sha1_ctx, crypto_data_t *data) /* * Now do the digesting on the iovecs. */ - while (vec_idx < uio_iovcnt(data->cd_uio) && length > 0) { - cur_len = MIN(uio_iovlen(data->cd_uio, vec_idx) - + while (vec_idx < zfs_uio_iovcnt(data->cd_uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(data->cd_uio, vec_idx) - offset, length); SHA1Update(sha1_ctx, - (uint8_t *)uio_iovbase(data->cd_uio, vec_idx) + offset, + (uint8_t *)zfs_uio_iovbase(data->cd_uio, vec_idx) + offset, cur_len); length -= cur_len; @@ -303,7 +303,7 @@ sha1_digest_update_uio(SHA1_CTX *sha1_ctx, crypto_data_t *data) offset = 0; } - if (vec_idx == uio_iovcnt(data->cd_uio) && length > 0) { + if (vec_idx == zfs_uio_iovcnt(data->cd_uio) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. @@ -330,15 +330,15 @@ sha1_digest_final_uio(SHA1_CTX *sha1_ctx, crypto_data_t *digest, uint_t vec_idx = 0; /* we support only kernel buffer */ - if (uio_segflg(digest->cd_uio) != UIO_SYSSPACE) + if (zfs_uio_segflg(digest->cd_uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* * Jump to the first iovec containing ptr to the digest to * be returned. */ - offset = uio_index_at_offset(digest->cd_uio, offset, &vec_idx); - if (vec_idx == uio_iovcnt(digest->cd_uio)) { + offset = zfs_uio_index_at_offset(digest->cd_uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(digest->cd_uio)) { /* * The caller specified an offset that is * larger than the total size of the buffers @@ -348,7 +348,7 @@ sha1_digest_final_uio(SHA1_CTX *sha1_ctx, crypto_data_t *digest, } if (offset + digest_len <= - uio_iovlen(digest->cd_uio, vec_idx)) { + zfs_uio_iovlen(digest->cd_uio, vec_idx)) { /* * The computed SHA1 digest will fit in the current * iovec. @@ -360,11 +360,11 @@ sha1_digest_final_uio(SHA1_CTX *sha1_ctx, crypto_data_t *digest, * the user only what was requested. */ SHA1Final(digest_scratch, sha1_ctx); - bcopy(digest_scratch, (uchar_t *)uio_iovbase(digest-> - cd_uio, vec_idx) + offset, + bcopy(digest_scratch, (uchar_t *) + zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset, digest_len); } else { - SHA1Final((uchar_t *)uio_iovbase(digest-> + SHA1Final((uchar_t *)zfs_uio_iovbase(digest-> cd_uio, vec_idx) + offset, sha1_ctx); } @@ -382,11 +382,11 @@ sha1_digest_final_uio(SHA1_CTX *sha1_ctx, crypto_data_t *digest, SHA1Final(digest_tmp, sha1_ctx); - while (vec_idx < uio_iovcnt(digest->cd_uio) && length > 0) { - cur_len = MIN(uio_iovlen(digest->cd_uio, vec_idx) - + while (vec_idx < zfs_uio_iovcnt(digest->cd_uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(digest->cd_uio, vec_idx) - offset, length); bcopy(digest_tmp + scratch_offset, - uio_iovbase(digest->cd_uio, vec_idx) + offset, + zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset, cur_len); length -= cur_len; @@ -395,7 +395,7 @@ sha1_digest_final_uio(SHA1_CTX *sha1_ctx, crypto_data_t *digest, offset = 0; } - if (vec_idx == uio_iovcnt(digest->cd_uio) && length > 0) { + if (vec_idx == zfs_uio_iovcnt(digest->cd_uio) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. @@ -1096,12 +1096,12 @@ sha1_mac_verify_atomic(crypto_provider_handle_t provider, size_t cur_len; /* we support only kernel buffer */ - if (uio_segflg(mac->cd_uio) != UIO_SYSSPACE) + if (zfs_uio_segflg(mac->cd_uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* jump to the first iovec containing the expected digest */ - offset = uio_index_at_offset(mac->cd_uio, offset, &vec_idx); - if (vec_idx == uio_iovcnt(mac->cd_uio)) { + offset = zfs_uio_index_at_offset(mac->cd_uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(mac->cd_uio)) { /* * The caller specified an offset that is * larger than the total size of the buffers @@ -1112,12 +1112,12 @@ sha1_mac_verify_atomic(crypto_provider_handle_t provider, } /* do the comparison of computed digest vs specified one */ - while (vec_idx < uio_iovcnt(mac->cd_uio) && length > 0) { - cur_len = MIN(uio_iovlen(mac->cd_uio, vec_idx) - + while (vec_idx < zfs_uio_iovcnt(mac->cd_uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(mac->cd_uio, vec_idx) - offset, length); if (bcmp(digest + scratch_offset, - uio_iovbase(mac->cd_uio, vec_idx) + offset, + zfs_uio_iovbase(mac->cd_uio, vec_idx) + offset, cur_len) != 0) { ret = CRYPTO_INVALID_MAC; break; diff --git a/sys/contrib/openzfs/module/icp/io/sha2_mod.c b/sys/contrib/openzfs/module/icp/io/sha2_mod.c index a4a5c6041dd0..d690cd0bcb05 100644 --- a/sys/contrib/openzfs/module/icp/io/sha2_mod.c +++ b/sys/contrib/openzfs/module/icp/io/sha2_mod.c @@ -296,15 +296,15 @@ sha2_digest_update_uio(SHA2_CTX *sha2_ctx, crypto_data_t *data) size_t cur_len; /* we support only kernel buffer */ - if (uio_segflg(data->cd_uio) != UIO_SYSSPACE) + if (zfs_uio_segflg(data->cd_uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* * Jump to the first iovec containing data to be * digested. */ - offset = uio_index_at_offset(data->cd_uio, offset, &vec_idx); - if (vec_idx == uio_iovcnt(data->cd_uio)) { + offset = zfs_uio_index_at_offset(data->cd_uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(data->cd_uio)) { /* * The caller specified an offset that is larger than the * total size of the buffers it provided. @@ -315,18 +315,18 @@ sha2_digest_update_uio(SHA2_CTX *sha2_ctx, crypto_data_t *data) /* * Now do the digesting on the iovecs. */ - while (vec_idx < uio_iovcnt(data->cd_uio) && length > 0) { - cur_len = MIN(uio_iovlen(data->cd_uio, vec_idx) - + while (vec_idx < zfs_uio_iovcnt(data->cd_uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(data->cd_uio, vec_idx) - offset, length); - SHA2Update(sha2_ctx, (uint8_t *)uio_iovbase(data->cd_uio, + SHA2Update(sha2_ctx, (uint8_t *)zfs_uio_iovbase(data->cd_uio, vec_idx) + offset, cur_len); length -= cur_len; vec_idx++; offset = 0; } - if (vec_idx == uio_iovcnt(data->cd_uio) && length > 0) { + if (vec_idx == zfs_uio_iovcnt(data->cd_uio) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. @@ -353,15 +353,15 @@ sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, uint_t vec_idx = 0; /* we support only kernel buffer */ - if (uio_segflg(digest->cd_uio) != UIO_SYSSPACE) + if (zfs_uio_segflg(digest->cd_uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* * Jump to the first iovec containing ptr to the digest to * be returned. */ - offset = uio_index_at_offset(digest->cd_uio, offset, &vec_idx); - if (vec_idx == uio_iovcnt(digest->cd_uio)) { + offset = zfs_uio_index_at_offset(digest->cd_uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(digest->cd_uio)) { /* * The caller specified an offset that is * larger than the total size of the buffers @@ -371,7 +371,7 @@ sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, } if (offset + digest_len <= - uio_iovlen(digest->cd_uio, vec_idx)) { + zfs_uio_iovlen(digest->cd_uio, vec_idx)) { /* * The computed SHA2 digest will fit in the current * iovec. @@ -387,11 +387,11 @@ sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, */ SHA2Final(digest_scratch, sha2_ctx); - bcopy(digest_scratch, (uchar_t *)uio_iovbase(digest-> - cd_uio, vec_idx) + offset, + bcopy(digest_scratch, (uchar_t *) + zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset, digest_len); } else { - SHA2Final((uchar_t *)uio_iovbase(digest-> + SHA2Final((uchar_t *)zfs_uio_iovbase(digest-> cd_uio, vec_idx) + offset, sha2_ctx); @@ -410,12 +410,12 @@ sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, SHA2Final(digest_tmp, sha2_ctx); - while (vec_idx < uio_iovcnt(digest->cd_uio) && length > 0) { + while (vec_idx < zfs_uio_iovcnt(digest->cd_uio) && length > 0) { cur_len = - MIN(uio_iovlen(digest->cd_uio, vec_idx) - + MIN(zfs_uio_iovlen(digest->cd_uio, vec_idx) - offset, length); bcopy(digest_tmp + scratch_offset, - uio_iovbase(digest->cd_uio, vec_idx) + offset, + zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset, cur_len); length -= cur_len; @@ -424,7 +424,7 @@ sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, offset = 0; } - if (vec_idx == uio_iovcnt(digest->cd_uio) && length > 0) { + if (vec_idx == zfs_uio_iovcnt(digest->cd_uio) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. @@ -1251,12 +1251,12 @@ sha2_mac_verify_atomic(crypto_provider_handle_t provider, size_t cur_len; /* we support only kernel buffer */ - if (uio_segflg(mac->cd_uio) != UIO_SYSSPACE) + if (zfs_uio_segflg(mac->cd_uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* jump to the first iovec containing the expected digest */ - offset = uio_index_at_offset(mac->cd_uio, offset, &vec_idx); - if (vec_idx == uio_iovcnt(mac->cd_uio)) { + offset = zfs_uio_index_at_offset(mac->cd_uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(mac->cd_uio)) { /* * The caller specified an offset that is * larger than the total size of the buffers @@ -1267,12 +1267,12 @@ sha2_mac_verify_atomic(crypto_provider_handle_t provider, } /* do the comparison of computed digest vs specified one */ - while (vec_idx < uio_iovcnt(mac->cd_uio) && length > 0) { - cur_len = MIN(uio_iovlen(mac->cd_uio, vec_idx) - + while (vec_idx < zfs_uio_iovcnt(mac->cd_uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(mac->cd_uio, vec_idx) - offset, length); if (bcmp(digest + scratch_offset, - uio_iovbase(mac->cd_uio, vec_idx) + offset, + zfs_uio_iovbase(mac->cd_uio, vec_idx) + offset, cur_len) != 0) { ret = CRYPTO_INVALID_MAC; break; diff --git a/sys/contrib/openzfs/module/icp/io/skein_mod.c b/sys/contrib/openzfs/module/icp/io/skein_mod.c index 18026807fd84..5ee36af12bcb 100644 --- a/sys/contrib/openzfs/module/icp/io/skein_mod.c +++ b/sys/contrib/openzfs/module/icp/io/skein_mod.c @@ -272,18 +272,18 @@ skein_digest_update_uio(skein_ctx_t *ctx, const crypto_data_t *data) size_t length = data->cd_length; uint_t vec_idx = 0; size_t cur_len; - uio_t *uio = data->cd_uio; + zfs_uio_t *uio = data->cd_uio; /* we support only kernel buffer */ - if (uio_segflg(uio) != UIO_SYSSPACE) + if (zfs_uio_segflg(uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* * Jump to the first iovec containing data to be * digested. */ - offset = uio_index_at_offset(uio, offset, &vec_idx); - if (vec_idx == uio_iovcnt(uio)) { + offset = zfs_uio_index_at_offset(uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(uio)) { /* * The caller specified an offset that is larger than the * total size of the buffers it provided. @@ -294,16 +294,16 @@ skein_digest_update_uio(skein_ctx_t *ctx, const crypto_data_t *data) /* * Now do the digesting on the iovecs. */ - while (vec_idx < uio_iovcnt(uio) && length > 0) { - cur_len = MIN(uio_iovlen(uio, vec_idx) - offset, length); - SKEIN_OP(ctx, Update, (uint8_t *)uio_iovbase(uio, vec_idx) + while (vec_idx < zfs_uio_iovcnt(uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(uio, vec_idx) - offset, length); + SKEIN_OP(ctx, Update, (uint8_t *)zfs_uio_iovbase(uio, vec_idx) + offset, cur_len); length -= cur_len; vec_idx++; offset = 0; } - if (vec_idx == uio_iovcnt(uio) && length > 0) { + if (vec_idx == zfs_uio_iovcnt(uio) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. @@ -322,19 +322,19 @@ static int skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) { - off_t offset = digest->cd_offset; - uint_t vec_idx = 0; - uio_t *uio = digest->cd_uio; + off_t offset = digest->cd_offset; + uint_t vec_idx = 0; + zfs_uio_t *uio = digest->cd_uio; /* we support only kernel buffer */ - if (uio_segflg(uio) != UIO_SYSSPACE) + if (zfs_uio_segflg(uio) != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* * Jump to the first iovec containing ptr to the digest to be returned. */ - offset = uio_index_at_offset(uio, offset, &vec_idx); - if (vec_idx == uio_iovcnt(uio)) { + offset = zfs_uio_index_at_offset(uio, offset, &vec_idx); + if (vec_idx == zfs_uio_iovcnt(uio)) { /* * The caller specified an offset that is larger than the * total size of the buffers it provided. @@ -342,10 +342,10 @@ skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest, return (CRYPTO_DATA_LEN_RANGE); } if (offset + CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen) <= - uio_iovlen(uio, vec_idx)) { + zfs_uio_iovlen(uio, vec_idx)) { /* The computed digest will fit in the current iovec. */ SKEIN_OP(ctx, Final, - (uchar_t *)uio_iovbase(uio, vec_idx) + offset); + (uchar_t *)zfs_uio_iovbase(uio, vec_idx) + offset); } else { uint8_t *digest_tmp; off_t scratch_offset = 0; @@ -357,11 +357,11 @@ skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest, if (digest_tmp == NULL) return (CRYPTO_HOST_MEMORY); SKEIN_OP(ctx, Final, digest_tmp); - while (vec_idx < uio_iovcnt(uio) && length > 0) { - cur_len = MIN(uio_iovlen(uio, vec_idx) - offset, + while (vec_idx < zfs_uio_iovcnt(uio) && length > 0) { + cur_len = MIN(zfs_uio_iovlen(uio, vec_idx) - offset, length); bcopy(digest_tmp + scratch_offset, - uio_iovbase(uio, vec_idx) + offset, cur_len); + zfs_uio_iovbase(uio, vec_idx) + offset, cur_len); length -= cur_len; vec_idx++; @@ -370,7 +370,7 @@ skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest, } kmem_free(digest_tmp, CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen)); - if (vec_idx == uio_iovcnt(uio) && length > 0) { + if (vec_idx == zfs_uio_iovcnt(uio) && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. diff --git a/sys/contrib/openzfs/module/lua/ldebug.c b/sys/contrib/openzfs/module/lua/ldebug.c index 2e1efa4e7250..da005c44376e 100644 --- a/sys/contrib/openzfs/module/lua/ldebug.c +++ b/sys/contrib/openzfs/module/lua/ldebug.c @@ -324,7 +324,6 @@ static void kname (Proto *p, int pc, int c, const char **name) { if (ISK(c)) { /* is 'c' a constant? */ TValue *kvalue = &p->k[INDEXK(c)]; if (ttisstring(kvalue)) { /* literal constant? */ - // cppcheck-suppress autoVariables *name = svalue(kvalue); /* it is its own name */ return; } diff --git a/sys/contrib/openzfs/module/lua/ldo.c b/sys/contrib/openzfs/module/lua/ldo.c index 474fe659bcef..f3c3dcb4d81a 100644 --- a/sys/contrib/openzfs/module/lua/ldo.c +++ b/sys/contrib/openzfs/module/lua/ldo.c @@ -196,7 +196,6 @@ int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud) { struct lua_longjmp lj; lj.status = LUA_OK; lj.previous = L->errorJmp; /* chain new error handler */ - // cppcheck-suppress autoVariables L->errorJmp = &lj; LUAI_TRY(L, &lj, (*f)(L, ud); diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c index c6b610394718..f5f3524f7b9d 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c @@ -43,31 +43,32 @@ #include <sys/param.h> #include <sys/uio.h> #include <sys/vnode.h> +#include <sys/zfs_znode.h> /* - * same as uiomove() but doesn't modify uio structure. + * same as zfs_uiomove() but doesn't modify uio structure. * return in cbytes how many bytes were copied. */ int -uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes) +zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes) { struct iovec small_iovec[1]; struct uio small_uio_clone; struct uio *uio_clone; int error; - ASSERT3U(uio->uio_rw, ==, rw); - if (uio->uio_iovcnt == 1) { - small_uio_clone = *uio; - small_iovec[0] = *uio->uio_iov; + ASSERT3U(zfs_uio_rw(uio), ==, rw); + if (zfs_uio_iovcnt(uio) == 1) { + small_uio_clone = *(GET_UIO_STRUCT(uio)); + small_iovec[0] = *(GET_UIO_STRUCT(uio)->uio_iov); small_uio_clone.uio_iov = small_iovec; uio_clone = &small_uio_clone; } else { - uio_clone = cloneuio(uio); + uio_clone = cloneuio(GET_UIO_STRUCT(uio)); } error = vn_io_fault_uiomove(p, n, uio_clone); - *cbytes = uio->uio_resid - uio_clone->uio_resid; + *cbytes = zfs_uio_resid(uio) - uio_clone->uio_resid; if (uio_clone != &small_uio_clone) free(uio_clone, M_IOV); return (error); @@ -77,16 +78,23 @@ uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes) * Drop the next n chars out of *uiop. */ void -uioskip(uio_t *uio, size_t n) +zfs_uioskip(zfs_uio_t *uio, size_t n) { - enum uio_seg segflg; + zfs_uio_seg_t segflg; /* For the full compatibility with illumos. */ - if (n > uio->uio_resid) + if (n > zfs_uio_resid(uio)) return; - segflg = uio->uio_segflg; - uio->uio_segflg = UIO_NOCOPY; - uiomove(NULL, n, uio->uio_rw, uio); - uio->uio_segflg = segflg; + segflg = zfs_uio_segflg(uio); + zfs_uio_segflg(uio) = UIO_NOCOPY; + zfs_uiomove(NULL, n, zfs_uio_rw(uio), uio); + zfs_uio_segflg(uio) = segflg; +} + +int +zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio) +{ + ASSERT(zfs_uio_rw(uio) == dir); + return (vn_io_fault_uiomove(p, n, GET_UIO_STRUCT(uio))); } diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c index 9e16c0029087..09c8401267df 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c @@ -240,7 +240,9 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, #endif VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; +#ifdef VIRF_MOUNTPOINT vn_irflag_set_locked(vp, VIRF_MOUNTPOINT); +#endif vp->v_mountedhere = mp; VI_UNLOCK(vp); /* Put the new filesystem on the mount list. */ diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c index 0a323e8856a3..ab82b2aaeb78 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c @@ -202,7 +202,7 @@ abd_free_chunks(abd_t *abd) } abd_t * -abd_alloc_struct(size_t size) +abd_alloc_struct_impl(size_t size) { uint_t chunkcnt = abd_chunkcnt_for_bytes(size); /* @@ -216,22 +216,18 @@ abd_alloc_struct(size_t size) offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt])); abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE); ASSERT3P(abd, !=, NULL); - list_link_init(&abd->abd_gang_link); - mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL); ABDSTAT_INCR(abdstat_struct_size, abd_size); return (abd); } void -abd_free_struct(abd_t *abd) +abd_free_struct_impl(abd_t *abd) { uint_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 : abd_scatter_chunkcnt(abd); ssize_t size = MAX(sizeof (abd_t), offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt])); - mutex_destroy(&abd->abd_mtx); - ASSERT(!list_link_active(&abd->abd_gang_link)); kmem_free(abd, size); ABDSTAT_INCR(abdstat_struct_size, -size); } @@ -249,10 +245,8 @@ abd_alloc_zero_scatter(void) abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP); abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); - abd_zero_scatter->abd_flags = ABD_FLAG_OWNER | ABD_FLAG_ZEROS; + abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS; abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; - abd_zero_scatter->abd_parent = NULL; - zfs_refcount_create(&abd_zero_scatter->abd_children); ABD_SCATTER(abd_zero_scatter).abd_offset = 0; ABD_SCATTER(abd_zero_scatter).abd_chunk_size = @@ -270,7 +264,6 @@ abd_alloc_zero_scatter(void) static void abd_free_zero_scatter(void) { - zfs_refcount_destroy(&abd_zero_scatter->abd_children); ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, -(int)zfs_abd_chunk_size); @@ -355,10 +348,8 @@ abd_alloc_scatter_offset_chunkcnt(size_t chunkcnt) } abd_t * -abd_get_offset_scatter(abd_t *sabd, size_t off) +abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off) { - abd_t *abd = NULL; - abd_verify(sabd); ASSERT3U(off, <=, sabd->abd_size); @@ -366,14 +357,24 @@ abd_get_offset_scatter(abd_t *sabd, size_t off) uint_t chunkcnt = abd_scatter_chunkcnt(sabd) - (new_offset / zfs_abd_chunk_size); - abd = abd_alloc_scatter_offset_chunkcnt(chunkcnt); + /* + * If an abd struct is provided, it is only the minimum size. If we + * need additional chunks, we need to allocate a new struct. + */ + if (abd != NULL && + offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]) > + sizeof (abd_t)) { + abd = NULL; + } + + if (abd == NULL) + abd = abd_alloc_struct(chunkcnt * zfs_abd_chunk_size); /* * Even if this buf is filesystem metadata, we only track that * if we own the underlying data buffer, which is not true in * this case. Therefore, we don't ever use ABD_FLAG_META here. */ - abd->abd_flags = 0; ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size; ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c index b86ffc59a21d..fbf998416234 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c @@ -197,7 +197,7 @@ static void freebsd_crypt_uio_debug_log(boolean_t encrypt, freebsd_crypt_session_t *input_sessionp, struct zio_crypt_info *c_info, - uio_t *data_uio, + zfs_uio_t *data_uio, crypto_key_t *key, uint8_t *ivbuf, size_t datalen, @@ -222,13 +222,13 @@ freebsd_crypt_uio_debug_log(boolean_t encrypt, printf("%02x ", b[i]); } printf("}\n"); - for (int i = 0; i < data_uio->uio_iovcnt; i++) { + for (int i = 0; i < zfs_uio_iovcnt(data_uio); i++) { printf("\tiovec #%d: <%p, %u>\n", i, - data_uio->uio_iov[i].iov_base, - (unsigned int)data_uio->uio_iov[i].iov_len); - total += data_uio->uio_iov[i].iov_len; + zfs_uio_iovbase(data_uio, i), + (unsigned int)zfs_uio_iovlen(data_uio, i)); + total += zfs_uio_iovlen(data_uio, i); } - data_uio->uio_resid = total; + zfs_uio_resid(data_uio) = total; #endif } /* @@ -310,7 +310,7 @@ int freebsd_crypt_uio(boolean_t encrypt, freebsd_crypt_session_t *input_sessionp, struct zio_crypt_info *c_info, - uio_t *data_uio, + zfs_uio_t *data_uio, crypto_key_t *key, uint8_t *ivbuf, size_t datalen, @@ -323,9 +323,9 @@ freebsd_crypt_uio(boolean_t encrypt, freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio, key, ivbuf, datalen, auth_len); - for (int i = 0; i < data_uio->uio_iovcnt; i++) - total += data_uio->uio_iov[i].iov_len; - data_uio->uio_resid = total; + for (int i = 0; i < zfs_uio_iovcnt(data_uio); i++) + total += zfs_uio_iovlen(data_uio, i); + zfs_uio_resid(data_uio) = total; if (input_sessionp == NULL) { session = kmem_zalloc(sizeof (*session), KM_SLEEP); error = freebsd_crypt_newsession(session, c_info, key); @@ -343,7 +343,7 @@ freebsd_crypt_uio(boolean_t encrypt, CRYPTO_OP_VERIFY_DIGEST; } crp->crp_flags = CRYPTO_F_CBIFSYNC | CRYPTO_F_IV_SEPARATE; - crypto_use_uio(crp, data_uio); + crypto_use_uio(crp, GET_UIO_STRUCT(data_uio)); crp->crp_aad_start = 0; crp->crp_aad_length = auth_len; @@ -480,7 +480,7 @@ int freebsd_crypt_uio(boolean_t encrypt, freebsd_crypt_session_t *input_sessionp, struct zio_crypt_info *c_info, - uio_t *data_uio, + zfs_uio_t *data_uio, crypto_key_t *key, uint8_t *ivbuf, size_t datalen, @@ -564,7 +564,7 @@ freebsd_crypt_uio(boolean_t encrypt, crp->crp_session = session->fs_sid; crp->crp_ilen = auth_len + datalen; - crp->crp_buf = (void*)data_uio; + crp->crp_buf = (void*)GET_UIO_STRUCT(data_uio); crp->crp_flags = CRYPTO_F_IOV | CRYPTO_F_CBIFSYNC; auth_desc->crd_skip = 0; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c index 6901f1ca915a..f472aecdbafb 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c @@ -251,7 +251,7 @@ sfs_reclaim_vnode(vnode_t *vp) static int sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap, - uio_t *uio, off_t *offp) + zfs_uio_t *uio, off_t *offp) { struct dirent entry; int error; @@ -260,26 +260,26 @@ sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap, if (ap->a_ncookies != NULL) *ap->a_ncookies = 0; - if (uio->uio_resid < sizeof (entry)) + if (zfs_uio_resid(uio) < sizeof (entry)) return (SET_ERROR(EINVAL)); - if (uio->uio_offset < 0) + if (zfs_uio_offset(uio) < 0) return (SET_ERROR(EINVAL)); - if (uio->uio_offset == 0) { + if (zfs_uio_offset(uio) == 0) { entry.d_fileno = id; entry.d_type = DT_DIR; entry.d_name[0] = '.'; entry.d_name[1] = '\0'; entry.d_namlen = 1; entry.d_reclen = sizeof (entry); - error = vfs_read_dirent(ap, &entry, uio->uio_offset); + error = vfs_read_dirent(ap, &entry, zfs_uio_offset(uio)); if (error != 0) return (SET_ERROR(error)); } - if (uio->uio_offset < sizeof (entry)) + if (zfs_uio_offset(uio) < sizeof (entry)) return (SET_ERROR(EINVAL)); - if (uio->uio_offset == sizeof (entry)) { + if (zfs_uio_offset(uio) == sizeof (entry)) { entry.d_fileno = parent_id; entry.d_type = DT_DIR; entry.d_name[0] = '.'; @@ -287,7 +287,7 @@ sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap, entry.d_name[2] = '\0'; entry.d_namlen = 2; entry.d_reclen = sizeof (entry); - error = vfs_read_dirent(ap, &entry, uio->uio_offset); + error = vfs_read_dirent(ap, &entry, zfs_uio_offset(uio)); if (error != 0) return (SET_ERROR(error)); } @@ -666,21 +666,23 @@ zfsctl_root_readdir(struct vop_readdir_args *ap) vnode_t *vp = ap->a_vp; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; zfsctl_root_t *node = vp->v_data; - uio_t *uio = ap->a_uio; + zfs_uio_t uio; int *eofp = ap->a_eofflag; off_t dots_offset; int error; + zfs_uio_init(&uio, ap->a_uio); + ASSERT(vp->v_type == VDIR); - error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio, + error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, &uio, &dots_offset); if (error != 0) { if (error == ENAMETOOLONG) /* ran out of destination space */ error = 0; return (error); } - if (uio->uio_offset != dots_offset) + if (zfs_uio_offset(&uio) != dots_offset) return (SET_ERROR(EINVAL)); CTASSERT(sizeof (node->snapdir->sn_name) <= sizeof (entry.d_name)); @@ -689,7 +691,7 @@ zfsctl_root_readdir(struct vop_readdir_args *ap) strcpy(entry.d_name, node->snapdir->sn_name); entry.d_namlen = strlen(entry.d_name); entry.d_reclen = sizeof (entry); - error = vfs_read_dirent(ap, &entry, uio->uio_offset); + error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio)); if (error != 0) { if (error == ENAMETOOLONG) error = 0; @@ -1030,15 +1032,17 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap) struct dirent entry; vnode_t *vp = ap->a_vp; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - uio_t *uio = ap->a_uio; + zfs_uio_t uio; int *eofp = ap->a_eofflag; off_t dots_offset; int error; + zfs_uio_init(&uio, ap->a_uio); + ASSERT(vp->v_type == VDIR); - error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio, - &dots_offset); + error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, + &uio, &dots_offset); if (error != 0) { if (error == ENAMETOOLONG) /* ran out of destination space */ error = 0; @@ -1050,7 +1054,7 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap) uint64_t cookie; uint64_t id; - cookie = uio->uio_offset - dots_offset; + cookie = zfs_uio_offset(&uio) - dots_offset; dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname), @@ -1071,14 +1075,14 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap) strcpy(entry.d_name, snapname); entry.d_namlen = strlen(entry.d_name); entry.d_reclen = sizeof (entry); - error = vfs_read_dirent(ap, &entry, uio->uio_offset); + error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio)); if (error != 0) { if (error == ENAMETOOLONG) error = 0; ZFS_EXIT(zfsvfs); return (SET_ERROR(error)); } - uio->uio_offset = cookie + dots_offset; + zfs_uio_setoffset(&uio, cookie + dots_offset); } /* NOTREACHED */ } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c index 8fb259f4ba76..06546c12e420 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c @@ -290,7 +290,7 @@ zfs_file_private(zfs_file_t *fp) int zfs_file_unlink(const char *fnamep) { - enum uio_seg seg = UIO_SYSSPACE; + zfs_uio_seg_t seg = UIO_SYSSPACE; int rc; #if __FreeBSD_version >= 1300018 diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index 42f5786ce5c7..d5f0da9ecd4b 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -518,7 +518,7 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os) * in one single dmu_read() call. */ int -mappedread_sf(znode_t *zp, int nbytes, uio_t *uio) +mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio) { vnode_t *vp = ZTOV(zp); objset_t *os = zp->z_zfsvfs->z_os; @@ -530,14 +530,14 @@ mappedread_sf(znode_t *zp, int nbytes, uio_t *uio) int len = nbytes; int error = 0; - ASSERT(uio->uio_segflg == UIO_NOCOPY); + ASSERT(zfs_uio_segflg(uio) == UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); - ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); + ASSERT((zfs_uio_offset(uio) & PAGEOFFSET) == 0); zfs_vmobject_wlock_12(obj); - for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { + for (start = zfs_uio_offset(uio); len > 0; start += PAGESIZE) { int bytes = MIN(PAGESIZE, len); pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start), @@ -584,8 +584,7 @@ mappedread_sf(znode_t *zp, int nbytes, uio_t *uio) } if (error) break; - uio->uio_resid -= bytes; - uio->uio_offset += bytes; + zfs_uio_advance(uio, bytes); len -= bytes; } zfs_vmobject_wunlock_12(obj); @@ -603,7 +602,7 @@ mappedread_sf(znode_t *zp, int nbytes, uio_t *uio) * the file is memory mapped. */ int -mappedread(znode_t *zp, int nbytes, uio_t *uio) +mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { vnode_t *vp = ZTOV(zp); vm_object_t obj; @@ -616,7 +615,7 @@ mappedread(znode_t *zp, int nbytes, uio_t *uio) obj = vp->v_object; ASSERT(obj != NULL); - start = uio->uio_loffset; + start = zfs_uio_offset(uio); off = start & PAGEOFFSET; zfs_vmobject_wlock_12(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { @@ -629,7 +628,8 @@ mappedread(znode_t *zp, int nbytes, uio_t *uio) zfs_vmobject_wunlock_12(obj); va = zfs_map_page(pp, &sf); - error = vn_io_fault_uiomove(va + off, bytes, uio); + error = vn_io_fault_uiomove(va + off, bytes, + GET_UIO_STRUCT(uio)); zfs_unmap_page(sf); zfs_vmobject_wlock_12(obj); page_unhold(pp); @@ -1649,7 +1649,7 @@ zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags) */ /* ARGSUSED */ static int -zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, +zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, int *ncookies, ulong_t **cookies) { znode_t *zp = VTOZ(vp); @@ -1694,7 +1694,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, /* * Check for valid iov_len. */ - if (uio->uio_iov->iov_len <= 0) { + if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } @@ -1709,7 +1709,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, error = 0; os = zfsvfs->z_os; - offset = uio->uio_loffset; + offset = zfs_uio_offset(uio); prefetch = zp->z_zn_prefetch; /* @@ -1730,9 +1730,9 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, /* * Get space to change directory entries into fs independent format. */ - iovp = uio->uio_iov; + iovp = GET_UIO_STRUCT(uio)->uio_iov; bytes_wanted = iovp->iov_len; - if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { + if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; @@ -1747,7 +1747,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, /* * Minimum entry size is dirent size and 1 byte for a file name. */ - ncooks = uio->uio_resid / (sizeof (struct dirent) - + ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) - sizeof (((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof (ulong_t), M_TEMP, M_WAITOK); *cookies = cooks; @@ -1927,20 +1927,21 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, if (ncookies != NULL) *ncookies -= ncooks; - if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { + if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; - uio->uio_resid -= outcount; - } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ, uio))) { + zfs_uio_resid(uio) -= outcount; + } else if ((error = + zfs_uiomove(outbuf, (long)outcount, UIO_READ, uio))) { /* * Reset the pointer. */ - offset = uio->uio_loffset; + offset = zfs_uio_offset(uio); } update: zap_cursor_fini(&zc); - if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) + if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) @@ -1948,7 +1949,7 @@ update: ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - uio->uio_loffset = offset; + zfs_uio_setoffset(uio, offset); ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); @@ -3631,7 +3632,7 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, */ /* ARGSUSED */ static int -zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) +zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; @@ -4414,8 +4415,9 @@ struct vop_read_args { static int zfs_freebsd_read(struct vop_read_args *ap) { - - return (zfs_read(VTOZ(ap->a_vp), ap->a_uio, ioflags(ap->a_ioflag), + zfs_uio_t uio; + zfs_uio_init(&uio, ap->a_uio); + return (zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), ap->a_cred)); } @@ -4431,8 +4433,9 @@ struct vop_write_args { static int zfs_freebsd_write(struct vop_write_args *ap) { - - return (zfs_write(VTOZ(ap->a_vp), ap->a_uio, ioflags(ap->a_ioflag), + zfs_uio_t uio; + zfs_uio_init(&uio, ap->a_uio); + return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), ap->a_cred)); } @@ -4704,8 +4707,9 @@ struct vop_readdir_args { static int zfs_freebsd_readdir(struct vop_readdir_args *ap) { - - return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, + zfs_uio_t uio; + zfs_uio_init(&uio, ap->a_uio); + return (zfs_readdir(ap->a_vp, &uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } @@ -5008,26 +5012,27 @@ struct vop_readlink_args { static int zfs_freebsd_readlink(struct vop_readlink_args *ap) { + zfs_uio_t uio; znode_t *zp = VTOZ(ap->a_vp); - struct uio *auio; char *symlink, *base; size_t symlink_len; int error; bool trycache; - auio = ap->a_uio; + zfs_uio_init(&uio, ap->a_uio); trycache = false; - if (auio->uio_segflg == UIO_SYSSPACE && auio->uio_iovcnt == 1) { - base = auio->uio_iov->iov_base; - symlink_len = auio->uio_iov->iov_len; + if (zfs_uio_segflg(&uio) == UIO_SYSSPACE && + zfs_uio_iovcnt(&uio) == 1) { + base = zfs_uio_iovbase(&uio, 0); + symlink_len = zfs_uio_iovlen(&uio, 0); trycache = true; } - error = zfs_readlink(ap->a_vp, auio, ap->a_cred, NULL); + error = zfs_readlink(ap->a_vp, &uio, ap->a_cred, NULL); if (atomic_load_ptr(&zp->z_cached_symlink) != NULL || error != 0 || !trycache) { return (error); } - symlink_len -= auio->uio_resid; + symlink_len -= zfs_uio_resid(&uio); symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK); if (symlink != NULL) { memcpy(symlink, base, symlink_len); @@ -5504,11 +5509,14 @@ zfs_listextattr(struct vop_listextattr_args *ap) uint8_t dirbuf[sizeof (struct dirent)]; struct dirent *dp; struct iovec aiov; - struct uio auio, *uio = ap->a_uio; + struct uio auio; size_t *sizep = ap->a_size; size_t plen; vnode_t *xvp = NULL, *vp; int done, error, eof, pos; + zfs_uio_t uio; + + zfs_uio_init(&uio, ap->a_uio); /* * If the xattr property is off, refuse the request. @@ -5590,15 +5598,16 @@ zfs_listextattr(struct vop_listextattr_args *ap) nlen = dp->d_namlen - plen; if (sizep != NULL) *sizep += 1 + nlen; - else if (uio != NULL) { + else if (GET_UIO_STRUCT(&uio) != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. */ - error = uiomove(&nlen, 1, uio->uio_rw, uio); + error = zfs_uiomove(&nlen, 1, zfs_uio_rw(&uio), + &uio); if (error == 0) { - error = uiomove(dp->d_name + plen, nlen, - uio->uio_rw, uio); + error = zfs_uiomove(dp->d_name + plen, + nlen, zfs_uio_rw(&uio), &uio); } if (error != 0) break; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c index d9f2635b0129..0491b2ff3e28 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c @@ -1912,8 +1912,10 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, size_t complen; int is_xattrdir; - if (prevdb) + if (prevdb) { + ASSERT(prevhdl != NULL); zfs_release_sa_handle(prevhdl, prevdb, FTAG); + } if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) @@ -2020,7 +2022,7 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, void -zfs_inode_update(znode_t *zp) +zfs_znode_update_vfs(znode_t *zp) { vm_object_t object; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c index fd2beee7bdd2..9fe678d2574f 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c @@ -404,7 +404,7 @@ int failed_decrypt_size; static int zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess, uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen, - uio_t *uio, uint_t auth_len) + zfs_uio_t *uio, uint_t auth_len) { zio_crypt_info_t *ci; int ret; @@ -439,7 +439,8 @@ zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, * input and output. Also, the AAD (for AES-GMC at least) * needs to logically go in front. */ - uio_t cuio; + zfs_uio_t cuio; + struct uio cuio_s; iovec_t iovecs[4]; uint64_t crypt = key->zk_crypt; uint_t enc_len, keydata_len, aad_len; @@ -447,6 +448,8 @@ zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + zfs_uio_init(&cuio, &cuio_s); + keydata_len = zio_crypt_table[crypt].ci_keylen; /* generate iv for wrapping the master and hmac key */ @@ -489,9 +492,9 @@ zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, iovecs[0].iov_len = aad_len; enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; - cuio.uio_iov = iovecs; - cuio.uio_iovcnt = 4; - cuio.uio_segflg = UIO_SYSSPACE; + GET_UIO_STRUCT(&cuio)->uio_iov = iovecs; + zfs_uio_iovcnt(&cuio) = 4; + zfs_uio_segflg(&cuio) = UIO_SYSSPACE; /* encrypt the keys and store the resulting ciphertext and mac */ ret = zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey, @@ -517,7 +520,8 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, * input and output. Also, the AAD (for AES-GMC at least) * needs to logically go in front. */ - uio_t cuio; + zfs_uio_t cuio; + struct uio cuio_s; iovec_t iovecs[4]; void *src, *dst; uint_t enc_len, keydata_len, aad_len; @@ -528,6 +532,8 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, keydata_len = zio_crypt_table[crypt].ci_keylen; rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + zfs_uio_init(&cuio, &cuio_s); + /* * Since we only support one buffer, we need to copy * the encrypted buffer (source) to the plain buffer @@ -565,9 +571,9 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, iovecs[0].iov_base = aad; iovecs[0].iov_len = aad_len; - cuio.uio_iov = iovecs; - cuio.uio_iovcnt = 4; - cuio.uio_segflg = UIO_SYSSPACE; + GET_UIO_STRUCT(&cuio)->uio_iov = iovecs; + zfs_uio_iovcnt(&cuio) = 4; + zfs_uio_segflg(&cuio) = UIO_SYSSPACE; /* decrypt the keys and store the result in the output buffers */ ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey, @@ -1150,10 +1156,11 @@ error: } static void -zio_crypt_destroy_uio(uio_t *uio) +zio_crypt_destroy_uio(zfs_uio_t *uio) { - if (uio->uio_iov) - kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); + if (GET_UIO_STRUCT(uio)->uio_iov) + kmem_free(GET_UIO_STRUCT(uio)->uio_iov, + zfs_uio_iovcnt(uio) * sizeof (iovec_t)); } /* @@ -1247,14 +1254,14 @@ zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, * accommodate some of the drivers, the authbuf needs to be logically before * the data. This means that we need to copy the source to the destination, * and set up an extra iovec_t at the beginning to handle the authbuf. - * It also means we'll only return one uio_t. + * It also means we'll only return one zfs_uio_t. */ /* ARGSUSED */ static int zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, - uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio, - uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, + uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, + zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { uint8_t *aadbuf = zio_buf_alloc(datalen); @@ -1398,8 +1405,8 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; - out_uio->uio_iov = dst_iovecs; - out_uio->uio_iovcnt = nr_iovecs; + GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs; + zfs_uio_iovcnt(out_uio) = nr_iovecs; return (0); } @@ -1410,7 +1417,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, static int zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, - uio_t *puio, uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, + zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { uint8_t *aadbuf = zio_buf_alloc(datalen); @@ -1547,8 +1554,8 @@ zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; - out_uio->uio_iov = dst_iovecs; - out_uio->uio_iovcnt = nr_iovecs; + GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs; + zfs_uio_iovcnt(out_uio) = nr_iovecs; return (0); } @@ -1556,7 +1563,7 @@ zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, /* ARGSUSED */ static int zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, - uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *out_uio, + uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len) { int ret; @@ -1584,8 +1591,8 @@ zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, cipher_iovecs[0].iov_len = datalen; *enc_len = datalen; - out_uio->uio_iov = cipher_iovecs; - out_uio->uio_iovcnt = nr_cipher; + GET_UIO_STRUCT(out_uio)->uio_iov = cipher_iovecs; + zfs_uio_iovcnt(out_uio) = nr_cipher; return (0); @@ -1596,8 +1603,8 @@ error: kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); *enc_len = 0; - out_uio->uio_iov = NULL; - out_uio->uio_iovcnt = 0; + GET_UIO_STRUCT(out_uio)->uio_iov = NULL; + zfs_uio_iovcnt(out_uio) = 0; return (ret); } @@ -1613,8 +1620,8 @@ error: static int zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, - uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, - uint_t *auth_len, boolean_t *no_crypt) + uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, + uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; iovec_t *mac_iov; @@ -1646,9 +1653,11 @@ zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, goto error; /* populate the uios */ - cuio->uio_segflg = UIO_SYSSPACE; + zfs_uio_segflg(cuio) = UIO_SYSSPACE; - mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); + mac_iov = + ((iovec_t *)&(GET_UIO_STRUCT(cuio)-> + uio_iov[zfs_uio_iovcnt(cuio) - 1])); mac_iov->iov_base = (void *)mac; mac_iov->iov_len = ZIO_DATA_MAC_LEN; @@ -1675,14 +1684,18 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, uint64_t crypt = key->zk_crypt; uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; uint_t enc_len, auth_len; - uio_t puio, cuio; + zfs_uio_t puio, cuio; + struct uio puio_s, cuio_s; uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; crypto_key_t tmp_ckey, *ckey = NULL; freebsd_crypt_session_t *tmpl = NULL; uint8_t *authbuf = NULL; - bzero(&puio, sizeof (uio_t)); - bzero(&cuio, sizeof (uio_t)); + + zfs_uio_init(&puio, &puio_s); + zfs_uio_init(&cuio, &cuio_s); + bzero(GET_UIO_STRUCT(&puio), sizeof (struct uio)); + bzero(GET_UIO_STRUCT(&cuio), sizeof (struct uio)); #ifdef FCRYPTO_DEBUG printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n", diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 6c44e3681709..2389b1a06355 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -746,12 +746,15 @@ out: */ static int -zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag) +zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) { zvol_state_t *zv; uint64_t volsize; zfs_locked_range_t *lr; int error = 0; + zfs_uio_t uio; + + zfs_uio_init(&uio, uio_s); zv = dev->si_drv2; @@ -760,20 +763,20 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag) * uio_loffset == volsize isn't an error as * its required for EOF processing. */ - if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) + if (zfs_uio_resid(&uio) > 0 && + (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) return (SET_ERROR(EIO)); - lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, - uio->uio_resid, RL_READER); - while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); + lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), + zfs_uio_resid(&uio), RL_READER); + while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) { + uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1); /* don't read past the end */ - if (bytes > volsize - uio->uio_loffset) - bytes = volsize - uio->uio_loffset; + if (bytes > volsize - zfs_uio_offset(&uio)) + bytes = volsize - zfs_uio_offset(&uio); - error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes); + error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) @@ -787,20 +790,23 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag) } static int -zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag) +zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag) { zvol_state_t *zv; uint64_t volsize; zfs_locked_range_t *lr; int error = 0; boolean_t sync; + zfs_uio_t uio; zv = dev->si_drv2; volsize = zv->zv_volsize; - if (uio->uio_resid > 0 && - (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) + zfs_uio_init(&uio, uio_s); + + if (zfs_uio_resid(&uio) > 0 && + (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) return (SET_ERROR(EIO)); sync = (ioflag & IO_SYNC) || @@ -809,11 +815,11 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag) rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); zvol_ensure_zilog(zv); - lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, - uio->uio_resid, RL_WRITER); - while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { - uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); - uint64_t off = uio->uio_loffset; + lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), + zfs_uio_resid(&uio), RL_WRITER); + while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) { + uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1); + uint64_t off = zfs_uio_offset(&uio); dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ @@ -825,7 +831,7 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag) dmu_tx_abort(tx); break; } - error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx); + error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); if (error == 0) zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c index 1da7618185ec..36fdff72a133 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c @@ -284,9 +284,7 @@ int64_t __divdi3(int64_t u, int64_t v) { int64_t q, t; - // cppcheck-suppress shiftTooManyBitsSigned q = __udivdi3(abs64(u), abs64(v)); - // cppcheck-suppress shiftTooManyBitsSigned t = (u ^ v) >> 63; // If u, v have different return ((q ^ t) - t); // signs, negate q. } diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c index dbbf72c8569d..c7f1aadf784e 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c @@ -486,7 +486,7 @@ proc_kstat_open(struct inode *inode, struct file *filp) f = filp->private_data; f->private = PDE_DATA(inode); - return (rc); + return (0); } static ssize_t diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c index e8d89bfeabe5..61631256c858 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c @@ -274,8 +274,6 @@ taskq_lowest_id(taskq_t *tq) taskq_ent_t *t; taskq_thread_t *tqt; - ASSERT(tq); - if (!list_empty(&tq->tq_pend_list)) { t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list); lowest_id = MIN(lowest_id, t->tqent_id); @@ -995,6 +993,7 @@ error: spin_unlock_irqrestore(&tq->tq_lock, flags); tsd_set(taskq_tsd, NULL); + thread_exit(); return (0); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index 0abac228447f..d82e5f4dcf15 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -185,7 +185,7 @@ abd_chunkcnt_for_bytes(size_t size) } abd_t * -abd_alloc_struct(size_t size) +abd_alloc_struct_impl(size_t size) { /* * In Linux we do not use the size passed in during ABD @@ -193,18 +193,14 @@ abd_alloc_struct(size_t size) */ abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); ASSERT3P(abd, !=, NULL); - list_link_init(&abd->abd_gang_link); - mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL); ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); return (abd); } void -abd_free_struct(abd_t *abd) +abd_free_struct_impl(abd_t *abd) { - mutex_destroy(&abd->abd_mtx); - ASSERT(!list_link_active(&abd->abd_gang_link)); kmem_cache_free(abd_cache, abd); ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); } @@ -472,14 +468,12 @@ abd_alloc_zero_scatter(void) ASSERT3U(table.nents, ==, nr_pages); abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); - abd_zero_scatter->abd_flags = ABD_FLAG_OWNER; + abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; ABD_SCATTER(abd_zero_scatter).abd_offset = 0; ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl; ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; - abd_zero_scatter->abd_parent = NULL; abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS; - zfs_refcount_create(&abd_zero_scatter->abd_children); abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { sg_set_page(sg, abd_zero_page, PAGESIZE, 0); @@ -599,12 +593,11 @@ abd_alloc_zero_scatter(void) abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); memset(abd_zero_page, 0, PAGESIZE); abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); - abd_zero_scatter->abd_flags = ABD_FLAG_OWNER; + abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS; ABD_SCATTER(abd_zero_scatter).abd_offset = 0; ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; - abd_zero_scatter->abd_parent = NULL; zfs_refcount_create(&abd_zero_scatter->abd_children); ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages * sizeof (struct scatterlist), KM_SLEEP); @@ -678,7 +671,6 @@ abd_verify_scatter(abd_t *abd) static void abd_free_zero_scatter(void) { - zfs_refcount_destroy(&abd_zero_scatter->abd_children); ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE); ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); @@ -747,9 +739,7 @@ abd_free_linear_page(abd_t *abd) ABD_SCATTER(abd).abd_sgl = sg; abd_free_chunks(abd); - zfs_refcount_destroy(&abd->abd_children); abd_update_scatter_stats(abd, ABDSTAT_DECR); - abd_free_struct(abd); } /* @@ -770,9 +760,8 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata) } abd_t * -abd_get_offset_scatter(abd_t *sabd, size_t off) +abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off) { - abd_t *abd = NULL; int i = 0; struct scatterlist *sg = NULL; @@ -781,14 +770,14 @@ abd_get_offset_scatter(abd_t *sabd, size_t off) size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; - abd = abd_alloc_struct(0); + if (abd == NULL) + abd = abd_alloc_struct(0); /* * Even if this buf is filesystem metadata, we only track that * if we own the underlying data buffer, which is not true in * this case. Therefore, we don't ever use ABD_FLAG_META here. */ - abd->abd_flags = 0; abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { if (new_offset < sg->length) @@ -936,17 +925,28 @@ abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) { unsigned long pos; - while (abd_is_gang(abd)) - abd = abd_gang_get_offset(abd, &off); + if (abd_is_gang(abd)) { + unsigned long count = 0; + + for (abd_t *cabd = abd_gang_get_offset(abd, &off); + cabd != NULL && size != 0; + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { + ASSERT3U(off, <, cabd->abd_size); + int mysize = MIN(size, cabd->abd_size - off); + count += abd_nr_pages_off(cabd, mysize, off); + size -= mysize; + off = 0; + } + return (count); + } - ASSERT(!abd_is_gang(abd)); if (abd_is_linear(abd)) pos = (unsigned long)abd_to_buf(abd) + off; else pos = ABD_SCATTER(abd).abd_offset + off; - return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - - (pos >> PAGE_SHIFT); + return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - + (pos >> PAGE_SHIFT)); } static unsigned int @@ -1021,7 +1021,6 @@ unsigned int abd_bio_map_off(struct bio *bio, abd_t *abd, unsigned int io_size, size_t off) { - int i; struct abd_iter aiter; ASSERT3U(io_size, <=, abd->abd_size - off); @@ -1035,7 +1034,7 @@ abd_bio_map_off(struct bio *bio, abd_t *abd, abd_iter_init(&aiter, abd); abd_iter_advance(&aiter, off); - for (i = 0; i < bio->bi_max_vecs; i++) { + for (int i = 0; i < bio->bi_max_vecs; i++) { struct page *pg; size_t len, sgoff, pgoff; struct scatterlist *sg; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index 4bd27d1b516f..b373f2c2e83c 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -350,19 +350,14 @@ vdev_disk_close(vdev_t *v) static dio_request_t * vdev_disk_dio_alloc(int bio_count) { - dio_request_t *dr; - int i; - - dr = kmem_zalloc(sizeof (dio_request_t) + + dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + sizeof (struct bio *) * bio_count, KM_SLEEP); - if (dr) { - atomic_set(&dr->dr_ref, 0); - dr->dr_bio_count = bio_count; - dr->dr_error = 0; + atomic_set(&dr->dr_ref, 0); + dr->dr_bio_count = bio_count; + dr->dr_error = 0; - for (i = 0; i < dr->dr_bio_count; i++) - dr->dr_bio[i] = NULL; - } + for (int i = 0; i < dr->dr_bio_count; i++) + dr->dr_bio[i] = NULL; return (dr); } @@ -536,8 +531,9 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; - int bio_size, bio_count = 16; - int i = 0, error = 0; + int bio_size; + int bio_count = 16; + int error = 0; struct blk_plug plug; /* @@ -552,8 +548,6 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, retry: dr = vdev_disk_dio_alloc(bio_count); - if (dr == NULL) - return (SET_ERROR(ENOMEM)); if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) bio_set_flags_failfast(bdev, &flags); @@ -561,26 +555,28 @@ retry: dr->dr_zio = zio; /* - * When the IO size exceeds the maximum bio size for the request - * queue we are forced to break the IO in multiple bio's and wait - * for them all to complete. Ideally, all pool users will set - * their volume block size to match the maximum request size and - * the common case will be one bio per vdev IO request. + * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which + * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio + * can cover at least 128KB and at most 1MB. When the required number + * of iovec's exceeds this, we are forced to break the IO in multiple + * bio's and wait for them all to complete. This is likely if the + * recordsize property is increased beyond 1MB. The default + * bio_count=16 should typically accommodate the maximum-size zio of + * 16MB. */ abd_offset = 0; bio_offset = io_offset; - bio_size = io_size; - for (i = 0; i <= dr->dr_bio_count; i++) { + bio_size = io_size; + for (int i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ if (bio_size <= 0) break; /* - * By default only 'bio_count' bio's per dio are allowed. - * However, if we find ourselves in a situation where more - * are needed we allocate a larger dio and warn the user. + * If additional bio's are required, we have to retry, but + * this should be rare - see the comment above. */ if (dr->dr_bio_count == i) { vdev_disk_dio_free(dr); @@ -622,9 +618,10 @@ retry: blk_start_plug(&plug); /* Submit all bio's associated with this dio */ - for (i = 0; i < dr->dr_bio_count; i++) + for (int i = 0; i < dr->dr_bio_count; i++) { if (dr->dr_bio[i]) vdev_submit_bio(dr->dr_bio[i]); + } if (dr->dr_bio_count > 1) blk_finish_plug(&plug); diff --git a/sys/contrib/openzfs/module/zcommon/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c index e435e1a9f78a..3b0f824115f8 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_uio.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c @@ -55,7 +55,7 @@ * a non-zero errno on failure. */ static int -uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio) +zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) { const struct iovec *iov = uio->uio_iov; size_t skip = uio->uio_skip; @@ -126,7 +126,7 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio) } static int -uiomove_bvec(void *p, size_t n, enum uio_rw rw, struct uio *uio) +zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) { const struct bio_vec *bv = uio->uio_bvec; size_t skip = uio->uio_skip; @@ -160,7 +160,7 @@ uiomove_bvec(void *p, size_t n, enum uio_rw rw, struct uio *uio) #if defined(HAVE_VFS_IOV_ITER) static int -uiomove_iter(void *p, size_t n, enum uio_rw rw, struct uio *uio, +zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, boolean_t revert) { size_t cnt = MIN(n, uio->uio_resid); @@ -182,7 +182,7 @@ uiomove_iter(void *p, size_t n, enum uio_rw rw, struct uio *uio, return (EFAULT); /* - * Revert advancing the uio_iter. This is set by uiocopy() + * Revert advancing the uio_iter. This is set by zfs_uiocopy() * to avoid consuming the uio and its iov_iter structure. */ if (revert) @@ -196,33 +196,69 @@ uiomove_iter(void *p, size_t n, enum uio_rw rw, struct uio *uio, #endif int -uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio) +zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio) { if (uio->uio_segflg == UIO_BVEC) - return (uiomove_bvec(p, n, rw, uio)); + return (zfs_uiomove_bvec(p, n, rw, uio)); #if defined(HAVE_VFS_IOV_ITER) else if (uio->uio_segflg == UIO_ITER) - return (uiomove_iter(p, n, rw, uio, B_FALSE)); + return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE)); #endif else - return (uiomove_iov(p, n, rw, uio)); + return (zfs_uiomove_iov(p, n, rw, uio)); } -EXPORT_SYMBOL(uiomove); +EXPORT_SYMBOL(zfs_uiomove); +/* + * Fault in the pages of the first n bytes specified by the uio structure. + * 1 byte in each page is touched and the uio struct is unmodified. Any + * error will terminate the process as this is only a best attempt to get + * the pages resident. + */ int -uio_prefaultpages(ssize_t n, struct uio *uio) +zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio) { - struct iov_iter iter, *iterp = NULL; - -#if defined(HAVE_IOV_ITER_FAULT_IN_READABLE) - if (uio->uio_segflg == UIO_USERSPACE) { - iterp = &iter; - iov_iter_init_compat(iterp, READ, uio->uio_iov, - uio->uio_iovcnt, uio->uio_resid); + if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC) { + /* There's never a need to fault in kernel pages */ + return (0); #if defined(HAVE_VFS_IOV_ITER) } else if (uio->uio_segflg == UIO_ITER) { - iterp = uio->uio_iter; + /* + * At least a Linux 4.9 kernel, iov_iter_fault_in_readable() + * can be relied on to fault in user pages when referenced. + */ + if (iov_iter_fault_in_readable(uio->uio_iter, n)) + return (EFAULT); #endif + } else { + /* Fault in all user pages */ + ASSERT3S(uio->uio_segflg, ==, UIO_USERSPACE); + const struct iovec *iov = uio->uio_iov; + int iovcnt = uio->uio_iovcnt; + size_t skip = uio->uio_skip; + uint8_t tmp; + caddr_t p; + + for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) { + ulong_t cnt = MIN(iov->iov_len - skip, n); + /* empty iov */ + if (cnt == 0) + continue; + n -= cnt; + /* touch each page in this segment. */ + p = iov->iov_base + skip; + while (cnt) { + if (get_user(tmp, (uint8_t *)p)) + return (EFAULT); + ulong_t incr = MIN(cnt, PAGESIZE); + p += incr; + cnt -= incr; + } + /* touch the last byte in case it straddles a page. */ + p--; + if (get_user(tmp, (uint8_t *)p)) + return (EFAULT); + } } if (iterp && iov_iter_fault_in_readable(iterp, n)) @@ -230,40 +266,40 @@ uio_prefaultpages(ssize_t n, struct uio *uio) #endif return (0); } -EXPORT_SYMBOL(uio_prefaultpages); +EXPORT_SYMBOL(zfs_uio_prefaultpages); /* - * The same as uiomove() but doesn't modify uio structure. + * The same as zfs_uiomove() but doesn't modify uio structure. * return in cbytes how many bytes were copied. */ int -uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes) +zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes) { - struct uio uio_copy; + zfs_uio_t uio_copy; int ret; - bcopy(uio, &uio_copy, sizeof (struct uio)); + bcopy(uio, &uio_copy, sizeof (zfs_uio_t)); if (uio->uio_segflg == UIO_BVEC) - ret = uiomove_bvec(p, n, rw, &uio_copy); + ret = zfs_uiomove_bvec(p, n, rw, &uio_copy); #if defined(HAVE_VFS_IOV_ITER) else if (uio->uio_segflg == UIO_ITER) - ret = uiomove_iter(p, n, rw, &uio_copy, B_TRUE); + ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE); #endif else - ret = uiomove_iov(p, n, rw, &uio_copy); + ret = zfs_uiomove_iov(p, n, rw, &uio_copy); *cbytes = uio->uio_resid - uio_copy.uio_resid; return (ret); } -EXPORT_SYMBOL(uiocopy); +EXPORT_SYMBOL(zfs_uiocopy); /* * Drop the next n chars out of *uio. */ void -uioskip(uio_t *uio, size_t n) +zfs_uioskip(zfs_uio_t *uio, size_t n) { if (n > uio->uio_resid) return; @@ -292,5 +328,6 @@ uioskip(uio_t *uio, size_t n) uio->uio_loffset += n; uio->uio_resid -= n; } -EXPORT_SYMBOL(uioskip); +EXPORT_SYMBOL(zfs_uioskip); + #endif /* _KERNEL */ diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index 165c1218ae79..3cc4b560e477 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -1772,7 +1772,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) *ipp = ZTOI(zp); if (*ipp) - zfs_inode_update(ITOZ(*ipp)); + zfs_znode_update_vfs(ITOZ(*ipp)); ZFS_EXIT(zfsvfs); return (0); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 3be387a30e5c..84c33b541ea3 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -87,15 +87,18 @@ * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * - * (2) zrele() should always be the last thing except for zil_commit() - * (if necessary) and ZFS_EXIT(). This is for 3 reasons: - * First, if it's the last reference, the vnode/znode - * can be freed, so the zp may point to freed memory. Second, the last - * reference will call zfs_zinactive(), which may induce a lot of work -- - * pushing cached pages (which acquires range locks) and syncing out - * cached atime changes. Third, zfs_zinactive() may require a new tx, - * which could deadlock the system if you were already holding one. - * If you must call zrele() within a tx then use zfs_zrele_async(). + * (2) zrele() should always be the last thing except for zil_commit() (if + * necessary) and ZFS_EXIT(). This is for 3 reasons: First, if it's the + * last reference, the vnode/znode can be freed, so the zp may point to + * freed memory. Second, the last reference will call zfs_zinactive(), + * which may induce a lot of work -- pushing cached pages (which acquires + * range locks) and syncing out cached atime changes. Third, + * zfs_zinactive() may require a new tx, which could deadlock the system + * if you were already holding one. This deadlock occurs because the tx + * currently being operated on prevents a txg from syncing, which + * prevents the new tx from progressing, resulting in a deadlock. If you + * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() + * is a synonym for zrele(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. @@ -298,7 +301,7 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os) * the file is memory mapped. */ int -mappedread(znode_t *zp, int nbytes, uio_t *uio) +mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; @@ -320,7 +323,7 @@ mappedread(znode_t *zp, int nbytes, uio_t *uio) unlock_page(pp); pb = kmap(pp); - error = uiomove(pb + off, bytes, UIO_READ, uio); + error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); if (mapping_writably_mapped(mp)) @@ -372,8 +375,8 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len, iov.iov_base = (void *)data; iov.iov_len = len; - uio_t uio; - uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); cookie = spl_fstrans_mark(); error = zfs_write(zp, &uio, 0, kcred); @@ -381,8 +384,8 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len, if (error == 0) { if (residp != NULL) - *residp = uio_resid(&uio); - else if (uio_resid(&uio) != 0) + *residp = zfs_uio_resid(&uio); + else if (zfs_uio_resid(&uio) != 0) error = SET_ERROR(EIO); } @@ -398,11 +401,18 @@ zfs_zrele_async(znode_t *zp) ASSERT(atomic_read(&ip->i_count) > 0); ASSERT(os != NULL); - if (atomic_read(&ip->i_count) == 1) + /* + * If decrementing the count would put us at 0, we can't do it inline + * here, because that would be synchronous. Instead, dispatch an iput + * to run later. + * + * For more information on the dangers of a synchronous iput, see the + * header comment of this file. + */ + if (!atomic_add_unless(&ip->i_count, -1, 1)) { VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); - else - zrele(zp); + } } @@ -516,7 +526,7 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); if ((error == 0) && (*zpp)) - zfs_inode_update(*zpp); + zfs_znode_update_vfs(*zpp); ZFS_EXIT(zfsvfs); return (error); @@ -779,8 +789,8 @@ out: if (zp) zrele(zp); } else { - zfs_inode_update(dzp); - zfs_inode_update(zp); + zfs_znode_update_vfs(dzp); + zfs_znode_update_vfs(zp); *zpp = zp; } @@ -902,8 +912,8 @@ out: if (zp) zrele(zp); } else { - zfs_inode_update(dzp); - zfs_inode_update(zp); + zfs_znode_update_vfs(dzp); + zfs_znode_update_vfs(zp); *ipp = ZTOI(zp); } @@ -1129,8 +1139,8 @@ out: pn_free(realnmp); zfs_dirent_unlock(dl); - zfs_inode_update(dzp); - zfs_inode_update(zp); + zfs_znode_update_vfs(dzp); + zfs_znode_update_vfs(zp); if (delete_now) zrele(zp); @@ -1138,7 +1148,7 @@ out: zfs_zrele_async(zp); if (xzp) { - zfs_inode_update(xzp); + zfs_znode_update_vfs(xzp); zfs_zrele_async(xzp); } @@ -1335,8 +1345,8 @@ out: if (error != 0) { zrele(zp); } else { - zfs_inode_update(dzp); - zfs_inode_update(zp); + zfs_znode_update_vfs(dzp); + zfs_znode_update_vfs(zp); } ZFS_EXIT(zfsvfs); return (error); @@ -1461,8 +1471,8 @@ top: out: zfs_dirent_unlock(dl); - zfs_inode_update(dzp); - zfs_inode_update(zp); + zfs_znode_update_vfs(dzp); + zfs_znode_update_vfs(zp); zrele(zp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) @@ -2532,7 +2542,7 @@ out: err2 = zfs_setattr_dir(attrzp); zrele(attrzp); } - zfs_inode_update(zp); + zfs_znode_update_vfs(zp); } out2: @@ -2990,17 +3000,17 @@ out: zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); - zfs_inode_update(sdzp); + zfs_znode_update_vfs(sdzp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (sdzp != tdzp) - zfs_inode_update(tdzp); + zfs_znode_update_vfs(tdzp); - zfs_inode_update(szp); + zfs_znode_update_vfs(szp); zrele(szp); if (tzp) { - zfs_inode_update(tzp); + zfs_znode_update_vfs(tzp); zrele(tzp); } @@ -3159,8 +3169,8 @@ top: txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - zfs_inode_update(dzp); - zfs_inode_update(zp); + zfs_znode_update_vfs(dzp); + zfs_znode_update_vfs(zp); } zfs_acl_ids_free(&acl_ids); @@ -3198,7 +3208,7 @@ top: */ /* ARGSUSED */ int -zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr) +zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); @@ -3409,8 +3419,8 @@ top: if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); - zfs_inode_update(tdzp); - zfs_inode_update(szp); + zfs_znode_update_vfs(tdzp); + zfs_znode_update_vfs(szp); ZFS_EXIT(zfsvfs); return (error); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c index b33594488ee0..d59c1bb0716a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c @@ -479,14 +479,10 @@ zfs_set_inode_flags(znode_t *zp, struct inode *ip) } /* - * Update the embedded inode given the znode. We should work toward - * eliminating this function as soon as possible by removing values - * which are duplicated between the znode and inode. If the generic - * inode has the correct field it should be used, and the ZFS code - * updated to access the inode. This can be done incrementally. + * Update the embedded inode given the znode. */ void -zfs_inode_update(znode_t *zp) +zfs_znode_update_vfs(znode_t *zp) { zfsvfs_t *zfsvfs; struct inode *ip; @@ -602,7 +598,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ZFS_TIME_DECODE(&ip->i_ctime, ctime); ip->i_ino = zp->z_id; - zfs_inode_update(zp); + zfs_znode_update_vfs(zp); zfs_inode_set_ops(zfsvfs, ip); /* @@ -1278,7 +1274,7 @@ zfs_rezget(znode_t *zp) zp->z_blksz = doi.doi_data_block_size; zp->z_atime_dirty = B_FALSE; - zfs_inode_update(zp); + zfs_znode_update_vfs(zp); /* * If the file has zero links, then it has been unlinked on the send @@ -1796,7 +1792,7 @@ log: dmu_tx_commit(tx); - zfs_inode_update(zp); + zfs_znode_update_vfs(zp); error = 0; out: @@ -2127,8 +2123,10 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, size_t complen; int is_xattrdir = 0; - if (prevdb) + if (prevdb) { + ASSERT(prevhdl != NULL); zfs_release_sa_handle(prevhdl, prevdb, FTAG); + } if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c index 8106359e1c77..284ca706ede5 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c @@ -376,7 +376,7 @@ error: static int zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key, crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen, - uio_t *puio, uio_t *cuio, uint8_t *authbuf, uint_t auth_len) + zfs_uio_t *puio, zfs_uio_t *cuio, uint8_t *authbuf, uint_t auth_len) { int ret; crypto_data_t plaindata, cipherdata; @@ -479,7 +479,7 @@ zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) { int ret; - uio_t puio, cuio; + zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint64_t crypt = key->zk_crypt; @@ -495,7 +495,7 @@ zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, if (ret != 0) goto error; - /* initialize uio_ts */ + /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; @@ -550,7 +550,7 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, uint8_t *mac, zio_crypt_key_t *key) { crypto_mechanism_t mech; - uio_t puio, cuio; + zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint_t enc_len, keydata_len, aad_len; @@ -563,7 +563,7 @@ zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, keydata_len = zio_crypt_table[crypt].ci_keylen; - /* initialize uio_ts */ + /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; @@ -1296,7 +1296,7 @@ error: } static void -zio_crypt_destroy_uio(uio_t *uio) +zio_crypt_destroy_uio(zfs_uio_t *uio) { if (uio->uio_iov) kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); @@ -1386,8 +1386,8 @@ zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, */ static int zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, - uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio, - uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, + uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, + zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; @@ -1581,7 +1581,7 @@ error: static int zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, - uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, + zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; @@ -1764,7 +1764,7 @@ error: static int zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, - uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *cuio, + uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len) { int ret; @@ -1824,8 +1824,8 @@ error: static int zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, - uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, - uint_t *auth_len, boolean_t *no_crypt) + uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, + uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; iovec_t *mac_iov; @@ -1884,7 +1884,7 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, uint64_t crypt = key->zk_crypt; uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; uint_t enc_len, auth_len; - uio_t puio, cuio; + zfs_uio_t puio, cuio; uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; crypto_key_t tmp_ckey, *ckey = NULL; crypto_ctx_template_t tmpl; @@ -1950,8 +1950,8 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, /* If the hardware implementation fails fall back to software */ } - bzero(&puio, sizeof (uio_t)); - bzero(&cuio, sizeof (uio_t)); + bzero(&puio, sizeof (zfs_uio_t)); + bzero(&cuio, sizeof (zfs_uio_t)); /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index 9e08c94e2147..970db4a8b73a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -242,13 +242,13 @@ zpl_file_accessed(struct file *filp) * Otherwise, for older kernels extract the iovec and pass it instead. */ static void -zpl_uio_init(uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, +zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, loff_t pos, ssize_t count, size_t skip) { #if defined(HAVE_VFS_IOV_ITER) - uio_iov_iter_init(uio, to, pos, count, skip); + zfs_uio_iov_iter_init(uio, to, pos, count, skip); #else - uio_iovec_init(uio, to->iov, to->nr_segs, pos, + zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos, to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, count, skip); #endif @@ -261,7 +261,7 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; ssize_t count = iov_iter_count(to); - uio_t uio; + zfs_uio_t uio; zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); @@ -317,7 +317,7 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; - uio_t uio; + zfs_uio_t uio; size_t count = 0; ssize_t ret; @@ -364,8 +364,8 @@ zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, if (ret) return (ret); - uio_t uio; - uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, count, 0); crhold(cr); @@ -407,8 +407,8 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, if (ret) return (ret); - uio_t uio; - uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, count, 0); crhold(cr); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c index f336fbb1272b..e79d334edc9b 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c @@ -499,8 +499,8 @@ zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link) iov.iov_len = MAXPATHLEN; iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - uio_t uio; - uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, MAXPATHLEN - 1, 0); + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, MAXPATHLEN - 1, 0); cookie = spl_fstrans_mark(); error = -zfs_readlink(ip, &uio, cr); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c index 1ec3dae2bb81..83812f2dcba8 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c @@ -306,15 +306,15 @@ zpl_xattr_get_dir(struct inode *ip, const char *name, void *value, iov.iov_base = (void *)value; iov.iov_len = size; - uio_t uio; - uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, size, 0); + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, size, 0); cookie = spl_fstrans_mark(); error = -zfs_read(ITOZ(xip), &uio, 0, cr); spl_fstrans_unmark(cookie); if (error == 0) - error = size - uio_resid(&uio); + error = size - zfs_uio_resid(&uio); out: if (xzp) zrele(xzp); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index cdc2076702af..0caf31307718 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -85,9 +85,9 @@ zvol_write(void *arg) zv_request_t *zvr = arg; struct bio *bio = zvr->bio; int error = 0; - uio_t uio; + zfs_uio_t uio; - uio_bvec_init(&uio, bio); + zfs_uio_bvec_init(&uio, bio); zvol_state_t *zv = zvr->zv; ASSERT3P(zv, !=, NULL); @@ -247,9 +247,9 @@ zvol_read(void *arg) zv_request_t *zvr = arg; struct bio *bio = zvr->bio; int error = 0; - uio_t uio; + zfs_uio_t uio; - uio_bvec_init(&uio, bio); + zfs_uio_bvec_init(&uio, bio); zvol_state_t *zv = zvr->zv; ASSERT3P(zv, !=, NULL); diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c index 68d4aa5f5cb4..7d3a2f6d69e2 100644 --- a/sys/contrib/openzfs/module/zfs/abd.c +++ b/sys/contrib/openzfs/module/zfs/abd.c @@ -105,26 +105,6 @@ /* see block comment above for description */ int zfs_abd_scatter_enabled = B_TRUE; -boolean_t -abd_is_linear(abd_t *abd) -{ - return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE); -} - -boolean_t -abd_is_linear_page(abd_t *abd) -{ - return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ? - B_TRUE : B_FALSE); -} - -boolean_t -abd_is_gang(abd_t *abd) -{ - return ((abd->abd_flags & ABD_FLAG_GANG) != 0 ? B_TRUE : - B_FALSE); -} - void abd_verify(abd_t *abd) { @@ -133,8 +113,10 @@ abd_verify(abd_t *abd) ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG | - ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS)); + ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD)); +#ifdef ZFS_DEBUG IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); +#endif IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL); @@ -153,11 +135,43 @@ abd_verify(abd_t *abd) } } -uint_t -abd_get_size(abd_t *abd) +static void +abd_init_struct(abd_t *abd) { - abd_verify(abd); - return (abd->abd_size); + list_link_init(&abd->abd_gang_link); + mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL); + abd->abd_flags = 0; +#ifdef ZFS_DEBUG + zfs_refcount_create(&abd->abd_children); + abd->abd_parent = NULL; +#endif + abd->abd_size = 0; +} + +static void +abd_fini_struct(abd_t *abd) +{ + mutex_destroy(&abd->abd_mtx); + ASSERT(!list_link_active(&abd->abd_gang_link)); +#ifdef ZFS_DEBUG + zfs_refcount_destroy(&abd->abd_children); +#endif +} + +abd_t * +abd_alloc_struct(size_t size) +{ + abd_t *abd = abd_alloc_struct_impl(size); + abd_init_struct(abd); + abd->abd_flags |= ABD_FLAG_ALLOCD; + return (abd); +} + +void +abd_free_struct(abd_t *abd) +{ + abd_fini_struct(abd); + abd_free_struct_impl(abd); } /* @@ -173,7 +187,7 @@ abd_alloc(size_t size, boolean_t is_metadata) VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); abd_t *abd = abd_alloc_struct(size); - abd->abd_flags = ABD_FLAG_OWNER; + abd->abd_flags |= ABD_FLAG_OWNER; abd->abd_u.abd_scatter.abd_offset = 0; abd_alloc_chunks(abd, size); @@ -181,65 +195,12 @@ abd_alloc(size_t size, boolean_t is_metadata) abd->abd_flags |= ABD_FLAG_META; } abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); abd_update_scatter_stats(abd, ABDSTAT_INCR); return (abd); } -static void -abd_free_scatter(abd_t *abd) -{ - abd_free_chunks(abd); - - zfs_refcount_destroy(&abd->abd_children); - abd_update_scatter_stats(abd, ABDSTAT_DECR); - abd_free_struct(abd); -} - -static void -abd_put_gang_abd(abd_t *abd) -{ - ASSERT(abd_is_gang(abd)); - abd_t *cabd; - - while ((cabd = list_remove_head(&ABD_GANG(abd).abd_gang_chain)) - != NULL) { - ASSERT0(cabd->abd_flags & ABD_FLAG_GANG_FREE); - abd->abd_size -= cabd->abd_size; - abd_put(cabd); - } - ASSERT0(abd->abd_size); - list_destroy(&ABD_GANG(abd).abd_gang_chain); -} - -/* - * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not - * free the underlying scatterlist or buffer. - */ -void -abd_put(abd_t *abd) -{ - if (abd == NULL) - return; - - abd_verify(abd); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - - if (abd->abd_parent != NULL) { - (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, - abd->abd_size, abd); - } - - if (abd_is_gang(abd)) - abd_put_gang_abd(abd); - - zfs_refcount_destroy(&abd->abd_children); - abd_free_struct(abd); -} - /* * Allocate an ABD that must be linear, along with its own underlying data * buffer. Only use this when it would be very annoying to write your ABD @@ -252,13 +213,11 @@ abd_alloc_linear(size_t size, boolean_t is_metadata) VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; + abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_OWNER; if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); if (is_metadata) { ABD_LINEAR_BUF(abd) = zio_buf_alloc(size); @@ -284,19 +243,16 @@ abd_free_linear(abd_t *abd) zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } - zfs_refcount_destroy(&abd->abd_children); abd_update_linear_stats(abd, ABDSTAT_DECR); - - abd_free_struct(abd); } static void -abd_free_gang_abd(abd_t *abd) +abd_free_gang(abd_t *abd) { ASSERT(abd_is_gang(abd)); - abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain); + abd_t *cabd; - while (cabd != NULL) { + while ((cabd = list_head(&ABD_GANG(abd).abd_gang_chain)) != NULL) { /* * We must acquire the child ABDs mutex to ensure that if it * is being added to another gang ABD we will set the link @@ -307,24 +263,29 @@ abd_free_gang_abd(abd_t *abd) ASSERT(list_link_active(&cabd->abd_gang_link)); list_remove(&ABD_GANG(abd).abd_gang_chain, cabd); mutex_exit(&cabd->abd_mtx); - abd->abd_size -= cabd->abd_size; - if (cabd->abd_flags & ABD_FLAG_GANG_FREE) { - if (cabd->abd_flags & ABD_FLAG_OWNER) - abd_free(cabd); - else - abd_put(cabd); - } - cabd = list_head(&ABD_GANG(abd).abd_gang_chain); + if (cabd->abd_flags & ABD_FLAG_GANG_FREE) + abd_free(cabd); } - ASSERT0(abd->abd_size); list_destroy(&ABD_GANG(abd).abd_gang_chain); - zfs_refcount_destroy(&abd->abd_children); - abd_free_struct(abd); +} + +static void +abd_free_scatter(abd_t *abd) +{ + abd_free_chunks(abd); + abd_update_scatter_stats(abd, ABDSTAT_DECR); } /* - * Free an ABD. Only use this on ABDs allocated with abd_alloc(), - * abd_alloc_linear(), or abd_alloc_gang_abd(). + * Free an ABD. Use with any kind of abd: those created with abd_alloc_*() + * and abd_get_*(), including abd_get_offset_struct(). + * + * If the ABD was created with abd_alloc_*(), the underlying data + * (scatterlist or linear buffer) will also be freed. (Subject to ownership + * changes via abd_*_ownership_of_buf().) + * + * Unless the ABD was created with abd_get_offset_struct(), the abd_t will + * also be freed. */ void abd_free(abd_t *abd) @@ -333,14 +294,30 @@ abd_free(abd_t *abd) return; abd_verify(abd); - ASSERT3P(abd->abd_parent, ==, NULL); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) - abd_free_linear(abd); - else if (abd_is_gang(abd)) - abd_free_gang_abd(abd); - else - abd_free_scatter(abd); +#ifdef ZFS_DEBUG + IMPLY(abd->abd_flags & ABD_FLAG_OWNER, abd->abd_parent == NULL); +#endif + + if (abd_is_gang(abd)) { + abd_free_gang(abd); + } else if (abd_is_linear(abd)) { + if (abd->abd_flags & ABD_FLAG_OWNER) + abd_free_linear(abd); + } else { + if (abd->abd_flags & ABD_FLAG_OWNER) + abd_free_scatter(abd); + } + +#ifdef ZFS_DEBUG + if (abd->abd_parent != NULL) { + (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, + abd->abd_size, abd); + } +#endif + + abd_fini_struct(abd); + if (abd->abd_flags & ABD_FLAG_ALLOCD) + abd_free_struct_impl(abd); } /* @@ -359,24 +336,18 @@ abd_alloc_sametype(abd_t *sabd, size_t size) } } - /* * Create gang ABD that will be the head of a list of ABD's. This is used * to "chain" scatter/gather lists together when constructing aggregated * IO's. To free this abd, abd_free() must be called. */ abd_t * -abd_alloc_gang_abd(void) +abd_alloc_gang(void) { - abd_t *abd; - - abd = abd_alloc_struct(0); - abd->abd_flags = ABD_FLAG_GANG | ABD_FLAG_OWNER; - abd->abd_size = 0; - abd->abd_parent = NULL; + abd_t *abd = abd_alloc_struct(0); + abd->abd_flags |= ABD_FLAG_GANG | ABD_FLAG_OWNER; list_create(&ABD_GANG(abd).abd_gang_chain, sizeof (abd_t), offsetof(abd_t, abd_gang_link)); - zfs_refcount_create(&abd->abd_children); return (abd); } @@ -392,8 +363,8 @@ abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) if (free_on_free) { /* * If the parent is responsible for freeing the child gang - * ABD we will just splice the childs children ABD list to - * the parents list and immediately free the child gang ABD + * ABD we will just splice the child's children ABD list to + * the parent's list and immediately free the child gang ABD * struct. The parent gang ABDs children from the child gang * will retain all the free_on_free settings after being * added to the parents list. @@ -403,7 +374,7 @@ abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) &ABD_GANG(cabd).abd_gang_chain); ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); abd_verify(pabd); - abd_free_struct(cabd); + abd_free(cabd); } else { for (abd_t *child = list_head(&ABD_GANG(cabd).abd_gang_chain); child != NULL; @@ -431,7 +402,7 @@ abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) /* * If the child being added is a gang ABD, we will add the - * childs ABDs to the parent gang ABD. This alllows us to account + * child's ABDs to the parent gang ABD. This allows us to account * for the offset correctly in the parent gang ABD. */ if (abd_is_gang(cabd)) { @@ -458,7 +429,7 @@ abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) * allocated ABD with ABD_FLAG_GANG_FREE, before * adding it to the gang ABD's list, to make the * gang ABD aware that it is responsible to call - * abd_put(). We use abd_get_offset() in order + * abd_free(). We use abd_get_offset() in order * to just allocate a new ABD but avoid copying the * data over into the newly allocated ABD. * @@ -515,73 +486,96 @@ abd_gang_get_offset(abd_t *abd, size_t *off) } /* - * Allocate a new ABD to point to offset off of sabd. It shares the underlying - * buffer data with sabd. Use abd_put() to free. sabd must not be freed while - * any derived ABDs exist. + * Allocate a new ABD, using the provided struct (if non-NULL, and if + * circumstances allow - otherwise allocate the struct). The returned ABD will + * point to offset off of sabd. It shares the underlying buffer data with sabd. + * Use abd_free() to free. sabd must not be freed while any derived ABDs exist. */ static abd_t * -abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) +abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size) { - abd_t *abd = NULL; - abd_verify(sabd); - ASSERT3U(off, <=, sabd->abd_size); + ASSERT3U(off + size, <=, sabd->abd_size); if (abd_is_linear(sabd)) { - abd = abd_alloc_struct(0); - + if (abd == NULL) + abd = abd_alloc_struct(0); /* * Even if this buf is filesystem metadata, we only track that * if we own the underlying data buffer, which is not true in * this case. Therefore, we don't ever use ABD_FLAG_META here. */ - abd->abd_flags = ABD_FLAG_LINEAR; + abd->abd_flags |= ABD_FLAG_LINEAR; ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off; } else if (abd_is_gang(sabd)) { size_t left = size; - abd = abd_alloc_gang_abd(); + if (abd == NULL) { + abd = abd_alloc_gang(); + } else { + abd->abd_flags |= ABD_FLAG_GANG; + list_create(&ABD_GANG(abd).abd_gang_chain, + sizeof (abd_t), offsetof(abd_t, abd_gang_link)); + } + abd->abd_flags &= ~ABD_FLAG_OWNER; for (abd_t *cabd = abd_gang_get_offset(sabd, &off); cabd != NULL && left > 0; cabd = list_next(&ABD_GANG(sabd).abd_gang_chain, cabd)) { int csize = MIN(left, cabd->abd_size - off); - abd_t *nabd = abd_get_offset_impl(cabd, off, csize); - abd_gang_add(abd, nabd, B_FALSE); + abd_t *nabd = abd_get_offset_size(cabd, off, csize); + abd_gang_add(abd, nabd, B_TRUE); left -= csize; off = 0; } ASSERT3U(left, ==, 0); } else { - abd = abd_get_offset_scatter(sabd, off); + abd = abd_get_offset_scatter(abd, sabd, off); } + ASSERT3P(abd, !=, NULL); abd->abd_size = size; +#ifdef ZFS_DEBUG abd->abd_parent = sabd; - zfs_refcount_create(&abd->abd_children); (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); +#endif return (abd); } +/* + * Like abd_get_offset_size(), but memory for the abd_t is provided by the + * caller. Using this routine can improve performance by avoiding the cost + * of allocating memory for the abd_t struct, and updating the abd stats. + * Usually, the provided abd is returned, but in some circumstances (FreeBSD, + * if sabd is scatter and size is more than 2 pages) a new abd_t may need to + * be allocated. Therefore callers should be careful to use the returned + * abd_t*. + */ +abd_t * +abd_get_offset_struct(abd_t *abd, abd_t *sabd, size_t off, size_t size) +{ + abd_init_struct(abd); + return (abd_get_offset_impl(abd, sabd, off, size)); +} + abd_t * abd_get_offset(abd_t *sabd, size_t off) { size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; VERIFY3U(size, >, 0); - return (abd_get_offset_impl(sabd, off, size)); + return (abd_get_offset_impl(NULL, sabd, off, size)); } abd_t * abd_get_offset_size(abd_t *sabd, size_t off, size_t size) { ASSERT3U(off + size, <=, sabd->abd_size); - return (abd_get_offset_impl(sabd, off, size)); + return (abd_get_offset_impl(NULL, sabd, off, size)); } /* - * Return a size scatter ABD. In order to free the returned - * ABD abd_put() must be called. + * Return a size scatter ABD containing only zeros. */ abd_t * abd_get_zeros(size_t size) @@ -592,8 +586,7 @@ abd_get_zeros(size_t size) } /* - * Allocate a linear ABD structure for buf. You must free this with abd_put() - * since the resulting ABD doesn't own its own buffer. + * Allocate a linear ABD structure for buf. */ abd_t * abd_get_from_buf(void *buf, size_t size) @@ -607,10 +600,8 @@ abd_get_from_buf(void *buf, size_t size) * own the underlying data buffer, which is not true in this case. * Therefore, we don't ever use ABD_FLAG_META here. */ - abd->abd_flags = ABD_FLAG_LINEAR; + abd->abd_flags |= ABD_FLAG_LINEAR; abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); ABD_LINEAR_BUF(abd) = buf; @@ -645,7 +636,9 @@ abd_borrow_buf(abd_t *abd, size_t n) } else { buf = zio_buf_alloc(n); } +#ifdef ZFS_DEBUG (void) zfs_refcount_add_many(&abd->abd_children, n, buf); +#endif return (buf); } @@ -676,7 +669,9 @@ abd_return_buf(abd_t *abd, void *buf, size_t n) ASSERT0(abd_cmp_buf(abd, buf, n)); zio_buf_free(buf, n); } +#ifdef ZFS_DEBUG (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +#endif } void @@ -790,12 +785,12 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - boolean_t abd_multi = abd_is_gang(abd); + boolean_t gang = abd_is_gang(abd); abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { /* If we are at the end of the gang ABD we are done */ - if (abd_multi && !c_abd) + if (gang && !c_abd) break; abd_iter_map(&aiter); diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index fecc752e16ff..b4f0c8a85b64 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -3065,7 +3065,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); - abd_put(hdr->b_l1hdr.b_pabd); + abd_free(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; @@ -4163,7 +4163,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, mutex_enter(&arc_evict_lock); arc_evict_count += bytes_evicted; - if ((int64_t)(arc_free_memory() - arc_sys_free / 2) > 0) { + if (arc_free_memory() > arc_sys_free / 2) { arc_evict_waiter_t *aw; while ((aw = list_head(&arc_evict_waiters)) != NULL && aw->aew_count <= arc_evict_count) { @@ -5242,14 +5242,20 @@ arc_wait_for_eviction(uint64_t amount) list_link_init(&aw.aew_node); cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); - arc_evict_waiter_t *last = - list_tail(&arc_evict_waiters); - if (last != NULL) { - ASSERT3U(last->aew_count, >, arc_evict_count); - aw.aew_count = last->aew_count + amount; - } else { - aw.aew_count = arc_evict_count + amount; + uint64_t last_count = 0; + if (!list_is_empty(&arc_evict_waiters)) { + arc_evict_waiter_t *last = + list_tail(&arc_evict_waiters); + last_count = last->aew_count; } + /* + * Note, the last waiter's count may be less than + * arc_evict_count if we are low on memory in which + * case arc_evict_state_impl() may have deferred + * wakeups (but still incremented arc_evict_count). + */ + aw.aew_count = + MAX(last_count, arc_evict_count) + amount; list_insert_tail(&arc_evict_waiters, &aw); @@ -7041,7 +7047,7 @@ arc_write_done(zio_t *zio) ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); - abd_put(zio->io_abd); + abd_free(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } @@ -9037,7 +9043,7 @@ l2arc_blk_fetch_done(zio_t *zio) cb = zio->io_private; if (cb->l2rcb_abd != NULL) - abd_put(cb->l2rcb_abd); + abd_free(cb->l2rcb_abd); kmem_free(cb, sizeof (l2arc_read_callback_t)); } @@ -9075,17 +9081,17 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) /* * Copy buffers for L2ARC writing. */ - for (int try = 0; try < L2ARC_FEED_TYPES; try++) { + for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { /* - * If try == 1 or 3, we cache MRU metadata and data + * If pass == 1 or 3, we cache MRU metadata and data * respectively. */ if (l2arc_mfuonly) { - if (try == 1 || try == 3) + if (pass == 1 || pass == 3) continue; } - multilist_sublist_t *mls = l2arc_sublist_lock(try); + multilist_sublist_t *mls = l2arc_sublist_lock(pass); uint64_t passed_sz = 0; VERIFY3P(mls, !=, NULL); @@ -10011,7 +10017,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); - abd_put(abd); + abd_free(abd); if (err != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); @@ -10379,7 +10385,7 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); - abd_put(abd); + abd_free(abd); if (err != 0) { zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " @@ -10468,7 +10474,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) fletcher_4_native(tmpbuf, asize, NULL, &l2dhdr->dh_start_lbps[0].lbp_cksum); - abd_put(abd_buf->abd); + abd_free(abd_buf->abd); /* perform the write itself */ abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index 93445a80294b..a6cdc017cd21 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -4656,7 +4656,7 @@ dbuf_write_override_done(zio_t *zio) dbuf_write_done(zio, NULL, db); if (zio->io_abd != NULL) - abd_put(zio->io_abd); + abd_free(zio->io_abd); } typedef struct dbuf_remap_impl_callback_arg { diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index a02f43df13fd..ed345f0b6ec3 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -1170,7 +1170,7 @@ dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, #ifdef _KERNEL int -dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) +dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; @@ -1179,7 +1179,7 @@ dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ - err = dmu_buf_hold_array_by_dnode(dn, uio_offset(uio), size, + err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); @@ -1191,16 +1191,12 @@ dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) ASSERT(size > 0); - bufoff = uio_offset(uio) - db->db_offset; + bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); -#ifdef __FreeBSD__ - err = vn_io_fault_uiomove((char *)db->db_data + bufoff, - tocpy, uio); -#else - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); -#endif + err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, + UIO_READ, uio); + if (err) break; @@ -1214,14 +1210,14 @@ dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. - * Starting at offset uio->uio_loffset. + * Starting at zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int -dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) +dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; @@ -1241,10 +1237,10 @@ dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) /* * Read 'size' bytes into the uio buffer. * From the specified object - * Starting at offset uio->uio_loffset. + * Starting at offset zfs_uio_offset(uio). */ int -dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) +dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size) { dnode_t *dn; int err; @@ -1264,14 +1260,14 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) } int -dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) +dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; - err = dmu_buf_hold_array_by_dnode(dn, uio_offset(uio), size, + err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); @@ -1283,7 +1279,7 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) ASSERT(size > 0); - bufoff = uio_offset(uio) - db->db_offset; + bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); @@ -1294,18 +1290,14 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) dmu_buf_will_dirty(db, tx); /* - * XXX uiomove could block forever (eg.nfs-backed + * XXX zfs_uiomove could block forever (eg.nfs-backed * pages). There needs to be a uiolockdown() function - * to lock the pages in memory, so that uiomove won't + * to lock the pages in memory, so that zfs_uiomove won't * block. */ -#ifdef __FreeBSD__ - err = vn_io_fault_uiomove((char *)db->db_data + bufoff, - tocpy, uio); -#else - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_WRITE, uio); -#endif + err = zfs_uio_fault_move((char *)db->db_data + bufoff, + tocpy, UIO_WRITE, uio); + if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); @@ -1322,14 +1314,14 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. - * Starting at offset uio->uio_loffset. + * Starting at offset zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int -dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, +dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; @@ -1350,10 +1342,10 @@ dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, /* * Write 'size' bytes from the uio buffer. * To the specified object. - * Starting at offset uio->uio_loffset. + * Starting at offset zfs_uio_offset(uio). */ int -dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, +dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; @@ -1600,7 +1592,7 @@ dmu_sync_late_arrival_done(zio_t *zio) dsa->dsa_done(dsa->dsa_zgd, zio->io_error); - abd_put(zio->io_abd); + abd_free(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index 66a8f20092e0..bfb4adf262d5 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -326,7 +326,7 @@ smallblk_changed_cb(void *arg, uint64_t newval) /* * Inheritance and range checking should have been done by now. */ - ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE); + ASSERT(newval <= SPA_MAXBLOCKSIZE); ASSERT(ISP2(newval)); os->os_zpl_special_smallblock = newval; diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c index 0ebed4e6fbdf..73667915df0f 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_tx.c +++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c @@ -1012,6 +1012,22 @@ dmu_tx_unassign(dmu_tx_t *tx) * details on the throttle). This is used by the VFS operations, after * they have already called dmu_tx_wait() (though most likely on a * different tx). + * + * It is guaranteed that subsequent successful calls to dmu_tx_assign() + * will assign the tx to monotonically increasing txgs. Of course this is + * not strong monotonicity, because the same txg can be returned multiple + * times in a row. This guarantee holds both for subsequent calls from + * one thread and for multiple threads. For example, it is impossible to + * observe the following sequence of events: + * + * Thread 1 Thread 2 + * + * dmu_tx_assign(T1, ...) + * 1 <- dmu_tx_get_txg(T1) + * dmu_tx_assign(T2, ...) + * 2 <- dmu_tx_get_txg(T2) + * dmu_tx_assign(T3, ...) + * 1 <- dmu_tx_get_txg(T3) */ int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index de60c33589e3..6da5faf01edf 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -2322,18 +2322,7 @@ void get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) { nvlist_t *propval = fnvlist_alloc(); - nvlist_t *val; - - /* - * We use nvlist_alloc() instead of fnvlist_alloc() because the - * latter would allocate the list with NV_UNIQUE_NAME flag. - * As a result, every time a clone name is appended to the list - * it would be (linearly) searched for a duplicate name. - * We already know that all clone names must be unique and we - * want avoid the quadratic complexity of double-checking that - * because we can have a large number of clones. - */ - VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP)); + nvlist_t *val = fnvlist_alloc(); if (get_clones_stat_impl(ds, val) == 0) { fnvlist_add_nvlist(propval, ZPROP_VALUE, val); diff --git a/sys/contrib/openzfs/module/zfs/dsl_destroy.c b/sys/contrib/openzfs/module/zfs/dsl_destroy.c index 26fdf96341b9..837d78987e75 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_destroy.c +++ b/sys/contrib/openzfs/module/zfs/dsl_destroy.c @@ -600,26 +600,21 @@ dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, /* * lzc_destroy_snaps() is documented to take an nvlist whose * values "don't matter". We need to convert that nvlist to - * one that we know can be converted to LUA. We also don't - * care about any duplicate entries because the nvlist will - * be converted to a LUA table which should take care of this. + * one that we know can be converted to LUA. */ - nvlist_t *snaps_normalized; - VERIFY0(nvlist_alloc(&snaps_normalized, 0, KM_SLEEP)); + nvlist_t *snaps_normalized = fnvlist_alloc(); for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { fnvlist_add_boolean_value(snaps_normalized, nvpair_name(pair), B_TRUE); } - nvlist_t *arg; - VERIFY0(nvlist_alloc(&arg, 0, KM_SLEEP)); + nvlist_t *arg = fnvlist_alloc(); fnvlist_add_nvlist(arg, "snaps", snaps_normalized); fnvlist_free(snaps_normalized); fnvlist_add_boolean_value(arg, "defer", defer); - nvlist_t *wrapper; - VERIFY0(nvlist_alloc(&wrapper, 0, KM_SLEEP)); + nvlist_t *wrapper = fnvlist_alloc(); fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg); fnvlist_free(arg); @@ -654,7 +649,7 @@ dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, B_TRUE, 0, zfs_lua_max_memlimit, - nvlist_next_nvpair(wrapper, NULL), result); + fnvlist_lookup_nvpair(wrapper, ZCP_ARG_ARGLIST), result); if (error != 0) { char *errorstr = NULL; (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr); diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index bed6bf64c928..bc4f007b61a1 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -522,9 +522,10 @@ metaslab_class_histogram_verify(metaslab_class_t *mc) mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, KM_SLEEP); + mutex_enter(&mc->mc_lock); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; + metaslab_group_t *mg = vdev_get_mg(tvd, mc); /* * Skip any holes, uninitialized top-levels, or @@ -535,13 +536,18 @@ metaslab_class_histogram_verify(metaslab_class_t *mc) continue; } + IMPLY(mg == mg->mg_vd->vdev_log_mg, + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) mc_hist[i] += mg->mg_histogram[i]; } - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); + } + mutex_exit(&mc->mc_lock); kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } @@ -1004,16 +1010,22 @@ metaslab_group_initialized(metaslab_group_t *mg) uint64_t metaslab_group_get_space(metaslab_group_t *mg) { - return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); + /* + * Note that the number of nodes in mg_metaslab_tree may be one less + * than vdev_ms_count, due to the embedded log metaslab. + */ + mutex_enter(&mg->mg_lock); + uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree); + mutex_exit(&mg->mg_lock); + return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count); } void metaslab_group_histogram_verify(metaslab_group_t *mg) { uint64_t *mg_hist; - vdev_t *vd = mg->mg_vd; - uint64_t ashift = vd->vdev_ashift; - int i; + avl_tree_t *t = &mg->mg_metaslab_tree; + uint64_t ashift = mg->mg_vd->vdev_ashift; if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) return; @@ -1024,21 +1036,25 @@ metaslab_group_histogram_verify(metaslab_group_t *mg) ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, SPACE_MAP_HISTOGRAM_SIZE + ashift); - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - - /* skip if not active or not a member */ - if (msp->ms_sm == NULL || msp->ms_group != mg) + mutex_enter(&mg->mg_lock); + for (metaslab_t *msp = avl_first(t); + msp != NULL; msp = AVL_NEXT(t, msp)) { + VERIFY3P(msp->ms_group, ==, mg); + /* skip if not active */ + if (msp->ms_sm == NULL) continue; - for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { mg_hist[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; + } } - for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); + mutex_exit(&mg->mg_lock); + kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } @@ -1053,12 +1069,16 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) return; mutex_enter(&mg->mg_lock); + mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + IMPLY(mg == mg->mg_vd->vdev_log_mg, + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] += msp->ms_sm->sm_phys->smp_histogram[i]; } + mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } @@ -1073,17 +1093,21 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) return; mutex_enter(&mg->mg_lock); + mutex_enter(&mc->mc_lock); for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { ASSERT3U(mg->mg_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); ASSERT3U(mc->mc_histogram[i + ashift], >=, msp->ms_sm->sm_phys->smp_histogram[i]); + IMPLY(mg == mg->mg_vd->vdev_log_mg, + mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); mg->mg_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; mc->mc_histogram[i + ashift] -= msp->ms_sm->sm_phys->smp_histogram[i]; } + mutex_exit(&mc->mc_lock); mutex_exit(&mg->mg_lock); } @@ -2741,37 +2765,47 @@ metaslab_fini(metaslab_t *msp) mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); - metaslab_space_update(vd, mg->mg_class, - -metaslab_allocated_space(msp), 0, -msp->ms_size); + /* + * If the range trees haven't been allocated, this metaslab hasn't + * been through metaslab_sync_done() for the first time yet, so its + * space hasn't been accounted for in its vdev and doesn't need to be + * subtracted. + */ + if (msp->ms_freed != NULL) { + metaslab_space_update(vd, mg->mg_class, + -metaslab_allocated_space(msp), 0, -msp->ms_size); + } space_map_close(msp->ms_sm); msp->ms_sm = NULL; metaslab_unload(msp); + range_tree_destroy(msp->ms_allocatable); - range_tree_destroy(msp->ms_freeing); - range_tree_destroy(msp->ms_freed); - ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, - metaslab_unflushed_changes_memused(msp)); - spa->spa_unflushed_stats.sus_memused -= - metaslab_unflushed_changes_memused(msp); - range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); - range_tree_destroy(msp->ms_unflushed_allocs); - range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); - range_tree_destroy(msp->ms_unflushed_frees); + if (msp->ms_freed != NULL) { + range_tree_destroy(msp->ms_freeing); + range_tree_destroy(msp->ms_freed); - for (int t = 0; t < TXG_SIZE; t++) { - range_tree_destroy(msp->ms_allocating[t]); - } + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_allocs); + range_tree_destroy(msp->ms_checkpointing); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_frees); - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_destroy(msp->ms_defer[t]); + for (int t = 0; t < TXG_SIZE; t++) { + range_tree_destroy(msp->ms_allocating[t]); + } + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_destroy(msp->ms_defer[t]); + } } ASSERT0(msp->ms_deferspace); - range_tree_destroy(msp->ms_checkpointing); - for (int t = 0; t < TXG_SIZE; t++) ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); @@ -5113,7 +5147,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * all else fails. */ if (vd != NULL && vd->vdev_mg != NULL) { - mg = vd->vdev_mg; + mg = vdev_get_mg(vd, mc); if (flags & METASLAB_HINTBP_AVOID && mg->mg_next != NULL) diff --git a/sys/contrib/openzfs/module/zfs/sa.c b/sys/contrib/openzfs/module/zfs/sa.c index 83a10e7b4548..5af0aaa7d0aa 100644 --- a/sys/contrib/openzfs/module/zfs/sa.c +++ b/sys/contrib/openzfs/module/zfs/sa.c @@ -1502,7 +1502,7 @@ sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) #ifdef _KERNEL int -sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) +sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, zfs_uio_t *uio) { int error; sa_bulk_attr_t bulk; @@ -1515,8 +1515,8 @@ sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { - error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, - uio_resid(uio)), UIO_READ, uio); + error = zfs_uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, + zfs_uio_resid(uio)), UIO_READ, uio); } mutex_exit(&hdl->sa_lock); return (error); diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index 53ffbc31c186..56354a107e66 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -303,10 +303,12 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) alloc = metaslab_class_get_alloc(mc); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); + alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); + size += metaslab_class_get_space(spa_embedded_log_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); @@ -1196,6 +1198,8 @@ spa_activate(spa_t *spa, spa_mode_t mode) spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_embedded_log_class = + metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); @@ -1347,6 +1351,9 @@ spa_deactivate(spa_t *spa) metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; + metaslab_class_destroy(spa->spa_embedded_log_class); + spa->spa_embedded_log_class = NULL; + metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; @@ -2103,6 +2110,9 @@ spa_check_logs(spa_t *spa) return (rv); } +/* + * Passivate any log vdevs (note, does not apply to embedded log metaslabs). + */ static boolean_t spa_passivate_log(spa_t *spa) { @@ -2113,10 +2123,10 @@ spa_passivate_log(spa_t *spa) for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) { - metaslab_group_passivate(mg); + ASSERT3P(tvd->vdev_log_mg, ==, NULL); + metaslab_group_passivate(tvd->vdev_mg); slog_found = B_TRUE; } } @@ -2124,6 +2134,9 @@ spa_passivate_log(spa_t *spa) return (slog_found); } +/* + * Activate any log vdevs (note, does not apply to embedded log metaslabs). + */ static void spa_activate_log(spa_t *spa) { @@ -2133,10 +2146,11 @@ spa_activate_log(spa_t *spa) for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; - if (tvd->vdev_islog) - metaslab_group_activate(mg); + if (tvd->vdev_islog) { + ASSERT3P(tvd->vdev_log_mg, ==, NULL); + metaslab_group_activate(tvd->vdev_mg); + } } } @@ -6236,6 +6250,7 @@ static int spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { + int error; spa_t *spa; if (oldconfig) @@ -6288,13 +6303,9 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, * references. If we are resetting a pool, allow references by * fault injection handlers. */ - if (!spa_refcount_zero(spa) || - (spa->spa_inject_ref != 0 && - new_state != POOL_STATE_UNINITIALIZED)) { - spa_async_resume(spa); - spa->spa_is_exporting = B_FALSE; - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EBUSY)); + if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { + error = SET_ERROR(EBUSY); + goto fail; } if (spa->spa_sync_on) { @@ -6306,10 +6317,8 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { - spa_async_resume(spa); - spa->spa_is_exporting = B_FALSE; - mutex_exit(&spa_namespace_lock); - return (SET_ERROR(EXDEV)); + error = SET_ERROR(EXDEV); + goto fail; } /* @@ -6371,6 +6380,12 @@ export_spa: mutex_exit(&spa_namespace_lock); return (0); + +fail: + spa->spa_is_exporting = B_FALSE; + spa_async_resume(spa); + mutex_exit(&spa_namespace_lock); + return (error); } /* @@ -8033,12 +8048,16 @@ spa_async_thread(void *arg) old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); + old_space += metaslab_class_get_space( + spa_embedded_log_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); new_space += metaslab_class_get_space(spa_special_class(spa)); new_space += metaslab_class_get_space(spa_dedup_class(spa)); + new_space += metaslab_class_get_space( + spa_embedded_log_class(spa)); mutex_exit(&spa_namespace_lock); /* diff --git a/sys/contrib/openzfs/module/zfs/spa_history.c b/sys/contrib/openzfs/module/zfs/spa_history.c index 2939c0366504..0482e0f6c39d 100644 --- a/sys/contrib/openzfs/module/zfs/spa_history.c +++ b/sys/contrib/openzfs/module/zfs/spa_history.c @@ -288,7 +288,6 @@ spa_history_log_sync(void *arg, dmu_tx_t *tx) } #endif - fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname()->nodename); if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) { @@ -396,6 +395,12 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) } fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); + /* + * Since the history is recorded asynchronously, the effective time is + * now, which may be considerably before the change is made on disk. + */ + fnvlist_add_uint64(nvarg, ZPOOL_HIST_TIME, gethrestime_sec()); + /* Kick this off asynchronously; errors are ignored. */ dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, nvarg, tx); dmu_tx_commit(tx); @@ -526,6 +531,7 @@ log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); + fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); if (dmu_tx_is_syncing(tx)) { spa_history_log_sync(nvl, tx); diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index f49be8eec01a..b4c73f58d3bc 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -349,9 +349,11 @@ int spa_asize_inflation = 24; * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in * the pool to be consumed. This ensures that we don't run the pool * completely out of space, due to unaccounted changes (e.g. to the MOS). - * It also limits the worst-case time to allocate space. If we have - * less than this amount of free space, most ZPL operations (e.g. write, - * create) will return ENOSPC. + * It also limits the worst-case time to allocate space. If we have less than + * this amount of free space, most ZPL operations (e.g. write, create) will + * return ENOSPC. The ZIL metaslabs (spa_embedded_log_class) are also part of + * this 3.2% of space which can't be consumed by normal writes; the slop space + * "proper" (spa_get_slop_space()) is decreased by the embedded log space. * * Certain operations (e.g. file removal, most administrative actions) can * use half the slop space. They will only return ENOSPC if less than half @@ -1026,10 +1028,10 @@ spa_aux_activate(vdev_t *vd, avl_tree_t *avl) /* * Spares are tracked globally due to the following constraints: * - * - A spare may be part of multiple pools. - * - A spare may be added to a pool even if it's actively in use within + * - A spare may be part of multiple pools. + * - A spare may be added to a pool even if it's actively in use within * another pool. - * - A spare in use in any pool can only be the source of a replacement if + * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. * * We keep track of all spares on the system through the use of a reference @@ -1236,6 +1238,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) */ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); + ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); @@ -1776,17 +1779,37 @@ spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) } /* - * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%), - * or at least 128MB, unless that would cause it to be more than half the - * pool size. - * - * See the comment above spa_slop_shift for details. + * Return the amount of slop space in bytes. It is typically 1/32 of the pool + * (3.2%), minus the embedded log space. On very small pools, it may be + * slightly larger than this. The embedded log space is not included in + * spa_dspace. By subtracting it, the usable space (per "zfs list") is a + * constant 97% of the total space, regardless of metaslab size (assuming the + * default spa_slop_shift=5 and a non-tiny pool). + * + * See the comment above spa_slop_shift for more details. */ uint64_t spa_get_slop_space(spa_t *spa) { uint64_t space = spa_get_dspace(spa); - return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop))); + uint64_t slop = space >> spa_slop_shift; + + /* + * Subtract the embedded log space, but no more than half the (3.2%) + * unusable space. Note, the "no more than half" is only relevant if + * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by + * default. + */ + uint64_t embedded_log = + metaslab_class_get_dspace(spa_embedded_log_class(spa)); + slop -= MIN(embedded_log, slop >> 1); + + /* + * Slop space should be at least spa_min_slop, but no more than half + * the entire pool. + */ + slop = MAX(slop, MIN(space >> 1, spa_min_slop)); + return (slop); } uint64_t @@ -1873,6 +1896,12 @@ spa_log_class(spa_t *spa) } metaslab_class_t * +spa_embedded_log_class(spa_t *spa) +{ + return (spa->spa_embedded_log_class); +} + +metaslab_class_t * spa_special_class(spa_t *spa) { return (spa->spa_special_class); @@ -1891,12 +1920,10 @@ metaslab_class_t * spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk) { - if (DMU_OT_IS_ZIL(objtype)) { - if (spa->spa_log_class->mc_groups != 0) - return (spa_log_class(spa)); - else - return (spa_normal_class(spa)); - } + /* + * ZIL allocations determine their class in zio_alloc_zil(). + */ + ASSERT(objtype != DMU_OT_INTENT_LOG); boolean_t has_special_class = spa->spa_special_class->mc_groups != 0; @@ -2432,9 +2459,9 @@ spa_fini(void) } /* - * Return whether this pool has slogs. No locking needed. + * Return whether this pool has a dedicated slog device. No locking needed. * It's not a problem if the wrong answer is returned as it's only for - * performance and not correctness + * performance and not correctness. */ boolean_t spa_has_slogs(spa_t *spa) diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c index 3efd26155014..497e19dd58eb 100644 --- a/sys/contrib/openzfs/module/zfs/txg.c +++ b/sys/contrib/openzfs/module/zfs/txg.c @@ -292,6 +292,27 @@ txg_sync_stop(dsl_pool_t *dp) mutex_exit(&tx->tx_sync_lock); } +/* + * Get a handle on the currently open txg and keep it open. + * + * The txg is guaranteed to stay open until txg_rele_to_quiesce() is called for + * the handle. Once txg_rele_to_quiesce() has been called, the txg stays + * in quiescing state until txg_rele_to_sync() is called for the handle. + * + * It is guaranteed that subsequent calls return monotonically increasing + * txgs for the same dsl_pool_t. Of course this is not strong monotonicity, + * because the same txg can be returned multiple times in a row. This + * guarantee holds both for subsequent calls from one thread and for multiple + * threads. For example, it is impossible to observe the following sequence + * of events: + * + * Thread 1 Thread 2 + * + * 1 <- txg_hold_open(P, ...) + * 2 <- txg_hold_open(P, ...) + * 1 <- txg_hold_open(P, ...) + * + */ uint64_t txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) { @@ -393,7 +414,8 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) spa_txg_history_add(dp->dp_spa, txg + 1, tx_open_time); /* - * Quiesce the transaction group by waiting for everyone to txg_exit(). + * Quiesce the transaction group by waiting for everyone to + * call txg_rele_to_sync() for their open transaction handles. */ for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 7ffe924212da..36001e0a6626 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -59,6 +59,27 @@ #include <sys/zvol.h> #include <sys/zfs_ratelimit.h> +/* + * One metaslab from each (normal-class) vdev is used by the ZIL. These are + * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are + * part of the spa_embedded_log_class. The metaslab with the most free space + * in each vdev is selected for this purpose when the pool is opened (or a + * vdev is added). See vdev_metaslab_init(). + * + * Log blocks can be allocated from the following locations. Each one is tried + * in order until the allocation succeeds: + * 1. dedicated log vdevs, aka "slog" (spa_log_class) + * 2. embedded slog metaslabs (spa_embedded_log_class) + * 3. other metaslabs in normal vdevs (spa_normal_class) + * + * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer + * than this number of metaslabs in the vdev. This ensures that we don't set + * aside an unreasonable amount of space for the ZIL. If set to less than + * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced + * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab. + */ +int zfs_embedded_slog_min_ms = 64; + /* default target for number of metaslabs per top-level vdev */ int zfs_vdev_default_ms_count = 200; @@ -223,6 +244,22 @@ vdev_getops(const char *type) return (ops); } +/* + * Given a vdev and a metaslab class, find which metaslab group we're + * interested in. All vdevs may belong to two different metaslab classes. + * Dedicated slog devices use only the primary metaslab group, rather than a + * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. + */ +metaslab_group_t * +vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) +{ + if (mc == spa_embedded_log_class(vd->vdev_spa) && + vd->vdev_log_mg != NULL) + return (vd->vdev_log_mg); + else + return (vd->vdev_mg); +} + /* ARGSUSED */ void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, @@ -978,6 +1015,11 @@ vdev_free(vdev_t *vd) metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } + if (vd->vdev_log_mg != NULL) { + ASSERT0(vd->vdev_ms_count); + metaslab_group_destroy(vd->vdev_log_mg); + vd->vdev_log_mg = NULL; + } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); @@ -1098,14 +1140,20 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); + if (tvd->vdev_log_mg) + ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); tvd->vdev_mg = svd->vdev_mg; + tvd->vdev_log_mg = svd->vdev_log_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; + svd->vdev_log_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; + if (tvd->vdev_log_mg != NULL) + tvd->vdev_log_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; @@ -1283,7 +1331,7 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } -static void +void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; @@ -1317,6 +1365,11 @@ vdev_metaslab_group_create(vdev_t *vd) vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); + if (!vd->vdev_islog) { + vd->vdev_log_mg = metaslab_group_create( + spa_embedded_log_class(spa), vd, 1); + } + /* * The spa ashift min/max only apply for the normal metaslab * class. Class destination is late binding so ashift boundry @@ -1340,8 +1393,6 @@ int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; - uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; @@ -1369,16 +1420,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) vd->vdev_ms = mspp; vd->vdev_ms_count = newc; - for (m = oldc; m < newc; m++) { - uint64_t object = 0; + for (uint64_t m = oldc; m < newc; m++) { + uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { - error = dmu_read(mos, vd->vdev_ms_array, + error = dmu_read(spa->spa_meta_objset, + vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { @@ -1388,17 +1440,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) } } -#ifndef _KERNEL - /* - * To accommodate zdb_leak_init() fake indirect - * metaslabs, we allocate a metaslab group for - * indirect vdevs which normally don't have one. - */ - if (vd->vdev_mg == NULL) { - ASSERT0(vdev_is_concrete(vd)); - vdev_metaslab_group_create(vd); - } -#endif error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { @@ -1408,6 +1449,47 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) } } + /* + * Find the emptiest metaslab on the vdev and mark it for use for + * embedded slog by moving it from the regular to the log metaslab + * group. + */ + if (vd->vdev_mg->mg_class == spa_normal_class(spa) && + vd->vdev_ms_count > zfs_embedded_slog_min_ms && + avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { + uint64_t slog_msid = 0; + uint64_t smallest = UINT64_MAX; + + /* + * Note, we only search the new metaslabs, because the old + * (pre-existing) ones may be active (e.g. have non-empty + * range_tree's), and we don't move them to the new + * metaslab_t. + */ + for (uint64_t m = oldc; m < newc; m++) { + uint64_t alloc = + space_map_allocated(vd->vdev_ms[m]->ms_sm); + if (alloc < smallest) { + slog_msid = m; + smallest = alloc; + } + } + metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; + /* + * The metaslab was marked as dirty at the end of + * metaslab_init(). Remove it from the dirty list so that we + * can uninitialize and reinitialize it to the new class. + */ + if (txg != 0) { + (void) txg_list_remove_this(&vd->vdev_ms_list, + slog_ms, txg); + } + uint64_t sm_obj = space_map_object(slog_ms->ms_sm); + metaslab_fini(slog_ms); + VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, + &vd->vdev_ms[slog_msid])); + } + if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); @@ -1418,6 +1500,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) */ if (!expanding && !vd->vdev_removing) { metaslab_group_activate(vd->vdev_mg); + if (vd->vdev_log_mg != NULL) + metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) @@ -1453,7 +1537,12 @@ vdev_metaslab_fini(vdev_t *vd) if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; + metaslab_group_passivate(mg); + if (vd->vdev_log_mg != NULL) { + ASSERT(!vd->vdev_islog); + metaslab_group_passivate(vd->vdev_log_mg); + } uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { @@ -1463,11 +1552,13 @@ vdev_metaslab_fini(vdev_t *vd) } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; - vd->vdev_ms_count = 0; - for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); + if (vd->vdev_log_mg != NULL) + ASSERT0(vd->vdev_log_mg->mg_histogram[i]); + } } ASSERT0(vd->vdev_ms_count); ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); @@ -1634,6 +1725,14 @@ vdev_probe(vdev_t *vd, zio_t *zio) } static void +vdev_load_child(void *arg) +{ + vdev_t *vd = arg; + + vd->vdev_load_error = vdev_load(vd); +} + +static void vdev_open_child(void *arg) { vdev_t *vd = arg; @@ -2037,6 +2136,16 @@ vdev_open(vdev_t *vd) return (0); } +static void +vdev_validate_child(void *arg) +{ + vdev_t *vd = arg; + + vd->vdev_validate_thread = curthread; + vd->vdev_validate_error = vdev_validate(vd); + vd->vdev_validate_thread = NULL; +} + /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't @@ -2051,18 +2160,43 @@ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + taskq_t *tq = NULL; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; + int children = vd->vdev_children; if (vdev_validate_skip) return (0); - for (uint64_t c = 0; c < vd->vdev_children; c++) - if (vdev_validate(vd->vdev_child[c]) != 0) + if (children > 0) { + tq = taskq_create("vdev_validate", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + } + + for (uint64_t c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (tq == NULL || vdev_uses_zvols(cvd)) { + vdev_validate_child(cvd); + } else { + VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd, + TQ_SLEEP) != TASKQID_INVALID); + } + } + if (tq != NULL) { + taskq_wait(tq); + taskq_destroy(tq); + } + for (int c = 0; c < children; c++) { + int error = vd->vdev_child[c]->vdev_validate_error; + + if (error != 0) return (SET_ERROR(EBADF)); + } + /* * If the device has already failed, or was marked offline, don't do @@ -3259,18 +3393,46 @@ vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) int vdev_load(vdev_t *vd) { + int children = vd->vdev_children; int error = 0; + taskq_t *tq = NULL; + + /* + * It's only worthwhile to use the taskq for the root vdev, because the + * slow part is metaslab_init, and that only happens for top-level + * vdevs. + */ + if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) { + tq = taskq_create("vdev_load", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + } /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) { - error = vdev_load(vd->vdev_child[c]); - if (error != 0) { - return (error); + vdev_t *cvd = vd->vdev_child[c]; + + if (tq == NULL || vdev_uses_zvols(cvd)) { + cvd->vdev_load_error = vdev_load(cvd); + } else { + VERIFY(taskq_dispatch(tq, vdev_load_child, + cvd, TQ_SLEEP) != TASKQID_INVALID); } } + if (tq != NULL) { + taskq_wait(tq); + taskq_destroy(tq); + } + + for (int c = 0; c < vd->vdev_children; c++) { + int error = vd->vdev_child[c]->vdev_load_error; + + if (error != 0) + return (error); + } + vdev_set_deflate_ratio(vd); /* @@ -3531,8 +3693,11 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) != NULL) metaslab_sync_done(msp, txg); - if (reassess) + if (reassess) { metaslab_sync_reassess(vd->vdev_mg); + if (vd->vdev_log_mg != NULL) + metaslab_sync_reassess(vd->vdev_log_mg); + } } void @@ -3856,6 +4021,7 @@ top: /* * Prevent any future allocations. */ + ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); @@ -4256,6 +4422,12 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { + /* + * The vdev fragmentation rating doesn't take into + * account the embedded slog metaslab (vdev_log_mg). + * Since it's only one metaslab, it would have a tiny + * impact on the overall fragmentation. + */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } @@ -5234,6 +5406,9 @@ ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); +ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW, + "Minimum number of metaslabs required to dedicate one for log blocks"); + ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_ulong, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c index 6b7ad7021a50..a4f48cf744b0 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_draid.c +++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c @@ -716,7 +716,7 @@ vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) offset = 0; for (; x < rr->rr_cols; x++) { - abd_put(rr->rr_col[x].rc_abd); + abd_free(rr->rr_col[x].rc_abd); if (offset == good_size) { /* empty data column (small write) */ @@ -754,11 +754,7 @@ vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) offset = 0; for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) { - if (offset == good_size || x < rr->rr_bigcols) - abd_put(rr->rr_col[x].rc_abd); - else - abd_free(rr->rr_col[x].rc_abd); - + abd_free(rr->rr_col[x].rc_abd); rr->rr_col[x].rc_abd = abd_get_offset_size( rr->rr_abd_copy, offset, rr->rr_col[x].rc_size); @@ -797,7 +793,7 @@ vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); - abd_put((abd_t *)good); + abd_free((abd_t *)good); } /* @@ -852,11 +848,7 @@ vdev_draid_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) offset, col->rc_size); abd_copy(tmp, col->rc_abd, col->rc_size); - - if (abd_is_gang(col->rc_abd)) - abd_free(col->rc_abd); - else - abd_put(col->rc_abd); + abd_free(col->rc_abd); col->rc_abd = tmp; offset += col->rc_size; @@ -902,12 +894,12 @@ vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) rc->rc_abd = abd_get_zeros(skip_size); } else if (rc->rc_size == parity_size) { /* this is a "big column" */ - rc->rc_abd = abd_get_offset_size(zio->io_abd, - abd_off, rc->rc_size); + rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, + zio->io_abd, abd_off, rc->rc_size); } else { /* short data column, add a skip sector */ ASSERT3U(rc->rc_size + skip_size, ==, parity_size); - rc->rc_abd = abd_alloc_gang_abd(); + rc->rc_abd = abd_alloc_gang(); abd_gang_add(rc->rc_abd, abd_get_offset_size( zio->io_abd, abd_off, rc->rc_size), B_TRUE); abd_gang_add(rc->rc_abd, abd_get_zeros(skip_size), @@ -958,13 +950,13 @@ vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) skip_off += skip_size; } else if (rc->rc_size == parity_size) { /* this is a "big column" */ - rc->rc_abd = abd_get_offset_size(zio->io_abd, - abd_off, rc->rc_size); + rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, + zio->io_abd, abd_off, rc->rc_size); } else { /* short data column, add a skip sector */ ASSERT3U(rc->rc_size + skip_size, ==, parity_size); ASSERT3U(rr->rr_nempty, !=, 0); - rc->rc_abd = abd_alloc_gang_abd(); + rc->rc_abd = abd_alloc_gang(); abd_gang_add(rc->rc_abd, abd_get_offset_size( zio->io_abd, abd_off, rc->rc_size), B_TRUE); abd_gang_add(rc->rc_abd, abd_get_offset_size( @@ -1006,8 +998,8 @@ vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size > 0) { - rc->rc_abd = abd_get_offset_size(zio->io_abd, - abd_off, rc->rc_size); + rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, + zio->io_abd, abd_off, rc->rc_size); abd_off += rc->rc_size; } } @@ -1056,7 +1048,7 @@ vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) ASSERT3P(rc->rc_abd, !=, NULL); ASSERT(!abd_is_gang(rc->rc_abd)); abd_t *read_abd = rc->rc_abd; - rc->rc_abd = abd_alloc_gang_abd(); + rc->rc_abd = abd_alloc_gang(); abd_gang_add(rc->rc_abd, read_abd, B_TRUE); abd_gang_add(rc->rc_abd, abd_get_offset_size( rr->rr_abd_empty, skip_off, skip_size), B_TRUE); diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c index 07d1c922a50c..b26d0993711a 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c @@ -1187,7 +1187,7 @@ vdev_indirect_child_io_done(zio_t *zio) pio->io_error = zio_worst_error(pio->io_error, zio->io_error); mutex_exit(&pio->io_lock); - abd_put(zio->io_abd); + abd_free(zio->io_abd); } /* diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index fbd117d2d9ae..04202a9f8960 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -754,16 +754,17 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; - vdev_phys_t *vp; - abd_t *vp_abd; - zio_t *zio; + vdev_phys_t *vp[VDEV_LABELS]; + abd_t *vp_abd[VDEV_LABELS]; + zio_t *zio[VDEV_LABELS]; uint64_t best_txg = 0; uint64_t label_txg = 0; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT(vd->vdev_validate_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (!vdev_readable(vd)) return (NULL); @@ -776,21 +777,24 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (vd->vdev_ops == &vdev_draid_spare_ops) return (vdev_draid_read_config_spare(vd)); - vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); - vp = abd_to_buf(vp_abd); + for (int l = 0; l < VDEV_LABELS; l++) { + vp_abd[l] = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + vp[l] = abd_to_buf(vp_abd[l]); + } retry: for (int l = 0; l < VDEV_LABELS; l++) { - nvlist_t *label = NULL; - - zio = zio_root(spa, NULL, NULL, flags); + zio[l] = zio_root(spa, NULL, NULL, flags); - vdev_label_read(zio, vd, l, vp_abd, - offsetof(vdev_label_t, vl_vdev_phys), - sizeof (vdev_phys_t), NULL, NULL, flags); + vdev_label_read(zio[l], vd, l, vp_abd[l], + offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), + NULL, NULL, flags); + } + for (int l = 0; l < VDEV_LABELS; l++) { + nvlist_t *label = NULL; - if (zio_wait(zio) == 0 && - nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), + if (zio_wait(zio[l]) == 0 && + nvlist_unpack(vp[l]->vp_nvlist, sizeof (vp[l]->vp_nvlist), &label, 0) == 0) { /* * Auxiliary vdevs won't have txg values in their @@ -803,6 +807,8 @@ retry: ZPOOL_CONFIG_POOL_TXG, &label_txg); if ((error || label_txg == 0) && !config) { config = label; + for (l++; l < VDEV_LABELS; l++) + zio_wait(zio[l]); break; } else if (label_txg <= txg && label_txg > best_txg) { best_txg = label_txg; @@ -831,7 +837,9 @@ retry: (u_longlong_t)txg); } - abd_free(vp_abd); + for (int l = 0; l < VDEV_LABELS; l++) { + abd_free(vp_abd[l]); + } return (config); } diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index 02040c3ee198..25a4bc69cc23 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -789,7 +789,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) size = IO_SPAN(first, last); ASSERT3U(size, <=, maxblocksize); - abd = abd_alloc_gang_abd(); + abd = abd_alloc_gang(); if (abd == NULL) return (NULL); diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index 5b152f38bd63..f4812e61252c 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -138,30 +138,15 @@ static void vdev_raidz_row_free(raidz_row_t *rr) { - int c; - - for (c = 0; c < rr->rr_firstdatacol && c < rr->rr_cols; c++) { - abd_free(rr->rr_col[c].rc_abd); + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; - if (rr->rr_col[c].rc_gdata != NULL) { - abd_free(rr->rr_col[c].rc_gdata); - } - if (rr->rr_col[c].rc_orig_data != NULL) { - zio_buf_free(rr->rr_col[c].rc_orig_data, - rr->rr_col[c].rc_size); - } - } - for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { - if (rr->rr_col[c].rc_size != 0) { - if (abd_is_gang(rr->rr_col[c].rc_abd)) - abd_free(rr->rr_col[c].rc_abd); - else - abd_put(rr->rr_col[c].rc_abd); - } - if (rr->rr_col[c].rc_orig_data != NULL) { - zio_buf_free(rr->rr_col[c].rc_orig_data, - rr->rr_col[c].rc_size); - } + if (rc->rc_size != 0) + abd_free(rc->rc_abd); + if (rc->rc_gdata != NULL) + abd_free(rc->rc_gdata); + if (rc->rc_orig_data != NULL) + zio_buf_free(rc->rc_orig_data, rc->rc_size); } if (rr->rr_abd_copy != NULL) @@ -249,7 +234,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) /* fill in the data columns from good_data */ offset = 0; for (; x < rr->rr_cols; x++) { - abd_put(rr->rr_col[x].rc_abd); + abd_free(rr->rr_col[x].rc_abd); rr->rr_col[x].rc_abd = abd_get_offset_size((abd_t *)good_data, @@ -268,7 +253,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) offset = 0; for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) { - abd_put(rr->rr_col[x].rc_abd); + abd_free(rr->rr_col[x].rc_abd); rr->rr_col[x].rc_abd = abd_get_offset_size( rr->rr_abd_copy, offset, rr->rr_col[x].rc_size); @@ -291,7 +276,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); - abd_put((abd_t *)good); + abd_free((abd_t *)good); } /* @@ -344,7 +329,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) abd_copy(tmp, col->rc_abd, col->rc_size); - abd_put(col->rc_abd); + abd_free(col->rc_abd); col->rc_abd = tmp; offset += col->rc_size; @@ -379,7 +364,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << ashift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; - uint64_t off = 0; raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); @@ -477,13 +461,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, rr->rr_col[c].rc_abd = abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); - rr->rr_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, - rr->rr_col[c].rc_size); - off = rr->rr_col[c].rc_size; - - for (c = c + 1; c < acols; c++) { + for (uint64_t off = 0; c < acols; c++) { raidz_col_t *rc = &rr->rr_col[c]; - rc->rc_abd = abd_get_offset_size(zio->io_abd, off, rc->rc_size); + rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, + zio->io_abd, off, rc->rc_size); off += rc->rc_size; } diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index 6eaaddd3979f..a758fe4fb343 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -1207,6 +1207,11 @@ vdev_remove_complete(spa_t *spa) vd->vdev_mg = NULL; spa_log_sm_set_blocklimit(spa); } + if (vd->vdev_log_mg != NULL) { + ASSERT0(vd->vdev_ms_count); + metaslab_group_destroy(vd->vdev_log_mg); + vd->vdev_log_mg = NULL; + } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); @@ -1780,6 +1785,8 @@ spa_vdev_remove_cancel_impl(spa_t *spa) spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); vdev_t *vd = vdev_lookup_top(spa, vdid); metaslab_group_activate(vd->vdev_mg); + ASSERT(!vd->vdev_islog); + metaslab_group_activate(vd->vdev_log_mg); spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); } @@ -1858,6 +1865,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); + ASSERT3P(vd->vdev_log_mg, ==, NULL); ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* @@ -1893,6 +1901,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) if (error != 0) { metaslab_group_activate(mg); + ASSERT3P(vd->vdev_log_mg, ==, NULL); return (error); } ASSERT0(vd->vdev_stat.vs_alloc); @@ -2121,6 +2130,8 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) */ metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); + ASSERT(!vd->vdev_islog); + metaslab_group_passivate(vd->vdev_log_mg); /* * Wait for the youngest allocations and frees to sync, @@ -2157,6 +2168,8 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) if (error != 0) { metaslab_group_activate(mg); + ASSERT(!vd->vdev_islog); + metaslab_group_activate(vd->vdev_log_mg); spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index 8eb9474cadb0..0e35fd069cbb 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -7418,6 +7418,7 @@ zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) size_t saved_poolname_len = 0; nvlist_t *innvl = NULL; fstrans_cookie_t cookie; + hrtime_t start_time = gethrtime(); cmd = vecnum; error = 0; @@ -7576,6 +7577,8 @@ zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, error); } + fnvlist_add_int64(lognv, ZPOOL_HIST_ELAPSED_NS, + gethrtime() - start_time); (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } diff --git a/sys/contrib/openzfs/module/zfs/zfs_sa.c b/sys/contrib/openzfs/module/zfs/zfs_sa.c index cbb773ffbdfa..67be131da63b 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_sa.c +++ b/sys/contrib/openzfs/module/zfs/zfs_sa.c @@ -71,7 +71,7 @@ sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { #ifdef _KERNEL int -zfs_sa_readlink(znode_t *zp, uio_t *uio) +zfs_sa_readlink(znode_t *zp, zfs_uio_t *uio) { dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); size_t bufsz; @@ -79,15 +79,16 @@ zfs_sa_readlink(znode_t *zp, uio_t *uio) bufsz = zp->z_size; if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { - error = uiomove((caddr_t)db->db_data + + error = zfs_uiomove((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - MIN((size_t)bufsz, uio_resid(uio)), UIO_READ, uio); + MIN((size_t)bufsz, zfs_uio_resid(uio)), UIO_READ, uio); } else { dmu_buf_t *dbp; if ((error = dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { - error = uiomove(dbp->db_data, - MIN((size_t)bufsz, uio_resid(uio)), UIO_READ, uio); + error = zfs_uiomove(dbp->db_data, + MIN((size_t)bufsz, zfs_uio_resid(uio)), UIO_READ, + uio); dmu_buf_rele(dbp, FTAG); } } diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index 2dcc231b30b6..61d5f06c6455 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -187,7 +187,7 @@ static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ */ /* ARGSUSED */ int -zfs_read(struct znode *zp, uio_t *uio, int ioflag, cred_t *cr) +zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { int error = 0; boolean_t frsync = B_FALSE; @@ -210,7 +210,7 @@ zfs_read(struct znode *zp, uio_t *uio, int ioflag, cred_t *cr) /* * Validate file offset */ - if (uio->uio_loffset < (offset_t)0) { + if (zfs_uio_offset(uio) < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } @@ -218,7 +218,7 @@ zfs_read(struct znode *zp, uio_t *uio, int ioflag, cred_t *cr) /* * Fasttrack empty reads */ - if (uio->uio_resid == 0) { + if (zfs_uio_resid(uio) == 0) { ZFS_EXIT(zfsvfs); return (0); } @@ -242,26 +242,26 @@ zfs_read(struct znode *zp, uio_t *uio, int ioflag, cred_t *cr) * Lock the range against changes. */ zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, - uio->uio_loffset, uio->uio_resid, RL_READER); + zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ - if (uio->uio_loffset >= zp->z_size) { + if (zfs_uio_offset(uio) >= zp->z_size) { error = 0; goto out; } - ASSERT(uio->uio_loffset < zp->z_size); - ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); + ASSERT(zfs_uio_offset(uio) < zp->z_size); + ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio)); ssize_t start_resid = n; while (n > 0) { ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - - P2PHASE(uio->uio_loffset, zfs_vnops_read_chunk_size)); + P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); #ifdef UIO_NOCOPY - if (uio->uio_segflg == UIO_NOCOPY) + if (zfs_uio_segflg(uio) == UIO_NOCOPY) error = mappedread_sf(zp, nbytes, uio); else #endif @@ -314,10 +314,10 @@ out: /* ARGSUSED */ int -zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) +zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { int error = 0; - ssize_t start_resid = uio->uio_resid; + ssize_t start_resid = zfs_uio_resid(uio); /* * Fasttrack empty write @@ -354,7 +354,7 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) */ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && - (uio->uio_loffset < zp->z_size))) { + (zfs_uio_offset(uio) < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } @@ -362,7 +362,7 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) /* * Validate file offset */ - offset_t woff = ioflag & O_APPEND ? zp->z_size : uio->uio_loffset; + offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); @@ -375,7 +375,7 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EFAULT)); } @@ -399,7 +399,7 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) */ woff = zp->z_size; } - uio->uio_loffset = woff; + zfs_uio_setoffset(uio, woff); } else { /* * Note that if the file block size will change as a result of @@ -409,7 +409,7 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } - if (zn_rlimit_fsize(zp, uio, uio->uio_td)) { + if (zn_rlimit_fsize(zp, uio)) { zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); @@ -439,7 +439,7 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) * and allows us to do more fine-grained space accounting. */ while (n > 0) { - woff = uio->uio_loffset; + woff = zfs_uio_offset(uio); if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || @@ -467,7 +467,7 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = uiocopy(abuf->b_data, max_blksz, + if ((error = zfs_uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes))) { dmu_return_arcbuf(abuf); break; @@ -528,11 +528,11 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) ssize_t tx_bytes; if (abuf == NULL) { - tx_bytes = uio->uio_resid; - uio_fault_disable(uio, B_TRUE); + tx_bytes = zfs_uio_resid(uio); + zfs_uio_fault_disable(uio, B_TRUE); error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); - uio_fault_disable(uio, B_FALSE); + zfs_uio_fault_disable(uio, B_FALSE); #ifdef __linux__ if (error == EFAULT) { dmu_tx_commit(tx); @@ -540,12 +540,13 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) * Account for partial writes before * continuing the loop. * Update needs to occur before the next - * uio_prefaultpages, or prefaultpages may + * zfs_uio_prefaultpages, or prefaultpages may * error, and we may break the loop early. */ - if (tx_bytes != uio->uio_resid) - n -= tx_bytes - uio->uio_resid; - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + if (tx_bytes != zfs_uio_resid(uio)) + n -= tx_bytes - zfs_uio_resid(uio); + if (zfs_uio_prefaultpages(MIN(n, max_blksz), + uio)) { break; } continue; @@ -555,7 +556,7 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) dmu_tx_commit(tx); break; } - tx_bytes -= uio->uio_resid; + tx_bytes -= zfs_uio_resid(uio); } else { /* Implied by abuf != NULL: */ ASSERT3S(n, >=, max_blksz); @@ -580,8 +581,8 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) dmu_tx_commit(tx); break; } - ASSERT3S(nbytes, <=, uio->uio_resid); - uioskip(uio, nbytes); + ASSERT3S(nbytes, <=, zfs_uio_resid(uio)); + zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } if (tx_bytes && zn_has_cached_data(zp) && @@ -632,9 +633,9 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ - while ((end_size = zp->z_size) < uio->uio_loffset) { + while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { (void) atomic_cas_64(&zp->z_size, end_size, - uio->uio_loffset); + zfs_uio_offset(uio)); ASSERT(error == 0); } /* @@ -657,14 +658,14 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) n -= nbytes; if (n > 0) { - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { error = SET_ERROR(EFAULT); break; } } } - zfs_inode_update(zp); + zfs_znode_update_vfs(zp); zfs_rangelock_exit(lr); /* @@ -672,7 +673,7 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) * uio data is inaccessible return an error. Otherwise, it's * at least a partial write, so it's successful. */ - if (zfsvfs->z_replay || uio->uio_resid == start_resid || + if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || error == EFAULT) { ZFS_EXIT(zfsvfs); return (error); @@ -682,7 +683,7 @@ zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); - const int64_t nwritten = start_resid - uio->uio_resid; + const int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); task_io_account_write(nwritten); diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index 632fef29bff4..7b52f9249298 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -1230,7 +1230,7 @@ zil_lwb_write_done(zio_t *zio) ASSERT(!BP_IS_HOLE(zio->io_bp)); ASSERT(BP_GET_FILL(zio->io_bp) == 0); - abd_put(zio->io_abd); + abd_free(zio->io_abd); mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index dfd92b893b9f..7f3cb19d46db 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -2481,7 +2481,7 @@ zio_resume_wait(spa_t *spa) static void zio_gang_issue_func_done(zio_t *zio) { - abd_put(zio->io_abd); + abd_free(zio->io_abd); } static zio_t * @@ -2525,7 +2525,7 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); - abd_put(buf); + abd_free(buf); } /* * If we are here to damage data for testing purposes, @@ -2653,7 +2653,7 @@ zio_gang_tree_assemble_done(zio_t *zio) ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); - abd_put(zio->io_abd); + abd_free(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; @@ -2777,14 +2777,13 @@ zio_write_gang_done(zio_t *zio) * check for it here as it is cleared in zio_ready. */ if (zio->io_abd != NULL) - abd_put(zio->io_abd); + abd_free(zio->io_abd); } static zio_t * -zio_write_gang_block(zio_t *pio) +zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { spa_t *spa = pio->io_spa; - metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; @@ -3501,6 +3500,17 @@ zio_dva_allocate(zio_t *zio) zio->io_metaslab_class = mc; } + /* + * Try allocating the block in the usual metaslab class. + * If that's full, allocate it in the normal class. + * If that's full, allocate as a gang block, + * and if all are full, the allocation fails (which shouldn't happen). + * + * Note that we do not fall back on embedded slog (ZIL) space, to + * preserve unfragmented slog space, which is critical for decent + * sync write performance. If a log allocation fails, we will fall + * back to spa_sync() which is abysmal for performance. + */ error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); @@ -3520,26 +3530,38 @@ zio_dva_allocate(zio_t *zio) zio->io_prop.zp_copies, zio->io_allocator, zio); zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; - mc = spa_normal_class(spa); - VERIFY(metaslab_class_throttle_reserve(mc, + VERIFY(metaslab_class_throttle_reserve( + spa_normal_class(spa), zio->io_prop.zp_copies, zio->io_allocator, zio, flags | METASLAB_MUST_RESERVE)); - } else { - mc = spa_normal_class(spa); } - zio->io_metaslab_class = mc; + zio->io_metaslab_class = mc = spa_normal_class(spa); + if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { + zfs_dbgmsg("%s: metaslab allocation failure, " + "trying normal class: zio %px, size %llu, error %d", + spa_name(spa), zio, zio->io_size, error); + } error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); } + if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { + if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { + zfs_dbgmsg("%s: metaslab allocation failure, " + "trying ganging: zio %px, size %llu, error %d", + spa_name(spa), zio, zio->io_size, error); + } + return (zio_write_gang_block(zio, mc)); + } if (error != 0) { - zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " - "size %llu, error %d", spa_name(spa), zio, zio->io_size, - error); - if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) - return (zio_write_gang_block(zio)); + if (error != ENOSPC || + (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) { + zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " + "size %llu, error %d", + spa_name(spa), zio, zio->io_size, error); + } zio->io_error = error; } @@ -3619,15 +3641,18 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, int flags = METASLAB_FASTWRITE | METASLAB_ZIL; int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; - error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, - 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); - if (error == 0) { - *slog = TRUE; - } else { - error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, - 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); - if (error == 0) - *slog = FALSE; + error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, + txg, NULL, flags, &io_alloc_list, NULL, allocator); + *slog = (error == 0); + if (error != 0) { + error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, + new_bp, 1, txg, NULL, flags, + &io_alloc_list, NULL, allocator); + } + if (error != 0) { + error = metaslab_alloc(spa, spa_normal_class(spa), size, + new_bp, 1, txg, NULL, flags, + &io_alloc_list, NULL, allocator); } metaslab_trace_fini(&io_alloc_list); |