diff options
Diffstat (limited to 'sys/contrib/openzfs/module')
115 files changed, 2185 insertions, 962 deletions
diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in index 7a20e6ee4615..581d50e64b42 100644 --- a/sys/contrib/openzfs/module/Kbuild.in +++ b/sys/contrib/openzfs/module/Kbuild.in @@ -150,12 +150,8 @@ $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : ccflags-y += -I$(icp_include) -# Suppress objtool "can't find jump dest instruction at" warnings. They -# are caused by the constants which are defined in the text section of the -# assembly file using .byte instructions (e.g. bswap_mask). The objtool -# utility tries to interpret them as opcodes and obviously fails doing so. +# Suppress objtool "return with modified stack frame" warnings. OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y -OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y # Suppress objtool "unsupported stack pointer realignment" warnings. We are # not using a DRAP register while aligning the stack to a 64 byte boundary. 
diff --git a/sys/contrib/openzfs/module/icp/algs/blake3/blake3.c b/sys/contrib/openzfs/module/icp/algs/blake3/blake3.c index 5f7018598820..604e05847ee6 100644 --- a/sys/contrib/openzfs/module/icp/algs/blake3/blake3.c +++ b/sys/contrib/openzfs/module/icp/algs/blake3/blake3.c @@ -189,9 +189,7 @@ static void chunk_state_update(const blake3_ops_t *ops, input_len -= BLAKE3_BLOCK_LEN; } - size_t take = chunk_state_fill_buf(ctx, input, input_len); - input += take; - input_len -= take; + chunk_state_fill_buf(ctx, input, input_len); } static output_t chunk_state_output(const blake3_chunk_state_t *ctx) diff --git a/sys/contrib/openzfs/module/icp/algs/modes/ccm.c b/sys/contrib/openzfs/module/icp/algs/modes/ccm.c index ed5498dafaa1..4a8bb9bbc2c8 100644 --- a/sys/contrib/openzfs/module/icp/algs/modes/ccm.c +++ b/sys/contrib/openzfs/module/icp/algs/modes/ccm.c @@ -67,7 +67,6 @@ ccm_mode_encrypt_contiguous_blocks(ccm_ctx_t *ctx, char *data, size_t length, return (CRYPTO_SUCCESS); } - lastp = (uint8_t *)ctx->ccm_cb; crypto_init_ptrs(out, &iov_or_mp, &offset); mac_buf = (uint8_t *)ctx->ccm_mac_buf; diff --git a/sys/contrib/openzfs/module/icp/algs/modes/ctr.c b/sys/contrib/openzfs/module/icp/algs/modes/ctr.c index c116ba3662ba..db6b1c71d5cd 100644 --- a/sys/contrib/openzfs/module/icp/algs/modes/ctr.c +++ b/sys/contrib/openzfs/module/icp/algs/modes/ctr.c @@ -60,7 +60,6 @@ ctr_mode_contiguous_blocks(ctr_ctx_t *ctx, char *data, size_t length, return (CRYPTO_SUCCESS); } - lastp = (uint8_t *)ctx->ctr_cb; crypto_init_ptrs(out, &iov_or_mp, &offset); do { diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c index ca328d54a7e6..16ef14b8ccaf 100644 --- a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c +++ b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c @@ -118,7 +118,6 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, return (CRYPTO_SUCCESS); } - lastp = (uint8_t *)ctx->gcm_cb; crypto_init_ptrs(out, 
&iov_or_mp, &offset); gops = gcm_impl_get_ops(); diff --git a/sys/contrib/openzfs/module/icp/algs/modes/modes.c b/sys/contrib/openzfs/module/icp/algs/modes/modes.c index b98db0ac14ec..2d1b5ff1a919 100644 --- a/sys/contrib/openzfs/module/icp/algs/modes/modes.c +++ b/sys/contrib/openzfs/module/icp/algs/modes/modes.c @@ -106,8 +106,10 @@ crypto_get_ptrs(crypto_data_t *out, void **iov_or_mp, offset_t *current_offset, } else { /* one block spans two iovecs */ *out_data_1_len = iov_len - offset; - if (vec_idx == zfs_uio_iovcnt(uio)) + if (vec_idx == zfs_uio_iovcnt(uio)) { + *out_data_2 = NULL; return; + } vec_idx++; zfs_uio_iov_at_index(uio, vec_idx, &iov_base, &iov_len); *out_data_2 = (uint8_t *)iov_base; diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S index f546e8933be1..a0525dd464f5 100644 --- a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S @@ -704,6 +704,7 @@ enc_tab: ENTRY_NP(aes_encrypt_amd64) + ENDBR #ifdef GLADMAN_INTERFACE // Original interface sub $[4*8], %rsp // gnu/linux/opensolaris binary interface @@ -809,6 +810,7 @@ dec_tab: ENTRY_NP(aes_decrypt_amd64) + ENDBR #ifdef GLADMAN_INTERFACE // Original interface sub $[4*8], %rsp // gnu/linux/opensolaris binary interface diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx2.S b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx2.S index f4d9cb766d46..cb08430b81ed 100644 --- a/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx2.S +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx2.S @@ -30,16 +30,6 @@ #define _ASM #include <sys/asm_linkage.h> -#if defined(__ELF__) && defined(__CET__) && defined(__has_include) -#if __has_include(<cet.h>) -#include <cet.h> -#endif -#endif - -#if !defined(_CET_ENDBR) -#define _CET_ENDBR -#endif - .intel_syntax noprefix .global zfs_blake3_hash_many_avx2 .text @@ -47,7 +37,7 @@ 
.type zfs_blake3_hash_many_avx2,@function .p2align 6 zfs_blake3_hash_many_avx2: - _CET_ENDBR + ENDBR push r15 push r14 push r13 diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx512.S b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx512.S index 71b5715c88c1..960406ea2c01 100644 --- a/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx512.S +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_avx512.S @@ -30,16 +30,6 @@ #define _ASM #include <sys/asm_linkage.h> -#if defined(__ELF__) && defined(__CET__) && defined(__has_include) -#if __has_include(<cet.h>) -#include <cet.h> -#endif -#endif - -#if !defined(_CET_ENDBR) -#define _CET_ENDBR -#endif - .intel_syntax noprefix .global zfs_blake3_hash_many_avx512 .global zfs_blake3_compress_in_place_avx512 @@ -52,7 +42,7 @@ .p2align 6 zfs_blake3_hash_many_avx512: - _CET_ENDBR + ENDBR push r15 push r14 push r13 @@ -2409,7 +2399,7 @@ zfs_blake3_hash_many_avx512: jmp 4b .p2align 6 zfs_blake3_compress_in_place_avx512: - _CET_ENDBR + ENDBR vmovdqu xmm0, xmmword ptr [rdi] vmovdqu xmm1, xmmword ptr [rdi+0x10] movzx eax, r8b @@ -2491,7 +2481,7 @@ zfs_blake3_compress_in_place_avx512: .p2align 6 zfs_blake3_compress_xof_avx512: - _CET_ENDBR + ENDBR vmovdqu xmm0, xmmword ptr [rdi] vmovdqu xmm1, xmmword ptr [rdi+0x10] movzx eax, r8b diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse2.S b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse2.S index 20689a7dcef5..c4290aaa8faf 100644 --- a/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse2.S +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse2.S @@ -30,16 +30,6 @@ #define _ASM #include <sys/asm_linkage.h> -#if defined(__ELF__) && defined(__CET__) && defined(__has_include) -#if __has_include(<cet.h>) -#include <cet.h> -#endif -#endif - -#if !defined(_CET_ENDBR) -#define _CET_ENDBR -#endif - .intel_syntax noprefix .global zfs_blake3_hash_many_sse2 .global zfs_blake3_compress_in_place_sse2 
@@ -52,7 +42,7 @@ .p2align 6 zfs_blake3_hash_many_sse2: - _CET_ENDBR + ENDBR push r15 push r14 push r13 @@ -2050,7 +2040,7 @@ zfs_blake3_hash_many_sse2: .p2align 6 zfs_blake3_compress_in_place_sse2: - _CET_ENDBR + ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] @@ -2161,7 +2151,7 @@ zfs_blake3_compress_in_place_sse2: .p2align 6 zfs_blake3_compress_xof_sse2: - _CET_ENDBR + ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse41.S b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse41.S index c5975a4f0877..45b90cc9ed89 100644 --- a/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse41.S +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse41.S @@ -30,16 +30,6 @@ #define _ASM #include <sys/asm_linkage.h> -#if defined(__ELF__) && defined(__CET__) && defined(__has_include) -#if __has_include(<cet.h>) -#include <cet.h> -#endif -#endif - -#if !defined(_CET_ENDBR) -#define _CET_ENDBR -#endif - .intel_syntax noprefix .global zfs_blake3_compress_in_place_sse41 .global zfs_blake3_compress_xof_sse41 @@ -52,7 +42,7 @@ .p2align 6 zfs_blake3_hash_many_sse41: - _CET_ENDBR + ENDBR push r15 push r14 push r13 @@ -1812,7 +1802,7 @@ zfs_blake3_hash_many_sse41: jmp 4b .p2align 6 zfs_blake3_compress_in_place_sse41: - _CET_ENDBR + ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] @@ -1911,7 +1901,7 @@ zfs_blake3_compress_in_place_sse41: RET .p2align 6 zfs_blake3_compress_xof_sse41: - _CET_ENDBR + ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S index 70e419c2e4ab..cf17b3768712 100644 --- 
a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -47,6 +47,9 @@ #if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) +#define _ASM +#include <sys/asm_linkage.h> + .extern gcm_avx_can_use_movbe .text @@ -56,6 +59,7 @@ .align 32 _aesni_ctr32_ghash_6x: .cfi_startproc + ENDBR vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 @@ -363,7 +367,7 @@ _aesni_ctr32_ghash_6x: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 - .byte 0xf3,0xc3 + RET .cfi_endproc .size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x #endif /* ifdef HAVE_MOVBE */ @@ -372,6 +376,7 @@ _aesni_ctr32_ghash_6x: .align 32 _aesni_ctr32_ghash_no_movbe_6x: .cfi_startproc + ENDBR vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 @@ -691,7 +696,7 @@ _aesni_ctr32_ghash_no_movbe_6x: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 - .byte 0xf3,0xc3 + RET .cfi_endproc .size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x @@ -700,6 +705,7 @@ _aesni_ctr32_ghash_no_movbe_6x: .align 32 aesni_gcm_decrypt: .cfi_startproc + ENDBR xorq %r10,%r10 cmpq $0x60,%rdx jb .Lgcm_dec_abort @@ -810,13 +816,14 @@ aesni_gcm_decrypt: .cfi_def_cfa_register %rsp .Lgcm_dec_abort: movq %r10,%rax - .byte 0xf3,0xc3 + RET .cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt .type _aesni_ctr32_6x,@function .align 32 _aesni_ctr32_6x: .cfi_startproc + ENDBR vmovdqu 0-128(%rcx),%xmm4 vmovdqu 32(%r11),%xmm2 leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. 
@@ -880,7 +887,7 @@ _aesni_ctr32_6x: vmovups %xmm14,80(%rsi) leaq 96(%rsi),%rsi - .byte 0xf3,0xc3 + RET .align 32 .Lhandle_ctr32_2: vpshufb %xmm0,%xmm1,%xmm6 @@ -911,6 +918,7 @@ _aesni_ctr32_6x: .align 32 aesni_gcm_encrypt: .cfi_startproc + ENDBR xorq %r10,%r10 cmpq $288,%rdx jb .Lgcm_enc_abort @@ -1186,7 +1194,7 @@ aesni_gcm_encrypt: .cfi_def_cfa_register %rsp .Lgcm_enc_abort: movq %r10,%rax - .byte 0xf3,0xc3 + RET .cfi_endproc .size aesni_gcm_encrypt,.-aesni_gcm_encrypt @@ -1239,6 +1247,7 @@ atomic_toggle_boolean_nv: RET .size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv +.pushsection .rodata .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 @@ -1252,6 +1261,7 @@ atomic_toggle_boolean_nv: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 +.popsection /* Mark the stack non-executable. 
*/ #if defined(__linux__) && defined(__ELF__) diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/ghash-x86_64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/ghash-x86_64.S index 90cc36b43a78..bf3724a23eae 100644 --- a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/ghash-x86_64.S +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/ghash-x86_64.S @@ -97,6 +97,9 @@ #if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) +#define _ASM +#include <sys/asm_linkage.h> + .text .globl gcm_gmult_clmul @@ -104,6 +107,7 @@ .align 16 gcm_gmult_clmul: .cfi_startproc + ENDBR .L_gmult_clmul: movdqu (%rdi),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 @@ -149,7 +153,7 @@ gcm_gmult_clmul: pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) - .byte 0xf3,0xc3 + RET .cfi_endproc .size gcm_gmult_clmul,.-gcm_gmult_clmul @@ -158,6 +162,7 @@ gcm_gmult_clmul: .align 32 gcm_init_htab_avx: .cfi_startproc + ENDBR vzeroupper vmovdqu (%rsi),%xmm2 @@ -262,7 +267,7 @@ gcm_init_htab_avx: vmovdqu %xmm5,-16(%rdi) vzeroupper - .byte 0xf3,0xc3 + RET .cfi_endproc .size gcm_init_htab_avx,.-gcm_init_htab_avx @@ -271,6 +276,7 @@ gcm_init_htab_avx: .align 32 gcm_gmult_avx: .cfi_startproc + ENDBR jmp .L_gmult_clmul .cfi_endproc .size gcm_gmult_avx,.-gcm_gmult_avx @@ -279,6 +285,7 @@ gcm_gmult_avx: .align 32 gcm_ghash_avx: .cfi_startproc + ENDBR vzeroupper vmovdqu (%rdi),%xmm10 @@ -649,9 +656,11 @@ gcm_ghash_avx: vpshufb %xmm13,%xmm10,%xmm10 vmovdqu %xmm10,(%rdi) vzeroupper - .byte 0xf3,0xc3 + RET .cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx + +.pushsection .rodata .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 @@ -705,6 +714,7 @@ gcm_ghash_avx: .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 +.popsection /* Mark the stack non-executable. 
*/ #if defined(__linux__) && defined(__ELF__) diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha256_impl.S b/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha256_impl.S index 1391bd59a017..60d34b4a3be0 100644 --- a/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha256_impl.S +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha256_impl.S @@ -84,6 +84,7 @@ SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) ENTRY_NP(SHA256TransformBlocks) .cfi_startproc + ENDBR movq %rsp, %rax .cfi_def_cfa_register %rax push %rbx diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha512_impl.S b/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha512_impl.S index e61e96957bc6..ed7fb362a1ac 100644 --- a/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha512_impl.S +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha512_impl.S @@ -85,6 +85,7 @@ SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) ENTRY_NP(SHA512TransformBlocks) .cfi_startproc + ENDBR movq %rsp, %rax .cfi_def_cfa_register %rax push %rbx diff --git a/sys/contrib/openzfs/module/icp/include/sys/ia32/asm_linkage.h b/sys/contrib/openzfs/module/icp/include/sys/ia32/asm_linkage.h index 58964c5d4497..e3e769ffd858 100644 --- a/sys/contrib/openzfs/module/icp/include/sys/ia32/asm_linkage.h +++ b/sys/contrib/openzfs/module/icp/include/sys/ia32/asm_linkage.h @@ -30,9 +30,29 @@ #include <sys/stack.h> #include <sys/trap.h> -#if defined(__linux__) && defined(CONFIG_SLS) -#define RET ret; int3 -#else +#if defined(_KERNEL) && defined(__linux__) +#include <linux/linkage.h> +#endif + +#ifndef ENDBR +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +/* CSTYLED */ +#if __has_include(<cet.h>) + +#include <cet.h> + +#ifdef _CET_ENDBR +#define ENDBR _CET_ENDBR +#endif /* _CET_ENDBR */ + +#endif /* <cet.h> */ +#endif /* __ELF__ && __CET__ && __has_include */ +#endif /* !ENDBR */ + +#ifndef ENDBR +#define ENDBR +#endif +#ifndef RET #define RET ret #endif @@ -122,6 +142,7 @@ 
extern "C" { * insert the calls to mcount for profiling. ENTRY_NP is identical, but * never calls mcount. */ +#undef ENTRY #define ENTRY(x) \ .text; \ .align ASM_ENTRY_ALIGN; \ diff --git a/sys/contrib/openzfs/module/icp/io/sha2_mod.c b/sys/contrib/openzfs/module/icp/io/sha2_mod.c index fadb58b81881..a58f0982c8c0 100644 --- a/sys/contrib/openzfs/module/icp/io/sha2_mod.c +++ b/sys/contrib/openzfs/module/icp/io/sha2_mod.c @@ -737,12 +737,15 @@ sha2_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, */ if (mechanism->cm_type % 3 == 2) { if (mechanism->cm_param == NULL || - mechanism->cm_param_len != sizeof (ulong_t)) - ret = CRYPTO_MECHANISM_PARAM_INVALID; - PROV_SHA2_GET_DIGEST_LEN(mechanism, - PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len); - if (PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len > sha_digest_len) + mechanism->cm_param_len != sizeof (ulong_t)) { ret = CRYPTO_MECHANISM_PARAM_INVALID; + } else { + PROV_SHA2_GET_DIGEST_LEN(mechanism, + PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len); + if (PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len > + sha_digest_len) + ret = CRYPTO_MECHANISM_PARAM_INVALID; + } } if (ret != CRYPTO_SUCCESS) { diff --git a/sys/contrib/openzfs/module/lua/lapi.c b/sys/contrib/openzfs/module/lua/lapi.c index 726e5c2ad4bb..703cf4cc2a36 100644 --- a/sys/contrib/openzfs/module/lua/lapi.c +++ b/sys/contrib/openzfs/module/lua/lapi.c @@ -250,6 +250,8 @@ LUA_API int lua_type (lua_State *L, int idx) { LUA_API const char *lua_typename (lua_State *L, int t) { UNUSED(L); + if (t > 8 || t < 0) + return "internal_type_error"; return ttypename(t); } @@ -442,7 +444,7 @@ LUA_API const void *lua_topointer (lua_State *L, int idx) { case LUA_TTABLE: return hvalue(o); case LUA_TLCL: return clLvalue(o); case LUA_TCCL: return clCvalue(o); - case LUA_TLCF: return cast(void *, cast(size_t, fvalue(o))); + case LUA_TLCF: return cast(void *, cast(uintptr_t, fvalue(o))); case LUA_TTHREAD: return thvalue(o); case LUA_TUSERDATA: case LUA_TLIGHTUSERDATA: diff --git 
a/sys/contrib/openzfs/module/lua/ldo.c b/sys/contrib/openzfs/module/lua/ldo.c index 24677596de12..6bef80514ce2 100644 --- a/sys/contrib/openzfs/module/lua/ldo.c +++ b/sys/contrib/openzfs/module/lua/ldo.c @@ -452,7 +452,7 @@ int luaD_poscall (lua_State *L, StkId firstResult) { } res = ci->func; /* res == final position of 1st result */ wanted = ci->nresults; - L->ci = ci = ci->previous; /* back to caller */ + L->ci = ci->previous; /* back to caller */ /* move results to correct place */ for (i = wanted; i != 0 && firstResult < L->top; i--) setobjs2s(L, res++, firstResult++); diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_x86_64.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_x86_64.S index fd661d72eedf..7e13fea05dda 100644 --- a/sys/contrib/openzfs/module/lua/setjmp/setjmp_x86_64.S +++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_x86_64.S @@ -23,7 +23,15 @@ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ +#if defined(_KERNEL) && defined(__linux__) +#include <linux/linkage.h> +#endif + +#ifndef RET +#define RET ret +#endif +#undef ENTRY #define ENTRY(x) \ .text; \ .align 8; \ @@ -34,13 +42,6 @@ x: #define SET_SIZE(x) \ .size x, [.-x] - -#if defined(__linux__) && defined(CONFIG_SLS) -#define RET ret; int3 -#else -#define RET ret -#endif - /* * Setjmp and longjmp implement non-local gotos using state vectors * type label_t. 
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c b/sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c index b692ccdf232a..04a5d2869d1b 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c @@ -1654,13 +1654,13 @@ acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count) */ int ace_trivial_common(void *acep, int aclcnt, - uint64_t (*walk)(void *, uint64_t, int aclcnt, + uintptr_t (*walk)(void *, uintptr_t, int aclcnt, uint16_t *, uint16_t *, uint32_t *)) { uint16_t flags; uint32_t mask; uint16_t type; - uint64_t cookie = 0; + uintptr_t cookie = 0; while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) { switch (flags & ACE_TYPE_FLAGS) { diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c index 523e10ff6936..eb74720c984a 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c @@ -105,3 +105,33 @@ kmem_strfree(char *str) ASSERT3P(str, !=, NULL); kmem_free(str, strlen(str) + 1); } + +/* + * kmem_scnprintf() will return the number of characters that it would have + * printed whenever it is limited by value of the size variable, rather than + * the number of characters that it did print. This can cause misbehavior on + * subsequent uses of the return value, so we define a safe version that will + * return the number of characters actually printed, minus the NULL format + * character. Subsequent use of this by the safe string functions is safe + * whether it is snprintf(), strlcat() or strlcpy(). + */ + +int +kmem_scnprintf(char *restrict str, size_t size, const char *restrict fmt, ...) 
+{ + int n; + va_list ap; + + /* Make the 0 case a no-op so that we do not return -1 */ + if (size == 0) + return (0); + + va_start(ap, fmt); + n = vsnprintf(str, size, fmt, ap); + va_end(ap); + + if (n >= size) + n = size - 1; + + return (n); +} diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c index ff11f5d7acb8..a07098afc5b4 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c @@ -125,7 +125,6 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, struct vfsconf *vfsp; struct mount *mp; vnode_t *vp, *mvp; - struct ucred *pcr, *tcr; int error; ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot"); @@ -195,18 +194,7 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, */ mp->mnt_flag |= MNT_IGNORE; - /* - * XXX: This is evil, but we can't mount a snapshot as a regular user. - * XXX: Is is safe when snapshot is mounted from within a jail? 
- */ - tcr = td->td_ucred; - pcr = td->td_proc->p_ucred; - td->td_ucred = kcred; - td->td_proc->p_ucred = kcred; error = VFS_MOUNT(mp); - td->td_ucred = tcr; - td->td_proc->p_ucred = pcr; - if (error != 0) { /* * Clear VI_MOUNT and decrement the use count "atomically", diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c index 0a19fbba717d..bb3cbc39ec75 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c @@ -124,7 +124,9 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, int vecnum; zfs_iocparm_t *zp; zfs_cmd_t *zc; +#ifdef ZFS_LEGACY_SUPPORT zfs_cmd_legacy_t *zcl; +#endif int rc, error; void *uaddr; @@ -133,7 +135,9 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, zp = (void *)arg; uaddr = (void *)(uintptr_t)zp->zfs_cmd; error = 0; +#ifdef ZFS_LEGACY_SUPPORT zcl = NULL; +#endif if (len != sizeof (zfs_iocparm_t)) { printf("len %d vecnum: %d sizeof (zfs_cmd_t) %ju\n", @@ -142,6 +146,7 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, } zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); +#ifdef ZFS_LEGACY_SUPPORT /* * Remap ioctl code for legacy user binaries */ @@ -157,22 +162,29 @@ zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, goto out; } zfs_cmd_legacy_to_ozfs(zcl, zc); - } else if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) { + } else +#endif + if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) { error = SET_ERROR(EFAULT); goto out; } error = zfsdev_ioctl_common(vecnum, zc, 0); +#ifdef ZFS_LEGACY_SUPPORT if (zcl) { zfs_cmd_ozfs_to_legacy(zc, zcl); rc = copyout(zcl, uaddr, sizeof (*zcl)); - } else { + } else +#endif + { rc = copyout(zc, uaddr, sizeof (*zc)); } if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); out: +#ifdef ZFS_LEGACY_SUPPORT if (zcl) kmem_free(zcl, sizeof (zfs_cmd_legacy_t)); +#endif kmem_free(zc, sizeof (zfs_cmd_t)); 
MPASS(tsd_get(rrw_tsd_key) == NULL); return (error); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index 980bb1c0f941..48af1eaf8ea7 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -137,11 +137,11 @@ SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD, /* arc.c */ int -param_set_arc_long(SYSCTL_HANDLER_ARGS) +param_set_arc_u64(SYSCTL_HANDLER_ARGS) { int err; - err = sysctl_handle_long(oidp, arg1, 0, req); + err = sysctl_handle_64(oidp, arg1, 0, req); if (err != 0 || req->newptr == NULL) return (err); @@ -171,7 +171,7 @@ param_set_arc_max(SYSCTL_HANDLER_ARGS) int err; val = zfs_arc_max; - err = sysctl_handle_long(oidp, &val, 0, req); + err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); @@ -203,7 +203,7 @@ param_set_arc_min(SYSCTL_HANDLER_ARGS) int err; val = zfs_arc_min; - err = sysctl_handle_long(oidp, &val, 0, req); + err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); @@ -599,7 +599,7 @@ param_set_multihost_interval(SYSCTL_HANDLER_ARGS) { int err; - err = sysctl_handle_long(oidp, &zfs_multihost_interval, 0, req); + err = sysctl_handle_64(oidp, &zfs_multihost_interval, 0, req); if (err != 0 || req->newptr == NULL) return (err); @@ -676,7 +676,7 @@ param_set_deadman_synctime(SYSCTL_HANDLER_ARGS) int err; val = zfs_deadman_synctime_ms; - err = sysctl_handle_long(oidp, &val, 0, req); + err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); zfs_deadman_synctime_ms = val; @@ -693,7 +693,7 @@ param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS) int err; val = zfs_deadman_ziotime_ms; - err = sysctl_handle_long(oidp, &val, 0, req); + err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); zfs_deadman_ziotime_ms = val; @@ 
-761,11 +761,11 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, int param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) { - uint64_t val; + int val; int err; val = zfs_vdev_min_auto_ashift; - err = sysctl_handle_64(oidp, &val, 0, req); + err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); @@ -779,20 +779,20 @@ param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, - CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), - param_set_min_auto_ashift, "QU", + param_set_min_auto_ashift, "IU", "Min ashift used when creating new top-level vdev. (LEGACY)"); /* END CSTYLED */ int param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) { - uint64_t val; + int val; int err; val = zfs_vdev_max_auto_ashift; - err = sysctl_handle_64(oidp, &val, 0, req); + err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); @@ -806,9 +806,9 @@ param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) /* BEGIN CSTYLED */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, - CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift), - param_set_max_auto_ashift, "QU", + param_set_max_auto_ashift, "IU", "Max ashift used when optimizing for logical -> physical sector size on" " new top-level vdevs. 
(LEGACY)"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c index 73cc6aa48c0b..a65dfec86caf 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c @@ -40,8 +40,8 @@ static taskq_t *vdev_file_taskq; -static unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; -static unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; +static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; +static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; void vdev_file_init(void) @@ -350,7 +350,7 @@ vdev_ops_t vdev_disk_ops = { #endif -ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW, "Logical ashift for file-based devices"); -ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW, "Physical ashift for file-based devices"); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c index 963102f3b62a..16bcd338de21 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c @@ -631,8 +631,8 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, return (NULL); } -static uint64_t -zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, +static uintptr_t +zfs_ace_walk(void *datap, uintptr_t cookie, int aclcnt, uint16_t *flags, uint16_t *type, uint32_t *mask) { (void) aclcnt; @@ -642,7 +642,7 @@ zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, acep = zfs_acl_next_ace(aclp, acep, &who, mask, flags, type); - return ((uint64_t)(uintptr_t)acep); + return ((uintptr_t)acep); } /* @@ -1133,6 +1133,7 @@ zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, 
cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, cb->cb_acl_node); } + ASSERT3P(cb->cb_acl_node, !=, NULL); *dataptr = cb->cb_acl_node->z_acldata; *length = cb->cb_acl_node->z_size; } @@ -1618,7 +1619,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, */ int zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, - vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids, zuserns_t *mnt_ns) { int error; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; @@ -1788,7 +1789,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (mask == 0) return (SET_ERROR(ENOSYS)); - if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) + if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr, NULL))) return (error); mutex_enter(&zp->z_acl_lock); @@ -1951,7 +1952,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (zp->z_pflags & ZFS_IMMUTABLE) return (SET_ERROR(EPERM)); - if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, NULL))) return (error); error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, @@ -2340,7 +2341,8 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) * can define any form of access. 
*/ int -zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr, + zuserns_t *mnt_ns) { uint32_t working_mode; int error; @@ -2470,9 +2472,11 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) * NFSv4-style ZFS ACL format and call zfs_zaccess() */ int -zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr, + zuserns_t *mnt_ns) { - return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr, + mnt_ns)); } /* @@ -2483,7 +2487,7 @@ zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) { int v4_mode = zfs_unix_to_v4(mode >> 6); - return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr, NULL)); } static int @@ -2539,7 +2543,7 @@ zfs_delete_final_check(znode_t *zp, znode_t *dzp, * */ int -zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr, zuserns_t *mnt_ns) { uint32_t dzp_working_mode = 0; uint32_t zp_working_mode = 0; @@ -2626,7 +2630,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) int zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, - znode_t *tzp, cred_t *cr) + znode_t *tzp, cred_t *cr, zuserns_t *mnt_ns) { int add_perm; int error; @@ -2646,7 +2650,8 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, * to another. 
*/ if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) { - if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr))) + if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr, + mnt_ns))) return (error); } @@ -2656,19 +2661,19 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, * If that succeeds then check for add_file/add_subdir permissions */ - if ((error = zfs_zaccess_delete(sdzp, szp, cr))) + if ((error = zfs_zaccess_delete(sdzp, szp, cr, mnt_ns))) return (error); /* * If we have a tzp, see if we can delete it? */ - if (tzp && (error = zfs_zaccess_delete(tdzp, tzp, cr))) + if (tzp && (error = zfs_zaccess_delete(tdzp, tzp, cr, mnt_ns))) return (error); /* * Now check for add permissions */ - error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr, mnt_ns); return (error); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c index 778e4151656d..07232086d52b 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c @@ -809,7 +809,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr) *xvpp = NULL; if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, - &acl_ids)) != 0) + &acl_ids, NULL)) != 0) return (error); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) { zfs_acl_ids_free(&acl_ids); @@ -955,7 +955,7 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) if ((uid = crgetuid(cr)) == downer || uid == fowner || (ZTOV(zp)->v_type == VREG && - zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, NULL) == 0)) return (0); else return (secpolicy_vnode_remove(ZTOV(zp), cr)); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c index d495cc0dc3f1..3ddffec91e83 100644 --- 
a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c @@ -39,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include <sys/cmn_err.h> #include <sys/zfs_ioctl_compat.h> +#ifdef ZFS_LEGACY_SUPPORT enum zfs_ioc_legacy { ZFS_IOC_LEGACY_NONE = -1, ZFS_IOC_LEGACY_FIRST = 0, @@ -319,7 +320,7 @@ zfs_ioctl_legacy_to_ozfs(int request) int zfs_ioctl_ozfs_to_legacy(int request) { - if (request > ZFS_IOC_LAST) + if (request >= ZFS_IOC_LAST) return (-1); if (request > ZFS_IOC_PLATFORM) { @@ -361,3 +362,4 @@ zfs_cmd_ozfs_to_legacy(zfs_cmd_t *src, zfs_cmd_legacy_t *dst) sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj)); dst->zc_jailid = src->zc_zoneid; } +#endif /* ZFS_LEGACY_SUPPORT */ diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c index 4cb7f63b5230..b4c122bdf4c8 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c @@ -63,6 +63,7 @@ #include <sys/dmu_objset.h> #include <sys/dsl_dir.h> #include <sys/jail.h> +#include <sys/osd.h> #include <ufs/ufs/quota.h> #include <sys/zfs_quota.h> @@ -88,6 +89,20 @@ int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); +struct zfs_jailparam { + int mount_snapshot; +}; + +static struct zfs_jailparam zfs_jailparam0 = { + .mount_snapshot = 0, +}; + +static int zfs_jailparam_slot; + +SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters"); +SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I", + "Allow mounting snapshots in the .zfs directory for unjailed datasets"); + SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); static int zfs_version_acl = ZFS_ACL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, @@ -1298,7 +1313,7 @@ zfs_mount(vfs_t *vfsp) char *osname; int error = 0; int 
canwrite; - bool checkpointrewind; + bool checkpointrewind, isctlsnap = false; if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) return (SET_ERROR(EINVAL)); @@ -1313,6 +1328,7 @@ zfs_mount(vfs_t *vfsp) } fetch_osname_options(osname, &checkpointrewind); + isctlsnap = (zfsctl_is_node(mvp) && strchr(osname, '@') != NULL); /* * Check for mount privilege? @@ -1321,7 +1337,9 @@ zfs_mount(vfs_t *vfsp) * we have local permission to allow it */ error = secpolicy_fs_mount(cr, mvp, vfsp); - if (error) { + if (error && isctlsnap) { + secpolicy_fs_mount_clearopts(cr, vfsp); + } else if (error) { if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) goto out; @@ -1358,8 +1376,27 @@ zfs_mount(vfs_t *vfsp) */ if (!INGLOBALZONE(curproc) && (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { - error = SET_ERROR(EPERM); - goto out; + boolean_t mount_snapshot = B_FALSE; + + /* + * Snapshots may be mounted in .zfs for unjailed datasets + * if allowed by the jail param zfs.mount_snapshot. + */ + if (isctlsnap) { + struct prison *pr; + struct zfs_jailparam *zjp; + + pr = curthread->td_ucred->cr_prison; + mtx_lock(&pr->pr_mtx); + zjp = osd_jail_get(pr, zfs_jailparam_slot); + mtx_unlock(&pr->pr_mtx); + if (zjp && zjp->mount_snapshot) + mount_snapshot = B_TRUE; + } + if (!mount_snapshot) { + error = SET_ERROR(EPERM); + goto out; + } } vfsp->vfs_flag |= MNT_NFS4ACLS; @@ -2316,3 +2353,236 @@ zfsvfs_update_fromname(const char *oldname, const char *newname) mtx_unlock(&mountlist_mtx); } #endif + +/* + * Find a prison with ZFS info. + * Return the ZFS info and the (locked) prison. 
+ */ +static struct zfs_jailparam * +zfs_jailparam_find(struct prison *spr, struct prison **prp) +{ + struct prison *pr; + struct zfs_jailparam *zjp; + + for (pr = spr; ; pr = pr->pr_parent) { + mtx_lock(&pr->pr_mtx); + if (pr == &prison0) { + zjp = &zfs_jailparam0; + break; + } + zjp = osd_jail_get(pr, zfs_jailparam_slot); + if (zjp != NULL) + break; + mtx_unlock(&pr->pr_mtx); + } + *prp = pr; + + return (zjp); +} + +/* + * Ensure a prison has its own ZFS info. If zjpp is non-null, point it to the + * ZFS info and lock the prison. + */ +static void +zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp) +{ + struct prison *ppr; + struct zfs_jailparam *zjp, *nzjp; + void **rsv; + + /* If this prison already has ZFS info, return that. */ + zjp = zfs_jailparam_find(pr, &ppr); + if (ppr == pr) + goto done; + + /* + * Allocate a new info record. Then check again, in case something + * changed during the allocation. + */ + mtx_unlock(&ppr->pr_mtx); + nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK); + rsv = osd_reserve(zfs_jailparam_slot); + zjp = zfs_jailparam_find(pr, &ppr); + if (ppr == pr) { + free(nzjp, M_PRISON); + osd_free_reserved(rsv); + goto done; + } + /* Inherit the initial values from the ancestor. */ + mtx_lock(&pr->pr_mtx); + (void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp); + (void) memcpy(nzjp, zjp, sizeof (*zjp)); + zjp = nzjp; + mtx_unlock(&ppr->pr_mtx); +done: + if (zjpp != NULL) + *zjpp = zjp; + else + mtx_unlock(&pr->pr_mtx); +} + +/* + * Jail OSD methods for ZFS VFS info. + */ +static int +zfs_jailparam_create(void *obj, void *data) +{ + struct prison *pr = obj; + struct vfsoptlist *opts = data; + int jsys; + + if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 && + jsys == JAIL_SYS_INHERIT) + return (0); + /* + * Inherit a prison's initial values from its parent + * (different from JAIL_SYS_INHERIT which also inherits changes). 
+ */ + zfs_jailparam_alloc(pr, NULL); + return (0); +} + +static int +zfs_jailparam_get(void *obj, void *data) +{ + struct prison *ppr, *pr = obj; + struct vfsoptlist *opts = data; + struct zfs_jailparam *zjp; + int jsys, error; + + zjp = zfs_jailparam_find(pr, &ppr); + jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; + error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys)); + if (error != 0 && error != ENOENT) + goto done; + if (jsys == JAIL_SYS_NEW) { + error = vfs_setopt(opts, "zfs.mount_snapshot", + &zjp->mount_snapshot, sizeof (zjp->mount_snapshot)); + if (error != 0 && error != ENOENT) + goto done; + } else { + /* + * If this prison is inheriting its ZFS info, report + * empty/zero parameters. + */ + static int mount_snapshot = 0; + + error = vfs_setopt(opts, "zfs.mount_snapshot", + &mount_snapshot, sizeof (mount_snapshot)); + if (error != 0 && error != ENOENT) + goto done; + } + error = 0; +done: + mtx_unlock(&ppr->pr_mtx); + return (error); +} + +static int +zfs_jailparam_set(void *obj, void *data) +{ + struct prison *pr = obj; + struct prison *ppr; + struct vfsoptlist *opts = data; + int error, jsys, mount_snapshot; + + /* Set the parameters, which should be correct. */ + error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)); + if (error == ENOENT) + jsys = -1; + error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot, + sizeof (mount_snapshot)); + if (error == ENOENT) + mount_snapshot = -1; + else + jsys = JAIL_SYS_NEW; + if (jsys == JAIL_SYS_NEW) { + /* "zfs=new" or "zfs.*": the prison gets its own ZFS info. 
*/ + struct zfs_jailparam *zjp; + + /* + * A child jail cannot have more permissions than its parent + */ + if (pr->pr_parent != &prison0) { + zjp = zfs_jailparam_find(pr->pr_parent, &ppr); + mtx_unlock(&ppr->pr_mtx); + if (zjp->mount_snapshot < mount_snapshot) { + return (EPERM); + } + } + zfs_jailparam_alloc(pr, &zjp); + if (mount_snapshot != -1) + zjp->mount_snapshot = mount_snapshot; + mtx_unlock(&pr->pr_mtx); + } else { + /* "zfs=inherit": inherit the parent's ZFS info. */ + mtx_lock(&pr->pr_mtx); + osd_jail_del(pr, zfs_jailparam_slot); + mtx_unlock(&pr->pr_mtx); + } + return (0); +} + +static int +zfs_jailparam_check(void *obj __unused, void *data) +{ + struct vfsoptlist *opts = data; + int error, jsys, mount_snapshot; + + /* Check that the parameters are correct. */ + error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)); + if (error != ENOENT) { + if (error != 0) + return (error); + if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT) + return (EINVAL); + } + error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot, + sizeof (mount_snapshot)); + if (error != ENOENT) { + if (error != 0) + return (error); + if (mount_snapshot != 0 && mount_snapshot != 1) + return (EINVAL); + } + return (0); +} + +static void +zfs_jailparam_destroy(void *data) +{ + + free(data, M_PRISON); +} + +static void +zfs_jailparam_sysinit(void *arg __unused) +{ + struct prison *pr; + osd_method_t methods[PR_MAXMETHOD] = { + [PR_METHOD_CREATE] = zfs_jailparam_create, + [PR_METHOD_GET] = zfs_jailparam_get, + [PR_METHOD_SET] = zfs_jailparam_set, + [PR_METHOD_CHECK] = zfs_jailparam_check, + }; + + zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods); + /* Copy the defaults to any existing prisons. 
*/ + sx_slock(&allprison_lock); + TAILQ_FOREACH(pr, &allprison, pr_list) + zfs_jailparam_alloc(pr, NULL); + sx_sunlock(&allprison_lock); +} + +static void +zfs_jailparam_sysuninit(void *arg __unused) +{ + + osd_jail_deregister(zfs_jailparam_slot); +} + +SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, + zfs_jailparam_sysinit, NULL); +SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, + zfs_jailparam_sysuninit, NULL); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index fae390a148d6..8a350ab4985c 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -837,7 +837,7 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, /* * Do we have permission to get into attribute directory? */ - error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr); + error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, NULL); if (error) { vrele(ZTOV(zp)); } @@ -856,7 +856,8 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cnp->cn_flags &= ~NOEXECCHECK; } else #endif - if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, + NULL))) { zfs_exit(zfsvfs, FTAG); return (error); } @@ -1036,6 +1037,7 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, * flag - large file flag [UNUSED]. * ct - caller context * vsecp - ACL to be set + * mnt_ns - Unused on FreeBSD * * OUT: vpp - vnode of created or trunc'd entry. 
* @@ -1047,7 +1049,7 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, */ int zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, - znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) + znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zuserns_t *mnt_ns) { (void) excl, (void) mode, (void) flag; znode_t *zp; @@ -1110,7 +1112,7 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, * Create a new file object and update the directory * to reference it. */ - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { goto out; } @@ -1126,7 +1128,7 @@ zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, } if ((error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) + cr, vsecp, &acl_ids, NULL)) != 0) goto out; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) @@ -1231,7 +1233,7 @@ zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) xattr_obj = 0; xzp = NULL; - if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) { goto out; } @@ -1387,6 +1389,7 @@ zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags) * ct - caller context * flags - case flags * vsecp - ACL to be set + * mnt_ns - Unused on FreeBSD * * OUT: vpp - vnode of created directory. 
* @@ -1398,7 +1401,7 @@ zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags) */ int zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, - cred_t *cr, int flags, vsecattr_t *vsecp) + cred_t *cr, int flags, vsecattr_t *vsecp, zuserns_t *mnt_ns) { (void) flags, (void) vsecp; znode_t *zp; @@ -1447,7 +1450,7 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, - NULL, &acl_ids)) != 0) { + NULL, &acl_ids, NULL)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } @@ -1468,7 +1471,8 @@ zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, } ASSERT3P(zp, ==, NULL); - if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, + mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); @@ -1585,7 +1589,7 @@ zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) zilog = zfsvfs->z_log; - if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) { goto out; } @@ -1976,7 +1980,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, - skipaclchk, cr))) { + skipaclchk, cr, NULL))) { zfs_exit(zfsvfs, FTAG); return (error); } @@ -2142,7 +2146,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. - * ct - caller context + * mnt_ns - Unused on FreeBSD * * RETURN: 0 on success, error code on failure. * @@ -2150,7 +2154,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) * vp - ctime updated, mtime updated if size changed. 
*/ int -zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) +zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zuserns_t *mnt_ns) { vnode_t *vp = ZTOV(zp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; @@ -2322,7 +2326,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, - skipaclchk, cr); + skipaclchk, cr, mnt_ns); } if (mask & (AT_UID|AT_GID)) { @@ -2359,7 +2363,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, - skipaclchk, cr) == 0) { + skipaclchk, cr, mnt_ns) == 0) { /* * Remove setuid/setgid for non-privileged users */ @@ -2468,7 +2472,8 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) } if (mask & AT_MODE) { - if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, + mnt_ns) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { @@ -3264,7 +3269,7 @@ zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, * Note that if target and source are the same, this can be * done in a single check. 
*/ - if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) + if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, NULL))) goto out; if ((*svpp)->v_type == VDIR) { @@ -3368,11 +3373,6 @@ zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME, sdzp, snm, tdzp, tnm, szp); - - /* - * Update path information for the target vnode - */ - vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); } else { /* * At this point, we have successfully created @@ -3415,7 +3415,7 @@ out: int zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname, - cred_t *cr, int flags) + cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns) { struct componentname scn, tcn; vnode_t *sdvp, *tdvp; @@ -3423,6 +3423,9 @@ zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname, int error; svp = tvp = NULL; + if (rflags != 0 || wo_vap != NULL) + return (SET_ERROR(EINVAL)); + sdvp = ZTOV(sdzp); tdvp = ZTOV(tdzp); error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE); @@ -3460,6 +3463,7 @@ fail: * cr - credentials of caller. * ct - caller context * flags - case flags + * mnt_ns - Unused on FreeBSD * * RETURN: 0 on success, error code on failure. 
* @@ -3468,7 +3472,7 @@ fail: */ int zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, - const char *link, znode_t **zpp, cred_t *cr, int flags) + const char *link, znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns) { (void) flags; znode_t *zp; @@ -3499,7 +3503,7 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, } if ((error = zfs_acl_ids_create(dzp, 0, - vap, cr, NULL, &acl_ids)) != 0) { + vap, cr, NULL, &acl_ids, NULL)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } @@ -3514,7 +3518,7 @@ zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, return (error); } - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); @@ -3730,7 +3734,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, return (SET_ERROR(EPERM)); } - if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, NULL))) { zfs_exit(zfsvfs, FTAG); return (error); } @@ -3831,7 +3835,7 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. 
*/ - if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, NULL))) { zfs_exit(zfsvfs, FTAG); return (error); } @@ -4607,7 +4611,7 @@ zfs_freebsd_create(struct vop_create_args *ap) *ap->a_vpp = NULL; rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode, - &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */); + &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL); if (rc == 0) *ap->a_vpp = ZTOV(zp); if (zfsvfs->z_use_namecache && @@ -4661,7 +4665,7 @@ zfs_freebsd_mkdir(struct vop_mkdir_args *ap) *ap->a_vpp = NULL; rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp, - ap->a_cnp->cn_cred, 0, NULL); + ap->a_cnp->cn_cred, 0, NULL, NULL); if (rc == 0) *ap->a_vpp = ZTOV(zp); @@ -4914,7 +4918,7 @@ zfs_freebsd_setattr(struct vop_setattr_args *ap) xvap.xva_vattr.va_mask |= AT_XVATTR; XVA_SET_REQ(&xvap, XAT_CREATETIME); } - return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred)); + return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred, NULL)); } #ifndef _SYS_SYSPROTO_H_ @@ -4985,7 +4989,7 @@ zfs_freebsd_symlink(struct vop_symlink_args *ap) *ap->a_vpp = NULL; rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, - ap->a_target, &zp, cnp->cn_cred, 0 /* flags */); + ap->a_target, &zp, cnp->cn_cred, 0 /* flags */, NULL); if (rc == 0) { *ap->a_vpp = ZTOV(zp); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c index 6345e9e69d30..6c269480cb4b 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c @@ -298,7 +298,7 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) sharezp->z_is_sa = zfsvfs->z_use_sa; VERIFY0(zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, - kcred, NULL, &acl_ids)); + kcred, NULL, &acl_ids, NULL)); zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); 
ASSERT3P(zp, ==, sharezp); POINTER_INVALIDATE(&sharezp->z_zfsvfs); @@ -1773,7 +1773,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) rootzp->z_zfsvfs = zfsvfs; VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, - cr, NULL, &acl_ids)); + cr, NULL, &acl_ids, NULL)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); @@ -1949,7 +1949,6 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, } else if (error != ENOENT) { return (error); } - error = 0; for (;;) { uint64_t pobj; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c index 0410ddd65a5c..c5e745f7d196 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c @@ -1735,7 +1735,6 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, goto error; if (locked) { rw_exit(&key->zk_salt_lock); - locked = B_FALSE; } if (authbuf != NULL) diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 8d2a6d77624b..631e020db9c9 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -1386,6 +1386,7 @@ zvol_os_create_minor(const char *name) uint64_t volsize; uint64_t volmode, hash; int error; + bool replayed_zil = B_FALSE; ZFS_LOG(1, "Creating ZVOL %s...", name); hash = zvol_name_hash(name); @@ -1490,11 +1491,12 @@ zvol_os_create_minor(const char *name) zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) - zil_destroy(zv->zv_zilog, B_FALSE); + replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); else - zil_replay(os, zv, zvol_replay_vector); + replayed_zil = zil_replay(os, zv, zvol_replay_vector); } - 
zil_close(zv->zv_zilog); + if (replayed_zil) + zil_close(zv->zv_zilog); zv->zv_zilog = NULL; /* TODO: prefetch for geom tasting */ diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c index d0461a9f1298..e87954714e3a 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c @@ -37,7 +37,7 @@ #endif #define MAX_HRTIMEOUT_SLACK_US 1000 -unsigned int spl_schedule_hrtimeout_slack_us = 0; +static unsigned int spl_schedule_hrtimeout_slack_us = 0; static int param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp) diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-err.c b/sys/contrib/openzfs/module/os/linux/spl/spl-err.c index 7d3f6127c4af..29781b9515b2 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-err.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-err.c @@ -32,7 +32,7 @@ * analysis and other such goodies. * But we would still default to the current default of not to do that. */ -unsigned int spl_panic_halt; +static unsigned int spl_panic_halt; /* CSTYLED */ module_param(spl_panic_halt, uint, 0644); MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures"); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c index bc39ece9a427..71eedf635f73 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c @@ -23,6 +23,7 @@ * Solaris Porting Layer (SPL) Generic Implementation. 
*/ +#include <sys/isa_defs.h> #include <sys/sysmacros.h> #include <sys/systeminfo.h> #include <sys/vmsystm.h> @@ -48,6 +49,7 @@ #include <sys/cred.h> #include <sys/vnode.h> #include <sys/misc.h> +#include <linux/mod_compat.h> unsigned long spl_hostid = 0; EXPORT_SYMBOL(spl_hostid); @@ -60,10 +62,10 @@ proc_t p0; EXPORT_SYMBOL(p0); /* - * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna + * xoshiro256++ 1.0 PRNG by David Blackman and Sebastiano Vigna * - * "Further scramblings of Marsaglia's xorshift generators" - * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf + * "Scrambled Linear Pseudorandom Number Generators∗" + * https://vigna.di.unimi.it/ftp/papers/ScrambledLinear.pdf * * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose * is to provide bytes containing random numbers. It is mapped to /dev/urandom @@ -75,66 +77,85 @@ EXPORT_SYMBOL(p0); * free of atomic instructions. * * A consequence of using a fast PRNG is that using random_get_pseudo_bytes() - * to generate words larger than 128 bits will paradoxically be limited to - * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1` - * 128-bit words and selecting the first will implicitly select the second. If + * to generate words larger than 256 bits will paradoxically be limited to + * `2^256 - 1` possibilities. This is because we have a sequence of `2^256 - 1` + * 256-bit words and selecting the first will implicitly select the second. If * a caller finds this behavior undesirable, random_get_bytes() should be used * instead. * * XXX: Linux interrupt handlers that trigger within the critical section - * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will + * formed by `s[3] = xp[3];` and `xp[0] = s[0];` and call this function will * see the same numbers. Nothing in the code currently calls this in an * interrupt handler, so this is considered to be okay. 
If that becomes a * problem, we could create a set of per-cpu variables for interrupt handlers * and use them when in_interrupt() from linux/preempt_mask.h evaluates to * true. */ -void __percpu *spl_pseudo_entropy; +static void __percpu *spl_pseudo_entropy; /* - * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed - * file: + * rotl()/spl_rand_next()/spl_rand_jump() are copied from the following CC-0 + * licensed file: * - * http://xorshift.di.unimi.it/xorshift128plus.c + * https://prng.di.unimi.it/xoshiro256plusplus.c */ +static inline uint64_t rotl(const uint64_t x, int k) +{ + return ((x << k) | (x >> (64 - k))); +} + static inline uint64_t spl_rand_next(uint64_t *s) { - uint64_t s1 = s[0]; - const uint64_t s0 = s[1]; - s[0] = s0; - s1 ^= s1 << 23; // a - s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c - return (s[1] + s0); + const uint64_t result = rotl(s[0] + s[3], 23) + s[0]; + + const uint64_t t = s[1] << 17; + + s[2] ^= s[0]; + s[3] ^= s[1]; + s[1] ^= s[2]; + s[0] ^= s[3]; + + s[2] ^= t; + + s[3] = rotl(s[3], 45); + + return (result); } static inline void spl_rand_jump(uint64_t *s) { - static const uint64_t JUMP[] = - { 0x8a5cd789635d2dff, 0x121fd2155c472f96 }; + static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, + 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c }; uint64_t s0 = 0; uint64_t s1 = 0; + uint64_t s2 = 0; + uint64_t s3 = 0; int i, b; for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++) for (b = 0; b < 64; b++) { if (JUMP[i] & 1ULL << b) { s0 ^= s[0]; s1 ^= s[1]; + s2 ^= s[2]; + s3 ^= s[3]; } (void) spl_rand_next(s); } s[0] = s0; s[1] = s1; + s[2] = s2; + s[3] = s3; } int random_get_pseudo_bytes(uint8_t *ptr, size_t len) { - uint64_t *xp, s[2]; + uint64_t *xp, s[4]; ASSERT(ptr); @@ -142,6 +163,8 @@ random_get_pseudo_bytes(uint8_t *ptr, size_t len) s[0] = xp[0]; s[1] = xp[1]; + s[2] = xp[2]; + s[3] = xp[3]; while (len) { union { @@ -153,12 +176,22 @@ random_get_pseudo_bytes(uint8_t *ptr, size_t len) len -= 
i; entropy.ui64 = spl_rand_next(s); + /* + * xoshiro256++ has low entropy lower bytes, so we copy the + * higher order bytes first. + */ while (i--) +#ifdef _ZFS_BIG_ENDIAN *ptr++ = entropy.byte[i]; +#else + *ptr++ = entropy.byte[7 - i]; +#endif } xp[0] = s[0]; xp[1] = s[1]; + xp[2] = s[2]; + xp[3] = s[3]; put_cpu_ptr(spl_pseudo_entropy); @@ -518,6 +551,29 @@ ddi_copyin(const void *from, void *to, size_t len, int flags) } EXPORT_SYMBOL(ddi_copyin); +#define define_spl_param(type, fmt) \ +int \ +spl_param_get_##type(char *buf, zfs_kernel_param_t *kp) \ +{ \ + return (scnprintf(buf, PAGE_SIZE, fmt "\n", \ + *(type *)kp->arg)); \ +} \ +int \ +spl_param_set_##type(const char *buf, zfs_kernel_param_t *kp) \ +{ \ + return (kstrto##type(buf, 0, (type *)kp->arg)); \ +} \ +const struct kernel_param_ops spl_param_ops_##type = { \ + .set = spl_param_set_##type, \ + .get = spl_param_get_##type, \ +}; \ +EXPORT_SYMBOL(spl_param_get_##type); \ +EXPORT_SYMBOL(spl_param_set_##type); \ +EXPORT_SYMBOL(spl_param_ops_##type); + +define_spl_param(s64, "%lld") +define_spl_param(u64, "%llu") + /* * Post a uevent to userspace whenever a new vdev adds to the pool. It is * necessary to sync blkid information with udev, which zed daemon uses @@ -741,10 +797,10 @@ spl_kvmem_init(void) static int __init spl_random_init(void) { - uint64_t s[2]; + uint64_t s[4]; int i = 0; - spl_pseudo_entropy = __alloc_percpu(2 * sizeof (uint64_t), + spl_pseudo_entropy = __alloc_percpu(4 * sizeof (uint64_t), sizeof (uint64_t)); if (!spl_pseudo_entropy) @@ -752,17 +808,19 @@ spl_random_init(void) get_random_bytes(s, sizeof (s)); - if (s[0] == 0 && s[1] == 0) { + if (s[0] == 0 && s[1] == 0 && s[2] == 0 && s[3] == 0) { if (jiffies != 0) { s[0] = jiffies; s[1] = ~0 - jiffies; + s[2] = ~jiffies; + s[3] = jiffies - ~0; } else { - (void) memcpy(s, "improbable seed", sizeof (s)); + (void) memcpy(s, "improbable seed", 16); } printk("SPL: get_random_bytes() returned 0 " "when generating random seed. 
Setting initial seed to " - "0x%016llx%016llx.\n", cpu_to_be64(s[0]), - cpu_to_be64(s[1])); + "0x%016llx%016llx%016llx%016llx.\n", cpu_to_be64(s[0]), + cpu_to_be64(s[1]), cpu_to_be64(s[2]), cpu_to_be64(s[3])); } for_each_possible_cpu(i) { @@ -772,6 +830,8 @@ spl_random_init(void) wordp[0] = s[0]; wordp[1] = s[1]; + wordp[2] = s[2]; + wordp[3] = s[3]; } return (0); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c index e355e2bfc3a0..edd04783b363 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c @@ -151,7 +151,7 @@ MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, struct list_head spl_kmem_cache_list; /* List of caches */ struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ -taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */ +static taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */ static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c b/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c index a4a24dcae2bd..5e073950d61a 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c @@ -23,9 +23,9 @@ */ #include <sys/list.h> -#include <sys/mutex.h> #include <sys/procfs_list.h> #include <linux/proc_fs.h> +#include <sys/mutex.h> /* * A procfs_list is a wrapper around a linked list which implements the seq_file diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c index abf4dca585b2..84497359ce2e 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c @@ -1048,7 +1048,6 @@ taskq_create(const char *name, int threads_arg, pri_t pri, ASSERT(name != NULL); ASSERT(minalloc >= 0); - 
ASSERT(maxalloc <= INT_MAX); ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */ /* Scale the number of threads using nthreads as a percentage */ diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c index 9421f81bf0c8..b489179f1257 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zone.c @@ -25,7 +25,6 @@ */ #include <sys/types.h> -#include <sys/mutex.h> #include <sys/sysmacros.h> #include <sys/kmem.h> #include <linux/file.h> @@ -37,6 +36,8 @@ #include <linux/proc_ns.h> #endif +#include <sys/mutex.h> + static kmutex_t zone_datasets_lock; static struct list_head zone_datasets; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index 2ab85f8cccd0..16530d82693e 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -132,7 +132,7 @@ static abd_stats_t abd_stats = { { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, }; -struct { +static struct { wmsum_t abdstat_struct_size; wmsum_t abdstat_linear_cnt; wmsum_t abdstat_linear_data_size; @@ -597,10 +597,8 @@ abd_free_chunks(abd_t *abd) struct scatterlist *sg; abd_for_each_sg(abd, sg, n, i) { - for (int j = 0; j < sg->length; j += PAGESIZE) { - struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT); - umem_free(p, PAGESIZE); - } + struct page *p = nth_page(sg_page(sg), 0); + umem_free_aligned(p, PAGESIZE); } abd_free_sg_table(abd); } @@ -706,7 +704,7 @@ abd_free_zero_scatter(void) __free_page(abd_zero_page); #endif /* HAVE_ZERO_PAGE_GPL_ONLY */ #else - umem_free(abd_zero_page, PAGESIZE); + umem_free_aligned(abd_zero_page, PAGESIZE); #endif /* _KERNEL */ } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c index eaaf7d0bb746..6f730e9ddd83 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c +++ 
b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c @@ -358,11 +358,11 @@ arc_lowmem_fini(void) } int -param_set_arc_long(const char *buf, zfs_kernel_param_t *kp) +param_set_arc_u64(const char *buf, zfs_kernel_param_t *kp) { int error; - error = param_set_long(buf, kp); + error = spl_param_set_u64(buf, kp); if (error < 0) return (SET_ERROR(error)); @@ -374,13 +374,13 @@ param_set_arc_long(const char *buf, zfs_kernel_param_t *kp) int param_set_arc_min(const char *buf, zfs_kernel_param_t *kp) { - return (param_set_arc_long(buf, kp)); + return (param_set_arc_u64(buf, kp)); } int param_set_arc_max(const char *buf, zfs_kernel_param_t *kp) { - return (param_set_arc_long(buf, kp)); + return (param_set_arc_u64(buf, kp)); } int diff --git a/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c b/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c index d502127b5ba3..7e5bd392437e 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c @@ -30,7 +30,7 @@ param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp) { int ret; - ret = param_set_ulong(val, kp); + ret = spl_param_set_u64(val, kp); if (ret < 0) return (ret); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c index a69618978622..eaf38df864d3 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/policy.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c @@ -214,8 +214,10 @@ secpolicy_vnode_setid_retain(struct znode *zp __maybe_unused, const cred_t *cr, * Determine that subject can set the file setgid flag. 
*/ int -secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid) +secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid, zuserns_t *mnt_ns, + zuserns_t *fs_ns) { + gid = zfs_gid_to_vfsgid(mnt_ns, fs_ns, gid); #if defined(CONFIG_USER_NS) if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid))) return (EPERM); @@ -284,8 +286,11 @@ secpolicy_setid_clear(vattr_t *vap, cred_t *cr) * Determine that subject can set the file setid flags. */ static int -secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner) +secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner, zuserns_t *mnt_ns, + zuserns_t *fs_ns) { + owner = zfs_uid_to_vfsuid(mnt_ns, fs_ns, owner); + if (crgetuid(cr) == owner) return (0); @@ -310,13 +315,13 @@ secpolicy_vnode_stky_modify(const cred_t *cr) int secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap, - const vattr_t *ovap, cred_t *cr) + const vattr_t *ovap, cred_t *cr, zuserns_t *mnt_ns, zuserns_t *fs_ns) { int error; if ((vap->va_mode & S_ISUID) != 0 && (error = secpolicy_vnode_setid_modify(cr, - ovap->va_uid)) != 0) { + ovap->va_uid, mnt_ns, fs_ns)) != 0) { return (error); } @@ -334,7 +339,8 @@ secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap, * group-id bit. 
*/ if ((vap->va_mode & S_ISGID) != 0 && - secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) { + secpolicy_vnode_setids_setgids(cr, ovap->va_gid, + mnt_ns, fs_ns) != 0) { vap->va_mode &= ~S_ISGID; } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c index f999df3b7db9..3efc8b9644fd 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c @@ -60,7 +60,7 @@ param_set_deadman_ziotime(const char *val, zfs_kernel_param_t *kp) { int error; - error = param_set_ulong(val, kp); + error = spl_param_set_u64(val, kp); if (error < 0) return (SET_ERROR(error)); @@ -74,7 +74,7 @@ param_set_deadman_synctime(const char *val, zfs_kernel_param_t *kp) { int error; - error = param_set_ulong(val, kp); + error = spl_param_set_u64(val, kp); if (error < 0) return (SET_ERROR(error)); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index 0fed09df5203..4f33009f14d4 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -56,7 +56,7 @@ static void *zfs_vdev_holder = VDEV_HOLDER; * device is missing. The missing path may be transient since the links * can be briefly removed and recreated in response to udev events. */ -static unsigned zfs_vdev_open_timeout_ms = 1000; +static uint_t zfs_vdev_open_timeout_ms = 1000; /* * Size of the "reserved" partition, in blocks. @@ -74,6 +74,12 @@ typedef struct dio_request { struct bio *dr_bio[0]; /* Attached bio's */ } dio_request_t; +/* + * BIO request failfast mask. + */ + +static unsigned int zfs_vdev_failfast_mask = 1; + static fmode_t vdev_bdev_mode(spa_mode_t spa_mode) { @@ -173,7 +179,7 @@ vdev_disk_error(zio_t *zio) * which is safe from any context. 
*/ printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " - "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa), + "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa), zio->io_vd->vdev_path, zio->io_error, zio->io_type, (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, zio->io_flags); @@ -659,8 +665,11 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, retry: dr = vdev_disk_dio_alloc(bio_count); - if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) - bio_set_flags_failfast(bdev, &flags); + if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && + zio->io_vd->vdev_failfast == B_TRUE) { + bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, + zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); + } dr->dr_zio = zio; @@ -1006,17 +1015,17 @@ MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); int param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) { - uint64_t val; + uint_t val; int error; - error = kstrtoull(buf, 0, &val); + error = kstrtouint(buf, 0, &val); if (error < 0) return (SET_ERROR(error)); if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) return (SET_ERROR(-EINVAL)); - error = param_set_ulong(buf, kp); + error = param_set_uint(buf, kp); if (error < 0) return (SET_ERROR(error)); @@ -1026,19 +1035,25 @@ param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) int param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) { - uint64_t val; + uint_t val; int error; - error = kstrtoull(buf, 0, &val); + error = kstrtouint(buf, 0, &val); if (error < 0) return (SET_ERROR(error)); if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) return (SET_ERROR(-EINVAL)); - error = param_set_ulong(buf, kp); + error = param_set_uint(buf, kp); if (error < 0) return (SET_ERROR(error)); return (0); } + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, + "Timeout before determining that a device is missing"); + 
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, + "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c index 46e412f6eeb4..5abc0426d1a7 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c @@ -53,8 +53,8 @@ static taskq_t *vdev_file_taskq; * impact the vdev_ashift setting which can only be set at vdev creation * time. */ -static unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; -static unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; +static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; +static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; static void vdev_file_hold(vdev_t *vd) @@ -376,7 +376,7 @@ vdev_ops_t vdev_disk_ops = { #endif -ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW, "Logical ashift for file-based devices"); -ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW, "Physical ashift for file-based devices"); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c index 5935403b49d0..7d14863c56c4 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -629,18 +629,18 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, return (NULL); } -static uint64_t -zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, +static uintptr_t +zfs_ace_walk(void *datap, uintptr_t cookie, int aclcnt, uint16_t *flags, uint16_t *type, uint32_t *mask) { (void) aclcnt; zfs_acl_t *aclp = datap; - zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t 
*)cookie; uint64_t who; acep = zfs_acl_next_ace(aclp, acep, &who, mask, flags, type); - return ((uint64_t)(uintptr_t)acep); + return ((uintptr_t)acep); } /* @@ -1163,6 +1163,7 @@ zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, cb->cb_acl_node); } + ASSERT3P(cb->cb_acl_node, !=, NULL); *dataptr = cb->cb_acl_node->z_acldata; *length = cb->cb_acl_node->z_size; } @@ -1284,7 +1285,7 @@ acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) */ static int ace_trivial_common(void *acep, int aclcnt, - uint64_t (*walk)(void *, uint64_t, int aclcnt, + uintptr_t (*walk)(void *, uintptr_t, int, uint16_t *, uint16_t *, uint32_t *)) { uint16_t flags; @@ -1801,7 +1802,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t va_mode, zfs_acl_t *paclp, */ int zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, - vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids, zuserns_t *mnt_ns) { int error; zfsvfs_t *zfsvfs = ZTOZSB(dzp); @@ -1888,8 +1889,10 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, acl_ids->z_mode |= S_ISGID; } else { if ((acl_ids->z_mode & S_ISGID) && - secpolicy_vnode_setids_setgids(cr, gid) != 0) + secpolicy_vnode_setids_setgids(cr, gid, mnt_ns, + zfs_i_user_ns(ZTOI(dzp))) != 0) { acl_ids->z_mode &= ~S_ISGID; + } } if (acl_ids->z_aclp == NULL) { @@ -1977,7 +1980,8 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (mask == 0) return (SET_ERROR(ENOSYS)); - if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) + if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr, + kcred->user_ns))) return (error); mutex_enter(&zp->z_acl_lock); @@ -2136,7 +2140,8 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (zp->z_pflags & ZFS_IMMUTABLE) return (SET_ERROR(EPERM)); - if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) + if 
((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, + kcred->user_ns))) return (error); error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp, @@ -2282,7 +2287,7 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) */ static int zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, - boolean_t anyaccess, cred_t *cr) + boolean_t anyaccess, cred_t *cr, zuserns_t *mnt_ns) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zfs_acl_t *aclp; @@ -2298,7 +2303,13 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, uid_t gowner; uid_t fowner; - zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + if (mnt_ns) { + fowner = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ZTOI(zp)), + KUID_TO_SUID(ZTOI(zp)->i_uid)); + gowner = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ZTOI(zp)), + KGID_TO_SGID(ZTOI(zp)->i_gid)); + } else + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); mutex_enter(&zp->z_acl_lock); @@ -2409,7 +2420,8 @@ zfs_has_access(znode_t *zp, cred_t *cr) { uint32_t have = ACE_ALL_PERMS; - if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr, + kcred->user_ns) != 0) { uid_t owner; owner = zfs_fuid_map_id(ZTOZSB(zp), @@ -2439,7 +2451,8 @@ zfs_has_access(znode_t *zp, cred_t *cr) * we want to avoid that here. 
*/ static int -zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr) +zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr, + zuserns_t *mnt_ns) { int err, mask; int unmapped = 0; @@ -2453,7 +2466,10 @@ zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr) } #if defined(HAVE_IOPS_PERMISSION_USERNS) - err = generic_permission(cr->user_ns, ZTOI(zp), mask); + if (mnt_ns) + err = generic_permission(mnt_ns, ZTOI(zp), mask); + else + err = generic_permission(cr->user_ns, ZTOI(zp), mask); #else err = generic_permission(ZTOI(zp), mask); #endif @@ -2468,7 +2484,7 @@ zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr) static int zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, - boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr, zuserns_t *mnt_ns) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int err; @@ -2518,20 +2534,20 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, } if (zp->z_pflags & ZFS_ACL_TRIVIAL) - return (zfs_zaccess_trivial(zp, working_mode, cr)); + return (zfs_zaccess_trivial(zp, working_mode, cr, mnt_ns)); - return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr, mnt_ns)); } static int zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, - cred_t *cr) + cred_t *cr, zuserns_t *mnt_ns) { if (*working_mode != ACE_WRITE_DATA) return (SET_ERROR(EACCES)); return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, - check_privs, B_FALSE, cr)); + check_privs, B_FALSE, cr, mnt_ns)); } int @@ -2598,7 +2614,8 @@ slow: DTRACE_PROBE(zfs__fastpath__execute__access__miss); if ((error = zfs_enter(ZTOZSB(zdp), FTAG)) != 0) return (error); - error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); + error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, + kcred->user_ns); zfs_exit(ZTOZSB(zdp), FTAG); return 
(error); } @@ -2610,7 +2627,8 @@ slow: * can define any form of access. */ int -zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr, + zuserns_t *mnt_ns) { uint32_t working_mode; int error; @@ -2649,8 +2667,10 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) } } - owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid), - cr, ZFS_OWNER); + owner = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ZTOI(zp)), + KUID_TO_SUID(ZTOI(zp)->i_uid)); + owner = zfs_fuid_map_id(ZTOZSB(zp), owner, cr, ZFS_OWNER); + /* * Map the bits required to the standard inode flags * S_IRUSR|S_IWUSR|S_IXUSR in the needed_bits. Map the bits @@ -2675,7 +2695,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) needed_bits |= S_IXUSR; if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, - &check_privs, skipaclchk, cr)) == 0) { + &check_privs, skipaclchk, cr, mnt_ns)) == 0) { if (is_attr) zrele(xzp); return (secpolicy_vnode_access2(cr, ZTOI(zp), owner, @@ -2689,7 +2709,8 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) } if (error && (flags & V_APPEND)) { - error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr, + mnt_ns); } if (error && check_privs) { @@ -2756,9 +2777,11 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) * NFSv4-style ZFS ACL format and call zfs_zaccess() */ int -zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr, + zuserns_t *mnt_ns) { - return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr, + mnt_ns)); } /* @@ -2769,7 +2792,7 @@ zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) { int 
v4_mode = zfs_unix_to_v4(mode >> 6); - return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr, kcred->user_ns)); } /* See zfs_zaccess_delete() */ @@ -2846,7 +2869,7 @@ static const boolean_t zfs_write_implies_delete_child = B_TRUE; * zfs_write_implies_delete_child */ int -zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr, zuserns_t *mnt_ns) { uint32_t wanted_dirperms; uint32_t dzp_working_mode = 0; @@ -2873,7 +2896,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) * (This is part of why we're checking the target first.) */ zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, - &zpcheck_privs, B_FALSE, cr); + &zpcheck_privs, B_FALSE, cr, mnt_ns); if (zp_error == EACCES) { /* We hit a DENY ACE. */ if (!zpcheck_privs) @@ -2895,7 +2918,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) if (zfs_write_implies_delete_child) wanted_dirperms |= ACE_WRITE_DATA; dzp_error = zfs_zaccess_common(dzp, wanted_dirperms, - &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr, mnt_ns); if (dzp_error == EACCES) { /* We hit a DENY ACE. */ if (!dzpcheck_privs) @@ -2977,7 +3000,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) int zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, - znode_t *tzp, cred_t *cr) + znode_t *tzp, cred_t *cr, zuserns_t *mnt_ns) { int add_perm; int error; @@ -2999,21 +3022,21 @@ zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, * If that succeeds then check for add_file/add_subdir permissions */ - if ((error = zfs_zaccess_delete(sdzp, szp, cr))) + if ((error = zfs_zaccess_delete(sdzp, szp, cr, mnt_ns))) return (error); /* * If we have a tzp, see if we can delete it? 
*/ if (tzp) { - if ((error = zfs_zaccess_delete(tdzp, tzp, cr))) + if ((error = zfs_zaccess_delete(tdzp, tzp, cr, mnt_ns))) return (error); } /* * Now check for add permissions */ - error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr, mnt_ns); return (error); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c index 4ae0a65370e5..519f13212fac 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c @@ -487,7 +487,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, zp->z_is_sa = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_TRUE; - zp->z_is_stale = B_FALSE; zp->z_sa_hdl = NULL; zp->z_blksz = 0; zp->z_seq = 0; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c index 819416b68d5f..e5a600250659 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c @@ -35,7 +35,7 @@ typedef struct zfs_dbgmsg { static procfs_list_t zfs_dbgmsgs; static uint_t zfs_dbgmsg_size = 0; -uint_t zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ +static uint_t zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ /* * Internal ZFS debug messages are enabled by default. 
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c index 6738d237b923..85aa94d8df6a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c @@ -926,6 +926,74 @@ zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, return (error); } +static int +zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); + boolean_t unlinked = B_FALSE; + sa_bulk_attr_t bulk[3]; + uint64_t mtime[2], ctime[2]; + uint64_t links; + int count = 0; + int error; + + if (zp_is_dir && !zfs_dirempty(zp)) + return (SET_ERROR(ENOTEMPTY)); + + if (ZTOI(zp)->i_nlink <= zp_is_dir) { + zfs_panic_recover("zfs: link count on %lu is %u, " + "should be at least %u", zp->z_id, + (int)ZTOI(zp)->i_nlink, zp_is_dir + 1); + set_nlink(ZTOI(zp), zp_is_dir + 1); + } + drop_nlink(ZTOI(zp)); + if (ZTOI(zp)->i_nlink == zp_is_dir) { + zp->z_unlinked = B_TRUE; + clear_nlink(ZTOI(zp)); + unlinked = B_TRUE; + } else { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + links = ZTOI(zp)->i_nlink; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &links, sizeof (links)); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT3U(error, ==, 0); + + if (unlinkedp != NULL) + *unlinkedp = unlinked; + else if (unlinked) + zfs_unlinked_add(zp, tx); + + return (0); +} + +/* + * Forcefully drop an nlink reference from (zp) and mark it for deletion if it + * was the last link. This *must* only be done to znodes which have already + * been zfs_link_destroy()'d with ZRENAMING. 
This is explicitly only used in + * the error path of zfs_rename(), where we have to correct the nlink count if + * we failed to link the target as well as failing to re-link the original + * znodes. + */ +int +zfs_drop_nlink(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp) +{ + int error; + + mutex_enter(&zp->z_lock); + error = zfs_drop_nlink_locked(zp, tx, unlinkedp); + mutex_exit(&zp->z_lock); + + return (error); +} + /* * Unlink zp from dl, and mark zp for deletion if this was the last link. Can * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY). @@ -966,31 +1034,9 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, return (error); } - if (ZTOI(zp)->i_nlink <= zp_is_dir) { - zfs_panic_recover("zfs: link count on %lu is %u, " - "should be at least %u", zp->z_id, - (int)ZTOI(zp)->i_nlink, zp_is_dir + 1); - set_nlink(ZTOI(zp), zp_is_dir + 1); - } - drop_nlink(ZTOI(zp)); - if (ZTOI(zp)->i_nlink == zp_is_dir) { - zp->z_unlinked = B_TRUE; - clear_nlink(ZTOI(zp)); - unlinked = B_TRUE; - } else { - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), - NULL, &ctime, sizeof (ctime)); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), - NULL, &zp->z_pflags, sizeof (zp->z_pflags)); - zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, - ctime); - } - links = ZTOI(zp)->i_nlink; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), - NULL, &links, sizeof (links)); - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - count = 0; - ASSERT(error == 0); + /* The only error is !zfs_dirempty() and we checked earlier. 
*/ + error = zfs_drop_nlink_locked(zp, tx, &unlinked); + ASSERT3U(error, ==, 0); mutex_exit(&zp->z_lock); } else { error = zfs_dropname(dl, zp, dzp, tx, flag); @@ -1066,11 +1112,12 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr) *xzpp = NULL; - if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))) + if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr, + kcred->user_ns))) return (error); if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, - &acl_ids)) != 0) + &acl_ids, kcred->user_ns)) != 0) return (error); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) { zfs_acl_ids_free(&acl_ids); @@ -1218,7 +1265,8 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) cr, ZFS_OWNER); if ((uid = crgetuid(cr)) == downer || uid == fowner || - zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0) + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, + kcred->user_ns) == 0) return (0); else return (secpolicy_vnode_remove(cr)); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c index da80428402cd..bc753614be27 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c @@ -246,7 +246,7 @@ zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) { loff_t rc; - if (*offp < 0 || *offp > MAXOFFSET_T) + if (*offp < 0) return (EINVAL); rc = vfs_llseek(fp, *offp, whence); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c index 4a2091f3c396..e2431fe8a803 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c @@ -279,11 +279,11 @@ zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property, for (int i = 0; i < ARRAY_SIZE(type_map); i++) { if (type_map[i].ztm_type & property->pd_types) { - len += snprintf(buf + len, buflen - len, 
"%s ", - type_map[i].ztm_name); + len += kmem_scnprintf(buf + len, buflen - len, + "%s ", type_map[i].ztm_name); } } - len += snprintf(buf + len, buflen - len, "\n"); + len += kmem_scnprintf(buf + len, buflen - len, "\n"); return (len); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index 64d6b4616e1c..c921e587c75c 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -1522,7 +1522,6 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) sb->s_op = &zpl_super_operations; sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; - sb->s_d_op = &zpl_dentry_operations; /* Set features for file system. */ zfs_set_fuid_feature(zfsvfs); @@ -1556,6 +1555,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) error = zfs_root(zfsvfs, &root_inode); if (error) { (void) zfs_umount(sb); + zfsvfs = NULL; /* avoid double-free; first in zfs_umount */ goto out; } @@ -1563,6 +1563,7 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) sb->s_root = d_make_root(root_inode); if (sb->s_root == NULL) { (void) zfs_umount(sb); + zfsvfs = NULL; /* avoid double-free; first in zfs_umount */ error = SET_ERROR(ENOMEM); goto out; } @@ -1879,8 +1880,8 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) zp = list_next(&zfsvfs->z_all_znodes, zp)) { err2 = zfs_rezget(zp); if (err2) { + zpl_d_drop_aliases(ZTOI(zp)); remove_inode_hash(ZTOI(zp)); - zp->z_is_stale = B_TRUE; } /* see comment in zfs_suspend_fs() */ diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index 1ff88c121a79..29d62837a82a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -476,7 +476,7 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, */ if ((error 
= zfs_zaccess(*zpp, ACE_EXECUTE, 0, - B_TRUE, cr))) { + B_TRUE, cr, kcred->user_ns))) { zrele(*zpp); *zpp = NULL; } @@ -494,7 +494,8 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, * Check accessibility of directory. */ - if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, + kcred->user_ns))) { zfs_exit(zfsvfs, FTAG); return (error); } @@ -526,6 +527,7 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, * cr - credentials of caller. * flag - file flag. * vsecp - ACL to be set + * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created or trunc'd entry. * @@ -537,7 +539,8 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, */ int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, - int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) + int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, + zuserns_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); @@ -624,7 +627,8 @@ top: * Create a new file object and update the directory * to reference it. */ - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, + mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; @@ -643,7 +647,7 @@ top: } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) + cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; @@ -738,7 +742,8 @@ top: /* * Verify requested access to file. 
*/ - if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { + if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr, + mnt_ns))) { goto out; } @@ -782,7 +787,8 @@ out: int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, - int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) + int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, + zuserns_t *mnt_ns) { (void) excl, (void) mode, (void) flag; znode_t *zp = NULL, *dzp = ITOZ(dip); @@ -829,14 +835,14 @@ top: * Create a new file object and update the directory * to reference it. */ - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, - cr, vsecp, &acl_ids)) != 0) + cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; @@ -967,7 +973,7 @@ top: return (error); } - if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + if ((error = zfs_zaccess_delete(dzp, zp, cr, kcred->user_ns))) { goto out; } @@ -1147,6 +1153,7 @@ out: * cr - credentials of caller. * flags - case flags. * vsecp - ACL to be set + * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created directory. 
* @@ -1159,7 +1166,7 @@ out: */ int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, - cred_t *cr, int flags, vsecattr_t *vsecp) + cred_t *cr, int flags, vsecattr_t *vsecp, zuserns_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); @@ -1216,7 +1223,7 @@ zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, - vsecp, &acl_ids)) != 0) { + vsecp, &acl_ids, mnt_ns)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } @@ -1237,7 +1244,8 @@ top: return (error); } - if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, + mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); @@ -1379,7 +1387,7 @@ top: return (error); } - if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + if ((error = zfs_zaccess_delete(dzp, zp, cr, kcred->user_ns))) { goto out; } @@ -1811,6 +1819,7 @@ next: * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. + * mnt_ns - user namespace of the mount * * RETURN: 0 if success * error code if failure @@ -1819,7 +1828,7 @@ next: * ip - ctime updated, mtime updated if size changed. 
*/ int -zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) +zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zuserns_t *mnt_ns) { struct inode *ip; zfsvfs_t *zfsvfs = ZTOZSB(zp); @@ -1968,7 +1977,8 @@ top: */ if (mask & ATTR_SIZE) { - err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); + err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, + mnt_ns); if (err) goto out3; @@ -1993,13 +2003,15 @@ top: XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, - skipaclchk, cr); + skipaclchk, cr, mnt_ns); } if (mask & (ATTR_UID|ATTR_GID)) { int idmask = (mask & (ATTR_UID|ATTR_GID)); int take_owner; int take_group; + uid_t uid; + gid_t gid; /* * NOTE: even if a new mode is being set, @@ -2013,9 +2025,13 @@ top: * Take ownership or chgrp to group we are a member of */ - take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); + uid = zfs_uid_to_vfsuid((struct user_namespace *)mnt_ns, + zfs_i_user_ns(ip), vap->va_uid); + gid = zfs_gid_to_vfsgid((struct user_namespace *)mnt_ns, + zfs_i_user_ns(ip), vap->va_gid); + take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); take_group = (mask & ATTR_GID) && - zfs_groupmember(zfsvfs, vap->va_gid, cr); + zfs_groupmember(zfsvfs, gid, cr); /* * If both ATTR_UID and ATTR_GID are set then take_owner and @@ -2031,7 +2047,7 @@ top: ((idmask == ATTR_UID) && take_owner) || ((idmask == ATTR_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, - skipaclchk, cr) == 0) { + skipaclchk, cr, mnt_ns) == 0) { /* * Remove setuid/setgid for non-privileged users */ @@ -2144,12 +2160,12 @@ top: mutex_exit(&zp->z_lock); if (mask & ATTR_MODE) { - if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, + mnt_ns) == 0) { err = secpolicy_setid_setsticky_clear(ip, vap, - &oldva, cr); + &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); if (err) goto out3; - trim_mask |= ATTR_MODE; } 
else { need_policy = TRUE; @@ -2640,6 +2656,9 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) * tnm - New entry name. * cr - credentials of caller. * flags - case flags + * rflags - RENAME_* flags + * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). + * mnt_ns - user namespace of the mount * * RETURN: 0 on success, error code on failure. * @@ -2648,7 +2667,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) */ int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, - cred_t *cr, int flags) + cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns) { znode_t *szp, *tzp; zfsvfs_t *zfsvfs = ZTOZSB(sdzp); @@ -2660,10 +2679,33 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, int error = 0; int zflg = 0; boolean_t waited = B_FALSE; + /* Needed for whiteout inode creation. */ + boolean_t fuid_dirtied; + zfs_acl_ids_t acl_ids; + boolean_t have_acl = B_FALSE; + znode_t *wzp = NULL; + if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); + if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return (SET_ERROR(EINVAL)); + + /* Already checked by Linux VFS, but just to make sure. */ + if (rflags & RENAME_EXCHANGE && + (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) + return (SET_ERROR(EINVAL)); + + /* + * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the + * right kind of vattr_t for the whiteout file. These are set + * internally by ZFS so should never be incorrect. + */ + VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); + VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); + VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); + if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; @@ -2840,8 +2882,7 @@ top: * Note that if target and source are the same, this can be * done in a single check. 
*/ - - if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) + if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) goto out; if (S_ISDIR(ZTOI(szp)->i_mode)) { @@ -2857,17 +2898,19 @@ top: * Does target exist? */ if (tzp) { + if (rflags & RENAME_NOREPLACE) { + error = SET_ERROR(EEXIST); + goto out; + } /* - * Source and target must be the same type. + * Source and target must be the same type (unless exchanging). */ - if (S_ISDIR(ZTOI(szp)->i_mode)) { - if (!S_ISDIR(ZTOI(tzp)->i_mode)) { - error = SET_ERROR(ENOTDIR); - goto out; - } - } else { - if (S_ISDIR(ZTOI(tzp)->i_mode)) { - error = SET_ERROR(EISDIR); + if (!(rflags & RENAME_EXCHANGE)) { + boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; + boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; + + if (s_is_dir != t_is_dir) { + error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); goto out; } } @@ -2880,12 +2923,43 @@ top: error = 0; goto out; } + } else if (rflags & RENAME_EXCHANGE) { + /* Target must exist for RENAME_EXCHANGE. */ + error = SET_ERROR(ENOENT); + goto out; + } + + /* Set up inode creation for RENAME_WHITEOUT. */ + if (rflags & RENAME_WHITEOUT) { + /* + * Whiteout files are not regular files or directories, so to + * match zfs_create() we do not inherit the project id. + */ + uint64_t wo_projid = ZFS_DEFAULT_PROJID; + + error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); + if (error) + goto out; + + if (!have_acl) { + error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, + &acl_ids, mnt_ns); + if (error) + goto out; + have_acl = B_TRUE; + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { + error = SET_ERROR(EDQUOT); + goto out; + } } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, sdzp->z_id, + (rflags & RENAME_EXCHANGE) ? 
TRUE : FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); @@ -2895,7 +2969,21 @@ top: dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } + if (rflags & RENAME_WHITEOUT) { + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); @@ -2925,58 +3013,110 @@ top: return (error); } - if (tzp) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + /* + * Unlink the source. + */ + szp->z_pflags |= ZFS_AV_MODIFIED; + if (tdzp->z_pflags & ZFS_PROJINHERIT) + szp->z_pflags |= ZFS_PROJINHERIT; + + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + VERIFY0(error); - if (error == 0) { - error = zfs_link_create(tdl, szp, tx, ZRENAMING); - if (error == 0) { - szp->z_pflags |= ZFS_AV_MODIFIED; - if (tdzp->z_pflags & ZFS_PROJINHERIT) - szp->z_pflags |= ZFS_PROJINHERIT; - - error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), - (void *)&szp->z_pflags, sizeof (uint64_t), tx); + error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); + if (error) + goto commit; + + /* + * Unlink the target. + */ + if (tzp) { + int tzflg = zflg; + + if (rflags & RENAME_EXCHANGE) { + /* This inode will be re-linked soon. 
*/ + tzflg |= ZRENAMING; + + tzp->z_pflags |= ZFS_AV_MODIFIED; + if (sdzp->z_pflags & ZFS_PROJINHERIT) + tzp->z_pflags |= ZFS_PROJINHERIT; + + error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&tzp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); + } + error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); + if (error) + goto commit_link_szp; + } - error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); - if (error == 0) { - zfs_log_rename(zilog, tx, TX_RENAME | - (flags & FIGNORECASE ? TX_CI : 0), sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); - } else { - /* - * At this point, we have successfully created - * the target name, but have failed to remove - * the source name. Since the create was done - * with the ZRENAMING flag, there are - * complications; for one, the link count is - * wrong. The easiest way to deal with this - * is to remove the newly created target, and - * return the original error. This must - * succeed; fortunately, it is very unlikely to - * fail, since we just created it. - */ - VERIFY3U(zfs_link_destroy(tdl, szp, tx, - ZRENAMING, NULL), ==, 0); - } - } else { - /* - * If we had removed the existing target, subsequent - * call to zfs_link_create() to add back the same entry - * but, the new dnode (szp) should not fail. - */ - ASSERT(tzp == NULL); + /* + * Create the new target links: + * * We always link the target. + * * RENAME_EXCHANGE: Link the old target to the source. + * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. + */ + error = zfs_link_create(tdl, szp, tx, ZRENAMING); + if (error) { + /* + * If we have removed the existing target, a subsequent call to + * zfs_link_create() to add back the same entry, but with a new + * dnode (szp), should not fail. 
+ */ + ASSERT3P(tzp, ==, NULL); + goto commit_link_tzp; + } + + switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { + case RENAME_EXCHANGE: + error = zfs_link_create(sdl, tzp, tx, ZRENAMING); + /* + * The same argument as zfs_link_create() failing for + * szp applies here, since the source directory must + * have had an entry we are replacing. + */ + ASSERT0(error); + if (error) + goto commit_unlink_td_szp; + break; + case RENAME_WHITEOUT: + zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); + error = zfs_link_create(sdl, wzp, tx, ZNEW); + if (error) { + zfs_znode_delete(wzp, tx); + remove_inode_hash(ZTOI(wzp)); + goto commit_unlink_td_szp; } + break; } + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { + case RENAME_EXCHANGE: + zfs_log_rename_exchange(zilog, tx, + (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, + tdzp, tdl->dl_name, szp); + break; + case RENAME_WHITEOUT: + zfs_log_rename_whiteout(zilog, tx, + (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, + tdzp, tdl->dl_name, szp, wzp); + break; + default: + ASSERT0(rflags & ~RENAME_NOREPLACE); + zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), + sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); + break; + } + +commit: dmu_tx_commit(tx); out: - if (zl != NULL) - zfs_rename_unlock(&zl); - - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); + if (have_acl) + zfs_acl_ids_free(&acl_ids); zfs_znode_update_vfs(sdzp); if (sdzp == tdzp) @@ -2987,16 +3127,57 @@ out: zfs_znode_update_vfs(szp); zrele(szp); + if (wzp) { + zfs_znode_update_vfs(wzp); + zrele(wzp); + } if (tzp) { zfs_znode_update_vfs(tzp); zrele(tzp); } + if (zl != NULL) + zfs_rename_unlock(&zl); + + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); + + /* + * Clean-up path for broken link state. 
+ * + * At this point we are in a (very) bad state, so we need to do our + * best to correct the state. In particular, all of the nlinks are + * wrong because we were destroying and creating links with ZRENAMING. + * + * In some form, all of these operations have to resolve the state: + * + * * link_destroy() *must* succeed. Fortunately, this is very likely + * since we only just created it. + * + * * link_create()s are allowed to fail (though they shouldn't because + * we only just unlinked them and are putting the entries back + * during clean-up). But if they fail, we can just forcefully drop + * the nlink value to (at the very least) avoid broken nlink values + * -- though in the case of non-empty directories we will have to + * panic (otherwise we'd have a leaked directory with a broken ..). + */ +commit_unlink_td_szp: + VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); +commit_link_tzp: + if (tzp) { + if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) + VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); + } +commit_link_szp: + if (zfs_link_create(sdl, szp, tx, ZRENAMING)) + VERIFY0(zfs_drop_nlink(szp, tx, NULL)); + goto commit; } /* @@ -3008,6 +3189,7 @@ out: * link - Name for new symlink entry. * cr - credentials of caller. * flags - case flags + * mnt_ns - user namespace of the mount * * OUT: zpp - Znode for new symbolic link. 
* @@ -3018,7 +3200,7 @@ out: */ int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, - znode_t **zpp, cred_t *cr, int flags) + znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns) { znode_t *zp; zfs_dirlock_t *dl; @@ -3056,7 +3238,7 @@ zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, } if ((error = zfs_acl_ids_create(dzp, 0, - vap, cr, NULL, &acl_ids)) != 0) { + vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } @@ -3073,7 +3255,7 @@ top: return (error); } - if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); @@ -3325,7 +3507,8 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, return (SET_ERROR(EPERM)); } - if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, + kcred->user_ns))) { zfs_exit(zfsvfs, FTAG); return (error); } @@ -3951,7 +4134,8 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. 
*/ - if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { + if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, + kcred->user_ns))) { zfs_exit(zfsvfs, FTAG); return (error); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c index 73c21b6c00a8..662147ab4722 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c @@ -422,7 +422,12 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) break; case S_IFDIR: +#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER + ip->i_flags |= S_IOPS_WRAPPER; + ip->i_op = &zpl_dir_inode_operations.ops; +#else ip->i_op = &zpl_dir_inode_operations; +#endif ip->i_fop = &zpl_dir_file_operations; ITOZ(ip)->z_zn_prefetch = B_TRUE; break; @@ -552,7 +557,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, zp->z_atime_dirty = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_FALSE; - zp->z_is_stale = B_FALSE; zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; zp->z_mapcnt = 0; @@ -1960,7 +1964,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) } VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, - cr, NULL, &acl_ids)); + cr, NULL, &acl_ids, kcred->user_ns)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); @@ -2136,7 +2140,6 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, } else if (error != ENOENT) { return (error); } - error = 0; for (;;) { uint64_t pobj = 0; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c index 671300932384..6f2bf7ed7569 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c @@ -231,6 +231,7 @@ zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) keydata_len 
= zio_crypt_table[crypt].ci_keylen; memset(key, 0, sizeof (zio_crypt_key_t)); + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); /* fill keydata buffers and salt with random data */ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); @@ -282,7 +283,6 @@ zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) key->zk_crypt = crypt; key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; key->zk_salt_count = 0; - rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); return (0); @@ -1968,7 +1968,6 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, if (locked) { rw_exit(&key->zk_salt_lock); - locked = B_FALSE; } if (authbuf != NULL) diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c index 837629e4a5e0..f0779c81dc75 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c @@ -371,7 +371,11 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dip, mode | S_IFDIR, cr); +#ifdef HAVE_IOPS_MKDIR_USERNS + zpl_vap_init(vap, dip, mode | S_IFDIR, cr, user_ns); +#else + zpl_vap_init(vap, dip, mode | S_IFDIR, cr, kcred->user_ns); +#endif error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); if (error == 0) { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index 25fc6b223297..c56e3691e70a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -1085,7 +1085,7 @@ zpl_ioctl_setflags(struct file *filp, void __user *arg) crhold(cr); cookie = spl_fstrans_mark(); - err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr); + err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, kcred->user_ns); spl_fstrans_unmark(cookie); crfree(cr); @@ -1133,7 +1133,7 @@ zpl_ioctl_setxattr(struct 
file *filp, void __user *arg) crhold(cr); cookie = spl_fstrans_mark(); - err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr); + err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, kcred->user_ns); spl_fstrans_unmark(cookie); crfree(cr); @@ -1221,7 +1221,7 @@ zpl_ioctl_setdosflags(struct file *filp, void __user *arg) crhold(cr); cookie = spl_fstrans_mark(); - err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr); + err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, kcred->user_ns); spl_fstrans_unmark(cookie); crfree(cr); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c index 7578753ed8ce..93eae7201506 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c @@ -24,6 +24,7 @@ */ +#include <sys/sysmacros.h> #include <sys/zfs_ctldir.h> #include <sys/zfs_vfsops.h> #include <sys/zfs_vnops.h> @@ -33,7 +34,6 @@ #include <sys/zpl.h> #include <sys/file.h> - static struct dentry * zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { @@ -112,18 +112,22 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } void -zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr) +zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr, + zuserns_t *mnt_ns) { vap->va_mask = ATTR_MODE; vap->va_mode = mode; - vap->va_uid = crgetuid(cr); - if (dir && dir->i_mode & S_ISGID) { + vap->va_uid = zfs_vfsuid_to_uid((struct user_namespace *)mnt_ns, + zfs_i_user_ns(dir), crgetuid(cr)); + + if (dir->i_mode & S_ISGID) { vap->va_gid = KGID_TO_SGID(dir->i_gid); if (S_ISDIR(mode)) vap->va_mode |= S_ISGID; } else { - vap->va_gid = crgetgid(cr); + vap->va_gid = zfs_vfsgid_to_gid((struct user_namespace *)mnt_ns, + zfs_i_user_ns(dir), crgetgid(cr)); } } @@ -140,14 +144,17 @@ zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) vattr_t *vap; int error; fstrans_cookie_t 
cookie; +#ifndef HAVE_IOPS_CREATE_USERNS + zuserns_t *user_ns = kcred->user_ns; +#endif crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode, cr); + zpl_vap_init(vap, dir, mode, cr, user_ns); cookie = spl_fstrans_mark(); error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0, - mode, &zp, cr, 0, NULL); + mode, &zp, cr, 0, NULL, user_ns); if (error == 0) { error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) @@ -184,6 +191,9 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, vattr_t *vap; int error; fstrans_cookie_t cookie; +#ifndef HAVE_IOPS_MKNOD_USERNS + zuserns_t *user_ns = kcred->user_ns; +#endif /* * We currently expect Linux to supply rdev=0 for all sockets @@ -194,12 +204,12 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode, cr); + zpl_vap_init(vap, dir, mode, cr, user_ns); vap->va_rdev = rdev; cookie = spl_fstrans_mark(); error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0, - mode, &zp, cr, 0, NULL); + mode, &zp, cr, 0, NULL, user_ns); if (error == 0) { error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) @@ -236,6 +246,9 @@ zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) vattr_t *vap; int error; fstrans_cookie_t cookie; +#ifndef HAVE_TMPFILE_USERNS + zuserns_t *userns = kcred->user_ns; +#endif crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); @@ -245,10 +258,10 @@ zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) */ if (!IS_POSIXACL(dir)) mode &= ~current_umask(); - zpl_vap_init(vap, dir, mode, cr); + zpl_vap_init(vap, dir, mode, cr, userns); cookie = spl_fstrans_mark(); - error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL); + error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL, userns); if (error == 0) { /* d_tmpfile will do drop_nlink, so we should set it first */ set_nlink(ip, 1); @@ 
-311,13 +324,17 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) znode_t *zp; int error; fstrans_cookie_t cookie; +#ifndef HAVE_IOPS_MKDIR_USERNS + zuserns_t *user_ns = kcred->user_ns; +#endif crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, mode | S_IFDIR, cr); + zpl_vap_init(vap, dir, mode | S_IFDIR, cr, user_ns); cookie = spl_fstrans_mark(); - error = -zfs_mkdir(ITOZ(dir), dname(dentry), vap, &zp, cr, 0, NULL); + error = -zfs_mkdir(ITOZ(dir), dname(dentry), vap, &zp, cr, 0, NULL, + user_ns); if (error == 0) { error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) @@ -439,7 +456,11 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) int error; fstrans_cookie_t cookie; +#ifdef HAVE_SETATTR_PREPARE_USERNS + error = zpl_setattr_prepare(user_ns, dentry, ia); +#else error = zpl_setattr_prepare(kcred->user_ns, dentry, ia); +#endif if (error) return (error); @@ -447,8 +468,20 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK; vap->va_mode = ia->ia_mode; - vap->va_uid = KUID_TO_SUID(ia->ia_uid); - vap->va_gid = KGID_TO_SGID(ia->ia_gid); + if (ia->ia_valid & ATTR_UID) +#ifdef HAVE_IATTR_VFSID + vap->va_uid = zfs_vfsuid_to_uid(user_ns, zfs_i_user_ns(ip), + __vfsuid_val(ia->ia_vfsuid)); +#else + vap->va_uid = KUID_TO_SUID(ia->ia_uid); +#endif + if (ia->ia_valid & ATTR_GID) +#ifdef HAVE_IATTR_VFSID + vap->va_gid = zfs_vfsgid_to_gid(user_ns, zfs_i_user_ns(ip), + __vfsgid_val(ia->ia_vfsgid)); +#else + vap->va_gid = KGID_TO_SGID(ia->ia_gid); +#endif vap->va_size = ia->ia_size; vap->va_atime = ia->ia_atime; vap->va_mtime = ia->ia_mtime; @@ -458,7 +491,11 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip); cookie = spl_fstrans_mark(); - error = -zfs_setattr(ITOZ(ip), vap, 0, cr); +#ifdef HAVE_SETATTR_PREPARE_USERNS + error = 
-zfs_setattr(ITOZ(ip), vap, 0, cr, user_ns); +#else + error = -zfs_setattr(ITOZ(ip), vap, 0, cr, kcred->user_ns); +#endif if (!error && (ia->ia_valid & ATTR_MODE)) error = zpl_chmod_acl(ip); @@ -474,32 +511,42 @@ static int #ifdef HAVE_IOPS_RENAME_USERNS zpl_rename2(struct user_namespace *user_ns, struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, - unsigned int flags) + unsigned int rflags) #else zpl_rename2(struct inode *sdip, struct dentry *sdentry, - struct inode *tdip, struct dentry *tdentry, unsigned int flags) + struct inode *tdip, struct dentry *tdentry, unsigned int rflags) #endif { cred_t *cr = CRED(); + vattr_t *wo_vap = NULL; int error; fstrans_cookie_t cookie; - - /* We don't have renameat2(2) support */ - if (flags) - return (-EINVAL); +#ifndef HAVE_IOPS_RENAME_USERNS + zuserns_t *user_ns = kcred->user_ns; +#endif crhold(cr); + if (rflags & RENAME_WHITEOUT) { + wo_vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(wo_vap, sdip, S_IFCHR, cr, user_ns); + wo_vap->va_rdev = makedevice(0, 0); + } + cookie = spl_fstrans_mark(); error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip), - dname(tdentry), cr, 0); + dname(tdentry), cr, 0, rflags, wo_vap, user_ns); spl_fstrans_unmark(cookie); + if (wo_vap) + kmem_free(wo_vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } -#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS) +#if !defined(HAVE_IOPS_RENAME_USERNS) && \ + !defined(HAVE_RENAME_WANTS_FLAGS) && \ + !defined(HAVE_RENAME2) static int zpl_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) @@ -521,14 +568,17 @@ zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) znode_t *zp; int error; fstrans_cookie_t cookie; +#ifndef HAVE_IOPS_SYMLINK_USERNS + zuserns_t *user_ns = kcred->user_ns; +#endif crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); - zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, 
cr); + zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr, user_ns); cookie = spl_fstrans_mark(); error = -zfs_symlink(ITOZ(dir), dname(dentry), vap, - (char *)name, &zp, cr, 0); + (char *)name, &zp, cr, 0, user_ns); if (error == 0) { error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error) { @@ -698,46 +748,6 @@ out: return (error); } -static int -#ifdef HAVE_D_REVALIDATE_NAMEIDATA -zpl_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - unsigned int flags = (nd ? nd->flags : 0); -#else -zpl_revalidate(struct dentry *dentry, unsigned int flags) -{ -#endif /* HAVE_D_REVALIDATE_NAMEIDATA */ - /* CSTYLED */ - zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; - int error; - - if (flags & LOOKUP_RCU) - return (-ECHILD); - - /* - * After a rollback negative dentries created before the rollback - * time must be invalidated. Otherwise they can obscure files which - * are only present in the rolled back dataset. - */ - if (dentry->d_inode == NULL) { - spin_lock(&dentry->d_lock); - error = time_before(dentry->d_time, zfsvfs->z_rollback_time); - spin_unlock(&dentry->d_lock); - - if (error) - return (0); - } - - /* - * The dentry may reference a stale inode if a mounted file system - * was rolled back to a point in time where the object didn't exist. 
- */ - if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale) - return (0); - - return (1); -} - const struct inode_operations zpl_inode_operations = { .setattr = zpl_setattr, .getattr = zpl_getattr, @@ -755,7 +765,12 @@ const struct inode_operations zpl_inode_operations = { #endif /* CONFIG_FS_POSIX_ACL */ }; +#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER +const struct inode_operations_wrapper zpl_dir_inode_operations = { + .ops = { +#else const struct inode_operations zpl_dir_inode_operations = { +#endif .create = zpl_create, .lookup = zpl_lookup, .link = zpl_link, @@ -764,7 +779,9 @@ const struct inode_operations zpl_dir_inode_operations = { .mkdir = zpl_mkdir, .rmdir = zpl_rmdir, .mknod = zpl_mknod, -#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) +#ifdef HAVE_RENAME2 + .rename2 = zpl_rename2, +#elif defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) .rename = zpl_rename2, #else .rename = zpl_rename, @@ -786,6 +803,10 @@ const struct inode_operations zpl_dir_inode_operations = { #endif /* HAVE_SET_ACL */ .get_acl = zpl_get_acl, #endif /* CONFIG_FS_POSIX_ACL */ +#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER + }, + .rename2 = zpl_rename2, +#endif }; const struct inode_operations zpl_symlink_inode_operations = { @@ -826,7 +847,3 @@ const struct inode_operations zpl_special_inode_operations = { .get_acl = zpl_get_acl, #endif /* CONFIG_FS_POSIX_ACL */ }; - -dentry_operations_t zpl_dentry_operations = { - .d_revalidate = zpl_revalidate, -}; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index e3945a2a05fe..63ba731dd804 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -374,7 +374,11 @@ const struct super_operations zpl_super_operations = { struct file_system_type zpl_fs_type = { .owner = THIS_MODULE, .name = ZFS_DRIVER, +#if defined(HAVE_IDMAP_MNT_API) + .fs_flags = FS_USERNS_MOUNT | 
FS_ALLOW_IDMAP, +#else .fs_flags = FS_USERNS_MOUNT, +#endif .mount = zpl_mount, .kill_sb = zpl_kill_sb, }; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c index a010667adfa8..99d9b3793f29 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c @@ -499,7 +499,7 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, vap->va_gid = crgetgid(cr); error = -zfs_create(dxzp, (char *)name, vap, 0, 0644, &xzp, - cr, 0, NULL); + cr, 0, NULL, kcred->user_ns); if (error) goto out; } @@ -738,9 +738,11 @@ __zpl_xattr_user_get(struct inode *ip, const char *name, ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get); static int -__zpl_xattr_user_set(struct inode *ip, const char *name, +__zpl_xattr_user_set(struct user_namespace *user_ns, + struct inode *ip, const char *name, const void *value, size_t size, int flags) { + (void) user_ns; int error = 0; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME @@ -846,9 +848,11 @@ __zpl_xattr_trusted_get(struct inode *ip, const char *name, ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get); static int -__zpl_xattr_trusted_set(struct inode *ip, const char *name, +__zpl_xattr_trusted_set(struct user_namespace *user_ns, + struct inode *ip, const char *name, const void *value, size_t size, int flags) { + (void) user_ns; char *xattr_name; int error; @@ -914,9 +918,11 @@ __zpl_xattr_security_get(struct inode *ip, const char *name, ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get); static int -__zpl_xattr_security_set(struct inode *ip, const char *name, +__zpl_xattr_security_set(struct user_namespace *user_ns, + struct inode *ip, const char *name, const void *value, size_t size, int flags) { + (void) user_ns; char *xattr_name; int error; /* xattr_resolve_name will do this for us if this is defined */ @@ -940,7 +946,7 @@ zpl_xattr_security_init_impl(struct inode *ip, 
const struct xattr *xattrs, int error = 0; for (xattr = xattrs; xattr->name != NULL; xattr++) { - error = __zpl_xattr_security_set(ip, + error = __zpl_xattr_security_set(NULL, ip, xattr->name, xattr->value, xattr->value_len, 0); if (error < 0) @@ -1300,7 +1306,8 @@ __zpl_xattr_acl_get_default(struct inode *ip, const char *name, ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default); static int -__zpl_xattr_acl_set_access(struct inode *ip, const char *name, +__zpl_xattr_acl_set_access(struct user_namespace *mnt_ns, + struct inode *ip, const char *name, const void *value, size_t size, int flags) { struct posix_acl *acl; @@ -1314,8 +1321,14 @@ __zpl_xattr_acl_set_access(struct inode *ip, const char *name, if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); +#if defined(HAVE_XATTR_SET_USERNS) + if (!zpl_inode_owner_or_capable(mnt_ns, ip)) + return (-EPERM); +#else + (void) mnt_ns; if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (-EPERM); +#endif if (value) { acl = zpl_acl_from_xattr(value, size); @@ -1339,7 +1352,8 @@ __zpl_xattr_acl_set_access(struct inode *ip, const char *name, ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access); static int -__zpl_xattr_acl_set_default(struct inode *ip, const char *name, +__zpl_xattr_acl_set_default(struct user_namespace *mnt_ns, + struct inode *ip, const char *name, const void *value, size_t size, int flags) { struct posix_acl *acl; @@ -1353,8 +1367,14 @@ __zpl_xattr_acl_set_default(struct inode *ip, const char *name, if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); +#if defined(HAVE_XATTR_SET_USERNS) + if (!zpl_inode_owner_or_capable(mnt_ns, ip)) + return (-EPERM); +#else + (void) mnt_ns; if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (-EPERM); +#endif if (value) { acl = zpl_acl_from_xattr(value, size); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 0d4e0dcd5a3d..01e6456207b0 100644 --- 
a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -114,7 +114,7 @@ struct zvol_state_os { boolean_t use_blk_mq; }; -taskq_t *zvol_taskq; +static taskq_t *zvol_taskq; static struct ida zvol_ida; typedef struct zv_request_stack { @@ -1279,6 +1279,7 @@ zvol_os_create_minor(const char *name) int error = 0; int idx; uint64_t hash = zvol_name_hash(name); + bool replayed_zil = B_FALSE; if (zvol_inhibit_dev) return (0); @@ -1420,11 +1421,12 @@ zvol_os_create_minor(const char *name) zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) - zil_destroy(zv->zv_zilog, B_FALSE); + replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); else - zil_replay(os, zv, zvol_replay_vector); + replayed_zil = zil_replay(os, zv, zvol_replay_vector); } - zil_close(zv->zv_zilog); + if (replayed_zil) + zil_close(zv->zv_zilog); zv->zv_zilog = NULL; /* diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c index 0e91304ecd4b..9c65702b8d43 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c @@ -25,6 +25,7 @@ * Copyright 2016, Joyent, Inc. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. 
*/ /* Portions Copyright 2010 Robert Milkowski */ @@ -369,6 +370,8 @@ zfs_prop_init(void) static const zprop_index_t redundant_metadata_table[] = { { "all", ZFS_REDUNDANT_METADATA_ALL }, { "most", ZFS_REDUNDANT_METADATA_MOST }, + { "some", ZFS_REDUNDANT_METADATA_SOME }, + { "none", ZFS_REDUNDANT_METADATA_NONE }, { NULL } }; @@ -388,7 +391,7 @@ zfs_prop_init(void) zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata", ZFS_REDUNDANT_METADATA_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "all | most", "REDUND_MD", + "all | most | some | none", "REDUND_MD", redundant_metadata_table, sfeatures); zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, @@ -745,6 +748,8 @@ zfs_prop_init(void) boolean_t zfs_prop_delegatable(zfs_prop_t prop) { + ASSERT3S(prop, >=, 0); + ASSERT3S(prop, <, ZFS_NUM_PROPS); zprop_desc_t *pd = &zfs_prop_table[prop]; /* The mlslabel property is never delegatable. */ @@ -855,6 +860,8 @@ zfs_prop_valid_for_type(int prop, zfs_type_t types, boolean_t headcheck) zprop_type_t zfs_prop_get_type(zfs_prop_t prop) { + ASSERT3S(prop, >=, 0); + ASSERT3S(prop, <, ZFS_NUM_PROPS); return (zfs_prop_table[prop].pd_proptype); } @@ -864,6 +871,8 @@ zfs_prop_get_type(zfs_prop_t prop) boolean_t zfs_prop_readonly(zfs_prop_t prop) { + ASSERT3S(prop, >=, 0); + ASSERT3S(prop, <, ZFS_NUM_PROPS); return (zfs_prop_table[prop].pd_attr == PROP_READONLY || zfs_prop_table[prop].pd_attr == PROP_ONETIME || zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT); @@ -875,6 +884,8 @@ zfs_prop_readonly(zfs_prop_t prop) boolean_t zfs_prop_visible(zfs_prop_t prop) { + ASSERT3S(prop, >=, 0); + ASSERT3S(prop, <, ZFS_NUM_PROPS); return (zfs_prop_table[prop].pd_visible && zfs_prop_table[prop].pd_zfs_mod_supported); } @@ -885,6 +896,8 @@ zfs_prop_visible(zfs_prop_t prop) boolean_t zfs_prop_setonce(zfs_prop_t prop) { + ASSERT3S(prop, >=, 0); + ASSERT3S(prop, <, ZFS_NUM_PROPS); return 
(zfs_prop_table[prop].pd_attr == PROP_ONETIME || zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT); } @@ -892,12 +905,16 @@ zfs_prop_setonce(zfs_prop_t prop) const char * zfs_prop_default_string(zfs_prop_t prop) { + ASSERT3S(prop, >=, 0); + ASSERT3S(prop, <, ZFS_NUM_PROPS); return (zfs_prop_table[prop].pd_strdefault); } uint64_t zfs_prop_default_numeric(zfs_prop_t prop) { + ASSERT3S(prop, >=, 0); + ASSERT3S(prop, <, ZFS_NUM_PROPS); return (zfs_prop_table[prop].pd_numdefault); } @@ -908,6 +925,8 @@ zfs_prop_default_numeric(zfs_prop_t prop) const char * zfs_prop_to_name(zfs_prop_t prop) { + ASSERT3S(prop, >=, 0); + ASSERT3S(prop, <, ZFS_NUM_PROPS); return (zfs_prop_table[prop].pd_name); } @@ -917,6 +936,8 @@ zfs_prop_to_name(zfs_prop_t prop) boolean_t zfs_prop_inheritable(zfs_prop_t prop) { + ASSERT3S(prop, >=, 0); + ASSERT3S(prop, <, ZFS_NUM_PROPS); return (zfs_prop_table[prop].pd_attr == PROP_INHERIT || zfs_prop_table[prop].pd_attr == PROP_ONETIME); } @@ -969,6 +990,8 @@ zfs_prop_valid_keylocation(const char *str, boolean_t encrypted) const char * zfs_prop_values(zfs_prop_t prop) { + ASSERT3S(prop, >=, 0); + ASSERT3S(prop, <, ZFS_NUM_PROPS); return (zfs_prop_table[prop].pd_values); } @@ -980,6 +1003,8 @@ zfs_prop_values(zfs_prop_t prop) int zfs_prop_is_string(zfs_prop_t prop) { + ASSERT3S(prop, >=, 0); + ASSERT3S(prop, <, ZFS_NUM_PROPS); return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING || zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX); } @@ -991,6 +1016,8 @@ zfs_prop_is_string(zfs_prop_t prop) const char * zfs_prop_column_name(zfs_prop_t prop) { + ASSERT3S(prop, >=, 0); + ASSERT3S(prop, <, ZFS_NUM_PROPS); return (zfs_prop_table[prop].pd_colname); } @@ -1001,6 +1028,8 @@ zfs_prop_column_name(zfs_prop_t prop) boolean_t zfs_prop_align_right(zfs_prop_t prop) { + ASSERT3S(prop, >=, 0); + ASSERT3S(prop, <, ZFS_NUM_PROPS); return (zfs_prop_table[prop].pd_rightalign); } diff --git a/sys/contrib/openzfs/module/zcommon/zpool_prop.c 
b/sys/contrib/openzfs/module/zcommon/zpool_prop.c index 4737bd628ddf..285b97909631 100644 --- a/sys/contrib/openzfs/module/zcommon/zpool_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zpool_prop.c @@ -420,6 +420,9 @@ vdev_prop_init(void) boolean_na_table, sfeatures); /* default index properties */ + zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE, + PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "FAILFAST", boolean_table, + sfeatures); /* hidden properties */ zprop_register_hidden(VDEV_PROP_NAME, "name", PROP_TYPE_STRING, diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c index 11a1e5112544..d4921d0ba7db 100644 --- a/sys/contrib/openzfs/module/zfs/abd.c +++ b/sys/contrib/openzfs/module/zfs/abd.c @@ -667,15 +667,15 @@ abd_return_buf(abd_t *abd, void *buf, size_t n) { abd_verify(abd); ASSERT3U(abd->abd_size, >=, n); +#ifdef ZFS_DEBUG + (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +#endif if (abd_is_linear(abd)) { ASSERT3P(buf, ==, abd_to_buf(abd)); } else { ASSERT0(abd_cmp_buf(abd, buf, n)); zio_buf_free(buf, n); } -#ifdef ZFS_DEBUG - (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); -#endif } void diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 33865f715b0f..f51f427c1bfd 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -419,12 +419,12 @@ boolean_t arc_warm; /* * These tunables are for performance analysis. 
*/ -unsigned long zfs_arc_max = 0; -unsigned long zfs_arc_min = 0; -unsigned long zfs_arc_meta_limit = 0; -unsigned long zfs_arc_meta_min = 0; -static unsigned long zfs_arc_dnode_limit = 0; -static unsigned long zfs_arc_dnode_reduce_percent = 10; +uint64_t zfs_arc_max = 0; +uint64_t zfs_arc_min = 0; +uint64_t zfs_arc_meta_limit = 0; +uint64_t zfs_arc_meta_min = 0; +static uint64_t zfs_arc_dnode_limit = 0; +static uint_t zfs_arc_dnode_reduce_percent = 10; static uint_t zfs_arc_grow_retry = 0; static uint_t zfs_arc_shrink_shift = 0; static uint_t zfs_arc_p_min_shift = 0; @@ -449,17 +449,17 @@ int zfs_compressed_arc_enabled = B_TRUE; * ARC will evict meta buffers that exceed arc_meta_limit. This * tunable make arc_meta_limit adjustable for different workloads. */ -static unsigned long zfs_arc_meta_limit_percent = 75; +static uint64_t zfs_arc_meta_limit_percent = 75; /* * Percentage that can be consumed by dnodes of ARC meta buffers. */ -static unsigned long zfs_arc_dnode_limit_percent = 10; +static uint_t zfs_arc_dnode_limit_percent = 10; /* * These tunables are Linux-specific */ -static unsigned long zfs_arc_sys_free = 0; +static uint64_t zfs_arc_sys_free = 0; static uint_t zfs_arc_min_prefetch_ms = 0; static uint_t zfs_arc_min_prescient_prefetch_ms = 0; static int zfs_arc_p_dampener_disable = 1; @@ -781,12 +781,12 @@ uint64_t zfs_crc64_table[256]; #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ -unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ -unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ -unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ -unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ +uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup 
write */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ @@ -909,7 +909,7 @@ static int l2arc_mfuonly = 0; * will vary depending of how well the specific device handles * these commands. */ -static unsigned long l2arc_trim_ahead = 0; +static uint64_t l2arc_trim_ahead = 0; /* * Performance tuning of L2ARC persistence: @@ -925,7 +925,7 @@ static unsigned long l2arc_trim_ahead = 0; * not to waste space. */ static int l2arc_rebuild_enabled = B_TRUE; -static unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; +static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; /* L2ARC persistence rebuild control routines. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); @@ -3939,7 +3939,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ - hdr = arc_hdr_realloc(hdr, hdr_full_cache, + (void) arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { @@ -4469,7 +4469,7 @@ restart: * meta buffers. Requests to the upper layers will be made with * increasingly large scan sizes until the ARC is below the limit. 
*/ - if (meta_used > arc_meta_limit) { + if (meta_used > arc_meta_limit || arc_available_memory() < 0) { if (type == ARC_BUFC_DATA) { type = ARC_BUFC_METADATA; } else { @@ -5136,7 +5136,7 @@ arc_adapt(int bytes, arc_state_t *state) if (!zfs_arc_p_dampener_disable) mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ - arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); + arc_p = MIN(arc_c - arc_p_min, arc_p + (uint64_t)bytes * mult); } else if (state == arc_mfu_ghost) { uint64_t delta; @@ -5173,7 +5173,7 @@ arc_adapt(int bytes, arc_state_t *state) atomic_add_64(&arc_c, (int64_t)bytes); if (arc_c > arc_c_max) arc_c = arc_c_max; - else if (state == arc_anon) + else if (state == arc_anon && arc_p < arc_c >> 1) atomic_add_64(&arc_p, (int64_t)bytes); if (arc_p > arc_c) arc_p = arc_c; @@ -5386,7 +5386,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, if (aggsum_upper_bound(&arc_sums.arcstat_size) < arc_c && hdr->b_l1hdr.b_state == arc_anon && (zfs_refcount_count(&arc_anon->arcs_size) + - zfs_refcount_count(&arc_mru->arcs_size) > arc_p)) + zfs_refcount_count(&arc_mru->arcs_size) > arc_p && + arc_p < arc_c >> 1)) arc_p = MIN(arc_c, arc_p + size); } } @@ -8539,6 +8540,7 @@ l2arc_dev_get_next(void) else if (next == first) break; + ASSERT3P(next, !=, NULL); } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || next->l2ad_trim_all); @@ -11076,20 +11078,20 @@ EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, - param_get_ulong, ZMOD_RW, "Minimum ARC size in bytes"); + spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, - param_get_ulong, ZMOD_RW, "Maximum ARC size in bytes"); + spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long, - param_get_ulong, ZMOD_RW, "Metadata limit for ARC size in bytes"); 
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_u64, + spl_param_get_u64, ZMOD_RW, "Metadata limit for ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, - param_set_arc_long, param_get_ulong, ZMOD_RW, + param_set_arc_int, param_get_uint, ZMOD_RW, "Percent of ARC size for ARC meta limit"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long, - param_get_ulong, ZMOD_RW, "Minimum ARC metadata size in bytes"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_u64, + spl_param_get_u64, ZMOD_RW, "Minimum ARC metadata size in bytes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, "Meta objects to scan for prune"); @@ -11128,25 +11130,25 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prescient prefetched block in ms"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW, "Max write bytes per interval"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW, "Extra write bytes during device warmup"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW, "Number of max device writes to precache"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW, "Compressed l2arc_headroom multiplier"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW, "TRIM ahead L2ARC write size multiplier"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW, "Seconds between L2ARC writing"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, ULONG, ZMOD_RW, 
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW, "Min feed interval in milliseconds"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, @@ -11164,7 +11166,7 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, "Rebuild the L2ARC when importing a pool"); -ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW, "Min size in bytes to write rebuild log blocks in L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, @@ -11176,17 +11178,17 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long, - param_get_ulong, ZMOD_RW, "System free memory target size in bytes"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64, + spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes"); -ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long, - param_get_ulong, ZMOD_RW, "Minimum bytes of dnodes in ARC"); +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64, + spl_param_get_u64, ZMOD_RW, "Minimum bytes of dnodes in ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, - param_set_arc_long, param_get_ulong, ZMOD_RW, + param_set_arc_int, param_get_uint, ZMOD_RW, "Percent of ARC meta buffers for dnodes"); -ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/btree.c 
b/sys/contrib/openzfs/module/zfs/btree.c index f0a9222a4308..4c25afaa8199 100644 --- a/sys/contrib/openzfs/module/zfs/btree.c +++ b/sys/contrib/openzfs/module/zfs/btree.c @@ -102,7 +102,7 @@ zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) (void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size); (void) memset(leaf->btl_elems + (hdr->bth_first + hdr->bth_count) * size, 0x0f, - BTREE_LEAF_ESIZE - + tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) - (hdr->bth_first + hdr->bth_count) * size); } #endif @@ -173,16 +173,44 @@ zfs_btree_fini(void) kmem_cache_destroy(zfs_btree_leaf_cache); } +static void * +zfs_btree_leaf_alloc(zfs_btree_t *tree) +{ + if (tree->bt_leaf_size == BTREE_LEAF_SIZE) + return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP)); + else + return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP)); +} + +static void +zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr) +{ + if (tree->bt_leaf_size == BTREE_LEAF_SIZE) + return (kmem_cache_free(zfs_btree_leaf_cache, ptr)); + else + return (kmem_free(ptr, tree->bt_leaf_size)); +} + void zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), size_t size) { - ASSERT3U(size, <=, BTREE_LEAF_ESIZE / 2); + zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE); +} + +void +zfs_btree_create_custom(zfs_btree_t *tree, + int (*compar) (const void *, const void *), + size_t size, size_t lsize) +{ + size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems); + ASSERT3U(size, <=, esize / 2); memset(tree, 0, sizeof (*tree)); tree->bt_compar = compar; tree->bt_elem_size = size; - tree->bt_leaf_cap = P2ALIGN(BTREE_LEAF_ESIZE / size, 2); + tree->bt_leaf_size = lsize; + tree->bt_leaf_cap = P2ALIGN(esize / size, 2); tree->bt_height = -1; tree->bt_bulk = NULL; } @@ -290,7 +318,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) zfs_btree_core_t *node = NULL; uint32_t child = 0; - uint64_t depth = 0; + uint32_t depth = 0; /* * Iterate down the 
tree, finding which child the value should be in @@ -811,8 +839,7 @@ zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, move_count++; } tree->bt_num_nodes++; - zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, - KM_SLEEP); + zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree); zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; new_hdr->bth_parent = leaf->btl_hdr.bth_parent; new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) + @@ -1078,8 +1105,7 @@ zfs_btree_add_idx(zfs_btree_t *tree, const void *value, ASSERT0(where->bti_offset); tree->bt_num_nodes++; - zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache, - KM_SLEEP); + zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree); tree->bt_root = &leaf->btl_hdr; tree->bt_height++; @@ -1378,7 +1404,7 @@ zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) { tree->bt_num_nodes--; if (!zfs_btree_is_core(node)) { - kmem_cache_free(zfs_btree_leaf_cache, node); + zfs_btree_leaf_free(tree, node); } else { kmem_free(node, sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * tree->bt_elem_size); @@ -1991,7 +2017,7 @@ zfs_btree_verify_counts(zfs_btree_t *tree) */ static uint64_t zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, - int64_t height) + int32_t height) { if (!zfs_btree_is_core(hdr)) { VERIFY0(height); @@ -2117,8 +2143,10 @@ zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; for (size_t i = 0; i < hdr->bth_first * size; i++) VERIFY3U(leaf->btl_elems[i], ==, 0x0f); + size_t esize = tree->bt_leaf_size - + offsetof(zfs_btree_leaf_t, btl_elems); for (size_t i = (hdr->bth_first + hdr->bth_count) * size; - i < BTREE_LEAF_ESIZE; i++) + i < esize; i++) VERIFY3U(leaf->btl_elems[i], ==, 0x0f); } else { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; diff --git a/sys/contrib/openzfs/module/zfs/dataset_kstats.c b/sys/contrib/openzfs/module/zfs/dataset_kstats.c index 
b63f42a21e44..57b8faf213eb 100644 --- a/sys/contrib/openzfs/module/zfs/dataset_kstats.c +++ b/sys/contrib/openzfs/module/zfs/dataset_kstats.c @@ -128,8 +128,13 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) " snprintf() for kstat name returned %d", (unsigned long long)dmu_objset_id(objset), n); return (SET_ERROR(EINVAL)); + } else if (n >= KSTAT_STRLEN) { + zfs_dbgmsg("failed to create dataset kstat for objset %lld: " + "kstat name length (%d) exceeds limit (%d)", + (unsigned long long)dmu_objset_id(objset), + n, KSTAT_STRLEN); + return (SET_ERROR(ENAMETOOLONG)); } - ASSERT3U(n, <, KSTAT_STRLEN); kstat_t *kstat = kstat_create(kstat_module_name, 0, kstat_name, "dataset", KSTAT_TYPE_NAMED, diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index db1123d37d98..7982d9702896 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -227,8 +227,8 @@ typedef struct dbuf_cache { dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; /* Size limits for the caches */ -static unsigned long dbuf_cache_max_bytes = ULONG_MAX; -static unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX; +static uint64_t dbuf_cache_max_bytes = UINT64_MAX; +static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX; /* Set the default sizes of the caches to log2 fraction of arc size */ static uint_t dbuf_cache_shift = 5; @@ -1549,7 +1549,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; - err = zio_flags = 0; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); @@ -2687,6 +2686,7 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) dbuf_dirty_record_t *dr; dr = list_head(&db->db_dirty_records); + ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; @@ -2748,6 +2748,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, 
dmu_buf_will_not_fill(dbuf, tx); dr = list_head(&db->db_dirty_records); + ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, @@ -5120,7 +5121,7 @@ EXPORT_SYMBOL(dmu_buf_set_user_ie); EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_get_blkptr); -ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW, "Maximum size in bytes of the dbuf cache."); ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW, @@ -5129,7 +5130,7 @@ ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW, "Percentage below dbuf_cache_max_bytes when dbuf eviction stops."); -ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW, "Maximum size in bytes of dbuf metadata cache."); ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index 9e67eb51f415..45304e7ddf7a 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -28,6 +28,7 @@ * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. */ #include <sys/dmu.h> @@ -70,7 +71,7 @@ static int zfs_nopwrite_enabled = 1; * will wait until the next TXG. * A value of zero will disable this throttle. */ -static unsigned long zfs_per_txg_dirty_frees_percent = 30; +static uint_t zfs_per_txg_dirty_frees_percent = 30; /* * Enable/disable forcing txg sync when dirty checking for holes with lseek(). 
@@ -1435,7 +1436,7 @@ dmu_return_arcbuf(arc_buf_t *buf) */ int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, - const zio_prop_t *zp, enum zio_flag flags, dmu_tx_t *tx) + const zio_prop_t *zp, zio_flag_t flags, dmu_tx_t *tx) { dbuf_dirty_record_t *dr = dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx); @@ -1992,12 +1993,22 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; - if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || - (os->os_redundant_metadata == - ZFS_REDUNDANT_METADATA_MOST && - (level >= zfs_redundant_metadata_most_ditto_level || - DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) + switch (os->os_redundant_metadata) { + case ZFS_REDUNDANT_METADATA_ALL: copies++; + break; + case ZFS_REDUNDANT_METADATA_MOST: + if (level >= zfs_redundant_metadata_most_ditto_level || + DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)) + copies++; + break; + case ZFS_REDUNDANT_METADATA_SOME: + if (DMU_OT_IS_CRITICAL(type)) + copies++; + break; + case ZFS_REDUNDANT_METADATA_NONE: + break; + } } else if (wp & WP_NOFILL) { ASSERT(level == 0); @@ -2355,7 +2366,7 @@ EXPORT_SYMBOL(dmu_ot); ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW, "Enable NOP writes"); -ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, ZMOD_RW, "Percentage of dirtied blocks from frees in one TXG"); ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index 4c20afcdb9c6..c17c829a04d8 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -32,6 +32,7 @@ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. * Copyright (c) 2019, Klara Inc. 
* Copyright (c) 2019, Allan Jude + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -287,7 +288,9 @@ redundant_metadata_changed_cb(void *arg, uint64_t newval) * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL || - newval == ZFS_REDUNDANT_METADATA_MOST); + newval == ZFS_REDUNDANT_METADATA_MOST || + newval == ZFS_REDUNDANT_METADATA_SOME || + newval == ZFS_REDUNDANT_METADATA_NONE); os->os_redundant_metadata = newval; } @@ -479,7 +482,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; int size; - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c index a9e4a6745905..339fb149a49f 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_recv.c +++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c @@ -646,7 +646,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) * so add the DS_HOLD_FLAG_DECRYPT flag only if we are dealing * with a dataset we may encrypt. 
*/ - if (drba->drba_dcp != NULL && + if (drba->drba_dcp == NULL || drba->drba_dcp->cp_crypt != ZIO_CRYPT_OFF) { dsflags |= DS_HOLD_FLAG_DECRYPT; } @@ -1344,7 +1344,7 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, dnode_t *dn; abd_t *abd = rrd->abd; zio_cksum_t bp_cksum = bp->blk_cksum; - enum zio_flag flags = ZIO_FLAG_SPECULATIVE | + zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL; if (rwa->raw) @@ -2186,7 +2186,7 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) zio_prop_t zp; dmu_write_policy(rwa->os, dn, 0, 0, &zp); - enum zio_flag zio_flags = 0; + zio_flag_t zio_flags = 0; if (rwa->raw) { zp.zp_encrypt = B_TRUE; diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c index 4ee3ffc352b8..ccb7eb20756d 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_send.c +++ b/sys/contrib/openzfs/module/zfs/dmu_send.c @@ -934,7 +934,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); if (BP_GET_TYPE(bp) == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + zio_flag_t zioflags = ZIO_FLAG_CANFAIL; if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { ASSERT(BP_IS_PROTECTED(bp)); @@ -1654,7 +1654,7 @@ issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range) !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); - enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + zio_flag_t zioflags = ZIO_FLAG_CANFAIL; if (srta->featureflags & DMU_BACKUP_FEATURE_RAW) { zioflags |= ZIO_FLAG_RAW; @@ -2511,8 +2511,7 @@ dmu_send_impl(struct dmu_send_params *dspp) } if (featureflags & DMU_BACKUP_FEATURE_RAW) { - uint64_t ivset_guid = (ancestor_zb != NULL) ? 
- ancestor_zb->zbm_ivset_guid : 0; + uint64_t ivset_guid = ancestor_zb->zbm_ivset_guid; nvlist_t *keynvl = NULL; ASSERT(os->os_encrypted); @@ -2716,6 +2715,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; err = dmu_send_impl(&dspp); } + if (dspp.fromredactsnaps) + kmem_free(dspp.fromredactsnaps, + dspp.numfromredactsnaps * sizeof (uint64_t)); + dsl_dataset_rele(dspp.to_ds, FTAG); return (err); } @@ -2924,6 +2927,10 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, /* dmu_send_impl will call dsl_pool_rele for us. */ err = dmu_send_impl(&dspp); } else { + if (dspp.fromredactsnaps) + kmem_free(dspp.fromredactsnaps, + dspp.numfromredactsnaps * + sizeof (uint64_t)); dsl_pool_rele(dspp.dp, FTAG); } } else { diff --git a/sys/contrib/openzfs/module/zfs/dmu_traverse.c b/sys/contrib/openzfs/module/zfs/dmu_traverse.c index 2ed75640f68d..377634c72bba 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_traverse.c +++ b/sys/contrib/openzfs/module/zfs/dmu_traverse.c @@ -111,6 +111,7 @@ traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, if (claim_txg == 0 || bp->blk_birth < claim_txg) return (0); + ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); @@ -670,7 +671,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, /* See comment on ZIL traversal in dsl_scan_visitds. 
*/ if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; uint32_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index 101d2ee7b7a2..1d63d7de65a1 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -58,7 +58,7 @@ unsigned int zfetch_max_distance = 64 * 1024 * 1024; /* max bytes to prefetch indirects for per stream (default 64MB) */ unsigned int zfetch_max_idistance = 64 * 1024 * 1024; /* max number of bytes in an array_read in which we allow prefetching (1MB) */ -unsigned long zfetch_array_rd_sz = 1024 * 1024; +uint64_t zfetch_array_rd_sz = 1024 * 1024; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; @@ -565,5 +565,5 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, "Max bytes to prefetch indirects for per stream"); -ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, U64, ZMOD_RW, "Number of bytes in a array_read"); diff --git a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c index 8ca7ba8957aa..b95c94beff1f 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c +++ b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c @@ -229,7 +229,6 @@ dsl_bookmark_create_check_impl(dsl_pool_t *dp, switch (error) { case ESRCH: /* happy path: new bmark doesn't exist, proceed after switch */ - error = 0; break; case 0: error = SET_ERROR(EEXIST); diff --git a/sys/contrib/openzfs/module/zfs/dsl_crypt.c b/sys/contrib/openzfs/module/zfs/dsl_crypt.c index ce2e6ce742a2..382de208b01d 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_crypt.c +++ b/sys/contrib/openzfs/module/zfs/dsl_crypt.c @@ -2671,6 
+2671,7 @@ spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, objset_phys_t *osp = buf; uint8_t portable_mac[ZIO_OBJSET_MAC_LEN]; uint8_t local_mac[ZIO_OBJSET_MAC_LEN]; + const uint8_t zeroed_mac[ZIO_OBJSET_MAC_LEN] = {0}; /* look up the key from the spa's keystore */ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck); @@ -2696,8 +2697,21 @@ spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, if (memcmp(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN) != 0 || memcmp(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN) != 0) { - abd_return_buf(abd, buf, datalen); - return (SET_ERROR(ECKSUM)); + /* + * If the MAC is zeroed out, we failed to decrypt it. + * This should only arise, at least on Linux, + * if we hit edge case handling for useraccounting, since we + * shouldn't get here without bailing out on error earlier + * otherwise. + * + * So if we're in that case, we can just fall through and + * special-casing noticing that it's zero will handle it + * elsewhere, since we can just regenerate it. 
+ */ + if (memcmp(local_mac, zeroed_mac, ZIO_OBJSET_MAC_LEN) != 0) { + abd_return_buf(abd, buf, datalen); + return (SET_ERROR(ECKSUM)); + } } abd_return_buf(abd, buf, datalen); diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index 7a066b786cd0..c7577fc584af 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -3421,7 +3421,8 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) conflicting_snaps = B_TRUE; } else if (err == ESRCH) { err = 0; - } else if (err != 0) { + } + if (err != 0) { goto out; } } diff --git a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c index 1ecae0fe3865..2b33446e66af 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c +++ b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c @@ -92,7 +92,7 @@ * will be loaded into memory and shouldn't take up an inordinate amount of * space. We settled on ~500000 entries, corresponding to roughly 128M. 
*/ -unsigned long zfs_livelist_max_entries = 500000; +uint64_t zfs_livelist_max_entries = 500000; /* * We can approximate how much of a performance gain a livelist will give us @@ -542,6 +542,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); ASSERT3P(dle, !=, NULL); dle_prev = AVL_PREV(&dl->dl_tree, dle); + ASSERT3P(dle_prev, !=, NULL); dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); @@ -1039,7 +1040,7 @@ dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, return (err); } -ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, U64, ZMOD_RW, "Size to start the next sub-livelist in a livelist"); ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/dsl_dir.c b/sys/contrib/openzfs/module/zfs/dsl_dir.c index d93c7f08c1c2..c1afaa6aaf82 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dir.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dir.c @@ -54,6 +54,15 @@ #include "zfs_prop.h" /* + * This controls if we verify the ZVOL quota or not. + * Currently, quotas are not implemented for ZVOLs. + * The quota size is the size of the ZVOL. + * The size of the volume already implies the ZVOL size quota. + * The quota mechanism can introduce a significant performance drop. + */ +static int zvol_enforce_quotas = B_TRUE; + +/* * Filesystem and Snapshot Limits * ------------------------------ * @@ -815,6 +824,18 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); + if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { + /* + * We don't enforce the limit for temporary snapshots. This is + * indicated by a NULL cred_t argument. 
+ */ + if (cr == NULL) + return (0); + + count_prop = DD_FIELD_SNAPSHOT_COUNT; + } else { + count_prop = DD_FIELD_FILESYSTEM_COUNT; + } /* * If we're allowed to change the limit, don't enforce the limit * e.g. this can happen if a snapshot is taken by an administrative @@ -834,19 +855,6 @@ dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, if (delta == 0) return (0); - if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { - /* - * We don't enforce the limit for temporary snapshots. This is - * indicated by a NULL cred_t argument. - */ - if (cr == NULL) - return (0); - - count_prop = DD_FIELD_SNAPSHOT_COUNT; - } else { - count_prop = DD_FIELD_FILESYSTEM_COUNT; - } - /* * If an ancestor has been provided, stop checking the limit once we * hit that dir. We need this during rename so that we don't overcount @@ -1268,6 +1276,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, uint64_t quota; struct tempreserve *tr; int retval; + uint64_t ext_quota; uint64_t ref_rsrv; top_of_function: @@ -1311,7 +1320,9 @@ top_of_function: * If this transaction will result in a net free of space, * we want to let it through. */ - if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) + if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0 || + (tx->tx_objset && dmu_objset_type(tx->tx_objset) == DMU_OST_ZVOL && + zvol_enforce_quotas == B_FALSE)) quota = UINT64_MAX; else quota = dsl_dir_phys(dd)->dd_quota; @@ -1343,7 +1354,16 @@ top_of_function: * on-disk is over quota and there are no pending changes * or deferred frees (which may free up space for us). 
*/ - if (used_on_disk + est_inflight >= quota) { + ext_quota = quota >> 5; + if (quota == UINT64_MAX) + ext_quota = 0; + + if (used_on_disk >= quota) { + /* Quota exceeded */ + mutex_exit(&dd->dd_lock); + DMU_TX_STAT_BUMP(dmu_tx_quota); + return (retval); + } else if (used_on_disk + est_inflight >= quota + ext_quota) { if (est_inflight > 0 || used_on_disk < quota) { retval = SET_ERROR(ERESTART); } else { @@ -1390,10 +1410,9 @@ top_of_function: ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); first = B_FALSE; goto top_of_function; - - } else { - return (0); } + + return (0); } /* @@ -2474,3 +2493,7 @@ dsl_dir_cancel_waiters(dsl_dir_t *dd) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); #endif + +/* CSTYLED */ +ZFS_MODULE_PARAM(zfs, , zvol_enforce_quotas, INT, ZMOD_RW, + "Enable strict ZVOL quota enforcment"); diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c index 4fd3722a051e..5ca918a87ee1 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_pool.c +++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c @@ -99,8 +99,8 @@ * capped at zfs_dirty_data_max_max. It can also be overridden with a module * parameter. */ -unsigned long zfs_dirty_data_max = 0; -unsigned long zfs_dirty_data_max_max = 0; +uint64_t zfs_dirty_data_max = 0; +uint64_t zfs_dirty_data_max_max = 0; uint_t zfs_dirty_data_max_percent = 10; uint_t zfs_dirty_data_max_max_percent = 25; @@ -109,7 +109,7 @@ uint_t zfs_dirty_data_max_max_percent = 25; * when approaching the limit until log data is cleared out after txg sync. * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. */ -unsigned long zfs_wrlog_data_max = 0; +uint64_t zfs_wrlog_data_max = 0; /* * If there's at least this much dirty data (as a percentage of @@ -138,7 +138,7 @@ uint_t zfs_delay_min_dirty_percent = 60; * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the * multiply in dmu_tx_delay(). 
*/ -unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000; +uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; /* * This determines the number of threads used by the dp_sync_taskq. @@ -331,7 +331,6 @@ dsl_pool_open(dsl_pool_t *dp) /* * We might not have created the remap bpobj yet. */ - err = 0; } else { goto out; } @@ -1465,20 +1464,20 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, UINT, ZMOD_RD, ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, UINT, ZMOD_RW, "Transaction delay threshold"); -ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, U64, ZMOD_RW, "Determines the dirty space limit"); -ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, U64, ZMOD_RW, "The size limit of write-transaction zil log data"); /* zfs_dirty_data_max_max only applied at module load in arc_init(). */ -ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, U64, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, UINT, ZMOD_RW, "Dirty data txg sync threshold as a percentage of zfs_dirty_data_max"); -ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, U64, ZMOD_RW, "How quickly delay approaches infinity"); ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/dsl_prop.c b/sys/contrib/openzfs/module/zfs/dsl_prop.c index 610e887b3fba..d1c0059092b1 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_prop.c +++ b/sys/contrib/openzfs/module/zfs/dsl_prop.c @@ -23,6 +23,7 @@ * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. * Copyright 2019 Joyent, Inc. + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. 
*/ #include <sys/zfs_context.h> @@ -41,6 +42,7 @@ #define ZPROP_INHERIT_SUFFIX "$inherit" #define ZPROP_RECVD_SUFFIX "$recvd" +#define ZPROP_IUV_SUFFIX "$iuv" static int dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) @@ -69,6 +71,17 @@ dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) return (0); } +static int +dsl_prop_known_index(zfs_prop_t prop, uint64_t value) +{ + const char *str = NULL; + if (prop != ZPROP_CONT && prop != ZPROP_INVAL && + zfs_prop_get_type(prop) == PROP_TYPE_INDEX) + return (!zfs_prop_index_to_string(prop, value, &str)); + + return (-1); +} + int dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) @@ -81,6 +94,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, boolean_t inheriting = B_FALSE; char *inheritstr; char *recvdstr; + char *iuvstr; ASSERT(dsl_pool_config_held(dd->dd_pool)); @@ -91,6 +105,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, inheritable = (prop == ZPROP_USERPROP || zfs_prop_inheritable(prop)); inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + iuvstr = kmem_asprintf("%s%s", propname, ZPROP_IUV_SUFFIX); /* * Note: dd may become NULL, therefore we shouldn't dereference it @@ -105,6 +120,18 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, inheriting = B_TRUE; } + /* Check for a iuv value. */ + err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, + iuvstr, intsz, numints, buf); + if (dsl_prop_known_index(zfs_name_to_prop(propname), + *(uint64_t *)buf) != 1) + err = ENOENT; + if (err != ENOENT) { + if (setpoint != NULL && err == 0) + dsl_dir_name(dd, setpoint); + break; + } + /* Check for a local value. 
*/ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, propname, intsz, numints, buf); @@ -155,6 +182,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, kmem_strfree(inheritstr); kmem_strfree(recvdstr); + kmem_strfree(iuvstr); return (err); } @@ -647,6 +675,45 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, dsl_dir_rele(dd, FTAG); } + +/* + * For newer values in zfs index type properties, we add a new key + * propname$iuv (iuv = Ignore Unknown Values) to the properties zap object + * to store the new property value and store the default value in the + * existing prop key. So that the propname$iuv key is ignored by the older zfs + * versions and the default property value from the existing prop key is + * used. + */ +static void +dsl_prop_set_iuv(objset_t *mos, uint64_t zapobj, const char *propname, + int intsz, int numints, const void *value, dmu_tx_t *tx) +{ + char *iuvstr = kmem_asprintf("%s%s", propname, ZPROP_IUV_SUFFIX); + boolean_t iuv = B_FALSE; + zfs_prop_t prop = zfs_name_to_prop(propname); + + switch (prop) { + case ZFS_PROP_REDUNDANT_METADATA: + if (*(uint64_t *)value == ZFS_REDUNDANT_METADATA_SOME || + *(uint64_t *)value == ZFS_REDUNDANT_METADATA_NONE) + iuv = B_TRUE; + break; + default: + break; + } + + if (iuv) { + VERIFY0(zap_update(mos, zapobj, iuvstr, intsz, numints, + value, tx)); + uint64_t val = zfs_prop_default_numeric(prop); + VERIFY0(zap_update(mos, zapobj, propname, intsz, numints, + &val, tx)); + } else { + zap_remove(mos, zapobj, iuvstr, tx); + } + kmem_strfree(iuvstr); +} + void dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, zprop_source_t source, int intsz, int numints, const void *value, @@ -659,6 +726,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, const char *valstr = NULL; char *inheritstr; char *recvdstr; + char *iuvstr; char *tbuf = NULL; int err; uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); @@ -692,6 +760,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, 
const char *propname, inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + iuvstr = kmem_asprintf("%s%s", propname, ZPROP_IUV_SUFFIX); switch ((int)source) { case ZPROP_SRC_NONE: @@ -709,11 +778,14 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, /* * remove propname$inherit * set propname -> value + * set propname$iuv -> new property value */ err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); VERIFY0(zap_update(mos, zapobj, propname, intsz, numints, value, tx)); + (void) dsl_prop_set_iuv(mos, zapobj, propname, intsz, + numints, value, tx); break; case ZPROP_SRC_INHERITED: /* @@ -723,6 +795,8 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, */ err = zap_remove(mos, zapobj, propname, tx); ASSERT(err == 0 || err == ENOENT); + err = zap_remove(mos, zapobj, iuvstr, tx); + ASSERT(err == 0 || err == ENOENT); if (version >= SPA_VERSION_RECVD_PROPS && dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { dummy = 0; @@ -763,6 +837,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, kmem_strfree(inheritstr); kmem_strfree(recvdstr); + kmem_strfree(iuvstr); /* * If we are left with an empty snap zap we can destroy it. @@ -1012,6 +1087,14 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, propname = za.za_name; source = setpoint; + + /* Skip if iuv entries are preset. */ + valstr = kmem_asprintf("%s%s", propname, + ZPROP_IUV_SUFFIX); + err = zap_contains(mos, propobj, valstr); + kmem_strfree(valstr); + if (err == 0) + continue; } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) { /* Skip explicitly inherited entries. */ continue; @@ -1044,6 +1127,16 @@ dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, source = ((flags & DSL_PROP_GET_INHERITING) ? 
setpoint : ZPROP_SOURCE_VAL_RECVD); + } else if (strcmp(suffix, ZPROP_IUV_SUFFIX) == 0) { + (void) strlcpy(buf, za.za_name, + MIN(sizeof (buf), suffix - za.za_name + 1)); + propname = buf; + source = setpoint; + prop = zfs_name_to_prop(propname); + + if (dsl_prop_known_index(prop, + za.za_first_integer) != 1) + continue; } else { /* * For backward compatibility, skip suffixes we don't diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index f0cd1feaf55b..03c2aa313af0 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -147,13 +147,13 @@ static int zfs_scan_strict_mem_lim = B_FALSE; * overload the drives with I/O, since that is protected by * zfs_vdev_scrub_max_active. */ -static unsigned long zfs_scan_vdev_limit = 4 << 20; +static uint64_t zfs_scan_vdev_limit = 4 << 20; static uint_t zfs_scan_issue_strategy = 0; /* don't queue & sort zios, go direct */ static int zfs_scan_legacy = B_FALSE; -static unsigned long zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ +static uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ /* * fill_weight is non-tunable at runtime, so we copy it at module init from @@ -192,9 +192,9 @@ static int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ static int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ static const enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ -static unsigned long zfs_async_block_max_blocks = ULONG_MAX; +static uint64_t zfs_async_block_max_blocks = UINT64_MAX; /* max number of dedup blocks to free in a single TXG */ -static unsigned long zfs_max_async_dedup_frees = 100000; +static uint64_t zfs_max_async_dedup_frees = 100000; /* set to disable resilver deferring */ static int zfs_resilver_disable_defer = B_FALSE; @@ -1470,6 +1470,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, if (claim_txg == 0 || 
bp->blk_birth < claim_txg) return (0); + ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); @@ -4446,7 +4447,7 @@ dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd) spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); } -ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, U64, ZMOD_RW, "Max bytes in flight per leaf vdev for scrubs and resilvers"); ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, UINT, ZMOD_RW, @@ -4470,10 +4471,10 @@ ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW, "Set to disable scrub prefetching"); -ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW, "Max number of blocks freed in one txg"); -ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW, "Max number of dedup blocks freed in one txg"); ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW, @@ -4494,7 +4495,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, UINT, ZMOD_RW, "Scan progress on-disk checkpointing interval"); -ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, U64, ZMOD_RW, "Max gap in bytes between sequential scrub / resilver I/Os"); ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, UINT, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/fm.c b/sys/contrib/openzfs/module/zfs/fm.c index 32b5cf8facd1..3f05d759770b 100644 --- a/sys/contrib/openzfs/module/zfs/fm.c +++ b/sys/contrib/openzfs/module/zfs/fm.c @@ -955,6 +955,7 @@ fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, } atomic_inc_64( &erpt_kstat_data.fmri_set_failed.value.ui64); + va_end(ap); 
return; } } diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index efcfeecd778e..c624833bc981 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -51,12 +51,12 @@ * operation, we will try to write this amount of data to each disk before * moving on to the next top-level vdev. */ -static unsigned long metaslab_aliquot = 1024 * 1024; +static uint64_t metaslab_aliquot = 1024 * 1024; /* * For testing, make some blocks above a certain size be gang blocks. */ -unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; +uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1; /* * In pools where the log space map feature is not enabled we touch @@ -286,7 +286,7 @@ static const int max_disabled_ms = 3; * Time (in seconds) to respect ms_max_size when the metaslab is not loaded. * To avoid 64-bit overflow, don't set above UINT32_MAX. */ -static unsigned long zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */ +static uint64_t zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */ /* * Maximum percentage of memory to use on storing loaded metaslabs. If loading @@ -5131,8 +5131,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, if (vd != NULL && vd->vdev_mg != NULL) { mg = vdev_get_mg(vd, mc); - if (flags & METASLAB_HINTBP_AVOID && - mg->mg_next != NULL) + if (flags & METASLAB_HINTBP_AVOID) mg = mg->mg_next; } else { mg = mca->mca_rotor; @@ -5201,12 +5200,11 @@ top: ASSERT(mg->mg_initialized); /* - * Avoid writing single-copy data to a failing, + * Avoid writing single-copy data to an unhealthy, * non-redundant vdev, unless we've already tried all * other vdevs. 
*/ - if ((vd->vdev_stat.vs_write_errors > 0 || - vd->vdev_state < VDEV_STATE_HEALTHY) && + if (vd->vdev_state < VDEV_STATE_HEALTHY && d == 0 && !try_hard && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, TRACE_VDEV_ERROR, allocator); @@ -6203,7 +6201,7 @@ metaslab_unflushed_txg(metaslab_t *ms) return (ms->ms_unflushed_txg); } -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW, "Allocation granularity (a.k.a. stripe size)"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW, @@ -6251,7 +6249,7 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, "Segment-based metaslab selection maximum buckets before switching"); -ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW, "Blocks larger than this size are forced to be gang blocks"); ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, @@ -6260,7 +6258,7 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW, "When looking in size tree, use largest segment instead of exact fit"); -ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG, +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64, ZMOD_RW, "How long to trust the cached max chunk size of a metaslab"); ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c index 92fd6c422330..ef0e01df390f 100644 --- a/sys/contrib/openzfs/module/zfs/mmp.c +++ b/sys/contrib/openzfs/module/zfs/mmp.c @@ -156,7 +156,7 @@ * vary with the I/O load and this observed value is the ub_mmp_delay which is * stored in the 
uberblock. The minimum allowed value is 100 ms. */ -ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL; +uint64_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL; /* * Used to control the duration of the activity test on import. Smaller values @@ -303,8 +303,10 @@ mmp_next_leaf(spa_t *spa) do { leaf = list_next(&spa->spa_leaf_list, leaf); - if (leaf == NULL) + if (leaf == NULL) { leaf = list_head(&spa->spa_leaf_list); + ASSERT3P(leaf, !=, NULL); + } /* * We skip unwritable, offline, detached, and dRAID spare @@ -548,11 +550,11 @@ mmp_thread(void *arg) uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK( zfs_multihost_fail_intervals); hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval; - boolean_t last_spa_suspended = suspended; - boolean_t last_spa_multihost = multihost; - uint64_t last_mmp_interval = mmp_interval; - uint32_t last_mmp_fail_intervals = mmp_fail_intervals; - hrtime_t last_mmp_fail_ns = mmp_fail_ns; + boolean_t last_spa_suspended; + boolean_t last_spa_multihost; + uint64_t last_mmp_interval; + uint32_t last_mmp_fail_intervals; + hrtime_t last_mmp_fail_ns; callb_cpr_t cpr; int skip_wait = 0; @@ -734,7 +736,7 @@ mmp_signal_all_threads(void) /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval, - param_set_multihost_interval, param_get_ulong, ZMOD_RW, + param_set_multihost_interval, spl_param_get_u64, ZMOD_RW, "Milliseconds between mmp writes to each leaf"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c index a2923d1664c7..894c30fcae16 100644 --- a/sys/contrib/openzfs/module/zfs/range_tree.c +++ b/sys/contrib/openzfs/module/zfs/range_tree.c @@ -369,6 +369,7 @@ range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) * invalid as soon as we do any mutating btree operations. 
*/ rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after); + ASSERT3P(rs_after, !=, NULL); rs_set_start_raw(rs_after, rt, before_start); rs_set_fill(rs_after, rt, after_fill + before_fill + fill); rs = rs_after; diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index cc367745e486..fe7051db2737 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -218,7 +218,7 @@ static int spa_load_print_vdev_tree = B_FALSE; * there are also risks of performing an inadvertent rewind as we might be * missing all the vdevs with the latest uberblocks. */ -unsigned long zfs_max_missing_tvds = 0; +uint64_t zfs_max_missing_tvds = 0; /* * The parameters below are similar to zfs_max_missing_tvds but are only @@ -5267,7 +5267,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, * If we've recovered the pool, pass back any information we * gathered while doing the load. */ - if (state == SPA_LOAD_RECOVER) { + if (state == SPA_LOAD_RECOVER && config != NULL) { fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); } @@ -6803,8 +6803,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, pvd = oldvd->vdev_parent; - if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ATTACH)) != 0) + if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, + VDEV_ALLOC_ATTACH) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) @@ -6819,10 +6819,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, return (spa_vdev_exit(spa, newrootvd, txg, error)); /* - * Spares can't replace logs + * log, dedup and special vdevs should not be replaced by spares. 
*/ - if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) + if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || + oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } /* * A dRAID spare can only replace a child of its parent dRAID vdev. @@ -7160,7 +7162,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) * it may be that the unwritability of the disk is the reason * it's being detached! */ - error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. @@ -8867,36 +8869,36 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa_history_log_internal(spa, "set", tx, "%s=%lld", nvpair_name(elem), (longlong_t)intval); - } else { - ASSERT(0); /* not allowed */ - } - switch (prop) { - case ZPOOL_PROP_DELEGATION: - spa->spa_delegation = intval; - break; - case ZPOOL_PROP_BOOTFS: - spa->spa_bootfs = intval; - break; - case ZPOOL_PROP_FAILUREMODE: - spa->spa_failmode = intval; - break; - case ZPOOL_PROP_AUTOTRIM: - spa->spa_autotrim = intval; - spa_async_request(spa, - SPA_ASYNC_AUTOTRIM_RESTART); - break; - case ZPOOL_PROP_AUTOEXPAND: - spa->spa_autoexpand = intval; - if (tx->tx_txg != TXG_INITIAL) + switch (prop) { + case ZPOOL_PROP_DELEGATION: + spa->spa_delegation = intval; + break; + case ZPOOL_PROP_BOOTFS: + spa->spa_bootfs = intval; + break; + case ZPOOL_PROP_FAILUREMODE: + spa->spa_failmode = intval; + break; + case ZPOOL_PROP_AUTOTRIM: + spa->spa_autotrim = intval; spa_async_request(spa, - SPA_ASYNC_AUTOEXPAND); - break; - case ZPOOL_PROP_MULTIHOST: - spa->spa_multihost = intval; - break; - default: - break; + SPA_ASYNC_AUTOTRIM_RESTART); + break; + case ZPOOL_PROP_AUTOEXPAND: + spa->spa_autoexpand = intval; + if (tx->tx_txg != TXG_INITIAL) + spa_async_request(spa, + SPA_ASYNC_AUTOEXPAND); + break; + case ZPOOL_PROP_MULTIHOST: + spa->spa_multihost = intval; + break; + 
default: + break; + } + } else { + ASSERT(0); /* not allowed */ } } @@ -10016,7 +10018,7 @@ ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, "Number of threads per IO worker taskqueue"); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c index a837b1ce97ec..b588f7041e5c 100644 --- a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c +++ b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c @@ -158,7 +158,7 @@ * amount of checkpointed data that has been freed within them while * the pool had a checkpoint. */ -static unsigned long zfs_spa_discard_memory_limit = 16 * 1024 * 1024; +static uint64_t zfs_spa_discard_memory_limit = 16 * 1024 * 1024; int spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) @@ -631,7 +631,7 @@ EXPORT_SYMBOL(spa_checkpoint_discard_thread); EXPORT_SYMBOL(spa_checkpoint_discard_thread_check); /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, U64, ZMOD_RW, "Limit for memory used in prefetching the checkpoint space map done " "on each vdev while discarding the checkpoint"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c index 4ecce8214f6a..2878e68c6e4b 100644 --- a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c +++ b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c @@ -188,13 +188,13 @@ static const unsigned long zfs_log_sm_blksz = 1ULL << 17; * (thus the _ppm suffix; reads as "parts per million"). As an example, * the default of 1000 allows 0.1% of memory to be used. 
*/ -static unsigned long zfs_unflushed_max_mem_ppm = 1000; +static uint64_t zfs_unflushed_max_mem_ppm = 1000; /* * Specific hard-limit in memory that ZFS allows to be used for * unflushed changes. */ -static unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30; +static uint64_t zfs_unflushed_max_mem_amt = 1ULL << 30; /* * The following tunable determines the number of blocks that can be used for @@ -243,33 +243,33 @@ static unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30; * provide upper and lower bounds for the log block limit. * [see zfs_unflushed_log_block_{min,max}] */ -static unsigned long zfs_unflushed_log_block_pct = 400; +static uint_t zfs_unflushed_log_block_pct = 400; /* * If the number of metaslabs is small and our incoming rate is high, we could * get into a situation that we are flushing all our metaslabs every TXG. Thus * we always allow at least this many log blocks. */ -static unsigned long zfs_unflushed_log_block_min = 1000; +static uint64_t zfs_unflushed_log_block_min = 1000; /* * If the log becomes too big, the import time of the pool can take a hit in * terms of performance. Thus we have a hard limit in the size of the log in * terms of blocks. */ -static unsigned long zfs_unflushed_log_block_max = (1ULL << 17); +static uint64_t zfs_unflushed_log_block_max = (1ULL << 17); /* * Also we have a hard limit in the size of the log in terms of dirty TXGs. */ -static unsigned long zfs_unflushed_log_txg_max = 1000; +static uint64_t zfs_unflushed_log_txg_max = 1000; /* * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and * stability of the flushing algorithm (longer summary) vs its runtime overhead * (smaller summary is faster to traverse). */ -static unsigned long zfs_max_logsm_summary_length = 10; +static uint64_t zfs_max_logsm_summary_length = 10; /* * Tunable that sets the lower bound on the metaslabs to flush every TXG. 
@@ -282,7 +282,7 @@ static unsigned long zfs_max_logsm_summary_length = 10; * The point of this tunable is to be used in extreme cases where we really * want to flush more metaslabs than our adaptable heuristic plans to flush. */ -static unsigned long zfs_min_metaslabs_to_flush = 1; +static uint64_t zfs_min_metaslabs_to_flush = 1; /* * Tunable that specifies how far in the past do we want to look when trying to @@ -293,7 +293,7 @@ static unsigned long zfs_min_metaslabs_to_flush = 1; * average over all the blocks that we walk * [see spa_estimate_incoming_log_blocks]. */ -static unsigned long zfs_max_log_walking = 5; +static uint64_t zfs_max_log_walking = 5; /* * This tunable exists solely for testing purposes. It ensures that the log @@ -507,6 +507,7 @@ void spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone) { log_summary_entry_t *e = list_head(&spa->spa_log_summary); + ASSERT3P(e, !=, NULL); if (e->lse_txgcount > 0) e->lse_txgcount--; for (; e != NULL; e = list_head(&spa->spa_log_summary)) { @@ -690,7 +691,8 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) * based on the incoming rate until we exceed it. */ if (available_blocks >= 0 && available_txgs >= 0) { - uint64_t skip_txgs = MIN(available_txgs + 1, + uint64_t skip_txgs = (incoming == 0) ? 
+ available_txgs + 1 : MIN(available_txgs + 1, (available_blocks / incoming) + 1); available_blocks -= (skip_txgs * incoming); available_txgs -= skip_txgs; @@ -1356,34 +1358,34 @@ spa_ld_log_spacemaps(spa_t *spa) } /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, U64, ZMOD_RW, "Specific hard-limit in memory that ZFS allows to be used for " "unflushed changes"); -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, U64, ZMOD_RW, "Percentage of the overall system memory that ZFS allows to be " "used for unflushed changes (value is calculated over 1000000 for " "finer granularity)"); -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, U64, ZMOD_RW, "Hard limit (upper-bound) in the size of the space map log " "in terms of blocks."); -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, U64, ZMOD_RW, "Lower-bound limit for the maximum amount of blocks allowed in " "log spacemap (see zfs_unflushed_log_block_max)"); -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, U64, ZMOD_RW, "Hard limit (upper-bound) in the size of the space map log " "in terms of dirty TXGs."); -ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, UINT, ZMOD_RW, "Tunable used to determine the number of blocks that can be used for " "the spacemap log, expressed as a percentage of the total number of " "metaslabs in the pool (e.g. 
400 means the number of log blocks is " "capped at 4 times the number of metaslabs)"); -ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, U64, ZMOD_RW, "The number of past TXGs that the flushing algorithm of the log " "spacemap feature uses to estimate incoming log blocks"); @@ -1392,8 +1394,8 @@ ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW, "during pool export/destroy"); /* END CSTYLED */ -ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, U64, ZMOD_RW, "Maximum number of rows allowed in the summary of the spacemap log"); -ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, U64, ZMOD_RW, "Minimum number of metaslabs to flush per dirty TXG"); diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index 102070013404..ca55d55405d3 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -304,20 +304,20 @@ int zfs_free_leak_on_eio = B_FALSE; * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting * in one of three behaviors controlled by zfs_deadman_failmode. */ -unsigned long zfs_deadman_synctime_ms = 600000UL; /* 10 min. */ +uint64_t zfs_deadman_synctime_ms = 600000UL; /* 10 min. */ /* * This value controls the maximum amount of time zio_wait() will block for an * outstanding IO. By default this is 300 seconds at which point the "hung" * behavior will be applied as described for zfs_deadman_synctime_ms. */ -unsigned long zfs_deadman_ziotime_ms = 300000UL; /* 5 min. */ +uint64_t zfs_deadman_ziotime_ms = 300000UL; /* 5 min. */ /* * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. */ -unsigned long zfs_deadman_checktime_ms = 60000UL; /* 1 min. 
*/ +uint64_t zfs_deadman_checktime_ms = 60000UL; /* 1 min. */ /* * By default the deadman is enabled. @@ -1536,7 +1536,7 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } - SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, + SNPRINTF_BLKPTR(kmem_scnprintf, ' ', buf, buflen, bp, type, checksum, compress); } @@ -2922,7 +2922,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW, "Set to ignore IO errors during free and permanently leak the space"); -ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, U64, ZMOD_RW, "Dead I/O check interval in milliseconds"); ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW, @@ -2943,11 +2943,11 @@ ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, "Failmode for deadman timer"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms, - param_set_deadman_synctime, param_get_ulong, ZMOD_RW, + param_set_deadman_synctime, spl_param_get_u64, ZMOD_RW, "Pool sync expiration time in milliseconds"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms, - param_set_deadman_ziotime, param_get_ulong, ZMOD_RW, + param_set_deadman_ziotime, spl_param_get_u64, ZMOD_RW, "IO expiration time in milliseconds"); ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 66cec052b669..4520ca31b7d7 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -144,8 +144,8 @@ int zfs_nocacheflush = 0; * be forced by vdev logical ashift or by user via ashift property, but won't * be set automatically as a performance optimization. 
*/ -uint64_t zfs_vdev_max_auto_ashift = 14; -uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; +uint_t zfs_vdev_max_auto_ashift = 14; +uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) @@ -3563,6 +3563,26 @@ vdev_load(vdev_t *vd) } } + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + spa_t *spa = vd->vdev_spa; + uint64_t failfast; + + error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast), + 1, &failfast); + if (error == 0) { + vd->vdev_failfast = failfast & 1; + } else if (error == ENOENT) { + vd->vdev_failfast = vdev_prop_default_numeric( + VDEV_PROP_FAILFAST); + } else { + vdev_dbgmsg(vd, + "vdev_load: zap_lookup(top_zap=%llu) " + "failed [error=%d]", + (u_longlong_t)vd->vdev_top_zap, error); + } + } + /* * Load any rebuild state from the top-level vdev zap. */ @@ -5648,7 +5668,7 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) nvpair_t *elem = NULL; uint64_t vdev_guid; nvlist_t *nvprops; - int error; + int error = 0; ASSERT(vd != NULL); @@ -5709,6 +5729,13 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) else error = spa_vdev_alloc(spa, vdev_guid); break; + case VDEV_PROP_FAILFAST: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_failfast = intval & 1; + break; default: /* Most processing is done in vdev_props_set_sync */ break; @@ -6022,6 +6049,25 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, strval, intval, src); break; + case VDEV_PROP_FAILFAST: + src = ZPROP_SRC_LOCAL; + strval = NULL; + + err = zap_lookup(mos, objid, nvpair_name(elem), + sizeof (uint64_t), 1, &intval); + if (err == ENOENT) { + intval = vdev_prop_default_numeric( + prop); + err = 0; + } else if (err) { + break; + } + if (intval == vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + + vdev_prop_add_list(outnvl, propname, strval, + intval, src); + 
break; /* Text Properties */ case VDEV_PROP_COMMENT: /* Exists in the ZAP below */ @@ -6078,7 +6124,6 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; propname = za.za_name; - prop = vdev_name_to_prop(propname); switch (za.za_integer_length) { case 8: @@ -6156,11 +6201,11 @@ ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, - param_set_min_auto_ashift, param_get_ulong, ZMOD_RW, + param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, - param_set_max_auto_ashift, param_get_ulong, ZMOD_RW, + param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c index 0ca0c245e952..814a1f0efe4c 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c @@ -189,14 +189,14 @@ static uint_t zfs_condense_indirect_obsolete_pct = 25; * consumed by the obsolete space map; the default of 1GB is small enough * that we typically don't mind "wasting" it. */ -static unsigned long zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; +static uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; /* * Don't bother condensing if the mapping uses less than this amount of * memory. The default of 128KB is considered a "trivial" amount of * memory and not worth reducing. 
*/ -static unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; +static uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; /* * This is used by the test suite so that it can ensure that certain @@ -1319,6 +1319,7 @@ vdev_indirect_io_start(zio_t *zio) vdev_indirect_gather_splits, zio); indirect_split_t *first = list_head(&iv->iv_splits); + ASSERT3P(first, !=, NULL); if (first->is_size == zio->io_size) { /* * This is not a split block; we are pointing to the entire @@ -1891,11 +1892,11 @@ ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_obsolete_pct, UINT, "Minimum obsolete percent of bytes in the mapping " "to attempt condensing"); -ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, U64, ZMOD_RW, "Don't bother condensing if the mapping uses less than this amount of " "memory"); -ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, ULONG, +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, U64, ZMOD_RW, "Minimum size obsolete spacemap to attempt condensing"); diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c index 965fb7ef0593..75beb0cc3d12 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c +++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c @@ -36,17 +36,13 @@ /* * Value that is written to disk during initialization. 
*/ -#ifdef _ILP32 -static unsigned long zfs_initialize_value = 0xdeadbeefUL; -#else -static unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL; -#endif +static uint64_t zfs_initialize_value = 0xdeadbeefdeadbeeeULL; /* maximum number of I/Os outstanding per leaf vdev */ static const int zfs_initialize_limit = 1; /* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ -static unsigned long zfs_initialize_chunk_size = 1024 * 1024; +static uint64_t zfs_initialize_chunk_size = 1024 * 1024; static boolean_t vdev_initialize_should_stop(vdev_t *vd) @@ -261,15 +257,9 @@ vdev_initialize_block_fill(void *buf, size_t len, void *unused) (void) unused; ASSERT0(len % sizeof (uint64_t)); -#ifdef _ILP32 - for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) { - *(uint32_t *)((char *)(buf) + i) = zfs_initialize_value; - } -#else for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; } -#endif return (0); } @@ -765,8 +755,8 @@ EXPORT_SYMBOL(vdev_initialize_stop_all); EXPORT_SYMBOL(vdev_initialize_stop_wait); EXPORT_SYMBOL(vdev_initialize_restart); -ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, U64, ZMOD_RW, "Value written during zpool initialize"); -ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, U64, ZMOD_RW, "Size in bytes of writes by zpool initialize"); diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index 1acb89cea393..ec55674393ce 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -605,7 +605,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) int maxblocksize; boolean_t stretch = B_FALSE; avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); - enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; + zio_flag_t flags = zio->io_flags & 
ZIO_FLAG_AGG_INHERIT; uint64_t next_offset; abd_t *abd; @@ -725,6 +725,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) * after our span is mandatory. */ dio = AVL_NEXT(t, last); + ASSERT3P(dio, !=, NULL); dio->io_flags &= ~ZIO_FLAG_OPTIONAL; } else { /* do not include the optional i/o */ @@ -756,6 +757,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) do { dio = nio; nio = AVL_NEXT(t, dio); + ASSERT3P(dio, !=, NULL); zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c index f74a76a8d5ba..2980f8acfbd7 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c @@ -285,17 +285,17 @@ raidz_math_kstat_headers(char *buf, size_t size) { ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); - ssize_t off = snprintf(buf, size, "%-17s", "implementation"); + ssize_t off = kmem_scnprintf(buf, size, "%-17s", "implementation"); for (int i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) - off += snprintf(buf + off, size - off, "%-16s", + off += kmem_scnprintf(buf + off, size - off, "%-16s", raidz_gen_name[i]); for (int i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) - off += snprintf(buf + off, size - off, "%-16s", + off += kmem_scnprintf(buf + off, size - off, "%-16s", raidz_rec_name[i]); - (void) snprintf(buf + off, size - off, "\n"); + (void) kmem_scnprintf(buf + off, size - off, "\n"); return (0); } @@ -311,34 +311,35 @@ raidz_math_kstat_data(char *buf, size_t size, void *data) ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); if (cstat == fstat) { - off += snprintf(buf + off, size - off, "%-17s", "fastest"); + off += kmem_scnprintf(buf + off, size - off, "%-17s", + "fastest"); for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) { int id = fstat->gen[i]; - off += snprintf(buf + off, size - off, "%-16s", + off += kmem_scnprintf(buf + off, size - off, "%-16s", raidz_supp_impl[id]->name); } for (i = 0; i < 
ARRAY_SIZE(raidz_rec_name); i++) { int id = fstat->rec[i]; - off += snprintf(buf + off, size - off, "%-16s", + off += kmem_scnprintf(buf + off, size - off, "%-16s", raidz_supp_impl[id]->name); } } else { ptrdiff_t id = cstat - raidz_impl_kstats; - off += snprintf(buf + off, size - off, "%-17s", + off += kmem_scnprintf(buf + off, size - off, "%-17s", raidz_supp_impl[id]->name); for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) - off += snprintf(buf + off, size - off, "%-16llu", + off += kmem_scnprintf(buf + off, size - off, "%-16llu", (u_longlong_t)cstat->gen[i]); for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) - off += snprintf(buf + off, size - off, "%-16llu", + off += kmem_scnprintf(buf + off, size - off, "%-16llu", (u_longlong_t)cstat->rec[i]); } - (void) snprintf(buf + off, size - off, "\n"); + (void) kmem_scnprintf(buf + off, size - off, "\n"); return (0); } diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c index 1ce578e228d8..1f56275c853b 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c +++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c @@ -22,6 +22,7 @@ * * Copyright (c) 2018, Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. */ #include <sys/vdev_impl.h> @@ -103,7 +104,7 @@ * Size of rebuild reads; defaults to 1MiB per data disk and is capped at * SPA_MAXBLOCKSIZE. */ -static unsigned long zfs_rebuild_max_segment = 1024 * 1024; +static uint64_t zfs_rebuild_max_segment = 1024 * 1024; /* * Maximum number of parallelly executed bytes per leaf vdev caused by a @@ -121,7 +122,7 @@ static unsigned long zfs_rebuild_max_segment = 1024 * 1024; * With a value of 32MB the sequential resilver write rate was measured at * 800MB/s sustained while rebuilding to a distributed spare. 
*/ -static unsigned long zfs_rebuild_vdev_limit = 32 << 20; +static uint64_t zfs_rebuild_vdev_limit = 32 << 20; /* * Automatically start a pool scrub when the last active sequential resilver @@ -134,6 +135,7 @@ static int zfs_rebuild_scrub_enabled = 1; * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). */ static __attribute__((noreturn)) void vdev_rebuild_thread(void *arg); +static void vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx); /* * Clear the per-vdev rebuild bytes value for a vdev tree. @@ -307,6 +309,17 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); + + /* + * Handle a second device failure if it occurs after all rebuild I/O + * has completed but before this sync task has been executed. + */ + if (vd->vdev_rebuild_reset_wanted) { + mutex_exit(&vd->vdev_rebuild_lock); + vdev_rebuild_reset_sync(arg, tx); + return; + } + vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; vrp->vrp_end_time = gethrestime_sec(); @@ -760,7 +773,6 @@ vdev_rebuild_thread(void *arg) ASSERT(vd->vdev_rebuilding); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); - ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; @@ -1138,10 +1150,10 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) return (error); } -ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, U64, ZMOD_RW, "Max segment size in bytes of rebuild reads"); -ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, U64, ZMOD_RW, "Max bytes in flight per leaf vdev for sequential resilvers"); ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c 
b/sys/contrib/openzfs/module/zfs/vdev_trim.c index 5905d9a07571..5b5076c8722c 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_trim.c +++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c @@ -1188,12 +1188,11 @@ vdev_autotrim_thread(void *arg) mutex_exit(&vd->vdev_autotrim_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; - uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; - while (!vdev_autotrim_should_stop(vd)) { int txgs_per_trim = MAX(zfs_trim_txg_batch, 1); boolean_t issued_trim = B_FALSE; + uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; + uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; /* * All of the metaslabs are divided in to groups of size diff --git a/sys/contrib/openzfs/module/zfs/zap_leaf.c b/sys/contrib/openzfs/module/zfs/zap_leaf.c index 25c2d5163a26..2e8489c7dfcf 100644 --- a/sys/contrib/openzfs/module/zfs/zap_leaf.c +++ b/sys/contrib/openzfs/module/zfs/zap_leaf.c @@ -646,7 +646,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, * form of the name. But all callers have one of these on hand anyway, * so might as well take advantage. A cleaner but slower interface * would accept neither argument, and compute the normalized name as - * needed (using zap_name_alloc(zap_entry_read_name(zeh))). + * needed (using zap_name_alloc_str(zap_entry_read_name(zeh))). 
*/ boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, @@ -667,7 +667,7 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, continue; if (zn == NULL) { - zn = zap_name_alloc(zap, name, MT_NORMALIZE); + zn = zap_name_alloc_str(zap, name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_leaf_array_match(zeh->zeh_leaf, zn, diff --git a/sys/contrib/openzfs/module/zfs/zap_micro.c b/sys/contrib/openzfs/module/zfs/zap_micro.c index 58a5c9f600b7..606f426404cc 100644 --- a/sys/contrib/openzfs/module/zfs/zap_micro.c +++ b/sys/contrib/openzfs/module/zfs/zap_micro.c @@ -33,7 +33,7 @@ #include <sys/zap.h> #include <sys/zap_impl.h> #include <sys/zap_leaf.h> -#include <sys/avl.h> +#include <sys/btree.h> #include <sys/arc.h> #include <sys/dmu_objset.h> @@ -92,7 +92,7 @@ zap_hash(zap_name_t *zn) wp++, i++) { uint64_t word = *wp; - for (int j = 0; j < zn->zn_key_intlen; j++) { + for (int j = 0; j < 8; j++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ word) & 0xFF]; word >>= NBBY; @@ -162,18 +162,25 @@ zap_match(zap_name_t *zn, const char *matchname) } } +static zap_name_t * +zap_name_alloc(zap_t *zap) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + zn->zn_zap = zap; + return (zn); +} + void zap_name_free(zap_name_t *zn) { kmem_free(zn, sizeof (zap_name_t)); } -zap_name_t * -zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) +static int +zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt) { - zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + zap_t *zap = zn->zn_zap; - zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = key; zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; @@ -194,17 +201,13 @@ zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) * what the hash is computed from. 
*/ if (zap_normalize(zap, key, zn->zn_normbuf, - zap->zap_normflags) != 0) { - zap_name_free(zn); - return (NULL); - } + zap->zap_normflags) != 0) + return (SET_ERROR(ENOTSUP)); zn->zn_key_norm = zn->zn_normbuf; zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { - if (mt != 0) { - zap_name_free(zn); - return (NULL); - } + if (mt != 0) + return (SET_ERROR(ENOTSUP)); zn->zn_key_norm = zn->zn_key_orig; zn->zn_key_norm_numints = zn->zn_key_orig_numints; } @@ -217,13 +220,22 @@ zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) * what the matching is based on. (Not the hash!) */ if (zap_normalize(zap, key, zn->zn_normbuf, - zn->zn_normflags) != 0) { - zap_name_free(zn); - return (NULL); - } + zn->zn_normflags) != 0) + return (SET_ERROR(ENOTSUP)); zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } + return (0); +} + +zap_name_t * +zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt) +{ + zap_name_t *zn = zap_name_alloc(zap); + if (zap_name_init_str(zn, key, mt) != 0) { + zap_name_free(zn); + return (NULL); + } return (zn); } @@ -277,45 +289,46 @@ mze_compare(const void *arg1, const void *arg2) const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; - int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash); - if (likely(cmp)) - return (cmp); - - return (TREE_CMP(mze1->mze_cd, mze2->mze_cd)); + return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd, + (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd)); } static void -mze_insert(zap_t *zap, int chunkid, uint64_t hash) +mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash) { + mzap_ent_t mze; + ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); - mze->mze_chunkid = chunkid; - mze->mze_hash = hash; - mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; - ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); - avl_add(&zap->zap_m.zap_avl, mze); + mze.mze_chunkid = chunkid; + ASSERT0(hash & 0xffffffff); + mze.mze_hash 
= hash >> 32; + ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff); + mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd; + ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0); + zfs_btree_add(&zap->zap_m.zap_tree, &mze); } static mzap_ent_t * -mze_find(zap_name_t *zn) +mze_find(zap_name_t *zn, zfs_btree_index_t *idx) { mzap_ent_t mze_tofind; mzap_ent_t *mze; - avl_index_t idx; - avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; + zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree; ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); - mze_tofind.mze_hash = zn->zn_hash; + ASSERT0(zn->zn_hash & 0xffffffff); + mze_tofind.mze_hash = zn->zn_hash >> 32; mze_tofind.mze_cd = 0; - mze = avl_find(avl, &mze_tofind, &idx); + mze = zfs_btree_find(tree, &mze_tofind, idx); if (mze == NULL) - mze = avl_nearest(avl, idx, AVL_AFTER); - for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { + mze = zfs_btree_next(tree, idx, idx); + for (; mze && mze->mze_hash == mze_tofind.mze_hash; + mze = zfs_btree_next(tree, idx, idx)) { ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); @@ -328,18 +341,21 @@ static uint32_t mze_find_unused_cd(zap_t *zap, uint64_t hash) { mzap_ent_t mze_tofind; - avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; + zfs_btree_index_t idx; + zfs_btree_t *tree = &zap->zap_m.zap_tree; ASSERT(zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT0(hash & 0xffffffff); + hash >>= 32; mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; uint32_t cd = 0; - for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx); - mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); + mze && mze->mze_hash == hash; + mze = zfs_btree_next(tree, &idx, &idx)) { if (mze->mze_cd != cd) break; cd++; @@ -364,16 +380,18 @@ mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) { zap_t *zap = zn->zn_zap; 
mzap_ent_t mze_tofind; - mzap_ent_t *mze; - avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; + zfs_btree_index_t idx; + zfs_btree_t *tree = &zap->zap_m.zap_tree; uint32_t mzap_ents = 0; + ASSERT0(hash & 0xffffffff); + hash >>= 32; mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; - for (mze = avl_find(avl, &mze_tofind, &idx); - mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); + mze && mze->mze_hash == hash; + mze = zfs_btree_next(tree, &idx, &idx)) { mzap_ents++; } @@ -384,24 +402,10 @@ mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) } static void -mze_remove(zap_t *zap, mzap_ent_t *mze) -{ - ASSERT(zap->zap_ismicro); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - avl_remove(&zap->zap_m.zap_avl, mze); - kmem_free(mze, sizeof (mzap_ent_t)); -} - -static void mze_destroy(zap_t *zap) { - mzap_ent_t *mze; - void *avlcookie = NULL; - - while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))) - kmem_free(mze, sizeof (mzap_ent_t)); - avl_destroy(&zap->zap_m.zap_avl); + zfs_btree_clear(&zap->zap_m.zap_tree); + zfs_btree_destroy(&zap->zap_m.zap_tree); } static zap_t * @@ -448,21 +452,26 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap->zap_salt = zap_m_phys(zap)->mz_salt; zap->zap_normflags = zap_m_phys(zap)->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; - avl_create(&zap->zap_m.zap_avl, mze_compare, - sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); - for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { + /* + * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove() + * overhead on massive inserts below. It still allows to store + * 62 entries before we have to add 2KB B-tree core node. 
+ */ + zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare, + sizeof (mzap_ent_t), 512); + + zap_name_t *zn = zap_name_alloc(zap); + for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { - zap_name_t *zn; - zap->zap_m.zap_num_entries++; - zn = zap_name_alloc(zap, mze->mze_name, 0); + zap_name_init_str(zn, mze->mze_name, 0); mze_insert(zap, i, zn->zn_hash); - zap_name_free(zn); } } + zap_name_free(zn); } else { zap->zap_salt = zap_f_phys(zap)->zap_salt; zap->zap_normflags = zap_f_phys(zap)->zap_normflags; @@ -657,24 +666,25 @@ mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags) dprintf("upgrading obj=%llu with %u chunks\n", (u_longlong_t)zap->zap_object, nchunks); - /* XXX destroy the avl later, so we can use the stored hash value */ + /* XXX destroy the tree later, so we can use the stored hash value */ mze_destroy(zap); fzap_upgrade(zap, tx, flags); + zap_name_t *zn = zap_name_alloc(zap); for (int i = 0; i < nchunks; i++) { mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, (u_longlong_t)mze->mze_value); - zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0); + zap_name_init_str(zn, mze->mze_name, 0); /* If we fail here, we would end up losing entries */ VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tag, tx)); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ - zap_name_free(zn); } + zap_name_free(zn); vmem_free(mzp, sz); *zapp = zap; return (0); @@ -916,22 +926,23 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) * See also the comment above zap_entry_normalization_conflict(). 
*/ static boolean_t -mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) +mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze, + zfs_btree_index_t *idx) { - int direction = AVL_BEFORE; boolean_t allocdzn = B_FALSE; + mzap_ent_t *other; + zfs_btree_index_t oidx; if (zap->zap_normflags == 0) return (B_FALSE); -again: - for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction); + for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx); other && other->mze_hash == mze->mze_hash; - other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { + other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) { if (zn == NULL) { - zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, - MT_NORMALIZE); + zn = zap_name_alloc_str(zap, + MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { @@ -941,9 +952,20 @@ again: } } - if (direction == AVL_BEFORE) { - direction = AVL_AFTER; - goto again; + for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx); + other && other->mze_hash == mze->mze_hash; + other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) { + + if (zn == NULL) { + zn = zap_name_alloc_str(zap, + MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); + allocdzn = B_TRUE; + } + if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } } if (allocdzn) @@ -971,7 +993,7 @@ zap_lookup_impl(zap_t *zap, const char *name, { int err = 0; - zap_name_t *zn = zap_name_alloc(zap, name, mt); + zap_name_t *zn = zap_name_alloc_str(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); @@ -979,7 +1001,8 @@ zap_lookup_impl(zap_t *zap, const char *name, err = fzap_lookup(zn, integer_size, num_integers, buf, realname, rn_len, ncp); } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { @@ -990,11 +1013,13 
@@ zap_lookup_impl(zap_t *zap, const char *name, } else { *(uint64_t *)buf = MZE_PHYS(zap, mze)->mze_value; - (void) strlcpy(realname, - MZE_PHYS(zap, mze)->mze_name, rn_len); + if (realname != NULL) + (void) strlcpy(realname, + MZE_PHYS(zap, mze)->mze_name, + rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, - zn, mze); + zn, mze, &idx); } } } @@ -1031,7 +1056,7 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); - zn = zap_name_alloc(zap, name, 0); + zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1134,7 +1159,7 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc(zap, name, 0); + zap_name_t *zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1142,7 +1167,8 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, if (!zap->zap_ismicro) { err = fzap_length(zn, integer_size, num_integers); } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { @@ -1182,7 +1208,7 @@ static void mzap_addent(zap_name_t *zn, uint64_t value) { zap_t *zap = zn->zn_zap; - int start = zap->zap_m.zap_alloc_next; + uint16_t start = zap->zap_m.zap_alloc_next; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -1198,7 +1224,7 @@ mzap_addent(zap_name_t *zn, uint64_t value) ASSERT(cd < zap_maxcd(zap)); again: - for (int i = start; i < zap->zap_m.zap_num_chunks; i++) { + for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0] == 0) { mze->mze_value = value; @@ -1229,7 +1255,7 @@ zap_add_impl(zap_t *zap, const char *key, const 
uint64_t *intval = val; int err = 0; - zap_name_t *zn = zap_name_alloc(zap, key, 0); + zap_name_t *zn = zap_name_alloc_str(zap, key, 0); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); @@ -1247,7 +1273,8 @@ zap_add_impl(zap_t *zap, const char *key, } zap = zn->zn_zap; /* fzap_add() may change zap */ } else { - if (mze_find(zn) != NULL) { + zfs_btree_index_t idx; + if (mze_find(zn, &idx) != NULL) { err = SET_ERROR(EEXIST); } else { mzap_addent(zn, *intval); @@ -1327,7 +1354,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc(zap, name, 0); + zap_name_t *zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1348,7 +1375,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, } zap = zn->zn_zap; /* fzap_update() may change zap */ } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze != NULL) { MZE_PHYS(zap, mze)->mze_value = *intval; } else { @@ -1398,20 +1426,20 @@ zap_remove_impl(zap_t *zap, const char *name, { int err = 0; - zap_name_t *zn = zap_name_alloc(zap, name, mt); + zap_name_t *zn = zap_name_alloc_str(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { zap->zap_m.zap_num_entries--; - memset(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], 0, - sizeof (mzap_ent_phys_t)); - mze_remove(zap, mze); + memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t)); + zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx); } } zap_name_free(zn); @@ -1582,29 +1610,30 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) if (!zc->zc_zap->zap_ismicro) { err = 
fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { - avl_index_t idx; + zfs_btree_index_t idx; mzap_ent_t mze_tofind; - mze_tofind.mze_hash = zc->zc_hash; + mze_tofind.mze_hash = zc->zc_hash >> 32; mze_tofind.mze_cd = zc->zc_cd; - mzap_ent_t *mze = - avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); + mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree, + &mze_tofind, &idx); if (mze == NULL) { - mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, - idx, AVL_AFTER); + mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree, + &idx, &idx); } if (mze) { mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = - mzap_normalization_conflict(zc->zc_zap, NULL, mze); + mzap_normalization_conflict(zc->zc_zap, NULL, + mze, &idx); za->za_integer_length = 8; za->za_num_integers = 1; za->za_first_integer = mzep->mze_value; (void) strlcpy(za->za_name, mzep->mze_name, sizeof (za->za_name)); - zc->zc_hash = mze->mze_hash; + zc->zc_hash = (uint64_t)mze->mze_hash << 32; zc->zc_cd = mze->mze_cd; err = 0; } else { diff --git a/sys/contrib/openzfs/module/zfs/zcp.c b/sys/contrib/openzfs/module/zfs/zcp.c index fe90242ca40d..5ebf1bbbc8cc 100644 --- a/sys/contrib/openzfs/module/zfs/zcp.c +++ b/sys/contrib/openzfs/module/zfs/zcp.c @@ -109,8 +109,8 @@ #define ZCP_NVLIST_MAX_DEPTH 20 static const uint64_t zfs_lua_check_instrlimit_interval = 100; -unsigned long zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT; -unsigned long zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT; +uint64_t zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT; +uint64_t zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT; /* * Forward declarations for mutually recursive functions @@ -277,9 +277,9 @@ zcp_table_to_nvlist(lua_State *state, int index, int depth) } break; case LUA_TNUMBER: - VERIFY3U(sizeof (buf), >, - snprintf(buf, sizeof (buf), "%lld", - (longlong_t)lua_tonumber(state, -2))); + (void) snprintf(buf, sizeof (buf), "%lld", + (longlong_t)lua_tonumber(state, -2)); + key = 
buf; if (saw_str_could_collide) { key_could_collide = B_TRUE; @@ -1443,8 +1443,8 @@ zcp_parse_args(lua_State *state, const char *fname, const zcp_arg_t *pargs, } } -ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_instrlimit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_instrlimit, U64, ZMOD_RW, "Max instruction limit that can be specified for a channel program"); -ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_memlimit, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_memlimit, U64, ZMOD_RW, "Max memory limit that can be specified for a channel program"); diff --git a/sys/contrib/openzfs/module/zfs/zcp_get.c b/sys/contrib/openzfs/module/zfs/zcp_get.c index cd17374eb422..f28266b8095f 100644 --- a/sys/contrib/openzfs/module/zfs/zcp_get.c +++ b/sys/contrib/openzfs/module/zfs/zcp_get.c @@ -467,7 +467,8 @@ get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) } else { error = dsl_prop_get_ds(ds, prop_name, sizeof (numval), 1, &numval, setpoint); - + if (error != 0) + goto out; #ifdef _KERNEL /* Fill in temporary value for prop, if applicable */ (void) zfs_get_temporary_prop(ds, zfs_prop, &numval, setpoint); @@ -489,6 +490,7 @@ get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop) (void) lua_pushnumber(state, numval); } } +out: kmem_free(strval, ZAP_MAXVALUELEN); if (error == 0) get_prop_src(state, setpoint, zfs_prop); diff --git a/sys/contrib/openzfs/module/zfs/zfs_chksum.c b/sys/contrib/openzfs/module/zfs/zfs_chksum.c index 74b4cb8d2e63..4a9a36d87e66 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_chksum.c +++ b/sys/contrib/openzfs/module/zfs/zfs_chksum.c @@ -81,15 +81,15 @@ chksum_kstat_headers(char *buf, size_t size) { ssize_t off = 0; - off += snprintf(buf + off, size, "%-23s", "implementation"); - off += snprintf(buf + off, size - off, "%8s", "1k"); - off += snprintf(buf + off, size - off, "%8s", "4k"); - off += snprintf(buf + off, size - off, "%8s", "16k"); - off += snprintf(buf + off, size - off, "%8s", "64k"); - off += 
snprintf(buf + off, size - off, "%8s", "256k"); - off += snprintf(buf + off, size - off, "%8s", "1m"); - off += snprintf(buf + off, size - off, "%8s", "4m"); - (void) snprintf(buf + off, size - off, "%8s\n", "16m"); + off += kmem_scnprintf(buf + off, size, "%-23s", "implementation"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "1k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "4k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "16k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "64k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "256k"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "1m"); + off += kmem_scnprintf(buf + off, size - off, "%8s", "4m"); + (void) kmem_scnprintf(buf + off, size - off, "%8s\n", "16m"); return (0); } @@ -102,23 +102,23 @@ chksum_kstat_data(char *buf, size_t size, void *data) char b[24]; cs = (chksum_stat_t *)data; - snprintf(b, 23, "%s-%s", cs->name, cs->impl); - off += snprintf(buf + off, size - off, "%-23s", b); - off += snprintf(buf + off, size - off, "%8llu", + kmem_scnprintf(b, 23, "%s-%s", cs->name, cs->impl); + off += kmem_scnprintf(buf + off, size - off, "%-23s", b); + off += kmem_scnprintf(buf + off, size - off, "%8llu", (u_longlong_t)cs->bs1k); - off += snprintf(buf + off, size - off, "%8llu", + off += kmem_scnprintf(buf + off, size - off, "%8llu", (u_longlong_t)cs->bs4k); - off += snprintf(buf + off, size - off, "%8llu", + off += kmem_scnprintf(buf + off, size - off, "%8llu", (u_longlong_t)cs->bs16k); - off += snprintf(buf + off, size - off, "%8llu", + off += kmem_scnprintf(buf + off, size - off, "%8llu", (u_longlong_t)cs->bs64k); - off += snprintf(buf + off, size - off, "%8llu", + off += kmem_scnprintf(buf + off, size - off, "%8llu", (u_longlong_t)cs->bs256k); - off += snprintf(buf + off, size - off, "%8llu", + off += kmem_scnprintf(buf + off, size - off, "%8llu", (u_longlong_t)cs->bs1m); - off += snprintf(buf + off, size - off, "%8llu", + off += kmem_scnprintf(buf + 
off, size - off, "%8llu", (u_longlong_t)cs->bs4m); - (void) snprintf(buf + off, size - off, "%8llu\n", + (void) kmem_scnprintf(buf + off, size - off, "%8llu\n", (u_longlong_t)cs->bs16m); return (0); diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c index 06aa1214ace8..fd0dc7d69bf8 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_fm.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c @@ -253,7 +253,6 @@ void zfs_ereport_clear(spa_t *spa, vdev_t *vd) { uint64_t vdev_guid, pool_guid; - int cnt = 0; ASSERT(vd != NULL || spa != NULL); if (vd == NULL) { @@ -277,7 +276,6 @@ zfs_ereport_clear(spa_t *spa, vdev_t *vd) avl_remove(&recent_events_tree, entry); list_remove(&recent_events_list, entry); kmem_free(entry, sizeof (*entry)); - cnt++; } } diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index c3266c09306b..a5168b937588 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -229,14 +229,14 @@ static zfsdev_state_t *zfsdev_state_list; * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. * Defaults to 0=auto which is handled by platform code. */ -unsigned long zfs_max_nvlist_src_size = 0; +uint64_t zfs_max_nvlist_src_size = 0; /* * When logging the output nvlist of an ioctl in the on-disk history, limit * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS. * This applies primarily to zfs_ioc_channel_program(). 
*/ -static unsigned long zfs_history_output_max = 1024 * 1024; +static uint64_t zfs_history_output_max = 1024 * 1024; uint_t zfs_fsyncer_key; uint_t zfs_allow_log_key; @@ -7884,8 +7884,8 @@ zfs_kmod_fini(void) tsd_destroy(&zfs_allow_log_key); } -ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, U64, ZMOD_RW, "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); -ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, U64, ZMOD_RW, "Maximum size in bytes of ZFS ioctl output that will be logged"); diff --git a/sys/contrib/openzfs/module/zfs/zfs_log.c b/sys/contrib/openzfs/module/zfs/zfs_log.c index c92044337bce..77bf9140d52d 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_log.c +++ b/sys/contrib/openzfs/module/zfs/zfs_log.c @@ -494,6 +494,29 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, zil_itx_assign(zilog, itx, tx); } +static void +do_zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, + const char *sname, znode_t *tdzp, const char *dname, znode_t *szp) +{ + itx_t *itx; + lr_rename_t *lr; + size_t snamesize = strlen(sname) + 1; + size_t dnamesize = strlen(dname) + 1; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); + lr = (lr_rename_t *)&itx->itx_lr; + lr->lr_sdoid = sdzp->z_id; + lr->lr_tdoid = tdzp->z_id; + memcpy((char *)(lr + 1), sname, snamesize); + memcpy((char *)(lr + 1) + snamesize, dname, dnamesize); + itx->itx_oid = szp->z_id; + + zil_itx_assign(zilog, itx, tx); +} + /* * Handles TX_RENAME transactions. 
*/ @@ -501,18 +524,71 @@ void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, znode_t *szp) { + txtype |= TX_RENAME; + do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); +} + +/* + * Handles TX_RENAME_EXCHANGE transactions. + */ +void +zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, + znode_t *szp) +{ + txtype |= TX_RENAME_EXCHANGE; + do_zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); +} + +/* + * Handles TX_RENAME_WHITEOUT transactions. + * + * Unfortunately we cannot reuse do_zfs_log_rename because we need to call + * zfs_mknode() on replay which requires stashing bits as with TX_CREATE. + */ +void +zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname, + znode_t *szp, znode_t *wzp) +{ itx_t *itx; - lr_rename_t *lr; + lr_rename_whiteout_t *lr; size_t snamesize = strlen(sname) + 1; size_t dnamesize = strlen(dname) + 1; if (zil_replaying(zilog, tx)) return; + txtype |= TX_RENAME_WHITEOUT; itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); - lr = (lr_rename_t *)&itx->itx_lr; - lr->lr_sdoid = sdzp->z_id; - lr->lr_tdoid = tdzp->z_id; + lr = (lr_rename_whiteout_t *)&itx->itx_lr; + lr->lr_rename.lr_sdoid = sdzp->z_id; + lr->lr_rename.lr_tdoid = tdzp->z_id; + + /* + * RENAME_WHITEOUT will create an entry at the source znode, so we need + * to store the same data that the equivalent call to zfs_log_create() + * would. 
+ */ + lr->lr_wfoid = wzp->z_id; + LR_FOID_SET_SLOTS(lr->lr_wfoid, wzp->z_dnodesize >> DNODE_SHIFT); + (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(wzp)), &lr->lr_wgen, + sizeof (uint64_t)); + (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(wzp)), + lr->lr_wcrtime, sizeof (uint64_t) * 2); + lr->lr_wmode = wzp->z_mode; + lr->lr_wuid = (uint64_t)KUID_TO_SUID(ZTOUID(wzp)); + lr->lr_wgid = (uint64_t)KGID_TO_SGID(ZTOGID(wzp)); + + /* + * This rdev will always be makedev(0, 0) but because the ZIL log and + * replay code needs to be platform independent (and there is no + * platform independent makedev()) we need to copy the one created + * during the rename operation. + */ + (void) sa_lookup(wzp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(wzp)), &lr->lr_wrdev, + sizeof (lr->lr_wrdev)); + memcpy((char *)(lr + 1), sname, snamesize); memcpy((char *)(lr + 1) + snamesize, dname, dnamesize); itx->itx_oid = szp->z_id; @@ -525,7 +601,7 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, * called as soon as the write is on stable storage (be it via a DMU sync or a * ZIL commit). 
*/ -static long zfs_immediate_write_sz = 32768; +static int64_t zfs_immediate_write_sz = 32768; void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, @@ -815,5 +891,5 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, zil_itx_assign(zilog, itx, tx); } -ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, LONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, S64, ZMOD_RW, "Largest data block to write to zil"); diff --git a/sys/contrib/openzfs/module/zfs/zfs_onexit.c b/sys/contrib/openzfs/module/zfs/zfs_onexit.c index dfcdeeb5b46f..63acf7ab2e4d 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_onexit.c +++ b/sys/contrib/openzfs/module/zfs/zfs_onexit.c @@ -151,7 +151,7 @@ zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) */ int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, - uint64_t *action_handle) + uintptr_t *action_handle) { zfs_onexit_t *zo; zfs_onexit_action_node_t *ap; @@ -170,7 +170,7 @@ zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, list_insert_tail(&zo->zo_actions, ap); mutex_exit(&zo->zo_lock); if (action_handle) - *action_handle = (uint64_t)(uintptr_t)ap; + *action_handle = (uintptr_t)ap; return (0); } diff --git a/sys/contrib/openzfs/module/zfs/zfs_replay.c b/sys/contrib/openzfs/module/zfs/zfs_replay.c index 379e1d1a7b57..0293e46d5858 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_replay.c +++ b/sys/contrib/openzfs/module/zfs/zfs_replay.c @@ -386,8 +386,13 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) lr->lr_uid, lr->lr_gid); } +#if defined(__linux__) error = zfs_create(dzp, name, &xva.xva_vattr, - 0, 0, &zp, kcred, vflg, &vsec); + 0, 0, &zp, kcred, vflg, &vsec, kcred->user_ns); +#else + error = zfs_create(dzp, name, &xva.xva_vattr, + 0, 0, &zp, kcred, vflg, &vsec, NULL); +#endif break; case TX_MKDIR_ACL: aclstart = (caddr_t)(lracl + 1); @@ -416,8 +421,13 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) (void *)&name, lracl->lr_fuidcnt, 
lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); } +#if defined(__linux__) + error = zfs_mkdir(dzp, name, &xva.xva_vattr, + &zp, kcred, vflg, &vsec, kcred->user_ns); +#else error = zfs_mkdir(dzp, name, &xva.xva_vattr, - &zp, kcred, vflg, &vsec); + &zp, kcred, vflg, &vsec, NULL); +#endif break; default: error = SET_ERROR(ENOTSUP); @@ -527,8 +537,13 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) if (name == NULL) name = (char *)start; +#if defined(__linux__) + error = zfs_create(dzp, name, &xva.xva_vattr, + 0, 0, &zp, kcred, vflg, NULL, kcred->user_ns); +#else error = zfs_create(dzp, name, &xva.xva_vattr, - 0, 0, &zp, kcred, vflg, NULL); + 0, 0, &zp, kcred, vflg, NULL, NULL); +#endif break; case TX_MKDIR_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); @@ -545,8 +560,14 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) if (name == NULL) name = (char *)(lr + 1); +#if defined(__linux__) + error = zfs_mkdir(dzp, name, &xva.xva_vattr, + &zp, kcred, vflg, NULL, kcred->user_ns); +#else error = zfs_mkdir(dzp, name, &xva.xva_vattr, - &zp, kcred, vflg, NULL); + &zp, kcred, vflg, NULL, NULL); +#endif + break; case TX_MKXATTR: error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &zp, kcred); @@ -554,8 +575,13 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) case TX_SYMLINK: name = (char *)(lr + 1); link = name + strlen(name) + 1; +#if defined(__linux__) + error = zfs_symlink(dzp, name, &xva.xva_vattr, + link, &zp, kcred, vflg, kcred->user_ns); +#else error = zfs_symlink(dzp, name, &xva.xva_vattr, - link, &zp, kcred, vflg); + link, &zp, kcred, vflg, NULL); +#endif break; default: error = SET_ERROR(ENOTSUP); @@ -643,18 +669,21 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) } static int -zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) +do_zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, char *sname, + char *tname, uint64_t rflags, vattr_t *wo_vap) { - zfsvfs_t *zfsvfs = arg1; - lr_rename_t *lr = arg2; - char 
*sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; znode_t *sdzp, *tdzp; - int error; - int vflg = 0; + int error, vflg = 0; - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); + /* Only Linux currently supports RENAME_* flags. */ +#ifdef __linux__ + VERIFY0(rflags & ~(RENAME_EXCHANGE | RENAME_WHITEOUT)); + + /* wo_vap must be non-NULL iff. we're doing RENAME_WHITEOUT */ + VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); +#else + VERIFY0(rflags); +#endif if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) return (error); @@ -667,7 +696,13 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; - error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg); +#if defined(__linux__) + error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags, + wo_vap, kcred->user_ns); +#else + error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags, + wo_vap, NULL); +#endif zrele(tdzp); zrele(sdzp); @@ -675,6 +710,86 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) } static int +zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_rename_t *lr = arg2; + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL)); +} + +static int +zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap) +{ +#ifdef __linux__ + zfsvfs_t *zfsvfs = arg1; + lr_rename_t *lr = arg2; + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE, + NULL)); +#else + return (SET_ERROR(ENOTSUP)); +#endif +} + 
+static int +zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap) +{ +#ifdef __linux__ + zfsvfs_t *zfsvfs = arg1; + lr_rename_whiteout_t *lr = arg2; + int error; + /* sname and tname follow lr_rename_whiteout_t */ + char *sname = (char *)(lr + 1); + char *tname = sname + strlen(sname) + 1; + /* For the whiteout file. */ + xvattr_t xva; + uint64_t objid; + uint64_t dnodesize; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + objid = LR_FOID_GET_OBJ(lr->lr_wfoid); + dnodesize = LR_FOID_GET_SLOTS(lr->lr_wfoid) << DNODE_SHIFT; + + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, + lr->lr_wmode, lr->lr_wuid, lr->lr_wgid, lr->lr_wrdev, objid); + + /* + * As with TX_CREATE, RENAME_WHITEOUT ends up in zfs_mknode(), which + * assigns the object's creation time, generation number, and dnode + * slot count. The generic zfs_rename() has no concept of these + * attributes, so we smuggle the values inside the vattr's otherwise + * unused va_ctime, va_nblocks, and va_fsid fields. 
+ */ + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_wcrtime); + xva.xva_vattr.va_nblocks = lr->lr_wgen; + xva.xva_vattr.va_fsid = dnodesize; + + error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); + if (error) + return (error); + + return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname, + RENAME_WHITEOUT, &xva.xva_vattr)); +#else + return (SET_ERROR(ENOTSUP)); +#endif +} + +static int zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; @@ -860,7 +975,11 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); - error = zfs_setattr(zp, vap, 0, kcred); +#if defined(__linux__) + error = zfs_setattr(zp, vap, 0, kcred, kcred->user_ns); +#else + error = zfs_setattr(zp, vap, 0, kcred, NULL); +#endif zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; @@ -1069,4 +1188,6 @@ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ zfs_replay_write2, /* TX_WRITE2 */ zfs_replay_setsaxattr, /* TX_SETSAXATTR */ + zfs_replay_rename_exchange, /* TX_RENAME_EXCHANGE */ + zfs_replay_rename_whiteout, /* TX_RENAME_WHITEOUT */ }; diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index 57f03f116273..45ecb0773260 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -64,7 +64,7 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) int error = 0; zfsvfs_t *zfsvfs = ZTOZSB(zp); - (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + (void) tsd_set(zfs_fsyncer_key, (void *)(uintptr_t)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) @@ -168,15 +168,25 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) return (error); if (flag & V_ACE_MASK) - error = 
zfs_zaccess(zp, mode, flag, B_FALSE, cr); +#if defined(__linux__) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, + kcred->user_ns); +#else + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, + NULL); +#endif else - error = zfs_zaccess_rwx(zp, mode, flag, cr); +#if defined(__linux__) + error = zfs_zaccess_rwx(zp, mode, flag, cr, kcred->user_ns); +#else + error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL); +#endif zfs_exit(zfsvfs, FTAG); return (error); } -static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ +static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. @@ -991,5 +1001,5 @@ EXPORT_SYMBOL(zfs_write); EXPORT_SYMBOL(zfs_getsecattr); EXPORT_SYMBOL(zfs_setsecattr); -ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, "Bytes to read per chunk"); diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index dc5b8018e16e..02e6f4b83b9c 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -132,7 +132,7 @@ static int zil_nocacheflush = 0; * Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. 
*/ -static unsigned long zil_slog_bulk = 768 * 1024; +static uint64_t zil_slog_bulk = 768 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; @@ -237,7 +237,7 @@ static int zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, blkptr_t *nbp, void *dst, char **end) { - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; @@ -315,7 +315,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, static int zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { - enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; @@ -339,6 +339,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (wbuf == NULL) zio_flags |= ZIO_FLAG_RAW; + ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); @@ -479,8 +480,18 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, lrbuf, &end); - if (error != 0) + if (error != 0) { + if (claimed) { + char name[ZFS_MAX_DATASET_NAME_LEN]; + + dmu_objset_name(zilog->zl_os, name); + + cmn_err(CE_WARN, "ZFS read log block error %d, " + "dataset %s, seq 0x%llx\n", error, name, + (u_longlong_t)blk_seq); + } break; + } for (lrp = lrbuf; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; @@ -504,10 +515,6 @@ done: zilog->zl_parse_blk_count = blk_count; zilog->zl_parse_lr_count = lr_count; - ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || - (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq) || - (decrypt && error == EIO)); - zil_bp_tree_fini(zilog); zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); @@ -758,11 +765,9 @@ 
zil_commit_activate_saxattr_feature(zilog_t *zilog) uint64_t txg = 0; dmu_tx_t *tx = NULL; - if (spa_feature_is_enabled(zilog->zl_spa, - SPA_FEATURE_ZILSAXATTR) && + if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL && - !dsl_dataset_feature_is_active(ds, - SPA_FEATURE_ZILSAXATTR)) { + !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(ds, tx); @@ -882,8 +887,9 @@ zil_create(zilog_t *zilog) * txg_wait_synced() here either when keep_first is set, because both * zil_create() and zil_destroy() will wait for any in-progress destroys * to complete. + * Return B_TRUE if there were any entries to replay. */ -void +boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first) { const zil_header_t *zh = zilog->zl_header; @@ -899,7 +905,7 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) zilog->zl_old_header = *zh; /* debugging aid */ if (BP_IS_HOLE(&zh->zh_log)) - return; + return (B_FALSE); tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); @@ -932,6 +938,8 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); + + return (B_TRUE); } void @@ -3844,8 +3852,9 @@ zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) /* * If this dataset has a non-empty intent log, replay it and destroy it. + * Return B_TRUE if there were any entries to replay. 
*/ -void +boolean_t zil_replay(objset_t *os, void *arg, zil_replay_func_t *const replay_func[TX_MAX_TYPE]) { @@ -3854,8 +3863,7 @@ zil_replay(objset_t *os, void *arg, zil_replay_arg_t zr; if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { - zil_destroy(zilog, B_TRUE); - return; + return (zil_destroy(zilog, B_TRUE)); } zr.zr_replay = replay_func; @@ -3878,6 +3886,8 @@ zil_replay(objset_t *os, void *arg, zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_replay = B_FALSE; + + return (B_TRUE); } boolean_t @@ -3945,7 +3955,7 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW, "Disable ZIL cache flushes"); -ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, ULONG, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW, "Limit in bytes slog sync writes per commit"); ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index c2e3c6169fa3..928e28813931 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -512,8 +512,9 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) /* * If this is an authenticated block, just check the MAC. It would be - * nice to separate this out into its own flag, but for the moment - * enum zio_flag is out of bits. + * nice to separate this out into its own flag, but when this was done, + * we had run out of bits in what is now zio_flag_t. Future cleanup + * could make this a flag bit. 
*/ if (BP_IS_AUTHENTICATED(bp)) { if (ot == DMU_OT_OBJSET) { @@ -802,7 +803,7 @@ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, - enum zio_flag flags, vdev_t *vd, uint64_t offset, + zio_flag_t flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { @@ -901,7 +902,7 @@ zio_destroy(zio_t *zio) zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, - void *private, enum zio_flag flags) + void *private, zio_flag_t flags) { zio_t *zio; @@ -913,7 +914,7 @@ zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, } zio_t * -zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) +zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags) { return (zio_null(NULL, spa, NULL, done, private, flags)); } @@ -1099,7 +1100,7 @@ zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) + zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -1117,7 +1118,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, - void *private, zio_priority_t priority, enum zio_flag flags, + void *private, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -1160,7 +1161,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, 
void *private, - zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) + zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb) { zio_t *zio; @@ -1203,7 +1204,6 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) */ if (BP_IS_EMBEDDED(bp)) return; - metaslab_check_free(spa, bp); /* * Frees that are for the currently-syncing txg, are not going to be @@ -1220,6 +1220,7 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) txg != spa->spa_syncing_txg || (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) { + metaslab_check_free(spa, bp); bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); @@ -1233,7 +1234,7 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) */ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - enum zio_flag flags) + zio_flag_t flags) { ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); @@ -1266,7 +1267,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - zio_done_func_t *done, void *private, enum zio_flag flags) + zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; @@ -1303,7 +1304,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, enum zio_flag flags) + zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; int c; @@ -1328,7 +1329,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_t * zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, - enum zio_flag flags, enum trim_flag trim_flags) + zio_flag_t flags, enum trim_flag trim_flags) { zio_t *zio; @@ -1348,7 +1349,7 @@ zio_trim(zio_t *pio, vdev_t *vd, 
uint64_t offset, uint64_t size, zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, boolean_t labels) + zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; @@ -1369,7 +1370,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, - zio_priority_t priority, enum zio_flag flags, boolean_t labels) + zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; @@ -1406,7 +1407,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *private) + zio_flag_t flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; @@ -1480,7 +1481,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, - zio_type_t type, zio_priority_t priority, enum zio_flag flags, + zio_type_t type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { zio_t *zio; @@ -2030,7 +2031,7 @@ zio_deadman_impl(zio_t *pio, int ziodepth) "delta=%llu queued=%llu io=%llu " "path=%s " "last=%llu type=%d " - "priority=%d flags=0x%x stage=0x%x " + "priority=%d flags=0x%llx stage=0x%x " "pipeline=0x%x pipeline-trace=0x%x " "objset=%llu object=%llu " "level=%llu blkid=%llu " @@ -2040,8 +2041,8 @@ zio_deadman_impl(zio_t *pio, int ziodepth) (u_longlong_t)delta, pio->io_delta, pio->io_delay, vd ? vd->vdev_path : "NULL", vq ? 
vq->vq_io_complete_ts : 0, pio->io_type, - pio->io_priority, pio->io_flags, pio->io_stage, - pio->io_pipeline, pio->io_pipeline_trace, + pio->io_priority, (u_longlong_t)pio->io_flags, + pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size, @@ -3360,7 +3361,7 @@ zio_ddt_write(zio_t *zio) return (zio); } -ddt_entry_t *freedde; /* for debugging */ +static ddt_entry_t *freedde; /* for debugging */ static zio_t * zio_ddt_free(zio_t *zio) diff --git a/sys/contrib/openzfs/module/zfs/zio_compress.c b/sys/contrib/openzfs/module/zfs/zio_compress.c index 4c9cbc962093..0fb91ac81522 100644 --- a/sys/contrib/openzfs/module/zfs/zio_compress.c +++ b/sys/contrib/openzfs/module/zfs/zio_compress.c @@ -44,7 +44,7 @@ * If nonzero, every 1/X decompression attempts will fail, simulating * an undetected memory error. */ -unsigned long zio_decompress_fail_fraction = 0; +static unsigned long zio_decompress_fail_fraction = 0; /* * Compression vectors. diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index 2e2860ff0212..20578a8223b2 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -514,6 +514,8 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ zvol_replay_err, /* TX_SETSAXATTR */ + zvol_replay_err, /* TX_RENAME_EXCHANGE */ + zvol_replay_err, /* TX_RENAME_WHITEOUT */ }; /* @@ -1026,8 +1028,7 @@ zvol_add_clones(const char *dsname, list_t *minors_list) out: if (dd != NULL) dsl_dir_rele(dd, FTAG); - if (dp != NULL) - dsl_pool_rele(dp, FTAG); + dsl_pool_rele(dp, FTAG); } /* |