Diffstat (limited to 'sys/contrib/openzfs/module')
96 files changed, 8901 insertions, 4922 deletions
diff --git a/sys/contrib/openzfs/module/Makefile.bsd b/sys/contrib/openzfs/module/Makefile.bsd index 4feb9e1eaf0c..e7cddcc5bb5e 100644 --- a/sys/contrib/openzfs/module/Makefile.bsd +++ b/sys/contrib/openzfs/module/Makefile.bsd @@ -24,7 +24,6 @@ KMOD= openzfs CFLAGS+= -I${.OBJDIR:H}/include CFLAGS+= -I${INCDIR} -CFLAGS+= -I${INCDIR}/spl CFLAGS+= -I${INCDIR}/os/freebsd CFLAGS+= -I${INCDIR}/os/freebsd/spl CFLAGS+= -I${INCDIR}/os/freebsd/zfs @@ -40,7 +39,13 @@ CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_SSS .endif .if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true" -CFLAGS+= -DINVARIANTS -DWITNESS -g -O0 -DZFS_DEBUG -DOPENSOLARIS_WITNESS +CFLAGS+= -DZFS_DEBUG -g +.if defined(WITH_INVARIANTS) && ${WITH_INVARIANTS} == "true" + CFLAGS+= -DINVARIANTS -DWITNESS -DOPENSOLARIS_WITNESS +.endif +.if defined(WITH_O0) && ${WITH_O0} == "true" + CFLAGS+= -O0 +.endif .else CFLAGS += -DNDEBUG .endif @@ -102,9 +107,10 @@ SRCS+= nvpair.c \ #os/freebsd/spl SRCS+= acl_common.c \ - btree.c \ callb.c \ list.c \ + sha256c.c \ + sha512c.c \ spl_acl.c \ spl_cmn_err.c \ spl_dtrace.c \ @@ -112,6 +118,7 @@ SRCS+= acl_common.c \ spl_kstat.c \ spl_misc.c \ spl_policy.c \ + spl_procfs_list.c \ spl_string.c \ spl_sunddi.c \ spl_sysevent.c \ @@ -119,11 +126,8 @@ SRCS+= acl_common.c \ spl_uio.c \ spl_vfs.c \ spl_vm.c \ - spl_zone.c \ - sha256c.c \ - sha512c.c \ - spl_procfs_list.c \ - spl_zlib.c + spl_zlib.c \ + spl_zone.c .if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ @@ -133,6 +137,7 @@ SRCS+= spl_atomic.c #os/freebsd/zfs SRCS+= abd_os.c \ + arc_os.c \ crypto_os.c \ dmu_os.c \ hkdf.c \ @@ -140,17 +145,16 @@ SRCS+= abd_os.c \ spa_os.c \ sysctl_os.c \ vdev_file.c \ - vdev_label_os.c \ vdev_geom.c \ + vdev_label_os.c \ zfs_acl.c \ zfs_ctldir.c \ + zfs_debug.c \ zfs_dir.c \ zfs_ioctl_compat.c \ zfs_ioctl_os.c \ - zfs_log.c \ - zfs_replay.c \ zfs_vfsops.c \ - zfs_vnops.c \ + zfs_vnops_os.c \ zfs_znode.c \ zio_crypt.c \ zvol_os.c @@ -178,10 +182,10 @@ SRCS+= zfeature_common.c \ SRCS+= abd.c \ aggsum.c \ arc.c \ - arc_os.c \ blkptr.c \ bplist.c \ bpobj.c \ + btree.c \ cityhash.c \ dbuf.c \ dbuf_stats.c \ @@ -245,6 +249,8 @@ SRCS+= abd.c \ unique.c \ vdev.c \ vdev_cache.c \ + vdev_draid.c \ + vdev_draid_rand.c \ vdev_indirect.c \ vdev_indirect_births.c \ vdev_indirect_mapping.c \ @@ -276,16 +282,18 @@ SRCS+= abd.c \ zcp_synctask.c \ zfeature.c \ zfs_byteswap.c \ - zfs_debug.c \ zfs_file_os.c \ zfs_fm.c \ zfs_fuid.c \ zfs_ioctl.c \ + zfs_log.c \ zfs_onexit.c \ zfs_quota.c \ zfs_ratelimit.c \ + zfs_replay.c \ zfs_rlock.c \ zfs_sa.c \ + zfs_vnops.c \ zil.c \ zio.c \ zio_checksum.c \ @@ -323,7 +331,7 @@ CFLAGS.spl_vm.c= -Wno-cast-qual CFLAGS.spl_zlib.c= -Wno-cast-qual CFLAGS.abd.c= -Wno-cast-qual CFLAGS.zfs_log.c= -Wno-cast-qual -CFLAGS.zfs_vnops.c= -Wno-pointer-arith +CFLAGS.zfs_vnops_os.c= -Wno-pointer-arith CFLAGS.u8_textprep.c= -Wno-cast-qual CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.zfs_fletcher_intel.c= -Wno-cast-qual -Wno-pointer-arith @@ -341,6 +349,7 @@ CFLAGS.lz4.c= -Wno-cast-qual CFLAGS.spa.c= -Wno-cast-qual CFLAGS.spa_misc.c= -Wno-cast-qual CFLAGS.sysctl_os.c= -include ../zfs_config.h +CFLAGS.vdev_draid.c= -Wno-cast-qual CFLAGS.vdev_raidz.c= -Wno-cast-qual CFLAGS.vdev_raidz_math.c= -Wno-cast-qual CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c index 5553c55e11cd..23686c59e8ce 100644 --- a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c +++ 
b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c @@ -59,10 +59,12 @@ boolean_t gcm_avx_can_use_movbe = B_FALSE; static boolean_t gcm_use_avx = B_FALSE; #define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) +extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); + static inline boolean_t gcm_avx_will_work(void); static inline void gcm_set_avx(boolean_t); static inline boolean_t gcm_toggle_avx(void); -extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); +static inline size_t gcm_simd_get_htab_size(boolean_t); static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, crypto_data_t *, size_t); @@ -629,6 +631,21 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, (volatile boolean_t *)&gcm_avx_can_use_movbe); } } + /* Allocate Htab memory as needed. */ + if (gcm_ctx->gcm_use_avx == B_TRUE) { + size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); + + if (htab_len == 0) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + gcm_ctx->gcm_htab_len = htab_len; + gcm_ctx->gcm_Htable = + (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag); + + if (gcm_ctx->gcm_Htable == NULL) { + return (CRYPTO_HOST_MEMORY); + } + } /* Avx and non avx context initialization differs from here on. */ if (gcm_ctx->gcm_use_avx == B_FALSE) { #endif /* ifdef CAN_USE_GCM_ASM */ @@ -689,6 +706,22 @@ gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, if (ks->ops->needs_byteswap == B_TRUE) { gcm_ctx->gcm_use_avx = B_FALSE; } + /* Allocate Htab memory as needed. */ + if (gcm_ctx->gcm_use_avx == B_TRUE) { + size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); + + if (htab_len == 0) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + gcm_ctx->gcm_htab_len = htab_len; + gcm_ctx->gcm_Htable = + (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag); + + if (gcm_ctx->gcm_Htable == NULL) { + return (CRYPTO_HOST_MEMORY); + } + } + /* Avx and non avx context initialization differs from here on. */ if (gcm_ctx->gcm_use_avx == B_FALSE) { #endif /* ifdef CAN_USE_GCM_ASM */ @@ -1018,7 +1051,7 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); /* Clear the FPU registers since they hold sensitive internal state. */ #define clear_fpu_regs() clear_fpu_regs_avx() #define GHASH_AVX(ctx, in, len) \ - gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t (*)[2])(ctx)->gcm_Htable, \ + gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \ in, len) #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) @@ -1036,8 +1069,8 @@ extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst); extern void aes_encrypt_intel(const uint32_t rk[], int nr, const uint32_t pt[4], uint32_t ct[4]); -extern void gcm_init_htab_avx(uint64_t Htable[16][2], const uint64_t H[2]); -extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t Htable[16][2], +extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]); +extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable, const uint8_t *in, size_t len); extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, @@ -1073,6 +1106,18 @@ gcm_toggle_avx(void) } } +static inline size_t +gcm_simd_get_htab_size(boolean_t simd_mode) +{ + switch (simd_mode) { + case B_TRUE: + return (2 * 6 * 2 * sizeof (uint64_t)); + + default: + return (0); + } +} + /* * Clear sensitive data in the context. 
* @@ -1088,7 +1133,6 @@ gcm_clear_ctx(gcm_ctx_t *ctx) { bzero(ctx->gcm_remainder, sizeof (ctx->gcm_remainder)); bzero(ctx->gcm_H, sizeof (ctx->gcm_H)); - bzero(ctx->gcm_Htable, sizeof (ctx->gcm_Htable)); bzero(ctx->gcm_J0, sizeof (ctx->gcm_J0)); bzero(ctx->gcm_tmp, sizeof (ctx->gcm_tmp)); } diff --git a/sys/contrib/openzfs/module/icp/algs/modes/modes.c b/sys/contrib/openzfs/module/icp/algs/modes/modes.c index f07876a478e2..faae9722bd04 100644 --- a/sys/contrib/openzfs/module/icp/algs/modes/modes.c +++ b/sys/contrib/openzfs/module/icp/algs/modes/modes.c @@ -152,6 +152,14 @@ crypto_free_mode_ctx(void *ctx) vmem_free(((gcm_ctx_t *)ctx)->gcm_pt_buf, ((gcm_ctx_t *)ctx)->gcm_pt_buf_len); +#ifdef CAN_USE_GCM_ASM + if (((gcm_ctx_t *)ctx)->gcm_Htable != NULL) { + gcm_ctx_t *gcm_ctx = (gcm_ctx_t *)ctx; + bzero(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len); + kmem_free(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len); + } +#endif + kmem_free(ctx, sizeof (gcm_ctx_t)); } } diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S index ed9f660fce5b..dc71ae2c1c89 100644 --- a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -55,6 +55,7 @@ .type _aesni_ctr32_ghash_6x,@function .align 32 _aesni_ctr32_ghash_6x: +.cfi_startproc vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 @@ -363,12 +364,14 @@ _aesni_ctr32_ghash_6x: vpxor %xmm4,%xmm8,%xmm8 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x #endif /* ifdef HAVE_MOVBE */ .type _aesni_ctr32_ghash_no_movbe_6x,@function .align 32 _aesni_ctr32_ghash_no_movbe_6x: +.cfi_startproc vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 @@ -689,6 +692,7 @@ _aesni_ctr32_ghash_no_movbe_6x: vpxor %xmm4,%xmm8,%xmm8 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x .globl aesni_gcm_decrypt @@ -714,6 +718,8 @@ aesni_gcm_decrypt: .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 + pushq %r9 +.cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 @@ -726,7 +732,8 @@ aesni_gcm_decrypt: andq $-128,%rsp vmovdqu (%r11),%xmm0 leaq 128(%rcx),%rcx - leaq 32+32(%r9),%r9 + movq 32(%r9),%r9 + leaq 32(%r9),%r9 movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. vpshufb %xmm0,%xmm8,%xmm8 @@ -782,7 +789,9 @@ aesni_gcm_decrypt: vmovups %xmm14,-16(%rsi) vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) + movq -56(%rax),%r9 +.cfi_restore %r9 + vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 @@ -807,6 +816,7 @@ aesni_gcm_decrypt: .type _aesni_ctr32_6x,@function .align 32 _aesni_ctr32_6x: +.cfi_startproc vmovdqu 0-128(%rcx),%xmm4 vmovdqu 32(%r11),%xmm2 leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. 
@@ -893,6 +903,7 @@ _aesni_ctr32_6x: vpshufb %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 +.cfi_endproc .size _aesni_ctr32_6x,.-_aesni_ctr32_6x .globl aesni_gcm_encrypt @@ -918,6 +929,8 @@ aesni_gcm_encrypt: .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 + pushq %r9 +.cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 @@ -960,7 +973,8 @@ aesni_gcm_encrypt: call _aesni_ctr32_6x vmovdqu (%r9),%xmm8 - leaq 32+32(%r9),%r9 + movq 32(%r9),%r9 + leaq 32(%r9),%r9 subq $12,%rdx movq $192,%r10 vpshufb %xmm0,%xmm8,%xmm8 @@ -1151,7 +1165,9 @@ aesni_gcm_encrypt: vpxor %xmm7,%xmm2,%xmm2 vpxor %xmm2,%xmm8,%xmm8 vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) + movq -56(%rax),%r9 +.cfi_restore %r9 + vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 diff --git a/sys/contrib/openzfs/module/icp/core/kcf_sched.c b/sys/contrib/openzfs/module/icp/core/kcf_sched.c index 40d50553d67e..81fd15f8ea26 100644 --- a/sys/contrib/openzfs/module/icp/core/kcf_sched.c +++ b/sys/contrib/openzfs/module/icp/core/kcf_sched.c @@ -1308,9 +1308,7 @@ kcf_reqid_insert(kcf_areq_node_t *areq) kcf_areq_node_t *headp; kcf_reqid_table_t *rt; - kpreempt_disable(); - rt = kcf_reqid_table[CPU_SEQID & REQID_TABLE_MASK]; - kpreempt_enable(); + rt = kcf_reqid_table[CPU_SEQID_UNSTABLE & REQID_TABLE_MASK]; mutex_enter(&rt->rt_lock); diff --git a/sys/contrib/openzfs/module/icp/include/modes/modes.h b/sys/contrib/openzfs/module/icp/include/modes/modes.h index 57a211ccf1bf..ab71197542eb 100644 --- a/sys/contrib/openzfs/module/icp/include/modes/modes.h +++ b/sys/contrib/openzfs/module/icp/include/modes/modes.h @@ -219,14 +219,14 @@ typedef struct gcm_ctx { size_t gcm_pt_buf_len; uint32_t gcm_tmp[4]; /* - * The relative positions of gcm_ghash, gcm_H and pre-computed - * gcm_Htable are hard coded in aesni-gcm-x86_64.S and ghash-x86_64.S, - * so please don't change (or adjust accordingly). + * The offset of gcm_Htable relative to gcm_ghash, (32), is hard coded + * in aesni-gcm-x86_64.S, so please don't change (or adjust there). 
*/ uint64_t gcm_ghash[2]; uint64_t gcm_H[2]; #ifdef CAN_USE_GCM_ASM - uint64_t gcm_Htable[12][2]; + uint64_t *gcm_Htable; + size_t gcm_htab_len; #endif uint64_t gcm_J0[2]; uint64_t gcm_len_a_len_c[2]; diff --git a/sys/contrib/openzfs/module/icp/io/aes.c b/sys/contrib/openzfs/module/icp/io/aes.c index 96fb6bb1af30..e540af4473f7 100644 --- a/sys/contrib/openzfs/module/icp/io/aes.c +++ b/sys/contrib/openzfs/module/icp/io/aes.c @@ -1051,6 +1051,16 @@ out: bzero(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); } +#ifdef CAN_USE_GCM_ASM + if (aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE) && + ((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) { + + gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx; + + bzero(ctx->gcm_Htable, ctx->gcm_htab_len); + kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len); + } +#endif return (ret); } @@ -1209,6 +1219,14 @@ out: vmem_free(((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf, ((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf_len); } +#ifdef CAN_USE_GCM_ASM + if (((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) { + gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx; + + bzero(ctx->gcm_Htable, ctx->gcm_htab_len); + kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len); + } +#endif } return (ret); diff --git a/sys/contrib/openzfs/module/lua/lapi.c b/sys/contrib/openzfs/module/lua/lapi.c index 8f072531fde5..6a845c461052 100644 --- a/sys/contrib/openzfs/module/lua/lapi.c +++ b/sys/contrib/openzfs/module/lua/lapi.c @@ -1300,7 +1300,7 @@ module_exit(lua_fini); ZFS_MODULE_DESCRIPTION("Lua Interpreter for ZFS"); ZFS_MODULE_AUTHOR("Lua.org"); -ZFS_MODULE_LICENSE("MIT"); +ZFS_MODULE_LICENSE("Dual MIT/GPL"); ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); EXPORT_SYMBOL(lua_absindex); diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c index 5cd5c69efa71..5ecd3d310361 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include <sys/jail.h> #include <sys/policy.h> #include <sys/zfs_vfsops.h> +#include <sys/zfs_znode.h> int @@ -312,11 +313,11 @@ secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid) } int -secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr, +secpolicy_vnode_setid_retain(znode_t *zp, cred_t *cr, boolean_t issuidroot __unused) { - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + if (secpolicy_fs_owner(ZTOV(zp)->v_mount, cr) == 0) return (0); return (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c index a7bda509bf54..0a323e8856a3 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c @@ -106,13 +106,13 @@ abd_free_chunk(void *c) kmem_cache_free(abd_chunk_cache, c); } -static size_t +static uint_t abd_chunkcnt_for_bytes(size_t size) { return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); } -static inline size_t +static inline uint_t abd_scatter_chunkcnt(abd_t *abd) { ASSERT(!abd_is_linear(abd)); @@ -129,7 +129,7 @@ abd_size_alloc_linear(size_t size) void abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) { - size_t n = abd_scatter_chunkcnt(abd); + uint_t n = abd_scatter_chunkcnt(abd); ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); int waste = n * zfs_abd_chunk_size - abd->abd_size; if (op == ABDSTAT_INCR) { @@ -161,25 +161,28 @@ abd_update_linear_stats(abd_t *abd, 
abd_stats_op_t op) void abd_verify_scatter(abd_t *abd) { + uint_t i, n; + /* * There is no scatter linear pages in FreeBSD so there is an * if an error if the ABD has been marked as a linear page. */ - VERIFY(!abd_is_linear_page(abd)); + ASSERT(!abd_is_linear_page(abd)); ASSERT3U(ABD_SCATTER(abd).abd_offset, <, zfs_abd_chunk_size); - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - ASSERT3P( - ABD_SCATTER(abd).abd_chunks[i], !=, NULL); + n = abd_scatter_chunkcnt(abd); + for (i = 0; i < n; i++) { + ASSERT3P(ABD_SCATTER(abd).abd_chunks[i], !=, NULL); } } void abd_alloc_chunks(abd_t *abd, size_t size) { - size_t n = abd_chunkcnt_for_bytes(size); - for (int i = 0; i < n; i++) { + uint_t i, n; + + n = abd_chunkcnt_for_bytes(size); + for (i = 0; i < n; i++) { void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); ASSERT3P(c, !=, NULL); ABD_SCATTER(abd).abd_chunks[i] = c; @@ -190,8 +193,10 @@ abd_alloc_chunks(abd_t *abd, size_t size) void abd_free_chunks(abd_t *abd) { - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { + uint_t i, n; + + n = abd_scatter_chunkcnt(abd); + for (i = 0; i < n; i++) { abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]); } } @@ -199,7 +204,7 @@ abd_free_chunks(abd_t *abd) abd_t * abd_alloc_struct(size_t size) { - size_t chunkcnt = abd_chunkcnt_for_bytes(size); + uint_t chunkcnt = abd_chunkcnt_for_bytes(size); /* * In the event we are allocating a gang ABD, the size passed in * will be 0. We must make sure to set abd_size to the size of an @@ -221,9 +226,9 @@ abd_alloc_struct(size_t size) void abd_free_struct(abd_t *abd) { - size_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 : + uint_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 : abd_scatter_chunkcnt(abd); - int size = MAX(sizeof (abd_t), + ssize_t size = MAX(sizeof (abd_t), offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt])); mutex_destroy(&abd->abd_mtx); ASSERT(!list_link_active(&abd->abd_gang_link)); @@ -238,7 +243,9 @@ abd_free_struct(abd_t *abd) static void abd_alloc_zero_scatter(void) { - size_t n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); + uint_t i, n; + + n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP); abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); @@ -251,7 +258,7 @@ abd_alloc_zero_scatter(void) ABD_SCATTER(abd_zero_scatter).abd_chunk_size = zfs_abd_chunk_size; - for (int i = 0; i < n; i++) { + for (i = 0; i < n; i++) { ABD_SCATTER(abd_zero_scatter).abd_chunks[i] = abd_zero_buf; } @@ -356,7 +363,7 @@ abd_get_offset_scatter(abd_t *sabd, size_t off) ASSERT3U(off, <=, sabd->abd_size); size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; - size_t chunkcnt = abd_scatter_chunkcnt(sabd) - + uint_t chunkcnt = abd_scatter_chunkcnt(sabd) - (new_offset / zfs_abd_chunk_size); abd = abd_alloc_scatter_offset_chunkcnt(chunkcnt); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c index 94df750035a4..4fc7468bfa47 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c @@ -243,3 +243,13 @@ arc_lowmem_fini(void) if (arc_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); } + +void +arc_register_hotplug(void) +{ +} + +void +arc_unregister_hotplug(void) +{ +} diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index 1b37ce0d7f6b..647c1463ba14 100644 --- 
a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -114,6 +114,7 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vnops, CTLFLAG_RW, 0, "ZFS VNOPS"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); @@ -228,15 +229,14 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { - uint32_t val; - int err; + int err, val; val = arc_no_grow_shift; - err = sysctl_handle_32(oidp, &val, 0, req); + err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); - if (val >= arc_shrink_shift) + if (val < 0 || val >= arc_shrink_shift) return (EINVAL); arc_no_grow_shift = val; @@ -244,8 +244,8 @@ sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) } SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, - CTLTYPE_U32 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, sizeof (uint32_t), - sysctl_vfs_zfs_arc_no_grow_shift, "U", + CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, sizeof (int), + sysctl_vfs_zfs_arc_no_grow_shift, "I", "log2(fraction of ARC which must be free to allow growing)"); int diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c index cf762c5fd61c..825bd706e0c0 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c @@ -292,19 +292,28 @@ vdev_file_io_done(zio_t *zio) } vdev_ops_t vdev_file_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_FILE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; /* @@ -313,19 +322,28 @@ vdev_ops_t vdev_file_ops = { #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = 
NULL, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; #endif diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c index f042eff7cd2e..c9e8e21982cf 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c @@ -1141,7 +1141,6 @@ sendreq: break; case ZIO_TYPE_IOCTL: bp->bio_cmd = BIO_FLUSH; - bp->bio_flags |= BIO_ORDERED; bp->bio_data = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; @@ -1190,17 +1189,26 @@ vdev_geom_rele(vdev_t *vd) } vdev_ops_t vdev_disk_ops = { - vdev_geom_open, - vdev_geom_close, - vdev_default_asize, - vdev_geom_io_start, - vdev_geom_io_done, - NULL, - NULL, - vdev_geom_hold, - vdev_geom_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_geom_open, + .vdev_op_close = vdev_geom_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_geom_io_start, + .vdev_op_io_done = vdev_geom_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_geom_hold, + .vdev_op_rele = vdev_geom_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c index d7786d5136a2..8fb259f4ba76 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c @@ -158,7 +158,8 @@ zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp, rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td); if (rc) return (SET_ERROR(rc)); - *resid = auio.uio_resid; + if (resid) + *resid = auio.uio_resid; *offp += count - auio.uio_resid; return (SET_ERROR(0)); } @@ -296,7 +297,8 @@ zfs_file_unlink(const char *fnamep) rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0); #else #ifdef AT_BENEATH - rc = kern_unlinkat(curthread, AT_FDCWD, fnamep, seg, 0, 0); + rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), + seg, 0, 0); #else rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), seg, 0); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c index 8b22f2fdc3b3..e69de29bb2d1 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c @@ -1,70 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and 
Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/errno.h> -#include <sys/kmem.h> -#include <sys/sunddi.h> -#include <sys/zfs_ioctl.h> -#include <sys/zfs_onexit.h> - -static int -zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) -{ - *zo = zfsdev_get_state(minor, ZST_ONEXIT); - if (*zo == NULL) - return (SET_ERROR(EBADF)); - - return (0); -} - -int -zfs_onexit_fd_hold(int fd, minor_t *minorp) -{ - file_t *fp, *tmpfp; - zfs_onexit_t *zo; - void *data; - int error; - - if ((error = zfs_file_get(fd, &fp))) - return (error); - - tmpfp = curthread->td_fpop; - curthread->td_fpop = fp; - error = devfs_get_cdevpriv(&data); - if (error == 0) - *minorp = (minor_t)(uintptr_t)data; - curthread->td_fpop = tmpfp; - if (error != 0) - return (SET_ERROR(EBADF)); - return (zfs_onexit_minor_to_state(*minorp, &zo)); -} - -void -zfs_onexit_fd_rele(int fd) -{ - zfs_file_put(fd); -} diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c index 54ebfa7532dd..7bc6b83d0272 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c @@ -42,6 +42,7 @@ #include <sys/mount.h> #include <sys/cmn_err.h> #include <sys/zfs_znode.h> +#include <sys/zfs_vnops.h> #include <sys/zfs_dir.h> #include <sys/zil.h> #include <sys/fs/zfs.h> @@ -433,7 +434,7 @@ zfs_sync(vfs_t *vfsp, int waitfor) } else { /* * Sync all ZFS filesystems. This is what happens when you - * run sync(1M). Unlike other filesystems, ZFS honors the + * run sync(8). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c index 3c3285f93389..2e8eadb5e16e 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c @@ -29,6 +29,7 @@ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ + #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -270,69 +271,13 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) return (0); } -/* - * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and - * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 
- */ -static int -zfs_holey(vnode_t *vp, ulong_t cmd, offset_t *off) -{ - znode_t *zp = VTOZ(vp); - uint64_t noff = (uint64_t)*off; /* new offset */ - uint64_t file_sz; - int error; - boolean_t hole; - - file_sz = zp->z_size; - if (noff >= file_sz) { - return (SET_ERROR(ENXIO)); - } - - if (cmd == _FIO_SEEK_HOLE) - hole = B_TRUE; - else - hole = B_FALSE; - - error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); - - if (error == ESRCH) - return (SET_ERROR(ENXIO)); - - /* file was dirty, so fall back to using generic logic */ - if (error == EBUSY) { - if (hole) - *off = file_sz; - - return (0); - } - - /* - * We could find a hole that begins after the logical end-of-file, - * because dmu_offset_next() only works on whole blocks. If the - * EOF falls mid-block, then indicate that the "virtual hole" - * at the end of the file begins at the logical EOF, rather than - * at the end of the last block. - */ - if (noff > file_sz) { - ASSERT(hole); - noff = file_sz; - } - - if (noff < *off) - return (error); - *off = noff; - return (error); -} - /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, int *rvalp) { - offset_t off; + loff_t off; int error; - zfsvfs_t *zfsvfs; - znode_t *zp; switch (com) { case _FIOFFS: @@ -350,18 +295,12 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, return (0); } - case _FIO_SEEK_DATA: - case _FIO_SEEK_HOLE: + case F_SEEK_DATA: + case F_SEEK_HOLE: { off = *(offset_t *)data; - zp = VTOZ(vp); - zfsvfs = zp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - /* offset parameter is in/out */ - error = zfs_holey(vp, com, &off); - ZFS_EXIT(zfsvfs); + error = zfs_holey(VTOZ(vp), com, &off); if (error) return (error); *(offset_t *)data = off; @@ -525,16 +464,15 @@ page_unhold(vm_page_t pp) * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ -static void -update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, - int segflg, dmu_tx_t *tx) +void +update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { vm_object_t obj; struct sf_buf *sf; + vnode_t *vp = ZTOV(zp); caddr_t va; int off; - ASSERT(segflg != UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); @@ -552,8 +490,8 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, zfs_vmobject_wunlock_12(obj); va = zfs_map_page(pp, &sf); - (void) dmu_read(os, oid, start+off, nbytes, - va+off, DMU_READ_PREFETCH); + (void) dmu_read(os, zp->z_id, start + off, nbytes, + va + off, DMU_READ_PREFETCH); zfs_unmap_page(sf); zfs_vmobject_wlock_12(obj); @@ -579,10 +517,10 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, * map them into contiguous KVA region and populate them * in one single dmu_read() call. */ -static int -mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) +int +mappedread_sf(znode_t *zp, int nbytes, uio_t *uio) { - znode_t *zp = VTOZ(vp); + vnode_t *vp = ZTOV(zp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; @@ -664,10 +602,10 @@ mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. 
*/ -static int -mappedread(vnode_t *vp, int nbytes, uio_t *uio) +int +mappedread(znode_t *zp, int nbytes, uio_t *uio) { - znode_t *zp = VTOZ(vp); + vnode_t *vp = ZTOV(zp); vm_object_t obj; int64_t start; int len = nbytes; @@ -710,523 +648,6 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio) return (error); } -offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ - -/* - * Read bytes from specified file into supplied buffer. - * - * IN: vp - vnode of file to be read from. - * uio - structure supplying read location, range info, - * and return buffer. - * ioflag - SYNC flags; used to provide FRSYNC semantics. - * cr - credentials of caller. - * ct - caller context - * - * OUT: uio - updated offset and range, buffer filled. - * - * RETURN: 0 on success, error code on failure. - * - * Side Effects: - * vp - atime updated if byte count > 0 - */ -/* ARGSUSED */ -static int -zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ssize_t n, nbytes, start_resid; - int error = 0; - int64_t nread; - zfs_locked_range_t *lr; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* We don't copy out anything useful for directories. */ - if (vp->v_type == VDIR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EISDIR)); - } - - if (zp->z_pflags & ZFS_AV_QUARANTINED) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - - /* - * Validate file offset - */ - if (uio->uio_loffset < (offset_t)0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Fasttrack empty reads - */ - if (uio->uio_resid == 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - - /* - * If we're in FRSYNC mode, sync out this znode before reading it. - */ - if (zfsvfs->z_log && - (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) - zil_commit(zfsvfs->z_log, zp->z_id); - - /* - * Lock the range against changes. - */ - lr = zfs_rangelock_enter(&zp->z_rangelock, uio->uio_loffset, - uio->uio_resid, RL_READER); - - /* - * If we are reading past end-of-file we can skip - * to the end; but we might still need to set atime. - */ - if (uio->uio_loffset >= zp->z_size) { - error = 0; - goto out; - } - - ASSERT(uio->uio_loffset < zp->z_size); - n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); - start_resid = n; - - while (n > 0) { - nbytes = MIN(n, zfs_read_chunk_size - - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); - - if (uio->uio_segflg == UIO_NOCOPY) - error = mappedread_sf(vp, nbytes, uio); - else if (vn_has_cached_data(vp)) { - error = mappedread(vp, nbytes, uio); - } else { - error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes); - } - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; - } - - n -= nbytes; - } - - nread = start_resid - n; - dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); - -out: - zfs_rangelock_exit(lr); - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Write the bytes to a file. - * - * IN: vp - vnode of file to be written to. - * uio - structure supplying write location, range info, - * and data buffer. - * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is - * set if in append mode. - * cr - credentials of caller. - * ct - caller context (NFS/CIFS fem monitor only) - * - * OUT: uio - updated offset and range. - * - * RETURN: 0 on success, error code on failure. 
- * - * Timestamps: - * vp - ctime|mtime updated if byte count > 0 - */ - -/* ARGSUSED */ -static int -zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - rlim64_t limit = MAXOFFSET_T; - ssize_t start_resid = uio->uio_resid; - ssize_t tx_bytes; - uint64_t end_size; - dmu_buf_impl_t *db; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog; - offset_t woff; - ssize_t n, nbytes; - zfs_locked_range_t *lr; - int max_blksz = zfsvfs->z_max_blksz; - int error = 0; - arc_buf_t *abuf; - iovec_t *aiov = NULL; - xuio_t *xuio = NULL; - int i_iov = 0; - int iovcnt __unused = uio->uio_iovcnt; - iovec_t *iovp = uio->uio_iov; - int write_eof; - int count = 0; - sa_bulk_attr_t bulk[4]; - uint64_t mtime[2], ctime[2]; - uint64_t uid, gid, projid; - int64_t nwritten; - - /* - * Fasttrack empty write - */ - n = start_resid; - if (n == 0) - return (0); - - if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) - limit = MAXOFFSET_T; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - - /* - * Callers might not be able to detect properly that we are read-only, - * so check it explicitly here. - */ - if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - /* - * If immutable or not appending then return EPERM. - * Intentionally allow ZFS_READONLY through here. - * See zfs_zaccess_common() - */ - if ((zp->z_pflags & ZFS_IMMUTABLE) || - ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && - (uio->uio_loffset < zp->z_size))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - zilog = zfsvfs->z_log; - - /* - * Validate file offset - */ - woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; - if (woff < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * If in append mode, set the io offset pointer to eof. - */ - if (ioflag & FAPPEND) { - /* - * Obtain an appending range lock to guarantee file append - * semantics. We reset the write offset once we have the lock. - */ - lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); - woff = lr->lr_offset; - if (lr->lr_length == UINT64_MAX) { - /* - * We overlocked the file because this write will cause - * the file block size to increase. - * Note that zp_size cannot change with this lock held. - */ - woff = zp->z_size; - } - uio->uio_loffset = woff; - } else { - /* - * Note that if the file block size will change as a result of - * this write, then this range lock will lock the entire file - * so that we can re-write the block safely. - */ - lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); - } - - if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { - zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (EFBIG); - } - - if (woff >= limit) { - zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EFBIG)); - } - - if ((woff + n) > limit || woff > (limit - n)) - n = limit - woff; - - /* Will this write extend the file length? */ - write_eof = (woff + n > zp->z_size); - - end_size = MAX(zp->z_size, woff + n); - - uid = zp->z_uid; - gid = zp->z_gid; - projid = zp->z_projid; - - /* - * Write the file in reasonable size chunks. 
Each chunk is written - * in a separate transaction; this keeps the intent log records small - * and allows us to do more fine-grained space accounting. - */ - while (n > 0) { - woff = uio->uio_loffset; - - if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || - zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || - (projid != ZFS_DEFAULT_PROJID && - zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, - projid))) { - error = SET_ERROR(EDQUOT); - break; - } - - abuf = NULL; - if (xuio) { - ASSERT(i_iov < iovcnt); - aiov = &iovp[i_iov]; - abuf = dmu_xuio_arcbuf(xuio, i_iov); - dmu_xuio_clear(xuio, i_iov); - DTRACE_PROBE3(zfs_cp_write, int, i_iov, - iovec_t *, aiov, arc_buf_t *, abuf); - ASSERT((aiov->iov_base == abuf->b_data) || - ((char *)aiov->iov_base - (char *)abuf->b_data + - aiov->iov_len == arc_buf_size(abuf))); - i_iov++; - } else if (n >= max_blksz && - woff >= zp->z_size && - P2PHASE(woff, max_blksz) == 0 && - zp->z_blksz == max_blksz) { - /* - * This write covers a full block. "Borrow" a buffer - * from the dmu so that we can fill it before we enter - * a transaction. This avoids the possibility of - * holding up the transaction if the data copy hangs - * up on a pagefault (e.g., from an NFS server mapping). - */ - size_t cbytes; - - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - max_blksz); - ASSERT(abuf != NULL); - ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = uiocopy(abuf->b_data, max_blksz, - UIO_WRITE, uio, &cbytes))) { - dmu_return_arcbuf(abuf); - break; - } - ASSERT(cbytes == max_blksz); - } - - /* - * Start a transaction. - */ - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); - DB_DNODE_ENTER(db); - dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, - MIN(n, max_blksz)); - DB_DNODE_EXIT(db); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - if (abuf != NULL) - dmu_return_arcbuf(abuf); - break; - } - - /* - * If zfs_range_lock() over-locked we grow the blocksize - * and then reduce the lock range. This will only happen - * on the first iteration since zfs_range_reduce() will - * shrink down r_len to the appropriate size. - */ - if (lr->lr_length == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, - 1 << highbit64(zp->z_blksz)); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); - zfs_rangelock_reduce(lr, woff, n); - } - - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? - */ - nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - - if (woff + nbytes > zp->z_size) - vnode_pager_setsize(vp, woff + nbytes); - - if (abuf == NULL) { - tx_bytes = uio->uio_resid; - error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes, tx); - tx_bytes -= uio->uio_resid; - } else { - tx_bytes = nbytes; - ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); - /* - * If this is not a full block write, but we are - * extending the file past EOF and this data starts - * block-aligned, use assign_arcbuf(). Otherwise, - * write via dmu_write(). 
- */ - if (tx_bytes < max_blksz && (!write_eof || - aiov->iov_base != abuf->b_data)) { - ASSERT(xuio); - dmu_write(zfsvfs->z_os, zp->z_id, woff, - aiov->iov_len, aiov->iov_base, tx); - dmu_return_arcbuf(abuf); - xuio_stat_wbuf_copied(); - } else { - ASSERT(xuio || tx_bytes == max_blksz); - dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, - abuf, tx); - } - ASSERT(tx_bytes <= uio->uio_resid); - uioskip(uio, tx_bytes); - } - if (tx_bytes && vn_has_cached_data(vp)) { - update_pages(vp, woff, tx_bytes, zfsvfs->z_os, - zp->z_id, uio->uio_segflg, tx); - } - - /* - * If we made no progress, we're done. If we made even - * partial progress, update the znode and ZIL accordingly. - */ - if (tx_bytes == 0) { - (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), - (void *)&zp->z_size, sizeof (uint64_t), tx); - dmu_tx_commit(tx); - ASSERT(error != 0); - break; - } - - /* - * Clear Set-UID/Set-GID bits on successful write if not - * privileged and at least one of the execute bits is set. - * - * It would be nice to to this after all writes have - * been done, but that would still expose the ISUID/ISGID - * to another app after the partial write is committed. - * - * Note: we don't call zfs_fuid_map_id() here because - * user 0 is not an ephemeral uid. - */ - mutex_enter(&zp->z_acl_lock); - if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | - (S_IXUSR >> 6))) != 0 && - (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(vp, cr, - (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { - uint64_t newmode; - zp->z_mode &= ~(S_ISUID | S_ISGID); - newmode = zp->z_mode; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), - (void *)&newmode, sizeof (uint64_t), tx); - } - mutex_exit(&zp->z_acl_lock); - - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); - - /* - * Update the file size (zp_size) if it has changed; - * account for possible concurrent updates. - */ - while ((end_size = zp->z_size) < uio->uio_loffset) { - (void) atomic_cas_64(&zp->z_size, end_size, - uio->uio_loffset); - ASSERT(error == 0 || error == EFAULT); - } - /* - * If we are replaying and eof is non zero then force - * the file size to the specified eof. Note, there's no - * concurrency during replay. - */ - if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) - zp->z_size = zfsvfs->z_replay_eof; - - if (error == 0) - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - else - (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - - zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, - ioflag, NULL, NULL); - dmu_tx_commit(tx); - - if (error != 0) - break; - ASSERT(tx_bytes == nbytes); - n -= nbytes; - - } - - zfs_rangelock_exit(lr); - - /* - * If we're in replay mode, or we made no progress, return error. - * Otherwise, it's at least a partial write, so it's successful. - */ - if (zfsvfs->z_replay || uio->uio_resid == start_resid) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * EFAULT means that at least one page of the source buffer was not - * available. VFS will re-try remaining I/O upon this error. 
- */ - if (error == EFAULT) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (ioflag & (FSYNC | FDSYNC) || - zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, zp->z_id); - - nwritten = start_resid - uio->uio_resid; - dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); - - ZFS_EXIT(zfsvfs); - return (0); -} - int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *presid) @@ -1249,184 +670,13 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len, return (error); } -static void -zfs_get_done(zgd_t *zgd, int error) +void +zfs_zrele_async(znode_t *zp) { - znode_t *zp = zgd->zgd_private; - objset_t *os = zp->z_zfsvfs->z_os; - - if (zgd->zgd_db) - dmu_buf_rele(zgd->zgd_db, zgd); + vnode_t *vp = ZTOV(zp); + objset_t *os = ITOZSB(vp)->z_os; - zfs_rangelock_exit(zgd->zgd_lr); - - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os))); - - kmem_free(zgd, sizeof (zgd_t)); -} - -#ifdef ZFS_DEBUG -static int zil_fault_io = 0; -#endif - -/* - * Get data to generate a TX_WRITE intent log record. - */ -int -zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) -{ - zfsvfs_t *zfsvfs = arg; - objset_t *os = zfsvfs->z_os; - znode_t *zp; - uint64_t object = lr->lr_foid; - uint64_t offset = lr->lr_offset; - uint64_t size = lr->lr_length; - dmu_buf_t *db; - zgd_t *zgd; - int error = 0; - - ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); - ASSERT3U(size, !=, 0); - - /* - * Nothing to do if the file has been removed - */ - if (zfs_zget(zfsvfs, object, &zp) != 0) - return (SET_ERROR(ENOENT)); - if (zp->z_unlinked) { - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - VN_RELE_ASYNC(ZTOV(zp), - dsl_pool_zrele_taskq(dmu_objset_pool(os))); - return (SET_ERROR(ENOENT)); - } - - zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_lwb = lwb; - zgd->zgd_private = zp; - - /* - * Write records come in two flavors: immediate and indirect. - * For small writes it's cheaper to store the data with the - * log record (immediate); for large writes it's cheaper to - * sync the data and get a pointer to it (indirect) so that - * we don't have to write the data twice. - */ - if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, - size, RL_READER); - /* test for truncation needs to be done while range locked */ - if (offset >= zp->z_size) { - error = SET_ERROR(ENOENT); - } else { - error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); - } - ASSERT(error == 0 || error == ENOENT); - } else { /* indirect write */ - /* - * Have to lock the whole block to ensure when it's - * written out and its checksum is being calculated - * that no one can change the data. We need to re-check - * blocksize after we get the lock in case it's changed! - */ - for (;;) { - uint64_t blkoff; - size = zp->z_blksz; - blkoff = ISP2(size) ? 
P2PHASE(offset, size) : offset; - offset -= blkoff; - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - if (zp->z_blksz == size) - break; - offset += blkoff; - zfs_rangelock_exit(zgd->zgd_lr); - } - /* test for truncation needs to be done while range locked */ - if (lr->lr_offset >= zp->z_size) - error = SET_ERROR(ENOENT); -#ifdef ZFS_DEBUG - if (zil_fault_io) { - error = SET_ERROR(EIO); - zil_fault_io = 0; - } -#endif - if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); - - if (error == 0) { - blkptr_t *bp = &lr->lr_blkptr; - - zgd->zgd_db = db; - zgd->zgd_bp = bp; - - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); - - error = dmu_sync(zio, lr->lr_common.lrc_txg, - zfs_get_done, zgd); - ASSERT(error || lr->lr_length <= size); - - /* - * On success, we need to wait for the write I/O - * initiated by dmu_sync() to complete before we can - * release this dbuf. We will finish everything up - * in the zfs_get_done() callback. - */ - if (error == 0) - return (0); - - if (error == EALREADY) { - lr->lr_common.lrc_txtype = TX_WRITE2; - /* - * TX_WRITE2 relies on the data previously - * written by the TX_WRITE that caused - * EALREADY. We zero out the BP because - * it is the old, currently-on-disk BP, - * so there's no need to zio_flush() its - * vdevs (flushing would needlesly hurt - * performance, and doesn't work on - * indirect vdevs). - */ - zgd->zgd_bp = NULL; - BP_ZERO(bp); - error = 0; - } - } - } - - zfs_get_done(zgd, error); - - return (error); -} - -/*ARGSUSED*/ -static int -zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (flag & V_ACE_MASK) - error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); - else - error = zfs_zaccess_rwx(zp, mode, flag, cr); - - ZFS_EXIT(zfsvfs); - return (error); + VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os))); } static int @@ -2708,27 +1958,6 @@ update: return (error); } -ulong_t zfs_fsync_sync_cnt = 4; - -static int -zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - - if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); - } - tsd_set(zfs_fsyncer_key, NULL); - return (0); -} - - /* * Get the requested file attributes and place them in the provided * vattr structure. @@ -3905,7 +3134,7 @@ zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) return (error); } -#if __FreeBSD_version < 1300110 +#if __FreeBSD_version < 1300124 static void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) @@ -4793,45 +4022,6 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, } } -/*ARGSUSED*/ -static int -zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - error = zfs_getacl(zp, vsecp, skipaclchk, cr); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/*ARGSUSED*/ -int -zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - zilog_t *zilog = zfsvfs->z_log; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - error = zfs_setacl(zp, vsecp, skipaclchk, cr); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - static int zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead) @@ -5225,7 +4415,7 @@ static int zfs_freebsd_read(struct vop_read_args *ap) { - return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), + return (zfs_read(VTOZ(ap->a_vp), ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred)); } @@ -5242,7 +4432,7 @@ static int zfs_freebsd_write(struct vop_write_args *ap) { - return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), + return (zfs_write(VTOZ(ap->a_vp), ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred)); } @@ -5301,7 +4491,7 @@ zfs_freebsd_access(struct vop_access_args *ap) */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) - error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); + error = zfs_access(zp, accmode, 0, ap->a_cred); /* * VADMIN has to be handled by vaccess(). @@ -5512,7 +4702,7 @@ zfs_freebsd_fsync(struct vop_fsync_args *ap) { vop_stdfsync(ap); - return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); + return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred)); } #ifndef _SYS_SYSPROTO_H_ @@ -5825,7 +5015,11 @@ zfs_freebsd_inactive(struct vop_inactive_args *ap) { vnode_t *vp = ap->a_vp; +#if __FreeBSD_version >= 1300123 zfs_inactive(vp, curthread->td_ucred, NULL); +#else + zfs_inactive(vp, ap->a_td->td_ucred, NULL); +#endif return (0); } @@ -6377,7 +5571,8 @@ zfs_freebsd_getacl(struct vop_getacl_args *ap) return (EINVAL); vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; - if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))) + if ((error = zfs_getsecattr(VTOZ(ap->a_vp), + &vsecattr, 0, ap->a_cred))) return (error); error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, @@ -6510,7 +5705,13 @@ zfs_vptocnp(struct vop_vptocnp_args *ap) error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); #endif if (error == 0) { - error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf, ap->a_buflen); +#if __FreeBSD_version >= 1300123 + error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf, + ap->a_buflen); +#else + error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, + ap->a_buf, ap->a_buflen); +#endif vput(covered_vp); } vn_lock(vp, ltype | LK_RETRY); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c index 40baa0b80928..6a21623c5f67 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c @@ -149,7 +149,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_acl_cached = NULL; zp->z_vnode = NULL; - zp->z_moved = 0; return (0); } @@ -278,7 +277,6 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) sharezp = zfs_znode_alloc_kmem(KM_SLEEP); ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); - sharezp->z_moved = 0; sharezp->z_unlinked = 0; sharezp->z_atime_dirty = 0; sharezp->z_zfsvfs = zfsvfs; @@ -437,7 +435,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, 
dmu_buf_t *db, int blksz, vp->v_data = zp; ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); - zp->z_moved = 0; zp->z_sa_hdl = NULL; zp->z_unlinked = 0; @@ -1692,7 +1689,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) rootzp = zfs_znode_alloc_kmem(KM_SLEEP); ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); - rootzp->z_moved = 0; rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; rootzp->z_is_sa = USE_SA(version, os); @@ -2015,6 +2011,20 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, return (error); } + +void +zfs_inode_update(znode_t *zp) +{ + vm_object_t object; + + if ((object = ZTOV(zp)->v_object) == NULL || + zp->z_size == object->un_pager.vnp.vnp_size) + return; + + vnode_pager_setsize(ZTOV(zp), zp->z_size); +} + + #ifdef _KERNEL int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c index fb88bc325d3c..fd2beee7bdd2 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c @@ -1071,6 +1071,16 @@ zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); /* + * This is necessary here as we check next whether + * OBJSET_FLAG_USERACCOUNTING_COMPLETE or + * OBJSET_FLAG_USEROBJACCOUNTING are set in order to + * decide if the local_mac should be zeroed out. + */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + + /* * The local MAC protects the user, group and project accounting. * If these objects are not present, the local MAC is zeroed out. */ @@ -1081,7 +1091,10 @@ zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, (datalen >= OBJSET_PHYS_SIZE_V2 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || - (datalen <= OBJSET_PHYS_SIZE_V1)) { + (datalen <= OBJSET_PHYS_SIZE_V1) || + (((intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE) == 0 || + (intval & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE) == 0) && + key->zk_version > 0)) { bzero(local_mac, ZIO_OBJSET_MAC_LEN); return (0); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 092eb34eaa47..6c44e3681709 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -116,7 +116,6 @@ enum zvol_geom_state { }; struct zvol_state_os { - int zso_volmode; #define zso_dev _zso_state._zso_dev #define zso_geom _zso_state._zso_geom union { @@ -134,6 +133,7 @@ struct zvol_state_os { enum zvol_geom_state zsg_state; } _zso_geom; } _zso_state; + int zso_dying; }; static uint32_t zvol_minors; @@ -209,7 +209,7 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) { zvol_state_t *zv; int err = 0; - boolean_t drop_suspend = B_TRUE; + boolean_t drop_suspend = B_FALSE; boolean_t drop_namespace = B_FALSE; if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { @@ -228,16 +228,15 @@ retry: rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = pp->private; if (zv == NULL) { - if (drop_namespace) - mutex_exit(&spa_namespace_lock); rw_exit(&zvol_state_lock); - return (SET_ERROR(ENXIO)); + err = SET_ERROR(ENXIO); + goto out_locked; } if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { /* * We need to guarantee that the namespace lock is held - * to avoid spurious failures in zvol_first_open + * to 
avoid spurious failures in zvol_first_open. */ drop_namespace = B_TRUE; if (!mutex_tryenter(&spa_namespace_lock)) { @@ -247,8 +246,12 @@ retry: } } mutex_enter(&zv->zv_state_lock); - - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + if (zv->zv_zso->zso_dying) { + rw_exit(&zvol_state_lock); + err = SET_ERROR(ENXIO); + goto out_zv_locked; + } + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); /* * make sure zvol is not suspended during first open @@ -256,6 +259,7 @@ retry: * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { + drop_suspend = B_TRUE; if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); @@ -266,8 +270,6 @@ retry: drop_suspend = B_FALSE; } } - } else { - drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); @@ -277,7 +279,7 @@ retry: ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); err = zvol_first_open(zv, !(flag & FWRITE)); if (err) - goto out_mutex; + goto out_zv_locked; pp->mediasize = zv->zv_volsize; pp->stripeoffset = 0; pp->stripesize = zv->zv_volblocksize; @@ -289,41 +291,37 @@ retry: */ if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) || dmu_objset_incompatible_encryption_version(zv->zv_objset))) { - err = EROFS; - goto out_open_count; + err = SET_ERROR(EROFS); + goto out_opened; } if (zv->zv_flags & ZVOL_EXCL) { - err = EBUSY; - goto out_open_count; + err = SET_ERROR(EBUSY); + goto out_opened; } #ifdef FEXCL if (flag & FEXCL) { if (zv->zv_open_count != 0) { - err = EBUSY; - goto out_open_count; + err = SET_ERROR(EBUSY); + goto out_opened; } zv->zv_flags |= ZVOL_EXCL; } #endif zv->zv_open_count += count; - if (drop_namespace) - mutex_exit(&spa_namespace_lock); - mutex_exit(&zv->zv_state_lock); - if (drop_suspend) - rw_exit(&zv->zv_suspend_lock); - return (0); - -out_open_count: - if (zv->zv_open_count == 0) +out_opened: + if (zv->zv_open_count == 0) { zvol_last_close(zv); -out_mutex: + wakeup(zv); + } +out_zv_locked: + mutex_exit(&zv->zv_state_lock); +out_locked: if (drop_namespace) mutex_exit(&spa_namespace_lock); - mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); - return (SET_ERROR(err)); + return (err); } /*ARGSUSED*/ @@ -332,6 +330,7 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) { zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; + int new_open_count; rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = pp->private; @@ -342,30 +341,32 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { - ASSERT(zv->zv_open_count == 1); + ASSERT3U(zv->zv_open_count, ==, 1); zv->zv_flags &= ~ZVOL_EXCL; } - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); /* * If the open count is zero, this is a spurious close. * That indicates a bug in the kernel / DDI framework. 
*/ - ASSERT(zv->zv_open_count > 0); + ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ - if ((zv->zv_open_count - count) == 0) { + new_open_count = zv->zv_open_count - count; + if (new_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ - if (zv->zv_open_count != 1) { + new_open_count = zv->zv_open_count - count; + if (new_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } @@ -380,11 +381,11 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) /* * You may get multiple opens, but only one close. */ - zv->zv_open_count -= count; - + zv->zv_open_count = new_open_count; if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); + wakeup(zv); } mutex_exit(&zv->zv_state_lock); @@ -400,7 +401,7 @@ zvol_geom_run(zvol_state_t *zv) struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); g_error_provider(pp, 0); @@ -414,7 +415,7 @@ zvol_geom_destroy(zvol_state_t *zv) struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); g_topology_assert(); @@ -422,10 +423,25 @@ zvol_geom_destroy(zvol_state_t *zv) VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING); mutex_exit(&zv->zv_state_lock); zsg->zsg_provider = NULL; - pp->private = NULL; g_wither_geom(pp->geom, ENXIO); } +void +zvol_wait_close(zvol_state_t *zv) +{ + + if (zv->zv_volmode != ZFS_VOLMODE_GEOM) + return; + mutex_enter(&zv->zv_state_lock); + zv->zv_zso->zso_dying = B_TRUE; + + if (zv->zv_open_count) + msleep(zv, &zv->zv_state_lock, + PRIBIO, "zvol:dying", 10*hz); + mutex_exit(&zv->zv_state_lock); +} + + static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) { @@ -483,7 +499,7 @@ zvol_geom_worker(void *arg) struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct bio *bp; - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); thread_lock(curthread); sched_prio(curthread, PRIBIO); @@ -512,9 +528,13 @@ static void zvol_geom_bio_start(struct bio *bp) { zvol_state_t *zv = bp->bio_to->private; - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct zvol_state_geom *zsg; boolean_t first; + if (zv == NULL) { + g_io_deliver(bp, ENXIO); + return; + } if (bp->bio_cmd == BIO_GETATTR) { if (zvol_geom_bio_getattr(bp)) g_io_deliver(bp, EOPNOTSUPP); @@ -522,6 +542,7 @@ zvol_geom_bio_start(struct bio *bp) } if (!THREAD_CAN_SLEEP()) { + zsg = &zv->zv_zso->zso_geom; mtx_lock(&zsg->zsg_queue_mtx); first = (bioq_first(&zsg->zsg_queue) == NULL); bioq_insert_tail(&zsg->zsg_queue, bp); @@ -540,7 +561,7 @@ zvol_geom_bio_getattr(struct bio *bp) zvol_state_t *zv; zv = bp->bio_to->private; - ASSERT(zv != NULL); + ASSERT3P(zv, !=, NULL); spa_t *spa = dmu_objset_spa(zv->zv_objset); uint64_t refd, avail, usedobjs, availobjs; @@ -613,7 +634,7 @@ zvol_geom_bio_strategy(struct bio *bp) goto sync; break; default: - error = EOPNOTSUPP; + error = SET_ERROR(EOPNOTSUPP); goto resume; } @@ -621,7 +642,7 @@ 
zvol_geom_bio_strategy(struct bio *bp) volsize = zv->zv_volsize; os = zv->zv_objset; - ASSERT(os != NULL); + ASSERT3P(os, !=, NULL); addr = bp->bio_data; resid = bp->bio_length; @@ -688,7 +709,7 @@ unlock: bp->bio_completed = bp->bio_length - resid; if (bp->bio_completed < bp->bio_length && off > volsize) - error = EINVAL; + error = SET_ERROR(EINVAL); switch (bp->bio_cmd) { case BIO_FLUSH: @@ -825,18 +846,33 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) zvol_state_t *zv; struct zvol_state_dev *zsd; int err = 0; - boolean_t drop_suspend = B_TRUE; + boolean_t drop_suspend = B_FALSE; + boolean_t drop_namespace = B_FALSE; +retry: rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = dev->si_drv2; if (zv == NULL) { rw_exit(&zvol_state_lock); - return (SET_ERROR(ENXIO)); + err = SET_ERROR(ENXIO); + goto out_locked; } + if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { + /* + * We need to guarantee that the namespace lock is held + * to avoid spurious failures in zvol_first_open. + */ + drop_namespace = B_TRUE; + if (!mutex_tryenter(&spa_namespace_lock)) { + rw_exit(&zvol_state_lock); + mutex_enter(&spa_namespace_lock); + goto retry; + } + } mutex_enter(&zv->zv_state_lock); - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV); + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); /* * make sure zvol is not suspended during first open @@ -844,6 +880,7 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { + drop_suspend = B_TRUE; if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); @@ -854,8 +891,6 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) drop_suspend = B_FALSE; } } - } else { - drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); @@ -865,21 +900,21 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); err = zvol_first_open(zv, !(flags & FWRITE)); if (err) - goto out_locked; + goto out_zv_locked; } if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { - err = EROFS; + err = SET_ERROR(EROFS); goto out_opened; } if (zv->zv_flags & ZVOL_EXCL) { - err = EBUSY; + err = SET_ERROR(EBUSY); goto out_opened; } #ifdef FEXCL if (flags & FEXCL) { if (zv->zv_open_count != 0) { - err = EBUSY; + err = SET_ERROR(EBUSY); goto out_opened; } zv->zv_flags |= ZVOL_EXCL; @@ -894,20 +929,19 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) (zv->zv_flags & ZVOL_WRITTEN_TO) != 0) zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); } - - mutex_exit(&zv->zv_state_lock); - if (drop_suspend) - rw_exit(&zv->zv_suspend_lock); - return (0); - out_opened: - if (zv->zv_open_count == 0) + if (zv->zv_open_count == 0) { zvol_last_close(zv); -out_locked: + wakeup(zv); + } +out_zv_locked: mutex_exit(&zv->zv_state_lock); +out_locked: + if (drop_namespace) + mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); - return (SET_ERROR(err)); + return (err); } static int @@ -926,17 +960,17 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { - ASSERT(zv->zv_open_count == 1); + ASSERT3U(zv->zv_open_count, ==, 1); zv->zv_flags &= ~ZVOL_EXCL; } - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV); + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); /* * If the open count is zero, this is a 
spurious close. * That indicates a bug in the kernel / DDI framework. */ - ASSERT(zv->zv_open_count > 0); + ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition @@ -972,6 +1006,7 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); + wakeup(zv); } mutex_exit(&zv->zv_state_lock); @@ -1022,7 +1057,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, length <= 0) { printf("%s: offset=%jd length=%jd\n", __func__, offset, length); - error = EINVAL; + error = SET_ERROR(EINVAL); break; } rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); @@ -1076,7 +1111,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, refd = metaslab_class_get_alloc(spa_normal_class(spa)); arg->value.off = refd / DEV_BSIZE; } else - error = ENOIOCTL; + error = SET_ERROR(ENOIOCTL); break; } case FIOSEEKHOLE: @@ -1092,7 +1127,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, break; } default: - error = ENOIOCTL; + error = SET_ERROR(ENOIOCTL); } return (error); @@ -1144,14 +1179,14 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname) hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); - if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; struct g_geom *gp; g_topology_lock(); gp = pp->geom; - ASSERT(gp != NULL); + ASSERT3P(gp, !=, NULL); zsg->zsg_provider = NULL; g_wither_provider(pp, ENXIO); @@ -1164,7 +1199,7 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname) zsg->zsg_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); - } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev; struct make_dev_args args; @@ -1206,26 +1241,30 @@ zvol_free(zvol_state_t *zv) { ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); - ASSERT(zv->zv_open_count == 0); + ASSERT0(zv->zv_open_count); ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); - if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp __maybe_unused = zsg->zsg_provider; + + ASSERT3P(pp->private, ==, NULL); g_topology_lock(); zvol_geom_destroy(zv); g_topology_unlock(); mtx_destroy(&zsg->zsg_queue_mtx); - } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; - if (dev != NULL) - destroy_dev(dev); + ASSERT3P(dev->si_drv2, ==, NULL); + + destroy_dev(dev); } mutex_destroy(&zv->zv_state_lock); @@ -1249,7 +1288,6 @@ zvol_create_minor_impl(const char *name) int error; ZFS_LOG(1, "Creating ZVOL %s...", name); - hash = zvol_name_hash(name); if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1258,10 +1296,11 @@ zvol_create_minor_impl(const char *name) } DROP_GIANT(); - /* lie and say we're read-only */ - error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); + doi = kmem_alloc(sizeof (dmu_object_info_t), 
KM_SLEEP); + /* lie and say we're read-only */ + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) goto out_doi; @@ -1275,8 +1314,10 @@ zvol_create_minor_impl(const char *name) error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL); - if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT) + if (error || volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; + error = 0; + /* * zvol_alloc equivalent ... */ @@ -1284,8 +1325,8 @@ zvol_create_minor_impl(const char *name) zv->zv_hash = hash; mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); - zv->zv_zso->zso_volmode = volmode; - if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + zv->zv_volmode = volmode; + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp; struct g_geom *gp; @@ -1298,7 +1339,6 @@ zvol_create_minor_impl(const char *name) gp->start = zvol_geom_bio_start; gp->access = zvol_geom_access; pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); - /* TODO: NULL check? */ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; pp->sectorsize = DEV_BSIZE; pp->mediasize = 0; @@ -1306,7 +1346,7 @@ zvol_create_minor_impl(const char *name) zsg->zsg_provider = pp; bioq_init(&zsg->zsg_queue); - } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev; struct make_dev_args args; @@ -1320,12 +1360,12 @@ zvol_create_minor_impl(const char *name) args.mda_mode = 0640; args.mda_si_drv2 = zv; error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name); - if (error != 0) { - mutex_destroy(&zv->zv_state_lock); + if (error) { kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); + mutex_destroy(&zv->zv_state_lock); kmem_free(zv, sizeof (*zv)); dmu_objset_disown(os, B_TRUE, FTAG); - goto out_giant; + goto out_doi; } dev->si_iosize_max = maxphys; zsd->zsd_cdev = dev; @@ -1350,15 +1390,14 @@ zvol_create_minor_impl(const char *name) ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); - /* XXX do prefetch */ + /* TODO: prefetch for geom tasting */ zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); - if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { - if (error == 0) - zvol_geom_run(zv); + if (error == 0 && volmode == ZFS_VOLMODE_GEOM) { + zvol_geom_run(zv); g_topology_unlock(); } out_doi: @@ -1368,9 +1407,8 @@ out_doi: zvol_insert(zv); zvol_minors++; rw_exit(&zvol_state_lock); + ZFS_LOG(1, "ZVOL %s created.", name); } - ZFS_LOG(1, "ZVOL %s created.", name); -out_giant: PICKUP_GIANT(); return (error); } @@ -1379,11 +1417,11 @@ static void zvol_clear_private(zvol_state_t *zv) { ASSERT(RW_LOCK_HELD(&zvol_state_lock)); - if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; - if (pp == NULL) /* XXX when? 
*/ + if (pp->private == NULL) /* already cleared */ return; mtx_lock(&zsg->zsg_queue_mtx); @@ -1391,11 +1429,15 @@ zvol_clear_private(zvol_state_t *zv) pp->private = NULL; wakeup_one(&zsg->zsg_queue); while (zsg->zsg_state != ZVOL_GEOM_RUNNING) - msleep(&zsg->zsg_state, - &zsg->zsg_queue_mtx, + msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx, 0, "zvol:w", 0); mtx_unlock(&zsg->zsg_queue_mtx); ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev = zsd->zsd_cdev; + + dev->si_drv2 = NULL; } } @@ -1403,15 +1445,17 @@ static int zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) { zv->zv_volsize = volsize; - if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; - if (pp == NULL) /* XXX when? */ - return (0); - g_topology_lock(); + if (pp->private == NULL) { + g_topology_unlock(); + return (SET_ERROR(ENXIO)); + } + /* * Do not invoke resize event when initial size was zero. * ZVOL initializes the size on first open, this is not diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c index fafadffc751c..e8d89bfeabe5 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c @@ -28,6 +28,9 @@ #include <sys/kmem.h> #include <sys/tsd.h> #include <sys/trace_spl.h> +#ifdef HAVE_CPU_HOTPLUG +#include <linux/cpuhotplug.h> +#endif int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); @@ -35,7 +38,7 @@ MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); int spl_taskq_thread_dynamic = 1; -module_param(spl_taskq_thread_dynamic, int, 0644); +module_param(spl_taskq_thread_dynamic, int, 0444); MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads"); int spl_taskq_thread_priority = 1; @@ -59,6 +62,11 @@ EXPORT_SYMBOL(system_delay_taskq); static taskq_t *dynamic_taskq; static taskq_thread_t *taskq_thread_create(taskq_t *); +#ifdef HAVE_CPU_HOTPLUG +/* Multi-callback id for cpu hotplugging. 
*/ +static int spl_taskq_cpuhp_state; +#endif + /* List of all taskqs */ LIST_HEAD(tq_list); struct rw_semaphore tq_list_sem; @@ -1024,13 +1032,14 @@ taskq_thread_create(taskq_t *tq) } taskq_t * -taskq_create(const char *name, int nthreads, pri_t pri, +taskq_create(const char *name, int threads_arg, pri_t pri, int minalloc, int maxalloc, uint_t flags) { taskq_t *tq; taskq_thread_t *tqt; int count = 0, rc = 0, i; unsigned long irqflags; + int nthreads = threads_arg; ASSERT(name != NULL); ASSERT(minalloc >= 0); @@ -1041,15 +1050,27 @@ taskq_create(const char *name, int nthreads, pri_t pri, if (flags & TASKQ_THREADS_CPU_PCT) { ASSERT(nthreads <= 100); ASSERT(nthreads >= 0); - nthreads = MIN(nthreads, 100); + nthreads = MIN(threads_arg, 100); nthreads = MAX(nthreads, 0); - nthreads = MAX((num_online_cpus() * nthreads) / 100, 1); + nthreads = MAX((num_online_cpus() * nthreads) /100, 1); } tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE); if (tq == NULL) return (NULL); + tq->tq_hp_support = B_FALSE; +#ifdef HAVE_CPU_HOTPLUG + if (flags & TASKQ_THREADS_CPU_PCT) { + tq->tq_hp_support = B_TRUE; + if (cpuhp_state_add_instance_nocalls(spl_taskq_cpuhp_state, + &tq->tq_hp_cb_node) != 0) { + kmem_free(tq, sizeof (*tq)); + return (NULL); + } + } +#endif + spin_lock_init(&tq->tq_lock); INIT_LIST_HEAD(&tq->tq_thread_list); INIT_LIST_HEAD(&tq->tq_active_list); @@ -1058,6 +1079,7 @@ taskq_create(const char *name, int nthreads, pri_t pri, tq->tq_nthreads = 0; tq->tq_nspawn = 0; tq->tq_maxthreads = nthreads; + tq->tq_cpu_pct = threads_arg; tq->tq_pri = pri; tq->tq_minalloc = minalloc; tq->tq_maxalloc = maxalloc; @@ -1131,6 +1153,12 @@ taskq_destroy(taskq_t *tq) tq->tq_flags &= ~TASKQ_ACTIVE; spin_unlock_irqrestore(&tq->tq_lock, flags); +#ifdef HAVE_CPU_HOTPLUG + if (tq->tq_hp_support) { + VERIFY0(cpuhp_state_remove_instance_nocalls( + spl_taskq_cpuhp_state, &tq->tq_hp_cb_node)); + } +#endif /* * When TASKQ_ACTIVE is clear new tasks may not be added nor may * new worker threads be spawned for dynamic taskq. @@ -1198,7 +1226,6 @@ taskq_destroy(taskq_t *tq) } EXPORT_SYMBOL(taskq_destroy); - static unsigned int spl_taskq_kick = 0; /* @@ -1255,12 +1282,96 @@ module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint, MODULE_PARM_DESC(spl_taskq_kick, "Write nonzero to kick stuck taskqs to spawn more threads"); +#ifdef HAVE_CPU_HOTPLUG +/* + * This callback will be called exactly once for each core that comes online, + * for each dynamic taskq. We attempt to expand taskqs that have + * TASKQ_THREADS_CPU_PCT set. We need to redo the percentage calculation every + * time, to correctly determine whether or not to add a thread. 
+ */ +static int +spl_taskq_expand(unsigned int cpu, struct hlist_node *node) +{ + taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); + unsigned long flags; + int err = 0; + + ASSERT(tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + int nthreads = MIN(tq->tq_cpu_pct, 100); + nthreads = MAX(((num_online_cpus() + 1) * nthreads) / 100, 1); + tq->tq_maxthreads = nthreads; + + if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && + tq->tq_maxthreads > tq->tq_nthreads) { + ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads + 1); + taskq_thread_t *tqt = taskq_thread_create(tq); + if (tqt == NULL) + err = -1; + } + +out: + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (err); +} + +/* + * While we don't support offlining CPUs, it is possible that CPUs will fail + * to online successfully. We do need to be able to handle this case + * gracefully. + */ +static int +spl_taskq_prepare_down(unsigned int cpu, struct hlist_node *node) +{ + taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); + unsigned long flags; + + ASSERT(tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + int nthreads = MIN(tq->tq_cpu_pct, 100); + nthreads = MAX(((num_online_cpus()) * nthreads) / 100, 1); + tq->tq_maxthreads = nthreads; + + if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && + tq->tq_maxthreads < tq->tq_nthreads) { + ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads - 1); + taskq_thread_t *tqt = list_entry(tq->tq_thread_list.next, + taskq_thread_t, tqt_thread_list); + struct task_struct *thread = tqt->tqt_thread; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + kthread_stop(thread); + + return (0); + } + +out: + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (0); +} +#endif + int spl_taskq_init(void) { init_rwsem(&tq_list_sem); tsd_create(&taskq_tsd, NULL); +#ifdef HAVE_CPU_HOTPLUG + spl_taskq_cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "fs/spl_taskq:online", spl_taskq_expand, spl_taskq_prepare_down); +#endif + system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_taskq == NULL) @@ -1269,6 +1380,9 @@ spl_taskq_init(void) system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_delay_taskq == NULL) { +#ifdef HAVE_CPU_HOTPLUG + cpuhp_remove_multi_state(spl_taskq_cpuhp_state); +#endif taskq_destroy(system_taskq); return (1); } @@ -1276,6 +1390,9 @@ spl_taskq_init(void) dynamic_taskq = taskq_create("spl_dynamic_taskq", 1, maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); if (dynamic_taskq == NULL) { +#ifdef HAVE_CPU_HOTPLUG + cpuhp_remove_multi_state(spl_taskq_cpuhp_state); +#endif taskq_destroy(system_taskq); taskq_destroy(system_delay_taskq); return (1); @@ -1304,4 +1421,9 @@ spl_taskq_fini(void) system_taskq = NULL; tsd_destroy(&taskq_tsd); + +#ifdef HAVE_CPU_HOTPLUG + cpuhp_remove_multi_state(spl_taskq_cpuhp_state); + spl_taskq_cpuhp_state = 0; +#endif } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in b/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in index 87414d6eacc5..75bec52c94e2 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in +++ 
b/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in @@ -23,8 +23,9 @@ $(MODULE)-objs += ../os/linux/zfs/zfs_dir.o $(MODULE)-objs += ../os/linux/zfs/zfs_file_os.o $(MODULE)-objs += ../os/linux/zfs/zfs_ioctl_os.o $(MODULE)-objs += ../os/linux/zfs/zfs_sysfs.o +$(MODULE)-objs += ../os/linux/zfs/zfs_uio.o $(MODULE)-objs += ../os/linux/zfs/zfs_vfsops.o -$(MODULE)-objs += ../os/linux/zfs/zfs_vnops.o +$(MODULE)-objs += ../os/linux/zfs/zfs_vnops_os.o $(MODULE)-objs += ../os/linux/zfs/zfs_znode.o $(MODULE)-objs += ../os/linux/zfs/zio_crypt.o $(MODULE)-objs += ../os/linux/zfs/zpl_ctldir.o diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index c2281449ed12..0abac228447f 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -178,7 +178,7 @@ static struct page *abd_zero_page = NULL; static kmem_cache_t *abd_cache = NULL; static kstat_t *abd_ksp; -static size_t +static uint_t abd_chunkcnt_for_bytes(size_t size) { return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c index 792c75d46ffe..83d4a3d8496c 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c @@ -48,6 +48,8 @@ #include <sys/vmsystm.h> #include <sys/zpl.h> #include <linux/page_compat.h> +#include <linux/notifier.h> +#include <linux/memory.h> #endif #include <sys/callb.h> #include <sys/kstat.h> @@ -73,6 +75,9 @@ */ int zfs_arc_shrinker_limit = 10000; +#ifdef CONFIG_MEMORY_HOTPLUG +static struct notifier_block arc_hotplug_callback_mem_nb; +#endif /* * Return a default max arc size based on the amount of physical memory. @@ -278,18 +283,9 @@ arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) return (0); } -void -arc_lowmem_init(void) +static void +arc_set_sys_free(uint64_t allmem) { - uint64_t allmem = arc_all_memory(); - - /* - * Register a shrinker to support synchronous (direct) memory - * reclaim from the arc. This is done to prevent kswapd from - * swapping out pages when it is preferable to shrink the arc. - */ - spl_register_shrinker(&arc_shrinker); - /* * The ARC tries to keep at least this much memory available for the * system. This gives the ARC time to shrink in response to memory @@ -343,6 +339,20 @@ arc_lowmem_init(void) } void +arc_lowmem_init(void) +{ + uint64_t allmem = arc_all_memory(); + + /* + * Register a shrinker to support synchronous (direct) memory + * reclaim from the arc. This is done to prevent kswapd from + * swapping out pages when it is preferable to shrink the arc. 
+ */ + spl_register_shrinker(&arc_shrinker); + arc_set_sys_free(allmem); +} + +void arc_lowmem_fini(void) { spl_unregister_shrinker(&arc_shrinker); @@ -375,6 +385,52 @@ param_set_arc_int(const char *buf, zfs_kernel_param_t *kp) return (0); } + +#ifdef CONFIG_MEMORY_HOTPLUG +/* ARGSUSED */ +static int +arc_hotplug_callback(struct notifier_block *self, unsigned long action, + void *arg) +{ + uint64_t allmem = arc_all_memory(); + if (action != MEM_ONLINE) + return (NOTIFY_OK); + + arc_set_limits(allmem); + +#ifdef __LP64__ + if (zfs_dirty_data_max_max == 0) + zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, + allmem * zfs_dirty_data_max_max_percent / 100); +#else + if (zfs_dirty_data_max_max == 0) + zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, + allmem * zfs_dirty_data_max_max_percent / 100); +#endif + + arc_set_sys_free(allmem); + return (NOTIFY_OK); +} +#endif + +void +arc_register_hotplug(void) +{ +#ifdef CONFIG_MEMORY_HOTPLUG + arc_hotplug_callback_mem_nb.notifier_call = arc_hotplug_callback; + /* There is no significance to the value 100 */ + arc_hotplug_callback_mem_nb.priority = 100; + register_memory_notifier(&arc_hotplug_callback_mem_nb); +#endif +} + +void +arc_unregister_hotplug(void) +{ +#ifdef CONFIG_MEMORY_HOTPLUG + unregister_memory_notifier(&arc_hotplug_callback_mem_nb); +#endif +} #else /* _KERNEL */ int64_t arc_available_memory(void) @@ -405,6 +461,16 @@ arc_free_memory(void) { return (spa_get_random(arc_all_memory() * 20 / 100)); } + +void +arc_register_hotplug(void) +{ +} + +void +arc_unregister_hotplug(void) +{ +} #endif /* _KERNEL */ /* diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c index 5267d67eea82..8780d7f6c70a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/policy.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c @@ -204,7 +204,8 @@ secpolicy_vnode_setdac(const cred_t *cr, uid_t owner) * Enforced in the Linux VFS. */ int -secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot) +secpolicy_vnode_setid_retain(struct znode *zp __maybe_unused, const cred_t *cr, + boolean_t issuidroot) { return (priv_policy_user(cr, CAP_FSETID, EPERM)); } @@ -271,7 +272,7 @@ void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, + secpolicy_vnode_setid_retain(NULL, cr, (vap->va_mode & S_ISUID) != 0 && (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) { vap->va_mask |= AT_MODE; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index a54961c76870..4bd27d1b516f 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -94,6 +94,14 @@ bdev_capacity(struct block_device *bdev) return (i_size_read(bdev->bd_inode)); } +#if !defined(HAVE_BDEV_WHOLE) +static inline struct block_device * +bdev_whole(struct block_device *bdev) +{ + return (bdev->bd_contains); +} +#endif + /* * Returns the maximum expansion capacity of the block device (in bytes). 
* @@ -118,7 +126,7 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) uint64_t psize; int64_t available; - if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) { + if (wholedisk && bdev != bdev_whole(bdev)) { /* * When reporting maximum expansion capacity for a wholedisk * deduct any capacity which is expected to be lost due to @@ -132,7 +140,7 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) * "reserved" EFI partition: in such cases return the device * usable capacity. */ - available = i_size_read(bdev->bd_contains->bd_inode) - + available = i_size_read(bdev_whole(bdev)->bd_inode) - ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + PARTITION_END_ALIGNMENT) << SECTOR_BITS); psize = MAX(available, bdev_capacity(bdev)); @@ -192,8 +200,8 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, vd->vd_bdev = NULL; if (bdev) { - if (v->vdev_expanding && bdev != bdev->bd_contains) { - bdevname(bdev->bd_contains, disk_name + 5); + if (v->vdev_expanding && bdev != bdev_whole(bdev)) { + bdevname(bdev_whole(bdev), disk_name + 5); /* * If userland has BLKPG_RESIZE_PARTITION, * then it should have updated the partition @@ -468,7 +476,11 @@ vdev_blkg_tryget(struct blkcg_gq *blkg) this_cpu_inc(*count); rc = true; } else { +#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA + rc = atomic_long_inc_not_zero(&ref->data->count); +#else rc = atomic_long_inc_not_zero(&ref->count); +#endif } rcu_read_unlock_sched(); @@ -787,7 +799,7 @@ vdev_disk_io_done(zio_t *zio) vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; - if (check_disk_change(vd->vd_bdev)) { + if (zfs_check_media_change(vd->vd_bdev)) { invalidate_bdev(vd->vd_bdev); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); @@ -822,9 +834,13 @@ vdev_disk_rele(vdev_t *vd) } vdev_ops_t vdev_disk_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_disk_io_start, .vdev_op_io_done = vdev_disk_io_done, .vdev_op_state_change = NULL, @@ -833,6 +849,11 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_rele = vdev_disk_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c index 423ce858144c..bf8a13ae6154 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c @@ -305,9 +305,13 @@ vdev_file_io_done(zio_t *zio) } vdev_ops_t vdev_file_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_file_open, .vdev_op_close = vdev_file_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, .vdev_op_io_done = vdev_file_io_done, .vdev_op_state_change = NULL, @@ -316,6 +320,11 @@ vdev_ops_t vdev_file_ops = { .vdev_op_rele = vdev_file_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = 
NULL, .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; @@ -341,9 +350,13 @@ vdev_file_fini(void) #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_file_open, .vdev_op_close = vdev_file_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, .vdev_op_io_done = vdev_file_io_done, .vdev_op_state_change = NULL, @@ -352,6 +365,11 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_rele = vdev_file_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c index c13a9771235d..a1668e46e4f9 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c @@ -467,7 +467,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; zp->z_zn_prefetch = B_FALSE; - zp->z_moved = B_FALSE; zp->z_is_sa = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_TRUE; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index 36bbd5d0829b..165c1218ae79 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -294,7 +294,7 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr) } else { /* * Sync all ZFS filesystems. This is what happens when you - * run sync(1M). Unlike other filesystems, ZFS honors the + * run sync(1). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); @@ -1451,7 +1451,7 @@ int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) { const char *osname = zm->mnt_osname; - struct inode *root_inode; + struct inode *root_inode = NULL; uint64_t recordsize; int error = 0; zfsvfs_t *zfsvfs = NULL; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c index b668c7dff013..3be387a30e5c 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c @@ -240,78 +240,6 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) return (0); } -#if defined(SEEK_HOLE) && defined(SEEK_DATA) -/* - * Lseek support for finding holes (cmd == SEEK_HOLE) and - * data (cmd == SEEK_DATA). "off" is an in/out parameter. 
- */ -static int -zfs_holey_common(struct inode *ip, int cmd, loff_t *off) -{ - znode_t *zp = ITOZ(ip); - uint64_t noff = (uint64_t)*off; /* new offset */ - uint64_t file_sz; - int error; - boolean_t hole; - - file_sz = zp->z_size; - if (noff >= file_sz) { - return (SET_ERROR(ENXIO)); - } - - if (cmd == SEEK_HOLE) - hole = B_TRUE; - else - hole = B_FALSE; - - error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); - - if (error == ESRCH) - return (SET_ERROR(ENXIO)); - - /* file was dirty, so fall back to using generic logic */ - if (error == EBUSY) { - if (hole) - *off = file_sz; - - return (0); - } - - /* - * We could find a hole that begins after the logical end-of-file, - * because dmu_offset_next() only works on whole blocks. If the - * EOF falls mid-block, then indicate that the "virtual hole" - * at the end of the file begins at the logical EOF, rather than - * at the end of the last block. - */ - if (noff > file_sz) { - ASSERT(hole); - noff = file_sz; - } - - if (noff < *off) - return (error); - *off = noff; - return (error); -} - -int -zfs_holey(struct inode *ip, int cmd, loff_t *off) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - error = zfs_holey_common(ip, cmd, off); - - ZFS_EXIT(zfsvfs); - return (error); -} -#endif /* SEEK_HOLE && SEEK_DATA */ - #if defined(_KERNEL) /* * When a file is memory mapped, we must keep the IO data synchronized @@ -320,10 +248,10 @@ zfs_holey(struct inode *ip, int cmd, loff_t *off) * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ -static void -update_pages(struct inode *ip, int64_t start, int len, - objset_t *os, uint64_t oid) +void +update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { + struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; struct page *pp; uint64_t nbytes; @@ -340,8 +268,8 @@ update_pages(struct inode *ip, int64_t start, int len, flush_dcache_page(pp); pb = kmap(pp); - (void) dmu_read(os, oid, start+off, nbytes, pb+off, - DMU_READ_PREFETCH); + (void) dmu_read(os, zp->z_id, start + off, nbytes, + pb + off, DMU_READ_PREFETCH); kunmap(pp); if (mapping_writably_mapped(mp)) @@ -369,12 +297,12 @@ update_pages(struct inode *ip, int64_t start, int len, * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ -static int -mappedread(struct inode *ip, int nbytes, uio_t *uio) +int +mappedread(znode_t *zp, int nbytes, uio_t *uio) { + struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; struct page *pp; - znode_t *zp = ITOZ(ip); int64_t start, off; uint64_t bytes; int len = nbytes; @@ -414,575 +342,9 @@ mappedread(struct inode *ip, int nbytes, uio_t *uio) } #endif /* _KERNEL */ -unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */ unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; /* - * Read bytes from specified file into supplied buffer. - * - * IN: ip - inode of file to be read from. - * uio - structure supplying read location, range info, - * and return buffer. - * ioflag - O_SYNC flags; used to provide FRSYNC semantics. - * O_DIRECT flag; used to bypass page cache. - * cr - credentials of caller. - * - * OUT: uio - updated offset and range, buffer filled. - * - * RETURN: 0 on success, error code on failure. 
- * - * Side Effects: - * inode - atime updated if byte count > 0 - */ -/* ARGSUSED */ -int -zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) -{ - int error = 0; - boolean_t frsync = B_FALSE; - - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (zp->z_pflags & ZFS_AV_QUARANTINED) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - - /* - * Validate file offset - */ - if (uio->uio_loffset < (offset_t)0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Fasttrack empty reads - */ - if (uio->uio_resid == 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - -#ifdef FRSYNC - /* - * If we're in FRSYNC mode, sync out this znode before reading it. - * Only do this for non-snapshots. - * - * Some platforms do not support FRSYNC and instead map it - * to O_SYNC, which results in unnecessary calls to zil_commit. We - * only honor FRSYNC requests on platforms which support it. - */ - frsync = !!(ioflag & FRSYNC); -#endif - if (zfsvfs->z_log && - (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) - zil_commit(zfsvfs->z_log, zp->z_id); - - /* - * Lock the range against changes. - */ - zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, - uio->uio_loffset, uio->uio_resid, RL_READER); - - /* - * If we are reading past end-of-file we can skip - * to the end; but we might still need to set atime. - */ - if (uio->uio_loffset >= zp->z_size) { - error = 0; - goto out; - } - - ASSERT(uio->uio_loffset < zp->z_size); - ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); - ssize_t start_resid = n; - -#ifdef HAVE_UIO_ZEROCOPY - xuio_t *xuio = NULL; - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { - int nblk; - int blksz = zp->z_blksz; - uint64_t offset = uio->uio_loffset; - - xuio = (xuio_t *)uio; - if ((ISP2(blksz))) { - nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, - blksz)) / blksz; - } else { - ASSERT(offset + n <= blksz); - nblk = 1; - } - (void) dmu_xuio_init(xuio, nblk); - - if (vn_has_cached_data(ip)) { - /* - * For simplicity, we always allocate a full buffer - * even if we only expect to read a portion of a block. - */ - while (--nblk >= 0) { - (void) dmu_xuio_add(xuio, - dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz), 0, blksz); - } - } - } -#endif /* HAVE_UIO_ZEROCOPY */ - - while (n > 0) { - ssize_t nbytes = MIN(n, zfs_read_chunk_size - - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); - - if (zp->z_is_mapped && !(ioflag & O_DIRECT)) { - error = mappedread(ip, nbytes, uio); - } else { - error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes); - } - - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; - } - - n -= nbytes; - } - - int64_t nread = start_resid - n; - dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); - task_io_account_read(nread); -out: - zfs_rangelock_exit(lr); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Write the bytes to a file. - * - * IN: ip - inode of file to be written to. - * uio - structure supplying write location, range info, - * and data buffer. - * ioflag - O_APPEND flag set if in append mode. - * O_DIRECT flag; used to bypass page cache. - * cr - credentials of caller. - * - * OUT: uio - updated offset and range. 
- * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - ctime|mtime updated if byte count > 0 - */ - -/* ARGSUSED */ -int -zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) -{ - int error = 0; - ssize_t start_resid = uio->uio_resid; - - /* - * Fasttrack empty write - */ - ssize_t n = start_resid; - if (n == 0) - return (0); - - rlim64_t limit = uio->uio_limit; - if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) - limit = MAXOFFSET_T; - - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ZTOZSB(zp); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - sa_bulk_attr_t bulk[4]; - int count = 0; - uint64_t mtime[2], ctime[2]; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - - /* - * Callers might not be able to detect properly that we are read-only, - * so check it explicitly here. - */ - if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - /* - * If immutable or not appending then return EPERM - */ - if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || - ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && - (uio->uio_loffset < zp->z_size))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - /* - * Validate file offset - */ - offset_t woff = ioflag & O_APPEND ? zp->z_size : uio->uio_loffset; - if (woff < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - int max_blksz = zfsvfs->z_max_blksz; - xuio_t *xuio = NULL; - - /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - * Skip this if uio contains loaned arc_buf. - */ -#ifdef HAVE_UIO_ZEROCOPY - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) - xuio = (xuio_t *)uio; - else -#endif - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EFAULT)); - } - - /* - * If in append mode, set the io offset pointer to eof. - */ - zfs_locked_range_t *lr; - if (ioflag & O_APPEND) { - /* - * Obtain an appending range lock to guarantee file append - * semantics. We reset the write offset once we have the lock. - */ - lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); - woff = lr->lr_offset; - if (lr->lr_length == UINT64_MAX) { - /* - * We overlocked the file because this write will cause - * the file block size to increase. - * Note that zp_size cannot change with this lock held. - */ - woff = zp->z_size; - } - uio->uio_loffset = woff; - } else { - /* - * Note that if the file block size will change as a result of - * this write, then this range lock will lock the entire file - * so that we can re-write the block safely. - */ - lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); - } - - if (woff >= limit) { - zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EFBIG)); - } - - if ((woff + n) > limit || woff > (limit - n)) - n = limit - woff; - - /* Will this write extend the file length? */ - int write_eof = (woff + n > zp->z_size); - - uint64_t end_size = MAX(zp->z_size, woff + n); - zilog_t *zilog = zfsvfs->z_log; -#ifdef HAVE_UIO_ZEROCOPY - int i_iov = 0; - const iovec_t *iovp = uio->uio_iov; - int iovcnt __maybe_unused = uio->uio_iovcnt; -#endif - - - /* - * Write the file in reasonable size chunks. 
Each chunk is written - * in a separate transaction; this keeps the intent log records small - * and allows us to do more fine-grained space accounting. - */ - while (n > 0) { - woff = uio->uio_loffset; - - if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, - KUID_TO_SUID(ip->i_uid)) || - zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, - KGID_TO_SGID(ip->i_gid)) || - (zp->z_projid != ZFS_DEFAULT_PROJID && - zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, - zp->z_projid))) { - error = SET_ERROR(EDQUOT); - break; - } - - arc_buf_t *abuf = NULL; - const iovec_t *aiov = NULL; - if (xuio) { -#ifdef HAVE_UIO_ZEROCOPY - ASSERT(i_iov < iovcnt); - ASSERT3U(uio->uio_segflg, !=, UIO_BVEC); - aiov = &iovp[i_iov]; - abuf = dmu_xuio_arcbuf(xuio, i_iov); - dmu_xuio_clear(xuio, i_iov); - ASSERT((aiov->iov_base == abuf->b_data) || - ((char *)aiov->iov_base - (char *)abuf->b_data + - aiov->iov_len == arc_buf_size(abuf))); - i_iov++; -#endif - } else if (n >= max_blksz && woff >= zp->z_size && - P2PHASE(woff, max_blksz) == 0 && - zp->z_blksz == max_blksz) { - /* - * This write covers a full block. "Borrow" a buffer - * from the dmu so that we can fill it before we enter - * a transaction. This avoids the possibility of - * holding up the transaction if the data copy hangs - * up on a pagefault (e.g., from an NFS server mapping). - */ - size_t cbytes; - - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - max_blksz); - ASSERT(abuf != NULL); - ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = uiocopy(abuf->b_data, max_blksz, - UIO_WRITE, uio, &cbytes))) { - dmu_return_arcbuf(abuf); - break; - } - ASSERT(cbytes == max_blksz); - } - - /* - * Start a transaction. - */ - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); - DB_DNODE_ENTER(db); - dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, - MIN(n, max_blksz)); - DB_DNODE_EXIT(db); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - if (abuf != NULL) - dmu_return_arcbuf(abuf); - break; - } - - /* - * If rangelock_enter() over-locked we grow the blocksize - * and then reduce the lock range. This will only happen - * on the first iteration since rangelock_reduce() will - * shrink down lr_length to the appropriate size. - */ - if (lr->lr_length == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, - 1 << highbit64(zp->z_blksz)); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); - zfs_rangelock_reduce(lr, woff, n); - } - - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? - */ - ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - - ssize_t tx_bytes; - if (abuf == NULL) { - tx_bytes = uio->uio_resid; - uio->uio_fault_disable = B_TRUE; - error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes, tx); - uio->uio_fault_disable = B_FALSE; - if (error == EFAULT) { - dmu_tx_commit(tx); - /* - * Account for partial writes before - * continuing the loop. - * Update needs to occur before the next - * uio_prefaultpages, or prefaultpages may - * error, and we may break the loop early. 
- */ - if (tx_bytes != uio->uio_resid) - n -= tx_bytes - uio->uio_resid; - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { - break; - } - continue; - } else if (error != 0) { - dmu_tx_commit(tx); - break; - } - tx_bytes -= uio->uio_resid; - } else { - tx_bytes = nbytes; - ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); - /* - * If this is not a full block write, but we are - * extending the file past EOF and this data starts - * block-aligned, use assign_arcbuf(). Otherwise, - * write via dmu_write(). - */ - if (tx_bytes < max_blksz && (!write_eof || - aiov->iov_base != abuf->b_data)) { - ASSERT(xuio); - dmu_write(zfsvfs->z_os, zp->z_id, woff, - /* cppcheck-suppress nullPointer */ - aiov->iov_len, aiov->iov_base, tx); - dmu_return_arcbuf(abuf); - xuio_stat_wbuf_copied(); - } else { - ASSERT(xuio || tx_bytes == max_blksz); - error = dmu_assign_arcbuf_by_dbuf( - sa_get_db(zp->z_sa_hdl), woff, abuf, tx); - if (error != 0) { - dmu_return_arcbuf(abuf); - dmu_tx_commit(tx); - break; - } - } - ASSERT(tx_bytes <= uio->uio_resid); - uioskip(uio, tx_bytes); - } - if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) { - update_pages(ip, woff, - tx_bytes, zfsvfs->z_os, zp->z_id); - } - - /* - * If we made no progress, we're done. If we made even - * partial progress, update the znode and ZIL accordingly. - */ - if (tx_bytes == 0) { - (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), - (void *)&zp->z_size, sizeof (uint64_t), tx); - dmu_tx_commit(tx); - ASSERT(error != 0); - break; - } - - /* - * Clear Set-UID/Set-GID bits on successful write if not - * privileged and at least one of the execute bits is set. - * - * It would be nice to do this after all writes have - * been done, but that would still expose the ISUID/ISGID - * to another app after the partial write is committed. - * - * Note: we don't call zfs_fuid_map_id() here because - * user 0 is not an ephemeral uid. - */ - mutex_enter(&zp->z_acl_lock); - uint32_t uid = KUID_TO_SUID(ip->i_uid); - if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | - (S_IXUSR >> 6))) != 0 && - (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, - ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { - uint64_t newmode; - zp->z_mode &= ~(S_ISUID | S_ISGID); - ip->i_mode = newmode = zp->z_mode; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), - (void *)&newmode, sizeof (uint64_t), tx); - } - mutex_exit(&zp->z_acl_lock); - - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); - - /* - * Update the file size (zp_size) if it has changed; - * account for possible concurrent updates. - */ - while ((end_size = zp->z_size) < uio->uio_loffset) { - (void) atomic_cas_64(&zp->z_size, end_size, - uio->uio_loffset); - ASSERT(error == 0); - } - /* - * If we are replaying and eof is non zero then force - * the file size to the specified eof. Note, there's no - * concurrency during replay. - */ - if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) - zp->z_size = zfsvfs->z_replay_eof; - - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - - zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, - NULL, NULL); - dmu_tx_commit(tx); - - if (error != 0) - break; - ASSERT(tx_bytes == nbytes); - n -= nbytes; - - if (!xuio && n > 0) { - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { - error = EFAULT; - break; - } - } - } - - zfs_inode_update(zp); - zfs_rangelock_exit(lr); - - /* - * If we're in replay mode, or we made no progress, return error. - * Otherwise, it's at least a partial write, so it's successful. 
- */ - if (zfsvfs->z_replay || uio->uio_resid == start_resid) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (ioflag & (O_SYNC | O_DSYNC) || - zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, zp->z_id); - - int64_t nwritten = start_resid - uio->uio_resid; - dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); - task_io_account_write(nwritten); - - ZFS_EXIT(zfsvfs); - return (0); -} - -/* * Write the bytes to a file. * * IN: zp - znode of file to be written to @@ -993,37 +355,40 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) * OUT: resid - remaining bytes to write * * RETURN: 0 if success - * positive error code if failure + * positive error code if failure. EIO is returned + * for a short write when residp isn't provided. * * Timestamps: * zp - ctime|mtime updated if byte count > 0 */ int zfs_write_simple(znode_t *zp, const void *data, size_t len, - loff_t pos, size_t *resid) + loff_t pos, size_t *residp) { - ssize_t written; - int error = 0; + fstrans_cookie_t cookie; + int error; - written = zpl_write_common(ZTOI(zp), data, len, &pos, - UIO_SYSSPACE, 0, kcred); - if (written < 0) { - error = -written; - } else if (resid == NULL) { - if (written < len) - error = SET_ERROR(EIO); /* short write */ - } else { - *resid = len - written; + struct iovec iov; + iov.iov_base = (void *)data; + iov.iov_len = len; + + uio_t uio; + uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); + + cookie = spl_fstrans_mark(); + error = zfs_write(zp, &uio, 0, kcred); + spl_fstrans_unmark(cookie); + + if (error == 0) { + if (residp != NULL) + *residp = uio_resid(&uio); + else if (uio_resid(&uio) != 0) + error = SET_ERROR(EIO); } + return (error); } -/* - * Drop a reference on the passed inode asynchronously. This ensures - * that the caller will never drop the last reference on an inode in - * the current context. Doing so while holding open a tx could result - * in a deadlock if iput_final() re-enters the filesystem code. - */ void zfs_zrele_async(znode_t *zp) { @@ -1040,179 +405,6 @@ zfs_zrele_async(znode_t *zp) zrele(zp); } -/* ARGSUSED */ -static void -zfs_get_done(zgd_t *zgd, int error) -{ - znode_t *zp = zgd->zgd_private; - - if (zgd->zgd_db) - dmu_buf_rele(zgd->zgd_db, zgd); - - zfs_rangelock_exit(zgd->zgd_lr); - - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - zfs_zrele_async(zp); - - kmem_free(zgd, sizeof (zgd_t)); -} - -#ifdef ZFS_DEBUG -static int zil_fault_io = 0; -#endif - -/* - * Get data to generate a TX_WRITE intent log record. - */ -int -zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) -{ - zfsvfs_t *zfsvfs = arg; - objset_t *os = zfsvfs->z_os; - znode_t *zp; - uint64_t object = lr->lr_foid; - uint64_t offset = lr->lr_offset; - uint64_t size = lr->lr_length; - dmu_buf_t *db; - zgd_t *zgd; - int error = 0; - - ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); - ASSERT3U(size, !=, 0); - - /* - * Nothing to do if the file has been removed - */ - if (zfs_zget(zfsvfs, object, &zp) != 0) - return (SET_ERROR(ENOENT)); - if (zp->z_unlinked) { - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - zfs_zrele_async(zp); - return (SET_ERROR(ENOENT)); - } - - zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_lwb = lwb; - zgd->zgd_private = zp; - - /* - * Write records come in two flavors: immediate and indirect. 
- * For small writes it's cheaper to store the data with the - * log record (immediate); for large writes it's cheaper to - * sync the data and get a pointer to it (indirect) so that - * we don't have to write the data twice. - */ - if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - /* test for truncation needs to be done while range locked */ - if (offset >= zp->z_size) { - error = SET_ERROR(ENOENT); - } else { - error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); - } - ASSERT(error == 0 || error == ENOENT); - } else { /* indirect write */ - /* - * Have to lock the whole block to ensure when it's - * written out and its checksum is being calculated - * that no one can change the data. We need to re-check - * blocksize after we get the lock in case it's changed! - */ - for (;;) { - uint64_t blkoff; - size = zp->z_blksz; - blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; - offset -= blkoff; - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - if (zp->z_blksz == size) - break; - offset += blkoff; - zfs_rangelock_exit(zgd->zgd_lr); - } - /* test for truncation needs to be done while range locked */ - if (lr->lr_offset >= zp->z_size) - error = SET_ERROR(ENOENT); -#ifdef ZFS_DEBUG - if (zil_fault_io) { - error = SET_ERROR(EIO); - zil_fault_io = 0; - } -#endif - if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); - - if (error == 0) { - blkptr_t *bp = &lr->lr_blkptr; - - zgd->zgd_db = db; - zgd->zgd_bp = bp; - - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); - - error = dmu_sync(zio, lr->lr_common.lrc_txg, - zfs_get_done, zgd); - ASSERT(error || lr->lr_length <= size); - - /* - * On success, we need to wait for the write I/O - * initiated by dmu_sync() to complete before we can - * release this dbuf. We will finish everything up - * in the zfs_get_done() callback. - */ - if (error == 0) - return (0); - - if (error == EALREADY) { - lr->lr_common.lrc_txtype = TX_WRITE2; - /* - * TX_WRITE2 relies on the data previously - * written by the TX_WRITE that caused - * EALREADY. We zero out the BP because - * it is the old, currently-on-disk BP. - */ - zgd->zgd_bp = NULL; - BP_ZERO(bp); - error = 0; - } - } - } - - zfs_get_done(zgd, error); - - return (error); -} - -/*ARGSUSED*/ -int -zfs_access(struct inode *ip, int mode, int flag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (flag & V_ACE_MASK) - error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); - else - error = zfs_zaccess_rwx(zp, mode, flag, cr); - - ZFS_EXIT(zfsvfs); - return (error); -} /* * Lookup an entry in a directory, or an extended attribute directory. @@ -2440,26 +1632,6 @@ out: return (error); } -ulong_t zfs_fsync_sync_cnt = 4; - -int -zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - - (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - - if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); - } - tsd_set(zfs_fsyncer_key, NULL); - - return (0); -} - /* * Get the basic file attributes and place them in the provided kstat * structure. 
The inode is assumed to be the authoritative source @@ -4796,207 +3968,9 @@ zfs_fid(struct inode *ip, fid_t *fidp) return (0); } -/*ARGSUSED*/ -int -zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - error = zfs_getacl(zp, vsecp, skipaclchk, cr); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/*ARGSUSED*/ -int -zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - zilog_t *zilog = zfsvfs->z_log; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - error = zfs_setacl(zp, vsecp, skipaclchk, cr); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -#ifdef HAVE_UIO_ZEROCOPY -/* - * The smallest read we may consider to loan out an arcbuf. - * This must be a power of 2. - */ -int zcr_blksz_min = (1 << 10); /* 1K */ -/* - * If set to less than the file block size, allow loaning out of an - * arcbuf for a partial block read. This must be a power of 2. - */ -int zcr_blksz_max = (1 << 17); /* 128K */ - -/*ARGSUSED*/ -static int -zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int max_blksz = zfsvfs->z_max_blksz; - uio_t *uio = &xuio->xu_uio; - ssize_t size = uio->uio_resid; - offset_t offset = uio->uio_loffset; - int blksz; - int fullblk, i; - arc_buf_t *abuf; - ssize_t maxsize; - int preamble, postamble; - - if (xuio->xu_type != UIOTYPE_ZEROCOPY) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - switch (ioflag) { - case UIO_WRITE: - /* - * Loan out an arc_buf for write if write size is bigger than - * max_blksz, and the file's block size is also max_blksz. - */ - blksz = max_blksz; - if (size < blksz || zp->z_blksz != blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - /* - * Caller requests buffers for write before knowing where the - * write offset might be (e.g. NFS TCP write). - */ - if (offset == -1) { - preamble = 0; - } else { - preamble = P2PHASE(offset, blksz); - if (preamble) { - preamble = blksz - preamble; - size -= preamble; - } - } - - postamble = P2PHASE(size, blksz); - size -= postamble; - - fullblk = size / blksz; - (void) dmu_xuio_init(xuio, - (preamble != 0) + fullblk + (postamble != 0)); - - /* - * Have to fix iov base/len for partial buffers. They - * currently represent full arc_buf's. - */ - if (preamble) { - /* data begins in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, - blksz - preamble, preamble); - } - - for (i = 0; i < fullblk; i++) { - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, 0, blksz); - } - - if (postamble) { - /* data ends in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, 0, postamble); - } - break; - case UIO_READ: - /* - * Loan out an arc_buf for read if the read size is larger than - * the current file block size. Block alignment is not - * considered. Partial arc_buf will be loaned out for read. 
- */ - blksz = zp->z_blksz; - if (blksz < zcr_blksz_min) - blksz = zcr_blksz_min; - if (blksz > zcr_blksz_max) - blksz = zcr_blksz_max; - /* avoid potential complexity of dealing with it */ - if (blksz > max_blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - maxsize = zp->z_size - uio->uio_loffset; - if (size > maxsize) - size = maxsize; - - if (size < blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - break; - default: - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - uio->uio_extflg = UIO_XUIO; - XUIO_XUZC_RW(xuio) = ioflag; - ZFS_EXIT(zfsvfs); - return (0); -} - -/*ARGSUSED*/ -static int -zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr) -{ - int i; - arc_buf_t *abuf; - int ioflag = XUIO_XUZC_RW(xuio); - - ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); - - i = dmu_xuio_cnt(xuio); - while (i-- > 0) { - abuf = dmu_xuio_arcbuf(xuio, i); - /* - * if abuf == NULL, it must be a write buffer - * that has been returned in zfs_write(). - */ - if (abuf) - dmu_return_arcbuf(abuf); - ASSERT(abuf || ioflag == UIO_WRITE); - } - - dmu_xuio_fini(xuio); - return (0); -} -#endif /* HAVE_UIO_ZEROCOPY */ - #if defined(_KERNEL) EXPORT_SYMBOL(zfs_open); EXPORT_SYMBOL(zfs_close); -EXPORT_SYMBOL(zfs_read); -EXPORT_SYMBOL(zfs_write); -EXPORT_SYMBOL(zfs_access); EXPORT_SYMBOL(zfs_lookup); EXPORT_SYMBOL(zfs_create); EXPORT_SYMBOL(zfs_tmpfile); @@ -5004,7 +3978,6 @@ EXPORT_SYMBOL(zfs_remove); EXPORT_SYMBOL(zfs_mkdir); EXPORT_SYMBOL(zfs_rmdir); EXPORT_SYMBOL(zfs_readdir); -EXPORT_SYMBOL(zfs_fsync); EXPORT_SYMBOL(zfs_getattr_fast); EXPORT_SYMBOL(zfs_setattr); EXPORT_SYMBOL(zfs_rename); @@ -5014,8 +3987,6 @@ EXPORT_SYMBOL(zfs_link); EXPORT_SYMBOL(zfs_inactive); EXPORT_SYMBOL(zfs_space); EXPORT_SYMBOL(zfs_fid); -EXPORT_SYMBOL(zfs_getsecattr); -EXPORT_SYMBOL(zfs_setsecattr); EXPORT_SYMBOL(zfs_getpage); EXPORT_SYMBOL(zfs_putpage); EXPORT_SYMBOL(zfs_dirty_inode); @@ -5024,8 +3995,6 @@ EXPORT_SYMBOL(zfs_map); /* BEGIN CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); -module_param(zfs_read_chunk_size, ulong, 0644); -MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk"); /* END CSTYLED */ #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c index a542c662cb15..b33594488ee0 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c @@ -134,7 +134,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; - zp->z_moved = B_FALSE; return (0); } @@ -505,6 +504,7 @@ zfs_inode_update(znode_t *zp) dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); spin_lock(&ip->i_lock); + ip->i_mode = zp->z_mode; ip->i_blocks = i_blocks; i_size_write(ip, zp->z_size); spin_unlock(&ip->i_lock); @@ -546,7 +546,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; - zp->z_moved = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_FALSE; zp->z_is_stale = B_FALSE; @@ -619,7 +618,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); zfsvfs->z_nr_znodes++; - membar_producer(); mutex_exit(&zfsvfs->z_znodes_lock); unlock_new_inode(ip); @@ -1901,7 +1899,6 @@ zfs_create_fs(objset_t 
*os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); rootzp->z_unlinked = B_FALSE; rootzp->z_atime_dirty = B_FALSE; - rootzp->z_moved = B_FALSE; rootzp->z_is_sa = USE_SA(version, os); rootzp->z_pflags = 0; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c index 96dabe55a138..8106359e1c77 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c @@ -1198,6 +1198,16 @@ zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); /* + * This is necessary here as we check next whether + * OBJSET_FLAG_USERACCOUNTING_COMPLETE or + * OBJSET_FLAG_USEROBJACCOUNTING are set in order to + * decide if the local_mac should be zeroed out. + */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + + /* * The local MAC protects the user, group and project accounting. * If these objects are not present, the local MAC is zeroed out. */ @@ -1208,7 +1218,10 @@ zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, (datalen >= OBJSET_PHYS_SIZE_V2 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || - (datalen <= OBJSET_PHYS_SIZE_V1)) { + (datalen <= OBJSET_PHYS_SIZE_V1) || + (((intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE) == 0 || + (intval & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE) == 0) && + key->zk_version > 0)) { bzero(local_mac, ZIO_OBJSET_MAC_LEN); return (0); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c index fa4500f6f8d1..e6420f19ed87 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c @@ -55,7 +55,7 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); int error = 0; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); if (!zpl_dir_emit_dots(filp, ctx)) goto out; @@ -76,7 +76,7 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) ctx->pos++; } out: - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); return (error); } @@ -242,13 +242,14 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) uint64_t id, pos; int error = 0; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); cookie = spl_fstrans_mark(); if (!zpl_dir_emit_dots(filp, ctx)) goto out; - pos = ctx->pos; + /* Start the position at 0 if it already emitted . and .. */ + pos = (ctx->pos == 2 ? 
0 : ctx->pos); while (error == 0) { dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, @@ -265,7 +266,7 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) } out: spl_fstrans_unmark(cookie); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); if (error == -ENOENT) return (0); @@ -368,13 +369,13 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); generic_fillattr(ip, stat); stat->nlink = stat->size = 2; stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); stat->atime = current_time(ip); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); return (0); } @@ -452,7 +453,7 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) znode_t *dzp; int error = 0; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); cookie = spl_fstrans_mark(); if (zfsvfs->z_shares_dir == 0) { @@ -471,7 +472,7 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) iput(ZTOI(dzp)); out: spl_fstrans_unmark(cookie); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); ASSERT3S(error, <=, 0); return (error); @@ -502,13 +503,13 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, znode_t *dzp; int error; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); if (zfsvfs->z_shares_dir == 0) { generic_fillattr(path->dentry->d_inode, stat); stat->nlink = stat->size = 2; stat->atime = current_time(ip); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); return (0); } @@ -518,7 +519,7 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, iput(ZTOI(dzp)); } - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); ASSERT3S(error, <=, 0); return (error); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index 51e189a87272..9e08c94e2147 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -212,244 +212,221 @@ zfs_io_flags(struct kiocb *kiocb) return (flags); } -static ssize_t -zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, - unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, - cred_t *cr, size_t skip) +/* + * If relatime is enabled, call file_accessed() if zfs_relatime_need_update() + * is true. This is needed since datasets with inherited "relatime" property + * aren't necessarily mounted with the MNT_RELATIME flag (e.g. after + * `zfs set relatime=...`), which is what relatime test in VFS by + * relatime_need_update() is based on. 
+ */
+static inline void
+zpl_file_accessed(struct file *filp)
{
- ssize_t read;
- uio_t uio = { { 0 }, 0 };
- int error;
- fstrans_cookie_t cookie;
-
- uio.uio_iov = iovp;
- uio.uio_iovcnt = nr_segs;
- uio.uio_loffset = *ppos;
- uio.uio_segflg = segment;
- uio.uio_limit = MAXOFFSET_T;
- uio.uio_resid = count;
- uio.uio_skip = skip;
-
- cookie = spl_fstrans_mark();
- error = -zfs_read(ip, &uio, flags, cr);
- spl_fstrans_unmark(cookie);
- if (error < 0)
- return (error);
-
- read = count - uio.uio_resid;
- *ppos += read;
+ struct inode *ip = filp->f_mapping->host;
- return (read);
+ if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
+ if (zfs_relatime_need_update(ip))
+ file_accessed(filp);
+ } else {
+ file_accessed(filp);
+ }
}
-inline ssize_t
-zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
- uio_seg_t segment, int flags, cred_t *cr)
-{
- struct iovec iov;
-
- iov.iov_base = (void *)buf;
- iov.iov_len = len;
+#if defined(HAVE_VFS_RW_ITERATE)
- return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment,
- flags, cr, 0));
+/*
+ * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports
+ * iovecs, kvecs, bvecs and pipes, plus all the required interfaces to
+ * manipulate the iov_iter are available. In which case the full iov_iter
+ * can be attached to the uio and correctly handled in the lower layers.
+ * Otherwise, for older kernels extract the iovec and pass it instead.
+ */
+static void
+zpl_uio_init(uio_t *uio, struct kiocb *kiocb, struct iov_iter *to,
+ loff_t pos, ssize_t count, size_t skip)
+{
+#if defined(HAVE_VFS_IOV_ITER)
+ uio_iov_iter_init(uio, to, pos, count, skip);
+#else
+ uio_iovec_init(uio, to->iov, to->nr_segs, pos,
+ to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE,
+ count, skip);
+#endif
}
static ssize_t
-zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp,
- unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
+zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
{
cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
struct file *filp = kiocb->ki_filp;
- struct inode *ip = filp->f_mapping->host;
- zfsvfs_t *zfsvfs = ZTOZSB(ITOZ(ip));
- ssize_t read;
- unsigned int f_flags = filp->f_flags;
+ ssize_t count = iov_iter_count(to);
+ uio_t uio;
+
+ zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0);
- f_flags |= zfs_io_flags(kiocb);
crhold(cr);
- read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count,
- nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
+ cookie = spl_fstrans_mark();
+
+ int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
+
+ spl_fstrans_unmark(cookie);
crfree(cr);
- /*
- * If relatime is enabled, call file_accessed() only if
- * zfs_relatime_need_update() is true. This is needed since datasets
- * with inherited "relatime" property aren't necessarily mounted with
- * MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is what
- * relatime test in VFS by relatime_need_update() is based on.
- */ - if (!IS_NOATIME(ip) && zfsvfs->z_relatime) { - if (zfs_relatime_need_update(ip)) - file_accessed(filp); - } else { - file_accessed(filp); - } + if (error < 0) + return (error); + + ssize_t read = count - uio.uio_resid; + kiocb->ki_pos += read; + + zpl_file_accessed(filp); return (read); } -#if defined(HAVE_VFS_RW_ITERATE) -static ssize_t -zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) +static inline ssize_t +zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, + size_t *countp) { - ssize_t ret; - uio_seg_t seg = UIO_USERSPACE; - if (to->type & ITER_KVEC) - seg = UIO_SYSSPACE; - if (to->type & ITER_BVEC) - seg = UIO_BVEC; - ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs, - iov_iter_count(to), seg, to->iov_offset); - if (ret > 0) - iov_iter_advance(to, ret); - return (ret); -} +#ifdef HAVE_GENERIC_WRITE_CHECKS_KIOCB + ssize_t ret = generic_write_checks(kiocb, from); + if (ret <= 0) + return (ret); + + *countp = ret; #else -static ssize_t -zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp, - unsigned long nr_segs, loff_t pos) -{ - ssize_t ret; - size_t count; + struct file *file = kiocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *ip = mapping->host; + int isblk = S_ISBLK(ip->i_mode); - ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_WRITE); + *countp = iov_iter_count(from); + ssize_t ret = generic_write_checks(file, &kiocb->ki_pos, countp, isblk); if (ret) return (ret); +#endif - return (zpl_iter_read_common(kiocb, iovp, nr_segs, count, - UIO_USERSPACE, 0)); + return (0); } -#endif /* HAVE_VFS_RW_ITERATE */ static ssize_t -zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, - unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, - cred_t *cr, size_t skip) +zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { - ssize_t wrote; - uio_t uio = { { 0 }, 0 }; - int error; + cred_t *cr = CRED(); fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + uio_t uio; + size_t count = 0; + ssize_t ret; - if (flags & O_APPEND) - *ppos = i_size_read(ip); + ret = zpl_generic_write_checks(kiocb, from, &count); + if (ret) + return (ret); - uio.uio_iov = iovp; - uio.uio_iovcnt = nr_segs; - uio.uio_loffset = *ppos; - uio.uio_segflg = segment; - uio.uio_limit = MAXOFFSET_T; - uio.uio_resid = count; - uio.uio_skip = skip; + zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); + crhold(cr); cookie = spl_fstrans_mark(); - error = -zfs_write(ip, &uio, flags, cr); + + int error = -zfs_write(ITOZ(ip), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); + spl_fstrans_unmark(cookie); + crfree(cr); + if (error < 0) return (error); - wrote = count - uio.uio_resid; - *ppos += wrote; + ssize_t wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + if (wrote > 0) + iov_iter_advance(from, wrote); return (wrote); } -inline ssize_t -zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, - uio_seg_t segment, int flags, cred_t *cr) -{ - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - - return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment, - flags, cr, 0)); -} +#else /* !HAVE_VFS_RW_ITERATE */ static ssize_t -zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp, - unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) +zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { cred_t *cr = CRED(); 
+ fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; - ssize_t wrote; - unsigned int f_flags = filp->f_flags; - - f_flags |= zfs_io_flags(kiocb); - crhold(cr); - wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count, - nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip); - crfree(cr); - - return (wrote); -} - -#if defined(HAVE_VFS_RW_ITERATE) -static ssize_t -zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) -{ size_t count; ssize_t ret; - uio_seg_t seg = UIO_USERSPACE; -#ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB - struct file *file = kiocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *ip = mapping->host; - int isblk = S_ISBLK(ip->i_mode); - - count = iov_iter_count(from); - ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk); + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (ret) return (ret); -#else - /* - * XXX - ideally this check should be in the same lock region with - * write operations, so that there's no TOCTTOU race when doing - * append and someone else grow the file. - */ - ret = generic_write_checks(kiocb, from); - if (ret <= 0) - return (ret); - count = ret; -#endif - if (from->type & ITER_KVEC) - seg = UIO_SYSSPACE; - if (from->type & ITER_BVEC) - seg = UIO_BVEC; + uio_t uio; + uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + count, 0); - ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs, - count, seg, from->iov_offset); - if (ret > 0) - iov_iter_advance(from, ret); + crhold(cr); + cookie = spl_fstrans_mark(); + + int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error < 0) + return (error); - return (ret); + ssize_t read = count - uio.uio_resid; + kiocb->ki_pos += read; + + zpl_file_accessed(filp); + + return (read); } -#else + static ssize_t -zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp, +zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct file *file = kiocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *ip = mapping->host; - int isblk = S_ISBLK(ip->i_mode); + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; size_t count; ssize_t ret; - ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_READ); + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); if (ret) return (ret); - ret = generic_write_checks(file, &pos, &count, isblk); + ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode)); if (ret) return (ret); - return (zpl_iter_write_common(kiocb, iovp, nr_segs, count, - UIO_USERSPACE, 0)); + uio_t uio; + uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + count, 0); + + crhold(cr); + cookie = spl_fstrans_mark(); + + int error = -zfs_write(ITOZ(ip), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error < 0) + return (error); + + ssize_t wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + return (wrote); } #endif /* HAVE_VFS_RW_ITERATE */ @@ -486,14 +463,27 @@ zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) #error "Unknown direct IO interface" #endif -#else +#else /* HAVE_VFS_RW_ITERATE */ #if defined(HAVE_VFS_DIRECT_IO_IOVEC) static ssize_t -zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iovp, +zpl_direct_IO(int rw, struct 
kiocb *kiocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { if (rw == WRITE) + return (zpl_aio_write(kiocb, iov, nr_segs, pos)); + else + return (zpl_aio_read(kiocb, iov, nr_segs, pos)); +} +#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) +static ssize_t +zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) +{ + const struct iovec *iovp = iov_iter_iovec(iter); + unsigned long nr_segs = iter->nr_segs; + + ASSERT3S(pos, ==, kiocb->ki_pos); + if (rw == WRITE) return (zpl_aio_write(kiocb, iovp, nr_segs, pos)); else return (zpl_aio_read(kiocb, iovp, nr_segs, pos)); @@ -517,7 +507,7 @@ zpl_llseek(struct file *filp, loff_t offset, int whence) spl_inode_lock_shared(ip); cookie = spl_fstrans_mark(); - error = -zfs_holey(ip, whence, &offset); + error = -zfs_holey(ITOZ(ip), whence, &offset); spl_fstrans_unmark(cookie); if (error == 0) error = lseek_execute(filp, ip, offset, maxbytes); @@ -603,10 +593,6 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) * Populate a page with data for the Linux page cache. This function is * only used to support mmap(2). There will be an identical copy of the * data in the ARC which is kept up to date via .write() and .writepage(). - * - * Current this function relies on zpl_read_common() and the O_DIRECT - * flag to read in a page. This works but the more correct way is to - * update zfs_fillpage() to be Linux friendly and use that interface. */ static int zpl_readpage(struct file *filp, struct page *pp) @@ -675,10 +661,10 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) enum writeback_sync_modes sync_mode; int result; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); sync_mode = wbc->sync_mode; /* @@ -691,11 +677,11 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) wbc->sync_mode = WB_SYNC_NONE; result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); if (sync_mode != wbc->sync_mode) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + ZPL_ENTER(zfsvfs); + ZPL_VERIFY_ZP(zp); if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); /* * We need to call write_cache_pages() again (we can't just @@ -1037,6 +1023,10 @@ const struct file_operations zpl_file_operations = { #endif .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, +#ifdef HAVE_VFS_IOV_ITER + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, +#endif #else .read = do_sync_read, .write = do_sync_write, diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c index f3b97a22074c..f336fbb1272b 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c @@ -490,19 +490,17 @@ zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link) { fstrans_cookie_t cookie; cred_t *cr = CRED(); - struct iovec iov; - uio_t uio = { { 0 }, 0 }; int error; crhold(cr); *link = NULL; + + struct iovec iov; iov.iov_len = MAXPATHLEN; iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_resid = (MAXPATHLEN - 1); + uio_t uio; + uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, MAXPATHLEN - 1, 0); cookie = spl_fstrans_mark(); error = -zfs_readlink(ip, &uio, cr); diff --git 
a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index 9db8bda4cc66..c2fd3fee1401 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -185,14 +185,27 @@ zpl_remount_fs(struct super_block *sb, int *flags, char *data)
static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
- char *fsname;
+ ZPL_ENTER(zfsvfs);
- ZFS_ENTER(zfsvfs);
- fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
dmu_objset_name(zfsvfs->z_os, fsname);
- seq_puts(seq, fsname);
+
+ for (int i = 0; fsname[i] != 0; i++) {
+ /*
+ * Spaces in the dataset name must be converted to their
+ * octal escape sequence for getmntent(3) to correctly
+ * parse the fsname portion of /proc/self/mounts.
+ */
+ if (fsname[i] == ' ') {
+ seq_puts(seq, "\\040");
+ } else {
+ seq_putc(seq, fsname[i]);
+ }
+ }
+
kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
- ZFS_EXIT(zfsvfs);
+
+ ZPL_EXIT(zfsvfs);
return (0);
}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c index 9b5fd0fd397b..1ec3dae2bb81 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
@@ -274,10 +274,10 @@ static int
zpl_xattr_get_dir(struct inode *ip, const char *name, void *value,
size_t size, cred_t *cr)
{
+ fstrans_cookie_t cookie;
struct inode *xip = NULL;
znode_t *dxzp = NULL;
znode_t *xzp = NULL;
- loff_t pos = 0;
int error;
/* Lookup the xattr directory */
@@ -302,7 +302,19 @@ zpl_xattr_get_dir(struct inode *ip, const char *name, void *value,
goto out;
}
- error = zpl_read_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr);
+ struct iovec iov;
+ iov.iov_base = (void *)value;
+ iov.iov_len = size;
+
+ uio_t uio;
+ uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, size, 0);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_read(ITOZ(xip), &uio, 0, cr);
+ spl_fstrans_unmark(cookie);
+
+ if (error == 0)
+ error = size - uio_resid(&uio);
out:
if (xzp)
zrele(xzp);
@@ -441,7 +453,6 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value,
znode_t *dxzp = NULL;
znode_t *xzp = NULL;
vattr_t *vap = NULL;
- ssize_t wrote;
int lookup_flags, error;
const int xattr_mode = S_IFREG | 0644;
loff_t pos = 0;
@@ -496,13 +507,8 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value,
if (error)
goto out;
- wrote = zpl_write_common(ZTOI(xzp), value, size, &pos,
- UIO_SYSSPACE, 0, cr);
- if (wrote < 0)
- error = wrote;
-
+ error = -zfs_write_simple(xzp, value, size, pos, NULL);
out:
-
if (error == 0) {
ip->i_ctime = current_time(ip);
zfs_mark_inode_dirty(ip);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 218e1101edf8..cdc2076702af 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -66,49 +66,33 @@ typedef struct zv_request {
* Given a path, return TRUE if path is a ZVOL.
*/ static boolean_t -zvol_is_zvol_impl(const char *device) +zvol_is_zvol_impl(const char *path) { - struct block_device *bdev; - unsigned int major; + dev_t dev = 0; - bdev = vdev_lookup_bdev(device); - if (IS_ERR(bdev)) + if (vdev_lookup_bdev(path, &dev) != 0) return (B_FALSE); - major = MAJOR(bdev->bd_dev); - bdput(bdev); - - if (major == zvol_major) + if (MAJOR(dev) == zvol_major) return (B_TRUE); return (B_FALSE); } static void -uio_from_bio(uio_t *uio, struct bio *bio) -{ - uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; - uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); - uio->uio_loffset = BIO_BI_SECTOR(bio) << 9; - uio->uio_segflg = UIO_BVEC; - uio->uio_limit = MAXOFFSET_T; - uio->uio_resid = BIO_BI_SIZE(bio); - uio->uio_skip = BIO_BI_SKIP(bio); -} - -static void zvol_write(void *arg) { - int error = 0; - zv_request_t *zvr = arg; struct bio *bio = zvr->bio; - uio_t uio = { { 0 }, 0 }; - uio_from_bio(&uio, bio); + int error = 0; + uio_t uio; + + uio_bvec_init(&uio, bio); zvol_state_t *zv = zvr->zv; - ASSERT(zv && zv->zv_open_count > 0); - ASSERT(zv->zv_zilog != NULL); + ASSERT3P(zv, !=, NULL); + ASSERT3U(zv->zv_open_count, >, 0); + ASSERT3P(zv->zv_zilog, !=, NULL); /* bio marked as FLUSH need to flush before write */ if (bio_is_flush(bio)) @@ -122,10 +106,14 @@ zvol_write(void *arg) return; } + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; - unsigned long start_jif = jiffies; - blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE, - bio_sectors(bio), &zv->zv_zso->zvo_disk->part0); + unsigned long start_time; + + boolean_t acct = blk_queue_io_stat(q); + if (acct) + start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); boolean_t sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; @@ -169,8 +157,10 @@ zvol_write(void *arg) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); - blk_generic_end_io_acct(zv->zv_zso->zvo_queue, - WRITE, &zv->zv_zso->zvo_disk->part0, start_jif); + + if (acct) + blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); + BIO_END_IO(bio, -error); kmem_free(zvr, sizeof (zv_request_t)); } @@ -187,14 +177,18 @@ zvol_discard(void *arg) boolean_t sync; int error = 0; dmu_tx_t *tx; - unsigned long start_jif; - ASSERT(zv && zv->zv_open_count > 0); - ASSERT(zv->zv_zilog != NULL); + ASSERT3P(zv, !=, NULL); + ASSERT3U(zv->zv_open_count, >, 0); + ASSERT3P(zv->zv_zilog, !=, NULL); + + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; + unsigned long start_time; - start_jif = jiffies; - blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE, - bio_sectors(bio), &zv->zv_zso->zvo_disk->part0); + boolean_t acct = blk_queue_io_stat(q); + if (acct) + start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; @@ -239,8 +233,10 @@ zvol_discard(void *arg) unlock: rw_exit(&zv->zv_suspend_lock); - blk_generic_end_io_acct(zv->zv_zso->zvo_queue, WRITE, - &zv->zv_zso->zvo_disk->part0, start_jif); + + if (acct) + blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); + BIO_END_IO(bio, -error); kmem_free(zvr, sizeof (zv_request_t)); } @@ -248,20 +244,25 @@ unlock: static void zvol_read(void *arg) { - int error = 0; - zv_request_t *zvr = arg; struct bio *bio = zvr->bio; - uio_t uio = { { 0 }, 0 }; - uio_from_bio(&uio, bio); + int error = 0; + uio_t uio; + + uio_bvec_init(&uio, bio); zvol_state_t *zv = zvr->zv; - ASSERT(zv && zv->zv_open_count 
> 0); + ASSERT3P(zv, !=, NULL); + ASSERT3U(zv->zv_open_count, >, 0); + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; - unsigned long start_jif = jiffies; - blk_generic_start_io_acct(zv->zv_zso->zvo_queue, READ, bio_sectors(bio), - &zv->zv_zso->zvo_disk->part0); + unsigned long start_time; + + boolean_t acct = blk_queue_io_stat(q); + if (acct) + start_time = blk_generic_start_io_acct(q, disk, READ, bio); zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_READER); @@ -289,8 +290,10 @@ zvol_read(void *arg) task_io_account_read(nread); rw_exit(&zv->zv_suspend_lock); - blk_generic_end_io_acct(zv->zv_zso->zvo_queue, READ, - &zv->zv_zso->zvo_disk->part0, start_jif); + + if (acct) + blk_generic_end_io_acct(q, disk, READ, bio, start_time); + BIO_END_IO(bio, -error); kmem_free(zvr, sizeof (zv_request_t)); } @@ -482,9 +485,9 @@ zvol_open(struct block_device *bdev, fmode_t flag) rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock)); if (zv->zv_open_count == 0) { + ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); if (error) goto out_mutex; @@ -501,7 +504,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) if (drop_suspend) rw_exit(&zv->zv_suspend_lock); - check_disk_change(bdev); + zfs_check_media_change(bdev); return (0); @@ -530,7 +533,7 @@ zvol_release(struct gendisk *disk, fmode_t mode) zv = disk->private_data; mutex_enter(&zv->zv_state_lock); - ASSERT(zv->zv_open_count > 0); + ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition @@ -553,11 +556,12 @@ zvol_release(struct gendisk *disk, fmode_t mode) rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock)); zv->zv_open_count--; - if (zv->zv_open_count == 0) + if (zv->zv_open_count == 0) { + ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); + } mutex_exit(&zv->zv_state_lock); @@ -652,8 +656,15 @@ zvol_revalidate_disk(struct gendisk *disk) static int zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) { + struct gendisk *disk = zv->zv_zso->zvo_disk; - revalidate_disk(zv->zv_zso->zvo_disk); +#if defined(HAVE_REVALIDATE_DISK_SIZE) + revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); +#elif defined(HAVE_REVALIDATE_DISK) + revalidate_disk(disk); +#else + zvol_revalidate_disk(disk); +#endif return (0); } @@ -697,46 +708,6 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) return (0); } -/* - * Find a zvol_state_t given the full major+minor dev_t. If found, - * return with zv_state_lock taken, otherwise, return (NULL) without - * taking zv_state_lock. - */ -static zvol_state_t * -zvol_find_by_dev(dev_t dev) -{ - zvol_state_t *zv; - - rw_enter(&zvol_state_lock, RW_READER); - for (zv = list_head(&zvol_state_list); zv != NULL; - zv = list_next(&zvol_state_list, zv)) { - mutex_enter(&zv->zv_state_lock); - if (zv->zv_zso->zvo_dev == dev) { - rw_exit(&zvol_state_lock); - return (zv); - } - mutex_exit(&zv->zv_state_lock); - } - rw_exit(&zvol_state_lock); - - return (NULL); -} - -static struct kobject * -zvol_probe(dev_t dev, int *part, void *arg) -{ - zvol_state_t *zv; - struct kobject *kobj; - - zv = zvol_find_by_dev(dev); - kobj = zv ? 
get_disk_and_module(zv->zv_zso->zvo_disk) : NULL; - ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock)); - if (zv) - mutex_exit(&zv->zv_state_lock); - - return (kobj); -} - static struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, @@ -774,6 +745,7 @@ zvol_alloc(dev_t dev, const char *name) zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso = zso; + zv->zv_volmode = volmode; list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); @@ -859,8 +831,8 @@ zvol_free(zvol_state_t *zv) ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); - ASSERT(zv->zv_open_count == 0); - ASSERT(zv->zv_zso->zvo_disk->private_data == NULL); + ASSERT0(zv->zv_open_count); + ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); @@ -879,6 +851,11 @@ zvol_free(zvol_state_t *zv) kmem_free(zv, sizeof (zvol_state_t)); } +void +zvol_wait_close(zvol_state_t *zv) +{ +} + /* * Create a block device minor node and setup the linkage between it * and the specified volume. Once this function returns the block @@ -1083,9 +1060,6 @@ zvol_init(void) return (-ENOMEM); } zvol_init_impl(); - blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS, - THIS_MODULE, zvol_probe, NULL, NULL); - ida_init(&zvol_ida); zvol_register_ops(&zvol_linux_ops); return (0); @@ -1095,7 +1069,6 @@ void zvol_fini(void) { zvol_fini_impl(); - blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); unregister_blkdev(zvol_major, ZVOL_DRIVER); taskq_destroy(zvol_taskq); ida_destroy(&zvol_ida); diff --git a/sys/contrib/openzfs/module/zcommon/Makefile.in b/sys/contrib/openzfs/module/zcommon/Makefile.in index b5cdf4c0c9fe..ebc538440445 100644 --- a/sys/contrib/openzfs/module/zcommon/Makefile.in +++ b/sys/contrib/openzfs/module/zcommon/Makefile.in @@ -19,7 +19,6 @@ $(MODULE)-objs += zfs_fletcher_superscalar.o $(MODULE)-objs += zfs_fletcher_superscalar4.o $(MODULE)-objs += zfs_namecheck.o $(MODULE)-objs += zfs_prop.o -$(MODULE)-objs += zfs_uio.o $(MODULE)-objs += zpool_prop.o $(MODULE)-objs += zprop_common.o diff --git a/sys/contrib/openzfs/module/zcommon/zfeature_common.c b/sys/contrib/openzfs/module/zcommon/zfeature_common.c index 97ddacbab9e0..34ebabcf3b3c 100644 --- a/sys/contrib/openzfs/module/zcommon/zfeature_common.c +++ b/sys/contrib/openzfs/module/zcommon/zfeature_common.c @@ -576,7 +576,7 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_DEVICE_REBUILD, "org.openzfs:device_rebuild", "device_rebuild", - "Support for sequential device rebuilds", + "Support for sequential mirror/dRAID device rebuilds", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); { @@ -589,6 +589,10 @@ zpool_feature_init(void) "zstd compression algorithm support.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps); } + + zfeature_register(SPA_FEATURE_DRAID, + "org.openzfs:draid", "draid", "Support for distributed spare RAID", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL); } #if defined(_KERNEL) diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c index 3e0632a32864..7a9de4a4309d 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c @@ -660,7 +660,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \ } -#define 
FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */ +#define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */ typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, zio_cksum_t *); @@ -885,23 +885,26 @@ zio_abd_checksum_func_t fletcher_4_abd_ops = { .acf_iter = abd_fletcher_4_iter }; +#if defined(_KERNEL) + +#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") -#if defined(_KERNEL) && defined(__linux__) +#if defined(__linux__) static int fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused) { const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); char *fmt; - int i, cnt = 0; + int cnt = 0; /* list fastest */ - fmt = (impl == IMPL_FASTEST) ? "[%s] " : "%s "; + fmt = IMPL_FMT(impl, IMPL_FASTEST); cnt += sprintf(buffer + cnt, fmt, "fastest"); /* list all supported implementations */ - for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { - fmt = (i == impl) ? "[%s] " : "%s "; + for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); cnt += sprintf(buffer + cnt, fmt, fletcher_4_supp_impls[i]->name); } @@ -915,14 +918,62 @@ fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused) return (fletcher_4_impl_set(val)); } +#else + +#include <sys/sbuf.h> + +static int +fletcher_4_param(ZFS_MODULE_PARAM_ARGS) +{ + int err; + + if (req->newptr == NULL) { + const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); + const int init_buflen = 64; + const char *fmt; + struct sbuf *s; + + s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + (void) sbuf_printf(s, fmt, "fastest"); + + /* list all supported implementations */ + for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + (void) sbuf_printf(s, fmt, + fletcher_4_supp_impls[i]->name); + } + + err = sbuf_finish(s); + sbuf_delete(s); + + return (err); + } + + char buf[16]; + + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) + return (err); + return (-fletcher_4_impl_set(buf)); +} + +#endif + +#undef IMPL_FMT + /* * Choose a fletcher 4 implementation in ZFS. * Users can choose "cycle" to exercise all implementations, but this is * for testing purpose therefore it can only be set in user space. 
*/ -module_param_call(zfs_fletcher_4_impl, - fletcher_4_param_set, fletcher_4_param_get, NULL, 0644); -MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation."); +/* BEGIN CSTYLED */ +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, fletcher_4_impl, + fletcher_4_param_set, fletcher_4_param_get, ZMOD_RW, + "Select fletcher 4 implementation."); +/* END CSTYLED */ EXPORT_SYMBOL(fletcher_init); EXPORT_SYMBOL(fletcher_2_incremental_native); diff --git a/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c b/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c index f8625042a74c..0011a971cacb 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c @@ -442,7 +442,9 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) return (-1); } - if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { + if (strcmp(pool, "mirror") == 0 || + strcmp(pool, "raidz") == 0 || + strcmp(pool, "draid") == 0) { if (why) *why = NAME_ERR_RESERVED; return (-1); diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c index 0352b13aa240..b78331187e13 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c @@ -551,14 +551,14 @@ zfs_prop_init(void) PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none", "MOUNTPOINT"); zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | NFS share options", "SHARENFS"); zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "filesystem | volume | snapshot | bookmark", "TYPE"); zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "on | off | sharemgr(1M) options", "SHARESMB"); + "on | off | SMB share options", "SHARESMB"); zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel", ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET, "<sensitivity label>", "MLSLABEL"); @@ -1016,7 +1016,7 @@ zcommon_fini(void) kfpu_fini(); } -module_init(zcommon_init); +module_init_early(zcommon_init); module_exit(zcommon_fini); #endif diff --git a/sys/contrib/openzfs/module/zcommon/zfs_uio.c b/sys/contrib/openzfs/module/zcommon/zfs_uio.c index d586e0a1220a..e435e1a9f78a 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_uio.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_uio.c @@ -39,12 +39,6 @@ * Copyright (c) 2015 by Chunwei Chen. All rights reserved. */ -/* - * The uio support from OpenSolaris has been added as a short term - * work around. The hope is to adopt native Linux type and drop the - * use of uio's entirely. Under Linux they only add overhead and - * when possible we want to use native APIs for the ZPL layer. 
- */ #ifdef _KERNEL #include <sys/types.h> @@ -71,7 +65,6 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio) cnt = MIN(iov->iov_len - skip, n); switch (uio->uio_segflg) { case UIO_USERSPACE: - case UIO_USERISPACE: /* * p = kernel data pointer * iov->iov_base = user data pointer @@ -165,81 +158,82 @@ uiomove_bvec(void *p, size_t n, enum uio_rw rw, struct uio *uio) return (0); } +#if defined(HAVE_VFS_IOV_ITER) +static int +uiomove_iter(void *p, size_t n, enum uio_rw rw, struct uio *uio, + boolean_t revert) +{ + size_t cnt = MIN(n, uio->uio_resid); + + if (uio->uio_skip) + iov_iter_advance(uio->uio_iter, uio->uio_skip); + + if (rw == UIO_READ) + cnt = copy_to_iter(p, cnt, uio->uio_iter); + else + cnt = copy_from_iter(p, cnt, uio->uio_iter); + + /* + * When operating on a full pipe no bytes are processed. + * In which case return EFAULT which is converted to EAGAIN + * by the kernel's generic_file_splice_read() function. + */ + if (cnt == 0) + return (EFAULT); + + /* + * Revert advancing the uio_iter. This is set by uiocopy() + * to avoid consuming the uio and its iov_iter structure. + */ + if (revert) + iov_iter_revert(uio->uio_iter, cnt); + + uio->uio_resid -= cnt; + uio->uio_loffset += cnt; + + return (0); +} +#endif + int uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio) { - if (uio->uio_segflg != UIO_BVEC) - return (uiomove_iov(p, n, rw, uio)); - else + if (uio->uio_segflg == UIO_BVEC) return (uiomove_bvec(p, n, rw, uio)); +#if defined(HAVE_VFS_IOV_ITER) + else if (uio->uio_segflg == UIO_ITER) + return (uiomove_iter(p, n, rw, uio, B_FALSE)); +#endif + else + return (uiomove_iov(p, n, rw, uio)); } EXPORT_SYMBOL(uiomove); -#define fuword8(uptr, vptr) get_user((*vptr), (uptr)) - -/* - * Fault in the pages of the first n bytes specified by the uio structure. - * 1 byte in each page is touched and the uio struct is unmodified. Any - * error will terminate the process as this is only a best attempt to get - * the pages resident. - */ int uio_prefaultpages(ssize_t n, struct uio *uio) { - const struct iovec *iov; - ulong_t cnt, incr; - caddr_t p; - uint8_t tmp; - int iovcnt; - size_t skip; + struct iov_iter iter, *iterp = NULL; - /* no need to fault in kernel pages */ - switch (uio->uio_segflg) { - case UIO_SYSSPACE: - case UIO_BVEC: - return (0); - case UIO_USERSPACE: - case UIO_USERISPACE: - break; - default: - ASSERT(0); - } - - iov = uio->uio_iov; - iovcnt = uio->uio_iovcnt; - skip = uio->uio_skip; - - for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) { - cnt = MIN(iov->iov_len - skip, n); - /* empty iov */ - if (cnt == 0) - continue; - n -= cnt; - /* - * touch each page in this segment. - */ - p = iov->iov_base + skip; - while (cnt) { - if (fuword8((uint8_t *)p, &tmp)) - return (EFAULT); - incr = MIN(cnt, PAGESIZE); - p += incr; - cnt -= incr; - } - /* - * touch the last byte in case it straddles a page. - */ - p--; - if (fuword8((uint8_t *)p, &tmp)) - return (EFAULT); +#if defined(HAVE_IOV_ITER_FAULT_IN_READABLE) + if (uio->uio_segflg == UIO_USERSPACE) { + iterp = &iter; + iov_iter_init_compat(iterp, READ, uio->uio_iov, + uio->uio_iovcnt, uio->uio_resid); +#if defined(HAVE_VFS_IOV_ITER) + } else if (uio->uio_segflg == UIO_ITER) { + iterp = uio->uio_iter; +#endif } + if (iterp && iov_iter_fault_in_readable(iterp, n)) + return (EFAULT); +#endif return (0); } EXPORT_SYMBOL(uio_prefaultpages); /* - * same as uiomove() but doesn't modify uio structure. + * The same as uiomove() but doesn't modify uio structure. * return in cbytes how many bytes were copied. 
*/ int @@ -249,39 +243,54 @@ uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes) int ret; bcopy(uio, &uio_copy, sizeof (struct uio)); - ret = uiomove(p, n, rw, &uio_copy); + + if (uio->uio_segflg == UIO_BVEC) + ret = uiomove_bvec(p, n, rw, &uio_copy); +#if defined(HAVE_VFS_IOV_ITER) + else if (uio->uio_segflg == UIO_ITER) + ret = uiomove_iter(p, n, rw, &uio_copy, B_TRUE); +#endif + else + ret = uiomove_iov(p, n, rw, &uio_copy); + *cbytes = uio->uio_resid - uio_copy.uio_resid; + return (ret); } EXPORT_SYMBOL(uiocopy); /* - * Drop the next n chars out of *uiop. + * Drop the next n chars out of *uio. */ void -uioskip(uio_t *uiop, size_t n) +uioskip(uio_t *uio, size_t n) { - if (n > uiop->uio_resid) + if (n > uio->uio_resid) return; - uiop->uio_skip += n; - if (uiop->uio_segflg != UIO_BVEC) { - while (uiop->uio_iovcnt && - uiop->uio_skip >= uiop->uio_iov->iov_len) { - uiop->uio_skip -= uiop->uio_iov->iov_len; - uiop->uio_iov++; - uiop->uio_iovcnt--; + if (uio->uio_segflg == UIO_BVEC) { + uio->uio_skip += n; + while (uio->uio_iovcnt && + uio->uio_skip >= uio->uio_bvec->bv_len) { + uio->uio_skip -= uio->uio_bvec->bv_len; + uio->uio_bvec++; + uio->uio_iovcnt--; } +#if defined(HAVE_VFS_IOV_ITER) + } else if (uio->uio_segflg == UIO_ITER) { + iov_iter_advance(uio->uio_iter, n); +#endif } else { - while (uiop->uio_iovcnt && - uiop->uio_skip >= uiop->uio_bvec->bv_len) { - uiop->uio_skip -= uiop->uio_bvec->bv_len; - uiop->uio_bvec++; - uiop->uio_iovcnt--; + uio->uio_skip += n; + while (uio->uio_iovcnt && + uio->uio_skip >= uio->uio_iov->iov_len) { + uio->uio_skip -= uio->uio_iov->iov_len; + uio->uio_iov++; + uio->uio_iovcnt--; } } - uiop->uio_loffset += n; - uiop->uio_resid -= n; + uio->uio_loffset += n; + uio->uio_resid -= n; } EXPORT_SYMBOL(uioskip); #endif /* _KERNEL */ diff --git a/sys/contrib/openzfs/module/zfs/Makefile.in b/sys/contrib/openzfs/module/zfs/Makefile.in index 259ac4dc926c..653ea0da9bcc 100644 --- a/sys/contrib/openzfs/module/zfs/Makefile.in +++ b/sys/contrib/openzfs/module/zfs/Makefile.in @@ -84,6 +84,8 @@ $(MODULE)-objs += uberblock.o $(MODULE)-objs += unique.o $(MODULE)-objs += vdev.o $(MODULE)-objs += vdev_cache.o +$(MODULE)-objs += vdev_draid.o +$(MODULE)-objs += vdev_draid_rand.o $(MODULE)-objs += vdev_indirect.o $(MODULE)-objs += vdev_indirect_births.o $(MODULE)-objs += vdev_indirect_mapping.o @@ -120,6 +122,7 @@ $(MODULE)-objs += zfs_ratelimit.o $(MODULE)-objs += zfs_replay.o $(MODULE)-objs += zfs_rlock.o $(MODULE)-objs += zfs_sa.o +$(MODULE)-objs += zfs_vnops.o $(MODULE)-objs += zil.o $(MODULE)-objs += zio.o $(MODULE)-objs += zio_checksum.o diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c index 6018a42ca0d8..68d4aa5f5cb4 100644 --- a/sys/contrib/openzfs/module/zfs/abd.c +++ b/sys/contrib/openzfs/module/zfs/abd.c @@ -781,16 +781,17 @@ int abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_iter_func_t *func, void *private) { - int ret = 0; struct abd_iter aiter; - boolean_t abd_multi; - abd_t *c_abd; + int ret = 0; + + if (size == 0) + return (0); abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - abd_multi = abd_is_gang(abd); - c_abd = abd_init_abd_iter(abd, &aiter, off); + boolean_t abd_multi = abd_is_gang(abd); + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { /* If we are at the end of the gang ABD we are done */ @@ -920,6 +921,9 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, boolean_t dabd_is_gang_abd, sabd_is_gang_abd; abd_t *c_dabd, *c_sabd; + 
if (size == 0) + return (0); + abd_verify(dabd); abd_verify(sabd); diff --git a/sys/contrib/openzfs/module/zfs/aggsum.c b/sys/contrib/openzfs/module/zfs/aggsum.c index a2fec27744e1..e46da95f676c 100644 --- a/sys/contrib/openzfs/module/zfs/aggsum.c +++ b/sys/contrib/openzfs/module/zfs/aggsum.c @@ -70,6 +70,11 @@ * zeroing out the borrowed value (forcing that thread to borrow on its next * request, which will also be expensive). This is what makes aggsums well * suited for write-many read-rarely operations. + * + * Note that the aggsums do not expand if more CPUs are hot-added. In that + * case, we will have less fanout than boot_ncpus, but we don't want to always + * reserve the RAM necessary to create the extra slots for additional CPUs up + * front, and dynamically adding them is a complex task. */ /* @@ -167,9 +172,7 @@ aggsum_add(aggsum_t *as, int64_t delta) struct aggsum_bucket *asb; int64_t borrow; - kpreempt_disable(); - asb = &as->as_buckets[CPU_SEQID % as->as_numbuckets]; - kpreempt_enable(); + asb = &as->as_buckets[CPU_SEQID_UNSTABLE % as->as_numbuckets]; /* Try fast path if we already borrowed enough before. */ mutex_enter(&asb->asc_lock); diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 68508cf152a8..c21ae27b9af8 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -492,6 +492,8 @@ arc_stats_t arc_stats = { { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, @@ -533,6 +535,11 @@ arc_stats_t arc_stats = { { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, + { "l2_mru_asize", KSTAT_DATA_UINT64 }, + { "l2_mfu_asize", KSTAT_DATA_UINT64 }, + { "l2_bufc_data_asize", KSTAT_DATA_UINT64 }, + { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, @@ -894,6 +901,17 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_do_free_on_write(void); +static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, + boolean_t state_only); + +#define l2arc_hdr_arcstats_increment(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) +#define l2arc_hdr_arcstats_decrement(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE) +#define l2arc_hdr_arcstats_increment_state(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE) +#define l2arc_hdr_arcstats_decrement_state(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) /* * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU @@ -951,7 +969,7 @@ static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC persistence block restoration routines. 
*/ static void l2arc_log_blk_restore(l2arc_dev_t *dev, - const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr); + const l2arc_log_blk_phys_t *lb, uint64_t lb_asize); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); @@ -1727,7 +1745,7 @@ static arc_buf_hdr_t * arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, enum zio_compress compress, uint8_t complevel, boolean_t protected, - boolean_t prefetch) + boolean_t prefetch, arc_state_type_t arcs_state) { arc_buf_hdr_t *hdr; @@ -1751,6 +1769,7 @@ arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = daddr; + hdr->b_l2hdr.b_arcs_state = arcs_state; return (hdr); } @@ -2312,7 +2331,11 @@ add_reference(arc_buf_hdr_t *hdr, void *tag) arc_evictable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } } @@ -2595,9 +2618,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, } } - if (HDR_HAS_L1HDR(hdr)) + if (HDR_HAS_L1HDR(hdr)) { hdr->b_l1hdr.b_state = new_state; + if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) { + l2arc_hdr_arcstats_decrement_state(hdr); + hdr->b_l2hdr.b_arcs_state = new_state->arcs_state; + l2arc_hdr_arcstats_increment_state(hdr); + } + } + /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. @@ -3685,6 +3715,76 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, } static void +l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, + boolean_t state_only) +{ + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + l2arc_dev_t *dev = l2hdr->b_dev; + uint64_t lsize = HDR_GET_LSIZE(hdr); + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); + arc_buf_contents_t type = hdr->b_type; + int64_t lsize_s; + int64_t psize_s; + int64_t asize_s; + + if (incr) { + lsize_s = lsize; + psize_s = psize; + asize_s = asize; + } else { + lsize_s = -lsize; + psize_s = -psize; + asize_s = -asize; + } + + /* If the buffer is a prefetch, count it as such. */ + if (HDR_PREFETCH(hdr)) { + ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s); + } else { + /* + * We use the value stored in the L2 header upon initial + * caching in L2ARC. This value will be updated in case + * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC + * metadata (log entry) cannot currently be updated. Having + * the ARC state in the L2 header solves the problem of a + * possibly absent L1 header (apparent in buffers restored + * from persistent L2ARC). 
+ */ + switch (hdr->b_l2hdr.b_arcs_state) { + case ARC_STATE_MRU_GHOST: + case ARC_STATE_MRU: + ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s); + break; + case ARC_STATE_MFU_GHOST: + case ARC_STATE_MFU: + ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s); + break; + default: + break; + } + } + + if (state_only) + return; + + ARCSTAT_INCR(arcstat_l2_psize, psize_s); + ARCSTAT_INCR(arcstat_l2_lsize, lsize_s); + + switch (type) { + case ARC_BUFC_DATA: + ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s); + break; + case ARC_BUFC_METADATA: + ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s); + break; + default: + break; + } +} + + +static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; @@ -3697,9 +3797,7 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) list_remove(&dev->l2ad_buflist, hdr); - ARCSTAT_INCR(arcstat_l2_psize, -psize); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); - + l2arc_hdr_arcstats_decrement(hdr); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), @@ -3903,6 +4001,21 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); + + switch (state->arcs_state) { + case ARC_STATE_MRU: + ARCSTAT_INCR( + arcstat_evict_l2_eligible_mru, + HDR_GET_LSIZE(hdr)); + break; + case ARC_STATE_MFU: + ARCSTAT_INCR( + arcstat_evict_l2_eligible_mfu, + HDR_GET_LSIZE(hdr)); + break; + default: + break; + } } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); @@ -4769,14 +4882,7 @@ arc_kmem_reap_soon(void) static boolean_t arc_evict_cb_check(void *arg, zthr_t *zthr) { - /* - * This is necessary so that any changes which may have been made to - * many of the zfs_arc_* module parameters will be propagated to - * their actual internal variable counterparts. Without this, - * changing those module params at runtime would have no effect. - */ - arc_tuning_update(B_FALSE); - +#ifdef ZFS_DEBUG /* * This is necessary in order to keep the kstat information * up to date for tools that display kstat data such as the @@ -4784,12 +4890,11 @@ arc_evict_cb_check(void *arg, zthr_t *zthr) * typically do not call kstat's update function, but simply * dump out stats from the most recent update. Without * this call, these commands may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even - * with this change, the data might be up to 1 second - * out of date(the arc_evict_zthr has a maximum sleep - * time of 1 second); but that should suffice. The - * arc_state_t structures can be queried directly if more - * accurate information is needed. + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this call, the data might be out of date if the + * evict thread hasn't been woken recently; but that should + * suffice. The arc_state_t structures can be queried + * directly if more accurate information is needed. 
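The l2arc_hdr_arcstats_*_state() helpers defined above are used as a bracket around anything that reclassifies a header that also has an L2 copy (an ARC state change or toggling the PREFETCH flag), so the new per-state L2 asize kstats stay balanced. The pattern, as it appears in add_reference(), arc_access(), and arc_read() below, is:

	if (HDR_HAS_L2HDR(hdr))
		l2arc_hdr_arcstats_decrement_state(hdr);	/* retire old class */
	arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);		/* reclassify */
	if (HDR_HAS_L2HDR(hdr))
		l2arc_hdr_arcstats_increment_state(hdr);	/* count new class */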
*/ #ifndef __FreeBSD__ if (arc_ksp != NULL) @@ -5347,11 +5452,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } hdr->b_l1hdr.b_arc_access = now; return; @@ -5382,13 +5491,16 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * was evicted from the cache. Move it to the * MFU state. */ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { new_state = arc_mru; if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { @@ -5641,7 +5753,7 @@ arc_read_done(zio_t *zio) */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { - if (!acb->acb_done) + if (!acb->acb_done || acb->acb_nobuf) continue; callback_cnt++; @@ -5806,6 +5918,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); + boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; int rc = 0; ASSERT(!embedded_bp || @@ -5890,6 +6003,7 @@ top: acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; + acb->acb_nobuf = no_buf; acb->acb_zb = *zb; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, @@ -5899,8 +6013,6 @@ top: acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; - mutex_exit(hash_lock); - goto out; } mutex_exit(hash_lock); goto out; @@ -5909,7 +6021,7 @@ top: ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); - if (done) { + if (done && !no_buf) { if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { /* * This is a demand read which does not have to @@ -5963,8 +6075,12 @@ top: ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); } else if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { + zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); @@ -6108,8 +6224,13 @@ top: } if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) + zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); + } if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) @@ -6178,7 +6299,11 @@ top: metadata, misses); } - if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { + /* Check if the spa even has l2 configured */ + const boolean_t spa_has_l2 = l2arc_ndev != 0 && + spa->spa_l2cache.sav_count > 0; + + if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) { /* * Read from the 
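ARC_FLAG_NO_BUF lets a caller warm the ARC without paying for an arc_buf_t: the done callback still runs once, but may be handed a NULL buffer. A hedged sketch of such a caller; apart from arc_read() and the flag names, everything here is illustrative (the real consumer is the dbuf prefetch path updated later in this patch):

static void
warm_only_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
    arc_buf_t *abuf, void *arg)
{
	/* With ARC_FLAG_NO_BUF a buffer is normally not constructed. */
	if (abuf != NULL)
		arc_buf_destroy(abuf, arg);
}

static void
warm_block(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    const zbookmark_phys_t *zb, void *tag)
{
	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
	    ARC_FLAG_NO_BUF;

	(void) arc_read(pio, spa, bp, warm_only_done, tag,
	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
	    &aflags, zb);
}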
L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. @@ -6186,7 +6311,7 @@ top: * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. - * 5. This isn't prefetch and l2arc_noprefetch is set. + * 5. This isn't prefetch or l2arc_noprefetch is 0. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && @@ -6279,15 +6404,24 @@ top: } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); + /* - * Skip ARC stat bump for block pointers with - * embedded data. The data are read from the blkptr - * itself via decode_embedded_bp_compressed(). + * Only a spa with l2 should contribute to l2 + * miss stats. (Including the case of having a + * faulted cache device - that's also a miss.) */ - if (l2arc_ndev != 0 && !embedded_bp) { - DTRACE_PROBE1(l2arc__miss, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_l2_misses); + if (spa_has_l2) { + /* + * Skip ARC stat bump for block pointers with + * embedded data. The data are read from the + * blkptr itself via + * decode_embedded_bp_compressed(). + */ + if (!embedded_bp) { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + } } } @@ -7072,9 +7206,9 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) */ uint64_t total_dirty = reserve + arc_tempreserve + anon_size; uint64_t spa_dirty_anon = spa_dirty_data(spa); - - if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && - anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && + uint64_t rarc_c = arc_warm ? arc_c : arc_c_max; + if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 && + anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 && spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { #ifdef ZFS_DEBUG uint64_t meta_esize = zfs_refcount_count( @@ -7082,9 +7216,9 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) uint64_t data_esize = zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " - "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", + "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n", arc_tempreserve >> 10, meta_esize >> 10, - data_esize >> 10, reserve >> 10, arc_c >> 10); + data_esize >> 10, reserve >> 10, rarc_c >> 10); #endif DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); @@ -7452,6 +7586,15 @@ arc_target_bytes(void) } void +arc_set_limits(uint64_t allmem) +{ + /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ + arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); + + /* How to set default max varies by platform. */ + arc_c_max = arc_default_max(arc_c_min, allmem); +} +void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); @@ -7466,11 +7609,7 @@ arc_init(void) arc_lowmem_init(); #endif - /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ - arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); - - /* How to set default max varies by platform. 
*/ - arc_c_max = arc_default_max(arc_c_min, allmem); + arc_set_limits(allmem); #ifndef _KERNEL /* @@ -7507,6 +7646,8 @@ arc_init(void) if (arc_c < arc_c_min) arc_c = arc_c_min; + arc_register_hotplug(); + arc_state_init(); buf_init(); @@ -7515,8 +7656,9 @@ arc_init(void) offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - arc_prune_taskq = taskq_create("arc_prune", boot_ncpus, defclsyspri, - boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri, + boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | + TASKQ_THREADS_CPU_PCT); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -7527,8 +7669,8 @@ arc_init(void) kstat_install(arc_ksp); } - arc_evict_zthr = zthr_create_timer("arc_evict", - arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1)); + arc_evict_zthr = zthr_create("arc_evict", + arc_evict_cb_check, arc_evict_cb, NULL); arc_reap_zthr = zthr_create_timer("arc_reap", arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1)); @@ -7613,6 +7755,8 @@ arc_fini(void) buf_fini(); arc_state_fini(); + arc_unregister_hotplug(); + /* * We destroy the zthrs after all the ARC state has been * torn down to avoid the case of them receiving any @@ -8068,9 +8212,6 @@ l2arc_write_done(zio_t *zio) DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); - if (zio->io_error != 0) - ARCSTAT_BUMP(arcstat_l2_writes_error); - /* * All writes completed, or an error was hit. */ @@ -8134,8 +8275,7 @@ top: arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); uint64_t psize = HDR_GET_PSIZE(hdr); - ARCSTAT_INCR(arcstat_l2_psize, -psize); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); + l2arc_hdr_arcstats_decrement(hdr); bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); @@ -8183,6 +8323,8 @@ top: list_destroy(&cb->l2wcb_abd_list); if (zio->io_error != 0) { + ARCSTAT_BUMP(arcstat_l2_writes_error); + /* * Restore the lbps array in the header to its previous state. * If the list of log block pointers is empty, zero out the @@ -8748,9 +8890,16 @@ out: goto top; } - ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); - if (!dev->l2ad_first) - ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); + if (!all) { + /* + * In case of cache device removal (all) the following + * assertions may be violated without functional consequences + * as the device is about to be removed. 
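A worked reading of the new arc_set_limits() helper, with an assumed memory size (SPA_MAXBLOCKSHIFT is 24, so the floor term 2ULL << SPA_MAXBLOCKSHIFT is 32 MiB):

	arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT)
	          = MAX(8 GiB / 32, 32 MiB)		(allmem = 8 GiB assumed)
	          = 256 MiB

arc_c_max is then whatever the platform-specific arc_default_max() returns for that arc_c_min/allmem pair.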
+ */ + ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); + } } /* @@ -9089,6 +9238,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_arcs_state = + hdr->b_l1hdr.b_state->arcs_state; arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); @@ -9111,6 +9262,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; + l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_exit(hash_lock); @@ -9153,8 +9305,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); - ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); - ARCSTAT_INCR(arcstat_l2_psize, write_psize); dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); @@ -9379,8 +9529,6 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) l2arc_dev_hdr_phys_t *l2dhdr; uint64_t l2dhdr_asize; spa_t *spa; - int err; - boolean_t l2dhdr_valid = B_TRUE; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); @@ -9409,10 +9557,7 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) /* * Read the device header, if an error is returned do not rebuild L2ARC. */ - if ((err = l2arc_dev_hdr_read(dev)) != 0) - l2dhdr_valid = B_FALSE; - - if (l2dhdr_valid && dev->l2ad_log_entries > 0) { + if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, @@ -9712,7 +9857,7 @@ l2arc_rebuild(l2arc_dev_t *dev) * L2BLK_GET_PSIZE returns aligned size for log blocks. 
*/ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); - l2arc_log_blk_restore(dev, this_lb, asize, lbps[0].lbp_daddr); + l2arc_log_blk_restore(dev, this_lb, asize); /* * log block restored, include its pointer in the list of @@ -9759,6 +9904,7 @@ l2arc_rebuild(l2arc_dev_t *dev) !dev->l2ad_first) goto out; + cond_resched(); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { @@ -9792,7 +9938,7 @@ l2arc_rebuild(l2arc_dev_t *dev) PTR_SWAP(this_lb, next_lb); this_io = next_io; next_io = NULL; - } + } if (this_io != NULL) l2arc_log_blk_fetch_abort(this_io); @@ -9859,7 +10005,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, - ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); @@ -10030,7 +10176,7 @@ cleanup: */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, - uint64_t lb_asize, uint64_t lb_daddr) + uint64_t lb_asize) { uint64_t size = 0, asize = 0; uint64_t log_entries = dev->l2ad_log_entries; @@ -10104,19 +10250,18 @@ l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel, L2BLK_GET_PROTECTED((le)->le_prop), - L2BLK_GET_PREFETCH((le)->le_prop)); + L2BLK_GET_PREFETCH((le)->le_prop), + L2BLK_GET_STATE((le)->le_prop)); asize = vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((le)->le_prop)); /* * vdev_space_update() has to be called before arc_hdr_destroy() to - * avoid underflow since the latter also calls the former. + * avoid underflow since the latter also calls vdev_space_update(). 
*/ + l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); - ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(hdr)); - mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); @@ -10136,14 +10281,15 @@ l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); exists->b_l2hdr.b_dev = dev; exists->b_l2hdr.b_daddr = le->le_daddr; + exists->b_l2hdr.b_arcs_state = + L2BLK_GET_STATE((le)->le_prop); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, exists); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(exists), exists); mutex_exit(&dev->l2ad_mtx); + l2arc_hdr_arcstats_increment(exists); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); - ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(exists)); - ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(exists)); } ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); } @@ -10439,6 +10585,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); + L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); @@ -10607,5 +10754,8 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW, - "When full, ARC allocation waits for eviction of this % of alloc size"); + "When full, ARC allocation waits for eviction of this % of alloc size"); + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, INT, ZMOD_RW, + "The number of headers to evict per sublist before moving to the next"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index 7d817320aae4..93445a80294b 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2019, Klara Inc. @@ -1974,6 +1974,74 @@ dbuf_redirty(dbuf_dirty_record_t *dr) } dbuf_dirty_record_t * +dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) +{ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid); + dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE); + ASSERT(dn->dn_maxblkid >= blkid); + + dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP); + list_link_init(&dr->dr_dirty_node); + list_link_init(&dr->dr_dbuf_node); + dr->dr_dnode = dn; + dr->dr_txg = tx->tx_txg; + dr->dt.dll.dr_blkid = blkid; + dr->dr_accounted = dn->dn_datablksz; + + /* + * There should not be any dbuf for the block that we're dirtying. + * Otherwise the buffer contents could be inconsistent between the + * dbuf and the lightweight dirty record. 
+ */ + ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)); + + mutex_enter(&dn->dn_mtx); + int txgoff = tx->tx_txg & TXG_MASK; + if (dn->dn_free_ranges[txgoff] != NULL) { + range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1); + } + + if (dn->dn_nlevels == 1) { + ASSERT3U(blkid, <, dn->dn_nblkptr); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + rw_exit(&dn->dn_struct_rwlock); + dnode_setdirty(dn, tx); + } else { + mutex_exit(&dn->dn_mtx); + + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + dmu_buf_impl_t *parent_db = dbuf_hold_level(dn, + 1, blkid >> epbs, FTAG); + rw_exit(&dn->dn_struct_rwlock); + if (parent_db == NULL) { + kmem_free(dr, sizeof (*dr)); + return (NULL); + } + int err = dbuf_read(parent_db, NULL, + (DB_RF_NOPREFETCH | DB_RF_CANFAIL)); + if (err != 0) { + dbuf_rele(parent_db, FTAG); + kmem_free(dr, sizeof (*dr)); + return (NULL); + } + + dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx); + dbuf_rele(parent_db, FTAG); + mutex_enter(&parent_dr->dt.di.dr_mtx); + ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg); + list_insert_tail(&parent_dr->dt.di.dr_children, dr); + mutex_exit(&parent_dr->dt.di.dr_mtx); + dr->dr_parent = parent_dr; + } + + dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx); + + return (dr); +} + +dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; @@ -2090,6 +2158,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); list_link_init(&dr->dr_dirty_node); list_link_init(&dr->dr_dbuf_node); + dr->dr_dnode = dn; if (db->db_level == 0) { void *data_old = db->db_buf; @@ -2255,7 +2324,7 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr) dmu_buf_impl_t *db = dr->dr_dbuf; if (dr->dt.dl.dr_data != db->db.db_data) { - struct dnode *dn = DB_DNODE(db); + struct dnode *dn = dr->dr_dnode; int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); kmem_free(dr->dt.dl.dr_data, max_bonuslen); @@ -2280,9 +2349,7 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr) static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn; uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr; ASSERT(txg != 0); @@ -2302,13 +2369,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is not dirty, we're done. */ - dr = dbuf_find_dirty_eq(db, txg); + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) return (B_FALSE); ASSERT(dr->dr_dbuf == db); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); + dnode_t *dn = dr->dr_dnode; dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -2336,7 +2402,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } - DB_DNODE_EXIT(db); if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); @@ -2627,11 +2692,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); arc_buf_destroy(buf, db); - xuio_stat_wbuf_copied(); return; } - xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); @@ -3003,8 +3066,29 @@ typedef struct dbuf_prefetch_arg { zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. 
*/ + dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */ + void *dpa_arg; /* prefetch completion arg */ } dbuf_prefetch_arg_t; +static void +dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done) +{ + if (dpa->dpa_cb != NULL) + dpa->dpa_cb(dpa->dpa_arg, io_done); + kmem_free(dpa, sizeof (*dpa)); +} + +static void +dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *iobp, arc_buf_t *abuf, void *private) +{ + dbuf_prefetch_arg_t *dpa = private; + + dbuf_prefetch_fini(dpa, B_TRUE); + if (abuf != NULL) + arc_buf_destroy(abuf, private); +} + /* * Actually issue the prefetch read for the block given. */ @@ -3017,11 +3101,12 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) - return; + return (dbuf_prefetch_fini(dpa, B_FALSE)); int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; arc_flags_t aflags = - dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | + ARC_FLAG_NO_BUF; /* dnodes are always read as raw and then converted later */ if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) && @@ -3031,7 +3116,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); ASSERT(dpa->dpa_zio != NULL); - (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, + dbuf_issue_final_prefetch_done, dpa, dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb); } @@ -3051,8 +3137,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, if (abuf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); - kmem_free(dpa, sizeof (*dpa)); - return; + return (dbuf_prefetch_fini(dpa, B_TRUE)); } ASSERT(zio == NULL || zio->io_error == 0); @@ -3084,11 +3169,9 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, dpa->dpa_curlevel, curblkid, FTAG); if (db == NULL) { - kmem_free(dpa, sizeof (*dpa)); arc_buf_destroy(abuf, private); - return; + return (dbuf_prefetch_fini(dpa, B_TRUE)); } - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); dbuf_rele(db, FTAG); @@ -3105,11 +3188,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dpa->dpa_dnode->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { - kmem_free(dpa, sizeof (*dpa)); + dbuf_prefetch_fini(dpa, B_TRUE); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); - kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; @@ -3139,9 +3221,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, * complete. Note that the prefetch might fail if the dataset is encrypted and * the encryption key is unmapped before the IO completes. 
*/ -void -dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, - arc_flags_t aflags) +int +dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, + zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb, + void *arg) { blkptr_t bp; int epbs, nlevels, curlevel; @@ -3151,10 +3234,10 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (blkid > dn->dn_maxblkid) - return; + goto no_issue; if (level == 0 && dnode_block_freed(dn, blkid)) - return; + goto no_issue; /* * This dnode hasn't been written to disk yet, so there's nothing to @@ -3162,11 +3245,11 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, */ nlevels = dn->dn_phys->dn_nlevels; if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) - return; + goto no_issue; epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) - return; + goto no_issue; dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); @@ -3176,7 +3259,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, * This dbuf already exists. It is either CACHED, or * (we assume) about to be read or filled. */ - return; + goto no_issue; } /* @@ -3212,7 +3295,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp)) - return; + goto no_issue; ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); @@ -3230,6 +3313,8 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; + dpa->dpa_cb = cb; + dpa->dpa_arg = arg; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) @@ -3245,7 +3330,6 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, if (curlevel == level) { ASSERT3U(curblkid, ==, blkid); dbuf_issue_final_prefetch(dpa, &bp); - kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; @@ -3266,6 +3350,19 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, * dpa may have already been freed. 
*/ zio_nowait(pio); + return (1); +no_issue: + if (cb != NULL) + cb(arg, B_FALSE); + return (0); +} + +int +dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, + arc_flags_t aflags) +{ + + return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL)); } /* @@ -3803,15 +3900,13 @@ dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(DB_DNODE_HELD(db)); ASSERT(db->db_blkid == DMU_BONUS_BLKID); ASSERT(data != NULL); - dnode_t *dn = DB_DNODE(db); + dnode_t *dn = dr->dr_dnode; ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys)); - DB_DNODE_EXIT(db); dbuf_sync_leaf_verify_bonus_dnode(dr); @@ -3870,8 +3965,7 @@ noinline static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; - zio_t *zio; + dnode_t *dn = dr->dr_dnode; ASSERT(dmu_tx_is_syncing(tx)); @@ -3891,12 +3985,9 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); /* Indirect block size must match what the dnode thinks it is. */ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); - DB_DNODE_EXIT(db); /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; @@ -3905,7 +3996,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dbuf_write(dr, db->db_buf, tx); - zio = dr->dr_zio; + zio_t *zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); @@ -3930,7 +4021,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr) { #ifdef ZFS_DEBUG - dnode_t *dn = DB_DNODE(dr->dr_dbuf); + dnode_t *dn = dr->dr_dnode; /* * Encrypted bonus buffers can have data past their bonuslen. @@ -3953,6 +4044,153 @@ dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr) #endif } +static blkptr_t * +dbuf_lightweight_bp(dbuf_dirty_record_t *dr) +{ + /* This must be a lightweight dirty record. */ + ASSERT3P(dr->dr_dbuf, ==, NULL); + dnode_t *dn = dr->dr_dnode; + + if (dn->dn_phys->dn_nlevels == 1) { + VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr); + return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]); + } else { + dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + VERIFY3U(parent_db->db_level, ==, 1); + VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn); + VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid); + blkptr_t *bp = parent_db->db.db_data; + return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]); + } +} + +static void +dbuf_lightweight_ready(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + blkptr_t *bp = zio->io_bp; + + if (zio->io_error != 0) + return; + + dnode_t *dn = dr->dr_dnode; + + blkptr_t *bp_orig = dbuf_lightweight_bp(dr); + spa_t *spa = dmu_objset_spa(dn->dn_objset); + int64_t delta = bp_get_dsize_sync(spa, bp) - + bp_get_dsize_sync(spa, bp_orig); + dnode_diduse_space(dn, delta); + + uint64_t blkid = dr->dt.dll.dr_blkid; + mutex_enter(&dn->dn_mtx); + if (blkid > dn->dn_phys->dn_maxblkid) { + ASSERT0(dn->dn_objset->os_raw_receive); + dn->dn_phys->dn_maxblkid = blkid; + } + mutex_exit(&dn->dn_mtx); + + if (!BP_IS_EMBEDDED(bp)) { + uint64_t fill = BP_IS_HOLE(bp) ? 
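dbuf_prefetch_impl() invokes the completion callback exactly once per call: synchronously with B_FALSE on the no_issue path above, or later from dbuf_prefetch_fini() once the prefetch I/O path finishes. A minimal sketch of a caller tracking outstanding prefetches that way; the callback and counter are hypothetical, and the dnode's dn_struct_rwlock is assumed to be held, as dbuf_prefetch() already requires:

static void
track_prefetch_done(void *arg, boolean_t io_issued)
{
	(void) io_issued;
	atomic_dec_64((uint64_t *)arg);	/* fires exactly once per request */
}

static void
prefetch_and_count(dnode_t *dn, uint64_t blkid, uint64_t *inflight)
{
	atomic_inc_64(inflight);
	/* Whether or not an I/O is issued, the callback drops the count. */
	(void) dbuf_prefetch_impl(dn, 0, blkid, ZIO_PRIORITY_ASYNC_READ, 0,
	    track_prefetch_done, inflight);
}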
0 : 1; + BP_SET_FILL(bp, fill); + } + + dmu_buf_impl_t *parent_db; + EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1); + if (dr->dr_parent == NULL) { + parent_db = dn->dn_dbuf; + } else { + parent_db = dr->dr_parent->dr_dbuf; + } + rw_enter(&parent_db->db_rwlock, RW_WRITER); + *bp_orig = *bp; + rw_exit(&parent_db->db_rwlock); +} + +static void +dbuf_lightweight_physdone(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dsl_pool_t *dp = spa_get_dsl(zio->io_spa); + ASSERT3U(dr->dr_txg, ==, zio->io_txg); + + /* + * The callback will be called io_phys_children times. Retire one + * portion of our dirty space each time we are called. Any rounding + * error will be cleaned up by dbuf_lightweight_done(). + */ + int delta = dr->dr_accounted / zio->io_phys_children; + dsl_pool_undirty_space(dp, delta, zio->io_txg); +} + +static void +dbuf_lightweight_done(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + + VERIFY0(zio->io_error); + + objset_t *os = dr->dr_dnode->dn_objset; + dmu_tx_t *tx = os->os_synctx; + + if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { + ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); + } else { + dsl_dataset_t *ds = os->os_dsl_dataset; + (void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, zio->io_bp, tx); + } + + /* + * See comment in dbuf_write_done(). + */ + if (zio->io_phys_children == 0) { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted, zio->io_txg); + } else { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted % zio->io_phys_children, zio->io_txg); + } + + abd_free(dr->dt.dll.dr_abd); + kmem_free(dr, sizeof (*dr)); +} + +noinline static void +dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + dnode_t *dn = dr->dr_dnode; + zio_t *pio; + if (dn->dn_phys->dn_nlevels == 1) { + pio = dn->dn_zio; + } else { + pio = dr->dr_parent->dr_zio; + } + + zbookmark_phys_t zb = { + .zb_objset = dmu_objset_id(dn->dn_objset), + .zb_object = dn->dn_object, + .zb_level = 0, + .zb_blkid = dr->dt.dll.dr_blkid, + }; + + /* + * See comment in dbuf_write(). This is so that zio->io_bp_orig + * will have the old BP in dbuf_lightweight_done(). 
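The split between dbuf_lightweight_physdone() and dbuf_lightweight_done() follows the existing dbuf_write accounting: each physical completion retires an equal integer share of dr_accounted, and the final done callback retires the division remainder (or everything, if there were no physical children). With assumed numbers:

	dr_accounted = 131072 (128 KiB), io_phys_children = 3
	each physdone:  131072 / 3 = 43690 bytes undirtied  (3 x 43690 = 131070)
	in done:        131072 % 3 = 2 bytes undirtied, closing the rounding gap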
+ */ + dr->dr_bp_copy = *dbuf_lightweight_bp(dr); + + dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset), + dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd, + dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd), + &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL, + dbuf_lightweight_physdone, dbuf_lightweight_done, dr, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb); + + zio_nowait(dr->dr_zio); +} + /* * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is * critical the we not allow the compiler to inline this function in to @@ -3963,7 +4201,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; + dnode_t *dn = dr->dr_dnode; objset_t *os; uint64_t txg = tx->tx_txg; @@ -3987,9 +4225,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } DBUF_VERIFY(db); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { @@ -4079,16 +4314,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr); - DB_DNODE_EXIT(db); } else { - /* - * Although zio_nowait() does not "wait for an IO", it does - * initiate the IO. If this is an empty write it seems plausible - * that the IO could actually be completed before the nowait - * returns. We need to DB_DNODE_EXIT() first in case - * zio_nowait() invalidates the dbuf. - */ - DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); } } @@ -4111,15 +4337,19 @@ dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) DMU_META_DNODE_OBJECT); break; } - if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - VERIFY3U(dr->dr_dbuf->db_level, ==, level); - } list_remove(list, dr); - if (dr->dr_dbuf->db_level > 0) - dbuf_sync_indirect(dr, tx); - else - dbuf_sync_leaf(dr, tx); + if (dr->dr_dbuf == NULL) { + dbuf_sync_lightweight(dr, tx); + } else { + if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { + VERIFY3U(dr->dr_dbuf->db_level, ==, level); + } + if (dr->dr_dbuf->db_level > 0) + dbuf_sync_indirect(dr, tx); + else + dbuf_sync_leaf(dr, tx); + } } } @@ -4299,7 +4529,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; - dbuf_dirty_record_t *dr; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); @@ -4320,7 +4549,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) DBUF_VERIFY(db); - dr = db->db_data_pending; + dbuf_dirty_record_t *dr = db->db_data_pending; + dnode_t *dn = dr->dr_dnode; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); ASSERT(list_next(&db->db_dirty_records, dr) == NULL); @@ -4328,14 +4558,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); - DB_DNODE_EXIT(db); } #endif @@ -4347,10 +4572,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); 
ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { @@ -4361,7 +4582,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); } - DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } @@ -4554,7 +4774,7 @@ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; + dnode_t *dn = dr->dr_dnode; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; @@ -4565,8 +4785,6 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { @@ -4622,7 +4840,6 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); - DB_DNODE_EXIT(db); /* * We copy the blkptr now (rather than when we instantiate the dirty diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index 2c96645214f8..a02f43df13fd 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -499,7 +499,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; - zio_t *zio; + zio_t *zio = NULL; ASSERT(length <= DMU_MAX_ACCESS); @@ -531,14 +531,17 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + if (read) + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, + ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); - zio_nowait(zio); + if (read) + zio_nowait(zio); return (SET_ERROR(EIO)); } @@ -555,15 +558,15 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, } rw_exit(&dn->dn_struct_rwlock); - /* wait for async i/o */ - err = zio_wait(zio); - if (err) { - dmu_buf_rele_array(dbp, nblks, tag); - return (err); - } - - /* wait for other io to complete */ if (read) { + /* wait for async read i/o */ + err = zio_wait(zio); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + + /* wait for other io to complete */ for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); @@ -1165,165 +1168,12 @@ dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } -/* - * DMU support for xuio - */ -kstat_t *xuio_ksp = NULL; - -typedef struct xuio_stats { - /* loaned yet not returned arc_buf */ - kstat_named_t xuiostat_onloan_rbuf; - kstat_named_t xuiostat_onloan_wbuf; - /* whether a copy is made when loaning out a read buffer */ - kstat_named_t xuiostat_rbuf_copied; - kstat_named_t xuiostat_rbuf_nocopy; - /* whether a copy is made when assigning a write buffer */ - kstat_named_t xuiostat_wbuf_copied; - kstat_named_t xuiostat_wbuf_nocopy; -} xuio_stats_t; - -static xuio_stats_t xuio_stats = { - { "onloan_read_buf", KSTAT_DATA_UINT64 }, - { "onloan_write_buf", KSTAT_DATA_UINT64 }, - { "read_buf_copied", 
KSTAT_DATA_UINT64 }, - { "read_buf_nocopy", KSTAT_DATA_UINT64 }, - { "write_buf_copied", KSTAT_DATA_UINT64 }, - { "write_buf_nocopy", KSTAT_DATA_UINT64 } -}; - -#define XUIOSTAT_INCR(stat, val) \ - atomic_add_64(&xuio_stats.stat.value.ui64, (val)) -#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) - -#ifdef HAVE_UIO_ZEROCOPY -int -dmu_xuio_init(xuio_t *xuio, int nblk) -{ - dmu_xuio_t *priv; - uio_t *uio = &xuio->xu_uio; - - uio->uio_iovcnt = nblk; - uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); - - priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); - priv->cnt = nblk; - priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); - priv->iovp = (iovec_t *)uio->uio_iov; - XUIO_XUZC_PRIV(xuio) = priv; - - if (XUIO_XUZC_RW(xuio) == UIO_READ) - XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); - else - XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); - - return (0); -} - -void -dmu_xuio_fini(xuio_t *xuio) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - int nblk = priv->cnt; - - kmem_free(priv->iovp, nblk * sizeof (iovec_t)); - kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); - kmem_free(priv, sizeof (dmu_xuio_t)); - - if (XUIO_XUZC_RW(xuio) == UIO_READ) - XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); - else - XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); -} - -/* - * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } - * and increase priv->next by 1. - */ -int -dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) -{ - struct iovec *iov; - uio_t *uio = &xuio->xu_uio; - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - int i = priv->next++; - - ASSERT(i < priv->cnt); - ASSERT(off + n <= arc_buf_lsize(abuf)); - iov = (iovec_t *)uio->uio_iov + i; - iov->iov_base = (char *)abuf->b_data + off; - iov->iov_len = n; - priv->bufs[i] = abuf; - return (0); -} - -int -dmu_xuio_cnt(xuio_t *xuio) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - return (priv->cnt); -} - -arc_buf_t * -dmu_xuio_arcbuf(xuio_t *xuio, int i) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - - ASSERT(i < priv->cnt); - return (priv->bufs[i]); -} - -void -dmu_xuio_clear(xuio_t *xuio, int i) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - - ASSERT(i < priv->cnt); - priv->bufs[i] = NULL; -} -#endif /* HAVE_UIO_ZEROCOPY */ - -static void -xuio_stat_init(void) -{ - xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", - KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (xuio_ksp != NULL) { - xuio_ksp->ks_data = &xuio_stats; - kstat_install(xuio_ksp); - } -} - -static void -xuio_stat_fini(void) -{ - if (xuio_ksp != NULL) { - kstat_delete(xuio_ksp); - xuio_ksp = NULL; - } -} - -void -xuio_stat_wbuf_copied(void) -{ - XUIOSTAT_BUMP(xuiostat_wbuf_copied); -} - -void -xuio_stat_wbuf_nocopy(void) -{ - XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); -} - #ifdef _KERNEL int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; -#ifdef HAVE_UIO_ZEROCOPY - xuio_t *xuio = NULL; -#endif /* * NB: we could do this block-at-a-time, but it's nice @@ -1344,21 +1194,6 @@ dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) bufoff = uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); -#ifdef HAVE_UIO_ZEROCOPY - if (xuio) { - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - arc_buf_t *dbuf_abuf = dbi->db_buf; - arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); - err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); - if (!err) - uio_advance(uio, tocpy); - - if (abuf == dbuf_abuf) - XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); - else - 
XUIOSTAT_BUMP(xuiostat_rbuf_copied); - } else -#endif #ifdef __FreeBSD__ err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, uio); @@ -1561,6 +1396,32 @@ dmu_return_arcbuf(arc_buf_t *buf) } /* + * A "lightweight" write is faster than a regular write (e.g. + * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the + * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t. However, the + * data can not be read or overwritten until the transaction's txg has been + * synced. This makes it appropriate for workloads that are known to be + * (temporarily) write-only, like "zfs receive". + * + * A single block is written, starting at the specified offset in bytes. If + * the call is successful, it returns 0 and the provided abd has been + * consumed (the caller should not free it). + */ +int +dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, + const zio_prop_t *zp, enum zio_flag flags, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr = + dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx); + if (dr == NULL) + return (SET_ERROR(EIO)); + dr->dt.dll.dr_abd = abd; + dr->dt.dll.dr_props = *zp; + dr->dt.dll.dr_flags = flags; + return (0); +} + +/* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). @@ -1583,8 +1444,8 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, rw_exit(&dn->dn_struct_rwlock); /* - * We can only assign if the offset is aligned, the arc buf is the - * same size as the dbuf, and the dbuf is not metadata. + * We can only assign if the offset is aligned and the arc buf is the + * same size as the dbuf. */ if (offset == db->db.db_offset && blksz == db->db.db_size) { dbuf_assign_arcbuf(db, buf, tx); @@ -1597,7 +1458,6 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); - XUIOSTAT_BUMP(xuiostat_wbuf_copied); } return (0); @@ -2409,7 +2269,6 @@ dmu_init(void) abd_init(); zfs_dbgmsg_init(); sa_cache_init(); - xuio_stat_init(); dmu_objset_init(); dnode_init(); zfetch_init(); @@ -2429,7 +2288,6 @@ dmu_fini(void) dbuf_fini(); dnode_fini(); dmu_objset_fini(); - xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); abd_fini(); diff --git a/sys/contrib/openzfs/module/zfs/dmu_object.c b/sys/contrib/openzfs/module/zfs/dmu_object.c index 453a2842ce6e..12cdbd68b104 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_object.c +++ b/sys/contrib/openzfs/module/zfs/dmu_object.c @@ -58,10 +58,8 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; int error; - kpreempt_disable(); - cpuobj = &os->os_obj_next_percpu[CPU_SEQID % + cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE % os->os_obj_next_percpu_len]; - kpreempt_enable(); if (dn_slots == 0) { dn_slots = DNODE_MIN_SLOTS; diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index af5935e2374d..66a8f20092e0 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
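A hedged sketch of the call pattern described in the dmu_lightweight_write_by_dnode() comment above; the intended consumer is the receive path, and everything here beyond the functions named in this patch is assumed (a held dnode, an open transaction, a block-aligned offset, data already wrapped in an abd):

static int
lightweight_write_block(dnode_t *dn, uint64_t offset, abd_t *abd, dmu_tx_t *tx)
{
	zio_prop_t zp;

	/* Use the object's normal write policy for this block's zio props. */
	dmu_write_policy(dn->dn_objset, dn, 0, 0, &zp);

	/*
	 * On success (0) the abd is consumed and the data only becomes
	 * readable after this txg syncs; on failure the caller still owns
	 * the abd and can fall back to a regular dmu_write().
	 */
	return (dmu_lightweight_write_by_dnode(dn, offset, abd, &zp, 0, tx));
}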
* Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -682,8 +682,9 @@ dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, dsl_pool_t *dp; dsl_dataset_t *ds; int err; - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_pool_hold(name, tag, &dp); if (err != 0) return (err); @@ -755,8 +756,9 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, dsl_pool_t *dp; dsl_dataset_t *ds; int err; - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); @@ -778,11 +780,15 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, * speed up pool import times and to keep this txg reserved * completely for recovery work. */ - if ((dmu_objset_userobjspace_upgradable(*osp) || - dmu_objset_projectquota_upgradable(*osp)) && - !readonly && !dp->dp_spa->spa_claiming && - (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) - dmu_objset_id_quota_upgrade(*osp); + if (!readonly && !dp->dp_spa->spa_claiming && + (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) { + if (dmu_objset_userobjspace_upgradable(*osp) || + dmu_objset_projectquota_upgradable(*osp)) { + dmu_objset_id_quota_upgrade(*osp); + } else if (dmu_objset_userused_enabled(*osp)) { + dmu_objset_userspace_upgrade(*osp); + } + } dsl_pool_rele(dp, FTAG); return (0); @@ -794,8 +800,9 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, { dsl_dataset_t *ds; int err; - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds); if (err != 0) return (err); @@ -812,9 +819,10 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, void dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag) { - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; - + ds_hold_flags_t flags; dsl_pool_t *dp = dmu_objset_pool(os); + + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag); dsl_pool_rele(dp, tag); } @@ -842,7 +850,9 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, { dsl_pool_t *dp; char name[ZFS_MAX_DATASET_NAME_LEN]; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; VERIFY3P(ds, !=, NULL); VERIFY3P(ds->ds_owner, ==, tag); VERIFY(dsl_dataset_long_held(ds)); @@ -850,21 +860,22 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, dsl_dataset_name(ds, name); dp = ds->ds_dir->dd_pool; dsl_pool_config_enter(dp, FTAG); - dsl_dataset_disown(ds, decrypt, tag); - VERIFY0(dsl_dataset_own(dp, name, - (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag, newds)); + dsl_dataset_disown(ds, flags, tag); + VERIFY0(dsl_dataset_own(dp, name, flags, tag, newds)); dsl_pool_config_exit(dp, FTAG); } void dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag) { + ds_hold_flags_t flags; + + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; /* * Stop upgrading thread */ dmu_objset_upgrade_stop(os); - dsl_dataset_disown(os->os_dsl_dataset, - (decrypt) ? 
DS_HOLD_FLAG_DECRYPT : 0, tag); + dsl_dataset_disown(os->os_dsl_dataset, flags, tag); } void @@ -1231,7 +1242,7 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx) } VERIFY0(zio_wait(rzio)); - dmu_objset_do_userquota_updates(os, tx); + dmu_objset_sync_done(os, tx); taskq_wait(dp->dp_sync_taskq); if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { ASSERT3P(ds->ds_key_mapping, !=, NULL); @@ -1424,10 +1435,15 @@ dmu_objset_upgrade_task_cb(void *data) mutex_enter(&os->os_upgrade_lock); os->os_upgrade_status = EINTR; if (!os->os_upgrade_exit) { + int status; + mutex_exit(&os->os_upgrade_lock); - os->os_upgrade_status = os->os_upgrade_cb(os); + status = os->os_upgrade_cb(os); + mutex_enter(&os->os_upgrade_lock); + + os->os_upgrade_status = status; } os->os_upgrade_exit = B_TRUE; os->os_upgrade_id = 0; @@ -1455,6 +1471,8 @@ dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb) dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); os->os_upgrade_status = ENOMEM; } + } else { + dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); } mutex_exit(&os->os_upgrade_lock); } @@ -1498,23 +1516,13 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) multilist_sublist_remove(list, dn); /* - * If we are not doing useraccounting (os_synced_dnodes == NULL) - * we are done with this dnode for this txg. Unset dn_dirty_txg - * if later txgs aren't dirtying it so that future holders do - * not get a stale value. Otherwise, we will do this in - * userquota_updates_task() when processing has completely - * finished for this txg. + * See the comment above dnode_rele_task() for an explanation + * of why this dnode hold is always needed (even when not + * doing user accounting). */ multilist_t *newlist = dn->dn_objset->os_synced_dnodes; - if (newlist != NULL) { - (void) dnode_add_ref(dn, newlist); - multilist_insert(newlist, dn); - } else { - mutex_enter(&dn->dn_mtx); - if (dn->dn_dirty_txg == tx->tx_txg) - dn->dn_dirty_txg = 0; - mutex_exit(&dn->dn_mtx); - } + (void) dnode_add_ref(dn, newlist); + multilist_insert(newlist, dn); dnode_sync(dn, tx); } @@ -1676,22 +1684,19 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) txgoff = tx->tx_txg & TXG_MASK; - if (dmu_objset_userused_enabled(os) && - (!os->os_encrypted || !dmu_objset_is_receiving(os))) { - /* - * We must create the list here because it uses the - * dn_dirty_link[] of this txg. But it may already - * exist because we call dsl_dataset_sync() twice per txg. - */ - if (os->os_synced_dnodes == NULL) { - os->os_synced_dnodes = - multilist_create(sizeof (dnode_t), - offsetof(dnode_t, dn_dirty_link[txgoff]), - dnode_multilist_index_func); - } else { - ASSERT3U(os->os_synced_dnodes->ml_offset, ==, - offsetof(dnode_t, dn_dirty_link[txgoff])); - } + /* + * We must create the list here because it uses the + * dn_dirty_link[] of this txg. But it may already + * exist because we call dsl_dataset_sync() twice per txg. 
+ */ + if (os->os_synced_dnodes == NULL) { + os->os_synced_dnodes = + multilist_create(sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[txgoff]), + dnode_multilist_index_func); + } else { + ASSERT3U(os->os_synced_dnodes->ml_offset, ==, + offsetof(dnode_t, dn_dirty_link[txgoff])); } ml = os->os_dirty_dnodes[txgoff]; @@ -1998,8 +2003,6 @@ userquota_updates_task(void *arg) dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); - if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa)) - dn->dn_dirty_txg = 0; mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); @@ -2010,13 +2013,44 @@ userquota_updates_task(void *arg) kmem_free(uua, sizeof (*uua)); } -void -dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) +/* + * Release dnode holds from dmu_objset_sync_dnodes(). When the dnode is being + * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be + * evicted because the block containing the dnode can't be evicted until it is + * written out. However, this hold is necessary to prevent the dnode_t from + * being moved (via dnode_move()) while it's still referenced by + * dbuf_dirty_record_t:dr_dnode. And dr_dnode is needed for + * dirty_lightweight_leaf-type dirty records. + * + * If we are doing user-object accounting, the dnode_rele() happens from + * userquota_updates_task() instead. + */ +static void +dnode_rele_task(void *arg) { - int num_sublists; + userquota_updates_arg_t *uua = arg; + objset_t *os = uua->uua_os; + multilist_sublist_t *list = + multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx); + + dnode_t *dn; + while ((dn = multilist_sublist_head(list)) != NULL) { + multilist_sublist_remove(list, dn); + dnode_rele(dn, os->os_synced_dnodes); + } + multilist_sublist_unlock(list); + kmem_free(uua, sizeof (*uua)); +} + +/* + * Return TRUE if userquota updates are needed. + */ +static boolean_t +dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx) +{ if (!dmu_objset_userused_enabled(os)) - return; + return (B_FALSE); /* * If this is a raw receive just return and handle accounting @@ -2026,10 +2060,10 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) * used for recovery. */ if (os->os_encrypted && dmu_objset_is_receiving(os)) - return; + return (B_FALSE); if (tx->tx_txg <= os->os_spa->spa_claim_max_txg) - return; + return (B_FALSE); /* Allocate the user/group/project used objects if necessary. */ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { @@ -2046,23 +2080,39 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } + return (B_TRUE); +} - num_sublists = multilist_get_num_sublists(os->os_synced_dnodes); +/* + * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and + * also release the holds on the dnodes from dmu_objset_sync_dnodes(). + * The caller must taskq_wait(dp_sync_taskq). 
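 * For example, a sync-context caller follows roughly this sequence (a
 * minimal sketch modelled on the dsl_pool_sync_mos() change later in
 * this diff):
 *
 *	dmu_objset_sync_done(os, tx);
 *	taskq_wait(dp->dp_sync_taskq);
 *	multilist_destroy(os->os_synced_dnodes);
 *	os->os_synced_dnodes = NULL;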
+ */ +void +dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx) +{ + boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx); + + int num_sublists = multilist_get_num_sublists(os->os_synced_dnodes); for (int i = 0; i < num_sublists; i++) { - if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i)) - continue; userquota_updates_arg_t *uua = kmem_alloc(sizeof (*uua), KM_SLEEP); uua->uua_os = os; uua->uua_sublist_idx = i; uua->uua_tx = tx; - /* note: caller does taskq_wait() */ + + /* + * If we don't need to update userquotas, use + * dnode_rele_task() to call dnode_rele() + */ (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - userquota_updates_task, uua, 0); + need_userquota ? userquota_updates_task : dnode_rele_task, + uua, 0); /* callback frees uua */ } } + /* * Returns a pointer to data to find uid/gid from * @@ -2084,18 +2134,11 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dr == NULL) { data = NULL; } else { - dnode_t *dn; - - DB_DNODE_ENTER(dr->dr_dbuf); - dn = DB_DNODE(dr->dr_dbuf); - - if (dn->dn_bonuslen == 0 && + if (dr->dr_dnode->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) data = dr->dt.dl.dr_data->b_data; else data = dr->dt.dl.dr_data; - - DB_DNODE_EXIT(dr->dr_dbuf); } return (data); @@ -2285,8 +2328,8 @@ dmu_objset_space_upgrade(objset_t *os) return (0); } -int -dmu_objset_userspace_upgrade(objset_t *os) +static int +dmu_objset_userspace_upgrade_cb(objset_t *os) { int err = 0; @@ -2306,6 +2349,12 @@ dmu_objset_userspace_upgrade(objset_t *os) return (0); } +void +dmu_objset_userspace_upgrade(objset_t *os) +{ + dmu_objset_upgrade(os, dmu_objset_userspace_upgrade_cb); +} + static int dmu_objset_id_quota_upgrade_cb(objset_t *os) { @@ -2316,14 +2365,15 @@ dmu_objset_id_quota_upgrade_cb(objset_t *os) return (0); if (dmu_objset_is_snapshot(os)) return (SET_ERROR(EINVAL)); - if (!dmu_objset_userobjused_enabled(os)) + if (!dmu_objset_userused_enabled(os)) return (SET_ERROR(ENOTSUP)); if (!dmu_objset_projectquota_enabled(os) && dmu_objset_userobjspace_present(os)) return (SET_ERROR(ENOTSUP)); - dmu_objset_ds(os)->ds_feature_activation[ - SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; + if (dmu_objset_userobjused_enabled(os)) + dmu_objset_ds(os)->ds_feature_activation[ + SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; if (dmu_objset_projectquota_enabled(os)) dmu_objset_ds(os)->ds_feature_activation[ SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE; @@ -2332,7 +2382,9 @@ dmu_objset_id_quota_upgrade_cb(objset_t *os) if (err) return (err); - os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; + os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + if (dmu_objset_userobjused_enabled(os)) + os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; if (dmu_objset_projectquota_enabled(os)) os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE; @@ -2977,7 +3029,7 @@ EXPORT_SYMBOL(dmu_objset_create_impl); EXPORT_SYMBOL(dmu_objset_open_impl); EXPORT_SYMBOL(dmu_objset_evict); EXPORT_SYMBOL(dmu_objset_register_type); -EXPORT_SYMBOL(dmu_objset_do_userquota_updates); +EXPORT_SYMBOL(dmu_objset_sync_done); EXPORT_SYMBOL(dmu_objset_userquota_get_ids); EXPORT_SYMBOL(dmu_objset_userused_enabled); EXPORT_SYMBOL(dmu_objset_userspace_upgrade); diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c index 2eee19a28e34..a0fd157ebc5f 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_recv.c +++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c @@ -79,10 +79,10 @@ struct receive_record_arg { dmu_replay_record_t 
header; void *payload; /* Pointer to a buffer containing the payload */ /* - * If the record is a write, pointer to the arc_buf_t containing the + * If the record is a WRITE or SPILL, pointer to the abd containing the * payload. */ - arc_buf_t *arc_buf; + abd_t *abd; int payload_size; uint64_t bytes_read; /* bytes read from stream when record created */ boolean_t eos_marker; /* Marks the end of the stream */ @@ -95,8 +95,8 @@ struct receive_writer_arg { bqueue_t q; /* - * These three args are used to signal to the main thread that we're - * done. + * These three members are used to signal to the main thread when + * we're done. */ kmutex_t mutex; kcondvar_t cv; @@ -175,18 +175,6 @@ byteswap_record(dmu_replay_record_t *drr) DO64(drr_write.drr_key.ddk_prop); DO64(drr_write.drr_compressed_size); break; - case DRR_WRITE_BYREF: - DO64(drr_write_byref.drr_object); - DO64(drr_write_byref.drr_offset); - DO64(drr_write_byref.drr_length); - DO64(drr_write_byref.drr_toguid); - DO64(drr_write_byref.drr_refguid); - DO64(drr_write_byref.drr_refobject); - DO64(drr_write_byref.drr_refoffset); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. - drr_key.ddk_cksum); - DO64(drr_write_byref.drr_key.ddk_prop); - break; case DRR_WRITE_EMBEDDED: DO64(drr_write_embedded.drr_object); DO64(drr_write_embedded.drr_offset); @@ -572,7 +560,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) struct drr_begin *drrb = drba->drba_cookie->drc_drrb; uint64_t fromguid = drrb->drr_fromguid; int flags = drrb->drr_flags; - ds_hold_flags_t dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; int error; uint64_t featureflags = drba->drba_cookie->drc_featureflags; dsl_dataset_t *ds; @@ -784,7 +772,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) dsl_dataset_t *ds, *newds; objset_t *os; uint64_t dsobj; - ds_hold_flags_t dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; int error; uint64_t crflags = 0; dsl_crypto_params_t dummy_dcp = { 0 }; @@ -958,7 +946,7 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drc->drc_drrb; int error; - ds_hold_flags_t dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; dsl_dataset_t *ds; const char *tofs = drc->drc_tofs; @@ -1106,7 +1094,7 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) const char *tofs = drba->drba_cookie->drc_tofs; uint64_t featureflags = drba->drba_cookie->drc_featureflags; dsl_dataset_t *ds; - ds_hold_flags_t dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; @@ -1903,58 +1891,106 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) struct receive_record_arg *rrd; while ((rrd = list_head(&rwa->write_batch)) != NULL) { struct drr_write *drrw = &rrd->header.drr_u.drr_write; - arc_buf_t *abuf = rrd->arc_buf; + abd_t *abd = rrd->abd; ASSERT3U(drrw->drr_object, ==, rwa->last_object); - if (rwa->byteswap && !arc_is_encrypted(abuf) && - arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrw->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - DRR_WRITE_PAYLOAD_SIZE(drrw)); - } - - /* - * If we are receiving an incremental large-block stream into - * a dataset that previously did a non-large-block receive, - * the WRITE record may be larger than the object's block - * size. dmu_assign_arcbuf_by_dnode() handles this as long - * as the arcbuf is not compressed, so decompress it here if - * necessary. 
- */ - if (drrw->drr_logical_size != dn->dn_datablksz && - arc_get_compression(abuf) != ZIO_COMPRESS_OFF) { + if (drrw->drr_logical_size != dn->dn_datablksz) { + /* + * The WRITE record is larger than the object's block + * size. We must be receiving an incremental + * large-block stream into a dataset that previously did + * a non-large-block receive. Lightweight writes must + * be exactly one block, so we need to decompress the + * data (if compressed) and do a normal dmu_write(). + */ ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz); - zbookmark_phys_t zb = { - .zb_objset = dmu_objset_id(rwa->os), - .zb_object = rwa->last_object, - .zb_level = 0, - .zb_blkid = - drrw->drr_offset >> dn->dn_datablkshift, - }; + if (DRR_WRITE_COMPRESSED(drrw)) { + abd_t *decomp_abd = + abd_alloc_linear(drrw->drr_logical_size, + B_FALSE); + + err = zio_decompress_data( + drrw->drr_compressiontype, + abd, abd_to_buf(decomp_abd), + abd_get_size(abd), + abd_get_size(decomp_abd), NULL); + + if (err == 0) { + dmu_write_by_dnode(dn, + drrw->drr_offset, + drrw->drr_logical_size, + abd_to_buf(decomp_abd), tx); + } + abd_free(decomp_abd); + } else { + dmu_write_by_dnode(dn, + drrw->drr_offset, + drrw->drr_logical_size, + abd_to_buf(abd), tx); + } + if (err == 0) + abd_free(abd); + } else { + zio_prop_t zp; + dmu_write_policy(rwa->os, dn, 0, 0, &zp); + + enum zio_flag zio_flags = 0; + + if (rwa->raw) { + zp.zp_encrypt = B_TRUE; + zp.zp_compress = drrw->drr_compressiontype; + zp.zp_byteorder = ZFS_HOST_BYTEORDER ^ + !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ + rwa->byteswap; + bcopy(drrw->drr_salt, zp.zp_salt, + ZIO_DATA_SALT_LEN); + bcopy(drrw->drr_iv, zp.zp_iv, + ZIO_DATA_IV_LEN); + bcopy(drrw->drr_mac, zp.zp_mac, + ZIO_DATA_MAC_LEN); + if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) { + zp.zp_nopwrite = B_FALSE; + zp.zp_copies = MIN(zp.zp_copies, + SPA_DVAS_PER_BP - 1); + } + zio_flags |= ZIO_FLAG_RAW; + } else if (DRR_WRITE_COMPRESSED(drrw)) { + ASSERT3U(drrw->drr_compressed_size, >, 0); + ASSERT3U(drrw->drr_logical_size, >=, + drrw->drr_compressed_size); + zp.zp_compress = drrw->drr_compressiontype; + zio_flags |= ZIO_FLAG_RAW_COMPRESS; + } else if (rwa->byteswap) { + /* + * Note: compressed blocks never need to be + * byteswapped, because WRITE records for + * metadata blocks are never compressed. The + * exception is raw streams, which are written + * in the original byteorder, and the byteorder + * bit is preserved in the BP by setting + * zp_byteorder above. + */ + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func( + abd_to_buf(abd), + DRR_WRITE_PAYLOAD_SIZE(drrw)); + } /* - * The size of loaned arc bufs is counted in - * arc_loaned_bytes. When we untransform - * (decompress) the buf, its size increases. To - * ensure that arc_loaned_bytes remains accurate, we - * need to return (un-loan) the buf (with its - * compressed size) and then re-loan it (with its - * new, uncompressed size). + * Since this data can't be read until the receive + * completes, we can do a "lightweight" write for + * improved performance. */ - arc_return_buf(abuf, FTAG); - VERIFY0(arc_untransform(abuf, dmu_objset_spa(rwa->os), - &zb, B_FALSE)); - arc_loan_inuse_buf(abuf, FTAG); + err = dmu_lightweight_write_by_dnode(dn, + drrw->drr_offset, abd, &zp, zio_flags, tx); } - err = dmu_assign_arcbuf_by_dnode(dn, - drrw->drr_offset, abuf, tx); if (err != 0) { /* * This rrd is left on the list, so the caller will - * free it (and the arc_buf). + * free it (and the abd). 
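 * (Concretely, that freeing happens in flush_write_batch(): on error it
 * walks the remaining write_batch entries and calls abd_free() on each
 * rrd->abd before freeing the rrd itself.)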
*/ break; } @@ -1987,7 +2023,7 @@ flush_write_batch(struct receive_writer_arg *rwa) if (err != 0) { struct receive_record_arg *rrd; while ((rrd = list_remove_head(&rwa->write_batch)) != NULL) { - dmu_return_arcbuf(rrd->arc_buf); + abd_free(rrd->abd); kmem_free(rrd, sizeof (*rrd)); } } @@ -2090,9 +2126,8 @@ receive_write_embedded(struct receive_writer_arg *rwa, static int receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, - arc_buf_t *abuf) + abd_t *abd) { - dmu_tx_t *tx; dmu_buf_t *db, *db_spill; int err; @@ -2107,7 +2142,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, * the DRR_FLAG_SPILL_BLOCK flag. */ if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) { - dmu_return_arcbuf(abuf); + abd_free(abd); return (0); } @@ -2131,7 +2166,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, return (err); } - tx = dmu_tx_create(rwa->os); + dmu_tx_t *tx = dmu_tx_create(rwa->os); dmu_tx_hold_spill(tx, db->db_object); @@ -2150,18 +2185,35 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, */ if (db_spill->db_size != drrs->drr_length) { dmu_buf_will_fill(db_spill, tx); - VERIFY(0 == dbuf_spill_set_blksz(db_spill, + VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); } - if (rwa->byteswap && !arc_is_encrypted(abuf) && - arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrs->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - DRR_SPILL_PAYLOAD_SIZE(drrs)); + arc_buf_t *abuf; + if (rwa->raw) { + boolean_t byteorder = ZFS_HOST_BYTEORDER ^ + !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^ + rwa->byteswap; + + abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os), + drrs->drr_object, byteorder, drrs->drr_salt, + drrs->drr_iv, drrs->drr_mac, drrs->drr_type, + drrs->drr_compressed_size, drrs->drr_length, + drrs->drr_compressiontype, 0); + } else { + abuf = arc_loan_buf(dmu_objset_spa(rwa->os), + DMU_OT_IS_METADATA(drrs->drr_type), + drrs->drr_length); + if (rwa->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrs->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd), + DRR_SPILL_PAYLOAD_SIZE(drrs)); + } } + bcopy(abd_to_buf(abd), abuf->b_data, DRR_SPILL_PAYLOAD_SIZE(drrs)); + abd_free(abd); dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); dmu_buf_rele(db, FTAG); @@ -2263,8 +2315,9 @@ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { dsl_dataset_t *ds = drc->drc_ds; - ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT; + ds_hold_flags_t dsflags; + dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; /* * Wait for the txg sync before cleaning up the receive. 
For * resumable receives, this ensures that our resume state has @@ -2451,53 +2504,19 @@ receive_read_record(dmu_recv_cookie_t *drc) case DRR_WRITE: { struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write; - arc_buf_t *abuf; - boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); - - if (drc->drc_raw) { - boolean_t byteorder = ZFS_HOST_BYTEORDER ^ - !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ - drc->drc_byteswap; - - abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os), - drrw->drr_object, byteorder, drrw->drr_salt, - drrw->drr_iv, drrw->drr_mac, drrw->drr_type, - drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype, 0); - } else if (DRR_WRITE_COMPRESSED(drrw)) { - ASSERT3U(drrw->drr_compressed_size, >, 0); - ASSERT3U(drrw->drr_logical_size, >=, - drrw->drr_compressed_size); - ASSERT(!is_meta); - abuf = arc_loan_compressed_buf( - dmu_objset_spa(drc->drc_os), - drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype, 0); - } else { - abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os), - is_meta, drrw->drr_logical_size); - } - - err = receive_read_payload_and_next_header(drc, - DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); + int size = DRR_WRITE_PAYLOAD_SIZE(drrw); + abd_t *abd = abd_alloc_linear(size, B_FALSE); + err = receive_read_payload_and_next_header(drc, size, + abd_to_buf(abd)); if (err != 0) { - dmu_return_arcbuf(abuf); + abd_free(abd); return (err); } - drc->drc_rrd->arc_buf = abuf; + drc->drc_rrd->abd = abd; receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); return (err); } - case DRR_WRITE_BYREF: - { - struct drr_write_byref *drrwb = - &drc->drc_rrd->header.drr_u.drr_write_byref; - err = receive_read_payload_and_next_header(drc, 0, NULL); - receive_read_prefetch(drc, drrwb->drr_object, drrwb->drr_offset, - drrwb->drr_length); - return (err); - } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = @@ -2536,29 +2555,14 @@ receive_read_record(dmu_recv_cookie_t *drc) case DRR_SPILL: { struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill; - arc_buf_t *abuf; - /* DRR_SPILL records are either raw or uncompressed */ - if (drc->drc_raw) { - boolean_t byteorder = ZFS_HOST_BYTEORDER ^ - !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^ - drc->drc_byteswap; - - abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os), - drrs->drr_object, byteorder, drrs->drr_salt, - drrs->drr_iv, drrs->drr_mac, drrs->drr_type, - drrs->drr_compressed_size, drrs->drr_length, - drrs->drr_compressiontype, 0); - } else { - abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os), - DMU_OT_IS_METADATA(drrs->drr_type), - drrs->drr_length); - } - err = receive_read_payload_and_next_header(drc, - DRR_SPILL_PAYLOAD_SIZE(drrs), abuf->b_data); + int size = DRR_SPILL_PAYLOAD_SIZE(drrs); + abd_t *abd = abd_alloc_linear(size, B_FALSE); + err = receive_read_payload_and_next_header(drc, size, + abd_to_buf(abd)); if (err != 0) - dmu_return_arcbuf(abuf); + abd_free(abd); else - drc->drc_rrd->arc_buf = abuf; + drc->drc_rrd->abd = abd; return (err); } case DRR_OBJECT_RANGE: @@ -2687,9 +2691,9 @@ receive_process_record(struct receive_writer_arg *rwa, if (rrd->header.drr_type != DRR_WRITE) { err = flush_write_batch(rwa); if (err != 0) { - if (rrd->arc_buf != NULL) { - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; + if (rrd->abd != NULL) { + abd_free(rrd->abd); + rrd->abd = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); @@ -2726,8 +2730,8 @@ receive_process_record(struct 
receive_writer_arg *rwa, * the rrd or arc_buf. */ ASSERT(err != 0); - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; + abd_free(rrd->abd); + rrd->abd = NULL; } break; } @@ -2749,10 +2753,10 @@ receive_process_record(struct receive_writer_arg *rwa, case DRR_SPILL: { struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; - err = receive_spill(rwa, drrs, rrd->arc_buf); + err = receive_spill(rwa, drrs, rrd->abd); if (err != 0) - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; + abd_free(rrd->abd); + rrd->abd = NULL; rrd->payload = NULL; break; } @@ -2800,9 +2804,9 @@ receive_writer_thread(void *arg) int err = 0; if (rwa->err == 0) { err = receive_process_record(rwa, rrd); - } else if (rrd->arc_buf != NULL) { - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; + } else if (rrd->abd != NULL) { + abd_free(rrd->abd); + rrd->abd = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); diff --git a/sys/contrib/openzfs/module/zfs/dmu_redact.c b/sys/contrib/openzfs/module/zfs/dmu_redact.c index 225ec40537ec..62c7d01d4bd2 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_redact.c +++ b/sys/contrib/openzfs/module/zfs/dmu_redact.c @@ -858,7 +858,7 @@ hold_next_object(objset_t *os, struct redact_record *rec, void *tag, { int err = 0; if (*dn != NULL) - dnode_rele(*dn, FTAG); + dnode_rele(*dn, tag); *dn = NULL; if (*object < rec->start_object) { *object = rec->start_object - 1; diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c index 9480c8b75497..d654382237c0 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_send.c +++ b/sys/contrib/openzfs/module/zfs/dmu_send.c @@ -2626,7 +2626,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, { int err; dsl_dataset_t *fromds; - ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; + ds_hold_flags_t dsflags; struct dmu_send_params dspp = {0}; dspp.embedok = embedok; dspp.large_block_ok = large_block_ok; @@ -2638,6 +2638,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, dspp.rawok = rawok; dspp.savedok = savedok; + dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; err = dsl_pool_hold(pool, FTAG, &dspp.dp); if (err != 0) return (err); @@ -2711,12 +2712,13 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, dmu_send_outparams_t *dsop) { int err = 0; - ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; + ds_hold_flags_t dsflags; boolean_t owned = B_FALSE; dsl_dataset_t *fromds = NULL; zfs_bookmark_phys_t book = {0}; struct dmu_send_params dspp = {0}; + dsflags = (rawok) ? 
DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; dspp.tosnap = tosnap; dspp.embedok = embedok; dspp.large_block_ok = large_block_ok; diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c index 09ef2be94944..0ebed4e6fbdf 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_tx.c +++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c @@ -230,9 +230,6 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); - if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) - err = SET_ERROR(EFBIG); - if (dn == NULL) return; diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index 4d86863f30ea..5d061fe3813e 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -59,16 +59,29 @@ typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; + kstat_named_t zfetchstat_max_completion_us; + kstat_named_t zfetchstat_last_completion_us; + kstat_named_t zfetchstat_io_issued; } zfetch_stats_t; static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, + { "max_completion_us", KSTAT_DATA_UINT64 }, + { "last_completion_us", KSTAT_DATA_UINT64 }, + { "io_issued", KSTAT_DATA_UINT64 }, }; #define ZFETCHSTAT_BUMP(stat) \ - atomic_inc_64(&zfetch_stats.stat.value.ui64); + atomic_inc_64(&zfetch_stats.stat.value.ui64) +#define ZFETCHSTAT_ADD(stat, val) \ + atomic_add_64(&zfetch_stats.stat.value.ui64, val) +#define ZFETCHSTAT_SET(stat, val) \ + zfetch_stats.stat.value.ui64 = val +#define ZFETCHSTAT_GET(stat) \ + zfetch_stats.stat.value.ui64 + kstat_t *zfetch_ksp; @@ -104,8 +117,8 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) { if (zf == NULL) return; - zf->zf_dnode = dno; + zf->zf_numstreams = 0; list_create(&zf->zf_stream, sizeof (zstream_t), offsetof(zstream_t, zs_node)); @@ -114,12 +127,28 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) } static void +dmu_zfetch_stream_fini(zstream_t *zs) +{ + mutex_destroy(&zs->zs_lock); + kmem_free(zs, sizeof (*zs)); +} + +static void dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) { ASSERT(MUTEX_HELD(&zf->zf_lock)); list_remove(&zf->zf_stream, zs); - mutex_destroy(&zs->zs_lock); - kmem_free(zs, sizeof (*zs)); + dmu_zfetch_stream_fini(zs); + zf->zf_numstreams--; +} + +static void +dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs) +{ + ASSERT(MUTEX_HELD(&zf->zf_lock)); + list_remove(&zf->zf_stream, zs); + zs->zs_fetch = NULL; + zf->zf_numstreams--; } /* @@ -132,8 +161,12 @@ dmu_zfetch_fini(zfetch_t *zf) zstream_t *zs; mutex_enter(&zf->zf_lock); - while ((zs = list_head(&zf->zf_stream)) != NULL) - dmu_zfetch_stream_remove(zf, zs); + while ((zs = list_head(&zf->zf_stream)) != NULL) { + if (zfs_refcount_count(&zs->zs_blocks) != 0) + dmu_zfetch_stream_orphan(zf, zs); + else + dmu_zfetch_stream_remove(zf, zs); + } mutex_exit(&zf->zf_lock); list_destroy(&zf->zf_stream); mutex_destroy(&zf->zf_lock); @@ -151,7 +184,7 @@ static void dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) { zstream_t *zs_next; - int numstreams = 0; + hrtime_t now = gethrtime(); ASSERT(MUTEX_HELD(&zf->zf_lock)); @@ -161,11 +194,14 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) for (zstream_t *zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { zs_next = list_next(&zf->zf_stream, zs); - if (((gethrtime() - zs->zs_atime) / 
NANOSEC) > + /* + * Skip gethrtime() call if there are still references + */ + if (zfs_refcount_count(&zs->zs_blocks) != 0) + continue; + if (((now - zs->zs_atime) / NANOSEC) > zfetch_min_sec_reap) dmu_zfetch_stream_remove(zf, zs); - else - numstreams++; } /* @@ -179,7 +215,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / zfetch_max_distance)); - if (numstreams >= max_streams) { + if (zf->zf_numstreams >= max_streams) { ZFETCHSTAT_BUMP(zfetchstat_max_streams); return; } @@ -188,12 +224,39 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) zs->zs_blkid = blkid; zs->zs_pf_blkid = blkid; zs->zs_ipf_blkid = blkid; - zs->zs_atime = gethrtime(); + zs->zs_atime = now; + zs->zs_fetch = zf; + zfs_refcount_create(&zs->zs_blocks); mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL); - + zf->zf_numstreams++; list_insert_head(&zf->zf_stream, zs); } +static void +dmu_zfetch_stream_done(void *arg, boolean_t io_issued) +{ + zstream_t *zs = arg; + + if (zs->zs_start_time && io_issued) { + hrtime_t now = gethrtime(); + hrtime_t delta = NSEC2USEC(now - zs->zs_start_time); + + zs->zs_start_time = 0; + ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta); + if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us)) + ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta); + } + + if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0) + return; + + /* + * The parent fetch structure has gone away + */ + if (zs->zs_fetch == NULL) + dmu_zfetch_stream_fini(zs); +} + /* * This is the predictive prefetch entry point. It associates dnode access * specified with blkid and nblks arguments with prefetch stream, predicts @@ -209,7 +272,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, zstream_t *zs; int64_t pf_start, ipf_start, ipf_istart, ipf_iend; int64_t pf_ahead_blks, max_blks; - int epbs, max_dist_blks, pf_nblks, ipf_nblks; + int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued; uint64_t end_of_access_blkid; end_of_access_blkid = blkid + nblks; spa_t *spa = zf->zf_dnode->dn_objset->os_spa; @@ -230,11 +293,21 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, * As a fast path for small (single-block) files, ignore access * to the first block. */ - if (blkid == 0) + if (!have_lock && blkid == 0) return; if (!have_lock) rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); + + /* + * A fast path for small files for which no prefetch will + * happen. 
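 * dn_maxblkid is the index of the last data block, so a value below 2
 * means the dnode has at most two data blocks; with so little data
 * ahead of the access, prefetching would not help.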
+ */ + if (zf->zf_dnode->dn_maxblkid < 2) { + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + return; + } mutex_enter(&zf->zf_lock); /* @@ -343,9 +416,15 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs; zs->zs_atime = gethrtime(); + /* no prior reads in progress */ + if (zfs_refcount_count(&zs->zs_blocks) == 0) + zs->zs_start_time = zs->zs_atime; zs->zs_blkid = end_of_access_blkid; + zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart, + NULL); mutex_exit(&zs->zs_lock); mutex_exit(&zf->zf_lock); + issued = 0; /* * dbuf_prefetch() is asynchronous (even when it needs to read @@ -354,16 +433,21 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, */ for (int i = 0; i < pf_nblks; i++) { - dbuf_prefetch(zf->zf_dnode, 0, pf_start + i, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); + issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i, + ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, + dmu_zfetch_stream_done, zs); } for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) { - dbuf_prefetch(zf->zf_dnode, 1, iblk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); + issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, + ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, + dmu_zfetch_stream_done, zs); } if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); ZFETCHSTAT_BUMP(zfetchstat_hits); + + if (issued) + ZFETCHSTAT_ADD(zfetchstat_io_issued, issued); } /* BEGIN CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index 23364dbae897..eaba9c0c0e7f 100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ @@ -609,7 +609,6 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_assigned_txg); - ASSERT0(dn->dn_dirty_txg); ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1); ASSERT(avl_is_empty(&dn->dn_dbufs)); @@ -649,6 +648,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dn->dn_free_txg = 0; dn->dn_dirtyctx_firstset = NULL; + dn->dn_dirty_txg = 0; dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; @@ -1812,6 +1812,7 @@ dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + ASSERT3U(new_nlevels, >, dn->dn_nlevels); dn->dn_nlevels = new_nlevels; ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); @@ -1829,10 +1830,12 @@ dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) list = &dn->dn_dirty_records[txgoff]; for (dr = list_head(list); dr; dr = dr_next) { dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); - if (dr->dr_dbuf->db_level != new_nlevels-1 && + + IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1); + if (dr->dr_dbuf == NULL || + (dr->dr_dbuf->db_level == old_nlevels - 1 && dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) { list_remove(&dn->dn_dirty_records[txgoff], dr); list_insert_tail(&new->dt.di.dr_children, dr); dr->dr_parent = new; diff --git a/sys/contrib/openzfs/module/zfs/dnode_sync.c b/sys/contrib/openzfs/module/zfs/dnode_sync.c index ae44cb69765c..66e48a1e17d4 100644 --- a/sys/contrib/openzfs/module/zfs/dnode_sync.c +++ b/sys/contrib/openzfs/module/zfs/dnode_sync.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2020 Oxide Computer Company */ @@ -851,6 +851,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) /* * Although we have dropped our reference to the dnode, it * can't be evicted until its written, and we haven't yet - * initiated the IO for the dnode's dbuf. + * initiated the IO for the dnode's dbuf. Additionally, the caller + * has already added a reference to the dnode because it's on the + * os_synced_dnodes list. */ } diff --git a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c index 16bf2c4414a8..2faf1af52991 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c +++ b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c @@ -1561,33 +1561,6 @@ dsl_bookmark_latest_txg(dsl_dataset_t *ds) return (dbn->dbn_phys.zbm_creation_txg); } -static inline unsigned int -redact_block_buf_num_entries(unsigned int size) -{ - return (size / sizeof (redact_block_phys_t)); -} - -/* - * This function calculates the offset of the last entry in the array of - * redact_block_phys_t. If we're reading the redaction list into buffers of - * size bufsize, then for all but the last buffer, the last valid entry in the - * array will be the last entry in the array. However, for the last buffer, any - * amount of it may be filled. Thus, we check to see if we're looking at the - * last buffer in the redaction list, and if so, we return the total number of - * entries modulo the number of entries per buffer. 
Otherwise, we return the - * number of entries per buffer minus one. - */ -static inline unsigned int -last_entry(redaction_list_t *rl, unsigned int bufsize, uint64_t bufid) -{ - if (bufid == (rl->rl_phys->rlp_num_entries - 1) / - redact_block_buf_num_entries(bufsize)) { - return ((rl->rl_phys->rlp_num_entries - 1) % - redact_block_buf_num_entries(bufsize)); - } - return (redact_block_buf_num_entries(bufsize) - 1); -} - /* * Compare the redact_block_phys_t to the bookmark. If the last block in the * redact_block_phys_t is before the bookmark, return -1. If the first block in @@ -1633,8 +1606,6 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume, rl_traverse_callback_t cb, void *arg) { objset_t *mos = rl->rl_mos; - redact_block_phys_t *buf; - unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE; int err = 0; if (rl->rl_phys->rlp_last_object != UINT64_MAX || @@ -1651,42 +1622,48 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume, } /* - * Binary search for the point to resume from. The goal is to minimize - * the number of disk reads we have to perform. + * This allows us to skip the binary search and resume checking logic + * below, if we're not resuming a redacted send. + */ + if (ZB_IS_ZERO(resume)) + resume = NULL; + + /* + * Binary search for the point to resume from. */ - buf = zio_data_buf_alloc(bufsize); - uint64_t maxbufid = (rl->rl_phys->rlp_num_entries - 1) / - redact_block_buf_num_entries(bufsize); - uint64_t minbufid = 0; - while (resume != NULL && maxbufid - minbufid >= 1) { - ASSERT3U(maxbufid, >, minbufid); - uint64_t midbufid = minbufid + ((maxbufid - minbufid) / 2); - err = dmu_read(mos, rl->rl_object, midbufid * bufsize, bufsize, - buf, DMU_READ_NO_PREFETCH); + uint64_t maxidx = rl->rl_phys->rlp_num_entries - 1; + uint64_t minidx = 0; + while (resume != NULL && maxidx > minidx) { + redact_block_phys_t rbp = { 0 }; + ASSERT3U(maxidx, >, minidx); + uint64_t mididx = minidx + ((maxidx - minidx) / 2); + err = dmu_read(mos, rl->rl_object, mididx * sizeof (rbp), + sizeof (rbp), &rbp, DMU_READ_NO_PREFETCH); if (err != 0) break; - int cmp0 = redact_block_zb_compare(&buf[0], resume); - int cmpn = redact_block_zb_compare( - &buf[last_entry(rl, bufsize, maxbufid)], resume); + int cmp = redact_block_zb_compare(&rbp, resume); - /* - * If the first block is before or equal to the resume point, - * and the last one is equal or after, then the resume point is - * in this buf, and we should start here. - */ - if (cmp0 <= 0 && cmpn >= 0) + if (cmp == 0) { + minidx = mididx; break; - - if (cmp0 > 0) - maxbufid = midbufid - 1; - else if (cmpn < 0) - minbufid = midbufid + 1; - else - panic("No progress in binary search for resume point"); + } else if (cmp > 0) { + maxidx = + (mididx == minidx ? minidx : mididx - 1); + } else { + minidx = mididx + 1; + } } - for (uint64_t curidx = minbufid * redact_block_buf_num_entries(bufsize); + unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE; + redact_block_phys_t *buf = zio_data_buf_alloc(bufsize); + + unsigned int entries_per_buf = bufsize / sizeof (redact_block_phys_t); + uint64_t start_block = minidx / entries_per_buf; + err = dmu_read(mos, rl->rl_object, start_block * bufsize, bufsize, buf, + DMU_READ_PREFETCH); + + for (uint64_t curidx = minidx; err == 0 && curidx < rl->rl_phys->rlp_num_entries; curidx++) { /* @@ -1696,22 +1673,35 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume, * prefetching, and this code shouldn't be the bottleneck, so we * don't need to do manual prefetching. 
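 * (The indexing works in units of whole entries: entries_per_buf =
 * bufsize / sizeof (redact_block_phys_t), a fresh dmu_read() of
 * bufsize bytes is issued whenever curidx crosses a buffer boundary,
 * and curidx % entries_per_buf selects the entry within the current
 * buffer.)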
*/ - if (curidx % redact_block_buf_num_entries(bufsize) == 0) { + if (curidx % entries_per_buf == 0) { err = dmu_read(mos, rl->rl_object, curidx * sizeof (*buf), bufsize, buf, DMU_READ_PREFETCH); if (err != 0) break; } - redact_block_phys_t *rb = &buf[curidx % - redact_block_buf_num_entries(bufsize)]; + redact_block_phys_t *rb = &buf[curidx % entries_per_buf]; /* * If resume is non-null, we should either not send the data, or * null out resume so we don't have to keep doing these * comparisons. */ if (resume != NULL) { + /* + * It is possible that after the binary search we got + * a record before the resume point. There's two cases + * where this can occur. If the record is the last + * redaction record, and the resume point is after the + * end of the redacted data, curidx will be the last + * redaction record. In that case, the loop will end + * after this iteration. The second case is if the + * resume point is between two redaction records, the + * binary search can return either the record before + * or after the resume point. In that case, the next + * iteration will be greater than the resume point. + */ if (redact_block_zb_compare(rb, resume) < 0) { + ASSERT3U(curidx, ==, minidx); continue; } else { /* @@ -1733,8 +1723,10 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume, } } - if (cb(rb, arg) != 0) + if (cb(rb, arg) != 0) { + err = EINTR; break; + } } zio_data_buf_free(buf, bufsize); diff --git a/sys/contrib/openzfs/module/zfs/dsl_crypt.c b/sys/contrib/openzfs/module/zfs/dsl_crypt.c index 26d4c2fe7e33..e38ec0cae827 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_crypt.c +++ b/sys/contrib/openzfs/module/zfs/dsl_crypt.c @@ -2007,14 +2007,6 @@ dsl_crypto_recv_raw_objset_check(dsl_dataset_t *ds, dsl_dataset_t *fromds, if (ret != 0) return (ret); - /* - * Useraccounting is not portable and must be done with the keys loaded. - * Therefore, whenever we do any kind of receive the useraccounting - * must not be present. 
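 * (In this change the same guarantee is enforced further down instead
 * of being asserted here: dsl_crypto_recv_raw_objset_sync() now clears
 * OBJSET_FLAG_USERACCOUNTING_COMPLETE and
 * OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE on the received objset.)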
- */ - ASSERT0(os->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE); - ASSERT0(os->os_flags & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE); - mdn = DMU_META_DNODE(os); /* @@ -2105,6 +2097,9 @@ dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype, */ arc_release(os->os_phys_buf, &os->os_phys_buf); bcopy(portable_mac, os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN); + os->os_phys->os_flags &= ~OBJSET_FLAG_USERACCOUNTING_COMPLETE; + os->os_phys->os_flags &= ~OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; + os->os_flags = os->os_phys->os_flags; bzero(os->os_phys->os_local_mac, ZIO_OBJSET_MAC_LEN); os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index 1fcd83db7988..de60c33589e3 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -2267,10 +2267,8 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_bookmark_sync_done(ds, tx); - if (os->os_synced_dnodes != NULL) { - multilist_destroy(os->os_synced_dnodes); - os->os_synced_dnodes = NULL; - } + multilist_destroy(os->os_synced_dnodes); + os->os_synced_dnodes = NULL; if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_FALSE; diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c index 3a2028625e8b..c770eafa75d8 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_pool.c +++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
@@ -220,11 +220,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); - dp->dp_zrele_taskq = taskq_create("z_zrele", boot_ncpus, defclsyspri, - boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri, + boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | + TASKQ_THREADS_CPU_PCT); dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain", - boot_ncpus, defclsyspri, boot_ncpus, INT_MAX, - TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + 100, defclsyspri, boot_ncpus, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); return (dp); } @@ -565,6 +566,11 @@ dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dmu_objset_sync(dp->dp_meta_objset, zio, tx); VERIFY0(zio_wait(zio)); + dmu_objset_sync_done(dp->dp_meta_objset, tx); + taskq_wait(dp->dp_sync_taskq); + multilist_destroy(dp->dp_meta_objset->os_synced_dnodes); + dp->dp_meta_objset->os_synced_dnodes = NULL; + dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); } @@ -676,7 +682,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) */ for (ds = list_head(&synced_datasets); ds != NULL; ds = list_next(&synced_datasets, ds)) { - dmu_objset_do_userquota_updates(ds->ds_objset, tx); + dmu_objset_sync_done(ds->ds_objset, tx); } taskq_wait(dp->dp_sync_taskq); @@ -1264,8 +1270,16 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, * (e.g. it could be destroyed). Therefore you shouldn't do anything to the * dataset except release it. * - * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only - * or modifying operations. + * Operations generally fall somewhere into the following taxonomy: + * + * Read-Only Modifying + * + * Dataset Layer / MOS zfs get zfs destroy + * + * Individual Dataset read() write() + * + * + * Dataset Layer Operations * * Modifying operations should generally use dsl_sync_task(). The synctask * infrastructure enforces proper locking strategy with respect to the @@ -1275,6 +1289,25 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, * information from the dataset, then release the pool and dataset. * dmu_objset_{hold,rele}() are convenience routines that also do the pool * hold/rele. + * + * + * Operations On Individual Datasets + * + * Objects _within_ an objset should only be modified by the current 'owner' + * of the objset to prevent incorrect concurrent modification. Thus, use + * {dmu_objset,dsl_dataset}_own to mark some entity as the current owner, + * and fail with EBUSY if there is already an owner. The owner can then + * implement its own locking strategy, independent of the dataset layer's + * locking infrastructure. + * (E.g., the ZPL has its own set of locks to control concurrency. A regular + * vnop will not reach into the dataset layer). + * + * Ideally, objects would also only be read by the objset’s owner, so that we + * don’t observe state mid-modification. + * (E.g. the ZPL is creating a new object and linking it into a directory; if + * you don’t coordinate with the ZPL to hold ZPL-level locks, you could see an + * intermediate state. The ioctl level violates this but in pretty benign + * ways, e.g. reading the zpl props object.) 
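 *
 * For illustration, the ownership pattern looks roughly like this (a
 * sketch with error handling and most arguments elided; see
 * dmu_objset_own()/dmu_objset_disown() earlier in this diff):
 *
 *	error = dmu_objset_own(name, DMU_OST_ANY, ..., FTAG, &os);
 *	if (error == EBUSY) {
 *		the objset already has an owner; back off
 *	}
 *	... operate on objects in 'os' under the owner's own locks ...
 *	dmu_objset_disown(os, decrypt, FTAG);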
*/ int diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index 4704781bfa45..40adfbcee4e1 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -713,7 +713,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) return (0); } -static void +void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; @@ -3328,19 +3328,12 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, } /* - * Check if the txg falls within the range which must be - * resilvered. DVAs outside this range can always be skipped. - */ - if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) - return (B_FALSE); - - /* * Check if the top-level vdev must resilver this offset. * When the offset does not intersect with a dirty leaf DTL * then it may be possible to skip the resilver IO. The psize * is provided instead of asize to simplify the check for RAIDZ. */ - if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) + if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth)) return (B_FALSE); /* @@ -3987,7 +3980,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, /* * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. + * zpool(8) status can make useful progress reports. */ scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index 133005b227e5..bed6bf64c928 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -32,6 +32,7 @@ #include <sys/space_map.h> #include <sys/metaslab_impl.h> #include <sys/vdev_impl.h> +#include <sys/vdev_draid.h> #include <sys/zio.h> #include <sys/spa_impl.h> #include <sys/zfeature.h> @@ -263,9 +264,7 @@ int zfs_metaslab_switch_threshold = 2; * Internal switch to enable/disable the metaslab allocation tracing * facility. */ -#ifdef _METASLAB_TRACING -boolean_t metaslab_trace_enabled = B_TRUE; -#endif +boolean_t metaslab_trace_enabled = B_FALSE; /* * Maximum entries that the metaslab allocation tracing facility will keep @@ -275,9 +274,7 @@ boolean_t metaslab_trace_enabled = B_TRUE; * to every exceed this value. In debug mode, the system will panic if this * limit is ever reached allowing for further investigation. */ -#ifdef _METASLAB_TRACING uint64_t metaslab_trace_max_entries = 5000; -#endif /* * Maximum number of metaslabs per group that can be disabled @@ -313,6 +310,35 @@ boolean_t zfs_metaslab_force_large_segs = B_FALSE; */ uint32_t metaslab_by_size_min_shift = 14; +/* + * If not set, we will first try normal allocation. If that fails then + * we will do a gang allocation. If that fails then we will do a "try hard" + * gang allocation. If that fails then we will have a multi-layer gang + * block. + * + * If set, we will first try normal allocation. If that fails then + * we will do a "try hard" allocation. If that fails we will do a gang + * allocation. If that fails we will do a "try hard" gang allocation. If + * that fails then we will have a multi-layer gang block. + */ +int zfs_metaslab_try_hard_before_gang = B_FALSE; + +/* + * When not trying hard, we only consider the best zfs_metaslab_find_max_tries + * metaslabs. This improves performance, especially when there are many + * metaslabs per vdev and the allocation can't actually be satisfied (so we + * would otherwise iterate all the metaslabs). 
If there is a metaslab with a + * worse weight but it can actually satisfy the allocation, we won't find it + * until trying hard. This may happen if the worse metaslab is not loaded + * (and the true weight is better than we have calculated), or due to weight + * bucketization. E.g. we are looking for a 60K segment, and the best + * metaslabs all have free segments in the 32-63K bucket, but the best + * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a + * subsequent metaslab has ms_max_size >60KB (but fewer segments in this + * bucket, and therefore a lower weight). + */ +int zfs_metaslab_find_max_tries = 100; + static uint64_t metaslab_weight(metaslab_t *, boolean_t); static void metaslab_set_fragmentation(metaslab_t *, boolean_t); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); @@ -324,19 +350,20 @@ static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); static unsigned int metaslab_idx_func(multilist_t *, void *); static void metaslab_evict(metaslab_t *, uint64_t); static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); -#ifdef _METASLAB_TRACING kmem_cache_t *metaslab_alloc_trace_cache; typedef struct metaslab_stats { kstat_named_t metaslabstat_trace_over_limit; - kstat_named_t metaslabstat_df_find_under_floor; kstat_named_t metaslabstat_reload_tree; + kstat_named_t metaslabstat_too_many_tries; + kstat_named_t metaslabstat_try_hard; } metaslab_stats_t; static metaslab_stats_t metaslab_stats = { { "trace_over_limit", KSTAT_DATA_UINT64 }, - { "df_find_under_floor", KSTAT_DATA_UINT64 }, { "reload_tree", KSTAT_DATA_UINT64 }, + { "too_many_tries", KSTAT_DATA_UINT64 }, + { "try_hard", KSTAT_DATA_UINT64 }, }; #define METASLABSTAT_BUMP(stat) \ @@ -372,18 +399,6 @@ metaslab_stat_fini(void) kmem_cache_destroy(metaslab_alloc_trace_cache); metaslab_alloc_trace_cache = NULL; } -#else - -void -metaslab_stat_init(void) -{ -} - -void -metaslab_stat_fini(void) -{ -} -#endif /* * ========================================================================== @@ -395,20 +410,19 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) { metaslab_class_t *mc; - mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); + mc = kmem_zalloc(offsetof(metaslab_class_t, + mc_allocator[spa->spa_alloc_count]), KM_SLEEP); mc->mc_spa = spa; - mc->mc_rotor = NULL; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t), offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); - mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * - sizeof (zfs_refcount_t), KM_SLEEP); - mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * - sizeof (uint64_t), KM_SLEEP); - for (int i = 0; i < spa->spa_alloc_count; i++) - zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]); + for (int i = 0; i < spa->spa_alloc_count; i++) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; + mca->mca_rotor = NULL; + zfs_refcount_create_tracked(&mca->mca_alloc_slots); + } return (mc); } @@ -416,21 +430,22 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) void metaslab_class_destroy(metaslab_class_t *mc) { - ASSERT(mc->mc_rotor == NULL); + spa_t *spa = mc->mc_spa; + ASSERT(mc->mc_alloc == 0); ASSERT(mc->mc_deferred == 0); ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); - for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) - zfs_refcount_destroy(&mc->mc_alloc_slots[i]); - kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * - sizeof (zfs_refcount_t)); - 
kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * - sizeof (uint64_t)); + for (int i = 0; i < spa->spa_alloc_count; i++) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; + ASSERT(mca->mca_rotor == NULL); + zfs_refcount_destroy(&mca->mca_alloc_slots); + } mutex_destroy(&mc->mc_lock); multilist_destroy(mc->mc_metaslab_txg_list); - kmem_free(mc, sizeof (metaslab_class_t)); + kmem_free(mc, offsetof(metaslab_class_t, + mc_allocator[spa->spa_alloc_count])); } int @@ -445,7 +460,7 @@ metaslab_class_validate(metaslab_class_t *mc) ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); - if ((mg = mc->mc_rotor) == NULL) + if ((mg = mc->mc_allocator[0].mca_rotor) == NULL) return (0); do { @@ -454,7 +469,7 @@ metaslab_class_validate(metaslab_class_t *mc) ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(mg->mg_class, ==, mc); ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); - } while ((mg = mg->mg_next) != mc->mc_rotor); + } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor); return (0); } @@ -811,7 +826,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { metaslab_group_t *mg; - mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); + mg = kmem_zalloc(offsetof(metaslab_group_t, + mg_allocator[allocators]), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); @@ -824,8 +840,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) mg->mg_no_free_space = B_TRUE; mg->mg_allocators = allocators; - mg->mg_allocator = kmem_zalloc(allocators * - sizeof (metaslab_group_allocator_t), KM_SLEEP); for (int i = 0; i < allocators; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); @@ -859,21 +873,19 @@ metaslab_group_destroy(metaslab_group_t *mg) metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; zfs_refcount_destroy(&mga->mga_alloc_queue_depth); } - kmem_free(mg->mg_allocator, mg->mg_allocators * - sizeof (metaslab_group_allocator_t)); - - kmem_free(mg, sizeof (metaslab_group_t)); + kmem_free(mg, offsetof(metaslab_group_t, + mg_allocator[mg->mg_allocators])); } void metaslab_group_activate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; + spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; - ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); + ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0); - ASSERT(mc->mc_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count <= 0); @@ -884,7 +896,7 @@ metaslab_group_activate(metaslab_group_t *mg) mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); metaslab_group_alloc_update(mg); - if ((mgprev = mc->mc_rotor) == NULL) { + if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { mg->mg_prev = mg; mg->mg_next = mg; } else { @@ -894,7 +906,10 @@ metaslab_group_activate(metaslab_group_t *mg) mgprev->mg_next = mg; mgnext->mg_prev = mg; } - mc->mc_rotor = mg; + for (int i = 0; i < spa->spa_alloc_count; i++) { + mc->mc_allocator[i].mca_rotor = mg; + mg = mg->mg_next; + } } /* @@ -915,7 +930,8 @@ metaslab_group_passivate(metaslab_group_t *mg) (SCL_ALLOC | SCL_ZIO)); if (--mg->mg_activation_count != 0) { - ASSERT(mc->mc_rotor != mg); + for (int i = 0; i < spa->spa_alloc_count; i++) + ASSERT(mc->mc_allocator[i].mca_rotor != mg); ASSERT(mg->mg_prev == 
NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count < 0); @@ -962,12 +978,15 @@ metaslab_group_passivate(metaslab_group_t *mg) mgnext = mg->mg_next; if (mg == mgnext) { - mc->mc_rotor = NULL; + mgnext = NULL; } else { - mc->mc_rotor = mgnext; mgprev->mg_next = mgnext; mgnext->mg_prev = mgprev; } + for (int i = 0; i < spa->spa_alloc_count; i++) { + if (mc->mc_allocator[i].mca_rotor == mg) + mc->mc_allocator[i].mca_rotor = mgnext; + } mg->mg_prev = NULL; mg->mg_next = NULL; @@ -1201,7 +1220,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * in metaslab_group_alloc_update() for more information) and * the allocation throttle is disabled then allow allocations to this * device. However, if the allocation throttle is enabled then - * check if we have reached our allocation limit (mg_alloc_queue_depth) + * check if we have reached our allocation limit (mga_alloc_queue_depth) * to determine if we should allow allocations to this metaslab group. * If all metaslab groups are no longer considered allocatable * (mc_alloc_groups == 0) or we're trying to allocate the smallest @@ -1350,9 +1369,7 @@ static void metaslab_size_tree_full_load(range_tree_t *rt) { metaslab_rt_arg_t *mrap = rt->rt_arg; -#ifdef _METASLAB_TRACING METASLABSTAT_BUMP(metaslabstat_reload_tree); -#endif ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); mrap->mra_floor_shift = 0; struct mssa_arg arg = {0}; @@ -1563,6 +1580,7 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, #if defined(WITH_DF_BLOCK_ALLOCATOR) || \ defined(WITH_CF_BLOCK_ALLOCATOR) + /* * This is a helper function that can be used by the allocator to find a * suitable block to allocate. This will search the specified B-tree looking @@ -1654,19 +1672,13 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) range_seg_t *rs; if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); + if (metaslab_df_use_largest_segment) { /* use largest free segment */ rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); } else { zfs_btree_index_t where; /* use segment of this size, or next largest */ -#ifdef _METASLAB_TRACING - metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg; - if (size < (1 << mrap->mra_floor_shift)) { - METASLABSTAT_BUMP( - metaslabstat_df_find_under_floor); - } -#endif rs = metaslab_block_find(&msp->ms_allocatable_by_size, rt, msp->ms_start, size, &where); } @@ -2616,6 +2628,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, ms->ms_allocator = -1; ms->ms_new = B_TRUE; + vdev_ops_t *ops = vd->vdev_ops; + if (ops->vdev_op_metaslab_init != NULL) + ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); + /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. @@ -4393,7 +4409,6 @@ metaslab_is_unique(metaslab_t *msp, dva_t *dva) * Metaslab allocation tracing facility * ========================================================================== */ -#ifdef _METASLAB_TRACING /* * Add an allocation trace element to the allocation tracing list. 
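The per-allocator rework in this file replaces separately kmem_zalloc()ed arrays with flexible array members sized at allocation time. Below is a minimal self-contained sketch of that idiom; example_t and its functions are hypothetical and only illustrate the offsetof()-based sizing used by metaslab_class_create() and metaslab_group_create() above.

typedef struct example {
	int		ex_count;
	struct example_allocator {
		uint64_t	exa_value;
	}		ex_allocator[];
} example_t;

static example_t *
example_create(int count)
{
	/* One allocation covers the header and all per-allocator slots. */
	example_t *ex = kmem_zalloc(offsetof(example_t,
	    ex_allocator[count]), KM_SLEEP);
	ex->ex_count = count;
	return (ex);
}

static void
example_destroy(example_t *ex)
{
	/* Free with the same size expression used to allocate. */
	kmem_free(ex, offsetof(example_t, ex_allocator[ex->ex_count]));
}

The same size expression covers both the alloc and the free, which is why metaslab_class_destroy() above recomputes offsetof(metaslab_class_t, mc_allocator[spa->spa_alloc_count]) when releasing the structure.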
@@ -4468,21 +4483,6 @@ metaslab_trace_fini(zio_alloc_list_t *zal) list_destroy(&zal->zal_list); zal->zal_size = 0; } -#else - -#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc) - -void -metaslab_trace_init(zio_alloc_list_t *zal) -{ -} - -void -metaslab_trace_fini(zio_alloc_list_t *zal) -{ -} - -#endif /* _METASLAB_TRACING */ /* * ========================================================================== @@ -4510,13 +4510,14 @@ static void metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) { metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + metaslab_class_allocator_t *mca = + &mg->mg_class->mc_allocator[allocator]; uint64_t max = mg->mg_max_alloc_queue_depth; uint64_t cur = mga->mga_cur_max_alloc_queue_depth; while (cur < max) { if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth, cur, cur + 1) == cur) { - atomic_inc_64( - &mg->mg_class->mc_alloc_max_slots[allocator]); + atomic_inc_64(&mca->mca_alloc_max_slots); return; } cur = mga->mga_cur_max_alloc_queue_depth; @@ -4622,8 +4623,16 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); + int tries = 0; for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; + + if (!try_hard && tries > zfs_metaslab_find_max_tries) { + METASLABSTAT_BUMP(metaslabstat_too_many_tries); + return (NULL); + } + tries++; + if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); @@ -5052,6 +5061,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, zio_alloc_list_t *zal, int allocator) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; metaslab_group_t *mg, *fast_mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; @@ -5073,7 +5083,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, /* * Start at the rotor and loop through all mgs until we find something. - * Note that there's no locking on mc_rotor or mc_aliquot because + * Note that there's no locking on mca_rotor or mca_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * @@ -5109,23 +5119,23 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, mg->mg_next != NULL) mg = mg->mg_next; } else { - mg = mc->mc_rotor; + mg = mca->mca_rotor; } } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; } else if (flags & METASLAB_FASTWRITE) { - mg = fast_mg = mc->mc_rotor; + mg = fast_mg = mca->mca_rotor; do { if (fast_mg->mg_vd->vdev_pending_fastwrite < mg->mg_vd->vdev_pending_fastwrite) mg = fast_mg; - } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); + } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor); } else { - ASSERT(mc->mc_rotor != NULL); - mg = mc->mc_rotor; + ASSERT(mca->mca_rotor != NULL); + mg = mca->mca_rotor; } /* @@ -5133,7 +5143,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * metaslab group that has been passivated, just follow the rotor. */ if (mg->mg_class != mc || mg->mg_activation_count <= 0) - mg = mc->mc_rotor; + mg = mca->mca_rotor; rotor = mg; top: @@ -5211,7 +5221,7 @@ top: * Bias is also used to compensate for unequally * sized vdevs so that space is allocated fairly. 
*/ - if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { + if (mca->mca_aliquot == 0 && metaslab_bias_enabled) { vdev_stat_t *vs = &vd->vdev_stat; int64_t vs_free = vs->vs_space - vs->vs_alloc; int64_t mc_free = mc->mc_space - mc->mc_alloc; @@ -5249,10 +5259,10 @@ top: } if ((flags & METASLAB_FASTWRITE) || - atomic_add_64_nv(&mc->mc_aliquot, asize) >= + atomic_add_64_nv(&mca->mca_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { - mc->mc_rotor = mg->mg_next; - mc->mc_aliquot = 0; + mca->mca_rotor = mg->mg_next; + mca->mca_aliquot = 0; } DVA_SET_VDEV(&dva[d], vd->vdev_id); @@ -5269,14 +5279,17 @@ top: return (0); } next: - mc->mc_rotor = mg->mg_next; - mc->mc_aliquot = 0; + mca->mca_rotor = mg->mg_next; + mca->mca_aliquot = 0; } while ((mg = mg->mg_next) != rotor); /* - * If we haven't tried hard, do so now. + * If we haven't tried hard, perhaps do so now. */ - if (!try_hard) { + if (!try_hard && (zfs_metaslab_try_hard_before_gang || + GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 || + psize <= 1 << spa->spa_min_ashift)) { + METASLABSTAT_BUMP(metaslabstat_try_hard); try_hard = B_TRUE; goto top; } @@ -5588,15 +5601,15 @@ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio, int flags) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; uint64_t available_slots = 0; boolean_t slot_reserved = B_FALSE; - uint64_t max = mc->mc_alloc_max_slots[allocator]; + uint64_t max = mca->mca_alloc_max_slots; ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); - uint64_t reserved_slots = - zfs_refcount_count(&mc->mc_alloc_slots[allocator]); + uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots); if (reserved_slots < max) available_slots = max - reserved_slots; @@ -5606,11 +5619,8 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. */ - for (int d = 0; d < slots; d++) { - reserved_slots = - zfs_refcount_add(&mc->mc_alloc_slots[allocator], - zio); - } + for (int d = 0; d < slots; d++) + zfs_refcount_add(&mca->mca_alloc_slots, zio); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; slot_reserved = B_TRUE; } @@ -5623,12 +5633,12 @@ void metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; + ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); - for (int d = 0; d < slots; d++) { - (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], - zio); - } + for (int d = 0; d < slots; d++) + zfs_refcount_remove(&mca->mca_alloc_slots, zio); mutex_exit(&mc->mc_lock); } @@ -5674,7 +5684,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, range_tree_remove(msp->ms_allocatable, offset, size); range_tree_clear(msp->ms_trim, offset, size); - if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ + if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */ metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); @@ -5721,7 +5731,7 @@ metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) metaslab_claim_cb_arg_t arg; /* - * Only zdb(1M) can claim on indirect vdevs. This is used + * Only zdb(8) can claim on indirect vdevs. This is used * to detect leaks of mapped space (that are not accounted * for in the obsolete counts, spacemap, or bpobj). 
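
metaslab_group_increment_qdepth() in the hunks above raises the per-allocator queue depth with an atomic_cas_64() retry loop that stops once mg_max_alloc_queue_depth is reached. Below is a standalone sketch of that bounded, lock-free increment; it assumes C11 <stdatomic.h> in place of the SPL's atomic_cas_64(), and the names are illustrative only.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Bump 'cur' toward 'max' without a lock: retry whenever another thread
 * wins the race, and give up once the ceiling has been reached.
 * Returns 1 if a unit was added, 0 if the depth is already at max.
 */
static int
bounded_inc(_Atomic uint64_t *cur, uint64_t max)
{
	uint64_t c = atomic_load(cur);

	while (c < max) {
		if (atomic_compare_exchange_weak(cur, &c, c + 1))
			return (1);
		/* on failure, 'c' was reloaded with the current value */
	}
	return (0);
}

int
main(void)
{
	_Atomic uint64_t depth = 0;
	int granted = 0;

	for (int i = 0; i < 10; i++)
		granted += bounded_inc(&depth, 8);
	printf("granted %d of 10 requests (max 8)\n", granted);
	return (0);
}
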
*/ @@ -5782,7 +5792,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - if (mc->mc_rotor == NULL) { /* no vdevs in this class */ + if (mc->mc_allocator[allocator].mca_rotor == NULL) { + /* no vdevs in this class */ spa_config_exit(spa, SCL_ALLOC, FTAG); return (SET_ERROR(ENOSPC)); } @@ -5813,7 +5824,6 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } - } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); @@ -6235,3 +6245,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG, ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW, "Percentage of memory that can be used to store metaslab range trees"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT, + ZMOD_RW, "Try hard to allocate before ganging"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, INT, ZMOD_RW, + "Normally only consider this many of the best metaslabs in each vdev"); diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c index 99852521b6d1..d05c9db24c20 100644 --- a/sys/contrib/openzfs/module/zfs/mmp.c +++ b/sys/contrib/openzfs/module/zfs/mmp.c @@ -307,8 +307,17 @@ mmp_next_leaf(spa_t *spa) if (leaf == NULL) leaf = list_head(&spa->spa_leaf_list); - if (!vdev_writeable(leaf)) { + /* + * We skip unwritable, offline, detached, and dRAID spare + * devices as they are either not legal targets or the write + * may fail or not be seen by other hosts. Skipped dRAID + * spares can never be written so the fail mask is not set. + */ + if (!vdev_writeable(leaf) || leaf->vdev_offline || + leaf->vdev_detached) { fail_mask |= MMP_FAIL_NOT_WRITABLE; + } else if (leaf->vdev_ops == &vdev_draid_spare_ops) { + continue; } else if (leaf->vdev_mmp_pending != 0) { fail_mask |= MMP_FAIL_WRITE_PENDING; } else { diff --git a/sys/contrib/openzfs/module/zfs/multilist.c b/sys/contrib/openzfs/module/zfs/multilist.c index a3adfd317af6..36c0d33bf1f6 100644 --- a/sys/contrib/openzfs/module/zfs/multilist.c +++ b/sys/contrib/openzfs/module/zfs/multilist.c @@ -96,9 +96,12 @@ multilist_create_impl(size_t size, size_t offset, } /* - * Allocate a new multilist, using the default number of sublists - * (the number of CPUs, or at least 4, or the tunable - * zfs_multilist_num_sublists). + * Allocate a new multilist, using the default number of sublists (the number + * of CPUs, or at least 4, or the tunable zfs_multilist_num_sublists). Note + * that the multilists do not expand if more CPUs are hot-added. In that case, + * we will have less fanout than boot_ncpus, but we don't want to always + * reserve the RAM necessary to create the extra slots for additional CPUs up + * front, and dynamically adding them is a complex task. 
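
As the expanded multilist_create() comment above states, the default sublist count is the boot-time CPU count, floored at 4, unless the zfs_multilist_num_sublists tunable overrides it. A small sketch of that selection follows; the two globals are stand-ins for boot_ncpus and the module tunable.

#include <stdio.h>

static int boot_cpus = 16;		/* stand-in for boot_ncpus */
static int tunable_num_sublists = 0;	/* 0 means "use the default" */

static int
default_sublists(void)
{
	if (tunable_num_sublists > 0)
		return (tunable_num_sublists);
	/* number of CPUs at boot, but never fewer than 4 */
	return (boot_cpus < 4 ? 4 : boot_cpus);
}

int
main(void)
{
	printf("sublists: %d\n", default_sublists());
	return (0);
}
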
*/ multilist_t * multilist_create(size_t size, size_t offset, diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index 9d1d4e0cca64..53ffbc31c186 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -60,6 +60,7 @@ #include <sys/vdev_rebuild.h> #include <sys/vdev_trim.h> #include <sys/vdev_disk.h> +#include <sys/vdev_draid.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> #include <sys/mmp.h> @@ -1280,15 +1281,15 @@ spa_activate(spa_t *spa, spa_mode_t mode) * pool traverse code from monopolizing the global (and limited) * system_taskq by inappropriately scheduling long running tasks on it. */ - spa->spa_prefetch_taskq = taskq_create("z_prefetch", boot_ncpus, - defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); + spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, + defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); /* * The taskq to upgrade datasets in this pool. Currently used by * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. */ - spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus, - defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); + spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, + defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); } /* @@ -2110,9 +2111,6 @@ spa_passivate_log(spa_t *spa) ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - if (!spa_has_slogs(spa)) - return (B_FALSE); - for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; @@ -3681,7 +3679,14 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, /* * Build a new vdev tree from the trusted config */ - VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); + if (error != 0) { + nvlist_free(mos_config); + spa_config_exit(spa, SCL_ALL, FTAG); + spa_load_failed(spa, "spa_config_parse failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } /* * Vdev paths in the MOS may be obsolete. 
If the untrusted config was @@ -5631,7 +5636,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - uint64_t version, obj; + uint64_t version, obj, ndraid = 0; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; @@ -5753,8 +5758,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = spa_validate_aux(spa, nvroot, txg, - VDEV_ALLOC_ADD)) == 0) { + (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && + (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point @@ -5895,6 +5900,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_sync_props(props, tx); } + for (int i = 0; i < ndraid; i++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; @@ -6404,12 +6412,25 @@ spa_reset(const char *pool) */ /* + * This is called as a synctask to increment the draid feature flag + */ +static void +spa_draid_feature_incr(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + int draid = (int)(uintptr_t)arg; + + for (int c = 0; c < draid; c++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); +} + +/* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { - uint64_t txg; + uint64_t txg, ndraid = 0; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; @@ -6438,8 +6459,23 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && - (error = vdev_create(vd, txg, B_FALSE)) != 0) + (error = vdev_create(vd, txg, B_FALSE)) != 0) { + return (spa_vdev_exit(spa, vd, txg, error)); + } + + /* + * The virtual dRAID spares must be added after vdev tree is created + * and the vdev guids are generated. The guid of their assoicated + * dRAID is stored in the config and used when opening the spare. + */ + if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, + rvd->vdev_children)) == 0) { + if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) + nspares = 0; + } else { return (spa_vdev_exit(spa, vd, txg, error)); + } /* * We must validate the spares and l2cache devices after checking the @@ -6452,7 +6488,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect - * vdevs, we can not add raidz toplevels. + * vdevs, we can not add raidz or dRAID top levels. */ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { @@ -6462,10 +6498,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } - /* Fail if top level vdev is raidz */ - if (tvd->vdev_ops == &vdev_raidz_ops) { + /* Fail if top level vdev is raidz or a dRAID */ + if (vdev_get_nparity(tvd) != 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); - } + /* * Need the top level mirror to be * a mirror of leaf vdevs only @@ -6506,6 +6542,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } /* + * We can't increment a feature while holding spa_vdev so we + * have to do it in a synctask. 
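
The comment above explains why the dRAID feature refcount is bumped from a synctask rather than directly while spa_vdev is held; the count itself travels to spa_draid_feature_incr() packed into the callback's void * argument as (void *)(uintptr_t)ndraid. The sketch below shows only that deferred-callback shape in userland; the queueing is trivialized, since dsl_sync_task_nowait() does the real scheduling, and all names are stand-ins.

#include <stdint.h>
#include <stdio.h>

typedef void (*sync_func_t)(void *arg);

/* Trivial stand-in for a queued synctask: callback plus opaque argument. */
struct deferred {
	sync_func_t	df_func;
	void		*df_arg;
};

static void
feature_incr_cb(void *arg)
{
	/* Unpack the small integer that was carried in the pointer. */
	int ndraid = (int)(uintptr_t)arg;

	for (int c = 0; c < ndraid; c++)
		printf("feature refcount +1 (%d of %d)\n", c + 1, ndraid);
}

int
main(void)
{
	uint64_t ndraid = 3;
	struct deferred d = {
		.df_func = feature_incr_cb,
		/* Pack the count; a small integer always fits a pointer. */
		.df_arg = (void *)(uintptr_t)ndraid,
	};

	/* ... later, in "syncing" context ... */
	d.df_func(d.df_arg);
	return (0);
}
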
+ */ + if (ndraid != 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, + (void *)(uintptr_t)ndraid, tx); + dmu_tx_commit(tx); + } + + /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may @@ -6615,14 +6664,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + /* + * A dRAID spare can only replace a child of its parent dRAID vdev. + */ + if (newvd->vdev_ops == &vdev_draid_spare_ops && + oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + if (rebuild) { /* - * For rebuilds, the parent vdev must support reconstruction + * For rebuilds, the top vdev must support reconstruction * using only space maps. This means the only allowable - * parents are the root vdev or a mirror vdev. + * vdevs types are the root vdev, a mirror, or dRAID. */ - if (pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_root_ops) { + tvd = pvd; + if (pvd->vdev_top != NULL) + tvd = pvd->vdev_top; + + if (tvd->vdev_ops != &vdev_mirror_ops && + tvd->vdev_ops != &vdev_root_ops && + tvd->vdev_ops != &vdev_draid_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } } @@ -6915,14 +6977,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) } /* - * If we are detaching the original disk from a spare, then it implies - * that the spare should become a real disk, and be removed from the - * active spare list for the pool. + * If we are detaching the original disk from a normal spare, then it + * implies that the spare should become a real disk, and be removed + * from the active spare list for the pool. dRAID spares on the + * other hand are coupled to the pool and thus should never be removed + * from the spares list. */ - if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0 && - pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) - unspare = B_TRUE; + if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { + vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; + + if (last_cvd->vdev_isspare && + last_cvd->vdev_ops != &vdev_draid_spare_ops) { + unspare = B_TRUE; + } + } /* * Erase the disk labels so the disk can be used for other things. @@ -7903,6 +7971,9 @@ spa_async_remove(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); + + /* Tell userspace that the vdev is gone. */ + zfs_post_remove(spa, vd); } for (int c = 0; c < vd->vdev_children; c++) @@ -8013,18 +8084,9 @@ spa_async_thread(void *arg) /* * If any devices are done replacing, detach them. */ - if (tasks & SPA_ASYNC_RESILVER_DONE) - spa_vdev_resilver_done(spa); - - /* - * If any devices are done replacing, detach them. Then if no - * top-level vdevs are rebuilding attempt to kick off a scrub. 
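
The spa_vdev_attach() change above relaxes the rebuild check from the immediate parent to the top-level vdev, allowing root, mirror, and dRAID tops. A simplified illustration of that walk-then-check follows; the rebuild_supported() helper and the toy vdev model are hypothetical, not the kernel structures.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy vdev model: an ops tag and a pointer to the top-level ancestor. */
typedef enum { OPS_ROOT, OPS_MIRROR, OPS_DRAID, OPS_RAIDZ } toy_ops_t;

typedef struct toy_vdev {
	toy_ops_t		tv_ops;
	struct toy_vdev		*tv_top;	/* NULL for the root itself */
} toy_vdev_t;

/*
 * Sequential rebuild reconstructs from the top-level vdev's space maps, so
 * it is only allowed when that top-level vdev is the root, a mirror, or a
 * dRAID; raidz tops still require a traditional healing resilver.
 */
static bool
rebuild_supported(const toy_vdev_t *pvd)
{
	const toy_vdev_t *tvd = (pvd->tv_top != NULL) ? pvd->tv_top : pvd;

	return (tvd->tv_ops == OPS_ROOT || tvd->tv_ops == OPS_MIRROR ||
	    tvd->tv_ops == OPS_DRAID);
}

int
main(void)
{
	toy_vdev_t mirror = { OPS_MIRROR, &mirror };	/* its own top */
	toy_vdev_t raidz = { OPS_RAIDZ, &raidz };	/* its own top */

	printf("mirror: %s, raidz: %s\n",
	    rebuild_supported(&mirror) ? "rebuild ok" : "resilver only",
	    rebuild_supported(&raidz) ? "rebuild ok" : "resilver only");
	return (0);
}
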
- */ - if (tasks & SPA_ASYNC_REBUILD_DONE) { + if (tasks & SPA_ASYNC_RESILVER_DONE || + tasks & SPA_ASYNC_REBUILD_DONE) { spa_vdev_resilver_done(spa); - - if (!vdev_rebuild_active(spa->spa_root_vdev)) - (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB); } /* @@ -8818,12 +8880,18 @@ spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) } for (int i = 0; i < spa->spa_alloc_count; i++) { - ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i])); - ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i])); - ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i])); - normal->mc_alloc_max_slots[i] = slots_per_allocator; - special->mc_alloc_max_slots[i] = slots_per_allocator; - dedup->mc_alloc_max_slots[i] = slots_per_allocator; + ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. + mca_alloc_slots)); + ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. + mca_alloc_slots)); + ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. + mca_alloc_slots)); + normal->mc_allocator[i].mca_alloc_max_slots = + slots_per_allocator; + special->mc_allocator[i].mca_alloc_max_slots = + slots_per_allocator; + dedup->mc_allocator[i].mca_alloc_max_slots = + slots_per_allocator; } normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; diff --git a/sys/contrib/openzfs/module/zfs/spa_history.c b/sys/contrib/openzfs/module/zfs/spa_history.c index 2ab58815400a..2939c0366504 100644 --- a/sys/contrib/openzfs/module/zfs/spa_history.c +++ b/sys/contrib/openzfs/module/zfs/spa_history.c @@ -321,7 +321,7 @@ spa_history_log_sync(void *arg, dmu_tx_t *tx) * posted as a result of the ZPOOL_HIST_CMD key being present * it would result in only one sysevent being posted with the * full command line arguments, requiring the consumer to know - * how to parse and understand zfs(1M) command invocations. + * how to parse and understand zfs(8) command invocations. */ spa_history_log_notify(spa, nvl); } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index 04210472886c..f49be8eec01a 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -741,6 +741,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; + spa->spa_min_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; @@ -1366,7 +1367,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) /* * If anything changed, wait for it to sync. This ensures that, - * from the system administrator's perspective, zpool(1M) commands + * from the system administrator's perspective, zpool(8) commands * are synchronous. This is important for things like zpool offline: * when the command completes, you expect no further I/O from ZFS. */ @@ -1807,10 +1808,11 @@ spa_update_dspace(spa_t *spa) ddt_get_dedup_dspace(spa); if (spa->spa_vdev_removal != NULL) { /* - * We can't allocate from the removing device, so - * subtract its size. This prevents the DMU/DSL from - * filling up the (now smaller) pool while we are in the - * middle of removing the device. + * We can't allocate from the removing device, so subtract + * its size if it was included in dspace (i.e. if this is a + * normal-class vdev, not special/dedup). This prevents the + * DMU/DSL from filling up the (now smaller) pool while we + * are in the middle of removing the device. 
* * Note that the DMU/DSL doesn't actually know or care * how much space is allocated (it does its own tracking @@ -1822,8 +1824,10 @@ spa_update_dspace(spa_t *spa) spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); - spa->spa_dspace -= spa_deflate(spa) ? - vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + if (vd->vdev_mg->mg_class == spa_normal_class(spa)) { + spa->spa_dspace -= spa_deflate(spa) ? + vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + } spa_config_exit(spa, SCL_VDEV, FTAG); } } @@ -2435,7 +2439,7 @@ spa_fini(void) boolean_t spa_has_slogs(spa_t *spa) { - return (spa->spa_log_class->mc_rotor != NULL); + return (spa->spa_log_class->mc_groups != 0); } spa_log_state_t diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c index 65375b579da6..3efd26155014 100644 --- a/sys/contrib/openzfs/module/zfs/txg.c +++ b/sys/contrib/openzfs/module/zfs/txg.c @@ -305,9 +305,7 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) * significance to the chosen tx_cpu. Because.. Why not use * the current cpu to index into the array? */ - kpreempt_disable(); - tc = &tx->tx_cpu[CPU_SEQID]; - kpreempt_enable(); + tc = &tx->tx_cpu[CPU_SEQID_UNSTABLE]; mutex_enter(&tc->tc_open_lock); txg = tx->tx_open_txg; @@ -448,8 +446,9 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) * Commit callback taskq hasn't been created yet. */ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", - boot_ncpus, defclsyspri, boot_ncpus, boot_ncpus * 2, - TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + 100, defclsyspri, boot_ncpus, boot_ncpus * 2, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC | + TASKQ_THREADS_CPU_PCT); } cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 6af61cdcd9bf..7ffe924212da 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -40,6 +40,7 @@ #include <sys/dsl_dir.h> #include <sys/vdev_impl.h> #include <sys/vdev_rebuild.h> +#include <sys/vdev_draid.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> @@ -51,6 +52,7 @@ #include <sys/arc.h> #include <sys/zil.h> #include <sys/dsl_scan.h> +#include <sys/vdev_raidz.h> #include <sys/abd.h> #include <sys/vdev_initialize.h> #include <sys/vdev_trim.h> @@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent) static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, + &vdev_draid_ops, + &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, @@ -221,15 +225,16 @@ vdev_getops(const char *type) /* ARGSUSED */ void -vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res) +vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { - res->rs_start = in->rs_start; - res->rs_end = in->rs_end; + physical_rs->rs_start = logical_rs->rs_start; + physical_rs->rs_end = logical_rs->rs_end; } /* * Derive the enumerated allocation bias from string input. - * String origin is either the per-vdev zap or zpool(1M). + * String origin is either the per-vdev zap or zpool(8). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) @@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) return (asize); } +uint64_t +vdev_default_min_asize(vdev_t *vd) +{ + return (vd->vdev_min_asize); +} + /* * Get the minimum allocatable size. 
We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to @@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd) if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); - /* - * The allocatable space for a raidz vdev is N * sizeof(smallest child), - * so each child must provide at least 1/Nth of its asize. - */ - if (pvd->vdev_ops == &vdev_raidz_ops) - return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / - pvd->vdev_children); - - return (pvd->vdev_min_asize); + return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void @@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd) vdev_set_min_asize(vd->vdev_child[c]); } +/* + * Get the minimal allocation size for the top-level vdev. + */ +uint64_t +vdev_get_min_alloc(vdev_t *vd) +{ + uint64_t min_alloc = 1ULL << vd->vdev_ashift; + + if (vd->vdev_ops->vdev_op_min_alloc != NULL) + min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); + + return (min_alloc); +} + +/* + * Get the parity level for a top-level vdev. + */ +uint64_t +vdev_get_nparity(vdev_t *vd) +{ + uint64_t nparity = 0; + + if (vd->vdev_ops->vdev_op_nparity != NULL) + nparity = vd->vdev_ops->vdev_op_nparity(vd); + + return (nparity); +} + +/* + * Get the number of data disks for a top-level vdev. + */ +uint64_t +vdev_get_ndisks(vdev_t *vd) +{ + uint64_t ndisks = 1; + + if (vd->vdev_ops->vdev_op_ndisks != NULL) + ndisks = vd->vdev_ops->vdev_op_ndisks(vd); + + return (ndisks); +} + vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { @@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); + mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); @@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); - cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, @@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, { vdev_ops_t *ops; char *type; - uint64_t guid = 0, islog, nparity; + uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; char *tmp = NULL; @@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); - /* - * Set the nparity property for RAID-Z vdevs. - */ - nparity = -1ULL; - if (ops == &vdev_raidz_ops) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, - &nparity) == 0) { - if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) - return (SET_ERROR(EINVAL)); - /* - * Previous versions could only support 1 or 2 parity - * device. - */ - if (nparity > 1 && - spa_version(spa) < SPA_VERSION_RAIDZ2) - return (SET_ERROR(ENOTSUP)); - if (nparity > 2 && - spa_version(spa) < SPA_VERSION_RAIDZ3) - return (SET_ERROR(ENOTSUP)); - } else { - /* - * We require the parity to be specified for SPAs that - * support multiple parity levels. 
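
vdev_get_min_alloc(), vdev_get_nparity(), and vdev_get_ndisks() above all follow the same shape: consult an optional per-type hook in the ops table and fall back to a sensible default when the hook is NULL, which is also what lets the raidz-specific nparity parsing move out of vdev_alloc(). The sketch below shows that optional-hook dispatch with invented ops tables; it is not the real vdev_ops_t layout.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified ops table: hooks may be left NULL by types without them. */
typedef struct toy_vdev_ops {
	uint64_t	(*op_nparity)(void);
	uint64_t	(*op_ndisks)(void);
} toy_vdev_ops_t;

static uint64_t toy_draid_nparity(void) { return (2); }
static uint64_t toy_draid_ndisks(void)  { return (8); }

static const toy_vdev_ops_t toy_mirror_ops = { NULL, NULL };  /* defaults */
static const toy_vdev_ops_t toy_draid_ops =
    { toy_draid_nparity, toy_draid_ndisks };

/* Wrappers supply a default when the type does not implement the hook. */
static uint64_t
toy_get_nparity(const toy_vdev_ops_t *ops)
{
	return (ops->op_nparity != NULL ? ops->op_nparity() : 0);
}

static uint64_t
toy_get_ndisks(const toy_vdev_ops_t *ops)
{
	return (ops->op_ndisks != NULL ? ops->op_ndisks() : 1);
}

int
main(void)
{
	printf("mirror: nparity=%llu ndisks=%llu\n",
	    (unsigned long long)toy_get_nparity(&toy_mirror_ops),
	    (unsigned long long)toy_get_ndisks(&toy_mirror_ops));
	printf("draid:  nparity=%llu ndisks=%llu\n",
	    (unsigned long long)toy_get_nparity(&toy_draid_ops),
	    (unsigned long long)toy_get_ndisks(&toy_draid_ops));
	return (0);
}
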
- */ - if (spa_version(spa) >= SPA_VERSION_RAIDZ2) - return (SET_ERROR(EINVAL)); - /* - * Otherwise, we default to 1 parity device for RAID-Z. - */ - nparity = 1; - } - } else { - nparity = 0; - } - ASSERT(nparity != -1ULL); - - /* - * If creating a top-level vdev, check for allocation classes input - */ if (top_level && alloctype == VDEV_ALLOC_ADD) { char *bias; + /* + * If creating a top-level vdev, check for allocation + * classes input. + */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); @@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, return (SET_ERROR(ENOTSUP)); } } + + /* spa_vdev_add() expects feature to be enabled */ + if (ops == &vdev_draid_ops && + spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { + return (SET_ERROR(ENOTSUP)); + } } - vd = vdev_alloc_common(spa, id, guid, ops); - vic = &vd->vdev_indirect_config; + /* + * Initialize the vdev specific data. This is done before calling + * vdev_alloc_common() since it may fail and this simplifies the + * error reporting and cleanup code paths. + */ + void *tsd = NULL; + if (ops->vdev_op_init != NULL) { + rc = ops->vdev_op_init(spa, nv, &tsd); + if (rc != 0) { + return (rc); + } + } + vd = vdev_alloc_common(spa, id, guid, ops); + vd->vdev_tsd = tsd; vd->vdev_islog = islog; - vd->vdev_nparity = nparity; + if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; @@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; + vic = &vd->vdev_indirect_config; + ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); @@ -937,6 +967,9 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + if (vd->vdev_ops->vdev_op_fini != NULL) + vd->vdev_ops->vdev_op_fini(vd); + /* * Discard allocation state. */ @@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd) cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); - mutex_destroy(&vd->vdev_rebuild_io_lock); cv_destroy(&vd->vdev_rebuild_cv); - cv_destroy(&vd->vdev_rebuild_io_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); @@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd) } /* - * Add a mirror/replacing vdev above an existing vdev. + * Add a mirror/replacing vdev above an existing vdev. There is no need to + * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) @@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; + + uint64_t min_alloc = vdev_get_min_alloc(vd); + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; } } } @@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd) return (B_FALSE); } -void -vdev_open_children(vdev_t *vd) +/* + * Returns B_TRUE if the passed child should be opened. + */ +static boolean_t +vdev_default_open_children_func(vdev_t *vd) +{ + return (B_TRUE); +} + +/* + * Open the requested child vdevs. If any of the leaf vdevs are using + * a ZFS volume then do the opens in a single thread. This avoids a + * deadlock when the current thread is holding the spa_namespace_lock. 
+ */ +static void +vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) { - taskq_t *tq; int children = vd->vdev_children; - /* - * in order to handle pools on top of zvols, do the opens - * in a single thread so that the same thread holds the - * spa_namespace_lock - */ - if (vdev_uses_zvols(vd)) { -retry_sync: - for (int c = 0; c < children; c++) - vd->vdev_child[c]->vdev_open_error = - vdev_open(vd->vdev_child[c]); - } else { - tq = taskq_create("vdev_open", children, minclsyspri, - children, children, TASKQ_PREPOPULATE); - if (tq == NULL) - goto retry_sync; + taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + vd->vdev_nonrot = B_TRUE; - for (int c = 0; c < children; c++) + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (open_func(cvd) == B_FALSE) + continue; + + if (tq == NULL || vdev_uses_zvols(vd)) { + cvd->vdev_open_error = vdev_open(cvd); + } else { VERIFY(taskq_dispatch(tq, vdev_open_child, - vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID); + cvd, TQ_SLEEP) != TASKQID_INVALID); + } + + vd->vdev_nonrot &= cvd->vdev_nonrot; + } + if (tq != NULL) { + taskq_wait(tq); taskq_destroy(tq); } +} - vd->vdev_nonrot = B_TRUE; +/* + * Open all child vdevs. + */ +void +vdev_open_children(vdev_t *vd) +{ + vdev_open_children_impl(vd, vdev_default_open_children_func); +} - for (int c = 0; c < children; c++) - vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; +/* + * Conditionally open a subset of child vdevs. + */ +void +vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) +{ + vdev_open_children_impl(vd, open_func); } /* @@ -1953,6 +2017,16 @@ vdev_open(vdev_t *vd) } /* + * Track the the minimum allocation size. + */ + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + vd->vdev_islog == 0 && vd->vdev_aux == NULL) { + uint64_t min_alloc = vdev_get_min_alloc(vd); + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; + } + + /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since * this would just restart the scrub we are already doing. @@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd) vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT(vd != NULL); + ASSERT(vd->vdev_open_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are @@ -2575,15 +2651,12 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) /* * While we are loading the pool, the DTLs have not been loaded yet. - * Ignore the DTLs and try all devices. This avoids a recursive - * mutex enter on the vdev_dtl_lock, and also makes us try hard - * when loading the pool (relying on the checksum to ensure that - * we get the right data -- note that we while loading, we are - * only reading the MOS, which is always checksummed). + * This isn't a problem but it can result in devices being tried + * which are known to not have the data. In which case, the import + * is relying on the checksum to ensure that we get the right data. + * Note that while importing we are only reading the MOS, which is + * always checksummed. 
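
vdev_open_children_impl() above dispatches each child open to a taskq but falls back to opening in the caller's thread when any leaf sits on a zvol, because that caller already holds the spa_namespace_lock the zvol open path would need. The following is a rough userland analogue using pthreads in place of the taskq; the deadlock itself is only described in comments, not reproduced, and the structures are invented.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct child {
	int		id;
	bool		backed_by_zvol;
	int		open_error;
	pthread_t	tid;
	bool		threaded;
} child_t;

static int
child_open(child_t *c)
{
	/* Stand-in for vdev_open(); a zvol-backed open would need the
	 * global lock the caller already holds, hence the serial path. */
	printf("opening child %d\n", c->id);
	return (0);
}

static void *
open_thread(void *arg)
{
	child_t *c = arg;

	c->open_error = child_open(c);
	return (NULL);
}

static void
open_children(child_t *children, int n)
{
	bool any_zvol = false;

	for (int i = 0; i < n; i++)
		any_zvol |= children[i].backed_by_zvol;

	for (int i = 0; i < n; i++) {
		child_t *c = &children[i];

		if (any_zvol) {
			/* serial: keep all opens in the caller's thread */
			c->open_error = child_open(c);
			c->threaded = false;
		} else {
			c->threaded = (pthread_create(&c->tid, NULL,
			    open_thread, c) == 0);
			if (!c->threaded)
				c->open_error = child_open(c);
		}
	}
	for (int i = 0; i < n; i++) {
		if (children[i].threaded)
			pthread_join(children[i].tid, NULL);
	}
}

int
main(void)
{
	child_t kids[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };

	open_children(kids, 3);
	return (0);
}
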
*/ - if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) - return (B_FALSE); - mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); @@ -2606,10 +2679,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) } /* - * Returns B_TRUE if vdev determines offset needs to be resilvered. + * Check if the txg falls within the range which must be + * resilvered. DVAs outside this range can always be skipped. */ boolean_t -vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + /* Set by sequential resilver. */ + if (phys_birth == TXG_UNKNOWN) + return (B_TRUE); + + return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); +} + +/* + * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. + */ +boolean_t +vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); @@ -2617,7 +2706,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) vd->vdev_ops->vdev_op_leaf) return (B_TRUE); - return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); + return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, + phys_birth)); } /* @@ -2862,8 +2952,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. non-zero */ - else if (vd->vdev_nparity != 0) - minref = vd->vdev_nparity + 1; /* RAID-Z */ + else if (vdev_get_nparity(vd) != 0) + minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); @@ -2884,6 +2974,7 @@ vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; + range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { @@ -2895,10 +2986,17 @@ vdev_dtl_load(vdev_t *vd) return (error); ASSERT(vd->vdev_dtl_sm != NULL); - mutex_enter(&vd->vdev_dtl_lock); - error = space_map_load(vd->vdev_dtl_sm, - vd->vdev_dtl[DTL_MISSING], SM_ALLOC); - mutex_exit(&vd->vdev_dtl_lock); + rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); + if (error == 0) { + mutex_enter(&vd->vdev_dtl_lock); + range_tree_walk(rt, range_tree_add, + vd->vdev_dtl[DTL_MISSING]); + mutex_exit(&vd->vdev_dtl_lock); + } + + range_tree_vacate(rt, NULL, NULL); + range_tree_destroy(rt); return (error); } @@ -3727,6 +3825,9 @@ top: if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; @@ -3971,6 +4072,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio) static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { + /* + * Exclude the dRAID spare when aggregating to avoid double counting + * the ops and bytes. These IOs are counted by the physical leaves. 
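
The vdev_dtl_load() change above loads the on-disk space map into a private range tree first and only then merges it into vdev_dtl[DTL_MISSING] under vdev_dtl_lock, so the lock is no longer held across space_map_load()'s I/O. A toy version of that stage-outside-the-lock, merge-under-the-lock pattern is sketched below; the "range tree" is just an array, and the disk load is simulated.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { uint64_t start, size; } seg_t;
typedef struct { seg_t *segs; int n; } tree_t;

static pthread_mutex_t dtl_lock = PTHREAD_MUTEX_INITIALIZER;
static tree_t dtl_missing;	/* protected by dtl_lock */

static void
tree_add(tree_t *t, uint64_t start, uint64_t size)
{
	t->segs = realloc(t->segs, (t->n + 1) * sizeof (seg_t));
	t->segs[t->n++] = (seg_t){ start, size };
}

/* Stand-in for space_map_load(): slow I/O, done without holding dtl_lock. */
static void
load_from_disk(tree_t *t)
{
	tree_add(t, 100, 10);
	tree_add(t, 200, 5);
}

int
main(void)
{
	tree_t staging = { NULL, 0 };

	/* 1. Populate a private tree; no lock held during the "I/O". */
	load_from_disk(&staging);

	/* 2. Merge into the shared tree under the lock, then discard. */
	pthread_mutex_lock(&dtl_lock);
	for (int i = 0; i < staging.n; i++)
		tree_add(&dtl_missing, staging.segs[i].start,
		    staging.segs[i].size);
	pthread_mutex_unlock(&dtl_lock);

	free(staging.segs);
	printf("merged %d segments\n", dtl_missing.n);
	return (0);
}
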
+ */ + if (cvd->vdev_ops == &vdev_draid_spare_ops) + return; + for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; @@ -4063,7 +4171,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); - } } else { /* @@ -4248,7 +4355,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize) /* * Repair is the result of a rebuild issued by the - * rebuild thread (vdev_rebuild_thread). + * rebuild thread (vdev_rebuild_thread). To avoid + * double counting repaired bytes the virtual dRAID + * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; @@ -4256,8 +4365,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); + } vs->vs_rebuild_processed += psize; } @@ -4353,8 +4464,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; - if (spa->spa_load_state == SPA_LOAD_NONE && - type == ZIO_TYPE_WRITE && txg != 0 && + if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { @@ -4981,31 +5091,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) vdev_resilver_needed(vd, NULL, NULL)); } +boolean_t +vdev_xlate_is_empty(range_seg64_t *rs) +{ + return (rs->rs_start == rs->rs_end); +} + /* - * Translate a logical range to the physical range for the specified vdev_t. - * This function is initially called with a leaf vdev and will walk each - * parent vdev until it reaches a top-level vdev. Once the top-level is - * reached the physical range is initialized and the recursive function - * begins to unwind. As it unwinds it calls the parent's vdev specific - * translation function to do the real conversion. + * Translate a logical range to the first contiguous physical range for the + * specified vdev_t. This function is initially called with a leaf vdev and + * will walk each parent vdev until it reaches a top-level vdev. Once the + * top-level is reached the physical range is initialized and the recursive + * function begins to unwind. As it unwinds it calls the parent's vdev + * specific translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs) + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { - vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); + vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, + remain_rs); } else { /* - * We've reached the top-level vdev, initialize the - * physical range to the logical range and start to - * unwind. + * We've reached the top-level vdev, initialize the physical + * range to the logical range and set an empty remaining + * range then start to unwind. 
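
The rewritten vdev_xlate() above now reports, besides the first contiguous physical range, a "remaining" logical range, and the new vdev_xlate_walk() added later in this hunk simply loops until that remainder is empty. The toy translation below, for a hypothetical striped layout with a fixed stripe width, shows why one logical range can yield several physical pieces; it is illustrative only and not the raidz or dRAID mapping.

#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t start, end; } rs_t;	/* [start, end) */

static int
rs_is_empty(const rs_t *rs)
{
	return (rs->start == rs->end);
}

static void
toy_xlate(const rs_t *logical, uint64_t stripe, rs_t *phys, rs_t *remain)
{
	/* The physical piece runs at most to the next stripe boundary. */
	uint64_t boundary = (logical->start / stripe + 1) * stripe;

	phys->start = logical->start;
	phys->end = (logical->end < boundary) ? logical->end : boundary;

	/* Whatever is left becomes the remainder for the next iteration. */
	remain->start = phys->end;
	remain->end = (logical->end > phys->end) ? logical->end : phys->end;
}

static void
toy_xlate_walk(rs_t logical, uint64_t stripe, void (*func)(const rs_t *))
{
	rs_t iter = logical, phys, remain;

	while (!rs_is_empty(&iter)) {
		toy_xlate(&iter, stripe, &phys, &remain);
		if (!rs_is_empty(&phys))
			func(&phys);
		iter = remain;
	}
}

static void
print_rs(const rs_t *rs)
{
	printf("physical [%llu, %llu)\n",
	    (unsigned long long)rs->start, (unsigned long long)rs->end);
}

int
main(void)
{
	/* A 25-unit logical range crossing two 16-unit stripe boundaries. */
	toy_xlate_walk((rs_t){ 10, 35 }, 16, print_rs);
	return (0);
}
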
*/ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; + + remain_rs->rs_start = logical_rs->rs_start; + remain_rs->rs_end = logical_rs->rs_start; + return; } @@ -5015,16 +5136,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, /* * As this recursive function unwinds, translate the logical - * range into its physical components by calling the - * vdev specific translate function. + * range into its physical and any remaining components by calling + * the vdev specific translate function. */ range_seg64_t intermediate = { 0 }; - pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); + pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } +void +vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, + vdev_xlate_func_t *func, void *arg) +{ + range_seg64_t iter_rs = *logical_rs; + range_seg64_t physical_rs; + range_seg64_t remain_rs; + + while (!vdev_xlate_is_empty(&iter_rs)) { + + vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); + + /* + * With raidz and dRAID, it's possible that the logical range + * does not live on this leaf vdev. Only when there is a non- + * zero physical size call the provided function. + */ + if (!vdev_xlate_is_empty(&physical_rs)) + func(arg, &physical_rs); + + iter_rs = remain_rs; + } +} + /* * Look at the vdev tree and determine whether any devices are currently being * replaced. diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c new file mode 100644 index 000000000000..6b7ad7021a50 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c @@ -0,0 +1,2984 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_draid.h> +#include <sys/vdev_raidz.h> +#include <sys/vdev_rebuild.h> +#include <sys/abd.h> +#include <sys/zio.h> +#include <sys/nvpair.h> +#include <sys/zio_checksum.h> +#include <sys/fs/zfs.h> +#include <sys/fm/fs/zfs.h> +#include <zfs_fletcher.h> + +#ifdef ZFS_DEBUG +#include <sys/vdev.h> /* For vdev_xlate() in vdev_draid_io_verify() */ +#endif + +/* + * dRAID is a distributed spare implementation for ZFS. A dRAID vdev is + * comprised of multiple raidz redundancy groups which are spread over the + * dRAID children. To ensure an even distribution, and avoid hot spots, a + * permutation mapping is applied to the order of the dRAID children. 
+ * This mixing effectively distributes the parity columns evenly over all + * of the disks in the dRAID. + * + * This is beneficial because it means when resilvering all of the disks + * can participate thereby increasing the available IOPs and bandwidth. + * Furthermore, by reserving a small fraction of each child's total capacity + * virtual distributed spare disks can be created. These spares similarly + * benefit from the performance gains of spanning all of the children. The + * consequence of which is that resilvering to a distributed spare can + * substantially reduce the time required to restore full parity to pool + * with a failed disks. + * + * === dRAID group layout === + * + * First, let's define a "row" in the configuration to be a 16M chunk from + * each physical drive at the same offset. This is the minimum allowable + * size since it must be possible to store a full 16M block when there is + * only a single data column. Next, we define a "group" to be a set of + * sequential disks containing both the parity and data columns. We allow + * groups to span multiple rows in order to align any group size to any + * number of physical drives. Finally, a "slice" is comprised of the rows + * which contain the target number of groups. The permutation mappings + * are applied in a round robin fashion to each slice. + * + * Given D+P drives in a group (including parity drives) and C-S physical + * drives (not including the spare drives), we can distribute the groups + * across R rows without remainder by selecting the least common multiple + * of D+P and C-S as the number of groups; i.e. ngroups = LCM(D+P, C-S). + * + * In the example below, there are C=14 physical drives in the configuration + * with S=2 drives worth of spare capacity. Each group has a width of 9 + * which includes D=8 data and P=1 parity drive. There are 4 groups and + * 3 rows per slice. Each group has a size of 144M (16M * 9) and a slice + * size is 576M (144M * 4). When allocating from a dRAID each group is + * filled before moving on to the next as show in slice0 below. + * + * data disks (8 data + 1 parity) spares (2) + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * ^ | 2 | 6 | 1 | 11| 4 | 0 | 7 | 10| 8 | 9 | 13| 5 | 12| 3 | device map 0 + * | +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | | group 0 | group 1..| | + * | +-----------------------------------+-----------+-------| + * | | 0 1 2 3 4 5 6 7 8 | 36 37 38| | r + * | | 9 10 11 12 13 14 15 16 17| 45 46 47| | o + * | | 18 19 20 21 22 23 24 25 26| 54 55 56| | w + * | 27 28 29 30 31 32 33 34 35| 63 64 65| | 0 + * s +-----------------------+-----------------------+-------+ + * l | ..group 1 | group 2.. 
| | + * i +-----------------------+-----------------------+-------+ + * c | 39 40 41 42 43 44| 72 73 74 75 76 77| | r + * e | 48 49 50 51 52 53| 81 82 83 84 85 86| | o + * 0 | 57 58 59 60 61 62| 90 91 92 93 94 95| | w + * | 66 67 68 69 70 71| 99 100 101 102 103 104| | 1 + * | +-----------+-----------+-----------------------+-------+ + * | |..group 2 | group 3 | | + * | +-----------+-----------+-----------------------+-------+ + * | | 78 79 80|108 109 110 111 112 113 114 115 116| | r + * | | 87 88 89|117 118 119 120 121 122 123 124 125| | o + * | | 96 97 98|126 127 128 129 130 131 132 133 134| | w + * v |105 106 107|135 136 137 138 139 140 141 142 143| | 2 + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 9 | 11| 12| 2 | 4 | 1 | 3 | 0 | 10| 13| 8 | 5 | 6 | 7 | device map 1 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 4 | group 5..| | row 3 + * i +-----------------------+-----------+-----------+-------| + * c | ..group 5 | group 6.. | | row 4 + * e +-----------+-----------+-----------------------+-------+ + * 1 |..group 6 | group 7 | | row 5 + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 3 | 5 | 10| 8 | 6 | 11| 12| 0 | 2 | 4 | 7 | 1 | 9 | 13| device map 2 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 8 | group 9..| | row 6 + * i +-----------------------------------------------+-------| + * c | ..group 9 | group 10.. | | row 7 + * e +-----------------------+-----------------------+-------+ + * 2 |..group 10 | group 11 | | row 8 + * +-----------+-----------------------------------+-------+ + * + * This layout has several advantages over requiring that each row contain + * a whole number of groups. + * + * 1. The group count is not a relevant parameter when defining a dRAID + * layout. Only the group width is needed, and *all* groups will have + * the desired size. + * + * 2. All possible group widths (<= physical disk count) can be supported. + * + * 3. The logic within vdev_draid.c is simplified when the group width is + * the same for all groups (although some of the logic around computing + * permutation numbers and drive offsets is more complicated). + * + * N.B. The following array describes all valid dRAID permutation maps. + * Each row is used to generate a permutation map for a different number + * of children from a unique seed. The seeds were generated and carefully + * evaluated by the 'draid' utility in order to provide balanced mappings. + * In addition to the seed a checksum of the in-memory mapping is stored + * for verification. + * + * The imbalance ratio of a given failure (e.g. 5 disks wide, child 3 failed, + * with a given permutation map) is the ratio of the amounts of I/O that will + * be sent to the least and most busy disks when resilvering. The average + * imbalance ratio (of a given number of disks and permutation map) is the + * average of the ratios of all possible single and double disk failures. + * + * In order to achieve a low imbalance ratio the number of permutations in + * the mapping must be significantly larger than the number of children. + * For dRAID the number of permutations has been limited to 512 to minimize + * the map size. This does result in a gradually increasing imbalance ratio + * as seen in the table below. Increasing the number of permutations for + * larger child counts would reduce the imbalance ratio. 
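
To make the group-count arithmetic described above concrete before the permutation-map table: the least common multiple of the group width (D+P) and the non-spare child count (C-S) is the smallest number of child slots after which group boundaries re-align with row boundaries, and dividing it by each factor gives the groups and rows per slice. The check below reproduces the worked example above (D=8, P=1, C=14, S=2), yielding the 4 groups and 3 rows per slice shown in the figure; it is arithmetic only, not the on-disk layout code.

#include <stdint.h>
#include <stdio.h>

static uint64_t
gcd(uint64_t a, uint64_t b)
{
	while (b != 0) {
		uint64_t t = a % b;
		a = b;
		b = t;
	}
	return (a);
}

static uint64_t
lcm(uint64_t a, uint64_t b)
{
	return (a / gcd(a, b) * b);
}

int
main(void)
{
	uint64_t d = 8, p = 1;			/* data/parity per group */
	uint64_t c = 14, s = 2;			/* children and spare slots */
	uint64_t width = d + p;			/* 9 */
	uint64_t ndata = c - s;			/* 12 non-spare children */
	uint64_t span = lcm(width, ndata);	/* 36 child slots */

	printf("groups per slice: %llu, rows per slice: %llu\n",
	    (unsigned long long)(span / width),		/* 4 */
	    (unsigned long long)(span / ndata));	/* 3 */
	return (0);
}
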
However, in practice + * when there are a large number of children each child is responsible for + * fewer total IOs so it's less of a concern. + * + * Note these values are hard coded and must never be changed. Existing + * pools depend on the same mapping always being generated in order to + * read and write from the correct locations. Any change would make + * existing pools completely inaccessible. + */ +static const draid_map_t draid_maps[VDEV_DRAID_MAX_MAPS] = { + { 2, 256, 0x89ef3dabbcc7de37, 0x00000000433d433d }, /* 1.000 */ + { 3, 256, 0x89a57f3de98121b4, 0x00000000bcd8b7b5 }, /* 1.000 */ + { 4, 256, 0xc9ea9ec82340c885, 0x00000001819d7c69 }, /* 1.000 */ + { 5, 256, 0xf46733b7f4d47dfd, 0x00000002a1648d74 }, /* 1.010 */ + { 6, 256, 0x88c3c62d8585b362, 0x00000003d3b0c2c4 }, /* 1.031 */ + { 7, 256, 0x3a65d809b4d1b9d5, 0x000000055c4183ee }, /* 1.043 */ + { 8, 256, 0xe98930e3c5d2e90a, 0x00000006edfb0329 }, /* 1.059 */ + { 9, 256, 0x5a5430036b982ccb, 0x00000008ceaf6934 }, /* 1.056 */ + { 10, 256, 0x92bf389e9eadac74, 0x0000000b26668c09 }, /* 1.072 */ + { 11, 256, 0x74ccebf1dcf3ae80, 0x0000000dd691358c }, /* 1.083 */ + { 12, 256, 0x8847e41a1a9f5671, 0x00000010a0c63c8e }, /* 1.097 */ + { 13, 256, 0x7481b56debf0e637, 0x0000001424121fe4 }, /* 1.100 */ + { 14, 256, 0x559b8c44065f8967, 0x00000016ab2ff079 }, /* 1.121 */ + { 15, 256, 0x34c49545a2ee7f01, 0x0000001a6028efd6 }, /* 1.103 */ + { 16, 256, 0xb85f4fa81a7698f7, 0x0000001e95ff5e66 }, /* 1.111 */ + { 17, 256, 0x6353e47b7e47aba0, 0x00000021a81fa0fe }, /* 1.133 */ + { 18, 256, 0xaa549746b1cbb81c, 0x00000026f02494c9 }, /* 1.131 */ + { 19, 256, 0x892e343f2f31d690, 0x00000029eb392835 }, /* 1.130 */ + { 20, 256, 0x76914824db98cc3f, 0x0000003004f31a7c }, /* 1.141 */ + { 21, 256, 0x4b3cbabf9cfb1d0f, 0x00000036363a2408 }, /* 1.139 */ + { 22, 256, 0xf45c77abb4f035d4, 0x00000038dd0f3e84 }, /* 1.150 */ + { 23, 256, 0x5e18bd7f3fd4baf4, 0x0000003f0660391f }, /* 1.174 */ + { 24, 256, 0xa7b3a4d285d6503b, 0x000000443dfc9ff6 }, /* 1.168 */ + { 25, 256, 0x56ac7dd967521f5a, 0x0000004b03a87eb7 }, /* 1.180 */ + { 26, 256, 0x3a42dfda4eb880f7, 0x000000522c719bba }, /* 1.226 */ + { 27, 256, 0xd200d2fc6b54bf60, 0x0000005760b4fdf5 }, /* 1.228 */ + { 28, 256, 0xc52605bbd486c546, 0x0000005e00d8f74c }, /* 1.217 */ + { 29, 256, 0xc761779e63cd762f, 0x00000067be3cd85c }, /* 1.239 */ + { 30, 256, 0xca577b1e07f85ca5, 0x0000006f5517f3e4 }, /* 1.238 */ + { 31, 256, 0xfd50a593c518b3d4, 0x0000007370e7778f }, /* 1.273 */ + { 32, 512, 0xc6c87ba5b042650b, 0x000000f7eb08a156 }, /* 1.191 */ + { 33, 512, 0xc3880d0c9d458304, 0x0000010734b5d160 }, /* 1.199 */ + { 34, 512, 0xe920927e4d8b2c97, 0x00000118c1edbce0 }, /* 1.195 */ + { 35, 512, 0x8da7fcda87bde316, 0x0000012a3e9f9110 }, /* 1.201 */ + { 36, 512, 0xcf09937491514a29, 0x0000013bd6a24bef }, /* 1.194 */ + { 37, 512, 0x9b5abbf345cbd7cc, 0x0000014b9d90fac3 }, /* 1.237 */ + { 38, 512, 0x506312a44668d6a9, 0x0000015e1b5f6148 }, /* 1.242 */ + { 39, 512, 0x71659ede62b4755f, 0x00000173ef029bcd }, /* 1.231 */ + { 40, 512, 0xa7fde73fb74cf2d7, 0x000001866fb72748 }, /* 1.233 */ + { 41, 512, 0x19e8b461a1dea1d3, 0x000001a046f76b23 }, /* 1.271 */ + { 42, 512, 0x031c9b868cc3e976, 0x000001afa64c49d3 }, /* 1.263 */ + { 43, 512, 0xbaa5125faa781854, 0x000001c76789e278 }, /* 1.270 */ + { 44, 512, 0x4ed55052550d721b, 0x000001d800ccd8eb }, /* 1.281 */ + { 45, 512, 0x0fd63ddbdff90677, 0x000001f08ad59ed2 }, /* 1.282 */ + { 46, 512, 0x36d66546de7fdd6f, 0x000002016f09574b }, /* 1.286 */ + { 47, 512, 0x99f997e7eafb69d7, 0x0000021e42e47cb6 }, /* 1.329 */ + { 48, 
512, 0xbecd9c2571312c5d, 0x000002320fe2872b }, /* 1.286 */ + { 49, 512, 0xd97371329e488a32, 0x0000024cd73f2ca7 }, /* 1.322 */ + { 50, 512, 0x30e9b136670749ee, 0x000002681c83b0e0 }, /* 1.335 */ + { 51, 512, 0x11ad6bc8f47aaeb4, 0x0000027e9261b5d5 }, /* 1.305 */ + { 52, 512, 0x68e445300af432c1, 0x0000029aa0eb7dbf }, /* 1.330 */ + { 53, 512, 0x910fb561657ea98c, 0x000002b3dca04853 }, /* 1.365 */ + { 54, 512, 0xd619693d8ce5e7a5, 0x000002cc280e9c97 }, /* 1.334 */ + { 55, 512, 0x24e281f564dbb60a, 0x000002e9fa842713 }, /* 1.364 */ + { 56, 512, 0x947a7d3bdaab44c5, 0x000003046680f72e }, /* 1.374 */ + { 57, 512, 0x2d44fec9c093e0de, 0x00000324198ba810 }, /* 1.363 */ + { 58, 512, 0x87743c272d29bb4c, 0x0000033ec48c9ac9 }, /* 1.401 */ + { 59, 512, 0x96aa3b6f67f5d923, 0x0000034faead902c }, /* 1.392 */ + { 60, 512, 0x94a4f1faf520b0d3, 0x0000037d713ab005 }, /* 1.360 */ + { 61, 512, 0xb13ed3a272f711a2, 0x00000397368f3cbd }, /* 1.396 */ + { 62, 512, 0x3b1b11805fa4a64a, 0x000003b8a5e2840c }, /* 1.453 */ + { 63, 512, 0x4c74caad9172ba71, 0x000003d4be280290 }, /* 1.437 */ + { 64, 512, 0x035ff643923dd29e, 0x000003fad6c355e1 }, /* 1.402 */ + { 65, 512, 0x768e9171b11abd3c, 0x0000040eb07fed20 }, /* 1.459 */ + { 66, 512, 0x75880e6f78a13ddd, 0x000004433d6acf14 }, /* 1.423 */ + { 67, 512, 0x910b9714f698a877, 0x00000451ea65d5db }, /* 1.447 */ + { 68, 512, 0x87f5db6f9fdcf5c7, 0x000004732169e3f7 }, /* 1.450 */ + { 69, 512, 0x836d4968fbaa3706, 0x000004954068a380 }, /* 1.455 */ + { 70, 512, 0xc567d73a036421ab, 0x000004bd7cb7bd3d }, /* 1.463 */ + { 71, 512, 0x619df40f240b8fed, 0x000004e376c2e972 }, /* 1.463 */ + { 72, 512, 0x42763a680d5bed8e, 0x000005084275c680 }, /* 1.452 */ + { 73, 512, 0x5866f064b3230431, 0x0000052906f2c9ab }, /* 1.498 */ + { 74, 512, 0x9fa08548b1621a44, 0x0000054708019247 }, /* 1.526 */ + { 75, 512, 0xb6053078ce0fc303, 0x00000572cc5c72b0 }, /* 1.491 */ + { 76, 512, 0x4a7aad7bf3890923, 0x0000058e987bc8e9 }, /* 1.470 */ + { 77, 512, 0xe165613fd75b5a53, 0x000005c20473a211 }, /* 1.527 */ + { 78, 512, 0x3ff154ac878163a6, 0x000005d659194bf3 }, /* 1.509 */ + { 79, 512, 0x24b93ade0aa8a532, 0x0000060a201c4f8e }, /* 1.569 */ + { 80, 512, 0xc18e2d14cd9bb554, 0x0000062c55cfe48c }, /* 1.555 */ + { 81, 512, 0x98cc78302feb58b6, 0x0000066656a07194 }, /* 1.509 */ + { 82, 512, 0xc6c5fd5a2abc0543, 0x0000067cff94fbf8 }, /* 1.596 */ + { 83, 512, 0xa7962f514acbba21, 0x000006ab7b5afa2e }, /* 1.568 */ + { 84, 512, 0xba02545069ddc6dc, 0x000006d19861364f }, /* 1.541 */ + { 85, 512, 0x447c73192c35073e, 0x000006fce315ce35 }, /* 1.623 */ + { 86, 512, 0x48beef9e2d42b0c2, 0x00000720a8e38b6b }, /* 1.620 */ + { 87, 512, 0x4874cf98541a35e0, 0x00000758382a2273 }, /* 1.597 */ + { 88, 512, 0xad4cf8333a31127a, 0x00000781e1651b1b }, /* 1.575 */ + { 89, 512, 0x47ae4859d57888c1, 0x000007b27edbe5bc }, /* 1.627 */ + { 90, 512, 0x06f7723cfe5d1891, 0x000007dc2a96d8eb }, /* 1.596 */ + { 91, 512, 0xd4e44218d660576d, 0x0000080ac46f02d5 }, /* 1.622 */ + { 92, 512, 0x7066702b0d5be1f2, 0x00000832c96d154e }, /* 1.695 */ + { 93, 512, 0x011209b4f9e11fb9, 0x0000085eefda104c }, /* 1.605 */ + { 94, 512, 0x47ffba30a0b35708, 0x00000899badc32dc }, /* 1.625 */ + { 95, 512, 0x1a95a6ac4538aaa8, 0x000008b6b69a42b2 }, /* 1.687 */ + { 96, 512, 0xbda2b239bb2008eb, 0x000008f22d2de38a }, /* 1.621 */ + { 97, 512, 0x7ffa0bea90355c6c, 0x0000092e5b23b816 }, /* 1.699 */ + { 98, 512, 0x1d56ba34be426795, 0x0000094f482e5d1b }, /* 1.688 */ + { 99, 512, 0x0aa89d45c502e93d, 0x00000977d94a98ce }, /* 1.642 */ + { 100, 512, 0x54369449f6857774, 0x000009c06c9b34cc }, /* 1.683 */ + { 101, 
512, 0xf7d4dd8445b46765, 0x000009e5dc542259 }, /* 1.755 */ + { 102, 512, 0xfa8866312f169469, 0x00000a16b54eae93 }, /* 1.692 */ + { 103, 512, 0xd8a5aea08aef3ff9, 0x00000a381d2cbfe7 }, /* 1.747 */ + { 104, 512, 0x66bcd2c3d5f9ef0e, 0x00000a8191817be7 }, /* 1.751 */ + { 105, 512, 0x3fb13a47a012ec81, 0x00000ab562b9a254 }, /* 1.751 */ + { 106, 512, 0x43100f01c9e5e3ca, 0x00000aeee84c185f }, /* 1.726 */ + { 107, 512, 0xca09c50ccee2d054, 0x00000b1c359c047d }, /* 1.788 */ + { 108, 512, 0xd7176732ac503f9b, 0x00000b578bc52a73 }, /* 1.740 */ + { 109, 512, 0xed206e51f8d9422d, 0x00000b8083e0d960 }, /* 1.780 */ + { 110, 512, 0x17ead5dc6ba0dcd6, 0x00000bcfb1a32ca8 }, /* 1.836 */ + { 111, 512, 0x5f1dc21e38a969eb, 0x00000c0171becdd6 }, /* 1.778 */ + { 112, 512, 0xddaa973de33ec528, 0x00000c3edaba4b95 }, /* 1.831 */ + { 113, 512, 0x2a5eccd7735a3630, 0x00000c630664e7df }, /* 1.825 */ + { 114, 512, 0xafcccee5c0b71446, 0x00000cb65392f6e4 }, /* 1.826 */ + { 115, 512, 0x8fa30c5e7b147e27, 0x00000cd4db391e55 }, /* 1.843 */ + { 116, 512, 0x5afe0711fdfafd82, 0x00000d08cb4ec35d }, /* 1.826 */ + { 117, 512, 0x533a6090238afd4c, 0x00000d336f115d1b }, /* 1.803 */ + { 118, 512, 0x90cf11b595e39a84, 0x00000d8e041c2048 }, /* 1.857 */ + { 119, 512, 0x0d61a3b809444009, 0x00000dcb798afe35 }, /* 1.877 */ + { 120, 512, 0x7f34da0f54b0d114, 0x00000df3922664e1 }, /* 1.849 */ + { 121, 512, 0xa52258d5b72f6551, 0x00000e4d37a9872d }, /* 1.867 */ + { 122, 512, 0xc1de54d7672878db, 0x00000e6583a94cf6 }, /* 1.978 */ + { 123, 512, 0x1d03354316a414ab, 0x00000ebffc50308d }, /* 1.947 */ + { 124, 512, 0xcebdcc377665412c, 0x00000edee1997cea }, /* 1.865 */ + { 125, 512, 0x4ddd4c04b1a12344, 0x00000f21d64b373f }, /* 1.881 */ + { 126, 512, 0x64fc8f94e3973658, 0x00000f8f87a8896b }, /* 1.882 */ + { 127, 512, 0x68765f78034a334e, 0x00000fb8fe62197e }, /* 1.867 */ + { 128, 512, 0xaf36b871a303e816, 0x00000fec6f3afb1e }, /* 1.972 */ + { 129, 512, 0x2a4cbf73866c3a28, 0x00001027febfe4e5 }, /* 1.896 */ + { 130, 512, 0x9cb128aacdcd3b2f, 0x0000106aa8ac569d }, /* 1.965 */ + { 131, 512, 0x5511d41c55869124, 0x000010bbd755ddf1 }, /* 1.963 */ + { 132, 512, 0x42f92461937f284a, 0x000010fb8bceb3b5 }, /* 1.925 */ + { 133, 512, 0xe2d89a1cf6f1f287, 0x0000114cf5331e34 }, /* 1.862 */ + { 134, 512, 0xdc631a038956200e, 0x0000116428d2adc5 }, /* 2.042 */ + { 135, 512, 0xb2e5ac222cd236be, 0x000011ca88e4d4d2 }, /* 1.935 */ + { 136, 512, 0xbc7d8236655d88e7, 0x000011e39cb94e66 }, /* 2.005 */ + { 137, 512, 0x073e02d88d2d8e75, 0x0000123136c7933c }, /* 2.041 */ + { 138, 512, 0x3ddb9c3873166be0, 0x00001280e4ec6d52 }, /* 1.997 */ + { 139, 512, 0x7d3b1a845420e1b5, 0x000012c2e7cd6a44 }, /* 1.996 */ + { 140, 512, 0x60102308aa7b2a6c, 0x000012fc490e6c7d }, /* 2.053 */ + { 141, 512, 0xdb22bb2f9eb894aa, 0x00001343f5a85a1a }, /* 1.971 */ + { 142, 512, 0xd853f879a13b1606, 0x000013bb7d5f9048 }, /* 2.018 */ + { 143, 512, 0x001620a03f804b1d, 0x000013e74cc794fd }, /* 1.961 */ + { 144, 512, 0xfdb52dda76fbf667, 0x00001442d2f22480 }, /* 2.046 */ + { 145, 512, 0xa9160110f66e24ff, 0x0000144b899f9dbb }, /* 1.968 */ + { 146, 512, 0x77306a30379ae03b, 0x000014cb98eb1f81 }, /* 2.143 */ + { 147, 512, 0x14f5985d2752319d, 0x000014feab821fc9 }, /* 2.064 */ + { 148, 512, 0xa4b8ff11de7863f8, 0x0000154a0e60b9c9 }, /* 2.023 */ + { 149, 512, 0x44b345426455c1b3, 0x000015999c3c569c }, /* 2.136 */ + { 150, 512, 0x272677826049b46c, 0x000015c9697f4b92 }, /* 2.063 */ + { 151, 512, 0x2f9216e2cd74fe40, 0x0000162b1f7bbd39 }, /* 1.974 */ + { 152, 512, 0x706ae3e763ad8771, 0x00001661371c55e1 }, /* 2.210 */ + { 153, 512, 
0xf7fd345307c2480e, 0x000016e251f28b6a }, /* 2.006 */ + { 154, 512, 0x6e94e3d26b3139eb, 0x000016f2429bb8c6 }, /* 2.193 */ + { 155, 512, 0x5458bbfbb781fcba, 0x0000173efdeca1b9 }, /* 2.163 */ + { 156, 512, 0xa80e2afeccd93b33, 0x000017bfdcb78adc }, /* 2.046 */ + { 157, 512, 0x1e4ccbb22796cf9d, 0x00001826fdcc39c9 }, /* 2.084 */ + { 158, 512, 0x8fba4b676aaa3663, 0x00001841a1379480 }, /* 2.264 */ + { 159, 512, 0xf82b843814b315fa, 0x000018886e19b8a3 }, /* 2.074 */ + { 160, 512, 0x7f21e920ecf753a3, 0x0000191812ca0ea7 }, /* 2.282 */ + { 161, 512, 0x48bb8ea2c4caa620, 0x0000192f310faccf }, /* 2.148 */ + { 162, 512, 0x5cdb652b4952c91b, 0x0000199e1d7437c7 }, /* 2.355 */ + { 163, 512, 0x6ac1ba6f78c06cd4, 0x000019cd11f82c70 }, /* 2.164 */ + { 164, 512, 0x9faf5f9ca2669a56, 0x00001a18d5431f6a }, /* 2.393 */ + { 165, 512, 0xaa57e9383eb01194, 0x00001a9e7d253d85 }, /* 2.178 */ + { 166, 512, 0x896967bf495c34d2, 0x00001afb8319b9fc }, /* 2.334 */ + { 167, 512, 0xdfad5f05de225f1b, 0x00001b3a59c3093b }, /* 2.266 */ + { 168, 512, 0xfd299a99f9f2abdd, 0x00001bb6f1a10799 }, /* 2.304 */ + { 169, 512, 0xdda239e798fe9fd4, 0x00001bfae0c9692d }, /* 2.218 */ + { 170, 512, 0x5fca670414a32c3e, 0x00001c22129dbcff }, /* 2.377 */ + { 171, 512, 0x1bb8934314b087de, 0x00001c955db36cd0 }, /* 2.155 */ + { 172, 512, 0xd96394b4b082200d, 0x00001cfc8619b7e6 }, /* 2.404 */ + { 173, 512, 0xb612a7735b1c8cbc, 0x00001d303acdd585 }, /* 2.205 */ + { 174, 512, 0x28e7430fe5875fe1, 0x00001d7ed5b3697d }, /* 2.359 */ + { 175, 512, 0x5038e89efdd981b9, 0x00001dc40ec35c59 }, /* 2.158 */ + { 176, 512, 0x075fd78f1d14db7c, 0x00001e31c83b4a2b }, /* 2.614 */ + { 177, 512, 0xc50fafdb5021be15, 0x00001e7cdac82fbc }, /* 2.239 */ + { 178, 512, 0xe6dc7572ce7b91c7, 0x00001edd8bb454fc }, /* 2.493 */ + { 179, 512, 0x21f7843e7beda537, 0x00001f3a8e019d6c }, /* 2.327 */ + { 180, 512, 0xc83385e20b43ec82, 0x00001f70735ec137 }, /* 2.231 */ + { 181, 512, 0xca818217dddb21fd, 0x0000201ca44c5a3c }, /* 2.237 */ + { 182, 512, 0xe6035defea48f933, 0x00002038e3346658 }, /* 2.691 */ + { 183, 512, 0x47262a4f953dac5a, 0x000020c2e554314e }, /* 2.170 */ + { 184, 512, 0xe24c7246260873ea, 0x000021197e618d64 }, /* 2.600 */ + { 185, 512, 0xeef6b57c9b58e9e1, 0x0000217ea48ecddc }, /* 2.391 */ + { 186, 512, 0x2becd3346e386142, 0x000021c496d4a5f9 }, /* 2.677 */ + { 187, 512, 0x63c6207bdf3b40a3, 0x0000220e0f2eec0c }, /* 2.410 */ + { 188, 512, 0x3056ce8989767d4b, 0x0000228eb76cd137 }, /* 2.776 */ + { 189, 512, 0x91af61c307cee780, 0x000022e17e2ea501 }, /* 2.266 */ + { 190, 512, 0xda359da225f6d54f, 0x00002358a2debc19 }, /* 2.717 */ + { 191, 512, 0x0a5f7a2a55607ba0, 0x0000238a79dac18c }, /* 2.474 */ + { 192, 512, 0x27bb75bf5224638a, 0x00002403a58e2351 }, /* 2.673 */ + { 193, 512, 0x1ebfdb94630f5d0f, 0x00002492a10cb339 }, /* 2.420 */ + { 194, 512, 0x6eae5e51d9c5f6fb, 0x000024ce4bf98715 }, /* 2.898 */ + { 195, 512, 0x08d903b4daedc2e0, 0x0000250d1e15886c }, /* 2.363 */ + { 196, 512, 0xc722a2f7fa7cd686, 0x0000258a99ed0c9e }, /* 2.747 */ + { 197, 512, 0x8f71faf0e54e361d, 0x000025dee11976f5 }, /* 2.531 */ + { 198, 512, 0x87f64695c91a54e7, 0x0000264e00a43da0 }, /* 2.707 */ + { 199, 512, 0xc719cbac2c336b92, 0x000026d327277ac1 }, /* 2.315 */ + { 200, 512, 0xe7e647afaf771ade, 0x000027523a5c44bf }, /* 3.012 */ + { 201, 512, 0x12d4b5c38ce8c946, 0x0000273898432545 }, /* 2.378 */ + { 202, 512, 0xf2e0cd4067bdc94a, 0x000027e47bb2c935 }, /* 2.969 */ + { 203, 512, 0x21b79f14d6d947d3, 0x0000281e64977f0d }, /* 2.594 */ + { 204, 512, 0x515093f952f18cd6, 0x0000289691a473fd }, /* 2.763 */ + { 205, 512, 
0xd47b160a1b1022c8, 0x00002903e8b52411 }, /* 2.457 */ + { 206, 512, 0xc02fc96684715a16, 0x0000297515608601 }, /* 3.057 */ + { 207, 512, 0xef51e68efba72ed0, 0x000029ef73604804 }, /* 2.590 */ + { 208, 512, 0x9e3be6e5448b4f33, 0x00002a2846ed074b }, /* 3.047 */ + { 209, 512, 0x81d446c6d5fec063, 0x00002a92ca693455 }, /* 2.676 */ + { 210, 512, 0xff215de8224e57d5, 0x00002b2271fe3729 }, /* 2.993 */ + { 211, 512, 0xe2524d9ba8f69796, 0x00002b64b99c3ba2 }, /* 2.457 */ + { 212, 512, 0xf6b28e26097b7e4b, 0x00002bd768b6e068 }, /* 3.182 */ + { 213, 512, 0x893a487f30ce1644, 0x00002c67f722b4b2 }, /* 2.563 */ + { 214, 512, 0x386566c3fc9871df, 0x00002cc1cf8b4037 }, /* 3.025 */ + { 215, 512, 0x1e0ed78edf1f558a, 0x00002d3948d36c7f }, /* 2.730 */ + { 216, 512, 0xe3bc20c31e61f113, 0x00002d6d6b12e025 }, /* 3.036 */ + { 217, 512, 0xd6c3ad2e23021882, 0x00002deff7572241 }, /* 2.722 */ + { 218, 512, 0xb4a9f95cf0f69c5a, 0x00002e67d537aa36 }, /* 3.356 */ + { 219, 512, 0x6e98ed6f6c38e82f, 0x00002e9720626789 }, /* 2.697 */ + { 220, 512, 0x2e01edba33fddac7, 0x00002f407c6b0198 }, /* 2.979 */ + { 221, 512, 0x559d02e1f5f57ccc, 0x00002fb6a5ab4f24 }, /* 2.858 */ + { 222, 512, 0xac18f5a916adcd8e, 0x0000304ae1c5c57e }, /* 3.258 */ + { 223, 512, 0x15789fbaddb86f4b, 0x0000306f6e019c78 }, /* 2.693 */ + { 224, 512, 0xf4a9c36d5bc4c408, 0x000030da40434213 }, /* 3.259 */ + { 225, 512, 0xf640f90fd2727f44, 0x00003189ed37b90c }, /* 2.733 */ + { 226, 512, 0xb5313d390d61884a, 0x000031e152616b37 }, /* 3.235 */ + { 227, 512, 0x4bae6b3ce9160939, 0x0000321f40aeac42 }, /* 2.983 */ + { 228, 512, 0x838c34480f1a66a1, 0x000032f389c0f78e }, /* 3.308 */ + { 229, 512, 0xb1c4a52c8e3d6060, 0x0000330062a40284 }, /* 2.715 */ + { 230, 512, 0xe0f1110c6d0ed822, 0x0000338be435644f }, /* 3.540 */ + { 231, 512, 0x9f1a8ccdcea68d4b, 0x000034045a4e97e1 }, /* 2.779 */ + { 232, 512, 0x3261ed62223f3099, 0x000034702cfc401c }, /* 3.084 */ + { 233, 512, 0xf2191e2311022d65, 0x00003509dd19c9fc }, /* 2.987 */ + { 234, 512, 0xf102a395c2033abc, 0x000035654dc96fae }, /* 3.341 */ + { 235, 512, 0x11fe378f027906b6, 0x000035b5193b0264 }, /* 2.793 */ + { 236, 512, 0xf777f2c026b337aa, 0x000036704f5d9297 }, /* 3.518 */ + { 237, 512, 0x1b04e9c2ee143f32, 0x000036dfbb7af218 }, /* 2.962 */ + { 238, 512, 0x2fcec95266f9352c, 0x00003785c8df24a9 }, /* 3.196 */ + { 239, 512, 0xfe2b0e47e427dd85, 0x000037cbdf5da729 }, /* 2.914 */ + { 240, 512, 0x72b49bf2225f6c6d, 0x0000382227c15855 }, /* 3.408 */ + { 241, 512, 0x50486b43df7df9c7, 0x0000389b88be6453 }, /* 2.903 */ + { 242, 512, 0x5192a3e53181c8ab, 0x000038ddf3d67263 }, /* 3.778 */ + { 243, 512, 0xe9f5d8365296fd5e, 0x0000399f1c6c9e9c }, /* 3.026 */ + { 244, 512, 0xc740263f0301efa8, 0x00003a147146512d }, /* 3.347 */ + { 245, 512, 0x23cd0f2b5671e67d, 0x00003ab10bcc0d9d }, /* 3.212 */ + { 246, 512, 0x002ccc7e5cd41390, 0x00003ad6cd14a6c0 }, /* 3.482 */ + { 247, 512, 0x9aafb3c02544b31b, 0x00003b8cb8779fb0 }, /* 3.146 */ + { 248, 512, 0x72ba07a78b121999, 0x00003c24142a5a3f }, /* 3.626 */ + { 249, 512, 0x3d784aa58edfc7b4, 0x00003cd084817d99 }, /* 2.952 */ + { 250, 512, 0xaab750424d8004af, 0x00003d506a8e098e }, /* 3.463 */ + { 251, 512, 0x84403fcf8e6b5ca2, 0x00003d4c54c2aec4 }, /* 3.131 */ + { 252, 512, 0x71eb7455ec98e207, 0x00003e655715cf2c }, /* 3.538 */ + { 253, 512, 0xd752b4f19301595b, 0x00003ecd7b2ca5ac }, /* 2.974 */ + { 254, 512, 0xc4674129750499de, 0x00003e99e86d3e95 }, /* 3.843 */ + { 255, 512, 0x9772baff5cd12ef5, 0x00003f895c019841 }, /* 3.088 */ +}; + +/* + * Verify the map is valid. 
Each device index must appear exactly + * once in every row, and the permutation array checksum must match. + */ +static int +verify_perms(uint8_t *perms, uint64_t children, uint64_t nperms, + uint64_t checksum) +{ + int countssz = sizeof (uint16_t) * children; + uint16_t *counts = kmem_zalloc(countssz, KM_SLEEP); + + for (int i = 0; i < nperms; i++) { + for (int j = 0; j < children; j++) { + uint8_t val = perms[(i * children) + j]; + + if (val >= children || counts[val] != i) { + kmem_free(counts, countssz); + return (EINVAL); + } + + counts[val]++; + } + } + + if (checksum != 0) { + int permssz = sizeof (uint8_t) * children * nperms; + zio_cksum_t cksum; + + fletcher_4_native_varsize(perms, permssz, &cksum); + + if (checksum != cksum.zc_word[0]) { + kmem_free(counts, countssz); + return (ECKSUM); + } + } + + kmem_free(counts, countssz); + + return (0); +} + +/* + * Generate the permutation array for the draid_map_t. These maps control + * the placement of all data in a dRAID. Therefore it's critical that the + * seed always generates the same mapping. We provide our own pseudo-random + * number generator for this purpose. + */ +int +vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) +{ + VERIFY3U(map->dm_children, >=, VDEV_DRAID_MIN_CHILDREN); + VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN); + VERIFY3U(map->dm_seed, !=, 0); + VERIFY3U(map->dm_nperms, !=, 0); + VERIFY3P(map->dm_perms, ==, NULL); + +#ifdef _KERNEL + /* + * The kernel code always provides both a map_seed and checksum. + * Only the tests/zfs-tests/cmd/draid/draid.c utility will provide + * a zero checksum when generating new candidate maps. + */ + VERIFY3U(map->dm_checksum, !=, 0); +#endif + uint64_t children = map->dm_children; + uint64_t nperms = map->dm_nperms; + int rowsz = sizeof (uint8_t) * children; + int permssz = rowsz * nperms; + uint8_t *perms; + + /* Allocate the permutation array */ + perms = vmem_alloc(permssz, KM_SLEEP); + + /* Setup an initial row with a known pattern */ + uint8_t *initial_row = kmem_alloc(rowsz, KM_SLEEP); + for (int i = 0; i < children; i++) + initial_row[i] = i; + + uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed }; + uint8_t *current_row, *previous_row = initial_row; + + /* + * Perform a Fisher-Yates shuffle of each row using the previous + * row as the starting point. An initial_row with known pattern + * is used as the input for the first row. + */ + for (int i = 0; i < nperms; i++) { + current_row = &perms[i * children]; + memcpy(current_row, previous_row, rowsz); + + for (int j = children - 1; j > 0; j--) { + uint64_t k = vdev_draid_rand(draid_seed) % (j + 1); + uint8_t val = current_row[j]; + current_row[j] = current_row[k]; + current_row[k] = val; + } + + previous_row = current_row; + } + + kmem_free(initial_row, rowsz); + + int error = verify_perms(perms, children, nperms, map->dm_checksum); + if (error) { + vmem_free(perms, permssz); + return (error); + } + + *permsp = perms; + + return (0); +} + +/* + * Lookup the fixed draid_map_t for the requested number of children. + */ +int +vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp) +{ + for (int i = 0; i <= VDEV_DRAID_MAX_MAPS; i++) { + if (draid_maps[i].dm_children == children) { + *mapp = &draid_maps[i]; + return (0); + } + } + + return (ENOENT); +} + +/* + * Lookup the permutation array and iteration id for the provided offset. 
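The permutation generation above chains a Fisher-Yates shuffle: every row is a shuffled copy of the previous one, driven by a deterministic generator so a given seed always reproduces the same map. A standalone sketch of that structure follows; it is not part of this diff, the draid_perms_sketch()/prng_next() names are invented for illustration, and the splitmix64 stand-in generator will not reproduce the committed maps, which depend on vdev_draid_rand() and its 128-bit seed state.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in PRNG (splitmix64); the kernel code uses vdev_draid_rand(). */
static uint64_t
prng_next(uint64_t *state)
{
	uint64_t z = (*state += 0x9e3779b97f4a7c15ULL);

	z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL;
	z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL;
	return (z ^ (z >> 31));
}

/*
 * Build 'nperms' rows, each 'children' wide.  Row N is a seeded
 * Fisher-Yates shuffle of row N-1; the first row is shuffled from a
 * known identity pattern, following the structure above.
 */
static uint8_t *
draid_perms_sketch(uint64_t children, uint64_t nperms, uint64_t seed)
{
	uint8_t initial[256], *perms;
	const uint8_t *src = initial;
	uint64_t state = seed;

	if (children == 0 || children > 255)
		return (NULL);
	if ((perms = malloc((size_t)(children * nperms))) == NULL)
		return (NULL);

	for (uint64_t i = 0; i < children; i++)
		initial[i] = (uint8_t)i;

	for (uint64_t r = 0; r < nperms; r++) {
		uint8_t *row = perms + r * children;

		memcpy(row, src, (size_t)children);
		for (uint64_t j = children - 1; j > 0; j--) {
			uint64_t k = prng_next(&state) % (j + 1);
			uint8_t tmp = row[j];

			row[j] = row[k];
			row[k] = tmp;
		}
		src = row;
	}
	return (perms);
}

Every generated row remains a permutation of 0..children-1, which is the invariant verify_perms() checks above before comparing the fletcher-4 checksum of the whole array.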
+ */ +static void +vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex, + uint8_t **base, uint64_t *iter) +{ + uint64_t ncols = vdc->vdc_children; + uint64_t poff = pindex % (vdc->vdc_nperms * ncols); + + *base = vdc->vdc_perms + (poff / ncols) * ncols; + *iter = poff % ncols; +} + +static inline uint64_t +vdev_draid_permute_id(vdev_draid_config_t *vdc, + uint8_t *base, uint64_t iter, uint64_t index) +{ + return ((base[index] + iter) % vdc->vdc_children); +} + +/* + * Return the asize which is the psize rounded up to a full group width. + * i.e. vdev_draid_psize_to_asize(). + */ +static uint64_t +vdev_draid_asize(vdev_t *vd, uint64_t psize) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + uint64_t ashift = vd->vdev_ashift; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + uint64_t rows = ((psize - 1) / (vdc->vdc_ndata << ashift)) + 1; + uint64_t asize = (rows * vdc->vdc_groupwidth) << ashift; + + ASSERT3U(asize, !=, 0); + ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0); + + return (asize); +} + +/* + * Deflate the asize to the psize, this includes stripping parity. + */ +uint64_t +vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT0(asize % vdc->vdc_groupwidth); + + return ((asize / vdc->vdc_groupwidth) * vdc->vdc_ndata); +} + +/* + * Convert a logical offset to the corresponding group number. + */ +static uint64_t +vdev_draid_offset_to_group(vdev_t *vd, uint64_t offset) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (offset / vdc->vdc_groupsz); +} + +/* + * Convert a group number to the logical starting offset for that group. + */ +static uint64_t +vdev_draid_group_to_offset(vdev_t *vd, uint64_t group) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (group * vdc->vdc_groupsz); +} + + +static void +vdev_draid_map_free_vsd(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + + ASSERT0(rm->rm_freed); + rm->rm_freed = B_TRUE; + + if (rm->rm_reports == 0) { + vdev_raidz_map_free(rm); + } +} + +/*ARGSUSED*/ +static void +vdev_draid_cksum_free(void *arg, size_t ignored) +{ + raidz_map_t *rm = arg; + + ASSERT3U(rm->rm_reports, >, 0); + + if (--rm->rm_reports == 0 && rm->rm_freed) + vdev_raidz_map_free(rm); +} + +static void +vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) +{ + raidz_map_t *rm = zcr->zcr_cbdata; + const size_t c = zcr->zcr_cbinfo; + uint64_t skip_size = zcr->zcr_sector; + uint64_t parity_size; + size_t x, offset, size; + + if (good_data == NULL) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); + return; + } + + /* + * Detailed cksum reporting is currently only supported for single + * row draid mappings, this covers the vast majority of zios. Only + * a dRAID zio which spans groups will have multiple rows. + */ + if (rm->rm_nrows != 1) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); + return; + } + + raidz_row_t *rr = rm->rm_row[0]; + const abd_t *good = NULL; + const abd_t *bad = rr->rr_col[c].rc_abd; + + if (c < rr->rr_firstdatacol) { + /* + * The first time through, calculate the parity blocks for + * the good data (this relies on the fact that the good + * data never changes for a given logical zio) + */ + if (rr->rr_col[0].rc_gdata == NULL) { + abd_t *bad_parity[VDEV_DRAID_MAXPARITY]; + + /* + * Set up the rr_col[]s to generate the parity for + * good_data, first saving the parity bufs and + * replacing them with buffers to hold the result. 
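The helpers in this span are pure arithmetic; the standalone sketches below (hypothetical names, not part of this diff) mirror the permutation lookup and rotation of vdev_draid_get_perm()/vdev_draid_permute_id() and the size conversions of vdev_draid_asize()/vdev_draid_asize_to_psize().

#include <stdint.h>

/*
 * Select the permutation row and rotation for permutation index 'pindex'
 * in a flat array of 'nperms' rows, each 'children' wide, then map logical
 * position 'index' (0 <= index < children) to a physical child id.
 */
static uint64_t
draid_devidx_sketch(const uint8_t *perms, uint64_t children, uint64_t nperms,
    uint64_t pindex, uint64_t index)
{
	uint64_t poff = pindex % (nperms * children);
	const uint8_t *base = perms + (poff / children) * children;
	uint64_t iter = poff % children;

	return ((base[index] + iter) % children);
}

/* Inflate a psize to full rows of 'ndata' sectors plus parity columns. */
static uint64_t
draid_psize_to_asize_sketch(uint64_t psize, uint64_t ndata,
    uint64_t groupwidth, uint64_t ashift)
{
	uint64_t rows = ((psize - 1) / (ndata << ashift)) + 1;

	return ((rows * groupwidth) << ashift);
}

/* Deflate an asize back to its data share (row padding is retained). */
static uint64_t
draid_asize_to_psize_sketch(uint64_t asize, uint64_t ndata,
    uint64_t groupwidth)
{
	return ((asize / groupwidth) * ndata);
}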
+ */ + for (x = 0; x < rr->rr_firstdatacol; x++) { + bad_parity[x] = rr->rr_col[x].rc_abd; + rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata = + abd_alloc_sametype(rr->rr_col[x].rc_abd, + rr->rr_col[x].rc_size); + } + + /* + * Fill in the data columns from good_data being + * careful to pad short columns and empty columns + * with a skip sector. + */ + uint64_t good_size = abd_get_size((abd_t *)good_data); + + offset = 0; + for (; x < rr->rr_cols; x++) { + abd_put(rr->rr_col[x].rc_abd); + + if (offset == good_size) { + /* empty data column (small write) */ + rr->rr_col[x].rc_abd = + abd_get_zeros(skip_size); + } else if (x < rr->rr_bigcols) { + /* this is a "big column" */ + size = rr->rr_col[x].rc_size; + rr->rr_col[x].rc_abd = + abd_get_offset_size( + (abd_t *)good_data, offset, size); + offset += size; + } else { + /* short data column, add skip sector */ + size = rr->rr_col[x].rc_size -skip_size; + rr->rr_col[x].rc_abd = abd_alloc( + rr->rr_col[x].rc_size, B_TRUE); + abd_copy_off(rr->rr_col[x].rc_abd, + (abd_t *)good_data, 0, offset, + size); + abd_zero_off(rr->rr_col[x].rc_abd, + size, skip_size); + offset += size; + } + } + + /* + * Construct the parity from the good data. + */ + vdev_raidz_generate_parity_row(rm, rr); + + /* restore everything back to its original state */ + for (x = 0; x < rr->rr_firstdatacol; x++) + rr->rr_col[x].rc_abd = bad_parity[x]; + + offset = 0; + for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) { + if (offset == good_size || x < rr->rr_bigcols) + abd_put(rr->rr_col[x].rc_abd); + else + abd_free(rr->rr_col[x].rc_abd); + + rr->rr_col[x].rc_abd = abd_get_offset_size( + rr->rr_abd_copy, offset, + rr->rr_col[x].rc_size); + offset += rr->rr_col[x].rc_size; + } + } + + ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL); + good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0, + rr->rr_col[c].rc_size); + } else { + /* adjust good_data to point at the start of our column */ + parity_size = size = rr->rr_col[0].rc_size; + if (c >= rr->rr_bigcols) { + size -= skip_size; + zcr->zcr_length = size; + } + + /* empty column */ + if (size == 0) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_TRUE); + return; + } + + offset = 0; + for (x = rr->rr_firstdatacol; x < c; x++) { + if (x < rr->rr_bigcols) { + offset += parity_size; + } else { + offset += parity_size - skip_size; + } + } + + good = abd_get_offset_size((abd_t *)good_data, offset, size); + } + + /* we drop the ereport if it ends up that the data was good */ + zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); + abd_put((abd_t *)good); +} + +/* + * Invoked indirectly by zfs_ereport_start_checksum(), called + * below when our read operation fails completely. The main point + * is to keep a copy of everything we read from disk, so that at + * vdev_draid_cksum_finish() time we can compare it with the good data. + */ +static void +vdev_draid_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) +{ + size_t c = (size_t)(uintptr_t)arg; + raidz_map_t *rm = zio->io_vsd; + + /* set up the report and bump the refcount */ + zcr->zcr_cbdata = rm; + zcr->zcr_cbinfo = c; + zcr->zcr_finish = vdev_draid_cksum_finish; + zcr->zcr_free = vdev_draid_cksum_free; + + rm->rm_reports++; + ASSERT3U(rm->rm_reports, >, 0); + + if (rm->rm_row[0]->rr_abd_copy != NULL) + return; + + /* + * It's the first time we're called for this raidz_map_t, so we need + * to copy the data aside; there's no guarantee that our zio's buffer + * won't be re-used for something else. 
+ * + * Our parity data is already in separate buffers, so there's no need + * to copy them. Furthermore, all columns should have been expanded + * by vdev_draid_map_alloc_empty() when attempting reconstruction. + */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + size_t offset = 0; + size_t size = 0; + + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + ASSERT3U(rr->rr_col[c].rc_size, ==, + rr->rr_col[0].rc_size); + size += rr->rr_col[c].rc_size; + } + + rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE); + + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; + abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy, + offset, col->rc_size); + + abd_copy(tmp, col->rc_abd, col->rc_size); + + if (abd_is_gang(col->rc_abd)) + abd_free(col->rc_abd); + else + abd_put(col->rc_abd); + + col->rc_abd = tmp; + offset += col->rc_size; + } + ASSERT3U(offset, ==, size); + } +} + +const zio_vsd_ops_t vdev_draid_vsd_ops = { + .vsd_free = vdev_draid_map_free_vsd, + .vsd_cksum_report = vdev_draid_cksum_report +}; + +/* + * Full stripe writes. When writing, all columns (D+P) are required. Parity + * is calculated over all the columns, including empty zero filled sectors, + * and each is written to disk. While only the data columns are needed for + * a normal read, all of the columns are required for reconstruction when + * performing a sequential resilver. + * + * For "big columns" it's sufficient to map the correct range of the zio ABD. + * Partial columns require allocating a gang ABD in order to zero fill the + * empty sectors. When the column is empty a zero filled sector must be + * mapped. In all cases the data ABDs must be the same size as the parity + * ABDs (e.g. rc->rc_size == parity_size). + */ +static void +vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t abd_off = abd_offset; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3U(parity_size, ==, abd_get_size(rr->rr_col[0].rc_abd)); + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size == 0) { + /* empty data column (small write), add a skip sector */ + ASSERT3U(skip_size, ==, parity_size); + rc->rc_abd = abd_get_zeros(skip_size); + } else if (rc->rc_size == parity_size) { + /* this is a "big column" */ + rc->rc_abd = abd_get_offset_size(zio->io_abd, + abd_off, rc->rc_size); + } else { + /* short data column, add a skip sector */ + ASSERT3U(rc->rc_size + skip_size, ==, parity_size); + rc->rc_abd = abd_alloc_gang_abd(); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + zio->io_abd, abd_off, rc->rc_size), B_TRUE); + abd_gang_add(rc->rc_abd, abd_get_zeros(skip_size), + B_TRUE); + } + + ASSERT3U(abd_get_size(rc->rc_abd), ==, parity_size); + + abd_off += rc->rc_size; + rc->rc_size = parity_size; + } + + IMPLY(abd_offset != 0, abd_off == zio->io_size); +} + +/* + * Scrub/resilver reads. In order to store the contents of the skip sectors + * an additional ABD is allocated. The columns are handled in the same way + * as a full stripe write except instead of using the zero ABD the newly + * allocated skip ABD is used to back the skip sectors. In all cases the + * data ABD must be the same size as the parity ABDs. 
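The write mapping above pads every data column out to the parity column size: an empty column becomes a single zero-filled skip sector and a short column gets one skip sector appended, so parity is always computed over equal-sized columns. A toy sketch of the padding rule with plain buffers follows; draid_pad_column_sketch() is a hypothetical helper, and the real code avoids the copy by chaining gang ABDs instead.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/*
 * Produce a buffer of 'parity_size' bytes holding 'col_size' bytes of
 * column data (possibly zero bytes for an empty column) followed by a
 * zero-filled skip region.
 */
static uint8_t *
draid_pad_column_sketch(const uint8_t *data, uint64_t col_size,
    uint64_t parity_size)
{
	uint8_t *buf;

	if (col_size > parity_size || (buf = malloc(parity_size)) == NULL)
		return (NULL);

	if (col_size > 0)
		memcpy(buf, data, col_size);
	memset(buf + col_size, 0, parity_size - col_size);
	return (buf);
}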
+ */ +static void +vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t abd_off = abd_offset; + uint64_t skip_off = 0; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + ASSERT3P(rr->rr_abd_empty, ==, NULL); + + if (rr->rr_nempty > 0) { + rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size, + B_FALSE); + } + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size == 0) { + /* empty data column (small read), add a skip sector */ + ASSERT3U(skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty, + skip_off, skip_size); + skip_off += skip_size; + } else if (rc->rc_size == parity_size) { + /* this is a "big column" */ + rc->rc_abd = abd_get_offset_size(zio->io_abd, + abd_off, rc->rc_size); + } else { + /* short data column, add a skip sector */ + ASSERT3U(rc->rc_size + skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + rc->rc_abd = abd_alloc_gang_abd(); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + zio->io_abd, abd_off, rc->rc_size), B_TRUE); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + rr->rr_abd_empty, skip_off, skip_size), B_TRUE); + skip_off += skip_size; + } + + uint64_t abd_size = abd_get_size(rc->rc_abd); + ASSERT3U(abd_size, ==, abd_get_size(rr->rr_col[0].rc_abd)); + + /* + * Increase rc_size so the skip ABD is included in subsequent + * parity calculations. + */ + abd_off += rc->rc_size; + rc->rc_size = abd_size; + } + + IMPLY(abd_offset != 0, abd_off == zio->io_size); + ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size); +} + +/* + * Normal reads. In this common case only the columns containing data + * are read in to the zio ABDs. Neither the parity columns or empty skip + * sectors are read unless the checksum fails verification. In which case + * vdev_raidz_read_all() will call vdev_draid_map_alloc_empty() to expand + * the raid map in order to allow reconstruction using the parity data and + * skip sectors. + */ +static void +vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr) +{ + uint64_t abd_off = abd_offset; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size > 0) { + rc->rc_abd = abd_get_offset_size(zio->io_abd, + abd_off, rc->rc_size); + abd_off += rc->rc_size; + } + } + + IMPLY(abd_offset != 0, abd_off == zio->io_size); +} + +/* + * Converts a normal "read" raidz_row_t to a "scrub" raidz_row_t. The key + * difference is that an ABD is allocated to back skip sectors so they may + * be read in to memory, verified, and repaired if needed. 
+ */ +void +vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t skip_off = 0; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + ASSERT3P(rr->rr_abd_empty, ==, NULL); + + if (rr->rr_nempty > 0) { + rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size, + B_FALSE); + } + + for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_size == 0) { + /* empty data column (small read), add a skip sector */ + ASSERT3U(skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + ASSERT3P(rc->rc_abd, ==, NULL); + rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty, + skip_off, skip_size); + skip_off += skip_size; + } else if (rc->rc_size == parity_size) { + /* this is a "big column", nothing to add */ + ASSERT3P(rc->rc_abd, !=, NULL); + } else { + /* short data column, add a skip sector */ + ASSERT3U(rc->rc_size + skip_size, ==, parity_size); + ASSERT3U(rr->rr_nempty, !=, 0); + ASSERT3P(rc->rc_abd, !=, NULL); + ASSERT(!abd_is_gang(rc->rc_abd)); + abd_t *read_abd = rc->rc_abd; + rc->rc_abd = abd_alloc_gang_abd(); + abd_gang_add(rc->rc_abd, read_abd, B_TRUE); + abd_gang_add(rc->rc_abd, abd_get_offset_size( + rr->rr_abd_empty, skip_off, skip_size), B_TRUE); + skip_off += skip_size; + } + + /* + * Increase rc_size so the empty ABD is included in subsequent + * parity calculations. + */ + rc->rc_size = parity_size; + } + + ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size); +} + +/* + * Given a logical address within a dRAID configuration, return the physical + * address on the first drive in the group that this address maps to + * (at position 'start' in permutation number 'perm'). + */ +static uint64_t +vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset, + uint64_t *perm, uint64_t *start) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + /* b is the dRAID (parent) sector offset. */ + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t b_offset = logical_offset >> ashift; + + /* + * The height of a row in units of the vdev's minimum sector size. + * This is the amount of data written to each disk of each group + * in a given permutation. + */ + uint64_t rowheight_sectors = VDEV_DRAID_ROWHEIGHT >> ashift; + + /* + * We cycle through a disk permutation every groupsz * ngroups chunk + * of address space. Note that ngroups * groupsz must be a multiple + * of the number of data drives (ndisks) in order to guarantee + * alignment. So, for example, if our row height is 16MB, our group + * size is 10, and there are 13 data drives in the draid, then ngroups + * will be 13, we will change permutation every 2.08GB and each + * disk will have 160MB of data per chunk. + */ + uint64_t groupwidth = vdc->vdc_groupwidth; + uint64_t ngroups = vdc->vdc_ngroups; + uint64_t ndisks = vdc->vdc_ndisks; + + /* + * groupstart is where the group this IO will land in "starts" in + * the permutation array. 
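The comment above works a concrete case: a 16MB row height, a group width of 10, and 13 data-bearing disks give a new permutation every 2.08GB of address space and 160MB per disk per permutation chunk. The standalone sketch below (illustration only) reproduces that arithmetic, deriving ngroups as the smallest count for which ngroups * groupwidth is a multiple of ndisks.

#include <stdint.h>
#include <stdio.h>

static uint64_t
gcd64(uint64_t a, uint64_t b)
{
	while (b != 0) {
		uint64_t t = a % b;

		a = b;
		b = t;
	}
	return (a);
}

int
main(void)
{
	uint64_t rowheight = 16ULL << 20;	/* bytes per disk per row */
	uint64_t groupwidth = 10;		/* data + parity columns */
	uint64_t ndisks = 13;			/* data-bearing children */

	/* Smallest ngroups with (ngroups * groupwidth) % ndisks == 0. */
	uint64_t ngroups = ndisks / gcd64(groupwidth, ndisks);
	uint64_t groupsz = groupwidth * rowheight;
	uint64_t permsz = groupsz * ngroups;	/* permutation chunk */
	uint64_t per_disk = permsz / ndisks;	/* data per disk per chunk */

	printf("ngroups=%llu chunk=%lluMB per-disk=%lluMB\n",
	    (unsigned long long)ngroups,
	    (unsigned long long)(permsz >> 20),
	    (unsigned long long)(per_disk >> 20));
	return (0);
}

For these inputs it prints ngroups=13 chunk=2080MB per-disk=160MB, matching the figures in the comment.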
+ */ + uint64_t group = logical_offset / vdc->vdc_groupsz; + uint64_t groupstart = (group * groupwidth) % ndisks; + ASSERT3U(groupstart + groupwidth, <=, ndisks + groupstart); + *start = groupstart; + + /* b_offset is the sector offset within a group chunk */ + b_offset = b_offset % (rowheight_sectors * groupwidth); + ASSERT0(b_offset % groupwidth); + + /* + * Find the starting byte offset on each child vdev: + * - within a permutation there are ngroups groups spread over the + * rows, where each row covers a slice portion of the disk + * - each permutation has (groupwidth * ngroups) / ndisks rows + * - so each permutation covers rows * slice portion of the disk + * - so we need to find the row where this IO group target begins + */ + *perm = group / ngroups; + uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) + + (((group % ngroups) * groupwidth) / ndisks); + + return (((rowheight_sectors * row) + + (b_offset / groupwidth)) << ashift); +} + +static uint64_t +vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, + uint64_t abd_offset, uint64_t abd_size) +{ + vdev_t *vd = zio->io_vd; + vdev_draid_config_t *vdc = vd->vdev_tsd; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t io_size = abd_size; + uint64_t io_asize = vdev_draid_asize(vd, io_size); + uint64_t group = vdev_draid_offset_to_group(vd, io_offset); + uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1); + + /* + * Limit the io_size to the space remaining in the group. A second + * row in the raidz_map_t is created for the remainder. + */ + if (io_offset + io_asize > start_offset) { + io_size = vdev_draid_asize_to_psize(vd, + start_offset - io_offset); + } + + /* + * At most a block may span the logical end of one group and the start + * of the next group. Therefore, at the end of a group the io_size must + * span the group width evenly and the remainder must be aligned to the + * start of the next group. + */ + IMPLY(abd_offset == 0 && io_size < zio->io_size, + (io_asize >> ashift) % vdc->vdc_groupwidth == 0); + IMPLY(abd_offset != 0, + vdev_draid_group_to_offset(vd, group) == io_offset); + + /* Lookup starting byte offset on each child vdev */ + uint64_t groupstart, perm; + uint64_t physical_offset = vdev_draid_logical_to_physical(vd, + io_offset, &perm, &groupstart); + + /* + * If there is less than groupwidth drives available after the group + * start, the group is going to wrap onto the next row. 'wrap' is the + * group disk number that starts on the next row. + */ + uint64_t ndisks = vdc->vdc_ndisks; + uint64_t groupwidth = vdc->vdc_groupwidth; + uint64_t wrap = groupwidth; + + if (groupstart + groupwidth > ndisks) + wrap = ndisks - groupstart; + + /* The io size in units of the vdev's minimum sector size. */ + const uint64_t psize = io_size >> ashift; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ + uint64_t q = psize / vdc->vdc_ndata; + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + uint64_t r = psize - q * vdc->vdc_ndata; + + /* The number of "big columns" - those which contain remainder data. */ + uint64_t bc = (r == 0 ? 0 : r + vdc->vdc_nparity); + ASSERT3U(bc, <, groupwidth); + + /* The total number of data and parity sectors for this I/O. */ + uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 
0 : 1))); + + raidz_row_t *rr; + rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP); + rr->rr_cols = groupwidth; + rr->rr_scols = groupwidth; + rr->rr_bigcols = bc; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = vdc->vdc_nparity; + rr->rr_abd_copy = NULL; + rr->rr_abd_empty = NULL; +#ifdef ZFS_DEBUG + rr->rr_offset = io_offset; + rr->rr_size = io_size; +#endif + *rrp = rr; + + uint8_t *base; + uint64_t iter, asize = 0; + vdev_draid_get_perm(vdc, perm, &base, &iter); + for (uint64_t i = 0; i < groupwidth; i++) { + raidz_col_t *rc = &rr->rr_col[i]; + uint64_t c = (groupstart + i) % ndisks; + + /* increment the offset if we wrap to the next row */ + if (i == wrap) + physical_offset += VDEV_DRAID_ROWHEIGHT; + + rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c); + rc->rc_offset = physical_offset; + rc->rc_abd = NULL; + rc->rc_gdata = NULL; + rc->rc_orig_data = NULL; + rc->rc_error = 0; + rc->rc_tried = 0; + rc->rc_skipped = 0; + rc->rc_repair = 0; + rc->rc_need_orig_restore = B_FALSE; + + if (q == 0 && i >= bc) + rc->rc_size = 0; + else if (i < bc) + rc->rc_size = (q + 1) << ashift; + else + rc->rc_size = q << ashift; + + asize += rc->rc_size; + } + + ASSERT3U(asize, ==, tot << ashift); + rr->rr_nempty = roundup(tot, groupwidth) - tot; + IMPLY(bc > 0, rr->rr_nempty == groupwidth - bc); + + /* Allocate buffers for the parity columns */ + for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); + } + + /* + * Map buffers for data columns and allocate/map buffers for skip + * sectors. There are three distinct cases for dRAID which are + * required to support sequential rebuild. + */ + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_draid_map_alloc_write(zio, abd_offset, rr); + } else if ((rr->rr_nempty > 0) && + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + vdev_draid_map_alloc_scrub(zio, abd_offset, rr); + } else { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + vdev_draid_map_alloc_read(zio, abd_offset, rr); + } + + return (io_size); +} + +/* + * Allocate the raidz mapping to be applied to the dRAID I/O. The parity + * calculations for dRAID are identical to raidz however there are a few + * differences in the layout. + * + * - dRAID always allocates a full stripe width. Any extra sectors due + * this padding are zero filled and written to disk. They will be read + * back during a scrub or repair operation since they are included in + * the parity calculation. This property enables sequential resilvering. + * + * - When the block at the logical offset spans redundancy groups then two + * rows are allocated in the raidz_map_t. One row resides at the end of + * the first group and the other at the start of the following group. 
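The column sizing above reduces to a handful of integer expressions; the sketch below (hypothetical names, not part of this diff) computes them for one row: the per-column quotient q, the number of big columns that carry a remainder sector, the total sectors issued, and the skip-sector count used to pad the row to a full stripe.

#include <stdint.h>

typedef struct draid_row_shape {
	uint64_t drs_q;		/* data sectors per non-big data column */
	uint64_t drs_bc;	/* "big" columns (parity + remainder) */
	uint64_t drs_tot;	/* total data + parity sectors */
	uint64_t drs_nempty;	/* columns holding only a skip sector */
} draid_row_shape_t;

static draid_row_shape_t
draid_row_shape_sketch(uint64_t psize_sectors, uint64_t ndata,
    uint64_t nparity)
{
	uint64_t groupwidth = ndata + nparity;
	draid_row_shape_t s;

	s.drs_q = psize_sectors / ndata;
	uint64_t r = psize_sectors - s.drs_q * ndata;

	s.drs_bc = (r == 0 ? 0 : r + nparity);
	s.drs_tot = psize_sectors + nparity * (s.drs_q + (r == 0 ? 0 : 1));
	s.drs_nempty = ((s.drs_tot + groupwidth - 1) / groupwidth) *
	    groupwidth - s.drs_tot;
	return (s);
}

For example, a 6-sector write on an 8+2 group gives q=0, bc=8, tot=8 and nempty=2, consistent with the IMPLY(bc > 0, nempty == groupwidth - bc) assertion above.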
+ */ +static raidz_map_t * +vdev_draid_map_alloc(zio_t *zio) +{ + raidz_row_t *rr[2]; + uint64_t abd_offset = 0; + uint64_t abd_size = zio->io_size; + uint64_t io_offset = zio->io_offset; + uint64_t size; + int nrows = 1; + + size = vdev_draid_map_alloc_row(zio, &rr[0], io_offset, + abd_offset, abd_size); + if (size < abd_size) { + vdev_t *vd = zio->io_vd; + + io_offset += vdev_draid_asize(vd, size); + abd_offset += size; + abd_size -= size; + nrows++; + + ASSERT3U(io_offset, ==, vdev_draid_group_to_offset( + vd, vdev_draid_offset_to_group(vd, io_offset))); + ASSERT3U(abd_offset, <, zio->io_size); + ASSERT3U(abd_size, !=, 0); + + size = vdev_draid_map_alloc_row(zio, &rr[1], + io_offset, abd_offset, abd_size); + VERIFY3U(size, ==, abd_size); + } + + raidz_map_t *rm; + rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[nrows]), KM_SLEEP); + rm->rm_ops = vdev_raidz_math_get_ops(); + rm->rm_nrows = nrows; + rm->rm_row[0] = rr[0]; + if (nrows == 2) + rm->rm_row[1] = rr[1]; + + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_draid_vsd_ops; + + return (rm); +} + +/* + * Given an offset into a dRAID return the next group width aligned offset + * which can be used to start an allocation. + */ +static uint64_t +vdev_draid_get_astart(vdev_t *vd, const uint64_t start) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (roundup(start, vdc->vdc_groupwidth << vd->vdev_ashift)); +} + +/* + * Allocatable space for dRAID is (children - nspares) * sizeof(smallest child) + * rounded down to the last full slice. So each child must provide at least + * 1 / (children - nspares) of its asize. + */ +static uint64_t +vdev_draid_min_asize(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return ((vd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks)); +} + +/* + * When using dRAID the minimum allocation size is determined by the number + * of data disks in the redundancy group. Full stripes are always used. + */ +static uint64_t +vdev_draid_min_alloc(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (vdc->vdc_ndata << vd->vdev_ashift); +} + +/* + * Returns true if the txg range does not exist on any leaf vdev. + * + * A dRAID spare does not fit into the DTL model. While it has child vdevs + * there is no redundancy among them, and the effective child vdev is + * determined by offset. Essentially we do a vdev_dtl_reassess() on the + * fly by replacing a dRAID spare with the child vdev under the offset. + * Note that it is a recursive process because the child vdev can be + * another dRAID spare and so on. + */ +boolean_t +vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg, + uint64_t size) +{ + if (vd->vdev_ops == &vdev_spare_ops || + vd->vdev_ops == &vdev_replacing_ops) { + /* + * Check all of the readable children, if any child + * contains the txg range the data it is not missing. + */ + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!vdev_readable(cvd)) + continue; + + if (!vdev_draid_missing(cvd, physical_offset, + txg, size)) + return (B_FALSE); + } + + return (B_TRUE); + } + + if (vd->vdev_ops == &vdev_draid_spare_ops) { + /* + * When sequentially resilvering we don't have a proper + * txg range so instead we must presume all txgs are + * missing on this vdev until the resilver completes. 
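The allocation helpers in this hunk are likewise plain arithmetic; a standalone sketch under the same definitions (groupwidth, ndata, ndisks, ashift), with invented function names:

#include <stdint.h>

/* Next group-width-aligned offset at or after 'start'. */
static uint64_t
draid_get_astart_sketch(uint64_t start, uint64_t groupwidth, uint64_t ashift)
{
	uint64_t align = groupwidth << ashift;

	return (((start + align - 1) / align) * align);
}

/* Smallest allocation: one full stripe of data sectors. */
static uint64_t
draid_min_alloc_sketch(uint64_t ndata, uint64_t ashift)
{
	return (ndata << ashift);
}

/* Each data-bearing child must provide at least 1/ndisks of min_asize. */
static uint64_t
draid_min_child_asize_sketch(uint64_t min_asize, uint64_t ndisks)
{
	return ((min_asize + ndisks - 1) / ndisks);
}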
+ */ + if (vd->vdev_rebuild_txg != 0) + return (B_TRUE); + + /* + * DTL_MISSING is set for all prior txgs when a resilver + * is started in spa_vdev_attach(). + */ + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + /* + * Consult the DTL on the relevant vdev. Either a vdev + * leaf or spare/replace mirror child may be returned so + * we must recursively call vdev_draid_missing_impl(). + */ + vd = vdev_draid_spare_get_child(vd, physical_offset); + if (vd == NULL) + return (B_TRUE); + + return (vdev_draid_missing(vd, physical_offset, + txg, size)); + } + + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); +} + +/* + * Returns true if the txg is only partially replicated on the leaf vdevs. + */ +static boolean_t +vdev_draid_partial(vdev_t *vd, uint64_t physical_offset, uint64_t txg, + uint64_t size) +{ + if (vd->vdev_ops == &vdev_spare_ops || + vd->vdev_ops == &vdev_replacing_ops) { + /* + * Check all of the readable children, if any child is + * missing the txg range then it is partially replicated. + */ + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!vdev_readable(cvd)) + continue; + + if (vdev_draid_partial(cvd, physical_offset, txg, size)) + return (B_TRUE); + } + + return (B_FALSE); + } + + if (vd->vdev_ops == &vdev_draid_spare_ops) { + /* + * When sequentially resilvering we don't have a proper + * txg range so instead we must presume all txgs are + * missing on this vdev until the resilver completes. + */ + if (vd->vdev_rebuild_txg != 0) + return (B_TRUE); + + /* + * DTL_MISSING is set for all prior txgs when a resilver + * is started in spa_vdev_attach(). + */ + if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) + return (B_TRUE); + + /* + * Consult the DTL on the relevant vdev. Either a vdev + * leaf or spare/replace mirror child may be returned so + * we must recursively call vdev_draid_missing_impl(). + */ + vd = vdev_draid_spare_get_child(vd, physical_offset); + if (vd == NULL) + return (B_TRUE); + + return (vdev_draid_partial(vd, physical_offset, txg, size)); + } + + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); +} + +/* + * Determine if the vdev is readable at the given offset. + */ +boolean_t +vdev_draid_readable(vdev_t *vd, uint64_t physical_offset) +{ + if (vd->vdev_ops == &vdev_draid_spare_ops) { + vd = vdev_draid_spare_get_child(vd, physical_offset); + if (vd == NULL) + return (B_FALSE); + } + + if (vd->vdev_ops == &vdev_spare_ops || + vd->vdev_ops == &vdev_replacing_ops) { + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (!vdev_readable(cvd)) + continue; + + if (vdev_draid_readable(cvd, physical_offset)) + return (B_TRUE); + } + + return (B_FALSE); + } + + return (vdev_readable(vd)); +} + +/* + * Returns the first distributed spare found under the provided vdev tree. + */ +static vdev_t * +vdev_draid_find_spare(vdev_t *vd) +{ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (vd); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *svd = vdev_draid_find_spare(vd->vdev_child[c]); + if (svd != NULL) + return (svd); + } + + return (NULL); +} + +/* + * Returns B_TRUE if the passed in vdev is currently "faulted". + * Faulted, in this context, means that the vdev represents a + * replacing or sparing vdev tree. 
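The DTL helpers above recurse through spare and replacing vdevs because a distributed spare has no redundancy of its own; which leaf actually holds the data depends on the offset. A toy sketch of that recursion over a simplified tree follows; the toy_vdev_t type and helper are invented for illustration and ignore offsets, DTLs and rebuild state.

#include <stddef.h>

typedef struct toy_vdev {
	int tv_leaf;			/* 1 for a leaf vdev */
	int tv_readable;		/* leaf only */
	struct toy_vdev **tv_children;
	int tv_nchildren;
} toy_vdev_t;

/* A group node is readable if any of its children is readable. */
static int
toy_draid_readable(const toy_vdev_t *vd)
{
	if (vd->tv_leaf)
		return (vd->tv_readable);

	for (int c = 0; c < vd->tv_nchildren; c++) {
		if (toy_draid_readable(vd->tv_children[c]))
			return (1);
	}
	return (0);
}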
+ */ +static boolean_t +vdev_draid_faulted(vdev_t *vd, uint64_t physical_offset) +{ + if (vd->vdev_ops == &vdev_draid_spare_ops) { + vd = vdev_draid_spare_get_child(vd, physical_offset); + if (vd == NULL) + return (B_FALSE); + + /* + * After resolving the distributed spare to a leaf vdev + * check the parent to determine if it's "faulted". + */ + vd = vd->vdev_parent; + } + + return (vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); +} + +/* + * Determine if the dRAID block at the logical offset is degraded. + * Used by sequential resilver. + */ +static boolean_t +vdev_draid_group_degraded(vdev_t *vd, uint64_t offset) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset); + + uint64_t groupstart, perm; + uint64_t physical_offset = vdev_draid_logical_to_physical(vd, + offset, &perm, &groupstart); + + uint8_t *base; + uint64_t iter; + vdev_draid_get_perm(vdc, perm, &base, &iter); + + for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { + uint64_t c = (groupstart + i) % vdc->vdc_ndisks; + uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c); + vdev_t *cvd = vd->vdev_child[cid]; + + /* Group contains a faulted vdev. */ + if (vdev_draid_faulted(cvd, physical_offset)) + return (B_TRUE); + + /* + * Always check groups with active distributed spares + * because any vdev failure in the pool will affect them. + */ + if (vdev_draid_find_spare(cvd) != NULL) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Determine if the txg is missing. Used by healing resilver. + */ +static boolean_t +vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg, + uint64_t size) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset); + + uint64_t groupstart, perm; + uint64_t physical_offset = vdev_draid_logical_to_physical(vd, + offset, &perm, &groupstart); + + uint8_t *base; + uint64_t iter; + vdev_draid_get_perm(vdc, perm, &base, &iter); + + for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { + uint64_t c = (groupstart + i) % vdc->vdc_ndisks; + uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c); + vdev_t *cvd = vd->vdev_child[cid]; + + /* Transaction group is known to be partially replicated. */ + if (vdev_draid_partial(cvd, physical_offset, txg, size)) + return (B_TRUE); + + /* + * Always check groups with active distributed spares + * because any vdev failure in the pool will affect them. + */ + if (vdev_draid_find_spare(cvd) != NULL) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Find the smallest child asize and largest sector size to calculate the + * available capacity. Distributed spares are ignored since their capacity + * is also based of the minimum child size in the top-level dRAID. 
+ */ +static void +vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep, + uint64_t *logical_ashiftp, uint64_t *physical_ashiftp) +{ + uint64_t logical_ashift = 0, physical_ashift = 0; + uint64_t asize = 0, max_asize = 0; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + continue; + + asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; + max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; + logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); + physical_ashift = MAX(physical_ashift, + cvd->vdev_physical_ashift); + } + + *asizep = asize; + *max_asizep = max_asize; + *logical_ashiftp = logical_ashift; + *physical_ashiftp = physical_ashift; +} + +/* + * Open spare vdevs. + */ +static boolean_t +vdev_draid_open_spares(vdev_t *vd) +{ + return (vd->vdev_ops == &vdev_draid_spare_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); +} + +/* + * Open all children, excluding spares. + */ +static boolean_t +vdev_draid_open_children(vdev_t *vd) +{ + return (!vdev_draid_open_spares(vd)); +} + +/* + * Open a top-level dRAID vdev. + */ +static int +vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + uint64_t nparity = vdc->vdc_nparity; + int open_errors = 0; + + if (nparity > VDEV_DRAID_MAXPARITY || + vd->vdev_children < nparity + 1) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * First open the normal children then the distributed spares. This + * ordering is important to ensure the distributed spares calculate + * the correct psize in the event that the dRAID vdevs were expanded. + */ + vdev_open_children_subset(vd, vdev_draid_open_children); + vdev_open_children_subset(vd, vdev_draid_open_spares); + + /* Verify enough of the children are available to continue. */ + for (int c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c]->vdev_open_error != 0) { + if ((++open_errors) > nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (SET_ERROR(ENXIO)); + } + } + } + + /* + * Allocatable capacity is the sum of the space on all children less + * the number of distributed spares rounded down to last full row + * and then to the last full group. An additional 32MB of scratch + * space is reserved at the end of each child for use by the dRAID + * expansion feature. + */ + uint64_t child_asize, child_max_asize; + vdev_draid_calculate_asize(vd, &child_asize, &child_max_asize, + logical_ashift, physical_ashift); + + /* + * Should be unreachable since the minimum child size is 64MB, but + * we want to make sure an underflow absolutely cannot occur here. + */ + if (child_asize < VDEV_DRAID_REFLOW_RESERVE || + child_max_asize < VDEV_DRAID_REFLOW_RESERVE) { + return (SET_ERROR(ENXIO)); + } + + child_asize = ((child_asize - VDEV_DRAID_REFLOW_RESERVE) / + VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT; + child_max_asize = ((child_max_asize - VDEV_DRAID_REFLOW_RESERVE) / + VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT; + + *asize = (((child_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) * + vdc->vdc_groupsz); + *max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) * + vdc->vdc_groupsz); + + return (0); +} + +/* + * Close a top-level dRAID vdev. 
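The capacity math in vdev_draid_open() above can be sketched in isolation: drop the per-child scratch reserve, round down to whole rows, scale by the data-bearing disk count, then trim to whole groups. The 32MB reserve figure comes from the comment above, the 16MB row height matches the worked example earlier in the file, and both constants plus the function name here are stand-ins for illustration.

#include <stdint.h>

#define	SKETCH_ROWHEIGHT	(16ULL << 20)	/* per-disk row, bytes */
#define	SKETCH_REFLOW_RESERVE	(32ULL << 20)	/* per-child scratch */

static uint64_t
draid_usable_asize_sketch(uint64_t child_asize, uint64_t ndisks,
    uint64_t groupsz)
{
	/* Drop the expansion scratch space and any partial row. */
	child_asize = ((child_asize - SKETCH_REFLOW_RESERVE) /
	    SKETCH_ROWHEIGHT) * SKETCH_ROWHEIGHT;

	/* Scale to the whole dRAID and trim to a whole number of groups. */
	return (((child_asize * ndisks) / groupsz) * groupsz);
}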
+ */ +static void +vdev_draid_close(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c] != NULL) + vdev_close(vd->vdev_child[c]); + } +} + +/* + * Return the maximum asize for a rebuild zio in the provided range + * given the following constraints. A dRAID chunks may not: + * + * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or + * - Span dRAID redundancy groups. + */ +static uint64_t +vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, + uint64_t max_segment) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + uint64_t ashift = vd->vdev_ashift; + uint64_t ndata = vdc->vdc_ndata; + uint64_t psize = MIN(P2ROUNDUP(max_segment * ndata, 1 << ashift), + SPA_MAXBLOCKSIZE); + + ASSERT3U(vdev_draid_get_astart(vd, start), ==, start); + ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0); + + /* Chunks must evenly span all data columns in the group. */ + psize = (((psize >> ashift) / ndata) * ndata) << ashift; + uint64_t chunk_size = MIN(asize, vdev_psize_to_asize(vd, psize)); + + /* Reduce the chunk size to the group space remaining. */ + uint64_t group = vdev_draid_offset_to_group(vd, start); + uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start; + chunk_size = MIN(chunk_size, left); + + ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0); + ASSERT3U(vdev_draid_offset_to_group(vd, start), ==, + vdev_draid_offset_to_group(vd, start + chunk_size - 1)); + + return (chunk_size); +} + +/* + * Align the start of the metaslab to the group width and slightly reduce + * its size to a multiple of the group width. Since full stripe writes are + * required by dRAID this space is unallocable. Furthermore, aligning the + * metaslab start is important for vdev initialize and TRIM which both operate + * on metaslab boundaries which vdev_xlate() expects to be aligned. + */ +static void +vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + uint64_t sz = vdc->vdc_groupwidth << vd->vdev_ashift; + uint64_t astart = vdev_draid_get_astart(vd, *ms_start); + uint64_t asize = ((*ms_size - (astart - *ms_start)) / sz) * sz; + + *ms_start = astart; + *ms_size = asize; + + ASSERT0(*ms_start % sz); + ASSERT0(*ms_size % sz); +} + +/* + * Add virtual dRAID spares to the list of valid spares. In order to accomplish + * this the existing array must be freed and reallocated with the additional + * entries. + */ +int +vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp, + uint64_t next_vdev_id) +{ + uint64_t draid_nspares = 0; + uint64_t ndraid = 0; + int error; + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_t *cvd = vd->vdev_child[i]; + + if (cvd->vdev_ops == &vdev_draid_ops) { + vdev_draid_config_t *vdc = cvd->vdev_tsd; + draid_nspares += vdc->vdc_nspares; + ndraid++; + } + } + + if (draid_nspares == 0) { + *ndraidp = ndraid; + return (0); + } + + nvlist_t **old_spares, **new_spares; + uint_t old_nspares; + error = nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &old_spares, &old_nspares); + if (error) + old_nspares = 0; + + /* Allocate memory and copy of the existing spares. */ + new_spares = kmem_alloc(sizeof (nvlist_t *) * + (draid_nspares + old_nspares), KM_SLEEP); + for (uint_t i = 0; i < old_nspares; i++) + new_spares[i] = fnvlist_dup(old_spares[i]); + + /* Add new distributed spares to ZPOOL_CONFIG_SPARES. 
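The metaslab adjustment above is another small rounding exercise: push the start up to the next group-width boundary and trim the size to a multiple of it, since partial stripes can never be allocated. A sketch with an invented name:

#include <stdint.h>

static void
draid_metaslab_init_sketch(uint64_t groupwidth, uint64_t ashift,
    uint64_t *ms_start, uint64_t *ms_size)
{
	uint64_t sz = groupwidth << ashift;
	uint64_t astart = ((*ms_start + sz - 1) / sz) * sz;

	*ms_size = ((*ms_size - (astart - *ms_start)) / sz) * sz;
	*ms_start = astart;
}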
*/ + uint64_t n = old_nspares; + for (uint64_t vdev_id = 0; vdev_id < vd->vdev_children; vdev_id++) { + vdev_t *cvd = vd->vdev_child[vdev_id]; + char path[64]; + + if (cvd->vdev_ops != &vdev_draid_ops) + continue; + + vdev_draid_config_t *vdc = cvd->vdev_tsd; + uint64_t nspares = vdc->vdc_nspares; + uint64_t nparity = vdc->vdc_nparity; + + for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) { + bzero(path, sizeof (path)); + (void) snprintf(path, sizeof (path) - 1, + "%s%llu-%llu-%llu", VDEV_TYPE_DRAID, + (u_longlong_t)nparity, + (u_longlong_t)next_vdev_id + vdev_id, + (u_longlong_t)spare_id); + + nvlist_t *spare = fnvlist_alloc(); + fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, path); + fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_DRAID_SPARE); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_TOP_GUID, + cvd->vdev_guid); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_SPARE_ID, + spare_id); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_LOG, 0); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_WHOLE_DISK, 1); + fnvlist_add_uint64(spare, ZPOOL_CONFIG_ASHIFT, + cvd->vdev_ashift); + + new_spares[n] = spare; + n++; + } + } + + if (n > 0) { + (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES); + fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + new_spares, n); + } + + for (int i = 0; i < n; i++) + nvlist_free(new_spares[i]); + + kmem_free(new_spares, sizeof (*new_spares) * n); + *ndraidp = ndraid; + + return (0); +} + +/* + * Determine if any portion of the provided block resides on a child vdev + * with a dirty DTL and therefore needs to be resilvered. + */ +static boolean_t +vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t asize = vdev_draid_asize(vd, psize); + + if (phys_birth == TXG_UNKNOWN) { + /* + * Sequential resilver. There is no meaningful phys_birth + * for this block, we can only determine if block resides + * in a degraded group in which case it must be resilvered. + */ + ASSERT3U(vdev_draid_offset_to_group(vd, offset), ==, + vdev_draid_offset_to_group(vd, offset + asize - 1)); + + return (vdev_draid_group_degraded(vd, offset)); + } else { + /* + * Healing resilver. TXGs not in DTL_PARTIAL are intact, + * as are blocks in non-degraded groups. + */ + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + + if (vdev_draid_group_missing(vd, offset, phys_birth, 1)) + return (B_TRUE); + + /* The block may span groups in which case check both. 
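The distributed spare names assembled above follow the pattern draid<parity>-<vdev id>-<spare id>; a user-space sketch of the same formatting, assuming VDEV_TYPE_DRAID expands to the string "draid":

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* e.g. nparity=2, vdev_id=1, spare_id=0 -> "draid2-1-0" */
static void
draid_spare_name_sketch(char *buf, size_t len, uint64_t nparity,
    uint64_t vdev_id, uint64_t spare_id)
{
	(void) snprintf(buf, len, "draid%" PRIu64 "-%" PRIu64 "-%" PRIu64,
	    nparity, vdev_id, spare_id);
}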
*/ + if (vdev_draid_offset_to_group(vd, offset) != + vdev_draid_offset_to_group(vd, offset + asize - 1)) { + if (vdev_draid_group_missing(vd, + offset + asize, phys_birth, 1)) + return (B_TRUE); + } + + return (B_FALSE); + } +} + +static boolean_t +vdev_draid_rebuilding(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg) + return (B_TRUE); + + for (int i = 0; i < vd->vdev_children; i++) { + if (vdev_draid_rebuilding(vd->vdev_child[i])) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + +static void +vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col) +{ +#ifdef ZFS_DEBUG + range_seg64_t logical_rs, physical_rs, remain_rs; + logical_rs.rs_start = rr->rr_offset; + logical_rs.rs_end = logical_rs.rs_start + + vdev_draid_asize(vd, rr->rr_size); + + raidz_col_t *rc = &rr->rr_col[col]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); + ASSERT(vdev_xlate_is_empty(&remain_rs)); + ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); + ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); + ASSERT3U(rc->rc_offset + rc->rc_size, ==, physical_rs.rs_end); +#endif +} + +/* + * For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. A gang ABD is allocated by vdev_draid_map_alloc() + * if a skip sector needs to be added to a column. + */ +static void +vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + raidz_map_t *rm = zio->io_vsd; + + vdev_raidz_generate_parity_row(rm, rr); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + /* + * Empty columns are zero filled and included in the parity + * calculation and therefore must be written. + */ + ASSERT3U(rc->rc_size, !=, 0); + + /* Verify physical to logical translation */ + vdev_draid_io_verify(vd, rr, c); + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], rc->rc_offset, + rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, + 0, vdev_raidz_child_done, rc)); + } +} + +/* + * For read operations: + * 1. The vdev_draid_map_alloc() function will create a minimal raidz + * mapping for the read based on the zio->io_flags. There are two + * possible mappings either 1) a normal read, or 2) a scrub/resilver. + * 2. Create the zio read operations. This will include all parity + * columns and skip sectors for a scrub/resilver. + */ +static void +vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + + /* Sequential rebuild must do IO at redundancy group boundary. */ + IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0); + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last. Any errors along the way will force us to read the parity. + * For scrub/resilver IOs which verify skip sectors, a gang ABD will + * have been allocated to store them and rc->rc_size is increased. 
+ */ + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (!vdev_draid_readable(cvd, rc->rc_offset)) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(ENXIO); + rc->rc_tried = 1; + rc->rc_skipped = 1; + continue; + } + + if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(ESTALE); + rc->rc_skipped = 1; + continue; + } + + /* + * Empty columns may be read during vdev_draid_io_done(). + * Only skip them after the readable and missing checks + * verify they are available. + */ + if (rc->rc_size == 0) { + rc->rc_skipped = 1; + continue; + } + + if (zio->io_flags & ZIO_FLAG_RESILVER) { + vdev_t *svd; + + /* + * If this child is a distributed spare then the + * offset might reside on the vdev being replaced. + * In which case this data must be written to the + * new device. Failure to do so would result in + * checksum errors when the old device is detached + * and the pool is scrubbed. + */ + if ((svd = vdev_draid_find_spare(cvd)) != NULL) { + svd = vdev_draid_spare_get_child(svd, + rc->rc_offset); + if (svd && (svd->vdev_ops == &vdev_spare_ops || + svd->vdev_ops == &vdev_replacing_ops)) { + rc->rc_repair = 1; + } + } + + /* + * Always issue a repair IO to this child when its + * a spare or replacing vdev with an active rebuild. + */ + if ((cvd->vdev_ops == &vdev_spare_ops || + cvd->vdev_ops == &vdev_replacing_ops) && + vdev_draid_rebuilding(cvd)) { + rc->rc_repair = 1; + } + } + } + + /* + * Either a parity or data column is missing this means a repair + * may be attempted by vdev_draid_io_done(). Expand the raid map + * to read in empty columns which are needed along with the parity + * during reconstruction. + */ + if ((rr->rr_missingdata > 0 || rr->rr_missingparity > 0) && + rr->rr_nempty > 0 && rr->rr_abd_empty == NULL) { + vdev_draid_map_alloc_empty(zio, rr); + } + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error || rc->rc_size == 0) + continue; + + if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + } +} + +/* + * Start an IO operation to a dRAID vdev. + */ +static void +vdev_draid_io_start(zio_t *zio) +{ + vdev_t *vd __maybe_unused = zio->io_vd; + raidz_map_t *rm; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset)); + + rm = vdev_draid_map_alloc(zio); + + if (zio->io_type == ZIO_TYPE_WRITE) { + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_draid_io_start_write(zio, rm->rm_row[i]); + } + } else { + ASSERT(zio->io_type == ZIO_TYPE_READ); + + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_draid_io_start_read(zio, rm->rm_row[i]); + } + } + + zio_execute(zio); +} + +/* + * Complete an IO operation on a dRAID vdev. The raidz logic can be applied + * to dRAID since the layout is fully described by the raidz_map_t. 
+ */ +static void +vdev_draid_io_done(zio_t *zio) +{ + vdev_raidz_io_done(zio); +} + +static void +vdev_draid_state_change(vdev_t *vd, int faulted, int degraded) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + ASSERT(vd->vdev_ops == &vdev_draid_ops); + + if (faulted > vdc->vdc_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +static void +vdev_draid_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) +{ + vdev_t *raidvd = cvd->vdev_parent; + ASSERT(raidvd->vdev_ops == &vdev_draid_ops); + + vdev_draid_config_t *vdc = raidvd->vdev_tsd; + uint64_t ashift = raidvd->vdev_top->vdev_ashift; + + /* Make sure the offsets are block-aligned */ + ASSERT0(logical_rs->rs_start % (1 << ashift)); + ASSERT0(logical_rs->rs_end % (1 << ashift)); + + uint64_t logical_start = logical_rs->rs_start; + uint64_t logical_end = logical_rs->rs_end; + + /* + * Unaligned ranges must be skipped. All metaslabs are correctly + * aligned so this should not happen, but this case is handled in + * case it's needed by future callers. + */ + uint64_t astart = vdev_draid_get_astart(raidvd, logical_start); + if (astart != logical_start) { + physical_rs->rs_start = logical_start; + physical_rs->rs_end = logical_start; + remain_rs->rs_start = MIN(astart, logical_end); + remain_rs->rs_end = logical_end; + return; + } + + /* + * Unlike with mirrors and raidz a dRAID logical range can map + * to multiple non-contiguous physical ranges. This is handled by + * limiting the size of the logical range to a single group and + * setting the remain argument such that it describes the remaining + * unmapped logical range. This is stricter than absolutely + * necessary but helps simplify the logic below. + */ + uint64_t group = vdev_draid_offset_to_group(raidvd, logical_start); + uint64_t nextstart = vdev_draid_group_to_offset(raidvd, group + 1); + if (logical_end > nextstart) + logical_end = nextstart; + + /* Find the starting offset for each vdev in the group */ + uint64_t perm, groupstart; + uint64_t start = vdev_draid_logical_to_physical(raidvd, + logical_start, &perm, &groupstart); + uint64_t end = start; + + uint8_t *base; + uint64_t iter, id; + vdev_draid_get_perm(vdc, perm, &base, &iter); + + /* + * Check if the passed child falls within the group. If it does + * update the start and end to reflect the physical range. + * Otherwise, leave them unmodified which will result in an empty + * (zero-length) physical range being returned. + */ + for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) { + uint64_t c = (groupstart + i) % vdc->vdc_ndisks; + + if (c == 0 && i != 0) { + /* the group wrapped, increment the start */ + start += VDEV_DRAID_ROWHEIGHT; + end = start; + } + + id = vdev_draid_permute_id(vdc, base, iter, c); + if (id == cvd->vdev_id) { + uint64_t b_size = (logical_end >> ashift) - + (logical_start >> ashift); + ASSERT3U(b_size, >, 0); + end = start + ((((b_size - 1) / + vdc->vdc_groupwidth) + 1) << ashift); + break; + } + } + physical_rs->rs_start = start; + physical_rs->rs_end = end; + + /* + * Only top-level vdevs are allowed to set remain_rs because + * when .vdev_op_xlate() is called for their children the full + * logical range is not provided by vdev_xlate(). 
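The translation above deliberately maps at most one redundancy group per call and hands the untranslated tail back through remain_rs, relying on vdev_xlate_walk() to iterate. A simplified sketch of that splitting convention, using a flat group size rather than the real dRAID layout math:

#include <stdint.h>

typedef struct {
	uint64_t rs_start;
	uint64_t rs_end;
} seg_t;

/*
 * Clamp a logical range to the group containing rs_start and report the
 * untranslated remainder; remain is empty when the range fits in one
 * group.  group_size is a placeholder, not the dRAID group geometry.
 */
static void
xlate_one_group(const seg_t *logical, uint64_t group_size, seg_t *mapped,
    seg_t *remain)
{
	uint64_t group_end =
	    (logical->rs_start / group_size + 1) * group_size;
	uint64_t end = (logical->rs_end < group_end) ?
	    logical->rs_end : group_end;

	mapped->rs_start = logical->rs_start;
	mapped->rs_end = end;
	remain->rs_start = end;
	remain->rs_end = logical->rs_end;
}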
+ */ + remain_rs->rs_start = logical_end; + remain_rs->rs_end = logical_rs->rs_end; + + ASSERT3U(physical_rs->rs_start, <=, logical_start); + ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, + logical_end - logical_start); +} + +/* + * Add dRAID specific fields to the config nvlist. + */ +static void +vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + vdev_draid_config_t *vdc = vd->vdev_tsd; + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdc->vdc_nparity); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, vdc->vdc_ndata); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, vdc->vdc_nspares); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, vdc->vdc_ngroups); +} + +/* + * Initialize private dRAID specific fields from the nvlist. + */ +static int +vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + uint64_t ndata, nparity, nspares, ngroups; + int error; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata)) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) || + nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { + return (SET_ERROR(EINVAL)); + } + + uint_t children; + nvlist_t **child; + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0 || children == 0 || + children > VDEV_DRAID_MAX_CHILDREN) { + return (SET_ERROR(EINVAL)); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) || + nspares > 100 || nspares > (children - (ndata + nparity))) { + return (SET_ERROR(EINVAL)); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) || + ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) { + return (SET_ERROR(EINVAL)); + } + + /* + * Validate the minimum number of children exist per group for the + * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4). + */ + if (children < (ndata + nparity + nspares)) + return (SET_ERROR(EINVAL)); + + /* + * Create the dRAID configuration using the pool nvlist configuration + * and the fixed mapping for the correct number of children. + */ + vdev_draid_config_t *vdc; + const draid_map_t *map; + + error = vdev_draid_lookup_map(children, &map); + if (error) + return (SET_ERROR(EINVAL)); + + vdc = kmem_zalloc(sizeof (*vdc), KM_SLEEP); + vdc->vdc_ndata = ndata; + vdc->vdc_nparity = nparity; + vdc->vdc_nspares = nspares; + vdc->vdc_children = children; + vdc->vdc_ngroups = ngroups; + vdc->vdc_nperms = map->dm_nperms; + + error = vdev_draid_generate_perms(map, &vdc->vdc_perms); + if (error) { + kmem_free(vdc, sizeof (*vdc)); + return (SET_ERROR(EINVAL)); + } + + /* + * Derived constants. 
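Plugging a concrete layout into the arithmetic computed just below makes the invariants easier to check: for a hypothetical draid2 configuration with 4 data disks, 10 children and 1 distributed spare, the group width is 6 and 9 disks carry data, so the group count must be a multiple of 3 for the rows to tile evenly. A standalone sketch of that calculation, with ROWHEIGHT standing in for VDEV_DRAID_ROWHEIGHT (whose value is not shown in this hunk):

#include <stdio.h>
#include <stdint.h>

#define	ROWHEIGHT	(1ULL << 24)	/* placeholder row height */

int
main(void)
{
	uint64_t ndata = 4, nparity = 2, nspares = 1, children = 10;
	uint64_t ngroups = 3;	/* smallest count tiling 6-wide rows on 9 disks */

	uint64_t groupwidth = ndata + nparity;		/* 6 */
	uint64_t ndisks = children - nspares;		/* 9 */
	uint64_t groupsz = groupwidth * ROWHEIGHT;
	uint64_t devslicesz = (groupsz * ngroups) / ndisks;

	/* Mirrors the ASSERTs below: each device slice is whole rows. */
	(void) printf("groupwidth=%llu ndisks=%llu slice_rows=%llu\n",
	    (unsigned long long)groupwidth, (unsigned long long)ndisks,
	    (unsigned long long)(devslicesz / ROWHEIGHT));
	return (0);
}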
+ */ + vdc->vdc_groupwidth = vdc->vdc_ndata + vdc->vdc_nparity; + vdc->vdc_ndisks = vdc->vdc_children - vdc->vdc_nspares; + vdc->vdc_groupsz = vdc->vdc_groupwidth * VDEV_DRAID_ROWHEIGHT; + vdc->vdc_devslicesz = (vdc->vdc_groupsz * vdc->vdc_ngroups) / + vdc->vdc_ndisks; + + ASSERT3U(vdc->vdc_groupwidth, >=, 2); + ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks); + ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT); + ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT); + ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0); + ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) % + vdc->vdc_ndisks, ==, 0); + + *tsd = vdc; + + return (0); +} + +static void +vdev_draid_fini(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + vmem_free(vdc->vdc_perms, sizeof (uint8_t) * + vdc->vdc_children * vdc->vdc_nperms); + kmem_free(vdc, sizeof (*vdc)); +} + +static uint64_t +vdev_draid_nparity(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + return (vdc->vdc_nparity); +} + +static uint64_t +vdev_draid_ndisks(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + return (vdc->vdc_ndisks); +} + +vdev_ops_t vdev_draid_ops = { + .vdev_op_init = vdev_draid_init, + .vdev_op_fini = vdev_draid_fini, + .vdev_op_open = vdev_draid_open, + .vdev_op_close = vdev_draid_close, + .vdev_op_asize = vdev_draid_asize, + .vdev_op_min_asize = vdev_draid_min_asize, + .vdev_op_min_alloc = vdev_draid_min_alloc, + .vdev_op_io_start = vdev_draid_io_start, + .vdev_op_io_done = vdev_draid_io_done, + .vdev_op_state_change = vdev_draid_state_change, + .vdev_op_need_resilver = vdev_draid_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_draid_xlate, + .vdev_op_rebuild_asize = vdev_draid_rebuild_asize, + .vdev_op_metaslab_init = vdev_draid_metaslab_init, + .vdev_op_config_generate = vdev_draid_config_generate, + .vdev_op_nparity = vdev_draid_nparity, + .vdev_op_ndisks = vdev_draid_ndisks, + .vdev_op_type = VDEV_TYPE_DRAID, + .vdev_op_leaf = B_FALSE, +}; + + +/* + * A dRAID distributed spare is a virtual leaf vdev which is included in the + * parent dRAID configuration. The last N columns of the dRAID permutation + * table are used to determine on which dRAID children a specific offset + * should be written. These spare leaf vdevs can only be used to replace + * faulted children in the same dRAID configuration. + */ + +/* + * Distributed spare state. All fields are set when the distributed spare is + * first opened and are immutable. + */ +typedef struct { + vdev_t *vds_draid_vdev; /* top-level parent dRAID vdev */ + uint64_t vds_top_guid; /* top-level parent dRAID guid */ + uint64_t vds_spare_id; /* spare id (0 - vdc->vdc_nspares-1) */ +} vdev_draid_spare_t; + +/* + * Returns the parent dRAID vdev to which the distributed spare belongs. + * This may be safely called even when the vdev is not open. + */ +vdev_t * +vdev_draid_spare_get_parent(vdev_t *vd) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + if (vds->vds_draid_vdev != NULL) + return (vds->vds_draid_vdev); + + return (vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev, + vds->vds_top_guid)); +} + +/* + * A dRAID space is active when it's the child of a vdev using the + * vdev_spare_ops, vdev_replacing_ops or vdev_draid_ops. 
+ */ +static boolean_t +vdev_draid_spare_is_active(vdev_t *vd) +{ + vdev_t *pvd = vd->vdev_parent; + + if (pvd != NULL && (pvd->vdev_ops == &vdev_spare_ops || + pvd->vdev_ops == &vdev_replacing_ops || + pvd->vdev_ops == &vdev_draid_ops)) { + return (B_TRUE); + } else { + return (B_FALSE); + } +} + +/* + * Given a dRAID distribute spare vdev, returns the physical child vdev + * on which the provided offset resides. This may involve recursing through + * multiple layers of distributed spares. Note that offset is relative to + * this vdev. + */ +vdev_t * +vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + /* The vdev is closed */ + if (vds->vds_draid_vdev == NULL) + return (NULL); + + vdev_t *tvd = vds->vds_draid_vdev; + vdev_draid_config_t *vdc = tvd->vdev_tsd; + + ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops); + ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares); + + uint8_t *base; + uint64_t iter; + uint64_t perm = physical_offset / vdc->vdc_devslicesz; + + vdev_draid_get_perm(vdc, perm, &base, &iter); + + uint64_t cid = vdev_draid_permute_id(vdc, base, iter, + (tvd->vdev_children - 1) - vds->vds_spare_id); + vdev_t *cvd = tvd->vdev_child[cid]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + return (vdev_draid_spare_get_child(cvd, physical_offset)); + + return (cvd); +} + +/* ARGSUSED */ +static void +vdev_draid_spare_close(vdev_t *vd) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + vds->vds_draid_vdev = NULL; +} + +/* + * Opening a dRAID spare device is done by looking up the associated dRAID + * top-level vdev guid from the spare configuration. + */ +static int +vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + uint64_t asize, max_asize; + + vdev_t *tvd = vdev_lookup_by_guid(rvd, vds->vds_top_guid); + if (tvd == NULL) { + /* + * When spa_vdev_add() is labeling new spares the + * associated dRAID is not attached to the root vdev + * nor does this spare have a parent. Simulate a valid + * device in order to allow the label to be initialized + * and the distributed spare added to the configuration. + */ + if (vd->vdev_parent == NULL) { + *psize = *max_psize = SPA_MINDEVSIZE; + *logical_ashift = *physical_ashift = ASHIFT_MIN; + return (0); + } + + return (SET_ERROR(EINVAL)); + } + + vdev_draid_config_t *vdc = tvd->vdev_tsd; + if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL) + return (SET_ERROR(EINVAL)); + + if (vds->vds_spare_id >= vdc->vdc_nspares) + return (SET_ERROR(EINVAL)); + + /* + * Neither tvd->vdev_asize or tvd->vdev_max_asize can be used here + * because the caller may be vdev_draid_open() in which case the + * values are stale as they haven't yet been updated by vdev_open(). + * To avoid this always recalculate the dRAID asize and max_asize. + */ + vdev_draid_calculate_asize(tvd, &asize, &max_asize, + logical_ashift, physical_ashift); + + *psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + *max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + + vds->vds_draid_vdev = tvd; + + return (0); +} + +/* + * Completed distributed spare IO. Store the result in the parent zio + * as if it had performed the operation itself. Only the first error is + * preserved if there are multiple errors. 
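vdev_draid_spare_get_child() above boils down to two index computations: the permutation row is chosen by which device slice the physical offset falls in, and spare i is always looked up at position (children - 1 - i) of that permutation, which is why the comment describes the spares as the last N columns of the permutation table. A simplified sketch of just that indexing, with placeholder names:

#include <stdint.h>

/*
 * Index math only; the real code then resolves the position through
 * vdev_draid_permute_id() and may recurse through nested spares.
 */
static void
draid_spare_indexing(uint64_t physical_offset, uint64_t devslicesz,
    uint64_t children, uint64_t spare_id, uint64_t *perm, uint64_t *position)
{
	*perm = physical_offset / devslicesz;
	*position = (children - 1) - spare_id;
}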
+ */ +static void +vdev_draid_spare_child_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + /* + * IOs are issued to non-writable vdevs in order to keep their + * DTLs accurate. However, we don't want to propagate the + * error in to the distributed spare's DTL. When resilvering + * vdev_draid_need_resilver() will consult the relevant DTL + * to determine if the data is missing and must be repaired. + */ + if (!vdev_writeable(zio->io_vd)) + return; + + if (pio->io_error == 0) + pio->io_error = zio->io_error; +} + +/* + * Returns a valid label nvlist for the distributed spare vdev. This is + * used to bypass the IO pipeline to avoid the complexity of constructing + * a complete label with valid checksum to return when read. + */ +nvlist_t * +vdev_draid_read_config_spare(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + spa_aux_vdev_t *sav = &spa->spa_spares; + uint64_t guid = vd->vdev_guid; + + nvlist_t *nv = fnvlist_alloc(); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa)); + fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_STATE, + vdev_draid_spare_is_active(vd) ? + POOL_STATE_ACTIVE : POOL_STATE_SPARE); + + /* Set the vdev guid based on the vdev list in sav_count. */ + for (int i = 0; i < sav->sav_count; i++) { + if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops && + strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) { + guid = sav->sav_vdevs[i]->vdev_guid; + break; + } + } + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid); + + return (nv); +} + +/* + * Handle any ioctl requested of the distributed spare. Only flushes + * are supported in which case all children must be flushed. + */ +static int +vdev_draid_spare_ioctl(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + int error = 0; + + if (zio->io_cmd == DKIOCFLUSHWRITECACHE) { + for (int c = 0; c < vd->vdev_children; c++) { + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[c], zio->io_offset, zio->io_abd, + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } else { + error = SET_ERROR(ENOTSUP); + } + + return (error); +} + +/* + * Initiate an IO to the distributed spare. For normal IOs this entails using + * the zio->io_offset and permutation table to calculate which child dRAID vdev + * is responsible for the data. Then passing along the zio to that child to + * perform the actual IO. The label ranges are not stored on disk and require + * some special handling which is described below. + */ +static void +vdev_draid_spare_io_start(zio_t *zio) +{ + vdev_t *cvd = NULL, *vd = zio->io_vd; + vdev_draid_spare_t *vds = vd->vdev_tsd; + uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (vds == NULL) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + zio->io_error = vdev_draid_spare_ioctl(zio); + break; + + case ZIO_TYPE_WRITE: + if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) { + /* + * Accept probe IOs and config writers to simulate the + * existence of an on disk label. 
vdev_label_sync(), + * vdev_uberblock_sync() and vdev_copy_uberblocks() + * skip the distributed spares. This only leaves + * vdev_label_init() which is allowed to succeed to + * avoid adding special cases the function. + */ + if (zio->io_flags & ZIO_FLAG_PROBE || + zio->io_flags & ZIO_FLAG_CONFIG_WRITER) { + zio->io_error = 0; + } else { + zio->io_error = SET_ERROR(EIO); + } + } else { + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } + break; + + case ZIO_TYPE_READ: + if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) { + /* + * Accept probe IOs to simulate the existence of a + * label. vdev_label_read_config() bypasses the + * pipeline to read the label configuration and + * vdev_uberblock_load() skips distributed spares + * when attempting to locate the best uberblock. + */ + if (zio->io_flags & ZIO_FLAG_PROBE) { + zio->io_error = 0; + } else { + zio->io_error = SET_ERROR(EIO); + } + } else { + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL || !vdev_readable(cvd)) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } + break; + + case ZIO_TYPE_TRIM: + /* The vdev label ranges are never trimmed */ + ASSERT0(VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)); + + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL || !cvd->vdev_has_trim) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + break; + + default: + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + + zio_execute(zio); +} + +/* ARGSUSED */ +static void +vdev_draid_spare_io_done(zio_t *zio) +{ +} + +/* + * Lookup the full spare config in spa->spa_spares.sav_config and + * return the top_guid and spare_id for the named spare. 
+ */ +static int +vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp, + uint64_t *spare_idp) +{ + nvlist_t **spares; + uint_t nspares; + int error; + + if ((spa->spa_spares.sav_config == NULL) || + (nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)) { + return (SET_ERROR(ENOENT)); + } + + char *spare_name; + error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name); + if (error != 0) + return (SET_ERROR(EINVAL)); + + for (int i = 0; i < nspares; i++) { + nvlist_t *spare = spares[i]; + uint64_t top_guid, spare_id; + char *type, *path; + + /* Skip non-distributed spares */ + error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type); + if (error != 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0) + continue; + + /* Skip spares with the wrong name */ + error = nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH, &path); + if (error != 0 || strcmp(path, spare_name) != 0) + continue; + + /* Found the matching spare */ + error = nvlist_lookup_uint64(spare, + ZPOOL_CONFIG_TOP_GUID, &top_guid); + if (error == 0) { + error = nvlist_lookup_uint64(spare, + ZPOOL_CONFIG_SPARE_ID, &spare_id); + } + + if (error != 0) { + return (SET_ERROR(EINVAL)); + } else { + *top_guidp = top_guid; + *spare_idp = spare_id; + return (0); + } + } + + return (SET_ERROR(ENOENT)); +} + +/* + * Initialize private dRAID spare specific fields from the nvlist. + */ +static int +vdev_draid_spare_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + vdev_draid_spare_t *vds; + uint64_t top_guid = 0; + uint64_t spare_id; + + /* + * In the normal case check the list of spares stored in the spa + * to lookup the top_guid and spare_id for provided spare config. + * When creating a new pool or adding vdevs the spare list is not + * yet populated and the values are provided in the passed config. 
+ */ + if (vdev_draid_spare_lookup(spa, nv, &top_guid, &spare_id) != 0) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_TOP_GUID, + &top_guid) != 0) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_SPARE_ID, + &spare_id) != 0) + return (SET_ERROR(EINVAL)); + } + + vds = kmem_alloc(sizeof (vdev_draid_spare_t), KM_SLEEP); + vds->vds_draid_vdev = NULL; + vds->vds_top_guid = top_guid; + vds->vds_spare_id = spare_id; + + *tsd = vds; + + return (0); +} + +static void +vdev_draid_spare_fini(vdev_t *vd) +{ + kmem_free(vd->vdev_tsd, sizeof (vdev_draid_spare_t)); +} + +static void +vdev_draid_spare_config_generate(vdev_t *vd, nvlist_t *nv) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vds->vds_top_guid); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_SPARE_ID, vds->vds_spare_id); +} + +vdev_ops_t vdev_draid_spare_ops = { + .vdev_op_init = vdev_draid_spare_init, + .vdev_op_fini = vdev_draid_spare_fini, + .vdev_op_open = vdev_draid_spare_open, + .vdev_op_close = vdev_draid_spare_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_draid_spare_io_start, + .vdev_op_io_done = vdev_draid_spare_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = vdev_draid_spare_config_generate, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DRAID_SPARE, + .vdev_op_leaf = B_TRUE, +}; diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c b/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c new file mode 100644 index 000000000000..fe1a75c11312 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c @@ -0,0 +1,40 @@ +/* + * Xorshift Pseudo Random Number Generator based on work by David Blackman + * and Sebastiano Vigna (vigna@acm.org). + * + * "Further scramblings of Marsaglia's xorshift generators" + * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf + * http://prng.di.unimi.it/xoroshiro128plusplus.c + * + * To the extent possible under law, the author has dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide. This software is distributed without any warranty. + * + * See <http://creativecommons.org/publicdomain/zero/1.0/>. + * + * This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid, + * small-state generators. It is extremely (sub-ns) fast and it passes all + * tests we are aware of, but its state space is large enough only for + * mild parallelism. 
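A standalone sketch of how a generator with this update rule can be seeded and used, for example to drive a simple shuffle; the step function is copied from the vdev_draid_rand() implementation that follows, while the seed values and the shuffle itself are purely illustrative and not taken from the patch:

#include <stdio.h>
#include <stdint.h>

static inline uint64_t
rotl(const uint64_t x, int k)
{
	return ((x << k) | (x >> (64 - k)));
}

/* Same update rule as vdev_draid_rand() below. */
static uint64_t
xoro_next(uint64_t *s)
{
	const uint64_t s0 = s[0];
	uint64_t s1 = s[1];
	const uint64_t result = rotl(s0 + s1, 17) + s0;

	s1 ^= s0;
	s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21);
	s[1] = rotl(s1, 28);

	return (result);
}

int
main(void)
{
	/* Arbitrary nonzero 128-bit seed. */
	uint64_t state[2] = { 0x9E3779B97F4A7C15ULL, 0xBF58476D1CE4E5B9ULL };
	uint8_t perm[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

	/* Fisher-Yates shuffle driven by the generator. */
	for (int i = 7; i > 0; i--) {
		int j = (int)(xoro_next(state) % (uint64_t)(i + 1));
		uint8_t t = perm[i];

		perm[i] = perm[j];
		perm[j] = t;
	}

	for (int i = 0; i < 8; i++)
		(void) printf("%u ", perm[i]);
	(void) printf("\n");
	return (0);
}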
+ */ + +#include <sys/vdev_draid.h> + +static inline uint64_t rotl(const uint64_t x, int k) +{ + return (x << k) | (x >> (64 - k)); +} + +uint64_t +vdev_draid_rand(uint64_t *s) +{ + const uint64_t s0 = s[0]; + uint64_t s1 = s[1]; + const uint64_t result = rotl(s0 + s1, 17) + s0; + + s1 ^= s0; + s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b + s[1] = rotl(s1, 28); // c + + return (result); +} diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c index 12ee393bd5db..07d1c922a50c 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c @@ -239,6 +239,7 @@ typedef struct indirect_child { */ struct indirect_child *ic_duplicate; list_node_t ic_node; /* node on is_unique_child */ + int ic_error; /* set when a child does not contain the data */ } indirect_child_t; /* @@ -1272,15 +1273,14 @@ vdev_indirect_read_all(zio_t *zio) continue; /* - * Note, we may read from a child whose DTL - * indicates that the data may not be present here. - * While this might result in a few i/os that will - * likely return incorrect data, it simplifies the - * code since we can treat scrub and resilver - * identically. (The incorrect data will be - * detected and ignored when we verify the - * checksum.) + * If a child is missing the data, set ic_error. Used + * in vdev_indirect_repair(). We perform the read + * nevertheless which provides the opportunity to + * reconstruct the split block if at all possible. */ + if (vdev_dtl_contains(ic->ic_vdev, DTL_MISSING, + zio->io_txg, 1)) + ic->ic_error = SET_ERROR(ESTALE); ic->ic_data = abd_alloc_sametype(zio->io_abd, is->is_size); @@ -1410,7 +1410,11 @@ vdev_indirect_checksum_error(zio_t *zio, * Issue repair i/os for any incorrect copies. We do this by comparing * each split segment's correct data (is_good_child's ic_data) with each * other copy of the data. If they differ, then we overwrite the bad data - * with the good copy. Note that we do this without regard for the DTL's, + * with the good copy. The DTL is checked in vdev_indirect_read_all() and + * if a vdev is missing a copy of the data we set ic_error and the read is + * performed. This provides the opportunity to reconstruct the split block + * if at all possible. ic_error is checked here and if set it suppresses + * incrementing the checksum counter. Aside from this DTLs are not checked, * which simplifies this code and also issues the optimal number of writes * (based on which copies actually read bad data, as opposed to which we * think might be wrong). For the same reason, we always use @@ -1447,6 +1451,14 @@ vdev_indirect_repair(zio_t *zio) ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, NULL, NULL)); + /* + * If ic_error is set the current child does not have + * a copy of the data, so suppress incrementing the + * checksum counter. 
+ */ + if (ic->ic_error == ESTALE) + continue; + vdev_indirect_checksum_error(zio, is, ic); } } @@ -1844,9 +1856,13 @@ vdev_indirect_io_done(zio_t *zio) } vdev_ops_t vdev_indirect_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_indirect_open, .vdev_op_close = vdev_indirect_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_indirect_io_start, .vdev_op_io_done = vdev_indirect_io_done, .vdev_op_state_change = NULL, @@ -1855,6 +1871,11 @@ vdev_ops_t vdev_indirect_ops = { .vdev_op_rele = NULL, .vdev_op_remap = vdev_indirect_remap, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c index 7ff7fffcc80e..083ad2861b5b 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c +++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c @@ -121,6 +121,8 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { vd->vdev_initialize_action_time = gethrestime_sec(); } + + vdev_initializing_state_t old_state = vd->vdev_initialize_state; vd->vdev_initialize_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); @@ -138,8 +140,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) "vdev=%s suspended", vd->vdev_path); break; case VDEV_INITIALIZE_CANCELED: - spa_history_log_internal(spa, "initialize", tx, - "vdev=%s canceled", vd->vdev_path); + if (old_state == VDEV_INITIALIZE_ACTIVE || + old_state == VDEV_INITIALIZE_SUSPENDED) + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s canceled", vd->vdev_path); break; case VDEV_INITIALIZE_COMPLETE: spa_history_log_internal(spa, "initialize", tx, @@ -318,6 +322,32 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data) } static void +vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) +{ + uint64_t *last_rs_end = (uint64_t *)arg; + + if (physical_rs->rs_end > *last_rs_end) + *last_rs_end = physical_rs->rs_end; +} + +static void +vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs) +{ + vdev_t *vd = (vdev_t *)arg; + + uint64_t size = physical_rs->rs_end - physical_rs->rs_start; + vd->vdev_initialize_bytes_est += size; + + if (vd->vdev_initialize_last_offset > physical_rs->rs_end) { + vd->vdev_initialize_bytes_done += size; + } else if (vd->vdev_initialize_last_offset > physical_rs->rs_start && + vd->vdev_initialize_last_offset < physical_rs->rs_end) { + vd->vdev_initialize_bytes_done += + vd->vdev_initialize_last_offset - physical_rs->rs_start; + } +} + +static void vdev_initialize_calculate_progress(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || @@ -331,28 +361,35 @@ vdev_initialize_calculate_progress(vdev_t *vd) metaslab_t *msp = vd->vdev_top->vdev_ms[i]; mutex_enter(&msp->ms_lock); - uint64_t ms_free = msp->ms_size - - metaslab_allocated_space(msp); - - if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) - ms_free /= vd->vdev_top->vdev_children; + uint64_t ms_free = (msp->ms_size - + metaslab_allocated_space(msp)) / + vdev_get_ndisks(vd->vdev_top); /* * Convert the metaslab range to a physical range * 
on our vdev. We use this to determine if we are * in the middle of this metaslab range. */ - range_seg64_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; - vdev_xlate(vd, &logical_rs, &physical_rs); + /* Metaslab space after this offset has not been initialized */ + vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs); if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { + } + + /* Metaslab space before this offset has been initialized */ + uint64_t last_rs_end = physical_rs.rs_end; + if (!vdev_xlate_is_empty(&remain_rs)) { + vdev_xlate_walk(vd, &remain_rs, + vdev_initialize_xlate_last_rs_end, &last_rs_end); + } + + if (vd->vdev_initialize_last_offset > last_rs_end) { vd->vdev_initialize_bytes_done += ms_free; vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); @@ -374,22 +411,9 @@ vdev_initialize_calculate_progress(vdev_t *vd) &where)) { logical_rs.rs_start = rs_get_start(rs, rt); logical_rs.rs_end = rs_get_end(rs, rt); - vdev_xlate(vd, &logical_rs, &physical_rs); - - uint64_t size = physical_rs.rs_end - - physical_rs.rs_start; - vd->vdev_initialize_bytes_est += size; - if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += size; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_start && - vd->vdev_initialize_last_offset < - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += - vd->vdev_initialize_last_offset - - physical_rs.rs_start; - } + + vdev_xlate_walk(vd, &logical_rs, + vdev_initialize_xlate_progress, vd); } mutex_exit(&msp->ms_lock); } @@ -419,55 +443,48 @@ vdev_initialize_load(vdev_t *vd) return (err); } -/* - * Convert the logical range into a physical range and add it to our - * avl tree. - */ static void -vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) +vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs) { vdev_t *vd = arg; - range_seg64_t logical_rs, physical_rs; - logical_rs.rs_start = start; - logical_rs.rs_end = start + size; - - ASSERT(vd->vdev_ops->vdev_op_leaf); - vdev_xlate(vd, &logical_rs, &physical_rs); - - IMPLY(vd->vdev_top == vd, - logical_rs.rs_start == physical_rs.rs_start); - IMPLY(vd->vdev_top == vd, - logical_rs.rs_end == physical_rs.rs_end); /* Only add segments that we have not visited yet */ - if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) + if (physical_rs->rs_end <= vd->vdev_initialize_last_offset) return; /* Pick up where we left off mid-range. 
*/ - if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { + if (vd->vdev_initialize_last_offset > physical_rs->rs_start) { zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " "(%llu, %llu)", vd->vdev_path, - (u_longlong_t)physical_rs.rs_start, - (u_longlong_t)physical_rs.rs_end, + (u_longlong_t)physical_rs->rs_start, + (u_longlong_t)physical_rs->rs_end, (u_longlong_t)vd->vdev_initialize_last_offset, - (u_longlong_t)physical_rs.rs_end); - ASSERT3U(physical_rs.rs_end, >, + (u_longlong_t)physical_rs->rs_end); + ASSERT3U(physical_rs->rs_end, >, vd->vdev_initialize_last_offset); - physical_rs.rs_start = vd->vdev_initialize_last_offset; + physical_rs->rs_start = vd->vdev_initialize_last_offset; } - ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); - /* - * With raidz, it's possible that the logical range does not live on - * this leaf vdev. We only add the physical range to this vdev's if it - * has a length greater than 0. - */ - if (physical_rs.rs_end > physical_rs.rs_start) { - range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, - physical_rs.rs_end - physical_rs.rs_start); - } else { - ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); - } + ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start); + + range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start, + physical_rs->rs_end - physical_rs->rs_start); +} + +/* + * Convert the logical range into a physical range and add it to our + * avl tree. + */ +static void +vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) +{ + vdev_t *vd = arg; + range_seg64_t logical_rs; + logical_rs.rs_start = start; + logical_rs.rs_end = start + size; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg); } static void diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index d063b77ea836..fbd117d2d9ae 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -142,6 +142,7 @@ #include <sys/zap.h> #include <sys/vdev.h> #include <sys/vdev_impl.h> +#include <sys/vdev_draid.h> #include <sys/uberblock_impl.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> @@ -453,31 +454,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_fru != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); - if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); + if (vd->vdev_ops->vdev_op_config_generate != NULL) + vd->vdev_ops->vdev_op_config_generate(vd, nv); - /* - * Make sure someone hasn't managed to sneak a fancy new vdev - * into a crufty old storage pool. - */ - ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity <= 2 && - spa_version(spa) >= SPA_VERSION_RAIDZ2) || - (vd->vdev_nparity <= 3 && - spa_version(spa) >= SPA_VERSION_RAIDZ3)); - - /* - * Note that we'll add the nparity tag even on storage pools - * that only support a single parity device -- older software - * will just ignore it. 
- */ - fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); - } - - if (vd->vdev_wholedisk != -1ULL) + if (vd->vdev_wholedisk != -1ULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); + } if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); @@ -785,6 +768,14 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); + /* + * The label for a dRAID distributed spare is not stored on disk. + * Instead it is generated when needed which allows us to bypass + * the pipeline when reading the config from the label. + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (vdev_draid_read_config_spare(vd)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); vp = abd_to_buf(vp_abd); @@ -1497,7 +1488,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (int c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) && + vd->vdev_ops != &vdev_draid_spare_ops) { for (int l = 0; l < VDEV_LABELS; l++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, @@ -1586,6 +1578,13 @@ vdev_copy_uberblocks(vdev_t *vd) SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); + /* + * No uberblocks are stored on distributed spares, they may be + * safely skipped when expanding a leaf vdev. + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER); ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); @@ -1647,6 +1646,15 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, if (!vdev_writeable(vd)) return; + /* + * There's no need to write uberblocks to a distributed spare, they + * are already stored on all the leaves of the parent dRAID. For + * this same reason vdev_uberblock_load_impl() skips distributed + * spares when reading uberblocks. + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + /* If the vdev was expanded, need to copy uberblock rings. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && vd->vdev_copy_uberblocks == B_TRUE) { @@ -1764,6 +1772,14 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes, return; /* + * The top-level config never needs to be written to a distributed + * spare. When read vdev_dspare_label_read_config() will generate + * the config for the vdev_label_read_config(). + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + + /* * Generate a label describing the top-level config to which we belong. */ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); diff --git a/sys/contrib/openzfs/module/zfs/vdev_mirror.c b/sys/contrib/openzfs/module/zfs/vdev_mirror.c index 71b5adbbd06a..71ca43caec1a 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_mirror.c +++ b/sys/contrib/openzfs/module/zfs/vdev_mirror.c @@ -33,6 +33,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_scan.h> #include <sys/vdev_impl.h> +#include <sys/vdev_draid.h> #include <sys/zio.h> #include <sys/abd.h> #include <sys/fs/zfs.h> @@ -99,7 +100,6 @@ vdev_mirror_stat_fini(void) /* * Virtual device vector for mirroring. 
*/ - typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; @@ -108,6 +108,7 @@ typedef struct mirror_child { uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; + uint8_t mc_rebuilding; } mirror_child_t; typedef struct mirror_map { @@ -115,6 +116,7 @@ typedef struct mirror_map { int mm_preferred_cnt; int mm_children; boolean_t mm_resilvering; + boolean_t mm_rebuilding; boolean_t mm_root; mirror_child_t mm_child[]; } mirror_map_t; @@ -239,6 +241,21 @@ vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) return (load + zfs_vdev_mirror_rotating_seek_inc); } +static boolean_t +vdev_mirror_rebuilding(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg) + return (B_TRUE); + + for (int i = 0; i < vd->vdev_children; i++) { + if (vdev_mirror_rebuilding(vd->vdev_child[i])) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * Avoid inlining the function to keep vdev_mirror_io_start(), which * is this functions only caller, as small as possible on the stack. @@ -356,6 +373,9 @@ vdev_mirror_map_init(zio_t *zio) mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; + + if (vdev_mirror_rebuilding(mc->mc_vd)) + mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE; } } @@ -493,12 +513,37 @@ vdev_mirror_preferred_child_randomize(zio_t *zio) return (mm->mm_preferred[p]); } +static boolean_t +vdev_mirror_child_readable(mirror_child_t *mc) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_readable(vd, mc->mc_offset)); + else + return (vdev_readable(vd)); +} + +static boolean_t +vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_missing(vd, mc->mc_offset, txg, size)); + else + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); +} + /* * Try to find a vdev whose DTL doesn't contain the block we want to read - * preferring vdevs based on determined load. + * preferring vdevs based on determined load. If we can't, try the read on + * any vdev we haven't already tried. * - * Try to find a child whose DTL doesn't contain the block we want to read. - * If we can't, try the read on any vdev we haven't already tried. + * Distributed spares are an exception to the above load rule. They are + * always preferred in order to detect gaps in the distributed spare which + * are created when another disk in the dRAID fails. In order to restore + * redundancy those gaps must be read to trigger the required repair IO. 
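A simplified sketch of the selection rule described above, using placeholder types rather than mirror_child_t: skip unreadable or stale children, take a distributed spare outright if one is present, and otherwise fall back to the lowest-load child.

typedef struct {
	int	c_skip;		/* unreadable or DTL-missing */
	int	c_is_dspare;	/* child is a dRAID distributed spare */
	int	c_load;		/* load estimate, lower is better */
} child_pick_t;

static int
pick_read_child(const child_pick_t *c, int nchildren)
{
	int best = -1;

	for (int i = 0; i < nchildren; i++) {
		if (c[i].c_skip)
			continue;
		if (c[i].c_is_dspare)
			return (i);	/* always preferred */
		if (best == -1 || c[i].c_load < c[best].c_load)
			best = i;
	}
	return (best);	/* -1 means no healthy child */
}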
*/ static int vdev_mirror_child_select(zio_t *zio) @@ -518,20 +563,27 @@ vdev_mirror_child_select(zio_t *zio) if (mc->mc_tried || mc->mc_skipped) continue; - if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) { + if (mc->mc_vd == NULL || + !vdev_mirror_child_readable(mc)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + if (vdev_mirror_child_missing(mc, txg, 1)) { mc->mc_error = SET_ERROR(ESTALE); mc->mc_skipped = 1; mc->mc_speculative = 1; continue; } + if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) { + mm->mm_preferred[0] = c; + mm->mm_preferred_cnt = 1; + break; + } + mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); if (mc->mc_load > lowest_load) continue; @@ -625,11 +677,25 @@ vdev_mirror_io_start(zio_t *zio) while (children--) { mc = &mm->mm_child[c]; + c++; + + /* + * When sequentially resilvering only issue write repair + * IOs to the vdev which is being rebuilt since performance + * is limited by the slowest child. This is an issue for + * faster replacement devices such as distributed spares. + */ + if ((zio->io_priority == ZIO_PRIORITY_REBUILD) && + (zio->io_flags & ZIO_FLAG_IO_REPAIR) && + !(zio->io_flags & ZIO_FLAG_SCRUB) && + mm->mm_rebuilding && !mc->mc_rebuilding) { + continue; + } + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); - c++; } zio_execute(zio); @@ -744,6 +810,8 @@ vdev_mirror_io_done(zio_t *zio) mc = &mm->mm_child[c]; if (mc->mc_error == 0) { + vdev_ops_t *ops = mc->mc_vd->vdev_ops; + if (mc->mc_tried) continue; /* @@ -752,15 +820,16 @@ vdev_mirror_io_done(zio_t *zio) * 1. it's a scrub (in which case we have * tried everything that was healthy) * - or - - * 2. it's an indirect vdev (in which case - * it could point to any other vdev, which - * might have a bad DTL) + * 2. it's an indirect or distributed spare + * vdev (in which case it could point to any + * other vdev, which might have a bad DTL) * - or - * 3. the DTL indicates that this data is * missing from this vdev */ if (!(zio->io_flags & ZIO_FLAG_SCRUB) && - mc->mc_vd->vdev_ops != &vdev_indirect_ops && + ops != &vdev_indirect_ops && + ops != &vdev_draid_spare_ops && !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; @@ -796,50 +865,90 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) } } +/* + * Return the maximum asize for a rebuild zio in the provided range. 
+ */ +static uint64_t +vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, + uint64_t max_segment) +{ + uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift), + SPA_MAXBLOCKSIZE); + + return (MIN(asize, vdev_psize_to_asize(vd, psize))); +} + vdev_ops_t vdev_mirror_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, - .vdev_op_need_resilver = NULL, + .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_replacing_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, - .vdev_op_need_resilver = NULL, + .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_spare_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, - .vdev_op_need_resilver = NULL, + .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/zfs/vdev_missing.c b/sys/contrib/openzfs/module/zfs/vdev_missing.c index ce90df6e8d95..e9145fd012d7 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_missing.c +++ b/sys/contrib/openzfs/module/zfs/vdev_missing.c @@ -81,9 +81,13 @@ vdev_missing_io_done(zio_t *zio) } vdev_ops_t vdev_missing_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_missing_open, .vdev_op_close = vdev_missing_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, 
.vdev_op_state_change = NULL, @@ -92,14 +96,23 @@ vdev_ops_t vdev_missing_ops = { .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; vdev_ops_t vdev_hole_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_missing_open, .vdev_op_close = vdev_missing_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, .vdev_op_state_change = NULL, @@ -108,6 +121,11 @@ vdev_ops_t vdev_hole_ops = { .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index a8ef3d7474c9..02040c3ee198 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -121,16 +121,17 @@ /* * The maximum number of i/os active to each device. Ideally, this will be >= - * the sum of each queue's max_active. It must be at least the sum of each - * queue's min_active. + * the sum of each queue's max_active. */ uint32_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the * number of active i/os is < zfs_vdev_max_active, then the min_active comes - * into play. We will send min_active from each queue, and then select from - * queues in the order defined by zio_priority_t. + * into play. We will send min_active from each queue round-robin, and then + * send from queues in the order defined by zio_priority_t up to max_active. + * Some queues have additional mechanisms to limit number of active I/Os in + * addition to min_active and max_active, see below. * * In general, smaller max_active's will lead to lower latency of synchronous * operations. Larger max_active's may lead to higher overall throughput, @@ -151,7 +152,7 @@ uint32_t zfs_vdev_async_read_max_active = 3; uint32_t zfs_vdev_async_write_min_active = 2; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; -uint32_t zfs_vdev_scrub_max_active = 2; +uint32_t zfs_vdev_scrub_max_active = 3; uint32_t zfs_vdev_removal_min_active = 1; uint32_t zfs_vdev_removal_max_active = 2; uint32_t zfs_vdev_initializing_min_active = 1; @@ -172,6 +173,28 @@ int zfs_vdev_async_write_active_min_dirty_percent = 30; int zfs_vdev_async_write_active_max_dirty_percent = 60; /* + * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), + * the number of concurrently-active I/O's is limited to *_min_active, unless + * the vdev is "idle". When there are no interactive I/Os active (sync or + * async), and zfs_vdev_nia_delay I/Os have completed since the last + * interactive I/O, then the vdev is considered to be "idle", and the number + * of concurrently-active non-interactive I/O's is increased to *_max_active. 
+ */ +uint_t zfs_vdev_nia_delay = 5; + +/* + * Some HDDs tend to prioritize sequential I/O so high that concurrent + * random I/O latency reaches several seconds. On some HDDs it happens + * even if sequential I/Os are submitted one at a time, and so setting + * *_max_active to 1 does not help. To prevent non-interactive I/Os, like + * scrub, from monopolizing the device no more than zfs_vdev_nia_credit + * I/Os can be sent while there are outstanding incomplete interactive + * I/Os. This enforced wait ensures the HDD services the interactive I/O + * within a reasonable amount of time. + */ +uint_t zfs_vdev_nia_credit = 5; + +/* * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. * For read I/Os, we also aggregate across small adjacency gaps; for writes * we include spans of optional I/Os to aid aggregation at the disk even when @@ -261,7 +284,7 @@ vdev_queue_timestamp_compare(const void *x1, const void *x2) } static int -vdev_queue_class_min_active(zio_priority_t p) +vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -273,15 +296,19 @@ vdev_queue_class_min_active(zio_priority_t p) case ZIO_PRIORITY_ASYNC_WRITE: return (zfs_vdev_async_write_min_active); case ZIO_PRIORITY_SCRUB: - return (zfs_vdev_scrub_min_active); + return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active : + MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active)); case ZIO_PRIORITY_REMOVAL: - return (zfs_vdev_removal_min_active); + return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active : + MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active)); case ZIO_PRIORITY_INITIALIZING: - return (zfs_vdev_initializing_min_active); + return (vq->vq_ia_active == 0 ?zfs_vdev_initializing_min_active: + MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active)); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_min_active); case ZIO_PRIORITY_REBUILD: - return (zfs_vdev_rebuild_min_active); + return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active : + MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active)); default: panic("invalid priority %u", p); return (0); @@ -311,14 +338,12 @@ vdev_queue_max_async_writes(spa_t *spa) * Sync tasks correspond to interactive user actions. To reduce the * execution time of those actions we push data out as fast as possible. 
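A minimal sketch of the gating these two tunables implement, mirroring the pattern used in the vdev_queue_class_min_active() changes above (names here are placeholders, not the vdev_queue fields): while any interactive I/O is in flight, a non-interactive class may only claim up to its remaining credit.

#include <stdint.h>

static uint32_t
nia_min_active(uint32_t class_min_active, uint32_t ia_active,
    uint32_t nia_credit)
{
	if (ia_active == 0)
		return (class_min_active);

	/* Interactive I/O outstanding: limit to the remaining credit. */
	return (nia_credit < class_min_active ?
	    nia_credit : class_min_active);
}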
*/ - if (spa_has_pending_synctask(spa)) + dirty = dp->dp_dirty_total; + if (dirty > max_bytes || spa_has_pending_synctask(spa)) return (zfs_vdev_async_write_max_active); - dirty = dp->dp_dirty_total; if (dirty < min_bytes) return (zfs_vdev_async_write_min_active); - if (dirty > max_bytes) - return (zfs_vdev_async_write_max_active); /* * linear interpolation: @@ -337,7 +362,7 @@ vdev_queue_max_async_writes(spa_t *spa) } static int -vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) +vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -349,14 +374,34 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) case ZIO_PRIORITY_ASYNC_WRITE: return (vdev_queue_max_async_writes(spa)); case ZIO_PRIORITY_SCRUB: + if (vq->vq_ia_active > 0) { + return (MIN(vq->vq_nia_credit, + zfs_vdev_scrub_min_active)); + } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) + return (MAX(1, zfs_vdev_scrub_min_active)); return (zfs_vdev_scrub_max_active); case ZIO_PRIORITY_REMOVAL: + if (vq->vq_ia_active > 0) { + return (MIN(vq->vq_nia_credit, + zfs_vdev_removal_min_active)); + } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) + return (MAX(1, zfs_vdev_removal_min_active)); return (zfs_vdev_removal_max_active); case ZIO_PRIORITY_INITIALIZING: + if (vq->vq_ia_active > 0) { + return (MIN(vq->vq_nia_credit, + zfs_vdev_initializing_min_active)); + } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) + return (MAX(1, zfs_vdev_initializing_min_active)); return (zfs_vdev_initializing_max_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_max_active); case ZIO_PRIORITY_REBUILD: + if (vq->vq_ia_active > 0) { + return (MIN(vq->vq_nia_credit, + zfs_vdev_rebuild_min_active)); + } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) + return (MAX(1, zfs_vdev_rebuild_min_active)); return (zfs_vdev_rebuild_max_active); default: panic("invalid priority %u", p); @@ -372,17 +417,24 @@ static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { spa_t *spa = vq->vq_vdev->vdev_spa; - zio_priority_t p; + zio_priority_t p, n; if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); - /* find a queue that has not reached its minimum # outstanding i/os */ - for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + /* + * Find a queue that has not reached its minimum # outstanding i/os. + * Do round-robin to reduce starvation due to zfs_vdev_max_active + * and vq_nia_credit limits. 
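A compact sketch of the round-robin scan introduced just below: start one slot past the priority issued last so that, over time, every class reaches its minimum before any class is topped up toward its maximum (the array names are placeholders for the per-class queue state).

static int
next_min_active_class(int last_prio, int nprio, const int *queued,
    const int *active, const int *min_active)
{
	for (int n = 0; n < nprio; n++) {
		int p = (last_prio + n + 1) % nprio;

		if (queued[p] > 0 && active[p] < min_active[p])
			return (p);
	}
	return (-1);	/* fall through to the max_active scan */
}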
+ */ + for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) { + p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE; if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < - vdev_queue_class_min_active(p)) + vdev_queue_class_min_active(vq, p)) { + vq->vq_last_prio = p; return (p); + } } /* @@ -392,8 +444,10 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < - vdev_queue_class_max_active(spa, p)) + vdev_queue_class_max_active(spa, vq, p)) { + vq->vq_last_prio = p; return (p); + } } /* No eligible queued i/os */ @@ -493,6 +547,20 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) } } +static boolean_t +vdev_queue_is_interactive(zio_priority_t p) +{ + switch (p) { + case ZIO_PRIORITY_SCRUB: + case ZIO_PRIORITY_REMOVAL: + case ZIO_PRIORITY_INITIALIZING: + case ZIO_PRIORITY_REBUILD: + return (B_FALSE); + default: + return (B_TRUE); + } +} + static void vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { @@ -502,6 +570,12 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active++; + if (vdev_queue_is_interactive(zio->io_priority)) { + if (++vq->vq_ia_active == 1) + vq->vq_nia_credit = 1; + } else if (vq->vq_ia_active > 0) { + vq->vq_nia_credit--; + } avl_add(&vq->vq_active_tree, zio); if (shk->kstat != NULL) { @@ -520,6 +594,13 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active--; + if (vdev_queue_is_interactive(zio->io_priority)) { + if (--vq->vq_ia_active == 0) + vq->vq_nia_credit = 0; + else + vq->vq_nia_credit = zfs_vdev_nia_credit; + } else if (vq->vq_ia_active == 0) + vq->vq_nia_credit++; avl_remove(&vq->vq_active_tree, zio); if (shk->kstat != NULL) { @@ -593,6 +674,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) return (NULL); + /* + * I/Os to distributed spares are directly dispatched to the dRAID + * leaf vdevs for aggregation. See the comment at the end of the + * zio_vdev_io_start() function. 
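The bookkeeping that feeds the gate lives in vdev_queue_pending_add()/_remove() above. Here is a standalone sketch of the same transitions, with hypothetical names and zfs_vdev_nia_credit folded in as a constant; the real code updates vdev_queue_t fields under vq_lock.

#include <stdbool.h>
#include <stdint.h>

struct nia_state {
	uint32_t ia_active;	/* interactive I/Os in flight */
	int nia_credit;		/* non-interactive I/Os still permitted */
};

static const int nia_credit_refill = 5;	/* plays the role of zfs_vdev_nia_credit */

static void
nia_issue(struct nia_state *st, bool interactive)
{
	if (interactive) {
		/* First interactive I/O clamps the credit to a single slot. */
		if (++st->ia_active == 1)
			st->nia_credit = 1;
	} else if (st->ia_active > 0) {
		/* Non-interactive issue while interactive I/O waits: spend one. */
		st->nia_credit--;
	}
}

static void
nia_complete(struct nia_state *st, bool interactive)
{
	if (interactive) {
		if (--st->ia_active == 0)
			st->nia_credit = 0;	/* gate no longer needed */
		else
			st->nia_credit = nia_credit_refill;
	} else if (st->ia_active == 0) {
		/* Count completed non-interactive I/Os toward the delay threshold. */
		st->nia_credit++;
	}
}

The vq_last_prio round-robin in the selection loop above is orthogonal to this: it simply resumes the priority scan one class past the last one issued, so a class capped by its credit cannot permanently starve the classes behind it.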
+ */ + ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops); + first = last = zio; if (zio->io_type == ZIO_TYPE_READ) @@ -1065,6 +1153,12 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW, "Min active rebuild I/Os per vdev"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW, + "Number of non-interactive I/Os to allow in sequence"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW, + "Number of non-interactive I/Os before _max_active"); + ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW, "Queue depth percentage for each top-level vdev"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index 47312e02f70a..989b90dc2635 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -35,6 +35,7 @@ #include <sys/fm/fs/zfs.h> #include <sys/vdev_raidz.h> #include <sys/vdev_raidz_impl.h> +#include <sys/vdev_draid.h> #ifdef ZFS_DEBUG #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -134,25 +135,51 @@ VDEV_RAIDZ_64MUL_2((x), mask); \ } -void -vdev_raidz_map_free(raidz_map_t *rm) +static void +vdev_raidz_row_free(raidz_row_t *rr) { int c; - for (c = 0; c < rm->rm_firstdatacol; c++) { - abd_free(rm->rm_col[c].rc_abd); + for (c = 0; c < rr->rr_firstdatacol && c < rr->rr_cols; c++) { + abd_free(rr->rr_col[c].rc_abd); - if (rm->rm_col[c].rc_gdata != NULL) - abd_free(rm->rm_col[c].rc_gdata); + if (rr->rr_col[c].rc_gdata != NULL) { + abd_free(rr->rr_col[c].rc_gdata); + } + if (rr->rr_col[c].rc_orig_data != NULL) { + zio_buf_free(rr->rr_col[c].rc_orig_data, + rr->rr_col[c].rc_size); + } } + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + if (rr->rr_col[c].rc_size != 0) { + if (abd_is_gang(rr->rr_col[c].rc_abd)) + abd_free(rr->rr_col[c].rc_abd); + else + abd_put(rr->rr_col[c].rc_abd); + } + if (rr->rr_col[c].rc_orig_data != NULL) { + zio_buf_free(rr->rr_col[c].rc_orig_data, + rr->rr_col[c].rc_size); + } + } + + if (rr->rr_abd_copy != NULL) + abd_free(rr->rr_abd_copy); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - abd_put(rm->rm_col[c].rc_abd); + if (rr->rr_abd_empty != NULL) + abd_free(rr->rr_abd_empty); + + kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); +} - if (rm->rm_abd_copy != NULL) - abd_free(rm->rm_abd_copy); +void +vdev_raidz_map_free(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) + vdev_raidz_row_free(rm->rm_row[i]); - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); + kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } static void @@ -161,10 +188,11 @@ vdev_raidz_map_free_vsd(zio_t *zio) raidz_map_t *rm = zio->io_vsd; ASSERT0(rm->rm_freed); - rm->rm_freed = 1; + rm->rm_freed = B_TRUE; - if (rm->rm_reports == 0) + if (rm->rm_reports == 0) { vdev_raidz_map_free(rm); + } } /*ARGSUSED*/ @@ -175,7 +203,7 @@ vdev_raidz_cksum_free(void *arg, size_t ignored) ASSERT3U(rm->rm_reports, >, 0); - if (--rm->rm_reports == 0 && rm->rm_freed != 0) + if (--rm->rm_reports == 0 && rm->rm_freed) vdev_raidz_map_free(rm); } @@ -186,77 +214,79 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) const size_t c = zcr->zcr_cbinfo; size_t x, offset; - const abd_t *good = NULL; - const abd_t *bad = rm->rm_col[c].rc_abd; - if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); return; } - if (c < rm->rm_firstdatacol) { + 
ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + + const abd_t *good = NULL; + const abd_t *bad = rr->rr_col[c].rc_abd; + + if (c < rr->rr_firstdatacol) { /* * The first time through, calculate the parity blocks for * the good data (this relies on the fact that the good * data never changes for a given logical ZIO) */ - if (rm->rm_col[0].rc_gdata == NULL) { + if (rr->rr_col[0].rc_gdata == NULL) { abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; /* - * Set up the rm_col[]s to generate the parity for + * Set up the rr_col[]s to generate the parity for * good_data, first saving the parity bufs and * replacing them with buffers to hold the result. */ - for (x = 0; x < rm->rm_firstdatacol; x++) { - bad_parity[x] = rm->rm_col[x].rc_abd; - rm->rm_col[x].rc_abd = - rm->rm_col[x].rc_gdata = - abd_alloc_sametype(rm->rm_col[x].rc_abd, - rm->rm_col[x].rc_size); + for (x = 0; x < rr->rr_firstdatacol; x++) { + bad_parity[x] = rr->rr_col[x].rc_abd; + rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata = + abd_alloc_sametype(rr->rr_col[x].rc_abd, + rr->rr_col[x].rc_size); } /* fill in the data columns from good_data */ offset = 0; - for (; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); + for (; x < rr->rr_cols; x++) { + abd_put(rr->rr_col[x].rc_abd); - rm->rm_col[x].rc_abd = + rr->rr_col[x].rc_abd = abd_get_offset_size((abd_t *)good_data, - offset, rm->rm_col[x].rc_size); - offset += rm->rm_col[x].rc_size; + offset, rr->rr_col[x].rc_size); + offset += rr->rr_col[x].rc_size; } /* * Construct the parity from the good data. */ - vdev_raidz_generate_parity(rm); + vdev_raidz_generate_parity_row(rm, rr); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) - rm->rm_col[x].rc_abd = bad_parity[x]; + for (x = 0; x < rr->rr_firstdatacol; x++) + rr->rr_col[x].rc_abd = bad_parity[x]; offset = 0; - for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_offset_size( - rm->rm_abd_copy, offset, - rm->rm_col[x].rc_size); - offset += rm->rm_col[x].rc_size; + for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) { + abd_put(rr->rr_col[x].rc_abd); + rr->rr_col[x].rc_abd = abd_get_offset_size( + rr->rr_abd_copy, offset, + rr->rr_col[x].rc_size); + offset += rr->rr_col[x].rc_size; } } - ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); - good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0, - rm->rm_col[c].rc_size); + ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL); + good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0, + rr->rr_col[c].rc_size); } else { /* adjust good_data to point at the start of our column */ offset = 0; - for (x = rm->rm_firstdatacol; x < c; x++) - offset += rm->rm_col[x].rc_size; + for (x = rr->rr_firstdatacol; x < c; x++) + offset += rr->rr_col[x].rc_size; good = abd_get_offset_size((abd_t *)good_data, offset, - rm->rm_col[c].rc_size); + rr->rr_col[c].rc_size); } /* we drop the ereport if it ends up that the data was good */ @@ -274,10 +304,7 @@ static void vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; - size_t offset; - raidz_map_t *rm = zio->io_vsd; - size_t size; /* set up the report and bump the refcount */ zcr->zcr_cbdata = rm; @@ -287,8 +314,9 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) rm->rm_reports++; ASSERT3U(rm->rm_reports, >, 0); + ASSERT3U(rm->rm_nrows, ==, 1); - if (rm->rm_abd_copy != NULL) + if (rm->rm_row[0]->rr_abd_copy != NULL) return; /* @@ -299,26 +327,30 @@ 
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) * Our parity data is already in separate buffers, so there's no need * to copy them. */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + size_t offset = 0; + size_t size = 0; - size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - size += rm->rm_col[c].rc_size; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) + size += rr->rr_col[c].rc_size; - rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE); + rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE); - for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; - abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, - col->rc_size); + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; + abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy, + offset, col->rc_size); - abd_copy(tmp, col->rc_abd, col->rc_size); + abd_copy(tmp, col->rc_abd, col->rc_size); - abd_put(col->rc_abd); - col->rc_abd = tmp; + abd_put(col->rc_abd); + col->rc_abd = tmp; - offset += col->rc_size; + offset += col->rc_size; + } + ASSERT3U(offset, ==, size); } - ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { @@ -337,7 +369,7 @@ noinline raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t nparity) { - raidz_map_t *rm; + raidz_row_t *rr; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = zio->io_offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ @@ -349,6 +381,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; uint64_t off = 0; + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); + rm->rm_nrows = 1; + /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. @@ -370,8 +406,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, */ tot = s + nparity * (q + (r == 0 ? 0 : 1)); - /* acols: The columns that will be accessed. */ - /* scols: The columns that will be accessed or skipped. */ + /* + * acols: The columns that will be accessed. + * scols: The columns that will be accessed or skipped. + */ if (q == 0) { /* Our I/O request doesn't span all child vdevs. 
*/ acols = bc; @@ -383,65 +421,70 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, ASSERT3U(acols, <=, scols); - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); - - rm->rm_cols = acols; - rm->rm_scols = scols; - rm->rm_bigcols = bc; - rm->rm_skipstart = bc; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - rm->rm_firstdatacol = nparity; - rm->rm_abd_copy = NULL; - rm->rm_reports = 0; - rm->rm_freed = 0; - rm->rm_ecksuminjected = 0; + rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); + rm->rm_row[0] = rr; + + rr->rr_cols = acols; + rr->rr_scols = scols; + rr->rr_bigcols = bc; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = nparity; + rr->rr_abd_copy = NULL; + rr->rr_abd_empty = NULL; + rr->rr_nempty = 0; +#ifdef ZFS_DEBUG + rr->rr_offset = zio->io_offset; + rr->rr_size = zio->io_size; +#endif asize = 0; for (c = 0; c < scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; col = f + c; coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } - rm->rm_col[c].rc_devidx = col; - rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_abd = NULL; - rm->rm_col[c].rc_gdata = NULL; - rm->rm_col[c].rc_error = 0; - rm->rm_col[c].rc_tried = 0; - rm->rm_col[c].rc_skipped = 0; + rc->rc_devidx = col; + rc->rc_offset = coff; + rc->rc_abd = NULL; + rc->rc_gdata = NULL; + rc->rc_orig_data = NULL; + rc->rc_error = 0; + rc->rc_tried = 0; + rc->rc_skipped = 0; + rc->rc_repair = 0; + rc->rc_need_orig_restore = B_FALSE; if (c >= acols) - rm->rm_col[c].rc_size = 0; + rc->rc_size = 0; else if (c < bc) - rm->rm_col[c].rc_size = (q + 1) << ashift; + rc->rc_size = (q + 1) << ashift; else - rm->rm_col[c].rc_size = q << ashift; + rc->rc_size = q << ashift; - asize += rm->rm_col[c].rc_size; + asize += rc->rc_size; } ASSERT3U(asize, ==, tot << ashift); - rm->rm_asize = roundup(asize, (nparity + 1) << ashift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; - ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); - ASSERT3U(rm->rm_nskip, <=, nparity); + rm->rm_skipstart = bc; - for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_abd = - abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); + for (c = 0; c < rr->rr_firstdatacol; c++) + rr->rr_col[c].rc_abd = + abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); - rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, - rm->rm_col[c].rc_size); - off = rm->rm_col[c].rc_size; + rr->rr_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, + rr->rr_col[c].rc_size); + off = rr->rr_col[c].rc_size; for (c = c + 1; c < acols; c++) { - rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, - rm->rm_col[c].rc_size); - off += rm->rm_col[c].rc_size; + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = abd_get_offset_size(zio->io_abd, off, rc->rc_size); + off += rc->rc_size; } /* @@ -464,24 +507,21 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, * skip the first column since at least one data and one parity * column must appear in each row. 
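For orientation, below is a standalone rendering of the single-row geometry being filled in above. The quotient/remainder derivations (q, r, bc) come from the surrounding upstream function rather than this hunk, so treat the sketch as illustrative. As a worked example, a 10-sector write on a 5-wide RAIDZ1 gives q=2, r=2, bc=3, tot=13: one parity and two data columns get 3 sectors each, the remaining two data columns get 2.

#include <stdint.h>
#include <stdio.h>

/* s is the block size in sectors; dcols the child count; nparity the parity count. */
static void
raidz_row_geometry(uint64_t s, uint64_t dcols, uint64_t nparity)
{
	uint64_t q = s / (dcols - nparity);		/* sectors per column, before remainder */
	uint64_t r = s - q * (dcols - nparity);		/* leftover data sectors */
	uint64_t bc = (r == 0 ? 0 : r + nparity);	/* "big" columns holding one extra sector */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
	uint64_t acols = (q == 0 ? bc : dcols);		/* columns actually accessed */

	for (uint64_t c = 0; c < dcols; c++) {
		uint64_t sectors = (c >= acols) ? 0 : (c < bc ? q + 1 : q);
		printf("col %llu: %llu sectors%s\n", (unsigned long long)c,
		    (unsigned long long)sectors, c < nparity ? " (parity)" : "");
	}
	printf("total (data+parity): %llu sectors\n", (unsigned long long)tot);
}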
*/ - ASSERT(rm->rm_cols >= 2); - ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); - if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { - devidx = rm->rm_col[0].rc_devidx; - o = rm->rm_col[0].rc_offset; - rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; - rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; - rm->rm_col[1].rc_devidx = devidx; - rm->rm_col[1].rc_offset = o; + if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + devidx = rr->rr_col[0].rc_devidx; + o = rr->rr_col[0].rc_offset; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[1].rc_devidx = devidx; + rr->rr_col[1].rc_offset = o; if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; - /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); @@ -550,50 +590,43 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private) } static void -vdev_raidz_generate_parity_p(raidz_map_t *rm) +vdev_raidz_generate_parity_p(raidz_row_t *rr) { - uint64_t *p; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + if (c == rr->rr_firstdatacol) { + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_p_func, &pqr); } } } static void -vdev_raidz_generate_parity_pq(raidz_map_t *rm) +vdev_raidz_generate_parity_pq(raidz_row_t *rr) { - uint64_t *p, *q, pcnt, ccnt, mask, i; - int c; - abd_t *src; - - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); - for (i = ccnt; i < pcnt; i++) { + for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; } @@ -601,14 +634,15 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) struct pqr_struct pqr = { p, q, NULL }; ASSERT(ccnt <= pcnt); - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pq_func, &pqr); /* * Treat short columns as though they are 
full of 0s. * Note that there's therefore nothing needed for P. */ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); } } @@ -616,33 +650,29 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) } static void -vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +vdev_raidz_generate_parity_pqr(raidz_row_t *rr) { - uint64_t *p, *q, *r, pcnt, ccnt, mask, i; - int c; - abd_t *src; - - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_R].rc_size); + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_R].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); - (void) memcpy(r, p, rm->rm_col[c].rc_size); + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); + (void) memcpy(r, p, rr->rr_col[c].rc_size); - for (i = ccnt; i < pcnt; i++) { + for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; r[i] = 0; @@ -651,14 +681,15 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) struct pqr_struct pqr = { p, q, r }; ASSERT(ccnt <= pcnt); - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pqr_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); VDEV_RAIDZ_64MUL_4(r[i], mask); } @@ -671,27 +702,38 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) * parity columns available. 
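A byte-wise, standalone sketch of what the P/Q generators above compute; the real code works eight bytes at a time through VDEV_RAIDZ_64MUL_2 and treats short columns as zero-filled, as the comment notes, and the names here are hypothetical.

#include <stddef.h>
#include <stdint.h>

/* GF(2^8) multiply-by-2 with the RAID-Z polynomial x^8+x^4+x^3+x^2+1 (0x11d). */
static uint8_t
gf_mul2(uint8_t a)
{
	return ((uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
}

/*
 * P is the plain XOR of the data columns; Q accumulates Horner-style,
 * doubling the running value in GF(2^8) before folding in the next column.
 */
static void
gen_parity_pq(const uint8_t *const *data, int ncols, size_t len,
    uint8_t *p, uint8_t *q)
{
	for (size_t i = 0; i < len; i++) {
		uint8_t pv = 0, qv = 0;
		for (int c = 0; c < ncols; c++) {
			pv ^= data[c][i];
			qv = gf_mul2(qv) ^ data[c][i];
		}
		p[i] = pv;
		q[i] = qv;
	}
}

Accumulated this way, Q is the sum of each column multiplied by a distinct power of two in GF(2^8), which is what makes the two-failure cases below solvable.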
*/ void -vdev_raidz_generate_parity(raidz_map_t *rm) +vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { + ASSERT3U(rr->rr_cols, !=, 0); + /* Generate using the new math implementation */ - if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) + if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) return; - switch (rm->rm_firstdatacol) { + switch (rr->rr_firstdatacol) { case 1: - vdev_raidz_generate_parity_p(rm); + vdev_raidz_generate_parity_p(rr); break; case 2: - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); break; case 3: - vdev_raidz_generate_parity_pqr(rm); + vdev_raidz_generate_parity_pqr(rr); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration"); } } +void +vdev_raidz_generate_parity(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_generate_parity_row(rm, rr); + } +} + /* ARGSUSED */ static int vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) @@ -809,30 +851,27 @@ vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) } static int -vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; - int c; abd_t *dst, *src; - ASSERT(ntgts == 1); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(x < rm->rm_cols); + ASSERT3U(ntgts, ==, 1); + ASSERT3U(x, >=, rr->rr_firstdatacol); + ASSERT3U(x, <, rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); - ASSERT(rm->rm_col[x].rc_size > 0); + ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); - src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + dst = rr->rr_col[x].rc_abd; - abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size); + abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; if (c == x) continue; @@ -845,7 +884,7 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) } static int -vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; int c, exp; @@ -853,44 +892,44 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) ASSERT(ntgts == 1); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = (c == x) ? 
0 : MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; + dst = rr->rr_col[x].rc_abd; - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { abd_copy(dst, src, size); - if (rm->rm_col[x].rc_size > size) + if (rr->rr_col[x].rc_size > size) { abd_zero_off(dst, size, - rm->rm_col[x].rc_size - size); - + rr->rr_col[x].rc_size - size); + } } else { - ASSERT3U(size, <=, rm->rm_col[x].rc_size); + ASSERT3U(size, <=, rr->rr_col[x].rc_size); (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_q_pre_func, NULL); (void) abd_iterate_func(dst, - size, rm->rm_col[x].rc_size - size, + size, rr->rr_col[x].rc_size - size, vdev_raidz_reconst_q_pre_tail_func, NULL); } } - src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - dst = rm->rm_col[x].rc_abd; - exp = 255 - (rm->rm_cols - 1 - x); + src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + dst = rr->rr_col[x].rc_abd; + exp = 255 - (rr->rr_cols - 1 - x); struct reconst_q_struct rq = { abd_to_buf(src), exp }; - (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, + (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } static int -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; abd_t *pdata, *qdata; @@ -901,10 +940,10 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) ASSERT(ntgts == 2); ASSERT(x < y); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(y < rm->rm_cols); + ASSERT(x >= rr->rr_firstdatacol); + ASSERT(y < rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); + ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); /* * Move the parity data aside -- we're going to compute parity as @@ -913,29 +952,29 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) * parity so we make those columns appear to be full of zeros by * setting their lengths to zero. 
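Before the two-failure P+Q case set up above gets involved, the single-failure P path (vdev_raidz_reconstruct_p(), a little further up) is just the parity XOR run in reverse. A minimal standalone sketch with a hypothetical helper name:

#include <stddef.h>
#include <stdint.h>

/* Recover the missing column as P XORed with every surviving data column. */
static void
reconstruct_from_p(const uint8_t *p, const uint8_t *const *data, int ncols,
    int missing, uint8_t *out, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		uint8_t v = p[i];
		for (int c = 0; c < ncols; c++) {
			if (c != missing)
				v ^= data[c][i];
		}
		out[i] = v;
	}
}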
*/ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - xsize = rm->rm_col[x].rc_size; - ysize = rm->rm_col[y].rc_size; + pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + xsize = rr->rr_col[x].rc_size; + ysize = rr->rr_col[y].rc_size; - rm->rm_col[VDEV_RAIDZ_P].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); - rm->rm_col[x].rc_size = 0; - rm->rm_col[y].rc_size = 0; + rr->rr_col[VDEV_RAIDZ_P].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); + rr->rr_col[x].rc_size = 0; + rr->rr_col[y].rc_size = 0; - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); - rm->rm_col[x].rc_size = xsize; - rm->rm_col[y].rc_size = ysize; + rr->rr_col[x].rc_size = xsize; + rr->rr_col[y].rc_size = ysize; p = abd_to_buf(pdata); q = abd_to_buf(qdata); - pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - xd = rm->rm_col[x].rc_abd; - yd = rm->rm_col[y].rc_abd; + pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + xd = rr->rr_col[x].rc_abd; + yd = rr->rr_col[y].rc_abd; /* * We now have: @@ -953,7 +992,7 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) */ a = vdev_raidz_pow2[255 + x - y]; - b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; + b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; tmp = 255 - vdev_raidz_log2[a ^ 1]; aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; @@ -967,14 +1006,14 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) (void) abd_iterate_func(xd, ysize, xsize - ysize, vdev_raidz_reconst_pq_tail_func, &rpq); - abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ - rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; + rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } @@ -1134,13 +1173,13 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) /* END CSTYLED */ static void -vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, +vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, uint8_t **rows) { int i, j; int pow; - ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); /* * Fill in the missing rows of interest. @@ -1164,7 +1203,7 @@ vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, } static void -vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, +vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **rows, uint8_t **invrows, const uint8_t *used) { int i, j, ii, jj; @@ -1176,10 +1215,10 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, * correspond to data columns. 
*/ for (i = 0; i < nmissing; i++) { - ASSERT3S(used[i], <, rm->rm_firstdatacol); + ASSERT3S(used[i], <, rr->rr_firstdatacol); } for (; i < n; i++) { - ASSERT3S(used[i], >=, rm->rm_firstdatacol); + ASSERT3S(used[i], >=, rr->rr_firstdatacol); } /* @@ -1196,8 +1235,8 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, */ for (i = 0; i < nmissing; i++) { for (j = nmissing; j < n; j++) { - ASSERT3U(used[j], >=, rm->rm_firstdatacol); - jj = used[j] - rm->rm_firstdatacol; + ASSERT3U(used[j], >=, rr->rr_firstdatacol); + jj = used[j] - rr->rr_firstdatacol; ASSERT3S(jj, <, n); invrows[i][j] = rows[i][jj]; rows[i][jj] = 0; @@ -1258,7 +1297,7 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, } static void -vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, +vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **invrows, const uint8_t *used) { int i, j, x, cc, c; @@ -1290,22 +1329,24 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, for (i = 0; i < n; i++) { c = used[i]; - ASSERT3U(c, <, rm->rm_cols); + ASSERT3U(c, <, rr->rr_cols); - src = abd_to_buf(rm->rm_col[c].rc_abd); - ccount = rm->rm_col[c].rc_size; + ccount = rr->rr_col[c].rc_size; + ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); + if (ccount == 0) + continue; + src = abd_to_buf(rr->rr_col[c].rc_abd); for (j = 0; j < nmissing; j++) { - cc = missing[j] + rm->rm_firstdatacol; - ASSERT3U(cc, >=, rm->rm_firstdatacol); - ASSERT3U(cc, <, rm->rm_cols); + cc = missing[j] + rr->rr_firstdatacol; + ASSERT3U(cc, >=, rr->rr_firstdatacol); + ASSERT3U(cc, <, rr->rr_cols); ASSERT3U(cc, !=, c); - dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); - dcount[j] = rm->rm_col[cc].rc_size; + dcount[j] = rr->rr_col[cc].rc_size; + if (dcount[j] != 0) + dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); } - ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); - for (x = 0; x < ccount; x++, src++) { if (*src != 0) log = vdev_raidz_log2[*src]; @@ -1334,16 +1375,14 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, } static int -vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { int n, i, c, t, tt; int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; - uint8_t *p, *pp; size_t psize; - uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; @@ -1354,30 +1393,39 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate - * temporary linear ABDs. + * temporary linear ABDs if any non-linear ABDs are found. 
*/ - if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { - bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); - - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { + if (!abd_is_linear(rr->rr_col[i].rc_abd)) { + bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), + KM_PUSHPAGE); + + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; + + bufs[c] = col->rc_abd; + if (bufs[c] != NULL) { + col->rc_abd = abd_alloc_linear( + col->rc_size, B_TRUE); + abd_copy(col->rc_abd, bufs[c], + col->rc_size); + } + } - bufs[c] = col->rc_abd; - col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); - abd_copy(col->rc_abd, bufs[c], col->rc_size); + break; } } - n = rm->rm_cols - rm->rm_firstdatacol; + n = rr->rr_cols - rr->rr_firstdatacol; /* * Figure out which data columns are missing. */ nmissing_rows = 0; for (t = 0; t < ntgts; t++) { - if (tgts[t] >= rm->rm_firstdatacol) { + if (tgts[t] >= rr->rr_firstdatacol) { missing_rows[nmissing_rows++] = - tgts[t] - rm->rm_firstdatacol; + tgts[t] - rr->rr_firstdatacol; } } @@ -1387,7 +1435,7 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) */ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ASSERT(tt < ntgts); - ASSERT(c < rm->rm_firstdatacol); + ASSERT(c < rr->rr_firstdatacol); /* * Skip any targeted parity columns. @@ -1422,9 +1470,9 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) used[i] = parity_map[i]; } - for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { if (tt < nmissing_rows && - c == missing_rows[tt] + rm->rm_firstdatacol) { + c == missing_rows[tt] + rr->rr_firstdatacol) { tt++; continue; } @@ -1437,18 +1485,18 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) /* * Initialize the interesting rows of the matrix. */ - vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); /* * Invert the matrix. */ - vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, invrows, used); /* * Reconstruct the missing data using the generated matrix. 
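The matrix invert/reconstruct steps above multiply row coefficients in GF(2^8) through the driver's vdev_raidz_pow2[]/vdev_raidz_log2[] tables. Below is a standalone sketch of that arithmetic; the 512-entry antilog table is only a convenience that avoids the modulo-255 the driver performs, so take it as an illustration rather than a copy of the kernel tables.

#include <stdint.h>

static uint8_t gf_exp[512];	/* gf_exp[i] = 2^i in GF(2^8) */
static uint8_t gf_log[256];	/* gf_log[2^i] = i */

static void
gf_tables_init(void)
{
	uint16_t x = 1;
	for (int i = 0; i < 255; i++) {
		gf_exp[i] = (uint8_t)x;
		gf_log[x] = (uint8_t)i;
		x <<= 1;
		if (x & 0x100)
			x ^= 0x11d;	/* same polynomial as the RAID-Z tables */
	}
	for (int i = 255; i < 512; i++)	/* duplicate so summed exponents need no mod */
		gf_exp[i] = gf_exp[i - 255];
}

/* a * b in GF(2^8): add the logs, look up the antilog. */
static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	if (a == 0 || b == 0)
		return (0);
	return (gf_exp[gf_log[a] + gf_log[b]]);
}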
*/ - vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, invrows, used); kmem_free(p, psize); @@ -1457,21 +1505,24 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) * copy back from temporary linear abds and free them */ if (bufs) { - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; - abd_copy(bufs[c], col->rc_abd, col->rc_size); - abd_free(col->rc_abd); + if (bufs[c] != NULL) { + abd_copy(bufs[c], col->rc_abd, col->rc_size); + abd_free(col->rc_abd); + } col->rc_abd = bufs[c]; } - kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); } return (code); } -int -vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) +static int +vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, + const int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; @@ -1480,26 +1531,19 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; - /* - * The tgts list must already be sorted. - */ - for (i = 1; i < nt; i++) { - ASSERT(t[i] > t[i - 1]); - } - - nbadparity = rm->rm_firstdatacol; - nbaddata = rm->rm_cols - nbadparity; + nbadparity = rr->rr_firstdatacol; + nbaddata = rr->rr_cols - nbadparity; ntgts = 0; - for (i = 0, c = 0; c < rm->rm_cols; c++) { - if (c < rm->rm_firstdatacol) + for (i = 0, c = 0; c < rr->rr_cols; c++) { + if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; if (i < nt && c == t[i]) { tgts[ntgts++] = c; i++; - } else if (rm->rm_col[c].rc_error != 0) { + } else if (rr->rr_col[c].rc_error != 0) { tgts[ntgts++] = c; - } else if (c >= rm->rm_firstdatacol) { + } else if (c >= rr->rr_firstdatacol) { nbaddata--; } else { parity_valid[c] = B_TRUE; @@ -1514,7 +1558,7 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) dt = &tgts[nbadparity]; /* Reconstruct using the new math implementation */ - ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); + ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); if (ret != RAIDZ_ORIGINAL_IMPL) return (ret); @@ -1524,29 +1568,29 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) switch (nbaddata) { case 1: if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); + return (vdev_raidz_reconstruct_p(rr, dt, 1)); - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); + return (vdev_raidz_reconstruct_q(rr, dt, 1)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; case 2: - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + return (vdev_raidz_reconstruct_pq(rr, dt, 2)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; } - code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + code = vdev_raidz_reconstruct_general(rr, tgts, ntgts); ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code > 0); return (code); @@ -1556,8 +1600,8 @@ static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { - vdev_t *cvd; - uint64_t nparity = vd->vdev_nparity; + vdev_raidz_t 
*vdrz = vd->vdev_tsd; + uint64_t nparity = vdrz->vd_nparity; int c; int lasterror = 0; int numerrors = 0; @@ -1573,7 +1617,7 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, vdev_open_children(vd); for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) { lasterror = cvd->vdev_open_error; @@ -1602,19 +1646,20 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, static void vdev_raidz_close(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); + for (int c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c] != NULL) + vdev_close(vd->vdev_child[c]); + } } static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { + vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; + uint64_t cols = vdrz->vd_logical_width; + uint64_t nparity = vdrz->vd_nparity; asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); @@ -1623,7 +1668,18 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize) return (asize); } -static void +/* + * The allocatable space for a raidz vdev is N * sizeof(smallest child) + * so each child must provide at least 1/Nth of its asize. + */ +static uint64_t +vdev_raidz_min_asize(vdev_t *vd) +{ + return ((vd->vdev_min_asize + vd->vdev_children - 1) / + vd->vdev_children); +} + +void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; @@ -1634,21 +1690,21 @@ vdev_raidz_child_done(zio_t *zio) } static void -vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) +vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) { #ifdef ZFS_DEBUG - vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - range_seg64_t logical_rs, physical_rs; - logical_rs.rs_start = zio->io_offset; + range_seg64_t logical_rs, physical_rs, remain_rs; + logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(zio->io_vd, zio->io_size); + vdev_raidz_asize(vd, rr->rr_size); - raidz_col_t *rc = &rm->rm_col[col]; + raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - vdev_xlate(cvd, &logical_rs, &physical_rs); + vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); + ASSERT(vdev_xlate_is_empty(&remain_rs)); ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* @@ -1666,106 +1722,82 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) #endif } -/* - * Start an IO operation on a RAIDZ VDev - * - * Outline: - * - For write operations: - * 1. Generate the parity data - * 2. Create child zio write operations to each column's vdev, for both - * data and parity. - * 3. If the column skips any sectors for padding, create optional dummy - * write zio children for those areas to improve aggregation continuity. - * - For read operations: - * 1. Create child zio read operations to each data column's vdev to read - * the range of data required for zio. - * 2. If this is a scrub or resilver operation, or if any of the data - * vdevs have had errors, then create zio read operations to the parity - * columns' VDevs as well. 
- */ static void -vdev_raidz_io_start(zio_t *zio) +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) { vdev_t *vd = zio->io_vd; - vdev_t *tvd = vd->vdev_top; - vdev_t *cvd; - raidz_map_t *rm; - raidz_col_t *rc; + raidz_map_t *rm = zio->io_vsd; int c, i; - rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, - vd->vdev_nparity); - - ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + vdev_raidz_generate_parity_row(rm, rr); - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_generate_parity(rm); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - - /* - * Verify physical to logical translation. - */ - vdev_raidz_io_verify(zio, rm, c); + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } + /* Verify physical to logical translation */ + vdev_raidz_io_verify(vd, rr, c); - /* - * Generate optional I/Os for any skipped sectors to improve - * aggregation contiguity. - */ - for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { - ASSERT(c <= rm->rm_scols); - if (c == rm->rm_scols) - c = 0; - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset + rc->rc_size, NULL, - 1 << tvd->vdev_ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); - } + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], rc->rc_offset, + rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, + 0, vdev_raidz_child_done, rc)); + } - zio_execute(zio); - return; + /* + * Generate optional I/Os for skip sectors to improve aggregation + * contiguity. + */ + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + ASSERT(c <= rr->rr_scols); + if (c == rr->rr_scols) + c = 0; + + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); } +} - ASSERT(zio->io_type == ZIO_TYPE_READ); +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; /* * Iterate over the columns in reverse order so that we hit the parity * last -- any errors along the way will force us to read the parity. 
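The read loop that follows applies a per-column decision that can be distilled into the standalone predicate below (hypothetical helper; the real loop also records ENXIO/ESTALE for unreadable or stale children and bumps the missingdata/missingparity counters as it goes).

#include <stdbool.h>
#include <stdint.h>

static bool
raidz_issue_read(uint64_t col, uint64_t firstdatacol, uint64_t missingdata,
    bool scrub_or_resilver)
{
	/* Data columns are always read. */
	if (col >= firstdatacol)
		return (true);
	/* Parity is read up front only if data is already known missing... */
	if (missingdata > 0)
		return (true);
	/* ...or if the caller is scrubbing/resilvering and wants every column. */
	return (scrub_or_resilver);
}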
*/ - for (c = rm->rm_cols - 1; c >= 0; c--) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (!vdev_readable(cvd)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; else - rm->rm_missingparity++; + rr->rr_missingparity++; rc->rc_error = SET_ERROR(ENXIO); rc->rc_tried = 1; /* don't even try */ rc->rc_skipped = 1; continue; } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; else - rm->rm_missingparity++; + rr->rr_missingparity++; rc->rc_error = SET_ERROR(ESTALE); rc->rc_skipped = 1; continue; } - if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || + if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, @@ -1773,11 +1805,56 @@ vdev_raidz_io_start(zio_t *zio) vdev_raidz_child_done, rc)); } } +} + +/* + * Start an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. + */ +static void +vdev_raidz_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + vdev_raidz_t *vdrz = vd->vdev_tsd; + raidz_map_t *rm; + + rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, + vdrz->vd_logical_width, vdrz->vd_nparity); + + /* + * Until raidz expansion is implemented all maps for a raidz vdev + * contain a single row. + */ + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); + } else { + ASSERT(zio->io_type == ZIO_TYPE_READ); + vdev_raidz_io_start_read(zio, rr); + } zio_execute(zio); } - /* * Report a checksum error for a child of a RAID-Z device. */ @@ -1786,7 +1863,8 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && + zio->io_priority != ZIO_PRIORITY_REBUILD) { zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; @@ -1827,13 +1905,14 @@ raidz_checksum_verify(zio_t *zio) * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the * data we read. If it doesn't, we fire off a checksum error. Return the - * number such failures. + * number of such failures. 
*/ static int -raidz_parity_verify(zio_t *zio, raidz_map_t *rm) +raidz_parity_verify(zio_t *zio, raidz_row_t *rr) { abd_t *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; + raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; blkptr_t *bp = zio->io_bp; @@ -1843,8 +1922,18 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) if (checksum == ZIO_CHECKSUM_NOPARITY) return (ret); - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; + /* + * All data columns must have been successfully read in order + * to use them to generate parity columns for comparison. + */ + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + rc = &rr->rr_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + return (ret); + } + + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; @@ -1852,12 +1941,19 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) abd_copy(orig[c], rc->rc_abd, rc->rc_size); } - vdev_raidz_generate_parity(rm); + /* + * Regenerates parity even for !tried||rc_error!=0 columns. This + * isn't harmful but it does have the side effect of fixing stuff + * we didn't realize was necessary (i.e. even if we return 0). + */ + vdev_raidz_generate_parity_row(rm, rr); + + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; + if (abd_cmp(orig[c], rc->rc_abd) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); @@ -1870,456 +1966,597 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) } static int -vdev_raidz_worst_error(raidz_map_t *rm) +vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rm->rm_cols; c++) - error = zio_worst_error(error, rm->rm_col[c].rc_error); + for (int c = 0; c < rr->rr_cols; c++) + error = zio_worst_error(error, rr->rr_col[c].rc_error); return (error); } -/* - * Iterate over all combinations of bad data and attempt a reconstruction. - * Note that the algorithm below is non-optimal because it doesn't take into - * account how reconstruction is actually performed. For example, with - * triple-parity RAID-Z the reconstruction procedure is the same if column 4 - * is targeted as invalid as if columns 1 and 4 are targeted since in both - * cases we'd only use parity information in column 0. - */ -static int -vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) +static void +vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc; - abd_t *orig[VDEV_RAIDZ_MAXPARITY]; - int tstore[VDEV_RAIDZ_MAXPARITY + 2]; - int *tgts = &tstore[1]; - int curr, next, i, c, n; - int code, ret = 0; + int unexpected_errors = 0; + int parity_errors = 0; + int parity_untried = 0; + int data_errors = 0; - ASSERT(total_errors < rm->rm_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error) { + if (c < rr->rr_firstdatacol) + parity_errors++; + else + data_errors++; + + if (!rc->rc_skipped) + unexpected_errors++; + } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { + parity_untried++; + } + } /* - * This simplifies one edge condition. + * If we read more parity disks than were used for + * reconstruction, confirm that the other parity disks produced + * correct data. + * + * Note that we also regenerate parity when resilvering so we + * can write it out to failed devices later. 
*/ - tgts[-1] = -1; + if (parity_errors + parity_untried < + rr->rr_firstdatacol - data_errors || + (zio->io_flags & ZIO_FLAG_RESILVER)) { + int n = raidz_parity_verify(zio, rr); + unexpected_errors += n; + ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); + } - for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && + (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* - * Initialize the targets array by finding the first n columns - * that contain no error. - * - * If there were no data errors, we need to ensure that we're - * always explicitly attempting to reconstruct at least one - * data column. To do this, we simply push the highest target - * up into the data columns. + * Use the good data we have in hand to repair damaged children. */ - for (c = 0, i = 0; i < n; i++) { - if (i == n - 1 && data_errors == 0 && - c < rm->rm_firstdatacol) { - c = rm->rm_firstdatacol; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if ((rc->rc_error == 0 || rc->rc_size == 0) && + (rc->rc_repair == 0)) { + continue; } - while (rm->rm_col[c].rc_error != 0) { - c++; - ASSERT3S(c, <, rm->rm_cols); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + ZIO_TYPE_WRITE, + zio->io_priority == ZIO_PRIORITY_REBUILD ? + ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? + ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); + } + } +} + +static void +raidz_restore_orig_data(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_need_orig_restore) { + abd_copy_from_buf(rc->rc_abd, + rc->rc_orig_data, rc->rc_size); + rc->rc_need_orig_restore = B_FALSE; } + } + } +} + +/* + * returns EINVAL if reconstruction of the block will not be possible + * returns ECKSUM if this specific reconstruction failed + * returns 0 on successful reconstruction + */ +static int +raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) +{ + raidz_map_t *rm = zio->io_vsd; - tgts[i] = c++; + /* Reconstruct each row */ + for (int r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ + int t = 0; + int dead = 0; + int dead_data = 0; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + ASSERT0(rc->rc_need_orig_restore); + if (rc->rc_error != 0) { + dead++; + if (c >= nparity) + dead_data++; + continue; + } + if (rc->rc_size == 0) + continue; + for (int lt = 0; lt < ntgts; lt++) { + if (rc->rc_devidx == ltgts[lt]) { + if (rc->rc_orig_data == NULL) { + rc->rc_orig_data = + zio_buf_alloc(rc->rc_size); + abd_copy_to_buf( + rc->rc_orig_data, + rc->rc_abd, rc->rc_size); + } + rc->rc_need_orig_restore = B_TRUE; + + dead++; + if (c >= nparity) + dead_data++; + my_tgts[t++] = c; + break; + } + } + } + if (dead > nparity) { + /* reconstruction not possible */ + raidz_restore_orig_data(rm); + return (EINVAL); } + rr->rr_code = 0; + if (dead_data > 0) + rr->rr_code = vdev_raidz_reconstruct_row(rm, rr, + my_tgts, t); + } - /* - * Setting tgts[n] simplifies the other edge condition. 
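The test at the top of vdev_raidz_io_done_verified() above decides whether spare parity reads should be cross-checked against regenerated parity. A standalone restatement of that condition, with a hypothetical helper name:

#include <stdbool.h>

/* nparity corresponds to rr_firstdatacol, the number of parity columns in the row. */
static bool
should_verify_parity(int nparity, int parity_errors, int parity_untried,
    int data_errors, bool resilvering)
{
	/*
	 * Good parity reads in hand: nparity - parity_errors - parity_untried.
	 * If that exceeds the number of data columns that needed
	 * reconstruction, some parity was read but never consumed and can be
	 * checked against regenerated parity.  Resilver always regenerates
	 * so repaired parity can be written back out.
	 */
	return (parity_errors + parity_untried < nparity - data_errors ||
	    resilvering);
}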
- */ - tgts[n] = rm->rm_cols; + /* Check for success */ + if (raidz_checksum_verify(zio) == 0) { + + /* Reconstruction succeeded - report errors */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_need_orig_restore) { + /* + * Note: if this is a parity column, + * we don't really know if it's wrong. + * We need to let + * vdev_raidz_io_done_verified() check + * it, and if we set rc_error, it will + * think that it is a "known" error + * that doesn't need to be checked + * or corrected. + */ + if (rc->rc_error == 0 && + c >= rr->rr_firstdatacol) { + raidz_checksum_error(zio, + rc, rc->rc_gdata); + rc->rc_error = + SET_ERROR(ECKSUM); + } + rc->rc_need_orig_restore = B_FALSE; + } + } - /* - * These buffers were allocated in previous iterations. - */ - for (i = 0; i < n - 1; i++) { - ASSERT(orig[i] != NULL); + vdev_raidz_io_done_verified(zio, rr); } - orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd, - rm->rm_col[0].rc_size); + zio_checksum_verified(zio); - curr = 0; - next = tgts[curr]; + return (0); + } - while (curr != n) { - tgts[curr] = next; - curr = 0; + /* Reconstruction failed - restore original data */ + raidz_restore_orig_data(rm); + return (ECKSUM); +} - /* - * Save off the original data that we're going to - * attempt to reconstruct. - */ - for (i = 0; i < n; i++) { - ASSERT(orig[i] != NULL); - c = tgts[i]; - ASSERT3S(c, >=, 0); - ASSERT3S(c, <, rm->rm_cols); - rc = &rm->rm_col[c]; - abd_copy(orig[i], rc->rc_abd, rc->rc_size); - } +/* + * Iterate over all combinations of N bad vdevs and attempt a reconstruction. + * Note that the algorithm below is non-optimal because it doesn't take into + * account how reconstruction is actually performed. For example, with + * triple-parity RAID-Z the reconstruction procedure is the same if column 4 + * is targeted as invalid as if columns 1 and 4 are targeted since in both + * cases we'd only use parity information in column 0. 
+ * + * The order that we find the various possible combinations of failed + * disks is dictated by these rules: + * - Examine each "slot" (the "i" in tgts[i]) + * - Try to increment this slot (tgts[i] = tgts[i] + 1) + * - if we can't increment because it runs into the next slot, + * reset our slot to the minimum, and examine the next slot + * + * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose + * 3 columns to reconstruct), we will generate the following sequence: + * + * STATE ACTION + * 0 1 2 special case: skip since these are all parity + * 0 1 3 first slot: reset to 0; middle slot: increment to 2 + * 0 2 3 first slot: increment to 1 + * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 + * 0 1 4 first: reset to 0; middle: increment to 2 + * 0 2 4 first: increment to 1 + * 1 2 4 first: reset to 0; middle: increment to 3 + * 0 3 4 first: increment to 1 + * 1 3 4 first: increment to 2 + * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 + * 0 1 5 first: reset to 0; middle: increment to 2 + * 0 2 5 first: increment to 1 + * 1 2 5 first: reset to 0; middle: increment to 3 + * 0 3 5 first: increment to 1 + * 1 3 5 first: increment to 2 + * 2 3 5 first: reset to 0; middle: increment to 4 + * 0 4 5 first: increment to 1 + * 1 4 5 first: increment to 2 + * 2 4 5 first: increment to 3 + * 3 4 5 done + * + * This strategy works for dRAID but is less effecient when there are a large + * number of child vdevs and therefore permutations to check. Furthermore, + * since the raidz_map_t rows likely do not overlap reconstruction would be + * possible as long as there are no more than nparity data errors per row. + * These additional permutations are not currently checked but could be as + * a future improvement. + */ +static int +vdev_raidz_combrec(zio_t *zio) +{ + int nparity = vdev_get_nparity(zio->io_vd); + raidz_map_t *rm = zio->io_vsd; - /* - * Attempt a reconstruction and exit the outer loop on - * success. - */ - code = vdev_raidz_reconstruct(rm, tgts, n); - if (raidz_checksum_verify(zio) == 0) { - - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - ASSERT(rc->rc_error == 0); - if (rc->rc_tried) - raidz_checksum_error(zio, rc, - orig[i]); - rc->rc_error = SET_ERROR(ECKSUM); - } + /* Check if there's enough data to attempt reconstrution. */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + int total_errors = 0; - ret = code; - goto done; - } + for (int c = 0; c < rr->rr_cols; c++) { + if (rr->rr_col[c].rc_error) + total_errors++; + } - /* - * Restore the original data. - */ - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - abd_copy(rc->rc_abd, orig[i], rc->rc_size); - } + if (total_errors > nparity) + return (vdev_raidz_worst_error(rr)); + } - do { + for (int num_failures = 1; num_failures <= nparity; num_failures++) { + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *ltgts = &tstore[1]; /* value is logical child ID */ + + /* Determine number of logical children, n */ + int n = zio->io_vd->vdev_children; + + ASSERT3U(num_failures, <=, nparity); + ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); + + /* Handle corner cases in combrec logic */ + ltgts[-1] = -1; + for (int i = 0; i < num_failures; i++) { + ltgts[i] = i; + } + ltgts[num_failures] = n; + + for (;;) { + int err = raidz_reconstruct(zio, ltgts, num_failures, + nparity); + if (err == EINVAL) { /* - * Find the next valid column after the curr - * position.. 
+				 * Reconstruction not possible with this #
+				 * failures; try more failures.
 				 */
-				for (next = tgts[curr] + 1;
-				    next < rm->rm_cols &&
-				    rm->rm_col[next].rc_error != 0; next++)
-					continue;
+				break;
+			} else if (err == 0)
+				return (0);
+
+			/* Compute next targets to try */
+			for (int t = 0; ; t++) {
+				ASSERT3U(t, <, num_failures);
+				ltgts[t]++;
+				if (ltgts[t] == n) {
+					/* try more failures */
+					ASSERT3U(t, ==, num_failures - 1);
+					break;
+				}
 
-				ASSERT(next <= tgts[curr + 1]);
+				ASSERT3U(ltgts[t], <, n);
+				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
 
 				/*
 				 * If that spot is available, we're done here.
+				 * Try the next combination.
 				 */
-				if (next != tgts[curr + 1])
+				if (ltgts[t] != ltgts[t + 1])
 					break;
 
 				/*
-				 * Otherwise, find the next valid column after
-				 * the previous position.
+				 * Otherwise, reset this tgt to the minimum,
+				 * and move on to the next tgt.
 				 */
-				for (c = tgts[curr - 1] + 1;
-				    rm->rm_col[c].rc_error != 0; c++)
-					continue;
-
-				tgts[curr] = c;
-				curr++;
+				ltgts[t] = ltgts[t - 1] + 1;
+				ASSERT3U(ltgts[t], ==, t);
+			}
 
-			} while (curr != n);
+			/* Increase the number of failures and keep trying. */
+			if (ltgts[num_failures - 1] == n)
+				break;
 		}
 	}
-	n--;
-done:
-	for (i = 0; i < n; i++)
-		abd_free(orig[i]);
-
-	return (ret);
+	return (ECKSUM);
+}
+
+void
+vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
+{
+	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
+		raidz_row_t *rr = rm->rm_row[row];
+		vdev_raidz_reconstruct_row(rm, rr, t, nt);
+	}
 }
 
 /*
- * Complete an IO operation on a RAIDZ VDev
+ * Complete a write IO operation on a RAIDZ VDev
  *
  * Outline:
- * - For write operations:
  *   1. Check for errors on the child IOs.
  *   2. Return, setting an error code if too few child VDevs were written
  *      to reconstruct the data later. Note that partial writes are
  *      considered successful if they can be reconstructed at all.
- * - For read operations:
- *   1. Check for errors on the child IOs.
- *   2. If data errors occurred:
- *      a. Try to reassemble the data from the parity available.
- *      b. If we haven't yet read the parity drives, read them now.
- *      c. If all parity drives have been read but the data still doesn't
- *         reassemble with a correct checksum, then try combinatorial
- *         reconstruction.
- *      d. If that doesn't work, return an error.
- *   3. If there were unexpected errors or this is a resilver operation,
- *      rewrite the vdevs that had errors.
  */
 static void
-vdev_raidz_io_done(zio_t *zio)
+vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
+{
+	int total_errors = 0;
+
+	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
+	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+
+	for (int c = 0; c < rr->rr_cols; c++) {
+		raidz_col_t *rc = &rr->rr_col[c];
+
+		if (rc->rc_error) {
+			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
+
+			total_errors++;
+		}
+	}
+
+	/*
+	 * Treat partial writes as a success. If we couldn't write enough
+	 * columns to reconstruct the data, the I/O failed. Otherwise,
+	 * good enough.
+	 *
+	 * Now that we support write reallocation, it would be better
+	 * to treat partial failure as real failure unless there are
+	 * no non-degraded top-level vdevs left, and not update DTLs
+	 * if we intend to reallocate.
+	 */
+	if (total_errors > rr->rr_firstdatacol) {
+		zio->io_error = zio_worst_error(zio->io_error,
+		    vdev_raidz_worst_error(rr));
+	}
+}
+
+/*
+ * return 0 if no reconstruction occurred, otherwise the "code" from
+ * vdev_raidz_reconstruct().
+ */
+static int
+vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
+    raidz_row_t *rr)
 {
-	vdev_t *vd = zio->io_vd;
-	vdev_t *cvd;
-	raidz_map_t *rm = zio->io_vsd;
-	raidz_col_t *rc = NULL;
-	int unexpected_errors = 0;
 	int parity_errors = 0;
 	int parity_untried = 0;
 	int data_errors = 0;
 	int total_errors = 0;
-	int n, c;
-	int tgts[VDEV_RAIDZ_MAXPARITY];
-	int code;
-
-	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */
+	int code = 0;
 
-	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
-	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
+	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
+	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 
-	for (c = 0; c < rm->rm_cols; c++) {
-		rc = &rm->rm_col[c];
+	for (int c = 0; c < rr->rr_cols; c++) {
+		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_error) {
 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
 
-			if (c < rm->rm_firstdatacol)
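To see the walk that the vdev_raidz_combrec() comment above describes, the increment-and-reset enumeration can be exercised on its own. The sketch below is not part of the patch: enumerate_failure_sets(), the MAXPARITY stand-in, and the main() harness are names invented here for illustration. It uses the same sentinels the patch sets up under "Handle corner cases in combrec logic" (tgts[-1] = -1 so slot 0 can reset to 0, and tgts[k] = n so the last slot stops at n - 1) and prints the combinations for a 6-wide RAIDZ3 with three failures.

/*
 * Illustrative sketch (not from the patch): enumerate every set of k
 * failed logical children out of n, in the increment-and-reset order
 * documented in the vdev_raidz_combrec() comment.
 */
#include <stdio.h>

#define	MAXPARITY	3		/* stand-in for VDEV_RAIDZ_MAXPARITY */

static void
enumerate_failure_sets(int n, int k)
{
	int tstore[MAXPARITY + 2];
	int *tgts = &tstore[1];		/* tgts[-1] and tgts[k] are sentinels */

	if (k < 1 || k > MAXPARITY || k > n)
		return;

	tgts[-1] = -1;			/* lets slot 0 reset to 0 */
	for (int i = 0; i < k; i++)
		tgts[i] = i;		/* first state: 0 1 2 ... k-1 */
	tgts[k] = n;			/* upper sentinel for the last slot */

	for (;;) {
		for (int i = 0; i < k; i++)
			printf("%d ", tgts[i]);
		printf("\n");

		/* Advance to the next combination. */
		for (int t = 0; ; t++) {
			tgts[t]++;
			if (tgts[t] == n)	/* last slot ran off the end */
				break;
			if (tgts[t] != tgts[t + 1])
				break;		/* no collision: valid state */
			/* Collision: reset this slot, go bump the next one. */
			tgts[t] = tgts[t - 1] + 1;
		}
		if (tgts[k - 1] == n)		/* every combination visited */
			return;
	}
}

int
main(void)
{
	/* 6-wide RAIDZ3 example from the comment: choose 3 of 6 children. */
	enumerate_failure_sets(6, 3);
	return (0);
}

Compiled and run, this prints the twenty states from 0 1 2 through 3 4 5 in the order shown in the STATE column above; in the patch itself each state is handed to raidz_reconstruct() rather than printed, and the whole walk is repeated for every num_failures from 1 up to nparity.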