author     Matt Macy <mmacy@FreeBSD.org>   2021-01-07 23:27:17 +0000
committer  Matt Macy <mmacy@FreeBSD.org>   2021-01-08 00:55:59 +0000
commit     7877fdebeeb35fad1cbbafce22598b1bdf97c786 (patch)
tree       10ccc0bab059d6f48a221045b92416fc347fe784 /sys/contrib/openzfs/module
parent     84089de83e79a0f748c6e22b1aacb59156e153d2 (diff)
download   src-7877fdebeeb35fad1cbbafce22598b1bdf97c786.tar.gz
           src-7877fdebeeb35fad1cbbafce22598b1bdf97c786.zip
OpenZFS merge main-gf11b09
- add dRAID support
- fix duplicate close handling
- fix memory leak in prefetch
- fix problem with SIMD benchmarking on FreeBSD boot
...
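The dRAID support pulled in by this merge adds a distributed-spare vdev type. As a rough usage sketch only (not part of this commit: the pool name "tank", the da* device names, and the exact vdev specification string are illustrative assumptions; see zpoolconcepts(7) in the merged tree for the authoritative draid[parity][:datad][:childrenc][:sparess] syntax), a 13-disk dRAID2 vdev with 4 data disks per redundancy group and one distributed spare might be created as:

    # hypothetical example: 13 children, double parity, 1 distributed spare
    zpool create tank draid2:4d:13c:1s da0 da1 da2 da3 da4 da5 da6 da7 da8 da9 da10 da11 da12
    zpool status tank

The distributed spare is the main point of the design: a failed child is rebuilt sequentially onto spare capacity spread across every disk in the vdev, rather than onto a single idle hot spare.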
Diffstat (limited to 'sys/contrib/openzfs/module')
-rw-r--r--  sys/contrib/openzfs/module/Makefile.bsd  39
-rw-r--r--  sys/contrib/openzfs/module/icp/algs/modes/gcm.c  54
-rw-r--r--  sys/contrib/openzfs/module/icp/algs/modes/modes.c  8
-rw-r--r--  sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S  24
-rw-r--r--  sys/contrib/openzfs/module/icp/core/kcf_sched.c  4
-rw-r--r--  sys/contrib/openzfs/module/icp/include/modes/modes.h  8
-rw-r--r--  sys/contrib/openzfs/module/icp/io/aes.c  18
-rw-r--r--  sys/contrib/openzfs/module/lua/lapi.c  2
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c  5
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c  43
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c  10
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c  12
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c  70
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c  36
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c  6
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c  70
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c  3
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c  877
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c  18
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c  15
-rw-r--r--  sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c  248
-rw-r--r--  sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c  132
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/Makefile.in  3
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/abd_os.c  2
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/arc_os.c  88
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/policy.c  5
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c  31
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c  18
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c  1
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c  4
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c  1091
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c  5
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c  15
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c  25
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c  354
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c  10
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c  23
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c  24
-rw-r--r--  sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c  177
-rw-r--r--  sys/contrib/openzfs/module/zcommon/Makefile.in  1
-rw-r--r--  sys/contrib/openzfs/module/zcommon/zfeature_common.c  6
-rw-r--r--  sys/contrib/openzfs/module/zcommon/zfs_fletcher.c  69
-rw-r--r--  sys/contrib/openzfs/module/zcommon/zfs_namecheck.c  4
-rw-r--r--  sys/contrib/openzfs/module/zcommon/zfs_prop.c  6
-rw-r--r--  sys/contrib/openzfs/module/zcommon/zfs_uio.c  173
-rw-r--r--  sys/contrib/openzfs/module/zfs/Makefile.in  3
-rw-r--r--  sys/contrib/openzfs/module/zfs/abd.c  14
-rw-r--r--  sys/contrib/openzfs/module/zfs/aggsum.c  9
-rw-r--r--  sys/contrib/openzfs/module/zfs/arc.c  306
-rw-r--r--  sys/contrib/openzfs/module/zfs/dbuf.c  371
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu.c  226
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_object.c  4
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_objset.c  206
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_recv.c  294
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_redact.c  2
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_send.c  6
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_tx.c  3
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_zfetch.c  122
-rw-r--r--  sys/contrib/openzfs/module/zfs/dnode.c  13
-rw-r--r--  sys/contrib/openzfs/module/zfs/dnode_sync.c  6
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_bookmark.c  114
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_crypt.c  11
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_dataset.c  6
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_pool.c  49
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_scan.c  13
-rw-r--r--  sys/contrib/openzfs/module/zfs/metaslab.c  238
-rw-r--r--  sys/contrib/openzfs/module/zfs/mmp.c  11
-rw-r--r--  sys/contrib/openzfs/module/zfs/multilist.c  9
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa.c  158
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_history.c  2
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_misc.c  20
-rw-r--r--  sys/contrib/openzfs/module/zfs/txg.c  9
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev.c  387
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_draid.c  2984
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_draid_rand.c  40
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_indirect.c  39
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_initialize.c  141
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_label.c  62
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_mirror.c  137
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_missing.c  18
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_queue.c  134
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz.c  1864
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math.c  14
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h  313
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_rebuild.c  231
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_removal.c  100
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_root.c  9
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_trim.c  153
-rw-r--r--  sys/contrib/openzfs/module/zfs/zcp.c  7
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_fm.c  4
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_ioctl.c  77
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_vnops.c  895
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio.c  72
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio_inject.c  6
-rw-r--r--  sys/contrib/openzfs/module/zfs/zvol.c  22
-rw-r--r--  sys/contrib/openzfs/module/zstd/zfs_zstd.c  52
96 files changed, 8901 insertions, 4922 deletions
diff --git a/sys/contrib/openzfs/module/Makefile.bsd b/sys/contrib/openzfs/module/Makefile.bsd
index 4feb9e1eaf0c..e7cddcc5bb5e 100644
--- a/sys/contrib/openzfs/module/Makefile.bsd
+++ b/sys/contrib/openzfs/module/Makefile.bsd
@@ -24,7 +24,6 @@ KMOD= openzfs
CFLAGS+= -I${.OBJDIR:H}/include
CFLAGS+= -I${INCDIR}
-CFLAGS+= -I${INCDIR}/spl
CFLAGS+= -I${INCDIR}/os/freebsd
CFLAGS+= -I${INCDIR}/os/freebsd/spl
CFLAGS+= -I${INCDIR}/os/freebsd/zfs
@@ -40,7 +39,13 @@ CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_SSS
.endif
.if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true"
-CFLAGS+= -DINVARIANTS -DWITNESS -g -O0 -DZFS_DEBUG -DOPENSOLARIS_WITNESS
+CFLAGS+= -DZFS_DEBUG -g
+.if defined(WITH_INVARIANTS) && ${WITH_INVARIANTS} == "true"
+ CFLAGS+= -DINVARIANTS -DWITNESS -DOPENSOLARIS_WITNESS
+.endif
+.if defined(WITH_O0) && ${WITH_O0} == "true"
+ CFLAGS+= -O0
+.endif
.else
CFLAGS += -DNDEBUG
.endif
@@ -102,9 +107,10 @@ SRCS+= nvpair.c \
#os/freebsd/spl
SRCS+= acl_common.c \
- btree.c \
callb.c \
list.c \
+ sha256c.c \
+ sha512c.c \
spl_acl.c \
spl_cmn_err.c \
spl_dtrace.c \
@@ -112,6 +118,7 @@ SRCS+= acl_common.c \
spl_kstat.c \
spl_misc.c \
spl_policy.c \
+ spl_procfs_list.c \
spl_string.c \
spl_sunddi.c \
spl_sysevent.c \
@@ -119,11 +126,8 @@ SRCS+= acl_common.c \
spl_uio.c \
spl_vfs.c \
spl_vm.c \
- spl_zone.c \
- sha256c.c \
- sha512c.c \
- spl_procfs_list.c \
- spl_zlib.c
+ spl_zlib.c \
+ spl_zone.c
.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \
@@ -133,6 +137,7 @@ SRCS+= spl_atomic.c
#os/freebsd/zfs
SRCS+= abd_os.c \
+ arc_os.c \
crypto_os.c \
dmu_os.c \
hkdf.c \
@@ -140,17 +145,16 @@ SRCS+= abd_os.c \
spa_os.c \
sysctl_os.c \
vdev_file.c \
- vdev_label_os.c \
vdev_geom.c \
+ vdev_label_os.c \
zfs_acl.c \
zfs_ctldir.c \
+ zfs_debug.c \
zfs_dir.c \
zfs_ioctl_compat.c \
zfs_ioctl_os.c \
- zfs_log.c \
- zfs_replay.c \
zfs_vfsops.c \
- zfs_vnops.c \
+ zfs_vnops_os.c \
zfs_znode.c \
zio_crypt.c \
zvol_os.c
@@ -178,10 +182,10 @@ SRCS+= zfeature_common.c \
SRCS+= abd.c \
aggsum.c \
arc.c \
- arc_os.c \
blkptr.c \
bplist.c \
bpobj.c \
+ btree.c \
cityhash.c \
dbuf.c \
dbuf_stats.c \
@@ -245,6 +249,8 @@ SRCS+= abd.c \
unique.c \
vdev.c \
vdev_cache.c \
+ vdev_draid.c \
+ vdev_draid_rand.c \
vdev_indirect.c \
vdev_indirect_births.c \
vdev_indirect_mapping.c \
@@ -276,16 +282,18 @@ SRCS+= abd.c \
zcp_synctask.c \
zfeature.c \
zfs_byteswap.c \
- zfs_debug.c \
zfs_file_os.c \
zfs_fm.c \
zfs_fuid.c \
zfs_ioctl.c \
+ zfs_log.c \
zfs_onexit.c \
zfs_quota.c \
zfs_ratelimit.c \
+ zfs_replay.c \
zfs_rlock.c \
zfs_sa.c \
+ zfs_vnops.c \
zil.c \
zio.c \
zio_checksum.c \
@@ -323,7 +331,7 @@ CFLAGS.spl_vm.c= -Wno-cast-qual
CFLAGS.spl_zlib.c= -Wno-cast-qual
CFLAGS.abd.c= -Wno-cast-qual
CFLAGS.zfs_log.c= -Wno-cast-qual
-CFLAGS.zfs_vnops.c= -Wno-pointer-arith
+CFLAGS.zfs_vnops_os.c= -Wno-pointer-arith
CFLAGS.u8_textprep.c= -Wno-cast-qual
CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith
CFLAGS.zfs_fletcher_intel.c= -Wno-cast-qual -Wno-pointer-arith
@@ -341,6 +349,7 @@ CFLAGS.lz4.c= -Wno-cast-qual
CFLAGS.spa.c= -Wno-cast-qual
CFLAGS.spa_misc.c= -Wno-cast-qual
CFLAGS.sysctl_os.c= -include ../zfs_config.h
+CFLAGS.vdev_draid.c= -Wno-cast-qual
CFLAGS.vdev_raidz.c= -Wno-cast-qual
CFLAGS.vdev_raidz_math.c= -Wno-cast-qual
CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c
index 5553c55e11cd..23686c59e8ce 100644
--- a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c
+++ b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c
@@ -59,10 +59,12 @@ boolean_t gcm_avx_can_use_movbe = B_FALSE;
static boolean_t gcm_use_avx = B_FALSE;
#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx)
+extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
+
static inline boolean_t gcm_avx_will_work(void);
static inline void gcm_set_avx(boolean_t);
static inline boolean_t gcm_toggle_avx(void);
-extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
+static inline size_t gcm_simd_get_htab_size(boolean_t);
static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
crypto_data_t *, size_t);
@@ -629,6 +631,21 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
(volatile boolean_t *)&gcm_avx_can_use_movbe);
}
}
+ /* Allocate Htab memory as needed. */
+ if (gcm_ctx->gcm_use_avx == B_TRUE) {
+ size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
+
+ if (htab_len == 0) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ gcm_ctx->gcm_htab_len = htab_len;
+ gcm_ctx->gcm_Htable =
+ (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag);
+
+ if (gcm_ctx->gcm_Htable == NULL) {
+ return (CRYPTO_HOST_MEMORY);
+ }
+ }
/* Avx and non avx context initialization differs from here on. */
if (gcm_ctx->gcm_use_avx == B_FALSE) {
#endif /* ifdef CAN_USE_GCM_ASM */
@@ -689,6 +706,22 @@ gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
if (ks->ops->needs_byteswap == B_TRUE) {
gcm_ctx->gcm_use_avx = B_FALSE;
}
+ /* Allocate Htab memory as needed. */
+ if (gcm_ctx->gcm_use_avx == B_TRUE) {
+ size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
+
+ if (htab_len == 0) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ gcm_ctx->gcm_htab_len = htab_len;
+ gcm_ctx->gcm_Htable =
+ (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag);
+
+ if (gcm_ctx->gcm_Htable == NULL) {
+ return (CRYPTO_HOST_MEMORY);
+ }
+ }
+
/* Avx and non avx context initialization differs from here on. */
if (gcm_ctx->gcm_use_avx == B_FALSE) {
#endif /* ifdef CAN_USE_GCM_ASM */
@@ -1018,7 +1051,7 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
/* Clear the FPU registers since they hold sensitive internal state. */
#define clear_fpu_regs() clear_fpu_regs_avx()
#define GHASH_AVX(ctx, in, len) \
- gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t (*)[2])(ctx)->gcm_Htable, \
+ gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
in, len)
#define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
@@ -1036,8 +1069,8 @@ extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
extern void aes_encrypt_intel(const uint32_t rk[], int nr,
const uint32_t pt[4], uint32_t ct[4]);
-extern void gcm_init_htab_avx(uint64_t Htable[16][2], const uint64_t H[2]);
-extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t Htable[16][2],
+extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
+extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
const uint8_t *in, size_t len);
extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
@@ -1073,6 +1106,18 @@ gcm_toggle_avx(void)
}
}
+static inline size_t
+gcm_simd_get_htab_size(boolean_t simd_mode)
+{
+ switch (simd_mode) {
+ case B_TRUE:
+ return (2 * 6 * 2 * sizeof (uint64_t));
+
+ default:
+ return (0);
+ }
+}
+
/*
* Clear sensitive data in the context.
*
@@ -1088,7 +1133,6 @@ gcm_clear_ctx(gcm_ctx_t *ctx)
{
bzero(ctx->gcm_remainder, sizeof (ctx->gcm_remainder));
bzero(ctx->gcm_H, sizeof (ctx->gcm_H));
- bzero(ctx->gcm_Htable, sizeof (ctx->gcm_Htable));
bzero(ctx->gcm_J0, sizeof (ctx->gcm_J0));
bzero(ctx->gcm_tmp, sizeof (ctx->gcm_tmp));
}
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/modes.c b/sys/contrib/openzfs/module/icp/algs/modes/modes.c
index f07876a478e2..faae9722bd04 100644
--- a/sys/contrib/openzfs/module/icp/algs/modes/modes.c
+++ b/sys/contrib/openzfs/module/icp/algs/modes/modes.c
@@ -152,6 +152,14 @@ crypto_free_mode_ctx(void *ctx)
vmem_free(((gcm_ctx_t *)ctx)->gcm_pt_buf,
((gcm_ctx_t *)ctx)->gcm_pt_buf_len);
+#ifdef CAN_USE_GCM_ASM
+ if (((gcm_ctx_t *)ctx)->gcm_Htable != NULL) {
+ gcm_ctx_t *gcm_ctx = (gcm_ctx_t *)ctx;
+ bzero(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len);
+ kmem_free(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len);
+ }
+#endif
+
kmem_free(ctx, sizeof (gcm_ctx_t));
}
}
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S
index ed9f660fce5b..dc71ae2c1c89 100644
--- a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S
@@ -55,6 +55,7 @@
.type _aesni_ctr32_ghash_6x,@function
.align 32
_aesni_ctr32_ghash_6x:
+.cfi_startproc
vmovdqu 32(%r11),%xmm2
subq $6,%rdx
vpxor %xmm4,%xmm4,%xmm4
@@ -363,12 +364,14 @@ _aesni_ctr32_ghash_6x:
vpxor %xmm4,%xmm8,%xmm8
.byte 0xf3,0xc3
+.cfi_endproc
.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
#endif /* ifdef HAVE_MOVBE */
.type _aesni_ctr32_ghash_no_movbe_6x,@function
.align 32
_aesni_ctr32_ghash_no_movbe_6x:
+.cfi_startproc
vmovdqu 32(%r11),%xmm2
subq $6,%rdx
vpxor %xmm4,%xmm4,%xmm4
@@ -689,6 +692,7 @@ _aesni_ctr32_ghash_no_movbe_6x:
vpxor %xmm4,%xmm8,%xmm8
.byte 0xf3,0xc3
+.cfi_endproc
.size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x
.globl aesni_gcm_decrypt
@@ -714,6 +718,8 @@ aesni_gcm_decrypt:
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
+ pushq %r9
+.cfi_offset %r9,-64
vzeroupper
vmovdqu (%r8),%xmm1
@@ -726,7 +732,8 @@ aesni_gcm_decrypt:
andq $-128,%rsp
vmovdqu (%r11),%xmm0
leaq 128(%rcx),%rcx
- leaq 32+32(%r9),%r9
+ movq 32(%r9),%r9
+ leaq 32(%r9),%r9
movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
vpshufb %xmm0,%xmm8,%xmm8
@@ -782,7 +789,9 @@ aesni_gcm_decrypt:
vmovups %xmm14,-16(%rsi)
vpshufb (%r11),%xmm8,%xmm8
- vmovdqu %xmm8,-64(%r9)
+ movq -56(%rax),%r9
+.cfi_restore %r9
+ vmovdqu %xmm8,(%r9)
vzeroupper
movq -48(%rax),%r15
@@ -807,6 +816,7 @@ aesni_gcm_decrypt:
.type _aesni_ctr32_6x,@function
.align 32
_aesni_ctr32_6x:
+.cfi_startproc
vmovdqu 0-128(%rcx),%xmm4
vmovdqu 32(%r11),%xmm2
leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds.
@@ -893,6 +903,7 @@ _aesni_ctr32_6x:
vpshufb %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm14,%xmm14
jmp .Loop_ctr32
+.cfi_endproc
.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
.globl aesni_gcm_encrypt
@@ -918,6 +929,8 @@ aesni_gcm_encrypt:
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
+ pushq %r9
+.cfi_offset %r9,-64
vzeroupper
vmovdqu (%r8),%xmm1
@@ -960,7 +973,8 @@ aesni_gcm_encrypt:
call _aesni_ctr32_6x
vmovdqu (%r9),%xmm8
- leaq 32+32(%r9),%r9
+ movq 32(%r9),%r9
+ leaq 32(%r9),%r9
subq $12,%rdx
movq $192,%r10
vpshufb %xmm0,%xmm8,%xmm8
@@ -1151,7 +1165,9 @@ aesni_gcm_encrypt:
vpxor %xmm7,%xmm2,%xmm2
vpxor %xmm2,%xmm8,%xmm8
vpshufb (%r11),%xmm8,%xmm8
- vmovdqu %xmm8,-64(%r9)
+ movq -56(%rax),%r9
+.cfi_restore %r9
+ vmovdqu %xmm8,(%r9)
vzeroupper
movq -48(%rax),%r15
diff --git a/sys/contrib/openzfs/module/icp/core/kcf_sched.c b/sys/contrib/openzfs/module/icp/core/kcf_sched.c
index 40d50553d67e..81fd15f8ea26 100644
--- a/sys/contrib/openzfs/module/icp/core/kcf_sched.c
+++ b/sys/contrib/openzfs/module/icp/core/kcf_sched.c
@@ -1308,9 +1308,7 @@ kcf_reqid_insert(kcf_areq_node_t *areq)
kcf_areq_node_t *headp;
kcf_reqid_table_t *rt;
- kpreempt_disable();
- rt = kcf_reqid_table[CPU_SEQID & REQID_TABLE_MASK];
- kpreempt_enable();
+ rt = kcf_reqid_table[CPU_SEQID_UNSTABLE & REQID_TABLE_MASK];
mutex_enter(&rt->rt_lock);
diff --git a/sys/contrib/openzfs/module/icp/include/modes/modes.h b/sys/contrib/openzfs/module/icp/include/modes/modes.h
index 57a211ccf1bf..ab71197542eb 100644
--- a/sys/contrib/openzfs/module/icp/include/modes/modes.h
+++ b/sys/contrib/openzfs/module/icp/include/modes/modes.h
@@ -219,14 +219,14 @@ typedef struct gcm_ctx {
size_t gcm_pt_buf_len;
uint32_t gcm_tmp[4];
/*
- * The relative positions of gcm_ghash, gcm_H and pre-computed
- * gcm_Htable are hard coded in aesni-gcm-x86_64.S and ghash-x86_64.S,
- * so please don't change (or adjust accordingly).
+ * The offset of gcm_Htable relative to gcm_ghash, (32), is hard coded
+ * in aesni-gcm-x86_64.S, so please don't change (or adjust there).
*/
uint64_t gcm_ghash[2];
uint64_t gcm_H[2];
#ifdef CAN_USE_GCM_ASM
- uint64_t gcm_Htable[12][2];
+ uint64_t *gcm_Htable;
+ size_t gcm_htab_len;
#endif
uint64_t gcm_J0[2];
uint64_t gcm_len_a_len_c[2];
diff --git a/sys/contrib/openzfs/module/icp/io/aes.c b/sys/contrib/openzfs/module/icp/io/aes.c
index 96fb6bb1af30..e540af4473f7 100644
--- a/sys/contrib/openzfs/module/icp/io/aes.c
+++ b/sys/contrib/openzfs/module/icp/io/aes.c
@@ -1051,6 +1051,16 @@ out:
bzero(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len);
kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len);
}
+#ifdef CAN_USE_GCM_ASM
+ if (aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE) &&
+ ((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) {
+
+ gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx;
+
+ bzero(ctx->gcm_Htable, ctx->gcm_htab_len);
+ kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
+ }
+#endif
return (ret);
}
@@ -1209,6 +1219,14 @@ out:
vmem_free(((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf,
((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf_len);
}
+#ifdef CAN_USE_GCM_ASM
+ if (((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) {
+ gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx;
+
+ bzero(ctx->gcm_Htable, ctx->gcm_htab_len);
+ kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
+ }
+#endif
}
return (ret);
diff --git a/sys/contrib/openzfs/module/lua/lapi.c b/sys/contrib/openzfs/module/lua/lapi.c
index 8f072531fde5..6a845c461052 100644
--- a/sys/contrib/openzfs/module/lua/lapi.c
+++ b/sys/contrib/openzfs/module/lua/lapi.c
@@ -1300,7 +1300,7 @@ module_exit(lua_fini);
ZFS_MODULE_DESCRIPTION("Lua Interpreter for ZFS");
ZFS_MODULE_AUTHOR("Lua.org");
-ZFS_MODULE_LICENSE("MIT");
+ZFS_MODULE_LICENSE("Dual MIT/GPL");
ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
EXPORT_SYMBOL(lua_absindex);
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
index 5cd5c69efa71..5ecd3d310361 100644
--- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
@@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$");
#include <sys/jail.h>
#include <sys/policy.h>
#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
int
@@ -312,11 +313,11 @@ secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid)
}
int
-secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr,
+secpolicy_vnode_setid_retain(znode_t *zp, cred_t *cr,
boolean_t issuidroot __unused)
{
- if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ if (secpolicy_fs_owner(ZTOV(zp)->v_mount, cr) == 0)
return (0);
return (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID));
}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
index a7bda509bf54..0a323e8856a3 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
@@ -106,13 +106,13 @@ abd_free_chunk(void *c)
kmem_cache_free(abd_chunk_cache, c);
}
-static size_t
+static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
}
-static inline size_t
+static inline uint_t
abd_scatter_chunkcnt(abd_t *abd)
{
ASSERT(!abd_is_linear(abd));
@@ -129,7 +129,7 @@ abd_size_alloc_linear(size_t size)
void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
- size_t n = abd_scatter_chunkcnt(abd);
+ uint_t n = abd_scatter_chunkcnt(abd);
ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
int waste = n * zfs_abd_chunk_size - abd->abd_size;
if (op == ABDSTAT_INCR) {
@@ -161,25 +161,28 @@ abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
void
abd_verify_scatter(abd_t *abd)
{
+ uint_t i, n;
+
/*
 * There are no scatter linear pages in FreeBSD, so it is an
 * error if the ABD has been marked as a linear page.
*/
- VERIFY(!abd_is_linear_page(abd));
+ ASSERT(!abd_is_linear_page(abd));
ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
zfs_abd_chunk_size);
- size_t n = abd_scatter_chunkcnt(abd);
- for (int i = 0; i < n; i++) {
- ASSERT3P(
- ABD_SCATTER(abd).abd_chunks[i], !=, NULL);
+ n = abd_scatter_chunkcnt(abd);
+ for (i = 0; i < n; i++) {
+ ASSERT3P(ABD_SCATTER(abd).abd_chunks[i], !=, NULL);
}
}
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
- size_t n = abd_chunkcnt_for_bytes(size);
- for (int i = 0; i < n; i++) {
+ uint_t i, n;
+
+ n = abd_chunkcnt_for_bytes(size);
+ for (i = 0; i < n; i++) {
void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
ASSERT3P(c, !=, NULL);
ABD_SCATTER(abd).abd_chunks[i] = c;
@@ -190,8 +193,10 @@ abd_alloc_chunks(abd_t *abd, size_t size)
void
abd_free_chunks(abd_t *abd)
{
- size_t n = abd_scatter_chunkcnt(abd);
- for (int i = 0; i < n; i++) {
+ uint_t i, n;
+
+ n = abd_scatter_chunkcnt(abd);
+ for (i = 0; i < n; i++) {
abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]);
}
}
@@ -199,7 +204,7 @@ abd_free_chunks(abd_t *abd)
abd_t *
abd_alloc_struct(size_t size)
{
- size_t chunkcnt = abd_chunkcnt_for_bytes(size);
+ uint_t chunkcnt = abd_chunkcnt_for_bytes(size);
/*
* In the event we are allocating a gang ABD, the size passed in
* will be 0. We must make sure to set abd_size to the size of an
@@ -221,9 +226,9 @@ abd_alloc_struct(size_t size)
void
abd_free_struct(abd_t *abd)
{
- size_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 :
+ uint_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 :
abd_scatter_chunkcnt(abd);
- int size = MAX(sizeof (abd_t),
+ ssize_t size = MAX(sizeof (abd_t),
offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
mutex_destroy(&abd->abd_mtx);
ASSERT(!list_link_active(&abd->abd_gang_link));
@@ -238,7 +243,9 @@ abd_free_struct(abd_t *abd)
static void
abd_alloc_zero_scatter(void)
{
- size_t n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
+ uint_t i, n;
+
+ n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP);
abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
@@ -251,7 +258,7 @@ abd_alloc_zero_scatter(void)
ABD_SCATTER(abd_zero_scatter).abd_chunk_size =
zfs_abd_chunk_size;
- for (int i = 0; i < n; i++) {
+ for (i = 0; i < n; i++) {
ABD_SCATTER(abd_zero_scatter).abd_chunks[i] =
abd_zero_buf;
}
@@ -356,7 +363,7 @@ abd_get_offset_scatter(abd_t *sabd, size_t off)
ASSERT3U(off, <=, sabd->abd_size);
size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
- size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
+ uint_t chunkcnt = abd_scatter_chunkcnt(sabd) -
(new_offset / zfs_abd_chunk_size);
abd = abd_alloc_scatter_offset_chunkcnt(chunkcnt);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
index 94df750035a4..4fc7468bfa47 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
@@ -243,3 +243,13 @@ arc_lowmem_fini(void)
if (arc_event_lowmem != NULL)
EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
}
+
+void
+arc_register_hotplug(void)
+{
+}
+
+void
+arc_unregister_hotplug(void)
+{
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
index 1b37ce0d7f6b..647c1463ba14 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -114,6 +114,7 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vnops, CTLFLAG_RW, 0, "ZFS VNOPS");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
@@ -228,15 +229,14 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
static int
sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
{
- uint32_t val;
- int err;
+ int err, val;
val = arc_no_grow_shift;
- err = sysctl_handle_32(oidp, &val, 0, req);
+ err = sysctl_handle_int(oidp, &val, 0, req);
if (err != 0 || req->newptr == NULL)
return (err);
- if (val >= arc_shrink_shift)
+ if (val < 0 || val >= arc_shrink_shift)
return (EINVAL);
arc_no_grow_shift = val;
@@ -244,8 +244,8 @@ sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
}
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift,
- CTLTYPE_U32 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, sizeof (uint32_t),
- sysctl_vfs_zfs_arc_no_grow_shift, "U",
+ CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, sizeof (int),
+ sysctl_vfs_zfs_arc_no_grow_shift, "I",
"log2(fraction of ARC which must be free to allow growing)");
int
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c
index cf762c5fd61c..825bd706e0c0 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c
@@ -292,19 +292,28 @@ vdev_file_io_done(zio_t *zio)
}
vdev_ops_t vdev_file_ops = {
- vdev_file_open,
- vdev_file_close,
- vdev_default_asize,
- vdev_file_io_start,
- vdev_file_io_done,
- NULL,
- NULL,
- vdev_file_hold,
- vdev_file_rele,
- NULL,
- vdev_default_xlate,
- VDEV_TYPE_FILE, /* name of this vdev type */
- B_TRUE /* leaf vdev */
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_file_open,
+ .vdev_op_close = vdev_file_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_file_io_start,
+ .vdev_op_io_done = vdev_file_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_file_hold,
+ .vdev_op_rele = vdev_file_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
};
/*
@@ -313,19 +322,28 @@ vdev_ops_t vdev_file_ops = {
#ifndef _KERNEL
vdev_ops_t vdev_disk_ops = {
- vdev_file_open,
- vdev_file_close,
- vdev_default_asize,
- vdev_file_io_start,
- vdev_file_io_done,
- NULL,
- NULL,
- vdev_file_hold,
- vdev_file_rele,
- NULL,
- vdev_default_xlate,
- VDEV_TYPE_DISK, /* name of this vdev type */
- B_TRUE /* leaf vdev */
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_file_open,
+ .vdev_op_close = vdev_file_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_file_io_start,
+ .vdev_op_io_done = vdev_file_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_file_hold,
+ .vdev_op_rele = vdev_file_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
};
#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
index f042eff7cd2e..c9e8e21982cf 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
@@ -1141,7 +1141,6 @@ sendreq:
break;
case ZIO_TYPE_IOCTL:
bp->bio_cmd = BIO_FLUSH;
- bp->bio_flags |= BIO_ORDERED;
bp->bio_data = NULL;
bp->bio_offset = cp->provider->mediasize;
bp->bio_length = 0;
@@ -1190,17 +1189,26 @@ vdev_geom_rele(vdev_t *vd)
}
vdev_ops_t vdev_disk_ops = {
- vdev_geom_open,
- vdev_geom_close,
- vdev_default_asize,
- vdev_geom_io_start,
- vdev_geom_io_done,
- NULL,
- NULL,
- vdev_geom_hold,
- vdev_geom_rele,
- NULL,
- vdev_default_xlate,
- VDEV_TYPE_DISK, /* name of this vdev type */
- B_TRUE /* leaf vdev */
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_geom_open,
+ .vdev_op_close = vdev_geom_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_geom_io_start,
+ .vdev_op_io_done = vdev_geom_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_geom_hold,
+ .vdev_op_rele = vdev_geom_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
};
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
index d7786d5136a2..8fb259f4ba76 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
@@ -158,7 +158,8 @@ zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp,
rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td);
if (rc)
return (SET_ERROR(rc));
- *resid = auio.uio_resid;
+ if (resid)
+ *resid = auio.uio_resid;
*offp += count - auio.uio_resid;
return (SET_ERROR(0));
}
@@ -296,7 +297,8 @@ zfs_file_unlink(const char *fnamep)
rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0);
#else
#ifdef AT_BENEATH
- rc = kern_unlinkat(curthread, AT_FDCWD, fnamep, seg, 0, 0);
+ rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep),
+ seg, 0, 0);
#else
rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep),
seg, 0);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c
index 8b22f2fdc3b3..e69de29bb2d1 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c
@@ -1,70 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/errno.h>
-#include <sys/kmem.h>
-#include <sys/sunddi.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_onexit.h>
-
-static int
-zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
-{
- *zo = zfsdev_get_state(minor, ZST_ONEXIT);
- if (*zo == NULL)
- return (SET_ERROR(EBADF));
-
- return (0);
-}
-
-int
-zfs_onexit_fd_hold(int fd, minor_t *minorp)
-{
- file_t *fp, *tmpfp;
- zfs_onexit_t *zo;
- void *data;
- int error;
-
- if ((error = zfs_file_get(fd, &fp)))
- return (error);
-
- tmpfp = curthread->td_fpop;
- curthread->td_fpop = fp;
- error = devfs_get_cdevpriv(&data);
- if (error == 0)
- *minorp = (minor_t)(uintptr_t)data;
- curthread->td_fpop = tmpfp;
- if (error != 0)
- return (SET_ERROR(EBADF));
- return (zfs_onexit_minor_to_state(*minorp, &zo));
-}
-
-void
-zfs_onexit_fd_rele(int fd)
-{
- zfs_file_put(fd);
-}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
index 54ebfa7532dd..7bc6b83d0272 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
@@ -42,6 +42,7 @@
#include <sys/mount.h>
#include <sys/cmn_err.h>
#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
#include <sys/zfs_dir.h>
#include <sys/zil.h>
#include <sys/fs/zfs.h>
@@ -433,7 +434,7 @@ zfs_sync(vfs_t *vfsp, int waitfor)
} else {
/*
* Sync all ZFS filesystems. This is what happens when you
- * run sync(1M). Unlike other filesystems, ZFS honors the
+ * run sync(8). Unlike other filesystems, ZFS honors the
* request by waiting for all pools to commit all dirty data.
*/
spa_sync_allpools();
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c
index 3c3285f93389..2e8eadb5e16e 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c
@@ -29,6 +29,7 @@
/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */
+
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
@@ -270,69 +271,13 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
return (0);
}
-/*
- * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
- * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
- */
-static int
-zfs_holey(vnode_t *vp, ulong_t cmd, offset_t *off)
-{
- znode_t *zp = VTOZ(vp);
- uint64_t noff = (uint64_t)*off; /* new offset */
- uint64_t file_sz;
- int error;
- boolean_t hole;
-
- file_sz = zp->z_size;
- if (noff >= file_sz) {
- return (SET_ERROR(ENXIO));
- }
-
- if (cmd == _FIO_SEEK_HOLE)
- hole = B_TRUE;
- else
- hole = B_FALSE;
-
- error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
-
- if (error == ESRCH)
- return (SET_ERROR(ENXIO));
-
- /* file was dirty, so fall back to using generic logic */
- if (error == EBUSY) {
- if (hole)
- *off = file_sz;
-
- return (0);
- }
-
- /*
- * We could find a hole that begins after the logical end-of-file,
- * because dmu_offset_next() only works on whole blocks. If the
- * EOF falls mid-block, then indicate that the "virtual hole"
- * at the end of the file begins at the logical EOF, rather than
- * at the end of the last block.
- */
- if (noff > file_sz) {
- ASSERT(hole);
- noff = file_sz;
- }
-
- if (noff < *off)
- return (error);
- *off = noff;
- return (error);
-}
-
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
int *rvalp)
{
- offset_t off;
+ loff_t off;
int error;
- zfsvfs_t *zfsvfs;
- znode_t *zp;
switch (com) {
case _FIOFFS:
@@ -350,18 +295,12 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
return (0);
}
- case _FIO_SEEK_DATA:
- case _FIO_SEEK_HOLE:
+ case F_SEEK_DATA:
+ case F_SEEK_HOLE:
{
off = *(offset_t *)data;
- zp = VTOZ(vp);
- zfsvfs = zp->z_zfsvfs;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
/* offset parameter is in/out */
- error = zfs_holey(vp, com, &off);
- ZFS_EXIT(zfsvfs);
+ error = zfs_holey(VTOZ(vp), com, &off);
if (error)
return (error);
*(offset_t *)data = off;
@@ -525,16 +464,15 @@ page_unhold(vm_page_t pp)
* On Write: If we find a memory mapped page, we write to *both*
* the page and the dmu buffer.
*/
-static void
-update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
- int segflg, dmu_tx_t *tx)
+void
+update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
{
vm_object_t obj;
struct sf_buf *sf;
+ vnode_t *vp = ZTOV(zp);
caddr_t va;
int off;
- ASSERT(segflg != UIO_NOCOPY);
ASSERT(vp->v_mount != NULL);
obj = vp->v_object;
ASSERT(obj != NULL);
@@ -552,8 +490,8 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
zfs_vmobject_wunlock_12(obj);
va = zfs_map_page(pp, &sf);
- (void) dmu_read(os, oid, start+off, nbytes,
- va+off, DMU_READ_PREFETCH);
+ (void) dmu_read(os, zp->z_id, start + off, nbytes,
+ va + off, DMU_READ_PREFETCH);
zfs_unmap_page(sf);
zfs_vmobject_wlock_12(obj);
@@ -579,10 +517,10 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
* map them into contiguous KVA region and populate them
* in one single dmu_read() call.
*/
-static int
-mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
+int
+mappedread_sf(znode_t *zp, int nbytes, uio_t *uio)
{
- znode_t *zp = VTOZ(vp);
+ vnode_t *vp = ZTOV(zp);
objset_t *os = zp->z_zfsvfs->z_os;
struct sf_buf *sf;
vm_object_t obj;
@@ -664,10 +602,10 @@ mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
* NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
* the file is memory mapped.
*/
-static int
-mappedread(vnode_t *vp, int nbytes, uio_t *uio)
+int
+mappedread(znode_t *zp, int nbytes, uio_t *uio)
{
- znode_t *zp = VTOZ(vp);
+ vnode_t *vp = ZTOV(zp);
vm_object_t obj;
int64_t start;
int len = nbytes;
@@ -710,523 +648,6 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio)
return (error);
}
-offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
-
-/*
- * Read bytes from specified file into supplied buffer.
- *
- * IN: vp - vnode of file to be read from.
- * uio - structure supplying read location, range info,
- * and return buffer.
- * ioflag - SYNC flags; used to provide FRSYNC semantics.
- * cr - credentials of caller.
- * ct - caller context
- *
- * OUT: uio - updated offset and range, buffer filled.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Side Effects:
- * vp - atime updated if byte count > 0
- */
-/* ARGSUSED */
-static int
-zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- ssize_t n, nbytes, start_resid;
- int error = 0;
- int64_t nread;
- zfs_locked_range_t *lr;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /* We don't copy out anything useful for directories. */
- if (vp->v_type == VDIR) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EISDIR));
- }
-
- if (zp->z_pflags & ZFS_AV_QUARANTINED) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EACCES));
- }
-
- /*
- * Validate file offset
- */
- if (uio->uio_loffset < (offset_t)0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Fasttrack empty reads
- */
- if (uio->uio_resid == 0) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- /*
- * If we're in FRSYNC mode, sync out this znode before reading it.
- */
- if (zfsvfs->z_log &&
- (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
- zil_commit(zfsvfs->z_log, zp->z_id);
-
- /*
- * Lock the range against changes.
- */
- lr = zfs_rangelock_enter(&zp->z_rangelock, uio->uio_loffset,
- uio->uio_resid, RL_READER);
-
- /*
- * If we are reading past end-of-file we can skip
- * to the end; but we might still need to set atime.
- */
- if (uio->uio_loffset >= zp->z_size) {
- error = 0;
- goto out;
- }
-
- ASSERT(uio->uio_loffset < zp->z_size);
- n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
- start_resid = n;
-
- while (n > 0) {
- nbytes = MIN(n, zfs_read_chunk_size -
- P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
-
- if (uio->uio_segflg == UIO_NOCOPY)
- error = mappedread_sf(vp, nbytes, uio);
- else if (vn_has_cached_data(vp)) {
- error = mappedread(vp, nbytes, uio);
- } else {
- error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
- uio, nbytes);
- }
- if (error) {
- /* convert checksum errors into IO errors */
- if (error == ECKSUM)
- error = SET_ERROR(EIO);
- break;
- }
-
- n -= nbytes;
- }
-
- nread = start_resid - n;
- dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
-
-out:
- zfs_rangelock_exit(lr);
-
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Write the bytes to a file.
- *
- * IN: vp - vnode of file to be written to.
- * uio - structure supplying write location, range info,
- * and data buffer.
- * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
- * set if in append mode.
- * cr - credentials of caller.
- * ct - caller context (NFS/CIFS fem monitor only)
- *
- * OUT: uio - updated offset and range.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - ctime|mtime updated if byte count > 0
- */
-
-/* ARGSUSED */
-static int
-zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- rlim64_t limit = MAXOFFSET_T;
- ssize_t start_resid = uio->uio_resid;
- ssize_t tx_bytes;
- uint64_t end_size;
- dmu_buf_impl_t *db;
- dmu_tx_t *tx;
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- zilog_t *zilog;
- offset_t woff;
- ssize_t n, nbytes;
- zfs_locked_range_t *lr;
- int max_blksz = zfsvfs->z_max_blksz;
- int error = 0;
- arc_buf_t *abuf;
- iovec_t *aiov = NULL;
- xuio_t *xuio = NULL;
- int i_iov = 0;
- int iovcnt __unused = uio->uio_iovcnt;
- iovec_t *iovp = uio->uio_iov;
- int write_eof;
- int count = 0;
- sa_bulk_attr_t bulk[4];
- uint64_t mtime[2], ctime[2];
- uint64_t uid, gid, projid;
- int64_t nwritten;
-
- /*
- * Fasttrack empty write
- */
- n = start_resid;
- if (n == 0)
- return (0);
-
- if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
- limit = MAXOFFSET_T;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
- &zp->z_size, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, 8);
-
- /*
- * Callers might not be able to detect properly that we are read-only,
- * so check it explicitly here.
- */
- if (zfs_is_readonly(zfsvfs)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EROFS));
- }
-
- /*
- * If immutable or not appending then return EPERM.
- * Intentionally allow ZFS_READONLY through here.
- * See zfs_zaccess_common()
- */
- if ((zp->z_pflags & ZFS_IMMUTABLE) ||
- ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
- (uio->uio_loffset < zp->z_size))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- zilog = zfsvfs->z_log;
-
- /*
- * Validate file offset
- */
- woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
- if (woff < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * If in append mode, set the io offset pointer to eof.
- */
- if (ioflag & FAPPEND) {
- /*
- * Obtain an appending range lock to guarantee file append
- * semantics. We reset the write offset once we have the lock.
- */
- lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
- woff = lr->lr_offset;
- if (lr->lr_length == UINT64_MAX) {
- /*
- * We overlocked the file because this write will cause
- * the file block size to increase.
- * Note that zp_size cannot change with this lock held.
- */
- woff = zp->z_size;
- }
- uio->uio_loffset = woff;
- } else {
- /*
- * Note that if the file block size will change as a result of
- * this write, then this range lock will lock the entire file
- * so that we can re-write the block safely.
- */
- lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
- }
-
- if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
- zfs_rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
- return (EFBIG);
- }
-
- if (woff >= limit) {
- zfs_rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EFBIG));
- }
-
- if ((woff + n) > limit || woff > (limit - n))
- n = limit - woff;
-
- /* Will this write extend the file length? */
- write_eof = (woff + n > zp->z_size);
-
- end_size = MAX(zp->z_size, woff + n);
-
- uid = zp->z_uid;
- gid = zp->z_gid;
- projid = zp->z_projid;
-
- /*
- * Write the file in reasonable size chunks. Each chunk is written
- * in a separate transaction; this keeps the intent log records small
- * and allows us to do more fine-grained space accounting.
- */
- while (n > 0) {
- woff = uio->uio_loffset;
-
- if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
- zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
- (projid != ZFS_DEFAULT_PROJID &&
- zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
- projid))) {
- error = SET_ERROR(EDQUOT);
- break;
- }
-
- abuf = NULL;
- if (xuio) {
- ASSERT(i_iov < iovcnt);
- aiov = &iovp[i_iov];
- abuf = dmu_xuio_arcbuf(xuio, i_iov);
- dmu_xuio_clear(xuio, i_iov);
- DTRACE_PROBE3(zfs_cp_write, int, i_iov,
- iovec_t *, aiov, arc_buf_t *, abuf);
- ASSERT((aiov->iov_base == abuf->b_data) ||
- ((char *)aiov->iov_base - (char *)abuf->b_data +
- aiov->iov_len == arc_buf_size(abuf)));
- i_iov++;
- } else if (n >= max_blksz &&
- woff >= zp->z_size &&
- P2PHASE(woff, max_blksz) == 0 &&
- zp->z_blksz == max_blksz) {
- /*
- * This write covers a full block. "Borrow" a buffer
- * from the dmu so that we can fill it before we enter
- * a transaction. This avoids the possibility of
- * holding up the transaction if the data copy hangs
- * up on a pagefault (e.g., from an NFS server mapping).
- */
- size_t cbytes;
-
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- max_blksz);
- ASSERT(abuf != NULL);
- ASSERT(arc_buf_size(abuf) == max_blksz);
- if ((error = uiocopy(abuf->b_data, max_blksz,
- UIO_WRITE, uio, &cbytes))) {
- dmu_return_arcbuf(abuf);
- break;
- }
- ASSERT(cbytes == max_blksz);
- }
-
- /*
- * Start a transaction.
- */
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
- DB_DNODE_ENTER(db);
- dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
- MIN(n, max_blksz));
- DB_DNODE_EXIT(db);
- zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- if (abuf != NULL)
- dmu_return_arcbuf(abuf);
- break;
- }
-
- /*
- * If zfs_range_lock() over-locked we grow the blocksize
- * and then reduce the lock range. This will only happen
- * on the first iteration since zfs_range_reduce() will
- * shrink down r_len to the appropriate size.
- */
- if (lr->lr_length == UINT64_MAX) {
- uint64_t new_blksz;
-
- if (zp->z_blksz > max_blksz) {
- /*
- * File's blocksize is already larger than the
- * "recordsize" property. Only let it grow to
- * the next power of 2.
- */
- ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end_size,
- 1 << highbit64(zp->z_blksz));
- } else {
- new_blksz = MIN(end_size, max_blksz);
- }
- zfs_grow_blocksize(zp, new_blksz, tx);
- zfs_rangelock_reduce(lr, woff, n);
- }
-
- /*
- * XXX - should we really limit each write to z_max_blksz?
- * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
- */
- nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
-
- if (woff + nbytes > zp->z_size)
- vnode_pager_setsize(vp, woff + nbytes);
-
- if (abuf == NULL) {
- tx_bytes = uio->uio_resid;
- error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
- uio, nbytes, tx);
- tx_bytes -= uio->uio_resid;
- } else {
- tx_bytes = nbytes;
- ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
- /*
- * If this is not a full block write, but we are
- * extending the file past EOF and this data starts
- * block-aligned, use assign_arcbuf(). Otherwise,
- * write via dmu_write().
- */
- if (tx_bytes < max_blksz && (!write_eof ||
- aiov->iov_base != abuf->b_data)) {
- ASSERT(xuio);
- dmu_write(zfsvfs->z_os, zp->z_id, woff,
- aiov->iov_len, aiov->iov_base, tx);
- dmu_return_arcbuf(abuf);
- xuio_stat_wbuf_copied();
- } else {
- ASSERT(xuio || tx_bytes == max_blksz);
- dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff,
- abuf, tx);
- }
- ASSERT(tx_bytes <= uio->uio_resid);
- uioskip(uio, tx_bytes);
- }
- if (tx_bytes && vn_has_cached_data(vp)) {
- update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
- zp->z_id, uio->uio_segflg, tx);
- }
-
- /*
- * If we made no progress, we're done. If we made even
- * partial progress, update the znode and ZIL accordingly.
- */
- if (tx_bytes == 0) {
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
- (void *)&zp->z_size, sizeof (uint64_t), tx);
- dmu_tx_commit(tx);
- ASSERT(error != 0);
- break;
- }
-
- /*
- * Clear Set-UID/Set-GID bits on successful write if not
- * privileged and at least one of the execute bits is set.
- *
- * It would be nice to to this after all writes have
- * been done, but that would still expose the ISUID/ISGID
- * to another app after the partial write is committed.
- *
- * Note: we don't call zfs_fuid_map_id() here because
- * user 0 is not an ephemeral uid.
- */
- mutex_enter(&zp->z_acl_lock);
- if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
- (S_IXUSR >> 6))) != 0 &&
- (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
- secpolicy_vnode_setid_retain(vp, cr,
- (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
- uint64_t newmode;
- zp->z_mode &= ~(S_ISUID | S_ISGID);
- newmode = zp->z_mode;
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
- (void *)&newmode, sizeof (uint64_t), tx);
- }
- mutex_exit(&zp->z_acl_lock);
-
- zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
-
- /*
- * Update the file size (zp_size) if it has changed;
- * account for possible concurrent updates.
- */
- while ((end_size = zp->z_size) < uio->uio_loffset) {
- (void) atomic_cas_64(&zp->z_size, end_size,
- uio->uio_loffset);
- ASSERT(error == 0 || error == EFAULT);
- }
- /*
- * If we are replaying and eof is non zero then force
- * the file size to the specified eof. Note, there's no
- * concurrency during replay.
- */
- if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
- zp->z_size = zfsvfs->z_replay_eof;
-
- if (error == 0)
- error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- else
- (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
-
- zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes,
- ioflag, NULL, NULL);
- dmu_tx_commit(tx);
-
- if (error != 0)
- break;
- ASSERT(tx_bytes == nbytes);
- n -= nbytes;
-
- }
-
- zfs_rangelock_exit(lr);
-
- /*
- * If we're in replay mode, or we made no progress, return error.
- * Otherwise, it's at least a partial write, so it's successful.
- */
- if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- /*
- * EFAULT means that at least one page of the source buffer was not
- * available. VFS will re-try remaining I/O upon this error.
- */
- if (error == EFAULT) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (ioflag & (FSYNC | FDSYNC) ||
- zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, zp->z_id);
-
- nwritten = start_resid - uio->uio_resid;
- dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
int
zfs_write_simple(znode_t *zp, const void *data, size_t len,
loff_t pos, size_t *presid)
@@ -1249,184 +670,13 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
return (error);
}
-static void
-zfs_get_done(zgd_t *zgd, int error)
+void
+zfs_zrele_async(znode_t *zp)
{
- znode_t *zp = zgd->zgd_private;
- objset_t *os = zp->z_zfsvfs->z_os;
-
- if (zgd->zgd_db)
- dmu_buf_rele(zgd->zgd_db, zgd);
+ vnode_t *vp = ZTOV(zp);
+ objset_t *os = ITOZSB(vp)->z_os;
- zfs_rangelock_exit(zgd->zgd_lr);
-
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os)));
-
- kmem_free(zgd, sizeof (zgd_t));
-}
-
-#ifdef ZFS_DEBUG
-static int zil_fault_io = 0;
-#endif
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
-{
- zfsvfs_t *zfsvfs = arg;
- objset_t *os = zfsvfs->z_os;
- znode_t *zp;
- uint64_t object = lr->lr_foid;
- uint64_t offset = lr->lr_offset;
- uint64_t size = lr->lr_length;
- dmu_buf_t *db;
- zgd_t *zgd;
- int error = 0;
-
- ASSERT3P(lwb, !=, NULL);
- ASSERT3P(zio, !=, NULL);
- ASSERT3U(size, !=, 0);
-
- /*
- * Nothing to do if the file has been removed
- */
- if (zfs_zget(zfsvfs, object, &zp) != 0)
- return (SET_ERROR(ENOENT));
- if (zp->z_unlinked) {
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- VN_RELE_ASYNC(ZTOV(zp),
- dsl_pool_zrele_taskq(dmu_objset_pool(os)));
- return (SET_ERROR(ENOENT));
- }
-
- zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_lwb = lwb;
- zgd->zgd_private = zp;
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
- if (buf != NULL) { /* immediate write */
- zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
- size, RL_READER);
- /* test for truncation needs to be done while range locked */
- if (offset >= zp->z_size) {
- error = SET_ERROR(ENOENT);
- } else {
- error = dmu_read(os, object, offset, size, buf,
- DMU_READ_NO_PREFETCH);
- }
- ASSERT(error == 0 || error == ENOENT);
- } else { /* indirect write */
- /*
- * Have to lock the whole block to ensure when it's
- * written out and its checksum is being calculated
- * that no one can change the data. We need to re-check
- * blocksize after we get the lock in case it's changed!
- */
- for (;;) {
- uint64_t blkoff;
- size = zp->z_blksz;
- blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
- offset -= blkoff;
- zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
- offset, size, RL_READER);
- if (zp->z_blksz == size)
- break;
- offset += blkoff;
- zfs_rangelock_exit(zgd->zgd_lr);
- }
- /* test for truncation needs to be done while range locked */
- if (lr->lr_offset >= zp->z_size)
- error = SET_ERROR(ENOENT);
-#ifdef ZFS_DEBUG
- if (zil_fault_io) {
- error = SET_ERROR(EIO);
- zil_fault_io = 0;
- }
-#endif
- if (error == 0)
- error = dmu_buf_hold(os, object, offset, zgd, &db,
- DMU_READ_NO_PREFETCH);
-
- if (error == 0) {
- blkptr_t *bp = &lr->lr_blkptr;
-
- zgd->zgd_db = db;
- zgd->zgd_bp = bp;
-
- ASSERT(db->db_offset == offset);
- ASSERT(db->db_size == size);
-
- error = dmu_sync(zio, lr->lr_common.lrc_txg,
- zfs_get_done, zgd);
- ASSERT(error || lr->lr_length <= size);
-
- /*
- * On success, we need to wait for the write I/O
- * initiated by dmu_sync() to complete before we can
- * release this dbuf. We will finish everything up
- * in the zfs_get_done() callback.
- */
- if (error == 0)
- return (0);
-
- if (error == EALREADY) {
- lr->lr_common.lrc_txtype = TX_WRITE2;
- /*
- * TX_WRITE2 relies on the data previously
- * written by the TX_WRITE that caused
- * EALREADY. We zero out the BP because
- * it is the old, currently-on-disk BP,
- * so there's no need to zio_flush() its
- * vdevs (flushing would needlesly hurt
- * performance, and doesn't work on
- * indirect vdevs).
- */
- zgd->zgd_bp = NULL;
- BP_ZERO(bp);
- error = 0;
- }
- }
- }
-
- zfs_get_done(zgd, error);
-
- return (error);
-}
-
-/*ARGSUSED*/
-static int
-zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (flag & V_ACE_MASK)
- error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
- else
- error = zfs_zaccess_rwx(zp, mode, flag, cr);
-
- ZFS_EXIT(zfsvfs);
- return (error);
+ VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
}
static int
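The zfs_get_data() body removed above is the ZIL's get-data callback, deleted from the FreeBSD-specific file as part of this merge's consolidation of shared file code (presumably relocated to the common module). Two details in it are worth keeping in mind while reading the rest of the diff: immediate writes copy the data straight into the log record, while indirect writes range-lock the whole block, dmu_sync() it, and log only a block pointer; and because z_blksz can change while unlocked, the indirect path samples the block size, takes the range lock, and retries until the size is stable under the lock. A small userland analogue of that retry idiom (pthreads; the struct and function names are hypothetical, not from the diff):

#include <pthread.h>
#include <stdint.h>

struct obj {
        pthread_mutex_t lock;
        uint64_t blksz;         /* may be changed by other threads */
};

/* Sample the block size, take the lock, and retry until the sampled
 * value is still correct under the lock -- mirroring the z_blksz
 * re-check in the indirect-write path above. */
static uint64_t
lock_stable_blksz(struct obj *o)
{
        for (;;) {
                uint64_t size = o->blksz;       /* unlocked sample */
                pthread_mutex_lock(&o->lock);
                if (o->blksz == size)
                        return (size);          /* caller now holds o->lock */
                pthread_mutex_unlock(&o->lock); /* raced; try again */
        }
}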
@@ -2708,27 +1958,6 @@ update:
return (error);
}
-ulong_t zfs_fsync_sync_cnt = 4;
-
-static int
-zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
-
- if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
- }
- tsd_set(zfs_fsyncer_key, NULL);
- return (0);
-}
-
-
/*
* Get the requested file attributes and place them in the provided
* vattr structure.
@@ -3905,7 +3134,7 @@ zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
return (error);
}
-#if __FreeBSD_version < 1300110
+#if __FreeBSD_version < 1300124
static void
cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
@@ -4793,45 +4022,6 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
}
}
-/*ARGSUSED*/
-static int
-zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
- boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- error = zfs_getacl(zp, vsecp, skipaclchk, cr);
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*ARGSUSED*/
-int
-zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int error;
- boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- zilog_t *zilog = zfsvfs->z_log;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- error = zfs_setacl(zp, vsecp, skipaclchk, cr);
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
static int
zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
int *rahead)
@@ -5225,7 +4415,7 @@ static int
zfs_freebsd_read(struct vop_read_args *ap)
{
- return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
+ return (zfs_read(VTOZ(ap->a_vp), ap->a_uio, ioflags(ap->a_ioflag),
ap->a_cred));
}
@@ -5242,7 +4432,7 @@ static int
zfs_freebsd_write(struct vop_write_args *ap)
{
- return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
+ return (zfs_write(VTOZ(ap->a_vp), ap->a_uio, ioflags(ap->a_ioflag),
ap->a_cred));
}
@@ -5301,7 +4491,7 @@ zfs_freebsd_access(struct vop_access_args *ap)
*/
accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
if (accmode != 0)
- error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
+ error = zfs_access(zp, accmode, 0, ap->a_cred);
/*
* VADMIN has to be handled by vaccess().
@@ -5512,7 +4702,7 @@ zfs_freebsd_fsync(struct vop_fsync_args *ap)
{
vop_stdfsync(ap);
- return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
+ return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred));
}
#ifndef _SYS_SYSPROTO_H_
@@ -5825,7 +5015,11 @@ zfs_freebsd_inactive(struct vop_inactive_args *ap)
{
vnode_t *vp = ap->a_vp;
+#if __FreeBSD_version >= 1300123
zfs_inactive(vp, curthread->td_ucred, NULL);
+#else
+ zfs_inactive(vp, ap->a_td->td_ucred, NULL);
+#endif
return (0);
}
@@ -6377,7 +5571,8 @@ zfs_freebsd_getacl(struct vop_getacl_args *ap)
return (EINVAL);
vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
- if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)))
+ if ((error = zfs_getsecattr(VTOZ(ap->a_vp),
+ &vsecattr, 0, ap->a_cred)))
return (error);
error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp,
@@ -6510,7 +5705,13 @@ zfs_vptocnp(struct vop_vptocnp_args *ap)
error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread);
#endif
if (error == 0) {
- error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf, ap->a_buflen);
+#if __FreeBSD_version >= 1300123
+ error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf,
+ ap->a_buflen);
+#else
+ error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
+ ap->a_buf, ap->a_buflen);
+#endif
vput(covered_vp);
}
vn_lock(vp, ltype | LK_RETRY);
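A pattern repeated across the FreeBSD zfs_vnops.c hunks above: the VOP glue now translates the vnode to a znode at the boundary and calls the shared, znode-based entry points (zfs_read, zfs_write, zfs_access, zfs_fsync, zfs_getsecattr), whose prototypes drop the unused caller_context_t argument, while differences between FreeBSD releases (where the credential comes from in zfs_freebsd_inactive, the extra cred argument that older VOP_VPTOCNP takes) stay behind __FreeBSD_version checks. The wrapper shape, condensed from the hunks above (not a drop-in; the argument structures come from the FreeBSD VOP tables):

static int
zfs_freebsd_read(struct vop_read_args *ap)
{
        /* Translate vnode -> znode once, then call the shared code. */
        return (zfs_read(VTOZ(ap->a_vp), ap->a_uio,
            ioflags(ap->a_ioflag), ap->a_cred));
}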
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
index 40baa0b80928..6a21623c5f67 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
@@ -149,7 +149,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_acl_cached = NULL;
zp->z_vnode = NULL;
- zp->z_moved = 0;
return (0);
}
@@ -278,7 +277,6 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
sharezp = zfs_znode_alloc_kmem(KM_SLEEP);
ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
- sharezp->z_moved = 0;
sharezp->z_unlinked = 0;
sharezp->z_atime_dirty = 0;
sharezp->z_zfsvfs = zfsvfs;
@@ -437,7 +435,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
vp->v_data = zp;
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
- zp->z_moved = 0;
zp->z_sa_hdl = NULL;
zp->z_unlinked = 0;
@@ -1692,7 +1689,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
rootzp = zfs_znode_alloc_kmem(KM_SLEEP);
ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
- rootzp->z_moved = 0;
rootzp->z_unlinked = 0;
rootzp->z_atime_dirty = 0;
rootzp->z_is_sa = USE_SA(version, os);
@@ -2015,6 +2011,20 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
return (error);
}
+
+void
+zfs_inode_update(znode_t *zp)
+{
+ vm_object_t object;
+
+ if ((object = ZTOV(zp)->v_object) == NULL ||
+ zp->z_size == object->un_pager.vnp.vnp_size)
+ return;
+
+ vnode_pager_setsize(ZTOV(zp), zp->z_size);
+}
+
+
#ifdef _KERNEL
int
zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
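The new FreeBSD zfs_inode_update() gives the now-shared file code one hook for keeping the page cache's notion of the file size in step with z_size: it compares z_size with the vnode's VM object size and calls vnode_pager_setsize() only when they differ (the Linux port already had a function of this name, which is presumably why FreeBSD grows one here). A userland-style restatement of that guard, with hypothetical names:

#include <stdint.h>

struct file_view {
        uint64_t logical_size;  /* plays the role of zp->z_size */
        uint64_t pager_size;    /* plays the role of the VM object's vnp_size */
};

static void
file_view_sync_size(struct file_view *fv)
{
        if (fv->pager_size == fv->logical_size)
                return;                         /* already consistent */
        fv->pager_size = fv->logical_size;      /* vnode_pager_setsize() analogue */
}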
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
index fb88bc325d3c..fd2beee7bdd2 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
@@ -1071,6 +1071,16 @@ zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
/*
+ * This is necessary here as we check next whether
+ * OBJSET_FLAG_USERACCOUNTING_COMPLETE or
+ * OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE are set in order to
+ * decide if the local_mac should be zeroed out.
+ */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+
+ /*
* The local MAC protects the user, group and project accounting.
* If these objects are not present, the local MAC is zeroed out.
*/
@@ -1081,7 +1091,10 @@ zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
(datalen >= OBJSET_PHYS_SIZE_V2 &&
osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
- (datalen <= OBJSET_PHYS_SIZE_V1)) {
+ (datalen <= OBJSET_PHYS_SIZE_V1) ||
+ (((intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE) == 0 ||
+ (intval & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE) == 0) &&
+ key->zk_version > 0)) {
bzero(local_mac, ZIO_OBJSET_MAC_LEN);
return (0);
}
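The zio_crypt.c change widens the set of cases in which the local MAC is zeroed: in addition to the existing "accounting dnodes absent" checks, a key with zk_version > 0 also zeroes it when the objset's OBJSET_FLAG_USERACCOUNTING_COMPLETE or OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE flag is clear, using the (possibly byte-swapped) os_flags captured just above the existing comment. A compact restatement of the new part of the predicate (sketch only; the bit values below are placeholders, not the real definitions):

#include <stdint.h>
#include <stdbool.h>

/* Placeholder bits for illustration; the real values live in the objset headers. */
#define F_USERACCOUNTING_COMPLETE    (1ULL << 0)
#define F_USEROBJACCOUNTING_COMPLETE (1ULL << 1)

/* Zero the local MAC when user accounting is incomplete and the key is
 * versioned (zk_version > 0), mirroring the condition added above. */
static bool
local_mac_zeroed_for_incomplete_accounting(uint64_t os_flags, uint64_t zk_version)
{
        bool incomplete =
            (os_flags & F_USERACCOUNTING_COMPLETE) == 0 ||
            (os_flags & F_USEROBJACCOUNTING_COMPLETE) == 0;

        return (incomplete && zk_version > 0);
}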
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
index 092eb34eaa47..6c44e3681709 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@ -116,7 +116,6 @@ enum zvol_geom_state {
};
struct zvol_state_os {
- int zso_volmode;
#define zso_dev _zso_state._zso_dev
#define zso_geom _zso_state._zso_geom
union {
@@ -134,6 +133,7 @@ struct zvol_state_os {
enum zvol_geom_state zsg_state;
} _zso_geom;
} _zso_state;
+ int zso_dying;
};
static uint32_t zvol_minors;
@@ -209,7 +209,7 @@ zvol_geom_open(struct g_provider *pp, int flag, int count)
{
zvol_state_t *zv;
int err = 0;
- boolean_t drop_suspend = B_TRUE;
+ boolean_t drop_suspend = B_FALSE;
boolean_t drop_namespace = B_FALSE;
if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
@@ -228,16 +228,15 @@ retry:
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
zv = pp->private;
if (zv == NULL) {
- if (drop_namespace)
- mutex_exit(&spa_namespace_lock);
rw_exit(&zvol_state_lock);
- return (SET_ERROR(ENXIO));
+ err = SET_ERROR(ENXIO);
+ goto out_locked;
}
if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
/*
* We need to guarantee that the namespace lock is held
- * to avoid spurious failures in zvol_first_open
+ * to avoid spurious failures in zvol_first_open.
*/
drop_namespace = B_TRUE;
if (!mutex_tryenter(&spa_namespace_lock)) {
@@ -247,8 +246,12 @@ retry:
}
}
mutex_enter(&zv->zv_state_lock);
-
- ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+ if (zv->zv_zso->zso_dying) {
+ rw_exit(&zvol_state_lock);
+ err = SET_ERROR(ENXIO);
+ goto out_zv_locked;
+ }
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
/*
* make sure zvol is not suspended during first open
@@ -256,6 +259,7 @@ retry:
* ordering - zv_suspend_lock before zv_state_lock
*/
if (zv->zv_open_count == 0) {
+ drop_suspend = B_TRUE;
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
@@ -266,8 +270,6 @@ retry:
drop_suspend = B_FALSE;
}
}
- } else {
- drop_suspend = B_FALSE;
}
rw_exit(&zvol_state_lock);
@@ -277,7 +279,7 @@ retry:
ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
err = zvol_first_open(zv, !(flag & FWRITE));
if (err)
- goto out_mutex;
+ goto out_zv_locked;
pp->mediasize = zv->zv_volsize;
pp->stripeoffset = 0;
pp->stripesize = zv->zv_volblocksize;
@@ -289,41 +291,37 @@ retry:
*/
if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
- err = EROFS;
- goto out_open_count;
+ err = SET_ERROR(EROFS);
+ goto out_opened;
}
if (zv->zv_flags & ZVOL_EXCL) {
- err = EBUSY;
- goto out_open_count;
+ err = SET_ERROR(EBUSY);
+ goto out_opened;
}
#ifdef FEXCL
if (flag & FEXCL) {
if (zv->zv_open_count != 0) {
- err = EBUSY;
- goto out_open_count;
+ err = SET_ERROR(EBUSY);
+ goto out_opened;
}
zv->zv_flags |= ZVOL_EXCL;
}
#endif
zv->zv_open_count += count;
- if (drop_namespace)
- mutex_exit(&spa_namespace_lock);
- mutex_exit(&zv->zv_state_lock);
- if (drop_suspend)
- rw_exit(&zv->zv_suspend_lock);
- return (0);
-
-out_open_count:
- if (zv->zv_open_count == 0)
+out_opened:
+ if (zv->zv_open_count == 0) {
zvol_last_close(zv);
-out_mutex:
+ wakeup(zv);
+ }
+out_zv_locked:
+ mutex_exit(&zv->zv_state_lock);
+out_locked:
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
- mutex_exit(&zv->zv_state_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
- return (SET_ERROR(err));
+ return (err);
}
/*ARGSUSED*/
@@ -332,6 +330,7 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
{
zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;
+ int new_open_count;
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
zv = pp->private;
@@ -342,30 +341,32 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
mutex_enter(&zv->zv_state_lock);
if (zv->zv_flags & ZVOL_EXCL) {
- ASSERT(zv->zv_open_count == 1);
+ ASSERT3U(zv->zv_open_count, ==, 1);
zv->zv_flags &= ~ZVOL_EXCL;
}
- ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
/*
* If the open count is zero, this is a spurious close.
* That indicates a bug in the kernel / DDI framework.
*/
- ASSERT(zv->zv_open_count > 0);
+ ASSERT3U(zv->zv_open_count, >, 0);
/*
* make sure zvol is not suspended during last close
* (hold zv_suspend_lock) and respect proper lock acquisition
* ordering - zv_suspend_lock before zv_state_lock
*/
- if ((zv->zv_open_count - count) == 0) {
+ new_open_count = zv->zv_open_count - count;
+ if (new_open_count == 0) {
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
mutex_enter(&zv->zv_state_lock);
/* check to see if zv_suspend_lock is needed */
- if (zv->zv_open_count != 1) {
+ new_open_count = zv->zv_open_count - count;
+ if (new_open_count != 0) {
rw_exit(&zv->zv_suspend_lock);
drop_suspend = B_FALSE;
}
@@ -380,11 +381,11 @@ zvol_geom_close(struct g_provider *pp, int flag, int count)
/*
* You may get multiple opens, but only one close.
*/
- zv->zv_open_count -= count;
-
+ zv->zv_open_count = new_open_count;
if (zv->zv_open_count == 0) {
ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
zvol_last_close(zv);
+ wakeup(zv);
}
mutex_exit(&zv->zv_state_lock);
@@ -400,7 +401,7 @@ zvol_geom_run(zvol_state_t *zv)
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp = zsg->zsg_provider;
- ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
g_error_provider(pp, 0);
@@ -414,7 +415,7 @@ zvol_geom_destroy(zvol_state_t *zv)
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp = zsg->zsg_provider;
- ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
g_topology_assert();
@@ -422,10 +423,25 @@ zvol_geom_destroy(zvol_state_t *zv)
VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
mutex_exit(&zv->zv_state_lock);
zsg->zsg_provider = NULL;
- pp->private = NULL;
g_wither_geom(pp->geom, ENXIO);
}
+void
+zvol_wait_close(zvol_state_t *zv)
+{
+
+ if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
+ return;
+ mutex_enter(&zv->zv_state_lock);
+ zv->zv_zso->zso_dying = B_TRUE;
+
+ if (zv->zv_open_count)
+ msleep(zv, &zv->zv_state_lock,
+ PRIBIO, "zvol:dying", 10*hz);
+ mutex_exit(&zv->zv_state_lock);
+}
+
+
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
@@ -483,7 +499,7 @@ zvol_geom_worker(void *arg)
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct bio *bp;
- ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM);
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
thread_lock(curthread);
sched_prio(curthread, PRIBIO);
@@ -512,9 +528,13 @@ static void
zvol_geom_bio_start(struct bio *bp)
{
zvol_state_t *zv = bp->bio_to->private;
- struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct zvol_state_geom *zsg;
boolean_t first;
+ if (zv == NULL) {
+ g_io_deliver(bp, ENXIO);
+ return;
+ }
if (bp->bio_cmd == BIO_GETATTR) {
if (zvol_geom_bio_getattr(bp))
g_io_deliver(bp, EOPNOTSUPP);
@@ -522,6 +542,7 @@ zvol_geom_bio_start(struct bio *bp)
}
if (!THREAD_CAN_SLEEP()) {
+ zsg = &zv->zv_zso->zso_geom;
mtx_lock(&zsg->zsg_queue_mtx);
first = (bioq_first(&zsg->zsg_queue) == NULL);
bioq_insert_tail(&zsg->zsg_queue, bp);
@@ -540,7 +561,7 @@ zvol_geom_bio_getattr(struct bio *bp)
zvol_state_t *zv;
zv = bp->bio_to->private;
- ASSERT(zv != NULL);
+ ASSERT3P(zv, !=, NULL);
spa_t *spa = dmu_objset_spa(zv->zv_objset);
uint64_t refd, avail, usedobjs, availobjs;
@@ -613,7 +634,7 @@ zvol_geom_bio_strategy(struct bio *bp)
goto sync;
break;
default:
- error = EOPNOTSUPP;
+ error = SET_ERROR(EOPNOTSUPP);
goto resume;
}
@@ -621,7 +642,7 @@ zvol_geom_bio_strategy(struct bio *bp)
volsize = zv->zv_volsize;
os = zv->zv_objset;
- ASSERT(os != NULL);
+ ASSERT3P(os, !=, NULL);
addr = bp->bio_data;
resid = bp->bio_length;
@@ -688,7 +709,7 @@ unlock:
bp->bio_completed = bp->bio_length - resid;
if (bp->bio_completed < bp->bio_length && off > volsize)
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
switch (bp->bio_cmd) {
case BIO_FLUSH:
@@ -825,18 +846,33 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
zvol_state_t *zv;
struct zvol_state_dev *zsd;
int err = 0;
- boolean_t drop_suspend = B_TRUE;
+ boolean_t drop_suspend = B_FALSE;
+ boolean_t drop_namespace = B_FALSE;
+retry:
rw_enter(&zvol_state_lock, ZVOL_RW_READER);
zv = dev->si_drv2;
if (zv == NULL) {
rw_exit(&zvol_state_lock);
- return (SET_ERROR(ENXIO));
+ err = SET_ERROR(ENXIO);
+ goto out_locked;
}
+ if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
+ /*
+ * We need to guarantee that the namespace lock is held
+ * to avoid spurious failures in zvol_first_open.
+ */
+ drop_namespace = B_TRUE;
+ if (!mutex_tryenter(&spa_namespace_lock)) {
+ rw_exit(&zvol_state_lock);
+ mutex_enter(&spa_namespace_lock);
+ goto retry;
+ }
+ }
mutex_enter(&zv->zv_state_lock);
- ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
/*
* make sure zvol is not suspended during first open
@@ -844,6 +880,7 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
* ordering - zv_suspend_lock before zv_state_lock
*/
if (zv->zv_open_count == 0) {
+ drop_suspend = B_TRUE;
if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
mutex_exit(&zv->zv_state_lock);
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
@@ -854,8 +891,6 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
drop_suspend = B_FALSE;
}
}
- } else {
- drop_suspend = B_FALSE;
}
rw_exit(&zvol_state_lock);
@@ -865,21 +900,21 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
err = zvol_first_open(zv, !(flags & FWRITE));
if (err)
- goto out_locked;
+ goto out_zv_locked;
}
if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
- err = EROFS;
+ err = SET_ERROR(EROFS);
goto out_opened;
}
if (zv->zv_flags & ZVOL_EXCL) {
- err = EBUSY;
+ err = SET_ERROR(EBUSY);
goto out_opened;
}
#ifdef FEXCL
if (flags & FEXCL) {
if (zv->zv_open_count != 0) {
- err = EBUSY;
+ err = SET_ERROR(EBUSY);
goto out_opened;
}
zv->zv_flags |= ZVOL_EXCL;
@@ -894,20 +929,19 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
(zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
}
-
- mutex_exit(&zv->zv_state_lock);
- if (drop_suspend)
- rw_exit(&zv->zv_suspend_lock);
- return (0);
-
out_opened:
- if (zv->zv_open_count == 0)
+ if (zv->zv_open_count == 0) {
zvol_last_close(zv);
-out_locked:
+ wakeup(zv);
+ }
+out_zv_locked:
mutex_exit(&zv->zv_state_lock);
+out_locked:
+ if (drop_namespace)
+ mutex_exit(&spa_namespace_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
- return (SET_ERROR(err));
+ return (err);
}
static int
@@ -926,17 +960,17 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
mutex_enter(&zv->zv_state_lock);
if (zv->zv_flags & ZVOL_EXCL) {
- ASSERT(zv->zv_open_count == 1);
+ ASSERT3U(zv->zv_open_count, ==, 1);
zv->zv_flags &= ~ZVOL_EXCL;
}
- ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV);
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
/*
* If the open count is zero, this is a spurious close.
* That indicates a bug in the kernel / DDI framework.
*/
- ASSERT(zv->zv_open_count > 0);
+ ASSERT3U(zv->zv_open_count, >, 0);
/*
* make sure zvol is not suspended during last close
* (hold zv_suspend_lock) and respect proper lock acquisition
@@ -972,6 +1006,7 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
if (zv->zv_open_count == 0) {
ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
zvol_last_close(zv);
+ wakeup(zv);
}
mutex_exit(&zv->zv_state_lock);
@@ -1022,7 +1057,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
length <= 0) {
printf("%s: offset=%jd length=%jd\n", __func__, offset,
length);
- error = EINVAL;
+ error = SET_ERROR(EINVAL);
break;
}
rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
@@ -1076,7 +1111,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
refd = metaslab_class_get_alloc(spa_normal_class(spa));
arg->value.off = refd / DEV_BSIZE;
} else
- error = ENOIOCTL;
+ error = SET_ERROR(ENOIOCTL);
break;
}
case FIOSEEKHOLE:
@@ -1092,7 +1127,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
break;
}
default:
- error = ENOIOCTL;
+ error = SET_ERROR(ENOIOCTL);
}
return (error);
@@ -1144,14 +1179,14 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname)
hlist_del(&zv->zv_hlink);
hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
- if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp = zsg->zsg_provider;
struct g_geom *gp;
g_topology_lock();
gp = pp->geom;
- ASSERT(gp != NULL);
+ ASSERT3P(gp, !=, NULL);
zsg->zsg_provider = NULL;
g_wither_provider(pp, ENXIO);
@@ -1164,7 +1199,7 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname)
zsg->zsg_provider = pp;
g_error_provider(pp, 0);
g_topology_unlock();
- } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
struct cdev *dev;
struct make_dev_args args;
@@ -1206,26 +1241,30 @@ zvol_free(zvol_state_t *zv)
{
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
- ASSERT(zv->zv_open_count == 0);
+ ASSERT0(zv->zv_open_count);
ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
rw_destroy(&zv->zv_suspend_lock);
zfs_rangelock_fini(&zv->zv_rangelock);
- if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp __maybe_unused = zsg->zsg_provider;
+
+ ASSERT3P(pp->private, ==, NULL);
g_topology_lock();
zvol_geom_destroy(zv);
g_topology_unlock();
mtx_destroy(&zsg->zsg_queue_mtx);
- } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
struct cdev *dev = zsd->zsd_cdev;
- if (dev != NULL)
- destroy_dev(dev);
+ ASSERT3P(dev->si_drv2, ==, NULL);
+
+ destroy_dev(dev);
}
mutex_destroy(&zv->zv_state_lock);
@@ -1249,7 +1288,6 @@ zvol_create_minor_impl(const char *name)
int error;
ZFS_LOG(1, "Creating ZVOL %s...", name);
-
hash = zvol_name_hash(name);
if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -1258,10 +1296,11 @@ zvol_create_minor_impl(const char *name)
}
DROP_GIANT();
- /* lie and say we're read-only */
- error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
+
doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+ /* lie and say we're read-only */
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
if (error)
goto out_doi;
@@ -1275,8 +1314,10 @@ zvol_create_minor_impl(const char *name)
error = dsl_prop_get_integer(name,
zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
- if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT)
+ if (error || volmode == ZFS_VOLMODE_DEFAULT)
volmode = zvol_volmode;
+ error = 0;
+
/*
* zvol_alloc equivalent ...
*/
@@ -1284,8 +1325,8 @@ zvol_create_minor_impl(const char *name)
zv->zv_hash = hash;
mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
- zv->zv_zso->zso_volmode = volmode;
- if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+ zv->zv_volmode = volmode;
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp;
struct g_geom *gp;
@@ -1298,7 +1339,6 @@ zvol_create_minor_impl(const char *name)
gp->start = zvol_geom_bio_start;
gp->access = zvol_geom_access;
pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
- /* TODO: NULL check? */
pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
pp->sectorsize = DEV_BSIZE;
pp->mediasize = 0;
@@ -1306,7 +1346,7 @@ zvol_create_minor_impl(const char *name)
zsg->zsg_provider = pp;
bioq_init(&zsg->zsg_queue);
- } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
struct cdev *dev;
struct make_dev_args args;
@@ -1320,12 +1360,12 @@ zvol_create_minor_impl(const char *name)
args.mda_mode = 0640;
args.mda_si_drv2 = zv;
error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
- if (error != 0) {
- mutex_destroy(&zv->zv_state_lock);
+ if (error) {
kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+ mutex_destroy(&zv->zv_state_lock);
kmem_free(zv, sizeof (*zv));
dmu_objset_disown(os, B_TRUE, FTAG);
- goto out_giant;
+ goto out_doi;
}
dev->si_iosize_max = maxphys;
zsd->zsd_cdev = dev;
@@ -1350,15 +1390,14 @@ zvol_create_minor_impl(const char *name)
ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
- /* XXX do prefetch */
+ /* TODO: prefetch for geom tasting */
zv->zv_objset = NULL;
out_dmu_objset_disown:
dmu_objset_disown(os, B_TRUE, FTAG);
- if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
- if (error == 0)
- zvol_geom_run(zv);
+ if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
+ zvol_geom_run(zv);
g_topology_unlock();
}
out_doi:
@@ -1368,9 +1407,8 @@ out_doi:
zvol_insert(zv);
zvol_minors++;
rw_exit(&zvol_state_lock);
+ ZFS_LOG(1, "ZVOL %s created.", name);
}
- ZFS_LOG(1, "ZVOL %s created.", name);
-out_giant:
PICKUP_GIANT();
return (error);
}
@@ -1379,11 +1417,11 @@ static void
zvol_clear_private(zvol_state_t *zv)
{
ASSERT(RW_LOCK_HELD(&zvol_state_lock));
- if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp = zsg->zsg_provider;
- if (pp == NULL) /* XXX when? */
+ if (pp->private == NULL) /* already cleared */
return;
mtx_lock(&zsg->zsg_queue_mtx);
@@ -1391,11 +1429,15 @@ zvol_clear_private(zvol_state_t *zv)
pp->private = NULL;
wakeup_one(&zsg->zsg_queue);
while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
- msleep(&zsg->zsg_state,
- &zsg->zsg_queue_mtx,
+ msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
0, "zvol:w", 0);
mtx_unlock(&zsg->zsg_queue_mtx);
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev = zsd->zsd_cdev;
+
+ dev->si_drv2 = NULL;
}
}
@@ -1403,15 +1445,17 @@ static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
zv->zv_volsize = volsize;
- if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
struct g_provider *pp = zsg->zsg_provider;
- if (pp == NULL) /* XXX when? */
- return (0);
-
g_topology_lock();
+ if (pp->private == NULL) {
+ g_topology_unlock();
+ return (SET_ERROR(ENXIO));
+ }
+
/*
* Do not invoke resize event when initial size was zero.
* ZVOL initializes the size on first open, this is not
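Most of the zvol_os.c churn is about lock discipline in the open paths: both zvol_geom_open() and zvol_cdev_open() must hold spa_namespace_lock across a first open, but they only discover the need after taking zvol_state_lock, so they mutex_tryenter() and, on failure, drop everything, take the namespace lock outright, and goto retry; the error labels are restructured (out_opened, out_zv_locked, out_locked) so the unwind releases locks in the right order. Separately, a zso_dying flag plus zvol_wait_close() lets teardown msleep() on the zv pointer (bounded at 10*hz) until the last close, or a failed open that drops the count back to zero, calls wakeup(zv). A userland analogue of the try-then-retry lock ordering (pthreads; "coarse" and "fine" stand in for spa_namespace_lock and zvol_state_lock, the names are not from the diff):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t coarse = PTHREAD_MUTEX_INITIALIZER; /* "namespace" */
static pthread_mutex_t fine = PTHREAD_MUTEX_INITIALIZER;   /* "state" */

/* Acquire 'fine', and 'coarse' as well, without ever blocking on
 * 'coarse' while 'fine' is held (that would invert the lock order).
 * The caller releases both with pthread_mutex_unlock(). */
static void
lock_fine_and_coarse(void)
{
        bool have_coarse = false;

        for (;;) {
                pthread_mutex_lock(&fine);
                if (have_coarse || pthread_mutex_trylock(&coarse) == 0)
                        return;                 /* both locks held */
                /* Would have to block on 'coarse': back off and retry. */
                pthread_mutex_unlock(&fine);
                pthread_mutex_lock(&coarse);    /* safe to block here */
                have_coarse = true;             /* keep it across the retry */
        }
}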
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
index fafadffc751c..e8d89bfeabe5 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
@@ -28,6 +28,9 @@
#include <sys/kmem.h>
#include <sys/tsd.h>
#include <sys/trace_spl.h>
+#ifdef HAVE_CPU_HOTPLUG
+#include <linux/cpuhotplug.h>
+#endif
int spl_taskq_thread_bind = 0;
module_param(spl_taskq_thread_bind, int, 0644);
@@ -35,7 +38,7 @@ MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
int spl_taskq_thread_dynamic = 1;
-module_param(spl_taskq_thread_dynamic, int, 0644);
+module_param(spl_taskq_thread_dynamic, int, 0444);
MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads");
int spl_taskq_thread_priority = 1;
@@ -59,6 +62,11 @@ EXPORT_SYMBOL(system_delay_taskq);
static taskq_t *dynamic_taskq;
static taskq_thread_t *taskq_thread_create(taskq_t *);
+#ifdef HAVE_CPU_HOTPLUG
+/* Multi-callback id for cpu hotplugging. */
+static int spl_taskq_cpuhp_state;
+#endif
+
/* List of all taskqs */
LIST_HEAD(tq_list);
struct rw_semaphore tq_list_sem;
@@ -1024,13 +1032,14 @@ taskq_thread_create(taskq_t *tq)
}
taskq_t *
-taskq_create(const char *name, int nthreads, pri_t pri,
+taskq_create(const char *name, int threads_arg, pri_t pri,
int minalloc, int maxalloc, uint_t flags)
{
taskq_t *tq;
taskq_thread_t *tqt;
int count = 0, rc = 0, i;
unsigned long irqflags;
+ int nthreads = threads_arg;
ASSERT(name != NULL);
ASSERT(minalloc >= 0);
@@ -1041,15 +1050,27 @@ taskq_create(const char *name, int nthreads, pri_t pri,
if (flags & TASKQ_THREADS_CPU_PCT) {
ASSERT(nthreads <= 100);
ASSERT(nthreads >= 0);
- nthreads = MIN(nthreads, 100);
+ nthreads = MIN(threads_arg, 100);
nthreads = MAX(nthreads, 0);
- nthreads = MAX((num_online_cpus() * nthreads) / 100, 1);
+ nthreads = MAX((num_online_cpus() * nthreads) /100, 1);
}
tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE);
if (tq == NULL)
return (NULL);
+ tq->tq_hp_support = B_FALSE;
+#ifdef HAVE_CPU_HOTPLUG
+ if (flags & TASKQ_THREADS_CPU_PCT) {
+ tq->tq_hp_support = B_TRUE;
+ if (cpuhp_state_add_instance_nocalls(spl_taskq_cpuhp_state,
+ &tq->tq_hp_cb_node) != 0) {
+ kmem_free(tq, sizeof (*tq));
+ return (NULL);
+ }
+ }
+#endif
+
spin_lock_init(&tq->tq_lock);
INIT_LIST_HEAD(&tq->tq_thread_list);
INIT_LIST_HEAD(&tq->tq_active_list);
@@ -1058,6 +1079,7 @@ taskq_create(const char *name, int nthreads, pri_t pri,
tq->tq_nthreads = 0;
tq->tq_nspawn = 0;
tq->tq_maxthreads = nthreads;
+ tq->tq_cpu_pct = threads_arg;
tq->tq_pri = pri;
tq->tq_minalloc = minalloc;
tq->tq_maxalloc = maxalloc;
@@ -1131,6 +1153,12 @@ taskq_destroy(taskq_t *tq)
tq->tq_flags &= ~TASKQ_ACTIVE;
spin_unlock_irqrestore(&tq->tq_lock, flags);
+#ifdef HAVE_CPU_HOTPLUG
+ if (tq->tq_hp_support) {
+ VERIFY0(cpuhp_state_remove_instance_nocalls(
+ spl_taskq_cpuhp_state, &tq->tq_hp_cb_node));
+ }
+#endif
/*
* When TASKQ_ACTIVE is clear new tasks may not be added nor may
* new worker threads be spawned for dynamic taskq.
@@ -1198,7 +1226,6 @@ taskq_destroy(taskq_t *tq)
}
EXPORT_SYMBOL(taskq_destroy);
-
static unsigned int spl_taskq_kick = 0;
/*
@@ -1255,12 +1282,96 @@ module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint,
MODULE_PARM_DESC(spl_taskq_kick,
"Write nonzero to kick stuck taskqs to spawn more threads");
+#ifdef HAVE_CPU_HOTPLUG
+/*
+ * This callback will be called exactly once for each core that comes online,
+ * for each dynamic taskq. We attempt to expand taskqs that have
+ * TASKQ_THREADS_CPU_PCT set. We need to redo the percentage calculation every
+ * time, to correctly determine whether or not to add a thread.
+ */
+static int
+spl_taskq_expand(unsigned int cpu, struct hlist_node *node)
+{
+ taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node);
+ unsigned long flags;
+ int err = 0;
+
+ ASSERT(tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+ ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
+ int nthreads = MIN(tq->tq_cpu_pct, 100);
+ nthreads = MAX(((num_online_cpus() + 1) * nthreads) / 100, 1);
+ tq->tq_maxthreads = nthreads;
+
+ if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) &&
+ tq->tq_maxthreads > tq->tq_nthreads) {
+ ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads + 1);
+ taskq_thread_t *tqt = taskq_thread_create(tq);
+ if (tqt == NULL)
+ err = -1;
+ }
+
+out:
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return (err);
+}
+
+/*
+ * While we don't support offlining CPUs, it is possible that CPUs will fail
+ * to online successfully. We do need to be able to handle this case
+ * gracefully.
+ */
+static int
+spl_taskq_prepare_down(unsigned int cpu, struct hlist_node *node)
+{
+ taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node);
+ unsigned long flags;
+
+ ASSERT(tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+ ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
+ int nthreads = MIN(tq->tq_cpu_pct, 100);
+ nthreads = MAX(((num_online_cpus()) * nthreads) / 100, 1);
+ tq->tq_maxthreads = nthreads;
+
+ if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) &&
+ tq->tq_maxthreads < tq->tq_nthreads) {
+ ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads - 1);
+ taskq_thread_t *tqt = list_entry(tq->tq_thread_list.next,
+ taskq_thread_t, tqt_thread_list);
+ struct task_struct *thread = tqt->tqt_thread;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ kthread_stop(thread);
+
+ return (0);
+ }
+
+out:
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return (0);
+}
+#endif
+
int
spl_taskq_init(void)
{
init_rwsem(&tq_list_sem);
tsd_create(&taskq_tsd, NULL);
+#ifdef HAVE_CPU_HOTPLUG
+ spl_taskq_cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+ "fs/spl_taskq:online", spl_taskq_expand, spl_taskq_prepare_down);
+#endif
+
system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
if (system_taskq == NULL)
@@ -1269,6 +1380,9 @@ spl_taskq_init(void)
system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4),
maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
if (system_delay_taskq == NULL) {
+#ifdef HAVE_CPU_HOTPLUG
+ cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
+#endif
taskq_destroy(system_taskq);
return (1);
}
@@ -1276,6 +1390,9 @@ spl_taskq_init(void)
dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
if (dynamic_taskq == NULL) {
+#ifdef HAVE_CPU_HOTPLUG
+ cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
+#endif
taskq_destroy(system_taskq);
taskq_destroy(system_delay_taskq);
return (1);
@@ -1304,4 +1421,9 @@ spl_taskq_fini(void)
system_taskq = NULL;
tsd_destroy(&taskq_tsd);
+
+#ifdef HAVE_CPU_HOTPLUG
+ cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
+ spl_taskq_cpuhp_state = 0;
+#endif
}
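The taskq changes wire TASKQ_THREADS_CPU_PCT taskqs into the kernel's CPU-hotplug multi-instance machinery: spl_taskq_init() registers an online callback (spl_taskq_expand) and a prepare-down callback (spl_taskq_prepare_down) via cpuhp_setup_state_multi(), each percentage-based taskq adds itself as an instance at create time with cpuhp_state_add_instance_nocalls(), and the callbacks recompute tq_maxthreads from the saved tq_cpu_pct whenever the online CPU count changes. A minimal skeleton of that registration pattern (kernel build assumed; my_state, my_online and my_prepare_down are illustrative names, not part of this diff):

#include <linux/cpuhotplug.h>
#include <linux/init.h>
#include <linux/module.h>

static int my_state;

static int
my_online(unsigned int cpu, struct hlist_node *node)
{
        /* Re-derive per-instance limits from num_online_cpus() here. */
        return (0);
}

static int
my_prepare_down(unsigned int cpu, struct hlist_node *node)
{
        /* Shrink per-instance limits before the CPU goes away. */
        return (0);
}

static int __init
my_init(void)
{
        /* One dynamic state, shared by every registered instance. */
        my_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
            "example/taskq:online", my_online, my_prepare_down);
        return (my_state < 0 ? my_state : 0);
}

static void __exit
my_exit(void)
{
        cpuhp_remove_multi_state(my_state);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");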
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in b/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in
index 87414d6eacc5..75bec52c94e2 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in
+++ b/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in
@@ -23,8 +23,9 @@ $(MODULE)-objs += ../os/linux/zfs/zfs_dir.o
$(MODULE)-objs += ../os/linux/zfs/zfs_file_os.o
$(MODULE)-objs += ../os/linux/zfs/zfs_ioctl_os.o
$(MODULE)-objs += ../os/linux/zfs/zfs_sysfs.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_uio.o
$(MODULE)-objs += ../os/linux/zfs/zfs_vfsops.o
-$(MODULE)-objs += ../os/linux/zfs/zfs_vnops.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_vnops_os.o
$(MODULE)-objs += ../os/linux/zfs/zfs_znode.o
$(MODULE)-objs += ../os/linux/zfs/zio_crypt.o
$(MODULE)-objs += ../os/linux/zfs/zpl_ctldir.o
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
index c2281449ed12..0abac228447f 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -178,7 +178,7 @@ static struct page *abd_zero_page = NULL;
static kmem_cache_t *abd_cache = NULL;
static kstat_t *abd_ksp;
-static size_t
+static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
index 792c75d46ffe..83d4a3d8496c 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
@@ -48,6 +48,8 @@
#include <sys/vmsystm.h>
#include <sys/zpl.h>
#include <linux/page_compat.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
@@ -73,6 +75,9 @@
*/
int zfs_arc_shrinker_limit = 10000;
+#ifdef CONFIG_MEMORY_HOTPLUG
+static struct notifier_block arc_hotplug_callback_mem_nb;
+#endif
/*
* Return a default max arc size based on the amount of physical memory.
@@ -278,18 +283,9 @@ arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
return (0);
}
-void
-arc_lowmem_init(void)
+static void
+arc_set_sys_free(uint64_t allmem)
{
- uint64_t allmem = arc_all_memory();
-
- /*
- * Register a shrinker to support synchronous (direct) memory
- * reclaim from the arc. This is done to prevent kswapd from
- * swapping out pages when it is preferable to shrink the arc.
- */
- spl_register_shrinker(&arc_shrinker);
-
/*
* The ARC tries to keep at least this much memory available for the
* system. This gives the ARC time to shrink in response to memory
@@ -343,6 +339,20 @@ arc_lowmem_init(void)
}
void
+arc_lowmem_init(void)
+{
+ uint64_t allmem = arc_all_memory();
+
+ /*
+ * Register a shrinker to support synchronous (direct) memory
+ * reclaim from the arc. This is done to prevent kswapd from
+ * swapping out pages when it is preferable to shrink the arc.
+ */
+ spl_register_shrinker(&arc_shrinker);
+ arc_set_sys_free(allmem);
+}
+
+void
arc_lowmem_fini(void)
{
spl_unregister_shrinker(&arc_shrinker);
@@ -375,6 +385,52 @@ param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
return (0);
}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/* ARGSUSED */
+static int
+arc_hotplug_callback(struct notifier_block *self, unsigned long action,
+ void *arg)
+{
+ uint64_t allmem = arc_all_memory();
+ if (action != MEM_ONLINE)
+ return (NOTIFY_OK);
+
+ arc_set_limits(allmem);
+
+#ifdef __LP64__
+ if (zfs_dirty_data_max_max == 0)
+ zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
+ allmem * zfs_dirty_data_max_max_percent / 100);
+#else
+ if (zfs_dirty_data_max_max == 0)
+ zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
+ allmem * zfs_dirty_data_max_max_percent / 100);
+#endif
+
+ arc_set_sys_free(allmem);
+ return (NOTIFY_OK);
+}
+#endif
+
+void
+arc_register_hotplug(void)
+{
+#ifdef CONFIG_MEMORY_HOTPLUG
+ arc_hotplug_callback_mem_nb.notifier_call = arc_hotplug_callback;
+ /* There is no significance to the value 100 */
+ arc_hotplug_callback_mem_nb.priority = 100;
+ register_memory_notifier(&arc_hotplug_callback_mem_nb);
+#endif
+}
+
+void
+arc_unregister_hotplug(void)
+{
+#ifdef CONFIG_MEMORY_HOTPLUG
+ unregister_memory_notifier(&arc_hotplug_callback_mem_nb);
+#endif
+}
#else /* _KERNEL */
int64_t
arc_available_memory(void)
@@ -405,6 +461,16 @@ arc_free_memory(void)
{
return (spa_get_random(arc_all_memory() * 20 / 100));
}
+
+void
+arc_register_hotplug(void)
+{
+}
+
+void
+arc_unregister_hotplug(void)
+{
+}
#endif /* _KERNEL */
/*
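arc_os.c gains a memory-hotplug hook: under CONFIG_MEMORY_HOTPLUG, arc_register_hotplug() installs a notifier that, on MEM_ONLINE, recomputes the ARC limits, zfs_dirty_data_max_max, and the system-free target (arc_set_sys_free()) from the new arc_all_memory() value; arc_unregister_hotplug() removes it, and both become empty stubs outside the kernel or when hotplug is not configured. A stripped-down skeleton of the same registration pattern (kernel build assumed; my_mem_nb and my_mem_callback are illustrative names):

#include <linux/memory.h>
#include <linux/notifier.h>

static int
my_mem_callback(struct notifier_block *self, unsigned long action, void *arg)
{
        if (action != MEM_ONLINE)
                return (NOTIFY_OK);     /* only react to newly-online memory */
        /* Recompute memory-derived limits here. */
        return (NOTIFY_OK);
}

static struct notifier_block my_mem_nb = {
        .notifier_call = my_mem_callback,
        .priority = 100,                /* arbitrary, as the diff itself notes */
};

static void
my_register_hotplug(void)
{
        register_memory_notifier(&my_mem_nb);
}

static void
my_unregister_hotplug(void)
{
        unregister_memory_notifier(&my_mem_nb);
}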
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
index 5267d67eea82..8780d7f6c70a 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/policy.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
@@ -204,7 +204,8 @@ secpolicy_vnode_setdac(const cred_t *cr, uid_t owner)
* Enforced in the Linux VFS.
*/
int
-secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot)
+secpolicy_vnode_setid_retain(struct znode *zp __maybe_unused, const cred_t *cr,
+ boolean_t issuidroot)
{
return (priv_policy_user(cr, CAP_FSETID, EPERM));
}
@@ -271,7 +272,7 @@ void
secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
{
if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&
- secpolicy_vnode_setid_retain(cr,
+ secpolicy_vnode_setid_retain(NULL, cr,
(vap->va_mode & S_ISUID) != 0 &&
(vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) {
vap->va_mask |= AT_MODE;
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
index a54961c76870..4bd27d1b516f 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -94,6 +94,14 @@ bdev_capacity(struct block_device *bdev)
return (i_size_read(bdev->bd_inode));
}
+#if !defined(HAVE_BDEV_WHOLE)
+static inline struct block_device *
+bdev_whole(struct block_device *bdev)
+{
+ return (bdev->bd_contains);
+}
+#endif
+
/*
* Returns the maximum expansion capacity of the block device (in bytes).
*
@@ -118,7 +126,7 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
uint64_t psize;
int64_t available;
- if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
+ if (wholedisk && bdev != bdev_whole(bdev)) {
/*
* When reporting maximum expansion capacity for a wholedisk
* deduct any capacity which is expected to be lost due to
@@ -132,7 +140,7 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
* "reserved" EFI partition: in such cases return the device
* usable capacity.
*/
- available = i_size_read(bdev->bd_contains->bd_inode) -
+ available = i_size_read(bdev_whole(bdev)->bd_inode) -
((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
PARTITION_END_ALIGNMENT) << SECTOR_BITS);
psize = MAX(available, bdev_capacity(bdev));
@@ -192,8 +200,8 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
vd->vd_bdev = NULL;
if (bdev) {
- if (v->vdev_expanding && bdev != bdev->bd_contains) {
- bdevname(bdev->bd_contains, disk_name + 5);
+ if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
+ bdevname(bdev_whole(bdev), disk_name + 5);
/*
* If userland has BLKPG_RESIZE_PARTITION,
* then it should have updated the partition
@@ -468,7 +476,11 @@ vdev_blkg_tryget(struct blkcg_gq *blkg)
this_cpu_inc(*count);
rc = true;
} else {
+#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
+ rc = atomic_long_inc_not_zero(&ref->data->count);
+#else
rc = atomic_long_inc_not_zero(&ref->count);
+#endif
}
rcu_read_unlock_sched();
@@ -787,7 +799,7 @@ vdev_disk_io_done(zio_t *zio)
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
- if (check_disk_change(vd->vd_bdev)) {
+ if (zfs_check_media_change(vd->vd_bdev)) {
invalidate_bdev(vd->vd_bdev);
v->vdev_remove_wanted = B_TRUE;
spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
@@ -822,9 +834,13 @@ vdev_disk_rele(vdev_t *vd)
}
vdev_ops_t vdev_disk_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
.vdev_op_open = vdev_disk_open,
.vdev_op_close = vdev_disk_close,
.vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_disk_io_start,
.vdev_op_io_done = vdev_disk_io_done,
.vdev_op_state_change = NULL,
@@ -833,6 +849,11 @@ vdev_ops_t vdev_disk_ops = {
.vdev_op_rele = vdev_disk_rele,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};
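The vdev_disk.c hunk shows the compat-shim approach used throughout this merge: newer kernels provide bdev_whole() and drop bd_contains, so when configure does not define HAVE_BDEV_WHOLE a local inline recreates it from bdev->bd_contains, and every caller switches to bdev_whole(bdev), with bdev != bdev_whole(bdev) as the "is a partition" test. The same hunk replaces check_disk_change() with the zfs_check_media_change() compat call and fills in the expanded vdev_ops table (the new init/fini, min_asize, min_alloc, rebuild_asize, metaslab_init, config_generate, nparity and ndisks slots). The shim's shape, restated for reference (HAVE_BDEV_WHOLE is a configure-generated macro; bdev_is_partition_example is an illustrative helper, not from the diff):

#include <linux/blkdev.h>

#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
        /* Older kernels: the whole-disk device hangs off bd_contains. */
        return (bdev->bd_contains);
}
#endif

/* Typical use, as in bdev_max_capacity() above: a device is a partition
 * exactly when it is not its own whole disk. */
static inline bool
bdev_is_partition_example(struct block_device *bdev)
{
        return (bdev != bdev_whole(bdev));
}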
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
index 423ce858144c..bf8a13ae6154 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
@@ -305,9 +305,13 @@ vdev_file_io_done(zio_t *zio)
}
vdev_ops_t vdev_file_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
.vdev_op_open = vdev_file_open,
.vdev_op_close = vdev_file_close,
.vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_file_io_start,
.vdev_op_io_done = vdev_file_io_done,
.vdev_op_state_change = NULL,
@@ -316,6 +320,11 @@ vdev_ops_t vdev_file_ops = {
.vdev_op_rele = vdev_file_rele,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};
@@ -341,9 +350,13 @@ vdev_file_fini(void)
#ifndef _KERNEL
vdev_ops_t vdev_disk_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
.vdev_op_open = vdev_file_open,
.vdev_op_close = vdev_file_close,
.vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_file_io_start,
.vdev_op_io_done = vdev_file_io_done,
.vdev_op_state_change = NULL,
@@ -352,6 +365,11 @@ vdev_ops_t vdev_disk_ops = {
.vdev_op_rele = vdev_file_rele,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
index c13a9771235d..a1668e46e4f9 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
@@ -467,7 +467,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
zp->z_unlinked = B_FALSE;
zp->z_atime_dirty = B_FALSE;
zp->z_zn_prefetch = B_FALSE;
- zp->z_moved = B_FALSE;
zp->z_is_sa = B_FALSE;
zp->z_is_mapped = B_FALSE;
zp->z_is_ctldir = B_TRUE;
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
index 36bbd5d0829b..165c1218ae79 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -294,7 +294,7 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr)
} else {
/*
* Sync all ZFS filesystems. This is what happens when you
- * run sync(1M). Unlike other filesystems, ZFS honors the
+ * run sync(1). Unlike other filesystems, ZFS honors the
* request by waiting for all pools to commit all dirty data.
*/
spa_sync_allpools();
@@ -1451,7 +1451,7 @@ int
zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
{
const char *osname = zm->mnt_osname;
- struct inode *root_inode;
+ struct inode *root_inode = NULL;
uint64_t recordsize;
int error = 0;
zfsvfs_t *zfsvfs = NULL;
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c
index b668c7dff013..3be387a30e5c 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c
@@ -240,78 +240,6 @@ zfs_close(struct inode *ip, int flag, cred_t *cr)
return (0);
}
-#if defined(SEEK_HOLE) && defined(SEEK_DATA)
-/*
- * Lseek support for finding holes (cmd == SEEK_HOLE) and
- * data (cmd == SEEK_DATA). "off" is an in/out parameter.
- */
-static int
-zfs_holey_common(struct inode *ip, int cmd, loff_t *off)
-{
- znode_t *zp = ITOZ(ip);
- uint64_t noff = (uint64_t)*off; /* new offset */
- uint64_t file_sz;
- int error;
- boolean_t hole;
-
- file_sz = zp->z_size;
- if (noff >= file_sz) {
- return (SET_ERROR(ENXIO));
- }
-
- if (cmd == SEEK_HOLE)
- hole = B_TRUE;
- else
- hole = B_FALSE;
-
- error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
-
- if (error == ESRCH)
- return (SET_ERROR(ENXIO));
-
- /* file was dirty, so fall back to using generic logic */
- if (error == EBUSY) {
- if (hole)
- *off = file_sz;
-
- return (0);
- }
-
- /*
- * We could find a hole that begins after the logical end-of-file,
- * because dmu_offset_next() only works on whole blocks. If the
- * EOF falls mid-block, then indicate that the "virtual hole"
- * at the end of the file begins at the logical EOF, rather than
- * at the end of the last block.
- */
- if (noff > file_sz) {
- ASSERT(hole);
- noff = file_sz;
- }
-
- if (noff < *off)
- return (error);
- *off = noff;
- return (error);
-}
-
-int
-zfs_holey(struct inode *ip, int cmd, loff_t *off)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- error = zfs_holey_common(ip, cmd, off);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-#endif /* SEEK_HOLE && SEEK_DATA */
-
#if defined(_KERNEL)
/*
* When a file is memory mapped, we must keep the IO data synchronized
@@ -320,10 +248,10 @@ zfs_holey(struct inode *ip, int cmd, loff_t *off)
* On Write: If we find a memory mapped page, we write to *both*
* the page and the dmu buffer.
*/
-static void
-update_pages(struct inode *ip, int64_t start, int len,
- objset_t *os, uint64_t oid)
+void
+update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
{
+ struct inode *ip = ZTOI(zp);
struct address_space *mp = ip->i_mapping;
struct page *pp;
uint64_t nbytes;
@@ -340,8 +268,8 @@ update_pages(struct inode *ip, int64_t start, int len,
flush_dcache_page(pp);
pb = kmap(pp);
- (void) dmu_read(os, oid, start+off, nbytes, pb+off,
- DMU_READ_PREFETCH);
+ (void) dmu_read(os, zp->z_id, start + off, nbytes,
+ pb + off, DMU_READ_PREFETCH);
kunmap(pp);
if (mapping_writably_mapped(mp))
@@ -369,12 +297,12 @@ update_pages(struct inode *ip, int64_t start, int len,
* NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
* the file is memory mapped.
*/
-static int
-mappedread(struct inode *ip, int nbytes, uio_t *uio)
+int
+mappedread(znode_t *zp, int nbytes, uio_t *uio)
{
+ struct inode *ip = ZTOI(zp);
struct address_space *mp = ip->i_mapping;
struct page *pp;
- znode_t *zp = ITOZ(ip);
int64_t start, off;
uint64_t bytes;
int len = nbytes;
@@ -414,575 +342,9 @@ mappedread(struct inode *ip, int nbytes, uio_t *uio)
}
#endif /* _KERNEL */
-unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */
unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
/*
- * Read bytes from specified file into supplied buffer.
- *
- * IN: ip - inode of file to be read from.
- * uio - structure supplying read location, range info,
- * and return buffer.
- * ioflag - O_SYNC flags; used to provide FRSYNC semantics.
- * O_DIRECT flag; used to bypass page cache.
- * cr - credentials of caller.
- *
- * OUT: uio - updated offset and range, buffer filled.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Side Effects:
- * inode - atime updated if byte count > 0
- */
-/* ARGSUSED */
-int
-zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
-{
- int error = 0;
- boolean_t frsync = B_FALSE;
-
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (zp->z_pflags & ZFS_AV_QUARANTINED) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EACCES));
- }
-
- /*
- * Validate file offset
- */
- if (uio->uio_loffset < (offset_t)0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- /*
- * Fasttrack empty reads
- */
- if (uio->uio_resid == 0) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
-#ifdef FRSYNC
- /*
- * If we're in FRSYNC mode, sync out this znode before reading it.
- * Only do this for non-snapshots.
- *
- * Some platforms do not support FRSYNC and instead map it
- * to O_SYNC, which results in unnecessary calls to zil_commit. We
- * only honor FRSYNC requests on platforms which support it.
- */
- frsync = !!(ioflag & FRSYNC);
-#endif
- if (zfsvfs->z_log &&
- (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
- zil_commit(zfsvfs->z_log, zp->z_id);
-
- /*
- * Lock the range against changes.
- */
- zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
- uio->uio_loffset, uio->uio_resid, RL_READER);
-
- /*
- * If we are reading past end-of-file we can skip
- * to the end; but we might still need to set atime.
- */
- if (uio->uio_loffset >= zp->z_size) {
- error = 0;
- goto out;
- }
-
- ASSERT(uio->uio_loffset < zp->z_size);
- ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
- ssize_t start_resid = n;
-
-#ifdef HAVE_UIO_ZEROCOPY
- xuio_t *xuio = NULL;
- if ((uio->uio_extflg == UIO_XUIO) &&
- (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
- int nblk;
- int blksz = zp->z_blksz;
- uint64_t offset = uio->uio_loffset;
-
- xuio = (xuio_t *)uio;
- if ((ISP2(blksz))) {
- nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
- blksz)) / blksz;
- } else {
- ASSERT(offset + n <= blksz);
- nblk = 1;
- }
- (void) dmu_xuio_init(xuio, nblk);
-
- if (vn_has_cached_data(ip)) {
- /*
- * For simplicity, we always allocate a full buffer
- * even if we only expect to read a portion of a block.
- */
- while (--nblk >= 0) {
- (void) dmu_xuio_add(xuio,
- dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz), 0, blksz);
- }
- }
- }
-#endif /* HAVE_UIO_ZEROCOPY */
-
- while (n > 0) {
- ssize_t nbytes = MIN(n, zfs_read_chunk_size -
- P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
-
- if (zp->z_is_mapped && !(ioflag & O_DIRECT)) {
- error = mappedread(ip, nbytes, uio);
- } else {
- error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
- uio, nbytes);
- }
-
- if (error) {
- /* convert checksum errors into IO errors */
- if (error == ECKSUM)
- error = SET_ERROR(EIO);
- break;
- }
-
- n -= nbytes;
- }
-
- int64_t nread = start_resid - n;
- dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
- task_io_account_read(nread);
-out:
- zfs_rangelock_exit(lr);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/*
- * Write the bytes to a file.
- *
- * IN: ip - inode of file to be written to.
- * uio - structure supplying write location, range info,
- * and data buffer.
- * ioflag - O_APPEND flag set if in append mode.
- * O_DIRECT flag; used to bypass page cache.
- * cr - credentials of caller.
- *
- * OUT: uio - updated offset and range.
- *
- * RETURN: 0 if success
- * error code if failure
- *
- * Timestamps:
- * ip - ctime|mtime updated if byte count > 0
- */
-
-/* ARGSUSED */
-int
-zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
-{
- int error = 0;
- ssize_t start_resid = uio->uio_resid;
-
- /*
- * Fasttrack empty write
- */
- ssize_t n = start_resid;
- if (n == 0)
- return (0);
-
- rlim64_t limit = uio->uio_limit;
- if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
- limit = MAXOFFSET_T;
-
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- sa_bulk_attr_t bulk[4];
- int count = 0;
- uint64_t mtime[2], ctime[2];
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
- &zp->z_size, 8);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, 8);
-
- /*
- * Callers might not be able to detect properly that we are read-only,
- * so check it explicitly here.
- */
- if (zfs_is_readonly(zfsvfs)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EROFS));
- }
-
- /*
- * If immutable or not appending then return EPERM
- */
- if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
- ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
- (uio->uio_loffset < zp->z_size))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- /*
- * Validate file offset
- */
- offset_t woff = ioflag & O_APPEND ? zp->z_size : uio->uio_loffset;
- if (woff < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- int max_blksz = zfsvfs->z_max_blksz;
- xuio_t *xuio = NULL;
-
- /*
- * Pre-fault the pages to ensure slow (eg NFS) pages
- * don't hold up txg.
- * Skip this if uio contains loaned arc_buf.
- */
-#ifdef HAVE_UIO_ZEROCOPY
- if ((uio->uio_extflg == UIO_XUIO) &&
- (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
- xuio = (xuio_t *)uio;
- else
-#endif
- if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EFAULT));
- }
-
- /*
- * If in append mode, set the io offset pointer to eof.
- */
- zfs_locked_range_t *lr;
- if (ioflag & O_APPEND) {
- /*
- * Obtain an appending range lock to guarantee file append
- * semantics. We reset the write offset once we have the lock.
- */
- lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
- woff = lr->lr_offset;
- if (lr->lr_length == UINT64_MAX) {
- /*
- * We overlocked the file because this write will cause
- * the file block size to increase.
- * Note that zp_size cannot change with this lock held.
- */
- woff = zp->z_size;
- }
- uio->uio_loffset = woff;
- } else {
- /*
- * Note that if the file block size will change as a result of
- * this write, then this range lock will lock the entire file
- * so that we can re-write the block safely.
- */
- lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
- }
-
- if (woff >= limit) {
- zfs_rangelock_exit(lr);
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EFBIG));
- }
-
- if ((woff + n) > limit || woff > (limit - n))
- n = limit - woff;
-
- /* Will this write extend the file length? */
- int write_eof = (woff + n > zp->z_size);
-
- uint64_t end_size = MAX(zp->z_size, woff + n);
- zilog_t *zilog = zfsvfs->z_log;
-#ifdef HAVE_UIO_ZEROCOPY
- int i_iov = 0;
- const iovec_t *iovp = uio->uio_iov;
- int iovcnt __maybe_unused = uio->uio_iovcnt;
-#endif
-
-
- /*
- * Write the file in reasonable size chunks. Each chunk is written
- * in a separate transaction; this keeps the intent log records small
- * and allows us to do more fine-grained space accounting.
- */
- while (n > 0) {
- woff = uio->uio_loffset;
-
- if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
- KUID_TO_SUID(ip->i_uid)) ||
- zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
- KGID_TO_SGID(ip->i_gid)) ||
- (zp->z_projid != ZFS_DEFAULT_PROJID &&
- zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
- zp->z_projid))) {
- error = SET_ERROR(EDQUOT);
- break;
- }
-
- arc_buf_t *abuf = NULL;
- const iovec_t *aiov = NULL;
- if (xuio) {
-#ifdef HAVE_UIO_ZEROCOPY
- ASSERT(i_iov < iovcnt);
- ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
- aiov = &iovp[i_iov];
- abuf = dmu_xuio_arcbuf(xuio, i_iov);
- dmu_xuio_clear(xuio, i_iov);
- ASSERT((aiov->iov_base == abuf->b_data) ||
- ((char *)aiov->iov_base - (char *)abuf->b_data +
- aiov->iov_len == arc_buf_size(abuf)));
- i_iov++;
-#endif
- } else if (n >= max_blksz && woff >= zp->z_size &&
- P2PHASE(woff, max_blksz) == 0 &&
- zp->z_blksz == max_blksz) {
- /*
- * This write covers a full block. "Borrow" a buffer
- * from the dmu so that we can fill it before we enter
- * a transaction. This avoids the possibility of
- * holding up the transaction if the data copy hangs
- * up on a pagefault (e.g., from an NFS server mapping).
- */
- size_t cbytes;
-
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- max_blksz);
- ASSERT(abuf != NULL);
- ASSERT(arc_buf_size(abuf) == max_blksz);
- if ((error = uiocopy(abuf->b_data, max_blksz,
- UIO_WRITE, uio, &cbytes))) {
- dmu_return_arcbuf(abuf);
- break;
- }
- ASSERT(cbytes == max_blksz);
- }
-
- /*
- * Start a transaction.
- */
- dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
- DB_DNODE_ENTER(db);
- dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
- MIN(n, max_blksz));
- DB_DNODE_EXIT(db);
- zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
- dmu_tx_abort(tx);
- if (abuf != NULL)
- dmu_return_arcbuf(abuf);
- break;
- }
-
- /*
- * If rangelock_enter() over-locked we grow the blocksize
- * and then reduce the lock range. This will only happen
- * on the first iteration since rangelock_reduce() will
- * shrink down lr_length to the appropriate size.
- */
- if (lr->lr_length == UINT64_MAX) {
- uint64_t new_blksz;
-
- if (zp->z_blksz > max_blksz) {
- /*
- * File's blocksize is already larger than the
- * "recordsize" property. Only let it grow to
- * the next power of 2.
- */
- ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end_size,
- 1 << highbit64(zp->z_blksz));
- } else {
- new_blksz = MIN(end_size, max_blksz);
- }
- zfs_grow_blocksize(zp, new_blksz, tx);
- zfs_rangelock_reduce(lr, woff, n);
- }
-
- /*
- * XXX - should we really limit each write to z_max_blksz?
- * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
- */
- ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
-
- ssize_t tx_bytes;
- if (abuf == NULL) {
- tx_bytes = uio->uio_resid;
- uio->uio_fault_disable = B_TRUE;
- error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
- uio, nbytes, tx);
- uio->uio_fault_disable = B_FALSE;
- if (error == EFAULT) {
- dmu_tx_commit(tx);
- /*
- * Account for partial writes before
- * continuing the loop.
- * Update needs to occur before the next
- * uio_prefaultpages, or prefaultpages may
- * error, and we may break the loop early.
- */
- if (tx_bytes != uio->uio_resid)
- n -= tx_bytes - uio->uio_resid;
- if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
- break;
- }
- continue;
- } else if (error != 0) {
- dmu_tx_commit(tx);
- break;
- }
- tx_bytes -= uio->uio_resid;
- } else {
- tx_bytes = nbytes;
- ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
- /*
- * If this is not a full block write, but we are
- * extending the file past EOF and this data starts
- * block-aligned, use assign_arcbuf(). Otherwise,
- * write via dmu_write().
- */
- if (tx_bytes < max_blksz && (!write_eof ||
- aiov->iov_base != abuf->b_data)) {
- ASSERT(xuio);
- dmu_write(zfsvfs->z_os, zp->z_id, woff,
- /* cppcheck-suppress nullPointer */
- aiov->iov_len, aiov->iov_base, tx);
- dmu_return_arcbuf(abuf);
- xuio_stat_wbuf_copied();
- } else {
- ASSERT(xuio || tx_bytes == max_blksz);
- error = dmu_assign_arcbuf_by_dbuf(
- sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
- if (error != 0) {
- dmu_return_arcbuf(abuf);
- dmu_tx_commit(tx);
- break;
- }
- }
- ASSERT(tx_bytes <= uio->uio_resid);
- uioskip(uio, tx_bytes);
- }
- if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) {
- update_pages(ip, woff,
- tx_bytes, zfsvfs->z_os, zp->z_id);
- }
-
- /*
- * If we made no progress, we're done. If we made even
- * partial progress, update the znode and ZIL accordingly.
- */
- if (tx_bytes == 0) {
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
- (void *)&zp->z_size, sizeof (uint64_t), tx);
- dmu_tx_commit(tx);
- ASSERT(error != 0);
- break;
- }
-
- /*
- * Clear Set-UID/Set-GID bits on successful write if not
- * privileged and at least one of the execute bits is set.
- *
- * It would be nice to do this after all writes have
- * been done, but that would still expose the ISUID/ISGID
- * to another app after the partial write is committed.
- *
- * Note: we don't call zfs_fuid_map_id() here because
- * user 0 is not an ephemeral uid.
- */
- mutex_enter(&zp->z_acl_lock);
- uint32_t uid = KUID_TO_SUID(ip->i_uid);
- if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
- (S_IXUSR >> 6))) != 0 &&
- (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
- secpolicy_vnode_setid_retain(cr,
- ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
- uint64_t newmode;
- zp->z_mode &= ~(S_ISUID | S_ISGID);
- ip->i_mode = newmode = zp->z_mode;
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
- (void *)&newmode, sizeof (uint64_t), tx);
- }
- mutex_exit(&zp->z_acl_lock);
-
- zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
-
- /*
- * Update the file size (zp_size) if it has changed;
- * account for possible concurrent updates.
- */
- while ((end_size = zp->z_size) < uio->uio_loffset) {
- (void) atomic_cas_64(&zp->z_size, end_size,
- uio->uio_loffset);
- ASSERT(error == 0);
- }
- /*
- * If we are replaying and eof is non zero then force
- * the file size to the specified eof. Note, there's no
- * concurrency during replay.
- */
- if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
- zp->z_size = zfsvfs->z_replay_eof;
-
- error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
-
- zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
- NULL, NULL);
- dmu_tx_commit(tx);
-
- if (error != 0)
- break;
- ASSERT(tx_bytes == nbytes);
- n -= nbytes;
-
- if (!xuio && n > 0) {
- if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
- error = EFAULT;
- break;
- }
- }
- }
-
- zfs_inode_update(zp);
- zfs_rangelock_exit(lr);
-
- /*
- * If we're in replay mode, or we made no progress, return error.
- * Otherwise, it's at least a partial write, so it's successful.
- */
- if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (ioflag & (O_SYNC | O_DSYNC) ||
- zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, zp->z_id);
-
- int64_t nwritten = start_resid - uio->uio_resid;
- dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
- task_io_account_write(nwritten);
-
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*
* Write the bytes to a file.
*
* IN: zp - znode of file to be written to
@@ -993,37 +355,40 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
* OUT: resid - remaining bytes to write
*
* RETURN: 0 if success
- * positive error code if failure
+ * positive error code if failure. EIO is returned
+ * for a short write when residp isn't provided.
*
* Timestamps:
* zp - ctime|mtime updated if byte count > 0
*/
int
zfs_write_simple(znode_t *zp, const void *data, size_t len,
- loff_t pos, size_t *resid)
+ loff_t pos, size_t *residp)
{
- ssize_t written;
- int error = 0;
+ fstrans_cookie_t cookie;
+ int error;
- written = zpl_write_common(ZTOI(zp), data, len, &pos,
- UIO_SYSSPACE, 0, kcred);
- if (written < 0) {
- error = -written;
- } else if (resid == NULL) {
- if (written < len)
- error = SET_ERROR(EIO); /* short write */
- } else {
- *resid = len - written;
+ struct iovec iov;
+ iov.iov_base = (void *)data;
+ iov.iov_len = len;
+
+ uio_t uio;
+ uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
+
+ cookie = spl_fstrans_mark();
+ error = zfs_write(zp, &uio, 0, kcred);
+ spl_fstrans_unmark(cookie);
+
+ if (error == 0) {
+ if (residp != NULL)
+ *residp = uio_resid(&uio);
+ else if (uio_resid(&uio) != 0)
+ error = SET_ERROR(EIO);
}
+
return (error);
}
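/*
 * A minimal, self-contained sketch of the residual convention the new
 * zfs_write_simple() follows: when a residp pointer is supplied the
 * remaining byte count is handed back to the caller, otherwise a short
 * write is turned into EIO.  model_write_result() is an illustrative
 * name for this sketch, not an OpenZFS function.
 */
#include <errno.h>
#include <stddef.h>

static int
model_write_result(size_t resid, size_t *residp)
{
	if (residp != NULL) {
		*residp = resid;	/* caller decides how to handle it */
		return (0);
	}
	return (resid != 0 ? EIO : 0);	/* no residp: short write is EIO */
}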
-/*
- * Drop a reference on the passed inode asynchronously. This ensures
- * that the caller will never drop the last reference on an inode in
- * the current context. Doing so while holding open a tx could result
- * in a deadlock if iput_final() re-enters the filesystem code.
- */
void
zfs_zrele_async(znode_t *zp)
{
@@ -1040,179 +405,6 @@ zfs_zrele_async(znode_t *zp)
zrele(zp);
}
-/* ARGSUSED */
-static void
-zfs_get_done(zgd_t *zgd, int error)
-{
- znode_t *zp = zgd->zgd_private;
-
- if (zgd->zgd_db)
- dmu_buf_rele(zgd->zgd_db, zgd);
-
- zfs_rangelock_exit(zgd->zgd_lr);
-
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- zfs_zrele_async(zp);
-
- kmem_free(zgd, sizeof (zgd_t));
-}
-
-#ifdef ZFS_DEBUG
-static int zil_fault_io = 0;
-#endif
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
-{
- zfsvfs_t *zfsvfs = arg;
- objset_t *os = zfsvfs->z_os;
- znode_t *zp;
- uint64_t object = lr->lr_foid;
- uint64_t offset = lr->lr_offset;
- uint64_t size = lr->lr_length;
- dmu_buf_t *db;
- zgd_t *zgd;
- int error = 0;
-
- ASSERT3P(lwb, !=, NULL);
- ASSERT3P(zio, !=, NULL);
- ASSERT3U(size, !=, 0);
-
- /*
- * Nothing to do if the file has been removed
- */
- if (zfs_zget(zfsvfs, object, &zp) != 0)
- return (SET_ERROR(ENOENT));
- if (zp->z_unlinked) {
- /*
- * Release the vnode asynchronously as we currently have the
- * txg stopped from syncing.
- */
- zfs_zrele_async(zp);
- return (SET_ERROR(ENOENT));
- }
-
- zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
- zgd->zgd_lwb = lwb;
- zgd->zgd_private = zp;
-
- /*
- * Write records come in two flavors: immediate and indirect.
- * For small writes it's cheaper to store the data with the
- * log record (immediate); for large writes it's cheaper to
- * sync the data and get a pointer to it (indirect) so that
- * we don't have to write the data twice.
- */
- if (buf != NULL) { /* immediate write */
- zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
- offset, size, RL_READER);
- /* test for truncation needs to be done while range locked */
- if (offset >= zp->z_size) {
- error = SET_ERROR(ENOENT);
- } else {
- error = dmu_read(os, object, offset, size, buf,
- DMU_READ_NO_PREFETCH);
- }
- ASSERT(error == 0 || error == ENOENT);
- } else { /* indirect write */
- /*
- * Have to lock the whole block to ensure when it's
- * written out and its checksum is being calculated
- * that no one can change the data. We need to re-check
- * blocksize after we get the lock in case it's changed!
- */
- for (;;) {
- uint64_t blkoff;
- size = zp->z_blksz;
- blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
- offset -= blkoff;
- zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
- offset, size, RL_READER);
- if (zp->z_blksz == size)
- break;
- offset += blkoff;
- zfs_rangelock_exit(zgd->zgd_lr);
- }
- /* test for truncation needs to be done while range locked */
- if (lr->lr_offset >= zp->z_size)
- error = SET_ERROR(ENOENT);
-#ifdef ZFS_DEBUG
- if (zil_fault_io) {
- error = SET_ERROR(EIO);
- zil_fault_io = 0;
- }
-#endif
- if (error == 0)
- error = dmu_buf_hold(os, object, offset, zgd, &db,
- DMU_READ_NO_PREFETCH);
-
- if (error == 0) {
- blkptr_t *bp = &lr->lr_blkptr;
-
- zgd->zgd_db = db;
- zgd->zgd_bp = bp;
-
- ASSERT(db->db_offset == offset);
- ASSERT(db->db_size == size);
-
- error = dmu_sync(zio, lr->lr_common.lrc_txg,
- zfs_get_done, zgd);
- ASSERT(error || lr->lr_length <= size);
-
- /*
- * On success, we need to wait for the write I/O
- * initiated by dmu_sync() to complete before we can
- * release this dbuf. We will finish everything up
- * in the zfs_get_done() callback.
- */
- if (error == 0)
- return (0);
-
- if (error == EALREADY) {
- lr->lr_common.lrc_txtype = TX_WRITE2;
- /*
- * TX_WRITE2 relies on the data previously
- * written by the TX_WRITE that caused
- * EALREADY. We zero out the BP because
- * it is the old, currently-on-disk BP.
- */
- zgd->zgd_bp = NULL;
- BP_ZERO(bp);
- error = 0;
- }
- }
- }
-
- zfs_get_done(zgd, error);
-
- return (error);
-}
-
-/*ARGSUSED*/
-int
-zfs_access(struct inode *ip, int mode, int flag, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (flag & V_ACE_MASK)
- error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
- else
- error = zfs_zaccess_rwx(zp, mode, flag, cr);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
/*
* Lookup an entry in a directory, or an extended attribute directory.
@@ -2440,26 +1632,6 @@ out:
return (error);
}
-ulong_t zfs_fsync_sync_cnt = 4;
-
-int
-zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
-
- (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
-
- if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
- }
- tsd_set(zfs_fsyncer_key, NULL);
-
- return (0);
-}
-
/*
* Get the basic file attributes and place them in the provided kstat
* structure. The inode is assumed to be the authoritative source
@@ -4796,207 +3968,9 @@ zfs_fid(struct inode *ip, fid_t *fidp)
return (0);
}
-/*ARGSUSED*/
-int
-zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- int error;
- boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- error = zfs_getacl(zp, vsecp, skipaclchk, cr);
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/*ARGSUSED*/
-int
-zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
-{
- zfsvfs_t *zfsvfs = ZTOZSB(zp);
- int error;
- boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
- zilog_t *zilog = zfsvfs->z_log;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- error = zfs_setacl(zp, vsecp, skipaclchk, cr);
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zilog, 0);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-#ifdef HAVE_UIO_ZEROCOPY
-/*
- * The smallest read we may consider to loan out an arcbuf.
- * This must be a power of 2.
- */
-int zcr_blksz_min = (1 << 10); /* 1K */
-/*
- * If set to less than the file block size, allow loaning out of an
- * arcbuf for a partial block read. This must be a power of 2.
- */
-int zcr_blksz_max = (1 << 17); /* 128K */
-
-/*ARGSUSED*/
-static int
-zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr)
-{
- znode_t *zp = ITOZ(ip);
- zfsvfs_t *zfsvfs = ITOZSB(ip);
- int max_blksz = zfsvfs->z_max_blksz;
- uio_t *uio = &xuio->xu_uio;
- ssize_t size = uio->uio_resid;
- offset_t offset = uio->uio_loffset;
- int blksz;
- int fullblk, i;
- arc_buf_t *abuf;
- ssize_t maxsize;
- int preamble, postamble;
-
- if (xuio->xu_type != UIOTYPE_ZEROCOPY)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- switch (ioflag) {
- case UIO_WRITE:
- /*
- * Loan out an arc_buf for write if write size is bigger than
- * max_blksz, and the file's block size is also max_blksz.
- */
- blksz = max_blksz;
- if (size < blksz || zp->z_blksz != blksz) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
- /*
- * Caller requests buffers for write before knowing where the
- * write offset might be (e.g. NFS TCP write).
- */
- if (offset == -1) {
- preamble = 0;
- } else {
- preamble = P2PHASE(offset, blksz);
- if (preamble) {
- preamble = blksz - preamble;
- size -= preamble;
- }
- }
-
- postamble = P2PHASE(size, blksz);
- size -= postamble;
-
- fullblk = size / blksz;
- (void) dmu_xuio_init(xuio,
- (preamble != 0) + fullblk + (postamble != 0));
-
- /*
- * Have to fix iov base/len for partial buffers. They
- * currently represent full arc_buf's.
- */
- if (preamble) {
- /* data begins in the middle of the arc_buf */
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf,
- blksz - preamble, preamble);
- }
-
- for (i = 0; i < fullblk; i++) {
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf, 0, blksz);
- }
-
- if (postamble) {
- /* data ends in the middle of the arc_buf */
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf, 0, postamble);
- }
- break;
- case UIO_READ:
- /*
- * Loan out an arc_buf for read if the read size is larger than
- * the current file block size. Block alignment is not
- * considered. Partial arc_buf will be loaned out for read.
- */
- blksz = zp->z_blksz;
- if (blksz < zcr_blksz_min)
- blksz = zcr_blksz_min;
- if (blksz > zcr_blksz_max)
- blksz = zcr_blksz_max;
- /* avoid potential complexity of dealing with it */
- if (blksz > max_blksz) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- maxsize = zp->z_size - uio->uio_loffset;
- if (size > maxsize)
- size = maxsize;
-
- if (size < blksz) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
- break;
- default:
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- uio->uio_extflg = UIO_XUIO;
- XUIO_XUZC_RW(xuio) = ioflag;
- ZFS_EXIT(zfsvfs);
- return (0);
-}
-
-/*ARGSUSED*/
-static int
-zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr)
-{
- int i;
- arc_buf_t *abuf;
- int ioflag = XUIO_XUZC_RW(xuio);
-
- ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
-
- i = dmu_xuio_cnt(xuio);
- while (i-- > 0) {
- abuf = dmu_xuio_arcbuf(xuio, i);
- /*
- * if abuf == NULL, it must be a write buffer
- * that has been returned in zfs_write().
- */
- if (abuf)
- dmu_return_arcbuf(abuf);
- ASSERT(abuf || ioflag == UIO_WRITE);
- }
-
- dmu_xuio_fini(xuio);
- return (0);
-}
-#endif /* HAVE_UIO_ZEROCOPY */
-
#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_open);
EXPORT_SYMBOL(zfs_close);
-EXPORT_SYMBOL(zfs_read);
-EXPORT_SYMBOL(zfs_write);
-EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_lookup);
EXPORT_SYMBOL(zfs_create);
EXPORT_SYMBOL(zfs_tmpfile);
@@ -5004,7 +3978,6 @@ EXPORT_SYMBOL(zfs_remove);
EXPORT_SYMBOL(zfs_mkdir);
EXPORT_SYMBOL(zfs_rmdir);
EXPORT_SYMBOL(zfs_readdir);
-EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_getattr_fast);
EXPORT_SYMBOL(zfs_setattr);
EXPORT_SYMBOL(zfs_rename);
@@ -5014,8 +3987,6 @@ EXPORT_SYMBOL(zfs_link);
EXPORT_SYMBOL(zfs_inactive);
EXPORT_SYMBOL(zfs_space);
EXPORT_SYMBOL(zfs_fid);
-EXPORT_SYMBOL(zfs_getsecattr);
-EXPORT_SYMBOL(zfs_setsecattr);
EXPORT_SYMBOL(zfs_getpage);
EXPORT_SYMBOL(zfs_putpage);
EXPORT_SYMBOL(zfs_dirty_inode);
@@ -5024,8 +3995,6 @@ EXPORT_SYMBOL(zfs_map);
/* BEGIN CSTYLED */
module_param(zfs_delete_blocks, ulong, 0644);
MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
-module_param(zfs_read_chunk_size, ulong, 0644);
-MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk");
/* END CSTYLED */
#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
index a542c662cb15..b33594488ee0 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
@@ -134,7 +134,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
zp->z_acl_cached = NULL;
zp->z_xattr_cached = NULL;
zp->z_xattr_parent = 0;
- zp->z_moved = B_FALSE;
return (0);
}
@@ -505,6 +504,7 @@ zfs_inode_update(znode_t *zp)
dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
spin_lock(&ip->i_lock);
+ ip->i_mode = zp->z_mode;
ip->i_blocks = i_blocks;
i_size_write(ip, zp->z_size);
spin_unlock(&ip->i_lock);
@@ -546,7 +546,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
ASSERT3P(zp->z_xattr_cached, ==, NULL);
zp->z_unlinked = B_FALSE;
zp->z_atime_dirty = B_FALSE;
- zp->z_moved = B_FALSE;
zp->z_is_mapped = B_FALSE;
zp->z_is_ctldir = B_FALSE;
zp->z_is_stale = B_FALSE;
@@ -619,7 +618,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
mutex_enter(&zfsvfs->z_znodes_lock);
list_insert_tail(&zfsvfs->z_all_znodes, zp);
zfsvfs->z_nr_znodes++;
- membar_producer();
mutex_exit(&zfsvfs->z_znodes_lock);
unlock_new_inode(ip);
@@ -1901,7 +1899,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
rootzp->z_unlinked = B_FALSE;
rootzp->z_atime_dirty = B_FALSE;
- rootzp->z_moved = B_FALSE;
rootzp->z_is_sa = USE_SA(version, os);
rootzp->z_pflags = 0;
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c
index 96dabe55a138..8106359e1c77 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c
@@ -1198,6 +1198,16 @@ zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
/*
+ * This is necessary here as we check next whether
+ * OBJSET_FLAG_USERACCOUNTING_COMPLETE or
+ * OBJSET_FLAG_USEROBJACCOUNTING are set in order to
+ * decide if the local_mac should be zeroed out.
+ */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+
+ /*
* The local MAC protects the user, group and project accounting.
* If these objects are not present, the local MAC is zeroed out.
*/
@@ -1208,7 +1218,10 @@ zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
(datalen >= OBJSET_PHYS_SIZE_V2 &&
osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
- (datalen <= OBJSET_PHYS_SIZE_V1)) {
+ (datalen <= OBJSET_PHYS_SIZE_V1) ||
+ (((intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE) == 0 ||
+ (intval & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE) == 0) &&
+ key->zk_version > 0)) {
bzero(local_mac, ZIO_OBJSET_MAC_LEN);
return (0);
}
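/*
 * A small userspace model of the extended "zero the local MAC" test above:
 * with a key version greater than zero, an objset whose user or user-object
 * accounting is not yet marked complete also gets a zeroed local MAC.  The
 * flag bits and the function name below are stand-ins for illustration.
 */
#include <stdbool.h>
#include <stdint.h>

#define	ACCT_COMPLETE		(1ULL << 0)	/* stand-in flag bits */
#define	OBJACCT_COMPLETE	(1ULL << 1)

static bool
local_mac_is_zeroed(bool acct_objects_absent, uint64_t os_flags,
    uint64_t key_version)
{
	bool acct_incomplete = (os_flags & ACCT_COMPLETE) == 0 ||
	    (os_flags & OBJACCT_COMPLETE) == 0;

	return (acct_objects_absent || (acct_incomplete && key_version > 0));
}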
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
index fa4500f6f8d1..e6420f19ed87 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
@@ -55,7 +55,7 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx)
zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
int error = 0;
- ZFS_ENTER(zfsvfs);
+ ZPL_ENTER(zfsvfs);
if (!zpl_dir_emit_dots(filp, ctx))
goto out;
@@ -76,7 +76,7 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx)
ctx->pos++;
}
out:
- ZFS_EXIT(zfsvfs);
+ ZPL_EXIT(zfsvfs);
return (error);
}
@@ -242,13 +242,14 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx)
uint64_t id, pos;
int error = 0;
- ZFS_ENTER(zfsvfs);
+ ZPL_ENTER(zfsvfs);
cookie = spl_fstrans_mark();
if (!zpl_dir_emit_dots(filp, ctx))
goto out;
- pos = ctx->pos;
+ /* Start the position at 0 if it already emitted . and .. */
+ pos = (ctx->pos == 2 ? 0 : ctx->pos);
while (error == 0) {
dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN,
@@ -265,7 +266,7 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx)
}
out:
spl_fstrans_unmark(cookie);
- ZFS_EXIT(zfsvfs);
+ ZPL_EXIT(zfsvfs);
if (error == -ENOENT)
return (0);
@@ -368,13 +369,13 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat,
struct inode *ip = path->dentry->d_inode;
zfsvfs_t *zfsvfs = ITOZSB(ip);
- ZFS_ENTER(zfsvfs);
+ ZPL_ENTER(zfsvfs);
generic_fillattr(ip, stat);
stat->nlink = stat->size = 2;
stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
stat->atime = current_time(ip);
- ZFS_EXIT(zfsvfs);
+ ZPL_EXIT(zfsvfs);
return (0);
}
@@ -452,7 +453,7 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx)
znode_t *dzp;
int error = 0;
- ZFS_ENTER(zfsvfs);
+ ZPL_ENTER(zfsvfs);
cookie = spl_fstrans_mark();
if (zfsvfs->z_shares_dir == 0) {
@@ -471,7 +472,7 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx)
iput(ZTOI(dzp));
out:
spl_fstrans_unmark(cookie);
- ZFS_EXIT(zfsvfs);
+ ZPL_EXIT(zfsvfs);
ASSERT3S(error, <=, 0);
return (error);
@@ -502,13 +503,13 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
znode_t *dzp;
int error;
- ZFS_ENTER(zfsvfs);
+ ZPL_ENTER(zfsvfs);
if (zfsvfs->z_shares_dir == 0) {
generic_fillattr(path->dentry->d_inode, stat);
stat->nlink = stat->size = 2;
stat->atime = current_time(ip);
- ZFS_EXIT(zfsvfs);
+ ZPL_EXIT(zfsvfs);
return (0);
}
@@ -518,7 +519,7 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
iput(ZTOI(dzp));
}
- ZFS_EXIT(zfsvfs);
+ ZPL_EXIT(zfsvfs);
ASSERT3S(error, <=, 0);
return (error);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
index 51e189a87272..9e08c94e2147 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -212,244 +212,221 @@ zfs_io_flags(struct kiocb *kiocb)
return (flags);
}
-static ssize_t
-zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
- unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
- cred_t *cr, size_t skip)
+/*
+ * If relatime is enabled, call file_accessed() if zfs_relatime_need_update()
+ * is true. This is needed since datasets with inherited "relatime" property
+ * aren't necessarily mounted with the MNT_RELATIME flag (e.g. after
+ * `zfs set relatime=...`), which is what the relatime test done in the
+ * VFS by relatime_need_update() is based on.
+ */
+static inline void
+zpl_file_accessed(struct file *filp)
{
- ssize_t read;
- uio_t uio = { { 0 }, 0 };
- int error;
- fstrans_cookie_t cookie;
-
- uio.uio_iov = iovp;
- uio.uio_iovcnt = nr_segs;
- uio.uio_loffset = *ppos;
- uio.uio_segflg = segment;
- uio.uio_limit = MAXOFFSET_T;
- uio.uio_resid = count;
- uio.uio_skip = skip;
-
- cookie = spl_fstrans_mark();
- error = -zfs_read(ip, &uio, flags, cr);
- spl_fstrans_unmark(cookie);
- if (error < 0)
- return (error);
-
- read = count - uio.uio_resid;
- *ppos += read;
+ struct inode *ip = filp->f_mapping->host;
- return (read);
+ if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
+ if (zfs_relatime_need_update(ip))
+ file_accessed(filp);
+ } else {
+ file_accessed(filp);
+ }
}
-inline ssize_t
-zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
- uio_seg_t segment, int flags, cred_t *cr)
-{
- struct iovec iov;
-
- iov.iov_base = (void *)buf;
- iov.iov_len = len;
+#if defined(HAVE_VFS_RW_ITERATE)
- return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment,
- flags, cr, 0));
+/*
+ * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports
+ * iovecs, kvecs, bvecs and pipes, plus all the required interfaces to
+ * manipulate the iov_iter are available. In which case the full iov_iter
+ * can be attached to the uio and correctly handled in the lower layers.
+ * Otherwise, for older kernels extract the iovec and pass it instead.
+ */
+static void
+zpl_uio_init(uio_t *uio, struct kiocb *kiocb, struct iov_iter *to,
+ loff_t pos, ssize_t count, size_t skip)
+{
+#if defined(HAVE_VFS_IOV_ITER)
+ uio_iov_iter_init(uio, to, pos, count, skip);
+#else
+ uio_iovec_init(uio, to->iov, to->nr_segs, pos,
+ to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE,
+ count, skip);
+#endif
}
static ssize_t
-zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp,
- unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
+zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
{
cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
struct file *filp = kiocb->ki_filp;
- struct inode *ip = filp->f_mapping->host;
- zfsvfs_t *zfsvfs = ZTOZSB(ITOZ(ip));
- ssize_t read;
- unsigned int f_flags = filp->f_flags;
+ ssize_t count = iov_iter_count(to);
+ uio_t uio;
+
+ zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0);
- f_flags |= zfs_io_flags(kiocb);
crhold(cr);
- read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count,
- nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
+ cookie = spl_fstrans_mark();
+
+ int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
+
+ spl_fstrans_unmark(cookie);
crfree(cr);
- /*
- * If relatime is enabled, call file_accessed() only if
- * zfs_relatime_need_update() is true. This is needed since datasets
- * with inherited "relatime" property aren't necessarily mounted with
- * MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is what
- * relatime test in VFS by relatime_need_update() is based on.
- */
- if (!IS_NOATIME(ip) && zfsvfs->z_relatime) {
- if (zfs_relatime_need_update(ip))
- file_accessed(filp);
- } else {
- file_accessed(filp);
- }
+ if (error < 0)
+ return (error);
+
+ ssize_t read = count - uio.uio_resid;
+ kiocb->ki_pos += read;
+
+ zpl_file_accessed(filp);
return (read);
}
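/*
 * A standalone model of the return-value convention zpl_iter_read() now
 * uses: zfs_read() reports a positive errno (negated for the VFS) and
 * leaves the untransferred byte count in uio_resid, so the bytes read are
 * the requested count minus the residual and the file position advances by
 * that amount.  vfs_read_result() is an illustrative name for this sketch.
 */
#include <sys/types.h>

static ssize_t
vfs_read_result(int zfs_error, size_t count, size_t resid, off_t *pos)
{
	if (zfs_error != 0)
		return (-zfs_error);		/* VFS expects -errno */

	ssize_t done = (ssize_t)(count - resid); /* bytes transferred */
	*pos += done;				 /* advance ki_pos */
	return (done);
}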
-#if defined(HAVE_VFS_RW_ITERATE)
-static ssize_t
-zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
+static inline ssize_t
+zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from,
+ size_t *countp)
{
- ssize_t ret;
- uio_seg_t seg = UIO_USERSPACE;
- if (to->type & ITER_KVEC)
- seg = UIO_SYSSPACE;
- if (to->type & ITER_BVEC)
- seg = UIO_BVEC;
- ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs,
- iov_iter_count(to), seg, to->iov_offset);
- if (ret > 0)
- iov_iter_advance(to, ret);
- return (ret);
-}
+#ifdef HAVE_GENERIC_WRITE_CHECKS_KIOCB
+ ssize_t ret = generic_write_checks(kiocb, from);
+ if (ret <= 0)
+ return (ret);
+
+ *countp = ret;
#else
-static ssize_t
-zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp,
- unsigned long nr_segs, loff_t pos)
-{
- ssize_t ret;
- size_t count;
+ struct file *file = kiocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *ip = mapping->host;
+ int isblk = S_ISBLK(ip->i_mode);
- ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_WRITE);
+ *countp = iov_iter_count(from);
+ ssize_t ret = generic_write_checks(file, &kiocb->ki_pos, countp, isblk);
if (ret)
return (ret);
+#endif
- return (zpl_iter_read_common(kiocb, iovp, nr_segs, count,
- UIO_USERSPACE, 0));
+ return (0);
}
-#endif /* HAVE_VFS_RW_ITERATE */
static ssize_t
-zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
- unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
- cred_t *cr, size_t skip)
+zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
{
- ssize_t wrote;
- uio_t uio = { { 0 }, 0 };
- int error;
+ cred_t *cr = CRED();
fstrans_cookie_t cookie;
+ struct file *filp = kiocb->ki_filp;
+ struct inode *ip = filp->f_mapping->host;
+ uio_t uio;
+ size_t count = 0;
+ ssize_t ret;
- if (flags & O_APPEND)
- *ppos = i_size_read(ip);
+ ret = zpl_generic_write_checks(kiocb, from, &count);
+ if (ret)
+ return (ret);
- uio.uio_iov = iovp;
- uio.uio_iovcnt = nr_segs;
- uio.uio_loffset = *ppos;
- uio.uio_segflg = segment;
- uio.uio_limit = MAXOFFSET_T;
- uio.uio_resid = count;
- uio.uio_skip = skip;
+ zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset);
+ crhold(cr);
cookie = spl_fstrans_mark();
- error = -zfs_write(ip, &uio, flags, cr);
+
+ int error = -zfs_write(ITOZ(ip), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
+
spl_fstrans_unmark(cookie);
+ crfree(cr);
+
if (error < 0)
return (error);
- wrote = count - uio.uio_resid;
- *ppos += wrote;
+ ssize_t wrote = count - uio.uio_resid;
+ kiocb->ki_pos += wrote;
+
+ if (wrote > 0)
+ iov_iter_advance(from, wrote);
return (wrote);
}
-inline ssize_t
-zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
- uio_seg_t segment, int flags, cred_t *cr)
-{
- struct iovec iov;
-
- iov.iov_base = (void *)buf;
- iov.iov_len = len;
-
- return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment,
- flags, cr, 0));
-}
+#else /* !HAVE_VFS_RW_ITERATE */
static ssize_t
-zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp,
- unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
+zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
struct file *filp = kiocb->ki_filp;
- ssize_t wrote;
- unsigned int f_flags = filp->f_flags;
-
- f_flags |= zfs_io_flags(kiocb);
- crhold(cr);
- wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count,
- nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
- crfree(cr);
-
- return (wrote);
-}
-
-#if defined(HAVE_VFS_RW_ITERATE)
-static ssize_t
-zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
-{
size_t count;
ssize_t ret;
- uio_seg_t seg = UIO_USERSPACE;
-#ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB
- struct file *file = kiocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *ip = mapping->host;
- int isblk = S_ISBLK(ip->i_mode);
-
- count = iov_iter_count(from);
- ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk);
+ ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
if (ret)
return (ret);
-#else
- /*
- * XXX - ideally this check should be in the same lock region with
- * write operations, so that there's no TOCTTOU race when doing
- * append and someone else grow the file.
- */
- ret = generic_write_checks(kiocb, from);
- if (ret <= 0)
- return (ret);
- count = ret;
-#endif
- if (from->type & ITER_KVEC)
- seg = UIO_SYSSPACE;
- if (from->type & ITER_BVEC)
- seg = UIO_BVEC;
+ uio_t uio;
+ uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE,
+ count, 0);
- ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs,
- count, seg, from->iov_offset);
- if (ret > 0)
- iov_iter_advance(from, ret);
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error < 0)
+ return (error);
- return (ret);
+ ssize_t read = count - uio.uio_resid;
+ kiocb->ki_pos += read;
+
+ zpl_file_accessed(filp);
+
+ return (read);
}
-#else
+
static ssize_t
-zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp,
+zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- struct file *file = kiocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *ip = mapping->host;
- int isblk = S_ISBLK(ip->i_mode);
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ struct file *filp = kiocb->ki_filp;
+ struct inode *ip = filp->f_mapping->host;
size_t count;
ssize_t ret;
- ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_READ);
+ ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
if (ret)
return (ret);
- ret = generic_write_checks(file, &pos, &count, isblk);
+ ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode));
if (ret)
return (ret);
- return (zpl_iter_write_common(kiocb, iovp, nr_segs, count,
- UIO_USERSPACE, 0));
+ uio_t uio;
+ uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE,
+ count, 0);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ int error = -zfs_write(ITOZ(ip), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error < 0)
+ return (error);
+
+ ssize_t wrote = count - uio.uio_resid;
+ kiocb->ki_pos += wrote;
+
+ return (wrote);
}
#endif /* HAVE_VFS_RW_ITERATE */
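/*
 * The iter and aio paths above share one bracket around zfs_read()/
 * zfs_write(): take a reference on the caller's credentials and mark the
 * task as being in filesystem-transaction context for the duration of the
 * call.  A rough sketch of that pattern, assuming the SPL/ZFS headers used
 * elsewhere in this file; do_zfs_io() is an illustrative name.
 */
static int
do_zfs_io(znode_t *zp, uio_t *uio, int flags, boolean_t is_write)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = is_write ? -zfs_write(zp, uio, flags, cr) :
	    -zfs_read(zp, uio, flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (error);
}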
@@ -486,14 +463,27 @@ zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
#error "Unknown direct IO interface"
#endif
-#else
+#else /* HAVE_VFS_RW_ITERATE */
#if defined(HAVE_VFS_DIRECT_IO_IOVEC)
static ssize_t
-zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iovp,
+zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov,
loff_t pos, unsigned long nr_segs)
{
if (rw == WRITE)
+ return (zpl_aio_write(kiocb, iov, nr_segs, pos));
+ else
+ return (zpl_aio_read(kiocb, iov, nr_segs, pos));
+}
+#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
+static ssize_t
+zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
+{
+ const struct iovec *iovp = iov_iter_iovec(iter);
+ unsigned long nr_segs = iter->nr_segs;
+
+ ASSERT3S(pos, ==, kiocb->ki_pos);
+ if (rw == WRITE)
return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
else
return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
@@ -517,7 +507,7 @@ zpl_llseek(struct file *filp, loff_t offset, int whence)
spl_inode_lock_shared(ip);
cookie = spl_fstrans_mark();
- error = -zfs_holey(ip, whence, &offset);
+ error = -zfs_holey(ITOZ(ip), whence, &offset);
spl_fstrans_unmark(cookie);
if (error == 0)
error = lseek_execute(filp, ip, offset, maxbytes);
@@ -603,10 +593,6 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma)
* Populate a page with data for the Linux page cache. This function is
* only used to support mmap(2). There will be an identical copy of the
* data in the ARC which is kept up to date via .write() and .writepage().
- *
- * Current this function relies on zpl_read_common() and the O_DIRECT
- * flag to read in a page. This works but the more correct way is to
- * update zfs_fillpage() to be Linux friendly and use that interface.
*/
static int
zpl_readpage(struct file *filp, struct page *pp)
@@ -675,10 +661,10 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
enum writeback_sync_modes sync_mode;
int result;
- ZFS_ENTER(zfsvfs);
+ ZPL_ENTER(zfsvfs);
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
wbc->sync_mode = WB_SYNC_ALL;
- ZFS_EXIT(zfsvfs);
+ ZPL_EXIT(zfsvfs);
sync_mode = wbc->sync_mode;
/*
@@ -691,11 +677,11 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
wbc->sync_mode = WB_SYNC_NONE;
result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
if (sync_mode != wbc->sync_mode) {
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
+ ZPL_ENTER(zfsvfs);
+ ZPL_VERIFY_ZP(zp);
if (zfsvfs->z_log != NULL)
zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
+ ZPL_EXIT(zfsvfs);
/*
* We need to call write_cache_pages() again (we can't just
@@ -1037,6 +1023,10 @@ const struct file_operations zpl_file_operations = {
#endif
.read_iter = zpl_iter_read,
.write_iter = zpl_iter_write,
+#ifdef HAVE_VFS_IOV_ITER
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+#endif
#else
.read = do_sync_read,
.write = do_sync_write,
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
index f3b97a22074c..f336fbb1272b 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
@@ -490,19 +490,17 @@ zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link)
{
fstrans_cookie_t cookie;
cred_t *cr = CRED();
- struct iovec iov;
- uio_t uio = { { 0 }, 0 };
int error;
crhold(cr);
*link = NULL;
+
+ struct iovec iov;
iov.iov_len = MAXPATHLEN;
iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
- uio.uio_iov = &iov;
- uio.uio_iovcnt = 1;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_resid = (MAXPATHLEN - 1);
+ uio_t uio;
+ uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, MAXPATHLEN - 1, 0);
cookie = spl_fstrans_mark();
error = -zfs_readlink(ip, &uio, cr);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
index 9db8bda4cc66..c2fd3fee1401 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -185,14 +185,27 @@ zpl_remount_fs(struct super_block *sb, int *flags, char *data)
static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
- char *fsname;
+ ZPL_ENTER(zfsvfs);
- ZFS_ENTER(zfsvfs);
- fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
dmu_objset_name(zfsvfs->z_os, fsname);
- seq_puts(seq, fsname);
+
+ for (int i = 0; fsname[i] != 0; i++) {
+ /*
+ * Spaces in the dataset name must be converted to their
+ * octal escape sequence for getmntent(3) to correctly
+ * parse the fsname portion of /proc/self/mounts.
+ */
+ if (fsname[i] == ' ') {
+ seq_puts(seq, "\\040");
+ } else {
+ seq_putc(seq, fsname[i]);
+ }
+ }
+
kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
- ZFS_EXIT(zfsvfs);
+
+ ZPL_EXIT(zfsvfs);
return (0);
}
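/*
 * A standalone model of the space-escaping done in __zpl_show_devname()
 * above: getmntent(3) expects spaces in the fsname field to appear as the
 * octal escape "\040".  escape_spaces() is an illustrative name.
 */
#include <stdio.h>

static void
escape_spaces(const char *fsname)
{
	for (int i = 0; fsname[i] != '\0'; i++) {
		if (fsname[i] == ' ')
			fputs("\\040", stdout);
		else
			fputc(fsname[i], stdout);
	}
}

int
main(void)
{
	escape_spaces("tank/my data set");	/* tank/my\040data\040set */
	putchar('\n');
	return (0);
}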
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
index 9b5fd0fd397b..1ec3dae2bb81 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
@@ -274,10 +274,10 @@ static int
zpl_xattr_get_dir(struct inode *ip, const char *name, void *value,
size_t size, cred_t *cr)
{
+ fstrans_cookie_t cookie;
struct inode *xip = NULL;
znode_t *dxzp = NULL;
znode_t *xzp = NULL;
- loff_t pos = 0;
int error;
/* Lookup the xattr directory */
@@ -302,7 +302,19 @@ zpl_xattr_get_dir(struct inode *ip, const char *name, void *value,
goto out;
}
- error = zpl_read_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr);
+ struct iovec iov;
+ iov.iov_base = (void *)value;
+ iov.iov_len = size;
+
+ uio_t uio;
+ uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, size, 0);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_read(ITOZ(xip), &uio, 0, cr);
+ spl_fstrans_unmark(cookie);
+
+ if (error == 0)
+ error = size - uio_resid(&uio);
out:
if (xzp)
zrele(xzp);
@@ -441,7 +453,6 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value,
znode_t *dxzp = NULL;
znode_t *xzp = NULL;
vattr_t *vap = NULL;
- ssize_t wrote;
int lookup_flags, error;
const int xattr_mode = S_IFREG | 0644;
loff_t pos = 0;
@@ -496,13 +507,8 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value,
if (error)
goto out;
- wrote = zpl_write_common(ZTOI(xzp), value, size, &pos,
- UIO_SYSSPACE, 0, cr);
- if (wrote < 0)
- error = wrote;
-
+ error = -zfs_write_simple(xzp, value, size, pos, NULL);
out:
-
if (error == 0) {
ip->i_ctime = current_time(ip);
zfs_mark_inode_dirty(ip);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index 218e1101edf8..cdc2076702af 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -66,49 +66,33 @@ typedef struct zv_request {
* Given a path, return TRUE if path is a ZVOL.
*/
static boolean_t
-zvol_is_zvol_impl(const char *device)
+zvol_is_zvol_impl(const char *path)
{
- struct block_device *bdev;
- unsigned int major;
+ dev_t dev = 0;
- bdev = vdev_lookup_bdev(device);
- if (IS_ERR(bdev))
+ if (vdev_lookup_bdev(path, &dev) != 0)
return (B_FALSE);
- major = MAJOR(bdev->bd_dev);
- bdput(bdev);
-
- if (major == zvol_major)
+ if (MAJOR(dev) == zvol_major)
return (B_TRUE);
return (B_FALSE);
}
static void
-uio_from_bio(uio_t *uio, struct bio *bio)
-{
- uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
- uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
- uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
- uio->uio_segflg = UIO_BVEC;
- uio->uio_limit = MAXOFFSET_T;
- uio->uio_resid = BIO_BI_SIZE(bio);
- uio->uio_skip = BIO_BI_SKIP(bio);
-}
-
-static void
zvol_write(void *arg)
{
- int error = 0;
-
zv_request_t *zvr = arg;
struct bio *bio = zvr->bio;
- uio_t uio = { { 0 }, 0 };
- uio_from_bio(&uio, bio);
+ int error = 0;
+ uio_t uio;
+
+ uio_bvec_init(&uio, bio);
zvol_state_t *zv = zvr->zv;
- ASSERT(zv && zv->zv_open_count > 0);
- ASSERT(zv->zv_zilog != NULL);
+ ASSERT3P(zv, !=, NULL);
+ ASSERT3U(zv->zv_open_count, >, 0);
+ ASSERT3P(zv->zv_zilog, !=, NULL);
/* bio marked as FLUSH need to flush before write */
if (bio_is_flush(bio))
@@ -122,10 +106,14 @@ zvol_write(void *arg)
return;
}
+ struct request_queue *q = zv->zv_zso->zvo_queue;
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
ssize_t start_resid = uio.uio_resid;
- unsigned long start_jif = jiffies;
- blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
- bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);
+ unsigned long start_time;
+
+ boolean_t acct = blk_queue_io_stat(q);
+ if (acct)
+ start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
boolean_t sync =
bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
@@ -169,8 +157,10 @@ zvol_write(void *arg)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
rw_exit(&zv->zv_suspend_lock);
- blk_generic_end_io_acct(zv->zv_zso->zvo_queue,
- WRITE, &zv->zv_zso->zvo_disk->part0, start_jif);
+
+ if (acct)
+ blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+
BIO_END_IO(bio, -error);
kmem_free(zvr, sizeof (zv_request_t));
}
@@ -187,14 +177,18 @@ zvol_discard(void *arg)
boolean_t sync;
int error = 0;
dmu_tx_t *tx;
- unsigned long start_jif;
- ASSERT(zv && zv->zv_open_count > 0);
- ASSERT(zv->zv_zilog != NULL);
+ ASSERT3P(zv, !=, NULL);
+ ASSERT3U(zv->zv_open_count, >, 0);
+ ASSERT3P(zv->zv_zilog, !=, NULL);
+
+ struct request_queue *q = zv->zv_zso->zvo_queue;
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
+ unsigned long start_time;
- start_jif = jiffies;
- blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
- bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);
+ boolean_t acct = blk_queue_io_stat(q);
+ if (acct)
+ start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
@@ -239,8 +233,10 @@ zvol_discard(void *arg)
unlock:
rw_exit(&zv->zv_suspend_lock);
- blk_generic_end_io_acct(zv->zv_zso->zvo_queue, WRITE,
- &zv->zv_zso->zvo_disk->part0, start_jif);
+
+ if (acct)
+ blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+
BIO_END_IO(bio, -error);
kmem_free(zvr, sizeof (zv_request_t));
}
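/*
 * The zvol write, discard, and read paths above now gate block-I/O
 * accounting on blk_queue_io_stat() and pass the bio itself to the
 * start/end accounting compat wrappers.  A condensed sketch of that
 * pattern, assuming the same wrappers; do_accounted_io() is illustrative.
 */
static void
do_accounted_io(struct request_queue *q, struct gendisk *disk,
    struct bio *bio, int rw, void (*issue)(struct bio *))
{
	unsigned long start_time = 0;
	boolean_t acct = blk_queue_io_stat(q);

	if (acct)
		start_time = blk_generic_start_io_acct(q, disk, rw, bio);

	issue(bio);	/* perform the actual read/write/discard */

	if (acct)
		blk_generic_end_io_acct(q, disk, rw, bio, start_time);
}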
@@ -248,20 +244,25 @@ unlock:
static void
zvol_read(void *arg)
{
- int error = 0;
-
zv_request_t *zvr = arg;
struct bio *bio = zvr->bio;
- uio_t uio = { { 0 }, 0 };
- uio_from_bio(&uio, bio);
+ int error = 0;
+ uio_t uio;
+
+ uio_bvec_init(&uio, bio);
zvol_state_t *zv = zvr->zv;
- ASSERT(zv && zv->zv_open_count > 0);
+ ASSERT3P(zv, !=, NULL);
+ ASSERT3U(zv->zv_open_count, >, 0);
+ struct request_queue *q = zv->zv_zso->zvo_queue;
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
ssize_t start_resid = uio.uio_resid;
- unsigned long start_jif = jiffies;
- blk_generic_start_io_acct(zv->zv_zso->zvo_queue, READ, bio_sectors(bio),
- &zv->zv_zso->zvo_disk->part0);
+ unsigned long start_time;
+
+ boolean_t acct = blk_queue_io_stat(q);
+ if (acct)
+ start_time = blk_generic_start_io_acct(q, disk, READ, bio);
zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
uio.uio_loffset, uio.uio_resid, RL_READER);
@@ -289,8 +290,10 @@ zvol_read(void *arg)
task_io_account_read(nread);
rw_exit(&zv->zv_suspend_lock);
- blk_generic_end_io_acct(zv->zv_zso->zvo_queue, READ,
- &zv->zv_zso->zvo_disk->part0, start_jif);
+
+ if (acct)
+ blk_generic_end_io_acct(q, disk, READ, bio, start_time);
+
BIO_END_IO(bio, -error);
kmem_free(zvr, sizeof (zv_request_t));
}
@@ -482,9 +485,9 @@ zvol_open(struct block_device *bdev, fmode_t flag)
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
- ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));
if (zv->zv_open_count == 0) {
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
if (error)
goto out_mutex;
@@ -501,7 +504,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
- check_disk_change(bdev);
+ zfs_check_media_change(bdev);
return (0);
@@ -530,7 +533,7 @@ zvol_release(struct gendisk *disk, fmode_t mode)
zv = disk->private_data;
mutex_enter(&zv->zv_state_lock);
- ASSERT(zv->zv_open_count > 0);
+ ASSERT3U(zv->zv_open_count, >, 0);
/*
* make sure zvol is not suspended during last close
* (hold zv_suspend_lock) and respect proper lock acquisition
@@ -553,11 +556,12 @@ zvol_release(struct gendisk *disk, fmode_t mode)
rw_exit(&zvol_state_lock);
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
- ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock));
zv->zv_open_count--;
- if (zv->zv_open_count == 0)
+ if (zv->zv_open_count == 0) {
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
zvol_last_close(zv);
+ }
mutex_exit(&zv->zv_state_lock);
@@ -652,8 +656,15 @@ zvol_revalidate_disk(struct gendisk *disk)
static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
- revalidate_disk(zv->zv_zso->zvo_disk);
+#if defined(HAVE_REVALIDATE_DISK_SIZE)
+ revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
+#elif defined(HAVE_REVALIDATE_DISK)
+ revalidate_disk(disk);
+#else
+ zvol_revalidate_disk(disk);
+#endif
return (0);
}
@@ -697,46 +708,6 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return (0);
}
-/*
- * Find a zvol_state_t given the full major+minor dev_t. If found,
- * return with zv_state_lock taken, otherwise, return (NULL) without
- * taking zv_state_lock.
- */
-static zvol_state_t *
-zvol_find_by_dev(dev_t dev)
-{
- zvol_state_t *zv;
-
- rw_enter(&zvol_state_lock, RW_READER);
- for (zv = list_head(&zvol_state_list); zv != NULL;
- zv = list_next(&zvol_state_list, zv)) {
- mutex_enter(&zv->zv_state_lock);
- if (zv->zv_zso->zvo_dev == dev) {
- rw_exit(&zvol_state_lock);
- return (zv);
- }
- mutex_exit(&zv->zv_state_lock);
- }
- rw_exit(&zvol_state_lock);
-
- return (NULL);
-}
-
-static struct kobject *
-zvol_probe(dev_t dev, int *part, void *arg)
-{
- zvol_state_t *zv;
- struct kobject *kobj;
-
- zv = zvol_find_by_dev(dev);
- kobj = zv ? get_disk_and_module(zv->zv_zso->zvo_disk) : NULL;
- ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
- if (zv)
- mutex_exit(&zv->zv_state_lock);
-
- return (kobj);
-}
-
static struct block_device_operations zvol_ops = {
.open = zvol_open,
.release = zvol_release,
@@ -774,6 +745,7 @@ zvol_alloc(dev_t dev, const char *name)
zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
zv->zv_zso = zso;
+ zv->zv_volmode = volmode;
list_link_init(&zv->zv_next);
mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -859,8 +831,8 @@ zvol_free(zvol_state_t *zv)
ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
- ASSERT(zv->zv_open_count == 0);
- ASSERT(zv->zv_zso->zvo_disk->private_data == NULL);
+ ASSERT0(zv->zv_open_count);
+ ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
rw_destroy(&zv->zv_suspend_lock);
zfs_rangelock_fini(&zv->zv_rangelock);
@@ -879,6 +851,11 @@ zvol_free(zvol_state_t *zv)
kmem_free(zv, sizeof (zvol_state_t));
}
+void
+zvol_wait_close(zvol_state_t *zv)
+{
+}
+
/*
* Create a block device minor node and setup the linkage between it
* and the specified volume. Once this function returns the block
@@ -1083,9 +1060,6 @@ zvol_init(void)
return (-ENOMEM);
}
zvol_init_impl();
- blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
- THIS_MODULE, zvol_probe, NULL, NULL);
-
ida_init(&zvol_ida);
zvol_register_ops(&zvol_linux_ops);
return (0);
@@ -1095,7 +1069,6 @@ void
zvol_fini(void)
{
zvol_fini_impl();
- blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
unregister_blkdev(zvol_major, ZVOL_DRIVER);
taskq_destroy(zvol_taskq);
ida_destroy(&zvol_ida);
diff --git a/sys/contrib/openzfs/module/zcommon/Makefile.in b/sys/contrib/openzfs/module/zcommon/Makefile.in
index b5cdf4c0c9fe..ebc538440445 100644
--- a/sys/contrib/openzfs/module/zcommon/Makefile.in
+++ b/sys/contrib/openzfs/module/zcommon/Makefile.in
@@ -19,7 +19,6 @@ $(MODULE)-objs += zfs_fletcher_superscalar.o
$(MODULE)-objs += zfs_fletcher_superscalar4.o
$(MODULE)-objs += zfs_namecheck.o
$(MODULE)-objs += zfs_prop.o
-$(MODULE)-objs += zfs_uio.o
$(MODULE)-objs += zpool_prop.o
$(MODULE)-objs += zprop_common.o
diff --git a/sys/contrib/openzfs/module/zcommon/zfeature_common.c b/sys/contrib/openzfs/module/zcommon/zfeature_common.c
index 97ddacbab9e0..34ebabcf3b3c 100644
--- a/sys/contrib/openzfs/module/zcommon/zfeature_common.c
+++ b/sys/contrib/openzfs/module/zcommon/zfeature_common.c
@@ -576,7 +576,7 @@ zpool_feature_init(void)
zfeature_register(SPA_FEATURE_DEVICE_REBUILD,
"org.openzfs:device_rebuild", "device_rebuild",
- "Support for sequential device rebuilds",
+ "Support for sequential mirror/dRAID device rebuilds",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
{
@@ -589,6 +589,10 @@ zpool_feature_init(void)
"zstd compression algorithm support.",
ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps);
}
+
+ zfeature_register(SPA_FEATURE_DRAID,
+ "org.openzfs:draid", "draid", "Support for distributed spare RAID",
+ ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL);
}
#if defined(_KERNEL)
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c
index 3e0632a32864..7a9de4a4309d 100644
--- a/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c
@@ -660,7 +660,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
}
-#define FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */
+#define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */
typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
zio_cksum_t *);
@@ -885,23 +885,26 @@ zio_abd_checksum_func_t fletcher_4_abd_ops = {
.acf_iter = abd_fletcher_4_iter
};
+#if defined(_KERNEL)
+
+#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ")
-#if defined(_KERNEL) && defined(__linux__)
+#if defined(__linux__)
static int
fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused)
{
const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
char *fmt;
- int i, cnt = 0;
+ int cnt = 0;
/* list fastest */
- fmt = (impl == IMPL_FASTEST) ? "[%s] " : "%s ";
+ fmt = IMPL_FMT(impl, IMPL_FASTEST);
cnt += sprintf(buffer + cnt, fmt, "fastest");
/* list all supported implementations */
- for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
- fmt = (i == impl) ? "[%s] " : "%s ";
+ for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {
+ fmt = IMPL_FMT(impl, i);
cnt += sprintf(buffer + cnt, fmt,
fletcher_4_supp_impls[i]->name);
}
@@ -915,14 +918,62 @@ fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused)
return (fletcher_4_impl_set(val));
}
+#else
+
+#include <sys/sbuf.h>
+
+static int
+fletcher_4_param(ZFS_MODULE_PARAM_ARGS)
+{
+ int err;
+
+ if (req->newptr == NULL) {
+ const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+ const int init_buflen = 64;
+ const char *fmt;
+ struct sbuf *s;
+
+ s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
+
+ /* list fastest */
+ fmt = IMPL_FMT(impl, IMPL_FASTEST);
+ (void) sbuf_printf(s, fmt, "fastest");
+
+ /* list all supported implementations */
+ for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {
+ fmt = IMPL_FMT(impl, i);
+ (void) sbuf_printf(s, fmt,
+ fletcher_4_supp_impls[i]->name);
+ }
+
+ err = sbuf_finish(s);
+ sbuf_delete(s);
+
+ return (err);
+ }
+
+ char buf[16];
+
+ err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+ if (err)
+ return (err);
+ return (-fletcher_4_impl_set(buf));
+}
+
+#endif
+
+#undef IMPL_FMT
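/*
 * A tiny userspace model of how IMPL_FMT() renders the implementation
 * list in both the Linux module-param and FreeBSD sysctl handlers above:
 * the currently selected entry is printed in brackets.  The names below
 * are placeholders, not the real fletcher_4 implementation table.
 */
#include <stdio.h>

#define	IMPL_FMT(impl, i)	(((impl) == (i)) ? "[%s] " : "%s ")

int
main(void)
{
	const char *names[] = { "scalar", "superscalar", "sse2" };
	const unsigned int chosen = 1;	/* pretend "superscalar" is selected */

	for (unsigned int i = 0; i < 3; i++)
		printf(IMPL_FMT(chosen, i), names[i]);
	printf("\n");	/* prints: scalar [superscalar] sse2 */

	return (0);
}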
+
/*
* Choose a fletcher 4 implementation in ZFS.
* Users can choose "cycle" to exercise all implementations, but this is
* for testing purpose therefore it can only be set in user space.
*/
-module_param_call(zfs_fletcher_4_impl,
- fletcher_4_param_set, fletcher_4_param_get, NULL, 0644);
-MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation.");
+/* BEGIN CSTYLED */
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, fletcher_4_impl,
+ fletcher_4_param_set, fletcher_4_param_get, ZMOD_RW,
+ "Select fletcher 4 implementation.");
+/* END CSTYLED */
EXPORT_SYMBOL(fletcher_init);
EXPORT_SYMBOL(fletcher_2_incremental_native);
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c b/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c
index f8625042a74c..0011a971cacb 100644
--- a/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c
+++ b/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c
@@ -442,7 +442,9 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what)
return (-1);
}
- if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) {
+ if (strcmp(pool, "mirror") == 0 ||
+ strcmp(pool, "raidz") == 0 ||
+ strcmp(pool, "draid") == 0) {
if (why)
*why = NAME_ERR_RESERVED;
return (-1);
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
index 0352b13aa240..b78331187e13 100644
--- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c
+++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
@@ -551,14 +551,14 @@ zfs_prop_init(void)
PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none",
"MOUNTPOINT");
zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off",
- PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options",
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | NFS share options",
"SHARENFS");
zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
"filesystem | volume | snapshot | bookmark", "TYPE");
zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off",
PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
- "on | off | sharemgr(1M) options", "SHARESMB");
+ "on | off | SMB share options", "SHARESMB");
zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel",
ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET,
"<sensitivity label>", "MLSLABEL");
@@ -1016,7 +1016,7 @@ zcommon_fini(void)
kfpu_fini();
}
-module_init(zcommon_init);
+module_init_early(zcommon_init);
module_exit(zcommon_fini);
#endif
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_uio.c b/sys/contrib/openzfs/module/zcommon/zfs_uio.c
index d586e0a1220a..e435e1a9f78a 100644
--- a/sys/contrib/openzfs/module/zcommon/zfs_uio.c
+++ b/sys/contrib/openzfs/module/zcommon/zfs_uio.c
@@ -39,12 +39,6 @@
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
*/
-/*
- * The uio support from OpenSolaris has been added as a short term
- * work around. The hope is to adopt native Linux type and drop the
- * use of uio's entirely. Under Linux they only add overhead and
- * when possible we want to use native APIs for the ZPL layer.
- */
#ifdef _KERNEL
#include <sys/types.h>
@@ -71,7 +65,6 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
cnt = MIN(iov->iov_len - skip, n);
switch (uio->uio_segflg) {
case UIO_USERSPACE:
- case UIO_USERISPACE:
/*
* p = kernel data pointer
* iov->iov_base = user data pointer
@@ -165,81 +158,82 @@ uiomove_bvec(void *p, size_t n, enum uio_rw rw, struct uio *uio)
return (0);
}
+#if defined(HAVE_VFS_IOV_ITER)
+static int
+uiomove_iter(void *p, size_t n, enum uio_rw rw, struct uio *uio,
+ boolean_t revert)
+{
+ size_t cnt = MIN(n, uio->uio_resid);
+
+ if (uio->uio_skip)
+ iov_iter_advance(uio->uio_iter, uio->uio_skip);
+
+ if (rw == UIO_READ)
+ cnt = copy_to_iter(p, cnt, uio->uio_iter);
+ else
+ cnt = copy_from_iter(p, cnt, uio->uio_iter);
+
+ /*
+	 * When operating on a full pipe no bytes are processed, in which
+	 * case we return EFAULT; the kernel's generic_file_splice_read()
+	 * function converts it to EAGAIN.
+ */
+ if (cnt == 0)
+ return (EFAULT);
+
+ /*
+ * Revert advancing the uio_iter. This is set by uiocopy()
+ * to avoid consuming the uio and its iov_iter structure.
+ */
+ if (revert)
+ iov_iter_revert(uio->uio_iter, cnt);
+
+ uio->uio_resid -= cnt;
+ uio->uio_loffset += cnt;
+
+ return (0);
+}
+#endif
+
int
uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
{
- if (uio->uio_segflg != UIO_BVEC)
- return (uiomove_iov(p, n, rw, uio));
- else
+ if (uio->uio_segflg == UIO_BVEC)
return (uiomove_bvec(p, n, rw, uio));
+#if defined(HAVE_VFS_IOV_ITER)
+ else if (uio->uio_segflg == UIO_ITER)
+ return (uiomove_iter(p, n, rw, uio, B_FALSE));
+#endif
+ else
+ return (uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(uiomove);
-#define fuword8(uptr, vptr) get_user((*vptr), (uptr))
-
-/*
- * Fault in the pages of the first n bytes specified by the uio structure.
- * 1 byte in each page is touched and the uio struct is unmodified. Any
- * error will terminate the process as this is only a best attempt to get
- * the pages resident.
- */
int
uio_prefaultpages(ssize_t n, struct uio *uio)
{
- const struct iovec *iov;
- ulong_t cnt, incr;
- caddr_t p;
- uint8_t tmp;
- int iovcnt;
- size_t skip;
+ struct iov_iter iter, *iterp = NULL;
- /* no need to fault in kernel pages */
- switch (uio->uio_segflg) {
- case UIO_SYSSPACE:
- case UIO_BVEC:
- return (0);
- case UIO_USERSPACE:
- case UIO_USERISPACE:
- break;
- default:
- ASSERT(0);
- }
-
- iov = uio->uio_iov;
- iovcnt = uio->uio_iovcnt;
- skip = uio->uio_skip;
-
- for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) {
- cnt = MIN(iov->iov_len - skip, n);
- /* empty iov */
- if (cnt == 0)
- continue;
- n -= cnt;
- /*
- * touch each page in this segment.
- */
- p = iov->iov_base + skip;
- while (cnt) {
- if (fuword8((uint8_t *)p, &tmp))
- return (EFAULT);
- incr = MIN(cnt, PAGESIZE);
- p += incr;
- cnt -= incr;
- }
- /*
- * touch the last byte in case it straddles a page.
- */
- p--;
- if (fuword8((uint8_t *)p, &tmp))
- return (EFAULT);
+#if defined(HAVE_IOV_ITER_FAULT_IN_READABLE)
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ iterp = &iter;
+ iov_iter_init_compat(iterp, READ, uio->uio_iov,
+ uio->uio_iovcnt, uio->uio_resid);
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ iterp = uio->uio_iter;
+#endif
}
+ if (iterp && iov_iter_fault_in_readable(iterp, n))
+ return (EFAULT);
+#endif
return (0);
}
EXPORT_SYMBOL(uio_prefaultpages);
/*
- * same as uiomove() but doesn't modify uio structure.
+ * The same as uiomove() but doesn't modify the uio structure.
* return in cbytes how many bytes were copied.
*/
int
@@ -249,39 +243,54 @@ uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes)
int ret;
bcopy(uio, &uio_copy, sizeof (struct uio));
- ret = uiomove(p, n, rw, &uio_copy);
+
+ if (uio->uio_segflg == UIO_BVEC)
+ ret = uiomove_bvec(p, n, rw, &uio_copy);
+#if defined(HAVE_VFS_IOV_ITER)
+ else if (uio->uio_segflg == UIO_ITER)
+ ret = uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
+#endif
+ else
+ ret = uiomove_iov(p, n, rw, &uio_copy);
+
*cbytes = uio->uio_resid - uio_copy.uio_resid;
+
return (ret);
}
EXPORT_SYMBOL(uiocopy);
/*
- * Drop the next n chars out of *uiop.
+ * Drop the next n chars out of *uio.
*/
void
-uioskip(uio_t *uiop, size_t n)
+uioskip(uio_t *uio, size_t n)
{
- if (n > uiop->uio_resid)
+ if (n > uio->uio_resid)
return;
- uiop->uio_skip += n;
- if (uiop->uio_segflg != UIO_BVEC) {
- while (uiop->uio_iovcnt &&
- uiop->uio_skip >= uiop->uio_iov->iov_len) {
- uiop->uio_skip -= uiop->uio_iov->iov_len;
- uiop->uio_iov++;
- uiop->uio_iovcnt--;
+ if (uio->uio_segflg == UIO_BVEC) {
+ uio->uio_skip += n;
+ while (uio->uio_iovcnt &&
+ uio->uio_skip >= uio->uio_bvec->bv_len) {
+ uio->uio_skip -= uio->uio_bvec->bv_len;
+ uio->uio_bvec++;
+ uio->uio_iovcnt--;
}
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ iov_iter_advance(uio->uio_iter, n);
+#endif
} else {
- while (uiop->uio_iovcnt &&
- uiop->uio_skip >= uiop->uio_bvec->bv_len) {
- uiop->uio_skip -= uiop->uio_bvec->bv_len;
- uiop->uio_bvec++;
- uiop->uio_iovcnt--;
+ uio->uio_skip += n;
+ while (uio->uio_iovcnt &&
+ uio->uio_skip >= uio->uio_iov->iov_len) {
+ uio->uio_skip -= uio->uio_iov->iov_len;
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
}
}
- uiop->uio_loffset += n;
- uiop->uio_resid -= n;
+ uio->uio_loffset += n;
+ uio->uio_resid -= n;
}
EXPORT_SYMBOL(uioskip);
#endif /* _KERNEL */
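
To make the three-way dispatch above (iovec, bvec, iov_iter) easier to follow, here is a small userspace sketch of the plain-iovec branch of uioskip(): consume n bytes by accumulating a skip offset and dropping fully consumed iovecs. The mini_uio structure is a stand-in invented for the example, not the kernel uio_t.

	#include <stdio.h>
	#include <stddef.h>
	#include <sys/types.h>
	#include <sys/uio.h>

	struct mini_uio {
		struct iovec	*uio_iov;
		int		uio_iovcnt;
		size_t		uio_skip;
		size_t		uio_resid;
		off_t		uio_loffset;
	};

	static void
	mini_uioskip(struct mini_uio *uio, size_t n)
	{
		if (n > uio->uio_resid)
			return;
		uio->uio_skip += n;
		/* Drop every iovec that the skip offset has fully consumed. */
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_iov->iov_len) {
			uio->uio_skip -= uio->uio_iov->iov_len;
			uio->uio_iov++;
			uio->uio_iovcnt--;
		}
		uio->uio_loffset += n;
		uio->uio_resid -= n;
	}

	int
	main(void)
	{
		char a[4], b[8];
		struct iovec iov[2] = { { a, sizeof (a) }, { b, sizeof (b) } };
		struct mini_uio u = { iov, 2, 0, sizeof (a) + sizeof (b), 0 };

		mini_uioskip(&u, 6);	/* crosses into the second iovec */
		printf("iovcnt=%d skip=%zu resid=%zu\n",
		    u.uio_iovcnt, u.uio_skip, u.uio_resid);
		/* prints: iovcnt=1 skip=2 resid=6 */
		return (0);
	}
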
diff --git a/sys/contrib/openzfs/module/zfs/Makefile.in b/sys/contrib/openzfs/module/zfs/Makefile.in
index 259ac4dc926c..653ea0da9bcc 100644
--- a/sys/contrib/openzfs/module/zfs/Makefile.in
+++ b/sys/contrib/openzfs/module/zfs/Makefile.in
@@ -84,6 +84,8 @@ $(MODULE)-objs += uberblock.o
$(MODULE)-objs += unique.o
$(MODULE)-objs += vdev.o
$(MODULE)-objs += vdev_cache.o
+$(MODULE)-objs += vdev_draid.o
+$(MODULE)-objs += vdev_draid_rand.o
$(MODULE)-objs += vdev_indirect.o
$(MODULE)-objs += vdev_indirect_births.o
$(MODULE)-objs += vdev_indirect_mapping.o
@@ -120,6 +122,7 @@ $(MODULE)-objs += zfs_ratelimit.o
$(MODULE)-objs += zfs_replay.o
$(MODULE)-objs += zfs_rlock.o
$(MODULE)-objs += zfs_sa.o
+$(MODULE)-objs += zfs_vnops.o
$(MODULE)-objs += zil.o
$(MODULE)-objs += zio.o
$(MODULE)-objs += zio_checksum.o
diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c
index 6018a42ca0d8..68d4aa5f5cb4 100644
--- a/sys/contrib/openzfs/module/zfs/abd.c
+++ b/sys/contrib/openzfs/module/zfs/abd.c
@@ -781,16 +781,17 @@ int
abd_iterate_func(abd_t *abd, size_t off, size_t size,
abd_iter_func_t *func, void *private)
{
- int ret = 0;
struct abd_iter aiter;
- boolean_t abd_multi;
- abd_t *c_abd;
+ int ret = 0;
+
+ if (size == 0)
+ return (0);
abd_verify(abd);
ASSERT3U(off + size, <=, abd->abd_size);
- abd_multi = abd_is_gang(abd);
- c_abd = abd_init_abd_iter(abd, &aiter, off);
+ boolean_t abd_multi = abd_is_gang(abd);
+ abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
while (size > 0) {
/* If we are at the end of the gang ABD we are done */
@@ -920,6 +921,9 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
boolean_t dabd_is_gang_abd, sabd_is_gang_abd;
abd_t *c_dabd, *c_sabd;
+ if (size == 0)
+ return (0);
+
abd_verify(dabd);
abd_verify(sabd);
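
A minimal sketch of the iterate-with-early-return shape these new size == 0 guards produce, using plain memory in place of the ABD scatter/gather machinery; all names below are hypothetical.

	#include <stddef.h>
	#include <stdio.h>

	typedef int (iter_func_t)(void *buf, size_t len, void *priv);

	static int
	buf_iterate(char *buf, size_t off, size_t size, size_t chunk,
	    iter_func_t *func, void *priv)
	{
		if (size == 0)
			return (0);	/* nothing to map or verify */

		while (size > 0) {
			size_t len = size < chunk ? size : chunk;
			int ret = func(buf + off, len, priv);
			if (ret != 0)
				return (ret);
			off += len;
			size -= len;
		}
		return (0);
	}

	static int
	count_chunks(void *buf, size_t len, void *priv)
	{
		(void) buf; (void) len;
		(*(int *)priv)++;
		return (0);
	}

	int
	main(void)
	{
		char data[10];
		int calls = 0;

		(void) buf_iterate(data, 0, sizeof (data), 4, count_chunks, &calls);
		printf("%d\n", calls);	/* 3 chunks: 4 + 4 + 2 */
		(void) buf_iterate(data, 0, 0, 4, count_chunks, &calls);
		printf("%d\n", calls);	/* still 3: zero length is a no-op */
		return (0);
	}
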
diff --git a/sys/contrib/openzfs/module/zfs/aggsum.c b/sys/contrib/openzfs/module/zfs/aggsum.c
index a2fec27744e1..e46da95f676c 100644
--- a/sys/contrib/openzfs/module/zfs/aggsum.c
+++ b/sys/contrib/openzfs/module/zfs/aggsum.c
@@ -70,6 +70,11 @@
* zeroing out the borrowed value (forcing that thread to borrow on its next
* request, which will also be expensive). This is what makes aggsums well
* suited for write-many read-rarely operations.
+ *
+ * Note that the aggsums do not expand if more CPUs are hot-added. In that
+ * case, we will have less fanout than boot_ncpus, but we don't want to always
+ * reserve the RAM necessary to create the extra slots for additional CPUs up
+ * front, and dynamically adding them is a complex task.
*/
/*
@@ -167,9 +172,7 @@ aggsum_add(aggsum_t *as, int64_t delta)
struct aggsum_bucket *asb;
int64_t borrow;
- kpreempt_disable();
- asb = &as->as_buckets[CPU_SEQID % as->as_numbuckets];
- kpreempt_enable();
+ asb = &as->as_buckets[CPU_SEQID_UNSTABLE % as->as_numbuckets];
/* Try fast path if we already borrowed enough before. */
mutex_enter(&asb->asc_lock);
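
A userspace sketch of the bucketed-counter idea behind this change: because each bucket has its own lock, a stale CPU hint (the analogue of CPU_SEQID_UNSTABLE, read without disabling preemption) merely picks a possibly suboptimal bucket rather than corrupting the sum. Names and sizes below are invented.

	#include <pthread.h>
	#include <stdio.h>

	#define NBUCKETS	8

	static struct {
		pthread_mutex_t	lock;
		long		value;
	} buckets[NBUCKETS];

	static void
	striped_add(unsigned hint, long delta)
	{
		unsigned b = hint % NBUCKETS;	/* any bucket is correct */

		pthread_mutex_lock(&buckets[b].lock);
		buckets[b].value += delta;
		pthread_mutex_unlock(&buckets[b].lock);
	}

	static long
	striped_read(void)
	{
		long sum = 0;

		for (unsigned b = 0; b < NBUCKETS; b++) {
			pthread_mutex_lock(&buckets[b].lock);
			sum += buckets[b].value;
			pthread_mutex_unlock(&buckets[b].lock);
		}
		return (sum);
	}

	int
	main(void)
	{
		for (unsigned b = 0; b < NBUCKETS; b++)
			pthread_mutex_init(&buckets[b].lock, NULL);
		striped_add(3, 10);
		striped_add(11, -4);
		printf("%ld\n", striped_read());	/* 6 */
		return (0);
	}
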
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
index 68508cf152a8..c21ae27b9af8 100644
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -492,6 +492,8 @@ arc_stats_t arc_stats = {
{ "evict_not_enough", KSTAT_DATA_UINT64 },
{ "evict_l2_cached", KSTAT_DATA_UINT64 },
{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 },
+ { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 },
{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
{ "evict_l2_skip", KSTAT_DATA_UINT64 },
{ "hash_elements", KSTAT_DATA_UINT64 },
@@ -533,6 +535,11 @@ arc_stats_t arc_stats = {
{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
{ "l2_hits", KSTAT_DATA_UINT64 },
{ "l2_misses", KSTAT_DATA_UINT64 },
+ { "l2_prefetch_asize", KSTAT_DATA_UINT64 },
+ { "l2_mru_asize", KSTAT_DATA_UINT64 },
+ { "l2_mfu_asize", KSTAT_DATA_UINT64 },
+ { "l2_bufc_data_asize", KSTAT_DATA_UINT64 },
+ { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 },
{ "l2_feeds", KSTAT_DATA_UINT64 },
{ "l2_rw_clash", KSTAT_DATA_UINT64 },
{ "l2_read_bytes", KSTAT_DATA_UINT64 },
@@ -894,6 +901,17 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
static void l2arc_do_free_on_write(void);
+static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
+ boolean_t state_only);
+
+#define l2arc_hdr_arcstats_increment(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
+#define l2arc_hdr_arcstats_decrement(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
+#define l2arc_hdr_arcstats_increment_state(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
+#define l2arc_hdr_arcstats_decrement_state(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
/*
* l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
@@ -951,7 +969,7 @@ static void l2arc_log_blk_fetch_abort(zio_t *zio);
/* L2ARC persistence block restoration routines. */
static void l2arc_log_blk_restore(l2arc_dev_t *dev,
- const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr);
+ const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
l2arc_dev_t *dev);
@@ -1727,7 +1745,7 @@ static arc_buf_hdr_t *
arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth,
enum zio_compress compress, uint8_t complevel, boolean_t protected,
- boolean_t prefetch)
+ boolean_t prefetch, arc_state_type_t arcs_state)
{
arc_buf_hdr_t *hdr;
@@ -1751,6 +1769,7 @@ arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
hdr->b_l2hdr.b_dev = dev;
hdr->b_l2hdr.b_daddr = daddr;
+ hdr->b_l2hdr.b_arcs_state = arcs_state;
return (hdr);
}
@@ -2312,7 +2331,11 @@ add_reference(arc_buf_hdr_t *hdr, void *tag)
arc_evictable_space_decrement(hdr, state);
}
/* remove the prefetch flag if we get a reference */
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
}
}
@@ -2595,9 +2618,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
}
}
- if (HDR_HAS_L1HDR(hdr))
+ if (HDR_HAS_L1HDR(hdr)) {
hdr->b_l1hdr.b_state = new_state;
+ if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+ }
+
/*
* L2 headers should never be on the L2 state list since they don't
* have L1 headers allocated.
@@ -3685,6 +3715,76 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
}
static void
+l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
+ boolean_t state_only)
+{
+ l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+ l2arc_dev_t *dev = l2hdr->b_dev;
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ arc_buf_contents_t type = hdr->b_type;
+ int64_t lsize_s;
+ int64_t psize_s;
+ int64_t asize_s;
+
+ if (incr) {
+ lsize_s = lsize;
+ psize_s = psize;
+ asize_s = asize;
+ } else {
+ lsize_s = -lsize;
+ psize_s = -psize;
+ asize_s = -asize;
+ }
+
+ /* If the buffer is a prefetch, count it as such. */
+ if (HDR_PREFETCH(hdr)) {
+ ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s);
+ } else {
+ /*
+ * We use the value stored in the L2 header upon initial
+ * caching in L2ARC. This value will be updated in case
+ * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC
+ * metadata (log entry) cannot currently be updated. Having
+ * the ARC state in the L2 header solves the problem of a
+ * possibly absent L1 header (apparent in buffers restored
+ * from persistent L2ARC).
+ */
+ switch (hdr->b_l2hdr.b_arcs_state) {
+ case ARC_STATE_MRU_GHOST:
+ case ARC_STATE_MRU:
+ ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s);
+ break;
+ case ARC_STATE_MFU_GHOST:
+ case ARC_STATE_MFU:
+ ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (state_only)
+ return;
+
+ ARCSTAT_INCR(arcstat_l2_psize, psize_s);
+ ARCSTAT_INCR(arcstat_l2_lsize, lsize_s);
+
+ switch (type) {
+ case ARC_BUFC_DATA:
+ ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s);
+ break;
+ case ARC_BUFC_METADATA:
+ ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s);
+ break;
+ default:
+ break;
+ }
+}
+
+
+static void
arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
{
l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
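
A userspace sketch of the signed-delta accounting pattern used by l2arc_hdr_arcstats_update() above, with invented counter names: one helper covers both the increment and decrement direction, so the per-state counters cannot drift apart.

	#include <stdio.h>
	#include <stdint.h>

	enum state { ST_MRU, ST_MFU };

	static int64_t l2_mru_asize, l2_mfu_asize, l2_asize;

	static void
	l2_stats_update(enum state st, uint64_t asize, int incr)
	{
		int64_t delta = incr ? (int64_t)asize : -(int64_t)asize;

		if (st == ST_MRU)
			l2_mru_asize += delta;
		else
			l2_mfu_asize += delta;
		l2_asize += delta;
	}

	int
	main(void)
	{
		l2_stats_update(ST_MRU, 8192, 1);	/* cached while on MRU */
		l2_stats_update(ST_MRU, 8192, 0);	/* ...later evicted */
		l2_stats_update(ST_MFU, 4096, 1);
		printf("mru=%lld mfu=%lld total=%lld\n",
		    (long long)l2_mru_asize, (long long)l2_mfu_asize,
		    (long long)l2_asize);	/* mru=0 mfu=4096 total=4096 */
		return (0);
	}
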
@@ -3697,9 +3797,7 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
list_remove(&dev->l2ad_buflist, hdr);
- ARCSTAT_INCR(arcstat_l2_psize, -psize);
- ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
-
+ l2arc_hdr_arcstats_decrement(hdr);
vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
(void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
@@ -3903,6 +4001,21 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
if (l2arc_write_eligible(hdr->b_spa, hdr)) {
ARCSTAT_INCR(arcstat_evict_l2_eligible,
HDR_GET_LSIZE(hdr));
+
+ switch (state->arcs_state) {
+ case ARC_STATE_MRU:
+ ARCSTAT_INCR(
+ arcstat_evict_l2_eligible_mru,
+ HDR_GET_LSIZE(hdr));
+ break;
+ case ARC_STATE_MFU:
+ ARCSTAT_INCR(
+ arcstat_evict_l2_eligible_mfu,
+ HDR_GET_LSIZE(hdr));
+ break;
+ default:
+ break;
+ }
} else {
ARCSTAT_INCR(arcstat_evict_l2_ineligible,
HDR_GET_LSIZE(hdr));
@@ -4769,14 +4882,7 @@ arc_kmem_reap_soon(void)
static boolean_t
arc_evict_cb_check(void *arg, zthr_t *zthr)
{
- /*
- * This is necessary so that any changes which may have been made to
- * many of the zfs_arc_* module parameters will be propagated to
- * their actual internal variable counterparts. Without this,
- * changing those module params at runtime would have no effect.
- */
- arc_tuning_update(B_FALSE);
-
+#ifdef ZFS_DEBUG
/*
* This is necessary in order to keep the kstat information
* up to date for tools that display kstat data such as the
@@ -4784,12 +4890,11 @@ arc_evict_cb_check(void *arg, zthr_t *zthr)
* typically do not call kstat's update function, but simply
* dump out stats from the most recent update. Without
* this call, these commands may show stale stats for the
- * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
- * with this change, the data might be up to 1 second
- * out of date(the arc_evict_zthr has a maximum sleep
- * time of 1 second); but that should suffice. The
- * arc_state_t structures can be queried directly if more
- * accurate information is needed.
+ * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+ * with this call, the data might be out of date if the
+ * evict thread hasn't been woken recently; but that should
+ * suffice. The arc_state_t structures can be queried
+ * directly if more accurate information is needed.
*/
#ifndef __FreeBSD__
if (arc_ksp != NULL)
@@ -5347,11 +5452,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
ASSERT(multilist_link_active(
&hdr->b_l1hdr.b_arc_node));
} else {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
arc_hdr_clear_flags(hdr,
ARC_FLAG_PREFETCH |
ARC_FLAG_PRESCIENT_PREFETCH);
atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
ARCSTAT_BUMP(arcstat_mru_hits);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
}
hdr->b_l1hdr.b_arc_access = now;
return;
@@ -5382,13 +5491,16 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* was evicted from the cache. Move it to the
* MFU state.
*/
-
if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
new_state = arc_mru;
if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
arc_hdr_clear_flags(hdr,
ARC_FLAG_PREFETCH |
ARC_FLAG_PRESCIENT_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
}
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
} else {
@@ -5641,7 +5753,7 @@ arc_read_done(zio_t *zio)
*/
int callback_cnt = 0;
for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
- if (!acb->acb_done)
+ if (!acb->acb_done || acb->acb_nobuf)
continue;
callback_cnt++;
@@ -5806,6 +5918,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
(zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
+ boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
int rc = 0;
ASSERT(!embedded_bp ||
@@ -5890,6 +6003,7 @@ top:
acb->acb_compressed = compressed_read;
acb->acb_encrypted = encrypted_read;
acb->acb_noauth = noauth_read;
+ acb->acb_nobuf = no_buf;
acb->acb_zb = *zb;
if (pio != NULL)
acb->acb_zio_dummy = zio_null(pio,
@@ -5899,8 +6013,6 @@ top:
acb->acb_zio_head = head_zio;
acb->acb_next = hdr->b_l1hdr.b_acb;
hdr->b_l1hdr.b_acb = acb;
- mutex_exit(hash_lock);
- goto out;
}
mutex_exit(hash_lock);
goto out;
@@ -5909,7 +6021,7 @@ top:
ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
hdr->b_l1hdr.b_state == arc_mfu);
- if (done) {
+ if (done && !no_buf) {
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
/*
* This is a demand read which does not have to
@@ -5963,8 +6075,12 @@ top:
ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
rc != EACCES);
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
- zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
@@ -6108,8 +6224,13 @@ top:
}
if (*arc_flags & ARC_FLAG_PREFETCH &&
- zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
if (*arc_flags & ARC_FLAG_L2CACHE)
@@ -6178,7 +6299,11 @@ top:
metadata, misses);
}
- if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
+ /* Check if the spa even has l2 configured */
+ const boolean_t spa_has_l2 = l2arc_ndev != 0 &&
+ spa->spa_l2cache.sav_count > 0;
+
+ if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) {
/*
* Read from the L2ARC if the following are true:
* 1. The L2ARC vdev was previously cached.
@@ -6186,7 +6311,7 @@ top:
* 3. This buffer isn't currently writing to the L2ARC.
* 4. The L2ARC entry wasn't evicted, which may
* also have invalidated the vdev.
- * 5. This isn't prefetch and l2arc_noprefetch is set.
+	 * 5. This isn't a prefetch, or l2arc_noprefetch is 0.
*/
if (HDR_HAS_L2HDR(hdr) &&
!HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
@@ -6279,15 +6404,24 @@ top:
} else {
if (vd != NULL)
spa_config_exit(spa, SCL_L2ARC, vd);
+
/*
- * Skip ARC stat bump for block pointers with
- * embedded data. The data are read from the blkptr
- * itself via decode_embedded_bp_compressed().
+ * Only a spa with l2 should contribute to l2
+ * miss stats. (Including the case of having a
+ * faulted cache device - that's also a miss.)
*/
- if (l2arc_ndev != 0 && !embedded_bp) {
- DTRACE_PROBE1(l2arc__miss,
- arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(arcstat_l2_misses);
+ if (spa_has_l2) {
+ /*
+ * Skip ARC stat bump for block pointers with
+ * embedded data. The data are read from the
+ * blkptr itself via
+ * decode_embedded_bp_compressed().
+ */
+ if (!embedded_bp) {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ }
}
}
@@ -7072,9 +7206,9 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
*/
uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
uint64_t spa_dirty_anon = spa_dirty_data(spa);
-
- if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 &&
- anon_size > arc_c * zfs_arc_anon_limit_percent / 100 &&
+ uint64_t rarc_c = arc_warm ? arc_c : arc_c_max;
+ if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 &&
+ anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 &&
spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
#ifdef ZFS_DEBUG
uint64_t meta_esize = zfs_refcount_count(
@@ -7082,9 +7216,9 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
uint64_t data_esize =
zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
- "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
+ "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n",
arc_tempreserve >> 10, meta_esize >> 10,
- data_esize >> 10, reserve >> 10, arc_c >> 10);
+ data_esize >> 10, reserve >> 10, rarc_c >> 10);
#endif
DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
return (SET_ERROR(ERESTART));
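
A small worked sketch of the throttle arithmetic above with made-up sizes, showing why taking the limits against arc_c_max until the ARC is warm avoids throttling against a still-tiny arc_c right after boot.

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t arc_c = 64ULL << 20;		/* 64 MiB, still growing */
		uint64_t arc_c_max = 4ULL << 30;	/* 4 GiB */
		int arc_warm = 0;
		uint64_t dirty_limit_percent = 50, anon_limit_percent = 25;

		uint64_t rarc_c = arc_warm ? arc_c : arc_c_max;
		printf("dirty limit = %llu MiB, anon limit = %llu MiB\n",
		    (unsigned long long)(rarc_c * dirty_limit_percent / 100 >> 20),
		    (unsigned long long)(rarc_c * anon_limit_percent / 100 >> 20));
		/* prints: dirty limit = 2048 MiB, anon limit = 1024 MiB */
		return (0);
	}
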
@@ -7452,6 +7586,15 @@ arc_target_bytes(void)
}
void
+arc_set_limits(uint64_t allmem)
+{
+ /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */
+ arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
+
+ /* How to set default max varies by platform. */
+ arc_c_max = arc_default_max(arc_c_min, allmem);
+}
+void
arc_init(void)
{
uint64_t percent, allmem = arc_all_memory();
@@ -7466,11 +7609,7 @@ arc_init(void)
arc_lowmem_init();
#endif
- /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */
- arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
-
- /* How to set default max varies by platform. */
- arc_c_max = arc_default_max(arc_c_min, allmem);
+ arc_set_limits(allmem);
#ifndef _KERNEL
/*
@@ -7507,6 +7646,8 @@ arc_init(void)
if (arc_c < arc_c_min)
arc_c = arc_c_min;
+ arc_register_hotplug();
+
arc_state_init();
buf_init();
@@ -7515,8 +7656,9 @@ arc_init(void)
offsetof(arc_prune_t, p_node));
mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
- arc_prune_taskq = taskq_create("arc_prune", boot_ncpus, defclsyspri,
- boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri,
+ boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+ TASKQ_THREADS_CPU_PCT);
arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
@@ -7527,8 +7669,8 @@ arc_init(void)
kstat_install(arc_ksp);
}
- arc_evict_zthr = zthr_create_timer("arc_evict",
- arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1));
+ arc_evict_zthr = zthr_create("arc_evict",
+ arc_evict_cb_check, arc_evict_cb, NULL);
arc_reap_zthr = zthr_create_timer("arc_reap",
arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1));
@@ -7613,6 +7755,8 @@ arc_fini(void)
buf_fini();
arc_state_fini();
+ arc_unregister_hotplug();
+
/*
* We destroy the zthrs after all the ARC state has been
* torn down to avoid the case of them receiving any
@@ -8068,9 +8212,6 @@ l2arc_write_done(zio_t *zio)
DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
l2arc_write_callback_t *, cb);
- if (zio->io_error != 0)
- ARCSTAT_BUMP(arcstat_l2_writes_error);
-
/*
* All writes completed, or an error was hit.
*/
@@ -8134,8 +8275,7 @@ top:
arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
uint64_t psize = HDR_GET_PSIZE(hdr);
- ARCSTAT_INCR(arcstat_l2_psize, -psize);
- ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
+ l2arc_hdr_arcstats_decrement(hdr);
bytes_dropped +=
vdev_psize_to_asize(dev->l2ad_vdev, psize);
@@ -8183,6 +8323,8 @@ top:
list_destroy(&cb->l2wcb_abd_list);
if (zio->io_error != 0) {
+ ARCSTAT_BUMP(arcstat_l2_writes_error);
+
/*
* Restore the lbps array in the header to its previous state.
* If the list of log block pointers is empty, zero out the
@@ -8748,9 +8890,16 @@ out:
goto top;
}
- ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
- if (!dev->l2ad_first)
- ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
+ if (!all) {
+ /*
+ * In case of cache device removal (all) the following
+ * assertions may be violated without functional consequences
+ * as the device is about to be removed.
+ */
+ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
+ if (!dev->l2ad_first)
+ ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
+ }
}
/*
@@ -9089,6 +9238,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
hdr->b_l2hdr.b_hits = 0;
hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+ hdr->b_l2hdr.b_arcs_state =
+ hdr->b_l1hdr.b_state->arcs_state;
arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
mutex_enter(&dev->l2ad_mtx);
@@ -9111,6 +9262,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
write_psize += psize;
write_asize += asize;
dev->l2ad_hand += asize;
+ l2arc_hdr_arcstats_increment(hdr);
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
mutex_exit(hash_lock);
@@ -9153,8 +9305,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ASSERT3U(write_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
- ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
- ARCSTAT_INCR(arcstat_l2_psize, write_psize);
dev->l2ad_writing = B_TRUE;
(void) zio_wait(pio);
@@ -9379,8 +9529,6 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
l2arc_dev_hdr_phys_t *l2dhdr;
uint64_t l2dhdr_asize;
spa_t *spa;
- int err;
- boolean_t l2dhdr_valid = B_TRUE;
dev = l2arc_vdev_get(vd);
ASSERT3P(dev, !=, NULL);
@@ -9409,10 +9557,7 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
/*
 * Read the device header; if an error is returned, do not rebuild L2ARC.
*/
- if ((err = l2arc_dev_hdr_read(dev)) != 0)
- l2dhdr_valid = B_FALSE;
-
- if (l2dhdr_valid && dev->l2ad_log_entries > 0) {
+ if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) {
/*
* If we are onlining a cache device (vdev_reopen) that was
* still present (l2arc_vdev_present()) and rebuild is enabled,
@@ -9712,7 +9857,7 @@ l2arc_rebuild(l2arc_dev_t *dev)
* L2BLK_GET_PSIZE returns aligned size for log blocks.
*/
uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
- l2arc_log_blk_restore(dev, this_lb, asize, lbps[0].lbp_daddr);
+ l2arc_log_blk_restore(dev, this_lb, asize);
/*
* log block restored, include its pointer in the list of
@@ -9759,6 +9904,7 @@ l2arc_rebuild(l2arc_dev_t *dev)
!dev->l2ad_first)
goto out;
+ cond_resched();
for (;;) {
mutex_enter(&l2arc_rebuild_thr_lock);
if (dev->l2ad_rebuild_cancel) {
@@ -9792,7 +9938,7 @@ l2arc_rebuild(l2arc_dev_t *dev)
PTR_SWAP(this_lb, next_lb);
this_io = next_io;
next_io = NULL;
- }
+ }
if (this_io != NULL)
l2arc_log_blk_fetch_abort(this_io);
@@ -9859,7 +10005,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev)
err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
- ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_SPECULATIVE, B_FALSE));
@@ -10030,7 +10176,7 @@ cleanup:
*/
static void
l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
- uint64_t lb_asize, uint64_t lb_daddr)
+ uint64_t lb_asize)
{
uint64_t size = 0, asize = 0;
uint64_t log_entries = dev->l2ad_log_entries;
@@ -10104,19 +10250,18 @@ l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
L2BLK_GET_PSIZE((le)->le_prop), le->le_birth,
L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel,
L2BLK_GET_PROTECTED((le)->le_prop),
- L2BLK_GET_PREFETCH((le)->le_prop));
+ L2BLK_GET_PREFETCH((le)->le_prop),
+ L2BLK_GET_STATE((le)->le_prop));
asize = vdev_psize_to_asize(dev->l2ad_vdev,
L2BLK_GET_PSIZE((le)->le_prop));
/*
* vdev_space_update() has to be called before arc_hdr_destroy() to
- * avoid underflow since the latter also calls the former.
+ * avoid underflow since the latter also calls vdev_space_update().
*/
+ l2arc_hdr_arcstats_increment(hdr);
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
- ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(hdr));
- ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(hdr));
-
mutex_enter(&dev->l2ad_mtx);
list_insert_tail(&dev->l2ad_buflist, hdr);
(void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
@@ -10136,14 +10281,15 @@ l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
exists->b_l2hdr.b_dev = dev;
exists->b_l2hdr.b_daddr = le->le_daddr;
+ exists->b_l2hdr.b_arcs_state =
+ L2BLK_GET_STATE((le)->le_prop);
mutex_enter(&dev->l2ad_mtx);
list_insert_tail(&dev->l2ad_buflist, exists);
(void) zfs_refcount_add_many(&dev->l2ad_alloc,
arc_hdr_size(exists), exists);
mutex_exit(&dev->l2ad_mtx);
+ l2arc_hdr_arcstats_increment(exists);
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
- ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(exists));
- ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(exists));
}
ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
}
@@ -10439,6 +10585,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
+ L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
HDR_GET_PSIZE(hdr));
@@ -10607,5 +10754,8 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW,
"Percentage of excess dnodes to try to unpin");
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW,
- "When full, ARC allocation waits for eviction of this % of alloc size");
+ "When full, ARC allocation waits for eviction of this % of alloc size");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, INT, ZMOD_RW,
+ "The number of headers to evict per sublist before moving to the next");
/* END CSTYLED */
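
A sketch of the arc_c_min arithmetic that the new arc_set_limits() factors out of arc_init(): the minimum ARC size is 1/32 of memory but never below 2 << SPA_MAXBLOCKSHIFT (32 MiB). The memory sizes below are examples, and arc_default_max() is elided since it is platform specific.

	#include <stdio.h>
	#include <stdint.h>

	#define SPA_MAXBLOCKSHIFT	24	/* 16 MiB max block, as in ZFS */
	#define MAX(a, b)		((a) > (b) ? (a) : (b))

	int
	main(void)
	{
		uint64_t mem[] = { 512ULL << 20, 16ULL << 30 };

		for (int i = 0; i < 2; i++) {
			uint64_t arc_c_min = MAX(mem[i] / 32,
			    2ULL << SPA_MAXBLOCKSHIFT);
			printf("allmem=%llu MiB -> arc_c_min=%llu MiB\n",
			    (unsigned long long)(mem[i] >> 20),
			    (unsigned long long)(arc_c_min >> 20));
		}
		/* 512 MiB -> 32 MiB (floor applies); 16 GiB -> 512 MiB */
		return (0);
	}
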
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c
index 7d817320aae4..93445a80294b 100644
--- a/sys/contrib/openzfs/module/zfs/dbuf.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2019, Klara Inc.
@@ -1974,6 +1974,74 @@ dbuf_redirty(dbuf_dirty_record_t *dr)
}
dbuf_dirty_record_t *
+dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid);
+ dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE);
+ ASSERT(dn->dn_maxblkid >= blkid);
+
+ dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP);
+ list_link_init(&dr->dr_dirty_node);
+ list_link_init(&dr->dr_dbuf_node);
+ dr->dr_dnode = dn;
+ dr->dr_txg = tx->tx_txg;
+ dr->dt.dll.dr_blkid = blkid;
+ dr->dr_accounted = dn->dn_datablksz;
+
+ /*
+ * There should not be any dbuf for the block that we're dirtying.
+ * Otherwise the buffer contents could be inconsistent between the
+ * dbuf and the lightweight dirty record.
+ */
+ ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid));
+
+ mutex_enter(&dn->dn_mtx);
+ int txgoff = tx->tx_txg & TXG_MASK;
+ if (dn->dn_free_ranges[txgoff] != NULL) {
+ range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);
+ }
+
+ if (dn->dn_nlevels == 1) {
+ ASSERT3U(blkid, <, dn->dn_nblkptr);
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_setdirty(dn, tx);
+ } else {
+ mutex_exit(&dn->dn_mtx);
+
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ dmu_buf_impl_t *parent_db = dbuf_hold_level(dn,
+ 1, blkid >> epbs, FTAG);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (parent_db == NULL) {
+ kmem_free(dr, sizeof (*dr));
+ return (NULL);
+ }
+ int err = dbuf_read(parent_db, NULL,
+ (DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+ if (err != 0) {
+ dbuf_rele(parent_db, FTAG);
+ kmem_free(dr, sizeof (*dr));
+ return (NULL);
+ }
+
+ dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx);
+ dbuf_rele(parent_db, FTAG);
+ mutex_enter(&parent_dr->dt.di.dr_mtx);
+ ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg);
+ list_insert_tail(&parent_dr->dt.di.dr_children, dr);
+ mutex_exit(&parent_dr->dt.di.dr_mtx);
+ dr->dr_parent = parent_dr;
+ }
+
+ dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx);
+
+ return (dr);
+}
+
+dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
dnode_t *dn;
@@ -2090,6 +2158,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
list_link_init(&dr->dr_dirty_node);
list_link_init(&dr->dr_dbuf_node);
+ dr->dr_dnode = dn;
if (db->db_level == 0) {
void *data_old = db->db_buf;
@@ -2255,7 +2324,7 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
dmu_buf_impl_t *db = dr->dr_dbuf;
if (dr->dt.dl.dr_data != db->db.db_data) {
- struct dnode *dn = DB_DNODE(db);
+ struct dnode *dn = dr->dr_dnode;
int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
kmem_free(dr->dt.dl.dr_data, max_bonuslen);
@@ -2280,9 +2349,7 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
- dnode_t *dn;
uint64_t txg = tx->tx_txg;
- dbuf_dirty_record_t *dr;
ASSERT(txg != 0);
@@ -2302,13 +2369,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
/*
* If this buffer is not dirty, we're done.
*/
- dr = dbuf_find_dirty_eq(db, txg);
+ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg);
if (dr == NULL)
return (B_FALSE);
ASSERT(dr->dr_dbuf == db);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
+ dnode_t *dn = dr->dr_dnode;
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@@ -2336,7 +2402,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
mutex_exit(&dn->dn_mtx);
}
- DB_DNODE_EXIT(db);
if (db->db_state != DB_NOFILL) {
dbuf_unoverride(dr);
@@ -2627,11 +2692,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
(void) dbuf_dirty(db, tx);
bcopy(buf->b_data, db->db.db_data, db->db.db_size);
arc_buf_destroy(buf, db);
- xuio_stat_wbuf_copied();
return;
}
- xuio_stat_wbuf_nocopy();
if (db->db_state == DB_CACHED) {
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
@@ -3003,8 +3066,29 @@ typedef struct dbuf_prefetch_arg {
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+ dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
+ void *dpa_arg; /* prefetch completion arg */
} dbuf_prefetch_arg_t;
+static void
+dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
+{
+ if (dpa->dpa_cb != NULL)
+ dpa->dpa_cb(dpa->dpa_arg, io_done);
+ kmem_free(dpa, sizeof (*dpa));
+}
+
+static void
+dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *iobp, arc_buf_t *abuf, void *private)
+{
+ dbuf_prefetch_arg_t *dpa = private;
+
+ dbuf_prefetch_fini(dpa, B_TRUE);
+ if (abuf != NULL)
+ arc_buf_destroy(abuf, private);
+}
+
/*
* Actually issue the prefetch read for the block given.
*/
@@ -3017,11 +3101,12 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
SPA_FEATURE_REDACTED_DATASETS));
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
- return;
+ return (dbuf_prefetch_fini(dpa, B_FALSE));
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
arc_flags_t aflags =
- dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+ dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+ ARC_FLAG_NO_BUF;
/* dnodes are always read as raw and then converted later */
if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&
@@ -3031,7 +3116,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
ASSERT(dpa->dpa_zio != NULL);
- (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
+ dbuf_issue_final_prefetch_done, dpa,
dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
}
@@ -3051,8 +3137,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
if (abuf == NULL) {
ASSERT(zio == NULL || zio->io_error != 0);
- kmem_free(dpa, sizeof (*dpa));
- return;
+ return (dbuf_prefetch_fini(dpa, B_TRUE));
}
ASSERT(zio == NULL || zio->io_error == 0);
@@ -3084,11 +3169,9 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
dpa->dpa_curlevel, curblkid, FTAG);
if (db == NULL) {
- kmem_free(dpa, sizeof (*dpa));
arc_buf_destroy(abuf, private);
- return;
+ return (dbuf_prefetch_fini(dpa, B_TRUE));
}
-
(void) dbuf_read(db, NULL,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
dbuf_rele(db, FTAG);
@@ -3105,11 +3188,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
dpa->dpa_dnode->dn_objset->os_dsl_dataset,
SPA_FEATURE_REDACTED_DATASETS));
if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
- kmem_free(dpa, sizeof (*dpa));
+ dbuf_prefetch_fini(dpa, B_TRUE);
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
dbuf_issue_final_prefetch(dpa, bp);
- kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
@@ -3139,9 +3221,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
* complete. Note that the prefetch might fail if the dataset is encrypted and
* the encryption key is unmapped before the IO completes.
*/
-void
-dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
- arc_flags_t aflags)
+int
+dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
+ zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
+ void *arg)
{
blkptr_t bp;
int epbs, nlevels, curlevel;
@@ -3151,10 +3234,10 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
if (blkid > dn->dn_maxblkid)
- return;
+ goto no_issue;
if (level == 0 && dnode_block_freed(dn, blkid))
- return;
+ goto no_issue;
/*
* This dnode hasn't been written to disk yet, so there's nothing to
@@ -3162,11 +3245,11 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
*/
nlevels = dn->dn_phys->dn_nlevels;
if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
- return;
+ goto no_issue;
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
- return;
+ goto no_issue;
dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
level, blkid);
@@ -3176,7 +3259,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
* This dbuf already exists. It is either CACHED, or
* (we assume) about to be read or filled.
*/
- return;
+ goto no_issue;
}
/*
@@ -3212,7 +3295,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
SPA_FEATURE_REDACTED_DATASETS));
if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
- return;
+ goto no_issue;
ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
@@ -3230,6 +3313,8 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
dpa->dpa_dnode = dn;
dpa->dpa_epbs = epbs;
dpa->dpa_zio = pio;
+ dpa->dpa_cb = cb;
+ dpa->dpa_arg = arg;
/* flag if L2ARC eligible, l2arc_noprefetch then decides */
if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
@@ -3245,7 +3330,6 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
if (curlevel == level) {
ASSERT3U(curblkid, ==, blkid);
dbuf_issue_final_prefetch(dpa, &bp);
- kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
@@ -3266,6 +3350,19 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
* dpa may have already been freed.
*/
zio_nowait(pio);
+ return (1);
+no_issue:
+ if (cb != NULL)
+ cb(arg, B_FALSE);
+ return (0);
+}
+
+int
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+ arc_flags_t aflags)
+{
+
+ return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));
}
/*
@@ -3803,15 +3900,13 @@ dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT0(db->db_level);
ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(DB_DNODE_HELD(db));
ASSERT(db->db_blkid == DMU_BONUS_BLKID);
ASSERT(data != NULL);
- dnode_t *dn = DB_DNODE(db);
+ dnode_t *dn = dr->dr_dnode;
ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys));
- DB_DNODE_EXIT(db);
dbuf_sync_leaf_verify_bonus_dnode(dr);
@@ -3870,8 +3965,7 @@ noinline static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn;
- zio_t *zio;
+ dnode_t *dn = dr->dr_dnode;
ASSERT(dmu_tx_is_syncing(tx));
@@ -3891,12 +3985,9 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT3U(db->db_state, ==, DB_CACHED);
ASSERT(db->db_buf != NULL);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
/* Indirect block size must match what the dnode thinks it is. */
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
dbuf_check_blkptr(dn, db);
- DB_DNODE_EXIT(db);
/* Provide the pending dirty record to child dbufs */
db->db_data_pending = dr;
@@ -3905,7 +3996,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dbuf_write(dr, db->db_buf, tx);
- zio = dr->dr_zio;
+ zio_t *zio = dr->dr_zio;
mutex_enter(&dr->dt.di.dr_mtx);
dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -3930,7 +4021,7 @@ static void
dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
{
#ifdef ZFS_DEBUG
- dnode_t *dn = DB_DNODE(dr->dr_dbuf);
+ dnode_t *dn = dr->dr_dnode;
/*
* Encrypted bonus buffers can have data past their bonuslen.
@@ -3953,6 +4044,153 @@ dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
#endif
}
+static blkptr_t *
+dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
+{
+ /* This must be a lightweight dirty record. */
+ ASSERT3P(dr->dr_dbuf, ==, NULL);
+ dnode_t *dn = dr->dr_dnode;
+
+ if (dn->dn_phys->dn_nlevels == 1) {
+ VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr);
+ return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
+ } else {
+ dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ VERIFY3U(parent_db->db_level, ==, 1);
+ VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn);
+ VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
+ blkptr_t *bp = parent_db->db.db_data;
+ return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
+ }
+}
+
+static void
+dbuf_lightweight_ready(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio->io_error != 0)
+ return;
+
+ dnode_t *dn = dr->dr_dnode;
+
+ blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
+ spa_t *spa = dmu_objset_spa(dn->dn_objset);
+ int64_t delta = bp_get_dsize_sync(spa, bp) -
+ bp_get_dsize_sync(spa, bp_orig);
+ dnode_diduse_space(dn, delta);
+
+ uint64_t blkid = dr->dt.dll.dr_blkid;
+ mutex_enter(&dn->dn_mtx);
+ if (blkid > dn->dn_phys->dn_maxblkid) {
+ ASSERT0(dn->dn_objset->os_raw_receive);
+ dn->dn_phys->dn_maxblkid = blkid;
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ if (!BP_IS_EMBEDDED(bp)) {
+ uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1;
+ BP_SET_FILL(bp, fill);
+ }
+
+ dmu_buf_impl_t *parent_db;
+ EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
+ if (dr->dr_parent == NULL) {
+ parent_db = dn->dn_dbuf;
+ } else {
+ parent_db = dr->dr_parent->dr_dbuf;
+ }
+ rw_enter(&parent_db->db_rwlock, RW_WRITER);
+ *bp_orig = *bp;
+ rw_exit(&parent_db->db_rwlock);
+}
+
+static void
+dbuf_lightweight_physdone(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
+ ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+ /*
+ * The callback will be called io_phys_children times. Retire one
+ * portion of our dirty space each time we are called. Any rounding
+ * error will be cleaned up by dbuf_lightweight_done().
+ */
+ int delta = dr->dr_accounted / zio->io_phys_children;
+ dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
+
+static void
+dbuf_lightweight_done(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+
+ VERIFY0(zio->io_error);
+
+ objset_t *os = dr->dr_dnode->dn_objset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
+ ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
+ } else {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ (void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, zio->io_bp, tx);
+ }
+
+ /*
+ * See comment in dbuf_write_done().
+ */
+ if (zio->io_phys_children == 0) {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted, zio->io_txg);
+ } else {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted % zio->io_phys_children, zio->io_txg);
+ }
+
+ abd_free(dr->dt.dll.dr_abd);
+ kmem_free(dr, sizeof (*dr));
+}
+
+noinline static void
+dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ dnode_t *dn = dr->dr_dnode;
+ zio_t *pio;
+ if (dn->dn_phys->dn_nlevels == 1) {
+ pio = dn->dn_zio;
+ } else {
+ pio = dr->dr_parent->dr_zio;
+ }
+
+ zbookmark_phys_t zb = {
+ .zb_objset = dmu_objset_id(dn->dn_objset),
+ .zb_object = dn->dn_object,
+ .zb_level = 0,
+ .zb_blkid = dr->dt.dll.dr_blkid,
+ };
+
+ /*
+ * See comment in dbuf_write(). This is so that zio->io_bp_orig
+ * will have the old BP in dbuf_lightweight_done().
+ */
+ dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
+
+ dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
+ dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
+ dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
+ &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
+ dbuf_lightweight_physdone, dbuf_lightweight_done, dr,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
+
+ zio_nowait(dr->dr_zio);
+}
+
/*
* dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
 * critical that we not allow the compiler to inline this function into
@@ -3963,7 +4201,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
arc_buf_t **datap = &dr->dt.dl.dr_data;
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn;
+ dnode_t *dn = dr->dr_dnode;
objset_t *os;
uint64_t txg = tx->tx_txg;
@@ -3987,9 +4225,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
}
DBUF_VERIFY(db);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
-
if (db->db_blkid == DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
@@ -4079,16 +4314,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(!list_link_active(&dr->dr_dirty_node));
if (dn->dn_object == DMU_META_DNODE_OBJECT) {
list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
- DB_DNODE_EXIT(db);
} else {
- /*
- * Although zio_nowait() does not "wait for an IO", it does
- * initiate the IO. If this is an empty write it seems plausible
- * that the IO could actually be completed before the nowait
- * returns. We need to DB_DNODE_EXIT() first in case
- * zio_nowait() invalidates the dbuf.
- */
- DB_DNODE_EXIT(db);
zio_nowait(dr->dr_zio);
}
}
@@ -4111,15 +4337,19 @@ dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
DMU_META_DNODE_OBJECT);
break;
}
- if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
- dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
- VERIFY3U(dr->dr_dbuf->db_level, ==, level);
- }
list_remove(list, dr);
- if (dr->dr_dbuf->db_level > 0)
- dbuf_sync_indirect(dr, tx);
- else
- dbuf_sync_leaf(dr, tx);
+ if (dr->dr_dbuf == NULL) {
+ dbuf_sync_lightweight(dr, tx);
+ } else {
+ if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+ VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+ }
+ if (dr->dr_dbuf->db_level > 0)
+ dbuf_sync_indirect(dr, tx);
+ else
+ dbuf_sync_leaf(dr, tx);
+ }
}
}
@@ -4299,7 +4529,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
blkptr_t *bp = db->db_blkptr;
objset_t *os = db->db_objset;
dmu_tx_t *tx = os->os_synctx;
- dbuf_dirty_record_t *dr;
ASSERT0(zio->io_error);
ASSERT(db->db_blkptr == bp);
@@ -4320,7 +4549,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
DBUF_VERIFY(db);
- dr = db->db_data_pending;
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+ dnode_t *dn = dr->dr_dnode;
ASSERT(!list_link_active(&dr->dr_dirty_node));
ASSERT(dr->dr_dbuf == db);
ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
@@ -4328,14 +4558,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
#ifdef ZFS_DEBUG
if (db->db_blkid == DMU_SPILL_BLKID) {
- dnode_t *dn;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
- DB_DNODE_EXIT(db);
}
#endif
@@ -4347,10 +4572,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
arc_buf_destroy(dr->dt.dl.dr_data, db);
}
} else {
- dnode_t *dn;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
if (!BP_IS_HOLE(db->db_blkptr)) {
@@ -4361,7 +4582,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
db->db.db_size);
}
- DB_DNODE_EXIT(db);
mutex_destroy(&dr->dt.di.dr_mtx);
list_destroy(&dr->dt.di.dr_children);
}
@@ -4554,7 +4774,7 @@ static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn;
+ dnode_t *dn = dr->dr_dnode;
objset_t *os;
dmu_buf_impl_t *parent = db->db_parent;
uint64_t txg = tx->tx_txg;
@@ -4565,8 +4785,6 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
ASSERT(dmu_tx_is_syncing(tx));
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
os = dn->dn_objset;
if (db->db_state != DB_NOFILL) {
@@ -4622,7 +4840,6 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
- DB_DNODE_EXIT(db);
/*
* We copy the blkptr now (rather than when we instantiate the dirty
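
A userspace sketch of the completion-callback contract that dbuf_prefetch_impl() adds above: the callback fires exactly once whether or not a prefetch is actually issued, with io_done telling the caller which path was taken. The async I/O is faked with a direct call and all names are illustrative.

	#include <stdio.h>
	#include <stddef.h>

	typedef void (prefetch_cb_t)(void *arg, int io_done);

	static int
	prefetch_block(int issuable, prefetch_cb_t *cb, void *arg)
	{
		if (!issuable) {
			if (cb != NULL)
				cb(arg, 0);	/* "no_issue" path */
			return (0);
		}
		/* ... I/O would start here; its done routine calls cb ... */
		if (cb != NULL)
			cb(arg, 1);
		return (1);
	}

	static void
	note_done(void *arg, int io_done)
	{
		(*(int *)arg) += 1;
		printf("callback, io_done=%d\n", io_done);
	}

	int
	main(void)
	{
		int fired = 0;

		(void) prefetch_block(0, note_done, &fired);
		(void) prefetch_block(1, note_done, &fired);
		printf("fired %d times\n", fired);	/* once per call */
		return (0);
	}
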
diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c
index 2c96645214f8..a02f43df13fd 100644
--- a/sys/contrib/openzfs/module/zfs/dmu.c
+++ b/sys/contrib/openzfs/module/zfs/dmu.c
@@ -499,7 +499,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
uint64_t blkid, nblks, i;
uint32_t dbuf_flags;
int err;
- zio_t *zio;
+ zio_t *zio = NULL;
ASSERT(length <= DMU_MAX_ACCESS);
@@ -531,14 +531,17 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
}
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
- zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ if (read)
+ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, 0, offset);
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
if (db == NULL) {
rw_exit(&dn->dn_struct_rwlock);
dmu_buf_rele_array(dbp, nblks, tag);
- zio_nowait(zio);
+ if (read)
+ zio_nowait(zio);
return (SET_ERROR(EIO));
}
@@ -555,15 +558,15 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
}
rw_exit(&dn->dn_struct_rwlock);
- /* wait for async i/o */
- err = zio_wait(zio);
- if (err) {
- dmu_buf_rele_array(dbp, nblks, tag);
- return (err);
- }
-
- /* wait for other io to complete */
if (read) {
+ /* wait for async read i/o */
+ err = zio_wait(zio);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+
+ /* wait for other io to complete */
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
mutex_enter(&db->db_mtx);
@@ -1165,165 +1168,12 @@ dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
-/*
- * DMU support for xuio
- */
-kstat_t *xuio_ksp = NULL;
-
-typedef struct xuio_stats {
- /* loaned yet not returned arc_buf */
- kstat_named_t xuiostat_onloan_rbuf;
- kstat_named_t xuiostat_onloan_wbuf;
- /* whether a copy is made when loaning out a read buffer */
- kstat_named_t xuiostat_rbuf_copied;
- kstat_named_t xuiostat_rbuf_nocopy;
- /* whether a copy is made when assigning a write buffer */
- kstat_named_t xuiostat_wbuf_copied;
- kstat_named_t xuiostat_wbuf_nocopy;
-} xuio_stats_t;
-
-static xuio_stats_t xuio_stats = {
- { "onloan_read_buf", KSTAT_DATA_UINT64 },
- { "onloan_write_buf", KSTAT_DATA_UINT64 },
- { "read_buf_copied", KSTAT_DATA_UINT64 },
- { "read_buf_nocopy", KSTAT_DATA_UINT64 },
- { "write_buf_copied", KSTAT_DATA_UINT64 },
- { "write_buf_nocopy", KSTAT_DATA_UINT64 }
-};
-
-#define XUIOSTAT_INCR(stat, val) \
- atomic_add_64(&xuio_stats.stat.value.ui64, (val))
-#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1)
-
-#ifdef HAVE_UIO_ZEROCOPY
-int
-dmu_xuio_init(xuio_t *xuio, int nblk)
-{
- dmu_xuio_t *priv;
- uio_t *uio = &xuio->xu_uio;
-
- uio->uio_iovcnt = nblk;
- uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
-
- priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
- priv->cnt = nblk;
- priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
- priv->iovp = (iovec_t *)uio->uio_iov;
- XUIO_XUZC_PRIV(xuio) = priv;
-
- if (XUIO_XUZC_RW(xuio) == UIO_READ)
- XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
- else
- XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
-
- return (0);
-}
-
-void
-dmu_xuio_fini(xuio_t *xuio)
-{
- dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
- int nblk = priv->cnt;
-
- kmem_free(priv->iovp, nblk * sizeof (iovec_t));
- kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
- kmem_free(priv, sizeof (dmu_xuio_t));
-
- if (XUIO_XUZC_RW(xuio) == UIO_READ)
- XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
- else
- XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
-}
-
-/*
- * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
- * and increase priv->next by 1.
- */
-int
-dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
-{
- struct iovec *iov;
- uio_t *uio = &xuio->xu_uio;
- dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
- int i = priv->next++;
-
- ASSERT(i < priv->cnt);
- ASSERT(off + n <= arc_buf_lsize(abuf));
- iov = (iovec_t *)uio->uio_iov + i;
- iov->iov_base = (char *)abuf->b_data + off;
- iov->iov_len = n;
- priv->bufs[i] = abuf;
- return (0);
-}
-
-int
-dmu_xuio_cnt(xuio_t *xuio)
-{
- dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
- return (priv->cnt);
-}
-
-arc_buf_t *
-dmu_xuio_arcbuf(xuio_t *xuio, int i)
-{
- dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
-
- ASSERT(i < priv->cnt);
- return (priv->bufs[i]);
-}
-
-void
-dmu_xuio_clear(xuio_t *xuio, int i)
-{
- dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
-
- ASSERT(i < priv->cnt);
- priv->bufs[i] = NULL;
-}
-#endif /* HAVE_UIO_ZEROCOPY */
-
-static void
-xuio_stat_init(void)
-{
- xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
- KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
- if (xuio_ksp != NULL) {
- xuio_ksp->ks_data = &xuio_stats;
- kstat_install(xuio_ksp);
- }
-}
-
-static void
-xuio_stat_fini(void)
-{
- if (xuio_ksp != NULL) {
- kstat_delete(xuio_ksp);
- xuio_ksp = NULL;
- }
-}
-
-void
-xuio_stat_wbuf_copied(void)
-{
- XUIOSTAT_BUMP(xuiostat_wbuf_copied);
-}
-
-void
-xuio_stat_wbuf_nocopy(void)
-{
- XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
-}
-
#ifdef _KERNEL
int
dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
{
dmu_buf_t **dbp;
int numbufs, i, err;
-#ifdef HAVE_UIO_ZEROCOPY
- xuio_t *xuio = NULL;
-#endif
/*
* NB: we could do this block-at-a-time, but it's nice
@@ -1344,21 +1194,6 @@ dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
bufoff = uio_offset(uio) - db->db_offset;
tocpy = MIN(db->db_size - bufoff, size);
-#ifdef HAVE_UIO_ZEROCOPY
- if (xuio) {
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- arc_buf_t *dbuf_abuf = dbi->db_buf;
- arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
- err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
- if (!err)
- uio_advance(uio, tocpy);
-
- if (abuf == dbuf_abuf)
- XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
- else
- XUIOSTAT_BUMP(xuiostat_rbuf_copied);
- } else
-#endif
#ifdef __FreeBSD__
err = vn_io_fault_uiomove((char *)db->db_data + bufoff,
tocpy, uio);
@@ -1561,6 +1396,32 @@ dmu_return_arcbuf(arc_buf_t *buf)
}
/*
+ * A "lightweight" write is faster than a regular write (e.g.
+ * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the
+ * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]t. However, the
+ * data cannot be read or overwritten until the transaction's txg has been
+ * synced. This makes it appropriate for workloads that are known to be
+ * (temporarily) write-only, like "zfs receive".
+ *
+ * A single block is written, starting at the specified offset in bytes. If
+ * the call is successful, it returns 0 and the provided abd has been
+ * consumed (the caller should not free it).
+ */
+int
+dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
+ const zio_prop_t *zp, enum zio_flag flags, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr =
+ dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx);
+ if (dr == NULL)
+ return (SET_ERROR(EIO));
+ dr->dt.dll.dr_abd = abd;
+ dr->dt.dll.dr_props = *zp;
+ dr->dt.dll.dr_flags = flags;
+ return (0);
+}
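
To make the comment above concrete, here is a minimal caller-side sketch (illustration only, not part of the commit). The example_* name, and the assumption that the caller already has a held dnode, an abd, and an assigned tx, are hypothetical; dmu_write_policy() and dmu_lightweight_write_by_dnode() are the interfaces used elsewhere in this diff.

static int
example_lightweight_write(objset_t *os, dnode_t *dn, uint64_t offset,
    abd_t *abd, dmu_tx_t *tx)
{
	zio_prop_t zp;

	/* Derive the write policy (checksum, compression, copies) for dn. */
	dmu_write_policy(os, dn, 0, 0, &zp);

	/*
	 * On success the abd is consumed and must not be freed here; the
	 * data is unreadable until the txg syncs, so this is only suitable
	 * for write-only consumers such as "zfs receive".
	 */
	return (dmu_lightweight_write_by_dnode(dn, offset, abd, &zp, 0, tx));
}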
+
+/*
* When possible directly assign passed loaned arc buffer to a dbuf.
* If this is not possible copy the contents of passed arc buf via
* dmu_write().
@@ -1583,8 +1444,8 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
rw_exit(&dn->dn_struct_rwlock);
/*
- * We can only assign if the offset is aligned, the arc buf is the
- * same size as the dbuf, and the dbuf is not metadata.
+ * We can only assign if the offset is aligned and the arc buf is the
+ * same size as the dbuf.
*/
if (offset == db->db.db_offset && blksz == db->db.db_size) {
dbuf_assign_arcbuf(db, buf, tx);
@@ -1597,7 +1458,6 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
dbuf_rele(db, FTAG);
dmu_write(os, object, offset, blksz, buf->b_data, tx);
dmu_return_arcbuf(buf);
- XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}
return (0);
@@ -2409,7 +2269,6 @@ dmu_init(void)
abd_init();
zfs_dbgmsg_init();
sa_cache_init();
- xuio_stat_init();
dmu_objset_init();
dnode_init();
zfetch_init();
@@ -2429,7 +2288,6 @@ dmu_fini(void)
dbuf_fini();
dnode_fini();
dmu_objset_fini();
- xuio_stat_fini();
sa_cache_fini();
zfs_dbgmsg_fini();
abd_fini();
diff --git a/sys/contrib/openzfs/module/zfs/dmu_object.c b/sys/contrib/openzfs/module/zfs/dmu_object.c
index 453a2842ce6e..12cdbd68b104 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_object.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_object.c
@@ -58,10 +58,8 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
int error;
- kpreempt_disable();
- cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
+ cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE %
os->os_obj_next_percpu_len];
- kpreempt_enable();
if (dn_slots == 0) {
dn_slots = DNODE_MIN_SLOTS;
diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c
index af5935e2374d..66a8f20092e0 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_objset.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -682,8 +682,9 @@ dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag,
dsl_pool_t *dp;
dsl_dataset_t *ds;
int err;
- ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
+ ds_hold_flags_t flags;
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
err = dsl_pool_hold(name, tag, &dp);
if (err != 0)
return (err);
@@ -755,8 +756,9 @@ dmu_objset_own(const char *name, dmu_objset_type_t type,
dsl_pool_t *dp;
dsl_dataset_t *ds;
int err;
- ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
+ ds_hold_flags_t flags;
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
err = dsl_pool_hold(name, FTAG, &dp);
if (err != 0)
return (err);
@@ -778,11 +780,15 @@ dmu_objset_own(const char *name, dmu_objset_type_t type,
* speed up pool import times and to keep this txg reserved
* completely for recovery work.
*/
- if ((dmu_objset_userobjspace_upgradable(*osp) ||
- dmu_objset_projectquota_upgradable(*osp)) &&
- !readonly && !dp->dp_spa->spa_claiming &&
- (ds->ds_dir->dd_crypto_obj == 0 || decrypt))
- dmu_objset_id_quota_upgrade(*osp);
+ if (!readonly && !dp->dp_spa->spa_claiming &&
+ (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) {
+ if (dmu_objset_userobjspace_upgradable(*osp) ||
+ dmu_objset_projectquota_upgradable(*osp)) {
+ dmu_objset_id_quota_upgrade(*osp);
+ } else if (dmu_objset_userused_enabled(*osp)) {
+ dmu_objset_userspace_upgrade(*osp);
+ }
+ }
dsl_pool_rele(dp, FTAG);
return (0);
@@ -794,8 +800,9 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
{
dsl_dataset_t *ds;
int err;
- ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
+ ds_hold_flags_t flags;
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds);
if (err != 0)
return (err);
@@ -812,9 +819,10 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
void
dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag)
{
- ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
-
+ ds_hold_flags_t flags;
dsl_pool_t *dp = dmu_objset_pool(os);
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag);
dsl_pool_rele(dp, tag);
}
@@ -842,7 +850,9 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
{
dsl_pool_t *dp;
char name[ZFS_MAX_DATASET_NAME_LEN];
+ ds_hold_flags_t flags;
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
VERIFY3P(ds, !=, NULL);
VERIFY3P(ds->ds_owner, ==, tag);
VERIFY(dsl_dataset_long_held(ds));
@@ -850,21 +860,22 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
dsl_dataset_name(ds, name);
dp = ds->ds_dir->dd_pool;
dsl_pool_config_enter(dp, FTAG);
- dsl_dataset_disown(ds, decrypt, tag);
- VERIFY0(dsl_dataset_own(dp, name,
- (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag, newds));
+ dsl_dataset_disown(ds, flags, tag);
+ VERIFY0(dsl_dataset_own(dp, name, flags, tag, newds));
dsl_pool_config_exit(dp, FTAG);
}
void
dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag)
{
+ ds_hold_flags_t flags;
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
/*
* Stop upgrading thread
*/
dmu_objset_upgrade_stop(os);
- dsl_dataset_disown(os->os_dsl_dataset,
- (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag);
+ dsl_dataset_disown(os->os_dsl_dataset, flags, tag);
}
void
@@ -1231,7 +1242,7 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
}
VERIFY0(zio_wait(rzio));
- dmu_objset_do_userquota_updates(os, tx);
+ dmu_objset_sync_done(os, tx);
taskq_wait(dp->dp_sync_taskq);
if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
ASSERT3P(ds->ds_key_mapping, !=, NULL);
@@ -1424,10 +1435,15 @@ dmu_objset_upgrade_task_cb(void *data)
mutex_enter(&os->os_upgrade_lock);
os->os_upgrade_status = EINTR;
if (!os->os_upgrade_exit) {
+ int status;
+
mutex_exit(&os->os_upgrade_lock);
- os->os_upgrade_status = os->os_upgrade_cb(os);
+ status = os->os_upgrade_cb(os);
+
mutex_enter(&os->os_upgrade_lock);
+
+ os->os_upgrade_status = status;
}
os->os_upgrade_exit = B_TRUE;
os->os_upgrade_id = 0;
@@ -1455,6 +1471,8 @@ dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb)
dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
os->os_upgrade_status = ENOMEM;
}
+ } else {
+ dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
}
mutex_exit(&os->os_upgrade_lock);
}
@@ -1498,23 +1516,13 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
multilist_sublist_remove(list, dn);
/*
- * If we are not doing useraccounting (os_synced_dnodes == NULL)
- * we are done with this dnode for this txg. Unset dn_dirty_txg
- * if later txgs aren't dirtying it so that future holders do
- * not get a stale value. Otherwise, we will do this in
- * userquota_updates_task() when processing has completely
- * finished for this txg.
+ * See the comment above dnode_rele_task() for an explanation
+ * of why this dnode hold is always needed (even when not
+ * doing user accounting).
*/
multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
- if (newlist != NULL) {
- (void) dnode_add_ref(dn, newlist);
- multilist_insert(newlist, dn);
- } else {
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_dirty_txg == tx->tx_txg)
- dn->dn_dirty_txg = 0;
- mutex_exit(&dn->dn_mtx);
- }
+ (void) dnode_add_ref(dn, newlist);
+ multilist_insert(newlist, dn);
dnode_sync(dn, tx);
}
@@ -1676,22 +1684,19 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
txgoff = tx->tx_txg & TXG_MASK;
- if (dmu_objset_userused_enabled(os) &&
- (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
- /*
- * We must create the list here because it uses the
- * dn_dirty_link[] of this txg. But it may already
- * exist because we call dsl_dataset_sync() twice per txg.
- */
- if (os->os_synced_dnodes == NULL) {
- os->os_synced_dnodes =
- multilist_create(sizeof (dnode_t),
- offsetof(dnode_t, dn_dirty_link[txgoff]),
- dnode_multilist_index_func);
- } else {
- ASSERT3U(os->os_synced_dnodes->ml_offset, ==,
- offsetof(dnode_t, dn_dirty_link[txgoff]));
- }
+ /*
+ * We must create the list here because it uses the
+ * dn_dirty_link[] of this txg. But it may already
+ * exist because we call dsl_dataset_sync() twice per txg.
+ */
+ if (os->os_synced_dnodes == NULL) {
+ os->os_synced_dnodes =
+ multilist_create(sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[txgoff]),
+ dnode_multilist_index_func);
+ } else {
+ ASSERT3U(os->os_synced_dnodes->ml_offset, ==,
+ offsetof(dnode_t, dn_dirty_link[txgoff]));
}
ml = os->os_dirty_dnodes[txgoff];
@@ -1998,8 +2003,6 @@ userquota_updates_task(void *arg)
dn->dn_id_flags |= DN_ID_CHKED_BONUS;
}
dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
- if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa))
- dn->dn_dirty_txg = 0;
mutex_exit(&dn->dn_mtx);
multilist_sublist_remove(list, dn);
@@ -2010,13 +2013,44 @@ userquota_updates_task(void *arg)
kmem_free(uua, sizeof (*uua));
}
-void
-dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
+/*
+ * Release dnode holds from dmu_objset_sync_dnodes(). When the dnode is being
+ * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be
+ * evicted because the block containing the dnode can't be evicted until it is
+ * written out. However, this hold is necessary to prevent the dnode_t from
+ * being moved (via dnode_move()) while it's still referenced by
+ * dbuf_dirty_record_t:dr_dnode. And dr_dnode is needed for
+ * dirty_lightweight_leaf-type dirty records.
+ *
+ * If we are doing user-object accounting, the dnode_rele() happens from
+ * userquota_updates_task() instead.
+ */
+static void
+dnode_rele_task(void *arg)
{
- int num_sublists;
+ userquota_updates_arg_t *uua = arg;
+ objset_t *os = uua->uua_os;
+ multilist_sublist_t *list =
+ multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);
+
+ dnode_t *dn;
+ while ((dn = multilist_sublist_head(list)) != NULL) {
+ multilist_sublist_remove(list, dn);
+ dnode_rele(dn, os->os_synced_dnodes);
+ }
+ multilist_sublist_unlock(list);
+ kmem_free(uua, sizeof (*uua));
+}
+
+/*
+ * Return TRUE if userquota updates are needed.
+ */
+static boolean_t
+dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx)
+{
if (!dmu_objset_userused_enabled(os))
- return;
+ return (B_FALSE);
/*
* If this is a raw receive just return and handle accounting
@@ -2026,10 +2060,10 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
* used for recovery.
*/
if (os->os_encrypted && dmu_objset_is_receiving(os))
- return;
+ return (B_FALSE);
if (tx->tx_txg <= os->os_spa->spa_claim_max_txg)
- return;
+ return (B_FALSE);
/* Allocate the user/group/project used objects if necessary. */
if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
@@ -2046,23 +2080,39 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT,
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
}
+ return (B_TRUE);
+}
- num_sublists = multilist_get_num_sublists(os->os_synced_dnodes);
+/*
+ * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and
+ * also release the holds on the dnodes from dmu_objset_sync_dnodes().
+ * The caller must taskq_wait(dp_sync_taskq).
+ */
+void
+dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx)
+{
+ boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx);
+
+ int num_sublists = multilist_get_num_sublists(os->os_synced_dnodes);
for (int i = 0; i < num_sublists; i++) {
- if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i))
- continue;
userquota_updates_arg_t *uua =
kmem_alloc(sizeof (*uua), KM_SLEEP);
uua->uua_os = os;
uua->uua_sublist_idx = i;
uua->uua_tx = tx;
- /* note: caller does taskq_wait() */
+
+ /*
+ * If we don't need to update userquotas, use
+ * dnode_rele_task() to call dnode_rele()
+ */
(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
- userquota_updates_task, uua, 0);
+ need_userquota ? userquota_updates_task : dnode_rele_task,
+ uua, 0);
/* callback frees uua */
}
}
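
A caller-side sketch of the contract stated above ("The caller must taskq_wait(dp_sync_taskq)"), mirroring the dsl_pool.c changes later in this diff; os and tx are assumed to be the objset being synced and the open sync transaction.

dmu_objset_sync_done(os, tx);
taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
/* All userquota_updates_task()/dnode_rele_task() callbacks have run. */
multilist_destroy(os->os_synced_dnodes);
os->os_synced_dnodes = NULL;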
+
/*
* Returns a pointer to data to find uid/gid from
*
@@ -2084,18 +2134,11 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (dr == NULL) {
data = NULL;
} else {
- dnode_t *dn;
-
- DB_DNODE_ENTER(dr->dr_dbuf);
- dn = DB_DNODE(dr->dr_dbuf);
-
- if (dn->dn_bonuslen == 0 &&
+ if (dr->dr_dnode->dn_bonuslen == 0 &&
dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
data = dr->dt.dl.dr_data->b_data;
else
data = dr->dt.dl.dr_data;
-
- DB_DNODE_EXIT(dr->dr_dbuf);
}
return (data);
@@ -2285,8 +2328,8 @@ dmu_objset_space_upgrade(objset_t *os)
return (0);
}
-int
-dmu_objset_userspace_upgrade(objset_t *os)
+static int
+dmu_objset_userspace_upgrade_cb(objset_t *os)
{
int err = 0;
@@ -2306,6 +2349,12 @@ dmu_objset_userspace_upgrade(objset_t *os)
return (0);
}
+void
+dmu_objset_userspace_upgrade(objset_t *os)
+{
+ dmu_objset_upgrade(os, dmu_objset_userspace_upgrade_cb);
+}
+
static int
dmu_objset_id_quota_upgrade_cb(objset_t *os)
{
@@ -2316,14 +2365,15 @@ dmu_objset_id_quota_upgrade_cb(objset_t *os)
return (0);
if (dmu_objset_is_snapshot(os))
return (SET_ERROR(EINVAL));
- if (!dmu_objset_userobjused_enabled(os))
+ if (!dmu_objset_userused_enabled(os))
return (SET_ERROR(ENOTSUP));
if (!dmu_objset_projectquota_enabled(os) &&
dmu_objset_userobjspace_present(os))
return (SET_ERROR(ENOTSUP));
- dmu_objset_ds(os)->ds_feature_activation[
- SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
+ if (dmu_objset_userobjused_enabled(os))
+ dmu_objset_ds(os)->ds_feature_activation[
+ SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
if (dmu_objset_projectquota_enabled(os))
dmu_objset_ds(os)->ds_feature_activation[
SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;
@@ -2332,7 +2382,9 @@ dmu_objset_id_quota_upgrade_cb(objset_t *os)
if (err)
return (err);
- os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
+ os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ if (dmu_objset_userobjused_enabled(os))
+ os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
if (dmu_objset_projectquota_enabled(os))
os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE;
@@ -2977,7 +3029,7 @@ EXPORT_SYMBOL(dmu_objset_create_impl);
EXPORT_SYMBOL(dmu_objset_open_impl);
EXPORT_SYMBOL(dmu_objset_evict);
EXPORT_SYMBOL(dmu_objset_register_type);
-EXPORT_SYMBOL(dmu_objset_do_userquota_updates);
+EXPORT_SYMBOL(dmu_objset_sync_done);
EXPORT_SYMBOL(dmu_objset_userquota_get_ids);
EXPORT_SYMBOL(dmu_objset_userused_enabled);
EXPORT_SYMBOL(dmu_objset_userspace_upgrade);
diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c
index 2eee19a28e34..a0fd157ebc5f 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_recv.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c
@@ -79,10 +79,10 @@ struct receive_record_arg {
dmu_replay_record_t header;
void *payload; /* Pointer to a buffer containing the payload */
/*
- * If the record is a write, pointer to the arc_buf_t containing the
+ * If the record is a WRITE or SPILL, pointer to the abd containing the
* payload.
*/
- arc_buf_t *arc_buf;
+ abd_t *abd;
int payload_size;
uint64_t bytes_read; /* bytes read from stream when record created */
boolean_t eos_marker; /* Marks the end of the stream */
@@ -95,8 +95,8 @@ struct receive_writer_arg {
bqueue_t q;
/*
- * These three args are used to signal to the main thread that we're
- * done.
+ * These three members are used to signal to the main thread when
+ * we're done.
*/
kmutex_t mutex;
kcondvar_t cv;
@@ -175,18 +175,6 @@ byteswap_record(dmu_replay_record_t *drr)
DO64(drr_write.drr_key.ddk_prop);
DO64(drr_write.drr_compressed_size);
break;
- case DRR_WRITE_BYREF:
- DO64(drr_write_byref.drr_object);
- DO64(drr_write_byref.drr_offset);
- DO64(drr_write_byref.drr_length);
- DO64(drr_write_byref.drr_toguid);
- DO64(drr_write_byref.drr_refguid);
- DO64(drr_write_byref.drr_refobject);
- DO64(drr_write_byref.drr_refoffset);
- ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref.
- drr_key.ddk_cksum);
- DO64(drr_write_byref.drr_key.ddk_prop);
- break;
case DRR_WRITE_EMBEDDED:
DO64(drr_write_embedded.drr_object);
DO64(drr_write_embedded.drr_offset);
@@ -572,7 +560,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
uint64_t fromguid = drrb->drr_fromguid;
int flags = drrb->drr_flags;
- ds_hold_flags_t dsflags = 0;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
int error;
uint64_t featureflags = drba->drba_cookie->drc_featureflags;
dsl_dataset_t *ds;
@@ -784,7 +772,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
dsl_dataset_t *ds, *newds;
objset_t *os;
uint64_t dsobj;
- ds_hold_flags_t dsflags = 0;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
int error;
uint64_t crflags = 0;
dsl_crypto_params_t dummy_dcp = { 0 };
@@ -958,7 +946,7 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
dsl_pool_t *dp = dmu_tx_pool(tx);
struct drr_begin *drrb = drc->drc_drrb;
int error;
- ds_hold_flags_t dsflags = 0;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
dsl_dataset_t *ds;
const char *tofs = drc->drc_tofs;
@@ -1106,7 +1094,7 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
const char *tofs = drba->drba_cookie->drc_tofs;
uint64_t featureflags = drba->drba_cookie->drc_featureflags;
dsl_dataset_t *ds;
- ds_hold_flags_t dsflags = 0;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
/* 6 extra bytes for /%recv */
char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
@@ -1903,58 +1891,106 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
struct receive_record_arg *rrd;
while ((rrd = list_head(&rwa->write_batch)) != NULL) {
struct drr_write *drrw = &rrd->header.drr_u.drr_write;
- arc_buf_t *abuf = rrd->arc_buf;
+ abd_t *abd = rrd->abd;
ASSERT3U(drrw->drr_object, ==, rwa->last_object);
- if (rwa->byteswap && !arc_is_encrypted(abuf) &&
- arc_get_compression(abuf) == ZIO_COMPRESS_OFF) {
- dmu_object_byteswap_t byteswap =
- DMU_OT_BYTESWAP(drrw->drr_type);
- dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
- DRR_WRITE_PAYLOAD_SIZE(drrw));
- }
-
- /*
- * If we are receiving an incremental large-block stream into
- * a dataset that previously did a non-large-block receive,
- * the WRITE record may be larger than the object's block
- * size. dmu_assign_arcbuf_by_dnode() handles this as long
- * as the arcbuf is not compressed, so decompress it here if
- * necessary.
- */
- if (drrw->drr_logical_size != dn->dn_datablksz &&
- arc_get_compression(abuf) != ZIO_COMPRESS_OFF) {
+ if (drrw->drr_logical_size != dn->dn_datablksz) {
+ /*
+ * The WRITE record is larger than the object's block
+ * size. We must be receiving an incremental
+ * large-block stream into a dataset that previously did
+ * a non-large-block receive. Lightweight writes must
+ * be exactly one block, so we need to decompress the
+ * data (if compressed) and do a normal dmu_write().
+ */
ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz);
- zbookmark_phys_t zb = {
- .zb_objset = dmu_objset_id(rwa->os),
- .zb_object = rwa->last_object,
- .zb_level = 0,
- .zb_blkid =
- drrw->drr_offset >> dn->dn_datablkshift,
- };
+ if (DRR_WRITE_COMPRESSED(drrw)) {
+ abd_t *decomp_abd =
+ abd_alloc_linear(drrw->drr_logical_size,
+ B_FALSE);
+
+ err = zio_decompress_data(
+ drrw->drr_compressiontype,
+ abd, abd_to_buf(decomp_abd),
+ abd_get_size(abd),
+ abd_get_size(decomp_abd), NULL);
+
+ if (err == 0) {
+ dmu_write_by_dnode(dn,
+ drrw->drr_offset,
+ drrw->drr_logical_size,
+ abd_to_buf(decomp_abd), tx);
+ }
+ abd_free(decomp_abd);
+ } else {
+ dmu_write_by_dnode(dn,
+ drrw->drr_offset,
+ drrw->drr_logical_size,
+ abd_to_buf(abd), tx);
+ }
+ if (err == 0)
+ abd_free(abd);
+ } else {
+ zio_prop_t zp;
+ dmu_write_policy(rwa->os, dn, 0, 0, &zp);
+
+ enum zio_flag zio_flags = 0;
+
+ if (rwa->raw) {
+ zp.zp_encrypt = B_TRUE;
+ zp.zp_compress = drrw->drr_compressiontype;
+ zp.zp_byteorder = ZFS_HOST_BYTEORDER ^
+ !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
+ rwa->byteswap;
+ bcopy(drrw->drr_salt, zp.zp_salt,
+ ZIO_DATA_SALT_LEN);
+ bcopy(drrw->drr_iv, zp.zp_iv,
+ ZIO_DATA_IV_LEN);
+ bcopy(drrw->drr_mac, zp.zp_mac,
+ ZIO_DATA_MAC_LEN);
+ if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) {
+ zp.zp_nopwrite = B_FALSE;
+ zp.zp_copies = MIN(zp.zp_copies,
+ SPA_DVAS_PER_BP - 1);
+ }
+ zio_flags |= ZIO_FLAG_RAW;
+ } else if (DRR_WRITE_COMPRESSED(drrw)) {
+ ASSERT3U(drrw->drr_compressed_size, >, 0);
+ ASSERT3U(drrw->drr_logical_size, >=,
+ drrw->drr_compressed_size);
+ zp.zp_compress = drrw->drr_compressiontype;
+ zio_flags |= ZIO_FLAG_RAW_COMPRESS;
+ } else if (rwa->byteswap) {
+ /*
+ * Note: compressed blocks never need to be
+ * byteswapped, because WRITE records for
+ * metadata blocks are never compressed. The
+ * exception is raw streams, which are written
+ * in the original byteorder, and the byteorder
+ * bit is preserved in the BP by setting
+ * zp_byteorder above.
+ */
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drrw->drr_type);
+ dmu_ot_byteswap[byteswap].ob_func(
+ abd_to_buf(abd),
+ DRR_WRITE_PAYLOAD_SIZE(drrw));
+ }
/*
- * The size of loaned arc bufs is counted in
- * arc_loaned_bytes. When we untransform
- * (decompress) the buf, its size increases. To
- * ensure that arc_loaned_bytes remains accurate, we
- * need to return (un-loan) the buf (with its
- * compressed size) and then re-loan it (with its
- * new, uncompressed size).
+ * Since this data can't be read until the receive
+ * completes, we can do a "lightweight" write for
+ * improved performance.
*/
- arc_return_buf(abuf, FTAG);
- VERIFY0(arc_untransform(abuf, dmu_objset_spa(rwa->os),
- &zb, B_FALSE));
- arc_loan_inuse_buf(abuf, FTAG);
+ err = dmu_lightweight_write_by_dnode(dn,
+ drrw->drr_offset, abd, &zp, zio_flags, tx);
}
- err = dmu_assign_arcbuf_by_dnode(dn,
- drrw->drr_offset, abuf, tx);
if (err != 0) {
/*
* This rrd is left on the list, so the caller will
- * free it (and the arc_buf).
+ * free it (and the abd).
*/
break;
}
@@ -1987,7 +2023,7 @@ flush_write_batch(struct receive_writer_arg *rwa)
if (err != 0) {
struct receive_record_arg *rrd;
while ((rrd = list_remove_head(&rwa->write_batch)) != NULL) {
- dmu_return_arcbuf(rrd->arc_buf);
+ abd_free(rrd->abd);
kmem_free(rrd, sizeof (*rrd));
}
}
@@ -2090,9 +2126,8 @@ receive_write_embedded(struct receive_writer_arg *rwa,
static int
receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
- arc_buf_t *abuf)
+ abd_t *abd)
{
- dmu_tx_t *tx;
dmu_buf_t *db, *db_spill;
int err;
@@ -2107,7 +2142,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
* the DRR_FLAG_SPILL_BLOCK flag.
*/
if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) {
- dmu_return_arcbuf(abuf);
+ abd_free(abd);
return (0);
}
@@ -2131,7 +2166,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
return (err);
}
- tx = dmu_tx_create(rwa->os);
+ dmu_tx_t *tx = dmu_tx_create(rwa->os);
dmu_tx_hold_spill(tx, db->db_object);
@@ -2150,18 +2185,35 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
*/
if (db_spill->db_size != drrs->drr_length) {
dmu_buf_will_fill(db_spill, tx);
- VERIFY(0 == dbuf_spill_set_blksz(db_spill,
+ VERIFY0(dbuf_spill_set_blksz(db_spill,
drrs->drr_length, tx));
}
- if (rwa->byteswap && !arc_is_encrypted(abuf) &&
- arc_get_compression(abuf) == ZIO_COMPRESS_OFF) {
- dmu_object_byteswap_t byteswap =
- DMU_OT_BYTESWAP(drrs->drr_type);
- dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
- DRR_SPILL_PAYLOAD_SIZE(drrs));
+ arc_buf_t *abuf;
+ if (rwa->raw) {
+ boolean_t byteorder = ZFS_HOST_BYTEORDER ^
+ !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
+ rwa->byteswap;
+
+ abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os),
+ drrs->drr_object, byteorder, drrs->drr_salt,
+ drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
+ drrs->drr_compressed_size, drrs->drr_length,
+ drrs->drr_compressiontype, 0);
+ } else {
+ abuf = arc_loan_buf(dmu_objset_spa(rwa->os),
+ DMU_OT_IS_METADATA(drrs->drr_type),
+ drrs->drr_length);
+ if (rwa->byteswap) {
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drrs->drr_type);
+ dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd),
+ DRR_SPILL_PAYLOAD_SIZE(drrs));
+ }
}
+ bcopy(abd_to_buf(abd), abuf->b_data, DRR_SPILL_PAYLOAD_SIZE(drrs));
+ abd_free(abd);
dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx);
dmu_buf_rele(db, FTAG);
@@ -2263,8 +2315,9 @@ static void
dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
{
dsl_dataset_t *ds = drc->drc_ds;
- ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT;
+ ds_hold_flags_t dsflags;
+ dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
/*
* Wait for the txg sync before cleaning up the receive. For
* resumable receives, this ensures that our resume state has
@@ -2451,53 +2504,19 @@ receive_read_record(dmu_recv_cookie_t *drc)
case DRR_WRITE:
{
struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write;
- arc_buf_t *abuf;
- boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type);
-
- if (drc->drc_raw) {
- boolean_t byteorder = ZFS_HOST_BYTEORDER ^
- !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
- drc->drc_byteswap;
-
- abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os),
- drrw->drr_object, byteorder, drrw->drr_salt,
- drrw->drr_iv, drrw->drr_mac, drrw->drr_type,
- drrw->drr_compressed_size, drrw->drr_logical_size,
- drrw->drr_compressiontype, 0);
- } else if (DRR_WRITE_COMPRESSED(drrw)) {
- ASSERT3U(drrw->drr_compressed_size, >, 0);
- ASSERT3U(drrw->drr_logical_size, >=,
- drrw->drr_compressed_size);
- ASSERT(!is_meta);
- abuf = arc_loan_compressed_buf(
- dmu_objset_spa(drc->drc_os),
- drrw->drr_compressed_size, drrw->drr_logical_size,
- drrw->drr_compressiontype, 0);
- } else {
- abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os),
- is_meta, drrw->drr_logical_size);
- }
-
- err = receive_read_payload_and_next_header(drc,
- DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data);
+ int size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+ abd_t *abd = abd_alloc_linear(size, B_FALSE);
+ err = receive_read_payload_and_next_header(drc, size,
+ abd_to_buf(abd));
if (err != 0) {
- dmu_return_arcbuf(abuf);
+ abd_free(abd);
return (err);
}
- drc->drc_rrd->arc_buf = abuf;
+ drc->drc_rrd->abd = abd;
receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset,
drrw->drr_logical_size);
return (err);
}
- case DRR_WRITE_BYREF:
- {
- struct drr_write_byref *drrwb =
- &drc->drc_rrd->header.drr_u.drr_write_byref;
- err = receive_read_payload_and_next_header(drc, 0, NULL);
- receive_read_prefetch(drc, drrwb->drr_object, drrwb->drr_offset,
- drrwb->drr_length);
- return (err);
- }
case DRR_WRITE_EMBEDDED:
{
struct drr_write_embedded *drrwe =
@@ -2536,29 +2555,14 @@ receive_read_record(dmu_recv_cookie_t *drc)
case DRR_SPILL:
{
struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill;
- arc_buf_t *abuf;
- /* DRR_SPILL records are either raw or uncompressed */
- if (drc->drc_raw) {
- boolean_t byteorder = ZFS_HOST_BYTEORDER ^
- !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
- drc->drc_byteswap;
-
- abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os),
- drrs->drr_object, byteorder, drrs->drr_salt,
- drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
- drrs->drr_compressed_size, drrs->drr_length,
- drrs->drr_compressiontype, 0);
- } else {
- abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os),
- DMU_OT_IS_METADATA(drrs->drr_type),
- drrs->drr_length);
- }
- err = receive_read_payload_and_next_header(drc,
- DRR_SPILL_PAYLOAD_SIZE(drrs), abuf->b_data);
+ int size = DRR_SPILL_PAYLOAD_SIZE(drrs);
+ abd_t *abd = abd_alloc_linear(size, B_FALSE);
+ err = receive_read_payload_and_next_header(drc, size,
+ abd_to_buf(abd));
if (err != 0)
- dmu_return_arcbuf(abuf);
+ abd_free(abd);
else
- drc->drc_rrd->arc_buf = abuf;
+ drc->drc_rrd->abd = abd;
return (err);
}
case DRR_OBJECT_RANGE:
@@ -2687,9 +2691,9 @@ receive_process_record(struct receive_writer_arg *rwa,
if (rrd->header.drr_type != DRR_WRITE) {
err = flush_write_batch(rwa);
if (err != 0) {
- if (rrd->arc_buf != NULL) {
- dmu_return_arcbuf(rrd->arc_buf);
- rrd->arc_buf = NULL;
+ if (rrd->abd != NULL) {
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
rrd->payload = NULL;
} else if (rrd->payload != NULL) {
kmem_free(rrd->payload, rrd->payload_size);
@@ -2726,8 +2730,8 @@ receive_process_record(struct receive_writer_arg *rwa,
* the rrd or arc_buf.
*/
ASSERT(err != 0);
- dmu_return_arcbuf(rrd->arc_buf);
- rrd->arc_buf = NULL;
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
}
break;
}
@@ -2749,10 +2753,10 @@ receive_process_record(struct receive_writer_arg *rwa,
case DRR_SPILL:
{
struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
- err = receive_spill(rwa, drrs, rrd->arc_buf);
+ err = receive_spill(rwa, drrs, rrd->abd);
if (err != 0)
- dmu_return_arcbuf(rrd->arc_buf);
- rrd->arc_buf = NULL;
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
rrd->payload = NULL;
break;
}
@@ -2800,9 +2804,9 @@ receive_writer_thread(void *arg)
int err = 0;
if (rwa->err == 0) {
err = receive_process_record(rwa, rrd);
- } else if (rrd->arc_buf != NULL) {
- dmu_return_arcbuf(rrd->arc_buf);
- rrd->arc_buf = NULL;
+ } else if (rrd->abd != NULL) {
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
rrd->payload = NULL;
} else if (rrd->payload != NULL) {
kmem_free(rrd->payload, rrd->payload_size);
diff --git a/sys/contrib/openzfs/module/zfs/dmu_redact.c b/sys/contrib/openzfs/module/zfs/dmu_redact.c
index 225ec40537ec..62c7d01d4bd2 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_redact.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_redact.c
@@ -858,7 +858,7 @@ hold_next_object(objset_t *os, struct redact_record *rec, void *tag,
{
int err = 0;
if (*dn != NULL)
- dnode_rele(*dn, FTAG);
+ dnode_rele(*dn, tag);
*dn = NULL;
if (*object < rec->start_object) {
*object = rec->start_object - 1;
diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c
index 9480c8b75497..d654382237c0 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_send.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_send.c
@@ -2626,7 +2626,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
{
int err;
dsl_dataset_t *fromds;
- ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
+ ds_hold_flags_t dsflags;
struct dmu_send_params dspp = {0};
dspp.embedok = embedok;
dspp.large_block_ok = large_block_ok;
@@ -2638,6 +2638,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
dspp.rawok = rawok;
dspp.savedok = savedok;
+ dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
err = dsl_pool_hold(pool, FTAG, &dspp.dp);
if (err != 0)
return (err);
@@ -2711,12 +2712,13 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
dmu_send_outparams_t *dsop)
{
int err = 0;
- ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
+ ds_hold_flags_t dsflags;
boolean_t owned = B_FALSE;
dsl_dataset_t *fromds = NULL;
zfs_bookmark_phys_t book = {0};
struct dmu_send_params dspp = {0};
+ dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
dspp.tosnap = tosnap;
dspp.embedok = embedok;
dspp.large_block_ok = large_block_ok;
diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c
index 09ef2be94944..0ebed4e6fbdf 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_tx.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c
@@ -230,9 +230,6 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
(void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);
- if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
- err = SET_ERROR(EFBIG);
-
if (dn == NULL)
return;
diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
index 4d86863f30ea..5d061fe3813e 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
@@ -59,16 +59,29 @@ typedef struct zfetch_stats {
kstat_named_t zfetchstat_hits;
kstat_named_t zfetchstat_misses;
kstat_named_t zfetchstat_max_streams;
+ kstat_named_t zfetchstat_max_completion_us;
+ kstat_named_t zfetchstat_last_completion_us;
+ kstat_named_t zfetchstat_io_issued;
} zfetch_stats_t;
static zfetch_stats_t zfetch_stats = {
{ "hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
{ "max_streams", KSTAT_DATA_UINT64 },
+ { "max_completion_us", KSTAT_DATA_UINT64 },
+ { "last_completion_us", KSTAT_DATA_UINT64 },
+ { "io_issued", KSTAT_DATA_UINT64 },
};
#define ZFETCHSTAT_BUMP(stat) \
- atomic_inc_64(&zfetch_stats.stat.value.ui64);
+ atomic_inc_64(&zfetch_stats.stat.value.ui64)
+#define ZFETCHSTAT_ADD(stat, val) \
+ atomic_add_64(&zfetch_stats.stat.value.ui64, val)
+#define ZFETCHSTAT_SET(stat, val) \
+ zfetch_stats.stat.value.ui64 = val
+#define ZFETCHSTAT_GET(stat) \
+ zfetch_stats.stat.value.ui64
+
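
A brief usage sketch of the new counters (hypothetical locals; the flow mirrors dmu_zfetch() and dmu_zfetch_stream_done() further down in this diff):

hrtime_t start = gethrtime();
int issued = 0;
/* ... issue prefetch I/Os, incrementing issued for each one sent ... */
hrtime_t delta = NSEC2USEC(gethrtime() - start);

if (issued)
	ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta);
if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us))
	ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta);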
kstat_t *zfetch_ksp;
@@ -104,8 +117,8 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
if (zf == NULL)
return;
-
zf->zf_dnode = dno;
+ zf->zf_numstreams = 0;
list_create(&zf->zf_stream, sizeof (zstream_t),
offsetof(zstream_t, zs_node));
@@ -114,12 +127,28 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
}
static void
+dmu_zfetch_stream_fini(zstream_t *zs)
+{
+ mutex_destroy(&zs->zs_lock);
+ kmem_free(zs, sizeof (*zs));
+}
+
+static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
ASSERT(MUTEX_HELD(&zf->zf_lock));
list_remove(&zf->zf_stream, zs);
- mutex_destroy(&zs->zs_lock);
- kmem_free(zs, sizeof (*zs));
+ dmu_zfetch_stream_fini(zs);
+ zf->zf_numstreams--;
+}
+
+static void
+dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs)
+{
+ ASSERT(MUTEX_HELD(&zf->zf_lock));
+ list_remove(&zf->zf_stream, zs);
+ zs->zs_fetch = NULL;
+ zf->zf_numstreams--;
}
/*
@@ -132,8 +161,12 @@ dmu_zfetch_fini(zfetch_t *zf)
zstream_t *zs;
mutex_enter(&zf->zf_lock);
- while ((zs = list_head(&zf->zf_stream)) != NULL)
- dmu_zfetch_stream_remove(zf, zs);
+ while ((zs = list_head(&zf->zf_stream)) != NULL) {
+ if (zfs_refcount_count(&zs->zs_blocks) != 0)
+ dmu_zfetch_stream_orphan(zf, zs);
+ else
+ dmu_zfetch_stream_remove(zf, zs);
+ }
mutex_exit(&zf->zf_lock);
list_destroy(&zf->zf_stream);
mutex_destroy(&zf->zf_lock);
@@ -151,7 +184,7 @@ static void
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
zstream_t *zs_next;
- int numstreams = 0;
+ hrtime_t now = gethrtime();
ASSERT(MUTEX_HELD(&zf->zf_lock));
@@ -161,11 +194,14 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
for (zstream_t *zs = list_head(&zf->zf_stream);
zs != NULL; zs = zs_next) {
zs_next = list_next(&zf->zf_stream, zs);
- if (((gethrtime() - zs->zs_atime) / NANOSEC) >
+ /*
+ * Skip gethrtime() call if there are still references
+ */
+ if (zfs_refcount_count(&zs->zs_blocks) != 0)
+ continue;
+ if (((now - zs->zs_atime) / NANOSEC) >
zfetch_min_sec_reap)
dmu_zfetch_stream_remove(zf, zs);
- else
- numstreams++;
}
/*
@@ -179,7 +215,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
zfetch_max_distance));
- if (numstreams >= max_streams) {
+ if (zf->zf_numstreams >= max_streams) {
ZFETCHSTAT_BUMP(zfetchstat_max_streams);
return;
}
@@ -188,12 +224,39 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
zs->zs_blkid = blkid;
zs->zs_pf_blkid = blkid;
zs->zs_ipf_blkid = blkid;
- zs->zs_atime = gethrtime();
+ zs->zs_atime = now;
+ zs->zs_fetch = zf;
+ zfs_refcount_create(&zs->zs_blocks);
mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
-
+ zf->zf_numstreams++;
list_insert_head(&zf->zf_stream, zs);
}
+static void
+dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
+{
+ zstream_t *zs = arg;
+
+ if (zs->zs_start_time && io_issued) {
+ hrtime_t now = gethrtime();
+ hrtime_t delta = NSEC2USEC(now - zs->zs_start_time);
+
+ zs->zs_start_time = 0;
+ ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta);
+ if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us))
+ ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta);
+ }
+
+ if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0)
+ return;
+
+ /*
+ * The parent fetch structure has gone away
+ */
+ if (zs->zs_fetch == NULL)
+ dmu_zfetch_stream_fini(zs);
+}
+
/*
* This is the predictive prefetch entry point. It associates dnode access
* specified with blkid and nblks arguments with prefetch stream, predicts
@@ -209,7 +272,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
zstream_t *zs;
int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
int64_t pf_ahead_blks, max_blks;
- int epbs, max_dist_blks, pf_nblks, ipf_nblks;
+ int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued;
uint64_t end_of_access_blkid;
end_of_access_blkid = blkid + nblks;
spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
@@ -230,11 +293,21 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
* As a fast path for small (single-block) files, ignore access
* to the first block.
*/
- if (blkid == 0)
+ if (!have_lock && blkid == 0)
return;
if (!have_lock)
rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
+
+ /*
+ * A fast path for small files for which no prefetch will
+ * happen.
+ */
+ if (zf->zf_dnode->dn_maxblkid < 2) {
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+ return;
+ }
mutex_enter(&zf->zf_lock);
/*
@@ -343,9 +416,15 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
zs->zs_atime = gethrtime();
+ /* no prior reads in progress */
+ if (zfs_refcount_count(&zs->zs_blocks) == 0)
+ zs->zs_start_time = zs->zs_atime;
zs->zs_blkid = end_of_access_blkid;
+ zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart,
+ NULL);
mutex_exit(&zs->zs_lock);
mutex_exit(&zf->zf_lock);
+ issued = 0;
/*
* dbuf_prefetch() is asynchronous (even when it needs to read
@@ -354,16 +433,21 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
*/
for (int i = 0; i < pf_nblks; i++) {
- dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
- ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+ issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i,
+ ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
+ dmu_zfetch_stream_done, zs);
}
for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
- dbuf_prefetch(zf->zf_dnode, 1, iblk,
- ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+ issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
+ ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
+ dmu_zfetch_stream_done, zs);
}
if (!have_lock)
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
ZFETCHSTAT_BUMP(zfetchstat_hits);
+
+ if (issued)
+ ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
}
/* BEGIN CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c
index 23364dbae897..eaba9c0c0e7f 100644
--- a/sys/contrib/openzfs/module/zfs/dnode.c
+++ b/sys/contrib/openzfs/module/zfs/dnode.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@@ -609,7 +609,6 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ASSERT0(dn->dn_maxblkid);
ASSERT0(dn->dn_allocated_txg);
ASSERT0(dn->dn_assigned_txg);
- ASSERT0(dn->dn_dirty_txg);
ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
ASSERT(avl_is_empty(&dn->dn_dbufs));
@@ -649,6 +648,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dn->dn_free_txg = 0;
dn->dn_dirtyctx_firstset = NULL;
+ dn->dn_dirty_txg = 0;
dn->dn_allocated_txg = tx->tx_txg;
dn->dn_id_flags = 0;
@@ -1812,6 +1812,7 @@ dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+ ASSERT3U(new_nlevels, >, dn->dn_nlevels);
dn->dn_nlevels = new_nlevels;
ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
@@ -1829,10 +1830,12 @@ dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
list = &dn->dn_dirty_records[txgoff];
for (dr = list_head(list); dr; dr = dr_next) {
dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
- if (dr->dr_dbuf->db_level != new_nlevels-1 &&
+
+ IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1);
+ if (dr->dr_dbuf == NULL ||
+ (dr->dr_dbuf->db_level == old_nlevels - 1 &&
dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
- dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
- ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) {
list_remove(&dn->dn_dirty_records[txgoff], dr);
list_insert_tail(&new->dt.di.dr_children, dr);
dr->dr_parent = new;
diff --git a/sys/contrib/openzfs/module/zfs/dnode_sync.c b/sys/contrib/openzfs/module/zfs/dnode_sync.c
index ae44cb69765c..66e48a1e17d4 100644
--- a/sys/contrib/openzfs/module/zfs/dnode_sync.c
+++ b/sys/contrib/openzfs/module/zfs/dnode_sync.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2020 Oxide Computer Company
*/
@@ -851,6 +851,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
/*
* Although we have dropped our reference to the dnode, it
* can't be evicted until it's written, and we haven't yet
- * initiated the IO for the dnode's dbuf.
+ * initiated the IO for the dnode's dbuf. Additionally, the caller
+ * has already added a reference to the dnode because it's on the
+ * os_synced_dnodes list.
*/
}
diff --git a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
index 16bf2c4414a8..2faf1af52991 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
@@ -1561,33 +1561,6 @@ dsl_bookmark_latest_txg(dsl_dataset_t *ds)
return (dbn->dbn_phys.zbm_creation_txg);
}
-static inline unsigned int
-redact_block_buf_num_entries(unsigned int size)
-{
- return (size / sizeof (redact_block_phys_t));
-}
-
-/*
- * This function calculates the offset of the last entry in the array of
- * redact_block_phys_t. If we're reading the redaction list into buffers of
- * size bufsize, then for all but the last buffer, the last valid entry in the
- * array will be the last entry in the array. However, for the last buffer, any
- * amount of it may be filled. Thus, we check to see if we're looking at the
- * last buffer in the redaction list, and if so, we return the total number of
- * entries modulo the number of entries per buffer. Otherwise, we return the
- * number of entries per buffer minus one.
- */
-static inline unsigned int
-last_entry(redaction_list_t *rl, unsigned int bufsize, uint64_t bufid)
-{
- if (bufid == (rl->rl_phys->rlp_num_entries - 1) /
- redact_block_buf_num_entries(bufsize)) {
- return ((rl->rl_phys->rlp_num_entries - 1) %
- redact_block_buf_num_entries(bufsize));
- }
- return (redact_block_buf_num_entries(bufsize) - 1);
-}
-
/*
* Compare the redact_block_phys_t to the bookmark. If the last block in the
* redact_block_phys_t is before the bookmark, return -1. If the first block in
@@ -1633,8 +1606,6 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume,
rl_traverse_callback_t cb, void *arg)
{
objset_t *mos = rl->rl_mos;
- redact_block_phys_t *buf;
- unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE;
int err = 0;
if (rl->rl_phys->rlp_last_object != UINT64_MAX ||
@@ -1651,42 +1622,48 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume,
}
/*
- * Binary search for the point to resume from. The goal is to minimize
- * the number of disk reads we have to perform.
+ * This allows us to skip the binary search and resume checking logic
+ * below, if we're not resuming a redacted send.
+ */
+ if (ZB_IS_ZERO(resume))
+ resume = NULL;
+
+ /*
+ * Binary search for the point to resume from.
*/
- buf = zio_data_buf_alloc(bufsize);
- uint64_t maxbufid = (rl->rl_phys->rlp_num_entries - 1) /
- redact_block_buf_num_entries(bufsize);
- uint64_t minbufid = 0;
- while (resume != NULL && maxbufid - minbufid >= 1) {
- ASSERT3U(maxbufid, >, minbufid);
- uint64_t midbufid = minbufid + ((maxbufid - minbufid) / 2);
- err = dmu_read(mos, rl->rl_object, midbufid * bufsize, bufsize,
- buf, DMU_READ_NO_PREFETCH);
+ uint64_t maxidx = rl->rl_phys->rlp_num_entries - 1;
+ uint64_t minidx = 0;
+ while (resume != NULL && maxidx > minidx) {
+ redact_block_phys_t rbp = { 0 };
+ ASSERT3U(maxidx, >, minidx);
+ uint64_t mididx = minidx + ((maxidx - minidx) / 2);
+ err = dmu_read(mos, rl->rl_object, mididx * sizeof (rbp),
+ sizeof (rbp), &rbp, DMU_READ_NO_PREFETCH);
if (err != 0)
break;
- int cmp0 = redact_block_zb_compare(&buf[0], resume);
- int cmpn = redact_block_zb_compare(
- &buf[last_entry(rl, bufsize, maxbufid)], resume);
+ int cmp = redact_block_zb_compare(&rbp, resume);
- /*
- * If the first block is before or equal to the resume point,
- * and the last one is equal or after, then the resume point is
- * in this buf, and we should start here.
- */
- if (cmp0 <= 0 && cmpn >= 0)
+ if (cmp == 0) {
+ minidx = mididx;
break;
-
- if (cmp0 > 0)
- maxbufid = midbufid - 1;
- else if (cmpn < 0)
- minbufid = midbufid + 1;
- else
- panic("No progress in binary search for resume point");
+ } else if (cmp > 0) {
+ maxidx =
+ (mididx == minidx ? minidx : mididx - 1);
+ } else {
+ minidx = mididx + 1;
+ }
}
- for (uint64_t curidx = minbufid * redact_block_buf_num_entries(bufsize);
+ unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE;
+ redact_block_phys_t *buf = zio_data_buf_alloc(bufsize);
+
+ unsigned int entries_per_buf = bufsize / sizeof (redact_block_phys_t);
+ uint64_t start_block = minidx / entries_per_buf;
+ err = dmu_read(mos, rl->rl_object, start_block * bufsize, bufsize, buf,
+ DMU_READ_PREFETCH);
+
+ for (uint64_t curidx = minidx;
err == 0 && curidx < rl->rl_phys->rlp_num_entries;
curidx++) {
/*
@@ -1696,22 +1673,35 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume,
* prefetching, and this code shouldn't be the bottleneck, so we
* don't need to do manual prefetching.
*/
- if (curidx % redact_block_buf_num_entries(bufsize) == 0) {
+ if (curidx % entries_per_buf == 0) {
err = dmu_read(mos, rl->rl_object, curidx *
sizeof (*buf), bufsize, buf,
DMU_READ_PREFETCH);
if (err != 0)
break;
}
- redact_block_phys_t *rb = &buf[curidx %
- redact_block_buf_num_entries(bufsize)];
+ redact_block_phys_t *rb = &buf[curidx % entries_per_buf];
/*
* If resume is non-null, we should either not send the data, or
* null out resume so we don't have to keep doing these
* comparisons.
*/
if (resume != NULL) {
+ /*
+ * It is possible that after the binary search we got
+ * a record before the resume point. There are two cases
+ * where this can occur. If the record is the last
+ * redaction record, and the resume point is after the
+ * end of the redacted data, curidx will be the last
+ * redaction record. In that case, the loop will end
+ * after this iteration. The second case is if the
+ * resume point is between two redaction records, the
+ * binary search can return either the record before
+ * or after the resume point. In that case, the next
+ * iteration will be greater than the resume point.
+ */
if (redact_block_zb_compare(rb, resume) < 0) {
+ ASSERT3U(curidx, ==, minidx);
continue;
} else {
/*
@@ -1733,8 +1723,10 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume,
}
}
- if (cb(rb, arg) != 0)
+ if (cb(rb, arg) != 0) {
+ err = EINTR;
break;
+ }
}
zio_data_buf_free(buf, bufsize);
diff --git a/sys/contrib/openzfs/module/zfs/dsl_crypt.c b/sys/contrib/openzfs/module/zfs/dsl_crypt.c
index 26d4c2fe7e33..e38ec0cae827 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_crypt.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_crypt.c
@@ -2007,14 +2007,6 @@ dsl_crypto_recv_raw_objset_check(dsl_dataset_t *ds, dsl_dataset_t *fromds,
if (ret != 0)
return (ret);
- /*
- * Useraccounting is not portable and must be done with the keys loaded.
- * Therefore, whenever we do any kind of receive the useraccounting
- * must not be present.
- */
- ASSERT0(os->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE);
- ASSERT0(os->os_flags & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE);
-
mdn = DMU_META_DNODE(os);
/*
@@ -2105,6 +2097,9 @@ dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype,
*/
arc_release(os->os_phys_buf, &os->os_phys_buf);
bcopy(portable_mac, os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN);
+ os->os_phys->os_flags &= ~OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ os->os_phys->os_flags &= ~OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
+ os->os_flags = os->os_phys->os_flags;
bzero(os->os_phys->os_local_mac, ZIO_OBJSET_MAC_LEN);
os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
index 1fcd83db7988..de60c33589e3 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
@@ -2267,10 +2267,8 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
dsl_bookmark_sync_done(ds, tx);
- if (os->os_synced_dnodes != NULL) {
- multilist_destroy(os->os_synced_dnodes);
- os->os_synced_dnodes = NULL;
- }
+ multilist_destroy(os->os_synced_dnodes);
+ os->os_synced_dnodes = NULL;
if (os->os_encrypted)
os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_FALSE;
diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c
index 3a2028625e8b..c770eafa75d8 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_pool.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
@@ -220,11 +220,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
- dp->dp_zrele_taskq = taskq_create("z_zrele", boot_ncpus, defclsyspri,
- boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
+ boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+ TASKQ_THREADS_CPU_PCT);
dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
- boot_ncpus, defclsyspri, boot_ncpus, INT_MAX,
- TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ 100, defclsyspri, boot_ncpus, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
return (dp);
}
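
For context, a hedged sketch of what the TASKQ_THREADS_CPU_PCT change above means: the thread-count argument is now read as a percentage of available CPUs rather than an absolute count, so "100" asks for roughly one thread per CPU. The "z_example" taskq below is a hypothetical stand-in.

taskq_t *tq = taskq_create("z_example", 100 /* percent of CPUs */,
    defclsyspri, boot_ncpus, INT_MAX,
    TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
/* ... dispatch work ... */
taskq_destroy(tq);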
@@ -565,6 +566,11 @@ dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
dmu_objset_sync(dp->dp_meta_objset, zio, tx);
VERIFY0(zio_wait(zio));
+ dmu_objset_sync_done(dp->dp_meta_objset, tx);
+ taskq_wait(dp->dp_sync_taskq);
+ multilist_destroy(dp->dp_meta_objset->os_synced_dnodes);
+ dp->dp_meta_objset->os_synced_dnodes = NULL;
+
dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
}
@@ -676,7 +682,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
*/
for (ds = list_head(&synced_datasets); ds != NULL;
ds = list_next(&synced_datasets, ds)) {
- dmu_objset_do_userquota_updates(ds->ds_objset, tx);
+ dmu_objset_sync_done(ds->ds_objset, tx);
}
taskq_wait(dp->dp_sync_taskq);
@@ -1264,8 +1270,16 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
* (e.g. it could be destroyed). Therefore you shouldn't do anything to the
* dataset except release it.
*
- * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
- * or modifying operations.
+ * Operations generally fall somewhere into the following taxonomy:
+ *
+ * Read-Only Modifying
+ *
+ * Dataset Layer / MOS zfs get zfs destroy
+ *
+ * Individual Dataset read() write()
+ *
+ *
+ * Dataset Layer Operations
*
* Modifying operations should generally use dsl_sync_task(). The synctask
* infrastructure enforces proper locking strategy with respect to the
@@ -1275,6 +1289,25 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
* information from the dataset, then release the pool and dataset.
* dmu_objset_{hold,rele}() are convenience routines that also do the pool
* hold/rele.
+ *
+ *
+ * Operations On Individual Datasets
+ *
+ * Objects _within_ an objset should only be modified by the current 'owner'
+ * of the objset to prevent incorrect concurrent modification. Thus, use
+ * {dmu_objset,dsl_dataset}_own to mark some entity as the current owner,
+ * and fail with EBUSY if there is already an owner. The owner can then
+ * implement its own locking strategy, independent of the dataset layer's
+ * locking infrastructure.
+ * (E.g., the ZPL has its own set of locks to control concurrency. A regular
+ * vnop will not reach into the dataset layer).
+ *
+ * Ideally, objects would also only be read by the objset's owner, so that we
+ * don't observe state mid-modification.
+ * (E.g. the ZPL is creating a new object and linking it into a directory; if
+ * you don't coordinate with the ZPL to hold ZPL-level locks, you could see an
+ * intermediate state. The ioctl level violates this but in pretty benign
+ * ways, e.g. reading the zpl props object.)
*/
int
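As a rough sketch of the read-side convention described in the comment above, a read-only consumer holds the dataset (which also holds the pool), extracts what it needs, and releases it without taking ownership. This is illustrative only; zfs_example_get_objset_id is a hypothetical caller and error handling is minimal:

    /* Hypothetical read-only consumer using the hold/rele convenience routines. */
    static int
    zfs_example_get_objset_id(const char *dsname, uint64_t *idp)
    {
            objset_t *os;
            int error;

            /* Takes both the pool hold and the dataset hold. */
            error = dmu_objset_hold(dsname, FTAG, &os);
            if (error != 0)
                    return (error);

            /* Read the information we came for while the hold is kept. */
            *idp = dmu_objset_id(os);

            /* Drops the dataset and pool holds. */
            dmu_objset_rele(os, FTAG);
            return (0);
    }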
diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c
index 4704781bfa45..40adfbcee4e1 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_scan.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c
@@ -713,7 +713,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
return (0);
}
-static void
+void
dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
{
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
@@ -3328,19 +3328,12 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
}
/*
- * Check if the txg falls within the range which must be
- * resilvered. DVAs outside this range can always be skipped.
- */
- if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
- return (B_FALSE);
-
- /*
* Check if the top-level vdev must resilver this offset.
* When the offset does not intersect with a dirty leaf DTL
* then it may be possible to skip the resilver IO. The psize
* is provided instead of asize to simplify the check for RAIDZ.
*/
- if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
+ if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth))
return (B_FALSE);
/*
@@ -3987,7 +3980,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
/*
* Keep track of how much data we've examined so that
- * zpool(1M) status can make useful progress reports.
+ * zpool(8) status can make useful progress reports.
*/
scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c
index 133005b227e5..bed6bf64c928 100644
--- a/sys/contrib/openzfs/module/zfs/metaslab.c
+++ b/sys/contrib/openzfs/module/zfs/metaslab.c
@@ -32,6 +32,7 @@
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
@@ -263,9 +264,7 @@ int zfs_metaslab_switch_threshold = 2;
* Internal switch to enable/disable the metaslab allocation tracing
* facility.
*/
-#ifdef _METASLAB_TRACING
-boolean_t metaslab_trace_enabled = B_TRUE;
-#endif
+boolean_t metaslab_trace_enabled = B_FALSE;
/*
* Maximum entries that the metaslab allocation tracing facility will keep
@@ -275,9 +274,7 @@ boolean_t metaslab_trace_enabled = B_TRUE;
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached, allowing for further investigation.
*/
-#ifdef _METASLAB_TRACING
uint64_t metaslab_trace_max_entries = 5000;
-#endif
/*
* Maximum number of metaslabs per group that can be disabled
@@ -313,6 +310,35 @@ boolean_t zfs_metaslab_force_large_segs = B_FALSE;
*/
uint32_t metaslab_by_size_min_shift = 14;
+/*
+ * If not set, we will first try normal allocation. If that fails then
+ * we will do a gang allocation. If that fails then we will do a "try hard"
+ * gang allocation. If that fails then we will have a multi-layer gang
+ * block.
+ *
+ * If set, we will first try normal allocation. If that fails then
+ * we will do a "try hard" allocation. If that fails we will do a gang
+ * allocation. If that fails we will do a "try hard" gang allocation. If
+ * that fails then we will have a multi-layer gang block.
+ */
+int zfs_metaslab_try_hard_before_gang = B_FALSE;
+
+/*
+ * When not trying hard, we only consider the best zfs_metaslab_find_max_tries
+ * metaslabs. This improves performance, especially when there are many
+ * metaslabs per vdev and the allocation can't actually be satisfied (so we
+ * would otherwise iterate all the metaslabs). If there is a metaslab with a
+ * worse weight but it can actually satisfy the allocation, we won't find it
+ * until trying hard. This may happen if the worse metaslab is not loaded
+ * (and the true weight is better than we have calculated), or due to weight
+ * bucketization. E.g. we are looking for a 60K segment, and the best
+ * metaslabs all have free segments in the 32-63K bucket, but the best
+ * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a
+ * subsequent metaslab has ms_max_size >60KB (but fewer segments in this
+ * bucket, and therefore a lower weight).
+ */
+int zfs_metaslab_find_max_tries = 100;
+
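The two orderings described above can be summarized as a decision ladder. The sketch below is only a pseudocode restatement of the comment, not the allocator's real control flow; try_alloc(), try_gang_alloc(), and allocate_multilayer_gang_block() are hypothetical stand-ins (their boolean argument is the try-hard flag):

    /* Pseudocode summary of the fallback order described above. */
    static int
    example_alloc_with_fallback(void)
    {
            if (try_alloc(B_FALSE) == 0)            /* normal allocation */
                    return (0);
            if (zfs_metaslab_try_hard_before_gang &&
                try_alloc(B_TRUE) == 0)             /* "try hard" allocation */
                    return (0);
            if (try_gang_alloc(B_FALSE) == 0)       /* gang allocation */
                    return (0);
            if (try_gang_alloc(B_TRUE) == 0)        /* "try hard" gang allocation */
                    return (0);
            return (allocate_multilayer_gang_block()); /* last resort */
    }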
static uint64_t metaslab_weight(metaslab_t *, boolean_t);
static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
@@ -324,19 +350,20 @@ static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
static unsigned int metaslab_idx_func(multilist_t *, void *);
static void metaslab_evict(metaslab_t *, uint64_t);
static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
-#ifdef _METASLAB_TRACING
kmem_cache_t *metaslab_alloc_trace_cache;
typedef struct metaslab_stats {
kstat_named_t metaslabstat_trace_over_limit;
- kstat_named_t metaslabstat_df_find_under_floor;
kstat_named_t metaslabstat_reload_tree;
+ kstat_named_t metaslabstat_too_many_tries;
+ kstat_named_t metaslabstat_try_hard;
} metaslab_stats_t;
static metaslab_stats_t metaslab_stats = {
{ "trace_over_limit", KSTAT_DATA_UINT64 },
- { "df_find_under_floor", KSTAT_DATA_UINT64 },
{ "reload_tree", KSTAT_DATA_UINT64 },
+ { "too_many_tries", KSTAT_DATA_UINT64 },
+ { "try_hard", KSTAT_DATA_UINT64 },
};
#define METASLABSTAT_BUMP(stat) \
@@ -372,18 +399,6 @@ metaslab_stat_fini(void)
kmem_cache_destroy(metaslab_alloc_trace_cache);
metaslab_alloc_trace_cache = NULL;
}
-#else
-
-void
-metaslab_stat_init(void)
-{
-}
-
-void
-metaslab_stat_fini(void)
-{
-}
-#endif
/*
* ==========================================================================
@@ -395,20 +410,19 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
metaslab_class_t *mc;
- mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+ mc = kmem_zalloc(offsetof(metaslab_class_t,
+ mc_allocator[spa->spa_alloc_count]), KM_SLEEP);
mc->mc_spa = spa;
- mc->mc_rotor = NULL;
mc->mc_ops = ops;
mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
- mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
- sizeof (zfs_refcount_t), KM_SLEEP);
- mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
- sizeof (uint64_t), KM_SLEEP);
- for (int i = 0; i < spa->spa_alloc_count; i++)
- zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
+ mca->mca_rotor = NULL;
+ zfs_refcount_create_tracked(&mca->mca_alloc_slots);
+ }
return (mc);
}
@@ -416,21 +430,22 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
void
metaslab_class_destroy(metaslab_class_t *mc)
{
- ASSERT(mc->mc_rotor == NULL);
+ spa_t *spa = mc->mc_spa;
+
ASSERT(mc->mc_alloc == 0);
ASSERT(mc->mc_deferred == 0);
ASSERT(mc->mc_space == 0);
ASSERT(mc->mc_dspace == 0);
- for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
- zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
- kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
- sizeof (zfs_refcount_t));
- kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
- sizeof (uint64_t));
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
+ ASSERT(mca->mca_rotor == NULL);
+ zfs_refcount_destroy(&mca->mca_alloc_slots);
+ }
mutex_destroy(&mc->mc_lock);
multilist_destroy(mc->mc_metaslab_txg_list);
- kmem_free(mc, sizeof (metaslab_class_t));
+ kmem_free(mc, offsetof(metaslab_class_t,
+ mc_allocator[spa->spa_alloc_count]));
}
int
@@ -445,7 +460,7 @@ metaslab_class_validate(metaslab_class_t *mc)
ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
- if ((mg = mc->mc_rotor) == NULL)
+ if ((mg = mc->mc_allocator[0].mca_rotor) == NULL)
return (0);
do {
@@ -454,7 +469,7 @@ metaslab_class_validate(metaslab_class_t *mc)
ASSERT3P(vd->vdev_top, ==, vd);
ASSERT3P(mg->mg_class, ==, mc);
ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
- } while ((mg = mg->mg_next) != mc->mc_rotor);
+ } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor);
return (0);
}
@@ -811,7 +826,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{
metaslab_group_t *mg;
- mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
+ mg = kmem_zalloc(offsetof(metaslab_group_t,
+ mg_allocator[allocators]), KM_SLEEP);
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
@@ -824,8 +840,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
mg->mg_no_free_space = B_TRUE;
mg->mg_allocators = allocators;
- mg->mg_allocator = kmem_zalloc(allocators *
- sizeof (metaslab_group_allocator_t), KM_SLEEP);
for (int i = 0; i < allocators; i++) {
metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
@@ -859,21 +873,19 @@ metaslab_group_destroy(metaslab_group_t *mg)
metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
}
- kmem_free(mg->mg_allocator, mg->mg_allocators *
- sizeof (metaslab_group_allocator_t));
-
- kmem_free(mg, sizeof (metaslab_group_t));
+ kmem_free(mg, offsetof(metaslab_group_t,
+ mg_allocator[mg->mg_allocators]));
}
void
metaslab_group_activate(metaslab_group_t *mg)
{
metaslab_class_t *mc = mg->mg_class;
+ spa_t *spa = mc->mc_spa;
metaslab_group_t *mgprev, *mgnext;
- ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
+ ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
- ASSERT(mc->mc_rotor != mg);
ASSERT(mg->mg_prev == NULL);
ASSERT(mg->mg_next == NULL);
ASSERT(mg->mg_activation_count <= 0);
@@ -884,7 +896,7 @@ metaslab_group_activate(metaslab_group_t *mg)
mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
metaslab_group_alloc_update(mg);
- if ((mgprev = mc->mc_rotor) == NULL) {
+ if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
mg->mg_prev = mg;
mg->mg_next = mg;
} else {
@@ -894,7 +906,10 @@ metaslab_group_activate(metaslab_group_t *mg)
mgprev->mg_next = mg;
mgnext->mg_prev = mg;
}
- mc->mc_rotor = mg;
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mc->mc_allocator[i].mca_rotor = mg;
+ mg = mg->mg_next;
+ }
}
/*
@@ -915,7 +930,8 @@ metaslab_group_passivate(metaslab_group_t *mg)
(SCL_ALLOC | SCL_ZIO));
if (--mg->mg_activation_count != 0) {
- ASSERT(mc->mc_rotor != mg);
+ for (int i = 0; i < spa->spa_alloc_count; i++)
+ ASSERT(mc->mc_allocator[i].mca_rotor != mg);
ASSERT(mg->mg_prev == NULL);
ASSERT(mg->mg_next == NULL);
ASSERT(mg->mg_activation_count < 0);
@@ -962,12 +978,15 @@ metaslab_group_passivate(metaslab_group_t *mg)
mgnext = mg->mg_next;
if (mg == mgnext) {
- mc->mc_rotor = NULL;
+ mgnext = NULL;
} else {
- mc->mc_rotor = mgnext;
mgprev->mg_next = mgnext;
mgnext->mg_prev = mgprev;
}
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ if (mc->mc_allocator[i].mca_rotor == mg)
+ mc->mc_allocator[i].mca_rotor = mgnext;
+ }
mg->mg_prev = NULL;
mg->mg_next = NULL;
@@ -1201,7 +1220,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* in metaslab_group_alloc_update() for more information) and
* the allocation throttle is disabled then allow allocations to this
* device. However, if the allocation throttle is enabled then
- * check if we have reached our allocation limit (mg_alloc_queue_depth)
+ * check if we have reached our allocation limit (mga_alloc_queue_depth)
* to determine if we should allow allocations to this metaslab group.
* If all metaslab groups are no longer considered allocatable
* (mc_alloc_groups == 0) or we're trying to allocate the smallest
@@ -1350,9 +1369,7 @@ static void
metaslab_size_tree_full_load(range_tree_t *rt)
{
metaslab_rt_arg_t *mrap = rt->rt_arg;
-#ifdef _METASLAB_TRACING
METASLABSTAT_BUMP(metaslabstat_reload_tree);
-#endif
ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
mrap->mra_floor_shift = 0;
struct mssa_arg arg = {0};
@@ -1563,6 +1580,7 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
defined(WITH_CF_BLOCK_ALLOCATOR)
+
/*
* This is a helper function that can be used by the allocator to find a
* suitable block to allocate. This will search the specified B-tree looking
@@ -1654,19 +1672,13 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size)
range_seg_t *rs;
if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
metaslab_size_tree_full_load(msp->ms_allocatable);
+
if (metaslab_df_use_largest_segment) {
/* use largest free segment */
rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
} else {
zfs_btree_index_t where;
/* use segment of this size, or next largest */
-#ifdef _METASLAB_TRACING
- metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg;
- if (size < (1 << mrap->mra_floor_shift)) {
- METASLABSTAT_BUMP(
- metaslabstat_df_find_under_floor);
- }
-#endif
rs = metaslab_block_find(&msp->ms_allocatable_by_size,
rt, msp->ms_start, size, &where);
}
@@ -2616,6 +2628,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
ms->ms_allocator = -1;
ms->ms_new = B_TRUE;
+ vdev_ops_t *ops = vd->vdev_ops;
+ if (ops->vdev_op_metaslab_init != NULL)
+ ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
+
/*
* We only open space map objects that already exist. All others
* will be opened when we finally allocate an object for it.
@@ -4393,7 +4409,6 @@ metaslab_is_unique(metaslab_t *msp, dva_t *dva)
* Metaslab allocation tracing facility
* ==========================================================================
*/
-#ifdef _METASLAB_TRACING
/*
* Add an allocation trace element to the allocation tracing list.
@@ -4468,21 +4483,6 @@ metaslab_trace_fini(zio_alloc_list_t *zal)
list_destroy(&zal->zal_list);
zal->zal_size = 0;
}
-#else
-
-#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc)
-
-void
-metaslab_trace_init(zio_alloc_list_t *zal)
-{
-}
-
-void
-metaslab_trace_fini(zio_alloc_list_t *zal)
-{
-}
-
-#endif /* _METASLAB_TRACING */
/*
* ==========================================================================
@@ -4510,13 +4510,14 @@ static void
metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
{
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ metaslab_class_allocator_t *mca =
+ &mg->mg_class->mc_allocator[allocator];
uint64_t max = mg->mg_max_alloc_queue_depth;
uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
while (cur < max) {
if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
cur, cur + 1) == cur) {
- atomic_inc_64(
- &mg->mg_class->mc_alloc_max_slots[allocator]);
+ atomic_inc_64(&mca->mca_alloc_max_slots);
return;
}
cur = mga->mga_cur_max_alloc_queue_depth;
@@ -4622,8 +4623,16 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
if (msp == NULL)
msp = avl_nearest(t, idx, AVL_AFTER);
+ int tries = 0;
for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
int i;
+
+ if (!try_hard && tries > zfs_metaslab_find_max_tries) {
+ METASLABSTAT_BUMP(metaslabstat_too_many_tries);
+ return (NULL);
+ }
+ tries++;
+
if (!metaslab_should_allocate(msp, asize, try_hard)) {
metaslab_trace_add(zal, mg, msp, asize, d,
TRACE_TOO_SMALL, allocator);
@@ -5052,6 +5061,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
zio_alloc_list_t *zal, int allocator)
{
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
metaslab_group_t *mg, *fast_mg, *rotor;
vdev_t *vd;
boolean_t try_hard = B_FALSE;
@@ -5073,7 +5083,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
/*
* Start at the rotor and loop through all mgs until we find something.
- * Note that there's no locking on mc_rotor or mc_aliquot because
+ * Note that there's no locking on mca_rotor or mca_aliquot because
* nothing actually breaks if we miss a few updates -- we just won't
* allocate quite as evenly. It all balances out over time.
*
@@ -5109,23 +5119,23 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
mg->mg_next != NULL)
mg = mg->mg_next;
} else {
- mg = mc->mc_rotor;
+ mg = mca->mca_rotor;
}
} else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
} else if (flags & METASLAB_FASTWRITE) {
- mg = fast_mg = mc->mc_rotor;
+ mg = fast_mg = mca->mca_rotor;
do {
if (fast_mg->mg_vd->vdev_pending_fastwrite <
mg->mg_vd->vdev_pending_fastwrite)
mg = fast_mg;
- } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor);
+ } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor);
} else {
- ASSERT(mc->mc_rotor != NULL);
- mg = mc->mc_rotor;
+ ASSERT(mca->mca_rotor != NULL);
+ mg = mca->mca_rotor;
}
/*
@@ -5133,7 +5143,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
* metaslab group that has been passivated, just follow the rotor.
*/
if (mg->mg_class != mc || mg->mg_activation_count <= 0)
- mg = mc->mc_rotor;
+ mg = mca->mca_rotor;
rotor = mg;
top:
@@ -5211,7 +5221,7 @@ top:
* Bias is also used to compensate for unequally
* sized vdevs so that space is allocated fairly.
*/
- if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
+ if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
vdev_stat_t *vs = &vd->vdev_stat;
int64_t vs_free = vs->vs_space - vs->vs_alloc;
int64_t mc_free = mc->mc_space - mc->mc_alloc;
@@ -5249,10 +5259,10 @@ top:
}
if ((flags & METASLAB_FASTWRITE) ||
- atomic_add_64_nv(&mc->mc_aliquot, asize) >=
+ atomic_add_64_nv(&mca->mca_aliquot, asize) >=
mg->mg_aliquot + mg->mg_bias) {
- mc->mc_rotor = mg->mg_next;
- mc->mc_aliquot = 0;
+ mca->mca_rotor = mg->mg_next;
+ mca->mca_aliquot = 0;
}
DVA_SET_VDEV(&dva[d], vd->vdev_id);
@@ -5269,14 +5279,17 @@ top:
return (0);
}
next:
- mc->mc_rotor = mg->mg_next;
- mc->mc_aliquot = 0;
+ mca->mca_rotor = mg->mg_next;
+ mca->mca_aliquot = 0;
} while ((mg = mg->mg_next) != rotor);
/*
- * If we haven't tried hard, do so now.
+ * If we haven't tried hard, perhaps do so now.
*/
- if (!try_hard) {
+ if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
+ GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
+ psize <= 1 << spa->spa_min_ashift)) {
+ METASLABSTAT_BUMP(metaslabstat_try_hard);
try_hard = B_TRUE;
goto top;
}
@@ -5588,15 +5601,15 @@ boolean_t
metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
zio_t *zio, int flags)
{
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
uint64_t available_slots = 0;
boolean_t slot_reserved = B_FALSE;
- uint64_t max = mc->mc_alloc_max_slots[allocator];
+ uint64_t max = mca->mca_alloc_max_slots;
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
- uint64_t reserved_slots =
- zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
+ uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots);
if (reserved_slots < max)
available_slots = max - reserved_slots;
@@ -5606,11 +5619,8 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
*/
- for (int d = 0; d < slots; d++) {
- reserved_slots =
- zfs_refcount_add(&mc->mc_alloc_slots[allocator],
- zio);
- }
+ for (int d = 0; d < slots; d++)
+ zfs_refcount_add(&mca->mca_alloc_slots, zio);
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
slot_reserved = B_TRUE;
}
@@ -5623,12 +5633,12 @@ void
metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
int allocator, zio_t *zio)
{
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
+
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
- for (int d = 0; d < slots; d++) {
- (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
- zio);
- }
+ for (int d = 0; d < slots; d++)
+ zfs_refcount_remove(&mca->mca_alloc_slots, zio);
mutex_exit(&mc->mc_lock);
}
@@ -5674,7 +5684,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
range_tree_remove(msp->ms_allocatable, offset, size);
range_tree_clear(msp->ms_trim, offset, size);
- if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
+ if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */
metaslab_class_t *mc = msp->ms_group->mg_class;
multilist_sublist_t *mls =
multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
@@ -5721,7 +5731,7 @@ metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
metaslab_claim_cb_arg_t arg;
/*
- * Only zdb(1M) can claim on indirect vdevs. This is used
+ * Only zdb(8) can claim on indirect vdevs. This is used
* to detect leaks of mapped space (that are not accounted
* for in the obsolete counts, spacemap, or bpobj).
*/
@@ -5782,7 +5792,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
- if (mc->mc_rotor == NULL) { /* no vdevs in this class */
+ if (mc->mc_allocator[allocator].mca_rotor == NULL) {
+ /* no vdevs in this class */
spa_config_exit(spa, SCL_ALLOC, FTAG);
return (SET_ERROR(ENOSPC));
}
@@ -5813,7 +5824,6 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
metaslab_group_alloc_increment(spa,
DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
}
-
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
@@ -6235,3 +6245,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG,
ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW,
"Percentage of memory that can be used to store metaslab range trees");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
+ ZMOD_RW, "Try hard to allocate before ganging");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, INT, ZMOD_RW,
+ "Normally only consider this many of the best metaslabs in each vdev");
diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c
index 99852521b6d1..d05c9db24c20 100644
--- a/sys/contrib/openzfs/module/zfs/mmp.c
+++ b/sys/contrib/openzfs/module/zfs/mmp.c
@@ -307,8 +307,17 @@ mmp_next_leaf(spa_t *spa)
if (leaf == NULL)
leaf = list_head(&spa->spa_leaf_list);
- if (!vdev_writeable(leaf)) {
+ /*
+ * We skip unwritable, offline, detached, and dRAID spare
+ * devices as they are either not legal targets or the write
+ * may fail or not be seen by other hosts. Skipped dRAID
+ * spares can never be written so the fail mask is not set.
+ */
+ if (!vdev_writeable(leaf) || leaf->vdev_offline ||
+ leaf->vdev_detached) {
fail_mask |= MMP_FAIL_NOT_WRITABLE;
+ } else if (leaf->vdev_ops == &vdev_draid_spare_ops) {
+ continue;
} else if (leaf->vdev_mmp_pending != 0) {
fail_mask |= MMP_FAIL_WRITE_PENDING;
} else {
diff --git a/sys/contrib/openzfs/module/zfs/multilist.c b/sys/contrib/openzfs/module/zfs/multilist.c
index a3adfd317af6..36c0d33bf1f6 100644
--- a/sys/contrib/openzfs/module/zfs/multilist.c
+++ b/sys/contrib/openzfs/module/zfs/multilist.c
@@ -96,9 +96,12 @@ multilist_create_impl(size_t size, size_t offset,
}
/*
- * Allocate a new multilist, using the default number of sublists
- * (the number of CPUs, or at least 4, or the tunable
- * zfs_multilist_num_sublists).
+ * Allocate a new multilist, using the default number of sublists (the number
+ * of CPUs, or at least 4, or the tunable zfs_multilist_num_sublists). Note
+ * that the multilists do not expand if more CPUs are hot-added. In that case,
+ * we will have less fanout than boot_ncpus, but we don't want to always
+ * reserve the RAM necessary to create the extra slots for additional CPUs up
+ * front, and dynamically adding them is a complex task.
*/
multilist_t *
multilist_create(size_t size, size_t offset,
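A hedged sketch of the default sublist count the comment above describes, i.e. the tunable if set, otherwise one sublist per CPU with a floor of four (the helper name is illustrative, not the function's real internals):

    /* Illustrative restatement of the default-fanout policy. */
    static unsigned int
    example_default_sublists(void)
    {
            if (zfs_multilist_num_sublists > 0)
                    return (zfs_multilist_num_sublists);
            return (MAX(boot_ncpus, 4));
    }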
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
index 9d1d4e0cca64..53ffbc31c186 100644
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -60,6 +60,7 @@
#include <sys/vdev_rebuild.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_disk.h>
+#include <sys/vdev_draid.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/mmp.h>
@@ -1280,15 +1281,15 @@ spa_activate(spa_t *spa, spa_mode_t mode)
* pool traverse code from monopolizing the global (and limited)
* system_taskq by inappropriately scheduling long running tasks on it.
*/
- spa->spa_prefetch_taskq = taskq_create("z_prefetch", boot_ncpus,
- defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC);
+ spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
+ defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
/*
* The taskq to upgrade datasets in this pool. Currently used by
* feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
*/
- spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus,
- defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC);
+ spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
+ defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
}
/*
@@ -2110,9 +2111,6 @@ spa_passivate_log(spa_t *spa)
ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
- if (!spa_has_slogs(spa))
- return (B_FALSE);
-
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
@@ -3681,7 +3679,14 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
/*
* Build a new vdev tree from the trusted config
*/
- VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+ error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
+ if (error != 0) {
+ nvlist_free(mos_config);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa_load_failed(spa, "spa_config_parse failed [error=%d]",
+ error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
/*
* Vdev paths in the MOS may be obsolete. If the untrusted config was
@@ -5631,7 +5636,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
uint64_t txg = TXG_INITIAL;
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
- uint64_t version, obj;
+ uint64_t version, obj, ndraid = 0;
boolean_t has_features;
boolean_t has_encryption;
boolean_t has_allocclass;
@@ -5753,8 +5758,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
if (error == 0 &&
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
- (error = spa_validate_aux(spa, nvroot, txg,
- VDEV_ALLOC_ADD)) == 0) {
+ (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
+ (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
/*
* instantiate the metaslab groups (this will dirty the vdevs)
* we can no longer error exit past this point
@@ -5895,6 +5900,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa_sync_props(props, tx);
}
+ for (int i = 0; i < ndraid; i++)
+ spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
+
dmu_tx_commit(tx);
spa->spa_sync_on = B_TRUE;
@@ -6404,12 +6412,25 @@ spa_reset(const char *pool)
*/
/*
+ * This is called as a synctask to increment the draid feature flag
+ */
+static void
+spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ int draid = (int)(uintptr_t)arg;
+
+ for (int c = 0; c < draid; c++)
+ spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
+}
+
+/*
* Add a device to a storage pool.
*/
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
- uint64_t txg;
+ uint64_t txg, ndraid = 0;
int error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *tvd;
@@ -6438,8 +6459,23 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
return (spa_vdev_exit(spa, vd, txg, EINVAL));
if (vd->vdev_children != 0 &&
- (error = vdev_create(vd, txg, B_FALSE)) != 0)
+ (error = vdev_create(vd, txg, B_FALSE)) != 0) {
+ return (spa_vdev_exit(spa, vd, txg, error));
+ }
+
+ /*
+ * The virtual dRAID spares must be added after the vdev tree is created
+ * and the vdev guids are generated. The guid of their associated
+ * dRAID is stored in the config and used when opening the spare.
+ */
+ if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
+ rvd->vdev_children)) == 0) {
+ if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
+ nspares = 0;
+ } else {
return (spa_vdev_exit(spa, vd, txg, error));
+ }
/*
* We must validate the spares and l2cache devices after checking the
@@ -6452,7 +6488,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* If we are in the middle of a device removal, we can only add
* devices which match the existing devices in the pool.
* If we are in the middle of a removal, or have some indirect
- * vdevs, we can not add raidz toplevels.
+ * vdevs, we can not add raidz or dRAID top levels.
*/
if (spa->spa_vdev_removal != NULL ||
spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
@@ -6462,10 +6498,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
tvd->vdev_ashift != spa->spa_max_ashift) {
return (spa_vdev_exit(spa, vd, txg, EINVAL));
}
- /* Fail if top level vdev is raidz */
- if (tvd->vdev_ops == &vdev_raidz_ops) {
+ /* Fail if top level vdev is raidz or a dRAID */
+ if (vdev_get_nparity(tvd) != 0)
return (spa_vdev_exit(spa, vd, txg, EINVAL));
- }
+
/*
* Need the top level mirror to be
* a mirror of leaf vdevs only
@@ -6506,6 +6542,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
}
/*
+ * We can't increment a feature while holding spa_vdev so we
+ * have to do it in a synctask.
+ */
+ if (ndraid != 0) {
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
+ (void *)(uintptr_t)ndraid, tx);
+ dmu_tx_commit(tx);
+ }
+
+ /*
* We have to be careful when adding new vdevs to an existing pool.
* If other threads start allocating from these vdevs before we
* sync the config cache, and we lose power, then upon reboot we may
@@ -6615,14 +6664,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ /*
+ * A dRAID spare can only replace a child of its parent dRAID vdev.
+ */
+ if (newvd->vdev_ops == &vdev_draid_spare_ops &&
+ oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ }
+
if (rebuild) {
/*
- * For rebuilds, the parent vdev must support reconstruction
+ * For rebuilds, the top vdev must support reconstruction
* using only space maps. This means the only allowable
- * parents are the root vdev or a mirror vdev.
+ * vdev types are the root vdev, a mirror, or dRAID.
*/
- if (pvd->vdev_ops != &vdev_mirror_ops &&
- pvd->vdev_ops != &vdev_root_ops) {
+ tvd = pvd;
+ if (pvd->vdev_top != NULL)
+ tvd = pvd->vdev_top;
+
+ if (tvd->vdev_ops != &vdev_mirror_ops &&
+ tvd->vdev_ops != &vdev_root_ops &&
+ tvd->vdev_ops != &vdev_draid_ops) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
}
}
@@ -6915,14 +6977,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
}
/*
- * If we are detaching the original disk from a spare, then it implies
- * that the spare should become a real disk, and be removed from the
- * active spare list for the pool.
+ * If we are detaching the original disk from a normal spare, then it
+ * implies that the spare should become a real disk, and be removed
+ * from the active spare list for the pool. dRAID spares on the
+ * other hand are coupled to the pool and thus should never be removed
+ * from the spares list.
*/
- if (pvd->vdev_ops == &vdev_spare_ops &&
- vd->vdev_id == 0 &&
- pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
- unspare = B_TRUE;
+ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
+ vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
+
+ if (last_cvd->vdev_isspare &&
+ last_cvd->vdev_ops != &vdev_draid_spare_ops) {
+ unspare = B_TRUE;
+ }
+ }
/*
* Erase the disk labels so the disk can be used for other things.
@@ -7903,6 +7971,9 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_checksum_errors = 0;
vdev_state_dirty(vd->vdev_top);
+
+ /* Tell userspace that the vdev is gone. */
+ zfs_post_remove(spa, vd);
}
for (int c = 0; c < vd->vdev_children; c++)
@@ -8013,18 +8084,9 @@ spa_async_thread(void *arg)
/*
* If any devices are done replacing, detach them.
*/
- if (tasks & SPA_ASYNC_RESILVER_DONE)
- spa_vdev_resilver_done(spa);
-
- /*
- * If any devices are done replacing, detach them. Then if no
- * top-level vdevs are rebuilding attempt to kick off a scrub.
- */
- if (tasks & SPA_ASYNC_REBUILD_DONE) {
+ if (tasks & SPA_ASYNC_RESILVER_DONE ||
+ tasks & SPA_ASYNC_REBUILD_DONE) {
spa_vdev_resilver_done(spa);
-
- if (!vdev_rebuild_active(spa->spa_root_vdev))
- (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB);
}
/*
@@ -8818,12 +8880,18 @@ spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
}
for (int i = 0; i < spa->spa_alloc_count; i++) {
- ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i]));
- ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i]));
- ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i]));
- normal->mc_alloc_max_slots[i] = slots_per_allocator;
- special->mc_alloc_max_slots[i] = slots_per_allocator;
- dedup->mc_alloc_max_slots[i] = slots_per_allocator;
+ ASSERT0(zfs_refcount_count(&normal->mc_allocator[i].
+ mca_alloc_slots));
+ ASSERT0(zfs_refcount_count(&special->mc_allocator[i].
+ mca_alloc_slots));
+ ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i].
+ mca_alloc_slots));
+ normal->mc_allocator[i].mca_alloc_max_slots =
+ slots_per_allocator;
+ special->mc_allocator[i].mca_alloc_max_slots =
+ slots_per_allocator;
+ dedup->mc_allocator[i].mca_alloc_max_slots =
+ slots_per_allocator;
}
normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
diff --git a/sys/contrib/openzfs/module/zfs/spa_history.c b/sys/contrib/openzfs/module/zfs/spa_history.c
index 2ab58815400a..2939c0366504 100644
--- a/sys/contrib/openzfs/module/zfs/spa_history.c
+++ b/sys/contrib/openzfs/module/zfs/spa_history.c
@@ -321,7 +321,7 @@ spa_history_log_sync(void *arg, dmu_tx_t *tx)
* posted as a result of the ZPOOL_HIST_CMD key being present
* it would result in only one sysevent being posted with the
* full command line arguments, requiring the consumer to know
- * how to parse and understand zfs(1M) command invocations.
+ * how to parse and understand zfs(8) command invocations.
*/
spa_history_log_notify(spa, nvl);
} else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index 04210472886c..f49be8eec01a 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -741,6 +741,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_min_ashift = INT_MAX;
spa->spa_max_ashift = 0;
+ spa->spa_min_alloc = INT_MAX;
/* Reset cached value */
spa->spa_dedup_dspace = ~0ULL;
@@ -1366,7 +1367,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
/*
* If anything changed, wait for it to sync. This ensures that,
- * from the system administrator's perspective, zpool(1M) commands
+ * from the system administrator's perspective, zpool(8) commands
* are synchronous. This is important for things like zpool offline:
* when the command completes, you expect no further I/O from ZFS.
*/
@@ -1807,10 +1808,11 @@ spa_update_dspace(spa_t *spa)
ddt_get_dedup_dspace(spa);
if (spa->spa_vdev_removal != NULL) {
/*
- * We can't allocate from the removing device, so
- * subtract its size. This prevents the DMU/DSL from
- * filling up the (now smaller) pool while we are in the
- * middle of removing the device.
+ * We can't allocate from the removing device, so subtract
+ * its size if it was included in dspace (i.e. if this is a
+ * normal-class vdev, not special/dedup). This prevents the
+ * DMU/DSL from filling up the (now smaller) pool while we
+ * are in the middle of removing the device.
*
* Note that the DMU/DSL doesn't actually know or care
* how much space is allocated (it does its own tracking
@@ -1822,8 +1824,10 @@ spa_update_dspace(spa_t *spa)
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
vdev_t *vd =
vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
- spa->spa_dspace -= spa_deflate(spa) ?
- vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+ if (vd->vdev_mg->mg_class == spa_normal_class(spa)) {
+ spa->spa_dspace -= spa_deflate(spa) ?
+ vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+ }
spa_config_exit(spa, SCL_VDEV, FTAG);
}
}
@@ -2435,7 +2439,7 @@ spa_fini(void)
boolean_t
spa_has_slogs(spa_t *spa)
{
- return (spa->spa_log_class->mc_rotor != NULL);
+ return (spa->spa_log_class->mc_groups != 0);
}
spa_log_state_t
diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c
index 65375b579da6..3efd26155014 100644
--- a/sys/contrib/openzfs/module/zfs/txg.c
+++ b/sys/contrib/openzfs/module/zfs/txg.c
@@ -305,9 +305,7 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
* significance to the chosen tx_cpu. Because.. Why not use
* the current cpu to index into the array?
*/
- kpreempt_disable();
- tc = &tx->tx_cpu[CPU_SEQID];
- kpreempt_enable();
+ tc = &tx->tx_cpu[CPU_SEQID_UNSTABLE];
mutex_enter(&tc->tc_open_lock);
txg = tx->tx_open_txg;
@@ -448,8 +446,9 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
* Commit callback taskq hasn't been created yet.
*/
tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
- boot_ncpus, defclsyspri, boot_ncpus, boot_ncpus * 2,
- TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ 100, defclsyspri, boot_ncpus, boot_ncpus * 2,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+ TASKQ_THREADS_CPU_PCT);
}
cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index 6af61cdcd9bf..7ffe924212da 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -40,6 +40,7 @@
#include <sys/dsl_dir.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_rebuild.h>
+#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -51,6 +52,7 @@
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
+#include <sys/vdev_raidz.h>
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
@@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
static vdev_ops_t *vdev_ops_table[] = {
&vdev_root_ops,
&vdev_raidz_ops,
+ &vdev_draid_ops,
+ &vdev_draid_spare_ops,
&vdev_mirror_ops,
&vdev_replacing_ops,
&vdev_spare_ops,
@@ -221,15 +225,16 @@ vdev_getops(const char *type)
/* ARGSUSED */
void
-vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
+vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
- res->rs_start = in->rs_start;
- res->rs_end = in->rs_end;
+ physical_rs->rs_start = logical_rs->rs_start;
+ physical_rs->rs_end = logical_rs->rs_end;
}
/*
* Derive the enumerated allocation bias from string input.
- * String origin is either the per-vdev zap or zpool(1M).
+ * String origin is either the per-vdev zap or zpool(8).
*/
static vdev_alloc_bias_t
vdev_derive_alloc_bias(const char *bias)
@@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize)
return (asize);
}
+uint64_t
+vdev_default_min_asize(vdev_t *vd)
+{
+ return (vd->vdev_min_asize);
+}
+
/*
* Get the minimum allocatable size. We define the allocatable size as
* the vdev's asize rounded to the nearest metaslab. This allows us to
@@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd)
if (vd == vd->vdev_top)
return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
- /*
- * The allocatable space for a raidz vdev is N * sizeof(smallest child),
- * so each child must provide at least 1/Nth of its asize.
- */
- if (pvd->vdev_ops == &vdev_raidz_ops)
- return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
- pvd->vdev_children);
-
- return (pvd->vdev_min_asize);
+ return (pvd->vdev_ops->vdev_op_min_asize(pvd));
}
void
@@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd)
vdev_set_min_asize(vd->vdev_child[c]);
}
+/*
+ * Get the minimal allocation size for the top-level vdev.
+ */
+uint64_t
+vdev_get_min_alloc(vdev_t *vd)
+{
+ uint64_t min_alloc = 1ULL << vd->vdev_ashift;
+
+ if (vd->vdev_ops->vdev_op_min_alloc != NULL)
+ min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
+
+ return (min_alloc);
+}
+
+/*
+ * Get the parity level for a top-level vdev.
+ */
+uint64_t
+vdev_get_nparity(vdev_t *vd)
+{
+ uint64_t nparity = 0;
+
+ if (vd->vdev_ops->vdev_op_nparity != NULL)
+ nparity = vd->vdev_ops->vdev_op_nparity(vd);
+
+ return (nparity);
+}
+
+/*
+ * Get the number of data disks for a top-level vdev.
+ */
+uint64_t
+vdev_get_ndisks(vdev_t *vd)
+{
+ uint64_t ndisks = 1;
+
+ if (vd->vdev_ops->vdev_op_ndisks != NULL)
+ ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
+
+ return (ndisks);
+}
+
vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
@@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
list_link_init(&vd->vdev_initialize_node);
list_link_init(&vd->vdev_leaf_node);
list_link_init(&vd->vdev_trim_node);
+
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
@@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
{
vdev_ops_t *ops;
char *type;
- uint64_t guid = 0, islog, nparity;
+ uint64_t guid = 0, islog;
vdev_t *vd;
vdev_indirect_config_t *vic;
char *tmp = NULL;
@@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
return (SET_ERROR(ENOTSUP));
- /*
- * Set the nparity property for RAID-Z vdevs.
- */
- nparity = -1ULL;
- if (ops == &vdev_raidz_ops) {
- if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
- &nparity) == 0) {
- if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
- return (SET_ERROR(EINVAL));
- /*
- * Previous versions could only support 1 or 2 parity
- * device.
- */
- if (nparity > 1 &&
- spa_version(spa) < SPA_VERSION_RAIDZ2)
- return (SET_ERROR(ENOTSUP));
- if (nparity > 2 &&
- spa_version(spa) < SPA_VERSION_RAIDZ3)
- return (SET_ERROR(ENOTSUP));
- } else {
- /*
- * We require the parity to be specified for SPAs that
- * support multiple parity levels.
- */
- if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
- return (SET_ERROR(EINVAL));
- /*
- * Otherwise, we default to 1 parity device for RAID-Z.
- */
- nparity = 1;
- }
- } else {
- nparity = 0;
- }
- ASSERT(nparity != -1ULL);
-
- /*
- * If creating a top-level vdev, check for allocation classes input
- */
if (top_level && alloctype == VDEV_ALLOC_ADD) {
char *bias;
+ /*
+ * If creating a top-level vdev, check for allocation
+ * classes input.
+ */
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
&bias) == 0) {
alloc_bias = vdev_derive_alloc_bias(bias);
@@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
return (SET_ERROR(ENOTSUP));
}
}
+
+ /* spa_vdev_add() expects feature to be enabled */
+ if (ops == &vdev_draid_ops &&
+ spa->spa_load_state != SPA_LOAD_CREATE &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
+ return (SET_ERROR(ENOTSUP));
+ }
}
- vd = vdev_alloc_common(spa, id, guid, ops);
- vic = &vd->vdev_indirect_config;
+ /*
+ * Initialize the vdev specific data. This is done before calling
+ * vdev_alloc_common() since it may fail and this simplifies the
+ * error reporting and cleanup code paths.
+ */
+ void *tsd = NULL;
+ if (ops->vdev_op_init != NULL) {
+ rc = ops->vdev_op_init(spa, nv, &tsd);
+ if (rc != 0) {
+ return (rc);
+ }
+ }
+ vd = vdev_alloc_common(spa, id, guid, ops);
+ vd->vdev_tsd = tsd;
vd->vdev_islog = islog;
- vd->vdev_nparity = nparity;
+
if (top_level && alloc_bias != VDEV_BIAS_NONE)
vd->vdev_alloc_bias = alloc_bias;
@@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_wholedisk) != 0)
vd->vdev_wholedisk = -1ULL;
+ vic = &vd->vdev_indirect_config;
+
ASSERT0(vic->vic_mapping_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
&vic->vic_mapping_object);
@@ -937,6 +967,9 @@ vdev_free(vdev_t *vd)
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+ if (vd->vdev_ops->vdev_op_fini != NULL)
+ vd->vdev_ops->vdev_op_fini(vd);
+
/*
* Discard allocation state.
*/
@@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd)
cv_destroy(&vd->vdev_trim_io_cv);
mutex_destroy(&vd->vdev_rebuild_lock);
- mutex_destroy(&vd->vdev_rebuild_io_lock);
cv_destroy(&vd->vdev_rebuild_cv);
- cv_destroy(&vd->vdev_rebuild_io_cv);
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd)
}
/*
- * Add a mirror/replacing vdev above an existing vdev.
+ * Add a mirror/replacing vdev above an existing vdev. There is no need to
+ * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
*/
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
@@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd)
spa->spa_max_ashift = vd->vdev_ashift;
if (vd->vdev_ashift < spa->spa_min_ashift)
spa->spa_min_ashift = vd->vdev_ashift;
+
+ uint64_t min_alloc = vdev_get_min_alloc(vd);
+ if (min_alloc < spa->spa_min_alloc)
+ spa->spa_min_alloc = min_alloc;
}
}
}
@@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd)
return (B_FALSE);
}
-void
-vdev_open_children(vdev_t *vd)
+/*
+ * Returns B_TRUE if the passed child should be opened.
+ */
+static boolean_t
+vdev_default_open_children_func(vdev_t *vd)
+{
+ return (B_TRUE);
+}
+
+/*
+ * Open the requested child vdevs. If any of the leaf vdevs are using
+ * a ZFS volume then do the opens in a single thread. This avoids a
+ * deadlock when the current thread is holding the spa_namespace_lock.
+ */
+static void
+vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
{
- taskq_t *tq;
int children = vd->vdev_children;
- /*
- * in order to handle pools on top of zvols, do the opens
- * in a single thread so that the same thread holds the
- * spa_namespace_lock
- */
- if (vdev_uses_zvols(vd)) {
-retry_sync:
- for (int c = 0; c < children; c++)
- vd->vdev_child[c]->vdev_open_error =
- vdev_open(vd->vdev_child[c]);
- } else {
- tq = taskq_create("vdev_open", children, minclsyspri,
- children, children, TASKQ_PREPOPULATE);
- if (tq == NULL)
- goto retry_sync;
+ taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+ vd->vdev_nonrot = B_TRUE;
- for (int c = 0; c < children; c++)
+ for (int c = 0; c < children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (open_func(cvd) == B_FALSE)
+ continue;
+
+ if (tq == NULL || vdev_uses_zvols(vd)) {
+ cvd->vdev_open_error = vdev_open(cvd);
+ } else {
VERIFY(taskq_dispatch(tq, vdev_open_child,
- vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID);
+ cvd, TQ_SLEEP) != TASKQID_INVALID);
+ }
+
+ vd->vdev_nonrot &= cvd->vdev_nonrot;
+ }
+ if (tq != NULL) {
+ taskq_wait(tq);
taskq_destroy(tq);
}
+}
- vd->vdev_nonrot = B_TRUE;
+/*
+ * Open all child vdevs.
+ */
+void
+vdev_open_children(vdev_t *vd)
+{
+ vdev_open_children_impl(vd, vdev_default_open_children_func);
+}
- for (int c = 0; c < children; c++)
- vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
+/*
+ * Conditionally open a subset of child vdevs.
+ */
+void
+vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
+{
+ vdev_open_children_impl(vd, open_func);
}
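A hedged usage sketch for the new subset entry point: a caller supplies a predicate deciding which children to open. Both functions below are hypothetical examples, not code from this change:

    /* Hypothetical predicate: open only children not currently faulted. */
    static boolean_t
    example_open_non_faulted(vdev_t *vd)
    {
            return (vd->vdev_faulted == 0);
    }

    /* Hypothetical caller reopening a subset of a top-level vdev's children. */
    static void
    example_reopen_children(vdev_t *tvd)
    {
            vdev_open_children_subset(tvd, example_open_non_faulted);
    }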
/*
@@ -1953,6 +2017,16 @@ vdev_open(vdev_t *vd)
}
/*
 * Track the minimum allocation size.
+ */
+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+ vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
+ uint64_t min_alloc = vdev_get_min_alloc(vd);
+ if (min_alloc < spa->spa_min_alloc)
+ spa->spa_min_alloc = min_alloc;
+ }
+
+ /*
* If this is a leaf vdev, assess whether a resilver is needed.
* But don't do this if we are doing a reopen for a scrub, since
* this would just restart the scrub we are already doing.
@@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd)
vdev_t *pvd = vd->vdev_parent;
spa_t *spa __maybe_unused = vd->vdev_spa;
- ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ ASSERT(vd != NULL);
+ ASSERT(vd->vdev_open_thread == curthread ||
+ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
/*
* If our parent is reopening, then we are as well, unless we are
@@ -2575,15 +2651,12 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
/*
* While we are loading the pool, the DTLs have not been loaded yet.
- * Ignore the DTLs and try all devices. This avoids a recursive
- * mutex enter on the vdev_dtl_lock, and also makes us try hard
- * when loading the pool (relying on the checksum to ensure that
- * we get the right data -- note that we while loading, we are
- * only reading the MOS, which is always checksummed).
+ * This isn't a problem, but it can result in devices being tried
+ * that are known not to have the data, in which case the import
+ * relies on the checksum to ensure that we get the right data.
+ * Note that while importing we are only reading the MOS, which is
+ * always checksummed.
*/
- if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE)
- return (B_FALSE);
-
mutex_enter(&vd->vdev_dtl_lock);
if (!range_tree_is_empty(rt))
dirty = range_tree_contains(rt, txg, size);
@@ -2606,10 +2679,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
}
/*
- * Returns B_TRUE if vdev determines offset needs to be resilvered.
+ * Check if the txg falls within the range which must be
+ * resilvered. DVAs outside this range can always be skipped.
*/
boolean_t
-vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ /* Set by sequential resilver. */
+ if (phys_birth == TXG_UNKNOWN)
+ return (B_TRUE);
+
+ return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
+}
+
+/*
+ * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
+ */
+boolean_t
+vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
{
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
@@ -2617,7 +2706,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
vd->vdev_ops->vdev_op_leaf)
return (B_TRUE);
- return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
+ return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
+ phys_birth));
}
/*
@@ -2862,8 +2952,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
continue; /* leaf vdevs only */
if (t == DTL_PARTIAL)
minref = 1; /* i.e. non-zero */
- else if (vd->vdev_nparity != 0)
- minref = vd->vdev_nparity + 1; /* RAID-Z */
+ else if (vdev_get_nparity(vd) != 0)
+ minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
else
minref = vd->vdev_children; /* any kind of mirror */
space_reftree_create(&reftree);
@@ -2884,6 +2974,7 @@ vdev_dtl_load(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_meta_objset;
+ range_tree_t *rt;
int error = 0;
if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
@@ -2895,10 +2986,17 @@ vdev_dtl_load(vdev_t *vd)
return (error);
ASSERT(vd->vdev_dtl_sm != NULL);
- mutex_enter(&vd->vdev_dtl_lock);
- error = space_map_load(vd->vdev_dtl_sm,
- vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
- mutex_exit(&vd->vdev_dtl_lock);
+ rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC);
+ if (error == 0) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ range_tree_walk(rt, range_tree_add,
+ vd->vdev_dtl[DTL_MISSING]);
+ mutex_exit(&vd->vdev_dtl_lock);
+ }
+
+ range_tree_vacate(rt, NULL, NULL);
+ range_tree_destroy(rt);
return (error);
}
@@ -3727,6 +3825,9 @@ top:
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
tvd = vd->vdev_top;
mg = tvd->vdev_mg;
generation = spa->spa_config_generation + 1;
@@ -3971,6 +4072,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
static void
vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
{
+ /*
+ * Exclude the dRAID spare when aggregating to avoid double counting
+ * the ops and bytes. These IOs are counted by the physical leaves.
+ */
+ if (cvd->vdev_ops == &vdev_draid_spare_ops)
+ return;
+
for (int t = 0; t < VS_ZIO_TYPES; t++) {
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
@@ -4063,7 +4171,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vdev_get_child_stat(cvd, vs, cvs);
if (vsx)
vdev_get_child_stat_ex(cvd, vsx, cvsx);
-
}
} else {
/*
@@ -4248,7 +4355,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
/*
* Repair is the result of a rebuild issued by the
- * rebuild thread (vdev_rebuild_thread).
+ * rebuild thread (vdev_rebuild_thread). To avoid
+ * double counting repaired bytes, the virtual dRAID
+ * spare vdev is excluded from the processed bytes.
*/
if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
vdev_t *tvd = vd->vdev_top;
@@ -4256,8 +4365,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
- if (vd->vdev_ops->vdev_op_leaf)
+ if (vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_ops != &vdev_draid_spare_ops) {
atomic_add_64(rebuilt, psize);
+ }
vs->vs_rebuild_processed += psize;
}
@@ -4353,8 +4464,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
return;
- if (spa->spa_load_state == SPA_LOAD_NONE &&
- type == ZIO_TYPE_WRITE && txg != 0 &&
+ if (type == ZIO_TYPE_WRITE && txg != 0 &&
(!(flags & ZIO_FLAG_IO_REPAIR) ||
(flags & ZIO_FLAG_SCAN_THREAD) ||
spa->spa_claiming)) {
@@ -4981,31 +5091,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
vdev_resilver_needed(vd, NULL, NULL));
}
+boolean_t
+vdev_xlate_is_empty(range_seg64_t *rs)
+{
+ return (rs->rs_start == rs->rs_end);
+}
+
/*
- * Translate a logical range to the physical range for the specified vdev_t.
- * This function is initially called with a leaf vdev and will walk each
- * parent vdev until it reaches a top-level vdev. Once the top-level is
- * reached the physical range is initialized and the recursive function
- * begins to unwind. As it unwinds it calls the parent's vdev specific
- * translation function to do the real conversion.
+ * Translate a logical range to the first contiguous physical range for the
+ * specified vdev_t. This function is initially called with a leaf vdev and
+ * will walk each parent vdev until it reaches a top-level vdev. Once the
+ * top-level is reached the physical range is initialized and the recursive
+ * function begins to unwind. As it unwinds it calls the parent's vdev
+ * specific translation function to do the real conversion.
*/
void
vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
- range_seg64_t *physical_rs)
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
/*
* Walk up the vdev tree
*/
if (vd != vd->vdev_top) {
- vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+ vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
+ remain_rs);
} else {
/*
- * We've reached the top-level vdev, initialize the
- * physical range to the logical range and start to
- * unwind.
+ * We've reached the top-level vdev, initialize the physical
+ * range to the logical range and set an empty remaining
+ * range, then start to unwind.
*/
physical_rs->rs_start = logical_rs->rs_start;
physical_rs->rs_end = logical_rs->rs_end;
+
+ remain_rs->rs_start = logical_rs->rs_start;
+ remain_rs->rs_end = logical_rs->rs_start;
+
return;
}
@@ -5015,16 +5136,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
/*
* As this recursive function unwinds, translate the logical
- * range into its physical components by calling the
- * vdev specific translate function.
+ * range into its physical and any remaining components by calling
+ * the vdev specific translate function.
*/
range_seg64_t intermediate = { 0 };
- pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+ pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
physical_rs->rs_start = intermediate.rs_start;
physical_rs->rs_end = intermediate.rs_end;
}
+void
+vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
+ vdev_xlate_func_t *func, void *arg)
+{
+ range_seg64_t iter_rs = *logical_rs;
+ range_seg64_t physical_rs;
+ range_seg64_t remain_rs;
+
+ while (!vdev_xlate_is_empty(&iter_rs)) {
+
+ vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
+
+ /*
+ * With raidz and dRAID, it's possible that the logical range
+ * does not live on this leaf vdev. The provided function is
+ * only called when the translated physical range is non-empty.
+ */
+ if (!vdev_xlate_is_empty(&physical_rs))
+ func(arg, &physical_rs);
+
+ iter_rs = remain_rs;
+ }
+}
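
For illustration only (not part of this commit), a hypothetical caller of vdev_xlate_walk() might look like the sketch below: it sums the physical bytes that a leaf vdev contributes to a logical range. The callback signature is inferred from the func(arg, &physical_rs) call in the loop above, and the usual vdev headers are assumed.

static void
xlate_sum_cb(void *arg, range_seg64_t *physical_rs)
{
    uint64_t *total = arg;

    /* Accumulate the length of each contiguous physical segment. */
    *total += physical_rs->rs_end - physical_rs->rs_start;
}

static uint64_t
xlate_physical_size(vdev_t *leaf_vd, uint64_t start, uint64_t size)
{
    range_seg64_t logical_rs = {
        .rs_start = start,
        .rs_end = start + size,
    };
    uint64_t total = 0;

    vdev_xlate_walk(leaf_vd, &logical_rs, xlate_sum_cb, &total);
    return (total);
}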
+
/*
* Look at the vdev tree and determine whether any devices are currently being
* replaced.
diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c
new file mode 100644
index 000000000000..6b7ad7021a50
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c
@@ -0,0 +1,2984 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018 Intel Corporation.
+ * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/abd.h>
+#include <sys/zio.h>
+#include <sys/nvpair.h>
+#include <sys/zio_checksum.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <zfs_fletcher.h>
+
+#ifdef ZFS_DEBUG
+#include <sys/vdev.h> /* For vdev_xlate() in vdev_draid_io_verify() */
+#endif
+
+/*
+ * dRAID is a distributed spare implementation for ZFS. A dRAID vdev is
+ * comprised of multiple raidz redundancy groups which are spread over the
+ * dRAID children. To ensure an even distribution, and avoid hot spots, a
+ * permutation mapping is applied to the order of the dRAID children.
+ * This mixing effectively distributes the parity columns evenly over all
+ * of the disks in the dRAID.
+ *
+ * This is beneficial because it means that when resilvering all of the
+ * disks can participate, thereby increasing the available IOPS and
+ * bandwidth. Furthermore, by reserving a small fraction of each child's
+ * total capacity, virtual distributed spare disks can be created. These
+ * spares similarly benefit from the performance gains of spanning all of
+ * the children. As a consequence, resilvering to a distributed spare can
+ * substantially reduce the time required to restore full parity to a
+ * pool with a failed disk.
+ *
+ * === dRAID group layout ===
+ *
+ * First, let's define a "row" in the configuration to be a 16M chunk from
+ * each physical drive at the same offset. This is the minimum allowable
+ * size since it must be possible to store a full 16M block when there is
+ * only a single data column. Next, we define a "group" to be a set of
+ * sequential disks containing both the parity and data columns. We allow
+ * groups to span multiple rows in order to align any group size to any
+ * number of physical drives. Finally, a "slice" is comprised of the rows
+ * which contain the target number of groups. The permutation mappings
+ * are applied in a round robin fashion to each slice.
+ *
+ * Given D+P drives in a group (including parity drives) and C-S physical
+ * drives (not including the spare drives), the groups can be distributed
+ * across R rows without remainder by making a slice span a number of
+ * drive slots equal to the least common multiple of D+P and C-S; i.e.
+ * ngroups = LCM(D+P, C-S) / (D+P) groups and R = LCM(D+P, C-S) / (C-S).
+ *
+ * In the example below, there are C=14 physical drives in the configuration
+ * with S=2 drives worth of spare capacity. Each group has a width of 9
+ * which includes D=8 data and P=1 parity drive. There are 4 groups and
+ * 3 rows per slice. Each group has a size of 144M (16M * 9) and a slice
+ * size is 576M (144M * 4). When allocating from a dRAID each group is
+ * filled before moving on to the next as shown in slice 0 below.
+ *
+ * data disks (8 data + 1 parity) spares (2)
+ * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * ^ | 2 | 6 | 1 | 11| 4 | 0 | 7 | 10| 8 | 9 | 13| 5 | 12| 3 | device map 0
+ * | +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * | | group 0 | group 1..| |
+ * | +-----------------------------------+-----------+-------|
+ * | | 0 1 2 3 4 5 6 7 8 | 36 37 38| | r
+ * | | 9 10 11 12 13 14 15 16 17| 45 46 47| | o
+ * | | 18 19 20 21 22 23 24 25 26| 54 55 56| | w
+ * | 27 28 29 30 31 32 33 34 35| 63 64 65| | 0
+ * s +-----------------------+-----------------------+-------+
+ * l | ..group 1 | group 2.. | |
+ * i +-----------------------+-----------------------+-------+
+ * c | 39 40 41 42 43 44| 72 73 74 75 76 77| | r
+ * e | 48 49 50 51 52 53| 81 82 83 84 85 86| | o
+ * 0 | 57 58 59 60 61 62| 90 91 92 93 94 95| | w
+ * | 66 67 68 69 70 71| 99 100 101 102 103 104| | 1
+ * | +-----------+-----------+-----------------------+-------+
+ * | |..group 2 | group 3 | |
+ * | +-----------+-----------+-----------------------+-------+
+ * | | 78 79 80|108 109 110 111 112 113 114 115 116| | r
+ * | | 87 88 89|117 118 119 120 121 122 123 124 125| | o
+ * | | 96 97 98|126 127 128 129 130 131 132 133 134| | w
+ * v |105 106 107|135 136 137 138 139 140 141 142 143| | 2
+ * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * | 9 | 11| 12| 2 | 4 | 1 | 3 | 0 | 10| 13| 8 | 5 | 6 | 7 | device map 1
+ * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * l | group 4 | group 5..| | row 3
+ * i +-----------------------+-----------+-----------+-------|
+ * c | ..group 5 | group 6.. | | row 4
+ * e +-----------+-----------+-----------------------+-------+
+ * 1 |..group 6 | group 7 | | row 5
+ * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * | 3 | 5 | 10| 8 | 6 | 11| 12| 0 | 2 | 4 | 7 | 1 | 9 | 13| device map 2
+ * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * l | group 8 | group 9..| | row 6
+ * i +-----------------------------------------------+-------|
+ * c | ..group 9 | group 10.. | | row 7
+ * e +-----------------------+-----------------------+-------+
+ * 2 |..group 10 | group 11 | | row 8
+ * +-----------+-----------------------------------+-------+
+ *
+ * This layout has several advantages over requiring that each row contain
+ * a whole number of groups.
+ *
+ * 1. The group count is not a relevant parameter when defining a dRAID
+ * layout. Only the group width is needed, and *all* groups will have
+ * the desired size.
+ *
+ * 2. All possible group widths (<= physical disk count) can be supported.
+ *
+ * 3. The logic within vdev_draid.c is simplified when the group width is
+ * the same for all groups (although some of the logic around computing
+ * permutation numbers and drive offsets is more complicated).
+ *
+ * N.B. The following array describes all valid dRAID permutation maps.
+ * Each row is used to generate a permutation map for a different number
+ * of children from a unique seed. The seeds were generated and carefully
+ * evaluated by the 'draid' utility in order to provide balanced mappings.
+ * In addition to the seed a checksum of the in-memory mapping is stored
+ * for verification.
+ *
+ * The imbalance ratio of a given failure (e.g. 5 disks wide, child 3 failed,
+ * with a given permutation map) is the ratio of the amounts of I/O that will
+ * be sent to the least and most busy disks when resilvering. The average
+ * imbalance ratio (of a given number of disks and permutation map) is the
+ * average of the ratios of all possible single and double disk failures.
+ *
+ * In order to achieve a low imbalance ratio the number of permutations in
+ * the mapping must be significantly larger than the number of children.
+ * For dRAID the number of permutations has been limited to 512 to minimize
+ * the map size. This does result in a gradually increasing imbalance ratio
+ * as seen in the table below. Increasing the number of permutations for
+ * larger child counts would reduce the imbalance ratio. However, in practice
+ * when there are a large number of children each child is responsible for
+ * fewer total IOs so it's less of a concern.
+ *
+ * Note these values are hard coded and must never be changed. Existing
+ * pools depend on the same mapping always being generated in order to
+ * read and write from the correct locations. Any change would make
+ * existing pools completely inaccessible.
+ */
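
To make the arithmetic above concrete, the following stand-alone sketch (not part of the commit) reproduces the numbers in the example: C=14 children, S=2 spares, D=8 data and P=1 parity with a 16M row height, giving 4 groups and 3 rows per slice, a 144M group and a 576M slice.

#include <stdio.h>
#include <stdint.h>

static uint64_t
gcd(uint64_t a, uint64_t b)
{
    while (b != 0) {
        uint64_t t = a % b;
        a = b;
        b = t;
    }
    return (a);
}

int
main(void)
{
    const uint64_t C = 14, S = 2, D = 8, P = 1;
    const uint64_t rowheight = 16ULL << 20;         /* 16M per disk per row */
    uint64_t groupwidth = D + P;                    /* 9 */
    uint64_t ndisks = C - S;                        /* 12 */
    uint64_t slots = groupwidth * ndisks / gcd(groupwidth, ndisks); /* 36 */
    uint64_t groups_per_slice = slots / groupwidth; /* 4 */
    uint64_t rows_per_slice = slots / ndisks;       /* 3 */
    uint64_t groupsz = groupwidth * rowheight;      /* 144M */
    uint64_t slicesz = groups_per_slice * groupsz;  /* 576M */

    printf("groups/slice %llu rows/slice %llu groupsz %lluM slicesz %lluM\n",
        (unsigned long long)groups_per_slice,
        (unsigned long long)rows_per_slice,
        (unsigned long long)(groupsz >> 20),
        (unsigned long long)(slicesz >> 20));
    return (0);
}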
+static const draid_map_t draid_maps[VDEV_DRAID_MAX_MAPS] = {
+ { 2, 256, 0x89ef3dabbcc7de37, 0x00000000433d433d }, /* 1.000 */
+ { 3, 256, 0x89a57f3de98121b4, 0x00000000bcd8b7b5 }, /* 1.000 */
+ { 4, 256, 0xc9ea9ec82340c885, 0x00000001819d7c69 }, /* 1.000 */
+ { 5, 256, 0xf46733b7f4d47dfd, 0x00000002a1648d74 }, /* 1.010 */
+ { 6, 256, 0x88c3c62d8585b362, 0x00000003d3b0c2c4 }, /* 1.031 */
+ { 7, 256, 0x3a65d809b4d1b9d5, 0x000000055c4183ee }, /* 1.043 */
+ { 8, 256, 0xe98930e3c5d2e90a, 0x00000006edfb0329 }, /* 1.059 */
+ { 9, 256, 0x5a5430036b982ccb, 0x00000008ceaf6934 }, /* 1.056 */
+ { 10, 256, 0x92bf389e9eadac74, 0x0000000b26668c09 }, /* 1.072 */
+ { 11, 256, 0x74ccebf1dcf3ae80, 0x0000000dd691358c }, /* 1.083 */
+ { 12, 256, 0x8847e41a1a9f5671, 0x00000010a0c63c8e }, /* 1.097 */
+ { 13, 256, 0x7481b56debf0e637, 0x0000001424121fe4 }, /* 1.100 */
+ { 14, 256, 0x559b8c44065f8967, 0x00000016ab2ff079 }, /* 1.121 */
+ { 15, 256, 0x34c49545a2ee7f01, 0x0000001a6028efd6 }, /* 1.103 */
+ { 16, 256, 0xb85f4fa81a7698f7, 0x0000001e95ff5e66 }, /* 1.111 */
+ { 17, 256, 0x6353e47b7e47aba0, 0x00000021a81fa0fe }, /* 1.133 */
+ { 18, 256, 0xaa549746b1cbb81c, 0x00000026f02494c9 }, /* 1.131 */
+ { 19, 256, 0x892e343f2f31d690, 0x00000029eb392835 }, /* 1.130 */
+ { 20, 256, 0x76914824db98cc3f, 0x0000003004f31a7c }, /* 1.141 */
+ { 21, 256, 0x4b3cbabf9cfb1d0f, 0x00000036363a2408 }, /* 1.139 */
+ { 22, 256, 0xf45c77abb4f035d4, 0x00000038dd0f3e84 }, /* 1.150 */
+ { 23, 256, 0x5e18bd7f3fd4baf4, 0x0000003f0660391f }, /* 1.174 */
+ { 24, 256, 0xa7b3a4d285d6503b, 0x000000443dfc9ff6 }, /* 1.168 */
+ { 25, 256, 0x56ac7dd967521f5a, 0x0000004b03a87eb7 }, /* 1.180 */
+ { 26, 256, 0x3a42dfda4eb880f7, 0x000000522c719bba }, /* 1.226 */
+ { 27, 256, 0xd200d2fc6b54bf60, 0x0000005760b4fdf5 }, /* 1.228 */
+ { 28, 256, 0xc52605bbd486c546, 0x0000005e00d8f74c }, /* 1.217 */
+ { 29, 256, 0xc761779e63cd762f, 0x00000067be3cd85c }, /* 1.239 */
+ { 30, 256, 0xca577b1e07f85ca5, 0x0000006f5517f3e4 }, /* 1.238 */
+ { 31, 256, 0xfd50a593c518b3d4, 0x0000007370e7778f }, /* 1.273 */
+ { 32, 512, 0xc6c87ba5b042650b, 0x000000f7eb08a156 }, /* 1.191 */
+ { 33, 512, 0xc3880d0c9d458304, 0x0000010734b5d160 }, /* 1.199 */
+ { 34, 512, 0xe920927e4d8b2c97, 0x00000118c1edbce0 }, /* 1.195 */
+ { 35, 512, 0x8da7fcda87bde316, 0x0000012a3e9f9110 }, /* 1.201 */
+ { 36, 512, 0xcf09937491514a29, 0x0000013bd6a24bef }, /* 1.194 */
+ { 37, 512, 0x9b5abbf345cbd7cc, 0x0000014b9d90fac3 }, /* 1.237 */
+ { 38, 512, 0x506312a44668d6a9, 0x0000015e1b5f6148 }, /* 1.242 */
+ { 39, 512, 0x71659ede62b4755f, 0x00000173ef029bcd }, /* 1.231 */
+ { 40, 512, 0xa7fde73fb74cf2d7, 0x000001866fb72748 }, /* 1.233 */
+ { 41, 512, 0x19e8b461a1dea1d3, 0x000001a046f76b23 }, /* 1.271 */
+ { 42, 512, 0x031c9b868cc3e976, 0x000001afa64c49d3 }, /* 1.263 */
+ { 43, 512, 0xbaa5125faa781854, 0x000001c76789e278 }, /* 1.270 */
+ { 44, 512, 0x4ed55052550d721b, 0x000001d800ccd8eb }, /* 1.281 */
+ { 45, 512, 0x0fd63ddbdff90677, 0x000001f08ad59ed2 }, /* 1.282 */
+ { 46, 512, 0x36d66546de7fdd6f, 0x000002016f09574b }, /* 1.286 */
+ { 47, 512, 0x99f997e7eafb69d7, 0x0000021e42e47cb6 }, /* 1.329 */
+ { 48, 512, 0xbecd9c2571312c5d, 0x000002320fe2872b }, /* 1.286 */
+ { 49, 512, 0xd97371329e488a32, 0x0000024cd73f2ca7 }, /* 1.322 */
+ { 50, 512, 0x30e9b136670749ee, 0x000002681c83b0e0 }, /* 1.335 */
+ { 51, 512, 0x11ad6bc8f47aaeb4, 0x0000027e9261b5d5 }, /* 1.305 */
+ { 52, 512, 0x68e445300af432c1, 0x0000029aa0eb7dbf }, /* 1.330 */
+ { 53, 512, 0x910fb561657ea98c, 0x000002b3dca04853 }, /* 1.365 */
+ { 54, 512, 0xd619693d8ce5e7a5, 0x000002cc280e9c97 }, /* 1.334 */
+ { 55, 512, 0x24e281f564dbb60a, 0x000002e9fa842713 }, /* 1.364 */
+ { 56, 512, 0x947a7d3bdaab44c5, 0x000003046680f72e }, /* 1.374 */
+ { 57, 512, 0x2d44fec9c093e0de, 0x00000324198ba810 }, /* 1.363 */
+ { 58, 512, 0x87743c272d29bb4c, 0x0000033ec48c9ac9 }, /* 1.401 */
+ { 59, 512, 0x96aa3b6f67f5d923, 0x0000034faead902c }, /* 1.392 */
+ { 60, 512, 0x94a4f1faf520b0d3, 0x0000037d713ab005 }, /* 1.360 */
+ { 61, 512, 0xb13ed3a272f711a2, 0x00000397368f3cbd }, /* 1.396 */
+ { 62, 512, 0x3b1b11805fa4a64a, 0x000003b8a5e2840c }, /* 1.453 */
+ { 63, 512, 0x4c74caad9172ba71, 0x000003d4be280290 }, /* 1.437 */
+ { 64, 512, 0x035ff643923dd29e, 0x000003fad6c355e1 }, /* 1.402 */
+ { 65, 512, 0x768e9171b11abd3c, 0x0000040eb07fed20 }, /* 1.459 */
+ { 66, 512, 0x75880e6f78a13ddd, 0x000004433d6acf14 }, /* 1.423 */
+ { 67, 512, 0x910b9714f698a877, 0x00000451ea65d5db }, /* 1.447 */
+ { 68, 512, 0x87f5db6f9fdcf5c7, 0x000004732169e3f7 }, /* 1.450 */
+ { 69, 512, 0x836d4968fbaa3706, 0x000004954068a380 }, /* 1.455 */
+ { 70, 512, 0xc567d73a036421ab, 0x000004bd7cb7bd3d }, /* 1.463 */
+ { 71, 512, 0x619df40f240b8fed, 0x000004e376c2e972 }, /* 1.463 */
+ { 72, 512, 0x42763a680d5bed8e, 0x000005084275c680 }, /* 1.452 */
+ { 73, 512, 0x5866f064b3230431, 0x0000052906f2c9ab }, /* 1.498 */
+ { 74, 512, 0x9fa08548b1621a44, 0x0000054708019247 }, /* 1.526 */
+ { 75, 512, 0xb6053078ce0fc303, 0x00000572cc5c72b0 }, /* 1.491 */
+ { 76, 512, 0x4a7aad7bf3890923, 0x0000058e987bc8e9 }, /* 1.470 */
+ { 77, 512, 0xe165613fd75b5a53, 0x000005c20473a211 }, /* 1.527 */
+ { 78, 512, 0x3ff154ac878163a6, 0x000005d659194bf3 }, /* 1.509 */
+ { 79, 512, 0x24b93ade0aa8a532, 0x0000060a201c4f8e }, /* 1.569 */
+ { 80, 512, 0xc18e2d14cd9bb554, 0x0000062c55cfe48c }, /* 1.555 */
+ { 81, 512, 0x98cc78302feb58b6, 0x0000066656a07194 }, /* 1.509 */
+ { 82, 512, 0xc6c5fd5a2abc0543, 0x0000067cff94fbf8 }, /* 1.596 */
+ { 83, 512, 0xa7962f514acbba21, 0x000006ab7b5afa2e }, /* 1.568 */
+ { 84, 512, 0xba02545069ddc6dc, 0x000006d19861364f }, /* 1.541 */
+ { 85, 512, 0x447c73192c35073e, 0x000006fce315ce35 }, /* 1.623 */
+ { 86, 512, 0x48beef9e2d42b0c2, 0x00000720a8e38b6b }, /* 1.620 */
+ { 87, 512, 0x4874cf98541a35e0, 0x00000758382a2273 }, /* 1.597 */
+ { 88, 512, 0xad4cf8333a31127a, 0x00000781e1651b1b }, /* 1.575 */
+ { 89, 512, 0x47ae4859d57888c1, 0x000007b27edbe5bc }, /* 1.627 */
+ { 90, 512, 0x06f7723cfe5d1891, 0x000007dc2a96d8eb }, /* 1.596 */
+ { 91, 512, 0xd4e44218d660576d, 0x0000080ac46f02d5 }, /* 1.622 */
+ { 92, 512, 0x7066702b0d5be1f2, 0x00000832c96d154e }, /* 1.695 */
+ { 93, 512, 0x011209b4f9e11fb9, 0x0000085eefda104c }, /* 1.605 */
+ { 94, 512, 0x47ffba30a0b35708, 0x00000899badc32dc }, /* 1.625 */
+ { 95, 512, 0x1a95a6ac4538aaa8, 0x000008b6b69a42b2 }, /* 1.687 */
+ { 96, 512, 0xbda2b239bb2008eb, 0x000008f22d2de38a }, /* 1.621 */
+ { 97, 512, 0x7ffa0bea90355c6c, 0x0000092e5b23b816 }, /* 1.699 */
+ { 98, 512, 0x1d56ba34be426795, 0x0000094f482e5d1b }, /* 1.688 */
+ { 99, 512, 0x0aa89d45c502e93d, 0x00000977d94a98ce }, /* 1.642 */
+ { 100, 512, 0x54369449f6857774, 0x000009c06c9b34cc }, /* 1.683 */
+ { 101, 512, 0xf7d4dd8445b46765, 0x000009e5dc542259 }, /* 1.755 */
+ { 102, 512, 0xfa8866312f169469, 0x00000a16b54eae93 }, /* 1.692 */
+ { 103, 512, 0xd8a5aea08aef3ff9, 0x00000a381d2cbfe7 }, /* 1.747 */
+ { 104, 512, 0x66bcd2c3d5f9ef0e, 0x00000a8191817be7 }, /* 1.751 */
+ { 105, 512, 0x3fb13a47a012ec81, 0x00000ab562b9a254 }, /* 1.751 */
+ { 106, 512, 0x43100f01c9e5e3ca, 0x00000aeee84c185f }, /* 1.726 */
+ { 107, 512, 0xca09c50ccee2d054, 0x00000b1c359c047d }, /* 1.788 */
+ { 108, 512, 0xd7176732ac503f9b, 0x00000b578bc52a73 }, /* 1.740 */
+ { 109, 512, 0xed206e51f8d9422d, 0x00000b8083e0d960 }, /* 1.780 */
+ { 110, 512, 0x17ead5dc6ba0dcd6, 0x00000bcfb1a32ca8 }, /* 1.836 */
+ { 111, 512, 0x5f1dc21e38a969eb, 0x00000c0171becdd6 }, /* 1.778 */
+ { 112, 512, 0xddaa973de33ec528, 0x00000c3edaba4b95 }, /* 1.831 */
+ { 113, 512, 0x2a5eccd7735a3630, 0x00000c630664e7df }, /* 1.825 */
+ { 114, 512, 0xafcccee5c0b71446, 0x00000cb65392f6e4 }, /* 1.826 */
+ { 115, 512, 0x8fa30c5e7b147e27, 0x00000cd4db391e55 }, /* 1.843 */
+ { 116, 512, 0x5afe0711fdfafd82, 0x00000d08cb4ec35d }, /* 1.826 */
+ { 117, 512, 0x533a6090238afd4c, 0x00000d336f115d1b }, /* 1.803 */
+ { 118, 512, 0x90cf11b595e39a84, 0x00000d8e041c2048 }, /* 1.857 */
+ { 119, 512, 0x0d61a3b809444009, 0x00000dcb798afe35 }, /* 1.877 */
+ { 120, 512, 0x7f34da0f54b0d114, 0x00000df3922664e1 }, /* 1.849 */
+ { 121, 512, 0xa52258d5b72f6551, 0x00000e4d37a9872d }, /* 1.867 */
+ { 122, 512, 0xc1de54d7672878db, 0x00000e6583a94cf6 }, /* 1.978 */
+ { 123, 512, 0x1d03354316a414ab, 0x00000ebffc50308d }, /* 1.947 */
+ { 124, 512, 0xcebdcc377665412c, 0x00000edee1997cea }, /* 1.865 */
+ { 125, 512, 0x4ddd4c04b1a12344, 0x00000f21d64b373f }, /* 1.881 */
+ { 126, 512, 0x64fc8f94e3973658, 0x00000f8f87a8896b }, /* 1.882 */
+ { 127, 512, 0x68765f78034a334e, 0x00000fb8fe62197e }, /* 1.867 */
+ { 128, 512, 0xaf36b871a303e816, 0x00000fec6f3afb1e }, /* 1.972 */
+ { 129, 512, 0x2a4cbf73866c3a28, 0x00001027febfe4e5 }, /* 1.896 */
+ { 130, 512, 0x9cb128aacdcd3b2f, 0x0000106aa8ac569d }, /* 1.965 */
+ { 131, 512, 0x5511d41c55869124, 0x000010bbd755ddf1 }, /* 1.963 */
+ { 132, 512, 0x42f92461937f284a, 0x000010fb8bceb3b5 }, /* 1.925 */
+ { 133, 512, 0xe2d89a1cf6f1f287, 0x0000114cf5331e34 }, /* 1.862 */
+ { 134, 512, 0xdc631a038956200e, 0x0000116428d2adc5 }, /* 2.042 */
+ { 135, 512, 0xb2e5ac222cd236be, 0x000011ca88e4d4d2 }, /* 1.935 */
+ { 136, 512, 0xbc7d8236655d88e7, 0x000011e39cb94e66 }, /* 2.005 */
+ { 137, 512, 0x073e02d88d2d8e75, 0x0000123136c7933c }, /* 2.041 */
+ { 138, 512, 0x3ddb9c3873166be0, 0x00001280e4ec6d52 }, /* 1.997 */
+ { 139, 512, 0x7d3b1a845420e1b5, 0x000012c2e7cd6a44 }, /* 1.996 */
+ { 140, 512, 0x60102308aa7b2a6c, 0x000012fc490e6c7d }, /* 2.053 */
+ { 141, 512, 0xdb22bb2f9eb894aa, 0x00001343f5a85a1a }, /* 1.971 */
+ { 142, 512, 0xd853f879a13b1606, 0x000013bb7d5f9048 }, /* 2.018 */
+ { 143, 512, 0x001620a03f804b1d, 0x000013e74cc794fd }, /* 1.961 */
+ { 144, 512, 0xfdb52dda76fbf667, 0x00001442d2f22480 }, /* 2.046 */
+ { 145, 512, 0xa9160110f66e24ff, 0x0000144b899f9dbb }, /* 1.968 */
+ { 146, 512, 0x77306a30379ae03b, 0x000014cb98eb1f81 }, /* 2.143 */
+ { 147, 512, 0x14f5985d2752319d, 0x000014feab821fc9 }, /* 2.064 */
+ { 148, 512, 0xa4b8ff11de7863f8, 0x0000154a0e60b9c9 }, /* 2.023 */
+ { 149, 512, 0x44b345426455c1b3, 0x000015999c3c569c }, /* 2.136 */
+ { 150, 512, 0x272677826049b46c, 0x000015c9697f4b92 }, /* 2.063 */
+ { 151, 512, 0x2f9216e2cd74fe40, 0x0000162b1f7bbd39 }, /* 1.974 */
+ { 152, 512, 0x706ae3e763ad8771, 0x00001661371c55e1 }, /* 2.210 */
+ { 153, 512, 0xf7fd345307c2480e, 0x000016e251f28b6a }, /* 2.006 */
+ { 154, 512, 0x6e94e3d26b3139eb, 0x000016f2429bb8c6 }, /* 2.193 */
+ { 155, 512, 0x5458bbfbb781fcba, 0x0000173efdeca1b9 }, /* 2.163 */
+ { 156, 512, 0xa80e2afeccd93b33, 0x000017bfdcb78adc }, /* 2.046 */
+ { 157, 512, 0x1e4ccbb22796cf9d, 0x00001826fdcc39c9 }, /* 2.084 */
+ { 158, 512, 0x8fba4b676aaa3663, 0x00001841a1379480 }, /* 2.264 */
+ { 159, 512, 0xf82b843814b315fa, 0x000018886e19b8a3 }, /* 2.074 */
+ { 160, 512, 0x7f21e920ecf753a3, 0x0000191812ca0ea7 }, /* 2.282 */
+ { 161, 512, 0x48bb8ea2c4caa620, 0x0000192f310faccf }, /* 2.148 */
+ { 162, 512, 0x5cdb652b4952c91b, 0x0000199e1d7437c7 }, /* 2.355 */
+ { 163, 512, 0x6ac1ba6f78c06cd4, 0x000019cd11f82c70 }, /* 2.164 */
+ { 164, 512, 0x9faf5f9ca2669a56, 0x00001a18d5431f6a }, /* 2.393 */
+ { 165, 512, 0xaa57e9383eb01194, 0x00001a9e7d253d85 }, /* 2.178 */
+ { 166, 512, 0x896967bf495c34d2, 0x00001afb8319b9fc }, /* 2.334 */
+ { 167, 512, 0xdfad5f05de225f1b, 0x00001b3a59c3093b }, /* 2.266 */
+ { 168, 512, 0xfd299a99f9f2abdd, 0x00001bb6f1a10799 }, /* 2.304 */
+ { 169, 512, 0xdda239e798fe9fd4, 0x00001bfae0c9692d }, /* 2.218 */
+ { 170, 512, 0x5fca670414a32c3e, 0x00001c22129dbcff }, /* 2.377 */
+ { 171, 512, 0x1bb8934314b087de, 0x00001c955db36cd0 }, /* 2.155 */
+ { 172, 512, 0xd96394b4b082200d, 0x00001cfc8619b7e6 }, /* 2.404 */
+ { 173, 512, 0xb612a7735b1c8cbc, 0x00001d303acdd585 }, /* 2.205 */
+ { 174, 512, 0x28e7430fe5875fe1, 0x00001d7ed5b3697d }, /* 2.359 */
+ { 175, 512, 0x5038e89efdd981b9, 0x00001dc40ec35c59 }, /* 2.158 */
+ { 176, 512, 0x075fd78f1d14db7c, 0x00001e31c83b4a2b }, /* 2.614 */
+ { 177, 512, 0xc50fafdb5021be15, 0x00001e7cdac82fbc }, /* 2.239 */
+ { 178, 512, 0xe6dc7572ce7b91c7, 0x00001edd8bb454fc }, /* 2.493 */
+ { 179, 512, 0x21f7843e7beda537, 0x00001f3a8e019d6c }, /* 2.327 */
+ { 180, 512, 0xc83385e20b43ec82, 0x00001f70735ec137 }, /* 2.231 */
+ { 181, 512, 0xca818217dddb21fd, 0x0000201ca44c5a3c }, /* 2.237 */
+ { 182, 512, 0xe6035defea48f933, 0x00002038e3346658 }, /* 2.691 */
+ { 183, 512, 0x47262a4f953dac5a, 0x000020c2e554314e }, /* 2.170 */
+ { 184, 512, 0xe24c7246260873ea, 0x000021197e618d64 }, /* 2.600 */
+ { 185, 512, 0xeef6b57c9b58e9e1, 0x0000217ea48ecddc }, /* 2.391 */
+ { 186, 512, 0x2becd3346e386142, 0x000021c496d4a5f9 }, /* 2.677 */
+ { 187, 512, 0x63c6207bdf3b40a3, 0x0000220e0f2eec0c }, /* 2.410 */
+ { 188, 512, 0x3056ce8989767d4b, 0x0000228eb76cd137 }, /* 2.776 */
+ { 189, 512, 0x91af61c307cee780, 0x000022e17e2ea501 }, /* 2.266 */
+ { 190, 512, 0xda359da225f6d54f, 0x00002358a2debc19 }, /* 2.717 */
+ { 191, 512, 0x0a5f7a2a55607ba0, 0x0000238a79dac18c }, /* 2.474 */
+ { 192, 512, 0x27bb75bf5224638a, 0x00002403a58e2351 }, /* 2.673 */
+ { 193, 512, 0x1ebfdb94630f5d0f, 0x00002492a10cb339 }, /* 2.420 */
+ { 194, 512, 0x6eae5e51d9c5f6fb, 0x000024ce4bf98715 }, /* 2.898 */
+ { 195, 512, 0x08d903b4daedc2e0, 0x0000250d1e15886c }, /* 2.363 */
+ { 196, 512, 0xc722a2f7fa7cd686, 0x0000258a99ed0c9e }, /* 2.747 */
+ { 197, 512, 0x8f71faf0e54e361d, 0x000025dee11976f5 }, /* 2.531 */
+ { 198, 512, 0x87f64695c91a54e7, 0x0000264e00a43da0 }, /* 2.707 */
+ { 199, 512, 0xc719cbac2c336b92, 0x000026d327277ac1 }, /* 2.315 */
+ { 200, 512, 0xe7e647afaf771ade, 0x000027523a5c44bf }, /* 3.012 */
+ { 201, 512, 0x12d4b5c38ce8c946, 0x0000273898432545 }, /* 2.378 */
+ { 202, 512, 0xf2e0cd4067bdc94a, 0x000027e47bb2c935 }, /* 2.969 */
+ { 203, 512, 0x21b79f14d6d947d3, 0x0000281e64977f0d }, /* 2.594 */
+ { 204, 512, 0x515093f952f18cd6, 0x0000289691a473fd }, /* 2.763 */
+ { 205, 512, 0xd47b160a1b1022c8, 0x00002903e8b52411 }, /* 2.457 */
+ { 206, 512, 0xc02fc96684715a16, 0x0000297515608601 }, /* 3.057 */
+ { 207, 512, 0xef51e68efba72ed0, 0x000029ef73604804 }, /* 2.590 */
+ { 208, 512, 0x9e3be6e5448b4f33, 0x00002a2846ed074b }, /* 3.047 */
+ { 209, 512, 0x81d446c6d5fec063, 0x00002a92ca693455 }, /* 2.676 */
+ { 210, 512, 0xff215de8224e57d5, 0x00002b2271fe3729 }, /* 2.993 */
+ { 211, 512, 0xe2524d9ba8f69796, 0x00002b64b99c3ba2 }, /* 2.457 */
+ { 212, 512, 0xf6b28e26097b7e4b, 0x00002bd768b6e068 }, /* 3.182 */
+ { 213, 512, 0x893a487f30ce1644, 0x00002c67f722b4b2 }, /* 2.563 */
+ { 214, 512, 0x386566c3fc9871df, 0x00002cc1cf8b4037 }, /* 3.025 */
+ { 215, 512, 0x1e0ed78edf1f558a, 0x00002d3948d36c7f }, /* 2.730 */
+ { 216, 512, 0xe3bc20c31e61f113, 0x00002d6d6b12e025 }, /* 3.036 */
+ { 217, 512, 0xd6c3ad2e23021882, 0x00002deff7572241 }, /* 2.722 */
+ { 218, 512, 0xb4a9f95cf0f69c5a, 0x00002e67d537aa36 }, /* 3.356 */
+ { 219, 512, 0x6e98ed6f6c38e82f, 0x00002e9720626789 }, /* 2.697 */
+ { 220, 512, 0x2e01edba33fddac7, 0x00002f407c6b0198 }, /* 2.979 */
+ { 221, 512, 0x559d02e1f5f57ccc, 0x00002fb6a5ab4f24 }, /* 2.858 */
+ { 222, 512, 0xac18f5a916adcd8e, 0x0000304ae1c5c57e }, /* 3.258 */
+ { 223, 512, 0x15789fbaddb86f4b, 0x0000306f6e019c78 }, /* 2.693 */
+ { 224, 512, 0xf4a9c36d5bc4c408, 0x000030da40434213 }, /* 3.259 */
+ { 225, 512, 0xf640f90fd2727f44, 0x00003189ed37b90c }, /* 2.733 */
+ { 226, 512, 0xb5313d390d61884a, 0x000031e152616b37 }, /* 3.235 */
+ { 227, 512, 0x4bae6b3ce9160939, 0x0000321f40aeac42 }, /* 2.983 */
+ { 228, 512, 0x838c34480f1a66a1, 0x000032f389c0f78e }, /* 3.308 */
+ { 229, 512, 0xb1c4a52c8e3d6060, 0x0000330062a40284 }, /* 2.715 */
+ { 230, 512, 0xe0f1110c6d0ed822, 0x0000338be435644f }, /* 3.540 */
+ { 231, 512, 0x9f1a8ccdcea68d4b, 0x000034045a4e97e1 }, /* 2.779 */
+ { 232, 512, 0x3261ed62223f3099, 0x000034702cfc401c }, /* 3.084 */
+ { 233, 512, 0xf2191e2311022d65, 0x00003509dd19c9fc }, /* 2.987 */
+ { 234, 512, 0xf102a395c2033abc, 0x000035654dc96fae }, /* 3.341 */
+ { 235, 512, 0x11fe378f027906b6, 0x000035b5193b0264 }, /* 2.793 */
+ { 236, 512, 0xf777f2c026b337aa, 0x000036704f5d9297 }, /* 3.518 */
+ { 237, 512, 0x1b04e9c2ee143f32, 0x000036dfbb7af218 }, /* 2.962 */
+ { 238, 512, 0x2fcec95266f9352c, 0x00003785c8df24a9 }, /* 3.196 */
+ { 239, 512, 0xfe2b0e47e427dd85, 0x000037cbdf5da729 }, /* 2.914 */
+ { 240, 512, 0x72b49bf2225f6c6d, 0x0000382227c15855 }, /* 3.408 */
+ { 241, 512, 0x50486b43df7df9c7, 0x0000389b88be6453 }, /* 2.903 */
+ { 242, 512, 0x5192a3e53181c8ab, 0x000038ddf3d67263 }, /* 3.778 */
+ { 243, 512, 0xe9f5d8365296fd5e, 0x0000399f1c6c9e9c }, /* 3.026 */
+ { 244, 512, 0xc740263f0301efa8, 0x00003a147146512d }, /* 3.347 */
+ { 245, 512, 0x23cd0f2b5671e67d, 0x00003ab10bcc0d9d }, /* 3.212 */
+ { 246, 512, 0x002ccc7e5cd41390, 0x00003ad6cd14a6c0 }, /* 3.482 */
+ { 247, 512, 0x9aafb3c02544b31b, 0x00003b8cb8779fb0 }, /* 3.146 */
+ { 248, 512, 0x72ba07a78b121999, 0x00003c24142a5a3f }, /* 3.626 */
+ { 249, 512, 0x3d784aa58edfc7b4, 0x00003cd084817d99 }, /* 2.952 */
+ { 250, 512, 0xaab750424d8004af, 0x00003d506a8e098e }, /* 3.463 */
+ { 251, 512, 0x84403fcf8e6b5ca2, 0x00003d4c54c2aec4 }, /* 3.131 */
+ { 252, 512, 0x71eb7455ec98e207, 0x00003e655715cf2c }, /* 3.538 */
+ { 253, 512, 0xd752b4f19301595b, 0x00003ecd7b2ca5ac }, /* 2.974 */
+ { 254, 512, 0xc4674129750499de, 0x00003e99e86d3e95 }, /* 3.843 */
+ { 255, 512, 0x9772baff5cd12ef5, 0x00003f895c019841 }, /* 3.088 */
+};
+
+/*
+ * Verify the map is valid. Each device index must appear exactly
+ * once in every row, and the permutation array checksum must match.
+ */
+static int
+verify_perms(uint8_t *perms, uint64_t children, uint64_t nperms,
+ uint64_t checksum)
+{
+ int countssz = sizeof (uint16_t) * children;
+ uint16_t *counts = kmem_zalloc(countssz, KM_SLEEP);
+
+ for (int i = 0; i < nperms; i++) {
+ for (int j = 0; j < children; j++) {
+ uint8_t val = perms[(i * children) + j];
+
+ if (val >= children || counts[val] != i) {
+ kmem_free(counts, countssz);
+ return (EINVAL);
+ }
+
+ counts[val]++;
+ }
+ }
+
+ if (checksum != 0) {
+ int permssz = sizeof (uint8_t) * children * nperms;
+ zio_cksum_t cksum;
+
+ fletcher_4_native_varsize(perms, permssz, &cksum);
+
+ if (checksum != cksum.zc_word[0]) {
+ kmem_free(counts, countssz);
+ return (ECKSUM);
+ }
+ }
+
+ kmem_free(counts, countssz);
+
+ return (0);
+}
+
+/*
+ * Generate the permutation array for the draid_map_t. These maps control
+ * the placement of all data in a dRAID. Therefore it's critical that the
+ * seed always generates the same mapping. We provide our own pseudo-random
+ * number generator for this purpose.
+ */
+int
+vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp)
+{
+ VERIFY3U(map->dm_children, >=, VDEV_DRAID_MIN_CHILDREN);
+ VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN);
+ VERIFY3U(map->dm_seed, !=, 0);
+ VERIFY3U(map->dm_nperms, !=, 0);
+ VERIFY3P(map->dm_perms, ==, NULL);
+
+#ifdef _KERNEL
+ /*
+ * The kernel code always provides both a map_seed and checksum.
+ * Only the tests/zfs-tests/cmd/draid/draid.c utility will provide
+ * a zero checksum when generating new candidate maps.
+ */
+ VERIFY3U(map->dm_checksum, !=, 0);
+#endif
+ uint64_t children = map->dm_children;
+ uint64_t nperms = map->dm_nperms;
+ int rowsz = sizeof (uint8_t) * children;
+ int permssz = rowsz * nperms;
+ uint8_t *perms;
+
+ /* Allocate the permutation array */
+ perms = vmem_alloc(permssz, KM_SLEEP);
+
+ /* Setup an initial row with a known pattern */
+ uint8_t *initial_row = kmem_alloc(rowsz, KM_SLEEP);
+ for (int i = 0; i < children; i++)
+ initial_row[i] = i;
+
+ uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed };
+ uint8_t *current_row, *previous_row = initial_row;
+
+ /*
+ * Perform a Fisher-Yates shuffle of each row using the previous
+ * row as the starting point. An initial_row with known pattern
+ * is used as the input for the first row.
+ */
+ for (int i = 0; i < nperms; i++) {
+ current_row = &perms[i * children];
+ memcpy(current_row, previous_row, rowsz);
+
+ for (int j = children - 1; j > 0; j--) {
+ uint64_t k = vdev_draid_rand(draid_seed) % (j + 1);
+ uint8_t val = current_row[j];
+ current_row[j] = current_row[k];
+ current_row[k] = val;
+ }
+
+ previous_row = current_row;
+ }
+
+ kmem_free(initial_row, rowsz);
+
+ int error = verify_perms(perms, children, nperms, map->dm_checksum);
+ if (error) {
+ vmem_free(perms, permssz);
+ return (error);
+ }
+
+ *permsp = perms;
+
+ return (0);
+}
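
The shuffle above is deterministic given the seed, which is what makes the on-disk maps reproducible. The stand-alone sketch below shows the same row-chaining Fisher-Yates structure; the xorshift64 PRNG is only a stand-in for vdev_draid_rand() (not shown in this hunk), so it will not reproduce the real maps.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static uint64_t
prng_next(uint64_t *state)
{
    uint64_t x = *state;

    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;
    return (*state = x);
}

static void
shuffle_rows(uint8_t *perms, uint64_t children, uint64_t nperms, uint64_t seed)
{
    uint64_t state = seed;

    /* Row 0 is a shuffle of the identity permutation. */
    for (uint64_t i = 0; i < children; i++)
        perms[i] = i;

    for (uint64_t r = 0; r < nperms; r++) {
        uint8_t *row = &perms[r * children];

        /* Each later row is a shuffle of the row before it. */
        if (r > 0)
            memcpy(row, &perms[(r - 1) * children], children);

        for (uint64_t j = children - 1; j > 0; j--) {
            uint64_t k = prng_next(&state) % (j + 1);
            uint8_t val = row[j];

            row[j] = row[k];
            row[k] = val;
        }
    }
}

int
main(void)
{
    uint8_t perms[4 * 8];

    shuffle_rows(perms, 8, 4, 0xdeadbeef);
    for (int r = 0; r < 4; r++) {
        for (int c = 0; c < 8; c++)
            printf("%d ", perms[r * 8 + c]);
        printf("\n");
    }
    return (0);
}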
+
+/*
+ * Lookup the fixed draid_map_t for the requested number of children.
+ */
+int
+vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp)
+{
+ for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) {
+ if (draid_maps[i].dm_children == children) {
+ *mapp = &draid_maps[i];
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
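
Taken together, the two functions above are how a fixed map is materialized. A hypothetical in-tree helper (illustrative only; error handling and the eventual vmem_free() of the permutation array remain the caller's responsibility) could look like:

static int
draid_load_map_example(uint64_t children, const draid_map_t **mapp,
    uint8_t **permsp)
{
    int error;

    /* Find the fixed seed/checksum entry for this child count. */
    error = vdev_draid_lookup_map(children, mapp);
    if (error != 0)
        return (error);

    /* Expand it into the children x nperms permutation array. */
    return (vdev_draid_generate_perms(*mapp, permsp));
}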
+
+/*
+ * Lookup the permutation array and iteration id for the provided offset.
+ */
+static void
+vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex,
+ uint8_t **base, uint64_t *iter)
+{
+ uint64_t ncols = vdc->vdc_children;
+ uint64_t poff = pindex % (vdc->vdc_nperms * ncols);
+
+ *base = vdc->vdc_perms + (poff / ncols) * ncols;
+ *iter = poff % ncols;
+}
+
+static inline uint64_t
+vdev_draid_permute_id(vdev_draid_config_t *vdc,
+ uint8_t *base, uint64_t iter, uint64_t index)
+{
+ return ((base[index] + iter) % vdc->vdc_children);
+}
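
A small stand-alone illustration of the two lookups above (the perms[] contents are made up for the example): each stored permutation row is reused 'children' times, once per rotation value iter, so 256 or 512 stored rows yield children times as many effective permutations.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    const uint64_t children = 4, nperms = 2;
    const uint8_t perms[2 * 4] = {
        2, 0, 3, 1,     /* stored permutation row 0 */
        1, 3, 0, 2,     /* stored permutation row 1 */
    };

    for (uint64_t pindex = 0; pindex < nperms * children; pindex++) {
        uint64_t poff = pindex % (nperms * children);
        const uint8_t *base = &perms[(poff / children) * children];
        uint64_t iter = poff % children;

        printf("pindex %llu:", (unsigned long long)pindex);
        for (uint64_t i = 0; i < children; i++)
            printf(" %llu",
                (unsigned long long)((base[i] + iter) % children));
        printf("\n");
    }
    return (0);
}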
+
+/*
+ * Return the asize, which is the psize rounded up to a full group width
+ * (i.e. this serves as vdev_draid_psize_to_asize()).
+ */
+static uint64_t
+vdev_draid_asize(vdev_t *vd, uint64_t psize)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+ uint64_t ashift = vd->vdev_ashift;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ uint64_t rows = ((psize - 1) / (vdc->vdc_ndata << ashift)) + 1;
+ uint64_t asize = (rows * vdc->vdc_groupwidth) << ashift;
+
+ ASSERT3U(asize, !=, 0);
+ ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0);
+
+ return (asize);
+}
+
+/*
+ * Deflate the asize to the psize; this includes stripping parity.
+ */
+uint64_t
+vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT0(asize % vdc->vdc_groupwidth);
+
+ return ((asize / vdc->vdc_groupwidth) * vdc->vdc_ndata);
+}
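
A stand-alone sketch (not part of the commit) of the two conversions above for an assumed ndata=8, nparity=1, ashift=12 layout. Note that deflating an asize returns the padded data size of the full stripes, not the original psize.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    const uint64_t ndata = 8, nparity = 1, ashift = 12;
    const uint64_t groupwidth = ndata + nparity;
    const uint64_t psizes[] = { 4096, 20480, 131072 };

    for (int i = 0; i < 3; i++) {
        uint64_t psize = psizes[i];
        uint64_t rows = ((psize - 1) / (ndata << ashift)) + 1;
        uint64_t asize = (rows * groupwidth) << ashift;

        /* Deflating strips the parity sectors back out. */
        uint64_t back = (asize / groupwidth) * ndata;

        printf("psize %7llu -> asize %7llu (deflates to %7llu)\n",
            (unsigned long long)psize, (unsigned long long)asize,
            (unsigned long long)back);
    }
    return (0);
}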
+
+/*
+ * Convert a logical offset to the corresponding group number.
+ */
+static uint64_t
+vdev_draid_offset_to_group(vdev_t *vd, uint64_t offset)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return (offset / vdc->vdc_groupsz);
+}
+
+/*
+ * Convert a group number to the logical starting offset for that group.
+ */
+static uint64_t
+vdev_draid_group_to_offset(vdev_t *vd, uint64_t group)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return (group * vdc->vdc_groupsz);
+}
+
+static void
+vdev_draid_map_free_vsd(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ ASSERT0(rm->rm_freed);
+ rm->rm_freed = B_TRUE;
+
+ if (rm->rm_reports == 0) {
+ vdev_raidz_map_free(rm);
+ }
+}
+
+/*ARGSUSED*/
+static void
+vdev_draid_cksum_free(void *arg, size_t ignored)
+{
+ raidz_map_t *rm = arg;
+
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (--rm->rm_reports == 0 && rm->rm_freed)
+ vdev_raidz_map_free(rm);
+}
+
+static void
+vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
+{
+ raidz_map_t *rm = zcr->zcr_cbdata;
+ const size_t c = zcr->zcr_cbinfo;
+ uint64_t skip_size = zcr->zcr_sector;
+ uint64_t parity_size;
+ size_t x, offset, size;
+
+ if (good_data == NULL) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+ return;
+ }
+
+ /*
+ * Detailed cksum reporting is currently only supported for single-
+ * row dRAID mappings, which cover the vast majority of zios. Only
+ * a dRAID zio which spans groups will have multiple rows.
+ */
+ if (rm->rm_nrows != 1) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+ return;
+ }
+
+ raidz_row_t *rr = rm->rm_row[0];
+ const abd_t *good = NULL;
+ const abd_t *bad = rr->rr_col[c].rc_abd;
+
+ if (c < rr->rr_firstdatacol) {
+ /*
+ * The first time through, calculate the parity blocks for
+ * the good data (this relies on the fact that the good
+ * data never changes for a given logical zio)
+ */
+ if (rr->rr_col[0].rc_gdata == NULL) {
+ abd_t *bad_parity[VDEV_DRAID_MAXPARITY];
+
+ /*
+ * Set up the rr_col[]s to generate the parity for
+ * good_data, first saving the parity bufs and
+ * replacing them with buffers to hold the result.
+ */
+ for (x = 0; x < rr->rr_firstdatacol; x++) {
+ bad_parity[x] = rr->rr_col[x].rc_abd;
+ rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata =
+ abd_alloc_sametype(rr->rr_col[x].rc_abd,
+ rr->rr_col[x].rc_size);
+ }
+
+ /*
+ * Fill in the data columns from good_data being
+ * careful to pad short columns and empty columns
+ * with a skip sector.
+ */
+ uint64_t good_size = abd_get_size((abd_t *)good_data);
+
+ offset = 0;
+ for (; x < rr->rr_cols; x++) {
+ abd_put(rr->rr_col[x].rc_abd);
+
+ if (offset == good_size) {
+ /* empty data column (small write) */
+ rr->rr_col[x].rc_abd =
+ abd_get_zeros(skip_size);
+ } else if (x < rr->rr_bigcols) {
+ /* this is a "big column" */
+ size = rr->rr_col[x].rc_size;
+ rr->rr_col[x].rc_abd =
+ abd_get_offset_size(
+ (abd_t *)good_data, offset, size);
+ offset += size;
+ } else {
+ /* short data column, add skip sector */
+ size = rr->rr_col[x].rc_size - skip_size;
+ rr->rr_col[x].rc_abd = abd_alloc(
+ rr->rr_col[x].rc_size, B_TRUE);
+ abd_copy_off(rr->rr_col[x].rc_abd,
+ (abd_t *)good_data, 0, offset,
+ size);
+ abd_zero_off(rr->rr_col[x].rc_abd,
+ size, skip_size);
+ offset += size;
+ }
+ }
+
+ /*
+ * Construct the parity from the good data.
+ */
+ vdev_raidz_generate_parity_row(rm, rr);
+
+ /* restore everything back to its original state */
+ for (x = 0; x < rr->rr_firstdatacol; x++)
+ rr->rr_col[x].rc_abd = bad_parity[x];
+
+ offset = 0;
+ for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) {
+ if (offset == good_size || x < rr->rr_bigcols)
+ abd_put(rr->rr_col[x].rc_abd);
+ else
+ abd_free(rr->rr_col[x].rc_abd);
+
+ rr->rr_col[x].rc_abd = abd_get_offset_size(
+ rr->rr_abd_copy, offset,
+ rr->rr_col[x].rc_size);
+ offset += rr->rr_col[x].rc_size;
+ }
+ }
+
+ ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL);
+ good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0,
+ rr->rr_col[c].rc_size);
+ } else {
+ /* adjust good_data to point at the start of our column */
+ parity_size = size = rr->rr_col[0].rc_size;
+ if (c >= rr->rr_bigcols) {
+ size -= skip_size;
+ zcr->zcr_length = size;
+ }
+
+ /* empty column */
+ if (size == 0) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_TRUE);
+ return;
+ }
+
+ offset = 0;
+ for (x = rr->rr_firstdatacol; x < c; x++) {
+ if (x < rr->rr_bigcols) {
+ offset += parity_size;
+ } else {
+ offset += parity_size - skip_size;
+ }
+ }
+
+ good = abd_get_offset_size((abd_t *)good_data, offset, size);
+ }
+
+ /* we drop the ereport if it ends up that the data was good */
+ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+ abd_put((abd_t *)good);
+}
+
+/*
+ * Invoked indirectly by zfs_ereport_start_checksum(), called
+ * below when our read operation fails completely. The main point
+ * is to keep a copy of everything we read from disk, so that at
+ * vdev_draid_cksum_finish() time we can compare it with the good data.
+ */
+static void
+vdev_draid_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
+{
+ size_t c = (size_t)(uintptr_t)arg;
+ raidz_map_t *rm = zio->io_vsd;
+
+ /* set up the report and bump the refcount */
+ zcr->zcr_cbdata = rm;
+ zcr->zcr_cbinfo = c;
+ zcr->zcr_finish = vdev_draid_cksum_finish;
+ zcr->zcr_free = vdev_draid_cksum_free;
+
+ rm->rm_reports++;
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (rm->rm_row[0]->rr_abd_copy != NULL)
+ return;
+
+ /*
+ * It's the first time we're called for this raidz_map_t, so we need
+ * to copy the data aside; there's no guarantee that our zio's buffer
+ * won't be re-used for something else.
+ *
+ * Our parity data is already in separate buffers, so there's no need
+ * to copy them. Furthermore, all columns should have been expanded
+ * by vdev_draid_map_alloc_empty() when attempting reconstruction.
+ */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ size_t offset = 0;
+ size_t size = 0;
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ ASSERT3U(rr->rr_col[c].rc_size, ==,
+ rr->rr_col[0].rc_size);
+ size += rr->rr_col[c].rc_size;
+ }
+
+ rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE);
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
+ abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy,
+ offset, col->rc_size);
+
+ abd_copy(tmp, col->rc_abd, col->rc_size);
+
+ if (abd_is_gang(col->rc_abd))
+ abd_free(col->rc_abd);
+ else
+ abd_put(col->rc_abd);
+
+ col->rc_abd = tmp;
+ offset += col->rc_size;
+ }
+ ASSERT3U(offset, ==, size);
+ }
+}
+
+const zio_vsd_ops_t vdev_draid_vsd_ops = {
+ .vsd_free = vdev_draid_map_free_vsd,
+ .vsd_cksum_report = vdev_draid_cksum_report
+};
+
+/*
+ * Full stripe writes. When writing, all columns (D+P) are required. Parity
+ * is calculated over all the columns, including empty zero filled sectors,
+ * and each is written to disk. While only the data columns are needed for
+ * a normal read, all of the columns are required for reconstruction when
+ * performing a sequential resilver.
+ *
+ * For "big columns" it's sufficient to map the correct range of the zio ABD.
+ * Partial columns require allocating a gang ABD in order to zero fill the
+ * empty sectors. When the column is empty a zero filled sector must be
+ * mapped. In all cases the data ABDs must be the same size as the parity
+ * ABDs (e.g. rc->rc_size == parity_size).
+ */
+static void
+vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+ uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+ uint64_t parity_size = rr->rr_col[0].rc_size;
+ uint64_t abd_off = abd_offset;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3U(parity_size, ==, abd_get_size(rr->rr_col[0].rc_abd));
+
+ for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size == 0) {
+ /* empty data column (small write), add a skip sector */
+ ASSERT3U(skip_size, ==, parity_size);
+ rc->rc_abd = abd_get_zeros(skip_size);
+ } else if (rc->rc_size == parity_size) {
+ /* this is a "big column" */
+ rc->rc_abd = abd_get_offset_size(zio->io_abd,
+ abd_off, rc->rc_size);
+ } else {
+ /* short data column, add a skip sector */
+ ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
+ rc->rc_abd = abd_alloc_gang_abd();
+ abd_gang_add(rc->rc_abd, abd_get_offset_size(
+ zio->io_abd, abd_off, rc->rc_size), B_TRUE);
+ abd_gang_add(rc->rc_abd, abd_get_zeros(skip_size),
+ B_TRUE);
+ }
+
+ ASSERT3U(abd_get_size(rc->rc_abd), ==, parity_size);
+
+ abd_off += rc->rc_size;
+ rc->rc_size = parity_size;
+ }
+
+ IMPLY(abd_offset != 0, abd_off == zio->io_size);
+}
+
+/*
+ * Scrub/resilver reads. In order to store the contents of the skip sectors
+ * an additional ABD is allocated. The columns are handled in the same way
+ * as a full stripe write except instead of using the zero ABD the newly
+ * allocated skip ABD is used to back the skip sectors. In all cases the
+ * data ABD must be the same size as the parity ABDs.
+ */
+static void
+vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+ uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+ uint64_t parity_size = rr->rr_col[0].rc_size;
+ uint64_t abd_off = abd_offset;
+ uint64_t skip_off = 0;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ ASSERT3P(rr->rr_abd_empty, ==, NULL);
+
+ if (rr->rr_nempty > 0) {
+ rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
+ B_FALSE);
+ }
+
+ for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size == 0) {
+ /* empty data column (small read), add a skip sector */
+ ASSERT3U(skip_size, ==, parity_size);
+ ASSERT3U(rr->rr_nempty, !=, 0);
+ rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
+ skip_off, skip_size);
+ skip_off += skip_size;
+ } else if (rc->rc_size == parity_size) {
+ /* this is a "big column" */
+ rc->rc_abd = abd_get_offset_size(zio->io_abd,
+ abd_off, rc->rc_size);
+ } else {
+ /* short data column, add a skip sector */
+ ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
+ ASSERT3U(rr->rr_nempty, !=, 0);
+ rc->rc_abd = abd_alloc_gang_abd();
+ abd_gang_add(rc->rc_abd, abd_get_offset_size(
+ zio->io_abd, abd_off, rc->rc_size), B_TRUE);
+ abd_gang_add(rc->rc_abd, abd_get_offset_size(
+ rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
+ skip_off += skip_size;
+ }
+
+ uint64_t abd_size = abd_get_size(rc->rc_abd);
+ ASSERT3U(abd_size, ==, abd_get_size(rr->rr_col[0].rc_abd));
+
+ /*
+ * Increase rc_size so the skip ABD is included in subsequent
+ * parity calculations.
+ */
+ abd_off += rc->rc_size;
+ rc->rc_size = abd_size;
+ }
+
+ IMPLY(abd_offset != 0, abd_off == zio->io_size);
+ ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
+}
+
+/*
+ * Normal reads. In this common case only the columns containing data
+ * are read into the zio ABDs. Neither the parity columns nor the empty
+ * skip sectors are read unless the checksum fails verification, in which
+ * case vdev_raidz_read_all() will call vdev_draid_map_alloc_empty() to
+ * expand the raid map in order to allow reconstruction using the parity
+ * data and skip sectors.
+ */
+static void
+vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+ uint64_t abd_off = abd_offset;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size > 0) {
+ rc->rc_abd = abd_get_offset_size(zio->io_abd,
+ abd_off, rc->rc_size);
+ abd_off += rc->rc_size;
+ }
+ }
+
+ IMPLY(abd_offset != 0, abd_off == zio->io_size);
+}
+
+/*
+ * Converts a normal "read" raidz_row_t to a "scrub" raidz_row_t. The key
+ * difference is that an ABD is allocated to back skip sectors so they may
+ * be read in to memory, verified, and repaired if needed.
+ */
+void
+vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr)
+{
+ uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+ uint64_t parity_size = rr->rr_col[0].rc_size;
+ uint64_t skip_off = 0;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ ASSERT3P(rr->rr_abd_empty, ==, NULL);
+
+ if (rr->rr_nempty > 0) {
+ rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
+ B_FALSE);
+ }
+
+ for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size == 0) {
+ /* empty data column (small read), add a skip sector */
+ ASSERT3U(skip_size, ==, parity_size);
+ ASSERT3U(rr->rr_nempty, !=, 0);
+ ASSERT3P(rc->rc_abd, ==, NULL);
+ rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
+ skip_off, skip_size);
+ skip_off += skip_size;
+ } else if (rc->rc_size == parity_size) {
+ /* this is a "big column", nothing to add */
+ ASSERT3P(rc->rc_abd, !=, NULL);
+ } else {
+ /* short data column, add a skip sector */
+ ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
+ ASSERT3U(rr->rr_nempty, !=, 0);
+ ASSERT3P(rc->rc_abd, !=, NULL);
+ ASSERT(!abd_is_gang(rc->rc_abd));
+ abd_t *read_abd = rc->rc_abd;
+ rc->rc_abd = abd_alloc_gang_abd();
+ abd_gang_add(rc->rc_abd, read_abd, B_TRUE);
+ abd_gang_add(rc->rc_abd, abd_get_offset_size(
+ rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
+ skip_off += skip_size;
+ }
+
+ /*
+ * Increase rc_size so the empty ABD is included in subsequent
+ * parity calculations.
+ */
+ rc->rc_size = parity_size;
+ }
+
+ ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
+}
+
+/*
+ * Given a logical address within a dRAID configuration, return the physical
+ * address on the first drive in the group that this address maps to
+ * (at position 'start' in permutation number 'perm').
+ */
+static uint64_t
+vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset,
+ uint64_t *perm, uint64_t *start)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ /* b is the dRAID (parent) sector offset. */
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ uint64_t b_offset = logical_offset >> ashift;
+
+ /*
+ * The height of a row in units of the vdev's minimum sector size.
+ * This is the amount of data written to each disk of each group
+ * in a given permutation.
+ */
+ uint64_t rowheight_sectors = VDEV_DRAID_ROWHEIGHT >> ashift;
+
+ /*
+ * We cycle through a disk permutation every groupsz * ngroups chunk
+ * of address space. Note that ngroups * groupsz must be a multiple
+ * of the number of data drives (ndisks) in order to guarantee
+ * alignment. So, for example, if our row height is 16MB, our group
+ * size is 10, and there are 13 data drives in the draid, then ngroups
+ * will be 13, we will change permutation every 2.08GB and each
+ * disk will have 160MB of data per chunk.
+ */
+ uint64_t groupwidth = vdc->vdc_groupwidth;
+ uint64_t ngroups = vdc->vdc_ngroups;
+ uint64_t ndisks = vdc->vdc_ndisks;
+
+ /*
+ * groupstart is where the group this IO will land in "starts" in
+ * the permutation array.
+ */
+ uint64_t group = logical_offset / vdc->vdc_groupsz;
+ uint64_t groupstart = (group * groupwidth) % ndisks;
+ ASSERT3U(groupstart + groupwidth, <=, ndisks + groupstart);
+ *start = groupstart;
+
+ /* b_offset is the sector offset within a group chunk */
+ b_offset = b_offset % (rowheight_sectors * groupwidth);
+ ASSERT0(b_offset % groupwidth);
+
+ /*
+ * Find the starting byte offset on each child vdev:
+ * - within a permutation there are ngroups groups spread over the
+ * rows, where each row covers a slice portion of the disk
+ * - each permutation has (groupwidth * ngroups) / ndisks rows
+ * - so each permutation covers rows * slice portion of the disk
+ * - so we need to find the row where this IO group target begins
+ */
+ *perm = group / ngroups;
+ uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) +
+ (((group % ngroups) * groupwidth) / ndisks);
+
+ return (((rowheight_sectors * row) +
+ (b_offset / groupwidth)) << ashift);
+}
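
To ground the math above, here is a stand-alone walk-through (not part of the commit) using the numbers from the comment: a 16M row height, group width 10 and 13 disks. The chosen logical offset is the fourth stripe of group 2, which lands at permuted position 7 on device row 1, three sectors in.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    const uint64_t ashift = 12;
    const uint64_t rowheight_sectors = (16ULL << 20) >> ashift;
    const uint64_t groupwidth = 10, ndisks = 13, ngroups = 13;
    const uint64_t groupsz = groupwidth * (16ULL << 20);    /* 160M */

    /* Fourth stripe of group 2 (offsets must be stripe aligned). */
    uint64_t logical_offset = 2 * groupsz + 3 * (groupwidth << ashift);
    uint64_t b_offset = logical_offset >> ashift;

    uint64_t group = logical_offset / groupsz;              /* 2 */
    uint64_t groupstart = (group * groupwidth) % ndisks;    /* 7 */

    b_offset = b_offset % (rowheight_sectors * groupwidth);

    uint64_t perm = group / ngroups;                        /* 0 */
    uint64_t row = (perm * ((groupwidth * ngroups) / ndisks)) +
        (((group % ngroups) * groupwidth) / ndisks);        /* 1 */
    uint64_t physical = ((rowheight_sectors * row) +
        (b_offset / groupwidth)) << ashift;                 /* 16M + 12K */

    printf("group %llu groupstart %llu perm %llu physical %llu\n",
        (unsigned long long)group, (unsigned long long)groupstart,
        (unsigned long long)perm, (unsigned long long)physical);
    return (0);
}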
+
+static uint64_t
+vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
+ uint64_t abd_offset, uint64_t abd_size)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ uint64_t io_size = abd_size;
+ uint64_t io_asize = vdev_draid_asize(vd, io_size);
+ uint64_t group = vdev_draid_offset_to_group(vd, io_offset);
+ uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1);
+
+ /*
+ * Limit the io_size to the space remaining in the group. A second
+ * row in the raidz_map_t is created for the remainder.
+ */
+ if (io_offset + io_asize > start_offset) {
+ io_size = vdev_draid_asize_to_psize(vd,
+ start_offset - io_offset);
+ }
+
+ /*
+ * At most a block may span the logical end of one group and the start
+ * of the next group. Therefore, at the end of a group the io_size must
+ * span the group width evenly and the remainder must be aligned to the
+ * start of the next group.
+ */
+ IMPLY(abd_offset == 0 && io_size < zio->io_size,
+ (io_asize >> ashift) % vdc->vdc_groupwidth == 0);
+ IMPLY(abd_offset != 0,
+ vdev_draid_group_to_offset(vd, group) == io_offset);
+
+ /* Lookup starting byte offset on each child vdev */
+ uint64_t groupstart, perm;
+ uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+ io_offset, &perm, &groupstart);
+
+ /*
+ * If there are fewer than groupwidth drives available after the group
+ * start, the group is going to wrap onto the next row. 'wrap' is the
+ * group disk number that starts on the next row.
+ */
+ uint64_t ndisks = vdc->vdc_ndisks;
+ uint64_t groupwidth = vdc->vdc_groupwidth;
+ uint64_t wrap = groupwidth;
+
+ if (groupstart + groupwidth > ndisks)
+ wrap = ndisks - groupstart;
+
+ /* The io size in units of the vdev's minimum sector size. */
+ const uint64_t psize = io_size >> ashift;
+
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ */
+ uint64_t q = psize / vdc->vdc_ndata;
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
+ uint64_t r = psize - q * vdc->vdc_ndata;
+
+ /* The number of "big columns" - those which contain remainder data. */
+ uint64_t bc = (r == 0 ? 0 : r + vdc->vdc_nparity);
+ ASSERT3U(bc, <, groupwidth);
+
+ /* The total number of data and parity sectors for this I/O. */
+ uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 0 : 1)));
+
+ raidz_row_t *rr;
+ rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP);
+ rr->rr_cols = groupwidth;
+ rr->rr_scols = groupwidth;
+ rr->rr_bigcols = bc;
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+ rr->rr_firstdatacol = vdc->vdc_nparity;
+ rr->rr_abd_copy = NULL;
+ rr->rr_abd_empty = NULL;
+#ifdef ZFS_DEBUG
+ rr->rr_offset = io_offset;
+ rr->rr_size = io_size;
+#endif
+ *rrp = rr;
+
+ uint8_t *base;
+ uint64_t iter, asize = 0;
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+ for (uint64_t i = 0; i < groupwidth; i++) {
+ raidz_col_t *rc = &rr->rr_col[i];
+ uint64_t c = (groupstart + i) % ndisks;
+
+ /* increment the offset if we wrap to the next row */
+ if (i == wrap)
+ physical_offset += VDEV_DRAID_ROWHEIGHT;
+
+ rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c);
+ rc->rc_offset = physical_offset;
+ rc->rc_abd = NULL;
+ rc->rc_gdata = NULL;
+ rc->rc_orig_data = NULL;
+ rc->rc_error = 0;
+ rc->rc_tried = 0;
+ rc->rc_skipped = 0;
+ rc->rc_repair = 0;
+ rc->rc_need_orig_restore = B_FALSE;
+
+ if (q == 0 && i >= bc)
+ rc->rc_size = 0;
+ else if (i < bc)
+ rc->rc_size = (q + 1) << ashift;
+ else
+ rc->rc_size = q << ashift;
+
+ asize += rc->rc_size;
+ }
+
+ ASSERT3U(asize, ==, tot << ashift);
+ rr->rr_nempty = roundup(tot, groupwidth) - tot;
+ IMPLY(bc > 0, rr->rr_nempty == groupwidth - bc);
+
+ /* Allocate buffers for the parity columns */
+ for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
+ }
+
+ /*
+ * Map buffers for data columns and allocate/map buffers for skip
+ * sectors. There are three distinct cases for dRAID which are
+ * required to support sequential rebuild.
+ */
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ vdev_draid_map_alloc_write(zio, abd_offset, rr);
+ } else if ((rr->rr_nempty > 0) &&
+ (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
+ vdev_draid_map_alloc_scrub(zio, abd_offset, rr);
+ } else {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ vdev_draid_map_alloc_read(zio, abd_offset, rr);
+ }
+
+ return (io_size);
+}
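
A stand-alone sketch (not part of the commit) of the per-row geometry computed above for an assumed ndata=8, nparity=1, ashift=12 group: how a given psize is carved into big, short, and empty (skip sector) columns.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    const uint64_t ndata = 8, nparity = 1, ashift = 12;
    const uint64_t groupwidth = ndata + nparity;
    const uint64_t psizes[] = { 3 * 4096, 12 * 4096 };

    for (int n = 0; n < 2; n++) {
        uint64_t psize = psizes[n] >> ashift;   /* data sectors */
        uint64_t q = psize / ndata;             /* full stripes */
        uint64_t r = psize - q * ndata;         /* remainder sectors */
        uint64_t bc = (r == 0 ? 0 : r + nparity);
        uint64_t tot = psize + nparity * (q + (r == 0 ? 0 : 1));
        uint64_t nempty = ((tot + groupwidth - 1) / groupwidth) *
            groupwidth - tot;

        printf("%llu data sectors: q=%llu r=%llu bigcols=%llu "
            "total=%llu skip=%llu\n",
            (unsigned long long)psize, (unsigned long long)q,
            (unsigned long long)r, (unsigned long long)bc,
            (unsigned long long)tot, (unsigned long long)nempty);

        for (uint64_t i = 0; i < groupwidth; i++) {
            uint64_t size;

            if (q == 0 && i >= bc)
                size = 0;
            else if (i < bc)
                size = (q + 1) << ashift;
            else
                size = q << ashift;
            printf("  col %llu: %llu bytes%s\n",
                (unsigned long long)i, (unsigned long long)size,
                i < nparity ? " (parity)" : size == 0 ?
                " (skip sector)" : "");
        }
    }
    return (0);
}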
+
+/*
+ * Allocate the raidz mapping to be applied to the dRAID I/O. The parity
+ * calculations for dRAID are identical to raidz however there are a few
+ * differences in the layout.
+ *
+ * - dRAID always allocates a full stripe width. Any extra sectors due to
+ * this padding are zero filled and written to disk. They will be read
+ * back during a scrub or repair operation since they are included in
+ * the parity calculation. This property enables sequential resilvering.
+ *
+ * - When the block at the logical offset spans redundancy groups then two
+ * rows are allocated in the raidz_map_t. One row resides at the end of
+ * the first group and the other at the start of the following group.
+ */
+static raidz_map_t *
+vdev_draid_map_alloc(zio_t *zio)
+{
+ raidz_row_t *rr[2];
+ uint64_t abd_offset = 0;
+ uint64_t abd_size = zio->io_size;
+ uint64_t io_offset = zio->io_offset;
+ uint64_t size;
+ int nrows = 1;
+
+ size = vdev_draid_map_alloc_row(zio, &rr[0], io_offset,
+ abd_offset, abd_size);
+ if (size < abd_size) {
+ vdev_t *vd = zio->io_vd;
+
+ io_offset += vdev_draid_asize(vd, size);
+ abd_offset += size;
+ abd_size -= size;
+ nrows++;
+
+ ASSERT3U(io_offset, ==, vdev_draid_group_to_offset(
+ vd, vdev_draid_offset_to_group(vd, io_offset)));
+ ASSERT3U(abd_offset, <, zio->io_size);
+ ASSERT3U(abd_size, !=, 0);
+
+ size = vdev_draid_map_alloc_row(zio, &rr[1],
+ io_offset, abd_offset, abd_size);
+ VERIFY3U(size, ==, abd_size);
+ }
+
+ raidz_map_t *rm;
+ rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[nrows]), KM_SLEEP);
+ rm->rm_ops = vdev_raidz_math_get_ops();
+ rm->rm_nrows = nrows;
+ rm->rm_row[0] = rr[0];
+ if (nrows == 2)
+ rm->rm_row[1] = rr[1];
+
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_draid_vsd_ops;
+
+ return (rm);
+}
+
+/*
+ * Given an offset into a dRAID return the next group width aligned offset
+ * which can be used to start an allocation.
+ */
+static uint64_t
+vdev_draid_get_astart(vdev_t *vd, const uint64_t start)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return (roundup(start, vdc->vdc_groupwidth << vd->vdev_ashift));
+}
+
+/*
+ * Allocatable space for dRAID is (children - nspares) * sizeof(smallest child)
+ * rounded down to the last full slice. So each child must provide at least
+ * 1 / (children - nspares) of its asize.
+ */
+static uint64_t
+vdev_draid_min_asize(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return ((vd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks));
+}
+
+/*
+ * When using dRAID the minimum allocation size is determined by the number
+ * of data disks in the redundancy group. Full stripes are always used.
+ */
+static uint64_t
+vdev_draid_min_alloc(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return (vdc->vdc_ndata << vd->vdev_ashift);
+}
+
+/*
+ * Returns true if the txg range does not exist on any leaf vdev.
+ *
+ * A dRAID spare does not fit into the DTL model. While it has child vdevs
+ * there is no redundancy among them, and the effective child vdev is
+ * determined by offset. Essentially we do a vdev_dtl_reassess() on the
+ * fly by replacing a dRAID spare with the child vdev under the offset.
+ * Note that it is a recursive process because the child vdev can be
+ * another dRAID spare and so on.
+ */
+boolean_t
+vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
+ uint64_t size)
+{
+ if (vd->vdev_ops == &vdev_spare_ops ||
+ vd->vdev_ops == &vdev_replacing_ops) {
+ /*
+ * Check all of the readable children; if any child
+ * contains the txg range then the data is not missing.
+ */
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (!vdev_readable(cvd))
+ continue;
+
+ if (!vdev_draid_missing(cvd, physical_offset,
+ txg, size))
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+ }
+
+ if (vd->vdev_ops == &vdev_draid_spare_ops) {
+ /*
+ * When sequentially resilvering we don't have a proper
+ * txg range so instead we must presume all txgs are
+ * missing on this vdev until the resilver completes.
+ */
+ if (vd->vdev_rebuild_txg != 0)
+ return (B_TRUE);
+
+ /*
+ * DTL_MISSING is set for all prior txgs when a resilver
+ * is started in spa_vdev_attach().
+ */
+ if (vdev_dtl_contains(vd, DTL_MISSING, txg, size))
+ return (B_TRUE);
+
+ /*
+ * Consult the DTL on the relevant vdev. Either a vdev
+ * leaf or spare/replace mirror child may be returned so
+ * we must recursively call vdev_draid_missing().
+ */
+ vd = vdev_draid_spare_get_child(vd, physical_offset);
+ if (vd == NULL)
+ return (B_TRUE);
+
+ return (vdev_draid_missing(vd, physical_offset,
+ txg, size));
+ }
+
+ return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+}
+
+/*
+ * Returns true if the txg is only partially replicated on the leaf vdevs.
+ */
+static boolean_t
+vdev_draid_partial(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
+ uint64_t size)
+{
+ if (vd->vdev_ops == &vdev_spare_ops ||
+ vd->vdev_ops == &vdev_replacing_ops) {
+ /*
+ * Check all of the readable children; if any child is
+ * missing the txg range then it is partially replicated.
+ */
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (!vdev_readable(cvd))
+ continue;
+
+ if (vdev_draid_partial(cvd, physical_offset, txg, size))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+ }
+
+ if (vd->vdev_ops == &vdev_draid_spare_ops) {
+ /*
+ * When sequentially resilvering we don't have a proper
+ * txg range so instead we must presume all txgs are
+ * missing on this vdev until the resilver completes.
+ */
+ if (vd->vdev_rebuild_txg != 0)
+ return (B_TRUE);
+
+ /*
+ * DTL_MISSING is set for all prior txgs when a resilver
+ * is started in spa_vdev_attach().
+ */
+ if (vdev_dtl_contains(vd, DTL_MISSING, txg, size))
+ return (B_TRUE);
+
+ /*
+ * Consult the DTL on the relevant vdev. Either a leaf
+ * vdev or a spare/replacing mirror child may be returned,
+ * so we must recursively call vdev_draid_partial().
+ */
+ vd = vdev_draid_spare_get_child(vd, physical_offset);
+ if (vd == NULL)
+ return (B_TRUE);
+
+ return (vdev_draid_partial(vd, physical_offset, txg, size));
+ }
+
+ return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+}
+
+/*
+ * Determine if the vdev is readable at the given offset.
+ */
+boolean_t
+vdev_draid_readable(vdev_t *vd, uint64_t physical_offset)
+{
+ if (vd->vdev_ops == &vdev_draid_spare_ops) {
+ vd = vdev_draid_spare_get_child(vd, physical_offset);
+ if (vd == NULL)
+ return (B_FALSE);
+ }
+
+ if (vd->vdev_ops == &vdev_spare_ops ||
+ vd->vdev_ops == &vdev_replacing_ops) {
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (!vdev_readable(cvd))
+ continue;
+
+ if (vdev_draid_readable(cvd, physical_offset))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+ }
+
+ return (vdev_readable(vd));
+}
+
+/*
+ * Returns the first distributed spare found under the provided vdev tree.
+ */
+static vdev_t *
+vdev_draid_find_spare(vdev_t *vd)
+{
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return (vd);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *svd = vdev_draid_find_spare(vd->vdev_child[c]);
+ if (svd != NULL)
+ return (svd);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Returns B_TRUE if the passed in vdev is currently "faulted".
+ * Faulted, in this context, means that the vdev represents a
+ * replacing or sparing vdev tree.
+ */
+static boolean_t
+vdev_draid_faulted(vdev_t *vd, uint64_t physical_offset)
+{
+ if (vd->vdev_ops == &vdev_draid_spare_ops) {
+ vd = vdev_draid_spare_get_child(vd, physical_offset);
+ if (vd == NULL)
+ return (B_FALSE);
+
+ /*
+ * After resolving the distributed spare to a leaf vdev
+ * check the parent to determine if it's "faulted".
+ */
+ vd = vd->vdev_parent;
+ }
+
+ return (vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+}
+
+/*
+ * Determine if the dRAID block at the logical offset is degraded.
+ * Used by sequential resilver.
+ */
+static boolean_t
+vdev_draid_group_degraded(vdev_t *vd, uint64_t offset)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+ ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
+
+ uint64_t groupstart, perm;
+ uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+ offset, &perm, &groupstart);
+
+ uint8_t *base;
+ uint64_t iter;
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+ for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+ uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
+ uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c);
+ vdev_t *cvd = vd->vdev_child[cid];
+
+ /* Group contains a faulted vdev. */
+ if (vdev_draid_faulted(cvd, physical_offset))
+ return (B_TRUE);
+
+ /*
+ * Always check groups with active distributed spares
+ * because any vdev failure in the pool will affect them.
+ */
+ if (vdev_draid_find_spare(cvd) != NULL)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Determine if the txg is missing. Used by healing resilver.
+ */
+static boolean_t
+vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg,
+ uint64_t size)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+ ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
+
+ uint64_t groupstart, perm;
+ uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+ offset, &perm, &groupstart);
+
+ uint8_t *base;
+ uint64_t iter;
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+ for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+ uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
+ uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c);
+ vdev_t *cvd = vd->vdev_child[cid];
+
+ /* Transaction group is known to be partially replicated. */
+ if (vdev_draid_partial(cvd, physical_offset, txg, size))
+ return (B_TRUE);
+
+ /*
+ * Always check groups with active distributed spares
+ * because any vdev failure in the pool will affect them.
+ */
+ if (vdev_draid_find_spare(cvd) != NULL)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Find the smallest child asize and largest sector size to calculate the
+ * available capacity. Distributed spares are ignored since their capacity
+ * is also based on the minimum child size in the top-level dRAID.
+ */
+static void
+vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep,
+ uint64_t *logical_ashiftp, uint64_t *physical_ashiftp)
+{
+ uint64_t logical_ashift = 0, physical_ashift = 0;
+ uint64_t asize = 0, max_asize = 0;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_ops == &vdev_draid_spare_ops)
+ continue;
+
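+ /*
+ * asize and max_asize start at zero, so on the first pass the
+ * unsigned "x - 1" wraps to UINT64_MAX and the MIN() simply
+ * takes the first child's size.
+ */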
+ asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1;
+ max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1;
+ logical_ashift = MAX(logical_ashift, cvd->vdev_ashift);
+ physical_ashift = MAX(physical_ashift,
+ cvd->vdev_physical_ashift);
+ }
+
+ *asizep = asize;
+ *max_asizep = max_asize;
+ *logical_ashiftp = logical_ashift;
+ *physical_ashiftp = physical_ashift;
+}
+
+/*
+ * Open spare vdevs.
+ */
+static boolean_t
+vdev_draid_open_spares(vdev_t *vd)
+{
+ return (vd->vdev_ops == &vdev_draid_spare_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+}
+
+/*
+ * Open all children, excluding spares.
+ */
+static boolean_t
+vdev_draid_open_children(vdev_t *vd)
+{
+ return (!vdev_draid_open_spares(vd));
+}
+
+/*
+ * Open a top-level dRAID vdev.
+ */
+static int
+vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+ uint64_t nparity = vdc->vdc_nparity;
+ int open_errors = 0;
+
+ if (nparity > VDEV_DRAID_MAXPARITY ||
+ vd->vdev_children < nparity + 1) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * First open the normal children then the distributed spares. This
+ * ordering is important to ensure the distributed spares calculate
+ * the correct psize in the event that the dRAID vdevs were expanded.
+ */
+ vdev_open_children_subset(vd, vdev_draid_open_children);
+ vdev_open_children_subset(vd, vdev_draid_open_spares);
+
+ /* Verify enough of the children are available to continue. */
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (vd->vdev_child[c]->vdev_open_error != 0) {
+ if ((++open_errors) > nparity) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (SET_ERROR(ENXIO));
+ }
+ }
+ }
+
+ /*
+ * Allocatable capacity is the sum of the space on all children less
+ * the number of distributed spares, rounded down to the last full row
+ * and then to the last full group. An additional 32MB of scratch
+ * space is reserved at the end of each child for use by the dRAID
+ * expansion feature.
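+ *
+ * Illustrative (hypothetical) numbers, assuming a 16MB row height:
+ * a 1024MB child yields (1024 - 32) = 992MB of usable rows; with
+ * vdc_ndisks = 8 and a 64MB group size the resulting top-level
+ * asize is ((992 * 8) / 64) * 64 = 7936MB.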
+ */
+ uint64_t child_asize, child_max_asize;
+ vdev_draid_calculate_asize(vd, &child_asize, &child_max_asize,
+ logical_ashift, physical_ashift);
+
+ /*
+ * Should be unreachable since the minimum child size is 64MB, but
+ * we want to make sure an underflow absolutely cannot occur here.
+ */
+ if (child_asize < VDEV_DRAID_REFLOW_RESERVE ||
+ child_max_asize < VDEV_DRAID_REFLOW_RESERVE) {
+ return (SET_ERROR(ENXIO));
+ }
+
+ child_asize = ((child_asize - VDEV_DRAID_REFLOW_RESERVE) /
+ VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
+ child_max_asize = ((child_max_asize - VDEV_DRAID_REFLOW_RESERVE) /
+ VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
+
+ *asize = (((child_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
+ vdc->vdc_groupsz);
+ *max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
+ vdc->vdc_groupsz);
+
+ return (0);
+}
+
+/*
+ * Close a top-level dRAID vdev.
+ */
+static void
+vdev_draid_close(vdev_t *vd)
+{
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (vd->vdev_child[c] != NULL)
+ vdev_close(vd->vdev_child[c]);
+ }
+}
+
+/*
+ * Return the maximum asize for a rebuild zio in the provided range
+ * given the following constraints. A dRAID chunk may not:
+ *
+ * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or
+ * - Span dRAID redundancy groups.
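+ *
+ * For example (hypothetical values): with 4 data disks and an ashift
+ * of 12, a 1MB max_segment expands to a 4MB chunk of data spread
+ * evenly across the data columns, further capped by SPA_MAXBLOCKSIZE
+ * and by the space remaining in the current redundancy group.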
+ */
+static uint64_t
+vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
+ uint64_t max_segment)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ uint64_t ashift = vd->vdev_ashift;
+ uint64_t ndata = vdc->vdc_ndata;
+ uint64_t psize = MIN(P2ROUNDUP(max_segment * ndata, 1 << ashift),
+ SPA_MAXBLOCKSIZE);
+
+ ASSERT3U(vdev_draid_get_astart(vd, start), ==, start);
+ ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0);
+
+ /* Chunks must evenly span all data columns in the group. */
+ psize = (((psize >> ashift) / ndata) * ndata) << ashift;
+ uint64_t chunk_size = MIN(asize, vdev_psize_to_asize(vd, psize));
+
+ /* Reduce the chunk size to the group space remaining. */
+ uint64_t group = vdev_draid_offset_to_group(vd, start);
+ uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start;
+ chunk_size = MIN(chunk_size, left);
+
+ ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0);
+ ASSERT3U(vdev_draid_offset_to_group(vd, start), ==,
+ vdev_draid_offset_to_group(vd, start + chunk_size - 1));
+
+ return (chunk_size);
+}
+
+/*
+ * Align the start of the metaslab to the group width and slightly reduce
+ * its size to a multiple of the group width. Since full stripe writes are
+ * required by dRAID this space is unallocable. Furthermore, aligning the
+ * metaslab start is important for vdev initialize and TRIM, which both operate
+ * on metaslab boundaries that vdev_xlate() expects to be aligned.
+ */
+static void
+vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ uint64_t sz = vdc->vdc_groupwidth << vd->vdev_ashift;
+ uint64_t astart = vdev_draid_get_astart(vd, *ms_start);
+ uint64_t asize = ((*ms_size - (astart - *ms_start)) / sz) * sz;
+
+ *ms_start = astart;
+ *ms_size = asize;
+
+ ASSERT0(*ms_start % sz);
+ ASSERT0(*ms_size % sz);
+}
+
+/*
+ * Add virtual dRAID spares to the list of valid spares. In order to accomplish
+ * this the existing array must be freed and reallocated with the additional
+ * entries.
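+ *
+ * The generated spare names follow the pattern draid<parity>-<vdev>-<spare>,
+ * e.g. (hypothetical) "draid2-0-1" for the second distributed spare of the
+ * first draid2 top-level vdev.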
+ */
+int
+vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp,
+ uint64_t next_vdev_id)
+{
+ uint64_t draid_nspares = 0;
+ uint64_t ndraid = 0;
+ int error;
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_t *cvd = vd->vdev_child[i];
+
+ if (cvd->vdev_ops == &vdev_draid_ops) {
+ vdev_draid_config_t *vdc = cvd->vdev_tsd;
+ draid_nspares += vdc->vdc_nspares;
+ ndraid++;
+ }
+ }
+
+ if (draid_nspares == 0) {
+ *ndraidp = ndraid;
+ return (0);
+ }
+
+ nvlist_t **old_spares, **new_spares;
+ uint_t old_nspares;
+ error = nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &old_spares, &old_nspares);
+ if (error)
+ old_nspares = 0;
+
+ /* Allocate memory and copy the existing spares. */
+ new_spares = kmem_alloc(sizeof (nvlist_t *) *
+ (draid_nspares + old_nspares), KM_SLEEP);
+ for (uint_t i = 0; i < old_nspares; i++)
+ new_spares[i] = fnvlist_dup(old_spares[i]);
+
+ /* Add new distributed spares to ZPOOL_CONFIG_SPARES. */
+ uint64_t n = old_nspares;
+ for (uint64_t vdev_id = 0; vdev_id < vd->vdev_children; vdev_id++) {
+ vdev_t *cvd = vd->vdev_child[vdev_id];
+ char path[64];
+
+ if (cvd->vdev_ops != &vdev_draid_ops)
+ continue;
+
+ vdev_draid_config_t *vdc = cvd->vdev_tsd;
+ uint64_t nspares = vdc->vdc_nspares;
+ uint64_t nparity = vdc->vdc_nparity;
+
+ for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) {
+ bzero(path, sizeof (path));
+ (void) snprintf(path, sizeof (path) - 1,
+ "%s%llu-%llu-%llu", VDEV_TYPE_DRAID,
+ (u_longlong_t)nparity,
+ (u_longlong_t)next_vdev_id + vdev_id,
+ (u_longlong_t)spare_id);
+
+ nvlist_t *spare = fnvlist_alloc();
+ fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, path);
+ fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_DRAID_SPARE);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_TOP_GUID,
+ cvd->vdev_guid);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_SPARE_ID,
+ spare_id);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_LOG, 0);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_SPARE, 1);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_WHOLE_DISK, 1);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_ASHIFT,
+ cvd->vdev_ashift);
+
+ new_spares[n] = spare;
+ n++;
+ }
+ }
+
+ if (n > 0) {
+ (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES);
+ fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ new_spares, n);
+ }
+
+ for (int i = 0; i < n; i++)
+ nvlist_free(new_spares[i]);
+
+ kmem_free(new_spares, sizeof (*new_spares) * n);
+ *ndraidp = ndraid;
+
+ return (0);
+}
+
+/*
+ * Determine if any portion of the provided block resides on a child vdev
+ * with a dirty DTL and therefore needs to be resilvered.
+ */
+static boolean_t
+vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t asize = vdev_draid_asize(vd, psize);
+
+ if (phys_birth == TXG_UNKNOWN) {
+ /*
+ * Sequential resilver. There is no meaningful phys_birth
+ * for this block; we can only determine whether the block
+ * resides in a degraded group, in which case it must be
+ * resilvered.
+ */
+ ASSERT3U(vdev_draid_offset_to_group(vd, offset), ==,
+ vdev_draid_offset_to_group(vd, offset + asize - 1));
+
+ return (vdev_draid_group_degraded(vd, offset));
+ } else {
+ /*
+ * Healing resilver. TXGs not in DTL_PARTIAL are intact,
+ * as are blocks in non-degraded groups.
+ */
+ if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
+ return (B_FALSE);
+
+ if (vdev_draid_group_missing(vd, offset, phys_birth, 1))
+ return (B_TRUE);
+
+ /* The block may span groups in which case check both. */
+ if (vdev_draid_offset_to_group(vd, offset) !=
+ vdev_draid_offset_to_group(vd, offset + asize - 1)) {
+ if (vdev_draid_group_missing(vd,
+ offset + asize, phys_birth, 1))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+ }
+}
+
+static boolean_t
+vdev_draid_rebuilding(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
+ return (B_TRUE);
+
+ for (int i = 0; i < vd->vdev_children; i++) {
+ if (vdev_draid_rebuilding(vd->vdev_child[i])) {
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+static void
+vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
+{
+#ifdef ZFS_DEBUG
+ range_seg64_t logical_rs, physical_rs, remain_rs;
+ logical_rs.rs_start = rr->rr_offset;
+ logical_rs.rs_end = logical_rs.rs_start +
+ vdev_draid_asize(vd, rr->rr_size);
+
+ raidz_col_t *rc = &rr->rr_col[col];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
+ ASSERT(vdev_xlate_is_empty(&remain_rs));
+ ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+ ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+ ASSERT3U(rc->rc_offset + rc->rc_size, ==, physical_rs.rs_end);
+#endif
+}
+
+/*
+ * For write operations:
+ * 1. Generate the parity data
+ * 2. Create child zio write operations to each column's vdev, for both
+ * data and parity. A gang ABD is allocated by vdev_draid_map_alloc()
+ * if a skip sector needs to be added to a column.
+ */
+static void
+vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+ raidz_map_t *rm = zio->io_vsd;
+
+ vdev_raidz_generate_parity_row(rm, rr);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ /*
+ * Empty columns are zero filled and included in the parity
+ * calculation and therefore must be written.
+ */
+ ASSERT3U(rc->rc_size, !=, 0);
+
+ /* Verify physical to logical translation */
+ vdev_draid_io_verify(vd, rr, c);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx], rc->rc_offset,
+ rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
+ 0, vdev_raidz_child_done, rc));
+ }
+}
+
+/*
+ * For read operations:
+ * 1. The vdev_draid_map_alloc() function will create a minimal raidz
+ * mapping for the read based on the zio->io_flags. There are two
+ * possible mappings: 1) a normal read, or 2) a scrub/resilver.
+ * 2. Create the zio read operations. This will include all parity
+ * columns and skip sectors for a scrub/resilver.
+ */
+static void
+vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+
+ /* Sequential rebuild must do IO at redundancy group boundary. */
+ IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0);
+
+ /*
+ * Iterate over the columns in reverse order so that we hit the parity
+ * last. Any errors along the way will force us to read the parity.
+ * For scrub/resilver IOs which verify skip sectors, a gang ABD will
+ * have been allocated to store them and rc->rc_size is increased.
+ */
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ if (!vdev_draid_readable(cvd, rc->rc_offset)) {
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
+ else
+ rr->rr_missingparity++;
+ rc->rc_error = SET_ERROR(ENXIO);
+ rc->rc_tried = 1;
+ rc->rc_skipped = 1;
+ continue;
+ }
+
+ if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) {
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
+ else
+ rr->rr_missingparity++;
+ rc->rc_error = SET_ERROR(ESTALE);
+ rc->rc_skipped = 1;
+ continue;
+ }
+
+ /*
+ * Empty columns may be read during vdev_draid_io_done().
+ * Only skip them after the readable and missing checks
+ * verify they are available.
+ */
+ if (rc->rc_size == 0) {
+ rc->rc_skipped = 1;
+ continue;
+ }
+
+ if (zio->io_flags & ZIO_FLAG_RESILVER) {
+ vdev_t *svd;
+
+ /*
+ * If this child is a distributed spare then the
+ * offset might reside on the vdev being replaced,
+ * in which case this data must be written to the
+ * new device. Failure to do so would result in
+ * checksum errors when the old device is detached
+ * and the pool is scrubbed.
+ */
+ if ((svd = vdev_draid_find_spare(cvd)) != NULL) {
+ svd = vdev_draid_spare_get_child(svd,
+ rc->rc_offset);
+ if (svd && (svd->vdev_ops == &vdev_spare_ops ||
+ svd->vdev_ops == &vdev_replacing_ops)) {
+ rc->rc_repair = 1;
+ }
+ }
+
+ /*
+ * Always issue a repair IO to this child when it is
+ * a spare or replacing vdev with an active rebuild.
+ */
+ if ((cvd->vdev_ops == &vdev_spare_ops ||
+ cvd->vdev_ops == &vdev_replacing_ops) &&
+ vdev_draid_rebuilding(cvd)) {
+ rc->rc_repair = 1;
+ }
+ }
+ }
+
+ /*
+ * If either a parity or data column is missing, a repair may be
+ * attempted by vdev_draid_io_done(). Expand the raid map to read
+ * in the empty columns, which are needed along with the parity
+ * during reconstruction.
+ */
+ if ((rr->rr_missingdata > 0 || rr->rr_missingparity > 0) &&
+ rr->rr_nempty > 0 && rr->rr_abd_empty == NULL) {
+ vdev_draid_map_alloc_empty(zio, rr);
+ }
+
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ if (rc->rc_error || rc->rc_size == 0)
+ continue;
+
+ if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
+ (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ }
+ }
+}
+
+/*
+ * Start an IO operation to a dRAID vdev.
+ */
+static void
+vdev_draid_io_start(zio_t *zio)
+{
+ vdev_t *vd __maybe_unused = zio->io_vd;
+ raidz_map_t *rm;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+ ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset));
+
+ rm = vdev_draid_map_alloc(zio);
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_draid_io_start_write(zio, rm->rm_row[i]);
+ }
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_draid_io_start_read(zio, rm->rm_row[i]);
+ }
+ }
+
+ zio_execute(zio);
+}
+
+/*
+ * Complete an IO operation on a dRAID vdev. The raidz logic can be applied
+ * to dRAID since the layout is fully described by the raidz_map_t.
+ */
+static void
+vdev_draid_io_done(zio_t *zio)
+{
+ vdev_raidz_io_done(zio);
+}
+
+static void
+vdev_draid_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+ ASSERT(vd->vdev_ops == &vdev_draid_ops);
+
+ if (faulted > vdc->vdc_nparity)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else if (degraded + faulted != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+static void
+vdev_draid_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+{
+ vdev_t *raidvd = cvd->vdev_parent;
+ ASSERT(raidvd->vdev_ops == &vdev_draid_ops);
+
+ vdev_draid_config_t *vdc = raidvd->vdev_tsd;
+ uint64_t ashift = raidvd->vdev_top->vdev_ashift;
+
+ /* Make sure the offsets are block-aligned */
+ ASSERT0(logical_rs->rs_start % (1 << ashift));
+ ASSERT0(logical_rs->rs_end % (1 << ashift));
+
+ uint64_t logical_start = logical_rs->rs_start;
+ uint64_t logical_end = logical_rs->rs_end;
+
+ /*
+ * Unaligned ranges must be skipped. All metaslabs are correctly
+ * aligned so this should not happen, but the case is handled
+ * here for the benefit of future callers.
+ */
+ uint64_t astart = vdev_draid_get_astart(raidvd, logical_start);
+ if (astart != logical_start) {
+ physical_rs->rs_start = logical_start;
+ physical_rs->rs_end = logical_start;
+ remain_rs->rs_start = MIN(astart, logical_end);
+ remain_rs->rs_end = logical_end;
+ return;
+ }
+
+ /*
+ * Unlike with mirrors and raidz a dRAID logical range can map
+ * to multiple non-contiguous physical ranges. This is handled by
+ * limiting the size of the logical range to a single group and
+ * setting the remain argument such that it describes the remaining
+ * unmapped logical range. This is stricter than absolutely
+ * necessary but helps simplify the logic below.
+ */
+ uint64_t group = vdev_draid_offset_to_group(raidvd, logical_start);
+ uint64_t nextstart = vdev_draid_group_to_offset(raidvd, group + 1);
+ if (logical_end > nextstart)
+ logical_end = nextstart;
+
+ /* Find the starting offset for each vdev in the group */
+ uint64_t perm, groupstart;
+ uint64_t start = vdev_draid_logical_to_physical(raidvd,
+ logical_start, &perm, &groupstart);
+ uint64_t end = start;
+
+ uint8_t *base;
+ uint64_t iter, id;
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+ /*
+ * Check if the passed child falls within the group. If it does
+ * update the start and end to reflect the physical range.
+ * Otherwise, leave them unmodified which will result in an empty
+ * (zero-length) physical range being returned.
+ */
+ for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+ uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
+
+ if (c == 0 && i != 0) {
+ /* the group wrapped, increment the start */
+ start += VDEV_DRAID_ROWHEIGHT;
+ end = start;
+ }
+
+ id = vdev_draid_permute_id(vdc, base, iter, c);
+ if (id == cvd->vdev_id) {
+ uint64_t b_size = (logical_end >> ashift) -
+ (logical_start >> ashift);
+ ASSERT3U(b_size, >, 0);
+ end = start + ((((b_size - 1) /
+ vdc->vdc_groupwidth) + 1) << ashift);
+ break;
+ }
+ }
+ physical_rs->rs_start = start;
+ physical_rs->rs_end = end;
+
+ /*
+ * Only top-level vdevs are allowed to set remain_rs because
+ * when .vdev_op_xlate() is called for their children the full
+ * logical range is not provided by vdev_xlate().
+ */
+ remain_rs->rs_start = logical_end;
+ remain_rs->rs_end = logical_rs->rs_end;
+
+ ASSERT3U(physical_rs->rs_start, <=, logical_start);
+ ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
+ logical_end - logical_start);
+}
+
+/*
+ * Add dRAID specific fields to the config nvlist.
+ */
+static void
+vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdc->vdc_nparity);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, vdc->vdc_ndata);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, vdc->vdc_nspares);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, vdc->vdc_ngroups);
+}
+
+/*
+ * Initialize private dRAID specific fields from the nvlist.
+ */
+static int
+vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+ uint64_t ndata, nparity, nspares, ngroups;
+ int error;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata))
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) ||
+ nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ uint_t children;
+ nvlist_t **child;
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0 || children == 0 ||
+ children > VDEV_DRAID_MAX_CHILDREN) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) ||
+ nspares > 100 || nspares > (children - (ndata + nparity))) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) ||
+ ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Validate the minimum number of children exist per group for the
+ * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4).
+ */
+ if (children < (ndata + nparity + nspares))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Create the dRAID configuration using the pool nvlist configuration
+ * and the fixed mapping for the correct number of children.
+ */
+ vdev_draid_config_t *vdc;
+ const draid_map_t *map;
+
+ error = vdev_draid_lookup_map(children, &map);
+ if (error)
+ return (SET_ERROR(EINVAL));
+
+ vdc = kmem_zalloc(sizeof (*vdc), KM_SLEEP);
+ vdc->vdc_ndata = ndata;
+ vdc->vdc_nparity = nparity;
+ vdc->vdc_nspares = nspares;
+ vdc->vdc_children = children;
+ vdc->vdc_ngroups = ngroups;
+ vdc->vdc_nperms = map->dm_nperms;
+
+ error = vdev_draid_generate_perms(map, &vdc->vdc_perms);
+ if (error) {
+ kmem_free(vdc, sizeof (*vdc));
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Derived constants.
+ */
+ vdc->vdc_groupwidth = vdc->vdc_ndata + vdc->vdc_nparity;
+ vdc->vdc_ndisks = vdc->vdc_children - vdc->vdc_nspares;
+ vdc->vdc_groupsz = vdc->vdc_groupwidth * VDEV_DRAID_ROWHEIGHT;
+ vdc->vdc_devslicesz = (vdc->vdc_groupsz * vdc->vdc_ngroups) /
+ vdc->vdc_ndisks;
+
+ ASSERT3U(vdc->vdc_groupwidth, >=, 2);
+ ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks);
+ ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT);
+ ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT);
+ ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0);
+ ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) %
+ vdc->vdc_ndisks, ==, 0);
+
+ *tsd = vdc;
+
+ return (0);
+}
+
+static void
+vdev_draid_fini(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ vmem_free(vdc->vdc_perms, sizeof (uint8_t) *
+ vdc->vdc_children * vdc->vdc_nperms);
+ kmem_free(vdc, sizeof (*vdc));
+}
+
+static uint64_t
+vdev_draid_nparity(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ return (vdc->vdc_nparity);
+}
+
+static uint64_t
+vdev_draid_ndisks(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ return (vdc->vdc_ndisks);
+}
+
+vdev_ops_t vdev_draid_ops = {
+ .vdev_op_init = vdev_draid_init,
+ .vdev_op_fini = vdev_draid_fini,
+ .vdev_op_open = vdev_draid_open,
+ .vdev_op_close = vdev_draid_close,
+ .vdev_op_asize = vdev_draid_asize,
+ .vdev_op_min_asize = vdev_draid_min_asize,
+ .vdev_op_min_alloc = vdev_draid_min_alloc,
+ .vdev_op_io_start = vdev_draid_io_start,
+ .vdev_op_io_done = vdev_draid_io_done,
+ .vdev_op_state_change = vdev_draid_state_change,
+ .vdev_op_need_resilver = vdev_draid_need_resilver,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_draid_xlate,
+ .vdev_op_rebuild_asize = vdev_draid_rebuild_asize,
+ .vdev_op_metaslab_init = vdev_draid_metaslab_init,
+ .vdev_op_config_generate = vdev_draid_config_generate,
+ .vdev_op_nparity = vdev_draid_nparity,
+ .vdev_op_ndisks = vdev_draid_ndisks,
+ .vdev_op_type = VDEV_TYPE_DRAID,
+ .vdev_op_leaf = B_FALSE,
+};
+
+
+/*
+ * A dRAID distributed spare is a virtual leaf vdev which is included in the
+ * parent dRAID configuration. The last N columns of the dRAID permutation
+ * table are used to determine on which dRAID children a specific offset
+ * should be written. These spare leaf vdevs can only be used to replace
+ * faulted children in the same dRAID configuration.
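+ *
+ * For example (hypothetical layout): in an 11-child dRAID with 2
+ * distributed spares, spare id 0 maps to permuted column 10 and spare
+ * id 1 to permuted column 9, matching the lookup performed by
+ * vdev_draid_spare_get_child().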
+ */
+
+/*
+ * Distributed spare state. All fields are set when the distributed spare is
+ * first opened and are immutable.
+ */
+typedef struct {
+ vdev_t *vds_draid_vdev; /* top-level parent dRAID vdev */
+ uint64_t vds_top_guid; /* top-level parent dRAID guid */
+ uint64_t vds_spare_id; /* spare id (0 - vdc->vdc_nspares-1) */
+} vdev_draid_spare_t;
+
+/*
+ * Returns the parent dRAID vdev to which the distributed spare belongs.
+ * This may be safely called even when the vdev is not open.
+ */
+vdev_t *
+vdev_draid_spare_get_parent(vdev_t *vd)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
+
+ if (vds->vds_draid_vdev != NULL)
+ return (vds->vds_draid_vdev);
+
+ return (vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev,
+ vds->vds_top_guid));
+}
+
+/*
+ * A dRAID spare is active when it is the child of a vdev using
+ * vdev_spare_ops, vdev_replacing_ops or vdev_draid_ops.
+ */
+static boolean_t
+vdev_draid_spare_is_active(vdev_t *vd)
+{
+ vdev_t *pvd = vd->vdev_parent;
+
+ if (pvd != NULL && (pvd->vdev_ops == &vdev_spare_ops ||
+ pvd->vdev_ops == &vdev_replacing_ops ||
+ pvd->vdev_ops == &vdev_draid_ops)) {
+ return (B_TRUE);
+ } else {
+ return (B_FALSE);
+ }
+}
+
+/*
+ * Given a dRAID distributed spare vdev, returns the physical child vdev
+ * on which the provided offset resides. This may involve recursing through
+ * multiple layers of distributed spares. Note that offset is relative to
+ * this vdev.
+ */
+vdev_t *
+vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
+
+ /* The vdev is closed */
+ if (vds->vds_draid_vdev == NULL)
+ return (NULL);
+
+ vdev_t *tvd = vds->vds_draid_vdev;
+ vdev_draid_config_t *vdc = tvd->vdev_tsd;
+
+ ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops);
+ ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares);
+
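+ /*
+ * Each permutation spans vdc_devslicesz bytes on every child, so
+ * the permutation row for this offset is offset / vdc_devslicesz.
+ */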
+ uint8_t *base;
+ uint64_t iter;
+ uint64_t perm = physical_offset / vdc->vdc_devslicesz;
+
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+ uint64_t cid = vdev_draid_permute_id(vdc, base, iter,
+ (tvd->vdev_children - 1) - vds->vds_spare_id);
+ vdev_t *cvd = tvd->vdev_child[cid];
+
+ if (cvd->vdev_ops == &vdev_draid_spare_ops)
+ return (vdev_draid_spare_get_child(cvd, physical_offset));
+
+ return (cvd);
+}
+
+/* ARGSUSED */
+static void
+vdev_draid_spare_close(vdev_t *vd)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+ vds->vds_draid_vdev = NULL;
+}
+
+/*
+ * Opening a dRAID spare device is done by looking up the associated dRAID
+ * top-level vdev guid from the spare configuration.
+ */
+static int
+vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ uint64_t asize, max_asize;
+
+ vdev_t *tvd = vdev_lookup_by_guid(rvd, vds->vds_top_guid);
+ if (tvd == NULL) {
+ /*
+ * When spa_vdev_add() is labeling new spares the
+ * associated dRAID is not attached to the root vdev
+ * nor does this spare have a parent. Simulate a valid
+ * device in order to allow the label to be initialized
+ * and the distributed spare added to the configuration.
+ */
+ if (vd->vdev_parent == NULL) {
+ *psize = *max_psize = SPA_MINDEVSIZE;
+ *logical_ashift = *physical_ashift = ASHIFT_MIN;
+ return (0);
+ }
+
+ return (SET_ERROR(EINVAL));
+ }
+
+ vdev_draid_config_t *vdc = tvd->vdev_tsd;
+ if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (vds->vds_spare_id >= vdc->vdc_nspares)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Neither tvd->vdev_asize nor tvd->vdev_max_asize can be used here
+ * because the caller may be vdev_draid_open() in which case the
+ * values are stale as they haven't yet been updated by vdev_open().
+ * To avoid this always recalculate the dRAID asize and max_asize.
+ */
+ vdev_draid_calculate_asize(tvd, &asize, &max_asize,
+ logical_ashift, physical_ashift);
+
+ *psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+ *max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+
+ vds->vds_draid_vdev = tvd;
+
+ return (0);
+}
+
+/*
+ * Completed distributed spare IO. Store the result in the parent zio
+ * as if it had performed the operation itself. Only the first error is
+ * preserved if there are multiple errors.
+ */
+static void
+vdev_draid_spare_child_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_private;
+
+ /*
+ * IOs are issued to non-writable vdevs in order to keep their
+ * DTLs accurate. However, we don't want to propagate the
+ * error into the distributed spare's DTL. When resilvering
+ * vdev_draid_need_resilver() will consult the relevant DTL
+ * to determine if the data is missing and must be repaired.
+ */
+ if (!vdev_writeable(zio->io_vd))
+ return;
+
+ if (pio->io_error == 0)
+ pio->io_error = zio->io_error;
+}
+
+/*
+ * Returns a valid label nvlist for the distributed spare vdev. This is
+ * used to bypass the IO pipeline to avoid the complexity of constructing
+ * a complete label with valid checksum to return when read.
+ */
+nvlist_t *
+vdev_draid_read_config_spare(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ spa_aux_vdev_t *sav = &spa->spa_spares;
+ uint64_t guid = vd->vdev_guid;
+
+ nvlist_t *nv = fnvlist_alloc();
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa));
+ fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa));
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_STATE,
+ vdev_draid_spare_is_active(vd) ?
+ POOL_STATE_ACTIVE : POOL_STATE_SPARE);
+
+ /* Set the vdev guid based on the vdev list in sav_count. */
+ for (int i = 0; i < sav->sav_count; i++) {
+ if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops &&
+ strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) {
+ guid = sav->sav_vdevs[i]->vdev_guid;
+ break;
+ }
+ }
+
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid);
+
+ return (nv);
+}
+
+/*
+ * Handle any ioctl requested of the distributed spare. Only flushes
+ * are supported in which case all children must be flushed.
+ */
+static int
+vdev_draid_spare_ioctl(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ int error = 0;
+
+ if (zio->io_cmd == DKIOCFLUSHWRITECACHE) {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[c], zio->io_offset, zio->io_abd,
+ zio->io_size, zio->io_type, zio->io_priority, 0,
+ vdev_draid_spare_child_done, zio));
+ }
+ } else {
+ error = SET_ERROR(ENOTSUP);
+ }
+
+ return (error);
+}
+
+/*
+ * Initiate an IO to the distributed spare. For normal IOs this entails using
+ * the zio->io_offset and permutation table to calculate which child dRAID vdev
+ * is responsible for the data, then passing the zio along to that child to
+ * perform the actual IO. The label ranges are not stored on disk and require
+ * some special handling which is described below.
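+ *
+ * Note that for data IO the offset handed to the child is the zio offset
+ * less VDEV_LABEL_START_SIZE, since the spare's front label region is
+ * simulated rather than mapped onto the dRAID children.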
+ */
+static void
+vdev_draid_spare_io_start(zio_t *zio)
+{
+ vdev_t *cvd = NULL, *vd = zio->io_vd;
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+ uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE;
+
+ /*
+ * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+ * Nothing to be done here but return failure.
+ */
+ if (vds == NULL) {
+ zio->io_error = ENXIO;
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_type) {
+ case ZIO_TYPE_IOCTL:
+ zio->io_error = vdev_draid_spare_ioctl(zio);
+ break;
+
+ case ZIO_TYPE_WRITE:
+ if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
+ /*
+ * Accept probe IOs and config writers to simulate the
+ * existence of an on disk label. vdev_label_sync(),
+ * vdev_uberblock_sync() and vdev_copy_uberblocks()
+ * skip the distributed spares. This only leaves
+ * vdev_label_init(), which is allowed to succeed to
+ * avoid adding special cases to the function.
+ */
+ if (zio->io_flags & ZIO_FLAG_PROBE ||
+ zio->io_flags & ZIO_FLAG_CONFIG_WRITER) {
+ zio->io_error = 0;
+ } else {
+ zio->io_error = SET_ERROR(EIO);
+ }
+ } else {
+ cvd = vdev_draid_spare_get_child(vd, offset);
+
+ if (cvd == NULL) {
+ zio->io_error = SET_ERROR(ENXIO);
+ } else {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ offset, zio->io_abd, zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_draid_spare_child_done, zio));
+ }
+ }
+ break;
+
+ case ZIO_TYPE_READ:
+ if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
+ /*
+ * Accept probe IOs to simulate the existence of a
+ * label. vdev_label_read_config() bypasses the
+ * pipeline to read the label configuration and
+ * vdev_uberblock_load() skips distributed spares
+ * when attempting to locate the best uberblock.
+ */
+ if (zio->io_flags & ZIO_FLAG_PROBE) {
+ zio->io_error = 0;
+ } else {
+ zio->io_error = SET_ERROR(EIO);
+ }
+ } else {
+ cvd = vdev_draid_spare_get_child(vd, offset);
+
+ if (cvd == NULL || !vdev_readable(cvd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ } else {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ offset, zio->io_abd, zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_draid_spare_child_done, zio));
+ }
+ }
+ break;
+
+ case ZIO_TYPE_TRIM:
+ /* The vdev label ranges are never trimmed */
+ ASSERT0(VDEV_OFFSET_IS_LABEL(vd, zio->io_offset));
+
+ cvd = vdev_draid_spare_get_child(vd, offset);
+
+ if (cvd == NULL || !cvd->vdev_has_trim) {
+ zio->io_error = SET_ERROR(ENXIO);
+ } else {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ offset, zio->io_abd, zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_draid_spare_child_done, zio));
+ }
+ break;
+
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ break;
+ }
+
+ zio_execute(zio);
+}
+
+/* ARGSUSED */
+static void
+vdev_draid_spare_io_done(zio_t *zio)
+{
+}
+
+/*
+ * Lookup the full spare config in spa->spa_spares.sav_config and
+ * return the top_guid and spare_id for the named spare.
+ */
+static int
+vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp,
+ uint64_t *spare_idp)
+{
+ nvlist_t **spares;
+ uint_t nspares;
+ int error;
+
+ if ((spa->spa_spares.sav_config == NULL) ||
+ (nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)) {
+ return (SET_ERROR(ENOENT));
+ }
+
+ char *spare_name;
+ error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name);
+ if (error != 0)
+ return (SET_ERROR(EINVAL));
+
+ for (int i = 0; i < nspares; i++) {
+ nvlist_t *spare = spares[i];
+ uint64_t top_guid, spare_id;
+ char *type, *path;
+
+ /* Skip non-distributed spares */
+ error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type);
+ if (error != 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0)
+ continue;
+
+ /* Skip spares with the wrong name */
+ error = nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH, &path);
+ if (error != 0 || strcmp(path, spare_name) != 0)
+ continue;
+
+ /* Found the matching spare */
+ error = nvlist_lookup_uint64(spare,
+ ZPOOL_CONFIG_TOP_GUID, &top_guid);
+ if (error == 0) {
+ error = nvlist_lookup_uint64(spare,
+ ZPOOL_CONFIG_SPARE_ID, &spare_id);
+ }
+
+ if (error != 0) {
+ return (SET_ERROR(EINVAL));
+ } else {
+ *top_guidp = top_guid;
+ *spare_idp = spare_id;
+ return (0);
+ }
+ }
+
+ return (SET_ERROR(ENOENT));
+}
+
+/*
+ * Initialize private dRAID spare specific fields from the nvlist.
+ */
+static int
+vdev_draid_spare_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+ vdev_draid_spare_t *vds;
+ uint64_t top_guid = 0;
+ uint64_t spare_id;
+
+ /*
+ * In the normal case check the list of spares stored in the spa
+ * to look up the top_guid and spare_id for the provided spare config.
+ * When creating a new pool or adding vdevs the spare list is not
+ * yet populated and the values are provided in the passed config.
+ */
+ if (vdev_draid_spare_lookup(spa, nv, &top_guid, &spare_id) != 0) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_TOP_GUID,
+ &top_guid) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_SPARE_ID,
+ &spare_id) != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ vds = kmem_alloc(sizeof (vdev_draid_spare_t), KM_SLEEP);
+ vds->vds_draid_vdev = NULL;
+ vds->vds_top_guid = top_guid;
+ vds->vds_spare_id = spare_id;
+
+ *tsd = vds;
+
+ return (0);
+}
+
+static void
+vdev_draid_spare_fini(vdev_t *vd)
+{
+ kmem_free(vd->vdev_tsd, sizeof (vdev_draid_spare_t));
+}
+
+static void
+vdev_draid_spare_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
+
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vds->vds_top_guid);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_SPARE_ID, vds->vds_spare_id);
+}
+
+vdev_ops_t vdev_draid_spare_ops = {
+ .vdev_op_init = vdev_draid_spare_init,
+ .vdev_op_fini = vdev_draid_spare_fini,
+ .vdev_op_open = vdev_draid_spare_open,
+ .vdev_op_close = vdev_draid_spare_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_draid_spare_io_start,
+ .vdev_op_io_done = vdev_draid_spare_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = vdev_draid_spare_config_generate,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_DRAID_SPARE,
+ .vdev_op_leaf = B_TRUE,
+};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c b/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c
new file mode 100644
index 000000000000..fe1a75c11312
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c
@@ -0,0 +1,40 @@
+/*
+ * Xorshift Pseudo Random Number Generator based on work by David Blackman
+ * and Sebastiano Vigna (vigna@acm.org).
+ *
+ * "Further scramblings of Marsaglia's xorshift generators"
+ * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
+ * http://prng.di.unimi.it/xoroshiro128plusplus.c
+ *
+ * To the extent possible under law, the author has dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * See <http://creativecommons.org/publicdomain/zero/1.0/>.
+ *
+ * This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid,
+ * small-state generators. It is extremely (sub-ns) fast and it passes all
+ * tests we are aware of, but its state space is large enough only for
+ * mild parallelism.
+ */
+
+#include <sys/vdev_draid.h>
+
+static inline uint64_t rotl(const uint64_t x, int k)
+{
+ return (x << k) | (x >> (64 - k));
+}
+
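+/*
+ * Return the next 64-bit value from the xoroshiro128++ state held in
+ * s[0] and s[1]. Callers are expected to seed both words; the all-zero
+ * state is the one state the generator can never leave.
+ */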
+uint64_t
+vdev_draid_rand(uint64_t *s)
+{
+ const uint64_t s0 = s[0];
+ uint64_t s1 = s[1];
+ const uint64_t result = rotl(s0 + s1, 17) + s0;
+
+ s1 ^= s0;
+ s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b
+ s[1] = rotl(s1, 28); // c
+
+ return (result);
+}
diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
index 12ee393bd5db..07d1c922a50c 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
@@ -239,6 +239,7 @@ typedef struct indirect_child {
*/
struct indirect_child *ic_duplicate;
list_node_t ic_node; /* node on is_unique_child */
+ int ic_error; /* set when a child does not contain the data */
} indirect_child_t;
/*
@@ -1272,15 +1273,14 @@ vdev_indirect_read_all(zio_t *zio)
continue;
/*
- * Note, we may read from a child whose DTL
- * indicates that the data may not be present here.
- * While this might result in a few i/os that will
- * likely return incorrect data, it simplifies the
- * code since we can treat scrub and resilver
- * identically. (The incorrect data will be
- * detected and ignored when we verify the
- * checksum.)
+ * If a child is missing the data, set ic_error. Used
+ * in vdev_indirect_repair(). We perform the read
+ * nevertheless which provides the opportunity to
+ * reconstruct the split block if at all possible.
*/
+ if (vdev_dtl_contains(ic->ic_vdev, DTL_MISSING,
+ zio->io_txg, 1))
+ ic->ic_error = SET_ERROR(ESTALE);
ic->ic_data = abd_alloc_sametype(zio->io_abd,
is->is_size);
@@ -1410,7 +1410,11 @@ vdev_indirect_checksum_error(zio_t *zio,
* Issue repair i/os for any incorrect copies. We do this by comparing
* each split segment's correct data (is_good_child's ic_data) with each
* other copy of the data. If they differ, then we overwrite the bad data
- * with the good copy. Note that we do this without regard for the DTL's,
+ * with the good copy. The DTL is checked in vdev_indirect_read_all() and
+ * if a vdev is missing a copy of the data we set ic_error but still perform
+ * the read. This provides the opportunity to reconstruct the split block
+ * if at all possible. ic_error is checked here and, if set, suppresses
+ * incrementing the checksum counter. Aside from this, DTLs are not checked,
* which simplifies this code and also issues the optimal number of writes
* (based on which copies actually read bad data, as opposed to which we
* think might be wrong). For the same reason, we always use
@@ -1447,6 +1451,14 @@ vdev_indirect_repair(zio_t *zio)
ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
NULL, NULL));
+ /*
+ * If ic_error is set the current child does not have
+ * a copy of the data, so suppress incrementing the
+ * checksum counter.
+ */
+ if (ic->ic_error == ESTALE)
+ continue;
+
vdev_indirect_checksum_error(zio, is, ic);
}
}
@@ -1844,9 +1856,13 @@ vdev_indirect_io_done(zio_t *zio)
}
vdev_ops_t vdev_indirect_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
.vdev_op_open = vdev_indirect_open,
.vdev_op_close = vdev_indirect_close,
.vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_indirect_io_start,
.vdev_op_io_done = vdev_indirect_io_done,
.vdev_op_state_change = NULL,
@@ -1855,6 +1871,11 @@ vdev_ops_t vdev_indirect_ops = {
.vdev_op_rele = NULL,
.vdev_op_remap = vdev_indirect_remap,
.vdev_op_xlate = NULL,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* leaf vdev */
};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c
index 7ff7fffcc80e..083ad2861b5b 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c
@@ -121,6 +121,8 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
vd->vdev_initialize_action_time = gethrestime_sec();
}
+
+ vdev_initializing_state_t old_state = vd->vdev_initialize_state;
vd->vdev_initialize_state = new_state;
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
@@ -138,8 +140,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
"vdev=%s suspended", vd->vdev_path);
break;
case VDEV_INITIALIZE_CANCELED:
- spa_history_log_internal(spa, "initialize", tx,
- "vdev=%s canceled", vd->vdev_path);
+ if (old_state == VDEV_INITIALIZE_ACTIVE ||
+ old_state == VDEV_INITIALIZE_SUSPENDED)
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s canceled", vd->vdev_path);
break;
case VDEV_INITIALIZE_COMPLETE:
spa_history_log_internal(spa, "initialize", tx,
@@ -318,6 +322,32 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
}
static void
+vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
+{
+ uint64_t *last_rs_end = (uint64_t *)arg;
+
+ if (physical_rs->rs_end > *last_rs_end)
+ *last_rs_end = physical_rs->rs_end;
+}
+
+static void
+vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs)
+{
+ vdev_t *vd = (vdev_t *)arg;
+
+ uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
+ vd->vdev_initialize_bytes_est += size;
+
+ if (vd->vdev_initialize_last_offset > physical_rs->rs_end) {
+ vd->vdev_initialize_bytes_done += size;
+ } else if (vd->vdev_initialize_last_offset > physical_rs->rs_start &&
+ vd->vdev_initialize_last_offset < physical_rs->rs_end) {
+ vd->vdev_initialize_bytes_done +=
+ vd->vdev_initialize_last_offset - physical_rs->rs_start;
+ }
+}
+
+static void
vdev_initialize_calculate_progress(vdev_t *vd)
{
ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
@@ -331,28 +361,35 @@ vdev_initialize_calculate_progress(vdev_t *vd)
metaslab_t *msp = vd->vdev_top->vdev_ms[i];
mutex_enter(&msp->ms_lock);
- uint64_t ms_free = msp->ms_size -
- metaslab_allocated_space(msp);
-
- if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
- ms_free /= vd->vdev_top->vdev_children;
+ uint64_t ms_free = (msp->ms_size -
+ metaslab_allocated_space(msp)) /
+ vdev_get_ndisks(vd->vdev_top);
/*
* Convert the metaslab range to a physical range
* on our vdev. We use this to determine if we are
* in the middle of this metaslab range.
*/
- range_seg64_t logical_rs, physical_rs;
+ range_seg64_t logical_rs, physical_rs, remain_rs;
logical_rs.rs_start = msp->ms_start;
logical_rs.rs_end = msp->ms_start + msp->ms_size;
- vdev_xlate(vd, &logical_rs, &physical_rs);
+ /* Metaslab space after this offset has not been initialized */
+ vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
vd->vdev_initialize_bytes_est += ms_free;
mutex_exit(&msp->ms_lock);
continue;
- } else if (vd->vdev_initialize_last_offset >
- physical_rs.rs_end) {
+ }
+
+ /* Metaslab space before this offset has been initialized */
+ uint64_t last_rs_end = physical_rs.rs_end;
+ if (!vdev_xlate_is_empty(&remain_rs)) {
+ vdev_xlate_walk(vd, &remain_rs,
+ vdev_initialize_xlate_last_rs_end, &last_rs_end);
+ }
+
+ if (vd->vdev_initialize_last_offset > last_rs_end) {
vd->vdev_initialize_bytes_done += ms_free;
vd->vdev_initialize_bytes_est += ms_free;
mutex_exit(&msp->ms_lock);
@@ -374,22 +411,9 @@ vdev_initialize_calculate_progress(vdev_t *vd)
&where)) {
logical_rs.rs_start = rs_get_start(rs, rt);
logical_rs.rs_end = rs_get_end(rs, rt);
- vdev_xlate(vd, &logical_rs, &physical_rs);
-
- uint64_t size = physical_rs.rs_end -
- physical_rs.rs_start;
- vd->vdev_initialize_bytes_est += size;
- if (vd->vdev_initialize_last_offset >
- physical_rs.rs_end) {
- vd->vdev_initialize_bytes_done += size;
- } else if (vd->vdev_initialize_last_offset >
- physical_rs.rs_start &&
- vd->vdev_initialize_last_offset <
- physical_rs.rs_end) {
- vd->vdev_initialize_bytes_done +=
- vd->vdev_initialize_last_offset -
- physical_rs.rs_start;
- }
+
+ vdev_xlate_walk(vd, &logical_rs,
+ vdev_initialize_xlate_progress, vd);
}
mutex_exit(&msp->ms_lock);
}
@@ -419,55 +443,48 @@ vdev_initialize_load(vdev_t *vd)
return (err);
}
-/*
- * Convert the logical range into a physical range and add it to our
- * avl tree.
- */
static void
-vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
+vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs)
{
vdev_t *vd = arg;
- range_seg64_t logical_rs, physical_rs;
- logical_rs.rs_start = start;
- logical_rs.rs_end = start + size;
-
- ASSERT(vd->vdev_ops->vdev_op_leaf);
- vdev_xlate(vd, &logical_rs, &physical_rs);
-
- IMPLY(vd->vdev_top == vd,
- logical_rs.rs_start == physical_rs.rs_start);
- IMPLY(vd->vdev_top == vd,
- logical_rs.rs_end == physical_rs.rs_end);
/* Only add segments that we have not visited yet */
- if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
+ if (physical_rs->rs_end <= vd->vdev_initialize_last_offset)
return;
/* Pick up where we left off mid-range. */
- if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
+ if (vd->vdev_initialize_last_offset > physical_rs->rs_start) {
zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
"(%llu, %llu)", vd->vdev_path,
- (u_longlong_t)physical_rs.rs_start,
- (u_longlong_t)physical_rs.rs_end,
+ (u_longlong_t)physical_rs->rs_start,
+ (u_longlong_t)physical_rs->rs_end,
(u_longlong_t)vd->vdev_initialize_last_offset,
- (u_longlong_t)physical_rs.rs_end);
- ASSERT3U(physical_rs.rs_end, >,
+ (u_longlong_t)physical_rs->rs_end);
+ ASSERT3U(physical_rs->rs_end, >,
vd->vdev_initialize_last_offset);
- physical_rs.rs_start = vd->vdev_initialize_last_offset;
+ physical_rs->rs_start = vd->vdev_initialize_last_offset;
}
- ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
- /*
- * With raidz, it's possible that the logical range does not live on
- * this leaf vdev. We only add the physical range to this vdev's if it
- * has a length greater than 0.
- */
- if (physical_rs.rs_end > physical_rs.rs_start) {
- range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
- physical_rs.rs_end - physical_rs.rs_start);
- } else {
- ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
- }
+ ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
+
+ range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start,
+ physical_rs->rs_end - physical_rs->rs_start);
+}
+
+/*
+ * Convert the logical range into a physical range and add it to our
+ * avl tree.
+ */
+static void
+vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_t *vd = arg;
+ range_seg64_t logical_rs;
+ logical_rs.rs_start = start;
+ logical_rs.rs_end = start + size;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
}
static void
diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c
index d063b77ea836..fbd117d2d9ae 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_label.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_label.c
@@ -142,6 +142,7 @@
#include <sys/zap.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@@ -453,31 +454,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_fru != NULL)
fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
- if (vd->vdev_nparity != 0) {
- ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
- VDEV_TYPE_RAIDZ) == 0);
+ if (vd->vdev_ops->vdev_op_config_generate != NULL)
+ vd->vdev_ops->vdev_op_config_generate(vd, nv);
- /*
- * Make sure someone hasn't managed to sneak a fancy new vdev
- * into a crufty old storage pool.
- */
- ASSERT(vd->vdev_nparity == 1 ||
- (vd->vdev_nparity <= 2 &&
- spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
- (vd->vdev_nparity <= 3 &&
- spa_version(spa) >= SPA_VERSION_RAIDZ3));
-
- /*
- * Note that we'll add the nparity tag even on storage pools
- * that only support a single parity device -- older software
- * will just ignore it.
- */
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
- }
-
- if (vd->vdev_wholedisk != -1ULL)
+ if (vd->vdev_wholedisk != -1ULL) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
vd->vdev_wholedisk);
+ }
if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
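The hunk above swaps the hard-coded raidz nparity handling for an optional per-type vdev_op_config_generate callback. A hedged sketch of that dispatch pattern follows; the struct layout and names are simplified stand-ins, not the actual vdev_ops_t.

/*
 * Illustration of the optional-callback dispatch used above: each vdev
 * type may supply a config_generate hook, and the generic label code
 * calls it only when present.  Names here are simplified stand-ins.
 */
#include <stddef.h>
#include <stdio.h>

struct nvlist;				/* opaque for this sketch */

typedef struct ops {
	const char *op_type;
	void (*op_config_generate)(struct nvlist *nv);
} ops_t;

static void
raidz_config_generate(struct nvlist *nv)
{
	(void) nv;
	printf("adding raidz-specific config pairs\n");
}

static const ops_t raidz_ops = { "raidz", raidz_config_generate };
static const ops_t mirror_ops = { "mirror", NULL };	/* nothing extra */

static void
config_generate(const ops_t *ops, struct nvlist *nv)
{
	if (ops->op_config_generate != NULL)
		ops->op_config_generate(nv);
}

int
main(void)
{
	config_generate(&raidz_ops, NULL);
	config_generate(&mirror_ops, NULL);	/* no-op */
	return (0);
}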
@@ -785,6 +768,14 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
if (!vdev_readable(vd))
return (NULL);
+ /*
+ * The label for a dRAID distributed spare is not stored on disk.
+	 * Instead it is generated when needed, which allows us to bypass
+ * the pipeline when reading the config from the label.
+ */
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return (vdev_draid_read_config_spare(vd));
+
vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
vp = abd_to_buf(vp_abd);
@@ -1497,7 +1488,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
for (int c = 0; c < vd->vdev_children; c++)
vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
- if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
+ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) &&
+ vd->vdev_ops != &vdev_draid_spare_ops) {
for (int l = 0; l < VDEV_LABELS; l++) {
for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
vdev_label_read(zio, vd, l,
@@ -1586,6 +1578,13 @@ vdev_copy_uberblocks(vdev_t *vd)
SCL_STATE);
ASSERT(vd->vdev_ops->vdev_op_leaf);
+ /*
+	 * No uberblocks are stored on distributed spares; they may be
+ * safely skipped when expanding a leaf vdev.
+ */
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return;
+
spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER);
ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
@@ -1647,6 +1646,15 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
if (!vdev_writeable(vd))
return;
+ /*
+	 * There's no need to write uberblocks to a distributed spare; they
+ * are already stored on all the leaves of the parent dRAID. For
+ * this same reason vdev_uberblock_load_impl() skips distributed
+ * spares when reading uberblocks.
+ */
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return;
+
/* If the vdev was expanded, need to copy uberblock rings. */
if (vd->vdev_state == VDEV_STATE_HEALTHY &&
vd->vdev_copy_uberblocks == B_TRUE) {
@@ -1764,6 +1772,14 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes,
return;
/*
+ * The top-level config never needs to be written to a distributed
+	 * spare. When read, vdev_dspare_label_read_config() will generate
+	 * the config for vdev_label_read_config().
+ */
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return;
+
+ /*
* Generate a label describing the top-level config to which we belong.
*/
label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
diff --git a/sys/contrib/openzfs/module/zfs/vdev_mirror.c b/sys/contrib/openzfs/module/zfs/vdev_mirror.c
index 71b5adbbd06a..71ca43caec1a 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_mirror.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_mirror.c
@@ -33,6 +33,7 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
#include <sys/zio.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
@@ -99,7 +100,6 @@ vdev_mirror_stat_fini(void)
/*
* Virtual device vector for mirroring.
*/
-
typedef struct mirror_child {
vdev_t *mc_vd;
uint64_t mc_offset;
@@ -108,6 +108,7 @@ typedef struct mirror_child {
uint8_t mc_tried;
uint8_t mc_skipped;
uint8_t mc_speculative;
+ uint8_t mc_rebuilding;
} mirror_child_t;
typedef struct mirror_map {
@@ -115,6 +116,7 @@ typedef struct mirror_map {
int mm_preferred_cnt;
int mm_children;
boolean_t mm_resilvering;
+ boolean_t mm_rebuilding;
boolean_t mm_root;
mirror_child_t mm_child[];
} mirror_map_t;
@@ -239,6 +241,21 @@ vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
return (load + zfs_vdev_mirror_rotating_seek_inc);
}
+static boolean_t
+vdev_mirror_rebuilding(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
+ return (B_TRUE);
+
+ for (int i = 0; i < vd->vdev_children; i++) {
+ if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
/*
* Avoid inlining the function to keep vdev_mirror_io_start(), which
 * is this function's only caller, as small as possible on the stack.
@@ -356,6 +373,9 @@ vdev_mirror_map_init(zio_t *zio)
mc = &mm->mm_child[c];
mc->mc_vd = vd->vdev_child[c];
mc->mc_offset = zio->io_offset;
+
+ if (vdev_mirror_rebuilding(mc->mc_vd))
+ mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
}
}
@@ -493,12 +513,37 @@ vdev_mirror_preferred_child_randomize(zio_t *zio)
return (mm->mm_preferred[p]);
}
+static boolean_t
+vdev_mirror_child_readable(mirror_child_t *mc)
+{
+ vdev_t *vd = mc->mc_vd;
+
+ if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
+ return (vdev_draid_readable(vd, mc->mc_offset));
+ else
+ return (vdev_readable(vd));
+}
+
+static boolean_t
+vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
+{
+ vdev_t *vd = mc->mc_vd;
+
+ if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
+ return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
+ else
+ return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+}
+
/*
* Try to find a vdev whose DTL doesn't contain the block we want to read
- * preferring vdevs based on determined load.
+ * preferring vdevs based on determined load. If we can't, try the read on
+ * any vdev we haven't already tried.
*
- * Try to find a child whose DTL doesn't contain the block we want to read.
- * If we can't, try the read on any vdev we haven't already tried.
+ * Distributed spares are an exception to the above load rule. They are
+ * always preferred in order to detect gaps in the distributed spare which
+ * are created when another disk in the dRAID fails. In order to restore
+ * redundancy those gaps must be read to trigger the required repair IO.
*/
static int
vdev_mirror_child_select(zio_t *zio)
@@ -518,20 +563,27 @@ vdev_mirror_child_select(zio_t *zio)
if (mc->mc_tried || mc->mc_skipped)
continue;
- if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) {
+ if (mc->mc_vd == NULL ||
+ !vdev_mirror_child_readable(mc)) {
mc->mc_error = SET_ERROR(ENXIO);
mc->mc_tried = 1; /* don't even try */
mc->mc_skipped = 1;
continue;
}
- if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+ if (vdev_mirror_child_missing(mc, txg, 1)) {
mc->mc_error = SET_ERROR(ESTALE);
mc->mc_skipped = 1;
mc->mc_speculative = 1;
continue;
}
+ if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
+ mm->mm_preferred[0] = c;
+ mm->mm_preferred_cnt = 1;
+ break;
+ }
+
mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
if (mc->mc_load > lowest_load)
continue;
@@ -625,11 +677,25 @@ vdev_mirror_io_start(zio_t *zio)
while (children--) {
mc = &mm->mm_child[c];
+ c++;
+
+ /*
+		 * When sequentially resilvering, only issue write repair
+		 * I/Os to the vdev which is being rebuilt, since performance
+ * is limited by the slowest child. This is an issue for
+ * faster replacement devices such as distributed spares.
+ */
+ if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
+ (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+ !(zio->io_flags & ZIO_FLAG_SCRUB) &&
+ mm->mm_rebuilding && !mc->mc_rebuilding) {
+ continue;
+ }
+
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_child_done, mc));
- c++;
}
zio_execute(zio);
@@ -744,6 +810,8 @@ vdev_mirror_io_done(zio_t *zio)
mc = &mm->mm_child[c];
if (mc->mc_error == 0) {
+ vdev_ops_t *ops = mc->mc_vd->vdev_ops;
+
if (mc->mc_tried)
continue;
/*
@@ -752,15 +820,16 @@ vdev_mirror_io_done(zio_t *zio)
* 1. it's a scrub (in which case we have
* tried everything that was healthy)
* - or -
- * 2. it's an indirect vdev (in which case
- * it could point to any other vdev, which
- * might have a bad DTL)
+ * 2. it's an indirect or distributed spare
+ * vdev (in which case it could point to any
+ * other vdev, which might have a bad DTL)
* - or -
* 3. the DTL indicates that this data is
* missing from this vdev
*/
if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
- mc->mc_vd->vdev_ops != &vdev_indirect_ops &&
+ ops != &vdev_indirect_ops &&
+ ops != &vdev_draid_spare_ops &&
!vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
zio->io_txg, 1))
continue;
@@ -796,50 +865,90 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
}
}
+/*
+ * Return the maximum asize for a rebuild zio in the provided range.
+ */
+static uint64_t
+vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
+ uint64_t max_segment)
+{
+ uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
+ SPA_MAXBLOCKSIZE);
+
+ return (MIN(asize, vdev_psize_to_asize(vd, psize)));
+}
+
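A small worked example of the clamp in vdev_mirror_rebuild_asize() above, assuming the usual power-of-two P2ROUNDUP() definition, a 16M SPA_MAXBLOCKSIZE, and an identity psize-to-asize mapping (roughly what a plain mirror does); treat it as an illustration rather than the in-kernel path.

/*
 * Worked example of the clamp in vdev_mirror_rebuild_asize() above.
 * P2ROUNDUP() and SPA_MAXBLOCKSIZE follow the usual OpenZFS-style
 * definitions; psize_to_asize is modeled as the identity.
 */
#include <stdint.h>
#include <stdio.h>

#define	P2ROUNDUP(x, align)	((((x) - 1) | ((align) - 1)) + 1)
#define	SPA_MAXBLOCKSIZE	(1ULL << 24)		/* 16M */
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

static uint64_t
rebuild_asize(uint64_t asize, uint64_t max_segment, uint64_t ashift)
{
	uint64_t psize = MIN(P2ROUNDUP(max_segment, 1ULL << ashift),
	    SPA_MAXBLOCKSIZE);

	return (MIN(asize, psize));	/* identity psize -> asize */
}

int
main(void)
{
	/* 1M segment, 4K sectors: the whole chunk fits. */
	printf("%llu\n", (unsigned long long)
	    rebuild_asize(8ULL << 20, 1ULL << 20, 12));
	/* Short tail of a range: clamped to the remaining asize. */
	printf("%llu\n", (unsigned long long)
	    rebuild_asize(12288, 1ULL << 20, 12));
	return (0);
}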
vdev_ops_t vdev_mirror_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
.vdev_op_open = vdev_mirror_open,
.vdev_op_close = vdev_mirror_close,
.vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_mirror_io_start,
.vdev_op_io_done = vdev_mirror_io_done,
.vdev_op_state_change = vdev_mirror_state_change,
- .vdev_op_need_resilver = NULL,
+ .vdev_op_need_resilver = vdev_default_need_resilver,
.vdev_op_hold = NULL,
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
vdev_ops_t vdev_replacing_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
.vdev_op_open = vdev_mirror_open,
.vdev_op_close = vdev_mirror_close,
.vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_mirror_io_start,
.vdev_op_io_done = vdev_mirror_io_done,
.vdev_op_state_change = vdev_mirror_state_change,
- .vdev_op_need_resilver = NULL,
+ .vdev_op_need_resilver = vdev_default_need_resilver,
.vdev_op_hold = NULL,
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
vdev_ops_t vdev_spare_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
.vdev_op_open = vdev_mirror_open,
.vdev_op_close = vdev_mirror_close,
.vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_mirror_io_start,
.vdev_op_io_done = vdev_mirror_io_done,
.vdev_op_state_change = vdev_mirror_state_change,
- .vdev_op_need_resilver = NULL,
+ .vdev_op_need_resilver = vdev_default_need_resilver,
.vdev_op_hold = NULL,
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_missing.c b/sys/contrib/openzfs/module/zfs/vdev_missing.c
index ce90df6e8d95..e9145fd012d7 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_missing.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_missing.c
@@ -81,9 +81,13 @@ vdev_missing_io_done(zio_t *zio)
}
vdev_ops_t vdev_missing_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
.vdev_op_open = vdev_missing_open,
.vdev_op_close = vdev_missing_close,
.vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_missing_io_start,
.vdev_op_io_done = vdev_missing_io_done,
.vdev_op_state_change = NULL,
@@ -92,14 +96,23 @@ vdev_ops_t vdev_missing_ops = {
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = NULL,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};
vdev_ops_t vdev_hole_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
.vdev_op_open = vdev_missing_open,
.vdev_op_close = vdev_missing_close,
.vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_missing_io_start,
.vdev_op_io_done = vdev_missing_io_done,
.vdev_op_state_change = NULL,
@@ -108,6 +121,11 @@ vdev_ops_t vdev_hole_ops = {
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = NULL,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c
index a8ef3d7474c9..02040c3ee198 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_queue.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c
@@ -121,16 +121,17 @@
/*
* The maximum number of i/os active to each device. Ideally, this will be >=
- * the sum of each queue's max_active. It must be at least the sum of each
- * queue's min_active.
+ * the sum of each queue's max_active.
*/
uint32_t zfs_vdev_max_active = 1000;
/*
* Per-queue limits on the number of i/os active to each device. If the
* number of active i/os is < zfs_vdev_max_active, then the min_active comes
- * into play. We will send min_active from each queue, and then select from
- * queues in the order defined by zio_priority_t.
+ * into play. We will send min_active from each queue round-robin, and then
+ * send from queues in the order defined by zio_priority_t up to max_active.
+ * Some queues have additional mechanisms to limit the number of active I/Os
+ * beyond min_active and max_active; see below.
*
* In general, smaller max_active's will lead to lower latency of synchronous
* operations. Larger max_active's may lead to higher overall throughput,
@@ -151,7 +152,7 @@ uint32_t zfs_vdev_async_read_max_active = 3;
uint32_t zfs_vdev_async_write_min_active = 2;
uint32_t zfs_vdev_async_write_max_active = 10;
uint32_t zfs_vdev_scrub_min_active = 1;
-uint32_t zfs_vdev_scrub_max_active = 2;
+uint32_t zfs_vdev_scrub_max_active = 3;
uint32_t zfs_vdev_removal_min_active = 1;
uint32_t zfs_vdev_removal_max_active = 2;
uint32_t zfs_vdev_initializing_min_active = 1;
@@ -172,6 +173,28 @@ int zfs_vdev_async_write_active_min_dirty_percent = 30;
int zfs_vdev_async_write_active_max_dirty_percent = 60;
/*
+ * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
+ * the number of concurrently-active I/Os is limited to *_min_active, unless
+ * the vdev is "idle". When there are no interactive I/Os active (sync or
+ * async), and zfs_vdev_nia_delay I/Os have completed since the last
+ * interactive I/O, then the vdev is considered to be "idle", and the number
+ * of concurrently-active non-interactive I/Os is increased to *_max_active.
+ */
+uint_t zfs_vdev_nia_delay = 5;
+
+/*
+ * Some HDDs tend to prioritize sequential I/O so high that concurrent
+ * random I/O latency reaches several seconds. On some HDDs it happens
+ * even if sequential I/Os are submitted one at a time, and so setting
+ * *_max_active to 1 does not help. To prevent non-interactive I/Os, like
+ * scrub, from monopolizing the device, no more than zfs_vdev_nia_credit
+ * I/Os can be sent while there are outstanding incomplete interactive
+ * I/Os. This enforced wait ensures the HDD services the interactive I/O
+ * within a reasonable amount of time.
+ */
+uint_t zfs_vdev_nia_credit = 5;
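The credit scheme described above is easiest to see with a toy model. The sketch below tracks only the two counters involved and mirrors the updates made in the vdev_queue_pending_add()/_remove() hunks further down in this file; the single scrub class is an assumption for brevity, and the constants are copied from the declarations in this patch.

/*
 * Toy model of the vq_ia_active / vq_nia_credit accounting: interactive
 * I/Os grant (and non-interactive I/Os spend) credit, which in turn
 * throttles the effective scrub min_active while the vdev is busy.
 */
#include <stdio.h>
#include <stdbool.h>

static const int zfs_vdev_nia_credit = 5;
static const int zfs_vdev_scrub_min_active = 1;

struct vq { int ia_active; int nia_credit; };

static void
pending_add(struct vq *vq, bool interactive)
{
	if (interactive) {
		if (++vq->ia_active == 1)
			vq->nia_credit = 1;
	} else if (vq->ia_active > 0) {
		vq->nia_credit--;
	}
}

static void
pending_remove(struct vq *vq, bool interactive)
{
	if (interactive) {
		if (--vq->ia_active == 0)
			vq->nia_credit = 0;
		else
			vq->nia_credit = zfs_vdev_nia_credit;
	} else if (vq->ia_active == 0) {
		vq->nia_credit++;
	}
}

static int
scrub_min_active(const struct vq *vq)
{
	return (vq->ia_active == 0 ? zfs_vdev_scrub_min_active :
	    (vq->nia_credit < zfs_vdev_scrub_min_active ?
	    vq->nia_credit : zfs_vdev_scrub_min_active));
}

int
main(void)
{
	struct vq vq = { 0, 0 };

	pending_add(&vq, true);		/* sync read arrives */
	printf("scrub min while busy: %d\n", scrub_min_active(&vq));	/* 1 */
	pending_add(&vq, false);	/* one scrub I/O spends the credit */
	printf("scrub min, credit spent: %d\n", scrub_min_active(&vq));	/* 0 */
	pending_remove(&vq, true);	/* interactive I/O completes */
	printf("scrub min when idle: %d\n", scrub_min_active(&vq));	/* 1 */
	return (0);
}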
+
+/*
* To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
* For read I/Os, we also aggregate across small adjacency gaps; for writes
* we include spans of optional I/Os to aid aggregation at the disk even when
@@ -261,7 +284,7 @@ vdev_queue_timestamp_compare(const void *x1, const void *x2)
}
static int
-vdev_queue_class_min_active(zio_priority_t p)
+vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
{
switch (p) {
case ZIO_PRIORITY_SYNC_READ:
@@ -273,15 +296,19 @@ vdev_queue_class_min_active(zio_priority_t p)
case ZIO_PRIORITY_ASYNC_WRITE:
return (zfs_vdev_async_write_min_active);
case ZIO_PRIORITY_SCRUB:
- return (zfs_vdev_scrub_min_active);
+ return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active :
+ MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active));
case ZIO_PRIORITY_REMOVAL:
- return (zfs_vdev_removal_min_active);
+ return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active :
+ MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active));
case ZIO_PRIORITY_INITIALIZING:
- return (zfs_vdev_initializing_min_active);
+		return (vq->vq_ia_active == 0 ? zfs_vdev_initializing_min_active :
+ MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active));
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_min_active);
case ZIO_PRIORITY_REBUILD:
- return (zfs_vdev_rebuild_min_active);
+ return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active :
+ MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active));
default:
panic("invalid priority %u", p);
return (0);
@@ -311,14 +338,12 @@ vdev_queue_max_async_writes(spa_t *spa)
* Sync tasks correspond to interactive user actions. To reduce the
* execution time of those actions we push data out as fast as possible.
*/
- if (spa_has_pending_synctask(spa))
+ dirty = dp->dp_dirty_total;
+ if (dirty > max_bytes || spa_has_pending_synctask(spa))
return (zfs_vdev_async_write_max_active);
- dirty = dp->dp_dirty_total;
if (dirty < min_bytes)
return (zfs_vdev_async_write_min_active);
- if (dirty > max_bytes)
- return (zfs_vdev_async_write_max_active);
/*
* linear interpolation:
@@ -337,7 +362,7 @@ vdev_queue_max_async_writes(spa_t *spa)
}
static int
-vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
+vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
{
switch (p) {
case ZIO_PRIORITY_SYNC_READ:
@@ -349,14 +374,34 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
case ZIO_PRIORITY_ASYNC_WRITE:
return (vdev_queue_max_async_writes(spa));
case ZIO_PRIORITY_SCRUB:
+ if (vq->vq_ia_active > 0) {
+ return (MIN(vq->vq_nia_credit,
+ zfs_vdev_scrub_min_active));
+ } else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+ return (MAX(1, zfs_vdev_scrub_min_active));
return (zfs_vdev_scrub_max_active);
case ZIO_PRIORITY_REMOVAL:
+ if (vq->vq_ia_active > 0) {
+ return (MIN(vq->vq_nia_credit,
+ zfs_vdev_removal_min_active));
+ } else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+ return (MAX(1, zfs_vdev_removal_min_active));
return (zfs_vdev_removal_max_active);
case ZIO_PRIORITY_INITIALIZING:
+ if (vq->vq_ia_active > 0) {
+ return (MIN(vq->vq_nia_credit,
+ zfs_vdev_initializing_min_active));
+ } else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+ return (MAX(1, zfs_vdev_initializing_min_active));
return (zfs_vdev_initializing_max_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_max_active);
case ZIO_PRIORITY_REBUILD:
+ if (vq->vq_ia_active > 0) {
+ return (MIN(vq->vq_nia_credit,
+ zfs_vdev_rebuild_min_active));
+ } else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+ return (MAX(1, zfs_vdev_rebuild_min_active));
return (zfs_vdev_rebuild_max_active);
default:
panic("invalid priority %u", p);
@@ -372,17 +417,24 @@ static zio_priority_t
vdev_queue_class_to_issue(vdev_queue_t *vq)
{
spa_t *spa = vq->vq_vdev->vdev_spa;
- zio_priority_t p;
+ zio_priority_t p, n;
if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
return (ZIO_PRIORITY_NUM_QUEUEABLE);
- /* find a queue that has not reached its minimum # outstanding i/os */
- for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ /*
+ * Find a queue that has not reached its minimum # outstanding i/os.
+ * Do round-robin to reduce starvation due to zfs_vdev_max_active
+ * and vq_nia_credit limits.
+ */
+ for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
+ p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
vq->vq_class[p].vqc_active <
- vdev_queue_class_min_active(p))
+ vdev_queue_class_min_active(vq, p)) {
+ vq->vq_last_prio = p;
return (p);
+ }
}
/*
@@ -392,8 +444,10 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
vq->vq_class[p].vqc_active <
- vdev_queue_class_max_active(spa, p))
+ vdev_queue_class_max_active(spa, vq, p)) {
+ vq->vq_last_prio = p;
return (p);
+ }
}
/* No eligible queued i/os */
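A minimal sketch of the round-robin scan added above, with a hypothetical class count and eligibility test standing in for the avl and active-count checks; it only demonstrates how starting just past vq_last_prio reduces starvation of lower-priority classes.

/*
 * Sketch of the round-robin class scan: begin just past the priority
 * that was issued last, so no single class monopolizes the min_active
 * pass.  NUM_CLASSES and class_eligible() are placeholders.
 */
#include <stdio.h>
#include <stdbool.h>

#define	NUM_CLASSES	6

static bool
class_eligible(int p)		/* stand-in for the active < min check */
{
	return (p == 1 || p == 4);
}

static int
next_class(int last_prio)
{
	for (int n = 0; n < NUM_CLASSES; n++) {
		int p = (last_prio + n + 1) % NUM_CLASSES;
		if (class_eligible(p))
			return (p);
	}
	return (-1);
}

int
main(void)
{
	/* Scans 5, 0, 1, ... and picks 1 even though 4 is also eligible. */
	printf("next class: %d\n", next_class(4));
	return (0);
}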
@@ -493,6 +547,20 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
}
}
+static boolean_t
+vdev_queue_is_interactive(zio_priority_t p)
+{
+ switch (p) {
+ case ZIO_PRIORITY_SCRUB:
+ case ZIO_PRIORITY_REMOVAL:
+ case ZIO_PRIORITY_INITIALIZING:
+ case ZIO_PRIORITY_REBUILD:
+ return (B_FALSE);
+ default:
+ return (B_TRUE);
+ }
+}
+
static void
vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
{
@@ -502,6 +570,12 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
vq->vq_class[zio->io_priority].vqc_active++;
+ if (vdev_queue_is_interactive(zio->io_priority)) {
+ if (++vq->vq_ia_active == 1)
+ vq->vq_nia_credit = 1;
+ } else if (vq->vq_ia_active > 0) {
+ vq->vq_nia_credit--;
+ }
avl_add(&vq->vq_active_tree, zio);
if (shk->kstat != NULL) {
@@ -520,6 +594,13 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
vq->vq_class[zio->io_priority].vqc_active--;
+ if (vdev_queue_is_interactive(zio->io_priority)) {
+ if (--vq->vq_ia_active == 0)
+ vq->vq_nia_credit = 0;
+ else
+ vq->vq_nia_credit = zfs_vdev_nia_credit;
+ } else if (vq->vq_ia_active == 0)
+ vq->vq_nia_credit++;
avl_remove(&vq->vq_active_tree, zio);
if (shk->kstat != NULL) {
@@ -593,6 +674,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
return (NULL);
+ /*
+ * I/Os to distributed spares are directly dispatched to the dRAID
+ * leaf vdevs for aggregation. See the comment at the end of the
+ * zio_vdev_io_start() function.
+ */
+ ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops);
+
first = last = zio;
if (zio->io_type == ZIO_TYPE_READ)
@@ -1065,6 +1153,12 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
"Min active rebuild I/Os per vdev");
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW,
+ "Number of non-interactive I/Os to allow in sequence");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW,
+ "Number of non-interactive I/Os before _max_active");
+
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
"Queue depth percentage for each top-level vdev");
/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
index 47312e02f70a..989b90dc2635 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
@@ -35,6 +35,7 @@
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
+#include <sys/vdev_draid.h>
#ifdef ZFS_DEBUG
#include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
@@ -134,25 +135,51 @@
VDEV_RAIDZ_64MUL_2((x), mask); \
}
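The VDEV_RAIDZ_64MUL_2()/_4() macros referenced above multiply packed GF(2^8) field elements by 2 and 4, eight bytes at a time. Below is a byte-at-a-time sketch of the same field operation, written from the 0x1d reduction term those macros use elsewhere in this file (polynomial x^8 + x^4 + x^3 + x^2 + 1); it illustrates the math only and is not the kernel macro itself.

/*
 * Byte-wise GF(2^8) multiply-by-2 (and by 4) with the 0x1d reduction
 * term: shift left, and fold the overflow bit back in by XOR.
 */
#include <stdint.h>
#include <stdio.h>

static uint8_t
gf256_mul2(uint8_t b)
{
	/* Shift left; if the high bit fell off, reduce modulo 0x11d. */
	return ((uint8_t)(b << 1) ^ ((b & 0x80) ? 0x1d : 0x00));
}

static uint8_t
gf256_mul4(uint8_t b)
{
	return (gf256_mul2(gf256_mul2(b)));
}

int
main(void)
{
	/* 0x80 * 2 wraps around and reduces. */
	printf("0x80 * 2 = 0x%02x\n", gf256_mul2(0x80));
	printf("0x53 * 4 = 0x%02x\n", gf256_mul4(0x53));
	return (0);
}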
-void
-vdev_raidz_map_free(raidz_map_t *rm)
+static void
+vdev_raidz_row_free(raidz_row_t *rr)
{
int c;
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- abd_free(rm->rm_col[c].rc_abd);
+ for (c = 0; c < rr->rr_firstdatacol && c < rr->rr_cols; c++) {
+ abd_free(rr->rr_col[c].rc_abd);
- if (rm->rm_col[c].rc_gdata != NULL)
- abd_free(rm->rm_col[c].rc_gdata);
+ if (rr->rr_col[c].rc_gdata != NULL) {
+ abd_free(rr->rr_col[c].rc_gdata);
+ }
+ if (rr->rr_col[c].rc_orig_data != NULL) {
+ zio_buf_free(rr->rr_col[c].rc_orig_data,
+ rr->rr_col[c].rc_size);
+ }
}
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ if (rr->rr_col[c].rc_size != 0) {
+ if (abd_is_gang(rr->rr_col[c].rc_abd))
+ abd_free(rr->rr_col[c].rc_abd);
+ else
+ abd_put(rr->rr_col[c].rc_abd);
+ }
+ if (rr->rr_col[c].rc_orig_data != NULL) {
+ zio_buf_free(rr->rr_col[c].rc_orig_data,
+ rr->rr_col[c].rc_size);
+ }
+ }
+
+ if (rr->rr_abd_copy != NULL)
+ abd_free(rr->rr_abd_copy);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
- abd_put(rm->rm_col[c].rc_abd);
+ if (rr->rr_abd_empty != NULL)
+ abd_free(rr->rr_abd_empty);
+
+ kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
+}
- if (rm->rm_abd_copy != NULL)
- abd_free(rm->rm_abd_copy);
+void
+vdev_raidz_map_free(raidz_map_t *rm)
+{
+ for (int i = 0; i < rm->rm_nrows; i++)
+ vdev_raidz_row_free(rm->rm_row[i]);
- kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
+ kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}
static void
@@ -161,10 +188,11 @@ vdev_raidz_map_free_vsd(zio_t *zio)
raidz_map_t *rm = zio->io_vsd;
ASSERT0(rm->rm_freed);
- rm->rm_freed = 1;
+ rm->rm_freed = B_TRUE;
- if (rm->rm_reports == 0)
+ if (rm->rm_reports == 0) {
vdev_raidz_map_free(rm);
+ }
}
/*ARGSUSED*/
@@ -175,7 +203,7 @@ vdev_raidz_cksum_free(void *arg, size_t ignored)
ASSERT3U(rm->rm_reports, >, 0);
- if (--rm->rm_reports == 0 && rm->rm_freed != 0)
+ if (--rm->rm_reports == 0 && rm->rm_freed)
vdev_raidz_map_free(rm);
}
@@ -186,77 +214,79 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
const size_t c = zcr->zcr_cbinfo;
size_t x, offset;
- const abd_t *good = NULL;
- const abd_t *bad = rm->rm_col[c].rc_abd;
-
if (good_data == NULL) {
zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
return;
}
- if (c < rm->rm_firstdatacol) {
+ ASSERT3U(rm->rm_nrows, ==, 1);
+ raidz_row_t *rr = rm->rm_row[0];
+
+ const abd_t *good = NULL;
+ const abd_t *bad = rr->rr_col[c].rc_abd;
+
+ if (c < rr->rr_firstdatacol) {
/*
* The first time through, calculate the parity blocks for
* the good data (this relies on the fact that the good
* data never changes for a given logical ZIO)
*/
- if (rm->rm_col[0].rc_gdata == NULL) {
+ if (rr->rr_col[0].rc_gdata == NULL) {
abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
/*
- * Set up the rm_col[]s to generate the parity for
+ * Set up the rr_col[]s to generate the parity for
* good_data, first saving the parity bufs and
* replacing them with buffers to hold the result.
*/
- for (x = 0; x < rm->rm_firstdatacol; x++) {
- bad_parity[x] = rm->rm_col[x].rc_abd;
- rm->rm_col[x].rc_abd =
- rm->rm_col[x].rc_gdata =
- abd_alloc_sametype(rm->rm_col[x].rc_abd,
- rm->rm_col[x].rc_size);
+ for (x = 0; x < rr->rr_firstdatacol; x++) {
+ bad_parity[x] = rr->rr_col[x].rc_abd;
+ rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata =
+ abd_alloc_sametype(rr->rr_col[x].rc_abd,
+ rr->rr_col[x].rc_size);
}
/* fill in the data columns from good_data */
offset = 0;
- for (; x < rm->rm_cols; x++) {
- abd_put(rm->rm_col[x].rc_abd);
+ for (; x < rr->rr_cols; x++) {
+ abd_put(rr->rr_col[x].rc_abd);
- rm->rm_col[x].rc_abd =
+ rr->rr_col[x].rc_abd =
abd_get_offset_size((abd_t *)good_data,
- offset, rm->rm_col[x].rc_size);
- offset += rm->rm_col[x].rc_size;
+ offset, rr->rr_col[x].rc_size);
+ offset += rr->rr_col[x].rc_size;
}
/*
* Construct the parity from the good data.
*/
- vdev_raidz_generate_parity(rm);
+ vdev_raidz_generate_parity_row(rm, rr);
/* restore everything back to its original state */
- for (x = 0; x < rm->rm_firstdatacol; x++)
- rm->rm_col[x].rc_abd = bad_parity[x];
+ for (x = 0; x < rr->rr_firstdatacol; x++)
+ rr->rr_col[x].rc_abd = bad_parity[x];
offset = 0;
- for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
- abd_put(rm->rm_col[x].rc_abd);
- rm->rm_col[x].rc_abd = abd_get_offset_size(
- rm->rm_abd_copy, offset,
- rm->rm_col[x].rc_size);
- offset += rm->rm_col[x].rc_size;
+ for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) {
+ abd_put(rr->rr_col[x].rc_abd);
+ rr->rr_col[x].rc_abd = abd_get_offset_size(
+ rr->rr_abd_copy, offset,
+ rr->rr_col[x].rc_size);
+ offset += rr->rr_col[x].rc_size;
}
}
- ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
- good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0,
- rm->rm_col[c].rc_size);
+ ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL);
+ good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0,
+ rr->rr_col[c].rc_size);
} else {
/* adjust good_data to point at the start of our column */
offset = 0;
- for (x = rm->rm_firstdatacol; x < c; x++)
- offset += rm->rm_col[x].rc_size;
+ for (x = rr->rr_firstdatacol; x < c; x++)
+ offset += rr->rr_col[x].rc_size;
good = abd_get_offset_size((abd_t *)good_data, offset,
- rm->rm_col[c].rc_size);
+ rr->rr_col[c].rc_size);
}
/* we drop the ereport if it ends up that the data was good */
@@ -274,10 +304,7 @@ static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
size_t c = (size_t)(uintptr_t)arg;
- size_t offset;
-
raidz_map_t *rm = zio->io_vsd;
- size_t size;
/* set up the report and bump the refcount */
zcr->zcr_cbdata = rm;
@@ -287,8 +314,9 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
rm->rm_reports++;
ASSERT3U(rm->rm_reports, >, 0);
+ ASSERT3U(rm->rm_nrows, ==, 1);
- if (rm->rm_abd_copy != NULL)
+ if (rm->rm_row[0]->rr_abd_copy != NULL)
return;
/*
@@ -299,26 +327,30 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
* Our parity data is already in separate buffers, so there's no need
* to copy them.
*/
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ size_t offset = 0;
+ size_t size = 0;
- size = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
- size += rm->rm_col[c].rc_size;
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++)
+ size += rr->rr_col[c].rc_size;
- rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE);
+ rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE);
- for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- raidz_col_t *col = &rm->rm_col[c];
- abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset,
- col->rc_size);
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
+ abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy,
+ offset, col->rc_size);
- abd_copy(tmp, col->rc_abd, col->rc_size);
+ abd_copy(tmp, col->rc_abd, col->rc_size);
- abd_put(col->rc_abd);
- col->rc_abd = tmp;
+ abd_put(col->rc_abd);
+ col->rc_abd = tmp;
- offset += col->rc_size;
+ offset += col->rc_size;
+ }
+ ASSERT3U(offset, ==, size);
}
- ASSERT3U(offset, ==, size);
}
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
@@ -337,7 +369,7 @@ noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
uint64_t nparity)
{
- raidz_map_t *rm;
+ raidz_row_t *rr;
/* The starting RAIDZ (parent) vdev sector of the block. */
uint64_t b = zio->io_offset >> ashift;
/* The zio's size in units of the vdev's minimum sector size. */
@@ -349,6 +381,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
uint64_t off = 0;
+ raidz_map_t *rm =
+ kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
+ rm->rm_nrows = 1;
+
/*
* "Quotient": The number of data sectors for this stripe on all but
* the "big column" child vdevs that also contain "remainder" data.
@@ -370,8 +406,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
*/
tot = s + nparity * (q + (r == 0 ? 0 : 1));
- /* acols: The columns that will be accessed. */
- /* scols: The columns that will be accessed or skipped. */
+ /*
+ * acols: The columns that will be accessed.
+ * scols: The columns that will be accessed or skipped.
+ */
if (q == 0) {
/* Our I/O request doesn't span all child vdevs. */
acols = bc;
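A worked example of the geometry above may help. The definitions of q, r and bc below are reconstructed from the elided part of vdev_raidz_map_alloc() as recalled from the surrounding source, so treat the sketch as informal; only tot and the q == 0 case appear verbatim in the hunk.

/*
 * Worked example of the raidz map geometry for a 32K write on a
 * 6-wide raidz1 with 4K sectors.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t ashift = 12;			/* 4K sectors */
	uint64_t dcols = 6, nparity = 1;	/* 6-wide raidz1 */
	uint64_t io_size = 32768;		/* 32K write */

	uint64_t s = io_size >> ashift;		/* data sectors: 8 */
	uint64_t q = s / (dcols - nparity);	/* per non-big column: 1 */
	uint64_t r = s - q * (dcols - nparity);	/* remainder sectors: 3 */
	uint64_t bc = (r == 0 ? 0 : r + nparity);	/* q+1 columns: 4 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));	/* 10 */
	uint64_t acols = (q == 0 ? bc : dcols);	/* columns accessed: 6 */

	printf("s=%llu q=%llu r=%llu bc=%llu tot=%llu acols=%llu\n",
	    (unsigned long long)s, (unsigned long long)q,
	    (unsigned long long)r, (unsigned long long)bc,
	    (unsigned long long)tot, (unsigned long long)acols);
	return (0);
}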
@@ -383,65 +421,70 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
ASSERT3U(acols, <=, scols);
- rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
-
- rm->rm_cols = acols;
- rm->rm_scols = scols;
- rm->rm_bigcols = bc;
- rm->rm_skipstart = bc;
- rm->rm_missingdata = 0;
- rm->rm_missingparity = 0;
- rm->rm_firstdatacol = nparity;
- rm->rm_abd_copy = NULL;
- rm->rm_reports = 0;
- rm->rm_freed = 0;
- rm->rm_ecksuminjected = 0;
+ rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP);
+ rm->rm_row[0] = rr;
+
+ rr->rr_cols = acols;
+ rr->rr_scols = scols;
+ rr->rr_bigcols = bc;
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+ rr->rr_firstdatacol = nparity;
+ rr->rr_abd_copy = NULL;
+ rr->rr_abd_empty = NULL;
+ rr->rr_nempty = 0;
+#ifdef ZFS_DEBUG
+ rr->rr_offset = zio->io_offset;
+ rr->rr_size = zio->io_size;
+#endif
asize = 0;
for (c = 0; c < scols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
col = f + c;
coff = o;
if (col >= dcols) {
col -= dcols;
coff += 1ULL << ashift;
}
- rm->rm_col[c].rc_devidx = col;
- rm->rm_col[c].rc_offset = coff;
- rm->rm_col[c].rc_abd = NULL;
- rm->rm_col[c].rc_gdata = NULL;
- rm->rm_col[c].rc_error = 0;
- rm->rm_col[c].rc_tried = 0;
- rm->rm_col[c].rc_skipped = 0;
+ rc->rc_devidx = col;
+ rc->rc_offset = coff;
+ rc->rc_abd = NULL;
+ rc->rc_gdata = NULL;
+ rc->rc_orig_data = NULL;
+ rc->rc_error = 0;
+ rc->rc_tried = 0;
+ rc->rc_skipped = 0;
+ rc->rc_repair = 0;
+ rc->rc_need_orig_restore = B_FALSE;
if (c >= acols)
- rm->rm_col[c].rc_size = 0;
+ rc->rc_size = 0;
else if (c < bc)
- rm->rm_col[c].rc_size = (q + 1) << ashift;
+ rc->rc_size = (q + 1) << ashift;
else
- rm->rm_col[c].rc_size = q << ashift;
+ rc->rc_size = q << ashift;
- asize += rm->rm_col[c].rc_size;
+ asize += rc->rc_size;
}
ASSERT3U(asize, ==, tot << ashift);
- rm->rm_asize = roundup(asize, (nparity + 1) << ashift);
rm->rm_nskip = roundup(tot, nparity + 1) - tot;
- ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift);
- ASSERT3U(rm->rm_nskip, <=, nparity);
+ rm->rm_skipstart = bc;
- for (c = 0; c < rm->rm_firstdatacol; c++)
- rm->rm_col[c].rc_abd =
- abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);
+ for (c = 0; c < rr->rr_firstdatacol; c++)
+ rr->rr_col[c].rc_abd =
+ abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
- rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0,
- rm->rm_col[c].rc_size);
- off = rm->rm_col[c].rc_size;
+ rr->rr_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0,
+ rr->rr_col[c].rc_size);
+ off = rr->rr_col[c].rc_size;
for (c = c + 1; c < acols; c++) {
- rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off,
- rm->rm_col[c].rc_size);
- off += rm->rm_col[c].rc_size;
+ raidz_col_t *rc = &rr->rr_col[c];
+ rc->rc_abd = abd_get_offset_size(zio->io_abd, off, rc->rc_size);
+ off += rc->rc_size;
}
/*
@@ -464,24 +507,21 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
* skip the first column since at least one data and one parity
* column must appear in each row.
*/
- ASSERT(rm->rm_cols >= 2);
- ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
+ ASSERT(rr->rr_cols >= 2);
+ ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
- if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
- devidx = rm->rm_col[0].rc_devidx;
- o = rm->rm_col[0].rc_offset;
- rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
- rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
- rm->rm_col[1].rc_devidx = devidx;
- rm->rm_col[1].rc_offset = o;
+ if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+ devidx = rr->rr_col[0].rc_devidx;
+ o = rr->rr_col[0].rc_offset;
+ rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
+ rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
+ rr->rr_col[1].rc_devidx = devidx;
+ rr->rr_col[1].rc_offset = o;
if (rm->rm_skipstart == 0)
rm->rm_skipstart = 1;
}
- zio->io_vsd = rm;
- zio->io_vsd_ops = &vdev_raidz_vsd_ops;
-
/* init RAIDZ parity ops */
rm->rm_ops = vdev_raidz_math_get_ops();
@@ -550,50 +590,43 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private)
}
static void
-vdev_raidz_generate_parity_p(raidz_map_t *rm)
+vdev_raidz_generate_parity_p(raidz_row_t *rr)
{
- uint64_t *p;
- int c;
- abd_t *src;
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_abd;
- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
- if (c == rm->rm_firstdatacol) {
- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+ if (c == rr->rr_firstdatacol) {
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
} else {
struct pqr_struct pqr = { p, NULL, NULL };
- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
vdev_raidz_p_func, &pqr);
}
}
}
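For readers unfamiliar with the parity math, here is a self-contained demonstration of what vdev_raidz_generate_parity_p() computes and what vdev_raidz_reconstruct_p() later inverts, using plain arrays in place of the ABD buffers.

/*
 * Minimal demonstration of single-parity (P) RAID-Z math: P is the XOR
 * of all data columns, and a single missing data column is recovered by
 * XORing P with the surviving columns.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	COLS	3
#define	SIZE	8

int
main(void)
{
	uint8_t data[COLS][SIZE] = {
		{ 1, 2, 3, 4, 5, 6, 7, 8 },
		{ 9, 8, 7, 6, 5, 4, 3, 2 },
		{ 0, 1, 0, 1, 0, 1, 0, 1 },
	};
	uint8_t p[SIZE], rebuilt[SIZE];

	/* Generate P, as the parity_p routine does for each row. */
	memcpy(p, data[0], SIZE);
	for (int c = 1; c < COLS; c++)
		for (int i = 0; i < SIZE; i++)
			p[i] ^= data[c][i];

	/* Pretend column 1 is lost; rebuild it from P and the others. */
	memcpy(rebuilt, p, SIZE);
	for (int c = 0; c < COLS; c++) {
		if (c == 1)
			continue;
		for (int i = 0; i < SIZE; i++)
			rebuilt[i] ^= data[c][i];
	}

	printf("column 1 recovered: %s\n",
	    memcmp(rebuilt, data[1], SIZE) == 0 ? "yes" : "no");
	return (0);
}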
static void
-vdev_raidz_generate_parity_pq(raidz_map_t *rm)
+vdev_raidz_generate_parity_pq(raidz_row_t *rr)
{
- uint64_t *p, *q, pcnt, ccnt, mask, i;
- int c;
- abd_t *src;
-
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_Q].rc_size);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_abd;
- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
- ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
+ uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
- if (c == rm->rm_firstdatacol) {
+ if (c == rr->rr_firstdatacol) {
ASSERT(ccnt == pcnt || ccnt == 0);
- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
- (void) memcpy(q, p, rm->rm_col[c].rc_size);
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
+ (void) memcpy(q, p, rr->rr_col[c].rc_size);
- for (i = ccnt; i < pcnt; i++) {
+ for (uint64_t i = ccnt; i < pcnt; i++) {
p[i] = 0;
q[i] = 0;
}
@@ -601,14 +634,15 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm)
struct pqr_struct pqr = { p, q, NULL };
ASSERT(ccnt <= pcnt);
- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
vdev_raidz_pq_func, &pqr);
/*
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
- for (i = ccnt; i < pcnt; i++) {
+ uint64_t mask;
+ for (uint64_t i = ccnt; i < pcnt; i++) {
VDEV_RAIDZ_64MUL_2(q[i], mask);
}
}
@@ -616,33 +650,29 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm)
}
static void
-vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
+vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
{
- uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
- int c;
- abd_t *src;
-
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
- ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
- rm->rm_col[VDEV_RAIDZ_R].rc_size);
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
+ uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_Q].rc_size);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_R].rc_size);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_abd;
- p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
- r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
- ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
+ uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
- if (c == rm->rm_firstdatacol) {
+ if (c == rr->rr_firstdatacol) {
ASSERT(ccnt == pcnt || ccnt == 0);
- abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
- (void) memcpy(q, p, rm->rm_col[c].rc_size);
- (void) memcpy(r, p, rm->rm_col[c].rc_size);
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
+ (void) memcpy(q, p, rr->rr_col[c].rc_size);
+ (void) memcpy(r, p, rr->rr_col[c].rc_size);
- for (i = ccnt; i < pcnt; i++) {
+ for (uint64_t i = ccnt; i < pcnt; i++) {
p[i] = 0;
q[i] = 0;
r[i] = 0;
@@ -651,14 +681,15 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
struct pqr_struct pqr = { p, q, r };
ASSERT(ccnt <= pcnt);
- (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
vdev_raidz_pqr_func, &pqr);
/*
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
- for (i = ccnt; i < pcnt; i++) {
+ uint64_t mask;
+ for (uint64_t i = ccnt; i < pcnt; i++) {
VDEV_RAIDZ_64MUL_2(q[i], mask);
VDEV_RAIDZ_64MUL_4(r[i], mask);
}
@@ -671,27 +702,38 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
* parity columns available.
*/
void
-vdev_raidz_generate_parity(raidz_map_t *rm)
+vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
{
+ ASSERT3U(rr->rr_cols, !=, 0);
+
/* Generate using the new math implementation */
- if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL)
+ if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
return;
- switch (rm->rm_firstdatacol) {
+ switch (rr->rr_firstdatacol) {
case 1:
- vdev_raidz_generate_parity_p(rm);
+ vdev_raidz_generate_parity_p(rr);
break;
case 2:
- vdev_raidz_generate_parity_pq(rm);
+ vdev_raidz_generate_parity_pq(rr);
break;
case 3:
- vdev_raidz_generate_parity_pqr(rm);
+ vdev_raidz_generate_parity_pqr(rr);
break;
default:
cmn_err(CE_PANIC, "invalid RAID-Z configuration");
}
}
+void
+vdev_raidz_generate_parity(raidz_map_t *rm)
+{
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ vdev_raidz_generate_parity_row(rm, rr);
+ }
+}
+
/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
@@ -809,30 +851,27 @@ vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
}
static int
-vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
+vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
{
int x = tgts[0];
- int c;
abd_t *dst, *src;
- ASSERT(ntgts == 1);
- ASSERT(x >= rm->rm_firstdatacol);
- ASSERT(x < rm->rm_cols);
+ ASSERT3U(ntgts, ==, 1);
+ ASSERT3U(x, >=, rr->rr_firstdatacol);
+ ASSERT3U(x, <, rr->rr_cols);
- ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
- ASSERT(rm->rm_col[x].rc_size > 0);
+ ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
- src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
- dst = rm->rm_col[x].rc_abd;
+ src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
- abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size);
+ abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- uint64_t size = MIN(rm->rm_col[x].rc_size,
- rm->rm_col[c].rc_size);
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ uint64_t size = MIN(rr->rr_col[x].rc_size,
+ rr->rr_col[c].rc_size);
- src = rm->rm_col[c].rc_abd;
- dst = rm->rm_col[x].rc_abd;
+ src = rr->rr_col[c].rc_abd;
if (c == x)
continue;
@@ -845,7 +884,7 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
}
static int
-vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
+vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
{
int x = tgts[0];
int c, exp;
@@ -853,44 +892,44 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
ASSERT(ntgts == 1);
- ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
- rm->rm_col[c].rc_size);
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
+ rr->rr_col[c].rc_size);
- src = rm->rm_col[c].rc_abd;
- dst = rm->rm_col[x].rc_abd;
+ src = rr->rr_col[c].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
- if (c == rm->rm_firstdatacol) {
+ if (c == rr->rr_firstdatacol) {
abd_copy(dst, src, size);
- if (rm->rm_col[x].rc_size > size)
+ if (rr->rr_col[x].rc_size > size) {
abd_zero_off(dst, size,
- rm->rm_col[x].rc_size - size);
-
+ rr->rr_col[x].rc_size - size);
+ }
} else {
- ASSERT3U(size, <=, rm->rm_col[x].rc_size);
+ ASSERT3U(size, <=, rr->rr_col[x].rc_size);
(void) abd_iterate_func2(dst, src, 0, 0, size,
vdev_raidz_reconst_q_pre_func, NULL);
(void) abd_iterate_func(dst,
- size, rm->rm_col[x].rc_size - size,
+ size, rr->rr_col[x].rc_size - size,
vdev_raidz_reconst_q_pre_tail_func, NULL);
}
}
- src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
- dst = rm->rm_col[x].rc_abd;
- exp = 255 - (rm->rm_cols - 1 - x);
+ src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
+ exp = 255 - (rr->rr_cols - 1 - x);
struct reconst_q_struct rq = { abd_to_buf(src), exp };
- (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
+ (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
vdev_raidz_reconst_q_post_func, &rq);
return (1 << VDEV_RAIDZ_Q);
}
static int
-vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
+vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
{
uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
abd_t *pdata, *qdata;
@@ -901,10 +940,10 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
ASSERT(ntgts == 2);
ASSERT(x < y);
- ASSERT(x >= rm->rm_firstdatacol);
- ASSERT(y < rm->rm_cols);
+ ASSERT(x >= rr->rr_firstdatacol);
+ ASSERT(y < rr->rr_cols);
- ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
+ ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
/*
* Move the parity data aside -- we're going to compute parity as
@@ -913,29 +952,29 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
* parity so we make those columns appear to be full of zeros by
* setting their lengths to zero.
*/
- pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
- qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
- xsize = rm->rm_col[x].rc_size;
- ysize = rm->rm_col[y].rc_size;
+ pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
+ qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
+ xsize = rr->rr_col[x].rc_size;
+ ysize = rr->rr_col[y].rc_size;
- rm->rm_col[VDEV_RAIDZ_P].rc_abd =
- abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
- rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
- abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
- rm->rm_col[x].rc_size = 0;
- rm->rm_col[y].rc_size = 0;
+ rr->rr_col[VDEV_RAIDZ_P].rc_abd =
+ abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
+ rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
+ abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
+ rr->rr_col[x].rc_size = 0;
+ rr->rr_col[y].rc_size = 0;
- vdev_raidz_generate_parity_pq(rm);
+ vdev_raidz_generate_parity_pq(rr);
- rm->rm_col[x].rc_size = xsize;
- rm->rm_col[y].rc_size = ysize;
+ rr->rr_col[x].rc_size = xsize;
+ rr->rr_col[y].rc_size = ysize;
p = abd_to_buf(pdata);
q = abd_to_buf(qdata);
- pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
- xd = rm->rm_col[x].rc_abd;
- yd = rm->rm_col[y].rc_abd;
+ pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ xd = rr->rr_col[x].rc_abd;
+ yd = rr->rr_col[y].rc_abd;
/*
* We now have:
@@ -953,7 +992,7 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
*/
a = vdev_raidz_pow2[255 + x - y];
- b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
+ b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
tmp = 255 - vdev_raidz_log2[a ^ 1];
aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
@@ -967,14 +1006,14 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
(void) abd_iterate_func(xd, ysize, xsize - ysize,
vdev_raidz_reconst_pq_tail_func, &rpq);
- abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
- abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
/*
* Restore the saved parity data.
*/
- rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
- rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
+ rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
+ rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
@@ -1134,13 +1173,13 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
/* END CSTYLED */
static void
-vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
+vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
uint8_t **rows)
{
int i, j;
int pow;
- ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
+ ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
/*
* Fill in the missing rows of interest.
@@ -1164,7 +1203,7 @@ vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
}
static void
-vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
+vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
int i, j, ii, jj;
@@ -1176,10 +1215,10 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
* correspond to data columns.
*/
for (i = 0; i < nmissing; i++) {
- ASSERT3S(used[i], <, rm->rm_firstdatacol);
+ ASSERT3S(used[i], <, rr->rr_firstdatacol);
}
for (; i < n; i++) {
- ASSERT3S(used[i], >=, rm->rm_firstdatacol);
+ ASSERT3S(used[i], >=, rr->rr_firstdatacol);
}
/*
@@ -1196,8 +1235,8 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
*/
for (i = 0; i < nmissing; i++) {
for (j = nmissing; j < n; j++) {
- ASSERT3U(used[j], >=, rm->rm_firstdatacol);
- jj = used[j] - rm->rm_firstdatacol;
+ ASSERT3U(used[j], >=, rr->rr_firstdatacol);
+ jj = used[j] - rr->rr_firstdatacol;
ASSERT3S(jj, <, n);
invrows[i][j] = rows[i][jj];
rows[i][jj] = 0;
@@ -1258,7 +1297,7 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
}
static void
-vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
+vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
int *missing, uint8_t **invrows, const uint8_t *used)
{
int i, j, x, cc, c;
@@ -1290,22 +1329,24 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
for (i = 0; i < n; i++) {
c = used[i];
- ASSERT3U(c, <, rm->rm_cols);
+ ASSERT3U(c, <, rr->rr_cols);
- src = abd_to_buf(rm->rm_col[c].rc_abd);
- ccount = rm->rm_col[c].rc_size;
+ ccount = rr->rr_col[c].rc_size;
+ ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
+ if (ccount == 0)
+ continue;
+ src = abd_to_buf(rr->rr_col[c].rc_abd);
for (j = 0; j < nmissing; j++) {
- cc = missing[j] + rm->rm_firstdatacol;
- ASSERT3U(cc, >=, rm->rm_firstdatacol);
- ASSERT3U(cc, <, rm->rm_cols);
+ cc = missing[j] + rr->rr_firstdatacol;
+ ASSERT3U(cc, >=, rr->rr_firstdatacol);
+ ASSERT3U(cc, <, rr->rr_cols);
ASSERT3U(cc, !=, c);
- dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
- dcount[j] = rm->rm_col[cc].rc_size;
+ dcount[j] = rr->rr_col[cc].rc_size;
+ if (dcount[j] != 0)
+ dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
}
- ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
-
for (x = 0; x < ccount; x++, src++) {
if (*src != 0)
log = vdev_raidz_log2[*src];
@@ -1334,16 +1375,14 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
}
static int
-vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
+vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
{
int n, i, c, t, tt;
int nmissing_rows;
int missing_rows[VDEV_RAIDZ_MAXPARITY];
int parity_map[VDEV_RAIDZ_MAXPARITY];
-
uint8_t *p, *pp;
size_t psize;
-
uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
uint8_t *used;
@@ -1354,30 +1393,39 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
/*
* Matrix reconstruction can't use scatter ABDs yet, so we allocate
- * temporary linear ABDs.
+ * temporary linear ABDs if any non-linear ABDs are found.
*/
- if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
- bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
-
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- raidz_col_t *col = &rm->rm_col[c];
+ for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
+ if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
+ bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
+ KM_PUSHPAGE);
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
+
+ bufs[c] = col->rc_abd;
+ if (bufs[c] != NULL) {
+ col->rc_abd = abd_alloc_linear(
+ col->rc_size, B_TRUE);
+ abd_copy(col->rc_abd, bufs[c],
+ col->rc_size);
+ }
+ }
- bufs[c] = col->rc_abd;
- col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
- abd_copy(col->rc_abd, bufs[c], col->rc_size);
+ break;
}
}
- n = rm->rm_cols - rm->rm_firstdatacol;
+ n = rr->rr_cols - rr->rr_firstdatacol;
/*
* Figure out which data columns are missing.
*/
nmissing_rows = 0;
for (t = 0; t < ntgts; t++) {
- if (tgts[t] >= rm->rm_firstdatacol) {
+ if (tgts[t] >= rr->rr_firstdatacol) {
missing_rows[nmissing_rows++] =
- tgts[t] - rm->rm_firstdatacol;
+ tgts[t] - rr->rr_firstdatacol;
}
}
@@ -1387,7 +1435,7 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
*/
for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
ASSERT(tt < ntgts);
- ASSERT(c < rm->rm_firstdatacol);
+ ASSERT(c < rr->rr_firstdatacol);
/*
* Skip any targeted parity columns.
@@ -1422,9 +1470,9 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
used[i] = parity_map[i];
}
- for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
if (tt < nmissing_rows &&
- c == missing_rows[tt] + rm->rm_firstdatacol) {
+ c == missing_rows[tt] + rr->rr_firstdatacol) {
tt++;
continue;
}
@@ -1437,18 +1485,18 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
/*
* Initialize the interesting rows of the matrix.
*/
- vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
+ vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
/*
* Invert the matrix.
*/
- vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
+ vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
invrows, used);
/*
* Reconstruct the missing data using the generated matrix.
*/
- vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
+ vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
invrows, used);
kmem_free(p, psize);
@@ -1457,21 +1505,24 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
* copy back from temporary linear abds and free them
*/
if (bufs) {
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- raidz_col_t *col = &rm->rm_col[c];
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
- abd_copy(bufs[c], col->rc_abd, col->rc_size);
- abd_free(col->rc_abd);
+ if (bufs[c] != NULL) {
+ abd_copy(bufs[c], col->rc_abd, col->rc_size);
+ abd_free(col->rc_abd);
+ }
col->rc_abd = bufs[c];
}
- kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
+ kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
}
return (code);
}
-int
-vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
+static int
+vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
+ const int *t, int nt)
{
int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
int ntgts;
@@ -1480,26 +1531,19 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
int nbadparity, nbaddata;
int parity_valid[VDEV_RAIDZ_MAXPARITY];
- /*
- * The tgts list must already be sorted.
- */
- for (i = 1; i < nt; i++) {
- ASSERT(t[i] > t[i - 1]);
- }
-
- nbadparity = rm->rm_firstdatacol;
- nbaddata = rm->rm_cols - nbadparity;
+ nbadparity = rr->rr_firstdatacol;
+ nbaddata = rr->rr_cols - nbadparity;
ntgts = 0;
- for (i = 0, c = 0; c < rm->rm_cols; c++) {
- if (c < rm->rm_firstdatacol)
+ for (i = 0, c = 0; c < rr->rr_cols; c++) {
+ if (c < rr->rr_firstdatacol)
parity_valid[c] = B_FALSE;
if (i < nt && c == t[i]) {
tgts[ntgts++] = c;
i++;
- } else if (rm->rm_col[c].rc_error != 0) {
+ } else if (rr->rr_col[c].rc_error != 0) {
tgts[ntgts++] = c;
- } else if (c >= rm->rm_firstdatacol) {
+ } else if (c >= rr->rr_firstdatacol) {
nbaddata--;
} else {
parity_valid[c] = B_TRUE;
@@ -1514,7 +1558,7 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
dt = &tgts[nbadparity];
/* Reconstruct using the new math implementation */
- ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata);
+ ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
if (ret != RAIDZ_ORIGINAL_IMPL)
return (ret);
@@ -1524,29 +1568,29 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
switch (nbaddata) {
case 1:
if (parity_valid[VDEV_RAIDZ_P])
- return (vdev_raidz_reconstruct_p(rm, dt, 1));
+ return (vdev_raidz_reconstruct_p(rr, dt, 1));
- ASSERT(rm->rm_firstdatacol > 1);
+ ASSERT(rr->rr_firstdatacol > 1);
if (parity_valid[VDEV_RAIDZ_Q])
- return (vdev_raidz_reconstruct_q(rm, dt, 1));
+ return (vdev_raidz_reconstruct_q(rr, dt, 1));
- ASSERT(rm->rm_firstdatacol > 2);
+ ASSERT(rr->rr_firstdatacol > 2);
break;
case 2:
- ASSERT(rm->rm_firstdatacol > 1);
+ ASSERT(rr->rr_firstdatacol > 1);
if (parity_valid[VDEV_RAIDZ_P] &&
parity_valid[VDEV_RAIDZ_Q])
- return (vdev_raidz_reconstruct_pq(rm, dt, 2));
+ return (vdev_raidz_reconstruct_pq(rr, dt, 2));
- ASSERT(rm->rm_firstdatacol > 2);
+ ASSERT(rr->rr_firstdatacol > 2);
break;
}
- code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
+ code = vdev_raidz_reconstruct_general(rr, tgts, ntgts);
ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
ASSERT(code > 0);
return (code);
@@ -1556,8 +1600,8 @@ static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
- vdev_t *cvd;
- uint64_t nparity = vd->vdev_nparity;
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ uint64_t nparity = vdrz->vd_nparity;
int c;
int lasterror = 0;
int numerrors = 0;
@@ -1573,7 +1617,7 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
vdev_open_children(vd);
for (c = 0; c < vd->vdev_children; c++) {
- cvd = vd->vdev_child[c];
+ vdev_t *cvd = vd->vdev_child[c];
if (cvd->vdev_open_error != 0) {
lasterror = cvd->vdev_open_error;
@@ -1602,19 +1646,20 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
static void
vdev_raidz_close(vdev_t *vd)
{
- int c;
-
- for (c = 0; c < vd->vdev_children; c++)
- vdev_close(vd->vdev_child[c]);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (vd->vdev_child[c] != NULL)
+ vdev_close(vd->vdev_child[c]);
+ }
}
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
uint64_t asize;
uint64_t ashift = vd->vdev_top->vdev_ashift;
- uint64_t cols = vd->vdev_children;
- uint64_t nparity = vd->vdev_nparity;
+ uint64_t cols = vdrz->vd_logical_width;
+ uint64_t nparity = vdrz->vd_nparity;
asize = ((psize - 1) >> ashift) + 1;
asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
@@ -1623,7 +1668,18 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize)
return (asize);
}
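To make the vdev_raidz_asize() arithmetic in the hunk above concrete, here is a worked example with assumed values (psize, ashift, and the layout are hypothetical; the final rounding done later in the function is outside this hunk):

	/*
	 * Worked example (hypothetical values): psize = 16 KiB,
	 * ashift = 12 (4 KiB sectors), cols = 6, nparity = 2 (RAIDZ2).
	 *
	 *   data sectors:   ((16384 - 1) >> 12) + 1          = 4
	 *   parity sectors: 2 * ((4 + 6 - 2 - 1) / (6 - 2))  = 2
	 *
	 * so asize accounts for 6 sectors before the remaining
	 * rounding and shift performed later in the function.
	 */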
-static void
+/*
+ * The allocatable space for a raidz vdev is N * sizeof(smallest child)
+ * so each child must provide at least 1/Nth of its asize.
+ */
+static uint64_t
+vdev_raidz_min_asize(vdev_t *vd)
+{
+ return ((vd->vdev_min_asize + vd->vdev_children - 1) /
+ vd->vdev_children);
+}
+
+void
vdev_raidz_child_done(zio_t *zio)
{
raidz_col_t *rc = zio->io_private;
@@ -1634,21 +1690,21 @@ vdev_raidz_child_done(zio_t *zio)
}
static void
-vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
+vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
{
#ifdef ZFS_DEBUG
- vdev_t *vd = zio->io_vd;
vdev_t *tvd = vd->vdev_top;
- range_seg64_t logical_rs, physical_rs;
- logical_rs.rs_start = zio->io_offset;
+ range_seg64_t logical_rs, physical_rs, remain_rs;
+ logical_rs.rs_start = rr->rr_offset;
logical_rs.rs_end = logical_rs.rs_start +
- vdev_raidz_asize(zio->io_vd, zio->io_size);
+ vdev_raidz_asize(vd, rr->rr_size);
- raidz_col_t *rc = &rm->rm_col[col];
+ raidz_col_t *rc = &rr->rr_col[col];
vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
- vdev_xlate(cvd, &logical_rs, &physical_rs);
+ vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
+ ASSERT(vdev_xlate_is_empty(&remain_rs));
ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
/*
@@ -1666,106 +1722,82 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
#endif
}
-/*
- * Start an IO operation on a RAIDZ VDev
- *
- * Outline:
- * - For write operations:
- * 1. Generate the parity data
- * 2. Create child zio write operations to each column's vdev, for both
- * data and parity.
- * 3. If the column skips any sectors for padding, create optional dummy
- * write zio children for those areas to improve aggregation continuity.
- * - For read operations:
- * 1. Create child zio read operations to each data column's vdev to read
- * the range of data required for zio.
- * 2. If this is a scrub or resilver operation, or if any of the data
- * vdevs have had errors, then create zio read operations to the parity
- * columns' VDevs as well.
- */
static void
-vdev_raidz_io_start(zio_t *zio)
+vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
{
vdev_t *vd = zio->io_vd;
- vdev_t *tvd = vd->vdev_top;
- vdev_t *cvd;
- raidz_map_t *rm;
- raidz_col_t *rc;
+ raidz_map_t *rm = zio->io_vsd;
int c, i;
- rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
- vd->vdev_nparity);
-
- ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
+ vdev_raidz_generate_parity_row(rm, rr);
- if (zio->io_type == ZIO_TYPE_WRITE) {
- vdev_raidz_generate_parity(rm);
-
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
-
- /*
- * Verify physical to logical translation.
- */
- vdev_raidz_io_verify(zio, rm, c);
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_size == 0)
+ continue;
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- zio->io_type, zio->io_priority, 0,
- vdev_raidz_child_done, rc));
- }
+ /* Verify physical to logical translation */
+ vdev_raidz_io_verify(vd, rr, c);
- /*
- * Generate optional I/Os for any skipped sectors to improve
- * aggregation contiguity.
- */
- for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
- ASSERT(c <= rm->rm_scols);
- if (c == rm->rm_scols)
- c = 0;
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset + rc->rc_size, NULL,
- 1 << tvd->vdev_ashift,
- zio->io_type, zio->io_priority,
- ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
- }
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx], rc->rc_offset,
+ rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
+ 0, vdev_raidz_child_done, rc));
+ }
- zio_execute(zio);
- return;
+ /*
+ * Generate optional I/Os for skip sectors to improve aggregation
+ * contiguity.
+ */
+ for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
+ ASSERT(c <= rr->rr_scols);
+ if (c == rr->rr_scols)
+ c = 0;
+
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
}
+}
- ASSERT(zio->io_type == ZIO_TYPE_READ);
+static void
+vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
/*
* Iterate over the columns in reverse order so that we hit the parity
* last -- any errors along the way will force us to read the parity.
*/
- for (c = rm->rm_cols - 1; c >= 0; c--) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_size == 0)
+ continue;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
if (!vdev_readable(cvd)) {
- if (c >= rm->rm_firstdatacol)
- rm->rm_missingdata++;
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
else
- rm->rm_missingparity++;
+ rr->rr_missingparity++;
rc->rc_error = SET_ERROR(ENXIO);
rc->rc_tried = 1; /* don't even try */
rc->rc_skipped = 1;
continue;
}
if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
- if (c >= rm->rm_firstdatacol)
- rm->rm_missingdata++;
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
else
- rm->rm_missingparity++;
+ rr->rr_missingparity++;
rc->rc_error = SET_ERROR(ESTALE);
rc->rc_skipped = 1;
continue;
}
- if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
+ if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
@@ -1773,11 +1805,56 @@ vdev_raidz_io_start(zio_t *zio)
vdev_raidz_child_done, rc));
}
}
+}
+
+/*
+ * Start an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ * 1. Generate the parity data
+ * 2. Create child zio write operations to each column's vdev, for both
+ * data and parity.
+ * 3. If the column skips any sectors for padding, create optional dummy
+ * write zio children for those areas to improve aggregation continuity.
+ * - For read operations:
+ * 1. Create child zio read operations to each data column's vdev to read
+ * the range of data required for zio.
+ * 2. If this is a scrub or resilver operation, or if any of the data
+ * vdevs have had errors, then create zio read operations to the parity
+ * columns' VDevs as well.
+ */
+static void
+vdev_raidz_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ raidz_map_t *rm;
+
+ rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
+ vdrz->vd_logical_width, vdrz->vd_nparity);
+
+ /*
+ * Until raidz expansion is implemented all maps for a raidz vdev
+ * contain a single row.
+ */
+ ASSERT3U(rm->rm_nrows, ==, 1);
+ raidz_row_t *rr = rm->rm_row[0];
+
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift);
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+ vdev_raidz_io_start_read(zio, rr);
+ }
zio_execute(zio);
}
-
/*
* Report a checksum error for a child of a RAID-Z device.
*/
@@ -1786,7 +1863,8 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
{
vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
+ zio->io_priority != ZIO_PRIORITY_REBUILD) {
zio_bad_cksum_t zbc;
raidz_map_t *rm = zio->io_vsd;
@@ -1827,13 +1905,14 @@ raidz_checksum_verify(zio_t *zio)
* Generate the parity from the data columns. If we tried and were able to
* read the parity without error, verify that the generated parity matches the
* data we read. If it doesn't, we fire off a checksum error. Return the
- * number such failures.
+ * number of such failures.
*/
static int
-raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
+raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
{
abd_t *orig[VDEV_RAIDZ_MAXPARITY];
int c, ret = 0;
+ raidz_map_t *rm = zio->io_vsd;
raidz_col_t *rc;
blkptr_t *bp = zio->io_bp;
@@ -1843,8 +1922,18 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
if (checksum == ZIO_CHECKSUM_NOPARITY)
return (ret);
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rc = &rm->rm_col[c];
+ /*
+ * All data columns must have been successfully read in order
+ * to use them to generate parity columns for comparison.
+ */
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ rc = &rr->rr_col[c];
+ if (!rc->rc_tried || rc->rc_error != 0)
+ return (ret);
+ }
+
+ for (c = 0; c < rr->rr_firstdatacol; c++) {
+ rc = &rr->rr_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
continue;
@@ -1852,12 +1941,19 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
abd_copy(orig[c], rc->rc_abd, rc->rc_size);
}
- vdev_raidz_generate_parity(rm);
+ /*
+ * Regenerates parity even for !tried||rc_error!=0 columns. This
+ * isn't harmful but it does have the side effect of fixing stuff
+ * we didn't realize was necessary (i.e. even if we return 0).
+ */
+ vdev_raidz_generate_parity_row(rm, rr);
+
+ for (c = 0; c < rr->rr_firstdatacol; c++) {
+ rc = &rr->rr_col[c];
- for (c = 0; c < rm->rm_firstdatacol; c++) {
- rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
continue;
+
if (abd_cmp(orig[c], rc->rc_abd) != 0) {
raidz_checksum_error(zio, rc, orig[c]);
rc->rc_error = SET_ERROR(ECKSUM);
@@ -1870,456 +1966,597 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
}
static int
-vdev_raidz_worst_error(raidz_map_t *rm)
+vdev_raidz_worst_error(raidz_row_t *rr)
{
int error = 0;
- for (int c = 0; c < rm->rm_cols; c++)
- error = zio_worst_error(error, rm->rm_col[c].rc_error);
+ for (int c = 0; c < rr->rr_cols; c++)
+ error = zio_worst_error(error, rr->rr_col[c].rc_error);
return (error);
}
-/*
- * Iterate over all combinations of bad data and attempt a reconstruction.
- * Note that the algorithm below is non-optimal because it doesn't take into
- * account how reconstruction is actually performed. For example, with
- * triple-parity RAID-Z the reconstruction procedure is the same if column 4
- * is targeted as invalid as if columns 1 and 4 are targeted since in both
- * cases we'd only use parity information in column 0.
- */
-static int
-vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
+static void
+vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
{
- raidz_map_t *rm = zio->io_vsd;
- raidz_col_t *rc;
- abd_t *orig[VDEV_RAIDZ_MAXPARITY];
- int tstore[VDEV_RAIDZ_MAXPARITY + 2];
- int *tgts = &tstore[1];
- int curr, next, i, c, n;
- int code, ret = 0;
+ int unexpected_errors = 0;
+ int parity_errors = 0;
+ int parity_untried = 0;
+ int data_errors = 0;
- ASSERT(total_errors < rm->rm_firstdatacol);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error) {
+ if (c < rr->rr_firstdatacol)
+ parity_errors++;
+ else
+ data_errors++;
+
+ if (!rc->rc_skipped)
+ unexpected_errors++;
+ } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
+ parity_untried++;
+ }
+ }
/*
- * This simplifies one edge condition.
+ * If we read more parity disks than were used for
+ * reconstruction, confirm that the other parity disks produced
+ * correct data.
+ *
+ * Note that we also regenerate parity when resilvering so we
+ * can write it out to failed devices later.
*/
- tgts[-1] = -1;
+ if (parity_errors + parity_untried <
+ rr->rr_firstdatacol - data_errors ||
+ (zio->io_flags & ZIO_FLAG_RESILVER)) {
+ int n = raidz_parity_verify(zio, rr);
+ unexpected_errors += n;
+ ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol);
+ }
- for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
+ if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
+ (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
/*
- * Initialize the targets array by finding the first n columns
- * that contain no error.
- *
- * If there were no data errors, we need to ensure that we're
- * always explicitly attempting to reconstruct at least one
- * data column. To do this, we simply push the highest target
- * up into the data columns.
+ * Use the good data we have in hand to repair damaged children.
*/
- for (c = 0, i = 0; i < n; i++) {
- if (i == n - 1 && data_errors == 0 &&
- c < rm->rm_firstdatacol) {
- c = rm->rm_firstdatacol;
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *vd = zio->io_vd;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ if ((rc->rc_error == 0 || rc->rc_size == 0) &&
+ (rc->rc_repair == 0)) {
+ continue;
}
- while (rm->rm_col[c].rc_error != 0) {
- c++;
- ASSERT3S(c, <, rm->rm_cols);
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ ZIO_TYPE_WRITE,
+ zio->io_priority == ZIO_PRIORITY_REBUILD ?
+ ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
+ }
+ }
+}
+
+static void
+raidz_restore_orig_data(raidz_map_t *rm)
+{
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_need_orig_restore) {
+ abd_copy_from_buf(rc->rc_abd,
+ rc->rc_orig_data, rc->rc_size);
+ rc->rc_need_orig_restore = B_FALSE;
}
+ }
+ }
+}
+
+/*
+ * returns EINVAL if reconstruction of the block will not be possible
+ * returns ECKSUM if this specific reconstruction failed
+ * returns 0 on successful reconstruction
+ */
+static int
+raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
+{
+ raidz_map_t *rm = zio->io_vsd;
- tgts[i] = c++;
+ /* Reconstruct each row */
+ for (int r = 0; r < rm->rm_nrows; r++) {
+ raidz_row_t *rr = rm->rm_row[r];
+ int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
+ int t = 0;
+ int dead = 0;
+ int dead_data = 0;
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ ASSERT0(rc->rc_need_orig_restore);
+ if (rc->rc_error != 0) {
+ dead++;
+ if (c >= nparity)
+ dead_data++;
+ continue;
+ }
+ if (rc->rc_size == 0)
+ continue;
+ for (int lt = 0; lt < ntgts; lt++) {
+ if (rc->rc_devidx == ltgts[lt]) {
+ if (rc->rc_orig_data == NULL) {
+ rc->rc_orig_data =
+ zio_buf_alloc(rc->rc_size);
+ abd_copy_to_buf(
+ rc->rc_orig_data,
+ rc->rc_abd, rc->rc_size);
+ }
+ rc->rc_need_orig_restore = B_TRUE;
+
+ dead++;
+ if (c >= nparity)
+ dead_data++;
+ my_tgts[t++] = c;
+ break;
+ }
+ }
+ }
+ if (dead > nparity) {
+ /* reconstruction not possible */
+ raidz_restore_orig_data(rm);
+ return (EINVAL);
}
+ rr->rr_code = 0;
+ if (dead_data > 0)
+ rr->rr_code = vdev_raidz_reconstruct_row(rm, rr,
+ my_tgts, t);
+ }
- /*
- * Setting tgts[n] simplifies the other edge condition.
- */
- tgts[n] = rm->rm_cols;
+ /* Check for success */
+ if (raidz_checksum_verify(zio) == 0) {
+
+ /* Reconstruction succeeded - report errors */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_need_orig_restore) {
+ /*
+ * Note: if this is a parity column,
+ * we don't really know if it's wrong.
+ * We need to let
+ * vdev_raidz_io_done_verified() check
+ * it, and if we set rc_error, it will
+ * think that it is a "known" error
+ * that doesn't need to be checked
+ * or corrected.
+ */
+ if (rc->rc_error == 0 &&
+ c >= rr->rr_firstdatacol) {
+ raidz_checksum_error(zio,
+ rc, rc->rc_gdata);
+ rc->rc_error =
+ SET_ERROR(ECKSUM);
+ }
+ rc->rc_need_orig_restore = B_FALSE;
+ }
+ }
- /*
- * These buffers were allocated in previous iterations.
- */
- for (i = 0; i < n - 1; i++) {
- ASSERT(orig[i] != NULL);
+ vdev_raidz_io_done_verified(zio, rr);
}
- orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd,
- rm->rm_col[0].rc_size);
+ zio_checksum_verified(zio);
- curr = 0;
- next = tgts[curr];
+ return (0);
+ }
- while (curr != n) {
- tgts[curr] = next;
- curr = 0;
+ /* Reconstruction failed - restore original data */
+ raidz_restore_orig_data(rm);
+ return (ECKSUM);
+}
- /*
- * Save off the original data that we're going to
- * attempt to reconstruct.
- */
- for (i = 0; i < n; i++) {
- ASSERT(orig[i] != NULL);
- c = tgts[i];
- ASSERT3S(c, >=, 0);
- ASSERT3S(c, <, rm->rm_cols);
- rc = &rm->rm_col[c];
- abd_copy(orig[i], rc->rc_abd, rc->rc_size);
- }
+/*
+ * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
+ * Note that the algorithm below is non-optimal because it doesn't take into
+ * account how reconstruction is actually performed. For example, with
+ * triple-parity RAID-Z the reconstruction procedure is the same if column 4
+ * is targeted as invalid as if columns 1 and 4 are targeted since in both
+ * cases we'd only use parity information in column 0.
+ *
+ * The order that we find the various possible combinations of failed
+ * disks is dictated by these rules:
+ * - Examine each "slot" (the "i" in tgts[i])
+ * - Try to increment this slot (tgts[i] = tgts[i] + 1)
+ * - if we can't increment because it runs into the next slot,
+ * reset our slot to the minimum, and examine the next slot
+ *
+ * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
+ * 3 columns to reconstruct), we will generate the following sequence:
+ *
+ * STATE ACTION
+ * 0 1 2 special case: skip since these are all parity
+ * 0 1 3 first slot: reset to 0; middle slot: increment to 2
+ * 0 2 3 first slot: increment to 1
+ * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4
+ * 0 1 4 first: reset to 0; middle: increment to 2
+ * 0 2 4 first: increment to 1
+ * 1 2 4 first: reset to 0; middle: increment to 3
+ * 0 3 4 first: increment to 1
+ * 1 3 4 first: increment to 2
+ * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5
+ * 0 1 5 first: reset to 0; middle: increment to 2
+ * 0 2 5 first: increment to 1
+ * 1 2 5 first: reset to 0; middle: increment to 3
+ * 0 3 5 first: increment to 1
+ * 1 3 5 first: increment to 2
+ * 2 3 5 first: reset to 0; middle: increment to 4
+ * 0 4 5 first: increment to 1
+ * 1 4 5 first: increment to 2
+ * 2 4 5 first: increment to 3
+ * 3 4 5 done
+ *
+ * This strategy works for dRAID but is less efficient when there are a large
+ * number of child vdevs and therefore permutations to check. Furthermore,
+ * since the raidz_map_t rows likely do not overlap, reconstruction would be
+ * possible as long as there are no more than nparity data errors per row.
+ * These additional permutations are not currently checked but could be as
+ * a future improvement.
+ */
+static int
+vdev_raidz_combrec(zio_t *zio)
+{
+ int nparity = vdev_get_nparity(zio->io_vd);
+ raidz_map_t *rm = zio->io_vsd;
- /*
- * Attempt a reconstruction and exit the outer loop on
- * success.
- */
- code = vdev_raidz_reconstruct(rm, tgts, n);
- if (raidz_checksum_verify(zio) == 0) {
-
- for (i = 0; i < n; i++) {
- c = tgts[i];
- rc = &rm->rm_col[c];
- ASSERT(rc->rc_error == 0);
- if (rc->rc_tried)
- raidz_checksum_error(zio, rc,
- orig[i]);
- rc->rc_error = SET_ERROR(ECKSUM);
- }
+	/* Check if there's enough data to attempt reconstruction. */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ int total_errors = 0;
- ret = code;
- goto done;
- }
+ for (int c = 0; c < rr->rr_cols; c++) {
+ if (rr->rr_col[c].rc_error)
+ total_errors++;
+ }
- /*
- * Restore the original data.
- */
- for (i = 0; i < n; i++) {
- c = tgts[i];
- rc = &rm->rm_col[c];
- abd_copy(rc->rc_abd, orig[i], rc->rc_size);
- }
+ if (total_errors > nparity)
+ return (vdev_raidz_worst_error(rr));
+ }
- do {
+ for (int num_failures = 1; num_failures <= nparity; num_failures++) {
+ int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+ int *ltgts = &tstore[1]; /* value is logical child ID */
+
+ /* Determine number of logical children, n */
+ int n = zio->io_vd->vdev_children;
+
+ ASSERT3U(num_failures, <=, nparity);
+ ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
+
+ /* Handle corner cases in combrec logic */
+ ltgts[-1] = -1;
+ for (int i = 0; i < num_failures; i++) {
+ ltgts[i] = i;
+ }
+ ltgts[num_failures] = n;
+
+ for (;;) {
+ int err = raidz_reconstruct(zio, ltgts, num_failures,
+ nparity);
+ if (err == EINVAL) {
/*
- * Find the next valid column after the curr
- * position..
+ * Reconstruction not possible with this #
+ * failures; try more failures.
*/
- for (next = tgts[curr] + 1;
- next < rm->rm_cols &&
- rm->rm_col[next].rc_error != 0; next++)
- continue;
+ break;
+ } else if (err == 0)
+ return (0);
+
+ /* Compute next targets to try */
+ for (int t = 0; ; t++) {
+ ASSERT3U(t, <, num_failures);
+ ltgts[t]++;
+ if (ltgts[t] == n) {
+ /* try more failures */
+ ASSERT3U(t, ==, num_failures - 1);
+ break;
+ }
- ASSERT(next <= tgts[curr + 1]);
+ ASSERT3U(ltgts[t], <, n);
+ ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
/*
* If that spot is available, we're done here.
+ * Try the next combination.
*/
- if (next != tgts[curr + 1])
+ if (ltgts[t] != ltgts[t + 1])
break;
/*
- * Otherwise, find the next valid column after
- * the previous position.
+ * Otherwise, reset this tgt to the minimum,
+ * and move on to the next tgt.
*/
- for (c = tgts[curr - 1] + 1;
- rm->rm_col[c].rc_error != 0; c++)
- continue;
-
- tgts[curr] = c;
- curr++;
+ ltgts[t] = ltgts[t - 1] + 1;
+ ASSERT3U(ltgts[t], ==, t);
+ }
- } while (curr != n);
+ /* Increase the number of failures and keep trying. */
+ if (ltgts[num_failures - 1] == n)
+ break;
}
}
- n--;
-done:
- for (i = 0; i < n; i++)
- abd_free(orig[i]);
- return (ret);
+ return (ECKSUM);
+}
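The slot-increment walk documented in the comment above vdev_raidz_combrec() can be exercised in isolation. The following is a minimal standalone sketch, not ZFS code: enumerate_failures(), the printf reporting, and the 8-slot bound are illustrative assumptions. It visits the same combinations, in the same order, as the STATE table in that comment.

#include <stdio.h>

/*
 * Standalone sketch of the combination walk used by vdev_raidz_combrec():
 * visit every set of "nfail" child indexes out of "n" children using the
 * increment-with-reset rules documented above that function.
 */
static void
enumerate_failures(int n, int nfail)
{
	int tstore[8 + 2];		/* sketch assumes nfail <= 8 */
	int *tgts = &tstore[1];

	tgts[-1] = -1;			/* sentinel for the reset step */
	for (int i = 0; i < nfail; i++)
		tgts[i] = i;		/* minimum combination: 0 1 2 ... */
	tgts[nfail] = n;		/* sentinel for the collision test */

	for (;;) {
		for (int i = 0; i < nfail; i++)	/* "attempt reconstruction" */
			printf("%d ", tgts[i]);
		printf("\n");

		/* Compute the next combination to try. */
		for (int t = 0; ; t++) {
			tgts[t]++;
			if (tgts[t] == n)	/* last slot ran off the end */
				break;
			if (tgts[t] != tgts[t + 1])
				break;		/* no collision: done */
			/* Collision: reset this slot, carry into the next. */
			tgts[t] = tgts[t - 1] + 1;
		}
		if (tgts[nfail - 1] == n)
			break;			/* all combinations visited */
	}
}

int
main(void)
{
	enumerate_failures(6, 3);	/* reproduces the STATE column above */
	return (0);
}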
+
+void
+vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
+{
+ for (uint64_t row = 0; row < rm->rm_nrows; row++) {
+ raidz_row_t *rr = rm->rm_row[row];
+ vdev_raidz_reconstruct_row(rm, rr, t, nt);
+ }
}
/*
- * Complete an IO operation on a RAIDZ VDev
+ * Complete a write IO operation on a RAIDZ VDev
*
* Outline:
- * - For write operations:
* 1. Check for errors on the child IOs.
* 2. Return, setting an error code if too few child VDevs were written
* to reconstruct the data later. Note that partial writes are
* considered successful if they can be reconstructed at all.
- * - For read operations:
- * 1. Check for errors on the child IOs.
- * 2. If data errors occurred:
- * a. Try to reassemble the data from the parity available.
- * b. If we haven't yet read the parity drives, read them now.
- * c. If all parity drives have been read but the data still doesn't
- * reassemble with a correct checksum, then try combinatorial
- * reconstruction.
- * d. If that doesn't work, return an error.
- * 3. If there were unexpected errors or this is a resilver operation,
- * rewrite the vdevs that had errors.
*/
static void
-vdev_raidz_io_done(zio_t *zio)
+vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
+{
+ int total_errors = 0;
+
+ ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
+ ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error) {
+ ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
+
+ total_errors++;
+ }
+ }
+
+ /*
+ * Treat partial writes as a success. If we couldn't write enough
+ * columns to reconstruct the data, the I/O failed. Otherwise,
+ * good enough.
+ *
+ * Now that we support write reallocation, it would be better
+ * to treat partial failure as real failure unless there are
+ * no non-degraded top-level vdevs left, and not update DTLs
+ * if we intend to reallocate.
+ */
+ if (total_errors > rr->rr_firstdatacol) {
+ zio->io_error = zio_worst_error(zio->io_error,
+ vdev_raidz_worst_error(rr));
+ }
+}
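As an illustrative example of the write policy above: on a RAIDZ2 row rr_firstdatacol is 2, so up to two failed child writes still leave the row reconstructable and the zio succeeds; a third failure trips the total_errors > rr_firstdatacol test and propagates the worst child error.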
+
+/*
+ * return 0 if no reconstruction occurred, otherwise the "code" from
+ * vdev_raidz_reconstruct().
+ */
+static int
+vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
+ raidz_row_t *rr)
{
- vdev_t *vd = zio->io_vd;
- vdev_t *cvd;
- raidz_map_t *rm = zio->io_vsd;
- raidz_col_t *rc = NULL;
- int unexpected_errors = 0;
int parity_errors = 0;
int parity_untried = 0;
int data_errors = 0;
int total_errors = 0;
- int n, c;
- int tgts[VDEV_RAIDZ_MAXPARITY];
- int code;
-
- ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
+ int code = 0;
- ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
- ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
+ ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
+ ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
if (rc->rc_error) {
ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
- if (c < rm->rm_firstdatacol)
+ if (c < rr->rr_firstdatacol)
parity_errors++;
else
data_errors++;
- if (!rc->rc_skipped)
- unexpected_errors++;
-
total_errors++;
- } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
+ } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
parity_untried++;
}
}
- if (zio->io_type == ZIO_TYPE_WRITE) {
- /*
- * XXX -- for now, treat partial writes as a success.
- * (If we couldn't write enough columns to reconstruct
- * the data, the I/O failed. Otherwise, good enough.)
- *
- * Now that we support write reallocation, it would be better
- * to treat partial failure as real failure unless there are
- * no non-degraded top-level vdevs left, and not update DTLs
- * if we intend to reallocate.
- */
- /* XXPOLICY */
- if (total_errors > rm->rm_firstdatacol)
- zio->io_error = vdev_raidz_worst_error(rm);
-
- return;
- }
-
- ASSERT(zio->io_type == ZIO_TYPE_READ);
/*
- * There are three potential phases for a read:
- * 1. produce valid data from the columns read
- * 2. read all disks and try again
- * 3. perform combinatorial reconstruction
- *
- * Each phase is progressively both more expensive and less likely to
- * occur. If we encounter more errors than we can repair or all phases
- * fail, we have no choice but to return an error.
+ * If there were data errors and the number of errors we saw was
+ * correctable -- less than or equal to the number of parity disks read
+ * -- reconstruct based on the missing data.
*/
+ if (data_errors != 0 &&
+ total_errors <= rr->rr_firstdatacol - parity_untried) {
+ /*
+ * We either attempt to read all the parity columns or
+ * none of them. If we didn't try to read parity, we
+ * wouldn't be here in the correctable case. There must
+ * also have been fewer parity errors than parity
+ * columns or, again, we wouldn't be in this code path.
+ */
+ ASSERT(parity_untried == 0);
+ ASSERT(parity_errors < rr->rr_firstdatacol);
- /*
- * If the number of errors we saw was correctable -- less than or equal
- * to the number of parity disks read -- attempt to produce data that
- * has a valid checksum. Naturally, this case applies in the absence of
- * any errors.
- */
- if (total_errors <= rm->rm_firstdatacol - parity_untried) {
- if (data_errors == 0) {
- if (raidz_checksum_verify(zio) == 0) {
- /*
- * If we read parity information (unnecessarily
- * as it happens since no reconstruction was
- * needed) regenerate and verify the parity.
- * We also regenerate parity when resilvering
- * so we can write it out to the failed device
- * later.
- */
- if (parity_errors + parity_untried <
- rm->rm_firstdatacol ||
- (zio->io_flags & ZIO_FLAG_RESILVER)) {
- n = raidz_parity_verify(zio, rm);
- unexpected_errors += n;
- ASSERT(parity_errors + n <=
- rm->rm_firstdatacol);
- }
- goto done;
+ /*
+ * Identify the data columns that reported an error.
+ */
+ int n = 0;
+ int tgts[VDEV_RAIDZ_MAXPARITY];
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_error != 0) {
+ ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+ tgts[n++] = c;
}
- } else {
- /*
- * We either attempt to read all the parity columns or
- * none of them. If we didn't try to read parity, we
- * wouldn't be here in the correctable case. There must
- * also have been fewer parity errors than parity
- * columns or, again, we wouldn't be in this code path.
- */
- ASSERT(parity_untried == 0);
- ASSERT(parity_errors < rm->rm_firstdatacol);
+ }
- /*
- * Identify the data columns that reported an error.
- */
- n = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- if (rc->rc_error != 0) {
- ASSERT(n < VDEV_RAIDZ_MAXPARITY);
- tgts[n++] = c;
- }
- }
+ ASSERT(rr->rr_firstdatacol >= n);
- ASSERT(rm->rm_firstdatacol >= n);
+ code = vdev_raidz_reconstruct_row(rm, rr, tgts, n);
+ }
- code = vdev_raidz_reconstruct(rm, tgts, n);
+ return (code);
+}
- if (raidz_checksum_verify(zio) == 0) {
- /*
- * If we read more parity disks than were used
- * for reconstruction, confirm that the other
- * parity disks produced correct data. This
- * routine is suboptimal in that it regenerates
- * the parity that we already used in addition
- * to the parity that we're attempting to
- * verify, but this should be a relatively
- * uncommon case, and can be optimized if it
- * becomes a problem. Note that we regenerate
- * parity when resilvering so we can write it
- * out to failed devices later.
- */
- if (parity_errors < rm->rm_firstdatacol - n ||
- (zio->io_flags & ZIO_FLAG_RESILVER)) {
- n = raidz_parity_verify(zio, rm);
- unexpected_errors += n;
- ASSERT(parity_errors + n <=
- rm->rm_firstdatacol);
- }
+/*
+ * Return the number of reads issued.
+ */
+static int
+vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+ int nread = 0;
- goto done;
- }
- }
- }
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
/*
- * This isn't a typical situation -- either we got a read error or
- * a child silently returned bad data. Read every block so we can
- * try again with as much data and parity as we can track down. If
- * we've already been through once before, all children will be marked
- * as tried so we'll proceed to combinatorial reconstruction.
+	 * If this row contains empty sectors which are not required
+ * for a normal read then allocate an ABD for them now so they
+ * may be read, verified, and any needed repairs performed.
*/
- unexpected_errors = 1;
- rm->rm_missingdata = 0;
- rm->rm_missingparity = 0;
+ if (rr->rr_nempty && rr->rr_abd_empty == NULL)
+ vdev_draid_map_alloc_empty(zio, rr);
- for (c = 0; c < rm->rm_cols; c++) {
- if (rm->rm_col[c].rc_tried)
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_tried || rc->rc_size == 0)
continue;
- zio_vdev_io_redone(zio);
- do {
- rc = &rm->rm_col[c];
- if (rc->rc_tried)
- continue;
- zio_nowait(zio_vdev_child_io(zio, NULL,
- vd->vdev_child[rc->rc_devidx],
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- zio->io_type, zio->io_priority, 0,
- vdev_raidz_child_done, rc));
- } while (++c < rm->rm_cols);
-
- return;
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx],
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ nread++;
}
+ return (nread);
+}
- /*
- * At this point we've attempted to reconstruct the data given the
- * errors we detected, and we've attempted to read all columns. There
- * must, therefore, be one or more additional problems -- silent errors
- * resulting in invalid data rather than explicit I/O errors resulting
- * in absent data. We check if there is enough additional data to
- * possibly reconstruct the data and then perform combinatorial
- * reconstruction over all possible combinations. If that fails,
- * we're cooked.
- */
- if (total_errors > rm->rm_firstdatacol) {
- zio->io_error = vdev_raidz_worst_error(rm);
+/*
+ * We're here because either there were too many errors to even attempt
+ * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
+ * failed. In either case, there is enough bad data to prevent reconstruction.
+ * Start checksum ereports for all children which haven't failed.
+ */
+static void
+vdev_raidz_io_done_unrecoverable(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
- } else if (total_errors < rm->rm_firstdatacol &&
- (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
- /*
- * If we didn't use all the available parity for the
- * combinatorial reconstruction, verify that the remaining
- * parity is correct.
- */
- if (code != (1 << rm->rm_firstdatacol) - 1)
- (void) raidz_parity_verify(zio, rm);
- } else {
- /*
- * We're here because either:
- *
- * total_errors == rm_first_datacol, or
- * vdev_raidz_combrec() failed
- *
- * In either case, there is enough bad data to prevent
- * reconstruction.
- *
- * Start checksum ereports for all children which haven't
- * failed, and the IO wasn't speculative.
- */
- zio->io_error = SET_ERROR(ECKSUM);
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- for (c = 0; c < rm->rm_cols; c++) {
- vdev_t *cvd;
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
- if (rc->rc_error != 0)
- continue;
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
- zio_bad_cksum_t zbc;
- zbc.zbc_has_cksum = 0;
- zbc.zbc_injected = rm->rm_ecksuminjected;
-
- int ret = zfs_ereport_start_checksum(
- zio->io_spa, cvd, &zio->io_bookmark, zio,
- rc->rc_offset, rc->rc_size,
- (void *)(uintptr_t)c, &zbc);
- if (ret != EALREADY) {
- mutex_enter(&cvd->vdev_stat_lock);
- cvd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&cvd->vdev_stat_lock);
- }
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
+
+ if (rc->rc_error != 0)
+ continue;
+
+ zio_bad_cksum_t zbc;
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected = rm->rm_ecksuminjected;
+
+ int ret = zfs_ereport_start_checksum(zio->io_spa,
+ cvd, &zio->io_bookmark, zio, rc->rc_offset,
+ rc->rc_size, (void *)(uintptr_t)c, &zbc);
+ if (ret != EALREADY) {
+ mutex_enter(&cvd->vdev_stat_lock);
+ cvd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&cvd->vdev_stat_lock);
}
}
}
+}
-done:
- zio_checksum_verified(zio);
+void
+vdev_raidz_io_done(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
- if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
- (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
- /*
- * Use the good data we have in hand to repair damaged children.
- */
- for (c = 0; c < rm->rm_cols; c++) {
- rc = &rm->rm_col[c];
- cvd = vd->vdev_child[rc->rc_devidx];
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
+ }
+ } else {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ rr->rr_code =
+ vdev_raidz_io_done_reconstruct_known_missing(zio,
+ rm, rr);
+ }
- if (rc->rc_error == 0)
- continue;
+ if (raidz_checksum_verify(zio) == 0) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ vdev_raidz_io_done_verified(zio, rr);
+ }
+ zio_checksum_verified(zio);
+ } else {
+ /*
+ * A sequential resilver has no checksum which makes
+			 * combinatorial reconstruction impossible. This code
+ * path is unreachable since raidz_checksum_verify()
+ * has no checksum to verify and must succeed.
+ */
+ ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
- zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_abd, rc->rc_size,
- ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
- ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
+ /*
+ * This isn't a typical situation -- either we got a
+ * read error or a child silently returned bad data.
+ * Read every block so we can try again with as much
+ * data and parity as we can track down. If we've
+ * already been through once before, all children will
+ * be marked as tried so we'll proceed to combinatorial
+ * reconstruction.
+ */
+ int nread = 0;
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ nread += vdev_raidz_read_all(zio,
+ rm->rm_row[i]);
+ }
+ if (nread != 0) {
+ /*
+ * Normally our stage is VDEV_IO_DONE, but if
+ * we've already called redone(), it will have
+ * changed to VDEV_IO_START, in which case we
+ * don't want to call redone() again.
+ */
+ if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
+ zio_vdev_io_redone(zio);
+ return;
+ }
+
+ zio->io_error = vdev_raidz_combrec(zio);
+ if (zio->io_error == ECKSUM &&
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ vdev_raidz_io_done_unrecoverable(zio);
+ }
}
}
}
@@ -2327,7 +2564,8 @@ done:
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
- if (faulted > vd->vdev_nparity)
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ if (faulted > vdrz->vd_nparity)
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_NO_REPLICAS);
else if (degraded + faulted != 0)
@@ -2343,18 +2581,26 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
* width blocks must be resilvered.
*/
static boolean_t
-vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
uint64_t dcols = vd->vdev_children;
- uint64_t nparity = vd->vdev_nparity;
+ uint64_t nparity = vdrz->vd_nparity;
uint64_t ashift = vd->vdev_top->vdev_ashift;
/* The starting RAIDZ (parent) vdev sector of the block. */
- uint64_t b = offset >> ashift;
+ uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
/* The zio's size in units of the vdev's minimum sector size. */
uint64_t s = ((psize - 1) >> ashift) + 1;
/* The first column for this stripe. */
uint64_t f = b % dcols;
+ /* Unreachable by sequential resilver. */
+ ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
+
+ if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
+ return (B_FALSE);
+
if (s + nparity >= dcols)
return (B_TRUE);
@@ -2375,7 +2621,8 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
}
static void
-vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res)
+vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
vdev_t *raidvd = cvd->vdev_parent;
ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
@@ -2385,10 +2632,10 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res)
uint64_t ashift = raidvd->vdev_top->vdev_ashift;
/* make sure the offsets are block-aligned */
- ASSERT0(in->rs_start % (1 << ashift));
- ASSERT0(in->rs_end % (1 << ashift));
- uint64_t b_start = in->rs_start >> ashift;
- uint64_t b_end = in->rs_end >> ashift;
+ ASSERT0(logical_rs->rs_start % (1 << ashift));
+ ASSERT0(logical_rs->rs_end % (1 << ashift));
+ uint64_t b_start = logical_rs->rs_start >> ashift;
+ uint64_t b_end = logical_rs->rs_end >> ashift;
uint64_t start_row = 0;
if (b_start > tgt_col) /* avoid underflow */
@@ -2398,17 +2645,119 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res)
if (b_end > tgt_col)
end_row = ((b_end - tgt_col - 1) / width) + 1;
- res->rs_start = start_row << ashift;
- res->rs_end = end_row << ashift;
+ physical_rs->rs_start = start_row << ashift;
+ physical_rs->rs_end = end_row << ashift;
- ASSERT3U(res->rs_start, <=, in->rs_start);
- ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
+ ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
+ ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
+ logical_rs->rs_end - logical_rs->rs_start);
+}
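A worked example of the translation above, using assumed values (the row width and the target column index come from the parent's child count and the child's id, which are set just before this hunk):

	/*
	 * Example (hypothetical values): 6-wide raidz, ashift = 12,
	 * child column index 2, logical range [0x6000, 0x12000):
	 *
	 *   b_start = 0x6000 >> 12 = 6     b_end = 0x12000 >> 12 = 18
	 *   start_row = ((6 - 2 - 1) / 6) + 1  = 1
	 *   end_row   = ((18 - 2 - 1) / 6) + 1 = 3
	 *
	 * so the child's physical range is [1 << 12, 3 << 12) =
	 * [0x1000, 0x3000), satisfying both ASSERTs above.
	 */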
+
+/*
+ * Initialize private RAIDZ specific fields from the nvlist.
+ */
+static int
+vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+ vdev_raidz_t *vdrz;
+ uint64_t nparity;
+
+ uint_t children;
+ nvlist_t **child;
+ int error = nvlist_lookup_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, &child, &children);
+ if (error != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
+ if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Previous versions could only support 1 or 2 parity
+ * device.
+ */
+ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
+ return (SET_ERROR(EINVAL));
+ else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
+ return (SET_ERROR(EINVAL));
+ } else {
+ /*
+ * We require the parity to be specified for SPAs that
+ * support multiple parity levels.
+ */
+ if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Otherwise, we default to 1 parity device for RAID-Z.
+ */
+ nparity = 1;
+ }
+
+ vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
+ vdrz->vd_logical_width = children;
+ vdrz->vd_nparity = nparity;
+
+ *tsd = vdrz;
+
+ return (0);
+}
+
+static void
+vdev_raidz_fini(vdev_t *vd)
+{
+ kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
+}
+
+/*
+ * Add RAIDZ specific fields to the config nvlist.
+ */
+static void
+vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+ ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+
+ /*
+ * Make sure someone hasn't managed to sneak a fancy new vdev
+ * into a crufty old storage pool.
+ */
+ ASSERT(vdrz->vd_nparity == 1 ||
+ (vdrz->vd_nparity <= 2 &&
+ spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
+ (vdrz->vd_nparity <= 3 &&
+ spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
+
+ /*
+ * Note that we'll add these even on storage pools where they
+ * aren't strictly required -- older software will just ignore
+ * it.
+ */
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
+}
+
+static uint64_t
+vdev_raidz_nparity(vdev_t *vd)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ return (vdrz->vd_nparity);
+}
+
+static uint64_t
+vdev_raidz_ndisks(vdev_t *vd)
+{
+ return (vd->vdev_children);
}
vdev_ops_t vdev_raidz_ops = {
+ .vdev_op_init = vdev_raidz_init,
+ .vdev_op_fini = vdev_raidz_fini,
.vdev_op_open = vdev_raidz_open,
.vdev_op_close = vdev_raidz_close,
.vdev_op_asize = vdev_raidz_asize,
+ .vdev_op_min_asize = vdev_raidz_min_asize,
+ .vdev_op_min_alloc = NULL,
.vdev_op_io_start = vdev_raidz_io_start,
.vdev_op_io_done = vdev_raidz_io_done,
.vdev_op_state_change = vdev_raidz_state_change,
@@ -2417,6 +2766,11 @@ vdev_ops_t vdev_raidz_ops = {
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = vdev_raidz_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = vdev_raidz_config_generate,
+ .vdev_op_nparity = vdev_raidz_nparity,
+ .vdev_op_ndisks = vdev_raidz_ndisks,
.vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c
index 9595a7b95251..25d76970e99a 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c
@@ -149,7 +149,7 @@ vdev_raidz_math_get_ops(void)
* Select parity generation method for raidz_map
*/
int
-vdev_raidz_math_generate(raidz_map_t *rm)
+vdev_raidz_math_generate(raidz_map_t *rm, raidz_row_t *rr)
{
raidz_gen_f gen_parity = NULL;
@@ -174,7 +174,7 @@ vdev_raidz_math_generate(raidz_map_t *rm)
if (gen_parity == NULL)
return (RAIDZ_ORIGINAL_IMPL);
- gen_parity(rm);
+ gen_parity(rr);
return (0);
}
@@ -241,8 +241,8 @@ reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
* @nbaddata - Number of failed data columns
*/
int
-vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
- const int *dt, const int nbaddata)
+vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr,
+ const int *parity_valid, const int *dt, const int nbaddata)
{
raidz_rec_f rec_fn = NULL;
@@ -265,7 +265,7 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
if (rec_fn == NULL)
return (RAIDZ_ORIGINAL_IMPL);
else
- return (rec_fn(rm, dt));
+ return (rec_fn(rr, dt));
}
const char *raidz_gen_name[] = {
@@ -360,7 +360,7 @@ raidz_math_kstat_addr(kstat_t *ksp, loff_t n)
#define BENCH_D_COLS (8ULL)
#define BENCH_COLS (BENCH_D_COLS + PARITY_PQR)
#define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */
-#define BENCH_NS MSEC2NSEC(25) /* 25ms */
+#define BENCH_NS MSEC2NSEC(1) /* 1ms */
typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
@@ -410,7 +410,7 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
t_start = gethrtime();
do {
- for (i = 0; i < 25; i++, run_cnt++)
+ for (i = 0; i < 5; i++, run_cnt++)
bench_fn(bench_rm, fn);
t_diff = gethrtime() - t_start;
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h
index 89c2082c4ab9..35e016fc65a5 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h
@@ -26,6 +26,7 @@
#define _VDEV_RAIDZ_MATH_IMPL_H
#include <sys/types.h>
+#include <sys/vdev_raidz_impl.h>
#define raidz_inline inline __attribute__((always_inline))
#ifndef noinline
@@ -36,33 +37,33 @@
* Functions calculate multiplication constants for data reconstruction.
* Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and
* used parity columns for reconstruction.
- * @rm RAIDZ map
+ * @rr RAIDZ row
* @tgtidx array of missing data indexes
* @coeff output array of coefficients. Array must be provided by
* user and must hold minimum MUL_CNT values.
*/
static noinline void
-raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+raidz_rec_q_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
- const unsigned ncols = raidz_ncols(rm);
+ const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1));
}
static noinline void
-raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+raidz_rec_r_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
- const unsigned ncols = raidz_ncols(rm);
+ const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1));
}
static noinline void
-raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+raidz_rec_pq_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
- const unsigned ncols = raidz_ncols(rm);
+ const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
const unsigned y = tgtidx[TARGET_Y];
gf_t a, b, e;
@@ -76,9 +77,9 @@ raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
}
static noinline void
-raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+raidz_rec_pr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
- const unsigned ncols = raidz_ncols(rm);
+ const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
const unsigned y = tgtidx[TARGET_Y];
@@ -93,9 +94,9 @@ raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
}
static noinline void
-raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+raidz_rec_qr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
- const unsigned ncols = raidz_ncols(rm);
+ const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
const unsigned y = tgtidx[TARGET_Y];
@@ -114,9 +115,9 @@ raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
}
static noinline void
-raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+raidz_rec_pqr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
{
- const unsigned ncols = raidz_ncols(rm);
+ const unsigned ncols = rr->rr_cols;
const unsigned x = tgtidx[TARGET_X];
const unsigned y = tgtidx[TARGET_Y];
const unsigned z = tgtidx[TARGET_Z];
@@ -347,26 +348,26 @@ raidz_mul_abd_cb(void *dc, size_t size, void *private)
/*
* Generate P parity (RAIDZ1)
*
- * @rm RAIDZ map
+ * @rr RAIDZ row
*/
static raidz_inline void
-raidz_generate_p_impl(raidz_map_t * const rm)
+raidz_generate_p_impl(raidz_row_t * const rr)
{
size_t c;
- const size_t ncols = raidz_ncols(rm);
- const size_t psize = rm->rm_col[CODE_P].rc_size;
- abd_t *pabd = rm->rm_col[CODE_P].rc_abd;
+ const size_t ncols = rr->rr_cols;
+ const size_t psize = rr->rr_col[CODE_P].rc_size;
+ abd_t *pabd = rr->rr_col[CODE_P].rc_abd;
size_t size;
abd_t *dabd;
raidz_math_begin();
/* start with first data column */
- raidz_copy(pabd, rm->rm_col[1].rc_abd, psize);
+ raidz_copy(pabd, rr->rr_col[1].rc_abd, psize);
for (c = 2; c < ncols; c++) {
- dabd = rm->rm_col[c].rc_abd;
- size = rm->rm_col[c].rc_size;
+ dabd = rr->rr_col[c].rc_abd;
+ size = rr->rr_col[c].rc_size;
/* add data column */
raidz_add(pabd, dabd, size);
@@ -414,29 +415,29 @@ raidz_gen_pq_add(void **c, const void *dc, const size_t csize,
/*
* Generate PQ parity (RAIDZ2)
*
- * @rm RAIDZ map
+ * @rr RAIDZ row
*/
static raidz_inline void
-raidz_generate_pq_impl(raidz_map_t * const rm)
+raidz_generate_pq_impl(raidz_row_t * const rr)
{
size_t c;
- const size_t ncols = raidz_ncols(rm);
- const size_t csize = rm->rm_col[CODE_P].rc_size;
+ const size_t ncols = rr->rr_cols;
+ const size_t csize = rr->rr_col[CODE_P].rc_size;
size_t dsize;
abd_t *dabd;
abd_t *cabds[] = {
- rm->rm_col[CODE_P].rc_abd,
- rm->rm_col[CODE_Q].rc_abd
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_Q].rc_abd
};
raidz_math_begin();
- raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize);
- raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize);
+ raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize);
+ raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize);
for (c = 3; c < ncols; c++) {
- dabd = rm->rm_col[c].rc_abd;
- dsize = rm->rm_col[c].rc_size;
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2,
raidz_gen_pq_add);
@@ -487,31 +488,31 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize,
/*
 * Generate PQR parity (RAIDZ3)
*
- * @rm RAIDZ map
+ * @rr RAIDZ row
*/
static raidz_inline void
-raidz_generate_pqr_impl(raidz_map_t * const rm)
+raidz_generate_pqr_impl(raidz_row_t * const rr)
{
size_t c;
- const size_t ncols = raidz_ncols(rm);
- const size_t csize = rm->rm_col[CODE_P].rc_size;
+ const size_t ncols = rr->rr_cols;
+ const size_t csize = rr->rr_col[CODE_P].rc_size;
size_t dsize;
abd_t *dabd;
abd_t *cabds[] = {
- rm->rm_col[CODE_P].rc_abd,
- rm->rm_col[CODE_Q].rc_abd,
- rm->rm_col[CODE_R].rc_abd
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_Q].rc_abd,
+ rr->rr_col[CODE_R].rc_abd
};
raidz_math_begin();
- raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize);
- raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize);
- raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize);
+ raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize);
+ raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize);
+ raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize);
for (c = 4; c < ncols; c++) {
- dabd = rm->rm_col[c].rc_abd;
- dsize = rm->rm_col[c].rc_size;
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3,
raidz_gen_pqr_add);
@@ -579,33 +580,36 @@ raidz_generate_pqr_impl(raidz_map_t * const rm)
* @syn_method raidz_add_abd()
* @rec_method not applicable
*
- * @rm RAIDZ map
+ * @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
-raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
- const size_t firstdc = raidz_parity(rm);
- const size_t ncols = raidz_ncols(rm);
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
- const size_t xsize = rm->rm_col[x].rc_size;
- abd_t *xabd = rm->rm_col[x].rc_abd;
+ const size_t xsize = rr->rr_col[x].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
size_t size;
abd_t *dabd;
+ if (xabd == NULL)
+ return (1 << CODE_P);
+
raidz_math_begin();
/* copy P into target */
- raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize);
+ raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize);
/* generate p_syndrome */
for (c = firstdc; c < ncols; c++) {
if (c == x)
continue;
- dabd = rm->rm_col[c].rc_abd;
- size = MIN(rm->rm_col[c].rc_size, xsize);
+ dabd = rr->rr_col[c].rc_abd;
+ size = MIN(rr->rr_col[c].rc_size, xsize);
raidz_add(xabd, dabd, size);
}
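The P column is plain XOR parity, which is what raidz_copy()/raidz_add() compute above over ABDs. The following minimal standalone sketch over flat byte buffers (p_generate(), p_reconstruct(), and the equal-size buffer interface are hypothetical, not the abd-based implementation) shows both the generation and the single-column recovery that raidz_reconstruct_p_impl() performs:

#include <stddef.h>
#include <string.h>

/* P parity: byte-wise XOR of all data columns (sketch, equal-size buffers). */
static void
p_generate(unsigned char *p, unsigned char *const *data, int ncols, size_t size)
{
	memcpy(p, data[0], size);		/* start with the first column */
	for (int c = 1; c < ncols; c++)
		for (size_t i = 0; i < size; i++)
			p[i] ^= data[c][i];	/* add (XOR) each remaining column */
}

/* Recover the single missing column x: XOR P with every surviving column. */
static void
p_reconstruct(unsigned char *missing, const unsigned char *p,
    unsigned char *const *data, int ncols, int x, size_t size)
{
	memcpy(missing, p, size);
	for (int c = 0; c < ncols; c++) {
		if (c == x)
			continue;
		for (size_t i = 0; i < size; i++)
			missing[i] ^= data[c][i];
	}
}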
@@ -653,30 +657,33 @@ raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize,
* @syn_method raidz_add_abd()
* @rec_method raidz_mul_abd_cb()
*
- * @rm RAIDZ map
+ * @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
-raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
- const size_t firstdc = raidz_parity(rm);
- const size_t ncols = raidz_ncols(rm);
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
- abd_t *xabd = rm->rm_col[x].rc_abd;
- const size_t xsize = rm->rm_col[x].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ const size_t xsize = rr->rr_col[x].rc_size;
abd_t *tabds[] = { xabd };
+ if (xabd == NULL)
+ return (1 << CODE_Q);
+
unsigned coeff[MUL_CNT];
- raidz_rec_q_coeff(rm, tgtidx, coeff);
+ raidz_rec_q_coeff(rr, tgtidx, coeff);
raidz_math_begin();
/* Start with first data column if present */
if (firstdc != x) {
- raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
}
@@ -687,8 +694,8 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
- dabd = rm->rm_col[c].rc_abd;
- dsize = rm->rm_col[c].rc_size;
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
@@ -696,7 +703,7 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
}
/* add Q to the syndrome */
- raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize);
+ raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize);
/* transform the syndrome */
abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff);
@@ -744,30 +751,33 @@ raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize,
* @syn_method raidz_add_abd()
* @rec_method raidz_mul_abd_cb()
*
- * @rm RAIDZ map
+ * @rr	RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
-raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
- const size_t firstdc = raidz_parity(rm);
- const size_t ncols = raidz_ncols(rm);
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
- const size_t xsize = rm->rm_col[x].rc_size;
- abd_t *xabd = rm->rm_col[x].rc_abd;
+ const size_t xsize = rr->rr_col[x].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
abd_t *tabds[] = { xabd };
+ if (xabd == NULL)
+ return (1 << CODE_R);
+
unsigned coeff[MUL_CNT];
- raidz_rec_r_coeff(rm, tgtidx, coeff);
+ raidz_rec_r_coeff(rr, tgtidx, coeff);
raidz_math_begin();
/* Start with first data column if present */
if (firstdc != x) {
- raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
}
@@ -779,8 +789,8 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
- dabd = rm->rm_col[c].rc_abd;
- dsize = rm->rm_col[c].rc_size;
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
@@ -788,7 +798,7 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
}
/* add R to the syndrome */
- raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize);
+ raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize);
/* transform the syndrome */
abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
@@ -881,31 +891,34 @@ raidz_rec_pq_abd(void **tc, const size_t tsize, void **c,
* @syn_method raidz_syn_pq_abd()
* @rec_method raidz_rec_pq_abd()
*
- * @rm RAIDZ map
+ * @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
-raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
- const size_t firstdc = raidz_parity(rm);
- const size_t ncols = raidz_ncols(rm);
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
const size_t y = tgtidx[TARGET_Y];
- const size_t xsize = rm->rm_col[x].rc_size;
- const size_t ysize = rm->rm_col[y].rc_size;
- abd_t *xabd = rm->rm_col[x].rc_abd;
- abd_t *yabd = rm->rm_col[y].rc_abd;
+ const size_t xsize = rr->rr_col[x].rc_size;
+ const size_t ysize = rr->rr_col[y].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *yabd = rr->rr_col[y].rc_abd;
abd_t *tabds[2] = { xabd, yabd };
abd_t *cabds[] = {
- rm->rm_col[CODE_P].rc_abd,
- rm->rm_col[CODE_Q].rc_abd
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_Q].rc_abd
};
+ if (xabd == NULL)
+ return ((1 << CODE_P) | (1 << CODE_Q));
+
unsigned coeff[MUL_CNT];
- raidz_rec_pq_coeff(rm, tgtidx, coeff);
+ raidz_rec_pq_coeff(rr, tgtidx, coeff);
/*
	 * Check if some of the targets are shorter than others
@@ -921,8 +934,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
- raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
- raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@@ -934,8 +947,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
- dabd = rm->rm_col[c].rc_abd;
- dsize = rm->rm_col[c].rc_size;
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
@@ -946,7 +959,7 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
/* Copy shorter targets back to the original abd buffer */
if (ysize < xsize)
- raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
raidz_math_end();
@@ -1038,30 +1051,34 @@ raidz_rec_pr_abd(void **t, const size_t tsize, void **c,
* @syn_method raidz_syn_pr_abd()
* @rec_method raidz_rec_pr_abd()
*
- * @rm RAIDZ map
+ * @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
-raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
- const size_t firstdc = raidz_parity(rm);
- const size_t ncols = raidz_ncols(rm);
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[0];
const size_t y = tgtidx[1];
- const size_t xsize = rm->rm_col[x].rc_size;
- const size_t ysize = rm->rm_col[y].rc_size;
- abd_t *xabd = rm->rm_col[x].rc_abd;
- abd_t *yabd = rm->rm_col[y].rc_abd;
+ const size_t xsize = rr->rr_col[x].rc_size;
+ const size_t ysize = rr->rr_col[y].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *yabd = rr->rr_col[y].rc_abd;
abd_t *tabds[2] = { xabd, yabd };
abd_t *cabds[] = {
- rm->rm_col[CODE_P].rc_abd,
- rm->rm_col[CODE_R].rc_abd
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_R].rc_abd
};
+
+ if (xabd == NULL)
+ return ((1 << CODE_P) | (1 << CODE_R));
+
unsigned coeff[MUL_CNT];
- raidz_rec_pr_coeff(rm, tgtidx, coeff);
+ raidz_rec_pr_coeff(rr, tgtidx, coeff);
/*
	 * Check if some of the targets are shorter than others.
@@ -1077,8 +1094,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
- raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
- raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@@ -1090,8 +1107,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
- dabd = rm->rm_col[c].rc_abd;
- dsize = rm->rm_col[c].rc_size;
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
@@ -1104,14 +1121,14 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
* Copy shorter targets back to the original abd buffer
*/
if (ysize < xsize)
- raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
raidz_math_end();
if (ysize < xsize)
abd_free(yabd);
- return ((1 << CODE_P) | (1 << CODE_Q));
+ return ((1 << CODE_P) | (1 << CODE_R));
}
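
The one-line functional fix in this hunk is the return value: each reconstruction routine reports the parity codes it consumed as a bitmask over CODE_P/CODE_Q/CODE_R, and the P+R path previously claimed P|Q. A tiny stand-alone sketch of building and decoding such a mask (assuming the usual P=0, Q=1, R=2 ordering used by the raidz math code):

/* Stand-alone sketch of the parity-code bitmask; not ZFS code. */
#include <stdio.h>

enum { CODE_P = 0, CODE_Q = 1, CODE_R = 2 };

static void
print_codes_used(int mask)
{
	const char *names[] = { "P", "Q", "R" };

	for (int c = CODE_P; c <= CODE_R; c++) {
		if (mask & (1 << c))
			printf("used parity %s\n", names[c]);
	}
}

int
main(void)
{
	/* what the corrected P+R path reports */
	print_codes_used((1 << CODE_P) | (1 << CODE_R));
	return (0);
}
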
@@ -1201,30 +1218,34 @@ raidz_rec_qr_abd(void **t, const size_t tsize, void **c,
* @syn_method raidz_syn_qr_abd()
* @rec_method raidz_rec_qr_abd()
*
- * @rm RAIDZ map
+ * @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
-raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
- const size_t firstdc = raidz_parity(rm);
- const size_t ncols = raidz_ncols(rm);
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
const size_t y = tgtidx[TARGET_Y];
- const size_t xsize = rm->rm_col[x].rc_size;
- const size_t ysize = rm->rm_col[y].rc_size;
- abd_t *xabd = rm->rm_col[x].rc_abd;
- abd_t *yabd = rm->rm_col[y].rc_abd;
+ const size_t xsize = rr->rr_col[x].rc_size;
+ const size_t ysize = rr->rr_col[y].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *yabd = rr->rr_col[y].rc_abd;
abd_t *tabds[2] = { xabd, yabd };
abd_t *cabds[] = {
- rm->rm_col[CODE_Q].rc_abd,
- rm->rm_col[CODE_R].rc_abd
+ rr->rr_col[CODE_Q].rc_abd,
+ rr->rr_col[CODE_R].rc_abd
};
+
+ if (xabd == NULL)
+ return ((1 << CODE_Q) | (1 << CODE_R));
+
unsigned coeff[MUL_CNT];
- raidz_rec_qr_coeff(rm, tgtidx, coeff);
+ raidz_rec_qr_coeff(rr, tgtidx, coeff);
/*
	 * Check if some of the targets are shorter than others
@@ -1240,8 +1261,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
- raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
- raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@@ -1253,8 +1274,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
- dabd = rm->rm_col[c].rc_abd;
- dsize = rm->rm_col[c].rc_size;
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
@@ -1267,7 +1288,7 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
* Copy shorter targets back to the original abd buffer
*/
if (ysize < xsize)
- raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
raidz_math_end();
@@ -1384,34 +1405,38 @@ raidz_rec_pqr_abd(void **t, const size_t tsize, void **c,
* @syn_method raidz_syn_pqr_abd()
* @rec_method raidz_rec_pqr_abd()
*
- * @rm RAIDZ map
+ * @rr RAIDZ row
* @tgtidx array of missing data indexes
*/
static raidz_inline int
-raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
+raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx)
{
size_t c;
size_t dsize;
abd_t *dabd;
- const size_t firstdc = raidz_parity(rm);
- const size_t ncols = raidz_ncols(rm);
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
const size_t x = tgtidx[TARGET_X];
const size_t y = tgtidx[TARGET_Y];
const size_t z = tgtidx[TARGET_Z];
- const size_t xsize = rm->rm_col[x].rc_size;
- const size_t ysize = rm->rm_col[y].rc_size;
- const size_t zsize = rm->rm_col[z].rc_size;
- abd_t *xabd = rm->rm_col[x].rc_abd;
- abd_t *yabd = rm->rm_col[y].rc_abd;
- abd_t *zabd = rm->rm_col[z].rc_abd;
+ const size_t xsize = rr->rr_col[x].rc_size;
+ const size_t ysize = rr->rr_col[y].rc_size;
+ const size_t zsize = rr->rr_col[z].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *yabd = rr->rr_col[y].rc_abd;
+ abd_t *zabd = rr->rr_col[z].rc_abd;
abd_t *tabds[] = { xabd, yabd, zabd };
abd_t *cabds[] = {
- rm->rm_col[CODE_P].rc_abd,
- rm->rm_col[CODE_Q].rc_abd,
- rm->rm_col[CODE_R].rc_abd
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_Q].rc_abd,
+ rr->rr_col[CODE_R].rc_abd
};
+
+ if (xabd == NULL)
+ return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R));
+
unsigned coeff[MUL_CNT];
- raidz_rec_pqr_coeff(rm, tgtidx, coeff);
+ raidz_rec_pqr_coeff(rr, tgtidx, coeff);
/*
	 * Check if some of the targets are shorter than others
@@ -1431,9 +1456,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
/* Start with first data column if present */
if (firstdc != x) {
- raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
- raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
- raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize);
} else {
raidz_zero(xabd, xsize);
raidz_zero(yabd, xsize);
@@ -1446,8 +1471,8 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
dabd = NULL;
dsize = 0;
} else {
- dabd = rm->rm_col[c].rc_abd;
- dsize = rm->rm_col[c].rc_size;
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
}
abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3,
@@ -1460,9 +1485,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
* Copy shorter targets back to the original abd buffer
*/
if (ysize < xsize)
- raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
if (zsize < xsize)
- raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize);
+ raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize);
raidz_math_end();
diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
index 3362d608c037..784d1af15a81 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
@@ -25,6 +25,7 @@
*/
#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
#include <sys/dsl_scan.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
@@ -63,13 +64,15 @@
*
* Limitations:
*
- * - Only supported for mirror vdev types. Due to the variable stripe
- * width used by raidz sequential reconstruction is not possible.
+ * - Sequential reconstruction is not possible on RAIDZ due to its
+ * variable stripe width. Note dRAID uses a fixed stripe width which
+ * avoids this issue, but comes at the expense of some usable capacity.
*
- * - Block checksums are not verified during sequential reconstuction.
+ * - Block checksums are not verified during sequential reconstruction.
* Similar to traditional RAID the parity/mirror data is reconstructed
* but cannot be immediately double checked. For this reason when the
- * last active resilver completes the pool is automatically scrubbed.
+ * last active resilver completes the pool is automatically scrubbed
+ * by default.
*
* - Deferred resilvers using sequential reconstruction are not currently
* supported. When adding another vdev to an active top-level resilver
@@ -77,8 +80,8 @@
*
* Advantages:
*
- * - Sequential reconstuction is performed in LBA order which may be faster
- * than healing reconstuction particularly when using using HDDs (or
+ * - Sequential reconstruction is performed in LBA order which may be faster
+ *   than healing reconstruction, particularly when using HDDs (or
* especially with SMR devices). Only allocated capacity is resilvered.
*
* - Sequential reconstruction is not constrained by ZFS block boundaries.
@@ -86,9 +89,9 @@
* allowing all of these logical blocks to be repaired with a single IO.
*
* - Unlike a healing resilver or scrub which are pool wide operations,
- * sequential reconstruction is handled by the top-level mirror vdevs.
- * This allows for it to be started or canceled on a top-level vdev
- * without impacting any other top-level vdevs in the pool.
+ * sequential reconstruction is handled by the top-level vdevs. This
+ * allows for it to be started or canceled on a top-level vdev without
+ * impacting any other top-level vdevs in the pool.
*
* - Data only referenced by a pool checkpoint will be repaired because
* that space is reflected in the space maps. This differs for a
@@ -97,17 +100,35 @@
/*
- * Maximum number of queued rebuild I/Os top-level vdev. The number of
- * concurrent rebuild I/Os issued to the device is controlled by the
- * zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module
- * options.
+ * Size of rebuild reads; defaults to 1MiB per data disk and is capped at
+ * SPA_MAXBLOCKSIZE.
*/
-unsigned int zfs_rebuild_queue_limit = 20;
+unsigned long zfs_rebuild_max_segment = 1024 * 1024;
/*
- * Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE.
+ * Maximum number of bytes issued in parallel per leaf vdev by a
+ * sequential resilver. We attempt to strike a balance here between keeping
+ * the vdev queues full of I/Os at all times and not overflowing the queues,
+ * which would cause long latency and therefore long txg sync times.
+ *
+ * A large default value can be safely used here because the default target
+ * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep
+ * the queue depth short.
+ *
+ * 32MB was selected as the default value to achieve good performance with
+ * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential
+ * rebuild was unable to saturate all of the drives using smaller values.
+ * With a value of 32MB the sequential resilver write rate was measured at
+ * 800MB/s sustained while rebuilding to a distributed spare.
*/
-unsigned long zfs_rebuild_max_segment = 1024 * 1024;
+unsigned long zfs_rebuild_vdev_limit = 32 << 20;
+
+/*
+ * Automatically start a pool scrub when the last active sequential resilver
+ * completes in order to verify the checksums of all blocks which have been
+ * resilvered. This option is enabled by default and is strongly recommended.
+ */
+int zfs_rebuild_scrub_enabled = 1;
/*
* For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
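
The new vr_bytes_inflight_max cap (set further down in this patch) is derived from zfs_rebuild_vdev_limit and the child count of the top-level vdev, with a 1 MiB floor. A small illustrative computation of that expression, using the 32 MiB default and a hypothetical 90-child dRAID top-level vdev:

/*
 * Illustration only: mirrors the MAX(1 MiB, limit * children) expression
 * used to size vr_bytes_inflight_max; the 90-child vdev is hypothetical.
 */
#include <stdio.h>
#include <inttypes.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	uint64_t zfs_rebuild_vdev_limit = 32ULL << 20;	/* 32 MiB per leaf */
	uint64_t vdev_children = 90;			/* e.g. a 90-drive dRAID */

	uint64_t bytes_inflight_max = MAX(1ULL << 20,
	    zfs_rebuild_vdev_limit * vdev_children);

	printf("in-flight cap: %" PRIu64 " MiB\n", bytes_inflight_max >> 20);
	return (0);
}
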
@@ -293,7 +314,7 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
REBUILD_PHYS_ENTRIES, vrp, tx));
- vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
+ vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
spa_history_log_internal(spa, "rebuild", tx,
@@ -306,7 +327,16 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
vd->vdev_rebuilding = B_FALSE;
mutex_exit(&vd->vdev_rebuild_lock);
- spa_notify_waiters(spa);
+ /*
+	 * While we're in syncing context, take the opportunity to
+	 * set up the scrub when there are no more active rebuilds.
+ */
+ if (!vdev_rebuild_active(spa->spa_root_vdev) &&
+ zfs_rebuild_scrub_enabled) {
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ dsl_scan_setup_sync(&func, tx);
+ }
+
cv_broadcast(&vd->vdev_rebuild_cv);
}
@@ -438,7 +468,7 @@ vdev_rebuild_cb(zio_t *zio)
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
vdev_t *vd = vr->vr_top_vdev;
- mutex_enter(&vd->vdev_rebuild_io_lock);
+ mutex_enter(&vr->vr_io_lock);
if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
/*
* The I/O failed because the top-level vdev was unavailable.
@@ -455,34 +485,30 @@ vdev_rebuild_cb(zio_t *zio)
abd_free(zio->io_abd);
- ASSERT3U(vd->vdev_rebuild_inflight, >, 0);
- vd->vdev_rebuild_inflight--;
- cv_broadcast(&vd->vdev_rebuild_io_cv);
- mutex_exit(&vd->vdev_rebuild_io_lock);
+ ASSERT3U(vr->vr_bytes_inflight, >, 0);
+ vr->vr_bytes_inflight -= zio->io_size;
+ cv_broadcast(&vr->vr_io_cv);
+ mutex_exit(&vr->vr_io_lock);
spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}
/*
- * Rebuild the data in this range by constructing a special dummy block
- * pointer for the given range. It has no relation to any existing blocks
- * in the pool. But by disabling checksum verification and issuing a scrub
- * I/O mirrored vdevs will replicate the block using any available mirror
- * leaf vdevs.
+ * Initialize a block pointer that can be used to read the given segment
+ * for sequential rebuild.
*/
static void
-vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize,
- uint64_t txg)
+vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start,
+ uint64_t asize)
{
- vdev_t *vd = vr->vr_top_vdev;
- spa_t *spa = vd->vdev_spa;
- uint64_t psize = asize;
-
- ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
+ ASSERT(vd->vdev_ops == &vdev_draid_ops ||
+ vd->vdev_ops == &vdev_mirror_ops ||
vd->vdev_ops == &vdev_replacing_ops ||
vd->vdev_ops == &vdev_spare_ops);
- blkptr_t blk, *bp = &blk;
+ uint64_t psize = vd->vdev_ops == &vdev_draid_ops ?
+ vdev_draid_asize_to_psize(vd, asize) : asize;
+
BP_ZERO(bp);
DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
@@ -499,19 +525,6 @@ vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize,
BP_SET_LEVEL(bp, 0);
BP_SET_DEDUP(bp, 0);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- /*
- * We increment the issued bytes by the asize rather than the psize
- * so the scanned and issued bytes may be directly compared. This
- * is consistent with the scrub/resilver issued reporting.
- */
- vr->vr_pass_bytes_issued += asize;
- vr->vr_rebuild_phys.vrp_bytes_issued += asize;
-
- zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp,
- abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
- ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
- ZIO_FLAG_RESILVER, NULL));
}
/*
@@ -525,6 +538,7 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id;
vdev_t *vd = vr->vr_top_vdev;
spa_t *spa = vd->vdev_spa;
+ blkptr_t blk;
ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift);
ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift);
@@ -532,14 +546,26 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
vr->vr_pass_bytes_scanned += size;
vr->vr_rebuild_phys.vrp_bytes_scanned += size;
- mutex_enter(&vd->vdev_rebuild_io_lock);
+ /*
+ * Rebuild the data in this range by constructing a special block
+ * pointer. It has no relation to any existing blocks in the pool.
+ * However, by disabling checksum verification and issuing a scrub IO
+ * we can reconstruct and repair any children with missing data.
+ */
+ vdev_rebuild_blkptr_init(&blk, vd, start, size);
+ uint64_t psize = BP_GET_PSIZE(&blk);
+
+ if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
+ return (0);
+
+ mutex_enter(&vr->vr_io_lock);
/* Limit in flight rebuild I/Os */
- while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit)
- cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
+ while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max)
+ cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
- vd->vdev_rebuild_inflight++;
- mutex_exit(&vd->vdev_rebuild_io_lock);
+ vr->vr_bytes_inflight += psize;
+ mutex_exit(&vr->vr_io_lock);
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
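
The replacement throttle counts bytes rather than queued I/Os: the issuing thread sleeps on vr_io_cv while vr_bytes_inflight is at or above the cap, charges the read size before handing off the zio, and vdev_rebuild_cb() credits it back and broadcasts. A minimal pthread-based sketch of the same pattern (the throttle_* names are ours, not ZFS's):

/* Stand-alone byte-count throttle sketch; not the ZFS implementation. */
#include <pthread.h>
#include <stdint.h>

typedef struct throttle {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
	uint64_t	bytes_inflight;
	uint64_t	bytes_inflight_max;
} throttle_t;

/* Called by the issuing thread before starting an I/O of 'size' bytes. */
static void
throttle_enter(throttle_t *t, uint64_t size)
{
	pthread_mutex_lock(&t->lock);
	while (t->bytes_inflight >= t->bytes_inflight_max)
		pthread_cond_wait(&t->cv, &t->lock);
	t->bytes_inflight += size;
	pthread_mutex_unlock(&t->lock);
}

/* Called from the I/O completion path. */
static void
throttle_exit(throttle_t *t, uint64_t size)
{
	pthread_mutex_lock(&t->lock);
	t->bytes_inflight -= size;
	pthread_cond_broadcast(&t->cv);
	pthread_mutex_unlock(&t->lock);
}

int
main(void)
{
	throttle_t t = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
		0, 32ULL << 20
	};

	throttle_enter(&t, 1ULL << 20);	/* account a 1 MiB read */
	throttle_exit(&t, 1ULL << 20);	/* completion path */
	return (0);
}
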
@@ -558,46 +584,30 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
/* When exiting write out our progress. */
if (vdev_rebuild_should_stop(vd)) {
- mutex_enter(&vd->vdev_rebuild_io_lock);
- vd->vdev_rebuild_inflight--;
- mutex_exit(&vd->vdev_rebuild_io_lock);
+ mutex_enter(&vr->vr_io_lock);
+ vr->vr_bytes_inflight -= psize;
+ mutex_exit(&vr->vr_io_lock);
spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
mutex_exit(&vd->vdev_rebuild_lock);
dmu_tx_commit(tx);
return (SET_ERROR(EINTR));
}
mutex_exit(&vd->vdev_rebuild_lock);
+ dmu_tx_commit(tx);
vr->vr_scan_offset[txg & TXG_MASK] = start + size;
- vdev_rebuild_rebuild_block(vr, start, size, txg);
+ vr->vr_pass_bytes_issued += size;
+ vr->vr_rebuild_phys.vrp_bytes_issued += size;
- dmu_tx_commit(tx);
+ zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk,
+ abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
+ ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_RESILVER, NULL));
return (0);
}
/*
- * Split range into legally-sized logical chunks given the constraints of the
- * top-level mirror vdev type.
- */
-static uint64_t
-vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size)
-{
- uint64_t chunk_size, max_asize, max_segment;
-
- ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
- vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops);
-
- max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment,
- 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE);
- max_asize = vdev_psize_to_asize(vd, max_segment);
- chunk_size = MIN(size, max_asize);
-
- return (chunk_size);
-}
-
-/*
* Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree.
*/
static int
@@ -625,7 +635,14 @@ vdev_rebuild_ranges(vdev_rebuild_t *vr)
while (size > 0) {
uint64_t chunk_size;
- chunk_size = vdev_rebuild_chunk_size(vd, start, size);
+ /*
+ * Split range into legally-sized logical chunks
+ * given the constraints of the top-level vdev
+ * being rebuilt (dRAID or mirror).
+ */
+ ASSERT3P(vd->vdev_ops, !=, NULL);
+ chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd,
+ start, size, zfs_rebuild_max_segment);
error = vdev_rebuild_range(vr, start, chunk_size);
if (error != 0)
@@ -747,10 +764,16 @@ vdev_rebuild_thread(void *arg)
vr->vr_top_vdev = vd;
vr->vr_scan_msp = NULL;
vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);
+
vr->vr_pass_start_time = gethrtime();
vr->vr_pass_bytes_scanned = 0;
vr->vr_pass_bytes_issued = 0;
+ vr->vr_bytes_inflight_max = MAX(1ULL << 20,
+ zfs_rebuild_vdev_limit * vd->vdev_children);
+
uint64_t update_est_time = gethrtime();
vdev_rebuild_update_bytes_est(vd, 0);
@@ -780,21 +803,32 @@ vdev_rebuild_thread(void *arg)
ASSERT0(range_tree_space(vr->vr_scan_tree));
- /*
- * Disable any new allocations to this metaslab and wait
- * for any writes inflight to complete. This is needed to
- * ensure all allocated ranges are rebuilt.
- */
+ /* Disable any new allocations to this metaslab */
metaslab_disable(msp);
spa_config_exit(spa, SCL_CONFIG, FTAG);
- txg_wait_synced(dsl, 0);
mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
/*
+ * If there are outstanding allocations wait for them to be
+ * synced. This is needed to ensure all allocated ranges are
+ * on disk and therefore will be rebuilt.
+ */
+ for (int j = 0; j < TXG_SIZE; j++) {
+ if (range_tree_space(msp->ms_allocating[j])) {
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&msp->ms_sync_lock);
+ txg_wait_synced(dsl, 0);
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+ break;
+ }
+ }
+
+ /*
		 * When a metaslab has been allocated from, read its allocated
- * ranges from the space map object in to the vr_scan_tree.
+ * ranges from the space map object into the vr_scan_tree.
* Then add inflight / unflushed ranges and remove inflight /
* unflushed frees. This is the minimum range to be rebuilt.
*/
@@ -827,7 +861,7 @@ vdev_rebuild_thread(void *arg)
/*
* To provide an accurate estimate re-calculate the estimated
* size every 5 minutes to account for recent allocations and
- * frees made space maps which have not yet been rebuilt.
+ * frees made to space maps which have not yet been rebuilt.
*/
if (gethrtime() > update_est_time + SEC2NSEC(300)) {
update_est_time = gethrtime();
@@ -851,11 +885,14 @@ vdev_rebuild_thread(void *arg)
spa_config_exit(spa, SCL_CONFIG, FTAG);
/* Wait for any remaining rebuild I/O to complete */
- mutex_enter(&vd->vdev_rebuild_io_lock);
- while (vd->vdev_rebuild_inflight > 0)
- cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
+ mutex_enter(&vr->vr_io_lock);
+ while (vr->vr_bytes_inflight > 0)
+ cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
- mutex_exit(&vd->vdev_rebuild_io_lock);
+ mutex_exit(&vr->vr_io_lock);
+
+ mutex_destroy(&vr->vr_io_lock);
+ cv_destroy(&vr->vr_io_cv);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
@@ -1100,5 +1137,11 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW,
- "Max segment size in bytes of rebuild reads");
+ "Max segment size in bytes of rebuild reads");
+
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW,
+ "Max bytes in flight per leaf vdev for sequential resilvers");
+
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW,
+ "Automatically scrub after sequential resilver completes");
/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c
index ed7d1d4b3030..6eaaddd3979f 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_removal.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c
@@ -250,7 +250,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
spa_vdev_removal_t *svr = NULL;
uint64_t txg __maybe_unused = dmu_tx_get_txg(tx);
- ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
+ ASSERT0(vdev_get_nparity(vd));
svr = spa_vdev_removal_create(vd);
ASSERT(vd->vdev_removing);
@@ -993,7 +993,7 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
* An allocation class might not have any remaining vdevs or space
*/
metaslab_class_t *mc = mg->mg_class;
- if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
+ if (mc->mc_groups == 0)
mc = spa_normal_class(spa);
int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
zal, 0);
@@ -1120,7 +1120,7 @@ static void
vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
{
ASSERT3P(zlist, !=, NULL);
- ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
+ ASSERT0(vdev_get_nparity(vd));
if (vd->vdev_leaf_zap != 0) {
char zkey[32];
@@ -1976,32 +1976,38 @@ spa_vdev_remove_top_check(vdev_t *vd)
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
return (SET_ERROR(ENOTSUP));
- /* available space in the pool's normal class */
- uint64_t available = dsl_dir_space_available(
- spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
metaslab_class_t *mc = vd->vdev_mg->mg_class;
-
- /*
- * When removing a vdev from an allocation class that has
- * remaining vdevs, include available space from the class.
- */
- if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
- uint64_t class_avail = metaslab_class_get_space(mc) -
- metaslab_class_get_alloc(mc);
-
- /* add class space, adjusted for overhead */
- available += (class_avail * 94) / 100;
- }
-
- /*
- * There has to be enough free space to remove the
- * device and leave double the "slop" space (i.e. we
- * must leave at least 3% of the pool free, in addition to
- * the normal slop space).
- */
- if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
- return (SET_ERROR(ENOSPC));
+ metaslab_class_t *normal = spa_normal_class(spa);
+ if (mc != normal) {
+ /*
+ * Space allocated from the special (or dedup) class is
+ * included in the DMU's space usage, but it's not included
+ * in spa_dspace (or dsl_pool_adjustedsize()). Therefore
+ * there is always at least as much free space in the normal
+ * class, as is allocated from the special (and dedup) class.
+ * As a backup check, we will return ENOSPC if this is
+ * violated. See also spa_update_dspace().
+ */
+ uint64_t available = metaslab_class_get_space(normal) -
+ metaslab_class_get_alloc(normal);
+ ASSERT3U(available, >=, vd->vdev_stat.vs_alloc);
+ if (available < vd->vdev_stat.vs_alloc)
+ return (SET_ERROR(ENOSPC));
+ } else {
+ /* available space in the pool's normal class */
+ uint64_t available = dsl_dir_space_available(
+ spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
+ if (available <
+ vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+ /*
+ * This is a normal device. There has to be enough free
+ * space to remove the device and leave double the
+ * "slop" space (i.e. we must leave at least 3% of the
+ * pool free, in addition to the normal slop space).
+ */
+ return (SET_ERROR(ENOSPC));
+ }
}
/*
@@ -2041,7 +2047,7 @@ spa_vdev_remove_top_check(vdev_t *vd)
/*
* All vdevs in normal class must have the same ashift
- * and not be raidz.
+ * and not be raidz or draid.
*/
vdev_t *rvd = spa->spa_root_vdev;
int num_indirect = 0;
@@ -2064,7 +2070,7 @@ spa_vdev_remove_top_check(vdev_t *vd)
num_indirect++;
if (!vdev_is_concrete(cvd))
continue;
- if (cvd->vdev_ops == &vdev_raidz_ops)
+ if (vdev_get_nparity(cvd) != 0)
return (SET_ERROR(EINVAL));
/*
* Need the mirror to be mirror of leaf vdevs only
@@ -2217,18 +2223,30 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
* in this pool.
*/
if (vd == NULL || unspare) {
- if (vd == NULL)
- vd = spa_lookup_by_guid(spa, guid, B_TRUE);
- ev = spa_event_create(spa, vd, NULL,
- ESC_ZFS_VDEV_REMOVE_AUX);
-
- vd_type = VDEV_TYPE_SPARE;
- vd_path = spa_strdup(fnvlist_lookup_string(
- nv, ZPOOL_CONFIG_PATH));
- spa_vdev_remove_aux(spa->spa_spares.sav_config,
- ZPOOL_CONFIG_SPARES, spares, nspares, nv);
- spa_load_spares(spa);
- spa->spa_spares.sav_sync = B_TRUE;
+ char *type;
+ boolean_t draid_spare = B_FALSE;
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type)
+ == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
+ draid_spare = B_TRUE;
+
+ if (vd == NULL && draid_spare) {
+ error = SET_ERROR(ENOTSUP);
+ } else {
+ if (vd == NULL)
+ vd = spa_lookup_by_guid(spa,
+ guid, B_TRUE);
+ ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_AUX);
+
+ vd_type = VDEV_TYPE_SPARE;
+ vd_path = spa_strdup(fnvlist_lookup_string(
+ nv, ZPOOL_CONFIG_PATH));
+ spa_vdev_remove_aux(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, nspares, nv);
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
} else {
error = SET_ERROR(EBUSY);
}
diff --git a/sys/contrib/openzfs/module/zfs/vdev_root.c b/sys/contrib/openzfs/module/zfs/vdev_root.c
index 9e8aac7d03de..45ddc2f71927 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_root.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_root.c
@@ -142,9 +142,13 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
}
vdev_ops_t vdev_root_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
.vdev_op_open = vdev_root_open,
.vdev_op_close = vdev_root_close,
.vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
.vdev_op_io_start = NULL, /* not applicable to the root */
.vdev_op_io_done = NULL, /* not applicable to the root */
.vdev_op_state_change = vdev_root_state_change,
@@ -153,6 +157,11 @@ vdev_ops_t vdev_root_ops = {
.vdev_op_rele = NULL,
.vdev_op_remap = NULL,
.vdev_op_xlate = NULL,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
.vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */
.vdev_op_leaf = B_FALSE /* not a leaf vdev */
};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c
index 02b42ddd5a6c..895957bda195 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_trim.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c
@@ -311,7 +311,8 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
vd->vdev_trim_secure = secure;
}
- boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED);
+ vdev_trim_state_t old_state = vd->vdev_trim_state;
+ boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED);
vd->vdev_trim_state = new_state;
dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
@@ -332,9 +333,12 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
"vdev=%s suspended", vd->vdev_path);
break;
case VDEV_TRIM_CANCELED:
- spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
- spa_history_log_internal(spa, "trim", tx,
- "vdev=%s canceled", vd->vdev_path);
+ if (old_state == VDEV_TRIM_ACTIVE ||
+ old_state == VDEV_TRIM_SUSPENDED) {
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
+ spa_history_log_internal(spa, "trim", tx,
+ "vdev=%s canceled", vd->vdev_path);
+ }
break;
case VDEV_TRIM_COMPLETE:
spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
@@ -601,6 +605,32 @@ vdev_trim_ranges(trim_args_t *ta)
return (0);
}
+static void
+vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
+{
+ uint64_t *last_rs_end = (uint64_t *)arg;
+
+ if (physical_rs->rs_end > *last_rs_end)
+ *last_rs_end = physical_rs->rs_end;
+}
+
+static void
+vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs)
+{
+ vdev_t *vd = (vdev_t *)arg;
+
+ uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
+ vd->vdev_trim_bytes_est += size;
+
+ if (vd->vdev_trim_last_offset >= physical_rs->rs_end) {
+ vd->vdev_trim_bytes_done += size;
+ } else if (vd->vdev_trim_last_offset > physical_rs->rs_start &&
+ vd->vdev_trim_last_offset <= physical_rs->rs_end) {
+ vd->vdev_trim_bytes_done +=
+ vd->vdev_trim_last_offset - physical_rs->rs_start;
+ }
+}
+
/*
* Calculates the completion percentage of a manual TRIM.
*/
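
vdev_trim_xlate_progress() classifies each translated physical range against vdev_trim_last_offset: ranges entirely behind the cursor count fully as done, a range straddling the cursor counts partially, and every range contributes to the estimate. A stand-alone sketch of that bookkeeping with made-up offsets:

/* Sketch of the trim-progress accounting; values are made up. */
#include <stdio.h>
#include <stdint.h>

static void
account_range(uint64_t start, uint64_t end, uint64_t last_offset,
    uint64_t *done, uint64_t *est)
{
	uint64_t size = end - start;

	*est += size;				/* every range adds to the estimate */
	if (last_offset >= end)
		*done += size;			/* cursor already passed this range */
	else if (last_offset > start)
		*done += last_offset - start;	/* cursor is mid-range */
}

int
main(void)
{
	uint64_t done = 0, est = 0, cursor = 150;

	account_range(0, 100, cursor, &done, &est);	/* fully trimmed */
	account_range(100, 200, cursor, &done, &est);	/* half trimmed */
	account_range(200, 300, cursor, &done, &est);	/* untrimmed */
	printf("done %llu of %llu\n",
	    (unsigned long long)done, (unsigned long long)est);
	return (0);
}
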
@@ -618,27 +648,35 @@ vdev_trim_calculate_progress(vdev_t *vd)
metaslab_t *msp = vd->vdev_top->vdev_ms[i];
mutex_enter(&msp->ms_lock);
- uint64_t ms_free = msp->ms_size -
- metaslab_allocated_space(msp);
-
- if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
- ms_free /= vd->vdev_top->vdev_children;
+ uint64_t ms_free = (msp->ms_size -
+ metaslab_allocated_space(msp)) /
+ vdev_get_ndisks(vd->vdev_top);
/*
* Convert the metaslab range to a physical range
* on our vdev. We use this to determine if we are
* in the middle of this metaslab range.
*/
- range_seg64_t logical_rs, physical_rs;
+ range_seg64_t logical_rs, physical_rs, remain_rs;
logical_rs.rs_start = msp->ms_start;
logical_rs.rs_end = msp->ms_start + msp->ms_size;
- vdev_xlate(vd, &logical_rs, &physical_rs);
+ /* Metaslab space after this offset has not been trimmed. */
+ vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
vd->vdev_trim_bytes_est += ms_free;
mutex_exit(&msp->ms_lock);
continue;
- } else if (vd->vdev_trim_last_offset > physical_rs.rs_end) {
+ }
+
+ /* Metaslab space before this offset has been trimmed */
+ uint64_t last_rs_end = physical_rs.rs_end;
+ if (!vdev_xlate_is_empty(&remain_rs)) {
+ vdev_xlate_walk(vd, &remain_rs,
+ vdev_trim_xlate_last_rs_end, &last_rs_end);
+ }
+
+ if (vd->vdev_trim_last_offset > last_rs_end) {
vd->vdev_trim_bytes_done += ms_free;
vd->vdev_trim_bytes_est += ms_free;
mutex_exit(&msp->ms_lock);
@@ -659,21 +697,9 @@ vdev_trim_calculate_progress(vdev_t *vd)
rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
logical_rs.rs_start = rs_get_start(rs, rt);
logical_rs.rs_end = rs_get_end(rs, rt);
- vdev_xlate(vd, &logical_rs, &physical_rs);
-
- uint64_t size = physical_rs.rs_end -
- physical_rs.rs_start;
- vd->vdev_trim_bytes_est += size;
- if (vd->vdev_trim_last_offset >= physical_rs.rs_end) {
- vd->vdev_trim_bytes_done += size;
- } else if (vd->vdev_trim_last_offset >
- physical_rs.rs_start &&
- vd->vdev_trim_last_offset <=
- physical_rs.rs_end) {
- vd->vdev_trim_bytes_done +=
- vd->vdev_trim_last_offset -
- physical_rs.rs_start;
- }
+
+ vdev_xlate_walk(vd, &logical_rs,
+ vdev_trim_xlate_progress, vd);
}
mutex_exit(&msp->ms_lock);
}
@@ -741,8 +767,38 @@ vdev_trim_load(vdev_t *vd)
return (err);
}
+static void
+vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs)
+{
+ trim_args_t *ta = arg;
+ vdev_t *vd = ta->trim_vdev;
+
+ /*
+ * Only a manual trim will be traversing the vdev sequentially.
+ * For an auto trim all valid ranges should be added.
+ */
+ if (ta->trim_type == TRIM_TYPE_MANUAL) {
+
+ /* Only add segments that we have not visited yet */
+ if (physical_rs->rs_end <= vd->vdev_trim_last_offset)
+ return;
+
+ /* Pick up where we left off mid-range. */
+ if (vd->vdev_trim_last_offset > physical_rs->rs_start) {
+ ASSERT3U(physical_rs->rs_end, >,
+ vd->vdev_trim_last_offset);
+ physical_rs->rs_start = vd->vdev_trim_last_offset;
+ }
+ }
+
+ ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
+
+ range_tree_add(ta->trim_tree, physical_rs->rs_start,
+ physical_rs->rs_end - physical_rs->rs_start);
+}
+
/*
- * Convert the logical range into a physical range and add it to the
+ * Convert the logical range into physical ranges and add them to the
* range tree passed in the trim_args_t.
*/
static void
@@ -750,7 +806,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
{
trim_args_t *ta = arg;
vdev_t *vd = ta->trim_vdev;
- range_seg64_t logical_rs, physical_rs;
+ range_seg64_t logical_rs;
logical_rs.rs_start = start;
logical_rs.rs_end = start + size;
@@ -767,44 +823,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
}
ASSERT(vd->vdev_ops->vdev_op_leaf);
- vdev_xlate(vd, &logical_rs, &physical_rs);
-
- IMPLY(vd->vdev_top == vd,
- logical_rs.rs_start == physical_rs.rs_start);
- IMPLY(vd->vdev_top == vd,
- logical_rs.rs_end == physical_rs.rs_end);
-
- /*
- * Only a manual trim will be traversing the vdev sequentially.
- * For an auto trim all valid ranges should be added.
- */
- if (ta->trim_type == TRIM_TYPE_MANUAL) {
-
- /* Only add segments that we have not visited yet */
- if (physical_rs.rs_end <= vd->vdev_trim_last_offset)
- return;
-
- /* Pick up where we left off mid-range. */
- if (vd->vdev_trim_last_offset > physical_rs.rs_start) {
- ASSERT3U(physical_rs.rs_end, >,
- vd->vdev_trim_last_offset);
- physical_rs.rs_start = vd->vdev_trim_last_offset;
- }
- }
-
- ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
-
- /*
- * With raidz, it's possible that the logical range does not live on
- * this leaf vdev. We only add the physical range to this vdev's if it
- * has a length greater than 0.
- */
- if (physical_rs.rs_end > physical_rs.rs_start) {
- range_tree_add(ta->trim_tree, physical_rs.rs_start,
- physical_rs.rs_end - physical_rs.rs_start);
- } else {
- ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
- }
+ vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg);
}
/*
diff --git a/sys/contrib/openzfs/module/zfs/zcp.c b/sys/contrib/openzfs/module/zfs/zcp.c
index 793e0e4f0b75..1ad53eae1eef 100644
--- a/sys/contrib/openzfs/module/zfs/zcp.c
+++ b/sys/contrib/openzfs/module/zfs/zcp.c
@@ -722,8 +722,6 @@ static void *
zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize)
{
zcp_alloc_arg_t *allocargs = ud;
- int flags = (allocargs->aa_must_succeed) ?
- KM_SLEEP : (KM_NOSLEEP | KM_NORMALPRI);
if (nsize == 0) {
if (ptr != NULL) {
@@ -746,10 +744,7 @@ zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize)
return (NULL);
}
- allocbuf = vmem_alloc(allocsize, flags);
- if (allocbuf == NULL) {
- return (NULL);
- }
+ allocbuf = vmem_alloc(allocsize, KM_SLEEP);
allocargs->aa_alloc_remaining -= allocsize;
*allocbuf = allocsize;
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c
index a8341f50ba09..ea71ef325c89 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_fm.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c
@@ -1111,7 +1111,9 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
bcopy(info, report->zcr_ckinfo, sizeof (*info));
}
- report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift;
+ report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
+ report->zcr_align =
+ vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
report->zcr_length = length;
#ifdef _KERNEL
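
The checksum report now distinguishes the device sector size (derived from the top-level vdev's ashift) from the allocated size of one such sector, which vdev_psize_to_asize() may inflate for parity-based layouts. A trivial sketch of the ashift-to-sector arithmetic only; the psize-to-asize mapping is vdev-specific and is not reimplemented here:

/* Illustration of deriving the logical sector size from a vdev's ashift. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t ashift = 12;			/* typical 4 KiB-sector device */
	uint64_t sector = 1ULL << ashift;	/* what zcr_sector records */

	/*
	 * zcr_align is vdev_psize_to_asize(top, sector): the allocated size
	 * of one sector, which for raidz/dRAID includes parity/padding and
	 * is therefore at least as large as the sector itself.
	 */
	printf("ashift %llu -> %llu-byte sectors\n",
	    (unsigned long long)ashift, (unsigned long long)sector);
	return (0);
}
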
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
index 8703290020a5..8eb9474cadb0 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
@@ -231,6 +231,13 @@ zfsdev_state_t *zfsdev_state_list;
*/
unsigned long zfs_max_nvlist_src_size = 0;
+/*
+ * When logging the output nvlist of an ioctl in the on-disk history, limit
+ * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS.
+ * This applies primarily to zfs_ioc_channel_program().
+ */
+unsigned long zfs_history_output_max = 1024 * 1024;
+
uint_t zfs_fsyncer_key;
uint_t zfs_allow_log_key;
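
zfs_history_output_max is consulted in zfsdev_ioctl_common() later in this patch: if the packed output nvlist exceeds the limit, only its size is recorded in the pool history. A rough sketch of that decision; the function and parameter names here are illustrative, not ZFS APIs:

/*
 * Sketch of the "log the payload or just its size" decision made in
 * zfsdev_ioctl_common(); packed_size/limit are illustrative names.
 */
#include <stdio.h>
#include <stddef.h>

static void
log_ioctl_output(size_t packed_size, size_t limit)
{
	if (packed_size > limit)
		printf("history: output_size=%zu (payload omitted)\n",
		    packed_size);
	else
		printf("history: logging %zu-byte output nvlist\n",
		    packed_size);
}

int
main(void)
{
	size_t limit = 1024 * 1024;	/* zfs_history_output_max default */

	log_ioctl_output(512, limit);		/* small: log the nvlist */
	log_ioctl_output(8u << 20, limit);	/* large: log only the size */
	return (0);
}
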
@@ -5851,7 +5858,6 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc)
static int
zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
{
- objset_t *os;
int error = 0;
zfsvfs_t *zfsvfs;
@@ -5872,19 +5878,54 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
error = zfs_resume_fs(zfsvfs, newds);
}
}
- if (error == 0)
- error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
+ if (error == 0) {
+ mutex_enter(&zfsvfs->z_os->os_upgrade_lock);
+ if (zfsvfs->z_os->os_upgrade_id == 0) {
+ /* clear potential error code and retry */
+ zfsvfs->z_os->os_upgrade_status = 0;
+ mutex_exit(&zfsvfs->z_os->os_upgrade_lock);
+
+ dsl_pool_config_enter(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ dmu_objset_userspace_upgrade(zfsvfs->z_os);
+ dsl_pool_config_exit(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ } else {
+ mutex_exit(&zfsvfs->z_os->os_upgrade_lock);
+ }
+
+ taskq_wait_id(zfsvfs->z_os->os_spa->spa_upgrade_taskq,
+ zfsvfs->z_os->os_upgrade_id);
+ error = zfsvfs->z_os->os_upgrade_status;
+ }
zfs_vfs_rele(zfsvfs);
} else {
+ objset_t *os;
+
/* XXX kind of reading contents without owning */
error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os);
if (error != 0)
return (error);
- error = dmu_objset_userspace_upgrade(os);
- dmu_objset_rele_flags(os, B_TRUE, FTAG);
- }
+ mutex_enter(&os->os_upgrade_lock);
+ if (os->os_upgrade_id == 0) {
+ /* clear potential error code and retry */
+ os->os_upgrade_status = 0;
+ mutex_exit(&os->os_upgrade_lock);
+
+ dmu_objset_userspace_upgrade(os);
+ } else {
+ mutex_exit(&os->os_upgrade_lock);
+ }
+
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+
+ taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id);
+ error = os->os_upgrade_status;
+ dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT,
+ FTAG);
+ }
return (error);
}
@@ -6609,14 +6650,17 @@ static int
zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl)
{
int err;
- boolean_t force = B_FALSE;
+ boolean_t rc, force = B_FALSE;
spa_t *spa;
if ((err = spa_open(pool, &spa, FTAG)) != 0)
return (err);
- if (innvl)
- force = fnvlist_lookup_boolean_value(innvl, "force");
+ if (innvl) {
+ err = nvlist_lookup_boolean_value(innvl, "force", &rc);
+ if (err == 0)
+ force = rc;
+ }
if (force) {
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER);
@@ -6627,7 +6671,7 @@ zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl)
spa_close(spa, FTAG);
- return (err);
+ return (0);
}
/*
@@ -7519,8 +7563,14 @@ zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag)
vec->zvec_allow_log &&
spa_open(zc->zc_name, &spa, FTAG) == 0) {
if (!nvlist_empty(outnvl)) {
- fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL,
- outnvl);
+ size_t out_size = fnvlist_size(outnvl);
+ if (out_size > zfs_history_output_max) {
+ fnvlist_add_int64(lognv,
+ ZPOOL_HIST_OUTPUT_SIZE, out_size);
+ } else {
+ fnvlist_add_nvlist(lognv,
+ ZPOOL_HIST_OUTPUT_NVL, outnvl);
+ }
}
if (error != 0) {
fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO,
@@ -7629,4 +7679,7 @@ zfs_kmod_fini(void)
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW,
"Maximum size in bytes allowed for src nvlist passed with ZFS ioctls");
+
+ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW,
+ "Maximum size in bytes of ZFS ioctl output that will be logged");
/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
new file mode 100644
index 000000000000..3b7c52b8dd34
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
@@ -0,0 +1,895 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/uio.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/policy.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+
+
+static ulong_t zfs_fsync_sync_cnt = 4;
+
+int
+zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+
+ (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
+
+ if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ }
+ tsd_set(zfs_fsyncer_key, NULL);
+
+ return (0);
+}
+
+
+#if defined(SEEK_HOLE) && defined(SEEK_DATA)
+/*
+ * Lseek support for finding holes (cmd == SEEK_HOLE) and
+ * data (cmd == SEEK_DATA). "off" is an in/out parameter.
+ */
+static int
+zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
+{
+ uint64_t noff = (uint64_t)*off; /* new offset */
+ uint64_t file_sz;
+ int error;
+ boolean_t hole;
+
+ file_sz = zp->z_size;
+ if (noff >= file_sz) {
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (cmd == F_SEEK_HOLE)
+ hole = B_TRUE;
+ else
+ hole = B_FALSE;
+
+ error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
+
+ if (error == ESRCH)
+ return (SET_ERROR(ENXIO));
+
+ /* file was dirty, so fall back to using generic logic */
+ if (error == EBUSY) {
+ if (hole)
+ *off = file_sz;
+
+ return (0);
+ }
+
+ /*
+ * We could find a hole that begins after the logical end-of-file,
+ * because dmu_offset_next() only works on whole blocks. If the
+ * EOF falls mid-block, then indicate that the "virtual hole"
+ * at the end of the file begins at the logical EOF, rather than
+ * at the end of the last block.
+ */
+ if (noff > file_sz) {
+ ASSERT(hole);
+ noff = file_sz;
+ }
+
+ if (noff < *off)
+ return (error);
+ *off = noff;
+ return (error);
+}
+
+int
+zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ error = zfs_holey_common(zp, cmd, off);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+#endif /* SEEK_HOLE && SEEK_DATA */
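
zfs_holey_common() is what ultimately services lseek(2) with SEEK_HOLE/SEEK_DATA on ZFS files. For reference, the userland side is ordinary lseek usage (standard FreeBSD/Linux API, nothing ZFS-specific; error handling is abbreviated):

/* Userland view of SEEK_HOLE/SEEK_DATA: plain lseek(2). */
#define	_GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return (1);
	}

	int fd = open(argv[1], O_RDONLY);
	if (fd == -1) {
		perror("open");
		return (1);
	}

	/* First data region and the hole that follows it, if any. */
	off_t data = lseek(fd, 0, SEEK_DATA);
	off_t hole = (data >= 0) ? lseek(fd, data, SEEK_HOLE) : -1;

	printf("data at %lld, next hole at %lld\n",
	    (long long)data, (long long)hole);

	close(fd);
	return (0);
}
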
+
+/*ARGSUSED*/
+int
+zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (flag & V_ACE_MASK)
+ error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
+ else
+ error = zfs_zaccess_rwx(zp, mode, flag, cr);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
+
+/*
+ * Read bytes from specified file into supplied buffer.
+ *
+ * IN: zp - inode of file to be read from.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * ioflag - O_SYNC flags; used to provide FRSYNC semantics.
+ * O_DIRECT flag; used to bypass page cache.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Side Effects:
+ * inode - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+int
+zfs_read(struct znode *zp, uio_t *uio, int ioflag, cred_t *cr)
+{
+ int error = 0;
+ boolean_t frsync = B_FALSE;
+
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (zp->z_pflags & ZFS_AV_QUARANTINED) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+
+ /* We don't copy out anything useful for directories. */
+ if (Z_ISDIR(ZTOTYPE(zp))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EISDIR));
+ }
+
+ /*
+ * Validate file offset
+ */
+ if (uio->uio_loffset < (offset_t)0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Fasttrack empty reads
+ */
+ if (uio->uio_resid == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+#ifdef FRSYNC
+ /*
+ * If we're in FRSYNC mode, sync out this znode before reading it.
+ * Only do this for non-snapshots.
+ *
+ * Some platforms do not support FRSYNC and instead map it
+ * to O_SYNC, which results in unnecessary calls to zil_commit. We
+ * only honor FRSYNC requests on platforms which support it.
+ */
+ frsync = !!(ioflag & FRSYNC);
+#endif
+ if (zfsvfs->z_log &&
+ (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
+ zil_commit(zfsvfs->z_log, zp->z_id);
+
+ /*
+ * Lock the range against changes.
+ */
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
+ uio->uio_loffset, uio->uio_resid, RL_READER);
+
+ /*
+ * If we are reading past end-of-file we can skip
+ * to the end; but we might still need to set atime.
+ */
+ if (uio->uio_loffset >= zp->z_size) {
+ error = 0;
+ goto out;
+ }
+
+ ASSERT(uio->uio_loffset < zp->z_size);
+ ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+ ssize_t start_resid = n;
+
+ while (n > 0) {
+ ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size -
+ P2PHASE(uio->uio_loffset, zfs_vnops_read_chunk_size));
+#ifdef UIO_NOCOPY
+ if (uio->uio_segflg == UIO_NOCOPY)
+ error = mappedread_sf(zp, nbytes, uio);
+ else
+#endif
+ if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) {
+ error = mappedread(zp, nbytes, uio);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes);
+ }
+
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+
+ n -= nbytes;
+ }
+
+ int64_t nread = start_resid - n;
+ dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
+ task_io_account_read(nread);
+out:
+ zfs_rangelock_exit(lr);
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
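
The read loop above never issues more than zfs_vnops_read_chunk_size per pass, and the P2PHASE() term makes the first pass stop at the next chunk boundary so later passes stay aligned. A stand-alone sketch of that chunking arithmetic, assuming a power-of-two chunk size as with the 1 MiB default (offsets are made up):

/* Sketch of the chunking arithmetic used by the zfs_read() loop above. */
#include <stdio.h>
#include <stdint.h>

/* P2PHASE(x, align): offset of x within its align-sized block (align = 2^k) */
#define	P2PHASE(x, align)	((x) & ((align) - 1))
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

int
main(void)
{
	const uint64_t chunk = 1024 * 1024;	/* zfs_vnops_read_chunk_size */
	uint64_t off = 300 * 1024;		/* hypothetical starting offset */
	uint64_t n = 3 * 1024 * 1024;		/* hypothetical residual */

	while (n > 0) {
		/* first pass reads up to the next chunk boundary, then full chunks */
		uint64_t nbytes = MIN(n, chunk - P2PHASE(off, chunk));
		printf("read %llu bytes at offset %llu\n",
		    (unsigned long long)nbytes, (unsigned long long)off);
		off += nbytes;
		n -= nbytes;
	}
	return (0);
}
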
+
+/*
+ * Write the bytes to a file.
+ *
+ * IN: zp - znode of file to be written to.
+ * uio - structure supplying write location, range info,
+ * and data buffer.
+ * ioflag - O_APPEND flag set if in append mode.
+ * O_DIRECT flag; used to bypass page cache.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - ctime|mtime updated if byte count > 0
+ */
+
+/* ARGSUSED */
+int
+zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr)
+{
+ int error = 0;
+ ssize_t start_resid = uio->uio_resid;
+
+ /*
+ * Fasttrack empty write
+ */
+ ssize_t n = start_resid;
+ if (n == 0)
+ return (0);
+
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+ uint64_t mtime[2], ctime[2];
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(zfsvfs)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * If immutable or not appending then return EPERM
+ */
+ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
+ ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
+ (uio->uio_loffset < zp->z_size))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * Validate file offset
+ */
+ offset_t woff = ioflag & O_APPEND ? zp->z_size : uio->uio_loffset;
+ if (woff < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ const uint64_t max_blksz = zfsvfs->z_max_blksz;
+
+ /*
+	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
+ * don't hold up txg.
+ * Skip this if uio contains loaned arc_buf.
+ */
+ if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EFAULT));
+ }
+
+ /*
+ * If in append mode, set the io offset pointer to eof.
+ */
+ zfs_locked_range_t *lr;
+ if (ioflag & O_APPEND) {
+ /*
+ * Obtain an appending range lock to guarantee file append
+ * semantics. We reset the write offset once we have the lock.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
+ woff = lr->lr_offset;
+ if (lr->lr_length == UINT64_MAX) {
+ /*
+ * We overlocked the file because this write will cause
+ * the file block size to increase.
+ * Note that zp_size cannot change with this lock held.
+ */
+ woff = zp->z_size;
+ }
+ uio->uio_loffset = woff;
+ } else {
+ /*
+ * Note that if the file block size will change as a result of
+ * this write, then this range lock will lock the entire file
+ * so that we can re-write the block safely.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
+ }
+
+ if (zn_rlimit_fsize(zp, uio, uio->uio_td)) {
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EFBIG));
+ }
+
+ const rlim64_t limit = MAXOFFSET_T;
+
+ if (woff >= limit) {
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EFBIG));
+ }
+
+ if (n > limit - woff)
+ n = limit - woff;
+
+ uint64_t end_size = MAX(zp->z_size, woff + n);
+ zilog_t *zilog = zfsvfs->z_log;
+
+ const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
+ const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
+ const uint64_t projid = zp->z_projid;
+
+ /*
+ * Write the file in reasonable size chunks. Each chunk is written
+ * in a separate transaction; this keeps the intent log records small
+ * and allows us to do more fine-grained space accounting.
+ */
+ while (n > 0) {
+ woff = uio->uio_loffset;
+
+ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
+ zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
+ (projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+ projid))) {
+ error = SET_ERROR(EDQUOT);
+ break;
+ }
+
+ arc_buf_t *abuf = NULL;
+ if (n >= max_blksz && woff >= zp->z_size &&
+ P2PHASE(woff, max_blksz) == 0 &&
+ zp->z_blksz == max_blksz) {
+ /*
+ * This write covers a full block. "Borrow" a buffer
+ * from the dmu so that we can fill it before we enter
+ * a transaction. This avoids the possibility of
+ * holding up the transaction if the data copy hangs
+ * up on a pagefault (e.g., from an NFS server mapping).
+ */
+ size_t cbytes;
+
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ max_blksz);
+ ASSERT(abuf != NULL);
+ ASSERT(arc_buf_size(abuf) == max_blksz);
+ if ((error = uiocopy(abuf->b_data, max_blksz,
+ UIO_WRITE, uio, &cbytes))) {
+ dmu_return_arcbuf(abuf);
+ break;
+ }
+ ASSERT3S(cbytes, ==, max_blksz);
+ }
+
+ /*
+ * Start a transaction.
+ */
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+ DB_DNODE_ENTER(db);
+ dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
+ MIN(n, max_blksz));
+ DB_DNODE_EXIT(db);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ if (abuf != NULL)
+ dmu_return_arcbuf(abuf);
+ break;
+ }
+
+ /*
+ * If rangelock_enter() over-locked we grow the blocksize
+ * and then reduce the lock range. This will only happen
+ * on the first iteration since rangelock_reduce() will
+ * shrink down lr_length to the appropriate size.
+ */
+ if (lr->lr_length == UINT64_MAX) {
+ uint64_t new_blksz;
+
+ if (zp->z_blksz > max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end_size,
+ 1 << highbit64(zp->z_blksz));
+ } else {
+ new_blksz = MIN(end_size, max_blksz);
+ }
+ zfs_grow_blocksize(zp, new_blksz, tx);
+ zfs_rangelock_reduce(lr, woff, n);
+ }
+
+ /*
+ * XXX - should we really limit each write to z_max_blksz?
+ * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+ */
+ const ssize_t nbytes =
+ MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+
+ ssize_t tx_bytes;
+ if (abuf == NULL) {
+ tx_bytes = uio->uio_resid;
+ uio_fault_disable(uio, B_TRUE);
+ error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes, tx);
+ uio_fault_disable(uio, B_FALSE);
+#ifdef __linux__
+ if (error == EFAULT) {
+ dmu_tx_commit(tx);
+ /*
+ * Account for partial writes before
+ * continuing the loop.
+ * Update needs to occur before the next
+ * uio_prefaultpages, or prefaultpages may
+ * error, and we may break the loop early.
+ */
+ if (tx_bytes != uio->uio_resid)
+ n -= tx_bytes - uio->uio_resid;
+ if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+ break;
+ }
+ continue;
+ }
+#endif
+ if (error != 0) {
+ dmu_tx_commit(tx);
+ break;
+ }
+ tx_bytes -= uio->uio_resid;
+ } else {
+ /* Implied by abuf != NULL: */
+ ASSERT3S(n, >=, max_blksz);
+ ASSERT0(P2PHASE(woff, max_blksz));
+ /*
+ * We can simplify nbytes to MIN(n, max_blksz) since
+ * P2PHASE(woff, max_blksz) is 0, and knowing
+ * n >= max_blksz lets us simplify further:
+ */
+ ASSERT3S(nbytes, ==, max_blksz);
+ /*
+ * Thus, we're writing a full block at a block-aligned
+ * offset and extending the file past EOF.
+ *
+ * dmu_assign_arcbuf_by_dbuf() will directly assign the
+ * arc buffer to a dbuf.
+ */
+ error = dmu_assign_arcbuf_by_dbuf(
+ sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
+ if (error != 0) {
+ dmu_return_arcbuf(abuf);
+ dmu_tx_commit(tx);
+ break;
+ }
+ ASSERT3S(nbytes, <=, uio->uio_resid);
+ uioskip(uio, nbytes);
+ tx_bytes = nbytes;
+ }
+ if (tx_bytes && zn_has_cached_data(zp) &&
+ !(ioflag & O_DIRECT)) {
+ update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
+ }
+
+ /*
+ * If we made no progress, we're done. If we made even
+ * partial progress, update the znode and ZIL accordingly.
+ */
+ if (tx_bytes == 0) {
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
+ dmu_tx_commit(tx);
+ ASSERT(error != 0);
+ break;
+ }
+
+ /*
+ * Clear Set-UID/Set-GID bits on successful write if not
+ * privileged and at least one of the execute bits is set.
+ *
+ * It would be nice to do this after all writes have
+ * been done, but that would still expose the ISUID/ISGID
+ * to another app after the partial write is committed.
+ *
+ * Note: we don't call zfs_fuid_map_id() here because
+ * user 0 is not an ephemeral uid.
+ */
+ mutex_enter(&zp->z_acl_lock);
+ if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
+ (S_IXUSR >> 6))) != 0 &&
+ (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(zp, cr,
+ ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
+			uint64_t newmode;
+			zp->z_mode &= ~(S_ISUID | S_ISGID);
+			newmode = zp->z_mode;
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+			    (void *)&newmode, sizeof (uint64_t), tx);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+ /*
+ * Update the file size (zp_size) if it has changed;
+ * account for possible concurrent updates.
+ */
+ while ((end_size = zp->z_size) < uio->uio_loffset) {
+ (void) atomic_cas_64(&zp->z_size, end_size,
+ uio->uio_loffset);
+ ASSERT(error == 0);
+ }
+ /*
+		 * If we are replaying and eof is non-zero then force
+		 * the file size to the specified eof.  Note, there's no
+ * concurrency during replay.
+ */
+ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+ zp->z_size = zfsvfs->z_replay_eof;
+
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
+ NULL, NULL);
+ dmu_tx_commit(tx);
+
+ if (error != 0)
+ break;
+ ASSERT3S(tx_bytes, ==, nbytes);
+ n -= nbytes;
+
+ if (n > 0) {
+ if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+ error = SET_ERROR(EFAULT);
+ break;
+ }
+ }
+ }
+
+ zfs_inode_update(zp);
+ zfs_rangelock_exit(lr);
+
+ /*
+	 * If we're in replay mode, or we made no progress, or the
+	 * uio data is inaccessible, return an error.  Otherwise, it's
+ * at least a partial write, so it's successful.
+ */
+ if (zfsvfs->z_replay || uio->uio_resid == start_resid ||
+ error == EFAULT) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (ioflag & (O_SYNC | O_DSYNC) ||
+ zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, zp->z_id);
+
+ const int64_t nwritten = start_resid - uio->uio_resid;
+ dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
+ task_io_account_write(nwritten);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
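
The loop above never lets a single transaction cross a max_blksz boundary: nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)) trims the first chunk to the end of the current block and then proceeds block by block. A minimal userspace sketch of that arithmetic, with P2PHASE/MIN re-declared locally and invented sample sizes (not part of the patch):

/* Sketch of the zfs_write() chunking arithmetic; sample values are invented. */
#include <stdio.h>
#include <stdint.h>

#define P2PHASE(x, align)	((x) & ((align) - 1))	/* offset within its block */
#define MIN(a, b)		((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t max_blksz = 131072;	/* stands in for zfsvfs->z_max_blksz */
	uint64_t woff = 100000;		/* unaligned starting offset */
	uint64_t n = 400000;		/* bytes left to write */

	while (n > 0) {
		/* Trim each chunk so it never crosses a max_blksz boundary. */
		uint64_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		printf("chunk at %8llu, %7llu bytes\n",
		    (unsigned long long)woff, (unsigned long long)nbytes);
		woff += nbytes;
		n -= nbytes;
	}
	return (0);
}

With these numbers the sketch prints a 31072-byte head chunk, two full 131072-byte chunks, and a 106784-byte tail, mirroring how each iteration above becomes one transaction and one intent-log record.
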
+
+/*ARGSUSED*/
+int
+zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ error = zfs_getacl(zp, vsecp, skipaclchk, cr);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ zilog_t *zilog = zfsvfs->z_log;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ error = zfs_setacl(zp, vsecp, skipaclchk, cr);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+#ifdef ZFS_DEBUG
+static int zil_fault_io = 0;
+#endif
+
+static void zfs_get_done(zgd_t *zgd, int error);
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+ zfsvfs_t *zfsvfs = arg;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *zp;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error = 0;
+
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(zio, !=, NULL);
+ ASSERT3U(size, !=, 0);
+
+ /*
+ * Nothing to do if the file has been removed
+ */
+ if (zfs_zget(zfsvfs, object, &zp) != 0)
+ return (SET_ERROR(ENOENT));
+ if (zp->z_unlinked) {
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ zfs_zrele_async(zp);
+ return (SET_ERROR(ENOENT));
+ }
+
+ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_lwb = lwb;
+ zgd->zgd_private = zp;
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
+ /* test for truncation needs to be done while range locked */
+ if (offset >= zp->z_size) {
+ error = SET_ERROR(ENOENT);
+ } else {
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ }
+ ASSERT(error == 0 || error == ENOENT);
+ } else { /* indirect write */
+ /*
+ * Have to lock the whole block to ensure when it's
+ * written out and its checksum is being calculated
+ * that no one can change the data. We need to re-check
+ * blocksize after we get the lock in case it's changed!
+ */
+ for (;;) {
+ uint64_t blkoff;
+ size = zp->z_blksz;
+ blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+ offset -= blkoff;
+ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
+ if (zp->z_blksz == size)
+ break;
+ offset += blkoff;
+ zfs_rangelock_exit(zgd->zgd_lr);
+ }
+ /* test for truncation needs to be done while range locked */
+ if (lr->lr_offset >= zp->z_size)
+ error = SET_ERROR(ENOENT);
+#ifdef ZFS_DEBUG
+ if (zil_fault_io) {
+ error = SET_ERROR(EIO);
+ zil_fault_io = 0;
+ }
+#endif
+ if (error == 0)
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+
+ if (error == 0) {
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zfs_get_done, zgd);
+ ASSERT(error || lr->lr_length <= size);
+
+ /*
+ * On success, we need to wait for the write I/O
+ * initiated by dmu_sync() to complete before we can
+ * release this dbuf. We will finish everything up
+ * in the zfs_get_done() callback.
+ */
+ if (error == 0)
+ return (0);
+
+ if (error == EALREADY) {
+ lr->lr_common.lrc_txtype = TX_WRITE2;
+ /*
+ * TX_WRITE2 relies on the data previously
+ * written by the TX_WRITE that caused
+ * EALREADY. We zero out the BP because
+ * it is the old, currently-on-disk BP.
+ */
+ zgd->zgd_bp = NULL;
+ BP_ZERO(bp);
+ error = 0;
+ }
+ }
+ }
+
+ zfs_get_done(zgd, error);
+
+ return (error);
+}
+
+
+/* ARGSUSED */
+static void
+zfs_get_done(zgd_t *zgd, int error)
+{
+ znode_t *zp = zgd->zgd_private;
+
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ zfs_rangelock_exit(zgd->zgd_lr);
+
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ zfs_zrele_async(zp);
+
+ kmem_free(zgd, sizeof (zgd_t));
+}
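
In the indirect path above, zfs_get_data() locks the whole file block containing the requested range and re-checks z_blksz once the lock is held, retrying if the block size changed in the meantime. A standalone sketch of the offset-to-block mapping, with ISP2/P2PHASE re-declared locally and an invented offset and block size (not part of the patch):

/* Sketch of the block-aligned range selection in zfs_get_data(). */
#include <stdio.h>
#include <stdint.h>

#define ISP2(x)			(((x) & ((x) - 1)) == 0)
#define P2PHASE(x, align)	((x) & ((align) - 1))

int
main(void)
{
	uint64_t blksz = 131072;	/* stands in for zp->z_blksz */
	uint64_t offset = 200000;	/* byte offset from the log record */

	/* Back up to the start of the block containing 'offset'. */
	uint64_t blkoff = ISP2(blksz) ? P2PHASE(offset, blksz) : offset;
	uint64_t lock_off = offset - blkoff;

	printf("lock [%llu, %llu) covering offset %llu\n",
	    (unsigned long long)lock_off,
	    (unsigned long long)(lock_off + blksz),
	    (unsigned long long)offset);
	return (0);
}

Holding the read lock over the whole block lets dmu_sync() write and checksum it without the data changing underneath.
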
+
+EXPORT_SYMBOL(zfs_access);
+EXPORT_SYMBOL(zfs_fsync);
+EXPORT_SYMBOL(zfs_holey);
+EXPORT_SYMBOL(zfs_read);
+EXPORT_SYMBOL(zfs_write);
+EXPORT_SYMBOL(zfs_getsecattr);
+EXPORT_SYMBOL(zfs_setsecattr);
+
+ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW,
+ "Bytes to read per chunk");
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 933aedebd084..dfd92b893b9f 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -1301,7 +1301,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
spa_min_claim_txg(spa));
ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
- ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
+ ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
@@ -1733,16 +1733,16 @@ zio_write_compress(zio_t *zio)
return (zio);
} else {
/*
- * Round up compressed size up to the ashift
- * of the smallest-ashift device, and zero the tail.
- * This ensures that the compressed size of the BP
- * (and thus compressratio property) are correct,
+ * Round compressed size up to the minimum allocation
+ * size of the smallest-ashift device, and zero the
+ * tail. This ensures that the compressed size of the
+ * BP (and thus compressratio property) are correct,
* in that we charge for the padding used to fill out
* the last sector.
*/
- ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
- size_t rounded = (size_t)P2ROUNDUP(psize,
- 1ULL << spa->spa_min_ashift);
+			ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSIZE);
+ size_t rounded = (size_t)roundup(psize,
+ spa->spa_min_alloc);
if (rounded >= lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
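
A worked example of the rounding decision in the hunk above, using invented sizes: with 5000 bytes of compressed data, a smallest allocation size of 4096 rounds up to 8192, so an 8 KiB logical block would be written uncompressed, while a smallest allocation size of 512 rounds up to 5120 and the block stays compressed with its tail zeroed. A small sketch of the same arithmetic (ROUNDUP and the sample values are local stand-ins, not part of the patch):

/* Sketch of the roundup() decision made in zio_write_compress(). */
#include <stdio.h>
#include <stddef.h>

#define	ROUNDUP(x, a)	((((x) + (a) - 1) / (a)) * (a))

int
main(void)
{
	size_t lsize = 8192;			/* logical block size */
	size_t psize = 5000;			/* compressed payload */
	size_t min_alloc[] = { 512, 4096 };	/* sample smallest allocation sizes */

	for (int i = 0; i < 2; i++) {
		size_t rounded = ROUNDUP(psize, min_alloc[i]);

		printf("min_alloc=%4zu rounded=%4zu -> %s\n",
		    min_alloc[i], rounded,
		    rounded >= lsize ? "store uncompressed" : "keep compressed");
	}
	return (0);
}
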
@@ -2275,9 +2275,7 @@ zio_nowait(zio_t *zio)
* will ensure they complete prior to unloading the pool.
*/
spa_t *spa = zio->io_spa;
- kpreempt_disable();
- pio = spa->spa_async_zio_root[CPU_SEQID];
- kpreempt_enable();
+ pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE];
zio_add_child(pio, zio);
}
@@ -2816,8 +2814,8 @@ zio_write_gang_block(zio_t *pio)
ASSERT(has_data);
flags |= METASLAB_ASYNC_ALLOC;
- VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator],
- pio));
+ VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator].
+ mca_alloc_slots, pio));
/*
* The logical zio has already placed a reservation for
@@ -3618,17 +3616,16 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
* of, so we just hash the objset ID to pick the allocator to get
* some parallelism.
*/
- error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
- txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL,
- cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
- spa->spa_alloc_count);
+ int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
+ int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
+ spa->spa_alloc_count;
+ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp,
+ 1, txg, NULL, flags, &io_alloc_list, NULL, allocator);
if (error == 0) {
*slog = TRUE;
} else {
- error = metaslab_alloc(spa, spa_normal_class(spa), size,
- new_bp, 1, txg, NULL, METASLAB_FASTWRITE,
- &io_alloc_list, NULL, cityhash4(0, 0, 0,
- os->os_dsl_dataset->ds_object) % spa->spa_alloc_count);
+ error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp,
+ 1, txg, NULL, flags, &io_alloc_list, NULL, allocator);
if (error == 0)
*slog = FALSE;
}
@@ -3787,19 +3784,37 @@ zio_vdev_io_start(zio_t *zio)
* However, indirect vdevs point off to other vdevs which may have
* DTL's, so we never bypass them. The child i/os on concrete vdevs
* will be properly bypassed instead.
+ *
+ * Leaf DTL_PARTIAL can be empty when a legitimate write comes from
+ * a dRAID spare vdev. For example, when a dRAID spare is first
+	 * used, its spare blocks need to be written to, but the leaf vdevs
+	 * of such blocks can have empty DTL_PARTIAL.
+ *
+ * There seemed no clean way to allow such writes while bypassing
+ * spurious ones. At this point, just avoid all bypassing for dRAID
+ * for correctness.
*/
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
zio->io_txg != 0 && /* not a delegated i/o */
vd->vdev_ops != &vdev_indirect_ops &&
+ vd->vdev_top->vdev_ops != &vdev_draid_ops &&
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
zio_vdev_io_bypass(zio);
return (zio);
}
- if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ ||
- zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) {
+ /*
+ * Select the next best leaf I/O to process. Distributed spares are
+ * excluded since they dispatch the I/O directly to a leaf vdev after
+ * applying the dRAID mapping.
+ */
+ if (vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_ops != &vdev_draid_spare_ops &&
+ (zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_TRIM)) {
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
return (zio);
@@ -3836,8 +3851,8 @@ zio_vdev_io_done(zio_t *zio)
if (zio->io_delay)
zio->io_delay = gethrtime() - zio->io_delay;
- if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
-
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_ops != &vdev_draid_spare_ops) {
vdev_queue_io_done(zio);
if (zio->io_type == ZIO_TYPE_WRITE)
@@ -4239,7 +4254,7 @@ zio_checksum_verify(zio_t *zio)
if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
return (zio);
- ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
+ ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
}
if ((error = zio_checksum_error(zio, &info)) != 0) {
@@ -4483,9 +4498,8 @@ zio_done(zio_t *zio)
metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
zio->io_allocator);
- VERIFY(zfs_refcount_not_held(
- &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator],
- zio));
+ VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class->
+ mc_allocator[zio->io_allocator].mca_alloc_slots, zio));
}
diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c
index fb8ce0916eb5..e56ea88682ff 100644
--- a/sys/contrib/openzfs/module/zfs/zio_inject.c
+++ b/sys/contrib/openzfs/module/zfs/zio_inject.c
@@ -265,6 +265,12 @@ zio_handle_fault_injection(zio_t *zio, int error)
if (zio->io_type != ZIO_TYPE_READ)
return (0);
+ /*
+ * A rebuild I/O has no checksum to verify.
+ */
+ if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
+ return (0);
+
rw_enter(&inject_lock, RW_READER);
for (handler = list_head(&inject_handlers); handler != NULL;
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
index 2b20b02e4942..7c6dae8650c7 100644
--- a/sys/contrib/openzfs/module/zfs/zvol.c
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -772,7 +772,7 @@ zvol_setup_zv(zvol_state_t *zv)
if (error)
return (SET_ERROR(error));
- error = dnode_hold(os, ZVOL_OBJ, FTAG, &zv->zv_dn);
+ error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn);
if (error)
return (SET_ERROR(error));
@@ -807,7 +807,7 @@ zvol_shutdown_zv(zvol_state_t *zv)
zv->zv_zilog = NULL;
- dnode_rele(zv->zv_dn, FTAG);
+ dnode_rele(zv->zv_dn, zv);
zv->zv_dn = NULL;
/*
@@ -1376,7 +1376,9 @@ typedef struct zvol_volmode_cb_arg {
static void
zvol_set_volmode_impl(char *name, uint64_t volmode)
{
- fstrans_cookie_t cookie = spl_fstrans_mark();
+ fstrans_cookie_t cookie;
+ uint64_t old_volmode;
+ zvol_state_t *zv;
if (strchr(name, '@') != NULL)
return;
@@ -1386,9 +1388,18 @@ zvol_set_volmode_impl(char *name, uint64_t volmode)
* this is necessary because our backing gendisk (zvol_state->zv_disk)
* could be different when we set, for instance, volmode from "geom"
* to "dev" (or vice versa).
- * A possible optimization is to modify our consumers so we don't get
- * called when "volmode" does not change.
*/
+ zv = zvol_find_by_name(name, RW_NONE);
+ if (zv == NULL && volmode == ZFS_VOLMODE_NONE)
+ return;
+ if (zv != NULL) {
+ old_volmode = zv->zv_volmode;
+ mutex_exit(&zv->zv_state_lock);
+ if (old_volmode == volmode)
+ return;
+ zvol_wait_close(zv);
+ }
+ cookie = spl_fstrans_mark();
switch (volmode) {
case ZFS_VOLMODE_NONE:
(void) zvol_remove_minor_impl(name);
@@ -1406,7 +1417,6 @@ zvol_set_volmode_impl(char *name, uint64_t volmode)
(void) ops->zv_create_minor(name);
break;
}
-
spl_fstrans_unmark(cookie);
}
diff --git a/sys/contrib/openzfs/module/zstd/zfs_zstd.c b/sys/contrib/openzfs/module/zstd/zfs_zstd.c
index 3d1805f49cca..69ebf252d1ba 100644
--- a/sys/contrib/openzfs/module/zstd/zfs_zstd.c
+++ b/sys/contrib/openzfs/module/zstd/zfs_zstd.c
@@ -202,6 +202,34 @@ static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;
+
+static void
+zstd_mempool_reap(struct zstd_pool *zstd_mempool)
+{
+ struct zstd_pool *pool;
+
+ if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
+ return;
+ }
+
+ /* free obsolete slots */
+ for (int i = 0; i < ZSTD_POOL_MAX; i++) {
+ pool = &zstd_mempool[i];
+ if (pool->mem && mutex_tryenter(&pool->barrier)) {
+ /* Free memory if unused object older than 2 minutes */
+ if (pool->mem && gethrestime_sec() > pool->timeout) {
+ vmem_free(pool->mem, pool->size);
+ ZSTDSTAT_SUB(zstd_stat_buffers, 1);
+ ZSTDSTAT_SUB(zstd_stat_size, pool->size);
+ pool->mem = NULL;
+ pool->size = 0;
+ pool->timeout = 0;
+ }
+ mutex_exit(&pool->barrier);
+ }
+ }
+}
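
The function above frees only pool slots whose absolute deadline has passed; the deadline is pushed two minutes into the future every time a buffer is handed out or reused by zstd_mempool_alloc(). A minimal sketch of that deadline pattern, with an invented slot struct and time(NULL) standing in for gethrestime_sec() (not part of the patch):

/* Sketch of deadline-based reaping; names and values are illustrative only. */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define	POOL_TIMEOUT	120	/* seconds, mirrors ZSTD_POOL_TIMEOUT */

struct slot {
	void	*mem;
	time_t	 deadline;	/* absolute expiry, 0 when empty */
};

static void
touch(struct slot *s)
{
	/* On allocation or reuse: push the expiry two minutes out. */
	s->deadline = time(NULL) + POOL_TIMEOUT;
}

static void
reap(struct slot *s)
{
	/* Free only slots that sat unused past their deadline. */
	if (s->mem != NULL && time(NULL) > s->deadline) {
		free(s->mem);
		s->mem = NULL;
		s->deadline = 0;
	}
}

int
main(void)
{
	struct slot s = { .mem = malloc(64), .deadline = 0 };

	touch(&s);
	reap(&s);	/* no-op: the deadline is still in the future */
	printf("slot is %s\n", s.mem != NULL ? "kept" : "freed");
	free(s.mem);
	return (0);
}
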
+
/*
* Try to get a cached allocated buffer from memory pool or allocate a new one
  * if necessary. If an object is older than 2 minutes and does not fit the
@@ -215,6 +243,7 @@ static struct zstd_pool *zstd_mempool_dctx;
*
  * The scheduled release will be updated every time an object is reused.
*/
+
static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
@@ -242,31 +271,16 @@ zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
 			 * Check if the object fits the size; if so, take it and
 			 * update the timestamp.
*/
- if (size && !mem && pool->mem && size <= pool->size) {
+ if (pool->mem && size <= pool->size) {
pool->timeout = gethrestime_sec() +
ZSTD_POOL_TIMEOUT;
mem = pool->mem;
- continue;
+ return (mem);
}
-
- /* Free memory if unused object older than 2 minutes */
- if (pool->mem && gethrestime_sec() > pool->timeout) {
- vmem_free(pool->mem, pool->size);
- ZSTDSTAT_SUB(zstd_stat_buffers, 1);
- ZSTDSTAT_SUB(zstd_stat_size, pool->size);
- pool->mem = NULL;
- pool->size = 0;
- pool->timeout = 0;
- }
-
mutex_exit(&pool->barrier);
}
}
- if (!size || mem) {
- return (mem);
- }
-
/*
* If no preallocated slot was found, try to fill in a new one.
*
@@ -711,8 +725,8 @@ zfs_zstd_cache_reap_now(void)
 	 * zstd_mempool_reap() frees old unused objects
 	 * from both memory pools
*/
- zstd_mempool_alloc(zstd_mempool_cctx, 0);
- zstd_mempool_alloc(zstd_mempool_dctx, 0);
+ zstd_mempool_reap(zstd_mempool_cctx);
+ zstd_mempool_reap(zstd_mempool_dctx);
}
extern int __init