aboutsummaryrefslogtreecommitdiff
path: root/sys/contrib/openzfs/module
diff options
context:
space:
mode:
Diffstat (limited to 'sys/contrib/openzfs/module')
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c2
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c40
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c3
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c8
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c6
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c3
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_deleg.c1
-rw-r--r--sys/contrib/openzfs/module/zcommon/zpool_prop.c6
-rw-r--r--sys/contrib/openzfs/module/zfs/ddt_log.c18
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu.c6
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_misc.c6
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev.c117
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_draid.c28
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_file.c3
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz.c345
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_crrd.c7
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_ioctl.c19
17 files changed, 565 insertions, 53 deletions
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
index b15a3e6e38c0..cb5787269db2 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
@@ -1175,7 +1175,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
int count = 0;
zfs_acl_phys_t acl_phys;
- if (zp->z_zfsvfs->z_replay == B_FALSE) {
+ if (ZTOV(zp) != NULL && zp->z_zfsvfs->z_replay == B_FALSE) {
ASSERT_VOP_IN_SEQC(ZTOV(zp));
}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
index a222c5de4a2a..4de48e013ec4 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
@@ -674,7 +674,6 @@ zfsctl_root_readdir(struct vop_readdir_args *ap)
zfs_uio_t uio;
int *eofp = ap->a_eofflag;
off_t dots_offset;
- ssize_t orig_resid;
int error;
zfs_uio_init(&uio, ap->a_uio);
@@ -694,11 +693,13 @@ zfsctl_root_readdir(struct vop_readdir_args *ap)
return (0);
}
- orig_resid = zfs_uio_resid(&uio);
error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, &uio,
&dots_offset);
- if (error != 0)
- goto err;
+ if (error != 0) {
+ if (error == ENAMETOOLONG) /* ran out of destination space */
+ error = 0;
+ return (error);
+ }
if (zfs_uio_offset(&uio) != dots_offset)
return (SET_ERROR(EINVAL));
@@ -711,11 +712,8 @@ zfsctl_root_readdir(struct vop_readdir_args *ap)
entry.d_reclen = sizeof (entry);
error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio));
if (error != 0) {
-err:
- if (error == ENAMETOOLONG) {
- error = orig_resid == zfs_uio_resid(&uio) ?
- EINVAL : 0;
- }
+ if (error == ENAMETOOLONG)
+ error = 0;
return (SET_ERROR(error));
}
if (eofp != NULL)
@@ -1060,21 +1058,17 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap)
zfs_uio_t uio;
int *eofp = ap->a_eofflag;
off_t dots_offset;
- ssize_t orig_resid;
int error;
zfs_uio_init(&uio, ap->a_uio);
- orig_resid = zfs_uio_resid(&uio);
ASSERT3S(vp->v_type, ==, VDIR);
error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap,
&uio, &dots_offset);
if (error != 0) {
- if (error == ENAMETOOLONG) { /* ran out of destination space */
- error = orig_resid == zfs_uio_resid(&uio) ?
- EINVAL : 0;
- }
+ if (error == ENAMETOOLONG) /* ran out of destination space */
+ error = 0;
return (error);
}
@@ -1092,13 +1086,9 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap)
dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
if (error != 0) {
if (error == ENOENT) {
- if (orig_resid == zfs_uio_resid(&uio)) {
- error = EINVAL;
- } else {
- error = 0;
- if (eofp != NULL)
- *eofp = 1;
- }
+ if (eofp != NULL)
+ *eofp = 1;
+ error = 0;
}
zfs_exit(zfsvfs, FTAG);
return (error);
@@ -1111,10 +1101,8 @@ zfsctl_snapdir_readdir(struct vop_readdir_args *ap)
entry.d_reclen = sizeof (entry);
error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio));
if (error != 0) {
- if (error == ENAMETOOLONG) {
- error = orig_resid == zfs_uio_resid(&uio) ?
- EINVAL : 0;
- }
+ if (error == ENAMETOOLONG)
+ error = 0;
zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(error));
}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
index 21e5f7938f9f..ca13569a1235 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
@@ -164,8 +164,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
int
zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
- ssize_t *resid)
+ uint8_t ashift, ssize_t *resid)
{
+ (void) ashift;
return (zfs_file_write_impl(fp, buf, count, &off, resid));
}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
index 120d97510c9e..411225786089 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -389,7 +389,9 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
error = vn_lock(vp, LK_EXCLUSIVE);
if (error)
return (error);
+ vn_seqc_write_begin(vp);
error = zfs_ioctl_setxattr(vp, fsx, cred);
+ vn_seqc_write_end(vp);
VOP_UNLOCK(vp);
return (error);
}
@@ -1696,7 +1698,6 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
objset_t *os;
caddr_t outbuf;
size_t bufsize;
- ssize_t orig_resid;
zap_cursor_t zc;
zap_attribute_t *zap;
uint_t bytes_wanted;
@@ -1745,7 +1746,6 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
error = 0;
os = zfsvfs->z_os;
offset = zfs_uio_offset(uio);
- orig_resid = zfs_uio_resid(uio);
prefetch = zp->z_zn_prefetch;
zap = zap_attribute_long_alloc();
@@ -1925,7 +1925,7 @@ update:
kmem_free(outbuf, bufsize);
if (error == ENOENT)
- error = orig_resid == zfs_uio_resid(uio) ? EINVAL : 0;
+ error = 0;
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
@@ -2206,6 +2206,7 @@ zfs_setattr_dir(znode_t *dzp)
if (err)
break;
+ vn_seqc_write_begin(ZTOV(zp));
mutex_enter(&dzp->z_lock);
if (zp->z_uid != dzp->z_uid) {
@@ -2255,6 +2256,7 @@ sa_add_projid_err:
dmu_tx_abort(tx);
}
tx = NULL;
+ vn_seqc_write_end(ZTOV(zp));
if (err != 0 && err != ENOENT)
break;
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
index 7cd0a153577c..649022ab5bcb 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
@@ -817,6 +817,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
(*zpp)->z_dnodesize = dnodesize;
(*zpp)->z_projid = projid;
+ vnode_t *vp = ZTOV(*zpp);
+ if (!(flag & IS_ROOT_NODE))
+ vn_seqc_write_begin(vp);
+
if (vap->va_mask & AT_XVATTR)
zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
@@ -825,7 +829,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
}
if (!(flag & IS_ROOT_NODE)) {
- vnode_t *vp = ZTOV(*zpp);
+ vn_seqc_write_end(vp);
vp->v_vflag |= VV_FORCEINSMQ;
int err = insmntque(vp, zfsvfs->z_vfs);
vp->v_vflag &= ~VV_FORCEINSMQ;
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
index c729947369c2..3fdcdbac6f68 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
@@ -115,8 +115,9 @@ zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
*/
int
zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
- ssize_t *resid)
+ uint8_t ashift, ssize_t *resid)
{
+ (void) ashift;
ssize_t rc;
rc = kernel_write(fp, buf, count, &off);
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_deleg.c b/sys/contrib/openzfs/module/zcommon/zfs_deleg.c
index 49bb534ca26c..87596558c9a1 100644
--- a/sys/contrib/openzfs/module/zcommon/zfs_deleg.c
+++ b/sys/contrib/openzfs/module/zcommon/zfs_deleg.c
@@ -59,6 +59,7 @@ const zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
{ZFS_DELEG_PERM_SNAPSHOT},
{ZFS_DELEG_PERM_SHARE},
{ZFS_DELEG_PERM_SEND},
+ {ZFS_DELEG_PERM_SEND_RAW},
{ZFS_DELEG_PERM_USERPROP},
{ZFS_DELEG_PERM_USERQUOTA},
{ZFS_DELEG_PERM_GROUPQUOTA},
diff --git a/sys/contrib/openzfs/module/zcommon/zpool_prop.c b/sys/contrib/openzfs/module/zcommon/zpool_prop.c
index 04ae9f986d8f..07819ba2be8b 100644
--- a/sys/contrib/openzfs/module/zcommon/zpool_prop.c
+++ b/sys/contrib/openzfs/module/zcommon/zpool_prop.c
@@ -467,9 +467,15 @@ vdev_prop_init(void)
zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING",
boolean_table, sfeatures);
+ zprop_register_index(VDEV_PROP_SIT_OUT, "sit_out", 0,
+ PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "SIT_OUT", boolean_table,
+ sfeatures);
zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
boolean_table, sfeatures);
+ zprop_register_index(VDEV_PROP_AUTOSIT, "autosit", 0,
+ PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "AUTOSIT", boolean_table,
+ sfeatures);
/* default index properties */
zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE,
diff --git a/sys/contrib/openzfs/module/zfs/ddt_log.c b/sys/contrib/openzfs/module/zfs/ddt_log.c
index c9217cef4f7d..c7a2426f3a77 100644
--- a/sys/contrib/openzfs/module/zfs/ddt_log.c
+++ b/sys/contrib/openzfs/module/zfs/ddt_log.c
@@ -245,6 +245,13 @@ ddt_log_alloc_entry(ddt_t *ddt)
}
static void
+ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle)
+{
+ kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
+ ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
+}
+
+static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
/* Create the log tree entry from a live or stored entry */
@@ -349,8 +356,7 @@ ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);
avl_remove(&ddl->ddl_tree, ddle);
- kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
- ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
+ ddt_log_free_entry(ddt, ddle);
return (B_TRUE);
}
@@ -367,8 +373,7 @@ ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
avl_remove(&ddl->ddl_tree, ddle);
- kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
- ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
+ ddt_log_free_entry(ddt, ddle);
return (B_TRUE);
}
@@ -529,8 +534,7 @@ ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
while ((ddle =
avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
- kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
- ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
+ ddt_log_free_entry(ddt, ddle);
}
ASSERT(avl_is_empty(&ddl->ddl_tree));
}
@@ -729,7 +733,7 @@ ddt_log_load(ddt_t *ddt)
ddle = fe;
fe = AVL_NEXT(fl, fe);
avl_remove(fl, ddle);
-
+ ddt_log_free_entry(ddt, ddle);
ddle = ae;
ae = AVL_NEXT(al, ae);
}
diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c
index f7f808d5b8f7..a7a5c89bdafb 100644
--- a/sys/contrib/openzfs/module/zfs/dmu.c
+++ b/sys/contrib/openzfs/module/zfs/dmu.c
@@ -759,6 +759,8 @@ dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
*/
uint8_t ibps = ibs - SPA_BLKPTRSHIFT;
limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs;
+ if (limit == 0)
+ end2 = start2;
do {
level2++;
start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps;
@@ -1689,8 +1691,8 @@ dmu_object_cached_size(objset_t *os, uint64_t object,
dmu_object_info_from_dnode(dn, &doi);
- for (uint64_t off = 0; off < doi.doi_max_offset;
- off += dmu_prefetch_max) {
+ for (uint64_t off = 0; off < doi.doi_max_offset &&
+ dmu_prefetch_max > 0; off += dmu_prefetch_max) {
/* dbuf_read doesn't prefetch L1 blocks. */
dmu_prefetch_by_dnode(dn, 1, off,
dmu_prefetch_max, ZIO_PRIORITY_SYNC_READ);
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index dceafbc27556..6f7c060f97f8 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -251,11 +251,11 @@ spa_mode_t spa_mode_global = SPA_MODE_UNINIT;
#ifdef ZFS_DEBUG
/*
- * Everything except dprintf, set_error, spa, and indirect_remap is on
- * by default in debug builds.
+ * Everything except dprintf, set_error, indirect_remap, and raidz_reconstruct
+ * is on by default in debug builds.
*/
int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR |
- ZFS_DEBUG_INDIRECT_REMAP);
+ ZFS_DEBUG_INDIRECT_REMAP | ZFS_DEBUG_RAIDZ_RECONSTRUCT);
#else
int zfs_flags = 0;
#endif
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index ed04ce0c86eb..fc6d445f9785 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -29,7 +29,7 @@
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Datto Inc. All rights reserved.
- * Copyright (c) 2021, Klara Inc.
+ * Copyright (c) 2021, 2025, Klara, Inc.
* Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
*/
@@ -1086,6 +1086,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
}
}
+ if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops))
+ vd->vdev_autosit =
+ vdev_prop_default_numeric(VDEV_PROP_AUTOSIT);
+
/*
* Add ourselves to the parent's list of children.
*/
@@ -1187,6 +1191,9 @@ vdev_free(vdev_t *vd)
spa_spare_remove(vd);
if (vd->vdev_isl2cache)
spa_l2cache_remove(vd);
+ if (vd->vdev_prev_histo)
+ kmem_free(vd->vdev_prev_histo,
+ sizeof (uint64_t) * VDEV_L_HISTO_BUCKETS);
txg_list_destroy(&vd->vdev_ms_list);
txg_list_destroy(&vd->vdev_dtl_list);
@@ -3857,6 +3864,26 @@ vdev_load(vdev_t *vd)
}
}
+ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+ spa_t *spa = vd->vdev_spa;
+ uint64_t autosit;
+
+ error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+ vdev_prop_to_name(VDEV_PROP_AUTOSIT), sizeof (autosit),
+ 1, &autosit);
+ if (error == 0) {
+ vd->vdev_autosit = autosit == 1;
+ } else if (error == ENOENT) {
+ vd->vdev_autosit = vdev_prop_default_numeric(
+ VDEV_PROP_AUTOSIT);
+ } else {
+ vdev_dbgmsg(vd,
+ "vdev_load: zap_lookup(top_zap=%llu) "
+ "failed [error=%d]",
+ (u_longlong_t)vd->vdev_top_zap, error);
+ }
+ }
+
/*
* Load any rebuild state from the top-level vdev zap.
*/
@@ -4616,6 +4643,8 @@ vdev_clear(spa_t *spa, vdev_t *vd)
vd->vdev_stat.vs_checksum_errors = 0;
vd->vdev_stat.vs_dio_verify_errors = 0;
vd->vdev_stat.vs_slow_ios = 0;
+ atomic_store_64(&vd->vdev_outlier_count, 0);
+ vd->vdev_read_sit_out_expire = 0;
for (int c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
@@ -6107,6 +6136,56 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
}
vd->vdev_failfast = intval & 1;
break;
+ case VDEV_PROP_SIT_OUT:
+ /* Only expose this for a draid or raidz leaf */
+ if (!vd->vdev_ops->vdev_op_leaf ||
+ vd->vdev_top == NULL ||
+ (vd->vdev_top->vdev_ops != &vdev_raidz_ops &&
+ vd->vdev_top->vdev_ops != &vdev_draid_ops)) {
+ error = ENOTSUP;
+ break;
+ }
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ if (intval == 1) {
+ vdev_t *ancestor = vd;
+ while (ancestor->vdev_parent != vd->vdev_top)
+ ancestor = ancestor->vdev_parent;
+ vdev_t *pvd = vd->vdev_top;
+ uint_t sitouts = 0;
+ for (int i = 0; i < pvd->vdev_children; i++) {
+ if (pvd->vdev_child[i] == ancestor)
+ continue;
+ if (vdev_sit_out_reads(
+ pvd->vdev_child[i], 0)) {
+ sitouts++;
+ }
+ }
+ if (sitouts >= vdev_get_nparity(pvd)) {
+ error = ZFS_ERR_TOO_MANY_SITOUTS;
+ break;
+ }
+ if (error == 0)
+ vdev_raidz_sit_child(vd,
+ INT64_MAX - gethrestime_sec());
+ } else {
+ vdev_raidz_unsit_child(vd);
+ }
+ break;
+ case VDEV_PROP_AUTOSIT:
+ if (vd->vdev_ops != &vdev_raidz_ops &&
+ vd->vdev_ops != &vdev_draid_ops) {
+ error = ENOTSUP;
+ break;
+ }
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = EINVAL;
+ break;
+ }
+ vd->vdev_autosit = intval == 1;
+ break;
case VDEV_PROP_CHECKSUM_N:
if (nvpair_value_uint64(elem, &intval) != 0) {
error = EINVAL;
@@ -6456,6 +6535,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
ZPROP_SRC_NONE);
}
continue;
+ case VDEV_PROP_SIT_OUT:
+ /* Only expose this for a draid or raidz leaf */
+ if (vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_top != NULL &&
+ (vd->vdev_top->vdev_ops ==
+ &vdev_raidz_ops ||
+ vd->vdev_top->vdev_ops ==
+ &vdev_draid_ops)) {
+ vdev_prop_add_list(outnvl, propname,
+ NULL, vdev_sit_out_reads(vd, 0),
+ ZPROP_SRC_NONE);
+ }
+ continue;
case VDEV_PROP_TRIM_SUPPORT:
/* only valid for leaf vdevs */
if (vd->vdev_ops->vdev_op_leaf) {
@@ -6506,6 +6598,29 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
vdev_prop_add_list(outnvl, propname, strval,
intval, src);
break;
+ case VDEV_PROP_AUTOSIT:
+ /* Only raidz vdevs cannot have this property */
+ if (vd->vdev_ops != &vdev_raidz_ops &&
+ vd->vdev_ops != &vdev_draid_ops) {
+ src = ZPROP_SRC_NONE;
+ intval = ZPROP_BOOLEAN_NA;
+ } else {
+ err = vdev_prop_get_int(vd, prop,
+ &intval);
+ if (err && err != ENOENT)
+ break;
+
+ if (intval ==
+ vdev_prop_default_numeric(prop))
+ src = ZPROP_SRC_DEFAULT;
+ else
+ src = ZPROP_SRC_LOCAL;
+ }
+
+ vdev_prop_add_list(outnvl, propname, NULL,
+ intval, src);
+ break;
+
case VDEV_PROP_CHECKSUM_N:
case VDEV_PROP_CHECKSUM_T:
case VDEV_PROP_IO_N:
diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c
index a05289102af2..8588cfee3f7d 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_draid.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2018 Intel Corporation.
* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2025, Klara, Inc.
*/
#include <sys/zfs_context.h>
@@ -1996,6 +1997,33 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
rc->rc_allow_repair = 1;
}
}
+
+ if (vdev_sit_out_reads(cvd, zio->io_flags)) {
+ rr->rr_outlier_cnt++;
+ ASSERT0(rc->rc_latency_outlier);
+ rc->rc_latency_outlier = 1;
+ }
+ }
+
+ /*
+ * When the row contains a latency outlier and sufficient parity
+ * exists to reconstruct the column data, then skip reading the
+ * known slow child vdev as a performance optimization.
+ */
+ if (rr->rr_outlier_cnt > 0 &&
+ (rr->rr_firstdatacol - rr->rr_missingparity) >=
+ (rr->rr_missingdata + 1)) {
+
+ for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error == 0 && rc->rc_latency_outlier) {
+ rr->rr_missingdata++;
+ rc->rc_error = SET_ERROR(EAGAIN);
+ rc->rc_skipped = 1;
+ break;
+ }
+ }
}
/*
diff --git a/sys/contrib/openzfs/module/zfs/vdev_file.c b/sys/contrib/openzfs/module/zfs/vdev_file.c
index f457669bc809..20b4db65ec06 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_file.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_file.c
@@ -228,7 +228,8 @@ vdev_file_io_strategy(void *arg)
abd_return_buf_copy(zio->io_abd, buf, size);
} else {
buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
- err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
+ err = zfs_file_pwrite(vf->vf_file, buf, size, off,
+ vd->vdev_ashift, &resid);
abd_return_buf(zio->io_abd, buf, size);
}
zio->io_error = err;
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
index b597d6daefde..56b8e3b60b22 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
@@ -24,6 +24,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2016 Gvozden Nešković. All rights reserved.
+ * Copyright (c) 2025, Klara, Inc.
*/
#include <sys/zfs_context.h>
@@ -356,6 +357,32 @@ unsigned long raidz_expand_max_reflow_bytes = 0;
uint_t raidz_expand_pause_point = 0;
/*
+ * This represents the duration for a slow drive read sit out.
+ */
+static unsigned long vdev_read_sit_out_secs = 600;
+
+/*
+ * How often each RAID-Z and dRAID vdev will check for slow disk outliers.
+ * Increasing this interval will reduce the sensitivity of detection (since all
+ * I/Os since the last check are included in the statistics), but will slow the
+ * response to a disk developing a problem.
+ *
+ * Defaults to once per second; setting extremely small values may cause
+ * negative performance effects.
+ */
+static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000;
+
+/*
+ * When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is
+ * used to determine how far out an outlier must be before it counts as an event
+ * worth consdering.
+ *
+ * Smaller values will result in more aggressive sitting out of disks that may
+ * have problems, but may significantly increase the rate of spurious sit-outs.
+ */
+static uint32_t vdev_raidz_outlier_insensitivity = 50;
+
+/*
* Maximum amount of copy io's outstanding at once.
*/
#ifdef _ILP32
@@ -2311,6 +2338,41 @@ vdev_raidz_min_asize(vdev_t *vd)
vd->vdev_children);
}
+/*
+ * return B_TRUE if a read should be skipped due to being too slow.
+ *
+ * In vdev_child_slow_outlier() it looks for outliers based on disk
+ * latency from the most recent child reads. Here we're checking if,
+ * over time, a disk has has been an outlier too many times and is
+ * now in a sit out period.
+ */
+boolean_t
+vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags)
+{
+ if (vdev_read_sit_out_secs == 0)
+ return (B_FALSE);
+
+ /* Avoid skipping a data column read when scrubbing */
+ if (io_flags & ZIO_FLAG_SCRUB)
+ return (B_FALSE);
+
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ boolean_t sitting = B_FALSE;
+ for (int c = 0; c < vd->vdev_children; c++) {
+ sitting |= vdev_sit_out_reads(vd->vdev_child[c],
+ io_flags);
+ }
+ return (sitting);
+ }
+
+ if (vd->vdev_read_sit_out_expire >= gethrestime_sec())
+ return (B_TRUE);
+
+ vd->vdev_read_sit_out_expire = 0;
+
+ return (B_FALSE);
+}
+
void
vdev_raidz_child_done(zio_t *zio)
{
@@ -2475,6 +2537,45 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
rc->rc_skipped = 1;
continue;
}
+
+ if (vdev_sit_out_reads(cvd, zio->io_flags)) {
+ rr->rr_outlier_cnt++;
+ ASSERT0(rc->rc_latency_outlier);
+ rc->rc_latency_outlier = 1;
+ }
+ }
+
+ /*
+ * When the row contains a latency outlier and sufficient parity
+ * exists to reconstruct the column data, then skip reading the
+ * known slow child vdev as a performance optimization.
+ */
+ if (rr->rr_outlier_cnt > 0 &&
+ (rr->rr_firstdatacol - rr->rr_missingparity) >=
+ (rr->rr_missingdata + 1)) {
+
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error == 0 && rc->rc_latency_outlier) {
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
+ else
+ rr->rr_missingparity++;
+ rc->rc_error = SET_ERROR(EAGAIN);
+ rc->rc_skipped = 1;
+ break;
+ }
+ }
+ }
+
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ if (rc->rc_error || rc->rc_size == 0)
+ continue;
+
if (forceparity ||
c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
@@ -2498,6 +2599,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
ASSERT3U(prc->rc_devidx, ==, i);
vdev_t *cvd = vd->vdev_child[i];
+
if (!vdev_readable(cvd)) {
prc->rc_error = SET_ERROR(ENXIO);
prc->rc_tried = 1; /* don't even try */
@@ -2774,6 +2876,239 @@ vdev_raidz_worst_error(raidz_row_t *rr)
return (error);
}
+/*
+ * Find the median value from a set of n values
+ */
+static uint64_t
+latency_median_value(const uint64_t *data, size_t n)
+{
+ uint64_t m;
+
+ if (n % 2 == 0)
+ m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1;
+ else
+ m = data[((n + 1) >> 1) - 1];
+
+ return (m);
+}
+
+/*
+ * Calculate the outlier fence from a set of n latency values
+ *
+ * fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1)
+ */
+static uint64_t
+latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr)
+{
+ uint64_t q1 = latency_median_value(&data[0], n >> 1);
+ uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1);
+
+ /*
+ * To avoid detecting false positive outliers when N is small and
+ * and the latencies values are very close, make sure the IQR
+ * is at least 25% larger than Q1.
+ */
+ *iqr = MAX(q3 - q1, q1 / 4);
+
+ return (q3 + (*iqr * vdev_raidz_outlier_insensitivity));
+}
+#define LAT_CHILDREN_MIN 5
+#define LAT_OUTLIER_LIMIT 20
+
+static int
+latency_compare(const void *arg1, const void *arg2)
+{
+ const uint64_t *l1 = (uint64_t *)arg1;
+ const uint64_t *l2 = (uint64_t *)arg2;
+
+ return (TREE_CMP(*l1, *l2));
+}
+
+void
+vdev_raidz_sit_child(vdev_t *svd, uint64_t secs)
+{
+ for (int c = 0; c < svd->vdev_children; c++)
+ vdev_raidz_sit_child(svd->vdev_child[c], secs);
+
+ if (!svd->vdev_ops->vdev_op_leaf)
+ return;
+
+ /* Begin a sit out period for this slow drive */
+ svd->vdev_read_sit_out_expire = gethrestime_sec() +
+ secs;
+
+ /* Count each slow io period */
+ mutex_enter(&svd->vdev_stat_lock);
+ svd->vdev_stat.vs_slow_ios++;
+ mutex_exit(&svd->vdev_stat_lock);
+}
+
+void
+vdev_raidz_unsit_child(vdev_t *vd)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_raidz_unsit_child(vd->vdev_child[c]);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ vd->vdev_read_sit_out_expire = 0;
+}
+
+/*
+ * Check for any latency outlier from latest set of child reads.
+ *
+ * Uses a Tukey's fence, with K = 50, for detecting extreme outliers. This
+ * rule defines extreme outliers as data points outside the fence of the
+ * third quartile plus fifty times the Interquartile Range (IQR). This range
+ * is the distance between the first and third quartile.
+ *
+ * Fifty is an extremely large value for Tukey's fence, but the outliers we're
+ * attempting to detect here are orders of magnitude times larger than the
+ * median. This large value should capture any truly fault disk quickly,
+ * without causing spurious sit-outs.
+ *
+ * To further avoid spurious sit-outs, vdevs must be detected multiple times
+ * as an outlier before they are sat, and outlier counts will gradually decay.
+ * Every nchildren times we have detected an outlier, we subtract 2 from the
+ * outlier count of all children. If detected outliers are close to uniformly
+ * distributed, this will result in the outlier count remaining close to 0
+ * (in expectation; over long enough time-scales, spurious sit-outs are still
+ * possible).
+ */
+static void
+vdev_child_slow_outlier(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 ||
+ vd->vdev_children < LAT_CHILDREN_MIN)
+ return;
+
+ hrtime_t now = getlrtime();
+ uint64_t last = atomic_load_64(&vd->vdev_last_latency_check);
+
+ if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms))
+ return;
+
+ /* Allow a single winner when there are racing callers. */
+ if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last)
+ return;
+
+ int children = vd->vdev_children;
+ uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP);
+
+ for (int c = 0; c < children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ if (cvd->vdev_prev_histo == NULL) {
+ mutex_enter(&cvd->vdev_stat_lock);
+ size_t size =
+ sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
+ cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP);
+ memcpy(cvd->vdev_prev_histo,
+ cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ],
+ size);
+ mutex_exit(&cvd->vdev_stat_lock);
+ }
+ }
+ uint64_t max = 0;
+ vdev_t *svd = NULL;
+ uint_t sitouts = 0;
+ boolean_t skip = B_FALSE, svd_sitting = B_FALSE;
+ for (int c = 0; c < children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ boolean_t sitting = vdev_sit_out_reads(cvd, 0) ||
+ cvd->vdev_state != VDEV_STATE_HEALTHY;
+
+ /* We can't sit out more disks than we have parity */
+ if (sitting && ++sitouts >= vdev_get_nparity(vd))
+ skip = B_TRUE;
+
+ mutex_enter(&cvd->vdev_stat_lock);
+
+ uint64_t *prev_histo = cvd->vdev_prev_histo;
+ uint64_t *histo =
+ cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ];
+ if (skip) {
+ size_t size =
+ sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
+ memcpy(prev_histo, histo, size);
+ mutex_exit(&cvd->vdev_stat_lock);
+ continue;
+ }
+ uint64_t count = 0;
+ lat_data[c] = 0;
+ for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) {
+ uint64_t this_count = histo[i] - prev_histo[i];
+ lat_data[c] += (1ULL << i) * this_count;
+ count += this_count;
+ }
+ size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
+ memcpy(prev_histo, histo, size);
+ mutex_exit(&cvd->vdev_stat_lock);
+ lat_data[c] /= MAX(1, count);
+
+ /* Wait until all disks have been read from */
+ if (lat_data[c] == 0 && !sitting) {
+ skip = B_TRUE;
+ continue;
+ }
+
+ /* Keep track of the vdev with largest value */
+ if (lat_data[c] > max) {
+ max = lat_data[c];
+ svd = cvd;
+ svd_sitting = sitting;
+ }
+ }
+
+ if (skip) {
+ kmem_free(lat_data, sizeof (uint64_t) * children);
+ return;
+ }
+
+ qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare);
+
+ uint64_t iqr;
+ uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr);
+
+ ASSERT3U(lat_data[children - 1], ==, max);
+ if (max > fence && !svd_sitting) {
+ ASSERT3U(iqr, >, 0);
+ uint64_t incr = MAX(1, MIN((max - fence) / iqr,
+ LAT_OUTLIER_LIMIT / 4));
+ vd->vdev_outlier_count += incr;
+ if (vd->vdev_outlier_count >= children) {
+ for (int c = 0; c < children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ cvd->vdev_outlier_count -= 2;
+ cvd->vdev_outlier_count = MAX(0,
+ cvd->vdev_outlier_count);
+ }
+ vd->vdev_outlier_count = 0;
+ }
+ /*
+ * Keep track of how many times this child has had
+ * an outlier read. A disk that persitently has a
+ * higher than peers outlier count will be considered
+ * a slow disk.
+ */
+ svd->vdev_outlier_count += incr;
+ if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) {
+ ASSERT0(svd->vdev_read_sit_out_expire);
+ vdev_raidz_sit_child(svd, vdev_read_sit_out_secs);
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT,
+ zio->io_spa, svd, NULL, NULL, 0);
+ vdev_dbgmsg(svd, "begin read sit out for %d secs",
+ (int)vdev_read_sit_out_secs);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vd->vdev_child[c]->vdev_outlier_count = 0;
+ }
+ }
+
+ kmem_free(lat_data, sizeof (uint64_t) * children);
+}
+
static void
vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
{
@@ -3515,6 +3850,9 @@ vdev_raidz_io_done(zio_t *zio)
raidz_row_t *rr = rm->rm_row[i];
vdev_raidz_io_done_verified(zio, rr);
}
+ /* Periodically check for a read outlier */
+ if (zio->io_type == ZIO_TYPE_READ)
+ vdev_child_slow_outlier(zio);
zio_checksum_verified(zio);
} else {
/*
@@ -5155,3 +5493,10 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
"For expanded RAIDZ, automatically start a pool scrub when expansion "
"completes");
+ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW,
+ "Raidz/draid slow disk sit out time period in seconds");
+ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64,
+ ZMOD_RW, "Interval to check for slow raidz/draid children");
+ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT,
+ ZMOD_RW, "How insensitive the slow raidz/draid child check should be");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zfs_crrd.c b/sys/contrib/openzfs/module/zfs/zfs_crrd.c
index f9267ed41d71..30d4c7c36897 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_crrd.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_crrd.c
@@ -162,9 +162,9 @@ dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg)
daydiff = time - rrd_tail(&db->dbr_days);
monthdiff = time - rrd_tail(&db->dbr_months);
- if (monthdiff >= 0 && monthdiff >= SEC2NSEC(30 * 24 * 60 * 60))
+ if (monthdiff >= 0 && monthdiff >= 30 * 24 * 60 * 60)
rrd_add(&db->dbr_months, time, txg);
- else if (daydiff >= 0 && daydiff >= SEC2NSEC(24 * 60 * 60))
+ else if (daydiff >= 0 && daydiff >= 24 * 60 * 60)
rrd_add(&db->dbr_days, time, txg);
else if (minutedif >= 0)
rrd_add(&db->dbr_minutes, time, txg);
@@ -208,7 +208,8 @@ dbrrd_closest(hrtime_t tv, const rrd_data_t *r1, const rrd_data_t *r2)
if (r2 == NULL)
return (r1);
- return (ABS(tv - r1->rrdd_time) < ABS(tv - r2->rrdd_time) ? r1 : r2);
+ return (ABS(tv - (hrtime_t)r1->rrdd_time) <
+ ABS(tv - (hrtime_t)r2->rrdd_time) ? r1 : r2);
}
uint64_t
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
index 76c9d4ccd51f..5ca7c2320c4e 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
@@ -683,6 +683,7 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
dsl_dataset_t *ds;
const char *cp;
int error;
+ boolean_t rawok = (zc->zc_flags & 0x8);
/*
* Generate the current snapshot name from the given objsetid, then
@@ -705,6 +706,10 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
ZFS_DELEG_PERM_SEND, cr);
+ if (error != 0 && rawok == B_TRUE) {
+ error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
+ ZFS_DELEG_PERM_SEND_RAW, cr);
+ }
dsl_dataset_rele(ds, FTAG);
dsl_pool_rele(dp, FTAG);
@@ -714,9 +719,17 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
static int
zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
+ boolean_t rawok = nvlist_exists(innvl, "rawok");
+ int error;
+
(void) innvl;
- return (zfs_secpolicy_write_perms(zc->zc_name,
- ZFS_DELEG_PERM_SEND, cr));
+ error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_SEND, cr);
+ if (error != 0 && rawok == B_TRUE) {
+ error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_SEND_RAW, cr);
+ }
+ return (error);
}
static int
@@ -7619,7 +7632,7 @@ zfs_ioctl_init(void)
zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB,
zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME,
- POOL_CHECK_NONE, B_TRUE, B_TRUE,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub));
zfs_ioctl_register("get_props", ZFS_IOC_POOL_GET_PROPS,