Diffstat (limited to 'sys/contrib/openzfs/module/zfs')
19 files changed, 630 insertions, 151 deletions
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index df41e3b49204..bd6dc8edd8ca 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -486,13 +486,13 @@ static taskq_t *arc_flush_taskq; static uint_t zfs_arc_evict_threads = 0; /* The 7 states: */ -arc_state_t ARC_anon; -arc_state_t ARC_mru; -arc_state_t ARC_mru_ghost; -arc_state_t ARC_mfu; -arc_state_t ARC_mfu_ghost; -arc_state_t ARC_l2c_only; -arc_state_t ARC_uncached; +static arc_state_t ARC_anon; +/* */ arc_state_t ARC_mru; +static arc_state_t ARC_mru_ghost; +/* */ arc_state_t ARC_mfu; +static arc_state_t ARC_mfu_ghost; +static arc_state_t ARC_l2c_only; +static arc_state_t ARC_uncached; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, @@ -832,15 +832,15 @@ typedef struct arc_async_flush { #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ -uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ -uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ -uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ -uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ -int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -int l2arc_feed_again = B_TRUE; /* turbo warmup */ -int l2arc_norw = B_FALSE; /* no reads during writes */ +static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +static uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +static uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +static uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; +static uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +static uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ +static int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +static int l2arc_feed_again = B_TRUE; /* turbo warmup */ +static int l2arc_norw = B_FALSE; /* no reads during writes */ static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index 7403f10d91b7..fccc4c5b5b94 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -2270,14 +2270,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif - /* - * We make this assert for private objects as well, but after we - * check if we're already dirty. They are allowed to re-dirty - * in syncing context. - */ - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* @@ -2289,12 +2281,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); - mutex_enter(&dn->dn_mtx); - dnode_set_dirtyctx(dn, tx, db); - if (tx->tx_txg > dn->dn_dirty_txg) - dn->dn_dirty_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); - if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; @@ -2313,13 +2299,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (dr_next); } - /* - * Only valid if not already dirty. 
- */ - ASSERT(dn->dn_object == 0 || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - ASSERT3U(dn->dn_nlevels, >, db->db_level); /* diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c index d6658375f810..0dc9adc7fd4f 100644 --- a/sys/contrib/openzfs/module/zfs/ddt.c +++ b/sys/contrib/openzfs/module/zfs/ddt.c @@ -1701,9 +1701,11 @@ ddt_load(spa_t *spa) } } - error = ddt_log_load(ddt); - if (error != 0 && error != ENOENT) - return (error); + if (ddt->ddt_flags & DDT_FLAG_LOG) { + error = ddt_log_load(ddt); + if (error != 0 && error != ENOENT) + return (error); + } DDT_KSTAT_SET(ddt, dds_log_active_entries, avl_numnodes(&ddt->ddt_log_active->ddl_tree)); diff --git a/sys/contrib/openzfs/module/zfs/ddt_log.c b/sys/contrib/openzfs/module/zfs/ddt_log.c index 3d30e244c1f7..c7a2426f3a77 100644 --- a/sys/contrib/openzfs/module/zfs/ddt_log.c +++ b/sys/contrib/openzfs/module/zfs/ddt_log.c @@ -176,11 +176,13 @@ ddt_log_update_stats(ddt_t *ddt) * that's reasonable to expect anyway. */ dmu_object_info_t doi; - uint64_t nblocks; - dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi); - nblocks = doi.doi_physical_blocks_512; - dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi); - nblocks += doi.doi_physical_blocks_512; + uint64_t nblocks = 0; + if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, + &doi) == 0) + nblocks += doi.doi_physical_blocks_512; + if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, + &doi) == 0) + nblocks += doi.doi_physical_blocks_512; ddt_object_t *ddo = &ddt->ddt_log_stats; ddo->ddo_count = @@ -243,6 +245,13 @@ ddt_log_alloc_entry(ddt_t *ddt) } static void +ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle) +{ + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); +} + +static void ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) { /* Create the log tree entry from a live or stored entry */ @@ -347,8 +356,7 @@ ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); avl_remove(&ddl->ddl_tree, ddle); - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? - ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); return (B_TRUE); } @@ -365,8 +373,7 @@ ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk) ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); avl_remove(&ddl->ddl_tree, ddle); - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? - ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); return (B_TRUE); } @@ -527,8 +534,7 @@ ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl) IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree)); while ((ddle = avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) { - kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? 
- ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + ddt_log_free_entry(ddt, ddle); } ASSERT(avl_is_empty(&ddl->ddl_tree)); } @@ -727,7 +733,7 @@ ddt_log_load(ddt_t *ddt) ddle = fe; fe = AVL_NEXT(fl, fe); avl_remove(fl, ddle); - + ddt_log_free_entry(ddt, ddle); ddle = ae; ae = AVL_NEXT(al, ae); } diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index f7f808d5b8f7..a7a5c89bdafb 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -759,6 +759,8 @@ dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, */ uint8_t ibps = ibs - SPA_BLKPTRSHIFT; limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + if (limit == 0) + end2 = start2; do { level2++; start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; @@ -1689,8 +1691,8 @@ dmu_object_cached_size(objset_t *os, uint64_t object, dmu_object_info_from_dnode(dn, &doi); - for (uint64_t off = 0; off < doi.doi_max_offset; - off += dmu_prefetch_max) { + for (uint64_t off = 0; off < doi.doi_max_offset && + dmu_prefetch_max > 0; off += dmu_prefetch_max) { /* dbuf_read doesn't prefetch L1 blocks. */ dmu_prefetch_by_dnode(dn, 1, off, dmu_prefetch_max, ZIO_PRIORITY_SYNC_READ); diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index a77f338bdfd3..8e6b569c2100 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -2037,6 +2037,8 @@ userquota_updates_task(void *arg) dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); + ASSERT3U(dn->dn_dirtycnt, >, 0); + dn->dn_dirtycnt--; mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); @@ -2070,6 +2072,10 @@ dnode_rele_task(void *arg) dnode_t *dn; while ((dn = multilist_sublist_head(list)) != NULL) { + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_dirtycnt, >, 0); + dn->dn_dirtycnt--; + mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); dnode_rele(dn, &os->os_synced_dnodes); } diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index 51165d0bf723..3d3a9c713568 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -57,19 +57,19 @@ static unsigned int zfetch_max_sec_reap = 2; /* min bytes to prefetch per stream (default 2MB) */ static unsigned int zfetch_min_distance = 2 * 1024 * 1024; /* max bytes to prefetch per stream (default 8MB) */ -unsigned int zfetch_max_distance = 8 * 1024 * 1024; +static unsigned int zfetch_max_distance = 8 * 1024 * 1024; #else /* min bytes to prefetch per stream (default 4MB) */ static unsigned int zfetch_min_distance = 4 * 1024 * 1024; /* max bytes to prefetch per stream (default 64MB) */ -unsigned int zfetch_max_distance = 64 * 1024 * 1024; +static unsigned int zfetch_max_distance = 64 * 1024 * 1024; #endif /* max bytes to prefetch indirects for per stream (default 128MB) */ -unsigned int zfetch_max_idistance = 128 * 1024 * 1024; +static unsigned int zfetch_max_idistance = 128 * 1024 * 1024; /* max request reorder distance within a stream (default 16MB) */ -unsigned int zfetch_max_reorder = 16 * 1024 * 1024; +static unsigned int zfetch_max_reorder = 16 * 1024 * 1024; /* Max log2 fraction of holes in a stream */ -unsigned int zfetch_hole_shift = 2; +static unsigned int zfetch_hole_shift = 2; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index 
963ff41232a3..6c150d31c669 100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -173,9 +173,7 @@ dnode_cons(void *arg, void *unused, int kmflag) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; - dn->dn_dirtyctx = 0; - dn->dn_dirtyctx_firstset = NULL; + dn->dn_dirtycnt = 0; dn->dn_bonus = NULL; dn->dn_have_spill = B_FALSE; dn->dn_zio = NULL; @@ -229,9 +227,7 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_free_txg); ASSERT0(dn->dn_assigned_txg); - ASSERT0(dn->dn_dirty_txg); - ASSERT0(dn->dn_dirtyctx); - ASSERT0P(dn->dn_dirtyctx_firstset); + ASSERT0(dn->dn_dirtycnt); ASSERT0P(dn->dn_bonus); ASSERT(!dn->dn_have_spill); ASSERT0P(dn->dn_zio); @@ -692,10 +688,8 @@ dnode_destroy(dnode_t *dn) dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; - dn->dn_dirty_txg = 0; + dn->dn_dirtycnt = 0; - dn->dn_dirtyctx = 0; - dn->dn_dirtyctx_firstset = NULL; if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_destroy(dn->dn_bonus); @@ -800,11 +794,9 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; - dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; - dn->dn_dirtyctx_firstset = NULL; - dn->dn_dirty_txg = 0; + dn->dn_dirtycnt = 0; dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; @@ -955,9 +947,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; ndn->dn_assigned_txg = odn->dn_assigned_txg; - ndn->dn_dirty_txg = odn->dn_dirty_txg; - ndn->dn_dirtyctx = odn->dn_dirtyctx; - ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; + ndn->dn_dirtycnt = odn->dn_dirtycnt; ASSERT0(zfs_refcount_count(&odn->dn_tx_holds)); zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds); ASSERT(avl_is_empty(&ndn->dn_dbufs)); @@ -1020,9 +1010,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) odn->dn_allocated_txg = 0; odn->dn_free_txg = 0; odn->dn_assigned_txg = 0; - odn->dn_dirty_txg = 0; - odn->dn_dirtyctx = 0; - odn->dn_dirtyctx_firstset = NULL; + odn->dn_dirtycnt = 0; odn->dn_have_spill = B_FALSE; odn->dn_zio = NULL; odn->dn_oldused = 0; @@ -1273,8 +1261,8 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) } else if (DN_SLOT_IS_PTR(dn)) { mutex_enter(&dn->dn_mtx); boolean_t can_free = (dn->dn_type == DMU_OT_NONE && - zfs_refcount_is_zero(&dn->dn_holds) && - !DNODE_IS_DIRTY(dn)); + dn->dn_dirtycnt == 0 && + zfs_refcount_is_zero(&dn->dn_holds)); mutex_exit(&dn->dn_mtx); if (!can_free) @@ -1757,17 +1745,23 @@ dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp) * reference on the dnode. Returns FALSE if unable to add a * new reference. 
*/ +static boolean_t +dnode_add_ref_locked(dnode_t *dn, const void *tag) +{ + ASSERT(MUTEX_HELD(&dn->dn_mtx)); + if (zfs_refcount_is_zero(&dn->dn_holds)) + return (FALSE); + VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); + return (TRUE); +} + boolean_t dnode_add_ref(dnode_t *dn, const void *tag) { mutex_enter(&dn->dn_mtx); - if (zfs_refcount_is_zero(&dn->dn_holds)) { - mutex_exit(&dn->dn_mtx); - return (FALSE); - } - VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); + boolean_t r = dnode_add_ref_locked(dn, tag); mutex_exit(&dn->dn_mtx); - return (TRUE); + return (r); } void @@ -1830,31 +1824,20 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots) } /* - * Checks if the dnode itself is dirty, or is carrying any uncommitted records. - * It is important to check both conditions, as some operations (eg appending - * to a file) can dirty both as a single logical unit, but they are not synced - * out atomically, so checking one and not the other can result in an object - * appearing to be clean mid-way through a commit. + * Test if the dnode is dirty, or carrying uncommitted records. * - * Do not change this lightly! If you get it wrong, dmu_offset_next() can - * detect a hole where there is really data, leading to silent corruption. + * dn_dirtycnt is the number of txgs this dnode is dirty on. It's incremented + * in dnode_setdirty() the first time the dnode is dirtied on a txg, and + * decremented in either dnode_rele_task() or userquota_updates_task() when the + * txg is synced out. */ boolean_t dnode_is_dirty(dnode_t *dn) { mutex_enter(&dn->dn_mtx); - - for (int i = 0; i < TXG_SIZE; i++) { - if (multilist_link_active(&dn->dn_dirty_link[i]) || - !list_is_empty(&dn->dn_dirty_records[i])) { - mutex_exit(&dn->dn_mtx); - return (B_TRUE); - } - } - + boolean_t dirty = (dn->dn_dirtycnt != 0); mutex_exit(&dn->dn_mtx); - - return (B_FALSE); + return (dirty); } void @@ -1916,7 +1899,11 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) * dnode will hang around after we finish processing its * children. */ - VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); + mutex_enter(&dn->dn_mtx); + VERIFY(dnode_add_ref_locked(dn, (void *)(uintptr_t)tx->tx_txg)); + dn->dn_dirtycnt++; + ASSERT3U(dn->dn_dirtycnt, <=, 3); + mutex_exit(&dn->dn_mtx); (void) dbuf_dirty(dn->dn_dbuf, tx); @@ -2221,32 +2208,6 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, mutex_exit(&dn->dn_dbufs_mtx); } -void -dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag) -{ - /* - * Don't set dirtyctx to SYNC if we're just modifying this as we - * initialize the objset. 
- */ - if (dn->dn_dirtyctx == DN_UNDIRTIED) { - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - - if (ds != NULL) { - rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag); - } - if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { - if (dmu_tx_is_syncing(tx)) - dn->dn_dirtyctx = DN_DIRTY_SYNC; - else - dn->dn_dirtyctx = DN_DIRTY_OPEN; - dn->dn_dirtyctx_firstset = tag; - } - if (ds != NULL) { - rrw_exit(&ds->ds_bp_rwlock, tag); - } - } -} - static void dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len, dmu_tx_t *tx) diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index dceafbc27556..6f7c060f97f8 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -251,11 +251,11 @@ spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG /* - * Everything except dprintf, set_error, spa, and indirect_remap is on - * by default in debug builds. + * Everything except dprintf, set_error, indirect_remap, and raidz_reconstruct + * is on by default in debug builds. */ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | - ZFS_DEBUG_INDIRECT_REMAP); + ZFS_DEBUG_INDIRECT_REMAP | ZFS_DEBUG_RAIDZ_RECONSTRUCT); #else int zfs_flags = 0; #endif diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 9cf35e379000..fc6d445f9785 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -29,7 +29,7 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. - * Copyright (c) 2021, Klara Inc. + * Copyright (c) 2021, 2025, Klara, Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ @@ -100,7 +100,7 @@ static uint_t zfs_vdev_default_ms_shift = 29; /* upper limit for metaslab size (16G) */ static uint_t zfs_vdev_max_ms_shift = 34; -int vdev_validate_skip = B_FALSE; +static int vdev_validate_skip = B_FALSE; /* * Since the DTL space map of a vdev is not expected to have a lot of @@ -1086,6 +1086,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } } + if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops)) + vd->vdev_autosit = + vdev_prop_default_numeric(VDEV_PROP_AUTOSIT); + /* * Add ourselves to the parent's list of children. */ @@ -1187,6 +1191,9 @@ vdev_free(vdev_t *vd) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); + if (vd->vdev_prev_histo) + kmem_free(vd->vdev_prev_histo, + sizeof (uint64_t) * VDEV_L_HISTO_BUCKETS); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); @@ -3857,6 +3864,26 @@ vdev_load(vdev_t *vd) } } + if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { + spa_t *spa = vd->vdev_spa; + uint64_t autosit; + + error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + vdev_prop_to_name(VDEV_PROP_AUTOSIT), sizeof (autosit), + 1, &autosit); + if (error == 0) { + vd->vdev_autosit = autosit == 1; + } else if (error == ENOENT) { + vd->vdev_autosit = vdev_prop_default_numeric( + VDEV_PROP_AUTOSIT); + } else { + vdev_dbgmsg(vd, + "vdev_load: zap_lookup(top_zap=%llu) " + "failed [error=%d]", + (u_longlong_t)vd->vdev_top_zap, error); + } + } + /* * Load any rebuild state from the top-level vdev zap. 
*/ @@ -4616,6 +4643,8 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; + atomic_store_64(&vd->vdev_outlier_count, 0); + vd->vdev_read_sit_out_expire = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); @@ -6107,6 +6136,56 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_failfast = intval & 1; break; + case VDEV_PROP_SIT_OUT: + /* Only expose this for a draid or raidz leaf */ + if (!vd->vdev_ops->vdev_op_leaf || + vd->vdev_top == NULL || + (vd->vdev_top->vdev_ops != &vdev_raidz_ops && + vd->vdev_top->vdev_ops != &vdev_draid_ops)) { + error = ENOTSUP; + break; + } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + if (intval == 1) { + vdev_t *ancestor = vd; + while (ancestor->vdev_parent != vd->vdev_top) + ancestor = ancestor->vdev_parent; + vdev_t *pvd = vd->vdev_top; + uint_t sitouts = 0; + for (int i = 0; i < pvd->vdev_children; i++) { + if (pvd->vdev_child[i] == ancestor) + continue; + if (vdev_sit_out_reads( + pvd->vdev_child[i], 0)) { + sitouts++; + } + } + if (sitouts >= vdev_get_nparity(pvd)) { + error = ZFS_ERR_TOO_MANY_SITOUTS; + break; + } + if (error == 0) + vdev_raidz_sit_child(vd, + INT64_MAX - gethrestime_sec()); + } else { + vdev_raidz_unsit_child(vd); + } + break; + case VDEV_PROP_AUTOSIT: + if (vd->vdev_ops != &vdev_raidz_ops && + vd->vdev_ops != &vdev_draid_ops) { + error = ENOTSUP; + break; + } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_autosit = intval == 1; + break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; @@ -6456,6 +6535,19 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) ZPROP_SRC_NONE); } continue; + case VDEV_PROP_SIT_OUT: + /* Only expose this for a draid or raidz leaf */ + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_top != NULL && + (vd->vdev_top->vdev_ops == + &vdev_raidz_ops || + vd->vdev_top->vdev_ops == + &vdev_draid_ops)) { + vdev_prop_add_list(outnvl, propname, + NULL, vdev_sit_out_reads(vd, 0), + ZPROP_SRC_NONE); + } + continue; case VDEV_PROP_TRIM_SUPPORT: /* only valid for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) { @@ -6506,6 +6598,29 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, strval, intval, src); break; + case VDEV_PROP_AUTOSIT: + /* Only raidz and draid vdevs can have this property */ + if (vd->vdev_ops != &vdev_raidz_ops && + vd->vdev_ops != &vdev_draid_ops) { + src = ZPROP_SRC_NONE; + intval = ZPROP_BOOLEAN_NA; + } else { + err = vdev_prop_get_int(vd, prop, + &intval); + if (err && err != ENOENT) + break; + + if (intval == + vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + } + + vdev_prop_add_list(outnvl, propname, NULL, + intval, src); + break; + case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c index a05289102af2..8588cfee3f7d 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_draid.c +++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2018 Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + * Copyright (c) 2025, Klara, Inc. 
*/ #include <sys/zfs_context.h> @@ -1996,6 +1997,33 @@ vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr) rc->rc_allow_repair = 1; } } + + if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + ASSERT0(rc->rc_latency_outlier); + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. + */ + if (rr->rr_outlier_cnt > 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) >= + (rr->rr_missingdata + 1)) { + + for (int c = rr->rr_cols - 1; c >= rr->rr_firstdatacol; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error == 0 && rc->rc_latency_outlier) { + rr->rr_missingdata++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } } /* diff --git a/sys/contrib/openzfs/module/zfs/vdev_file.c b/sys/contrib/openzfs/module/zfs/vdev_file.c index f457669bc809..20b4db65ec06 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_file.c +++ b/sys/contrib/openzfs/module/zfs/vdev_file.c @@ -228,7 +228,8 @@ vdev_file_io_strategy(void *arg) abd_return_buf_copy(zio->io_abd, buf, size); } else { buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); - err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); + err = zfs_file_pwrite(vf->vf_file, buf, size, off, + vd->vdev_ashift, &resid); abd_return_buf(zio->io_abd, buf, size); } zio->io_error = err; diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index c12713b107bf..e69e5598939e 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -122,7 +122,7 @@ * The maximum number of i/os active to each device. Ideally, this will be >= * the sum of each queue's max_active. */ -uint_t zfs_vdev_max_active = 1000; +static uint_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index b597d6daefde..56b8e3b60b22 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -24,6 +24,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ #include <sys/zfs_context.h> @@ -356,6 +357,32 @@ unsigned long raidz_expand_max_reflow_bytes = 0; uint_t raidz_expand_pause_point = 0; /* + * This represents the duration for a slow drive read sit out. + */ +static unsigned long vdev_read_sit_out_secs = 600; + +/* + * How often each RAID-Z and dRAID vdev will check for slow disk outliers. + * Increasing this interval will reduce the sensitivity of detection (since all + * I/Os since the last check are included in the statistics), but will slow the + * response to a disk developing a problem. + * + * Defaults to once per second; setting extremely small values may cause + * negative performance effects. + */ +static hrtime_t vdev_raidz_outlier_check_interval_ms = 1000; + +/* + * When performing slow outlier checks for RAID-Z and dRAID vdevs, this value is + * used to determine how far out an outlier must be before it counts as an event + * worth considering. 
+ * + * Smaller values will result in more aggressive sitting out of disks that may + * have problems, but may significantly increase the rate of spurious sit-outs. + */ +static uint32_t vdev_raidz_outlier_insensitivity = 50; + +/* * Maximum amount of copy io's outstanding at once. */ #ifdef _ILP32 @@ -2311,6 +2338,41 @@ vdev_raidz_min_asize(vdev_t *vd) vd->vdev_children); } +/* + * Return B_TRUE if a read should be skipped due to being too slow. + * + * vdev_child_slow_outlier() looks for outliers based on disk + * latency from the most recent child reads. Here we're checking if, + * over time, a disk has been an outlier too many times and is + * now in a sit out period. + */ +boolean_t +vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags) +{ + if (vdev_read_sit_out_secs == 0) + return (B_FALSE); + + /* Avoid skipping a data column read when scrubbing */ + if (io_flags & ZIO_FLAG_SCRUB) + return (B_FALSE); + + if (!vd->vdev_ops->vdev_op_leaf) { + boolean_t sitting = B_FALSE; + for (int c = 0; c < vd->vdev_children; c++) { + sitting |= vdev_sit_out_reads(vd->vdev_child[c], + io_flags); + } + return (sitting); + } + + if (vd->vdev_read_sit_out_expire >= gethrestime_sec()) + return (B_TRUE); + + vd->vdev_read_sit_out_expire = 0; + + return (B_FALSE); +} + void vdev_raidz_child_done(zio_t *zio) { @@ -2475,6 +2537,45 @@ vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) rc->rc_skipped = 1; continue; } + + if (vdev_sit_out_reads(cvd, zio->io_flags)) { + rr->rr_outlier_cnt++; + ASSERT0(rc->rc_latency_outlier); + rc->rc_latency_outlier = 1; + } + } + + /* + * When the row contains a latency outlier and sufficient parity + * exists to reconstruct the column data, then skip reading the + * known slow child vdev as a performance optimization. 
*/ + if (rr->rr_outlier_cnt > 0 && + (rr->rr_firstdatacol - rr->rr_missingparity) >= + (rr->rr_missingdata + 1)) { + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error == 0 && rc->rc_latency_outlier) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(EAGAIN); + rc->rc_skipped = 1; + break; + } + } + } + + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error || rc->rc_size == 0) + continue; + if (forceparity || c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { @@ -2498,6 +2599,7 @@ vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) ASSERT3U(prc->rc_devidx, ==, i); vdev_t *cvd = vd->vdev_child[i]; + if (!vdev_readable(cvd)) { prc->rc_error = SET_ERROR(ENXIO); prc->rc_tried = 1; /* don't even try */ @@ -2774,6 +2876,239 @@ vdev_raidz_worst_error(raidz_row_t *rr) return (error); } +/* + * Find the median value from a set of n values + */ +static uint64_t +latency_median_value(const uint64_t *data, size_t n) +{ + uint64_t m; + + if (n % 2 == 0) + m = (data[(n >> 1) - 1] + data[n >> 1]) >> 1; + else + m = data[((n + 1) >> 1) - 1]; + + return (m); } + +/* + * Calculate the outlier fence from a set of n latency values + * + * fence = Q3 + vdev_raidz_outlier_insensitivity x (Q3 - Q1) + */ +static uint64_t +latency_quartiles_fence(const uint64_t *data, size_t n, uint64_t *iqr) +{ + uint64_t q1 = latency_median_value(&data[0], n >> 1); + uint64_t q3 = latency_median_value(&data[(n + 1) >> 1], n >> 1); + + /* + * To avoid detecting false positive outliers when N is small + * and the latency values are very close, make sure the IQR + * is at least 25% of Q1. + */ + *iqr = MAX(q3 - q1, q1 / 4); + + return (q3 + (*iqr * vdev_raidz_outlier_insensitivity)); } +#define LAT_CHILDREN_MIN 5 +#define LAT_OUTLIER_LIMIT 20 + +static int +latency_compare(const void *arg1, const void *arg2) +{ + const uint64_t *l1 = (uint64_t *)arg1; + const uint64_t *l2 = (uint64_t *)arg2; + + return (TREE_CMP(*l1, *l2)); +} + +void +vdev_raidz_sit_child(vdev_t *svd, uint64_t secs) +{ + for (int c = 0; c < svd->vdev_children; c++) + vdev_raidz_sit_child(svd->vdev_child[c], secs); + + if (!svd->vdev_ops->vdev_op_leaf) + return; + + /* Begin a sit out period for this slow drive */ + svd->vdev_read_sit_out_expire = gethrestime_sec() + + secs; + + /* Count each slow io period */ + mutex_enter(&svd->vdev_stat_lock); + svd->vdev_stat.vs_slow_ios++; + mutex_exit(&svd->vdev_stat_lock); +} + +void +vdev_raidz_unsit_child(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_raidz_unsit_child(vd->vdev_child[c]); + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + vd->vdev_read_sit_out_expire = 0; +} + +/* + * Check for any latency outlier from the latest set of child reads. + * + * Uses Tukey's fence, with K = 50, for detecting extreme outliers. This + * rule defines extreme outliers as data points outside the fence of the + * third quartile plus fifty times the Interquartile Range (IQR). This range + * is the distance between the first and third quartile. + * + * Fifty is an extremely large value for Tukey's fence, but the outliers we're + * attempting to detect here are orders of magnitude larger than the + * median. This large value should capture any truly faulty disk quickly, + * without causing spurious sit-outs. 
+ * + * To further avoid spurious sit-outs, vdevs must be detected multiple times + * as an outlier before they are sat out, and outlier counts will gradually decay. + * Every nchildren times we have detected an outlier, we subtract 2 from the + * outlier count of all children. If detected outliers are close to uniformly + * distributed, this will result in the outlier count remaining close to 0 + * (in expectation; over long enough time-scales, spurious sit-outs are still + * possible). + */ +static void +vdev_child_slow_outlier(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + if (!vd->vdev_autosit || vdev_read_sit_out_secs == 0 || + vd->vdev_children < LAT_CHILDREN_MIN) + return; + + hrtime_t now = getlrtime(); + uint64_t last = atomic_load_64(&vd->vdev_last_latency_check); + + if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms)) + return; + + /* Allow a single winner when there are racing callers. */ + if (atomic_cas_64(&vd->vdev_last_latency_check, last, now) != last) + return; + + int children = vd->vdev_children; + uint64_t *lat_data = kmem_alloc(sizeof (uint64_t) * children, KM_SLEEP); + + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + if (cvd->vdev_prev_histo == NULL) { + mutex_enter(&cvd->vdev_stat_lock); + size_t size = + sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + cvd->vdev_prev_histo = kmem_zalloc(size, KM_SLEEP); + memcpy(cvd->vdev_prev_histo, + cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ], + size); + mutex_exit(&cvd->vdev_stat_lock); + } + } + uint64_t max = 0; + vdev_t *svd = NULL; + uint_t sitouts = 0; + boolean_t skip = B_FALSE, svd_sitting = B_FALSE; + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + boolean_t sitting = vdev_sit_out_reads(cvd, 0) || + cvd->vdev_state != VDEV_STATE_HEALTHY; + + /* We can't sit out more disks than we have parity */ + if (sitting && ++sitouts >= vdev_get_nparity(vd)) + skip = B_TRUE; + + mutex_enter(&cvd->vdev_stat_lock); + + uint64_t *prev_histo = cvd->vdev_prev_histo; + uint64_t *histo = + cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ]; + if (skip) { + size_t size = + sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + memcpy(prev_histo, histo, size); + mutex_exit(&cvd->vdev_stat_lock); + continue; + } + uint64_t count = 0; + lat_data[c] = 0; + for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) { + uint64_t this_count = histo[i] - prev_histo[i]; + lat_data[c] += (1ULL << i) * this_count; + count += this_count; + } + size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]); + memcpy(prev_histo, histo, size); + mutex_exit(&cvd->vdev_stat_lock); + lat_data[c] /= MAX(1, count); + + /* Wait until all disks have been read from */ + if (lat_data[c] == 0 && !sitting) { + skip = B_TRUE; + continue; + } + + /* Keep track of the vdev with largest value */ + if (lat_data[c] > max) { + max = lat_data[c]; + svd = cvd; + svd_sitting = sitting; + } + } + + if (skip) { + kmem_free(lat_data, sizeof (uint64_t) * children); + return; + } + + qsort((void *)lat_data, children, sizeof (uint64_t), latency_compare); + + uint64_t iqr; + uint64_t fence = latency_quartiles_fence(lat_data, children, &iqr); + + ASSERT3U(lat_data[children - 1], ==, max); + if (max > fence && !svd_sitting) { + ASSERT3U(iqr, >, 0); + uint64_t incr = MAX(1, MIN((max - fence) / iqr, + LAT_OUTLIER_LIMIT / 4)); + vd->vdev_outlier_count += incr; + if (vd->vdev_outlier_count >= children) { + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + cvd->vdev_outlier_count -= 2; + cvd->vdev_outlier_count = MAX(0, + 
cvd->vdev_outlier_count); + } + vd->vdev_outlier_count = 0; + } + /* + * Keep track of how many times this child has had + * an outlier read. A disk that persistently has a + * higher outlier count than its peers will be considered + * a slow disk. + */ + svd->vdev_outlier_count += incr; + if (svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) { + ASSERT0(svd->vdev_read_sit_out_expire); + vdev_raidz_sit_child(svd, vdev_read_sit_out_secs); + (void) zfs_ereport_post(FM_EREPORT_ZFS_SITOUT, + zio->io_spa, svd, NULL, NULL, 0); + vdev_dbgmsg(svd, "begin read sit out for %d secs", + (int)vdev_read_sit_out_secs); + + for (int c = 0; c < vd->vdev_children; c++) + vd->vdev_child[c]->vdev_outlier_count = 0; + } + } + + kmem_free(lat_data, sizeof (uint64_t) * children); +} + static void vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { @@ -3515,6 +3850,9 @@ vdev_raidz_io_done(zio_t *zio) raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_verified(zio, rr); } + /* Periodically check for a read outlier */ + if (zio->io_type == ZIO_TYPE_READ) + vdev_child_slow_outlier(zio); zio_checksum_verified(zio); } else { /* @@ -5155,3 +5493,10 @@ ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, "For expanded RAIDZ, automatically start a pool scrub when expansion " "completes"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, read_sit_out_secs, ULONG, ZMOD_RW, + "Raidz/draid slow disk sit out time period in seconds"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_check_interval_ms, U64, + ZMOD_RW, "Interval to check for slow raidz/draid children"); +ZFS_MODULE_PARAM(zfs_vdev, vdev_, raidz_outlier_insensitivity, UINT, + ZMOD_RW, "How insensitive the slow raidz/draid child check should be"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index 2f7a739da241..2ce0121324ad 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -105,7 +105,7 @@ static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024; * * See also the accessor function spa_remove_max_segment(). */ -uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; +static uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; /* * Ignore hard IO errors during device removal. When set if a device @@ -137,7 +137,7 @@ uint_t vdev_removal_max_span = 32 * 1024; * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a removal. */ -int zfs_removal_suspend_progress = 0; +static int zfs_removal_suspend_progress = 0; #define VDEV_REMOVAL_ZAP_OBJS "lzap" diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c index 0816ea134bf3..4cf9e0dbb405 100644 --- a/sys/contrib/openzfs/module/zfs/zfeature.c +++ b/sys/contrib/openzfs/module/zfs/zfeature.c @@ -308,6 +308,7 @@ feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + ASSERT(MUTEX_HELD(&spa->spa_feat_stats_lock)); VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, sizeof (uint64_t), 1, &refcount, tx)); @@ -360,7 +361,9 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) feature->fi_guid, 1, strlen(feature->fi_desc) + 1, feature->fi_desc, tx)); + mutex_enter(&spa->spa_feat_stats_lock); feature_sync(spa, feature, initial_refcount, tx); + mutex_exit(&spa->spa_feat_stats_lock); if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { uint64_t enabling_txg = dmu_tx_get_txg(tx); @@ -416,6 +419,7 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, ASSERT(dmu_tx_is_syncing(tx)); ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + mutex_enter(&spa->spa_feat_stats_lock); VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP); switch (action) { @@ -433,6 +437,7 @@ feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, } feature_sync(spa, feature, refcount, tx); + mutex_exit(&spa->spa_feat_stats_lock); } void diff --git a/sys/contrib/openzfs/module/zfs/zfs_crrd.c b/sys/contrib/openzfs/module/zfs/zfs_crrd.c index f9267ed41d71..30d4c7c36897 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_crrd.c +++ b/sys/contrib/openzfs/module/zfs/zfs_crrd.c @@ -162,9 +162,9 @@ dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg) daydiff = time - rrd_tail(&db->dbr_days); monthdiff = time - rrd_tail(&db->dbr_months); - if (monthdiff >= 0 && monthdiff >= SEC2NSEC(30 * 24 * 60 * 60)) + if (monthdiff >= 0 && monthdiff >= 30 * 24 * 60 * 60) rrd_add(&db->dbr_months, time, txg); - else if (daydiff >= 0 && daydiff >= SEC2NSEC(24 * 60 * 60)) + else if (daydiff >= 0 && daydiff >= 24 * 60 * 60) rrd_add(&db->dbr_days, time, txg); else if (minutedif >= 0) rrd_add(&db->dbr_minutes, time, txg); @@ -208,7 +208,8 @@ dbrrd_closest(hrtime_t tv, const rrd_data_t *r1, const rrd_data_t *r2) if (r2 == NULL) return (r1); - return (ABS(tv - r1->rrdd_time) < ABS(tv - r2->rrdd_time) ? r1 : r2); + return (ABS(tv - (hrtime_t)r1->rrdd_time) < + ABS(tv - (hrtime_t)r2->rrdd_time) ? 
r1 : r2); } uint64_t diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index 121b966b9864..5ca7c2320c4e 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -683,6 +683,7 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) dsl_dataset_t *ds; const char *cp; int error; + boolean_t rawok = (zc->zc_flags & 0x8); /* * Generate the current snapshot name from the given objsetid, then @@ -705,6 +706,10 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); + if (error != 0 && rawok == B_TRUE) { + error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, + ZFS_DELEG_PERM_SEND_RAW, cr); + } dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); @@ -714,9 +719,17 @@ zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + boolean_t rawok = nvlist_exists(innvl, "rawok"); + int error; + (void) innvl; - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_SEND, cr)); + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND, cr); + if (error != 0 && rawok == B_TRUE) { + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND_RAW, cr); + } + return (error); } static int @@ -4726,7 +4739,7 @@ zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) error = error ? error : resume_err; } zfs_vfs_rele(zfsvfs); - } else if ((zv = zvol_suspend(fsname)) != NULL) { + } else if (zvol_suspend(fsname, &zv) == 0) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); @@ -5448,7 +5461,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, const char *origin, } error = error ? error : end_err; zfs_vfs_rele(zfsvfs); - } else if ((zv = zvol_suspend(tofs)) != NULL) { + } else if (zvol_suspend(tofs, &zv) == 0) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { @@ -7619,7 +7632,7 @@ zfs_ioctl_init(void) zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_NONE, B_TRUE, B_TRUE, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); zfs_ioctl_register("get_props", ZFS_IOC_POOL_GET_PROPS, diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index 2fd3e1c37045..faced0db7e9e 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -1145,20 +1145,34 @@ zvol_tag(zvol_state_t *zv) /* * Suspend the zvol for recv and rollback. */ -zvol_state_t * -zvol_suspend(const char *name) +int +zvol_suspend(const char *name, zvol_state_t **zvp) { zvol_state_t *zv; zv = zvol_find_by_name(name, RW_WRITER); if (zv == NULL) - return (NULL); + return (SET_ERROR(ENOENT)); /* block all I/O, release in zvol_resume. */ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); + /* + * If it's being removed, unlock and return error. It doesn't make any + * sense to try to suspend a zvol being removed, but being here also + * means that zvol_remove_minors_impl() is about to call zvol_remove() + * and then destroy the zvol_state_t, so returning a pointer to it for + * the caller to mess with would be a disaster anyway. 
+ */ + if (zv->zv_flags & ZVOL_REMOVING) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + /* NB: Returning EIO here to match zfsvfs_teardown() */ + return (SET_ERROR(EIO)); + } + atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) @@ -1171,7 +1185,8 @@ zvol_suspend(const char *name) mutex_exit(&zv->zv_state_lock); /* zv_suspend_lock is released in zvol_resume() */ - return (zv); + *zvp = zv; + return (0); } int
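
The dnode.c and dmu_objset.c hunks above replace the old dn_dirtyctx/dn_dirty_txg tracking with a single dn_dirtycnt counter: dnode_setdirty() increments it the first time a dnode is dirtied in a txg, dnode_rele_task() and userquota_updates_task() decrement it when that txg syncs out, and dnode_is_dirty() reduces to a non-zero test. Below is a minimal user-space model of that invariant; it is not part of the commit, and the toy_* names are illustrative only.

/* Build: cc -o dirtycnt dirtycnt.c */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_dnode {
	unsigned dirtycnt;	/* models dn_dirtycnt */
};

/* First dirty in a txg: models the increment in dnode_setdirty(). */
static void
toy_setdirty(struct toy_dnode *dn)
{
	dn->dirtycnt++;
	assert(dn->dirtycnt <= 3);	/* mirrors ASSERT3U(dn_dirtycnt, <=, 3) */
}

/* Txg synced out: models dnode_rele_task()/userquota_updates_task(). */
static void
toy_synced(struct toy_dnode *dn)
{
	assert(dn->dirtycnt > 0);
	dn->dirtycnt--;
}

/* Models dnode_is_dirty(): dirty in any txg means dirty. */
static bool
toy_is_dirty(const struct toy_dnode *dn)
{
	return (dn->dirtycnt != 0);
}

int
main(void)
{
	struct toy_dnode dn = { 0 };

	toy_setdirty(&dn);	/* dirtied in txg N */
	toy_setdirty(&dn);	/* dirtied again in txg N+1 */
	toy_synced(&dn);	/* txg N syncs out */
	printf("dirty: %s\n", toy_is_dirty(&dn) ? "yes" : "no");	/* yes */
	toy_synced(&dn);	/* txg N+1 syncs out */
	printf("dirty: %s\n", toy_is_dirty(&dn) ? "yes" : "no");	/* no */
	return (0);
}

The counter stays at or below 3 because at most three txgs can be in flight at once (open, quiescing, syncing), which is what the ASSERT3U in dnode_setdirty() encodes.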
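In vdev_raidz.c, vdev_child_slow_outlier() derives each child's mean read latency from the delta between the current vsx_disk_histo[ZIO_TYPE_READ] histogram and the vdev_prev_histo snapshot taken at the previous check. A rough user-space sketch of that conversion follows; it assumes, as the power-of-two weighting in the loop implies, that bucket i counts reads completing in about 2^i ns, and NBUCKETS is only a stand-in for VDEV_L_HISTO_BUCKETS.

/* Build: cc -o histmean histmean.c */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NBUCKETS 37	/* illustrative; stands in for VDEV_L_HISTO_BUCKETS */

static uint64_t
mean_latency_delta(uint64_t *prev, const uint64_t *cur)
{
	uint64_t sum = 0, count = 0;

	for (int i = 0; i < NBUCKETS; i++) {
		uint64_t d = cur[i] - prev[i];
		sum += (1ULL << i) * d;	/* weight bucket by ~2^i ns */
		count += d;
	}
	/* Roll the snapshot forward, as the kernel code does. */
	memcpy(prev, cur, sizeof (uint64_t) * NBUCKETS);
	return (count ? sum / count : 0);
}

int
main(void)
{
	uint64_t prev[NBUCKETS] = { 0 }, cur[NBUCKETS] = { 0 };

	cur[17] = 90;	/* 90 reads near 128 us (2^17 ns) */
	cur[20] = 10;	/* 10 reads near 1 ms (2^20 ns) */
	printf("mean ~%ju ns\n", (uintmax_t)mean_latency_delta(prev, cur));
	return (0);
}

Because only the delta since the last snapshot is weighted, each check measures just the reads issued during the last interval rather than the device's lifetime average.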
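The fence arithmetic in latency_median_value() and latency_quartiles_fence() is easy to sanity-check in isolation. The sketch below (not from the commit) mirrors the same index math and the Q1/4 floor on the IQR, using hypothetical per-child mean latencies for a 10-wide vdev where one child is dramatically slower.

/* Build: cc -o fence fence.c */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define INSENSITIVITY 50	/* mirrors vdev_raidz_outlier_insensitivity */

static int
cmp_u64(const void *a, const void *b)
{
	uint64_t l = *(const uint64_t *)a;
	uint64_t r = *(const uint64_t *)b;

	return ((l > r) - (l < r));	/* ascending */
}

/* Same even/odd median selection as latency_median_value(). */
static uint64_t
median(const uint64_t *data, size_t n)
{
	if (n % 2 == 0)
		return ((data[n / 2 - 1] + data[n / 2]) / 2);
	return (data[(n + 1) / 2 - 1]);
}

int
main(void)
{
	/* Hypothetical mean read latencies (us), one obvious outlier. */
	uint64_t lat[] = { 225, 240, 210, 215, 230, 60000, 205, 235, 220, 245 };
	size_t n = sizeof (lat) / sizeof (lat[0]);

	qsort(lat, n, sizeof (uint64_t), cmp_u64);

	uint64_t q1 = median(&lat[0], n / 2);
	uint64_t q3 = median(&lat[(n + 1) / 2], n / 2);
	uint64_t iqr = (q3 - q1 > q1 / 4) ? (q3 - q1) : (q1 / 4);
	uint64_t fence = q3 + iqr * INSENSITIVITY;

	printf("Q1=%ju Q3=%ju IQR=%ju fence=%ju max=%ju -> %s\n",
	    (uintmax_t)q1, (uintmax_t)q3, (uintmax_t)iqr,
	    (uintmax_t)fence, (uintmax_t)lat[n - 1],
	    lat[n - 1] > fence ? "outlier" : "ok");
	return (0);
}

With these numbers Q1=215, Q3=240, the IQR floor (53) kicks in because the healthy children are tightly clustered, and the fence lands at 2890: the 60000 us child is flagged, while ordinary jitter around 200-250 us never comes close.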
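The decay rule documented above vdev_child_slow_outlier() (every `children` detections, subtract 2 from each child's count) can be simulated to see why only a persistently slow child reaches LAT_OUTLIER_LIMIT. The model below is not from the commit and is deliberately simplified: it folds the parent tally and the decay into one step and pins the increment at 1, the minimum the kernel code would apply.

/* Build: cc -o decay decay.c */
#include <stdio.h>

#define CHILDREN 10
#define LIMIT 20	/* mirrors LAT_OUTLIER_LIMIT */

int
main(void)
{
	int count[CHILDREN] = { 0 };
	int parent = 0;
	int slow = 3;	/* child 3 is persistently slow */

	for (int round = 1; ; round++) {
		count[slow] += 1;	/* incr == 1: barely past the fence */
		parent++;
		if (parent >= CHILDREN) {
			/* Decay: every child loses 2, floored at 0. */
			for (int c = 0; c < CHILDREN; c++)
				count[c] = count[c] > 2 ? count[c] - 2 : 0;
			parent = 0;
		}
		if (count[slow] > LIMIT) {
			printf("child %d sat out after %d detections\n",
			    slow, round);
			break;
		}
	}
	return (0);
}

A child flagged only occasionally decays back toward zero between detections; one flagged every round gains roughly (children - 2)/children per check and crosses the limit after a few dozen detections, at which point the kernel code calls vdev_raidz_sit_child() and posts FM_EREPORT_ZFS_SITOUT.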
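Finally, the zfs_crrd.c hunk adds (hrtime_t) casts in dbrrd_closest() before subtracting rrdd_time from tv. The reason is the usual arithmetic-conversion trap, shown in the standalone sketch below (not from the commit): when one operand is uint64_t, the signed operand converts to unsigned, a negative difference wraps, and ABS() cannot repair it.

/* Build: cc -o abswrap abswrap.c */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define ABS(x) ((x) < 0 ? -(x) : (x))

int
main(void)
{
	int64_t tv = 100;	/* plays the signed hrtime_t query time */
	uint64_t rec = 200;	/* plays rrdd_time, stored unsigned */

	/* Unsigned arithmetic: 100 - 200 wraps to 2^64 - 100. */
	uint64_t wrong = tv - rec;

	/* Casting first keeps the arithmetic signed: -100 -> 100. */
	int64_t right = ABS(tv - (int64_t)rec);

	printf("wrong=%" PRIu64 " right=%" PRIi64 "\n", wrong, right);
	return (0);
}

Without the cast, a record timestamp later than tv would always look like the farthest candidate instead of the nearest, so dbrrd_closest() could return the wrong RRD sample.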