Diffstat (limited to 'sys/contrib/openzfs/module/zfs')
45 files changed, 1557 insertions, 611 deletions
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
index 04ca32356a6d..3483be64ec57 100644
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -1052,7 +1052,7 @@ static arc_buf_hdr_t *
 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 {
 	const dva_t *dva = BP_IDENTITY(bp);
-	uint64_t birth = BP_GET_BIRTH(bp);
+	uint64_t birth = BP_GET_PHYSICAL_BIRTH(bp);
 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 	arc_buf_hdr_t *hdr;
@@ -2631,7 +2631,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
 		ARCSTAT_INCR(arcstat_bonus_size, space);
 		break;
 	case ARC_SPACE_DNODE:
-		ARCSTAT_INCR(arcstat_dnode_size, space);
+		aggsum_add(&arc_sums.arcstat_dnode_size, space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, space);
@@ -2677,7 +2677,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
 		ARCSTAT_INCR(arcstat_bonus_size, -space);
 		break;
 	case ARC_SPACE_DNODE:
-		ARCSTAT_INCR(arcstat_dnode_size, -space);
+		aggsum_add(&arc_sums.arcstat_dnode_size, -space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, -space);
@@ -4490,7 +4490,7 @@ arc_evict(void)
 	 * target is not evictable or if they go over arc_dnode_limit.
 	 */
 	int64_t prune = 0;
-	int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
+	int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size);
 	int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA])
 	    + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA])
 	    - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA])
@@ -5082,11 +5082,13 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
 	 * in the ARC. In practice, that's in the tens of MB, which is low
 	 * enough to be safe.
 	 */
-	int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
+	int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
 	    zfs_max_recordsize;
+	int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) -
+	    arc_dnode_limit;

 	/* Always allow at least one block of overflow. */
-	if (over < 0)
+	if (arc_over < 0 && dn_over <= 0)
 		return (ARC_OVF_NONE);

 	/* If we are under memory pressure, report severe overflow. */
@@ -5097,7 +5099,7 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
 	int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2;
 	if (use_reserve)
 		overflow *= 3;
-	return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
+	return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
 }

 static abd_t *
@@ -5585,7 +5587,7 @@ arc_read_done(zio_t *zio)
 	if (HDR_IN_HASH_TABLE(hdr)) {
 		arc_buf_hdr_t *found;

-		ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp));
+		ASSERT3U(hdr->b_birth, ==, BP_GET_PHYSICAL_BIRTH(zio->io_bp));
 		ASSERT3U(hdr->b_dva.dva_word[0], ==,
 		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
 		ASSERT3U(hdr->b_dva.dva_word[1], ==,
@@ -5688,7 +5690,7 @@ arc_read_done(zio_t *zio)
 			error = SET_ERROR(EIO);
 			if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 				spa_log_error(zio->io_spa, &acb->acb_zb,
-				    BP_GET_LOGICAL_BIRTH(zio->io_bp));
+				    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
 				(void) zfs_ereport_post(
 				    FM_EREPORT_ZFS_AUTHENTICATION,
 				    zio->io_spa, NULL, &acb->acb_zb, zio, 0);
@@ -6107,7 +6109,7 @@ top:

 		if (!embedded_bp) {
 			hdr->b_dva = *BP_IDENTITY(bp);
-			hdr->b_birth = BP_GET_BIRTH(bp);
+			hdr->b_birth = BP_GET_PHYSICAL_BIRTH(bp);
 			exists = buf_hash_insert(hdr, &hash_lock);
 		}
 		if (exists != NULL) {
@@ -6955,7 +6957,7 @@ arc_write_done(zio_t *zio)
 			buf_discard_identity(hdr);
 		} else {
 			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
-			hdr->b_birth = BP_GET_BIRTH(zio->io_bp);
+			hdr->b_birth = BP_GET_PHYSICAL_BIRTH(zio->io_bp);
 		}
 	} else {
 		ASSERT(HDR_EMPTY(hdr));
@@ -7326,7 +7328,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
#if defined(COMPAT_FREEBSD11)
 		as->arcstat_other_size.value.ui64 =
 		    wmsum_value(&arc_sums.arcstat_bonus_size) +
-		    wmsum_value(&arc_sums.arcstat_dnode_size) +
+		    aggsum_value(&arc_sums.arcstat_dnode_size) +
 		    wmsum_value(&arc_sums.arcstat_dbuf_size);
#endif

@@ -7368,7 +7370,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
 		    &as->arcstat_uncached_evictable_metadata);

 		as->arcstat_dnode_size.value.ui64 =
-		    wmsum_value(&arc_sums.arcstat_dnode_size);
+		    aggsum_value(&arc_sums.arcstat_dnode_size);
 		as->arcstat_bonus_size.value.ui64 =
 		    wmsum_value(&arc_sums.arcstat_bonus_size);
 		as->arcstat_l2_hits.value.ui64 =
@@ -7738,7 +7740,7 @@ arc_state_init(void)
 	wmsum_init(&arc_sums.arcstat_data_size, 0);
 	wmsum_init(&arc_sums.arcstat_metadata_size, 0);
 	wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
-	wmsum_init(&arc_sums.arcstat_dnode_size, 0);
+	aggsum_init(&arc_sums.arcstat_dnode_size, 0);
 	wmsum_init(&arc_sums.arcstat_bonus_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_hits, 0);
 	wmsum_init(&arc_sums.arcstat_l2_misses, 0);
@@ -7897,7 +7899,7 @@ arc_state_fini(void)
 	wmsum_fini(&arc_sums.arcstat_data_size);
 	wmsum_fini(&arc_sums.arcstat_metadata_size);
 	wmsum_fini(&arc_sums.arcstat_dbuf_size);
-	wmsum_fini(&arc_sums.arcstat_dnode_size);
+	aggsum_fini(&arc_sums.arcstat_dnode_size);
 	wmsum_fini(&arc_sums.arcstat_bonus_size);
 	wmsum_fini(&arc_sums.arcstat_l2_hits);
 	wmsum_fini(&arc_sums.arcstat_l2_misses);
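The arc.c hunks above replace the wmsum counter behind arcstat_dnode_size with an aggsum. Both are scalable per-CPU counters, but only an aggsum maintains cheap lower/upper bounds, which the new dn_over check in arc_is_overflowing() reads on every allocation. A minimal sketch of the API difference (hedged; see the aggsum headers for the authoritative interface):

	aggsum_t as;
	aggsum_init(&as, 0);			/* analogous to wmsum_init() */
	aggsum_add(&as, 123);			/* cheap per-CPU update */
	/* Cheap, possibly stale bound; never greater than the true sum. */
	int64_t lb = aggsum_lower_bound(&as);
	/* Exact value; flushes all per-CPU buckets, so it costs more. */
	int64_t v = aggsum_value(&as);
	aggsum_fini(&as);

This is why the hot arc_is_overflowing() path uses aggsum_lower_bound(), while the kstat handler, where precision matters more than speed, uses aggsum_value().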
diff --git a/sys/contrib/openzfs/module/zfs/bpobj.c b/sys/contrib/openzfs/module/zfs/bpobj.c
index 8c19de93f12f..0a8a077edf63 100644
--- a/sys/contrib/openzfs/module/zfs/bpobj.c
+++ b/sys/contrib/openzfs/module/zfs/bpobj.c
@@ -954,8 +954,8 @@ space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 	(void) bp_freed, (void) tx;
 	struct space_range_arg *sra = arg;

-	if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg &&
-	    BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) {
+	if (BP_GET_BIRTH(bp) > sra->mintxg &&
+	    BP_GET_BIRTH(bp) <= sra->maxtxg) {
 		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
 			sra->used += bp_get_dsize_sync(sra->spa, bp);
 		else
diff --git a/sys/contrib/openzfs/module/zfs/brt.c b/sys/contrib/openzfs/module/zfs/brt.c
index 27d9ed7ea2b0..40664354aa73 100644
--- a/sys/contrib/openzfs/module/zfs/brt.c
+++ b/sys/contrib/openzfs/module/zfs/brt.c
@@ -478,6 +478,18 @@ brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
 	BRT_DEBUG("Pool directory object created, object=%s", name);

+	/*
+	 * Activate the endian-fixed feature on the first BRT ZAP (i.e., when
+	 * BLOCK_CLONING is not yet active) if it is enabled, or take another
+	 * reference if it is already active.
+	 */
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN) &&
+	    !spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
+		spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
+	} else if (spa_feature_is_active(spa,
+	    SPA_FEATURE_BLOCK_CLONING_ENDIAN)) {
+		spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
+	}
+
 	spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
 }
@@ -658,6 +670,8 @@ brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
 	rw_exit(&brtvd->bv_lock);

 	spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
+	if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN))
+		spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
 }

 static void
@@ -855,16 +869,29 @@ brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
 	*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
 }

+static boolean_t
+brt_has_endian_fixed(spa_t *spa)
+{
+	return (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN));
+}
+
 static int
-brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre)
+brt_entry_lookup(spa_t *spa, brt_vdev_t *brtvd, brt_entry_t *bre)
 {
 	uint64_t off = BRE_OFFSET(bre);

 	if (brtvd->bv_mos_entries == 0)
 		return (SET_ERROR(ENOENT));

-	return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
-	    &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count));
+	if (brt_has_endian_fixed(spa)) {
+		return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
+		    &off, BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
+		    &bre->bre_count));
+	} else {
+		return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
+		    &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
+		    &bre->bre_count));
+	}
 }

 /*
@@ -1056,7 +1083,7 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp)
 	}
 	rw_exit(&brtvd->bv_lock);

-	error = brt_entry_lookup(brtvd, &bre_search);
+	error = brt_entry_lookup(spa, brtvd, &bre_search);
 	/* bre_search now contains correct bre_count */
 	if (error == ENOENT) {
 		BRTSTAT_BUMP(brt_decref_no_entry);
@@ -1118,7 +1145,7 @@ brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
 	if (bre == NULL) {
 		rw_exit(&brtvd->bv_lock);
-		error = brt_entry_lookup(brtvd, &bre_search);
+		error = brt_entry_lookup(spa, brtvd, &bre_search);
 		if (error == ENOENT) {
 			refcnt = 0;
 		} else {
@@ -1270,10 +1297,18 @@ brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
 		uint64_t off = BRE_OFFSET(bre);
 		if (brtvd->bv_mos_entries != 0 &&
 		    brt_vdev_lookup(spa, brtvd, off)) {
-			int error = zap_lookup_uint64_by_dnode(
-			    brtvd->bv_mos_entries_dnode, &off,
-			    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
-			    &bre->bre_count);
+			int error;
+			if (brt_has_endian_fixed(spa)) {
+				error = zap_lookup_uint64_by_dnode(
+				    brtvd->bv_mos_entries_dnode, &off,
+				    BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
+				    &bre->bre_count);
+			} else {
+				error = zap_lookup_uint64_by_dnode(
+				    brtvd->bv_mos_entries_dnode, &off,
+				    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
+				    &bre->bre_count);
+			}
 			if (error == 0) {
 				BRTSTAT_BUMP(brt_addref_entry_on_disk);
 			} else {
@@ -1326,7 +1361,7 @@ brt_pending_apply(spa_t *spa, uint64_t txg)
 }

 static void
-brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
+brt_sync_entry(spa_t *spa, dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
 {
 	uint64_t off = BRE_OFFSET(bre);

@@ -1337,9 +1372,15 @@ brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
 		    BRT_KEY_WORDS, tx);
 		VERIFY(error == 0 || error == ENOENT);
 	} else {
-		VERIFY0(zap_update_uint64_by_dnode(dn, &off,
-		    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
-		    &bre->bre_count, tx));
+		if (brt_has_endian_fixed(spa)) {
+			VERIFY0(zap_update_uint64_by_dnode(dn, &off,
+			    BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
+			    &bre->bre_count, tx));
+		} else {
+			VERIFY0(zap_update_uint64_by_dnode(dn, &off,
+			    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
+			    &bre->bre_count, tx));
+		}
 	}
 }

@@ -1368,7 +1409,8 @@ brt_sync_table(spa_t *spa, dmu_tx_t *tx)
 		void *c = NULL;
 		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
-			brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx);
+			brt_sync_entry(spa, brtvd->bv_mos_entries_dnode, bre,
+			    tx);
 			kmem_cache_free(brt_entry_cache, bre);
 		}
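The brt.c changes work around a BRT ZAP endianness bug: the refcount was declared to ZAP as eight 1-byte integers, so ZAP never byte-swapped it when a pool moved between hosts of different endianness. With SPA_FEATURE_BLOCK_CLONING_ENDIAN active, the same value is declared as one 8-byte integer. A side-by-side sketch of the two calls (ZAP takes the integer size followed by the number of integers; hedged reading of this diff):

	/* Old layout: integer_size = 1, num_integers = 8 (endian-unsafe). */
	error = zap_lookup_uint64_by_dnode(dn, &off, BRT_KEY_WORDS,
	    1, sizeof (bre->bre_count), &bre->bre_count);

	/* Fixed layout: integer_size = 8, num_integers = 1 (byte-swapped). */
	error = zap_lookup_uint64_by_dnode(dn, &off, BRT_KEY_WORDS,
	    sizeof (bre->bre_count), 1, &bre->bre_count);

Both layouts occupy the same eight bytes on disk, which is why pools that never activate the new feature can keep reading and writing the old format.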
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c
index f1b5a17f337e..432c99cec960 100644
--- a/sys/contrib/openzfs/module/zfs/dbuf.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@@ -866,8 +866,16 @@ dbuf_evict_notify(uint64_t size)
 	 * and grabbing the lock results in massive lock contention.
 	 */
 	if (size > dbuf_cache_target_bytes()) {
-		if (size > dbuf_cache_hiwater_bytes())
+		/*
+		 * Avoid calling dbuf_evict_one() from memory reclaim context
+		 * (e.g. Linux kswapd, FreeBSD pagedaemon) to prevent deadlocks.
+		 * Memory reclaim threads can get stuck waiting for the dbuf
+		 * hash lock.
+		 */
+		if (size > dbuf_cache_hiwater_bytes() &&
+		    !current_is_reclaim_thread()) {
 			dbuf_evict_one();
+		}
 		cv_signal(&dbuf_evict_cv);
 	}
 }
@@ -1235,11 +1243,9 @@ dbuf_verify(dmu_buf_impl_t *db)
 				    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
 				    DVA_IS_EMPTY(&bp->blk_dva[2]));
 				ASSERT0(bp->blk_fill);
-				ASSERT0(bp->blk_pad[0]);
-				ASSERT0(bp->blk_pad[1]);
 				ASSERT(!BP_IS_EMBEDDED(bp));
 				ASSERT(BP_IS_HOLE(bp));
-				ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
+				ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp));
 			}
 		}
 	}
@@ -1615,7 +1621,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,
 	 */
 	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {
 		spa_log_error(db->db_objset->os_spa, &zb,
-		    BP_GET_LOGICAL_BIRTH(bp));
+		    BP_GET_PHYSICAL_BIRTH(bp));
 		err = SET_ERROR(EIO);
 		goto early_unlock;
 	}
@@ -2154,6 +2160,12 @@ dbuf_redirty(dbuf_dirty_record_t *dr)
 			ASSERT(arc_released(db->db_buf));
 			arc_buf_thaw(db->db_buf);
 		}
+
+		/*
+		 * Clear the rewrite flag since this is now a logical
+		 * modification.
+		 */
+		dr->dt.dl.dr_rewrite = B_FALSE;
 	}
 }
@@ -2701,6 +2713,38 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 	dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
 }

+void
+dmu_buf_will_rewrite(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	ASSERT(tx->tx_txg != 0);
+	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+
+	/*
+	 * If the dbuf is already dirty in this txg, it will be written
+	 * anyway, so there's nothing to do.
+	 */
+	mutex_enter(&db->db_mtx);
+	if (dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+		mutex_exit(&db->db_mtx);
+		return;
+	}
+	mutex_exit(&db->db_mtx);
+
+	/*
+	 * The dbuf is not dirty, so we need to make it dirty and
+	 * mark it for rewrite (preserve logical birth time).
+	 */
+	dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
+
+	mutex_enter(&db->db_mtx);
+	dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+	if (dr != NULL && db->db_level == 0)
+		dr->dt.dl.dr_rewrite = B_TRUE;
+	mutex_exit(&db->db_mtx);
+}
+
 boolean_t
 dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
@@ -4899,7 +4943,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 		dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
 	zio->io_prev_space_delta = delta;

-	if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
+	if (BP_GET_BIRTH(bp) != 0) {
 		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_type) ||
 		    (db->db_blkid == DMU_SPILL_BLKID &&
@@ -5186,7 +5230,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));

 	drica.drica_os = dn->dn_objset;
-	drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
+	drica.drica_blk_birth = BP_GET_BIRTH(bp);
 	drica.drica_tx = tx;
 	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
 	    &drica)) {
@@ -5201,8 +5245,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
 		if (dn->dn_objset != spa_meta_objset(spa)) {
 			dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
 			if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-			    BP_GET_LOGICAL_BIRTH(bp) >
-			    ds->ds_dir->dd_origin_txg) {
+			    BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
 				ASSERT(!BP_IS_EMBEDDED(bp));
 				ASSERT(dsl_dir_is_clone(ds->ds_dir));
 				ASSERT(spa_feature_is_enabled(spa,
@@ -5320,7 +5363,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 	}

 	ASSERT(db->db_level == 0 || data == db->db_buf);
-	ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
+	ASSERT3U(BP_GET_BIRTH(db->db_blkptr), <=, txg);
 	ASSERT(pio);

 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
@@ -5334,6 +5377,24 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);

 	/*
+	 * Set rewrite properties for zfs_rewrite() operations.
+	 */
+	if (db->db_level == 0 && dr->dt.dl.dr_rewrite) {
+		zp.zp_rewrite = B_TRUE;
+
+		/*
+		 * Mark physical rewrite feature for activation.
+		 * This will be activated automatically during dataset sync.
+		 */
+		dsl_dataset_t *ds = os->os_dsl_dataset;
+		if (!dsl_dataset_feature_is_active(ds,
+		    SPA_FEATURE_PHYSICAL_REWRITE)) {
+			ds->ds_feature_activation[
+			    SPA_FEATURE_PHYSICAL_REWRITE] = (void *)B_TRUE;
+		}
+	}
+
+	/*
 	 * We copy the blkptr now (rather than when we instantiate the dirty
 	 * record), because its value can change between open context and
 	 * syncing context.  We do not need to hold dn_struct_rwlock to read
@@ -5403,6 +5464,7 @@ EXPORT_SYMBOL(dbuf_release_bp);
 EXPORT_SYMBOL(dbuf_dirty);
 EXPORT_SYMBOL(dmu_buf_set_crypt_params);
 EXPORT_SYMBOL(dmu_buf_will_dirty);
+EXPORT_SYMBOL(dmu_buf_will_rewrite);
 EXPORT_SYMBOL(dmu_buf_is_dirty);
 EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);
 EXPORT_SYMBOL(dmu_buf_will_not_fill);
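dmu_buf_will_rewrite() is the dirtying primitive behind zfs_rewrite(): it dirties a buffer whose contents are not changing, so that sync context writes the same data to a new location, with dr_rewrite telling dbuf_write() to set zp_rewrite and preserve the block's logical birth time. A hedged sketch of a caller (the loop structure is illustrative, not copied from this commit):

	/* Rewrite already-held level-0 buffers without modifying the data. */
	for (int i = 0; i < numbufs; i++) {
		dmu_buf_will_rewrite(dbp[i], tx);
		/* No memcpy: the existing contents are written back as-is. */
	}

Because dbuf_redirty() clears dr_rewrite, any real modification in the same txg safely turns the rewrite back into an ordinary logical write.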
diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c
index 60cbb7755a7e..e0b9fc3951ff 100644
--- a/sys/contrib/openzfs/module/zfs/ddt.c
+++ b/sys/contrib/openzfs/module/zfs/ddt.c
@@ -724,10 +724,13 @@ ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp)
 		dvas[2] = bp->blk_dva[2];

 	if (ddt_phys_birth(ddp, v) == 0) {
-		if (v == DDT_PHYS_FLAT)
-			ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp);
-		else
-			ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp);
+		if (v == DDT_PHYS_FLAT) {
+			ddp->ddp_flat.ddp_phys_birth =
+			    BP_GET_PHYSICAL_BIRTH(bp);
+		} else {
+			ddp->ddp_trad[v].ddp_phys_birth =
+			    BP_GET_PHYSICAL_BIRTH(bp);
+		}
 	}
 }
@@ -891,14 +894,14 @@ ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp)
 	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
 		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) &&
-		    BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
+		    BP_GET_PHYSICAL_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
 			return (DDT_PHYS_FLAT);
 		}
 	} else /* traditional phys */ {
 		for (int p = 0; p < DDT_PHYS_MAX; p++) {
 			if (DVA_EQUAL(BP_IDENTITY(bp),
 			    &ddp->ddp_trad[p].ddp_dva[0]) &&
-			    BP_GET_BIRTH(bp) ==
+			    BP_GET_PHYSICAL_BIRTH(bp) ==
 			    ddp->ddp_trad[p].ddp_phys_birth) {
 				return (p);
 			}
diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c
index 21c465328134..296e58ef9cd8 100644
--- a/sys/contrib/openzfs/module/zfs/dmu.c
+++ b/sys/contrib/openzfs/module/zfs/dmu.c
@@ -1966,7 +1966,7 @@ dmu_sync_late_arrival_done(zio_t *zio)
 			blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
 			ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
 			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
-			ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg);
+			ASSERT(BP_GET_BIRTH(zio->io_bp) == zio->io_txg);
 			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
 			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
 		}
@@ -2508,6 +2508,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 	zp->zp_encrypt = encrypt;
 	zp->zp_byteorder = ZFS_HOST_BYTEORDER;
 	zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE;
+	zp->zp_rewrite = B_FALSE;
 	memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
 	memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
 	memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
@@ -2655,11 +2656,12 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 		 * operation into ZIL, or it may be impossible to replay, since
 		 * the block may appear not yet allocated at that point.
 		 */
-		if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
+		if (BP_GET_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
-		if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
+		if (BP_GET_PHYSICAL_BIRTH(bp) >
+		    spa_last_synced_txg(os->os_spa)) {
 			error = SET_ERROR(EAGAIN);
 			goto out;
 		}
@@ -2731,7 +2733,8 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 		if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) {
 			if (!BP_IS_EMBEDDED(bp)) {
 				BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg,
-				    BP_GET_BIRTH(bp));
+				    BP_GET_PHYSICAL_BIRTH(bp));
+				BP_SET_REWRITE(&dl->dr_overridden_by, 0);
 			} else {
 				BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by,
 				    dr->dr_txg);
diff --git a/sys/contrib/openzfs/module/zfs/dmu_diff.c b/sys/contrib/openzfs/module/zfs/dmu_diff.c
index 86f751e886c9..fb13b2f87f57 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_diff.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_diff.c
@@ -224,8 +224,8 @@ dmu_diff(const char *tosnap_name, const char *fromsnap_name,
 	 * call the ZFS_IOC_OBJ_TO_STATS ioctl.
 	 */
 	error = traverse_dataset(tosnap, fromtxg,
-	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT,
-	    diff_cb, &da);
+	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT |
+	    TRAVERSE_LOGICAL, diff_cb, &da);

 	if (error != 0) {
 		da.da_err = error;
diff --git a/sys/contrib/openzfs/module/zfs/dmu_direct.c b/sys/contrib/openzfs/module/zfs/dmu_direct.c
index 12b0ffa2c99b..930ff101eca3 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_direct.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_direct.c
@@ -104,7 +104,7 @@ dmu_write_direct_done(zio_t *zio)
 	dmu_sync_done(zio, NULL, zio->io_private);

 	if (zio->io_error != 0) {
-		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
+		if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
 			ASSERT3U(zio->io_error, ==, EIO);

 		/*
diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c
index b3f792e4ae6b..c135f620800f 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_objset.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c
@@ -345,12 +345,6 @@ smallblk_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;

-	/*
-	 * Inheritance and range checking should have been done by now.
-	 */
-	ASSERT(newval <= SPA_MAXBLOCKSIZE);
-	ASSERT(ISP2(newval));
-
 	os->os_zpl_special_smallblock = newval;
 }
@@ -1376,7 +1370,7 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
 	    6, ZFS_SPACE_CHECK_NORMAL);

 	if (rv == 0)
-		zvol_create_minor(name);
+		zvol_create_minors(name);

 	crfree(cr);
diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c
index 3a4bd7a1cea9..73227b58c140 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_recv.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c
@@ -1403,7 +1403,7 @@ corrective_read_done(zio_t *zio)
 	/* Corruption corrected; update error log if needed */
 	if (zio->io_error == 0) {
 		spa_remove_error(data->spa, &data->zb,
-		    BP_GET_LOGICAL_BIRTH(zio->io_bp));
+		    BP_GET_PHYSICAL_BIRTH(zio->io_bp));
 	}
 	kmem_free(data, sizeof (cr_cb_data_t));
 	abd_free(zio->io_abd);
@@ -1530,7 +1530,7 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
 	}
 	rrd->abd = abd;

-	io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp,
+	io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_BIRTH(bp), bp,
 	    abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags,
 	    &zb);
@@ -3831,11 +3831,11 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
 			nvlist_free(drc->drc_keynvl);
 	} else if (!drc->drc_heal) {
 		if (drc->drc_newfs) {
-			zvol_create_minor(drc->drc_tofs);
+			zvol_create_minors(drc->drc_tofs);
 		}
 		char *snapname = kmem_asprintf("%s@%s",
 		    drc->drc_tofs, drc->drc_tosnap);
-		zvol_create_minor(snapname);
+		zvol_create_minors(snapname);
 		kmem_strfree(snapname);
 	}
diff --git a/sys/contrib/openzfs/module/zfs/dmu_redact.c b/sys/contrib/openzfs/module/zfs/dmu_redact.c
index 65443d112f27..9226ac9e4b80 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_redact.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_redact.c
@@ -370,8 +370,8 @@ redact_traverse_thread(void *arg)
#endif

 	err = traverse_dataset_resume(rt_arg->ds, rt_arg->txg,
-	    &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
-	    redact_cb, rt_arg);
+	    &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
+	    TRAVERSE_LOGICAL, redact_cb, rt_arg);

 	if (err != EINTR)
 		rt_arg->error_code = err;
diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c
index 4f27f3df0e55..deeba29e159a 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_send.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_send.c
@@ -1084,7 +1084,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 	 */
 	if (sta->os->os_encrypted &&
 	    !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
-		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
+		spa_log_error(spa, zb, BP_GET_PHYSICAL_BIRTH(bp));
 		return (SET_ERROR(EIO));
 	}
@@ -1210,7 +1210,7 @@ send_traverse_thread(void *arg)

 	err = traverse_dataset_resume(st_arg->os->os_dsl_dataset,
 	    st_arg->fromtxg, &st_arg->resume,
-	    st_arg->flags, send_cb, st_arg);
+	    st_arg->flags | TRAVERSE_LOGICAL, send_cb, st_arg);

 	if (err != EINTR)
 		st_arg->error_code = err;
diff --git a/sys/contrib/openzfs/module/zfs/dmu_traverse.c b/sys/contrib/openzfs/module/zfs/dmu_traverse.c
index f534a7dd64e3..dd1df1705040 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_traverse.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_traverse.c
@@ -74,6 +74,15 @@ static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp,
 static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
     uint64_t objset, uint64_t object);

+static inline uint64_t
+get_birth_time(traverse_data_t *td, const blkptr_t *bp)
+{
+	if (td->td_flags & TRAVERSE_LOGICAL)
+		return (BP_GET_LOGICAL_BIRTH(bp));
+	else
+		return (BP_GET_BIRTH(bp));
+}
+
 static int
 traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
     uint64_t claim_txg)
@@ -85,7 +94,7 @@ traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
 		return (0);

 	if (claim_txg == 0 &&
-	    BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa))
+	    get_birth_time(td, bp) >= spa_min_claim_txg(td->td_spa))
 		return (-1);

 	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -110,7 +119,7 @@ traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
 		if (BP_IS_HOLE(bp))
 			return (0);

-		if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
+		if (claim_txg == 0 || get_birth_time(td, bp) < claim_txg)
 			return (0);

 		ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
@@ -194,7 +203,7 @@ traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
 	 */
 	if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE)
 		return (B_FALSE);
-	if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg)
+	if (BP_IS_HOLE(bp) || get_birth_time(td, bp) <= td->td_min_txg)
 		return (B_FALSE);
 	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
 		return (B_FALSE);
@@ -265,7 +274,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
 		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
 			return (0);
-	} else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) {
+	} else if (get_birth_time(td, bp) <= td->td_min_txg) {
 		return (0);
 	}
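The new TRAVERSE_LOGICAL flag chooses which birth time prunes the traversal. A worked example (hedged; the semantics are inferred from this diff): take a block logically written in txg 100 and later physically rewritten in txg 500.

	blkptr_t *bp = /* hypothetical rewritten block */;
	/* txg of the last change to the user-visible data */
	uint64_t lbirth = BP_GET_LOGICAL_BIRTH(bp);	/* e.g. 100 */
	/* txg the bits were last actually written out */
	uint64_t birth = BP_GET_BIRTH(bp);		/* e.g. 500 */

An incremental send/diff/redact from a snapshot at txg 200 passes TRAVERSE_LOGICAL, so get_birth_time() returns 100 <= 200 and the block is skipped: its contents never changed and must not appear in the stream. A consumer working from physical times (e.g. a scrub of txgs above 200) still sees 500 > 200 and visits the newly written copy. This is why dmu_diff.c, dmu_redact.c, and dmu_send.c above all add TRAVERSE_LOGICAL to their traversal flags.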
diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c
index 904a039edf95..451e1533efa0 100644
--- a/sys/contrib/openzfs/module/zfs/dnode.c
+++ b/sys/contrib/openzfs/module/zfs/dnode.c
@@ -86,6 +86,19 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
#endif /* _KERNEL */

+static char *
+rt_name(dnode_t *dn, const char *name)
+{
+	struct objset *os = dn->dn_objset;
+
+	return (kmem_asprintf("{spa=%s objset=%llu obj=%llu %s}",
+	    spa_name(os->os_spa),
+	    (u_longlong_t)(os->os_dsl_dataset ?
+	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET),
+	    (u_longlong_t)dn->dn_object,
+	    name));
+}
+
 static int
 dbuf_compare(const void *x1, const void *x2)
 {
@@ -2436,8 +2449,10 @@ done:
 	{
 		int txgoff = tx->tx_txg & TXG_MASK;
 		if (dn->dn_free_ranges[txgoff] == NULL) {
-			dn->dn_free_ranges[txgoff] = zfs_range_tree_create(NULL,
-			    ZFS_RANGE_SEG64, NULL, 0, 0);
+			dn->dn_free_ranges[txgoff] =
+			    zfs_range_tree_create_flags(
+			    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+			    ZFS_RT_F_DYN_NAME, rt_name(dn, "dn_free_ranges"));
 		}
 		zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
 		zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
diff --git a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
index e301fe19f645..fdc8b7b198f0 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
@@ -1523,7 +1523,7 @@ dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 		 * If the block was live (referenced) at the time of this
 		 * bookmark, add its space to the bookmark's FBN.
 		 */
-		if (BP_GET_LOGICAL_BIRTH(bp) <=
+		if (BP_GET_BIRTH(bp) <=
 		    dbn->dbn_phys.zbm_creation_txg &&
 		    (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
 			mutex_enter(&dbn->dbn_lock);
diff --git a/sys/contrib/openzfs/module/zfs/dsl_crypt.c b/sys/contrib/openzfs/module/zfs/dsl_crypt.c
index db568f42d24e..6b6bb8d45b6b 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_crypt.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_crypt.c
@@ -866,7 +866,7 @@ spa_keystore_load_wkey(const char *dsname, dsl_crypto_params_t *dcp,
 	dsl_pool_rele(dp, FTAG);

 	/* create any zvols under this ds */
-	zvol_create_minors_recursive(dsname);
+	zvol_create_minors(dsname);

 	return (0);
diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
index c0a7872c40ad..b767c9641419 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
@@ -159,7 +159,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 		return;
 	}

-	ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
+	ASSERT3U(BP_GET_BIRTH(bp), >,
 	    dsl_dataset_phys(ds)->ds_prev_snap_txg);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	mutex_enter(&ds->ds_lock);
@@ -194,7 +194,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 	 * they do not need to be freed.
 	 */
 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-	    BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
+	    BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
 	    !(BP_IS_EMBEDDED(bp))) {
 		ASSERT(dsl_dir_is_clone(ds->ds_dir));
 		ASSERT(spa_feature_is_enabled(spa,
@@ -263,7 +263,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 		return (0);

 	ASSERT(dmu_tx_is_syncing(tx));
-	ASSERT(BP_GET_LOGICAL_BIRTH(bp) <= tx->tx_txg);
+	ASSERT(BP_GET_BIRTH(bp) <= tx->tx_txg);

 	if (ds == NULL) {
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
@@ -281,7 +281,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 	 * they do not need to be freed.
 	 */
 	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-	    BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
+	    BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
 	    !(BP_IS_EMBEDDED(bp))) {
 		ASSERT(dsl_dir_is_clone(ds->ds_dir));
 		ASSERT(spa_feature_is_enabled(spa,
@@ -289,7 +289,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 		bplist_append(&ds->ds_dir->dd_pending_frees, bp);
 	}

-	if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+	if (BP_GET_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 		int64_t delta;

 		/*
@@ -346,14 +346,14 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
 		/* if (logical birth > prev prev snap txg) prev unique += bs */
 		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
-		    ds->ds_object && BP_GET_LOGICAL_BIRTH(bp) >
+		    ds->ds_object && BP_GET_BIRTH(bp) >
 		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			mutex_enter(&ds->ds_prev->ds_lock);
 			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
 			mutex_exit(&ds->ds_prev->ds_lock);
 		}
-		if (BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
+		if (BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
 			dsl_dir_transfer_space(ds->ds_dir, used,
 			    DD_USED_HEAD, DD_USED_SNAP, tx);
 		}
@@ -2005,7 +2005,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
 	if (error == 0) {
 		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(snaps, pair)) {
-			zvol_create_minor(nvpair_name(pair));
+			zvol_create_minors(nvpair_name(pair));
 		}
 	}
@@ -2944,7 +2944,7 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
 	if (snap == NULL)
 		return (B_FALSE);
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-	birth = BP_GET_LOGICAL_BIRTH(dsl_dataset_get_blkptr(ds));
+	birth = BP_GET_BIRTH(dsl_dataset_get_blkptr(ds));
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
 		objset_t *os, *os_snap;
@@ -3413,7 +3413,7 @@ dsl_dataset_clone(const char *clone, const char *origin)
 	    6, ZFS_SPACE_CHECK_NORMAL);

 	if (rv == 0)
-		zvol_create_minor(clone);
+		zvol_create_minors(clone);

 	crfree(cr);
diff --git a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
index 3113d932fb68..9ffc998ac173 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
@@ -484,7 +484,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
 	dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
 	dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);

-	dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp);
+	dle_tofind.dle_mintxg = BP_GET_BIRTH(bp);
 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 	if (dle == NULL)
 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
@@ -493,7 +493,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,

 	if (dle == NULL) {
 		zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",
-		    bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp));
+		    bp, (longlong_t)BP_GET_BIRTH(bp));
 		dle = avl_first(&dl->dl_tree);
 	}
diff --git a/sys/contrib/openzfs/module/zfs/dsl_destroy.c b/sys/contrib/openzfs/module/zfs/dsl_destroy.c
index f5ec93b2dc5c..fff49c97f4d2 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_destroy.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_destroy.c
@@ -133,11 +133,11 @@ process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
 	ASSERT(!BP_IS_HOLE(bp));

-	if (BP_GET_LOGICAL_BIRTH(bp) <=
+	if (BP_GET_BIRTH(bp) <=
 	    dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
 		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
 		if (poa->ds_prev && !poa->after_branch_point &&
-		    BP_GET_LOGICAL_BIRTH(bp) >
+		    BP_GET_BIRTH(bp) >
 		    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
 			dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
 			    bp_get_dsize_sync(dp->dp_spa, bp);
@@ -315,8 +315,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-	ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
-	    tx->tx_txg);
+	ASSERT3U(BP_GET_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, tx->tx_txg);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	ASSERT(zfs_refcount_is_zero(&ds->ds_longholds));
@@ -730,7 +729,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
 	} else {
 		ASSERT(zilog == NULL);
-		ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
+		ASSERT3U(BP_GET_BIRTH(bp), >,
 		    dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
 		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
 	}
@@ -1020,8 +1019,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
 	ASSERT(ds->ds_prev == NULL ||
 	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-	ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
-	    tx->tx_txg);
+	ASSERT3U(BP_GET_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, tx->tx_txg);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c
index f1088d87208b..4f1f66b835f2 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_pool.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c
@@ -1056,7 +1056,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 	 * will be wrong.
 	 */
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
-	ASSERT0(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(prev)->ds_bp));
+	ASSERT0(BP_GET_BIRTH(&dsl_dataset_phys(prev)->ds_bp));
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);

 	/* The origin doesn't get attached to itself */
diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c
index 1b2cd3e361d1..5052992d775c 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_scan.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c
@@ -454,7 +454,7 @@ static inline void
 bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
 {
 	sio->sio_blk_prop = bp->blk_prop;
-	sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp);
+	sio->sio_phys_birth = BP_GET_RAW_PHYSICAL_BIRTH(bp);
 	sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp);
 	sio->sio_cksum = bp->blk_cksum;
 	sio->sio_nr_dvas = BP_GET_NDVAS(bp);
@@ -1768,7 +1768,7 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
 	ASSERT(!BP_IS_REDACTED(bp));
 	if (BP_IS_HOLE(bp) ||
-	    BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
+	    BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
 		return (0);

 	/*
@@ -1778,7 +1778,7 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
 	 * scrub there's nothing to do to it).
 	 */
 	if (claim_txg == 0 &&
-	    BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa))
+	    BP_GET_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa))
 		return (0);

 	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
@@ -1804,7 +1804,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
 		ASSERT(!BP_IS_REDACTED(bp));
 		if (BP_IS_HOLE(bp) ||
-		    BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
+		    BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
 			return (0);

 		/*
@@ -1812,7 +1812,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
 		 * already txg sync'ed (but this log block contains
 		 * other records that are not synced)
 		 */
-		if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
+		if (claim_txg == 0 || BP_GET_BIRTH(bp) < claim_txg)
 			return (0);

 		ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
@@ -1952,7 +1952,7 @@ dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
 		return;

 	if (BP_IS_HOLE(bp) ||
-	    BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg ||
+	    BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg ||
 	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
 	    BP_GET_TYPE(bp) != DMU_OT_OBJSET))
 		return;
@@ -2223,7 +2223,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 		if (dnp != NULL &&
 		    dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) {
 			scn->scn_phys.scn_errors++;
-			spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
+			spa_log_error(spa, zb, BP_GET_PHYSICAL_BIRTH(bp));
 			return (SET_ERROR(EINVAL));
 		}
@@ -2319,7 +2319,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
 		 * by arc_read() for the cases above.
 		 */
 		scn->scn_phys.scn_errors++;
-		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
+		spa_log_error(spa, zb, BP_GET_PHYSICAL_BIRTH(bp));
 		return (SET_ERROR(EINVAL));
 	}
@@ -2396,7 +2396,12 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
 	if (f != SPA_FEATURE_NONE)
 		ASSERT(dsl_dataset_feature_is_active(ds, f));

-	if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) {
+	/*
+	 * Recurse any blocks that were written either logically or physically
+	 * at or after cur_min_txg.  Logical birth matters for the traversal
+	 * (finding any changes); physical birth matters for the actual scan.
+	 */
+	if (BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) {
 		scn->scn_lt_min_this_txg++;
 		return;
 	}
@@ -2422,7 +2427,7 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
 	 * Don't scan it now unless we need to because something
 	 * under it was modified.
 	 */
-	if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
+	if (BP_GET_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
 		scn->scn_gt_max_this_txg++;
 		return;
 	}
@@ -4806,7 +4811,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
-	uint64_t phys_birth = BP_GET_BIRTH(bp);
+	uint64_t phys_birth = BP_GET_PHYSICAL_BIRTH(bp);
 	size_t psize = BP_GET_PSIZE(bp);
 	boolean_t needs_io = B_FALSE;
 	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c
index 43b94eba2d58..0e5f09b2724c 100644
--- a/sys/contrib/openzfs/module/zfs/metaslab.c
+++ b/sys/contrib/openzfs/module/zfs/metaslab.c
@@ -375,6 +375,16 @@ static metaslab_stats_t metaslab_stats = {
#define	METASLABSTAT_BUMP(stat) \
	atomic_inc_64(&metaslab_stats.stat.value.ui64);

+char *
+metaslab_rt_name(metaslab_group_t *mg, metaslab_t *ms, const char *name)
+{
+	return (kmem_asprintf("{spa=%s vdev_guid=%llu ms_id=%llu %s}",
+	    spa_name(mg->mg_vd->vdev_spa),
+	    (u_longlong_t)mg->mg_vd->vdev_guid,
+	    (u_longlong_t)ms->ms_id,
+	    name));
+}
+

 static kstat_t *metaslab_ksp;
@@ -750,7 +760,8 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
 		}

 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
-		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
+		    mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));

 		for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++)
 			mc_hist[i] += mg->mg_histogram[i];
@@ -1183,14 +1194,16 @@ metaslab_group_passivate(metaslab_group_t *mg)
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
-			    metaslab_weight_from_range_tree(msp));
+			    metaslab_weight(msp, B_TRUE) &
+			    ~METASLAB_ACTIVE_MASK);
 			mutex_exit(&msp->ms_lock);
 		}
 		msp = mga->mga_secondary;
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
-			    metaslab_weight_from_range_tree(msp));
+			    metaslab_weight(msp, B_TRUE) &
+			    ~METASLAB_ACTIVE_MASK);
 			mutex_exit(&msp->ms_lock);
 		}
 	}
@@ -1288,7 +1301,8 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 	mutex_enter(&mc->mc_lock);
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
-		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
+		    mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));
 		mg->mg_histogram[i + ashift] +=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
 		mc->mc_histogram[i + ashift] +=
@@ -1316,7 +1330,8 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 		ASSERT3U(mc->mc_histogram[i + ashift], >=,
 		    msp->ms_sm->sm_phys->smp_histogram[i]);
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
-		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
+		    mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));

 		mg->mg_histogram[i + ashift] -=
 		    msp->ms_sm->sm_phys->smp_histogram[i];
@@ -2895,30 +2910,43 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
 	zfs_range_seg_type_t type =
 	    metaslab_calculate_range_tree_type(vd, ms, &start, &shift);

-	ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start,
-	    shift);
+	ms->ms_allocatable = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_allocatable"));
 	for (int t = 0; t < TXG_SIZE; t++) {
-		ms->ms_allocating[t] = zfs_range_tree_create(NULL, type,
-		    NULL, start, shift);
-	}
-	ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift);
-	ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift);
+		ms->ms_allocating[t] = zfs_range_tree_create_flags(
+		    NULL, type, NULL, start, shift,
+		    ZFS_RT_F_DYN_NAME,
+		    metaslab_rt_name(mg, ms, "ms_allocating"));
+	}
+	ms->ms_freeing = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freeing"));
+	ms->ms_freed = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freed"));
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-		ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL,
-		    start, shift);
+		ms->ms_defer[t] = zfs_range_tree_create_flags(
+		    NULL, type, NULL, start, shift,
+		    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_defer"));
 	}
-	ms->ms_checkpointing =
-	    zfs_range_tree_create(NULL, type, NULL, start, shift);
-	ms->ms_unflushed_allocs =
-	    zfs_range_tree_create(NULL, type, NULL, start, shift);
+	ms->ms_checkpointing = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_checkpointing"));
+	ms->ms_unflushed_allocs = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_allocs"));

 	metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
 	mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
-	ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops,
-	    type, mrap, start, shift);
+	ms->ms_unflushed_frees = zfs_range_tree_create_flags(
+	    &metaslab_rt_ops, type, mrap, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_frees"));

-	ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift);
+	ms->ms_trim = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_trim"));

 	metaslab_group_add(mg, ms);
 	metaslab_set_fragmentation(ms, B_FALSE);
@@ -3892,7 +3920,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 	type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
 	    &start, &shift);

-	condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift);
+	condense_tree = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME,
+	    metaslab_rt_name(msp->ms_group, msp, "condense_tree"));

 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		zfs_range_tree_walk(msp->ms_defer[t],
@@ -3949,8 +3980,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 	 * followed by FREES (due to space_map_write() in metaslab_sync()) for
 	 * sync pass 1.
 	 */
-	zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL,
-	    start, shift);
+	zfs_range_tree_t *tmp_tree = zfs_range_tree_create_flags(
+	    NULL, type, NULL, start, shift,
+	    ZFS_RT_F_DYN_NAME,
+	    metaslab_rt_name(msp->ms_group, msp, "tmp_tree"));
 	zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
 	space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
 	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
@@ -5199,29 +5232,16 @@ next:

 	/*
 	 * We were unable to allocate from this metaslab so determine
-	 * a new weight for this metaslab. Now that we have loaded
-	 * the metaslab we can provide a better hint to the metaslab
-	 * selector.
-	 *
-	 * For space-based metaslabs, we use the maximum block size.
-	 * This information is only available when the metaslab
-	 * is loaded and is more accurate than the generic free
-	 * space weight that was calculated by metaslab_weight().
-	 * This information allows us to quickly compare the maximum
-	 * available allocation in the metaslab to the allocation
-	 * size being requested.
-	 *
-	 * For segment-based metaslabs, determine the new weight
-	 * based on the highest bucket in the range tree. We
-	 * explicitly use the loaded segment weight (i.e. the range
-	 * tree histogram) since it contains the space that is
-	 * currently available for allocation and is accurate
-	 * even within a sync pass.
+	 * a new weight for this metaslab. The weight was last
+	 * recalculated either when we loaded it (if this is the first
+	 * TXG it's been loaded in), or the last time a txg was synced
+	 * out.
 	 */
 	uint64_t weight;
 	if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
-		weight = metaslab_largest_allocatable(msp);
-		WEIGHT_SET_SPACEBASED(weight);
+		metaslab_set_fragmentation(msp, B_TRUE);
+		weight = metaslab_space_weight(msp) &
+		    ~METASLAB_ACTIVE_MASK;
 	} else {
 		weight = metaslab_weight_from_range_tree(msp);
 	}
@@ -5233,13 +5253,6 @@ next:
 		 * For the case where we use the metaslab that is
 		 * active for another allocator we want to make
 		 * sure that we retain the activation mask.
-		 *
-		 * Note that we could attempt to use something like
-		 * metaslab_recalculate_weight_and_sort() that
-		 * retains the activation mask here. That function
-		 * uses metaslab_weight() to set the weight though
-		 * which is not as accurate as the calculations
-		 * above.
 		 */
 		weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
 		metaslab_group_sort(mg, msp, weight);
@@ -5590,7 +5603,21 @@ remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
 	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
 	uint64_t physical_birth = vdev_indirect_births_physbirth(vib,
 	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
-	BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
+
+	/*
+	 * For rewritten blocks, use the old physical birth as the new logical
+	 * birth (representing when the space was allocated) and the removal
+	 * time as the new physical birth (representing when it was actually
+	 * written).
+	 */
+	if (BP_GET_REWRITE(bp)) {
+		uint64_t old_physical_birth = BP_GET_PHYSICAL_BIRTH(bp);
+		ASSERT3U(old_physical_birth, <, physical_birth);
+		BP_SET_BIRTH(bp, old_physical_birth, physical_birth);
+		BP_SET_REWRITE(bp, 0);
+	} else {
+		BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
+	}

 	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
@@ -5757,21 +5784,21 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
 }

 /*
- * Reserve some allocation slots. The reservation system must be called
- * before we call into the allocator. If there aren't any available slots
- * then the I/O will be throttled until an I/O completes and its slots are
- * freed up. The function returns true if it was successful in placing
- * the reservation.
+ * Reserve some space for a future allocation. The reservation system must be
+ * called before we call into the allocator. If there isn't enough space
+ * available, the calling I/O will be throttled until another I/O completes and
+ * its reservation is released. The function returns true if it was successful
+ * in placing the reservation.
 */
 boolean_t
-metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
-    boolean_t must, boolean_t *more)
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int allocator,
+    int copies, uint64_t io_size, boolean_t must, boolean_t *more)
 {
-	metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator];
+	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];

 	ASSERT(mc->mc_alloc_throttle_enabled);
-	if (mc->mc_alloc_io_size < zio->io_size) {
-		mc->mc_alloc_io_size = zio->io_size;
+	if (mc->mc_alloc_io_size < io_size) {
+		mc->mc_alloc_io_size = io_size;
 		metaslab_class_balance(mc, B_FALSE);
 	}
 	if (must || mca->mca_reserved <= mc->mc_alloc_max) {
@@ -5782,10 +5809,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
 		 * worst that can happen is few more I/Os get to allocation
 		 * earlier, that is not a problem.
 		 */
-		int64_t delta = slots * zio->io_size;
+		int64_t delta = copies * io_size;
 		*more = (atomic_add_64_nv(&mca->mca_reserved, delta) <=
 		    mc->mc_alloc_max);
-		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
 		return (B_TRUE);
 	}
 	*more = B_FALSE;
@@ -5793,13 +5819,13 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
 }

 boolean_t
-metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
-    zio_t *zio)
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int allocator,
+    int copies, uint64_t io_size)
 {
-	metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator];
+	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];

 	ASSERT(mc->mc_alloc_throttle_enabled);
-	int64_t delta = slots * zio->io_size;
+	int64_t delta = copies * io_size;
 	return (atomic_add_64_nv(&mca->mca_reserved, -delta) <=
 	    mc->mc_alloc_max);
 }
@@ -5960,7 +5986,7 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	int error = 0;

 	ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
-	ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
+	ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp));

 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
@@ -5975,12 +6001,12 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
 	ASSERT3P(zal, !=, NULL);

-	uint64_t cur_psize = 0;
-
+	uint64_t smallest_psize = UINT64_MAX;
 	for (int d = 0; d < ndvas; d++) {
-		error = metaslab_alloc_dva_range(spa, mc, psize, max_psize,
-		    dva, d, hintdva, txg, flags, zal, allocator,
-		    actual_psize ? &cur_psize : NULL);
+		uint64_t cur_psize = 0;
+		error = metaslab_alloc_dva_range(spa, mc, psize,
+		    MIN(smallest_psize, max_psize), dva, d, hintdva, txg,
+		    flags, zal, allocator, actual_psize ? &cur_psize : NULL);
 		if (error != 0) {
 			for (d--; d >= 0; d--) {
 				metaslab_unalloc_dva(spa, &dva[d], txg);
@@ -6000,13 +6026,13 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 			    DVA_GET_VDEV(&dva[d]), allocator, flags, psize,
 			    tag);
 			if (actual_psize)
-				max_psize = MIN(cur_psize, max_psize);
+				smallest_psize = MIN(cur_psize,
+				    smallest_psize);
 		}
 	}
 	ASSERT(error == 0);
 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
 	if (actual_psize)
-		*actual_psize = max_psize;
+		*actual_psize = smallest_psize;

 	spa_config_exit(spa, SCL_ALLOC, FTAG);
@@ -6022,7 +6048,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 	int ndvas = BP_GET_NDVAS(bp);

 	ASSERT(!BP_IS_HOLE(bp));
-	ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa));
+	ASSERT(!now || BP_GET_BIRTH(bp) >= spa_syncing_txg(spa));

 	/*
 	 * If we have a checkpoint for the pool we need to make sure that
@@ -6040,7 +6066,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 	 * normally as they will be referenced by the checkpointed uberblock.
 	 */
 	boolean_t checkpoint = B_FALSE;
-	if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
+	if (BP_GET_BIRTH(bp) <= spa->spa_checkpoint_txg &&
 	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
 		/*
 		 * At this point, if the block is part of the checkpoint
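metaslab_class_throttle_reserve() now accounts raw bytes (copies * io_size) per allocator rather than abstract zio "slots", and it no longer stores state in the zio, so reserve and unreserve must be paired by the caller with matching arguments. A hedged usage sketch (values illustrative):

	boolean_t more;
	if (metaslab_class_throttle_reserve(mc, allocator, ndvas, psize,
	    B_FALSE, &more)) {
		/* ... allocate and issue the write ... */
		(void) metaslab_class_throttle_unreserve(mc, allocator,
		    ndvas, psize);
	} else {
		/* Throttled: wait for another I/O to release its bytes. */
	}

Passing must = B_TRUE forces the reservation through even when mca_reserved already exceeds mc_alloc_max, which the in-function comment treats as a harmless race.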
diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c
index 373636c69254..fc2b17606bd2 100644
--- a/sys/contrib/openzfs/module/zfs/range_tree.c
+++ b/sys/contrib/openzfs/module/zfs/range_tree.c
@@ -201,10 +201,10 @@ ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
 ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
     zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)

-zfs_range_tree_t *
-zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
+static zfs_range_tree_t *
+zfs_range_tree_create_impl(const zfs_range_tree_ops_t *ops,
     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
-    uint64_t gap)
+    uint64_t gap, uint64_t flags, const char *name)
 {
 	zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t),
 	    KM_SLEEP);
@@ -236,6 +236,8 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,

 	rt->rt_ops = ops;
 	rt->rt_gap = gap;
+	rt->rt_flags = flags;
+	rt->rt_name = name;
 	rt->rt_arg = arg;
 	rt->rt_type = type;
 	rt->rt_start = start;
@@ -248,10 +250,29 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
 }

 zfs_range_tree_t *
+zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
+    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+    uint64_t gap)
+{
+	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, gap,
+	    0, NULL));
+}
+
+zfs_range_tree_t *
 zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
     zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
 {
-	return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0));
+	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
+	    0, NULL));
+}
+
+zfs_range_tree_t *
+zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
+    zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+    uint64_t flags, const char *name)
+{
+	return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
+	    flags, name));
 }

 void
@@ -262,6 +283,9 @@ zfs_range_tree_destroy(zfs_range_tree_t *rt)
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
 		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);

+	if (rt->rt_name != NULL && (rt->rt_flags & ZFS_RT_F_DYN_NAME))
+		kmem_strfree((char *)(uintptr_t)rt->rt_name);
+
 	zfs_btree_destroy(&rt->rt_root);
 	kmem_free(rt, sizeof (*rt));
 }
@@ -271,15 +295,17 @@ zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
    int64_t delta)
 {
 	if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) {
-		zfs_panic_recover("zfs: attempting to decrease fill to or "
-		    "below 0; probable double remove in segment [%llx:%llx]",
+		zfs_panic_recover("zfs: rt=%s: attempting to decrease fill to "
+		    "or below 0; probable double remove in segment [%llx:%llx]",
+		    ZFS_RT_NAME(rt),
 		    (longlong_t)zfs_rs_get_start(rs, rt),
 		    (longlong_t)zfs_rs_get_end(rs, rt));
 	}
 	if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
 	    zfs_rs_get_start(rs, rt)) {
-		zfs_panic_recover("zfs: attempting to increase fill beyond "
-		    "max; probable double add in segment [%llx:%llx]",
+		zfs_panic_recover("zfs: rt=%s: attempting to increase fill "
+		    "beyond max; probable double add in segment [%llx:%llx]",
+		    ZFS_RT_NAME(rt),
 		    (longlong_t)zfs_rs_get_start(rs, rt),
 		    (longlong_t)zfs_rs_get_end(rs, rt));
 	}
@@ -319,14 +345,17 @@ zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
 	 * the normal code paths.
 	 */
 	if (rs != NULL) {
+		uint64_t rstart = zfs_rs_get_start(rs, rt);
+		uint64_t rend = zfs_rs_get_end(rs, rt);
 		if (gap == 0) {
-			zfs_panic_recover("zfs: adding existent segment to "
-			    "range tree (offset=%llx size=%llx)",
-			    (longlong_t)start, (longlong_t)size);
+			zfs_panic_recover("zfs: rt=%s: adding segment "
+			    "(offset=%llx size=%llx) overlapping with existing "
+			    "one (offset=%llx size=%llx)",
+			    ZFS_RT_NAME(rt),
+			    (longlong_t)start, (longlong_t)size,
+			    (longlong_t)rstart, (longlong_t)(rend - rstart));
 			return;
 		}
-		uint64_t rstart = zfs_rs_get_start(rs, rt);
-		uint64_t rend = zfs_rs_get_end(rs, rt);
 		if (rstart <= start && rend >= end) {
 			zfs_range_tree_adjust_fill(rt, rs, fill);
 			return;
@@ -451,6 +480,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
 	zfs_range_seg_t *rs;
 	zfs_range_seg_max_t rsearch, rs_tmp;
 	uint64_t end = start + size;
+	uint64_t rstart, rend;
 	boolean_t left_over, right_over;

 	VERIFY3U(size, !=, 0);
@@ -464,12 +494,15 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,

 	/* Make sure we completely overlap with someone */
 	if (rs == NULL) {
-		zfs_panic_recover("zfs: removing nonexistent segment from "
-		    "range tree (offset=%llx size=%llx)",
-		    (longlong_t)start, (longlong_t)size);
+		zfs_panic_recover("zfs: rt=%s: removing nonexistent segment "
+		    "from range tree (offset=%llx size=%llx)",
+		    ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size);
 		return;
 	}

+	rstart = zfs_rs_get_start(rs, rt);
+	rend = zfs_rs_get_end(rs, rt);
+
 	/*
 	 * Range trees with gap support must only remove complete segments
 	 * from the tree.  This allows us to maintain accurate fill accounting
@@ -479,31 +512,36 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
 	if (rt->rt_gap != 0) {
 		if (do_fill) {
 			if (zfs_rs_get_fill(rs, rt) == size) {
-				start = zfs_rs_get_start(rs, rt);
-				end = zfs_rs_get_end(rs, rt);
+				start = rstart;
+				end = rend;
 				size = end - start;
 			} else {
 				zfs_range_tree_adjust_fill(rt, rs, -size);
 				return;
 			}
-		} else if (zfs_rs_get_start(rs, rt) != start ||
-		    zfs_rs_get_end(rs, rt) != end) {
-			zfs_panic_recover("zfs: freeing partial segment of "
-			    "gap tree (offset=%llx size=%llx) of "
+		} else if (rstart != start || rend != end) {
+			zfs_panic_recover("zfs: rt=%s: freeing partial segment "
+			    "of gap tree (offset=%llx size=%llx) of "
 			    "(offset=%llx size=%llx)",
+			    ZFS_RT_NAME(rt),
 			    (longlong_t)start, (longlong_t)size,
-			    (longlong_t)zfs_rs_get_start(rs, rt),
-			    (longlong_t)zfs_rs_get_end(rs, rt) -
-			    zfs_rs_get_start(rs, rt));
+			    (longlong_t)rstart, (longlong_t)(rend - rstart));
 			return;
 		}
 	}

-	VERIFY3U(zfs_rs_get_start(rs, rt), <=, start);
-	VERIFY3U(zfs_rs_get_end(rs, rt), >=, end);
+	if (!(rstart <= start && rend >= end)) {
+		panic("zfs: rt=%s: removing segment "
+		    "(offset=%llx size=%llx) not completely overlapped by "
+		    "existing one (offset=%llx size=%llx)",
+		    ZFS_RT_NAME(rt),
+		    (longlong_t)start, (longlong_t)size,
+		    (longlong_t)rstart, (longlong_t)(rend - rstart));
+		return;
+	}

-	left_over = (zfs_rs_get_start(rs, rt) != start);
-	right_over = (zfs_rs_get_end(rs, rt) != end);
+	left_over = (rstart != start);
+	right_over = (rend != end);

 	zfs_range_tree_stat_decr(rt, rs);
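The range_tree.c changes thread an optional name through every range tree so the zfs_panic_recover() messages above can identify the offending tree. With ZFS_RT_F_DYN_NAME the tree takes ownership of a kmem_asprintf()'d string and frees it on destroy. A hedged sketch using the dnode.c helper added earlier in this patch:

	zfs_range_tree_t *rt = zfs_range_tree_create_flags(
	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
	    ZFS_RT_F_DYN_NAME, rt_name(dn, "dn_free_ranges"));
	/* ... use the tree ... */
	zfs_range_tree_destroy(rt);	/* also frees the dynamic name */

A static string could instead be passed without ZFS_RT_F_DYN_NAME, in which case zfs_range_tree_destroy() leaves it alone.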
This allows us to maintain accurate fill accounting @@ -479,31 +512,36 @@ if (rt->rt_gap != 0) { if (do_fill) { if (zfs_rs_get_fill(rs, rt) == size) { - start = zfs_rs_get_start(rs, rt); - end = zfs_rs_get_end(rs, rt); + start = rstart; + end = rend; size = end - start; } else { zfs_range_tree_adjust_fill(rt, rs, -size); return; } - } else if (zfs_rs_get_start(rs, rt) != start || - zfs_rs_get_end(rs, rt) != end) { - zfs_panic_recover("zfs: freeing partial segment of " - "gap tree (offset=%llx size=%llx) of " + } else if (rstart != start || rend != end) { + zfs_panic_recover("zfs: rt=%s: freeing partial segment " + "of gap tree (offset=%llx size=%llx) of " "(offset=%llx size=%llx)", + ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size, - (longlong_t)zfs_rs_get_start(rs, rt), - (longlong_t)zfs_rs_get_end(rs, rt) - - zfs_rs_get_start(rs, rt)); + (longlong_t)rstart, (longlong_t)(rend - rstart)); return; } } - VERIFY3U(zfs_rs_get_start(rs, rt), <=, start); - VERIFY3U(zfs_rs_get_end(rs, rt), >=, end); + if (!(rstart <= start && rend >= end)) { + panic("zfs: rt=%s: removing segment " + "(offset=%llx size=%llx) not completely overlapped by " + "existing one (offset=%llx size=%llx)", + ZFS_RT_NAME(rt), + (longlong_t)start, (longlong_t)size, + (longlong_t)rstart, (longlong_t)(rend - rstart)); + return; + } - left_over = (zfs_rs_get_start(rs, rt) != start); - right_over = (zfs_rs_get_end(rs, rt) != end); + left_over = (rstart != start); + right_over = (rend != end); zfs_range_tree_stat_decr(rt, rs); diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index 6b52c6cb1f9e..5ecb175fbd63 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -100,6 +100,7 @@ #include <sys/vmsystm.h> #endif /* _KERNEL */ +#include "zfs_crrd.h" #include "zfs_prop.h" #include "zfs_comutil.h" #include <cityhash.h> @@ -311,6 +312,41 @@ static int zfs_livelist_condense_zthr_cancel = 0; static int zfs_livelist_condense_new_alloc = 0; /* + * How often (in seconds) the current txg should be recorded into the + * database. + * The smallest available resolution is in minutes, which means an update occurs + * each time `spa_note_txg_time` elapses and the txg has changed. We provide + * a 256-slot ring buffer for minute-level resolution. The number is limited by + * the size of the structure we use and the maximum number of bytes we can write + * into ZAP. Setting `spa_note_txg_time` to 10 minutes results in approximately + * 144 records per day. Given the 256 slots, this provides roughly 1.5 days of + * high-resolution data. + * + * The user can decrease `spa_note_txg_time` to increase resolution within + * a day, at the cost of retaining fewer days of data. Alternatively, increasing + * the interval allows storing data over a longer period, but with lower + * frequency. + * + * This parameter does not affect the daily or monthly databases, as those only + * store one record per day and per month, respectively. + */ +static uint_t spa_note_txg_time = 10 * 60; + +/* + * How often to flush the txg database to disk (in seconds). + * We flush data every time we write to it, making it the most reliable option. + * Since this happens every 10 minutes, it shouldn't introduce any noticeable + * overhead for the system. In case of failure, we will always have an + * up-to-date version of the database.
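As a quick sanity check of the retention figures quoted in these comments (my arithmetic, not part of the change), using the defaults:

/*
 *   records per day   = 86400 / 600   = 144
 *   minute ring span  = 256 * 600 s   ~ 1.8 days
 *   day ring span     = 256 days      ~ 8.5 months
 *   month ring span   = 256 months    ~ 21 years
 */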
+ * + * The user can adjust the flush interval to a lower value, but it probably + * doesn't make sense to flush more often than the database is updated. + * The user can also increase the interval if they're concerned about the + * performance of writing the entire database to disk. + */ +static uint_t spa_flush_txg_time = 10 * 60; + +/* * ========================================================================== * SPA properties routines * ========================================================================== @@ -417,11 +453,15 @@ spa_prop_get_config(spa_t *spa, nvlist_t *nv) alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); + alloc += metaslab_class_get_alloc( + spa_special_embedded_log_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); size += metaslab_class_get_space(spa_embedded_log_class(spa)); + size += metaslab_class_get_space( + spa_special_embedded_log_class(spa)); spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src); @@ -1679,6 +1719,8 @@ spa_activate(spa_t *spa, spa_mode_t mode) "embedded_log", msp, B_TRUE); spa->spa_special_class = metaslab_class_create(spa, "special", msp, B_FALSE); + spa->spa_special_embedded_log_class = metaslab_class_create(spa, + "special_embedded_log", msp, B_TRUE); spa->spa_dedup_class = metaslab_class_create(spa, "dedup", msp, B_FALSE); @@ -1853,6 +1895,9 @@ spa_deactivate(spa_t *spa) metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; + metaslab_class_destroy(spa->spa_special_embedded_log_class); + spa->spa_special_embedded_log_class = NULL; + metaslab_class_destroy(spa->spa_dedup_class); spa->spa_dedup_class = NULL; @@ -2031,6 +2076,111 @@ spa_destroy_aux_threads(spa_t *spa) } } +static void +spa_sync_time_logger(spa_t *spa, uint64_t txg) +{ + uint64_t curtime; + dmu_tx_t *tx; + + if (!spa_writeable(spa)) { + return; + } + curtime = gethrestime_sec(); + if (curtime < spa->spa_last_noted_txg_time + spa_note_txg_time) { + return; + } + + if (txg > spa->spa_last_noted_txg) { + spa->spa_last_noted_txg_time = curtime; + spa->spa_last_noted_txg = txg; + + mutex_enter(&spa->spa_txg_log_time_lock); + dbrrd_add(&spa->spa_txg_log_time, curtime, txg); + mutex_exit(&spa->spa_txg_log_time_lock); + } + + if (curtime < spa->spa_last_flush_txg_time + spa_flush_txg_time) { + return; + } + spa->spa_last_flush_txg_time = curtime; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_minutes, tx)); + VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_days, tx)); + VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_months, tx)); + dmu_tx_commit(tx); +} + +static void +spa_unload_sync_time_logger(spa_t *spa) +{ + uint64_t txg; + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); + + txg = dmu_tx_get_txg(tx); + spa->spa_last_noted_txg_time = 0; + spa->spa_last_flush_txg_time = 0; + 
spa_sync_time_logger(spa, txg); + + dmu_tx_commit(tx); +} + +static void +spa_load_txg_log_time(spa_t *spa) +{ + int error; + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_minutes); + if (error != 0 && error != ENOENT) { + spa_load_note(spa, "unable to load a txg time database with " + "minute resolution [error=%d]", error); + } + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_days); + if (error != 0 && error != ENOENT) { + spa_load_note(spa, "unable to load a txg time database with " + "day resolution [error=%d]", error); + } + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, + &spa->spa_txg_log_time.dbr_months); + if (error != 0 && error != ENOENT) { + spa_load_note(spa, "unable to load a txg time database with " + "month resolution [error=%d]", error); + } +} + +static boolean_t +spa_should_sync_time_logger_on_unload(spa_t *spa) +{ + + if (!spa_writeable(spa)) + return (B_FALSE); + + if (!spa->spa_sync_on) + return (B_FALSE); + + if (spa_state(spa) != POOL_STATE_EXPORTED) + return (B_FALSE); + + if (spa->spa_last_noted_txg == 0) + return (B_FALSE); + + return (B_TRUE); +} + + /* * Opposite of spa_load(). */ @@ -2052,6 +2202,9 @@ spa_unload(spa_t *spa) * we delay the final TXGs beyond what spa_final_txg is set at. */ if (spa->spa_final_txg == UINT64_MAX) { + if (spa_should_sync_time_logger_on_unload(spa)) + spa_unload_sync_time_logger(spa); + /* * If the log space map feature is enabled and the pool is * getting exported (but not destroyed), we want to spend some @@ -2709,8 +2862,8 @@ spa_claim_notify(zio_t *zio) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ - if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp)) - spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp); + if (spa->spa_claim_max_txg < BP_GET_BIRTH(zio->io_bp)) + spa->spa_claim_max_txg = BP_GET_BIRTH(zio->io_bp); mutex_exit(&spa->spa_props_lock); } @@ -3768,20 +3921,17 @@ out: * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool */ if (error == EREMOTEIO) { - const char *hostname = "<unknown>"; - uint64_t hostid = 0; - if (mmp_label) { if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { - hostname = fnvlist_lookup_string(mmp_label, - ZPOOL_CONFIG_HOSTNAME); + const char *hostname = fnvlist_lookup_string( + mmp_label, ZPOOL_CONFIG_HOSTNAME); fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTNAME, hostname); } if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { - hostid = fnvlist_lookup_uint64(mmp_label, - ZPOOL_CONFIG_HOSTID); + uint64_t hostid = fnvlist_lookup_uint64( + mmp_label, ZPOOL_CONFIG_HOSTID); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTID, hostid); } @@ -4711,6 +4861,9 @@ spa_ld_get_props(spa_t *spa) if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* Load time log */ + spa_load_txg_log_time(spa); + /* * Load the persistent error log. If we have an older pool, this will * not be present. 
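Taken together with the flush in spa_sync_time_logger() above, each resolution ring round-trips through its own MOS ZAP attribute, so a missing ring (for example on a pool that predates the feature) only logs a note and leaves that ring empty. A hypothetical round-trip of the minute ring, using names from this diff:

/* Persist on the sync side... */
VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
    DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
    &spa->spa_txg_log_time.dbr_minutes, tx));
/* ...and reload during spa_load_txg_log_time() on import. */
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
    DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
    &spa->spa_txg_log_time.dbr_minutes);
if (error == ENOENT)
	error = 0;	/* no database yet: expected, ring stays empty */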
@@ -5899,7 +6052,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, } if (firstopen) - zvol_create_minors_recursive(spa_name(spa)); + zvol_create_minors(spa_name(spa)); *spapp = spa; @@ -6877,7 +7030,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) mutex_exit(&spa_namespace_lock); - zvol_create_minors_recursive(pool); + zvol_create_minors(pool); spa_import_os(spa); @@ -7134,6 +7287,9 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, spa_config_exit(spa, SCL_ALL, FTAG); } + if (spa_should_sync_time_logger_on_unload(spa)) + spa_unload_sync_time_logger(spa); + /* * If the log space map feature is enabled and the pool is * getting exported (but not destroyed), we want to spend some @@ -9092,6 +9248,8 @@ spa_async_thread(void *arg) old_space += metaslab_class_get_space(spa_dedup_class(spa)); old_space += metaslab_class_get_space( spa_embedded_log_class(spa)); + old_space += metaslab_class_get_space( + spa_special_embedded_log_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); @@ -9100,6 +9258,8 @@ spa_async_thread(void *arg) new_space += metaslab_class_get_space(spa_dedup_class(spa)); new_space += metaslab_class_get_space( spa_embedded_log_class(spa)); + new_space += metaslab_class_get_space( + spa_special_embedded_log_class(spa)); mutex_exit(&spa_namespace_lock); /* @@ -10180,6 +10340,8 @@ spa_sync(spa_t *spa, uint64_t txg) */ brt_pending_apply(spa, txg); + spa_sync_time_logger(spa, txg); + /* * Lock out configuration changes. */ @@ -10222,6 +10384,7 @@ spa_sync(spa_t *spa, uint64_t txg) dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + @@ -10309,7 +10472,7 @@ spa_sync(spa_t *spa, uint64_t txg) metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); - /* spa_embedded_log_class has only one metaslab per vdev. */ + /* Embedded log classes have only one metaslab per vdev. 
*/ metaslab_class_evict_old(spa->spa_special_class, txg); metaslab_class_evict_old(spa->spa_dedup_class, txg); @@ -11095,6 +11258,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); +ZFS_MODULE_PARAM(zfs_spa, spa_, note_txg_time, UINT, ZMOD_RW, + "How frequently TXG timestamps are stored internally (in seconds)"); + +ZFS_MODULE_PARAM(zfs_spa, spa_, flush_txg_time, UINT, ZMOD_RW, + "How frequently the TXG timestamps database should be flushed " + "to disk (in seconds)"); + #ifdef _KERNEL ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c index 3e08f261fda1..7252fd534bdf 100644 --- a/sys/contrib/openzfs/module/zfs/spa_errlog.c +++ b/sys/contrib/openzfs/module/zfs/spa_errlog.c @@ -253,7 +253,7 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, if (error == 0 && BP_IS_HOLE(&bp)) error = SET_ERROR(ENOENT); - *birth_txg = BP_GET_LOGICAL_BIRTH(&bp); + *birth_txg = BP_GET_PHYSICAL_BIRTH(&bp); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); @@ -885,7 +885,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, if (error == EACCES) error = 0; else if (!error) - zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp); + zep.zb_birth = BP_GET_PHYSICAL_BIRTH(&bp); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index f054e4290bbf..2eba8362a166 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -715,6 +715,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_txg_log_time_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); @@ -903,6 +904,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_vdev_top_lock); mutex_destroy(&spa->spa_feat_stats_lock); mutex_destroy(&spa->spa_activities_lock); + mutex_destroy(&spa->spa_txg_log_time_lock); kmem_free(spa, sizeof (spa_t)); } @@ -1308,6 +1310,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, metaslab_class_validate(spa_log_class(spa)); metaslab_class_validate(spa_embedded_log_class(spa)); metaslab_class_validate(spa_special_class(spa)); + metaslab_class_validate(spa_special_embedded_log_class(spa)); metaslab_class_validate(spa_dedup_class(spa)); spa_config_exit(spa, SCL_ALL, spa); @@ -1896,6 +1899,8 @@ spa_get_slop_space(spa_t *spa) */ uint64_t embedded_log = metaslab_class_get_dspace(spa_embedded_log_class(spa)); + embedded_log += metaslab_class_get_dspace( + spa_special_embedded_log_class(spa)); slop -= MIN(embedded_log, slop >> 1); /* @@ -2001,6 +2006,12 @@ spa_special_class(spa_t *spa) } metaslab_class_t * +spa_special_embedded_log_class(spa_t *spa) +{ + return (spa->spa_special_embedded_log_class); +} + +metaslab_class_t * spa_dedup_class(spa_t *spa) { return (spa->spa_dedup_class); diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 01758b0c54c0..70b14fb9b2c8 100644 --- 
a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -243,6 +243,25 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } +char * +vdev_rt_name(vdev_t *vd, const char *name) +{ + return (kmem_asprintf("{spa=%s vdev_guid=%llu %s}", + spa_name(vd->vdev_spa), + (u_longlong_t)vd->vdev_guid, + name)); +} + +static char * +vdev_rt_name_dtl(vdev_t *vd, const char *name, vdev_dtl_type_t dtl_type) +{ + return (kmem_asprintf("{spa=%s vdev_guid=%llu %s[%d]}", + spa_name(vd->vdev_spa), + (u_longlong_t)vd->vdev_guid, + name, + dtl_type)); +} + /* * Virtual device management. */ @@ -282,12 +301,15 @@ vdev_getops(const char *type) * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a - * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. + * separate log group. For embedded slogs, vdev_log_mg will be non-NULL and + * will point to a metaslab group of either embedded_log_class (for normal + * vdevs) or special_embedded_log_class (for special vdevs). */ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { - if (mc == spa_embedded_log_class(vd->vdev_spa) && + if ((mc == spa_embedded_log_class(vd->vdev_spa) || + mc == spa_special_embedded_log_class(vd->vdev_spa)) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else @@ -692,8 +714,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); - vd->vdev_obsolete_segments = zfs_range_tree_create(NULL, - ZFS_RANGE_SEG64, NULL, 0, 0); + vd->vdev_obsolete_segments = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_obsolete_segments")); /* * Initialize rate limit structs for events. We rate limit ZIO delay @@ -747,8 +770,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { - vd->vdev_dtl[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + vd->vdev_dtl[t] = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name_dtl(vd, "vdev_dtl", t)); } txg_list_create(&vd->vdev_ms_list, spa, @@ -1508,8 +1532,13 @@ vdev_metaslab_group_create(vdev_t *vd) vd->vdev_mg = metaslab_group_create(mc, vd); if (!vd->vdev_islog) { - vd->vdev_log_mg = metaslab_group_create( - spa_embedded_log_class(spa), vd); + if (mc == spa_special_class(spa)) { + vd->vdev_log_mg = metaslab_group_create( + spa_special_embedded_log_class(spa), vd); + } else { + vd->vdev_log_mg = metaslab_group_create( + spa_embedded_log_class(spa), vd); + } } /* @@ -1624,9 +1653,10 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab - * group. + * group. This works for normal and special vdevs. 
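Condensing the class selection that the vdev.c hunks here implement (a restatement for clarity; no such helper exists in the change): the class backing vd->vdev_log_mg now depends on the vdev's own class.

static metaslab_class_t *
embedded_log_class_for(spa_t *spa, vdev_t *vd)
{
	/* Special vdevs keep their embedded slog in the special class. */
	if (vd->vdev_mg->mg_class == spa_special_class(spa))
		return (spa_special_embedded_log_class(spa));
	return (spa_embedded_log_class(spa));
}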
*/ - if (vd->vdev_mg->mg_class == spa_normal_class(spa) && + if ((vd->vdev_mg->mg_class == spa_normal_class(spa) || + vd->vdev_mg->mg_class == spa_special_class(spa)) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; @@ -3449,7 +3479,9 @@ vdev_dtl_load(vdev_t *vd) return (error); ASSERT(vd->vdev_dtl_sm != NULL); - rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + rt = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_dtl_load:rt")); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); @@ -3597,7 +3629,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) ASSERT(vd->vdev_dtl_sm != NULL); } - rtsync = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + rtsync = zfs_range_tree_create_flags(NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "rtsync")); mutex_enter(&vd->vdev_dtl_lock); zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync); diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c index fac2c3a5f154..9fc71fa0e03e 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c @@ -1842,7 +1842,7 @@ vdev_indirect_io_done(zio_t *zio) */ if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { zio->io_error = ret; - zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR; zio_dio_chksum_verify_error_report(zio); ret = 0; } diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c index 4274728578ad..9243c76e810d 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c +++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c @@ -541,8 +541,9 @@ vdev_initialize_thread(void *arg) abd_t *deadbeef = vdev_initialize_block_alloc(); - vd->vdev_initialize_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + vd->vdev_initialize_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_initialize_tree")); for (uint64_t i = 0; !vd->vdev_detached && i < vd->vdev_top->vdev_ms_count; i++) { diff --git a/sys/contrib/openzfs/module/zfs/vdev_mirror.c b/sys/contrib/openzfs/module/zfs/vdev_mirror.c index a6aee9437066..18efdaac006f 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_mirror.c +++ b/sys/contrib/openzfs/module/zfs/vdev_mirror.c @@ -532,7 +532,7 @@ vdev_mirror_child_select(zio_t *zio) uint64_t txg = zio->io_txg; int c, lowest_load; - ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg); + ASSERT(zio->io_bp == NULL || BP_GET_PHYSICAL_BIRTH(zio->io_bp) == txg); lowest_load = INT_MAX; mm->mm_preferred_cnt = 0; @@ -779,7 +779,7 @@ vdev_mirror_io_done(zio_t *zio) * being written out during self healing. 
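One pattern recurs throughout this diff: call sites that compare against a txg range now use BP_GET_BIRTH(), while call sites that identify a particular physical copy (the ARC hash, mirror and raidz placement) use BP_GET_PHYSICAL_BIRTH(). My reading of the assumed relationship, hedged:

/*
 * A copy can only be written at or after the txg in which its
 * contents were logically born; for a never-remapped block the two
 * presumably coincide (assumption, not asserted in this diff).
 */
ASSERT3U(BP_GET_PHYSICAL_BIRTH(bp), >=, BP_GET_LOGICAL_BIRTH(bp));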
*/ if ((zio->io_flags & ZIO_FLAG_DIO_READ) && - (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { + (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) { zio_dio_chksum_verify_error_report(zio); zio->io_error = vdev_mirror_worst_error(mm); ASSERT3U(zio->io_error, ==, ECKSUM); diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index 71c4bfbdaf00..210cdcab1ecc 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -2206,11 +2206,7 @@ vdev_raidz_close(vdev_t *vd) /* * Return the logical width to use, given the txg in which the allocation - * happened. Note that BP_GET_BIRTH() is usually the txg in which the - * BP was allocated. Remapped BP's (that were relocated due to device - * removal, see remap_blkptr_cb()), will have a more recent physical birth - * which reflects when the BP was relocated, but we can ignore these because - * they can't be on RAIDZ (device removal doesn't support RAIDZ). + * happened. */ static uint64_t vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) @@ -2249,10 +2245,9 @@ vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg) vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t psize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vdrz->vd_original_width; uint64_t nparity = vdrz->vd_nparity; - cols = vdev_raidz_get_logical_width(vdrz, txg); + uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg); ASSERT0(asize % (1 << ashift)); @@ -2285,10 +2280,9 @@ vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg) vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vdrz->vd_original_width; uint64_t nparity = vdrz->vd_nparity; - cols = vdev_raidz_get_logical_width(vdrz, txg); + uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg); asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); @@ -2345,7 +2339,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size, - BP_GET_BIRTH(zio->io_bp)); + BP_GET_PHYSICAL_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; @@ -2568,7 +2562,7 @@ vdev_raidz_io_start(zio_t *zio) raidz_map_t *rm; uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, - BP_GET_BIRTH(zio->io_bp)); + BP_GET_PHYSICAL_BIRTH(zio->io_bp)); if (logical_width != vdrz->vd_physical_width) { zfs_locked_range_t *lr = NULL; uint64_t synced_offset = UINT64_MAX; @@ -2691,7 +2685,7 @@ raidz_checksum_verify(zio_t *zio) */ if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) { zio->io_error = ret; - zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR; zio_dio_chksum_verify_error_report(zio); zio_checksum_verified(zio); return (0); @@ -3048,7 +3042,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) /* Check for success */ if (raidz_checksum_verify(zio) == 0) { - if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) + if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) return (0); /* Reconstruction succeeded - report errors */ @@ -3514,7 +3508,7 @@ vdev_raidz_io_done(zio_t *zio) } if (raidz_checksum_verify(zio) == 0) { - if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) + if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) goto done; for (int i = 0; i < rm->rm_nrows; i++) { @@ -4591,8 +4585,10 @@ 
spa_raidz_expand_thread(void *arg, zthr_t *zthr) uint64_t shift, start; zfs_range_seg_type_t type = metaslab_calculate_range_tree_type( raidvd, msp, &start, &shift); - zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL, - start, shift); + zfs_range_tree_t *rt = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, ZFS_RT_F_DYN_NAME, + metaslab_rt_name(msp->ms_group, msp, + "spa_raidz_expand_thread:rt")); zfs_range_tree_add(rt, msp->ms_start, msp->ms_size); zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove, rt); diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c index 0e296606d037..cf259788ccf4 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c +++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c @@ -787,8 +787,9 @@ vdev_rebuild_thread(void *arg) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; vr->vr_top_vdev = vd; vr->vr_scan_msp = NULL; - vr->vr_scan_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, - 0, 0); + vr->vr_scan_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vr_scan_tree")); mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL); diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index db79ded6dce4..3887be4bd548 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -364,13 +364,15 @@ spa_vdev_removal_create(vdev_t *vd) spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); - svr->svr_allocd_segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + svr->svr_allocd_segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_allocd_segs")); svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { - svr->svr_frees[i] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + svr->svr_frees[i] = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_frees")); list_create(&svr->svr_new_segments[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); @@ -1179,8 +1181,9 @@ spa_vdev_copy_segment(vdev_t *vd, zfs_range_tree_t *segs, * relative to the start of the range to be copied (i.e. relative to the * local variable "start"). */ - zfs_range_tree_t *obsolete_segs = zfs_range_tree_create(NULL, - ZFS_RANGE_SEG64, NULL, 0, 0); + zfs_range_tree_t *obsolete_segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "obsolete_segs")); zfs_btree_index_t where; zfs_range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where); @@ -1448,8 +1451,9 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, * allocated segments that we are copying. We may also be copying * free segments (of up to vdev_removal_max_span bytes). 
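The expansion thread builds its copy list by complementing ms_allocatable, the same idiom the removal code above uses; spelled out:

/* rt := [ms_start, ms_start + ms_size) minus all allocatable space, */
/* i.e. exactly the currently allocated extents of this metaslab.    */
zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove, rt);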
*/ - zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + zfs_range_tree_t *segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_copy_impl:segs")); for (;;) { zfs_range_tree_t *rt = svr->svr_allocd_segs; zfs_range_seg_t *rs = zfs_range_tree_first(rt); @@ -1610,8 +1614,9 @@ spa_vdev_remove_thread(void *arg) vca.vca_read_error_bytes = 0; vca.vca_write_error_bytes = 0; - zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + zfs_range_tree_t *segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_remove_thread:segs")); mutex_enter(&svr->svr_lock); @@ -1895,8 +1900,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) vdev_indirect_mapping_max_offset(vim)); } - zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + zfs_range_tree_t *segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, + vdev_rt_name(vd, "spa_vdev_remove_cancel_sync:segs")); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c index 842bb3e690d4..fc8d5b8e9a8a 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_trim.c +++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c @@ -902,7 +902,9 @@ vdev_trim_thread(void *arg) ta.trim_vdev = vd; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min; - ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + ta.trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree")); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_flags = 0; @@ -1305,8 +1307,10 @@ vdev_autotrim_thread(void *arg) * Allocate an empty range tree which is swapped in * for the existing ms_trim tree while it is processed. 
*/ - trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, + vdev_rt_name(vd, "autotrim_tree")); zfs_range_tree_swap(&msp->ms_trim, &trim_tree); ASSERT(zfs_range_tree_is_empty(msp->ms_trim)); @@ -1360,8 +1364,10 @@ vdev_autotrim_thread(void *arg) if (!cvd->vdev_ops->vdev_op_leaf) continue; - ta->trim_tree = zfs_range_tree_create(NULL, - ZFS_RANGE_SEG64, NULL, 0, 0); + ta->trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, + vdev_rt_name(vd, "autotrim_tree")); zfs_range_tree_walk(trim_tree, vdev_trim_range_add, ta); } @@ -1600,7 +1606,9 @@ vdev_trim_l2arc_thread(void *arg) vd->vdev_trim_secure = 0; ta.trim_vdev = vd; - ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + ta.trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree")); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; @@ -1735,7 +1743,9 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) ASSERT(!vd->vdev_top->vdev_rz_expanding); ta.trim_vdev = vd; - ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + ta.trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree")); ta.trim_type = TRIM_TYPE_SIMPLE; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; diff --git a/sys/contrib/openzfs/module/zfs/zap.c b/sys/contrib/openzfs/module/zfs/zap.c index 9711c91d7e4e..0896690c97e3 100644 --- a/sys/contrib/openzfs/module/zfs/zap.c +++ b/sys/contrib/openzfs/module/zfs/zap.c @@ -1304,7 +1304,7 @@ zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) { - int err = ENOENT; + int err; zap_entry_handle_t zeh; zap_leaf_t *l; diff --git a/sys/contrib/openzfs/module/zfs/zcp.c b/sys/contrib/openzfs/module/zfs/zcp.c index 6960ea360b15..9aecf67fd256 100644 --- a/sys/contrib/openzfs/module/zfs/zcp.c +++ b/sys/contrib/openzfs/module/zfs/zcp.c @@ -1175,7 +1175,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync, for (nvpair_t *pair = nvlist_next_nvpair(runinfo.zri_new_zvols, NULL); pair != NULL; pair = nvlist_next_nvpair(runinfo.zri_new_zvols, pair)) { - zvol_create_minor(nvpair_name(pair)); + zvol_create_minors(nvpair_name(pair)); } fnvlist_free(runinfo.zri_new_zvols); diff --git a/sys/contrib/openzfs/module/zfs/zfs_chksum.c b/sys/contrib/openzfs/module/zfs/zfs_chksum.c index 5c92be21c0c8..21852bf3d865 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_chksum.c +++ b/sys/contrib/openzfs/module/zfs/zfs_chksum.c @@ -32,9 +32,6 @@ #include <sys/blake3.h> #include <sys/sha2.h> -/* limit benchmarking to max 256KiB, when EdonR is slower then this: */ -#define LIMIT_PERF_MBS 300 - typedef struct { const char *name; const char *impl; @@ -52,9 +49,15 @@ typedef struct { zio_checksum_tmpl_free_t *(free); } chksum_stat_t; +#define AT_STARTUP 0 +#define AT_BENCHMARK 1 +#define AT_DONE 2 + static chksum_stat_t *chksum_stat_data = 0; -static int chksum_stat_cnt = 0; static kstat_t *chksum_kstat = NULL; +static int chksum_stat_limit = AT_STARTUP; +static int chksum_stat_cnt = 0; +static void chksum_benchmark(void); /* * Sample output on i3-1005G1 System: @@ -129,6 +132,9 @@ 
chksum_kstat_data(char *buf, size_t size, void *data) static void * chksum_kstat_addr(kstat_t *ksp, loff_t n) { + /* full benchmark */ + chksum_benchmark(); + if (n < chksum_stat_cnt) ksp->ks_private = (void *)(chksum_stat_data + n); else @@ -176,47 +182,36 @@ chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round, kpreempt_enable(); run_bw = size * run_count * NANOSEC; - run_bw /= run_time_ns; /* B/s */ + run_bw /= run_time_ns; /* B/s */ *result = run_bw/1024/1024; /* MiB/s */ } -#define LIMIT_INIT 0 -#define LIMIT_NEEDED 1 -#define LIMIT_NOLIMIT 2 - static void chksum_benchit(chksum_stat_t *cs) { abd_t *abd; void *ctx = 0; void *salt = &cs->salt.zcs_bytes; - static int chksum_stat_limit = LIMIT_INIT; memset(salt, 0, sizeof (cs->salt.zcs_bytes)); if (cs->init) ctx = cs->init(&cs->salt); + /* benchmarks in startup mode */ + if (chksum_stat_limit == AT_STARTUP) { + abd = abd_alloc_linear(1<<18, B_FALSE); + chksum_run(cs, abd, ctx, 5, &cs->bs256k); + goto done; + } + /* allocate test memory via abd linear interface */ abd = abd_alloc_linear(1<<20, B_FALSE); + + /* benchmarks when requested */ chksum_run(cs, abd, ctx, 1, &cs->bs1k); chksum_run(cs, abd, ctx, 2, &cs->bs4k); chksum_run(cs, abd, ctx, 3, &cs->bs16k); chksum_run(cs, abd, ctx, 4, &cs->bs64k); - chksum_run(cs, abd, ctx, 5, &cs->bs256k); - - /* check if we ran on a slow cpu */ - if (chksum_stat_limit == LIMIT_INIT) { - if (cs->bs1k < LIMIT_PERF_MBS) { - chksum_stat_limit = LIMIT_NEEDED; - } else { - chksum_stat_limit = LIMIT_NOLIMIT; - } - } - - /* skip benchmarks >= 1MiB when the CPU is to slow */ - if (chksum_stat_limit == LIMIT_NEEDED) - goto abort; - chksum_run(cs, abd, ctx, 6, &cs->bs1m); abd_free(abd); @@ -225,7 +220,7 @@ chksum_benchit(chksum_stat_t *cs) chksum_run(cs, abd, ctx, 7, &cs->bs4m); chksum_run(cs, abd, ctx, 8, &cs->bs16m); -abort: +done: abd_free(abd); /* free up temp memory */ @@ -243,7 +238,6 @@ chksum_benchmark(void) /* we need the benchmark only for the kernel module */ return; #endif - chksum_stat_t *cs; uint64_t max; uint32_t id, cbid = 0, id_save; @@ -251,8 +245,14 @@ chksum_benchmark(void) const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256"); const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512"); + /* benchmarks are done */ + if (chksum_stat_limit == AT_DONE) + return; + + /* count implementations */ - chksum_stat_cnt = 2; + chksum_stat_cnt = 1; /* edonr */ + chksum_stat_cnt += 1; /* skein */ chksum_stat_cnt += sha256->getcnt(); chksum_stat_cnt += sha512->getcnt(); chksum_stat_cnt += blake3->getcnt(); @@ -332,6 +332,17 @@ chksum_benchmark(void) } } blake3->setid(id_save); + + switch (chksum_stat_limit) { + case AT_STARTUP: + /* next time we want a full benchmark */ + chksum_stat_limit = AT_BENCHMARK; + break; + case AT_BENCHMARK: + /* no further benchmarks */ + chksum_stat_limit = AT_DONE; + break; + } } void @@ -341,7 +352,7 @@ chksum_init(void) blake3_per_cpu_ctx_init(); #endif - /* Benchmark supported implementations */ + /* 256KiB benchmark */ chksum_benchmark(); /* Install kstats for all implementations */ diff --git a/sys/contrib/openzfs/module/zfs/zfs_crrd.c b/sys/contrib/openzfs/module/zfs/zfs_crrd.c new file mode 100644 index 000000000000..f9267ed41d71 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/zfs_crrd.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2024 Klara Inc. + * + * This software was developed by + * Mariusz Zaborski <mariusz.zaborski@klarasystems.com> + * Fred Weigel <fred.weigel@klarasystems.com> + * under sponsorship from Wasabi Technology, Inc. and Klara Inc. + */ +/* + * This file implements a round-robin database that stores timestamps and txg + * numbers. Due to limited space, we use a round-robin approach, where + * the oldest records are overwritten when there is no longer enough room. + * This is a best-effort mechanism, and the database should be treated as + * an approximation. Consider this before consuming it. + * + * The database is linear, meaning we assume each new entry is newer than the + * ones already stored. Because of this, if time is manipulated, the database + * will only accept records that are newer than the existing ones. + * (For example, jumping 10 years into the future and then back can lead to + * a situation where, for 10 years, we won't write anything to the database.) + * + * All times stored in the database use UTC, which makes it easy to convert to + * and from local time. + * + * Each database holds 256 records (as defined in the `RRD_MAX_ENTRIES` macro). + * This limit comes from the maximum size of a ZAP object, where we store the + * binary blob. + * + * We've split the database into three smaller ones. + * The `minute database` provides high resolution (default: every 10 minutes), + * but only covers approximately 1.5 days. This gives a detailed view of recent + * activity, useful, for example, when performing a scrub of the last hour. + * The `daily database` records one txg per day. With 256 entries, it retains + * roughly 8 months of data. This allows users to scrub or analyze txgs across + * a range of days. + * The `monthly database` stores one record per month, giving approximately + * 21 years of history. + * All these calculations assume the worst-case scenario: the pool is always + * online and actively written to. + * + * A potential source of confusion is that the database does not store data + * while the pool is offline, leading to potential gaps in the timeline. Also, + * the database contains no records from before this feature was enabled. + * Both, upon reflection, are expected. + */ +#include <sys/zfs_context.h> + +#include "zfs_crrd.h" + +rrd_data_t * +rrd_tail_entry(rrd_t *rrd) +{ + size_t n; + + if (rrd_len(rrd) == 0) + return (NULL); + + if (rrd->rrd_tail == 0) + n = RRD_MAX_ENTRIES - 1; + else + n = rrd->rrd_tail - 1; + + return (&rrd->rrd_entries[n]); +} + +uint64_t +rrd_tail(rrd_t *rrd) +{ + const rrd_data_t *tail; + + tail = rrd_tail_entry(rrd); + + return (tail == NULL ? 0 : tail->rrdd_time); +} + +/* + * Return length of data in the rrd. + * rrd_get works from 0..rrd_len()-1.
+ */ +size_t +rrd_len(rrd_t *rrd) +{ + + return (rrd->rrd_length); +} + +const rrd_data_t * +rrd_entry(rrd_t *rrd, size_t i) +{ + size_t n; + + if (i >= rrd_len(rrd)) { + return (0); + } + + n = (rrd->rrd_head + i) % RRD_MAX_ENTRIES; + return (&rrd->rrd_entries[n]); +} + +uint64_t +rrd_get(rrd_t *rrd, size_t i) +{ + const rrd_data_t *data = rrd_entry(rrd, i); + + return (data == NULL ? 0 : data->rrdd_txg); +} + +/* Add value to database. */ +void +rrd_add(rrd_t *rrd, hrtime_t time, uint64_t txg) +{ + rrd_data_t *tail; + + tail = rrd_tail_entry(rrd); + if (tail != NULL && tail->rrdd_time == time) { + if (tail->rrdd_txg < txg) { + tail->rrdd_txg = txg; + } else { + return; + } + } + + rrd->rrd_entries[rrd->rrd_tail].rrdd_time = time; + rrd->rrd_entries[rrd->rrd_tail].rrdd_txg = txg; + + rrd->rrd_tail = (rrd->rrd_tail + 1) % RRD_MAX_ENTRIES; + + if (rrd->rrd_length < RRD_MAX_ENTRIES) { + rrd->rrd_length++; + } else { + rrd->rrd_head = (rrd->rrd_head + 1) % RRD_MAX_ENTRIES; + } +} + +void +dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg) +{ + hrtime_t daydiff, monthdiff, minutedif; + + minutedif = time - rrd_tail(&db->dbr_minutes); + daydiff = time - rrd_tail(&db->dbr_days); + monthdiff = time - rrd_tail(&db->dbr_months); + + if (monthdiff >= 0 && monthdiff >= SEC2NSEC(30 * 24 * 60 * 60)) + rrd_add(&db->dbr_months, time, txg); + else if (daydiff >= 0 && daydiff >= SEC2NSEC(24 * 60 * 60)) + rrd_add(&db->dbr_days, time, txg); + else if (minutedif >= 0) + rrd_add(&db->dbr_minutes, time, txg); +} + +/* + * We could do a binary search here, but the routine isn't frequently + * called and the data is small so we stick to a simple loop. + */ +static const rrd_data_t * +rrd_query(rrd_t *rrd, hrtime_t tv, dbrrd_rounding_t rounding) +{ + const rrd_data_t *data = NULL; + + for (size_t i = 0; i < rrd_len(rrd); i++) { + const rrd_data_t *cur = rrd_entry(rrd, i); + + if (rounding == DBRRD_FLOOR) { + if (tv < cur->rrdd_time) { + break; + } + data = cur; + } else { + /* DBRRD_CEILING */ + if (tv <= cur->rrdd_time) { + data = cur; + break; + } + } + } + + return (data); +} + +static const rrd_data_t * +dbrrd_closest(hrtime_t tv, const rrd_data_t *r1, const rrd_data_t *r2) +{ + + if (r1 == NULL) + return (r2); + if (r2 == NULL) + return (r1); + + return (ABS(tv - r1->rrdd_time) < ABS(tv - r2->rrdd_time) ? r1 : r2); +} + +uint64_t +dbrrd_query(dbrrd_t *r, hrtime_t tv, dbrrd_rounding_t rounding) +{ + const rrd_data_t *data, *dm, *dd, *dy; + + data = NULL; + dm = rrd_query(&r->dbr_minutes, tv, rounding); + dd = rrd_query(&r->dbr_days, tv, rounding); + dy = rrd_query(&r->dbr_months, tv, rounding); + + data = dbrrd_closest(tv, dbrrd_closest(tv, dd, dm), dy); + + return (data == NULL ? 
0 : data->rrdd_txg); +} diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index ebb1cfd07125..dcb71229f96a 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -1704,6 +1704,8 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc) static const zfs_ioc_key_t zfs_keys_pool_scrub[] = { {"scan_type", DATA_TYPE_UINT64, 0}, {"scan_command", DATA_TYPE_UINT64, 0}, + {"scan_date_start", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"scan_date_end", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int @@ -1712,6 +1714,7 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) spa_t *spa; int error; uint64_t scan_type, scan_cmd; + uint64_t date_start, date_end; if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0) return (SET_ERROR(EINVAL)); @@ -1721,6 +1724,11 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) if (scan_cmd >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(innvl, "scan_date_start", &date_start) != 0) + date_start = 0; + if (nvlist_lookup_uint64(innvl, "scan_date_end", &date_end) != 0) + date_end = 0; + if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); @@ -1732,7 +1740,24 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) error = spa_scan_range(spa, scan_type, spa_get_last_scrubbed_txg(spa), 0); } else { - error = spa_scan(spa, scan_type); + uint64_t txg_start, txg_end; + + txg_start = txg_end = 0; + if (date_start != 0 || date_end != 0) { + mutex_enter(&spa->spa_txg_log_time_lock); + if (date_start != 0) { + txg_start = dbrrd_query(&spa->spa_txg_log_time, + date_start, DBRRD_FLOOR); + } + + if (date_end != 0) { + txg_end = dbrrd_query(&spa->spa_txg_log_time, + date_end, DBRRD_CEILING); + } + mutex_exit(&spa->spa_txg_log_time_lock); + } + + error = spa_scan_range(spa, scan_type, txg_start, txg_end); } spa_close(spa, FTAG); @@ -5000,15 +5025,6 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) } break; - case ZFS_PROP_SPECIAL_SMALL_BLOCKS: - /* - * This property could require the allocation classes - * feature to be active for setting, however we allow - * it so that tests of settable properties succeed. - * The CLI will issue a warning in this case. - */ - break; - case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); diff --git a/sys/contrib/openzfs/module/zfs/zfs_log.c b/sys/contrib/openzfs/module/zfs/zfs_log.c index 2ce25b72b288..2f61ecfd9b3b 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_log.c +++ b/sys/contrib/openzfs/module/zfs/zfs_log.c @@ -607,8 +607,6 @@ zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, * called as soon as the write is on stable storage (be it via a DMU sync or a * ZIL commit). 
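From user space, the two new optional nvlist keys turn a plain scrub into a date-bounded one. A hypothetical caller sketch; the key names come from this diff, the scan constants are the usual OpenZFS ones and are an assumption here. Note the asymmetric rounding: dbrrd_query() uses DBRRD_FLOOR for the start and DBRRD_CEILING for the end, so the resulting txg window never shrinks below what was asked for.

nvlist_t *innvl = fnvlist_alloc();
fnvlist_add_uint64(innvl, "scan_type", POOL_SCAN_SCRUB);
fnvlist_add_uint64(innvl, "scan_command", POOL_SCRUB_NORMAL);
fnvlist_add_uint64(innvl, "scan_date_start", start_utc_sec); /* UTC, s */
fnvlist_add_uint64(innvl, "scan_date_end", end_utc_sec);
/* zfs_ioc_pool_scrub() maps the dates to txgs and calls */
/* spa_scan_range(spa, scan_type, txg_start, txg_end).   */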
*/ -static uint_t zfs_immediate_write_sz = 32768; - void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, boolean_t commit, @@ -626,15 +624,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, return; } - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct) - write_state = WR_INDIRECT; - else if (!spa_has_slogs(zilog->zl_spa) && - resid >= zfs_immediate_write_sz) - write_state = WR_INDIRECT; - else if (commit) - write_state = WR_COPIED; - else - write_state = WR_NEED_COPY; + write_state = zil_write_state(zilog, resid, blocksize, o_direct, + commit); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen, sizeof (gen)); @@ -938,6 +929,3 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, len -= partlen; } } - -ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, UINT, ZMOD_RW, - "Largest data block to write to zil"); diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index 656ca4dc22ff..74aa91a4f2eb 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -49,6 +49,7 @@ #include <sys/dmu.h> #include <sys/dmu_objset.h> #include <sys/dsl_crypt.h> +#include <sys/dsl_dataset.h> #include <sys/spa.h> #include <sys/txg.h> #include <sys/dbuf.h> @@ -67,13 +68,14 @@ int zfs_bclone_enabled = 1; /* - * When set zfs_clone_range() waits for dirty data to be written to disk. - * This allows the clone operation to reliably succeed when a file is modified - * and then immediately cloned. For small files this may be slower than making - * a copy of the file and is therefore not the default. However, in certain - * scenarios this behavior may be desirable so a tunable is provided. + * When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty + * data to be written to disk before proceeding. This ensures that the clone + * operation reliably succeeds, even if a file is modified and then immediately + * cloned. Note that for small files this may be slower than simply copying + * the file. When set to 0 the clone operation will immediately fail if it + * encounters any dirty blocks. By default waiting is enabled. */ -int zfs_bclone_wait_dirty = 0; +int zfs_bclone_wait_dirty = 1; /* * Enable Direct I/O. 
If this setting is 0, then all I/O requests will be @@ -114,9 +116,7 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - atomic_inc_32(&zp->z_sync_writes_cnt); zil_commit(zfsvfs->z_log, zp->z_id); - atomic_dec_32(&zp->z_sync_writes_cnt); zfs_exit(zfsvfs, FTAG); } return (error); @@ -1102,13 +1102,21 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags, { int error; - if (flags != 0 || arg != 0) + if ((flags & ~ZFS_REWRITE_PHYSICAL) != 0 || arg != 0) return (SET_ERROR(EINVAL)); zfsvfs_t *zfsvfs = ZTOZSB(zp); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); + /* Check if physical rewrite is allowed */ + spa_t *spa = zfsvfs->z_os->os_spa; + if ((flags & ZFS_REWRITE_PHYSICAL) && + !spa_feature_is_enabled(spa, SPA_FEATURE_PHYSICAL_REWRITE)) { + zfs_exit(zfsvfs, FTAG); + return (SET_ERROR(ENOTSUP)); + } + if (zfs_is_readonly(zfsvfs)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); @@ -1196,7 +1204,10 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags, if (dmu_buf_is_dirty(dbp[i], tx)) continue; nw += dbp[i]->db_size; - dmu_buf_will_dirty(dbp[i], tx); + if (flags & ZFS_REWRITE_PHYSICAL) + dmu_buf_will_rewrite(dbp[i], tx); + else + dmu_buf_will_dirty(dbp[i], tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index 00059b2c6de0..6e4f84257407 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -589,7 +589,7 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, * that we rewind to is invalid. Thus, we return -1 so * zil_parse() doesn't attempt to read it. */ - if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg) + if (BP_GET_BIRTH(bp) >= first_txg) return (-1); if (zil_bp_tree_add(zilog, bp) != 0) @@ -615,7 +615,7 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ - if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) < first_txg || + if (BP_IS_HOLE(bp) || BP_GET_BIRTH(bp) < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); @@ -640,7 +640,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. */ - if (BP_GET_LOGICAL_BIRTH(&lr->lr_blkptr) >= first_txg) { + if (BP_GET_BIRTH(&lr->lr_blkptr) >= first_txg) { error = zil_read_log_data(zilog, lr, NULL); if (error != 0) return (error); @@ -687,7 +687,7 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx, * just in case lets be safe and just stop here now instead of * corrupting the pool. */ - if (BP_GET_BIRTH(bp) >= first_txg) + if (BP_GET_PHYSICAL_BIRTH(bp) >= first_txg) return (SET_ERROR(ENOENT)); /* @@ -742,7 +742,7 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) /* * If we previously claimed it, we need to free it. */ - if (BP_GET_LOGICAL_BIRTH(bp) >= claim_txg && + if (BP_GET_BIRTH(bp) >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); } @@ -1997,7 +1997,7 @@ next_lwb: &slog); } if (error == 0) { - ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg); + ASSERT3U(BP_GET_BIRTH(bp), ==, txg); BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? 
ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; @@ -2095,6 +2095,19 @@ zil_max_waste_space(zilog_t *zilog) */ static uint_t zil_maxcopied = 7680; +/* + * Largest write size to store the data directly into the ZIL. + */ +uint_t zfs_immediate_write_sz = 32768; + +/* + * When enabled and a block would go to a normal vdev, treat special vdevs + * as SLOG, writing the data into the ZIL (WR_COPIED/WR_NEED_COPY). Disabling + * this forces indirect writes (WR_INDIRECT), preserving special vdev + * throughput and endurance, likely at the cost of normal vdev latency. + */ +int zil_special_is_slog = 1; + uint64_t zil_max_copied_data(zilog_t *zilog) { @@ -2102,6 +2115,46 @@ zil_max_copied_data(zilog_t *zilog) return (MIN(max_data, zil_maxcopied)); } +/* + * Determine the appropriate write state for ZIL transactions based on + * pool configuration, data placement, write size, and logbias settings. + */ +itx_wr_state_t +zil_write_state(zilog_t *zilog, uint64_t size, uint32_t blocksize, + boolean_t o_direct, boolean_t commit) +{ + if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct) + return (WR_INDIRECT); + + /* + * Don't use indirect for small writes, to reduce overhead. Don't + * use indirect for writes smaller than half a block if we are + * going to commit immediately, since the next write might rewrite + * the same block, causing inflation. If no commit is planned, + * subsequent writes may coalesce, and then indirect is a perfect + * fit. + */ + boolean_t indirect = (size >= zfs_immediate_write_sz && + (size >= blocksize / 2 || !commit)); + + if (spa_has_slogs(zilog->zl_spa)) { + /* Dedicated slogs: never use indirect */ + indirect = B_FALSE; + } else if (spa_has_special(zilog->zl_spa)) { + /* Special vdevs: only when beneficial */ + boolean_t on_special = (blocksize <= + zilog->zl_os->os_zpl_special_smallblock); + indirect &= (on_special || !zil_special_is_slog); + } + + if (indirect) + return (WR_INDIRECT); + else if (commit) + return (WR_COPIED); + else + return (WR_NEED_COPY); +} + static uint64_t zil_itx_record_size(itx_t *itx) { @@ -2902,19 +2955,14 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - /* - * Return if there's nothing to commit before we dirty the fs by - * calling zil_create(). - */ - if (list_is_empty(&zilog->zl_itx_commit_list)) - return; - - list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); - list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), - offsetof(zil_commit_waiter_t, zcw_node)); - lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) { + /* + * Return if there's nothing to commit before we dirty the fs. + */ + if (list_is_empty(&zilog->zl_itx_commit_list)) + return; + lwb = zil_create(zilog); } else { /* @@ -2942,6 +2990,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } } + list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); + list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), + offsetof(zil_commit_waiter_t, zcw_node)); + while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; uint64_t txg = lrc->lrc_txg; @@ -3111,7 +3163,8 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * possible, without significantly impacting the latency * of each individual itx.
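A few hand-worked zil_write_state() decisions may help; my examples, assuming the 32768-byte default, a 128K block, no slog and no special vdev:

/*
 *   4K  write, commit     -> WR_COPIED    (below zfs_immediate_write_sz)
 *   4K  write, no commit  -> WR_NEED_COPY
 *   48K write, commit     -> WR_COPIED    (less than half the block)
 *   48K write, no commit  -> WR_INDIRECT  (later writes may coalesce)
 *   128K write, commit    -> WR_INDIRECT  (whole block, no inflation)
 * A dedicated slog suppresses WR_INDIRECT entirely; a special vdev
 * suppresses it only for blocks that would not themselves land on the
 * special vdev while zil_special_is_slog is set.
 */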
*/ - if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { + if (lwb->lwb_state == LWB_STATE_OPENED && + (!zilog->zl_parallel || zilog->zl_suspend > 0)) { zil_burst_done(zilog); list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); @@ -4418,3 +4471,9 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW, "Limit in bytes WR_COPIED size"); + +ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, UINT, ZMOD_RW, + "Largest write size to store data into ZIL"); + +ZFS_MODULE_PARAM(zfs_zil, zil_, special_is_slog, INT, ZMOD_RW, + "Treat special vdevs as SLOG"); diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 6d7bce8b0e10..218aec6093e2 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -692,7 +692,7 @@ error: zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark, - BP_GET_LOGICAL_BIRTH(zio->io_bp)); + BP_GET_PHYSICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } @@ -850,15 +850,9 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); - pio->io_reexecute |= zio->io_reexecute; + pio->io_post |= zio->io_post; ASSERT3U(*countp, >, 0); - /* - * Propogate the Direct I/O checksum verify failure to the parent. - */ - if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) - pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; - (*countp)--; if (*countp == 0 && pio->io_stall == countp) { @@ -1110,7 +1104,8 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, "DVA[1]=%#llx/%#llx " "DVA[2]=%#llx/%#llx " "prop=%#llx " - "pad=%#llx,%#llx " + "prop2=%#llx " + "pad=%#llx " "phys_birth=%#llx " "birth=%#llx " "fill=%#llx " @@ -1123,9 +1118,9 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, (long long)bp->blk_dva[2].dva_word[0], (long long)bp->blk_dva[2].dva_word[1], (long long)bp->blk_prop, - (long long)bp->blk_pad[0], - (long long)bp->blk_pad[1], - (long long)BP_GET_PHYSICAL_BIRTH(bp), + (long long)bp->blk_prop2, + (long long)bp->blk_pad, + (long long)BP_GET_RAW_PHYSICAL_BIRTH(bp), (long long)BP_GET_LOGICAL_BIRTH(bp), (long long)bp->blk_fill, (long long)bp->blk_cksum.zc_word[0], @@ -1340,7 +1335,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, { zio_t *zio; - zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp, + zio = zio_create(pio, spa, BP_GET_PHYSICAL_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? @@ -1649,7 +1644,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, * through the mirror during self healing. See comment in * vdev_mirror_io_done() for more details. */ - ASSERT0(pio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); + ASSERT0(pio->io_post & ZIO_POST_DIO_CHKSUM_ERR); } else if (type == ZIO_TYPE_WRITE && pio->io_prop.zp_direct_write == B_TRUE) { /* @@ -1685,7 +1680,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. 
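The io_reexecute to io_post switch visible in these zio.c hunks is more than a rename: post-processing state now propagates to parents through one sticky mask in zio_notify_parent(), i.e. pio->io_post |= zio->io_post, instead of the old flag-by-flag copying. A sketch of what a parent can now rely on (only ZIO_POST_DIO_CHKSUM_ERR is named in this diff; treating it as the sole consumer here is my assumption):

/* Any descendant's Direct I/O checksum failure is visible in one word. */
if (pio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
	zio_dio_chksum_verify_error_report(pio);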
*/ - if (flags & ZIO_FLAG_IO_ALLOCATING && + if (flags & ZIO_FLAG_ALLOC_THROTTLED && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { ASSERT(pio->io_metaslab_class != NULL); ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); @@ -1695,7 +1690,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); - flags &= ~ZIO_FLAG_IO_ALLOCATING; + flags &= ~ZIO_FLAG_ALLOC_THROTTLED; } zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, @@ -1860,7 +1855,7 @@ zio_write_bp_init(zio_t *zio) blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; - ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg); + ASSERT(BP_GET_BIRTH(bp) != zio->io_txg); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; @@ -1948,7 +1943,7 @@ zio_write_compress(zio_t *zio) ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); - if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) { + if (!BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to @@ -2085,7 +2080,7 @@ zio_write_compress(zio_t *zio) * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ - if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg && + if (!BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); @@ -2602,7 +2597,7 @@ zio_reexecute(void *arg) pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; - pio->io_reexecute = 0; + pio->io_post = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; @@ -2749,11 +2744,14 @@ zio_resume_wait(spa_t *spa) * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * - * A gang block consists of a gang header (zio_gbh_phys_t) and up to - * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like - * an indirect block: it's an array of block pointers. It consumes - * only one sector and hence is allocatable regardless of fragmentation. - * The gang header's bps point to its gang members, which hold the data. + * A gang block consists of a gang header and up to gbh_nblkptrs(size) + * gang members. The gang header is like an indirect block: it's an array + * of block pointers, though the header has a small tail (a zio_eck_t) + * that stores an embedded checksum. It is allocated using only a single + * sector as the requested size, and hence is allocatable regardless of + * fragmentation. Its size is determined by the smallest allocatable + * asize of the vdevs it was allocated on. The gang header's bps point + * to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> * as the verifier to ensure uniqueness of the SHA256 checksum.
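The header geometry described in the comment above can be sanity-checked with a little arithmetic. The sketch assumes gbh_nblkptrs(size) is (size - sizeof (zio_eck_t)) / sizeof (blkptr_t), with a 128-byte blkptr_t and a 40-byte embedded-checksum tail; the macro itself is defined outside this diff, so treat the constants as illustrative:

#include <assert.h>

#define SPA_OLD_GANGBLOCKSIZE	512	/* the classic one-sector header */
#define BLKPTR_SIZE		128	/* assumed sizeof (blkptr_t) */
#define ZIO_ECK_SIZE		40	/* assumed sizeof (zio_eck_t) */

#define GBH_NBLKPTRS(size)	(((size) - ZIO_ECK_SIZE) / BLKPTR_SIZE)

int
main(void)
{
	/* Recovers the historical SPA_GBH_NBLKPTRS == 3 fan-out. */
	assert(GBH_NBLKPTRS(SPA_OLD_GANGBLOCKSIZE) == 3);
	/* A 4 KB header on ashift=12 vdevs would hold 31 bps instead. */
	assert(GBH_NBLKPTRS(4096) == 31);
	return (0);
}

That order-of-magnitude jump in fan-out is what the dynamic gang header feature is after: far fewer nested gang levels on large-sector pools.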
@@ -2832,10 +2830,10 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, if (gn != NULL) { abd_t *gbh_abd = - abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); + abd_get_from_buf(gn->gn_gbh, gn->gn_gangblocksize); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + gbh_abd, gn->gn_gangblocksize, zio_gang_issue_func_done, + NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute @@ -2906,14 +2904,16 @@ static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * -zio_gang_node_alloc(zio_gang_node_t **gnpp) +zio_gang_node_alloc(zio_gang_node_t **gnpp, uint64_t gangblocksize) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); - gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); - gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); + gn = kmem_zalloc(sizeof (*gn) + + (gbh_nblkptrs(gangblocksize) * sizeof (gn)), KM_SLEEP); + gn->gn_gangblocksize = gn->gn_allocsize = gangblocksize; + gn->gn_gbh = zio_buf_alloc(gangblocksize); *gnpp = gn; return (gn); @@ -2924,11 +2924,12 @@ zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++) ASSERT(gn->gn_child[g] == NULL); - zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); - kmem_free(gn, sizeof (*gn)); + zio_buf_free(gn->gn_gbh, gn->gn_allocsize); + kmem_free(gn, sizeof (*gn) + + (gbh_nblkptrs(gn->gn_allocsize) * sizeof (gn))); *gnpp = NULL; } @@ -2940,7 +2941,7 @@ zio_gang_tree_free(zio_gang_node_t **gnpp) if (gn == NULL) return; - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); @@ -2949,13 +2950,28 @@ zio_gang_tree_free(zio_gang_node_t **gnpp) static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { - zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); - abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); + uint64_t gangblocksize = UINT64_MAX; + if (spa_feature_is_active(gio->io_spa, + SPA_FEATURE_DYNAMIC_GANG_HEADER)) { + spa_config_enter(gio->io_spa, SCL_VDEV, FTAG, RW_READER); + for (int dva = 0; dva < BP_GET_NDVAS(bp); dva++) { + vdev_t *vd = vdev_lookup_top(gio->io_spa, + DVA_GET_VDEV(&bp->blk_dva[dva])); + uint64_t psize = vdev_gang_header_psize(vd); + gangblocksize = MIN(gangblocksize, psize); + } + spa_config_exit(gio->io_spa, SCL_VDEV, FTAG); + } else { + gangblocksize = SPA_OLD_GANGBLOCKSIZE; + } + ASSERT3U(gangblocksize, !=, UINT64_MAX); + zio_gang_node_t *gn = zio_gang_node_alloc(gnpp, gangblocksize); + abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, gangblocksize); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, gangblocksize, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } @@ -2978,13 +2994,17 @@ zio_gang_tree_assemble_done(zio_t *zio) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); - ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); - ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + /* + * If this was an old-style gangblock, the gangblocksize 
should have + * been updated in zio_checksum_error to reflect that. + */ + ASSERT3U(gbh_eck(gn->gn_gbh, gn->gn_gangblocksize)->zec_magic, + ==, ZEC_MAGIC); abd_free(zio->io_abd); - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) { + blkptr_t *gbp = gbh_bp(gn->gn_gbh, g); if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); @@ -3009,10 +3029,11 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { - ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + ASSERT3U(gbh_eck(gn->gn_gbh, + gn->gn_gangblocksize)->zec_magic, ==, ZEC_MAGIC); - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { - blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) { + blkptr_t *gbp = gbh_bp(gn->gn_gbh, g); if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, @@ -3119,6 +3140,13 @@ zio_write_gang_done(zio_t *zio) abd_free(zio->io_abd); } +static void +zio_update_feature(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_feature_incr(spa, (spa_feature_t)(uintptr_t)arg, tx); +} + static zio_t * zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { @@ -3157,20 +3185,24 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) ASSERT(ZIO_HAS_ALLOCATOR(pio)); int flags = METASLAB_GANG_HEADER; - if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + if (pio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; } - error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, + uint64_t gangblocksize = SPA_OLD_GANGBLOCKSIZE; + uint64_t candidate = gangblocksize; + error = metaslab_alloc_range(spa, mc, gangblocksize, gangblocksize, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, - &pio->io_alloc_list, pio->io_allocator, pio); + &pio->io_alloc_list, pio->io_allocator, pio, &candidate); if (error) { pio->io_error = error; return (pio); } + if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) + gangblocksize = candidate; if (pio == gio) { gnpp = &gio->io_gang_tree; @@ -3179,23 +3211,24 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) ASSERT(pio->io_ready == zio_write_gang_member_ready); } - gn = zio_gang_node_alloc(gnpp); + gn = zio_gang_node_alloc(gnpp, gangblocksize); gbh = gn->gn_gbh; - memset(gbh, 0, SPA_GANGBLOCKSIZE); - gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); + memset(gbh, 0, gangblocksize); + gbh_abd = abd_get_from_buf(gbh, gangblocksize); /* * Create the gang header. */ - zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, gangblocksize, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); zio_gang_inherit_allocator(pio, zio); - if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + if (pio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) { boolean_t more; - VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies, - zio, B_TRUE, &more)); + VERIFY(metaslab_class_throttle_reserve(mc, zio->io_allocator, + gbh_copies, zio->io_size, B_TRUE, &more)); + zio->io_flags |= ZIO_FLAG_ALLOC_THROTTLED; } /* @@ -3203,7 +3236,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) * opportunistic allocations. 
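On the read side, zio_gang_tree_assemble() above sizes the header as the minimum gang-header psize across the top-level vdev of every DVA, so a gang block striped over mixed-ashift vdevs is always read at a size every copy can supply. The same reduction on its own (hypothetical helper; the per-vdev psizes are passed in directly instead of being looked up under SCL_VDEV):

#include <stdint.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))

static uint64_t
min_header_psize(const uint64_t *per_vdev_psize, int ndvas)
{
	uint64_t gangblocksize = UINT64_MAX;

	for (int dva = 0; dva < ndvas; dva++)
		gangblocksize = MIN(gangblocksize, per_vdev_psize[dva]);
	return (gangblocksize);	/* every DVA can provide this much */
}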
If that fails to generate enough * space, we fall back to normal zio_write calls for nested gang. */ - for (int g = 0; resid != 0; g++) { + int g; + boolean_t any_failed = B_FALSE; + for (g = 0; resid != 0; g++) { flags &= METASLAB_ASYNC_ALLOC; flags |= METASLAB_GANG_CHILD; zp.zp_checksum = gio->io_prop.zp_checksum; @@ -3224,9 +3259,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); uint64_t min_size = zio_roundup_alloc_size(spa, - resid / (SPA_GBH_NBLKPTRS - g)); + resid / (gbh_nblkptrs(gangblocksize) - g)); min_size = MIN(min_size, resid); - bp = &gbh->zg_blkptr[g]; + bp = &((blkptr_t *)gbh)[g]; zio_alloc_list_t cio_list; metaslab_trace_init(&cio_list); @@ -3236,6 +3271,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) flags, &cio_list, zio->io_allocator, NULL, &allocated_size); boolean_t allocated = error == 0; + any_failed |= !allocated; uint64_t psize = allocated ? MIN(resid, allocated_size) : min_size; @@ -3268,6 +3304,29 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) } /* + * If we used more gang children than the old limit, we must already be + * using the new headers. No need to update anything, just move on. + * + * Otherwise, we might be in a case where we need to turn on the new + * feature, so we check that. We enable the new feature if we didn't + * manage to fit everything into 3 gang children and we could have + * written more than that. + */ + if (g > gbh_nblkptrs(SPA_OLD_GANGBLOCKSIZE)) { + ASSERT(spa_feature_is_active(spa, + SPA_FEATURE_DYNAMIC_GANG_HEADER)); + } else if (any_failed && candidate > SPA_OLD_GANGBLOCKSIZE && + spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER) && + !spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) { + dmu_tx_t *tx = + dmu_tx_create_assigned(spa->spa_dsl_pool, txg + 1); + dsl_sync_task_nowait(spa->spa_dsl_pool, + zio_update_feature, + (void *)SPA_FEATURE_DYNAMIC_GANG_HEADER, tx); + dmu_tx_commit(tx); + } + + /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; @@ -3836,7 +3895,7 @@ zio_ddt_write(zio_t *zio) * block and leave. */ if (have_dvas == 0) { - ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); + ASSERT(BP_GET_BIRTH(bp) == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_extend(ddp, v, bp); ddt_phys_addref(ddp, v); @@ -3864,6 +3923,23 @@ zio_ddt_write(zio_t *zio) * then we can just use them as-is. */ if (have_dvas >= need_dvas) { + /* + * For rewrite operations, try preserving the original + * logical birth time. If the result matches the + * original BP, this becomes a NOP. + */ + if (zp->zp_rewrite) { + uint64_t orig_logical_birth = + BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig); + ddt_bp_fill(ddp, v, bp, orig_logical_birth); + if (BP_EQUAL(bp, &zio->io_bp_orig)) { + /* We can skip accounting. */ + zio->io_flags |= ZIO_FLAG_NOPWRITE; + ddt_exit(ddt); + return (zio); + } + } + ddt_bp_fill(ddp, v, bp, txg); ddt_phys_addref(ddp, v); ddt_exit(ddt); @@ -4078,9 +4154,11 @@ zio_io_to_allocate(metaslab_class_allocator_t *mca, boolean_t *more) * reserve then we throttle. 
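The min_size computed inside the loop above spreads the residual payload evenly across the blkptr slots that remain. A worked example, approximating zio_roundup_alloc_size() by plain 512-byte rounding (an assumption; the real routine also honors the pool's allocation granularity):

#include <stdint.h>

#define SECTOR		512
#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))
#define MIN(a, b)	((a) < (b) ? (a) : (b))

static uint64_t
child_min_size(uint64_t resid, int nblkptrs, int g)
{
	/* Split what is left over the slots not yet used. */
	uint64_t min_size = ROUNDUP(resid / (uint64_t)(nblkptrs - g), SECTOR);

	return (MIN(min_size, resid));
}

/*
 * For resid = 100 KB and g == 0: a classic 3-bp header demands at least
 * 34304-byte fragments, while a 31-bp header gets by with 3584 bytes,
 * so far more fragmented pools can still satisfy the gang write.
 */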
*/ if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, - zio->io_prop.zp_copies, zio, B_FALSE, more)) { + zio->io_allocator, zio->io_prop.zp_copies, zio->io_size, + B_FALSE, more)) { return (NULL); } + zio->io_flags |= ZIO_FLAG_ALLOC_THROTTLED; avl_remove(&mca->mca_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); @@ -4164,8 +4242,10 @@ zio_dva_allocate(zio_t *zio) ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_GANG); memcpy(zio->io_bp->blk_dva, zio->io_bp_orig.blk_dva, 3 * sizeof (dva_t)); - BP_SET_BIRTH(zio->io_bp, BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig), - BP_GET_PHYSICAL_BIRTH(&zio->io_bp_orig)); + BP_SET_LOGICAL_BIRTH(zio->io_bp, + BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig)); + BP_SET_PHYSICAL_BIRTH(zio->io_bp, + BP_GET_RAW_PHYSICAL_BIRTH(&zio->io_bp_orig)); return (zio); } @@ -4236,13 +4316,14 @@ again: * If we are holding old class reservation, drop it. * Dispatch the next ZIO(s) there if some are waiting. */ - if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) { if (metaslab_class_throttle_unreserve(mc, - zio->io_prop.zp_copies, zio)) { + zio->io_allocator, zio->io_prop.zp_copies, + zio->io_size)) { zio_allocate_dispatch(zio->io_metaslab_class, zio->io_allocator); } - zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; + zio->io_flags &= ~ZIO_FLAG_ALLOC_THROTTLED; } if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { @@ -4291,6 +4372,15 @@ again: error); } zio->io_error = error; + } else if (zio->io_prop.zp_rewrite) { + /* + * For rewrite operations, preserve the logical birth time + * but set the physical birth time to the current txg. + */ + uint64_t logical_birth = BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig); + ASSERT3U(logical_birth, <=, zio->io_txg); + BP_SET_BIRTH(zio->io_bp, logical_birth, zio->io_txg); + BP_SET_REWRITE(zio->io_bp, 1); } return (zio); @@ -4324,18 +4414,17 @@ zio_dva_claim(zio_t *zio) static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { - ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp)); + ASSERT(BP_GET_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) { - metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp), - B_TRUE); + metaslab_free(zio->io_spa, bp, BP_GET_BIRTH(bp), B_TRUE); } if (gn != NULL) { - for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) { zio_dva_unallocate(zio, gn->gn_child[g], - &gn->gn_gbh->zg_blkptr[g]); + gbh_bp(gn->gn_gbh, g)); } } } @@ -4347,7 +4436,7 @@ int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog) { - int error = 1; + int error; zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); @@ -4372,14 +4461,34 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, int allocator = (uint_t)cityhash1(os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; ZIOSTAT_BUMP(ziostat_total_allocations); + + /* Try log class (dedicated slog devices) first */ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, NULL); *slog = (error == 0); + + /* Try special_embedded_log class (reserved on special vdevs) */ + if (error != 0) { + error = metaslab_alloc(spa, spa_special_embedded_log_class(spa), + size, new_bp, 1, txg, NULL, flags, &io_alloc_list, + allocator, NULL); + } + + /* Try special class (general special vdev allocation) */ + if (error != 0) { + error = metaslab_alloc(spa, spa_special_class(spa), size, + new_bp, 1, 
txg, NULL, flags, &io_alloc_list, allocator, + NULL); + } + + /* Try embedded_log class (reserved on normal vdevs) */ if (error != 0) { error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator, NULL); } + + /* Finally fall back to normal class */ if (error != 0) { ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks); error = metaslab_alloc(spa, spa_normal_class(spa), size, @@ -4722,7 +4831,7 @@ zio_vdev_io_assess(zio_t *zio) * If a Direct I/O operation has a checksum verify error then this I/O * should not attempt to be issued again. */ - if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { + if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) { if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL); ASSERT3U(zio->io_error, ==, EIO); @@ -5031,7 +5140,7 @@ zio_checksum_verify(zio_t *zio) ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } - ASSERT0(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); + ASSERT0(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR); IMPLY(zio->io_flags & ZIO_FLAG_DIO_READ, !(zio->io_flags & ZIO_FLAG_SPECULATIVE)); @@ -5040,7 +5149,7 @@ zio_checksum_verify(zio_t *zio) if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { if (zio->io_flags & ZIO_FLAG_DIO_READ) { - zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR; zio_t *pio = zio_unique_parent(zio); /* * Any Direct I/O read that has a checksum @@ -5090,7 +5199,7 @@ zio_dio_checksum_verify(zio_t *zio) if ((error = zio_checksum_error(zio, NULL)) != 0) { zio->io_error = error; if (error == ECKSUM) { - zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR; zio_dio_chksum_verify_error_report(zio); } } @@ -5115,7 +5224,7 @@ zio_checksum_verified(zio_t *zio) void zio_dio_chksum_verify_error_report(zio_t *zio) { - ASSERT(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR); + ASSERT(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR); if (zio->io_child_type == ZIO_CHILD_LOGICAL) return; @@ -5187,7 +5296,7 @@ zio_ready(zio_t *zio) if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); - ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || + ASSERT(BP_GET_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); @@ -5202,7 +5311,7 @@ zio_ready(zio_t *zio) if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); @@ -5213,8 +5322,8 @@ zio_ready(zio_t *zio) * issue the next I/O to allocate. 
*/ if (metaslab_class_throttle_unreserve( - zio->io_metaslab_class, zio->io_prop.zp_copies, - zio)) { + zio->io_metaslab_class, zio->io_allocator, + zio->io_prop.zp_copies, zio->io_size)) { zio_allocate_dispatch(zio->io_metaslab_class, zio->io_allocator); } @@ -5264,6 +5373,7 @@ zio_dva_throttle_done(zio_t *zio) vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; const void *tag = pio; + uint64_t size = pio->io_size; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); @@ -5273,16 +5383,19 @@ zio_dva_throttle_done(zio_t *zio) ASSERT3P(vd, ==, vd->vdev_top); ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); - ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); + ASSERT(zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED); /* * Parents of gang children can have two flavors -- ones that allocated * the gang header (will have ZIO_FLAG_IO_REWRITE set) and ones that * allocated the constituent blocks. The first use their parent as tag. + * We set the size to match the original allocation call for that case. */ if (pio->io_child_type == ZIO_CHILD_GANG && - (pio->io_flags & ZIO_FLAG_IO_REWRITE)) + (pio->io_flags & ZIO_FLAG_IO_REWRITE)) { tag = zio_unique_parent(pio); + size = SPA_OLD_GANGBLOCKSIZE; + } ASSERT(IO_IS_ALLOCATING(pio) || (pio->io_child_type == ZIO_CHILD_GANG && (pio->io_flags & ZIO_FLAG_IO_REWRITE))); @@ -5295,9 +5408,10 @@ zio_dva_throttle_done(zio_t *zio) ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, - pio->io_allocator, flags, pio->io_size, tag); + pio->io_allocator, flags, size, tag); - if (metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio)) { + if (metaslab_class_throttle_unreserve(pio->io_metaslab_class, + pio->io_allocator, 1, pio->io_size)) { zio_allocate_dispatch(zio->io_metaslab_class, pio->io_allocator); } @@ -5328,7 +5442,7 @@ zio_done(zio_t *zio) * write. We must do this since the allocation is performed * by the logical I/O but the actual write is done by child I/Os. */ - if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && + if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED && zio->io_child_type == ZIO_CHILD_VDEV) zio_dva_throttle_done(zio); @@ -5337,8 +5451,6 @@ zio_done(zio_t *zio) ASSERT(zio->io_children[c][w] == 0); if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { - ASSERT(zio->io_bp->blk_pad[0] == 0); - ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); @@ -5431,7 +5543,7 @@ zio_done(zio_t *zio) */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd) && - !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { + !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) { int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); if (ret != EALREADY) { @@ -5446,14 +5558,14 @@ zio_done(zio_t *zio) if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && - !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) && + !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. 
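zio_alloc_zil() above now probes five allocation classes in a fixed order before giving up. The same order as a table walk (illustrative only: the class handles are reduced to an enum and metaslab_alloc() to an extern stub):

#include <stddef.h>

enum zil_alloc_class {
	CLASS_LOG,			/* dedicated slog devices */
	CLASS_SPECIAL_EMBEDDED_LOG,	/* log region reserved on special */
	CLASS_SPECIAL,			/* general special allocation */
	CLASS_EMBEDDED_LOG,		/* log region reserved on normal */
	CLASS_NORMAL,			/* last resort */
	CLASS_COUNT
};

extern int try_alloc(enum zil_alloc_class c);	/* metaslab_alloc() stub */

static int
alloc_zil_order(int *slog)
{
	static const enum zil_alloc_class order[CLASS_COUNT] = {
		CLASS_LOG, CLASS_SPECIAL_EMBEDDED_LOG, CLASS_SPECIAL,
		CLASS_EMBEDDED_LOG, CLASS_NORMAL,
	};
	int error = -1;

	for (size_t i = 0; i < CLASS_COUNT && error != 0; i++) {
		error = try_alloc(order[i]);
		if (i == 0)
			*slog = (error == 0);	/* only the log class counts */
	}
	return (error);
}

Note that *slog reflects only the first attempt, matching the patch: a ZIL block that lands on a special or normal vdev is not reported as a slog block.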
*/ spa_log_error(zio->io_spa, &zio->io_bookmark, - BP_GET_LOGICAL_BIRTH(zio->io_bp)); + BP_GET_PHYSICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } @@ -5467,7 +5579,7 @@ zio_done(zio_t *zio) */ if (zio->io_error == EAGAIN && IO_IS_ALLOCATING(zio) && zio->io_prop.zp_dedup) { - zio->io_reexecute |= ZIO_REEXECUTE_NOW; + zio->io_post |= ZIO_POST_REEXECUTE; zio->io_prop.zp_dedup = B_FALSE; } /* @@ -5479,11 +5591,11 @@ zio_done(zio_t *zio) if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL) && - !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { + !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) { if (zio->io_error != ENOSPC) - zio->io_reexecute |= ZIO_REEXECUTE_NOW; + zio->io_post |= ZIO_POST_REEXECUTE; else - zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + zio->io_post |= ZIO_POST_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || @@ -5492,10 +5604,11 @@ zio_done(zio_t *zio) zio->io_error == ENXIO && spa_load_state(zio->io_spa) == SPA_LOAD_NONE && spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) - zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + zio->io_post |= ZIO_POST_SUSPEND; - if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) - zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && + !(zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND))) + zio->io_post |= ZIO_POST_SUSPEND; /* * Here is a possibly good place to attempt to do @@ -5514,7 +5627,8 @@ zio_done(zio_t *zio) */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); - if ((zio->io_error || zio->io_reexecute) && + if ((zio->io_error || + (zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND))) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); @@ -5525,16 +5639,16 @@ zio_done(zio_t *zio) * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && - (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) - zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; + (zio->io_post & ZIO_POST_SUSPEND)) + zio->io_post &= ~ZIO_POST_SUSPEND; - if (zio->io_reexecute) { + if (zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND)) { /* * A Direct I/O operation that has a checksum verify error * should not attempt to reexecute. Instead, the error should * just be propagated back. */ - ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)); + ASSERT0(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR); /* * This is a logical I/O that wants to reexecute. @@ -5571,7 +5685,7 @@ zio_done(zio_t *zio) pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && - (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { + (zio->io_post & ZIO_POST_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); /* * This is a rare code path, so we don't @@ -5595,13 +5709,14 @@ zio_done(zio_t *zio) * "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); - } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { + } else if (zio->io_post & ZIO_POST_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); } else { + ASSERT(zio->io_post & ZIO_POST_REEXECUTE); /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. 
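The endgame of zio_done() above now branches on the combined post mask instead of a dedicated reexecute byte. Heavily condensed (the POST_* values stand in for the ZIO_POST_* bits and the three continuations are stubs), the control flow is roughly:

enum { POST_REEXECUTE = 1u << 0, POST_SUSPEND = 1u << 1 };

extern void reparent_children_to_godfather(void);
extern void suspend_pool(void);			/* zio_suspend() path */
extern void reexecute_via_claim_taskq(void);	/* zio_reexecute() path */

static void
zio_done_endgame(unsigned post, int parent_is_godfather)
{
	if ((post & (POST_REEXECUTE | POST_SUSPEND)) == 0)
		return;				/* normal completion */
	if (parent_is_godfather && (post & POST_SUSPEND))
		reparent_children_to_godfather();	/* godfathers never suspend */
	else if (post & POST_SUSPEND)
		suspend_pool();		/* wait for conditions to improve */
	else
		reexecute_via_claim_taskq();	/* retry off the hot path */
}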
@@ -5614,7 +5729,8 @@ zio_done(zio_t *zio) } ASSERT(list_is_empty(&zio->io_child_list)); - ASSERT(zio->io_reexecute == 0); + ASSERT0(zio->io_post & ZIO_POST_REEXECUTE); + ASSERT0(zio->io_post & ZIO_POST_SUSPEND); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* diff --git a/sys/contrib/openzfs/module/zfs/zio_checksum.c b/sys/contrib/openzfs/module/zfs/zio_checksum.c index a91775b04af2..63d0c6dadd46 100644 --- a/sys/contrib/openzfs/module/zfs/zio_checksum.c +++ b/sys/contrib/openzfs/module/zfs/zio_checksum.c @@ -279,7 +279,7 @@ static void zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) { const dva_t *dva = BP_IDENTITY(bp); - uint64_t txg = BP_GET_BIRTH(bp); + uint64_t txg = BP_GET_PHYSICAL_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); @@ -545,14 +545,39 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); int error; - uint64_t size = (bp == NULL ? zio->io_size : - (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); + uint64_t size = bp ? BP_GET_PSIZE(bp) : zio->io_size; uint64_t offset = zio->io_offset; abd_t *data = zio->io_abd; spa_t *spa = zio->io_spa; + if (bp && BP_IS_GANG(bp)) { + if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) + size = zio->io_size; + else + size = SPA_OLD_GANGBLOCKSIZE; + } + error = zio_checksum_error_impl(spa, bp, checksum, data, size, offset, info); + if (error && bp && BP_IS_GANG(bp) && size > SPA_OLD_GANGBLOCKSIZE) { + /* + * It's possible that this is an old gang block. Rerun + * the checksum with the old size; if that passes, then + * update the gangblocksize appropriately. + */ + error = zio_checksum_error_impl(spa, bp, checksum, data, + SPA_OLD_GANGBLOCKSIZE, offset, info); + if (error == 0) { + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + zio_t *pio; + for (pio = zio_unique_parent(zio); + pio->io_child_type != ZIO_CHILD_GANG; + pio = zio_unique_parent(pio)) + ; + zio_gang_node_t *gn = pio->io_private; + gn->gn_gangblocksize = SPA_OLD_GANGBLOCKSIZE; + } + } if (zio_injection_enabled && error == 0 && zio->io_error == 0) { error = zio_handle_fault_injection(zio, ECKSUM); diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index 3568d4f43fcb..7e264f308cf2 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -102,6 +102,7 @@ extern int zfs_bclone_wait_dirty; zv_taskq_t zvol_taskqs; typedef enum { + ZVOL_ASYNC_CREATE_MINORS, ZVOL_ASYNC_REMOVE_MINORS, ZVOL_ASYNC_RENAME_MINORS, ZVOL_ASYNC_SET_SNAPDEV, @@ -110,10 +111,14 @@ typedef enum { } zvol_async_op_t; typedef struct { - zvol_async_op_t op; - char name1[MAXNAMELEN]; - char name2[MAXNAMELEN]; - uint64_t value; + zvol_async_op_t zt_op; + char zt_name1[MAXNAMELEN]; + char zt_name2[MAXNAMELEN]; + uint64_t zt_value; + uint32_t zt_total; + uint32_t zt_done; + int32_t zt_status; + int zt_error; } zvol_task_t; zv_request_task_t * @@ -859,13 +864,8 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { }; /* - * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. - * - * We store data in the log buffers if it's small enough. - * Otherwise we will later flush the data out via dmu_sync(). + * zvol_log_write() handles TX_WRITE transactions. 
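The retry in zio_checksum_error() shown above is the read-side compatibility path for dynamic gang headers: verify at the size the feature flag implies, and only on failure re-verify at the legacy 512-byte size, shrinking the in-core gang node when the small header is what actually checks out. The skeleton of that idea (stub in place of zio_checksum_error_impl()):

#include <stdint.h>

#define OLD_GANGBLOCKSIZE	512

/* Stand-in for zio_checksum_error_impl(): 0 on success, ECKSUM otherwise. */
extern int verify_at(uint64_t size);

static int
verify_gang_header(uint64_t new_size, uint64_t *gn_gangblocksize)
{
	int error = verify_at(new_size);

	if (error && new_size > OLD_GANGBLOCKSIZE) {
		/* Possibly an old-style header: retry at the legacy size. */
		error = verify_at(OLD_GANGBLOCKSIZE);
		if (error == 0)
			*gn_gangblocksize = OLD_GANGBLOCKSIZE;
	}
	return (error);
}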
*/ -static const ssize_t zvol_immediate_write_sz = 32768; - void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, boolean_t commit) @@ -878,15 +878,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, if (zil_replaying(zilog, tx)) return; - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) - write_state = WR_INDIRECT; - else if (!spa_has_slogs(zilog->zl_spa) && - size >= blocksize && blocksize > zvol_immediate_write_sz) - write_state = WR_INDIRECT; - else if (commit) - write_state = WR_COPIED; - else - write_state = WR_NEED_COPY; + write_state = zil_write_state(zilog, size, blocksize, B_FALSE, commit); while (size) { itx_t *itx; @@ -1434,6 +1426,57 @@ zvol_create_minors_cb(const char *dsname, void *arg) return (0); } +static void +zvol_task_update_status(zvol_task_t *task, uint64_t total, uint64_t done, + int error) +{ + + task->zt_total += total; + task->zt_done += done; + if (task->zt_total != task->zt_done) { + task->zt_status = -1; + if (error) + task->zt_error = error; + } +} + +static const char * +zvol_task_op_msg(zvol_async_op_t op) +{ + switch (op) { + case ZVOL_ASYNC_CREATE_MINORS: + return ("create"); + case ZVOL_ASYNC_REMOVE_MINORS: + return ("remove"); + case ZVOL_ASYNC_RENAME_MINORS: + return ("rename"); + case ZVOL_ASYNC_SET_SNAPDEV: + case ZVOL_ASYNC_SET_VOLMODE: + return ("set property"); + default: + return ("unknown"); + } + + __builtin_unreachable(); + return (NULL); +} + +static void +zvol_task_report_status(zvol_task_t *task) +{ + + if (task->zt_status == 0) + return; + + if (task->zt_error) { + dprintf("The %s minors zvol task was not ok, last error %d\n", + zvol_task_op_msg(task->zt_op), task->zt_error); + } else { + dprintf("The %s minors zvol task was not ok\n", + zvol_task_op_msg(task->zt_op)); + } +} + /* * Create minors for the specified dataset, including children and snapshots. * Pay attention to the 'snapdev' property and iterate over the snapshots @@ -1451,14 +1494,27 @@ zvol_create_minors_cb(const char *dsname, void *arg) * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. */ -void -zvol_create_minors_recursive(const char *name) +static void +zvol_create_minors_impl(zvol_task_t *task) { + const char *name = task->zt_name1; list_t minors_list; minors_job_t *job; + uint64_t snapdev; + int total = 0, done = 0, last_error, error; - if (zvol_inhibit_dev) + /* + * Note: the dsl_pool_config_lock must not be held. + * Minor node creation needs to obtain the zvol_state_lock. + * zvol_open() obtains the zvol_state_lock and then the dsl pool + * config lock. Therefore, we can't have the config lock now if + * we are going to wait for the zvol_state_lock, because it + * would be a lock order inversion which could lead to deadlock. + */ + + if (zvol_inhibit_dev) { return; + } /* * This is the list for prefetch jobs. 
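The aggregation semantics of zvol_task_update_status() above are easiest to see on a toy run: zt_status latches nonzero as soon as done lags total, and zt_error keeps the most recent nonzero errno. A user-space rendition (assumed to mirror the kernel code; EEXIST chosen arbitrarily):

#include <stdint.h>
#include <stdio.h>

typedef struct {
	uint32_t zt_total, zt_done;
	int32_t zt_status;
	int zt_error;
} task_sketch_t;

static void
update_status(task_sketch_t *t, uint32_t total, uint32_t done, int error)
{
	t->zt_total += total;
	t->zt_done += done;
	if (t->zt_total != t->zt_done) {
		t->zt_status = -1;	/* partial failure, never cleared */
		if (error)
			t->zt_error = error;
	}
}

int
main(void)
{
	task_sketch_t t = {0};

	update_status(&t, 1, 1, 0);	/* one minor created */
	update_status(&t, 1, 0, 17);	/* one failed with EEXIST */
	printf("%u/%u status=%d err=%d\n", t.zt_done, t.zt_total,
	    t.zt_status, t.zt_error);	/* prints: 1/2 status=-1 err=17 */
	return (0);
}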
Whenever we found a match @@ -1474,13 +1530,16 @@ if (strchr(name, '@') != NULL) { - uint64_t snapdev; - - int error = dsl_prop_get_integer(name, "snapdev", - &snapdev, NULL); - - if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) - (void) zvol_os_create_minor(name); + error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); + if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) { + error = zvol_os_create_minor(name); + if (error == 0) { + done++; + } else { + last_error = error; + } + total++; + } } else { fstrans_cookie_t cookie = spl_fstrans_mark(); (void) dmu_objset_find(name, zvol_create_minors_cb, @@ -1495,41 +1554,30 @@ * sequentially. */ while ((job = list_remove_head(&minors_list)) != NULL) { - if (!job->error) - (void) zvol_os_create_minor(job->name); + if (!job->error) { + error = zvol_os_create_minor(job->name); + if (error == 0) { + done++; + } else { + last_error = error; + } + } else if (job->error == EINVAL) { + /* + * The objset with the name requested by the current job + * exists, but its type differs from zvol. + * Just ignore this sort of error. + */ + done++; + } else { + last_error = job->error; + } + total++; kmem_strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } list_destroy(&minors_list); -} - -void -zvol_create_minor(const char *name) -{ - /* - * Note: the dsl_pool_config_lock must not be held. - * Minor node creation needs to obtain the zvol_state_lock. - * zvol_open() obtains the zvol_state_lock and then the dsl pool - * config lock. Therefore, we can't have the config lock now if - * we are going to wait for the zvol_state_lock, because it - * would be a lock order inversion which could lead to deadlock. - */ - - if (zvol_inhibit_dev) - return; - - if (strchr(name, '@') != NULL) { - uint64_t snapdev; - - int error = dsl_prop_get_integer(name, - "snapdev", &snapdev, NULL); - - if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) - (void) zvol_os_create_minor(name); - } else { - (void) zvol_os_create_minor(name); - } + zvol_task_update_status(task, total, done, last_error); } /* @@ -1577,10 +1625,11 @@ zvol_free_task(void *arg) zvol_os_free(arg); } -void -zvol_remove_minors_impl(const char *name) +static void +zvol_remove_minors_impl(zvol_task_t *task) { zvol_state_t *zv, *zv_next; + const char *name = task ? task->zt_name1 : NULL; int namelen = ((name) ? strlen(name) : 0); taskqid_t t; list_t delay_list, free_list; @@ -1662,13 +1711,13 @@ } /* Remove minor for this specific volume only */ -static void +static int zvol_remove_minor_impl(const char *name) { zvol_state_t *zv = NULL, *zv_next; if (zvol_inhibit_dev) - return; + return (0); rw_enter(&zvol_state_lock, RW_WRITER); @@ -1684,7 +1733,7 @@ zvol_remove_minor_impl(const char *name) if (zv == NULL) { rw_exit(&zvol_state_lock); - return; + return (ENOENT); } ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1698,7 +1747,7 @@ zvol_remove_minor_impl(const char *name) mutex_exit(&zv->zv_state_lock); rw_exit(&zvol_state_lock); zvol_remove_minor_task(zv); - return; + return (0); } zvol_remove(zv); @@ -1708,16 +1757,20 @@ zvol_remove_minor_impl(const char *name) rw_exit(&zvol_state_lock); zvol_os_free(zv); + + return (0); } /* * Rename minors for specified dataset including children and snapshots.
*/ static void -zvol_rename_minors_impl(const char *oldname, const char *newname) +zvol_rename_minors_impl(zvol_task_t *task) { zvol_state_t *zv, *zv_next; - int oldnamelen; + const char *oldname = task->zt_name1; + const char *newname = task->zt_name2; + int total = 0, done = 0, last_error, error, oldnamelen; if (zvol_inhibit_dev) return; @@ -1732,24 +1785,31 @@ zvol_rename_minors_impl(const char *oldname, const char *newname) mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, oldname) == 0) { - zvol_os_rename_minor(zv, newname); + error = zvol_os_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { char *name = kmem_asprintf("%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); - zvol_os_rename_minor(zv, name); + error = zvol_os_rename_minor(zv, name); kmem_strfree(name); } - + if (error) { + last_error = error; + } else { + done++; + } + total++; mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); + zvol_task_update_status(task, total, done, last_error); } typedef struct zvol_snapdev_cb_arg { + zvol_task_t *task; uint64_t snapdev; } zvol_snapdev_cb_arg_t; @@ -1757,26 +1817,31 @@ static int zvol_set_snapdev_cb(const char *dsname, void *param) { zvol_snapdev_cb_arg_t *arg = param; + int error = 0; if (strchr(dsname, '@') == NULL) return (0); switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: - (void) zvol_os_create_minor(dsname); + error = zvol_os_create_minor(dsname); break; case ZFS_SNAPDEV_HIDDEN: - (void) zvol_remove_minor_impl(dsname); + error = zvol_remove_minor_impl(dsname); break; } + zvol_task_update_status(arg->task, 1, error == 0, error); return (0); } static void -zvol_set_snapdev_impl(char *name, uint64_t snapdev) +zvol_set_snapdev_impl(zvol_task_t *task) { - zvol_snapdev_cb_arg_t arg = {snapdev}; + const char *name = task->zt_name1; + uint64_t snapdev = task->zt_value; + + zvol_snapdev_cb_arg_t arg = {task, snapdev}; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * The zvol_set_snapdev_sync() sets snapdev appropriately @@ -1787,11 +1852,14 @@ zvol_set_snapdev_impl(char *name, uint64_t snapdev) } static void -zvol_set_volmode_impl(char *name, uint64_t volmode) +zvol_set_volmode_impl(zvol_task_t *task) { + const char *name = task->zt_name1; + uint64_t volmode = task->zt_value; fstrans_cookie_t cookie; uint64_t old_volmode; zvol_state_t *zv; + int error; if (strchr(name, '@') != NULL) return; @@ -1804,7 +1872,7 @@ zvol_set_volmode_impl(char *name, uint64_t volmode) */ zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL && volmode == ZFS_VOLMODE_NONE) - return; + return; if (zv != NULL) { old_volmode = zv->zv_volmode; mutex_exit(&zv->zv_state_lock); @@ -1815,51 +1883,34 @@ zvol_set_volmode_impl(char *name, uint64_t volmode) cookie = spl_fstrans_mark(); switch (volmode) { case ZFS_VOLMODE_NONE: - (void) zvol_remove_minor_impl(name); + error = zvol_remove_minor_impl(name); break; case ZFS_VOLMODE_GEOM: case ZFS_VOLMODE_DEV: - (void) zvol_remove_minor_impl(name); - (void) zvol_os_create_minor(name); + error = zvol_remove_minor_impl(name); + /* + * The remove minor function call above might not be + * needed if volmode was switched from the 'none' value. + * Ignore the error in this case.
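The GEOM/DEV arm above always tears down any existing minor before creating the new one, and treats ENOENT from the teardown as benign, since a previous volmode of 'none' had nothing to remove. The shape of that policy (stand-in stubs for zvol_remove_minor_impl() and zvol_os_create_minor()):

#include <errno.h>

extern int remove_minor(const char *name);	/* zvol_remove_minor_impl() */
extern int create_minor(const char *name);	/* zvol_os_create_minor() */

static int
apply_volmode(const char *name, int want_minor)
{
	int error = remove_minor(name);

	if (error == ENOENT)
		error = 0;		/* nothing to remove: not a failure */
	if (error == 0 && want_minor)
		error = create_minor(name);
	return (error);
}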
+ */ + if (error == ENOENT) + error = 0; + else if (error) + break; + error = zvol_os_create_minor(name); break; case ZFS_VOLMODE_DEFAULT: - (void) zvol_remove_minor_impl(name); + error = zvol_remove_minor_impl(name); if (zvol_volmode == ZFS_VOLMODE_NONE) break; else /* if zvol_volmode is invalid defaults to "geom" */ - (void) zvol_os_create_minor(name); + error = zvol_os_create_minor(name); break; } + zvol_task_update_status(task, 1, error == 0, error); spl_fstrans_unmark(cookie); } -static zvol_task_t * -zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, - uint64_t value) -{ - zvol_task_t *task; - - /* Never allow tasks on hidden names. */ - if (name1[0] == '$') - return (NULL); - - task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); - task->op = op; - task->value = value; - - strlcpy(task->name1, name1, sizeof (task->name1)); - if (name2 != NULL) - strlcpy(task->name2, name2, sizeof (task->name2)); - - return (task); -} - -static void -zvol_task_free(zvol_task_t *task) -{ - kmem_free(task, sizeof (zvol_task_t)); -} - /* * The worker thread function performed asynchronously. */ @@ -1868,25 +1919,29 @@ zvol_task_cb(void *arg) { zvol_task_t *task = arg; - switch (task->op) { + switch (task->zt_op) { + case ZVOL_ASYNC_CREATE_MINORS: + zvol_create_minors_impl(task); + break; case ZVOL_ASYNC_REMOVE_MINORS: - zvol_remove_minors_impl(task->name1); + zvol_remove_minors_impl(task); break; case ZVOL_ASYNC_RENAME_MINORS: - zvol_rename_minors_impl(task->name1, task->name2); + zvol_rename_minors_impl(task); break; case ZVOL_ASYNC_SET_SNAPDEV: - zvol_set_snapdev_impl(task->name1, task->value); + zvol_set_snapdev_impl(task); break; case ZVOL_ASYNC_SET_VOLMODE: - zvol_set_volmode_impl(task->name1, task->value); + zvol_set_volmode_impl(task); break; default: VERIFY(0); break; } - zvol_task_free(task); + zvol_task_report_status(task); + kmem_free(task, sizeof (zvol_task_t)); } typedef struct zvol_set_prop_int_arg { @@ -1931,23 +1986,17 @@ zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0) return (0); - switch (zsda->zsda_prop) { - case ZFS_PROP_VOLMODE: - task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, - NULL, prop); - break; - case ZFS_PROP_SNAPDEV: - task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, - NULL, prop); - break; - default: - task = NULL; - break; - } - - if (task == NULL) + task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); + if (zsda->zsda_prop == ZFS_PROP_VOLMODE) { + task->zt_op = ZVOL_ASYNC_SET_VOLMODE; + } else if (zsda->zsda_prop == ZFS_PROP_SNAPDEV) { + task->zt_op = ZVOL_ASYNC_SET_SNAPDEV; + } else { + kmem_free(task, sizeof (zvol_task_t)); return (0); - + } + task->zt_value = prop; + strlcpy(task->zt_name1, dsname, sizeof (task->zt_name1)); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); @@ -2001,15 +2050,34 @@ zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source, } void -zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) +zvol_create_minors(const char *name) { + spa_t *spa; zvol_task_t *task; taskqid_t id; - task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL); - if (task == NULL) + if (spa_open(name, &spa, FTAG) != 0) return; + task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); + task->zt_op = ZVOL_ASYNC_CREATE_MINORS; + strlcpy(task->zt_name1, name, sizeof (task->zt_name1)); + id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); + if (id != TASKQID_INVALID) + 
taskq_wait_id(spa->spa_zvol_taskq, id); + + spa_close(spa, FTAG); +} + +void +zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) +{ + zvol_task_t *task; + taskqid_t id; + + task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); + task->zt_op = ZVOL_ASYNC_REMOVE_MINORS; + strlcpy(task->zt_name1, name, sizeof (task->zt_name1)); id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); @@ -2022,10 +2090,10 @@ zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, zvol_task_t *task; taskqid_t id; - task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL); - if (task == NULL) - return; - + task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); + task->zt_op = ZVOL_ASYNC_RENAME_MINORS; + strlcpy(task->zt_name1, name1, sizeof (task->zt_name1)); + strlcpy(task->zt_name2, name2, sizeof (task->zt_name2)); id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id);
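All three entry points above share the same dispatch pattern: queue a zvol_task_t on the pool's zvol taskq, then optionally block on the taskq id to make the operation synchronous. A helper one could factor out (hypothetical; assumes the surrounding zvol.c context for the types and for zvol_task_cb()):

static void
zvol_dispatch_task(taskq_t *tq, zvol_task_t *task, boolean_t async)
{
	taskqid_t id = taskq_dispatch(tq, zvol_task_cb, task, TQ_SLEEP);

	/*
	 * TASKQID_INVALID means the dispatch itself failed; otherwise a
	 * synchronous caller waits until zvol_task_cb() has run (and freed
	 * the task), so the minors are in place on return.
	 */
	if (!async && id != TASKQID_INVALID)
		taskq_wait_id(tq, id);
}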