Diffstat (limited to 'sys/contrib/openzfs/module/zfs')
-rw-r--r--  sys/contrib/openzfs/module/zfs/arc.c | 32
-rw-r--r--  sys/contrib/openzfs/module/zfs/bpobj.c | 4
-rw-r--r--  sys/contrib/openzfs/module/zfs/brt.c | 70
-rw-r--r--  sys/contrib/openzfs/module/zfs/dbuf.c | 82
-rw-r--r--  sys/contrib/openzfs/module/zfs/ddt.c | 15
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu.c | 11
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_diff.c | 4
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_direct.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_objset.c | 8
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_recv.c | 8
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_redact.c | 4
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_send.c | 4
-rw-r--r--  sys/contrib/openzfs/module/zfs/dmu_traverse.c | 17
-rw-r--r--  sys/contrib/openzfs/module/zfs/dnode.c | 19
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_bookmark.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_crypt.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_dataset.c | 20
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_deadlist.c | 4
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_destroy.c | 12
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_pool.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/dsl_scan.c | 27
-rw-r--r--  sys/contrib/openzfs/module/zfs/metaslab.c | 182
-rw-r--r--  sys/contrib/openzfs/module/zfs/range_tree.c | 96
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa.c | 194
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_errlog.c | 4
-rw-r--r--  sys/contrib/openzfs/module/zfs/spa_misc.c | 11
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev.c | 57
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_indirect.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_initialize.c | 5
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_mirror.c | 4
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_raidz.c | 28
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_rebuild.c | 5
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_removal.c | 30
-rw-r--r--  sys/contrib/openzfs/module/zfs/vdev_trim.c | 24
-rw-r--r--  sys/contrib/openzfs/module/zfs/zap.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/zcp.c | 2
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_chksum.c | 69
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_crrd.c | 227
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_ioctl.c | 36
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_log.c | 16
-rw-r--r--  sys/contrib/openzfs/module/zfs/zfs_vnops.c | 31
-rw-r--r--  sys/contrib/openzfs/module/zfs/zil.c | 95
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio.c | 326
-rw-r--r--  sys/contrib/openzfs/module/zfs/zio_checksum.c | 31
-rw-r--r--  sys/contrib/openzfs/module/zfs/zvol.c | 342
45 files changed, 1557 insertions(+), 611 deletions(-)
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
index 04ca32356a6d..3483be64ec57 100644
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -1052,7 +1052,7 @@ static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
const dva_t *dva = BP_IDENTITY(bp);
- uint64_t birth = BP_GET_BIRTH(bp);
+ uint64_t birth = BP_GET_PHYSICAL_BIRTH(bp);
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
arc_buf_hdr_t *hdr;
@@ -2631,7 +2631,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
ARCSTAT_INCR(arcstat_bonus_size, space);
break;
case ARC_SPACE_DNODE:
- ARCSTAT_INCR(arcstat_dnode_size, space);
+ aggsum_add(&arc_sums.arcstat_dnode_size, space);
break;
case ARC_SPACE_DBUF:
ARCSTAT_INCR(arcstat_dbuf_size, space);
@@ -2677,7 +2677,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
ARCSTAT_INCR(arcstat_bonus_size, -space);
break;
case ARC_SPACE_DNODE:
- ARCSTAT_INCR(arcstat_dnode_size, -space);
+ aggsum_add(&arc_sums.arcstat_dnode_size, -space);
break;
case ARC_SPACE_DBUF:
ARCSTAT_INCR(arcstat_dbuf_size, -space);
@@ -4490,7 +4490,7 @@ arc_evict(void)
* target is not evictable or if they go over arc_dnode_limit.
*/
int64_t prune = 0;
- int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
+ int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size);
int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA])
+ zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA])
- zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA])
@@ -5082,11 +5082,13 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
* in the ARC. In practice, that's in the tens of MB, which is low
* enough to be safe.
*/
- int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
+ int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c -
zfs_max_recordsize;
+ int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) -
+ arc_dnode_limit;
/* Always allow at least one block of overflow. */
- if (over < 0)
+ if (arc_over < 0 && dn_over <= 0)
return (ARC_OVF_NONE);
/* If we are under memory pressure, report severe overflow. */
@@ -5097,7 +5099,7 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve)
int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2;
if (use_reserve)
overflow *= 3;
- return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
+ return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
}
static abd_t *
@@ -5585,7 +5587,7 @@ arc_read_done(zio_t *zio)
if (HDR_IN_HASH_TABLE(hdr)) {
arc_buf_hdr_t *found;
- ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp));
+ ASSERT3U(hdr->b_birth, ==, BP_GET_PHYSICAL_BIRTH(zio->io_bp));
ASSERT3U(hdr->b_dva.dva_word[0], ==,
BP_IDENTITY(zio->io_bp)->dva_word[0]);
ASSERT3U(hdr->b_dva.dva_word[1], ==,
@@ -5688,7 +5690,7 @@ arc_read_done(zio_t *zio)
error = SET_ERROR(EIO);
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
spa_log_error(zio->io_spa, &acb->acb_zb,
- BP_GET_LOGICAL_BIRTH(zio->io_bp));
+ BP_GET_PHYSICAL_BIRTH(zio->io_bp));
(void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION,
zio->io_spa, NULL, &acb->acb_zb, zio, 0);
@@ -6107,7 +6109,7 @@ top:
if (!embedded_bp) {
hdr->b_dva = *BP_IDENTITY(bp);
- hdr->b_birth = BP_GET_BIRTH(bp);
+ hdr->b_birth = BP_GET_PHYSICAL_BIRTH(bp);
exists = buf_hash_insert(hdr, &hash_lock);
}
if (exists != NULL) {
@@ -6955,7 +6957,7 @@ arc_write_done(zio_t *zio)
buf_discard_identity(hdr);
} else {
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
- hdr->b_birth = BP_GET_BIRTH(zio->io_bp);
+ hdr->b_birth = BP_GET_PHYSICAL_BIRTH(zio->io_bp);
}
} else {
ASSERT(HDR_EMPTY(hdr));
@@ -7326,7 +7328,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
#if defined(COMPAT_FREEBSD11)
as->arcstat_other_size.value.ui64 =
wmsum_value(&arc_sums.arcstat_bonus_size) +
- wmsum_value(&arc_sums.arcstat_dnode_size) +
+ aggsum_value(&arc_sums.arcstat_dnode_size) +
wmsum_value(&arc_sums.arcstat_dbuf_size);
#endif
@@ -7368,7 +7370,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
&as->arcstat_uncached_evictable_metadata);
as->arcstat_dnode_size.value.ui64 =
- wmsum_value(&arc_sums.arcstat_dnode_size);
+ aggsum_value(&arc_sums.arcstat_dnode_size);
as->arcstat_bonus_size.value.ui64 =
wmsum_value(&arc_sums.arcstat_bonus_size);
as->arcstat_l2_hits.value.ui64 =
@@ -7738,7 +7740,7 @@ arc_state_init(void)
wmsum_init(&arc_sums.arcstat_data_size, 0);
wmsum_init(&arc_sums.arcstat_metadata_size, 0);
wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
- wmsum_init(&arc_sums.arcstat_dnode_size, 0);
+ aggsum_init(&arc_sums.arcstat_dnode_size, 0);
wmsum_init(&arc_sums.arcstat_bonus_size, 0);
wmsum_init(&arc_sums.arcstat_l2_hits, 0);
wmsum_init(&arc_sums.arcstat_l2_misses, 0);
@@ -7897,7 +7899,7 @@ arc_state_fini(void)
wmsum_fini(&arc_sums.arcstat_data_size);
wmsum_fini(&arc_sums.arcstat_metadata_size);
wmsum_fini(&arc_sums.arcstat_dbuf_size);
- wmsum_fini(&arc_sums.arcstat_dnode_size);
+ aggsum_fini(&arc_sums.arcstat_dnode_size);
wmsum_fini(&arc_sums.arcstat_bonus_size);
wmsum_fini(&arc_sums.arcstat_l2_hits);
wmsum_fini(&arc_sums.arcstat_l2_misses);
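Note on the arc.c hunks above: arcstat_dnode_size moves from a wmsum to an aggsum because arc_is_overflowing() now needs a cheap, approximate read of the dnode size on a hot path. Below is a minimal sketch of the pattern, assuming the stock OpenZFS aggsum API (sys/aggsum.h); the example_* names are illustrative only.

    #include <sys/aggsum.h>

    static aggsum_t dnode_size_sum;	/* stands in for arcstat_dnode_size */

    static void
    example_init(void)
    {
        aggsum_init(&dnode_size_sum, 0);
    }

    static void
    example_space_consume(uint64_t space)
    {
        /* Per-CPU fast path, as in the ARC_SPACE_DNODE cases above. */
        aggsum_add(&dnode_size_sum, (int64_t)space);
    }

    static boolean_t
    example_dnode_over_limit(uint64_t limit)
    {
        /*
         * aggsum_lower_bound() reads a cached global component without
         * folding in every per-CPU bucket (wmsum_value() must sum them
         * all), so it is cheap enough for the arc_is_overflowing()
         * hot path.
         */
        return (aggsum_lower_bound(&dnode_size_sum) > (int64_t)limit);
    }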
diff --git a/sys/contrib/openzfs/module/zfs/bpobj.c b/sys/contrib/openzfs/module/zfs/bpobj.c
index 8c19de93f12f..0a8a077edf63 100644
--- a/sys/contrib/openzfs/module/zfs/bpobj.c
+++ b/sys/contrib/openzfs/module/zfs/bpobj.c
@@ -954,8 +954,8 @@ space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
(void) bp_freed, (void) tx;
struct space_range_arg *sra = arg;
- if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg &&
- BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) {
+ if (BP_GET_BIRTH(bp) > sra->mintxg &&
+ BP_GET_BIRTH(bp) <= sra->maxtxg) {
if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
sra->used += bp_get_dsize_sync(sra->spa, bp);
else
diff --git a/sys/contrib/openzfs/module/zfs/brt.c b/sys/contrib/openzfs/module/zfs/brt.c
index 27d9ed7ea2b0..40664354aa73 100644
--- a/sys/contrib/openzfs/module/zfs/brt.c
+++ b/sys/contrib/openzfs/module/zfs/brt.c
@@ -478,6 +478,18 @@ brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
BRT_DEBUG("Pool directory object created, object=%s", name);
+ /*
+ * Activate the endian-fixed feature if this is the first BRT ZAP
+ * (i.e., BLOCK_CLONING is not yet active) and the feature is enabled;
+ * if it is already active, bump its refcount for this new BRT ZAP.
+ */
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN) &&
+ !spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
+ spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
+ } else if (spa_feature_is_active(spa,
+ SPA_FEATURE_BLOCK_CLONING_ENDIAN)) {
+ spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
+ }
+
spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
}
@@ -658,6 +670,8 @@ brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
rw_exit(&brtvd->bv_lock);
spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
+ if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN))
+ spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN, tx);
}
static void
@@ -855,16 +869,29 @@ brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
*vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]);
}
+static boolean_t
+brt_has_endian_fixed(spa_t *spa)
+{
+ return (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING_ENDIAN));
+}
+
static int
-brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre)
+brt_entry_lookup(spa_t *spa, brt_vdev_t *brtvd, brt_entry_t *bre)
{
uint64_t off = BRE_OFFSET(bre);
if (brtvd->bv_mos_entries == 0)
return (SET_ERROR(ENOENT));
- return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
- &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count));
+ if (brt_has_endian_fixed(spa)) {
+ return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
+ &off, BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
+ &bre->bre_count));
+ } else {
+ return (zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
+ &off, BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
+ &bre->bre_count));
+ }
}
/*
@@ -1056,7 +1083,7 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp)
}
rw_exit(&brtvd->bv_lock);
- error = brt_entry_lookup(brtvd, &bre_search);
+ error = brt_entry_lookup(spa, brtvd, &bre_search);
/* bre_search now contains correct bre_count */
if (error == ENOENT) {
BRTSTAT_BUMP(brt_decref_no_entry);
@@ -1118,7 +1145,7 @@ brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
if (bre == NULL) {
rw_exit(&brtvd->bv_lock);
- error = brt_entry_lookup(brtvd, &bre_search);
+ error = brt_entry_lookup(spa, brtvd, &bre_search);
if (error == ENOENT) {
refcnt = 0;
} else {
@@ -1270,10 +1297,18 @@ brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
uint64_t off = BRE_OFFSET(bre);
if (brtvd->bv_mos_entries != 0 &&
brt_vdev_lookup(spa, brtvd, off)) {
- int error = zap_lookup_uint64_by_dnode(
- brtvd->bv_mos_entries_dnode, &off,
- BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
- &bre->bre_count);
+ int error;
+ if (brt_has_endian_fixed(spa)) {
+ error = zap_lookup_uint64_by_dnode(
+ brtvd->bv_mos_entries_dnode, &off,
+ BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
+ &bre->bre_count);
+ } else {
+ error = zap_lookup_uint64_by_dnode(
+ brtvd->bv_mos_entries_dnode, &off,
+ BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
+ &bre->bre_count);
+ }
if (error == 0) {
BRTSTAT_BUMP(brt_addref_entry_on_disk);
} else {
@@ -1326,7 +1361,7 @@ brt_pending_apply(spa_t *spa, uint64_t txg)
}
static void
-brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
+brt_sync_entry(spa_t *spa, dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
{
uint64_t off = BRE_OFFSET(bre);
@@ -1337,9 +1372,15 @@ brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
BRT_KEY_WORDS, tx);
VERIFY(error == 0 || error == ENOENT);
} else {
- VERIFY0(zap_update_uint64_by_dnode(dn, &off,
- BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
- &bre->bre_count, tx));
+ if (brt_has_endian_fixed(spa)) {
+ VERIFY0(zap_update_uint64_by_dnode(dn, &off,
+ BRT_KEY_WORDS, sizeof (bre->bre_count), 1,
+ &bre->bre_count, tx));
+ } else {
+ VERIFY0(zap_update_uint64_by_dnode(dn, &off,
+ BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
+ &bre->bre_count, tx));
+ }
}
}
@@ -1368,7 +1409,8 @@ brt_sync_table(spa_t *spa, dmu_tx_t *tx)
void *c = NULL;
while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
- brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx);
+ brt_sync_entry(spa, brtvd->bv_mos_entries_dnode, bre,
+ tx);
kmem_cache_free(brt_entry_cache, bre);
}
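Note on the BLOCK_CLONING_ENDIAN hunks above: the legacy BRT ZAP stored each refcount as eight 1-byte integers, which ZAP never byteswaps, so pools moved between hosts of opposite endianness misread the counts; the endian-fixed layout stores one 8-byte integer, which ZAP byteswaps on import. A sketch of the two encodings, mirroring the calls in this diff (the brt_entry_update() wrapper itself is hypothetical):

    static int
    brt_entry_update(spa_t *spa, dnode_t *dn, uint64_t *off,
        uint64_t *count, dmu_tx_t *tx)
    {
        if (brt_has_endian_fixed(spa)) {
            /* One 64-bit integer: byteswapped by ZAP as needed. */
            return (zap_update_uint64_by_dnode(dn, off,
                BRT_KEY_WORDS, sizeof (*count), 1, count, tx));
        }
        /* Eight 8-bit integers: raw bytes, never byteswapped. */
        return (zap_update_uint64_by_dnode(dn, off,
            BRT_KEY_WORDS, 1, sizeof (*count), count, tx));
    }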
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c
index f1b5a17f337e..432c99cec960 100644
--- a/sys/contrib/openzfs/module/zfs/dbuf.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@@ -866,8 +866,16 @@ dbuf_evict_notify(uint64_t size)
* and grabbing the lock results in massive lock contention.
*/
if (size > dbuf_cache_target_bytes()) {
- if (size > dbuf_cache_hiwater_bytes())
+ /*
+ * Avoid calling dbuf_evict_one() from memory reclaim context
+ * (e.g. Linux kswapd, FreeBSD pagedaemon) to prevent deadlocks.
+ * Memory reclaim threads can get stuck waiting for the dbuf
+ * hash lock.
+ */
+ if (size > dbuf_cache_hiwater_bytes() &&
+ !current_is_reclaim_thread()) {
dbuf_evict_one();
+ }
cv_signal(&dbuf_evict_cv);
}
}
@@ -1235,11 +1243,9 @@ dbuf_verify(dmu_buf_impl_t *db)
DVA_IS_EMPTY(&bp->blk_dva[1]) &&
DVA_IS_EMPTY(&bp->blk_dva[2]));
ASSERT0(bp->blk_fill);
- ASSERT0(bp->blk_pad[0]);
- ASSERT0(bp->blk_pad[1]);
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(BP_IS_HOLE(bp));
- ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
+ ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp));
}
}
}
@@ -1615,7 +1621,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,
*/
if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {
spa_log_error(db->db_objset->os_spa, &zb,
- BP_GET_LOGICAL_BIRTH(bp));
+ BP_GET_PHYSICAL_BIRTH(bp));
err = SET_ERROR(EIO);
goto early_unlock;
}
@@ -2154,6 +2160,12 @@ dbuf_redirty(dbuf_dirty_record_t *dr)
ASSERT(arc_released(db->db_buf));
arc_buf_thaw(db->db_buf);
}
+
+ /*
+ * Clear the rewrite flag since this is now a logical
+ * modification.
+ */
+ dr->dt.dl.dr_rewrite = B_FALSE;
}
}
@@ -2701,6 +2713,38 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
}
+void
+dmu_buf_will_rewrite(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+
+ /*
+ * If the dbuf is already dirty in this txg, it will be written
+ * anyway, so there's nothing to do.
+ */
+ mutex_enter(&db->db_mtx);
+ if (dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * The dbuf is not dirty, so we need to make it dirty and
+ * mark it for rewrite (preserve logical birth time).
+ */
+ dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
+
+ mutex_enter(&db->db_mtx);
+ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+ if (dr != NULL && db->db_level == 0)
+ dr->dt.dl.dr_rewrite = B_TRUE;
+ mutex_exit(&db->db_mtx);
+}
+
boolean_t
dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
@@ -4899,7 +4943,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
zio->io_prev_space_delta = delta;
- if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
+ if (BP_GET_BIRTH(bp) != 0) {
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
@@ -5186,7 +5230,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
drica.drica_os = dn->dn_objset;
- drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
+ drica.drica_blk_birth = BP_GET_BIRTH(bp);
drica.drica_tx = tx;
if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
&drica)) {
@@ -5201,8 +5245,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
if (dn->dn_objset != spa_meta_objset(spa)) {
dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
- BP_GET_LOGICAL_BIRTH(bp) >
- ds->ds_dir->dd_origin_txg) {
+ BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
@@ -5320,7 +5363,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
}
ASSERT(db->db_level == 0 || data == db->db_buf);
- ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
+ ASSERT3U(BP_GET_BIRTH(db->db_blkptr), <=, txg);
ASSERT(pio);
SET_BOOKMARK(&zb, os->os_dsl_dataset ?
@@ -5334,6 +5377,24 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
/*
+ * Set rewrite properties for zfs_rewrite() operations.
+ */
+ if (db->db_level == 0 && dr->dt.dl.dr_rewrite) {
+ zp.zp_rewrite = B_TRUE;
+
+ /*
+ * Mark physical rewrite feature for activation.
+ * This will be activated automatically during dataset sync.
+ */
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ if (!dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_PHYSICAL_REWRITE)) {
+ ds->ds_feature_activation[
+ SPA_FEATURE_PHYSICAL_REWRITE] = (void *)B_TRUE;
+ }
+ }
+
+ /*
* We copy the blkptr now (rather than when we instantiate the dirty
* record), because its value can change between open context and
* syncing context. We do not need to hold dn_struct_rwlock to read
@@ -5403,6 +5464,7 @@ EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
+EXPORT_SYMBOL(dmu_buf_will_rewrite);
EXPORT_SYMBOL(dmu_buf_is_dirty);
EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
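Note on dmu_buf_will_rewrite() above: it dirties a buffer whose contents are unchanged, flagging the level-0 dirty record dr_rewrite so the block keeps its logical birth time while getting a new physical location. A hypothetical caller, sketching what a zfs_rewrite()-style loop might look like (illustrative only, not the actual caller):

    static int
    rewrite_range(objset_t *os, uint64_t object, uint64_t off,
        uint64_t len, dmu_tx_t *tx)
    {
        dmu_buf_t **dbp;
        int numbufs, err;

        /* Hold (and read) the buffers; their data is kept as-is. */
        err = dmu_buf_hold_array(os, object, off, len, TRUE, FTAG,
            &numbufs, &dbp);
        if (err != 0)
            return (err);
        for (int i = 0; i < numbufs; i++)
            dmu_buf_will_rewrite(dbp[i], tx);
        dmu_buf_rele_array(dbp, numbufs, FTAG);
        return (0);
    }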
diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c
index 60cbb7755a7e..e0b9fc3951ff 100644
--- a/sys/contrib/openzfs/module/zfs/ddt.c
+++ b/sys/contrib/openzfs/module/zfs/ddt.c
@@ -724,10 +724,13 @@ ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp)
dvas[2] = bp->blk_dva[2];
if (ddt_phys_birth(ddp, v) == 0) {
- if (v == DDT_PHYS_FLAT)
- ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp);
- else
- ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp);
+ if (v == DDT_PHYS_FLAT) {
+ ddp->ddp_flat.ddp_phys_birth =
+ BP_GET_PHYSICAL_BIRTH(bp);
+ } else {
+ ddp->ddp_trad[v].ddp_phys_birth =
+ BP_GET_PHYSICAL_BIRTH(bp);
+ }
}
}
@@ -891,14 +894,14 @@ ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp)
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) &&
- BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
+ BP_GET_PHYSICAL_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) {
return (DDT_PHYS_FLAT);
}
} else /* traditional phys */ {
for (int p = 0; p < DDT_PHYS_MAX; p++) {
if (DVA_EQUAL(BP_IDENTITY(bp),
&ddp->ddp_trad[p].ddp_dva[0]) &&
- BP_GET_BIRTH(bp) ==
+ BP_GET_PHYSICAL_BIRTH(bp) ==
ddp->ddp_trad[p].ddp_phys_birth) {
return (p);
}
diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c
index 21c465328134..296e58ef9cd8 100644
--- a/sys/contrib/openzfs/module/zfs/dmu.c
+++ b/sys/contrib/openzfs/module/zfs/dmu.c
@@ -1966,7 +1966,7 @@ dmu_sync_late_arrival_done(zio_t *zio)
blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
- ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg);
+ ASSERT(BP_GET_BIRTH(zio->io_bp) == zio->io_txg);
ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
}
@@ -2508,6 +2508,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_encrypt = encrypt;
zp->zp_byteorder = ZFS_HOST_BYTEORDER;
zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE;
+ zp->zp_rewrite = B_FALSE;
memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
@@ -2655,11 +2656,12 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
* operation into ZIL, or it may be impossible to replay, since
* the block may appear not yet allocated at that point.
*/
- if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
+ if (BP_GET_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
error = SET_ERROR(EINVAL);
goto out;
}
- if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
+ if (BP_GET_PHYSICAL_BIRTH(bp) >
+ spa_last_synced_txg(os->os_spa)) {
error = SET_ERROR(EAGAIN);
goto out;
}
@@ -2731,7 +2733,8 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) {
if (!BP_IS_EMBEDDED(bp)) {
BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg,
- BP_GET_BIRTH(bp));
+ BP_GET_PHYSICAL_BIRTH(bp));
+ BP_SET_REWRITE(&dl->dr_overridden_by, 0);
} else {
BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by,
dr->dr_txg);
diff --git a/sys/contrib/openzfs/module/zfs/dmu_diff.c b/sys/contrib/openzfs/module/zfs/dmu_diff.c
index 86f751e886c9..fb13b2f87f57 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_diff.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_diff.c
@@ -224,8 +224,8 @@ dmu_diff(const char *tosnap_name, const char *fromsnap_name,
* call the ZFS_IOC_OBJ_TO_STATS ioctl.
*/
error = traverse_dataset(tosnap, fromtxg,
- TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT,
- diff_cb, &da);
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT |
+ TRAVERSE_LOGICAL, diff_cb, &da);
if (error != 0) {
da.da_err = error;
diff --git a/sys/contrib/openzfs/module/zfs/dmu_direct.c b/sys/contrib/openzfs/module/zfs/dmu_direct.c
index 12b0ffa2c99b..930ff101eca3 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_direct.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_direct.c
@@ -104,7 +104,7 @@ dmu_write_direct_done(zio_t *zio)
dmu_sync_done(zio, NULL, zio->io_private);
if (zio->io_error != 0) {
- if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
+ if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
ASSERT3U(zio->io_error, ==, EIO);
/*
diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c
index b3f792e4ae6b..c135f620800f 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_objset.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c
@@ -345,12 +345,6 @@ smallblk_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
- /*
- * Inheritance and range checking should have been done by now.
- */
- ASSERT(newval <= SPA_MAXBLOCKSIZE);
- ASSERT(ISP2(newval));
-
os->os_zpl_special_smallblock = newval;
}
@@ -1376,7 +1370,7 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
6, ZFS_SPACE_CHECK_NORMAL);
if (rv == 0)
- zvol_create_minor(name);
+ zvol_create_minors(name);
crfree(cr);
diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c
index 3a4bd7a1cea9..73227b58c140 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_recv.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c
@@ -1403,7 +1403,7 @@ corrective_read_done(zio_t *zio)
/* Corruption corrected; update error log if needed */
if (zio->io_error == 0) {
spa_remove_error(data->spa, &data->zb,
- BP_GET_LOGICAL_BIRTH(zio->io_bp));
+ BP_GET_PHYSICAL_BIRTH(zio->io_bp));
}
kmem_free(data, sizeof (cr_cb_data_t));
abd_free(zio->io_abd);
@@ -1530,7 +1530,7 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
}
rrd->abd = abd;
- io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp,
+ io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_BIRTH(bp), bp,
abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags,
&zb);
@@ -3831,11 +3831,11 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
nvlist_free(drc->drc_keynvl);
} else if (!drc->drc_heal) {
if (drc->drc_newfs) {
- zvol_create_minor(drc->drc_tofs);
+ zvol_create_minors(drc->drc_tofs);
}
char *snapname = kmem_asprintf("%s@%s",
drc->drc_tofs, drc->drc_tosnap);
- zvol_create_minor(snapname);
+ zvol_create_minors(snapname);
kmem_strfree(snapname);
}
diff --git a/sys/contrib/openzfs/module/zfs/dmu_redact.c b/sys/contrib/openzfs/module/zfs/dmu_redact.c
index 65443d112f27..9226ac9e4b80 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_redact.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_redact.c
@@ -370,8 +370,8 @@ redact_traverse_thread(void *arg)
#endif
err = traverse_dataset_resume(rt_arg->ds, rt_arg->txg,
- &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
- redact_cb, rt_arg);
+ &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
+ TRAVERSE_LOGICAL, redact_cb, rt_arg);
if (err != EINTR)
rt_arg->error_code = err;
diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c
index 4f27f3df0e55..deeba29e159a 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_send.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_send.c
@@ -1084,7 +1084,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/
if (sta->os->os_encrypted &&
!BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
- spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
+ spa_log_error(spa, zb, BP_GET_PHYSICAL_BIRTH(bp));
return (SET_ERROR(EIO));
}
@@ -1210,7 +1210,7 @@ send_traverse_thread(void *arg)
err = traverse_dataset_resume(st_arg->os->os_dsl_dataset,
st_arg->fromtxg, &st_arg->resume,
- st_arg->flags, send_cb, st_arg);
+ st_arg->flags | TRAVERSE_LOGICAL, send_cb, st_arg);
if (err != EINTR)
st_arg->error_code = err;
diff --git a/sys/contrib/openzfs/module/zfs/dmu_traverse.c b/sys/contrib/openzfs/module/zfs/dmu_traverse.c
index f534a7dd64e3..dd1df1705040 100644
--- a/sys/contrib/openzfs/module/zfs/dmu_traverse.c
+++ b/sys/contrib/openzfs/module/zfs/dmu_traverse.c
@@ -74,6 +74,15 @@ static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp,
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
uint64_t objset, uint64_t object);
+static inline uint64_t
+get_birth_time(traverse_data_t *td, const blkptr_t *bp)
+{
+ if (td->td_flags & TRAVERSE_LOGICAL)
+ return (BP_GET_LOGICAL_BIRTH(bp));
+ else
+ return (BP_GET_BIRTH(bp));
+}
+
static int
traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
uint64_t claim_txg)
@@ -85,7 +94,7 @@ traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
return (0);
if (claim_txg == 0 &&
- BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa))
+ get_birth_time(td, bp) >= spa_min_claim_txg(td->td_spa))
return (-1);
SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -110,7 +119,7 @@ traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
if (BP_IS_HOLE(bp))
return (0);
- if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
+ if (claim_txg == 0 || get_birth_time(td, bp) < claim_txg)
return (0);
ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
@@ -194,7 +203,7 @@ traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
*/
if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE)
return (B_FALSE);
- if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg)
+ if (BP_IS_HOLE(bp) || get_birth_time(td, bp) <= td->td_min_txg)
return (B_FALSE);
if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
return (B_FALSE);
@@ -265,7 +274,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
zb->zb_object == DMU_META_DNODE_OBJECT) &&
td->td_hole_birth_enabled_txg <= td->td_min_txg)
return (0);
- } else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) {
+ } else if (get_birth_time(td, bp) <= td->td_min_txg) {
return (0);
}
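Note on TRAVERSE_LOGICAL above: get_birth_time() lets a caller choose which birth time drives traversal pruning. Content-comparing callers (dmu_diff, redact, send) pass the flag so physically rewritten but logically unchanged blocks are skipped; scrub-style callers omit it and see physical births. A minimal sketch of a diff-style caller, assuming the traverse_dataset() signature used in dmu_diff.c above:

    static int
    content_diff_traverse(dsl_dataset_t *tosnap, uint64_t fromtxg,
        blkptr_cb_t func, void *arg)
    {
        /*
         * With TRAVERSE_LOGICAL, blocks rewritten in place (new
         * physical birth, same logical birth) do not show up as
         * differences.
         */
        return (traverse_dataset(tosnap, fromtxg,
            TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
            TRAVERSE_LOGICAL, func, arg));
    }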
diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c
index 904a039edf95..451e1533efa0 100644
--- a/sys/contrib/openzfs/module/zfs/dnode.c
+++ b/sys/contrib/openzfs/module/zfs/dnode.c
@@ -86,6 +86,19 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
#endif /* _KERNEL */
+static char *
+rt_name(dnode_t *dn, const char *name)
+{
+ struct objset *os = dn->dn_objset;
+
+ return (kmem_asprintf("{spa=%s objset=%llu obj=%llu %s}",
+ spa_name(os->os_spa),
+ (u_longlong_t)(os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET),
+ (u_longlong_t)dn->dn_object,
+ name));
+}
+
static int
dbuf_compare(const void *x1, const void *x2)
{
@@ -2436,8 +2449,10 @@ done:
{
int txgoff = tx->tx_txg & TXG_MASK;
if (dn->dn_free_ranges[txgoff] == NULL) {
- dn->dn_free_ranges[txgoff] = zfs_range_tree_create(NULL,
- ZFS_RANGE_SEG64, NULL, 0, 0);
+ dn->dn_free_ranges[txgoff] =
+ zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, rt_name(dn, "dn_free_ranges"));
}
zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
diff --git a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
index e301fe19f645..fdc8b7b198f0 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
@@ -1523,7 +1523,7 @@ dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
* If the block was live (referenced) at the time of this
* bookmark, add its space to the bookmark's FBN.
*/
- if (BP_GET_LOGICAL_BIRTH(bp) <=
+ if (BP_GET_BIRTH(bp) <=
dbn->dbn_phys.zbm_creation_txg &&
(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
mutex_enter(&dbn->dbn_lock);
diff --git a/sys/contrib/openzfs/module/zfs/dsl_crypt.c b/sys/contrib/openzfs/module/zfs/dsl_crypt.c
index db568f42d24e..6b6bb8d45b6b 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_crypt.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_crypt.c
@@ -866,7 +866,7 @@ spa_keystore_load_wkey(const char *dsname, dsl_crypto_params_t *dcp,
dsl_pool_rele(dp, FTAG);
/* create any zvols under this ds */
- zvol_create_minors_recursive(dsname);
+ zvol_create_minors(dsname);
return (0);
diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
index c0a7872c40ad..b767c9641419 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
@@ -159,7 +159,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
return;
}
- ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
+ ASSERT3U(BP_GET_BIRTH(bp), >,
dsl_dataset_phys(ds)->ds_prev_snap_txg);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
mutex_enter(&ds->ds_lock);
@@ -194,7 +194,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
* they do not need to be freed.
*/
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
- BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
+ BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
!(BP_IS_EMBEDDED(bp))) {
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
@@ -263,7 +263,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
return (0);
ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(BP_GET_LOGICAL_BIRTH(bp) <= tx->tx_txg);
+ ASSERT(BP_GET_BIRTH(bp) <= tx->tx_txg);
if (ds == NULL) {
dsl_free(tx->tx_pool, tx->tx_txg, bp);
@@ -281,7 +281,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
* they do not need to be freed.
*/
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
- BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
+ BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
!(BP_IS_EMBEDDED(bp))) {
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
@@ -289,7 +289,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
bplist_append(&ds->ds_dir->dd_pending_frees, bp);
}
- if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ if (BP_GET_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
int64_t delta;
/*
@@ -346,14 +346,14 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
/* if (logical birth > prev prev snap txg) prev unique += bs */
if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
- ds->ds_object && BP_GET_LOGICAL_BIRTH(bp) >
+ ds->ds_object && BP_GET_BIRTH(bp) >
dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
mutex_enter(&ds->ds_prev->ds_lock);
dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
mutex_exit(&ds->ds_prev->ds_lock);
}
- if (BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
+ if (BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
dsl_dir_transfer_space(ds->ds_dir, used,
DD_USED_HEAD, DD_USED_SNAP, tx);
}
@@ -2005,7 +2005,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
if (error == 0) {
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
- zvol_create_minor(nvpair_name(pair));
+ zvol_create_minors(nvpair_name(pair));
}
}
@@ -2944,7 +2944,7 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
if (snap == NULL)
return (B_FALSE);
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- birth = BP_GET_LOGICAL_BIRTH(dsl_dataset_get_blkptr(ds));
+ birth = BP_GET_BIRTH(dsl_dataset_get_blkptr(ds));
rrw_exit(&ds->ds_bp_rwlock, FTAG);
if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
objset_t *os, *os_snap;
@@ -3413,7 +3413,7 @@ dsl_dataset_clone(const char *clone, const char *origin)
6, ZFS_SPACE_CHECK_NORMAL);
if (rv == 0)
- zvol_create_minor(clone);
+ zvol_create_minors(clone);
crfree(cr);
diff --git a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
index 3113d932fb68..9ffc998ac173 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
@@ -484,7 +484,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);
- dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp);
+ dle_tofind.dle_mintxg = BP_GET_BIRTH(bp);
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
if (dle == NULL)
dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
@@ -493,7 +493,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
if (dle == NULL) {
zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",
- bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp));
+ bp, (longlong_t)BP_GET_BIRTH(bp));
dle = avl_first(&dl->dl_tree);
}
diff --git a/sys/contrib/openzfs/module/zfs/dsl_destroy.c b/sys/contrib/openzfs/module/zfs/dsl_destroy.c
index f5ec93b2dc5c..fff49c97f4d2 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_destroy.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_destroy.c
@@ -133,11 +133,11 @@ process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
ASSERT(!BP_IS_HOLE(bp));
- if (BP_GET_LOGICAL_BIRTH(bp) <=
+ if (BP_GET_BIRTH(bp) <=
dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
if (poa->ds_prev && !poa->after_branch_point &&
- BP_GET_LOGICAL_BIRTH(bp) >
+ BP_GET_BIRTH(bp) >
dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
bp_get_dsize_sync(dp->dp_spa, bp);
@@ -315,8 +315,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
- tx->tx_txg);
+ ASSERT3U(BP_GET_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, tx->tx_txg);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
ASSERT(zfs_refcount_is_zero(&ds->ds_longholds));
@@ -730,7 +729,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
} else {
ASSERT(zilog == NULL);
- ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
+ ASSERT3U(BP_GET_BIRTH(bp), >,
dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
}
@@ -1020,8 +1019,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
ASSERT(ds->ds_prev == NULL ||
dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
- tx->tx_txg);
+ ASSERT3U(BP_GET_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, tx->tx_txg);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c
index f1088d87208b..4f1f66b835f2 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_pool.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c
@@ -1056,7 +1056,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
* will be wrong.
*/
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
- ASSERT0(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(prev)->ds_bp));
+ ASSERT0(BP_GET_BIRTH(&dsl_dataset_phys(prev)->ds_bp));
rrw_exit(&ds->ds_bp_rwlock, FTAG);
/* The origin doesn't get attached to itself */
diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c
index 1b2cd3e361d1..5052992d775c 100644
--- a/sys/contrib/openzfs/module/zfs/dsl_scan.c
+++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c
@@ -454,7 +454,7 @@ static inline void
bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
{
sio->sio_blk_prop = bp->blk_prop;
- sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp);
+ sio->sio_phys_birth = BP_GET_RAW_PHYSICAL_BIRTH(bp);
sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp);
sio->sio_cksum = bp->blk_cksum;
sio->sio_nr_dvas = BP_GET_NDVAS(bp);
@@ -1768,7 +1768,7 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
ASSERT(!BP_IS_REDACTED(bp));
if (BP_IS_HOLE(bp) ||
- BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
+ BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
return (0);
/*
@@ -1778,7 +1778,7 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
* scrub there's nothing to do to it).
*/
if (claim_txg == 0 &&
- BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa))
+ BP_GET_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa))
return (0);
SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
@@ -1804,7 +1804,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
ASSERT(!BP_IS_REDACTED(bp));
if (BP_IS_HOLE(bp) ||
- BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
+ BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
return (0);
/*
@@ -1812,7 +1812,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
* already txg sync'ed (but this log block contains
* other records that are not synced)
*/
- if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
+ if (claim_txg == 0 || BP_GET_BIRTH(bp) < claim_txg)
return (0);
ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
@@ -1952,7 +1952,7 @@ dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
return;
if (BP_IS_HOLE(bp) ||
- BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg ||
+ BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg ||
(BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
BP_GET_TYPE(bp) != DMU_OT_OBJSET))
return;
@@ -2223,7 +2223,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
if (dnp != NULL &&
dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) {
scn->scn_phys.scn_errors++;
- spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
+ spa_log_error(spa, zb, BP_GET_PHYSICAL_BIRTH(bp));
return (SET_ERROR(EINVAL));
}
@@ -2319,7 +2319,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
* by arc_read() for the cases above.
*/
scn->scn_phys.scn_errors++;
- spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
+ spa_log_error(spa, zb, BP_GET_PHYSICAL_BIRTH(bp));
return (SET_ERROR(EINVAL));
}
@@ -2396,7 +2396,12 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
if (f != SPA_FEATURE_NONE)
ASSERT(dsl_dataset_feature_is_active(ds, f));
- if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) {
+ /*
+ * Recurse any blocks that were written either logically or physically
+ * at or after cur_min_txg. Logical birth matters for traversal
+ * (finding any change), while physical birth matters for the
+ * actual scan.
+ */
+ if (BP_GET_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) {
scn->scn_lt_min_this_txg++;
return;
}
@@ -2422,7 +2427,7 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
* Don't scan it now unless we need to because something
* under it was modified.
*/
- if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
+ if (BP_GET_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
scn->scn_gt_max_this_txg++;
return;
}
@@ -4806,7 +4811,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
{
dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa;
- uint64_t phys_birth = BP_GET_BIRTH(bp);
+ uint64_t phys_birth = BP_GET_PHYSICAL_BIRTH(bp);
size_t psize = BP_GET_PSIZE(bp);
boolean_t needs_io = B_FALSE;
int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
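For readers following the BP_GET_*BIRTH churn throughout this diff, the apparent semantics, inferred from the call sites here (an assumption, not authoritative):

    /*
     * BP_GET_LOGICAL_BIRTH(bp)       txg in which the data last changed
     *                                logically
     * BP_GET_RAW_PHYSICAL_BIRTH(bp)  raw on-disk field; 0 unless the
     *                                block was physically rewritten
     * BP_GET_PHYSICAL_BIRTH(bp)      txg in which this on-disk copy was
     *                                actually written (falling back to
     *                                the logical birth when the raw
     *                                field is 0)
     * BP_GET_BIRTH(bp)               the newer of the two, i.e. the
     *                                last logical or physical write
     *
     * Hence dsl_scan_visitbp() recurses on BP_GET_BIRTH() (any change),
     * while scrub I/O decisions key on BP_GET_PHYSICAL_BIRTH() (the
     * copy on disk).
     */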
diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c
index 43b94eba2d58..0e5f09b2724c 100644
--- a/sys/contrib/openzfs/module/zfs/metaslab.c
+++ b/sys/contrib/openzfs/module/zfs/metaslab.c
@@ -375,6 +375,16 @@ static metaslab_stats_t metaslab_stats = {
#define METASLABSTAT_BUMP(stat) \
atomic_inc_64(&metaslab_stats.stat.value.ui64);
+char *
+metaslab_rt_name(metaslab_group_t *mg, metaslab_t *ms, const char *name)
+{
+ return (kmem_asprintf("{spa=%s vdev_guid=%llu ms_id=%llu %s}",
+ spa_name(mg->mg_vd->vdev_spa),
+ (u_longlong_t)mg->mg_vd->vdev_guid,
+ (u_longlong_t)ms->ms_id,
+ name));
+}
+
static kstat_t *metaslab_ksp;
@@ -750,7 +760,8 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
}
IMPLY(mg == mg->mg_vd->vdev_log_mg,
- mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
+ mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));
for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++)
mc_hist[i] += mg->mg_histogram[i];
@@ -1183,14 +1194,16 @@ metaslab_group_passivate(metaslab_group_t *mg)
if (msp != NULL) {
mutex_enter(&msp->ms_lock);
metaslab_passivate(msp,
- metaslab_weight_from_range_tree(msp));
+ metaslab_weight(msp, B_TRUE) &
+ ~METASLAB_ACTIVE_MASK);
mutex_exit(&msp->ms_lock);
}
msp = mga->mga_secondary;
if (msp != NULL) {
mutex_enter(&msp->ms_lock);
metaslab_passivate(msp,
- metaslab_weight_from_range_tree(msp));
+ metaslab_weight(msp, B_TRUE) &
+ ~METASLAB_ACTIVE_MASK);
mutex_exit(&msp->ms_lock);
}
}
@@ -1288,7 +1301,8 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
mutex_enter(&mc->mc_lock);
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
IMPLY(mg == mg->mg_vd->vdev_log_mg,
- mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
+ mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));
mg->mg_histogram[i + ashift] +=
msp->ms_sm->sm_phys->smp_histogram[i];
mc->mc_histogram[i + ashift] +=
@@ -1316,7 +1330,8 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
ASSERT3U(mc->mc_histogram[i + ashift], >=,
msp->ms_sm->sm_phys->smp_histogram[i]);
IMPLY(mg == mg->mg_vd->vdev_log_mg,
- mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
+ mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));
mg->mg_histogram[i + ashift] -=
msp->ms_sm->sm_phys->smp_histogram[i];
@@ -2895,30 +2910,43 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
zfs_range_seg_type_t type =
metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
- ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start,
- shift);
+ ms->ms_allocatable = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_allocatable"));
for (int t = 0; t < TXG_SIZE; t++) {
- ms->ms_allocating[t] = zfs_range_tree_create(NULL, type,
- NULL, start, shift);
- }
- ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift);
- ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift);
+ ms->ms_allocating[t] = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME,
+ metaslab_rt_name(mg, ms, "ms_allocating"));
+ }
+ ms->ms_freeing = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freeing"));
+ ms->ms_freed = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freed"));
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL,
- start, shift);
+ ms->ms_defer[t] = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_defer"));
}
- ms->ms_checkpointing =
- zfs_range_tree_create(NULL, type, NULL, start, shift);
- ms->ms_unflushed_allocs =
- zfs_range_tree_create(NULL, type, NULL, start, shift);
+ ms->ms_checkpointing = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_checkpointing"));
+ ms->ms_unflushed_allocs = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_allocs"));
metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
mrap->mra_floor_shift = metaslab_by_size_min_shift;
- ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops,
- type, mrap, start, shift);
+ ms->ms_unflushed_frees = zfs_range_tree_create_flags(
+ &metaslab_rt_ops, type, mrap, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_frees"));
- ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift);
+ ms->ms_trim = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_trim"));
metaslab_group_add(mg, ms);
metaslab_set_fragmentation(ms, B_FALSE);
@@ -3892,7 +3920,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
&start, &shift);
- condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift);
+ condense_tree = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME,
+ metaslab_rt_name(msp->ms_group, msp, "condense_tree"));
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
zfs_range_tree_walk(msp->ms_defer[t],
@@ -3949,8 +3980,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
* followed by FREES (due to space_map_write() in metaslab_sync()) for
* sync pass 1.
*/
- zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL,
- start, shift);
+ zfs_range_tree_t *tmp_tree = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME,
+ metaslab_rt_name(msp->ms_group, msp, "tmp_tree"));
zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
@@ -5199,29 +5232,16 @@ next:
/*
* We were unable to allocate from this metaslab so determine
- * a new weight for this metaslab. Now that we have loaded
- * the metaslab we can provide a better hint to the metaslab
- * selector.
- *
- * For space-based metaslabs, we use the maximum block size.
- * This information is only available when the metaslab
- * is loaded and is more accurate than the generic free
- * space weight that was calculated by metaslab_weight().
- * This information allows us to quickly compare the maximum
- * available allocation in the metaslab to the allocation
- * size being requested.
- *
- * For segment-based metaslabs, determine the new weight
- * based on the highest bucket in the range tree. We
- * explicitly use the loaded segment weight (i.e. the range
- * tree histogram) since it contains the space that is
- * currently available for allocation and is accurate
- * even within a sync pass.
+ * a new weight for this metaslab. The weight was last
+ * recalculated either when we loaded it (if this is the first
+ * TXG it's been loaded in), or the last time a txg was synced
+ * out.
*/
uint64_t weight;
if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
- weight = metaslab_largest_allocatable(msp);
- WEIGHT_SET_SPACEBASED(weight);
+ metaslab_set_fragmentation(msp, B_TRUE);
+ weight = metaslab_space_weight(msp) &
+ ~METASLAB_ACTIVE_MASK;
} else {
weight = metaslab_weight_from_range_tree(msp);
}
@@ -5233,13 +5253,6 @@ next:
* For the case where we use the metaslab that is
* active for another allocator we want to make
* sure that we retain the activation mask.
- *
- * Note that we could attempt to use something like
- * metaslab_recalculate_weight_and_sort() that
- * retains the activation mask here. That function
- * uses metaslab_weight() to set the weight though
- * which is not as accurate as the calculations
- * above.
*/
weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
metaslab_group_sort(mg, msp, weight);
@@ -5590,7 +5603,21 @@ remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
uint64_t physical_birth = vdev_indirect_births_physbirth(vib,
DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
- BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
+
+ /*
+ * For rewritten blocks, use the old physical birth as the new logical
+ * birth (representing when the space was allocated) and the removal
+ * time as the new physical birth (representing when it was actually
+ * written).
+ */
+ if (BP_GET_REWRITE(bp)) {
+ uint64_t old_physical_birth = BP_GET_PHYSICAL_BIRTH(bp);
+ ASSERT3U(old_physical_birth, <, physical_birth);
+ BP_SET_BIRTH(bp, old_physical_birth, physical_birth);
+ BP_SET_REWRITE(bp, 0);
+ } else {
+ BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
+ }
DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
DVA_SET_OFFSET(&bp->blk_dva[0], offset);
@@ -5757,21 +5784,21 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
}
/*
- * Reserve some allocation slots. The reservation system must be called
- * before we call into the allocator. If there aren't any available slots
- * then the I/O will be throttled until an I/O completes and its slots are
- * freed up. The function returns true if it was successful in placing
- * the reservation.
+ * Reserve some space for a future allocation. The reservation system must be
+ * called before we call into the allocator. If there isn't enough space
+ * available, the calling I/O will be throttled until another I/O completes and
+ * its reservation is released. The function returns true if it was successful
+ * in placing the reservation.
*/
boolean_t
-metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
- boolean_t must, boolean_t *more)
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int allocator,
+ int copies, uint64_t io_size, boolean_t must, boolean_t *more)
{
- metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator];
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
ASSERT(mc->mc_alloc_throttle_enabled);
- if (mc->mc_alloc_io_size < zio->io_size) {
- mc->mc_alloc_io_size = zio->io_size;
+ if (mc->mc_alloc_io_size < io_size) {
+ mc->mc_alloc_io_size = io_size;
metaslab_class_balance(mc, B_FALSE);
}
if (must || mca->mca_reserved <= mc->mc_alloc_max) {
@@ -5782,10 +5809,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
* worst that can happen is few more I/Os get to allocation
* earlier, that is not a problem.
*/
- int64_t delta = slots * zio->io_size;
+ int64_t delta = copies * io_size;
*more = (atomic_add_64_nv(&mca->mca_reserved, delta) <=
mc->mc_alloc_max);
- zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
return (B_TRUE);
}
*more = B_FALSE;
@@ -5793,13 +5819,13 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
}
boolean_t
-metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
- zio_t *zio)
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int allocator,
+ int copies, uint64_t io_size)
{
- metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator];
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
ASSERT(mc->mc_alloc_throttle_enabled);
- int64_t delta = slots * zio->io_size;
+ int64_t delta = copies * io_size;
return (atomic_add_64_nv(&mca->mca_reserved, -delta) <=
mc->mc_alloc_max);
}
@@ -5960,7 +5986,7 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
int error = 0;
ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
- ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
+ ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp));
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
@@ -5975,12 +6001,12 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
ASSERT3P(zal, !=, NULL);
- uint64_t cur_psize = 0;
-
+ uint64_t smallest_psize = UINT64_MAX;
for (int d = 0; d < ndvas; d++) {
- error = metaslab_alloc_dva_range(spa, mc, psize, max_psize,
- dva, d, hintdva, txg, flags, zal, allocator,
- actual_psize ? &cur_psize : NULL);
+ uint64_t cur_psize = 0;
+ error = metaslab_alloc_dva_range(spa, mc, psize,
+ MIN(smallest_psize, max_psize), dva, d, hintdva, txg,
+ flags, zal, allocator, actual_psize ? &cur_psize : NULL);
if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_unalloc_dva(spa, &dva[d], txg);
@@ -6000,13 +6026,13 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
DVA_GET_VDEV(&dva[d]), allocator, flags, psize,
tag);
if (actual_psize)
- max_psize = MIN(cur_psize, max_psize);
+ smallest_psize = MIN(cur_psize, smallest_psize);
}
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
if (actual_psize)
- *actual_psize = max_psize;
+ *actual_psize = smallest_psize;
spa_config_exit(spa, SCL_ALLOC, FTAG);
@@ -6022,7 +6048,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
int ndvas = BP_GET_NDVAS(bp);
ASSERT(!BP_IS_HOLE(bp));
- ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa));
+ ASSERT(!now || BP_GET_BIRTH(bp) >= spa_syncing_txg(spa));
/*
* If we have a checkpoint for the pool we need to make sure that
@@ -6040,7 +6066,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
* normally as they will be referenced by the checkpointed uberblock.
*/
boolean_t checkpoint = B_FALSE;
- if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
+ if (BP_GET_BIRTH(bp) <= spa->spa_checkpoint_txg &&
spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
/*
* At this point, if the block is part of the checkpoint
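Note on the throttle hunks above: metaslab_class_throttle_reserve() and _unreserve() now take (allocator, copies, io_size) instead of a zio_t, and reserve bytes rather than slots, so code paths without a zio at hand can participate. A hypothetical caller under the new interface (the zio field names, e.g. zp_copies, are assumptions):

    static void
    throttle_example(metaslab_class_t *mc, zio_t *zio)
    {
        boolean_t more;

        if (metaslab_class_throttle_reserve(mc, zio->io_allocator,
            zio->io_prop.zp_copies, zio->io_size, B_FALSE, &more)) {
            /* Reservation held; proceed to allocate the DVAs... */
            metaslab_class_throttle_unreserve(mc, zio->io_allocator,
                zio->io_prop.zp_copies, zio->io_size);
        }
        /* else: throttled; requeue until another I/O releases space. */
    }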
diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c
index 373636c69254..fc2b17606bd2 100644
--- a/sys/contrib/openzfs/module/zfs/range_tree.c
+++ b/sys/contrib/openzfs/module/zfs/range_tree.c
@@ -201,10 +201,10 @@ ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t,
ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf,
zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare)
-zfs_range_tree_t *
-zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
+static zfs_range_tree_t *
+zfs_range_tree_create_impl(const zfs_range_tree_ops_t *ops,
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
- uint64_t gap)
+ uint64_t gap, uint64_t flags, const char *name)
{
zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP);
@@ -236,6 +236,8 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
rt->rt_ops = ops;
rt->rt_gap = gap;
+ rt->rt_flags = flags;
+ rt->rt_name = name;
rt->rt_arg = arg;
rt->rt_type = type;
rt->rt_start = start;
@@ -248,10 +250,29 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
}
zfs_range_tree_t *
+zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
+ zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+ uint64_t gap)
+{
+ return (zfs_range_tree_create_impl(ops, type, arg, start, shift, gap,
+ 0, NULL));
+}
+
+zfs_range_tree_t *
zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift)
{
- return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0));
+ return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
+ 0, NULL));
+}
+
+zfs_range_tree_t *
+zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
+ zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+ uint64_t flags, const char *name)
+{
+ return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0,
+ flags, name));
}
void
@@ -262,6 +283,9 @@ zfs_range_tree_destroy(zfs_range_tree_t *rt)
if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
+ if (rt->rt_name != NULL && (rt->rt_flags & ZFS_RT_F_DYN_NAME))
+ kmem_strfree((char *)(uintptr_t)rt->rt_name);
+
zfs_btree_destroy(&rt->rt_root);
kmem_free(rt, sizeof (*rt));
}
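Taken together with the destructor change above, a caller can hand the tree a heap-allocated name and let zfs_range_tree_destroy() free it; a minimal sketch (the name format is illustrative, mirroring vdev_rt_name() later in this diff):

/* Create a tree whose diagnostic name is freed on destroy. */
zfs_range_tree_t *rt = zfs_range_tree_create_flags(NULL, ZFS_RANGE_SEG64,
    NULL, 0, 0, ZFS_RT_F_DYN_NAME,
    kmem_asprintf("{spa=%s example_tree}", spa_name(spa)));
/* ... use rt ... */
zfs_range_tree_destroy(rt);	/* kmem_strfree()s the ZFS_RT_F_DYN_NAME name */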
@@ -271,15 +295,17 @@ zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
int64_t delta)
{
if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) {
- zfs_panic_recover("zfs: attempting to decrease fill to or "
- "below 0; probable double remove in segment [%llx:%llx]",
+ zfs_panic_recover("zfs: rt=%s: attempting to decrease fill to "
+ "or below 0; probable double remove in segment [%llx:%llx]",
+ ZFS_RT_NAME(rt),
(longlong_t)zfs_rs_get_start(rs, rt),
(longlong_t)zfs_rs_get_end(rs, rt));
}
if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) -
zfs_rs_get_start(rs, rt)) {
- zfs_panic_recover("zfs: attempting to increase fill beyond "
- "max; probable double add in segment [%llx:%llx]",
+ zfs_panic_recover("zfs: rt=%s: attempting to increase fill "
+ "beyond max; probable double add in segment [%llx:%llx]",
+ ZFS_RT_NAME(rt),
(longlong_t)zfs_rs_get_start(rs, rt),
(longlong_t)zfs_rs_get_end(rs, rt));
}
@@ -319,14 +345,17 @@ zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
* the normal code paths.
*/
if (rs != NULL) {
+ uint64_t rstart = zfs_rs_get_start(rs, rt);
+ uint64_t rend = zfs_rs_get_end(rs, rt);
if (gap == 0) {
- zfs_panic_recover("zfs: adding existent segment to "
- "range tree (offset=%llx size=%llx)",
- (longlong_t)start, (longlong_t)size);
+ zfs_panic_recover("zfs: rt=%s: adding segment "
+ "(offset=%llx size=%llx) overlapping with existing "
+ "one (offset=%llx size=%llx)",
+ ZFS_RT_NAME(rt),
+ (longlong_t)start, (longlong_t)size,
+ (longlong_t)rstart, (longlong_t)(rend - rstart));
return;
}
- uint64_t rstart = zfs_rs_get_start(rs, rt);
- uint64_t rend = zfs_rs_get_end(rs, rt);
if (rstart <= start && rend >= end) {
zfs_range_tree_adjust_fill(rt, rs, fill);
return;
@@ -451,6 +480,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
zfs_range_seg_t *rs;
zfs_range_seg_max_t rsearch, rs_tmp;
uint64_t end = start + size;
+ uint64_t rstart, rend;
boolean_t left_over, right_over;
VERIFY3U(size, !=, 0);
@@ -464,12 +494,15 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
/* Make sure we completely overlap with someone */
if (rs == NULL) {
- zfs_panic_recover("zfs: removing nonexistent segment from "
- "range tree (offset=%llx size=%llx)",
- (longlong_t)start, (longlong_t)size);
+ zfs_panic_recover("zfs: rt=%s: removing nonexistent segment "
+ "from range tree (offset=%llx size=%llx)",
+ ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size);
return;
}
+ rstart = zfs_rs_get_start(rs, rt);
+ rend = zfs_rs_get_end(rs, rt);
+
/*
* Range trees with gap support must only remove complete segments
* from the tree. This allows us to maintain accurate fill accounting
@@ -479,31 +512,36 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size,
if (rt->rt_gap != 0) {
if (do_fill) {
if (zfs_rs_get_fill(rs, rt) == size) {
- start = zfs_rs_get_start(rs, rt);
- end = zfs_rs_get_end(rs, rt);
+ start = rstart;
+ end = rend;
size = end - start;
} else {
zfs_range_tree_adjust_fill(rt, rs, -size);
return;
}
- } else if (zfs_rs_get_start(rs, rt) != start ||
- zfs_rs_get_end(rs, rt) != end) {
- zfs_panic_recover("zfs: freeing partial segment of "
- "gap tree (offset=%llx size=%llx) of "
+ } else if (rstart != start || rend != end) {
+ zfs_panic_recover("zfs: rt=%s: freeing partial segment "
+ "of gap tree (offset=%llx size=%llx) of "
"(offset=%llx size=%llx)",
+ ZFS_RT_NAME(rt),
(longlong_t)start, (longlong_t)size,
- (longlong_t)zfs_rs_get_start(rs, rt),
- (longlong_t)zfs_rs_get_end(rs, rt) -
- zfs_rs_get_start(rs, rt));
+ (longlong_t)rstart, (longlong_t)(rend - rstart));
return;
}
}
- VERIFY3U(zfs_rs_get_start(rs, rt), <=, start);
- VERIFY3U(zfs_rs_get_end(rs, rt), >=, end);
+ if (!(rstart <= start && rend >= end)) {
+ panic("zfs: rt=%s: removing segment "
+ "(offset=%llx size=%llx) not completely overlapped by "
+ "existing one (offset=%llx size=%llx)",
+ ZFS_RT_NAME(rt),
+ (longlong_t)start, (longlong_t)size,
+ (longlong_t)rstart, (longlong_t)(rend - rstart));
+ return;
+ }
- left_over = (zfs_rs_get_start(rs, rt) != start);
- right_over = (zfs_rs_get_end(rs, rt) != end);
+ left_over = (rstart != start);
+ right_over = (rend != end);
zfs_range_tree_stat_decr(rt, rs);
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
index 6b52c6cb1f9e..5ecb175fbd63 100644
--- a/sys/contrib/openzfs/module/zfs/spa.c
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -100,6 +100,7 @@
#include <sys/vmsystm.h>
#endif /* _KERNEL */
+#include "zfs_crrd.h"
#include "zfs_prop.h"
#include "zfs_comutil.h"
#include <cityhash.h>
@@ -311,6 +312,41 @@ static int zfs_livelist_condense_zthr_cancel = 0;
static int zfs_livelist_condense_new_alloc = 0;
/*
+ * Time variable to decide how often the txg should be added into the
+ * database (in seconds).
+ * The smallest available resolution is in minutes, which means an update occurs
+ * each time we reach `spa_note_txg_time` and the txg has changed. We provide
+ * a 256-slot ring buffer for minute-level resolution. The number is limited by
+ * the size of the structure we use and the maximum amount of bytes we can write
+ * into ZAP. Setting `spa_note_txg_time` to 10 minutes results in approximately
+ * 144 records per day. Given the 256 slots, this provides roughly 1.5 days of
+ * high-resolution data.
+ *
+ * The user can decrease `spa_note_txg_time` to increase resolution within
+ * a day, at the cost of retaining fewer days of data. Alternatively, increasing
+ * the interval allows storing data over a longer period, but with lower
+ * frequency.
+ *
+ * This parameter does not affect the daily or monthly databases, as those only
+ * store one record per day and per month, respectively.
+ */
+static uint_t spa_note_txg_time = 10 * 60;
+
+/*
+ * How often to flush the txg database to disk (in seconds).
+ * We flush every time the database is updated, making it the most reliable
+ * option.
+ * Since this happens every 10 minutes, it shouldn't introduce any noticeable
+ * overhead for the system. In case of failure, we will always have an
+ * up-to-date version of the database.
+ *
+ * The user can adjust the flush interval to a lower value, but it probably
+ * doesn't make sense to flush more often than the database is updated.
+ * The user can also increase the interval if they're concerned about the
+ * performance of writing the entire database to disk.
+ */
+static uint_t spa_flush_txg_time = 10 * 60;
+
+/*
* ==========================================================================
* SPA properties routines
* ==========================================================================
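The retention figures in the comment follow directly from the interval; a back-of-the-envelope check (standalone C, not part of the patch):

#include <stdio.h>

int
main(void)
{
	unsigned interval = 10 * 60;	/* spa_note_txg_time, in seconds */
	unsigned slots = 256;		/* ring-buffer entries per database */
	unsigned per_day = 24 * 60 * 60 / interval;	/* 144 records/day */

	/* Prints "144 records/day, ~1.8 days of coverage". */
	printf("%u records/day, ~%.1f days of coverage\n",
	    per_day, (double)slots / per_day);
	return (0);
}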
@@ -417,11 +453,15 @@ spa_prop_get_config(spa_t *spa, nvlist_t *nv)
alloc += metaslab_class_get_alloc(spa_special_class(spa));
alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));
+ alloc += metaslab_class_get_alloc(
+ spa_special_embedded_log_class(spa));
size = metaslab_class_get_space(mc);
size += metaslab_class_get_space(spa_special_class(spa));
size += metaslab_class_get_space(spa_dedup_class(spa));
size += metaslab_class_get_space(spa_embedded_log_class(spa));
+ size += metaslab_class_get_space(
+ spa_special_embedded_log_class(spa));
spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src);
@@ -1679,6 +1719,8 @@ spa_activate(spa_t *spa, spa_mode_t mode)
"embedded_log", msp, B_TRUE);
spa->spa_special_class = metaslab_class_create(spa, "special",
msp, B_FALSE);
+ spa->spa_special_embedded_log_class = metaslab_class_create(spa,
+ "special_embedded_log", msp, B_TRUE);
spa->spa_dedup_class = metaslab_class_create(spa, "dedup",
msp, B_FALSE);
@@ -1853,6 +1895,9 @@ spa_deactivate(spa_t *spa)
metaslab_class_destroy(spa->spa_special_class);
spa->spa_special_class = NULL;
+ metaslab_class_destroy(spa->spa_special_embedded_log_class);
+ spa->spa_special_embedded_log_class = NULL;
+
metaslab_class_destroy(spa->spa_dedup_class);
spa->spa_dedup_class = NULL;
@@ -2031,6 +2076,111 @@ spa_destroy_aux_threads(spa_t *spa)
}
}
+static void
+spa_sync_time_logger(spa_t *spa, uint64_t txg)
+{
+ uint64_t curtime;
+ dmu_tx_t *tx;
+
+ if (!spa_writeable(spa)) {
+ return;
+ }
+ curtime = gethrestime_sec();
+ if (curtime < spa->spa_last_noted_txg_time + spa_note_txg_time) {
+ return;
+ }
+
+ if (txg > spa->spa_last_noted_txg) {
+ spa->spa_last_noted_txg_time = curtime;
+ spa->spa_last_noted_txg = txg;
+
+ mutex_enter(&spa->spa_txg_log_time_lock);
+ dbrrd_add(&spa->spa_txg_log_time, curtime, txg);
+ mutex_exit(&spa->spa_txg_log_time_lock);
+ }
+
+ if (curtime < spa->spa_last_flush_txg_time + spa_flush_txg_time) {
+ return;
+ }
+ spa->spa_last_flush_txg_time = curtime;
+
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+ &spa->spa_txg_log_time.dbr_minutes, tx));
+ VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+ &spa->spa_txg_log_time.dbr_days, tx));
+ VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+ &spa->spa_txg_log_time.dbr_months, tx));
+ dmu_tx_commit(tx);
+}
+
+static void
+spa_unload_sync_time_logger(spa_t *spa)
+{
+ uint64_t txg;
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
+
+ txg = dmu_tx_get_txg(tx);
+ spa->spa_last_noted_txg_time = 0;
+ spa->spa_last_flush_txg_time = 0;
+ spa_sync_time_logger(spa, txg);
+
+ dmu_tx_commit(tx);
+}
+
+static void
+spa_load_txg_log_time(spa_t *spa)
+{
+ int error;
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+ &spa->spa_txg_log_time.dbr_minutes);
+ if (error != 0 && error != ENOENT) {
+ spa_load_note(spa, "unable to load a txg time database with "
+ "minute resolution [error=%d]", error);
+ }
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+ &spa->spa_txg_log_time.dbr_days);
+ if (error != 0 && error != ENOENT) {
+ spa_load_note(spa, "unable to load a txg time database with "
+ "day resolution [error=%d]", error);
+ }
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
+ &spa->spa_txg_log_time.dbr_months);
+ if (error != 0 && error != ENOENT) {
+ spa_load_note(spa, "unable to load a txg time database with "
+ "month resolution [error=%d]", error);
+ }
+}
+
+static boolean_t
+spa_should_sync_time_logger_on_unload(spa_t *spa)
+{
+
+ if (!spa_writeable(spa))
+ return (B_FALSE);
+
+ if (!spa->spa_sync_on)
+ return (B_FALSE);
+
+ if (spa_state(spa) != POOL_STATE_EXPORTED)
+ return (B_FALSE);
+
+ if (spa->spa_last_noted_txg == 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
/*
* Opposite of spa_load().
*/
@@ -2052,6 +2202,9 @@ spa_unload(spa_t *spa)
* we delay the final TXGs beyond what spa_final_txg is set at.
*/
if (spa->spa_final_txg == UINT64_MAX) {
+ if (spa_should_sync_time_logger_on_unload(spa))
+ spa_unload_sync_time_logger(spa);
+
/*
* If the log space map feature is enabled and the pool is
* getting exported (but not destroyed), we want to spend some
@@ -2709,8 +2862,8 @@ spa_claim_notify(zio_t *zio)
return;
mutex_enter(&spa->spa_props_lock); /* any mutex will do */
- if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp))
- spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp);
+ if (spa->spa_claim_max_txg < BP_GET_BIRTH(zio->io_bp))
+ spa->spa_claim_max_txg = BP_GET_BIRTH(zio->io_bp);
mutex_exit(&spa->spa_props_lock);
}
@@ -3768,20 +3921,17 @@ out:
* ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool
*/
if (error == EREMOTEIO) {
- const char *hostname = "<unknown>";
- uint64_t hostid = 0;
-
if (mmp_label) {
if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
- hostname = fnvlist_lookup_string(mmp_label,
- ZPOOL_CONFIG_HOSTNAME);
+ const char *hostname = fnvlist_lookup_string(
+ mmp_label, ZPOOL_CONFIG_HOSTNAME);
fnvlist_add_string(spa->spa_load_info,
ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
}
if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
- hostid = fnvlist_lookup_uint64(mmp_label,
- ZPOOL_CONFIG_HOSTID);
+ uint64_t hostid = fnvlist_lookup_uint64(
+ mmp_label, ZPOOL_CONFIG_HOSTID);
fnvlist_add_uint64(spa->spa_load_info,
ZPOOL_CONFIG_MMP_HOSTID, hostid);
}
@@ -4711,6 +4861,9 @@ spa_ld_get_props(spa_t *spa)
if (error != 0 && error != ENOENT)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ /* Load time log */
+ spa_load_txg_log_time(spa);
+
/*
* Load the persistent error log. If we have an older pool, this will
* not be present.
@@ -5899,7 +6052,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag,
}
if (firstopen)
- zvol_create_minors_recursive(spa_name(spa));
+ zvol_create_minors(spa_name(spa));
*spapp = spa;
@@ -6877,7 +7030,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
mutex_exit(&spa_namespace_lock);
- zvol_create_minors_recursive(pool);
+ zvol_create_minors(pool);
spa_import_os(spa);
@@ -7134,6 +7287,9 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
spa_config_exit(spa, SCL_ALL, FTAG);
}
+ if (spa_should_sync_time_logger_on_unload(spa))
+ spa_unload_sync_time_logger(spa);
+
/*
* If the log space map feature is enabled and the pool is
* getting exported (but not destroyed), we want to spend some
@@ -9092,6 +9248,8 @@ spa_async_thread(void *arg)
old_space += metaslab_class_get_space(spa_dedup_class(spa));
old_space += metaslab_class_get_space(
spa_embedded_log_class(spa));
+ old_space += metaslab_class_get_space(
+ spa_special_embedded_log_class(spa));
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
@@ -9100,6 +9258,8 @@ spa_async_thread(void *arg)
new_space += metaslab_class_get_space(spa_dedup_class(spa));
new_space += metaslab_class_get_space(
spa_embedded_log_class(spa));
+ new_space += metaslab_class_get_space(
+ spa_special_embedded_log_class(spa));
mutex_exit(&spa_namespace_lock);
/*
@@ -10180,6 +10340,8 @@ spa_sync(spa_t *spa, uint64_t txg)
*/
brt_pending_apply(spa, txg);
+ spa_sync_time_logger(spa, txg);
+
/*
* Lock out configuration changes.
*/
@@ -10222,6 +10384,7 @@ spa_sync(spa_t *spa, uint64_t txg)
dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
spa->spa_sync_starttime = gethrtime();
+
taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
@@ -10309,7 +10472,7 @@ spa_sync(spa_t *spa, uint64_t txg)
metaslab_class_evict_old(spa->spa_normal_class, txg);
metaslab_class_evict_old(spa->spa_log_class, txg);
- /* spa_embedded_log_class has only one metaslab per vdev. */
+ /* Embedded log classes have only one metaslab per vdev. */
metaslab_class_evict_old(spa->spa_special_class, txg);
metaslab_class_evict_old(spa->spa_dedup_class, txg);
@@ -11095,6 +11258,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
"Whether extra ALLOC blkptrs were added to a livelist entry while it "
"was being condensed");
+ZFS_MODULE_PARAM(zfs_spa, spa_, note_txg_time, UINT, ZMOD_RW,
+ "How frequently TXG timestamps are stored internally (in seconds)");
+
+ZFS_MODULE_PARAM(zfs_spa, spa_, flush_txg_time, UINT, ZMOD_RW,
+ "How frequently the TXG timestamps database should be flushed "
+ "to disk (in seconds)");
+
#ifdef _KERNEL
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c
index 3e08f261fda1..7252fd534bdf 100644
--- a/sys/contrib/openzfs/module/zfs/spa_errlog.c
+++ b/sys/contrib/openzfs/module/zfs/spa_errlog.c
@@ -253,7 +253,7 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
if (error == 0 && BP_IS_HOLE(&bp))
error = SET_ERROR(ENOENT);
- *birth_txg = BP_GET_LOGICAL_BIRTH(&bp);
+ *birth_txg = BP_GET_PHYSICAL_BIRTH(&bp);
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
return (error);
@@ -885,7 +885,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
if (error == EACCES)
error = 0;
else if (!error)
- zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp);
+ zep.zb_birth = BP_GET_PHYSICAL_BIRTH(&bp);
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
index f054e4290bbf..2eba8362a166 100644
--- a/sys/contrib/openzfs/module/zfs/spa_misc.c
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -715,6 +715,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_txg_log_time_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
@@ -903,6 +904,7 @@ spa_remove(spa_t *spa)
mutex_destroy(&spa->spa_vdev_top_lock);
mutex_destroy(&spa->spa_feat_stats_lock);
mutex_destroy(&spa->spa_activities_lock);
+ mutex_destroy(&spa->spa_txg_log_time_lock);
kmem_free(spa, sizeof (spa_t));
}
@@ -1308,6 +1310,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error,
metaslab_class_validate(spa_log_class(spa));
metaslab_class_validate(spa_embedded_log_class(spa));
metaslab_class_validate(spa_special_class(spa));
+ metaslab_class_validate(spa_special_embedded_log_class(spa));
metaslab_class_validate(spa_dedup_class(spa));
spa_config_exit(spa, SCL_ALL, spa);
@@ -1896,6 +1899,8 @@ spa_get_slop_space(spa_t *spa)
*/
uint64_t embedded_log =
metaslab_class_get_dspace(spa_embedded_log_class(spa));
+ embedded_log += metaslab_class_get_dspace(
+ spa_special_embedded_log_class(spa));
slop -= MIN(embedded_log, slop >> 1);
/*
@@ -2001,6 +2006,12 @@ spa_special_class(spa_t *spa)
}
metaslab_class_t *
+spa_special_embedded_log_class(spa_t *spa)
+{
+ return (spa->spa_special_embedded_log_class);
+}
+
+metaslab_class_t *
spa_dedup_class(spa_t *spa)
{
return (spa->spa_dedup_class);
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index 01758b0c54c0..70b14fb9b2c8 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -243,6 +243,25 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
}
+char *
+vdev_rt_name(vdev_t *vd, const char *name)
+{
+ return (kmem_asprintf("{spa=%s vdev_guid=%llu %s}",
+ spa_name(vd->vdev_spa),
+ (u_longlong_t)vd->vdev_guid,
+ name));
+}
+
+static char *
+vdev_rt_name_dtl(vdev_t *vd, const char *name, vdev_dtl_type_t dtl_type)
+{
+ return (kmem_asprintf("{spa=%s vdev_guid=%llu %s[%d]}",
+ spa_name(vd->vdev_spa),
+ (u_longlong_t)vd->vdev_guid,
+ name,
+ dtl_type));
+}
+
/*
* Virtual device management.
*/
@@ -282,12 +301,15 @@ vdev_getops(const char *type)
* Given a vdev and a metaslab class, find which metaslab group we're
* interested in. All vdevs may belong to two different metaslab classes.
* Dedicated slog devices use only the primary metaslab group, rather than a
- * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL.
+ * separate log group. For embedded slogs, vdev_log_mg will be non-NULL and
+ * will point to a metaslab group of either embedded_log_class (for normal
+ * vdevs) or special_embedded_log_class (for special vdevs).
*/
metaslab_group_t *
vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
{
- if (mc == spa_embedded_log_class(vd->vdev_spa) &&
+ if ((mc == spa_embedded_log_class(vd->vdev_spa) ||
+ mc == spa_special_embedded_log_class(vd->vdev_spa)) &&
vd->vdev_log_mg != NULL)
return (vd->vdev_log_mg);
else
@@ -692,8 +714,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
- vd->vdev_obsolete_segments = zfs_range_tree_create(NULL,
- ZFS_RANGE_SEG64, NULL, 0, 0);
+ vd->vdev_obsolete_segments = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_obsolete_segments"));
/*
* Initialize rate limit structs for events. We rate limit ZIO delay
@@ -747,8 +770,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
- vd->vdev_dtl[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
- NULL, 0, 0);
+ vd->vdev_dtl[t] = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name_dtl(vd, "vdev_dtl", t));
}
txg_list_create(&vd->vdev_ms_list, spa,
@@ -1508,8 +1532,13 @@ vdev_metaslab_group_create(vdev_t *vd)
vd->vdev_mg = metaslab_group_create(mc, vd);
if (!vd->vdev_islog) {
- vd->vdev_log_mg = metaslab_group_create(
- spa_embedded_log_class(spa), vd);
+ if (mc == spa_special_class(spa)) {
+ vd->vdev_log_mg = metaslab_group_create(
+ spa_special_embedded_log_class(spa), vd);
+ } else {
+ vd->vdev_log_mg = metaslab_group_create(
+ spa_embedded_log_class(spa), vd);
+ }
}
/*
@@ -1624,9 +1653,10 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
/*
* Find the emptiest metaslab on the vdev and mark it for use for
* embedded slog by moving it from the regular to the log metaslab
- * group.
+ * group. This works for normal and special vdevs.
*/
- if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
+ if ((vd->vdev_mg->mg_class == spa_normal_class(spa) ||
+ vd->vdev_mg->mg_class == spa_special_class(spa)) &&
vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
uint64_t slog_msid = 0;
@@ -3449,7 +3479,9 @@ vdev_dtl_load(vdev_t *vd)
return (error);
ASSERT(vd->vdev_dtl_sm != NULL);
- rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+ rt = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_dtl_load:rt"));
error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC);
if (error == 0) {
mutex_enter(&vd->vdev_dtl_lock);
@@ -3597,7 +3629,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
ASSERT(vd->vdev_dtl_sm != NULL);
}
- rtsync = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+ rtsync = zfs_range_tree_create_flags(NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "rtsync"));
mutex_enter(&vd->vdev_dtl_lock);
zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync);
diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
index fac2c3a5f154..9fc71fa0e03e 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
@@ -1842,7 +1842,7 @@ vdev_indirect_io_done(zio_t *zio)
*/
if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
zio->io_error = ret;
- zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+ zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
zio_dio_chksum_verify_error_report(zio);
ret = 0;
}
diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c
index 4274728578ad..9243c76e810d 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c
@@ -541,8 +541,9 @@ vdev_initialize_thread(void *arg)
abd_t *deadbeef = vdev_initialize_block_alloc();
- vd->vdev_initialize_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
- NULL, 0, 0);
+ vd->vdev_initialize_tree = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_initialize_tree"));
for (uint64_t i = 0; !vd->vdev_detached &&
i < vd->vdev_top->vdev_ms_count; i++) {
diff --git a/sys/contrib/openzfs/module/zfs/vdev_mirror.c b/sys/contrib/openzfs/module/zfs/vdev_mirror.c
index a6aee9437066..18efdaac006f 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_mirror.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_mirror.c
@@ -532,7 +532,7 @@ vdev_mirror_child_select(zio_t *zio)
uint64_t txg = zio->io_txg;
int c, lowest_load;
- ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg);
+ ASSERT(zio->io_bp == NULL || BP_GET_PHYSICAL_BIRTH(zio->io_bp) == txg);
lowest_load = INT_MAX;
mm->mm_preferred_cnt = 0;
@@ -779,7 +779,7 @@ vdev_mirror_io_done(zio_t *zio)
* being written out during self healing.
*/
if ((zio->io_flags & ZIO_FLAG_DIO_READ) &&
- (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
+ (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) {
zio_dio_chksum_verify_error_report(zio);
zio->io_error = vdev_mirror_worst_error(mm);
ASSERT3U(zio->io_error, ==, ECKSUM);
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
index 71c4bfbdaf00..210cdcab1ecc 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
@@ -2206,11 +2206,7 @@ vdev_raidz_close(vdev_t *vd)
/*
* Return the logical width to use, given the txg in which the allocation
- * happened. Note that BP_GET_BIRTH() is usually the txg in which the
- * BP was allocated. Remapped BP's (that were relocated due to device
- * removal, see remap_blkptr_cb()), will have a more recent physical birth
- * which reflects when the BP was relocated, but we can ignore these because
- * they can't be on RAIDZ (device removal doesn't support RAIDZ).
+ * happened.
*/
static uint64_t
vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
@@ -2249,10 +2245,9 @@ vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
vdev_raidz_t *vdrz = vd->vdev_tsd;
uint64_t psize;
uint64_t ashift = vd->vdev_top->vdev_ashift;
- uint64_t cols = vdrz->vd_original_width;
uint64_t nparity = vdrz->vd_nparity;
- cols = vdev_raidz_get_logical_width(vdrz, txg);
+ uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
ASSERT0(asize % (1 << ashift));
@@ -2285,10 +2280,9 @@ vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
vdev_raidz_t *vdrz = vd->vdev_tsd;
uint64_t asize;
uint64_t ashift = vd->vdev_top->vdev_ashift;
- uint64_t cols = vdrz->vd_original_width;
uint64_t nparity = vdrz->vd_nparity;
- cols = vdev_raidz_get_logical_width(vdrz, txg);
+ uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
asize = ((psize - 1) >> ashift) + 1;
asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
@@ -2345,7 +2339,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
logical_rs.rs_start = rr->rr_offset;
logical_rs.rs_end = logical_rs.rs_start +
vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size,
- BP_GET_BIRTH(zio->io_bp));
+ BP_GET_PHYSICAL_BIRTH(zio->io_bp));
raidz_col_t *rc = &rr->rr_col[col];
vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
@@ -2568,7 +2562,7 @@ vdev_raidz_io_start(zio_t *zio)
raidz_map_t *rm;
uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
- BP_GET_BIRTH(zio->io_bp));
+ BP_GET_PHYSICAL_BIRTH(zio->io_bp));
if (logical_width != vdrz->vd_physical_width) {
zfs_locked_range_t *lr = NULL;
uint64_t synced_offset = UINT64_MAX;
@@ -2691,7 +2685,7 @@ raidz_checksum_verify(zio_t *zio)
*/
if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
zio->io_error = ret;
- zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+ zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
zio_dio_chksum_verify_error_report(zio);
zio_checksum_verified(zio);
return (0);
@@ -3048,7 +3042,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
/* Check for success */
if (raidz_checksum_verify(zio) == 0) {
- if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
+ if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
return (0);
/* Reconstruction succeeded - report errors */
@@ -3514,7 +3508,7 @@ vdev_raidz_io_done(zio_t *zio)
}
if (raidz_checksum_verify(zio) == 0) {
- if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
+ if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
goto done;
for (int i = 0; i < rm->rm_nrows; i++) {
@@ -4591,8 +4585,10 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr)
uint64_t shift, start;
zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(
raidvd, msp, &start, &shift);
- zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL,
- start, shift);
+ zfs_range_tree_t *rt = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift, ZFS_RT_F_DYN_NAME,
+ metaslab_rt_name(msp->ms_group, msp,
+ "spa_raidz_expand_thread:rt"));
zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove,
rt);
diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
index 0e296606d037..cf259788ccf4 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
@@ -787,8 +787,9 @@ vdev_rebuild_thread(void *arg)
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
vr->vr_top_vdev = vd;
vr->vr_scan_msp = NULL;
- vr->vr_scan_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL,
- 0, 0);
+ vr->vr_scan_tree = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vr_scan_tree"));
mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);
diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c
index db79ded6dce4..3887be4bd548 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_removal.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c
@@ -364,13 +364,15 @@ spa_vdev_removal_create(vdev_t *vd)
spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
- svr->svr_allocd_segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
- NULL, 0, 0);
+ svr->svr_allocd_segs = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_allocd_segs"));
svr->svr_vdev_id = vd->vdev_id;
for (int i = 0; i < TXG_SIZE; i++) {
- svr->svr_frees[i] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
- NULL, 0, 0);
+ svr->svr_frees[i] = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_frees"));
list_create(&svr->svr_new_segments[i],
sizeof (vdev_indirect_mapping_entry_t),
offsetof(vdev_indirect_mapping_entry_t, vime_node));
@@ -1179,8 +1181,9 @@ spa_vdev_copy_segment(vdev_t *vd, zfs_range_tree_t *segs,
* relative to the start of the range to be copied (i.e. relative to the
* local variable "start").
*/
- zfs_range_tree_t *obsolete_segs = zfs_range_tree_create(NULL,
- ZFS_RANGE_SEG64, NULL, 0, 0);
+ zfs_range_tree_t *obsolete_segs = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "obsolete_segs"));
zfs_btree_index_t where;
zfs_range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where);
@@ -1448,8 +1451,9 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
* allocated segments that we are copying. We may also be copying
* free segments (of up to vdev_removal_max_span bytes).
*/
- zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
- NULL, 0, 0);
+ zfs_range_tree_t *segs = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_copy_impl:segs"));
for (;;) {
zfs_range_tree_t *rt = svr->svr_allocd_segs;
zfs_range_seg_t *rs = zfs_range_tree_first(rt);
@@ -1610,8 +1614,9 @@ spa_vdev_remove_thread(void *arg)
vca.vca_read_error_bytes = 0;
vca.vca_write_error_bytes = 0;
- zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
- NULL, 0, 0);
+ zfs_range_tree_t *segs = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_remove_thread:segs"));
mutex_enter(&svr->svr_lock);
@@ -1895,8 +1900,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
vdev_indirect_mapping_max_offset(vim));
}
- zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
- NULL, 0, 0);
+ zfs_range_tree_t *segs = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME,
+ vdev_rt_name(vd, "spa_vdev_remove_cancel_sync:segs"));
for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
metaslab_t *msp = vd->vdev_ms[msi];
diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c
index 842bb3e690d4..fc8d5b8e9a8a 100644
--- a/sys/contrib/openzfs/module/zfs/vdev_trim.c
+++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c
@@ -902,7 +902,9 @@ vdev_trim_thread(void *arg)
ta.trim_vdev = vd;
ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
- ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+ ta.trim_tree = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree"));
ta.trim_type = TRIM_TYPE_MANUAL;
ta.trim_flags = 0;
@@ -1305,8 +1307,10 @@ vdev_autotrim_thread(void *arg)
* Allocate an empty range tree which is swapped in
* for the existing ms_trim tree while it is processed.
*/
- trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
- NULL, 0, 0);
+ trim_tree = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME,
+ vdev_rt_name(vd, "autotrim_tree"));
zfs_range_tree_swap(&msp->ms_trim, &trim_tree);
ASSERT(zfs_range_tree_is_empty(msp->ms_trim));
@@ -1360,8 +1364,10 @@ vdev_autotrim_thread(void *arg)
if (!cvd->vdev_ops->vdev_op_leaf)
continue;
- ta->trim_tree = zfs_range_tree_create(NULL,
- ZFS_RANGE_SEG64, NULL, 0, 0);
+ ta->trim_tree = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME,
+ vdev_rt_name(vd, "autotrim_tree"));
zfs_range_tree_walk(trim_tree,
vdev_trim_range_add, ta);
}
@@ -1600,7 +1606,9 @@ vdev_trim_l2arc_thread(void *arg)
vd->vdev_trim_secure = 0;
ta.trim_vdev = vd;
- ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+ ta.trim_tree = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree"));
ta.trim_type = TRIM_TYPE_MANUAL;
ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
@@ -1735,7 +1743,9 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
ASSERT(!vd->vdev_top->vdev_rz_expanding);
ta.trim_vdev = vd;
- ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
+ ta.trim_tree = zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree"));
ta.trim_type = TRIM_TYPE_SIMPLE;
ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
diff --git a/sys/contrib/openzfs/module/zfs/zap.c b/sys/contrib/openzfs/module/zfs/zap.c
index 9711c91d7e4e..0896690c97e3 100644
--- a/sys/contrib/openzfs/module/zfs/zap.c
+++ b/sys/contrib/openzfs/module/zfs/zap.c
@@ -1304,7 +1304,7 @@ zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
int
fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
{
- int err = ENOENT;
+ int err;
zap_entry_handle_t zeh;
zap_leaf_t *l;
diff --git a/sys/contrib/openzfs/module/zfs/zcp.c b/sys/contrib/openzfs/module/zfs/zcp.c
index 6960ea360b15..9aecf67fd256 100644
--- a/sys/contrib/openzfs/module/zfs/zcp.c
+++ b/sys/contrib/openzfs/module/zfs/zcp.c
@@ -1175,7 +1175,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync,
for (nvpair_t *pair = nvlist_next_nvpair(runinfo.zri_new_zvols, NULL);
pair != NULL;
pair = nvlist_next_nvpair(runinfo.zri_new_zvols, pair)) {
- zvol_create_minor(nvpair_name(pair));
+ zvol_create_minors(nvpair_name(pair));
}
fnvlist_free(runinfo.zri_new_zvols);
diff --git a/sys/contrib/openzfs/module/zfs/zfs_chksum.c b/sys/contrib/openzfs/module/zfs/zfs_chksum.c
index 5c92be21c0c8..21852bf3d865 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_chksum.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_chksum.c
@@ -32,9 +32,6 @@
#include <sys/blake3.h>
#include <sys/sha2.h>
-/* limit benchmarking to max 256KiB, when EdonR is slower then this: */
-#define LIMIT_PERF_MBS 300
-
typedef struct {
const char *name;
const char *impl;
@@ -52,9 +49,15 @@ typedef struct {
zio_checksum_tmpl_free_t *(free);
} chksum_stat_t;
+#define AT_STARTUP 0
+#define AT_BENCHMARK 1
+#define AT_DONE 2
+
static chksum_stat_t *chksum_stat_data = 0;
-static int chksum_stat_cnt = 0;
static kstat_t *chksum_kstat = NULL;
+static int chksum_stat_limit = AT_STARTUP;
+static int chksum_stat_cnt = 0;
+static void chksum_benchmark(void);
/*
* Sample output on i3-1005G1 System:
@@ -129,6 +132,9 @@ chksum_kstat_data(char *buf, size_t size, void *data)
static void *
chksum_kstat_addr(kstat_t *ksp, loff_t n)
{
+	/* run the deferred full benchmark on first access */
+ chksum_benchmark();
+
if (n < chksum_stat_cnt)
ksp->ks_private = (void *)(chksum_stat_data + n);
else
@@ -176,47 +182,36 @@ chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round,
kpreempt_enable();
run_bw = size * run_count * NANOSEC;
- run_bw /= run_time_ns; /* B/s */
+ run_bw /= run_time_ns; /* B/s */
*result = run_bw/1024/1024; /* MiB/s */
}
-#define LIMIT_INIT 0
-#define LIMIT_NEEDED 1
-#define LIMIT_NOLIMIT 2
-
static void
chksum_benchit(chksum_stat_t *cs)
{
abd_t *abd;
void *ctx = 0;
void *salt = &cs->salt.zcs_bytes;
- static int chksum_stat_limit = LIMIT_INIT;
memset(salt, 0, sizeof (cs->salt.zcs_bytes));
if (cs->init)
ctx = cs->init(&cs->salt);
+	/* startup mode: benchmark only the 256KiB case */
+ if (chksum_stat_limit == AT_STARTUP) {
+ abd = abd_alloc_linear(1<<18, B_FALSE);
+ chksum_run(cs, abd, ctx, 5, &cs->bs256k);
+ goto done;
+ }
+
/* allocate test memory via abd linear interface */
abd = abd_alloc_linear(1<<20, B_FALSE);
+
+	/* full benchmark, run on demand */
chksum_run(cs, abd, ctx, 1, &cs->bs1k);
chksum_run(cs, abd, ctx, 2, &cs->bs4k);
chksum_run(cs, abd, ctx, 3, &cs->bs16k);
chksum_run(cs, abd, ctx, 4, &cs->bs64k);
- chksum_run(cs, abd, ctx, 5, &cs->bs256k);
-
- /* check if we ran on a slow cpu */
- if (chksum_stat_limit == LIMIT_INIT) {
- if (cs->bs1k < LIMIT_PERF_MBS) {
- chksum_stat_limit = LIMIT_NEEDED;
- } else {
- chksum_stat_limit = LIMIT_NOLIMIT;
- }
- }
-
- /* skip benchmarks >= 1MiB when the CPU is to slow */
- if (chksum_stat_limit == LIMIT_NEEDED)
- goto abort;
-
chksum_run(cs, abd, ctx, 6, &cs->bs1m);
abd_free(abd);
@@ -225,7 +220,7 @@ chksum_benchit(chksum_stat_t *cs)
chksum_run(cs, abd, ctx, 7, &cs->bs4m);
chksum_run(cs, abd, ctx, 8, &cs->bs16m);
-abort:
+done:
abd_free(abd);
/* free up temp memory */
@@ -243,7 +238,6 @@ chksum_benchmark(void)
/* we need the benchmark only for the kernel module */
return;
#endif
-
chksum_stat_t *cs;
uint64_t max;
uint32_t id, cbid = 0, id_save;
@@ -251,8 +245,14 @@ chksum_benchmark(void)
const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256");
const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512");
+ /* benchmarks are done */
+ if (chksum_stat_limit == AT_DONE)
+ return;
+
/* count implementations */
- chksum_stat_cnt = 2;
+ chksum_stat_cnt = 1; /* edonr */
+ chksum_stat_cnt += 1; /* skein */
chksum_stat_cnt += sha256->getcnt();
chksum_stat_cnt += sha512->getcnt();
chksum_stat_cnt += blake3->getcnt();
@@ -332,6 +332,17 @@ chksum_benchmark(void)
}
}
blake3->setid(id_save);
+
+ switch (chksum_stat_limit) {
+ case AT_STARTUP:
+ /* next time we want a full benchmark */
+ chksum_stat_limit = AT_BENCHMARK;
+ break;
+ case AT_BENCHMARK:
+ /* no further benchmarks */
+ chksum_stat_limit = AT_DONE;
+ break;
+ }
}
void
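The net effect is a staged benchmark; a sketch of the progression (a restatement of the flow above, not patch code):

/*
 * AT_STARTUP:   chksum_init() runs a quick 256KiB-only benchmark.
 * AT_BENCHMARK: the first kstat read triggers the full 1KiB..16MiB run.
 * AT_DONE:      later kstat reads return the cached results immediately.
 */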
@@ -341,7 +352,7 @@ chksum_init(void)
blake3_per_cpu_ctx_init();
#endif
- /* Benchmark supported implementations */
+	/* quick 256KiB-only benchmark at startup */
chksum_benchmark();
/* Install kstats for all implementations */
diff --git a/sys/contrib/openzfs/module/zfs/zfs_crrd.c b/sys/contrib/openzfs/module/zfs/zfs_crrd.c
new file mode 100644
index 000000000000..f9267ed41d71
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_crrd.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: CDDL-1.0
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2024 Klara Inc.
+ *
+ * This software was developed by
+ * Mariusz Zaborski <mariusz.zaborski@klarasystems.com>
+ * Fred Weigel <fred.weigel@klarasystems.com>
+ * under sponsorship from Wasabi Technology, Inc. and Klara Inc.
+ */
+/*
+ * This file implements a round-robin database that stores timestamps and txg
+ * numbers. Due to limited space, we use a round-robin approach, where
+ * the oldest records are overwritten when there is no longer enough room.
+ * This is a best-effort mechanism, and the database should be treated as
+ * an approximation. Consider this before consuming it.
+ *
+ * The database is linear, meaning we assume each new entry is newer than the
+ * ones already stored. Because of this, if time is manipulated, the database
+ * will only accept records that are newer than the existing ones.
+ * (For example, jumping 10 years into the future and then back can lead to
+ * a situation where nothing is written to the database for 10 years.)
+ *
+ * All times stored in the database use UTC, which makes it easy to convert to
+ * and from local time.
+ *
+ * Each database holds 256 records (as defined in the `RRD_MAX_ENTRIES` macro).
+ * This limit comes from the maximum size of a ZAP object, where we store the
+ * binary blob.
+ *
+ * We've split the database into three smaller ones.
+ * The `minute database` provides high resolution (default: every 10 minutes),
+ * but only covers approximately 1.5 days. This gives a detailed view of recent
+ * activity, useful, for example, when performing a scrub of the last hour.
+ * The `daily database` records one txg per day. With 256 entries, it retains
+ * roughly 8 months of data. This allows users to scrub or analyze txgs across
+ * a range of days.
+ * The `monthly database` stores one record per month, giving approximately
+ * 21 years of history.
+ * All these calculations assume the worst-case scenario: the pool is always
+ * online and actively written to.
+ *
+ * A potential source of confusion is that the database does not store data
+ * while the pool is offline, leading to potential gaps in the timeline. Also,
+ * the database contains no records from before this feature was enabled.
+ * Both, upon reflection, are expected.
+ */
+#include <sys/zfs_context.h>
+
+#include "zfs_crrd.h"
+
+rrd_data_t *
+rrd_tail_entry(rrd_t *rrd)
+{
+ size_t n;
+
+ if (rrd_len(rrd) == 0)
+ return (NULL);
+
+ if (rrd->rrd_tail == 0)
+ n = RRD_MAX_ENTRIES - 1;
+ else
+ n = rrd->rrd_tail - 1;
+
+ return (&rrd->rrd_entries[n]);
+}
+
+uint64_t
+rrd_tail(rrd_t *rrd)
+{
+ const rrd_data_t *tail;
+
+ tail = rrd_tail_entry(rrd);
+
+ return (tail == NULL ? 0 : tail->rrdd_time);
+}
+
+/*
+ * Return length of data in the rrd.
+ * rrd_get works from 0..rrd_len()-1.
+ */
+size_t
+rrd_len(rrd_t *rrd)
+{
+
+ return (rrd->rrd_length);
+}
+
+const rrd_data_t *
+rrd_entry(rrd_t *rrd, size_t i)
+{
+ size_t n;
+
+	if (i >= rrd_len(rrd))
+		return (NULL);
+
+ n = (rrd->rrd_head + i) % RRD_MAX_ENTRIES;
+ return (&rrd->rrd_entries[n]);
+}
+
+uint64_t
+rrd_get(rrd_t *rrd, size_t i)
+{
+ const rrd_data_t *data = rrd_entry(rrd, i);
+
+ return (data == NULL ? 0 : data->rrdd_txg);
+}
+
+/* Add a value to the database. */
+void
+rrd_add(rrd_t *rrd, hrtime_t time, uint64_t txg)
+{
+ rrd_data_t *tail;
+
+ tail = rrd_tail_entry(rrd);
+	if (tail != NULL && tail->rrdd_time == time) {
+		/* Same timestamp: keep only the newest txg, don't append. */
+		if (tail->rrdd_txg < txg)
+			tail->rrdd_txg = txg;
+		return;
+	}
+
+ rrd->rrd_entries[rrd->rrd_tail].rrdd_time = time;
+ rrd->rrd_entries[rrd->rrd_tail].rrdd_txg = txg;
+
+ rrd->rrd_tail = (rrd->rrd_tail + 1) % RRD_MAX_ENTRIES;
+
+ if (rrd->rrd_length < RRD_MAX_ENTRIES) {
+ rrd->rrd_length++;
+ } else {
+ rrd->rrd_head = (rrd->rrd_head + 1) % RRD_MAX_ENTRIES;
+ }
+}
+
+void
+dbrrd_add(dbrrd_t *db, hrtime_t time, uint64_t txg)
+{
+ hrtime_t daydiff, monthdiff, minutedif;
+
+ minutedif = time - rrd_tail(&db->dbr_minutes);
+ daydiff = time - rrd_tail(&db->dbr_days);
+ monthdiff = time - rrd_tail(&db->dbr_months);
+
+ if (monthdiff >= 0 && monthdiff >= SEC2NSEC(30 * 24 * 60 * 60))
+ rrd_add(&db->dbr_months, time, txg);
+ else if (daydiff >= 0 && daydiff >= SEC2NSEC(24 * 60 * 60))
+ rrd_add(&db->dbr_days, time, txg);
+ else if (minutedif >= 0)
+ rrd_add(&db->dbr_minutes, time, txg);
+}
+
+/*
+ * We could do a binary search here, but the routine isn't frequently
+ * called and the data is small so we stick to a simple loop.
+ */
+static const rrd_data_t *
+rrd_query(rrd_t *rrd, hrtime_t tv, dbrrd_rounding_t rounding)
+{
+ const rrd_data_t *data = NULL;
+
+ for (size_t i = 0; i < rrd_len(rrd); i++) {
+ const rrd_data_t *cur = rrd_entry(rrd, i);
+
+ if (rounding == DBRRD_FLOOR) {
+ if (tv < cur->rrdd_time) {
+ break;
+ }
+ data = cur;
+ } else {
+ /* DBRRD_CEILING */
+ if (tv <= cur->rrdd_time) {
+ data = cur;
+ break;
+ }
+ }
+ }
+
+ return (data);
+}
+
+static const rrd_data_t *
+dbrrd_closest(hrtime_t tv, const rrd_data_t *r1, const rrd_data_t *r2)
+{
+
+ if (r1 == NULL)
+ return (r2);
+ if (r2 == NULL)
+ return (r1);
+
+ return (ABS(tv - r1->rrdd_time) < ABS(tv - r2->rrdd_time) ? r1 : r2);
+}
+
+uint64_t
+dbrrd_query(dbrrd_t *r, hrtime_t tv, dbrrd_rounding_t rounding)
+{
+ const rrd_data_t *data, *dm, *dd, *dy;
+
+ data = NULL;
+ dm = rrd_query(&r->dbr_minutes, tv, rounding);
+ dd = rrd_query(&r->dbr_days, tv, rounding);
+ dy = rrd_query(&r->dbr_months, tv, rounding);
+
+ data = dbrrd_closest(tv, dbrrd_closest(tv, dd, dm), dy);
+
+ return (data == NULL ? 0 : data->rrdd_txg);
+}
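A minimal sketch of driving this API (timestamps are seconds since the Epoch, matching the gethrestime_sec() caller in spa.c; the structure is assumed zero-initialized):

dbrrd_t db = { 0 };

dbrrd_add(&db, 1700000000, 100);	/* txg 100 at t0 */
dbrrd_add(&db, 1700000600, 150);	/* txg 150 ten minutes later */

/* Newest txg at or before the timestamp ... */
uint64_t lo = dbrrd_query(&db, 1700000300, DBRRD_FLOOR);	/* 100 */
/* ... and oldest txg at or after it. */
uint64_t hi = dbrrd_query(&db, 1700000300, DBRRD_CEILING);	/* 150 */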
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
index ebb1cfd07125..dcb71229f96a 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
@@ -1704,6 +1704,8 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc)
static const zfs_ioc_key_t zfs_keys_pool_scrub[] = {
{"scan_type", DATA_TYPE_UINT64, 0},
{"scan_command", DATA_TYPE_UINT64, 0},
+ {"scan_date_start", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"scan_date_end", DATA_TYPE_UINT64, ZK_OPTIONAL},
};
static int
@@ -1712,6 +1714,7 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
spa_t *spa;
int error;
uint64_t scan_type, scan_cmd;
+ uint64_t date_start, date_end;
if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0)
return (SET_ERROR(EINVAL));
@@ -1721,6 +1724,11 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
if (scan_cmd >= POOL_SCRUB_FLAGS_END)
return (SET_ERROR(EINVAL));
+ if (nvlist_lookup_uint64(innvl, "scan_date_start", &date_start) != 0)
+ date_start = 0;
+ if (nvlist_lookup_uint64(innvl, "scan_date_end", &date_end) != 0)
+ date_end = 0;
+
if ((error = spa_open(poolname, &spa, FTAG)) != 0)
return (error);
@@ -1732,7 +1740,24 @@ zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
error = spa_scan_range(spa, scan_type,
spa_get_last_scrubbed_txg(spa), 0);
} else {
- error = spa_scan(spa, scan_type);
+ uint64_t txg_start, txg_end;
+
+ txg_start = txg_end = 0;
+ if (date_start != 0 || date_end != 0) {
+ mutex_enter(&spa->spa_txg_log_time_lock);
+ if (date_start != 0) {
+ txg_start = dbrrd_query(&spa->spa_txg_log_time,
+ date_start, DBRRD_FLOOR);
+ }
+
+ if (date_end != 0) {
+ txg_end = dbrrd_query(&spa->spa_txg_log_time,
+ date_end, DBRRD_CEILING);
+ }
+ mutex_exit(&spa->spa_txg_log_time_lock);
+ }
+
+ error = spa_scan_range(spa, scan_type, txg_start, txg_end);
}
spa_close(spa, FTAG);
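From userland the bounds arrive as the optional nvlist keys declared in zfs_keys_pool_scrub above; a hypothetical caller sketch (start_sec/end_sec are placeholders; 0 leaves a bound open):

nvlist_t *args = fnvlist_alloc();

fnvlist_add_uint64(args, "scan_type", POOL_SCAN_SCRUB);
fnvlist_add_uint64(args, "scan_command", POOL_SCRUB_NORMAL);
fnvlist_add_uint64(args, "scan_date_start", start_sec);	/* 0 = no lower bound */
fnvlist_add_uint64(args, "scan_date_end", end_sec);	/* 0 = no upper bound */
/* The kernel maps the dates to a txg range via dbrrd_query() as shown above. */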
@@ -5000,15 +5025,6 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
}
break;
- case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
- /*
- * This property could require the allocation classes
- * feature to be active for setting, however we allow
- * it so that tests of settable properties succeed.
- * The CLI will issue a warning in this case.
- */
- break;
-
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
return (SET_ERROR(ENOTSUP));
diff --git a/sys/contrib/openzfs/module/zfs/zfs_log.c b/sys/contrib/openzfs/module/zfs/zfs_log.c
index 2ce25b72b288..2f61ecfd9b3b 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_log.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_log.c
@@ -607,8 +607,6 @@ zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
* called as soon as the write is on stable storage (be it via a DMU sync or a
* ZIL commit).
*/
-static uint_t zfs_immediate_write_sz = 32768;
-
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, offset_t off, ssize_t resid, boolean_t commit,
@@ -626,15 +624,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
return;
}
- if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct)
- write_state = WR_INDIRECT;
- else if (!spa_has_slogs(zilog->zl_spa) &&
- resid >= zfs_immediate_write_sz)
- write_state = WR_INDIRECT;
- else if (commit)
- write_state = WR_COPIED;
- else
- write_state = WR_NEED_COPY;
+ write_state = zil_write_state(zilog, resid, blocksize, o_direct,
+ commit);
(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &gen,
sizeof (gen));
@@ -938,6 +929,3 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
len -= partlen;
}
}
-
-ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, UINT, ZMOD_RW,
- "Largest data block to write to zil");
diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
index 656ca4dc22ff..74aa91a4f2eb 100644
--- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c
+++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
@@ -49,6 +49,7 @@
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_crypt.h>
+#include <sys/dsl_dataset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
@@ -67,13 +68,14 @@
int zfs_bclone_enabled = 1;
/*
- * When set zfs_clone_range() waits for dirty data to be written to disk.
- * This allows the clone operation to reliably succeed when a file is modified
- * and then immediately cloned. For small files this may be slower than making
- * a copy of the file and is therefore not the default. However, in certain
- * scenarios this behavior may be desirable so a tunable is provided.
+ * When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty
+ * data to be written to disk before proceeding. This ensures that the clone
+ * operation reliably succeeds, even if a file is modified and then immediately
+ * cloned. Note that for small files this may be slower than simply copying
+ * the file. When set to 0 the clone operation will immediately fail if it
+ * encounters any dirty blocks. By default waiting is enabled.
*/
-int zfs_bclone_wait_dirty = 0;
+int zfs_bclone_wait_dirty = 1;
/*
* Enable Direct I/O. If this setting is 0, then all I/O requests will be
@@ -114,9 +116,7 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);
- atomic_inc_32(&zp->z_sync_writes_cnt);
zil_commit(zfsvfs->z_log, zp->z_id);
- atomic_dec_32(&zp->z_sync_writes_cnt);
zfs_exit(zfsvfs, FTAG);
}
return (error);
@@ -1102,13 +1102,21 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags,
{
int error;
- if (flags != 0 || arg != 0)
+ if ((flags & ~ZFS_REWRITE_PHYSICAL) != 0 || arg != 0)
return (SET_ERROR(EINVAL));
zfsvfs_t *zfsvfs = ZTOZSB(zp);
if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);
+ /* Check if physical rewrite is allowed */
+ spa_t *spa = zfsvfs->z_os->os_spa;
+ if ((flags & ZFS_REWRITE_PHYSICAL) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_PHYSICAL_REWRITE)) {
+ zfs_exit(zfsvfs, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
if (zfs_is_readonly(zfsvfs)) {
zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EROFS));
@@ -1196,7 +1204,10 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags,
if (dmu_buf_is_dirty(dbp[i], tx))
continue;
nw += dbp[i]->db_size;
- dmu_buf_will_dirty(dbp[i], tx);
+ if (flags & ZFS_REWRITE_PHYSICAL)
+ dmu_buf_will_rewrite(dbp[i], tx);
+ else
+ dmu_buf_will_dirty(dbp[i], tx);
}
dmu_buf_rele_array(dbp, numbufs, FTAG);
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c
index 00059b2c6de0..6e4f84257407 100644
--- a/sys/contrib/openzfs/module/zfs/zil.c
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@@ -589,7 +589,7 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
* that we rewind to is invalid. Thus, we return -1 so
* zil_parse() doesn't attempt to read it.
*/
- if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg)
+ if (BP_GET_BIRTH(bp) >= first_txg)
return (-1);
if (zil_bp_tree_add(zilog, bp) != 0)
@@ -615,7 +615,7 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
* Claim log block if not already committed and not already claimed.
* If tx == NULL, just verify that the block is claimable.
*/
- if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) < first_txg ||
+ if (BP_IS_HOLE(bp) || BP_GET_BIRTH(bp) < first_txg ||
zil_bp_tree_add(zilog, bp) != 0)
return (0);
@@ -640,7 +640,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
* waited for all writes to be stable first), so it is semantically
* correct to declare this the end of the log.
*/
- if (BP_GET_LOGICAL_BIRTH(&lr->lr_blkptr) >= first_txg) {
+ if (BP_GET_BIRTH(&lr->lr_blkptr) >= first_txg) {
error = zil_read_log_data(zilog, lr, NULL);
if (error != 0)
return (error);
@@ -687,7 +687,7 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
* just in case lets be safe and just stop here now instead of
* corrupting the pool.
*/
- if (BP_GET_BIRTH(bp) >= first_txg)
+ if (BP_GET_PHYSICAL_BIRTH(bp) >= first_txg)
return (SET_ERROR(ENOENT));
/*
@@ -742,7 +742,7 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
/*
* If we previously claimed it, we need to free it.
*/
- if (BP_GET_LOGICAL_BIRTH(bp) >= claim_txg &&
+ if (BP_GET_BIRTH(bp) >= claim_txg &&
zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) {
zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
}
@@ -1997,7 +1997,7 @@ next_lwb:
&slog);
}
if (error == 0) {
- ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg);
+ ASSERT3U(BP_GET_BIRTH(bp), ==, txg);
BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
ZIO_CHECKSUM_ZILOG);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
@@ -2095,6 +2095,19 @@ zil_max_waste_space(zilog_t *zilog)
*/
static uint_t zil_maxcopied = 7680;
+/*
+ * Largest write size to store the data directly into ZIL.
+ */
+uint_t zfs_immediate_write_sz = 32768;
+
+/*
+ * When enabled and a block is destined for a normal vdev, treat special
+ * vdevs as SLOG and write the data into the ZIL (WR_COPIED/WR_NEED_COPY).
+ * Disabling this forces indirect writes (WR_INDIRECT), preserving special
+ * vdev throughput and endurance, likely at the cost of normal vdev latency.
+ */
+int zil_special_is_slog = 1;
+
uint64_t
zil_max_copied_data(zilog_t *zilog)
{
@@ -2102,6 +2115,46 @@ zil_max_copied_data(zilog_t *zilog)
return (MIN(max_data, zil_maxcopied));
}
+/*
+ * Determine the appropriate write state for ZIL transactions based on
+ * pool configuration, data placement, write size, and logbias settings.
+ */
+itx_wr_state_t
+zil_write_state(zilog_t *zilog, uint64_t size, uint32_t blocksize,
+ boolean_t o_direct, boolean_t commit)
+{
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct)
+ return (WR_INDIRECT);
+
+ /*
+ * Don't use indirect for writes that are too small, to reduce
+ * overhead. Also don't use indirect when less than half of a block
+ * is written and we are going to commit it immediately, since the
+ * next write might rewrite the same block again, causing inflation.
+ * If no commit is planned, later writes might coalesce, and then
+ * indirect may be a perfect fit.
+ */
+ boolean_t indirect = (size >= zfs_immediate_write_sz &&
+ (size >= blocksize / 2 || !commit));
+
+ if (spa_has_slogs(zilog->zl_spa)) {
+ /* Dedicated slogs: never use indirect */
+ indirect = B_FALSE;
+ } else if (spa_has_special(zilog->zl_spa)) {
+ /* Special vdevs: only when beneficial */
+ boolean_t on_special = (blocksize <=
+ zilog->zl_os->os_zpl_special_smallblock);
+ indirect &= (on_special || !zil_special_is_slog);
+ }
+
+ if (indirect)
+ return (WR_INDIRECT);
+ else if (commit)
+ return (WR_COPIED);
+ else
+ return (WR_NEED_COPY);
+}
+
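
A standalone sketch of the policy that the new zil_write_state() implements, with the pool-topology queries reduced to booleans (only the branch structure mirrors the function above; the parameter names are stand-ins):

    #include <stdbool.h>
    #include <stdio.h>

    typedef enum { WR_INDIRECT, WR_COPIED, WR_NEED_COPY } wr_state_t;

    static unsigned immediate_write_sz = 32768;  /* zfs_immediate_write_sz */
    static bool special_is_slog = true;          /* zil_special_is_slog */

    static wr_state_t
    write_state(unsigned size, unsigned blocksize, bool throughput,
        bool commit, bool has_slog, bool has_special, bool fits_special)
    {
        if (throughput)
            return (WR_INDIRECT);
        /*
         * Indirect only pays off for large writes, or partial-block
         * writes that will not be committed (and so may still coalesce).
         */
        bool indirect = size >= immediate_write_sz &&
            (size >= blocksize / 2 || !commit);
        if (has_slog)
            indirect = false;       /* dedicated slog: never indirect */
        else if (has_special)
            indirect &= (fits_special || !special_is_slog);
        return (indirect ? WR_INDIRECT :
            (commit ? WR_COPIED : WR_NEED_COPY));
    }

    int
    main(void)
    {
        /* 64 KiB commit write on a pool with a slog: WR_COPIED (1). */
        printf("%d\n", (int)write_state(65536, 131072, false, true,
            true, false, false));
        return (0);
    }
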
static uint64_t
zil_itx_record_size(itx_t *itx)
{
@@ -2902,19 +2955,14 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
- /*
- * Return if there's nothing to commit before we dirty the fs by
- * calling zil_create().
- */
- if (list_is_empty(&zilog->zl_itx_commit_list))
- return;
-
- list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
- list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
- offsetof(zil_commit_waiter_t, zcw_node));
-
lwb = list_tail(&zilog->zl_lwb_list);
if (lwb == NULL) {
+ /*
+ * Return if there's nothing to commit before we dirty the fs.
+ */
+ if (list_is_empty(&zilog->zl_itx_commit_list))
+ return;
+
lwb = zil_create(zilog);
} else {
/*
@@ -2942,6 +2990,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
}
}
+ list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
+ list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
+ offsetof(zil_commit_waiter_t, zcw_node));
+
while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
lr_t *lrc = &itx->itx_lr;
uint64_t txg = lrc->lrc_txg;
@@ -3111,7 +3163,8 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* possible, without significantly impacting the latency
* of each individual itx.
*/
- if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
+ if (lwb->lwb_state == LWB_STATE_OPENED &&
+ (!zilog->zl_parallel || zilog->zl_suspend > 0)) {
zil_burst_done(zilog);
list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
@@ -4418,3 +4471,9 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW,
"Limit in bytes WR_COPIED size");
+
+ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, UINT, ZMOD_RW,
+ "Largest write size to store data into ZIL");
+
+ZFS_MODULE_PARAM(zfs_zil, zil_, special_is_slog, INT, ZMOD_RW,
+ "Treat special vdevs as SLOG");
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
index 6d7bce8b0e10..218aec6093e2 100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -692,7 +692,7 @@ error:
zio->io_error = SET_ERROR(EIO);
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
spa_log_error(spa, &zio->io_bookmark,
- BP_GET_LOGICAL_BIRTH(zio->io_bp));
+ BP_GET_PHYSICAL_BIRTH(zio->io_bp));
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, &zio->io_bookmark, zio, 0);
}
@@ -850,15 +850,9 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
mutex_enter(&pio->io_lock);
if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
*errorp = zio_worst_error(*errorp, zio->io_error);
- pio->io_reexecute |= zio->io_reexecute;
+ pio->io_post |= zio->io_post;
ASSERT3U(*countp, >, 0);
- /*
- * Propogate the Direct I/O checksum verify failure to the parent.
- */
- if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
- pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
-
(*countp)--;
if (*countp == 0 && pio->io_stall == countp) {
@@ -1110,7 +1104,8 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
"DVA[1]=%#llx/%#llx "
"DVA[2]=%#llx/%#llx "
"prop=%#llx "
- "pad=%#llx,%#llx "
+ "prop2=%#llx "
+ "pad=%#llx "
"phys_birth=%#llx "
"birth=%#llx "
"fill=%#llx "
@@ -1123,9 +1118,9 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
(long long)bp->blk_dva[2].dva_word[0],
(long long)bp->blk_dva[2].dva_word[1],
(long long)bp->blk_prop,
- (long long)bp->blk_pad[0],
- (long long)bp->blk_pad[1],
- (long long)BP_GET_PHYSICAL_BIRTH(bp),
+ (long long)bp->blk_prop2,
+ (long long)bp->blk_pad,
+ (long long)BP_GET_RAW_PHYSICAL_BIRTH(bp),
(long long)BP_GET_LOGICAL_BIRTH(bp),
(long long)bp->blk_fill,
(long long)bp->blk_cksum.zc_word[0],
@@ -1340,7 +1335,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
{
zio_t *zio;
- zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp,
+ zio = zio_create(pio, spa, BP_GET_PHYSICAL_BIRTH(bp), bp,
data, size, size, done, private,
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
@@ -1649,7 +1644,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
* through the mirror during self healing. See comment in
* vdev_mirror_io_done() for more details.
*/
- ASSERT0(pio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
+ ASSERT0(pio->io_post & ZIO_POST_DIO_CHKSUM_ERR);
} else if (type == ZIO_TYPE_WRITE &&
pio->io_prop.zp_direct_write == B_TRUE) {
/*
@@ -1685,7 +1680,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
* If this is a retried I/O then we ignore it since we will
* have already processed the original allocating I/O.
*/
- if (flags & ZIO_FLAG_IO_ALLOCATING &&
+ if (flags & ZIO_FLAG_ALLOC_THROTTLED &&
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
ASSERT(pio->io_metaslab_class != NULL);
ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
@@ -1695,7 +1690,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
pio->io_child_type == ZIO_CHILD_GANG);
- flags &= ~ZIO_FLAG_IO_ALLOCATING;
+ flags &= ~ZIO_FLAG_ALLOC_THROTTLED;
}
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
@@ -1860,7 +1855,7 @@ zio_write_bp_init(zio_t *zio)
blkptr_t *bp = zio->io_bp;
zio_prop_t *zp = &zio->io_prop;
- ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg);
+ ASSERT(BP_GET_BIRTH(bp) != zio->io_txg);
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
@@ -1948,7 +1943,7 @@ zio_write_compress(zio_t *zio)
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
ASSERT(zio->io_bp_override == NULL);
- if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) {
+ if (!BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) == zio->io_txg) {
/*
* We're rewriting an existing block, which means we're
* working on behalf of spa_sync(). For spa_sync() to
@@ -2085,7 +2080,7 @@ zio_write_compress(zio_t *zio)
* spa_sync() to allocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
- if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg &&
+ if (!BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) == zio->io_txg &&
BP_GET_PSIZE(bp) == psize &&
pass >= zfs_sync_pass_rewrite) {
VERIFY3U(psize, !=, 0);
@@ -2602,7 +2597,7 @@ zio_reexecute(void *arg)
pio->io_flags = pio->io_orig_flags;
pio->io_stage = pio->io_orig_stage;
pio->io_pipeline = pio->io_orig_pipeline;
- pio->io_reexecute = 0;
+ pio->io_post = 0;
pio->io_flags |= ZIO_FLAG_REEXECUTED;
pio->io_pipeline_trace = 0;
pio->io_error = 0;
@@ -2749,11 +2744,14 @@ zio_resume_wait(spa_t *spa)
* being nearly full, it calls zio_write_gang_block() to construct the
* block from smaller fragments.
*
- * A gang block consists of a gang header (zio_gbh_phys_t) and up to
- * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
- * an indirect block: it's an array of block pointers. It consumes
- * only one sector and hence is allocatable regardless of fragmentation.
- * The gang header's bps point to its gang members, which hold the data.
+ * A gang block consists of a gang header and up to gbh_nblkptrs(size)
+ * gang members. The gang header is like an indirect block: it's an array
+ * of block pointers, though the header has a small tail (a zio_eck_t)
+ * that stores an embedded checksum. It is allocated with only a single
+ * sector as the requested size, and hence is allocatable regardless of
+ * fragmentation. Its actual size is determined by the smallest allocatable
+ * asize of the vdevs it was allocated on. The gang header's bps point
+ * to its gang members, which hold the data.
*
* Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
* as the verifier to ensure uniqueness of the SHA256 checksum.
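
The new gbh_nblkptrs()/gbh_eck() accessors are not shown in this hunk, but the comment pins down the layout: an array of block pointers followed by a zio_eck_t tail. A sketch of the implied capacity math, assuming a 128-byte blkptr_t and a 40-byte embedded-checksum tail (sizes are assumptions for illustration):

    #include <stdio.h>

    #define BLKPTR_SIZE 128  /* sizeof (blkptr_t), assumed */
    #define ZEC_SIZE    40   /* sizeof (zio_eck_t), assumed */

    /* Block pointers that fit in a gang header of the given size. */
    static unsigned
    gbh_nblkptrs_sketch(unsigned size)
    {
        return ((size - ZEC_SIZE) / BLKPTR_SIZE);
    }

    int
    main(void)
    {
        /* 512-byte legacy header: (512 - 40) / 128 = 3, the old limit. */
        printf("512B header: %u bps\n", gbh_nblkptrs_sketch(512));
        /* A 4 KiB header on ashift=12 vdevs holds far more members. */
        printf("4KiB header: %u bps\n", gbh_nblkptrs_sketch(4096));
        return (0);
    }
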
@@ -2832,10 +2830,10 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
if (gn != NULL) {
abd_t *gbh_abd =
- abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+ abd_get_from_buf(gn->gn_gbh, gn->gn_gangblocksize);
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
- gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
- pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+ gbh_abd, gn->gn_gangblocksize, zio_gang_issue_func_done,
+ NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark);
/*
* As we rewrite each gang header, the pipeline will compute
@@ -2906,14 +2904,16 @@ static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
static void zio_gang_tree_assemble_done(zio_t *zio);
static zio_gang_node_t *
-zio_gang_node_alloc(zio_gang_node_t **gnpp)
+zio_gang_node_alloc(zio_gang_node_t **gnpp, uint64_t gangblocksize)
{
zio_gang_node_t *gn;
ASSERT(*gnpp == NULL);
- gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
- gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
+ gn = kmem_zalloc(sizeof (*gn) +
+ (gbh_nblkptrs(gangblocksize) * sizeof (gn)), KM_SLEEP);
+ gn->gn_gangblocksize = gn->gn_allocsize = gangblocksize;
+ gn->gn_gbh = zio_buf_alloc(gangblocksize);
*gnpp = gn;
return (gn);
@@ -2924,11 +2924,12 @@ zio_gang_node_free(zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = *gnpp;
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++)
ASSERT(gn->gn_child[g] == NULL);
- zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
- kmem_free(gn, sizeof (*gn));
+ zio_buf_free(gn->gn_gbh, gn->gn_allocsize);
+ kmem_free(gn, sizeof (*gn) +
+ (gbh_nblkptrs(gn->gn_allocsize) * sizeof (gn)));
*gnpp = NULL;
}
@@ -2940,7 +2941,7 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
if (gn == NULL)
return;
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_allocsize); g++)
zio_gang_tree_free(&gn->gn_child[g]);
zio_gang_node_free(gnpp);
@@ -2949,13 +2950,28 @@ zio_gang_tree_free(zio_gang_node_t **gnpp)
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
- zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
- abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+ uint64_t gangblocksize = UINT64_MAX;
+ if (spa_feature_is_active(gio->io_spa,
+ SPA_FEATURE_DYNAMIC_GANG_HEADER)) {
+ spa_config_enter(gio->io_spa, SCL_VDEV, FTAG, RW_READER);
+ for (int dva = 0; dva < BP_GET_NDVAS(bp); dva++) {
+ vdev_t *vd = vdev_lookup_top(gio->io_spa,
+ DVA_GET_VDEV(&bp->blk_dva[dva]));
+ uint64_t psize = vdev_gang_header_psize(vd);
+ gangblocksize = MIN(gangblocksize, psize);
+ }
+ spa_config_exit(gio->io_spa, SCL_VDEV, FTAG);
+ } else {
+ gangblocksize = SPA_OLD_GANGBLOCKSIZE;
+ }
+ ASSERT3U(gangblocksize, !=, UINT64_MAX);
+ zio_gang_node_t *gn = zio_gang_node_alloc(gnpp, gangblocksize);
+ abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, gangblocksize);
ASSERT(gio->io_gang_leader == gio);
ASSERT(BP_IS_GANG(bp));
- zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, gangblocksize,
zio_gang_tree_assemble_done, gn, gio->io_priority,
ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
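
The loop above sizes the header read by the smallest header psize among the top-level vdevs holding the block's DVAs, so one buffer length works for every copy. A reduced sketch of that reduction, with the vdev lookup replaced by an array of assumed values:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Header psizes reported per DVA's top-level vdev (assumed). */
        uint64_t psize[3] = { 4096, 512, 4096 };
        uint64_t gangblocksize = UINT64_MAX;
        for (int dva = 0; dva < 3; dva++)
            if (psize[dva] < gangblocksize)
                gangblocksize = psize[dva];
        /* A mixed-ashift pool falls back to the 512-byte header. */
        printf("gangblocksize = %" PRIu64 "\n", gangblocksize);
        return (0);
    }
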
@@ -2978,13 +2994,17 @@ zio_gang_tree_assemble_done(zio_t *zio)
byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
- ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
- ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+ /*
+ * If this was an old-style gangblock, the gangblocksize should have
+ * been updated in zio_checksum_error to reflect that.
+ */
+ ASSERT3U(gbh_eck(gn->gn_gbh, gn->gn_gangblocksize)->zec_magic,
+ ==, ZEC_MAGIC);
abd_free(zio->io_abd);
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
- blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
+ blkptr_t *gbp = gbh_bp(gn->gn_gbh, g);
if (!BP_IS_GANG(gbp))
continue;
zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
@@ -3009,10 +3029,11 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
if (gn != NULL) {
- ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+ ASSERT3U(gbh_eck(gn->gn_gbh,
+ gn->gn_gangblocksize)->zec_magic, ==, ZEC_MAGIC);
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
- blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
+ blkptr_t *gbp = gbh_bp(gn->gn_gbh, g);
if (BP_IS_HOLE(gbp))
continue;
zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
@@ -3119,6 +3140,13 @@ zio_write_gang_done(zio_t *zio)
abd_free(zio->io_abd);
}
+static void
+zio_update_feature(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ spa_feature_incr(spa, (spa_feature_t)(uintptr_t)arg, tx);
+}
+
static zio_t *
zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
{
@@ -3157,20 +3185,24 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
ASSERT(ZIO_HAS_ALLOCATOR(pio));
int flags = METASLAB_GANG_HEADER;
- if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ if (pio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(has_data);
flags |= METASLAB_ASYNC_ALLOC;
}
- error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
+ uint64_t gangblocksize = SPA_OLD_GANGBLOCKSIZE;
+ uint64_t candidate = gangblocksize;
+ error = metaslab_alloc_range(spa, mc, gangblocksize, gangblocksize,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
- &pio->io_alloc_list, pio->io_allocator, pio);
+ &pio->io_alloc_list, pio->io_allocator, pio, &candidate);
if (error) {
pio->io_error = error;
return (pio);
}
+ if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER))
+ gangblocksize = candidate;
if (pio == gio) {
gnpp = &gio->io_gang_tree;
@@ -3179,23 +3211,24 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
ASSERT(pio->io_ready == zio_write_gang_member_ready);
}
- gn = zio_gang_node_alloc(gnpp);
+ gn = zio_gang_node_alloc(gnpp, gangblocksize);
gbh = gn->gn_gbh;
- memset(gbh, 0, SPA_GANGBLOCKSIZE);
- gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
+ memset(gbh, 0, gangblocksize);
+ gbh_abd = abd_get_from_buf(gbh, gangblocksize);
/*
* Create the gang header.
*/
- zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, gangblocksize,
zio_write_gang_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
zio_gang_inherit_allocator(pio, zio);
- if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ if (pio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) {
boolean_t more;
- VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies,
- zio, B_TRUE, &more));
+ VERIFY(metaslab_class_throttle_reserve(mc, zio->io_allocator,
+ gbh_copies, zio->io_size, B_TRUE, &more));
+ zio->io_flags |= ZIO_FLAG_ALLOC_THROTTLED;
}
/*
@@ -3203,7 +3236,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
* opportunistic allocations. If that fails to generate enough
* space, we fall back to normal zio_write calls for nested gang.
*/
- for (int g = 0; resid != 0; g++) {
+ int g;
+ boolean_t any_failed = B_FALSE;
+ for (g = 0; resid != 0; g++) {
flags &= METASLAB_ASYNC_ALLOC;
flags |= METASLAB_GANG_CHILD;
zp.zp_checksum = gio->io_prop.zp_checksum;
@@ -3224,9 +3259,9 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
uint64_t min_size = zio_roundup_alloc_size(spa,
- resid / (SPA_GBH_NBLKPTRS - g));
+ resid / (gbh_nblkptrs(gangblocksize) - g));
min_size = MIN(min_size, resid);
- bp = &gbh->zg_blkptr[g];
+ bp = &((blkptr_t *)gbh)[g];
zio_alloc_list_t cio_list;
metaslab_trace_init(&cio_list);
@@ -3236,6 +3271,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
flags, &cio_list, zio->io_allocator, NULL, &allocated_size);
boolean_t allocated = error == 0;
+ any_failed |= !allocated;
uint64_t psize = allocated ? MIN(resid, allocated_size) :
min_size;
@@ -3268,6 +3304,29 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
}
/*
+ * If we used more gang children than the old limit, we must already be
+ * using the new headers. No need to update anything, just move on.
+ *
+ * Otherwise, we might need to turn on the new feature, so we check for
+ * that. We enable it if we didn't manage to fit everything into 3 gang
+ * children even though a larger header could have held more.
+ */
+ if (g > gbh_nblkptrs(SPA_OLD_GANGBLOCKSIZE)) {
+ ASSERT(spa_feature_is_active(spa,
+ SPA_FEATURE_DYNAMIC_GANG_HEADER));
+ } else if (any_failed && candidate > SPA_OLD_GANGBLOCKSIZE &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER) &&
+ !spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER)) {
+ dmu_tx_t *tx =
+ dmu_tx_create_assigned(spa->spa_dsl_pool, txg + 1);
+ dsl_sync_task_nowait(spa->spa_dsl_pool,
+ zio_update_feature,
+ (void *)SPA_FEATURE_DYNAMIC_GANG_HEADER, tx);
+ dmu_tx_commit(tx);
+ }
+
+ /*
* Set pio's pipeline to just wait for zio to finish.
*/
pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
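
Condensed, the activation rule above fires only when the old header was the limiting factor: some child allocation fell short, a larger header was available, and the feature is enabled but not yet active. A boolean sketch of the condition (names are stand-ins for the spa_feature_* queries):

    #include <stdbool.h>
    #include <stdio.h>

    #define OLD_NBLKPTRS 3  /* gbh_nblkptrs(SPA_OLD_GANGBLOCKSIZE) */

    static bool
    should_activate(int children_used, bool any_failed, unsigned candidate,
        unsigned old_size, bool enabled, bool active)
    {
        if (children_used > OLD_NBLKPTRS)
            return (false);  /* already writing new-style headers */
        return (any_failed && candidate > old_size && enabled && !active);
    }

    int
    main(void)
    {
        /* 3 children, one alloc fell short, 4K header possible: activate. */
        printf("%d\n", should_activate(3, true, 4096, 512, true, false));
        return (0);
    }
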
@@ -3836,7 +3895,7 @@ zio_ddt_write(zio_t *zio)
* block and leave.
*/
if (have_dvas == 0) {
- ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
+ ASSERT(BP_GET_BIRTH(bp) == txg);
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
ddt_phys_extend(ddp, v, bp);
ddt_phys_addref(ddp, v);
@@ -3864,6 +3923,23 @@ zio_ddt_write(zio_t *zio)
* then we can just use them as-is.
*/
if (have_dvas >= need_dvas) {
+ /*
+ * For rewrite operations, try preserving the original
+ * logical birth time. If the result matches the
+ * original BP, this becomes a NOP.
+ */
+ if (zp->zp_rewrite) {
+ uint64_t orig_logical_birth =
+ BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig);
+ ddt_bp_fill(ddp, v, bp, orig_logical_birth);
+ if (BP_EQUAL(bp, &zio->io_bp_orig)) {
+ /* We can skip accounting. */
+ zio->io_flags |= ZIO_FLAG_NOPWRITE;
+ ddt_exit(ddt);
+ return (zio);
+ }
+ }
+
ddt_bp_fill(ddp, v, bp, txg);
ddt_phys_addref(ddp, v);
ddt_exit(ddt);
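
The rewrite path above first fills the BP with the original logical birth; if the result is bit-for-bit equal to the original BP, the write degenerates into a NOP and accounting is skipped. A sketch of that short-circuit with BPs reduced to two birth fields (illustrative stand-ins for ddt_bp_fill() and BP_EQUAL()):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t logical_birth, physical_birth; } mini_bp_t;

    /* Fill from the dedup entry, pinning the logical birth we pass in. */
    static void
    fill_bp(mini_bp_t *bp, uint64_t ddt_phys_birth, uint64_t logical_birth)
    {
        bp->physical_birth = ddt_phys_birth;
        bp->logical_birth = logical_birth;
    }

    int
    main(void)
    {
        mini_bp_t orig = { .logical_birth = 100, .physical_birth = 100 };
        mini_bp_t bp = { 0 };
        fill_bp(&bp, 100, orig.logical_birth);
        bool nop = bp.logical_birth == orig.logical_birth &&
            bp.physical_birth == orig.physical_birth;
        printf("nopwrite = %d\n", nop);  /* same DDT copy: skip the write */
        return (0);
    }
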
@@ -4078,9 +4154,11 @@ zio_io_to_allocate(metaslab_class_allocator_t *mca, boolean_t *more)
* reserve then we throttle.
*/
if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
- zio->io_prop.zp_copies, zio, B_FALSE, more)) {
+ zio->io_allocator, zio->io_prop.zp_copies, zio->io_size,
+ B_FALSE, more)) {
return (NULL);
}
+ zio->io_flags |= ZIO_FLAG_ALLOC_THROTTLED;
avl_remove(&mca->mca_tree, zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
@@ -4164,8 +4242,10 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_GANG);
memcpy(zio->io_bp->blk_dva, zio->io_bp_orig.blk_dva,
3 * sizeof (dva_t));
- BP_SET_BIRTH(zio->io_bp, BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig),
- BP_GET_PHYSICAL_BIRTH(&zio->io_bp_orig));
+ BP_SET_LOGICAL_BIRTH(zio->io_bp,
+ BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig));
+ BP_SET_PHYSICAL_BIRTH(zio->io_bp,
+ BP_GET_RAW_PHYSICAL_BIRTH(&zio->io_bp_orig));
return (zio);
}
@@ -4236,13 +4316,14 @@ again:
* If we are holding old class reservation, drop it.
* Dispatch the next ZIO(s) there if some are waiting.
*/
- if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) {
if (metaslab_class_throttle_unreserve(mc,
- zio->io_prop.zp_copies, zio)) {
+ zio->io_allocator, zio->io_prop.zp_copies,
+ zio->io_size)) {
zio_allocate_dispatch(zio->io_metaslab_class,
zio->io_allocator);
}
- zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
+ zio->io_flags &= ~ZIO_FLAG_ALLOC_THROTTLED;
}
if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
@@ -4291,6 +4372,15 @@ again:
error);
}
zio->io_error = error;
+ } else if (zio->io_prop.zp_rewrite) {
+ /*
+ * For rewrite operations, preserve the logical birth time
+ * but set the physical birth time to the current txg.
+ */
+ uint64_t logical_birth = BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig);
+ ASSERT3U(logical_birth, <=, zio->io_txg);
+ BP_SET_BIRTH(zio->io_bp, logical_birth, zio->io_txg);
+ BP_SET_REWRITE(zio->io_bp, 1);
}
return (zio);
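
For a fresh allocation on the rewrite path, the logical birth is carried over from the original BP while the physical birth becomes the allocating txg, so the block keeps its old logical age. A small sketch of the invariant the ASSERT3U above checks (mini_bp_t is an illustrative stand-in for blkptr_t):

    #include <assert.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t logical_birth, physical_birth; } mini_bp_t;

    int
    main(void)
    {
        uint64_t txg = 205;  /* allocating txg */
        mini_bp_t orig = { .logical_birth = 100, .physical_birth = 100 };
        /* BP_SET_BIRTH analogue: keep logical, stamp physical with txg. */
        mini_bp_t bp = { .logical_birth = orig.logical_birth,
            .physical_birth = txg };
        assert(bp.logical_birth <= txg);  /* mirrors the ASSERT3U */
        printf("logical=%" PRIu64 " physical=%" PRIu64 "\n",
            bp.logical_birth, bp.physical_birth);
        return (0);
    }
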
@@ -4324,18 +4414,17 @@ zio_dva_claim(zio_t *zio)
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
- ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp));
+ ASSERT(BP_GET_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp));
ASSERT(zio->io_bp_override == NULL);
if (!BP_IS_HOLE(bp)) {
- metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp),
- B_TRUE);
+ metaslab_free(zio->io_spa, bp, BP_GET_BIRTH(bp), B_TRUE);
}
if (gn != NULL) {
- for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ for (int g = 0; g < gbh_nblkptrs(gn->gn_gangblocksize); g++) {
zio_dva_unallocate(zio, gn->gn_child[g],
- &gn->gn_gbh->zg_blkptr[g]);
+ gbh_bp(gn->gn_gbh, g));
}
}
}
@@ -4347,7 +4436,7 @@ int
zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
uint64_t size, boolean_t *slog)
{
- int error = 1;
+ int error;
zio_alloc_list_t io_alloc_list;
ASSERT(txg > spa_syncing_txg(spa));
@@ -4372,14 +4461,34 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
int allocator = (uint_t)cityhash1(os->os_dsl_dataset->ds_object)
% spa->spa_alloc_count;
ZIOSTAT_BUMP(ziostat_total_allocations);
+
+ /* Try log class (dedicated slog devices) first */
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
txg, NULL, flags, &io_alloc_list, allocator, NULL);
*slog = (error == 0);
+
+ /* Try special_embedded_log class (reserved on special vdevs) */
+ if (error != 0) {
+ error = metaslab_alloc(spa, spa_special_embedded_log_class(spa),
+ size, new_bp, 1, txg, NULL, flags, &io_alloc_list,
+ allocator, NULL);
+ }
+
+ /* Try special class (general special vdev allocation) */
+ if (error != 0) {
+ error = metaslab_alloc(spa, spa_special_class(spa), size,
+ new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
+ NULL);
+ }
+
+ /* Try embedded_log class (reserved on normal vdevs) */
if (error != 0) {
error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
NULL);
}
+
+ /* Finally fall back to normal class */
if (error != 0) {
ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
error = metaslab_alloc(spa, spa_normal_class(spa), size,
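
The fallback order added above is strictly sequential: log, special embedded log, special, embedded log, then normal. A table-driven sketch of the same first-success chain (class names are labels; the allocator is a stub that fails until the given class):

    #include <stdio.h>
    #include <string.h>

    static const char *classes[] = {
        "log", "special_embedded_log", "special", "embedded_log", "normal"
    };

    /* Stub allocator: succeeds only for the class named by 'avail'. */
    static int
    try_alloc(const char *class, const char *avail)
    {
        return (strcmp(class, avail) == 0 ? 0 : 28 /* ENOSPC */);
    }

    int
    main(void)
    {
        const char *avail = "embedded_log";  /* assumed pool state */
        int error = 1;
        for (int i = 0; i < 5 && error != 0; i++) {
            error = try_alloc(classes[i], avail);
            if (error == 0)
                printf("allocated ZIL block from %s\n", classes[i]);
        }
        return (error);
    }
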
@@ -4722,7 +4831,7 @@ zio_vdev_io_assess(zio_t *zio)
* If a Direct I/O operation has a checksum verify error then this I/O
* should not attempt to be issued again.
*/
- if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
+ if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) {
if (zio->io_type == ZIO_TYPE_WRITE) {
ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
ASSERT3U(zio->io_error, ==, EIO);
@@ -5031,7 +5140,7 @@ zio_checksum_verify(zio_t *zio)
ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
}
- ASSERT0(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
+ ASSERT0(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR);
IMPLY(zio->io_flags & ZIO_FLAG_DIO_READ,
!(zio->io_flags & ZIO_FLAG_SPECULATIVE));
@@ -5040,7 +5149,7 @@ zio_checksum_verify(zio_t *zio)
if (error == ECKSUM &&
!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
if (zio->io_flags & ZIO_FLAG_DIO_READ) {
- zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+ zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
zio_t *pio = zio_unique_parent(zio);
/*
* Any Direct I/O read that has a checksum
@@ -5090,7 +5199,7 @@ zio_dio_checksum_verify(zio_t *zio)
if ((error = zio_checksum_error(zio, NULL)) != 0) {
zio->io_error = error;
if (error == ECKSUM) {
- zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
+ zio->io_post |= ZIO_POST_DIO_CHKSUM_ERR;
zio_dio_chksum_verify_error_report(zio);
}
}
@@ -5115,7 +5224,7 @@ zio_checksum_verified(zio_t *zio)
void
zio_dio_chksum_verify_error_report(zio_t *zio)
{
- ASSERT(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
+ ASSERT(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR);
if (zio->io_child_type == ZIO_CHILD_LOGICAL)
return;
@@ -5187,7 +5296,7 @@ zio_ready(zio_t *zio)
if (zio->io_ready) {
ASSERT(IO_IS_ALLOCATING(zio));
- ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg ||
+ ASSERT(BP_GET_BIRTH(bp) == zio->io_txg ||
BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
@@ -5202,7 +5311,7 @@ zio_ready(zio_t *zio)
if (zio->io_error != 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
- if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(zio->io_metaslab_class != NULL);
@@ -5213,8 +5322,8 @@ zio_ready(zio_t *zio)
* issue the next I/O to allocate.
*/
if (metaslab_class_throttle_unreserve(
- zio->io_metaslab_class, zio->io_prop.zp_copies,
- zio)) {
+ zio->io_metaslab_class, zio->io_allocator,
+ zio->io_prop.zp_copies, zio->io_size)) {
zio_allocate_dispatch(zio->io_metaslab_class,
zio->io_allocator);
}
@@ -5264,6 +5373,7 @@ zio_dva_throttle_done(zio_t *zio)
vdev_t *vd = zio->io_vd;
int flags = METASLAB_ASYNC_ALLOC;
const void *tag = pio;
+ uint64_t size = pio->io_size;
ASSERT3P(zio->io_bp, !=, NULL);
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
@@ -5273,16 +5383,19 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT3P(vd, ==, vd->vdev_top);
ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
- ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
+ ASSERT(zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED);
/*
* Parents of gang children can have two flavors -- ones that allocated
* the gang header (will have ZIO_FLAG_IO_REWRITE set) and ones that
* allocated the constituent blocks. The first use their parent as tag.
+ * We set the size to match the original allocation call for that case.
*/
if (pio->io_child_type == ZIO_CHILD_GANG &&
- (pio->io_flags & ZIO_FLAG_IO_REWRITE))
+ (pio->io_flags & ZIO_FLAG_IO_REWRITE)) {
tag = zio_unique_parent(pio);
+ size = SPA_OLD_GANGBLOCKSIZE;
+ }
ASSERT(IO_IS_ALLOCATING(pio) || (pio->io_child_type == ZIO_CHILD_GANG &&
(pio->io_flags & ZIO_FLAG_IO_REWRITE)));
@@ -5295,9 +5408,10 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id,
- pio->io_allocator, flags, pio->io_size, tag);
+ pio->io_allocator, flags, size, tag);
- if (metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio)) {
+ if (metaslab_class_throttle_unreserve(pio->io_metaslab_class,
+ pio->io_allocator, 1, pio->io_size)) {
zio_allocate_dispatch(zio->io_metaslab_class,
pio->io_allocator);
}
@@ -5328,7 +5442,7 @@ zio_done(zio_t *zio)
* write. We must do this since the allocation is performed
* by the logical I/O but the actual write is done by child I/Os.
*/
- if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
+ if (zio->io_flags & ZIO_FLAG_ALLOC_THROTTLED &&
zio->io_child_type == ZIO_CHILD_VDEV)
zio_dva_throttle_done(zio);
@@ -5337,8 +5451,6 @@ zio_done(zio_t *zio)
ASSERT(zio->io_children[c][w] == 0);
if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
- ASSERT(zio->io_bp->blk_pad[0] == 0);
- ASSERT(zio->io_bp->blk_pad[1] == 0);
ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy,
sizeof (blkptr_t)) == 0 ||
(zio->io_bp == zio_unique_parent(zio)->io_bp));
@@ -5431,7 +5543,7 @@ zio_done(zio_t *zio)
*/
if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
!vdev_is_dead(zio->io_vd) &&
- !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
+ !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) {
int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
if (ret != EALREADY) {
@@ -5446,14 +5558,14 @@ zio_done(zio_t *zio)
if ((zio->io_error == EIO || !(zio->io_flags &
(ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
- !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) &&
+ !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR) &&
zio == zio->io_logical) {
/*
* For logical I/O requests, tell the SPA to log the
* error and generate a logical data ereport.
*/
spa_log_error(zio->io_spa, &zio->io_bookmark,
- BP_GET_LOGICAL_BIRTH(zio->io_bp));
+ BP_GET_PHYSICAL_BIRTH(zio->io_bp));
(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
}
@@ -5467,7 +5579,7 @@ zio_done(zio_t *zio)
*/
if (zio->io_error == EAGAIN && IO_IS_ALLOCATING(zio) &&
zio->io_prop.zp_dedup) {
- zio->io_reexecute |= ZIO_REEXECUTE_NOW;
+ zio->io_post |= ZIO_POST_REEXECUTE;
zio->io_prop.zp_dedup = B_FALSE;
}
/*
@@ -5479,11 +5591,11 @@ zio_done(zio_t *zio)
if (IO_IS_ALLOCATING(zio) &&
!(zio->io_flags & ZIO_FLAG_CANFAIL) &&
- !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
+ !(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)) {
if (zio->io_error != ENOSPC)
- zio->io_reexecute |= ZIO_REEXECUTE_NOW;
+ zio->io_post |= ZIO_POST_REEXECUTE;
else
- zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+ zio->io_post |= ZIO_POST_SUSPEND;
}
if ((zio->io_type == ZIO_TYPE_READ ||
@@ -5492,10 +5604,11 @@ zio_done(zio_t *zio)
zio->io_error == ENXIO &&
spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
- zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+ zio->io_post |= ZIO_POST_SUSPEND;
- if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
- zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+ if (!(zio->io_flags & ZIO_FLAG_CANFAIL) &&
+ !(zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND)))
+ zio->io_post |= ZIO_POST_SUSPEND;
/*
* Here is a possibly good place to attempt to do
@@ -5514,7 +5627,8 @@ zio_done(zio_t *zio)
*/
zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
- if ((zio->io_error || zio->io_reexecute) &&
+ if ((zio->io_error ||
+ (zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND))) &&
IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
!(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
@@ -5525,16 +5639,16 @@ zio_done(zio_t *zio)
* Godfather I/Os should never suspend.
*/
if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
- (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
- zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
+ (zio->io_post & ZIO_POST_SUSPEND))
+ zio->io_post &= ~ZIO_POST_SUSPEND;
- if (zio->io_reexecute) {
+ if (zio->io_post & (ZIO_POST_REEXECUTE|ZIO_POST_SUSPEND)) {
/*
* A Direct I/O operation that has a checksum verify error
* should not attempt to reexecute. Instead, the error should
* just be propagated back.
*/
- ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR));
+ ASSERT0(zio->io_post & ZIO_POST_DIO_CHKSUM_ERR);
/*
* This is a logical I/O that wants to reexecute.
@@ -5571,7 +5685,7 @@ zio_done(zio_t *zio)
pio_next = zio_walk_parents(zio, &zl);
if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
- (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
+ (zio->io_post & ZIO_POST_SUSPEND)) {
zio_remove_child(pio, zio, remove_zl);
/*
* This is a rare code path, so we don't
@@ -5595,13 +5709,14 @@ zio_done(zio_t *zio)
* "next_to_execute".
*/
zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
- } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
+ } else if (zio->io_post & ZIO_POST_SUSPEND) {
/*
* We'd fail again if we reexecuted now, so suspend
* until conditions improve (e.g. device comes online).
*/
zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
} else {
+ ASSERT(zio->io_post & ZIO_POST_REEXECUTE);
/*
* Reexecution is potentially a huge amount of work.
* Hand it off to the otherwise-unused claim taskq.
@@ -5614,7 +5729,8 @@ zio_done(zio_t *zio)
}
ASSERT(list_is_empty(&zio->io_child_list));
- ASSERT(zio->io_reexecute == 0);
+ ASSERT0(zio->io_post & ZIO_POST_REEXECUTE);
+ ASSERT0(zio->io_post & ZIO_POST_SUSPEND);
ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
/*
diff --git a/sys/contrib/openzfs/module/zfs/zio_checksum.c b/sys/contrib/openzfs/module/zfs/zio_checksum.c
index a91775b04af2..63d0c6dadd46 100644
--- a/sys/contrib/openzfs/module/zfs/zio_checksum.c
+++ b/sys/contrib/openzfs/module/zfs/zio_checksum.c
@@ -279,7 +279,7 @@ static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
const dva_t *dva = BP_IDENTITY(bp);
- uint64_t txg = BP_GET_BIRTH(bp);
+ uint64_t txg = BP_GET_PHYSICAL_BIRTH(bp);
ASSERT(BP_IS_GANG(bp));
@@ -545,14 +545,39 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
(BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
int error;
- uint64_t size = (bp == NULL ? zio->io_size :
- (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
+ uint64_t size = bp ? BP_GET_PSIZE(bp) : zio->io_size;
uint64_t offset = zio->io_offset;
abd_t *data = zio->io_abd;
spa_t *spa = zio->io_spa;
+ if (bp && BP_IS_GANG(bp)) {
+ if (spa_feature_is_active(spa, SPA_FEATURE_DYNAMIC_GANG_HEADER))
+ size = zio->io_size;
+ else
+ size = SPA_OLD_GANGBLOCKSIZE;
+ }
+
error = zio_checksum_error_impl(spa, bp, checksum, data, size,
offset, info);
+ if (error && bp && BP_IS_GANG(bp) && size > SPA_OLD_GANGBLOCKSIZE) {
+ /*
+ * It's possible that this is an old gang block. Rerun
+ * the checksum with the old size; if that passes, then
+ * update the gangblocksize appropriately.
+ */
+ error = zio_checksum_error_impl(spa, bp, checksum, data,
+ SPA_OLD_GANGBLOCKSIZE, offset, info);
+ if (error == 0) {
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ zio_t *pio;
+ for (pio = zio_unique_parent(zio);
+ pio->io_child_type != ZIO_CHILD_GANG;
+ pio = zio_unique_parent(pio))
+ ;
+ zio_gang_node_t *gn = pio->io_private;
+ gn->gn_gangblocksize = SPA_OLD_GANGBLOCKSIZE;
+ }
+ }
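
The retry above handles pools where the dynamic-gang-header feature is active but a given gang block predates it: verify at the full size first, and on a checksum mismatch retry at the legacy 512-byte size, shrinking the in-memory node on success. A sketch of the retry shape (verify() is a stand-in for zio_checksum_error_impl()):

    #include <stdio.h>

    #define ECKSUM_ERR   122  /* stand-in error code */
    #define OLD_GBH_SIZE 512  /* SPA_OLD_GANGBLOCKSIZE */

    /* Stand-in: checksum matches only at the size the block was written. */
    static int
    verify(unsigned size, unsigned written_size)
    {
        return (size == written_size ? 0 : ECKSUM_ERR);
    }

    int
    main(void)
    {
        unsigned node_size = 4096;  /* feature active: assume new size */
        int error = verify(node_size, OLD_GBH_SIZE);
        if (error && node_size > OLD_GBH_SIZE) {
            error = verify(OLD_GBH_SIZE, OLD_GBH_SIZE);
            if (error == 0)
                node_size = OLD_GBH_SIZE;  /* old-style gang block */
        }
        printf("error=%d gangblocksize=%u\n", error, node_size);
        return (0);
    }
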
if (zio_injection_enabled && error == 0 && zio->io_error == 0) {
error = zio_handle_fault_injection(zio, ECKSUM);
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
index 3568d4f43fcb..7e264f308cf2 100644
--- a/sys/contrib/openzfs/module/zfs/zvol.c
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -102,6 +102,7 @@ extern int zfs_bclone_wait_dirty;
zv_taskq_t zvol_taskqs;
typedef enum {
+ ZVOL_ASYNC_CREATE_MINORS,
ZVOL_ASYNC_REMOVE_MINORS,
ZVOL_ASYNC_RENAME_MINORS,
ZVOL_ASYNC_SET_SNAPDEV,
@@ -110,10 +111,14 @@ typedef enum {
} zvol_async_op_t;
typedef struct {
- zvol_async_op_t op;
- char name1[MAXNAMELEN];
- char name2[MAXNAMELEN];
- uint64_t value;
+ zvol_async_op_t zt_op;
+ char zt_name1[MAXNAMELEN];
+ char zt_name2[MAXNAMELEN];
+ uint64_t zt_value;
+ uint32_t zt_total;
+ uint32_t zt_done;
+ int32_t zt_status;
+ int zt_error;
} zvol_task_t;
zv_request_task_t *
@@ -859,13 +864,8 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
};
/*
- * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
- *
- * We store data in the log buffers if it's small enough.
- * Otherwise we will later flush the data out via dmu_sync().
+ * zvol_log_write() handles TX_WRITE transactions.
*/
-static const ssize_t zvol_immediate_write_sz = 32768;
-
void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
uint64_t size, boolean_t commit)
@@ -878,15 +878,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
if (zil_replaying(zilog, tx))
return;
- if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
- write_state = WR_INDIRECT;
- else if (!spa_has_slogs(zilog->zl_spa) &&
- size >= blocksize && blocksize > zvol_immediate_write_sz)
- write_state = WR_INDIRECT;
- else if (commit)
- write_state = WR_COPIED;
- else
- write_state = WR_NEED_COPY;
+ write_state = zil_write_state(zilog, size, blocksize, B_FALSE, commit);
while (size) {
itx_t *itx;
@@ -1434,6 +1426,57 @@ zvol_create_minors_cb(const char *dsname, void *arg)
return (0);
}
+static void
+zvol_task_update_status(zvol_task_t *task, uint64_t total, uint64_t done,
+ int error)
+{
+
+ task->zt_total += total;
+ task->zt_done += done;
+ if (task->zt_total != task->zt_done) {
+ task->zt_status = -1;
+ if (error)
+ task->zt_error = error;
+ }
+}
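
The accumulator above marks the task failed whenever done lags total, remembering only the last nonzero error. A sketch of how repeated updates fold together (same fields, outside the kernel; the error value is illustrative):

    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
        uint32_t zt_total, zt_done;
        int32_t zt_status;
        int zt_error;
    } mini_task_t;

    static void
    update_status(mini_task_t *t, uint32_t total, uint32_t done, int error)
    {
        t->zt_total += total;
        t->zt_done += done;
        if (t->zt_total != t->zt_done) {
            t->zt_status = -1;
            if (error)
                t->zt_error = error;
        }
    }

    int
    main(void)
    {
        mini_task_t t = { 0 };
        update_status(&t, 3, 3, 0);   /* first batch: all minors created */
        update_status(&t, 2, 1, 17);  /* one failure in the second batch */
        printf("status=%d last_error=%d (%u/%u)\n",
            t.zt_status, t.zt_error, t.zt_done, t.zt_total);
        return (0);
    }
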
+
+static const char *
+zvol_task_op_msg(zvol_async_op_t op)
+{
+ switch (op) {
+ case ZVOL_ASYNC_CREATE_MINORS:
+ return ("create");
+ case ZVOL_ASYNC_REMOVE_MINORS:
+ return ("remove");
+ case ZVOL_ASYNC_RENAME_MINORS:
+ return ("rename");
+ case ZVOL_ASYNC_SET_SNAPDEV:
+ case ZVOL_ASYNC_SET_VOLMODE:
+ return ("set property");
+ default:
+ return ("unknown");
+ }
+
+ __builtin_unreachable();
+ return (NULL);
+}
+
+static void
+zvol_task_report_status(zvol_task_t *task)
+{
+
+ if (task->zt_status == 0)
+ return;
+
+ if (task->zt_error) {
+ dprintf("The %s minors zvol task was not ok, last error %d\n",
+ zvol_task_op_msg(task->zt_op), task->zt_error);
+ } else {
+ dprintf("The %s minors zvol task was not ok\n",
+ zvol_task_op_msg(task->zt_op));
+ }
+}
+
/*
* Create minors for the specified dataset, including children and snapshots.
* Pay attention to the 'snapdev' property and iterate over the snapshots
@@ -1451,14 +1494,27 @@ zvol_create_minors_cb(const char *dsname, void *arg)
* 'visible' (which also verifies that the parent is a zvol), and if so,
* a minor node for that snapshot is created.
*/
-void
-zvol_create_minors_recursive(const char *name)
+static void
+zvol_create_minors_impl(zvol_task_t *task)
{
+ const char *name = task->zt_name1;
list_t minors_list;
minors_job_t *job;
+ uint64_t snapdev;
+ int total = 0, done = 0, last_error = 0, error;
- if (zvol_inhibit_dev)
+ /*
+ * Note: the dsl_pool_config_lock must not be held.
+ * Minor node creation needs to obtain the zvol_state_lock.
+ * zvol_open() obtains the zvol_state_lock and then the dsl pool
+ * config lock. Therefore, we can't have the config lock now if
+ * we are going to wait for the zvol_state_lock, because it
+ * would be a lock order inversion which could lead to deadlock.
+ */
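
The comment above encodes a global lock order: zvol_state_lock before the dsl pool config lock. A generic pthread sketch of why taking them in the opposite order in another thread could deadlock (lock names here are illustrative stand-ins):

    #include <pthread.h>
    #include <stdio.h>

    /* Stand-ins for zvol_state_lock and the dsl pool config lock. */
    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Correct order everywhere: state_lock, then config_lock. */
    static void *
    open_path(void *arg)
    {
        (void) arg;
        pthread_mutex_lock(&state_lock);
        pthread_mutex_lock(&config_lock);
        puts("open: state -> config");
        pthread_mutex_unlock(&config_lock);
        pthread_mutex_unlock(&state_lock);
        return (NULL);
    }

    int
    main(void)
    {
        /*
         * If minor creation held config_lock here and then blocked on
         * state_lock, it would invert the order against open_path() and
         * could deadlock; hence the rule that config_lock is not held.
         */
        pthread_t t;
        pthread_create(&t, NULL, open_path, NULL);
        pthread_join(t, NULL);
        return (0);
    }
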
+
+ if (zvol_inhibit_dev) {
return;
+ }
/*
* This is the list for prefetch jobs. Whenever we found a match
@@ -1474,13 +1530,16 @@ zvol_create_minors_recursive(const char *name)
if (strchr(name, '@') != NULL) {
- uint64_t snapdev;
-
- int error = dsl_prop_get_integer(name, "snapdev",
- &snapdev, NULL);
-
- if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
- (void) zvol_os_create_minor(name);
+ error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL);
+ if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) {
+ error = zvol_os_create_minor(name);
+ if (error == 0) {
+ done++;
+ } else {
+ last_error = error;
+ }
+ total++;
+ }
} else {
fstrans_cookie_t cookie = spl_fstrans_mark();
(void) dmu_objset_find(name, zvol_create_minors_cb,
@@ -1495,41 +1554,30 @@ zvol_create_minors_recursive(const char *name)
* sequentially.
*/
while ((job = list_remove_head(&minors_list)) != NULL) {
- if (!job->error)
- (void) zvol_os_create_minor(job->name);
+ if (!job->error) {
+ error = zvol_os_create_minor(job->name);
+ if (error == 0) {
+ done++;
+ } else {
+ last_error = error;
+ }
+ } else if (job->error == EINVAL) {
+ /*
+ * An objset with the name requested by the current job
+ * exists, but its type is not zvol. Just ignore
+ * this sort of error.
+ */
+ done++;
+ } else {
+ last_error = job->error;
+ }
+ total++;
kmem_strfree(job->name);
kmem_free(job, sizeof (minors_job_t));
}
list_destroy(&minors_list);
-}
-
-void
-zvol_create_minor(const char *name)
-{
- /*
- * Note: the dsl_pool_config_lock must not be held.
- * Minor node creation needs to obtain the zvol_state_lock.
- * zvol_open() obtains the zvol_state_lock and then the dsl pool
- * config lock. Therefore, we can't have the config lock now if
- * we are going to wait for the zvol_state_lock, because it
- * would be a lock order inversion which could lead to deadlock.
- */
-
- if (zvol_inhibit_dev)
- return;
-
- if (strchr(name, '@') != NULL) {
- uint64_t snapdev;
-
- int error = dsl_prop_get_integer(name,
- "snapdev", &snapdev, NULL);
-
- if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
- (void) zvol_os_create_minor(name);
- } else {
- (void) zvol_os_create_minor(name);
- }
+ zvol_task_update_status(task, total, done, last_error);
}
/*
@@ -1577,10 +1625,11 @@ zvol_free_task(void *arg)
zvol_os_free(arg);
}
-void
-zvol_remove_minors_impl(const char *name)
+static void
+zvol_remove_minors_impl(zvol_task_t *task)
{
zvol_state_t *zv, *zv_next;
+ const char *name = task ? task->zt_name1 : NULL;
int namelen = ((name) ? strlen(name) : 0);
taskqid_t t;
list_t delay_list, free_list;
@@ -1662,13 +1711,13 @@ zvol_remove_minors_impl(const char *name)
}
/* Remove minor for this specific volume only */
-static void
+static int
zvol_remove_minor_impl(const char *name)
{
zvol_state_t *zv = NULL, *zv_next;
if (zvol_inhibit_dev)
- return;
+ return (0);
rw_enter(&zvol_state_lock, RW_WRITER);
@@ -1684,7 +1733,7 @@ zvol_remove_minor_impl(const char *name)
if (zv == NULL) {
rw_exit(&zvol_state_lock);
- return;
+ return (ENOENT);
}
ASSERT(MUTEX_HELD(&zv->zv_state_lock));
@@ -1698,7 +1747,7 @@ zvol_remove_minor_impl(const char *name)
mutex_exit(&zv->zv_state_lock);
rw_exit(&zvol_state_lock);
zvol_remove_minor_task(zv);
- return;
+ return (0);
}
zvol_remove(zv);
@@ -1708,16 +1757,20 @@ zvol_remove_minor_impl(const char *name)
rw_exit(&zvol_state_lock);
zvol_os_free(zv);
+
+ return (0);
}
/*
* Rename minors for specified dataset including children and snapshots.
*/
static void
-zvol_rename_minors_impl(const char *oldname, const char *newname)
+zvol_rename_minors_impl(zvol_task_t *task)
{
zvol_state_t *zv, *zv_next;
- int oldnamelen;
+ const char *oldname = task->zt_name1;
+ const char *newname = task->zt_name2;
+ int total = 0, done = 0, last_error = 0, error = 0, oldnamelen;
if (zvol_inhibit_dev)
return;
@@ -1732,24 +1785,31 @@ zvol_rename_minors_impl(const char *oldname, const char *newname)
mutex_enter(&zv->zv_state_lock);
if (strcmp(zv->zv_name, oldname) == 0) {
- zvol_os_rename_minor(zv, newname);
+ error = zvol_os_rename_minor(zv, newname);
} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
(zv->zv_name[oldnamelen] == '/' ||
zv->zv_name[oldnamelen] == '@')) {
char *name = kmem_asprintf("%s%c%s", newname,
zv->zv_name[oldnamelen],
zv->zv_name + oldnamelen + 1);
- zvol_os_rename_minor(zv, name);
+ error = zvol_os_rename_minor(zv, name);
kmem_strfree(name);
}
-
+ if (error) {
+ last_error = error;
+ } else {
+ done++;
+ }
+ total++;
mutex_exit(&zv->zv_state_lock);
}
rw_exit(&zvol_state_lock);
+ zvol_task_update_status(task, total, done, last_error);
}
typedef struct zvol_snapdev_cb_arg {
+ zvol_task_t *task;
uint64_t snapdev;
} zvol_snapdev_cb_arg_t;
@@ -1757,26 +1817,31 @@ static int
zvol_set_snapdev_cb(const char *dsname, void *param)
{
zvol_snapdev_cb_arg_t *arg = param;
+ int error = 0;
if (strchr(dsname, '@') == NULL)
return (0);
switch (arg->snapdev) {
case ZFS_SNAPDEV_VISIBLE:
- (void) zvol_os_create_minor(dsname);
+ error = zvol_os_create_minor(dsname);
break;
case ZFS_SNAPDEV_HIDDEN:
- (void) zvol_remove_minor_impl(dsname);
+ error = zvol_remove_minor_impl(dsname);
break;
}
+ zvol_task_update_status(arg->task, 1, error == 0, error);
return (0);
}
static void
-zvol_set_snapdev_impl(char *name, uint64_t snapdev)
+zvol_set_snapdev_impl(zvol_task_t *task)
{
- zvol_snapdev_cb_arg_t arg = {snapdev};
+ const char *name = task->zt_name1;
+ uint64_t snapdev = task->zt_value;
+
+ zvol_snapdev_cb_arg_t arg = {task, snapdev};
fstrans_cookie_t cookie = spl_fstrans_mark();
/*
* The zvol_set_snapdev_sync() sets snapdev appropriately
@@ -1787,11 +1852,14 @@ zvol_set_snapdev_impl(char *name, uint64_t snapdev)
}
static void
-zvol_set_volmode_impl(char *name, uint64_t volmode)
+zvol_set_volmode_impl(zvol_task_t *task)
{
+ const char *name = task->zt_name1;
+ uint64_t volmode = task->zt_value;
fstrans_cookie_t cookie;
uint64_t old_volmode;
zvol_state_t *zv;
+ int error;
if (strchr(name, '@') != NULL)
return;
@@ -1804,7 +1872,7 @@ zvol_set_volmode_impl(char *name, uint64_t volmode)
*/
zv = zvol_find_by_name(name, RW_NONE);
if (zv == NULL && volmode == ZFS_VOLMODE_NONE)
- return;
+ return;
if (zv != NULL) {
old_volmode = zv->zv_volmode;
mutex_exit(&zv->zv_state_lock);
@@ -1815,51 +1883,34 @@ zvol_set_volmode_impl(char *name, uint64_t volmode)
cookie = spl_fstrans_mark();
switch (volmode) {
case ZFS_VOLMODE_NONE:
- (void) zvol_remove_minor_impl(name);
+ error = zvol_remove_minor_impl(name);
break;
case ZFS_VOLMODE_GEOM:
case ZFS_VOLMODE_DEV:
- (void) zvol_remove_minor_impl(name);
- (void) zvol_os_create_minor(name);
+ error = zvol_remove_minor_impl(name);
+ /*
+ * The minor removal above may not be needed if volmode
+ * was switched from the 'none' value. Ignore the error
+ * in this case.
+ */
+ if (error == ENOENT)
+ error = 0;
+ else if (error)
+ break;
+ error = zvol_os_create_minor(name);
break;
case ZFS_VOLMODE_DEFAULT:
- (void) zvol_remove_minor_impl(name);
+ error = zvol_remove_minor_impl(name);
if (zvol_volmode == ZFS_VOLMODE_NONE)
break;
else /* if zvol_volmode is invalid defaults to "geom" */
- (void) zvol_os_create_minor(name);
+ error = zvol_os_create_minor(name);
break;
}
+ zvol_task_update_status(task, 1, error == 0, error);
spl_fstrans_unmark(cookie);
}
-static zvol_task_t *
-zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
- uint64_t value)
-{
- zvol_task_t *task;
-
- /* Never allow tasks on hidden names. */
- if (name1[0] == '$')
- return (NULL);
-
- task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
- task->op = op;
- task->value = value;
-
- strlcpy(task->name1, name1, sizeof (task->name1));
- if (name2 != NULL)
- strlcpy(task->name2, name2, sizeof (task->name2));
-
- return (task);
-}
-
-static void
-zvol_task_free(zvol_task_t *task)
-{
- kmem_free(task, sizeof (zvol_task_t));
-}
-
/*
* The worker thread function performed asynchronously.
*/
@@ -1868,25 +1919,29 @@ zvol_task_cb(void *arg)
{
zvol_task_t *task = arg;
- switch (task->op) {
+ switch (task->zt_op) {
+ case ZVOL_ASYNC_CREATE_MINORS:
+ zvol_create_minors_impl(task);
+ break;
case ZVOL_ASYNC_REMOVE_MINORS:
- zvol_remove_minors_impl(task->name1);
+ zvol_remove_minors_impl(task);
break;
case ZVOL_ASYNC_RENAME_MINORS:
- zvol_rename_minors_impl(task->name1, task->name2);
+ zvol_rename_minors_impl(task);
break;
case ZVOL_ASYNC_SET_SNAPDEV:
- zvol_set_snapdev_impl(task->name1, task->value);
+ zvol_set_snapdev_impl(task);
break;
case ZVOL_ASYNC_SET_VOLMODE:
- zvol_set_volmode_impl(task->name1, task->value);
+ zvol_set_volmode_impl(task);
break;
default:
VERIFY(0);
break;
}
- zvol_task_free(task);
+ zvol_task_report_status(task);
+ kmem_free(task, sizeof (zvol_task_t));
}
typedef struct zvol_set_prop_int_arg {
@@ -1931,23 +1986,17 @@ zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0)
return (0);
- switch (zsda->zsda_prop) {
- case ZFS_PROP_VOLMODE:
- task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname,
- NULL, prop);
- break;
- case ZFS_PROP_SNAPDEV:
- task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname,
- NULL, prop);
- break;
- default:
- task = NULL;
- break;
- }
-
- if (task == NULL)
+ task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+ if (zsda->zsda_prop == ZFS_PROP_VOLMODE) {
+ task->zt_op = ZVOL_ASYNC_SET_VOLMODE;
+ } else if (zsda->zsda_prop == ZFS_PROP_SNAPDEV) {
+ task->zt_op = ZVOL_ASYNC_SET_SNAPDEV;
+ } else {
+ kmem_free(task, sizeof (zvol_task_t));
return (0);
-
+ }
+ task->zt_value = prop;
+ strlcpy(task->zt_name1, dsname, sizeof (task->zt_name1));
(void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
task, TQ_SLEEP);
return (0);
@@ -2001,15 +2050,34 @@ zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source,
}
void
-zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
+zvol_create_minors(const char *name)
{
+ spa_t *spa;
zvol_task_t *task;
taskqid_t id;
- task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL);
- if (task == NULL)
+ if (spa_open(name, &spa, FTAG) != 0)
return;
+ task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+ task->zt_op = ZVOL_ASYNC_CREATE_MINORS;
+ strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
+ id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
+ if (id != TASKQID_INVALID)
+ taskq_wait_id(spa->spa_zvol_taskq, id);
+
+ spa_close(spa, FTAG);
+}
+
+void
+zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
+{
+ zvol_task_t *task;
+ taskqid_t id;
+
+ task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+ task->zt_op = ZVOL_ASYNC_REMOVE_MINORS;
+ strlcpy(task->zt_name1, name, sizeof (task->zt_name1));
id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
if ((async == B_FALSE) && (id != TASKQID_INVALID))
taskq_wait_id(spa->spa_zvol_taskq, id);
@@ -2022,10 +2090,10 @@ zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
zvol_task_t *task;
taskqid_t id;
- task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL);
- if (task == NULL)
- return;
-
+ task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+ task->zt_op = ZVOL_ASYNC_RENAME_MINORS;
+ strlcpy(task->zt_name1, name1, sizeof (task->zt_name1));
+ strlcpy(task->zt_name2, name2, sizeof (task->zt_name2));
id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
if ((async == B_FALSE) && (id != TASKQID_INVALID))
taskq_wait_id(spa->spa_zvol_taskq, id);