aboutsummaryrefslogtreecommitdiff
path: root/sys/contrib/openzfs/module/zfs/dbuf.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/contrib/openzfs/module/zfs/dbuf.c')
-rw-r--r--sys/contrib/openzfs/module/zfs/dbuf.c156
1 files changed, 109 insertions, 47 deletions
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c
index f1b5a17f337e..3d0f88b36336 100644
--- a/sys/contrib/openzfs/module/zfs/dbuf.c
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@@ -523,7 +523,7 @@ dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
return;
/* Only data blocks support the attachment of user data. */
- ASSERT(db->db_level == 0);
+ ASSERT0(db->db_level);
/* Clients must resolve a dbuf before attaching user data. */
ASSERT(db->db.db_data != NULL);
@@ -866,8 +866,16 @@ dbuf_evict_notify(uint64_t size)
* and grabbing the lock results in massive lock contention.
*/
if (size > dbuf_cache_target_bytes()) {
- if (size > dbuf_cache_hiwater_bytes())
+ /*
+ * Avoid calling dbuf_evict_one() from memory reclaim context
+ * (e.g. Linux kswapd, FreeBSD pagedaemon) to prevent deadlocks.
+ * Memory reclaim threads can get stuck waiting for the dbuf
+ * hash lock.
+ */
+ if (size > dbuf_cache_hiwater_bytes() &&
+ !current_is_reclaim_thread()) {
dbuf_evict_one();
+ }
cv_signal(&dbuf_evict_cv);
}
}
@@ -1120,8 +1128,8 @@ dbuf_verify(dmu_buf_impl_t *db)
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
if (dn == NULL) {
- ASSERT(db->db_parent == NULL);
- ASSERT(db->db_blkptr == NULL);
+ ASSERT0P(db->db_parent);
+ ASSERT0P(db->db_blkptr);
} else {
ASSERT3U(db->db.db_object, ==, dn->dn_object);
ASSERT3P(db->db_objset, ==, dn->dn_objset);
@@ -1172,7 +1180,7 @@ dbuf_verify(dmu_buf_impl_t *db)
/* db is pointed to by the dnode */
/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
- ASSERT(db->db_parent == NULL);
+ ASSERT0P(db->db_parent);
else
ASSERT(db->db_parent != NULL);
if (db->db_blkid != DMU_SPILL_BLKID)
@@ -1211,7 +1219,7 @@ dbuf_verify(dmu_buf_impl_t *db)
int i;
for (i = 0; i < db->db.db_size >> 3; i++) {
- ASSERT(buf[i] == 0);
+ ASSERT0(buf[i]);
}
} else {
blkptr_t *bps = db->db.db_data;
@@ -1235,11 +1243,9 @@ dbuf_verify(dmu_buf_impl_t *db)
DVA_IS_EMPTY(&bp->blk_dva[1]) &&
DVA_IS_EMPTY(&bp->blk_dva[2]));
ASSERT0(bp->blk_fill);
- ASSERT0(bp->blk_pad[0]);
- ASSERT0(bp->blk_pad[1]);
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(BP_IS_HOLE(bp));
- ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
+ ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp));
}
}
}
@@ -1253,7 +1259,7 @@ dbuf_clear_data(dmu_buf_impl_t *db)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
dbuf_evict_user(db);
- ASSERT3P(db->db_buf, ==, NULL);
+ ASSERT0P(db->db_buf);
db->db.db_data = NULL;
if (db->db_state != DB_NOFILL) {
db->db_state = DB_UNCACHED;
@@ -1378,13 +1384,13 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
* All reads are synchronous, so we must have a hold on the dbuf
*/
ASSERT(zfs_refcount_count(&db->db_holds) > 0);
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db.db_data == NULL);
+ ASSERT0P(db->db_buf);
+ ASSERT0P(db->db.db_data);
if (buf == NULL) {
/* i/o error */
ASSERT(zio == NULL || zio->io_error != 0);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT3P(db->db_buf, ==, NULL);
+ ASSERT0P(db->db_buf);
db->db_state = DB_UNCACHED;
DTRACE_SET_STATE(db, "i/o error");
} else if (db->db_level == 0 && db->db_freed_in_flight) {
@@ -1578,7 +1584,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
- ASSERT(db->db_buf == NULL);
+ ASSERT0P(db->db_buf);
ASSERT(db->db_parent == NULL ||
RW_LOCK_HELD(&db->db_parent->db_rwlock));
@@ -1615,7 +1621,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags,
*/
if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {
spa_log_error(db->db_objset->os_spa, &zb,
- BP_GET_LOGICAL_BIRTH(bp));
+ BP_GET_PHYSICAL_BIRTH(bp));
err = SET_ERROR(EIO);
goto early_unlock;
}
@@ -1676,7 +1682,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db.db_data != NULL);
- ASSERT(db->db_level == 0);
+ ASSERT0(db->db_level);
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
if (dr == NULL ||
@@ -1895,8 +1901,8 @@ dbuf_noread(dmu_buf_impl_t *db, dmu_flags_t flags)
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db.db_data == NULL);
+ ASSERT0P(db->db_buf);
+ ASSERT0P(db->db.db_data);
dbuf_set_data(db, dbuf_alloc_arcbuf(db));
db->db_state = DB_FILL;
DTRACE_SET_STATE(db, "assigning filled buffer");
@@ -1923,7 +1929,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
* comes from dbuf_dirty() callers who must also hold a range lock.
*/
ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
- ASSERT(db->db_level == 0);
+ ASSERT0(db->db_level);
if (db->db_blkid == DMU_BONUS_BLKID ||
dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
@@ -1988,7 +1994,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
mutex_enter(&dn->dn_dbufs_mtx);
db = avl_find(&dn->dn_dbufs, db_search, &where);
- ASSERT3P(db, ==, NULL);
+ ASSERT0P(db);
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
@@ -2011,7 +2017,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
if (db->db_state == DB_UNCACHED ||
db->db_state == DB_NOFILL ||
db->db_state == DB_EVICTING) {
- ASSERT(db->db.db_data == NULL);
+ ASSERT0P(db->db.db_data);
mutex_exit(&db->db_mtx);
continue;
}
@@ -2154,6 +2160,12 @@ dbuf_redirty(dbuf_dirty_record_t *dr)
ASSERT(arc_released(db->db_buf));
arc_buf_thaw(db->db_buf);
}
+
+ /*
+ * Clear the rewrite flag since this is now a logical
+ * modification.
+ */
+ dr->dt.dl.dr_rewrite = B_FALSE;
}
}
@@ -2701,6 +2713,38 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
}
+void
+dmu_buf_will_rewrite(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+
+ /*
+ * If the dbuf is already dirty in this txg, it will be written
+ * anyway, so there's nothing to do.
+ */
+ mutex_enter(&db->db_mtx);
+ if (dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+ mutex_exit(&db->db_mtx);
+
+ /*
+ * The dbuf is not dirty, so we need to make it dirty and
+ * mark it for rewrite (preserve logical birth time).
+ */
+ dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH);
+
+ mutex_enter(&db->db_mtx);
+ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+ if (dr != NULL && db->db_level == 0)
+ dr->dt.dl.dr_rewrite = B_TRUE;
+ mutex_exit(&db->db_mtx);
+}
+
boolean_t
dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
@@ -2852,8 +2896,8 @@ dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx)
dbuf_clear_data(db);
}
- ASSERT3P(db->db_buf, ==, NULL);
- ASSERT3P(db->db.db_data, ==, NULL);
+ ASSERT0P(db->db_buf);
+ ASSERT0P(db->db.db_data);
db->db_state = DB_NOFILL;
DTRACE_SET_STATE(db,
@@ -2888,7 +2932,7 @@ dmu_buf_will_fill_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail,
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(tx->tx_txg != 0);
- ASSERT(db->db_level == 0);
+ ASSERT0(db->db_level);
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
@@ -3100,7 +3144,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
{
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT(db->db_level == 0);
+ ASSERT0(db->db_level);
ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
ASSERT(buf != NULL);
ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
@@ -3165,7 +3209,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
VERIFY(!dbuf_undirty(db, tx));
db->db_state = DB_UNCACHED;
}
- ASSERT(db->db_buf == NULL);
+ ASSERT0P(db->db_buf);
dbuf_set_data(db, buf);
db->db_state = DB_FILL;
DTRACE_SET_STATE(db, "filling assigned arcbuf");
@@ -3225,7 +3269,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
}
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
- ASSERT(db->db_data_pending == NULL);
+ ASSERT0P(db->db_data_pending);
ASSERT(list_is_empty(&db->db_dirty_records));
db->db_state = DB_EVICTING;
@@ -3277,11 +3321,11 @@ dbuf_destroy(dmu_buf_impl_t *db)
db->db_parent = NULL;
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db.db_data == NULL);
- ASSERT(db->db_hash_next == NULL);
- ASSERT(db->db_blkptr == NULL);
- ASSERT(db->db_data_pending == NULL);
+ ASSERT0P(db->db_buf);
+ ASSERT0P(db->db.db_data);
+ ASSERT0P(db->db_hash_next);
+ ASSERT0P(db->db_blkptr);
+ ASSERT0P(db->db_data_pending);
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
ASSERT(!multilist_link_active(&db->db_cache_link));
@@ -3916,7 +3960,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
if (fail_uncached)
return (SET_ERROR(ENOENT));
- ASSERT3P(parent, ==, NULL);
+ ASSERT0P(parent);
err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
if (fail_sparse) {
if (err == 0 && bp && BP_IS_HOLE(bp))
@@ -4020,7 +4064,7 @@ dbuf_create_bonus(dnode_t *dn)
{
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
- ASSERT(dn->dn_bonus == NULL);
+ ASSERT0P(dn->dn_bonus);
dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
dn->dn_bonus->db_pending_evict = FALSE;
@@ -4372,7 +4416,7 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
* inappropriate to hook it in (i.e., nlevels mismatch).
*/
ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
- ASSERT(db->db_parent == NULL);
+ ASSERT0P(db->db_parent);
db->db_parent = dn->dn_dbuf;
db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
DBUF_VERIFY(db);
@@ -4433,7 +4477,7 @@ dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr)
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
- ASSERT3U(db->db_level, ==, 0);
+ ASSERT0(db->db_level);
if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) {
zbookmark_phys_t zb;
@@ -4544,7 +4588,7 @@ dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
/* ensure that everything is zero after our data */
for (; datap_end < datap_max; datap_end++)
- ASSERT(*datap_end == 0);
+ ASSERT0(*datap_end);
#endif
}
@@ -4552,7 +4596,7 @@ static blkptr_t *
dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
{
/* This must be a lightweight dirty record. */
- ASSERT3P(dr->dr_dbuf, ==, NULL);
+ ASSERT0P(dr->dr_dbuf);
dnode_t *dn = dr->dr_dnode;
if (dn->dn_phys->dn_nlevels == 1) {
@@ -4695,7 +4739,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
*/
if (db->db_state == DB_UNCACHED) {
/* This buffer has been freed since it was dirtied */
- ASSERT3P(db->db.db_data, ==, NULL);
+ ASSERT0P(db->db.db_data);
} else if (db->db_state == DB_FILL) {
/* This buffer was freed and is now being re-filled */
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
@@ -4712,9 +4756,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
*/
dbuf_dirty_record_t *dr_head =
list_head(&db->db_dirty_records);
- ASSERT3P(db->db_buf, ==, NULL);
- ASSERT3P(db->db.db_data, ==, NULL);
- ASSERT3P(dr_head->dt.dl.dr_data, ==, NULL);
+ ASSERT0P(db->db_buf);
+ ASSERT0P(db->db.db_data);
+ ASSERT0P(dr_head->dt.dl.dr_data);
ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN);
} else {
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
@@ -4899,7 +4943,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
zio->io_prev_space_delta = delta;
- if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
+ if (BP_GET_BIRTH(bp) != 0) {
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
@@ -5186,7 +5230,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
drica.drica_os = dn->dn_objset;
- drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
+ drica.drica_blk_birth = BP_GET_BIRTH(bp);
drica.drica_tx = tx;
if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
&drica)) {
@@ -5201,8 +5245,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
if (dn->dn_objset != spa_meta_objset(spa)) {
dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
- BP_GET_LOGICAL_BIRTH(bp) >
- ds->ds_dir->dd_origin_txg) {
+ BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
@@ -5320,7 +5363,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
}
ASSERT(db->db_level == 0 || data == db->db_buf);
- ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
+ ASSERT3U(BP_GET_BIRTH(db->db_blkptr), <=, txg);
ASSERT(pio);
SET_BOOKMARK(&zb, os->os_dsl_dataset ?
@@ -5334,6 +5377,24 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
/*
+ * Set rewrite properties for zfs_rewrite() operations.
+ */
+ if (db->db_level == 0 && dr->dt.dl.dr_rewrite) {
+ zp.zp_rewrite = B_TRUE;
+
+ /*
+ * Mark physical rewrite feature for activation.
+ * This will be activated automatically during dataset sync.
+ */
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ if (!dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_PHYSICAL_REWRITE)) {
+ ds->ds_feature_activation[
+ SPA_FEATURE_PHYSICAL_REWRITE] = (void *)B_TRUE;
+ }
+ }
+
+ /*
* We copy the blkptr now (rather than when we instantiate the dirty
* record), because its value can change between open context and
* syncing context. We do not need to hold dn_struct_rwlock to read
@@ -5403,6 +5464,7 @@ EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
+EXPORT_SYMBOL(dmu_buf_will_rewrite);
EXPORT_SYMBOL(dmu_buf_is_dirty);
EXPORT_SYMBOL(dmu_buf_will_clone_or_dio);
EXPORT_SYMBOL(dmu_buf_will_not_fill);