Diffstat (limited to 'sys/contrib/openzfs/module/zfs/dnode.c')
-rw-r--r--  sys/contrib/openzfs/module/zfs/dnode.c  222
1 file changed, 126 insertions, 96 deletions
diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c
index 6f68e76561bc..e88d394b5229 100644
--- a/sys/contrib/openzfs/module/zfs/dnode.c
+++ b/sys/contrib/openzfs/module/zfs/dnode.c
@@ -86,6 +86,19 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
#endif /* _KERNEL */
+static char *
+rt_name(dnode_t *dn, const char *name)
+{
+ struct objset *os = dn->dn_objset;
+
+ return (kmem_asprintf("{spa=%s objset=%llu obj=%llu %s}",
+ spa_name(os->os_spa),
+ (u_longlong_t)(os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET),
+ (u_longlong_t)dn->dn_object,
+ name));
+}
+
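[Editor's note, not part of the diff: rt_name() returns a heap-allocated
string built with kmem_asprintf(); for a hypothetical pool "tank", dataset
object 54, and dnode 1234 it would produce
"{spa=tank objset=54 obj=1234 dn_free_ranges}". The ZFS_RT_F_DYN_NAME flag
paired with it later in this diff marks the name as dynamically allocated,
apparently so the range tree can free it on destroy; a caller using the
string directly would release it itself:

	char *name = rt_name(dn, "dn_free_ranges");
	zfs_dbgmsg("creating range tree %s", name);
	kmem_strfree(name);	/* kmem_asprintf() strings are freed this way */
]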
static int
dbuf_compare(const void *x1, const void *x2)
{
@@ -160,9 +173,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_assigned_txg = 0;
- dn->dn_dirty_txg = 0;
- dn->dn_dirtyctx = 0;
- dn->dn_dirtyctx_firstset = NULL;
+ dn->dn_dirtycnt = 0;
dn->dn_bonus = NULL;
dn->dn_have_spill = B_FALSE;
dn->dn_zio = NULL;
@@ -201,7 +212,7 @@ dnode_dest(void *arg, void *unused)
for (int i = 0; i < TXG_SIZE; i++) {
ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
- ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
+ ASSERT0P(dn->dn_free_ranges[i]);
list_destroy(&dn->dn_dirty_records[i]);
ASSERT0(dn->dn_next_nblkptr[i]);
ASSERT0(dn->dn_next_nlevels[i]);
@@ -216,12 +227,10 @@ dnode_dest(void *arg, void *unused)
ASSERT0(dn->dn_allocated_txg);
ASSERT0(dn->dn_free_txg);
ASSERT0(dn->dn_assigned_txg);
- ASSERT0(dn->dn_dirty_txg);
- ASSERT0(dn->dn_dirtyctx);
- ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
- ASSERT3P(dn->dn_bonus, ==, NULL);
+ ASSERT0(dn->dn_dirtycnt);
+ ASSERT0P(dn->dn_bonus);
ASSERT(!dn->dn_have_spill);
- ASSERT3P(dn->dn_zio, ==, NULL);
+ ASSERT0P(dn->dn_zio);
ASSERT0(dn->dn_oldused);
ASSERT0(dn->dn_oldflags);
ASSERT0(dn->dn_olduid);
@@ -305,7 +314,7 @@ dnode_kstats_update(kstat_t *ksp, int rw)
void
dnode_init(void)
{
- ASSERT(dnode_cache == NULL);
+ ASSERT0P(dnode_cache);
dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
0, dnode_cons, dnode_dest, NULL, NULL, NULL, KMC_RECLAIMABLE);
kmem_cache_set_move(dnode_cache, dnode_move);
@@ -496,7 +505,7 @@ dnode_buf_byteswap(void *vbuf, size_t size)
int i = 0;
ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
- ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
+ ASSERT0((size & (sizeof (dnode_phys_t)-1)));
while (i < size) {
dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
@@ -660,7 +669,7 @@ dnode_destroy(dnode_t *dn)
objset_t *os = dn->dn_objset;
boolean_t complete_os_eviction = B_FALSE;
- ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
+ ASSERT0((dn->dn_id_flags & DN_ID_NEW_EXIST));
mutex_enter(&os->os_lock);
POINTER_INVALIDATE(&dn->dn_objset);
@@ -679,10 +688,8 @@ dnode_destroy(dnode_t *dn)
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_assigned_txg = 0;
- dn->dn_dirty_txg = 0;
- dn->dn_dirtyctx = 0;
- dn->dn_dirtyctx_firstset = NULL;
+ dn->dn_dirtycnt = 0;
if (dn->dn_bonus != NULL) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_destroy(dn->dn_bonus);
@@ -767,7 +774,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ASSERT0(dn->dn_next_maxblkid[i]);
ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
- ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
+ ASSERT0P(dn->dn_free_ranges[i]);
}
dn->dn_type = ot;
@@ -787,11 +794,9 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dn->dn_bonuslen = bonuslen;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
dn->dn_compress = ZIO_COMPRESS_INHERIT;
- dn->dn_dirtyctx = 0;
dn->dn_free_txg = 0;
- dn->dn_dirtyctx_firstset = NULL;
- dn->dn_dirty_txg = 0;
+ dn->dn_dirtycnt = 0;
dn->dn_allocated_txg = tx->tx_txg;
dn->dn_id_flags = 0;
@@ -942,10 +947,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
ndn->dn_allocated_txg = odn->dn_allocated_txg;
ndn->dn_free_txg = odn->dn_free_txg;
ndn->dn_assigned_txg = odn->dn_assigned_txg;
- ndn->dn_dirty_txg = odn->dn_dirty_txg;
- ndn->dn_dirtyctx = odn->dn_dirtyctx;
- ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
- ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0);
+ ndn->dn_dirtycnt = odn->dn_dirtycnt;
+ ASSERT0(zfs_refcount_count(&odn->dn_tx_holds));
zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
ASSERT(avl_is_empty(&ndn->dn_dbufs));
avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
@@ -1007,9 +1010,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
odn->dn_allocated_txg = 0;
odn->dn_free_txg = 0;
odn->dn_assigned_txg = 0;
- odn->dn_dirty_txg = 0;
- odn->dn_dirtyctx = 0;
- odn->dn_dirtyctx_firstset = NULL;
+ odn->dn_dirtycnt = 0;
odn->dn_have_spill = B_FALSE;
odn->dn_zio = NULL;
odn->dn_oldused = 0;
@@ -1260,8 +1261,8 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
} else if (DN_SLOT_IS_PTR(dn)) {
mutex_enter(&dn->dn_mtx);
boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
- zfs_refcount_is_zero(&dn->dn_holds) &&
- !DNODE_IS_DIRTY(dn));
+ dn->dn_dirtycnt == 0 &&
+ zfs_refcount_is_zero(&dn->dn_holds));
mutex_exit(&dn->dn_mtx);
if (!can_free)
@@ -1510,7 +1511,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
* if we get the encrypted or decrypted version.
*/
err = dbuf_read(db, NULL, DB_RF_CANFAIL |
- DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
+ DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
if (err) {
DNODE_STAT_BUMP(dnode_hold_dbuf_read);
dbuf_rele(db, FTAG);
@@ -1744,17 +1745,23 @@ dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp)
* reference on the dnode. Returns FALSE if unable to add a
* new reference.
*/
+static boolean_t
+dnode_add_ref_locked(dnode_t *dn, const void *tag)
+{
+ ASSERT(MUTEX_HELD(&dn->dn_mtx));
+ if (zfs_refcount_is_zero(&dn->dn_holds))
+ return (FALSE);
+ VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
+ return (TRUE);
+}
+
boolean_t
dnode_add_ref(dnode_t *dn, const void *tag)
{
mutex_enter(&dn->dn_mtx);
- if (zfs_refcount_is_zero(&dn->dn_holds)) {
- mutex_exit(&dn->dn_mtx);
- return (FALSE);
- }
- VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
+ boolean_t r = dnode_add_ref_locked(dn, tag);
mutex_exit(&dn->dn_mtx);
- return (TRUE);
+ return (r);
}
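[Editor's note, not part of the diff: splitting out dnode_add_ref_locked()
lets a caller that already holds dn_mtx take a hold atomically with other
state it is updating under the same lock, instead of the old
enter/check/exit sequence. dnode_setdirty() below relies on this so the hold
and the dn_dirtycnt bump form a single critical section:

	mutex_enter(&dn->dn_mtx);
	VERIFY(dnode_add_ref_locked(dn, tag));	/* fails only if holds == 0 */
	/* ... other dn_mtx-protected updates ... */
	mutex_exit(&dn->dn_mtx);
]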
void
@@ -1817,31 +1824,20 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)
}
/*
- * Checks if the dnode itself is dirty, or is carrying any uncommitted records.
- * It is important to check both conditions, as some operations (eg appending
- * to a file) can dirty both as a single logical unit, but they are not synced
- * out atomically, so checking one and not the other can result in an object
- * appearing to be clean mid-way through a commit.
+ * Test if the dnode is dirty, or carrying uncommitted records.
*
- * Do not change this lightly! If you get it wrong, dmu_offset_next() can
- * detect a hole where there is really data, leading to silent corruption.
+ * dn_dirtycnt is the number of txgs this dnode is dirty on. It's incremented
+ * in dnode_setdirty() the first time the dnode is dirtied on a txg, and
+ * decremented in either dnode_rele_task() or userquota_updates_task() when the
+ * txg is synced out.
*/
boolean_t
dnode_is_dirty(dnode_t *dn)
{
mutex_enter(&dn->dn_mtx);
-
- for (int i = 0; i < TXG_SIZE; i++) {
- if (multilist_link_active(&dn->dn_dirty_link[i]) ||
- !list_is_empty(&dn->dn_dirty_records[i])) {
- mutex_exit(&dn->dn_mtx);
- return (B_TRUE);
- }
- }
-
+ boolean_t dirty = (dn->dn_dirtycnt != 0);
mutex_exit(&dn->dn_mtx);
-
- return (B_FALSE);
+ return (dirty);
}
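[Editor's note - a sketch of the dn_dirtycnt lifecycle described in the new
comment, not part of the diff. The counter moves once per txg, not once per
dirty record:

	/* first dirtying in txg N (dnode_setdirty): dn_dirtycnt 0 -> 1 */
	/* first dirtying in txg N+1:                dn_dirtycnt 1 -> 2 */
	/* txg N synced (dnode_rele_task() or
	   userquota_updates_task()):                dn_dirtycnt 2 -> 1 */

So dnode_is_dirty() reduces to a counter test under dn_mtx, replacing the
old scan of dn_dirty_link[]/dn_dirty_records[] across all TXG_SIZE slots.]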
void
@@ -1903,7 +1899,11 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
* dnode will hang around after we finish processing its
* children.
*/
- VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
+ mutex_enter(&dn->dn_mtx);
+ VERIFY(dnode_add_ref_locked(dn, (void *)(uintptr_t)tx->tx_txg));
+ dn->dn_dirtycnt++;
+ ASSERT3U(dn->dn_dirtycnt, <=, 3);
+ mutex_exit(&dn->dn_mtx);
(void) dbuf_dirty(dn->dn_dbuf, tx);
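[Editor's note, not part of the diff: the upper bound of 3 matches
TXG_CONCURRENT_STATES - a dnode can be dirty in at most the open, quiescing,
and syncing txgs at the same time, never in all TXG_SIZE (4) slots.]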
@@ -2208,32 +2208,6 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
mutex_exit(&dn->dn_dbufs_mtx);
}
-void
-dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag)
-{
- /*
- * Don't set dirtyctx to SYNC if we're just modifying this as we
- * initialize the objset.
- */
- if (dn->dn_dirtyctx == DN_UNDIRTIED) {
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
-
- if (ds != NULL) {
- rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag);
- }
- if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
- if (dmu_tx_is_syncing(tx))
- dn->dn_dirtyctx = DN_DIRTY_SYNC;
- else
- dn->dn_dirtyctx = DN_DIRTY_OPEN;
- dn->dn_dirtyctx_firstset = tag;
- }
- if (ds != NULL) {
- rrw_exit(&ds->ds_bp_rwlock, tag);
- }
- }
-}
-
static void
dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len,
dmu_tx_t *tx)
@@ -2291,7 +2265,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if ((off >> blkshift) > dn->dn_maxblkid)
return;
} else {
- ASSERT(dn->dn_maxblkid == 0);
+ ASSERT0(dn->dn_maxblkid);
if (off == 0 && len >= blksz) {
/*
* Freeing the whole block; fast-track this request.
@@ -2436,8 +2410,10 @@ done:
{
int txgoff = tx->tx_txg & TXG_MASK;
if (dn->dn_free_ranges[txgoff] == NULL) {
- dn->dn_free_ranges[txgoff] = zfs_range_tree_create(NULL,
- ZFS_RANGE_SEG64, NULL, 0, 0);
+ dn->dn_free_ranges[txgoff] =
+ zfs_range_tree_create_flags(
+ NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
+ ZFS_RT_F_DYN_NAME, rt_name(dn, "dn_free_ranges"));
}
zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
@@ -2509,7 +2485,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
}
space += delta;
if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
- ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
+ ASSERT0((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES));
ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
dn->dn_phys->dn_used = space >> DEV_BSHIFT;
} else {
@@ -2559,6 +2535,11 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
error = 0;
epb = dn->dn_phys->dn_nblkptr;
data = dn->dn_phys->dn_blkptr;
+ if (dn->dn_dbuf != NULL)
+ rw_enter(&dn->dn_dbuf->db_rwlock, RW_READER);
+ else if (dmu_objset_ds(dn->dn_objset) != NULL)
+ rrw_enter(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
+ RW_READER, FTAG);
} else {
uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
@@ -2578,7 +2559,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
}
error = dbuf_read(db, NULL,
DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
- DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
+ DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT);
if (error) {
dbuf_rele(db, FTAG);
return (error);
@@ -2663,12 +2644,44 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
if (db != NULL) {
rw_exit(&db->db_rwlock);
dbuf_rele(db, FTAG);
+ } else {
+ if (dn->dn_dbuf != NULL)
+ rw_exit(&dn->dn_dbuf->db_rwlock);
+ else if (dmu_objset_ds(dn->dn_objset) != NULL)
+ rrw_exit(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock,
+ FTAG);
}
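[Editor's note, not part of the diff: the added locking covers the case
where dnode_next_offset_level() scans the blkptrs embedded in the dnode
itself (db == NULL). Those blkptrs are rewritten during sync, so the scan
now holds the parent dbuf's db_rwlock as reader, or ds_bp_rwlock when the
dnode has no parent dbuf (e.g. the meta-dnode), matching the
rw_exit()/rrw_exit() pair in this exit path.]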
return (error);
}
/*
+ * Adjust *offset to the next (or previous) block byte offset at lvl.
+ * Returns FALSE if *offset would overflow or underflow.
+ */
+static boolean_t
+dnode_next_block(dnode_t *dn, int flags, uint64_t *offset, int lvl)
+{
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int span = lvl * epbs + dn->dn_datablkshift;
+ uint64_t blkid, maxblkid;
+
+ if (span >= 8 * sizeof (uint64_t))
+ return (B_FALSE);
+
+ blkid = *offset >> span;
+ maxblkid = 1ULL << (8 * sizeof (*offset) - span);
+ if (!(flags & DNODE_FIND_BACKWARDS) && blkid + 1 < maxblkid)
+ *offset = (blkid + 1) << span;
+ else if ((flags & DNODE_FIND_BACKWARDS) && blkid > 0)
+ *offset = (blkid << span) - 1;
+ else
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
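[Editor's note - a worked example of the span arithmetic, not part of the
diff. Assuming 128K data blocks and 128K indirect blocks (datablkshift = 17,
indblkshift = 17, so epbs = 17 - SPA_BLKPTRSHIFT = 10):

	/* lvl = 1: span = 1*10 + 17 = 27; one L1 block maps 2^27 = 128M */
	uint64_t off = 0;
	dnode_next_block(dn, 0, &off, 1);	/* off becomes 1 << 27 */
	dnode_next_block(dn, DNODE_FIND_BACKWARDS, &off, 1);
	/* off becomes (1 << 27) - 1: last byte of the previous L1 range */

The span >= 64 guard returns B_FALSE before the shift could overflow, and
the blkid bounds checks reject stepping past either end of the object's
address space.]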
+
+/*
* Find the next hole, data, or sparse region at or after *offset.
* The value 'blkfill' tells us how many items we expect to find
* in an L0 data block; this value is 1 for normal objects,
@@ -2695,7 +2708,7 @@ int
dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
int minlvl, uint64_t blkfill, uint64_t txg)
{
- uint64_t initial_offset = *offset;
+ uint64_t matched = *offset;
int lvl, maxlvl;
int error = 0;
@@ -2719,16 +2732,36 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
maxlvl = dn->dn_phys->dn_nlevels;
- for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+ for (lvl = minlvl; lvl <= maxlvl; ) {
error = dnode_next_offset_level(dn,
flags, offset, lvl, blkfill, txg);
- if (error != ESRCH)
+ if (error == 0 && lvl > minlvl) {
+ --lvl;
+ matched = *offset;
+ } else if (error == ESRCH && lvl < maxlvl &&
+ dnode_next_block(dn, flags, &matched, lvl)) {
+ /*
+ * Continue search at next/prev offset in lvl+1 block.
+ *
+ * Usually we only search upwards at the start of the
+ * search as higher level blocks point at a matching
+ * minlvl block in most cases, but we backtrack if not.
+ *
+ * This can happen for txg > 0 searches if the block
+ * contains only BPs/dnodes freed at that txg. It also
+ * happens if we are still syncing out the tree, and
+ * some BP's at higher levels are not updated yet.
+ *
+ * We must adjust offset to avoid coming back to the
+ * same offset and getting stuck looping forever. This
+ * also deals with the case where offset is already at
+ * the beginning or end of the object.
+ */
+ ++lvl;
+ *offset = matched;
+ } else {
break;
- }
-
- while (error == 0 && --lvl >= minlvl) {
- error = dnode_next_offset_level(dn,
- flags, offset, lvl, blkfill, txg);
+ }
}
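[Editor's note - an example trace of the reworked search, not part of the
diff. Forward search for data with minlvl = 1, maxlvl = 3:

	L1 scan at *offset -> ESRCH (only holes/freed BPs here);
	                      dnode_next_block() advances matched past this
	                      L1 block, retry one level up (lvl = 2)
	L2 scan            -> match: descend (lvl = 1), matched = *offset
	L1 scan            -> match at lvl == minlvl: loop breaks, error = 0

The loop terminates with error == 0 on a minlvl match, with ESRCH once the
top level misses or the offset cannot advance further, or with any other
error from dnode_next_offset_level().]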
/*
@@ -2740,9 +2773,6 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
error = 0;
}
- if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
- initial_offset < *offset : initial_offset > *offset))
- error = SET_ERROR(ESRCH);
out:
if (!(flags & DNODE_FIND_HAVELOCK))
rw_exit(&dn->dn_struct_rwlock);