diff options
Diffstat (limited to 'sys/contrib/openzfs/module/zfs/dbuf.c')
-rw-r--r-- | sys/contrib/openzfs/module/zfs/dbuf.c | 398 |
1 files changed, 223 insertions, 175 deletions
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index 0a243a24266f..fccc4c5b5b94 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -523,7 +523,7 @@ dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) return; /* Only data blocks support the attachment of user data. */ - ASSERT(db->db_level == 0); + ASSERT0(db->db_level); /* Clients must resolve a dbuf before attaching user data. */ ASSERT(db->db.db_data != NULL); @@ -866,12 +866,33 @@ dbuf_evict_notify(uint64_t size) * and grabbing the lock results in massive lock contention. */ if (size > dbuf_cache_target_bytes()) { - if (size > dbuf_cache_hiwater_bytes()) + /* + * Avoid calling dbuf_evict_one() from memory reclaim context + * (e.g. Linux kswapd, FreeBSD pagedaemon) to prevent deadlocks. + * Memory reclaim threads can get stuck waiting for the dbuf + * hash lock. + */ + if (size > dbuf_cache_hiwater_bytes() && + !current_is_reclaim_thread()) { dbuf_evict_one(); + } cv_signal(&dbuf_evict_cv); } } +/* + * Since dbuf cache size is a fraction of target ARC size, ARC calls this when + * its target size is reduced due to memory pressure. + */ +void +dbuf_cache_reduce_target_size(void) +{ + uint64_t size = zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size); + + if (size > dbuf_cache_target_bytes()) + cv_signal(&dbuf_evict_cv); +} + static int dbuf_kstat_update(kstat_t *ksp, int rw) { @@ -1107,8 +1128,8 @@ dbuf_verify(dmu_buf_impl_t *db) DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { - ASSERT(db->db_parent == NULL); - ASSERT(db->db_blkptr == NULL); + ASSERT0P(db->db_parent); + ASSERT0P(db->db_blkptr); } else { ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); @@ -1159,7 +1180,7 @@ dbuf_verify(dmu_buf_impl_t *db) /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) - ASSERT(db->db_parent == NULL); + ASSERT0P(db->db_parent); else ASSERT(db->db_parent != NULL); if (db->db_blkid != DMU_SPILL_BLKID) @@ -1172,16 +1193,9 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); - /* - * dnode_grow_indblksz() can make this fail if we don't - * have the parent's rwlock. XXX indblksz no longer - * grows. safe to do this now? - */ - if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) { - ASSERT3P(db->db_blkptr, ==, - ((blkptr_t *)db->db_parent->db.db_data + - db->db_blkid % epb)); - } + ASSERT3P(db->db_blkptr, ==, + ((blkptr_t *)db->db_parent->db.db_data + + db->db_blkid % epb)); } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && @@ -1205,7 +1219,7 @@ dbuf_verify(dmu_buf_impl_t *db) int i; for (i = 0; i < db->db.db_size >> 3; i++) { - ASSERT(buf[i] == 0); + ASSERT0(buf[i]); } } else { blkptr_t *bps = db->db.db_data; @@ -1229,11 +1243,9 @@ dbuf_verify(dmu_buf_impl_t *db) DVA_IS_EMPTY(&bp->blk_dva[1]) && DVA_IS_EMPTY(&bp->blk_dva[2])); ASSERT0(bp->blk_fill); - ASSERT0(bp->blk_pad[0]); - ASSERT0(bp->blk_pad[1]); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(BP_IS_HOLE(bp)); - ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); + ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp)); } } } @@ -1247,7 +1259,7 @@ dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); - ASSERT3P(db->db_buf, ==, NULL); + ASSERT0P(db->db_buf); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) { db->db_state = DB_UNCACHED; @@ -1275,33 +1287,6 @@ dbuf_alloc_arcbuf(dmu_buf_impl_t *db) } /* - * Loan out an arc_buf for read. Return the loaned arc_buf. - */ -arc_buf_t * -dbuf_loan_arcbuf(dmu_buf_impl_t *db) -{ - arc_buf_t *abuf; - - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - mutex_enter(&db->db_mtx); - if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) { - int blksz = db->db.db_size; - spa_t *spa = db->db_objset->os_spa; - - mutex_exit(&db->db_mtx); - abuf = arc_loan_buf(spa, B_FALSE, blksz); - memcpy(abuf->b_data, db->db.db_data, blksz); - } else { - abuf = db->db_buf; - arc_loan_inuse_buf(abuf, db); - db->db_buf = NULL; - dbuf_clear_data(db); - mutex_exit(&db->db_mtx); - } - return (abuf); -} - -/* * Calculate which level n block references the data at the level 0 offset * provided. */ @@ -1399,13 +1384,13 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, * All reads are synchronous, so we must have a hold on the dbuf */ ASSERT(zfs_refcount_count(&db->db_holds) > 0); - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); + ASSERT0P(db->db_buf); + ASSERT0P(db->db.db_data); if (buf == NULL) { /* i/o error */ ASSERT(zio == NULL || zio->io_error != 0); ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT3P(db->db_buf, ==, NULL); + ASSERT0P(db->db_buf); db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "i/o error"); } else if (db->db_level == 0 && db->db_freed_in_flight) { @@ -1437,6 +1422,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, static int dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn) { + void* db_data; int bonuslen, max_bonuslen; bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); @@ -1444,21 +1430,22 @@ dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn) ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(DB_DNODE_HELD(db)); ASSERT3U(bonuslen, <=, db->db.db_size); - db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); + db_data = kmem_alloc(max_bonuslen, KM_SLEEP); arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) - memset(db->db.db_data, 0, max_bonuslen); + memset(db_data, 0, max_bonuslen); if (bonuslen) - memcpy(db->db.db_data, DN_BONUS(dn->dn_phys), bonuslen); + memcpy(db_data, DN_BONUS(dn->dn_phys), bonuslen); + db->db.db_data = db_data; db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "bonus buffer filled"); return (0); } static void -dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp) +dbuf_handle_indirect_hole(void *data, dnode_t *dn, blkptr_t *dbbp) { - blkptr_t *bps = db->db.db_data; + blkptr_t *bps = data; uint32_t indbs = 1ULL << dn->dn_indblkshift; int n_bps = indbs >> SPA_BLKPTRSHIFT; @@ -1483,6 +1470,7 @@ static int dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp) { ASSERT(MUTEX_HELD(&db->db_mtx)); + arc_buf_t *db_data; int is_hole = bp == NULL || BP_IS_HOLE(bp); /* @@ -1495,13 +1483,14 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp) is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp); if (is_hole) { - dbuf_set_data(db, dbuf_alloc_arcbuf(db)); - memset(db->db.db_data, 0, db->db.db_size); + db_data = dbuf_alloc_arcbuf(db); + memset(db_data->b_data, 0, db->db.db_size); if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) != 0) { - dbuf_handle_indirect_hole(db, dn, bp); + dbuf_handle_indirect_hole(db_data->b_data, dn, bp); } + dbuf_set_data(db, db_data); db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "hole read satisfied"); return (0); @@ -1522,7 +1511,8 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp) * decrypt / authenticate them when we need to read an encrypted bonus buffer. */ static int -dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) +dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, + dmu_flags_t flags) { objset_t *os = db->db_objset; dmu_buf_impl_t *dndb; @@ -1530,7 +1520,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) zbookmark_phys_t zb; int err; - if ((flags & DB_RF_NO_DECRYPT) != 0 || + if ((flags & DMU_READ_NO_DECRYPT) != 0 || !os->os_encrypted || os->os_raw_receive || (dndb = dn->dn_dbuf) == NULL) return (0); @@ -1584,7 +1574,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) * returning. */ static int -dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, +dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, dmu_flags_t flags, db_lock_type_t dblt, blkptr_t *bp, const void *tag) { zbookmark_phys_t zb; @@ -1594,7 +1584,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - ASSERT(db->db_buf == NULL); + ASSERT0P(db->db_buf); ASSERT(db->db_parent == NULL || RW_LOCK_HELD(&db->db_parent->db_rwlock)); @@ -1631,7 +1621,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, */ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) { spa_log_error(db->db_objset->os_spa, &zb, - BP_GET_LOGICAL_BIRTH(bp)); + BP_GET_PHYSICAL_BIRTH(bp)); err = SET_ERROR(EIO); goto early_unlock; } @@ -1650,7 +1640,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, zio_flags = (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; - if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp)) + if ((flags & DMU_READ_NO_DECRYPT) && BP_IS_PROTECTED(bp)) zio_flags |= ZIO_FLAG_RAW; /* @@ -1692,7 +1682,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); - ASSERT(db->db_level == 0); + ASSERT0(db->db_level); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); if (dr == NULL || @@ -1751,7 +1741,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) } int -dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) +dbuf_read(dmu_buf_impl_t *db, zio_t *pio, dmu_flags_t flags) { dnode_t *dn; boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch; @@ -1771,12 +1761,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) goto done; prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0; + (flags & DMU_READ_NO_PREFETCH) == 0; mutex_enter(&db->db_mtx); - if (flags & DB_RF_PARTIAL_FIRST) + if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING))) + db->db_pending_evict = B_FALSE; + if (flags & DMU_PARTIAL_FIRST) db->db_partial_read = B_TRUE; - else if (!(flags & DB_RF_PARTIAL_MORE)) + else if (!(flags & (DMU_PARTIAL_MORE | DMU_KEEP_CACHING))) db->db_partial_read = B_FALSE; miss = (db->db_state != DB_CACHED); @@ -1817,7 +1809,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) * unauthenticated blocks, which will verify their MAC if * the key is now available. */ - if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL && + if ((flags & DMU_READ_NO_DECRYPT) == 0 && db->db_buf != NULL && (arc_is_encrypted(db->db_buf) || arc_is_unauthenticated(db->db_buf) || arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { @@ -1865,7 +1857,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) if (err == 0 && prefetch) { dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss, - flags & DB_RF_HAVESTRUCT); + flags & DB_RF_HAVESTRUCT, (flags & DMU_UNCACHEDIO) || + db->db_pending_evict); } DB_DNODE_EXIT(db); @@ -1897,16 +1890,19 @@ done: } static void -dbuf_noread(dmu_buf_impl_t *db) +dbuf_noread(dmu_buf_impl_t *db, dmu_flags_t flags) { ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); + if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING))) + db->db_pending_evict = B_FALSE; + db->db_partial_read = B_FALSE; while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); + ASSERT0P(db->db_buf); + ASSERT0P(db->db.db_data); dbuf_set_data(db, dbuf_alloc_arcbuf(db)); db->db_state = DB_FILL; DTRACE_SET_STATE(db, "assigning filled buffer"); @@ -1933,7 +1929,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * comes from dbuf_dirty() callers who must also hold a range lock. */ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); - ASSERT(db->db_level == 0); + ASSERT0(db->db_level); if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) @@ -1998,7 +1994,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, mutex_enter(&dn->dn_dbufs_mtx); db = avl_find(&dn->dn_dbufs, db_search, &where); - ASSERT3P(db, ==, NULL); + ASSERT0P(db); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); @@ -2021,7 +2017,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { - ASSERT(db->db.db_data == NULL); + ASSERT0P(db->db.db_data); mutex_exit(&db->db_mtx); continue; } @@ -2164,6 +2160,12 @@ dbuf_redirty(dbuf_dirty_record_t *dr) ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } + + /* + * Clear the rewrite flag since this is now a logical + * modification. + */ + dr->dt.dl.dr_rewrite = B_FALSE; } } @@ -2214,8 +2216,8 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) kmem_free(dr, sizeof (*dr)); return (NULL); } - int err = dbuf_read(parent_db, NULL, - (DB_RF_NOPREFETCH | DB_RF_CANFAIL)); + int err = dbuf_read(parent_db, NULL, DB_RF_CANFAIL | + DMU_READ_NO_PREFETCH); if (err != 0) { dbuf_rele(parent_db, FTAG); kmem_free(dr, sizeof (*dr)); @@ -2268,14 +2270,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dn->dn_objset->os_dsl_dataset != NULL) rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); #endif - /* - * We make this assert for private objects as well, but after we - * check if we're already dirty. They are allowed to re-dirty - * in syncing context. - */ - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* @@ -2287,12 +2281,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); - mutex_enter(&dn->dn_mtx); - dnode_set_dirtyctx(dn, tx, db); - if (tx->tx_txg > dn->dn_dirty_txg) - dn->dn_dirty_txg = tx->tx_txg; - mutex_exit(&dn->dn_mtx); - if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; @@ -2311,13 +2299,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (dr_next); } - /* - * Only valid if not already dirty. - */ - ASSERT(dn->dn_object == 0 || - dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); - ASSERT3U(dn->dn_nlevels, >, db->db_level); /* @@ -2520,6 +2501,7 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; + ASSERT(MUTEX_HELD(&db->db_mtx)); if (dr->dt.dl.dr_data != db->db.db_data) { struct dnode *dn = dr->dr_dnode; int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); @@ -2554,12 +2536,13 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * Due to our use of dn_nlevels below, this can only be called - * in open context, unless we are operating on the MOS. - * From syncing context, dn_nlevels may be different from the - * dn_nlevels used when dbuf was dirtied. + * in open context, unless we are operating on the MOS or it's + * a special object. From syncing context, dn_nlevels may be + * different from the dn_nlevels used when dbuf was dirtied. */ ASSERT(db->db_objset == dmu_objset_pool(db->db_objset)->dp_meta_objset || + DMU_OBJECT_IS_SPECIAL(db->db.db_object) || txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); @@ -2642,8 +2625,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (B_FALSE); } -static void -dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) +void +dmu_buf_will_dirty_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, dmu_flags_t flags) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; boolean_t undirty = B_FALSE; @@ -2695,7 +2678,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) * not the uderlying block that is being replaced. dbuf_undirty() will * do brt_pending_remove() before removing the dirty record. */ - (void) dbuf_read(db, NULL, flags); + (void) dbuf_read(db, NULL, flags | DB_RF_MUST_SUCCEED); if (undirty) { mutex_enter(&db->db_mtx); VERIFY(!dbuf_undirty(db, tx)); @@ -2707,8 +2690,39 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) void dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { - dmu_buf_will_dirty_impl(db_fake, - DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx); + dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH); +} + +void +dmu_buf_will_rewrite(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + ASSERT(tx->tx_txg != 0); + ASSERT(!zfs_refcount_is_zero(&db->db_holds)); + + /* + * If the dbuf is already dirty in this txg, it will be written + * anyway, so there's nothing to do. + */ + mutex_enter(&db->db_mtx); + if (dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) { + mutex_exit(&db->db_mtx); + return; + } + mutex_exit(&db->db_mtx); + + /* + * The dbuf is not dirty, so we need to make it dirty and + * mark it for rewrite (preserve logical birth time). + */ + dmu_buf_will_dirty_flags(db_fake, tx, DMU_READ_NO_PREFETCH); + + mutex_enter(&db->db_mtx); + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); + if (dr != NULL && db->db_level == 0) + dr->dt.dl.dr_rewrite = B_TRUE; + mutex_exit(&db->db_mtx); } boolean_t @@ -2862,8 +2876,8 @@ dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx) dbuf_clear_data(db); } - ASSERT3P(db->db_buf, ==, NULL); - ASSERT3P(db->db.db_data, ==, NULL); + ASSERT0P(db->db_buf); + ASSERT0P(db->db.db_data); db->db_state = DB_NOFILL; DTRACE_SET_STATE(db, @@ -2872,7 +2886,7 @@ dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx) DBUF_VERIFY(db); mutex_exit(&db->db_mtx); - dbuf_noread(db); + dbuf_noread(db, DMU_KEEP_CACHING); (void) dbuf_dirty(db, tx); } @@ -2886,18 +2900,19 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) DTRACE_SET_STATE(db, "allocating NOFILL buffer"); mutex_exit(&db->db_mtx); - dbuf_noread(db); + dbuf_noread(db, DMU_KEEP_CACHING); (void) dbuf_dirty(db, tx); } void -dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail) +dmu_buf_will_fill_flags(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail, + dmu_flags_t flags) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); - ASSERT(db->db_level == 0); + ASSERT0(db->db_level); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || @@ -2913,7 +2928,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail) */ if (canfail && dr) { mutex_exit(&db->db_mtx); - dmu_buf_will_dirty(db_fake, tx); + dmu_buf_will_dirty_flags(db_fake, tx, flags); return; } /* @@ -2929,10 +2944,16 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail) } mutex_exit(&db->db_mtx); - dbuf_noread(db); + dbuf_noread(db, flags); (void) dbuf_dirty(db, tx); } +void +dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail) +{ + dmu_buf_will_fill_flags(db_fake, tx, canfail, DMU_READ_NO_PREFETCH); +} + /* * This function is effectively the same as dmu_buf_will_dirty(), but * indicates the caller expects raw encrypted data in the db, and provides @@ -2955,8 +2976,8 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, ASSERT0(db->db_level); ASSERT(db->db_objset->os_raw_receive); - dmu_buf_will_dirty_impl(db_fake, - DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx); + dmu_buf_will_dirty_flags(db_fake, tx, + DMU_READ_NO_PREFETCH | DMU_READ_NO_DECRYPT); dr = dbuf_find_dirty_eq(db, tx->tx_txg); @@ -3098,11 +3119,12 @@ dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx) * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. */ void -dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) +dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx, + dmu_flags_t flags) { ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(db->db_level == 0); + ASSERT0(db->db_level); ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); ASSERT(buf != NULL); ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size); @@ -3112,6 +3134,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) ASSERT(arc_released(buf)); mutex_enter(&db->db_mtx); + if (!(flags & (DMU_UNCACHEDIO | DMU_KEEP_CACHING))) + db->db_pending_evict = B_FALSE; + db->db_partial_read = B_FALSE; while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); @@ -3164,7 +3189,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) VERIFY(!dbuf_undirty(db, tx)); db->db_state = DB_UNCACHED; } - ASSERT(db->db_buf == NULL); + ASSERT0P(db->db_buf); dbuf_set_data(db, buf); db->db_state = DB_FILL; DTRACE_SET_STATE(db, "filling assigned arcbuf"); @@ -3224,7 +3249,7 @@ dbuf_destroy(dmu_buf_impl_t *db) } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - ASSERT(db->db_data_pending == NULL); + ASSERT0P(db->db_data_pending); ASSERT(list_is_empty(&db->db_dirty_records)); db->db_state = DB_EVICTING; @@ -3276,11 +3301,11 @@ dbuf_destroy(dmu_buf_impl_t *db) db->db_parent = NULL; - ASSERT(db->db_buf == NULL); - ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_data_pending == NULL); + ASSERT0P(db->db_buf); + ASSERT0P(db->db.db_data); + ASSERT0P(db->db_hash_next); + ASSERT0P(db->db_blkptr); + ASSERT0P(db->db_data_pending); ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); @@ -3366,19 +3391,15 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, if (err) return (err); - err = dbuf_read(*parentp, NULL, - (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); + err = dbuf_read(*parentp, NULL, DB_RF_CANFAIL | + DB_RF_HAVESTRUCT | DMU_READ_NO_PREFETCH); if (err) { dbuf_rele(*parentp, NULL); *parentp = NULL; return (err); } - rw_enter(&(*parentp)->db_rwlock, RW_READER); *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); - if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) - ASSERT(BP_IS_HOLE(*bpp)); - rw_exit(&(*parentp)->db_rwlock); return (0); } else { /* the block is referenced from the dnode */ @@ -3426,7 +3447,8 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_user = NULL; db->db_user_immediate_evict = FALSE; db->db_freed_in_flight = FALSE; - db->db_pending_evict = FALSE; + db->db_pending_evict = TRUE; + db->db_partial_read = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); @@ -3560,12 +3582,9 @@ dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb, static void dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) { - ASSERT(!BP_IS_REDACTED(bp) || - dsl_dataset_feature_is_active( - dpa->dpa_dnode->dn_objset->os_dsl_dataset, - SPA_FEATURE_REDACTED_DATASETS)); - - if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(!BP_IS_REDACTED(bp)); + if (BP_IS_EMBEDDED(bp)) return (dbuf_prefetch_fini(dpa, B_FALSE)); int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; @@ -3640,8 +3659,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dbuf_prefetch_fini(dpa, B_TRUE); return; } - (void) dbuf_read(db, NULL, - DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); + (void) dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT | + DMU_READ_NO_PREFETCH); dbuf_rele(db, FTAG); } @@ -3651,10 +3670,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); - ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode && + ASSERT(!BP_IS_REDACTED(bp) || dpa->dpa_dnode == NULL || dsl_dataset_feature_is_active( dpa->dpa_dnode->dn_objset->os_dsl_dataset, - SPA_FEATURE_REDACTED_DATASETS))); + SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { arc_buf_destroy(abuf, private); dbuf_prefetch_fini(dpa, B_TRUE); @@ -3667,8 +3686,14 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) - iter_aflags |= ARC_FLAG_L2CACHE; + if (dpa->dpa_dnode) { + if (dnode_level_is_l2cacheable(bp, dpa->dpa_dnode, + dpa->dpa_curlevel)) + iter_aflags |= ARC_FLAG_L2CACHE; + } else { + if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) + iter_aflags |= ARC_FLAG_L2CACHE; + } ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); @@ -3807,7 +3832,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (dnode_level_is_l2cacheable(&bp, dn, level)) + if (dnode_level_is_l2cacheable(&bp, dn, curlevel)) iter_aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, @@ -3851,6 +3876,7 @@ dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db) { dbuf_dirty_record_t *dr = db->db_data_pending; arc_buf_t *data = dr->dt.dl.dr_data; + arc_buf_t *db_data; enum zio_compress compress_type = arc_get_compression(data); uint8_t complevel = arc_get_complevel(data); @@ -3861,22 +3887,21 @@ dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db) uint8_t mac[ZIO_DATA_MAC_LEN]; arc_get_raw_params(data, &byteorder, salt, iv, mac); - dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db, + db_data = arc_alloc_raw_buf(dn->dn_objset->os_spa, db, dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac, dn->dn_type, arc_buf_size(data), arc_buf_lsize(data), - compress_type, complevel)); + compress_type, complevel); } else if (compress_type != ZIO_COMPRESS_OFF) { - dbuf_set_data(db, arc_alloc_compressed_buf( + db_data = arc_alloc_compressed_buf( dn->dn_objset->os_spa, db, arc_buf_size(data), - arc_buf_lsize(data), compress_type, complevel)); + arc_buf_lsize(data), compress_type, complevel); } else { - dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db, - DBUF_GET_BUFC_TYPE(db), db->db.db_size)); + db_data = arc_alloc_buf(dn->dn_objset->os_spa, db, + DBUF_GET_BUFC_TYPE(db), db->db.db_size); } + memcpy(db_data->b_data, data->b_data, arc_buf_size(data)); - rw_enter(&db->db_rwlock, RW_WRITER); - memcpy(db->db.db_data, data->b_data, arc_buf_size(data)); - rw_exit(&db->db_rwlock); + dbuf_set_data(db, db_data); } /* @@ -3900,7 +3925,8 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - ASSERT3U(dn->dn_nlevels, >, level); + if (!fail_sparse) + ASSERT3U(dn->dn_nlevels, >, level); *dbp = NULL; @@ -3914,7 +3940,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, if (fail_uncached) return (SET_ERROR(ENOENT)); - ASSERT3P(parent, ==, NULL); + ASSERT0P(parent); err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); if (fail_sparse) { if (err == 0 && bp && BP_IS_HOLE(bp)) @@ -3937,6 +3963,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, if (db->db_buf != NULL) { arc_buf_access(db->db_buf); + ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); } @@ -4017,9 +4044,10 @@ dbuf_create_bonus(dnode_t *dn) { ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - ASSERT(dn->dn_bonus == NULL); + ASSERT0P(dn->dn_bonus); dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL, dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID)); + dn->dn_bonus->db_pending_evict = FALSE; } int @@ -4185,8 +4213,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) * This dbuf has anonymous data associated with it. */ dbuf_destroy(db); - } else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) || - db->db_pending_evict) { + } else if (!db->db_partial_read && !DBUF_IS_CACHEABLE(db)) { + /* + * We don't expect more accesses to the dbuf, and it + * is either not cacheable or was marked for eviction. + */ dbuf_destroy(db); } else if (!multilist_link_active(&db->db_cache_link)) { ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); @@ -4365,7 +4396,7 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) * inappropriate to hook it in (i.e., nlevels mismatch). */ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); - ASSERT(db->db_parent == NULL); + ASSERT0P(db->db_parent); db->db_parent = dn->dn_dbuf; db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; DBUF_VERIFY(db); @@ -4426,7 +4457,7 @@ dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr) ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); - ASSERT3U(db->db_level, ==, 0); + ASSERT0(db->db_level); if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) { zbookmark_phys_t zb; @@ -4537,7 +4568,7 @@ dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr) /* ensure that everything is zero after our data */ for (; datap_end < datap_max; datap_end++) - ASSERT(*datap_end == 0); + ASSERT0(*datap_end); #endif } @@ -4545,7 +4576,7 @@ static blkptr_t * dbuf_lightweight_bp(dbuf_dirty_record_t *dr) { /* This must be a lightweight dirty record. */ - ASSERT3P(dr->dr_dbuf, ==, NULL); + ASSERT0P(dr->dr_dbuf); dnode_t *dn = dr->dr_dnode; if (dn->dn_phys->dn_nlevels == 1) { @@ -4688,7 +4719,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ - ASSERT3P(db->db.db_data, ==, NULL); + ASSERT0P(db->db.db_data); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); @@ -4705,9 +4736,9 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ dbuf_dirty_record_t *dr_head = list_head(&db->db_dirty_records); - ASSERT3P(db->db_buf, ==, NULL); - ASSERT3P(db->db.db_data, ==, NULL); - ASSERT3P(dr_head->dt.dl.dr_data, ==, NULL); + ASSERT0P(db->db_buf); + ASSERT0P(db->db.db_data); + ASSERT0P(dr_head->dt.dl.dr_data); ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); @@ -4892,7 +4923,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; - if (BP_GET_LOGICAL_BIRTH(bp) != 0) { + if (BP_GET_BIRTH(bp) != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && @@ -5179,7 +5210,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); drica.drica_os = dn->dn_objset; - drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp); + drica.drica_blk_birth = BP_GET_BIRTH(bp); drica.drica_tx = tx; if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { @@ -5194,8 +5225,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) if (dn->dn_objset != spa_meta_objset(spa)) { dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - BP_GET_LOGICAL_BIRTH(bp) > - ds->ds_dir->dd_origin_txg) { + BP_GET_BIRTH(bp) > ds->ds_dir->dd_origin_txg) { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -5313,7 +5343,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } ASSERT(db->db_level == 0 || data == db->db_buf); - ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg); + ASSERT3U(BP_GET_BIRTH(db->db_blkptr), <=, txg); ASSERT(pio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? @@ -5327,6 +5357,24 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); /* + * Set rewrite properties for zfs_rewrite() operations. + */ + if (db->db_level == 0 && dr->dt.dl.dr_rewrite) { + zp.zp_rewrite = B_TRUE; + + /* + * Mark physical rewrite feature for activation. + * This will be activated automatically during dataset sync. + */ + dsl_dataset_t *ds = os->os_dsl_dataset; + if (!dsl_dataset_feature_is_active(ds, + SPA_FEATURE_PHYSICAL_REWRITE)) { + ds->ds_feature_activation[ + SPA_FEATURE_PHYSICAL_REWRITE] = (void *)B_TRUE; + } + } + + /* * We copy the blkptr now (rather than when we instantiate the dirty * record), because its value can change between open context and * syncing context. We do not need to hold dn_struct_rwlock to read @@ -5387,7 +5435,6 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) EXPORT_SYMBOL(dbuf_find); EXPORT_SYMBOL(dbuf_is_metadata); EXPORT_SYMBOL(dbuf_destroy); -EXPORT_SYMBOL(dbuf_loan_arcbuf); EXPORT_SYMBOL(dbuf_whichblock); EXPORT_SYMBOL(dbuf_read); EXPORT_SYMBOL(dbuf_unoverride); @@ -5397,6 +5444,7 @@ EXPORT_SYMBOL(dbuf_release_bp); EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); +EXPORT_SYMBOL(dmu_buf_will_rewrite); EXPORT_SYMBOL(dmu_buf_is_dirty); EXPORT_SYMBOL(dmu_buf_will_clone_or_dio); EXPORT_SYMBOL(dmu_buf_will_not_fill); |