author    Andriy Gapon <avg@FreeBSD.org>  2019-08-12 12:05:40 +0000
committer Andriy Gapon <avg@FreeBSD.org>  2019-08-12 12:05:40 +0000
commit    55f75bf072909962e95f5c900c338b5ad7ce98b0 (patch)
tree      0b884c2e46daf6bc015f1ba67cf3ba3cb32cc8a9 /uts
parent    feaa27590c679fea301c2c0f391cfa4b6f62e8fa (diff)
download  src-55f75bf072909962e95f5c900c338b5ad7ce98b0.tar.gz
          src-55f75bf072909962e95f5c900c338b5ad7ce98b0.zip
8423 8199 7432 Implement large_dnode pool feature

    8423 Implement large_dnode pool feature
    8199 multi-threaded dmu_object_alloc()
    7432 Large dnode pool feature

    illumos/illumos-gate@54811da5ac6b517992fdc173df5d605e4e61fdc0
    https://github.com/illumos/illumos-gate/commit/54811da5ac6b517992fdc173df5d605e4e61fdc0
    https://www.illumos.org/issues/8423
    https://www.illumos.org/issues/8199
    https://www.illumos.org/issues/7432

    ZoL issues:
      Improved dnode allocation #6564
      Clean up large dnode code #6262
      Fix dnode_hold() freeing dnode behavior #8172
      Fix dnode allocation race #6414, #6439
      Partial: Raw sends must be able to decrease nlevels #6821, #6864
      Remove unnecessary txg syncs from receive_object() Closes #7197

    Author: Toomas Soome <tsoome@me.com>
Notes:
    svn path=/vendor-sys/illumos/dist/; revision=350898
Diffstat (limited to 'uts')
-rw-r--r--  uts/common/fs/zfs/dbuf.c            |  69
-rw-r--r--  uts/common/fs/zfs/dmu.c             |  19
-rw-r--r--  uts/common/fs/zfs/dmu_object.c      | 297
-rw-r--r--  uts/common/fs/zfs/dmu_objset.c      |  52
-rw-r--r--  uts/common/fs/zfs/dmu_send.c        | 107
-rw-r--r--  uts/common/fs/zfs/dmu_traverse.c    |   8
-rw-r--r--  uts/common/fs/zfs/dmu_tx.c          |  14
-rw-r--r--  uts/common/fs/zfs/dnode.c           | 565
-rw-r--r--  uts/common/fs/zfs/dnode_sync.c      |  22
-rw-r--r--  uts/common/fs/zfs/dsl_scan.c        |  10
-rw-r--r--  uts/common/fs/zfs/sa.c              |  20
-rw-r--r--  uts/common/fs/zfs/spa.c             |  11
-rw-r--r--  uts/common/fs/zfs/spa_misc.c        |  15
-rw-r--r--  uts/common/fs/zfs/sys/arc.h         |   1
-rw-r--r--  uts/common/fs/zfs/sys/dmu.h         |  15
-rw-r--r--  uts/common/fs/zfs/sys/dmu_objset.h  |   7
-rw-r--r--  uts/common/fs/zfs/sys/dnode.h       | 246
-rw-r--r--  uts/common/fs/zfs/sys/dsl_dataset.h |   7
-rw-r--r--  uts/common/fs/zfs/sys/sa_impl.h     |   6
-rw-r--r--  uts/common/fs/zfs/sys/spa.h         |   1
-rw-r--r--  uts/common/fs/zfs/sys/zap.h         |  19
-rw-r--r--  uts/common/fs/zfs/sys/zfs_ioctl.h   |   9
-rw-r--r--  uts/common/fs/zfs/sys/zfs_znode.h   |   1
-rw-r--r--  uts/common/fs/zfs/sys/zil.h         |  17
-rw-r--r--  uts/common/fs/zfs/zap.c             |  13
-rw-r--r--  uts/common/fs/zfs/zap_micro.c       |  58
-rw-r--r--  uts/common/fs/zfs/zfs_acl.c         |  18
-rw-r--r--  uts/common/fs/zfs/zfs_ioctl.c       |  18
-rw-r--r--  uts/common/fs/zfs/zfs_log.c         |   2
-rw-r--r--  uts/common/fs/zfs/zfs_replay.c      |  30
-rw-r--r--  uts/common/fs/zfs/zfs_sa.c          |   3
-rw-r--r--  uts/common/fs/zfs/zfs_znode.c       |  40
-rw-r--r--  uts/common/fs/zfs/zil.c             |  16
-rw-r--r--  uts/common/sys/fs/zfs.h             |  12
34 files changed, 1426 insertions, 322 deletions
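
Before reading the hunks, it helps to have the feature's core arithmetic in hand: a dnode now spans dn_num_slots consecutive 512-byte slots, and everything past the 64-byte dnode_phys_t header and the first 128-byte block pointer is usable as bonus space. A minimal standalone sketch of that mapping (the helper mirrors the DN_SLOTS_TO_BONUSLEN() macro from sys/dnode.h; the table it prints is illustrative only):

#include <stdio.h>

#define DNODE_SHIFT		9	/* 512-byte minimum dnode */
#define DNODE_CORE_SIZE		64	/* fixed dnode_phys_t header */
#define SPA_BLKPTRSHIFT		7	/* 128-byte block pointer */

/*
 * Mirrors DN_SLOTS_TO_BONUSLEN(): bonus bytes available in a dnode
 * spanning 'slots' 512-byte slots. One slot yields the legacy 320
 * bytes (DN_OLD_MAX_BONUSLEN).
 */
static int
slots_to_bonuslen(int slots)
{
	return ((slots << DNODE_SHIFT) - DNODE_CORE_SIZE -
	    (1 << SPA_BLKPTRSHIFT));
}

int
main(void)
{
	for (int slots = 1; slots <= 32; slots *= 2)
		printf("%2d slots (%5d bytes) -> %5d bytes of bonus\n",
		    slots, slots << DNODE_SHIFT, slots_to_bonuslen(slots));
	return (0);
}

This is also why dmu_bonus_max() in the dmu.c hunks keeps returning DN_OLD_MAX_BONUSLEN (320): legacy consumers see the one-slot limit, while slot-aware code sizes the bonus buffer from the actual slot count.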
diff --git a/uts/common/fs/zfs/dbuf.c b/uts/common/fs/zfs/dbuf.c
index 4fcf14fba512..4bb53837716c 100644
--- a/uts/common/fs/zfs/dbuf.c
+++ b/uts/common/fs/zfs/dbuf.c
@@ -742,7 +742,6 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
} else if (db->db_blkid == DMU_SPILL_BLKID) {
ASSERT(dn != NULL);
- ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
ASSERT0(db->db.db_offset);
} else {
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
@@ -995,13 +994,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
ASSERT(db->db_buf == NULL);
if (db->db_blkid == DMU_BONUS_BLKID) {
+ /*
+ * The bonus length stored in the dnode may be less than
+ * the maximum available space in the bonus buffer.
+ */
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
ASSERT3U(bonuslen, <=, db->db.db_size);
- db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
- if (bonuslen < DN_MAX_BONUSLEN)
- bzero(db->db.db_data, DN_MAX_BONUSLEN);
+ db->db.db_data = zio_buf_alloc(max_bonuslen);
+ arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
+ if (bonuslen < max_bonuslen)
+ bzero(db->db.db_data, max_bonuslen);
if (bonuslen)
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
DB_DNODE_EXIT(db);
@@ -1108,9 +1112,11 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
ASSERT(dr->dr_txg >= txg - 2);
if (db->db_blkid == DMU_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */
- dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
- bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+ dnode_t *dn = DB_DNODE(db);
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ dr->dt.dl.dr_data = zio_buf_alloc(bonuslen);
+ arc_space_consume(bonuslen, ARC_SPACE_BONUS);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = arc_buf_size(db->db_buf);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
@@ -2081,10 +2087,13 @@ dbuf_destroy(dmu_buf_impl_t *db)
}
if (db->db_blkid == DMU_BONUS_BLKID) {
- ASSERT(db->db.db_data != NULL);
- zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
- arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
- db->db_state = DB_UNCACHED;
+ int slots = DB_DNODE(db)->dn_num_slots;
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
+ if (db->db.db_data != NULL) {
+ zio_buf_free(db->db.db_data, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_BONUS);
+ db->db_state = DB_UNCACHED;
+ }
}
dbuf_clear_data(db);
@@ -2188,7 +2197,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
mutex_enter(&dn->dn_mtx);
if (dn->dn_have_spill &&
(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
- *bpp = &dn->dn_phys->dn_spill;
+ *bpp = DN_SPILL_BLKPTR(dn->dn_phys);
else
*bpp = NULL;
dbuf_add_ref(dn->dn_dbuf, NULL);
@@ -2289,7 +2298,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
- db->db.db_size = DN_MAX_BONUSLEN -
+ db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DMU_BONUS_BLKID;
@@ -3031,7 +3040,7 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
return;
if (db->db_blkid == DMU_SPILL_BLKID) {
- db->db_blkptr = &dn->dn_phys->dn_spill;
+ db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
BP_ZERO(db->db_blkptr);
return;
}
@@ -3162,13 +3171,17 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(*datap != NULL);
ASSERT0(db->db_level);
- ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
- bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
+ DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
+ bcopy(*datap, DN_BONUS(dn->dn_phys),
+ DN_MAX_BONUS_LEN(dn->dn_phys));
DB_DNODE_EXIT(db);
if (*datap != db->db.db_data) {
- zio_buf_free(*datap, DN_MAX_BONUSLEN);
- arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+ int slots = DB_DNODE(db)->dn_num_slots;
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
+ zio_buf_free(*datap, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_BONUS);
}
db->db_data_pending = NULL;
drp = &db->db_last_dirty;
@@ -3324,7 +3337,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
if (db->db_blkid == DMU_SPILL_BLKID) {
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(bp)) &&
- db->db_blkptr == &dn->dn_phys->dn_spill);
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
}
#endif
@@ -3336,11 +3349,17 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
mutex_exit(&dn->dn_mtx);
if (dn->dn_type == DMU_OT_DNODE) {
- dnode_phys_t *dnp = db->db.db_data;
- for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
- i--, dnp++) {
- if (dnp->dn_type != DMU_OT_NONE)
+ i = 0;
+ while (i < db->db.db_size) {
+ dnode_phys_t *dnp =
+ (void *)(((char *)db->db.db_data) + i);
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE) {
fill++;
+ i += dnp->dn_extra_slots *
+ DNODE_MIN_SIZE;
+ }
}
} else {
if (BP_IS_HOLE(bp)) {
@@ -3493,7 +3512,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dn = DB_DNODE(db);
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
- db->db_blkptr == &dn->dn_phys->dn_spill);
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
DB_DNODE_EXIT(db);
}
#endif
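
The reworked fill count in dbuf_write_ready() above is the walk pattern this patch applies wherever a raw block of dnodes is scanned: advance in DNODE_MIN_SIZE steps, and when a slot holds an allocated dnode, additionally skip its dn_extra_slots interior slots, which carry no headers of their own. A self-contained sketch of the same walk (the struct is trimmed to the two fields the loop reads, and DMU_OT_NONE is taken to be 0 as in the DMU type enum):

#include <stddef.h>
#include <stdint.h>

#define DNODE_MIN_SIZE	512

typedef struct dnode_phys {
	uint8_t	dn_type;		/* 0 (DMU_OT_NONE) means free */
	uint8_t	dn_extra_slots;		/* extra 512-byte slots used */
	uint8_t	dn_pad[510];		/* rest of the 512-byte slot */
} dnode_phys_t;

/* Count allocated dnodes in a raw dnode block of 'size' bytes. */
static uint64_t
count_filled(const void *buf, size_t size)
{
	uint64_t fill = 0;
	size_t i = 0;

	while (i < size) {
		const dnode_phys_t *dnp =
		    (const void *)((const char *)buf + i);

		i += DNODE_MIN_SIZE;
		if (dnp->dn_type != 0) {	/* DMU_OT_NONE */
			fill++;
			/* Interior slots have no headers; skip them. */
			i += (size_t)dnp->dn_extra_slots * DNODE_MIN_SIZE;
		}
	}
	return (fill);
}

dnode_buf_byteswap() and traverse_visitbp() further down apply the same stride, by byte offset and by array index respectively.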
diff --git a/uts/common/fs/zfs/dmu.c b/uts/common/fs/zfs/dmu.c
index 02027ef6bbd1..d338286a9b50 100644
--- a/uts/common/fs/zfs/dmu.c
+++ b/uts/common/fs/zfs/dmu.c
@@ -254,7 +254,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
int
dmu_bonus_max(void)
{
- return (DN_MAX_BONUSLEN);
+ return (DN_OLD_MAX_BONUSLEN);
}
int
@@ -2264,6 +2264,7 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
doi->doi_type = dn->dn_type;
doi->doi_bonus_type = dn->dn_bonustype;
doi->doi_bonus_size = dn->dn_bonuslen;
+ doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
doi->doi_indirection = dn->dn_nlevels;
doi->doi_checksum = dn->dn_checksum;
doi->doi_compress = dn->dn_compress;
@@ -2326,9 +2327,21 @@ dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
dn = DB_DNODE(db);
*blksize = dn->dn_datablksz;
- /* add 1 for dnode space */
+ /* add in number of slots used for the dnode itself */
*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
- SPA_MINBLOCKSHIFT) + 1;
+ SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
+ DB_DNODE_EXIT(db);
+}
+
+void
+dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ *dnsize = dn->dn_num_slots << DNODE_SHIFT;
DB_DNODE_EXIT(db);
}
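
The dmu_object_size_from_db() hunk above replaces the old fixed "+ 1" with the number of slots the dnode actually occupies, so multi-slot dnodes are charged for all of their metadnode space. A worked sketch of the new accounting (values in 512-byte units; constants from the on-disk format):

#include <stdio.h>
#include <stdint.h>

#define SPA_MINBLOCKSHIFT	9
#define SPA_MINBLOCKSIZE	(1 << SPA_MINBLOCKSHIFT)

static uint64_t
nblk512(uint64_t used_bytes, int dn_num_slots)
{
	/* round data bytes to 512-byte units, then add the dnode itself */
	return (((used_bytes + SPA_MINBLOCKSIZE / 2) >> SPA_MINBLOCKSHIFT) +
	    dn_num_slots);
}

int
main(void)
{
	printf("%llu\n", (unsigned long long)nblk512(4096, 1));	/* 9 */
	printf("%llu\n", (unsigned long long)nblk512(4096, 4));	/* 12 */
	return (0);
}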
diff --git a/uts/common/fs/zfs/dmu_object.c b/uts/common/fs/zfs/dmu_object.c
index b853081e8b7c..2fe866b89d29 100644
--- a/uts/common/fs/zfs/dmu_object.c
+++ b/uts/common/fs/zfs/dmu_object.c
@@ -30,53 +30,132 @@
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
+#include <sys/dsl_dataset.h>
-uint64_t
-dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
- int indirect_blockshift,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+/*
+ * Each of the concurrent object allocators will grab
+ * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to
+ * grab 128 slots, which is 4 blocks worth. This was experimentally
+ * determined to be the lowest value that eliminates the measurable effect
+ * of lock contention from this code path.
+ */
+int dmu_object_alloc_chunk_shift = 7;
+
+static uint64_t
+dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
{
uint64_t object;
uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
(DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
dnode_t *dn = NULL;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ boolean_t restarted = B_FALSE;
+ uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
+ os->os_obj_next_percpu_len];
+ int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+ int error;
+
+ if (dn_slots == 0) {
+ dn_slots = DNODE_MIN_SLOTS;
+ } else {
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+ }
+
+ /*
+ * The "chunk" of dnodes that is assigned to a CPU-specific
+ * allocator needs to be at least one block's worth, to avoid
+ * lock contention on the dbuf. It can be at most one L1 block's
+ * worth, so that the "rescan after polishing off a L1's worth"
+ * logic below will be sure to kick in.
+ */
+ if (dnodes_per_chunk < DNODES_PER_BLOCK)
+ dnodes_per_chunk = DNODES_PER_BLOCK;
+ if (dnodes_per_chunk > L1_dnode_count)
+ dnodes_per_chunk = L1_dnode_count;
+
+ object = *cpuobj;
- mutex_enter(&os->os_obj_lock);
for (;;) {
- object = os->os_obj_next;
/*
- * Each time we polish off a L1 bp worth of dnodes (2^12
- * objects), move to another L1 bp that's still reasonably
- * sparse (at most 1/4 full). Look from the beginning at most
- * once per txg, but after that keep looking from here.
- * os_scan_dnodes is set during txg sync if enough objects
- * have been freed since the previous rescan to justify
- * backfilling again. If we can't find a suitable block, just
- * keep going from here.
- *
- * Note that dmu_traverse depends on the behavior that we use
- * multiple blocks of the dnode object before going back to
- * reuse objects. Any change to this algorithm should preserve
- * that property or find another solution to the issues
- * described in traverse_visitbp.
+ * If we finished a chunk of dnodes, get a new one from
+ * the global allocator.
*/
-
- if (P2PHASE(object, L1_dnode_count) == 0) {
- uint64_t offset;
- int error;
- if (os->os_rescan_dnodes) {
- offset = 0;
- os->os_rescan_dnodes = B_FALSE;
- } else {
- offset = object << DNODE_SHIFT;
+ if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
+ (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
+ dn_slots)) {
+ DNODE_STAT_BUMP(dnode_alloc_next_chunk);
+ mutex_enter(&os->os_obj_lock);
+ ASSERT0(P2PHASE(os->os_obj_next_chunk,
+ dnodes_per_chunk));
+ object = os->os_obj_next_chunk;
+
+ /*
+ * Each time we polish off a L1 bp worth of dnodes
+ * (2^12 objects), move to another L1 bp that's
+ * still reasonably sparse (at most 1/4 full). Look
+ * from the beginning at most once per txg. If we
+ * still can't allocate from that L1 block, search
+ * for an empty L0 block, which will quickly skip
+ * to the end of the metadnode if no nearby L0
+ * blocks are empty. This fallback avoids a
+ * pathology where full dnode blocks containing
+ * large dnodes appear sparse because they have a
+ * low blk_fill, leading to many failed allocation
+ * attempts. In the long term a better mechanism to
+ * search for sparse metadnode regions, such as
+ * spacemaps, could be implemented.
+ *
+ * os_scan_dnodes is set during txg sync if enough
+ * objects have been freed since the previous
+ * rescan to justify backfilling again.
+ *
+ * Note that dmu_traverse depends on the behavior
+ * that we use multiple blocks of the dnode object
+ * before going back to reuse objects. Any change
+ * to this algorithm should preserve that property
+ * or find another solution to the issues described
+ * in traverse_visitbp.
+ */
+ if (P2PHASE(object, L1_dnode_count) == 0) {
+ uint64_t offset;
+ uint64_t blkfill;
+ int minlvl;
+ if (os->os_rescan_dnodes) {
+ offset = 0;
+ os->os_rescan_dnodes = B_FALSE;
+ } else {
+ offset = object << DNODE_SHIFT;
+ }
+ blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
+ minlvl = restarted ? 1 : 2;
+ restarted = B_TRUE;
+ error = dnode_next_offset(DMU_META_DNODE(os),
+ DNODE_FIND_HOLE, &offset, minlvl,
+ blkfill, 0);
+ if (error == 0) {
+ object = offset >> DNODE_SHIFT;
+ }
}
- error = dnode_next_offset(DMU_META_DNODE(os),
- DNODE_FIND_HOLE,
- &offset, 2, DNODES_PER_BLOCK >> 2, 0);
- if (error == 0)
- object = offset >> DNODE_SHIFT;
+ /*
+ * Note: if "restarted", we may find a L0 that
+ * is not suitably aligned.
+ */
+ os->os_obj_next_chunk =
+ P2ALIGN(object, dnodes_per_chunk) +
+ dnodes_per_chunk;
+ (void) atomic_swap_64(cpuobj, object);
+ mutex_exit(&os->os_obj_lock);
}
- os->os_obj_next = ++object;
+
+ /*
+ * The value of (*cpuobj) before adding dn_slots is the object
+ * ID assigned to us. The value afterwards is the object ID
+ * assigned to whoever wants to do an allocation next.
+ */
+ object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
/*
* XXX We should check for an i/o error here and return
@@ -84,47 +163,94 @@ dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
* dmu_tx_assign(), but there is currently no mechanism
* to do so.
*/
- (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
- FTAG, &dn);
- if (dn)
- break;
+ error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
+ dn_slots, FTAG, &dn);
+ if (error == 0) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ /*
+ * Another thread could have allocated it; check
+ * again now that we have the struct lock.
+ */
+ if (dn->dn_type == DMU_OT_NONE) {
+ dnode_allocate(dn, ot, blocksize, 0,
+ bonustype, bonuslen, dn_slots, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_tx_add_new_object(tx, dn);
+ dnode_rele(dn, FTAG);
+ return (object);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ DNODE_STAT_BUMP(dnode_alloc_race);
+ }
- if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
- os->os_obj_next = object - 1;
+ /*
+ * Skip to next known valid starting point on error. This
+ * is the start of the next block of dnodes.
+ */
+ if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
+ object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
+ DNODE_STAT_BUMP(dnode_alloc_next_block);
+ }
+ (void) atomic_swap_64(cpuobj, object);
}
-
- dnode_allocate(dn, ot, blocksize, indirect_blockshift,
- bonustype, bonuslen, tx);
- mutex_exit(&os->os_obj_lock);
-
- dmu_tx_add_new_object(tx, dn);
- dnode_rele(dn, FTAG);
-
- return (object);
}
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- return (dmu_object_alloc_ibs(os, ot, blocksize, 0,
- bonustype, bonuslen, tx));
+ return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
+ bonuslen, 0, tx));
+}
+
+uint64_t
+dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ dmu_tx_t *tx)
+{
+ return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
+ bonustype, bonuslen, 0, tx));
+}
+
+uint64_t
+dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
+ bonuslen, dnodesize, tx));
}
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
int err;
+ if (dn_slots == 0)
+ dn_slots = DNODE_MIN_SLOTS;
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+
if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
return (SET_ERROR(EBADF));
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
+ FTAG, &dn);
if (err)
return (err);
- dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
dmu_tx_add_new_object(tx, dn);
dnode_rele(dn, FTAG);
@@ -136,18 +262,28 @@ int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+ dmu_tx_t *tx)
+{
dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
int err;
if (object == DMU_META_DNODE_OBJECT)
return (SET_ERROR(EBADF));
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
FTAG, &dn);
if (err)
return (err);
- dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
+ dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);
dnode_rele(dn, FTAG);
return (err);
@@ -161,7 +297,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
FTAG, &dn);
if (err)
return (err);
@@ -186,9 +322,54 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
- uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
+ uint64_t offset;
+ uint64_t start_obj;
+ struct dsl_dataset *ds = os->os_dsl_dataset;
int error;
+ if (*objectp == 0) {
+ start_obj = 1;
+ } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
+ uint64_t i = *objectp + 1;
+ uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
+ dmu_object_info_t doi;
+
+ /*
+ * Scan through the remaining meta dnode block. The contents
+ * of each slot in the block are known so it can be quickly
+ * checked. If the block is exhausted without a match then
+ * hand off to dnode_next_offset() for further scanning.
+ */
+ while (i <= last_obj) {
+ error = dmu_object_info(os, i, &doi);
+ if (error == ENOENT) {
+ if (hole) {
+ *objectp = i;
+ return (0);
+ } else {
+ i++;
+ }
+ } else if (error == EEXIST) {
+ i++;
+ } else if (error == 0) {
+ if (hole) {
+ i += doi.doi_dnodesize >> DNODE_SHIFT;
+ } else {
+ *objectp = i;
+ return (0);
+ }
+ } else {
+ return (error);
+ }
+ }
+
+ start_obj = i;
+ } else {
+ start_obj = *objectp + 1;
+ }
+
+ offset = start_obj << DNODE_SHIFT;
+
error = dnode_next_offset(DMU_META_DNODE(os),
(hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
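
The heart of multi-threaded dmu_object_alloc() above: a global cursor (os_obj_next_chunk, protected by os_obj_lock) hands out chunks of 2^dmu_object_alloc_chunk_shift slots, and each CPU then carves object numbers out of its chunk with a lock-free atomic add. A deliberately simplified single-cursor sketch of that flow; the real code also holds the candidate dnode, handles allocation races, and scans for holes:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define P2PHASE(x, align)	((x) & ((align) - 1))
#define P2ALIGN(x, align)	((x) & -(align))
#define DNODES_PER_CHUNK	128	/* 1 << dmu_object_alloc_chunk_shift */

static uint64_t global_next_chunk = DNODES_PER_CHUNK;	/* object 0 reserved */
static _Atomic uint64_t percpu_cursor;	/* one entry per CPU in ZFS */

static uint64_t
alloc_object(int dn_slots)
{
	uint64_t object = atomic_load(&percpu_cursor);

	/*
	 * Refill from the global cursor when the chunk is exhausted or
	 * the next allocation would straddle a chunk boundary; this is
	 * the P2PHASE test from dmu_object_alloc_impl(). The refill is
	 * protected by os_obj_lock in the real code.
	 */
	if (P2PHASE(object, DNODES_PER_CHUNK) == 0 ||
	    P2PHASE(object + dn_slots - 1, DNODES_PER_CHUNK) < dn_slots) {
		object = global_next_chunk;
		global_next_chunk =
		    P2ALIGN(object, DNODES_PER_CHUNK) + DNODES_PER_CHUNK;
		atomic_store(&percpu_cursor, object);
	}
	/* The value before the add is ours; the add publishes the next. */
	return (atomic_fetch_add(&percpu_cursor, dn_slots));
}

int
main(void)
{
	for (int i = 0; i < 4; i++)
		printf("allocated object %llu (2 slots)\n",
		    (unsigned long long)alloc_object(2));
	return (0);
}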
diff --git a/uts/common/fs/zfs/dmu_objset.c b/uts/common/fs/zfs/dmu_objset.c
index c5267ac18dcd..db0fff702ef8 100644
--- a/uts/common/fs/zfs/dmu_objset.c
+++ b/uts/common/fs/zfs/dmu_objset.c
@@ -140,6 +140,12 @@ dmu_objset_id(objset_t *os)
return (ds ? ds->ds_object : 0);
}
+uint64_t
+dmu_objset_dnodesize(objset_t *os)
+{
+ return (os->os_dnodesize);
+}
+
zfs_sync_type_t
dmu_objset_syncprop(objset_t *os)
{
@@ -270,6 +276,34 @@ redundant_metadata_changed_cb(void *arg, uint64_t newval)
}
static void
+dnodesize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ switch (newval) {
+ case ZFS_DNSIZE_LEGACY:
+ os->os_dnodesize = DNODE_MIN_SIZE;
+ break;
+ case ZFS_DNSIZE_AUTO:
+ /*
+ * Choose a dnode size that will work well for most
+ * workloads if the user specified "auto". Future code
+ * improvements could dynamically select a dnode size
+ * based on observed workload patterns.
+ */
+ os->os_dnodesize = DNODE_MIN_SIZE * 2;
+ break;
+ case ZFS_DNSIZE_1K:
+ case ZFS_DNSIZE_2K:
+ case ZFS_DNSIZE_4K:
+ case ZFS_DNSIZE_8K:
+ case ZFS_DNSIZE_16K:
+ os->os_dnodesize = newval;
+ break;
+ }
+}
+
+static void
logbias_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
@@ -477,6 +511,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
recordsize_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DNODESIZE),
+ dnodesize_changed_cb, os);
+ }
}
if (needlock)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
@@ -496,6 +535,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
os->os_sync = ZFS_SYNC_STANDARD;
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
+ os->os_dnodesize = DNODE_MIN_SIZE;
}
/*
* These properties will be filled in by the logic in zfs_get_zplprop()
@@ -524,6 +564,9 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
+ os->os_obj_next_percpu_len = boot_ncpus;
+ os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len *
+ sizeof (os->os_obj_next_percpu[0]), KM_SLEEP);
dnode_special_open(os, &os->os_phys->os_meta_dnode,
DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
@@ -802,6 +845,9 @@ dmu_objset_evict_done(objset_t *os)
rw_enter(&os_lock, RW_READER);
rw_exit(&os_lock);
+ kmem_free(os->os_obj_next_percpu,
+ os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]));
+
mutex_destroy(&os->os_lock);
mutex_destroy(&os->os_userused_lock);
mutex_destroy(&os->os_obj_lock);
@@ -836,8 +882,8 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
mdn = DMU_META_DNODE(os);
- dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
- DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
+ dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT,
+ DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx);
/*
* We don't want to have to increase the meta-dnode's nlevels
@@ -1496,7 +1542,7 @@ do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
uint64_t user, uint64_t group, boolean_t subtract)
{
if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
- int64_t delta = DNODE_SIZE + used;
+ int64_t delta = DNODE_MIN_SIZE + used;
if (subtract)
delta = -delta;
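
The dnodesize_changed_cb() switch above works because of how the property values are encoded: ZFS_DNSIZE_LEGACY and ZFS_DNSIZE_AUTO are small sentinels, while the sized variants are the byte counts themselves, which is why the callback can assign newval directly. A sketch of the mapping (enum values as added to sys/fs/zfs.h by this change):

#include <stdio.h>

enum {
	ZFS_DNSIZE_LEGACY	= 0,
	ZFS_DNSIZE_AUTO		= 1,
	ZFS_DNSIZE_1K		= 1024,
	ZFS_DNSIZE_2K		= 2048,
	ZFS_DNSIZE_4K		= 4096,
	ZFS_DNSIZE_8K		= 8192,
	ZFS_DNSIZE_16K		= 16384,
};

#define DNODE_MIN_SIZE	512

static int
dnodesize_to_bytes(unsigned long long newval)
{
	switch (newval) {
	case ZFS_DNSIZE_LEGACY:
		return (DNODE_MIN_SIZE);
	case ZFS_DNSIZE_AUTO:
		/* "auto" currently means two slots; see the callback */
		return (DNODE_MIN_SIZE * 2);
	default:
		return ((int)newval);	/* 1K..16K map to themselves */
	}
}

int
main(void)
{
	printf("auto -> %d bytes\n", dnodesize_to_bytes(ZFS_DNSIZE_AUTO));
	printf("4k   -> %d bytes\n", dnodesize_to_bytes(ZFS_DNSIZE_4K));
	return (0);
}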
diff --git a/uts/common/fs/zfs/dmu_send.c b/uts/common/fs/zfs/dmu_send.c
index 62abee3637eb..3d6858bfec2a 100644
--- a/uts/common/fs/zfs/dmu_send.c
+++ b/uts/common/fs/zfs/dmu_send.c
@@ -469,6 +469,7 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
drro->drr_bonustype = dnp->dn_bonustype;
drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
drro->drr_bonuslen = dnp->dn_bonuslen;
+ drro->drr_dn_slots = dnp->dn_extra_slots + 1;
drro->drr_checksumtype = dnp->dn_checksum;
drro->drr_compress = dnp->dn_compress;
drro->drr_toguid = dsp->dsa_toguid;
@@ -621,7 +622,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
return (0);
} else if (type == DMU_OT_DNODE) {
- int blksz = BP_GET_LSIZE(bp);
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
@@ -633,8 +634,8 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
return (SET_ERROR(EIO));
dnode_phys_t *blk = abuf->b_data;
- uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT);
- for (int i = 0; i < blksz >> DNODE_SHIFT; i++) {
+ uint64_t dnobj = zb->zb_blkid * epb;
+ for (int i = 0; i < epb; i += blk[i].dn_extra_slots + 1) {
err = dump_dnode(dsa, dnobj + i, blk + i);
if (err != 0)
break;
@@ -802,6 +803,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
+ if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE])
+ featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -1396,11 +1399,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/*
* The receiving code doesn't know how to translate large blocks
* to smaller ones, so the pool must have the LARGE_BLOCKS
- * feature enabled if the stream has LARGE_BLOCKS.
+ * feature enabled if the stream has LARGE_BLOCKS. Same with
+ * large dnodes.
*/
if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
+ return (SET_ERROR(ENOTSUP));
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
@@ -1605,6 +1612,9 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
dsl_dataset_t *ds;
const char *tofs = drba->drba_cookie->drc_tofs;
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
/* already checked */
ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);
@@ -1632,8 +1642,18 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
- /* 6 extra bytes for /%recv */
- char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+ /*
+ * The receiving code doesn't know how to translate large blocks
+ * to smaller ones, so the pool must have the LARGE_BLOCKS
+ * feature enabled if the stream has LARGE_BLOCKS. Same with
+ * large dnodes.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
+ return (SET_ERROR(ENOTSUP));
(void) snprintf(recvname, sizeof (recvname), "%s/%s",
tofs, recv_clone_name);
@@ -2024,7 +2044,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
return (1);
} else {
return (1 +
- ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
+ ((DN_OLD_MAX_BONUSLEN -
+ MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT));
}
}
@@ -2082,15 +2103,17 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
- drro->drr_bonuslen > DN_MAX_BONUSLEN) {
+ drro->drr_bonuslen >
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
+ drro->drr_dn_slots >
+ (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
return (SET_ERROR(EINVAL));
}
err = dmu_object_info(rwa->os, drro->drr_object, &doi);
- if (err != 0 && err != ENOENT)
+ if (err != 0 && err != ENOENT && err != EEXIST)
return (SET_ERROR(EINVAL));
- object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
if (drro->drr_object > rwa->max_object)
rwa->max_object = drro->drr_object;
@@ -2103,16 +2126,64 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
if (err == 0) {
int nblkptr;
+ object = drro->drr_object;
+
nblkptr = deduce_nblkptr(drro->drr_bonustype,
drro->drr_bonuslen);
if (drro->drr_blksz != doi.doi_data_block_size ||
- nblkptr < doi.doi_nblkptr) {
+ nblkptr < doi.doi_nblkptr ||
+ drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
err = dmu_free_long_range(rwa->os, drro->drr_object,
0, DMU_OBJECT_END);
if (err != 0)
return (SET_ERROR(EINVAL));
}
+ } else if (err == EEXIST) {
+ /*
+ * The object requested is currently an interior slot of a
+ * multi-slot dnode. This will be resolved when the next txg
+ * is synced out, since the send stream will have told us
+ * to free this slot when we freed the associated dnode
+ * earlier in the stream.
+ */
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ object = drro->drr_object;
+ } else {
+ /* object is free and we are about to allocate a new one */
+ object = DMU_NEW_OBJECT;
+ }
+
+ /*
+ * If this is a multi-slot dnode there is a chance that this
+ * object will expand into a slot that is already used by
+ * another object from the previous snapshot. We must free
+ * these objects before we attempt to allocate the new dnode.
+ */
+ if (drro->drr_dn_slots > 1) {
+ boolean_t need_sync = B_FALSE;
+
+ for (uint64_t slot = drro->drr_object + 1;
+ slot < drro->drr_object + drro->drr_dn_slots;
+ slot++) {
+ dmu_object_info_t slot_doi;
+
+ err = dmu_object_info(rwa->os, slot, &slot_doi);
+ if (err == ENOENT || err == EEXIST)
+ continue;
+ else if (err != 0)
+ return (err);
+
+ err = dmu_free_long_object(rwa->os, slot);
+
+ if (err != 0)
+ return (err);
+
+ need_sync = B_TRUE;
+ }
+
+ if (need_sync)
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
}
tx = dmu_tx_create(rwa->os);
@@ -2125,9 +2196,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
if (object == DMU_NEW_OBJECT) {
/* currently free, want to be allocated */
- err = dmu_object_claim(rwa->os, drro->drr_object,
+ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
- drro->drr_bonustype, drro->drr_bonuslen, tx);
+ drro->drr_bonustype, drro->drr_bonuslen,
+ drro->drr_dn_slots << DNODE_SHIFT, tx);
} else if (drro->drr_type != doi.doi_type ||
drro->drr_blksz != doi.doi_data_block_size ||
drro->drr_bonustype != doi.doi_bonus_type ||
@@ -2179,13 +2251,18 @@ receive_freeobjects(struct receive_writer_arg *rwa,
if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
return (SET_ERROR(EINVAL));
- for (obj = drrfo->drr_firstobj;
+ for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
int err;
- if (dmu_object_info(rwa->os, obj, NULL) != 0)
+ err = dmu_object_info(rwa->os, obj, NULL);
+ if (err == ENOENT) {
+ obj++;
continue;
+ } else if (err != 0) {
+ return (err);
+ }
err = dmu_free_long_object(rwa->os, obj);
if (err != 0)
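
The deduce_nblkptr() change in the hunks above guards the legacy arithmetic against bonus buffers larger than the old 320-byte limit, which are legal in streams carrying DMU_BACKUP_FEATURE_LARGE_DNODE; without the MIN() clamp the subtraction would underflow. A worked sketch (the bonus_is_sa flag stands in for the DMU_OT_SA type check):

#include <stdio.h>

#define DN_OLD_MAX_BONUSLEN	320
#define SPA_BLKPTRSHIFT		7	/* 128-byte block pointers */
#define MIN(a, b)		((a) < (b) ? (a) : (b))

static int
deduce_nblkptr(int bonus_is_sa, unsigned long long bonus_size)
{
	if (bonus_is_sa)
		return (1);	/* SA maximizes bonus space */
	return (1 + (int)((DN_OLD_MAX_BONUSLEN -
	    MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT));
}

int
main(void)
{
	printf("%d\n", deduce_nblkptr(0, 0));	/* 3: full blkptr array */
	printf("%d\n", deduce_nblkptr(0, 320));	/* 1: bonus fills it */
	printf("%d\n", deduce_nblkptr(0, 960));	/* 1: large-dnode bonus */
	return (0);
}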
diff --git a/uts/common/fs/zfs/dmu_traverse.c b/uts/common/fs/zfs/dmu_traverse.c
index 050cd69811a6..d54042b04108 100644
--- a/uts/common/fs/zfs/dmu_traverse.c
+++ b/uts/common/fs/zfs/dmu_traverse.c
@@ -327,13 +327,13 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
goto post;
dnode_phys_t *child_dnp = buf->b_data;
- for (i = 0; i < epb; i++) {
+ for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
prefetch_dnode_metadata(td, &child_dnp[i],
zb->zb_objset, zb->zb_blkid * epb + i);
}
/* recursively visitbp() blocks below this */
- for (i = 0; i < epb; i++) {
+ for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
err = traverse_dnode(td, &child_dnp[i],
zb->zb_objset, zb->zb_blkid * epb + i);
if (err != 0)
@@ -435,7 +435,7 @@ prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
- traverse_prefetch_metadata(td, &dnp->dn_spill, &czb);
+ traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb);
}
}
@@ -470,7 +470,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
- err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
+ err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
}
if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
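
Both traversal hunks above take DN_SPILL_BLKPTR(dnp) where the code used to take &dnp->dn_spill: with variable-size dnodes the spill block pointer is no longer a fixed struct field but the trailing 128 bytes of however large the dnode actually is. A sketch of the address computation (the struct is trimmed, with dn_extra_slots kept at its on-disk offset; the macro body follows the definition this change adds to sys/dnode.h):

#include <stdint.h>

#define DNODE_SHIFT	9	/* 512-byte slots */
#define SPA_BLKPTRSHIFT	7	/* 128-byte block pointers */

typedef struct blkptr { uint64_t bp_word[16]; } blkptr_t;	/* 128 bytes */

typedef struct dnode_phys {
	uint8_t	dn_type;
	uint8_t	dn_pad0[11];	/* fields elided from this sketch */
	uint8_t	dn_extra_slots;	/* extra 512-byte slots, 0 for legacy */
	/* ... remainder of the header, bonus area, and tail ... */
} dnode_phys_t;

/*
 * The spill blkptr occupies the last (1 << SPA_BLKPTRSHIFT) bytes of
 * the dnode, so its offset scales with dn_extra_slots.
 */
#define DN_SPILL_BLKPTR(dnp)	((blkptr_t *)((char *)(dnp) +	\
	(((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) -		\
	(1 << SPA_BLKPTRSHIFT)))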
diff --git a/uts/common/fs/zfs/dmu_tx.c b/uts/common/fs/zfs/dmu_tx.c
index 557f7f2e8162..4f181fa54739 100644
--- a/uts/common/fs/zfs/dmu_tx.c
+++ b/uts/common/fs/zfs/dmu_tx.c
@@ -280,7 +280,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
- (void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG);
+ (void) refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE, FTAG);
}
void
@@ -1246,11 +1246,13 @@ dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
void
dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
{
- dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
- tx->tx_objset, object, THT_SPILL, 0, 0);
+ dmu_tx_hold_t *txh;
- (void) refcount_add_many(&txh->txh_space_towrite,
- SPA_OLD_MAXBLOCKSIZE, FTAG);
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+ THT_SPILL, 0, 0);
+ if (txh != NULL)
+ (void) refcount_add_many(&txh->txh_space_towrite,
+ SPA_OLD_MAXBLOCKSIZE, FTAG);
}
void
@@ -1274,7 +1276,7 @@ dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
dmu_tx_sa_registration_hold(sa, tx);
- if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
+ if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
return;
(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
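
In the dnode.c changes that follow, a multi-slot hold must own every per-slot zrlock it spans before mutating slot state, and dnode_hold_impl() cannot afford to block on slots it may end up not taking. The pattern used there is take-all-or-release-all, retried under a stat counter. A minimal sketch with plain mutexes standing in for zrlocks:

#include <pthread.h>

#define DNODES_PER_BLOCK	32	/* 16K dnode block / 512-byte slots */

static pthread_mutex_t slot_lock[DNODES_PER_BLOCK];

static void
slots_init(void)
{
	for (int i = 0; i < DNODES_PER_BLOCK; i++)
		(void) pthread_mutex_init(&slot_lock[i], NULL);
}

/*
 * Mirrors dnode_slots_tryenter(): take every lock covering slots
 * [idx, idx + slots) or take none, so concurrent multi-slot holds
 * back off instead of deadlocking against each other.
 */
static int
slots_tryenter(int idx, int slots)
{
	for (int i = idx; i < idx + slots; i++) {
		if (pthread_mutex_trylock(&slot_lock[i]) != 0) {
			/* Back off: release everything taken so far. */
			for (int j = idx; j < i; j++)
				pthread_mutex_unlock(&slot_lock[j]);
			return (0);
		}
	}
	return (1);
}

Callers spin on it exactly as the diff does, bumping a retry counter on each failure, e.g. while (!slots_tryenter(idx, slots)) retries++; the zrlock variant additionally tolerates dnode_move() holding a slot transiently.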
diff --git a/uts/common/fs/zfs/dnode.c b/uts/common/fs/zfs/dnode.c
index 2720cdbce4d6..1305a4f64aa9 100644
--- a/uts/common/fs/zfs/dnode.c
+++ b/uts/common/fs/zfs/dnode.c
@@ -40,20 +40,40 @@
#include <sys/dmu_zfetch.h>
#include <sys/range_tree.h>
+dnode_stats_t dnode_stats = {
+ { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 },
+ { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_hits", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_txg", KSTAT_DATA_UINT64 },
+ { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_allocate", KSTAT_DATA_UINT64 },
+ { "dnode_reallocate", KSTAT_DATA_UINT64 },
+ { "dnode_buf_evict", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_race", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_next_block", KSTAT_DATA_UINT64 },
+ { "dnode_move_invalid", KSTAT_DATA_UINT64 },
+ { "dnode_move_recheck1", KSTAT_DATA_UINT64 },
+ { "dnode_move_recheck2", KSTAT_DATA_UINT64 },
+ { "dnode_move_special", KSTAT_DATA_UINT64 },
+ { "dnode_move_handle", KSTAT_DATA_UINT64 },
+ { "dnode_move_rwlock", KSTAT_DATA_UINT64 },
+ { "dnode_move_active", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dnode_ksp;
static kmem_cache_t *dnode_cache;
-/*
- * Define DNODE_STATS to turn on statistic gathering. By default, it is only
- * turned on when DEBUG is also defined.
- */
-#ifdef DEBUG
-#define DNODE_STATS
-#endif /* DEBUG */
-
-#ifdef DNODE_STATS
-#define DNODE_STAT_ADD(stat) ((stat)++)
-#else
-#define DNODE_STAT_ADD(stat) /* nothing */
-#endif /* DNODE_STATS */
static dnode_phys_t dnode_phys_zero;
@@ -218,12 +238,25 @@ dnode_init(void)
0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
#ifdef _KERNEL
kmem_cache_set_move(dnode_cache, dnode_move);
+
+ dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (dnode_ksp != NULL) {
+ dnode_ksp->ks_data = &dnode_stats;
+ kstat_install(dnode_ksp);
+ }
#endif /* _KERNEL */
}
void
dnode_fini(void)
{
+ if (dnode_ksp != NULL) {
+ kstat_delete(dnode_ksp);
+ dnode_ksp = NULL;
+ }
+
kmem_cache_destroy(dnode_cache);
dnode_cache = NULL;
}
@@ -250,6 +283,7 @@ dnode_verify(dnode_t *dn)
}
if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
int i;
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
ASSERT3U(dn->dn_indblkshift, >=, 0);
ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
if (dn->dn_datablkshift) {
@@ -261,12 +295,12 @@ dnode_verify(dnode_t *dn)
ASSERT(DMU_OT_IS_VALID(dn->dn_type));
ASSERT3U(dn->dn_nblkptr, >=, 1);
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
- ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
ASSERT3U(dn->dn_datablksz, ==,
dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
- dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ dn->dn_bonuslen, <=, max_bonuslen);
for (i = 0; i < TXG_SIZE; i++) {
ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
}
@@ -297,6 +331,7 @@ dnode_byteswap(dnode_phys_t *dnp)
dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+ dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
dnp->dn_used = BSWAP_64(dnp->dn_used);
@@ -323,7 +358,8 @@ dnode_byteswap(dnode_phys_t *dnp)
* dnode buffer).
*/
int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
- size_t len = DN_MAX_BONUSLEN - off;
+ int slots = dnp->dn_extra_slots + 1;
+ size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
dmu_object_byteswap_t byteswap =
DMU_OT_BYTESWAP(dnp->dn_bonustype);
@@ -332,23 +368,25 @@ dnode_byteswap(dnode_phys_t *dnp)
/* Swap SPILL block if we have one */
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
- byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
+ byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
}
void
dnode_buf_byteswap(void *vbuf, size_t size)
{
- dnode_phys_t *buf = vbuf;
- int i;
+ int i = 0;
ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
- size >>= DNODE_SHIFT;
- for (i = 0; i < size; i++) {
- dnode_byteswap(buf);
- buf++;
+ while (i < size) {
+ dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
+ dnode_byteswap(dnp);
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE)
+ i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
}
}
@@ -359,7 +397,7 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
dnode_setdirty(dn, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+ ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
(dn->dn_nblkptr-1) * sizeof (blkptr_t));
dn->dn_bonuslen = newsize;
if (newsize == 0)
@@ -439,6 +477,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dn->dn_compress = dnp->dn_compress;
dn->dn_bonustype = dnp->dn_bonustype;
dn->dn_bonuslen = dnp->dn_bonuslen;
+ dn->dn_num_slots = dnp->dn_extra_slots + 1;
dn->dn_maxblkid = dnp->dn_maxblkid;
dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
dn->dn_id_flags = 0;
@@ -446,14 +485,10 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dmu_zfetch_init(&dn->dn_zfetch, dn);
ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+ ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+ ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
mutex_enter(&os->os_lock);
- if (dnh->dnh_dnode != NULL) {
- /* Lost the allocation race. */
- mutex_exit(&os->os_lock);
- kmem_cache_free(dnode_cache, dn);
- return (dnh->dnh_dnode);
- }
/*
* Exclude special dnodes from os_dnodes so an empty os_dnodes
@@ -476,6 +511,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
mutex_exit(&os->os_lock);
arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
+
return (dn);
}
@@ -501,7 +537,8 @@ dnode_destroy(dnode_t *dn)
mutex_exit(&os->os_lock);
/* the dnode can no longer move, so we can release the handle */
- zrl_remove(&dn->dn_handle->dnh_zrlock);
+ if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
+ zrl_remove(&dn->dn_handle->dnh_zrlock);
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
@@ -538,10 +575,13 @@ dnode_destroy(dnode_t *dn)
void
dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
{
int i;
+ ASSERT3U(dn_slots, >, 0);
+ ASSERT3U(dn_slots << DNODE_SHIFT, <=,
+ spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
ASSERT3U(blocksize, <=,
spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (blocksize == 0)
@@ -554,8 +594,10 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
- dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
- dn->dn_object, tx->tx_txg, blocksize, ibs);
+ dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64
+ " blocksize=%d ibs=%d dn_slots=%d\n",
+ dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
+ DNODE_STAT_BUMP(dnode_allocate);
ASSERT(dn->dn_type == DMU_OT_NONE);
ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
@@ -566,7 +608,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
(bonustype == DMU_OT_SA && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
ASSERT(dn->dn_type == DMU_OT_NONE);
ASSERT0(dn->dn_maxblkid);
ASSERT0(dn->dn_allocated_txg);
@@ -592,11 +634,15 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dnode_setdblksz(dn, blocksize);
dn->dn_indblkshift = ibs;
dn->dn_nlevels = 1;
+ dn->dn_num_slots = dn_slots;
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
dn->dn_nblkptr = 1;
- else
- dn->dn_nblkptr = 1 +
- ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ else {
+ dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
+ }
+
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
@@ -621,7 +667,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
{
int nblkptr;
@@ -635,7 +681,13 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
(bonustype != DMU_OT_NONE && bonuslen != 0) ||
(bonustype == DMU_OT_SA && bonuslen == 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(bonuslen, <=,
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+
+ dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
+
+ dnode_free_interior_slots(dn);
+ DNODE_STAT_BUMP(dnode_reallocate);
/* clean up any unreferenced dbufs */
dnode_evict_dbufs(dn);
@@ -658,7 +710,9 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
nblkptr = 1;
else
- nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
if (dn->dn_bonustype != bonustype)
dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
if (dn->dn_nblkptr != nblkptr)
@@ -676,6 +730,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
mutex_enter(&dn->dn_mtx);
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
+ dn->dn_num_slots = dn_slots;
dn->dn_nblkptr = nblkptr;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
dn->dn_compress = ZIO_COMPRESS_INHERIT;
@@ -684,7 +739,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
/* fix up the bonus db_size */
if (dn->dn_bonus) {
dn->dn_bonus->db.db_size =
- DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr - 1) * sizeof (blkptr_t);
ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
}
@@ -692,18 +748,6 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
mutex_exit(&dn->dn_mtx);
}
-#ifdef DNODE_STATS
-static struct {
- uint64_t dms_dnode_invalid;
- uint64_t dms_dnode_recheck1;
- uint64_t dms_dnode_recheck2;
- uint64_t dms_dnode_special;
- uint64_t dms_dnode_handle;
- uint64_t dms_dnode_rwlock;
- uint64_t dms_dnode_active;
-} dnode_move_stats;
-#endif /* DNODE_STATS */
-
#ifdef _KERNEL
static void
dnode_move_impl(dnode_t *odn, dnode_t *ndn)
@@ -733,6 +777,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
ndn->dn_datablkszsec = odn->dn_datablkszsec;
ndn->dn_datablksz = odn->dn_datablksz;
ndn->dn_maxblkid = odn->dn_maxblkid;
+ ndn->dn_num_slots = odn->dn_num_slots;
bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
sizeof (odn->dn_next_type));
bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
@@ -863,7 +908,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
*/
os = odn->dn_objset;
if (!POINTER_IS_VALID(os)) {
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
+ DNODE_STAT_BUMP(dnode_move_invalid);
return (KMEM_CBRC_DONT_KNOW);
}
@@ -873,7 +918,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
rw_enter(&os_lock, RW_WRITER);
if (os != odn->dn_objset) {
rw_exit(&os_lock);
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
+ DNODE_STAT_BUMP(dnode_move_recheck1);
return (KMEM_CBRC_DONT_KNOW);
}
@@ -891,7 +936,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
if (os != odn->dn_objset) {
mutex_exit(&os->os_lock);
rw_exit(&os_lock);
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
+ DNODE_STAT_BUMP(dnode_move_recheck2);
return (KMEM_CBRC_DONT_KNOW);
}
@@ -904,7 +949,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
rw_exit(&os_lock);
if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
mutex_exit(&os->os_lock);
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
+ DNODE_STAT_BUMP(dnode_move_special);
return (KMEM_CBRC_NO);
}
ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
@@ -919,7 +964,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
*/
if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
mutex_exit(&os->os_lock);
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
+ DNODE_STAT_BUMP(dnode_move_handle);
return (KMEM_CBRC_LATER);
}
@@ -935,7 +980,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
zrl_exit(&odn->dn_handle->dnh_zrlock);
mutex_exit(&os->os_lock);
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
+ DNODE_STAT_BUMP(dnode_move_rwlock);
return (KMEM_CBRC_LATER);
}
@@ -961,7 +1006,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
rw_exit(&odn->dn_struct_rwlock);
zrl_exit(&odn->dn_handle->dnh_zrlock);
mutex_exit(&os->os_lock);
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
+ DNODE_STAT_BUMP(dnode_move_active);
return (KMEM_CBRC_LATER);
}
@@ -985,6 +1030,132 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
}
#endif /* _KERNEL */
+static void
+dnode_slots_hold(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ zrl_add(&dnh->dnh_zrlock);
+ }
+}
+
+static void
+dnode_slots_rele(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ if (zrl_is_locked(&dnh->dnh_zrlock))
+ zrl_exit(&dnh->dnh_zrlock);
+ else
+ zrl_remove(&dnh->dnh_zrlock);
+ }
+}
+
+static int
+dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ if (!zrl_tryenter(&dnh->dnh_zrlock)) {
+ for (int j = idx; j < i; j++) {
+ dnh = &children->dnc_children[j];
+ zrl_exit(&dnh->dnh_zrlock);
+ }
+
+ return (0);
+ }
+ }
+
+ return (1);
+}
+
+static void
+dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ dnh->dnh_dnode = ptr;
+ }
+}
+
+static boolean_t
+dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ dnode_t *dn = dnh->dnh_dnode;
+
+ if (dn == DN_SLOT_FREE) {
+ continue;
+ } else if (DN_SLOT_IS_PTR(dn)) {
+ mutex_enter(&dn->dn_mtx);
+ dmu_object_type_t type = dn->dn_type;
+ mutex_exit(&dn->dn_mtx);
+
+ if (type != DMU_OT_NONE)
+ return (B_FALSE);
+
+ continue;
+ } else {
+ return (B_FALSE);
+ }
+
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+static void
+dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
+ dnode_destroy(dnh->dnh_dnode);
+ dnh->dnh_dnode = DN_SLOT_FREE;
+ }
+ }
+}
+
+void
+dnode_free_interior_slots(dnode_t *dn)
+{
+ dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
+ int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
+ int idx = (dn->dn_object & (epb - 1)) + 1;
+ int slots = dn->dn_num_slots - 1;
+
+ if (slots == 0)
+ return;
+
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ while (!dnode_slots_tryenter(children, idx, slots))
+ DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
+
+ dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
+ dnode_slots_rele(children, idx, slots);
+}
+
void
dnode_special_close(dnode_handle_t *dnh)
{
@@ -992,7 +1163,7 @@ dnode_special_close(dnode_handle_t *dnh)
/*
* Wait for final references to the dnode to clear. This can
- * only happen if the arc is asyncronously evicting state that
+ * only happen if the arc is asynchronously evicting state that
* has a hold on this dnode while we are trying to evict this
* dnode.
*/
@@ -1012,19 +1183,24 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
{
dnode_t *dn;
- dn = dnode_create(os, dnp, NULL, object, dnh);
zrl_init(&dnh->dnh_zrlock);
+ zrl_tryenter(&dnh->dnh_zrlock);
+
+ dn = dnode_create(os, dnp, NULL, object, dnh);
DNODE_VERIFY(dn);
+
+ zrl_exit(&dnh->dnh_zrlock);
}
static void
dnode_buf_evict_async(void *dbu)
{
- dnode_children_t *children_dnodes = dbu;
- int i;
+ dnode_children_t *dnc = dbu;
+
+ DNODE_STAT_BUMP(dnode_buf_evict);
- for (i = 0; i < children_dnodes->dnc_count; i++) {
- dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
+ for (int i = 0; i < dnc->dnc_count; i++) {
+ dnode_handle_t *dnh = &dnc->dnc_children[i];
dnode_t *dn;
/*
@@ -1032,8 +1208,9 @@ dnode_buf_evict_async(void *dbu)
* another valid address, so there is no need here to guard
* against changes to or from NULL.
*/
- if (dnh->dnh_dnode == NULL) {
+ if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = DN_SLOT_UNINIT;
continue;
}
@@ -1048,22 +1225,40 @@ dnode_buf_evict_async(void *dbu)
ASSERT(refcount_is_zero(&dn->dn_holds));
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
- dnode_destroy(dn); /* implicit zrl_remove() */
+ dnode_destroy(dn); /* implicit zrl_remove() for first slot */
zrl_destroy(&dnh->dnh_zrlock);
- dnh->dnh_dnode = NULL;
+ dnh->dnh_dnode = DN_SLOT_UNINIT;
}
- kmem_free(children_dnodes, sizeof (dnode_children_t) +
- children_dnodes->dnc_count * sizeof (dnode_handle_t));
+ kmem_free(dnc, sizeof (dnode_children_t) +
+ dnc->dnc_count * sizeof (dnode_handle_t));
}
/*
+ * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
+ * to ensure the hole at the specified object offset is large enough to
+ * hold the dnode being created. The slots parameter is also used to ensure
+ * a dnode does not span multiple dnode blocks. In both of these cases, if
+ * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
+ * are only possible when using DNODE_MUST_BE_FREE.
+ *
+ * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
+ * dnode_hold_impl() will check if the requested dnode is already consumed
+ * as an extra dnode slot by a large dnode, in which case it returns
+ * ENOENT.
+ *
* errors:
- * EINVAL - invalid object number.
- * EIO - i/o error.
+ * EINVAL - invalid object number or flags.
+ * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
+ * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
+ * - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
+ * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
+ * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
+ * - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
+ * EIO - i/o error when reading the meta dnode dbuf.
* succeeds even for free dnodes.
*/
int
-dnode_hold_impl(objset_t *os, uint64_t object, int flag,
+dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
void *tag, dnode_t **dnp)
{
int epb, idx, err;
@@ -1072,9 +1267,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
uint64_t blk;
dnode_t *mdn, *dn;
dmu_buf_impl_t *db;
- dnode_children_t *children_dnodes;
+ dnode_children_t *dnc;
+ dnode_phys_t *dn_block;
dnode_handle_t *dnh;
+ ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
+ ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+
/*
* If you are holding the spa config lock as writer, you shouldn't
* be asking the DMU to do *anything* unless it's the root pool
@@ -1121,10 +1320,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock)
rw_exit(&mdn->dn_struct_rwlock);
- if (db == NULL)
+ if (db == NULL) {
+ DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
return (SET_ERROR(EIO));
+ }
err = dbuf_read(db, NULL, DB_RF_CANFAIL);
if (err) {
+ DNODE_STAT_BUMP(dnode_hold_dbuf_read);
dbuf_rele(db, FTAG);
return (err);
}
@@ -1132,62 +1334,194 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
epb = db->db.db_size >> DNODE_SHIFT;
- idx = object & (epb-1);
+ idx = object & (epb - 1);
+ dn_block = (dnode_phys_t *)db->db.db_data;
ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
- children_dnodes = dmu_buf_get_user(&db->db);
- if (children_dnodes == NULL) {
- int i;
+ dnc = dmu_buf_get_user(&db->db);
+ dnh = NULL;
+ if (dnc == NULL) {
dnode_children_t *winner;
- children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
+ int skip = 0;
+
+ dnc = kmem_zalloc(sizeof (dnode_children_t) +
epb * sizeof (dnode_handle_t), KM_SLEEP);
- children_dnodes->dnc_count = epb;
- dnh = &children_dnodes->dnc_children[0];
- for (i = 0; i < epb; i++) {
+ dnc->dnc_count = epb;
+ dnh = &dnc->dnc_children[0];
+
+ /* Initialize dnode slot status from dnode_phys_t */
+ for (int i = 0; i < epb; i++) {
zrl_init(&dnh[i].dnh_zrlock);
+
+ if (skip) {
+ skip--;
+ continue;
+ }
+
+ if (dn_block[i].dn_type != DMU_OT_NONE) {
+ int interior = dn_block[i].dn_extra_slots;
+
+ dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
+ dnode_set_slots(dnc, i + 1, interior,
+ DN_SLOT_INTERIOR);
+ skip = interior;
+ } else {
+ dnh[i].dnh_dnode = DN_SLOT_FREE;
+ skip = 0;
+ }
}
- dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL,
+
+ dmu_buf_init_user(&dnc->dnc_dbu, NULL,
dnode_buf_evict_async, NULL);
- winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
+ winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
if (winner != NULL) {
- for (i = 0; i < epb; i++) {
+ for (int i = 0; i < epb; i++)
zrl_destroy(&dnh[i].dnh_zrlock);
- }
- kmem_free(children_dnodes, sizeof (dnode_children_t) +
+ kmem_free(dnc, sizeof (dnode_children_t) +
epb * sizeof (dnode_handle_t));
- children_dnodes = winner;
+ dnc = winner;
}
}
- ASSERT(children_dnodes->dnc_count == epb);
- dnh = &children_dnodes->dnc_children[idx];
- zrl_add(&dnh->dnh_zrlock);
- dn = dnh->dnh_dnode;
- if (dn == NULL) {
- dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
+ ASSERT(dnc->dnc_count == epb);
+ dn = DN_SLOT_UNINIT;
+
+ if (flag & DNODE_MUST_BE_ALLOCATED) {
+ slots = 1;
+
+ while (dn == DN_SLOT_UNINIT) {
+ dnode_slots_hold(dnc, idx, slots);
+ dnh = &dnc->dnc_children[idx];
+
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ dn = dnh->dnh_dnode;
+ break;
+ } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_interior);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EEXIST));
+ } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOENT));
+ }
+
+ dnode_slots_rele(dnc, idx, slots);
+ if (!dnode_slots_tryenter(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
+ continue;
+ }
+
+ /*
+ * Someone else won the race and called dnode_create()
+ * after we checked DN_SLOT_IS_PTR() above but before
+ * we acquired the lock.
+ */
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
+ dn = dnh->dnh_dnode;
+ } else {
+ dn = dnode_create(os, dn_block + idx, db,
+ object, dnh);
+ }
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOENT));
+ }
+
+ DNODE_STAT_BUMP(dnode_hold_alloc_hits);
+ } else if (flag & DNODE_MUST_BE_FREE) {
+
+ if (idx + slots - 1 >= DNODES_PER_BLOCK) {
+ DNODE_STAT_BUMP(dnode_hold_free_overflow);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ while (dn == DN_SLOT_UNINIT) {
+ dnode_slots_hold(dnc, idx, slots);
+
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ dnode_slots_rele(dnc, idx, slots);
+ if (!dnode_slots_tryenter(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
+ continue;
+ }
+
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ /*
+ * Allocated but otherwise free dnodes which would
+ * be in the interior of a multi-slot dnode need
+ * to be freed. Single slot dnodes can be safely
+ * re-purposed as a performance optimization.
+ */
+ if (slots > 1)
+ dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+
+ dnh = &dnc->dnc_children[idx];
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ dn = dnh->dnh_dnode;
+ } else {
+ dn = dnode_create(os, dn_block + idx, db,
+ object, dnh);
+ }
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ if (!refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
+ DNODE_STAT_BUMP(dnode_hold_free_refcount);
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
- dn = dnode_create(os, phys, db, object, dnh);
+ dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
+ DNODE_STAT_BUMP(dnode_hold_free_hits);
+ } else {
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EINVAL));
}
- mutex_enter(&dn->dn_mtx);
- type = dn->dn_type;
- if (dn->dn_free_txg ||
- ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
- ((flag & DNODE_MUST_BE_FREE) &&
- (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
+ if (dn->dn_free_txg) {
+ DNODE_STAT_BUMP(dnode_hold_free_txg);
+ type = dn->dn_type;
mutex_exit(&dn->dn_mtx);
- zrl_remove(&dnh->dnh_zrlock);
+ dnode_slots_rele(dnc, idx, slots);
dbuf_rele(db, FTAG);
- return ((flag & DNODE_MUST_BE_ALLOCATED) ? ENOENT : EEXIST);
+ return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ?
+ ENOENT : EEXIST));
}
+
if (refcount_add(&dn->dn_holds, tag) == 1)
dbuf_add_ref(db, dnh);
+
mutex_exit(&dn->dn_mtx);
/* Now we can rely on the hold to prevent the dnode from moving. */
- zrl_remove(&dnh->dnh_zrlock);
+ dnode_slots_rele(dnc, idx, slots);
DNODE_VERIFY(dn);
ASSERT3P(dn->dn_dbuf, ==, db);
@@ -1204,7 +1538,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
int
dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
{
- return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
+ dnp));
}
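For illustration, a caller claiming a two-slot hole through the extended interface might look like the following sketch; the helper name and object number are hypothetical, and the error conventions are those documented above:

static int
dnode_hold_free_example(objset_t *os, uint64_t object, dnode_t **dnp)
{
        /*
         * Ask for two consecutive free slots (a 1K dnode). ENOSPC means
         * the hole is too small or the range would cross a dnode block.
         */
        int err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, 2,
            FTAG, dnp);
        if (err != 0)
                return (err);
        /* ... the slot range is now held; allocate the dnode here ... */
        dnode_rele(*dnp, FTAG);
        return (0);
}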
/*
@@ -1936,17 +2271,21 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
error = SET_ERROR(ESRCH);
} else if (lvl == 0) {
dnode_phys_t *dnp = data;
- span = DNODE_SHIFT;
+
ASSERT(dn->dn_type == DMU_OT_DNODE);
+ ASSERT(!(flags & DNODE_FIND_BACKWARDS));
- for (i = (*offset >> span) & (blkfill - 1);
- i >= 0 && i < blkfill; i += inc) {
+ for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+ i < blkfill; i += dnp[i].dn_extra_slots + 1) {
if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
break;
- *offset += (1ULL << span) * inc;
}
- if (i < 0 || i == blkfill)
+
+ if (i == blkfill)
error = SET_ERROR(ESRCH);
+
+ *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+ (i << DNODE_SHIFT);
} else {
blkptr_t *bp = data;
uint64_t start = *offset;
diff --git a/uts/common/fs/zfs/dnode_sync.c b/uts/common/fs/zfs/dnode_sync.c
index 02f263c82e42..a37607e0e307 100644
--- a/uts/common/fs/zfs/dnode_sync.c
+++ b/uts/common/fs/zfs/dnode_sync.c
@@ -553,7 +553,8 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dn->dn_free_txg > 0);
if (dn->dn_allocated_txg != dn->dn_free_txg)
dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
- bzero(dn->dn_phys, sizeof (dnode_phys_t));
+ bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
+ dnode_free_interior_slots(dn);
mutex_enter(&dn->dn_mtx);
dn->dn_type = DMU_OT_NONE;
@@ -561,6 +562,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_have_spill = B_FALSE;
+ dn->dn_num_slots = 1;
mutex_exit(&dn->dn_mtx);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
@@ -587,7 +589,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
ASSERT(dnp->dn_type != DMU_OT_NONE ||
- bcmp(dnp, &zerodn, DNODE_SIZE) == 0);
+ bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0);
DNODE_VERIFY(dn);
ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
@@ -619,6 +621,9 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnp->dn_bonustype = dn->dn_bonustype;
dnp->dn_bonuslen = dn->dn_bonuslen;
}
+
+ dnp->dn_extra_slots = dn->dn_num_slots - 1;
+
ASSERT(dnp->dn_nlevels > 1 ||
BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
@@ -651,7 +656,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnp->dn_bonuslen = 0;
else
dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
- ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
+ ASSERT(dnp->dn_bonuslen <=
+ DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1));
dn->dn_next_bonuslen[txgoff] = 0;
}
@@ -691,7 +697,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
if (kill_spill) {
- free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
+ free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx);
mutex_enter(&dn->dn_mtx);
dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
mutex_exit(&dn->dn_mtx);
@@ -721,6 +727,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
return;
}
+ if (dn->dn_num_slots > DNODE_MIN_SLOTS) {
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ mutex_enter(&ds->ds_lock);
+ ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_DNODE] =
+ B_TRUE;
+ mutex_exit(&ds->ds_lock);
+ }
+
if (dn->dn_next_nlevels[txgoff]) {
dnode_increase_indirection(dn, tx);
dn->dn_next_nlevels[txgoff] = 0;
diff --git a/uts/common/fs/zfs/dsl_scan.c b/uts/common/fs/zfs/dsl_scan.c
index 6fd97d9bfcd6..c19e43bd9fa7 100644
--- a/uts/common/fs/zfs/dsl_scan.c
+++ b/uts/common/fs/zfs/dsl_scan.c
@@ -773,14 +773,18 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
scn->scn_phys.scn_errors++;
return (err);
}
- for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
+ for (i = 0, cdnp = buf->b_data; i < epb;
+ i += cdnp->dn_extra_slots + 1,
+ cdnp += cdnp->dn_extra_slots + 1) {
for (j = 0; j < cdnp->dn_nblkptr; j++) {
blkptr_t *cbp = &cdnp->dn_blkptr[j];
dsl_scan_prefetch(scn, buf, cbp,
zb->zb_objset, zb->zb_blkid * epb + i, j);
}
}
- for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
+ for (i = 0, cdnp = buf->b_data; i < epb;
+ i += cdnp->dn_extra_slots + 1,
+ cdnp += cdnp->dn_extra_slots + 1) {
dsl_scan_visitdnode(scn, ds, ostype,
cdnp, zb->zb_blkid * epb + i, tx);
}
@@ -843,7 +847,7 @@ dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
zbookmark_phys_t czb;
SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
0, DMU_SPILL_BLKID);
- dsl_scan_visitbp(&dnp->dn_spill,
+ dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp),
&czb, dnp, ds, scn, ostype, tx);
}
}
diff --git a/uts/common/fs/zfs/sa.c b/uts/common/fs/zfs/sa.c
index f36483d26531..8cb8199088eb 100644
--- a/uts/common/fs/zfs/sa.c
+++ b/uts/common/fs/zfs/sa.c
@@ -35,6 +35,7 @@
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zap.h>
@@ -543,12 +544,11 @@ sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
*/
static int
sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
- dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total,
- boolean_t *will_spill)
+ dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index,
+ int *total, boolean_t *will_spill)
{
int var_size = 0;
int i;
- int full_space;
int hdrsize;
int extra_hdrsize;
@@ -567,7 +567,6 @@ sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
sizeof (sa_hdr_phys_t);
- full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size;
ASSERT(IS_P2ALIGNED(full_space, 8));
for (i = 0; i != attr_count; i++) {
@@ -653,6 +652,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
int buf_space;
sa_attr_type_t *attrs, *attrs_start;
int i, lot_count;
+ int dnodesize;
int hdrsize;
int spillhdrsize = 0;
int used;
@@ -660,20 +660,24 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
sa_lot_t *lot;
int len_idx;
int spill_used;
+ int bonuslen;
boolean_t spilling;
dmu_buf_will_dirty(hdl->sa_bonus, tx);
bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
+ dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize);
+ bonuslen = DN_BONUS_SIZE(dnodesize);
+
/* first determine bonus header size and sum of all attributes */
hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
- SA_BONUS, &i, &used, &spilling);
+ SA_BONUS, bonuslen, &i, &used, &spilling);
if (used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
- MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) :
+ MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) :
used + hdrsize, tx));
ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
@@ -690,8 +694,8 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
dmu_buf_will_dirty(hdl->sa_spill, tx);
spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
- attr_count - i, hdl->sa_spill, SA_SPILL, &i,
- &spill_used, &dummy);
+ attr_count - i, hdl->sa_spill, SA_SPILL,
+ hdl->sa_spill->db_size, &i, &spill_used, &dummy);
if (spill_used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
diff --git a/uts/common/fs/zfs/spa.c b/uts/common/fs/zfs/spa.c
index b975e5939d16..3ae96d35030f 100644
--- a/uts/common/fs/zfs/spa.c
+++ b/uts/common/fs/zfs/spa.c
@@ -350,6 +350,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
}
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+ DNODE_MAX_SIZE, ZPROP_SRC_NONE);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+ DNODE_MIN_SIZE, ZPROP_SRC_NONE);
+ }
+
if ((dp = list_head(&spa->spa_config_list)) != NULL) {
if (dp->scd_path == NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -577,8 +585,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
/*
* Must be ZPL, and its property settings
- * must be supported by GRUB (compression
- * is not gzip, and large blocks are not used).
+ * must be supported.
*/
if (dmu_objset_type(os) != DMU_OST_ZFS) {
diff --git a/uts/common/fs/zfs/spa_misc.c b/uts/common/fs/zfs/spa_misc.c
index 8ba49fed4155..4481fa52003f 100644
--- a/uts/common/fs/zfs/spa_misc.c
+++ b/uts/common/fs/zfs/spa_misc.c
@@ -990,10 +990,10 @@ spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
/*
* Spares are tracked globally due to the following constraints:
*
- * - A spare may be part of multiple pools.
- * - A spare may be added to a pool even if it's actively in use within
+ * - A spare may be part of multiple pools.
+ * - A spare may be added to a pool even if it's actively in use within
* another pool.
- * - A spare in use in any pool can only be the source of a replacement if
+ * - A spare in use in any pool can only be the source of a replacement if
* the target is a spare in the same pool.
*
* We keep track of all spares on the system through the use of a reference
@@ -2104,6 +2104,15 @@ spa_maxblocksize(spa_t *spa)
return (SPA_OLD_MAXBLOCKSIZE);
}
+int
+spa_maxdnodesize(spa_t *spa)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
+ return (DNODE_MAX_SIZE);
+ else
+ return (DNODE_MIN_SIZE);
+}
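A caller could use this to validate a requested dnode size up front; a hypothetical sketch, not part of this change:

static int
dnodesize_check_example(spa_t *spa, int dnodesize)
{
        if (dnodesize > spa_maxdnodesize(spa))
                return (SET_ERROR(ENOTSUP));
        return (0);
}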
+
/*
* Returns the txg that the last device removal completed. No indirect mappings
* have been added since this txg.
diff --git a/uts/common/fs/zfs/sys/arc.h b/uts/common/fs/zfs/sys/arc.h
index 45db7701e1aa..641ae93e9c23 100644
--- a/uts/common/fs/zfs/sys/arc.h
+++ b/uts/common/fs/zfs/sys/arc.h
@@ -149,6 +149,7 @@ typedef enum arc_space_type {
ARC_SPACE_HDRS,
ARC_SPACE_L2HDRS,
ARC_SPACE_OTHER,
+ ARC_SPACE_BONUS,
ARC_SPACE_NUMTYPES
} arc_space_type_t;
diff --git a/uts/common/fs/zfs/sys/dmu.h b/uts/common/fs/zfs/sys/dmu.h
index 887a5ff7e3b0..535c13fe05c1 100644
--- a/uts/common/fs/zfs/sys/dmu.h
+++ b/uts/common/fs/zfs/sys/dmu.h
@@ -358,6 +358,15 @@ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len,
+ int dnodesize, dmu_tx_t *tx);
+int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonus_type, int bonus_len,
+ int dnodesize, dmu_tx_t *tx);
+int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object,
+ dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype,
+ int bonuslen, int dnodesize, dmu_tx_t *txp);
int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
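For illustration, allocating an object with a 1K dnode through the new interface might look like this sketch; the helper and the chosen values are hypothetical, not taken from this commit:

static uint64_t
alloc_1k_dnode_example(objset_t *os, dmu_tx_t *tx)
{
        /* Two 512-byte slots; size the SA bonus buffer to match. */
        return (dmu_object_alloc_dnsize(os, DMU_OT_PLAIN_FILE_CONTENTS,
            0, DMU_OT_SA, DN_BONUS_SIZE(1024), 1024, tx));
}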
@@ -804,7 +813,8 @@ typedef struct dmu_object_info {
uint8_t doi_checksum;
uint8_t doi_compress;
uint8_t doi_nblkptr;
- uint8_t doi_pad[4];
+ int8_t doi_pad[4];
+ uint64_t doi_dnodesize;
uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
uint64_t doi_max_offset;
uint64_t doi_fill_count; /* number of non-empty blocks */
@@ -846,6 +856,8 @@ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
u_longlong_t *nblk512);
+void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize);
+
typedef struct dmu_objset_stats {
uint64_t dds_num_clones; /* number of clones of this */
uint64_t dds_creation_txg;
@@ -903,6 +915,7 @@ extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
extern void dmu_objset_name(objset_t *os, char *buf);
extern dmu_objset_type_t dmu_objset_type(objset_t *os);
extern uint64_t dmu_objset_id(objset_t *os);
+extern uint64_t dmu_objset_dnodesize(objset_t *os);
extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
diff --git a/uts/common/fs/zfs/sys/dmu_objset.h b/uts/common/fs/zfs/sys/dmu_objset.h
index 25ff8642177d..3028f0436566 100644
--- a/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/uts/common/fs/zfs/sys/dmu_objset.h
@@ -91,6 +91,7 @@ struct objset {
list_node_t os_evicting_node;
/* can change, under dsl_dir's locks: */
+ uint64_t os_dnodesize; /* default dnode size for new objects */
enum zio_checksum os_checksum;
enum zio_compress os_compress;
uint8_t os_copies;
@@ -129,7 +130,11 @@ struct objset {
/* Protected by os_obj_lock */
kmutex_t os_obj_lock;
- uint64_t os_obj_next;
+ uint64_t os_obj_next_chunk;
+
+ /* Per-CPU next object to allocate, protected by atomic ops. */
+ uint64_t *os_obj_next_percpu;
+ int os_obj_next_percpu_len;
/* Protected by os_lock */
kmutex_t os_lock;
diff --git a/uts/common/fs/zfs/sys/dnode.h b/uts/common/fs/zfs/sys/dnode.h
index 89a7b2ef60e4..68872a8e9e88 100644
--- a/uts/common/fs/zfs/sys/dnode.h
+++ b/uts/common/fs/zfs/sys/dnode.h
@@ -86,12 +86,26 @@ extern "C" {
/*
* Derived constants.
*/
-#define DNODE_SIZE (1 << DNODE_SHIFT)
-#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
-#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
-#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
-#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1)
-#define DN_KILL_SPILLBLK (1)
+#define DNODE_MIN_SIZE (1 << DNODE_SHIFT)
+#define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT)
+#define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT)
+#define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT)
+#define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT)
+#define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \
+ (1 << SPA_BLKPTRSHIFT))
+#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT)
+#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE))
+#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
+#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
+#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1)
+#define DN_KILL_SPILLBLK (1)
+
+#define DN_SLOT_UNINIT ((void *)NULL) /* Uninitialized */
+#define DN_SLOT_FREE ((void *)1UL) /* Free slot */
+#define DN_SLOT_ALLOCATED ((void *)2UL) /* Allocated slot */
+#define DN_SLOT_INTERIOR ((void *)3UL) /* Interior allocated slot */
+#define DN_SLOT_IS_PTR(dn) ((void *)dn > DN_SLOT_INTERIOR)
+#define DN_SLOT_IS_VALID(dn) ((void *)dn != NULL)
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
@@ -109,6 +123,10 @@ extern "C" {
#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
(((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
+#define DN_MAX_BONUS_LEN(dnp) \
+ ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? \
+ (uint8_t *)DN_SPILL_BLKPTR(dnp) - (uint8_t *)DN_BONUS(dnp) : \
+ (uint8_t *)(dnp + (dnp->dn_extra_slots + 1)) - (uint8_t *)DN_BONUS(dnp))
#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
(dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
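To make the derived constants concrete, here are a few compile-time checks as a sketch; the numeric values assume the current on-disk constants (DNODE_SHIFT = 9, DNODE_CORE_SIZE = 64, SPA_BLKPTRSHIFT = 7):

CTASSERT(DN_OLD_MAX_BONUSLEN == 320);     /* 512 - 64 core - 128 blkptr */
CTASSERT(DN_BONUS_SIZE(1024) == 832);     /* 1K dnode: 1024 - 64 - 128 */
CTASSERT(DN_SLOTS_TO_BONUSLEN(2) == 832); /* two slots == a 1K dnode */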
@@ -132,6 +150,57 @@ enum dnode_dirtycontext {
/* Does dnode have a SA spill blkptr in bonus? */
#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
+/*
+ * VARIABLE-LENGTH (LARGE) DNODES
+ *
+ * The motivation for variable-length dnodes is to eliminate the overhead
+ * associated with using spill blocks. Spill blocks are used to store
+ * system attribute data (i.e. file metadata) that does not fit in the
+ * dnode's bonus buffer. By allowing a larger bonus buffer area the use of
+ * a spill block can be avoided. Spill blocks potentially incur an
+ * additional read I/O for every dnode in a dnode block. As a worst case
+ * example, reading 32 dnodes from a 16k dnode block and all of the spill
+ * blocks could issue 33 separate reads. Now suppose those dnodes have size
+ * 1024 and therefore don't need spill blocks. Then the worst case number
+ * of blocks read is reduced from 33 to two--one per dnode block.
+ *
+ * ZFS-on-Linux systems that make heavy use of extended attributes benefit
+ * from this feature. In particular, ZFS-on-Linux supports the xattr=sa
+ * dataset property which allows file extended attribute data to be stored
+ * in the dnode bonus buffer as an alternative to the traditional
+ * directory-based format. Workloads such as SELinux and the Lustre
+ * distributed filesystem often store enough xattr data to force spill
+ * blocks when xattr=sa is in effect. Large dnodes may therefore provide a
+ * performance benefit to such systems. Other use cases that benefit from
+ * this feature include files with large ACLs and symbolic links with long
+ * target names.
+ *
+ * The size of a dnode may be a multiple of 512 bytes up to the size of a
+ * dnode block (currently 16384 bytes). The dn_extra_slots field of the
+ * on-disk dnode_phys_t structure describes the size of the physical dnode
+ * on disk. The field represents how many "extra" dnode_phys_t slots a
+ * dnode consumes in its dnode block. This convention yields a value of
+ * 0 for 512-byte dnodes, which preserves on-disk format compatibility
+ * with older software that doesn't support large dnodes.
+ *
+ * Similarly, the in-memory dnode_t structure has a dn_num_slots field
+ * to represent the total number of dnode_phys_t slots consumed on disk.
+ * Thus dn->dn_num_slots is 1 greater than the corresponding
+ * dnp->dn_extra_slots. This difference in convention was adopted
+ * because, unlike on-disk structures, backward compatibility is not a
+ * concern for in-memory objects, so we used a more natural way to
+ * represent size for a dnode_t.
+ *
+ * The default size for newly created dnodes is determined by the value of
+ * the "dnodesize" dataset property. By default the property is set to
+ * "legacy" which is compatible with older software. Setting the property
+ * to "auto" will allow the filesystem to choose the most suitable dnode
+ * size. Currently this just sets the default dnode size to 1k, but future
+ * code improvements could dynamically choose a size based on observed
+ * workload patterns. Dnodes of varying sizes can coexist within the same
+ * dataset and even within the same dnode block.
+ */
+
typedef struct dnode_phys {
uint8_t dn_type; /* dmu_object_type_t */
uint8_t dn_indblkshift; /* ln2(indirect block size) */
@@ -143,19 +212,32 @@ typedef struct dnode_phys {
uint8_t dn_flags; /* DNODE_FLAG_* */
uint16_t dn_datablkszsec; /* data block size in 512b sectors */
uint16_t dn_bonuslen; /* length of dn_bonus */
- uint8_t dn_pad2[4];
+ uint8_t dn_extra_slots; /* # of subsequent slots consumed */
+ uint8_t dn_pad2[3];
/* accounting is protected by dn_dirty_mtx */
uint64_t dn_maxblkid; /* largest allocated block ID */
uint64_t dn_used; /* bytes (or sectors) of disk space */
uint64_t dn_pad3[4];
-
- blkptr_t dn_blkptr[1];
- uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
- blkptr_t dn_spill;
+ union {
+ blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)];
+ struct {
+ blkptr_t __dn_ignore1;
+ uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN];
+ };
+ struct {
+ blkptr_t __dn_ignore2;
+ uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN -
+ sizeof (blkptr_t)];
+ blkptr_t dn_spill;
+ };
+ };
} dnode_phys_t;
+#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \
+ (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT))
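As a sketch of what this arithmetic means for a hypothetical 2-slot (1K) dnode, using the same constants as above:

CTASSERT(sizeof (dnode_phys_t) == DNODE_MIN_SIZE);

/*
 * Sketch: the spill block pointer of a 2-slot dnode lands at byte
 * offset (2 << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT) = 1024 - 128 = 896,
 * i.e. in the last 128 bytes of its second slot.
 */
static void
spill_offset_example(dnode_phys_t *dnp)
{
        ASSERT3U(dnp->dn_extra_slots, ==, 1);   /* hypothetical 1K dnode */
        ASSERT3U((char *)DN_SPILL_BLKPTR(dnp) - (char *)dnp, ==, 896);
}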
+
struct dnode {
/*
* Protects the structure of the dnode, including the number of levels
@@ -192,6 +274,7 @@ struct dnode {
uint32_t dn_datablksz; /* in bytes */
uint64_t dn_maxblkid;
uint8_t dn_next_type[TXG_SIZE];
+ uint8_t dn_num_slots; /* metadnode slots consumed on disk */
uint8_t dn_next_nblkptr[TXG_SIZE];
uint8_t dn_next_nlevels[TXG_SIZE];
uint8_t dn_next_indblkshift[TXG_SIZE];
@@ -287,7 +370,7 @@ void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
int dnode_hold(struct objset *dd, uint64_t object,
void *ref, dnode_t **dnp);
-int dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
+int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots,
void *ref, dnode_t **dnp);
boolean_t dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
@@ -295,9 +378,9 @@ void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx);
void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx);
void dnode_free(dnode_t *dn, dmu_tx_t *tx);
void dnode_byteswap(dnode_phys_t *dnp);
void dnode_buf_byteswap(void *buf, size_t size);
@@ -313,6 +396,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
int minlvl, uint64_t blkfill, uint64_t txg);
void dnode_evict_dbufs(dnode_t *dn);
void dnode_evict_bonus(dnode_t *dn);
+void dnode_free_interior_slots(dnode_t *dn);
boolean_t dnode_needs_remap(const dnode_t *dn);
#define DNODE_IS_CACHEABLE(_dn) \
@@ -324,6 +408,140 @@ boolean_t dnode_needs_remap(const dnode_t *dn);
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
+/*
+ * Used for dnodestats kstat.
+ */
+typedef struct dnode_stats {
+ /*
+ * Number of failed attempts to hold a meta dnode dbuf.
+ */
+ kstat_named_t dnode_hold_dbuf_hold;
+ /*
+ * Number of failed attempts to read a meta dnode dbuf.
+ */
+ kstat_named_t dnode_hold_dbuf_read;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able
+ * to hold the requested object number which was allocated. This is
+ * the common case when looking up any allocated object number.
+ */
+ kstat_named_t dnode_hold_alloc_hits;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
+ * able to hold the requested object number because it was not allocated.
+ */
+ kstat_named_t dnode_hold_alloc_misses;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
+ * able to hold the requested object number because the object number
+ * refers to an interior large dnode slot.
+ */
+ kstat_named_t dnode_hold_alloc_interior;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed
+ * to retry acquiring slot zrl locks due to contention.
+ */
+ kstat_named_t dnode_hold_alloc_lock_retry;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not
+ * need to create the dnode because another thread did so after
+ * dropping the read lock but before acquiring the write lock.
+ */
+ kstat_named_t dnode_hold_alloc_lock_misses;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found
+ * a free dnode instantiated by dnode_create() but not yet allocated
+ * by dnode_allocate().
+ */
+ kstat_named_t dnode_hold_alloc_type_none;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able
+ * to hold the requested range of free dnode slots.
+ */
+ kstat_named_t dnode_hold_free_hits;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
+ * able to hold the requested range of free dnode slots because
+ * at least one slot was allocated.
+ */
+ kstat_named_t dnode_hold_free_misses;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
+ * able to hold the requested range of free dnode slots because
+ * after acquiring the zrl lock at least one slot was allocated.
+ */
+ kstat_named_t dnode_hold_free_lock_misses;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed
+ * to retry acquiring slot zrl locks due to contention.
+ */
+ kstat_named_t dnode_hold_free_lock_retry;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
+ * a range of dnode slots which were held by another thread.
+ */
+ kstat_named_t dnode_hold_free_refcount;
+ /*
+ * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
+ * a range of dnode slots which would extend past the end of the
+ * dnode block.
+ */
+ kstat_named_t dnode_hold_free_overflow;
+ /*
+ * Number of times a dnode_hold(...) was attempted on a dnode
+ * which had already been unlinked in an earlier txg.
+ */
+ kstat_named_t dnode_hold_free_txg;
+ /*
+ * Number of times dnode_free_interior_slots() needed to retry
+ * acquiring a slot zrl lock due to contention.
+ */
+ kstat_named_t dnode_free_interior_lock_retry;
+ /*
+ * Number of new dnodes allocated by dnode_allocate().
+ */
+ kstat_named_t dnode_allocate;
+ /*
+ * Number of dnodes re-allocated by dnode_reallocate().
+ */
+ kstat_named_t dnode_reallocate;
+ /*
+ * Number of meta dnode dbufs evicted.
+ */
+ kstat_named_t dnode_buf_evict;
+ /*
+ * Number of times dmu_object_alloc*() reached the end of the existing
+ * object ID chunk and advanced to a new one.
+ */
+ kstat_named_t dnode_alloc_next_chunk;
+ /*
+ * Number of times multiple threads attempted to allocate a dnode
+ * from the same block of free dnodes.
+ */
+ kstat_named_t dnode_alloc_race;
+ /*
+ * Number of times dmu_object_alloc*() was forced to advance to the
+ * next meta dnode dbuf due to an error from dmu_object_next().
+ */
+ kstat_named_t dnode_alloc_next_block;
+ /*
+ * Statistics for tracking dnodes which have been moved.
+ */
+ kstat_named_t dnode_move_invalid;
+ kstat_named_t dnode_move_recheck1;
+ kstat_named_t dnode_move_recheck2;
+ kstat_named_t dnode_move_special;
+ kstat_named_t dnode_move_handle;
+ kstat_named_t dnode_move_rwlock;
+ kstat_named_t dnode_move_active;
+} dnode_stats_t;
+
+extern dnode_stats_t dnode_stats;
+
+#define DNODE_STAT_INCR(stat, val) \
+ atomic_add_64(&dnode_stats.stat.value.ui64, (val));
+#define DNODE_STAT_BUMP(stat) \
+ DNODE_STAT_INCR(stat, 1);
+
#ifdef ZFS_DEBUG
/*
diff --git a/uts/common/fs/zfs/sys/dsl_dataset.h b/uts/common/fs/zfs/sys/dsl_dataset.h
index 03dca17bee6d..15a64a832630 100644
--- a/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -87,6 +87,13 @@ struct dsl_pool;
#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
/*
+ * This field is present (with value=0) if this dataset may contain large
+ * dnodes (>512B). If it is present, then this dataset is counted in the
+ * refcount of the SPA_FEATURE_LARGE_DNODE feature.
+ */
+#define DS_FIELD_LARGE_DNODE "org.zfsonlinux:large_dnode"
+
+/*
* These fields are set on datasets that are in the middle of a resumable
* receive, and allow the sender to resume the send if it is interrupted.
*/
diff --git a/uts/common/fs/zfs/sys/sa_impl.h b/uts/common/fs/zfs/sys/sa_impl.h
index 50874c6bf083..4bea074b545f 100644
--- a/uts/common/fs/zfs/sys/sa_impl.h
+++ b/uts/common/fs/zfs/sys/sa_impl.h
@@ -101,7 +101,7 @@ typedef struct sa_lot {
sa_attr_type_t *lot_attrs; /* array of attr #'s */
uint32_t lot_var_sizes; /* how many aren't fixed size */
uint32_t lot_attr_count; /* total attr count */
- list_t lot_idx_tab; /* should be only a couple of entries */
+ list_t lot_idx_tab; /* should be only a couple of entries */
int lot_instance; /* used with lot_hash to identify entry */
} sa_lot_t;
@@ -134,7 +134,7 @@ typedef struct sa_idx_tab {
* adding a completely new attribute is a very rare operation.
*/
struct sa_os {
- kmutex_t sa_lock;
+ kmutex_t sa_lock;
boolean_t sa_need_attr_registration;
boolean_t sa_force_spill;
uint64_t sa_master_obj;
@@ -237,7 +237,7 @@ struct sa_handle {
#define SA_BONUSTYPE_FROM_DB(db) \
(dmu_get_bonustype((dmu_buf_t *)db))
-#define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t))
+#define SA_BLKPTR_SPACE (DN_OLD_MAX_BONUSLEN - sizeof (blkptr_t))
#define SA_LAYOUT_NUM(x, type) \
((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \
diff --git a/uts/common/fs/zfs/sys/spa.h b/uts/common/fs/zfs/sys/spa.h
index dc5da8fd778d..3ecec3df3956 100644
--- a/uts/common/fs/zfs/sys/spa.h
+++ b/uts/common/fs/zfs/sys/spa.h
@@ -843,6 +843,7 @@ extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
extern boolean_t spa_has_pending_synctask(spa_t *spa);
extern int spa_maxblocksize(spa_t *spa);
+extern int spa_maxdnodesize(spa_t *spa);
extern boolean_t spa_has_checkpoint(spa_t *spa);
extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa);
extern boolean_t spa_suspend_async_destroy(spa_t *spa);
diff --git a/uts/common/fs/zfs/sys/zap.h b/uts/common/fs/zfs/sys/zap.h
index 10cb6b449bde..2c909ff53aa2 100644
--- a/uts/common/fs/zfs/sys/zap.h
+++ b/uts/common/fs/zfs/sys/zap.h
@@ -115,16 +115,30 @@ typedef enum zap_flags {
/*
* Create a new zapobj with no attributes and return its object number.
+ *
+ * dnodesize specifies the on-disk size of the dnode for the new zapobj.
+ * Valid values are multiples of 512 up to DNODE_MAX_SIZE.
*/
uint64_t zap_create(objset_t *ds, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags,
+ dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx);
uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_flags_dnsize(objset_t *os, int normflags,
+ zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx);
uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot,
uint64_t parent_obj, const char *name, dmu_tx_t *tx);
+uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot,
+ uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx);
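For illustration, creating a ZAP object with a 1K dnode could look like the sketch below; names and values are hypothetical, and passing dnodesize = 0 preserves legacy sizing, as the non-_dnsize wrappers do:

static uint64_t
zap_1k_dnode_example(objset_t *os, dmu_tx_t *tx)
{
        return (zap_create_dnsize(os, DMU_OT_DIRECTORY_CONTENTS,
            DMU_OT_NONE, 0, 1024, tx));
}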
/*
* Initialize an already-allocated object.
@@ -138,9 +152,14 @@ void mzap_create_impl(objset_t *os, uint64_t obj, int normflags,
*/
int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
int zap_create_claim_norm(objset_t *ds, uint64_t obj,
int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj,
+ int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx);
/*
* The zapobj passed in must be a valid ZAP object for all of the
diff --git a/uts/common/fs/zfs/sys/zfs_ioctl.h b/uts/common/fs/zfs/sys/zfs_ioctl.h
index f3df29218d41..824d1d8bb70f 100644
--- a/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -93,7 +93,7 @@ typedef enum drr_headertype {
#define DMU_BACKUP_FEATURE_RESUMING (1 << 20)
/* flag #21 is reserved for a Delphix feature */
#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22)
-/* flag #23 is reserved for the large dnode feature */
+#define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23)
/* flag #24 is reserved for the raw send feature */
/* flag #25 is reserved for the ZSTD compression feature */
@@ -104,7 +104,7 @@ typedef enum drr_headertype {
DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \
DMU_BACKUP_FEATURE_RESUMING | \
- DMU_BACKUP_FEATURE_LARGE_BLOCKS | \
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS | DMU_BACKUP_FEATURE_LARGE_DNODE | \
DMU_BACKUP_FEATURE_COMPRESSED)
/* Are all features in the given flag word currently supported? */
@@ -120,7 +120,7 @@ typedef enum dmu_send_resume_token_version {
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * | reserved | feature-flags |C|S|
+ * | reserved | feature-flags |C|S|
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* The low order two bits indicate the header type: SUBSTREAM (0x1)
@@ -197,7 +197,8 @@ typedef struct dmu_replay_record {
uint32_t drr_bonuslen;
uint8_t drr_checksumtype;
uint8_t drr_compress;
- uint8_t drr_pad[6];
+ uint8_t drr_dn_slots;
+ uint8_t drr_pad[5];
uint64_t drr_toguid;
/* bonus content follows */
} drr_object;
diff --git a/uts/common/fs/zfs/sys/zfs_znode.h b/uts/common/fs/zfs/sys/zfs_znode.h
index 8c4f8f7dc850..a9f9876530bd 100644
--- a/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/uts/common/fs/zfs/sys/zfs_znode.h
@@ -185,6 +185,7 @@ typedef struct znode {
uint_t z_blksz; /* block size in bytes */
uint_t z_seq; /* modification sequence number */
uint64_t z_mapcnt; /* number of pages mapped to file */
+ uint64_t z_dnodesize; /* dnode size */
uint64_t z_gen; /* generation (cached) */
uint64_t z_size; /* file size (cached) */
uint64_t z_atime[2]; /* atime (cached) */
diff --git a/uts/common/fs/zfs/sys/zil.h b/uts/common/fs/zfs/sys/zil.h
index b1567acd4add..e6b18da95be8 100644
--- a/uts/common/fs/zfs/sys/zil.h
+++ b/uts/common/fs/zfs/sys/zil.h
@@ -157,7 +157,7 @@ typedef enum zil_create {
#define TX_ACL 13 /* Set ACL */
#define TX_CREATE_ACL 14 /* create with ACL */
#define TX_CREATE_ATTR 15 /* create + attrs */
-#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */
+#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */
#define TX_MKDIR_ACL 17 /* mkdir with ACL */
#define TX_MKDIR_ATTR 18 /* mkdir with attr */
#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
@@ -185,6 +185,19 @@ typedef enum zil_create {
(txtype) == TX_WRITE2)
/*
+ * The number of dnode slots consumed by the object is stored in the 8
+ * unused upper bits of the object ID. The value stored on disk is the
+ * slot count minus 1, so a single-slot dnode stores 0 in those bits.
+ * This preserves the log record format for "small" dnodes and keeps it
+ * compatible with implementations that don't support large dnodes.
+ */
+#define LR_FOID_GET_SLOTS(oid) (BF64_GET((oid), 56, 8) + 1)
+#define LR_FOID_SET_SLOTS(oid, x) BF64_SET((oid), 56, 8, (x) - 1)
+#define LR_FOID_GET_OBJ(oid) BF64_GET((oid), 0, DN_MAX_OBJECT_SHIFT)
+#define LR_FOID_SET_OBJ(oid, x) BF64_SET((oid), 0, DN_MAX_OBJECT_SHIFT, (x))
+
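A sketch of the encoding round trip, with a hypothetical object number:

static void
lr_foid_example(void)
{
        uint64_t foid = 0;

        LR_FOID_SET_OBJ(foid, 12345ULL);
        LR_FOID_SET_SLOTS(foid, 2);     /* a 1K dnode consumes 2 slots */
        ASSERT3U(LR_FOID_GET_OBJ(foid), ==, 12345ULL);
        ASSERT3U(LR_FOID_GET_SLOTS(foid), ==, 2);
}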
+/*
* Format of log records.
* The fields are carefully defined to allow them to be aligned
* and sized the same on sparc & intel architectures.
@@ -422,7 +435,7 @@ extern void zil_commit_impl(zilog_t *zilog, uint64_t oid);
extern int zil_reset(const char *osname, void *txarg);
extern int zil_claim(struct dsl_pool *dp,
struct dsl_dataset *ds, void *txarg);
-extern int zil_check_log_chain(struct dsl_pool *dp,
+extern int zil_check_log_chain(struct dsl_pool *dp,
struct dsl_dataset *ds, void *tx);
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
diff --git a/uts/common/fs/zfs/zap.c b/uts/common/fs/zfs/zap.c
index e9ed41e556ea..7a1994f603c1 100644
--- a/uts/common/fs/zfs/zap.c
+++ b/uts/common/fs/zfs/zap.c
@@ -948,8 +948,17 @@ uint64_t
zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
const char *name, dmu_tx_t *tx)
{
- uint64_t new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx);
- VERIFY(new_obj != 0);
+ return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx));
+}
+
+uint64_t
+zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
+ const char *name, int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t new_obj;
+
+ VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0,
+ dnodesize, tx)) > 0);
VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
tx));
diff --git a/uts/common/fs/zfs/zap_micro.c b/uts/common/fs/zfs/zap_micro.c
index b07079ed4408..d093fe1e736e 100644
--- a/uts/common/fs/zfs/zap_micro.c
+++ b/uts/common/fs/zfs/zap_micro.c
@@ -693,8 +693,16 @@ int
zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- return (zap_create_claim_norm(os, obj,
- 0, ot, bonustype, bonuslen, tx));
+ return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
+ 0, tx));
+}
+
+int
+zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_claim_norm_dnsize(os, obj,
+ 0, ot, bonustype, bonuslen, dnodesize, tx));
}
int
@@ -702,8 +710,19 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
- int err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
+ return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
+ dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
+ int err;
+
+ err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
if (err != 0)
return (err);
mzap_create_impl(os, obj, normflags, 0, tx);
@@ -718,11 +737,28 @@ zap_create(objset_t *os, dmu_object_type_t ot,
}
uint64_t
+zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
+ dnodesize, tx));
+}
+
+uint64_t
zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
- uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+ return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
+ 0, tx));
+}
+
+uint64_t
+zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
mzap_create_impl(os, obj, normflags, 0, tx);
return (obj);
@@ -734,7 +770,17 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
- uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
+ return (zap_create_flags_dnsize(os, normflags, flags, ot,
+ leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
+}
+
+uint64_t
+zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
diff --git a/uts/common/fs/zfs/zfs_acl.c b/uts/common/fs/zfs/zfs_acl.c
index 5906db36567e..149103206a8e 100644
--- a/uts/common/fs/zfs/zfs_acl.c
+++ b/uts/common/fs/zfs/zfs_acl.c
@@ -895,7 +895,7 @@ zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
int entry_type;
mode_t mode;
mode_t seen = 0;
- zfs_ace_hdr_t *acep = NULL;
+ zfs_ace_hdr_t *acep = NULL;
uint64_t who;
uint16_t iflags, type;
uint32_t access_mask;
@@ -1262,7 +1262,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
otype == DMU_OT_ACL ?
DMU_OT_SYSACL : DMU_OT_NONE,
otype == DMU_OT_ACL ?
- DN_MAX_BONUSLEN : 0, tx);
+ DN_OLD_MAX_BONUSLEN : 0, tx);
} else {
(void) dmu_object_set_blocksize(zfsvfs->z_os,
aoid, aclp->z_acl_bytes, 0, tx);
@@ -1337,12 +1337,12 @@ zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim,
uint64_t who;
int new_count, new_bytes;
int ace_size;
- int entry_type;
+ int entry_type;
uint16_t iflags, type;
uint32_t access_mask;
zfs_acl_node_t *newnode;
- size_t abstract_size = aclp->z_ops.ace_abstract_size();
- void *zacep;
+ size_t abstract_size = aclp->z_ops.ace_abstract_size();
+ void *zacep;
boolean_t isdir;
trivial_acl_t masks;
@@ -1786,7 +1786,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
zfs_acl_t *aclp;
ulong_t mask;
int error;
- int count = 0;
+ int count = 0;
int largeace = 0;
mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
@@ -2107,7 +2107,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
zfs_acl_t *aclp;
int error;
uid_t uid = crgetuid(cr);
- uint64_t who;
+ uint64_t who;
uint16_t type, iflags;
uint16_t entry_type;
uint32_t access_mask;
@@ -2380,9 +2380,9 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
uint32_t working_mode;
int error;
int is_attr;
- boolean_t check_privs;
+ boolean_t check_privs;
znode_t *xzp;
- znode_t *check_zp = zp;
+ znode_t *check_zp = zp;
mode_t needed_bits;
uid_t owner;
diff --git a/uts/common/fs/zfs/zfs_ioctl.c b/uts/common/fs/zfs/zfs_ioctl.c
index 87435f18ac1a..833cc26302ab 100644
--- a/uts/common/fs/zfs/zfs_ioctl.c
+++ b/uts/common/fs/zfs/zfs_ioctl.c
@@ -4055,6 +4055,24 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
}
break;
+ case ZFS_PROP_DNODESIZE:
+ /* Dnode sizes above 512 need the feature to be enabled */
+ if (nvpair_value_uint64(pair, &intval) == 0 &&
+ intval != ZFS_DNSIZE_LEGACY) {
+ spa_t *spa;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LARGE_DNODE)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
return (SET_ERROR(ENOTSUP));
diff --git a/uts/common/fs/zfs/zfs_log.c b/uts/common/fs/zfs/zfs_log.c
index fbac2d99c289..1afaa8434bef 100644
--- a/uts/common/fs/zfs/zfs_log.c
+++ b/uts/common/fs/zfs/zfs_log.c
@@ -280,6 +280,8 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
lr = (lr_create_t *)&itx->itx_lr;
lr->lr_doid = dzp->z_id;
lr->lr_foid = zp->z_id;
+ /* Store dnode slot count in 8 bits above object id. */
+ LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT);
lr->lr_mode = zp->z_mode;
if (!IS_EPHEMERAL(zp->z_uid)) {
lr->lr_uid = (uint64_t)zp->z_uid;
diff --git a/uts/common/fs/zfs/zfs_replay.c b/uts/common/fs/zfs/zfs_replay.c
index de8d9c10b616..f75ec48cd7a6 100644
--- a/uts/common/fs/zfs/zfs_replay.c
+++ b/uts/common/fs/zfs/zfs_replay.c
@@ -278,6 +278,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
void *fuidstart;
size_t xvatlen = 0;
uint64_t txtype;
+ uint64_t objid;
+ uint64_t dnodesize;
int error;
txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
@@ -303,19 +305,24 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
return (error);
+ objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
+
xva_init(&xva);
zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
- lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
/*
* All forms of zfs create (create, mkdir, mkxattrdir, symlink)
* eventually end up in zfs_mknode(), which assigns the object's
- * creation time and generation number. The generic VOP_CREATE()
- * doesn't have either concept, so we smuggle the values inside
- * the vattr's otherwise unused va_ctime and va_nblocks fields.
+ * creation time, generation number, and dnode size. The generic
+ * zfs_create() has no concept of these attributes, so we smuggle
+ * the values inside the vattr's otherwise unused va_ctime,
+ * va_nblocks, and va_fsid fields.
*/
ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
xva.xva_vattr.va_nblocks = lr->lr_gen;
+ xva.xva_vattr.va_fsid = dnodesize;
error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
if (error != ENOENT)
@@ -432,21 +439,26 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
return (error);
+ uint64_t objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ int dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
+
xva_init(&xva);
zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
- lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, lr->lr_foid);
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
/*
* All forms of zfs create (create, mkdir, mkxattrdir, symlink)
* eventually end up in zfs_mknode(), which assigns the object's
- * creation time and generation number. The generic VOP_CREATE()
- * doesn't have either concept, so we smuggle the values inside
- * the vattr's otherwise unused va_ctime and va_nblocks fields.
+ * creation time, generation number, and dnode slot count. The
+ * generic zfs_create() has no concept of these attributes, so
+ * we smuggle the values inside the vattr's otherwise unused
+ * va_ctime, va_nblocks and va_fsid fields.
*/
ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
xva.xva_vattr.va_nblocks = lr->lr_gen;
+ xva.xva_vattr.va_fsid = dnodesize;
- error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
+ error = dmu_object_info(zfsvfs->z_os, objid, NULL);
if (error != ENOENT)
goto out;
diff --git a/uts/common/fs/zfs/zfs_sa.c b/uts/common/fs/zfs/zfs_sa.c
index 3a472aa11a45..a39cff1a7b9b 100644
--- a/uts/common/fs/zfs/zfs_sa.c
+++ b/uts/common/fs/zfs/zfs_sa.c
@@ -97,8 +97,7 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) {
- VERIFY(dmu_set_bonus(db,
- len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0);
+ VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx));
if (len) {
bcopy(link, (caddr_t)db->db_data +
ZFS_OLD_ZNODE_PHYS_SIZE, len);
diff --git a/uts/common/fs/zfs/zfs_znode.c b/uts/common/fs/zfs/zfs_znode.c
index 536216deafe9..b56cb7bd700d 100644
--- a/uts/common/fs/zfs/zfs_znode.c
+++ b/uts/common/fs/zfs/zfs_znode.c
@@ -60,6 +60,7 @@
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
#include <sys/refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
@@ -799,9 +800,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
timestruc_t now;
uint64_t gen, obj;
int bonuslen;
+ int dnodesize;
sa_handle_t *sa_hdl;
dmu_object_type_t obj_type;
- sa_bulk_attr_t sa_attrs[ZPL_END];
+ sa_bulk_attr_t *sa_attrs;
int cnt = 0;
zfs_acl_locator_cb_t locate = { 0 };
@@ -811,15 +813,20 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
obj = vap->va_nodeid;
now = vap->va_ctime; /* see zfs_replay_create() */
gen = vap->va_nblocks; /* ditto */
+ dnodesize = vap->va_fsid; /* ditto */
} else {
obj = 0;
gethrestime(&now);
gen = dmu_tx_get_txg(tx);
+ dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
}
+ if (dnodesize == 0)
+ dnodesize = DNODE_MIN_SIZE;
+
obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
bonuslen = (obj_type == DMU_OT_SA) ?
- DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE;
+ DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
/*
* Create a new DMU object.
@@ -832,28 +839,28 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
*/
if (vap->va_type == VDIR) {
if (zfsvfs->z_replay) {
- VERIFY0(zap_create_claim_norm(zfsvfs->z_os, obj,
+ VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- obj_type, bonuslen, tx));
+ obj_type, bonuslen, dnodesize, tx));
} else {
- obj = zap_create_norm(zfsvfs->z_os,
+ obj = zap_create_norm_dnsize(zfsvfs->z_os,
zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- obj_type, bonuslen, tx);
+ obj_type, bonuslen, dnodesize, tx);
}
} else {
if (zfsvfs->z_replay) {
- VERIFY0(dmu_object_claim(zfsvfs->z_os, obj,
+ VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
- obj_type, bonuslen, tx));
+ obj_type, bonuslen, dnodesize, tx));
} else {
- obj = dmu_object_alloc(zfsvfs->z_os,
+ obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
- obj_type, bonuslen, tx);
+ obj_type, bonuslen, dnodesize, tx);
}
}
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
- VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
+ VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
/*
* If this is the root, fix up the half-initialized parent pointer
@@ -925,6 +932,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
* order for DMU_OT_ZNODE is critical since it needs to be constructed
* in the old znode_phys_t format. Don't change this ordering
*/
+ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
if (obj_type == DMU_OT_ZNODE) {
SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
@@ -950,10 +958,10 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
NULL, &size, 8);
SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
NULL, &gen, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
- &acl_ids->z_fuid, 8);
- SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
- &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
+ NULL, &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
+ NULL, &acl_ids->z_fgid, 8);
SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
NULL, &parent, 8);
SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
@@ -1019,6 +1027,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
(*zpp)->z_pflags = pflags;
(*zpp)->z_mode = mode;
+ (*zpp)->z_dnodesize = dnodesize;
if (vap->va_mask & AT_XVATTR)
zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
@@ -1027,6 +1036,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
}
+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
}
diff --git a/uts/common/fs/zfs/zil.c b/uts/common/fs/zfs/zil.c
index 9ce72d16f6a0..44df4c96f3ce 100644
--- a/uts/common/fs/zfs/zil.c
+++ b/uts/common/fs/zfs/zil.c
@@ -63,9 +63,9 @@
* representation, and the on-disk representation). The on-disk format
* consists of 3 parts:
*
- * - a single, per-dataset, ZIL header; which points to a chain of
- * - zero or more ZIL blocks; each of which contains
- * - zero or more ZIL records
+ * - a single, per-dataset, ZIL header; which points to a chain of
+ * - zero or more ZIL blocks; each of which contains
+ * - zero or more ZIL records
*
* A ZIL record holds the information necessary to replay a single
* system call transaction. A ZIL block can hold many ZIL records, and
@@ -1355,7 +1355,7 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
uint64_t zil_block_buckets[] = {
4096, /* non TX_WRITE */
8192+4096, /* data base */
- 32*1024 + 4096, /* NFS writes */
+ 32*1024 + 4096, /* NFS writes */
UINT64_MAX
};
@@ -1840,7 +1840,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
list_insert_tail(&itxs->i_sync_list, itx);
} else {
avl_tree_t *t = &itxs->i_async_tree;
- uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
+ uint64_t foid =
+ LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
itx_async_node_t *ian;
avl_index_t where;
@@ -3088,7 +3089,8 @@ zil_close(zilog_t *zilog)
if (zilog_is_dirty(zilog))
zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
- VERIFY(!zilog_is_dirty(zilog));
+ if (txg < spa_freeze_txg(zilog->zl_spa))
+ VERIFY(!zilog_is_dirty(zilog));
zilog->zl_get_data = NULL;
@@ -3303,7 +3305,7 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
*/
if (TX_OOO(txtype)) {
error = dmu_object_info(zilog->zl_os,
- ((lr_ooo_t *)lr)->lr_foid, NULL);
+ LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
if (error == ENOENT || error == EEXIST)
return (0);
}
diff --git a/uts/common/sys/fs/zfs.h b/uts/common/sys/fs/zfs.h
index 4c26edaecde6..b0dbfe0f2537 100644
--- a/uts/common/sys/fs/zfs.h
+++ b/uts/common/sys/fs/zfs.h
@@ -148,6 +148,7 @@ typedef enum {
ZFS_PROP_DEDUP,
ZFS_PROP_MLSLABEL,
ZFS_PROP_SYNC,
+ ZFS_PROP_DNODESIZE,
ZFS_PROP_REFRATIO,
ZFS_PROP_WRITTEN,
ZFS_PROP_CLONES,
@@ -211,6 +212,7 @@ typedef enum {
ZPOOL_PROP_BOOTSIZE,
ZPOOL_PROP_CHECKPOINT,
ZPOOL_PROP_TNAME,
+ ZPOOL_PROP_MAXDNODESIZE,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@@ -366,6 +368,16 @@ typedef enum {
} zfs_sync_type_t;
typedef enum {
+ ZFS_DNSIZE_LEGACY = 0,
+ ZFS_DNSIZE_AUTO = 1,
+ ZFS_DNSIZE_1K = 1024,
+ ZFS_DNSIZE_2K = 2048,
+ ZFS_DNSIZE_4K = 4096,
+ ZFS_DNSIZE_8K = 8192,
+ ZFS_DNSIZE_16K = 16384
+} zfs_dnsize_type_t;
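A sketch of how these property values map to byte sizes; the helper is illustrative only, and per the dnode.h comment in this change, "auto" currently selects a 1K default:

static int
zfs_dnsize_bytes_example(uint64_t propval)
{
        switch (propval) {
        case ZFS_DNSIZE_LEGACY:
                return (512);           /* DNODE_MIN_SIZE */
        case ZFS_DNSIZE_AUTO:
                return (1024);          /* current default for "auto" */
        default:
                return ((int)propval);  /* ZFS_DNSIZE_1K..16K are bytes */
        }
}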
+
+typedef enum {
ZFS_REDUNDANT_METADATA_ALL,
ZFS_REDUNDANT_METADATA_MOST
} zfs_redundant_metadata_type_t;