aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--cddl/contrib/opensolaris/cmd/zdb/zdb.c43
-rw-r--r--cddl/contrib/opensolaris/cmd/zdb/zdb_il.c18
-rw-r--r--cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c10
-rw-r--r--cddl/contrib/opensolaris/cmd/ztest/ztest.c56
-rw-r--r--sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c3
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c3
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c261
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c91
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c612
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c3
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c7
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h217
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h4
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h2
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h1
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h4
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h4
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c4
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c6
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c16
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c7
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c10
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c12
28 files changed, 418 insertions, 1002 deletions
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
index f02c2af5ef5f..68d75fcbcdb4 100644
--- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
@@ -2134,8 +2134,7 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
};
static void
-dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header,
- uint64_t *dnode_slots_used)
+dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
{
dmu_buf_t *db = NULL;
dmu_object_info_t doi;
@@ -2155,7 +2154,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header,
CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ);
if (*print_header) {
- (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n",
+ (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n",
"Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
"lsize", "%full", "type");
*print_header = 0;
@@ -2174,9 +2173,6 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header,
}
dmu_object_info_from_dnode(dn, &doi);
- if (dnode_slots_used != NULL)
- *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
-
zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
@@ -2199,9 +2195,8 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header,
ZDB_COMPRESS_NAME(doi.doi_compress));
}
- (void) printf("%10" PRIu64
- " %3u %5s %5s %5s %5s %5s %6s %s%s\n",
- object, doi.doi_indirection, iblk, dblk,
+ (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n",
+ (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
asize, dnsize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
@@ -2310,9 +2305,6 @@ dump_dir(objset_t *os)
int print_header = 1;
unsigned i;
int error;
- uint64_t total_slots_used = 0;
- uint64_t max_slot_used = 0;
- uint64_t dnode_slots;
/* make sure nicenum has enough space */
CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ);
@@ -2357,7 +2349,7 @@ dump_dir(objset_t *os)
if (zopt_objects != 0) {
for (i = 0; i < zopt_objects; i++)
dump_object(os, zopt_object[i], verbosity,
- &print_header, NULL);
+ &print_header);
(void) printf("\n");
return;
}
@@ -2382,37 +2374,22 @@ dump_dir(objset_t *os)
if (BP_IS_HOLE(os->os_rootbp))
return;
- dump_object(os, 0, verbosity, &print_header, NULL);
+ dump_object(os, 0, verbosity, &print_header);
object_count = 0;
if (DMU_USERUSED_DNODE(os) != NULL &&
DMU_USERUSED_DNODE(os)->dn_type != 0) {
- dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
- NULL);
- dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
- NULL);
+ dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
+ dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
}
object = 0;
while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
- dump_object(os, object, verbosity, &print_header, &dnode_slots);
+ dump_object(os, object, verbosity, &print_header);
object_count++;
- total_slots_used += dnode_slots;
- max_slot_used = object + dnode_slots - 1;
}
(void) printf("\n");
- (void) printf(" Dnode slots:\n");
- (void) printf("\tTotal used: %10llu\n",
- (u_longlong_t)total_slots_used);
- (void) printf("\tMax used: %10llu\n",
- (u_longlong_t)max_slot_used);
- (void) printf("\tPercent empty: %10lf\n",
- (double)(max_slot_used - total_slots_used)*100 /
- (double)max_slot_used);
-
- (void) printf("\n");
-
if (error != ESRCH) {
(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
abort();
@@ -2604,7 +2581,7 @@ dump_path_impl(objset_t *os, uint64_t obj, char *name)
return (dump_path_impl(os, child_obj, s + 1));
/*FALLTHROUGH*/
case DMU_OT_PLAIN_FILE_CONTENTS:
- dump_object(os, child_obj, dump_opt['v'], &header, NULL);
+ dump_object(os, child_obj, dump_opt['v'], &header);
return (0);
default:
(void) fprintf(stderr, "object %llu has non-file/directory "
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c b/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
index 9f3f23f82da1..75b0cd91d262 100644
--- a/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb_il.c
@@ -84,15 +84,15 @@ zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg)
}
(void) printf("%s%s", tab_prefix, ctime(&crtime));
- (void) printf("%sdoid %" PRIu64 ", foid %" PRIu64 ", slots %" PRIu64
- ", mode %" PRIo64 "\n",
- tab_prefix, lr->lr_doid,
- (uint64_t)LR_FOID_GET_OBJ(lr->lr_foid),
- (uint64_t)LR_FOID_GET_SLOTS(lr->lr_foid),
- lr->lr_mode);
- (void) printf("%suid %" PRIu64 ", gid %" PRIu64 ", gen %" PRIu64
- ", rdev %#" PRIx64 "\n",
- tab_prefix, lr->lr_uid, lr->lr_gid, lr->lr_gen, lr->lr_rdev);
+ (void) printf("%sdoid %llu, foid %llu, slots %llu, mode %llo\n", tab_prefix,
+ (u_longlong_t)lr->lr_doid,
+ (u_longlong_t)LR_FOID_GET_OBJ(lr->lr_foid),
+ (u_longlong_t)LR_FOID_GET_SLOTS(lr->lr_foid),
+ (longlong_t)lr->lr_mode);
+ (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n",
+ tab_prefix,
+ (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
+ (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev);
}
/* ARGSUSED */
diff --git a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
index 51c4c8e0e649..54edb566ad2f 100644
--- a/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
+++ b/cddl/contrib/opensolaris/cmd/zstreamdump/zstreamdump.c
@@ -416,15 +416,13 @@ main(int argc, char *argv[])
drro->drr_toguid = BSWAP_64(drro->drr_toguid);
}
if (verbose) {
- (void) printf("OBJECT object = %" PRIu64
- " type = %u bonustype = %u blksz = %u"
- " bonuslen = %u dn_slots = %u\n",
- drro->drr_object,
+ (void) printf("OBJECT object = %llu type = %u "
+ "bonustype = %u blksz = %u bonuslen = %u\n",
+ (u_longlong_t)drro->drr_object,
drro->drr_type,
drro->drr_bonustype,
drro->drr_blksz,
- drro->drr_bonuslen,
- drro->drr_dn_slots);
+ drro->drr_bonuslen);
}
if (drro->drr_bonuslen > 0) {
(void) ssread(buf,
diff --git a/cddl/contrib/opensolaris/cmd/ztest/ztest.c b/cddl/contrib/opensolaris/cmd/ztest/ztest.c
index 538fd040c95e..37acf34ec369 100644
--- a/cddl/contrib/opensolaris/cmd/ztest/ztest.c
+++ b/cddl/contrib/opensolaris/cmd/ztest/ztest.c
@@ -196,7 +196,6 @@ extern uint64_t zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
extern boolean_t zfs_abd_scatter_enabled;
-extern int dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;
static ztest_shared_opts_t *ztest_shared_opts;
@@ -323,7 +322,6 @@ static ztest_shared_callstate_t *ztest_shared_callstate;
ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
-ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
@@ -365,7 +363,6 @@ ztest_info_t ztest_info[] = {
{ ztest_dmu_read_write, 1, &zopt_always },
{ ztest_dmu_write_parallel, 10, &zopt_always },
{ ztest_dmu_object_alloc_free, 1, &zopt_always },
- { ztest_dmu_object_next_chunk, 1, &zopt_sometimes },
{ ztest_dmu_commit_callbacks, 1, &zopt_always },
{ ztest_zap, 30, &zopt_always },
{ ztest_zap_parallel, 100, &zopt_always },
@@ -1369,7 +1366,7 @@ ztest_bt_bonus(dmu_buf_t *db)
* it unique to the object, generation, and offset to verify that data
* is not getting overwritten by data from other dnodes.
*/
-#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \
+#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \
(((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))
/*
@@ -1898,7 +1895,6 @@ ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
txg, crtxg);
ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen);
-
dmu_buf_rele(db, FTAG);
(void) ztest_log_setattr(zd, tx, lr);
@@ -3819,10 +3815,8 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
ztest_od_t od[4];
int batchsize = sizeof (od) / sizeof (od[0]);
- for (int b = 0; b < batchsize; b++) {
- ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER,
- 0, 0, 0);
- }
+ for (int b = 0; b < batchsize; b++)
+ ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0, 0);
/*
* Destroy the previous batch of objects, create a new batch,
@@ -3837,26 +3831,6 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
}
/*
- * Rewind the global allocator to verify object allocation backfilling.
- */
-void
-ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
-{
- objset_t *os = zd->zd_os;
- int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
- uint64_t object;
-
- /*
- * Rewind the global allocator randomly back to a lower object number
- * to force backfilling and reclamation of recently freed dnodes.
- */
- mutex_enter(&os->os_obj_lock);
- object = ztest_random(os->os_obj_next_chunk);
- os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk);
- mutex_exit(&os->os_obj_lock);
-}
-
-/*
* Verify that dmu_{read,write} work as expected.
*/
void
@@ -3902,10 +3876,8 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
/*
* Read the directory info. If it's the first time, set things up.
*/
- ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0,
- chunksize);
- ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
- chunksize);
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize);
+ ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize);
if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
return;
@@ -4174,10 +4146,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
/*
* Read the directory info. If it's the first time, set things up.
*/
- ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize,
- 0, 0);
- ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
- chunksize);
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0);
+ ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize);
if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
return;
@@ -4377,8 +4347,7 @@ ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
* to verify that parallel writes to an object -- even to the
* same blocks within the object -- doesn't cause any trouble.
*/
- ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER,
- 0, 0, 0);
+ ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
return;
@@ -4397,8 +4366,7 @@ ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
uint64_t blocksize = ztest_random_blocksize();
void *data;
- ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize,
- 0, 0);
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0);
if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
return;
@@ -4622,8 +4590,7 @@ ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
char name[20], string_value[20];
void *data;
- ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER,
- 0, 0, 0);
+ ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0);
if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
return;
@@ -5444,8 +5411,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
blocksize = ztest_random_blocksize();
blocksize = MIN(blocksize, 2048); /* because we write so many */
- ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize,
- 0, 0);
+ ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0);
if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
return;
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
index 9c1038312423..7b1474edf58f 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
@@ -292,11 +292,10 @@ zfs_prop_init(void)
ZFS_VOLMODE_DEFAULT, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
"default | geom | dev | none", "VOLMODE", volmode_table);
-
zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize",
ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
"legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table);
-
+
/* inherit index (boolean) properties */
zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index 3e99898ea230..9012baa0a994 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -3757,8 +3757,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
if (dn->dn_type == DMU_OT_DNODE) {
i = 0;
while (i < db->db.db_size) {
- dnode_phys_t *dnp =
- (void *)(((char *)db->db.db_data) + i);
+ dnode_phys_t *dnp = db->db.db_data + i;
i += DNODE_MIN_SIZE;
if (dnp->dn_type != DMU_OT_NONE) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
index 2ee62504c081..f830076f767f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
@@ -32,14 +32,6 @@
#include <sys/zfeature.h>
#include <sys/dsl_dataset.h>
-/*
- * Each of the concurrent object allocators will grab
- * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to
- * grab 128 slots, which is 4 blocks worth. This was experimentally
- * determined to be the lowest value that eliminates the measurable effect
- * of lock contention from this code path.
- */
-int dmu_object_alloc_chunk_shift = 7;
static uint64_t
dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
@@ -52,10 +44,6 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
dnode_t *dn = NULL;
int dn_slots = dnodesize >> DNODE_SHIFT;
boolean_t restarted = B_FALSE;
- uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
- os->os_obj_next_percpu_len];
- int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
- int error;
if (dn_slots == 0) {
dn_slots = DNODE_MIN_SLOTS;
@@ -63,99 +51,55 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
}
-
- /*
- * The "chunk" of dnodes that is assigned to a CPU-specific
- * allocator needs to be at least one block's worth, to avoid
- * lock contention on the dbuf. It can be at most one L1 block's
- * worth, so that the "rescan after polishing off a L1's worth"
- * logic below will be sure to kick in.
- */
- if (dnodes_per_chunk < DNODES_PER_BLOCK)
- dnodes_per_chunk = DNODES_PER_BLOCK;
- if (dnodes_per_chunk > L1_dnode_count)
- dnodes_per_chunk = L1_dnode_count;
-
- object = *cpuobj;
-
+
+ mutex_enter(&os->os_obj_lock);
for (;;) {
+ object = os->os_obj_next;
/*
- * If we finished a chunk of dnodes, get a new one from
- * the global allocator.
+ * Each time we polish off a L1 bp worth of dnodes (2^12
+ * objects), move to another L1 bp that's still
+ * reasonably sparse (at most 1/4 full). Look from the
+ * beginning at most once per txg. If we still can't
+ * allocate from that L1 block, search for an empty L0
+ * block, which will quickly skip to the end of the
+ * metadnode if the no nearby L0 blocks are empty. This
+ * fallback avoids a pathology where full dnode blocks
+ * containing large dnodes appear sparse because they
+ * have a low blk_fill, leading to many failed
+ * allocation attempts. In the long term a better
+ * mechanism to search for sparse metadnode regions,
+ * such as spacemaps, could be implemented.
+ *
+ * os_scan_dnodes is set during txg sync if enough objects
+ * have been freed since the previous rescan to justify
+ * backfilling again.
+ *
+ * Note that dmu_traverse depends on the behavior that we use
+ * multiple blocks of the dnode object before going back to
+ * reuse objects. Any change to this algorithm should preserve
+ * that property or find another solution to the issues
+ * described in traverse_visitbp.
*/
- if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
- (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
- dn_slots)) {
- DNODE_STAT_BUMP(dnode_alloc_next_chunk);
- mutex_enter(&os->os_obj_lock);
- ASSERT0(P2PHASE(os->os_obj_next_chunk,
- dnodes_per_chunk));
- object = os->os_obj_next_chunk;
-
- /*
- * Each time we polish off a L1 bp worth of dnodes
- * (2^12 objects), move to another L1 bp that's
- * still reasonably sparse (at most 1/4 full). Look
- * from the beginning at most once per txg. If we
- * still can't allocate from that L1 block, search
- * for an empty L0 block, which will quickly skip
- * to the end of the metadnode if the no nearby L0
- * blocks are empty. This fallback avoids a
- * pathology where full dnode blocks containing
- * large dnodes appear sparse because they have a
- * low blk_fill, leading to many failed allocation
- * attempts. In the long term a better mechanism to
- * search for sparse metadnode regions, such as
- * spacemaps, could be implemented.
- *
- * os_scan_dnodes is set during txg sync if enough
- * objects have been freed since the previous
- * rescan to justify backfilling again.
- *
- * Note that dmu_traverse depends on the behavior
- * that we use multiple blocks of the dnode object
- * before going back to reuse objects. Any change
- * to this algorithm should preserve that property
- * or find another solution to the issues described
- * in traverse_visitbp.
- */
- if (P2PHASE(object, L1_dnode_count) == 0) {
- uint64_t offset;
- uint64_t blkfill;
- int minlvl;
- if (os->os_rescan_dnodes) {
- offset = 0;
- os->os_rescan_dnodes = B_FALSE;
- } else {
- offset = object << DNODE_SHIFT;
- }
- blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
- minlvl = restarted ? 1 : 2;
- restarted = B_TRUE;
- error = dnode_next_offset(DMU_META_DNODE(os),
- DNODE_FIND_HOLE, &offset, minlvl,
- blkfill, 0);
- if (error == 0) {
- object = offset >> DNODE_SHIFT;
- }
+ if (P2PHASE(object, L1_dnode_count) == 0) {
+ uint64_t offset;
+ uint64_t blkfill;
+ int minlvl;
+ int error;
+ if (os->os_rescan_dnodes) {
+ offset = 0;
+ os->os_rescan_dnodes = B_FALSE;
+ } else {
+ offset = object << DNODE_SHIFT;
}
- /*
- * Note: if "restarted", we may find a L0 that
- * is not suitably aligned.
- */
- os->os_obj_next_chunk =
- P2ALIGN(object, dnodes_per_chunk) +
- dnodes_per_chunk;
- (void) atomic_swap_64(cpuobj, object);
- mutex_exit(&os->os_obj_lock);
+ blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
+ minlvl = restarted ? 1 : 2;
+ restarted = B_TRUE;
+ error = dnode_next_offset(DMU_META_DNODE(os),
+ DNODE_FIND_HOLE, &offset, minlvl, blkfill, 0);
+ if (error == 0)
+ object = offset >> DNODE_SHIFT;
}
-
- /*
- * The value of (*cpuobj) before adding dn_slots is the object
- * ID assigned to us. The value afterwards is the object ID
- * assigned to whoever wants to do an allocation next.
- */
- object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
+ os->os_obj_next = object + dn_slots;
/*
* XXX We should check for an i/o error here and return
@@ -163,45 +107,37 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
* dmu_tx_assign(), but there is currently no mechanism
* to do so.
*/
- error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
- dn_slots, FTAG, &dn);
- if (error == 0) {
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
+ FTAG, &dn);
+ if (dn)
+ break;
+
+ if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
+ os->os_obj_next = object;
+ else
/*
- * Another thread could have allocated it; check
- * again now that we have the struct lock.
+ * Skip to next known valid starting point for a dnode.
*/
- if (dn->dn_type == DMU_OT_NONE) {
- dnode_allocate(dn, ot, blocksize, 0,
- bonustype, bonuslen, dn_slots, tx);
- rw_exit(&dn->dn_struct_rwlock);
- dmu_tx_add_new_object(tx, dn);
- dnode_rele(dn, FTAG);
- return (object);
- }
- rw_exit(&dn->dn_struct_rwlock);
- dnode_rele(dn, FTAG);
- DNODE_STAT_BUMP(dnode_alloc_race);
- }
-
- /*
- * Skip to next known valid starting point on error. This
- * is the start of the next block of dnodes.
- */
- if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
- object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
- DNODE_STAT_BUMP(dnode_alloc_next_block);
- }
- (void) atomic_swap_64(cpuobj, object);
+ os->os_obj_next = P2ROUNDUP(object + 1,
+ DNODES_PER_BLOCK);
}
+
+ dnode_allocate(dn, ot, blocksize, indirect_blockshift,
+ bonustype, bonuslen, dn_slots, tx);
+ mutex_exit(&os->os_obj_lock);
+
+ dmu_tx_add_new_object(tx, dn);
+ dnode_rele(dn, FTAG);
+
+ return (object);
}
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
- bonuslen, 0, tx));
+ return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
+ bonuslen, 0, tx);
}
uint64_t
@@ -209,8 +145,8 @@ dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
dmu_tx_t *tx)
{
- return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
- bonustype, bonuslen, 0, tx));
+ return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
+ bonustype, bonuslen, 0, tx);
}
uint64_t
@@ -242,7 +178,7 @@ dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
dn_slots = DNODE_MIN_SLOTS;
ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
-
+
if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
return (SET_ERROR(EBADF));
@@ -275,9 +211,6 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
int dn_slots = dnodesize >> DNODE_SHIFT;
int err;
- if (dn_slots == 0)
- dn_slots = DNODE_MIN_SLOTS;
-
if (object == DMU_META_DNODE_OBJECT)
return (SET_ERROR(EBADF));
@@ -327,52 +260,28 @@ int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
uint64_t offset;
- uint64_t start_obj;
+ dmu_object_info_t doi;
struct dsl_dataset *ds = os->os_dsl_dataset;
+ int dnodesize;
int error;
- if (*objectp == 0) {
- start_obj = 1;
- } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
- uint64_t i = *objectp + 1;
- uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
- dmu_object_info_t doi;
-
- /*
- * Scan through the remaining meta dnode block. The contents
- * of each slot in the block are known so it can be quickly
- * checked. If the block is exhausted without a match then
- * hand off to dnode_next_offset() for further scanning.
- */
- while (i <= last_obj) {
- error = dmu_object_info(os, i, &doi);
- if (error == ENOENT) {
- if (hole) {
- *objectp = i;
- return (0);
- } else {
- i++;
- }
- } else if (error == EEXIST) {
- i++;
- } else if (error == 0) {
- if (hole) {
- i += doi.doi_dnodesize >> DNODE_SHIFT;
- } else {
- *objectp = i;
- return (0);
- }
- } else {
- return (error);
- }
- }
-
- start_obj = i;
+ /*
+ * Avoid expensive dnode hold if this dataset doesn't use large dnodes.
+ */
+ if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
+ error = dmu_object_info(os, *objectp, &doi);
+ if (error && !(error == EINVAL && *objectp == 0))
+ return (SET_ERROR(error));
+ else
+ dnodesize = doi.doi_dnodesize;
} else {
- start_obj = *objectp + 1;
+ dnodesize = DNODE_MIN_SIZE;
}
- offset = start_obj << DNODE_SHIFT;
+ if (*objectp == 0)
+ offset = 1 << DNODE_SHIFT;
+ else
+ offset = (*objectp << DNODE_SHIFT) + dnodesize;
error = dnode_next_offset(DMU_META_DNODE(os),
(hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
index 332f7cd2fe4b..71d9cd7b6f17 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -566,9 +566,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
- os->os_obj_next_percpu_len = boot_ncpus;
- os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len *
- sizeof (os->os_obj_next_percpu[0]), KM_SLEEP);
dnode_special_open(os, &os->os_phys->os_meta_dnode,
DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
@@ -847,9 +844,6 @@ dmu_objset_evict_done(objset_t *os)
rw_enter(&os_lock, RW_READER);
rw_exit(&os_lock);
- kmem_free(os->os_obj_next_percpu,
- os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]));
-
mutex_destroy(&os->os_lock);
mutex_destroy(&os->os_userused_lock);
mutex_destroy(&os->os_obj_lock);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index 3f6f3316b2d2..8d78d1d5ec18 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -1437,12 +1437,17 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/*
* The receiving code doesn't know how to translate large blocks
* to smaller ones, so the pool must have the LARGE_BLOCKS
- * feature enabled if the stream has LARGE_BLOCKS. Same with
- * large dnodes.
+ * feature enabled if the stream has LARGE_BLOCKS.
*/
if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
return (SET_ERROR(ENOTSUP));
+
+ /*
+ * The receiving code doesn't know how to translate large dnodes
+ * to smaller ones, so the pool must have the LARGE_DNODE
+ * feature enabled if the stream has LARGE_DNODE.
+ */
if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
return (SET_ERROR(ENOTSUP));
@@ -1650,9 +1655,6 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
dsl_dataset_t *ds;
const char *tofs = drba->drba_cookie->drc_tofs;
- /* 6 extra bytes for /%recv */
- char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
-
/* already checked */
ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);
@@ -1680,18 +1682,8 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
- /*
- * The receiving code doesn't know how to translate large blocks
- * to smaller ones, so the pool must have the LARGE_BLOCKS
- * feature enabled if the stream has LARGE_BLOCKS. Same with
- * large dnodes.
- */
- if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
- !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
- return (SET_ERROR(ENOTSUP));
- if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
- !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
- return (SET_ERROR(ENOTSUP));
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
(void) snprintf(recvname, sizeof (recvname), "%s/%s",
tofs, recv_clone_name);
@@ -2153,8 +2145,6 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
dmu_tx_t *tx;
uint64_t object;
int err;
- uint8_t dn_slots = drro->drr_dn_slots != 0 ?
- drro->drr_dn_slots : DNODE_MIN_SLOTS;
if (drro->drr_type == DMU_OT_NONE ||
!DMU_OT_IS_VALID(drro->drr_type) ||
@@ -2165,16 +2155,15 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
drro->drr_blksz < SPA_MINBLOCKSIZE ||
drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
drro->drr_bonuslen >
- DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
- dn_slots >
- (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os)))) {
return (SET_ERROR(EINVAL));
}
err = dmu_object_info(rwa->os, drro->drr_object, &doi);
- if (err != 0 && err != ENOENT && err != EEXIST)
+ if (err != 0 && err != ENOENT)
return (SET_ERROR(EINVAL));
+ object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
if (drro->drr_object > rwa->max_object)
rwa->max_object = drro->drr_object;
@@ -2187,64 +2176,16 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
if (err == 0) {
int nblkptr;
- object = drro->drr_object;
-
nblkptr = deduce_nblkptr(drro->drr_bonustype,
drro->drr_bonuslen);
if (drro->drr_blksz != doi.doi_data_block_size ||
- nblkptr < doi.doi_nblkptr ||
- dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
+ nblkptr < doi.doi_nblkptr) {
err = dmu_free_long_range(rwa->os, drro->drr_object,
0, DMU_OBJECT_END);
if (err != 0)
return (SET_ERROR(EINVAL));
}
- } else if (err == EEXIST) {
- /*
- * The object requested is currently an interior slot of a
- * multi-slot dnode. This will be resolved when the next txg
- * is synced out, since the send stream will have told us
- * to free this slot when we freed the associated dnode
- * earlier in the stream.
- */
- txg_wait_synced(dmu_objset_pool(rwa->os), 0);
- object = drro->drr_object;
- } else {
- /* object is free and we are about to allocate a new one */
- object = DMU_NEW_OBJECT;
- }
-
- /*
- * If this is a multi-slot dnode there is a chance that this
- * object will expand into a slot that is already used by
- * another object from the previous snapshot. We must free
- * these objects before we attempt to allocate the new dnode.
- */
- if (dn_slots > 1) {
- boolean_t need_sync = B_FALSE;
-
- for (uint64_t slot = drro->drr_object + 1;
- slot < drro->drr_object + dn_slots;
- slot++) {
- dmu_object_info_t slot_doi;
-
- err = dmu_object_info(rwa->os, slot, &slot_doi);
- if (err == ENOENT || err == EEXIST)
- continue;
- else if (err != 0)
- return (err);
-
- err = dmu_free_long_object(rwa->os, slot);
-
- if (err != 0)
- return (err);
-
- need_sync = B_TRUE;
- }
-
- if (need_sync)
- txg_wait_synced(dmu_objset_pool(rwa->os), 0);
}
tx = dmu_tx_create(rwa->os);
@@ -2260,7 +2201,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen,
- dn_slots << DNODE_SHIFT, tx);
+ drro->drr_dn_slots << DNODE_SHIFT, tx);
} else if (drro->drr_type != doi.doi_type ||
drro->drr_blksz != doi.doi_data_block_size ||
drro->drr_bonustype != doi.doi_bonus_type ||
@@ -2318,10 +2259,10 @@ receive_freeobjects(struct receive_writer_arg *rwa,
dmu_object_info_t doi;
int err;
- err = dmu_object_info(rwa->os, obj, NULL);
+ err = dmu_object_info(rwa->os, obj, &doi);
if (err == ENOENT) {
obj++;
- continue;
+ continue;
} else if (err != 0) {
return (err);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
index 849a4fea28e9..e9f1f4ac19c6 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -1252,13 +1252,11 @@ dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
void
dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
{
- dmu_tx_hold_t *txh;
+ dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
+ tx->tx_objset, object, THT_SPILL, 0, 0);
- txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
- THT_SPILL, 0, 0);
- if (txh != NULL)
- (void) refcount_add_many(&txh->txh_space_towrite,
- SPA_OLD_MAXBLOCKSIZE, FTAG);
+ (void) refcount_add_many(&txh->txh_space_towrite,
+ SPA_OLD_MAXBLOCKSIZE, FTAG);
}
void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
index 86f2440e0095..91b3a0414834 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -40,40 +40,20 @@
#include <sys/dmu_zfetch.h>
#include <sys/range_tree.h>
-dnode_stats_t dnode_stats = {
- { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 },
- { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 },
- { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 },
- { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 },
- { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 },
- { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 },
- { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 },
- { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_hits", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_misses", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
- { "dnode_hold_free_txg", KSTAT_DATA_UINT64 },
- { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
- { "dnode_allocate", KSTAT_DATA_UINT64 },
- { "dnode_reallocate", KSTAT_DATA_UINT64 },
- { "dnode_buf_evict", KSTAT_DATA_UINT64 },
- { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 },
- { "dnode_alloc_race", KSTAT_DATA_UINT64 },
- { "dnode_alloc_next_block", KSTAT_DATA_UINT64 },
- { "dnode_move_invalid", KSTAT_DATA_UINT64 },
- { "dnode_move_recheck1", KSTAT_DATA_UINT64 },
- { "dnode_move_recheck2", KSTAT_DATA_UINT64 },
- { "dnode_move_special", KSTAT_DATA_UINT64 },
- { "dnode_move_handle", KSTAT_DATA_UINT64 },
- { "dnode_move_rwlock", KSTAT_DATA_UINT64 },
- { "dnode_move_active", KSTAT_DATA_UINT64 },
-};
-
-static kstat_t *dnode_ksp;
static kmem_cache_t *dnode_cache;
+/*
+ * Define DNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when DEBUG is also defined.
+ */
+#ifdef DEBUG
+#define DNODE_STATS
+#endif /* DEBUG */
+
+#ifdef DNODE_STATS
+#define DNODE_STAT_ADD(stat) ((stat)++)
+#else
+#define DNODE_STAT_ADD(stat) /* nothing */
+#endif /* DNODE_STATS */
static dnode_phys_t dnode_phys_zero;
@@ -235,25 +215,12 @@ dnode_init(void)
0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
#ifdef _KERNEL
kmem_cache_set_move(dnode_cache, dnode_move);
-
- dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
- KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL);
- if (dnode_ksp != NULL) {
- dnode_ksp->ks_data = &dnode_stats;
- kstat_install(dnode_ksp);
- }
#endif /* _KERNEL */
}
void
dnode_fini(void)
{
- if (dnode_ksp != NULL) {
- kstat_delete(dnode_ksp);
- dnode_ksp = NULL;
- }
-
kmem_cache_destroy(dnode_cache);
dnode_cache = NULL;
}
@@ -366,7 +333,6 @@ dnode_byteswap(dnode_phys_t *dnp)
/* Swap SPILL block if we have one */
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
-
}
void
@@ -378,7 +344,7 @@ dnode_buf_byteswap(void *vbuf, size_t size)
ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
while (i < size) {
- dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
+ dnode_phys_t *dnp = vbuf + i;
dnode_byteswap(dnp);
i += DNODE_MIN_SIZE;
@@ -482,10 +448,14 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dmu_zfetch_init(&dn->dn_zfetch, dn);
ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
- ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
- ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
mutex_enter(&os->os_lock);
+ if (dnh->dnh_dnode != NULL) {
+ /* Lost the allocation race. */
+ mutex_exit(&os->os_lock);
+ kmem_cache_free(dnode_cache, dn);
+ return (dnh->dnh_dnode);
+ }
/*
* Exclude special dnodes from os_dnodes so an empty os_dnodes
@@ -508,7 +478,6 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
mutex_exit(&os->os_lock);
arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
-
return (dn);
}
@@ -534,8 +503,7 @@ dnode_destroy(dnode_t *dn)
mutex_exit(&os->os_lock);
/* the dnode can no longer move, so we can release the handle */
- if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
- zrl_remove(&dn->dn_handle->dnh_zrlock);
+ zrl_remove(&dn->dn_handle->dnh_zrlock);
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
@@ -591,10 +559,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
- dprintf("os=%p obj=%" PRIu64 " txg=%" PRIu64
- " blocksize=%d ibs=%d dn_slots=%d\n",
+ dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
- DNODE_STAT_BUMP(dnode_allocate);
ASSERT(dn->dn_type == DMU_OT_NONE);
ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
@@ -679,13 +645,10 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
(bonustype == DMU_OT_SA && bonuslen == 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
ASSERT3U(bonuslen, <=,
- DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
- dnode_free_interior_slots(dn);
- DNODE_STAT_BUMP(dnode_reallocate);
-
/* clean up any unreferenced dbufs */
dnode_evict_dbufs(dn);
@@ -737,7 +700,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
if (dn->dn_bonus) {
dn->dn_bonus->db.db_size =
DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
- (dn->dn_nblkptr - 1) * sizeof (blkptr_t);
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
}
@@ -745,6 +708,18 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
mutex_exit(&dn->dn_mtx);
}
+#ifdef DNODE_STATS
+static struct {
+ uint64_t dms_dnode_invalid;
+ uint64_t dms_dnode_recheck1;
+ uint64_t dms_dnode_recheck2;
+ uint64_t dms_dnode_special;
+ uint64_t dms_dnode_handle;
+ uint64_t dms_dnode_rwlock;
+ uint64_t dms_dnode_active;
+} dnode_move_stats;
+#endif /* DNODE_STATS */
+
#ifdef _KERNEL
static void
dnode_move_impl(dnode_t *odn, dnode_t *ndn)
@@ -774,7 +749,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
ndn->dn_datablkszsec = odn->dn_datablkszsec;
ndn->dn_datablksz = odn->dn_datablksz;
ndn->dn_maxblkid = odn->dn_maxblkid;
- ndn->dn_num_slots = odn->dn_num_slots;
bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
sizeof (odn->dn_next_type));
bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
@@ -906,7 +880,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
*/
os = odn->dn_objset;
if (!POINTER_IS_VALID(os)) {
- DNODE_STAT_BUMP(dnode_move_invalid);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
return (KMEM_CBRC_DONT_KNOW);
}
@@ -916,7 +890,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
rw_enter(&os_lock, RW_WRITER);
if (os != odn->dn_objset) {
rw_exit(&os_lock);
- DNODE_STAT_BUMP(dnode_move_recheck1);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
return (KMEM_CBRC_DONT_KNOW);
}
@@ -934,7 +908,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
if (os != odn->dn_objset) {
mutex_exit(&os->os_lock);
rw_exit(&os_lock);
- DNODE_STAT_BUMP(dnode_move_recheck2);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
return (KMEM_CBRC_DONT_KNOW);
}
@@ -947,7 +921,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
rw_exit(&os_lock);
if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
mutex_exit(&os->os_lock);
- DNODE_STAT_BUMP(dnode_move_special);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
return (KMEM_CBRC_NO);
}
ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
@@ -962,7 +936,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
*/
if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
mutex_exit(&os->os_lock);
- DNODE_STAT_BUMP(dnode_move_handle);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
return (KMEM_CBRC_LATER);
}
@@ -978,7 +952,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
zrl_exit(&odn->dn_handle->dnh_zrlock);
mutex_exit(&os->os_lock);
- DNODE_STAT_BUMP(dnode_move_rwlock);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
return (KMEM_CBRC_LATER);
}
@@ -1004,7 +978,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
rw_exit(&odn->dn_struct_rwlock);
zrl_exit(&odn->dn_handle->dnh_zrlock);
mutex_exit(&os->os_lock);
- DNODE_STAT_BUMP(dnode_move_active);
+ DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
return (KMEM_CBRC_LATER);
}
@@ -1029,132 +1003,6 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
#endif /* illumos */
#endif /* _KERNEL */
-static void
-dnode_slots_hold(dnode_children_t *children, int idx, int slots)
-{
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- for (int i = idx; i < idx + slots; i++) {
- dnode_handle_t *dnh = &children->dnc_children[i];
- zrl_add(&dnh->dnh_zrlock);
- }
-}
-
-static void
-dnode_slots_rele(dnode_children_t *children, int idx, int slots)
-{
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- for (int i = idx; i < idx + slots; i++) {
- dnode_handle_t *dnh = &children->dnc_children[i];
-
- if (zrl_is_locked(&dnh->dnh_zrlock))
- zrl_exit(&dnh->dnh_zrlock);
- else
- zrl_remove(&dnh->dnh_zrlock);
- }
-}
-
-static int
-dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
-{
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- for (int i = idx; i < idx + slots; i++) {
- dnode_handle_t *dnh = &children->dnc_children[i];
-
- if (!zrl_tryenter(&dnh->dnh_zrlock)) {
- for (int j = idx; j < i; j++) {
- dnh = &children->dnc_children[j];
- zrl_exit(&dnh->dnh_zrlock);
- }
-
- return (0);
- }
- }
-
- return (1);
-}
-
-static void
-dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
-{
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- for (int i = idx; i < idx + slots; i++) {
- dnode_handle_t *dnh = &children->dnc_children[i];
- dnh->dnh_dnode = ptr;
- }
-}
-
-static boolean_t
-dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
-{
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- for (int i = idx; i < idx + slots; i++) {
- dnode_handle_t *dnh = &children->dnc_children[i];
- dnode_t *dn = dnh->dnh_dnode;
-
- if (dn == DN_SLOT_FREE) {
- continue;
- } else if (DN_SLOT_IS_PTR(dn)) {
- mutex_enter(&dn->dn_mtx);
- dmu_object_type_t type = dn->dn_type;
- mutex_exit(&dn->dn_mtx);
-
- if (type != DMU_OT_NONE)
- return (B_FALSE);
-
- continue;
- } else {
- return (B_FALSE);
- }
-
- return (B_FALSE);
- }
-
- return (B_TRUE);
-}
-
-static void
-dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
-{
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- for (int i = idx; i < idx + slots; i++) {
- dnode_handle_t *dnh = &children->dnc_children[i];
-
- ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
-
- if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
- ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
- dnode_destroy(dnh->dnh_dnode);
- dnh->dnh_dnode = DN_SLOT_FREE;
- }
- }
-}
-
-void
-dnode_free_interior_slots(dnode_t *dn)
-{
- dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
- int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
- int idx = (dn->dn_object & (epb - 1)) + 1;
- int slots = dn->dn_num_slots - 1;
-
- if (slots == 0)
- return;
-
- ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
-
- while (!dnode_slots_tryenter(children, idx, slots))
- DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
-
- dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
- dnode_slots_rele(children, idx, slots);
-}
-
void
dnode_special_close(dnode_handle_t *dnh)
{
@@ -1162,7 +1010,7 @@ dnode_special_close(dnode_handle_t *dnh)
/*
* Wait for final references to the dnode to clear. This can
- * only happen if the arc is asynchronously evicting state that
+ * only happen if the arc is asynchronously evicting state that
* has a hold on this dnode while we are trying to evict this
* dnode.
*/
@@ -1182,24 +1030,19 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
{
dnode_t *dn;
- zrl_init(&dnh->dnh_zrlock);
- zrl_tryenter(&dnh->dnh_zrlock);
-
dn = dnode_create(os, dnp, NULL, object, dnh);
+ zrl_init(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
-
- zrl_exit(&dnh->dnh_zrlock);
}
static void
dnode_buf_evict_async(void *dbu)
{
- dnode_children_t *dnc = dbu;
-
- DNODE_STAT_BUMP(dnode_buf_evict);
+ dnode_children_t *children_dnodes = dbu;
+ int i;
- for (int i = 0; i < dnc->dnc_count; i++) {
- dnode_handle_t *dnh = &dnc->dnc_children[i];
+ for (i = 0; i < children_dnodes->dnc_count; i++) {
+ dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
dnode_t *dn;
/*
@@ -1207,9 +1050,8 @@ dnode_buf_evict_async(void *dbu)
* another valid address, so there is no need here to guard
* against changes to or from NULL.
*/
- if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ if (dnh->dnh_dnode == NULL) {
zrl_destroy(&dnh->dnh_zrlock);
- dnh->dnh_dnode = DN_SLOT_UNINIT;
continue;
}
@@ -1224,36 +1066,140 @@ dnode_buf_evict_async(void *dbu)
ASSERT(refcount_is_zero(&dn->dn_holds));
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
- dnode_destroy(dn); /* implicit zrl_remove() for first slot */
+ dnode_destroy(dn); /* implicit zrl_remove() */
zrl_destroy(&dnh->dnh_zrlock);
- dnh->dnh_dnode = DN_SLOT_UNINIT;
+ dnh->dnh_dnode = NULL;
}
- kmem_free(dnc, sizeof (dnode_children_t) +
- dnc->dnc_count * sizeof (dnode_handle_t));
+ kmem_free(children_dnodes, sizeof (dnode_children_t) +
+ children_dnodes->dnc_count * sizeof (dnode_handle_t));
}
/*
- * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
- * to ensure the hole at the specified object offset is large enough to
- * hold the dnode being created. The slots parameter is also used to ensure
- * a dnode does not span multiple dnode blocks. In both of these cases, if
- * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
- * are only possible when using DNODE_MUST_BE_FREE.
+ * Return true if the given index is interior to a dnode already
+ * allocated in the block. That is, the index is neither free nor
+ * allocated, but is consumed by a large dnode.
*
- * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
- * dnode_hold_impl() will check if the requested dnode is already consumed
- * as an extra dnode slot by an large dnode, in which case it returns
- * ENOENT.
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_consumed(dmu_buf_impl_t *db, int idx)
+{
+ dnode_handle_t *dnh;
+ dmu_object_type_t ot;
+ dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block;
+ int skip;
+ int i;
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ for (i = 0; i < idx; i += skip) {
+ dnh = &children_dnodes->dnc_children[i];
+
+ zrl_add(&dnh->dnh_zrlock);
+ if (dnh->dnh_dnode != NULL) {
+ ot = dnh->dnh_dnode->dn_type;
+ skip = dnh->dnh_dnode->dn_num_slots;
+ } else {
+ ot = dn_block[i].dn_type;
+ skip = dn_block[i].dn_extra_slots + 1;
+ }
+ zrl_remove(&dnh->dnh_zrlock);
+
+ if (ot == DMU_OT_NONE)
+ skip = 1;
+ }
+
+ return (i > idx);
+}
+
+/*
+ * Return true if the given index in the dnode block is a valid
+ * allocated dnode. That is, the index is not consumed by a large
+ * dnode and is not free.
*
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_allocated(dmu_buf_impl_t *db, int idx)
+{
+ dnode_handle_t *dnh;
+ dmu_object_type_t ot;
+ dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block;
+
+ if (dnode_is_consumed(db, idx))
+ return (B_FALSE);
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ dnh = &children_dnodes->dnc_children[idx];
+
+ zrl_add(&dnh->dnh_zrlock);
+ if (dnh->dnh_dnode != NULL)
+ ot = dnh->dnh_dnode->dn_type;
+ else
+ ot = dn_block[idx].dn_type;
+ zrl_remove(&dnh->dnh_zrlock);
+
+ return (ot != DMU_OT_NONE);
+}
+
+/*
+ * Return true if the given range of indices in the dnode block are
+ * free. That is, the starting index is not consumed by a large dnode
+ * and none of the indices are allocated.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
+{
+ dnode_handle_t *dnh;
+ dmu_object_type_t ot;
+ dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block;
+ int i;
+
+ if (idx + slots > DNODES_PER_BLOCK)
+ return (B_FALSE);
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ if (dnode_is_consumed(db, idx))
+ return (B_FALSE);
+
+ for (i = idx; i < idx + slots; i++) {
+ dnh = &children_dnodes->dnc_children[i];
+
+ zrl_add(&dnh->dnh_zrlock);
+ if (dnh->dnh_dnode != NULL)
+ ot = dnh->dnh_dnode->dn_type;
+ else
+ ot = dn_block[i].dn_type;
+ zrl_remove(&dnh->dnh_zrlock);
+
+ if (ot != DMU_OT_NONE)
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+/*
* errors:
- * EINVAL - invalid object number or flags.
- * ENOSPC - hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
- * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
- * - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
- * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
- * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
- * - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
- * EIO - i/o error error when reading the meta dnode dbuf.
+ * EINVAL - invalid object number.
+ * ENOSPC - hole too small to fulfill "slots" request
+ * EIO - i/o error.
* succeeds even for free dnodes.
*/
int
@@ -1266,8 +1212,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
uint64_t blk;
dnode_t *mdn, *dn;
dmu_buf_impl_t *db;
- dnode_children_t *dnc;
- dnode_phys_t *dn_block;
+ dnode_children_t *children_dnodes;
dnode_phys_t *dn_block_begin;
dnode_handle_t *dnh;
@@ -1320,13 +1265,10 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock)
rw_exit(&mdn->dn_struct_rwlock);
- if (db == NULL) {
- DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
+ if (db == NULL)
return (SET_ERROR(EIO));
- }
err = dbuf_read(db, NULL, DB_RF_CANFAIL);
if (err) {
- DNODE_STAT_BUMP(dnode_hold_dbuf_read);
dbuf_rele(db, FTAG);
return (err);
}
@@ -1334,194 +1276,68 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
epb = db->db.db_size >> DNODE_SHIFT;
- idx = object & (epb - 1);
- dn_block = (dnode_phys_t *)db->db.db_data;
-
ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
- dnc = dmu_buf_get_user(&db->db);
- dnh = NULL;
- if (dnc == NULL) {
+ children_dnodes = dmu_buf_get_user(&db->db);
+ if (children_dnodes == NULL) {
dnode_children_t *winner;
- int skip = 0;
-
- dnc = kmem_zalloc(sizeof (dnode_children_t) +
+ children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
epb * sizeof (dnode_handle_t), KM_SLEEP);
- dnc->dnc_count = epb;
- dnh = &dnc->dnc_children[0];
-
- /* Initialize dnode slot status from dnode_phys_t */
- for (int i = 0; i < epb; i++) {
+ children_dnodes->dnc_count = epb;
+ dnh = &children_dnodes->dnc_children[0];
+ for (i = 0; i < epb; i++) {
zrl_init(&dnh[i].dnh_zrlock);
-
- if (skip) {
- skip--;
- continue;
- }
-
- if (dn_block[i].dn_type != DMU_OT_NONE) {
- int interior = dn_block[i].dn_extra_slots;
-
- dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
- dnode_set_slots(dnc, i + 1, interior,
- DN_SLOT_INTERIOR);
- skip = interior;
- } else {
- dnh[i].dnh_dnode = DN_SLOT_FREE;
- skip = 0;
- }
}
-
- dmu_buf_init_user(&dnc->dnc_dbu, NULL,
+ dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL,
dnode_buf_evict_async, NULL);
- winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
+ winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
if (winner != NULL) {
- for (int i = 0; i < epb; i++)
+ for (i = 0; i < epb; i++) {
zrl_destroy(&dnh[i].dnh_zrlock);
+ }
- kmem_free(dnc, sizeof (dnode_children_t) +
+ kmem_free(children_dnodes, sizeof (dnode_children_t) +
epb * sizeof (dnode_handle_t));
- dnc = winner;
+ children_dnodes = winner;
}
}
+ ASSERT(children_dnodes->dnc_count == epb);
- ASSERT(dnc->dnc_count == epb);
- dn = DN_SLOT_UNINIT;
-
- if (flag & DNODE_MUST_BE_ALLOCATED) {
- slots = 1;
-
- while (dn == DN_SLOT_UNINIT) {
- dnode_slots_hold(dnc, idx, slots);
- dnh = &dnc->dnc_children[idx];
-
- if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
- dn = dnh->dnh_dnode;
- break;
- } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
- DNODE_STAT_BUMP(dnode_hold_alloc_interior);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(EEXIST));
- } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
- DNODE_STAT_BUMP(dnode_hold_alloc_misses);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(ENOENT));
- }
-
- dnode_slots_rele(dnc, idx, slots);
- if (!dnode_slots_tryenter(dnc, idx, slots)) {
- DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
- continue;
- }
-
- /*
- * Someone else won the race and called dnode_create()
- * after we checked DN_SLOT_IS_PTR() above but before
- * we acquired the lock.
- */
- if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
- DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
- dn = dnh->dnh_dnode;
- } else {
- dn = dnode_create(os, dn_block + idx, db,
- object, dnh);
- }
- }
-
- mutex_enter(&dn->dn_mtx);
- if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
- DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
- mutex_exit(&dn->dn_mtx);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(ENOENT));
- }
-
- DNODE_STAT_BUMP(dnode_hold_alloc_hits);
- } else if (flag & DNODE_MUST_BE_FREE) {
-
- if (idx + slots - 1 >= DNODES_PER_BLOCK) {
- DNODE_STAT_BUMP(dnode_hold_free_overflow);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(ENOSPC));
- }
-
- while (dn == DN_SLOT_UNINIT) {
- dnode_slots_hold(dnc, idx, slots);
-
- if (!dnode_check_slots_free(dnc, idx, slots)) {
- DNODE_STAT_BUMP(dnode_hold_free_misses);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(ENOSPC));
- }
-
- dnode_slots_rele(dnc, idx, slots);
- if (!dnode_slots_tryenter(dnc, idx, slots)) {
- DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
- continue;
- }
-
- if (!dnode_check_slots_free(dnc, idx, slots)) {
- DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(ENOSPC));
- }
-
- /*
- * Allocated but otherwise free dnodes which would
- * be in the interior of a multi-slot dnodes need
- * to be freed. Single slot dnodes can be safely
- * re-purposed as a performance optimization.
- */
- if (slots > 1)
- dnode_reclaim_slots(dnc, idx + 1, slots - 1);
-
- dnh = &dnc->dnc_children[idx];
- if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
- dn = dnh->dnh_dnode;
- } else {
- dn = dnode_create(os, dn_block + idx, db,
- object, dnh);
- }
- }
-
- mutex_enter(&dn->dn_mtx);
- if (!refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
- DNODE_STAT_BUMP(dnode_hold_free_refcount);
- mutex_exit(&dn->dn_mtx);
- dnode_slots_rele(dnc, idx, slots);
- dbuf_rele(db, FTAG);
- return (SET_ERROR(EEXIST));
- }
+ idx = object & (epb - 1);
+ dn_block_begin = (dnode_phys_t *)db->db.db_data;
- dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
- DNODE_STAT_BUMP(dnode_hold_free_hits);
- } else {
+ if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) {
dbuf_rele(db, FTAG);
- return (SET_ERROR(EINVAL));
+ return (ENOSPC);
+ } else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
+ !dnode_is_allocated(db, idx)) {
+ dbuf_rele(db, FTAG);
+ return (ENOENT);
}
- if (dn->dn_free_txg) {
- DNODE_STAT_BUMP(dnode_hold_free_txg);
- type = dn->dn_type;
+ dnh = &children_dnodes->dnc_children[idx];
+ zrl_add(&dnh->dnh_zrlock);
+ dn = dnh->dnh_dnode;
+ if (dn == NULL)
+ dn = dnode_create(os, dn_block_begin + idx, db, object, dnh);
+
+ mutex_enter(&dn->dn_mtx);
+ type = dn->dn_type;
+ if (dn->dn_free_txg ||
+ ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
+ ((flag & DNODE_MUST_BE_FREE) &&
+ (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
mutex_exit(&dn->dn_mtx);
- dnode_slots_rele(dnc, idx, slots);
+ zrl_remove(&dnh->dnh_zrlock);
dbuf_rele(db, FTAG);
- return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ?
- ENOENT : EEXIST));
+ return ((flag & DNODE_MUST_BE_ALLOCATED) ? ENOENT : EEXIST);
}
-
if (refcount_add(&dn->dn_holds, tag) == 1)
dbuf_add_ref(db, dnh);
-
mutex_exit(&dn->dn_mtx);
/* Now we can rely on the hold to prevent the dnode from moving. */
- dnode_slots_rele(dnc, idx, slots);
+ zrl_remove(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
ASSERT3P(dn->dn_dbuf, ==, db);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
index a37607e0e307..551c44aa3f28 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -554,7 +554,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
if (dn->dn_allocated_txg != dn->dn_free_txg)
dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
- dnode_free_interior_slots(dn);
mutex_enter(&dn->dn_mtx);
dn->dn_type = DMU_OT_NONE;
@@ -562,7 +561,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
dn->dn_have_spill = B_FALSE;
- dn->dn_num_slots = 1;
mutex_exit(&dn->dn_mtx);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
index 9f9cdce8fbd9..ee7852a0df0e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
@@ -660,9 +660,6 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize);
bonuslen = DN_BONUS_SIZE(dnodesize);
- dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize);
- bonuslen = DN_BONUS_SIZE(dnodesize);
-
/* first determine bonus header size and sum of all attributes */
hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
SA_BONUS, bonuslen, &i, &used, &spilling);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
index 6ba5bed27b2b..ff8eeb1b2db9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -1109,10 +1109,10 @@ spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
/*
* Spares are tracked globally due to the following constraints:
*
- * - A spare may be part of multiple pools.
- * - A spare may be added to a pool even if it's actively in use within
+ * - A spare may be part of multiple pools.
+ * - A spare may be added to a pool even if it's actively in use within
* another pool.
- * - A spare in use in any pool can only be the source of a replacement if
+ * - A spare in use in any pool can only be the source of a replacement if
* the target is a spare in the same pool.
*
* We keep track of all spares on the system through the use of a reference
@@ -2255,6 +2255,7 @@ spa_maxdnodesize(spa_t *spa)
return (DNODE_MIN_SIZE);
}
+
/*
* Returns the txg that the last device removal completed. No indirect mappings
* have been added since this txg.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
index 744914a30580..b23ce0194378 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -823,7 +823,7 @@ typedef struct dmu_object_info {
uint8_t doi_checksum;
uint8_t doi_compress;
uint8_t doi_nblkptr;
- int8_t doi_pad[4];
+ uint8_t doi_pad[4];
uint64_t doi_dnodesize;
uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
uint64_t doi_max_offset;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
index 3028f0436566..f692dae90fe1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -130,11 +130,7 @@ struct objset {
/* Protected by os_obj_lock */
kmutex_t os_obj_lock;
- uint64_t os_obj_next_chunk;
-
- /* Per-CPU next object to allocate, protected by atomic ops. */
- uint64_t *os_obj_next_percpu;
- int os_obj_next_percpu_len;
+ uint64_t os_obj_next;
/* Protected by os_lock */
kmutex_t os_lock;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
index 4707c6181855..74acef0ae194 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
@@ -96,16 +96,9 @@ extern "C" {
#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT)
#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE))
#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
-#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
+#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1)
-#define DN_KILL_SPILLBLK (1)
-
-#define DN_SLOT_UNINIT ((void *)NULL) /* Uninitialized */
-#define DN_SLOT_FREE ((void *)1UL) /* Free slot */
-#define DN_SLOT_ALLOCATED ((void *)2UL) /* Allocated slot */
-#define DN_SLOT_INTERIOR ((void *)3UL) /* Interior allocated slot */
-#define DN_SLOT_IS_PTR(dn) ((void *)dn > DN_SLOT_INTERIOR)
-#define DN_SLOT_IS_VALID(dn) ((void *)dn != NULL)
+#define DN_KILL_SPILLBLK (1)
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
@@ -127,7 +120,7 @@ extern "C" {
((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? \
(uint8_t *)DN_SPILL_BLKPTR(dnp) - (uint8_t *)DN_BONUS(dnp) : \
(uint8_t *)(dnp + (dnp->dn_extra_slots + 1)) - (uint8_t *)DN_BONUS(dnp))
-
+
#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \
(dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT)
@@ -150,57 +143,6 @@ enum dnode_dirtycontext {
/* Does dnode have a SA spill blkptr in bonus? */
#define DNODE_FLAG_SPILL_BLKPTR (1<<2)
-/*
- * VARIABLE-LENGTH (LARGE) DNODES
- *
- * The motivation for variable-length dnodes is to eliminate the overhead
- * associated with using spill blocks. Spill blocks are used to store
- * system attribute data (i.e. file metadata) that does not fit in the
- * dnode's bonus buffer. By allowing a larger bonus buffer area the use of
- * a spill block can be avoided. Spill blocks potentially incur an
- * additional read I/O for every dnode in a dnode block. As a worst case
- * example, reading 32 dnodes from a 16k dnode block and all of the spill
- * blocks could issue 33 separate reads. Now suppose those dnodes have size
- * 1024 and therefore don't need spill blocks. Then the worst case number
- * of blocks read is reduced to from 33 to two--one per dnode block.
- *
- * ZFS-on-Linux systems that make heavy use of extended attributes benefit
- * from this feature. In particular, ZFS-on-Linux supports the xattr=sa
- * dataset property which allows file extended attribute data to be stored
- * in the dnode bonus buffer as an alternative to the traditional
- * directory-based format. Workloads such as SELinux and the Lustre
- * distributed filesystem often store enough xattr data to force spill
- * blocks when xattr=sa is in effect. Large dnodes may therefore provide a
- * performance benefit to such systems. Other use cases that benefit from
- * this feature include files with large ACLs and symbolic links with long
- * target names.
- *
- * The size of a dnode may be a multiple of 512 bytes up to the size of a
- * dnode block (currently 16384 bytes). The dn_extra_slots field of the
- * on-disk dnode_phys_t structure describes the size of the physical dnode
- * on disk. The field represents how many "extra" dnode_phys_t slots a
- * dnode consumes in its dnode block. This convention results in a value of
- * 0 for 512 byte dnodes which preserves on-disk format compatibility with
- * older software which doesn't support large dnodes.
- *
- * Similarly, the in-memory dnode_t structure has a dn_num_slots field
- * to represent the total number of dnode_phys_t slots consumed on disk.
- * Thus dn->dn_num_slots is 1 greater than the corresponding
- * dnp->dn_extra_slots. This difference in convention was adopted
- * because, unlike on-disk structures, backward compatibility is not a
- * concern for in-memory objects, so we used a more natural way to
- * represent size for a dnode_t.
- *
- * The default size for newly created dnodes is determined by the value of
- * the "dnodesize" dataset property. By default the property is set to
- * "legacy" which is compatible with older software. Setting the property
- * to "auto" will allow the filesystem to choose the most suitable dnode
- * size. Currently this just sets the default dnode size to 1k, but future
- * code improvements could dynamically choose a size based on observed
- * workload patterns. Dnodes of varying sizes can coexist within the same
- * dataset and even within the same dnode block.
- */
-
typedef struct dnode_phys {
uint8_t dn_type; /* dmu_object_type_t */
uint8_t dn_indblkshift; /* ln2(indirect block size) */
@@ -227,6 +169,22 @@ typedef struct dnode_phys {
* protected properly.
*/
uint64_t dn_pad3[4];
+ /*
+ * The tail region is 448 bytes for a 512 byte dnode, and
+ * correspondingly larger for larger dnode sizes. The spill
+ * block pointer, when present, is always at the end of the tail
+ * region. There are three ways this space may be used, using
+ * a 512 byte dnode for this diagram:
+ *
+ * 0 64 128 192 256 320 384 448 (offset)
+ * +---------------+---------------+---------------+-------+
+ * | dn_blkptr[0] | dn_blkptr[1] | dn_blkptr[2] | / |
+ * +---------------+---------------+---------------+-------+
+ * | dn_blkptr[0] | dn_bonus[0..319] |
+ * +---------------+-----------------------+---------------+
+ * | dn_blkptr[0] | dn_bonus[0..191] | dn_spill |
+ * +---------------+-----------------------+---------------+
+ */
union {
blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)];
struct {
@@ -238,7 +196,7 @@ typedef struct dnode_phys {
uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN -
sizeof (blkptr_t)];
blkptr_t dn_spill;
- };
+ };
};
} dnode_phys_t;
@@ -403,7 +361,6 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
int minlvl, uint64_t blkfill, uint64_t txg);
void dnode_evict_dbufs(dnode_t *dn);
void dnode_evict_bonus(dnode_t *dn);
-void dnode_free_interior_slots(dnode_t *dn);
boolean_t dnode_needs_remap(const dnode_t *dn);
#define DNODE_IS_CACHEABLE(_dn) \
@@ -415,140 +372,6 @@ boolean_t dnode_needs_remap(const dnode_t *dn);
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
-/*
- * Used for dnodestats kstat.
- */
-typedef struct dnode_stats {
- /*
- * Number of failed attempts to hold a meta dnode dbuf.
- */
- kstat_named_t dnode_hold_dbuf_hold;
- /*
- * Number of failed attempts to read a meta dnode dbuf.
- */
- kstat_named_t dnode_hold_dbuf_read;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able
- * to hold the requested object number which was allocated. This is
- * the common case when looking up any allocated object number.
- */
- kstat_named_t dnode_hold_alloc_hits;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
- * able to hold the request object number because it was not allocated.
- */
- kstat_named_t dnode_hold_alloc_misses;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
- * able to hold the request object number because the object number
- * refers to an interior large dnode slot.
- */
- kstat_named_t dnode_hold_alloc_interior;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed
- * to retry acquiring slot zrl locks due to contention.
- */
- kstat_named_t dnode_hold_alloc_lock_retry;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not
- * need to create the dnode because another thread did so after
- * dropping the read lock but before acquiring the write lock.
- */
- kstat_named_t dnode_hold_alloc_lock_misses;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found
- * a free dnode instantiated by dnode_create() but not yet allocated
- * by dnode_allocate().
- */
- kstat_named_t dnode_hold_alloc_type_none;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able
- * to hold the requested range of free dnode slots.
- */
- kstat_named_t dnode_hold_free_hits;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
- * able to hold the requested range of free dnode slots because
- * at least one slot was allocated.
- */
- kstat_named_t dnode_hold_free_misses;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
- * able to hold the requested range of free dnode slots because
- * after acquiring the zrl lock at least one slot was allocated.
- */
- kstat_named_t dnode_hold_free_lock_misses;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed
- * to retry acquiring slot zrl locks due to contention.
- */
- kstat_named_t dnode_hold_free_lock_retry;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
- * a range of dnode slots which were held by another thread.
- */
- kstat_named_t dnode_hold_free_refcount;
- /*
- * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
- * a range of dnode slots which would overflow the dnode_phys_t.
- */
- kstat_named_t dnode_hold_free_overflow;
- /*
- * Number of times a dnode_hold(...) was attempted on a dnode
- * which had already been unlinked in an earlier txg.
- */
- kstat_named_t dnode_hold_free_txg;
- /*
- * Number of times dnode_free_interior_slots() needed to retry
- * acquiring a slot zrl lock due to contention.
- */
- kstat_named_t dnode_free_interior_lock_retry;
- /*
- * Number of new dnodes allocated by dnode_allocate().
- */
- kstat_named_t dnode_allocate;
- /*
- * Number of dnodes re-allocated by dnode_reallocate().
- */
- kstat_named_t dnode_reallocate;
- /*
- * Number of meta dnode dbufs evicted.
- */
- kstat_named_t dnode_buf_evict;
- /*
- * Number of times dmu_object_alloc*() reached the end of the existing
- * object ID chunk and advanced to a new one.
- */
- kstat_named_t dnode_alloc_next_chunk;
- /*
- * Number of times multiple threads attempted to allocate a dnode
- * from the same block of free dnodes.
- */
- kstat_named_t dnode_alloc_race;
- /*
- * Number of times dmu_object_alloc*() was forced to advance to the
- * next meta dnode dbuf due to an error from dmu_object_next().
- */
- kstat_named_t dnode_alloc_next_block;
- /*
- * Statistics for tracking dnodes which have been moved.
- */
- kstat_named_t dnode_move_invalid;
- kstat_named_t dnode_move_recheck1;
- kstat_named_t dnode_move_recheck2;
- kstat_named_t dnode_move_special;
- kstat_named_t dnode_move_handle;
- kstat_named_t dnode_move_rwlock;
- kstat_named_t dnode_move_active;
-} dnode_stats_t;
-
-extern dnode_stats_t dnode_stats;
-
-#define DNODE_STAT_INCR(stat, val) \
- atomic_add_64(&dnode_stats.stat.value.ui64, (val));
-#define DNODE_STAT_BUMP(stat) \
- DNODE_STAT_INCR(stat, 1);
-
#ifdef ZFS_DEBUG
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
index 4bea074b545f..e444e2fb5723 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
@@ -101,7 +101,7 @@ typedef struct sa_lot {
sa_attr_type_t *lot_attrs; /* array of attr #'s */
uint32_t lot_var_sizes; /* how many aren't fixed size */
uint32_t lot_attr_count; /* total attr count */
- list_t lot_idx_tab; /* should be only a couple of entries */
+ list_t lot_idx_tab; /* should be only a couple of entries */
int lot_instance; /* used with lot_hash to identify entry */
} sa_lot_t;
@@ -134,7 +134,7 @@ typedef struct sa_idx_tab {
* adding a completely new attribute is a very rare operation.
*/
struct sa_os {
- kmutex_t sa_lock;
+ kmutex_t sa_lock;
boolean_t sa_need_attr_registration;
boolean_t sa_force_spill;
uint64_t sa_master_obj;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
index 3b976808e941..ea372f02e99a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
@@ -139,8 +139,6 @@ uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot,
uint64_t parent_obj, const char *name, dmu_tx_t *tx);
uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot,
uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx);
-uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot,
- uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx);
/*
* Initialize an already-allocated object.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
index 38fda1d40585..a3c0e4c31d0d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
@@ -97,7 +97,6 @@ extern "C" {
#endif
#include <sys/callo.h>
#include <sys/disp.h>
-#include <machine/_inttypes.h>
#include <machine/stdarg.h>
#include <vm/vm.h>
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
index 756800f8afde..7cd294316984 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -94,7 +94,7 @@ typedef enum drr_headertype {
/* flag #21 is reserved for a Delphix feature */
#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22)
#define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23)
-/* flag #24 is reserved for the raw send feature */
+/* flag #24 is reserved for the raw send (encryption) feature */
/* flag #25 is reserved for the ZSTD compression feature */
/*
@@ -120,7 +120,7 @@ typedef enum dmu_send_resume_token_version {
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * | reserved | feature-flags |C|S|
+ * | reserved | feature-flags |C|S|
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* The low order two bits indicate the header type: SUBSTREAM (0x1)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
index 040dfaa29a94..3f0b771df48f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
@@ -157,7 +157,7 @@ typedef enum zil_create {
#define TX_ACL 13 /* Set ACL */
#define TX_CREATE_ACL 14 /* create with ACL */
#define TX_CREATE_ATTR 15 /* create + attrs */
-#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */
+#define TX_CREATE_ACL_ATTR 16 /* create with ACL + attrs */
#define TX_MKDIR_ACL 17 /* mkdir with ACL */
#define TX_MKDIR_ATTR 18 /* mkdir with attr */
#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
@@ -436,7 +436,7 @@ extern void zil_commit_impl(zilog_t *zilog, uint64_t oid);
extern int zil_reset(const char *osname, void *txarg);
extern int zil_claim(struct dsl_pool *dp,
struct dsl_dataset *ds, void *txarg);
-extern int zil_check_log_chain(struct dsl_pool *dp,
+extern int zil_check_log_chain(struct dsl_pool *dp,
struct dsl_dataset *ds, void *tx);
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
index a801709dcc41..9e905acbcf3b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
@@ -955,8 +955,8 @@ uint64_t
zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
const char *name, int dnodesize, dmu_tx_t *tx)
{
- uint64_t new_obj;
-
+ uint64_t new_obj;
+
VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0,
dnodesize, tx)) > 0);
VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
index fbf2e224db16..a2f7b08163c3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
@@ -726,9 +726,9 @@ int
zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
int dnodesize, dmu_tx_t *tx)
-{
- int err;
-
+{
+	int err;
+
err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
dnodesize, tx);
if (err != 0)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
index 6332559edd74..e0bc7422c5ad 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
@@ -892,7 +892,7 @@ zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
int entry_type;
mode_t mode;
mode_t seen = 0;
- zfs_ace_hdr_t *acep = NULL;
+ zfs_ace_hdr_t *acep = NULL;
uint64_t who;
uint16_t iflags, type;
uint32_t access_mask;
@@ -1320,12 +1320,12 @@ zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim,
uint64_t who;
int new_count, new_bytes;
int ace_size;
- int entry_type;
+ int entry_type;
uint16_t iflags, type;
uint32_t access_mask;
zfs_acl_node_t *newnode;
- size_t abstract_size = aclp->z_ops.ace_abstract_size();
- void *zacep;
+ size_t abstract_size = aclp->z_ops.ace_abstract_size();
+ void *zacep;
boolean_t isdir;
trivial_acl_t masks;
@@ -1773,7 +1773,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
zfs_acl_t *aclp;
ulong_t mask;
int error;
- int count = 0;
+ int count = 0;
int largeace = 0;
mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
@@ -2104,7 +2104,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
zfs_acl_t *aclp;
int error;
uid_t uid = crgetuid(cr);
- uint64_t who;
+ uint64_t who;
uint16_t type, iflags;
uint16_t entry_type;
uint32_t access_mask;
@@ -2378,9 +2378,9 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
uint32_t working_mode;
int error;
int is_attr;
- boolean_t check_privs;
+ boolean_t check_privs;
znode_t *xzp;
- znode_t *check_zp = zp;
+ znode_t *check_zp = zp;
mode_t needed_bits;
uid_t owner;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
index 7e4e9cf85f57..9bf7643258c8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
@@ -310,7 +310,7 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
objid = LR_FOID_GET_OBJ(lr->lr_foid);
dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
-
+
xva_init(&xva);
zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
@@ -322,6 +322,6 @@
 	 * zfs_create() has no concept of these attributes, so we smuggle
 	 * the values inside the vattr's otherwise unused va_ctime,
 	 * va_nblocks, and va_fsid fields.
 	 */
ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
xva.xva_vattr.va_nblocks = lr->lr_gen;
@@ -463,8 +464,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
* eventually end up in zfs_mknode(), which assigns the object's
* creation time, generation number, and dnode slot count. The
* generic zfs_create() has no concept of these attributes, so
- * we smuggle the values inside the vattr's otherwise unused
- * va_ctime, va_nblocks and va_fsid fields.
+	 * we smuggle the values inside the vattr's otherwise unused
+	 * va_ctime, va_nblocks and va_fsid fields.
*/
ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
xva.xva_vattr.va_nblocks = lr->lr_gen;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
index 40a7798149a1..c94cef7d456a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
@@ -832,7 +832,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
}
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
- VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
+	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
/*
* If this is the root, fix up the half-initialized parent pointer
@@ -1863,6 +1863,14 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
ASSERT(error == 0);
/*
+ * Give dmu_object_alloc() a hint about where to start
+ * allocating new objects. Otherwise, since the metadnode's
+ * dnode_phys_t structure isn't initialized yet, dmu_object_next()
+ * would fail and we'd have to skip to the next dnode block.
+ */
+ os->os_obj_next = moid + 1;
+
+ /*
* Set starting attributes.
*/
version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
index 466a36845899..c701cc9c15f7 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
@@ -63,9 +63,9 @@
* representation, and the on-disk representation). The on-disk format
* consists of 3 parts:
*
- * - a single, per-dataset, ZIL header; which points to a chain of
- * - zero or more ZIL blocks; each of which contains
- * - zero or more ZIL records
+ * - a single, per-dataset, ZIL header; which points to a chain of
+ * - zero or more ZIL blocks; each of which contains
+ * - zero or more ZIL records
*
* A ZIL record holds the information necessary to replay a single
* system call transaction. A ZIL block can hold many ZIL records, and
@@ -1354,7 +1354,7 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
uint64_t zil_block_buckets[] = {
4096, /* non TX_WRITE */
8192+4096, /* data base */
- 32*1024 + 4096, /* NFS writes */
+ 32*1024 + 4096, /* NFS writes */
UINT64_MAX
};
@@ -3093,10 +3093,8 @@ zil_close(zilog_t *zilog)
if (txg)
txg_wait_synced(zilog->zl_dmu_pool, txg);
- if (zilog_is_dirty(zilog))
- zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
if (txg < spa_freeze_txg(zilog->zl_spa))
- VERIFY(!zilog_is_dirty(zilog));
+ ASSERT(!zilog_is_dirty(zilog));
zilog->zl_get_data = NULL;