-rw-r--r--  cmd/zdb/zdb.c                               275
-rw-r--r--  man/man1m/zdb.1m                              4
-rw-r--r--  uts/common/fs/zfs/metaslab.c                498
-rw-r--r--  uts/common/fs/zfs/range_tree.c                8
-rw-r--r--  uts/common/fs/zfs/spa_checkpoint.c            6
-rw-r--r--  uts/common/fs/zfs/space_map.c               122
-rw-r--r--  uts/common/fs/zfs/sys/metaslab.h              3
-rw-r--r--  uts/common/fs/zfs/sys/metaslab_impl.h        79
-rw-r--r--  uts/common/fs/zfs/sys/range_tree.h            3
-rw-r--r--  uts/common/fs/zfs/sys/space_map.h            31
-rw-r--r--  uts/common/fs/zfs/sys/vdev_impl.h            11
-rw-r--r--  uts/common/fs/zfs/vdev.c                     95
-rw-r--r--  uts/common/fs/zfs/vdev_indirect.c             3
-rw-r--r--  uts/common/fs/zfs/vdev_indirect_mapping.c     1
-rw-r--r--  uts/common/fs/zfs/vdev_initialize.c           2
-rw-r--r--  uts/common/fs/zfs/vdev_removal.c             56
16 files changed, 761 insertions, 436 deletions
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index acfe7ca5f7a8..57c39cf05bd3 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -785,18 +785,21 @@ dump_spacemap(objset_t *os, space_map_t *sm)
return;
(void) printf("space map object %llu:\n",
- (longlong_t)sm->sm_phys->smp_object);
- (void) printf(" smp_objsize = 0x%llx\n",
- (longlong_t)sm->sm_phys->smp_objsize);
+ (longlong_t)sm->sm_object);
+ (void) printf(" smp_length = 0x%llx\n",
+ (longlong_t)sm->sm_phys->smp_length);
(void) printf(" smp_alloc = 0x%llx\n",
(longlong_t)sm->sm_phys->smp_alloc);
+ if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
+ return;
+
/*
* Print out the freelist entries in both encoded and decoded form.
*/
uint8_t mapshift = sm->sm_shift;
int64_t alloc = 0;
- uint64_t word;
+ uint64_t word, entry_id = 0;
for (uint64_t offset = 0; offset < space_map_length(sm);
offset += sizeof (word)) {
@@ -804,11 +807,12 @@ dump_spacemap(objset_t *os, space_map_t *sm)
sizeof (word), &word, DMU_READ_PREFETCH));
if (sm_entry_is_debug(word)) {
- (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
- (u_longlong_t)(offset / sizeof (word)),
+ (void) printf("\t [%6llu] %s: txg %llu pass %llu\n",
+ (u_longlong_t)entry_id,
ddata[SM_DEBUG_ACTION_DECODE(word)],
(u_longlong_t)SM_DEBUG_TXG_DECODE(word),
(u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
+ entry_id++;
continue;
}
@@ -846,7 +850,7 @@ dump_spacemap(objset_t *os, space_map_t *sm)
(void) printf("\t [%6llu] %c range:"
" %010llx-%010llx size: %06llx vdev: %06llu words: %u\n",
- (u_longlong_t)(offset / sizeof (word)),
+ (u_longlong_t)entry_id,
entry_type, (u_longlong_t)entry_off,
(u_longlong_t)(entry_off + entry_run),
(u_longlong_t)entry_run,
@@ -856,8 +860,9 @@ dump_spacemap(objset_t *os, space_map_t *sm)
alloc += entry_run;
else
alloc -= entry_run;
+ entry_id++;
}
- if ((uint64_t)alloc != space_map_allocated(sm)) {
+ if (alloc != space_map_allocated(sm)) {
(void) printf("space_map_object alloc (%lld) INCONSISTENT "
"with space map summary (%lld)\n",
(longlong_t)space_map_allocated(sm), (longlong_t)alloc);
@@ -921,11 +926,8 @@ dump_metaslab(metaslab_t *msp)
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
}
- if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
- ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
-
- dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
- }
+ ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
+ dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
}
static void
@@ -3096,6 +3098,8 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
ddt_entry_t dde;
int error;
+ ASSERT(!dump_opt['L']);
+
bzero(&ddb, sizeof (ddb));
while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
blkptr_t blk;
@@ -3119,12 +3123,10 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
zcb->zcb_dedup_blocks++;
}
}
- if (!dump_opt['L']) {
- ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
- ddt_enter(ddt);
- VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
- ddt_exit(ddt);
- }
+ ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+ ddt_enter(ddt);
+ VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+ ddt_exit(ddt);
}
ASSERT(error == ENOENT);
@@ -3166,6 +3168,9 @@ claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
static void
zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
{
+ if (dump_opt['L'])
+ return;
+
if (spa->spa_vdev_removal == NULL)
return;
@@ -3257,7 +3262,6 @@ zdb_load_obsolete_counts(vdev_t *vd)
space_map_t *prev_obsolete_sm = NULL;
VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
- space_map_update(prev_obsolete_sm);
vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
prev_obsolete_sm);
space_map_close(prev_obsolete_sm);
@@ -3351,9 +3355,9 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
- space_map_update(checkpoint_sm);
VERIFY0(space_map_iterate(checkpoint_sm,
+ space_map_length(checkpoint_sm),
checkpoint_sm_exclude_entry_cb, &cseea));
space_map_close(checkpoint_sm);
@@ -3363,6 +3367,8 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
static void
zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
{
+ ASSERT(!dump_opt['L']);
+
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
@@ -3459,6 +3465,8 @@ load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
static void
zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
{
+ ASSERT(!dump_opt['L']);
+
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
@@ -3505,67 +3513,63 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
zcb->zcb_spa = spa;
- if (!dump_opt['L']) {
- dsl_pool_t *dp = spa->spa_dsl_pool;
- vdev_t *rvd = spa->spa_root_vdev;
+ if (dump_opt['L'])
+ return;
- /*
- * We are going to be changing the meaning of the metaslab's
- * ms_allocatable. Ensure that the allocator doesn't try to
- * use the tree.
- */
- spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
- spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ vdev_t *rvd = spa->spa_root_vdev;
- zcb->zcb_vd_obsolete_counts =
- umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
- UMEM_NOFAIL);
+ /*
+ * We are going to be changing the meaning of the metaslab's
+ * ms_allocatable. Ensure that the allocator doesn't try to
+ * use the tree.
+ */
+ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
+ spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
- /*
- * For leak detection, we overload the ms_allocatable trees
- * to contain allocated segments instead of free segments.
- * As a result, we can't use the normal metaslab_load/unload
- * interfaces.
- */
- zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
- load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
+ zcb->zcb_vd_obsolete_counts =
+ umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
+ UMEM_NOFAIL);
- /*
- * On load_concrete_ms_allocatable_trees() we loaded all the
- * allocated entries from the ms_sm to the ms_allocatable for
- * each metaslab. If the pool has a checkpoint or is in the
- * middle of discarding a checkpoint, some of these blocks
- * may have been freed but their ms_sm may not have been
- * updated because they are referenced by the checkpoint. In
- * order to avoid false-positives during leak-detection, we
- * go through the vdev's checkpoint space map and exclude all
- * its entries from their relevant ms_allocatable.
- *
- * We also aggregate the space held by the checkpoint and add
- * it to zcb_checkpoint_size.
- *
- * Note that at this point we are also verifying that all the
- * entries on the checkpoint_sm are marked as allocated in
- * the ms_sm of their relevant metaslab.
- * [see comment in checkpoint_sm_exclude_entry_cb()]
- */
- zdb_leak_init_exclude_checkpoint(spa, zcb);
+ /*
+ * For leak detection, we overload the ms_allocatable trees
+ * to contain allocated segments instead of free segments.
+ * As a result, we can't use the normal metaslab_load/unload
+ * interfaces.
+ */
+ zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
+ load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
- /* for cleaner progress output */
- (void) fprintf(stderr, "\n");
+ /*
+ * On load_concrete_ms_allocatable_trees() we loaded all the
+ * allocated entries from the ms_sm to the ms_allocatable for
+ * each metaslab. If the pool has a checkpoint or is in the
+ * middle of discarding a checkpoint, some of these blocks
+ * may have been freed but their ms_sm may not have been
+ * updated because they are referenced by the checkpoint. In
+ * order to avoid false-positives during leak-detection, we
+ * go through the vdev's checkpoint space map and exclude all
+ * its entries from their relevant ms_allocatable.
+ *
+ * We also aggregate the space held by the checkpoint and add
+ * it to zcb_checkpoint_size.
+ *
+ * Note that at this point we are also verifying that all the
+ * entries on the checkpoint_sm are marked as allocated in
+ * the ms_sm of their relevant metaslab.
+ * [see comment in checkpoint_sm_exclude_entry_cb()]
+ */
+ zdb_leak_init_exclude_checkpoint(spa, zcb);
+ ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
- if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
- ASSERT(spa_feature_is_enabled(spa,
- SPA_FEATURE_DEVICE_REMOVAL));
- (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
- increment_indirect_mapping_cb, zcb, NULL);
- }
- } else {
- /*
- * If leak tracing is disabled, we still need to consider
- * any checkpointed space in our space verification.
- */
- zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa);
+ /* for cleaner progress output */
+ (void) fprintf(stderr, "\n");
+
+ if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_DEVICE_REMOVAL));
+ (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
+ increment_indirect_mapping_cb, zcb, NULL);
}
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
@@ -3646,52 +3650,58 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
static boolean_t
zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
{
+ if (dump_opt['L'])
+ return (B_FALSE);
+
boolean_t leaks = B_FALSE;
- if (!dump_opt['L']) {
- vdev_t *rvd = spa->spa_root_vdev;
- for (unsigned c = 0; c < rvd->vdev_children; c++) {
- vdev_t *vd = rvd->vdev_child[c];
- metaslab_group_t *mg = vd->vdev_mg;
-
- if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
- leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
- }
- for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
- ASSERT3P(mg, ==, msp->ms_group);
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (unsigned c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+#if DEBUG
+ metaslab_group_t *mg = vd->vdev_mg;
+#endif
- /*
- * ms_allocatable has been overloaded
- * to contain allocated segments. Now that
- * we finished traversing all blocks, any
- * block that remains in the ms_allocatable
- * represents an allocated block that we
- * did not claim during the traversal.
- * Claimed blocks would have been removed
- * from the ms_allocatable. For indirect
- * vdevs, space remaining in the tree
- * represents parts of the mapping that are
- * not referenced, which is not a bug.
- */
- if (vd->vdev_ops == &vdev_indirect_ops) {
- range_tree_vacate(msp->ms_allocatable,
- NULL, NULL);
- } else {
- range_tree_vacate(msp->ms_allocatable,
- zdb_leak, vd);
- }
+ if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
+ leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
+ }
- if (msp->ms_loaded) {
- msp->ms_loaded = B_FALSE;
- }
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ ASSERT3P(mg, ==, msp->ms_group);
+
+ /*
+ * ms_allocatable has been overloaded
+ * to contain allocated segments. Now that
+ * we finished traversing all blocks, any
+ * block that remains in the ms_allocatable
+ * represents an allocated block that we
+ * did not claim during the traversal.
+ * Claimed blocks would have been removed
+ * from the ms_allocatable. For indirect
+ * vdevs, space remaining in the tree
+ * represents parts of the mapping that are
+ * not referenced, which is not a bug.
+ */
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ range_tree_vacate(msp->ms_allocatable,
+ NULL, NULL);
+ } else {
+ range_tree_vacate(msp->ms_allocatable,
+ zdb_leak, vd);
+ }
+
+ if (msp->ms_loaded) {
+ msp->ms_loaded = B_FALSE;
}
}
- umem_free(zcb->zcb_vd_obsolete_counts,
- rvd->vdev_children * sizeof (uint32_t *));
- zcb->zcb_vd_obsolete_counts = NULL;
}
+
+ umem_free(zcb->zcb_vd_obsolete_counts,
+ rvd->vdev_children * sizeof (uint32_t *));
+ zcb->zcb_vd_obsolete_counts = NULL;
+
return (leaks);
}
@@ -3730,13 +3740,18 @@ dump_block_stats(spa_t *spa)
!dump_opt['L'] ? "nothing leaked " : "");
/*
- * Load all space maps as SM_ALLOC maps, then traverse the pool
- * claiming each block we discover. If the pool is perfectly
- * consistent, the space maps will be empty when we're done.
- * Anything left over is a leak; any block we can't claim (because
- * it's not part of any space map) is a double allocation,
- * reference to a freed block, or an unclaimed log block.
+ * When leak detection is enabled we load all space maps as SM_ALLOC
+ * maps, then traverse the pool claiming each block we discover. If
+ * the pool is perfectly consistent, the segment trees will be empty
+ * when we're done. Anything left over is a leak; any block we can't
+ * claim (because it's not part of any space map) is a double
+ * allocation, reference to a freed block, or an unclaimed log block.
+ *
+ * When leak detection is disabled (-L option) we still traverse the
+ * pool claiming each block we discover, but we skip opening any space
+ * maps.
*/
+ bzero(&zcb, sizeof (zdb_cb_t));
zdb_leak_init(spa, &zcb);
/*
@@ -3815,11 +3830,10 @@ dump_block_stats(spa_t *spa)
total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
- if (total_found == total_alloc) {
- if (!dump_opt['L'])
- (void) printf("\n\tNo leaks (block sum matches space"
- " maps exactly)\n");
- } else {
+ if (total_found == total_alloc && !dump_opt['L']) {
+ (void) printf("\n\tNo leaks (block sum matches space"
+ " maps exactly)\n");
+ } else if (!dump_opt['L']) {
(void) printf("block traversal size %llu != alloc %llu "
"(%s %lld)\n",
(u_longlong_t)total_found,
@@ -4159,7 +4173,6 @@ verify_device_removal_feature_counts(spa_t *spa)
spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object,
0, vd->vdev_asize, 0));
- space_map_update(prev_obsolete_sm);
dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
(void) printf("\n");
space_map_close(prev_obsolete_sm);
@@ -4365,7 +4378,8 @@ verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
 * their respective ms_allocatable trees should not contain them.
*/
mutex_enter(&ms->ms_lock);
- range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
+ range_tree_verify_not_present(ms->ms_allocatable,
+ sme->sme_offset, sme->sme_run);
mutex_exit(&ms->ms_lock);
return (0);
@@ -4428,7 +4442,6 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
checkpoint_sm_obj, 0, current_vd->vdev_asize,
current_vd->vdev_ashift));
- space_map_update(checkpoint_sm);
verify_checkpoint_sm_entry_cb_arg_t vcsec;
vcsec.vcsec_vd = ckpoint_vd;
@@ -4436,6 +4449,7 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
vcsec.vcsec_num_entries =
space_map_length(checkpoint_sm) / sizeof (uint64_t);
VERIFY0(space_map_iterate(checkpoint_sm,
+ space_map_length(checkpoint_sm),
verify_checkpoint_sm_entry_cb, &vcsec));
dump_spacemap(current->spa_meta_objset, checkpoint_sm);
space_map_close(checkpoint_sm);
@@ -4515,7 +4529,7 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
* are part of the checkpoint were freed by mistake.
*/
range_tree_walk(ckpoint_msp->ms_allocatable,
- (range_tree_func_t *)range_tree_verify,
+ (range_tree_func_t *)range_tree_verify_not_present,
current_msp->ms_allocatable);
}
}
@@ -4527,6 +4541,8 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
static void
verify_checkpoint_blocks(spa_t *spa)
{
+ ASSERT(!dump_opt['L']);
+
spa_t *checkpoint_spa;
char *checkpoint_pool;
nvlist_t *config = NULL;
@@ -4592,7 +4608,6 @@ dump_leftover_checkpoint_blocks(spa_t *spa)
VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
- space_map_update(checkpoint_sm);
dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
space_map_close(checkpoint_sm);
}
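
Editor's note: the dump_spacemap() hunks above replace the printed index (previously offset / sizeof (word)) with a running entry_id, because two-word entries would otherwise make the printed indices skip numbers. Below is a minimal standalone sketch of that counting scheme; is_debug_entry() and is_two_word_entry() are hypothetical stand-ins for the real decoders (sm_entry_is_debug()/sm_entry_is_double_word()) and do not reflect the actual on-disk bit layout.

#include <stdio.h>
#include <stdint.h>

/* hypothetical decoders; the real bit layout lives in sys/space_map.h */
static int is_debug_entry(uint64_t w)    { return ((w >> 63) & 1) != 0; }
static int is_two_word_entry(uint64_t w) { return ((w >> 62) & 1) != 0; }

static void
walk_entries(const uint64_t *words, uint64_t nwords)
{
	uint64_t entry_id = 0;

	for (uint64_t i = 0; i < nwords; entry_id++) {
		uint64_t w = words[i];
		uint64_t entry_words =
		    (!is_debug_entry(w) && is_two_word_entry(w)) ? 2 : 1;

		/*
		 * Once two-word entries appear, the word offset and the
		 * entry number diverge, which is why zdb now prints a
		 * sequential entry_id instead of the offset.
		 */
		(void) printf("[%6llu] word offset %llu, %llu word(s)\n",
		    (unsigned long long)entry_id, (unsigned long long)i,
		    (unsigned long long)entry_words);

		i += entry_words;
	}
}

int
main(void)
{
	/* debug entry, two-word entry (plus its payload word), one-word entry */
	uint64_t words[] = { 1ULL << 63, 1ULL << 62, 0, 0x1234 };

	walk_entries(words, sizeof (words) / sizeof (words[0]));
	return (0);
}
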
diff --git a/man/man1m/zdb.1m b/man/man1m/zdb.1m
index 63cfc5d7f1b8..ca771c24d787 100644
--- a/man/man1m/zdb.1m
+++ b/man/man1m/zdb.1m
@@ -10,7 +10,7 @@
.\"
.\"
.\" Copyright 2012, Richard Lowe.
-.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\"
.Dd April 14, 2017
@@ -187,7 +187,7 @@ If the
.Fl u
option is also specified, also display the uberblocks on this device.
.It Fl L
-Disable leak tracing and the loading of space maps.
+Disable leak detection and the loading of space maps.
By default,
.Nm
verifies that all non-free blocks are referenced, which can be very expensive.
diff --git a/uts/common/fs/zfs/metaslab.c b/uts/common/fs/zfs/metaslab.c
index c92297c0fd2a..4552b809ed35 100644
--- a/uts/common/fs/zfs/metaslab.c
+++ b/uts/common/fs/zfs/metaslab.c
@@ -489,45 +489,62 @@ metaslab_compare(const void *x1, const void *x2)
return (AVL_CMP(m1->ms_start, m2->ms_start));
}
+uint64_t
+metaslab_allocated_space(metaslab_t *msp)
+{
+ return (msp->ms_allocated_space);
+}
+
/*
* Verify that the space accounting on disk matches the in-core range_trees.
*/
-void
+static void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
- uint64_t allocated = 0;
+ uint64_t allocating = 0;
uint64_t sm_free_space, msp_free_space;
ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(!msp->ms_condensing);
if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
return;
/*
* We can only verify the metaslab space when we're called
- * from syncing context with a loaded metaslab that has an allocated
- * space map. Calling this in non-syncing context does not
- * provide a consistent view of the metaslab since we're performing
- * allocations in the future.
+ * from syncing context with a loaded metaslab that has an
+ * allocated space map. Calling this in non-syncing context
+ * does not provide a consistent view of the metaslab since
+ * we're performing allocations in the future.
*/
if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
!msp->ms_loaded)
return;
- sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
- space_map_alloc_delta(msp->ms_sm);
+ /*
+ * Even though the smp_alloc field can get negative (e.g.
+ * see vdev_checkpoint_sm), that should never be the case
+	 * when it comes to a metaslab's space map.
+ */
+ ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
+
+ sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
/*
- * Account for future allocations since we would have already
- * deducted that space from the ms_freetree.
+ * Account for future allocations since we would have
+ * already deducted that space from the ms_allocatable.
*/
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
- allocated +=
+ allocating +=
range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
}
- msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
+ ASSERT3U(msp->ms_deferspace, ==,
+ range_tree_space(msp->ms_defer[0]) +
+ range_tree_space(msp->ms_defer[1]));
+
+ msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
msp->ms_deferspace + range_tree_space(msp->ms_freed);
VERIFY3U(sm_free_space, ==, msp_free_space);
@@ -832,6 +849,7 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
+ ASSERT(msp != NULL);
/* skip if not active or not a member */
if (msp->ms_sm == NULL || msp->ms_group != mg)
@@ -1445,6 +1463,203 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
* ==========================================================================
*/
+static void
+metaslab_aux_histograms_clear(metaslab_t *msp)
+{
+ /*
+ * Auxiliary histograms are only cleared when resetting them,
+ * which can only happen while the metaslab is loaded.
+ */
+ ASSERT(msp->ms_loaded);
+
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
+}
+
+static void
+metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
+ range_tree_t *rt)
+{
+ /*
+ * This is modeled after space_map_histogram_add(), so refer to that
+ * function for implementation details. We want this to work like
+ * the space map histogram, and not the range tree histogram, as we
+ * are essentially constructing a delta that will be later subtracted
+ * from the space map histogram.
+ */
+ int idx = 0;
+ for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ ASSERT3U(i, >=, idx + shift);
+ histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
+
+ if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+ ASSERT3U(idx + shift, ==, i);
+ idx++;
+ ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+ }
+ }
+}
+
+/*
+ * Called at every sync pass in which the metaslab gets synced.
+ *
+ * The reason is that we want our auxiliary histograms to be updated
+ * whenever the metaslab's space map histogram is updated. This way
+ * we stay consistent about which parts of the metaslab space map's
+ * histogram are currently not available for allocations (e.g. because
+ * they are in the defer, freed, and freeing trees).
+ */
+static void
+metaslab_aux_histograms_update(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(sm != NULL);
+
+ /*
+ * This is similar to the metaslab's space map histogram updates
+ * that take place in metaslab_sync(). The only difference is that
+ * we only care about segments that haven't made it into the
+ * ms_allocatable tree yet.
+ */
+ if (msp->ms_loaded) {
+ metaslab_aux_histograms_clear(msp);
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freed);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ metaslab_aux_histogram_add(msp->ms_deferhist[t],
+ sm->sm_shift, msp->ms_defer[t]);
+ }
+ }
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freeing);
+}
+
+/*
+ * Called every time we are done syncing (writing to) the metaslab,
+ * i.e. at the end of each sync pass.
+ * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
+ */
+static void
+metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ space_map_t *sm = msp->ms_sm;
+
+ if (sm == NULL) {
+ /*
+ * We came here from metaslab_init() when creating/opening a
+ * pool, looking at a metaslab that hasn't had any allocations
+ * yet.
+ */
+ return;
+ }
+
+ /*
+ * This is similar to the actions that we take for the ms_freed
+ * and ms_defer trees in metaslab_sync_done().
+ */
+ uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
+ if (defer_allowed) {
+ bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_synchist));
+ } else {
+ bzero(msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_deferhist[hist_index]));
+ }
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+}
+
+/*
+ * Ensure that the metaslab's weight and fragmentation are consistent
+ * with the contents of the histogram (either the range tree's histogram
+ * or the space map's, depending on whether the metaslab is loaded).
+ */
+static void
+metaslab_verify_weight_and_frag(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ /* see comment in metaslab_verify_unflushed_changes() */
+ if (msp->ms_group == NULL)
+ return;
+
+ /*
+ * Devices being removed always return a weight of 0 and leave
+ * fragmentation and ms_max_size as is - there is nothing for
+ * us to verify here.
+ */
+ vdev_t *vd = msp->ms_group->mg_vd;
+ if (vd->vdev_removing)
+ return;
+
+ /*
+ * If the metaslab is dirty it probably means that we've done
+ * some allocations or frees that have changed our histograms
+ * and thus the weight.
+ */
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (txg_list_member(&vd->vdev_ms_list, msp, t))
+ return;
+ }
+
+ /*
+ * This verification checks that our in-memory state is consistent
+ * with what's on disk. If the pool is read-only then there aren't
+ * any changes and we just have the initially-loaded state.
+ */
+ if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
+ return;
+
+	/* some extra verification of the in-core tree, if possible */
+ if (msp->ms_loaded) {
+ range_tree_stat_verify(msp->ms_allocatable);
+ VERIFY(space_map_histogram_verify(msp->ms_sm,
+ msp->ms_allocatable));
+ }
+
+ uint64_t weight = msp->ms_weight;
+ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+ boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
+ uint64_t frag = msp->ms_fragmentation;
+ uint64_t max_segsize = msp->ms_max_size;
+
+ msp->ms_weight = 0;
+ msp->ms_fragmentation = 0;
+ msp->ms_max_size = 0;
+
+ /*
+ * This function is used for verification purposes. Regardless of
+ * whether metaslab_weight() thinks this metaslab should be active or
+ * not, we want to ensure that the actual weight (and therefore the
+ * value of ms_weight) would be the same if it was to be recalculated
+ * at this point.
+ */
+ msp->ms_weight = metaslab_weight(msp) | was_active;
+
+ VERIFY3U(max_segsize, ==, msp->ms_max_size);
+
+ /*
+ * If the weight type changed then there is no point in doing
+ * verification. Revert fields to their original values.
+ */
+ if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
+ (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
+ msp->ms_fragmentation = frag;
+ msp->ms_weight = weight;
+ return;
+ }
+
+ VERIFY3U(msp->ms_fragmentation, ==, frag);
+ VERIFY3U(msp->ms_weight, ==, weight);
+}
+
/*
* Wait for any in-progress metaslab loads to complete.
*/
@@ -1466,47 +1681,94 @@ metaslab_load_impl(metaslab_t *msp)
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
/*
- * Nobody else can manipulate a loading metaslab, so it's now safe
- * to drop the lock. This way we don't have to hold the lock while
- * reading the spacemap from disk.
+ * We temporarily drop the lock to unblock other operations while we
+ * are reading the space map. Therefore, metaslab_sync() and
+ * metaslab_sync_done() can run at the same time as we do.
+ *
+ * metaslab_sync() can append to the space map while we are loading.
+ * Therefore we load only entries that existed when we started the
+ * load. Additionally, metaslab_sync_done() has to wait for the load
+ * to complete because there are potential races like metaslab_load()
+ * loading parts of the space map that are currently being appended
+ * by metaslab_sync(). If we didn't, the ms_allocatable would have
+ * entries that metaslab_sync_done() would try to re-add later.
+ *
+ * That's why before dropping the lock we remember the synced length
+ * of the metaslab and read up to that point of the space map,
+ * ignoring entries appended by metaslab_sync() that happen after we
+ * drop the lock.
*/
+ uint64_t length = msp->ms_synced_length;
mutex_exit(&msp->ms_lock);
- /*
- * If the space map has not been allocated yet, then treat
- * all the space in the metaslab as free and add it to ms_allocatable.
- */
if (msp->ms_sm != NULL) {
- error = space_map_load(msp->ms_sm, msp->ms_allocatable,
- SM_FREE);
+ error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
+ SM_FREE, length);
} else {
+ /*
+ * The space map has not been allocated yet, so treat
+ * all the space in the metaslab as free and add it to the
+ * ms_allocatable tree.
+ */
range_tree_add(msp->ms_allocatable,
msp->ms_start, msp->ms_size);
}
+ /*
+ * We need to grab the ms_sync_lock to prevent metaslab_sync() from
+ * changing the ms_sm and the metaslab's range trees while we are
+ * about to use them and populate the ms_allocatable. The ms_lock
+ * is insufficient for this because metaslab_sync() doesn't hold
+ * the ms_lock while writing the ms_checkpointing tree to disk.
+ */
+ mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
+ ASSERT(!msp->ms_condensing);
- if (error != 0)
+ if (error != 0) {
+ mutex_exit(&msp->ms_sync_lock);
return (error);
+ }
ASSERT3P(msp->ms_group, !=, NULL);
msp->ms_loaded = B_TRUE;
/*
- * If the metaslab already has a spacemap, then we need to
- * remove all segments from the defer tree; otherwise, the
- * metaslab is completely empty and we can skip this.
+ * The ms_allocatable contains the segments that exist in the
+ * ms_defer trees [see ms_synced_length]. Thus we need to remove
+ * them from ms_allocatable as they will be added again in
+ * metaslab_sync_done().
*/
- if (msp->ms_sm != NULL) {
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_walk(msp->ms_defer[t],
- range_tree_remove, msp->ms_allocatable);
- }
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, msp->ms_allocatable);
}
+
+ /*
+ * Call metaslab_recalculate_weight_and_sort() now that the
+ * metaslab is loaded so we get the metaslab's real weight.
+ *
+ * Unless this metaslab was created with older software and
+ * has not yet been converted to use segment-based weight, we
+	 * expect the new weight to be better than or equal to the weight
+ * that the metaslab had while it was not loaded. This is
+ * because the old weight does not take into account the
+ * consolidation of adjacent segments between TXGs. [see
+ * comment for ms_synchist and ms_deferhist[] for more info]
+ */
+ uint64_t weight = msp->ms_weight;
+ metaslab_recalculate_weight_and_sort(msp);
+ if (!WEIGHT_IS_SPACEBASED(weight))
+ ASSERT3U(weight, <=, msp->ms_weight);
msp->ms_max_size = metaslab_block_maxsize(msp);
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ metaslab_verify_space(msp, spa_syncing_txg(spa));
+ mutex_exit(&msp->ms_sync_lock);
+
return (0);
}
@@ -1523,6 +1785,7 @@ metaslab_load(metaslab_t *msp)
if (msp->ms_loaded)
return (0);
VERIFY(!msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
msp->ms_loading = B_TRUE;
int error = metaslab_load_impl(msp);
@@ -1536,10 +1799,29 @@ void
metaslab_unload(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ metaslab_verify_weight_and_frag(msp);
+
range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
+
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
msp->ms_max_size = 0;
+
+ /*
+ * We explicitly recalculate the metaslab's weight based on its space
+	 * map (as it is now not loaded). We want unloaded metaslabs to always
+	 * have their weights calculated from the space map histograms, while
+	 * loaded ones have it calculated from their in-core range tree
+	 * [see metaslab_load()]. This way, the weight reflects the information
+	 * available in-core, whether it is loaded or not.
+	 *
+	 * If ms_group == NULL, it means that we came here from metaslab_fini(),
+ * at which point it doesn't make sense for us to do the recalculation
+ * and the sorting.
+ */
+ if (msp->ms_group != NULL)
+ metaslab_recalculate_weight_and_sort(msp);
}
static void
@@ -1579,6 +1861,13 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
/*
* We only open space map objects that already exist. All others
* will be opened when we finally allocate an object for it.
+ *
+ * Note:
+ * When called from vdev_expand(), we can't call into the DMU as
+ * we are holding the spa_config_lock as a writer and we would
+	 * deadlock [see relevant comment in vdev_metaslab_init()]. In
+ * that case, the object parameter is zero though, so we won't
+ * call into the DMU.
*/
if (object != 0) {
error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
@@ -1590,14 +1879,17 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
}
ASSERT(ms->ms_sm != NULL);
+ ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
+ ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
}
/*
- * We create the main range tree here, but we don't create the
+ * We create the ms_allocatable here, but we don't create the
* other range trees until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
- * addition of new space; and for debugging, it ensures that we'd
- * data fault on any attempt to use this metaslab before it's ready.
+ * addition of new space; and for debugging, it ensures that
+ * we'd data fault on any attempt to use this metaslab before
+ * it's ready.
*/
ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms);
metaslab_group_add(mg, ms);
@@ -1613,8 +1905,11 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
* out this txg. This ensures that we don't attempt to allocate
* from it before we have initialized it completely.
*/
- if (txg <= TXG_INITIAL)
+ if (txg <= TXG_INITIAL) {
metaslab_sync_done(ms, 0);
+ metaslab_space_update(vd, mg->mg_class,
+ metaslab_allocated_space(ms), 0, 0);
+ }
/*
* If metaslab_debug_load is set and we're initializing a metaslab
@@ -1648,7 +1943,7 @@ metaslab_fini(metaslab_t *msp)
mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
metaslab_space_update(vd, mg->mg_class,
- -space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
+ -metaslab_allocated_space(msp), 0, -msp->ms_size);
space_map_close(msp->ms_sm);
@@ -1669,6 +1964,9 @@ metaslab_fini(metaslab_t *msp)
range_tree_destroy(msp->ms_checkpointing);
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
+
mutex_exit(&msp->ms_lock);
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
@@ -1684,7 +1982,7 @@ metaslab_fini(metaslab_t *msp)
* This table defines a segment size based fragmentation metric that will
* allow each metaslab to derive its own fragmentation value. This is done
* by calculating the space in each bucket of the spacemap histogram and
- * multiplying that by the fragmetation metric in this table. Doing
+ * multiplying that by the fragmentation metric in this table. Doing
* this for all buckets and dividing it by the total amount of free
* space in this metaslab (i.e. the total free space in all buckets) gives
* us the fragmentation metric. This means that a high fragmentation metric
@@ -1719,10 +2017,10 @@ int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
};
/*
- * Calclate the metaslab's fragmentation metric. A return value
- * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
- * not support this metric. Otherwise, the return value should be in the
- * range [0, 100].
+ * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
+ * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
+ * been upgraded and does not support this metric. Otherwise, the return
+ * value should be in the range [0, 100].
*/
static void
metaslab_set_fragmentation(metaslab_t *msp)
@@ -1815,7 +2113,7 @@ metaslab_space_weight(metaslab_t *msp)
/*
* The baseline weight is the metaslab's free space.
*/
- space = msp->ms_size - space_map_allocated(msp->ms_sm);
+ space = msp->ms_size - metaslab_allocated_space(msp);
if (metaslab_fragmentation_factor_enabled &&
msp->ms_fragmentation != ZFS_FRAG_INVALID) {
@@ -1919,14 +2217,38 @@ metaslab_weight_from_range_tree(metaslab_t *msp)
static uint64_t
metaslab_weight_from_spacemap(metaslab_t *msp)
{
- uint64_t weight = 0;
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(!msp->ms_loaded);
+ ASSERT(sm != NULL);
+ ASSERT3U(space_map_object(sm), !=, 0);
+ ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+ /*
+ * Create a joint histogram from all the segments that have made
+ * it to the metaslab's space map histogram, that are not yet
+ * available for allocation because they are still in the freeing
+ * pipeline (e.g. freeing, freed, and defer trees). Then subtract
+ * these segments from the space map's histogram to get a more
+ * accurate weight.
+ */
+ uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+ deferspace_histogram[i] += msp->ms_synchist[i];
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ deferspace_histogram[i] += msp->ms_deferhist[t][i];
+ }
+ }
+
+ uint64_t weight = 0;
for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
- if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
- WEIGHT_SET_COUNT(weight,
- msp->ms_sm->sm_phys->smp_histogram[i]);
- WEIGHT_SET_INDEX(weight, i +
- msp->ms_sm->sm_shift);
+ ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
+ deferspace_histogram[i]);
+ uint64_t count =
+ sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
+ if (count != 0) {
+ WEIGHT_SET_COUNT(weight, count);
+ WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
WEIGHT_SET_ACTIVE(weight, 0);
break;
}
@@ -1951,7 +2273,7 @@ metaslab_segment_weight(metaslab_t *msp)
/*
* The metaslab is completely free.
*/
- if (space_map_allocated(msp->ms_sm) == 0) {
+ if (metaslab_allocated_space(msp) == 0) {
int idx = highbit64(msp->ms_size) - 1;
int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
@@ -1973,7 +2295,7 @@ metaslab_segment_weight(metaslab_t *msp)
/*
* If the metaslab is fully allocated then just make the weight 0.
*/
- if (space_map_allocated(msp->ms_sm) == msp->ms_size)
+ if (metaslab_allocated_space(msp) == msp->ms_size)
return (0);
/*
* If the metaslab is already loaded, then use the range tree to
@@ -2054,6 +2376,8 @@ metaslab_weight(metaslab_t *msp)
*/
if (msp->ms_loaded)
msp->ms_max_size = metaslab_block_maxsize(msp);
+ else
+ ASSERT0(msp->ms_max_size);
/*
* Segment-based weighting requires space map histogram support.
@@ -2069,6 +2393,15 @@ metaslab_weight(metaslab_t *msp)
return (weight);
}
+void
+metaslab_recalculate_weight_and_sort(metaslab_t *msp)
+{
+	/* note: we preserve the mask (e.g. indication of primary, etc.) */
+ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+ metaslab_group_sort(msp->ms_group, msp,
+ metaslab_weight(msp) | was_active);
+}
+
static int
metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
int allocator, uint64_t activation_weight)
@@ -2453,17 +2786,17 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
VERIFY(txg <= spa_final_dirty_txg(spa));
/*
- * The only state that can actually be changing concurrently with
- * metaslab_sync() is the metaslab's ms_allocatable. No other
- * thread can be modifying this txg's alloc, freeing,
+ * The only state that can actually be changing concurrently
+ * with metaslab_sync() is the metaslab's ms_allocatable. No
+ * other thread can be modifying this txg's alloc, freeing,
* freed, or space_map_phys_t. We drop ms_lock whenever we
- * could call into the DMU, because the DMU can call down to us
- * (e.g. via zio_free()) at any time.
+ * could call into the DMU, because the DMU can call down to
+ * us (e.g. via zio_free()) at any time.
*
* The spa_vdev_remove_thread() can be reading metaslab state
- * concurrently, and it is locked out by the ms_sync_lock. Note
- * that the ms_lock is insufficient for this, because it is dropped
- * by space_map_write().
+ * concurrently, and it is locked out by the ms_sync_lock.
+ * Note that the ms_lock is insufficient for this, because it
+ * is dropped by space_map_write().
*/
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
@@ -2475,7 +2808,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
msp->ms_start, msp->ms_size, vd->vdev_ashift));
+
ASSERT(msp->ms_sm != NULL);
+ ASSERT0(metaslab_allocated_space(msp));
}
if (!range_tree_is_empty(msp->ms_checkpointing) &&
@@ -2523,6 +2858,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
mutex_enter(&msp->ms_lock);
}
+ msp->ms_allocated_space += range_tree_space(alloctree);
+ ASSERT3U(msp->ms_allocated_space, >=,
+ range_tree_space(msp->ms_freeing));
+ msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
+
if (!range_tree_is_empty(msp->ms_checkpointing)) {
ASSERT(spa_has_checkpoint(spa));
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
@@ -2536,14 +2876,13 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_write(vd->vdev_checkpoint_sm,
msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
mutex_enter(&msp->ms_lock);
- space_map_update(vd->vdev_checkpoint_sm);
spa->spa_checkpoint_info.sci_dspace +=
range_tree_space(msp->ms_checkpointing);
vd->vdev_stat.vs_checkpoint_space +=
range_tree_space(msp->ms_checkpointing);
ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
- -vd->vdev_checkpoint_sm->sm_alloc);
+ -space_map_allocated(vd->vdev_checkpoint_sm));
range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
}
@@ -2588,6 +2927,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* time we load the space map.
*/
space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
+ metaslab_aux_histograms_update(msp);
metaslab_group_histogram_add(mg, msp);
metaslab_group_histogram_verify(mg);
@@ -2595,16 +2935,18 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
/*
* For sync pass 1, we avoid traversing this txg's free range tree
- * and instead will just swap the pointers for freeing and
- * freed. We can safely do this since the freed_tree is
- * guaranteed to be empty on the initial pass.
+ * and instead will just swap the pointers for freeing and freed.
+ * We can safely do this since the freed_tree is guaranteed to be
+ * empty on the initial pass.
*/
if (spa_sync_pass(spa) == 1) {
range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
+ ASSERT0(msp->ms_allocated_this_txg);
} else {
range_tree_vacate(msp->ms_freeing,
range_tree_add, msp->ms_freed);
}
+ msp->ms_allocated_this_txg += range_tree_space(alloctree);
range_tree_vacate(alloctree, NULL, NULL);
ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
@@ -2682,7 +3024,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
}
defer_delta = 0;
- alloc_delta = space_map_alloc_delta(msp->ms_sm);
+ alloc_delta = msp->ms_allocated_this_txg -
+ range_tree_space(msp->ms_freed);
if (defer_allowed) {
defer_delta = range_tree_space(msp->ms_freed) -
range_tree_space(*defer_tree);
@@ -2714,7 +3057,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
msp->ms_loaded ? range_tree_add : NULL,
msp->ms_allocatable);
}
- space_map_update(msp->ms_sm);
+
+ msp->ms_synced_length = space_map_length(msp->ms_sm);
msp->ms_deferspace += defer_delta;
ASSERT3S(msp->ms_deferspace, >=, 0);
@@ -2726,6 +3070,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
*/
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
}
+ metaslab_aux_histograms_update_done(msp, defer_allowed);
if (msp->ms_new) {
msp->ms_new = B_FALSE;
@@ -2733,12 +3078,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
mg->mg_ms_ready++;
mutex_exit(&mg->mg_lock);
}
+
/*
- * Calculate the new weights before unloading any metaslabs.
- * This will give us the most accurate weighting.
+ * Re-sort metaslab within its group now that we've adjusted
+ * its allocatable space.
*/
- metaslab_group_sort(mg, msp, metaslab_weight(msp) |
- (msp->ms_weight & METASLAB_ACTIVE_MASK));
+ metaslab_recalculate_weight_and_sort(msp);
/*
* If the metaslab is loaded and we've not tried to load or allocate
@@ -2765,6 +3110,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
ASSERT0(range_tree_space(msp->ms_freed));
ASSERT0(range_tree_space(msp->ms_checkpointing));
+ msp->ms_allocated_this_txg = 0;
mutex_exit(&msp->ms_lock);
}
@@ -4020,7 +4366,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
zio_alloc_list_t *zal, zio_t *zio, int allocator)
{
dva_t *dva = bp->blk_dva;
- dva_t *hintdva = hintbp->blk_dva;
+ dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
int error = 0;
ASSERT(bp->blk_birth == 0);
@@ -4187,14 +4533,16 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
mutex_enter(&msp->ms_lock);
- if (msp->ms_loaded)
- range_tree_verify(msp->ms_allocatable, offset, size);
+ if (msp->ms_loaded) {
+ range_tree_verify_not_present(msp->ms_allocatable,
+ offset, size);
+ }
- range_tree_verify(msp->ms_freeing, offset, size);
- range_tree_verify(msp->ms_checkpointing, offset, size);
- range_tree_verify(msp->ms_freed, offset, size);
+ range_tree_verify_not_present(msp->ms_freeing, offset, size);
+ range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
+ range_tree_verify_not_present(msp->ms_freed, offset, size);
for (int j = 0; j < TXG_DEFER_SIZE; j++)
- range_tree_verify(msp->ms_defer[j], offset, size);
+ range_tree_verify_not_present(msp->ms_defer[j], offset, size);
mutex_exit(&msp->ms_lock);
}
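
Editor's note: to make the weight change above concrete, metaslab_weight_from_spacemap() now subtracts the ms_synchist/ms_deferhist deltas from the space map histogram before picking the highest populated bucket. The following is a toy walk-through with made-up histogram values; it only reports the bucket index and count rather than packing them with WEIGHT_SET_COUNT()/WEIGHT_SET_INDEX(), and the array sizes are simplified stand-ins for SPACE_MAP_HISTOGRAM_SIZE and TXG_DEFER_SIZE.

#include <stdio.h>
#include <stdint.h>

#define	HIST_SIZE	32	/* stand-in for SPACE_MAP_HISTOGRAM_SIZE */
#define	DEFER_SIZE	2	/* stand-in for TXG_DEFER_SIZE */

int
main(void)
{
	/* made-up space map histogram: segment counts per power-of-two bucket */
	uint64_t smhist[HIST_SIZE] = { [3] = 10, [5] = 4, [7] = 1 };

	/* segments still in the freeing/freed pipeline (ms_synchist) ... */
	uint64_t synchist[HIST_SIZE] = { [7] = 1 };

	/* ... and segments sitting in the defer trees (ms_deferhist[]) */
	uint64_t deferhist[DEFER_SIZE][HIST_SIZE] = { { [5] = 2 }, { [3] = 1 } };

	/* build the joint "not yet allocatable" histogram */
	uint64_t delta[HIST_SIZE] = { 0 };
	for (int i = 0; i < HIST_SIZE; i++)
		delta[i] += synchist[i];
	for (int t = 0; t < DEFER_SIZE; t++)
		for (int i = 0; i < HIST_SIZE; i++)
			delta[i] += deferhist[t][i];

	/*
	 * Walk buckets from largest to smallest; the first bucket that is
	 * still populated after removing the pipeline delta drives the
	 * segment-based weight.
	 */
	for (int i = HIST_SIZE - 1; i >= 0; i--) {
		uint64_t count = smhist[i] - delta[i];
		if (count != 0) {
			(void) printf("weight bucket %d, count %llu\n",
			    i, (unsigned long long)count);
			break;
		}
	}
	return (0);
}

With these numbers, bucket 7 is fully consumed by the pipeline (1 - 1 = 0), so the weight falls back to bucket 5 with a count of 2, mirroring how an unloaded metaslab no longer over-reports space that is still sitting in ms_freeing, ms_freed, or ms_defer[].
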
diff --git a/uts/common/fs/zfs/range_tree.c b/uts/common/fs/zfs/range_tree.c
index 99bdacb87deb..0a852a9c8da7 100644
--- a/uts/common/fs/zfs/range_tree.c
+++ b/uts/common/fs/zfs/range_tree.c
@@ -311,13 +311,11 @@ range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
}
void
-range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
+range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size)
{
- range_seg_t *rs;
-
- rs = range_tree_find(rt, off, size);
+ range_seg_t *rs = range_tree_find(rt, off, size);
if (rs != NULL)
- panic("freeing free block; rs=%p", (void *)rs);
+ panic("segment already in tree; rs=%p", (void *)rs);
}
boolean_t
diff --git a/uts/common/fs/zfs/spa_checkpoint.c b/uts/common/fs/zfs/spa_checkpoint.c
index 12d50366455c..62c3137cd590 100644
--- a/uts/common/fs/zfs/spa_checkpoint.c
+++ b/uts/common/fs/zfs/spa_checkpoint.c
@@ -129,7 +129,7 @@
* uberblock would reference data in the removed device. For this reason
* and others of similar nature, we disallow the following operations that
* can change the config:
- * vdev removal and attach/detach, mirror splitting, and pool reguid.
+ * vdev removal and attach/detach, mirror splitting, and pool reguid.
*
* - As most of the checkpoint logic is implemented in the SPA and doesn't
* distinguish datasets when it comes to space accounting, having a
@@ -262,7 +262,7 @@ spa_checkpoint_accounting_verify(spa_t *spa)
if (vd->vdev_checkpoint_sm != NULL) {
ckpoint_sm_space_sum +=
- -vd->vdev_checkpoint_sm->sm_alloc;
+ -space_map_allocated(vd->vdev_checkpoint_sm);
vs_ckpoint_space_sum +=
vd->vdev_stat.vs_checkpoint_space;
ASSERT3U(ckpoint_sm_space_sum, ==,
@@ -347,7 +347,7 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
error, vd->vdev_id);
}
ASSERT0(words_after);
- ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
+ ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
space_map_free(vd->vdev_checkpoint_sm, tx);
diff --git a/uts/common/fs/zfs/space_map.c b/uts/common/fs/zfs/space_map.c
index e85a85f91346..71e1e8cabc96 100644
--- a/uts/common/fs/zfs/space_map.c
+++ b/uts/common/fs/zfs/space_map.c
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -81,20 +81,22 @@ sm_entry_is_double_word(uint64_t e)
/*
* Iterate through the space map, invoking the callback on each (non-debug)
- * space map entry.
+ * space map entry. Stop after reading 'end' bytes of the space map.
*/
int
-space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
+space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
{
- uint64_t sm_len = space_map_length(sm);
- ASSERT3U(sm->sm_blksz, !=, 0);
+ uint64_t blksz = sm->sm_blksz;
+
+ ASSERT3U(blksz, !=, 0);
+ ASSERT3U(end, <=, space_map_length(sm));
+ ASSERT0(P2PHASE(end, sizeof (uint64_t)));
- dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
+ dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
ZIO_PRIORITY_SYNC_READ);
- uint64_t blksz = sm->sm_blksz;
int error = 0;
- for (uint64_t block_base = 0; block_base < sm_len && error == 0;
+ for (uint64_t block_base = 0; block_base < end && error == 0;
block_base += blksz) {
dmu_buf_t *db;
error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
@@ -103,7 +105,7 @@ space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
return (error);
uint64_t *block_start = db->db_data;
- uint64_t block_length = MIN(sm_len - block_base, blksz);
+ uint64_t block_length = MIN(end - block_base, blksz);
uint64_t *block_end = block_start +
(block_length / sizeof (uint64_t));
@@ -186,7 +188,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
* dmu_buf_hold().
*/
uint64_t last_word_offset =
- sm->sm_phys->smp_objsize - sizeof (uint64_t);
+ sm->sm_phys->smp_length - sizeof (uint64_t);
error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
FTAG, &db, DMU_READ_NO_PREFETCH);
if (error != 0)
@@ -199,7 +201,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
uint64_t *words = db->db_data;
*nwords =
- (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
+ (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
@@ -298,8 +300,7 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
uint64_t e = buf[i];
if (sm_entry_is_debug(e)) {
- sm->sm_phys->smp_objsize -= sizeof (uint64_t);
- space_map_update(sm);
+ sm->sm_phys->smp_length -= sizeof (uint64_t);
continue;
}
@@ -354,15 +355,13 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
sm->sm_phys->smp_alloc -= entry_run;
else
sm->sm_phys->smp_alloc += entry_run;
- sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
- space_map_update(sm);
+ sm->sm_phys->smp_length -= words * sizeof (uint64_t);
}
}
if (space_map_length(sm) == 0) {
ASSERT0(error);
- ASSERT0(sm->sm_phys->smp_objsize);
- ASSERT0(sm->sm_alloc);
+ ASSERT0(space_map_allocated(sm));
}
zio_buf_free(buf, bufsz);
@@ -391,38 +390,42 @@ space_map_load_callback(space_map_entry_t *sme, void *arg)
}
/*
- * Load the space map disk into the specified range tree. Segments of maptype
- * are added to the range tree, other segment types are removed.
+ * Load the space map into the range tree, like space_map_load(), but only
+ * read the first 'length' bytes of the space map.
*/
int
-space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
+space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t length)
{
- uint64_t space;
- int err;
space_map_load_arg_t smla;
VERIFY0(range_tree_space(rt));
- space = space_map_allocated(sm);
- if (maptype == SM_FREE) {
+ if (maptype == SM_FREE)
range_tree_add(rt, sm->sm_start, sm->sm_size);
- space = sm->sm_size - space;
- }
smla.smla_rt = rt;
smla.smla_sm = sm;
smla.smla_type = maptype;
- err = space_map_iterate(sm, space_map_load_callback, &smla);
+ int err = space_map_iterate(sm, length,
+ space_map_load_callback, &smla);
- if (err == 0) {
- VERIFY3U(range_tree_space(rt), ==, space);
- } else {
+ if (err != 0)
range_tree_vacate(rt, NULL, NULL);
- }
return (err);
}
+/*
+ * Load the space map disk into the specified range tree. Segments of maptype
+ * are added to the range tree, other segment types are removed.
+ */
+int
+space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
+{
+ return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
+}
+
void
space_map_histogram_clear(space_map_t *sm)
{
@@ -506,10 +509,10 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
- dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
+ dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
sizeof (dentry), &dentry, tx);
- sm->sm_phys->smp_objsize += sizeof (dentry);
+ sm->sm_phys->smp_length += sizeof (dentry);
}
/*
@@ -541,7 +544,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
uint64_t *block_base = db->db_data;
uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
uint64_t *block_cursor = block_base +
- (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
+ (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
ASSERT3P(block_cursor, <=, block_end);
@@ -564,7 +567,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
if (block_cursor == block_end) {
dmu_buf_rele(db, tag);
- uint64_t next_word_offset = sm->sm_phys->smp_objsize;
+ uint64_t next_word_offset = sm->sm_phys->smp_length;
VERIFY0(dmu_buf_hold(sm->sm_os,
space_map_object(sm), next_word_offset,
tag, &db, DMU_READ_PREFETCH));
@@ -594,7 +597,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
SM_DEBUG_SYNCPASS_ENCODE(0) |
SM_DEBUG_TXG_ENCODE(0);
block_cursor++;
- sm->sm_phys->smp_objsize += sizeof (uint64_t);
+ sm->sm_phys->smp_length += sizeof (uint64_t);
ASSERT3P(block_cursor, ==, block_end);
continue;
}
@@ -625,7 +628,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
words);
break;
}
- sm->sm_phys->smp_objsize += words * sizeof (uint64_t);
+ sm->sm_phys->smp_length += words * sizeof (uint64_t);
start += run_len;
size -= run_len;
@@ -652,7 +655,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* We do this right after we write the intro debug entry
* because the estimate does not take it into account.
*/
- uint64_t initial_objsize = sm->sm_phys->smp_objsize;
+ uint64_t initial_objsize = sm->sm_phys->smp_length;
uint64_t estimated_growth =
space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
@@ -663,7 +666,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* and use that to get a hold of the last block, so we can
* start appending to it.
*/
- uint64_t next_word_offset = sm->sm_phys->smp_objsize;
+ uint64_t next_word_offset = sm->sm_phys->smp_length;
VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
ASSERT3U(db->db_size, ==, sm->sm_blksz);
@@ -711,7 +714,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* Therefore we expect the actual objsize to be equal or less
* than whatever we estimated it to be.
*/
- ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
+ ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
#endif
}
@@ -867,23 +870,10 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
}
dmu_buf_will_dirty(sm->sm_dbuf, tx);
- sm->sm_phys->smp_objsize = 0;
+ sm->sm_phys->smp_length = 0;
sm->sm_phys->smp_alloc = 0;
}
-/*
- * Update the in-core space_map allocation and length values.
- */
-void
-space_map_update(space_map_t *sm)
-{
- if (sm == NULL)
- return;
-
- sm->sm_alloc = sm->sm_phys->smp_alloc;
- sm->sm_length = sm->sm_phys->smp_objsize;
-}
-
uint64_t
space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
@@ -1065,32 +1055,14 @@ space_map_object(space_map_t *sm)
return (sm != NULL ? sm->sm_object : 0);
}
-/*
- * Returns the already synced, on-disk allocated space.
- */
-uint64_t
+int64_t
space_map_allocated(space_map_t *sm)
{
- return (sm != NULL ? sm->sm_alloc : 0);
+ return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
}
-/*
- * Returns the already synced, on-disk length;
- */
uint64_t
space_map_length(space_map_t *sm)
{
- return (sm != NULL ? sm->sm_length : 0);
-}
-
-/*
- * Returns the allocated space that is currently syncing.
- */
-int64_t
-space_map_alloc_delta(space_map_t *sm)
-{
- if (sm == NULL)
- return (0);
- ASSERT(sm->sm_dbuf != NULL);
- return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
+ return (sm != NULL ? sm->sm_phys->smp_length : 0);
}
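
Editor's note: a minimal sketch of the bounded-iteration contract introduced above. space_map_iterate() now stops after 'end' bytes and space_map_load_length() loads only a prefix of the object, which is how metaslab_load_impl() avoids picking up entries that metaslab_sync() appends after the synced length was sampled. The helper below (iterate_prefix() and its callback type are hypothetical, not the ZFS API) models just that contract on a plain word array.

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

typedef int (*word_cb_t)(uint64_t word, void *arg);

/*
 * 'end' is a byte count that must be word aligned and no larger than the
 * object; only words before it are visited even if the object keeps growing.
 */
static int
iterate_prefix(const uint64_t *obj, uint64_t obj_len, uint64_t end,
    word_cb_t cb, void *arg)
{
	assert(end <= obj_len);
	assert(end % sizeof (uint64_t) == 0);

	for (uint64_t off = 0; off < end; off += sizeof (uint64_t)) {
		int err = cb(obj[off / sizeof (uint64_t)], arg);
		if (err != 0)
			return (err);
	}
	return (0);
}

static int
print_word(uint64_t word, void *arg)
{
	(void) arg;
	(void) printf("0x%llx\n", (unsigned long long)word);
	return (0);
}

int
main(void)
{
	uint64_t obj[6] = { 1, 2, 3, 4, 5, 6 };
	uint64_t synced_length = 4 * sizeof (uint64_t);	/* snapshot taken earlier */

	/* words 5 and 6 model entries appended after the snapshot; skipped */
	return (iterate_prefix(obj, sizeof (obj), synced_length,
	    print_word, NULL));
}
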
diff --git a/uts/common/fs/zfs/sys/metaslab.h b/uts/common/fs/zfs/sys/metaslab.h
index 08fe3955b688..d26b095d14ef 100644
--- a/uts/common/fs/zfs/sys/metaslab.h
+++ b/uts/common/fs/zfs/sys/metaslab.h
@@ -52,6 +52,8 @@ void metaslab_fini(metaslab_t *);
int metaslab_load(metaslab_t *);
void metaslab_unload(metaslab_t *);
+uint64_t metaslab_allocated_space(metaslab_t *);
+
void metaslab_sync(metaslab_t *, uint64_t);
void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
@@ -115,6 +117,7 @@ void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
boolean_t);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
+void metaslab_recalculate_weight_and_sort(metaslab_t *);
#ifdef __cplusplus
}
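
Editor's note: the metaslab_allocated_space() accessor exported above backs the in-core accounting that replaces space_map_update()/space_map_alloc_delta(). As seen in the metaslab.c hunks, metaslab_sync() keeps ms_allocated_space current while writing the space map, and metaslab_sync_done() derives alloc_delta from ms_allocated_this_txg minus the freed space. A toy, single-sync-pass walk-through with made-up byte counts (in one pass, ms_freeing is simply swapped into ms_freed):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* made-up values, in bytes */
	uint64_t ms_allocated_space = 1ULL << 20;	/* from the synced ms_sm */
	uint64_t alloctree_space = 300ULL << 10;	/* allocated this txg */
	uint64_t freeing_space = 100ULL << 10;		/* freed this txg */

	/* metaslab_sync(): keep the in-core counter in step with the space map */
	ms_allocated_space += alloctree_space;
	ms_allocated_space -= freeing_space;

	uint64_t ms_allocated_this_txg = alloctree_space;

	/* metaslab_sync_done(): the delta charged to the vdev/class space stats */
	int64_t alloc_delta = (int64_t)(ms_allocated_this_txg - freeing_space);

	(void) printf("ms_allocated_space = %llu, alloc_delta = %lld\n",
	    (unsigned long long)ms_allocated_space, (long long)alloc_delta);
	return (0);
}
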
diff --git a/uts/common/fs/zfs/sys/metaslab_impl.h b/uts/common/fs/zfs/sys/metaslab_impl.h
index a2c8e6051772..f8d36f38f7b7 100644
--- a/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -340,8 +340,34 @@ struct metaslab_group {
* being written.
*/
struct metaslab {
+ /*
+ * This is the main lock of the metaslab and its purpose is to
+ * coordinate our allocations and frees [e.g. metaslab_block_alloc(),
+ * metaslab_free_concrete(), etc.] with our various syncing
+ * procedures [e.g. metaslab_sync(), metaslab_sync_done(), etc.].
+ *
+ * The lock is also used during some miscellaneous operations like
+ * using the metaslab's histogram for the metaslab group's histogram
+ * aggregation, or marking the metaslab for initialization.
+ */
kmutex_t ms_lock;
+
+ /*
+ * Acquired together with the ms_lock whenever we expect to
+ * write to metaslab data on-disk (i.e. flushing entries to
+ * the metaslab's space map). It helps coordinate readers of
+ * the metaslab's space map [see spa_vdev_remove_thread()]
+ * with writers [see metaslab_sync()].
+ *
+ * Note that metaslab_load(), even though a reader, uses
+ * a completely different mechanism to deal with the reading
+ * of the metaslab's space map based on ms_synced_length. That
+ * said, the function still uses the ms_sync_lock after it
+ * has read the ms_sm [see relevant comment in metaslab_load()
+ * as to why].
+ */
kmutex_t ms_sync_lock;
+
kcondvar_t ms_load_cv;
space_map_t *ms_sm;
uint64_t ms_id;
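
The comments above describe the intended interaction: syncing context takes ms_sync_lock and then ms_lock before flushing entries to the space map and publishing the new synced length, while a reader such as spa_vdev_remove_thread() takes ms_sync_lock to see a stable on-disk view. A deliberately simplified pthread sketch of that ordering; the names and the single published counter are stand-ins, not the kernel implementation:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for ms_sync_lock, ms_lock and ms_synced_length. */
static pthread_mutex_t ms_sync_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t ms_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t ms_synced_length;	/* bytes of space map synced so far */
static uint64_t smp_length;		/* on-disk object length (model) */

/* Writer side, shaped like metaslab_sync(): flush entries, then publish. */
static void
writer_sync_pass(uint64_t entry_bytes)
{
	pthread_mutex_lock(&ms_sync_lock);	/* taken before ms_lock */
	pthread_mutex_lock(&ms_lock);

	smp_length += entry_bytes;		/* "append" to the space map */
	ms_synced_length = smp_length;		/* publish the synced length */

	pthread_mutex_unlock(&ms_lock);
	pthread_mutex_unlock(&ms_sync_lock);
}

/* Reader side, shaped like spa_vdev_remove_thread(): stable on-disk view. */
static uint64_t
reader_snapshot_length(void)
{
	pthread_mutex_lock(&ms_sync_lock);
	uint64_t len = ms_synced_length;	/* no writer is mid-append */
	pthread_mutex_unlock(&ms_sync_lock);
	return (len);
}

int
main(void)
{
	writer_sync_pass(3 * sizeof (uint64_t));
	(void) printf("reader sees %llu synced bytes\n",
	    (unsigned long long)reader_snapshot_length());
	return (0);
}
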
@@ -351,6 +377,7 @@ struct metaslab {
range_tree_t *ms_allocating[TXG_SIZE];
range_tree_t *ms_allocatable;
+ uint64_t ms_allocated_this_txg;
/*
* The following range trees are accessed only from syncing context.
@@ -375,6 +402,55 @@ struct metaslab {
boolean_t ms_loaded;
boolean_t ms_loading;
+ /*
+ * The following histograms count entries that are in the
+ * metaslab's space map (and its histogram) but are not in
+ * ms_allocatable yet, because they are in ms_freed, ms_freeing,
+ * or ms_defer[].
+ *
+ * When the metaslab is not loaded, its ms_weight needs to
+ * reflect what is allocatable (i.e. what will be part of
+ * ms_allocatable if it is loaded). The weight is computed from
+ * the spacemap histogram, but that includes ranges that are
+ * not yet allocatable (because they are in ms_freed,
+ * ms_freeing, or ms_defer[]). Therefore, when calculating the
+ * weight, we need to remove those ranges.
+ *
+ * The ranges in the ms_freed and ms_defer[] range trees are all
+ * present in the spacemap. However, the spacemap may have
+ * multiple entries to represent a contiguous range, because it
+ * is written across multiple sync passes, but the changes of
+ * all sync passes are consolidated into the range trees.
+ * Adjacent ranges that are freed in different sync passes of
+ * one txg will be represented separately (as 2 or more entries)
+ * in the space map (and its histogram), but these adjacent
+ * ranges will be consolidated (represented as one entry) in the
+ * ms_freed/ms_defer[] range trees (and their histograms).
+ *
+ * When calculating the weight, we cannot simply subtract the
+ * range trees' histograms from the spacemap's histogram,
+ * because the range trees' histograms may have entries in
+ * higher buckets than the spacemap, due to consolidation.
+ * Instead we must subtract the exact entries that were added to
+ * the spacemap's histogram. ms_synchist and ms_deferhist[]
+ * represent these exact entries, so we can subtract them from
+ * the spacemap's histogram when calculating ms_weight.
+ *
+ * ms_synchist represents the same ranges as ms_freeing +
+ * ms_freed, but without consolidation across sync passes.
+ *
+ * ms_deferhist[i] represents the same ranges as ms_defer[i],
+ * but without consolidation across sync passes.
+ */
+ uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
+ uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
+
+ /*
+ * Tracks the exact amount of allocated space of this metaslab
+ * (and specifically the metaslab's space map) up to the most
+ * recently completed sync pass [see usage in metaslab_sync()].
+ */
+ uint64_t ms_allocated_space;
int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */
uint64_t ms_activation_weight; /* activation weight */
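
The block comment above explains that an unloaded metaslab's weight must be derived from the space map histogram only after deducting the exact entries tracked in ms_synchist and ms_deferhist[], since those ranges are not allocatable yet. A minimal standalone model of that deduction; bucket indices, sizes, and names are hypothetical, and the real weight calculation lives in metaslab.c:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	TOY_HISTOGRAM_SIZE	32	/* stands in for SPACE_MAP_HISTOGRAM_SIZE */
#define	TOY_DEFER_SIZE		2	/* stands in for TXG_DEFER_SIZE */

int
main(void)
{
	/* Per-bucket counts of free segments recorded in the space map. */
	uint64_t sm_histogram[TOY_HISTOGRAM_SIZE] = { 0 };
	/* Segments synced this txg (freeing + freed), not yet allocatable. */
	uint64_t synchist[TOY_HISTOGRAM_SIZE] = { 0 };
	/* Segments still held in the defer trees, not yet allocatable. */
	uint64_t deferhist[TOY_DEFER_SIZE][TOY_HISTOGRAM_SIZE] = { { 0 } };

	sm_histogram[12] = 10;	/* ten segments in bucket 12 */
	sm_histogram[16] = 3;
	synchist[12] = 4;	/* four of them were just freed this txg */
	deferhist[0][16] = 1;	/* one larger segment is still deferred */

	/*
	 * Deduct exactly the entries that were added to the space map
	 * histogram; the result reflects what would be allocatable if
	 * the metaslab were loaded.
	 */
	uint64_t allocatable[TOY_HISTOGRAM_SIZE];
	for (int i = 0; i < TOY_HISTOGRAM_SIZE; i++) {
		uint64_t deduct = synchist[i];
		for (int t = 0; t < TOY_DEFER_SIZE; t++)
			deduct += deferhist[t][i];
		assert(sm_histogram[i] >= deduct);	/* never underflows */
		allocatable[i] = sm_histogram[i] - deduct;
	}

	/* Report the largest bucket that still has allocatable segments. */
	for (int i = TOY_HISTOGRAM_SIZE - 1; i >= 0; i--) {
		if (allocatable[i] != 0) {
			(void) printf("largest allocatable bucket: %d "
			    "(%llu segments)\n", i,
			    (unsigned long long)allocatable[i]);
			break;
		}
	}
	return (0);
}
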
@@ -411,6 +487,9 @@ struct metaslab {
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
+ /* updated every time we are done syncing the metaslab's space map */
+ uint64_t ms_synced_length;
+
boolean_t ms_new;
};
diff --git a/uts/common/fs/zfs/sys/range_tree.h b/uts/common/fs/zfs/sys/range_tree.h
index 9360e0150933..3816dabf7c1c 100644
--- a/uts/common/fs/zfs/sys/range_tree.h
+++ b/uts/common/fs/zfs/sys/range_tree.h
@@ -81,9 +81,10 @@ void range_tree_fini(void);
range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_verify_not_present(range_tree_t *rt,
+ uint64_t start, uint64_t size);
uint64_t range_tree_space(range_tree_t *rt);
boolean_t range_tree_is_empty(range_tree_t *rt);
-void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
void range_tree_stat_verify(range_tree_t *rt);
uint64_t range_tree_min(range_tree_t *rt);
diff --git a/uts/common/fs/zfs/sys/space_map.h b/uts/common/fs/zfs/sys/space_map.h
index d3d852978a57..2bce20b48ba5 100644
--- a/uts/common/fs/zfs/sys/space_map.h
+++ b/uts/common/fs/zfs/sys/space_map.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPACE_MAP_H
@@ -55,10 +55,17 @@ extern "C" {
* for backward compatibility.
*/
typedef struct space_map_phys {
- uint64_t smp_object; /* on-disk space map object */
- uint64_t smp_objsize; /* size of the object */
- int64_t smp_alloc; /* space allocated from the map */
- uint64_t smp_pad[5]; /* reserved */
+ /* object number: not needed but kept for backwards compatibility */
+ uint64_t smp_object;
+
+ /* length of the object in bytes */
+ uint64_t smp_length;
+
+ /* space allocated from the map */
+ int64_t smp_alloc;
+
+ /* reserved */
+ uint64_t smp_pad[5];
/*
* The smp_histogram maintains a histogram of free regions. Each
@@ -81,8 +88,6 @@ typedef struct space_map {
uint64_t sm_start; /* start of map */
uint64_t sm_size; /* size of map */
uint8_t sm_shift; /* unit shift */
- uint64_t sm_length; /* synced length */
- int64_t sm_alloc; /* synced space allocated */
objset_t *sm_os; /* objset for this map */
uint64_t sm_object; /* object id for this map */
uint32_t sm_blksz; /* block size for space map */
@@ -189,18 +194,20 @@ boolean_t sm_entry_is_double_word(uint64_t e);
typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg);
int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
-int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);
+int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t length);
+int space_map_iterate(space_map_t *sm, uint64_t length,
+ sm_cb_t callback, void *arg);
int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
dmu_tx_t *tx);
+boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt);
void space_map_histogram_clear(space_map_t *sm);
void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
dmu_tx_t *tx);
-void space_map_update(space_map_t *sm);
-
uint64_t space_map_object(space_map_t *sm);
-uint64_t space_map_allocated(space_map_t *sm);
+int64_t space_map_allocated(space_map_t *sm);
uint64_t space_map_length(space_map_t *sm);
void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
@@ -216,8 +223,6 @@ int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
uint64_t start, uint64_t size, uint8_t shift);
void space_map_close(space_map_t *sm);
-int64_t space_map_alloc_delta(space_map_t *sm);
-
#ifdef __cplusplus
}
#endif
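
space_map_iterate() and the new space_map_load_length() take an explicit length so a caller can consume only the entries that had been synced when that length was captured (for example a metaslab's ms_synced_length), even if more entries are appended afterwards. A hedged usage sketch, not compiled against the tree; the callback shape follows sm_cb_t above, and the sme_* field names of space_map_entry_t are assumed rather than shown in this excerpt:

/* Sum the net allocated bytes recorded in the first 'length' bytes. */
static int
sum_alloc_cb(space_map_entry_t *sme, void *arg)
{
	int64_t *total = arg;

	if (sme->sme_type == SM_ALLOC)
		*total += sme->sme_run;
	else
		*total -= sme->sme_run;
	return (0);
}

static int64_t
allocated_up_to_synced(metaslab_t *msp)
{
	int64_t total = 0;

	/*
	 * ms_synced_length only ever covers fully written entries, so the
	 * iteration below never reads a partially appended word (in the
	 * real code this is coordinated via ms_lock/ms_sync_lock).
	 */
	VERIFY0(space_map_iterate(msp->ms_sm, msp->ms_synced_length,
	    sum_alloc_cb, &total));
	return (total);
}

The same pattern appears further down, where vdev_indirect_mapping_load_obsolete_spacemap() simply passes space_map_length() as the bound because it wants everything currently on disk.
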
diff --git a/uts/common/fs/zfs/sys/vdev_impl.h b/uts/common/fs/zfs/sys/vdev_impl.h
index c0bdeffb6451..6ddbe55a0c4b 100644
--- a/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/uts/common/fs/zfs/sys/vdev_impl.h
@@ -246,7 +246,6 @@ struct vdev {
uint64_t vdev_islog; /* is an intent log device */
uint64_t vdev_removing; /* device is being removed? */
boolean_t vdev_ishole; /* is a hole in the namespace */
- kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
@@ -306,16 +305,6 @@ struct vdev {
space_map_t *vdev_obsolete_sm;
/*
- * The queue depth parameters determine how many async writes are
- * still pending (i.e. allocated but not yet issued to disk) per
- * top-level (vdev_async_write_queue_depth) and the maximum allowed
- * (vdev_max_async_write_queue_depth). These values only apply to
- * top-level vdevs.
- */
- uint64_t vdev_async_write_queue_depth;
- uint64_t vdev_max_async_write_queue_depth;
-
- /*
* Leaf vdev state.
*/
range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */
diff --git a/uts/common/fs/zfs/vdev.c b/uts/common/fs/zfs/vdev.c
index c72aebe87722..11767fdcad59 100644
--- a/uts/common/fs/zfs/vdev.c
+++ b/uts/common/fs/zfs/vdev.c
@@ -501,7 +501,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
@@ -889,7 +888,6 @@ vdev_free(vdev_t *vd)
rw_destroy(&vd->vdev_indirect_rwlock);
mutex_destroy(&vd->vdev_obsolete_lock);
- mutex_destroy(&vd->vdev_queue_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
@@ -1251,12 +1249,12 @@ vdev_metaslab_fini(vdev_t *vd)
}
if (vd->vdev_ms != NULL) {
- uint64_t count = vd->vdev_ms_count;
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_group_passivate(mg);
- metaslab_group_passivate(vd->vdev_mg);
+ uint64_t count = vd->vdev_ms_count;
for (uint64_t m = 0; m < count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
-
if (msp != NULL)
metaslab_fini(msp);
}
@@ -1264,6 +1262,9 @@ vdev_metaslab_fini(vdev_t *vd)
vd->vdev_ms = NULL;
vd->vdev_ms_count = 0;
+
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ ASSERT0(mg->mg_histogram[i]);
}
ASSERT0(vd->vdev_ms_count);
}
@@ -2549,13 +2550,6 @@ vdev_dtl_load(vdev_t *vd)
ASSERT(vd->vdev_dtl_sm != NULL);
mutex_enter(&vd->vdev_dtl_lock);
-
- /*
- * Now that we've opened the space_map we need to update
- * the in-core DTL.
- */
- space_map_update(vd->vdev_dtl_sm);
-
error = space_map_load(vd->vdev_dtl_sm,
vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
mutex_exit(&vd->vdev_dtl_lock);
@@ -2715,10 +2709,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
}
dmu_tx_commit(tx);
-
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_update(vd->vdev_dtl_sm);
- mutex_exit(&vd->vdev_dtl_lock);
}
/*
@@ -2861,7 +2851,10 @@ vdev_load(vdev_t *vd)
"asize=%llu", (u_longlong_t)vd->vdev_ashift,
(u_longlong_t)vd->vdev_asize);
return (SET_ERROR(ENXIO));
- } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
+ }
+
+ error = vdev_metaslab_init(vd, 0);
+ if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
"[error=%d]", error);
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -2875,9 +2868,10 @@ vdev_load(vdev_t *vd)
ASSERT(vd->vdev_asize != 0);
ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
- if ((error = space_map_open(&vd->vdev_checkpoint_sm,
+ error = space_map_open(&vd->vdev_checkpoint_sm,
mos, checkpoint_sm_obj, 0, vd->vdev_asize,
- vd->vdev_ashift))) {
+ vd->vdev_ashift);
+ if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: space_map_open "
"failed for checkpoint spacemap (obj %llu) "
"[error=%d]",
@@ -2885,15 +2879,15 @@ vdev_load(vdev_t *vd)
return (error);
}
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
- space_map_update(vd->vdev_checkpoint_sm);
/*
* Since the checkpoint_sm contains free entries
- * exclusively we can use sm_alloc to indicate the
- * culmulative checkpointed space that has been freed.
+ * exclusively we can use space_map_allocated() to
+ * indicate the cumulative checkpointed space that
+ * has been freed.
*/
vd->vdev_stat.vs_checkpoint_space =
- -vd->vdev_checkpoint_sm->sm_alloc;
+ -space_map_allocated(vd->vdev_checkpoint_sm);
vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
vd->vdev_stat.vs_checkpoint_space;
}
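
As a worked example of the accounting above (figures hypothetical): a checkpoint space map that has recorded frees totalling 12 GiB ends up with smp_alloc == -12 GiB, so space_map_allocated() returns -12 GiB and vs_checkpoint_space = -(-12 GiB) = 12 GiB of cumulative checkpointed space.
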
@@ -2925,7 +2919,6 @@ vdev_load(vdev_t *vd)
(u_longlong_t)obsolete_sm_object, error);
return (error);
}
- space_map_update(vd->vdev_obsolete_sm);
}
return (0);
@@ -3012,47 +3005,6 @@ vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
ASSERT(vd == vd->vdev_top);
ASSERT3U(txg, ==, spa_syncing_txg(spa));
- if (vd->vdev_ms != NULL) {
- metaslab_group_t *mg = vd->vdev_mg;
-
- metaslab_group_histogram_verify(mg);
- metaslab_class_histogram_verify(mg->mg_class);
-
- for (int m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
-
- if (msp == NULL || msp->ms_sm == NULL)
- continue;
-
- mutex_enter(&msp->ms_lock);
- /*
- * If the metaslab was not loaded when the vdev
- * was removed then the histogram accounting may
- * not be accurate. Update the histogram information
- * here so that we ensure that the metaslab group
- * and metaslab class are up-to-date.
- */
- metaslab_group_histogram_remove(mg, msp);
-
- VERIFY0(space_map_allocated(msp->ms_sm));
- space_map_close(msp->ms_sm);
- msp->ms_sm = NULL;
- mutex_exit(&msp->ms_lock);
- }
-
- if (vd->vdev_checkpoint_sm != NULL) {
- ASSERT(spa_has_checkpoint(spa));
- space_map_close(vd->vdev_checkpoint_sm);
- vd->vdev_checkpoint_sm = NULL;
- }
-
- metaslab_group_histogram_verify(mg);
- metaslab_class_histogram_verify(mg->mg_class);
-
- for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
- ASSERT0(mg->mg_histogram[i]);
- }
-
dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
vdev_destroy_spacemaps(vd, tx);
@@ -3086,17 +3038,14 @@ vdev_sync(vdev_t *vd, uint64_t txg)
spa_t *spa = vd->vdev_spa;
vdev_t *lvd;
metaslab_t *msp;
- dmu_tx_t *tx;
+ ASSERT3U(txg, ==, spa->spa_syncing_txg);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
- dmu_tx_t *tx;
-
ASSERT(vd->vdev_removing ||
vd->vdev_ops == &vdev_indirect_ops);
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vdev_indirect_sync_obsolete(vd, tx);
- dmu_tx_commit(tx);
/*
* If the vdev is indirect, it can't have dirty
@@ -3105,6 +3054,7 @@ vdev_sync(vdev_t *vd, uint64_t txg)
if (vd->vdev_ops == &vdev_indirect_ops) {
ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
+ dmu_tx_commit(tx);
return;
}
}
@@ -3115,12 +3065,10 @@ vdev_sync(vdev_t *vd, uint64_t txg)
!vd->vdev_removing) {
ASSERT(vd == vd->vdev_top);
ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
ASSERT(vd->vdev_ms_array != 0);
vdev_config_dirty(vd);
- dmu_tx_commit(tx);
}
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
@@ -3139,6 +3087,7 @@ vdev_sync(vdev_t *vd, uint64_t txg)
vdev_remove_empty_log(vd, txg);
(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+ dmu_tx_commit(tx);
}
uint64_t
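
The hunk above also restructures vdev_sync() so that a single assigned transaction spans the whole function and is committed on every exit path, including the early return for indirect vdevs. A small standalone sketch of that bracket pattern with toy stand-ins; the real calls are dmu_tx_create_assigned() and dmu_tx_commit():

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for an assigned transaction; names are hypothetical. */
typedef struct toy_tx { unsigned long long txg; } toy_tx_t;

static toy_tx_t
toy_tx_create_assigned(unsigned long long txg)
{
	(void) printf("open tx for txg %llu\n", txg);
	return ((toy_tx_t){ txg });
}

static void
toy_tx_commit(toy_tx_t *tx)
{
	(void) printf("commit tx for txg %llu\n", tx->txg);
}

/* Shaped like the new vdev_sync(): one tx spans the whole function. */
static void
toy_vdev_sync(unsigned long long txg, bool is_indirect)
{
	toy_tx_t tx = toy_tx_create_assigned(txg);

	/* ... sync obsolete segments here ... */

	if (is_indirect) {
		/* The early return must still commit the transaction. */
		toy_tx_commit(&tx);
		return;
	}

	/* ... allocate the ms array, sync dirty metaslabs, trim logs ... */

	toy_tx_commit(&tx);
}

int
main(void)
{
	toy_vdev_sync(100, true);
	toy_vdev_sync(101, false);
	return (0);
}
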
@@ -3368,8 +3317,6 @@ top:
*/
if (error == 0 &&
tvd->vdev_checkpoint_sm != NULL) {
- ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
- !=, 0);
error = ZFS_ERR_CHECKPOINT_EXISTS;
}
diff --git a/uts/common/fs/zfs/vdev_indirect.c b/uts/common/fs/zfs/vdev_indirect.c
index 75c038311004..5b6415937f61 100644
--- a/uts/common/fs/zfs/vdev_indirect.c
+++ b/uts/common/fs/zfs/vdev_indirect.c
@@ -680,7 +680,6 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr)
VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
- space_map_update(prev_obsolete_sm);
counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
if (prev_obsolete_sm != NULL) {
vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
@@ -831,7 +830,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
spa->spa_meta_objset, obsolete_sm_object,
0, vd->vdev_asize, 0));
- space_map_update(vd->vdev_obsolete_sm);
}
ASSERT(vd->vdev_obsolete_sm != NULL);
@@ -840,7 +838,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
space_map_write(vd->vdev_obsolete_sm,
vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
- space_map_update(vd->vdev_obsolete_sm);
range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}
diff --git a/uts/common/fs/zfs/vdev_indirect_mapping.c b/uts/common/fs/zfs/vdev_indirect_mapping.c
index 1da101733e4c..3d0f1344dd88 100644
--- a/uts/common/fs/zfs/vdev_indirect_mapping.c
+++ b/uts/common/fs/zfs/vdev_indirect_mapping.c
@@ -557,6 +557,7 @@ vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
losma.losma_counts = counts;
losma.losma_vim = vim;
VERIFY0(space_map_iterate(obsolete_space_sm,
+ space_map_length(obsolete_space_sm),
load_obsolete_sm_callback, &losma));
}
diff --git a/uts/common/fs/zfs/vdev_initialize.c b/uts/common/fs/zfs/vdev_initialize.c
index bf246cd8ddcf..e1aa4e9523b4 100644
--- a/uts/common/fs/zfs/vdev_initialize.c
+++ b/uts/common/fs/zfs/vdev_initialize.c
@@ -442,7 +442,7 @@ vdev_initialize_calculate_progress(vdev_t *vd)
mutex_enter(&msp->ms_lock);
uint64_t ms_free = msp->ms_size -
- space_map_allocated(msp->ms_sm);
+ metaslab_allocated_space(msp);
if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
ms_free /= vd->vdev_top->vdev_children;
diff --git a/uts/common/fs/zfs/vdev_removal.c b/uts/common/fs/zfs/vdev_removal.c
index c3c6672ccf1a..e4d0224333ff 100644
--- a/uts/common/fs/zfs/vdev_removal.c
+++ b/uts/common/fs/zfs/vdev_removal.c
@@ -283,15 +283,8 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
if (ms->ms_sm == NULL)
continue;
- /*
- * Sync tasks happen before metaslab_sync(), therefore
- * smp_alloc and sm_alloc must be the same.
- */
- ASSERT3U(space_map_allocated(ms->ms_sm), ==,
- ms->ms_sm->sm_phys->smp_alloc);
-
spa->spa_removing_phys.sr_to_copy +=
- space_map_allocated(ms->ms_sm);
+ metaslab_allocated_space(ms);
/*
* Space which we are freeing this txg does not need to
@@ -1401,22 +1394,8 @@ spa_vdev_remove_thread(void *arg)
* appropriate action (see free_from_removing_vdev()).
*/
if (msp->ms_sm != NULL) {
- space_map_t *sm = NULL;
-
- /*
- * We have to open a new space map here, because
- * ms_sm's sm_length and sm_alloc may not reflect
- * what's in the object contents, if we are in between
- * metaslab_sync() and metaslab_sync_done().
- */
- VERIFY0(space_map_open(&sm,
- spa->spa_dsl_pool->dp_meta_objset,
- msp->ms_sm->sm_object, msp->ms_sm->sm_start,
- msp->ms_sm->sm_size, msp->ms_sm->sm_shift));
- space_map_update(sm);
- VERIFY0(space_map_load(sm, svr->svr_allocd_segs,
- SM_ALLOC));
- space_map_close(sm);
+ VERIFY0(space_map_load(msp->ms_sm,
+ svr->svr_allocd_segs, SM_ALLOC));
range_tree_walk(msp->ms_freeing,
range_tree_remove, svr->svr_allocd_segs);
@@ -1611,16 +1590,6 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
ASSERT0(range_tree_space(msp->ms_freed));
if (msp->ms_sm != NULL) {
- /*
- * Assert that the in-core spacemap has the same
- * length as the on-disk one, so we can use the
- * existing in-core spacemap to load it from disk.
- */
- ASSERT3U(msp->ms_sm->sm_alloc, ==,
- msp->ms_sm->sm_phys->smp_alloc);
- ASSERT3U(msp->ms_sm->sm_length, ==,
- msp->ms_sm->sm_phys->smp_objsize);
-
mutex_enter(&svr->svr_lock);
VERIFY0(space_map_load(msp->ms_sm,
svr->svr_allocd_segs, SM_ALLOC));
@@ -1713,9 +1682,6 @@ spa_vdev_remove_cancel(spa_t *spa)
return (error);
}
-/*
- * Called every sync pass of every txg if there's a svr.
- */
void
svr_sync(spa_t *spa, dmu_tx_t *tx)
{
@@ -1779,6 +1745,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
ASSERT(vd->vdev_islog);
ASSERT(vd == vd->vdev_top);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
/*
* Stop allocating from this vdev.
@@ -1793,15 +1760,14 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
*txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
/*
- * Evacuate the device. We don't hold the config lock as writer
- * since we need to do I/O but we do keep the
+ * Evacuate the device. We don't hold the config lock as
+ * writer since we need to do I/O but we do keep the
* spa_namespace_lock held. Once this completes the device
* should no longer have any blocks allocated on it.
*/
- if (vd->vdev_islog) {
- if (vd->vdev_stat.vs_alloc != 0)
- error = spa_reset_logs(spa);
- }
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (vd->vdev_stat.vs_alloc != 0)
+ error = spa_reset_logs(spa);
*txg = spa_vdev_config_enter(spa);
@@ -1820,6 +1786,8 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
vdev_dirty_leaves(vd, VDD_DTL, *txg);
vdev_config_dirty(vd);
+ vdev_metaslab_fini(vd);
+
spa_history_log_internal(spa, "vdev remove", NULL,
"%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
(vd->vdev_path != NULL) ? vd->vdev_path : "-");
@@ -1849,6 +1817,8 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
if (list_link_active(&vd->vdev_config_dirty_node))
vdev_config_clean(vd);
+ ASSERT0(vd->vdev_stat.vs_alloc);
+
/*
* Clean up the vdev namespace.
*/