author     Andriy Gapon <avg@FreeBSD.org>    2019-11-21 13:35:43 +0000
committer  Andriy Gapon <avg@FreeBSD.org>    2019-11-21 13:35:43 +0000
commit     8491540808d9b00abc2a1ea8e37a697ce4020120 (patch)
tree       0a255d1562e375d932fd1fe55fbbbf6f4f9d7b85 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs
parent     68cad681496af17482fdd4d9cb71d48fa85e605c (diff)
parent     4ca571aba060489dbd628e880af130c26c80b269 (diff)
MFV r354383: 10592 misc. metaslab and vdev related ZoL bug fixes
illumos/illumos-gate@555d674d5d4b8191dc83723188349d28278b2431
https://github.com/illumos/illumos-gate/commit/555d674d5d4b8191dc83723188349d28278b2431
https://www.illumos.org/issues/10592

This is a collection of recent fixes from ZoL:
  8eef997679b Error path in metaslab_load_impl() forgets to drop ms_sync_lock
  928e8ad47d3 Introduce auxiliary metaslab histograms
  425d3237ee8 Get rid of space_map_update() for ms_synced_length
  6c926f426a2 Simplify log vdev removal code
  21e7cf5da89 zdb -L should skip leak detection altogether
  df72b8bebe0 Rename range_tree_verify to range_tree_verify_not_present
  75058f33034 Remove unused vdev_t fields

Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Author: Serapheim Dimitropoulos <serapheim@delphix.com>
MFC after: 4 weeks
Notes: svn path=/head/; revision=354948
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs')
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c               | 498
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c             |   8
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c         |   6
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c              | 122
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h           |   3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h      |  79
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h         |   3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h          |  31
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h          |  11
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c                   |  95
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c          |   3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c  |   1
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c        |   2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c           |  56
14 files changed, 614 insertions, 304 deletions
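
The common thread in these changes is that the cached space map fields (sm_alloc, sm_length) and space_map_update() go away: callers read the on-disk phys copy directly, and each metaslab records how much of its space map has been synced (ms_synced_length) so that a later load can stop at exactly that point. The following is a minimal userspace sketch of that shape, not the kernel code; the struct layouts and the helper name are stand-ins.

/*
 * Simplified model: the space map exposes the phys copy directly and the
 * metaslab snapshots the synced length itself (what metaslab_sync_done()
 * now does instead of calling space_map_update()).
 */
#include <stdint.h>
#include <stddef.h>

typedef struct space_map_phys {
    int64_t     smp_alloc;      /* space allocated from the map */
    uint64_t    smp_length;     /* length of the object in bytes */
} space_map_phys_t;

typedef struct space_map {
    space_map_phys_t *sm_phys;  /* stands in for the dbuf contents */
} space_map_t;

static int64_t
space_map_allocated(space_map_t *sm)
{
    return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
}

static uint64_t
space_map_length(space_map_t *sm)
{
    return (sm != NULL ? sm->sm_phys->smp_length : 0);
}

typedef struct metaslab {
    space_map_t *ms_sm;
    uint64_t    ms_synced_length;   /* updated after every sync */
} metaslab_t;

/* Illustrative helper name, not a function from the diff. */
static void
metaslab_record_synced_length(metaslab_t *msp)
{
    msp->ms_synced_length = space_map_length(msp->ms_sm);
}
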
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index 0e231e831251..4dd6692d22f4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -584,45 +584,62 @@ metaslab_compare(const void *x1, const void *x2)
return (AVL_CMP(m1->ms_start, m2->ms_start));
}
+uint64_t
+metaslab_allocated_space(metaslab_t *msp)
+{
+ return (msp->ms_allocated_space);
+}
+
/*
* Verify that the space accounting on disk matches the in-core range_trees.
*/
-void
+static void
metaslab_verify_space(metaslab_t *msp, uint64_t txg)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
- uint64_t allocated = 0;
+ uint64_t allocating = 0;
uint64_t sm_free_space, msp_free_space;
ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(!msp->ms_condensing);
if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
return;
/*
* We can only verify the metaslab space when we're called
- * from syncing context with a loaded metaslab that has an allocated
- * space map. Calling this in non-syncing context does not
- * provide a consistent view of the metaslab since we're performing
- * allocations in the future.
+ * from syncing context with a loaded metaslab that has an
+ * allocated space map. Calling this in non-syncing context
+ * does not provide a consistent view of the metaslab since
+ * we're performing allocations in the future.
*/
if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
!msp->ms_loaded)
return;
- sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
- space_map_alloc_delta(msp->ms_sm);
+ /*
+ * Even though the smp_alloc field can get negative (e.g.
+ * see vdev_checkpoint_sm), that should never be the case
+ * when it come's to a metaslab's space map.
+ */
+ ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
+
+ sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
/*
- * Account for future allocations since we would have already
- * deducted that space from the ms_freetree.
+ * Account for future allocations since we would have
+ * already deducted that space from the ms_allocatable.
*/
for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
- allocated +=
+ allocating +=
range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
}
- msp_free_space = range_tree_space(msp->ms_allocatable) + allocated +
+ ASSERT3U(msp->ms_deferspace, ==,
+ range_tree_space(msp->ms_defer[0]) +
+ range_tree_space(msp->ms_defer[1]));
+
+ msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
msp->ms_deferspace + range_tree_space(msp->ms_freed);
VERIFY3U(sm_free_space, ==, msp_free_space);
@@ -929,6 +946,7 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
+ ASSERT(msp != NULL);
/* skip if not active or not a member */
if (msp->ms_sm == NULL || msp->ms_group != mg)
@@ -1470,6 +1488,203 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
* ==========================================================================
*/
+static void
+metaslab_aux_histograms_clear(metaslab_t *msp)
+{
+ /*
+ * Auxiliary histograms are only cleared when resetting them,
+ * which can only happen while the metaslab is loaded.
+ */
+ ASSERT(msp->ms_loaded);
+
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
+}
+
+static void
+metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
+ range_tree_t *rt)
+{
+ /*
+ * This is modeled after space_map_histogram_add(), so refer to that
+ * function for implementation details. We want this to work like
+ * the space map histogram, and not the range tree histogram, as we
+ * are essentially constructing a delta that will be later subtracted
+ * from the space map histogram.
+ */
+ int idx = 0;
+ for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ ASSERT3U(i, >=, idx + shift);
+ histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
+
+ if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+ ASSERT3U(idx + shift, ==, i);
+ idx++;
+ ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+ }
+ }
+}
+
+/*
+ * Called at every sync pass that the metaslab gets synced.
+ *
+ * The reason is that we want our auxiliary histograms to be updated
+ * wherever the metaslab's space map histogram is updated. This way
+ * we stay consistent on which parts of the metaslab space map's
+ * histogram are currently not available for allocations (e.g because
+ * they are in the defer, freed, and freeing trees).
+ */
+static void
+metaslab_aux_histograms_update(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(sm != NULL);
+
+ /*
+ * This is similar to the metaslab's space map histogram updates
+ * that take place in metaslab_sync(). The only difference is that
+ * we only care about segments that haven't made it into the
+ * ms_allocatable tree yet.
+ */
+ if (msp->ms_loaded) {
+ metaslab_aux_histograms_clear(msp);
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freed);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ metaslab_aux_histogram_add(msp->ms_deferhist[t],
+ sm->sm_shift, msp->ms_defer[t]);
+ }
+ }
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freeing);
+}
+
+/*
+ * Called every time we are done syncing (writing to) the metaslab,
+ * i.e. at the end of each sync pass.
+ * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
+ */
+static void
+metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ space_map_t *sm = msp->ms_sm;
+
+ if (sm == NULL) {
+ /*
+ * We came here from metaslab_init() when creating/opening a
+ * pool, looking at a metaslab that hasn't had any allocations
+ * yet.
+ */
+ return;
+ }
+
+ /*
+ * This is similar to the actions that we take for the ms_freed
+ * and ms_defer trees in metaslab_sync_done().
+ */
+ uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
+ if (defer_allowed) {
+ bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_synchist));
+ } else {
+ bzero(msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_deferhist[hist_index]));
+ }
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+}
+
+/*
+ * Ensure that the metaslab's weight and fragmentation are consistent
+ * with the contents of the histogram (either the range tree's histogram
+ * or the space map's depending whether the metaslab is loaded).
+ */
+static void
+metaslab_verify_weight_and_frag(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ /* see comment in metaslab_verify_unflushed_changes() */
+ if (msp->ms_group == NULL)
+ return;
+
+ /*
+ * Devices being removed always return a weight of 0 and leave
+ * fragmentation and ms_max_size as is - there is nothing for
+ * us to verify here.
+ */
+ vdev_t *vd = msp->ms_group->mg_vd;
+ if (vd->vdev_removing)
+ return;
+
+ /*
+ * If the metaslab is dirty it probably means that we've done
+ * some allocations or frees that have changed our histograms
+ * and thus the weight.
+ */
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (txg_list_member(&vd->vdev_ms_list, msp, t))
+ return;
+ }
+
+ /*
+ * This verification checks that our in-memory state is consistent
+ * with what's on disk. If the pool is read-only then there aren't
+ * any changes and we just have the initially-loaded state.
+ */
+ if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
+ return;
+
+ /* some extra verification for in-core tree if you can */
+ if (msp->ms_loaded) {
+ range_tree_stat_verify(msp->ms_allocatable);
+ VERIFY(space_map_histogram_verify(msp->ms_sm,
+ msp->ms_allocatable));
+ }
+
+ uint64_t weight = msp->ms_weight;
+ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+ boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
+ uint64_t frag = msp->ms_fragmentation;
+ uint64_t max_segsize = msp->ms_max_size;
+
+ msp->ms_weight = 0;
+ msp->ms_fragmentation = 0;
+ msp->ms_max_size = 0;
+
+ /*
+ * This function is used for verification purposes. Regardless of
+ * whether metaslab_weight() thinks this metaslab should be active or
+ * not, we want to ensure that the actual weight (and therefore the
+ * value of ms_weight) would be the same if it was to be recalculated
+ * at this point.
+ */
+ msp->ms_weight = metaslab_weight(msp) | was_active;
+
+ VERIFY3U(max_segsize, ==, msp->ms_max_size);
+
+ /*
+ * If the weight type changed then there is no point in doing
+ * verification. Revert fields to their original values.
+ */
+ if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
+ (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
+ msp->ms_fragmentation = frag;
+ msp->ms_weight = weight;
+ return;
+ }
+
+ VERIFY3U(msp->ms_fragmentation, ==, frag);
+ VERIFY3U(msp->ms_weight, ==, weight);
+}
+
/*
* Wait for any in-progress metaslab loads to complete.
*/
@@ -1491,47 +1706,94 @@ metaslab_load_impl(metaslab_t *msp)
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
/*
- * Nobody else can manipulate a loading metaslab, so it's now safe
- * to drop the lock. This way we don't have to hold the lock while
- * reading the spacemap from disk.
+ * We temporarily drop the lock to unblock other operations while we
+ * are reading the space map. Therefore, metaslab_sync() and
+ * metaslab_sync_done() can run at the same time as we do.
+ *
+ * metaslab_sync() can append to the space map while we are loading.
+ * Therefore we load only entries that existed when we started the
+ * load. Additionally, metaslab_sync_done() has to wait for the load
+ * to complete because there are potential races like metaslab_load()
+ * loading parts of the space map that are currently being appended
+ * by metaslab_sync(). If we didn't, the ms_allocatable would have
+ * entries that metaslab_sync_done() would try to re-add later.
+ *
+ * That's why before dropping the lock we remember the synced length
+ * of the metaslab and read up to that point of the space map,
+ * ignoring entries appended by metaslab_sync() that happen after we
+ * drop the lock.
*/
+ uint64_t length = msp->ms_synced_length;
mutex_exit(&msp->ms_lock);
- /*
- * If the space map has not been allocated yet, then treat
- * all the space in the metaslab as free and add it to ms_allocatable.
- */
if (msp->ms_sm != NULL) {
- error = space_map_load(msp->ms_sm, msp->ms_allocatable,
- SM_FREE);
+ error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
+ SM_FREE, length);
} else {
+ /*
+ * The space map has not been allocated yet, so treat
+ * all the space in the metaslab as free and add it to the
+ * ms_allocatable tree.
+ */
range_tree_add(msp->ms_allocatable,
msp->ms_start, msp->ms_size);
}
+ /*
+ * We need to grab the ms_sync_lock to prevent metaslab_sync() from
+ * changing the ms_sm and the metaslab's range trees while we are
+ * about to use them and populate the ms_allocatable. The ms_lock
+ * is insufficient for this because metaslab_sync() doesn't hold
+ * the ms_lock while writing the ms_checkpointing tree to disk.
+ */
+ mutex_enter(&msp->ms_sync_lock);
mutex_enter(&msp->ms_lock);
+ ASSERT(!msp->ms_condensing);
- if (error != 0)
+ if (error != 0) {
+ mutex_exit(&msp->ms_sync_lock);
return (error);
+ }
ASSERT3P(msp->ms_group, !=, NULL);
msp->ms_loaded = B_TRUE;
/*
- * If the metaslab already has a spacemap, then we need to
- * remove all segments from the defer tree; otherwise, the
- * metaslab is completely empty and we can skip this.
+ * The ms_allocatable contains the segments that exist in the
+ * ms_defer trees [see ms_synced_length]. Thus we need to remove
+ * them from ms_allocatable as they will be added again in
+ * metaslab_sync_done().
*/
- if (msp->ms_sm != NULL) {
- for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- range_tree_walk(msp->ms_defer[t],
- range_tree_remove, msp->ms_allocatable);
- }
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, msp->ms_allocatable);
}
+
+ /*
+ * Call metaslab_recalculate_weight_and_sort() now that the
+ * metaslab is loaded so we get the metaslab's real weight.
+ *
+ * Unless this metaslab was created with older software and
+ * has not yet been converted to use segment-based weight, we
+ * expect the new weight to be better or equal to the weight
+ * that the metaslab had while it was not loaded. This is
+ * because the old weight does not take into account the
+ * consolidation of adjacent segments between TXGs. [see
+ * comment for ms_synchist and ms_deferhist[] for more info]
+ */
+ uint64_t weight = msp->ms_weight;
+ metaslab_recalculate_weight_and_sort(msp);
+ if (!WEIGHT_IS_SPACEBASED(weight))
+ ASSERT3U(weight, <=, msp->ms_weight);
msp->ms_max_size = metaslab_block_maxsize(msp);
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ metaslab_verify_space(msp, spa_syncing_txg(spa));
+ mutex_exit(&msp->ms_sync_lock);
+
return (0);
}
@@ -1548,6 +1810,7 @@ metaslab_load(metaslab_t *msp)
if (msp->ms_loaded)
return (0);
VERIFY(!msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
msp->ms_loading = B_TRUE;
int error = metaslab_load_impl(msp);
@@ -1561,10 +1824,29 @@ void
metaslab_unload(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ metaslab_verify_weight_and_frag(msp);
+
range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
+
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
msp->ms_max_size = 0;
+
+ /*
+ * We explicitly recalculate the metaslab's weight based on its space
+ * map (as it is now not loaded). We want unload metaslabs to always
+ * have their weights calculated from the space map histograms, while
+ * loaded ones have it calculated from their in-core range tree
+ * [see metaslab_load()]. This way, the weight reflects the information
+ * available in-core, whether it is loaded or not
+ *
+ * If ms_group == NULL means that we came here from metaslab_fini(),
+ * at which point it doesn't make sense for us to do the recalculation
+ * and the sorting.
+ */
+ if (msp->ms_group != NULL)
+ metaslab_recalculate_weight_and_sort(msp);
}
static void
@@ -1604,6 +1886,13 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
/*
* We only open space map objects that already exist. All others
* will be opened when we finally allocate an object for it.
+ *
+ * Note:
+ * When called from vdev_expand(), we can't call into the DMU as
+ * we are holding the spa_config_lock as a writer and we would
+ * deadlock [see relevant comment in vdev_metaslab_init()]. in
+ * that case, the object parameter is zero though, so we won't
+ * call into the DMU.
*/
if (object != 0) {
error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
@@ -1615,14 +1904,17 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
}
ASSERT(ms->ms_sm != NULL);
+ ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
+ ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
}
/*
- * We create the main range tree here, but we don't create the
+ * We create the ms_allocatable here, but we don't create the
* other range trees until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
- * addition of new space; and for debugging, it ensures that we'd
- * data fault on any attempt to use this metaslab before it's ready.
+ * addition of new space; and for debugging, it ensures that
+ * we'd data fault on any attempt to use this metaslab before
+ * it's ready.
*/
ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size,
metaslab_rangesize_compare, 0);
@@ -1639,8 +1931,11 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
* out this txg. This ensures that we don't attempt to allocate
* from it before we have initialized it completely.
*/
- if (txg <= TXG_INITIAL)
+ if (txg <= TXG_INITIAL) {
metaslab_sync_done(ms, 0);
+ metaslab_space_update(vd, mg->mg_class,
+ metaslab_allocated_space(ms), 0, 0);
+ }
/*
* If metaslab_debug_load is set and we're initializing a metaslab
@@ -1674,7 +1969,7 @@ metaslab_fini(metaslab_t *msp)
mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
metaslab_space_update(vd, mg->mg_class,
- -space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
+ -metaslab_allocated_space(msp), 0, -msp->ms_size);
space_map_close(msp->ms_sm);
@@ -1695,6 +1990,9 @@ metaslab_fini(metaslab_t *msp)
range_tree_destroy(msp->ms_checkpointing);
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
+
mutex_exit(&msp->ms_lock);
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
@@ -1710,7 +2008,7 @@ metaslab_fini(metaslab_t *msp)
* This table defines a segment size based fragmentation metric that will
* allow each metaslab to derive its own fragmentation value. This is done
* by calculating the space in each bucket of the spacemap histogram and
- * multiplying that by the fragmetation metric in this table. Doing
+ * multiplying that by the fragmentation metric in this table. Doing
* this for all buckets and dividing it by the total amount of free
* space in this metaslab (i.e. the total free space in all buckets) gives
* us the fragmentation metric. This means that a high fragmentation metric
@@ -1745,10 +2043,10 @@ int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
};
/*
- * Calclate the metaslab's fragmentation metric. A return value
- * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
- * not support this metric. Otherwise, the return value should be in the
- * range [0, 100].
+ * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
+ * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
+ * been upgraded and does not support this metric. Otherwise, the return
+ * value should be in the range [0, 100].
*/
static void
metaslab_set_fragmentation(metaslab_t *msp)
@@ -1841,7 +2139,7 @@ metaslab_space_weight(metaslab_t *msp)
/*
* The baseline weight is the metaslab's free space.
*/
- space = msp->ms_size - space_map_allocated(msp->ms_sm);
+ space = msp->ms_size - metaslab_allocated_space(msp);
if (metaslab_fragmentation_factor_enabled &&
msp->ms_fragmentation != ZFS_FRAG_INVALID) {
@@ -1945,14 +2243,38 @@ metaslab_weight_from_range_tree(metaslab_t *msp)
static uint64_t
metaslab_weight_from_spacemap(metaslab_t *msp)
{
- uint64_t weight = 0;
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(!msp->ms_loaded);
+ ASSERT(sm != NULL);
+ ASSERT3U(space_map_object(sm), !=, 0);
+ ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+ /*
+ * Create a joint histogram from all the segments that have made
+ * it to the metaslab's space map histogram, that are not yet
+ * available for allocation because they are still in the freeing
+ * pipeline (e.g. freeing, freed, and defer trees). Then subtract
+ * these segments from the space map's histogram to get a more
+ * accurate weight.
+ */
+ uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+ deferspace_histogram[i] += msp->ms_synchist[i];
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ deferspace_histogram[i] += msp->ms_deferhist[t][i];
+ }
+ }
+
+ uint64_t weight = 0;
for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
- if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
- WEIGHT_SET_COUNT(weight,
- msp->ms_sm->sm_phys->smp_histogram[i]);
- WEIGHT_SET_INDEX(weight, i +
- msp->ms_sm->sm_shift);
+ ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
+ deferspace_histogram[i]);
+ uint64_t count =
+ sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
+ if (count != 0) {
+ WEIGHT_SET_COUNT(weight, count);
+ WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
WEIGHT_SET_ACTIVE(weight, 0);
break;
}
@@ -1977,7 +2299,7 @@ metaslab_segment_weight(metaslab_t *msp)
/*
* The metaslab is completely free.
*/
- if (space_map_allocated(msp->ms_sm) == 0) {
+ if (metaslab_allocated_space(msp) == 0) {
int idx = highbit64(msp->ms_size) - 1;
int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
@@ -1999,7 +2321,7 @@ metaslab_segment_weight(metaslab_t *msp)
/*
* If the metaslab is fully allocated then just make the weight 0.
*/
- if (space_map_allocated(msp->ms_sm) == msp->ms_size)
+ if (metaslab_allocated_space(msp) == msp->ms_size)
return (0);
/*
* If the metaslab is already loaded, then use the range tree to
@@ -2080,6 +2402,8 @@ metaslab_weight(metaslab_t *msp)
*/
if (msp->ms_loaded)
msp->ms_max_size = metaslab_block_maxsize(msp);
+ else
+ ASSERT0(msp->ms_max_size);
/*
* Segment-based weighting requires space map histogram support.
@@ -2095,6 +2419,15 @@ metaslab_weight(metaslab_t *msp)
return (weight);
}
+void
+metaslab_recalculate_weight_and_sort(metaslab_t *msp)
+{
+ /* note: we preserve the mask (e.g. indication of primary, etc..) */
+ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+ metaslab_group_sort(msp->ms_group, msp,
+ metaslab_weight(msp) | was_active);
+}
+
static int
metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
int allocator, uint64_t activation_weight)
@@ -2479,17 +2812,17 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
VERIFY(txg <= spa_final_dirty_txg(spa));
/*
- * The only state that can actually be changing concurrently with
- * metaslab_sync() is the metaslab's ms_allocatable. No other
- * thread can be modifying this txg's alloc, freeing,
+ * The only state that can actually be changing concurrently
+ * with metaslab_sync() is the metaslab's ms_allocatable. No
+ * other thread can be modifying this txg's alloc, freeing,
* freed, or space_map_phys_t. We drop ms_lock whenever we
- * could call into the DMU, because the DMU can call down to us
- * (e.g. via zio_free()) at any time.
+ * could call into the DMU, because the DMU can call down to
+ * us (e.g. via zio_free()) at any time.
*
* The spa_vdev_remove_thread() can be reading metaslab state
- * concurrently, and it is locked out by the ms_sync_lock. Note
- * that the ms_lock is insufficient for this, because it is dropped
- * by space_map_write().
+ * concurrently, and it is locked out by the ms_sync_lock.
+ * Note that the ms_lock is insufficient for this, because it
+ * is dropped by space_map_write().
*/
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
@@ -2501,7 +2834,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
msp->ms_start, msp->ms_size, vd->vdev_ashift));
+
ASSERT(msp->ms_sm != NULL);
+ ASSERT0(metaslab_allocated_space(msp));
}
if (!range_tree_is_empty(msp->ms_checkpointing) &&
@@ -2549,6 +2884,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
mutex_enter(&msp->ms_lock);
}
+ msp->ms_allocated_space += range_tree_space(alloctree);
+ ASSERT3U(msp->ms_allocated_space, >=,
+ range_tree_space(msp->ms_freeing));
+ msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
+
if (!range_tree_is_empty(msp->ms_checkpointing)) {
ASSERT(spa_has_checkpoint(spa));
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
@@ -2562,14 +2902,13 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
space_map_write(vd->vdev_checkpoint_sm,
msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
mutex_enter(&msp->ms_lock);
- space_map_update(vd->vdev_checkpoint_sm);
spa->spa_checkpoint_info.sci_dspace +=
range_tree_space(msp->ms_checkpointing);
vd->vdev_stat.vs_checkpoint_space +=
range_tree_space(msp->ms_checkpointing);
ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
- -vd->vdev_checkpoint_sm->sm_alloc);
+ -space_map_allocated(vd->vdev_checkpoint_sm));
range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
}
@@ -2614,6 +2953,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* time we load the space map.
*/
space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
+ metaslab_aux_histograms_update(msp);
metaslab_group_histogram_add(mg, msp);
metaslab_group_histogram_verify(mg);
@@ -2621,16 +2961,18 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
/*
* For sync pass 1, we avoid traversing this txg's free range tree
- * and instead will just swap the pointers for freeing and
- * freed. We can safely do this since the freed_tree is
- * guaranteed to be empty on the initial pass.
+ * and instead will just swap the pointers for freeing and freed.
+ * We can safely do this since the freed_tree is guaranteed to be
+ * empty on the initial pass.
*/
if (spa_sync_pass(spa) == 1) {
range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
+ ASSERT0(msp->ms_allocated_this_txg);
} else {
range_tree_vacate(msp->ms_freeing,
range_tree_add, msp->ms_freed);
}
+ msp->ms_allocated_this_txg += range_tree_space(alloctree);
range_tree_vacate(alloctree, NULL, NULL);
ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
@@ -2708,7 +3050,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
}
defer_delta = 0;
- alloc_delta = space_map_alloc_delta(msp->ms_sm);
+ alloc_delta = msp->ms_allocated_this_txg -
+ range_tree_space(msp->ms_freed);
if (defer_allowed) {
defer_delta = range_tree_space(msp->ms_freed) -
range_tree_space(*defer_tree);
@@ -2740,7 +3083,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
msp->ms_loaded ? range_tree_add : NULL,
msp->ms_allocatable);
}
- space_map_update(msp->ms_sm);
+
+ msp->ms_synced_length = space_map_length(msp->ms_sm);
msp->ms_deferspace += defer_delta;
ASSERT3S(msp->ms_deferspace, >=, 0);
@@ -2752,6 +3096,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
*/
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
}
+ metaslab_aux_histograms_update_done(msp, defer_allowed);
if (msp->ms_new) {
msp->ms_new = B_FALSE;
@@ -2759,12 +3104,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
mg->mg_ms_ready++;
mutex_exit(&mg->mg_lock);
}
+
/*
- * Calculate the new weights before unloading any metaslabs.
- * This will give us the most accurate weighting.
+ * Re-sort metaslab within its group now that we've adjusted
+ * its allocatable space.
*/
- metaslab_group_sort(mg, msp, metaslab_weight(msp) |
- (msp->ms_weight & METASLAB_ACTIVE_MASK));
+ metaslab_recalculate_weight_and_sort(msp);
/*
* If the metaslab is loaded and we've not tried to load or allocate
@@ -2791,6 +3136,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
ASSERT0(range_tree_space(msp->ms_freed));
ASSERT0(range_tree_space(msp->ms_checkpointing));
+ msp->ms_allocated_this_txg = 0;
mutex_exit(&msp->ms_lock);
}
@@ -4073,7 +4419,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
zio_alloc_list_t *zal, zio_t *zio, int allocator)
{
dva_t *dva = bp->blk_dva;
- dva_t *hintdva = hintbp->blk_dva;
+ dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
int error = 0;
ASSERT(bp->blk_birth == 0);
@@ -4240,14 +4586,16 @@ metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
mutex_enter(&msp->ms_lock);
- if (msp->ms_loaded)
- range_tree_verify(msp->ms_allocatable, offset, size);
+ if (msp->ms_loaded) {
+ range_tree_verify_not_present(msp->ms_allocatable,
+ offset, size);
+ }
- range_tree_verify(msp->ms_freeing, offset, size);
- range_tree_verify(msp->ms_checkpointing, offset, size);
- range_tree_verify(msp->ms_freed, offset, size);
+ range_tree_verify_not_present(msp->ms_freeing, offset, size);
+ range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
+ range_tree_verify_not_present(msp->ms_freed, offset, size);
for (int j = 0; j < TXG_DEFER_SIZE; j++)
- range_tree_verify(msp->ms_defer[j], offset, size);
+ range_tree_verify_not_present(msp->ms_defer[j], offset, size);
mutex_exit(&msp->ms_lock);
}
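
The new metaslab_weight_from_spacemap() above builds a joint histogram of everything still in the freeing pipeline (ms_synchist plus ms_deferhist[]) and subtracts it from the space map histogram before picking the highest non-empty bucket. Below is a standalone sketch of that subtraction under assumed histogram dimensions; the function name and constants are illustrative, not the kernel's.

/*
 * Sketch: pick the index of the largest bucket that still has free
 * segments once the not-yet-allocatable entries are subtracted.
 */
#include <stdint.h>
#include <assert.h>

#define SM_HIST_SIZE    32      /* assumed, mirrors SPACE_MAP_HISTOGRAM_SIZE */
#define DEFER_SIZE      2       /* assumed, mirrors TXG_DEFER_SIZE */

static int
highest_free_bucket(const uint64_t smhist[SM_HIST_SIZE],
    const uint64_t synchist[SM_HIST_SIZE],
    const uint64_t deferhist[DEFER_SIZE][SM_HIST_SIZE], int sm_shift)
{
    uint64_t pending[SM_HIST_SIZE] = { 0 };

    /* Joint histogram of segments still in the freeing pipeline. */
    for (int i = 0; i < SM_HIST_SIZE; i++)
        pending[i] += synchist[i];
    for (int t = 0; t < DEFER_SIZE; t++) {
        for (int i = 0; i < SM_HIST_SIZE; i++)
            pending[i] += deferhist[t][i];
    }

    /* Walk from the largest bucket down, as the weight code does. */
    for (int i = SM_HIST_SIZE - 1; i >= 0; i--) {
        assert(smhist[i] >= pending[i]);
        if (smhist[i] - pending[i] != 0)
            return (i + sm_shift);
    }
    return (-1);        /* nothing allocatable at all */
}
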
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
index e8bfb5a8c289..fc705e37964d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
@@ -511,13 +511,11 @@ range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
}
void
-range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
+range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size)
{
- range_seg_t *rs;
-
- rs = range_tree_find(rt, off, size);
+ range_seg_t *rs = range_tree_find(rt, off, size);
if (rs != NULL)
- panic("freeing free block; rs=%p", (void *)rs);
+ panic("segment already in tree; rs=%p", (void *)rs);
}
boolean_t
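
The rename makes the contract explicit: the caller expects the region to be absent and treats any overlap as fatal. A small userspace model of the same check follows, with a linear scan over a plain array standing in for the AVL-backed range tree; the names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct seg {
    uint64_t    start;
    uint64_t    size;
} seg_t;

/* Return the first segment overlapping [start, start + size), or NULL. */
static const seg_t *
segs_find(const seg_t *segs, int nsegs, uint64_t start, uint64_t size)
{
    for (int i = 0; i < nsegs; i++) {
        if (start < segs[i].start + segs[i].size &&
            segs[i].start < start + size)
            return (&segs[i]);
    }
    return (NULL);
}

/* Same contract as range_tree_verify_not_present(): die on overlap. */
static void
segs_verify_not_present(const seg_t *segs, int nsegs,
    uint64_t start, uint64_t size)
{
    const seg_t *rs = segs_find(segs, nsegs, start, size);

    if (rs != NULL) {
        fprintf(stderr, "segment already in tree; rs=%p\n",
            (const void *)rs);
        abort();
    }
}
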
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
index 12d50366455c..62c3137cd590 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
@@ -129,7 +129,7 @@
* uberblock would reference data in the removed device. For this reason
* and others of similar nature, we disallow the following operations that
* can change the config:
- * vdev removal and attach/detach, mirror splitting, and pool reguid.
+ * vdev removal and attach/detach, mirror splitting, and pool reguid.
*
* - As most of the checkpoint logic is implemented in the SPA and doesn't
* distinguish datasets when it comes to space accounting, having a
@@ -262,7 +262,7 @@ spa_checkpoint_accounting_verify(spa_t *spa)
if (vd->vdev_checkpoint_sm != NULL) {
ckpoint_sm_space_sum +=
- -vd->vdev_checkpoint_sm->sm_alloc;
+ -space_map_allocated(vd->vdev_checkpoint_sm);
vs_ckpoint_space_sum +=
vd->vdev_stat.vs_checkpoint_space;
ASSERT3U(ckpoint_sm_space_sum, ==,
@@ -347,7 +347,7 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
error, vd->vdev_id);
}
ASSERT0(words_after);
- ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
+ ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
space_map_free(vd->vdev_checkpoint_sm, tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
index 7356e3ceea75..9ed7a1f4b761 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -86,20 +86,22 @@ sm_entry_is_double_word(uint64_t e)
/*
* Iterate through the space map, invoking the callback on each (non-debug)
- * space map entry.
+ * space map entry. Stop after reading 'end' bytes of the space map.
*/
int
-space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
+space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
{
- uint64_t sm_len = space_map_length(sm);
- ASSERT3U(sm->sm_blksz, !=, 0);
+ uint64_t blksz = sm->sm_blksz;
+
+ ASSERT3U(blksz, !=, 0);
+ ASSERT3U(end, <=, space_map_length(sm));
+ ASSERT0(P2PHASE(end, sizeof (uint64_t)));
- dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
+ dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
ZIO_PRIORITY_SYNC_READ);
- uint64_t blksz = sm->sm_blksz;
int error = 0;
- for (uint64_t block_base = 0; block_base < sm_len && error == 0;
+ for (uint64_t block_base = 0; block_base < end && error == 0;
block_base += blksz) {
dmu_buf_t *db;
error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
@@ -108,7 +110,7 @@ space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
return (error);
uint64_t *block_start = db->db_data;
- uint64_t block_length = MIN(sm_len - block_base, blksz);
+ uint64_t block_length = MIN(end - block_base, blksz);
uint64_t *block_end = block_start +
(block_length / sizeof (uint64_t));
@@ -191,7 +193,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
* dmu_buf_hold().
*/
uint64_t last_word_offset =
- sm->sm_phys->smp_objsize - sizeof (uint64_t);
+ sm->sm_phys->smp_length - sizeof (uint64_t);
error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
FTAG, &db, DMU_READ_NO_PREFETCH);
if (error != 0)
@@ -204,7 +206,7 @@ space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
uint64_t *words = db->db_data;
*nwords =
- (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
+ (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
@@ -303,8 +305,7 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
uint64_t e = buf[i];
if (sm_entry_is_debug(e)) {
- sm->sm_phys->smp_objsize -= sizeof (uint64_t);
- space_map_update(sm);
+ sm->sm_phys->smp_length -= sizeof (uint64_t);
continue;
}
@@ -359,15 +360,13 @@ space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
sm->sm_phys->smp_alloc -= entry_run;
else
sm->sm_phys->smp_alloc += entry_run;
- sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
- space_map_update(sm);
+ sm->sm_phys->smp_length -= words * sizeof (uint64_t);
}
}
if (space_map_length(sm) == 0) {
ASSERT0(error);
- ASSERT0(sm->sm_phys->smp_objsize);
- ASSERT0(sm->sm_alloc);
+ ASSERT0(space_map_allocated(sm));
}
zio_buf_free(buf, bufsz);
@@ -396,38 +395,42 @@ space_map_load_callback(space_map_entry_t *sme, void *arg)
}
/*
- * Load the space map disk into the specified range tree. Segments of maptype
- * are added to the range tree, other segment types are removed.
+ * Load the spacemap into the rangetree, like space_map_load. But only
+ * read the first 'length' bytes of the spacemap.
*/
int
-space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
+space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t length)
{
- uint64_t space;
- int err;
space_map_load_arg_t smla;
VERIFY0(range_tree_space(rt));
- space = space_map_allocated(sm);
- if (maptype == SM_FREE) {
+ if (maptype == SM_FREE)
range_tree_add(rt, sm->sm_start, sm->sm_size);
- space = sm->sm_size - space;
- }
smla.smla_rt = rt;
smla.smla_sm = sm;
smla.smla_type = maptype;
- err = space_map_iterate(sm, space_map_load_callback, &smla);
+ int err = space_map_iterate(sm, length,
+ space_map_load_callback, &smla);
- if (err == 0) {
- VERIFY3U(range_tree_space(rt), ==, space);
- } else {
+ if (err != 0)
range_tree_vacate(rt, NULL, NULL);
- }
return (err);
}
+/*
+ * Load the space map disk into the specified range tree. Segments of maptype
+ * are added to the range tree, other segment types are removed.
+ */
+int
+space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
+{
+ return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
+}
+
void
space_map_histogram_clear(space_map_t *sm)
{
@@ -511,10 +514,10 @@ space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
- dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
+ dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
sizeof (dentry), &dentry, tx);
- sm->sm_phys->smp_objsize += sizeof (dentry);
+ sm->sm_phys->smp_length += sizeof (dentry);
}
/*
@@ -546,7 +549,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
uint64_t *block_base = db->db_data;
uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
uint64_t *block_cursor = block_base +
- (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
+ (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
ASSERT3P(block_cursor, <=, block_end);
@@ -569,7 +572,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
if (block_cursor == block_end) {
dmu_buf_rele(db, tag);
- uint64_t next_word_offset = sm->sm_phys->smp_objsize;
+ uint64_t next_word_offset = sm->sm_phys->smp_length;
VERIFY0(dmu_buf_hold(sm->sm_os,
space_map_object(sm), next_word_offset,
tag, &db, DMU_READ_PREFETCH));
@@ -599,7 +602,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
SM_DEBUG_SYNCPASS_ENCODE(0) |
SM_DEBUG_TXG_ENCODE(0);
block_cursor++;
- sm->sm_phys->smp_objsize += sizeof (uint64_t);
+ sm->sm_phys->smp_length += sizeof (uint64_t);
ASSERT3P(block_cursor, ==, block_end);
continue;
}
@@ -630,7 +633,7 @@ space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
words);
break;
}
- sm->sm_phys->smp_objsize += words * sizeof (uint64_t);
+ sm->sm_phys->smp_length += words * sizeof (uint64_t);
start += run_len;
size -= run_len;
@@ -657,7 +660,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* We do this right after we write the intro debug entry
* because the estimate does not take it into account.
*/
- uint64_t initial_objsize = sm->sm_phys->smp_objsize;
+ uint64_t initial_objsize = sm->sm_phys->smp_length;
uint64_t estimated_growth =
space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
@@ -668,7 +671,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* and use that to get a hold of the last block, so we can
* start appending to it.
*/
- uint64_t next_word_offset = sm->sm_phys->smp_objsize;
+ uint64_t next_word_offset = sm->sm_phys->smp_length;
VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
ASSERT3U(db->db_size, ==, sm->sm_blksz);
@@ -716,7 +719,7 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
* Therefore we expect the actual objsize to be equal or less
* than whatever we estimated it to be.
*/
- ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
+ ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
#endif
}
@@ -872,23 +875,10 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
}
dmu_buf_will_dirty(sm->sm_dbuf, tx);
- sm->sm_phys->smp_objsize = 0;
+ sm->sm_phys->smp_length = 0;
sm->sm_phys->smp_alloc = 0;
}
-/*
- * Update the in-core space_map allocation and length values.
- */
-void
-space_map_update(space_map_t *sm)
-{
- if (sm == NULL)
- return;
-
- sm->sm_alloc = sm->sm_phys->smp_alloc;
- sm->sm_length = sm->sm_phys->smp_objsize;
-}
-
uint64_t
space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
@@ -1070,32 +1060,14 @@ space_map_object(space_map_t *sm)
return (sm != NULL ? sm->sm_object : 0);
}
-/*
- * Returns the already synced, on-disk allocated space.
- */
-uint64_t
+int64_t
space_map_allocated(space_map_t *sm)
{
- return (sm != NULL ? sm->sm_alloc : 0);
+ return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
}
-/*
- * Returns the already synced, on-disk length;
- */
uint64_t
space_map_length(space_map_t *sm)
{
- return (sm != NULL ? sm->sm_length : 0);
-}
-
-/*
- * Returns the allocated space that is currently syncing.
- */
-int64_t
-space_map_alloc_delta(space_map_t *sm)
-{
- if (sm == NULL)
- return (0);
- ASSERT(sm->sm_dbuf != NULL);
- return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
+ return (sm != NULL ? sm->sm_phys->smp_length : 0);
}
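
space_map_load_length() exists so that metaslab_load_impl() can snapshot ms_synced_length before dropping ms_lock and then read only that prefix of the space map, ignoring whatever a concurrent metaslab_sync() appends afterwards. A minimal model of the bounded iteration, with a flat word array standing in for the on-disk object and illustrative names throughout:

#include <stdint.h>
#include <stddef.h>

typedef int (*entry_cb_t)(uint64_t entry, void *arg);

/*
 * Walk only the first 'end_words' entries; anything appended past that
 * point by a concurrent writer is deliberately ignored, mirroring the
 * 'end' argument that space_map_iterate() gained in this change.
 */
static int
entries_iterate(const uint64_t *entries, uint64_t end_words,
    entry_cb_t cb, void *arg)
{
    int error = 0;

    for (uint64_t i = 0; i < end_words && error == 0; i++)
        error = cb(entries[i], arg);
    return (error);
}

/* Example callback: count the entries that were visited. */
static int
count_cb(uint64_t entry, void *arg)
{
    (void) entry;
    (*(uint64_t *)arg)++;
    return (0);
}
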
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
index 285cee006778..7219dc967427 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
@@ -52,6 +52,8 @@ void metaslab_fini(metaslab_t *);
int metaslab_load(metaslab_t *);
void metaslab_unload(metaslab_t *);
+uint64_t metaslab_allocated_space(metaslab_t *);
+
void metaslab_sync(metaslab_t *, uint64_t);
void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
@@ -116,6 +118,7 @@ void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
boolean_t);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
+void metaslab_recalculate_weight_and_sort(metaslab_t *);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
index 6cce0614b4a1..ae49795fec1a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -341,8 +341,34 @@ struct metaslab_group {
* being written.
*/
struct metaslab {
+ /*
+ * This is the main lock of the metaslab and its purpose is to
+ * coordinate our allocations and frees [e.g metaslab_block_alloc(),
+ * metaslab_free_concrete(), ..etc] with our various syncing
+ * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc].
+ *
+ * The lock is also used during some miscellaneous operations like
+ * using the metaslab's histogram for the metaslab group's histogram
+ * aggregation, or marking the metaslab for initialization.
+ */
kmutex_t ms_lock;
+
+ /*
+ * Acquired together with the ms_lock whenever we expect to
+ * write to metaslab data on-disk (i.e flushing entries to
+ * the metaslab's space map). It helps coordinate readers of
+ * the metaslab's space map [see spa_vdev_remove_thread()]
+ * with writers [see metaslab_sync()].
+ *
+ * Note that metaslab_load(), even though a reader, uses
+ * a completely different mechanism to deal with the reading
+ * of the metaslab's space map based on ms_synced_length. That
+ * said, the function still uses the ms_sync_lock after it
+ * has read the ms_sm [see relevant comment in metaslab_load()
+ * as to why].
+ */
kmutex_t ms_sync_lock;
+
kcondvar_t ms_load_cv;
space_map_t *ms_sm;
uint64_t ms_id;
@@ -352,6 +378,7 @@ struct metaslab {
range_tree_t *ms_allocating[TXG_SIZE];
range_tree_t *ms_allocatable;
+ uint64_t ms_allocated_this_txg;
/*
* The following range trees are accessed only from syncing context.
@@ -376,6 +403,55 @@ struct metaslab {
boolean_t ms_loaded;
boolean_t ms_loading;
+ /*
+ * The following histograms count entries that are in the
+ * metaslab's space map (and its histogram) but are not in
+ * ms_allocatable yet, because they are in ms_freed, ms_freeing,
+ * or ms_defer[].
+ *
+ * When the metaslab is not loaded, its ms_weight needs to
+ * reflect what is allocatable (i.e. what will be part of
+ * ms_allocatable if it is loaded). The weight is computed from
+ * the spacemap histogram, but that includes ranges that are
+ * not yet allocatable (because they are in ms_freed,
+ * ms_freeing, or ms_defer[]). Therefore, when calculating the
+ * weight, we need to remove those ranges.
+ *
+ * The ranges in the ms_freed and ms_defer[] range trees are all
+ * present in the spacemap. However, the spacemap may have
+ * multiple entries to represent a contiguous range, because it
+ * is written across multiple sync passes, but the changes of
+ * all sync passes are consolidated into the range trees.
+ * Adjacent ranges that are freed in different sync passes of
+ * one txg will be represented separately (as 2 or more entries)
+ * in the space map (and its histogram), but these adjacent
+ * ranges will be consolidated (represented as one entry) in the
+ * ms_freed/ms_defer[] range trees (and their histograms).
+ *
+ * When calculating the weight, we can not simply subtract the
+ * range trees' histograms from the spacemap's histogram,
+ * because the range trees' histograms may have entries in
+ * higher buckets than the spacemap, due to consolidation.
+ * Instead we must subtract the exact entries that were added to
+ * the spacemap's histogram. ms_synchist and ms_deferhist[]
+ * represent these exact entries, so we can subtract them from
+ * the spacemap's histogram when calculating ms_weight.
+ *
+ * ms_synchist represents the same ranges as ms_freeing +
+ * ms_freed, but without consolidation across sync passes.
+ *
+ * ms_deferhist[i] represents the same ranges as ms_defer[i],
+ * but without consolidation across sync passes.
+ */
+ uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
+ uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
+
+ /*
+ * Tracks the exact amount of allocated space of this metaslab
+ * (and specifically the metaslab's space map) up to the most
+ * recently completed sync pass [see usage in metaslab_sync()].
+ */
+ uint64_t ms_allocated_space;
int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */
uint64_t ms_activation_weight; /* activation weight */
@@ -412,6 +488,9 @@ struct metaslab {
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
+ /* updated every time we are done syncing the metaslab's space map */
+ uint64_t ms_synced_length;
+
boolean_t ms_new;
};
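
The ms_synchist/ms_deferhist comment above is easier to see with numbers: two adjacent 4 KB frees made in different sync passes of one txg appear as two 4 KB entries in the space map histogram, but as a single consolidated 8 KB segment in ms_freed. A throwaway program illustrating why subtracting the range tree histogram directly would underflow the 4 KB bucket:

#include <stdint.h>
#include <stdio.h>

/* Power-of-two histogram bucket, i.e. floor(log2(size)). */
static int
bucket(uint64_t size)
{
    int b = -1;

    while (size != 0) {
        size >>= 1;
        b++;
    }
    return (b);
}

int
main(void)
{
    /* Written in separate sync passes: two entries in the 4 KB bucket. */
    printf("space map buckets:  %d, %d\n", bucket(4096), bucket(4096));

    /* Consolidated in the range tree: one entry in the 8 KB bucket. */
    printf("range tree bucket:  %d\n", bucket(8192));

    /*
     * Subtracting the range tree histogram from the space map histogram
     * would go negative in the 4 KB bucket; ms_synchist/ms_deferhist[]
     * record the exact unconsolidated entries instead.
     */
    return (0);
}
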
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
index 961202bb6cf2..bbdf66cade63 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
@@ -87,12 +87,13 @@ range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg);
void range_tree_destroy(range_tree_t *rt);
boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_verify_not_present(range_tree_t *rt,
+ uint64_t start, uint64_t size);
range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
uint64_t newstart, uint64_t newsize);
uint64_t range_tree_space(range_tree_t *rt);
boolean_t range_tree_is_empty(range_tree_t *rt);
-void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
void range_tree_stat_verify(range_tree_t *rt);
uint64_t range_tree_min(range_tree_t *rt);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
index d3d852978a57..2bce20b48ba5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPACE_MAP_H
@@ -55,10 +55,17 @@ extern "C" {
* for backward compatibility.
*/
typedef struct space_map_phys {
- uint64_t smp_object; /* on-disk space map object */
- uint64_t smp_objsize; /* size of the object */
- int64_t smp_alloc; /* space allocated from the map */
- uint64_t smp_pad[5]; /* reserved */
+ /* object number: not needed but kept for backwards compatibility */
+ uint64_t smp_object;
+
+ /* length of the object in bytes */
+ uint64_t smp_length;
+
+ /* space allocated from the map */
+ int64_t smp_alloc;
+
+ /* reserved */
+ uint64_t smp_pad[5];
/*
* The smp_histogram maintains a histogram of free regions. Each
@@ -81,8 +88,6 @@ typedef struct space_map {
uint64_t sm_start; /* start of map */
uint64_t sm_size; /* size of map */
uint8_t sm_shift; /* unit shift */
- uint64_t sm_length; /* synced length */
- int64_t sm_alloc; /* synced space allocated */
objset_t *sm_os; /* objset for this map */
uint64_t sm_object; /* object id for this map */
uint32_t sm_blksz; /* block size for space map */
@@ -189,18 +194,20 @@ boolean_t sm_entry_is_double_word(uint64_t e);
typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg);
int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
-int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);
+int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t length);
+int space_map_iterate(space_map_t *sm, uint64_t length,
+ sm_cb_t callback, void *arg);
int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
dmu_tx_t *tx);
+boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt);
void space_map_histogram_clear(space_map_t *sm);
void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
dmu_tx_t *tx);
-void space_map_update(space_map_t *sm);
-
uint64_t space_map_object(space_map_t *sm);
-uint64_t space_map_allocated(space_map_t *sm);
+int64_t space_map_allocated(space_map_t *sm);
uint64_t space_map_length(space_map_t *sm);
void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
@@ -216,8 +223,6 @@ int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
uint64_t start, uint64_t size, uint8_t shift);
void space_map_close(space_map_t *sm);
-int64_t space_map_alloc_delta(space_map_t *sm);
-
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
index e731f2b28b73..5baac4852116 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
@@ -268,7 +268,6 @@ struct vdev {
uint64_t vdev_islog; /* is an intent log device */
uint64_t vdev_removing; /* device is being removed? */
boolean_t vdev_ishole; /* is a hole in the namespace */
- kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
@@ -328,16 +327,6 @@ struct vdev {
space_map_t *vdev_obsolete_sm;
/*
- * The queue depth parameters determine how many async writes are
- * still pending (i.e. allocated but not yet issued to disk) per
- * top-level (vdev_async_write_queue_depth) and the maximum allowed
- * (vdev_max_async_write_queue_depth). These values only apply to
- * top-level vdevs.
- */
- uint64_t vdev_async_write_queue_depth;
- uint64_t vdev_max_async_write_queue_depth;
-
- /*
* Protects the vdev_scan_io_queue field itself as well as the
* structure's contents (when present).
*/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
index bbc5f416091e..c91551062d3b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -630,7 +630,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -1032,7 +1031,6 @@ vdev_free(vdev_t *vd)
rw_destroy(&vd->vdev_indirect_rwlock);
mutex_destroy(&vd->vdev_obsolete_lock);
- mutex_destroy(&vd->vdev_queue_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
@@ -1401,12 +1399,12 @@ vdev_metaslab_fini(vdev_t *vd)
}
if (vd->vdev_ms != NULL) {
- uint64_t count = vd->vdev_ms_count;
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_group_passivate(mg);
- metaslab_group_passivate(vd->vdev_mg);
+ uint64_t count = vd->vdev_ms_count;
for (uint64_t m = 0; m < count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
-
if (msp != NULL)
metaslab_fini(msp);
}
@@ -1414,6 +1412,9 @@ vdev_metaslab_fini(vdev_t *vd)
vd->vdev_ms = NULL;
vd->vdev_ms_count = 0;
+
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ ASSERT0(mg->mg_histogram[i]);
}
ASSERT0(vd->vdev_ms_count);
}
@@ -2767,13 +2768,6 @@ vdev_dtl_load(vdev_t *vd)
ASSERT(vd->vdev_dtl_sm != NULL);
mutex_enter(&vd->vdev_dtl_lock);
-
- /*
- * Now that we've opened the space_map we need to update
- * the in-core DTL.
- */
- space_map_update(vd->vdev_dtl_sm);
-
error = space_map_load(vd->vdev_dtl_sm,
vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
mutex_exit(&vd->vdev_dtl_lock);
@@ -2933,10 +2927,6 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
}
dmu_tx_commit(tx);
-
- mutex_enter(&vd->vdev_dtl_lock);
- space_map_update(vd->vdev_dtl_sm);
- mutex_exit(&vd->vdev_dtl_lock);
}
/*
@@ -3079,7 +3069,10 @@ vdev_load(vdev_t *vd)
"asize=%llu", (u_longlong_t)vd->vdev_ashift,
(u_longlong_t)vd->vdev_asize);
return (SET_ERROR(ENXIO));
- } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
+ }
+
+ error = vdev_metaslab_init(vd, 0);
+ if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
"[error=%d]", error);
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -3093,9 +3086,10 @@ vdev_load(vdev_t *vd)
ASSERT(vd->vdev_asize != 0);
ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
- if ((error = space_map_open(&vd->vdev_checkpoint_sm,
+ error = space_map_open(&vd->vdev_checkpoint_sm,
mos, checkpoint_sm_obj, 0, vd->vdev_asize,
- vd->vdev_ashift))) {
+ vd->vdev_ashift);
+ if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: space_map_open "
"failed for checkpoint spacemap (obj %llu) "
"[error=%d]",
@@ -3103,15 +3097,15 @@ vdev_load(vdev_t *vd)
return (error);
}
ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
- space_map_update(vd->vdev_checkpoint_sm);
/*
* Since the checkpoint_sm contains free entries
- * exclusively we can use sm_alloc to indicate the
- * culmulative checkpointed space that has been freed.
+ * exclusively we can use space_map_allocated() to
+ * indicate the cumulative checkpointed space that
+ * has been freed.
*/
vd->vdev_stat.vs_checkpoint_space =
- -vd->vdev_checkpoint_sm->sm_alloc;
+ -space_map_allocated(vd->vdev_checkpoint_sm);
vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
vd->vdev_stat.vs_checkpoint_space;
}
@@ -3143,7 +3137,6 @@ vdev_load(vdev_t *vd)
(u_longlong_t)obsolete_sm_object, error);
return (error);
}
- space_map_update(vd->vdev_obsolete_sm);
}
return (0);
@@ -3230,47 +3223,6 @@ vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
ASSERT(vd == vd->vdev_top);
ASSERT3U(txg, ==, spa_syncing_txg(spa));
- if (vd->vdev_ms != NULL) {
- metaslab_group_t *mg = vd->vdev_mg;
-
- metaslab_group_histogram_verify(mg);
- metaslab_class_histogram_verify(mg->mg_class);
-
- for (int m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
-
- if (msp == NULL || msp->ms_sm == NULL)
- continue;
-
- mutex_enter(&msp->ms_lock);
- /*
- * If the metaslab was not loaded when the vdev
- * was removed then the histogram accounting may
- * not be accurate. Update the histogram information
- * here so that we ensure that the metaslab group
- * and metaslab class are up-to-date.
- */
- metaslab_group_histogram_remove(mg, msp);
-
- VERIFY0(space_map_allocated(msp->ms_sm));
- space_map_close(msp->ms_sm);
- msp->ms_sm = NULL;
- mutex_exit(&msp->ms_lock);
- }
-
- if (vd->vdev_checkpoint_sm != NULL) {
- ASSERT(spa_has_checkpoint(spa));
- space_map_close(vd->vdev_checkpoint_sm);
- vd->vdev_checkpoint_sm = NULL;
- }
-
- metaslab_group_histogram_verify(mg);
- metaslab_class_histogram_verify(mg->mg_class);
-
- for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
- ASSERT0(mg->mg_histogram[i]);
- }
-
dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
vdev_destroy_spacemaps(vd, tx);
@@ -3304,17 +3256,14 @@ vdev_sync(vdev_t *vd, uint64_t txg)
spa_t *spa = vd->vdev_spa;
vdev_t *lvd;
metaslab_t *msp;
- dmu_tx_t *tx;
+ ASSERT3U(txg, ==, spa->spa_syncing_txg);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
- dmu_tx_t *tx;
-
ASSERT(vd->vdev_removing ||
vd->vdev_ops == &vdev_indirect_ops);
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vdev_indirect_sync_obsolete(vd, tx);
- dmu_tx_commit(tx);
/*
* If the vdev is indirect, it can't have dirty
@@ -3323,6 +3272,7 @@ vdev_sync(vdev_t *vd, uint64_t txg)
if (vd->vdev_ops == &vdev_indirect_ops) {
ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
+ dmu_tx_commit(tx);
return;
}
}
@@ -3333,12 +3283,10 @@ vdev_sync(vdev_t *vd, uint64_t txg)
!vd->vdev_removing) {
ASSERT(vd == vd->vdev_top);
ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
- tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
ASSERT(vd->vdev_ms_array != 0);
vdev_config_dirty(vd);
- dmu_tx_commit(tx);
}
while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
@@ -3357,6 +3305,7 @@ vdev_sync(vdev_t *vd, uint64_t txg)
vdev_remove_empty_log(vd, txg);
(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+ dmu_tx_commit(tx);
}
uint64_t
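
The vdev_sync() hunks above replace the per-branch transactions with one assigned transaction created up front and committed once at the end, plus an explicit commit on the early return for indirect vdevs. A compact sketch of that control-flow shape, with placeholder helpers (tx_create(), tx_commit(), sync_one()) standing in for the dmu_tx calls:

#include <stdbool.h>
#include <stdio.h>

typedef struct tx { int txg; } tx_t;

static tx_t tx_create(int txg) { tx_t t = { txg }; return (t); }
static void tx_commit(tx_t *t) { printf("commit tx for txg %d\n", t->txg); }

static void
sync_one(bool is_indirect, bool needs_array, int txg)
{
	/* One transaction covers everything this call dirties. */
	tx_t tx = tx_create(txg);

	if (is_indirect) {
		/* The early out still has to commit before returning. */
		tx_commit(&tx);
		return;
	}

	if (needs_array)
		printf("allocate metaslab array in txg %d\n", txg);

	/* ... sync dirty metaslabs, trim an empty log, etc. ... */
	tx_commit(&tx);
}

int
main(void)
{
	sync_one(true, false, 100);	/* indirect: commit + return */
	sync_one(false, true, 101);	/* normal: single commit at end */
	return (0);
}

The design point is simply that every exit path owns exactly one commit, instead of each branch creating and committing its own short-lived transaction.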
@@ -3586,8 +3535,6 @@ top:
*/
if (error == 0 &&
tvd->vdev_checkpoint_sm != NULL) {
- ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc,
- !=, 0);
error = ZFS_ERR_CHECKPOINT_EXISTS;
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
index a3d0b0c66f33..0c9f3aeb1d94 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
@@ -680,7 +680,6 @@ spa_condense_indirect_thread(void *arg, zthr_t *zthr)
VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
- space_map_update(prev_obsolete_sm);
counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
if (prev_obsolete_sm != NULL) {
vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
@@ -831,7 +830,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
spa->spa_meta_objset, obsolete_sm_object,
0, vd->vdev_asize, 0));
- space_map_update(vd->vdev_obsolete_sm);
}
ASSERT(vd->vdev_obsolete_sm != NULL);
@@ -840,7 +838,6 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
space_map_write(vd->vdev_obsolete_sm,
vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
- space_map_update(vd->vdev_obsolete_sm);
range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
index 1da101733e4c..3d0f1344dd88 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
@@ -557,6 +557,7 @@ vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
losma.losma_counts = counts;
losma.losma_vim = vim;
VERIFY0(space_map_iterate(obsolete_space_sm,
+ space_map_length(obsolete_space_sm),
load_obsolete_sm_callback, &losma));
}
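
As the hunk shows, space_map_iterate() now takes an explicit length, and this caller passes space_map_length(). A tiny sketch of walking a log only up to a caller-supplied length follows; sm_entry_t, walk_cb_t and toy_sm_iterate() are invented for illustration, not the real iterator.

#include <stdint.h>
#include <stdio.h>

typedef struct sm_entry {
	uint64_t offset;
	uint64_t size;
} sm_entry_t;

typedef int (*walk_cb_t)(const sm_entry_t *, void *);

/*
 * Walk only the first 'length' bytes of the entry log; anything
 * appended past that point by a concurrent sync is ignored.
 */
static int
toy_sm_iterate(const sm_entry_t *log, uint64_t length,
    walk_cb_t cb, void *arg)
{
	uint64_t nentries = length / sizeof (sm_entry_t);

	for (uint64_t i = 0; i < nentries; i++) {
		int err = cb(&log[i], arg);
		if (err != 0)
			return (err);
	}
	return (0);
}

static int
print_entry(const sm_entry_t *e, void *arg)
{
	(void) arg;
	printf("entry: off=%llu size=%llu\n",
	    (unsigned long long)e->offset, (unsigned long long)e->size);
	return (0);
}

int
main(void)
{
	sm_entry_t log[3] = { { 0, 512 }, { 1024, 512 }, { 4096, 256 } };

	/* Only the first two entries fall within the given length. */
	return (toy_sm_iterate(log, 2 * sizeof (sm_entry_t),
	    print_entry, NULL));
}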
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
index a78fa2643e8e..34d959008bd5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
@@ -442,7 +442,7 @@ vdev_initialize_calculate_progress(vdev_t *vd)
mutex_enter(&msp->ms_lock);
uint64_t ms_free = msp->ms_size -
- space_map_allocated(msp->ms_sm);
+ metaslab_allocated_space(msp);
if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
ms_free /= vd->vdev_top->vdev_children;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
index bef33de3ffa3..ab51c8c79055 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
@@ -283,15 +283,8 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
if (ms->ms_sm == NULL)
continue;
- /*
- * Sync tasks happen before metaslab_sync(), therefore
- * smp_alloc and sm_alloc must be the same.
- */
- ASSERT3U(space_map_allocated(ms->ms_sm), ==,
- ms->ms_sm->sm_phys->smp_alloc);
-
spa->spa_removing_phys.sr_to_copy +=
- space_map_allocated(ms->ms_sm);
+ metaslab_allocated_space(ms);
/*
* Space which we are freeing this txg does not need to
@@ -1401,22 +1394,8 @@ spa_vdev_remove_thread(void *arg)
* appropriate action (see free_from_removing_vdev()).
*/
if (msp->ms_sm != NULL) {
- space_map_t *sm = NULL;
-
- /*
- * We have to open a new space map here, because
- * ms_sm's sm_length and sm_alloc may not reflect
- * what's in the object contents, if we are in between
- * metaslab_sync() and metaslab_sync_done().
- */
- VERIFY0(space_map_open(&sm,
- spa->spa_dsl_pool->dp_meta_objset,
- msp->ms_sm->sm_object, msp->ms_sm->sm_start,
- msp->ms_sm->sm_size, msp->ms_sm->sm_shift));
- space_map_update(sm);
- VERIFY0(space_map_load(sm, svr->svr_allocd_segs,
- SM_ALLOC));
- space_map_close(sm);
+ VERIFY0(space_map_load(msp->ms_sm,
+ svr->svr_allocd_segs, SM_ALLOC));
range_tree_walk(msp->ms_freeing,
range_tree_remove, svr->svr_allocd_segs);
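
The deleted comment explained why the old code opened a throwaway space map: between metaslab_sync() and metaslab_sync_done() the in-core totals could disagree with the on-disk object. With the synced length tracked separately, the removal thread can load msp->ms_sm directly. A hedged sketch of that "load only what is known synced" idea, with made-up fields and helpers (toy_sm_t, toy_load):

#include <stdint.h>
#include <stdio.h>

typedef struct toy_sm {
	uint64_t length;	/* bytes appended so far (may be dirty) */
	uint64_t synced_length;	/* bytes known to be on disk */
} toy_sm_t;

/* Load only what has been synced; in-flight appends are skipped. */
static void
toy_load(const toy_sm_t *sm)
{
	printf("loading %llu of %llu bytes\n",
	    (unsigned long long)sm->synced_length,
	    (unsigned long long)sm->length);
}

int
main(void)
{
	/* Between sync and sync_done: three entries not yet on disk. */
	toy_sm_t sm = { .length = 160, .synced_length = 112 };

	toy_load(&sm);
	return (0);
}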
@@ -1612,16 +1591,6 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
ASSERT0(range_tree_space(msp->ms_freed));
if (msp->ms_sm != NULL) {
- /*
- * Assert that the in-core spacemap has the same
- * length as the on-disk one, so we can use the
- * existing in-core spacemap to load it from disk.
- */
- ASSERT3U(msp->ms_sm->sm_alloc, ==,
- msp->ms_sm->sm_phys->smp_alloc);
- ASSERT3U(msp->ms_sm->sm_length, ==,
- msp->ms_sm->sm_phys->smp_objsize);
-
mutex_enter(&svr->svr_lock);
VERIFY0(space_map_load(msp->ms_sm,
svr->svr_allocd_segs, SM_ALLOC));
@@ -1714,9 +1683,6 @@ spa_vdev_remove_cancel(spa_t *spa)
return (error);
}
-/*
- * Called every sync pass of every txg if there's a svr.
- */
void
svr_sync(spa_t *spa, dmu_tx_t *tx)
{
@@ -1780,6 +1746,7 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
ASSERT(vd->vdev_islog);
ASSERT(vd == vd->vdev_top);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
/*
* Stop allocating from this vdev.
@@ -1794,15 +1761,14 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
*txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
/*
- * Evacuate the device. We don't hold the config lock as writer
- * since we need to do I/O but we do keep the
+ * Evacuate the device. We don't hold the config lock as
+ * writer since we need to do I/O but we do keep the
* spa_namespace_lock held. Once this completes the device
* should no longer have any blocks allocated on it.
*/
- if (vd->vdev_islog) {
- if (vd->vdev_stat.vs_alloc != 0)
- error = spa_reset_logs(spa);
- }
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (vd->vdev_stat.vs_alloc != 0)
+ error = spa_reset_logs(spa);
*txg = spa_vdev_config_enter(spa);
@@ -1821,6 +1787,8 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
vdev_dirty_leaves(vd, VDD_DTL, *txg);
vdev_config_dirty(vd);
+ vdev_metaslab_fini(vd);
+
spa_history_log_internal(spa, "vdev remove", NULL,
"%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
(vd->vdev_path != NULL) ? vd->vdev_path : "-");
@@ -1850,6 +1818,8 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
if (list_link_active(&vd->vdev_config_dirty_node))
vdev_config_clean(vd);
+ ASSERT0(vd->vdev_stat.vs_alloc);
+
/*
* Clean up the vdev namespace.
*/