-rw-r--r--  include/sys/metaslab.h       |   1
-rw-r--r--  include/sys/metaslab_impl.h  |  43
-rw-r--r--  include/sys/space_map.h      |   1
-rw-r--r--  module/zfs/metaslab.c        | 295
-rw-r--r--  module/zfs/vdev.c            |  19
5 files changed, 341 insertions, 18 deletions
diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index fd0d23502a43..a513a647039c 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -119,6 +119,7 @@ void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
boolean_t);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
+void metaslab_recalculate_weight_and_sort(metaslab_t *);
#ifdef __cplusplus
}
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
index 02ce02226b5d..676c5dd46bf3 100644
--- a/include/sys/metaslab_impl.h
+++ b/include/sys/metaslab_impl.h
@@ -403,6 +403,49 @@ struct metaslab {
boolean_t ms_loading;
/*
+ * The following histograms count entries that are in the
+ * metaslab's space map (and its histogram) but are not in
+ * ms_allocatable yet, because they are in ms_freed, ms_freeing,
+ * or ms_defer[].
+ *
+ * When the metaslab is not loaded, its ms_weight needs to
+ * reflect what is allocatable (i.e. what will be part of
+ * ms_allocatable if it is loaded). The weight is computed from
+ * the spacemap histogram, but that includes ranges that are
+ * not yet allocatable (because they are in ms_freed,
+ * ms_freeing, or ms_defer[]). Therefore, when calculating the
+ * weight, we need to remove those ranges.
+ *
+ * The ranges in the ms_freed and ms_defer[] range trees are all
+ * present in the spacemap. However, the spacemap may have
+ * multiple entries to represent a contiguous range, because it
+ * is written across multiple sync passes, but the changes of
+ * all sync passes are consolidated into the range trees.
+ * Adjacent ranges that are freed in different sync passes of
+ * one txg will be represented separately (as 2 or more entries)
+ * in the space map (and its histogram), but these adjacent
+ * ranges will be consolidated (represented as one entry) in the
+ * ms_freed/ms_defer[] range trees (and their histograms).
+ *
+ * When calculating the weight, we can not simply subtract the
+ * range trees' histograms from the spacemap's histogram,
+ * because the range trees' histograms may have entries in
+ * higher buckets than the spacemap, due to consolidation.
+ * Instead we must subtract the exact entries that were added to
+ * the spacemap's histogram. ms_synchist and ms_deferhist[]
+ * represent these exact entries, so we can subtract them from
+ * the spacemap's histogram when calculating ms_weight.
+ *
+ * ms_synchist represents the same ranges as ms_freeing +
+ * ms_freed, but without consolidation across sync passes.
+ *
+ * ms_deferhist[i] represents the same ranges as ms_defer[i],
+ * but without consolidation across sync passes.
+ */
+ uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
+ uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
+
+ /*
* Tracks the exact amount of allocated space of this metaslab
* (and specifically the metaslab's space map) up to the most
* recently completed sync pass [see usage in metaslab_sync()].
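
To make the consolidation issue described in the new comment concrete, here is a minimal, self-contained sketch (not part of the patch; the sm_shift value and segment sizes are hypothetical, only the bucket arithmetic mirrors the real histograms) showing how two adjacent frees from different sync passes land in different buckets depending on whether they were consolidated:

/*
 * Illustrative sketch only -- not part of the patch. sm_shift and the
 * segment sizes are hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t sm_hist[32] = { 0 };	/* per-entry, like the space map histogram */
	uint64_t rt_hist[64] = { 0 };	/* consolidated, like a range tree histogram */
	uint64_t shift = 9;		/* hypothetical sm_shift */

	/*
	 * Sync pass 1 frees a 64K segment and sync pass 2 frees the
	 * adjacent 64K segment. The space map writes two separate entries,
	 * so its histogram counts two 64K segments (log2(64K) = 16).
	 */
	sm_hist[16 - shift] += 2;

	/*
	 * ms_freed consolidates the two frees into one 128K segment
	 * (log2(128K) = 17), so its histogram has a single entry in a
	 * higher bucket than anything the space map recorded.
	 */
	rt_hist[17] += 1;

	/*
	 * Subtracting rt_hist from sm_hist bucket-by-bucket would
	 * underflow bucket 17 and leave bucket 16 overstated -- hence
	 * ms_synchist/ms_deferhist, which record exactly what was added
	 * to the space map histogram and are therefore safe to subtract.
	 */
	printf("space map 64K bucket: %llu, range tree 128K bucket: %llu\n",
	    (unsigned long long)sm_hist[16 - shift],
	    (unsigned long long)rt_hist[17]);
	return (0);
}
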
diff --git a/include/sys/space_map.h b/include/sys/space_map.h
index 52536cccca46..7731a352f15d 100644
--- a/include/sys/space_map.h
+++ b/include/sys/space_map.h
@@ -201,6 +201,7 @@ int space_map_iterate(space_map_t *sm, uint64_t length,
int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
dmu_tx_t *tx);
+boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt);
void space_map_histogram_clear(space_map_t *sm);
void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
dmu_tx_t *tx);
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 58c47a0abfb2..9f6f0048fcba 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -856,6 +856,7 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
+ ASSERT(msp != NULL);
/* skip if not active or not a member */
if (msp->ms_sm == NULL || msp->ms_group != mg)
@@ -1416,6 +1417,203 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
* ==========================================================================
*/
+static void
+metaslab_aux_histograms_clear(metaslab_t *msp)
+{
+ /*
+ * Auxiliary histograms are only cleared when resetting them,
+ * which can only happen while the metaslab is loaded.
+ */
+ ASSERT(msp->ms_loaded);
+
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
+}
+
+static void
+metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
+ range_tree_t *rt)
+{
+ /*
+ * This is modeled after space_map_histogram_add(), so refer to that
+ * function for implementation details. We want this to work like
+ * the space map histogram, and not the range tree histogram, as we
+ * are essentially constructing a delta that will be later subtracted
+ * from the space map histogram.
+ */
+ int idx = 0;
+ for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ ASSERT3U(i, >=, idx + shift);
+ histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
+
+ if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+ ASSERT3U(idx + shift, ==, i);
+ idx++;
+ ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+ }
+ }
+}
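
As a minimal sketch of the folding performed above (not part of the patch; RT_SIZE, SM_SIZE, and the shift value are hypothetical stand-ins for RANGE_TREE_HISTOGRAM_SIZE, SPACE_MAP_HISTOGRAM_SIZE, and sm_shift):

/* Illustrative sketch only -- a simplified stand-in for the folding above. */
#include <stdio.h>
#include <stdint.h>

#define	RT_SIZE	64
#define	SM_SIZE	32

static void
aux_histogram_add(uint64_t *histogram, uint64_t shift,
    const uint64_t *rt_histogram)
{
	/*
	 * Range tree bucket i (segments of roughly 2^i bytes) maps to
	 * histogram bucket i - shift. Buckets beyond the last histogram
	 * bucket are folded into that last bucket, scaled by the ratio
	 * of their segment size to the last bucket's segment size.
	 */
	int idx = 0;
	for (int i = shift; i < RT_SIZE; i++) {
		histogram[idx] += rt_histogram[i] << (i - idx - shift);
		if (idx < SM_SIZE - 1)
			idx++;
	}
}

int
main(void)
{
	uint64_t rt_hist[RT_SIZE] = { 0 };
	uint64_t aux[SM_SIZE] = { 0 };
	uint64_t shift = 9;

	rt_hist[16] = 1;	/* one ~64K segment */
	rt_hist[45] = 1;	/* one huge segment, past the last aux bucket */

	aux_histogram_add(aux, shift, rt_hist);

	/* the 64K segment lands in bucket 16 - 9 = 7; the huge one is folded */
	printf("aux[7] = %llu, aux[%d] = %llu\n",
	    (unsigned long long)aux[7], SM_SIZE - 1,
	    (unsigned long long)aux[SM_SIZE - 1]);
	return (0);
}

The scaling of oversized segments keeps the amount of space accounted for by the last bucket consistent, mirroring the space_map_histogram_add() behavior the patch's comment refers to.
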
+
+/*
+ * Called at every sync pass in which the metaslab gets synced.
+ *
+ * The reason is that we want our auxiliary histograms to be updated
+ * wherever the metaslab's space map histogram is updated. This way
+ * we stay consistent on which parts of the metaslab space map's
+ * histogram are currently not available for allocations (e.g. because
+ * they are in the defer, freed, and freeing trees).
+ */
+static void
+metaslab_aux_histograms_update(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(sm != NULL);
+
+ /*
+ * This is similar to the metaslab's space map histogram updates
+ * that take place in metaslab_sync(). The only difference is that
+ * we only care about segments that haven't made it into the
+ * ms_allocatable tree yet.
+ */
+ if (msp->ms_loaded) {
+ metaslab_aux_histograms_clear(msp);
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freed);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ metaslab_aux_histogram_add(msp->ms_deferhist[t],
+ sm->sm_shift, msp->ms_defer[t]);
+ }
+ }
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freeing);
+}
+
+/*
+ * Called every time we are done syncing (writing to) the metaslab,
+ * i.e. at the end of each sync pass.
+ * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
+ */
+static void
+metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ space_map_t *sm = msp->ms_sm;
+
+ if (sm == NULL) {
+ /*
+ * We came here from metaslab_init() when creating/opening a
+ * pool, looking at a metaslab that hasn't had any allocations
+ * yet.
+ */
+ return;
+ }
+
+ /*
+ * This is similar to the actions that we take for the ms_freed
+ * and ms_defer trees in metaslab_sync_done().
+ */
+ uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
+ if (defer_allowed) {
+ bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_synchist));
+ } else {
+ bzero(msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_deferhist[hist_index]));
+ }
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+}
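
A minimal sketch of the rotation performed above in the defer_allowed case (not part of the patch; histogram contents are made up, and HIST_SIZE stands in for SPACE_MAP_HISTOGRAM_SIZE; TXG_DEFER_SIZE is 2 in ZFS):

/* Illustrative sketch only -- models the defer-slot rotation, not the real structs. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define	TXG_DEFER_SIZE	2
#define	HIST_SIZE	32

int
main(void)
{
	uint64_t synchist[HIST_SIZE] = { 0 };
	uint64_t deferhist[TXG_DEFER_SIZE][HIST_SIZE] = {{ 0 }};

	for (uint64_t txg = 10; txg < 14; txg++) {
		/* pretend this txg's sync passes added one segment to bucket 7 */
		synchist[7] += 1;

		/*
		 * End of the txg: the sync histogram takes over the defer
		 * slot being recycled (txg % TXG_DEFER_SIZE), mirroring
		 * ms_freed replacing ms_defer[] in metaslab_sync_done().
		 */
		uint64_t slot = txg % TXG_DEFER_SIZE;
		memcpy(deferhist[slot], synchist, sizeof (synchist));
		memset(synchist, 0, sizeof (synchist));

		printf("txg %llu: deferhist[%llu][7] = %llu\n",
		    (unsigned long long)txg, (unsigned long long)slot,
		    (unsigned long long)deferhist[slot][7]);
	}
	return (0);
}
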
+
+/*
+ * Ensure that the metaslab's weight and fragmentation are consistent
+ * with the contents of the histogram (either the range tree's histogram
+ * or the space map's, depending on whether the metaslab is loaded).
+ */
+static void
+metaslab_verify_weight_and_frag(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ /* see comment in metaslab_verify_unflushed_changes() */
+ if (msp->ms_group == NULL)
+ return;
+
+ /*
+ * Devices being removed always return a weight of 0 and leave
+ * fragmentation and ms_max_size as is - there is nothing for
+ * us to verify here.
+ */
+ vdev_t *vd = msp->ms_group->mg_vd;
+ if (vd->vdev_removing)
+ return;
+
+ /*
+ * If the metaslab is dirty it probably means that we've done
+ * some allocations or frees that have changed our histograms
+ * and thus the weight.
+ */
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (txg_list_member(&vd->vdev_ms_list, msp, t))
+ return;
+ }
+
+ /*
+ * This verification checks that our in-memory state is consistent
+ * with what's on disk. If the pool is read-only then there aren't
+ * any changes and we just have the initially-loaded state.
+ */
+ if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
+ return;
+
+ /* Some extra verification of the in-core tree, if possible. */
+ if (msp->ms_loaded) {
+ range_tree_stat_verify(msp->ms_allocatable);
+ VERIFY(space_map_histogram_verify(msp->ms_sm,
+ msp->ms_allocatable));
+ }
+
+ uint64_t weight = msp->ms_weight;
+ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+ boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
+ uint64_t frag = msp->ms_fragmentation;
+ uint64_t max_segsize = msp->ms_max_size;
+
+ msp->ms_weight = 0;
+ msp->ms_fragmentation = 0;
+ msp->ms_max_size = 0;
+
+ /*
+ * This function is used for verification purposes. Regardless of
+ * whether metaslab_weight() thinks this metaslab should be active or
+ * not, we want to ensure that the actual weight (and therefore the
+ * value of ms_weight) would be the same if it was to be recalculated
+ * at this point.
+ */
+ msp->ms_weight = metaslab_weight(msp) | was_active;
+
+ VERIFY3U(max_segsize, ==, msp->ms_max_size);
+
+ /*
+ * If the weight type changed then there is no point in doing
+ * verification. Revert fields to their original values.
+ */
+ if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
+ (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
+ msp->ms_fragmentation = frag;
+ msp->ms_weight = weight;
+ return;
+ }
+
+ VERIFY3U(msp->ms_fragmentation, ==, frag);
+ VERIFY3U(msp->ms_weight, ==, weight);
+}
+
/*
* Wait for any in-progress metaslab loads to complete.
*/
@@ -1501,6 +1699,22 @@ metaslab_load_impl(metaslab_t *msp)
range_tree_remove, msp->ms_allocatable);
}
+ /*
+ * Call metaslab_recalculate_weight_and_sort() now that the
+ * metaslab is loaded so we get the metaslab's real weight.
+ *
+ * Unless this metaslab was created with older software and
+ * has not yet been converted to use segment-based weight, we
+ * expect the new weight to be better than or equal to the weight
+ * that the metaslab had while it was not loaded. This is
+ * because the old weight does not take into account the
+ * consolidation of adjacent segments between TXGs. [see
+ * comment for ms_synchist and ms_deferhist[] for more info]
+ */
+ uint64_t weight = msp->ms_weight;
+ metaslab_recalculate_weight_and_sort(msp);
+ if (!WEIGHT_IS_SPACEBASED(weight))
+ ASSERT3U(weight, <=, msp->ms_weight);
msp->ms_max_size = metaslab_block_maxsize(msp);
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
@@ -1537,10 +1751,29 @@ void
metaslab_unload(metaslab_t *msp)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ metaslab_verify_weight_and_frag(msp);
+
range_tree_vacate(msp->ms_allocatable, NULL, NULL);
msp->ms_loaded = B_FALSE;
+
msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
msp->ms_max_size = 0;
+
+ /*
+ * We explicitly recalculate the metaslab's weight based on its space
+ * map (as it is now not loaded). We want unloaded metaslabs to always
+ * have their weights calculated from the space map histograms, while
+ * loaded ones have it calculated from their in-core range tree
+ * [see metaslab_load()]. This way, the weight reflects the information
+ * available in-core, whether the metaslab is loaded or not.
+ *
+ * If ms_group == NULL, it means that we came here from metaslab_fini(),
+ * at which point it doesn't make sense for us to do the recalculation
+ * and the sorting.
+ */
+ if (msp->ms_group != NULL)
+ metaslab_recalculate_weight_and_sort(msp);
}
static void
@@ -1683,6 +1916,9 @@ metaslab_fini(metaslab_t *msp)
range_tree_destroy(msp->ms_checkpointing);
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
+
mutex_exit(&msp->ms_lock);
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
@@ -1698,7 +1934,7 @@ metaslab_fini(metaslab_t *msp)
* This table defines a segment size based fragmentation metric that will
* allow each metaslab to derive its own fragmentation value. This is done
* by calculating the space in each bucket of the spacemap histogram and
- * multiplying that by the fragmetation metric in this table. Doing
+ * multiplying that by the fragmentation metric in this table. Doing
* this for all buckets and dividing it by the total amount of free
* space in this metaslab (i.e. the total free space in all buckets) gives
* us the fragmentation metric. This means that a high fragmentation metric
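
To make the calculation concrete, a rough worked example with made-up metric values (the real per-bucket values live in the table that follows this comment in the source): if a metaslab has 512M of free space in a bucket whose metric is 90 and another 512M in a bucket whose metric is 10, its fragmentation is (512M * 90 + 512M * 10) / 1024M = 50%. If all of its free space sat in large-segment buckets with a metric of 0, it would report 0% fragmentation.
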
@@ -1933,14 +2169,38 @@ metaslab_weight_from_range_tree(metaslab_t *msp)
static uint64_t
metaslab_weight_from_spacemap(metaslab_t *msp)
{
- uint64_t weight = 0;
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(!msp->ms_loaded);
+ ASSERT(sm != NULL);
+ ASSERT3U(space_map_object(sm), !=, 0);
+ ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+ /*
+ * Create a joint histogram from all the segments that have made
+ * it to the metaslab's space map histogram but are not yet
+ * available for allocation because they are still in the freeing
+ * pipeline (e.g. freeing, freed, and defer trees). Then subtract
+ * these segments from the space map's histogram to get a more
+ * accurate weight.
+ */
+ uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+ deferspace_histogram[i] += msp->ms_synchist[i];
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ deferspace_histogram[i] += msp->ms_deferhist[t][i];
+ }
+ }
+
+ uint64_t weight = 0;
for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
- if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
- WEIGHT_SET_COUNT(weight,
- msp->ms_sm->sm_phys->smp_histogram[i]);
- WEIGHT_SET_INDEX(weight, i +
- msp->ms_sm->sm_shift);
+ ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
+ deferspace_histogram[i]);
+ uint64_t count =
+ sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
+ if (count != 0) {
+ WEIGHT_SET_COUNT(weight, count);
+ WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
WEIGHT_SET_ACTIVE(weight, 0);
break;
}
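
As a numeric illustration of the subtraction above (bucket indices and counts are made up): with sm_shift = 9, suppose the highest non-empty space map bucket is i = 7 (segments of at least 2^16 bytes) with smp_histogram[7] = 5, and three of those segments are still represented in ms_synchist/ms_deferhist[] (deferspace_histogram[7] = 3). The weight is then built from count = 2 at index 7 + 9 = 16, instead of count = 5 as the previous code would have used; if all five segments were still in the freeing pipeline, the loop would simply fall through to the next lower non-empty bucket.
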
@@ -2084,6 +2344,15 @@ metaslab_weight(metaslab_t *msp)
return (weight);
}
+void
+metaslab_recalculate_weight_and_sort(metaslab_t *msp)
+{
+ /* note: we preserve the mask (e.g. indication of primary, etc.) */
+ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+ metaslab_group_sort(msp->ms_group, msp,
+ metaslab_weight(msp) | was_active);
+}
+
static int
metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
int allocator, uint64_t activation_weight)
@@ -2613,6 +2882,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
* time we load the space map.
*/
space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
+ metaslab_aux_histograms_update(msp);
metaslab_group_histogram_add(mg, msp);
metaslab_group_histogram_verify(mg);
@@ -2755,6 +3025,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
*/
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
}
+ metaslab_aux_histograms_update_done(msp, defer_allowed);
if (msp->ms_new) {
msp->ms_new = B_FALSE;
@@ -2762,12 +3033,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
mg->mg_ms_ready++;
mutex_exit(&mg->mg_lock);
}
+
/*
- * Calculate the new weights before unloading any metaslabs.
- * This will give us the most accurate weighting.
+ * Re-sort the metaslab within its group now that we've adjusted
+ * its allocatable space.
*/
- metaslab_group_sort(mg, msp, metaslab_weight(msp) |
- (msp->ms_weight & METASLAB_ACTIVE_MASK));
+ metaslab_recalculate_weight_and_sort(msp);
/*
* If the metaslab is loaded and we've not tried to load or allocate
@@ -4112,7 +4383,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
zio_alloc_list_t *zal, zio_t *zio, int allocator)
{
dva_t *dva = bp->blk_dva;
- dva_t *hintdva = hintbp->blk_dva;
+ dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
int error = 0;
ASSERT(bp->blk_birth == 0);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 81c34da074fd..b17682d81c1d 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1346,12 +1346,12 @@ vdev_metaslab_fini(vdev_t *vd)
}
if (vd->vdev_ms != NULL) {
- uint64_t count = vd->vdev_ms_count;
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_group_passivate(mg);
- metaslab_group_passivate(vd->vdev_mg);
+ uint64_t count = vd->vdev_ms_count;
for (uint64_t m = 0; m < count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
-
if (msp != NULL)
metaslab_fini(msp);
}
@@ -1359,6 +1359,9 @@ vdev_metaslab_fini(vdev_t *vd)
vd->vdev_ms = NULL;
vd->vdev_ms_count = 0;
+
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ ASSERT0(mg->mg_histogram[i]);
}
ASSERT0(vd->vdev_ms_count);
ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
@@ -3006,7 +3009,10 @@ vdev_load(vdev_t *vd)
"asize=%llu", (u_longlong_t)vd->vdev_ashift,
(u_longlong_t)vd->vdev_asize);
return (SET_ERROR(ENXIO));
- } else if ((error = vdev_metaslab_init(vd, 0)) != 0) {
+ }
+
+ error = vdev_metaslab_init(vd, 0);
+ if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
"[error=%d]", error);
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
@@ -3021,9 +3027,10 @@ vdev_load(vdev_t *vd)
ASSERT(vd->vdev_asize != 0);
ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
- if ((error = space_map_open(&vd->vdev_checkpoint_sm,
+ error = space_map_open(&vd->vdev_checkpoint_sm,
mos, checkpoint_sm_obj, 0, vd->vdev_asize,
- vd->vdev_ashift))) {
+ vd->vdev_ashift);
+ if (error != 0) {
vdev_dbgmsg(vd, "vdev_load: space_map_open "
"failed for checkpoint spacemap (obj %llu) "
"[error=%d]",