Diffstat (limited to 'sys/contrib/openzfs/module/zfs/metaslab.c')
-rw-r--r--  sys/contrib/openzfs/module/zfs/metaslab.c  216
1 file changed, 121 insertions(+), 95 deletions(-)
diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c
index 43b94eba2d58..9f4399af56bd 100644
--- a/sys/contrib/openzfs/module/zfs/metaslab.c
+++ b/sys/contrib/openzfs/module/zfs/metaslab.c
@@ -375,13 +375,23 @@ static metaslab_stats_t metaslab_stats = {
#define METASLABSTAT_BUMP(stat) \
atomic_inc_64(&metaslab_stats.stat.value.ui64);
+char *
+metaslab_rt_name(metaslab_group_t *mg, metaslab_t *ms, const char *name)
+{
+ return (kmem_asprintf("{spa=%s vdev_guid=%llu ms_id=%llu %s}",
+ spa_name(mg->mg_vd->vdev_spa),
+ (u_longlong_t)mg->mg_vd->vdev_guid,
+ (u_longlong_t)ms->ms_id,
+ name));
+}
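
Note (illustrative, not part of the patch): the new helper builds a
human-readable identifier for a metaslab's range trees. For a
hypothetical pool "tank", vdev guid 4660, metaslab 7, it would return:

	char *name = metaslab_rt_name(mg, ms, "ms_allocatable");
	/* name == "{spa=tank vdev_guid=4660 ms_id=7 ms_allocatable}" */

The string comes from kmem_asprintf(); when passed to
zfs_range_tree_create_flags() with ZFS_RT_F_DYN_NAME below, the tree
presumably takes ownership, otherwise the caller would release it with
kmem_strfree().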
+
static kstat_t *metaslab_ksp;
void
metaslab_stat_init(void)
{
- ASSERT(metaslab_alloc_trace_cache == NULL);
+ ASSERT0P(metaslab_alloc_trace_cache);
metaslab_alloc_trace_cache = kmem_cache_create(
"metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
0, NULL, NULL, NULL, NULL, NULL, 0);
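
Note (sketch; macro details assumed rather than shown in this patch):
the assertion conversions throughout this diff are mechanical. ASSERT0()
checks that an integer expression is zero, and ASSERT0P() is its pointer
counterpart, checking for NULL; both report the offending value on
failure, which a bare ASSERT(x == NULL) does not:

	ASSERT(metaslab_alloc_trace_cache == NULL);	/* old: pass/fail only */
	ASSERT0P(metaslab_alloc_trace_cache);		/* new: same check, richer failure output */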
@@ -446,16 +456,16 @@ metaslab_class_destroy(metaslab_class_t *mc)
{
spa_t *spa = mc->mc_spa;
- ASSERT(mc->mc_alloc == 0);
- ASSERT(mc->mc_deferred == 0);
- ASSERT(mc->mc_space == 0);
- ASSERT(mc->mc_dspace == 0);
+ ASSERT0(mc->mc_alloc);
+ ASSERT0(mc->mc_deferred);
+ ASSERT0(mc->mc_space);
+ ASSERT0(mc->mc_dspace);
for (int i = 0; i < spa->spa_alloc_count; i++) {
metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
avl_destroy(&mca->mca_tree);
mutex_destroy(&mca->mca_lock);
- ASSERT(mca->mca_rotor == NULL);
+ ASSERT0P(mca->mca_rotor);
ASSERT0(mca->mca_reserved);
}
mutex_destroy(&mc->mc_lock);
@@ -750,7 +760,8 @@ metaslab_class_histogram_verify(metaslab_class_t *mc)
}
IMPLY(mg == mg->mg_vd->vdev_log_mg,
- mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
+ mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));
for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++)
mc_hist[i] += mg->mg_histogram[i];
@@ -1076,8 +1087,8 @@ metaslab_group_destroy(metaslab_group_t *mg)
{
spa_t *spa = mg->mg_class->mc_spa;
- ASSERT(mg->mg_prev == NULL);
- ASSERT(mg->mg_next == NULL);
+ ASSERT0P(mg->mg_prev);
+ ASSERT0P(mg->mg_next);
/*
* We may have gone below zero with the activation count
* either because we never activated in the first place or
@@ -1107,8 +1118,8 @@ metaslab_group_activate(metaslab_group_t *mg)
ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
- ASSERT(mg->mg_prev == NULL);
- ASSERT(mg->mg_next == NULL);
+ ASSERT0P(mg->mg_prev);
+ ASSERT0P(mg->mg_next);
ASSERT(mg->mg_activation_count <= 0);
if (++mg->mg_activation_count <= 0)
@@ -1153,8 +1164,8 @@ metaslab_group_passivate(metaslab_group_t *mg)
if (--mg->mg_activation_count != 0) {
for (int i = 0; i < spa->spa_alloc_count; i++)
ASSERT(mc->mc_allocator[i].mca_rotor != mg);
- ASSERT(mg->mg_prev == NULL);
- ASSERT(mg->mg_next == NULL);
+ ASSERT0P(mg->mg_prev);
+ ASSERT0P(mg->mg_next);
ASSERT(mg->mg_activation_count < 0);
return;
}
@@ -1183,14 +1194,16 @@ metaslab_group_passivate(metaslab_group_t *mg)
if (msp != NULL) {
mutex_enter(&msp->ms_lock);
metaslab_passivate(msp,
- metaslab_weight_from_range_tree(msp));
+ metaslab_weight(msp, B_TRUE) &
+ ~METASLAB_ACTIVE_MASK);
mutex_exit(&msp->ms_lock);
}
msp = mga->mga_secondary;
if (msp != NULL) {
mutex_enter(&msp->ms_lock);
metaslab_passivate(msp,
- metaslab_weight_from_range_tree(msp));
+ metaslab_weight(msp, B_TRUE) &
+ ~METASLAB_ACTIVE_MASK);
mutex_exit(&msp->ms_lock);
}
}
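
Note (sketch of the pattern above, not part of the patch): both
passivation sites now recompute the full weight via metaslab_weight()
(its boolean parameter is defined elsewhere in this file) and explicitly
strip the activation bits before handing the result to
metaslab_passivate():

	uint64_t w = metaslab_weight(msp, B_TRUE);		/* full recomputation */
	metaslab_passivate(msp, w & ~METASLAB_ACTIVE_MASK);	/* activation bits cleared */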
@@ -1288,7 +1301,8 @@ metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
mutex_enter(&mc->mc_lock);
for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
IMPLY(mg == mg->mg_vd->vdev_log_mg,
- mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
+ mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));
mg->mg_histogram[i + ashift] +=
msp->ms_sm->sm_phys->smp_histogram[i];
mc->mc_histogram[i + ashift] +=
@@ -1316,7 +1330,8 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
ASSERT3U(mc->mc_histogram[i + ashift], >=,
msp->ms_sm->sm_phys->smp_histogram[i]);
IMPLY(mg == mg->mg_vd->vdev_log_mg,
- mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) ||
+ mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa));
mg->mg_histogram[i + ashift] -=
msp->ms_sm->sm_phys->smp_histogram[i];
@@ -1330,7 +1345,7 @@ metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
- ASSERT(msp->ms_group == NULL);
+ ASSERT0P(msp->ms_group);
mutex_enter(&mg->mg_lock);
msp->ms_group = mg;
msp->ms_weight = 0;
@@ -2895,30 +2910,43 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
zfs_range_seg_type_t type =
metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
- ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start,
- shift);
+ ms->ms_allocatable = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_allocatable"));
for (int t = 0; t < TXG_SIZE; t++) {
- ms->ms_allocating[t] = zfs_range_tree_create(NULL, type,
- NULL, start, shift);
- }
- ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift);
- ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift);
+ ms->ms_allocating[t] = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME,
+ metaslab_rt_name(mg, ms, "ms_allocating"));
+ }
+ ms->ms_freeing = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freeing"));
+ ms->ms_freed = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freed"));
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL,
- start, shift);
+ ms->ms_defer[t] = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_defer"));
}
- ms->ms_checkpointing =
- zfs_range_tree_create(NULL, type, NULL, start, shift);
- ms->ms_unflushed_allocs =
- zfs_range_tree_create(NULL, type, NULL, start, shift);
+ ms->ms_checkpointing = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_checkpointing"));
+ ms->ms_unflushed_allocs = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_allocs"));
metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
mrap->mra_floor_shift = metaslab_by_size_min_shift;
- ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops,
- type, mrap, start, shift);
+ ms->ms_unflushed_frees = zfs_range_tree_create_flags(
+ &metaslab_rt_ops, type, mrap, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_frees"));
- ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift);
+ ms->ms_trim = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_trim"));
metaslab_group_add(mg, ms);
metaslab_set_fragmentation(ms, B_FALSE);
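
Note (hypothetical wrapper showing the shape of the repeated call; the
signature is inferred from this diff): every zfs_range_tree_create()
call in metaslab_init() becomes zfs_range_tree_create_flags() with
ZFS_RT_F_DYN_NAME and a per-tree name, so each tree is identifiable in
diagnostics:

	static zfs_range_tree_t *
	ms_rt_create(metaslab_group_t *mg, metaslab_t *ms,
	    zfs_range_seg_type_t type, uint64_t start, uint64_t shift,
	    const char *name)
	{
		/* NULL ops/arg, as for most of the trees above */
		return (zfs_range_tree_create_flags(NULL, type, NULL,
		    start, shift, ZFS_RT_F_DYN_NAME,
		    metaslab_rt_name(mg, ms, name)));
	}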
@@ -2989,7 +3017,7 @@ metaslab_fini(metaslab_t *msp)
metaslab_group_remove(mg, msp);
mutex_enter(&msp->ms_lock);
- VERIFY(msp->ms_group == NULL);
+ VERIFY0P(msp->ms_group);
/*
* If this metaslab hasn't been through metaslab_sync_done() yet its
@@ -3892,7 +3920,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
&start, &shift);
- condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift);
+ condense_tree = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME,
+ metaslab_rt_name(msp->ms_group, msp, "condense_tree"));
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
zfs_range_tree_walk(msp->ms_defer[t],
@@ -3949,8 +3980,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
* followed by FREES (due to space_map_write() in metaslab_sync()) for
* sync pass 1.
*/
- zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL,
- start, shift);
+ zfs_range_tree_t *tmp_tree = zfs_range_tree_create_flags(
+ NULL, type, NULL, start, shift,
+ ZFS_RT_F_DYN_NAME,
+ metaslab_rt_name(msp->ms_group, msp, "tmp_tree"));
zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
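
Note (worked illustration with hypothetical numbers, not part of the
patch): the two space_map_write() calls above encode the condensed map
as a single ALLOC spanning the whole metaslab followed by FREEs for the
currently allocatable space. For a 1 GB metaslab at offset 0 with one
free segment at [256M, 512M), the condensed map would read:

	ALLOC [0, 1G)		/* from tmp_tree */
	FREE  [256M, 512M)	/* from ms_allocatable */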
@@ -5199,29 +5232,16 @@ next:
/*
* We were unable to allocate from this metaslab so determine
- * a new weight for this metaslab. Now that we have loaded
- * the metaslab we can provide a better hint to the metaslab
- * selector.
- *
- * For space-based metaslabs, we use the maximum block size.
- * This information is only available when the metaslab
- * is loaded and is more accurate than the generic free
- * space weight that was calculated by metaslab_weight().
- * This information allows us to quickly compare the maximum
- * available allocation in the metaslab to the allocation
- * size being requested.
- *
- * For segment-based metaslabs, determine the new weight
- * based on the highest bucket in the range tree. We
- * explicitly use the loaded segment weight (i.e. the range
- * tree histogram) since it contains the space that is
- * currently available for allocation and is accurate
- * even within a sync pass.
+ * a new weight for this metaslab. The weight was last
+ * recalculated either when we loaded it (if this is the first
+ * TXG it's been loaded in), or the last time a txg was synced
+ * out.
*/
uint64_t weight;
if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
- weight = metaslab_largest_allocatable(msp);
- WEIGHT_SET_SPACEBASED(weight);
+ metaslab_set_fragmentation(msp, B_TRUE);
+ weight = metaslab_space_weight(msp) &
+ ~METASLAB_ACTIVE_MASK;
} else {
weight = metaslab_weight_from_range_tree(msp);
}
@@ -5233,13 +5253,6 @@ next:
* For the case where we use the metaslab that is
* active for another allocator we want to make
* sure that we retain the activation mask.
- *
- * Note that we could attempt to use something like
- * metaslab_recalculate_weight_and_sort() that
- * retains the activation mask here. That function
- * uses metaslab_weight() to set the weight though
- * which is not as accurate as the calculations
- * above.
*/
weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
metaslab_group_sort(mg, msp, weight);
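
Note (compact restatement of the control flow split across the two
hunks above, not new code): space-based weights are now refreshed via
metaslab_space_weight() after updating fragmentation, while
segment-based weights still come from the range tree; in both cases the
previous activation bits are preserved:

	if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
		metaslab_set_fragmentation(msp, B_TRUE);
		weight = metaslab_space_weight(msp) & ~METASLAB_ACTIVE_MASK;
	} else {
		weight = metaslab_weight_from_range_tree(msp);
	}
	weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
	metaslab_group_sort(mg, msp, weight);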
@@ -5590,7 +5603,21 @@ remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
uint64_t physical_birth = vdev_indirect_births_physbirth(vib,
DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
- BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
+
+ /*
+ * For rewritten blocks, use the old physical birth as the new logical
+ * birth (representing when the space was allocated) and the removal
+ * time as the new physical birth (representing when it was actually
+ * written).
+ */
+ if (BP_GET_REWRITE(bp)) {
+ uint64_t old_physical_birth = BP_GET_PHYSICAL_BIRTH(bp);
+ ASSERT3U(old_physical_birth, <, physical_birth);
+ BP_SET_BIRTH(bp, old_physical_birth, physical_birth);
+ BP_SET_REWRITE(bp, 0);
+ } else {
+ BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
+ }
DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
DVA_SET_OFFSET(&bp->blk_dva[0], offset);
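
Note (worked example with hypothetical TXGs, not part of the patch):
for a rewritten block whose space was allocated at txg 100 and whose
extent is copied by device removal at txg 250, the new branch produces:

	/* on entry: BP_GET_PHYSICAL_BIRTH(bp) == 100, BP_GET_REWRITE(bp) != 0 */
	BP_SET_BIRTH(bp, 100, 250);	/* logical birth 100, physical birth 250 */
	BP_SET_REWRITE(bp, 0);		/* the rewrite marker is consumed */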
@@ -5712,7 +5739,7 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
ASSERT(!vd->vdev_removing);
ASSERT(vdev_is_concrete(vd));
ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
- ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
+ ASSERT0P(vd->vdev_indirect_mapping);
if (DVA_GET_GANG(dva))
size = vdev_gang_header_asize(vd);
@@ -5757,21 +5784,21 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
}
/*
- * Reserve some allocation slots. The reservation system must be called
- * before we call into the allocator. If there aren't any available slots
- * then the I/O will be throttled until an I/O completes and its slots are
- * freed up. The function returns true if it was successful in placing
- * the reservation.
+ * Reserve some space for a future allocation. The reservation system must be
+ * called before we call into the allocator. If there isn't enough space
+ * available, the calling I/O will be throttled until another I/O completes and
+ * its reservation is released. The function returns true if it was successful
+ * in placing the reservation.
*/
boolean_t
-metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
- boolean_t must, boolean_t *more)
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int allocator,
+ int copies, uint64_t io_size, boolean_t must, boolean_t *more)
{
- metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator];
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
ASSERT(mc->mc_alloc_throttle_enabled);
- if (mc->mc_alloc_io_size < zio->io_size) {
- mc->mc_alloc_io_size = zio->io_size;
+ if (mc->mc_alloc_io_size < io_size) {
+ mc->mc_alloc_io_size = io_size;
metaslab_class_balance(mc, B_FALSE);
}
if (must || mca->mca_reserved <= mc->mc_alloc_max) {
@@ -5782,10 +5809,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
* worst that can happen is that a few more I/Os get to allocation
* earlier, which is not a problem.
*/
- int64_t delta = slots * zio->io_size;
+ int64_t delta = copies * io_size;
*more = (atomic_add_64_nv(&mca->mca_reserved, delta) <=
mc->mc_alloc_max);
- zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
return (B_TRUE);
}
*more = B_FALSE;
@@ -5793,13 +5819,13 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
}
boolean_t
-metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
- zio_t *zio)
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int allocator,
+ int copies, uint64_t io_size)
{
- metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator];
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
ASSERT(mc->mc_alloc_throttle_enabled);
- int64_t delta = slots * zio->io_size;
+ int64_t delta = copies * io_size;
return (atomic_add_64_nv(&mca->mca_reserved, -delta) <=
mc->mc_alloc_max);
}
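
Note (hedged usage sketch based on the new signatures, not taken from a
caller in this patch): with the zio_t parameter gone, callers pass the
allocator index, copy count, and I/O size explicitly, and pair each
successful reservation with an unreserve of the same values
(ZIO_FLAG_IO_ALLOCATING is presumably now set by the caller):

	boolean_t more;
	if (metaslab_class_throttle_reserve(mc, allocator, copies,
	    io_size, B_FALSE, &more)) {
		/* ... perform the allocation ... */
		(void) metaslab_class_throttle_unreserve(mc, allocator,
		    copies, io_size);
	} else {
		/* throttled: retry after another I/O releases its reservation */
	}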
@@ -5960,7 +5986,7 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
int error = 0;
ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
- ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
+ ASSERT0(BP_GET_RAW_PHYSICAL_BIRTH(bp));
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
@@ -5971,16 +5997,16 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
}
ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
- ASSERT(BP_GET_NDVAS(bp) == 0);
+ ASSERT0(BP_GET_NDVAS(bp));
ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
ASSERT3P(zal, !=, NULL);
- uint64_t cur_psize = 0;
-
+ uint64_t smallest_psize = UINT64_MAX;
for (int d = 0; d < ndvas; d++) {
- error = metaslab_alloc_dva_range(spa, mc, psize, max_psize,
- dva, d, hintdva, txg, flags, zal, allocator,
- actual_psize ? &cur_psize : NULL);
+ uint64_t cur_psize = 0;
+ error = metaslab_alloc_dva_range(spa, mc, psize,
+ MIN(smallest_psize, max_psize), dva, d, hintdva, txg,
+ flags, zal, allocator, actual_psize ? &cur_psize : NULL);
if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_unalloc_dva(spa, &dva[d], txg);
@@ -6000,13 +6026,13 @@ metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
DVA_GET_VDEV(&dva[d]), allocator, flags, psize,
tag);
if (actual_psize)
- max_psize = MIN(cur_psize, max_psize);
+ smallest_psize = MIN(cur_psize, smallest_psize);
}
}
- ASSERT(error == 0);
+ ASSERT0(error);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
if (actual_psize)
- *actual_psize = max_psize;
+ *actual_psize = smallest_psize;
spa_config_exit(spa, SCL_ALLOC, FTAG);
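
Note (worked trace with hypothetical sizes, not part of the patch): the
rename from max_psize to smallest_psize makes the invariant explicit;
each DVA allocation is capped at the smallest size achieved so far, so
all copies of the block end up the same size:

	d=0: cur_psize = 1M                  -> smallest_psize = 1M
	d=1: capped at 1M, cur_psize = 768K  -> smallest_psize = 768K
	d=2: capped at 768K, cur_psize = 768K
	*actual_psize = 768K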
@@ -6022,7 +6048,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
int ndvas = BP_GET_NDVAS(bp);
ASSERT(!BP_IS_HOLE(bp));
- ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa));
+ ASSERT(!now || BP_GET_BIRTH(bp) >= spa_syncing_txg(spa));
/*
* If we have a checkpoint for the pool we need to make sure that
@@ -6040,7 +6066,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
* normally as they will be referenced by the checkpointed uberblock.
*/
boolean_t checkpoint = B_FALSE;
- if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
+ if (BP_GET_BIRTH(bp) <= spa->spa_checkpoint_txg &&
spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
/*
* At this point, if the block is part of the checkpoint