Diffstat (limited to 'uts/common/fs/zfs/sys/metaslab_impl.h')
-rw-r--r--	uts/common/fs/zfs/sys/metaslab_impl.h	79
1 file changed, 79 insertions, 0 deletions
diff --git a/uts/common/fs/zfs/sys/metaslab_impl.h b/uts/common/fs/zfs/sys/metaslab_impl.h
index a2c8e6051772..f8d36f38f7b7 100644
--- a/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -340,8 +340,34 @@ struct metaslab_group {
* being written.
*/
struct metaslab {
+	/*
+	 * This is the main lock of the metaslab; its purpose is to
+	 * coordinate our allocations and frees [e.g. metaslab_block_alloc(),
+	 * metaslab_free_concrete(), etc.] with our various syncing
+	 * procedures [e.g. metaslab_sync(), metaslab_sync_done(), etc.].
+	 *
+	 * The lock is also used during some miscellaneous operations, like
+	 * using the metaslab's histogram for the metaslab group's histogram
+	 * aggregation, or marking the metaslab for initialization.
+	 */
kmutex_t ms_lock;
+
+	/*
+	 * Acquired together with the ms_lock whenever we expect to
+	 * write the metaslab's data on disk (i.e. flushing entries to
+	 * the metaslab's space map). It helps coordinate readers of
+	 * the metaslab's space map [see spa_vdev_remove_thread()]
+	 * with writers [see metaslab_sync()].
+	 *
+	 * Note that metaslab_load(), even though a reader, uses
+	 * a completely different mechanism, based on ms_synced_length,
+	 * for reading the metaslab's space map. That said, the
+	 * function still takes the ms_sync_lock after it has read
+	 * the ms_sm [see the relevant comment in metaslab_load()
+	 * as to why].
+	 */
kmutex_t ms_sync_lock;
+
kcondvar_t ms_load_cv;
space_map_t *ms_sm;
uint64_t ms_id;
@@ -351,6 +377,7 @@ struct metaslab {
range_tree_t *ms_allocating[TXG_SIZE];
range_tree_t *ms_allocatable;
+ uint64_t ms_allocated_this_txg;
/*
* The following range trees are accessed only from syncing context.
@@ -375,6 +402,55 @@ struct metaslab {
boolean_t ms_loaded;
boolean_t ms_loading;
+ /*
+ * The following histograms count entries that are in the
+ * metaslab's space map (and its histogram) but are not in
+ * ms_allocatable yet, because they are in ms_freed, ms_freeing,
+ * or ms_defer[].
+ *
+ * When the metaslab is not loaded, its ms_weight needs to
+ * reflect what is allocatable (i.e. what will be part of
+ * ms_allocatable if it is loaded). The weight is computed from
+ * the spacemap histogram, but that includes ranges that are
+ * not yet allocatable (because they are in ms_freed,
+ * ms_freeing, or ms_defer[]). Therefore, when calculating the
+ * weight, we need to remove those ranges.
+ *
+ * The ranges in the ms_freed and ms_defer[] range trees are all
+ * present in the spacemap. However, the spacemap may have
+ * multiple entries to represent a contiguous range, because it
+ * is written across multiple sync passes, but the changes of
+ * all sync passes are consolidated into the range trees.
+ * Adjacent ranges that are freed in different sync passes of
+ * one txg will be represented separately (as 2 or more entries)
+ * in the space map (and its histogram), but these adjacent
+ * ranges will be consolidated (represented as one entry) in the
+ * ms_freed/ms_defer[] range trees (and their histograms).
+ *
+	 * When calculating the weight, we cannot simply subtract the
+ * range trees' histograms from the spacemap's histogram,
+ * because the range trees' histograms may have entries in
+ * higher buckets than the spacemap, due to consolidation.
+ * Instead we must subtract the exact entries that were added to
+ * the spacemap's histogram. ms_synchist and ms_deferhist[]
+ * represent these exact entries, so we can subtract them from
+ * the spacemap's histogram when calculating ms_weight.
+ *
+ * ms_synchist represents the same ranges as ms_freeing +
+ * ms_freed, but without consolidation across sync passes.
+ *
+ * ms_deferhist[i] represents the same ranges as ms_defer[i],
+ * but without consolidation across sync passes.
+ */
+ uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
+ uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
+
+	/*
+	 * Tracks the exact amount of allocated space of this metaslab
+	 * (and, more precisely, of its space map) as of the most recently
+	 * completed sync pass [see usage in metaslab_sync()].
+	 */
+ uint64_t ms_allocated_space;
int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */
uint64_t ms_activation_weight; /* activation weight */
@@ -411,6 +487,9 @@ struct metaslab {
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
+ /* updated every time we are done syncing the metaslab's space map */
+ uint64_t ms_synced_length;
+
boolean_t ms_new;
};
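
The ms_lock/ms_sync_lock comments above describe a writer/reader protocol
around the metaslab's on-disk space map. The fragments below are a rough,
hypothetical sketch of that protocol; they are not taken from metaslab.c or
vdev_removal.c, the *_sketch function names are made up, and the actual
space map read/write calls are elided.

	#include <sys/metaslab_impl.h>

	/*
	 * Hypothetical writer-side fragment (cf. metaslab_sync()): take
	 * the ms_sync_lock first, then the ms_lock, before appending
	 * this txg's entries to the metaslab's space map.
	 */
	static void
	metaslab_flush_sketch(metaslab_t *msp)
	{
		mutex_enter(&msp->ms_sync_lock);  /* excludes space map readers */
		mutex_enter(&msp->ms_lock);       /* excludes allocations/frees */
		/* ... space_map_write() of this txg's allocs and frees ... */
		mutex_exit(&msp->ms_lock);
		mutex_exit(&msp->ms_sync_lock);
	}

	/*
	 * Hypothetical reader-side fragment (cf. spa_vdev_remove_thread()):
	 * hold the ms_sync_lock so metaslab_sync() cannot append entries
	 * while the space map is being read (the real caller may take
	 * ms_lock as well).
	 */
	static void
	metaslab_read_sm_sketch(metaslab_t *msp)
	{
		mutex_enter(&msp->ms_sync_lock);
		/* ... read/iterate the entries of msp->ms_sm ... */
		mutex_exit(&msp->ms_sync_lock);
	}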
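
The ms_synchist/ms_deferhist[] comment above boils down to a per-bucket
subtraction when weighting an unloaded metaslab from its space map
histogram. The helper below is a hypothetical sketch of that subtraction
(the name is made up; the actual weight code lives in metaslab.c and
additionally encodes the result into ms_weight via the WEIGHT_SET_*
macros).

	#include <sys/metaslab_impl.h>
	#include <sys/space_map.h>

	/*
	 * Hypothetical helper: find the highest space map histogram bucket
	 * that still has allocatable entries for an unloaded metaslab, by
	 * removing the entries tracked in ms_synchist and ms_deferhist[]
	 * from the space map's own histogram.  Returns the bucket as a
	 * power-of-two shift, or -1 if nothing allocatable is recorded.
	 */
	static int
	metaslab_largest_allocatable_bucket(metaslab_t *msp)
	{
		space_map_t *sm = msp->ms_sm;

		for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
			/* Space map entries that are not yet allocatable. */
			uint64_t unusable = msp->ms_synchist[i];
			for (int t = 0; t < TXG_DEFER_SIZE; t++)
				unusable += msp->ms_deferhist[t][i];

			if (sm->sm_phys->smp_histogram[i] > unusable)
				return (i + sm->sm_shift);
		}
		return (-1);
	}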