aboutsummaryrefslogtreecommitdiff
path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
diff options
context:
space:
mode:
author Martin Matuska <mm@FreeBSD.org> 2010-08-28 08:59:55 +0000
committer Martin Matuska <mm@FreeBSD.org> 2010-08-28 08:59:55 +0000
commit abe5837f7c1e08e17c954dba7dd4d52b137a0083 (patch)
tree 34eb07d3bbb54f0e69851f11ce45a99b9bf3749a /sys/cddl/contrib/opensolaris/uts/common/fs/zfs
parent c87f1ad43ce664de499084f7662dd59b1c180eff (diff)
download src-abe5837f7c1e08e17c954dba7dd4d52b137a0083.tar.gz
src-abe5837f7c1e08e17c954dba7dd4d52b137a0083.zip
Update ZFS metaslab code from OpenSolaris.
This provides a noticeable write speedup, especially on pools with less
than 30% of free space.

Detailed information (OpenSolaris onnv changesets and Bug IDs):

11146:7e58f40bcb1c
6826241 Sync write IOPS drops dramatically during TXG sync
6869229 zfs should switch to shiny new metaslabs more frequently

11728:59fdb3b856f6
6918420 zdb -m has issues printing metaslab statistics

12047:7c1fcc8419ca
6917066 zfs block picking can be improved

Approved by: delphij (mentor)
Obtained from: OpenSolaris (Bug ID 6826241, 6869229, 6918420, 6917066)
MFC after: 2 weeks
Notes
Notes: svn path=/head/; revision=211931
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs')
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c 434
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c 54
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c 6
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h 1
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h 1
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h 2
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h 1
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h 17
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c 4
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c 2
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c 24
11 files changed, 394 insertions, 152 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index d216154db04d..c5ce27cb677c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -37,7 +36,7 @@ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
* Minimum size which forces the dynamic allocator to change
- * it's allocation strategy. Once the space map cannot satisfy
+ * its allocation strategy. Once the space map cannot satisfy
* an allocation of this size then it switches to using more
* aggressive strategy (i.e search by size rather than offset).
*/
@@ -49,7 +48,23 @@ uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
* Once the space_map's free space drops below this level we dynamically
* switch to using best-fit allocations.
*/
-int metaslab_df_free_pct = 30;
+int metaslab_df_free_pct = 4;
+
+/*
+ * A metaslab is considered "free" if it contains a contiguous
+ * segment which is greater than metaslab_min_alloc_size.
+ */
+uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
+
+/*
+ * Max number of space_maps to prefetch.
+ */
+int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
+
+/*
+ * Percentage bonus multiplier for metaslabs that are in the bonus area.
+ */
+int metaslab_smo_bonus_pct = 150;
/*
* ==========================================================================
@@ -219,6 +234,32 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
}
/*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+static int
+metaslab_segsize_compare(const void *x1, const void *x2)
+{
+ const space_seg_t *s1 = x1;
+ const space_seg_t *s2 = x2;
+ uint64_t ss_size1 = s1->ss_end - s1->ss_start;
+ uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+
+ if (ss_size1 < ss_size2)
+ return (-1);
+ if (ss_size1 > ss_size2)
+ return (1);
+
+ if (s1->ss_start < s2->ss_start)
+ return (-1);
+ if (s1->ss_start > s2->ss_start)
+ return (1);
+
+ return (0);
+}
+
+/*
* This is a helper function that can be used by the allocator to find
* a suitable block to allocate. This will search the specified AVL
* tree looking for a block that matches the specified criteria.
@@ -258,68 +299,58 @@ metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
return (metaslab_block_picker(t, cursor, size, align));
}
-/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
- */
static void
-metaslab_ff_load(space_map_t *sm)
+metaslab_pp_load(space_map_t *sm)
{
+ space_seg_t *ss;
+
ASSERT(sm->sm_ppd == NULL);
sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
- sm->sm_pp_root = NULL;
+
+ sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+ avl_create(sm->sm_pp_root, metaslab_segsize_compare,
+ sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
+
+ for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+ avl_add(sm->sm_pp_root, ss);
}
static void
-metaslab_ff_unload(space_map_t *sm)
+metaslab_pp_unload(space_map_t *sm)
{
+ void *cookie = NULL;
+
kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
sm->sm_ppd = NULL;
-}
-static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
-{
- avl_tree_t *t = &sm->sm_root;
- uint64_t align = size & -size;
- uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+ while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
+ /* tear down the tree */
+ }
- return (metaslab_block_picker(t, cursor, size, align));
+ avl_destroy(sm->sm_pp_root);
+ kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
+ sm->sm_pp_root = NULL;
}
/* ARGSUSED */
static void
-metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
/* No need to update cursor */
}
/* ARGSUSED */
static void
-metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
{
/* No need to update cursor */
}
-static space_map_ops_t metaslab_ff_ops = {
- metaslab_ff_load,
- metaslab_ff_unload,
- metaslab_ff_alloc,
- metaslab_ff_claim,
- metaslab_ff_free,
- NULL /* maxsize */
-};
-
/*
- * Dynamic block allocator -
- * Uses the first fit allocation scheme until space get low and then
- * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
- * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * Return the maximum contiguous segment within the metaslab.
*/
-
uint64_t
-metaslab_df_maxsize(space_map_t *sm)
+metaslab_pp_maxsize(space_map_t *sm)
{
avl_tree_t *t = sm->sm_pp_root;
space_seg_t *ss;
@@ -330,67 +361,53 @@ metaslab_df_maxsize(space_map_t *sm)
return (ss->ss_end - ss->ss_start);
}
-static int
-metaslab_df_seg_compare(const void *x1, const void *x2)
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
{
- const space_seg_t *s1 = x1;
- const space_seg_t *s2 = x2;
- uint64_t ss_size1 = s1->ss_end - s1->ss_start;
- uint64_t ss_size2 = s2->ss_end - s2->ss_start;
-
- if (ss_size1 < ss_size2)
- return (-1);
- if (ss_size1 > ss_size2)
- return (1);
-
- if (s1->ss_start < s2->ss_start)
- return (-1);
- if (s1->ss_start > s2->ss_start)
- return (1);
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t align = size & -size;
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
- return (0);
+ return (metaslab_block_picker(t, cursor, size, align));
}
-static void
-metaslab_df_load(space_map_t *sm)
+/* ARGSUSED */
+boolean_t
+metaslab_ff_fragmented(space_map_t *sm)
{
- space_seg_t *ss;
-
- ASSERT(sm->sm_ppd == NULL);
- sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-
- sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
- avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
- sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
-
- for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
- avl_add(sm->sm_pp_root, ss);
+ return (B_TRUE);
}
-static void
-metaslab_df_unload(space_map_t *sm)
-{
- void *cookie = NULL;
-
- kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
- sm->sm_ppd = NULL;
-
- while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
- /* tear down the tree */
- }
-
- avl_destroy(sm->sm_pp_root);
- kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
- sm->sm_pp_root = NULL;
-}
+static space_map_ops_t metaslab_ff_ops = {
+ metaslab_pp_load,
+ metaslab_pp_unload,
+ metaslab_ff_alloc,
+ metaslab_pp_claim,
+ metaslab_pp_free,
+ metaslab_pp_maxsize,
+ metaslab_ff_fragmented
+};
+/*
+ * ==========================================================================
+ * Dynamic block allocator -
+ * Uses the first fit allocation scheme until space gets low and then
+ * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
+ * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * ==========================================================================
+ */
static uint64_t
metaslab_df_alloc(space_map_t *sm, uint64_t size)
{
avl_tree_t *t = &sm->sm_root;
uint64_t align = size & -size;
uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
- uint64_t max_size = metaslab_df_maxsize(sm);
+ uint64_t max_size = metaslab_pp_maxsize(sm);
int free_pct = sm->sm_space * 100 / sm->sm_size;
ASSERT(MUTEX_HELD(sm->sm_lock));
@@ -412,30 +429,158 @@ metaslab_df_alloc(space_map_t *sm, uint64_t size)
return (metaslab_block_picker(t, cursor, size, 1ULL));
}
-/* ARGSUSED */
-static void
-metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
+static boolean_t
+metaslab_df_fragmented(space_map_t *sm)
{
- /* No need to update cursor */
-}
+ uint64_t max_size = metaslab_pp_maxsize(sm);
+ int free_pct = sm->sm_space * 100 / sm->sm_size;
-/* ARGSUSED */
-static void
-metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
- /* No need to update cursor */
+ if (max_size >= metaslab_df_alloc_threshold &&
+ free_pct >= metaslab_df_free_pct)
+ return (B_FALSE);
+
+ return (B_TRUE);
}
static space_map_ops_t metaslab_df_ops = {
- metaslab_df_load,
- metaslab_df_unload,
+ metaslab_pp_load,
+ metaslab_pp_unload,
metaslab_df_alloc,
- metaslab_df_claim,
- metaslab_df_free,
- metaslab_df_maxsize
+ metaslab_pp_claim,
+ metaslab_pp_free,
+ metaslab_pp_maxsize,
+ metaslab_df_fragmented
+};
+
+/*
+ * ==========================================================================
+ * Other experimental allocators
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd;
+ uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
+ uint64_t max_size = metaslab_pp_maxsize(sm);
+ uint64_t rsize = size;
+ uint64_t offset = 0;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+ if (max_size < size)
+ return (-1ULL);
+
+ ASSERT3U(*extent_end, >=, *cursor);
+
+ /*
+ * If we're running low on space switch to using the size
+ * sorted AVL tree (best-fit).
+ */
+ if ((*cursor + size) > *extent_end) {
+
+ t = sm->sm_pp_root;
+ *cursor = *extent_end = 0;
+
+ if (max_size > 2 * SPA_MAXBLOCKSIZE)
+ rsize = MIN(metaslab_min_alloc_size, max_size);
+ offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
+ if (offset != -1)
+ *cursor = offset + size;
+ } else {
+ offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
+ }
+ ASSERT3U(*cursor, <=, *extent_end);
+ return (offset);
+}
+
+static boolean_t
+metaslab_cdf_fragmented(space_map_t *sm)
+{
+ uint64_t max_size = metaslab_pp_maxsize(sm);
+
+ if (max_size > (metaslab_min_alloc_size * 10))
+ return (B_FALSE);
+ return (B_TRUE);
+}
+
+static space_map_ops_t metaslab_cdf_ops = {
+ metaslab_pp_load,
+ metaslab_pp_unload,
+ metaslab_cdf_alloc,
+ metaslab_pp_claim,
+ metaslab_pp_free,
+ metaslab_pp_maxsize,
+ metaslab_cdf_fragmented
};
-space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+uint64_t metaslab_ndf_clump_shift = 4;
+
+static uint64_t
+metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ avl_index_t where;
+ space_seg_t *ss, ssearch;
+ uint64_t hbit = highbit(size);
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
+ uint64_t max_size = metaslab_pp_maxsize(sm);
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+ if (max_size < size)
+ return (-1ULL);
+
+ ssearch.ss_start = *cursor;
+ ssearch.ss_end = *cursor + size;
+
+ ss = avl_find(t, &ssearch, &where);
+ if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
+ t = sm->sm_pp_root;
+
+ ssearch.ss_start = 0;
+ ssearch.ss_end = MIN(max_size,
+ 1ULL << (hbit + metaslab_ndf_clump_shift));
+ ss = avl_find(t, &ssearch, &where);
+ if (ss == NULL)
+ ss = avl_nearest(t, where, AVL_AFTER);
+ ASSERT(ss != NULL);
+ }
+
+ if (ss != NULL) {
+ if (ss->ss_start + size <= ss->ss_end) {
+ *cursor = ss->ss_start + size;
+ return (ss->ss_start);
+ }
+ }
+ return (-1ULL);
+}
+
+static boolean_t
+metaslab_ndf_fragmented(space_map_t *sm)
+{
+ uint64_t max_size = metaslab_pp_maxsize(sm);
+
+ if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
+ return (B_FALSE);
+ return (B_TRUE);
+}
+
+
+static space_map_ops_t metaslab_ndf_ops = {
+ metaslab_pp_load,
+ metaslab_pp_unload,
+ metaslab_ndf_alloc,
+ metaslab_pp_claim,
+ metaslab_pp_free,
+ metaslab_pp_maxsize,
+ metaslab_ndf_fragmented
+};
+
+space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
/*
* ==========================================================================
@@ -522,7 +667,6 @@ metaslab_fini(metaslab_t *msp)
#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
#define METASLAB_ACTIVE_MASK \
(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
-#define METASLAB_SMO_BONUS_MULTIPLIER 2
static uint64_t
metaslab_weight(metaslab_t *msp)
@@ -555,25 +699,60 @@ metaslab_weight(metaslab_t *msp)
ASSERT(weight >= space && weight <= 2 * space);
/*
- * For locality, assign higher weight to metaslabs we've used before.
+ * For locality, assign higher weight to metaslabs which have
+ * a lower offset than what we've already activated.
*/
- if (smo->smo_object != 0)
- weight *= METASLAB_SMO_BONUS_MULTIPLIER;
+ if (sm->sm_start <= mg->mg_bonus_area)
+ weight *= (metaslab_smo_bonus_pct / 100);
ASSERT(weight >= space &&
- weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
+ weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
+
+ if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
+ /*
+ * If this metaslab is one we're actively using, adjust its
+ * weight to make it preferable to any inactive metaslab so
+ * we'll polish it off.
+ */
+ weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+ }
+ return (weight);
+}
+
+static void
+metaslab_prefetch(metaslab_group_t *mg)
+{
+ spa_t *spa = mg->mg_vd->vdev_spa;
+ metaslab_t *msp;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ int m;
+
+ mutex_enter(&mg->mg_lock);
/*
- * If this metaslab is one we're actively using, adjust its weight to
- * make it preferable to any inactive metaslab so we'll polish it off.
+ * Prefetch the next potential metaslabs
*/
- weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+ for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
+ space_map_t *sm = &msp->ms_map;
+ space_map_obj_t *smo = &msp->ms_smo;
- return (weight);
+ /* If we have reached our prefetch limit then we're done */
+ if (m >= metaslab_prefetch_limit)
+ break;
+
+ if (!sm->sm_loaded && smo->smo_object != 0) {
+ mutex_exit(&mg->mg_lock);
+ dmu_prefetch(spa->spa_meta_objset, smo->smo_object,
+ 0ULL, smo->smo_objsize);
+ mutex_enter(&mg->mg_lock);
+ }
+ }
+ mutex_exit(&mg->mg_lock);
}
static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
{
+ metaslab_group_t *mg = msp->ms_group;
space_map_t *sm = &msp->ms_map;
space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
@@ -588,6 +767,15 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
}
/*
+ * Track the bonus area as we activate new metaslabs.
+ */
+ if (sm->sm_start > mg->mg_bonus_area) {
+ mutex_enter(&mg->mg_lock);
+ mg->mg_bonus_area = sm->sm_start;
+ mutex_exit(&mg->mg_lock);
+ }
+
+ /*
* If we were able to load the map then make sure
* that this map is still able to satisfy our request.
*/
@@ -773,6 +961,32 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
mutex_exit(&msp->ms_lock);
}
+void
+metaslab_sync_reassess(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+
+ /*
+ * Re-evaluate all metaslabs which have lower offsets than the
+ * bonus area.
+ */
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp->ms_map.sm_start > mg->mg_bonus_area)
+ break;
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_group_sort(mg, msp, metaslab_weight(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+
+ /*
+ * Prefetch the next potential metaslabs
+ */
+ metaslab_prefetch(mg);
+}
+
static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
@@ -868,7 +1082,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
break;
- metaslab_passivate(msp, size - 1);
+ metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
mutex_exit(&msp->ms_lock);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index cb6f413c640b..da030f1084e0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -74,35 +74,38 @@ enum zti_modes {
zti_mode_fixed, /* value is # of threads (min 1) */
zti_mode_online_percent, /* value is % of online CPUs */
zti_mode_tune, /* fill from zio_taskq_tune_* */
+ zti_mode_null, /* don't create a taskq */
zti_nmodes
};
-#define ZTI_THREAD_FIX(n) { zti_mode_fixed, (n) }
-#define ZTI_THREAD_PCT(n) { zti_mode_online_percent, (n) }
-#define ZTI_THREAD_TUNE { zti_mode_tune, 0 }
+#define ZTI_FIX(n) { zti_mode_fixed, (n) }
+#define ZTI_PCT(n) { zti_mode_online_percent, (n) }
+#define ZTI_TUNE { zti_mode_tune, 0 }
+#define ZTI_NULL { zti_mode_null, 0 }
-#define ZTI_THREAD_ONE ZTI_THREAD_FIX(1)
+#define ZTI_ONE ZTI_FIX(1)
typedef struct zio_taskq_info {
- const char *zti_name;
- struct {
- enum zti_modes zti_mode;
- uint_t zti_value;
- } zti_nthreads[ZIO_TASKQ_TYPES];
+ enum zti_modes zti_mode;
+ uint_t zti_value;
} zio_taskq_info_t;
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
- "issue", "intr"
+ "issue", "issue_high", "intr", "intr_high"
};
-const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = {
- /* ISSUE INTR */
- { "spa_zio_null", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } },
- { "spa_zio_read", { ZTI_THREAD_FIX(8), ZTI_THREAD_TUNE } },
- { "spa_zio_write", { ZTI_THREAD_TUNE, ZTI_THREAD_FIX(8) } },
- { "spa_zio_free", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } },
- { "spa_zio_claim", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } },
- { "spa_zio_ioctl", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } },
+/*
+ * Define the taskq threads for the following I/O types:
+ * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
+ */
+const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
+ /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
+ { ZTI_FIX(8), ZTI_NULL, ZTI_TUNE, ZTI_NULL },
+ { ZTI_TUNE, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) },
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
};
enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
@@ -581,14 +584,14 @@ spa_activate(spa_t *spa, int mode)
spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
for (int t = 0; t < ZIO_TYPES; t++) {
- const zio_taskq_info_t *ztip = &zio_taskqs[t];
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
- enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
- uint_t value = ztip->zti_nthreads[q].zti_value;
+ const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+ enum zti_modes mode = ztip->zti_mode;
+ uint_t value = ztip->zti_value;
char name[32];
(void) snprintf(name, sizeof (name),
- "%s_%s", ztip->zti_name, zio_taskq_types[q]);
+ "%s_%s", zio_type_name[t], zio_taskq_types[q]);
if (mode == zti_mode_tune) {
mode = zio_taskq_tune_mode;
@@ -613,6 +616,10 @@ spa_activate(spa_t *spa, int mode)
TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
break;
+ case zti_mode_null:
+ spa->spa_zio_taskq[t][q] = NULL;
+ break;
+
case zti_mode_tune:
default:
panic("unrecognized mode for "
@@ -659,7 +666,8 @@ spa_deactivate(spa_t *spa)
for (int t = 0; t < ZIO_TYPES; t++) {
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
- taskq_destroy(spa->spa_zio_taskq[t][q]);
+ if (spa->spa_zio_taskq[t][q] != NULL)
+ taskq_destroy(spa->spa_zio_taskq[t][q]);
spa->spa_zio_taskq[t][q] = NULL;
}
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
index 75b55d5c1ca7..d0251419cbc4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -368,10 +368,8 @@ space_map_unload(space_map_t *sm)
uint64_t
space_map_maxsize(space_map_t *sm)
{
- if (sm->sm_loaded && sm->sm_ops != NULL)
- return (sm->sm_ops->smop_max(sm));
- else
- return (-1ULL);
+ ASSERT(sm->sm_ops != NULL);
+ return (sm->sm_ops->smop_max(sm));
}
uint64_t
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
index 5d3e11c971f9..c77b77205490 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
@@ -46,6 +46,7 @@ extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
extern void metaslab_fini(metaslab_t *msp);
extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_reassess(metaslab_group_t *mg);
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
index d67dea7e975e..5f0b77086b03 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -46,6 +46,7 @@ struct metaslab_group {
kmutex_t mg_lock;
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
+ uint64_t mg_bonus_area;
int64_t mg_bias;
metaslab_class_t *mg_class;
vdev_t *mg_vd;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
index f3124b1ecc0d..ecb065c3f98c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -87,7 +87,9 @@ typedef enum spa_log_state {
enum zio_taskq_type {
ZIO_TASKQ_ISSUE = 0,
+ ZIO_TASKQ_ISSUE_HIGH,
ZIO_TASKQ_INTERRUPT,
+ ZIO_TASKQ_INTERRUPT_HIGH,
ZIO_TASKQ_TYPES
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
index a682bbd409e8..6f935c9db27e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
@@ -77,6 +77,7 @@ struct space_map_ops {
void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
uint64_t (*smop_max)(space_map_t *sm);
+ boolean_t (*smop_fragmented)(space_map_t *sm);
};
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
index d7c0febdfc72..109b64ea9d07 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -107,14 +107,15 @@ enum zio_compress {
#define ZIO_PRIORITY_NOW (zio_priority_table[0])
#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1])
#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2])
-#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3])
-#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4])
-#define ZIO_PRIORITY_FREE (zio_priority_table[5])
-#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6])
-#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7])
-#define ZIO_PRIORITY_RESILVER (zio_priority_table[8])
-#define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
-#define ZIO_PRIORITY_TABLE_SIZE 10
+#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3])
+#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4])
+#define ZIO_PRIORITY_AGG (zio_priority_table[5])
+#define ZIO_PRIORITY_FREE (zio_priority_table[6])
+#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7])
+#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8])
+#define ZIO_PRIORITY_RESILVER (zio_priority_table[9])
+#define ZIO_PRIORITY_SCRUB (zio_priority_table[10])
+#define ZIO_PRIORITY_TABLE_SIZE 11
#define ZIO_FLAG_MUSTSUCCEED 0x00000
#define ZIO_FLAG_CANFAIL 0x00001
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
index befc8b36bc3f..140deed7a5bd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -1773,9 +1773,13 @@ void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
metaslab_t *msp;
+ boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
metaslab_sync_done(msp, txg);
+
+ if (reassess)
+ metaslab_sync_reassess(vd->vdev_mg);
}
void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
index ac8404e36acc..de3f1db75961 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -233,7 +233,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
ASSERT(size <= zfs_vdev_aggregation_limit);
aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
- zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
+ zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index 75b761711566..8ddf7cdd6544 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -49,11 +49,12 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
0, /* ZIO_PRIORITY_NOW */
0, /* ZIO_PRIORITY_SYNC_READ */
0, /* ZIO_PRIORITY_SYNC_WRITE */
- 6, /* ZIO_PRIORITY_ASYNC_READ */
- 4, /* ZIO_PRIORITY_ASYNC_WRITE */
- 4, /* ZIO_PRIORITY_FREE */
- 0, /* ZIO_PRIORITY_CACHE_FILL */
0, /* ZIO_PRIORITY_LOG_WRITE */
+ 1, /* ZIO_PRIORITY_CACHE_FILL */
+ 1, /* ZIO_PRIORITY_AGG */
+ 4, /* ZIO_PRIORITY_FREE */
+ 4, /* ZIO_PRIORITY_ASYNC_WRITE */
+ 6, /* ZIO_PRIORITY_ASYNC_READ */
10, /* ZIO_PRIORITY_RESILVER */
20, /* ZIO_PRIORITY_SCRUB */
};
@@ -64,7 +65,9 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
* ==========================================================================
*/
char *zio_type_name[ZIO_TYPES] = {
- "null", "read", "write", "free", "claim", "ioctl" };
+ "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
+ "zio_ioctl"
+};
#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */
#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */
@@ -942,6 +945,7 @@ zio_write_bp_init(zio_t *zio)
static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
{
+ spa_t *spa = zio->io_spa;
zio_type_t t = zio->io_type;
/*
@@ -958,7 +962,15 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
t = ZIO_TYPE_NULL;
- (void) taskq_dispatch_safe(zio->io_spa->spa_zio_taskq[t][q],
+ /*
+ * If this is a high priority I/O, then use the high priority taskq.
+ */
+ if (zio->io_priority == ZIO_PRIORITY_NOW &&
+ spa->spa_zio_taskq[t][q + 1] != NULL)
+ q++;
+
+ ASSERT3U(q, <, ZIO_TASKQ_TYPES);
+ (void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q],
(task_func_t *)zio_execute, zio, &zio->io_task);
}