author	Andriy Gapon <avg@FreeBSD.org>	2019-11-21 08:20:05 +0000
committer	Andriy Gapon <avg@FreeBSD.org>	2019-11-21 08:20:05 +0000
commit	489912da7bb487a33a7f0642a983c233b22381ba (patch)
tree	0fde8cde705e915892dcfb7fcefc6e73cfbc25ca /sys/cddl/contrib/opensolaris/uts
parent	c81e4a699bf02dfb6be1b84f8f4ee5c7544ce84d (diff)
parent	f4eba6fe5e46352368dd64b7df8ed55278b1175d (diff)
MFV r354382,r354385: 10601 10757 Pool allocation classes
illumos/illumos-gate@663207adb1669640c01c5ec6949ce78fd806efae
https://github.com/illumos/illumos-gate/commit/663207adb1669640c01c5ec6949ce78fd806efae

10601 Pool allocation classes
https://www.illumos.org/issues/10601
  illumos port of ZoL Pool allocation classes. Includes at least these
  two commits:
    441709695 Pool allocation classes misplacing small file blocks
    cc99f275a Pool allocation classes

10757 Add -gLp to zpool subcommands for alt vdev names
https://www.illumos.org/issues/10757
  Port from ZoL of
    d2f3e292d Add -gLp to zpool subcommands for alt vdev names
  Note that a subsequent ZoL commit changed -p to -P
    a77f29f93 Change full path subcommand flag from -p to -P

Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Portions contributed by: Håkan Johansson <f96hajo@chalmers.se>
Portions contributed by: Richard Yao <ryao@gentoo.org>
Portions contributed by: Chunwei Chen <david.chen@nutanix.com>
Portions contributed by: loli10K <ezomori.nozomu@gmail.com>
Author: Don Brady <don.brady@delphix.com>

11541 allocation_classes feature must be enabled to add log device
illumos/illumos-gate@c1064fd7ce62fe763a4475e9988ffea3b22137de
https://github.com/illumos/illumos-gate/commit/c1064fd7ce62fe763a4475e9988ffea3b22137de
https://www.illumos.org/issues/11541
  After the allocation_classes feature was integrated, one can no longer
  add a log device to a pool unless that feature is enabled. There is an
  explicit check for this, but it is unnecessary in the case of log
  devices, so we should handle this better instead of forcing the
  feature to be enabled.
Author: Jerry Jelinek <jerry.jelinek@joyent.com>

FreeBSD notes.
I faithfully added the new -g, -L, -P flags, but only -g does something:
vdev GUIDs are displayed instead of device names. -L, resolve symlinks,
and -P, display full disk paths, do nothing at the moment.
The use of special vdevs is backward compatible for read-only access, so
root pools should be bootable, but exercise caution.

MFC after:	4 weeks
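For orientation, a minimal usage sketch of the merged feature; the pool,
dataset, and device names below are hypothetical, and zpool(8)/zfs(8)
remain the authoritative reference for the syntax:

	# Dedicate a mirrored pair to the special allocation class
	# (metadata, and optionally small file blocks, land there).
	zpool create tank raidz da0 da1 da2 special mirror ada0 ada1

	# Dedicate a vdev to dedup table (DDT) data; requires the
	# allocation_classes pool feature to be enabled.
	zpool add tank dedup mirror ada2 ada3

	# Opt file blocks <= 32K of a dataset into the special class
	# (the new special_small_blocks property; power of 2, <= 128K).
	zfs set special_small_blocks=32K tank/fs

	# Display vdev GUIDs instead of device names (the new -g flag;
	# -L and -P are accepted but are no-ops on FreeBSD for now).
	zpool status -g tank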
Notes:
	svn path=/head/; revision=354941
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts')
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	2
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	20
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	143
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	69
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	111
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	14
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h	7
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h	8
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h	7
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h	3
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h	3
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h	10
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	2
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	226
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	23
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c	39
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c	9
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	104
-rw-r--r--	sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h	12
19 files changed, 663 insertions, 149 deletions
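The FreeBSD-specific knobs added in spa_misc.c below are exposed as
read/write tunables (CTLFLAG_RWTUN); a sketch of inspecting or adjusting
them, using the default values introduced by this change:

	# Route dedup table data and user-data indirect blocks to the
	# special class (both default to 1, i.e. enabled).
	sysctl vfs.zfs.ddt_data_is_special=1
	sysctl vfs.zfs.user_indirect_is_special=1

	# Once the special class is more than (100 - pct)% allocated,
	# only metadata is still admitted; the default reserve is 25%.
	sysctl vfs.zfs.special_class_metadata_reserve_pct=25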
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index 3c38749e1b82..0ee994a2ac3e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -2514,6 +2514,8 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
zp->zp_nopwrite = nopwrite;
+ zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
+ os->os_zpl_special_smallblock : 0;
}
int
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
index ca7bb3b455a6..d5753e8159e0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -304,6 +304,20 @@ dnodesize_changed_cb(void *arg, uint64_t newval)
}
static void
+smallblk_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE);
+ ASSERT(ISP2(newval));
+
+ os->os_zpl_special_smallblock = newval;
+}
+
+static void
logbias_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
@@ -518,6 +532,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
zfs_prop_to_name(ZFS_PROP_DNODESIZE),
dnodesize_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(
+ ZFS_PROP_SPECIAL_SMALL_BLOCKS),
+ smallblk_changed_cb, os);
+ }
}
if (needlock)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index 9085005e889b..0e231e831251 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -23,6 +23,7 @@
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <sys/zfs_context.h>
@@ -356,7 +357,7 @@ metaslab_class_validate(metaslab_class_t *mc)
return (0);
}
-void
+static void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
@@ -420,7 +421,8 @@ metaslab_class_get_minblocksize(metaslab_class_t *mc)
void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
- vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+ spa_t *spa = mc->mc_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
uint64_t *mc_hist;
int i;
@@ -928,7 +930,8 @@ metaslab_group_histogram_verify(metaslab_group_t *mg)
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
- if (msp->ms_sm == NULL)
+ /* skip if not active or not a member */
+ if (msp->ms_sm == NULL || msp->ms_group != mg)
continue;
for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
@@ -1061,12 +1064,14 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
continue;
+ if (msp->ms_group != mg)
+ continue;
valid_ms++;
fragmentation += msp->ms_fragmentation;
}
- if (valid_ms <= vd->vdev_ms_count / 2)
+ if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
return (ZFS_FRAG_INVALID);
fragmentation /= valid_ms;
@@ -1097,7 +1102,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* groups to select from. Otherwise, we always consider it eligible
* for allocations.
*/
- if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
+ if ((mc != spa_normal_class(spa) &&
+ mc != spa_special_class(spa) &&
+ mc != spa_dedup_class(spa)) ||
+ mc->mc_groups <= 1)
return (B_TRUE);
/*
@@ -1559,12 +1567,26 @@ metaslab_unload(metaslab_t *msp)
msp->ms_max_size = 0;
}
+static void
+metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
+ int64_t defer_delta, int64_t space_delta)
+{
+ vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
+
+ ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
+ vdev_deflated_space(vd, space_delta));
+}
+
int
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
metaslab_t **msp)
{
vdev_t *vd = mg->mg_vd;
- objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
metaslab_t *ms;
int error;
@@ -1622,8 +1644,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
/*
* If metaslab_debug_load is set and we're initializing a metaslab
- * that has an allocated space map object then load the its space
- * map so that can verify frees.
+ * that has an allocated space map object then load the space map
+ * so that we can verify frees.
*/
if (metaslab_debug_load && ms->ms_sm != NULL) {
mutex_enter(&ms->ms_lock);
@@ -1645,16 +1667,19 @@ void
metaslab_fini(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
metaslab_group_remove(mg, msp);
mutex_enter(&msp->ms_lock);
VERIFY(msp->ms_group == NULL);
- vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
- 0, -msp->ms_size);
+ metaslab_space_update(vd, mg->mg_class,
+ -space_map_allocated(msp->ms_sm), 0, -msp->ms_size);
+
space_map_close(msp->ms_sm);
metaslab_unload(msp);
+
range_tree_destroy(msp->ms_allocatable);
range_tree_destroy(msp->ms_freeing);
range_tree_destroy(msp->ms_freed);
@@ -2669,7 +2694,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
ASSERT3P(msp->ms_checkpointing, ==, NULL);
msp->ms_checkpointing = range_tree_create(NULL, NULL);
- vdev_space_update(vd, 0, 0, msp->ms_size);
+ metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
}
ASSERT0(range_tree_space(msp->ms_freeing));
ASSERT0(range_tree_space(msp->ms_checkpointing));
@@ -2691,7 +2716,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
defer_delta -= range_tree_space(*defer_tree);
}
- vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
+ metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
+ defer_delta, 0);
/*
* If there's a metaslab_load() in progress, wait for it to complete
@@ -2790,21 +2816,25 @@ metaslab_sync_reassess(metaslab_group_t *mg)
spa_config_exit(spa, SCL_ALLOC, FTAG);
}
-static uint64_t
-metaslab_distance(metaslab_t *msp, dva_t *dva)
+/*
+ * When writing a ditto block (i.e. more than one DVA for a given BP) on
+ * the same vdev as an existing DVA of this BP, then try to allocate it
+ * on a different metaslab than existing DVAs (i.e. a unique metaslab).
+ */
+static boolean_t
+metaslab_is_unique(metaslab_t *msp, dva_t *dva)
{
- uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
- uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
- uint64_t start = msp->ms_id;
+ uint64_t dva_ms_id;
+
+ if (DVA_GET_ASIZE(dva) == 0)
+ return (B_TRUE);
if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
- return (1ULL << 63);
+ return (B_TRUE);
- if (offset < start)
- return ((start - offset) << ms_shift);
- if (offset > start)
- return ((offset - start) << ms_shift);
- return (0);
+ dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
+
+ return (msp->ms_id != dva_ms_id);
}
/*
@@ -3065,7 +3095,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
*/
static metaslab_t *
find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
- dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
+ dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
{
avl_index_t idx;
@@ -3100,13 +3130,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
break;
- uint64_t target_distance = min_distance
- + (space_map_allocated(msp->ms_sm) != 0 ? 0 :
- min_distance >> 1);
-
for (i = 0; i < d; i++) {
- if (metaslab_distance(msp, &dva[i]) < target_distance)
- break;
+ if (want_unique &&
+ !metaslab_is_unique(msp, &dva[i]))
+ break; /* try another metaslab */
}
if (i == d)
break;
@@ -3124,8 +3151,8 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
/* ARGSUSED */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
- int allocator)
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
+ int d, int allocator)
{
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
@@ -3179,7 +3206,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
was_active = B_TRUE;
} else {
msp = find_valid_metaslab(mg, activation_weight, dva, d,
- min_distance, asize, allocator, zal, search,
+ want_unique, asize, allocator, zal, search,
&was_active);
}
@@ -3317,6 +3344,7 @@ next:
* metaslab.
*/
ASSERT(!metaslab_should_allocate(msp, asize));
+
mutex_exit(&msp->ms_lock);
}
mutex_exit(&msp->ms_lock);
@@ -3326,14 +3354,14 @@ next:
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
- int allocator)
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
+ int d, int allocator)
{
uint64_t offset;
ASSERT(mg->mg_initialized);
- offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
- min_distance, dva, d, allocator);
+ offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
+ dva, d, allocator);
mutex_enter(&mg->mg_lock);
if (offset == -1ULL) {
@@ -3361,14 +3389,6 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
}
/*
- * If we have to write a ditto block (i.e. more than one DVA for a given BP)
- * on the same vdev as an existing DVA of this BP, then try to allocate it
- * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
- * existing DVAs.
- */
-int ditto_same_vdev_distance_shift = 3;
-
-/*
* Allocate a block for the specified i/o.
*/
int
@@ -3384,6 +3404,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
/*
* For testing, make some blocks above a certain size be gang blocks.
+ * This will also test spilling from special to normal.
*/
if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
@@ -3435,6 +3456,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
} else {
+ ASSERT(mc->mc_rotor != NULL);
mg = mc->mc_rotor;
}
@@ -3499,25 +3521,17 @@ top:
ASSERT(mg->mg_class == mc);
- /*
- * If we don't need to try hard, then require that the
- * block be 1/8th of the device away from any other DVAs
- * in this BP. If we are trying hard, allow any offset
- * to be used (distance=0).
- */
- uint64_t distance = 0;
- if (!try_hard) {
- distance = vd->vdev_asize >>
- ditto_same_vdev_distance_shift;
- if (distance <= (1ULL << vd->vdev_ms_shift))
- distance = 0;
- }
-
uint64_t asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
+ /*
+ * If we don't need to try hard, then require that the
+ * block be on a different metaslab from any other DVAs
+ * in this BP (unique=true). If we are trying hard, then
+ * allow any metaslab to be used (unique=false).
+ */
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
- distance, dva, d, allocator);
+ !try_hard, dva, d, allocator);
if (offset != -1ULL) {
/*
@@ -3896,7 +3910,8 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
if (reserved_slots < max)
available_slots = max - reserved_slots;
- if (slots <= available_slots || GANG_ALLOCATION(flags)) {
+ if (slots <= available_slots || GANG_ALLOCATION(flags) ||
+ flags & METASLAB_MUST_RESERVE) {
/*
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
@@ -4179,9 +4194,11 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
- for (int d = 0; d < ndvas; d++)
- if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
+ for (int d = 0; d < ndvas; d++) {
+ error = metaslab_claim_dva(spa, &dva[d], txg);
+ if (error != 0)
break;
+ }
spa_config_exit(spa, SCL_ALLOC, FTAG);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index ddb7427440c1..41ba382965ae 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -29,6 +29,7 @@
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Toomas Soome <tsoome@me.com>
* Copyright 2018 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2017 Datto Inc.
* Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
*/
@@ -312,8 +313,14 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
ASSERT(MUTEX_HELD(&spa->spa_props_lock));
if (rvd != NULL) {
- alloc = metaslab_class_get_alloc(spa_normal_class(spa));
- size = metaslab_class_get_space(spa_normal_class(spa));
+ alloc = metaslab_class_get_alloc(mc);
+ alloc += metaslab_class_get_alloc(spa_special_class(spa));
+ alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
+
+ size = metaslab_class_get_space(mc);
+ size += metaslab_class_get_space(spa_special_class(spa));
+ size += metaslab_class_get_space(spa_dedup_class(spa));
+
spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
@@ -1227,6 +1234,8 @@ spa_activate(spa_t *spa, int mode)
spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
/* Try to create a covering process */
mutex_enter(&spa->spa_proc_lock);
@@ -1338,6 +1347,12 @@ spa_deactivate(spa_t *spa)
metaslab_class_destroy(spa->spa_log_class);
spa->spa_log_class = NULL;
+ metaslab_class_destroy(spa->spa_special_class);
+ spa->spa_special_class = NULL;
+
+ metaslab_class_destroy(spa->spa_dedup_class);
+ spa->spa_dedup_class = NULL;
+
/*
* If this was part of an import or the open otherwise failed, we may
* still have errors left in the queues. Empty them just in case.
@@ -5096,7 +5111,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
char *poolname;
nvlist_t *nvl;
- if (nvlist_lookup_string(props,
+ if (props == NULL ||
+ nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
poolname = (char *)pool;
@@ -5184,10 +5200,16 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
(error = spa_validate_aux(spa, nvroot, txg,
VDEV_ALLOC_ADD)) == 0) {
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_ashift_optimize(rvd->vdev_child[c]);
- vdev_metaslab_set_size(rvd->vdev_child[c]);
- vdev_expand(rvd->vdev_child[c], txg);
+ /*
+ * instantiate the metaslab groups (this will dirty the vdevs)
+ * we can no longer error exit past this point
+ */
+ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ vdev_ashift_optimize(vd);
+ vdev_metaslab_set_size(vd);
+ vdev_expand(vd, txg);
}
}
@@ -7564,8 +7586,14 @@ spa_async_thread(void *arg)
mutex_enter(&spa_namespace_lock);
old_space = metaslab_class_get_space(spa_normal_class(spa));
+ old_space += metaslab_class_get_space(spa_special_class(spa));
+ old_space += metaslab_class_get_space(spa_dedup_class(spa));
+
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+
new_space = metaslab_class_get_space(spa_normal_class(spa));
+ new_space += metaslab_class_get_space(spa_special_class(spa));
+ new_space += metaslab_class_get_space(spa_dedup_class(spa));
mutex_exit(&spa_namespace_lock);
/*
@@ -8303,6 +8331,9 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
+ metaslab_class_t *normal = spa_normal_class(spa);
+ metaslab_class_t *special = spa_special_class(spa);
+ metaslab_class_t *dedup = spa_dedup_class(spa);
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
dmu_tx_t *tx;
@@ -8402,9 +8433,13 @@ spa_sync(spa_t *spa, uint64_t txg)
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
+ metaslab_class_t *mc;
+
+ if (mg == NULL || !metaslab_group_initialized(mg))
+ continue;
- if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
- !metaslab_group_initialized(mg))
+ mc = mg->mg_class;
+ if (mc != normal && mc != special && mc != dedup)
continue;
/*
@@ -8423,12 +8458,18 @@ spa_sync(spa_t *spa, uint64_t txg)
}
slots_per_allocator += zfs_vdev_def_queue_depth;
}
- metaslab_class_t *mc = spa_normal_class(spa);
+
for (int i = 0; i < spa->spa_alloc_count; i++) {
- ASSERT0(zfs_refcount_count(&mc->mc_alloc_slots[i]));
- mc->mc_alloc_max_slots[i] = slots_per_allocator;
- }
- mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i]));
+ ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i]));
+ ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i]));
+ normal->mc_alloc_max_slots[i] = slots_per_allocator;
+ special->mc_alloc_max_slots[i] = slots_per_allocator;
+ dedup->mc_alloc_max_slots[i] = slots_per_allocator;
+ }
+ normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
index 268c3d85a8cf..0706767a9d1f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -27,6 +27,7 @@
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <sys/zfs_context.h>
@@ -469,6 +470,31 @@ spa_load_note(spa_t *spa, const char *fmt, ...)
}
/*
+ * By default dedup and user data indirects land in the special class
+ */
+int zfs_ddt_data_is_special = B_TRUE;
+int zfs_user_indirect_is_special = B_TRUE;
+
+/*
+ * The percentage of special class final space reserved for metadata only.
+ * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only
+ * let metadata into the class.
+ */
+int zfs_special_class_metadata_reserve_pct = 25;
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+SYSCTL_INT(_vfs_zfs, OID_AUTO, ddt_data_is_special, CTLFLAG_RWTUN,
+ &zfs_ddt_data_is_special, 0,
+ "Whether DDT data is eligible for the special class vdevs");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, user_indirect_is_special, CTLFLAG_RWTUN,
+ &zfs_user_indirect_is_special, 0,
+ "Whether indirect blocks are eligible for the special class vdevs");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, special_class_metadata_reserve_pct,
+ CTLFLAG_RWTUN, &zfs_special_class_metadata_reserve_pct, 0,
+ "Percentage of space in the special class reserved solely for metadata");
+#endif
+
+/*
* ==========================================================================
* SPA config locking
* ==========================================================================
@@ -1297,6 +1323,8 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
*/
ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
spa_config_exit(spa, SCL_ALL, spa);
@@ -1640,6 +1668,16 @@ zfs_strtonum(const char *str, char **nptr)
return (val);
}
+void
+spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
+{
+ /*
+ * We bump the feature refcount for each special vdev added to the pool
+ */
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
+ spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
+}
+
/*
* ==========================================================================
* Accessor functions
@@ -1889,6 +1927,79 @@ spa_log_class(spa_t *spa)
return (spa->spa_log_class);
}
+metaslab_class_t *
+spa_special_class(spa_t *spa)
+{
+ return (spa->spa_special_class);
+}
+
+metaslab_class_t *
+spa_dedup_class(spa_t *spa)
+{
+ return (spa->spa_dedup_class);
+}
+
+/*
+ * Locate an appropriate allocation class
+ */
+metaslab_class_t *
+spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
+ uint_t level, uint_t special_smallblk)
+{
+ if (DMU_OT_IS_ZIL(objtype)) {
+ if (spa->spa_log_class->mc_groups != 0)
+ return (spa_log_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
+
+ if (DMU_OT_IS_DDT(objtype)) {
+ if (spa->spa_dedup_class->mc_groups != 0)
+ return (spa_dedup_class(spa));
+ else if (has_special_class && zfs_ddt_data_is_special)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ /* Indirect blocks for user data can land in special if allowed */
+ if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
+ if (has_special_class && zfs_user_indirect_is_special)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ if (DMU_OT_IS_METADATA(objtype) || level > 0) {
+ if (has_special_class)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ /*
+ * Allow small file blocks in special class in some cases (like
+ * for the dRAID vdev feature). But always leave a reserve of
+ * zfs_special_class_metadata_reserve_pct exclusively for metadata.
+ */
+ if (DMU_OT_IS_FILE(objtype) &&
+ has_special_class && size <= special_smallblk) {
+ metaslab_class_t *special = spa_special_class(spa);
+ uint64_t alloc = metaslab_class_get_alloc(special);
+ uint64_t space = metaslab_class_get_space(special);
+ uint64_t limit =
+ (space * (100 - zfs_special_class_metadata_reserve_pct))
+ / 100;
+
+ if (alloc < limit)
+ return (special);
+ }
+
+ return (spa_normal_class(spa));
+}
+
void
spa_evicting_os_register(spa_t *spa, objset_t *os)
{
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
index 3e68996fb11f..3f79409d65d1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -21,13 +21,14 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2013 DEY Storage Systems, Inc.
* Copyright 2014 HybridCluster. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2014 Integros [integros.com]
*/
@@ -126,6 +127,16 @@ typedef enum dmu_object_byteswap {
((ot) & DMU_OT_METADATA) : \
dmu_ot[(ot)].ot_metadata)
+#define DMU_OT_IS_DDT(ot) \
+ ((ot) == DMU_OT_DDT_ZAP)
+
+#define DMU_OT_IS_ZIL(ot) \
+ ((ot) == DMU_OT_INTENT_LOG)
+
+/* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */
+#define DMU_OT_IS_FILE(ot) \
+ ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER)
+
#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache)
@@ -216,6 +227,7 @@ typedef enum dmu_object_type {
*
* The DMU_OTN_* types do not have entries in the dmu_ot table,
- * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead
+ * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
* of indexing into dmu_ot directly (this works for both DMU_OT_* types
* and DMU_OTN_* types).
*/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
index 3028f0436566..cae1c7719a83 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
@@ -113,6 +113,11 @@ struct objset {
uint64_t os_normalization;
uint64_t os_utf8only;
uint64_t os_casesensitivity;
+ /*
+ * The largest zpl file block allowed in special class.
+ * cached here instead of zfsvfs for easier access.
+ */
+ int os_zpl_special_smallblock;
/*
* Pointer is constant; the blkptr it points to is protected by
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
index 007ac87fa39f..285cee006778 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#ifndef _SYS_METASLAB_H
@@ -56,12 +57,17 @@ void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
uint64_t metaslab_block_maxsize(metaslab_t *);
+/*
+ * metaslab alloc flags
+ */
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_ASYNC_ALLOC 0x8
#define METASLAB_DONT_THROTTLE 0x10
+#define METASLAB_MUST_RESERVE 0x20
+#define METASLAB_FASTWRITE 0x40
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
@@ -92,8 +98,6 @@ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
zio_t *, int);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
-void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
- int64_t, int64_t);
uint64_t metaslab_class_get_alloc(metaslab_class_t *);
uint64_t metaslab_class_get_space(metaslab_class_t *);
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
index 47bf3c418957..925358febd7c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
@@ -27,6 +27,7 @@
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
#ifndef _SYS_SPA_H
@@ -821,6 +822,11 @@ extern uint64_t spa_version(spa_t *spa);
extern boolean_t spa_deflate(spa_t *spa);
extern metaslab_class_t *spa_normal_class(spa_t *spa);
extern metaslab_class_t *spa_log_class(spa_t *spa);
+extern metaslab_class_t *spa_special_class(spa_t *spa);
+extern metaslab_class_t *spa_dedup_class(spa_t *spa);
+extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
+ dmu_object_type_t objtype, uint_t level, uint_t special_smallblk);
+
extern void spa_evicting_os_register(spa_t *, objset_t *os);
extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
extern void spa_evicting_os_wait(spa_t *spa);
@@ -883,6 +889,7 @@ extern boolean_t spa_trust_config(spa_t *spa);
extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
+extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
extern int spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
index 273456a6fedc..5d7683057911 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -26,6 +26,7 @@
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
#ifndef _SYS_SPA_IMPL_H
@@ -219,6 +220,8 @@ struct spa {
boolean_t spa_is_initializing; /* true while opening pool */
metaslab_class_t *spa_normal_class; /* normal data class */
metaslab_class_t *spa_log_class; /* intent log data class */
+ metaslab_class_t *spa_special_class; /* special allocation class */
+ metaslab_class_t *spa_dedup_class; /* dedup allocation class */
uint64_t spa_first_txg; /* first txg after spa_open() */
uint64_t spa_final_txg; /* txg of export/destroy */
uint64_t spa_freeze_txg; /* freeze pool at this txg */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
index 204bb4cd41db..f1a39ef48ecc 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#ifndef _SYS_VDEV_H
@@ -110,6 +111,8 @@ extern boolean_t vdev_children_are_offline(vdev_t *vd);
extern void vdev_space_update(vdev_t *vd,
int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
+extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);
+
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
index 13058124aa2c..e731f2b28b73 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
#ifndef _SYS_VDEV_IMPL_H
@@ -149,6 +150,14 @@ struct vdev_queue {
uint64_t vq_lastoffset;
};
+typedef enum vdev_alloc_bias {
+ VDEV_BIAS_NONE,
+ VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */
+ VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */
+ VDEV_BIAS_DEDUP /* dedicated to dedup metadata */
+} vdev_alloc_bias_t;
+
+
/*
* On-disk indirect vdev state.
*
@@ -261,6 +270,7 @@ struct vdev {
boolean_t vdev_ishole; /* is a hole in the namespace */
kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
uint64_t vdev_top_zap;
+ vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */
/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
index 5c2170f9d98b..dcf8f5d48924 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -307,6 +307,7 @@ typedef struct zio_prop {
boolean_t zp_dedup;
boolean_t zp_dedup_verify;
boolean_t zp_nopwrite;
+ uint32_t zp_zpl_smallblk;
} zio_prop_t;
typedef struct zio_cksum_report zio_cksum_report_t;
@@ -460,6 +461,7 @@ struct zio {
vdev_t *io_vd;
void *io_vsd;
const zio_vsd_ops_t *io_vsd_ops;
+ metaslab_class_t *io_metaslab_class; /* dva throttle class */
uint64_t io_offset;
hrtime_t io_timestamp;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
index 5b58fa34e2f9..bbc5f416091e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -26,7 +26,8 @@
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Toomas Soome <tsoome@me.com>
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <sys/zfs_context.h>
@@ -314,6 +315,25 @@ vdev_getops(const char *type)
return (ops);
}
+/*
+ * Derive the enumerated allocation bias from string input.
+ * String origin is either the per-vdev zap or zpool(1M).
+ */
+static vdev_alloc_bias_t
+vdev_derive_alloc_bias(const char *bias)
+{
+ vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+
+ if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
+ alloc_bias = VDEV_BIAS_LOG;
+ else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
+ alloc_bias = VDEV_BIAS_SPECIAL;
+ else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
+ alloc_bias = VDEV_BIAS_DEDUP;
+
+ return (alloc_bias);
+}
+
/* ARGSUSED */
void
vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
@@ -645,6 +665,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
uint64_t guid = 0, islog, nparity;
vdev_t *vd;
vdev_indirect_config_t *vic;
+ vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+ boolean_t top_level = (parent && !parent->vdev_parent);
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
@@ -731,11 +753,33 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
}
ASSERT(nparity != -1ULL);
+ /*
+ * If creating a top-level vdev, check for allocation classes input
+ */
+ if (top_level && alloctype == VDEV_ALLOC_ADD) {
+ char *bias;
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ &bias) == 0) {
+ alloc_bias = vdev_derive_alloc_bias(bias);
+
+ /* spa_vdev_add() expects feature to be enabled */
+ if (alloc_bias != VDEV_BIAS_LOG &&
+ spa->spa_load_state != SPA_LOAD_CREATE &&
+ !spa_feature_is_enabled(spa,
+ SPA_FEATURE_ALLOCATION_CLASSES)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+ }
+ }
+
vd = vdev_alloc_common(spa, id, guid, ops);
vic = &vd->vdev_indirect_config;
vd->vdev_islog = islog;
vd->vdev_nparity = nparity;
+ if (top_level && alloc_bias != VDEV_BIAS_NONE)
+ vd->vdev_alloc_bias = alloc_bias;
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
vd->vdev_path = spa_strdup(vd->vdev_path);
@@ -786,7 +830,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
/*
* If we're a top-level vdev, try to load the allocation parameters.
*/
- if (parent && !parent->vdev_parent &&
+ if (top_level &&
(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
&vd->vdev_ms_array);
@@ -802,14 +846,12 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
ASSERT0(vd->vdev_top_zap);
}
- if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
+ if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
ASSERT(alloctype == VDEV_ALLOC_LOAD ||
alloctype == VDEV_ALLOC_ADD ||
alloctype == VDEV_ALLOC_SPLIT ||
alloctype == VDEV_ALLOC_ROOTPOOL);
- vd->vdev_mg = metaslab_group_create(islog ?
- spa_log_class(spa) : spa_normal_class(spa), vd,
- spa->spa_alloc_count);
+ /* Note: metaslab_group_create() is now deferred */
}
if (vd->vdev_ops->vdev_op_leaf &&
@@ -1043,6 +1085,9 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
svd->vdev_checkpoint_sm = NULL;
+ tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
+ svd->vdev_alloc_bias = VDEV_BIAS_NONE;
+
tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
@@ -1198,6 +1243,55 @@ vdev_remove_parent(vdev_t *cvd)
vdev_free(mvd);
}
+static void
+vdev_metaslab_group_create(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ /*
+ * metaslab_group_create was delayed until allocation bias was available
+ */
+ if (vd->vdev_mg == NULL) {
+ metaslab_class_t *mc;
+
+ if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
+ vd->vdev_alloc_bias = VDEV_BIAS_LOG;
+
+ ASSERT3U(vd->vdev_islog, ==,
+ (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
+
+ switch (vd->vdev_alloc_bias) {
+ case VDEV_BIAS_LOG:
+ mc = spa_log_class(spa);
+ break;
+ case VDEV_BIAS_SPECIAL:
+ mc = spa_special_class(spa);
+ break;
+ case VDEV_BIAS_DEDUP:
+ mc = spa_dedup_class(spa);
+ break;
+ default:
+ mc = spa_normal_class(spa);
+ }
+
+ vd->vdev_mg = metaslab_group_create(mc, vd,
+ spa->spa_alloc_count);
+
+ /*
+ * The spa ashift values currently only reflect the
+ * general vdev classes. Class destination is late
+ * binding so ashift checking had to wait until now
+ */
+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+ mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
+ if (vd->vdev_ashift > spa->spa_max_ashift)
+ spa->spa_max_ashift = vd->vdev_ashift;
+ if (vd->vdev_ashift < spa->spa_min_ashift)
+ spa->spa_min_ashift = vd->vdev_ashift;
+ }
+ }
+}
+
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
@@ -1208,6 +1302,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
metaslab_t **mspp;
int error;
+ boolean_t expanding = (oldc != 0);
ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
@@ -1223,7 +1318,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
- if (oldc != 0) {
+ if (expanding) {
bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
}
@@ -1249,6 +1344,17 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
}
}
+#ifndef _KERNEL
+ /*
+ * To accommodate zdb_leak_init() fake indirect
+ * metaslabs, we allocate a metaslab group for
+ * indirect vdevs which normally don't have one.
+ */
+ if (vd->vdev_mg == NULL) {
+ ASSERT0(vdev_is_concrete(vd));
+ vdev_metaslab_group_create(vd);
+ }
+#endif
error = metaslab_init(vd->vdev_mg, m, object, txg,
&(vd->vdev_ms[m]));
if (error != 0) {
@@ -1266,8 +1372,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
* the metaslabs since we want to ensure that no new
* allocations are performed on this device.
*/
- if (oldc == 0 && !vd->vdev_removing)
+ if (!expanding && !vd->vdev_removing) {
metaslab_group_activate(vd->vdev_mg);
+ }
if (txg == 0)
spa_config_exit(spa, SCL_ALLOC, FTAG);
@@ -1750,9 +1857,13 @@ vdev_open(vdev_t *vd)
/*
* Track the min and max ashift values for normal data devices.
+ *
+ * DJB - TBD these should perhaps be tracked per allocation class
+ * (e.g. spa_min_ashift is used to round up post compression buffers)
*/
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
- !vd->vdev_islog && vd->vdev_aux == NULL) {
+ vd->vdev_alloc_bias == VDEV_BIAS_NONE &&
+ vd->vdev_aux == NULL) {
if (vd->vdev_ashift > spa->spa_max_ashift)
spa->spa_max_ashift = vd->vdev_ashift;
if (vd->vdev_ashift < spa->spa_min_ashift)
@@ -2250,13 +2361,13 @@ vdev_metaslab_set_size(vdev_t *vd)
*
* The net effect of applying the above constraints is summarized below.
*
- * vdev size metaslab count
+ * vdev size metaslab count
* --------------|-----------------
- * < 8GB ~16
- * 8GB - 100GB one per 512MB
- * 100GB - 3TB ~200
- * 3TB - 2PB one per 16GB
- * > 2PB ~131,072
+ * < 8GB ~16
+ * 8GB - 100GB one per 512MB
+ * 100GB - 3TB ~200
+ * 3TB - 2PB one per 16GB
+ * > 2PB ~131,072
* --------------------------------
*
* Finally, note that all of the above calculate the initial
@@ -2679,6 +2790,30 @@ vdev_dtl_load(vdev_t *vd)
return (error);
}
+static void
+vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
+ const char *string;
+
+ ASSERT(alloc_bias != VDEV_BIAS_NONE);
+
+ string =
+ (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
+ (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
+ (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;
+
+ ASSERT(string != NULL);
+ VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
+ 1, strlen(string) + 1, string, tx));
+
+ if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
+ spa_activate_allocation_classes(spa, tx);
+ }
+}
+
void
vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
{
@@ -2715,8 +2850,11 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
}
if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
+ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
+ vdev_zap_allocation_data(vd, tx);
}
}
+
for (uint64_t i = 0; i < vd->vdev_children; i++) {
vdev_construct_zaps(vd->vdev_child[i], tx);
}
@@ -2914,9 +3052,26 @@ vdev_load(vdev_t *vd)
vdev_set_deflate_ratio(vd);
/*
+ * On spa_load path, grab the allocation bias from our zap
+ */
+ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+ spa_t *spa = vd->vdev_spa;
+ char bias_str[64];
+
+ if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
+ bias_str) == 0) {
+ ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
+ vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
+ }
+ }
+
+ /*
* If this is a top-level vdev, initialize its metaslabs.
*/
if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
+ vdev_metaslab_group_create(vd);
+
if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
@@ -3111,6 +3266,7 @@ vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
metaslab_group_histogram_verify(mg);
metaslab_class_histogram_verify(mg->mg_class);
+
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
ASSERT0(mg->mg_histogram[i]);
}
@@ -3700,7 +3856,8 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vs->vs_physical_ashift = vd->vdev_physical_ashift;
if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
vdev_is_concrete(vd)) {
- vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
+ vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
+ vd->vdev_mg->mg_fragmentation : 0;
}
/*
@@ -3886,19 +4043,25 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
}
}
+int64_t
+vdev_deflated_space(vdev_t *vd, int64_t space)
+{
+ ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
+ ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
+
+ return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
+}
+
/*
- * Update the in-core space usage stats for this vdev, its metaslab class,
- * and the root vdev.
+ * Update the in-core space usage stats for this vdev and the root vdev.
*/
void
vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
int64_t space_delta)
{
- int64_t dspace_delta = space_delta;
+ int64_t dspace_delta;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
- metaslab_group_t *mg = vd->vdev_mg;
- metaslab_class_t *mc = mg ? mg->mg_class : NULL;
ASSERT(vd == vd->vdev_top);
@@ -3908,10 +4071,7 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
* because the root vdev's psize-to-asize is simply the max of its
* childrens', thus not accurate enough for us.
*/
- ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
- ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
- dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
- vd->vdev_deflate_ratio;
+ dspace_delta = vdev_deflated_space(vd, space_delta);
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_alloc += alloc_delta;
@@ -3919,21 +4079,15 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
vd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&vd->vdev_stat_lock);
- if (mc == spa_normal_class(spa)) {
+ /* every class but log contributes to root space stats */
+ if (vd->vdev_mg != NULL && !vd->vdev_islog) {
mutex_enter(&rvd->vdev_stat_lock);
rvd->vdev_stat.vs_alloc += alloc_delta;
rvd->vdev_stat.vs_space += space_delta;
rvd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&rvd->vdev_stat_lock);
}
-
- if (mc != NULL) {
- ASSERT(rvd == vd->vdev_parent);
- ASSERT(vd->vdev_ms_count != 0);
-
- metaslab_class_space_update(mc,
- alloc_delta, defer_delta, space_delta, dspace_delta);
- }
+ /* Note: metaslab_class_space_update moved to metaslab_space_update */
}
/*
@@ -4349,7 +4503,9 @@ vdev_expand(vdev_t *vd, uint64_t txg)
vdev_set_deflate_ratio(vd);
- if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
+ vdev_is_concrete(vd)) {
+ vdev_metaslab_group_create(vd);
VERIFY(vdev_metaslab_init(vd, txg) == 0);
vdev_config_dirty(vd);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
index e39e434b48cd..fcb5241f3fcc 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
* Copyright 2019 Joyent, Inc.
*/
@@ -323,6 +324,28 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
vd->vdev_removing);
}
+
+ /* zpool command expects alloc class data */
+ if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
+ const char *bias = NULL;
+
+ switch (vd->vdev_alloc_bias) {
+ case VDEV_BIAS_LOG:
+ bias = VDEV_ALLOC_BIAS_LOG;
+ break;
+ case VDEV_BIAS_SPECIAL:
+ bias = VDEV_ALLOC_BIAS_SPECIAL;
+ break;
+ case VDEV_BIAS_DEDUP:
+ bias = VDEV_ALLOC_BIAS_DEDUP;
+ break;
+ default:
+ ASSERT3U(vd->vdev_alloc_bias, ==,
+ VDEV_BIAS_NONE);
+ }
+ fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ bias);
+ }
}
if (vd->vdev_dtl_sm != NULL) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
index 20fa9c24db24..bef33de3ffa3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
@@ -950,14 +950,17 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
ASSERT3U(size, <=, maxalloc);
/*
- * We use allocator 0 for this I/O because we don't expect device remap
- * to be the steady state of the system, so parallelizing is not as
- * critical as it is for other allocation types. We also want to ensure
- * that the IOs are allocated together as much as possible, to reduce
- * mapping sizes.
+ * An allocation class might not have any remaining vdevs or space
*/
- int error = metaslab_alloc_dva(spa, mg->mg_class, size,
- &dst, 0, NULL, txg, 0, zal, 0);
+ metaslab_class_t *mc = mg->mg_class;
+ if (mc != spa_normal_class(spa) && mc->mc_groups <= 1)
+ mc = spa_normal_class(spa);
+ int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
+ zal, 0);
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+ &dst, 0, NULL, txg, 0, zal, 0);
+ }
if (error != 0)
return (error);
@@ -1869,15 +1872,31 @@ spa_vdev_remove_top_check(vdev_t *vd)
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
return (SET_ERROR(ENOTSUP));
+ /* available space in the pool's normal class */
+ uint64_t available = dsl_dir_space_available(
+ spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
+
+ metaslab_class_t *mc = vd->vdev_mg->mg_class;
+
+ /*
+ * When removing a vdev from an allocation class that has
+ * remaining vdevs, include available space from the class.
+ */
+ if (mc != spa_normal_class(spa) && mc->mc_groups > 1) {
+ uint64_t class_avail = metaslab_class_get_space(mc) -
+ metaslab_class_get_alloc(mc);
+
+ /* add class space, adjusted for overhead */
+ available += (class_avail * 94) / 100;
+ }
+
/*
* There has to be enough free space to remove the
* device and leave double the "slop" space (i.e. we
* must leave at least 3% of the pool free, in addition to
* the normal slop space).
*/
- if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
- NULL, 0, B_TRUE) <
- vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+ if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
return (SET_ERROR(ENOSPC));
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
index 703ff56deb52..26bb522a8427 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
@@ -4325,6 +4325,15 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
}
break;
+ case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
+ /*
+ * This property could require the allocation classes
+ * feature to be active for setting, however we allow
+ * it so that tests of settable properties succeed.
+ * The CLI will issue a warning in this case.
+ */
+ break;
+
case ZFS_PROP_SHARESMB:
if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
return (SET_ERROR(ENOTSUP));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index ebd498ea5781..3b5dc482d55b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -23,6 +23,7 @@
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2017, Intel Corporation.
*/
#include <sys/sysmacros.h>
@@ -710,6 +711,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bookmark = *zb;
if (pio != NULL) {
+ if (zio->io_metaslab_class == NULL)
+ zio->io_metaslab_class = pio->io_metaslab_class;
if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
@@ -1206,9 +1209,8 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
*/
if (flags & ZIO_FLAG_IO_ALLOCATING &&
(vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
- metaslab_class_t *mc = spa_normal_class(pio->io_spa);
-
- ASSERT(mc->mc_alloc_throttle_enabled);
+ ASSERT(pio->io_metaslab_class != NULL);
+ ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
ASSERT(type == ZIO_TYPE_WRITE);
ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
@@ -1524,8 +1526,9 @@ zio_write_compress(zio_t *zio)
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
BP_GET_PSIZE(bp) == psize &&
pass >= zfs_sync_pass_rewrite) {
- ASSERT(psize != 0);
+ VERIFY3U(psize, !=, 0);
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
+
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
zio->io_flags |= ZIO_FLAG_IO_REWRITE;
} else {
@@ -2952,7 +2955,7 @@ zio_io_to_allocate(spa_t *spa, int allocator)
* reserve then we throttle.
*/
ASSERT3U(zio->io_allocator, ==, allocator);
- if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
+ if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
return (NULL);
}
@@ -2968,9 +2971,14 @@ zio_dva_throttle(zio_t *zio)
{
spa_t *spa = zio->io_spa;
zio_t *nio;
+ metaslab_class_t *mc;
+
+ /* locate an appropriate allocation class */
+ mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
+ zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
- !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
+ !mc->mc_alloc_throttle_enabled ||
zio->io_child_type == ZIO_CHILD_GANG ||
zio->io_flags & ZIO_FLAG_NODATA) {
return (zio);
@@ -2992,17 +3000,16 @@ zio_dva_throttle(zio_t *zio)
zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
-
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_metaslab_class = mc;
avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
-
- nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator);
+ nio = zio_io_to_allocate(spa, zio->io_allocator);
mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
return (nio);
}
-void
+static void
zio_allocate_dispatch(spa_t *spa, int allocator)
{
zio_t *zio;
@@ -3022,7 +3029,7 @@ static zio_t *
zio_dva_allocate(zio_t *zio)
{
spa_t *spa = zio->io_spa;
- metaslab_class_t *mc = spa_normal_class(spa);
+ metaslab_class_t *mc;
blkptr_t *bp = zio->io_bp;
int error;
int flags = 0;
@@ -3038,20 +3045,57 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
- if (zio->io_flags & ZIO_FLAG_NODATA) {
+ if (zio->io_flags & ZIO_FLAG_NODATA)
flags |= METASLAB_DONT_THROTTLE;
- }
- if (zio->io_flags & ZIO_FLAG_GANG_CHILD) {
+ if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
flags |= METASLAB_GANG_CHILD;
- }
- if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) {
+ if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
flags |= METASLAB_ASYNC_ALLOC;
+
+ /*
+ * if not already chosen, locate an appropriate allocation class
+ */
+ mc = zio->io_metaslab_class;
+ if (mc == NULL) {
+ mc = spa_preferred_class(spa, zio->io_size,
+ zio->io_prop.zp_type, zio->io_prop.zp_level,
+ zio->io_prop.zp_zpl_smallblk);
+ zio->io_metaslab_class = mc;
}
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
&zio->io_alloc_list, zio, zio->io_allocator);
+ /*
+ * Fallback to normal class when an alloc class is full
+ */
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ /*
+ * If throttling, transfer reservation over to normal class.
+ * The io_allocator slot can remain the same even though we
+ * are switching classes.
+ */
+ if (mc->mc_alloc_throttle_enabled &&
+ (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
+ metaslab_class_throttle_unreserve(mc,
+ zio->io_prop.zp_copies, zio->io_allocator, zio);
+ zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
+
+ mc = spa_normal_class(spa);
+ VERIFY(metaslab_class_throttle_reserve(mc,
+ zio->io_prop.zp_copies, zio->io_allocator, zio,
+ flags | METASLAB_MUST_RESERVE));
+ } else {
+ mc = spa_normal_class(spa);
+ }
+ zio->io_metaslab_class = mc;
+
+ error = metaslab_alloc(spa, mc, zio->io_size, bp,
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
+ &zio->io_alloc_list, zio, zio->io_allocator);
+ }
+
if (error != 0) {
zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
@@ -3119,6 +3163,15 @@ zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp,
ASSERT(txg > spa_syncing_txg(spa));
metaslab_trace_init(&io_alloc_list);
+
+ /*
+ * Block pointer fields are useful to metaslabs for stats and debugging.
+ * Fill in the obvious ones before calling into metaslab_alloc().
+ */
+ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+ BP_SET_PSIZE(new_bp, size);
+ BP_SET_LEVEL(new_bp, 0);
+
/*
* When allocating a zil block, we don't have information about
* the final destination of the block except the objset it's part
@@ -3721,13 +3774,15 @@ zio_ready(zio_t *zio)
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(zio->io_metaslab_class != NULL);
+
/*
* We were unable to allocate anything, unreserve and
* issue the next I/O to allocate.
*/
metaslab_class_throttle_unreserve(
- spa_normal_class(zio->io_spa),
- zio->io_prop.zp_copies, zio->io_allocator, zio);
+ zio->io_metaslab_class, zio->io_prop.zp_copies,
+ zio->io_allocator, zio);
zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
}
}
@@ -3809,14 +3864,15 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT(zio->io_logical != NULL);
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
+ ASSERT(zio->io_metaslab_class != NULL);
mutex_enter(&pio->io_lock);
metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
pio->io_allocator, B_TRUE);
mutex_exit(&pio->io_lock);
- metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
- 1, pio->io_allocator, pio);
+ metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
+ pio->io_allocator, pio);
/*
* Call into the pipeline to see if there is more work that
@@ -3835,7 +3891,6 @@ zio_done(zio_t *zio)
vdev_t *vd = zio->io_vd;
uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
- metaslab_class_t *mc = spa_normal_class(spa);
zio_link_t *zl = NULL;
/*
@@ -3854,7 +3909,8 @@ zio_done(zio_t *zio)
*/
if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
zio->io_child_type == ZIO_CHILD_VDEV) {
- ASSERT(mc->mc_alloc_throttle_enabled);
+ ASSERT(zio->io_metaslab_class != NULL);
+ ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
zio_dva_throttle_done(zio);
}
@@ -3866,10 +3922,12 @@ zio_done(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(bp != NULL);
+
metaslab_group_alloc_verify(spa, zio->io_bp, zio,
zio->io_allocator);
VERIFY(zfs_refcount_not_held(
- &mc->mc_alloc_slots[zio->io_allocator], zio));
+ &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator],
+ zio));
}
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
index f3816490dadc..f27129d78e3d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
@@ -27,6 +27,7 @@
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -167,6 +168,7 @@ typedef enum {
ZFS_PROP_PREV_SNAP,
ZFS_PROP_RECEIVE_RESUME_TOKEN,
ZFS_PROP_REMAPTXG, /* not exposed to the user */
+ ZFS_PROP_SPECIAL_SMALL_BLOCKS,
ZFS_NUM_PROPS
} zfs_prop_t;
@@ -611,6 +613,8 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */
#define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */
+#define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */
+
/*
* The persistent vdev state is stored as separate values rather than a single
* 'vdev_state' entry. This is because a device can be in multiple states, such
@@ -656,6 +660,14 @@ typedef struct zpool_load_policy {
#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \
"com.delphix:pool_checkpoint_sm"
+#define VDEV_TOP_ZAP_ALLOCATION_BIAS \
+ "org.zfsonlinux:allocation_bias"
+
+/* vdev metaslab allocation bias */
+#define VDEV_ALLOC_BIAS_LOG "log"
+#define VDEV_ALLOC_BIAS_SPECIAL "special"
+#define VDEV_ALLOC_BIAS_DEDUP "dedup"
+
#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \
"com.delphix:next_offset_to_initialize"
#define VDEV_LEAF_ZAP_INITIALIZE_STATE \