Diffstat (limited to 'sys/cddl/contrib')
-rw-r--r--  sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c | 393
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c | 6
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h | 1
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c | 73
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h | 6
-rw-r--r--  sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c | 8
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/Makefile.files | 1
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c | 116
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c | 277
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c | 135
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c | 30
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c | 12
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c | 82
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c | 7
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c | 109
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c | 89
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c | 11
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c | 9
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c | 33
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c | 23
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c | 620
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c | 23
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c | 28
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c | 281
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c | 50
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c | 14
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c | 79
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c | 814
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c | 18
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h | 2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h | 18
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h | 18
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h | 12
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h | 1
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h | 4
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h | 18
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h | 89
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h | 3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h | 22
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h | 19
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h | 114
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h | 3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h | 34
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h | 46
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h | 5
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h | 3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h | 4
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h | 13
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h | 3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c | 37
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c | 12
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c | 159
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c | 1
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c | 4
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c | 1
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c | 555
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c | 9
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c | 792
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c | 109
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c | 32
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c | 4
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c | 43
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c | 75
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c | 585
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c | 3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c | 237
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c | 83
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c | 287
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c | 9
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c | 2
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c | 19
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c | 80
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c | 12
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c | 52
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c | 14
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c | 38
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c | 396
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c | 18
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c | 28
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/avl.h | 8
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h | 38
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h | 3
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h | 29
83 files changed, 5650 insertions(+), 1903 deletions(-)
diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c
index 89a64ea1d960..c322a5bd2179 100644
--- a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c
+++ b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
*/
#include <sys/debug.h>
@@ -142,6 +143,13 @@ static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
#define NVPAIR2I_NVP(nvp) \
((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp)))
+#ifdef _KERNEL
+int nvpair_max_recursion = 20;
+#else
+int nvpair_max_recursion = 100;
+#endif
+
+uint64_t nvlist_hashtable_init_size = (1 << 4);
int
nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...)
@@ -250,6 +258,291 @@ nv_priv_alloc_embedded(nvpriv_t *priv)
return (emb_priv);
}
+static int
+nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets)
+{
+ ASSERT3P(priv->nvp_hashtable, ==, NULL);
+ ASSERT0(priv->nvp_nbuckets);
+ ASSERT0(priv->nvp_nentries);
+
+ i_nvp_t **tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *));
+ if (tab == NULL)
+ return (ENOMEM);
+
+ priv->nvp_hashtable = tab;
+ priv->nvp_nbuckets = buckets;
+ return (0);
+}
+
+static void
+nvt_tab_free(nvpriv_t *priv)
+{
+ i_nvp_t **tab = priv->nvp_hashtable;
+ if (tab == NULL) {
+ ASSERT0(priv->nvp_nbuckets);
+ ASSERT0(priv->nvp_nentries);
+ return;
+ }
+
+ nv_mem_free(priv, tab, priv->nvp_nbuckets * sizeof (i_nvp_t *));
+
+ priv->nvp_hashtable = NULL;
+ priv->nvp_nbuckets = 0;
+ priv->nvp_nentries = 0;
+}
+
+static uint32_t
+nvt_hash(const char *p)
+{
+ uint32_t g, hval = 0;
+
+ while (*p) {
+ hval = (hval << 4) + *p++;
+ if ((g = (hval & 0xf0000000)) != 0)
+ hval ^= g >> 24;
+ hval &= ~g;
+ }
+ return (hval);
+}
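
For reference, nvt_hash() above is the classic ELF/PJW string hash. Since the
table's bucket count is always kept at a power of two, lookups mask the hash
with (nbuckets - 1) rather than computing a modulo. A minimal standalone
sketch of that arithmetic (the main() driver and sample name are illustrative,
not part of the patch):

	#include <stdio.h>
	#include <stdint.h>

	/* Standalone copy of the ELF-style hash used by nvt_hash(). */
	static uint32_t
	hash_name(const char *p)
	{
		uint32_t g, hval = 0;

		while (*p) {
			hval = (hval << 4) + *p++;
			if ((g = (hval & 0xf0000000)) != 0)
				hval ^= g >> 24;
			hval &= ~g;
		}
		return (hval);
	}

	int
	main(void)
	{
		uint32_t nbuckets = 1 << 4;	/* nvlist_hashtable_init_size */
		const char *name = "com.delphix:spacemap_v2";

		/* With power-of-two buckets, masking equals modulo. */
		printf("hash=0x%08x bucket=%u\n", hash_name(name),
		    hash_name(name) & (nbuckets - 1));
		return (0);
	}
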
+
+static boolean_t
+nvt_nvpair_match(nvpair_t *nvp1, nvpair_t *nvp2, uint32_t nvflag)
+{
+ boolean_t match = B_FALSE;
+ if (nvflag & NV_UNIQUE_NAME_TYPE) {
+ if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0 &&
+ NVP_TYPE(nvp1) == NVP_TYPE(nvp2))
+ match = B_TRUE;
+ } else {
+ ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME);
+ if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0)
+ match = B_TRUE;
+ }
+ return (match);
+}
+
+static nvpair_t *
+nvt_lookup_name_type(nvlist_t *nvl, const char *name, data_type_t type)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ ASSERT(priv != NULL);
+
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ if (tab == NULL) {
+ ASSERT3P(priv->nvp_list, ==, NULL);
+ ASSERT0(priv->nvp_nbuckets);
+ ASSERT0(priv->nvp_nentries);
+ return (NULL);
+ } else {
+ ASSERT(priv->nvp_nbuckets != 0);
+ }
+
+ uint64_t hash = nvt_hash(name);
+ uint64_t index = hash & (priv->nvp_nbuckets - 1);
+
+ ASSERT3U(index, <, priv->nvp_nbuckets);
+ i_nvp_t *entry = tab[index];
+
+ for (i_nvp_t *e = entry; e != NULL; e = e->nvi_hashtable_next) {
+ if (strcmp(NVP_NAME(&e->nvi_nvp), name) == 0 &&
+ (type == DATA_TYPE_DONTCARE ||
+ NVP_TYPE(&e->nvi_nvp) == type))
+ return (&e->nvi_nvp);
+ }
+ return (NULL);
+}
+
+static nvpair_t *
+nvt_lookup_name(nvlist_t *nvl, const char *name)
+{
+ return (nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE));
+}
+
+static int
+nvt_resize(nvpriv_t *priv, uint32_t new_size)
+{
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ /*
+ * Migrate all the entries from the current table
+ * to a newly-allocated table with the new size by
+ * re-adjusting the pointers of their entries.
+ */
+ uint32_t size = priv->nvp_nbuckets;
+ uint32_t new_mask = new_size - 1;
+ ASSERT(((new_size) & ((new_size) - 1)) == 0);
+
+ i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *));
+ if (new_tab == NULL)
+ return (ENOMEM);
+
+ uint32_t nentries = 0;
+ for (uint32_t i = 0; i < size; i++) {
+ i_nvp_t *next, *e = tab[i];
+
+ while (e != NULL) {
+ next = e->nvi_hashtable_next;
+
+ uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp));
+ uint32_t index = hash & new_mask;
+
+ e->nvi_hashtable_next = new_tab[index];
+ new_tab[index] = e;
+ nentries++;
+
+ e = next;
+ }
+ tab[i] = NULL;
+ }
+ ASSERT3U(nentries, ==, priv->nvp_nentries);
+
+ nvt_tab_free(priv);
+
+ priv->nvp_hashtable = new_tab;
+ priv->nvp_nbuckets = new_size;
+ priv->nvp_nentries = nentries;
+
+ return (0);
+}
+
+static boolean_t
+nvt_needs_togrow(nvpriv_t *priv)
+{
+ /*
+ * Grow only when we have more elements than buckets
+ * and the # of buckets doesn't overflow.
+ */
+ return (priv->nvp_nentries > priv->nvp_nbuckets &&
+ (UINT32_MAX >> 1) >= priv->nvp_nbuckets);
+}
+
+/*
+ * Allocate a new table that's twice the size of the old one,
+ * and migrate all the entries from the old one to the new
+ * one by re-adjusting their pointers.
+ */
+static int
+nvt_grow(nvpriv_t *priv)
+{
+ uint32_t current_size = priv->nvp_nbuckets;
+ /* ensure we won't overflow */
+ ASSERT3U(UINT32_MAX >> 1, >=, current_size);
+ return (nvt_resize(priv, current_size << 1));
+}
+
+static boolean_t
+nvt_needs_toshrink(nvpriv_t *priv)
+{
+ /*
+ * Shrink only when the # of elements is less than or
+ * equal to 1/4 the # of buckets. Never shrink less than
+ * nvlist_hashtable_init_size.
+ */
+ ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size);
+ if (priv->nvp_nbuckets == nvlist_hashtable_init_size)
+ return (B_FALSE);
+ return (priv->nvp_nentries <= (priv->nvp_nbuckets >> 2));
+}
+
+/*
+ * Allocate a new table that's half the size of the old one,
+ * and migrate all the entries from the old one to the new
+ * one by re-adjusting their pointers.
+ */
+static int
+nvt_shrink(nvpriv_t *priv)
+{
+ uint32_t current_size = priv->nvp_nbuckets;
+ /* ensure we won't overflow */
+ ASSERT3U(current_size, >=, nvlist_hashtable_init_size);
+ return (nvt_resize(priv, current_size >> 1));
+}
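
The grow and shrink triggers are deliberately asymmetric (grow once there is
more than one entry per bucket, shrink only at a quarter occupancy, never
below nvlist_hashtable_init_size), so the table does not thrash when the entry
count oscillates around a single boundary. A hypothetical driver tracing just
the bucket arithmetic (no real table is allocated):

	#include <stdio.h>
	#include <stdint.h>

	#define	INIT_SIZE	(1u << 4)	/* nvlist_hashtable_init_size */

	int
	main(void)
	{
		uint32_t nbuckets = INIT_SIZE;
		uint32_t nentries;

		/* Inserts: grow when entries exceed buckets. */
		for (nentries = 1; nentries <= 64; nentries++) {
			if (nentries > nbuckets &&
			    nbuckets <= (UINT32_MAX >> 1))
				nbuckets <<= 1;		/* nvt_grow() */
		}
		printf("after 64 inserts: %u buckets\n", nbuckets);	/* 64 */

		/* Removals: shrink at <= 1/4 occupancy. */
		for (nentries = 64; nentries-- > 0; ) {
			if (nbuckets > INIT_SIZE &&
			    nentries <= (nbuckets >> 2))
				nbuckets >>= 1;		/* nvt_shrink() */
		}
		printf("after removals: %u buckets\n", nbuckets);	/* 16 */
		return (0);
	}
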
+
+static int
+nvt_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+
+ if (nvt_needs_toshrink(priv)) {
+ int err = nvt_shrink(priv);
+ if (err != 0)
+ return (err);
+ }
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ char *name = NVP_NAME(nvp);
+ uint64_t hash = nvt_hash(name);
+ uint64_t index = hash & (priv->nvp_nbuckets - 1);
+
+ ASSERT3U(index, <, priv->nvp_nbuckets);
+ i_nvp_t *bucket = tab[index];
+
+ for (i_nvp_t *prev = NULL, *e = bucket;
+ e != NULL; prev = e, e = e->nvi_hashtable_next) {
+ if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_flag)) {
+ if (prev != NULL) {
+ prev->nvi_hashtable_next =
+ e->nvi_hashtable_next;
+ } else {
+ ASSERT3P(e, ==, bucket);
+ tab[index] = e->nvi_hashtable_next;
+ }
+ e->nvi_hashtable_next = NULL;
+ priv->nvp_nentries--;
+ break;
+ }
+ }
+
+ return (0);
+}
+
+static int
+nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+
+ /* initialize nvpair table now if it doesn't exist. */
+ if (priv->nvp_hashtable == NULL) {
+ int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size);
+ if (err != 0)
+ return (err);
+ }
+
+ /*
+ * if we don't allow duplicate entries, make sure to
+ * unlink any existing entries from the table.
+ */
+ if (nvl->nvl_nvflag != 0) {
+ int err = nvt_remove_nvpair(nvl, nvp);
+ if (err != 0)
+ return (err);
+ }
+
+ if (nvt_needs_togrow(priv)) {
+ int err = nvt_grow(priv);
+ if (err != 0)
+ return (err);
+ }
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ char *name = NVP_NAME(nvp);
+ uint64_t hash = nvt_hash(name);
+ uint64_t index = hash & (priv->nvp_nbuckets - 1);
+
+ ASSERT3U(index, <, priv->nvp_nbuckets);
+ i_nvp_t *bucket = tab[index];
+
+ /* insert link at the beginning of the bucket */
+ i_nvp_t *new_entry = NVPAIR2I_NVP(nvp);
+ ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL);
+ new_entry->nvi_hashtable_next = bucket;
+ tab[index] = new_entry;
+
+ priv->nvp_nentries++;
+ return (0);
+}
+
static void
nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv)
{
@@ -582,6 +875,7 @@ nvlist_free(nvlist_t *nvl)
else
nvl->nvl_priv = 0;
+ nvt_tab_free(priv);
nv_mem_free(priv, priv, sizeof (nvpriv_t));
}
@@ -642,26 +936,14 @@ nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva)
int
nvlist_remove_all(nvlist_t *nvl, const char *name)
{
- nvpriv_t *priv;
- i_nvp_t *curr;
int error = ENOENT;
- if (nvl == NULL || name == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
return (EINVAL);
- curr = priv->nvp_list;
- while (curr != NULL) {
- nvpair_t *nvp = &curr->nvi_nvp;
-
- curr = curr->nvi_next;
- if (strcmp(name, NVP_NAME(nvp)) != 0)
- continue;
-
- nvp_buf_unlink(nvl, nvp);
- nvpair_free(nvp);
- nvp_buf_free(nvl, nvp);
-
+ nvpair_t *nvp;
+ while ((nvp = nvt_lookup_name(nvl, name)) != NULL) {
+ VERIFY0(nvlist_remove_nvpair(nvl, nvp));
error = 0;
}
@@ -674,28 +956,14 @@ nvlist_remove_all(nvlist_t *nvl, const char *name)
int
nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
{
- nvpriv_t *priv;
- i_nvp_t *curr;
-
- if (nvl == NULL || name == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
return (EINVAL);
- curr = priv->nvp_list;
- while (curr != NULL) {
- nvpair_t *nvp = &curr->nvi_nvp;
-
- if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) {
- nvp_buf_unlink(nvl, nvp);
- nvpair_free(nvp);
- nvp_buf_free(nvl, nvp);
-
- return (0);
- }
- curr = curr->nvi_next;
- }
+ nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type);
+ if (nvp == NULL)
+ return (ENOENT);
- return (ENOENT);
+ return (nvlist_remove_nvpair(nvl, nvp));
}
int
@@ -704,6 +972,10 @@ nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
if (nvl == NULL || nvp == NULL)
return (EINVAL);
+ int err = nvt_remove_nvpair(nvl, nvp);
+ if (err != 0)
+ return (err);
+
nvp_buf_unlink(nvl, nvp);
nvpair_free(nvp);
nvp_buf_free(nvl, nvp);
@@ -908,6 +1180,8 @@ nvlist_add_common(nvlist_t *nvl, const char *name,
/* calculate sizes of the nvpair elements and the nvpair itself */
name_sz = strlen(name) + 1;
+ if (name_sz >= 1ULL << (sizeof (nvp->nvp_name_sz) * 8 - 1))
+ return (EINVAL);
nvp_sz = NVP_SIZE_CALC(name_sz, value_sz);
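
The new length check guards nvp_name_sz, a signed 16-bit field in nvpair_t, so
any name whose size (strlen + 1) does not fit in 15 bits is rejected with
EINVAL before the pair is sized. The same arithmetic, spelled out (the field
width is assumed from the nvpair_t definition; the sample length is
arbitrary):

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t name_sz = 40000 + 1;	/* strlen(name) + 1 */

		/* An int16_t only holds values below 1 << 15 = 32768. */
		if (name_sz >= 1ULL << (sizeof (int16_t) * 8 - 1))
			printf("EINVAL: name too long for nvp_name_sz\n");
		return (0);
	}
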
@@ -979,6 +1253,12 @@ nvlist_add_common(nvlist_t *nvl, const char *name,
else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE)
(void) nvlist_remove(nvl, name, type);
+ err = nvt_add_nvpair(nvl, nvp);
+ if (err != 0) {
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
nvp_buf_link(nvl, nvp);
return (0);
@@ -1328,25 +1608,17 @@ static int
nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type,
uint_t *nelem, void *data)
{
- nvpriv_t *priv;
- nvpair_t *nvp;
- i_nvp_t *curr;
-
- if (name == NULL || nvl == NULL ||
- (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
return (EINVAL);
if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE)))
return (ENOTSUP);
- for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
- nvp = &curr->nvi_nvp;
-
- if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type)
- return (nvpair_value_common(nvp, type, nelem, data));
- }
+ nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type);
+ if (nvp == NULL)
+ return (ENOENT);
- return (ENOENT);
+ return (nvpair_value_common(nvp, type, nelem, data));
}
int
@@ -2018,6 +2290,7 @@ typedef struct {
const nvs_ops_t *nvs_ops;
void *nvs_private;
nvpriv_t *nvs_priv;
+ int nvs_recursion;
} nvstream_t;
/*
@@ -2103,6 +2376,12 @@ nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl)
return (EFAULT);
}
+ err = nvt_add_nvpair(nvl, nvp);
+ if (err != 0) {
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
nvp_buf_link(nvl, nvp);
}
return (err);
@@ -2169,9 +2448,16 @@ static int
nvs_embedded(nvstream_t *nvs, nvlist_t *embedded)
{
switch (nvs->nvs_op) {
- case NVS_OP_ENCODE:
- return (nvs_operation(nvs, embedded, NULL));
+ case NVS_OP_ENCODE: {
+ int err;
+ if (nvs->nvs_recursion >= nvpair_max_recursion)
+ return (EINVAL);
+ nvs->nvs_recursion++;
+ err = nvs_operation(nvs, embedded, NULL);
+ nvs->nvs_recursion--;
+ return (err);
+ }
case NVS_OP_DECODE: {
nvpriv_t *priv;
int err;
@@ -2184,8 +2470,14 @@ nvs_embedded(nvstream_t *nvs, nvlist_t *embedded)
nvlist_init(embedded, embedded->nvl_nvflag, priv);
+ if (nvs->nvs_recursion >= nvpair_max_recursion) {
+ nvlist_free(embedded);
+ return (EINVAL);
+ }
+ nvs->nvs_recursion++;
if ((err = nvs_operation(nvs, embedded, NULL)) != 0)
nvlist_free(embedded);
+ nvs->nvs_recursion--;
return (err);
}
default:
@@ -2273,6 +2565,7 @@ nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding,
return (EINVAL);
nvs.nvs_op = nvs_op;
+ nvs.nvs_recursion = 0;
/*
* For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and
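
With nvs_recursion threaded through the stream state, both encode and decode
now refuse nvlists nested deeper than nvpair_max_recursion (20 in the kernel,
100 in userland) instead of recursing until the stack overflows. A hedged
userland sketch (assumes libnvpair is available and linked with -lnvpair; the
depth of 200 is arbitrary, and the expected result is EINVAL from
nvlist_pack()):

	#include <libnvpair.h>
	#include <stdio.h>
	#include <stdlib.h>

	int
	main(void)
	{
		nvlist_t *cur = NULL;
		char *buf = NULL;
		size_t len = 0;

		(void) nvlist_alloc(&cur, NV_UNIQUE_NAME, 0);
		for (int i = 0; i < 200; i++) {
			nvlist_t *parent;

			/* nvlist_add_nvlist() copies, so wrap bottom-up. */
			(void) nvlist_alloc(&parent, NV_UNIQUE_NAME, 0);
			(void) nvlist_add_nvlist(parent, "child", cur);
			nvlist_free(cur);
			cur = parent;
		}
		printf("nvlist_pack: %d\n",
		    nvlist_pack(cur, &buf, &len, NV_ENCODE_NATIVE, 0));
		nvlist_free(cur);
		free(buf);
		return (0);
	}
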
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
index 5f3d22f703bc..67774cddb8c9 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
@@ -229,6 +229,12 @@ zpool_feature_init(void)
"Pool state can be checkpointed, allowing rewind later.",
ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+ zfeature_register(SPA_FEATURE_SPACEMAP_V2,
+ "com.delphix:spacemap_v2", "spacemap_v2",
+ "Space maps representing large segments are more efficient.",
+ ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ NULL);
+
static const spa_feature_t large_blocks_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_NONE
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
index 12bd4ffe1ccc..1972ba397fae 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
@@ -60,6 +60,7 @@ typedef enum spa_feature {
SPA_FEATURE_DEVICE_REMOVAL,
SPA_FEATURE_OBSOLETE_COUNTS,
SPA_FEATURE_POOL_CHECKPOINT,
+ SPA_FEATURE_SPACEMAP_V2,
SPA_FEATURES
} spa_feature_t;
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
index 09975125261b..bad8f20e6917 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
/*
@@ -34,8 +34,6 @@
* name is invalid. In the kernel, we only care whether it's valid or not.
* Each routine therefore takes a 'namecheck_err_t' which describes exactly why
* the name failed to validate.
- *
- * Each function returns 0 on success, -1 on error.
*/
#if defined(_KERNEL)
@@ -50,6 +48,14 @@
#include "zfs_namecheck.h"
#include "zfs_deleg.h"
+/*
+ * Deeply nested datasets can overflow the stack, so we put a limit
+ * on the amount of nesting a path can have. zfs_max_dataset_nesting
+ * can be tuned temporarily to fix existing datasets that exceed our
+ * predefined limit.
+ */
+int zfs_max_dataset_nesting = 50;
+
static int
valid_char(char c)
{
@@ -60,10 +66,35 @@ valid_char(char c)
}
/*
+ * Looks at a path and returns its level of nesting (depth).
+ */
+int
+get_dataset_depth(const char *path)
+{
+ const char *loc = path;
+ int nesting = 0;
+
+ /*
+ * Keep track of nesting until you hit the end of the
+	 * path or find the snapshot/bookmark separator.
+ */
+ for (int i = 0; loc[i] != '\0' &&
+ loc[i] != '@' &&
+ loc[i] != '#'; i++) {
+ if (loc[i] == '/')
+ nesting++;
+ }
+
+ return (nesting);
+}
+
+/*
* Snapshot names must be made up of alphanumeric characters plus the following
* characters:
*
- * [-_.: ]
+ * [-_.: ]
+ *
+ * Returns 0 on success, -1 on error.
*/
int
zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what)
@@ -99,6 +130,8 @@ zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what)
* Permissions set name must start with the letter '@' followed by the
* same character restrictions as snapshot names, except that the name
* cannot exceed 64 characters.
+ *
+ * Returns 0 on success, -1 on error.
*/
int
permset_namecheck(const char *path, namecheck_err_t *why, char *what)
@@ -121,28 +154,40 @@ permset_namecheck(const char *path, namecheck_err_t *why, char *what)
}
/*
+ * Dataset paths should not be deeper than zfs_max_dataset_nesting
+ * in terms of nesting.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+dataset_nestcheck(const char *path)
+{
+ return ((get_dataset_depth(path) < zfs_max_dataset_nesting) ? 0 : -1);
+}
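
dataset_nestcheck() simply compares the computed depth against the tunable, so
a path like "tank/a/b@snap" counts two levels and passes; only the portion
before the '@' or '#' separator is scanned. A standalone sketch of the same
walk (the driver is hypothetical, not from the patch):

	#include <stdio.h>

	static int
	depth(const char *path)
	{
		int nesting = 0;

		/* Stop at NUL or at the snapshot/bookmark separator. */
		for (int i = 0; path[i] != '\0' && path[i] != '@' &&
		    path[i] != '#'; i++) {
			if (path[i] == '/')
				nesting++;
		}
		return (nesting);
	}

	int
	main(void)
	{
		printf("depth=%d\n", depth("tank/a/b@snap"));	/* 2 */
		printf("ok=%d\n", depth("tank/a/b@snap") < 50);	/* 1 */
		return (0);
	}
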
+
+/*
* Entity names must be of the following form:
*
- * [component/]*[component][(@|#)component]?
+ * [component/]*[component][(@|#)component]?
*
* Where each component is made up of alphanumeric characters plus the following
* characters:
*
- * [-_.:%]
+ * [-_.:%]
*
* We allow '%' here as we use that character internally to create unique
* names for temporary clones (for online recv).
+ *
+ * Returns 0 on success, -1 on error.
*/
int
entity_namecheck(const char *path, namecheck_err_t *why, char *what)
{
- const char *start, *end;
- int found_delim;
+ const char *end;
/*
* Make sure the name is not too long.
*/
-
if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) {
if (why)
*why = NAME_ERR_TOOLONG;
@@ -162,8 +207,8 @@ entity_namecheck(const char *path, namecheck_err_t *why, char *what)
return (-1);
}
- start = path;
- found_delim = 0;
+ const char *start = path;
+ boolean_t found_delim = B_FALSE;
for (;;) {
/* Find the end of this component */
end = start;
@@ -198,7 +243,7 @@ entity_namecheck(const char *path, namecheck_err_t *why, char *what)
return (-1);
}
- found_delim = 1;
+ found_delim = B_TRUE;
}
/* Zero-length components are not allowed */
@@ -250,6 +295,8 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
* mountpoint names must be of the following form:
*
* /[component][/]*[component][/]
+ *
+ * Returns 0 on success, -1 on error.
*/
int
mountpoint_namecheck(const char *path, namecheck_err_t *why)
@@ -294,6 +341,8 @@ mountpoint_namecheck(const char *path, namecheck_err_t *why)
* dataset names, with the additional restriction that the pool name must begin
* with a letter. The pool names 'raidz' and 'mirror' are also reserved names
* that cannot be used.
+ *
+ * Returns 0 on success, -1 on error.
*/
int
pool_namecheck(const char *pool, namecheck_err_t *why, char *what)
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
index db70641dbab2..527db92b0cfa 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#ifndef _ZFS_NAMECHECK_H
@@ -48,9 +48,13 @@ typedef enum {
#define ZFS_PERMSET_MAXLEN 64
+extern int zfs_max_dataset_nesting;
+
+int get_dataset_depth(const char *);
int pool_namecheck(const char *, namecheck_err_t *, char *);
int entity_namecheck(const char *, namecheck_err_t *, char *);
int dataset_namecheck(const char *, namecheck_err_t *, char *);
+int dataset_nestcheck(const char *);
int mountpoint_namecheck(const char *, namecheck_err_t *);
int zfs_component_namecheck(const char *, namecheck_err_t *, char *);
int permset_namecheck(const char *, namecheck_err_t *, char *);
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
index 5f7bcaba5450..880051800365 100644
--- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
+++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
@@ -427,6 +427,10 @@ zfs_prop_init(void)
zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count",
UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<count>", "SSCOUNT");
+ zprop_register_number(ZFS_PROP_GUID, "guid", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "GUID");
+ zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "CREATETXG");
/* inherit number properties */
zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
@@ -434,8 +438,6 @@ zfs_prop_init(void)
ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
/* hidden properties */
- zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
- PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "CREATETXG");
zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER,
PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG");
zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
@@ -447,8 +449,6 @@ zfs_prop_init(void)
zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME,
"STMF_SBD_LU");
- zprop_register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER,
- PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "GUID");
zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
"USERACCOUNTING");
diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
index 8b6720e619f1..27c5fa6e06d0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
+++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
@@ -124,6 +124,7 @@ ZFS_COMMON_OBJS += \
vdev_indirect.o \
vdev_indirect_births.o \
vdev_indirect_mapping.o \
+ vdev_initialize.o \
vdev_label.o \
vdev_mirror.o \
vdev_missing.o \
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 88bbf7ef7c7e..db7ca9889d47 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, Joyent, Inc.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc. All rights reserved.
*/
@@ -377,6 +377,13 @@ u_int zfs_arc_free_target = 0;
/* Absolute min for arc min / max is 16MB. */
static uint64_t arc_abs_min = 16 << 20;
+/*
+ * ARC dirty data constraints for arc_tempreserve_space() throttle
+ */
+uint_t zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */
+uint_t zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */
+uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */
+
boolean_t zfs_compressed_arc_enabled = B_TRUE;
static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
@@ -5148,12 +5155,13 @@ arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
arc_buf_t *buf, void *arg)
{
arc_buf_t **bufp = arg;
-
if (buf == NULL) {
+ ASSERT(zio == NULL || zio->io_error != 0);
*bufp = NULL;
} else {
+ ASSERT(zio == NULL || zio->io_error == 0);
*bufp = buf;
- ASSERT(buf->b_data);
+ ASSERT(buf->b_data != NULL);
}
}
@@ -5181,6 +5189,7 @@ arc_read_done(zio_t *zio)
arc_callback_t *callback_list;
arc_callback_t *acb;
boolean_t freeable = B_FALSE;
+ boolean_t no_zio_error = (zio->io_error == 0);
/*
* The hdr was inserted into hash-table and removed from lists
@@ -5206,7 +5215,7 @@ arc_read_done(zio_t *zio)
ASSERT3P(hash_lock, !=, NULL);
}
- if (zio->io_error == 0) {
+ if (no_zio_error) {
/* byteswap if necessary */
if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
if (BP_GET_LEVEL(zio->io_bp) > 0) {
@@ -5227,8 +5236,7 @@ arc_read_done(zio_t *zio)
callback_list = hdr->b_l1hdr.b_acb;
ASSERT3P(callback_list, !=, NULL);
- if (hash_lock && zio->io_error == 0 &&
- hdr->b_l1hdr.b_state == arc_anon) {
+ if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
/*
* Only call arc_access on anonymous buffers. This is because
* if we've issued an I/O for an evicted buffer, we've already
@@ -5251,20 +5259,38 @@ arc_read_done(zio_t *zio)
callback_cnt++;
- if (zio->io_error != 0)
- continue;
-
- int error = arc_buf_alloc_impl(hdr, acb->acb_private,
- acb->acb_compressed,
- B_TRUE, &acb->acb_buf);
- if (error != 0) {
- arc_buf_destroy(acb->acb_buf, acb->acb_private);
- acb->acb_buf = NULL;
+ if (no_zio_error) {
+ int error = arc_buf_alloc_impl(hdr, acb->acb_private,
+ acb->acb_compressed, zio->io_error == 0,
+ &acb->acb_buf);
+ if (error != 0) {
+ /*
+ * Decompression failed. Set io_error
+ * so that when we call acb_done (below),
+ * we will indicate that the read failed.
+ * Note that in the unusual case where one
+ * callback is compressed and another
+ * uncompressed, we will mark all of them
+ * as failed, even though the uncompressed
+ * one can't actually fail. In this case,
+ * the hdr will not be anonymous, because
+ * if there are multiple callbacks, it's
+ * because multiple threads found the same
+ * arc buf in the hash table.
+ */
+ zio->io_error = error;
+ }
}
-
- if (zio->io_error == 0)
- zio->io_error = error;
}
+ /*
+ * If there are multiple callbacks, we must have the hash lock,
+ * because the only way for multiple threads to find this hdr is
+ * in the hash table. This ensures that if there are multiple
+ * callbacks, the hdr is not anonymous. If it were anonymous,
+ * we couldn't use arc_buf_destroy() in the error case below.
+ */
+ ASSERT(callback_cnt < 2 || hash_lock != NULL);
+
hdr->b_l1hdr.b_acb = NULL;
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
if (callback_cnt == 0) {
@@ -5276,7 +5302,7 @@ arc_read_done(zio_t *zio)
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
callback_list != NULL);
- if (zio->io_error == 0) {
+ if (no_zio_error) {
arc_hdr_verify(hdr, zio->io_bp);
} else {
arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
@@ -5309,7 +5335,16 @@ arc_read_done(zio_t *zio)
/* execute each callback and free its structure */
while ((acb = callback_list) != NULL) {
- if (acb->acb_done) {
+ if (acb->acb_done != NULL) {
+ if (zio->io_error != 0 && acb->acb_buf != NULL) {
+ /*
+ * If arc_buf_alloc_impl() fails during
+ * decompression, the buf will still be
+ * allocated, and needs to be freed here.
+ */
+ arc_buf_destroy(acb->acb_buf, acb->acb_private);
+ acb->acb_buf = NULL;
+ }
acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
acb->acb_buf, acb->acb_private);
}
@@ -6280,12 +6315,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
}
static int
-arc_memory_throttle(uint64_t reserve, uint64_t txg)
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
uint64_t available_memory = ptob(freemem);
- static uint64_t page_load = 0;
- static uint64_t last_txg = 0;
#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
available_memory = MIN(available_memory, uma_avail());
@@ -6294,9 +6327,9 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg)
if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
return (0);
- if (txg > last_txg) {
- last_txg = txg;
- page_load = 0;
+ if (txg > spa->spa_lowmem_last_txg) {
+ spa->spa_lowmem_last_txg = txg;
+ spa->spa_lowmem_page_load = 0;
}
/*
* If we are in pageout, we know that memory is already tight,
@@ -6304,18 +6337,19 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg)
* continue to let page writes occur as quickly as possible.
*/
if (curproc == pageproc) {
- if (page_load > MAX(ptob(minfree), available_memory) / 4)
+ if (spa->spa_lowmem_page_load >
+ MAX(ptob(minfree), available_memory) / 4)
return (SET_ERROR(ERESTART));
/* Note: reserve is inflated, so we deflate */
- page_load += reserve / 8;
+ atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
return (0);
- } else if (page_load > 0 && arc_reclaim_needed()) {
+ } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
/* memory is low, delay before restarting */
ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
return (SET_ERROR(EAGAIN));
}
- page_load = 0;
-#endif
+ spa->spa_lowmem_page_load = 0;
+#endif /* _KERNEL */
return (0);
}
@@ -6327,7 +6361,7 @@ arc_tempreserve_clear(uint64_t reserve)
}
int
-arc_tempreserve_space(uint64_t reserve, uint64_t txg)
+arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
{
int error;
uint64_t anon_size;
@@ -6356,7 +6390,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
- error = arc_memory_throttle(reserve, txg);
+ error = arc_memory_throttle(spa, reserve, txg);
if (error != 0)
return (error);
@@ -6364,12 +6398,24 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
* Throttle writes when the amount of dirty data in the cache
* gets too large. We try to keep the cache less than half full
* of dirty blocks so that our sync times don't grow too large.
+ *
+ * In the case of one pool being built on another pool, we want
+ * to make sure we don't end up throttling the lower (backing)
+ * pool when the upper pool is the majority contributor to dirty
+	 * data. To ensure we make forward progress during throttling, we
+ * also check the current pool's net dirty data and only throttle
+ * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
+ * data in the cache.
+ *
* Note: if two requests come in concurrently, we might let them
* both succeed, when one of them should fail. Not a huge deal.
*/
+ uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
+ uint64_t spa_dirty_anon = spa_dirty_data(spa);
- if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
- anon_size > arc_c / 4) {
+ if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 &&
+ anon_size > arc_c * zfs_arc_anon_limit_percent / 100 &&
+ spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
uint64_t meta_esize =
refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
uint64_t data_esize =
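
The rewritten test throttles only when all three limits are exceeded: total
dirty data above zfs_arc_dirty_limit_percent of arc_c, anonymous data above
zfs_arc_anon_limit_percent of arc_c, and this pool contributing more than
zfs_arc_pool_dirty_percent of the anonymous dirty data. The third term is what
keeps a backing pool from being throttled on behalf of a pool stacked on top
of it. A worked sketch with made-up sizes:

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t arc_c = 1ULL << 30;		/* 1 GiB ARC target */
		uint64_t reserve = 64ULL << 20;		/* this reservation */
		uint64_t tempreserve = 0;
		uint64_t anon_size = 500ULL << 20;	/* all anon dirty data */
		uint64_t spa_dirty_anon = 350ULL << 20;	/* this pool's share */

		uint64_t total_dirty = reserve + tempreserve + anon_size;
		int throttle =
		    total_dirty > arc_c * 50 / 100 &&	/* dirty limit */
		    anon_size > arc_c * 25 / 100 &&	/* anon limit */
		    spa_dirty_anon > anon_size * 20 / 100; /* pool share */

		printf("throttle=%d\n", throttle);	/* 1: all exceeded */
		return (0);
	}
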
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index 533de180bf1c..1db7bfe02881 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -49,8 +49,7 @@
#include <sys/abd.h>
#include <sys/vdev.h>
#include <sys/cityhash.h>
-
-uint_t zfs_dbuf_evict_key;
+#include <sys/spa_impl.h>
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
@@ -74,24 +73,58 @@ static kcondvar_t dbuf_evict_cv;
static boolean_t dbuf_evict_thread_exit;
/*
- * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
- * are not currently held but have been recently released. These dbufs
- * are not eligible for arc eviction until they are aged out of the cache.
- * Dbufs are added to the dbuf cache once the last hold is released. If a
- * dbuf is later accessed and still exists in the dbuf cache, then it will
- * be removed from the cache and later re-added to the head of the cache.
- * Dbufs that are aged out of the cache will be immediately destroyed and
- * become eligible for arc eviction.
+ * There are two dbuf caches; each dbuf can only be in one of them at a time.
+ *
+ * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
+ * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
+ * that represent the metadata that describes filesystems/snapshots/
+ * bookmarks/properties/etc. We only evict from this cache when we export a
+ * pool, to short-circuit as much I/O as possible for all administrative
+ * commands that need the metadata. There is no eviction policy for this
+ * cache, because we try to only include types in it which would occupy a
+ * very small amount of space per object but create a large impact on the
+ * performance of these commands. Instead, after it reaches a maximum size
+ * (which should only happen on very small memory systems with a very large
+ * number of filesystem objects), we stop taking new dbufs into the
+ * metadata cache, instead putting them in the normal dbuf cache.
+ *
+ * 2. LRU cache of dbufs. The "dbuf cache" maintains a list of dbufs that
+ * are not currently held but have been recently released. These dbufs
+ * are not eligible for arc eviction until they are aged out of the cache.
+ * Dbufs that are aged out of the cache will be immediately destroyed and
+ * become eligible for arc eviction.
+ *
+ * Dbufs are added to these caches once the last hold is released. If a dbuf is
+ * later accessed and still exists in the dbuf cache, then it will be removed
+ * from the cache and later re-added to the head of the cache.
+ *
+ * If a given dbuf meets the requirements for the metadata cache, it will go
+ * there, otherwise it will be considered for the generic LRU dbuf cache. The
+ * caches and the refcounts tracking their sizes are stored in an array indexed
+ * by those caches' matching enum values (from dbuf_cached_state_t).
*/
-static multilist_t *dbuf_cache;
-static refcount_t dbuf_cache_size;
-uint64_t dbuf_cache_max_bytes = 0;
+typedef struct dbuf_cache {
+ multilist_t *cache;
+ refcount_t size;
+} dbuf_cache_t;
+dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
-/* Set the default size of the dbuf cache to log2 fraction of arc size. */
+/* Size limits for the caches */
+uint64_t dbuf_cache_max_bytes = 0;
+uint64_t dbuf_metadata_cache_max_bytes = 0;
+/* Set the default sizes of the caches to log2 fraction of arc size */
int dbuf_cache_shift = 5;
+int dbuf_metadata_cache_shift = 6;
/*
- * The dbuf cache uses a three-stage eviction policy:
+ * For diagnostic purposes, this is incremented whenever we can't add
+ * something to the metadata cache because it's full, and instead put
+ * the data in the regular dbuf cache.
+ */
+uint64_t dbuf_metadata_cache_overflow;
+
+/*
+ * The LRU dbuf cache uses a three-stage eviction policy:
* - A low water marker designates when the dbuf eviction thread
* should stop evicting from the dbuf cache.
* - When we reach the maximum size (aka mid water mark), we
@@ -404,6 +437,41 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
}
/*
+ * This returns whether this dbuf should be stored in the metadata cache, which
+ * is based on whether it's from one of the dnode types that store data related
+ * to traversing dataset hierarchies.
+ */
+static boolean_t
+dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
+{
+ DB_DNODE_ENTER(db);
+ dmu_object_type_t type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ /* Check if this dbuf is one of the types we care about */
+ if (DMU_OT_IS_METADATA_CACHED(type)) {
+ /* If we hit this, then we set something up wrong in dmu_ot */
+ ASSERT(DMU_OT_IS_METADATA(type));
+
+ /*
+ * Sanity check for small-memory systems: don't allocate too
+ * much memory for this purpose.
+ */
+ if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
+ dbuf_metadata_cache_max_bytes) {
+ dbuf_metadata_cache_overflow++;
+ DTRACE_PROBE1(dbuf__metadata__cache__overflow,
+ dmu_buf_impl_t *, db);
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
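
Taken together with the hold-release path further down, cache selection
behaves roughly like this simplified model (not the kernel code: the refcount
and multilist machinery is reduced to plain integers):

	#include <stdio.h>
	#include <stdint.h>

	typedef enum { DB_DBUF_CACHE, DB_DBUF_METADATA_CACHE } cache_t;

	static cache_t
	pick_cache(int metadata_cached, uint64_t meta_size, uint64_t meta_max)
	{
		/* Metadata-cached types use the metadata cache until full. */
		if (metadata_cached && meta_size <= meta_max)
			return (DB_DBUF_METADATA_CACHE);
		return (DB_DBUF_CACHE);	/* everything else ages via LRU */
	}

	int
	main(void)
	{
		printf("%d\n", pick_cache(1, 100, 1000));	/* metadata */
		printf("%d\n", pick_cache(1, 2000, 1000));	/* overflow */
		printf("%d\n", pick_cache(0, 0, 1000));		/* data */
		return (0);
	}
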
+
+/*
* This function *must* return indices evenly distributed between all
* sublists of the multilist. This is needed due to how the dbuf eviction
* code is laid out; dbuf_evict_thread() assumes dbufs are evenly
@@ -438,7 +506,7 @@ dbuf_cache_above_hiwater(void)
uint64_t dbuf_cache_hiwater_bytes =
(dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
- return (refcount_count(&dbuf_cache_size) >
+ return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
}
@@ -448,7 +516,7 @@ dbuf_cache_above_lowater(void)
uint64_t dbuf_cache_lowater_bytes =
(dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
- return (refcount_count(&dbuf_cache_size) >
+ return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
}
@@ -458,19 +526,12 @@ dbuf_cache_above_lowater(void)
static void
dbuf_evict_one(void)
{
- int idx = multilist_get_random_index(dbuf_cache);
- multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx);
+ int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
+ multilist_sublist_t *mls = multilist_sublist_lock(
+ dbuf_caches[DB_DBUF_CACHE].cache, idx);
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
- /*
- * Set the thread's tsd to indicate that it's processing evictions.
- * Once a thread stops evicting from the dbuf cache it will
- * reset its tsd to NULL.
- */
- ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
- (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
-
dmu_buf_impl_t *db = multilist_sublist_tail(mls);
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
db = multilist_sublist_prev(mls, db);
@@ -482,13 +543,14 @@ dbuf_evict_one(void)
if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
- (void) refcount_remove_many(&dbuf_cache_size,
+ (void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size,
db->db.db_size, db);
+ ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
+ db->db_caching_status = DB_NO_CACHE;
dbuf_destroy(db);
} else {
multilist_sublist_unlock(mls);
}
- (void) tsd_set(zfs_dbuf_evict_key, NULL);
}
/*
@@ -542,35 +604,13 @@ dbuf_evict_thread(void *unused __unused)
static void
dbuf_evict_notify(void)
{
-
- /*
- * We use thread specific data to track when a thread has
- * started processing evictions. This allows us to avoid deeply
- * nested stacks that would have a call flow similar to this:
- *
- * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
- * ^ |
- * | |
- * +-----dbuf_destroy()<--dbuf_evict_one()<--------+
- *
- * The dbuf_eviction_thread will always have its tsd set until
- * that thread exits. All other threads will only set their tsd
- * if they are participating in the eviction process. This only
- * happens if the eviction thread is unable to process evictions
- * fast enough. To keep the dbuf cache size in check, other threads
- * can evict from the dbuf cache directly. Those threads will set
- * their tsd values so that we ensure that they only evict one dbuf
- * from the dbuf cache.
- */
- if (tsd_get(zfs_dbuf_evict_key) != NULL)
- return;
-
/*
* We check if we should evict without holding the dbuf_evict_lock,
* because it's OK to occasionally make the wrong decision here,
* and grabbing the lock results in massive lock contention.
*/
- if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
+ if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
+ dbuf_cache_max_bytes) {
if (dbuf_cache_above_hiwater())
dbuf_evict_one();
cv_signal(&dbuf_evict_cv);
@@ -610,15 +650,21 @@ retry:
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
/*
- * Setup the parameters for the dbuf cache. We set the size of the
- * dbuf cache to 1/32nd (default) of the size of the ARC. If the value
- * has been set in /etc/system and it's not greater than the size of
- * the ARC, then we honor that value.
+ * Setup the parameters for the dbuf caches. We set the sizes of the
+ * dbuf cache and the metadata cache to 1/32nd and 1/16th (default)
+ * of the size of the ARC, respectively. If the values are set in
+ * /etc/system and they're not greater than the size of the ARC, then
+ * we honor that value.
*/
if (dbuf_cache_max_bytes == 0 ||
dbuf_cache_max_bytes >= arc_max_bytes()) {
dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift;
}
+ if (dbuf_metadata_cache_max_bytes == 0 ||
+ dbuf_metadata_cache_max_bytes >= arc_max_bytes()) {
+ dbuf_metadata_cache_max_bytes =
+ arc_max_bytes() >> dbuf_metadata_cache_shift;
+ }
/*
* All entries are queued via taskq_dispatch_ent(), so min/maxalloc
@@ -626,12 +672,14 @@ retry:
*/
dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
- dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t),
- offsetof(dmu_buf_impl_t, db_cache_link),
- dbuf_cache_multilist_index_func);
- refcount_create(&dbuf_cache_size);
+ for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+ dbuf_caches[dcs].cache =
+ multilist_create(sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_cache_link),
+ dbuf_cache_multilist_index_func);
+ refcount_create(&dbuf_caches[dcs].size);
+ }
- tsd_create(&zfs_dbuf_evict_key, NULL);
dbuf_evict_thread_exit = B_FALSE;
mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
@@ -658,13 +706,14 @@ dbuf_fini(void)
cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
}
mutex_exit(&dbuf_evict_lock);
- tsd_destroy(&zfs_dbuf_evict_key);
mutex_destroy(&dbuf_evict_lock);
cv_destroy(&dbuf_evict_cv);
- refcount_destroy(&dbuf_cache_size);
- multilist_destroy(dbuf_cache);
+ for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+ refcount_destroy(&dbuf_caches[dcs].size);
+ multilist_destroy(dbuf_caches[dcs].cache);
+ }
}
/*
@@ -915,8 +964,15 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
ASSERT(refcount_count(&db->db_holds) > 0);
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
- if (db->db_level == 0 && db->db_freed_in_flight) {
- /* we were freed in flight; disregard any error */
+ if (buf == NULL) {
+ /* i/o error */
+ ASSERT(zio == NULL || zio->io_error != 0);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT3P(db->db_buf, ==, NULL);
+ db->db_state = DB_UNCACHED;
+ } else if (db->db_level == 0 && db->db_freed_in_flight) {
+ /* freed in flight */
+ ASSERT(zio == NULL || zio->io_error == 0);
if (buf == NULL) {
buf = arc_alloc_buf(db->db_objset->os_spa,
db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
@@ -927,16 +983,14 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
- } else if (buf != NULL) {
+ } else {
+ /* success */
+ ASSERT(zio == NULL || zio->io_error == 0);
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
- } else {
- ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- ASSERT3P(db->db_buf, ==, NULL);
- db->db_state = DB_UNCACHED;
}
cv_broadcast(&db->db_changed);
- dbuf_rele_and_unlock(db, NULL);
+ dbuf_rele_and_unlock(db, NULL, B_FALSE);
}
static void
@@ -2051,9 +2105,15 @@ dbuf_destroy(dmu_buf_impl_t *db)
dbuf_clear_data(db);
if (multilist_link_active(&db->db_cache_link)) {
- multilist_remove(dbuf_cache, db);
- (void) refcount_remove_many(&dbuf_cache_size,
+ ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+ db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+ multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+ (void) refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
+
+ db->db_caching_status = DB_NO_CACHE;
}
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
@@ -2090,7 +2150,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
* value in dnode_move(), since DB_DNODE_EXIT doesn't actually
* release any lock.
*/
- dnode_rele(dn, db);
+ mutex_enter(&dn->dn_mtx);
+ dnode_rele_and_unlock(dn, db, B_TRUE);
db->db_dnode_handle = NULL;
dbuf_hash_remove(db);
@@ -2107,6 +2168,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
ASSERT(db->db_hash_next == NULL);
ASSERT(db->db_blkptr == NULL);
ASSERT(db->db_data_pending == NULL);
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
ASSERT(!multilist_link_active(&db->db_cache_link));
kmem_cache_free(dbuf_kmem_cache, db);
@@ -2116,8 +2178,10 @@ dbuf_destroy(dmu_buf_impl_t *db)
* If this dbuf is referenced from an indirect dbuf,
* decrement the ref count on the indirect dbuf.
*/
- if (parent && parent != dndb)
- dbuf_rele(parent, db);
+ if (parent && parent != dndb) {
+ mutex_enter(&parent->db_mtx);
+ dbuf_rele_and_unlock(parent, db, B_TRUE);
+ }
}
/*
@@ -2245,6 +2309,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DMU_BONUS_BLKID;
db->db_state = DB_UNCACHED;
+ db->db_caching_status = DB_NO_CACHE;
/* the bonus dbuf is not placed in the hash table */
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
return (db);
@@ -2277,6 +2342,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
avl_add(&dn->dn_dbufs, db);
db->db_state = DB_UNCACHED;
+ db->db_caching_status = DB_NO_CACHE;
mutex_exit(&dn->dn_dbufs_mtx);
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
@@ -2338,6 +2404,13 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
ASSERT3S(dpa->dpa_curlevel, >, 0);
+ if (abuf == NULL) {
+ ASSERT(zio == NULL || zio->io_error != 0);
+ kmem_free(dpa, sizeof (*dpa));
+ return;
+ }
+ ASSERT(zio == NULL || zio->io_error == 0);
+
/*
* The dpa_dnode is only valid if we are called with a NULL
* zio. This indicates that the arc_read() returned without
@@ -2619,9 +2692,15 @@ top:
if (multilist_link_active(&db->db_cache_link)) {
ASSERT(refcount_is_zero(&db->db_holds));
- multilist_remove(dbuf_cache, db);
- (void) refcount_remove_many(&dbuf_cache_size,
+ ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+ db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+ multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+ (void) refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
+
+ db->db_caching_status = DB_NO_CACHE;
}
(void) refcount_add(&db->db_holds, tag);
DBUF_VERIFY(db);
@@ -2734,7 +2813,7 @@ void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
mutex_enter(&db->db_mtx);
- dbuf_rele_and_unlock(db, tag);
+ dbuf_rele_and_unlock(db, tag, B_FALSE);
}
void
@@ -2745,10 +2824,19 @@ dmu_buf_rele(dmu_buf_t *db, void *tag)
/*
* dbuf_rele() for an already-locked dbuf. This is necessary to allow
- * db_dirtycnt and db_holds to be updated atomically.
+ * db_dirtycnt and db_holds to be updated atomically. The 'evicting'
+ * argument should be set if we are already in the dbuf-evicting code
+ * path, in which case we don't want to recursively evict. This allows us to
+ * avoid deeply nested stacks that would have a call flow similar to this:
+ *
+ * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
+ * ^ |
+ * | |
+ * +-----dbuf_destroy()<--dbuf_evict_one()<--------+
+ *
*/
void
-dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
{
int64_t holds;
@@ -2838,12 +2926,23 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
db->db_pending_evict) {
dbuf_destroy(db);
} else if (!multilist_link_active(&db->db_cache_link)) {
- multilist_insert(dbuf_cache, db);
- (void) refcount_add_many(&dbuf_cache_size,
+ ASSERT3U(db->db_caching_status, ==,
+ DB_NO_CACHE);
+
+ dbuf_cached_state_t dcs =
+ dbuf_include_in_metadata_cache(db) ?
+ DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+ db->db_caching_status = dcs;
+
+ multilist_insert(dbuf_caches[dcs].cache, db);
+ (void) refcount_add_many(&dbuf_caches[dcs].size,
db->db.db_size, db);
mutex_exit(&db->db_mtx);
- dbuf_evict_notify();
+ if (db->db_caching_status == DB_DBUF_CACHE &&
+ !evicting) {
+ dbuf_evict_notify();
+ }
}
if (do_arc_evict)
@@ -3108,7 +3207,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
kmem_free(dr, sizeof (dbuf_dirty_record_t));
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
- dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
return;
}
@@ -3458,7 +3557,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
db->db_data_pending = NULL;
- dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
}
static void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index 7bab517fba8c..bfd8e2d7c9ac 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -79,60 +79,60 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
int zfs_object_remap_one_indirect_delay_ticks = 0;
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
- { DMU_BSWAP_UINT8, TRUE, "unallocated" },
- { DMU_BSWAP_ZAP, TRUE, "object directory" },
- { DMU_BSWAP_UINT64, TRUE, "object array" },
- { DMU_BSWAP_UINT8, TRUE, "packed nvlist" },
- { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" },
- { DMU_BSWAP_UINT64, TRUE, "bpobj" },
- { DMU_BSWAP_UINT64, TRUE, "bpobj header" },
- { DMU_BSWAP_UINT64, TRUE, "SPA space map header" },
- { DMU_BSWAP_UINT64, TRUE, "SPA space map" },
- { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" },
- { DMU_BSWAP_DNODE, TRUE, "DMU dnode" },
- { DMU_BSWAP_OBJSET, TRUE, "DMU objset" },
- { DMU_BSWAP_UINT64, TRUE, "DSL directory" },
- { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"},
- { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" },
- { DMU_BSWAP_ZAP, TRUE, "DSL props" },
- { DMU_BSWAP_UINT64, TRUE, "DSL dataset" },
- { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" },
- { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" },
- { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS directory" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS master node" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" },
- { DMU_BSWAP_UINT8, FALSE, "zvol object" },
- { DMU_BSWAP_ZAP, TRUE, "zvol prop" },
- { DMU_BSWAP_UINT8, FALSE, "other uint8[]" },
- { DMU_BSWAP_UINT64, FALSE, "other uint64[]" },
- { DMU_BSWAP_ZAP, TRUE, "other ZAP" },
- { DMU_BSWAP_ZAP, TRUE, "persistent error log" },
- { DMU_BSWAP_UINT8, TRUE, "SPA history" },
- { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" },
- { DMU_BSWAP_ZAP, TRUE, "Pool properties" },
- { DMU_BSWAP_ZAP, TRUE, "DSL permissions" },
- { DMU_BSWAP_ACL, TRUE, "ZFS ACL" },
- { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" },
- { DMU_BSWAP_UINT8, TRUE, "FUID table" },
- { DMU_BSWAP_UINT64, TRUE, "FUID table size" },
- { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"},
- { DMU_BSWAP_ZAP, TRUE, "scan work queue" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" },
- { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" },
- { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"},
- { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" },
- { DMU_BSWAP_ZAP, TRUE, "DDT statistics" },
- { DMU_BSWAP_UINT8, TRUE, "System attributes" },
- { DMU_BSWAP_ZAP, TRUE, "SA master node" },
- { DMU_BSWAP_ZAP, TRUE, "SA attr registration" },
- { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" },
- { DMU_BSWAP_ZAP, TRUE, "scan translations" },
- { DMU_BSWAP_UINT8, FALSE, "deduplicated block" },
- { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" },
- { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" },
- { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" },
- { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" }
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" },
+ { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" },
+ { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" },
+ { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" },
+ { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" },
+ { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" },
+ { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" },
+ { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" },
+ { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" },
+ { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" },
+ { DMU_BSWAP_UINT8, TRUE, FALSE, "System attributes" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" },
+ { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" },
+ { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" },
+ { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" },
+ { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" },
+ { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" }
};
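For orientation: the rewritten table adds a third boolean column between the
existing metadata flag and the type name. A hedged sketch of the widened
per-type record this implies (the field name ot_dbuf_metadata_cache and the
reduced typedefs are assumptions, chosen because the types marked TRUE are
pool/DSL metadata objects):

typedef enum { B_FALSE, B_TRUE } boolean_t;	/* reduced for the sketch */
typedef int dmu_object_byteswap_t;	/* index into dmu_ot_byteswap[] */

typedef struct dmu_object_type_info {
	dmu_object_byteswap_t	ot_byteswap;
	boolean_t		ot_metadata;
	boolean_t		ot_dbuf_metadata_cache;	/* the new column */
	char			*ot_name;
} dmu_object_type_info_t;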
const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
@@ -449,7 +449,7 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
* and can induce severe lock contention when writing to several files
* whose dnodes are in the same block.
*/
-static int
+int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
@@ -1321,7 +1321,7 @@ xuio_stat_wbuf_nocopy(void)
}
#ifdef _KERNEL
-static int
+int
dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
{
dmu_buf_t **dbp;
@@ -1437,7 +1437,7 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
return (err);
}
-static int
+int
dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
@@ -1881,22 +1881,17 @@ dmu_return_arcbuf(arc_buf_t *buf)
* dmu_write().
*/
void
-dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
+dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
dmu_tx_t *tx)
{
- dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
- dnode_t *dn;
dmu_buf_impl_t *db;
uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
uint64_t blkid;
- DB_DNODE_ENTER(dbuf);
- dn = DB_DNODE(dbuf);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, 0, offset);
VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
rw_exit(&dn->dn_struct_rwlock);
- DB_DNODE_EXIT(dbuf);
/*
* We can only assign if the offset is aligned, the arc buf is the
@@ -1924,11 +1919,8 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
- DB_DNODE_ENTER(dbuf);
- dn = DB_DNODE(dbuf);
os = dn->dn_objset;
object = dn->dn_object;
- DB_DNODE_EXIT(dbuf);
dbuf_rele(db, FTAG);
dmu_write(os, object, offset, blksz, buf->b_data, tx);
@@ -1937,6 +1929,17 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
}
}
+void
+dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+
+ DB_DNODE_ENTER(dbuf);
+ dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx);
+ DB_DNODE_EXIT(dbuf);
+}
+
typedef struct {
dbuf_dirty_record_t *dsa_dr;
dmu_sync_cb_t *dsa_done;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
index 40898ef26d97..b853081e8b7c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved.
*/
@@ -32,7 +32,8 @@
#include <sys/zfeature.h>
uint64_t
-dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
uint64_t object;
@@ -92,7 +93,8 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
os->os_obj_next = object - 1;
}
- dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_allocate(dn, ot, blocksize, indirect_blockshift,
+ bonustype, bonuslen, tx);
mutex_exit(&os->os_obj_lock);
dmu_tx_add_new_object(tx, dn);
@@ -101,6 +103,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
return (object);
}
+uint64_t
+dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (dmu_object_alloc_ibs(os, ot, blocksize, 0,
+ bonustype, bonuslen, tx));
+}
+
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
@@ -157,6 +167,10 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
return (err);
ASSERT(dn->dn_type != DMU_OT_NONE);
+ /*
+ * If we don't create this free range, we'll leak indirect blocks when
+ * we get to freeing the dnode in syncing context.
+ */
dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
dnode_free(dn, tx);
dnode_rele(dn, FTAG);
@@ -204,13 +218,19 @@ dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
}
ASSERT3U(dn->dn_type, ==, old_type);
ASSERT0(dn->dn_maxblkid);
+
+ /*
+ * We must initialize the ZAP data before changing the type,
+ * so that concurrent calls to *_is_zapified() can determine if
+ * the object has been completely zapified by checking the type.
+ */
+ mzap_create_impl(mos, object, 0, 0, tx);
+
dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
DMU_OTN_ZAP_METADATA;
dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG);
- mzap_create_impl(mos, object, 0, 0, tx);
-
spa_feature_incr(dmu_objset_spa(mos),
SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}
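The reordering above protects a type-based reader. A hedged sketch of that
reader side (the helper name is illustrative; the real checks live in
*_is_zapified()-style callers):

#include <sys/dmu.h>

/*
 * Infer "fully zapified" from the object type alone; this is only safe
 * because mzap_create_impl() now runs before the type is switched.
 */
static boolean_t
object_is_zapified(objset_t *os, uint64_t object)
{
	dmu_object_info_t doi;

	if (dmu_object_info(os, object, &doi) != 0)
		return (B_FALSE);
	return (doi.doi_type == DMU_OTN_ZAP_METADATA ? B_TRUE : B_FALSE);
}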
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
index 60119c7cda54..50c18a58f6bc 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
@@ -54,6 +54,7 @@
#include <sys/dsl_destroy.h>
#include <sys/vdev.h>
#include <sys/zfeature.h>
+#include "zfs_namecheck.h"
/*
* Needed to close a window in dnode_move() that allows the objset to be freed
@@ -498,6 +499,14 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
}
+ /*
+ * These properties will be filled in by the logic in zfs_get_zplprop()
+ * when they are queried for the first time.
+ */
+ os->os_version = OBJSET_PROP_UNINITIALIZED;
+ os->os_normalization = OBJSET_PROP_UNINITIALIZED;
+ os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
+ os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;
if (ds == NULL || !ds->ds_is_snapshot)
os->os_zil_header = os->os_phys->os_zil_header;
@@ -905,6 +914,9 @@ dmu_objset_create_check(void *arg, dmu_tx_t *tx)
if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
+ if (dataset_nestcheck(doca->doca_name) != 0)
+ return (SET_ERROR(ENAMETOOLONG));
+
error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
if (error != 0)
return (error);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
index 9609761b38f9..25c1fec0c146 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
@@ -76,6 +76,11 @@ TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit);
static char *dmu_recv_tag = "dmu_recv_tag";
const char *recv_clone_name = "%recv";
+/*
+ * Use this to override the recordsize calculation for fast zfs send estimates.
+ */
+uint64_t zfs_override_estimate_recordsize = 0;
+
#define BP_SPAN(datablkszsec, indblkshift, level) \
(((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
(level) * (indblkshift - SPA_BLKPTRSHIFT)))
@@ -1131,7 +1136,7 @@ static int
dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
{
- int err;
+ int err = 0;
uint64_t size;
/*
* Assume that space (both on-disk and in-stream) is dominated by
@@ -1144,7 +1149,9 @@ dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
VERIFY0(dmu_objset_from_ds(ds, &os));
/* Assume all (uncompressed) blocks are recordsize. */
- if (os->os_phys->os_type == DMU_OST_ZVOL) {
+ if (zfs_override_estimate_recordsize != 0) {
+ recordsize = zfs_override_estimate_recordsize;
+ } else if (os->os_phys->os_type == DMU_OST_ZVOL) {
err = dsl_prop_get_int_ds(ds,
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
} else {
@@ -1788,6 +1795,7 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
drc->drc_force = force;
drc->drc_resumable = resumable;
drc->drc_cred = CRED();
+ drc->drc_clone = (origin != NULL);
if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
drc->drc_byteswap = B_TRUE;
@@ -1848,7 +1856,9 @@ struct receive_writer_arg {
/* A map from guid to dataset to help handle dedup'd streams. */
avl_tree_t *guid_to_ds_map;
boolean_t resumable;
- uint64_t last_object, last_offset;
+ uint64_t last_object;
+ uint64_t last_offset;
+ uint64_t max_object; /* highest object ID referenced in stream */
uint64_t bytes_read; /* bytes read when current record created */
};
@@ -1896,14 +1906,10 @@ typedef struct guid_map_entry {
static int
guid_compare(const void *arg1, const void *arg2)
{
- const guid_map_entry_t *gmep1 = arg1;
- const guid_map_entry_t *gmep2 = arg2;
+ const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1;
+ const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2;
- if (gmep1->guid < gmep2->guid)
- return (-1);
- else if (gmep1->guid > gmep2->guid)
- return (1);
- return (0);
+ return (AVL_CMP(gmep1->guid, gmep2->guid));
}
static void
@@ -2145,6 +2151,9 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
return (SET_ERROR(EINVAL));
object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
+ if (drro->drr_object > rwa->max_object)
+ rwa->max_object = drro->drr_object;
+
/*
* If we are losing blkptrs or changing the block size this must
* be a new file instance. We must clear out the previous file
@@ -2240,6 +2249,9 @@ receive_freeobjects(struct receive_writer_arg *rwa,
err = dmu_free_long_object(rwa->os, obj);
if (err != 0)
return (err);
+
+ if (obj > rwa->max_object)
+ rwa->max_object = obj;
}
if (next_err != ESRCH)
return (next_err);
@@ -2269,6 +2281,9 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
rwa->last_object = drrw->drr_object;
rwa->last_offset = drrw->drr_offset;
+ if (rwa->last_object > rwa->max_object)
+ rwa->max_object = rwa->last_object;
+
if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
@@ -2345,6 +2360,9 @@ receive_write_byref(struct receive_writer_arg *rwa,
ref_os = rwa->os;
}
+ if (drrwbr->drr_object > rwa->max_object)
+ rwa->max_object = drrwbr->drr_object;
+
err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
if (err != 0)
@@ -2387,6 +2405,9 @@ receive_write_embedded(struct receive_writer_arg *rwa,
if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
return (EINVAL);
+ if (drrwe->drr_object > rwa->max_object)
+ rwa->max_object = drrwe->drr_object;
+
tx = dmu_tx_create(rwa->os);
dmu_tx_hold_write(tx, drrwe->drr_object,
@@ -2423,6 +2444,9 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
+ if (drrs->drr_object > rwa->max_object)
+ rwa->max_object = drrs->drr_object;
+
VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
dmu_buf_rele(db, FTAG);
@@ -2467,6 +2491,9 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
+ if (drrf->drr_object > rwa->max_object)
+ rwa->max_object = drrf->drr_object;
+
err = dmu_free_long_range(rwa->os, drrf->drr_object,
drrf->drr_offset, drrf->drr_length);
@@ -3086,6 +3113,41 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
}
mutex_exit(&rwa.mutex);
+ /*
+ * If we are receiving a full stream as a clone, all object IDs which
+ * are greater than the maximum ID referenced in the stream are
+ * by definition unused and must be freed. Note that it's possible that
+ * we've resumed this send and the first record we received was the END
+ * record. In that case, max_object would be 0, but we shouldn't start
+ * freeing all objects from there; instead we should start from the
+ * resumeobj.
+ */
+ if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) {
+ uint64_t obj;
+ if (nvlist_lookup_uint64(begin_nvl, "resume_object", &obj) != 0)
+ obj = 0;
+ if (rwa.max_object > obj)
+ obj = rwa.max_object;
+ obj++;
+ int free_err = 0;
+ int next_err = 0;
+
+ while (next_err == 0) {
+ free_err = dmu_free_long_object(rwa.os, obj);
+ if (free_err != 0 && free_err != ENOENT)
+ break;
+
+ next_err = dmu_object_next(rwa.os, &obj, FALSE, 0);
+ }
+
+ if (err == 0) {
+ if (free_err != 0 && free_err != ENOENT)
+ err = free_err;
+ else if (next_err != ESRCH)
+ err = next_err;
+ }
+ }
+
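A reduced sketch of the sweep above (the helper name is illustrative; kernel
context and ZFS headers assumed). It mirrors the loop's logic: free each
object, tolerate ENOENT holes, and stop when dmu_object_next() returns ESRCH,
meaning no objects remain.

#include <sys/dmu.h>

static int
free_objects_from(objset_t *os, uint64_t obj)
{
	int err = 0;

	while (err == 0) {
		/* ENOENT just means a hole in the object ID space. */
		int ferr = dmu_free_long_object(os, obj);
		if (ferr != 0 && ferr != ENOENT)
			return (ferr);
		err = dmu_object_next(os, &obj, B_FALSE, 0);
	}
	return (err == ESRCH ? 0 : err);
}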
cv_destroy(&rwa.cv);
mutex_destroy(&rwa.mutex);
bqueue_destroy(&rwa.q);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
index ad02fa5918aa..4ac640e54d6c 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -1091,7 +1091,12 @@ dmu_tx_wait(dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
tx->tx_needassign_txh = NULL;
} else {
- txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
+ /*
+ * If we have a lot of dirty data, just wait until we sync
+ * out a TXG, at which point we'll hopefully have synced
+ * a portion of the changes.
+ */
+ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
}
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
index 13a4a02bbfb4..4d72991b5ef6 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
@@ -78,19 +78,13 @@ dbuf_compare(const void *x1, const void *x2)
const dmu_buf_impl_t *d1 = x1;
const dmu_buf_impl_t *d2 = x2;
- if (d1->db_level < d2->db_level) {
- return (-1);
- }
- if (d1->db_level > d2->db_level) {
- return (1);
- }
+ int cmp = AVL_CMP(d1->db_level, d2->db_level);
+ if (likely(cmp))
+ return (cmp);
- if (d1->db_blkid < d2->db_blkid) {
- return (-1);
- }
- if (d1->db_blkid > d2->db_blkid) {
- return (1);
- }
+ cmp = AVL_CMP(d1->db_blkid, d2->db_blkid);
+ if (likely(cmp))
+ return (cmp);
if (d1->db_state == DB_SEARCH) {
ASSERT3S(d2->db_state, !=, DB_SEARCH);
@@ -100,13 +94,7 @@ dbuf_compare(const void *x1, const void *x2)
return (1);
}
- if ((uintptr_t)d1 < (uintptr_t)d2) {
- return (-1);
- }
- if ((uintptr_t)d1 > (uintptr_t)d2) {
- return (1);
- }
- return (0);
+ return (AVL_PCMP(d1, d2));
}
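The comparator rewrites throughout this change funnel through a small set of
three-way helpers. A sketch of their likely definitions (assumed to match
what sys/avl.h provides; each yields -1, 0, or 1 without branches, with
uintptr_t from <sys/stdint.h>):

#define	AVL_ISIGN(a)	(((a) > 0) - ((a) < 0))
#define	AVL_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
#define	AVL_PCMP(a, b)	\
	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))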
/* ARGSUSED */
@@ -742,6 +730,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
ndn->dn_datablkszsec = odn->dn_datablkszsec;
ndn->dn_datablksz = odn->dn_datablksz;
ndn->dn_maxblkid = odn->dn_maxblkid;
+ bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
+ sizeof (odn->dn_next_type));
bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
sizeof (odn->dn_next_nblkptr));
bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
@@ -1238,11 +1228,11 @@ void
dnode_rele(dnode_t *dn, void *tag)
{
mutex_enter(&dn->dn_mtx);
- dnode_rele_and_unlock(dn, tag);
+ dnode_rele_and_unlock(dn, tag, B_FALSE);
}
void
-dnode_rele_and_unlock(dnode_t *dn, void *tag)
+dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
{
uint64_t refs;
/* Get while the hold prevents the dnode from moving. */
@@ -1273,7 +1263,8 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag)
* that the handle has zero references, but that will be
* asserted anyway when the handle gets destroyed.
*/
- dbuf_rele(db, dnh);
+ mutex_enter(&db->db_mtx);
+ dbuf_rele_and_unlock(db, dnh, evicting);
}
}
@@ -1518,6 +1509,72 @@ dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
}
}
+/*
+ * Dirty all the in-core level-1 dbufs strictly between start_blkid and
+ * end_blkid. The two endpoint L1 dbufs are dirtied separately by the
+ * dnode_dirty_l1() calls in dnode_free_range().
+ */
+static void
+dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t db_search;
+ dmu_buf_impl_t *db;
+ avl_index_t where;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+
+ db_search.db_level = 1;
+ db_search.db_blkid = start_blkid + 1;
+ db_search.db_state = DB_SEARCH;
+ for (;;) {
+
+ db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ if (db == NULL)
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+ if (db == NULL || db->db_level != 1 ||
+ db->db_blkid >= end_blkid) {
+ break;
+ }
+
+ /*
+ * Set up the next blkid we want to search for.
+ */
+ db_search.db_blkid = db->db_blkid + 1;
+ ASSERT3U(db->db_blkid, >=, start_blkid);
+
+ /*
+ * If the dbuf transitions to DB_EVICTING while we're trying
+ * to dirty it, then we will be unable to discover it in
+ * the dbuf hash table. This will result in a call to
+ * dbuf_create() which needs to acquire the dn_dbufs_mtx
+ * lock. To avoid a deadlock, we drop the lock before
+ * dirtying the level-1 dbuf.
+ */
+ mutex_exit(&dn->dn_dbufs_mtx);
+ dnode_dirty_l1(dn, db->db_blkid, tx);
+ mutex_enter(&dn->dn_dbufs_mtx);
+ }
+
+#ifdef ZFS_DEBUG
+ /*
+ * Walk all the in-core level-1 dbufs and verify they have been dirtied.
+ */
+ db_search.db_level = 1;
+ db_search.db_blkid = start_blkid + 1;
+ db_search.db_state = DB_SEARCH;
+ db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ if (db == NULL)
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+ for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
+ if (db->db_level != 1 || db->db_blkid >= end_blkid)
+ break;
+ ASSERT(db->db_dirtycnt > 0);
+ }
+#endif
+ mutex_exit(&dn->dn_dbufs_mtx);
+}
+
void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
@@ -1550,13 +1607,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (off == 0 && len >= blksz) {
/*
* Freeing the whole block; fast-track this request.
- * Note that we won't dirty any indirect blocks,
- * which is fine because we will be freeing the entire
- * file and thus all indirect blocks will be freed
- * by free_children().
*/
blkid = 0;
nblks = 1;
+ if (dn->dn_nlevels > 1)
+ dnode_dirty_l1(dn, 0, tx);
goto done;
} else if (off >= blksz) {
/* Freeing past end-of-data */
@@ -1669,6 +1724,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
if (last != first)
dnode_dirty_l1(dn, last, tx);
+ dnode_dirty_l1range(dn, first, last, tx);
+
int shift = dn->dn_datablkshift + dn->dn_indblkshift -
SPA_BLKPTRSHIFT;
for (uint64_t i = first + 1; i < last; i++) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
index 2fcaf7927de6..02f263c82e42 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@@ -229,9 +229,24 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
}
#endif
+/*
+ * We don't usually free the indirect blocks here. If in one txg we have a
+ * free_range and a write to the same indirect block, it's important that we
+ * preserve the hole's birth times. Therefore, we don't free any indirect
+ * blocks in free_children(). If an indirect block happens to turn into all
+ * holes, it will be freed by dbuf_write_children_ready(), which happens at a
+ * point in the syncing process where we know for certain the contents of the
+ * indirect block.
+ *
+ * However, if we're freeing a dnode, its space accounting must go to zero
+ * before we actually try to free the dnode, or we will trip an assertion. In
+ * addition, we know the case described above cannot occur, because the dnode is
+ * being freed. Therefore, we free the indirect blocks immediately in that
+ * case.
+ */
static void
free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
- dmu_tx_t *tx)
+ boolean_t free_indirects, dmu_tx_t *tx)
{
dnode_t *dn;
blkptr_t *bp;
@@ -248,6 +263,24 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
if (db->db_state != DB_CACHED)
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+ /*
+ * If we modify this indirect block, and we are not freeing the
+ * dnode (!free_indirects), then this indirect block needs to get
+ * written to disk by dbuf_write(). If it is dirty, we know it will
+ * be written (otherwise, we would have incorrect on-disk state
+ * because the space would be freed but still referenced by the BP
+ * in this indirect block). Therefore we VERIFY that it is
+ * dirty.
+ *
+ * Our VERIFY covers some cases that do not actually have to be
+ * dirty, but which the open-context code happens to dirty anyway,
+ * e.g. when the blocks we are freeing are all holes. In that case
+ * we are only freeing part of this indirect block, so it is an
+ * ancestor of the first or last block to be freed, and the first
+ * and last L1 indirect blocks are always dirtied by
+ * dnode_free_range().
+ */
+ VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
+
dbuf_release_bp(db);
bp = db->db.db_data;
@@ -283,32 +316,16 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
rw_exit(&dn->dn_struct_rwlock);
ASSERT3P(bp, ==, subdb->db_blkptr);
- free_children(subdb, blkid, nblks, tx);
+ free_children(subdb, blkid, nblks, free_indirects, tx);
dbuf_rele(subdb, FTAG);
}
}
- /* If this whole block is free, free ourself too. */
- for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
- if (!BP_IS_HOLE(bp))
- break;
- }
- if (i == 1 << epbs) {
- /*
- * We only found holes. Grab the rwlock to prevent
- * anybody from reading the blocks we're about to
- * zero out.
- */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (free_indirects) {
+ for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
+ ASSERT(BP_IS_HOLE(bp));
bzero(db->db.db_data, db->db.db_size);
- rw_exit(&dn->dn_struct_rwlock);
free_blocks(dn, db->db_blkptr, 1, tx);
- } else {
- /*
- * Partial block free; must be marked dirty so that it
- * will be written out.
- */
- ASSERT(db->db_dirtycnt > 0);
}
DB_DNODE_EXIT(db);
@@ -321,7 +338,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
*/
static void
dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
- dmu_tx_t *tx)
+ boolean_t free_indirects, dmu_tx_t *tx)
{
blkptr_t *bp = dn->dn_phys->dn_blkptr;
int dnlevel = dn->dn_phys->dn_nlevels;
@@ -361,7 +378,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
TRUE, FALSE, FTAG, &db));
rw_exit(&dn->dn_struct_rwlock);
- free_children(db, blkid, nblks, tx);
+ free_children(db, blkid, nblks, free_indirects, tx);
dbuf_rele(db, FTAG);
}
}
@@ -380,6 +397,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
typedef struct dnode_sync_free_range_arg {
dnode_t *dsfra_dnode;
dmu_tx_t *dsfra_tx;
+ boolean_t dsfra_free_indirects;
} dnode_sync_free_range_arg_t;
static void
@@ -389,7 +407,8 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
dnode_t *dn = dsfra->dsfra_dnode;
mutex_exit(&dn->dn_mtx);
- dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
+ dnode_sync_free_range_impl(dn, blkid, nblks,
+ dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
mutex_enter(&dn->dn_mtx);
}
@@ -420,6 +439,19 @@ dnode_evict_dbufs(dnode_t *dn)
avl_insert_here(&dn->dn_dbufs, &db_marker, db,
AVL_BEFORE);
+ /*
+ * We need to use the "marker" dbuf rather than
+ * simply getting the next dbuf, because
+ * dbuf_destroy() may actually remove multiple dbufs.
+ * It can call itself recursively on the parent dbuf,
+ * which may also be removed from dn_dbufs. The code
+ * flow would look like:
+ *
+ * dbuf_destroy():
+ * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
+ * if (!cacheable || pending_evict)
+ * dbuf_destroy()
+ */
dbuf_destroy(db);
db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
@@ -478,7 +510,7 @@ dnode_undirty_dbufs(list_t *list)
list_destroy(&dr->dt.di.dr_children);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
- dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
}
}
@@ -670,6 +702,11 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dnode_sync_free_range_arg_t dsfra;
dsfra.dsfra_dnode = dn;
dsfra.dsfra_tx = tx;
+ dsfra.dsfra_free_indirects = freeing_dnode;
+ if (freeing_dnode) {
+ ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
+ 0, dn->dn_maxblkid + 1));
+ }
mutex_enter(&dn->dn_mtx);
range_tree_vacate(dn->dn_free_ranges[txgoff],
dnode_sync_free_range, &dsfra);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
index 356e5b51c3f4..2f3647bc8e86 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
@@ -55,15 +55,10 @@
static int
dsl_deadlist_compare(const void *arg1, const void *arg2)
{
- const dsl_deadlist_entry_t *dle1 = arg1;
- const dsl_deadlist_entry_t *dle2 = arg2;
+ const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1;
+ const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2;
- if (dle1->dle_mintxg < dle2->dle_mintxg)
- return (-1);
- else if (dle1->dle_mintxg > dle2->dle_mintxg)
- return (+1);
- else
- return (0);
+ return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
}
static void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
index 7870b4951b29..0ad658f910ec 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
@@ -384,14 +384,13 @@ typedef struct perm_set {
static int
perm_set_compare(const void *arg1, const void *arg2)
{
- const perm_set_t *node1 = arg1;
- const perm_set_t *node2 = arg2;
+ const perm_set_t *node1 = (const perm_set_t *)arg1;
+ const perm_set_t *node2 = (const perm_set_t *)arg2;
int val;
val = strcmp(node1->p_setname, node2->p_setname);
- if (val == 0)
- return (0);
- return (val > 0 ? 1 : -1);
+
+ return (AVL_ISIGN(val));
}
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
index 00b1dbe36d83..1a4194ebf16d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
@@ -1388,7 +1388,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
offsetof(struct tempreserve, tr_node));
ASSERT3S(asize, >, 0);
- err = arc_tempreserve_space(lsize, tx->tx_txg);
+ err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
if (err == 0) {
struct tempreserve *tr;
@@ -1819,16 +1819,28 @@ typedef struct dsl_dir_rename_arg {
cred_t *ddra_cred;
} dsl_dir_rename_arg_t;
+typedef struct dsl_valid_rename_arg {
+ int char_delta;
+ int nest_delta;
+} dsl_valid_rename_arg_t;
+
/* ARGSUSED */
static int
dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
- int *deltap = arg;
+ dsl_valid_rename_arg_t *dvra = arg;
char namebuf[ZFS_MAX_DATASET_NAME_LEN];
dsl_dataset_name(ds, namebuf);
- if (strlen(namebuf) + *deltap >= ZFS_MAX_DATASET_NAME_LEN)
+ ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ int namelen = strlen(namebuf) + dvra->char_delta;
+ int depth = get_dataset_depth(namebuf) + dvra->nest_delta;
+
+ if (namelen >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting)
return (SET_ERROR(ENAMETOOLONG));
return (0);
}
@@ -1839,9 +1851,9 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
dsl_dir_rename_arg_t *ddra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dir_t *dd, *newparent;
+ dsl_valid_rename_arg_t dvra;
const char *mynewname;
int error;
- int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname);
/* target dir should exist */
error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
@@ -1870,10 +1882,19 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
return (SET_ERROR(EEXIST));
}
+ ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ dvra.char_delta = strlen(ddra->ddra_newname)
+ - strlen(ddra->ddra_oldname);
+ dvra.nest_delta = get_dataset_depth(ddra->ddra_newname)
+ - get_dataset_depth(ddra->ddra_oldname);
+
- /* if the name length is growing, validate child name lengths */
+ /* if the name length or nesting depth grows, validate child names */
- if (delta > 0) {
+ if (dvra.char_delta > 0 || dvra.nest_delta > 0) {
error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
- &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+ &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
if (error != 0) {
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
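The new nesting check depends on get_dataset_depth(); a hedged model of what
it computes (an illustrative reimplementation, not the verbatim
zfs_namecheck.c code):

/* Depth is the number of '/' separators: "pool/a/b" has depth 2. */
static int
dataset_depth(const char *name)
{
	int depth = 0;

	for (; *name != '\0'; name++) {
		if (*name == '/')
			depth++;
	}
	return (depth);
}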
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
index 8e7616427e19..48675909365b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2016 Gary Mills
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright 2017 Joyent, Inc.
@@ -2164,7 +2164,8 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
* block-sharing rules don't apply to it.
*/
if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) &&
- ds->ds_dir != dp->dp_origin_snap->ds_dir) {
+ (dp->dp_origin_snap == NULL ||
+ ds->ds_dir != dp->dp_origin_snap->ds_dir)) {
objset_t *os;
if (dmu_objset_from_ds(ds, &os) != 0) {
goto out;
@@ -2959,6 +2960,16 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
{
vdev_t *vd;
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * The indirect vdev can point to multiple
+ * vdevs. For simplicity, always create
+ * the resilver zio_t. zio_vdev_io_start()
+ * will bypass the child resilver i/o's if
+ * they are on vdevs that don't have DTL's.
+ */
+ return (B_TRUE);
+ }
if (DVA_GET_GANG(dva)) {
/*
* Gang members may be spread across multiple
@@ -3541,14 +3552,14 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
int d;
+ count_block(scn, dp->dp_blkstats, bp);
+
if (phys_birth <= scn->scn_phys.scn_min_txg ||
phys_birth >= scn->scn_phys.scn_max_txg)
return (0);
- if (BP_IS_EMBEDDED(bp)) {
- count_block(scn, dp->dp_blkstats, bp);
- return (0);
- }
+ /* Embedded BPs have phys_birth==0, so we reject them above. */
+ ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index 6cff5eacdcdf..e374cd356792 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@@ -275,6 +275,8 @@ static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
+static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
+static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
kmem_cache_t *metaslab_alloc_trace_cache;
@@ -294,7 +296,12 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
mc->mc_rotor = NULL;
mc->mc_ops = ops;
mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
- refcount_create_tracked(&mc->mc_alloc_slots);
+ mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (refcount_t), KM_SLEEP);
+ mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (uint64_t), KM_SLEEP);
+ for (int i = 0; i < spa->spa_alloc_count; i++)
+ refcount_create_tracked(&mc->mc_alloc_slots[i]);
return (mc);
}
@@ -308,7 +315,12 @@ metaslab_class_destroy(metaslab_class_t *mc)
ASSERT(mc->mc_space == 0);
ASSERT(mc->mc_dspace == 0);
- refcount_destroy(&mc->mc_alloc_slots);
+ for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
+ refcount_destroy(&mc->mc_alloc_slots[i]);
+ kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
+ sizeof (refcount_t));
+ kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
+ sizeof (uint64_t));
mutex_destroy(&mc->mc_lock);
kmem_free(mc, sizeof (metaslab_class_t));
}
@@ -529,25 +541,40 @@ metaslab_class_expandable_space(metaslab_class_t *mc)
static int
metaslab_compare(const void *x1, const void *x2)
{
- const metaslab_t *m1 = x1;
- const metaslab_t *m2 = x2;
-
- if (m1->ms_weight < m2->ms_weight)
- return (1);
- if (m1->ms_weight > m2->ms_weight)
- return (-1);
+ const metaslab_t *m1 = (const metaslab_t *)x1;
+ const metaslab_t *m2 = (const metaslab_t *)x2;
+
+ int sort1 = 0;
+ int sort2 = 0;
+ if (m1->ms_allocator != -1 && m1->ms_primary)
+ sort1 = 1;
+ else if (m1->ms_allocator != -1 && !m1->ms_primary)
+ sort1 = 2;
+ if (m2->ms_allocator != -1 && m2->ms_primary)
+ sort2 = 1;
+ else if (m2->ms_allocator != -1 && !m2->ms_primary)
+ sort2 = 2;
/*
- * If the weights are identical, use the offset to force uniqueness.
+ * Sort inactive metaslabs first, then primaries, then secondaries. When
+ * selecting a metaslab to allocate from, an allocator first tries its
+ * primary, then its secondary active metaslab. If it has no active
+ * metaslabs, or can't allocate from them, it searches for an inactive
+ * metaslab to activate. If it can't find a suitable one, it will steal
+ * a primary or secondary metaslab from another allocator.
*/
- if (m1->ms_start < m2->ms_start)
+ if (sort1 < sort2)
return (-1);
- if (m1->ms_start > m2->ms_start)
+ if (sort1 > sort2)
return (1);
- ASSERT3P(m1, ==, m2);
+ int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
+ if (likely(cmp))
+ return (cmp);
- return (0);
+ IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
+
+ return (AVL_CMP(m1->ms_start, m2->ms_start));
}
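A reduced model of the ordering metaslab_compare() now implements (types
trimmed to the fields the comparator reads; illustrative only):

#include <stdint.h>

/* Rank 0 (inactive) sorts first, then 1 (primary), then 2 (secondary). */
struct ms_key {
	int		allocator;	/* -1 when inactive */
	int		primary;
	uint64_t	weight;		/* ties: higher weight sorts first */
	uint64_t	start;		/* final tiebreaker */
};

static int
ms_rank(const struct ms_key *m)
{
	if (m->allocator == -1)
		return (0);
	return (m->primary ? 1 : 2);
}

static int
ms_key_compare(const struct ms_key *a, const struct ms_key *b)
{
	int cmp = ms_rank(a) - ms_rank(b);
	if (cmp != 0)
		return (cmp < 0 ? -1 : 1);
	if (a->weight != b->weight)	/* descending weight */
		return (a->weight > b->weight ? -1 : 1);
	return (a->start < b->start ? -1 : (a->start > b->start ? 1 : 0));
}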
/*
@@ -683,12 +710,18 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
}
metaslab_group_t *
-metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
+metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{
metaslab_group_t *mg;
mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
+ mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
+ KM_SLEEP);
+ mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
+ KM_SLEEP);
avl_create(&mg->mg_metaslab_tree, metaslab_compare,
sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
mg->mg_vd = vd;
@@ -696,7 +729,16 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
mg->mg_activation_count = 0;
mg->mg_initialized = B_FALSE;
mg->mg_no_free_space = B_TRUE;
- refcount_create_tracked(&mg->mg_alloc_queue_depth);
+ mg->mg_allocators = allocators;
+
+ mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t),
+ KM_SLEEP);
+ mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
+ sizeof (uint64_t), KM_SLEEP);
+ for (int i = 0; i < allocators; i++) {
+ refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
+ mg->mg_cur_max_alloc_queue_depth[i] = 0;
+ }
mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
@@ -718,8 +760,22 @@ metaslab_group_destroy(metaslab_group_t *mg)
taskq_destroy(mg->mg_taskq);
avl_destroy(&mg->mg_metaslab_tree);
+ kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
+ kmem_free(mg->mg_secondaries, mg->mg_allocators *
+ sizeof (metaslab_t *));
mutex_destroy(&mg->mg_lock);
- refcount_destroy(&mg->mg_alloc_queue_depth);
+ mutex_destroy(&mg->mg_ms_initialize_lock);
+ cv_destroy(&mg->mg_ms_initialize_cv);
+
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ refcount_destroy(&mg->mg_alloc_queue_depth[i]);
+ mg->mg_cur_max_alloc_queue_depth[i] = 0;
+ }
+ kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
+ sizeof (refcount_t));
+ kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
+ sizeof (uint64_t));
+
kmem_free(mg, sizeof (metaslab_group_t));
}
@@ -799,6 +855,22 @@ metaslab_group_passivate(metaslab_group_t *mg)
taskq_wait(mg->mg_taskq);
spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
metaslab_group_alloc_update(mg);
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ metaslab_t *msp = mg->mg_primaries[i];
+ if (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+ metaslab_passivate(msp,
+ metaslab_weight_from_range_tree(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+ msp = mg->mg_secondaries[i];
+ if (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+ metaslab_passivate(msp,
+ metaslab_weight_from_range_tree(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+ }
mgprev = mg->mg_prev;
mgnext = mg->mg_next;
@@ -940,6 +1012,17 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
}
static void
+metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+{
+ ASSERT(MUTEX_HELD(&mg->mg_lock));
+ ASSERT(msp->ms_group == mg);
+ avl_remove(&mg->mg_metaslab_tree, msp);
+ msp->ms_weight = weight;
+ avl_add(&mg->mg_metaslab_tree, msp);
+}
+
+static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
/*
@@ -950,10 +1033,7 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
ASSERT(MUTEX_HELD(&msp->ms_lock));
mutex_enter(&mg->mg_lock);
- ASSERT(msp->ms_group == mg);
- avl_remove(&mg->mg_metaslab_tree, msp);
- msp->ms_weight = weight;
- avl_add(&mg->mg_metaslab_tree, msp);
+ metaslab_group_sort_impl(mg, msp, weight);
mutex_exit(&mg->mg_lock);
}
@@ -1001,7 +1081,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
*/
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
- uint64_t psize)
+ uint64_t psize, int allocator)
{
spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_class_t *mc = mg->mg_class;
@@ -1030,7 +1110,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
if (mg->mg_allocatable) {
metaslab_group_t *mgp;
int64_t qdepth;
- uint64_t qmax = mg->mg_max_alloc_queue_depth;
+ uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
if (!mc->mc_alloc_throttle_enabled)
return (B_TRUE);
@@ -1042,7 +1122,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
if (mg->mg_no_free_space)
return (B_FALSE);
- qdepth = refcount_count(&mg->mg_alloc_queue_depth);
+ qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]);
/*
* If this metaslab group is below its qmax or it's
@@ -1061,9 +1141,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* groups at the same time when we make this check.
*/
for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
- qmax = mgp->mg_max_alloc_queue_depth;
+ qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
- qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
+ qdepth = refcount_count(
+ &mgp->mg_alloc_queue_depth[allocator]);
/*
* If there is another metaslab group that
@@ -1105,18 +1186,14 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
uint64_t rs_size1 = r1->rs_end - r1->rs_start;
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
- if (rs_size1 < rs_size2)
- return (-1);
- if (rs_size1 > rs_size2)
- return (1);
+ int cmp = AVL_CMP(rs_size1, rs_size2);
+ if (likely(cmp))
+ return (cmp);
if (r1->rs_start < r2->rs_start)
return (-1);
- if (r1->rs_start > r2->rs_start)
- return (1);
-
- return (0);
+ return (AVL_CMP(r1->rs_start, r2->rs_start));
}
/*
@@ -1468,9 +1545,12 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+
ms->ms_id = id;
ms->ms_start = id << vd->vdev_ms_shift;
ms->ms_size = 1ULL << vd->vdev_ms_shift;
+ ms->ms_allocator = -1;
+ ms->ms_new = B_TRUE;
/*
* We only open space map objects that already exist. All others
@@ -1567,6 +1647,7 @@ metaslab_fini(metaslab_t *msp)
cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
mutex_destroy(&msp->ms_sync_lock);
+ ASSERT3U(msp->ms_allocator, ==, -1);
kmem_free(msp, sizeof (metaslab_t));
}
@@ -1658,7 +1739,7 @@ metaslab_set_fragmentation(metaslab_t *msp)
if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
msp->ms_condense_wanted = B_TRUE;
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
- spa_dbgmsg(spa, "txg %llu, requesting force condense: "
+ zfs_dbgmsg("txg %llu, requesting force condense: "
"ms_id %llu, vdev_id %llu", txg, msp->ms_id,
vd->vdev_id);
}
@@ -1963,19 +2044,59 @@ metaslab_weight(metaslab_t *msp)
}
static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
+metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+ int allocator, uint64_t activation_weight)
+{
+ /*
+ * If we're activating for the claim code, we don't want to actually
+ * set the metaslab up for a specific allocator.
+ */
+ if (activation_weight == METASLAB_WEIGHT_CLAIM)
+ return (0);
+ metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
+ mg->mg_primaries : mg->mg_secondaries);
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ mutex_enter(&mg->mg_lock);
+ if (arr[allocator] != NULL) {
+ mutex_exit(&mg->mg_lock);
+ return (EEXIST);
+ }
+
+ arr[allocator] = msp;
+ ASSERT3S(msp->ms_allocator, ==, -1);
+ msp->ms_allocator = allocator;
+ msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
+ mutex_exit(&mg->mg_lock);
+
+ return (0);
+}
+
+static int
+metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
{
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
+ int error = 0;
metaslab_load_wait(msp);
if (!msp->ms_loaded) {
- int error = metaslab_load(msp);
- if (error) {
+ if ((error = metaslab_load(msp)) != 0) {
metaslab_group_sort(msp->ms_group, msp, 0);
return (error);
}
}
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
+ /*
+ * The metaslab was activated for another allocator
+ * while we were waiting, so we should reselect.
+ */
+ return (EBUSY);
+ }
+ if ((error = metaslab_activate_allocator(msp->ms_group, msp,
+ allocator, activation_weight)) != 0) {
+ return (error);
+ }
msp->ms_activation_weight = msp->ms_weight;
metaslab_group_sort(msp->ms_group, msp,
@@ -1988,6 +2109,34 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
}
static void
+metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+ uint64_t weight)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+ metaslab_group_sort(mg, msp, weight);
+ return;
+ }
+
+ mutex_enter(&mg->mg_lock);
+ ASSERT3P(msp->ms_group, ==, mg);
+ if (msp->ms_primary) {
+ ASSERT3U(0, <=, msp->ms_allocator);
+ ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
+ ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+ mg->mg_primaries[msp->ms_allocator] = NULL;
+ } else {
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+ ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
+ mg->mg_secondaries[msp->ms_allocator] = NULL;
+ }
+ msp->ms_allocator = -1;
+ metaslab_group_sort_impl(mg, msp, weight);
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
metaslab_passivate(metaslab_t *msp, uint64_t weight)
{
uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
@@ -2002,7 +2151,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight)
ASSERT0(weight & METASLAB_ACTIVE_MASK);
msp->ms_activation_weight = 0;
- metaslab_group_sort(msp->ms_group, msp, weight);
+ metaslab_passivate_allocator(msp->ms_group, msp, weight);
ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
@@ -2105,17 +2254,6 @@ metaslab_group_preload(metaslab_group_t *mg)
*
* 3. The on-disk size of the space map should actually decrease.
*
- * Checking the first condition is tricky since we don't want to walk
- * the entire AVL tree calculating the estimated on-disk size. Instead we
- * use the size-ordered range tree in the metaslab and calculate the
- * size required to write out the largest segment in our free tree. If the
- * size required to represent that segment on disk is larger than the space
- * map object then we avoid condensing this map.
- *
- * To determine the second criterion we use a best-case estimate and assume
- * each segment can be represented on-disk as a single 64-bit entry. We refer
- * to this best-case estimate as the space map's minimal form.
- *
* Unfortunately, we cannot compute the on-disk size of the space map in this
* context because we cannot accurately compute the effects of compression, etc.
* Instead, we apply the heuristic described in the block comment for
@@ -2126,9 +2264,6 @@ static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
space_map_t *sm = msp->ms_sm;
- range_seg_t *rs;
- uint64_t size, entries, segsz, object_size, optimal_size, record_size;
- dmu_object_info_t doi;
vdev_t *vd = msp->ms_group->mg_vd;
uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
@@ -2154,34 +2289,22 @@ metaslab_should_condense(metaslab_t *msp)
msp->ms_condense_checked_txg = current_txg;
/*
- * Use the ms_allocatable_by_size range tree, which is ordered by
- * size, to obtain the largest segment in the free tree. We always
- * condense metaslabs that are empty and metaslabs for which a
- * condense request has been made.
+ * We always condense metaslabs that are empty and metaslabs for
+ * which a condense request has been made.
*/
- rs = avl_last(&msp->ms_allocatable_by_size);
- if (rs == NULL || msp->ms_condense_wanted)
+ if (avl_is_empty(&msp->ms_allocatable_by_size) ||
+ msp->ms_condense_wanted)
return (B_TRUE);
- /*
- * Calculate the number of 64-bit entries this segment would
- * require when written to disk. If this single segment would be
- * larger on-disk than the entire current on-disk structure, then
- * clearly condensing will increase the on-disk structure size.
- */
- size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
- entries = size / (MIN(size, SM_RUN_MAX));
- segsz = entries * sizeof (uint64_t);
-
- optimal_size =
- sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root);
- object_size = space_map_length(msp->ms_sm);
+ uint64_t object_size = space_map_length(msp->ms_sm);
+ uint64_t optimal_size = space_map_estimate_optimal_size(sm,
+ msp->ms_allocatable, SM_NO_VDEVID);
+ dmu_object_info_t doi;
dmu_object_info_from_db(sm->sm_dbuf, &doi);
- record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
+ uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
- return (segsz <= object_size &&
- object_size >= (optimal_size * zfs_condense_pct / 100) &&
+ return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
object_size > zfs_metaslab_condense_block_threshold * record_size);
}
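A self-contained restatement of the two surviving criteria (sketch only;
zfs_condense_pct and zfs_metaslab_condense_block_threshold keep their
tunable defaults, assumed here to be 200 and 4):

#include <stdbool.h>
#include <stdint.h>

static bool
should_condense(uint64_t object_size, uint64_t optimal_size,
    uint64_t record_size, uint64_t condense_pct, uint64_t block_threshold)
{
	/* On-disk length must be at least condense_pct% of the estimate. */
	if (object_size < optimal_size * condense_pct / 100)
		return (false);
	/* And big enough that rewriting the space map actually pays off. */
	return (object_size > block_threshold * record_size);
}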
@@ -2256,11 +2379,11 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
* optimal, this is typically close to optimal, and much cheaper to
* compute.
*/
- space_map_write(sm, condense_tree, SM_ALLOC, tx);
+ space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
range_tree_vacate(condense_tree, NULL, NULL);
range_tree_destroy(condense_tree);
- space_map_write(sm, msp->ms_allocatable, SM_FREE, tx);
+ space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
mutex_enter(&msp->ms_lock);
msp->ms_condensing = B_FALSE;
}
@@ -2372,8 +2495,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
metaslab_condense(msp, txg, tx);
} else {
mutex_exit(&msp->ms_lock);
- space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
- space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx);
+ space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
+ SM_NO_VDEVID, tx);
+ space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
+ SM_NO_VDEVID, tx);
mutex_enter(&msp->ms_lock);
}
@@ -2388,7 +2513,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
*/
mutex_exit(&msp->ms_lock);
space_map_write(vd->vdev_checkpoint_sm,
- msp->ms_checkpointing, SM_FREE, tx);
+ msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
mutex_enter(&msp->ms_lock);
space_map_update(vd->vdev_checkpoint_sm);
@@ -2580,22 +2705,34 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
}
+ if (msp->ms_new) {
+ msp->ms_new = B_FALSE;
+ mutex_enter(&mg->mg_lock);
+ mg->mg_ms_ready++;
+ mutex_exit(&mg->mg_lock);
+ }
/*
* Calculate the new weights before unloading any metaslabs.
* This will give us the most accurate weighting.
*/
- metaslab_group_sort(mg, msp, metaslab_weight(msp));
+ metaslab_group_sort(mg, msp, metaslab_weight(msp) |
+ (msp->ms_weight & METASLAB_ACTIVE_MASK));
/*
* If the metaslab is loaded and we've not tried to load or allocate
* from it in 'metaslab_unload_delay' txgs, then unload it.
*/
if (msp->ms_loaded &&
+ msp->ms_initializing == 0 &&
msp->ms_selected_txg + metaslab_unload_delay < txg) {
for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
VERIFY0(range_tree_space(
msp->ms_allocating[(txg + t) & TXG_MASK]));
}
+ if (msp->ms_allocator != -1) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ }
if (!metaslab_debug_unload)
metaslab_unload(msp);
@@ -2689,7 +2826,8 @@ metaslab_alloc_trace_fini(void)
*/
static void
metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
- metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
+ metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
+ int allocator)
{
if (!metaslab_trace_enabled)
return;
@@ -2722,6 +2860,7 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
mat->mat_dva_id = dva_id;
mat->mat_offset = offset;
mat->mat_weight = 0;
+ mat->mat_allocator = allocator;
if (msp != NULL)
mat->mat_weight = msp->ms_weight;
@@ -2762,35 +2901,56 @@ metaslab_trace_fini(zio_alloc_list_t *zal)
*/
static void
-metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
+metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
+ int allocator)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
- flags & METASLAB_DONT_THROTTLE)
+ (flags & METASLAB_DONT_THROTTLE))
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;
- (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
+ (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
+}
+
+static void
+metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
+{
+ uint64_t max = mg->mg_max_alloc_queue_depth;
+ uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
+ while (cur < max) {
+ if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
+ cur, cur + 1) == cur) {
+ atomic_inc_64(
+ &mg->mg_class->mc_alloc_max_slots[allocator]);
+ return;
+ }
+ cur = mg->mg_cur_max_alloc_queue_depth[allocator];
+ }
}
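A userland sketch of the lock-free watermark raise above, using C11 atomics
in place of atomic_cas_64() (names illustrative; the per-class slot bump is
omitted):

#include <stdatomic.h>
#include <stdint.h>

static void
raise_watermark(_Atomic uint64_t *cur, uint64_t max)
{
	uint64_t seen = atomic_load(cur);

	/*
	 * Bump *cur by one, but never past max; on a lost race the CAS
	 * reloads 'seen' and the loop re-checks against max.
	 */
	while (seen < max) {
		if (atomic_compare_exchange_weak(cur, &seen, seen + 1))
			return;
	}
}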
void
-metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
+metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
+ int allocator, boolean_t io_complete)
{
if (!(flags & METASLAB_ASYNC_ALLOC) ||
- flags & METASLAB_DONT_THROTTLE)
+ (flags & METASLAB_DONT_THROTTLE))
return;
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
if (!mg->mg_class->mc_alloc_throttle_enabled)
return;
- (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
+ (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
+ if (io_complete)
+ metaslab_group_increment_qdepth(mg, allocator);
}
void
-metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
+metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
+ int allocator)
{
#ifdef ZFS_DEBUG
const dva_t *dva = bp->blk_dva;
@@ -2799,7 +2959,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
for (int d = 0; d < ndvas; d++) {
uint64_t vdev = DVA_GET_VDEV(&dva[d]);
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
- VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
+ VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator],
+ tag));
}
#endif
}
@@ -2812,6 +2973,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
metaslab_class_t *mc = msp->ms_group->mg_class;
VERIFY(!msp->ms_condensing);
+ VERIFY0(msp->ms_initializing);
start = mc->mc_ops->msop_alloc(msp, size);
if (start != -1ULL) {
@@ -2841,91 +3003,147 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
return (start);
}
+/*
+ * Find the metaslab with the highest weight that is less than what we've
+ * already tried. In the common case, this means that we will examine each
+ * metaslab at most once. Note that concurrent callers could reorder metaslabs
+ * by activation/passivation once we have dropped the mg_lock. If a metaslab is
+ * activated by another thread, and we fail to allocate from the metaslab we
+ * have selected, we may not try the newly-activated metaslab, and instead
+ * activate another metaslab. This is not optimal, but generally does not cause
+ * any problems (a possible exception being if every metaslab is completely full
+ * except for the newly-activated metaslab which we fail to examine).
+ */
+static metaslab_t *
+find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
+ dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
+ zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
+{
+ avl_index_t idx;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ metaslab_t *msp = avl_find(t, search, &idx);
+ if (msp == NULL)
+ msp = avl_nearest(t, idx, AVL_AFTER);
+
+ for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
+ int i;
+ if (!metaslab_should_allocate(msp, asize)) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_TOO_SMALL, allocator);
+ continue;
+ }
+
+ /*
+ * If the selected metaslab is condensing or being
+ * initialized, skip it.
+ */
+ if (msp->ms_condensing || msp->ms_initializing > 0)
+ continue;
+
+ *was_active = msp->ms_allocator != -1;
+ /*
+ * If we're activating as primary, this is our first allocation
+ * from this disk, so we don't need to check how close we are.
+ * If the metaslab under consideration was already active,
+ * we're getting desperate enough to steal another allocator's
+ * metaslab, so we still don't care about distances.
+ */
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
+ break;
+
+ uint64_t target_distance = min_distance
+ + (space_map_allocated(msp->ms_sm) != 0 ? 0 :
+ min_distance >> 1);
+
+ for (i = 0; i < d; i++) {
+ if (metaslab_distance(msp, &dva[i]) < target_distance)
+ break;
+ }
+ if (i == d)
+ break;
+ }
+
+ if (msp != NULL) {
+ search->ms_weight = msp->ms_weight;
+ search->ms_start = msp->ms_start + 1;
+ search->ms_allocator = msp->ms_allocator;
+ search->ms_primary = msp->ms_primary;
+ }
+ return (msp);
+}
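
The sentinel never enters the tree; it only encodes "resume strictly
after the node we just tried" under the (weight descending, start
ascending) ordering. A hedged sketch of that resume idiom in
isolation, with variable names local to this illustration:

	search->ms_weight = UINT64_MAX;	/* sorts ahead of any real node */
	search->ms_start = 0;
	for (;;) {
		msp = avl_find(t, search, &idx);	/* sentinel absent */
		if (msp == NULL)
			msp = avl_nearest(t, idx, AVL_AFTER);
		if (msp == NULL)
			break;			/* walked off the end */
		/* ... try msp; if it fails, resume strictly after it ... */
		search->ms_weight = msp->ms_weight;
		search->ms_start = msp->ms_start + 1;
	}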
+
+/* ARGSUSED */
static uint64_t
metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
+ uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
+ int allocator)
{
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
uint64_t activation_weight;
- uint64_t target_distance;
- int i;
+ boolean_t tertiary = B_FALSE;
activation_weight = METASLAB_WEIGHT_PRIMARY;
- for (i = 0; i < d; i++) {
- if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+ for (int i = 0; i < d; i++) {
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+ DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
activation_weight = METASLAB_WEIGHT_SECONDARY;
+ } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+ DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+ tertiary = B_TRUE;
break;
}
}
+ /*
+ * If we don't have enough metaslabs active to fill the entire array, we
+ * just use the 0th slot.
+ */
+ if (mg->mg_ms_ready < mg->mg_allocators * 2) {
+ tertiary = B_FALSE;
+ allocator = 0;
+ }
+
+ ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
+
metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
search->ms_weight = UINT64_MAX;
search->ms_start = 0;
+ /*
+ * At the end of the metaslab tree are the already-active metaslabs,
+ * first the primaries, then the secondaries. When we resume searching
+ * through the tree, we need to consider ms_allocator and ms_primary so
+ * we start in the location right after where we left off, and don't
+ * accidentally loop forever considering the same metaslabs.
+ */
+ search->ms_allocator = -1;
+ search->ms_primary = B_TRUE;
for (;;) {
- boolean_t was_active;
- avl_tree_t *t = &mg->mg_metaslab_tree;
- avl_index_t idx;
+ boolean_t was_active = B_FALSE;
mutex_enter(&mg->mg_lock);
- /*
- * Find the metaslab with the highest weight that is less
- * than what we've already tried. In the common case, this
- * means that we will examine each metaslab at most once.
- * Note that concurrent callers could reorder metaslabs
- * by activation/passivation once we have dropped the mg_lock.
- * If a metaslab is activated by another thread, and we fail
- * to allocate from the metaslab we have selected, we may
- * not try the newly-activated metaslab, and instead activate
- * another metaslab. This is not optimal, but generally
- * does not cause any problems (a possible exception being
- * if every metaslab is completely full except for the
- * the newly-activated metaslab which we fail to examine).
- */
- msp = avl_find(t, search, &idx);
- if (msp == NULL)
- msp = avl_nearest(t, idx, AVL_AFTER);
- for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
-
- if (!metaslab_should_allocate(msp, asize)) {
- metaslab_trace_add(zal, mg, msp, asize, d,
- TRACE_TOO_SMALL);
- continue;
- }
-
- /*
- * If the selected metaslab is condensing, skip it.
- */
- if (msp->ms_condensing)
- continue;
-
- was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
- if (activation_weight == METASLAB_WEIGHT_PRIMARY)
- break;
-
- target_distance = min_distance +
- (space_map_allocated(msp->ms_sm) != 0 ? 0 :
- min_distance >> 1);
-
- for (i = 0; i < d; i++) {
- if (metaslab_distance(msp, &dva[i]) <
- target_distance)
- break;
- }
- if (i == d)
- break;
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+ mg->mg_primaries[allocator] != NULL) {
+ msp = mg->mg_primaries[allocator];
+ was_active = B_TRUE;
+ } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+ mg->mg_secondaries[allocator] != NULL && !tertiary) {
+ msp = mg->mg_secondaries[allocator];
+ was_active = B_TRUE;
+ } else {
+ msp = find_valid_metaslab(mg, activation_weight, dva, d,
+ min_distance, asize, allocator, zal, search,
+ &was_active);
}
+
mutex_exit(&mg->mg_lock);
if (msp == NULL) {
kmem_free(search, sizeof (*search));
return (-1ULL);
}
- search->ms_weight = msp->ms_weight;
- search->ms_start = msp->ms_start + 1;
mutex_enter(&msp->ms_lock);
-
/*
* Ensure that the metaslab we have selected is still
* capable of handling our request. It's possible that
@@ -2939,18 +3157,32 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
continue;
}
- if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
- activation_weight == METASLAB_WEIGHT_PRIMARY) {
- metaslab_passivate(msp,
- msp->ms_weight & ~METASLAB_ACTIVE_MASK);
+ /*
+ * If the metaslab is freshly activated for an allocator that
+ * isn't the one we're allocating from, or if it's a primary and
+ * we're seeking a secondary (or vice versa), we go back and
+ * select a new metaslab.
+ */
+ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
+ (msp->ms_allocator != -1) &&
+ (msp->ms_allocator != allocator || ((activation_weight ==
+ METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
mutex_exit(&msp->ms_lock);
continue;
}
- if (metaslab_activate(msp, activation_weight) != 0) {
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_WEIGHT_CLAIM);
mutex_exit(&msp->ms_lock);
continue;
}
+
+ if (metaslab_activate(msp, allocator, activation_weight) != 0) {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
msp->ms_selected_txg = txg;
/*
@@ -2963,24 +3195,35 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
if (!metaslab_should_allocate(msp, asize)) {
/* Passivate this metaslab and select a new one. */
metaslab_trace_add(zal, mg, msp, asize, d,
- TRACE_TOO_SMALL);
+ TRACE_TOO_SMALL, allocator);
goto next;
}
/*
* If this metaslab is currently condensing then pick again as
* we can't manipulate this metaslab until it's committed
- * to disk.
+ * to disk. If this metaslab is being initialized, we shouldn't
+ * allocate from it since the allocated region might be
+ * overwritten after allocation.
*/
if (msp->ms_condensing) {
metaslab_trace_add(zal, mg, msp, asize, d,
- TRACE_CONDENSING);
+ TRACE_CONDENSING, allocator);
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ } else if (msp->ms_initializing > 0) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_INITIALIZING, allocator);
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
mutex_exit(&msp->ms_lock);
continue;
}
offset = metaslab_block_alloc(msp, asize, txg);
- metaslab_trace_add(zal, mg, msp, asize, d, offset);
+ metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
if (offset != -1ULL) {
/* Proactively passivate the metaslab, if needed */
@@ -3036,19 +3279,20 @@ next:
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
- uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
+ uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
+ int allocator)
{
uint64_t offset;
ASSERT(mg->mg_initialized);
offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
- min_distance, dva, d);
+ min_distance, dva, d, allocator);
mutex_enter(&mg->mg_lock);
if (offset == -1ULL) {
mg->mg_failed_allocations++;
metaslab_trace_add(zal, mg, NULL, asize, d,
- TRACE_GROUP_FAILURE);
+ TRACE_GROUP_FAILURE, allocator);
if (asize == SPA_GANGBLOCKSIZE) {
/*
* This metaslab group was unable to allocate
@@ -3083,7 +3327,7 @@ int ditto_same_vdev_distance_shift = 3;
int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
- zio_alloc_list_t *zal)
+ zio_alloc_list_t *zal, int allocator)
{
metaslab_group_t *mg, *rotor;
vdev_t *vd;
@@ -3095,7 +3339,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
* For testing, make some blocks above a certain size be gang blocks.
*/
if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
- metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
+ metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
+ allocator);
return (SET_ERROR(ENOSPC));
}
@@ -3181,12 +3426,12 @@ top:
*/
if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
allocatable = metaslab_group_allocatable(mg, rotor,
- psize);
+ psize, allocator);
}
if (!allocatable) {
metaslab_trace_add(zal, mg, NULL, psize, d,
- TRACE_NOT_ALLOCATABLE);
+ TRACE_NOT_ALLOCATABLE, allocator);
goto next;
}
@@ -3201,7 +3446,7 @@ top:
vd->vdev_state < VDEV_STATE_HEALTHY) &&
d == 0 && !try_hard && vd->vdev_children == 0) {
metaslab_trace_add(zal, mg, NULL, psize, d,
- TRACE_VDEV_ERROR);
+ TRACE_VDEV_ERROR, allocator);
goto next;
}
@@ -3225,7 +3470,7 @@ top:
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
- distance, dva, d);
+ distance, dva, d, allocator);
if (offset != -1ULL) {
/*
@@ -3288,7 +3533,7 @@ next:
bzero(&dva[d], sizeof (dva_t));
- metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
+ metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
return (SET_ERROR(ENOSPC));
}
@@ -3355,7 +3600,7 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
return;
if (spa->spa_vdev_removal != NULL &&
- spa->spa_vdev_removal->svr_vdev == vd &&
+ spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
vdev_is_concrete(vd)) {
/*
* Note: we check if the vdev is concrete because when
@@ -3589,18 +3834,20 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
* the reservation.
*/
boolean_t
-metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
- int flags)
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
+ zio_t *zio, int flags)
{
uint64_t available_slots = 0;
boolean_t slot_reserved = B_FALSE;
+ uint64_t max = mc->mc_alloc_max_slots[allocator];
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
- uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
- if (reserved_slots < mc->mc_alloc_max_slots)
- available_slots = mc->mc_alloc_max_slots - reserved_slots;
+ uint64_t reserved_slots =
+ refcount_count(&mc->mc_alloc_slots[allocator]);
+ if (reserved_slots < max)
+ available_slots = max - reserved_slots;
if (slots <= available_slots || GANG_ALLOCATION(flags)) {
/*
@@ -3608,7 +3855,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
* them individually when an I/O completes.
*/
for (int d = 0; d < slots; d++) {
- reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
+ reserved_slots =
+ refcount_add(&mc->mc_alloc_slots[allocator],
+ zio);
}
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
slot_reserved = B_TRUE;
@@ -3619,12 +3868,14 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
}
void
-metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
+ int allocator, zio_t *zio)
{
ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
for (int d = 0; d < slots; d++) {
- (void) refcount_remove(&mc->mc_alloc_slots, zio);
+ (void) refcount_remove(&mc->mc_alloc_slots[allocator],
+ zio);
}
mutex_exit(&mc->mc_lock);
}
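
Reserve and unreserve are paired per zio and must name the same
allocator so the slots come out of the bucket they went into. A hedged
usage sketch, assuming the zio records its allocator index (spelled
io_allocator here purely for illustration):

	if (metaslab_class_throttle_reserve(mc, ndvas,
	    zio->io_allocator, zio, flags)) {
		/* ... allocation proceeds under the throttle ... */
		metaslab_class_throttle_unreserve(mc, ndvas,
		    zio->io_allocator, zio);
	}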
@@ -3646,7 +3897,13 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
mutex_enter(&msp->ms_lock);
if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+ error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
+ /*
+ * No need to fail in that case; someone else has activated the
+ * metaslab, but that doesn't preclude us from using it.
+ */
+ if (error == EBUSY)
+ error = 0;
if (error == 0 &&
!range_tree_contains(msp->ms_allocatable, offset, size))
@@ -3751,7 +4008,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
- zio_alloc_list_t *zal, zio_t *zio)
+ zio_alloc_list_t *zal, zio_t *zio, int allocator)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
@@ -3774,12 +4031,13 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
for (int d = 0; d < ndvas; d++) {
error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
- txg, flags, zal);
+ txg, flags, zal, allocator);
if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_unalloc_dva(spa, &dva[d], txg);
metaslab_group_alloc_decrement(spa,
- DVA_GET_VDEV(&dva[d]), zio, flags);
+ DVA_GET_VDEV(&dva[d]), zio, flags,
+ allocator, B_FALSE);
bzero(&dva[d], sizeof (dva_t));
}
spa_config_exit(spa, SCL_ALLOC, FTAG);
@@ -3790,7 +4048,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
* based on the newly allocated dva.
*/
metaslab_group_alloc_increment(spa,
- DVA_GET_VDEV(&dva[d]), zio, flags);
+ DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
}
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
index 4ebadace742d..6359b72503ac 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
@@ -491,7 +491,6 @@ range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
static range_seg_t *
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
{
- avl_index_t where;
range_seg_t rsearch;
uint64_t end = start + size;
@@ -499,7 +498,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
rsearch.rs_start = start;
rsearch.rs_end = end;
- return (avl_find(&rt->rt_root, &rsearch, &where));
+ return (avl_find(&rt->rt_root, &rsearch, NULL));
}
range_seg_t *
@@ -651,3 +650,23 @@ range_tree_is_empty(range_tree_t *rt)
ASSERT(rt != NULL);
return (range_tree_space(rt) == 0);
}
+
+uint64_t
+range_tree_min(range_tree_t *rt)
+{
+ range_seg_t *rs = avl_first(&rt->rt_root);
+ return (rs != NULL ? rs->rs_start : 0);
+}
+
+uint64_t
+range_tree_max(range_tree_t *rt)
+{
+ range_seg_t *rs = avl_last(&rt->rt_root);
+ return (rs != NULL ? rs->rs_end : 0);
+}
+
+uint64_t
+range_tree_span(range_tree_t *rt)
+{
+ return (range_tree_max(rt) - range_tree_min(rt));
+}
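
The new helpers make "how wide is the occupied region" a single call.
A hedged usage sketch; note that on an empty tree all three return 0:

	uint64_t span = range_tree_span(rt);	/* max end - min start */
	uint64_t used = range_tree_space(rt);	/* sum of segment sizes */
	if (span != 0 && used < span) {
		/* holes exist between the first and last segment */
	}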
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
index dd6e90c7796b..50f3c0ad822f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
@@ -242,31 +242,23 @@ sa_cache_fini(void)
static int
layout_num_compare(const void *arg1, const void *arg2)
{
- const sa_lot_t *node1 = arg1;
- const sa_lot_t *node2 = arg2;
+ const sa_lot_t *node1 = (const sa_lot_t *)arg1;
+ const sa_lot_t *node2 = (const sa_lot_t *)arg2;
- if (node1->lot_num > node2->lot_num)
- return (1);
- else if (node1->lot_num < node2->lot_num)
- return (-1);
- return (0);
+ return (AVL_CMP(node1->lot_num, node2->lot_num));
}
static int
layout_hash_compare(const void *arg1, const void *arg2)
{
- const sa_lot_t *node1 = arg1;
- const sa_lot_t *node2 = arg2;
+ const sa_lot_t *node1 = (const sa_lot_t *)arg1;
+ const sa_lot_t *node2 = (const sa_lot_t *)arg2;
- if (node1->lot_hash > node2->lot_hash)
- return (1);
- if (node1->lot_hash < node2->lot_hash)
- return (-1);
- if (node1->lot_instance > node2->lot_instance)
- return (1);
- if (node1->lot_instance < node2->lot_instance)
- return (-1);
- return (0);
+ int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash);
+ if (likely(cmp))
+ return (cmp);
+
+ return (AVL_CMP(node1->lot_instance, node2->lot_instance));
}
boolean_t
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index 489956f1857b..c8a635ae54f3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -55,6 +55,7 @@
#include <sys/vdev_removal.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_initialize.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
@@ -443,8 +444,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
dp = spa_get_dsl(spa);
dsl_pool_config_enter(dp, FTAG);
- if (err = dsl_dataset_hold_obj(dp,
- za.za_first_integer, FTAG, &ds)) {
+ err = dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &ds);
+ if (err != 0) {
dsl_pool_config_exit(dp, FTAG);
break;
}
@@ -599,7 +601,8 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
break;
}
- if (error = dmu_objset_hold(strval, FTAG, &os))
+ error = dmu_objset_hold(strval, FTAG, &os);
+ if (error != 0)
break;
/*
@@ -902,19 +905,14 @@ spa_change_guid(spa_t *spa)
static int
spa_error_entry_compare(const void *a, const void *b)
{
- spa_error_entry_t *sa = (spa_error_entry_t *)a;
- spa_error_entry_t *sb = (spa_error_entry_t *)b;
+ const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
+ const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
int ret;
- ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
+ ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
sizeof (zbookmark_phys_t));
- if (ret < 0)
- return (-1);
- else if (ret > 0)
- return (1);
- else
- return (0);
+ return (AVL_ISIGN(ret));
}
/*
@@ -1215,8 +1213,10 @@ spa_activate(spa_t *spa, int mode)
*/
trim_thread_create(spa);
- for (size_t i = 0; i < TXG_SIZE; i++)
- spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0);
+ for (size_t i = 0; i < TXG_SIZE; i++) {
+ spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ }
list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_config_dirty_node));
@@ -1388,6 +1388,11 @@ spa_unload(spa_t *spa)
*/
spa_async_suspend(spa);
+ if (spa->spa_root_vdev) {
+ vdev_initialize_stop_all(spa->spa_root_vdev,
+ VDEV_INITIALIZE_ACTIVE);
+ }
+
/*
* Stop syncing.
*/
@@ -1403,10 +1408,10 @@ spa_unload(spa_t *spa)
* calling taskq_wait(mg_taskq).
*/
if (spa->spa_root_vdev != NULL) {
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
- spa_config_exit(spa, SCL_ALL, FTAG);
+ spa_config_exit(spa, SCL_ALL, spa);
}
/*
@@ -1440,7 +1445,7 @@ spa_unload(spa_t *spa)
bpobj_close(&spa->spa_deferred_bpobj);
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
/*
* Close all vdevs.
@@ -1502,7 +1507,7 @@ spa_unload(spa_t *spa)
spa->spa_comment = NULL;
}
- spa_config_exit(spa, SCL_ALL, FTAG);
+ spa_config_exit(spa, SCL_ALL, spa);
}
/*
@@ -3954,6 +3959,10 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
spa_restart_removal(spa);
spa_spawn_aux_threads(spa);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
}
spa_load_note(spa, "LOADED");
@@ -4362,18 +4371,14 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config)
}
static void
-spa_add_feature_stats(spa_t *spa, nvlist_t *config)
+spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
{
- nvlist_t *features;
zap_cursor_t zc;
zap_attribute_t za;
- ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
- VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
/* We may be unable to read features if pool is suspended. */
if (spa_suspended(spa))
- goto out;
+ return;
if (spa->spa_feat_for_read_obj != 0) {
for (zap_cursor_init(&zc, spa->spa_meta_objset,
@@ -4382,7 +4387,7 @@ spa_add_feature_stats(spa_t *spa, nvlist_t *config)
zap_cursor_advance(&zc)) {
ASSERT(za.za_integer_length == sizeof (uint64_t) &&
za.za_num_integers == 1);
- VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
+ VERIFY0(nvlist_add_uint64(features, za.za_name,
za.za_first_integer));
}
zap_cursor_fini(&zc);
@@ -4395,16 +4400,62 @@ spa_add_feature_stats(spa_t *spa, nvlist_t *config)
zap_cursor_advance(&zc)) {
ASSERT(za.za_integer_length == sizeof (uint64_t) &&
za.za_num_integers == 1);
- VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
+ VERIFY0(nvlist_add_uint64(features, za.za_name,
za.za_first_integer));
}
zap_cursor_fini(&zc);
}
+}
-out:
- VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
- features) == 0);
- nvlist_free(features);
+static void
+spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
+{
+ int i;
+
+ for (i = 0; i < SPA_FEATURES; i++) {
+ zfeature_info_t feature = spa_feature_table[i];
+ uint64_t refcount;
+
+ if (feature_get_refcount(spa, &feature, &refcount) != 0)
+ continue;
+
+ VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
+ }
+}
+
+/*
+ * Store a list of pool features and their reference counts in the
+ * config.
+ *
+ * The first time this is called on a spa, allocate a new nvlist, fetch
+ * the pool features and reference counts from disk, then save the list
+ * in the spa. In subsequent calls on the same spa, use the saved nvlist
+ * and refresh its values from the cached reference counts. This
+ * ensures we don't block here on I/O on a suspended pool so 'zpool
+ * clear' can resume the pool.
+ */
+static void
+spa_add_feature_stats(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t *features;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ mutex_enter(&spa->spa_feat_stats_lock);
+ features = spa->spa_feat_stats;
+
+ if (features != NULL) {
+ spa_feature_stats_from_cache(spa, features);
+ } else {
+ VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
+ spa->spa_feat_stats = features;
+ spa_feature_stats_from_disk(spa, features);
+ }
+
+ VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
+ features));
+
+ mutex_exit(&spa->spa_feat_stats_lock);
}
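
The allocate-once, refresh-thereafter shape of spa_add_feature_stats()
in isolation; lock, cached, and key are placeholders, and
fill_from_disk()/fill_from_cache() stand in for the two helpers above:

	mutex_enter(&lock);
	if (cached == NULL) {
		VERIFY0(nvlist_alloc(&cached, NV_UNIQUE_NAME, KM_SLEEP));
		fill_from_disk(cached);	/* may block on I/O; first call only */
	} else {
		fill_from_cache(cached);	/* in-core refcounts; never blocks */
	}
	VERIFY0(nvlist_add_nvlist(config, key, cached));
	mutex_exit(&lock);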
int
@@ -5675,6 +5726,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
* in which case we can modify its state.
*/
if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
+
/*
* Objsets may be open only because they're dirty, so we
* have to force it to sync before checking spa_refcnt.
@@ -5709,6 +5761,18 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
}
/*
+ * We're about to export or destroy this pool. Make sure
+	 * we stop all initialization activity here before we
+ * set the spa_final_txg. This will ensure that all
+ * dirty data resulting from the initialization is
+ * committed to disk before we unload the pool.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ vdev_initialize_stop_all(spa->spa_root_vdev,
+ VDEV_INITIALIZE_ACTIVE);
+ }
+
+ /*
* We want this to be reflected on every label,
* so mark them all dirty. spa_unload() will do the
* final sync that pushes these changes out.
@@ -5837,8 +5901,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
for (int c = 0; c < vd->vdev_children; c++) {
tvd = vd->vdev_child[c];
if (spa->spa_vdev_removal != NULL &&
- tvd->vdev_ashift !=
- spa->spa_vdev_removal->svr_vdev->vdev_ashift) {
+ tvd->vdev_ashift != spa->spa_max_ashift) {
return (spa_vdev_exit(spa, vd, txg, EINVAL));
}
/* Fail if top level vdev is raidz */
@@ -5954,10 +6017,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
return (spa_vdev_exit(spa, NULL, txg, error));
}
- if (spa->spa_vdev_removal != NULL ||
- spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+ if (spa->spa_vdev_removal != NULL)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
- }
if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
@@ -6401,6 +6462,86 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
return (error);
}
+int
+spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
+{
+ /*
+ * We hold the namespace lock through the whole function
+ * to prevent any changes to the pool while we're starting or
+ * stopping initialization. The config and state locks are held so that
+ * we can properly assess the vdev state before we commit to
+ * the initializing operation.
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+ /* Look up vdev and ensure it's a leaf. */
+ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_detached) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENODEV));
+ } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EINVAL));
+ } else if (!vdev_writeable(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EROFS));
+ }
+ mutex_enter(&vd->vdev_initialize_lock);
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+ /*
+ * When we activate an initialize action we check to see
+ * if the vdev_initialize_thread is NULL. We do this instead
+ * of using the vdev_initialize_state since there might be
+	 * a previous initialization process which has completed but
+	 * whose thread has not yet exited.
+ */
+ if (cmd_type == POOL_INITIALIZE_DO &&
+ (vd->vdev_initialize_thread != NULL ||
+ vd->vdev_top->vdev_removing)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EBUSY));
+ } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
+ (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ESRCH));
+ } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ESRCH));
+ }
+
+ switch (cmd_type) {
+ case POOL_INITIALIZE_DO:
+ vdev_initialize(vd);
+ break;
+ case POOL_INITIALIZE_CANCEL:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+ break;
+ case POOL_INITIALIZE_SUSPEND:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED);
+ break;
+ default:
+ panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ /* Sync out the initializing state */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
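
A hedged caller's-eye sketch of the new entry point; the error codes
match the checks above (EBUSY for an already-running thread, ESRCH
when there is nothing to cancel or suspend):

	int error = spa_vdev_initialize(spa, guid, POOL_INITIALIZE_DO);
	if (error == EBUSY) {
		/* an initialize thread already exists for this vdev */
	}
	/* later, e.g. from an ioctl handler: */
	error = spa_vdev_initialize(spa, guid, POOL_INITIALIZE_SUSPEND);
	if (error == ESRCH) {
		/* no active initialization to suspend */
	}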
+
+
/*
* Split a set of devices from their mirrors, and create a new pool from them.
*/
@@ -6608,6 +6749,19 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
spa_activate(newspa, spa_mode_global);
spa_async_suspend(newspa);
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL) {
+ /*
+ * Temporarily stop the initializing activity. We set
+ * the state to ACTIVE so that we know to resume
+ * the initializing once the split has completed.
+ */
+ mutex_enter(&vml[c]->vdev_initialize_lock);
+ vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE);
+ mutex_exit(&vml[c]->vdev_initialize_lock);
+ }
+ }
+
#ifndef illumos
/* mark that we are creating new spa by splitting */
newspa->spa_splitting_newspa = B_TRUE;
@@ -6702,6 +6856,10 @@ out:
if (vml[c] != NULL)
vml[c]->vdev_offline = B_FALSE;
}
+
+ /* restart initializing disks as necessary */
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+
vdev_reopen(spa->spa_root_vdev);
nvlist_free(spa->spa_config_splitting);
@@ -7066,6 +7224,14 @@ spa_async_thread(void *arg)
if (tasks & SPA_ASYNC_RESILVER)
dsl_resilver_restart(spa->spa_dsl_pool, 0);
+ if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
/*
* Let the world know that we're done.
*/
@@ -7765,8 +7931,9 @@ spa_sync(spa_t *spa, uint64_t txg)
* Wait for i/os issued in open context that need to complete
* before this txg syncs.
*/
- VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK]));
- spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0);
+ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
+ spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
/*
* Lock out configuration changes.
@@ -7776,9 +7943,11 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_syncing_txg = txg;
spa->spa_sync_pass = 0;
- mutex_enter(&spa->spa_alloc_lock);
- VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
- mutex_exit(&spa->spa_alloc_lock);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
/*
* If there are any pending vdev state changes, convert them
@@ -7844,7 +8013,7 @@ spa_sync(spa_t *spa, uint64_t txg)
* The max queue depth will not change in the middle of syncing
* out this txg.
*/
- uint64_t queue_depth_total = 0;
+ uint64_t slots_per_allocator = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;
@@ -7858,18 +8027,23 @@ spa_sync(spa_t *spa, uint64_t txg)
* allocations look at mg_max_alloc_queue_depth, and async
* allocations all happen from spa_sync().
*/
- ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
+ for (int i = 0; i < spa->spa_alloc_count; i++)
+ ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i])));
mg->mg_max_alloc_queue_depth = max_queue_depth;
- queue_depth_total += mg->mg_max_alloc_queue_depth;
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mg->mg_cur_max_alloc_queue_depth[i] =
+ zfs_vdev_def_queue_depth;
+ }
+ slots_per_allocator += zfs_vdev_def_queue_depth;
}
metaslab_class_t *mc = spa_normal_class(spa);
- ASSERT0(refcount_count(&mc->mc_alloc_slots));
- mc->mc_alloc_max_slots = queue_depth_total;
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ ASSERT0(refcount_count(&mc->mc_alloc_slots[i]));
+ mc->mc_alloc_max_slots[i] = slots_per_allocator;
+ }
mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
- ASSERT3U(mc->mc_alloc_max_slots, <=,
- max_queue_depth * rvd->vdev_children);
-
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *vd = rvd->vdev_child[c];
vdev_indirect_state_sync_verify(vd);
@@ -8052,14 +8226,17 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_sync_done(dp, txg);
- mutex_enter(&spa->spa_alloc_lock);
- VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
- mutex_exit(&spa->spa_alloc_lock);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
/*
* Update usable space statistics.
*/
- while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ != NULL)
vdev_sync_done(vd, txg);
spa_update_dspace(spa);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
index a4af48d8c58b..db0d2caa6107 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
@@ -203,13 +203,12 @@ typedef struct spa_checkpoint_discard_sync_callback_arg {
} spa_checkpoint_discard_sync_callback_arg_t;
static int
-spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
- uint64_t size, void *arg)
+spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
{
spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
vdev_t *vd = sdc->sdc_vd;
- metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
- uint64_t end = offset + size;
+ metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+ uint64_t end = sme->sme_offset + sme->sme_run;
if (sdc->sdc_entry_limit == 0)
return (EINTR);
@@ -224,8 +223,8 @@ spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
* metaslab boundaries. So if needed we could add code
* that handles metaslab-crossing segments in the future.
*/
- VERIFY3U(type, ==, SM_FREE);
- VERIFY3U(offset, >=, ms->ms_start);
+ VERIFY3U(sme->sme_type, ==, SM_FREE);
+ VERIFY3U(sme->sme_offset, >=, ms->ms_start);
VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
/*
@@ -237,14 +236,15 @@ spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
mutex_enter(&ms->ms_lock);
if (range_tree_is_empty(ms->ms_freeing))
vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
- range_tree_add(ms->ms_freeing, offset, size);
+ range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
mutex_exit(&ms->ms_lock);
- ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size);
- ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size);
+ ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
+ sme->sme_run);
+ ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
- vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size;
- vd->vdev_stat.vs_checkpoint_space -= size;
+ vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
+ vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
sdc->sdc_entry_limit--;
return (0);
@@ -289,12 +289,13 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
* Thus, we set the maximum entries that the space map callback
* will be applied to be half the entries that could fit in the
* imposed memory limit.
+ *
+ * Note that since this is a conservative estimate we also
+ * assume the worst case scenario in our computation where each
+	 * entry is two words.
*/
uint64_t max_entry_limit =
- (zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1;
-
- uint64_t entries_in_sm =
- space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
+ (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
/*
* Iterate from the end of the space map towards the beginning,
@@ -318,14 +319,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
spa_checkpoint_discard_sync_callback_arg_t sdc;
sdc.sdc_vd = vd;
sdc.sdc_txg = tx->tx_txg;
- sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit);
+ sdc.sdc_entry_limit = max_entry_limit;
- uint64_t entries_before = entries_in_sm;
+ uint64_t words_before =
+ space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
spa_checkpoint_discard_sync_callback, &sdc, tx);
- uint64_t entries_after =
+ uint64_t words_after =
space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
#ifdef DEBUG
@@ -333,9 +335,9 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
#endif
zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, "
- "deleted %llu entries - %llu entries are left",
- tx->tx_txg, vd->vdev_id, (entries_before - entries_after),
- entries_after);
+ "deleted %llu words - %llu words are left",
+ tx->tx_txg, vd->vdev_id, (words_before - words_after),
+ words_after);
if (error != EINTR) {
if (error != 0) {
@@ -344,15 +346,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
"space map of vdev %llu\n",
error, vd->vdev_id);
}
- ASSERT0(entries_after);
+ ASSERT0(words_after);
ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
- ASSERT0(vd->vdev_checkpoint_sm->sm_length);
+ ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
space_map_free(vd->vdev_checkpoint_sm, tx);
space_map_close(vd->vdev_checkpoint_sm);
vd->vdev_checkpoint_sm = NULL;
- VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset,
+ VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
}
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
index ff1fcb4f0b21..6865dcff2212 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2017 Joyent, Inc.
*/
@@ -559,6 +559,18 @@ spa_config_update(spa_t *spa, int what)
*/
for (c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
+
+ /*
+ * Explicitly skip vdevs that are indirect or
+ * log vdevs that are being removed. The reason
+ * is that both of those can have vdev_ms_array
+ * set to 0 and we wouldn't want to change their
+ * metaslab size nor call vdev_expand() on them.
+ */
+ if (!vdev_is_concrete(tvd) ||
+ (tvd->vdev_islog && tvd->vdev_removing))
+ continue;
+
if (tvd->vdev_ms_array == 0) {
vdev_ashift_optimize(tvd);
vdev_metaslab_set_size(tvd);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
index aaa3c310f1e8..2ceed8dd8040 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -41,6 +41,7 @@
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
+#include <sys/vdev_initialize.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
@@ -252,7 +253,7 @@ int spa_mode_global;
* Everything except dprintf, spa, and indirect_remap is on by default
* in debug builds.
*/
-int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA | ZFS_DEBUG_INDIRECT_REMAP);
+int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_INDIRECT_REMAP);
#else
int zfs_flags = 0;
#endif
@@ -434,6 +435,8 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN,
&spa_min_slop, 0,
"Minimal value of reserved space");
+int spa_allocators = 4;
+
/*PRINTFLIKE2*/
void
spa_load_failed(spa_t *spa, const char *fmt, ...)
@@ -705,7 +708,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
@@ -779,8 +782,16 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa_active_count++;
}
- avl_create(&spa->spa_alloc_tree, zio_bookmark_compare,
- sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+ spa->spa_alloc_count = spa_allocators;
+ spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (kmutex_t), KM_SLEEP);
+ spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (avl_tree_t), KM_SLEEP);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
+ sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+ }
/*
* Every pool starts with the default cachefile
@@ -812,8 +823,6 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
KM_SLEEP) == 0);
}
- spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
-
spa->spa_min_ashift = INT_MAX;
spa->spa_max_ashift = 0;
@@ -860,11 +869,20 @@ spa_remove(spa_t *spa)
kmem_free(dp, sizeof (spa_config_dirent_t));
}
- avl_destroy(&spa->spa_alloc_tree);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ avl_destroy(&spa->spa_alloc_trees[i]);
+ mutex_destroy(&spa->spa_alloc_locks[i]);
+ }
+ kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
+ sizeof (kmutex_t));
+ kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
+ sizeof (avl_tree_t));
+
list_destroy(&spa->spa_config_list);
nvlist_free(spa->spa_label_features);
nvlist_free(spa->spa_load_info);
+ nvlist_free(spa->spa_feat_stats);
spa_config_set(spa, NULL);
#ifdef illumos
@@ -895,7 +913,6 @@ spa_remove(spa_t *spa)
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);
- mutex_destroy(&spa->spa_alloc_lock);
mutex_destroy(&spa->spa_async_lock);
mutex_destroy(&spa->spa_errlist_lock);
mutex_destroy(&spa->spa_errlog_lock);
@@ -907,6 +924,7 @@ spa_remove(spa_t *spa)
mutex_destroy(&spa->spa_scrub_lock);
mutex_destroy(&spa->spa_suspend_lock);
mutex_destroy(&spa->spa_vdev_top_lock);
+ mutex_destroy(&spa->spa_feat_stats_lock);
kmem_free(spa, sizeof (spa_t));
}
@@ -1001,18 +1019,13 @@ typedef struct spa_aux {
int aux_count;
} spa_aux_t;
-static int
+static inline int
spa_aux_compare(const void *a, const void *b)
{
- const spa_aux_t *sa = a;
- const spa_aux_t *sb = b;
+ const spa_aux_t *sa = (const spa_aux_t *)a;
+ const spa_aux_t *sb = (const spa_aux_t *)b;
- if (sa->aux_guid < sb->aux_guid)
- return (-1);
- else if (sa->aux_guid > sb->aux_guid)
- return (1);
- else
- return (0);
+ return (AVL_CMP(sa->aux_guid, sb->aux_guid));
}
void
@@ -1299,6 +1312,12 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
if (vd != NULL) {
ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
+ if (vd->vdev_ops->vdev_op_leaf) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
vdev_free(vd);
spa_config_exit(spa, SCL_ALL, spa);
@@ -1862,9 +1881,12 @@ spa_update_dspace(spa_t *spa)
* allocated twice (on the old device and the new
* device).
*/
- vdev_t *vd = spa->spa_vdev_removal->svr_vdev;
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vdev_t *vd =
+ vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
spa->spa_dspace -= spa_deflate(spa) ?
vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+ spa_config_exit(spa, SCL_VDEV, FTAG);
}
}
@@ -2009,6 +2031,12 @@ bp_get_dsize(spa_t *spa, const blkptr_t *bp)
return (dsize);
}
+uint64_t
+spa_dirty_data(spa_t *spa)
+{
+ return (spa->spa_dsl_pool->dp_dirty_total);
+}
+
/*
* ==========================================================================
* Initialization and Termination
@@ -2023,11 +2051,8 @@ spa_name_compare(const void *a1, const void *a2)
int s;
s = strcmp(s1->spa_name, s2->spa_name);
- if (s > 0)
- return (1);
- if (s < 0)
- return (-1);
- return (0);
+
+ return (AVL_ISIGN(s));
}
int
@@ -2261,12 +2286,6 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
return (0);
}
-boolean_t
-spa_debug_enabled(spa_t *spa)
-{
- return (spa->spa_debug);
-}
-
int
spa_maxblocksize(spa_t *spa)
{
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
index 2f15c5185c57..7356e3ceea75 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
@@ -43,68 +43,205 @@ SYSCTL_DECL(_vfs_zfs);
* Note on space map block size:
*
* The data for a given space map can be kept on blocks of any size.
- * Larger blocks entail fewer i/o operations, but they also cause the
- * DMU to keep more data in-core, and also to waste more i/o bandwidth
+ * Larger blocks entail fewer I/O operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more I/O bandwidth
* when only a few blocks have changed since the last transaction group.
*/
/*
+ * Enabled whenever we want to stress test the use of double-word
+ * space map entries.
+ */
+boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;
+
+/*
+ * Override the default indirect block size of 128K, instead using 16K for
+ * spacemaps (2^14 bytes). This dramatically reduces write inflation since
+ * appending to a spacemap typically has to write one data block (4KB) and one
+ * or two indirect blocks (16K-32K, rather than 128K).
+ */
+int space_map_ibs = 14;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN,
+ &space_map_ibs, 0, "Space map indirect block shift");
+
+boolean_t
+sm_entry_is_debug(uint64_t e)
+{
+ return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
+}
+
+boolean_t
+sm_entry_is_single_word(uint64_t e)
+{
+ uint8_t prefix = SM_PREFIX_DECODE(e);
+ return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
+}
+
+boolean_t
+sm_entry_is_double_word(uint64_t e)
+{
+ return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
+}
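
A hedged sketch of dispatching on the entry prefix with the predicates
above; the SM_*/SM2_* decode macros are the ones used by
space_map_iterate() below, and words/i are local to this illustration:

	uint64_t e = words[i];
	if (sm_entry_is_debug(e)) {
		i += 1;			/* debug entries carry no range */
	} else if (sm_entry_is_single_word(e)) {
		maptype_t type = SM_TYPE_DECODE(e);
		uint64_t off = SM_OFFSET_DECODE(e);
		uint64_t run = SM_RUN_DECODE(e);
		i += 1;			/* vdev is implicitly SM_NO_VDEVID */
	} else {
		ASSERT(sm_entry_is_double_word(e));
		uint64_t run = SM2_RUN_DECODE(e);
		uint64_t vdev = SM2_VDEV_DECODE(e);
		uint64_t e2 = words[i + 1];	/* type/offset in word two */
		maptype_t type = SM2_TYPE_DECODE(e2);
		uint64_t off = SM2_OFFSET_DECODE(e2);
		i += 2;
	}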
+
+/*
* Iterate through the space map, invoking the callback on each (non-debug)
* space map entry.
*/
int
space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
{
- uint64_t *entry, *entry_map, *entry_map_end;
- uint64_t bufsize, size, offset, end;
+ uint64_t sm_len = space_map_length(sm);
+ ASSERT3U(sm->sm_blksz, !=, 0);
+
+ dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
+ ZIO_PRIORITY_SYNC_READ);
+
+ uint64_t blksz = sm->sm_blksz;
int error = 0;
+ for (uint64_t block_base = 0; block_base < sm_len && error == 0;
+ block_base += blksz) {
+ dmu_buf_t *db;
+ error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
+ block_base, FTAG, &db, DMU_READ_PREFETCH);
+ if (error != 0)
+ return (error);
- end = space_map_length(sm);
+ uint64_t *block_start = db->db_data;
+ uint64_t block_length = MIN(sm_len - block_base, blksz);
+ uint64_t *block_end = block_start +
+ (block_length / sizeof (uint64_t));
- bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
- entry_map = zio_buf_alloc(bufsize);
+ VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
+ VERIFY3U(block_length, !=, 0);
+ ASSERT3U(blksz, ==, db->db_size);
- if (end > bufsize) {
- dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
- end - bufsize, ZIO_PRIORITY_SYNC_READ);
- }
+ for (uint64_t *block_cursor = block_start;
+ block_cursor < block_end && error == 0; block_cursor++) {
+ uint64_t e = *block_cursor;
+
+ if (sm_entry_is_debug(e)) /* Skip debug entries */
+ continue;
- for (offset = 0; offset < end && error == 0; offset += bufsize) {
- size = MIN(end - offset, bufsize);
- VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
- VERIFY(size != 0);
- ASSERT3U(sm->sm_blksz, !=, 0);
+ uint64_t raw_offset, raw_run, vdev_id;
+ maptype_t type;
+ if (sm_entry_is_single_word(e)) {
+ type = SM_TYPE_DECODE(e);
+ vdev_id = SM_NO_VDEVID;
+ raw_offset = SM_OFFSET_DECODE(e);
+ raw_run = SM_RUN_DECODE(e);
+ } else {
+ /* it is a two-word entry */
+ ASSERT(sm_entry_is_double_word(e));
+ raw_run = SM2_RUN_DECODE(e);
+ vdev_id = SM2_VDEV_DECODE(e);
+
+ /* move on to the second word */
+ block_cursor++;
+ e = *block_cursor;
+ VERIFY3P(block_cursor, <=, block_end);
+
+ type = SM2_TYPE_DECODE(e);
+ raw_offset = SM2_OFFSET_DECODE(e);
+ }
- dprintf("object=%llu offset=%llx size=%llx\n",
- space_map_object(sm), offset, size);
+ uint64_t entry_offset = (raw_offset << sm->sm_shift) +
+ sm->sm_start;
+ uint64_t entry_run = raw_run << sm->sm_shift;
- error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
- entry_map, DMU_READ_PREFETCH);
- if (error != 0)
- break;
+ VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
+ VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
+ ASSERT3U(entry_offset, >=, sm->sm_start);
+ ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
+ ASSERT3U(entry_run, <=, sm->sm_size);
+ ASSERT3U(entry_offset + entry_run, <=,
+ sm->sm_start + sm->sm_size);
- entry_map_end = entry_map + (size / sizeof (uint64_t));
- for (entry = entry_map; entry < entry_map_end && error == 0;
- entry++) {
- uint64_t e = *entry;
- uint64_t offset, size;
+ space_map_entry_t sme = {
+ .sme_type = type,
+ .sme_vdev = vdev_id,
+ .sme_offset = entry_offset,
+ .sme_run = entry_run
+ };
+ error = callback(&sme, arg);
+ }
+ dmu_buf_rele(db, FTAG);
+ }
+ return (error);
+}
- if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
- continue;
+/*
+ * Reads the entries from the last block of the space map into
+ * buf in reverse order. Populates nwords with the number of words
+ * in the last block.
+ *
+ * Refer to block comment within space_map_incremental_destroy()
+ * to understand why this function is needed.
+ */
+static int
+space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
+ uint64_t bufsz, uint64_t *nwords)
+{
+ int error = 0;
+ dmu_buf_t *db;
- offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
- sm->sm_start;
- size = SM_RUN_DECODE(e) << sm->sm_shift;
+ /*
+ * Find the offset of the last word in the space map and use
+ * that to read the last block of the space map with
+ * dmu_buf_hold().
+ */
+ uint64_t last_word_offset =
+ sm->sm_phys->smp_objsize - sizeof (uint64_t);
+ error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
+ FTAG, &db, DMU_READ_NO_PREFETCH);
+ if (error != 0)
+ return (error);
- VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
- VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
- VERIFY3U(offset, >=, sm->sm_start);
- VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
- error = callback(SM_TYPE_DECODE(e), offset, size, arg);
+ ASSERT3U(sm->sm_object, ==, db->db_object);
+ ASSERT3U(sm->sm_blksz, ==, db->db_size);
+ ASSERT3U(bufsz, >=, db->db_size);
+ ASSERT(nwords != NULL);
+
+ uint64_t *words = db->db_data;
+ *nwords =
+ (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
+
+ ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
+
+ uint64_t n = *nwords;
+ uint64_t j = n - 1;
+ for (uint64_t i = 0; i < n; i++) {
+ uint64_t entry = words[i];
+ if (sm_entry_is_double_word(entry)) {
+ /*
+ * Since we are populating the buffer backwards
+ * we have to be extra careful and add the two
+ * words of the double-word entry in the right
+ * order.
+ */
+ ASSERT3U(j, >, 0);
+ buf[j - 1] = entry;
+
+ i++;
+ ASSERT3U(i, <, n);
+ entry = words[i];
+ buf[j] = entry;
+ j -= 2;
+ } else {
+ ASSERT(sm_entry_is_debug(entry) ||
+ sm_entry_is_single_word(entry));
+ buf[j] = entry;
+ j--;
}
}
- zio_buf_free(entry_map, bufsize);
+ /*
+ * Assert that we wrote backwards all the
+ * way to the beginning of the buffer.
+ */
+ ASSERT3S(j, ==, -1);
+
+ dmu_buf_rele(db, FTAG);
return (error);
}
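
A worked example of the reversal above: if the last block holds the
words [ S0, D0, D1, S1 ], where S0 and S1 are single-word entries and
(D0, D1) is one double-word entry, the buffer comes back as
[ S1, D0, D1, S0 ]. Entry order is reversed, but the two words of the
double-word entry keep their relative order, so each entry can still
be decoded front-to-back.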
@@ -118,124 +255,122 @@ int
space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
dmu_tx_t *tx)
{
- uint64_t bufsize, len;
- uint64_t *entry_map;
- int error = 0;
-
- len = space_map_length(sm);
- bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
- entry_map = zio_buf_alloc(bufsize);
+ uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
+ uint64_t *buf = zio_buf_alloc(bufsz);
dmu_buf_will_dirty(sm->sm_dbuf, tx);
/*
- * Since we can't move the starting offset of the space map
- * (e.g there are reference on-disk pointing to it), we destroy
- * its entries incrementally starting from the end.
+ * Ideally we would want to iterate from the beginning of the
+ * space map to the end in incremental steps. The issue with this
+ * approach is that we don't have any field on-disk that points
+ * us where to start between each step. We could try zeroing out
+ * entries that we've destroyed, but this doesn't work either as
+ * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
+ *
+ * As a result, we destroy its entries incrementally starting from
+ * the end after applying the callback to each of them.
*
- * The logic that follows is basically the same as the one used
- * in space_map_iterate() but it traverses the space map
- * backwards:
+ * The problem with this approach is that we cannot literally
+ * iterate through the words in the space map backwards as we
+ * can't distinguish two-word space map entries from their second
+ * word. Thus we do the following:
*
- * 1] We figure out the size of the buffer that we want to use
- * to read the on-disk space map entries.
- * 2] We figure out the offset at the end of the space map where
- * we will start reading entries into our buffer.
- * 3] We read the on-disk entries into the buffer.
- * 4] We iterate over the entries from end to beginning calling
- * the callback function on each one. As we move from entry
- * to entry we decrease the size of the space map, deleting
- * effectively each entry.
- * 5] If there are no more entries in the space map or the
- * callback returns a value other than 0, we stop iterating
- * over the space map. If there are entries remaining and
- * the callback returned zero we go back to step [1].
+ * 1] We get all the entries from the last block of the space map
+ * and put them into a buffer in reverse order. This way the
+ * last entry comes first in the buffer, the second to last is
+ * second, etc.
+	 * 2] We iterate through the entries in the buffer and apply
+	 *    the callback to each one. As we move from entry to entry
+	 *    we decrease the size of the space map, effectively
+	 *    deleting each entry.
+ * 3] If there are no more entries in the space map or the callback
+ * returns a value other than 0, we stop iterating over the
+ * space map. If there are entries remaining and the callback
+ * returned 0, we go back to step [1].
*/
- uint64_t offset = 0, size = 0;
- while (len > 0 && error == 0) {
- size = MIN(bufsize, len);
-
- VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
- VERIFY3U(size, >, 0);
- ASSERT3U(sm->sm_blksz, !=, 0);
-
- offset = len - size;
-
- IMPLY(bufsize > len, offset == 0);
- IMPLY(bufsize == len, offset == 0);
- IMPLY(bufsize < len, offset > 0);
-
-
- EQUIV(size == len, offset == 0);
- IMPLY(size < len, bufsize < len);
-
- dprintf("object=%llu offset=%llx size=%llx\n",
- space_map_object(sm), offset, size);
-
- error = dmu_read(sm->sm_os, space_map_object(sm),
- offset, size, entry_map, DMU_READ_PREFETCH);
+ int error = 0;
+ while (space_map_length(sm) > 0 && error == 0) {
+ uint64_t nwords = 0;
+ error = space_map_reversed_last_block_entries(sm, buf, bufsz,
+ &nwords);
if (error != 0)
break;
- uint64_t num_entries = size / sizeof (uint64_t);
-
- ASSERT3U(num_entries, >, 0);
-
- while (num_entries > 0) {
- uint64_t e, entry_offset, entry_size;
- maptype_t type;
-
- e = entry_map[num_entries - 1];
+ ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));
- ASSERT3U(num_entries, >, 0);
- ASSERT0(error);
+ for (uint64_t i = 0; i < nwords; i++) {
+ uint64_t e = buf[i];
- if (SM_DEBUG_DECODE(e)) {
+ if (sm_entry_is_debug(e)) {
sm->sm_phys->smp_objsize -= sizeof (uint64_t);
space_map_update(sm);
- len -= sizeof (uint64_t);
- num_entries--;
continue;
}
- type = SM_TYPE_DECODE(e);
- entry_offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
- sm->sm_start;
- entry_size = SM_RUN_DECODE(e) << sm->sm_shift;
+ int words = 1;
+ uint64_t raw_offset, raw_run, vdev_id;
+ maptype_t type;
+ if (sm_entry_is_single_word(e)) {
+ type = SM_TYPE_DECODE(e);
+ vdev_id = SM_NO_VDEVID;
+ raw_offset = SM_OFFSET_DECODE(e);
+ raw_run = SM_RUN_DECODE(e);
+ } else {
+ ASSERT(sm_entry_is_double_word(e));
+ words = 2;
+
+ raw_run = SM2_RUN_DECODE(e);
+ vdev_id = SM2_VDEV_DECODE(e);
+
+ /* move to the second word */
+ i++;
+ e = buf[i];
+
+ ASSERT3P(i, <=, nwords);
+
+ type = SM2_TYPE_DECODE(e);
+ raw_offset = SM2_OFFSET_DECODE(e);
+ }
+
+ uint64_t entry_offset =
+ (raw_offset << sm->sm_shift) + sm->sm_start;
+ uint64_t entry_run = raw_run << sm->sm_shift;
VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
- VERIFY0(P2PHASE(entry_size, 1ULL << sm->sm_shift));
+ VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
VERIFY3U(entry_offset, >=, sm->sm_start);
- VERIFY3U(entry_offset + entry_size, <=,
+ VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
+ VERIFY3U(entry_run, <=, sm->sm_size);
+ VERIFY3U(entry_offset + entry_run, <=,
sm->sm_start + sm->sm_size);
- error = callback(type, entry_offset, entry_size, arg);
+ space_map_entry_t sme = {
+ .sme_type = type,
+ .sme_vdev = vdev_id,
+ .sme_offset = entry_offset,
+ .sme_run = entry_run
+ };
+ error = callback(&sme, arg);
if (error != 0)
break;
if (type == SM_ALLOC)
- sm->sm_phys->smp_alloc -= entry_size;
+ sm->sm_phys->smp_alloc -= entry_run;
else
- sm->sm_phys->smp_alloc += entry_size;
-
- sm->sm_phys->smp_objsize -= sizeof (uint64_t);
+ sm->sm_phys->smp_alloc += entry_run;
+ sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
space_map_update(sm);
- len -= sizeof (uint64_t);
- num_entries--;
}
- IMPLY(error == 0, num_entries == 0);
- EQUIV(offset == 0 && error == 0, len == 0 && num_entries == 0);
}
- if (len == 0) {
+ if (space_map_length(sm) == 0) {
ASSERT0(error);
- ASSERT0(offset);
- ASSERT0(sm->sm_length);
ASSERT0(sm->sm_phys->smp_objsize);
ASSERT0(sm->sm_alloc);
}
- zio_buf_free(entry_map, bufsize);
+ zio_buf_free(buf, bufsz);
return (error);
}
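
For a sense of how the new callback signature is consumed, a minimal
sm_cb_t implementation might look like the sketch below (illustrative
only; the accumulator passed through "arg" is an assumption, not part
of this change):

	/*
	 * Minimal sm_cb_t sketch: tally the net allocated space from
	 * decoded entries.  Debug entries are skipped by the iterator,
	 * as in the loop above, so they never reach the callback.
	 */
	static int
	net_alloc_cb(space_map_entry_t *sme, void *arg)
	{
		int64_t *net = arg;	/* hypothetical accumulator */

		if (sme->sme_type == SM_ALLOC)
			*net += sme->sme_run;
		else
			*net -= sme->sme_run;

		return (0);	/* a nonzero return stops the iteration */
	}

A call such as space_map_iterate(sm, net_alloc_cb, &net) would then
visit every entry in the map.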
@@ -246,16 +381,15 @@ typedef struct space_map_load_arg {
} space_map_load_arg_t;
static int
-space_map_load_callback(maptype_t type, uint64_t offset, uint64_t size,
- void *arg)
+space_map_load_callback(space_map_entry_t *sme, void *arg)
{
space_map_load_arg_t *smla = arg;
- if (type == smla->smla_type) {
- VERIFY3U(range_tree_space(smla->smla_rt) + size, <=,
+ if (sme->sme_type == smla->smla_type) {
+ VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
smla->smla_sm->sm_size);
- range_tree_add(smla->smla_rt, offset, size);
+ range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
} else {
- range_tree_remove(smla->smla_rt, offset, size);
+ range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
}
return (0);
@@ -367,43 +501,239 @@ space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
}
}
-uint64_t
-space_map_entries(space_map_t *sm, range_tree_t *rt)
+static void
+space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
{
- avl_tree_t *t = &rt->rt_root;
- range_seg_t *rs;
- uint64_t size, entries;
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
+ SM_DEBUG_ACTION_ENCODE(maptype) |
+ SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
+ SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
+
+ dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
+ sizeof (dentry), &dentry, tx);
+
+ sm->sm_phys->smp_objsize += sizeof (dentry);
+}
+
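
For reference, the fields packed above can be pulled apart again with
the matching decode macros; a small sketch, not part of the patch:

	/* Sketch: decoding a debug entry like the one written above. */
	if (sm_entry_is_debug(dentry)) {
		dprintf("debug entry: action=%llu syncpass=%llu txg=%llu\n",
		    SM_DEBUG_ACTION_DECODE(dentry),
		    SM_DEBUG_SYNCPASS_DECODE(dentry),
		    SM_DEBUG_TXG_DECODE(dentry));
	}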
+/*
+ * Writes one or more entries given a segment.
+ *
+ * Note: The function may release the dbuf from the pointer initially
+ * passed to it, and return a different dbuf. Also, the space map's
+ * dbuf must be dirty for the changes in sm_phys to take effect.
+ */
+static void
+space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
+ uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx)
+{
+ ASSERT3U(words, !=, 0);
+ ASSERT3U(words, <=, 2);
+
+ /* ensure the vdev_id can be represented by the space map */
+ ASSERT3U(vdev_id, <=, SM_NO_VDEVID);
+
+ /*
+ * If this is a single-word entry, ensure that no vdev was
+ * specified.
+ */
+ IMPLY(words == 1, vdev_id == SM_NO_VDEVID);
+
+ dmu_buf_t *db = *dbp;
+ ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+ uint64_t *block_base = db->db_data;
+ uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
+ uint64_t *block_cursor = block_base +
+ (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
+
+ ASSERT3P(block_cursor, <=, block_end);
+
+ uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+ uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
+ uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;
+
+ ASSERT3U(rs->rs_start, >=, sm->sm_start);
+ ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size);
+ ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size);
+ ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size);
+
+ while (size != 0) {
+ ASSERT3P(block_cursor, <=, block_end);
+
+ /*
+ * If we are at the end of this block, flush it and start
+ * writing again from the beginning.
+ */
+ if (block_cursor == block_end) {
+ dmu_buf_rele(db, tag);
+
+ uint64_t next_word_offset = sm->sm_phys->smp_objsize;
+ VERIFY0(dmu_buf_hold(sm->sm_os,
+ space_map_object(sm), next_word_offset,
+ tag, &db, DMU_READ_PREFETCH));
+ dmu_buf_will_dirty(db, tx);
+
+ /* update caller's dbuf */
+ *dbp = db;
+
+ ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+ block_base = db->db_data;
+ block_cursor = block_base;
+ block_end = block_base +
+ (db->db_size / sizeof (uint64_t));
+ }
+
+ /*
+ * If we are writing a two-word entry and we only have one
+ * word left on this block, just pad it with an empty debug
+ * entry and write the two-word entry in the next block.
+ */
+ uint64_t *next_entry = block_cursor + 1;
+ if (next_entry == block_end && words > 1) {
+ ASSERT3U(words, ==, 2);
+ *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
+ SM_DEBUG_ACTION_ENCODE(0) |
+ SM_DEBUG_SYNCPASS_ENCODE(0) |
+ SM_DEBUG_TXG_ENCODE(0);
+ block_cursor++;
+ sm->sm_phys->smp_objsize += sizeof (uint64_t);
+ ASSERT3P(block_cursor, ==, block_end);
+ continue;
+ }
+
+ uint64_t run_len = MIN(size, run_max);
+ switch (words) {
+ case 1:
+ *block_cursor = SM_OFFSET_ENCODE(start) |
+ SM_TYPE_ENCODE(maptype) |
+ SM_RUN_ENCODE(run_len);
+ block_cursor++;
+ break;
+ case 2:
+ /* write the first word of the entry */
+ *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
+ SM2_RUN_ENCODE(run_len) |
+ SM2_VDEV_ENCODE(vdev_id);
+ block_cursor++;
+
+ /* move on to the second word of the entry */
+ ASSERT3P(block_cursor, <, block_end);
+ *block_cursor = SM2_TYPE_ENCODE(maptype) |
+ SM2_OFFSET_ENCODE(start);
+ block_cursor++;
+ break;
+ default:
+ panic("%d-word space map entries are not supported",
+ words);
+ break;
+ }
+ sm->sm_phys->smp_objsize += words * sizeof (uint64_t);
+
+ start += run_len;
+ size -= run_len;
+ }
+ ASSERT0(size);
+}
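
To make the run-splitting above concrete, a worked example under
assumed inputs (sm_shift = 9, so one unit is 512 bytes, and a 512MB
segment):

	uint64_t size = (512ULL << 20) >> 9;		/* 2^20 units */
	uint64_t one_word = howmany(size, SM_RUN_MAX);	/* 2^20 / 2^15 = 32 */
	uint64_t two_word = howmany(size, SM2_RUN_MAX);	/* 1 */

With one-word entries the segment costs 32 words on disk; with the
two-word encoding it fits in a single entry of 2 words.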
+
+/*
+ * Note: The space map's dbuf must be dirty for the changes in sm_phys to
+ * take effect.
+ */
+static void
+space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t vdev_id, dmu_tx_t *tx)
+{
+ spa_t *spa = tx->tx_pool->dp_spa;
+ dmu_buf_t *db;
+
+ space_map_write_intro_debug(sm, maptype, tx);
+#ifdef DEBUG
/*
- * All space_maps always have a debug entry so account for it here.
+ * We do this right after we write the intro debug entry
+ * because the estimate does not take it into account.
*/
- entries = 1;
+ uint64_t initial_objsize = sm->sm_phys->smp_objsize;
+ uint64_t estimated_growth =
+ space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
+ uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
+#endif
/*
- * Traverse the range tree and calculate the number of space map
- * entries that would be required to write out the range tree.
+ * Find the offset right after the last word in the space map
+ * and use that to get a hold of the last block, so we can
+ * start appending to it.
*/
- for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
- size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
- entries += howmany(size, SM_RUN_MAX);
+ uint64_t next_word_offset = sm->sm_phys->smp_objsize;
+ VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
+ next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
+ ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+ dmu_buf_will_dirty(db, tx);
+
+ avl_tree_t *t = &rt->rt_root;
+ for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
+ uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
+ uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+ uint8_t words = 1;
+
+ /*
+ * We only write two-word entries when both of the following
+ * are true:
+ *
+ * [1] The feature is enabled.
+ * [2] The offset or run is too big for a single-word entry,
+ * or the vdev_id is set (meaning not equal to
+ * SM_NO_VDEVID).
+ *
+ * Note that for purposes of testing we've added the case that
+ * we write two-word entries occasionally when the feature is
+ * enabled and zfs_force_some_double_word_sm_entries has been
+ * set.
+ */
+ if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
+ (offset >= (1ULL << SM_OFFSET_BITS) ||
+ length > SM_RUN_MAX ||
+ vdev_id != SM_NO_VDEVID ||
+ (zfs_force_some_double_word_sm_entries &&
+ spa_get_random(100) == 0)))
+ words = 2;
+
+ space_map_write_seg(sm, rs, maptype, vdev_id, words,
+ &db, FTAG, tx);
}
- return (entries);
+
+ dmu_buf_rele(db, FTAG);
+
+#ifdef DEBUG
+ /*
+ * We expect our estimation to be based on the worst case
+ * scenario [see comment in space_map_estimate_optimal_size()].
+ * Therefore we expect the actual objsize to be equal or less
+ * than whatever we estimated it to be.
+ */
+ ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
+#endif
}
+/*
+ * Note: This function manipulates the state of the given space map but
+ * does not hold any locks implicitly. Thus the caller is responsible
+ * for synchronizing writes to the space map.
+ */
void
space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
- dmu_tx_t *tx)
+ uint64_t vdev_id, dmu_tx_t *tx)
{
objset_t *os = sm->sm_os;
- spa_t *spa = dmu_objset_spa(os);
- avl_tree_t *t = &rt->rt_root;
- range_seg_t *rs;
- uint64_t size, total, rt_space, nodes;
- uint64_t *entry, *entry_map, *entry_map_end;
- uint64_t expected_entries, actual_entries = 1;
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
VERIFY3U(space_map_object(sm), !=, 0);
+
dmu_buf_will_dirty(sm->sm_dbuf, tx);
/*
@@ -423,58 +753,10 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
else
sm->sm_phys->smp_alloc -= range_tree_space(rt);
- expected_entries = space_map_entries(sm, rt);
-
- entry_map = zio_buf_alloc(sm->sm_blksz);
- entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t));
- entry = entry_map;
-
- *entry++ = SM_DEBUG_ENCODE(1) |
- SM_DEBUG_ACTION_ENCODE(maptype) |
- SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
- SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
-
- total = 0;
- nodes = avl_numnodes(&rt->rt_root);
- rt_space = range_tree_space(rt);
- for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
- uint64_t start;
+ uint64_t nodes = avl_numnodes(&rt->rt_root);
+ uint64_t rt_space = range_tree_space(rt);
- size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
- start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
-
- total += size << sm->sm_shift;
-
- while (size != 0) {
- uint64_t run_len;
-
- run_len = MIN(size, SM_RUN_MAX);
-
- if (entry == entry_map_end) {
- dmu_write(os, space_map_object(sm),
- sm->sm_phys->smp_objsize, sm->sm_blksz,
- entry_map, tx);
- sm->sm_phys->smp_objsize += sm->sm_blksz;
- entry = entry_map;
- }
-
- *entry++ = SM_OFFSET_ENCODE(start) |
- SM_TYPE_ENCODE(maptype) |
- SM_RUN_ENCODE(run_len);
-
- start += run_len;
- size -= run_len;
- actual_entries++;
- }
- }
-
- if (entry != entry_map) {
- size = (entry - entry_map) * sizeof (uint64_t);
- dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize,
- size, entry_map, tx);
- sm->sm_phys->smp_objsize += size;
- }
- ASSERT3U(expected_entries, ==, actual_entries);
+ space_map_write_impl(sm, rt, maptype, vdev_id, tx);
/*
* Ensure that the space_map's accounting wasn't changed
@@ -482,9 +764,6 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
*/
VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
VERIFY3U(range_tree_space(rt), ==, rt_space);
- VERIFY3U(range_tree_space(rt), ==, total);
-
- zio_buf_free(entry_map, sm->sm_blksz);
}
static int
@@ -526,7 +805,6 @@ space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
space_map_close(sm);
return (error);
}
-
*smp = sm;
return (0);
@@ -569,7 +847,8 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
*/
if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
- doi.doi_data_block_size != blocksize) {
+ doi.doi_data_block_size != blocksize ||
+ doi.doi_metadata_block_size != 1 << space_map_ibs) {
zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating "
"object[%llu]: old bonus %u, old blocksz %u",
dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
@@ -625,8 +904,8 @@ space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
bonuslen = SPACE_MAP_SIZE_V0;
}
- object = dmu_object_alloc(os, DMU_OT_SPACE_MAP, blocksize,
- DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
+ object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize,
+ space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
return (object);
}
@@ -658,6 +937,133 @@ space_map_free(space_map_t *sm, dmu_tx_t *tx)
sm->sm_object = 0;
}
+/*
+ * Given a range tree, make a worst-case estimate of how much space
+ * the tree's segments would take if they were written to the given
+ * space map.
+ */
+uint64_t
+space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt,
+ uint64_t vdev_id)
+{
+ spa_t *spa = dmu_objset_spa(sm->sm_os);
+ uint64_t shift = sm->sm_shift;
+ uint64_t *histogram = rt->rt_histogram;
+ uint64_t entries_for_seg = 0;
+
+ /*
+ * In order to get a quick estimate of the optimal size that this
+ * range tree would have on-disk as a space map, we iterate through
+ * its histogram buckets instead of iterating through its nodes.
+ *
+ * Note that this is a highest-bound/worst-case estimate for the
+ * following reasons:
+ *
+ * 1] We assume that we always add a debug padding for each block
+ * we write and we also assume that we start at the last word
+ * of a block attempting to write a two-word entry.
+ * 2] Rounding up errors due to the way segments are distributed
+ * in the buckets of the range tree's histogram.
+ * 3] The activation of zfs_force_some_double_word_sm_entries
+ * (tunable) when testing.
+ *
+ * = Math and Rounding Errors =
+ *
+ * The rt_histogram[i] bucket of a range tree holds the number of
+ * segments whose length falls within [2^i, (2^(i+1))-1]. Given
+ * that, we want to divide the buckets into groups: Buckets that
+ * can be represented using a single-word entry, ones that can
+ * be represented with a double-word entry, and ones that can
+ * only be represented with multiple two-word entries.
+ *
+ * [Note that if the new encoding feature is not enabled there
+ * are only two groups: single-word entry buckets and multiple
+ * single-word entry buckets. The information below assumes
+ * two-word entries enabled, but it can easily be applied when
+ * the feature is not enabled]
+ *
+ * To find the highest bucket that can be represented with a
+ * single-word entry we look at the maximum run that such entry
+ * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that
+ * the run of a space map entry is shifted by sm_shift, thus we
+ * add it to the exponent]. This way, excluding the value of the
+ * maximum run that can be represented by a single-word entry,
+ * all runs that are smaller exist in buckets 0 to
+ * SM_RUN_BITS + shift - 1.
+ *
+ * To find the highest bucket that can be represented with a
+ * double-word entry, we follow the same approach. Finally, any
+ * bucket higher than that are represented with multiple two-word
+ * entries. To be more specific, if the highest bucket whose
+ * segments can be represented with a single two-word entry is X,
+ * then bucket X+1 will need 2 two-word entries for each of its
+ * segments, X+2 will need 4, X+3 will need 8, ...etc.
+ *
+ * With all of the above we make our estimation based on bucket
+ * groups. There is a rounding error though. As we mentioned in
+ * the example with the one-word entry, the maximum run that can
+ * be represented in a one-word entry, 2^(SM_RUN_BITS + shift), is
+ * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of
+ * that length fall into the next bucket (and bucket group) where
+ * we start counting two-word entries and this is one more reason
+ * why the estimated size may end up being bigger than the actual
+ * size written.
+ */
+ uint64_t size = 0;
+ uint64_t idx = 0;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) ||
+ (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) {
+
+ /*
+ * If we are trying to force some double word entries just
+ * assume the worst-case of every single word entry being
+ * written as a double word entry.
+ */
+ uint64_t entry_size =
+ (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) &&
+ zfs_force_some_double_word_sm_entries) ?
+ (2 * sizeof (uint64_t)) : sizeof (uint64_t);
+
+ uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1;
+ for (; idx <= single_entry_max_bucket; idx++)
+ size += histogram[idx] * entry_size;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) {
+ for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
+ ASSERT3U(idx, >=, single_entry_max_bucket);
+ entries_for_seg =
+ 1ULL << (idx - single_entry_max_bucket);
+ size += histogram[idx] *
+ entries_for_seg * entry_size;
+ }
+ return (size);
+ }
+ }
+
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2));
+
+ uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1;
+ for (; idx <= double_entry_max_bucket; idx++)
+ size += histogram[idx] * 2 * sizeof (uint64_t);
+
+ for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
+ ASSERT3U(idx, >=, double_entry_max_bucket);
+ entries_for_seg = 1ULL << (idx - double_entry_max_bucket);
+ size += histogram[idx] *
+ entries_for_seg * 2 * sizeof (uint64_t);
+ }
+
+ /*
+ * Assume the worst case where we start with the padding at the end
+ * of the current block and we add an extra padding entry at the end
+ * of all subsequent blocks.
+ */
+ size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t);
+
+ return (size);
+}
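
A worked instance of the estimate may help; every input below is an
assumption chosen for illustration (sm_shift = 9, a vdev-tagged space
map so all entries are two words, and 4K space map blocks):

	/* double_entry_max_bucket = 36 + 9 - 1 = 44 */
	uint64_t words = 0;
	words += 10 * 2;			/* bucket 20: 10 segs, 2 words each */
	words += 3 * (1ULL << (46 - 44)) * 2;	/* bucket 46: 4 entries per seg */
	uint64_t size = words * sizeof (uint64_t);	/* (20 + 24) * 8 = 352 bytes */
	size += ((size / 4096) + 1) * sizeof (uint64_t); /* debug-padding bound */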
+
uint64_t
space_map_object(space_map_t *sm)
{
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c
index a866e65d54f7..aa289ba1061d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c
@@ -54,20 +54,14 @@
static int
space_reftree_compare(const void *x1, const void *x2)
{
- const space_ref_t *sr1 = x1;
- const space_ref_t *sr2 = x2;
+ const space_ref_t *sr1 = (const space_ref_t *)x1;
+ const space_ref_t *sr2 = (const space_ref_t *)x2;
- if (sr1->sr_offset < sr2->sr_offset)
- return (-1);
- if (sr1->sr_offset > sr2->sr_offset)
- return (1);
+ int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset);
+ if (likely(cmp))
+ return (cmp);
- if (sr1 < sr2)
- return (-1);
- if (sr1 > sr2)
- return (1);
-
- return (0);
+ return (AVL_PCMP(sr1, sr2));
}
void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
index cb1d4354579e..9b8e73b596fe 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
@@ -194,7 +194,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp);
void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
-int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
+int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
uint64_t arc_max_bytes(void);
void arc_init(void);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
index 69617b3dca9c..ec966432f2ac 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@@ -83,6 +83,13 @@ typedef enum dbuf_states {
DB_EVICTING
} dbuf_states_t;
+typedef enum dbuf_cached_state {
+ DB_NO_CACHE = -1,
+ DB_DBUF_CACHE,
+ DB_DBUF_METADATA_CACHE,
+ DB_CACHE_MAX
+} dbuf_cached_state_t;
+
struct dnode;
struct dmu_tx;
@@ -229,11 +236,12 @@ typedef struct dmu_buf_impl {
*/
avl_node_t db_link;
- /*
- * Link in dbuf_cache.
- */
+ /* Link in dbuf_cache or dbuf_metadata_cache */
multilist_node_t db_cache_link;
+ /* Tells us which dbuf cache this dbuf is in, if any */
+ dbuf_cached_state_t db_caching_status;
+
/* Data which is unique to data (leaf) blocks: */
/* User callback information. */
@@ -295,7 +303,7 @@ boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
uint64_t dbuf_refcount(dmu_buf_impl_t *db);
void dbuf_rele(dmu_buf_impl_t *db, void *tag);
-void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting);
dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
uint64_t blkid);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
index 0f7916e7d189..9bba698828fd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -109,7 +109,8 @@ typedef enum dmu_object_byteswap {
/*
* Defines a uint8_t object type. Object types specify if the data
* in the object is metadata (boolean) and how to byteswap the data
- * (dmu_object_byteswap_t).
+ * (dmu_object_byteswap_t). All of the types created by this method
+ * are cached in the dbuf metadata cache.
*/
#define DMU_OT(byteswap, metadata) \
(DMU_OT_NEWTYPE | \
@@ -124,6 +125,9 @@ typedef enum dmu_object_byteswap {
((ot) & DMU_OT_METADATA) : \
dmu_ot[(ot)].ot_metadata)
+#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+ B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache)
+
/*
* These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
* have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
@@ -352,6 +356,9 @@ typedef struct dmu_buf {
*/
uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
@@ -512,6 +519,9 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db);
int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
uint64_t length, boolean_t read, void *tag,
int *numbufsp, dmu_buf_t ***dbpp);
+int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
+ boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp,
+ uint32_t flags);
void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
typedef void dmu_buf_evict_func_t(void *user_ptr);
@@ -750,10 +760,13 @@ void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_tx_t *tx);
int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
+int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size);
int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
+int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size,
+ dmu_tx_t *tx);
#ifdef _KERNEL
#ifdef illumos
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
@@ -767,6 +780,8 @@ int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
#endif
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf);
+void dmu_assign_arcbuf_dnode(dnode_t *handle, uint64_t offset,
+ struct arc_buf *buf, dmu_tx_t *tx);
void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
dmu_tx_t *tx);
int dmu_xuio_init(struct xuio *uio, int niov);
@@ -810,6 +825,7 @@ typedef void arc_byteswap_func_t(void *buf, size_t size);
typedef struct dmu_object_type_info {
dmu_object_byteswap_t ot_byteswap;
boolean_t ot_metadata;
+ boolean_t ot_dbuf_metadata_cache;
char *ot_name;
} dmu_object_type_info_t;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
index 59e87aab8081..25ff8642177d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
@@ -39,6 +39,7 @@
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/sa.h>
+#include <sys/zfs_ioctl.h>
#ifdef __cplusplus
extern "C" {
@@ -69,6 +70,7 @@ typedef struct objset_phys {
dnode_phys_t os_groupused_dnode;
} objset_phys_t;
+#define OBJSET_PROP_UNINITIALIZED ((uint64_t)-1)
struct objset {
/* Immutable: */
struct dsl_dataset *os_dsl_dataset;
@@ -100,6 +102,16 @@ struct objset {
zfs_sync_type_t os_sync;
zfs_redundant_metadata_type_t os_redundant_metadata;
int os_recordsize;
+ /*
+ * The next four values are used as a cache of whatever's on disk, and
+ * are initialized the first time these properties are queried. Before
+ * being initialized with their real values, their values are
+ * OBJSET_PROP_UNINITIALIZED.
+ */
+ uint64_t os_version;
+ uint64_t os_normalization;
+ uint64_t os_utf8only;
+ uint64_t os_casesensitivity;
/*
* Pointer is constant; the blkptr it points to is protected by
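
The four cached properties above imply the usual lazy-initialization
idiom; a sketch follows (the on-disk lookup helper is hypothetical,
standing in for whatever the real accessors in dmu_objset.c do):

	static uint64_t
	objset_prop_cached(objset_t *os, uint64_t *cachep)
	{
		if (*cachep == OBJSET_PROP_UNINITIALIZED) {
			/* hypothetical stand-in for the on-disk lookup */
			*cachep = objset_prop_lookup_on_disk(os);
		}
		return (*cachep);
	}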
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
index 69a834d877ee..1f4b1f2cde9f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
@@ -70,6 +70,7 @@ typedef struct dmu_recv_cookie {
boolean_t drc_byteswap;
boolean_t drc_force;
boolean_t drc_resumable;
+ boolean_t drc_clone;
struct avl_tree *drc_guid_to_ds_map;
zio_cksum_t drc_cksum;
uint64_t drc_newsnapobj;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
index 5566c70add13..89a7b2ef60e4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@@ -291,7 +291,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
void *ref, dnode_t **dnp);
boolean_t dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
-void dnode_rele_and_unlock(dnode_t *dn, void *tag);
+void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
index 8de77532ee75..0509e95b1587 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@@ -65,9 +65,10 @@ uint64_t metaslab_block_maxsize(metaslab_t *);
#define METASLAB_DONT_THROTTLE 0x10
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
- blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *);
+ blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
+ int);
int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
- dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *);
+ dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
@@ -88,9 +89,9 @@ int metaslab_class_validate(metaslab_class_t *);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
-boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
+boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
zio_t *, int);
-void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
+void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
int64_t, int64_t);
@@ -100,7 +101,7 @@ uint64_t metaslab_class_get_dspace(metaslab_class_t *);
uint64_t metaslab_class_get_deferred(metaslab_class_t *);
uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc);
-metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
+metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int);
void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *);
void metaslab_group_passivate(metaslab_group_t *);
@@ -109,8 +110,9 @@ uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
-void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
-void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
+void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
+ boolean_t);
+void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
#ifdef __cplusplus
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
index 939bcb30528b..5eb59df37e51 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
@@ -52,6 +52,7 @@ typedef struct metaslab_alloc_trace {
uint64_t mat_weight;
uint32_t mat_dva_id;
uint64_t mat_offset;
+ int mat_allocator;
} metaslab_alloc_trace_t;
/*
@@ -67,14 +68,17 @@ typedef enum trace_alloc_type {
TRACE_GROUP_FAILURE = -5ULL,
TRACE_ENOSPC = -6ULL,
TRACE_CONDENSING = -7ULL,
- TRACE_VDEV_ERROR = -8ULL
+ TRACE_VDEV_ERROR = -8ULL,
+ TRACE_INITIALIZING = -9ULL
} trace_alloc_type_t;
#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
-#define METASLAB_WEIGHT_TYPE (1ULL << 61)
+#define METASLAB_WEIGHT_CLAIM (1ULL << 61)
+#define METASLAB_WEIGHT_TYPE (1ULL << 60)
#define METASLAB_ACTIVE_MASK \
- (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \
+ METASLAB_WEIGHT_CLAIM)
/*
* The metaslab weight is used to encode the amount of free space in a
@@ -97,37 +101,39 @@ typedef enum trace_alloc_type {
*
* 64      56      48      40      32      24      16      8       0
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * |PS1|                    weighted-free space                     |
+ * |PSC1|                   weighted-free space                     |
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* PS - indicates primary and secondary activation
+ * C - indicates activation for claimed block zio
* space - the fragmentation-weighted space
*
* Segment-based weight:
*
* 64      56      48      40      32      24      16      8       0
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * |PS0| idx|              count of segments in region              |
+ * |PSC0| idx|             count of segments in region              |
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* PS - indicates primary and secondary activation
+ * C - indicates activation for claimed block zio
* idx - index for the highest bucket in the histogram
* count - number of segments in the specified bucket
*/
-#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2)
-#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x)
+#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3)
+#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x)
#define WEIGHT_IS_SPACEBASED(weight) \
- ((weight) == 0 || BF64_GET((weight), 61, 1))
-#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1)
+ ((weight) == 0 || BF64_GET((weight), 60, 1))
+#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1)
/*
* These macros are only applicable to segment-based weighting.
*/
-#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6)
-#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x)
-#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55)
-#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x)
+#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6)
+#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x)
+#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54)
+#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x)
/*
* A metaslab class encompasses a category of allocatable top-level vdevs.
@@ -178,8 +184,8 @@ struct metaslab_class {
* allowed to reserve slots even if we've reached the maximum
* number of allocations allowed.
*/
- uint64_t mc_alloc_max_slots;
- refcount_t mc_alloc_slots;
+ uint64_t *mc_alloc_max_slots;
+ refcount_t *mc_alloc_slots;
uint64_t mc_alloc_groups; /* # of allocatable groups */
@@ -202,9 +208,12 @@ struct metaslab_class {
*/
struct metaslab_group {
kmutex_t mg_lock;
+ metaslab_t **mg_primaries;
+ metaslab_t **mg_secondaries;
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
boolean_t mg_allocatable; /* can we allocate? */
+ uint64_t mg_ms_ready;
/*
* A metaslab group is considered to be initialized only after
@@ -224,15 +233,33 @@ struct metaslab_group {
metaslab_group_t *mg_next;
/*
- * Each metaslab group can handle mg_max_alloc_queue_depth allocations
- * which are tracked by mg_alloc_queue_depth. It's possible for a
- * metaslab group to handle more allocations than its max. This
- * can occur when gang blocks are required or when other groups
- * are unable to handle their share of allocations.
+ * In order for the allocation throttle to function properly, we cannot
+ * have too many IOs going to each disk by default; the throttle
+ * operates by allocating more work to disks that finish quickly, so
+ * allocating larger chunks to each disk reduces its effectiveness.
+ * However, if the number of IOs going to each allocator is too small,
+ * we will not perform proper aggregation at the vdev_queue layer,
+ * also resulting in decreased performance. Therefore, we will use a
+ * ramp-up strategy.
+ *
+ * Each allocator in each metaslab group has a current queue depth
+ * (mg_alloc_queue_depth[allocator]) and a current max queue depth
+ * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group
+ * has an absolute max queue depth (mg_max_alloc_queue_depth). We
+ * add IOs to an allocator until the mg_alloc_queue_depth for that
+ * allocator hits the cur_max. Every time an IO completes for a given
+ * allocator on a given metaslab group, we increment its cur_max until
+ * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to
+ * help protect against disks that decrease in performance over time.
+ *
+ * It's possible for an allocator to handle more allocations than
+ * its max. This can occur when gang blocks are required or when other
+ * groups are unable to handle their share of allocations.
*/
uint64_t mg_max_alloc_queue_depth;
- refcount_t mg_alloc_queue_depth;
-
+ uint64_t *mg_cur_max_alloc_queue_depth;
+ refcount_t *mg_alloc_queue_depth;
+ int mg_allocators;
/*
* A metaslab group that can no longer allocate the minimum block
* size will set mg_no_free_space. Once a metaslab group is out
@@ -245,6 +272,11 @@ struct metaslab_group {
uint64_t mg_failed_allocations;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+ int mg_ms_initializing;
+ boolean_t mg_initialize_updating;
+ kmutex_t mg_ms_initialize_lock;
+ kcondvar_t mg_ms_initialize_cv;
};
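
The ramp-up described above can be sketched as follows (illustrative,
not the verbatim metaslab.c logic):

	/*
	 * On each IO completion for a given allocator, grow that
	 * allocator's current cap toward the group-wide maximum; the
	 * cap is assumed to be reset elsewhere at every txg.
	 */
	static void
	mg_ramp_queue_depth(metaslab_group_t *mg, int allocator)
	{
		uint64_t *cur = &mg->mg_cur_max_alloc_queue_depth[allocator];

		if (*cur < mg->mg_max_alloc_queue_depth)
			(*cur)++;
	}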
/*
@@ -335,6 +367,8 @@ struct metaslab {
boolean_t ms_condense_wanted;
uint64_t ms_condense_checked_txg;
+ uint64_t ms_initializing; /* leaves initializing this ms */
+
/*
* We must hold both ms_lock and ms_group->mg_lock in order to
* modify ms_loaded.
@@ -357,6 +391,13 @@ struct metaslab {
uint64_t ms_max_size; /* maximum allocatable size */
/*
+ * -1 if it's not active in an allocator, otherwise set to the allocator
+ * this metaslab is active for.
+ */
+ int ms_allocator;
+ boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */
+
+ /*
* The metaslab block allocators can optionally use a size-ordered
* range tree and/or an array of LBAs. Not all allocators use
* this functionality. The ms_allocatable_by_size should always
@@ -370,6 +411,8 @@ struct metaslab {
metaslab_group_t *ms_group; /* metaslab group */
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
+
+ boolean_t ms_new;
};
#ifdef __cplusplus
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
index 244f35a3d6f3..feac5ae5fbf2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
@@ -95,6 +95,9 @@ boolean_t range_tree_is_empty(range_tree_t *rt);
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
void range_tree_stat_verify(range_tree_t *rt);
+uint64_t range_tree_min(range_tree_t *rt);
+uint64_t range_tree_max(range_tree_t *rt);
+uint64_t range_tree_span(range_tree_t *rt);
void range_tree_add(void *arg, uint64_t start, uint64_t size);
void range_tree_remove(void *arg, uint64_t start, uint64_t size);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
index fc4f90740efc..0b220362379e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
@@ -154,6 +154,7 @@ _NOTE(CONSTCOND) } while (0)
#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
#define SPA_COMPRESSBITS 7
+#define SPA_VDEVBITS 24
/*
* All SPA data is represented by 128-bit data virtual addresses (DVAs).
@@ -184,15 +185,15 @@ typedef struct zio_cksum_salt {
*
* 64      56      48      40      32      24      16      8       0
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 0 |              vdev1            | GRID  |         ASIZE         |
+ * 0 |  pad  |          vdev1        | GRID  |         ASIZE         |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 1 |G|                           offset1                           |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 2 |              vdev2            | GRID  |         ASIZE         |
+ * 2 |  pad  |          vdev2        | GRID  |         ASIZE         |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 3 |G|                           offset2                           |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 4 |              vdev3            | GRID  |         ASIZE         |
+ * 4 |  pad  |          vdev3        | GRID  |         ASIZE         |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G|                           offset3                           |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -371,8 +372,9 @@ typedef struct blkptr {
#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
-#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32)
-#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x)
+#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
+#define DVA_SET_VDEV(dva, x) \
+ BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)
#define DVA_GET_OFFSET(dva) \
BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
@@ -668,6 +670,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
#define SPA_ASYNC_AUTOEXPAND 0x20
#define SPA_ASYNC_REMOVE_DONE 0x40
#define SPA_ASYNC_REMOVE_STOP 0x80
+#define SPA_ASYNC_INITIALIZE_RESTART 0x100
/*
* Controls the behavior of spa_vdev_remove().
@@ -683,6 +686,7 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
int replace_done);
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
extern boolean_t spa_vdev_remove_active(spa_t *spa);
+extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
@@ -829,6 +833,7 @@ extern uint64_t spa_bootfs(spa_t *spa);
extern uint64_t spa_delegation(spa_t *spa);
extern objset_t *spa_meta_objset(spa_t *spa);
extern uint64_t spa_deadman_synctime(spa_t *spa);
+extern uint64_t spa_dirty_data(spa_t *spa);
/* Miscellaneous support routines */
extern void spa_load_failed(spa_t *spa, const char *fmt, ...);
@@ -945,13 +950,6 @@ _NOTE(CONSTCOND) } while (0)
#define dprintf_bp(bp, fmt, ...)
#endif
-extern boolean_t spa_debug_enabled(spa_t *spa);
-#define spa_dbgmsg(spa, ...) \
-{ \
- if (spa_debug_enabled(spa)) \
- zfs_dbgmsg(__VA_ARGS__); \
-}
-
extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */
#ifdef __cplusplus
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
index 292fa5e96ac1..f69dde66dd9b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
@@ -238,8 +238,16 @@ struct spa {
uint64_t spa_last_synced_guid; /* last synced guid */
list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */
- kmutex_t spa_alloc_lock;
- avl_tree_t spa_alloc_tree;
+ /*
+ * spa_alloc_locks and spa_alloc_trees are arrays whose lengths are
+ * stored in spa_alloc_count. There is one tree and one lock for each
+ * allocator, to help improve allocation performance in write-heavy
+ * workloads.
+ */
+ kmutex_t *spa_alloc_locks;
+ avl_tree_t *spa_alloc_trees;
+ int spa_alloc_count;
+
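
Usage would follow this shape (a sketch of the per-allocator locking
pattern; not verbatim from zio.c):

	/* Queue a zio on its allocator's private tree, under that lock. */
	mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
	avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
	mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);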
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
nvlist_t *spa_label_features; /* Features for reading MOS */
@@ -324,7 +332,6 @@ struct spa {
kcondvar_t spa_suspend_cv; /* notification of resume */
uint8_t spa_suspended; /* pool is suspended */
uint8_t spa_claiming; /* pool is doing zil_claim() */
- boolean_t spa_debug; /* debug enabled? */
boolean_t spa_is_root; /* pool is root */
int spa_minref; /* num refs when first opened */
int spa_mode; /* FREAD | FWRITE */
@@ -353,6 +360,8 @@ struct spa {
uint64_t spa_feat_for_read_obj; /* required to read from pool */
uint64_t spa_feat_desc_obj; /* Feature descriptions */
uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */
+ kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */
+ nvlist_t *spa_feat_stats; /* Cache of enabled features */
/* cache feature refcounts */
uint64_t spa_feat_refcount_cache[SPA_FEATURES];
#ifdef illumos
@@ -381,6 +390,10 @@ struct spa {
int spa_queued;
} spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE];
#endif
+ /* arc_memory_throttle() parameters during low memory condition */
+ uint64_t spa_lowmem_page_load; /* memory load during txg */
+ uint64_t spa_lowmem_last_txg; /* txg window start */
+
hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
index 98b87269cb6c..d3d852978a57 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
@@ -93,50 +93,100 @@ typedef struct space_map {
/*
* debug entry
*
- *     1      3         10                     50
- *  ,---+--------+------------+---------------------------------.
- *  | 1 | action |  syncpass  |        txg (lower bits)         |
- *  `---+--------+------------+---------------------------------'
- *  63  62    60 59        50 49                                0
+ *     2     2        10                     50
+ *  +-----+-----+------------+----------------------------------+
+ *  | 1 0 | act |  syncpass  |        txg (lower bits)          |
+ *  +-----+-----+------------+----------------------------------+
+ *   63  62 61 60 59      50 49                                 0
*
*
- * non-debug entry
+ * one-word entry
*
*    1               47                   1           15
- *  ,-----------------------------------------------------------.
+ *  +-----------------------------------------------------------+
*  | 0 |   offset (sm_shift units)    | type |       run       |
- *  `-----------------------------------------------------------'
- *   63  62                          17  16   15               0
+ *  +-----------------------------------------------------------+
+ *   63  62                          16  15   14               0
+ *
+ *
+ * two-word entry
+ *
+ *     2     2               36                      24
+ *  +-----+-----+---------------------------+-------------------+
+ *  | 1 1 | pad |            run            |       vdev        |
+ *  +-----+-----+---------------------------+-------------------+
+ *   63  62 61 60 59                     24 23                  0
+ *
+ *    1                            63
+ *  +------+----------------------------------------------------+
+ *  | type |                      offset                        |
+ *  +------+----------------------------------------------------+
+ *   63  62                                                     0
+ *
+ * Note that a two-word entry will not straddle a block boundary.
+ * If necessary, the last word of a block will be padded with a
+ * debug entry (with act = syncpass = txg = 0).
*/
-/* All this stuff takes and returns bytes */
-#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1)
-#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15)
-#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
-#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
-#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47)
-#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47)
-#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1)
-#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1)
+typedef enum {
+ SM_ALLOC,
+ SM_FREE
+} maptype_t;
+
+typedef struct space_map_entry {
+ maptype_t sme_type;
+ uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */
+ uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */
+ uint64_t sme_run; /* max is 2^36; units of sm_shift */
+} space_map_entry_t;
+
+#define SM_NO_VDEVID (1 << SPA_VDEVBITS)
-#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3)
-#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3)
+/* one-word entry constants */
+#define SM_DEBUG_PREFIX 2
+#define SM_OFFSET_BITS 47
+#define SM_RUN_BITS 15
+/* two-word entry constants */
+#define SM2_PREFIX 3
+#define SM2_OFFSET_BITS 63
+#define SM2_RUN_BITS 36
+
+#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2)
+#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2)
+
+#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2)
+#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2)
#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10)
#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10)
-
#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50)
#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50)
-#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
-
-typedef enum {
- SM_ALLOC,
- SM_FREE
-} maptype_t;
-
-typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size,
- void *arg);
+#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS)
+#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS)
+#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1)
+#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1)
+#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1)
+#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS)
+#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
+#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL)
+
+#define SM2_RUN_DECODE(x) (BF64_DECODE(x, SPA_VDEVBITS, SM2_RUN_BITS) + 1)
+#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, SPA_VDEVBITS, SM2_RUN_BITS)
+#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, SPA_VDEVBITS)
+#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, SPA_VDEVBITS)
+#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1)
+#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1)
+#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS)
+#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS)
+#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL)
+#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL)
+
+boolean_t sm_entry_is_debug(uint64_t e);
+boolean_t sm_entry_is_single_word(uint64_t e);
+boolean_t sm_entry_is_double_word(uint64_t e);
+
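
A round-trip sketch of the one-word encoding (values are arbitrary;
offset and run are in sm_shift units, and run must be in
[1, SM_RUN_MAX]):

	uint64_t e = SM_OFFSET_ENCODE(0x1234) |
	    SM_TYPE_ENCODE(SM_FREE) |
	    SM_RUN_ENCODE(100);

	ASSERT(!sm_entry_is_debug(e));
	ASSERT3U(SM_OFFSET_DECODE(e), ==, 0x1234);
	ASSERT3U(SM_TYPE_DECODE(e), ==, SM_FREE);
	ASSERT3U(SM_RUN_DECODE(e), ==, 100);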
+typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg);
int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);
@@ -154,7 +204,9 @@ uint64_t space_map_allocated(space_map_t *sm);
uint64_t space_map_length(space_map_t *sm);
void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
- dmu_tx_t *tx);
+ uint64_t vdev_id, dmu_tx_t *tx);
+uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt,
+ uint64_t vdev_id);
void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx);
uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx);
void space_map_free(space_map_t *sm, dmu_tx_t *tx);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
index e583d61eac2f..bf3b269d707d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_TXG_IMPL_H
@@ -92,6 +92,7 @@ typedef struct tx_state {
kmutex_t tx_sync_lock; /* protects the rest of this struct */
uint64_t tx_open_txg; /* currently open txg id */
+ uint64_t tx_quiescing_txg; /* currently quiescing txg id */
uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
uint64_t tx_syncing_txg; /* currently syncing txg id */
uint64_t tx_synced_txg; /* last synced txg id */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
index 16436b7c022f..4a3af854d465 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_IMPL_H
@@ -59,6 +59,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t;
struct abd;
extern int zfs_vdev_queue_depth_pct;
+extern int zfs_vdev_def_queue_depth;
extern uint32_t zfs_vdev_async_write_max_active;
/*
@@ -79,6 +80,12 @@ typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd,
uint64_t offset, uint64_t size, void *arg);
typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
vdev_remap_cb_t callback, void *arg);
+/*
+ * Given a target vdev, translates the logical range "in" to the physical
+ * range "res"
+ */
+typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in,
+ range_seg_t *res);
typedef struct vdev_ops {
vdev_open_func_t *vdev_op_open;
@@ -91,6 +98,11 @@ typedef struct vdev_ops {
vdev_hold_func_t *vdev_op_hold;
vdev_rele_func_t *vdev_op_rele;
vdev_remap_func_t *vdev_op_remap;
+ /*
+ * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves.
+ * Used when initializing vdevs. Isn't used by leaf ops.
+ */
+ vdev_xlation_func_t *vdev_op_xlate;
char vdev_op_type[16];
boolean_t vdev_op_leaf;
} vdev_ops_t;
@@ -251,6 +263,24 @@ struct vdev {
/* pool checkpoint related */
space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */
+
+ boolean_t vdev_initialize_exit_wanted;
+ vdev_initializing_state_t vdev_initialize_state;
+ kthread_t *vdev_initialize_thread;
+ /* Protects vdev_initialize_thread and vdev_initialize_state. */
+ kmutex_t vdev_initialize_lock;
+ kcondvar_t vdev_initialize_cv;
+ uint64_t vdev_initialize_offset[TXG_SIZE];
+ uint64_t vdev_initialize_last_offset;
+ range_tree_t *vdev_initialize_tree; /* valid while initializing */
+ uint64_t vdev_initialize_bytes_est;
+ uint64_t vdev_initialize_bytes_done;
+ time_t vdev_initialize_action_time; /* start and end time */
+
+ /* for limiting outstanding I/Os */
+ kmutex_t vdev_initialize_io_lock;
+ kcondvar_t vdev_initialize_io_cv;
+ uint64_t vdev_initialize_inflight;
/*
* Values stored in the config for an indirect or removing vdev.
@@ -470,6 +500,8 @@ extern vdev_ops_t vdev_indirect_ops;
/*
* Common size functions
*/
+extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in,
+ range_seg_t *out);
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
extern uint64_t vdev_get_min_asize(vdev_t *vd);
extern void vdev_set_min_asize(vdev_t *vd);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h
new file mode 100644
index 000000000000..db4b0572cd60
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_INITIALIZE_H
+#define _SYS_VDEV_INITIALIZE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void vdev_initialize(vdev_t *vd);
+extern void vdev_initialize_stop(vdev_t *vd,
+ vdev_initializing_state_t tgt_state);
+extern void vdev_initialize_stop_all(vdev_t *vd,
+ vdev_initializing_state_t tgt_state);
+extern void vdev_initialize_restart(vdev_t *vd);
+extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs,
+ range_seg_t *physical_rs);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_INITIALIZE_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
index a29ae586102e..3962237afdab 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
@@ -30,7 +30,7 @@ extern "C" {
#endif
typedef struct spa_vdev_removal {
- vdev_t *svr_vdev;
+ uint64_t svr_vdev_id;
uint64_t svr_max_offset_to_sync[TXG_SIZE];
/* Thread performing a vdev removal. */
kthread_t *svr_thread;
@@ -86,6 +86,9 @@ extern void spa_vdev_remove_suspend(spa_t *);
extern int spa_vdev_remove_cancel(spa_t *);
extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
+extern int vdev_removal_max_span;
+extern int zfs_remove_max_segment;
+
#ifdef __cplusplus
}
#endif
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
index 3ea0da4a1d33..04606bda48db 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
@@ -146,4 +146,7 @@ extern struct mtx zfs_debug_mtx;
#define sys_shutdown rebooting
+#define noinline __attribute__((noinline))
+#define likely(x) __builtin_expect((x), 1)
+
#endif /* _SYS_ZFS_CONTEXT_H */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
index b04b24f17f8b..9cbfc26b64e2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_DEBUG_H
@@ -57,7 +57,7 @@ extern boolean_t zfs_free_leak_on_eio;
#define ZFS_DEBUG_DNODE_VERIFY (1 << 2)
#define ZFS_DEBUG_SNAPNAMES (1 << 3)
#define ZFS_DEBUG_MODIFY (1 << 4)
-#define ZFS_DEBUG_SPA (1 << 5)
+/* 1<<5 was previously used, try not to reuse */
#define ZFS_DEBUG_ZIO_FREE (1 << 6)
#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7)
#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
index 216b55b6c3ce..80a24b436a01 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright 2016 Toomas Soome <tsoome@me.com>
*/
@@ -217,7 +217,7 @@ enum zio_child {
#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT)
#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
#define ZIO_CHILD_ALL_BITS \
- (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \
+ (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \
ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
enum zio_wait_type {
@@ -356,7 +356,7 @@ typedef struct zio_transform {
struct zio_transform *zt_next;
} zio_transform_t;
-typedef int zio_pipe_stage_t(zio_t *zio);
+typedef zio_t *zio_pipe_stage_t(zio_t *zio);
/*
* The io_reexecute flags are distinct from io_flags because the child must
@@ -489,6 +489,7 @@ struct zio {
void *io_waiter;
kmutex_t io_lock;
kcondvar_t io_cv;
+ int io_allocator;
/* FMA state */
zio_cksum_report_t *io_cksum_report;
@@ -550,8 +551,8 @@ extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
const blkptr_t *bp, uint64_t size, enum zio_flag flags);
-extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
- blkptr_t *old_bp, uint64_t size, boolean_t *slog);
+extern int zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg,
+ blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t *slog);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,
uint64_t size);
@@ -586,7 +587,7 @@ extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
zio_done_func_t *done, void *priv);
extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
- struct abd *data, uint64_t size, int type, zio_priority_t priority,
+ struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *priv);
extern void zio_vdev_io_bypass(zio_t *zio);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
index 4db05ac77598..ebe05a09dc4e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h
@@ -13,7 +13,7 @@
* CDDL HEADER END
*/
/*
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
*/
#ifndef _ZIO_PRIORITY_H
#define _ZIO_PRIORITY_H
@@ -30,6 +30,7 @@ typedef enum zio_priority {
ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
ZIO_PRIORITY_TRIM, /* free requests used for TRIM */
ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */
+ ZIO_PRIORITY_INITIALIZING, /* initializing I/O */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
index 64b9c0cb3510..62d215aa4626 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
@@ -450,6 +450,30 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
}
}
+static boolean_t
+txg_is_syncing(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_syncing_txg != 0);
+}
+
+static boolean_t
+txg_is_quiescing(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_quiescing_txg != 0);
+}
+
+static boolean_t
+txg_has_quiesced_to_sync(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_quiesced_txg != 0);
+}
+
static void
txg_sync_thread(void *arg)
{
@@ -476,7 +500,7 @@ txg_sync_thread(void *arg)
while (!dsl_scan_active(dp->dp_scan) &&
!tx->tx_exiting && timer > 0 &&
tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
- tx->tx_quiesced_txg == 0 &&
+ !txg_has_quiesced_to_sync(dp) &&
dp->dp_dirty_total < zfs_dirty_data_sync) {
dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
@@ -489,7 +513,7 @@ txg_sync_thread(void *arg)
* Wait until the quiesce thread hands off a txg to us,
* prompting it to do so if necessary.
*/
- while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
+ while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
cv_broadcast(&tx->tx_quiesce_more_cv);
@@ -504,6 +528,7 @@ txg_sync_thread(void *arg)
* us. This may cause the quiescing thread to now be
* able to quiesce another txg, so we must signal it.
*/
+ ASSERT(tx->tx_quiesced_txg != 0);
txg = tx->tx_quiesced_txg;
tx->tx_quiesced_txg = 0;
tx->tx_syncing_txg = txg;
@@ -552,7 +577,7 @@ txg_quiesce_thread(void *arg)
*/
while (!tx->tx_exiting &&
(tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
- tx->tx_quiesced_txg != 0))
+ txg_has_quiesced_to_sync(dp)))
txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
if (tx->tx_exiting)
@@ -562,6 +587,8 @@ txg_quiesce_thread(void *arg)
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting,
tx->tx_sync_txg_waiting);
+ tx->tx_quiescing_txg = txg;
+
mutex_exit(&tx->tx_sync_lock);
txg_quiesce(dp, txg);
mutex_enter(&tx->tx_sync_lock);
@@ -570,6 +597,7 @@ txg_quiesce_thread(void *arg)
* Hand this txg off to the sync thread.
*/
dprintf("quiesce done, handing off txg %llu\n", txg);
+ tx->tx_quiescing_txg = 0;
tx->tx_quiesced_txg = txg;
DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_sync_more_cv);
@@ -667,7 +695,8 @@ txg_kick(dsl_pool_t *dp)
ASSERT(!dsl_pool_config_held(dp));
mutex_enter(&tx->tx_sync_lock);
- if (tx->tx_syncing_txg == 0 &&
+ if (!txg_is_syncing(dp) &&
+ !txg_is_quiescing(dp) &&
tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
tx->tx_quiesced_txg <= tx->tx_synced_txg) {
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c
index fbe7b619a29a..d33f451938b8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c
@@ -42,14 +42,10 @@ typedef struct unique {
static int
unique_compare(const void *a, const void *b)
{
- const unique_t *una = a;
- const unique_t *unb = b;
-
- if (una->un_value < unb->un_value)
- return (-1);
- if (una->un_value > unb->un_value)
- return (+1);
- return (0);
+ const unique_t *una = (const unique_t *)a;
+ const unique_t *unb = (const unique_t *)b;
+
+ return (AVL_CMP(una->un_value, unb->un_value));
}
void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
index e17243a8c598..1baea65c5fa3 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
@@ -51,6 +51,7 @@
#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/trim_map.h>
+#include <sys/vdev_initialize.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
@@ -162,24 +163,30 @@ static vdev_ops_t *vdev_ops_table[] = {
};
-/* maximum number of metaslabs per top-level vdev */
+/* target number of metaslabs per top-level vdev */
int vdev_max_ms_count = 200;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count, CTLFLAG_RDTUN,
&vdev_max_ms_count, 0,
"Maximum number of metaslabs per top-level vdev");
-/* minimum amount of metaslabs per top-level vdev */
+/* minimum number of metaslabs per top-level vdev */
int vdev_min_ms_count = 16;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RDTUN,
&vdev_min_ms_count, 0,
"Minimum number of metaslabs per top-level vdev");
-/* see comment in vdev_metaslab_set_size() */
+/* practical upper limit of total metaslabs per top-level vdev */
+int vdev_ms_count_limit = 1ULL << 17;
+
+/* lower limit for metaslab size (512M) */
int vdev_default_ms_shift = 29;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RDTUN,
&vdev_default_ms_shift, 0,
"Shift between vdev size and number of metaslabs");
+/* upper limit for metaslab size (256G) */
+int vdev_max_ms_shift = 38;
+
boolean_t vdev_validate_skip = B_FALSE;
/*
@@ -289,6 +296,14 @@ vdev_getops(const char *type)
return (ops);
}
+/* ARGSUSED */
+void
+vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
+{
+ res->rs_start = in->rs_start;
+ res->rs_end = in->rs_end;
+}
+
/*
* Default asize function: return the MAX of psize with the asize of
* all children. This is what's used by anything other than RAID-Z.
@@ -560,7 +575,11 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
-
+ mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
+
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
}
@@ -752,7 +771,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
alloctype == VDEV_ALLOC_SPLIT ||
alloctype == VDEV_ALLOC_ROOTPOOL);
vd->vdev_mg = metaslab_group_create(islog ?
- spa_log_class(spa) : spa_normal_class(spa), vd);
+ spa_log_class(spa) : spa_normal_class(spa), vd,
+ spa->spa_alloc_count);
}
if (vd->vdev_ops->vdev_op_leaf &&
@@ -832,6 +852,7 @@ void
vdev_free(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
/*
* Scan queues are normally destroyed at the end of a scan. If the
@@ -862,6 +883,7 @@ vdev_free(vdev_t *vd)
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+ ASSERT(vd->vdev_initialize_thread == NULL);
/*
* Discard allocation state.
@@ -935,6 +957,10 @@ vdev_free(vdev_t *vd)
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
mutex_destroy(&vd->vdev_scan_io_queue_lock);
+ mutex_destroy(&vd->vdev_initialize_lock);
+ mutex_destroy(&vd->vdev_initialize_io_lock);
+ cv_destroy(&vd->vdev_initialize_io_cv);
+ cv_destroy(&vd->vdev_initialize_cv);
if (vd == spa->spa_root_vdev)
spa->spa_root_vdev = NULL;
@@ -987,6 +1013,32 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
svd->vdev_stat.vs_space = 0;
svd->vdev_stat.vs_dspace = 0;
+ /*
+ * State which may be set on a top-level vdev that's in the
+ * process of being removed.
+ */
+ ASSERT0(tvd->vdev_indirect_config.vic_births_object);
+ ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
+ ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
+ ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
+ ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
+ ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
+ ASSERT0(tvd->vdev_removing);
+ tvd->vdev_removing = svd->vdev_removing;
+ tvd->vdev_indirect_config = svd->vdev_indirect_config;
+ tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
+ tvd->vdev_indirect_births = svd->vdev_indirect_births;
+ range_tree_swap(&svd->vdev_obsolete_segments,
+ &tvd->vdev_obsolete_segments);
+ tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
+ svd->vdev_indirect_config.vic_mapping_object = 0;
+ svd->vdev_indirect_config.vic_births_object = 0;
+ svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
+ svd->vdev_indirect_mapping = NULL;
+ svd->vdev_indirect_births = NULL;
+ svd->vdev_obsolete_sm = NULL;
+ svd->vdev_removing = 0;
+
for (t = 0; t < TXG_SIZE; t++) {
while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
@@ -1140,7 +1192,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
vd->vdev_ms = mspp;
vd->vdev_ms_count = newc;
-
for (m = oldc; m < newc; m++) {
uint64_t object = 0;
@@ -1725,7 +1776,8 @@ vdev_validate(vdev_t *vd)
if ((label = vdev_label_read_config(vd, txg)) == NULL) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
- vdev_dbgmsg(vd, "vdev_validate: failed reading config");
+ vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
+ "txg %llu", (u_longlong_t)txg);
return (0);
}
@@ -2121,34 +2173,53 @@ void
vdev_metaslab_set_size(vdev_t *vd)
{
uint64_t asize = vd->vdev_asize;
- uint64_t ms_shift = 0;
+ uint64_t ms_count = asize >> vdev_default_ms_shift;
+ uint64_t ms_shift;
/*
- * For vdevs that are bigger than 8G the metaslab size varies in
- * a way that the number of metaslabs increases in powers of two,
- * linearly in terms of vdev_asize, starting from 16 metaslabs.
- * So for vdev_asize of 8G we get 16 metaslabs, for 16G, we get 32,
- * and so on, until we hit the maximum metaslab count limit
- * [vdev_max_ms_count] from which point the metaslab count stays
- * the same.
+ * There are two dimensions to the metaslab sizing calculation:
+ * the size of the metaslab and the count of metaslabs per vdev.
+ * In general, we aim for vdev_max_ms_count (200) metaslabs. The
+ * ranges of the dimensions are as follows:
+ *
+ * 2^29 <= ms_size <= 2^38
+ * 16 <= ms_count <= 131,072
+ *
+ * On the lower end of vdev sizes, we aim for metaslab sizes of
+ * at least 512MB (2^29) to minimize fragmentation effects when
+ * testing with smaller devices. However, the count constraint
+ * of at least 16 metaslabs will override this minimum size goal.
+ *
+ * On the upper end of vdev sizes, we aim for a maximum metaslab
+ * size of 256GB. However, we will cap the total count to 2^17
+ * metaslabs to keep our memory footprint in check.
+ *
+ * The net effect of applying the above constraints is summarized below.
+ *
+ * vdev size metaslab count
+ * -------------|-----------------
+ * < 8GB ~16
+ * 8GB - 100GB one per 512MB
+ * 100GB - 50TB ~200
+ * 50TB - 32PB one per 256GB
+ * > 32PB ~131,072
+ * -------------------------------
*/
- ms_shift = vdev_default_ms_shift;
- if ((asize >> ms_shift) < vdev_min_ms_count) {
- /*
- * For devices that are less than 8G we want to have
- * exactly 16 metaslabs. We don't want less as integer
- * division rounds down, so less metaslabs mean more
- * wasted space. We don't want more as these vdevs are
- * small and in the likely event that we are running
- * out of space, the SPA will have a hard time finding
- * space due to fragmentation.
- */
+ if (ms_count < vdev_min_ms_count)
ms_shift = highbit64(asize / vdev_min_ms_count);
- ms_shift = MAX(ms_shift, SPA_MAXBLOCKSHIFT);
-
- } else if ((asize >> ms_shift) > vdev_max_ms_count) {
+ else if (ms_count > vdev_max_ms_count)
ms_shift = highbit64(asize / vdev_max_ms_count);
+ else
+ ms_shift = vdev_default_ms_shift;
+
+ if (ms_shift < SPA_MAXBLOCKSHIFT) {
+ ms_shift = SPA_MAXBLOCKSHIFT;
+ } else if (ms_shift > vdev_max_ms_shift) {
+ ms_shift = vdev_max_ms_shift;
+ /* cap the total count to constrain memory footprint */
+ if ((asize >> ms_shift) > vdev_ms_count_limit)
+ ms_shift = highbit64(asize / vdev_ms_count_limit);
}
vd->vdev_ms_shift = ms_shift;
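
As a rough standalone illustration of the sizing rules above (hypothetical code, not part of this patch; highbit64_sketch stands in for the kernel's highbit64()):

#include <stdint.h>
#include <stdio.h>

static uint64_t
highbit64_sketch(uint64_t x)
{
    uint64_t h = 0;

    while (x != 0) {    /* 1-based index of the highest set bit */
        x >>= 1;
        h++;
    }
    return (h);
}

int
main(void)
{
    const uint64_t min_count = 16, max_count = 200;
    const uint64_t def_shift = 29, max_shift = 38;
    const uint64_t count_limit = 1ULL << 17;
    const uint64_t maxblockshift = 24;    /* SPA_MAXBLOCKSHIFT */
    uint64_t sizes[] = { 4ULL << 30, 1ULL << 40, 1ULL << 56 };

    for (int i = 0; i < 3; i++) {
        uint64_t asize = sizes[i];
        uint64_t shift;

        if ((asize >> def_shift) < min_count)
            shift = highbit64_sketch(asize / min_count);
        else if ((asize >> def_shift) > max_count)
            shift = highbit64_sketch(asize / max_count);
        else
            shift = def_shift;

        if (shift < maxblockshift) {
            shift = maxblockshift;
        } else if (shift > max_shift) {
            shift = max_shift;
            /* cap the total count to constrain memory footprint */
            if ((asize >> shift) > count_limit)
                shift = highbit64_sketch(asize / count_limit);
        }
        printf("asize 2^%llu: ms_shift %llu, ~%llu metaslabs\n",
            (unsigned long long)highbit64_sketch(asize) - 1,
            (unsigned long long)shift,
            (unsigned long long)(asize >> shift));
    }
    return (0);
}

For exact powers of two the computed counts land somewhat below the table's figures (e.g. a 4GB vdev gets 8 metaslabs rather than 16), which is why the table hedges its counts with "~".
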
@@ -2647,7 +2718,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
mutex_exit(&vd->vdev_dtl_lock);
space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx);
- space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
+ space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
range_tree_vacate(rtsync, NULL, NULL);
range_tree_destroy(rtsync);
@@ -3003,7 +3074,8 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
ASSERT(vdev_is_concrete(vd));
- while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+ != NULL)
metaslab_sync_done(msp, txg);
if (reassess)
@@ -3229,6 +3301,15 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
+ /* Restart initializing if necessary */
+ mutex_enter(&vd->vdev_initialize_lock);
+ if (vdev_writeable(vd) &&
+ vd->vdev_initialize_thread == NULL &&
+ vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
+ (void) vdev_initialize(vd);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
if (wasoffline ||
(oldstate < VDEV_STATE_DEGRADED &&
vd->vdev_state >= VDEV_STATE_DEGRADED))
@@ -3531,8 +3612,18 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_min_asize(vd);
- if (vd->vdev_ops->vdev_op_leaf)
+ if (vd->vdev_ops->vdev_op_leaf) {
vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+ /*
+ * Report initializing progress. Since we don't have the
+ * initializing locks held, this is only an estimate (although a
+ * fairly accurate one).
+ */
+ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done;
+ vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est;
+ vs->vs_initialize_state = vd->vdev_initialize_state;
+ vs->vs_initialize_action_time = vd->vdev_initialize_action_time;
+ }
/*
* Report expandable space on top-level, non-auxiliary devices only.
* The expandable space is reported in terms of metaslab sized units
@@ -4193,11 +4284,11 @@ vdev_expand(vdev_t *vd, uint64_t txg)
{
ASSERT(vd->vdev_top == vd);
ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ ASSERT(vdev_is_concrete(vd));
vdev_set_deflate_ratio(vd);
- if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
- vdev_is_concrete(vd)) {
+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) {
VERIFY(vdev_metaslab_init(vd, txg) == 0);
vdev_config_dirty(vd);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
index 26828e069d7d..be24cde54b9b 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
@@ -841,6 +841,7 @@ vdev_ops_t vdev_disk_ops = {
vdev_disk_hold,
vdev_disk_rele,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
index b5caee2ec79e..c198d77e21d4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -271,6 +271,7 @@ vdev_ops_t vdev_file_ops = {
vdev_file_hold,
vdev_file_rele,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_FILE, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
@@ -291,6 +292,7 @@ vdev_ops_t vdev_disk_ops = {
vdev_file_hold,
vdev_file_rele,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
index 29a649aceb00..aa8a400f2d78 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
@@ -1151,6 +1151,7 @@ vdev_ops_t vdev_geom_ops = {
vdev_geom_hold,
vdev_geom_rele,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_DISK, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
index 62b92c677292..c4e4835447f5 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
@@ -23,6 +23,7 @@
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
+#include <sys/zio_checksum.h>
#include <sys/metaslab.h>
#include <sys/refcount.h>
#include <sys/dmu.h>
@@ -46,10 +47,11 @@
* "vdev_remap" operation that executes a callback on each contiguous
* segment of the new location. This function is used in multiple ways:
*
- * - reads and repair writes to this device use the callback to create
- * a child io for each mapped segment.
+ * - i/os to this vdev use the callback to determine where the
+ * data is now located, and issue child i/os for each segment's new
+ * location.
*
- * - frees and claims to this device use the callback to free or claim
+ * - frees and claims to this vdev use the callback to free or claim
* each mapped segment. (Note that we don't actually need to claim
* log blocks on indirect vdevs, because we don't allocate to
* removing vdevs. However, zdb uses zio_claim() for its leak
@@ -204,6 +206,94 @@ uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
int zfs_condense_indirect_commit_entry_delay_ticks = 0;
/*
+ * If a split block contains more than this many segments, consider it too
+ * computationally expensive to check all (2^num_segments) possible
+ * combinations. Instead, try at most
+ * 2^zfs_reconstruct_indirect_segments_max randomly-selected
+ * combinations.
+ *
+ * This is reasonable if only a few segment copies are damaged and the
+ * majority of segment copies are good. This allows all the segment copies to
+ * participate fairly in the reconstruction and prevents the repeated use of
+ * one bad copy.
+ */
+int zfs_reconstruct_indirect_segments_max = 10;
+
+/*
+ * The indirect_child_t represents the vdev that we will read from, when we
+ * need to read all copies of the data (e.g. for scrub or reconstruction).
+ * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
+ * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs,
+ * ic_vdev is a child of the mirror.
+ */
+typedef struct indirect_child {
+ abd_t *ic_data;
+ vdev_t *ic_vdev;
+} indirect_child_t;
+
+/*
+ * The indirect_split_t represents one mapped segment of an i/o to the
+ * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
+ * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
+ * For split blocks, there will be several of these.
+ */
+typedef struct indirect_split {
+ list_node_t is_node; /* link on iv_splits */
+
+ /*
+ * is_split_offset is the offset into the i/o.
+ * This is the sum of the previous splits' is_size's.
+ */
+ uint64_t is_split_offset;
+
+ vdev_t *is_vdev; /* top-level vdev */
+ uint64_t is_target_offset; /* offset on is_vdev */
+ uint64_t is_size;
+ int is_children; /* number of entries in is_child[] */
+
+ /*
+ * is_good_child is the child that we are currently using to
+ * attempt reconstruction.
+ */
+ int is_good_child;
+
+ indirect_child_t is_child[1]; /* variable-length */
+} indirect_split_t;
+
+/*
+ * The indirect_vsd_t is associated with each i/o to the indirect vdev.
+ * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
+ */
+typedef struct indirect_vsd {
+ boolean_t iv_split_block;
+ boolean_t iv_reconstruct;
+
+ list_t iv_splits; /* list of indirect_split_t's */
+} indirect_vsd_t;
+
+static void
+vdev_indirect_map_free(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ indirect_split_t *is;
+ while ((is = list_head(&iv->iv_splits)) != NULL) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic->ic_data != NULL)
+ abd_free(ic->ic_data);
+ }
+ list_remove(&iv->iv_splits, is);
+ kmem_free(is,
+ offsetof(indirect_split_t, is_child[is->is_children]));
+ }
+ kmem_free(iv, sizeof (*iv));
+}
+
+static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
+ vdev_indirect_map_free,
+ zio_vsd_default_cksum_report
+};
+/*
* Mark the given offset and size as being obsolete.
*/
void
@@ -729,7 +819,7 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
space_map_object(vd->vdev_obsolete_sm));
space_map_write(vd->vdev_obsolete_sm,
- vd->vdev_obsolete_segments, SM_ALLOC, tx);
+ vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
space_map_update(vd->vdev_obsolete_sm);
range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
}
@@ -818,12 +908,6 @@ vdev_indirect_close(vdev_t *vd)
}
/* ARGSUSED */
-static void
-vdev_indirect_io_done(zio_t *zio)
-{
-}
-
-/* ARGSUSED */
static int
vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
@@ -1067,41 +1151,475 @@ vdev_indirect_child_io_done(zio_t *zio)
abd_put(zio->io_abd);
}
+/*
+ * This is a callback for vdev_indirect_remap() which allocates an
+ * indirect_split_t for each split segment and adds it to iv_splits.
+ */
static void
-vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
+vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
uint64_t size, void *arg)
{
zio_t *zio = arg;
+ indirect_vsd_t *iv = zio->io_vsd;
ASSERT3P(vd, !=, NULL);
if (vd->vdev_ops == &vdev_indirect_ops)
return;
- zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset,
- abd_get_offset(zio->io_abd, split_offset),
- size, zio->io_type, zio->io_priority,
- 0, vdev_indirect_child_io_done, zio));
+ int n = 1;
+ if (vd->vdev_ops == &vdev_mirror_ops)
+ n = vd->vdev_children;
+
+ indirect_split_t *is =
+ kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP);
+
+ is->is_children = n;
+ is->is_size = size;
+ is->is_split_offset = split_offset;
+ is->is_target_offset = offset;
+ is->is_vdev = vd;
+
+ /*
+ * Note that we only consider multiple copies of the data for
+ * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even
+ * though they use the same ops as mirror, because there's only one
+ * "good" copy under the replacing/spare.
+ */
+ if (vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < n; i++) {
+ is->is_child[i].ic_vdev = vd->vdev_child[i];
+ }
+ } else {
+ is->is_child[0].ic_vdev = vd;
+ }
+
+ list_insert_tail(&iv->iv_splits, is);
+}
+
+static void
+vdev_indirect_read_split_done(zio_t *zio)
+{
+ indirect_child_t *ic = zio->io_private;
+
+ if (zio->io_error != 0) {
+ /*
+ * Clear ic_data to indicate that we do not have data for this
+ * child.
+ */
+ abd_free(ic->ic_data);
+ ic->ic_data = NULL;
+ }
+}
+
+/*
+ * Issue reads for all copies (mirror children) of all splits.
+ */
+static void
+vdev_indirect_read_all(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic = &is->is_child[i];
+
+ if (!vdev_readable(ic->ic_vdev))
+ continue;
+
+ /*
+ * Note, we may read from a child whose DTL
+ * indicates that the data may not be present here.
+ * While this might result in a few i/os that will
+ * likely return incorrect data, it simplifies the
+ * code since we can treat scrub and resilver
+ * identically. (The incorrect data will be
+ * detected and ignored when we verify the
+ * checksum.)
+ */
+
+ ic->ic_data = abd_alloc_sametype(zio->io_abd,
+ is->is_size);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset, ic->ic_data,
+ is->is_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_read_split_done, ic));
+ }
+ }
+ iv->iv_reconstruct = B_TRUE;
}
static void
vdev_indirect_io_start(zio_t *zio)
{
spa_t *spa = zio->io_spa;
+ indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
+ list_create(&iv->iv_splits,
+ sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
+
+ zio->io_vsd = iv;
+ zio->io_vsd_ops = &vdev_indirect_vsd_ops;
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
if (zio->io_type != ZIO_TYPE_READ) {
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
- ASSERT((zio->io_flags &
- (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
+ /*
+ * Note: this code can handle other kinds of writes,
+ * but we don't expect them.
+ */
+ ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
}
vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
- vdev_indirect_io_start_cb, zio);
+ vdev_indirect_gather_splits, zio);
+
+ indirect_split_t *first = list_head(&iv->iv_splits);
+ if (first->is_size == zio->io_size) {
+ /*
+ * This is not a split block; we are pointing to the entire
+ * data, which will checksum the same as the original data.
+ * Pass the BP down so that the child i/o can verify the
+ * checksum, and try a different location if available
+ * (e.g. on a mirror).
+ *
+ * While this special case could be handled the same as the
+ * general (split block) case, doing it this way ensures
+ * that the vast majority of blocks on indirect vdevs
+ * (which are not split) are handled identically to blocks
+ * on non-indirect vdevs. This allows us to be less strict
+ * about performance in the general (but rare) case.
+ */
+ ASSERT0(first->is_split_offset);
+ ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ first->is_vdev, first->is_target_offset,
+ abd_get_offset(zio->io_abd, 0),
+ zio->io_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ } else {
+ iv->iv_split_block = B_TRUE;
+ if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+ /*
+ * Read all copies. Note that for simplicity,
+ * we don't bother consulting the DTL in the
+ * resilver case.
+ */
+ vdev_indirect_read_all(zio);
+ } else {
+ /*
+ * Read one copy of each split segment, from the
+ * top-level vdev. Since we don't know the
+ * checksum of each split individually, the child
+ * zio can't ensure that we get the right data.
+ * E.g. if it's a mirror, it will just read from a
+ * random (healthy) leaf vdev. We have to verify
+ * the checksum in vdev_indirect_io_done().
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ is->is_vdev, is->is_target_offset,
+ abd_get_offset(zio->io_abd,
+ is->is_split_offset),
+ is->is_size, zio->io_type,
+ zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ }
+ }
+ }
zio_execute(zio);
}
+/*
+ * Report a checksum error for a child.
+ */
+static void
+vdev_indirect_checksum_error(zio_t *zio,
+ indirect_split_t *is, indirect_child_t *ic)
+{
+ vdev_t *vd = ic->ic_vdev;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zio_bad_cksum_t zbc = { 0 };
+ void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
+ abd_t *good_abd = is->is_child[is->is_good_child].ic_data;
+ void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
+ zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+ is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
+ abd_return_buf(ic->ic_data, bad_buf, is->is_size);
+ abd_return_buf(good_abd, good_buf, is->is_size);
+}
+
+/*
+ * Issue repair i/os for any incorrect copies. We do this by comparing
+ * each split segment's correct data (is_good_child's ic_data) with each
+ * other copy of the data. If they differ, then we overwrite the bad data
+ * with the good copy. Note that we do this without regard for the DTL's,
+ * which simplifies this code and also issues the optimal number of writes
+ * (based on which copies actually read bad data, as opposed to which we
+ * think might be wrong). For the same reason, we always use
+ * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
+ */
+static void
+vdev_indirect_repair(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (!spa_writeable(zio->io_spa))
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ indirect_child_t *good_child = &is->is_child[is->is_good_child];
+
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic == good_child)
+ continue;
+ if (ic->ic_data == NULL)
+ continue;
+ if (abd_cmp(good_child->ic_data, ic->ic_data,
+ is->is_size) == 0)
+ continue;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset,
+ good_child->ic_data, is->is_size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
+ NULL, NULL));
+
+ vdev_indirect_checksum_error(zio, is, ic);
+ }
+ }
+}
+
+/*
+ * Report checksum errors on all children that we read from.
+ */
+static void
+vdev_indirect_all_checksum_errors(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+
+ if (ic->ic_data == NULL)
+ continue;
+
+ vdev_t *vd = ic->ic_vdev;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zfs_ereport_post_checksum(zio->io_spa, vd, zio,
+ is->is_target_offset, is->is_size,
+ NULL, NULL, NULL);
+ }
+ }
+}
+
+/*
+ * This function is called when we have read all copies of the data and need
+ * to try to find a combination of copies that gives us the right checksum.
+ *
+ * If we pointed to any mirror vdevs, this effectively does the job of the
+ * mirror. The mirror vdev code can't do its own job because we don't know
+ * the checksum of each split segment individually. We have to try every
+ * combination of copies of split segments, until we find one that checksums
+ * correctly. (Or until we have tried all combinations, or have tried
+ * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we
+ * set io_error to ECKSUM to propagate the error up to the user.)
+ *
+ * For example, if we have 3 segments in the split,
+ * and each points to a 2-way mirror, we will have the following pieces of
+ * data:
+ *
+ * | mirror child
+ * split | [0] [1]
+ * ======|=====================
+ * A | data_A_0 data_A_1
+ * B | data_B_0 data_B_1
+ * C | data_C_0 data_C_1
+ *
+ * We will try the following (mirror children)^(number of splits) (2^3=8)
+ * combinations, which is similar to bitwise-little-endian counting in
+ * binary. In general each "digit" corresponds to a split segment, and the
+ * base of each digit is is_children, which can be different for each
+ * digit.
+ *
+ * "low bit" "high bit"
+ * v v
+ * data_A_0 data_B_0 data_C_0
+ * data_A_1 data_B_0 data_C_0
+ * data_A_0 data_B_1 data_C_0
+ * data_A_1 data_B_1 data_C_0
+ * data_A_0 data_B_0 data_C_1
+ * data_A_1 data_B_0 data_C_1
+ * data_A_0 data_B_1 data_C_1
+ * data_A_1 data_B_1 data_C_1
+ *
+ * Note that the split segments may be on the same or different top-level
+ * vdevs. In either case, we try lots of combinations (see
+ * zfs_reconstruct_indirect_segments_max). This ensures that if a mirror has
+ * small silent errors on all of its children, we can still reconstruct the
+ * correct data, as long as those errors are at sufficiently-separated
+ * offsets (specifically, separated by the largest block size - default of
+ * 128KB, but up to 16MB).
+ */
+static void
+vdev_indirect_reconstruct_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+ uint64_t attempts = 0;
+ uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max;
+ int segments = 0;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is))
+ segments++;
+
+ for (;;) {
+ /* copy data from splits to main zio */
+ int ret;
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+
+ /*
+ * If this child failed, its ic_data will be NULL.
+ * Skip this combination.
+ */
+ if (is->is_child[is->is_good_child].ic_data == NULL) {
+ ret = EIO;
+ goto next;
+ }
+
+ abd_copy_off(zio->io_abd,
+ is->is_child[is->is_good_child].ic_data,
+ is->is_split_offset, 0, is->is_size);
+ }
+
+ /* See if this checksum matches. */
+ zio_bad_cksum_t zbc;
+ ret = zio_checksum_error(zio, &zbc);
+ if (ret == 0) {
+ /* Found a matching checksum. Issue repair i/os. */
+ vdev_indirect_repair(zio);
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ /*
+ * Checksum failed; try a different combination of split
+ * children.
+ */
+ boolean_t more;
+next:
+ more = B_FALSE;
+ if (segments <= zfs_reconstruct_indirect_segments_max) {
+ /*
+ * There are relatively few segments, so
+ * deterministically check all combinations. We do
+ * this by adding one to the first split's
+ * good_child. If it overflows, then "carry over" to
+ * the next split (like counting in base is_children,
+ * but each digit can have a different base).
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_good_child++;
+ if (is->is_good_child < is->is_children) {
+ more = B_TRUE;
+ break;
+ }
+ is->is_good_child = 0;
+ }
+ } else if (++attempts < attempts_max) {
+ /*
+ * There are too many combinations to try all of them
+ * in a reasonable amount of time, so try a fixed
+ * number of random combinations, after which we'll
+ * consider the block unrecoverable.
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_good_child =
+ spa_get_random(is->is_children);
+ }
+ more = B_TRUE;
+ }
+ if (!more) {
+ /* All combinations failed. */
+ zio->io_error = ret;
+ vdev_indirect_all_checksum_errors(zio);
+ zio_checksum_verified(zio);
+ return;
+ }
+ }
+}
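
The carry loop above is a mixed-radix counter: one digit per split, each digit's base being that split's is_children. A hypothetical standalone sketch (not part of this patch) that enumerates the same 2^3 combinations as the three-split mirror example in the comment:

#include <stdio.h>

int
main(void)
{
    int bases[] = { 2, 2, 2 };    /* is_children for splits A, B, C */
    int digit[] = { 0, 0, 0 };    /* is_good_child for each split */
    const int nsplits = 3;

    for (;;) {
        printf("data_A_%d data_B_%d data_C_%d\n",
            digit[0], digit[1], digit[2]);

        /* add one to the low digit, carrying into higher digits */
        int i;
        for (i = 0; i < nsplits; i++) {
            if (++digit[i] < bases[i])
                break;        /* no overflow: new combination */
            digit[i] = 0;     /* overflow: carry over */
        }
        if (i == nsplits)
            break;            /* all combinations tried */
    }
    return (0);
}

The output matches the comment's table exactly, with the first split acting as the "low bit".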
+
+static void
+vdev_indirect_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (iv->iv_reconstruct) {
+ /*
+ * We have read all copies of the data (e.g. from mirrors),
+ * either because this was a scrub/resilver, or because the
+ * one-copy read didn't checksum correctly.
+ */
+ vdev_indirect_reconstruct_io_done(zio);
+ return;
+ }
+
+ if (!iv->iv_split_block) {
+ /*
+ * This was not a split block, so we passed the BP down,
+ * and the checksum was handled by the (one) child zio.
+ */
+ return;
+ }
+
+ zio_bad_cksum_t zbc;
+ int ret = zio_checksum_error(zio, &zbc);
+ if (ret == 0) {
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ /*
+ * The checksum didn't match. Read all copies of all splits, and
+ * then we will try to reconstruct. The next time
+ * vdev_indirect_io_done() is called, iv_reconstruct will be set.
+ */
+ vdev_indirect_read_all(zio);
+
+ zio_vdev_io_redone(zio);
+}
+
vdev_ops_t vdev_indirect_ops = {
vdev_indirect_open,
vdev_indirect_close,
@@ -1113,6 +1631,7 @@ vdev_ops_t vdev_indirect_ops = {
NULL,
NULL,
vdev_indirect_remap,
+ NULL,
VDEV_TYPE_INDIRECT, /* name of this vdev type */
B_FALSE /* leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
index ea80fbc4733f..02999aae7274 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
@@ -14,7 +14,7 @@
*/
/*
- * Copyright (c) 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
*/
#include <sys/dmu_tx.h>
@@ -536,14 +536,13 @@ typedef struct load_obsolete_space_map_arg {
} load_obsolete_space_map_arg_t;
static int
-load_obsolete_sm_callback(maptype_t type, uint64_t offset, uint64_t size,
- void *arg)
+load_obsolete_sm_callback(space_map_entry_t *sme, void *arg)
{
load_obsolete_space_map_arg_t *losma = arg;
- ASSERT3S(type, ==, SM_ALLOC);
+ ASSERT3S(sme->sme_type, ==, SM_ALLOC);
vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
- offset, size, losma->losma_counts);
+ sme->sme_offset, sme->sme_run, losma->losma_counts);
return (0);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
new file mode 100644
index 000000000000..a2c39c2868e5
--- /dev/null
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
@@ -0,0 +1,792 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/txg.h>
+#include <sys/vdev_impl.h>
+#include <sys/refcount.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+/*
+ * Maximum number of metaslabs per group that can be initialized
+ * simultaneously.
+ */
+int max_initialize_ms = 3;
+
+/*
+ * Value that is written to disk during initialization.
+ */
+uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL;
+
+/* maximum number of I/Os outstanding per leaf vdev */
+int zfs_initialize_limit = 1;
+
+/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
+uint64_t zfs_initialize_chunk_size = 1024 * 1024;
+
+static boolean_t
+vdev_initialize_should_stop(vdev_t *vd)
+{
+ return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
+ vd->vdev_detached || vd->vdev_top->vdev_removing);
+}
+
+static void
+vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
+{
+ /*
+ * We pass in the guid instead of the vdev_t since the vdev may
+ * have been freed prior to the sync task being processed. This
+ * happens when a vdev is detached as we call spa_config_vdev_exit(),
+ * stop the initializing thread, schedule the sync task, and free
+ * the vdev. Later when the scheduled sync task is invoked, it would
+ * find that the vdev has been freed.
+ */
+ uint64_t guid = *(uint64_t *)arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ kmem_free(arg, sizeof (uint64_t));
+
+ vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ return;
+
+ uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
+ vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
+
+ VERIFY(vd->vdev_leaf_zap != 0);
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+
+ if (last_offset > 0) {
+ vd->vdev_initialize_last_offset = last_offset;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+ sizeof (last_offset), 1, &last_offset, tx));
+ }
+ if (vd->vdev_initialize_action_time > 0) {
+ uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
+ 1, &val, tx));
+ }
+
+ uint64_t initialize_state = vd->vdev_initialize_state;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
+ &initialize_state, tx));
+}
+
+static void
+vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ spa_t *spa = vd->vdev_spa;
+
+ if (new_state == vd->vdev_initialize_state)
+ return;
+
+ /*
+ * Copy the vd's guid; it will be freed by the sync task.
+ */
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /*
+ * If we're suspending, then preserve the original start time.
+ */
+ if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
+ vd->vdev_initialize_action_time = gethrestime_sec();
+ }
+ vd->vdev_initialize_state = new_state;
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
+ guid, 2, ZFS_SPACE_CHECK_RESERVED, tx);
+
+ switch (new_state) {
+ case VDEV_INITIALIZE_ACTIVE:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s activated", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_SUSPENDED:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s suspended", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_CANCELED:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s canceled", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_COMPLETE:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s complete", vd->vdev_path);
+ break;
+ default:
+ panic("invalid state %llu", (unsigned long long)new_state);
+ }
+
+ dmu_tx_commit(tx);
+}
+
+static void
+vdev_initialize_cb(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+ /*
+ * The I/O failed because the vdev was unavailable; roll the
+ * last offset back. (This works because spa_sync waits on
+ * spa_txg_zio before it runs sync tasks.)
+ */
+ uint64_t *off =
+ &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
+ *off = MIN(*off, zio->io_offset);
+ } else {
+ /*
+ * Since initializing is best-effort, we ignore I/O errors and
+ * rely on vdev_probe to determine if the errors are more
+ * critical.
+ */
+ if (zio->io_error != 0)
+ vd->vdev_stat.vs_initialize_errors++;
+
+ vd->vdev_initialize_bytes_done += zio->io_orig_size;
+ }
+ ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+ vd->vdev_initialize_inflight--;
+ cv_broadcast(&vd->vdev_initialize_io_cv);
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/* Takes care of physical writing and limiting # of concurrent ZIOs. */
+static int
+vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ /* Limit inflight initializing I/Os */
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
+ cv_wait(&vd->vdev_initialize_io_cv,
+ &vd->vdev_initialize_io_lock);
+ }
+ vd->vdev_initialize_inflight++;
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+ mutex_enter(&vd->vdev_initialize_lock);
+
+ if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /* This is the first write of this txg. */
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_initialize_zap_update_sync, guid, 2,
+ ZFS_SPACE_CHECK_RESERVED, tx);
+ }
+
+ /*
+ * We know the vdev struct will still be around since all
+ * consumers of vdev_free must stop the initialization first.
+ */
+ if (vdev_initialize_should_stop(vd)) {
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+ vd->vdev_initialize_inflight--;
+ mutex_exit(&vd->vdev_initialize_io_lock);
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+ mutex_exit(&vd->vdev_initialize_lock);
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINTR));
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
+ zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
+ size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
+ ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
+ /* vdev_initialize_cb releases SCL_STATE_ALL */
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Translate a logical range to the physical range for the specified vdev_t.
+ * This function is initially called with a leaf vdev and will walk each
+ * parent vdev until it reaches a top-level vdev. Once the top-level is
+ * reached the physical range is initialized and the recursive function
+ * begins to unwind. As it unwinds it calls the parent's vdev specific
+ * translation function to do the real conversion.
+ */
+void
+vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs)
+{
+ /*
+ * Walk up the vdev tree
+ */
+ if (vd != vd->vdev_top) {
+ vdev_xlate(vd->vdev_parent, logical_rs, physical_rs);
+ } else {
+ /*
+ * We've reached the top-level vdev, initialize the
+ * physical range to the logical range and start to
+ * unwind.
+ */
+ physical_rs->rs_start = logical_rs->rs_start;
+ physical_rs->rs_end = logical_rs->rs_end;
+ return;
+ }
+
+ vdev_t *pvd = vd->vdev_parent;
+ ASSERT3P(pvd, !=, NULL);
+ ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
+
+ /*
+ * As this recursive function unwinds, translate the logical
+ * range into its physical components by calling the
+ * vdev specific translate function.
+ */
+ range_seg_t intermediate = { 0 };
+ pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate);
+
+ physical_rs->rs_start = intermediate.rs_start;
+ physical_rs->rs_end = intermediate.rs_end;
+}
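
A toy model of this recursion (hypothetical types, not the kernel's structures) may make the unwinding order clearer; an identity translation stands in for the per-vdev callbacks of mirrors and plain disks:

#include <stdint.h>
#include <stdio.h>

typedef struct seg { uint64_t start, end; } seg_t;
typedef struct node {
    struct node *parent;    /* NULL at the top level */
    void (*xlate)(const seg_t *in, seg_t *out);    /* translates children */
} node_t;

static void
identity_xlate(const seg_t *in, seg_t *out)
{
    *out = *in;    /* e.g. mirrors and plain disks: pass through */
}

static void
model_xlate(node_t *n, const seg_t *logical, seg_t *physical)
{
    if (n->parent == NULL) {    /* top level: seed and start unwinding */
        *physical = *logical;
        return;
    }
    model_xlate(n->parent, logical, physical);

    /* unwinding: the parent translates this child's range */
    seg_t tmp = { 0, 0 };
    n->parent->xlate(physical, &tmp);
    *physical = tmp;
}

int
main(void)
{
    node_t top = { NULL, identity_xlate };
    node_t leaf = { &top, NULL };
    seg_t lseg = { 4096, 8192 }, phys;

    model_xlate(&leaf, &lseg, &phys);
    printf("physical [%llu, %llu)\n", (unsigned long long)phys.start,
        (unsigned long long)phys.end);
    return (0);
}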
+
+/*
+ * Callback to fill each ABD chunk with zfs_initialize_value. len must be
+ * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
+ * allocation will guarantee these for us.
+ */
+/* ARGSUSED */
+static int
+vdev_initialize_block_fill(void *buf, size_t len, void *unused)
+{
+ ASSERT0(len % sizeof (uint64_t));
+ for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
+ *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
+ }
+ return (0);
+}
+
+static abd_t *
+vdev_initialize_block_alloc(void)
+{
+ /* Allocate ABD for filler data */
+ abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
+
+ ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
+ (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
+ vdev_initialize_block_fill, NULL);
+
+ return (data);
+}
+
+static void
+vdev_initialize_block_free(abd_t *data)
+{
+ abd_free(data);
+}
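
A hypothetical standalone check of the fill logic above (plain malloc'd memory standing in for an ABD chunk; not part of this patch):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

static const uint64_t pattern = 0xdeadbeefdeadbeefULL;

/* mirrors the fill callback; len must be a multiple of 8 */
static int
fill(void *buf, size_t len, void *unused)
{
    (void) unused;
    assert(len % sizeof (uint64_t) == 0);
    for (size_t i = 0; i < len; i += sizeof (uint64_t))
        *(uint64_t *)((char *)buf + i) = pattern;
    return (0);
}

int
main(void)
{
    size_t len = 4096;
    uint64_t *buf = malloc(len);

    if (buf == NULL)
        return (1);
    (void) fill(buf, len, NULL);
    for (size_t i = 0; i < len / sizeof (uint64_t); i++)
        assert(buf[i] == pattern);    /* every word stamped */
    free(buf);
    return (0);
}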
+
+static int
+vdev_initialize_ranges(vdev_t *vd, abd_t *data)
+{
+ avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root;
+
+ for (range_seg_t *rs = avl_first(rt); rs != NULL;
+ rs = AVL_NEXT(rt, rs)) {
+ uint64_t size = rs->rs_end - rs->rs_start;
+
+ /* Split range into legally-sized physical chunks */
+ uint64_t writes_required =
+ ((size - 1) / zfs_initialize_chunk_size) + 1;
+
+ for (uint64_t w = 0; w < writes_required; w++) {
+ int error;
+
+ error = vdev_initialize_write(vd,
+ VDEV_LABEL_START_SIZE + rs->rs_start +
+ (w * zfs_initialize_chunk_size),
+ MIN(size - (w * zfs_initialize_chunk_size),
+ zfs_initialize_chunk_size), data);
+ if (error != 0)
+ return (error);
+ }
+ }
+ return (0);
+}
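
The ceiling division above determines how many writes each segment needs. A hypothetical standalone sketch (not part of this patch) of the same arithmetic, for a 2.5MiB segment and the default 1MiB chunk:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    const uint64_t chunk = 1024 * 1024;           /* 1MiB default */
    uint64_t size = (5ULL << 20) / 2;             /* 2.5MiB segment */
    uint64_t writes = ((size - 1) / chunk) + 1;   /* ceil(size/chunk) */

    for (uint64_t w = 0; w < writes; w++) {
        uint64_t off = w * chunk;
        uint64_t len = (size - off < chunk) ? size - off : chunk;
        printf("write %llu: offset %llu length %llu\n",
            (unsigned long long)w, (unsigned long long)off,
            (unsigned long long)len);
    }
    return (0);
}

This yields three writes, the last covering the 0.5MiB remainder.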
+
+static void
+vdev_initialize_ms_load(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ metaslab_load_wait(msp);
+ if (!msp->ms_loaded)
+ VERIFY0(metaslab_load(msp));
+}
+
+static void
+vdev_initialize_mg_wait(metaslab_group_t *mg)
+{
+ ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
+ while (mg->mg_initialize_updating) {
+ cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
+ }
+}
+
+static void
+vdev_initialize_mg_mark(metaslab_group_t *mg)
+{
+ ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
+ ASSERT(mg->mg_initialize_updating);
+
+ while (mg->mg_ms_initializing >= max_initialize_ms) {
+ cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock);
+ }
+ mg->mg_ms_initializing++;
+ ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms);
+}
+
+/*
+ * Mark the metaslab as being initialized to prevent any allocations
+ * on this metaslab. We must also track how many metaslabs are currently
+ * being initialized within a metaslab group and limit them to prevent
+ * allocation failures from occurring because all metaslabs are being
+ * initialized.
+ */
+static void
+vdev_initialize_ms_mark(metaslab_t *msp)
+{
+ ASSERT(!MUTEX_HELD(&msp->ms_lock));
+ metaslab_group_t *mg = msp->ms_group;
+
+ mutex_enter(&mg->mg_ms_initialize_lock);
+
+ /*
+ * To keep an accurate count of how many threads are initializing
+ * a specific metaslab group, we only allow one thread to mark
+ * the metaslab group at a time. This ensures that the value of
+ * ms_initializing will be accurate when we decide to mark a metaslab
+ * group as being initialized. To do this we force all other threads
+ * to wait until the metaslab group's mg_initialize_updating flag is no
+ * longer set.
+ */
+ vdev_initialize_mg_wait(mg);
+ mg->mg_initialize_updating = B_TRUE;
+ if (msp->ms_initializing == 0) {
+ vdev_initialize_mg_mark(mg);
+ }
+ mutex_enter(&msp->ms_lock);
+ msp->ms_initializing++;
+ mutex_exit(&msp->ms_lock);
+
+ mg->mg_initialize_updating = B_FALSE;
+ cv_broadcast(&mg->mg_ms_initialize_cv);
+ mutex_exit(&mg->mg_ms_initialize_lock);
+}
+
+static void
+vdev_initialize_ms_unmark(metaslab_t *msp)
+{
+ ASSERT(!MUTEX_HELD(&msp->ms_lock));
+ metaslab_group_t *mg = msp->ms_group;
+ mutex_enter(&mg->mg_ms_initialize_lock);
+ mutex_enter(&msp->ms_lock);
+ if (--msp->ms_initializing == 0) {
+ mg->mg_ms_initializing--;
+ cv_broadcast(&mg->mg_ms_initialize_cv);
+ }
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&mg->mg_ms_initialize_lock);
+}
+
+static void
+vdev_initialize_calculate_progress(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ vd->vdev_initialize_bytes_est = 0;
+ vd->vdev_initialize_bytes_done = 0;
+
+ for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+ mutex_enter(&msp->ms_lock);
+
+ uint64_t ms_free = msp->ms_size -
+ space_map_allocated(msp->ms_sm);
+
+ if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
+ ms_free /= vd->vdev_top->vdev_children;
+
+ /*
+ * Convert the metaslab range to a physical range
+ * on our vdev. We use this to determine if we are
+ * in the middle of this metaslab range.
+ */
+ range_seg_t logical_rs, physical_rs;
+ logical_rs.rs_start = msp->ms_start;
+ logical_rs.rs_end = msp->ms_start + msp->ms_size;
+ vdev_xlate(vd, &logical_rs, &physical_rs);
+
+ if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
+ vd->vdev_initialize_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ } else if (vd->vdev_initialize_last_offset >
+ physical_rs.rs_end) {
+ vd->vdev_initialize_bytes_done += ms_free;
+ vd->vdev_initialize_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /*
+ * If we get here, we're in the middle of initializing this
+ * metaslab. Load it and walk the free tree for more accurate
+ * progress estimation.
+ */
+ vdev_initialize_ms_load(msp);
+
+ for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs;
+ rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
+ logical_rs.rs_start = rs->rs_start;
+ logical_rs.rs_end = rs->rs_end;
+ vdev_xlate(vd, &logical_rs, &physical_rs);
+
+ uint64_t size = physical_rs.rs_end -
+ physical_rs.rs_start;
+ vd->vdev_initialize_bytes_est += size;
+ if (vd->vdev_initialize_last_offset >
+ physical_rs.rs_end) {
+ vd->vdev_initialize_bytes_done += size;
+ } else if (vd->vdev_initialize_last_offset >
+ physical_rs.rs_start &&
+ vd->vdev_initialize_last_offset <
+ physical_rs.rs_end) {
+ vd->vdev_initialize_bytes_done +=
+ vd->vdev_initialize_last_offset -
+ physical_rs.rs_start;
+ }
+ }
+ mutex_exit(&msp->ms_lock);
+ }
+}
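
Worked example (illustrative numbers): suppose a metaslab maps to the
physical range [0, 100 MiB) on this leaf and vdev_initialize_last_offset is
40 MiB. A metaslab wholly above that offset only grows bytes_est; one wholly
below grows bytes_est and bytes_done equally; this one is loaded, and a free
segment translating to [35 MiB, 45 MiB) adds 10 MiB to bytes_est but only
5 MiB to bytes_done.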
+
+static void
+vdev_initialize_load(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
+ vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+ sizeof (vd->vdev_initialize_last_offset), 1,
+ &vd->vdev_initialize_last_offset);
+ ASSERT(err == 0 || err == ENOENT);
+ }
+
+ vdev_initialize_calculate_progress(vd);
+}
+
+/*
+ * Convert the logical range into a physical range and add it to our
+ * AVL tree.
+ */
+void
+vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_t *vd = arg;
+ range_seg_t logical_rs, physical_rs;
+ logical_rs.rs_start = start;
+ logical_rs.rs_end = start + size;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ vdev_xlate(vd, &logical_rs, &physical_rs);
+
+ IMPLY(vd->vdev_top == vd,
+ logical_rs.rs_start == physical_rs.rs_start);
+ IMPLY(vd->vdev_top == vd,
+ logical_rs.rs_end == physical_rs.rs_end);
+
+ /* Only add segments that we have not visited yet */
+ if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
+ return;
+
+ /* Pick up where we left off mid-range. */
+ if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
+ zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
+ "(%llu, %llu)", vd->vdev_path,
+ (u_longlong_t)physical_rs.rs_start,
+ (u_longlong_t)physical_rs.rs_end,
+ (u_longlong_t)vd->vdev_initialize_last_offset,
+ (u_longlong_t)physical_rs.rs_end);
+ ASSERT3U(physical_rs.rs_end, >,
+ vd->vdev_initialize_last_offset);
+ physical_rs.rs_start = vd->vdev_initialize_last_offset;
+ }
+ ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
+
+ /*
+ * With raidz, it's possible that the logical range does not live on
+ * this leaf vdev. We only add the physical range to this vdev's tree
+ * if it has a length greater than 0.
+ */
+ if (physical_rs.rs_end > physical_rs.rs_start) {
+ range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
+ physical_rs.rs_end - physical_rs.rs_start);
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
+ }
+}
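
Worked example (illustrative): with vdev_initialize_last_offset at 1000, a
translated physical range [800, 1200) is clipped to [1000, 1200) before
being added, while a range [600, 900) ends at or below the offset and is
skipped outright.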
+
+static void
+vdev_initialize_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+ uint64_t ms_count = 0;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ vd->vdev_initialize_last_offset = 0;
+ vdev_initialize_load(vd);
+
+ abd_t *deadbeef = vdev_initialize_block_alloc();
+
+ vd->vdev_initialize_tree = range_tree_create(NULL, NULL);
+
+ for (uint64_t i = 0; !vd->vdev_detached &&
+ i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+
+ /*
+ * If we've expanded the top-level vdev or it's our
+ * first pass, calculate our progress.
+ */
+ if (vd->vdev_top->vdev_ms_count != ms_count) {
+ vdev_initialize_calculate_progress(vd);
+ ms_count = vd->vdev_top->vdev_ms_count;
+ }
+
+ vdev_initialize_ms_mark(msp);
+ mutex_enter(&msp->ms_lock);
+ vdev_initialize_ms_load(msp);
+
+ range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
+ vd);
+ mutex_exit(&msp->ms_lock);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ error = vdev_initialize_ranges(vd, deadbeef);
+ vdev_initialize_ms_unmark(msp);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
+ if (error != 0)
+ break;
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ while (vd->vdev_initialize_inflight > 0) {
+ cv_wait(&vd->vdev_initialize_io_cv,
+ &vd->vdev_initialize_io_lock);
+ }
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ range_tree_destroy(vd->vdev_initialize_tree);
+ vdev_initialize_block_free(deadbeef);
+ vd->vdev_initialize_tree = NULL;
+
+ mutex_enter(&vd->vdev_initialize_lock);
+ if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
+ }
+ ASSERT(vd->vdev_initialize_thread != NULL ||
+ vd->vdev_initialize_inflight == 0);
+
+ /*
+ * Drop the vdev_initialize_lock while we sync out the
+ * txg since it's possible that a device might be trying to
+ * come online and must check to see if it needs to restart an
+ * initialization. That thread will be holding the spa_config_lock
+ * which would prevent the txg_wait_synced from completing.
+ */
+ mutex_exit(&vd->vdev_initialize_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&vd->vdev_initialize_lock);
+
+ vd->vdev_initialize_thread = NULL;
+ cv_broadcast(&vd->vdev_initialize_cv);
+ mutex_exit(&vd->vdev_initialize_lock);
+ thread_exit();
+}
+
+/*
+ * Initiates initialization of a device. Caller must hold vdev_initialize_lock.
+ * Device must be a leaf and not already be initializing.
+ */
+void
+vdev_initialize(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_initialize_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
+ vd->vdev_initialize_thread = thread_create(NULL, 0,
+ vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+}
+
+/*
+ * Stop initializing a device, with the resultant initializing state being
+ * tgt_state. Blocks until the initializing thread has exited.
+ * Caller must hold vdev_initialize_lock and must not be writing to the spa
+ * config, as the initializing thread may try to enter the config as a reader
+ * before exiting.
+ */
+void
+vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ spa_t *spa = vd->vdev_spa;
+ ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER));
+
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+
+ /*
+ * Allow cancel requests to proceed even if the initialize thread
+ * has stopped.
+ */
+ if (vd->vdev_initialize_thread == NULL &&
+ tgt_state != VDEV_INITIALIZE_CANCELED) {
+ return;
+ }
+
+ vdev_initialize_change_state(vd, tgt_state);
+ vd->vdev_initialize_exit_wanted = B_TRUE;
+ while (vd->vdev_initialize_thread != NULL)
+ cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
+
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ vd->vdev_initialize_exit_wanted = B_FALSE;
+}
+
+static void
+vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop(vd, tgt_state);
+ mutex_exit(&vd->vdev_initialize_lock);
+ return;
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state);
+ }
+}
+
+/*
+ * Convenience function to stop initialization of a vdev tree and set all
+ * initialize thread pointers to NULL.
+ */
+void
+vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ vdev_initialize_stop_all_impl(vd, tgt_state);
+
+ if (vd->vdev_spa->spa_sync_on) {
+ /* Make sure that our state has been synced to disk */
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+ }
+}
+
+void
+vdev_initialize_restart(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_leaf_zap != 0) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ uint64_t initialize_state = VDEV_INITIALIZE_NONE;
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
+ sizeof (initialize_state), 1, &initialize_state);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_state = initialize_state;
+
+ uint64_t timestamp = 0;
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
+ sizeof (timestamp), 1, &timestamp);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_action_time = (time_t)timestamp;
+
+ if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+ vd->vdev_offline) {
+ /* load progress for reporting, but don't resume */
+ vdev_initialize_load(vd);
+ } else if (vd->vdev_initialize_state ==
+ VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) {
+ vdev_initialize(vd);
+ }
+
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_initialize_restart(vd->vdev_child[i]);
+ }
+}
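
An illustrative caller, not part of this diff, showing the documented
contract of the entry points above (hold vdev_initialize_lock, leaf vdevs
only; error handling elided):

static void
example_start_initialize(vdev_t *vd)
{
	mutex_enter(&vd->vdev_initialize_lock);
	if (vd->vdev_initialize_thread == NULL)
		vdev_initialize(vd);	/* spawns vdev_initialize_thread */
	mutex_exit(&vd->vdev_initialize_lock);
}

static void
example_suspend_initialize(vdev_t *vd)
{
	mutex_enter(&vd->vdev_initialize_lock);
	/* Blocks until the thread exits; progress persists in the leaf ZAP. */
	vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED);
	mutex_exit(&vd->vdev_initialize_lock);
}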
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
index d993d2aec8e1..d66fa4ef822f 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -33,15 +33,15 @@
* 1. Uniquely identify this device as part of a ZFS pool and confirm its
* identity within the pool.
*
- * 2. Verify that all the devices given in a configuration are present
+ * 2. Verify that all the devices given in a configuration are present
* within the pool.
*
- * 3. Determine the uberblock for the pool.
+ * 3. Determine the uberblock for the pool.
*
- * 4. In case of an import operation, determine the configuration of the
+ * 4. In case of an import operation, determine the configuration of the
* toplevel vdev of which it is a part.
*
- * 5. If an import operation cannot find all the devices in the pool,
+ * 5. If an import operation cannot find all the devices in the pool,
* provide enough information to the administrator to determine which
* devices are missing.
*
@@ -77,9 +77,9 @@
* In order to identify which labels are valid, the labels are written in the
* following manner:
*
- * 1. For each vdev, update 'L1' to the new label
- * 2. Update the uberblock
- * 3. For each vdev, update 'L2' to the new label
+ * 1. For each vdev, update 'L1' to the new label
+ * 2. Update the uberblock
+ * 3. For each vdev, update 'L2' to the new label
*
* Given arbitrary failure, we can determine the correct label to use based on
* the transaction group. If we fail after updating L1 but before updating the
@@ -117,19 +117,19 @@
*
* The nvlist describing the pool and vdev contains the following elements:
*
- * version ZFS on-disk version
- * name Pool name
- * state Pool state
- * txg Transaction group in which this label was written
- * pool_guid Unique identifier for this pool
- * vdev_tree An nvlist describing vdev tree.
+ * version ZFS on-disk version
+ * name Pool name
+ * state Pool state
+ * txg Transaction group in which this label was written
+ * pool_guid Unique identifier for this pool
+ * vdev_tree An nvlist describing vdev tree.
* features_for_read
* An nvlist of the features necessary for reading the MOS.
*
* Each leaf device label also contains the following:
*
- * top_guid Unique ID for top-level vdev in which this is contained
- * guid Unique ID for the leaf vdev
+ * top_guid Unique ID for top-level vdev in which this is contained
+ * guid Unique ID for the leaf vdev
*
* The 'vs' configuration follows the format described in 'spa_config.c'.
*/
@@ -396,22 +396,33 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
* histograms.
*/
uint64_t seg_count = 0;
+ uint64_t to_alloc = vd->vdev_stat.vs_alloc;
/*
* There are the same number of allocated segments
* as free segments, so we will have at least one
- * entry per free segment.
+ * entry per free segment. However, small free
+ * segments (smaller than vdev_removal_max_span)
+ * will be combined with adjacent allocated segments
+ * as a single mapping.
*/
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
- seg_count += vd->vdev_mg->mg_histogram[i];
+ if (1ULL << (i + 1) < vdev_removal_max_span) {
+ to_alloc +=
+ vd->vdev_mg->mg_histogram[i] <<
+ i + 1;
+ } else {
+ seg_count +=
+ vd->vdev_mg->mg_histogram[i];
+ }
}
/*
- * The maximum length of a mapping is SPA_MAXBLOCKSIZE,
- * so we need at least one entry per SPA_MAXBLOCKSIZE
- * of allocated data.
+ * The maximum length of a mapping is
+ * zfs_remove_max_segment, so we need at least one entry
+ * per zfs_remove_max_segment of allocated data.
*/
- seg_count += vd->vdev_stat.vs_alloc / SPA_MAXBLOCKSIZE;
+ seg_count += to_alloc / zfs_remove_max_segment;
fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
seg_count *
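
Worked example (illustrative numbers): with vdev_removal_max_span = 32 KiB,
the fold-in applies to histogram buckets where 2^(i+1) < 32768, i.e. buckets
up to i = 13. Suppose vs_alloc is 10 GiB, bucket 12 holds 1000 free segments,
and bucket 17 holds 500. The small segments fold into to_alloc as 1000 << 13
bytes (about 8 MiB), the 500 large ones contribute one entry each, and with
zfs_remove_max_segment = 1 MiB the estimate comes to
500 + (10 GiB + 8 MiB) / 1 MiB, roughly 10,748 mapping entries.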
@@ -546,6 +557,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
abd_t *vp_abd;
zio_t *zio;
uint64_t best_txg = 0;
+ uint64_t label_txg = 0;
int error = 0;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SPECULATIVE;
@@ -571,8 +583,6 @@ retry:
if (zio_wait(zio) == 0 &&
nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
&label, 0) == 0) {
- uint64_t label_txg = 0;
-
/*
* Auxiliary vdevs won't have txg values in their
* labels and newly added vdevs may not have been
@@ -603,6 +613,15 @@ retry:
goto retry;
}
+ /*
+ * We found a valid label but it didn't pass txg restrictions.
+ */
+ if (config == NULL && label_txg != 0) {
+ vdev_dbgmsg(vd, "label discarded as txg is too large "
+ "(%llu > %llu)", (u_longlong_t)label_txg,
+ (u_longlong_t)txg);
+ }
+
abd_free(vp_abd);
return (config);
@@ -1028,19 +1047,13 @@ retry:
* among uberblocks with equal txg, choose the one with the latest timestamp.
*/
static int
-vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
+vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
{
- if (ub1->ub_txg < ub2->ub_txg)
- return (-1);
- if (ub1->ub_txg > ub2->ub_txg)
- return (1);
-
- if (ub1->ub_timestamp < ub2->ub_timestamp)
- return (-1);
- if (ub1->ub_timestamp > ub2->ub_timestamp)
- return (1);
+ int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
+ if (likely(cmp))
+ return (cmp);
- return (0);
+ return (AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp));
}
struct ubl_cbdata {
@@ -1167,10 +1180,13 @@ vdev_uberblock_sync_done(zio_t *zio)
* Write the uberblock to all labels of all leaves of the specified vdev.
*/
static void
-vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
+vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
+ uberblock_t *ub, vdev_t *vd, int flags)
{
- for (uint64_t c = 0; c < vd->vdev_children; c++)
- vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ vdev_uberblock_sync(zio, good_writes,
+ ub, vd->vdev_child[c], flags);
+ }
if (!vd->vdev_ops->vdev_op_leaf)
return;
@@ -1188,7 +1204,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
for (int l = 0; l < VDEV_LABELS; l++)
vdev_label_write(zio, vd, l, ub_abd,
VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
- vdev_uberblock_sync_done, zio->io_private,
+ vdev_uberblock_sync_done, good_writes,
flags | ZIO_FLAG_DONT_PROPAGATE);
abd_free(ub_abd);
@@ -1202,10 +1218,10 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
zio_t *zio;
uint64_t good_writes = 0;
- zio = zio_root(spa, NULL, &good_writes, flags);
+ zio = zio_root(spa, NULL, NULL, flags);
for (int v = 0; v < svdcount; v++)
- vdev_uberblock_sync(zio, ub, svd[v], flags);
+ vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags);
(void) zio_wait(zio);
@@ -1266,7 +1282,8 @@ vdev_label_sync_ignore_done(zio_t *zio)
* Write all even or odd labels to all leaves of the specified vdev.
*/
static void
-vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
+vdev_label_sync(zio_t *zio, uint64_t *good_writes,
+ vdev_t *vd, int l, uint64_t txg, int flags)
{
nvlist_t *label;
vdev_phys_t *vp;
@@ -1274,8 +1291,10 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
char *buf;
size_t buflen;
- for (int c = 0; c < vd->vdev_children; c++)
- vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_label_sync(zio, good_writes,
+ vd->vdev_child[c], l, txg, flags);
+ }
if (!vd->vdev_ops->vdev_op_leaf)
return;
@@ -1300,7 +1319,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
vdev_label_write(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t),
- vdev_label_sync_done, zio->io_private,
+ vdev_label_sync_done, good_writes,
flags | ZIO_FLAG_DONT_PROPAGATE);
}
}
@@ -1332,7 +1351,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
(vd->vdev_islog || vd->vdev_aux != NULL) ?
vdev_label_sync_ignore_done : vdev_label_sync_top_done,
good_writes, flags);
- vdev_label_sync(vio, vd, l, txg, flags);
+ vdev_label_sync(vio, good_writes, vd, l, txg, flags);
zio_nowait(vio);
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
index 60cb7aa96fca..26be35fc3501 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -516,13 +516,16 @@ vdev_mirror_io_start(zio_t *zio)
}
if (zio->io_type == ZIO_TYPE_READ) {
- if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering &&
+ if (zio->io_bp != NULL &&
+ (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering &&
mm->mm_children > 1) {
/*
- * For scrubbing reads we need to allocate a read
- * buffer for each child and issue reads to all
- * children. If any child succeeds, it will copy its
- * data into zio->io_data in vdev_mirror_scrub_done.
+ * For scrubbing reads (if we can verify the
+ * checksum here, as indicated by io_bp being
+ * non-NULL) we need to allocate a read buffer for
+ * each child and issue reads to all children. If
+ * any child succeeds, it will copy its data into
+ * zio->io_data in vdev_mirror_scrub_done.
*/
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
@@ -677,7 +680,21 @@ vdev_mirror_io_done(zio_t *zio)
if (mc->mc_error == 0) {
if (mc->mc_tried)
continue;
+ /*
+ * We didn't try this child. We need to
+ * repair it if:
+ * 1. it's a scrub (in which case we have
+ * tried everything that was healthy)
+ * - or -
+ * 2. it's an indirect vdev (in which case
+ * it could point to any other vdev, which
+ * might have a bad DTL)
+ * - or -
+ * 3. the DTL indicates that this data is
+ * missing from this vdev
+ */
if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
+ mc->mc_vd->vdev_ops != &vdev_indirect_ops &&
!vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
zio->io_txg, 1))
continue;
@@ -723,6 +740,7 @@ vdev_ops_t vdev_mirror_ops = {
NULL,
NULL,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_MIRROR, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@@ -738,6 +756,7 @@ vdev_ops_t vdev_replacing_ops = {
NULL,
NULL,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_REPLACING, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
@@ -753,6 +772,7 @@ vdev_ops_t vdev_spare_ops = {
NULL,
NULL,
NULL,
+ vdev_default_xlate,
VDEV_TYPE_SPARE, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
index 29194fc11065..6852de445049 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
/*
@@ -91,6 +91,7 @@ vdev_ops_t vdev_missing_ops = {
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_MISSING, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
@@ -106,6 +107,7 @@ vdev_ops_t vdev_hole_ops = {
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_HOLE, /* name of this vdev type */
B_TRUE /* leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
index 5a3ba1b3e983..78b725a37b68 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@@ -157,6 +157,8 @@ uint32_t zfs_vdev_trim_min_active = 1;
uint32_t zfs_vdev_trim_max_active = 64;
uint32_t zfs_vdev_removal_min_active = 1;
uint32_t zfs_vdev_removal_max_active = 2;
+uint32_t zfs_vdev_initializing_min_active = 1;
+uint32_t zfs_vdev_initializing_max_active = 1;
/*
@@ -195,6 +197,14 @@ int zfs_vdev_queue_depth_pct = 1000;
int zfs_vdev_queue_depth_pct = 300;
#endif
+/*
+ * When performing allocations for a given metaslab, we want to make sure that
+ * there are enough IOs to aggregate together to improve throughput. We want to
+ * ensure that there are at least 128k worth of IOs that can be aggregated, and
+ * we assume that the average allocation size is 4k, so we need the queue depth
+ * to be 32 per allocator to get good aggregation of sequential writes.
+ */
+int zfs_vdev_def_queue_depth = 32;
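
Worked out, that is 131072 / 4096 = 32: 128 KiB of aggregatable I/O at an
assumed average allocation size of 4 KiB yields the default queue depth of
32 per allocator set above.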
#ifdef __FreeBSD__
#ifdef _KERNEL
@@ -301,20 +311,15 @@ sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS)
int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
- const zio_t *z1 = x1;
- const zio_t *z2 = x2;
+ const zio_t *z1 = (const zio_t *)x1;
+ const zio_t *z2 = (const zio_t *)x2;
- if (z1->io_offset < z2->io_offset)
- return (-1);
- if (z1->io_offset > z2->io_offset)
- return (1);
+ int cmp = AVL_CMP(z1->io_offset, z2->io_offset);
- if (z1 < z2)
- return (-1);
- if (z1 > z2)
- return (1);
+ if (likely(cmp))
+ return (cmp);
- return (0);
+ return (AVL_PCMP(z1, z2));
}
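
For reference, the AVL_CMP()/AVL_PCMP() helpers used above collapse the old
three-branch comparisons into branch-light expressions. A sketch of their
behavior (see sys/avl.h for the authoritative definitions):

#define	EXAMPLE_AVL_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
#define	EXAMPLE_AVL_PCMP(a, b)	\
	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))

Both return -1, 0, or 1, so ties on io_offset fall through to a stable
pointer comparison that keeps distinct zio_t's ordered deterministically.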
static inline avl_tree_t *
@@ -534,6 +539,8 @@ vdev_queue_class_min_active(zio_priority_t p)
return (zfs_vdev_trim_min_active);
case ZIO_PRIORITY_REMOVAL:
return (zfs_vdev_removal_min_active);
+ case ZIO_PRIORITY_INITIALIZING:
+ return (zfs_vdev_initializing_min_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -597,6 +604,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
return (zfs_vdev_trim_max_active);
case ZIO_PRIORITY_REMOVAL:
return (zfs_vdev_removal_max_active);
+ case ZIO_PRIORITY_INITIALIZING:
+ return (zfs_vdev_initializing_max_active);
default:
panic("invalid priority %u", p);
return (0);
@@ -824,8 +833,8 @@ again:
}
/*
- * For LBA-ordered queues (async / scrub), issue the i/o which follows
- * the most recently issued i/o in LBA (offset) order.
+ * For LBA-ordered queues (async / scrub / initializing), issue the
+ * i/o which follows the most recently issued i/o in LBA (offset) order.
*
* For FIFO queues (sync), issue the i/o with the lowest timestamp.
*/
@@ -881,12 +890,14 @@ vdev_queue_io(zio_t *zio)
if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
zio->io_priority != ZIO_PRIORITY_SCRUB &&
- zio->io_priority != ZIO_PRIORITY_REMOVAL)
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING)
zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
} else if (zio->io_type == ZIO_TYPE_WRITE) {
if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
- zio->io_priority != ZIO_PRIORITY_REMOVAL)
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING)
zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
} else {
ASSERT(zio->io_type == ZIO_TYPE_FREE);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
index d6f4bbc4156a..4df04c30aabf 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -41,6 +41,10 @@
#include <sys/fm/fs/zfs.h>
#include <sys/bio.h>
+#ifdef ZFS_DEBUG
+#include <sys/vdev_initialize.h> /* vdev_xlate testing */
+#endif
+
/*
* Virtual device vector for RAID-Z.
*
@@ -1896,6 +1900,39 @@ vdev_raidz_child_done(zio_t *zio)
rc->rc_skipped = 0;
}
+static void
+vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
+{
+#ifdef ZFS_DEBUG
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
+
+ range_seg_t logical_rs, physical_rs;
+ logical_rs.rs_start = zio->io_offset;
+ logical_rs.rs_end = logical_rs.rs_start +
+ vdev_raidz_asize(zio->io_vd, zio->io_size);
+
+ raidz_col_t *rc = &rm->rm_col[col];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ vdev_xlate(cvd, &logical_rs, &physical_rs);
+ ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+ ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+ /*
+ * It would be nice to assert that rs_end is equal
+ * to rc_offset + rc_size but there might be an
+ * optional I/O at the end that is not accounted in
+ * rc_size.
+ */
+ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
+ rc->rc_size + (1 << tvd->vdev_ashift));
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
+ }
+#endif
+}
+
/*
* Start an IO operation on a RAIDZ VDev
*
@@ -1953,6 +1990,12 @@ vdev_raidz_io_start(zio_t *zio)
for (c = 0; c < rm->rm_cols; c++) {
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
+
+ /*
+ * Verify physical to logical translation.
+ */
+ vdev_raidz_io_verify(zio, rm, c);
+
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
@@ -2622,6 +2665,37 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
return (B_FALSE);
}
+static void
+vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
+{
+ vdev_t *raidvd = cvd->vdev_parent;
+ ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
+
+ uint64_t width = raidvd->vdev_children;
+ uint64_t tgt_col = cvd->vdev_id;
+ uint64_t ashift = raidvd->vdev_top->vdev_ashift;
+
+ /* make sure the offsets are block-aligned */
+ ASSERT0(in->rs_start % (1 << ashift));
+ ASSERT0(in->rs_end % (1 << ashift));
+ uint64_t b_start = in->rs_start >> ashift;
+ uint64_t b_end = in->rs_end >> ashift;
+
+ uint64_t start_row = 0;
+ if (b_start > tgt_col) /* avoid underflow */
+ start_row = ((b_start - tgt_col - 1) / width) + 1;
+
+ uint64_t end_row = 0;
+ if (b_end > tgt_col)
+ end_row = ((b_end - tgt_col - 1) / width) + 1;
+
+ res->rs_start = start_row << ashift;
+ res->rs_end = end_row << ashift;
+
+ ASSERT3U(res->rs_start, <=, in->rs_start);
+ ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
+}
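
Worked example (illustrative numbers): for a 4-wide raidz top-level with
ashift = 9, translating the range [0, 8192) for the child in column 1 gives
b_start = 0 and b_end = 16. start_row stays 0 since b_start is not greater
than tgt_col, and end_row = ((16 - 1 - 1) / 4) + 1 = 4, so the child's
physical range is [0, 4 << 9) = [0, 2048): each of the four children holds
4 of the 16 512-byte blocks.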
+
vdev_ops_t vdev_raidz_ops = {
vdev_raidz_open,
vdev_raidz_close,
@@ -2633,6 +2707,7 @@ vdev_ops_t vdev_raidz_ops = {
NULL,
NULL,
NULL,
+ vdev_raidz_xlate,
VDEV_TYPE_RAIDZ, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
index c864ab1cb0c1..20fa9c24db24 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
@@ -44,6 +44,7 @@
#include <sys/vdev_indirect_births.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
/*
* This file contains the necessary logic to remove vdevs from a
@@ -83,18 +84,12 @@ typedef struct vdev_copy_arg {
kmutex_t vca_lock;
} vdev_copy_arg_t;
-typedef struct vdev_copy_seg_arg {
- vdev_copy_arg_t *vcsa_copy_arg;
- uint64_t vcsa_txg;
- dva_t *vcsa_dest_dva;
- blkptr_t *vcsa_dest_bp;
-} vdev_copy_seg_arg_t;
-
/*
- * The maximum amount of allowed data we're allowed to copy from a device
- * at a time when removing it.
+ * The maximum amount of memory we can use for outstanding i/o while
+ * doing a device removal. This determines how much i/o we can have
+ * in flight concurrently.
*/
-int zfs_remove_max_copy_bytes = 8 * 1024 * 1024;
+int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
/*
* The largest contiguous segment that we will attempt to allocate when
@@ -111,6 +106,24 @@ int zfs_remove_max_copy_bytes = 8 * 1024 * 1024;
int zfs_remove_max_segment = 1024 * 1024;
/*
+ * Allow a remap segment to span free chunks of at most this size. The main
+ * impact of a larger span is that we will read and write larger, more
+ * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
+ * for iops. The value here was chosen to align with
+ * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
+ * reads (but there's no reason it has to be the same).
+ *
+ * Additionally, a higher span will have the following relatively minor
+ * effects:
+ * - the mapping will be smaller, since one entry can cover more allocated
+ * segments
+ * - more of the fragmentation in the removing device will be preserved
+ * - we'll do larger allocations, which may fail and fall back on smaller
+ * allocations
+ */
+int vdev_removal_max_span = 32 * 1024;
+
+/*
* This is used by the test suite so that it can ensure that certain
* actions happen while in the middle of a removal.
*/
@@ -176,7 +189,7 @@ spa_vdev_removal_create(vdev_t *vd)
mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
svr->svr_allocd_segs = range_tree_create(NULL, NULL);
- svr->svr_vdev = vd;
+ svr->svr_vdev_id = vd->vdev_id;
for (int i = 0; i < TXG_SIZE; i++) {
svr->svr_frees[i] = range_tree_create(NULL, NULL);
@@ -218,9 +231,10 @@ spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
static void
vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
{
- vdev_t *vd = arg;
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
- spa_t *spa = vd->vdev_spa;
objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
spa_vdev_removal_t *svr = NULL;
uint64_t txg = dmu_tx_get_txg(tx);
@@ -342,7 +356,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
ASSERT3P(spa->spa_vdev_removal, ==, NULL);
spa->spa_vdev_removal = svr;
svr->svr_thread = thread_create(NULL, 0,
- spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri);
+ spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
}
/*
@@ -384,21 +398,24 @@ spa_remove_init(spa_t *spa)
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
vdev_t *vd = vdev_lookup_top(spa,
spa->spa_removing_phys.sr_removing_vdev);
- spa_config_exit(spa, SCL_STATE, FTAG);
- if (vd == NULL)
+ if (vd == NULL) {
+ spa_config_exit(spa, SCL_STATE, FTAG);
return (EINVAL);
+ }
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
ASSERT(vdev_is_concrete(vd));
spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
- ASSERT(svr->svr_vdev->vdev_removing);
+ ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
+ ASSERT(vd->vdev_removing);
vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
spa->spa_meta_objset, vic->vic_mapping_object);
vd->vdev_indirect_births = vdev_indirect_births_open(
spa->spa_meta_objset, vic->vic_births_object);
+ spa_config_exit(spa, SCL_STATE, FTAG);
spa->spa_vdev_removal = svr;
}
@@ -451,15 +468,8 @@ spa_restart_removal(spa_t *spa)
if (!spa_writeable(spa))
return;
- vdev_t *vd = svr->svr_vdev;
- vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
-
- ASSERT3P(vd, !=, NULL);
- ASSERT(vd->vdev_removing);
-
- zfs_dbgmsg("restarting removal of %llu at count=%llu",
- vd->vdev_id, vdev_indirect_mapping_num_entries(vim));
- svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd,
+ zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id);
+ svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
0, &p0, TS_RUN, minclsyspri);
}
@@ -480,7 +490,7 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
vdev_indirect_mapping_object(vim));
- ASSERT3P(vd, ==, svr->svr_vdev);
+ ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
mutex_enter(&svr->svr_lock);
@@ -663,7 +673,7 @@ spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
if (state == DSS_FINISHED) {
spa_removing_phys_t *srp = &spa->spa_removing_phys;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
@@ -706,7 +716,7 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
{
spa_vdev_removal_t *svr = arg;
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
uint64_t txg = dmu_tx_get_txg(tx);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
@@ -734,85 +744,249 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
spa_sync_removing_state(spa, tx);
}
+typedef struct vdev_copy_segment_arg {
+ spa_t *vcsa_spa;
+ dva_t *vcsa_dest_dva;
+ uint64_t vcsa_txg;
+ range_tree_t *vcsa_obsolete_segs;
+} vdev_copy_segment_arg_t;
+
+static void
+unalloc_seg(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_copy_segment_arg_t *vcsa = arg;
+ spa_t *spa = vcsa->vcsa_spa;
+ blkptr_t bp = { 0 };
+
+ BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
+ BP_SET_LSIZE(&bp, size);
+ BP_SET_PSIZE(&bp, size);
+ BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(&bp, DMU_OT_NONE);
+ BP_SET_LEVEL(&bp, 0);
+ BP_SET_DEDUP(&bp, 0);
+ BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
+
+ DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva));
+ DVA_SET_OFFSET(&bp.blk_dva[0],
+ DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start);
+ DVA_SET_ASIZE(&bp.blk_dva[0], size);
+
+ zio_free(spa, vcsa->vcsa_txg, &bp);
+}
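
Design note: zio_free() operates on block pointers rather than raw ranges,
so unalloc_seg() wraps each unneeded span of the copy destination in a
minimal, checksum-less blkptr_t whose single DVA names the destination vdev
and offset, then frees it back to the pool.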
+
+/*
+ * All reads and writes associated with a call to spa_vdev_copy_segment()
+ * are done.
+ */
+static void
+spa_vdev_copy_segment_done(zio_t *zio)
+{
+ vdev_copy_segment_arg_t *vcsa = zio->io_private;
+
+ range_tree_vacate(vcsa->vcsa_obsolete_segs,
+ unalloc_seg, vcsa);
+ range_tree_destroy(vcsa->vcsa_obsolete_segs);
+ kmem_free(vcsa, sizeof (*vcsa));
+
+ spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+}
+
+/*
+ * The write of the new location is done.
+ */
static void
spa_vdev_copy_segment_write_done(zio_t *zio)
{
- vdev_copy_seg_arg_t *vcsa = zio->io_private;
- vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg;
- spa_config_exit(zio->io_spa, SCL_STATE, FTAG);
+ vdev_copy_arg_t *vca = zio->io_private;
+
abd_free(zio->io_abd);
mutex_enter(&vca->vca_lock);
vca->vca_outstanding_bytes -= zio->io_size;
cv_signal(&vca->vca_cv);
mutex_exit(&vca->vca_lock);
-
- ASSERT0(zio->io_error);
- kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t));
- kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t));
}
+/*
+ * The read of the old location is done. The parent zio is the write to
+ * the new location. Allow it to start.
+ */
static void
spa_vdev_copy_segment_read_done(zio_t *zio)
{
- vdev_copy_seg_arg_t *vcsa = zio->io_private;
- dva_t *dest_dva = vcsa->vcsa_dest_dva;
- uint64_t txg = vcsa->vcsa_txg;
- spa_t *spa = zio->io_spa;
- vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva));
- blkptr_t *bp = NULL;
- dva_t *dva = NULL;
- uint64_t size = zio->io_size;
-
- ASSERT3P(dest_vd, !=, NULL);
- ASSERT0(zio->io_error);
-
- vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
- bp = vcsa->vcsa_dest_bp;
- dva = bp->blk_dva;
-
- BP_ZERO(bp);
-
- /* initialize with dest_dva */
- bcopy(dest_dva, dva, sizeof (dva_t));
- BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
- BP_SET_TYPE(bp, DMU_OT_NONE);
- BP_SET_LEVEL(bp, 0);
- BP_SET_DEDUP(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa,
- txg, bp, zio->io_abd, size,
- spa_vdev_copy_segment_write_done, vcsa,
- ZIO_PRIORITY_REMOVAL, 0, NULL));
+ zio_nowait(zio_unique_parent(zio));
+}
+
+/*
+ * If the old and new vdevs are mirrors, we will read both sides of the old
+ * mirror, and write each copy to the corresponding side of the new mirror.
+ * If the old and new vdevs have a different number of children, we will do
+ * this as well as we can. Since we aren't verifying checksums, this
+ * ensures that as long as there's a good copy of the data, we'll have a
+ * good copy after the removal, even if there's silent damage to one side
+ * of the mirror. If we're removing a mirror that has some silent damage,
+ * we'll have exactly the same damage in the new location (assuming that
+ * the new location is also a mirror).
+ *
+ * We accomplish this by creating a tree of zio_t's, with as many writes as
+ * there are "children" of the new vdev (a non-redundant vdev counts as one
+ * child, a 2-way mirror has 2 children, etc). Each write has an associated
+ * read from a child of the old vdev. Typically there will be the same
+ * number of children of the old and new vdevs. However, if there are more
+ * children of the new vdev, some child(ren) of the old vdev will be issued
+ * multiple reads. If there are more children of the old vdev, some copies
+ * will be dropped.
+ *
+ * For example, the tree of zio_t's for a 2-way mirror is:
+ *
+ * null
+ * / \
+ * write(new vdev, child 0) write(new vdev, child 1)
+ * | |
+ * read(old vdev, child 0) read(old vdev, child 1)
+ *
+ * Child zio's complete before their parents complete. However, zio's
+ * created with zio_vdev_child_io() may be issued before their children
+ * complete. In this case we need to make sure that the children (reads)
+ * complete before the parents (writes) are *issued*. We do this by not
+ * calling zio_nowait() on each write until its corresponding read has
+ * completed.
+ *
+ * The spa_config_lock must be held while zio's created by
+ * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
+ * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
+ * zio is needed to release the spa_config_lock after all the reads and
+ * writes complete. (Note that we can't grab the config lock for each read,
+ * because it is not reentrant - we could deadlock with a thread waiting
+ * for a write lock.)
+ */
+static void
+spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
+ vdev_t *source_vd, uint64_t source_offset,
+ vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
+{
+ ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+
+ mutex_enter(&vca->vca_lock);
+ vca->vca_outstanding_bytes += size;
+ mutex_exit(&vca->vca_lock);
+
+ abd_t *abd = abd_alloc_for_io(size, B_FALSE);
+
+ vdev_t *source_child_vd;
+ if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
+ /*
+ * Source and dest are both mirrors. Copy from the same
+ * child id as we are copying to (wrapping around if there
+ * are more dest children than source children).
+ */
+ source_child_vd =
+ source_vd->vdev_child[dest_id % source_vd->vdev_children];
+ } else {
+ source_child_vd = source_vd;
+ }
+
+ zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
+ dest_child_vd, dest_offset, abd, size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_write_done, vca);
+
+ zio_nowait(zio_vdev_child_io(write_zio, NULL,
+ source_child_vd, source_offset, abd, size,
+ ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_read_done, vca));
}
+/*
+ * Allocate a new location for this segment, and create the zio_t's to
+ * read from the old location and write to the new location.
+ */
static int
-spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
+spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
+ uint64_t maxalloc, uint64_t txg,
vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
{
metaslab_group_t *mg = vd->vdev_mg;
spa_t *spa = vd->vdev_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_indirect_mapping_entry_t *entry;
- vdev_copy_seg_arg_t *private;
dva_t dst = { 0 };
- blkptr_t blk, *bp = &blk;
- dva_t *dva = bp->blk_dva;
+ uint64_t start = range_tree_min(segs);
- ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
+ uint64_t size = range_tree_span(segs);
+ if (range_tree_span(segs) > maxalloc) {
+ /*
+ * We can't allocate all the segments. Prefer to end
+ * the allocation at the end of a segment, thus avoiding
+ * additional split blocks.
+ */
+ range_seg_t search;
+ avl_index_t where;
+ search.rs_start = start + maxalloc;
+ search.rs_end = search.rs_start;
+ range_seg_t *rs = avl_find(&segs->rt_root, &search, &where);
+ if (rs == NULL) {
+ rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE);
+ } else {
+ rs = AVL_PREV(&segs->rt_root, rs);
+ }
+ if (rs != NULL) {
+ size = rs->rs_end - start;
+ } else {
+ /*
+ * There are no segments that end before maxalloc.
+ * I.e. the first segment is larger than maxalloc,
+ * so we must split it.
+ */
+ size = maxalloc;
+ }
+ }
+ ASSERT3U(size, <=, maxalloc);
+
+ /*
+ * We use allocator 0 for this I/O because we don't expect device remap
+ * to be the steady state of the system, so parallelizing is not as
+ * critical as it is for other allocation types. We also want to ensure
+ * that the IOs are allocated together as much as possible, to reduce
+ * mapping sizes.
+ */
int error = metaslab_alloc_dva(spa, mg->mg_class, size,
- &dst, 0, NULL, txg, 0, zal);
+ &dst, 0, NULL, txg, 0, zal, 0);
if (error != 0)
return (error);
/*
+ * Determine the ranges that are not actually needed. Offsets are
+ * relative to the start of the range to be copied (i.e. relative to the
+ * local variable "start").
+ */
+ range_tree_t *obsolete_segs = range_tree_create(NULL, NULL);
+
+ range_seg_t *rs = avl_first(&segs->rt_root);
+ ASSERT3U(rs->rs_start, ==, start);
+ uint64_t prev_seg_end = rs->rs_end;
+ while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) {
+ if (rs->rs_start >= start + size) {
+ break;
+ } else {
+ range_tree_add(obsolete_segs,
+ prev_seg_end - start,
+ rs->rs_start - prev_seg_end);
+ }
+ prev_seg_end = rs->rs_end;
+ }
+ /* We don't end in the middle of an obsolete range */
+ ASSERT3U(start + size, <=, prev_seg_end);
+
+ range_tree_clear(segs, start, size);
+
+ /*
* We can't have any padding of the allocated size, otherwise we will
* misunderstand what's allocated, and the size of the mapping.
* The caller ensures this will be true by passing in a size that is
@@ -820,51 +994,37 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
*/
ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);
- mutex_enter(&vca->vca_lock);
- vca->vca_outstanding_bytes += size;
- mutex_exit(&vca->vca_lock);
-
entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
entry->vime_mapping.vimep_dst = dst;
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ entry->vime_obsolete_count = range_tree_space(obsolete_segs);
+ }
- private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP);
- private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
- private->vcsa_txg = txg;
- private->vcsa_copy_arg = vca;
+ vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
+ vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
+ vcsa->vcsa_obsolete_segs = obsolete_segs;
+ vcsa->vcsa_spa = spa;
+ vcsa->vcsa_txg = txg;
/*
- * This lock is eventually released by the donefunc for the
- * zio_write_phys that finishes copying the data.
+ * See comment before spa_vdev_copy_one_child().
*/
- spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-
- /*
- * Do logical I/O, letting the redundancy vdevs (like mirror)
- * handle their own I/O instead of duplicating that code here.
- */
- BP_ZERO(bp);
-
- DVA_SET_VDEV(&dva[0], vd->vdev_id);
- DVA_SET_OFFSET(&dva[0], start);
- DVA_SET_GANG(&dva[0], 0);
- DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size));
-
- BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
-
- BP_SET_LSIZE(bp, size);
- BP_SET_PSIZE(bp, size);
- BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
- BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
- BP_SET_TYPE(bp, DMU_OT_NONE);
- BP_SET_LEVEL(bp, 0);
- BP_SET_DEDUP(bp, 0);
- BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-
- zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa,
- bp, abd_alloc_for_io(size, B_FALSE), size,
- spa_vdev_copy_segment_read_done, private,
- ZIO_PRIORITY_REMOVAL, 0, NULL));
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+ zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
+ spa_vdev_copy_segment_done, vcsa, 0);
+ vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
+ if (dest_vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < dest_vd->vdev_children; i++) {
+ vdev_t *child = dest_vd->vdev_child[i];
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ child, DVA_GET_OFFSET(&dst), i, size);
+ }
+ } else {
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ dest_vd, DVA_GET_OFFSET(&dst), -1, size);
+ }
+ zio_nowait(nzio);
list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
@@ -882,8 +1042,8 @@ static void
vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
{
spa_vdev_removal_t *svr = arg;
- vdev_t *vd = svr->svr_vdev;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
@@ -912,37 +1072,6 @@ vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
}
static void
-vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd)
-{
- ivd->vdev_indirect_config = vd->vdev_indirect_config;
-
- ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL);
- ASSERT(vd->vdev_indirect_mapping != NULL);
- ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping;
- vd->vdev_indirect_mapping = NULL;
-
- ASSERT3P(ivd->vdev_indirect_births, ==, NULL);
- ASSERT(vd->vdev_indirect_births != NULL);
- ivd->vdev_indirect_births = vd->vdev_indirect_births;
- vd->vdev_indirect_births = NULL;
-
- ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
- ASSERT0(range_tree_space(ivd->vdev_obsolete_segments));
-
- if (vd->vdev_obsolete_sm != NULL) {
- ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize);
-
- /*
- * We cannot use space_map_{open,close} because we hold all
- * the config locks as writer.
- */
- ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL);
- ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm;
- vd->vdev_obsolete_sm = NULL;
- }
-}
-
-static void
vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
{
ASSERT3P(zlist, !=, NULL);
@@ -977,17 +1106,13 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
ivd = vdev_add_parent(vd, &vdev_indirect_ops);
+ ivd->vdev_removing = 0;
vd->vdev_leaf_zap = 0;
vdev_remove_child(ivd, vd);
vdev_compact_children(ivd);
- vdev_indirect_state_transfer(ivd, vd);
-
- svr->svr_vdev = ivd;
-
- ASSERT(!ivd->vdev_removing);
ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -1010,9 +1135,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
* context by the removal thread after we have copied all vdev's data.
*/
static void
-vdev_remove_complete(vdev_t *vd)
+vdev_remove_complete(spa_t *spa)
{
- spa_t *spa = vd->vdev_spa;
uint64_t txg;
/*
@@ -1020,8 +1144,13 @@ vdev_remove_complete(vdev_t *vd)
* vdev_metaslab_fini()
*/
txg_wait_synced(spa->spa_dsl_pool, 0);
-
txg = spa_vdev_enter(spa);
+ vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+
+ sysevent_t *ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_DEV);
+
zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
vd->vdev_id, txg);
@@ -1041,6 +1170,10 @@ vdev_remove_complete(vdev_t *vd)
/*
* We now release the locks, allowing spa_sync to run and finish the
* removal via vdev_remove_complete_sync in syncing context.
+ *
+ * Note that we hold on to the vdev_t that has been replaced. Since
+ * it isn't part of the vdev tree any longer, it can't be concurrently
+ * manipulated, even while we don't have the config lock.
*/
(void) spa_vdev_exit(spa, NULL, txg, 0);
@@ -1062,6 +1195,8 @@ vdev_remove_complete(vdev_t *vd)
*/
vdev_config_dirty(spa->spa_root_vdev);
(void) spa_vdev_exit(spa, vd, txg, 0);
+
+ spa_event_post(ev);
}
/*
@@ -1072,7 +1207,7 @@ vdev_remove_complete(vdev_t *vd)
* this size again this txg.
*/
static void
-spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
+spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
uint64_t *max_alloc, dmu_tx_t *tx)
{
uint64_t txg = dmu_tx_get_txg(tx);
@@ -1080,39 +1215,78 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
mutex_enter(&svr->svr_lock);
- range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
- if (rs == NULL) {
+ /*
+ * Determine how big a chunk to copy. We can allocate up
+ * to max_alloc bytes, and we can span up to vdev_removal_max_span
+ * bytes of unallocated space at a time. "segs" will track the
+ * allocated segments that we are copying. We may also be copying
+ * free segments (of up to vdev_removal_max_span bytes).
+ */
+ range_tree_t *segs = range_tree_create(NULL, NULL);
+ for (;;) {
+ range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
+ if (rs == NULL)
+ break;
+
+ uint64_t seg_length;
+
+ if (range_tree_is_empty(segs)) {
+ /* need to truncate the first seg based on max_alloc */
+ seg_length =
+ MIN(rs->rs_end - rs->rs_start, *max_alloc);
+ } else {
+ if (rs->rs_start - range_tree_max(segs) >
+ vdev_removal_max_span) {
+ /*
+ * Including this segment would cause us to
+ * copy a larger unneeded chunk than is allowed.
+ */
+ break;
+ } else if (rs->rs_end - range_tree_min(segs) >
+ *max_alloc) {
+ /*
+ * This additional segment would extend past
+ * max_alloc. Rather than splitting this
+ * segment, leave it for the next mapping.
+ */
+ break;
+ } else {
+ seg_length = rs->rs_end - rs->rs_start;
+ }
+ }
+
+ range_tree_add(segs, rs->rs_start, seg_length);
+ range_tree_remove(svr->svr_allocd_segs,
+ rs->rs_start, seg_length);
+ }
+
+ if (range_tree_is_empty(segs)) {
mutex_exit(&svr->svr_lock);
+ range_tree_destroy(segs);
return;
}
- uint64_t offset = rs->rs_start;
- uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc);
-
- range_tree_remove(svr->svr_allocd_segs, offset, length);
if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
svr, 0, ZFS_SPACE_CHECK_NONE, tx);
}
- svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length;
+ svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
/*
* Note: this is the amount of *allocated* space
* that we are taking care of each txg.
*/
- svr->svr_bytes_done[txg & TXG_MASK] += length;
+ svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
mutex_exit(&svr->svr_lock);
zio_alloc_list_t zal;
metaslab_trace_init(&zal);
- uint64_t thismax = *max_alloc;
- while (length > 0) {
- uint64_t mylen = MIN(length, thismax);
-
- int error = spa_vdev_copy_segment(svr->svr_vdev,
- offset, mylen, txg, vca, &zal);
+ uint64_t thismax = SPA_MAXBLOCKSIZE;
+ while (!range_tree_is_empty(segs)) {
+ int error = spa_vdev_copy_segment(vd,
+ segs, thismax, txg, vca, &zal);
if (error == ENOSPC) {
/*
@@ -1126,18 +1300,17 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
*/
ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
- thismax = P2ROUNDUP(mylen / 2,
+ uint64_t attempted =
+ MIN(range_tree_span(segs), thismax);
+ thismax = P2ROUNDUP(attempted / 2,
1 << spa->spa_max_ashift);
- ASSERT3U(thismax, <, mylen);
/*
* The minimum-size allocation can not fail.
*/
- ASSERT3U(mylen, >, 1 << spa->spa_max_ashift);
- *max_alloc = mylen - (1 << spa->spa_max_ashift);
+ ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
+ *max_alloc = attempted - (1 << spa->spa_max_ashift);
} else {
ASSERT0(error);
- length -= mylen;
- offset += mylen;
/*
* We've performed an allocation, so reset the
@@ -1148,6 +1321,7 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
}
}
metaslab_trace_fini(&zal);
+ range_tree_destroy(segs);
}
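
Worked example (illustrative): if allocating a 1 MiB span fails with ENOSPC
on a pool with ashift = 12, attempted = 1 MiB, so thismax becomes
P2ROUNDUP(512 KiB, 4 KiB) = 512 KiB for the retry and *max_alloc drops to
1 MiB - 4 KiB, keeping every later mapping in this txg below the size that
just failed.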
/*
@@ -1169,12 +1343,14 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
static void
spa_vdev_remove_thread(void *arg)
{
- vdev_t *vd = arg;
- spa_t *spa = vd->vdev_spa;
+ spa_t *spa = arg;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
vdev_copy_arg_t vca;
uint64_t max_alloc = zfs_remove_max_segment;
uint64_t last_txg = 0;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
@@ -1182,7 +1358,6 @@ spa_vdev_remove_thread(void *arg)
ASSERT(vdev_is_concrete(vd));
ASSERT(vd->vdev_removing);
ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
- ASSERT3P(svr->svr_vdev, ==, vd);
ASSERT(vim != NULL);
mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -1264,6 +1439,17 @@ spa_vdev_remove_thread(void *arg)
mutex_exit(&svr->svr_lock);
/*
+ * We need to periodically drop the config lock so that
+ * writers can get in. Additionally, we can't wait
+ * for a txg to sync while holding a config lock
+ * (since a waiting writer could cause a 3-way deadlock
+ * with the sync thread, which also gets a config
+ * lock for reader). So we can't hold the config lock
+ * while calling dmu_tx_assign().
+ */
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
* This delay will pause the removal around the point
* specified by zfs_remove_max_bytes_pause. We do this
* solely from the test suite or during debugging.
@@ -1289,11 +1475,19 @@ spa_vdev_remove_thread(void *arg)
VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
uint64_t txg = dmu_tx_get_txg(tx);
+ /*
+ * Reacquire the vdev_config lock. The vdev_t
+ * that we're removing may have changed, e.g. due
+ * to a vdev_attach or vdev_detach.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+
if (txg != last_txg)
max_alloc = zfs_remove_max_segment;
last_txg = txg;
- spa_vdev_copy_impl(svr, &vca, &max_alloc, tx);
+ spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
dmu_tx_commit(tx);
mutex_enter(&svr->svr_lock);
@@ -1301,6 +1495,9 @@ spa_vdev_remove_thread(void *arg)
}
mutex_exit(&svr->svr_lock);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
/*
* Wait for all copies to finish before cleaning up the vca.
*/
@@ -1318,7 +1515,7 @@ spa_vdev_remove_thread(void *arg)
mutex_exit(&svr->svr_lock);
} else {
ASSERT0(range_tree_space(svr->svr_allocd_segs));
- vdev_remove_complete(vd);
+ vdev_remove_complete(spa);
}
thread_exit();
}
@@ -1360,7 +1557,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
- vdev_t *vd = svr->svr_vdev;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
objset_t *mos = spa->spa_meta_objset;
@@ -1433,8 +1630,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
* because we have not allocated mappings for it yet.
*/
uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
- range_tree_clear(svr->svr_allocd_segs, syncd,
- msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd);
+ uint64_t sm_end = msp->ms_sm->sm_start +
+ msp->ms_sm->sm_size;
+ if (sm_end > syncd)
+ range_tree_clear(svr->svr_allocd_segs,
+ syncd, sm_end - syncd);
mutex_exit(&svr->svr_lock);
}
@@ -1495,7 +1695,7 @@ spa_vdev_remove_cancel(spa_t *spa)
if (spa->spa_vdev_removal == NULL)
return (ESRCH);
- uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id;
+ uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
spa_vdev_remove_cancel_sync, NULL, 0,
@@ -1625,6 +1825,9 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
/* Make sure these changes are sync'ed */
spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
+ /* Stop initializing */
+ (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+
*txg = spa_vdev_config_enter(spa);
sysevent_t *ev = spa_event_create(spa, vd, NULL,
@@ -1785,6 +1988,13 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
*/
error = spa_reset_logs(spa);
+ /*
+ * We stop any initializing that is currently in progress but leave
+ * the state as "active". This will allow the initializing to resume
+ * if the removal is canceled sometime later.
+ */
+ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
+
*txg = spa_vdev_config_enter(spa);
/*
@@ -1796,6 +2006,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
if (error != 0) {
metaslab_group_activate(mg);
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
return (error);
}
@@ -1806,7 +2017,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
dsl_sync_task_nowait(spa->spa_dsl_pool,
vdev_remove_initiate_sync,
- vd, 0, ZFS_SPACE_CHECK_NONE, tx);
+ (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
dmu_tx_commit(tx);
return (0);
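
The hunks above replace the cached svr_vdev pointer with svr_vdev_id and re-derive the vdev_t on each pass. A minimal sketch of the resulting pattern, assuming spa, svr, and tx from the surrounding spa_vdev_remove_thread() loop (error handling elided):

	/*
	 * The config lock cannot be held across dmu_tx_assign() (it may
	 * wait for a txg to sync), so drop it, assign the tx, retake it,
	 * and refresh the possibly-stale vdev_t pointer by id.
	 */
	spa_config_exit(spa, SCL_CONFIG, FTAG);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);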
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
index 92c670d28b2c..a03d18704dfc 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -151,6 +151,7 @@ vdev_ops_t vdev_root_ops = {
NULL,
NULL,
NULL,
+ NULL,
VDEV_TYPE_ROOT, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
index b40263fc981c..fc9ac80593ac 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
@@ -58,9 +58,7 @@ static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
void
fzap_byteswap(void *vbuf, size_t size)
{
- uint64_t block_type;
-
- block_type = *(uint64_t *)vbuf;
+ uint64_t block_type = *(uint64_t *)vbuf;
if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
zap_leaf_byteswap(vbuf, size);
@@ -73,11 +71,6 @@ fzap_byteswap(void *vbuf, size_t size)
void
fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
{
- dmu_buf_t *db;
- zap_leaf_t *l;
- int i;
- zap_phys_t *zp;
-
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
zap->zap_ismicro = FALSE;
@@ -87,7 +80,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
- zp = zap_f_phys(zap);
+ zap_phys_t *zp = zap_f_phys(zap);
/*
 * Explicitly zero it, since it might be coming from an
 * initialized microzap.
@@ -106,17 +99,18 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
zp->zap_flags = flags;
/* block 1 will be the first leaf */
- for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
+ for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
/*
* set up block 1 - the first leaf
*/
- VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ dmu_buf_t *db;
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db, tx);
- l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
l->l_dbuf = db;
zap_leaf_init(l, zp->zap_normflags != 0);
@@ -146,9 +140,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
dmu_tx_t *tx)
{
- uint64_t b, newblk;
- dmu_buf_t *db_old, *db_new;
- int err;
+ uint64_t newblk;
int bs = FZAP_BLOCK_SHIFT(zap);
int hepb = 1<<(bs-4);
/* hepb = half the number of entries in a block */
@@ -172,21 +164,23 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
* Copy the ptrtbl from the old to new location.
*/
- b = tbl->zt_blks_copied;
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ uint64_t b = tbl->zt_blks_copied;
+ dmu_buf_t *db_old;
+ int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
(tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
- if (err)
+ if (err != 0)
return (err);
/* first half of entries in old[b] go to new[2*b+0] */
- VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ dmu_buf_t *db_new;
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
(newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db_new, tx);
transfer_func(db_old->db_data, db_new->db_data, hepb);
dmu_buf_rele(db_new, FTAG);
/* second half of entries in old[b] go to new[2*b+1] */
- VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
(newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db_new, tx);
transfer_func((uint64_t *)db_old->db_data + hepb,
@@ -221,22 +215,20 @@ static int
zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
dmu_tx_t *tx)
{
- int err;
- uint64_t blk, off;
int bs = FZAP_BLOCK_SHIFT(zap);
- dmu_buf_t *db;
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
ASSERT(tbl->zt_blk != 0);
dprintf("storing %llx at index %llx\n", val, idx);
- blk = idx >> (bs-3);
- off = idx & ((1<<(bs-3))-1);
+ uint64_t blk = idx >> (bs-3);
+ uint64_t off = idx & ((1<<(bs-3))-1);
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ dmu_buf_t *db;
+ int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
- if (err)
+ if (err != 0)
return (err);
dmu_buf_will_dirty(db, tx);
@@ -249,7 +241,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
(tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
DMU_READ_NO_PREFETCH);
- if (err) {
+ if (err != 0) {
dmu_buf_rele(db, FTAG);
return (err);
}
@@ -268,27 +260,24 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
static int
zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
{
- uint64_t blk, off;
- int err;
- dmu_buf_t *db;
- dnode_t *dn;
int bs = FZAP_BLOCK_SHIFT(zap);
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- blk = idx >> (bs-3);
- off = idx & ((1<<(bs-3))-1);
+ uint64_t blk = idx >> (bs-3);
+ uint64_t off = idx & ((1<<(bs-3))-1);
/*
 * Note: this is equivalent to dmu_buf_hold(), but we use
 * _dnode_enter / _by_dnode because it's faster, since we
 * don't have to hold the dnode.
*/
- dn = dmu_buf_dnode_enter(zap->zap_dbuf);
- err = dmu_buf_hold_by_dnode(dn,
+ dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+ dmu_buf_t *db;
+ int err = dmu_buf_hold_by_dnode(dn,
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
- if (err)
+ if (err != 0)
return (err);
*valp = ((uint64_t *)db->db_data)[off];
dmu_buf_rele(db, FTAG);
@@ -319,11 +308,10 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
static void
zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
{
- int i;
- for (i = 0; i < n; i++) {
+ for (int i = 0; i < n; i++) {
uint64_t lb = src[i];
- dst[2*i+0] = lb;
- dst[2*i+1] = lb;
+ dst[2 * i + 0] = lb;
+ dst[2 * i + 1] = lb;
}
}
@@ -345,19 +333,16 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
* stored in the header block). Give it its own entire
* block, which will double the size of the ptrtbl.
*/
- uint64_t newblk;
- dmu_buf_t *db_new;
- int err;
-
ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
- newblk = zap_allocate_blocks(zap, 1);
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ uint64_t newblk = zap_allocate_blocks(zap, 1);
+ dmu_buf_t *db_new;
+ int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
DMU_READ_NO_PREFETCH);
- if (err)
+ if (err != 0)
return (err);
dmu_buf_will_dirty(db_new, tx);
zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
@@ -392,9 +377,8 @@ zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
static uint64_t
zap_allocate_blocks(zap_t *zap, int nblocks)
{
- uint64_t newblk;
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- newblk = zap_f_phys(zap)->zap_freeblk;
+ uint64_t newblk = zap_f_phys(zap)->zap_freeblk;
zap_f_phys(zap)->zap_freeblk += nblocks;
return (newblk);
}
@@ -411,7 +395,6 @@ zap_leaf_evict_sync(void *dbu)
static zap_leaf_t *
zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
{
- void *winner;
zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
@@ -421,12 +404,11 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
l->l_blkid = zap_allocate_blocks(zap, 1);
l->l_dbuf = NULL;
- VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
DMU_READ_NO_PREFETCH));
dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
- winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
- ASSERT(winner == NULL);
+ VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu));
dmu_buf_will_dirty(l->l_dbuf, tx);
zap_leaf_init(l, zap->zap_normflags != 0);
@@ -460,11 +442,9 @@ zap_put_leaf(zap_leaf_t *l)
static zap_leaf_t *
zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
{
- zap_leaf_t *l, *winner;
-
ASSERT(blkid != 0);
- l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
rw_init(&l->l_rwlock, 0, 0, 0);
rw_enter(&l->l_rwlock, RW_WRITER);
l->l_blkid = blkid;
@@ -472,7 +452,7 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
l->l_dbuf = db;
dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
- winner = dmu_buf_set_user(db, &l->l_dbu);
+ zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu);
rw_exit(&l->l_rwlock);
if (winner != NULL) {
@@ -510,17 +490,15 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
zap_leaf_t **lp)
{
dmu_buf_t *db;
- zap_leaf_t *l;
- int bs = FZAP_BLOCK_SHIFT(zap);
- int err;
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ int bs = FZAP_BLOCK_SHIFT(zap);
dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
- err = dmu_buf_hold_by_dnode(dn,
+ int err = dmu_buf_hold_by_dnode(dn,
blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
- if (err)
+ if (err != 0)
return (err);
ASSERT3U(db->db_object, ==, zap->zap_object);
@@ -528,7 +506,7 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
ASSERT3U(db->db_size, ==, 1 << bs);
ASSERT(blkid != 0);
- l = dmu_buf_get_user(db);
+ zap_leaf_t *l = dmu_buf_get_user(db);
if (l == NULL)
l = zap_open_leaf(blkid, db);
@@ -583,8 +561,7 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
static int
zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
{
- uint64_t idx, blk;
- int err;
+ uint64_t blk;
ASSERT(zap->zap_dbuf == NULL ||
zap_f_phys(zap) == zap->zap_dbuf->db_data);
@@ -596,8 +573,8 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
return (SET_ERROR(EIO));
}
- idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
- err = zap_idx_to_blk(zap, idx, &blk);
+ uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ int err = zap_idx_to_blk(zap, idx, &blk);
if (err != 0)
return (err);
err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
@@ -614,9 +591,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
{
zap_t *zap = zn->zn_zap;
uint64_t hash = zn->zn_hash;
- zap_leaf_t *nl;
- int prefix_diff, i, err;
- uint64_t sibling;
+ int err;
int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
@@ -636,19 +611,19 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
err = zap_lockdir(os, object, tx, RW_WRITER,
FALSE, FALSE, tag, &zn->zn_zap);
zap = zn->zn_zap;
- if (err)
+ if (err != 0)
return (err);
ASSERT(!zap->zap_ismicro);
while (old_prefix_len ==
zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
err = zap_grow_ptrtbl(zap, tx);
- if (err)
+ if (err != 0)
return (err);
}
err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
- if (err)
+ if (err != 0)
return (err);
if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
@@ -662,25 +637,26 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
zap_leaf_phys(l)->l_hdr.lh_prefix);
- prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+ int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
(old_prefix_len + 1);
- sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
+ uint64_t sibling =
+ (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
/* check for i/o errors before doing zap_leaf_split */
- for (i = 0; i < (1ULL<<prefix_diff); i++) {
+ for (int i = 0; i < (1ULL << prefix_diff); i++) {
uint64_t blk;
- err = zap_idx_to_blk(zap, sibling+i, &blk);
- if (err)
+ err = zap_idx_to_blk(zap, sibling + i, &blk);
+ if (err != 0)
return (err);
ASSERT3U(blk, ==, l->l_blkid);
}
- nl = zap_create_leaf(zap, tx);
+ zap_leaf_t *nl = zap_create_leaf(zap, tx);
zap_leaf_split(l, nl, zap->zap_normflags != 0);
/* set sibling pointers */
- for (i = 0; i < (1ULL << prefix_diff); i++) {
- err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
+ for (int i = 0; i < (1ULL << prefix_diff); i++) {
+ err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx);
ASSERT0(err); /* we checked for i/o errors above */
}
@@ -708,8 +684,6 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
zap_put_leaf(l);
if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
- int err;
-
/*
* We are in the middle of growing the pointer table, or
* this leaf will soon make us grow it.
@@ -719,10 +693,10 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
uint64_t zapobj = zap->zap_object;
zap_unlockdir(zap, tag);
- err = zap_lockdir(os, zapobj, tx,
+ int err = zap_lockdir(os, zapobj, tx,
RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
zap = zn->zn_zap;
- if (err)
+ if (err != 0)
return;
}
@@ -763,9 +737,8 @@ fzap_checksize(uint64_t integer_size, uint64_t num_integers)
static int
fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
{
- int err;
-
- if ((err = fzap_checkname(zn)) != 0)
+ int err = fzap_checkname(zn);
+ if (err != 0)
return (err);
return (fzap_checksize(integer_size, num_integers));
}
@@ -779,10 +752,10 @@ fzap_lookup(zap_name_t *zn,
char *realname, int rn_len, boolean_t *ncp)
{
zap_leaf_t *l;
- int err;
zap_entry_handle_t zeh;
- if ((err = fzap_checkname(zn)) != 0)
+ int err = fzap_checkname(zn);
+ if (err != 0)
return (err);
err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
@@ -870,7 +843,8 @@ fzap_update(zap_name_t *zn,
void *tag, dmu_tx_t *tx)
{
zap_leaf_t *l;
- int err, create;
+ int err;
+ boolean_t create;
zap_entry_handle_t zeh;
zap_t *zap = zn->zn_zap;
@@ -923,9 +897,9 @@ fzap_length(zap_name_t *zn,
if (err != 0)
goto out;
- if (integer_size)
+ if (integer_size != 0)
*integer_size = zeh.zeh_integer_size;
- if (num_integers)
+ if (num_integers != 0)
*num_integers = zeh.zeh_num_integers;
out:
zap_put_leaf(l);
@@ -954,15 +928,14 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
void
fzap_prefetch(zap_name_t *zn)
{
- uint64_t idx, blk;
+ uint64_t blk;
zap_t *zap = zn->zn_zap;
- int bs;
- idx = ZAP_HASH_IDX(zn->zn_hash,
+ uint64_t idx = ZAP_HASH_IDX(zn->zn_hash,
zap_f_phys(zap)->zap_ptrtbl.zt_shift);
if (zap_idx_to_blk(zap, idx, &blk) != 0)
return;
- bs = FZAP_BLOCK_SHIFT(zap);
+ int bs = FZAP_BLOCK_SHIFT(zap);
dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
ZIO_PRIORITY_SYNC_READ);
}
@@ -975,9 +948,8 @@ uint64_t
zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
const char *name, dmu_tx_t *tx)
{
- uint64_t new_obj;
-
- VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0);
+ uint64_t new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx);
+ VERIFY(new_obj != 0);
VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
tx));
@@ -989,13 +961,12 @@ zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
char *name)
{
zap_cursor_t zc;
- zap_attribute_t *za;
int err;
if (mask == 0)
mask = -1ULL;
- za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
for (zap_cursor_init(&zc, os, zapobj);
(err = zap_cursor_retrieve(&zc, za)) == 0;
zap_cursor_advance(&zc)) {
@@ -1005,7 +976,7 @@ zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
}
}
zap_cursor_fini(&zc);
- kmem_free(za, sizeof (zap_attribute_t));
+ kmem_free(za, sizeof (*za));
return (err);
}
@@ -1013,23 +984,23 @@ int
zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
{
zap_cursor_t zc;
- zap_attribute_t za;
- int err;
+ int err = 0;
- err = 0;
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
for (zap_cursor_init(&zc, os, fromobj);
- zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_retrieve(&zc, za) == 0;
(void) zap_cursor_advance(&zc)) {
- if (za.za_integer_length != 8 || za.za_num_integers != 1) {
+ if (za->za_integer_length != 8 || za->za_num_integers != 1) {
err = SET_ERROR(EINVAL);
break;
}
- err = zap_add(os, intoobj, za.za_name,
- 8, 1, &za.za_first_integer, tx);
- if (err)
+ err = zap_add(os, intoobj, za->za_name,
+ 8, 1, &za->za_first_integer, tx);
+ if (err != 0)
break;
}
zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
return (err);
}
@@ -1038,23 +1009,23 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
uint64_t value, dmu_tx_t *tx)
{
zap_cursor_t zc;
- zap_attribute_t za;
- int err;
+ int err = 0;
- err = 0;
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
for (zap_cursor_init(&zc, os, fromobj);
- zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_retrieve(&zc, za) == 0;
(void) zap_cursor_advance(&zc)) {
- if (za.za_integer_length != 8 || za.za_num_integers != 1) {
+ if (za->za_integer_length != 8 || za->za_num_integers != 1) {
err = SET_ERROR(EINVAL);
break;
}
- err = zap_add(os, intoobj, za.za_name,
+ err = zap_add(os, intoobj, za->za_name,
8, 1, &value, tx);
if (err != 0)
break;
}
zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
return (err);
}
@@ -1063,29 +1034,29 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
dmu_tx_t *tx)
{
zap_cursor_t zc;
- zap_attribute_t za;
- int err;
+ int err = 0;
- err = 0;
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
for (zap_cursor_init(&zc, os, fromobj);
- zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_retrieve(&zc, za) == 0;
(void) zap_cursor_advance(&zc)) {
uint64_t delta = 0;
- if (za.za_integer_length != 8 || za.za_num_integers != 1) {
+ if (za->za_integer_length != 8 || za->za_num_integers != 1) {
err = SET_ERROR(EINVAL);
break;
}
- err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta);
+ err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta);
if (err != 0 && err != ENOENT)
break;
- delta += za.za_first_integer;
- err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx);
- if (err)
+ delta += za->za_first_integer;
+ err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx);
+ if (err != 0)
break;
}
zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
return (err);
}
@@ -1150,12 +1121,11 @@ zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
dmu_tx_t *tx)
{
uint64_t value = 0;
- int err;
if (delta == 0)
return (0);
- err = zap_lookup(os, obj, name, 8, 1, &value);
+ int err = zap_lookup(os, obj, name, 8, 1, &value);
if (err != 0 && err != ENOENT)
return (err);
value += delta;
@@ -1253,7 +1223,6 @@ again:
static void
zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
{
- int i, err;
uint64_t lastblk = 0;
/*
@@ -1261,14 +1230,14 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
* can hold, then it'll be accounted for more than once, since
* we won't have lastblk.
*/
- for (i = 0; i < len; i++) {
+ for (int i = 0; i < len; i++) {
zap_leaf_t *l;
if (tbl[i] == lastblk)
continue;
lastblk = tbl[i];
- err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
+ int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
if (err == 0) {
zap_leaf_stats(zap, l, zs);
zap_put_leaf(l);
@@ -1333,14 +1302,12 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
} else {
- int b;
-
dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
ZIO_PRIORITY_SYNC_READ);
- for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+ for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
b++) {
dmu_buf_t *db;
int err;
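
The zap_join(), zap_join_key(), and zap_join_increment() hunks above move zap_attribute_t from the stack to kmem_alloc(); the structure embeds a full ZAP name buffer, so heap allocation keeps it off the limited kernel stack. A minimal sketch of the resulting cursor idiom, where os and zapobj stand in for a real objset and object id (error handling elided):

	zap_cursor_t zc;
	zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);

	for (zap_cursor_init(&zc, os, zapobj);
	    zap_cursor_retrieve(&zc, za) == 0;
	    zap_cursor_advance(&zc)) {
		/* za->za_name and za->za_first_integer are valid here. */
	}
	zap_cursor_fini(&zc);
	kmem_free(za, sizeof (*za));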
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
index 35dca89728fb..1c7c736d8e97 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
*/
@@ -107,7 +107,6 @@ ldv(int len, const void *addr)
void
zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
{
- int i;
zap_leaf_t l;
dmu_buf_t l_dbuf;
@@ -123,10 +122,10 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
- for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
+ for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
- for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
+ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
struct zap_leaf_entry *le;
@@ -162,14 +161,12 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
void
zap_leaf_init(zap_leaf_t *l, boolean_t sort)
{
- int i;
-
l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
sizeof (struct zap_leaf_header));
zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
2*ZAP_LEAF_HASH_NUMENTRIES(l));
- for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
}
@@ -188,11 +185,9 @@ zap_leaf_init(zap_leaf_t *l, boolean_t sort)
static uint16_t
zap_leaf_chunk_alloc(zap_leaf_t *l)
{
- int chunk;
-
ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
- chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
+ int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
@@ -232,7 +227,7 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf,
uint16_t *chunkp = &chunk_head;
int byten = 0;
uint64_t value = 0;
- int shift = (integer_size-1)*8;
+ int shift = (integer_size - 1) * 8;
int len = num_integers;
ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
@@ -240,10 +235,9 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf,
while (len > 0) {
uint16_t chunk = zap_leaf_chunk_alloc(l);
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
- int i;
la->la_type = ZAP_CHUNK_ARRAY;
- for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
+ for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
if (byten == 0)
value = ldv(integer_size, buf);
la->la_array[i] = value >> shift;
@@ -321,10 +315,9 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
while (len > 0) {
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
- int i;
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
- for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
+ for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
value = (value << 8) | la->la_array[i];
byten++;
if (byten == array_int_len) {
@@ -347,16 +340,13 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
int bseen = 0;
if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
- uint64_t *thiskey;
- boolean_t match;
-
+ uint64_t *thiskey =
+ kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP);
ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
- thiskey = kmem_alloc(array_numints * sizeof (*thiskey),
- KM_SLEEP);
zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
sizeof (*thiskey), array_numints, thiskey);
- match = bcmp(thiskey, zn->zn_key_orig,
+ boolean_t match = bcmp(thiskey, zn->zn_key_orig,
array_numints * sizeof (*thiskey)) == 0;
kmem_free(thiskey, array_numints * sizeof (*thiskey));
return (match);
@@ -365,11 +355,10 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
ASSERT(zn->zn_key_intlen == 1);
if (zn->zn_matchtype & MT_NORMALIZE) {
char *thisname = kmem_alloc(array_numints, KM_SLEEP);
- boolean_t match;
zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
sizeof (char), array_numints, thisname);
- match = zap_match(zn, thisname);
+ boolean_t match = zap_match(zn, thisname);
kmem_free(thisname, array_numints);
return (match);
}
@@ -400,12 +389,11 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
int
zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
{
- uint16_t *chunkp;
struct zap_leaf_entry *le;
ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
- for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
+ for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
*chunkp != CHAIN_END; chunkp = &le->le_next) {
uint16_t chunk = *chunkp;
le = ZAP_LEAF_ENTRY(l, chunk);
@@ -446,17 +434,15 @@ int
zap_leaf_lookup_closest(zap_leaf_t *l,
uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
{
- uint16_t chunk;
uint64_t besth = -1ULL;
uint32_t bestcd = -1U;
uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
- uint16_t lh;
struct zap_leaf_entry *le;
ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
- for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
- for (chunk = zap_leaf_phys(l)->l_hash[lh];
+ for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
+ for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh];
chunk != CHAIN_END; chunk = le->le_next) {
le = ZAP_LEAF_ENTRY(l, chunk);
@@ -529,11 +515,10 @@ int
zap_entry_update(zap_entry_handle_t *zeh,
uint8_t integer_size, uint64_t num_integers, const void *buf)
{
- int delta_chunks;
zap_leaf_t *l = zeh->zeh_leaf;
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
- delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
+ int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
@@ -550,14 +535,12 @@ zap_entry_update(zap_entry_handle_t *zeh,
void
zap_entry_remove(zap_entry_handle_t *zeh)
{
- uint16_t entry_chunk;
- struct zap_leaf_entry *le;
zap_leaf_t *l = zeh->zeh_leaf;
ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
- entry_chunk = *zeh->zeh_chunkp;
- le = ZAP_LEAF_ENTRY(l, entry_chunk);
+ uint16_t entry_chunk = *zeh->zeh_chunkp;
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk);
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
zap_leaf_array_free(l, &le->le_name_chunk);
@@ -575,15 +558,12 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
zap_entry_handle_t *zeh)
{
uint16_t chunk;
- uint16_t *chunkp;
struct zap_leaf_entry *le;
- uint64_t valuelen;
- int numchunks;
uint64_t h = zn->zn_hash;
- valuelen = integer_size * num_integers;
+ uint64_t valuelen = integer_size * num_integers;
- numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
+ int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
return (E2BIG);
@@ -645,7 +625,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
/* link it into the hash chain */
/* XXX if we did the search above, we could just use that */
- chunkp = zap_leaf_rehash_entry(l, chunk);
+ uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk);
zap_leaf_phys(l)->l_hdr.lh_nentries++;
@@ -673,14 +653,13 @@ boolean_t
zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
const char *name, zap_t *zap)
{
- uint64_t chunk;
struct zap_leaf_entry *le;
boolean_t allocdzn = B_FALSE;
if (zap->zap_normflags == 0)
return (B_FALSE);
- for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
+ for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
chunk != CHAIN_END; chunk = le->le_next) {
le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
if (le->le_hash != zeh->zeh_hash)
@@ -763,14 +742,11 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
static void
zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
{
- struct zap_leaf_entry *le, *nle;
- uint16_t chunk;
-
- le = ZAP_LEAF_ENTRY(l, entry);
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
- chunk = zap_leaf_chunk_alloc(nl);
- nle = ZAP_LEAF_ENTRY(nl, chunk);
+ uint16_t chunk = zap_leaf_chunk_alloc(nl);
+ struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk);
*nle = *le; /* structure assignment */
(void) zap_leaf_rehash_entry(nl, chunk);
@@ -791,7 +767,6 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
void
zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
{
- int i;
int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
/* set new prefix and prefix_len */
@@ -818,7 +793,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
* but this accesses memory more sequentially, and when we're
* called, the block is usually pretty full.
*/
- for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
if (le->le_type != ZAP_CHUNK_ENTRY)
continue;
@@ -833,9 +808,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
void
zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
{
- int i, n;
-
- n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+ int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
zap_leaf_phys(l)->l_hdr.lh_prefix_len;
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_leafs_with_2n_pointers[n]++;
@@ -851,7 +824,7 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_blocks_n_tenths_full[n]++;
- for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
+ for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
int nentries = 0;
int chunk = zap_leaf_phys(l)->l_hash[i];
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
index 31dce3b1723b..50d5fc48f0c8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Nexenta Systems, Inc.
@@ -89,22 +89,20 @@ zap_hash(zap_name_t *zn)
ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
- int i;
const uint64_t *wp = zn->zn_key_norm;
ASSERT(zn->zn_key_intlen == 8);
- for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) {
- int j;
+ for (int i = 0; i < zn->zn_key_norm_numints;
+ wp++, i++) {
uint64_t word = *wp;
- for (j = 0; j < zn->zn_key_intlen; j++) {
+ for (int j = 0; j < zn->zn_key_intlen; j++) {
h = (h >> 8) ^
zfs_crc64_table[(h ^ word) & 0xFF];
word >>= NBBY;
}
}
} else {
- int i, len;
const uint8_t *cp = zn->zn_key_norm;
/*
@@ -114,10 +112,10 @@ zap_hash(zap_name_t *zn)
* zn_key_*_numints includes the terminating
* null for non-binary keys.)
*/
- len = zn->zn_key_norm_numints - 1;
+ int len = zn->zn_key_norm_numints - 1;
ASSERT(zn->zn_key_intlen == 1);
- for (i = 0; i < len; cp++, i++) {
+ for (int i = 0; i < len; cp++, i++) {
h = (h >> 8) ^
zfs_crc64_table[(h ^ *cp) & 0xFF];
}
@@ -137,15 +135,12 @@ zap_hash(zap_name_t *zn)
static int
zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
{
- size_t inlen, outlen;
- int err;
-
ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
- inlen = strlen(name) + 1;
- outlen = ZAP_MAXNAMELEN;
+ size_t inlen = strlen(name) + 1;
+ size_t outlen = ZAP_MAXNAMELEN;
- err = 0;
+ int err = 0;
(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
U8_UNICODE_LATEST, &err);
@@ -255,12 +250,11 @@ zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
static void
mzap_byteswap(mzap_phys_t *buf, size_t size)
{
- int i, max;
buf->mz_block_type = BSWAP_64(buf->mz_block_type);
buf->mz_salt = BSWAP_64(buf->mz_salt);
buf->mz_normflags = BSWAP_64(buf->mz_normflags);
- max = (size / MZAP_ENT_LEN) - 1;
- for (i = 0; i < max; i++) {
+ int max = (size / MZAP_ENT_LEN) - 1;
+ for (int i = 0; i < max; i++) {
buf->mz_chunk[i].mze_value =
BSWAP_64(buf->mz_chunk[i].mze_value);
buf->mz_chunk[i].mze_cd =
@@ -271,9 +265,7 @@ mzap_byteswap(mzap_phys_t *buf, size_t size)
void
zap_byteswap(void *buf, size_t size)
{
- uint64_t block_type;
-
- block_type = *(uint64_t *)buf;
+ uint64_t block_type = *(uint64_t *)buf;
if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
/* ASSERT(magic == ZAP_LEAF_MAGIC); */
@@ -289,27 +281,22 @@ mze_compare(const void *arg1, const void *arg2)
const mzap_ent_t *mze1 = arg1;
const mzap_ent_t *mze2 = arg2;
- if (mze1->mze_hash > mze2->mze_hash)
- return (+1);
- if (mze1->mze_hash < mze2->mze_hash)
- return (-1);
- if (mze1->mze_cd > mze2->mze_cd)
- return (+1);
- if (mze1->mze_cd < mze2->mze_cd)
- return (-1);
- return (0);
+ int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash);
+ if (likely(cmp))
+ return (cmp);
+
+ return (AVL_CMP(mze1->mze_cd, mze2->mze_cd));
}
static int
mze_insert(zap_t *zap, int chunkid, uint64_t hash)
{
- mzap_ent_t *mze;
avl_index_t idx;
ASSERT(zap->zap_ismicro);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
+ mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
mze->mze_chunkid = chunkid;
mze->mze_hash = hash;
mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
@@ -352,10 +339,8 @@ static uint32_t
mze_find_unused_cd(zap_t *zap, uint64_t hash)
{
mzap_ent_t mze_tofind;
- mzap_ent_t *mze;
avl_index_t idx;
avl_tree_t *avl = &zap->zap_m.zap_avl;
- uint32_t cd;
ASSERT(zap->zap_ismicro);
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
@@ -363,8 +348,8 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash)
mze_tofind.mze_hash = hash;
mze_tofind.mze_cd = 0;
- cd = 0;
- for (mze = avl_find(avl, &mze_tofind, &idx);
+ uint32_t cd = 0;
+ for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx);
mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
if (mze->mze_cd != cd)
break;
@@ -399,15 +384,13 @@ static zap_t *
mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
{
zap_t *winner;
- zap_t *zap;
- int i;
uint64_t *zap_hdr = (uint64_t *)db->db_data;
uint64_t zap_block_type = zap_hdr[0];
uint64_t zap_magic = zap_hdr[1];
ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
- zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
+ zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
rw_init(&zap->zap_rwlock, 0, 0, 0);
rw_enter(&zap->zap_rwlock, RW_WRITER);
zap->zap_objset = os;
@@ -443,7 +426,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
avl_create(&zap->zap_m.zap_avl, mze_compare,
sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
- for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
mzap_ent_phys_t *mze =
&zap_m_phys(zap)->mz_chunk[i];
if (mze->mze_name[0]) {
@@ -495,28 +478,21 @@ handle_winner:
return (winner);
}
+/*
+ * This routine "consumes" the caller's hold on the dbuf, which must
+ * have the specified tag.
+ */
static int
zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
- zap_t *zap;
- krw_t lt;
-
ASSERT0(db->db_offset);
objset_t *os = dmu_buf_get_objset(db);
uint64_t obj = db->db_object;
*zapp = NULL;
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(db, &doi);
- ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
- }
-#endif
-
- zap = dmu_buf_get_user(db);
+ zap_t *zap = dmu_buf_get_user(db);
if (zap == NULL) {
zap = mzap_open(os, obj, db);
if (zap == NULL) {
@@ -535,7 +511,7 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
* can only be different if it was upgraded from micro to fat,
* and micro wanted WRITER but fat only needs READER.
*/
- lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
+ krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
rw_enter(&zap->zap_rwlock, lt);
if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
/* it was upgraded, now we only need reader */
@@ -581,12 +557,19 @@ zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
{
dmu_buf_t *db;
- int err;
- err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+ int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
if (err != 0) {
return (err);
}
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
+ }
+#endif
+
err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
if (err != 0) {
dmu_buf_rele(db, tag);
@@ -599,11 +582,17 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
{
dmu_buf_t *db;
- int err;
- err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
+ int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
if (err != 0)
return (err);
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
+ }
+#endif
err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
if (err != 0)
dmu_buf_rele(db, tag);
@@ -620,22 +609,20 @@ zap_unlockdir(zap_t *zap, void *tag)
static int
mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
- mzap_phys_t *mzp;
- int i, sz, nchunks;
int err = 0;
zap_t *zap = *zapp;
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- sz = zap->zap_dbuf->db_size;
- mzp = zio_buf_alloc(sz);
+ int sz = zap->zap_dbuf->db_size;
+ mzap_phys_t *mzp = zio_buf_alloc(sz);
bcopy(zap->zap_dbuf->db_data, mzp, sz);
- nchunks = zap->zap_m.zap_num_chunks;
+ int nchunks = zap->zap_m.zap_num_chunks;
if (!flags) {
err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
1ULL << fzap_default_block_shift, 0, tx);
- if (err) {
+ if (err != 0) {
zio_buf_free(mzp, sz);
return (err);
}
@@ -648,19 +635,18 @@ mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
fzap_upgrade(zap, tx, flags);
- for (i = 0; i < nchunks; i++) {
+ for (int i = 0; i < nchunks; i++) {
mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
- zap_name_t *zn;
if (mze->mze_name[0] == 0)
continue;
dprintf("adding %s=%llu\n",
mze->mze_name, mze->mze_value);
- zn = zap_name_alloc(zap, mze->mze_name, 0);
+ zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0);
err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
tag, tx);
zap = zn->zn_zap; /* fzap_add_cd() may change zap */
zap_name_free(zn);
- if (err)
+ if (err != 0)
break;
}
zio_buf_free(mzp, sz);
@@ -690,32 +676,24 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
dmu_tx_t *tx)
{
dmu_buf_t *db;
- mzap_phys_t *zp;
- VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
-
-#ifdef ZFS_DEBUG
- {
- dmu_object_info_t doi;
- dmu_object_info_from_db(db, &doi);
- ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
- }
-#endif
+ VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db, tx);
- zp = db->db_data;
+ mzap_phys_t *zp = db->db_data;
zp->mz_block_type = ZBT_MICRO;
zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
zp->mz_normflags = normflags;
- dmu_buf_rele(db, FTAG);
if (flags != 0) {
zap_t *zap;
/* Only fat zap supports flags; upgrade immediately. */
- VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
- B_FALSE, B_FALSE, FTAG, &zap));
- VERIFY3U(0, ==, mzap_upgrade(&zap, FTAG, tx, flags));
+ VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER,
+ B_FALSE, B_FALSE, &zap));
+ VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
zap_unlockdir(zap, FTAG);
+ } else {
+ dmu_buf_rele(db, FTAG);
}
}
@@ -732,9 +710,8 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- int err;
-
- err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
+ ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
+ int err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
if (err != 0)
return (err);
mzap_create_impl(os, obj, normflags, 0, tx);
@@ -752,6 +729,7 @@ uint64_t
zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
mzap_create_impl(os, obj, normflags, 0, tx);
@@ -763,6 +741,7 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
@@ -808,10 +787,10 @@ int
zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
{
zap_t *zap;
- int err;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
- if (err)
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
return (err);
if (!zap->zap_ismicro) {
err = fzap_count(zap, count);
@@ -829,7 +808,6 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
static boolean_t
mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
{
- mzap_ent_t *other;
int direction = AVL_BEFORE;
boolean_t allocdzn = B_FALSE;
@@ -837,7 +815,7 @@ mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
return (B_FALSE);
again:
- for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
+ for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
other && other->mze_hash == mze->mze_hash;
other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
@@ -882,10 +860,8 @@ zap_lookup_impl(zap_t *zap, const char *name,
boolean_t *ncp)
{
int err = 0;
- mzap_ent_t *mze;
- zap_name_t *zn;
- zn = zap_name_alloc(zap, name, mt);
+ zap_name_t *zn = zap_name_alloc(zap, name, mt);
if (zn == NULL)
return (SET_ERROR(ENOTSUP));
@@ -893,7 +869,7 @@ zap_lookup_impl(zap_t *zap, const char *name,
err = fzap_lookup(zn, integer_size, num_integers, buf,
realname, rn_len, ncp);
} else {
- mze = mze_find(zn);
+ mzap_ent_t *mze = mze_find(zn);
if (mze == NULL) {
err = SET_ERROR(ENOENT);
} else {
@@ -924,9 +900,9 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
boolean_t *ncp)
{
zap_t *zap;
- int err;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_lookup_impl(zap, name, integer_size,
@@ -950,9 +926,8 @@ zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
boolean_t *ncp)
{
zap_t *zap;
- int err;
- err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+ int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
FTAG, &zap);
if (err != 0)
return (err);
@@ -967,13 +942,12 @@ zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints)
{
zap_t *zap;
- int err;
- zap_name_t *zn;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
- if (err)
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
return (err);
- zn = zap_name_alloc_uint64(zap, key, key_numints);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
@@ -990,13 +964,12 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
zap_t *zap;
- int err;
- zap_name_t *zn;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
- if (err)
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
return (err);
- zn = zap_name_alloc_uint64(zap, key, key_numints);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
@@ -1024,14 +997,12 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
uint64_t *integer_size, uint64_t *num_integers)
{
zap_t *zap;
- int err;
- mzap_ent_t *mze;
- zap_name_t *zn;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
- if (err)
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
return (err);
- zn = zap_name_alloc(zap, name, 0);
+ zap_name_t *zn = zap_name_alloc(zap, name, 0);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
@@ -1039,7 +1010,7 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name,
if (!zap->zap_ismicro) {
err = fzap_length(zn, integer_size, num_integers);
} else {
- mze = mze_find(zn);
+ mzap_ent_t *mze = mze_find(zn);
if (mze == NULL) {
err = SET_ERROR(ENOENT);
} else {
@@ -1059,13 +1030,12 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, uint64_t *integer_size, uint64_t *num_integers)
{
zap_t *zap;
- int err;
- zap_name_t *zn;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
- if (err)
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
return (err);
- zn = zap_name_alloc_uint64(zap, key, key_numints);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
@@ -1079,26 +1049,24 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
- int i;
zap_t *zap = zn->zn_zap;
int start = zap->zap_m.zap_alloc_next;
- uint32_t cd;
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
#ifdef ZFS_DEBUG
- for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
}
#endif
- cd = mze_find_unused_cd(zap, zn->zn_hash);
+ uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
/* given the limited size of the microzap, this can't happen */
ASSERT(cd < zap_maxcd(zap));
again:
- for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
+ for (int i = start; i < zap->zap_m.zap_num_chunks; i++) {
mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
if (mze->mze_name[0] == 0) {
mze->mze_value = value;
@@ -1125,12 +1093,10 @@ zap_add_impl(zap_t *zap, const char *key,
int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx, void *tag)
{
- int err = 0;
- mzap_ent_t *mze;
const uint64_t *intval = val;
- zap_name_t *zn;
+ int err = 0;
- zn = zap_name_alloc(zap, key, 0);
+ zap_name_t *zn = zap_name_alloc(zap, key, 0);
if (zn == NULL) {
zap_unlockdir(zap, tag);
return (SET_ERROR(ENOTSUP));
@@ -1147,8 +1113,7 @@ zap_add_impl(zap_t *zap, const char *key,
}
zap = zn->zn_zap; /* fzap_add() may change zap */
} else {
- mze = mze_find(zn);
- if (mze != NULL) {
+ if (mze_find(zn) != NULL) {
err = SET_ERROR(EEXIST);
} else {
mzap_addent(zn, *intval);
@@ -1199,13 +1164,12 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
const void *val, dmu_tx_t *tx)
{
zap_t *zap;
- int err;
- zap_name_t *zn;
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
- if (err)
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
return (err);
- zn = zap_name_alloc_uint64(zap, key, key_numints);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
@@ -1223,11 +1187,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
zap_t *zap;
- mzap_ent_t *mze;
uint64_t oldval;
const uint64_t *intval = val;
- zap_name_t *zn;
- int err;
#ifdef ZFS_DEBUG
/*
@@ -1238,10 +1199,11 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
#endif
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
- if (err)
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
return (err);
- zn = zap_name_alloc(zap, name, 0);
+ zap_name_t *zn = zap_name_alloc(zap, name, 0);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
@@ -1261,7 +1223,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
}
zap = zn->zn_zap; /* fzap_update() may change zap */
} else {
- mze = mze_find(zn);
+ mzap_ent_t *mze = mze_find(zn);
if (mze != NULL) {
ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
MZE_PHYS(zap, mze)->mze_value = *intval;
@@ -1282,13 +1244,12 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
zap_t *zap;
- zap_name_t *zn;
- int err;
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
- if (err)
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
return (err);
- zn = zap_name_alloc_uint64(zap, key, key_numints);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
@@ -1311,17 +1272,15 @@ static int
zap_remove_impl(zap_t *zap, const char *name,
matchtype_t mt, dmu_tx_t *tx)
{
- mzap_ent_t *mze;
- zap_name_t *zn;
int err = 0;
- zn = zap_name_alloc(zap, name, mt);
+ zap_name_t *zn = zap_name_alloc(zap, name, mt);
if (zn == NULL)
return (SET_ERROR(ENOTSUP));
if (!zap->zap_ismicro) {
err = fzap_remove(zn, tx);
} else {
- mze = mze_find(zn);
+ mzap_ent_t *mze = mze_find(zn);
if (mze == NULL) {
err = SET_ERROR(ENOENT);
} else {
@@ -1369,13 +1328,12 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, dmu_tx_t *tx)
{
zap_t *zap;
- int err;
- zap_name_t *zn;
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
- if (err)
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
return (err);
- zn = zap_name_alloc_uint64(zap, key, key_numints);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
@@ -1451,9 +1409,6 @@ int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
int err;
- avl_index_t idx;
- mzap_ent_t mze_tofind;
- mzap_ent_t *mze;
if (zc->zc_hash == -1ULL)
return (SET_ERROR(ENOENT));
@@ -1462,7 +1417,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
int hb;
err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
- if (err)
+ if (err != 0)
return (err);
/*
@@ -1482,10 +1437,14 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
if (!zc->zc_zap->zap_ismicro) {
err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
} else {
+ avl_index_t idx;
+ mzap_ent_t mze_tofind;
+
mze_tofind.mze_hash = zc->zc_hash;
mze_tofind.mze_cd = zc->zc_cd;
- mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
+ mzap_ent_t *mze =
+ avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
if (mze == NULL) {
mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
idx, AVL_AFTER);
@@ -1562,11 +1521,11 @@ out:
int
zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
{
- int err;
zap_t *zap;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
- if (err)
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
return (err);
bzero(zs, sizeof (zap_stats_t));
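
The mze_compare() hunk above, like the idx_compare(), domain_compare(), and zfs_range_compare() hunks further down, rewrites hand-rolled three-way compares in terms of the AVL_CMP() and AVL_ISIGN() helpers. Paraphrased from the avl.h of this vintage (a sketch, not the authoritative header), both are branch-free sign computations yielding -1, 0, or +1:

#define	AVL_ISIGN(a)	(((a) > 0) - ((a) < 0))
#define	AVL_CMP(a, b)	(((a) > (b)) - ((a) < (b)))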
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c
index ea20891ec211..54bc638c6e98 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c
@@ -433,7 +433,7 @@ zcp_lua_to_nvlist_impl(lua_State *state, int index, nvlist_t *nvl,
/*
* Convert a lua value to an nvpair, adding it to an nvlist with the given key.
*/
-void
+static void
zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key)
{
/*
@@ -445,7 +445,7 @@ zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key)
(void) lua_error(state);
}
-int
+static int
zcp_lua_to_nvlist_helper(lua_State *state)
{
nvlist_t *nv = (nvlist_t *)lua_touserdata(state, 2);
@@ -454,11 +454,12 @@ zcp_lua_to_nvlist_helper(lua_State *state)
return (0);
}
-void
+static void
zcp_convert_return_values(lua_State *state, nvlist_t *nvl,
const char *key, zcp_eval_arg_t *evalargs)
{
int err;
+ VERIFY3U(1, ==, lua_gettop(state));
lua_pushcfunction(state, zcp_lua_to_nvlist_helper);
lua_pushlightuserdata(state, (char *)key);
lua_pushlightuserdata(state, nvl);
@@ -904,6 +905,7 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs)
ZCP_RET_RETURN, evalargs);
} else if (return_count > 1) {
evalargs->ea_result = SET_ERROR(ECHRNG);
+ lua_settop(state, 0);
(void) lua_pushfstring(state, "Multiple return "
"values not supported");
zcp_convert_return_values(state, evalargs->ea_outnvl,
@@ -965,6 +967,7 @@ static void
zcp_pool_error(zcp_eval_arg_t *evalargs, const char *poolname)
{
evalargs->ea_result = SET_ERROR(ECHRNG);
+ lua_settop(evalargs->ea_state, 0);
(void) lua_pushfstring(evalargs->ea_state, "Could not open pool: %s",
poolname);
zcp_convert_return_values(evalargs->ea_state, evalargs->ea_outnvl,
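
Both zcp.c error paths gain a lua_settop(state, 0) before pushing the error string because zcp_convert_return_values() now asserts that exactly one value sits on the Lua stack. A minimal sketch of that stack discipline, with an illustrative message:

	lua_settop(state, 0);	/* discard anything left on the stack */
	(void) lua_pushfstring(state, "error: %s", "details");
	/* Depth is now exactly 1, satisfying VERIFY3U(1, ==, lua_gettop(state)). */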
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
index 78b2912df1d6..76003e3544f4 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
@@ -220,7 +220,7 @@ spa_features_check(spa_t *spa, boolean_t for_write,
*
* Note: well-designed features will not need to use this; they should
* use spa_feature_is_enabled() and spa_feature_is_active() instead.
- * However, this is non-static for zdb and zhack.
+ * However, this is non-static for zdb, zhack, and spa_add_feature_stats().
*/
int
feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
index e74799a70fe0..581b6b1bfb64 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
@@ -71,14 +71,10 @@ static char *nulldomain = "";
static int
idx_compare(const void *arg1, const void *arg2)
{
- const fuid_domain_t *node1 = arg1;
- const fuid_domain_t *node2 = arg2;
+ const fuid_domain_t *node1 = (const fuid_domain_t *)arg1;
+ const fuid_domain_t *node2 = (const fuid_domain_t *)arg2;
- if (node1->f_idx < node2->f_idx)
- return (-1);
- else if (node1->f_idx > node2->f_idx)
- return (1);
- return (0);
+ return (AVL_CMP(node1->f_idx, node2->f_idx));
}
/*
@@ -87,14 +83,13 @@ idx_compare(const void *arg1, const void *arg2)
static int
domain_compare(const void *arg1, const void *arg2)
{
- const fuid_domain_t *node1 = arg1;
- const fuid_domain_t *node2 = arg2;
+ const fuid_domain_t *node1 = (const fuid_domain_t *)arg1;
+ const fuid_domain_t *node2 = (const fuid_domain_t *)arg2;
int val;
val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name);
- if (val == 0)
- return (0);
- return (val > 0 ? 1 : -1);
+
+ return (AVL_ISIGN(val));
}
void
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
index ca6ac539f0d6..af73005c260e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
@@ -195,6 +195,8 @@
#include <sys/zcp.h>
#include <sys/zio_checksum.h>
#include <sys/vdev_removal.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
@@ -3865,6 +3867,80 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
}
/*
+ * innvl: {
+ * vdevs: {
+ * guid 1, guid 2, ...
+ * },
+ * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND}
+ * }
+ *
+ * outnvl: {
+ * [func: EINVAL (if the provided command type didn't make sense)],
+ * [vdevs: {
+ * guid1: errno, (see function body for possible errnos)
+ * ...
+ * }]
+ * }
+ *
+ */
+static int
+zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(poolname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ uint64_t cmd_type;
+ if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
+ &cmd_type) != 0) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
+ cmd_type == POOL_INITIALIZE_DO ||
+ cmd_type == POOL_INITIALIZE_SUSPEND)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ nvlist_t *vdev_guids;
+ if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
+ &vdev_guids) != 0) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ nvlist_t *vdev_errlist = fnvlist_alloc();
+ int total_errors = 0;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
+ uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+ error = spa_vdev_initialize(spa, vdev_guid, cmd_type);
+ if (error != 0) {
+ char guid_as_str[MAXNAMELEN];
+
+ (void) snprintf(guid_as_str, sizeof (guid_as_str),
+ "%llu", (unsigned long long)vdev_guid);
+ fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+ total_errors++;
+ }
+ }
+ if (fnvlist_size(vdev_errlist) > 0) {
+ fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
+ vdev_errlist);
+ }
+ fnvlist_free(vdev_errlist);
+
+ spa_close(spa, FTAG);
+ return (total_errors > 0 ? EINVAL : 0);
+}
+
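Callers drive ZFS_IOC_POOL_INITIALIZE with an nvlist matching the innvl contract in the comment above. A hedged userland sketch of assembling such a request with libnvpair; the key strings correspond to ZPOOL_INITIALIZE_COMMAND and ZPOOL_INITIALIZE_VDEVS (defined later in this diff in sys/fs/zfs.h), and the actual ioctl dispatch is omitted:

#include <libnvpair.h>
#include <sys/fs/zfs.h>

/* Build { func: POOL_INITIALIZE_DO, vdevs: { <label>: guid } }. */
static nvlist_t *
build_initialize_request(const char *vdev_label, uint64_t vdev_guid)
{
	nvlist_t *vdevs = fnvlist_alloc();
	nvlist_t *innvl = fnvlist_alloc();

	/*
	 * The kernel only reads the uint64 value (the guid); the
	 * name is a caller-chosen label such as the vdev path.
	 */
	fnvlist_add_uint64(vdevs, vdev_label, vdev_guid);

	fnvlist_add_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
	    POOL_INITIALIZE_DO);
	fnvlist_add_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, vdevs);
	fnvlist_free(vdevs);
	return (innvl);
}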
+/*
* fsname is name of dataset to rollback (to most recent snapshot)
*
* innvl may contain name of expected target snapshot
@@ -6118,6 +6194,10 @@ zfs_ioctl_init(void)
zfs_secpolicy_config, POOL_NAME,
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE,
+ zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
/* IOCTLS that use the legacy function signature */
zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
index b40bdbea123c..7743e81dd5f1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
@@ -594,12 +594,8 @@ zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
int
zfs_range_compare(const void *arg1, const void *arg2)
{
- const rl_t *rl1 = arg1;
- const rl_t *rl2 = arg2;
-
- if (rl1->r_off > rl2->r_off)
- return (1);
- if (rl1->r_off < rl2->r_off)
- return (-1);
- return (0);
+ const rl_t *rl1 = (const rl_t *)arg1;
+ const rl_t *rl2 = (const rl_t *)arg2;
+
+ return (AVL_CMP(rl1->r_off, rl2->r_off));
}
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
index 980c32820c3f..66d858081485 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
@@ -2630,6 +2630,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
dmu_tx_commit(tx);
zfsvfs->z_version = newvers;
+ os->os_version = newvers;
zfs_set_fuid_feature(zfsvfs);
@@ -2642,17 +2643,47 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
- const char *pname;
- int error = ENOENT;
+ uint64_t *cached_copy = NULL;
/*
- * Look up the file system's value for the property. For the
- * version property, we look up a slightly different string.
+ * Figure out where in the objset_t the cached copy would live, if it
+ * is available for the requested property.
*/
- if (prop == ZFS_PROP_VERSION)
+ if (os != NULL) {
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ cached_copy = &os->os_version;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ cached_copy = &os->os_normalization;
+ break;
+ case ZFS_PROP_UTF8ONLY:
+ cached_copy = &os->os_utf8only;
+ break;
+ case ZFS_PROP_CASE:
+ cached_copy = &os->os_casesensitivity;
+ break;
+ default:
+ break;
+ }
+ }
+ if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+ *value = *cached_copy;
+ return (0);
+ }
+
+ /*
+ * If the property wasn't cached, look up the file system's value for
+ * the property. For the version property, we look up a slightly
+ * different string.
+ */
+ const char *pname;
+ int error = ENOENT;
+ if (prop == ZFS_PROP_VERSION) {
pname = ZPL_VERSION_STR;
- else
+ } else {
pname = zfs_prop_to_name(prop);
+ }
if (os != NULL) {
ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
@@ -2677,6 +2708,15 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
}
error = 0;
}
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
return (error);
}
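The rewritten zfs_get_zplprop() is a read-through cache: a sentinel (OBJSET_PROP_UNINITIALIZED) marks an empty slot, a hit returns immediately, and a successful slow-path lookup back-fills the slot. The shape of the pattern, reduced to a sketch with invented names:

#define PROP_UNINITIALIZED	((uint64_t)-1)

static int
get_prop_cached(uint64_t *slot, int (*lookup)(uint64_t *), uint64_t *value)
{
	/* Fast path: a previous call already filled the slot. */
	if (*slot != PROP_UNINITIALIZED) {
		*value = *slot;
		return (0);
	}

	/* Slow path: do the real lookup, then back-fill the cache. */
	int error = lookup(value);
	if (error == 0)
		*slot = *value;
	return (error);
}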
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
index 58c3807f6ae4..ca34a69a6553 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
@@ -1691,7 +1691,8 @@ zfs_trunc(znode_t *zp, uint64_t end)
return (0);
}
- error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
+ DMU_OBJECT_END);
if (error) {
zfs_range_unlock(rl);
return (error);
@@ -2102,6 +2103,17 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
*path = '\0';
sa_hdl = hdl;
+ uint64_t deleteq_obj;
+ VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
+ ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+ error = zap_lookup_int(osp, deleteq_obj, obj);
+ if (error == 0) {
+ return (ESTALE);
+ } else if (error != ENOENT) {
+ return (error);
+ }
+ error = 0;
+
for (;;) {
uint64_t pobj;
char component[MAXNAMELEN + 2];
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
index 9571998347f2..8cc65d6f31f8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@@ -131,17 +131,11 @@ zil_bp_compare(const void *x1, const void *x2)
const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
- if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2))
- return (-1);
- if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2))
- return (1);
-
- if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2))
- return (-1);
- if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2))
- return (1);
+ int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
+ if (likely(cmp))
+ return (cmp);
- return (0);
+ return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
}
static void
@@ -503,12 +497,7 @@ zil_lwb_vdev_compare(const void *x1, const void *x2)
const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
- if (v1 < v2)
- return (-1);
- if (v1 > v2)
- return (1);
-
- return (0);
+ return (AVL_CMP(v1, v2));
}
static lwb_t *
@@ -665,7 +654,8 @@ zil_create(zilog_t *zilog)
BP_ZERO(&blk);
}
- error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
+ error = zio_alloc_zil(zilog->zl_spa,
+ zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL,
ZIL_MIN_BLKSZ, &slog);
if (error == 0)
@@ -1342,7 +1332,8 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
BP_ZERO(bp);
/* pass the old blkptr in order to spread log blocks across devs */
- error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
+ error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object,
+ txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
if (error == 0) {
ASSERT3U(bp->blk_birth, ==, txg);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
@@ -1624,12 +1615,7 @@ zil_aitx_compare(const void *x1, const void *x2)
const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
- if (o1 < o2)
- return (-1);
- if (o1 > o2)
- return (1);
-
- return (0);
+ return (AVL_CMP(o1, o2));
}
/*
@@ -2297,7 +2283,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
*/
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
- ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
+ IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED);
/*
* Since the lwb's zio hadn't been issued by the time this thread
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
index 3eb8747619fa..53d0f4d27b08 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@@ -44,6 +44,7 @@
#include <sys/dsl_scan.h>
#include <sys/metaslab_impl.h>
#include <sys/abd.h>
+#include <sys/cityhash.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
@@ -99,9 +100,6 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
extern vmem_t *zio_alloc_arena;
#endif
-#define ZIO_PIPELINE_CONTINUE 0x100
-#define ZIO_PIPELINE_STOP 0x101
-
#define BP_SPANB(indblkshift, level) \
(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
#define COMPARE_META_LEVEL 0x80000000ul
@@ -538,7 +536,8 @@ zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
}
static void
-zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
+zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
+ zio_t **next_to_executep)
{
uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
int *errorp = &pio->io_child_error[zio->io_child_type];
@@ -557,13 +556,33 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
ZIO_TASKQ_INTERRUPT;
pio->io_stall = NULL;
mutex_exit(&pio->io_lock);
+
/*
- * Dispatch the parent zio in its own taskq so that
- * the child can continue to make progress. This also
- * prevents overflowing the stack when we have deeply nested
- * parent-child relationships.
+ * If we can tell the caller to execute this parent next, do
+ * so. Otherwise dispatch the parent zio as its own task.
+ *
+ * Having the caller execute the parent when possible reduces
+ * locking on the zio taskqs, reduces context switch
+ * overhead, and has no recursion penalty. Note that one
+ * read from disk typically causes at least 3 zio's: a
+ * zio_null(), the logical zio_read(), and then a physical
+ * zio. When the physical ZIO completes, we are able to call
+ * zio_done() on all 3 of these zio's from one invocation of
+ * zio_execute() by returning the parent back to
+ * zio_execute(). Since the parent isn't executed until this
+ * thread returns back to zio_execute(), the caller should do
+ * so promptly.
+ *
+ * In other cases, dispatching the parent prevents
+ * overflowing the stack when we have deeply nested
+ * parent-child relationships, as we do with the "mega zio"
+ * of writes for spa_sync(), and the chain of ZIL blocks.
*/
- zio_taskq_dispatch(pio, type, B_FALSE);
+ if (next_to_executep != NULL && *next_to_executep == NULL) {
+ *next_to_executep = pio;
+ } else {
+ zio_taskq_dispatch(pio, type, B_FALSE);
+ }
} else {
mutex_exit(&pio->io_lock);
}
@@ -1149,17 +1168,6 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
done != NULL);
- /*
- * In the common case, where the parent zio was to a normal vdev,
- * the child zio must be to a child vdev of that vdev. Otherwise,
- * the child zio must be to a top-level vdev.
- */
- if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) {
- ASSERT3P(vd->vdev_parent, ==, pio->io_vd);
- } else {
- ASSERT3P(vd, ==, vd->vdev_top);
- }
-
if (type == ZIO_TYPE_READ && bp != NULL) {
/*
* If we have the bp, then the child should perform the
@@ -1223,7 +1231,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
- int type, zio_priority_t priority, enum zio_flag flags,
+ zio_type_t type, zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private)
{
zio_t *zio;
@@ -1285,7 +1293,7 @@ zio_shrink(zio_t *zio, uint64_t size)
* ==========================================================================
*/
-static int
+static zio_t *
zio_read_bp_init(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
@@ -1322,14 +1330,14 @@ zio_read_bp_init(zio_t *zio)
if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
-static int
+static zio_t *
zio_write_bp_init(zio_t *zio)
{
if (!IO_IS_ALLOCATING(zio))
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
@@ -1344,7 +1352,7 @@ zio_write_bp_init(zio_t *zio)
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
if (BP_IS_EMBEDDED(bp))
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
/*
* If we've been overridden and nopwrite is set then
@@ -1355,13 +1363,13 @@ zio_write_bp_init(zio_t *zio)
ASSERT(!zp->zp_dedup);
ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
zio->io_flags |= ZIO_FLAG_NOPWRITE;
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
ASSERT(!zp->zp_nopwrite);
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
@@ -1369,7 +1377,7 @@ zio_write_bp_init(zio_t *zio)
if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
BP_SET_DEDUP(bp, 1);
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
/*
@@ -1381,10 +1389,10 @@ zio_write_bp_init(zio_t *zio)
zio->io_pipeline = zio->io_orig_pipeline;
}
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
-static int
+static zio_t *
zio_write_compress(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -1403,11 +1411,11 @@ zio_write_compress(zio_t *zio)
*/
if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
if (!IO_IS_ALLOCATING(zio))
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
if (zio->io_children_ready != NULL) {
/*
@@ -1466,7 +1474,7 @@ zio_write_compress(zio_t *zio)
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
ASSERT(spa_feature_is_active(spa,
SPA_FEATURE_EMBEDDED_DATA));
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
} else {
/*
* Round up compressed size up to the ashift
@@ -1554,10 +1562,10 @@ zio_write_compress(zio_t *zio)
zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
}
}
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
-static int
+static zio_t *
zio_free_bp_init(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
@@ -1569,7 +1577,7 @@ zio_free_bp_init(zio_t *zio)
ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
/*
@@ -1643,12 +1651,12 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
return (B_FALSE);
}
-static int
+static zio_t *
zio_issue_async(zio_t *zio)
{
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
void
@@ -1730,14 +1738,13 @@ static zio_pipe_stage_t *zio_pipeline[];
void
zio_execute(zio_t *zio)
{
- zio->io_executor = curthread;
-
ASSERT3U(zio->io_queued_timestamp, >, 0);
while (zio->io_stage < ZIO_STAGE_DONE) {
enum zio_stage pipeline = zio->io_pipeline;
enum zio_stage stage = zio->io_stage;
- int rv;
+
+ zio->io_executor = curthread;
ASSERT(!MUTEX_HELD(&zio->io_lock));
ASSERT(ISP2(stage));
@@ -1768,12 +1775,16 @@ zio_execute(zio_t *zio)
zio->io_stage = stage;
zio->io_pipeline_trace |= zio->io_stage;
- rv = zio_pipeline[highbit64(stage) - 1](zio);
- if (rv == ZIO_PIPELINE_STOP)
- return;
+ /*
+ * The zio pipeline stage returns the next zio to execute
+ * (typically the same as this one), or NULL if we should
+ * stop.
+ */
+ zio = zio_pipeline[highbit64(stage) - 1](zio);
- ASSERT(rv == ZIO_PIPELINE_CONTINUE);
+ if (zio == NULL)
+ return;
}
}
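With this change every stage has the signature zio_t *stage(zio_t *): returning the zio continues the pipeline, returning NULL stops it, and zio_done() can hand back a newly runnable parent so it executes in the same loop rather than on a taskq. A condensed sketch of the resulting control flow (the stage table and stage-bit bookkeeping are elided):

typedef struct zio zio_t;
typedef zio_t *(*zio_pipe_stage_t)(zio_t *);

static void
execute(zio_t *zio, zio_pipe_stage_t (*next_stage)(zio_t *))
{
	while (zio != NULL) {
		zio_pipe_stage_t stage = next_stage(zio);
		if (stage == NULL)
			break;	/* reached the DONE stage */

		/*
		 * The stage returns the zio to run next: usually the
		 * same one, NULL to stop (e.g. dispatched elsewhere),
		 * or a parent that this completion made runnable.
		 */
		zio = stage(zio);
	}
}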
@@ -2236,7 +2247,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
zio_nowait(zio);
}
-static int
+static zio_t *
zio_gang_assemble(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
@@ -2248,16 +2259,16 @@ zio_gang_assemble(zio_t *zio)
zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
-static int
+static zio_t *
zio_gang_issue(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
@@ -2271,7 +2282,7 @@ zio_gang_issue(zio_t *zio)
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
static void
@@ -2310,7 +2321,7 @@ zio_write_gang_done(zio_t *zio)
abd_put(zio->io_abd);
}
-static int
+static zio_t *
zio_write_gang_block(zio_t *pio)
{
spa_t *spa = pio->io_spa;
@@ -2335,7 +2346,8 @@ zio_write_gang_block(zio_t *pio)
ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
flags |= METASLAB_ASYNC_ALLOC;
- VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
+ VERIFY(refcount_held(&mc->mc_alloc_slots[pio->io_allocator],
+ pio));
/*
* The logical zio has already placed a reservation for
@@ -2346,12 +2358,12 @@ zio_write_gang_block(zio_t *pio)
* additional reservations for gang blocks.
*/
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
- pio, flags));
+ pio->io_allocator, pio, flags));
}
error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
- &pio->io_alloc_list, pio);
+ &pio->io_alloc_list, pio, pio->io_allocator);
if (error) {
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
@@ -2365,10 +2377,10 @@ zio_write_gang_block(zio_t *pio)
* stage.
*/
metaslab_class_throttle_unreserve(mc,
- gbh_copies - copies, pio);
+ gbh_copies - copies, pio->io_allocator, pio);
}
pio->io_error = error;
- return (ZIO_PIPELINE_CONTINUE);
+ return (pio);
}
if (pio == gio) {
@@ -2423,7 +2435,7 @@ zio_write_gang_block(zio_t *pio)
* slot for them here.
*/
VERIFY(metaslab_class_throttle_reserve(mc,
- zp.zp_copies, cio, flags));
+ zp.zp_copies, cio->io_allocator, cio, flags));
}
zio_nowait(cio);
}
@@ -2435,7 +2447,7 @@ zio_write_gang_block(zio_t *pio)
zio_nowait(zio);
- return (ZIO_PIPELINE_CONTINUE);
+ return (pio);
}
/*
@@ -2456,7 +2468,7 @@ zio_write_gang_block(zio_t *pio)
* used for nopwrite, assuming that the salt and the checksums
* themselves remain secret.
*/
-static int
+static zio_t *
zio_nop_write(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
@@ -2483,7 +2495,7 @@ zio_nop_write(zio_t *zio)
BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
zp->zp_copies != BP_GET_NDVAS(bp_orig))
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
/*
* If the checksums match then reset the pipeline so that we
@@ -2503,7 +2515,7 @@ zio_nop_write(zio_t *zio)
zio->io_flags |= ZIO_FLAG_NOPWRITE;
}
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
/*
@@ -2531,7 +2543,7 @@ zio_ddt_child_read_done(zio_t *zio)
mutex_exit(&pio->io_lock);
}
-static int
+static zio_t *
zio_ddt_read_start(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
@@ -2551,7 +2563,7 @@ zio_ddt_read_start(zio_t *zio)
zio->io_vsd = dde;
if (ddp_self == NULL)
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
@@ -2564,23 +2576,23 @@ zio_ddt_read_start(zio_t *zio)
zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
}
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
zio_nowait(zio_read(zio, zio->io_spa, bp,
zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
-static int
+static zio_t *
zio_ddt_read_done(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
ASSERT(BP_GET_DEDUP(bp));
@@ -2592,12 +2604,12 @@ zio_ddt_read_done(zio_t *zio)
ddt_entry_t *dde = zio->io_vsd;
if (ddt == NULL) {
ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
if (dde == NULL) {
zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
if (dde->dde_repair_abd != NULL) {
abd_copy(zio->io_abd, dde->dde_repair_abd,
@@ -2610,7 +2622,7 @@ zio_ddt_read_done(zio_t *zio)
ASSERT(zio->io_vsd == NULL);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
static boolean_t
@@ -2768,7 +2780,7 @@ zio_ddt_ditto_write_done(zio_t *zio)
ddt_exit(ddt);
}
-static int
+static zio_t *
zio_ddt_write(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -2812,7 +2824,7 @@ zio_ddt_write(zio_t *zio)
ASSERT(!BP_GET_DEDUP(bp));
zio->io_pipeline = ZIO_WRITE_PIPELINE;
ddt_exit(ddt);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
@@ -2838,7 +2850,7 @@ zio_ddt_write(zio_t *zio)
zio->io_bp_override = NULL;
BP_ZERO(bp);
ddt_exit(ddt);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
@@ -2880,12 +2892,12 @@ zio_ddt_write(zio_t *zio)
if (dio)
zio_nowait(dio);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
ddt_entry_t *freedde; /* for debugging */
-static int
+static zio_t *
zio_ddt_free(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -2903,7 +2915,7 @@ zio_ddt_free(zio_t *zio)
ddt_phys_decref(ddp);
ddt_exit(ddt);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
/*
@@ -2913,13 +2925,13 @@ zio_ddt_free(zio_t *zio)
*/
static zio_t *
-zio_io_to_allocate(spa_t *spa)
+zio_io_to_allocate(spa_t *spa, int allocator)
{
zio_t *zio;
- ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
+ ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
- zio = avl_first(&spa->spa_alloc_tree);
+ zio = avl_first(&spa->spa_alloc_trees[allocator]);
if (zio == NULL)
return (NULL);
@@ -2929,18 +2941,19 @@ zio_io_to_allocate(spa_t *spa)
* Try to place a reservation for this zio. If we're unable to
* reserve then we throttle.
*/
+ ASSERT3U(zio->io_allocator, ==, allocator);
if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
- zio->io_prop.zp_copies, zio, 0)) {
+ zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
return (NULL);
}
- avl_remove(&spa->spa_alloc_tree, zio);
+ avl_remove(&spa->spa_alloc_trees[allocator], zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
return (zio);
}
-static int
+static zio_t *
zio_dva_throttle(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -2950,7 +2963,7 @@ zio_dva_throttle(zio_t *zio)
!spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
zio->io_child_type == ZIO_CHILD_GANG ||
zio->io_flags & ZIO_FLAG_NODATA) {
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
@@ -2958,40 +2971,35 @@ zio_dva_throttle(zio_t *zio)
ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
- mutex_enter(&spa->spa_alloc_lock);
+ zbookmark_phys_t *bm = &zio->io_bookmark;
+ /*
+ * We want to try to use as many allocators as possible to help improve
+ * performance, but we also want logically adjacent IOs to be physically
+ * adjacent to improve sequential read performance. We chunk each object
+ * into 2^20 block regions, and then hash based on the objset, object,
+ * level, and region to accomplish both of these goals.
+ */
+ zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
+ bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
+ mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
- avl_add(&spa->spa_alloc_tree, zio);
+ avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
- nio = zio_io_to_allocate(zio->io_spa);
- mutex_exit(&spa->spa_alloc_lock);
+ nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator);
+ mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
- if (nio == zio)
- return (ZIO_PIPELINE_CONTINUE);
-
- if (nio != NULL) {
- ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
- /*
- * We are passing control to a new zio so make sure that
- * it is processed by a different thread. We do this to
- * avoid stack overflows that can occur when parents are
- * throttled and children are making progress. We allow
- * it to go to the head of the taskq since it's already
- * been waiting.
- */
- zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
- }
- return (ZIO_PIPELINE_STOP);
+ return (nio);
}
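The allocator is chosen by hashing (objset, object, level, blkid >> 20), so every block in a 2^20-block region of an object maps to the same allocator while unrelated streams spread across all of them. A self-contained sketch of the same selection rule; mix4() is an illustrative stand-in for the real cityhash4() from <sys/cityhash.h>:

#include <stdint.h>

/* Illustrative 4-word mixer standing in for cityhash4(). */
static uint64_t
mix4(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
	uint64_t h = a * 0x9e3779b97f4a7c15ULL;
	h ^= b; h *= 0xbf58476d1ce4e5b9ULL;
	h ^= c; h *= 0x94d049bb133111ebULL;
	h ^= d;
	return (h ^ (h >> 31));
}

static int
pick_allocator(uint64_t objset, uint64_t object, uint64_t level,
    uint64_t blkid, int alloc_count)
{
	/* Blocks in the same 2^20-block region share an allocator. */
	return ((int)(mix4(objset, object, level, blkid >> 20) %
	    (uint64_t)alloc_count));
}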
void
-zio_allocate_dispatch(spa_t *spa)
+zio_allocate_dispatch(spa_t *spa, int allocator)
{
zio_t *zio;
- mutex_enter(&spa->spa_alloc_lock);
- zio = zio_io_to_allocate(spa);
- mutex_exit(&spa->spa_alloc_lock);
+ mutex_enter(&spa->spa_alloc_locks[allocator]);
+ zio = zio_io_to_allocate(spa, allocator);
+ mutex_exit(&spa->spa_alloc_locks[allocator]);
if (zio == NULL)
return;
@@ -3000,7 +3008,7 @@ zio_allocate_dispatch(spa_t *spa)
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
}
-static int
+static zio_t *
zio_dva_allocate(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -3032,10 +3040,10 @@ zio_dva_allocate(zio_t *zio)
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
- &zio->io_alloc_list, zio);
+ &zio->io_alloc_list, zio, zio->io_allocator);
if (error != 0) {
- spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
+ zfs_dbgmsg("%s: metaslab allocation failure: zio %p, "
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
error);
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
@@ -3043,18 +3051,18 @@ zio_dva_allocate(zio_t *zio)
zio->io_error = error;
}
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
-static int
+static zio_t *
zio_dva_free(zio_t *zio)
{
metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
-static int
+static zio_t *
zio_dva_claim(zio_t *zio)
{
int error;
@@ -3063,7 +3071,7 @@ zio_dva_claim(zio_t *zio)
if (error)
zio->io_error = error;
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
/*
@@ -3092,8 +3100,8 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
* Try to allocate an intent log block. Return 0 on success, errno on failure.
*/
int
-zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
- uint64_t size, boolean_t *slog)
+zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp,
+ blkptr_t *old_bp, uint64_t size, boolean_t *slog)
{
int error = 1;
zio_alloc_list_t io_alloc_list;
@@ -3101,14 +3109,22 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
ASSERT(txg > spa_syncing_txg(spa));
metaslab_trace_init(&io_alloc_list);
+ /*
+ * When allocating a zil block, we don't have information about
+ * the final destination of the block except the objset it's part
+ * of, so we just hash the objset ID to pick the allocator to get
+ * some parallelism.
+ */
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
- txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL);
+ txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL,
+ cityhash4(0, 0, 0, objset) % spa->spa_alloc_count);
if (error == 0) {
*slog = TRUE;
} else {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
- &io_alloc_list, NULL);
+ &io_alloc_list, NULL, cityhash4(0, 0, 0, objset) %
+ spa->spa_alloc_count);
if (error == 0)
*slog = FALSE;
}
@@ -3150,7 +3166,7 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
* force the underlying vdev layers to call either zio_execute() or
* zio_interrupt() to ensure that the pipeline continues with the correct I/O.
*/
-static int
+static zio_t *
zio_vdev_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -3169,13 +3185,13 @@ zio_vdev_io_start(zio_t *zio)
* The mirror_ops handle multiple DVAs in a single BP.
*/
vdev_mirror_ops.vdev_op_io_start(zio);
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
zio->io_priority == ZIO_PRIORITY_NOW) {
trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
ASSERT3P(zio->io_logical, !=, zio);
@@ -3183,9 +3199,13 @@ zio_vdev_io_start(zio_t *zio)
ASSERT(spa->spa_trust_config);
if (zio->io_vd->vdev_removing) {
+ /*
+ * Note: the code can handle other kinds of writes,
+ * but we don't expect them.
+ */
ASSERT(zio->io_flags &
(ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
- ZIO_FLAG_INDUCE_DAMAGE));
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
}
}
@@ -3251,39 +3271,58 @@ zio_vdev_io_start(zio_t *zio)
* If this is a repair I/O, and there's no self-healing involved --
* that is, we're just resilvering what we expect to resilver --
* then don't do the I/O unless zio's txg is actually in vd's DTL.
- * This prevents spurious resilvering with nested replication.
- * For example, given a mirror of mirrors, (A+B)+(C+D), if only
- * A is out of date, we'll read from C+D, then use the data to
- * resilver A+B -- but we don't actually want to resilver B, just A.
- * The top-level mirror has no way to know this, so instead we just
- * discard unnecessary repairs as we work our way down the vdev tree.
- * The same logic applies to any form of nested replication:
- * ditto + mirror, RAID-Z + replacing, etc. This covers them all.
+ * This prevents spurious resilvering.
+ *
+ * There are a few ways that we can end up creating these spurious
+ * resilver i/os:
+ *
+ * 1. A resilver i/o will be issued if any DVA in the BP has a
+ * dirty DTL. The mirror code will issue resilver writes to
+ * each DVA, including the one(s) that are not on vdevs with dirty
+ * DTLs.
+ *
+ * 2. With nested replication, which happens when we have a
+ * "replacing" or "spare" vdev that's a child of a mirror or raidz.
+ * For example, given mirror(replacing(A+B), C), it's likely that
+ * only A is out of date (it's the new device). In this case, we'll
+ * read from C, then use the data to resilver A+B -- but we don't
+ * actually want to resilver B, just A. The top-level mirror has no
+ * way to know this, so instead we just discard unnecessary repairs
+ * as we work our way down the vdev tree.
+ *
+ * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
+ * The same logic applies to any form of nested replication: ditto
+ * + mirror, RAID-Z + replacing, etc.
+ *
+ * However, indirect vdevs point off to other vdevs which may have
+ * DTLs, so we never bypass them. The child i/os on concrete vdevs
+ * will be properly bypassed instead.
*/
if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
zio->io_txg != 0 && /* not a delegated i/o */
+ vd->vdev_ops != &vdev_indirect_ops &&
!vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
zio_vdev_io_bypass(zio);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
if (vd->vdev_ops->vdev_op_leaf) {
switch (zio->io_type) {
case ZIO_TYPE_READ:
if (vdev_cache_read(zio))
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
/* FALLTHROUGH */
case ZIO_TYPE_WRITE:
case ZIO_TYPE_FREE:
if ((zio = vdev_queue_io(zio)) == NULL)
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
if (!vdev_accessible(vd, zio)) {
zio->io_error = SET_ERROR(ENXIO);
zio_interrupt(zio);
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
break;
}
@@ -3295,14 +3334,14 @@ zio_vdev_io_start(zio_t *zio)
if (zio->io_type == ZIO_TYPE_WRITE &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
!trim_map_write_start(zio))
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
vd->vdev_ops->vdev_op_io_start(zio);
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
-static int
+static zio_t *
zio_vdev_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -3310,7 +3349,7 @@ zio_vdev_io_done(zio_t *zio)
boolean_t unexpected_error = B_FALSE;
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
ASSERT(zio->io_type == ZIO_TYPE_READ ||
@@ -3353,7 +3392,7 @@ zio_vdev_io_done(zio_t *zio)
if (unexpected_error)
VERIFY(vdev_probe(vd, zio) == NULL);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
/*
@@ -3411,13 +3450,13 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
zcr->zcr_free = zio_buf_free;
}
-static int
+static zio_t *
zio_vdev_io_assess(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
@@ -3463,7 +3502,7 @@ zio_vdev_io_assess(zio_t *zio)
zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
zio_requeue_io_start_cut_in_line);
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
/*
@@ -3503,7 +3542,7 @@ zio_vdev_io_assess(zio_t *zio)
zio->io_physdone(zio->io_logical);
}
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
void
@@ -3538,7 +3577,7 @@ zio_vdev_io_bypass(zio_t *zio)
* Generate and verify checksums
* ==========================================================================
*/
-static int
+static zio_t *
zio_checksum_generate(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
@@ -3552,7 +3591,7 @@ zio_checksum_generate(zio_t *zio)
checksum = zio->io_prop.zp_checksum;
if (checksum == ZIO_CHECKSUM_OFF)
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
ASSERT(checksum == ZIO_CHECKSUM_LABEL);
} else {
@@ -3566,10 +3605,10 @@ zio_checksum_generate(zio_t *zio)
zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
-static int
+static zio_t *
zio_checksum_verify(zio_t *zio)
{
zio_bad_cksum_t info;
@@ -3584,7 +3623,7 @@ zio_checksum_verify(zio_t *zio)
* We're either verifying a label checksum, or nothing at all.
*/
if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
}
@@ -3599,7 +3638,7 @@ zio_checksum_verify(zio_t *zio)
}
}
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
/*
@@ -3642,7 +3681,7 @@ zio_worst_error(int e1, int e2)
* I/O completion
* ==========================================================================
*/
-static int
+static zio_t *
zio_ready(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
@@ -3651,7 +3690,7 @@ zio_ready(zio_t *zio)
if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
ZIO_WAIT_READY)) {
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
if (zio->io_ready) {
@@ -3678,8 +3717,8 @@ zio_ready(zio_t *zio)
*/
metaslab_class_throttle_unreserve(
spa_normal_class(zio->io_spa),
- zio->io_prop.zp_copies, zio);
- zio_allocate_dispatch(zio->io_spa);
+ zio->io_prop.zp_copies, zio->io_allocator, zio);
+ zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
}
}
@@ -3697,7 +3736,7 @@ zio_ready(zio_t *zio)
*/
for (; pio != NULL; pio = pio_next) {
pio_next = zio_walk_parents(zio, &zl);
- zio_notify_parent(pio, zio, ZIO_WAIT_READY);
+ zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL);
}
if (zio->io_flags & ZIO_FLAG_NODATA) {
@@ -3713,7 +3752,7 @@ zio_ready(zio_t *zio)
zio->io_spa->spa_syncing_txg == zio->io_txg)
zio_handle_ignored_writes(zio);
- return (ZIO_PIPELINE_CONTINUE);
+ return (zio);
}
/*
@@ -3762,21 +3801,22 @@ zio_dva_throttle_done(zio_t *zio)
ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
mutex_enter(&pio->io_lock);
- metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
+ metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
+ pio->io_allocator, B_TRUE);
mutex_exit(&pio->io_lock);
metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
- 1, pio);
+ 1, pio->io_allocator, pio);
/*
* Call into the pipeline to see if there is more work that
* needs to be done. If there is work to be done it will be
* dispatched to another taskq thread.
*/
- zio_allocate_dispatch(zio->io_spa);
+ zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
}
-static int
+static zio_t *
zio_done(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -3793,7 +3833,7 @@ zio_done(zio_t *zio)
* wait for them and then repeat this pipeline stage.
*/
if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
/*
@@ -3816,8 +3856,10 @@ zio_done(zio_t *zio)
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
ASSERT(bp != NULL);
- metaslab_group_alloc_verify(spa, zio->io_bp, zio);
- VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio));
+ metaslab_group_alloc_verify(spa, zio->io_bp, zio,
+ zio->io_allocator);
+ VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator],
+ zio));
}
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
@@ -4005,7 +4047,12 @@ zio_done(zio_t *zio)
if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
zio_remove_child(pio, zio, remove_zl);
- zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
+ /*
+ * This is a rare code path, so we don't
+ * bother with "next_to_execute".
+ */
+ zio_notify_parent(pio, zio, ZIO_WAIT_DONE,
+ NULL);
}
}
@@ -4017,7 +4064,11 @@ zio_done(zio_t *zio)
*/
ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
- zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
+ /*
+ * This is a rare code path, so we don't bother with
+ * "next_to_execute".
+ */
+ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
/*
* We'd fail again if we reexecuted now, so suspend
@@ -4038,7 +4089,7 @@ zio_done(zio_t *zio)
ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
0, &zio->io_tqent);
}
- return (ZIO_PIPELINE_STOP);
+ return (NULL);
}
ASSERT(zio->io_child_count == 0);
@@ -4068,12 +4119,17 @@ zio_done(zio_t *zio)
zio->io_state[ZIO_WAIT_DONE] = 1;
mutex_exit(&zio->io_lock);
+ /*
+ * We are done executing this zio. We may want to execute a parent
+ * next. See the comment in zio_notify_parent().
+ */
+ zio_t *next_to_execute = NULL;
zl = NULL;
for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
zio_link_t *remove_zl = zl;
pio_next = zio_walk_parents(zio, &zl);
zio_remove_child(pio, zio, remove_zl);
- zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
+ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute);
}
if (zio->io_waiter != NULL) {
@@ -4085,7 +4141,7 @@ zio_done(zio_t *zio)
zio_destroy(zio);
}
- return (ZIO_PIPELINE_STOP);
+ return (next_to_execute);
}
/*
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
index 7e05b9212db0..b87303889ddb 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
@@ -25,7 +25,7 @@
*/
/*
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -56,6 +56,12 @@ static zcomp_stats_t zcomp_stats = {
kstat_t *zcomp_ksp;
/*
+ * If nonzero, roughly one in every X decompression attempts will fail,
+ * simulating an undetected memory error.
+ */
+uint64_t zio_decompress_fail_fraction = 0;
+
+/*
* Compression vectors.
*/
zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
@@ -172,6 +178,16 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len);
abd_return_buf(src, tmp, s_len);
+ /*
+ * Decompression shouldn't fail, because we've already verified
+ * the checksum. However, for extra protection (e.g. against bitflips
+ * in non-ECC RAM), we handle this error (and test it).
+ */
+ ASSERT0(ret);
+ if (zio_decompress_fail_fraction != 0 &&
+ spa_get_random(zio_decompress_fail_fraction) == 0)
+ ret = SET_ERROR(EINVAL);
+
return (ret);
}
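zio_decompress_fail_fraction is a fault-injection knob: when nonzero, the error path that the ASSERT says should be unreachable gets exercised anyway. A userland sketch of the same probabilistic injection; arc4random_uniform() stands in for the kernel's spa_get_random():

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

/* Tunable: if nonzero, roughly one in every N calls is failed. */
static uint64_t fail_fraction = 0;

static int
maybe_inject_fault(int real_result)
{
	if (fail_fraction != 0 &&
	    arc4random_uniform((uint32_t)fail_fraction) == 0)
		return (EINVAL);
	return (real_result);
}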
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
index 805c63d09a01..3c7f669a351a 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
@@ -174,7 +174,7 @@ typedef struct zvol_state {
zilog_t *zv_zilog; /* ZIL handle */
list_t zv_extents; /* List of extents for dump */
znode_t zv_znode; /* for range locking */
- dmu_buf_t *zv_dbuf; /* bonus handle */
+ dnode_t *zv_dn; /* dnode hold */
#ifndef illumos
int zv_state;
int zv_volmode; /* Provide GEOM or cdev */
@@ -868,7 +868,7 @@ zvol_first_open(zvol_state_t *zv)
}
zv->zv_volblocksize = doi.doi_data_block_size;
- error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
+ error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn);
if (error) {
dmu_objset_disown(os, zvol_tag);
return (error);
@@ -893,8 +893,8 @@ zvol_last_close(zvol_state_t *zv)
zil_close(zv->zv_zilog);
zv->zv_zilog = NULL;
- dmu_buf_rele(zv->zv_dbuf, zvol_tag);
- zv->zv_dbuf = NULL;
+ dnode_rele(zv->zv_dn, zvol_tag);
+ zv->zv_dn = NULL;
/*
* Evict cached data
@@ -1342,8 +1342,6 @@ static int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
zvol_state_t *zv = arg;
- objset_t *os = zv->zv_objset;
- uint64_t object = ZVOL_OBJ;
uint64_t offset = lr->lr_offset;
uint64_t size = lr->lr_length; /* length of user data */
dmu_buf_t *db;
@@ -1367,7 +1365,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
if (buf != NULL) { /* immediate write */
zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
RL_READER);
- error = dmu_read(os, object, offset, size, buf,
+ error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
DMU_READ_NO_PREFETCH);
} else { /* indirect write */
/*
@@ -1380,7 +1378,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
offset = P2ALIGN(offset, size);
zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
RL_READER);
- error = dmu_buf_hold(os, object, offset, zgd, &db,
+ error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
@@ -1451,8 +1449,8 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
(wr_state == WR_COPIED ? len : 0));
lr = (lr_write_t *)&itx->itx_lr;
- if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
- ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
+ if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
+ off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
zil_itx_destroy(itx);
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
@@ -1874,7 +1872,7 @@ zvol_read(struct cdev *dev, struct uio *uio, int ioflag)
if (bytes > volsize - uio->uio_loffset)
bytes = volsize - uio->uio_loffset;
- error = dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes);
+ error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes);
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
@@ -1946,7 +1944,7 @@ zvol_write(struct cdev *dev, struct uio *uio, int ioflag)
dmu_tx_abort(tx);
break;
}
- error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
+ error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
if (error == 0)
zvol_log_write(zv, tx, off, bytes, sync);
dmu_tx_commit(tx);
@@ -2028,7 +2026,7 @@ zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
int
zvol_get_volume_params(minor_t minor, uint64_t *blksize,
uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
- void **rl_hdl, void **bonus_hdl)
+ void **rl_hdl, void **dnode_hdl)
{
zvol_state_t *zv;
@@ -2039,7 +2037,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize,
return (SET_ERROR(ENXIO));
ASSERT(blksize && max_xfer_len && minor_hdl &&
- objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
+ objset_hdl && zil_hdl && rl_hdl && dnode_hdl);
*blksize = zv->zv_volblocksize;
*max_xfer_len = (uint64_t)zvol_maxphys;
@@ -2047,7 +2045,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize,
*objset_hdl = zv->zv_objset;
*zil_hdl = zv->zv_zilog;
*rl_hdl = &zv->zv_znode;
- *bonus_hdl = zv->zv_dbuf;
+ *dnode_hdl = zv->zv_dn;
return (0);
}
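The zvol now keeps a long-lived dnode hold instead of a bonus-buffer hold, which lets the data path use the *_by_dnode DMU entry points and skip a per-I/O object lookup. The hold/release discipline, condensed from the hunks above:

/* First open: take the hold once and cache it in the state. */
error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn);
if (error != 0)
	return (error);

/* Data path: operate directly on the held dnode. */
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
    DMU_READ_NO_PREFETCH);

/* Last close: release exactly once and clear the pointer. */
dnode_rele(zv->zv_dn, zvol_tag);
zv->zv_dn = NULL;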
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
index 10e0ddaeef88..fea46c90481d 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
@@ -107,6 +107,14 @@ extern "C" {
/*
+ * AVL comparator helpers
+ */
+#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0))
+#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b)))
+#define AVL_PCMP(a, b) \
+ (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
+
+/*
* Type used for the root of the AVL tree.
*/
typedef struct avl_tree avl_tree_t;
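AVL_CMP collapses the usual three-branch comparison into branch-free boolean arithmetic: each relational operator yields 0 or 1, so the subtraction is exactly -1, 0, or 1. A quick standalone check:

#include <assert.h>

#define AVL_ISIGN(a)	(((a) > 0) - ((a) < 0))
#define AVL_CMP(a, b)	(((a) > (b)) - ((a) < (b)))

int
main(void)
{
	assert(AVL_CMP(1, 2) == -1);	/* a < b  */
	assert(AVL_CMP(2, 2) == 0);	/* a == b */
	assert(AVL_CMP(3, 2) == 1);	/* a > b  */

	/* AVL_ISIGN folds a strcmp()-style result to -1/0/1. */
	assert(AVL_ISIGN(-42) == -1);
	assert(AVL_ISIGN(0) == 0);
	assert(AVL_ISIGN(7) == 1);
	return (0);
}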
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
index c347b63a1a6f..3fcda9e8965e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
@@ -118,7 +118,7 @@ typedef enum {
ZFS_PROP_SNAPDIR,
ZFS_PROP_ACLMODE,
ZFS_PROP_ACLINHERIT,
- ZFS_PROP_CREATETXG, /* not exposed to the user */
+ ZFS_PROP_CREATETXG,
ZFS_PROP_NAME, /* not exposed to the user */
ZFS_PROP_CANMOUNT,
ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */
@@ -637,6 +637,13 @@ typedef struct zpool_load_policy {
#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \
"com.delphix:pool_checkpoint_sm"
+#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \
+ "com.delphix:next_offset_to_initialize"
+#define VDEV_LEAF_ZAP_INITIALIZE_STATE \
+ "com.delphix:vdev_initialize_state"
+#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \
+ "com.delphix:vdev_initialize_action_time"
+
/*
* This is needed in userland to report the minimum necessary device size.
*
@@ -735,6 +742,15 @@ typedef enum pool_scrub_cmd {
POOL_SCRUB_FLAGS_END
} pool_scrub_cmd_t;
+/*
+ * Initialize functions.
+ */
+typedef enum pool_initialize_func {
+ POOL_INITIALIZE_DO,
+ POOL_INITIALIZE_CANCEL,
+ POOL_INITIALIZE_SUSPEND,
+ POOL_INITIALIZE_FUNCS
+} pool_initialize_func_t;
/*
* ZIO types. Needed to interpret vdev statistics below.
@@ -814,6 +830,14 @@ typedef struct pool_checkpoint_stat {
uint64_t pcs_space; /* checkpointed space */
} pool_checkpoint_stat_t;
+typedef enum {
+ VDEV_INITIALIZE_NONE,
+ VDEV_INITIALIZE_ACTIVE,
+ VDEV_INITIALIZE_CANCELED,
+ VDEV_INITIALIZE_SUSPENDED,
+ VDEV_INITIALIZE_COMPLETE
+} vdev_initializing_state_t;
+
/*
* Vdev statistics. Note: all fields should be 64-bit because this
* is passed between kernel and userland as an nvlist uint64 array.
@@ -840,6 +864,11 @@ typedef struct vdev_stat {
uint64_t vs_physical_ashift; /* vdev_physical_ashift */
uint64_t vs_fragmentation; /* device fragmentation */
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
+ uint64_t vs_initialize_errors; /* initializing errors */
+ uint64_t vs_initialize_bytes_done; /* bytes initialized */
+ uint64_t vs_initialize_bytes_est; /* total bytes to initialize */
+ uint64_t vs_initialize_state; /* vdev_initializing_state_t */
+ uint64_t vs_initialize_action_time; /* time_t */
} vdev_stat_t;
#define VDEV_STAT_VALID(field, uint64_t_field_count) \
((uint64_t_field_count * sizeof(uint64_t)) >= \
@@ -974,6 +1003,7 @@ typedef enum zfs_ioc {
ZFS_IOC_REMAP,
ZFS_IOC_POOL_CHECKPOINT,
ZFS_IOC_POOL_DISCARD_CHECKPOINT,
+ ZFS_IOC_POOL_INITIALIZE,
ZFS_IOC_LAST
} zfs_ioc_t;
@@ -1037,6 +1067,12 @@ typedef enum {
#define ZPOOL_HIST_ERRNO "errno"
/*
+ * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE.
+ */
+#define ZPOOL_INITIALIZE_COMMAND "initialize_command"
+#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs"
+
+/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/
#define ZFS_ONLINE_CHECKREMOVE 0x1
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
index e4545a96ee76..52d6aea0a364 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_NVPAIR_H
@@ -39,6 +39,7 @@ extern "C" {
#endif
typedef enum {
+ DATA_TYPE_DONTCARE = -1,
DATA_TYPE_UNKNOWN = 0,
DATA_TYPE_BOOLEAN,
DATA_TYPE_BYTE,
diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h
index f12dbbfe6ef5..c9874b3e4db7 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h
@@ -24,11 +24,13 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
#ifndef _NVPAIR_IMPL_H
#define _NVPAIR_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -47,16 +49,27 @@ typedef struct i_nvp i_nvp_t;
struct i_nvp {
union {
- uint64_t _nvi_align; /* ensure alignment */
+ /* ensure alignment */
+ uint64_t _nvi_align;
+
struct {
- i_nvp_t *_nvi_next; /* pointer to next nvpair */
- i_nvp_t *_nvi_prev; /* pointer to prev nvpair */
+ /* pointer to next nvpair */
+ i_nvp_t *_nvi_next;
+
+ /* pointer to prev nvpair */
+ i_nvp_t *_nvi_prev;
+
+ /* next pair in table bucket */
+ i_nvp_t *_nvi_hashtable_next;
} _nvi;
} _nvi_un;
- nvpair_t nvi_nvp; /* nvpair */
+
+ /* nvpair */
+ nvpair_t nvi_nvp;
};
#define nvi_next _nvi_un._nvi._nvi_next
#define nvi_prev _nvi_un._nvi._nvi_prev
+#define nvi_hashtable_next _nvi_un._nvi._nvi_hashtable_next
typedef struct {
i_nvp_t *nvp_list; /* linked list of nvpairs */
@@ -64,6 +77,10 @@ typedef struct {
i_nvp_t *nvp_curr; /* current walker nvpair */
nv_alloc_t *nvp_nva; /* pluggable allocator */
uint32_t nvp_stat; /* internal state */
+
+ i_nvp_t **nvp_hashtable; /* table of entries used for lookup */
+ uint32_t nvp_nbuckets; /* # of buckets in hash table */
+ uint32_t nvp_nentries; /* # of entries in hash table */
} nvpriv_t;
#ifdef __cplusplus
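The new nvp_hashtable fields back an O(1) name lookup: entries are chained per bucket through the nvi_hashtable_next link added above. A hedged sketch of the lookup side (hash_lookup() and its hash argument are illustrative; the real hashing lives in the nvpair implementation):

#include <string.h>

static i_nvp_t *
hash_lookup(const nvpriv_t *priv, const char *name, uint32_t hash)
{
	uint32_t bucket = hash % priv->nvp_nbuckets;

	/* Walk the per-bucket chain via nvi_hashtable_next. */
	for (i_nvp_t *e = priv->nvp_hashtable[bucket]; e != NULL;
	    e = e->nvi_hashtable_next)
		if (strcmp(nvpair_name(&e->nvi_nvp), name) == 0)
			return (e);
	return (NULL);
}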