Diffstat (limited to 'sys/cddl/contrib')
83 files changed, 5650 insertions, 1903 deletions
diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c index 89a64ea1d960..c322a5bd2179 100644 --- a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c +++ b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, 2017 by Delphix. All rights reserved. */ #include <sys/debug.h> @@ -142,6 +143,13 @@ static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type, #define NVPAIR2I_NVP(nvp) \ ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp))) +#ifdef _KERNEL +int nvpair_max_recursion = 20; +#else +int nvpair_max_recursion = 100; +#endif + +uint64_t nvlist_hashtable_init_size = (1 << 4); int nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...) @@ -250,6 +258,291 @@ nv_priv_alloc_embedded(nvpriv_t *priv) return (emb_priv); } +static int +nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets) +{ + ASSERT3P(priv->nvp_hashtable, ==, NULL); + ASSERT0(priv->nvp_nbuckets); + ASSERT0(priv->nvp_nentries); + + i_nvp_t **tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *)); + if (tab == NULL) + return (ENOMEM); + + priv->nvp_hashtable = tab; + priv->nvp_nbuckets = buckets; + return (0); +} + +static void +nvt_tab_free(nvpriv_t *priv) +{ + i_nvp_t **tab = priv->nvp_hashtable; + if (tab == NULL) { + ASSERT0(priv->nvp_nbuckets); + ASSERT0(priv->nvp_nentries); + return; + } + + nv_mem_free(priv, tab, priv->nvp_nbuckets * sizeof (i_nvp_t *)); + + priv->nvp_hashtable = NULL; + priv->nvp_nbuckets = 0; + priv->nvp_nentries = 0; +} + +static uint32_t +nvt_hash(const char *p) +{ + uint32_t g, hval = 0; + + while (*p) { + hval = (hval << 4) + *p++; + if ((g = (hval & 0xf0000000)) != 0) + hval ^= g >> 24; + hval &= ~g; + } + return (hval); +} + +static boolean_t +nvt_nvpair_match(nvpair_t *nvp1, nvpair_t *nvp2, uint32_t nvflag) +{ + boolean_t match = B_FALSE; + if (nvflag & NV_UNIQUE_NAME_TYPE) { + if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0 && + NVP_TYPE(nvp1) == NVP_TYPE(nvp2)) + match = B_TRUE; + } else { + ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME); + if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0) + match = B_TRUE; + } + return (match); +} + +static nvpair_t * +nvt_lookup_name_type(nvlist_t *nvl, const char *name, data_type_t type) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + ASSERT(priv != NULL); + + i_nvp_t **tab = priv->nvp_hashtable; + + if (tab == NULL) { + ASSERT3P(priv->nvp_list, ==, NULL); + ASSERT0(priv->nvp_nbuckets); + ASSERT0(priv->nvp_nentries); + return (NULL); + } else { + ASSERT(priv->nvp_nbuckets != 0); + } + + uint64_t hash = nvt_hash(name); + uint64_t index = hash & (priv->nvp_nbuckets - 1); + + ASSERT3U(index, <, priv->nvp_nbuckets); + i_nvp_t *entry = tab[index]; + + for (i_nvp_t *e = entry; e != NULL; e = e->nvi_hashtable_next) { + if (strcmp(NVP_NAME(&e->nvi_nvp), name) == 0 && + (type == DATA_TYPE_DONTCARE || + NVP_TYPE(&e->nvi_nvp) == type)) + return (&e->nvi_nvp); + } + return (NULL); +} + +static nvpair_t * +nvt_lookup_name(nvlist_t *nvl, const char *name) +{ + return (nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE)); +} + +static int +nvt_resize(nvpriv_t *priv, uint32_t new_size) +{ + i_nvp_t **tab = priv->nvp_hashtable; + + /* + * Migrate all the entries from the current table + * to a newly-allocated table with the new size by + * re-adjusting the pointers of their entries. 
+ */ + uint32_t size = priv->nvp_nbuckets; + uint32_t new_mask = new_size - 1; + ASSERT(((new_size) & ((new_size) - 1)) == 0); + + i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *)); + if (new_tab == NULL) + return (ENOMEM); + + uint32_t nentries = 0; + for (uint32_t i = 0; i < size; i++) { + i_nvp_t *next, *e = tab[i]; + + while (e != NULL) { + next = e->nvi_hashtable_next; + + uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp)); + uint32_t index = hash & new_mask; + + e->nvi_hashtable_next = new_tab[index]; + new_tab[index] = e; + nentries++; + + e = next; + } + tab[i] = NULL; + } + ASSERT3U(nentries, ==, priv->nvp_nentries); + + nvt_tab_free(priv); + + priv->nvp_hashtable = new_tab; + priv->nvp_nbuckets = new_size; + priv->nvp_nentries = nentries; + + return (0); +} + +static boolean_t +nvt_needs_togrow(nvpriv_t *priv) +{ + /* + * Grow only when we have more elements than buckets + * and the # of buckets doesn't overflow. + */ + return (priv->nvp_nentries > priv->nvp_nbuckets && + (UINT32_MAX >> 1) >= priv->nvp_nbuckets); +} + +/* + * Allocate a new table that's twice the size of the old one, + * and migrate all the entries from the old one to the new + * one by re-adjusting their pointers. + */ +static int +nvt_grow(nvpriv_t *priv) +{ + uint32_t current_size = priv->nvp_nbuckets; + /* ensure we won't overflow */ + ASSERT3U(UINT32_MAX >> 1, >=, current_size); + return (nvt_resize(priv, current_size << 1)); +} + +static boolean_t +nvt_needs_toshrink(nvpriv_t *priv) +{ + /* + * Shrink only when the # of elements is less than or + * equal to 1/4 the # of buckets. Never shrink less than + * nvlist_hashtable_init_size. + */ + ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size); + if (priv->nvp_nbuckets == nvlist_hashtable_init_size) + return (B_FALSE); + return (priv->nvp_nentries <= (priv->nvp_nbuckets >> 2)); +} + +/* + * Allocate a new table that's half the size of the old one, + * and migrate all the entries from the old one to the new + * one by re-adjusting their pointers. + */ +static int +nvt_shrink(nvpriv_t *priv) +{ + uint32_t current_size = priv->nvp_nbuckets; + /* ensure we won't overflow */ + ASSERT3U(current_size, >=, nvlist_hashtable_init_size); + return (nvt_resize(priv, current_size >> 1)); +} + +static int +nvt_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + + if (nvt_needs_toshrink(priv)) { + int err = nvt_shrink(priv); + if (err != 0) + return (err); + } + i_nvp_t **tab = priv->nvp_hashtable; + + char *name = NVP_NAME(nvp); + uint64_t hash = nvt_hash(name); + uint64_t index = hash & (priv->nvp_nbuckets - 1); + + ASSERT3U(index, <, priv->nvp_nbuckets); + i_nvp_t *bucket = tab[index]; + + for (i_nvp_t *prev = NULL, *e = bucket; + e != NULL; prev = e, e = e->nvi_hashtable_next) { + if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_flag)) { + if (prev != NULL) { + prev->nvi_hashtable_next = + e->nvi_hashtable_next; + } else { + ASSERT3P(e, ==, bucket); + tab[index] = e->nvi_hashtable_next; + } + e->nvi_hashtable_next = NULL; + priv->nvp_nentries--; + break; + } + } + + return (0); +} + +static int +nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + + /* initialize nvpair table now if it doesn't exist. 
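The hunks above add a PJW/ELF-style string hash and power-of-two bucket tables to the nvpair code. A minimal userland sketch of the same scheme follows; the power-of-two invariant is what lets hash & (nbuckets - 1) stand in for a modulo and makes nvt_grow()/nvt_shrink() a single shift. The main() and the sample names are illustrative only; the kernel code uses the nv_mem_* allocators and i_nvp_t chaining shown above.

#include <stdio.h>
#include <stdint.h>

/* Same hash as the patch's nvt_hash(): a PJW/ELF-style string hash. */
static uint32_t
nvt_hash(const char *p)
{
	uint32_t g, hval = 0;

	while (*p) {
		hval = (hval << 4) + *p++;
		if ((g = (hval & 0xf0000000)) != 0)
			hval ^= g >> 24;
		hval &= ~g;
	}
	return (hval);
}

int
main(void)
{
	/*
	 * Bucket counts stay powers of two (the table starts at 1 << 4,
	 * doubles in nvt_grow() and halves in nvt_shrink()), so indexing
	 * is a mask rather than a modulo.
	 */
	uint32_t nbuckets = 1 << 4;
	const char *names[] = { "guid", "createtxg", "com.delphix:spacemap_v2" };

	for (int i = 0; i < 3; i++) {
		uint32_t hash = nvt_hash(names[i]);
		printf("%-24s hash=%08x bucket=%u\n", names[i],
		    (unsigned)hash, (unsigned)(hash & (nbuckets - 1)));
	}

	/* Growing is a single shift; rehashing reuses the wider mask. */
	nbuckets <<= 1;
	printf("after grow: %u buckets, mask 0x%x\n", (unsigned)nbuckets,
	    (unsigned)(nbuckets - 1));
	return (0);
}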
*/ + if (priv->nvp_hashtable == NULL) { + int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size); + if (err != 0) + return (err); + } + + /* + * if we don't allow duplicate entries, make sure to + * unlink any existing entries from the table. + */ + if (nvl->nvl_nvflag != 0) { + int err = nvt_remove_nvpair(nvl, nvp); + if (err != 0) + return (err); + } + + if (nvt_needs_togrow(priv)) { + int err = nvt_grow(priv); + if (err != 0) + return (err); + } + i_nvp_t **tab = priv->nvp_hashtable; + + char *name = NVP_NAME(nvp); + uint64_t hash = nvt_hash(name); + uint64_t index = hash & (priv->nvp_nbuckets - 1); + + ASSERT3U(index, <, priv->nvp_nbuckets); + i_nvp_t *bucket = tab[index]; + + /* insert link at the beginning of the bucket */ + i_nvp_t *new_entry = NVPAIR2I_NVP(nvp); + ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL); + new_entry->nvi_hashtable_next = bucket; + tab[index] = new_entry; + + priv->nvp_nentries++; + return (0); +} + static void nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv) { @@ -582,6 +875,7 @@ nvlist_free(nvlist_t *nvl) else nvl->nvl_priv = 0; + nvt_tab_free(priv); nv_mem_free(priv, priv, sizeof (nvpriv_t)); } @@ -642,26 +936,14 @@ nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva) int nvlist_remove_all(nvlist_t *nvl, const char *name) { - nvpriv_t *priv; - i_nvp_t *curr; int error = ENOENT; - if (nvl == NULL || name == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + if (nvl == NULL || name == NULL || nvl->nvl_priv == 0) return (EINVAL); - curr = priv->nvp_list; - while (curr != NULL) { - nvpair_t *nvp = &curr->nvi_nvp; - - curr = curr->nvi_next; - if (strcmp(name, NVP_NAME(nvp)) != 0) - continue; - - nvp_buf_unlink(nvl, nvp); - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - + nvpair_t *nvp; + while ((nvp = nvt_lookup_name(nvl, name)) != NULL) { + VERIFY0(nvlist_remove_nvpair(nvl, nvp)); error = 0; } @@ -674,28 +956,14 @@ nvlist_remove_all(nvlist_t *nvl, const char *name) int nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type) { - nvpriv_t *priv; - i_nvp_t *curr; - - if (nvl == NULL || name == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + if (nvl == NULL || name == NULL || nvl->nvl_priv == 0) return (EINVAL); - curr = priv->nvp_list; - while (curr != NULL) { - nvpair_t *nvp = &curr->nvi_nvp; - - if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) { - nvp_buf_unlink(nvl, nvp); - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - - return (0); - } - curr = curr->nvi_next; - } + nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type); + if (nvp == NULL) + return (ENOENT); - return (ENOENT); + return (nvlist_remove_nvpair(nvl, nvp)); } int @@ -704,6 +972,10 @@ nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) if (nvl == NULL || nvp == NULL) return (EINVAL); + int err = nvt_remove_nvpair(nvl, nvp); + if (err != 0) + return (err); + nvp_buf_unlink(nvl, nvp); nvpair_free(nvp); nvp_buf_free(nvl, nvp); @@ -908,6 +1180,8 @@ nvlist_add_common(nvlist_t *nvl, const char *name, /* calculate sizes of the nvpair elements and the nvpair itself */ name_sz = strlen(name) + 1; + if (name_sz >= 1ULL << (sizeof (nvp->nvp_name_sz) * 8 - 1)) + return (EINVAL); nvp_sz = NVP_SIZE_CALC(name_sz, value_sz); @@ -979,6 +1253,12 @@ nvlist_add_common(nvlist_t *nvl, const char *name, else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE) (void) nvlist_remove(nvl, name, type); + err = nvt_add_nvpair(nvl, nvp); + if (err != 0) { + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + return (err); + } nvp_buf_link(nvl, nvp); return (0); @@ 
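Two defensive bounds arrive with this change: nvlist_add_common() (in the hunk just below) now rejects names too long for the pair's name-size field, and the nvs_embedded() hunks further down cap encode/decode recursion at nvpair_max_recursion (20 in kernel, 100 in userland). A standalone sketch of both guards, assuming the real nvp_name_sz field is a signed 16-bit integer; the node_t type and walk() are stand-ins for an embedded nvlist and nvs_operation():

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>

/* Illustrative cap; the patch uses 20 (kernel) / 100 (userland). */
static int max_recursion = 20;

typedef struct node {
	struct node *child;
} node_t;

/*
 * The shape of the new nvs_embedded() guard: bump a depth counter before
 * descending into an embedded list, fail with EINVAL at the cap, and
 * always drop the counter on the way back out.
 */
static int
walk(const node_t *n, int *depth)
{
	int err = 0;

	if (*depth >= max_recursion)
		return (EINVAL);
	(*depth)++;
	if (n->child != NULL)
		err = walk(n->child, depth);
	(*depth)--;
	return (err);
}

/*
 * The shape of the new name-size guard in nvlist_add_common(): reject
 * names whose length (plus NUL) cannot fit a signed 16-bit nvp_name_sz.
 */
static int
namelen_check(const char *name)
{
	size_t name_sz = strlen(name) + 1;

	if (name_sz >= 1ULL << (sizeof (int16_t) * 8 - 1))
		return (EINVAL);
	return (0);
}

int
main(void)
{
	node_t leaf = { NULL }, root = { &leaf };
	int depth = 0;

	printf("walk: %d, name: %d\n", walk(&root, &depth),
	    namelen_check("com.delphix:spacemap_v2"));
	return (0);
}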
-1328,25 +1608,17 @@ static int nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type, uint_t *nelem, void *data) { - nvpriv_t *priv; - nvpair_t *nvp; - i_nvp_t *curr; - - if (name == NULL || nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) return (EINVAL); if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE))) return (ENOTSUP); - for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { - nvp = &curr->nvi_nvp; - - if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) - return (nvpair_value_common(nvp, type, nelem, data)); - } + nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type); + if (nvp == NULL) + return (ENOENT); - return (ENOENT); + return (nvpair_value_common(nvp, type, nelem, data)); } int @@ -2018,6 +2290,7 @@ typedef struct { const nvs_ops_t *nvs_ops; void *nvs_private; nvpriv_t *nvs_priv; + int nvs_recursion; } nvstream_t; /* @@ -2103,6 +2376,12 @@ nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl) return (EFAULT); } + err = nvt_add_nvpair(nvl, nvp); + if (err != 0) { + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + return (err); + } nvp_buf_link(nvl, nvp); } return (err); @@ -2169,9 +2448,16 @@ static int nvs_embedded(nvstream_t *nvs, nvlist_t *embedded) { switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - return (nvs_operation(nvs, embedded, NULL)); + case NVS_OP_ENCODE: { + int err; + if (nvs->nvs_recursion >= nvpair_max_recursion) + return (EINVAL); + nvs->nvs_recursion++; + err = nvs_operation(nvs, embedded, NULL); + nvs->nvs_recursion--; + return (err); + } case NVS_OP_DECODE: { nvpriv_t *priv; int err; @@ -2184,8 +2470,14 @@ nvs_embedded(nvstream_t *nvs, nvlist_t *embedded) nvlist_init(embedded, embedded->nvl_nvflag, priv); + if (nvs->nvs_recursion >= nvpair_max_recursion) { + nvlist_free(embedded); + return (EINVAL); + } + nvs->nvs_recursion++; if ((err = nvs_operation(nvs, embedded, NULL)) != 0) nvlist_free(embedded); + nvs->nvs_recursion--; return (err); } default: @@ -2273,6 +2565,7 @@ nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, return (EINVAL); nvs.nvs_op = nvs_op; + nvs.nvs_recursion = 0; /* * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c index 5f3d22f703bc..67774cddb8c9 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c @@ -229,6 +229,12 @@ zpool_feature_init(void) "Pool state can be checkpointed, allowing rewind later.", ZFEATURE_FLAG_READONLY_COMPAT, NULL); + zfeature_register(SPA_FEATURE_SPACEMAP_V2, + "com.delphix:spacemap_v2", "spacemap_v2", + "Space maps representing large segments are more efficient.", + ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, + NULL); + static const spa_feature_t large_blocks_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h index 12bd4ffe1ccc..1972ba397fae 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h @@ -60,6 +60,7 @@ typedef enum spa_feature { SPA_FEATURE_DEVICE_REMOVAL, SPA_FEATURE_OBSOLETE_COUNTS, SPA_FEATURE_POOL_CHECKPOINT, + SPA_FEATURE_SPACEMAP_V2, SPA_FEATURES } spa_feature_t; diff --git 
a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c index 09975125261b..bad8f20e6917 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ /* @@ -34,8 +34,6 @@ * name is invalid. In the kernel, we only care whether it's valid or not. * Each routine therefore takes a 'namecheck_err_t' which describes exactly why * the name failed to validate. - * - * Each function returns 0 on success, -1 on error. */ #if defined(_KERNEL) @@ -50,6 +48,14 @@ #include "zfs_namecheck.h" #include "zfs_deleg.h" +/* + * Deeply nested datasets can overflow the stack, so we put a limit + * on the amount of nesting a path can have. zfs_max_dataset_nesting + * can be tuned temporarily to fix existing datasets that exceed our + * predefined limit. + */ +int zfs_max_dataset_nesting = 50; + static int valid_char(char c) { @@ -60,10 +66,35 @@ valid_char(char c) } /* + * Looks at a path and returns its level of nesting (depth). + */ +int +get_dataset_depth(const char *path) +{ + const char *loc = path; + int nesting = 0; + + /* + * Keep track of nesting until you hit the end of the + * path or find the snapshot/bookmark separator. + */ + for (int i = 0; loc[i] != '\0' && + loc[i] != '@' && + loc[i] != '#'; i++) { + if (loc[i] == '/') + nesting++; + } + + return (nesting); +} + +/* * Snapshot names must be made up of alphanumeric characters plus the following * characters: * - * [-_.: ] + * [-_.: ] + * + * Returns 0 on success, -1 on error. */ int zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what) @@ -99,6 +130,8 @@ zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what) * Permissions set name must start with the letter '@' followed by the * same character restrictions as snapshot names, except that the name * cannot exceed 64 characters. + * + * Returns 0 on success, -1 on error. */ int permset_namecheck(const char *path, namecheck_err_t *why, char *what) @@ -121,28 +154,40 @@ permset_namecheck(const char *path, namecheck_err_t *why, char *what) } /* + * Dataset paths should not be deeper than zfs_max_dataset_nesting + * in terms of nesting. + * + * Returns 0 on success, -1 on error. + */ +int +dataset_nestcheck(const char *path) +{ + return ((get_dataset_depth(path) < zfs_max_dataset_nesting) ? 0 : -1); +} + +/* * Entity names must be of the following form: * - * [component/]*[component][(@|#)component]? + * [component/]*[component][(@|#)component]? * * Where each component is made up of alphanumeric characters plus the following * characters: * - * [-_.:%] + * [-_.:%] * * We allow '%' here as we use that character internally to create unique * names for temporary clones (for online recv). + * + * Returns 0 on success, -1 on error. */ int entity_namecheck(const char *path, namecheck_err_t *why, char *what) { - const char *start, *end; - int found_delim; /* * Make sure the name is not too long. 
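get_dataset_depth() and dataset_nestcheck() above are self-contained, so they can be exercised directly. A userland rendering with an illustrative main() — the two functions mirror the patch, the test paths are made up:

#include <stdio.h>

int zfs_max_dataset_nesting = 50;

/* Same logic as the patch: count '/' until NUL, '@', or '#'. */
int
get_dataset_depth(const char *path)
{
	int nesting = 0;

	for (int i = 0; path[i] != '\0' && path[i] != '@' &&
	    path[i] != '#'; i++) {
		if (path[i] == '/')
			nesting++;
	}
	return (nesting);
}

int
dataset_nestcheck(const char *path)
{
	return ((get_dataset_depth(path) < zfs_max_dataset_nesting) ? 0 : -1);
}

int
main(void)
{
	/* The '@snap' suffix does not count toward nesting. */
	printf("%d\n", get_dataset_depth("tank/a/b/c@snap"));	/* 3 */
	printf("%d\n", dataset_nestcheck("tank/a/b/c"));	/* 0 = ok */
	return (0);
}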
*/ - if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) { if (why) *why = NAME_ERR_TOOLONG; @@ -162,8 +207,8 @@ entity_namecheck(const char *path, namecheck_err_t *why, char *what) return (-1); } - start = path; - found_delim = 0; + const char *start = path; + boolean_t found_delim = B_FALSE; for (;;) { /* Find the end of this component */ end = start; @@ -198,7 +243,7 @@ entity_namecheck(const char *path, namecheck_err_t *why, char *what) return (-1); } - found_delim = 1; + found_delim = B_TRUE; } /* Zero-length components are not allowed */ @@ -250,6 +295,8 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what) * mountpoint names must be of the following form: * * /[component][/]*[component][/] + * + * Returns 0 on success, -1 on error. */ int mountpoint_namecheck(const char *path, namecheck_err_t *why) @@ -294,6 +341,8 @@ mountpoint_namecheck(const char *path, namecheck_err_t *why) * dataset names, with the additional restriction that the pool name must begin * with a letter. The pool names 'raidz' and 'mirror' are also reserved names * that cannot be used. + * + * Returns 0 on success, -1 on error. */ int pool_namecheck(const char *pool, namecheck_err_t *why, char *what) diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h index db70641dbab2..527db92b0cfa 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #ifndef _ZFS_NAMECHECK_H @@ -48,9 +48,13 @@ typedef enum { #define ZFS_PERMSET_MAXLEN 64 +extern int zfs_max_dataset_nesting; + +int get_dataset_depth(const char *); int pool_namecheck(const char *, namecheck_err_t *, char *); int entity_namecheck(const char *, namecheck_err_t *, char *); int dataset_namecheck(const char *, namecheck_err_t *, char *); +int dataset_nestcheck(const char *); int mountpoint_namecheck(const char *, namecheck_err_t *); int zfs_component_namecheck(const char *, namecheck_err_t *, char *); int permset_namecheck(const char *, namecheck_err_t *, char *); diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c index 5f7bcaba5450..880051800365 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c @@ -427,6 +427,10 @@ zfs_prop_init(void) zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<count>", "SSCOUNT"); + zprop_register_number(ZFS_PROP_GUID, "guid", 0, PROP_READONLY, + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "GUID"); + zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY, + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "CREATETXG"); /* inherit number properties */ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", @@ -434,8 +438,6 @@ zfs_prop_init(void) ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE"); /* hidden properties */ - zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "CREATETXG"); zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG"); zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, @@ -447,8 
+449,6 @@ zfs_prop_init(void) zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "STMF_SBD_LU"); - zprop_register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "GUID"); zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "USERACCOUNTING"); diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files index 8b6720e619f1..27c5fa6e06d0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files +++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files @@ -124,6 +124,7 @@ ZFS_COMMON_OBJS += \ vdev_indirect.o \ vdev_indirect_births.o \ vdev_indirect_mapping.o \ + vdev_initialize.o \ vdev_label.o \ vdev_mirror.o \ vdev_missing.o \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 88bbf7ef7c7e..db7ca9889d47 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ @@ -377,6 +377,13 @@ u_int zfs_arc_free_target = 0; /* Absolute min for arc min / max is 16MB. */ static uint64_t arc_abs_min = 16 << 20; +/* + * ARC dirty data constraints for arc_tempreserve_space() throttle + */ +uint_t zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */ +uint_t zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */ +uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ + boolean_t zfs_compressed_arc_enabled = B_TRUE; static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); @@ -5148,12 +5155,13 @@ arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; - if (buf == NULL) { + ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; } else { + ASSERT(zio == NULL || zio->io_error == 0); *bufp = buf; - ASSERT(buf->b_data); + ASSERT(buf->b_data != NULL); } } @@ -5181,6 +5189,7 @@ arc_read_done(zio_t *zio) arc_callback_t *callback_list; arc_callback_t *acb; boolean_t freeable = B_FALSE; + boolean_t no_zio_error = (zio->io_error == 0); /* * The hdr was inserted into hash-table and removed from lists @@ -5206,7 +5215,7 @@ arc_read_done(zio_t *zio) ASSERT3P(hash_lock, !=, NULL); } - if (zio->io_error == 0) { + if (no_zio_error) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { @@ -5227,8 +5236,7 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); - if (hash_lock && zio->io_error == 0 && - hdr->b_l1hdr.b_state == arc_anon) { + if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. 
This is because * if we've issued an I/O for an evicted buffer, we've already @@ -5251,20 +5259,38 @@ arc_read_done(zio_t *zio) callback_cnt++; - if (zio->io_error != 0) - continue; - - int error = arc_buf_alloc_impl(hdr, acb->acb_private, - acb->acb_compressed, - B_TRUE, &acb->acb_buf); - if (error != 0) { - arc_buf_destroy(acb->acb_buf, acb->acb_private); - acb->acb_buf = NULL; + if (no_zio_error) { + int error = arc_buf_alloc_impl(hdr, acb->acb_private, + acb->acb_compressed, zio->io_error == 0, + &acb->acb_buf); + if (error != 0) { + /* + * Decompression failed. Set io_error + * so that when we call acb_done (below), + * we will indicate that the read failed. + * Note that in the unusual case where one + * callback is compressed and another + * uncompressed, we will mark all of them + * as failed, even though the uncompressed + * one can't actually fail. In this case, + * the hdr will not be anonymous, because + * if there are multiple callbacks, it's + * because multiple threads found the same + * arc buf in the hash table. + */ + zio->io_error = error; + } } - - if (zio->io_error == 0) - zio->io_error = error; } + /* + * If there are multiple callbacks, we must have the hash lock, + * because the only way for multiple threads to find this hdr is + * in the hash table. This ensures that if there are multiple + * callbacks, the hdr is not anonymous. If it were anonymous, + * we couldn't use arc_buf_destroy() in the error case below. + */ + ASSERT(callback_cnt < 2 || hash_lock != NULL); + hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (callback_cnt == 0) { @@ -5276,7 +5302,7 @@ arc_read_done(zio_t *zio) ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (zio->io_error == 0) { + if (no_zio_error) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); @@ -5309,7 +5335,16 @@ arc_read_done(zio_t *zio) /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { - if (acb->acb_done) { + if (acb->acb_done != NULL) { + if (zio->io_error != 0 && acb->acb_buf != NULL) { + /* + * If arc_buf_alloc_impl() fails during + * decompression, the buf will still be + * allocated, and needs to be freed here. + */ + arc_buf_destroy(acb->acb_buf, acb->acb_private); + acb->acb_buf = NULL; + } acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_buf, acb->acb_private); } @@ -6280,12 +6315,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, } static int -arc_memory_throttle(uint64_t reserve, uint64_t txg) +arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) { #ifdef _KERNEL uint64_t available_memory = ptob(freemem); - static uint64_t page_load = 0; - static uint64_t last_txg = 0; #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) available_memory = MIN(available_memory, uma_avail()); @@ -6294,9 +6327,9 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg) if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) return (0); - if (txg > last_txg) { - last_txg = txg; - page_load = 0; + if (txg > spa->spa_lowmem_last_txg) { + spa->spa_lowmem_last_txg = txg; + spa->spa_lowmem_page_load = 0; } /* * If we are in pageout, we know that memory is already tight, @@ -6304,18 +6337,19 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg) * continue to let page writes occur as quickly as possible. 
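The arc_memory_throttle() rework in this hunk replaces the function-static page_load/last_txg pair with per-spa fields (spa_lowmem_page_load, spa_lowmem_last_txg), so one pool's low-memory bookkeeping can no longer bleed into another's. A sketch of the difference, with a hypothetical pool_t standing in for spa_t:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in for the two fields the patch adds to spa_t. */
typedef struct pool {
	const char *name;
	uint64_t lowmem_page_load;
	uint64_t lowmem_last_txg;
} pool_t;

/*
 * With a static local (the old code), every pool shared one accumulator;
 * with per-pool fields (the new code), the counter resets per txg per pool.
 */
static void
note_reserve(pool_t *p, uint64_t txg, uint64_t reserve)
{
	if (txg > p->lowmem_last_txg) {
		p->lowmem_last_txg = txg;
		p->lowmem_page_load = 0;
	}
	p->lowmem_page_load += reserve / 8;	/* reserve is inflated; deflate */
}

int
main(void)
{
	pool_t a = { "upper", 0, 0 }, b = { "backing", 0, 0 };

	note_reserve(&a, 100, 4096);
	note_reserve(&b, 100, 8192);
	printf("%s=%ju %s=%ju\n", a.name, (uintmax_t)a.lowmem_page_load,
	    b.name, (uintmax_t)b.lowmem_page_load);
	return (0);
}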
*/ if (curproc == pageproc) { - if (page_load > MAX(ptob(minfree), available_memory) / 4) + if (spa->spa_lowmem_page_load > + MAX(ptob(minfree), available_memory) / 4) return (SET_ERROR(ERESTART)); /* Note: reserve is inflated, so we deflate */ - page_load += reserve / 8; + atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); return (0); - } else if (page_load > 0 && arc_reclaim_needed()) { + } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { /* memory is low, delay before restarting */ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); return (SET_ERROR(EAGAIN)); } - page_load = 0; -#endif + spa->spa_lowmem_page_load = 0; +#endif /* _KERNEL */ return (0); } @@ -6327,7 +6361,7 @@ arc_tempreserve_clear(uint64_t reserve) } int -arc_tempreserve_space(uint64_t reserve, uint64_t txg) +arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; @@ -6356,7 +6390,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ - error = arc_memory_throttle(reserve, txg); + error = arc_memory_throttle(spa, reserve, txg); if (error != 0) return (error); @@ -6364,12 +6398,24 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. + * + * In the case of one pool being built on another pool, we want + * to make sure we don't end up throttling the lower (backing) + * pool when the upper pool is the majority contributor to dirty + * data. To insure we make forward progress during throttling, we + * also check the current pool's net dirty data and only throttle + * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty + * data in the cache. + * * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ + uint64_t total_dirty = reserve + arc_tempreserve + anon_size; + uint64_t spa_dirty_anon = spa_dirty_data(spa); - if (reserve + arc_tempreserve + anon_size > arc_c / 2 && - anon_size > arc_c / 4) { + if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && + anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && + spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { uint64_t meta_esize = refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 533de180bf1c..1db7bfe02881 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
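arc_tempreserve_space() now derives its throttle from three tunables instead of the hard-coded arc_c/2 and arc_c/4, and adds a per-pool dirty check so a pool layered on top of another cannot starve its backing pool. A sketch of the predicate with the patch's default percentages; the byte figures in main() are invented:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* The patch's defaults: 50%, 25%, 20%. */
static unsigned dirty_limit_pct = 50;
static unsigned anon_limit_pct = 25;
static unsigned pool_dirty_pct = 20;

/*
 * Throttle only when total dirty data is high, anonymous buffers are a
 * large share of the cache, AND this pool actually contributed -- the
 * third test is what keeps an upper pool from starving its backing pool.
 */
static bool
should_throttle(uint64_t arc_c, uint64_t total_dirty, uint64_t anon_size,
    uint64_t spa_dirty_anon)
{
	return (total_dirty > arc_c * dirty_limit_pct / 100 &&
	    anon_size > arc_c * anon_limit_pct / 100 &&
	    spa_dirty_anon > anon_size * pool_dirty_pct / 100);
}

int
main(void)
{
	uint64_t arc_c = 1ULL << 30;	/* 1 GiB cache target */

	/* Dirty overall, but this pool contributed little: no throttle. */
	printf("%d\n", should_throttle(arc_c, 600 << 20, 300 << 20, 10 << 20));
	/* Dirty and this pool is the offender: throttle. */
	printf("%d\n", should_throttle(arc_c, 600 << 20, 300 << 20, 100 << 20));
	return (0);
}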
@@ -49,8 +49,7 @@ #include <sys/abd.h> #include <sys/vdev.h> #include <sys/cityhash.h> - -uint_t zfs_dbuf_evict_key; +#include <sys/spa_impl.h> static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); @@ -74,24 +73,58 @@ static kcondvar_t dbuf_evict_cv; static boolean_t dbuf_evict_thread_exit; /* - * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that - * are not currently held but have been recently released. These dbufs - * are not eligible for arc eviction until they are aged out of the cache. - * Dbufs are added to the dbuf cache once the last hold is released. If a - * dbuf is later accessed and still exists in the dbuf cache, then it will - * be removed from the cache and later re-added to the head of the cache. - * Dbufs that are aged out of the cache will be immediately destroyed and - * become eligible for arc eviction. + * There are two dbuf caches; each dbuf can only be in one of them at a time. + * + * 1. Cache of metadata dbufs, to help make read-heavy administrative commands + * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs + * that represent the metadata that describes filesystems/snapshots/ + * bookmarks/properties/etc. We only evict from this cache when we export a + * pool, to short-circuit as much I/O as possible for all administrative + * commands that need the metadata. There is no eviction policy for this + * cache, because we try to only include types in it which would occupy a + * very small amount of space per object but create a large impact on the + * performance of these commands. Instead, after it reaches a maximum size + * (which should only happen on very small memory systems with a very large + * number of filesystem objects), we stop taking new dbufs into the + * metadata cache, instead putting them in the normal dbuf cache. + * + * 2. LRU cache of dbufs. The "dbuf cache" maintains a list of dbufs that + * are not currently held but have been recently released. These dbufs + * are not eligible for arc eviction until they are aged out of the cache. + * Dbufs that are aged out of the cache will be immediately destroyed and + * become eligible for arc eviction. + * + * Dbufs are added to these caches once the last hold is released. If a dbuf is + * later accessed and still exists in the dbuf cache, then it will be removed + * from the cache and later re-added to the head of the cache. + * + * If a given dbuf meets the requirements for the metadata cache, it will go + * there, otherwise it will be considered for the generic LRU dbuf cache. The + * caches and the refcounts tracking their sizes are stored in an array indexed + * by those caches' matching enum values (from dbuf_cached_state_t). */ -static multilist_t *dbuf_cache; -static refcount_t dbuf_cache_size; -uint64_t dbuf_cache_max_bytes = 0; +typedef struct dbuf_cache { + multilist_t *cache; + refcount_t size; +} dbuf_cache_t; +dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; -/* Set the default size of the dbuf cache to log2 fraction of arc size. 
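The block comment above describes the split into a metadata cache and an LRU cache, stored in an array indexed by dbuf_cached_state_t. A toy version of that arrangement — enum-indexed caches with a per-cache size counter — with names simplified from the patch (a plain long stands in for the refcount_t byte counter):

#include <stdio.h>

/* Mirrors the patch's dbuf_cached_state_t / dbuf_caches[] arrangement. */
typedef enum cached_state {
	CACHE_METADATA,		/* DB_DBUF_METADATA_CACHE */
	CACHE_LRU,		/* DB_DBUF_CACHE */
	CACHE_MAX		/* DB_CACHE_MAX */
} cached_state_t;

typedef struct cache {
	const char *name;
	long size;
} cache_t;

static cache_t caches[CACHE_MAX] = {
	{ "metadata", 0 },
	{ "lru", 0 },
};

/* A dbuf remembers which cache holds it, so release is just an index. */
static void
cache_insert(cached_state_t dcs, long bytes)
{
	caches[dcs].size += bytes;
}

int
main(void)
{
	cache_insert(CACHE_METADATA, 4096);
	cache_insert(CACHE_LRU, 131072);
	for (int i = 0; i < CACHE_MAX; i++)
		printf("%-9s %ld bytes\n", caches[i].name, caches[i].size);
	return (0);
}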
*/ +/* Size limits for the caches */ +uint64_t dbuf_cache_max_bytes = 0; +uint64_t dbuf_metadata_cache_max_bytes = 0; +/* Set the default sizes of the caches to log2 fraction of arc size */ int dbuf_cache_shift = 5; +int dbuf_metadata_cache_shift = 6; /* - * The dbuf cache uses a three-stage eviction policy: + * For diagnostic purposes, this is incremented whenever we can't add + * something to the metadata cache because it's full, and instead put + * the data in the regular dbuf cache. + */ +uint64_t dbuf_metadata_cache_overflow; + +/* + * The LRU dbuf cache uses a three-stage eviction policy: * - A low water marker designates when the dbuf eviction thread * should stop evicting from the dbuf cache. * - When we reach the maximum size (aka mid water mark), we @@ -404,6 +437,41 @@ dbuf_is_metadata(dmu_buf_impl_t *db) } /* + * This returns whether this dbuf should be stored in the metadata cache, which + * is based on whether it's from one of the dnode types that store data related + * to traversing dataset hierarchies. + */ +static boolean_t +dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) +{ + DB_DNODE_ENTER(db); + dmu_object_type_t type = DB_DNODE(db)->dn_type; + DB_DNODE_EXIT(db); + + /* Check if this dbuf is one of the types we care about */ + if (DMU_OT_IS_METADATA_CACHED(type)) { + /* If we hit this, then we set something up wrong in dmu_ot */ + ASSERT(DMU_OT_IS_METADATA(type)); + + /* + * Sanity check for small-memory systems: don't allocate too + * much memory for this purpose. + */ + if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) > + dbuf_metadata_cache_max_bytes) { + dbuf_metadata_cache_overflow++; + DTRACE_PROBE1(dbuf__metadata__cache__overflow, + dmu_buf_impl_t *, db); + return (B_FALSE); + } + + return (B_TRUE); + } + + return (B_FALSE); +} + +/* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the dbuf eviction * code is laid out; dbuf_evict_thread() assumes dbufs are evenly @@ -438,7 +506,7 @@ dbuf_cache_above_hiwater(void) uint64_t dbuf_cache_hiwater_bytes = (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; - return (refcount_count(&dbuf_cache_size) > + return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); } @@ -448,7 +516,7 @@ dbuf_cache_above_lowater(void) uint64_t dbuf_cache_lowater_bytes = (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; - return (refcount_count(&dbuf_cache_size) > + return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); } @@ -458,19 +526,12 @@ dbuf_cache_above_lowater(void) static void dbuf_evict_one(void) { - int idx = multilist_get_random_index(dbuf_cache); - multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx); + int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); + multilist_sublist_t *mls = multilist_sublist_lock( + dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); - /* - * Set the thread's tsd to indicate that it's processing evictions. - * Once a thread stops evicting from the dbuf cache it will - * reset its tsd to NULL. 
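dbuf_cache_above_hiwater()/dbuf_cache_above_lowater() above implement hysteresis: eviction starts a margin above dbuf_cache_max_bytes and runs until a margin below it, so the evict thread doesn't thrash right at the limit. A standalone sketch — the 10% margins and the sizes are illustrative, since the *_pct defaults aren't shown in this hunk:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static uint64_t cache_max = 100 << 20;	/* illustrative: 100 MiB */
static unsigned hiwater_pct = 10, lowater_pct = 10;

/* Evict-start threshold: a margin above the maximum. */
static bool
above_hiwater(uint64_t size)
{
	return (size > cache_max + cache_max * hiwater_pct / 100);
}

/* Evict-stop threshold: a margin below the maximum, giving hysteresis. */
static bool
above_lowater(uint64_t size)
{
	return (size > cache_max - cache_max * lowater_pct / 100);
}

int
main(void)
{
	uint64_t size = 115 << 20;

	/* The evict loop's shape: start past hiwater, run down to lowater. */
	if (above_hiwater(size)) {
		while (above_lowater(size))
			size -= 1 << 20;	/* stand-in for dbuf_evict_one() */
	}
	printf("settled at %ju MiB\n", (uintmax_t)(size >> 20));
	return (0);
}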
- */ - ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); - (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); - dmu_buf_impl_t *db = multilist_sublist_tail(mls); while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { db = multilist_sublist_prev(mls, db); @@ -482,13 +543,14 @@ dbuf_evict_one(void) if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); - (void) refcount_remove_many(&dbuf_cache_size, + (void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); + ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); + db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); } else { multilist_sublist_unlock(mls); } - (void) tsd_set(zfs_dbuf_evict_key, NULL); } /* @@ -542,35 +604,13 @@ dbuf_evict_thread(void *unused __unused) static void dbuf_evict_notify(void) { - - /* - * We use thread specific data to track when a thread has - * started processing evictions. This allows us to avoid deeply - * nested stacks that would have a call flow similar to this: - * - * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() - * ^ | - * | | - * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ - * - * The dbuf_eviction_thread will always have its tsd set until - * that thread exits. All other threads will only set their tsd - * if they are participating in the eviction process. This only - * happens if the eviction thread is unable to process evictions - * fast enough. To keep the dbuf cache size in check, other threads - * can evict from the dbuf cache directly. Those threads will set - * their tsd values so that we ensure that they only evict one dbuf - * from the dbuf cache. - */ - if (tsd_get(zfs_dbuf_evict_key) != NULL) - return; - /* * We check if we should evict without holding the dbuf_evict_lock, * because it's OK to occasionally make the wrong decision here, * and grabbing the lock results in massive lock contention. */ - if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > + dbuf_cache_max_bytes) { if (dbuf_cache_above_hiwater()) dbuf_evict_one(); cv_signal(&dbuf_evict_cv); @@ -610,15 +650,21 @@ retry: mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); /* - * Setup the parameters for the dbuf cache. We set the size of the - * dbuf cache to 1/32nd (default) of the size of the ARC. If the value - * has been set in /etc/system and it's not greater than the size of - * the ARC, then we honor that value. + * Setup the parameters for the dbuf caches. We set the sizes of the + * dbuf cache and the metadata cache to 1/32nd and 1/16th (default) + * of the size of the ARC, respectively. If the values are set in + * /etc/system and they're not greater than the size of the ARC, then + * we honor that value. 
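On sizing: with the defaults above, dbuf_cache_shift = 5 yields 1/32nd of the ARC and dbuf_metadata_cache_shift = 6 yields 1/64th, since a right shift by n divides by 2^n — the merged comment's "1/16th" appears to be an upstream slip. The arithmetic:

#include <stdio.h>
#include <stdint.h>

static int dbuf_cache_shift = 5;		/* 1/32 of ARC max */
static int dbuf_metadata_cache_shift = 6;	/* 1/64 of ARC max */

int
main(void)
{
	uint64_t arc_max = 4ULL << 30;	/* pretend a 4 GiB ARC */

	printf("dbuf cache:     %ju MiB\n",
	    (uintmax_t)((arc_max >> dbuf_cache_shift) >> 20));
	printf("metadata cache: %ju MiB\n",
	    (uintmax_t)((arc_max >> dbuf_metadata_cache_shift) >> 20));
	return (0);
}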
*/ if (dbuf_cache_max_bytes == 0 || dbuf_cache_max_bytes >= arc_max_bytes()) { dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift; } + if (dbuf_metadata_cache_max_bytes == 0 || + dbuf_metadata_cache_max_bytes >= arc_max_bytes()) { + dbuf_metadata_cache_max_bytes = + arc_max_bytes() >> dbuf_metadata_cache_shift; + } /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc @@ -626,12 +672,14 @@ retry: */ dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); - dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_cache_link), - dbuf_cache_multilist_index_func); - refcount_create(&dbuf_cache_size); + for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { + dbuf_caches[dcs].cache = + multilist_create(sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_cache_link), + dbuf_cache_multilist_index_func); + refcount_create(&dbuf_caches[dcs].size); + } - tsd_create(&zfs_dbuf_evict_key, NULL); dbuf_evict_thread_exit = B_FALSE; mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); @@ -658,13 +706,14 @@ dbuf_fini(void) cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); - tsd_destroy(&zfs_dbuf_evict_key); mutex_destroy(&dbuf_evict_lock); cv_destroy(&dbuf_evict_cv); - refcount_destroy(&dbuf_cache_size); - multilist_destroy(dbuf_cache); + for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { + refcount_destroy(&dbuf_caches[dcs].size); + multilist_destroy(dbuf_caches[dcs].cache); + } } /* @@ -915,8 +964,15 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, ASSERT(refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - if (db->db_level == 0 && db->db_freed_in_flight) { - /* we were freed in flight; disregard any error */ + if (buf == NULL) { + /* i/o error */ + ASSERT(zio == NULL || zio->io_error != 0); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT3P(db->db_buf, ==, NULL); + db->db_state = DB_UNCACHED; + } else if (db->db_level == 0 && db->db_freed_in_flight) { + /* freed in flight */ + ASSERT(zio == NULL || zio->io_error == 0); if (buf == NULL) { buf = arc_alloc_buf(db->db_objset->os_spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size); @@ -927,16 +983,14 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; - } else if (buf != NULL) { + } else { + /* success */ + ASSERT(zio == NULL || zio->io_error == 0); dbuf_set_data(db, buf); db->db_state = DB_CACHED; - } else { - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT3P(db->db_buf, ==, NULL); - db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); - dbuf_rele_and_unlock(db, NULL); + dbuf_rele_and_unlock(db, NULL, B_FALSE); } static void @@ -2051,9 +2105,15 @@ dbuf_destroy(dmu_buf_impl_t *db) dbuf_clear_data(db); if (multilist_link_active(&db->db_cache_link)) { - multilist_remove(dbuf_cache, db); - (void) refcount_remove_many(&dbuf_cache_size, + ASSERT(db->db_caching_status == DB_DBUF_CACHE || + db->db_caching_status == DB_DBUF_METADATA_CACHE); + + multilist_remove(dbuf_caches[db->db_caching_status].cache, db); + (void) refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); + + db->db_caching_status = DB_NO_CACHE; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); @@ -2090,7 +2150,8 @@ dbuf_destroy(dmu_buf_impl_t *db) * value in dnode_move(), since DB_DNODE_EXIT doesn't 
actually * release any lock. */ - dnode_rele(dn, db); + mutex_enter(&dn->dn_mtx); + dnode_rele_and_unlock(dn, db, B_TRUE); db->db_dnode_handle = NULL; dbuf_hash_remove(db); @@ -2107,6 +2168,7 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); kmem_cache_free(dbuf_kmem_cache, db); @@ -2116,8 +2178,10 @@ dbuf_destroy(dmu_buf_impl_t *db) * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ - if (parent && parent != dndb) - dbuf_rele(parent, db); + if (parent && parent != dndb) { + mutex_enter(&parent->db_mtx); + dbuf_rele_and_unlock(parent, db, B_TRUE); + } } /* @@ -2245,6 +2309,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; + db->db_caching_status = DB_NO_CACHE; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); @@ -2277,6 +2342,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, avl_add(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; + db->db_caching_status = DB_NO_CACHE; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); @@ -2338,6 +2404,13 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); + if (abuf == NULL) { + ASSERT(zio == NULL || zio->io_error != 0); + kmem_free(dpa, sizeof (*dpa)); + return; + } + ASSERT(zio == NULL || zio->io_error == 0); + /* * The dpa_dnode is only valid if we are called with a NULL * zio. This indicates that the arc_read() returned without @@ -2619,9 +2692,15 @@ top: if (multilist_link_active(&db->db_cache_link)) { ASSERT(refcount_is_zero(&db->db_holds)); - multilist_remove(dbuf_cache, db); - (void) refcount_remove_many(&dbuf_cache_size, + ASSERT(db->db_caching_status == DB_DBUF_CACHE || + db->db_caching_status == DB_DBUF_METADATA_CACHE); + + multilist_remove(dbuf_caches[db->db_caching_status].cache, db); + (void) refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); + + db->db_caching_status = DB_NO_CACHE; } (void) refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); @@ -2734,7 +2813,7 @@ void dbuf_rele(dmu_buf_impl_t *db, void *tag) { mutex_enter(&db->db_mtx); - dbuf_rele_and_unlock(db, tag); + dbuf_rele_and_unlock(db, tag, B_FALSE); } void @@ -2745,10 +2824,19 @@ dmu_buf_rele(dmu_buf_t *db, void *tag) /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow - * db_dirtycnt and db_holds to be updated atomically. + * db_dirtycnt and db_holds to be updated atomically. The 'evicting' + * argument should be set if we are already in the dbuf-evicting code + * path, in which case we don't want to recursively evict. 
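dbuf_rele_and_unlock() now takes an 'evicting' flag in place of the old TSD scheme, breaking the rele -> evict_notify -> evict_one -> destroy -> rele cycle drawn in the surrounding comment. A toy model of why the flag terminates the recursion; all names here are hypothetical:

#include <stdio.h>
#include <stdbool.h>

static int cache_size = 3;

static void release(bool evicting);

/* Stand-in for dbuf_evict_one(): destroying a buf releases holds. */
static void
evict_one(void)
{
	cache_size--;
	/* true: we're already evicting, so don't re-enter eviction. */
	release(true);
}

/* Stand-in for dbuf_rele_and_unlock(db, tag, evicting). */
static void
release(bool evicting)
{
	printf("release (evicting=%d), cache=%d\n", evicting, cache_size);
	if (!evicting && cache_size > 2)
		evict_one();	/* the old code could loop rele->evict->rele */
}

int
main(void)
{
	release(false);
	return (0);
}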
This allows us to + * avoid deeply nested stacks that would have a call flow similar to this: + * + * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() + * ^ | + * | | + * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * */ void -dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) +dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) { int64_t holds; @@ -2838,12 +2926,23 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) db->db_pending_evict) { dbuf_destroy(db); } else if (!multilist_link_active(&db->db_cache_link)) { - multilist_insert(dbuf_cache, db); - (void) refcount_add_many(&dbuf_cache_size, + ASSERT3U(db->db_caching_status, ==, + DB_NO_CACHE); + + dbuf_cached_state_t dcs = + dbuf_include_in_metadata_cache(db) ? + DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; + db->db_caching_status = dcs; + + multilist_insert(dbuf_caches[dcs].cache, db); + (void) refcount_add_many(&dbuf_caches[dcs].size, db->db.db_size, db); mutex_exit(&db->db_mtx); - dbuf_evict_notify(); + if (db->db_caching_status == DB_DBUF_CACHE && + !evicting) { + dbuf_evict_notify(); + } } if (do_arc_evict) @@ -3108,7 +3207,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); return; } @@ -3458,7 +3557,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); } static void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index 7bab517fba8c..bfd8e2d7c9ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -79,60 +79,60 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN, int zfs_object_remap_one_indirect_delay_ticks = 0; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { - { DMU_BSWAP_UINT8, TRUE, "unallocated" }, - { DMU_BSWAP_ZAP, TRUE, "object directory" }, - { DMU_BSWAP_UINT64, TRUE, "object array" }, - { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, - { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, - { DMU_BSWAP_UINT64, TRUE, "bpobj" }, - { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, - { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, - { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, - { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, - { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, - { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, - { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, - { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, - { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, - { DMU_BSWAP_ZAP, TRUE, "DSL props" }, - { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, - { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, - { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, - { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, - { DMU_BSWAP_UINT8, FALSE, "zvol object" }, - { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, - { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, - { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, - { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, - { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, - { DMU_BSWAP_UINT8, TRUE, 
"SPA history" }, - { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, - { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, - { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, - { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, - { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, - { DMU_BSWAP_UINT8, TRUE, "FUID table" }, - { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, - { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, - { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, - { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, - { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, - { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, - { DMU_BSWAP_UINT8, TRUE, "System attributes" }, - { DMU_BSWAP_ZAP, TRUE, "SA master node" }, - { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, - { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, - { DMU_BSWAP_ZAP, TRUE, "scan translations" }, - { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, - { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, - { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, - { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, - { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } + { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" }, + { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" }, + { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" }, + { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" }, + { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" }, + { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" }, + { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "System 
attributes" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { @@ -449,7 +449,7 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ -static int +int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { @@ -1321,7 +1321,7 @@ xuio_stat_wbuf_nocopy(void) } #ifdef _KERNEL -static int +int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; @@ -1437,7 +1437,7 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) return (err); } -static int +int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; @@ -1881,22 +1881,17 @@ dmu_return_arcbuf(arc_buf_t *buf) * dmu_write(). */ void -dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, +dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { - dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; - dnode_t *dn; dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; - DB_DNODE_ENTER(dbuf); - dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(dbuf); /* * We can only assign if the offset is aligned, the arc buf is the @@ -1924,11 +1919,8 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); - DB_DNODE_ENTER(dbuf); - dn = DB_DNODE(dbuf); os = dn->dn_objset; object = dn->dn_object; - DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); @@ -1937,6 +1929,17 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, } } +void +dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; + + DB_DNODE_ENTER(dbuf); + dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx); + DB_DNODE_EXIT(dbuf); +} + typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c index 40898ef26d97..b853081e8b7c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. 
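The dmu_ot table above gains a third boolean column marking which object types belong in the new metadata cache; dbuf.c consumes it through DMU_OT_IS_METADATA_CACHED(). A reduced model of the table-plus-macro pattern, with the types and fields trimmed to the essentials (the byteswap column is dropped for brevity):

#include <stdio.h>
#include <stdbool.h>

typedef struct type_info {
	bool metadata;		/* existing column */
	bool metadata_cached;	/* the new column this patch adds */
	const char *name;
} type_info_t;

enum { OT_PLAIN_FILE, OT_DSL_DIR, OT_DSL_PROPS, OT_NUMTYPES };

static const type_info_t ot_table[OT_NUMTYPES] = {
	{ false, false, "ZFS plain file" },
	{ true,  true,  "DSL directory" },
	{ true,  true,  "DSL props" },
};

/* Analogous to the DMU_OT_IS_METADATA_CACHED() test used in dbuf.c. */
#define	OT_IS_METADATA_CACHED(ot)	(ot_table[(ot)].metadata_cached)

int
main(void)
{
	for (int ot = 0; ot < OT_NUMTYPES; ot++)
		printf("%-16s cached=%d\n", ot_table[ot].name,
		    OT_IS_METADATA_CACHED(ot));
	return (0);
}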
*/ @@ -32,7 +32,8 @@ #include <sys/zfeature.h> uint64_t -dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, +dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { uint64_t object; @@ -92,7 +93,8 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, os->os_obj_next = object - 1; } - dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_allocate(dn, ot, blocksize, indirect_blockshift, + bonustype, bonuslen, tx); mutex_exit(&os->os_obj_lock); dmu_tx_add_new_object(tx, dn); @@ -101,6 +103,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, return (object); } +uint64_t +dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (dmu_object_alloc_ibs(os, ot, blocksize, 0, + bonustype, bonuslen, tx)); +} + int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) @@ -157,6 +167,10 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) return (err); ASSERT(dn->dn_type != DMU_OT_NONE); + /* + * If we don't create this free range, we'll leak indirect blocks when + * we get to freeing the dnode in syncing context. + */ dnode_free_range(dn, 0, DMU_OBJECT_END, tx); dnode_free(dn, tx); dnode_rele(dn, FTAG); @@ -204,13 +218,19 @@ dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type, } ASSERT3U(dn->dn_type, ==, old_type); ASSERT0(dn->dn_maxblkid); + + /* + * We must initialize the ZAP data before changing the type, + * so that concurrent calls to *_is_zapified() can determine if + * the object has been completely zapified by checking the type. + */ + mzap_create_impl(mos, object, 0, 0, tx); + dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type = DMU_OTN_ZAP_METADATA; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); - mzap_create_impl(mos, object, 0, 0, tx); - spa_feature_incr(dmu_objset_spa(mos), SPA_FEATURE_EXTENSIBLE_DATASET, tx); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index 60119c7cda54..50c18a58f6bc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -54,6 +54,7 @@ #include <sys/dsl_destroy.h> #include <sys/vdev.h> #include <sys/zfeature.h> +#include "zfs_namecheck.h" /* * Needed to close a window in dnode_move() that allows the objset to be freed @@ -498,6 +499,14 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; } + /* + * These properties will be filled in by the logic in zfs_get_zplprop() + * when they are queried for the first time. 
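
/*
 * Usage sketch for dmu_object_alloc_ibs() above: the extra parameter is the
 * indirect block shift passed through to dnode_allocate(), and the existing
 * dmu_object_alloc() keeps its old behavior by passing 0 ("use the
 * default"). The call-site values here are illustrative only:
 */
static uint64_t
alloc_with_large_indirects(objset_t *os, dmu_tx_t *tx)
{
	/* 128K data blocks and 128K (1 << 17) indirect blocks, no bonus. */
	return (dmu_object_alloc_ibs(os, DMU_OT_UINT64_OTHER, 1 << 17, 17,
	    DMU_OT_NONE, 0, tx));
}
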
+ */ + os->os_version = OBJSET_PROP_UNINITIALIZED; + os->os_normalization = OBJSET_PROP_UNINITIALIZED; + os->os_utf8only = OBJSET_PROP_UNINITIALIZED; + os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED; if (ds == NULL || !ds->ds_is_snapshot) os->os_zil_header = os->os_phys->os_zil_header; @@ -905,6 +914,9 @@ dmu_objset_create_check(void *arg, dmu_tx_t *tx) if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); + if (dataset_nestcheck(doca->doca_name) != 0) + return (SET_ERROR(ENAMETOOLONG)); + error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); if (error != 0) return (error); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index 9609761b38f9..25c1fec0c146 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -76,6 +76,11 @@ TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit); static char *dmu_recv_tag = "dmu_recv_tag"; const char *recv_clone_name = "%recv"; +/* + * Use this to override the recordsize calculation for fast zfs send estimates. + */ +uint64_t zfs_override_estimate_recordsize = 0; + #define BP_SPAN(datablkszsec, indblkshift, level) \ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (indblkshift - SPA_BLKPTRSHIFT))) @@ -1131,7 +1136,7 @@ static int dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) { - int err; + int err = 0; uint64_t size; /* * Assume that space (both on-disk and in-stream) is dominated by @@ -1144,7 +1149,9 @@ dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, VERIFY0(dmu_objset_from_ds(ds, &os)); /* Assume all (uncompressed) blocks are recordsize. */ - if (os->os_phys->os_type == DMU_OST_ZVOL) { + if (zfs_override_estimate_recordsize != 0) { + recordsize = zfs_override_estimate_recordsize; + } else if (os->os_phys->os_type == DMU_OST_ZVOL) { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize); } else { @@ -1788,6 +1795,7 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, drc->drc_force = force; drc->drc_resumable = resumable; drc->drc_cred = CRED(); + drc->drc_clone = (origin != NULL); if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; @@ -1848,7 +1856,9 @@ struct receive_writer_arg { /* A map from guid to dataset to help handle dedup'd streams. */ avl_tree_t *guid_to_ds_map; boolean_t resumable; - uint64_t last_object, last_offset; + uint64_t last_object; + uint64_t last_offset; + uint64_t max_object; /* highest object ID referenced in stream */ uint64_t bytes_read; /* bytes read when current record created */ }; @@ -1896,14 +1906,10 @@ typedef struct guid_map_entry { static int guid_compare(const void *arg1, const void *arg2) { - const guid_map_entry_t *gmep1 = arg1; - const guid_map_entry_t *gmep2 = arg2; + const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1; + const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2; - if (gmep1->guid < gmep2->guid) - return (-1); - else if (gmep1->guid > gmep2->guid) - return (1); - return (0); + return (AVL_CMP(gmep1->guid, gmep2->guid)); } static void @@ -2145,6 +2151,9 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, return (SET_ERROR(EINVAL)); object = err == 0 ? 
drro->drr_object : DMU_NEW_OBJECT; + if (drro->drr_object > rwa->max_object) + rwa->max_object = drro->drr_object; + /* * If we are losing blkptrs or changing the block size this must * be a new file instance. We must clear out the previous file @@ -2240,6 +2249,9 @@ receive_freeobjects(struct receive_writer_arg *rwa, err = dmu_free_long_object(rwa->os, obj); if (err != 0) return (err); + + if (obj > rwa->max_object) + rwa->max_object = obj; } if (next_err != ESRCH) return (next_err); @@ -2269,6 +2281,9 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, rwa->last_object = drrw->drr_object; rwa->last_offset = drrw->drr_offset; + if (rwa->last_object > rwa->max_object) + rwa->max_object = rwa->last_object; + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); @@ -2345,6 +2360,9 @@ receive_write_byref(struct receive_writer_arg *rwa, ref_os = rwa->os; } + if (drrwbr->drr_object > rwa->max_object) + rwa->max_object = drrwbr->drr_object; + err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); if (err != 0) @@ -2387,6 +2405,9 @@ receive_write_embedded(struct receive_writer_arg *rwa, if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); + if (drrwe->drr_object > rwa->max_object) + rwa->max_object = drrwe->drr_object; + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwe->drr_object, @@ -2423,6 +2444,9 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); + if (drrs->drr_object > rwa->max_object) + rwa->max_object = drrs->drr_object; + VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); @@ -2467,6 +2491,9 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); + if (drrf->drr_object > rwa->max_object) + rwa->max_object = drrf->drr_object; + err = dmu_free_long_range(rwa->os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); @@ -3086,6 +3113,41 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, } mutex_exit(&rwa.mutex); + /* + * If we are receiving a full stream as a clone, all object IDs which + * are greater than the maximum ID referenced in the stream are + * by definition unused and must be freed. Note that it's possible that + * we've resumed this send and the first record we received was the END + * record. In that case, max_object would be 0, but we shouldn't start + * freeing all objects from there; instead we should start from the + * resumeobj. 
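
/*
 * Each receive_*() handler above repeats the same high-water-mark update on
 * rwa->max_object; an equivalent helper, shown purely for illustration (it
 * is not part of the change):
 */
static inline void
receive_note_object(struct receive_writer_arg *rwa, uint64_t obj)
{
	if (obj > rwa->max_object)
		rwa->max_object = obj;	/* highest object ID seen so far */
}
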
+ */ + if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) { + uint64_t obj; + if (nvlist_lookup_uint64(begin_nvl, "resume_object", &obj) != 0) + obj = 0; + if (rwa.max_object > obj) + obj = rwa.max_object; + obj++; + int free_err = 0; + int next_err = 0; + + while (next_err == 0) { + free_err = dmu_free_long_object(rwa.os, obj); + if (free_err != 0 && free_err != ENOENT) + break; + + next_err = dmu_object_next(rwa.os, &obj, FALSE, 0); + } + + if (err == 0) { + if (free_err != 0 && free_err != ENOENT) + err = free_err; + else if (next_err != ESRCH) + err = next_err; + } + } + cv_destroy(&rwa.cv); mutex_destroy(&rwa.mutex); bqueue_destroy(&rwa.q); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index ad02fa5918aa..4ac640e54d6c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -1091,7 +1091,12 @@ dmu_tx_wait(dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = NULL; } else { - txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); + /* + * If we have a lot of dirty data just wait until we sync + * out a TXG at which point we'll hopefully have synced + * a portion of the changes. + */ + txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index 13a4a02bbfb4..4d72991b5ef6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -78,19 +78,13 @@ dbuf_compare(const void *x1, const void *x2) const dmu_buf_impl_t *d1 = x1; const dmu_buf_impl_t *d2 = x2; - if (d1->db_level < d2->db_level) { - return (-1); - } - if (d1->db_level > d2->db_level) { - return (1); - } + int cmp = AVL_CMP(d1->db_level, d2->db_level); + if (likely(cmp)) + return (cmp); - if (d1->db_blkid < d2->db_blkid) { - return (-1); - } - if (d1->db_blkid > d2->db_blkid) { - return (1); - } + cmp = AVL_CMP(d1->db_blkid, d2->db_blkid); + if (likely(cmp)) + return (cmp); if (d1->db_state == DB_SEARCH) { ASSERT3S(d2->db_state, !=, DB_SEARCH); @@ -100,13 +94,7 @@ dbuf_compare(const void *x1, const void *x2) return (1); } - if ((uintptr_t)d1 < (uintptr_t)d2) { - return (-1); - } - if ((uintptr_t)d1 > (uintptr_t)d2) { - return (1); - } - return (0); + return (AVL_PCMP(d1, d2)); } /* ARGSUSED */ @@ -742,6 +730,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_datablkszsec = odn->dn_datablkszsec; ndn->dn_datablksz = odn->dn_datablksz; ndn->dn_maxblkid = odn->dn_maxblkid; + bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0], + sizeof (odn->dn_next_type)); bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], sizeof (odn->dn_next_nblkptr)); bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], @@ -1238,11 +1228,11 @@ void dnode_rele(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); - dnode_rele_and_unlock(dn, tag); + dnode_rele_and_unlock(dn, tag, B_FALSE); } void -dnode_rele_and_unlock(dnode_t *dn, void *tag) +dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) { uint64_t refs; /* Get while the hold prevents the dnode from moving. */ @@ -1273,7 +1263,8 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag) * that the handle has zero references, but that will be * asserted anyway when the handle gets destroyed. 
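
/*
 * For reference: the AVL_CMP()/AVL_PCMP() helpers used by the comparator
 * rewrites in this change (guid_compare() and dbuf_compare() above) are
 * branch-free three-way comparisons. The definitions below sketch what
 * sys/avl.h provides; treat them as an assumption rather than part of this
 * diff:
 */
#define	AVL_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
#define	AVL_PCMP(a, b)	\
	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
#define	AVL_ISIGN(a)	(((a) > 0) - ((a) < 0))
/* Each expression evaluates to -1, 0, or 1, so a comparator can return it. */
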
*/ - dbuf_rele(db, dnh); + mutex_enter(&db->db_mtx); + dbuf_rele_and_unlock(db, dnh, evicting); } } @@ -1518,6 +1509,72 @@ dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) } } +/* + * Dirty all the in-core level-1 dbufs in the range specified by start_blkid + * and end_blkid. + */ +static void +dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, + dmu_tx_t *tx) +{ + dmu_buf_impl_t db_search; + dmu_buf_impl_t *db; + avl_index_t where; + + mutex_enter(&dn->dn_dbufs_mtx); + + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + for (;;) { + + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + + if (db == NULL || db->db_level != 1 || + db->db_blkid >= end_blkid) { + break; + } + + /* + * Setup the next blkid we want to search for. + */ + db_search.db_blkid = db->db_blkid + 1; + ASSERT3U(db->db_blkid, >=, start_blkid); + + /* + * If the dbuf transitions to DB_EVICTING while we're trying + * to dirty it, then we will be unable to discover it in + * the dbuf hash table. This will result in a call to + * dbuf_create() which needs to acquire the dn_dbufs_mtx + * lock. To avoid a deadlock, we drop the lock before + * dirtying the level-1 dbuf. + */ + mutex_exit(&dn->dn_dbufs_mtx); + dnode_dirty_l1(dn, db->db_blkid, tx); + mutex_enter(&dn->dn_dbufs_mtx); + } + +#ifdef ZFS_DEBUG + /* + * Walk all the in-core level-1 dbufs and verify they have been dirtied. + */ + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { + if (db->db_level != 1 || db->db_blkid >= end_blkid) + break; + ASSERT(db->db_dirtycnt > 0); + } +#endif + mutex_exit(&dn->dn_dbufs_mtx); +} + void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { @@ -1550,13 +1607,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (off == 0 && len >= blksz) { /* * Freeing the whole block; fast-track this request. - * Note that we won't dirty any indirect blocks, - * which is fine because we will be freeing the entire - * file and thus all indirect blocks will be freed - * by free_children(). */ blkid = 0; nblks = 1; + if (dn->dn_nlevels > 1) + dnode_dirty_l1(dn, 0, tx); goto done; } else if (off >= blksz) { /* Freeing past end-of-data */ @@ -1669,6 +1724,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (last != first) dnode_dirty_l1(dn, last, tx); + dnode_dirty_l1range(dn, first, last, tx); + int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = first + 1; i < last; i++) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 2fcaf7927de6..02f263c82e42 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
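
/*
 * The lock handling in dnode_dirty_l1range() above is a general pattern:
 * when work done inside an AVL walk may need the lock that protects the
 * tree, resume the walk from a search key rather than from a node pointer
 * that dropping the lock may have invalidated. A sketch with hypothetical
 * names (node_t, lookup_ge(), and process() are illustrative, not real
 * APIs in this tree):
 */
static void
walk_resumable(avl_tree_t *t, kmutex_t *lock, uint64_t start, uint64_t end)
{
	uint64_t next = start;

	for (;;) {
		mutex_enter(lock);
		node_t *n = lookup_ge(t, next);	/* first entry >= next */
		if (n == NULL || n->key >= end) {
			mutex_exit(lock);
			break;
		}
		next = n->key + 1;	/* resume point survives the unlock */
		mutex_exit(lock);
		process(n->key);	/* may itself acquire "lock" */
	}
}
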
 */
@@ -229,9 +229,24 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 }
 #endif
 
+/*
+ * We don't usually free the indirect blocks here. If in one txg we have a
+ * free_range and a write to the same indirect block, it's important that we
+ * preserve the hole's birth times. Therefore, we don't free any indirect
+ * blocks in free_children(). If an indirect block happens to turn into all
+ * holes, it will be freed by dbuf_write_children_ready, which happens at a
+ * point in the syncing process where we know for certain the contents of the
+ * indirect block.
+ *
+ * However, if we're freeing a dnode, its space accounting must go to zero
+ * before we actually try to free the dnode, or we will trip an assertion. In
+ * addition, we know the case described above cannot occur, because the dnode is
+ * being freed. Therefore, we free the indirect blocks immediately in that
+ * case.
+ */
 static void
 free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
-    dmu_tx_t *tx)
+    boolean_t free_indirects, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	blkptr_t *bp;
@@ -248,6 +263,24 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 	if (db->db_state != DB_CACHED)
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 
+	/*
+	 * If we modify this indirect block, and we are not freeing the
+	 * dnode (!free_indirects), then this indirect block needs to get
+	 * written to disk by dbuf_write(). If it is dirty, we know it will
+	 * be written (otherwise, we would have incorrect on-disk state
+	 * because the space would be freed but still referenced by the BP
+	 * in this indirect block). Therefore we VERIFY that it is
+	 * dirty.
+	 *
+	 * Our VERIFY covers some cases that do not actually have to be
+	 * dirty, but the open-context code happens to dirty. E.g. if the
+	 * blocks we are freeing are all holes, because in that case, we
+	 * are only freeing part of this indirect block, so it is an
+	 * ancestor of the first or last block to be freed. The first and
+	 * last L1 indirect blocks are always dirtied by dnode_free_range().
+	 */
+	VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
+
 	dbuf_release_bp(db);
 	bp = db->db.db_data;
@@ -283,32 +316,16 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 			rw_exit(&dn->dn_struct_rwlock);
 			ASSERT3P(bp, ==, subdb->db_blkptr);
-			free_children(subdb, blkid, nblks, tx);
+			free_children(subdb, blkid, nblks, free_indirects, tx);
 			dbuf_rele(subdb, FTAG);
 		}
 	}
 
-	/* If this whole block is free, free ourself too. */
-	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
-		if (!BP_IS_HOLE(bp))
-			break;
-	}
-	if (i == 1 << epbs) {
-		/*
-		 * We only found holes. Grab the rwlock to prevent
-		 * anybody from reading the blocks we're about to
-		 * zero out.
-		 */
-		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	if (free_indirects) {
+		for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
+			ASSERT(BP_IS_HOLE(bp));
 		bzero(db->db.db_data, db->db.db_size);
-		rw_exit(&dn->dn_struct_rwlock);
 		free_blocks(dn, db->db_blkptr, 1, tx);
-	} else {
-		/*
-		 * Partial block free; must be marked dirty so that it
-		 * will be written out.
- */ - ASSERT(db->db_dirtycnt > 0); } DB_DNODE_EXIT(db); @@ -321,7 +338,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, */ static void dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, - dmu_tx_t *tx) + boolean_t free_indirects, dmu_tx_t *tx) { blkptr_t *bp = dn->dn_phys->dn_blkptr; int dnlevel = dn->dn_phys->dn_nlevels; @@ -361,7 +378,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); - free_children(db, blkid, nblks, tx); + free_children(db, blkid, nblks, free_indirects, tx); dbuf_rele(db, FTAG); } } @@ -380,6 +397,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, typedef struct dnode_sync_free_range_arg { dnode_t *dsfra_dnode; dmu_tx_t *dsfra_tx; + boolean_t dsfra_free_indirects; } dnode_sync_free_range_arg_t; static void @@ -389,7 +407,8 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) dnode_t *dn = dsfra->dsfra_dnode; mutex_exit(&dn->dn_mtx); - dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx); + dnode_sync_free_range_impl(dn, blkid, nblks, + dsfra->dsfra_free_indirects, dsfra->dsfra_tx); mutex_enter(&dn->dn_mtx); } @@ -420,6 +439,19 @@ dnode_evict_dbufs(dnode_t *dn) avl_insert_here(&dn->dn_dbufs, &db_marker, db, AVL_BEFORE); + /* + * We need to use the "marker" dbuf rather than + * simply getting the next dbuf, because + * dbuf_destroy() may actually remove multiple dbufs. + * It can call itself recursively on the parent dbuf, + * which may also be removed from dn_dbufs. The code + * flow would look like: + * + * dbuf_destroy(): + * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE): + * if (!cacheable || pending_evict) + * dbuf_destroy() + */ dbuf_destroy(db); db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); @@ -478,7 +510,7 @@ dnode_undirty_dbufs(list_t *list) list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); } } @@ -670,6 +702,11 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnode_sync_free_range_arg_t dsfra; dsfra.dsfra_dnode = dn; dsfra.dsfra_tx = tx; + dsfra.dsfra_free_indirects = freeing_dnode; + if (freeing_dnode) { + ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], + 0, dn->dn_maxblkid + 1)); + } mutex_enter(&dn->dn_mtx); range_tree_vacate(dn->dn_free_ranges[txgoff], dnode_sync_free_range, &dsfra); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c index 356e5b51c3f4..2f3647bc8e86 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c @@ -55,15 +55,10 @@ static int dsl_deadlist_compare(const void *arg1, const void *arg2) { - const dsl_deadlist_entry_t *dle1 = arg1; - const dsl_deadlist_entry_t *dle2 = arg2; + const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1; + const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2; - if (dle1->dle_mintxg < dle2->dle_mintxg) - return (-1); - else if (dle1->dle_mintxg > dle2->dle_mintxg) - return (+1); - else - return (0); + return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); } static void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c index 7870b4951b29..0ad658f910ec 100644 --- 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c @@ -384,14 +384,13 @@ typedef struct perm_set { static int perm_set_compare(const void *arg1, const void *arg2) { - const perm_set_t *node1 = arg1; - const perm_set_t *node2 = arg2; + const perm_set_t *node1 = (const perm_set_t *)arg1; + const perm_set_t *node2 = (const perm_set_t *)arg2; int val; val = strcmp(node1->p_setname, node2->p_setname); - if (val == 0) - return (0); - return (val > 0 ? 1 : -1); + + return (AVL_ISIGN(val)); } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c index 00b1dbe36d83..1a4194ebf16d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c @@ -1388,7 +1388,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, offsetof(struct tempreserve, tr_node)); ASSERT3S(asize, >, 0); - err = arc_tempreserve_space(lsize, tx->tx_txg); + err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg); if (err == 0) { struct tempreserve *tr; @@ -1819,16 +1819,28 @@ typedef struct dsl_dir_rename_arg { cred_t *ddra_cred; } dsl_dir_rename_arg_t; +typedef struct dsl_valid_rename_arg { + int char_delta; + int nest_delta; +} dsl_valid_rename_arg_t; + /* ARGSUSED */ static int dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { - int *deltap = arg; + dsl_valid_rename_arg_t *dvra = arg; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, namebuf); - if (strlen(namebuf) + *deltap >= ZFS_MAX_DATASET_NAME_LEN) + ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + int namelen = strlen(namebuf) + dvra->char_delta; + int depth = get_dataset_depth(namebuf) + dvra->nest_delta; + + if (namelen >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting) return (SET_ERROR(ENAMETOOLONG)); return (0); } @@ -1839,9 +1851,9 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx) dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; + dsl_valid_rename_arg_t dvra; const char *mynewname; int error; - int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); /* target dir should exist */ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); @@ -1870,10 +1882,19 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx) return (SET_ERROR(EEXIST)); } + ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + dvra.char_delta = strlen(ddra->ddra_newname) + - strlen(ddra->ddra_oldname); + dvra.nest_delta = get_dataset_depth(ddra->ddra_newname) + - get_dataset_depth(ddra->ddra_oldname); + /* if the name length is growing, validate child name lengths */ - if (delta > 0) { + if (dvra.char_delta > 0 || dvra.nest_delta > 0) { error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, - &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c index 8e7616427e19..48675909365b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2016 Gary Mills * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. @@ -2164,7 +2164,8 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) * block-sharing rules don't apply to it. */ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) && - ds->ds_dir != dp->dp_origin_snap->ds_dir) { + (dp->dp_origin_snap == NULL || + ds->ds_dir != dp->dp_origin_snap->ds_dir)) { objset_t *os; if (dmu_objset_from_ds(ds, &os) != 0) { goto out; @@ -2959,6 +2960,16 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, { vdev_t *vd; + if (vd->vdev_ops == &vdev_indirect_ops) { + /* + * The indirect vdev can point to multiple + * vdevs. For simplicity, always create + * the resilver zio_t. zio_vdev_io_start() + * will bypass the child resilver i/o's if + * they are on vdevs that don't have DTL's. + */ + return (B_TRUE); + } if (DVA_GET_GANG(dva)) { /* * Gang members may be spread across multiple @@ -3541,14 +3552,14 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; int d; + count_block(scn, dp->dp_blkstats, bp); + if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) return (0); - if (BP_IS_EMBEDDED(bp)) { - count_block(scn, dp->dp_blkstats, bp); - return (0); - } + /* Embedded BP's have phys_birth==0, so we reject them above. */ + ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index 6cff5eacdcdf..e374cd356792 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
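
/*
 * Two observations about the dsl_scan.c hunks above, made explicit for
 * illustration. First, count_block() now runs before the txg-range check,
 * so scan statistics cover every block visited, not just those actually
 * scrubbed. Second, per the new comment, embedded block pointers carry
 * their payload in the bp itself and have phys_birth == 0, which is why
 * the range check already rejects them; the invariant being relied on is
 * roughly:
 *
 *	ASSERT(!BP_IS_EMBEDDED(bp) || BP_PHYSICAL_BIRTH(bp) == 0);
 */
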
* Copyright (c) 2014 Integros [integros.com] */ @@ -275,6 +275,8 @@ static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); +static void metaslab_passivate(metaslab_t *msp, uint64_t weight); +static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); kmem_cache_t *metaslab_alloc_trace_cache; @@ -294,7 +296,12 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) mc->mc_rotor = NULL; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); - refcount_create_tracked(&mc->mc_alloc_slots); + mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * + sizeof (refcount_t), KM_SLEEP); + mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * + sizeof (uint64_t), KM_SLEEP); + for (int i = 0; i < spa->spa_alloc_count; i++) + refcount_create_tracked(&mc->mc_alloc_slots[i]); return (mc); } @@ -308,7 +315,12 @@ metaslab_class_destroy(metaslab_class_t *mc) ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); - refcount_destroy(&mc->mc_alloc_slots); + for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) + refcount_destroy(&mc->mc_alloc_slots[i]); + kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * + sizeof (refcount_t)); + kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * + sizeof (uint64_t)); mutex_destroy(&mc->mc_lock); kmem_free(mc, sizeof (metaslab_class_t)); } @@ -529,25 +541,40 @@ metaslab_class_expandable_space(metaslab_class_t *mc) static int metaslab_compare(const void *x1, const void *x2) { - const metaslab_t *m1 = x1; - const metaslab_t *m2 = x2; - - if (m1->ms_weight < m2->ms_weight) - return (1); - if (m1->ms_weight > m2->ms_weight) - return (-1); + const metaslab_t *m1 = (const metaslab_t *)x1; + const metaslab_t *m2 = (const metaslab_t *)x2; + + int sort1 = 0; + int sort2 = 0; + if (m1->ms_allocator != -1 && m1->ms_primary) + sort1 = 1; + else if (m1->ms_allocator != -1 && !m1->ms_primary) + sort1 = 2; + if (m2->ms_allocator != -1 && m2->ms_primary) + sort2 = 1; + else if (m2->ms_allocator != -1 && !m2->ms_primary) + sort2 = 2; /* - * If the weights are identical, use the offset to force uniqueness. + * Sort inactive metaslabs first, then primaries, then secondaries. When + * selecting a metaslab to allocate from, an allocator first tries its + * primary, then secondary active metaslab. If it doesn't have active + * metaslabs, or can't allocate from them, it searches for an inactive + * metaslab to activate. If it can't find a suitable one, it will steal + * a primary or secondary metaslab from another allocator. 
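
/*
 * A worked example of the comparator above, with two allocators and
 * illustrative weights. The sort class (0 = inactive, 1 = primary,
 * 2 = secondary) dominates; within a class, higher weight sorts earlier:
 *
 *	inactive, weight 900
 *	inactive, weight 300
 *	primary of allocator 0, weight 800
 *	primary of allocator 1, weight 200
 *	secondary of allocator 0, weight 700
 *
 * so avl_first() still yields the best inactive candidate, while the
 * already-claimed metaslabs cluster at the end of the tree.
 */
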
*/ - if (m1->ms_start < m2->ms_start) + if (sort1 < sort2) return (-1); - if (m1->ms_start > m2->ms_start) + if (sort1 > sort2) return (1); - ASSERT3P(m1, ==, m2); + int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); + if (likely(cmp)) + return (cmp); - return (0); + IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); + + return (AVL_CMP(m1->ms_start, m2->ms_start)); } /* @@ -683,12 +710,18 @@ metaslab_group_alloc_update(metaslab_group_t *mg) } metaslab_group_t * -metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) +metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { metaslab_group_t *mg; mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL); + mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), + KM_SLEEP); + mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), + KM_SLEEP); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); mg->mg_vd = vd; @@ -696,7 +729,16 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) mg->mg_activation_count = 0; mg->mg_initialized = B_FALSE; mg->mg_no_free_space = B_TRUE; - refcount_create_tracked(&mg->mg_alloc_queue_depth); + mg->mg_allocators = allocators; + + mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t), + KM_SLEEP); + mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * + sizeof (uint64_t), KM_SLEEP); + for (int i = 0; i < allocators; i++) { + refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); + mg->mg_cur_max_alloc_queue_depth[i] = 0; + } mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); @@ -718,8 +760,22 @@ metaslab_group_destroy(metaslab_group_t *mg) taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); + kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); + kmem_free(mg->mg_secondaries, mg->mg_allocators * + sizeof (metaslab_t *)); mutex_destroy(&mg->mg_lock); - refcount_destroy(&mg->mg_alloc_queue_depth); + mutex_destroy(&mg->mg_ms_initialize_lock); + cv_destroy(&mg->mg_ms_initialize_cv); + + for (int i = 0; i < mg->mg_allocators; i++) { + refcount_destroy(&mg->mg_alloc_queue_depth[i]); + mg->mg_cur_max_alloc_queue_depth[i] = 0; + } + kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * + sizeof (refcount_t)); + kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * + sizeof (uint64_t)); + kmem_free(mg, sizeof (metaslab_group_t)); } @@ -799,6 +855,22 @@ metaslab_group_passivate(metaslab_group_t *mg) taskq_wait(mg->mg_taskq); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); + for (int i = 0; i < mg->mg_allocators; i++) { + metaslab_t *msp = mg->mg_primaries[i]; + if (msp != NULL) { + mutex_enter(&msp->ms_lock); + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + mutex_exit(&msp->ms_lock); + } + msp = mg->mg_secondaries[i]; + if (msp != NULL) { + mutex_enter(&msp->ms_lock); + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + mutex_exit(&msp->ms_lock); + } + } mgprev = mg->mg_prev; mgnext = mg->mg_next; @@ -940,6 +1012,17 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) } static void +metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) +{ + ASSERT(MUTEX_HELD(&mg->mg_lock)); + 
ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); + msp->ms_weight = weight; + avl_add(&mg->mg_metaslab_tree, msp); + +} + +static void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { /* @@ -950,10 +1033,7 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); - ASSERT(msp->ms_group == mg); - avl_remove(&mg->mg_metaslab_tree, msp); - msp->ms_weight = weight; - avl_add(&mg->mg_metaslab_tree, msp); + metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } @@ -1001,7 +1081,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg) */ static boolean_t metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, - uint64_t psize) + uint64_t psize, int allocator) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; @@ -1030,7 +1110,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, if (mg->mg_allocatable) { metaslab_group_t *mgp; int64_t qdepth; - uint64_t qmax = mg->mg_max_alloc_queue_depth; + uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; if (!mc->mc_alloc_throttle_enabled) return (B_TRUE); @@ -1042,7 +1122,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, if (mg->mg_no_free_space) return (B_FALSE); - qdepth = refcount_count(&mg->mg_alloc_queue_depth); + qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]); /* * If this metaslab group is below its qmax or it's @@ -1061,9 +1141,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * groups at the same time when we make this check. */ for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { - qmax = mgp->mg_max_alloc_queue_depth; + qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; - qdepth = refcount_count(&mgp->mg_alloc_queue_depth); + qdepth = refcount_count( + &mgp->mg_alloc_queue_depth[allocator]); /* * If there is another metaslab group that @@ -1105,18 +1186,14 @@ metaslab_rangesize_compare(const void *x1, const void *x2) uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; - if (rs_size1 < rs_size2) - return (-1); - if (rs_size1 > rs_size2) - return (1); + int cmp = AVL_CMP(rs_size1, rs_size2); + if (likely(cmp)) + return (cmp); if (r1->rs_start < r2->rs_start) return (-1); - if (r1->rs_start > r2->rs_start) - return (1); - - return (0); + return (AVL_CMP(r1->rs_start, r2->rs_start)); } /* @@ -1468,9 +1545,12 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); + ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; + ms->ms_allocator = -1; + ms->ms_new = B_TRUE; /* * We only open space map objects that already exist. 
All others @@ -1567,6 +1647,7 @@ metaslab_fini(metaslab_t *msp) cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); + ASSERT3U(msp->ms_allocator, ==, -1); kmem_free(msp, sizeof (metaslab_t)); } @@ -1658,7 +1739,7 @@ metaslab_set_fragmentation(metaslab_t *msp) if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { msp->ms_condense_wanted = B_TRUE; vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); - spa_dbgmsg(spa, "txg %llu, requesting force condense: " + zfs_dbgmsg("txg %llu, requesting force condense: " "ms_id %llu, vdev_id %llu", txg, msp->ms_id, vd->vdev_id); } @@ -1963,19 +2044,59 @@ metaslab_weight(metaslab_t *msp) } static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight) +metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, + int allocator, uint64_t activation_weight) +{ + /* + * If we're activating for the claim code, we don't want to actually + * set the metaslab up for a specific allocator. + */ + if (activation_weight == METASLAB_WEIGHT_CLAIM) + return (0); + metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? + mg->mg_primaries : mg->mg_secondaries); + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + mutex_enter(&mg->mg_lock); + if (arr[allocator] != NULL) { + mutex_exit(&mg->mg_lock); + return (EEXIST); + } + + arr[allocator] = msp; + ASSERT3S(msp->ms_allocator, ==, -1); + msp->ms_allocator = allocator; + msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); + mutex_exit(&mg->mg_lock); + + return (0); +} + +static int +metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { + int error = 0; metaslab_load_wait(msp); if (!msp->ms_loaded) { - int error = metaslab_load(msp); - if (error) { + if ((error = metaslab_load(msp)) != 0) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } } + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { + /* + * The metaslab was activated for another allocator + * while we were waiting, we should reselect. 
+ */ + return (EBUSY); + } + if ((error = metaslab_activate_allocator(msp->ms_group, msp, + allocator, activation_weight)) != 0) { + return (error); + } msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort(msp->ms_group, msp, @@ -1988,6 +2109,34 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight) } static void +metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, + uint64_t weight) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { + metaslab_group_sort(mg, msp, weight); + return; + } + + mutex_enter(&mg->mg_lock); + ASSERT3P(msp->ms_group, ==, mg); + if (msp->ms_primary) { + ASSERT3U(0, <=, msp->ms_allocator); + ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); + ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); + ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); + mg->mg_primaries[msp->ms_allocator] = NULL; + } else { + ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); + ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); + mg->mg_secondaries[msp->ms_allocator] = NULL; + } + msp->ms_allocator = -1; + metaslab_group_sort_impl(mg, msp, weight); + mutex_exit(&mg->mg_lock); +} + +static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; @@ -2002,7 +2151,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight) ASSERT0(weight & METASLAB_ACTIVE_MASK); msp->ms_activation_weight = 0; - metaslab_group_sort(msp->ms_group, msp, weight); + metaslab_passivate_allocator(msp->ms_group, msp, weight); ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); } @@ -2105,17 +2254,6 @@ metaslab_group_preload(metaslab_group_t *mg) * * 3. The on-disk size of the space map should actually decrease. * - * Checking the first condition is tricky since we don't want to walk - * the entire AVL tree calculating the estimated on-disk size. Instead we - * use the size-ordered range tree in the metaslab and calculate the - * size required to write out the largest segment in our free tree. If the - * size required to represent that segment on disk is larger than the space - * map object then we avoid condensing this map. - * - * To determine the second criterion we use a best-case estimate and assume - * each segment can be represented on-disk as a single 64-bit entry. We refer - * to this best-case estimate as the space map's minimal form. - * * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. * Instead, we apply the heuristic described in the block comment for @@ -2126,9 +2264,6 @@ static boolean_t metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; - range_seg_t *rs; - uint64_t size, entries, segsz, object_size, optimal_size, record_size; - dmu_object_info_t doi; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1 << vd->vdev_ashift; uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); @@ -2154,34 +2289,22 @@ metaslab_should_condense(metaslab_t *msp) msp->ms_condense_checked_txg = current_txg; /* - * Use the ms_allocatable_by_size range tree, which is ordered by - * size, to obtain the largest segment in the free tree. We always - * condense metaslabs that are empty and metaslabs for which a - * condense request has been made. + * We always condense metaslabs that are empty and metaslabs for + * which a condense request has been made. 
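
/*
 * Caller-side contract sketch for metaslab_activate() above, condensed
 * from the call sites later in this patch (illustrative, not verbatim):
 * EBUSY means another allocator activated the metaslab while we waited for
 * it to load, so allocation callers pick a different one, while the claim
 * path can treat EBUSY as success because METASLAB_WEIGHT_CLAIM never
 * takes a per-allocator slot.
 */
int error = metaslab_activate(msp, allocator, activation_weight);
if (error == EBUSY && activation_weight == METASLAB_WEIGHT_CLAIM)
	error = 0;	/* already active; still usable for claiming */
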
*/ - rs = avl_last(&msp->ms_allocatable_by_size); - if (rs == NULL || msp->ms_condense_wanted) + if (avl_is_empty(&msp->ms_allocatable_by_size) || + msp->ms_condense_wanted) return (B_TRUE); - /* - * Calculate the number of 64-bit entries this segment would - * require when written to disk. If this single segment would be - * larger on-disk than the entire current on-disk structure, then - * clearly condensing will increase the on-disk structure size. - */ - size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - entries = size / (MIN(size, SM_RUN_MAX)); - segsz = entries * sizeof (uint64_t); - - optimal_size = - sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root); - object_size = space_map_length(msp->ms_sm); + uint64_t object_size = space_map_length(msp->ms_sm); + uint64_t optimal_size = space_map_estimate_optimal_size(sm, + msp->ms_allocatable, SM_NO_VDEVID); + dmu_object_info_t doi; dmu_object_info_from_db(sm->sm_dbuf, &doi); - record_size = MAX(doi.doi_data_block_size, vdev_blocksize); + uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); - return (segsz <= object_size && - object_size >= (optimal_size * zfs_condense_pct / 100) && + return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } @@ -2256,11 +2379,11 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) * optimal, this is typically close to optimal, and much cheaper to * compute. */ - space_map_write(sm, condense_tree, SM_ALLOC, tx); + space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - space_map_write(sm, msp->ms_allocatable, SM_FREE, tx); + space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; } @@ -2372,8 +2495,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_condense(msp, txg, tx); } else { mutex_exit(&msp->ms_lock); - space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); - space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx); + space_map_write(msp->ms_sm, alloctree, SM_ALLOC, + SM_NO_VDEVID, tx); + space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, + SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); } @@ -2388,7 +2513,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, - msp->ms_checkpointing, SM_FREE, tx); + msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); space_map_update(vd->vdev_checkpoint_sm); @@ -2580,22 +2705,34 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } + if (msp->ms_new) { + msp->ms_new = B_FALSE; + mutex_enter(&mg->mg_lock); + mg->mg_ms_ready++; + mutex_exit(&mg->mg_lock); + } /* * Calculate the new weights before unloading any metaslabs. * This will give us the most accurate weighting. */ - metaslab_group_sort(mg, msp, metaslab_weight(msp)); + metaslab_group_sort(mg, msp, metaslab_weight(msp) | + (msp->ms_weight & METASLAB_ACTIVE_MASK)); /* * If the metaslab is loaded and we've not tried to load or allocate * from it in 'metaslab_unload_delay' txgs, then unload it. 
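
/*
 * Worked example of the rewritten condense test in metaslab_should_condense()
 * above, assuming zfs_condense_pct = 200 (its usual default; assumed, not
 * shown in this diff) and writing T for zfs_metaslab_condense_block_threshold:
 * with a 4K record_size and a space map whose on-disk length is 1M, the
 * metaslab is condensed only if
 *
 *	1M >= optimal_size * 200 / 100	(rewriting would at least halve it)
 *	1M > T * 4K			(the map is big enough to bother)
 */
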
*/ if (msp->ms_loaded && + msp->ms_initializing == 0 && msp->ms_selected_txg + metaslab_unload_delay < txg) { for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); } + if (msp->ms_allocator != -1) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } if (!metaslab_debug_unload) metaslab_unload(msp); @@ -2689,7 +2826,8 @@ metaslab_alloc_trace_fini(void) */ static void metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, - metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset) + metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, + int allocator) { if (!metaslab_trace_enabled) return; @@ -2722,6 +2860,7 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, mat->mat_dva_id = dva_id; mat->mat_offset = offset; mat->mat_weight = 0; + mat->mat_allocator = allocator; if (msp != NULL) mat->mat_weight = msp->ms_weight; @@ -2762,35 +2901,56 @@ metaslab_trace_fini(zio_alloc_list_t *zal) */ static void -metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) +metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, + int allocator) { if (!(flags & METASLAB_ASYNC_ALLOC) || - flags & METASLAB_DONT_THROTTLE) + (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; - (void) refcount_add(&mg->mg_alloc_queue_depth, tag); + (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); +} + +static void +metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) +{ + uint64_t max = mg->mg_max_alloc_queue_depth; + uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; + while (cur < max) { + if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], + cur, cur + 1) == cur) { + atomic_inc_64( + &mg->mg_class->mc_alloc_max_slots[allocator]); + return; + } + cur = mg->mg_cur_max_alloc_queue_depth[allocator]; + } } void -metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) +metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, + int allocator, boolean_t io_complete) { if (!(flags & METASLAB_ASYNC_ALLOC) || - flags & METASLAB_DONT_THROTTLE) + (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; - (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); + (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); + if (io_complete) + metaslab_group_increment_qdepth(mg, allocator); } void -metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) +metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, + int allocator) { #ifdef ZFS_DEBUG const dva_t *dva = bp->blk_dva; @@ -2799,7 +2959,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) for (int d = 0; d < ndvas; d++) { uint64_t vdev = DVA_GET_VDEV(&dva[d]); metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; - VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); + VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator], + tag)); } #endif } @@ -2812,6 +2973,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) metaslab_class_t *mc = msp->ms_group->mg_class; VERIFY(!msp->ms_condensing); + VERIFY0(msp->ms_initializing); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { @@ -2841,91 +3003,147 @@ 
metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
 	return (start);
 }
 
+/*
+ * Find the metaslab with the highest weight that is less than what we've
+ * already tried. In the common case, this means that we will examine each
+ * metaslab at most once. Note that concurrent callers could reorder metaslabs
+ * by activation/passivation once we have dropped the mg_lock. If a metaslab is
+ * activated by another thread, and we fail to allocate from the metaslab we
+ * have selected, we may not try the newly-activated metaslab, and instead
+ * activate another metaslab. This is not optimal, but generally does not cause
+ * any problems (a possible exception being if every metaslab is completely
+ * full except for the newly-activated metaslab which we fail to examine).
+ */
+static metaslab_t *
+find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
+    dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
+    zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
+{
+	avl_index_t idx;
+	avl_tree_t *t = &mg->mg_metaslab_tree;
+	metaslab_t *msp = avl_find(t, search, &idx);
+	if (msp == NULL)
+		msp = avl_nearest(t, idx, AVL_AFTER);
+
+	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
+		int i;
+		if (!metaslab_should_allocate(msp, asize)) {
+			metaslab_trace_add(zal, mg, msp, asize, d,
+			    TRACE_TOO_SMALL, allocator);
+			continue;
+		}
+
+		/*
+		 * If the selected metaslab is condensing or being
+		 * initialized, skip it.
+		 */
+		if (msp->ms_condensing || msp->ms_initializing > 0)
+			continue;
+
+		*was_active = msp->ms_allocator != -1;
+		/*
+		 * If we're activating as primary, this is our first allocation
+		 * from this disk, so we don't need to check how close we are.
+		 * If the metaslab under consideration was already active,
+		 * we're getting desperate enough to steal another allocator's
+		 * metaslab, so we still don't care about distances.
+		 */
+		if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
+			break;
+
+		uint64_t target_distance = min_distance +
+		    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
+		    min_distance >> 1);
+
+		for (i = 0; i < d; i++) {
+			if (metaslab_distance(msp, &dva[i]) < target_distance)
+				break;
+		}
+		if (i == d)
+			break;
+	}
+
+	if (msp != NULL) {
+		search->ms_weight = msp->ms_weight;
+		search->ms_start = msp->ms_start + 1;
+		search->ms_allocator = msp->ms_allocator;
+		search->ms_primary = msp->ms_primary;
+	}
+	return (msp);
+}
+
+/* ARGSUSED */
 static uint64_t
 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
-    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
+    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
+    int allocator)
 {
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 	uint64_t activation_weight;
-	uint64_t target_distance;
-	int i;
+	boolean_t tertiary = B_FALSE;
 
 	activation_weight = METASLAB_WEIGHT_PRIMARY;
-	for (i = 0; i < d; i++) {
-		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+	for (int i = 0; i < d; i++) {
+		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_SECONDARY;
+		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+			tertiary = B_TRUE;
 			break;
 		}
 	}
+
+	/*
+	 * If we don't have enough metaslabs active to fill the entire array, we
+	 * just use the 0th slot.
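
/*
 * The atomic_cas_64() loop in metaslab_group_increment_qdepth() above is a
 * standard bounded atomic increment. The same shape in isolation, as a
 * userland C11 sketch with invented names:
 */
#include <stdatomic.h>
#include <stdint.h>

static int
bounded_inc_sketch(_Atomic uint64_t *cur, uint64_t max)
{
	uint64_t c = atomic_load(cur);

	while (c < max) {
		/* On failure, c is refreshed with the observed value. */
		if (atomic_compare_exchange_weak(cur, &c, c + 1))
			return (1);	/* raised the current ceiling by one */
	}
	return (0);	/* ceiling already at max; leave it alone */
}
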
+ */ + if (mg->mg_ms_ready < mg->mg_allocators * 2) { + tertiary = B_FALSE; + allocator = 0; + } + + ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); + metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); search->ms_weight = UINT64_MAX; search->ms_start = 0; + /* + * At the end of the metaslab tree are the already-active metaslabs, + * first the primaries, then the secondaries. When we resume searching + * through the tree, we need to consider ms_allocator and ms_primary so + * we start in the location right after where we left off, and don't + * accidentally loop forever considering the same metaslabs. + */ + search->ms_allocator = -1; + search->ms_primary = B_TRUE; for (;;) { - boolean_t was_active; - avl_tree_t *t = &mg->mg_metaslab_tree; - avl_index_t idx; + boolean_t was_active = B_FALSE; mutex_enter(&mg->mg_lock); - /* - * Find the metaslab with the highest weight that is less - * than what we've already tried. In the common case, this - * means that we will examine each metaslab at most once. - * Note that concurrent callers could reorder metaslabs - * by activation/passivation once we have dropped the mg_lock. - * If a metaslab is activated by another thread, and we fail - * to allocate from the metaslab we have selected, we may - * not try the newly-activated metaslab, and instead activate - * another metaslab. This is not optimal, but generally - * does not cause any problems (a possible exception being - * if every metaslab is completely full except for the - * the newly-activated metaslab which we fail to examine). - */ - msp = avl_find(t, search, &idx); - if (msp == NULL) - msp = avl_nearest(t, idx, AVL_AFTER); - for (; msp != NULL; msp = AVL_NEXT(t, msp)) { - - if (!metaslab_should_allocate(msp, asize)) { - metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_TOO_SMALL); - continue; - } - - /* - * If the selected metaslab is condensing, skip it. - */ - if (msp->ms_condensing) - continue; - - was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; - if (activation_weight == METASLAB_WEIGHT_PRIMARY) - break; - - target_distance = min_distance + - (space_map_allocated(msp->ms_sm) != 0 ? 0 : - min_distance >> 1); - - for (i = 0; i < d; i++) { - if (metaslab_distance(msp, &dva[i]) < - target_distance) - break; - } - if (i == d) - break; + if (activation_weight == METASLAB_WEIGHT_PRIMARY && + mg->mg_primaries[allocator] != NULL) { + msp = mg->mg_primaries[allocator]; + was_active = B_TRUE; + } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && + mg->mg_secondaries[allocator] != NULL && !tertiary) { + msp = mg->mg_secondaries[allocator]; + was_active = B_TRUE; + } else { + msp = find_valid_metaslab(mg, activation_weight, dva, d, + min_distance, asize, allocator, zal, search, + &was_active); } + mutex_exit(&mg->mg_lock); if (msp == NULL) { kmem_free(search, sizeof (*search)); return (-1ULL); } - search->ms_weight = msp->ms_weight; - search->ms_start = msp->ms_start + 1; mutex_enter(&msp->ms_lock); - /* * Ensure that the metaslab we have selected is still * capable of handling our request. 
It's possible that @@ -2939,18 +3157,32 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, continue; } - if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && - activation_weight == METASLAB_WEIGHT_PRIMARY) { - metaslab_passivate(msp, - msp->ms_weight & ~METASLAB_ACTIVE_MASK); + /* + * If the metaslab is freshly activated for an allocator that + * isn't the one we're allocating from, or if it's a primary and + * we're seeking a secondary (or vice versa), we go back and + * select a new metaslab. + */ + if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && + (msp->ms_allocator != -1) && + (msp->ms_allocator != allocator || ((activation_weight == + METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { mutex_exit(&msp->ms_lock); continue; } - if (metaslab_activate(msp, activation_weight) != 0) { + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } + + if (metaslab_activate(msp, allocator, activation_weight) != 0) { + mutex_exit(&msp->ms_lock); + continue; + } + msp->ms_selected_txg = txg; /* @@ -2963,24 +3195,35 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, if (!metaslab_should_allocate(msp, asize)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_TOO_SMALL); + TRACE_TOO_SMALL, allocator); goto next; } /* * If this metaslab is currently condensing then pick again as * we can't manipulate this metaslab until it's committed - * to disk. + * to disk. If this metaslab is being initialized, we shouldn't + * allocate from it since the allocated region might be + * overwritten after allocation. */ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_CONDENSING); + TRACE_CONDENSING, allocator); + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + mutex_exit(&msp->ms_lock); + continue; + } else if (msp->ms_initializing > 0) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_INITIALIZING, allocator); + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); continue; } offset = metaslab_block_alloc(msp, asize, txg); - metaslab_trace_add(zal, mg, msp, asize, d, offset); + metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ @@ -3036,19 +3279,20 @@ next: static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) + uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, + int allocator) { uint64_t offset; ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, - min_distance, dva, d); + min_distance, dva, d, allocator); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { mg->mg_failed_allocations++; metaslab_trace_add(zal, mg, NULL, asize, d, - TRACE_GROUP_FAILURE); + TRACE_GROUP_FAILURE, allocator); if (asize == SPA_GANGBLOCKSIZE) { /* * This metaslab group was unable to allocate @@ -3083,7 +3327,7 @@ int ditto_same_vdev_distance_shift = 3; int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, - zio_alloc_list_t *zal) + zio_alloc_list_t *zal, int allocator) { metaslab_group_t *mg, *rotor; vdev_t *vd; @@ -3095,7 +3339,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * For testing, make 
some blocks above a certain size be gang blocks. */ if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { - metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG); + metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, + allocator); return (SET_ERROR(ENOSPC)); } @@ -3181,12 +3426,12 @@ top: */ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { allocatable = metaslab_group_allocatable(mg, rotor, - psize); + psize, allocator); } if (!allocatable) { metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_NOT_ALLOCATABLE); + TRACE_NOT_ALLOCATABLE, allocator); goto next; } @@ -3201,7 +3446,7 @@ top: vd->vdev_state < VDEV_STATE_HEALTHY) && d == 0 && !try_hard && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_VDEV_ERROR); + TRACE_VDEV_ERROR, allocator); goto next; } @@ -3225,7 +3470,7 @@ top: ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - distance, dva, d); + distance, dva, d, allocator); if (offset != -1ULL) { /* @@ -3288,7 +3533,7 @@ next: bzero(&dva[d], sizeof (dva_t)); - metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC); + metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); return (SET_ERROR(ENOSPC)); } @@ -3355,7 +3600,7 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, return; if (spa->spa_vdev_removal != NULL && - spa->spa_vdev_removal->svr_vdev == vd && + spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && vdev_is_concrete(vd)) { /* * Note: we check if the vdev is concrete because when @@ -3589,18 +3834,20 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) * the reservation. */ boolean_t -metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, - int flags) +metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, + zio_t *zio, int flags) { uint64_t available_slots = 0; boolean_t slot_reserved = B_FALSE; + uint64_t max = mc->mc_alloc_max_slots[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); - uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots); - if (reserved_slots < mc->mc_alloc_max_slots) - available_slots = mc->mc_alloc_max_slots - reserved_slots; + uint64_t reserved_slots = + refcount_count(&mc->mc_alloc_slots[allocator]); + if (reserved_slots < max) + available_slots = max - reserved_slots; if (slots <= available_slots || GANG_ALLOCATION(flags)) { /* @@ -3608,7 +3855,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, * them individually when an I/O completes. 
*/ for (int d = 0; d < slots; d++) { - reserved_slots = refcount_add(&mc->mc_alloc_slots, zio); + reserved_slots = + refcount_add(&mc->mc_alloc_slots[allocator], + zio); } zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; slot_reserved = B_TRUE; @@ -3619,12 +3868,14 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, } void -metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) +metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, + int allocator, zio_t *zio) { ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); for (int d = 0; d < slots; d++) { - (void) refcount_remove(&mc->mc_alloc_slots, zio); + (void) refcount_remove(&mc->mc_alloc_slots[allocator], + zio); } mutex_exit(&mc->mc_lock); } @@ -3646,7 +3897,13 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); + /* + * No need to fail in that case; someone else has activated the + * metaslab, but that doesn't preclude us from using it. + */ + if (error == EBUSY) + error = 0; if (error == 0 && !range_tree_contains(msp->ms_allocatable, offset, size)) @@ -3751,7 +4008,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, - zio_alloc_list_t *zal, zio_t *zio) + zio_alloc_list_t *zal, zio_t *zio, int allocator) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; @@ -3774,12 +4031,13 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, for (int d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, - txg, flags, zal); + txg, flags, zal, allocator); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, - DVA_GET_VDEV(&dva[d]), zio, flags); + DVA_GET_VDEV(&dva[d]), zio, flags, + allocator, B_FALSE); bzero(&dva[d], sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); @@ -3790,7 +4048,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, * based on the newly allocated dva. */ metaslab_group_alloc_increment(spa, - DVA_GET_VDEV(&dva[d]), zio, flags); + DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c index 4ebadace742d..6359b72503ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c @@ -491,7 +491,6 @@ range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, static range_seg_t * range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) { - avl_index_t where; range_seg_t rsearch; uint64_t end = start + size; @@ -499,7 +498,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) rsearch.rs_start = start; rsearch.rs_end = end; - return (avl_find(&rt->rt_root, &rsearch, &where)); + return (avl_find(&rt->rt_root, &rsearch, NULL)); } range_seg_t * @@ -651,3 +650,23 @@ range_tree_is_empty(range_tree_t *rt) ASSERT(rt != NULL); return (range_tree_space(rt) == 0); } + +uint64_t +range_tree_min(range_tree_t *rt) +{ + range_seg_t *rs = avl_first(&rt->rt_root); + return (rs != NULL ? 
rs->rs_start : 0); +} + +uint64_t +range_tree_max(range_tree_t *rt) +{ + range_seg_t *rs = avl_last(&rt->rt_root); + return (rs != NULL ? rs->rs_end : 0); +} + +uint64_t +range_tree_span(range_tree_t *rt) +{ + return (range_tree_max(rt) - range_tree_min(rt)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c index dd6e90c7796b..50f3c0ad822f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c @@ -242,31 +242,23 @@ sa_cache_fini(void) static int layout_num_compare(const void *arg1, const void *arg2) { - const sa_lot_t *node1 = arg1; - const sa_lot_t *node2 = arg2; + const sa_lot_t *node1 = (const sa_lot_t *)arg1; + const sa_lot_t *node2 = (const sa_lot_t *)arg2; - if (node1->lot_num > node2->lot_num) - return (1); - else if (node1->lot_num < node2->lot_num) - return (-1); - return (0); + return (AVL_CMP(node1->lot_num, node2->lot_num)); } static int layout_hash_compare(const void *arg1, const void *arg2) { - const sa_lot_t *node1 = arg1; - const sa_lot_t *node2 = arg2; + const sa_lot_t *node1 = (const sa_lot_t *)arg1; + const sa_lot_t *node2 = (const sa_lot_t *)arg2; - if (node1->lot_hash > node2->lot_hash) - return (1); - if (node1->lot_hash < node2->lot_hash) - return (-1); - if (node1->lot_instance > node2->lot_instance) - return (1); - if (node1->lot_instance < node2->lot_instance) - return (-1); - return (0); + int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash); + if (likely(cmp)) + return (cmp); + + return (AVL_CMP(node1->lot_instance, node2->lot_instance)); } boolean_t diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 489956f1857b..c8a635ae54f3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -55,6 +55,7 @@ #include <sys/vdev_removal.h> #include <sys/vdev_indirect_mapping.h> #include <sys/vdev_indirect_births.h> +#include <sys/vdev_initialize.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> #include <sys/uberblock_impl.h> @@ -443,8 +444,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); - if (err = dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &ds)) { + err = dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &ds); + if (err != 0) { dsl_pool_config_exit(dp, FTAG); break; } @@ -599,7 +601,8 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) break; } - if (error = dmu_objset_hold(strval, FTAG, &os)) + error = dmu_objset_hold(strval, FTAG, &os); + if (error != 0) break; /* @@ -902,19 +905,14 @@ spa_change_guid(spa_t *spa) static int spa_error_entry_compare(const void *a, const void *b) { - spa_error_entry_t *sa = (spa_error_entry_t *)a; - spa_error_entry_t *sb = (spa_error_entry_t *)b; + const spa_error_entry_t *sa = (const spa_error_entry_t *)a; + const spa_error_entry_t *sb = (const spa_error_entry_t *)b; int ret; - ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, + ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); - if (ret < 0) - return (-1); - else if (ret > 0) - return (1); - else - return (0); + return (AVL_ISIGN(ret)); } /* @@ -1215,8 +1213,10 @@ spa_activate(spa_t *spa, int mode) */ trim_thread_create(spa); - for (size_t i = 0; i < TXG_SIZE; i++) - spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0); + for (size_t i = 0; i < TXG_SIZE; i++) { + spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 
+ ZIO_FLAG_CANFAIL); + } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); @@ -1388,6 +1388,11 @@ spa_unload(spa_t *spa) */ spa_async_suspend(spa); + if (spa->spa_root_vdev) { + vdev_initialize_stop_all(spa->spa_root_vdev, + VDEV_INITIALIZE_ACTIVE); + } + /* * Stop syncing. */ @@ -1403,10 +1408,10 @@ spa_unload(spa_t *spa) * calling taskq_wait(mg_taskq). */ if (spa->spa_root_vdev != NULL) { - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); - spa_config_exit(spa, SCL_ALL, FTAG); + spa_config_exit(spa, SCL_ALL, spa); } /* @@ -1440,7 +1445,7 @@ spa_unload(spa_t *spa) bpobj_close(&spa->spa_deferred_bpobj); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); /* * Close all vdevs. @@ -1502,7 +1507,7 @@ spa_unload(spa_t *spa) spa->spa_comment = NULL; } - spa_config_exit(spa, SCL_ALL, FTAG); + spa_config_exit(spa, SCL_ALL, spa); } /* @@ -3954,6 +3959,10 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) spa_restart_removal(spa); spa_spawn_aux_threads(spa); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_initialize_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_load_note(spa, "LOADED"); @@ -4362,18 +4371,14 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) } static void -spa_add_feature_stats(spa_t *spa, nvlist_t *config) +spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) { - nvlist_t *features; zap_cursor_t zc; zap_attribute_t za; - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); - VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); - /* We may be unable to read features if pool is suspended. */ if (spa_suspended(spa)) - goto out; + return; if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, @@ -4382,7 +4387,7 @@ spa_add_feature_stats(spa_t *spa, nvlist_t *config) zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); - VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); @@ -4395,16 +4400,62 @@ spa_add_feature_stats(spa_t *spa, nvlist_t *config) zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); - VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } +} -out: - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, - features) == 0); - nvlist_free(features); +static void +spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) +{ + int i; + + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t feature = spa_feature_table[i]; + uint64_t refcount; + + if (feature_get_refcount(spa, &feature, &refcount) != 0) + continue; + + VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); + } +} + +/* + * Store a list of pool features and their reference counts in the + * config. + * + * The first time this is called on a spa, allocate a new nvlist, fetch + * the pool features and reference counts from disk, then save the list + * in the spa. In subsequent calls on the same spa use the saved nvlist + * and refresh its values from the cached reference counts. 
This + * ensures we don't block here on I/O on a suspended pool so 'zpool + * clear' can resume the pool. + */ +static void +spa_add_feature_stats(spa_t *spa, nvlist_t *config) +{ + nvlist_t *features; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + mutex_enter(&spa->spa_feat_stats_lock); + features = spa->spa_feat_stats; + + if (features != NULL) { + spa_feature_stats_from_cache(spa, features); + } else { + VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); + spa->spa_feat_stats = features; + spa_feature_stats_from_disk(spa, features); + } + + VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, + features)); + + mutex_exit(&spa->spa_feat_stats_lock); } int @@ -5675,6 +5726,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, * in which case we can modify its state. */ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { + /* * Objsets may be open only because they're dirty, so we * have to force it to sync before checking spa_refcnt. @@ -5709,6 +5761,18 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, } /* + * We're about to export or destroy this pool. Make sure + * we stop all initialization activity here before we + * set the spa_final_txg. This will ensure that all + * dirty data resulting from the initialization is + * committed to disk before we unload the pool. + */ + if (spa->spa_root_vdev != NULL) { + vdev_initialize_stop_all(spa->spa_root_vdev, + VDEV_INITIALIZE_ACTIVE); + } + + /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. @@ -5837,8 +5901,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && - tvd->vdev_ashift != - spa->spa_vdev_removal->svr_vdev->vdev_ashift) { + tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* Fail if top level vdev is raidz */ @@ -5954,10 +6017,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) return (spa_vdev_exit(spa, NULL, txg, error)); } - if (spa->spa_vdev_removal != NULL || - spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - } if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -6401,6 +6462,86 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) return (error); } +int +spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type) +{ + /* + * We hold the namespace lock through the whole function + * to prevent any changes to the pool while we're starting or + * stopping initialization. The config and state locks are held so that + * we can properly assess the vdev state before we commit to + * the initializing operation. + */ + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + + /* Look up vdev and ensure it's a leaf.
*/ + vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_detached) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ENODEV)); + } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EINVAL)); + } else if (!vdev_writeable(vd)) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EROFS)); + } + mutex_enter(&vd->vdev_initialize_lock); + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + + /* + * When we activate an initialize action we check to see + * if the vdev_initialize_thread is NULL. We do this instead + * of using the vdev_initialize_state since there might be + * a previous initialization process which has completed but + * whose thread has not yet exited. + */ + if (cmd_type == POOL_INITIALIZE_DO && + (vd->vdev_initialize_thread != NULL || + vd->vdev_top->vdev_removing)) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EBUSY)); + } else if (cmd_type == POOL_INITIALIZE_CANCEL && + (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && + vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ESRCH)); + } else if (cmd_type == POOL_INITIALIZE_SUSPEND && + vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ESRCH)); + } + + switch (cmd_type) { + case POOL_INITIALIZE_DO: + vdev_initialize(vd); + break; + case POOL_INITIALIZE_CANCEL: + vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); + break; + case POOL_INITIALIZE_SUSPEND: + vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED); + break; + default: + panic("invalid cmd_type %llu", (unsigned long long)cmd_type); + } + mutex_exit(&vd->vdev_initialize_lock); + + /* Sync out the initializing state */ + txg_wait_synced(spa->spa_dsl_pool, 0); + mutex_exit(&spa_namespace_lock); + + return (0); +} + + /* * Split a set of devices from their mirrors, and create a new pool from them. */ @@ -6608,6 +6749,19 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); + for (c = 0; c < children; c++) { + if (vml[c] != NULL) { + /* + * Temporarily stop the initializing activity. We set + * the state to ACTIVE so that we know to resume + * the initializing once the split has completed.
+ */ + mutex_enter(&vml[c]->vdev_initialize_lock); + vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE); + mutex_exit(&vml[c]->vdev_initialize_lock); + } + } + #ifndef illumos /* mark that we are creating new spa by splitting */ newspa->spa_splitting_newspa = B_TRUE; @@ -6702,6 +6856,10 @@ out: if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } + + /* restart initializing disks as necessary */ + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); @@ -7066,6 +7224,14 @@ spa_async_thread(void *arg) if (tasks & SPA_ASYNC_RESILVER) dsl_resilver_restart(spa->spa_dsl_pool, 0); + if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_initialize_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&spa_namespace_lock); + } + /* * Let the world know that we're done. */ @@ -7765,8 +7931,9 @@ spa_sync(spa_t *spa, uint64_t txg) * Wait for i/os issued in open context that need to complete * before this txg syncs. */ - VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK])); - spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0); + (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); + spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); /* * Lock out configuration changes. @@ -7776,9 +7943,11 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; - mutex_enter(&spa->spa_alloc_lock); - VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); - mutex_exit(&spa->spa_alloc_lock); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } /* * If there are any pending vdev state changes, convert them @@ -7844,7 +8013,7 @@ spa_sync(spa_t *spa, uint64_t txg) * The max queue depth will not change in the middle of syncing * out this txg. */ - uint64_t queue_depth_total = 0; + uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; @@ -7858,18 +8027,23 @@ spa_sync(spa_t *spa, uint64_t txg) * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). 
*/ - ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); + for (int i = 0; i < spa->spa_alloc_count; i++) + ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i]))); mg->mg_max_alloc_queue_depth = max_queue_depth; - queue_depth_total += mg->mg_max_alloc_queue_depth; + + for (int i = 0; i < spa->spa_alloc_count; i++) { + mg->mg_cur_max_alloc_queue_depth[i] = + zfs_vdev_def_queue_depth; + } + slots_per_allocator += zfs_vdev_def_queue_depth; } metaslab_class_t *mc = spa_normal_class(spa); - ASSERT0(refcount_count(&mc->mc_alloc_slots)); - mc->mc_alloc_max_slots = queue_depth_total; + for (int i = 0; i < spa->spa_alloc_count; i++) { + ASSERT0(refcount_count(&mc->mc_alloc_slots[i])); + mc->mc_alloc_max_slots[i] = slots_per_allocator; + } mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - ASSERT3U(mc->mc_alloc_max_slots, <=, - max_queue_depth * rvd->vdev_children); - for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_indirect_state_sync_verify(vd); @@ -8052,14 +8226,17 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_sync_done(dp, txg); - mutex_enter(&spa->spa_alloc_lock); - VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); - mutex_exit(&spa->spa_alloc_lock); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } /* * Update usable space statistics. */ - while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) + while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) + != NULL) vdev_sync_done(vd, txg); spa_update_dspace(spa); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c index a4af48d8c58b..db0d2caa6107 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c @@ -203,13 +203,12 @@ typedef struct spa_checkpoint_discard_sync_callback_arg { } spa_checkpoint_discard_sync_callback_arg_t; static int -spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, - uint64_t size, void *arg) +spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) { spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; vdev_t *vd = sdc->sdc_vd; - metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - uint64_t end = offset + size; + metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; + uint64_t end = sme->sme_offset + sme->sme_run; if (sdc->sdc_entry_limit == 0) return (EINTR); @@ -224,8 +223,8 @@ spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, * metaslab boundaries. So if needed we could add code * that handles metaslab-crossing segments in the future. 
*/ - VERIFY3U(type, ==, SM_FREE); - VERIFY3U(offset, >=, ms->ms_start); + VERIFY3U(sme->sme_type, ==, SM_FREE); + VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* @@ -237,14 +236,15 @@ spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, mutex_enter(&ms->ms_lock); if (range_tree_is_empty(ms->ms_freeing)) vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); - range_tree_add(ms->ms_freeing, offset, size); + range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); - ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size); - ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size); + ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, + sme->sme_run); + ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run); - vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size; - vd->vdev_stat.vs_checkpoint_space -= size; + vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run; + vd->vdev_stat.vs_checkpoint_space -= sme->sme_run; sdc->sdc_entry_limit--; return (0); @@ -289,12 +289,13 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) * Thus, we set the maximum entries that the space map callback * will be applied to be half the entries that could fit in the * imposed memory limit. + * + * Note that since this is a conservative estimate we also + * assume the worst case scenario in our computation where each + * entry is two-word. */ uint64_t max_entry_limit = - (zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1; - - uint64_t entries_in_sm = - space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); + (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1; /* * Iterate from the end of the space map towards the beginning, @@ -318,14 +319,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) spa_checkpoint_discard_sync_callback_arg_t sdc; sdc.sdc_vd = vd; sdc.sdc_txg = tx->tx_txg; - sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit); + sdc.sdc_entry_limit = max_entry_limit; - uint64_t entries_before = entries_in_sm; + uint64_t words_before = + space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, spa_checkpoint_discard_sync_callback, &sdc, tx); - uint64_t entries_after = + uint64_t words_after = space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); #ifdef DEBUG @@ -333,9 +335,9 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) #endif zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, " - "deleted %llu entries - %llu entries are left", - tx->tx_txg, vd->vdev_id, (entries_before - entries_after), - entries_after); + "deleted %llu words - %llu words are left", + tx->tx_txg, vd->vdev_id, (words_before - words_after), + words_after); if (error != EINTR) { if (error != 0) { @@ -344,15 +346,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) "space map of vdev %llu\n", error, vd->vdev_id); } - ASSERT0(entries_after); + ASSERT0(words_after); ASSERT0(vd->vdev_checkpoint_sm->sm_alloc); - ASSERT0(vd->vdev_checkpoint_sm->sm_length); + ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); space_map_free(vd->vdev_checkpoint_sm, tx); space_map_close(vd->vdev_checkpoint_sm); vd->vdev_checkpoint_sm = NULL; - VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset, + VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c index ff1fcb4f0b21..6865dcff2212 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. */ @@ -559,6 +559,18 @@ spa_config_update(spa_t *spa, int what) */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; + + /* + * Explicitly skip vdevs that are indirect or + * log vdevs that are being removed. The reason + * is that both of those can have vdev_ms_array + * set to 0 and we wouldn't want to change their + * metaslab size nor call vdev_expand() on them. + */ + if (!vdev_is_concrete(tvd) || + (tvd->vdev_islog && tvd->vdev_removing)) + continue; + if (tvd->vdev_ms_array == 0) { vdev_ashift_optimize(tvd); vdev_metaslab_set_size(tvd); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index aaa3c310f1e8..2ceed8dd8040 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -41,6 +41,7 @@ #include <sys/zil.h> #include <sys/vdev_impl.h> #include <sys/vdev_file.h> +#include <sys/vdev_initialize.h> #include <sys/metaslab.h> #include <sys/uberblock_impl.h> #include <sys/txg.h> @@ -252,7 +253,7 @@ int spa_mode_global; * Everything except dprintf, spa, and indirect_remap is on by default * in debug builds. */ -int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA | ZFS_DEBUG_INDIRECT_REMAP); +int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_INDIRECT_REMAP); #else int zfs_flags = 0; #endif @@ -434,6 +435,8 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN, &spa_min_slop, 0, "Minimal value of reserved space"); +int spa_allocators = 4; + /*PRINTFLIKE2*/ void spa_load_failed(spa_t *spa, const char *fmt, ...) 
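The spa_allocators tunable introduced above sets how many independent allocator slots each pool gets; the spa_add() hunk below sizes the per-pool spa_alloc_locks and spa_alloc_trees arrays from it. A minimal sketch of the idea, assuming a hash-based slot choice — the pick_allocator() helper and its mixing constant are illustrative assumptions, not code from this commit:

	#include <stdint.h>

	/*
	 * Sketch only: map a key (e.g. one derived from a zio's bookmark)
	 * onto one of alloc_count allocator slots. The 64-bit mix spreads
	 * nearby keys across slots so each slot's mutex and AVL tree see
	 * roughly 1/N of the allocation traffic. The commit's actual
	 * selection logic lives in the zio pipeline, not in these hunks.
	 */
	static inline int
	pick_allocator(uint64_t key, int alloc_count)
	{
		key ^= key >> 33;
		key *= 0xff51afd7ed558ccdULL;	/* 64-bit mix constant */
		key ^= key >> 33;
		return ((int)(key % (uint64_t)alloc_count));
	}

With spa_allocators = 4 (the default above), allocation bookkeeping is spread over four mostly independent lock domains instead of the single spa_alloc_lock this change removes.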
@@ -705,7 +708,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); @@ -779,8 +782,16 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_active_count++; } - avl_create(&spa->spa_alloc_tree, zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + spa->spa_alloc_count = spa_allocators; + spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * + sizeof (kmutex_t), KM_SLEEP); + spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * + sizeof (avl_tree_t), KM_SLEEP); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); + avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, + sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + } /* * Every pool starts with the default cachefile @@ -812,8 +823,6 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) KM_SLEEP) == 0); } - spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0); - spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; @@ -860,11 +869,20 @@ spa_remove(spa_t *spa) kmem_free(dp, sizeof (spa_config_dirent_t)); } - avl_destroy(&spa->spa_alloc_tree); + for (int i = 0; i < spa->spa_alloc_count; i++) { + avl_destroy(&spa->spa_alloc_trees[i]); + mutex_destroy(&spa->spa_alloc_locks[i]); + } + kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * + sizeof (kmutex_t)); + kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * + sizeof (avl_tree_t)); + list_destroy(&spa->spa_config_list); nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); + nvlist_free(spa->spa_feat_stats); spa_config_set(spa, NULL); #ifdef illumos @@ -895,7 +913,6 @@ spa_remove(spa_t *spa) cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); - mutex_destroy(&spa->spa_alloc_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); @@ -907,6 +924,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); + mutex_destroy(&spa->spa_feat_stats_lock); kmem_free(spa, sizeof (spa_t)); } @@ -1001,18 +1019,13 @@ typedef struct spa_aux { int aux_count; } spa_aux_t; -static int +static inline int spa_aux_compare(const void *a, const void *b) { - const spa_aux_t *sa = a; - const spa_aux_t *sb = b; + const spa_aux_t *sa = (const spa_aux_t *)a; + const spa_aux_t *sb = (const spa_aux_t *)b; - if (sa->aux_guid < sb->aux_guid) - return (-1); - else if (sa->aux_guid > sb->aux_guid) - return (1); - else - return (0); + return (AVL_CMP(sa->aux_guid, sb->aux_guid)); } void @@ -1299,6 +1312,12 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); + if (vd->vdev_ops->vdev_op_leaf) { + mutex_enter(&vd->vdev_initialize_lock); + vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); + mutex_exit(&vd->vdev_initialize_lock); + } + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_ALL, spa); @@ -1862,9 +1881,12 @@ 
spa_update_dspace(spa_t *spa) * allocated twice (on the old device and the new * device). */ - vdev_t *vd = spa->spa_vdev_removal->svr_vdev; + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vdev_t *vd = + vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); spa->spa_dspace -= spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + spa_config_exit(spa, SCL_VDEV, FTAG); } } @@ -2009,6 +2031,12 @@ bp_get_dsize(spa_t *spa, const blkptr_t *bp) return (dsize); } +uint64_t +spa_dirty_data(spa_t *spa) +{ + return (spa->spa_dsl_pool->dp_dirty_total); +} + /* * ========================================================================== * Initialization and Termination @@ -2023,11 +2051,8 @@ spa_name_compare(const void *a1, const void *a2) int s; s = strcmp(s1->spa_name, s2->spa_name); - if (s > 0) - return (1); - if (s < 0) - return (-1); - return (0); + + return (AVL_ISIGN(s)); } int @@ -2261,12 +2286,6 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) return (0); } -boolean_t -spa_debug_enabled(spa_t *spa) -{ - return (spa->spa_debug); -} - int spa_maxblocksize(spa_t *spa) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c index 2f15c5185c57..7356e3ceea75 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c @@ -43,68 +43,205 @@ SYSCTL_DECL(_vfs_zfs); * Note on space map block size: * * The data for a given space map can be kept on blocks of any size. - * Larger blocks entail fewer i/o operations, but they also cause the - * DMU to keep more data in-core, and also to waste more i/o bandwidth + * Larger blocks entail fewer I/O operations, but they also cause the + * DMU to keep more data in-core, and also to waste more I/O bandwidth * when only a few blocks have changed since the last transaction group. */ /* + * Enabled whenever we want to stress test the use of double-word + * space map entries. + */ +boolean_t zfs_force_some_double_word_sm_entries = B_FALSE; + +/* + * Override the default indirect block size of 128K, instead using 16K for + * spacemaps (2^14 bytes). This dramatically reduces write inflation since + * appending to a spacemap typically has to write one data block (4KB) and one + * or two indirect blocks (16K-32K, rather than 128K). + */ +int space_map_ibs = 14; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, + &space_map_ibs, 0, "Space map indirect block shift"); + +boolean_t +sm_entry_is_debug(uint64_t e) +{ + return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX); +} + +boolean_t +sm_entry_is_single_word(uint64_t e) +{ + uint8_t prefix = SM_PREFIX_DECODE(e); + return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX); +} + +boolean_t +sm_entry_is_double_word(uint64_t e) +{ + return (SM_PREFIX_DECODE(e) == SM2_PREFIX); +} + +/* * Iterate through the space map, invoking the callback on each (non-debug) * space map entry. 
*/ int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg) { - uint64_t *entry, *entry_map, *entry_map_end; - uint64_t bufsize, size, offset, end; + uint64_t sm_len = space_map_length(sm); + ASSERT3U(sm->sm_blksz, !=, 0); + + dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len, + ZIO_PRIORITY_SYNC_READ); + + uint64_t blksz = sm->sm_blksz; int error = 0; + for (uint64_t block_base = 0; block_base < sm_len && error == 0; + block_base += blksz) { + dmu_buf_t *db; + error = dmu_buf_hold(sm->sm_os, space_map_object(sm), + block_base, FTAG, &db, DMU_READ_PREFETCH); + if (error != 0) + return (error); - end = space_map_length(sm); + uint64_t *block_start = db->db_data; + uint64_t block_length = MIN(sm_len - block_base, blksz); + uint64_t *block_end = block_start + + (block_length / sizeof (uint64_t)); - bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); - entry_map = zio_buf_alloc(bufsize); + VERIFY0(P2PHASE(block_length, sizeof (uint64_t))); + VERIFY3U(block_length, !=, 0); + ASSERT3U(blksz, ==, db->db_size); - if (end > bufsize) { - dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize, - end - bufsize, ZIO_PRIORITY_SYNC_READ); - } + for (uint64_t *block_cursor = block_start; + block_cursor < block_end && error == 0; block_cursor++) { + uint64_t e = *block_cursor; + + if (sm_entry_is_debug(e)) /* Skip debug entries */ + continue; - for (offset = 0; offset < end && error == 0; offset += bufsize) { - size = MIN(end - offset, bufsize); - VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); - VERIFY(size != 0); - ASSERT3U(sm->sm_blksz, !=, 0); + uint64_t raw_offset, raw_run, vdev_id; + maptype_t type; + if (sm_entry_is_single_word(e)) { + type = SM_TYPE_DECODE(e); + vdev_id = SM_NO_VDEVID; + raw_offset = SM_OFFSET_DECODE(e); + raw_run = SM_RUN_DECODE(e); + } else { + /* it is a two-word entry */ + ASSERT(sm_entry_is_double_word(e)); + raw_run = SM2_RUN_DECODE(e); + vdev_id = SM2_VDEV_DECODE(e); + + /* move on to the second word */ + block_cursor++; + e = *block_cursor; + VERIFY3P(block_cursor, <=, block_end); + + type = SM2_TYPE_DECODE(e); + raw_offset = SM2_OFFSET_DECODE(e); + } - dprintf("object=%llu offset=%llx size=%llx\n", - space_map_object(sm), offset, size); + uint64_t entry_offset = (raw_offset << sm->sm_shift) + + sm->sm_start; + uint64_t entry_run = raw_run << sm->sm_shift; - error = dmu_read(sm->sm_os, space_map_object(sm), offset, size, - entry_map, DMU_READ_PREFETCH); - if (error != 0) - break; + VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); + VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); + ASSERT3U(entry_offset, >=, sm->sm_start); + ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size); + ASSERT3U(entry_run, <=, sm->sm_size); + ASSERT3U(entry_offset + entry_run, <=, + sm->sm_start + sm->sm_size); - entry_map_end = entry_map + (size / sizeof (uint64_t)); - for (entry = entry_map; entry < entry_map_end && error == 0; - entry++) { - uint64_t e = *entry; - uint64_t offset, size; + space_map_entry_t sme = { + .sme_type = type, + .sme_vdev = vdev_id, + .sme_offset = entry_offset, + .sme_run = entry_run + }; + error = callback(&sme, arg); + } + dmu_buf_rele(db, FTAG); + } + return (error); +} - if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ - continue; +/* + * Reads the entries from the last block of the space map into + * buf in reverse order. Populates nwords with number of words + * in the last block. + * + * Refer to block comment within space_map_incremental_destroy() + * to understand why this function is needed. 
+ */ +static int +space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf, + uint64_t bufsz, uint64_t *nwords) +{ + int error = 0; + dmu_buf_t *db; - offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) + - sm->sm_start; - size = SM_RUN_DECODE(e) << sm->sm_shift; + /* + * Find the offset of the last word in the space map and use + * that to read the last block of the space map with + * dmu_buf_hold(). + */ + uint64_t last_word_offset = + sm->sm_phys->smp_objsize - sizeof (uint64_t); + error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset, + FTAG, &db, DMU_READ_NO_PREFETCH); + if (error != 0) + return (error); - VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift)); - VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift)); - VERIFY3U(offset, >=, sm->sm_start); - VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size); - error = callback(SM_TYPE_DECODE(e), offset, size, arg); + ASSERT3U(sm->sm_object, ==, db->db_object); + ASSERT3U(sm->sm_blksz, ==, db->db_size); + ASSERT3U(bufsz, >=, db->db_size); + ASSERT(nwords != NULL); + + uint64_t *words = db->db_data; + *nwords = + (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + + ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t)); + + uint64_t n = *nwords; + uint64_t j = n - 1; + for (uint64_t i = 0; i < n; i++) { + uint64_t entry = words[i]; + if (sm_entry_is_double_word(entry)) { + /* + * Since we are populating the buffer backwards + * we have to be extra careful and add the two + * words of the double-word entry in the right + * order. + */ + ASSERT3U(j, >, 0); + buf[j - 1] = entry; + + i++; + ASSERT3U(i, <, n); + entry = words[i]; + buf[j] = entry; + j -= 2; + } else { + ASSERT(sm_entry_is_debug(entry) || + sm_entry_is_single_word(entry)); + buf[j] = entry; + j--; } } - zio_buf_free(entry_map, bufsize); + /* + * Assert that we wrote backwards all the + * way to the beginning of the buffer. + */ + ASSERT3S(j, ==, -1); + + dmu_buf_rele(db, FTAG); return (error); } @@ -118,124 +255,122 @@ int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, dmu_tx_t *tx) { - uint64_t bufsize, len; - uint64_t *entry_map; - int error = 0; - - len = space_map_length(sm); - bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); - entry_map = zio_buf_alloc(bufsize); + uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); + uint64_t *buf = zio_buf_alloc(bufsz); dmu_buf_will_dirty(sm->sm_dbuf, tx); /* - * Since we can't move the starting offset of the space map - * (e.g there are reference on-disk pointing to it), we destroy - * its entries incrementally starting from the end. + * Ideally we would want to iterate from the beginning of the + * space map to the end in incremental steps. The issue with this + * approach is that we don't have any field on-disk that points + * us where to start between each step. We could try zeroing out + * entries that we've destroyed, but this doesn't work either as + * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]). + * + * As a result, we destroy its entries incrementally starting from + * the end after applying the callback to each of them. * - * The logic that follows is basically the same as the one used - * in space_map_iterate() but it traverses the space map - * backwards: + * The problem with this approach is that we cannot literally + * iterate through the words in the space map backwards as we + * can't distinguish two-word space map entries from their second + * word. 
Thus we do the following: * - * 1] We figure out the size of the buffer that we want to use - * to read the on-disk space map entries. - * 2] We figure out the offset at the end of the space map where - * we will start reading entries into our buffer. - * 3] We read the on-disk entries into the buffer. - * 4] We iterate over the entries from end to beginning calling - * the callback function on each one. As we move from entry - * to entry we decrease the size of the space map, deleting - * effectively each entry. - * 5] If there are no more entries in the space map or the - * callback returns a value other than 0, we stop iterating - * over the space map. If there are entries remaining and - * the callback returned zero we go back to step [1]. + * 1] We get all the entries from the last block of the space map + * and put them into a buffer in reverse order. This way the + * last entry comes first in the buffer, the second to last is + * second, etc. + * 2] We iterate through the entries in the buffer and we apply + * the callback to each one. As we move from entry to entry + * we decrease the size of the space map, effectively deleting + * each entry. + * 3] If there are no more entries in the space map or the callback + * returns a value other than 0, we stop iterating over the + * space map. If there are entries remaining and the callback + * returned 0, we go back to step [1]. */ - uint64_t offset = 0, size = 0; - while (len > 0 && error == 0) { - size = MIN(bufsize, len); - - VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); - VERIFY3U(size, >, 0); - ASSERT3U(sm->sm_blksz, !=, 0); - - offset = len - size; - - IMPLY(bufsize > len, offset == 0); - IMPLY(bufsize == len, offset == 0); - IMPLY(bufsize < len, offset > 0); - - - EQUIV(size == len, offset == 0); - IMPLY(size < len, bufsize < len); - - dprintf("object=%llu offset=%llx size=%llx\n", - space_map_object(sm), offset, size); - - error = dmu_read(sm->sm_os, space_map_object(sm), - offset, size, entry_map, DMU_READ_PREFETCH); + int error = 0; + while (space_map_length(sm) > 0 && error == 0) { + uint64_t nwords = 0; + error = space_map_reversed_last_block_entries(sm, buf, bufsz, + &nwords); if (error != 0) break; - uint64_t num_entries = size / sizeof (uint64_t); - - ASSERT3U(num_entries, >, 0); - - while (num_entries > 0) { - uint64_t e, entry_offset, entry_size; - maptype_t type; - - e = entry_map[num_entries - 1]; + ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t)); - ASSERT3U(num_entries, >, 0); - ASSERT0(error); + for (uint64_t i = 0; i < nwords; i++) { + uint64_t e = buf[i]; - if (SM_DEBUG_DECODE(e)) { + if (sm_entry_is_debug(e)) { sm->sm_phys->smp_objsize -= sizeof (uint64_t); space_map_update(sm); - len -= sizeof (uint64_t); - num_entries--; continue; } - type = SM_TYPE_DECODE(e); - entry_offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) + - sm->sm_start; - entry_size = SM_RUN_DECODE(e) << sm->sm_shift; + int words = 1; + uint64_t raw_offset, raw_run, vdev_id; + maptype_t type; + if (sm_entry_is_single_word(e)) { + type = SM_TYPE_DECODE(e); + vdev_id = SM_NO_VDEVID; + raw_offset = SM_OFFSET_DECODE(e); + raw_run = SM_RUN_DECODE(e); + } else { + ASSERT(sm_entry_is_double_word(e)); + words = 2; + + raw_run = SM2_RUN_DECODE(e); + vdev_id = SM2_VDEV_DECODE(e); + + /* move to the second word */ + i++; + e = buf[i]; + + ASSERT3P(i, <=, nwords); + + type = SM2_TYPE_DECODE(e); + raw_offset = SM2_OFFSET_DECODE(e); + } + + uint64_t entry_offset = + (raw_offset << sm->sm_shift) + sm->sm_start; + uint64_t entry_run = raw_run << sm->sm_shift;
VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); - VERIFY0(P2PHASE(entry_size, 1ULL << sm->sm_shift)); + VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); VERIFY3U(entry_offset, >=, sm->sm_start); - VERIFY3U(entry_offset + entry_size, <=, + VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size); + VERIFY3U(entry_run, <=, sm->sm_size); + VERIFY3U(entry_offset + entry_run, <=, sm->sm_start + sm->sm_size); - error = callback(type, entry_offset, entry_size, arg); + space_map_entry_t sme = { + .sme_type = type, + .sme_vdev = vdev_id, + .sme_offset = entry_offset, + .sme_run = entry_run + }; + error = callback(&sme, arg); if (error != 0) break; if (type == SM_ALLOC) - sm->sm_phys->smp_alloc -= entry_size; + sm->sm_phys->smp_alloc -= entry_run; else - sm->sm_phys->smp_alloc += entry_size; - - sm->sm_phys->smp_objsize -= sizeof (uint64_t); + sm->sm_phys->smp_alloc += entry_run; + sm->sm_phys->smp_objsize -= words * sizeof (uint64_t); space_map_update(sm); - len -= sizeof (uint64_t); - num_entries--; } - IMPLY(error == 0, num_entries == 0); - EQUIV(offset == 0 && error == 0, len == 0 && num_entries == 0); } - if (len == 0) { + if (space_map_length(sm) == 0) { ASSERT0(error); - ASSERT0(offset); - ASSERT0(sm->sm_length); ASSERT0(sm->sm_phys->smp_objsize); ASSERT0(sm->sm_alloc); } - zio_buf_free(entry_map, bufsize); + zio_buf_free(buf, bufsz); return (error); } @@ -246,16 +381,15 @@ typedef struct space_map_load_arg { } space_map_load_arg_t; static int -space_map_load_callback(maptype_t type, uint64_t offset, uint64_t size, - void *arg) +space_map_load_callback(space_map_entry_t *sme, void *arg) { space_map_load_arg_t *smla = arg; - if (type == smla->smla_type) { - VERIFY3U(range_tree_space(smla->smla_rt) + size, <=, + if (sme->sme_type == smla->smla_type) { + VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=, smla->smla_sm->sm_size); - range_tree_add(smla->smla_rt, offset, size); + range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run); } else { - range_tree_remove(smla->smla_rt, offset, size); + range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run); } return (0); @@ -367,43 +501,239 @@ space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) } } -uint64_t -space_map_entries(space_map_t *sm, range_tree_t *rt) +static void +space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) { - avl_tree_t *t = &rt->rt_root; - range_seg_t *rs; - uint64_t size, entries; + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | + SM_DEBUG_ACTION_ENCODE(maptype) | + SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) | + SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); + + dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize, + sizeof (dentry), &dentry, tx); + + sm->sm_phys->smp_objsize += sizeof (dentry); +} + +/* + * Writes one or more entries given a segment. + * + * Note: The function may release the dbuf from the pointer initially + * passed to it, and return a different dbuf. Also, the space map's + * dbuf must be dirty for the changes in sm_phys to take effect. + */ +static void +space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, + uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx) +{ + ASSERT3U(words, !=, 0); + ASSERT3U(words, <=, 2); + + /* ensure the vdev_id can be represented by the space map */ + ASSERT3U(vdev_id, <=, SM_NO_VDEVID); + + /* + * if this is a single word entry, ensure that no vdev was + * specified. 
+ */ + IMPLY(words == 1, vdev_id == SM_NO_VDEVID); + + dmu_buf_t *db = *dbp; + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + uint64_t *block_base = db->db_data; + uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t)); + uint64_t *block_cursor = block_base + + (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + + ASSERT3P(block_cursor, <=, block_end); + + uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; + uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX; + + ASSERT3U(rs->rs_start, >=, sm->sm_start); + ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size); + ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size); + ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size); + + while (size != 0) { + ASSERT3P(block_cursor, <=, block_end); + + /* + * If we are at the end of this block, flush it and start + * writing again from the beginning. + */ + if (block_cursor == block_end) { + dmu_buf_rele(db, tag); + + uint64_t next_word_offset = sm->sm_phys->smp_objsize; + VERIFY0(dmu_buf_hold(sm->sm_os, + space_map_object(sm), next_word_offset, + tag, &db, DMU_READ_PREFETCH)); + dmu_buf_will_dirty(db, tx); + + /* update caller's dbuf */ + *dbp = db; + + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + block_base = db->db_data; + block_cursor = block_base; + block_end = block_base + + (db->db_size / sizeof (uint64_t)); + } + + /* + * If we are writing a two-word entry and we only have one + * word left on this block, just pad it with an empty debug + * entry and write the two-word entry in the next block. + */ + uint64_t *next_entry = block_cursor + 1; + if (next_entry == block_end && words > 1) { + ASSERT3U(words, ==, 2); + *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | + SM_DEBUG_ACTION_ENCODE(0) | + SM_DEBUG_SYNCPASS_ENCODE(0) | + SM_DEBUG_TXG_ENCODE(0); + block_cursor++; + sm->sm_phys->smp_objsize += sizeof (uint64_t); + ASSERT3P(block_cursor, ==, block_end); + continue; + } + + uint64_t run_len = MIN(size, run_max); + switch (words) { + case 1: + *block_cursor = SM_OFFSET_ENCODE(start) | + SM_TYPE_ENCODE(maptype) | + SM_RUN_ENCODE(run_len); + block_cursor++; + break; + case 2: + /* write the first word of the entry */ + *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) | + SM2_RUN_ENCODE(run_len) | + SM2_VDEV_ENCODE(vdev_id); + block_cursor++; + + /* move on to the second word of the entry */ + ASSERT3P(block_cursor, <, block_end); + *block_cursor = SM2_TYPE_ENCODE(maptype) | + SM2_OFFSET_ENCODE(start); + block_cursor++; + break; + default: + panic("%d-word space map entries are not supported", + words); + break; + } + sm->sm_phys->smp_objsize += words * sizeof (uint64_t); + + start += run_len; + size -= run_len; + } + ASSERT0(size); + +} + +/* + * Note: The space map's dbuf must be dirty for the changes in sm_phys to + * take effect. + */ +static void +space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t vdev_id, dmu_tx_t *tx) +{ + spa_t *spa = tx->tx_pool->dp_spa; + dmu_buf_t *db; + + space_map_write_intro_debug(sm, maptype, tx); +#ifdef DEBUG /* - * All space_maps always have a debug entry so account for it here. + * We do this right after we write the intro debug entry + * because the estimate does not take it into account. 
*/ - entries = 1; + uint64_t initial_objsize = sm->sm_phys->smp_objsize; + uint64_t estimated_growth = + space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID); + uint64_t estimated_final_objsize = initial_objsize + estimated_growth; +#endif /* - * Traverse the range tree and calculate the number of space map - * entries that would be required to write out the range tree. + * Find the offset right after the last word in the space map + * and use that to get a hold of the last block, so we can + * start appending to it. */ - for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { - size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - entries += howmany(size, SM_RUN_MAX); + uint64_t next_word_offset = sm->sm_phys->smp_objsize; + VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm), + next_word_offset, FTAG, &db, DMU_READ_PREFETCH)); + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + dmu_buf_will_dirty(db, tx); + + avl_tree_t *t = &rt->rt_root; + for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { + uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift; + uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + uint8_t words = 1; + + /* + * We only write two-word entries when both of the following + * are true: + * + * [1] The feature is enabled. + * [2] The offset or run is too big for a single-word entry, + * or the vdev_id is set (meaning not equal to + * SM_NO_VDEVID). + * + * Note that for purposes of testing we've added the case that + * we write two-word entries occasionally when the feature is + * enabled and zfs_force_some_double_word_sm_entries has been + * set. + */ + if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) && + (offset >= (1ULL << SM_OFFSET_BITS) || + length > SM_RUN_MAX || + vdev_id != SM_NO_VDEVID || + (zfs_force_some_double_word_sm_entries && + spa_get_random(100) == 0))) + words = 2; + + space_map_write_seg(sm, rs, maptype, vdev_id, words, + &db, FTAG, tx); } - return (entries); + + dmu_buf_rele(db, FTAG); + +#ifdef DEBUG + /* + * We expect our estimation to be based on the worst case + * scenario [see comment in space_map_estimate_optimal_size()]. + * Therefore we expect the actual objsize to be equal or less + * than whatever we estimated it to be. + */ + ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize); +#endif } +/* + * Note: This function manipulates the state of the given space map but + * does not hold any locks implicitly. Thus the caller is responsible + * for synchronizing writes to the space map. 
+ */ void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - dmu_tx_t *tx) + uint64_t vdev_id, dmu_tx_t *tx) { objset_t *os = sm->sm_os; - spa_t *spa = dmu_objset_spa(os); - avl_tree_t *t = &rt->rt_root; - range_seg_t *rs; - uint64_t size, total, rt_space, nodes; - uint64_t *entry, *entry_map, *entry_map_end; - uint64_t expected_entries, actual_entries = 1; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); VERIFY3U(space_map_object(sm), !=, 0); + dmu_buf_will_dirty(sm->sm_dbuf, tx); /* @@ -423,58 +753,10 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, else sm->sm_phys->smp_alloc -= range_tree_space(rt); - expected_entries = space_map_entries(sm, rt); - - entry_map = zio_buf_alloc(sm->sm_blksz); - entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t)); - entry = entry_map; - - *entry++ = SM_DEBUG_ENCODE(1) | - SM_DEBUG_ACTION_ENCODE(maptype) | - SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) | - SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); - - total = 0; - nodes = avl_numnodes(&rt->rt_root); - rt_space = range_tree_space(rt); - for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { - uint64_t start; + uint64_t nodes = avl_numnodes(&rt->rt_root); + uint64_t rt_space = range_tree_space(rt); - size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; - - total += size << sm->sm_shift; - - while (size != 0) { - uint64_t run_len; - - run_len = MIN(size, SM_RUN_MAX); - - if (entry == entry_map_end) { - dmu_write(os, space_map_object(sm), - sm->sm_phys->smp_objsize, sm->sm_blksz, - entry_map, tx); - sm->sm_phys->smp_objsize += sm->sm_blksz; - entry = entry_map; - } - - *entry++ = SM_OFFSET_ENCODE(start) | - SM_TYPE_ENCODE(maptype) | - SM_RUN_ENCODE(run_len); - - start += run_len; - size -= run_len; - actual_entries++; - } - } - - if (entry != entry_map) { - size = (entry - entry_map) * sizeof (uint64_t); - dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize, - size, entry_map, tx); - sm->sm_phys->smp_objsize += size; - } - ASSERT3U(expected_entries, ==, actual_entries); + space_map_write_impl(sm, rt, maptype, vdev_id, tx); /* * Ensure that the space_map's accounting wasn't changed @@ -482,9 +764,6 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, */ VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root)); VERIFY3U(range_tree_space(rt), ==, rt_space); - VERIFY3U(range_tree_space(rt), ==, total); - - zio_buf_free(entry_map, sm->sm_blksz); } static int @@ -526,7 +805,6 @@ space_map_open(space_map_t **smp, objset_t *os, uint64_t object, space_map_close(sm); return (error); } - *smp = sm; return (0); @@ -569,7 +847,8 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) */ if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && doi.doi_bonus_size != sizeof (space_map_phys_t)) || - doi.doi_data_block_size != blocksize) { + doi.doi_data_block_size != blocksize || + doi.doi_metadata_block_size != 1 << space_map_ibs) { zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating " "object[%llu]: old bonus %u, old blocksz %u", dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object, @@ -625,8 +904,8 @@ space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) bonuslen = SPACE_MAP_SIZE_V0; } - object = dmu_object_alloc(os, DMU_OT_SPACE_MAP, blocksize, - DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); + object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize, + space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); return (object); } @@ -658,6 +937,133 @@ 
space_map_free(space_map_t *sm, dmu_tx_t *tx) sm->sm_object = 0; } +/* + * Given a range tree, it makes a worst-case estimate of how much + * space the tree's segments would take if they were written to + * the given space map. + */ +uint64_t +space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, + uint64_t vdev_id) +{ + spa_t *spa = dmu_objset_spa(sm->sm_os); + uint64_t shift = sm->sm_shift; + uint64_t *histogram = rt->rt_histogram; + uint64_t entries_for_seg = 0; + + /* + * In order to get a quick estimate of the optimal size that this + * range tree would have on-disk as a space map, we iterate through + * its histogram buckets instead of iterating through its nodes. + * + * Note that this is a highest-bound/worst-case estimate for the + * following reasons: + * + * 1] We assume that we always add a debug padding for each block + * we write and we also assume that we start at the last word + * of a block attempting to write a two-word entry. + * 2] Rounding up errors due to the way segments are distributed + * in the buckets of the range tree's histogram. + * 3] The activation of zfs_force_some_double_word_sm_entries + * (tunable) when testing. + * + * = Math and Rounding Errors = + * + * rt_histogram[i] bucket of a range tree represents the number + * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given + * that, we want to divide the buckets into groups: Buckets that + * can be represented using a single-word entry, ones that can + * be represented with a double-word entry, and ones that can + * only be represented with multiple two-word entries. + * + * [Note that if the new encoding feature is not enabled there + * are only two groups: single-word entry buckets and multiple + * single-word entry buckets. The information below assumes + * two-word entries are enabled, but it can easily be applied when + * the feature is not enabled] + * + * To find the highest bucket that can be represented with a + * single-word entry we look at the maximum run that such entry + * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that + * the run of a space map entry is shifted by sm_shift, thus we + * add it to the exponent]. This way, excluding the value of the + * maximum run that can be represented by a single-word entry, + * all runs that are smaller exist in buckets 0 to + * SM_RUN_BITS + shift - 1. + * + * To find the highest bucket that can be represented with a + * double-word entry, we follow the same approach. Finally, any + * bucket higher than that is represented with multiple two-word + * entries. To be more specific, if the highest bucket whose + * segments can be represented with a single two-word entry is X, + * then bucket X+1 will need 2 two-word entries for each of its + * segments, X+2 will need 4, X+3 will need 8, ...etc. + * + * With all of the above we make our estimation based on bucket + * groups. There is a rounding error though. As we mentioned in + * the example with the one-word entry, the maximum run that can + * be represented in a one-word entry, 2^(SM_RUN_BITS + shift), is + * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of + * that length fall into the next bucket (and bucket group) where + * we start counting two-word entries and this is one more reason + * why the estimated size may end up being bigger than the actual + * size written.
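As a worked example of the bucket groups (assuming sm_shift = 9 and the SM_RUN_BITS = 15 / SM2_RUN_BITS = 36 constants from the space_map.h hunk later in this diff): a single-word entry can describe runs of up to 2^(15+9) bytes, so buckets 0 through 15 + 9 - 1 = 23 cost one word per segment; a two-word entry can describe runs of up to 2^(36+9) bytes, so buckets 24 through 36 + 9 - 1 = 44 cost two words per segment; and a segment in, say, bucket 46 is charged 2^(46-44) = 4 two-word entries, i.e. 8 words.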
+ */ + uint64_t size = 0; + uint64_t idx = 0; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) || + (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) { + + /* + * If we are trying to force some double word entries just + * assume the worst-case of every single word entry being + * written as a double word entry. + */ + uint64_t entry_size = + (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) && + zfs_force_some_double_word_sm_entries) ? + (2 * sizeof (uint64_t)) : sizeof (uint64_t); + + uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1; + for (; idx <= single_entry_max_bucket; idx++) + size += histogram[idx] * entry_size; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) { + for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + ASSERT3U(idx, >=, single_entry_max_bucket); + entries_for_seg = + 1ULL << (idx - single_entry_max_bucket); + size += histogram[idx] * + entries_for_seg * entry_size; + } + return (size); + } + } + + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)); + + uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1; + for (; idx <= double_entry_max_bucket; idx++) + size += histogram[idx] * 2 * sizeof (uint64_t); + + for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + ASSERT3U(idx, >=, double_entry_max_bucket); + entries_for_seg = 1ULL << (idx - double_entry_max_bucket); + size += histogram[idx] * + entries_for_seg * 2 * sizeof (uint64_t); + } + + /* + * Assume the worst case where we start with the padding at the end + * of the current block and we add an extra padding entry at the end + * of all subsequent blocks. + */ + size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t); + + return (size); +} + uint64_t space_map_object(space_map_t *sm) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c index a866e65d54f7..aa289ba1061d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c @@ -54,20 +54,14 @@ static int space_reftree_compare(const void *x1, const void *x2) { - const space_ref_t *sr1 = x1; - const space_ref_t *sr2 = x2; + const space_ref_t *sr1 = (const space_ref_t *)x1; + const space_ref_t *sr2 = (const space_ref_t *)x2; - if (sr1->sr_offset < sr2->sr_offset) - return (-1); - if (sr1->sr_offset > sr2->sr_offset) - return (1); + int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset); + if (likely(cmp)) + return (cmp); - if (sr1 < sr2) - return (-1); - if (sr1 > sr2) - return (1); - - return (0); + return (AVL_PCMP(sr1, sr2)); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h index cb1d4354579e..9b8e73b596fe 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -194,7 +194,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp); void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); -int arc_tempreserve_space(uint64_t reserve, uint64_t txg); +int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); uint64_t arc_max_bytes(void); void arc_init(void); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index 69617b3dca9c..ec966432f2ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -83,6 +83,13 @@ typedef enum dbuf_states { DB_EVICTING } dbuf_states_t; +typedef enum dbuf_cached_state { + DB_NO_CACHE = -1, + DB_DBUF_CACHE, + DB_DBUF_METADATA_CACHE, + DB_CACHE_MAX +} dbuf_cached_state_t; + struct dnode; struct dmu_tx; @@ -229,11 +236,12 @@ typedef struct dmu_buf_impl { */ avl_node_t db_link; - /* - * Link in dbuf_cache. - */ + /* Link in dbuf_cache or dbuf_metadata_cache */ multilist_node_t db_cache_link; + /* Tells us which dbuf cache this dbuf is in, if any */ + dbuf_cached_state_t db_caching_status; + /* Data which is unique to data (leaf) blocks: */ /* User callback information. */ @@ -295,7 +303,7 @@ boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, uint64_t dbuf_refcount(dmu_buf_impl_t *db); void dbuf_rele(dmu_buf_impl_t *db, void *tag); -void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag); +void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting); dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, uint64_t blkid); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index 0f7916e7d189..9bba698828fd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -109,7 +109,8 @@ typedef enum dmu_object_byteswap { /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data - * (dmu_object_byteswap_t). + * (dmu_object_byteswap_t). All of the types created by this method + * are cached in the dbuf metadata cache. */ #define DMU_OT(byteswap, metadata) \ (DMU_OT_NEWTYPE | \ @@ -124,6 +125,9 @@ typedef enum dmu_object_byteswap { ((ot) & DMU_OT_METADATA) : \ dmu_ot[(ot)].ot_metadata) +#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) + /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. 
use a BP_IS_EMBEDDED() bp), because bp_fill @@ -352,6 +356,9 @@ typedef struct dmu_buf { */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); +uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, @@ -512,6 +519,9 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db); int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); +int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, + boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, + uint32_t flags); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); typedef void dmu_buf_evict_func_t(void *user_ptr); @@ -750,10 +760,13 @@ void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); +int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, dmu_tx_t *tx); +int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size, + dmu_tx_t *tx); #ifdef _KERNEL #ifdef illumos int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, @@ -767,6 +780,8 @@ int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, #endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); +void dmu_assign_arcbuf_dnode(dnode_t *handle, uint64_t offset, + struct arc_buf *buf, dmu_tx_t *tx); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_xuio_init(struct xuio *uio, int niov); @@ -810,6 +825,7 @@ typedef void arc_byteswap_func_t(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; + boolean_t ot_dbuf_metadata_cache; char *ot_name; } dmu_object_type_info_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h index 59e87aab8081..25ff8642177d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h @@ -39,6 +39,7 @@ #include <sys/zio.h> #include <sys/zil.h> #include <sys/sa.h> +#include <sys/zfs_ioctl.h> #ifdef __cplusplus extern "C" { @@ -69,6 +70,7 @@ typedef struct objset_phys { dnode_phys_t os_groupused_dnode; } objset_phys_t; +#define OBJSET_PROP_UNINITIALIZED ((uint64_t)-1) struct objset { /* Immutable: */ struct dsl_dataset *os_dsl_dataset; @@ -100,6 +102,16 @@ struct objset { zfs_sync_type_t os_sync; zfs_redundant_metadata_type_t os_redundant_metadata; int os_recordsize; + /* + * The next four values are used as a cache of whatever's on disk, and + * are initialized the first time these properties are queried. 
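A hypothetical sketch of the lazy-fill pattern this sentinel enables (objset_cacheable and lookup_on_disk are illustrative names, not the real objset/dsl_prop interface):

#include <stdint.h>

#define OBJSET_PROP_UNINITIALIZED	((uint64_t)-1)

struct objset_cacheable {
	uint64_t os_version;	/* cached copy of the on-disk property */
};

/*
 * The cached field starts life as the sentinel, so the (possibly
 * expensive) on-disk lookup runs at most once per objset.
 */
static uint64_t
objset_version(struct objset_cacheable *os, uint64_t (*lookup_on_disk)(void))
{
	if (os->os_version == OBJSET_PROP_UNINITIALIZED)
		os->os_version = lookup_on_disk();
	return (os->os_version);
}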
Before + * being initialized with their real values, their values are + * OBJSET_PROP_UNINITIALIZED. + */ + uint64_t os_version; + uint64_t os_normalization; + uint64_t os_utf8only; + uint64_t os_casesensitivity; /* * Pointer is constant; the blkptr it points to is protected by diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h index 69a834d877ee..1f4b1f2cde9f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h @@ -70,6 +70,7 @@ typedef struct dmu_recv_cookie { boolean_t drc_byteswap; boolean_t drc_force; boolean_t drc_resumable; + boolean_t drc_clone; struct avl_tree *drc_guid_to_ds_map; zio_cksum_t drc_cksum; uint64_t drc_newsnapobj; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h index 5566c70add13..89a7b2ef60e4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -291,7 +291,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, void *ref, dnode_t **dnp); boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); -void dnode_rele_and_unlock(dnode_t *dn, void *tag); +void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h index 8de77532ee75..0509e95b1587 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
*/ #ifndef _SYS_METASLAB_H @@ -65,9 +65,10 @@ uint64_t metaslab_block_maxsize(metaslab_t *); #define METASLAB_DONT_THROTTLE 0x10 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, - blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *); + blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, + int); int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, - dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *); + dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t); void metaslab_free_dva(spa_t *, const dva_t *, boolean_t); @@ -88,9 +89,9 @@ int metaslab_class_validate(metaslab_class_t *); void metaslab_class_histogram_verify(metaslab_class_t *); uint64_t metaslab_class_fragmentation(metaslab_class_t *); uint64_t metaslab_class_expandable_space(metaslab_class_t *); -boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, +boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, zio_t *, int); -void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *); +void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t, int64_t, int64_t); @@ -100,7 +101,7 @@ uint64_t metaslab_class_get_dspace(metaslab_class_t *); uint64_t metaslab_class_get_deferred(metaslab_class_t *); uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc); -metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *); +metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int); void metaslab_group_destroy(metaslab_group_t *); void metaslab_group_activate(metaslab_group_t *); void metaslab_group_passivate(metaslab_group_t *); @@ -109,8 +110,9 @@ uint64_t metaslab_group_get_space(metaslab_group_t *); void metaslab_group_histogram_verify(metaslab_group_t *); uint64_t metaslab_group_fragmentation(metaslab_group_t *); void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); -void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int); -void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *); +void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int, + boolean_t); +void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h index 939bcb30528b..5eb59df37e51 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
*/ #ifndef _SYS_METASLAB_IMPL_H @@ -52,6 +52,7 @@ typedef struct metaslab_alloc_trace { uint64_t mat_weight; uint32_t mat_dva_id; uint64_t mat_offset; + int mat_allocator; } metaslab_alloc_trace_t; /* @@ -67,14 +68,17 @@ typedef enum trace_alloc_type { TRACE_GROUP_FAILURE = -5ULL, TRACE_ENOSPC = -6ULL, TRACE_CONDENSING = -7ULL, - TRACE_VDEV_ERROR = -8ULL + TRACE_VDEV_ERROR = -8ULL, + TRACE_INITIALIZING = -9ULL } trace_alloc_type_t; #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) -#define METASLAB_WEIGHT_TYPE (1ULL << 61) +#define METASLAB_WEIGHT_CLAIM (1ULL << 61) +#define METASLAB_WEIGHT_TYPE (1ULL << 60) #define METASLAB_ACTIVE_MASK \ - (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) + (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ + METASLAB_WEIGHT_CLAIM) /* * The metaslab weight is used to encode the amount of free space in a @@ -97,37 +101,39 @@ typedef enum trace_alloc_type { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PS1| weighted-free space | + * |PSC1| weighted-free space | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation + * C - indicates activation for claimed block zio * space - the fragmentation-weighted space * * Segment-based weight: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PS0| idx| count of segments in region | + * |PSC0| idx| count of segments in region | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation + * C - indicates activation for claimed block zio * idx - index for the highest bucket in the histogram * count - number of segments in the specified bucket */ -#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2) -#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x) +#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3) +#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x) #define WEIGHT_IS_SPACEBASED(weight) \ - ((weight) == 0 || BF64_GET((weight), 61, 1)) -#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1) + ((weight) == 0 || BF64_GET((weight), 60, 1)) +#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1) /* * These macros are only applicable to segment-based weighting. */ -#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6) -#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x) -#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55) -#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x) +#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6) +#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x) +#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) +#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) /* * A metaslab class encompasses a category of allocatable top-level vdevs. @@ -178,8 +184,8 @@ struct metaslab_class { * allowed to reserve slots even if we've reached the maximum * number of allocations allowed. 
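*/

The widened activation field described above can be checked with a small self-contained program; the shift/mask is written out inline instead of using the real BF64 helpers from spa.h, and the bit positions follow the new METASLAB_WEIGHT_* definitions:

#include <stdint.h>

#define METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define METASLAB_WEIGHT_CLAIM		(1ULL << 61)

int
main(void)
{
	uint64_t weight = 0;

	/* activate the metaslab for a claim zio: the new C bit */
	weight |= METASLAB_WEIGHT_CLAIM;

	/* WEIGHT_GET_ACTIVE now reads 3 bits at position 61 (was 2 at 62) */
	uint64_t active = (weight >> 61) & 7;

	return (active == 1 ? 0 : 1);	/* claim-only activation */
}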
- uint64_t mc_alloc_max_slots; - refcount_t mc_alloc_slots; + uint64_t *mc_alloc_max_slots; + refcount_t *mc_alloc_slots; uint64_t mc_alloc_groups; /* # of allocatable groups */ @@ -202,9 +208,12 @@ */ struct metaslab_group { kmutex_t mg_lock; + metaslab_t **mg_primaries; + metaslab_t **mg_secondaries; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; boolean_t mg_allocatable; /* can we allocate? */ + uint64_t mg_ms_ready; /* * A metaslab group is considered to be initialized only after @@ -224,15 +233,33 @@ metaslab_group_t *mg_next; /* - * Each metaslab group can handle mg_max_alloc_queue_depth allocations - * which are tracked by mg_alloc_queue_depth. It's possible for a - * metaslab group to handle more allocations than its max. This - * can occur when gang blocks are required or when other groups - * are unable to handle their share of allocations. + * In order for the allocation throttle to function properly, we cannot + * have too many IOs going to each disk by default; the throttle + * operates by allocating more work to disks that finish quickly, so + * allocating larger chunks to each disk reduces its effectiveness. + * However, if the number of IOs going to each allocator is too small, + * we will not perform proper aggregation at the vdev_queue layer, + * also resulting in decreased performance. Therefore, we will use a + * ramp-up strategy. + * + * Each allocator in each metaslab group has a current queue depth + * (mg_alloc_queue_depth[allocator]) and a current max queue depth + * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group + * has an absolute max queue depth (mg_max_alloc_queue_depth). We + * add IOs to an allocator until the mg_alloc_queue_depth for that + * allocator hits the cur_max. Every time an IO completes for a given + * allocator on a given metaslab group, we increment its cur_max until + * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to + * help protect against disks that decrease in performance over time. + * + * It's possible for an allocator to handle more allocations than + * its max. This can occur when gang blocks are required or when other + * groups are unable to handle their share of allocations. */ uint64_t mg_max_alloc_queue_depth; - refcount_t mg_alloc_queue_depth; - + uint64_t *mg_cur_max_alloc_queue_depth; + refcount_t *mg_alloc_queue_depth; + int mg_allocators; /* * A metaslab group that can no longer allocate the minimum block * size will set mg_no_free_space. Once a metaslab group is out @@ -245,6 +272,11 @@ uint64_t mg_failed_allocations; uint64_t mg_fragmentation; uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + + int mg_ms_initializing; + boolean_t mg_initialize_updating; + kmutex_t mg_ms_initialize_lock; + kcondvar_t mg_ms_initialize_cv; }; /* @@ -335,6 +367,8 @@ struct metaslab { boolean_t ms_condense_wanted; uint64_t ms_condense_checked_txg; + uint64_t ms_initializing; /* leaves initializing this ms */ + /* * We must hold both ms_lock and ms_group->mg_lock in order to * modify ms_loaded. @@ -357,6 +391,13 @@ uint64_t ms_max_size; /* maximum allocatable size */ /* + * -1 if it's not active in an allocator, otherwise set to the allocator + * this metaslab is active for. + */ + int ms_allocator; + boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */ + + /* + * The metaslab block allocators can optionally use a size-ordered + * range tree and/or an array of LBAs.
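Returning to the per-allocator queue-depth ramp described in the metaslab_group comment above, the following is a minimal userland model of the mechanism; the field names are abbreviations of the real ones, and the refcount_t bookkeeping and mg_lock are deliberately omitted:

#include <stdint.h>

typedef struct mg_allocator {
	uint64_t	qd;		/* outstanding IOs (mg_alloc_queue_depth) */
	uint64_t	cur_max;	/* ramping limit (mg_cur_max_...) */
} mg_allocator_t;

static int
mg_can_queue(const mg_allocator_t *a)
{
	return (a->qd < a->cur_max);
}

static void
mg_io_start(mg_allocator_t *a)
{
	a->qd++;
}

static void
mg_io_done(mg_allocator_t *a, uint64_t mg_max)
{
	a->qd--;
	if (a->cur_max < mg_max)	/* ramp up on every completion */
		a->cur_max++;
}

/* at txg open: reset the ramp to guard against disks that slow down */
static void
mg_txg_reset(mg_allocator_t *a, uint64_t initial)
{
	a->cur_max = initial;
}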
Not all allocators use * this functionality. The ms_allocatable_by_size should always @@ -370,6 +411,8 @@ struct metaslab { metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ + + boolean_t ms_new; }; #ifdef __cplusplus diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h index 244f35a3d6f3..feac5ae5fbf2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h @@ -95,6 +95,9 @@ boolean_t range_tree_is_empty(range_tree_t *rt); void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); void range_tree_stat_verify(range_tree_t *rt); +uint64_t range_tree_min(range_tree_t *rt); +uint64_t range_tree_max(range_tree_t *rt); +uint64_t range_tree_span(range_tree_t *rt); void range_tree_add(void *arg, uint64_t start, uint64_t size); void range_tree_remove(void *arg, uint64_t start, uint64_t size); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index fc4f90740efc..0b220362379e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -154,6 +154,7 @@ _NOTE(CONSTCOND) } while (0) #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ #define SPA_COMPRESSBITS 7 +#define SPA_VDEVBITS 24 /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). @@ -184,15 +185,15 @@ typedef struct zio_cksum_salt { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | vdev1 | GRID | ASIZE | + * 0 | pad | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | vdev2 | GRID | ASIZE | + * 2 | pad | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 4 | vdev3 | GRID | ASIZE | + * 4 | pad | vdev3 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -371,8 +372,9 @@ typedef struct blkptr { #define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) #define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) -#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) -#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) +#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) +#define DVA_SET_VDEV(dva, x) \ + BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) #define DVA_GET_OFFSET(dva) \ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) @@ -668,6 +670,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_AUTOEXPAND 0x20 #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 +#define SPA_ASYNC_INITIALIZE_RESTART 0x100 /* * Controls the behavior of spa_vdev_remove(). 
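The narrower DVA vdev field introduced by SPA_VDEVBITS can be illustrated with local stand-ins for the BF64 bitfield helpers (these mirror, but are not, the spa.h definitions):

#include <stdint.h>

#define BF64_GET(x, low, len)	(((x) >> (low)) & ((1ULL << (len)) - 1))
#define BF64_SET(x, low, len, val) \
	((x) = ((x) & ~(((1ULL << (len)) - 1) << (low))) | \
	    ((uint64_t)(val) << (low)))

#define SPA_VDEVBITS	24

typedef struct dva {
	uint64_t dva_word[2];
} dva_t;

/* vdev id now occupies bits 32-55 of word 0; bits 56-63 are pad */
#define DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
#define DVA_SET_VDEV(dva, x)	BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)

int
main(void)
{
	dva_t d = { { 0, 0 } };

	DVA_SET_VDEV(&d, 42);
	return (DVA_GET_VDEV(&d) == 42 ? 0 : 1);
}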
@@ -683,6 +686,7 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern boolean_t spa_vdev_remove_active(spa_t *spa); +extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, @@ -829,6 +833,7 @@ extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); +extern uint64_t spa_dirty_data(spa_t *spa); /* Miscellaneous support routines */ extern void spa_load_failed(spa_t *spa, const char *fmt, ...); @@ -945,13 +950,6 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_bp(bp, fmt, ...) #endif -extern boolean_t spa_debug_enabled(spa_t *spa); -#define spa_dbgmsg(spa, ...) \ -{ \ - if (spa_debug_enabled(spa)) \ - zfs_dbgmsg(__VA_ARGS__); \ -} - extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ #ifdef __cplusplus diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index 292fa5e96ac1..f69dde66dd9b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -238,8 +238,16 @@ struct spa { uint64_t spa_last_synced_guid; /* last synced guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ - kmutex_t spa_alloc_lock; - avl_tree_t spa_alloc_tree; + /* + * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are + * stored in spa_alloc_count. There is one tree and one lock for each + * allocator, to help improve allocation performance in write-heavy + * workloads. + */ + kmutex_t *spa_alloc_locks; + avl_tree_t *spa_alloc_trees; + int spa_alloc_count; + spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ nvlist_t *spa_label_features; /* Features for reading MOS */ @@ -324,7 +332,6 @@ struct spa { kcondvar_t spa_suspend_cv; /* notification of resume */ uint8_t spa_suspended; /* pool is suspended */ uint8_t spa_claiming; /* pool is doing zil_claim() */ - boolean_t spa_debug; /* debug enabled? 
*/ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ int spa_mode; /* FREAD | FWRITE */ @@ -353,6 +360,8 @@ struct spa { uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */ + kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */ + nvlist_t *spa_feat_stats; /* Cache of enabled features */ /* cache feature refcounts */ uint64_t spa_feat_refcount_cache[SPA_FEATURES]; #ifdef illumos @@ -381,6 +390,10 @@ struct spa { int spa_queued; } spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE]; #endif + /* arc_memory_throttle() parameters during low memory condition */ + uint64_t spa_lowmem_page_load; /* memory load during txg */ + uint64_t spa_lowmem_last_txg; /* txg window start */ + hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h index 98b87269cb6c..d3d852978a57 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h @@ -93,50 +93,100 @@ typedef struct space_map { /* * debug entry * - * 1 3 10 50 - * ,---+--------+------------+---------------------------------. - * | 1 | action | syncpass | txg (lower bits) | - * `---+--------+------------+---------------------------------' - * 63 62 60 59 50 49 0 + * 2 2 10 50 + * +-----+-----+------------+----------------------------------+ + * | 1 0 | act | syncpass | txg (lower bits) | + * +-----+-----+------------+----------------------------------+ + * 63 62 61 60 59 50 49 0 * * - * non-debug entry + * one-word entry * * 1 47 1 15 - * ,-----------------------------------------------------------. + * +-----------------------------------------------------------+ * | 0 | offset (sm_shift units) | type | run | - * `-----------------------------------------------------------' - * 63 62 17 16 15 0 + * +-----------------------------------------------------------+ + * 63 62 16 15 14 0 + * + * + * two-word entry + * + * 2 2 36 24 + * +-----+-----+---------------------------+-------------------+ + * | 1 1 | pad | run | vdev | + * +-----+-----+---------------------------+-------------------+ + * 63 62 61 60 59 24 23 0 + * + * 1 63 + * +------+----------------------------------------------------+ + * | type | offset | + * +------+----------------------------------------------------+ + * 63 62 0 + * + * Note that a two-word entry will not straddle a block boundary. + * If necessary, the last word of a block will be padded with a + * debug entry (with act = syncpass = txg = 0).
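A sketch of how a two-word entry would be assembled according to this layout; BF64_ENCODE is re-declared locally for illustration, and the authoritative SM2_* macros appear just below:

#include <stdint.h>

#define BF64_ENCODE(x, low, len) \
	(((uint64_t)(x) & ((1ULL << (len)) - 1)) << (low))

/*
 * words[0]: prefix 11 in bits 62-63, pad (zero) in bits 60-61, run in
 * bits 24-59 (stored biased by one), vdev in bits 0-23. words[1]:
 * type in bit 63, offset in bits 0-62. run and offset are in
 * sm_shift units.
 */
static void
sm2_encode(uint64_t run, uint32_t vdev, int is_free, uint64_t offset,
    uint64_t words[2])
{
	words[0] = BF64_ENCODE(3, 62, 2) |
	    BF64_ENCODE(run - 1, 24, 36) |
	    BF64_ENCODE(vdev, 0, 24);
	words[1] = BF64_ENCODE(is_free, 63, 1) |
	    BF64_ENCODE(offset, 0, 63);
}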
*/ -/* All this stuff takes and returns bytes */ -#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1) -#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15) -#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) -#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) -#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47) -#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47) -#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1) -#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1) +typedef enum { + SM_ALLOC, + SM_FREE +} maptype_t; + +typedef struct space_map_entry { + maptype_t sme_type; + uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */ + uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */ + uint64_t sme_run; /* max is 2^36; units of sm_shift */ +} space_map_entry_t; + +#define SM_NO_VDEVID (1 << SPA_VDEVBITS) -#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3) -#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3) +/* one-word entry constants */ +#define SM_DEBUG_PREFIX 2 +#define SM_OFFSET_BITS 47 +#define SM_RUN_BITS 15 +/* two-word entry constants */ +#define SM2_PREFIX 3 +#define SM2_OFFSET_BITS 63 +#define SM2_RUN_BITS 36 + +#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2) +#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2) + +#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2) +#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2) #define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) #define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) - #define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) #define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) -#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) - -typedef enum { - SM_ALLOC, - SM_FREE -} maptype_t; - -typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size, - void *arg); +#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS) +#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS) +#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) +#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) +#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1) +#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS) +#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) +#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL) + +#define SM2_RUN_DECODE(x) (BF64_DECODE(x, SPA_VDEVBITS, SM2_RUN_BITS) + 1) +#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, SPA_VDEVBITS, SM2_RUN_BITS) +#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, SPA_VDEVBITS) +#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, SPA_VDEVBITS) +#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1) +#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1) +#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS) +#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS) +#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL) +#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL) + +boolean_t sm_entry_is_debug(uint64_t e); +boolean_t sm_entry_is_single_word(uint64_t e); +boolean_t sm_entry_is_double_word(uint64_t e); + +typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg); int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg); @@ -154,7 +204,9 @@ uint64_t space_map_allocated(space_map_t *sm); uint64_t space_map_length(space_map_t *sm); void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - dmu_tx_t *tx); + uint64_t vdev_id, dmu_tx_t *tx); +uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, + uint64_t vdev_id); void 
space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx); uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx); void space_map_free(space_map_t *sm, dmu_tx_t *tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h index e583d61eac2f..bf3b269d707d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_TXG_IMPL_H @@ -92,6 +92,7 @@ typedef struct tx_state { kmutex_t tx_sync_lock; /* protects the rest of this struct */ uint64_t tx_open_txg; /* currently open txg id */ + uint64_t tx_quiescing_txg; /* currently quiescing txg id */ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ uint64_t tx_syncing_txg; /* currently syncing txg id */ uint64_t tx_synced_txg; /* last synced txg id */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index 16436b7c022f..4a3af854d465 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_IMPL_H @@ -59,6 +59,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t; struct abd; extern int zfs_vdev_queue_depth_pct; +extern int zfs_vdev_def_queue_depth; extern uint32_t zfs_vdev_async_write_max_active; /* @@ -79,6 +80,12 @@ typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg); typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, vdev_remap_cb_t callback, void *arg); +/* + * Given a target vdev, translates the logical range "in" to the physical + * range "res" + */ +typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in, + range_seg_t *res); typedef struct vdev_ops { vdev_open_func_t *vdev_op_open; @@ -91,6 +98,11 @@ typedef struct vdev_ops { vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; + /* + * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves. + * Used when initializing vdevs. Isn't used by leaf ops. + */ + vdev_xlation_func_t *vdev_op_xlate; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -251,6 +263,24 @@ struct vdev { /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ + + boolean_t vdev_initialize_exit_wanted; + vdev_initializing_state_t vdev_initialize_state; + kthread_t *vdev_initialize_thread; + /* Protects vdev_initialize_thread and vdev_initialize_state. 
*/ + kmutex_t vdev_initialize_lock; + kcondvar_t vdev_initialize_cv; + uint64_t vdev_initialize_offset[TXG_SIZE]; + uint64_t vdev_initialize_last_offset; + range_tree_t *vdev_initialize_tree; /* valid while initializing */ + uint64_t vdev_initialize_bytes_est; + uint64_t vdev_initialize_bytes_done; + time_t vdev_initialize_action_time; /* start and end time */ + + /* for limiting outstanding I/Os */ + kmutex_t vdev_initialize_io_lock; + kcondvar_t vdev_initialize_io_cv; + uint64_t vdev_initialize_inflight; /* * Values stored in the config for an indirect or removing vdev. @@ -470,6 +500,8 @@ extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ +extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, + range_seg_t *out); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h new file mode 100644 index 000000000000..db4b0572cd60 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_INITIALIZE_H +#define _SYS_VDEV_INITIALIZE_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vdev_initialize(vdev_t *vd); +extern void vdev_initialize_stop(vdev_t *vd, + vdev_initializing_state_t tgt_state); +extern void vdev_initialize_stop_all(vdev_t *vd, + vdev_initializing_state_t tgt_state); +extern void vdev_initialize_restart(vdev_t *vd); +extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, + range_seg_t *physical_rs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INITIALIZE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h index a29ae586102e..3962237afdab 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h @@ -30,7 +30,7 @@ extern "C" { #endif typedef struct spa_vdev_removal { - vdev_t *svr_vdev; + uint64_t svr_vdev_id; uint64_t svr_max_offset_to_sync[TXG_SIZE]; /* Thread performing a vdev removal. 
*/ kthread_t *svr_thread; @@ -86,6 +86,9 @@ extern void spa_vdev_remove_suspend(spa_t *); extern int spa_vdev_remove_cancel(spa_t *); extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr); +extern int vdev_removal_max_span; +extern int zfs_remove_max_segment; + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h index 3ea0da4a1d33..04606bda48db 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h @@ -146,4 +146,7 @@ extern struct mtx zfs_debug_mtx; #define sys_shutdown rebooting +#define noinline __attribute__((noinline)) +#define likely(x) __builtin_expect((x), 1) + #endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h index b04b24f17f8b..9cbfc26b64e2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_DEBUG_H @@ -57,7 +57,7 @@ extern boolean_t zfs_free_leak_on_eio; #define ZFS_DEBUG_DNODE_VERIFY (1 << 2) #define ZFS_DEBUG_SNAPNAMES (1 << 3) #define ZFS_DEBUG_MODIFY (1 << 4) -#define ZFS_DEBUG_SPA (1 << 5) +/* 1<<5 was previously used, try not to reuse */ #define ZFS_DEBUG_ZIO_FREE (1 << 6) #define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7) #define ZFS_DEBUG_METASLAB_VERIFY (1 << 8) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index 216b55b6c3ce..80a24b436a01 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
* Copyright 2016 Toomas Soome <tsoome@me.com> */ @@ -217,7 +217,7 @@ enum zio_child { #define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT) #define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL) #define ZIO_CHILD_ALL_BITS \ - (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ + (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT) enum zio_wait_type { @@ -356,7 +356,7 @@ typedef struct zio_transform { struct zio_transform *zt_next; } zio_transform_t; -typedef int zio_pipe_stage_t(zio_t *zio); +typedef zio_t *zio_pipe_stage_t(zio_t *zio); /* * The io_reexecute flags are distinct from io_flags because the child must @@ -489,6 +489,7 @@ struct zio { void *io_waiter; kmutex_t io_lock; kcondvar_t io_cv; + int io_allocator; /* FMA state */ zio_cksum_report_t *io_cksum_report; @@ -550,8 +551,8 @@ extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, uint64_t size, enum zio_flag flags); -extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, - blkptr_t *old_bp, uint64_t size, boolean_t *slog); +extern int zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, + blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t *slog); extern void zio_flush(zio_t *zio, vdev_t *vd); extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size); @@ -586,7 +587,7 @@ extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, zio_done_func_t *done, void *priv); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, - struct abd *data, uint64_t size, int type, zio_priority_t priority, + struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *priv); extern void zio_vdev_io_bypass(zio_t *zio); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h index 4db05ac77598..ebe05a09dc4e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h @@ -13,7 +13,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. */ #ifndef _ZIO_PRIORITY_H #define _ZIO_PRIORITY_H @@ -30,6 +30,7 @@ typedef enum zio_priority { ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ ZIO_PRIORITY_TRIM, /* free requests used for TRIM */ ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ + ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ ZIO_PRIORITY_NUM_QUEUEABLE, ZIO_PRIORITY_NOW /* non-queued i/os (e.g. 
free) */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c index 64b9c0cb3510..62d215aa4626 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c @@ -450,6 +450,30 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) } } +static boolean_t +txg_is_syncing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_syncing_txg != 0); +} + +static boolean_t +txg_is_quiescing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiescing_txg != 0); +} + +static boolean_t +txg_has_quiesced_to_sync(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiesced_txg != 0); +} + static void txg_sync_thread(void *arg) { @@ -476,7 +500,7 @@ txg_sync_thread(void *arg) while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - tx->tx_quiesced_txg == 0 && + !txg_has_quiesced_to_sync(dp) && dp->dp_dirty_total < zfs_dirty_data_sync) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); @@ -489,7 +513,7 @@ txg_sync_thread(void *arg) * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. */ - while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { + while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; cv_broadcast(&tx->tx_quiesce_more_cv); @@ -504,6 +528,7 @@ txg_sync_thread(void *arg) * us. This may cause the quiescing thread to now be * able to quiesce another txg, so we must signal it. */ + ASSERT(tx->tx_quiesced_txg != 0); txg = tx->tx_quiesced_txg; tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; @@ -552,7 +577,7 @@ txg_quiesce_thread(void *arg) */ while (!tx->tx_exiting && (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || - tx->tx_quiesced_txg != 0)) + txg_has_quiesced_to_sync(dp))) txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); if (tx->tx_exiting) @@ -562,6 +587,8 @@ txg_quiesce_thread(void *arg) dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + tx->tx_quiescing_txg = txg; + mutex_exit(&tx->tx_sync_lock); txg_quiesce(dp, txg); mutex_enter(&tx->tx_sync_lock); @@ -570,6 +597,7 @@ txg_quiesce_thread(void *arg) * Hand this txg off to the sync thread. 
*/ dprintf("quiesce done, handing off txg %llu\n", txg); + tx->tx_quiescing_txg = 0; tx->tx_quiesced_txg = txg; DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_sync_more_cv); @@ -667,7 +695,8 @@ txg_kick(dsl_pool_t *dp) ASSERT(!dsl_pool_config_held(dp)); mutex_enter(&tx->tx_sync_lock); - if (tx->tx_syncing_txg == 0 && + if (!txg_is_syncing(dp) && + !txg_is_quiescing(dp) && tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && tx->tx_sync_txg_waiting <= tx->tx_synced_txg && tx->tx_quiesced_txg <= tx->tx_synced_txg) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c index fbe7b619a29a..d33f451938b8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c @@ -42,14 +42,10 @@ typedef struct unique { static int unique_compare(const void *a, const void *b) { - const unique_t *una = a; - const unique_t *unb = b; - - if (una->un_value < unb->un_value) - return (-1); - if (una->un_value > unb->un_value) - return (+1); - return (0); + const unique_t *una = (const unique_t *)a; + const unique_t *unb = (const unique_t *)b; + + return (AVL_CMP(una->un_value, unb->un_value)); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index e17243a8c598..1baea65c5fa3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -51,6 +51,7 @@ #include <sys/dsl_scan.h> #include <sys/abd.h> #include <sys/trim_map.h> +#include <sys/vdev_initialize.h> SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); @@ -162,24 +163,30 @@ static vdev_ops_t *vdev_ops_table[] = { }; -/* maximum number of metaslabs per top-level vdev */ +/* target number of metaslabs per top-level vdev */ int vdev_max_ms_count = 200; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count, CTLFLAG_RDTUN, &vdev_max_ms_count, 0, "Maximum number of metaslabs per top-level vdev"); -/* minimum amount of metaslabs per top-level vdev */ +/* minimum number of metaslabs per top-level vdev */ int vdev_min_ms_count = 16; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RDTUN, &vdev_min_ms_count, 0, "Minimum number of metaslabs per top-level vdev"); -/* see comment in vdev_metaslab_set_size() */ +/* practical upper limit of total metaslabs per top-level vdev */ +int vdev_ms_count_limit = 1ULL << 17; + +/* lower limit for metaslab size (512M) */ int vdev_default_ms_shift = 29; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RDTUN, &vdev_default_ms_shift, 0, "Shift between vdev size and number of metaslabs"); +/* upper limit for metaslab size (256G) */ +int vdev_max_ms_shift = 38; + boolean_t vdev_validate_skip = B_FALSE; /* @@ -289,6 +296,14 @@ vdev_getops(const char *type) return (ops); } +/* ARGSUSED */ +void +vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) +{ + res->rs_start = in->rs_start; + res->rs_end = in->rs_end; +} + /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. 
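For reference, the AVL_CMP and AVL_PCMP helpers used by the rewritten comparators in this diff (space_reftree_compare, unique_compare) are presumably the branch-free three-way compares from sys/avl.h, along these lines:

/* presumed definitions, shown for illustration only */
#define AVL_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
#define AVL_PCMP(a, b) \
	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))

Each evaluates to -1, 0, or +1 without branching; likely(cmp) hints that keys usually differ, and AVL_PCMP falls back to address order so that entries with equal keys still sort into a stable total order.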
@@ -560,7 +575,11 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); - + mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); + for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL); } @@ -752,7 +771,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); vd->vdev_mg = metaslab_group_create(islog ? - spa_log_class(spa) : spa_normal_class(spa), vd); + spa_log_class(spa) : spa_normal_class(spa), vd, + spa->spa_alloc_count); } if (vd->vdev_ops->vdev_op_leaf && @@ -832,6 +852,7 @@ void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the @@ -862,6 +883,7 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + ASSERT(vd->vdev_initialize_thread == NULL); /* * Discard allocation state. @@ -935,6 +957,10 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); + mutex_destroy(&vd->vdev_initialize_lock); + mutex_destroy(&vd->vdev_initialize_io_lock); + cv_destroy(&vd->vdev_initialize_io_cv); + cv_destroy(&vd->vdev_initialize_cv); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; @@ -987,6 +1013,32 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; + /* + * State which may be set on a top-level vdev that's in the + * process of being removed. 
+ */ + ASSERT0(tvd->vdev_indirect_config.vic_births_object); + ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); + ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); + ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); + ASSERT3P(tvd->vdev_indirect_births, ==, NULL); + ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); + ASSERT0(tvd->vdev_removing); + tvd->vdev_removing = svd->vdev_removing; + tvd->vdev_indirect_config = svd->vdev_indirect_config; + tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; + tvd->vdev_indirect_births = svd->vdev_indirect_births; + range_tree_swap(&svd->vdev_obsolete_segments, + &tvd->vdev_obsolete_segments); + tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; + svd->vdev_indirect_config.vic_mapping_object = 0; + svd->vdev_indirect_config.vic_births_object = 0; + svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; + svd->vdev_indirect_mapping = NULL; + svd->vdev_indirect_births = NULL; + svd->vdev_obsolete_sm = NULL; + svd->vdev_removing = 0; + for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); @@ -1140,7 +1192,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) vd->vdev_ms = mspp; vd->vdev_ms_count = newc; - for (m = oldc; m < newc; m++) { uint64_t object = 0; @@ -1725,7 +1776,8 @@ vdev_validate(vdev_t *vd) if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); - vdev_dbgmsg(vd, "vdev_validate: failed reading config"); + vdev_dbgmsg(vd, "vdev_validate: failed reading config for " + "txg %llu", (u_longlong_t)txg); return (0); } @@ -2121,34 +2173,53 @@ void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; - uint64_t ms_shift = 0; + uint64_t ms_count = asize >> vdev_default_ms_shift; + uint64_t ms_shift; /* - * For vdevs that are bigger than 8G the metaslab size varies in - * a way that the number of metaslabs increases in powers of two, - * linearly in terms of vdev_asize, starting from 16 metaslabs. - * So for vdev_asize of 8G we get 16 metaslabs, for 16G, we get 32, - * and so on, until we hit the maximum metaslab count limit - * [vdev_max_ms_count] from which point the metaslab count stays - * the same. + * There are two dimensions to the metaslab sizing calculation: + * the size of the metaslab and the count of metaslabs per vdev. + * In general, we aim for vdev_max_ms_count (200) metaslabs. The + * ranges of the dimensions are as follows: + * + * 2^29 <= ms_size <= 2^38 + * 16 <= ms_count <= 131,072 + * + * On the lower end of vdev sizes, we aim for metaslab sizes of + * at least 512MB (2^29) to minimize fragmentation effects when + * testing with smaller devices. However, the count constraint + * of at least 16 metaslabs will override this minimum size goal. + * + * On the upper end of vdev sizes, we aim for a maximum metaslab + * size of 256GB. However, we will cap the total count to 2^17 + * metaslabs to keep our memory footprint in check. + * + * The net effect of applying the above constraints is summarized below. + * + * vdev size metaslab count + * -------------|----------------- + * < 8GB ~16 + * 8GB - 100GB one per 512MB + * 100GB - 50TB ~200 + * 50TB - 32PB one per 256GB + * > 32PB ~131,072 + * ------------------------------- */ - ms_shift = vdev_default_ms_shift; - if ((asize >> ms_shift) < vdev_min_ms_count) { - /* - * For devices that are less than 8G we want to have - * exactly 16 metaslabs.
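As a worked check of the new sizing logic and the table above (arithmetic under the quoted tunables, not text from the source): a 1 TB vdev has asize = 2^40, so ms_count = 2^40 >> 29 = 2048, which exceeds vdev_max_ms_count (200). ms_shift therefore becomes highbit64(2^40 / 200) = 33, i.e. 8 GB metaslabs, and the vdev ends up with 2^40 >> 33 = 128 of them; that is the ~200 target after rounding down to a power-of-two shift.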
We don't want less as integer - * division rounds down, so less metaslabs mean more - * wasted space. We don't want more as these vdevs are - * small and in the likely event that we are running - * out of space, the SPA will have a hard time finding - * space due to fragmentation. - */ + if (ms_count < vdev_min_ms_count) ms_shift = highbit64(asize / vdev_min_ms_count); - ms_shift = MAX(ms_shift, SPA_MAXBLOCKSHIFT); - - } else if ((asize >> ms_shift) > vdev_max_ms_count) { + else if (ms_count > vdev_max_ms_count) ms_shift = highbit64(asize / vdev_max_ms_count); + else + ms_shift = vdev_default_ms_shift; + + if (ms_shift < SPA_MAXBLOCKSHIFT) { + ms_shift = SPA_MAXBLOCKSHIFT; + } else if (ms_shift > vdev_max_ms_shift) { + ms_shift = vdev_max_ms_shift; + /* cap the total count to constrain memory footprint */ + if ((asize >> ms_shift) > vdev_ms_count_limit) + ms_shift = highbit64(asize / vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; @@ -2647,7 +2718,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); - space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); + space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); @@ -3003,7 +3074,8 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) ASSERT(vdev_is_concrete(vd)); - while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) + while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) + != NULL) metaslab_sync_done(msp, txg); if (reassess) @@ -3229,6 +3301,15 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } + /* Restart initializing if necessary */ + mutex_enter(&vd->vdev_initialize_lock); + if (vdev_writeable(vd) && + vd->vdev_initialize_thread == NULL && + vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { + (void) vdev_initialize(vd); + } + mutex_exit(&vd->vdev_initialize_lock); + if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) @@ -3531,8 +3612,18 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + /* + * Report initializing progress. Since we don't have the + * initializing locks held, this is only an estimate (although a + * fairly accurate one). + */ + vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; + vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; + vs->vs_initialize_state = vd->vdev_initialize_state; + vs->vs_initialize_action_time = vd->vdev_initialize_action_time; + } /* * Report expandable space on top-level, non-auxiliary devices only. 
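The sizing policy above reduces to a small pure function. The following standalone userland sketch mirrors the clamping logic in the hunk; the tunable values (vdev_min_ms_count = 16, vdev_max_ms_count = 200, vdev_default_ms_shift = 29, vdev_max_ms_shift = 38, vdev_ms_count_limit = 2^17, SPA_MAXBLOCKSHIFT = 24) are assumptions taken from the comment, and highbit64() is modeled on the illumos definition (position of the highest set bit, counting from 1):

#include <stdio.h>
#include <stdint.h>

#define SPA_MAXBLOCKSHIFT       24              /* 16MB max block */
static uint64_t vdev_min_ms_count = 16;
static uint64_t vdev_max_ms_count = 200;
static uint64_t vdev_default_ms_shift = 29;     /* 512MB */
static uint64_t vdev_max_ms_shift = 38;         /* 256GB */
static uint64_t vdev_ms_count_limit = 1ULL << 17;

static int
highbit64(uint64_t i)
{
        /* illumos semantics: highbit64(1) == 1, highbit64(0) == 0 */
        return (i == 0 ? 0 : 64 - __builtin_clzll(i));
}

static uint64_t
ms_shift_for(uint64_t asize)
{
        uint64_t ms_count = asize >> vdev_default_ms_shift;
        uint64_t ms_shift;

        if (ms_count < vdev_min_ms_count)
                ms_shift = highbit64(asize / vdev_min_ms_count);
        else if (ms_count > vdev_max_ms_count)
                ms_shift = highbit64(asize / vdev_max_ms_count);
        else
                ms_shift = vdev_default_ms_shift;

        if (ms_shift < SPA_MAXBLOCKSHIFT) {
                ms_shift = SPA_MAXBLOCKSHIFT;
        } else if (ms_shift > vdev_max_ms_shift) {
                ms_shift = vdev_max_ms_shift;
                /* cap the total count to constrain memory footprint */
                if ((asize >> ms_shift) > vdev_ms_count_limit)
                        ms_shift = highbit64(asize / vdev_ms_count_limit);
        }
        return (ms_shift);
}

int
main(void)
{
        /* 8GB, 128GB, 32TB, 64PB sample vdev sizes */
        uint64_t sizes[] = { 1ULL << 33, 1ULL << 37, 1ULL << 45, 1ULL << 56 };

        for (size_t i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++) {
                uint64_t shift = ms_shift_for(sizes[i]);
                printf("asize %llu: ms_shift %llu, ~%llu metaslabs\n",
                    (unsigned long long)sizes[i],
                    (unsigned long long)shift,
                    (unsigned long long)(sizes[i] >> shift));
        }
        return (0);
}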
* The expandable space is reported in terms of metaslab sized units @@ -4193,11 +4284,11 @@ vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && - vdev_is_concrete(vd)) { + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c index 26828e069d7d..be24cde54b9b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c @@ -841,6 +841,7 @@ vdev_ops_t vdev_disk_ops = { vdev_disk_hold, vdev_disk_rele, NULL, + vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c index b5caee2ec79e..c198d77e21d4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -271,6 +271,7 @@ vdev_ops_t vdev_file_ops = { vdev_file_hold, vdev_file_rele, NULL, + vdev_default_xlate, VDEV_TYPE_FILE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -291,6 +292,7 @@ vdev_ops_t vdev_disk_ops = { vdev_file_hold, vdev_file_rele, NULL, + vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index 29a649aceb00..aa8a400f2d78 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -1151,6 +1151,7 @@ vdev_ops_t vdev_geom_ops = { vdev_geom_hold, vdev_geom_rele, NULL, + vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c index 62b92c677292..c4e4835447f5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c @@ -23,6 +23,7 @@ #include <sys/vdev_impl.h> #include <sys/fs/zfs.h> #include <sys/zio.h> +#include <sys/zio_checksum.h> #include <sys/metaslab.h> #include <sys/refcount.h> #include <sys/dmu.h> @@ -46,10 +47,11 @@ * "vdev_remap" operation that executes a callback on each contiguous * segment of the new location. This function is used in multiple ways: * - * - reads and repair writes to this device use the callback to create - * a child io for each mapped segment. + * - i/os to this vdev use the callback to determine where the + * data is now located, and issue child i/os for each segment's new + * location. * - * - frees and claims to this device use the callback to free or claim + * - frees and claims to this vdev use the callback to free or claim * each mapped segment. 
(Note that we don't actually need to claim * log blocks on indirect vdevs, because we don't allocate to * removing vdevs. However, zdb uses zio_claim() for its leak @@ -204,6 +206,94 @@ uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; int zfs_condense_indirect_commit_entry_delay_ticks = 0; /* + * If a split block contains more than this many segments, consider it too + * computationally expensive to check all (2^num_segments) possible + * combinations. Instead, try at most 2^_segments_max randomly-selected + * combinations. + * + * This is reasonable if only a few segment copies are damaged and the + * majority of segment copies are good. This allows all the segment copies to + * participate fairly in the reconstruction and prevents the repeated use of + * one bad copy. + */ +int zfs_reconstruct_indirect_segments_max = 10; + +/* + * The indirect_child_t represents the vdev that we will read from, when we + * need to read all copies of the data (e.g. for scrub or reconstruction). + * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), + * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, + * ic_vdev is a child of the mirror. + */ +typedef struct indirect_child { + abd_t *ic_data; + vdev_t *ic_vdev; +} indirect_child_t; + +/* + * The indirect_split_t represents one mapped segment of an i/o to the + * indirect vdev. For non-split (contiguously-mapped) blocks, there will be + * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. + * For split blocks, there will be several of these. + */ +typedef struct indirect_split { + list_node_t is_node; /* link on iv_splits */ + + /* + * is_split_offset is the offset into the i/o. + * This is the sum of the previous splits' is_size's. + */ + uint64_t is_split_offset; + + vdev_t *is_vdev; /* top-level vdev */ + uint64_t is_target_offset; /* offset on is_vdev */ + uint64_t is_size; + int is_children; /* number of entries in is_child[] */ + + /* + * is_good_child is the child that we are currently using to + * attempt reconstruction. + */ + int is_good_child; + + indirect_child_t is_child[1]; /* variable-length */ +} indirect_split_t; + +/* + * The indirect_vsd_t is associated with each i/o to the indirect vdev. + * It is the "Vdev-Specific Data" in the zio_t's io_vsd. + */ +typedef struct indirect_vsd { + boolean_t iv_split_block; + boolean_t iv_reconstruct; + + list_t iv_splits; /* list of indirect_split_t's */ +} indirect_vsd_t; + +static void +vdev_indirect_map_free(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + indirect_split_t *is; + while ((is = list_head(&iv->iv_splits)) != NULL) { + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + if (ic->ic_data != NULL) + abd_free(ic->ic_data); + } + list_remove(&iv->iv_splits, is); + kmem_free(is, + offsetof(indirect_split_t, is_child[is->is_children])); + } + kmem_free(iv, sizeof (*iv)); +} + +static const zio_vsd_ops_t vdev_indirect_vsd_ops = { + vdev_indirect_map_free, + zio_vsd_default_cksum_report +}; +/* * Mark the given offset and size as being obsolete. 
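The is_child[1] tail declared above is the pre-C99 flexible-array idiom: the structure carries a one-element array, and the allocation below sizes it with offsetof so that exactly is_children entries fit. A minimal userland sketch of the same pattern (malloc/calloc standing in for kmem_zalloc; offsetof with a runtime index is the same GNU-style usage the kernel code relies on):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

typedef struct child {
        void    *c_data;
} child_t;

typedef struct split {
        size_t  s_size;
        int     s_children;     /* number of entries in s_child[] */
        child_t s_child[1];     /* variable-length tail */
} split_t;

static split_t *
split_alloc(int n)
{
        /*
         * offsetof(split_t, s_child[n]) is the header size plus n tail
         * elements; the declared s_child[1] length is never relied on.
         */
        split_t *s = calloc(1, offsetof(split_t, s_child[n]));

        if (s != NULL)
                s->s_children = n;
        return (s);
}

int
main(void)
{
        split_t *s = split_alloc(3);

        if (s == NULL)
                return (1);
        printf("allocated %zu bytes for %d children\n",
            offsetof(split_t, s_child[s->s_children]), s->s_children);
        free(s);
        return (0);
}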
*/ void @@ -729,7 +819,7 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) space_map_object(vd->vdev_obsolete_sm)); space_map_write(vd->vdev_obsolete_sm, - vd->vdev_obsolete_segments, SM_ALLOC, tx); + vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); space_map_update(vd->vdev_obsolete_sm); range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); } @@ -818,12 +908,6 @@ vdev_indirect_close(vdev_t *vd) } /* ARGSUSED */ -static void -vdev_indirect_io_done(zio_t *zio) -{ -} - -/* ARGSUSED */ static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) @@ -1067,41 +1151,475 @@ vdev_indirect_child_io_done(zio_t *zio) abd_put(zio->io_abd); } +/* + * This is a callback for vdev_indirect_remap() which allocates an + * indirect_split_t for each split segment and adds it to iv_splits. + */ static void -vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset, +vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { zio_t *zio = arg; + indirect_vsd_t *iv = zio->io_vsd; ASSERT3P(vd, !=, NULL); if (vd->vdev_ops == &vdev_indirect_ops) return; - zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, - abd_get_offset(zio->io_abd, split_offset), - size, zio->io_type, zio->io_priority, - 0, vdev_indirect_child_io_done, zio)); + int n = 1; + if (vd->vdev_ops == &vdev_mirror_ops) + n = vd->vdev_children; + + indirect_split_t *is = + kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); + + is->is_children = n; + is->is_size = size; + is->is_split_offset = split_offset; + is->is_target_offset = offset; + is->is_vdev = vd; + + /* + * Note that we only consider multiple copies of the data for + * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even + * though they use the same ops as mirror, because there's only one + * "good" copy under the replacing/spare. + */ + if (vd->vdev_ops == &vdev_mirror_ops) { + for (int i = 0; i < n; i++) { + is->is_child[i].ic_vdev = vd->vdev_child[i]; + } + } else { + is->is_child[0].ic_vdev = vd; + } + + list_insert_tail(&iv->iv_splits, is); +} + +static void +vdev_indirect_read_split_done(zio_t *zio) +{ + indirect_child_t *ic = zio->io_private; + + if (zio->io_error != 0) { + /* + * Clear ic_data to indicate that we do not have data for this + * child. + */ + abd_free(ic->ic_data); + ic->ic_data = NULL; + } +} + +/* + * Issue reads for all copies (mirror children) of all splits. + */ +static void +vdev_indirect_read_all(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + for (int i = 0; i < is->is_children; i++) { + indirect_child_t *ic = &is->is_child[i]; + + if (!vdev_readable(ic->ic_vdev)) + continue; + + /* + * Note, we may read from a child whose DTL + * indicates that the data may not be present here. + * While this might result in a few i/os that will + * likely return incorrect data, it simplifies the + * code since we can treat scrub and resilver + * identically. (The incorrect data will be + * detected and ignored when we verify the + * checksum.) 
+ */ + + ic->ic_data = abd_alloc_sametype(zio->io_abd, + is->is_size); + + zio_nowait(zio_vdev_child_io(zio, NULL, + ic->ic_vdev, is->is_target_offset, ic->ic_data, + is->is_size, zio->io_type, zio->io_priority, 0, + vdev_indirect_read_split_done, ic)); + } + } + iv->iv_reconstruct = B_TRUE; } static void vdev_indirect_io_start(zio_t *zio) { spa_t *spa = zio->io_spa; + indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); + list_create(&iv->iv_splits, + sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); + + zio->io_vsd = iv; + zio->io_vsd_ops = &vdev_indirect_vsd_ops; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (zio->io_type != ZIO_TYPE_READ) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); - ASSERT((zio->io_flags & - (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0); + /* + * Note: this code can handle other kinds of writes, + * but we don't expect them. + */ + ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | + ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); } vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, - vdev_indirect_io_start_cb, zio); + vdev_indirect_gather_splits, zio); + + indirect_split_t *first = list_head(&iv->iv_splits); + if (first->is_size == zio->io_size) { + /* + * This is not a split block; we are pointing to the entire + * data, which will checksum the same as the original data. + * Pass the BP down so that the child i/o can verify the + * checksum, and try a different location if available + * (e.g. on a mirror). + * + * While this special case could be handled the same as the + * general (split block) case, doing it this way ensures + * that the vast majority of blocks on indirect vdevs + * (which are not split) are handled identically to blocks + * on non-indirect vdevs. This allows us to be less strict + * about performance in the general (but rare) case. + */ + ASSERT0(first->is_split_offset); + ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + first->is_vdev, first->is_target_offset, + abd_get_offset(zio->io_abd, 0), + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_indirect_child_io_done, zio)); + } else { + iv->iv_split_block = B_TRUE; + if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { + /* + * Read all copies. Note that for simplicity, + * we don't bother consulting the DTL in the + * resilver case. + */ + vdev_indirect_read_all(zio); + } else { + /* + * Read one copy of each split segment, from the + * top-level vdev. Since we don't know the + * checksum of each split individually, the child + * zio can't ensure that we get the right data. + * E.g. if it's a mirror, it will just read from a + * random (healthy) leaf vdev. We have to verify + * the checksum in vdev_indirect_io_done(). + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + zio_nowait(zio_vdev_child_io(zio, NULL, + is->is_vdev, is->is_target_offset, + abd_get_offset(zio->io_abd, + is->is_split_offset), + is->is_size, zio->io_type, + zio->io_priority, 0, + vdev_indirect_child_io_done, zio)); + } + } + } zio_execute(zio); } +/* + * Report a checksum error for a child. 
+ */ +static void +vdev_indirect_checksum_error(zio_t *zio, + indirect_split_t *is, indirect_child_t *ic) +{ + vdev_t *vd = ic->ic_vdev; + + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + + zio_bad_cksum_t zbc = { 0 }; + void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size); + abd_t *good_abd = is->is_child[is->is_good_child].ic_data; + void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size); + zfs_ereport_post_checksum(zio->io_spa, vd, zio, + is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc); + abd_return_buf(ic->ic_data, bad_buf, is->is_size); + abd_return_buf(good_abd, good_buf, is->is_size); +} + +/* + * Issue repair i/os for any incorrect copies. We do this by comparing + * each split segment's correct data (is_good_child's ic_data) with each + * other copy of the data. If they differ, then we overwrite the bad data + * with the good copy. Note that we do this without regard for the DTL's, + * which simplifies this code and also issues the optimal number of writes + * (based on which copies actually read bad data, as opposed to which we + * think might be wrong). For the same reason, we always use + * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). + */ +static void +vdev_indirect_repair(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + enum zio_flag flags = ZIO_FLAG_IO_REPAIR; + + if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) + flags |= ZIO_FLAG_SELF_HEAL; + + if (!spa_writeable(zio->io_spa)) + return; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + indirect_child_t *good_child = &is->is_child[is->is_good_child]; + + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + if (ic == good_child) + continue; + if (ic->ic_data == NULL) + continue; + if (abd_cmp(good_child->ic_data, ic->ic_data, + is->is_size) == 0) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, + ic->ic_vdev, is->is_target_offset, + good_child->ic_data, is->is_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, + NULL, NULL)); + + vdev_indirect_checksum_error(zio, is, ic); + } + } +} + +/* + * Report checksum errors on all children that we read from. + */ +static void +vdev_indirect_all_checksum_errors(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + + if (ic->ic_data == NULL) + continue; + + vdev_t *vd = ic->ic_vdev; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + + zfs_ereport_post_checksum(zio->io_spa, vd, zio, + is->is_target_offset, is->is_size, + NULL, NULL, NULL); + } + } +} + +/* + * This function is called when we have read all copies of the data and need + * to try to find a combination of copies that gives us the right checksum. + * + * If we pointed to any mirror vdevs, this effectively does the job of the + * mirror. The mirror vdev code can't do its own job because we don't know + * the checksum of each split segment individually. We have to try every + * combination of copies of split segments, until we find one that checksums + * correctly. 
(Or until we have tried all combinations, or have tried + * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we + * set io_error to ECKSUM to propagate the error up to the user.) + * + * For example, if we have 3 segments in the split, + * and each points to a 2-way mirror, we will have the following pieces of + * data: + * + * | mirror child + * split | [0] [1] + * ======|===================== + * A | data_A_0 data_A_1 + * B | data_B_0 data_B_1 + * C | data_C_0 data_C_1 + * + * We will try the following (mirror children)^(number of splits) (2^3=8) + * combinations, which is similar to bitwise-little-endian counting in + * binary. In general each "digit" corresponds to a split segment, and the + * base of each digit is is_children, which can be different for each + * digit. + * + * "low bit" "high bit" + * v v + * data_A_0 data_B_0 data_C_0 + * data_A_1 data_B_0 data_C_0 + * data_A_0 data_B_1 data_C_0 + * data_A_1 data_B_1 data_C_0 + * data_A_0 data_B_0 data_C_1 + * data_A_1 data_B_0 data_C_1 + * data_A_0 data_B_1 data_C_1 + * data_A_1 data_B_1 data_C_1 + * + * Note that the split segments may be on the same or different top-level + * vdevs. In either case, we try lots of combinations (see + * zfs_reconstruct_indirect_segments_max). This ensures that if a mirror has + * small silent errors on all of its children, we can still reconstruct the + * correct data, as long as those errors are at sufficiently-separated + * offsets (specifically, separated by the largest block size - default of + * 128KB, but up to 16MB). + */ +static void +vdev_indirect_reconstruct_io_done(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + uint64_t attempts = 0; + uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max; + int segments = 0; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) + segments++; + + for (;;) { + /* copy data from splits to main zio */ + int ret; + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + + /* + * If this child failed, its ic_data will be NULL. + * Skip this combination. + */ + if (is->is_child[is->is_good_child].ic_data == NULL) { + ret = EIO; + goto next; + } + + abd_copy_off(zio->io_abd, + is->is_child[is->is_good_child].ic_data, + is->is_split_offset, 0, is->is_size); + } + + /* See if this checksum matches. */ + zio_bad_cksum_t zbc; + ret = zio_checksum_error(zio, &zbc); + if (ret == 0) { + /* Found a matching checksum. Issue repair i/os. */ + vdev_indirect_repair(zio); + zio_checksum_verified(zio); + return; + } + + /* + * Checksum failed; try a different combination of split + * children. + */ + boolean_t more; +next: + more = B_FALSE; + if (segments <= zfs_reconstruct_indirect_segments_max) { + /* + * There are relatively few segments, so + * deterministically check all combinations. We do + * this by adding one to the first split's + * good_child. If it overflows, then "carry over" to + * the next split (like counting in base is_children, + * but each digit can have a different base). 
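The carry loop described above is just an odometer over digits with per-digit bases. A standalone sketch (bases invented for illustration: three segments with 2, 3, and 2 copies) that enumerates all 2*3*2 = 12 combinations exactly the way the good_child walk below does:

#include <stdio.h>

int
main(void)
{
        int bases[] = { 2, 3, 2 };      /* is_children of each segment */
        int digits[] = { 0, 0, 0 };     /* is_good_child of each segment */
        int nseg = 3;

        for (;;) {
                for (int i = 0; i < nseg; i++)
                        printf("%d ", digits[i]);
                printf("\n");

                /* Increment the low digit; carry into the next on overflow. */
                int i;
                for (i = 0; i < nseg; i++) {
                        if (++digits[i] < bases[i])
                                break;          /* no carry; keep counting */
                        digits[i] = 0;          /* overflowed: carry over */
                }
                if (i == nseg)
                        break;                  /* carried off the end: done */
        }
        return (0);
}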
+ */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + is->is_good_child++; + if (is->is_good_child < is->is_children) { + more = B_TRUE; + break; + } + is->is_good_child = 0; + } + } else if (++attempts < attempts_max) { + /* + * There are too many combinations to try all of them + * in a reasonable amount of time, so try a fixed + * number of random combinations, after which we'll + * consider the block unrecoverable. + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + is->is_good_child = + spa_get_random(is->is_children); + } + more = B_TRUE; + } + if (!more) { + /* All combinations failed. */ + zio->io_error = ret; + vdev_indirect_all_checksum_errors(zio); + zio_checksum_verified(zio); + return; + } + } +} + +static void +vdev_indirect_io_done(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + if (iv->iv_reconstruct) { + /* + * We have read all copies of the data (e.g. from mirrors), + * either because this was a scrub/resilver, or because the + * one-copy read didn't checksum correctly. + */ + vdev_indirect_reconstruct_io_done(zio); + return; + } + + if (!iv->iv_split_block) { + /* + * This was not a split block, so we passed the BP down, + * and the checksum was handled by the (one) child zio. + */ + return; + } + + zio_bad_cksum_t zbc; + int ret = zio_checksum_error(zio, &zbc); + if (ret == 0) { + zio_checksum_verified(zio); + return; + } + + /* + * The checksum didn't match. Read all copies of all splits, and + * then we will try to reconstruct. The next time + * vdev_indirect_io_done() is called, iv_reconstruct will be set. + */ + vdev_indirect_read_all(zio); + + zio_vdev_io_redone(zio); +} + vdev_ops_t vdev_indirect_ops = { vdev_indirect_open, vdev_indirect_close, @@ -1113,6 +1631,7 @@ vdev_ops_t vdev_indirect_ops = { NULL, NULL, vdev_indirect_remap, + NULL, VDEV_TYPE_INDIRECT, /* name of this vdev type */ B_FALSE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c index ea80fbc4733f..02999aae7274 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2015, 2017 by Delphix. All rights reserved. */ #include <sys/dmu_tx.h> @@ -536,14 +536,13 @@ typedef struct load_obsolete_space_map_arg { } load_obsolete_space_map_arg_t; static int -load_obsolete_sm_callback(maptype_t type, uint64_t offset, uint64_t size, - void *arg) +load_obsolete_sm_callback(space_map_entry_t *sme, void *arg) { load_obsolete_space_map_arg_t *losma = arg; - ASSERT3S(type, ==, SM_ALLOC); + ASSERT3S(sme->sme_type, ==, SM_ALLOC); vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, - offset, size, losma->losma_counts); + sme->sme_offset, sme->sme_run, losma->losma_counts); return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c new file mode 100644 index 000000000000..a2c39c2868e5 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c @@ -0,0 +1,792 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/txg.h> +#include <sys/vdev_impl.h> +#include <sys/refcount.h> +#include <sys/metaslab_impl.h> +#include <sys/dsl_synctask.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> + +/* + * Maximum number of metaslabs per group that can be initialized + * simultaneously. + */ +int max_initialize_ms = 3; + +/* + * Value that is written to disk during initialization. + */ +uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL; + +/* maximum number of I/Os outstanding per leaf vdev */ +int zfs_initialize_limit = 1; + +/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ +uint64_t zfs_initialize_chunk_size = 1024 * 1024; + +static boolean_t +vdev_initialize_should_stop(vdev_t *vd) +{ + return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || + vd->vdev_detached || vd->vdev_top->vdev_removing); +} + +static void +vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) +{ + /* + * We pass in the guid instead of the vdev_t since the vdev may + * have been freed prior to the sync task being processed. This + * happens when a vdev is detached as we call spa_config_vdev_exit(), + * stop the initializing thread, schedule the sync task, and free + * the vdev. Later when the scheduled sync task is invoked, it would + * find that the vdev has been freed. + */ + uint64_t guid = *(uint64_t *)arg; + uint64_t txg = dmu_tx_get_txg(tx); + kmem_free(arg, sizeof (uint64_t)); + + vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + return; + + uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; + vd->vdev_initialize_offset[txg & TXG_MASK] = 0; + + VERIFY(vd->vdev_leaf_zap != 0); + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + + if (last_offset > 0) { + vd->vdev_initialize_last_offset = last_offset; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, + sizeof (last_offset), 1, &last_offset, tx)); + } + if (vd->vdev_initialize_action_time > 0) { + uint64_t val = (uint64_t)vd->vdev_initialize_action_time; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val), + 1, &val, tx)); + } + + uint64_t initialize_state = vd->vdev_initialize_state; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1, + &initialize_state, tx)); +} + +static void +vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + spa_t *spa = vd->vdev_spa; + + if (new_state == vd->vdev_initialize_state) + return; + + /* + * Copy the vd's guid; it will be freed by the sync task. 
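The guid-instead-of-pointer convention above is worth seeing in isolation: deferred work must carry a stable identifier and re-resolve it when it finally runs, because the object may already be gone. A toy userland sketch of that pattern (the registry, names, and guids here are invented for illustration; spa_lookup_by_guid() is the real counterpart):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define NOBJ    4
static struct obj { uint64_t o_guid; int o_alive; } objs[NOBJ] = {
        { 101, 1 }, { 102, 1 }, { 103, 1 }, { 104, 1 }
};

static struct obj *
lookup_by_guid(uint64_t guid)
{
        for (int i = 0; i < NOBJ; i++)
                if (objs[i].o_alive && objs[i].o_guid == guid)
                        return (&objs[i]);
        return (NULL);
}

static void
deferred_update(void *arg)
{
        /* re-resolve by guid; the pointer may no longer be valid */
        uint64_t guid = *(uint64_t *)arg;
        free(arg);

        struct obj *o = lookup_by_guid(guid);
        if (o == NULL) {
                printf("guid %llu gone; nothing to do\n",
                    (unsigned long long)guid);
                return;
        }
        printf("updating guid %llu\n", (unsigned long long)guid);
}

int
main(void)
{
        uint64_t *arg = malloc(sizeof (uint64_t));

        *arg = 103;
        objs[2].o_alive = 0;    /* object freed before the task runs */
        deferred_update(arg);   /* safely finds nothing */
        return (0);
}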
+ */ + uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + *guid = vd->vdev_guid; + + /* + * If we're suspending, then preserve the original start time. + */ + if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { + vd->vdev_initialize_action_time = gethrestime_sec(); + } + vd->vdev_initialize_state = new_state; + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, + guid, 2, ZFS_SPACE_CHECK_RESERVED, tx); + + switch (new_state) { + case VDEV_INITIALIZE_ACTIVE: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s activated", vd->vdev_path); + break; + case VDEV_INITIALIZE_SUSPENDED: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s suspended", vd->vdev_path); + break; + case VDEV_INITIALIZE_CANCELED: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s canceled", vd->vdev_path); + break; + case VDEV_INITIALIZE_COMPLETE: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s complete", vd->vdev_path); + break; + default: + panic("invalid state %llu", (unsigned long long)new_state); + } + + dmu_tx_commit(tx); +} + +static void +vdev_initialize_cb(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + mutex_enter(&vd->vdev_initialize_io_lock); + if (zio->io_error == ENXIO && !vdev_writeable(vd)) { + /* + * The I/O failed because the vdev was unavailable; roll the + * last offset back. (This works because spa_sync waits on + * spa_txg_zio before it runs sync tasks.) + */ + uint64_t *off = + &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK]; + *off = MIN(*off, zio->io_offset); + } else { + /* + * Since initializing is best-effort, we ignore I/O errors and + * rely on vdev_probe to determine if the errors are more + * critical. + */ + if (zio->io_error != 0) + vd->vdev_stat.vs_initialize_errors++; + + vd->vdev_initialize_bytes_done += zio->io_orig_size; + } + ASSERT3U(vd->vdev_initialize_inflight, >, 0); + vd->vdev_initialize_inflight--; + cv_broadcast(&vd->vdev_initialize_io_cv); + mutex_exit(&vd->vdev_initialize_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} + +/* Takes care of physical writing and limiting # of concurrent ZIOs. */ +static int +vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) +{ + spa_t *spa = vd->vdev_spa; + + /* Limit inflight initializing I/Os */ + mutex_enter(&vd->vdev_initialize_io_lock); + while (vd->vdev_initialize_inflight >= zfs_initialize_limit) { + cv_wait(&vd->vdev_initialize_io_cv, + &vd->vdev_initialize_io_lock); + } + vd->vdev_initialize_inflight++; + mutex_exit(&vd->vdev_initialize_io_lock); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); + mutex_enter(&vd->vdev_initialize_lock); + + if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) { + uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + *guid = vd->vdev_guid; + + /* This is the first write of this txg. */ + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_update_sync, guid, 2, + ZFS_SPACE_CHECK_RESERVED, tx); + } + + /* + * We know the vdev struct will still be around since all + * consumers of vdev_free must stop the initialization first. 
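The inflight limiting in vdev_initialize_write()/vdev_initialize_cb() is a counter guarded by a mutex plus a condition variable: issuers sleep while the counter is at the limit, and completions decrement and broadcast. A minimal pthreads sketch of the same idiom (the limit of 1 mirrors the zfs_initialize_limit default above; the synchronous main() stands in for asynchronous zio completion):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_cv = PTHREAD_COND_INITIALIZER;
static int inflight;
static const int limit = 1;

static void
io_begin(void)
{
        pthread_mutex_lock(&io_lock);
        while (inflight >= limit)       /* sleep until a slot frees up */
                pthread_cond_wait(&io_cv, &io_lock);
        inflight++;
        pthread_mutex_unlock(&io_lock);
}

static void
io_done(void)
{
        pthread_mutex_lock(&io_lock);
        inflight--;
        pthread_cond_broadcast(&io_cv); /* wake any blocked issuer */
        pthread_mutex_unlock(&io_lock);
}

int
main(void)
{
        for (int i = 0; i < 3; i++) {
                io_begin();
                printf("write %d issued (inflight=%d)\n", i, inflight);
                io_done();      /* completion would normally be async */
        }
        return (0);
}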
+ */ + if (vdev_initialize_should_stop(vd)) { + mutex_enter(&vd->vdev_initialize_io_lock); + ASSERT3U(vd->vdev_initialize_inflight, >, 0); + vd->vdev_initialize_inflight--; + mutex_exit(&vd->vdev_initialize_io_lock); + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); + mutex_exit(&vd->vdev_initialize_lock); + dmu_tx_commit(tx); + return (SET_ERROR(EINTR)); + } + mutex_exit(&vd->vdev_initialize_lock); + + vd->vdev_initialize_offset[txg & TXG_MASK] = start + size; + zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start, + size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL, + ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE)); + /* vdev_initialize_cb releases SCL_STATE_ALL */ + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Translate a logical range to the physical range for the specified vdev_t. + * This function is initially called with a leaf vdev and will walk each + * parent vdev until it reaches a top-level vdev. Once the top-level is + * reached the physical range is initialized and the recursive function + * begins to unwind. As it unwinds it calls the parent's vdev specific + * translation function to do the real conversion. + */ +void +vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) +{ + /* + * Walk up the vdev tree + */ + if (vd != vd->vdev_top) { + vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); + } else { + /* + * We've reached the top-level vdev, initialize the + * physical range to the logical range and start to + * unwind. + */ + physical_rs->rs_start = logical_rs->rs_start; + physical_rs->rs_end = logical_rs->rs_end; + return; + } + + vdev_t *pvd = vd->vdev_parent; + ASSERT3P(pvd, !=, NULL); + ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); + + /* + * As this recursive function unwinds, translate the logical + * range into its physical components by calling the + * vdev specific translate function. + */ + range_seg_t intermediate = { 0 }; + pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); + + physical_rs->rs_start = intermediate.rs_start; + physical_rs->rs_end = intermediate.rs_end; +} + +/* + * Callback to fill each ABD chunk with zfs_initialize_value. len must be + * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD + * allocation will guarantee these for us. 
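The recursion in vdev_xlate() is easier to see with a toy tree: recurse to the top, seed the physical range equal to the logical range there, then apply each level's translation while unwinding back toward the leaf. In this sketch each node's translation is a plain offset, standing in for the per-vdev-type vdev_op_xlate callbacks; the structure, not the arithmetic, is the point:

#include <stdio.h>
#include <stdint.h>

typedef struct seg { uint64_t rs_start, rs_end; } seg_t;

typedef struct node {
        struct node     *parent;        /* NULL at the top level */
        uint64_t        shift;          /* this level's translation */
} node_t;

static void
xlate(node_t *n, const seg_t *logical, seg_t *physical)
{
        if (n->parent == NULL) {
                /* top level: physical range starts out equal to logical */
                *physical = *logical;
                return;
        }
        xlate(n->parent, logical, physical);

        /* unwinding: apply this level's translation */
        physical->rs_start += n->shift;
        physical->rs_end += n->shift;
}

int
main(void)
{
        node_t top = { NULL, 0 };
        node_t mid = { &top, 1 << 20 };
        node_t leaf = { &mid, 4096 };
        seg_t log = { 0, 1 << 19 };
        seg_t phys;

        xlate(&leaf, &log, &phys);
        printf("logical [%llu, %llu) -> physical [%llu, %llu)\n",
            (unsigned long long)log.rs_start, (unsigned long long)log.rs_end,
            (unsigned long long)phys.rs_start, (unsigned long long)phys.rs_end);
        return (0);
}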
+ */ +/* ARGSUSED */ +static int +vdev_initialize_block_fill(void *buf, size_t len, void *unused) +{ + ASSERT0(len % sizeof (uint64_t)); + for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { + *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; + } + return (0); +} + +static abd_t * +vdev_initialize_block_alloc() +{ + /* Allocate ABD for filler data */ + abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE); + + ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t)); + (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size, + vdev_initialize_block_fill, NULL); + + return (data); +} + +static void +vdev_initialize_block_free(abd_t *data) +{ + abd_free(data); +} + +static int +vdev_initialize_ranges(vdev_t *vd, abd_t *data) +{ + avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root; + + for (range_seg_t *rs = avl_first(rt); rs != NULL; + rs = AVL_NEXT(rt, rs)) { + uint64_t size = rs->rs_end - rs->rs_start; + + /* Split range into legally-sized physical chunks */ + uint64_t writes_required = + ((size - 1) / zfs_initialize_chunk_size) + 1; + + for (uint64_t w = 0; w < writes_required; w++) { + int error; + + error = vdev_initialize_write(vd, + VDEV_LABEL_START_SIZE + rs->rs_start + + (w * zfs_initialize_chunk_size), + MIN(size - (w * zfs_initialize_chunk_size), + zfs_initialize_chunk_size), data); + if (error != 0) + return (error); + } + } + return (0); +} + +static void +vdev_initialize_ms_load(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + metaslab_load_wait(msp); + if (!msp->ms_loaded) + VERIFY0(metaslab_load(msp)); +} + +static void +vdev_initialize_mg_wait(metaslab_group_t *mg) +{ + ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); + while (mg->mg_initialize_updating) { + cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); + } +} + +static void +vdev_initialize_mg_mark(metaslab_group_t *mg) +{ + ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); + ASSERT(mg->mg_initialize_updating); + + while (mg->mg_ms_initializing >= max_initialize_ms) { + cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); + } + mg->mg_ms_initializing++; + ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms); +} + +/* + * Mark the metaslab as being initialized to prevent any allocations + * on this metaslab. We must also track how many metaslabs are currently + * being initialized within a metaslab group and limit them to prevent + * allocation failures from occurring because all metaslabs are being + * initialized. + */ +static void +vdev_initialize_ms_mark(metaslab_t *msp) +{ + ASSERT(!MUTEX_HELD(&msp->ms_lock)); + metaslab_group_t *mg = msp->ms_group; + + mutex_enter(&mg->mg_ms_initialize_lock); + + /* + * To keep an accurate count of how many threads are initializing + * a specific metaslab group, we only allow one thread to mark + * the metaslab group at a time. This ensures that the value of + * ms_initializing will be accurate when we decide to mark a metaslab + * group as being initialized. To do this we force all other threads + * to wait till the metaslab's mg_initialize_updating flag is no + * longer set. 
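The writes_required arithmetic in vdev_initialize_ranges() above is ordinary ceiling division followed by a MIN() for the short tail write. A standalone sketch with an assumed 2.5MB free segment and the 1MB default chunk size:

#include <stdio.h>
#include <stdint.h>

#define CHUNK   (1024 * 1024)   /* zfs_initialize_chunk_size default */

int
main(void)
{
        uint64_t size = 5 * CHUNK / 2;  /* a 2.5MB free segment */
        /* ceiling division: ((size - 1) / CHUNK) + 1 */
        uint64_t writes_required = ((size - 1) / CHUNK) + 1;

        for (uint64_t w = 0; w < writes_required; w++) {
                uint64_t off = w * CHUNK;
                uint64_t len = (size - off < CHUNK) ? size - off : CHUNK;

                printf("write %llu: offset %llu, length %llu\n",
                    (unsigned long long)w, (unsigned long long)off,
                    (unsigned long long)len);
        }
        return (0);
}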
+ */ + vdev_initialize_mg_wait(mg); + mg->mg_initialize_updating = B_TRUE; + if (msp->ms_initializing == 0) { + vdev_initialize_mg_mark(mg); + } + mutex_enter(&msp->ms_lock); + msp->ms_initializing++; + mutex_exit(&msp->ms_lock); + + mg->mg_initialize_updating = B_FALSE; + cv_broadcast(&mg->mg_ms_initialize_cv); + mutex_exit(&mg->mg_ms_initialize_lock); +} + +static void +vdev_initialize_ms_unmark(metaslab_t *msp) +{ + ASSERT(!MUTEX_HELD(&msp->ms_lock)); + metaslab_group_t *mg = msp->ms_group; + mutex_enter(&mg->mg_ms_initialize_lock); + mutex_enter(&msp->ms_lock); + if (--msp->ms_initializing == 0) { + mg->mg_ms_initializing--; + cv_broadcast(&mg->mg_ms_initialize_cv); + } + mutex_exit(&msp->ms_lock); + mutex_exit(&mg->mg_ms_initialize_lock); +} + +static void +vdev_initialize_calculate_progress(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || + spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); + ASSERT(vd->vdev_leaf_zap != 0); + + vd->vdev_initialize_bytes_est = 0; + vd->vdev_initialize_bytes_done = 0; + + for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + mutex_enter(&msp->ms_lock); + + uint64_t ms_free = msp->ms_size - + space_map_allocated(msp->ms_sm); + + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) + ms_free /= vd->vdev_top->vdev_children; + + /* + * Convert the metaslab range to a physical range + * on our vdev. We use this to determine if we are + * in the middle of this metaslab range. + */ + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = msp->ms_start; + logical_rs.rs_end = msp->ms_start + msp->ms_size; + vdev_xlate(vd, &logical_rs, &physical_rs); + + if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { + vd->vdev_initialize_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } else if (vd->vdev_initialize_last_offset > + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += ms_free; + vd->vdev_initialize_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } + + /* + * If we get here, we're in the middle of initializing this + * metaslab. Load it and walk the free tree for more accurate + * progress estimation. 
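vdev_initialize_calculate_progress() classifies each metaslab's physical range against the last initialized offset in three ways, and only the "in the middle" case forces a metaslab load. A small sketch of that classification (boundary conditions copied from the code that follows; the strings are illustrative):

#include <stdio.h>
#include <stdint.h>

typedef struct seg { uint64_t rs_start, rs_end; } seg_t;

static const char *
classify(uint64_t last_offset, const seg_t *rs)
{
        if (last_offset <= rs->rs_start)
                return ("not started: count all free space as remaining");
        if (last_offset > rs->rs_end)
                return ("finished: count all free space as done");
        return ("in progress: load the metaslab for an exact count");
}

int
main(void)
{
        seg_t ms = { 1000, 2000 };
        uint64_t offsets[] = { 500, 1500, 2500 };

        for (int i = 0; i < 3; i++)
                printf("last_offset %llu: %s\n",
                    (unsigned long long)offsets[i],
                    classify(offsets[i], &ms));
        return (0);
}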
+ */ + vdev_initialize_ms_load(msp); + + for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs; + rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { + logical_rs.rs_start = rs->rs_start; + logical_rs.rs_end = rs->rs_end; + vdev_xlate(vd, &logical_rs, &physical_rs); + + uint64_t size = physical_rs.rs_end - + physical_rs.rs_start; + vd->vdev_initialize_bytes_est += size; + if (vd->vdev_initialize_last_offset > + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += size; + } else if (vd->vdev_initialize_last_offset > + physical_rs.rs_start && + vd->vdev_initialize_last_offset < + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += + vd->vdev_initialize_last_offset - + physical_rs.rs_start; + } + } + mutex_exit(&msp->ms_lock); + } +} + +static void +vdev_initialize_load(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || + spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); + ASSERT(vd->vdev_leaf_zap != 0); + + if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE || + vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) { + int err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, + sizeof (vd->vdev_initialize_last_offset), 1, + &vd->vdev_initialize_last_offset); + ASSERT(err == 0 || err == ENOENT); + } + + vdev_initialize_calculate_progress(vd); +} + + +/* + * Convert the logical range into a physical range + * and add it to our avl tree. + */ +void +vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) +{ + vdev_t *vd = arg; + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = start; + logical_rs.rs_end = start + size; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + vdev_xlate(vd, &logical_rs, &physical_rs); + + IMPLY(vd->vdev_top == vd, + logical_rs.rs_start == physical_rs.rs_start); + IMPLY(vd->vdev_top == vd, + logical_rs.rs_end == physical_rs.rs_end); + + /* Only add segments that we have not visited yet */ + if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) + return; + + /* Pick up where we left off mid-range. */ + if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { + zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " + "(%llu, %llu)", vd->vdev_path, + (u_longlong_t)physical_rs.rs_start, + (u_longlong_t)physical_rs.rs_end, + (u_longlong_t)vd->vdev_initialize_last_offset, + (u_longlong_t)physical_rs.rs_end); + ASSERT3U(physical_rs.rs_end, >, + vd->vdev_initialize_last_offset); + physical_rs.rs_start = vd->vdev_initialize_last_offset; + } + ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); + + /* + * With raidz, it's possible that the logical range does not live on + * this leaf vdev. We only add the physical range to this vdev's tree if it + * has a length greater than 0. 
+ */ + if (physical_rs.rs_end > physical_rs.rs_start) { + range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, + physical_rs.rs_end - physical_rs.rs_start); + } else { + ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); + } +} + +static void +vdev_initialize_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + int error = 0; + uint64_t ms_count = 0; + + ASSERT(vdev_is_concrete(vd)); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + vd->vdev_initialize_last_offset = 0; + vdev_initialize_load(vd); + + abd_t *deadbeef = vdev_initialize_block_alloc(); + + vd->vdev_initialize_tree = range_tree_create(NULL, NULL); + + for (uint64_t i = 0; !vd->vdev_detached && + i < vd->vdev_top->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + + /* + * If we've expanded the top-level vdev or it's our + * first pass, calculate our progress. + */ + if (vd->vdev_top->vdev_ms_count != ms_count) { + vdev_initialize_calculate_progress(vd); + ms_count = vd->vdev_top->vdev_ms_count; + } + + vdev_initialize_ms_mark(msp); + mutex_enter(&msp->ms_lock); + vdev_initialize_ms_load(msp); + + range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, + vd); + mutex_exit(&msp->ms_lock); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + error = vdev_initialize_ranges(vd, deadbeef); + vdev_initialize_ms_unmark(msp); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); + if (error != 0) + break; + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_enter(&vd->vdev_initialize_io_lock); + while (vd->vdev_initialize_inflight > 0) { + cv_wait(&vd->vdev_initialize_io_cv, + &vd->vdev_initialize_io_lock); + } + mutex_exit(&vd->vdev_initialize_io_lock); + + range_tree_destroy(vd->vdev_initialize_tree); + vdev_initialize_block_free(deadbeef); + vd->vdev_initialize_tree = NULL; + + mutex_enter(&vd->vdev_initialize_lock); + if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) { + vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE); + } + ASSERT(vd->vdev_initialize_thread != NULL || + vd->vdev_initialize_inflight == 0); + + /* + * Drop the vdev_initialize_lock while we sync out the + * txg since it's possible that a device might be trying to + * come online and must check to see if it needs to restart an + * initialization. That thread will be holding the spa_config_lock + * which would prevent the txg_wait_synced from completing. + */ + mutex_exit(&vd->vdev_initialize_lock); + txg_wait_synced(spa_get_dsl(spa), 0); + mutex_enter(&vd->vdev_initialize_lock); + + vd->vdev_initialize_thread = NULL; + cv_broadcast(&vd->vdev_initialize_cv); + mutex_exit(&vd->vdev_initialize_lock); + thread_exit(); +} + +/* + * Initiates initialization of a device. Caller must hold vdev_initialize_lock. + * Device must be a leaf and not already be initializing. + */ +void +vdev_initialize(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_initialize_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + + vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); + vd->vdev_initialize_thread = thread_create(NULL, 0, + vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri); +} + +/* + * Stop initializing a device, with the resultant initializing state being + * tgt_state. Blocks until the initializing thread has exited. 
+ * Caller must hold vdev_initialize_lock and must not be writing to the spa + * config, as the initializing thread may try to enter the config as a reader + * before exiting. + */ +void +vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state) +{ + spa_t *spa = vd->vdev_spa; + ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER)); + + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + + /* + * Allow cancel requests to proceed even if the initialize thread + * has stopped. + */ + if (vd->vdev_initialize_thread == NULL && + tgt_state != VDEV_INITIALIZE_CANCELED) { + return; + } + + vdev_initialize_change_state(vd, tgt_state); + vd->vdev_initialize_exit_wanted = B_TRUE; + while (vd->vdev_initialize_thread != NULL) + cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock); + + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + vd->vdev_initialize_exit_wanted = B_FALSE; +} + +static void +vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state) +{ + if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { + mutex_enter(&vd->vdev_initialize_lock); + vdev_initialize_stop(vd, tgt_state); + mutex_exit(&vd->vdev_initialize_lock); + return; + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state); + } +} + +/* + * Convenience function to stop initialization of a vdev tree and set all + * initialize thread pointers to NULL. + */ +void +vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) +{ + vdev_initialize_stop_all_impl(vd, tgt_state); + + if (vd->vdev_spa->spa_sync_on) { + /* Make sure that our state has been synced to disk */ + txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); + } +} + +void +vdev_initialize_restart(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + + if (vd->vdev_leaf_zap != 0) { + mutex_enter(&vd->vdev_initialize_lock); + uint64_t initialize_state = VDEV_INITIALIZE_NONE; + int err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE, + sizeof (initialize_state), 1, &initialize_state); + ASSERT(err == 0 || err == ENOENT); + vd->vdev_initialize_state = initialize_state; + + uint64_t timestamp = 0; + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, + sizeof (timestamp), 1, &timestamp); + ASSERT(err == 0 || err == ENOENT); + vd->vdev_initialize_action_time = (time_t)timestamp; + + if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vd->vdev_offline) { + /* load progress for reporting, but don't resume */ + vdev_initialize_load(vd); + } else if (vd->vdev_initialize_state == + VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) { + vdev_initialize(vd); + } + + mutex_exit(&vd->vdev_initialize_lock); + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_initialize_restart(vd->vdev_child[i]); + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c index d993d2aec8e1..d66fa4ef822f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -33,15 +33,15 @@ * 1. Uniquely identify this device as part of a ZFS pool and confirm its * identity within the pool. * - * 2. 
Verify that all the devices given in a configuration are present * within the pool. * - * 3. Determine the uberblock for the pool. + * 3. Determine the uberblock for the pool. * - * 4. In case of an import operation, determine the configuration of the + * 4. In case of an import operation, determine the configuration of the * toplevel vdev of which it is a part. * - * 5. If an import operation cannot find all the devices in the pool, + * 5. If an import operation cannot find all the devices in the pool, * provide enough information to the administrator to determine which * devices are missing. * @@ -77,9 +77,9 @@ * In order to identify which labels are valid, the labels are written in the * following manner: * - * 1. For each vdev, update 'L1' to the new label - * 2. Update the uberblock - * 3. For each vdev, update 'L2' to the new label + * 1. For each vdev, update 'L1' to the new label + * 2. Update the uberblock + * 3. For each vdev, update 'L2' to the new label * * Given arbitrary failure, we can determine the correct label to use based on * the transaction group. If we fail after updating L1 but before updating the @@ -117,19 +117,19 @@ * * The nvlist describing the pool and vdev contains the following elements: * - * version ZFS on-disk version - * name Pool name - * state Pool state - * txg Transaction group in which this label was written - * pool_guid Unique identifier for this pool - * vdev_tree An nvlist describing vdev tree. + * version ZFS on-disk version + * name Pool name + * state Pool state + * txg Transaction group in which this label was written + * pool_guid Unique identifier for this pool + * vdev_tree An nvlist describing vdev tree. * features_for_read * An nvlist of the features necessary for reading the MOS. * * Each leaf device label also contains the following: * - * top_guid Unique ID for top-level vdev in which this is contained - * guid Unique ID for the leaf vdev + * top_guid Unique ID for top-level vdev in which this is contained + * guid Unique ID for the leaf vdev * * The 'vs' configuration follows the format described in 'spa_config.c'. */ @@ -396,22 +396,33 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, * histograms. */ uint64_t seg_count = 0; + uint64_t to_alloc = vd->vdev_stat.vs_alloc; /* * There are the same number of allocated segments * as free segments, so we will have at least one - * entry per free segment. + * entry per free segment. However, small free + * segments (smaller than vdev_removal_max_span) + * will be combined with adjacent allocated segments + * as a single mapping. */ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { - seg_count += vd->vdev_mg->mg_histogram[i]; + if (1ULL << (i + 1) < vdev_removal_max_span) { + to_alloc += + vd->vdev_mg->mg_histogram[i] << + i + 1; + } else { + seg_count += + vd->vdev_mg->mg_histogram[i]; + } } /* - * The maximum length of a mapping is SPA_MAXBLOCKSIZE, - * so we need at least one entry per SPA_MAXBLOCKSIZE - * of allocated data. + * The maximum length of a mapping is + * zfs_remove_max_segment, so we need at least one entry + * per zfs_remove_max_segment of allocated data. 
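The histogram walk above mixes two estimates: free segments smaller than vdev_removal_max_span are folded into the allocated-byte total (note that "mg_histogram[i] << i + 1" parses as "<< (i + 1)", since + binds tighter than <<), while larger free segments each imply one mapping entry of their own. A userland sketch of the same computation, with assumed defaults (vdev_removal_max_span = 32KB, zfs_remove_max_segment = 16MB) and invented histogram contents:

#include <stdio.h>
#include <stdint.h>

#define HISTO_SIZE              40
#define REMOVAL_MAX_SPAN        (32 * 1024)             /* assumed default */
#define REMOVE_MAX_SEGMENT      (16 * 1024 * 1024)      /* assumed default */

int
main(void)
{
        uint64_t histogram[HISTO_SIZE] = { 0 };
        uint64_t alloc = 100ULL << 30;          /* 100GB allocated */
        uint64_t to_alloc = alloc;
        uint64_t seg_count = 0;

        histogram[12] = 5000;   /* e.g. 5000 free segments of ~8K */
        histogram[20] = 300;    /* 300 free segments of ~2M */

        for (int i = 0; i < HISTO_SIZE; i++) {
                /* bucket i holds free segments of roughly 2^(i+1) bytes */
                if ((1ULL << (i + 1)) < REMOVAL_MAX_SPAN)
                        to_alloc += histogram[i] << (i + 1);
                else
                        seg_count += histogram[i];
        }
        seg_count += to_alloc / REMOVE_MAX_SEGMENT;
        printf("estimated mapping entries: %llu\n",
            (unsigned long long)seg_count);
        return (0);
}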
*/ - seg_count += vd->vdev_stat.vs_alloc / SPA_MAXBLOCKSIZE; + seg_count += to_alloc / zfs_remove_max_segment; fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, seg_count * @@ -546,6 +557,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) abd_t *vp_abd; zio_t *zio; uint64_t best_txg = 0; + uint64_t label_txg = 0; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; @@ -571,8 +583,6 @@ retry: if (zio_wait(zio) == 0 && nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), &label, 0) == 0) { - uint64_t label_txg = 0; - /* * Auxiliary vdevs won't have txg values in their * labels and newly added vdevs may not have been @@ -603,6 +613,15 @@ retry: goto retry; } + /* + * We found a valid label but it didn't pass txg restrictions. + */ + if (config == NULL && label_txg != 0) { + vdev_dbgmsg(vd, "label discarded as txg is too large " + "(%llu > %llu)", (u_longlong_t)label_txg, + (u_longlong_t)txg); + } + abd_free(vp_abd); return (config); @@ -1028,19 +1047,13 @@ retry: * among uberblocks with equal txg, choose the one with the latest timestamp. */ static int -vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) +vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { - if (ub1->ub_txg < ub2->ub_txg) - return (-1); - if (ub1->ub_txg > ub2->ub_txg) - return (1); - - if (ub1->ub_timestamp < ub2->ub_timestamp) - return (-1); - if (ub1->ub_timestamp > ub2->ub_timestamp) - return (1); + int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); + if (likely(cmp)) + return (cmp); - return (0); + return (AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp)); } struct ubl_cbdata { @@ -1167,10 +1180,13 @@ vdev_uberblock_sync_done(zio_t *zio) * Write the uberblock to all labels of all leaves of the specified vdev. */ static void -vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) +vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, + uberblock_t *ub, vdev_t *vd, int flags) { - for (uint64_t c = 0; c < vd->vdev_children; c++) - vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags); + for (uint64_t c = 0; c < vd->vdev_children; c++) { + vdev_uberblock_sync(zio, good_writes, + ub, vd->vdev_child[c], flags); + } if (!vd->vdev_ops->vdev_op_leaf) return; @@ -1188,7 +1204,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) for (int l = 0; l < VDEV_LABELS; l++) vdev_label_write(zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_sync_done, zio->io_private, + vdev_uberblock_sync_done, good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); abd_free(ub_abd); @@ -1202,10 +1218,10 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) zio_t *zio; uint64_t good_writes = 0; - zio = zio_root(spa, NULL, &good_writes, flags); + zio = zio_root(spa, NULL, NULL, flags); for (int v = 0; v < svdcount; v++) - vdev_uberblock_sync(zio, ub, svd[v], flags); + vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); (void) zio_wait(zio); @@ -1266,7 +1282,8 @@ vdev_label_sync_ignore_done(zio_t *zio) * Write all even or odd labels to all leaves of the specified vdev. 
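The L1 / uberblock / L2 write ordering described earlier in this file's comment makes crash recovery a pure txg comparison. A toy model of that decision rule (not the on-disk logic; the txg values are invented):

#include <stdio.h>
#include <stdint.h>

/*
 * After a crash, trust L1 if its txg matches the uberblock's (both
 * made it out before the failure); otherwise fall back to L2, which
 * still holds the previous txg.
 */
static const char *
pick_label(uint64_t l1_txg, uint64_t ub_txg, uint64_t l2_txg)
{
        if (l1_txg == ub_txg)
                return ("L1");
        return (l2_txg == ub_txg ? "L2" : "no consistent label");
}

int
main(void)
{
        /* crash after L1 was rewritten but before the uberblock */
        printf("crash mid-update: use %s\n", pick_label(11, 10, 10));
        /* crash after the uberblock but before L2 */
        printf("crash late-update: use %s\n", pick_label(11, 11, 10));
        return (0);
}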
*/ static void -vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) +vdev_label_sync(zio_t *zio, uint64_t *good_writes, + vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; @@ -1274,8 +1291,10 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) char *buf; size_t buflen; - for (int c = 0; c < vd->vdev_children; c++) - vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_label_sync(zio, good_writes, + vd->vdev_child[c], l, txg, flags); + } if (!vd->vdev_ops->vdev_op_leaf) return; @@ -1300,7 +1319,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), - vdev_label_sync_done, zio->io_private, + vdev_label_sync_done, good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); } } @@ -1332,7 +1351,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) (vd->vdev_islog || vd->vdev_aux != NULL) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags); - vdev_label_sync(vio, vd, l, txg, flags); + vdev_label_sync(vio, good_writes, vd, l, txg, flags); zio_nowait(vio); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c index 60cb7aa96fca..26be35fc3501 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -516,13 +516,16 @@ vdev_mirror_io_start(zio_t *zio) } if (zio->io_type == ZIO_TYPE_READ) { - if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering && + if (zio->io_bp != NULL && + (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering && mm->mm_children > 1) { /* - * For scrubbing reads we need to allocate a read - * buffer for each child and issue reads to all - * children. If any child succeeds, it will copy its - * data into zio->io_data in vdev_mirror_scrub_done. + * For scrubbing reads (if we can verify the + * checksum here, as indicated by io_bp being + * non-NULL) we need to allocate a read buffer for + * each child and issue reads to all children. If + * any child succeeds, it will copy its data into + * zio->io_data in vdev_mirror_scrub_done. */ for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; @@ -677,7 +680,21 @@ vdev_mirror_io_done(zio_t *zio) if (mc->mc_error == 0) { if (mc->mc_tried) continue; + /* + * We didn't try this child. We need to + * repair it if: + * 1. it's a scrub (in which case we have + * tried everything that was healthy) + * - or - + * 2. it's an indirect vdev (in which case + * it could point to any other vdev, which + * might have a bad DTL) + * - or - + * 3. 
the DTL indicates that this data is + * missing from this vdev + */ if (!(zio->io_flags & ZIO_FLAG_SCRUB) && + mc->mc_vd->vdev_ops != &vdev_indirect_ops && !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; @@ -723,6 +740,7 @@ vdev_ops_t vdev_mirror_ops = { NULL, NULL, NULL, + vdev_default_xlate, VDEV_TYPE_MIRROR, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -738,6 +756,7 @@ vdev_ops_t vdev_replacing_ops = { NULL, NULL, NULL, + vdev_default_xlate, VDEV_TYPE_REPLACING, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -753,6 +772,7 @@ vdev_ops_t vdev_spare_ops = { NULL, NULL, NULL, + vdev_default_xlate, VDEV_TYPE_SPARE, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c index 29194fc11065..6852de445049 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ /* @@ -91,6 +91,7 @@ vdev_ops_t vdev_missing_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_MISSING, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -106,6 +107,7 @@ vdev_ops_t vdev_hole_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_HOLE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c index 5a3ba1b3e983..78b725a37b68 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -157,6 +157,8 @@ uint32_t zfs_vdev_trim_min_active = 1; uint32_t zfs_vdev_trim_max_active = 64; uint32_t zfs_vdev_removal_min_active = 1; uint32_t zfs_vdev_removal_max_active = 2; +uint32_t zfs_vdev_initializing_min_active = 1; +uint32_t zfs_vdev_initializing_max_active = 1; /* @@ -195,6 +197,14 @@ int zfs_vdev_queue_depth_pct = 1000; int zfs_vdev_queue_depth_pct = 300; #endif +/* + * When performing allocations for a given metaslab, we want to make sure that + * there are enough IOs to aggregate together to improve throughput. We want to + * ensure that there are at least 128k worth of IOs that can be aggregated, and + * we assume that the average allocation size is 4k, so we need the queue depth + * to be 32 per allocator to get good aggregation of sequential writes. 
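The default of 32 defined just after this comment falls directly out of the arithmetic stated here: a 128K aggregation target divided by an assumed 4K average allocation size gives 128K / 4K = 32 queued I/Os per allocator.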
+ */ +int zfs_vdev_def_queue_depth = 32; #ifdef __FreeBSD__ #ifdef _KERNEL @@ -301,20 +311,15 @@ sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS) int vdev_queue_offset_compare(const void *x1, const void *x2) { - const zio_t *z1 = x1; - const zio_t *z2 = x2; + const zio_t *z1 = (const zio_t *)x1; + const zio_t *z2 = (const zio_t *)x2; - if (z1->io_offset < z2->io_offset) - return (-1); - if (z1->io_offset > z2->io_offset) - return (1); + int cmp = AVL_CMP(z1->io_offset, z2->io_offset); - if (z1 < z2) - return (-1); - if (z1 > z2) - return (1); + if (likely(cmp)) + return (cmp); - return (0); + return (AVL_PCMP(z1, z2)); } static inline avl_tree_t * @@ -534,6 +539,8 @@ vdev_queue_class_min_active(zio_priority_t p) return (zfs_vdev_trim_min_active); case ZIO_PRIORITY_REMOVAL: return (zfs_vdev_removal_min_active); + case ZIO_PRIORITY_INITIALIZING: + return (zfs_vdev_initializing_min_active); default: panic("invalid priority %u", p); return (0); @@ -597,6 +604,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) return (zfs_vdev_trim_max_active); case ZIO_PRIORITY_REMOVAL: return (zfs_vdev_removal_max_active); + case ZIO_PRIORITY_INITIALIZING: + return (zfs_vdev_initializing_max_active); default: panic("invalid priority %u", p); return (0); @@ -824,8 +833,8 @@ again: } /* - * For LBA-ordered queues (async / scrub), issue the i/o which follows - * the most recently issued i/o in LBA (offset) order. + * For LBA-ordered queues (async / scrub / initializing), issue the + * i/o which follows the most recently issued i/o in LBA (offset) order. * * For FIFO queues (sync), issue the i/o with the lowest timestamp. */ @@ -881,12 +890,14 @@ vdev_queue_io(zio_t *zio) if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB && - zio->io_priority != ZIO_PRIORITY_REMOVAL) + zio->io_priority != ZIO_PRIORITY_REMOVAL && + zio->io_priority != ZIO_PRIORITY_INITIALIZING) zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && - zio->io_priority != ZIO_PRIORITY_REMOVAL) + zio->io_priority != ZIO_PRIORITY_REMOVAL && + zio->io_priority != ZIO_PRIORITY_INITIALIZING) zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } else { ASSERT(zio->io_type == ZIO_TYPE_FREE); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c index d6f4bbc4156a..4df04c30aabf 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -41,6 +41,10 @@ #include <sys/fm/fs/zfs.h> #include <sys/bio.h> +#ifdef ZFS_DEBUG +#include <sys/vdev_initialize.h> /* vdev_xlate testing */ +#endif + /* * Virtual device vector for RAID-Z. 
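One detail of the vdev_queue_offset_compare() rewrite above is worth spelling out: the pointer fallback is not cosmetic. The per-priority queues are AVL trees, which need a total order, and two zios can legitimately share an io_offset; comparing the zio pointers themselves keeps distinct nodes distinct (any stable, unique per-node key would do). A minimal sketch of the pattern:

static int
offset_then_identity_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = (const zio_t *)x1;
	const zio_t *z2 = (const zio_t *)x2;
	int cmp = AVL_CMP(z1->io_offset, z2->io_offset);

	/* Never report two distinct zios as equal. */
	return (cmp != 0 ? cmp : AVL_PCMP(z1, z2));
}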
* @@ -1896,6 +1900,39 @@ vdev_raidz_child_done(zio_t *zio) rc->rc_skipped = 0; } +static void +vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) +{ +#ifdef ZFS_DEBUG + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = zio->io_offset; + logical_rs.rs_end = logical_rs.rs_start + + vdev_raidz_asize(zio->io_vd, zio->io_size); + + raidz_col_t *rc = &rm->rm_col[col]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + vdev_xlate(cvd, &logical_rs, &physical_rs); + ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); + ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); + /* + * It would be nice to assert that rs_end is equal + * to rc_offset + rc_size but there might be an + * optional I/O at the end that is not accounted in + * rc_size. + */ + if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + + rc->rc_size + (1 << tvd->vdev_ashift)); + } else { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); + } +#endif +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1953,6 +1990,12 @@ vdev_raidz_io_start(zio_t *zio) for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; + + /* + * Verify physical to logical translation. + */ + vdev_raidz_io_verify(zio, rm, c); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, @@ -2622,6 +2665,37 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) return (B_FALSE); } +static void +vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) +{ + vdev_t *raidvd = cvd->vdev_parent; + ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); + + uint64_t width = raidvd->vdev_children; + uint64_t tgt_col = cvd->vdev_id; + uint64_t ashift = raidvd->vdev_top->vdev_ashift; + + /* make sure the offsets are block-aligned */ + ASSERT0(in->rs_start % (1 << ashift)); + ASSERT0(in->rs_end % (1 << ashift)); + uint64_t b_start = in->rs_start >> ashift; + uint64_t b_end = in->rs_end >> ashift; + + uint64_t start_row = 0; + if (b_start > tgt_col) /* avoid underflow */ + start_row = ((b_start - tgt_col - 1) / width) + 1; + + uint64_t end_row = 0; + if (b_end > tgt_col) + end_row = ((b_end - tgt_col - 1) / width) + 1; + + res->rs_start = start_row << ashift; + res->rs_end = end_row << ashift; + + ASSERT3U(res->rs_start, <=, in->rs_start); + ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); +} + vdev_ops_t vdev_raidz_ops = { vdev_raidz_open, vdev_raidz_close, @@ -2633,6 +2707,7 @@ vdev_ops_t vdev_raidz_ops = { NULL, NULL, NULL, + vdev_raidz_xlate, VDEV_TYPE_RAIDZ, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c index c864ab1cb0c1..20fa9c24db24 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c @@ -44,6 +44,7 @@ #include <sys/vdev_indirect_births.h> #include <sys/vdev_indirect_mapping.h> #include <sys/abd.h> +#include <sys/vdev_initialize.h> /* * This file contains the necessary logic to remove vdevs from a @@ -83,18 +84,12 @@ typedef struct vdev_copy_arg { kmutex_t vca_lock; } vdev_copy_arg_t; -typedef struct vdev_copy_seg_arg { - vdev_copy_arg_t *vcsa_copy_arg; - uint64_t vcsa_txg; - dva_t *vcsa_dest_dva; - blkptr_t *vcsa_dest_bp; -} vdev_copy_seg_arg_t; - /* - * 
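A worked example makes the row arithmetic in vdev_raidz_xlate() concrete (the numbers are hypothetical, chosen for illustration): translate the logical range [0, 3072) for the child in column 1 of a 3-child raidz vdev with ashift = 9.

/*
 * width = 3, tgt_col = 1, ashift = 9, in = [0, 3072)
 *
 * b_start = 0 >> 9    = 0;  b_start <= tgt_col, so start_row = 0
 * b_end   = 3072 >> 9 = 6;  b_end > tgt_col, so
 *                           end_row = ((6 - 1 - 1) / 3) + 1 = 2
 *
 * res = [0 << 9, 2 << 9) = [0, 1024)
 */

Logical blocks 0..5 rotate over columns 0, 1, 2, 0, 1, 2, so column 1 holds exactly two of them (rows 0 and 1), and the translated child range [0, 1024) satisfies both trailing asserts.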
The maximum amount of allowed data we're allowed to copy from a device - * at a time when removing it. + * The maximum amount of memory we can use for outstanding i/o while + * doing a device removal. This determines how much i/o we can have + * in flight concurrently. */ -int zfs_remove_max_copy_bytes = 8 * 1024 * 1024; +int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; /* * The largest contiguous segment that we will attempt to allocate when @@ -111,6 +106,24 @@ int zfs_remove_max_copy_bytes = 8 * 1024 * 1024; int zfs_remove_max_segment = 1024 * 1024; /* + * Allow a remap segment to span free chunks of at most this size. The main + * impact of a larger span is that we will read and write larger, more + * contiguous chunks, with more "unnecessary" data -- trading off bandwidth + * for iops. The value here was chosen to align with + * zfs_vdev_read_gap_limit, which is a similar concept when doing regular + * reads (but there's no reason it has to be the same). + * + * Additionally, a higher span will have the following relatively minor + * effects: + * - the mapping will be smaller, since one entry can cover more allocated + * segments + * - more of the fragmentation in the removing device will be preserved + * - we'll do larger allocations, which may fail and fall back on smaller + * allocations + */ +int vdev_removal_max_span = 32 * 1024; + +/* * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a removal. */ @@ -176,7 +189,7 @@ spa_vdev_removal_create(vdev_t *vd) mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); svr->svr_allocd_segs = range_tree_create(NULL, NULL); - svr->svr_vdev = vd; + svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { svr->svr_frees[i] = range_tree_create(NULL, NULL); @@ -218,9 +231,10 @@ spa_vdev_removal_destroy(spa_vdev_removal_t *svr) static void vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) { - vdev_t *vd = arg; + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; spa_vdev_removal_t *svr = NULL; uint64_t txg = dmu_tx_get_txg(tx); @@ -342,7 +356,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) ASSERT3P(spa->spa_vdev_removal, ==, NULL); spa->spa_vdev_removal = svr; svr->svr_thread = thread_create(NULL, 0, - spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri); + spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } /* @@ -384,21 +398,24 @@ spa_remove_init(spa_t *spa) spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, spa->spa_removing_phys.sr_removing_vdev); - spa_config_exit(spa, SCL_STATE, FTAG); - if (vd == NULL) + if (vd == NULL) { + spa_config_exit(spa, SCL_STATE, FTAG); return (EINVAL); + } vdev_indirect_config_t *vic = &vd->vdev_indirect_config; ASSERT(vdev_is_concrete(vd)); spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); - ASSERT(svr->svr_vdev->vdev_removing); + ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id); + ASSERT(vd->vdev_removing); vd->vdev_indirect_mapping = vdev_indirect_mapping_open( spa->spa_meta_objset, vic->vic_mapping_object); vd->vdev_indirect_births = vdev_indirect_births_open( spa->spa_meta_objset, vic->vic_births_object); + spa_config_exit(spa, SCL_STATE, FTAG); spa->spa_vdev_removal = svr; } @@ -451,15 +468,8 @@ spa_restart_removal(spa_t *spa) if 
(!spa_writeable(spa)) return; - vdev_t *vd = svr->svr_vdev; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - - ASSERT3P(vd, !=, NULL); - ASSERT(vd->vdev_removing); - - zfs_dbgmsg("restarting removal of %llu at count=%llu", - vd->vdev_id, vdev_indirect_mapping_num_entries(vim)); - svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd, + zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id); + svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } @@ -480,7 +490,7 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, vdev_indirect_mapping_object(vim)); - ASSERT3P(vd, ==, svr->svr_vdev); + ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); mutex_enter(&svr->svr_lock); @@ -663,7 +673,7 @@ spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) if (state == DSS_FINISHED) { spa_removing_phys_t *srp = &spa->spa_removing_phys; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (srp->sr_prev_indirect_vdev != UINT64_MAX) { @@ -706,7 +716,7 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; uint64_t txg = dmu_tx_get_txg(tx); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; @@ -734,85 +744,249 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx) spa_sync_removing_state(spa, tx); } +typedef struct vdev_copy_segment_arg { + spa_t *vcsa_spa; + dva_t *vcsa_dest_dva; + uint64_t vcsa_txg; + range_tree_t *vcsa_obsolete_segs; +} vdev_copy_segment_arg_t; + +static void +unalloc_seg(void *arg, uint64_t start, uint64_t size) +{ + vdev_copy_segment_arg_t *vcsa = arg; + spa_t *spa = vcsa->vcsa_spa; + blkptr_t bp = { 0 }; + + BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL); + BP_SET_LSIZE(&bp, size); + BP_SET_PSIZE(&bp, size); + BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(&bp, DMU_OT_NONE); + BP_SET_LEVEL(&bp, 0); + BP_SET_DEDUP(&bp, 0); + BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER); + + DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva)); + DVA_SET_OFFSET(&bp.blk_dva[0], + DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start); + DVA_SET_ASIZE(&bp.blk_dva[0], size); + + zio_free(spa, vcsa->vcsa_txg, &bp); +} + +/* + * All reads and writes associated with a call to spa_vdev_copy_segment() + * are done. + */ +static void +spa_vdev_copy_segment_done(zio_t *zio) +{ + vdev_copy_segment_arg_t *vcsa = zio->io_private; + + range_tree_vacate(vcsa->vcsa_obsolete_segs, + unalloc_seg, vcsa); + range_tree_destroy(vcsa->vcsa_obsolete_segs); + kmem_free(vcsa, sizeof (*vcsa)); + + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +/* + * The write of the new location is done. 
+ */ static void spa_vdev_copy_segment_write_done(zio_t *zio) { - vdev_copy_seg_arg_t *vcsa = zio->io_private; - vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg; - spa_config_exit(zio->io_spa, SCL_STATE, FTAG); + vdev_copy_arg_t *vca = zio->io_private; + abd_free(zio->io_abd); mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes -= zio->io_size; cv_signal(&vca->vca_cv); mutex_exit(&vca->vca_lock); - - ASSERT0(zio->io_error); - kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t)); - kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t)); } +/* + * The read of the old location is done. The parent zio is the write to + * the new location. Allow it to start. + */ static void spa_vdev_copy_segment_read_done(zio_t *zio) { - vdev_copy_seg_arg_t *vcsa = zio->io_private; - dva_t *dest_dva = vcsa->vcsa_dest_dva; - uint64_t txg = vcsa->vcsa_txg; - spa_t *spa = zio->io_spa; - vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva)); - blkptr_t *bp = NULL; - dva_t *dva = NULL; - uint64_t size = zio->io_size; - - ASSERT3P(dest_vd, !=, NULL); - ASSERT0(zio->io_error); - - vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); - bp = vcsa->vcsa_dest_bp; - dva = bp->blk_dva; - - BP_ZERO(bp); - - /* initialize with dest_dva */ - bcopy(dest_dva, dva, sizeof (dva_t)); - BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); - - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); - BP_SET_TYPE(bp, DMU_OT_NONE); - BP_SET_LEVEL(bp, 0); - BP_SET_DEDUP(bp, 0); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa, - txg, bp, zio->io_abd, size, - spa_vdev_copy_segment_write_done, vcsa, - ZIO_PRIORITY_REMOVAL, 0, NULL)); + zio_nowait(zio_unique_parent(zio)); +} + +/* + * If the old and new vdevs are mirrors, we will read both sides of the old + * mirror, and write each copy to the corresponding side of the new mirror. + * If the old and new vdevs have a different number of children, we will do + * this as best as possible. Since we aren't verifying checksums, this + * ensures that as long as there's a good copy of the data, we'll have a + * good copy after the removal, even if there's silent damage to one side + * of the mirror. If we're removing a mirror that has some silent damage, + * we'll have exactly the same damage in the new location (assuming that + * the new location is also a mirror). + * + * We accomplish this by creating a tree of zio_t's, with as many writes as + * there are "children" of the new vdev (a non-redundant vdev counts as one + * child, a 2-way mirror has 2 children, etc). Each write has an associated + * read from a child of the old vdev. Typically there will be the same + * number of children of the old and new vdevs. However, if there are more + * children of the new vdev, some child(ren) of the old vdev will be issued + * multiple reads. If there are more children of the old vdev, some copies + * will be dropped. + * + * For example, the tree of zio_t's for a 2-way mirror is: + * + * null + * / \ + * write(new vdev, child 0) write(new vdev, child 1) + * | | + * read(old vdev, child 0) read(old vdev, child 1) + * + * Child zio's complete before their parents complete. However, zio's + * created with zio_vdev_child_io() may be issued before their children + * complete. In this case we need to make sure that the children (reads) + * complete before the parents (writes) are *issued*. 
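To make the mismatched-width behavior concrete (hypothetical widths): copying from a 2-way mirror to a 3-way mirror, destination children 0, 1 and 2 read from source children 0 % 2 = 0, 1 % 2 = 1 and 2 % 2 = 0, so one side of the old mirror is read twice; copying from a 3-way to a 2-way mirror, source child 2 is never read and its (possibly unique good) copy is dropped. The wrap-around is the dest_id % source_vd->vdev_children selection in spa_vdev_copy_one_child() below.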
We do this by not + * calling zio_nowait() on each write until its corresponding read has + * completed. + * + * The spa_config_lock must be held while zio's created by + * zio_vdev_child_io() are in progress, to ensure that the vdev tree does + * not change (e.g. due to a concurrent "zpool attach/detach"). The "null" + * zio is needed to release the spa_config_lock after all the reads and + * writes complete. (Note that we can't grab the config lock for each read, + * because it is not reentrant - we could deadlock with a thread waiting + * for a write lock.) + */ +static void +spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, + vdev_t *source_vd, uint64_t source_offset, + vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size) +{ + ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0); + + mutex_enter(&vca->vca_lock); + vca->vca_outstanding_bytes += size; + mutex_exit(&vca->vca_lock); + + abd_t *abd = abd_alloc_for_io(size, B_FALSE); + + vdev_t *source_child_vd; + if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) { + /* + * Source and dest are both mirrors. Copy from the same + * child id as we are copying to (wrapping around if there + * are more dest children than source children). + */ + source_child_vd = + source_vd->vdev_child[dest_id % source_vd->vdev_children]; + } else { + source_child_vd = source_vd; + } + + zio_t *write_zio = zio_vdev_child_io(nzio, NULL, + dest_child_vd, dest_offset, abd, size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + spa_vdev_copy_segment_write_done, vca); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + source_child_vd, source_offset, abd, size, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + spa_vdev_copy_segment_read_done, vca)); } +/* + * Allocate a new location for this segment, and create the zio_t's to + * read from the old location and write to the new location. + */ static int -spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, +spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, + uint64_t maxalloc, uint64_t txg, vdev_copy_arg_t *vca, zio_alloc_list_t *zal) { metaslab_group_t *mg = vd->vdev_mg; spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_entry_t *entry; - vdev_copy_seg_arg_t *private; dva_t dst = { 0 }; - blkptr_t blk, *bp = &blk; - dva_t *dva = bp->blk_dva; + uint64_t start = range_tree_min(segs); - ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); + uint64_t size = range_tree_span(segs); + if (range_tree_span(segs) > maxalloc) { + /* + * We can't allocate all the segments. Prefer to end + * the allocation at the end of a segment, thus avoiding + * additional split blocks. + */ + range_seg_t search; + avl_index_t where; + search.rs_start = start + maxalloc; + search.rs_end = search.rs_start; + range_seg_t *rs = avl_find(&segs->rt_root, &search, &where); + if (rs == NULL) { + rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE); + } else { + rs = AVL_PREV(&segs->rt_root, rs); + } + if (rs != NULL) { + size = rs->rs_end - start; + } else { + /* + * There are no segments that end before maxalloc. + * I.e. the first segment is larger than maxalloc, + * so we must split it. + */ + size = maxalloc; + } + } + ASSERT3U(size, <=, maxalloc); + + /* + * We use allocator 0 for this I/O because we don't expect device remap + * to be the steady state of the system, so parallelizing is not as + * critical as it is for other allocation types. 
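Condensed to a skeleton (helper names and variables here are placeholders, and error paths are elided), the deferred-issue pattern described above looks like this:

zio_t *nzio = zio_null(pio, spa, NULL, all_done, arg, 0);
zio_t *wzio = zio_vdev_child_io(nzio, NULL, dest_vd, dst_off, abd, size,
    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
    write_done, arg);				/* created, but not issued */
zio_nowait(zio_vdev_child_io(wzio, NULL, src_vd, src_off, abd, size,
    ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
    read_done, arg));				/* issued immediately */
zio_nowait(nzio);

static void
read_done(zio_t *zio)
{
	/* The data is in the shared abd; only now release the write. */
	zio_nowait(zio_unique_parent(zio));
}

The write zio exists from the start so the read has a parent to attach to, but it is handed to zio_nowait() only from the read's done callback -- exactly what spa_vdev_copy_segment_read_done() does above.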
We also want to ensure + * that the IOs are allocated together as much as possible, to reduce + * mapping sizes. + */ int error = metaslab_alloc_dva(spa, mg->mg_class, size, - &dst, 0, NULL, txg, 0, zal); + &dst, 0, NULL, txg, 0, zal, 0); if (error != 0) return (error); /* + * Determine the ranges that are not actually needed. Offsets are + * relative to the start of the range to be copied (i.e. relative to the + * local variable "start"). + */ + range_tree_t *obsolete_segs = range_tree_create(NULL, NULL); + + range_seg_t *rs = avl_first(&segs->rt_root); + ASSERT3U(rs->rs_start, ==, start); + uint64_t prev_seg_end = rs->rs_end; + while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) { + if (rs->rs_start >= start + size) { + break; + } else { + range_tree_add(obsolete_segs, + prev_seg_end - start, + rs->rs_start - prev_seg_end); + } + prev_seg_end = rs->rs_end; + } + /* We don't end in the middle of an obsolete range */ + ASSERT3U(start + size, <=, prev_seg_end); + + range_tree_clear(segs, start, size); + + /* * We can't have any padding of the allocated size, otherwise we will * misunderstand what's allocated, and the size of the mapping. * The caller ensures this will be true by passing in a size that is @@ -820,51 +994,37 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, */ ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); - mutex_enter(&vca->vca_lock); - vca->vca_outstanding_bytes += size; - mutex_exit(&vca->vca_lock); - entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); entry->vime_mapping.vimep_dst = dst; + if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + entry->vime_obsolete_count = range_tree_space(obsolete_segs); + } - private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP); - private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; - private->vcsa_txg = txg; - private->vcsa_copy_arg = vca; + vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP); + vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; + vcsa->vcsa_obsolete_segs = obsolete_segs; + vcsa->vcsa_spa = spa; + vcsa->vcsa_txg = txg; /* - * This lock is eventually released by the donefunc for the - * zio_write_phys that finishes copying the data. + * See comment before spa_vdev_copy_one_child(). */ - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - - /* - * Do logical I/O, letting the redundancy vdevs (like mirror) - * handle their own I/O instead of duplicating that code here. 
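A short worked example of the obsolete-range bookkeeping above (hypothetical offsets): suppose segs holds allocated ranges [1024K, 1040K) and [1064K, 1072K), and the chunk being copied is the full 48K span with start = 1024K. The loop records a single obsolete range at offset 16K, length 24K -- the free gap between the two segments, expressed relative to start. That 24K becomes vime_obsolete_count on the new mapping entry (when the obsolete-counts feature is enabled), and it is the space unalloc_seg() hands back via zio_free() once the copy's zio tree completes.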
- */ - BP_ZERO(bp); - - DVA_SET_VDEV(&dva[0], vd->vdev_id); - DVA_SET_OFFSET(&dva[0], start); - DVA_SET_GANG(&dva[0], 0); - DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size)); - - BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); - - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); - BP_SET_TYPE(bp, DMU_OT_NONE); - BP_SET_LEVEL(bp, 0); - BP_SET_DEDUP(bp, 0); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, - bp, abd_alloc_for_io(size, B_FALSE), size, - spa_vdev_copy_segment_read_done, private, - ZIO_PRIORITY_REMOVAL, 0, NULL)); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL, + spa_vdev_copy_segment_done, vcsa, 0); + vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst)); + if (dest_vd->vdev_ops == &vdev_mirror_ops) { + for (int i = 0; i < dest_vd->vdev_children; i++) { + vdev_t *child = dest_vd->vdev_child[i]; + spa_vdev_copy_one_child(vca, nzio, vd, start, + child, DVA_GET_OFFSET(&dst), i, size); + } + } else { + spa_vdev_copy_one_child(vca, nzio, vd, start, + dest_vd, DVA_GET_OFFSET(&dst), -1, size); + } + zio_nowait(nzio); list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); @@ -882,8 +1042,8 @@ static void vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; - vdev_t *vd = svr->svr_vdev; - spa_t *spa = vd->vdev_spa; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); @@ -912,37 +1072,6 @@ vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) } static void -vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd) -{ - ivd->vdev_indirect_config = vd->vdev_indirect_config; - - ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL); - ASSERT(vd->vdev_indirect_mapping != NULL); - ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping; - vd->vdev_indirect_mapping = NULL; - - ASSERT3P(ivd->vdev_indirect_births, ==, NULL); - ASSERT(vd->vdev_indirect_births != NULL); - ivd->vdev_indirect_births = vd->vdev_indirect_births; - vd->vdev_indirect_births = NULL; - - ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); - ASSERT0(range_tree_space(ivd->vdev_obsolete_segments)); - - if (vd->vdev_obsolete_sm != NULL) { - ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize); - - /* - * We cannot use space_map_{open,close} because we hold all - * the config locks as writer. - */ - ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL); - ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm; - vd->vdev_obsolete_sm = NULL; - } -} - -static void vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) { ASSERT3P(zlist, !=, NULL); @@ -977,17 +1106,13 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) vdev_remove_enlist_zaps(vd, svr->svr_zaplist); ivd = vdev_add_parent(vd, &vdev_indirect_ops); + ivd->vdev_removing = 0; vd->vdev_leaf_zap = 0; vdev_remove_child(ivd, vd); vdev_compact_children(ivd); - vdev_indirect_state_transfer(ivd, vd); - - svr->svr_vdev = ivd; - - ASSERT(!ivd->vdev_removing); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -1010,9 +1135,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) * context by the removal thread after we have copied all vdev's data. 
*/ static void -vdev_remove_complete(vdev_t *vd) +vdev_remove_complete(spa_t *spa) { - spa_t *spa = vd->vdev_spa; uint64_t txg; /* @@ -1020,8 +1144,13 @@ vdev_remove_complete(vdev_t *vd) * vdev_metaslab_fini() */ txg_wait_synced(spa->spa_dsl_pool, 0); - txg = spa_vdev_enter(spa); + vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + + sysevent_t *ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_DEV); + zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", vd->vdev_id, txg); @@ -1041,6 +1170,10 @@ vdev_remove_complete(vdev_t *vd) /* * We now release the locks, allowing spa_sync to run and finish the * removal via vdev_remove_complete_sync in syncing context. + * + * Note that we hold on to the vdev_t that has been replaced. Since + * it isn't part of the vdev tree any longer, it can't be concurrently + * manipulated, even while we don't have the config lock. */ (void) spa_vdev_exit(spa, NULL, txg, 0); @@ -1062,6 +1195,8 @@ vdev_remove_complete(vdev_t *vd) */ vdev_config_dirty(spa->spa_root_vdev); (void) spa_vdev_exit(spa, vd, txg, 0); + + spa_event_post(ev); } /* @@ -1072,7 +1207,7 @@ vdev_remove_complete(vdev_t *vd) * this size again this txg. */ static void -spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, +spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, uint64_t *max_alloc, dmu_tx_t *tx) { uint64_t txg = dmu_tx_get_txg(tx); @@ -1080,39 +1215,78 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, mutex_enter(&svr->svr_lock); - range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root); - if (rs == NULL) { + /* + * Determine how big of a chunk to copy. We can allocate up + * to max_alloc bytes, and we can span up to vdev_removal_max_span + * bytes of unallocated space at a time. "segs" will track the + * allocated segments that we are copying. We may also be copying + * free segments (of up to vdev_removal_max_span bytes). + */ + range_tree_t *segs = range_tree_create(NULL, NULL); + for (;;) { + range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root); + if (rs == NULL) + break; + + uint64_t seg_length; + + if (range_tree_is_empty(segs)) { + /* need to truncate the first seg based on max_alloc */ + seg_length = + MIN(rs->rs_end - rs->rs_start, *max_alloc); + } else { + if (rs->rs_start - range_tree_max(segs) > + vdev_removal_max_span) { + /* + * Including this segment would cause us to + * copy a larger unneeded chunk than is allowed. + */ + break; + } else if (rs->rs_end - range_tree_min(segs) > + *max_alloc) { + /* + * This additional segment would extend past + * max_alloc. Rather than splitting this + * segment, leave it for the next mapping. 
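Worked through with hypothetical numbers (vdev_removal_max_span = 32K, *max_alloc = 1M) and allocated segments [0, 100K), [120K, 130K), [200K, 210K): the loop takes [0, 100K) (truncated only by max_alloc, which it fits), then [120K, 130K) because the 20K gap is within the span limit and the 130K total extent is within max_alloc, then stops at [200K, 210K) because the 70K gap exceeds vdev_removal_max_span. The resulting chunk copies 110K of allocated data plus 20K of "unnecessary" free space as one contiguous 130K read/write -- the bandwidth-for-iops trade described at vdev_removal_max_span's definition.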
+ */ + break; + } else { + seg_length = rs->rs_end - rs->rs_start; + } + } + + range_tree_add(segs, rs->rs_start, seg_length); + range_tree_remove(svr->svr_allocd_segs, + rs->rs_start, seg_length); + } + + if (range_tree_is_empty(segs)) { mutex_exit(&svr->svr_lock); + range_tree_destroy(segs); return; } - uint64_t offset = rs->rs_start; - uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc); - - range_tree_remove(svr->svr_allocd_segs, offset, length); if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, svr, 0, ZFS_SPACE_CHECK_NONE, tx); } - svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length; + svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs); /* * Note: this is the amount of *allocated* space * that we are taking care of each txg. */ - svr->svr_bytes_done[txg & TXG_MASK] += length; + svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs); mutex_exit(&svr->svr_lock); zio_alloc_list_t zal; metaslab_trace_init(&zal); - uint64_t thismax = *max_alloc; - while (length > 0) { - uint64_t mylen = MIN(length, thismax); - - int error = spa_vdev_copy_segment(svr->svr_vdev, - offset, mylen, txg, vca, &zal); + uint64_t thismax = SPA_MAXBLOCKSIZE; + while (!range_tree_is_empty(segs)) { + int error = spa_vdev_copy_segment(vd, + segs, thismax, txg, vca, &zal); if (error == ENOSPC) { /* @@ -1126,18 +1300,17 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, */ ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); - thismax = P2ROUNDUP(mylen / 2, + uint64_t attempted = + MIN(range_tree_span(segs), thismax); + thismax = P2ROUNDUP(attempted / 2, 1 << spa->spa_max_ashift); - ASSERT3U(thismax, <, mylen); /* * The minimum-size allocation can not fail. */ - ASSERT3U(mylen, >, 1 << spa->spa_max_ashift); - *max_alloc = mylen - (1 << spa->spa_max_ashift); + ASSERT3U(attempted, >, 1 << spa->spa_max_ashift); + *max_alloc = attempted - (1 << spa->spa_max_ashift); } else { ASSERT0(error); - length -= mylen; - offset += mylen; /* * We've performed an allocation, so reset the @@ -1148,6 +1321,7 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, } } metaslab_trace_fini(&zal); + range_tree_destroy(segs); } /* @@ -1169,12 +1343,14 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, static void spa_vdev_remove_thread(void *arg) { - vdev_t *vd = arg; - spa_t *spa = vd->vdev_spa; + spa_t *spa = arg; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_copy_arg_t vca; uint64_t max_alloc = zfs_remove_max_segment; uint64_t last_txg = 0; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); @@ -1182,7 +1358,6 @@ spa_vdev_remove_thread(void *arg) ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_removing); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); - ASSERT3P(svr->svr_vdev, ==, vd); ASSERT(vim != NULL); mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); @@ -1264,6 +1439,17 @@ spa_vdev_remove_thread(void *arg) mutex_exit(&svr->svr_lock); /* + * We need to periodically drop the config lock so that + * writers can get in. Additionally, we can't wait + * for a txg to sync while holding a config lock + * (since a waiting writer could cause a 3-way deadlock + * with the sync thread, which also gets a config + * lock for reader). 
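The ENOSPC back-off above is geometric: with an assumed ashift of 12 (4K blocks), a failed 1M attempt retries with thismax = P2ROUNDUP(1M / 2, 4K) = 512K, then 256K, and so on down to a single 4K block, which the assert says cannot fail. *max_alloc is simultaneously clamped to attempted - 4K, so the chunks built in later txgs start strictly below the size that just failed.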
So we can't hold the config lock + * while calling dmu_tx_assign(). + */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* * This delay will pause the removal around the point * specified by zfs_remove_max_bytes_pause. We do this * solely from the test suite or during debugging. @@ -1289,11 +1475,19 @@ spa_vdev_remove_thread(void *arg) VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); + /* + * Reacquire the vdev_config lock. The vdev_t + * that we're removing may have changed, e.g. due + * to a vdev_attach or vdev_detach. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vd = vdev_lookup_top(spa, svr->svr_vdev_id); + if (txg != last_txg) max_alloc = zfs_remove_max_segment; last_txg = txg; - spa_vdev_copy_impl(svr, &vca, &max_alloc, tx); + spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); dmu_tx_commit(tx); mutex_enter(&svr->svr_lock); @@ -1301,6 +1495,9 @@ spa_vdev_remove_thread(void *arg) } mutex_exit(&svr->svr_lock); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + /* * Wait for all copies to finish before cleaning up the vca. */ @@ -1318,7 +1515,7 @@ spa_vdev_remove_thread(void *arg) mutex_exit(&svr->svr_lock); } else { ASSERT0(range_tree_space(svr->svr_allocd_segs)); - vdev_remove_complete(vd); + vdev_remove_complete(spa); } thread_exit(); } @@ -1360,7 +1557,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; objset_t *mos = spa->spa_meta_objset; @@ -1433,8 +1630,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) * because we have not allocated mappings for it yet. */ uint64_t syncd = vdev_indirect_mapping_max_offset(vim); - range_tree_clear(svr->svr_allocd_segs, syncd, - msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd); + uint64_t sm_end = msp->ms_sm->sm_start + + msp->ms_sm->sm_size; + if (sm_end > syncd) + range_tree_clear(svr->svr_allocd_segs, + syncd, sm_end - syncd); mutex_exit(&svr->svr_lock); } @@ -1495,7 +1695,7 @@ spa_vdev_remove_cancel(spa_t *spa) if (spa->spa_vdev_removal == NULL) return (ESRCH); - uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id; + uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, spa_vdev_remove_cancel_sync, NULL, 0, @@ -1625,6 +1825,9 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) /* Make sure these changes are sync'ed */ spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); + /* Stop initializing */ + (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); + *txg = spa_vdev_config_enter(spa); sysevent_t *ev = spa_event_create(spa, vd, NULL, @@ -1785,6 +1988,13 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) */ error = spa_reset_logs(spa); + /* + * We stop any initializing that is currently in progress but leave + * the state as "active". This will allow the initializing to resume + * if the removal is canceled sometime later. 
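One small fix in spa_vdev_remove_cancel_sync() above deserves a note: the added sm_end > syncd guard closes an unsigned-underflow hazard. syncd is a vdev-wide mapping offset while sm_end is one metaslab's end, so when the synced mapping already extends past the metaslab being visited, the old expression sm_end - syncd would wrap: with uint64_t arithmetic, e.g. (hypothetical sizes) sm_end = 1G and syncd = 1G + 4K yields 2^64 - 4096 rather than a negative length, and range_tree_clear() would be asked to clear nearly the whole address space.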
+ */ + vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); + *txg = spa_vdev_config_enter(spa); /* @@ -1796,6 +2006,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) if (error != 0) { metaslab_group_activate(mg); + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); return (error); } @@ -1806,7 +2017,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_initiate_sync, - vd, 0, ZFS_SPACE_CHECK_NONE, tx); + (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); dmu_tx_commit(tx); return (0); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c index 92c670d28b2c..a03d18704dfc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -151,6 +151,7 @@ vdev_ops_t vdev_root_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_ROOT, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c index b40263fc981c..fc9ac80593ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -58,9 +58,7 @@ static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); void fzap_byteswap(void *vbuf, size_t size) { - uint64_t block_type; - - block_type = *(uint64_t *)vbuf; + uint64_t block_type = *(uint64_t *)vbuf; if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) zap_leaf_byteswap(vbuf, size); @@ -73,11 +71,6 @@ fzap_byteswap(void *vbuf, size_t size) void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) { - dmu_buf_t *db; - zap_leaf_t *l; - int i; - zap_phys_t *zp; - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); zap->zap_ismicro = FALSE; @@ -87,7 +80,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; - zp = zap_f_phys(zap); + zap_phys_t *zp = zap_f_phys(zap); /* * explicitly zero it since it might be coming from an * initialized microzap @@ -106,17 +99,18 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) zp->zap_flags = flags; /* block 1 will be the first leaf */ - for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++) + for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++) ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; /* * set up block 1 - the first leaf */ - VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + dmu_buf_t *db; + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db, tx); - l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); l->l_dbuf = db; zap_leaf_init(l, zp->zap_normflags != 0); @@ -146,9 +140,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), dmu_tx_t *tx) { - uint64_t b, newblk; - dmu_buf_t *db_old, *db_new; - int err; + uint64_t newblk; int bs = FZAP_BLOCK_SHIFT(zap); int hepb = 1<<(bs-4); /* hepb = half the number of entries in a block */ @@ -172,21 +164,23 @@ zap_table_grow(zap_t 
*zap, zap_table_phys_t *tbl, * Copy the ptrtbl from the old to new location. */ - b = tbl->zt_blks_copied; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + uint64_t b = tbl->zt_blks_copied; + dmu_buf_t *db_old; + int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); - if (err) + if (err != 0) return (err); /* first half of entries in old[b] go to new[2*b+0] */ - VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + dmu_buf_t *db_new; + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, @@ -221,22 +215,20 @@ static int zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, dmu_tx_t *tx) { - int err; - uint64_t blk, off; int bs = FZAP_BLOCK_SHIFT(zap); - dmu_buf_t *db; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(tbl->zt_blk != 0); dprintf("storing %llx at index %llx\n", val, idx); - blk = idx >> (bs-3); - off = idx & ((1<<(bs-3))-1); + uint64_t blk = idx >> (bs-3); + uint64_t off = idx & ((1<<(bs-3))-1); - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + dmu_buf_t *db; + int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - if (err) + if (err != 0) return (err); dmu_buf_will_dirty(db, tx); @@ -249,7 +241,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); - if (err) { + if (err != 0) { dmu_buf_rele(db, FTAG); return (err); } @@ -268,27 +260,24 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, static int zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) { - uint64_t blk, off; - int err; - dmu_buf_t *db; - dnode_t *dn; int bs = FZAP_BLOCK_SHIFT(zap); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - blk = idx >> (bs-3); - off = idx & ((1<<(bs-3))-1); + uint64_t blk = idx >> (bs-3); + uint64_t off = idx & ((1<<(bs-3))-1); /* * Note: this is equivalent to dmu_buf_hold(), but we use * _dnode_enter / _by_dnode because it's faster because we don't * have to hold the dnode. */ - dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, + dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); + dmu_buf_t *db; + int err = dmu_buf_hold_by_dnode(dn, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); dmu_buf_dnode_exit(zap->zap_dbuf); - if (err) + if (err != 0) return (err); *valp = ((uint64_t *)db->db_data)[off]; dmu_buf_rele(db, FTAG); @@ -319,11 +308,10 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) static void zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) { - int i; - for (i = 0; i < n; i++) { + for (int i = 0; i < n; i++) { uint64_t lb = src[i]; - dst[2*i+0] = lb; - dst[2*i+1] = lb; + dst[2 * i + 0] = lb; + dst[2 * i + 1] = lb; } } @@ -345,19 +333,16 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) * stored in the header block). 
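The copy scheme in zap_table_grow() is easiest to see with a tiny hypothetical block of four pointers: an old block holding [A, B, C, D] becomes two new blocks, [A, A, B, B] at new block index 2b and [C, C, D, D] at 2b + 1. Each old entry i reappears at new indices 2i and 2i + 1 via zap_ptrtbl_transfer(), because after the hash prefix grows by one bit, both extensions of an old prefix must still point at the same, not-yet-split leaf.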
Give it its own entire * block, which will double the size of the ptrtbl. */ - uint64_t newblk; - dmu_buf_t *db_new; - int err; - ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); - newblk = zap_allocate_blocks(zap, 1); - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + uint64_t newblk = zap_allocate_blocks(zap, 1); + dmu_buf_t *db_new; + int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); - if (err) + if (err != 0) return (err); dmu_buf_will_dirty(db_new, tx); zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), @@ -392,9 +377,8 @@ zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks) { - uint64_t newblk; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - newblk = zap_f_phys(zap)->zap_freeblk; + uint64_t newblk = zap_f_phys(zap)->zap_freeblk; zap_f_phys(zap)->zap_freeblk += nblocks; return (newblk); } @@ -411,7 +395,6 @@ zap_leaf_evict_sync(void *dbu) static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { - void *winner; zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -421,12 +404,11 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; - VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu); - ASSERT(winner == NULL); + VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu)); dmu_buf_will_dirty(l->l_dbuf, tx); zap_leaf_init(l, zap->zap_normflags != 0); @@ -460,11 +442,9 @@ zap_put_leaf(zap_leaf_t *l) static zap_leaf_t * zap_open_leaf(uint64_t blkid, dmu_buf_t *db) { - zap_leaf_t *l, *winner; - ASSERT(blkid != 0); - l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); rw_init(&l->l_rwlock, 0, 0, 0); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = blkid; @@ -472,7 +452,7 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db) l->l_dbuf = db; dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - winner = dmu_buf_set_user(db, &l->l_dbu); + zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu); rw_exit(&l->l_rwlock); if (winner != NULL) { @@ -510,17 +490,15 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { dmu_buf_t *db; - zap_leaf_t *l; - int bs = FZAP_BLOCK_SHIFT(zap); - int err; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + int bs = FZAP_BLOCK_SHIFT(zap); dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(dn, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); dmu_buf_dnode_exit(zap->zap_dbuf); - if (err) + if (err != 0) return (err); ASSERT3U(db->db_object, ==, zap->zap_object); @@ -528,7 +506,7 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, ASSERT3U(db->db_size, ==, 1 << bs); ASSERT(blkid != 0); - l = dmu_buf_get_user(db); + zap_leaf_t *l = dmu_buf_get_user(db); if (l == NULL) l = zap_open_leaf(blkid, db); @@ -583,8 +561,7 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) static int zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { - uint64_t idx, 
blk; - int err; + uint64_t blk; ASSERT(zap->zap_dbuf == NULL || zap_f_phys(zap) == zap->zap_dbuf->db_data); @@ -596,8 +573,8 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) return (SET_ERROR(EIO)); } - idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - err = zap_idx_to_blk(zap, idx, &blk); + uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + int err = zap_idx_to_blk(zap, idx, &blk); if (err != 0) return (err); err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); @@ -614,9 +591,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, { zap_t *zap = zn->zn_zap; uint64_t hash = zn->zn_hash; - zap_leaf_t *nl; - int prefix_diff, i, err; - uint64_t sibling; + int err; int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); @@ -636,19 +611,19 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; - if (err) + if (err != 0) return (err); ASSERT(!zap->zap_ismicro); while (old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { err = zap_grow_ptrtbl(zap, tx); - if (err) + if (err != 0) return (err); } err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); - if (err) + if (err != 0) return (err); if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { @@ -662,25 +637,26 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, zap_leaf_phys(l)->l_hdr.lh_prefix); - prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - (old_prefix_len + 1); - sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; + uint64_t sibling = + (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; /* check for i/o errors before doing zap_leaf_split */ - for (i = 0; i < (1ULL<<prefix_diff); i++) { + for (int i = 0; i < (1ULL << prefix_diff); i++) { uint64_t blk; - err = zap_idx_to_blk(zap, sibling+i, &blk); - if (err) + err = zap_idx_to_blk(zap, sibling + i, &blk); + if (err != 0) return (err); ASSERT3U(blk, ==, l->l_blkid); } - nl = zap_create_leaf(zap, tx); + zap_leaf_t *nl = zap_create_leaf(zap, tx); zap_leaf_split(l, nl, zap->zap_normflags != 0); /* set sibling pointers */ - for (i = 0; i < (1ULL << prefix_diff); i++) { - err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx); + for (int i = 0; i < (1ULL << prefix_diff); i++) { + err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx); ASSERT0(err); /* we checked for i/o errors above */ } @@ -708,8 +684,6 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, zap_put_leaf(l); if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { - int err; - /* * We are in the middle of growing the pointer table, or * this leaf will soon make us grow it. 
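The sibling arithmetic in zap_expand_leaf() above, worked with hypothetical values: let zt_shift = 5 and let the leaf being split have old_prefix_len = 3 and prefix 0b010. Then prefix_diff = 5 - (3 + 1) = 1, and for any hash that landed in this leaf, (ZAP_HASH_IDX(hash, 4) | 1) = 0b0101, so sibling = 0b0101 << 1 = 10. The 2^prefix_diff = 2 consecutive ptrtbl entries 10 and 11 all pointed at the old leaf -- which the i/o-error pre-check loop asserts -- and zap_set_idx_to_blk() redirects them to the new leaf, which takes over the entries whose fourth-most-significant hash bit is 1.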
@@ -719,10 +693,10 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, uint64_t zapobj = zap->zap_object; zap_unlockdir(zap, tag); - err = zap_lockdir(os, zapobj, tx, + int err = zap_lockdir(os, zapobj, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; - if (err) + if (err != 0) return; } @@ -763,9 +737,8 @@ fzap_checksize(uint64_t integer_size, uint64_t num_integers) static int fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) { - int err; - - if ((err = fzap_checkname(zn)) != 0) + int err = fzap_checkname(zn); + if (err != 0) return (err); return (fzap_checksize(integer_size, num_integers)); } @@ -779,10 +752,10 @@ fzap_lookup(zap_name_t *zn, char *realname, int rn_len, boolean_t *ncp) { zap_leaf_t *l; - int err; zap_entry_handle_t zeh; - if ((err = fzap_checkname(zn)) != 0) + int err = fzap_checkname(zn); + if (err != 0) return (err); err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); @@ -870,7 +843,8 @@ fzap_update(zap_name_t *zn, void *tag, dmu_tx_t *tx) { zap_leaf_t *l; - int err, create; + int err; + boolean_t create; zap_entry_handle_t zeh; zap_t *zap = zn->zn_zap; @@ -923,9 +897,9 @@ fzap_length(zap_name_t *zn, if (err != 0) goto out; - if (integer_size) + if (integer_size != 0) *integer_size = zeh.zeh_integer_size; - if (num_integers) + if (num_integers != 0) *num_integers = zeh.zeh_num_integers; out: zap_put_leaf(l); @@ -954,15 +928,14 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx) void fzap_prefetch(zap_name_t *zn) { - uint64_t idx, blk; + uint64_t blk; zap_t *zap = zn->zn_zap; - int bs; - idx = ZAP_HASH_IDX(zn->zn_hash, + uint64_t idx = ZAP_HASH_IDX(zn->zn_hash, zap_f_phys(zap)->zap_ptrtbl.zt_shift); if (zap_idx_to_blk(zap, idx, &blk) != 0) return; - bs = FZAP_BLOCK_SHIFT(zap); + int bs = FZAP_BLOCK_SHIFT(zap); dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, ZIO_PRIORITY_SYNC_READ); } @@ -975,9 +948,8 @@ uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx) { - uint64_t new_obj; - - VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0); + uint64_t new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx); + VERIFY(new_obj != 0); VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, tx)); @@ -989,13 +961,12 @@ zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name) { zap_cursor_t zc; - zap_attribute_t *za; int err; if (mask == 0) mask = -1ULL; - za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, zapobj); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { @@ -1005,7 +976,7 @@ zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, } } zap_cursor_fini(&zc); - kmem_free(za, sizeof (zap_attribute_t)); + kmem_free(za, sizeof (*za)); return (err); } @@ -1013,23 +984,23 @@ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) { zap_cursor_t zc; - zap_attribute_t za; - int err; + int err = 0; - err = 0; + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { - if (za.za_integer_length != 8 || za.za_num_integers != 1) { + if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } - err = zap_add(os, intoobj, za.za_name, - 8, 1, &za.za_first_integer, 
tx); - if (err) + err = zap_add(os, intoobj, za->za_name, + 8, 1, &za->za_first_integer, tx); + if (err != 0) break; } zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); return (err); } @@ -1038,23 +1009,23 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, uint64_t value, dmu_tx_t *tx) { zap_cursor_t zc; - zap_attribute_t za; - int err; + int err = 0; - err = 0; + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { - if (za.za_integer_length != 8 || za.za_num_integers != 1) { + if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } - err = zap_add(os, intoobj, za.za_name, + err = zap_add(os, intoobj, za->za_name, 8, 1, &value, tx); if (err != 0) break; } zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); return (err); } @@ -1063,29 +1034,29 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) { zap_cursor_t zc; - zap_attribute_t za; - int err; + int err = 0; - err = 0; + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { uint64_t delta = 0; - if (za.za_integer_length != 8 || za.za_num_integers != 1) { + if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } - err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta); + err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta); if (err != 0 && err != ENOENT) break; - delta += za.za_first_integer; - err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx); - if (err) + delta += za->za_first_integer; + err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx); + if (err != 0) break; } zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); return (err); } @@ -1150,12 +1121,11 @@ zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, dmu_tx_t *tx) { uint64_t value = 0; - int err; if (delta == 0) return (0); - err = zap_lookup(os, obj, name, 8, 1, &value); + int err = zap_lookup(os, obj, name, 8, 1, &value); if (err != 0 && err != ENOENT) return (err); value += delta; @@ -1253,7 +1223,6 @@ again: static void zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) { - int i, err; uint64_t lastblk = 0; /* @@ -1261,14 +1230,14 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) * can hold, then it'll be accounted for more than once, since * we won't have lastblk. 
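The zap_join(), zap_join_key() and zap_join_increment() rewrites above all make the same change: zap_attribute_t embeds the attribute name buffer (on the order of a few hundred bytes), so keeping one on the stack is costly in kernel context, where ZFS call chains run deep. Allocating it with kmem_alloc(sizeof (*za), KM_SLEEP) and freeing it on every exit path trades a cheap allocation for that stack footprint.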
*/ - for (i = 0; i < len; i++) { + for (int i = 0; i < len; i++) { zap_leaf_t *l; if (tbl[i] == lastblk) continue; lastblk = tbl[i]; - err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); + int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); if (err == 0) { zap_leaf_stats(zap, l, zs); zap_put_leaf(l); @@ -1333,14 +1302,12 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { - int b; - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); - for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; + for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; b++) { dmu_buf_t *db; int err; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c index 35dca89728fb..1c7c736d8e97 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ @@ -107,7 +107,6 @@ ldv(int len, const void *addr) void zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) { - int i; zap_leaf_t l; dmu_buf_t l_dbuf; @@ -123,10 +122,10 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); - for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) + for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); - for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { + for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); struct zap_leaf_entry *le; @@ -162,14 +161,12 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) void zap_leaf_init(zap_leaf_t *l, boolean_t sort) { - int i; - l->l_bs = highbit64(l->l_dbuf->db_size) - 1; zap_memset(&zap_leaf_phys(l)->l_hdr, 0, sizeof (struct zap_leaf_header)); zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } @@ -188,11 +185,9 @@ zap_leaf_init(zap_leaf_t *l, boolean_t sort) static uint16_t zap_leaf_chunk_alloc(zap_leaf_t *l) { - int chunk; - ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); - chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; + int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); @@ -232,7 +227,7 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf, uint16_t *chunkp = &chunk_head; int byten = 0; uint64_t value = 0; - int shift = (integer_size-1)*8; + int shift = (integer_size - 1) * 8; int len = num_integers; ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES); @@ -240,10 +235,9 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf, while (len > 0) { uint16_t chunk = zap_leaf_chunk_alloc(l); struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int i; la->la_type = ZAP_CHUNK_ARRAY; - for (i = 0; i < 
ZAP_LEAF_ARRAY_BYTES; i++) { + for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { if (byten == 0) value = ldv(integer_size, buf); la->la_array[i] = value >> shift; @@ -321,10 +315,9 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, while (len > 0) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int i; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { + for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { value = (value << 8) | la->la_array[i]; byten++; if (byten == array_int_len) { @@ -347,16 +340,13 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int bseen = 0; if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { - uint64_t *thiskey; - boolean_t match; - + uint64_t *thiskey = + kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP); ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); - thiskey = kmem_alloc(array_numints * sizeof (*thiskey), - KM_SLEEP); zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, sizeof (*thiskey), array_numints, thiskey); - match = bcmp(thiskey, zn->zn_key_orig, + boolean_t match = bcmp(thiskey, zn->zn_key_orig, array_numints * sizeof (*thiskey)) == 0; kmem_free(thiskey, array_numints * sizeof (*thiskey)); return (match); @@ -365,11 +355,10 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, ASSERT(zn->zn_key_intlen == 1); if (zn->zn_matchtype & MT_NORMALIZE) { char *thisname = kmem_alloc(array_numints, KM_SLEEP); - boolean_t match; zap_leaf_array_read(l, chunk, sizeof (char), array_numints, sizeof (char), array_numints, thisname); - match = zap_match(zn, thisname); + boolean_t match = zap_match(zn, thisname); kmem_free(thisname, array_numints); return (match); } @@ -400,12 +389,11 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) { - uint16_t *chunkp; struct zap_leaf_entry *le; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); + for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); *chunkp != CHAIN_END; chunkp = &le->le_next) { uint16_t chunk = *chunkp; le = ZAP_LEAF_ENTRY(l, chunk); @@ -446,17 +434,15 @@ int zap_leaf_lookup_closest(zap_leaf_t *l, uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) { - uint16_t chunk; uint64_t besth = -1ULL; uint32_t bestcd = -1U; uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; - uint16_t lh; struct zap_leaf_entry *le; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { - for (chunk = zap_leaf_phys(l)->l_hash[lh]; + for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { + for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh]; chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); @@ -529,11 +515,10 @@ int zap_entry_update(zap_entry_handle_t *zeh, uint8_t integer_size, uint64_t num_integers, const void *buf) { - int delta_chunks; zap_leaf_t *l = zeh->zeh_leaf; struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); - delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - + int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks) @@ -550,14 +535,12 @@ zap_entry_update(zap_entry_handle_t *zeh, void zap_entry_remove(zap_entry_handle_t *zeh) { - uint16_t entry_chunk; - struct zap_leaf_entry *le; zap_leaf_t *l = zeh->zeh_leaf; 
ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); - entry_chunk = *zeh->zeh_chunkp; - le = ZAP_LEAF_ENTRY(l, entry_chunk); + uint16_t entry_chunk = *zeh->zeh_chunkp; + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); zap_leaf_array_free(l, &le->le_name_chunk); @@ -575,15 +558,12 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, zap_entry_handle_t *zeh) { uint16_t chunk; - uint16_t *chunkp; struct zap_leaf_entry *le; - uint64_t valuelen; - int numchunks; uint64_t h = zn->zn_hash; - valuelen = integer_size * num_integers; + uint64_t valuelen = integer_size * num_integers; - numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (E2BIG); @@ -645,7 +625,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, /* link it into the hash chain */ /* XXX if we did the search above, we could just use that */ - chunkp = zap_leaf_rehash_entry(l, chunk); + uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); zap_leaf_phys(l)->l_hdr.lh_nentries++; @@ -673,14 +653,13 @@ boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, const char *name, zap_t *zap) { - uint64_t chunk; struct zap_leaf_entry *le; boolean_t allocdzn = B_FALSE; if (zap->zap_normflags == 0) return (B_FALSE); - for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); + for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); if (le->le_hash != zeh->zeh_hash) @@ -763,14 +742,11 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) static void zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) { - struct zap_leaf_entry *le, *nle; - uint16_t chunk; - - le = ZAP_LEAF_ENTRY(l, entry); + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - chunk = zap_leaf_chunk_alloc(nl); - nle = ZAP_LEAF_ENTRY(nl, chunk); + uint16_t chunk = zap_leaf_chunk_alloc(nl); + struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ (void) zap_leaf_rehash_entry(nl, chunk); @@ -791,7 +767,6 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { - int i; int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ @@ -818,7 +793,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) * but this accesses memory more sequentially, and when we're * called, the block is usually pretty full. 
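The split loop that follows moves each entry based on a single bit of its hash. A sketch of that rule, assuming prefix_len is the leaf's prefix length before the split; the helper name is hypothetical:

#include <stdint.h>

typedef int boolean_t;

/*
 * After a split, the old leaf's prefix grows by one bit and the new
 * sibling takes every entry whose next undecided hash bit is set.
 * The hash prefix occupies the most-significant bits, so with
 * prefix_len bits already decided, the next one is bit (63 - prefix_len).
 */
static boolean_t
entry_goes_to_new_leaf(uint64_t hash, int prefix_len)
{
        int bit = 64 - 1 - prefix_len;

        return ((hash & (1ULL << bit)) != 0);
}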
*/ - for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); if (le->le_type != ZAP_CHUNK_ENTRY) continue; @@ -833,9 +808,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { - int i, n; - - n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; @@ -851,7 +824,7 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_n_tenths_full[n]++; - for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { + for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { int nentries = 0; int chunk = zap_leaf_phys(l)->l_hash[i]; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c index 31dce3b1723b..50d5fc48f0c8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. @@ -89,22 +89,20 @@ zap_hash(zap_name_t *zn) ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { - int i; const uint64_t *wp = zn->zn_key_norm; ASSERT(zn->zn_key_intlen == 8); - for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) { - int j; + for (int i = 0; i < zn->zn_key_norm_numints; + wp++, i++) { uint64_t word = *wp; - for (j = 0; j < zn->zn_key_intlen; j++) { + for (int j = 0; j < zn->zn_key_intlen; j++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ word) & 0xFF]; word >>= NBBY; } } } else { - int i, len; const uint8_t *cp = zn->zn_key_norm; /* @@ -114,10 +112,10 @@ zap_hash(zap_name_t *zn) * zn_key_*_numints includes the terminating * null for non-binary keys.) 
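Both hash loops above are the classic table-driven CRC-64 update. A standalone sketch using the reflected ECMA-182 polynomial that zfs_crc64_table is built from; unlike the real zap_hash(), this starts from a caller-supplied h rather than the per-ZAP salt:

#include <stdint.h>
#include <stddef.h>

#define CRC64_POLY      0xC96C5795D7870F42ULL   /* ECMA-182, reflected */

static uint64_t crc64_table[256];

static void
crc64_init(void)
{
        for (int i = 0; i < 256; i++) {
                uint64_t c = i;
                for (int j = 0; j < 8; j++)
                        c = (c >> 1) ^ (-(c & 1) & CRC64_POLY);
                crc64_table[i] = c;
        }
}

/* One byte at a time, the same update step as the loops above. */
static uint64_t
crc64_update(uint64_t h, const uint8_t *buf, size_t len)
{
        for (size_t i = 0; i < len; i++)
                h = (h >> 8) ^ crc64_table[(h ^ buf[i]) & 0xFF];
        return (h);
}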
*/ - len = zn->zn_key_norm_numints - 1; + int len = zn->zn_key_norm_numints - 1; ASSERT(zn->zn_key_intlen == 1); - for (i = 0; i < len; cp++, i++) { + for (int i = 0; i < len; cp++, i++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ *cp) & 0xFF]; } @@ -137,15 +135,12 @@ zap_hash(zap_name_t *zn) static int zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags) { - size_t inlen, outlen; - int err; - ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); - inlen = strlen(name) + 1; - outlen = ZAP_MAXNAMELEN; + size_t inlen = strlen(name) + 1; + size_t outlen = ZAP_MAXNAMELEN; - err = 0; + int err = 0; (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); @@ -255,12 +250,11 @@ zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) static void mzap_byteswap(mzap_phys_t *buf, size_t size) { - int i, max; buf->mz_block_type = BSWAP_64(buf->mz_block_type); buf->mz_salt = BSWAP_64(buf->mz_salt); buf->mz_normflags = BSWAP_64(buf->mz_normflags); - max = (size / MZAP_ENT_LEN) - 1; - for (i = 0; i < max; i++) { + int max = (size / MZAP_ENT_LEN) - 1; + for (int i = 0; i < max; i++) { buf->mz_chunk[i].mze_value = BSWAP_64(buf->mz_chunk[i].mze_value); buf->mz_chunk[i].mze_cd = @@ -271,9 +265,7 @@ mzap_byteswap(mzap_phys_t *buf, size_t size) void zap_byteswap(void *buf, size_t size) { - uint64_t block_type; - - block_type = *(uint64_t *)buf; + uint64_t block_type = *(uint64_t *)buf; if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { /* ASSERT(magic == ZAP_LEAF_MAGIC); */ @@ -289,27 +281,22 @@ mze_compare(const void *arg1, const void *arg2) const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; - if (mze1->mze_hash > mze2->mze_hash) - return (+1); - if (mze1->mze_hash < mze2->mze_hash) - return (-1); - if (mze1->mze_cd > mze2->mze_cd) - return (+1); - if (mze1->mze_cd < mze2->mze_cd) - return (-1); - return (0); + int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash); + if (likely(cmp)) + return (cmp); + + return (AVL_CMP(mze1->mze_cd, mze2->mze_cd)); } static int mze_insert(zap_t *zap, int chunkid, uint64_t hash) { - mzap_ent_t *mze; avl_index_t idx; ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); + mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; mze->mze_hash = hash; mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; @@ -352,10 +339,8 @@ static uint32_t mze_find_unused_cd(zap_t *zap, uint64_t hash) { mzap_ent_t mze_tofind; - mzap_ent_t *mze; avl_index_t idx; avl_tree_t *avl = &zap->zap_m.zap_avl; - uint32_t cd; ASSERT(zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); @@ -363,8 +348,8 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash) mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; - cd = 0; - for (mze = avl_find(avl, &mze_tofind, &idx); + uint32_t cd = 0; + for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { if (mze->mze_cd != cd) break; @@ -399,15 +384,13 @@ static zap_t * mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) { zap_t *winner; - zap_t *zap; - int i; uint64_t *zap_hdr = (uint64_t *)db->db_data; uint64_t zap_block_type = zap_hdr[0]; uint64_t zap_magic = zap_hdr[1]; ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); - zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); + zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, 0, 0, 0); rw_enter(&zap->zap_rwlock, 
RW_WRITER); zap->zap_objset = os; @@ -443,7 +426,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) avl_create(&zap->zap_m.zap_avl, mze_compare, sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); - for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { + for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { @@ -495,28 +478,21 @@ handle_winner: return (winner); } +/* + * This routine "consumes" the caller's hold on the dbuf, which must + * have the specified tag. + */ static int zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { - zap_t *zap; - krw_t lt; - ASSERT0(db->db_offset); objset_t *os = dmu_buf_get_objset(db); uint64_t obj = db->db_object; *zapp = NULL; -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - - zap = dmu_buf_get_user(db); + zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { zap = mzap_open(os, obj, db); if (zap == NULL) { @@ -535,7 +511,7 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, * can only be different if it was upgraded from micro to fat, * and micro wanted WRITER but fat only needs READER. */ - lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; + krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; rw_enter(&zap->zap_rwlock, lt); if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { /* it was upgraded, now we only need reader */ @@ -581,12 +557,19 @@ zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) { dmu_buf_t *db; - int err; - err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) { return (err); } +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + } +#endif + err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) { dmu_buf_rele(db, tag); @@ -599,11 +582,17 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) { dmu_buf_t *db; - int err; - err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + } +#endif err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) dmu_buf_rele(db, tag); @@ -620,22 +609,20 @@ zap_unlockdir(zap_t *zap, void *tag) static int mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) { - mzap_phys_t *mzp; - int i, sz, nchunks; int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - sz = zap->zap_dbuf->db_size; - mzp = zio_buf_alloc(sz); + int sz = zap->zap_dbuf->db_size; + mzap_phys_t *mzp = zio_buf_alloc(sz); bcopy(zap->zap_dbuf->db_data, mzp, sz); - nchunks = zap->zap_m.zap_num_chunks; + int nchunks = zap->zap_m.zap_num_chunks; if (!flags) { err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); - if (err) { + if (err != 0) { zio_buf_free(mzp, sz); return (err); } @@ -648,19 +635,18 @@ mzap_upgrade(zap_t 
**zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) fzap_upgrade(zap, tx, flags); - for (i = 0; i < nchunks; i++) { + for (int i = 0; i < nchunks; i++) { mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; - zap_name_t *zn; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, mze->mze_value); - zn = zap_name_alloc(zap, mze->mze_name, 0); + zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0); err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tag, tx); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ zap_name_free(zn); - if (err) + if (err != 0) break; } zio_buf_free(mzp, sz); @@ -690,32 +676,24 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, dmu_tx_t *tx) { dmu_buf_t *db; - mzap_phys_t *zp; - VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); - -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif + VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db, tx); - zp = db->db_data; + mzap_phys_t *zp = db->db_data; zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; zp->mz_normflags = normflags; - dmu_buf_rele(db, FTAG); if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ - VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER, - B_FALSE, B_FALSE, FTAG, &zap)); - VERIFY3U(0, ==, mzap_upgrade(&zap, FTAG, tx, flags)); + VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, + B_FALSE, B_FALSE, &zap)); + VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); + } else { + dmu_buf_rele(db, FTAG); } } @@ -732,9 +710,8 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - int err; - - err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); + int err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); if (err != 0) return (err); mzap_create_impl(os, obj, normflags, 0, tx); @@ -752,6 +729,7 @@ uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); mzap_create_impl(os, obj, normflags, 0, tx); @@ -763,6 +741,7 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && @@ -808,10 +787,10 @@ int zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) { zap_t *zap; - int err; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); if (!zap->zap_ismicro) { err = fzap_count(zap, count); @@ -829,7 +808,6 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) static boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) { - mzap_ent_t *other; int direction = AVL_BEFORE; boolean_t allocdzn = B_FALSE; @@ -837,7 +815,7 @@ mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, 
mzap_ent_t *mze) return (B_FALSE); again: - for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction); + for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction); other && other->mze_hash == mze->mze_hash; other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { @@ -882,10 +860,8 @@ zap_lookup_impl(zap_t *zap, const char *name, boolean_t *ncp) { int err = 0; - mzap_ent_t *mze; - zap_name_t *zn; - zn = zap_name_alloc(zap, name, mt); + zap_name_t *zn = zap_name_alloc(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); @@ -893,7 +869,7 @@ zap_lookup_impl(zap_t *zap, const char *name, err = fzap_lookup(zn, integer_size, num_integers, buf, realname, rn_len, ncp); } else { - mze = mze_find(zn); + mzap_ent_t *mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { @@ -924,9 +900,9 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, boolean_t *ncp) { zap_t *zap; - int err; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_impl(zap, name, integer_size, @@ -950,9 +926,8 @@ zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, boolean_t *ncp) { zap_t *zap; - int err; - err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, + int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); @@ -967,13 +942,12 @@ zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints) { zap_t *zap; - int err; - zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc_uint64(zap, key, key_numints); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -990,13 +964,12 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) { zap_t *zap; - int err; - zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc_uint64(zap, key, key_numints); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1024,14 +997,12 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; - int err; - mzap_ent_t *mze; - zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc(zap, name, 0); + zap_name_t *zn = zap_name_alloc(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1039,7 +1010,7 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, if (!zap->zap_ismicro) { err = fzap_length(zn, integer_size, num_integers); } else { - mze = mze_find(zn); + mzap_ent_t *mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { @@ -1059,13 +1030,12 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t *integer_size, 
uint64_t *num_integers) { zap_t *zap; - int err; - zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc_uint64(zap, key, key_numints); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1079,26 +1049,24 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, static void mzap_addent(zap_name_t *zn, uint64_t value) { - int i; zap_t *zap = zn->zn_zap; int start = zap->zap_m.zap_alloc_next; - uint32_t cd; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG - for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { + for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } #endif - cd = mze_find_unused_cd(zap, zn->zn_hash); + uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ ASSERT(cd < zap_maxcd(zap)); again: - for (i = start; i < zap->zap_m.zap_num_chunks; i++) { + for (int i = start; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0] == 0) { mze->mze_value = value; @@ -1125,12 +1093,10 @@ zap_add_impl(zap_t *zap, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, void *tag) { - int err = 0; - mzap_ent_t *mze; const uint64_t *intval = val; - zap_name_t *zn; + int err = 0; - zn = zap_name_alloc(zap, key, 0); + zap_name_t *zn = zap_name_alloc(zap, key, 0); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); @@ -1147,8 +1113,7 @@ zap_add_impl(zap_t *zap, const char *key, } zap = zn->zn_zap; /* fzap_add() may change zap */ } else { - mze = mze_find(zn); - if (mze != NULL) { + if (mze_find(zn) != NULL) { err = SET_ERROR(EEXIST); } else { mzap_addent(zn, *intval); @@ -1199,13 +1164,12 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, const void *val, dmu_tx_t *tx) { zap_t *zap; - int err; - zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc_uint64(zap, key, key_numints); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1223,11 +1187,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; - mzap_ent_t *mze; uint64_t oldval; const uint64_t *intval = val; - zap_name_t *zn; - int err; #ifdef ZFS_DEBUG /* @@ -1238,10 +1199,11 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); #endif - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc(zap, name, 0); + zap_name_t *zn = zap_name_alloc(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1261,7 +1223,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, } zap = zn->zn_zap; /* fzap_update() may change zap */ } else { - mze = mze_find(zn); + 
mzap_ent_t *mze = mze_find(zn); if (mze != NULL) { ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); MZE_PHYS(zap, mze)->mze_value = *intval; @@ -1282,13 +1244,12 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; - zap_name_t *zn; - int err; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc_uint64(zap, key, key_numints); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1311,17 +1272,15 @@ static int zap_remove_impl(zap_t *zap, const char *name, matchtype_t mt, dmu_tx_t *tx) { - mzap_ent_t *mze; - zap_name_t *zn; int err = 0; - zn = zap_name_alloc(zap, name, mt); + zap_name_t *zn = zap_name_alloc(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { - mze = mze_find(zn); + mzap_ent_t *mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { @@ -1369,13 +1328,12 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) { zap_t *zap; - int err; - zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc_uint64(zap, key, key_numints); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1451,9 +1409,6 @@ int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) { int err; - avl_index_t idx; - mzap_ent_t mze_tofind; - mzap_ent_t *mze; if (zc->zc_hash == -1ULL) return (SET_ERROR(ENOENT)); @@ -1462,7 +1417,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); - if (err) + if (err != 0) return (err); /* @@ -1482,10 +1437,14 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { + avl_index_t idx; + mzap_ent_t mze_tofind; + mze_tofind.mze_hash = zc->zc_hash; mze_tofind.mze_cd = zc->zc_cd; - mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); + mzap_ent_t *mze = + avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); if (mze == NULL) { mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, idx, AVL_AFTER); @@ -1562,11 +1521,11 @@ out: int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) { - int err; zap_t *zap; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); bzero(zs, sizeof (zap_stats_t)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c index ea20891ec211..54bc638c6e98 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c @@ -433,7 +433,7 @@ zcp_lua_to_nvlist_impl(lua_State *state, int index, nvlist_t *nvl, /* * Convert a lua value to an nvpair, adding it to an nvlist with the given key. 
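The invariant behind the VERIFY3U() and lua_settop() calls added in the zcp.c hunks below is that exactly one return value sits on the Lua stack when it is converted to an nvlist, so error paths must clear the stack before pushing their message. A sketch against the stock Lua C API (ZFS embeds its own copy of Lua; this is only an illustration):

#include <lua.h>
#include <lauxlib.h>

int
main(void)
{
        lua_State *state = luaL_newstate();

        lua_pushinteger(state, 1);      /* leftover values ... */
        lua_pushinteger(state, 2);

        lua_settop(state, 0);           /* drop everything first */
        (void) lua_pushfstring(state, "Could not open pool: %s", "tank");

        /* the single-return-value invariant now holds */
        if (lua_gettop(state) != 1)
                return (1);

        lua_close(state);
        return (0);
}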
*/ -void +static void zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key) { /* @@ -445,7 +445,7 @@ zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key) (void) lua_error(state); } -int +static int zcp_lua_to_nvlist_helper(lua_State *state) { nvlist_t *nv = (nvlist_t *)lua_touserdata(state, 2); @@ -454,11 +454,12 @@ zcp_lua_to_nvlist_helper(lua_State *state) return (0); } -void +static void zcp_convert_return_values(lua_State *state, nvlist_t *nvl, const char *key, zcp_eval_arg_t *evalargs) { int err; + VERIFY3U(1, ==, lua_gettop(state)); lua_pushcfunction(state, zcp_lua_to_nvlist_helper); lua_pushlightuserdata(state, (char *)key); lua_pushlightuserdata(state, nvl); @@ -904,6 +905,7 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs) ZCP_RET_RETURN, evalargs); } else if (return_count > 1) { evalargs->ea_result = SET_ERROR(ECHRNG); + lua_settop(state, 0); (void) lua_pushfstring(state, "Multiple return " "values not supported"); zcp_convert_return_values(state, evalargs->ea_outnvl, @@ -965,6 +967,7 @@ static void zcp_pool_error(zcp_eval_arg_t *evalargs, const char *poolname) { evalargs->ea_result = SET_ERROR(ECHRNG); + lua_settop(evalargs->ea_state, 0); (void) lua_pushfstring(evalargs->ea_state, "Could not open pool: %s", poolname); zcp_convert_return_values(evalargs->ea_state, evalargs->ea_outnvl, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c index 78b2912df1d6..76003e3544f4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c @@ -220,7 +220,7 @@ spa_features_check(spa_t *spa, boolean_t for_write, * * Note: well-designed features will not need to use this; they should * use spa_feature_is_enabled() and spa_feature_is_active() instead. - * However, this is non-static for zdb and zhack. + * However, this is non-static for zdb, zhack, and spa_add_feature_stats(). */ int feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c index e74799a70fe0..581b6b1bfb64 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c @@ -71,14 +71,10 @@ static char *nulldomain = ""; static int idx_compare(const void *arg1, const void *arg2) { - const fuid_domain_t *node1 = arg1; - const fuid_domain_t *node2 = arg2; + const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; + const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; - if (node1->f_idx < node2->f_idx) - return (-1); - else if (node1->f_idx > node2->f_idx) - return (1); - return (0); + return (AVL_CMP(node1->f_idx, node2->f_idx)); } /* @@ -87,14 +83,13 @@ idx_compare(const void *arg1, const void *arg2) static int domain_compare(const void *arg1, const void *arg2) { - const fuid_domain_t *node1 = arg1; - const fuid_domain_t *node2 = arg2; + const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; + const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; int val; val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); - if (val == 0) - return (0); - return (val > 0 ? 
1 : -1); + + return (AVL_ISIGN(val)); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index ca6ac539f0d6..af73005c260e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -195,6 +195,8 @@ #include <sys/zcp.h> #include <sys/zio_checksum.h> #include <sys/vdev_removal.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_initialize.h> #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -3865,6 +3867,80 @@ zfs_ioc_destroy(zfs_cmd_t *zc) } /* + * innvl: { + * vdevs: { + * guid 1, guid 2, ... + * }, + * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND} + * } + * + * outnvl: { + * [func: EINVAL (if provided command type didn't make sense)], + * [vdevs: { + * guid1: errno, (see function body for possible errnos) + * ... + * }] + * } + * + */ +static int +zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + + error = spa_open(poolname, &spa, FTAG); + if (error != 0) + return (error); + + uint64_t cmd_type; + if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, + &cmd_type) != 0) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + if (!(cmd_type == POOL_INITIALIZE_CANCEL || + cmd_type == POOL_INITIALIZE_DO || + cmd_type == POOL_INITIALIZE_SUSPEND)) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + nvlist_t *vdev_guids; + if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, + &vdev_guids) != 0) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + nvlist_t *vdev_errlist = fnvlist_alloc(); + int total_errors = 0; + + for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); + pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { + uint64_t vdev_guid = fnvpair_value_uint64(pair); + + error = spa_vdev_initialize(spa, vdev_guid, cmd_type); + if (error != 0) { + char guid_as_str[MAXNAMELEN]; + + (void) snprintf(guid_as_str, sizeof (guid_as_str), + "%llu", (unsigned long long)vdev_guid); + fnvlist_add_int64(vdev_errlist, guid_as_str, error); + total_errors++; + } + } + if (fnvlist_size(vdev_errlist) > 0) { + fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, + vdev_errlist); + } + fnvlist_free(vdev_errlist); + + spa_close(spa, FTAG); + return (total_errors > 0 ? 
EINVAL : 0); +} + +/* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot @@ -6118,6 +6194,10 @@ zfs_ioctl_init(void) zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, + zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c index b40bdbea123c..7743e81dd5f1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c @@ -594,12 +594,8 @@ zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) int zfs_range_compare(const void *arg1, const void *arg2) { - const rl_t *rl1 = arg1; - const rl_t *rl2 = arg2; - - if (rl1->r_off > rl2->r_off) - return (1); - if (rl1->r_off < rl2->r_off) - return (-1); - return (0); + const rl_t *rl1 = (const rl_t *)arg1; + const rl_t *rl2 = (const rl_t *)arg2; + + return (AVL_CMP(rl1->r_off, rl2->r_off)); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 980c32820c3f..66d858081485 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -2630,6 +2630,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) dmu_tx_commit(tx); zfsvfs->z_version = newvers; + os->os_version = newvers; zfs_set_fuid_feature(zfsvfs); @@ -2642,17 +2643,47 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { - const char *pname; - int error = ENOENT; + uint64_t *cached_copy = NULL; /* - * Look up the file system's value for the property. For the - * version property, we look up a slightly different string. + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. */ - if (prop == ZFS_PROP_VERSION) + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { pname = ZPL_VERSION_STR; - else + } else { pname = zfs_prop_to_name(prop); + } if (os != NULL) { ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); @@ -2677,6 +2708,15 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) } error = 0; } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. 
+ */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c index 58c3807f6ae4..ca34a69a6553 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -1691,7 +1691,8 @@ zfs_trunc(znode_t *zp, uint64_t end) return (0); } - error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, + DMU_OBJECT_END); if (error) { zfs_range_unlock(rl); return (error); @@ -2102,6 +2103,17 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, *path = '\0'; sa_hdl = hdl; + uint64_t deleteq_obj; + VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, + ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); + error = zap_lookup_int(osp, deleteq_obj, obj); + if (error == 0) { + return (ESTALE); + } else if (error != ENOENT) { + return (error); + } + error = 0; + for (;;) { uint64_t pobj; char component[MAXNAMELEN + 2]; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index 9571998347f2..8cc65d6f31f8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -131,17 +131,11 @@ zil_bp_compare(const void *x1, const void *x2) const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; - if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) - return (-1); - if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) - return (1); - - if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) - return (-1); - if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) - return (1); + int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + if (likely(cmp)) + return (cmp); - return (0); + return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void @@ -503,12 +497,7 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; - if (v1 < v2) - return (-1); - if (v1 > v2) - return (1); - - return (0); + return (AVL_CMP(v1, v2)); } static lwb_t * @@ -665,7 +654,8 @@ zil_create(zilog_t *zilog) BP_ZERO(&blk); } - error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL, + error = zio_alloc_zil(zilog->zl_spa, + zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL, ZIL_MIN_BLKSZ, &slog); if (error == 0) @@ -1342,7 +1332,8 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) BP_ZERO(bp); /* pass the old blkptr in order to spread log blocks across devs */ - error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog); + error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object, + txg, bp, &lwb->lwb_blk, zil_blksz, &slog); if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); bp->blk_cksum = lwb->lwb_blk.blk_cksum; @@ -1624,12 +1615,7 @@ zil_aitx_compare(const void *x1, const void *x2) const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; - if (o1 < o2) - return (-1); - if (o1 > o2) - return (1); - - return (0); + return 
(AVL_CMP(o1, o2)); } /* @@ -2297,7 +2283,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) */ lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); + IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED); /* * Since the lwb's zio hadn't been issued by the time this thread diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index 3eb8747619fa..53d0f4d27b08 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -44,6 +44,7 @@ #include <sys/dsl_scan.h> #include <sys/metaslab_impl.h> #include <sys/abd.h> +#include <sys/cityhash.h> SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); @@ -99,9 +100,6 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; extern vmem_t *zio_alloc_arena; #endif -#define ZIO_PIPELINE_CONTINUE 0x100 -#define ZIO_PIPELINE_STOP 0x101 - #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul @@ -538,7 +536,8 @@ zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) } static void -zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) +zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, + zio_t **next_to_executep) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; @@ -557,13 +556,33 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); + /* - * Dispatch the parent zio in its own taskq so that - * the child can continue to make progress. This also - * prevents overflowing the stack when we have deeply nested - * parent-child relationships. + * If we can tell the caller to execute this parent next, do + * so. Otherwise dispatch the parent zio as its own task. + * + * Having the caller execute the parent when possible reduces + * locking on the zio taskq's, reduces context switch + * overhead, and has no recursion penalty. Note that one + * read from disk typically causes at least 3 zio's: a + * zio_null(), the logical zio_read(), and then a physical + * zio. When the physical ZIO completes, we are able to call + * zio_done() on all 3 of these zio's from one invocation of + * zio_execute() by returning the parent back to + * zio_execute(). Since the parent isn't executed until this + * thread returns back to zio_execute(), the caller should do + * so promptly. + * + * In other cases, dispatching the parent prevents + * overflowing the stack when we have deeply nested + * parent-child relationships, as we do with the "mega zio" + * of writes for spa_sync(), and the chain of ZIL blocks. 
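The dispatch decision described in the comment above is generic enough to sketch on its own: when the last child completes, the parent is either handed back through an out-parameter for the caller's execute loop or dispatched to a task queue. work_t and dispatch_to_taskq() are illustrative names, not the ZIO types:

#include <stddef.h>

typedef struct work {
        struct work     *w_parent;
        int             w_pending;      /* outstanding children */
} work_t;

/* stand-in for zio_taskq_dispatch(): queue for a worker thread */
static void
dispatch_to_taskq(work_t *w)
{
        (void) w;
}

/*
 * Called as each child completes. When the last child finishes,
 * either hand the parent back to the caller's execute loop (no
 * context switch, no recursion) or dispatch it to a task queue
 * (bounded stack depth for deep parent/child chains).
 */
static void
notify_parent(work_t *child, work_t **next_to_runp)
{
        work_t *p = child->w_parent;

        if (p == NULL || --p->w_pending > 0)
                return;

        if (next_to_runp != NULL && *next_to_runp == NULL)
                *next_to_runp = p;      /* caller runs it next */
        else
                dispatch_to_taskq(p);
}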
*/ - zio_taskq_dispatch(pio, type, B_FALSE); + if (next_to_executep != NULL && *next_to_executep == NULL) { + *next_to_executep = pio; + } else { + zio_taskq_dispatch(pio, type, B_FALSE); + } } else { mutex_exit(&pio->io_lock); } @@ -1149,17 +1168,6 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || done != NULL); - /* - * In the common case, where the parent zio was to a normal vdev, - * the child zio must be to a child vdev of that vdev. Otherwise, - * the child zio must be to a top-level vdev. - */ - if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) { - ASSERT3P(vd->vdev_parent, ==, pio->io_vd); - } else { - ASSERT3P(vd, ==, vd->vdev_top); - } - if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the @@ -1223,7 +1231,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, - int type, zio_priority_t priority, enum zio_flag flags, + zio_type_t type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { zio_t *zio; @@ -1285,7 +1293,7 @@ zio_shrink(zio_t *zio, uint64_t size) * ========================================================================== */ -static int +static zio_t * zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -1322,14 +1330,14 @@ zio_read_bp_init(zio_t *zio) if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); @@ -1344,7 +1352,7 @@ zio_write_bp_init(zio_t *zio) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (BP_IS_EMBEDDED(bp)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); /* * If we've been overridden and nopwrite is set then @@ -1355,13 +1363,13 @@ zio_write_bp_init(zio_t *zio) ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) - return (ZIO_PIPELINE_CONTINUE); + return (zio); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); @@ -1369,7 +1377,7 @@ zio_write_bp_init(zio_t *zio) if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -1381,10 +1389,10 @@ zio_write_bp_init(zio_t *zio) zio->io_pipeline = zio->io_orig_pipeline; } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -1403,11 +1411,11 @@ zio_write_compress(zio_t *zio) */ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } if (!IO_IS_ALLOCATING(zio)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); if (zio->io_children_ready != NULL) { /* @@ -1466,7 +1474,7 @@ zio_write_compress(zio_t *zio) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } else { /* * Round up compressed size up to the ashift @@ 
-1554,10 +1562,10 @@ zio_write_compress(zio_t *zio) zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -1569,7 +1577,7 @@ zio_free_bp_init(zio_t *zio) ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -1643,12 +1651,12 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q) return (B_FALSE); } -static int +static zio_t * zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (ZIO_PIPELINE_STOP); + return (NULL); } void @@ -1730,14 +1738,13 @@ static zio_pipe_stage_t *zio_pipeline[]; void zio_execute(zio_t *zio) { - zio->io_executor = curthread; - ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; - int rv; + + zio->io_executor = curthread; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); @@ -1768,12 +1775,16 @@ zio_execute(zio_t *zio) zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; - rv = zio_pipeline[highbit64(stage) - 1](zio); - if (rv == ZIO_PIPELINE_STOP) - return; + /* + * The zio pipeline stage returns the next zio to execute + * (typically the same as this one), or NULL if we should + * stop. + */ + zio = zio_pipeline[highbit64(stage) - 1](zio); - ASSERT(rv == ZIO_PIPELINE_CONTINUE); + if (zio == NULL) + return; } } @@ -2236,7 +2247,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, zio_nowait(zio); } -static int +static zio_t * zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -2248,16 +2259,16 @@ zio_gang_assemble(zio_t *zio) zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); @@ -2271,7 +2282,7 @@ zio_gang_issue(zio_t *zio) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } static void @@ -2310,7 +2321,7 @@ zio_write_gang_done(zio_t *zio) abd_put(zio->io_abd); } -static int +static zio_t * zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; @@ -2335,7 +2346,8 @@ zio_write_gang_block(zio_t *pio) ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); flags |= METASLAB_ASYNC_ALLOC; - VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); + VERIFY(refcount_held(&mc->mc_alloc_slots[pio->io_allocator], + pio)); /* * The logical zio has already placed a reservation for @@ -2346,12 +2358,12 @@ zio_write_gang_block(zio_t *pio) * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, - pio, flags)); + pio->io_allocator, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, - &pio->io_alloc_list, pio); + &pio->io_alloc_list, pio, pio->io_allocator); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); @@ -2365,10 +2377,10 @@ zio_write_gang_block(zio_t *pio) * stage. 
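Reduced to essentials, the new pipeline contract sketched in the zio_execute() hunk above looks like this; the types, stage count, and loop bound are illustrative, not the ZIO definitions:

#define NSTAGES 4       /* illustrative */

typedef struct item item_t;
typedef item_t *(*stage_fn_t)(item_t *);

struct item {
        int             i_stage;
        stage_fn_t      i_pipeline[NSTAGES];
};

static void
execute(item_t *it)
{
        while (it != NULL && it->i_stage < NSTAGES) {
                /*
                 * A stage returns the next item to run: usually its own
                 * argument, sometimes a parent made runnable by the
                 * notify step, or NULL if the item was handed off to a
                 * task queue or is waiting on children.
                 */
                it = it->i_pipeline[it->i_stage++](it);
        }
}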
*/ metaslab_class_throttle_unreserve(mc, - gbh_copies - copies, pio); + gbh_copies - copies, pio->io_allocator, pio); } pio->io_error = error; - return (ZIO_PIPELINE_CONTINUE); + return (pio); } if (pio == gio) { @@ -2423,7 +2435,7 @@ zio_write_gang_block(zio_t *pio) * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, - zp.zp_copies, cio, flags)); + zp.zp_copies, cio->io_allocator, cio, flags)); } zio_nowait(cio); } @@ -2435,7 +2447,7 @@ zio_write_gang_block(zio_t *pio) zio_nowait(zio); - return (ZIO_PIPELINE_CONTINUE); + return (pio); } /* @@ -2456,7 +2468,7 @@ zio_write_gang_block(zio_t *pio) * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ -static int +static zio_t * zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -2483,7 +2495,7 @@ zio_nop_write(zio_t *zio) BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); /* * If the checksums match then reset the pipeline so that we @@ -2503,7 +2515,7 @@ zio_nop_write(zio_t *zio) zio->io_flags |= ZIO_FLAG_NOPWRITE; } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -2531,7 +2543,7 @@ zio_ddt_child_read_done(zio_t *zio) mutex_exit(&pio->io_lock); } -static int +static zio_t * zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -2551,7 +2563,7 @@ zio_ddt_read_start(zio_t *zio) zio->io_vsd = dde; if (ddp_self == NULL) - return (ZIO_PIPELINE_CONTINUE); + return (zio); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) @@ -2564,23 +2576,23 @@ zio_ddt_read_start(zio_t *zio) zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } ASSERT(BP_GET_DEDUP(bp)); @@ -2592,12 +2604,12 @@ zio_ddt_read_done(zio_t *zio) ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (ZIO_PIPELINE_STOP); + return (NULL); } if (dde->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_repair_abd, @@ -2610,7 +2622,7 @@ zio_ddt_read_done(zio_t *zio) ASSERT(zio->io_vsd == NULL); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } static boolean_t @@ -2768,7 +2780,7 @@ zio_ddt_ditto_write_done(zio_t *zio) ddt_exit(ddt); } -static int +static zio_t * zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -2812,7 +2824,7 @@ zio_ddt_write(zio_t *zio) ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); @@ -2838,7 +2850,7 @@ zio_ddt_write(zio_t *zio) zio->io_bp_override = NULL; BP_ZERO(bp); ddt_exit(ddt); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, @@ -2880,12 +2892,12 @@ zio_ddt_write(zio_t *zio) if (dio) 
zio_nowait(dio); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ddt_entry_t *freedde; /* for debugging */ -static int +static zio_t * zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -2903,7 +2915,7 @@ zio_ddt_free(zio_t *zio) ddt_phys_decref(ddp); ddt_exit(ddt); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -2913,13 +2925,13 @@ zio_ddt_free(zio_t *zio) */ static zio_t * -zio_io_to_allocate(spa_t *spa) +zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; - ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); + ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); - zio = avl_first(&spa->spa_alloc_tree); + zio = avl_first(&spa->spa_alloc_trees[allocator]); if (zio == NULL) return (NULL); @@ -2929,18 +2941,19 @@ zio_io_to_allocate(spa_t *spa) * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. */ + ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(spa_normal_class(spa), - zio->io_prop.zp_copies, zio, 0)) { + zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { return (NULL); } - avl_remove(&spa->spa_alloc_tree, zio); + avl_remove(&spa->spa_alloc_trees[allocator], zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); } -static int +static zio_t * zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -2950,7 +2963,7 @@ zio_dva_throttle(zio_t *zio) !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ASSERT(zio->io_child_type > ZIO_CHILD_GANG); @@ -2958,40 +2971,35 @@ zio_dva_throttle(zio_t *zio) ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); - mutex_enter(&spa->spa_alloc_lock); + zbookmark_phys_t *bm = &zio->io_bookmark; + /* + * We want to try to use as many allocators as possible to help improve + * performance, but we also want logically adjacent IOs to be physically + * adjacent to improve sequential read performance. We chunk each object + * into 2^20 block regions, and then hash based on the objset, object, + * level, and region to accomplish both of these goals. + */ + zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, + bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; + mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); ASSERT(zio->io_type == ZIO_TYPE_WRITE); - avl_add(&spa->spa_alloc_tree, zio); + avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); - nio = zio_io_to_allocate(zio->io_spa); - mutex_exit(&spa->spa_alloc_lock); + nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator); + mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); - if (nio == zio) - return (ZIO_PIPELINE_CONTINUE); - - if (nio != NULL) { - ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE); - /* - * We are passing control to a new zio so make sure that - * it is processed by a different thread. We do this to - * avoid stack overflows that can occur when parents are - * throttled and children are making progress. We allow - * it to go to the head of the taskq since it's already - * been waiting. 
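The comment above describes the new allocator-selection scheme: hash (objset, object, level, blkid >> 20) and take the result modulo spa_alloc_count, so each 2^20-block region of an object sticks to one allocator. A stand-alone sketch of that computation, where hash4() is a stand-in mixer for cityhash4() and ALLOC_COUNT is a stand-in for spa->spa_alloc_count (both names are assumptions):

#include <stdint.h>

#define ALLOC_COUNT     4       /* stand-in for spa->spa_alloc_count */

/* Stand-in for cityhash4(); any good 4-word mixer would do here. */
static uint64_t
hash4(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
        uint64_t h = a * 0x9e3779b97f4a7c15ULL;

        h ^= b + (h << 6);
        h ^= c + (h >> 2);
        h ^= d + (h << 3);
        return (h);
}

static int
pick_allocator(uint64_t objset, uint64_t object, uint64_t level,
    uint64_t blkid)
{
        /* blkid >> 20: all blocks of one 2^20-block region hash alike */
        return ((int)(hash4(objset, object, level, blkid >> 20) %
            ALLOC_COUNT));
}

Because only blkid >> 20 enters the hash, pick_allocator(os, obj, 0, 100) and pick_allocator(os, obj, 0, 1000) are guaranteed equal, which keeps logically sequential writes physically adjacent, while unrelated objects spread across all allocators.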
- */ - zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE); - } - return (ZIO_PIPELINE_STOP); + return (nio); } void -zio_allocate_dispatch(spa_t *spa) +zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; - mutex_enter(&spa->spa_alloc_lock); - zio = zio_io_to_allocate(spa); - mutex_exit(&spa->spa_alloc_lock); + mutex_enter(&spa->spa_alloc_locks[allocator]); + zio = zio_io_to_allocate(spa, allocator); + mutex_exit(&spa->spa_alloc_locks[allocator]); if (zio == NULL) return; @@ -3000,7 +3008,7 @@ zio_allocate_dispatch(spa_t *spa) zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } -static int +static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -3032,10 +3040,10 @@ zio_dva_allocate(zio_t *zio) error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, - &zio->io_alloc_list, zio); + &zio->io_alloc_list, zio, zio->io_allocator); if (error != 0) { - spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " + zfs_dbgmsg("%s: metaslab allocation failure: zio %p, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, error); if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) @@ -3043,18 +3051,18 @@ zio_dva_allocate(zio_t *zio) zio->io_error = error; } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_dva_claim(zio_t *zio) { int error; @@ -3063,7 +3071,7 @@ zio_dva_claim(zio_t *zio) if (error) zio->io_error = error; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -3092,8 +3100,8 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int -zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, - uint64_t size, boolean_t *slog) +zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp, + blkptr_t *old_bp, uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; @@ -3101,14 +3109,22 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); + /* + * When allocating a zil block, we don't have information about + * the final destination of the block except the objset it's part + * of, so we just hash the objset ID to pick the allocator to get + * some parallelism. + */ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, - txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL); + txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL, + cityhash4(0, 0, 0, objset) % spa->spa_alloc_count); if (error == 0) { *slog = TRUE; } else { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, - &io_alloc_list, NULL); + &io_alloc_list, NULL, cityhash4(0, 0, 0, objset) % + spa->spa_alloc_count); if (error == 0) *slog = FALSE; } @@ -3150,7 +3166,7 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ -static int +static zio_t * zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -3169,13 +3185,13 @@ zio_vdev_io_start(zio_t *zio) * The mirror_ops handle multiple DVAs in a single BP. 
*/ vdev_mirror_ops.vdev_op_io_start(zio); - return (ZIO_PIPELINE_STOP); + return (NULL); } if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && zio->io_priority == ZIO_PRIORITY_NOW) { trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ASSERT3P(zio->io_logical, !=, zio); @@ -3183,9 +3199,13 @@ zio_vdev_io_start(zio_t *zio) ASSERT(spa->spa_trust_config); if (zio->io_vd->vdev_removing) { + /* + * Note: the code can handle other kinds of writes, + * but we don't expect them. + */ ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | - ZIO_FLAG_INDUCE_DAMAGE)); + ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } } @@ -3251,39 +3271,58 @@ zio_vdev_io_start(zio_t *zio) * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. - * This prevents spurious resilvering with nested replication. - * For example, given a mirror of mirrors, (A+B)+(C+D), if only - * A is out of date, we'll read from C+D, then use the data to - * resilver A+B -- but we don't actually want to resilver B, just A. - * The top-level mirror has no way to know this, so instead we just - * discard unnecessary repairs as we work our way down the vdev tree. - * The same logic applies to any form of nested replication: - * ditto + mirror, RAID-Z + replacing, etc. This covers them all. + * This prevents spurious resilvering. + * + * There are a few ways that we can end up creating these spurious + * resilver i/os: + * + * 1. A resilver i/o will be issued if any DVA in the BP has a + * dirty DTL. The mirror code will issue resilver writes to + * each DVA, including the one(s) that are not on vdevs with dirty + * DTLs. + * + * 2. With nested replication, which happens when we have a + * "replacing" or "spare" vdev that's a child of a mirror or raidz. + * For example, given mirror(replacing(A+B), C), it's likely that + * only A is out of date (it's the new device). In this case, we'll + * read from C, then use the data to resilver A+B -- but we don't + * actually want to resilver B, just A. The top-level mirror has no + * way to know this, so instead we just discard unnecessary repairs + * as we work our way down the vdev tree. + * + * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. + * The same logic applies to any form of nested replication: ditto + * + mirror, RAID-Z + replacing, etc. + * + * However, indirect vdevs point off to other vdevs which may have + * DTL's, so we never bypass them. The child i/os on concrete vdevs + * will be properly bypassed instead. 
*/ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ + vd->vdev_ops != &vdev_indirect_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } if (vd->vdev_ops->vdev_op_leaf) { switch (zio->io_type) { case ZIO_TYPE_READ: if (vdev_cache_read(zio)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); /* FALLTHROUGH */ case ZIO_TYPE_WRITE: case ZIO_TYPE_FREE: if ((zio = vdev_queue_io(zio)) == NULL) - return (ZIO_PIPELINE_STOP); + return (NULL); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); - return (ZIO_PIPELINE_STOP); + return (NULL); } break; } @@ -3295,14 +3334,14 @@ zio_vdev_io_start(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE && !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && !trim_map_write_start(zio)) - return (ZIO_PIPELINE_STOP); + return (NULL); } vd->vdev_ops->vdev_op_io_start(zio); - return (ZIO_PIPELINE_STOP); + return (NULL); } -static int +static zio_t * zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -3310,7 +3349,7 @@ zio_vdev_io_done(zio_t *zio) boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } ASSERT(zio->io_type == ZIO_TYPE_READ || @@ -3353,7 +3392,7 @@ zio_vdev_io_done(zio_t *zio) if (unexpected_error) VERIFY(vdev_probe(vd, zio) == NULL); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -3411,13 +3450,13 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) zcr->zcr_free = zio_buf_free; } -static int +static zio_t * zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) @@ -3463,7 +3502,7 @@ zio_vdev_io_assess(zio_t *zio) zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); - return (ZIO_PIPELINE_STOP); + return (NULL); } /* @@ -3503,7 +3542,7 @@ zio_vdev_io_assess(zio_t *zio) zio->io_physdone(zio->io_logical); } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } void @@ -3538,7 +3577,7 @@ zio_vdev_io_bypass(zio_t *zio) * Generate and verify checksums * ========================================================================== */ -static int +static zio_t * zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -3552,7 +3591,7 @@ zio_checksum_generate(zio_t *zio) checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) - return (ZIO_PIPELINE_CONTINUE); + return (zio); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { @@ -3566,10 +3605,10 @@ zio_checksum_generate(zio_t *zio) zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; @@ -3584,7 +3623,7 @@ zio_checksum_verify(zio_t *zio) * We're either verifying a label checksum, or nothing at all. 
*/ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) - return (ZIO_PIPELINE_CONTINUE); + return (zio); ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); } @@ -3599,7 +3638,7 @@ zio_checksum_verify(zio_t *zio) } } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -3642,7 +3681,7 @@ zio_worst_error(int e1, int e2) * I/O completion * ========================================================================== */ -static int +static zio_t * zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -3651,7 +3690,7 @@ zio_ready(zio_t *zio) if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } if (zio->io_ready) { @@ -3678,8 +3717,8 @@ zio_ready(zio_t *zio) */ metaslab_class_throttle_unreserve( spa_normal_class(zio->io_spa), - zio->io_prop.zp_copies, zio); - zio_allocate_dispatch(zio->io_spa); + zio->io_prop.zp_copies, zio->io_allocator, zio); + zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } @@ -3697,7 +3736,7 @@ zio_ready(zio_t *zio) */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); - zio_notify_parent(pio, zio, ZIO_WAIT_READY); + zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); } if (zio->io_flags & ZIO_FLAG_NODATA) { @@ -3713,7 +3752,7 @@ zio_ready(zio_t *zio) zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -3762,21 +3801,22 @@ zio_dva_throttle_done(zio_t *zio) ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); mutex_enter(&pio->io_lock); - metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags); + metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, + pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), - 1, pio); + 1, pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that * needs to be done. If there is work to be done it will be * dispatched to another taskq thread. */ - zio_allocate_dispatch(zio->io_spa); + zio_allocate_dispatch(zio->io_spa, pio->io_allocator); } -static int +static zio_t * zio_done(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -3793,7 +3833,7 @@ zio_done(zio_t *zio) * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } /* @@ -3816,8 +3856,10 @@ zio_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(bp != NULL); - metaslab_group_alloc_verify(spa, zio->io_bp, zio); - VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio)); + metaslab_group_alloc_verify(spa, zio->io_bp, zio, + zio->io_allocator); + VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator], + zio)); } for (int c = 0; c < ZIO_CHILD_TYPES; c++) @@ -4005,7 +4047,12 @@ zio_done(zio_t *zio) if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); - zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + /* + * This is a rare code path, so we don't + * bother with "next_to_execute". + */ + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, + NULL); } } @@ -4017,7 +4064,11 @@ zio_done(zio_t *zio) */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; - zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + /* + * This is a rare code path, so we don't bother with + * "next_to_execute". 
+ */ + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend @@ -4038,7 +4089,7 @@ zio_done(zio_t *zio) ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent); } - return (ZIO_PIPELINE_STOP); + return (NULL); } ASSERT(zio->io_child_count == 0); @@ -4068,12 +4119,17 @@ zio_done(zio_t *zio) zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); + /* + * We are done executing this zio. We may want to execute a parent + * next. See the comment in zio_notify_parent(). + */ + zio_t *next_to_execute = NULL; zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); zio_remove_child(pio, zio, remove_zl); - zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); } if (zio->io_waiter != NULL) { @@ -4085,7 +4141,7 @@ zio_done(zio_t *zio) zio_destroy(zio); } - return (ZIO_PIPELINE_STOP); + return (next_to_execute); } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c index 7e05b9212db0..b87303889ddb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c @@ -25,7 +25,7 @@ */ /* * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -56,6 +56,12 @@ static zcomp_stats_t zcomp_stats = { kstat_t *zcomp_ksp; /* + * If nonzero, one in every X decompression attempts will fail, simulating + * an undetected memory error. + */ +uint64_t zio_decompress_fail_fraction = 0; + +/* * Compression vectors. */ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { @@ -172,6 +178,16 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); abd_return_buf(src, tmp, s_len); + /* + * Decompression shouldn't fail, because we've already verified + * the checksum. However, for extra protection (e.g. against bitflips + * in non-ECC RAM), we handle this error (and test it). 
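zio_decompress_fail_fraction, added above, is a fault-injection tunable: when set to a nonzero X, roughly one decompression in X is forced to report failure so that this otherwise nearly dead error path gets exercised. A sketch of the gating pattern, with rand64() as a stand-in for spa_get_random() and plain EINVAL standing in for the kernel's SET_ERROR(EINVAL):

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

static uint64_t fail_fraction = 0;      /* 0 disables injection */

/* Stand-in for spa_get_random(): uniform value in [0, range). */
static uint64_t
rand64(uint64_t range)
{
        return ((uint64_t)random() % range);
}

/*
 * Wrap a decompression result: with fail_fraction == X, the uniform
 * draw equals zero once in X calls on average, so about 1/X of calls
 * are turned into EINVAL. ret is expected to be 0 on entry, as the
 * ASSERT0 above notes.
 */
static int
maybe_inject_fault(int ret)
{
        if (fail_fraction != 0 && rand64(fail_fraction) == 0)
                ret = EINVAL;
        return (ret);
}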
+ */ + ASSERT0(ret); + if (zio_decompress_fail_fraction != 0 && + spa_get_random(zio_decompress_fail_fraction) == 0) + ret = SET_ERROR(EINVAL); + return (ret); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c index 805c63d09a01..3c7f669a351a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -174,7 +174,7 @@ typedef struct zvol_state { zilog_t *zv_zilog; /* ZIL handle */ list_t zv_extents; /* List of extents for dump */ znode_t zv_znode; /* for range locking */ - dmu_buf_t *zv_dbuf; /* bonus handle */ + dnode_t *zv_dn; /* dnode hold */ #ifndef illumos int zv_state; int zv_volmode; /* Provide GEOM or cdev */ @@ -868,7 +868,7 @@ zvol_first_open(zvol_state_t *zv) } zv->zv_volblocksize = doi.doi_data_block_size; - error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); + error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn); if (error) { dmu_objset_disown(os, zvol_tag); return (error); @@ -893,8 +893,8 @@ zvol_last_close(zvol_state_t *zv) zil_close(zv->zv_zilog); zv->zv_zilog = NULL; - dmu_buf_rele(zv->zv_dbuf, zvol_tag); - zv->zv_dbuf = NULL; + dnode_rele(zv->zv_dn, zvol_tag); + zv->zv_dn = NULL; /* * Evict cached data @@ -1342,8 +1342,6 @@ static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; - objset_t *os = zv->zv_objset; - uint64_t object = ZVOL_OBJ; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; /* length of user data */ dmu_buf_t *db; @@ -1367,7 +1365,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) if (buf != NULL) { /* immediate write */ zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); - error = dmu_read(os, object, offset, size, buf, + error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ /* @@ -1380,7 +1378,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) offset = P2ALIGN(offset, size); zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); - error = dmu_buf_hold(os, object, offset, zgd, &db, + error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; @@ -1451,8 +1449,8 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (wr_state == WR_COPIED ? 
len : 0)); lr = (lr_write_t *)&itx->itx_lr; - if (wr_state == WR_COPIED && dmu_read(zv->zv_objset, - ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, + off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; @@ -1874,7 +1872,7 @@ zvol_read(struct cdev *dev, struct uio *uio, int ioflag) if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; - error = dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes); + error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) @@ -1946,7 +1944,7 @@ zvol_write(struct cdev *dev, struct uio *uio, int ioflag) dmu_tx_abort(tx); break; } - error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); + error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx); if (error == 0) zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); @@ -2028,7 +2026,7 @@ zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs) int zvol_get_volume_params(minor_t minor, uint64_t *blksize, uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl, - void **rl_hdl, void **bonus_hdl) + void **rl_hdl, void **dnode_hdl) { zvol_state_t *zv; @@ -2039,7 +2037,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize, return (SET_ERROR(ENXIO)); ASSERT(blksize && max_xfer_len && minor_hdl && - objset_hdl && zil_hdl && rl_hdl && bonus_hdl); + objset_hdl && zil_hdl && rl_hdl && dnode_hdl); *blksize = zv->zv_volblocksize; *max_xfer_len = (uint64_t)zvol_maxphys; @@ -2047,7 +2045,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize, *objset_hdl = zv->zv_objset; *zil_hdl = zv->zv_zilog; *rl_hdl = &zv->zv_znode; - *bonus_hdl = zv->zv_dbuf; + *dnode_hdl = zv->zv_dn; return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h index 10e0ddaeef88..fea46c90481d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h @@ -107,6 +107,14 @@ extern "C" { /* + * AVL comparator helpers + */ +#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0)) +#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define AVL_PCMP(a, b) \ + (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b))) + +/* * Type used for the root of the AVL tree. */ typedef struct avl_tree avl_tree_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h index c347b63a1a6f..3fcda9e8965e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -118,7 +118,7 @@ typedef enum { ZFS_PROP_SNAPDIR, ZFS_PROP_ACLMODE, ZFS_PROP_ACLINHERIT, - ZFS_PROP_CREATETXG, /* not exposed to the user */ + ZFS_PROP_CREATETXG, ZFS_PROP_NAME, /* not exposed to the user */ ZFS_PROP_CANMOUNT, ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ @@ -637,6 +637,13 @@ typedef struct zpool_load_policy { #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" +#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ + "com.delphix:next_offset_to_initialize" +#define VDEV_LEAF_ZAP_INITIALIZE_STATE \ + "com.delphix:vdev_initialize_state" +#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ + "com.delphix:vdev_initialize_action_time" + /* * This is needed in userland to report the minimum necessary device size. 
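Among the hunks above, avl.h gains branch-free three-way comparison helpers: AVL_CMP(a, b) evaluates to -1, 0, or 1, and AVL_PCMP does the same on pointer values. A usage sketch for a comparator over a hypothetical offset-sorted node type (node_t is an assumption, not from this diff; the macros mirror the definitions added above):

#include <stdint.h>

#define AVL_CMP(a, b)   (((a) > (b)) - ((a) < (b)))
#define AVL_PCMP(a, b)  \
        (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))

typedef struct node {
        uint64_t        n_offset;
} node_t;

/* AVL comparators must return -1, 0, or 1; ties break on address. */
static int
node_compare(const void *x1, const void *x2)
{
        const node_t *n1 = x1;
        const node_t *n2 = x2;
        int cmp = AVL_CMP(n1->n_offset, n2->n_offset);

        if (cmp != 0)
                return (cmp);
        return (AVL_PCMP(x1, x2));
}

The subtraction trick avoids both branches and the overflow risk of the naive (int)(a - b) comparator when the operands are 64-bit.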
* @@ -735,6 +742,15 @@ typedef enum pool_scrub_cmd { POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; +/* + * Initialize functions. + */ +typedef enum pool_initialize_func { + POOL_INITIALIZE_DO, + POOL_INITIALIZE_CANCEL, + POOL_INITIALIZE_SUSPEND, + POOL_INITIALIZE_FUNCS +} pool_initialize_func_t; /* * ZIO types. Needed to interpret vdev statistics below. @@ -814,6 +830,14 @@ typedef struct pool_checkpoint_stat { uint64_t pcs_space; /* checkpointed space */ } pool_checkpoint_stat_t; +typedef enum { + VDEV_INITIALIZE_NONE, + VDEV_INITIALIZE_ACTIVE, + VDEV_INITIALIZE_CANCELED, + VDEV_INITIALIZE_SUSPENDED, + VDEV_INITIALIZE_COMPLETE +} vdev_initializing_state_t; + /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. @@ -840,6 +864,11 @@ typedef struct vdev_stat { uint64_t vs_physical_ashift; /* vdev_physical_ashift */ uint64_t vs_fragmentation; /* device fragmentation */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ + uint64_t vs_initialize_errors; /* initializing errors */ + uint64_t vs_initialize_bytes_done; /* bytes initialized */ + uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ + uint64_t vs_initialize_state; /* vdev_initializing_state_t */ + uint64_t vs_initialize_action_time; /* time_t */ } vdev_stat_t; #define VDEV_STAT_VALID(field, uint64_t_field_count) \ ((uint64_t_field_count * sizeof(uint64_t)) >= \ @@ -974,6 +1003,7 @@ typedef enum zfs_ioc { ZFS_IOC_REMAP, ZFS_IOC_POOL_CHECKPOINT, ZFS_IOC_POOL_DISCARD_CHECKPOINT, + ZFS_IOC_POOL_INITIALIZE, ZFS_IOC_LAST } zfs_ioc_t; @@ -1037,6 +1067,12 @@ typedef enum { #define ZPOOL_HIST_ERRNO "errno" /* + * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. + */ +#define ZPOOL_INITIALIZE_COMMAND "initialize_command" +#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" + +/* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h index e4545a96ee76..52d6aea0a364 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_NVPAIR_H @@ -39,6 +39,7 @@ extern "C" { #endif typedef enum { + DATA_TYPE_DONTCARE = -1, DATA_TYPE_UNKNOWN = 0, DATA_TYPE_BOOLEAN, DATA_TYPE_BYTE, diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h index f12dbbfe6ef5..c9874b3e4db7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h @@ -24,11 +24,13 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2017 by Delphix. All rights reserved. 
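ZPOOL_INITIALIZE_COMMAND and ZPOOL_INITIALIZE_VDEVS above name the nvlist arguments of the new ZFS_IOC_POOL_INITIALIZE ioctl. The sketch below shows one plausible way userland could pack them, assuming the fnvlist_*() convenience API is available and assuming the payload is a uint64 command plus an nvlist mapping vdev names to guids; the payload layout and the guid mapping are assumptions, not taken from this diff:

#include <libnvpair.h>
#include <stdint.h>

/* mirroring the definitions added above */
#define ZPOOL_INITIALIZE_COMMAND        "initialize_command"
#define ZPOOL_INITIALIZE_VDEVS          "initialize_vdevs"

static nvlist_t *
make_initialize_args(uint64_t cmd, const char *vdev_name, uint64_t guid)
{
        nvlist_t *args = fnvlist_alloc();
        nvlist_t *vdevs = fnvlist_alloc();

        /* assumed layout: one uint64 guid per target vdev */
        fnvlist_add_uint64(vdevs, vdev_name, guid);
        fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, cmd);
        fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs);
        fnvlist_free(vdevs);
        return (args);
}

A caller would pass POOL_INITIALIZE_DO, POOL_INITIALIZE_CANCEL, or POOL_INITIALIZE_SUSPEND from the enum above as cmd.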
+ */ + #ifndef _NVPAIR_IMPL_H #define _NVPAIR_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -47,16 +49,27 @@ typedef struct i_nvp i_nvp_t; struct i_nvp { union { - uint64_t _nvi_align; /* ensure alignment */ + /* ensure alignment */ + uint64_t _nvi_align; + struct { - i_nvp_t *_nvi_next; /* pointer to next nvpair */ - i_nvp_t *_nvi_prev; /* pointer to prev nvpair */ + /* pointer to next nvpair */ + i_nvp_t *_nvi_next; + + /* pointer to prev nvpair */ + i_nvp_t *_nvi_prev; + + /* next pair in table bucket */ + i_nvp_t *_nvi_hashtable_next; } _nvi; } _nvi_un; - nvpair_t nvi_nvp; /* nvpair */ + + /* nvpair */ + nvpair_t nvi_nvp; }; #define nvi_next _nvi_un._nvi._nvi_next #define nvi_prev _nvi_un._nvi._nvi_prev +#define nvi_hashtable_next _nvi_un._nvi._nvi_hashtable_next typedef struct { i_nvp_t *nvp_list; /* linked list of nvpairs */ @@ -64,6 +77,10 @@ typedef struct { i_nvp_t *nvp_curr; /* current walker nvpair */ nv_alloc_t *nvp_nva; /* pluggable allocator */ uint32_t nvp_stat; /* internal state */ + + i_nvp_t **nvp_hashtable; /* table of entries used for lookup */ + uint32_t nvp_nbuckets; /* # of buckets in hash table */ + uint32_t nvp_nentries; /* # of entries in hash table */ } nvpriv_t; #ifdef __cplusplus |
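The nvpair_impl.h changes at the end give every nvlist an optional hash index: pairs remain on the ordered nvi_next/nvi_prev list, and nvi_hashtable_next additionally threads each pair through one bucket of nvp_hashtable, with the bucket count kept a power of two so the index can be a simple mask of the name hash. A generic sketch of that shape with stand-in types (entry_t and table_t are assumptions for illustration, not the real i_nvp_t/nvpriv_t):

#include <stdint.h>
#include <string.h>

typedef struct entry {
        struct entry    *e_next;        /* insertion-ordered list */
        struct entry    *e_prev;
        struct entry    *e_hash_next;   /* bucket chain */
        const char      *e_name;
        uint32_t        e_hash;
} entry_t;

typedef struct table {
        entry_t         **t_buckets;
        uint32_t        t_nbuckets;     /* power of two */
} table_t;

static void
table_insert(table_t *t, entry_t *e)
{
        uint32_t idx = e->e_hash & (t->t_nbuckets - 1);

        e->e_hash_next = t->t_buckets[idx];
        t->t_buckets[idx] = e;
}

static entry_t *
table_lookup(table_t *t, const char *name, uint32_t hash)
{
        entry_t *e = t->t_buckets[hash & (t->t_nbuckets - 1)];

        for (; e != NULL; e = e->e_hash_next) {
                if (e->e_hash == hash && strcmp(e->e_name, name) == 0)
                        return (e);
        }
        return (NULL);
}

This turns name lookups on large nvlists from a linear scan of the pair list into an expected O(1) bucket walk, while iteration order is still defined by the doubly-linked list.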