Diffstat (limited to 'sys/cddl/contrib')
83 files changed, 5650 insertions, 1903 deletions
diff --git a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c index 89a64ea1d960..c322a5bd2179 100644 --- a/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c +++ b/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, 2017 by Delphix. All rights reserved. */ #include <sys/debug.h> @@ -142,6 +143,13 @@ static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type, #define NVPAIR2I_NVP(nvp) \ ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp))) +#ifdef _KERNEL +int nvpair_max_recursion = 20; +#else +int nvpair_max_recursion = 100; +#endif + +uint64_t nvlist_hashtable_init_size = (1 << 4); int nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...) @@ -250,6 +258,291 @@ nv_priv_alloc_embedded(nvpriv_t *priv) return (emb_priv); } +static int +nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets) +{ + ASSERT3P(priv->nvp_hashtable, ==, NULL); + ASSERT0(priv->nvp_nbuckets); + ASSERT0(priv->nvp_nentries); + + i_nvp_t **tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *)); + if (tab == NULL) + return (ENOMEM); + + priv->nvp_hashtable = tab; + priv->nvp_nbuckets = buckets; + return (0); +} + +static void +nvt_tab_free(nvpriv_t *priv) +{ + i_nvp_t **tab = priv->nvp_hashtable; + if (tab == NULL) { + ASSERT0(priv->nvp_nbuckets); + ASSERT0(priv->nvp_nentries); + return; + } + + nv_mem_free(priv, tab, priv->nvp_nbuckets * sizeof (i_nvp_t *)); + + priv->nvp_hashtable = NULL; + priv->nvp_nbuckets = 0; + priv->nvp_nentries = 0; +} + +static uint32_t +nvt_hash(const char *p) +{ + uint32_t g, hval = 0; + + while (*p) { + hval = (hval << 4) + *p++; + if ((g = (hval & 0xf0000000)) != 0) + hval ^= g >> 24; + hval &= ~g; + } + return (hval); +} + +static boolean_t +nvt_nvpair_match(nvpair_t *nvp1, nvpair_t *nvp2, uint32_t nvflag) +{ + boolean_t match = B_FALSE; + if (nvflag & NV_UNIQUE_NAME_TYPE) { + if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0 && + NVP_TYPE(nvp1) == NVP_TYPE(nvp2)) + match = B_TRUE; + } else { + ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME); + if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0) + match = B_TRUE; + } + return (match); +} + +static nvpair_t * +nvt_lookup_name_type(nvlist_t *nvl, const char *name, data_type_t type) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + ASSERT(priv != NULL); + + i_nvp_t **tab = priv->nvp_hashtable; + + if (tab == NULL) { + ASSERT3P(priv->nvp_list, ==, NULL); + ASSERT0(priv->nvp_nbuckets); + ASSERT0(priv->nvp_nentries); + return (NULL); + } else { + ASSERT(priv->nvp_nbuckets != 0); + } + + uint64_t hash = nvt_hash(name); + uint64_t index = hash & (priv->nvp_nbuckets - 1); + + ASSERT3U(index, <, priv->nvp_nbuckets); + i_nvp_t *entry = tab[index]; + + for (i_nvp_t *e = entry; e != NULL; e = e->nvi_hashtable_next) { + if (strcmp(NVP_NAME(&e->nvi_nvp), name) == 0 && + (type == DATA_TYPE_DONTCARE || + NVP_TYPE(&e->nvi_nvp) == type)) + return (&e->nvi_nvp); + } + return (NULL); +} + +static nvpair_t * +nvt_lookup_name(nvlist_t *nvl, const char *name) +{ + return (nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE)); +} + +static int +nvt_resize(nvpriv_t *priv, uint32_t new_size) +{ + i_nvp_t **tab = priv->nvp_hashtable; + + /* + * Migrate all the entries from the current table + * to a newly-allocated table with the new size by + * re-adjusting the pointers of their entries. 
+ */ + uint32_t size = priv->nvp_nbuckets; + uint32_t new_mask = new_size - 1; + ASSERT(((new_size) & ((new_size) - 1)) == 0); + + i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *)); + if (new_tab == NULL) + return (ENOMEM); + + uint32_t nentries = 0; + for (uint32_t i = 0; i < size; i++) { + i_nvp_t *next, *e = tab[i]; + + while (e != NULL) { + next = e->nvi_hashtable_next; + + uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp)); + uint32_t index = hash & new_mask; + + e->nvi_hashtable_next = new_tab[index]; + new_tab[index] = e; + nentries++; + + e = next; + } + tab[i] = NULL; + } + ASSERT3U(nentries, ==, priv->nvp_nentries); + + nvt_tab_free(priv); + + priv->nvp_hashtable = new_tab; + priv->nvp_nbuckets = new_size; + priv->nvp_nentries = nentries; + + return (0); +} + +static boolean_t +nvt_needs_togrow(nvpriv_t *priv) +{ + /* + * Grow only when we have more elements than buckets + * and the # of buckets doesn't overflow. + */ + return (priv->nvp_nentries > priv->nvp_nbuckets && + (UINT32_MAX >> 1) >= priv->nvp_nbuckets); +} + +/* + * Allocate a new table that's twice the size of the old one, + * and migrate all the entries from the old one to the new + * one by re-adjusting their pointers. + */ +static int +nvt_grow(nvpriv_t *priv) +{ + uint32_t current_size = priv->nvp_nbuckets; + /* ensure we won't overflow */ + ASSERT3U(UINT32_MAX >> 1, >=, current_size); + return (nvt_resize(priv, current_size << 1)); +} + +static boolean_t +nvt_needs_toshrink(nvpriv_t *priv) +{ + /* + * Shrink only when the # of elements is less than or + * equal to 1/4 the # of buckets. Never shrink less than + * nvlist_hashtable_init_size. + */ + ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size); + if (priv->nvp_nbuckets == nvlist_hashtable_init_size) + return (B_FALSE); + return (priv->nvp_nentries <= (priv->nvp_nbuckets >> 2)); +} + +/* + * Allocate a new table that's half the size of the old one, + * and migrate all the entries from the old one to the new + * one by re-adjusting their pointers. + */ +static int +nvt_shrink(nvpriv_t *priv) +{ + uint32_t current_size = priv->nvp_nbuckets; + /* ensure we won't overflow */ + ASSERT3U(current_size, >=, nvlist_hashtable_init_size); + return (nvt_resize(priv, current_size >> 1)); +} + +static int +nvt_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + + if (nvt_needs_toshrink(priv)) { + int err = nvt_shrink(priv); + if (err != 0) + return (err); + } + i_nvp_t **tab = priv->nvp_hashtable; + + char *name = NVP_NAME(nvp); + uint64_t hash = nvt_hash(name); + uint64_t index = hash & (priv->nvp_nbuckets - 1); + + ASSERT3U(index, <, priv->nvp_nbuckets); + i_nvp_t *bucket = tab[index]; + + for (i_nvp_t *prev = NULL, *e = bucket; + e != NULL; prev = e, e = e->nvi_hashtable_next) { + if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_flag)) { + if (prev != NULL) { + prev->nvi_hashtable_next = + e->nvi_hashtable_next; + } else { + ASSERT3P(e, ==, bucket); + tab[index] = e->nvi_hashtable_next; + } + e->nvi_hashtable_next = NULL; + priv->nvp_nentries--; + break; + } + } + + return (0); +} + +static int +nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + + /* initialize nvpair table now if it doesn't exist. 
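The hunks above add a PJW/ELF-style string hash and power-of-two bucket tables to the nvpair code. A minimal userland sketch of the same scheme follows; the power-of-two invariant is what lets hash & (nbuckets - 1) stand in for a modulo and makes nvt_grow()/nvt_shrink() a single shift. The main() and the sample names are illustrative only; the kernel code uses the nv_mem_* allocators and i_nvp_t chaining shown above.

#include <stdio.h>
#include <stdint.h>

/* Same hash as the patch's nvt_hash(): a PJW/ELF-style string hash. */
static uint32_t
nvt_hash(const char *p)
{
	uint32_t g, hval = 0;

	while (*p) {
		hval = (hval << 4) + *p++;
		if ((g = (hval & 0xf0000000)) != 0)
			hval ^= g >> 24;
		hval &= ~g;
	}
	return (hval);
}

int
main(void)
{
	/*
	 * Bucket counts stay powers of two (the table starts at 1 << 4,
	 * doubles in nvt_grow() and halves in nvt_shrink()), so indexing
	 * is a mask rather than a modulo.
	 */
	uint32_t nbuckets = 1 << 4;
	const char *names[] = { "guid", "createtxg", "com.delphix:spacemap_v2" };

	for (int i = 0; i < 3; i++) {
		uint32_t hash = nvt_hash(names[i]);
		printf("%-24s hash=%08x bucket=%u\n", names[i],
		    (unsigned)hash, (unsigned)(hash & (nbuckets - 1)));
	}

	/* Growing is a single shift; rehashing reuses the wider mask. */
	nbuckets <<= 1;
	printf("after grow: %u buckets, mask 0x%x\n", (unsigned)nbuckets,
	    (unsigned)(nbuckets - 1));
	return (0);
}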
*/ + if (priv->nvp_hashtable == NULL) { + int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size); + if (err != 0) + return (err); + } + + /* + * if we don't allow duplicate entries, make sure to + * unlink any existing entries from the table. + */ + if (nvl->nvl_nvflag != 0) { + int err = nvt_remove_nvpair(nvl, nvp); + if (err != 0) + return (err); + } + + if (nvt_needs_togrow(priv)) { + int err = nvt_grow(priv); + if (err != 0) + return (err); + } + i_nvp_t **tab = priv->nvp_hashtable; + + char *name = NVP_NAME(nvp); + uint64_t hash = nvt_hash(name); + uint64_t index = hash & (priv->nvp_nbuckets - 1); + + ASSERT3U(index, <, priv->nvp_nbuckets); + i_nvp_t *bucket = tab[index]; + + /* insert link at the beginning of the bucket */ + i_nvp_t *new_entry = NVPAIR2I_NVP(nvp); + ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL); + new_entry->nvi_hashtable_next = bucket; + tab[index] = new_entry; + + priv->nvp_nentries++; + return (0); +} + static void nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv) { @@ -582,6 +875,7 @@ nvlist_free(nvlist_t *nvl) else nvl->nvl_priv = 0; + nvt_tab_free(priv); nv_mem_free(priv, priv, sizeof (nvpriv_t)); } @@ -642,26 +936,14 @@ nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva) int nvlist_remove_all(nvlist_t *nvl, const char *name) { - nvpriv_t *priv; - i_nvp_t *curr; int error = ENOENT; - if (nvl == NULL || name == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + if (nvl == NULL || name == NULL || nvl->nvl_priv == 0) return (EINVAL); - curr = priv->nvp_list; - while (curr != NULL) { - nvpair_t *nvp = &curr->nvi_nvp; - - curr = curr->nvi_next; - if (strcmp(name, NVP_NAME(nvp)) != 0) - continue; - - nvp_buf_unlink(nvl, nvp); - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - + nvpair_t *nvp; + while ((nvp = nvt_lookup_name(nvl, name)) != NULL) { + VERIFY0(nvlist_remove_nvpair(nvl, nvp)); error = 0; } @@ -674,28 +956,14 @@ nvlist_remove_all(nvlist_t *nvl, const char *name) int nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type) { - nvpriv_t *priv; - i_nvp_t *curr; - - if (nvl == NULL || name == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + if (nvl == NULL || name == NULL || nvl->nvl_priv == 0) return (EINVAL); - curr = priv->nvp_list; - while (curr != NULL) { - nvpair_t *nvp = &curr->nvi_nvp; - - if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) { - nvp_buf_unlink(nvl, nvp); - nvpair_free(nvp); - nvp_buf_free(nvl, nvp); - - return (0); - } - curr = curr->nvi_next; - } + nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type); + if (nvp == NULL) + return (ENOENT); - return (ENOENT); + return (nvlist_remove_nvpair(nvl, nvp)); } int @@ -704,6 +972,10 @@ nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) if (nvl == NULL || nvp == NULL) return (EINVAL); + int err = nvt_remove_nvpair(nvl, nvp); + if (err != 0) + return (err); + nvp_buf_unlink(nvl, nvp); nvpair_free(nvp); nvp_buf_free(nvl, nvp); @@ -908,6 +1180,8 @@ nvlist_add_common(nvlist_t *nvl, const char *name, /* calculate sizes of the nvpair elements and the nvpair itself */ name_sz = strlen(name) + 1; + if (name_sz >= 1ULL << (sizeof (nvp->nvp_name_sz) * 8 - 1)) + return (EINVAL); nvp_sz = NVP_SIZE_CALC(name_sz, value_sz); @@ -979,6 +1253,12 @@ nvlist_add_common(nvlist_t *nvl, const char *name, else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE) (void) nvlist_remove(nvl, name, type); + err = nvt_add_nvpair(nvl, nvp); + if (err != 0) { + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + return (err); + } nvp_buf_link(nvl, nvp); return (0); @@ 
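Two defensive bounds arrive with this change: nvlist_add_common() (in the hunk just below) now rejects names too long for the pair's name-size field, and the nvs_embedded() hunks further down cap encode/decode recursion at nvpair_max_recursion (20 in kernel, 100 in userland). A standalone sketch of both guards, assuming the real nvp_name_sz field is a signed 16-bit integer; the node_t type and walk() are stand-ins for an embedded nvlist and nvs_operation():

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>

/* Illustrative cap; the patch uses 20 (kernel) / 100 (userland). */
static int max_recursion = 20;

typedef struct node {
	struct node *child;
} node_t;

/*
 * The shape of the new nvs_embedded() guard: bump a depth counter before
 * descending into an embedded list, fail with EINVAL at the cap, and
 * always drop the counter on the way back out.
 */
static int
walk(const node_t *n, int *depth)
{
	int err = 0;

	if (*depth >= max_recursion)
		return (EINVAL);
	(*depth)++;
	if (n->child != NULL)
		err = walk(n->child, depth);
	(*depth)--;
	return (err);
}

/*
 * The shape of the new name-size guard in nvlist_add_common(): reject
 * names whose length (plus NUL) cannot fit a signed 16-bit nvp_name_sz.
 */
static int
namelen_check(const char *name)
{
	size_t name_sz = strlen(name) + 1;

	if (name_sz >= 1ULL << (sizeof (int16_t) * 8 - 1))
		return (EINVAL);
	return (0);
}

int
main(void)
{
	node_t leaf = { NULL }, root = { &leaf };
	int depth = 0;

	printf("walk: %d, name: %d\n", walk(&root, &depth),
	    namelen_check("com.delphix:spacemap_v2"));
	return (0);
}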
-1328,25 +1608,17 @@ static int nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type, uint_t *nelem, void *data) { - nvpriv_t *priv; - nvpair_t *nvp; - i_nvp_t *curr; - - if (name == NULL || nvl == NULL || - (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) return (EINVAL); if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE))) return (ENOTSUP); - for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { - nvp = &curr->nvi_nvp; - - if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) - return (nvpair_value_common(nvp, type, nelem, data)); - } + nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type); + if (nvp == NULL) + return (ENOENT); - return (ENOENT); + return (nvpair_value_common(nvp, type, nelem, data)); } int @@ -2018,6 +2290,7 @@ typedef struct { const nvs_ops_t *nvs_ops; void *nvs_private; nvpriv_t *nvs_priv; + int nvs_recursion; } nvstream_t; /* @@ -2103,6 +2376,12 @@ nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl) return (EFAULT); } + err = nvt_add_nvpair(nvl, nvp); + if (err != 0) { + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + return (err); + } nvp_buf_link(nvl, nvp); } return (err); @@ -2169,9 +2448,16 @@ static int nvs_embedded(nvstream_t *nvs, nvlist_t *embedded) { switch (nvs->nvs_op) { - case NVS_OP_ENCODE: - return (nvs_operation(nvs, embedded, NULL)); + case NVS_OP_ENCODE: { + int err; + if (nvs->nvs_recursion >= nvpair_max_recursion) + return (EINVAL); + nvs->nvs_recursion++; + err = nvs_operation(nvs, embedded, NULL); + nvs->nvs_recursion--; + return (err); + } case NVS_OP_DECODE: { nvpriv_t *priv; int err; @@ -2184,8 +2470,14 @@ nvs_embedded(nvstream_t *nvs, nvlist_t *embedded) nvlist_init(embedded, embedded->nvl_nvflag, priv); + if (nvs->nvs_recursion >= nvpair_max_recursion) { + nvlist_free(embedded); + return (EINVAL); + } + nvs->nvs_recursion++; if ((err = nvs_operation(nvs, embedded, NULL)) != 0) nvlist_free(embedded); + nvs->nvs_recursion--; return (err); } default: @@ -2273,6 +2565,7 @@ nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, return (EINVAL); nvs.nvs_op = nvs_op; + nvs.nvs_recursion = 0; /* * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c index 5f3d22f703bc..67774cddb8c9 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c @@ -229,6 +229,12 @@ zpool_feature_init(void) "Pool state can be checkpointed, allowing rewind later.", ZFEATURE_FLAG_READONLY_COMPAT, NULL); + zfeature_register(SPA_FEATURE_SPACEMAP_V2, + "com.delphix:spacemap_v2", "spacemap_v2", + "Space maps representing large segments are more efficient.", + ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, + NULL); + static const spa_feature_t large_blocks_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h index 12bd4ffe1ccc..1972ba397fae 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h @@ -60,6 +60,7 @@ typedef enum spa_feature { SPA_FEATURE_DEVICE_REMOVAL, SPA_FEATURE_OBSOLETE_COUNTS, SPA_FEATURE_POOL_CHECKPOINT, + SPA_FEATURE_SPACEMAP_V2, SPA_FEATURES } spa_feature_t; diff --git 
a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c index 09975125261b..bad8f20e6917 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ /* @@ -34,8 +34,6 @@ * name is invalid. In the kernel, we only care whether it's valid or not. * Each routine therefore takes a 'namecheck_err_t' which describes exactly why * the name failed to validate. - * - * Each function returns 0 on success, -1 on error. */ #if defined(_KERNEL) @@ -50,6 +48,14 @@ #include "zfs_namecheck.h" #include "zfs_deleg.h" +/* + * Deeply nested datasets can overflow the stack, so we put a limit + * on the amount of nesting a path can have. zfs_max_dataset_nesting + * can be tuned temporarily to fix existing datasets that exceed our + * predefined limit. + */ +int zfs_max_dataset_nesting = 50; + static int valid_char(char c) { @@ -60,10 +66,35 @@ valid_char(char c) } /* + * Looks at a path and returns its level of nesting (depth). + */ +int +get_dataset_depth(const char *path) +{ + const char *loc = path; + int nesting = 0; + + /* + * Keep track of nesting until you hit the end of the + * path or find the snapshot/bookmark separator. + */ + for (int i = 0; loc[i] != '\0' && + loc[i] != '@' && + loc[i] != '#'; i++) { + if (loc[i] == '/') + nesting++; + } + + return (nesting); +} + +/* * Snapshot names must be made up of alphanumeric characters plus the following * characters: * - * [-_.: ] + * [-_.: ] + * + * Returns 0 on success, -1 on error. */ int zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what) @@ -99,6 +130,8 @@ zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what) * Permissions set name must start with the letter '@' followed by the * same character restrictions as snapshot names, except that the name * cannot exceed 64 characters. + * + * Returns 0 on success, -1 on error. */ int permset_namecheck(const char *path, namecheck_err_t *why, char *what) @@ -121,28 +154,40 @@ permset_namecheck(const char *path, namecheck_err_t *why, char *what) } /* + * Dataset paths should not be deeper than zfs_max_dataset_nesting + * in terms of nesting. + * + * Returns 0 on success, -1 on error. + */ +int +dataset_nestcheck(const char *path) +{ + return ((get_dataset_depth(path) < zfs_max_dataset_nesting) ? 0 : -1); +} + +/* * Entity names must be of the following form: * - * [component/]*[component][(@|#)component]? + * [component/]*[component][(@|#)component]? * * Where each component is made up of alphanumeric characters plus the following * characters: * - * [-_.:%] + * [-_.:%] * * We allow '%' here as we use that character internally to create unique * names for temporary clones (for online recv). + * + * Returns 0 on success, -1 on error. */ int entity_namecheck(const char *path, namecheck_err_t *why, char *what) { - const char *start, *end; - int found_delim; /* * Make sure the name is not too long. 
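get_dataset_depth() and dataset_nestcheck() above are self-contained, so they can be exercised directly. A userland rendering with an illustrative main() — the two functions mirror the patch, the test paths are made up:

#include <stdio.h>

int zfs_max_dataset_nesting = 50;

/* Same logic as the patch: count '/' until NUL, '@', or '#'. */
int
get_dataset_depth(const char *path)
{
	int nesting = 0;

	for (int i = 0; path[i] != '\0' && path[i] != '@' &&
	    path[i] != '#'; i++) {
		if (path[i] == '/')
			nesting++;
	}
	return (nesting);
}

int
dataset_nestcheck(const char *path)
{
	return ((get_dataset_depth(path) < zfs_max_dataset_nesting) ? 0 : -1);
}

int
main(void)
{
	/* The '@snap' suffix does not count toward nesting. */
	printf("%d\n", get_dataset_depth("tank/a/b/c@snap"));	/* 3 */
	printf("%d\n", dataset_nestcheck("tank/a/b/c"));	/* 0 = ok */
	return (0);
}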
*/ - if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) { if (why) *why = NAME_ERR_TOOLONG; @@ -162,8 +207,8 @@ entity_namecheck(const char *path, namecheck_err_t *why, char *what) return (-1); } - start = path; - found_delim = 0; + const char *start = path; + boolean_t found_delim = B_FALSE; for (;;) { /* Find the end of this component */ end = start; @@ -198,7 +243,7 @@ entity_namecheck(const char *path, namecheck_err_t *why, char *what) return (-1); } - found_delim = 1; + found_delim = B_TRUE; } /* Zero-length components are not allowed */ @@ -250,6 +295,8 @@ dataset_namecheck(const char *path, namecheck_err_t *why, char *what) * mountpoint names must be of the following form: * * /[component][/]*[component][/] + * + * Returns 0 on success, -1 on error. */ int mountpoint_namecheck(const char *path, namecheck_err_t *why) @@ -294,6 +341,8 @@ mountpoint_namecheck(const char *path, namecheck_err_t *why) * dataset names, with the additional restriction that the pool name must begin * with a letter. The pool names 'raidz' and 'mirror' are also reserved names * that cannot be used. + * + * Returns 0 on success, -1 on error. */ int pool_namecheck(const char *pool, namecheck_err_t *why, char *what) diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h index db70641dbab2..527db92b0cfa 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_namecheck.h @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #ifndef _ZFS_NAMECHECK_H @@ -48,9 +48,13 @@ typedef enum { #define ZFS_PERMSET_MAXLEN 64 +extern int zfs_max_dataset_nesting; + +int get_dataset_depth(const char *); int pool_namecheck(const char *, namecheck_err_t *, char *); int entity_namecheck(const char *, namecheck_err_t *, char *); int dataset_namecheck(const char *, namecheck_err_t *, char *); +int dataset_nestcheck(const char *); int mountpoint_namecheck(const char *, namecheck_err_t *); int zfs_component_namecheck(const char *, namecheck_err_t *, char *); int permset_namecheck(const char *, namecheck_err_t *, char *); diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c index 5f7bcaba5450..880051800365 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c @@ -427,6 +427,10 @@ zfs_prop_init(void) zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<count>", "SSCOUNT"); + zprop_register_number(ZFS_PROP_GUID, "guid", 0, PROP_READONLY, + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "GUID"); + zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY, + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "CREATETXG"); /* inherit number properties */ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", @@ -434,8 +438,6 @@ zfs_prop_init(void) ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE"); /* hidden properties */ - zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "CREATETXG"); zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG"); zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, @@ -447,8 
+449,6 @@ zfs_prop_init(void) zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "STMF_SBD_LU"); - zprop_register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "GUID"); zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "USERACCOUNTING"); diff --git a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files index 8b6720e619f1..27c5fa6e06d0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/Makefile.files +++ b/sys/cddl/contrib/opensolaris/uts/common/Makefile.files @@ -124,6 +124,7 @@ ZFS_COMMON_OBJS += \ vdev_indirect.o \ vdev_indirect_births.o \ vdev_indirect_mapping.o \ + vdev_initialize.o \ vdev_label.o \ vdev_mirror.o \ vdev_missing.o \ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 88bbf7ef7c7e..db7ca9889d47 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. All rights reserved. */ @@ -377,6 +377,13 @@ u_int zfs_arc_free_target = 0; /* Absolute min for arc min / max is 16MB. */ static uint64_t arc_abs_min = 16 << 20; +/* + * ARC dirty data constraints for arc_tempreserve_space() throttle + */ +uint_t zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */ +uint_t zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */ +uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ + boolean_t zfs_compressed_arc_enabled = B_TRUE; static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); @@ -5148,12 +5155,13 @@ arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; - if (buf == NULL) { + ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; } else { + ASSERT(zio == NULL || zio->io_error == 0); *bufp = buf; - ASSERT(buf->b_data); + ASSERT(buf->b_data != NULL); } } @@ -5181,6 +5189,7 @@ arc_read_done(zio_t *zio) arc_callback_t *callback_list; arc_callback_t *acb; boolean_t freeable = B_FALSE; + boolean_t no_zio_error = (zio->io_error == 0); /* * The hdr was inserted into hash-table and removed from lists @@ -5206,7 +5215,7 @@ arc_read_done(zio_t *zio) ASSERT3P(hash_lock, !=, NULL); } - if (zio->io_error == 0) { + if (no_zio_error) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { @@ -5227,8 +5236,7 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); - if (hash_lock && zio->io_error == 0 && - hdr->b_l1hdr.b_state == arc_anon) { + if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. 
This is because * if we've issued an I/O for an evicted buffer, we've already @@ -5251,20 +5259,38 @@ arc_read_done(zio_t *zio) callback_cnt++; - if (zio->io_error != 0) - continue; - - int error = arc_buf_alloc_impl(hdr, acb->acb_private, - acb->acb_compressed, - B_TRUE, &acb->acb_buf); - if (error != 0) { - arc_buf_destroy(acb->acb_buf, acb->acb_private); - acb->acb_buf = NULL; + if (no_zio_error) { + int error = arc_buf_alloc_impl(hdr, acb->acb_private, + acb->acb_compressed, zio->io_error == 0, + &acb->acb_buf); + if (error != 0) { + /* + * Decompression failed. Set io_error + * so that when we call acb_done (below), + * we will indicate that the read failed. + * Note that in the unusual case where one + * callback is compressed and another + * uncompressed, we will mark all of them + * as failed, even though the uncompressed + * one can't actually fail. In this case, + * the hdr will not be anonymous, because + * if there are multiple callbacks, it's + * because multiple threads found the same + * arc buf in the hash table. + */ + zio->io_error = error; + } } - - if (zio->io_error == 0) - zio->io_error = error; } + /* + * If there are multiple callbacks, we must have the hash lock, + * because the only way for multiple threads to find this hdr is + * in the hash table. This ensures that if there are multiple + * callbacks, the hdr is not anonymous. If it were anonymous, + * we couldn't use arc_buf_destroy() in the error case below. + */ + ASSERT(callback_cnt < 2 || hash_lock != NULL); + hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (callback_cnt == 0) { @@ -5276,7 +5302,7 @@ arc_read_done(zio_t *zio) ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (zio->io_error == 0) { + if (no_zio_error) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); @@ -5309,7 +5335,16 @@ arc_read_done(zio_t *zio) /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { - if (acb->acb_done) { + if (acb->acb_done != NULL) { + if (zio->io_error != 0 && acb->acb_buf != NULL) { + /* + * If arc_buf_alloc_impl() fails during + * decompression, the buf will still be + * allocated, and needs to be freed here. + */ + arc_buf_destroy(acb->acb_buf, acb->acb_private); + acb->acb_buf = NULL; + } acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_buf, acb->acb_private); } @@ -6280,12 +6315,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, } static int -arc_memory_throttle(uint64_t reserve, uint64_t txg) +arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) { #ifdef _KERNEL uint64_t available_memory = ptob(freemem); - static uint64_t page_load = 0; - static uint64_t last_txg = 0; #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) available_memory = MIN(available_memory, uma_avail()); @@ -6294,9 +6327,9 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg) if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) return (0); - if (txg > last_txg) { - last_txg = txg; - page_load = 0; + if (txg > spa->spa_lowmem_last_txg) { + spa->spa_lowmem_last_txg = txg; + spa->spa_lowmem_page_load = 0; } /* * If we are in pageout, we know that memory is already tight, @@ -6304,18 +6337,19 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg) * continue to let page writes occur as quickly as possible. 
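The arc_memory_throttle() rework in this hunk replaces the function-static page_load/last_txg pair with per-spa fields (spa_lowmem_page_load, spa_lowmem_last_txg), so one pool's low-memory bookkeeping can no longer bleed into another's. A sketch of the difference, with a hypothetical pool_t standing in for spa_t:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in for the two fields the patch adds to spa_t. */
typedef struct pool {
	const char *name;
	uint64_t lowmem_page_load;
	uint64_t lowmem_last_txg;
} pool_t;

/*
 * With a static local (the old code), every pool shared one accumulator;
 * with per-pool fields (the new code), the counter resets per txg per pool.
 */
static void
note_reserve(pool_t *p, uint64_t txg, uint64_t reserve)
{
	if (txg > p->lowmem_last_txg) {
		p->lowmem_last_txg = txg;
		p->lowmem_page_load = 0;
	}
	p->lowmem_page_load += reserve / 8;	/* reserve is inflated; deflate */
}

int
main(void)
{
	pool_t a = { "upper", 0, 0 }, b = { "backing", 0, 0 };

	note_reserve(&a, 100, 4096);
	note_reserve(&b, 100, 8192);
	printf("%s=%ju %s=%ju\n", a.name, (uintmax_t)a.lowmem_page_load,
	    b.name, (uintmax_t)b.lowmem_page_load);
	return (0);
}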
*/ if (curproc == pageproc) { - if (page_load > MAX(ptob(minfree), available_memory) / 4) + if (spa->spa_lowmem_page_load > + MAX(ptob(minfree), available_memory) / 4) return (SET_ERROR(ERESTART)); /* Note: reserve is inflated, so we deflate */ - page_load += reserve / 8; + atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); return (0); - } else if (page_load > 0 && arc_reclaim_needed()) { + } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { /* memory is low, delay before restarting */ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); return (SET_ERROR(EAGAIN)); } - page_load = 0; -#endif + spa->spa_lowmem_page_load = 0; +#endif /* _KERNEL */ return (0); } @@ -6327,7 +6361,7 @@ arc_tempreserve_clear(uint64_t reserve) } int -arc_tempreserve_space(uint64_t reserve, uint64_t txg) +arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; @@ -6356,7 +6390,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ - error = arc_memory_throttle(reserve, txg); + error = arc_memory_throttle(spa, reserve, txg); if (error != 0) return (error); @@ -6364,12 +6398,24 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. + * + * In the case of one pool being built on another pool, we want + * to make sure we don't end up throttling the lower (backing) + * pool when the upper pool is the majority contributor to dirty + * data. To insure we make forward progress during throttling, we + * also check the current pool's net dirty data and only throttle + * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty + * data in the cache. + * * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ + uint64_t total_dirty = reserve + arc_tempreserve + anon_size; + uint64_t spa_dirty_anon = spa_dirty_data(spa); - if (reserve + arc_tempreserve + anon_size > arc_c / 2 && - anon_size > arc_c / 4) { + if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && + anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && + spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { uint64_t meta_esize = refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 533de180bf1c..1db7bfe02881 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
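arc_tempreserve_space() now derives its throttle from three tunables instead of the hard-coded arc_c/2 and arc_c/4, and adds a per-pool dirty check so a pool layered on top of another cannot starve its backing pool. A sketch of the predicate with the patch's default percentages; the byte figures in main() are invented:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* The patch's defaults: 50%, 25%, 20%. */
static unsigned dirty_limit_pct = 50;
static unsigned anon_limit_pct = 25;
static unsigned pool_dirty_pct = 20;

/*
 * Throttle only when total dirty data is high, anonymous buffers are a
 * large share of the cache, AND this pool actually contributed -- the
 * third test is what keeps an upper pool from starving its backing pool.
 */
static bool
should_throttle(uint64_t arc_c, uint64_t total_dirty, uint64_t anon_size,
    uint64_t spa_dirty_anon)
{
	return (total_dirty > arc_c * dirty_limit_pct / 100 &&
	    anon_size > arc_c * anon_limit_pct / 100 &&
	    spa_dirty_anon > anon_size * pool_dirty_pct / 100);
}

int
main(void)
{
	uint64_t arc_c = 1ULL << 30;	/* 1 GiB cache target */

	/* Dirty overall, but this pool contributed little: no throttle. */
	printf("%d\n", should_throttle(arc_c, 600 << 20, 300 << 20, 10 << 20));
	/* Dirty and this pool is the offender: throttle. */
	printf("%d\n", should_throttle(arc_c, 600 << 20, 300 << 20, 100 << 20));
	return (0);
}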
@@ -49,8 +49,7 @@ #include <sys/abd.h> #include <sys/vdev.h> #include <sys/cityhash.h> - -uint_t zfs_dbuf_evict_key; +#include <sys/spa_impl.h> static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); @@ -74,24 +73,58 @@ static kcondvar_t dbuf_evict_cv; static boolean_t dbuf_evict_thread_exit; /* - * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that - * are not currently held but have been recently released. These dbufs - * are not eligible for arc eviction until they are aged out of the cache. - * Dbufs are added to the dbuf cache once the last hold is released. If a - * dbuf is later accessed and still exists in the dbuf cache, then it will - * be removed from the cache and later re-added to the head of the cache. - * Dbufs that are aged out of the cache will be immediately destroyed and - * become eligible for arc eviction. + * There are two dbuf caches; each dbuf can only be in one of them at a time. + * + * 1. Cache of metadata dbufs, to help make read-heavy administrative commands + * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs + * that represent the metadata that describes filesystems/snapshots/ + * bookmarks/properties/etc. We only evict from this cache when we export a + * pool, to short-circuit as much I/O as possible for all administrative + * commands that need the metadata. There is no eviction policy for this + * cache, because we try to only include types in it which would occupy a + * very small amount of space per object but create a large impact on the + * performance of these commands. Instead, after it reaches a maximum size + * (which should only happen on very small memory systems with a very large + * number of filesystem objects), we stop taking new dbufs into the + * metadata cache, instead putting them in the normal dbuf cache. + * + * 2. LRU cache of dbufs. The "dbuf cache" maintains a list of dbufs that + * are not currently held but have been recently released. These dbufs + * are not eligible for arc eviction until they are aged out of the cache. + * Dbufs that are aged out of the cache will be immediately destroyed and + * become eligible for arc eviction. + * + * Dbufs are added to these caches once the last hold is released. If a dbuf is + * later accessed and still exists in the dbuf cache, then it will be removed + * from the cache and later re-added to the head of the cache. + * + * If a given dbuf meets the requirements for the metadata cache, it will go + * there, otherwise it will be considered for the generic LRU dbuf cache. The + * caches and the refcounts tracking their sizes are stored in an array indexed + * by those caches' matching enum values (from dbuf_cached_state_t). */ -static multilist_t *dbuf_cache; -static refcount_t dbuf_cache_size; -uint64_t dbuf_cache_max_bytes = 0; +typedef struct dbuf_cache { + multilist_t *cache; + refcount_t size; +} dbuf_cache_t; +dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; -/* Set the default size of the dbuf cache to log2 fraction of arc size. 
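The block comment above describes the split into a metadata cache and an LRU cache, stored in an array indexed by dbuf_cached_state_t. A toy version of that arrangement — enum-indexed caches with a per-cache size counter — with names simplified from the patch (a plain long stands in for the refcount_t byte counter):

#include <stdio.h>

/* Mirrors the patch's dbuf_cached_state_t / dbuf_caches[] arrangement. */
typedef enum cached_state {
	CACHE_METADATA,		/* DB_DBUF_METADATA_CACHE */
	CACHE_LRU,		/* DB_DBUF_CACHE */
	CACHE_MAX		/* DB_CACHE_MAX */
} cached_state_t;

typedef struct cache {
	const char *name;
	long size;
} cache_t;

static cache_t caches[CACHE_MAX] = {
	{ "metadata", 0 },
	{ "lru", 0 },
};

/* A dbuf remembers which cache holds it, so release is just an index. */
static void
cache_insert(cached_state_t dcs, long bytes)
{
	caches[dcs].size += bytes;
}

int
main(void)
{
	cache_insert(CACHE_METADATA, 4096);
	cache_insert(CACHE_LRU, 131072);
	for (int i = 0; i < CACHE_MAX; i++)
		printf("%-9s %ld bytes\n", caches[i].name, caches[i].size);
	return (0);
}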
*/ +/* Size limits for the caches */ +uint64_t dbuf_cache_max_bytes = 0; +uint64_t dbuf_metadata_cache_max_bytes = 0; +/* Set the default sizes of the caches to log2 fraction of arc size */ int dbuf_cache_shift = 5; +int dbuf_metadata_cache_shift = 6; /* - * The dbuf cache uses a three-stage eviction policy: + * For diagnostic purposes, this is incremented whenever we can't add + * something to the metadata cache because it's full, and instead put + * the data in the regular dbuf cache. + */ +uint64_t dbuf_metadata_cache_overflow; + +/* + * The LRU dbuf cache uses a three-stage eviction policy: * - A low water marker designates when the dbuf eviction thread * should stop evicting from the dbuf cache. * - When we reach the maximum size (aka mid water mark), we @@ -404,6 +437,41 @@ dbuf_is_metadata(dmu_buf_impl_t *db) } /* + * This returns whether this dbuf should be stored in the metadata cache, which + * is based on whether it's from one of the dnode types that store data related + * to traversing dataset hierarchies. + */ +static boolean_t +dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) +{ + DB_DNODE_ENTER(db); + dmu_object_type_t type = DB_DNODE(db)->dn_type; + DB_DNODE_EXIT(db); + + /* Check if this dbuf is one of the types we care about */ + if (DMU_OT_IS_METADATA_CACHED(type)) { + /* If we hit this, then we set something up wrong in dmu_ot */ + ASSERT(DMU_OT_IS_METADATA(type)); + + /* + * Sanity check for small-memory systems: don't allocate too + * much memory for this purpose. + */ + if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) > + dbuf_metadata_cache_max_bytes) { + dbuf_metadata_cache_overflow++; + DTRACE_PROBE1(dbuf__metadata__cache__overflow, + dmu_buf_impl_t *, db); + return (B_FALSE); + } + + return (B_TRUE); + } + + return (B_FALSE); +} + +/* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the dbuf eviction * code is laid out; dbuf_evict_thread() assumes dbufs are evenly @@ -438,7 +506,7 @@ dbuf_cache_above_hiwater(void) uint64_t dbuf_cache_hiwater_bytes = (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; - return (refcount_count(&dbuf_cache_size) > + return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); } @@ -448,7 +516,7 @@ dbuf_cache_above_lowater(void) uint64_t dbuf_cache_lowater_bytes = (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; - return (refcount_count(&dbuf_cache_size) > + return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); } @@ -458,19 +526,12 @@ dbuf_cache_above_lowater(void) static void dbuf_evict_one(void) { - int idx = multilist_get_random_index(dbuf_cache); - multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx); + int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); + multilist_sublist_t *mls = multilist_sublist_lock( + dbuf_caches[DB_DBUF_CACHE].cache, idx); ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); - /* - * Set the thread's tsd to indicate that it's processing evictions. - * Once a thread stops evicting from the dbuf cache it will - * reset its tsd to NULL. 
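dbuf_cache_above_hiwater()/dbuf_cache_above_lowater() above implement hysteresis: eviction starts a margin above dbuf_cache_max_bytes and runs until a margin below it, so the evict thread doesn't thrash right at the limit. A standalone sketch — the 10% margins and the sizes are illustrative, since the *_pct defaults aren't shown in this hunk:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static uint64_t cache_max = 100 << 20;	/* illustrative: 100 MiB */
static unsigned hiwater_pct = 10, lowater_pct = 10;

/* Evict-start threshold: a margin above the maximum. */
static bool
above_hiwater(uint64_t size)
{
	return (size > cache_max + cache_max * hiwater_pct / 100);
}

/* Evict-stop threshold: a margin below the maximum, giving hysteresis. */
static bool
above_lowater(uint64_t size)
{
	return (size > cache_max - cache_max * lowater_pct / 100);
}

int
main(void)
{
	uint64_t size = 115 << 20;

	/* The evict loop's shape: start past hiwater, run down to lowater. */
	if (above_hiwater(size)) {
		while (above_lowater(size))
			size -= 1 << 20;	/* stand-in for dbuf_evict_one() */
	}
	printf("settled at %ju MiB\n", (uintmax_t)(size >> 20));
	return (0);
}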
- */ - ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); - (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); - dmu_buf_impl_t *db = multilist_sublist_tail(mls); while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { db = multilist_sublist_prev(mls, db); @@ -482,13 +543,14 @@ dbuf_evict_one(void) if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); - (void) refcount_remove_many(&dbuf_cache_size, + (void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); + ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); + db->db_caching_status = DB_NO_CACHE; dbuf_destroy(db); } else { multilist_sublist_unlock(mls); } - (void) tsd_set(zfs_dbuf_evict_key, NULL); } /* @@ -542,35 +604,13 @@ dbuf_evict_thread(void *unused __unused) static void dbuf_evict_notify(void) { - - /* - * We use thread specific data to track when a thread has - * started processing evictions. This allows us to avoid deeply - * nested stacks that would have a call flow similar to this: - * - * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() - * ^ | - * | | - * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ - * - * The dbuf_eviction_thread will always have its tsd set until - * that thread exits. All other threads will only set their tsd - * if they are participating in the eviction process. This only - * happens if the eviction thread is unable to process evictions - * fast enough. To keep the dbuf cache size in check, other threads - * can evict from the dbuf cache directly. Those threads will set - * their tsd values so that we ensure that they only evict one dbuf - * from the dbuf cache. - */ - if (tsd_get(zfs_dbuf_evict_key) != NULL) - return; - /* * We check if we should evict without holding the dbuf_evict_lock, * because it's OK to occasionally make the wrong decision here, * and grabbing the lock results in massive lock contention. */ - if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > + dbuf_cache_max_bytes) { if (dbuf_cache_above_hiwater()) dbuf_evict_one(); cv_signal(&dbuf_evict_cv); @@ -610,15 +650,21 @@ retry: mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); /* - * Setup the parameters for the dbuf cache. We set the size of the - * dbuf cache to 1/32nd (default) of the size of the ARC. If the value - * has been set in /etc/system and it's not greater than the size of - * the ARC, then we honor that value. + * Setup the parameters for the dbuf caches. We set the sizes of the + * dbuf cache and the metadata cache to 1/32nd and 1/16th (default) + * of the size of the ARC, respectively. If the values are set in + * /etc/system and they're not greater than the size of the ARC, then + * we honor that value. 
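On sizing: with the defaults above, dbuf_cache_shift = 5 yields 1/32nd of the ARC and dbuf_metadata_cache_shift = 6 yields 1/64th, since a right shift by n divides by 2^n — the merged comment's "1/16th" appears to be an upstream slip. The arithmetic:

#include <stdio.h>
#include <stdint.h>

static int dbuf_cache_shift = 5;		/* 1/32 of ARC max */
static int dbuf_metadata_cache_shift = 6;	/* 1/64 of ARC max */

int
main(void)
{
	uint64_t arc_max = 4ULL << 30;	/* pretend a 4 GiB ARC */

	printf("dbuf cache:     %ju MiB\n",
	    (uintmax_t)((arc_max >> dbuf_cache_shift) >> 20));
	printf("metadata cache: %ju MiB\n",
	    (uintmax_t)((arc_max >> dbuf_metadata_cache_shift) >> 20));
	return (0);
}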
*/ if (dbuf_cache_max_bytes == 0 || dbuf_cache_max_bytes >= arc_max_bytes()) { dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift; } + if (dbuf_metadata_cache_max_bytes == 0 || + dbuf_metadata_cache_max_bytes >= arc_max_bytes()) { + dbuf_metadata_cache_max_bytes = + arc_max_bytes() >> dbuf_metadata_cache_shift; + } /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc @@ -626,12 +672,14 @@ retry: */ dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); - dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t), - offsetof(dmu_buf_impl_t, db_cache_link), - dbuf_cache_multilist_index_func); - refcount_create(&dbuf_cache_size); + for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { + dbuf_caches[dcs].cache = + multilist_create(sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_cache_link), + dbuf_cache_multilist_index_func); + refcount_create(&dbuf_caches[dcs].size); + } - tsd_create(&zfs_dbuf_evict_key, NULL); dbuf_evict_thread_exit = B_FALSE; mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); @@ -658,13 +706,14 @@ dbuf_fini(void) cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); - tsd_destroy(&zfs_dbuf_evict_key); mutex_destroy(&dbuf_evict_lock); cv_destroy(&dbuf_evict_cv); - refcount_destroy(&dbuf_cache_size); - multilist_destroy(dbuf_cache); + for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { + refcount_destroy(&dbuf_caches[dcs].size); + multilist_destroy(dbuf_caches[dcs].cache); + } } /* @@ -915,8 +964,15 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, ASSERT(refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - if (db->db_level == 0 && db->db_freed_in_flight) { - /* we were freed in flight; disregard any error */ + if (buf == NULL) { + /* i/o error */ + ASSERT(zio == NULL || zio->io_error != 0); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT3P(db->db_buf, ==, NULL); + db->db_state = DB_UNCACHED; + } else if (db->db_level == 0 && db->db_freed_in_flight) { + /* freed in flight */ + ASSERT(zio == NULL || zio->io_error == 0); if (buf == NULL) { buf = arc_alloc_buf(db->db_objset->os_spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size); @@ -927,16 +983,14 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; - } else if (buf != NULL) { + } else { + /* success */ + ASSERT(zio == NULL || zio->io_error == 0); dbuf_set_data(db, buf); db->db_state = DB_CACHED; - } else { - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT3P(db->db_buf, ==, NULL); - db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); - dbuf_rele_and_unlock(db, NULL); + dbuf_rele_and_unlock(db, NULL, B_FALSE); } static void @@ -2051,9 +2105,15 @@ dbuf_destroy(dmu_buf_impl_t *db) dbuf_clear_data(db); if (multilist_link_active(&db->db_cache_link)) { - multilist_remove(dbuf_cache, db); - (void) refcount_remove_many(&dbuf_cache_size, + ASSERT(db->db_caching_status == DB_DBUF_CACHE || + db->db_caching_status == DB_DBUF_METADATA_CACHE); + + multilist_remove(dbuf_caches[db->db_caching_status].cache, db); + (void) refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); + + db->db_caching_status = DB_NO_CACHE; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); @@ -2090,7 +2150,8 @@ dbuf_destroy(dmu_buf_impl_t *db) * value in dnode_move(), since DB_DNODE_EXIT doesn't 
actually * release any lock. */ - dnode_rele(dn, db); + mutex_enter(&dn->dn_mtx); + dnode_rele_and_unlock(dn, db, B_TRUE); db->db_dnode_handle = NULL; dbuf_hash_remove(db); @@ -2107,6 +2168,7 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); ASSERT(!multilist_link_active(&db->db_cache_link)); kmem_cache_free(dbuf_kmem_cache, db); @@ -2116,8 +2178,10 @@ dbuf_destroy(dmu_buf_impl_t *db) * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ - if (parent && parent != dndb) - dbuf_rele(parent, db); + if (parent && parent != dndb) { + mutex_enter(&parent->db_mtx); + dbuf_rele_and_unlock(parent, db, B_TRUE); + } } /* @@ -2245,6 +2309,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; + db->db_caching_status = DB_NO_CACHE; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); @@ -2277,6 +2342,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, avl_add(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; + db->db_caching_status = DB_NO_CACHE; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); @@ -2338,6 +2404,13 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); + if (abuf == NULL) { + ASSERT(zio == NULL || zio->io_error != 0); + kmem_free(dpa, sizeof (*dpa)); + return; + } + ASSERT(zio == NULL || zio->io_error == 0); + /* * The dpa_dnode is only valid if we are called with a NULL * zio. This indicates that the arc_read() returned without @@ -2619,9 +2692,15 @@ top: if (multilist_link_active(&db->db_cache_link)) { ASSERT(refcount_is_zero(&db->db_holds)); - multilist_remove(dbuf_cache, db); - (void) refcount_remove_many(&dbuf_cache_size, + ASSERT(db->db_caching_status == DB_DBUF_CACHE || + db->db_caching_status == DB_DBUF_METADATA_CACHE); + + multilist_remove(dbuf_caches[db->db_caching_status].cache, db); + (void) refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, db->db.db_size, db); + + db->db_caching_status = DB_NO_CACHE; } (void) refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); @@ -2734,7 +2813,7 @@ void dbuf_rele(dmu_buf_impl_t *db, void *tag) { mutex_enter(&db->db_mtx); - dbuf_rele_and_unlock(db, tag); + dbuf_rele_and_unlock(db, tag, B_FALSE); } void @@ -2745,10 +2824,19 @@ dmu_buf_rele(dmu_buf_t *db, void *tag) /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow - * db_dirtycnt and db_holds to be updated atomically. + * db_dirtycnt and db_holds to be updated atomically. The 'evicting' + * argument should be set if we are already in the dbuf-evicting code + * path, in which case we don't want to recursively evict. 
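dbuf_rele_and_unlock() now takes an 'evicting' flag in place of the old TSD scheme, breaking the rele -> evict_notify -> evict_one -> destroy -> rele cycle drawn in the surrounding comment. A toy model of why the flag terminates the recursion; all names here are hypothetical:

#include <stdio.h>
#include <stdbool.h>

static int cache_size = 3;

static void release(bool evicting);

/* Stand-in for dbuf_evict_one(): destroying a buf releases holds. */
static void
evict_one(void)
{
	cache_size--;
	/* true: we're already evicting, so don't re-enter eviction. */
	release(true);
}

/* Stand-in for dbuf_rele_and_unlock(db, tag, evicting). */
static void
release(bool evicting)
{
	printf("release (evicting=%d), cache=%d\n", evicting, cache_size);
	if (!evicting && cache_size > 2)
		evict_one();	/* the old code could loop rele->evict->rele */
}

int
main(void)
{
	release(false);
	return (0);
}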
This allows us to + * avoid deeply nested stacks that would have a call flow similar to this: + * + * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() + * ^ | + * | | + * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * */ void -dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) +dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) { int64_t holds; @@ -2838,12 +2926,23 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) db->db_pending_evict) { dbuf_destroy(db); } else if (!multilist_link_active(&db->db_cache_link)) { - multilist_insert(dbuf_cache, db); - (void) refcount_add_many(&dbuf_cache_size, + ASSERT3U(db->db_caching_status, ==, + DB_NO_CACHE); + + dbuf_cached_state_t dcs = + dbuf_include_in_metadata_cache(db) ? + DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; + db->db_caching_status = dcs; + + multilist_insert(dbuf_caches[dcs].cache, db); + (void) refcount_add_many(&dbuf_caches[dcs].size, db->db.db_size, db); mutex_exit(&db->db_mtx); - dbuf_evict_notify(); + if (db->db_caching_status == DB_DBUF_CACHE && + !evicting) { + dbuf_evict_notify(); + } } if (do_arc_evict) @@ -3108,7 +3207,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); return; } @@ -3458,7 +3557,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); } static void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c index 7bab517fba8c..bfd8e2d7c9ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c @@ -79,60 +79,60 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN, int zfs_object_remap_one_indirect_delay_ticks = 0; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { - { DMU_BSWAP_UINT8, TRUE, "unallocated" }, - { DMU_BSWAP_ZAP, TRUE, "object directory" }, - { DMU_BSWAP_UINT64, TRUE, "object array" }, - { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, - { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, - { DMU_BSWAP_UINT64, TRUE, "bpobj" }, - { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, - { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, - { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, - { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, - { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, - { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, - { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, - { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, - { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, - { DMU_BSWAP_ZAP, TRUE, "DSL props" }, - { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, - { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, - { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, - { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, - { DMU_BSWAP_UINT8, FALSE, "zvol object" }, - { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, - { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, - { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, - { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, - { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, - { DMU_BSWAP_UINT8, TRUE, 
"SPA history" }, - { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, - { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, - { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, - { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, - { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, - { DMU_BSWAP_UINT8, TRUE, "FUID table" }, - { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, - { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, - { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, - { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, - { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, - { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, - { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, - { DMU_BSWAP_UINT8, TRUE, "System attributes" }, - { DMU_BSWAP_ZAP, TRUE, "SA master node" }, - { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, - { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, - { DMU_BSWAP_ZAP, TRUE, "scan translations" }, - { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, - { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, - { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, - { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, - { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } + { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" }, + { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" }, + { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" }, + { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" }, + { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" }, + { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" }, + { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, "System 
attributes" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { @@ -449,7 +449,7 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ -static int +int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { @@ -1321,7 +1321,7 @@ xuio_stat_wbuf_nocopy(void) } #ifdef _KERNEL -static int +int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; @@ -1437,7 +1437,7 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) return (err); } -static int +int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; @@ -1881,22 +1881,17 @@ dmu_return_arcbuf(arc_buf_t *buf) * dmu_write(). */ void -dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, +dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { - dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; - dnode_t *dn; dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; - DB_DNODE_ENTER(dbuf); - dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(dbuf); /* * We can only assign if the offset is aligned, the arc buf is the @@ -1924,11 +1919,8 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); - DB_DNODE_ENTER(dbuf); - dn = DB_DNODE(dbuf); os = dn->dn_objset; object = dn->dn_object; - DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); @@ -1937,6 +1929,17 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, } } +void +dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; + + DB_DNODE_ENTER(dbuf); + dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx); + DB_DNODE_EXIT(dbuf); +} + typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c index 40898ef26d97..b853081e8b7c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. 
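The dmu_ot table above gains a third boolean column marking which object types belong in the new metadata cache; dbuf.c consumes it through DMU_OT_IS_METADATA_CACHED(). A reduced model of the table-plus-macro pattern, with the types and fields trimmed to the essentials (the byteswap column is dropped for brevity):

#include <stdio.h>
#include <stdbool.h>

typedef struct type_info {
	bool metadata;		/* existing column */
	bool metadata_cached;	/* the new column this patch adds */
	const char *name;
} type_info_t;

enum { OT_PLAIN_FILE, OT_DSL_DIR, OT_DSL_PROPS, OT_NUMTYPES };

static const type_info_t ot_table[OT_NUMTYPES] = {
	{ false, false, "ZFS plain file" },
	{ true,  true,  "DSL directory" },
	{ true,  true,  "DSL props" },
};

/* Analogous to the DMU_OT_IS_METADATA_CACHED() test used in dbuf.c. */
#define	OT_IS_METADATA_CACHED(ot)	(ot_table[(ot)].metadata_cached)

int
main(void)
{
	for (int ot = 0; ot < OT_NUMTYPES; ot++)
		printf("%-16s cached=%d\n", ot_table[ot].name,
		    OT_IS_METADATA_CACHED(ot));
	return (0);
}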
*/ @@ -32,7 +32,8 @@ #include <sys/zfeature.h> uint64_t -dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, +dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { uint64_t object; @@ -92,7 +93,8 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, os->os_obj_next = object - 1; } - dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_allocate(dn, ot, blocksize, indirect_blockshift, + bonustype, bonuslen, tx); mutex_exit(&os->os_obj_lock); dmu_tx_add_new_object(tx, dn); @@ -101,6 +103,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, return (object); } +uint64_t +dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (dmu_object_alloc_ibs(os, ot, blocksize, 0, + bonustype, bonuslen, tx)); +} + int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) @@ -157,6 +167,10 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) return (err); ASSERT(dn->dn_type != DMU_OT_NONE); + /* + * If we don't create this free range, we'll leak indirect blocks when + * we get to freeing the dnode in syncing context. + */ dnode_free_range(dn, 0, DMU_OBJECT_END, tx); dnode_free(dn, tx); dnode_rele(dn, FTAG); @@ -204,13 +218,19 @@ dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type, } ASSERT3U(dn->dn_type, ==, old_type); ASSERT0(dn->dn_maxblkid); + + /* + * We must initialize the ZAP data before changing the type, + * so that concurrent calls to *_is_zapified() can determine if + * the object has been completely zapified by checking the type. + */ + mzap_create_impl(mos, object, 0, 0, tx); + dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type = DMU_OTN_ZAP_METADATA; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); - mzap_create_impl(mos, object, 0, 0, tx); - spa_feature_incr(dmu_objset_spa(mos), SPA_FEATURE_EXTENSIBLE_DATASET, tx); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c index 60119c7cda54..50c18a58f6bc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c @@ -54,6 +54,7 @@ #include <sys/dsl_destroy.h> #include <sys/vdev.h> #include <sys/zfeature.h> +#include "zfs_namecheck.h" /* * Needed to close a window in dnode_move() that allows the objset to be freed @@ -498,6 +499,14 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; } + /* + * These properties will be filled in by the logic in zfs_get_zplprop() + * when they are queried for the first time. 
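
/*
 * Usage sketch for dmu_object_alloc_ibs() above: the extra parameter is the
 * indirect block shift passed through to dnode_allocate(), and the existing
 * dmu_object_alloc() keeps its old behavior by passing 0 ("use the
 * default"). The call-site values here are illustrative only:
 */
static uint64_t
alloc_with_large_indirects(objset_t *os, dmu_tx_t *tx)
{
	/* 128K data blocks and 128K (1 << 17) indirect blocks, no bonus. */
	return (dmu_object_alloc_ibs(os, DMU_OT_UINT64_OTHER, 1 << 17, 17,
	    DMU_OT_NONE, 0, tx));
}
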
+ */ + os->os_version = OBJSET_PROP_UNINITIALIZED; + os->os_normalization = OBJSET_PROP_UNINITIALIZED; + os->os_utf8only = OBJSET_PROP_UNINITIALIZED; + os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED; if (ds == NULL || !ds->ds_is_snapshot) os->os_zil_header = os->os_phys->os_zil_header; @@ -905,6 +914,9 @@ dmu_objset_create_check(void *arg, dmu_tx_t *tx) if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); + if (dataset_nestcheck(doca->doca_name) != 0) + return (SET_ERROR(ENAMETOOLONG)); + error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); if (error != 0) return (error); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index 9609761b38f9..25c1fec0c146 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -76,6 +76,11 @@ TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit); static char *dmu_recv_tag = "dmu_recv_tag"; const char *recv_clone_name = "%recv"; +/* + * Use this to override the recordsize calculation for fast zfs send estimates. + */ +uint64_t zfs_override_estimate_recordsize = 0; + #define BP_SPAN(datablkszsec, indblkshift, level) \ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (indblkshift - SPA_BLKPTRSHIFT))) @@ -1131,7 +1136,7 @@ static int dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) { - int err; + int err = 0; uint64_t size; /* * Assume that space (both on-disk and in-stream) is dominated by @@ -1144,7 +1149,9 @@ dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, VERIFY0(dmu_objset_from_ds(ds, &os)); /* Assume all (uncompressed) blocks are recordsize. */ - if (os->os_phys->os_type == DMU_OST_ZVOL) { + if (zfs_override_estimate_recordsize != 0) { + recordsize = zfs_override_estimate_recordsize; + } else if (os->os_phys->os_type == DMU_OST_ZVOL) { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize); } else { @@ -1788,6 +1795,7 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, drc->drc_force = force; drc->drc_resumable = resumable; drc->drc_cred = CRED(); + drc->drc_clone = (origin != NULL); if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; @@ -1848,7 +1856,9 @@ struct receive_writer_arg { /* A map from guid to dataset to help handle dedup'd streams. */ avl_tree_t *guid_to_ds_map; boolean_t resumable; - uint64_t last_object, last_offset; + uint64_t last_object; + uint64_t last_offset; + uint64_t max_object; /* highest object ID referenced in stream */ uint64_t bytes_read; /* bytes read when current record created */ }; @@ -1896,14 +1906,10 @@ typedef struct guid_map_entry { static int guid_compare(const void *arg1, const void *arg2) { - const guid_map_entry_t *gmep1 = arg1; - const guid_map_entry_t *gmep2 = arg2; + const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1; + const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2; - if (gmep1->guid < gmep2->guid) - return (-1); - else if (gmep1->guid > gmep2->guid) - return (1); - return (0); + return (AVL_CMP(gmep1->guid, gmep2->guid)); } static void @@ -2145,6 +2151,9 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, return (SET_ERROR(EINVAL)); object = err == 0 ? 
drro->drr_object : DMU_NEW_OBJECT; + if (drro->drr_object > rwa->max_object) + rwa->max_object = drro->drr_object; + /* * If we are losing blkptrs or changing the block size this must * be a new file instance. We must clear out the previous file @@ -2240,6 +2249,9 @@ receive_freeobjects(struct receive_writer_arg *rwa, err = dmu_free_long_object(rwa->os, obj); if (err != 0) return (err); + + if (obj > rwa->max_object) + rwa->max_object = obj; } if (next_err != ESRCH) return (next_err); @@ -2269,6 +2281,9 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, rwa->last_object = drrw->drr_object; rwa->last_offset = drrw->drr_offset; + if (rwa->last_object > rwa->max_object) + rwa->max_object = rwa->last_object; + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); @@ -2345,6 +2360,9 @@ receive_write_byref(struct receive_writer_arg *rwa, ref_os = rwa->os; } + if (drrwbr->drr_object > rwa->max_object) + rwa->max_object = drrwbr->drr_object; + err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); if (err != 0) @@ -2387,6 +2405,9 @@ receive_write_embedded(struct receive_writer_arg *rwa, if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); + if (drrwe->drr_object > rwa->max_object) + rwa->max_object = drrwe->drr_object; + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwe->drr_object, @@ -2423,6 +2444,9 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); + if (drrs->drr_object > rwa->max_object) + rwa->max_object = drrs->drr_object; + VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); @@ -2467,6 +2491,9 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); + if (drrf->drr_object > rwa->max_object) + rwa->max_object = drrf->drr_object; + err = dmu_free_long_range(rwa->os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); @@ -3086,6 +3113,41 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, } mutex_exit(&rwa.mutex); + /* + * If we are receiving a full stream as a clone, all object IDs which + * are greater than the maximum ID referenced in the stream are + * by definition unused and must be freed. Note that it's possible that + * we've resumed this send and the first record we received was the END + * record. In that case, max_object would be 0, but we shouldn't start + * freeing all objects from there; instead we should start from the + * resumeobj. 
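
/*
 * Each receive_*() handler above repeats the same high-water-mark update on
 * rwa->max_object; an equivalent helper, shown purely for illustration (it
 * is not part of the change):
 */
static inline void
receive_note_object(struct receive_writer_arg *rwa, uint64_t obj)
{
	if (obj > rwa->max_object)
		rwa->max_object = obj;	/* highest object ID seen so far */
}
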
+ */ + if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) { + uint64_t obj; + if (nvlist_lookup_uint64(begin_nvl, "resume_object", &obj) != 0) + obj = 0; + if (rwa.max_object > obj) + obj = rwa.max_object; + obj++; + int free_err = 0; + int next_err = 0; + + while (next_err == 0) { + free_err = dmu_free_long_object(rwa.os, obj); + if (free_err != 0 && free_err != ENOENT) + break; + + next_err = dmu_object_next(rwa.os, &obj, FALSE, 0); + } + + if (err == 0) { + if (free_err != 0 && free_err != ENOENT) + err = free_err; + else if (next_err != ESRCH) + err = next_err; + } + } + cv_destroy(&rwa.cv); mutex_destroy(&rwa.mutex); bqueue_destroy(&rwa.q); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index ad02fa5918aa..4ac640e54d6c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -1091,7 +1091,12 @@ dmu_tx_wait(dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = NULL; } else { - txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); + /* + * If we have a lot of dirty data just wait until we sync + * out a TXG at which point we'll hopefully have synced + * a portion of the changes. + */ + txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index 13a4a02bbfb4..4d72991b5ef6 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -78,19 +78,13 @@ dbuf_compare(const void *x1, const void *x2) const dmu_buf_impl_t *d1 = x1; const dmu_buf_impl_t *d2 = x2; - if (d1->db_level < d2->db_level) { - return (-1); - } - if (d1->db_level > d2->db_level) { - return (1); - } + int cmp = AVL_CMP(d1->db_level, d2->db_level); + if (likely(cmp)) + return (cmp); - if (d1->db_blkid < d2->db_blkid) { - return (-1); - } - if (d1->db_blkid > d2->db_blkid) { - return (1); - } + cmp = AVL_CMP(d1->db_blkid, d2->db_blkid); + if (likely(cmp)) + return (cmp); if (d1->db_state == DB_SEARCH) { ASSERT3S(d2->db_state, !=, DB_SEARCH); @@ -100,13 +94,7 @@ dbuf_compare(const void *x1, const void *x2) return (1); } - if ((uintptr_t)d1 < (uintptr_t)d2) { - return (-1); - } - if ((uintptr_t)d1 > (uintptr_t)d2) { - return (1); - } - return (0); + return (AVL_PCMP(d1, d2)); } /* ARGSUSED */ @@ -742,6 +730,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_datablkszsec = odn->dn_datablkszsec; ndn->dn_datablksz = odn->dn_datablksz; ndn->dn_maxblkid = odn->dn_maxblkid; + bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0], + sizeof (odn->dn_next_type)); bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], sizeof (odn->dn_next_nblkptr)); bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], @@ -1238,11 +1228,11 @@ void dnode_rele(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); - dnode_rele_and_unlock(dn, tag); + dnode_rele_and_unlock(dn, tag, B_FALSE); } void -dnode_rele_and_unlock(dnode_t *dn, void *tag) +dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) { uint64_t refs; /* Get while the hold prevents the dnode from moving. */ @@ -1273,7 +1263,8 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag) * that the handle has zero references, but that will be * asserted anyway when the handle gets destroyed. 
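
/*
 * For reference: the AVL_CMP()/AVL_PCMP() helpers used by the comparator
 * rewrites in this change (guid_compare() and dbuf_compare() above) are
 * branch-free three-way comparisons. The definitions below sketch what
 * sys/avl.h provides; treat them as an assumption rather than part of this
 * diff:
 */
#define	AVL_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
#define	AVL_PCMP(a, b)	\
	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
#define	AVL_ISIGN(a)	(((a) > 0) - ((a) < 0))
/* Each expression evaluates to -1, 0, or 1, so a comparator can return it. */
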
*/ - dbuf_rele(db, dnh); + mutex_enter(&db->db_mtx); + dbuf_rele_and_unlock(db, dnh, evicting); } } @@ -1518,6 +1509,72 @@ dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) } } +/* + * Dirty all the in-core level-1 dbufs in the range specified by start_blkid + * and end_blkid. + */ +static void +dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, + dmu_tx_t *tx) +{ + dmu_buf_impl_t db_search; + dmu_buf_impl_t *db; + avl_index_t where; + + mutex_enter(&dn->dn_dbufs_mtx); + + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + for (;;) { + + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + + if (db == NULL || db->db_level != 1 || + db->db_blkid >= end_blkid) { + break; + } + + /* + * Setup the next blkid we want to search for. + */ + db_search.db_blkid = db->db_blkid + 1; + ASSERT3U(db->db_blkid, >=, start_blkid); + + /* + * If the dbuf transitions to DB_EVICTING while we're trying + * to dirty it, then we will be unable to discover it in + * the dbuf hash table. This will result in a call to + * dbuf_create() which needs to acquire the dn_dbufs_mtx + * lock. To avoid a deadlock, we drop the lock before + * dirtying the level-1 dbuf. + */ + mutex_exit(&dn->dn_dbufs_mtx); + dnode_dirty_l1(dn, db->db_blkid, tx); + mutex_enter(&dn->dn_dbufs_mtx); + } + +#ifdef ZFS_DEBUG + /* + * Walk all the in-core level-1 dbufs and verify they have been dirtied. + */ + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { + if (db->db_level != 1 || db->db_blkid >= end_blkid) + break; + ASSERT(db->db_dirtycnt > 0); + } +#endif + mutex_exit(&dn->dn_dbufs_mtx); +} + void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { @@ -1550,13 +1607,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (off == 0 && len >= blksz) { /* * Freeing the whole block; fast-track this request. - * Note that we won't dirty any indirect blocks, - * which is fine because we will be freeing the entire - * file and thus all indirect blocks will be freed - * by free_children(). */ blkid = 0; nblks = 1; + if (dn->dn_nlevels > 1) + dnode_dirty_l1(dn, 0, tx); goto done; } else if (off >= blksz) { /* Freeing past end-of-data */ @@ -1669,6 +1724,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) if (last != first) dnode_dirty_l1(dn, last, tx); + dnode_dirty_l1range(dn, first, last, tx); + int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = first + 1; i < last; i++) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 2fcaf7927de6..02f263c82e42 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
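
/*
 * The lock handling in dnode_dirty_l1range() above is a general pattern:
 * when work done inside an AVL walk may need the lock that protects the
 * tree, resume the walk from a search key rather than from a node pointer
 * that dropping the lock may have invalidated. A sketch with hypothetical
 * names (node_t, lookup_ge(), and process() are illustrative, not real
 * APIs in this tree):
 */
static void
walk_resumable(avl_tree_t *t, kmutex_t *lock, uint64_t start, uint64_t end)
{
	uint64_t next = start;

	for (;;) {
		mutex_enter(lock);
		node_t *n = lookup_ge(t, next);	/* first entry >= next */
		if (n == NULL || n->key >= end) {
			mutex_exit(lock);
			break;
		}
		next = n->key + 1;	/* resume point survives the unlock */
		mutex_exit(lock);
		process(n->key);	/* may itself acquire "lock" */
	}
}
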
 */
@@ -229,9 +229,24 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 }
 #endif
 
+/*
+ * We don't usually free the indirect blocks here. If in one txg we have a
+ * free_range and a write to the same indirect block, it's important that we
+ * preserve the hole's birth times. Therefore, we don't free any indirect
+ * blocks in free_children(). If an indirect block happens to turn into all
+ * holes, it will be freed by dbuf_write_children_ready, which happens at a
+ * point in the syncing process where we know for certain the contents of the
+ * indirect block.
+ *
+ * However, if we're freeing a dnode, its space accounting must go to zero
+ * before we actually try to free the dnode, or we will trip an assertion. In
+ * addition, we know the case described above cannot occur, because the dnode is
+ * being freed. Therefore, we free the indirect blocks immediately in that
+ * case.
+ */
 static void
 free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
-    dmu_tx_t *tx)
+    boolean_t free_indirects, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	blkptr_t *bp;
@@ -248,6 +263,24 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 	if (db->db_state != DB_CACHED)
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 
+	/*
+	 * If we modify this indirect block, and we are not freeing the
+	 * dnode (!free_indirects), then this indirect block needs to get
+	 * written to disk by dbuf_write(). If it is dirty, we know it will
+	 * be written (otherwise, we would have incorrect on-disk state
+	 * because the space would be freed but still referenced by the BP
+	 * in this indirect block). Therefore we VERIFY that it is
+	 * dirty.
+	 *
+	 * Our VERIFY covers some cases that do not actually have to be
+	 * dirty, but the open-context code happens to dirty. E.g. if the
+	 * blocks we are freeing are all holes, because in that case, we
+	 * are only freeing part of this indirect block, so it is an
+	 * ancestor of the first or last block to be freed. The first and
+	 * last L1 indirect blocks are always dirtied by dnode_free_range().
+	 */
+	VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
+
 	dbuf_release_bp(db);
 	bp = db->db.db_data;
@@ -283,32 +316,16 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
 			rw_exit(&dn->dn_struct_rwlock);
 			ASSERT3P(bp, ==, subdb->db_blkptr);
-			free_children(subdb, blkid, nblks, tx);
+			free_children(subdb, blkid, nblks, free_indirects, tx);
 			dbuf_rele(subdb, FTAG);
 		}
 	}
 
-	/* If this whole block is free, free ourself too. */
-	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
-		if (!BP_IS_HOLE(bp))
-			break;
-	}
-	if (i == 1 << epbs) {
-		/*
-		 * We only found holes. Grab the rwlock to prevent
-		 * anybody from reading the blocks we're about to
-		 * zero out.
-		 */
-		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	if (free_indirects) {
+		for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
+			ASSERT(BP_IS_HOLE(bp));
 		bzero(db->db.db_data, db->db.db_size);
-		rw_exit(&dn->dn_struct_rwlock);
 		free_blocks(dn, db->db_blkptr, 1, tx);
-	} else {
-		/*
-		 * Partial block free; must be marked dirty so that it
-		 * will be written out.
- */ - ASSERT(db->db_dirtycnt > 0); } DB_DNODE_EXIT(db); @@ -321,7 +338,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, */ static void dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, - dmu_tx_t *tx) + boolean_t free_indirects, dmu_tx_t *tx) { blkptr_t *bp = dn->dn_phys->dn_blkptr; int dnlevel = dn->dn_phys->dn_nlevels; @@ -361,7 +378,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); - free_children(db, blkid, nblks, tx); + free_children(db, blkid, nblks, free_indirects, tx); dbuf_rele(db, FTAG); } } @@ -380,6 +397,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, typedef struct dnode_sync_free_range_arg { dnode_t *dsfra_dnode; dmu_tx_t *dsfra_tx; + boolean_t dsfra_free_indirects; } dnode_sync_free_range_arg_t; static void @@ -389,7 +407,8 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) dnode_t *dn = dsfra->dsfra_dnode; mutex_exit(&dn->dn_mtx); - dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx); + dnode_sync_free_range_impl(dn, blkid, nblks, + dsfra->dsfra_free_indirects, dsfra->dsfra_tx); mutex_enter(&dn->dn_mtx); } @@ -420,6 +439,19 @@ dnode_evict_dbufs(dnode_t *dn) avl_insert_here(&dn->dn_dbufs, &db_marker, db, AVL_BEFORE); + /* + * We need to use the "marker" dbuf rather than + * simply getting the next dbuf, because + * dbuf_destroy() may actually remove multiple dbufs. + * It can call itself recursively on the parent dbuf, + * which may also be removed from dn_dbufs. The code + * flow would look like: + * + * dbuf_destroy(): + * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE): + * if (!cacheable || pending_evict) + * dbuf_destroy() + */ dbuf_destroy(db); db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); @@ -478,7 +510,7 @@ dnode_undirty_dbufs(list_t *list) list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); } } @@ -670,6 +702,11 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnode_sync_free_range_arg_t dsfra; dsfra.dsfra_dnode = dn; dsfra.dsfra_tx = tx; + dsfra.dsfra_free_indirects = freeing_dnode; + if (freeing_dnode) { + ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], + 0, dn->dn_maxblkid + 1)); + } mutex_enter(&dn->dn_mtx); range_tree_vacate(dn->dn_free_ranges[txgoff], dnode_sync_free_range, &dsfra); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c index 356e5b51c3f4..2f3647bc8e86 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c @@ -55,15 +55,10 @@ static int dsl_deadlist_compare(const void *arg1, const void *arg2) { - const dsl_deadlist_entry_t *dle1 = arg1; - const dsl_deadlist_entry_t *dle2 = arg2; + const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1; + const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2; - if (dle1->dle_mintxg < dle2->dle_mintxg) - return (-1); - else if (dle1->dle_mintxg > dle2->dle_mintxg) - return (+1); - else - return (0); + return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); } static void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c index 7870b4951b29..0ad658f910ec 100644 --- 
a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c @@ -384,14 +384,13 @@ typedef struct perm_set { static int perm_set_compare(const void *arg1, const void *arg2) { - const perm_set_t *node1 = arg1; - const perm_set_t *node2 = arg2; + const perm_set_t *node1 = (const perm_set_t *)arg1; + const perm_set_t *node2 = (const perm_set_t *)arg2; int val; val = strcmp(node1->p_setname, node2->p_setname); - if (val == 0) - return (0); - return (val > 0 ? 1 : -1); + + return (AVL_ISIGN(val)); } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c index 00b1dbe36d83..1a4194ebf16d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c @@ -1388,7 +1388,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, offsetof(struct tempreserve, tr_node)); ASSERT3S(asize, >, 0); - err = arc_tempreserve_space(lsize, tx->tx_txg); + err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg); if (err == 0) { struct tempreserve *tr; @@ -1819,16 +1819,28 @@ typedef struct dsl_dir_rename_arg { cred_t *ddra_cred; } dsl_dir_rename_arg_t; +typedef struct dsl_valid_rename_arg { + int char_delta; + int nest_delta; +} dsl_valid_rename_arg_t; + /* ARGSUSED */ static int dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { - int *deltap = arg; + dsl_valid_rename_arg_t *dvra = arg; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, namebuf); - if (strlen(namebuf) + *deltap >= ZFS_MAX_DATASET_NAME_LEN) + ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + int namelen = strlen(namebuf) + dvra->char_delta; + int depth = get_dataset_depth(namebuf) + dvra->nest_delta; + + if (namelen >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting) return (SET_ERROR(ENAMETOOLONG)); return (0); } @@ -1839,9 +1851,9 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx) dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; + dsl_valid_rename_arg_t dvra; const char *mynewname; int error; - int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); /* target dir should exist */ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); @@ -1870,10 +1882,19 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx) return (SET_ERROR(EEXIST)); } + ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + dvra.char_delta = strlen(ddra->ddra_newname) + - strlen(ddra->ddra_oldname); + dvra.nest_delta = get_dataset_depth(ddra->ddra_newname) + - get_dataset_depth(ddra->ddra_oldname); + /* if the name length is growing, validate child name lengths */ - if (delta > 0) { + if (dvra.char_delta > 0 || dvra.nest_delta > 0) { error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, - &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c index 8e7616427e19..48675909365b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c +++ 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2016 Gary Mills * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. @@ -2164,7 +2164,8 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) * block-sharing rules don't apply to it. */ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds) && - ds->ds_dir != dp->dp_origin_snap->ds_dir) { + (dp->dp_origin_snap == NULL || + ds->ds_dir != dp->dp_origin_snap->ds_dir)) { objset_t *os; if (dmu_objset_from_ds(ds, &os) != 0) { goto out; @@ -2959,6 +2960,16 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, { vdev_t *vd; + if (vd->vdev_ops == &vdev_indirect_ops) { + /* + * The indirect vdev can point to multiple + * vdevs. For simplicity, always create + * the resilver zio_t. zio_vdev_io_start() + * will bypass the child resilver i/o's if + * they are on vdevs that don't have DTL's. + */ + return (B_TRUE); + } if (DVA_GET_GANG(dva)) { /* * Gang members may be spread across multiple @@ -3541,14 +3552,14 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; int d; + count_block(scn, dp->dp_blkstats, bp); + if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) return (0); - if (BP_IS_EMBEDDED(bp)) { - count_block(scn, dp->dp_blkstats, bp); - return (0); - } + /* Embedded BP's have phys_birth==0, so we reject them above. */ + ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index 6cff5eacdcdf..e374cd356792 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
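
/*
 * Two observations about the dsl_scan.c hunks above, made explicit for
 * illustration. First, count_block() now runs before the txg-range check,
 * so scan statistics cover every block visited, not just those actually
 * scrubbed. Second, per the new comment, embedded block pointers carry
 * their payload in the bp itself and have phys_birth == 0, which is why
 * the range check already rejects them; the invariant being relied on is
 * roughly:
 *
 *	ASSERT(!BP_IS_EMBEDDED(bp) || BP_PHYSICAL_BIRTH(bp) == 0);
 */
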
* Copyright (c) 2014 Integros [integros.com] */ @@ -275,6 +275,8 @@ static uint64_t metaslab_weight(metaslab_t *); static void metaslab_set_fragmentation(metaslab_t *); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t); +static void metaslab_passivate(metaslab_t *msp, uint64_t weight); +static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp); kmem_cache_t *metaslab_alloc_trace_cache; @@ -294,7 +296,12 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) mc->mc_rotor = NULL; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); - refcount_create_tracked(&mc->mc_alloc_slots); + mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * + sizeof (refcount_t), KM_SLEEP); + mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * + sizeof (uint64_t), KM_SLEEP); + for (int i = 0; i < spa->spa_alloc_count; i++) + refcount_create_tracked(&mc->mc_alloc_slots[i]); return (mc); } @@ -308,7 +315,12 @@ metaslab_class_destroy(metaslab_class_t *mc) ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); - refcount_destroy(&mc->mc_alloc_slots); + for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) + refcount_destroy(&mc->mc_alloc_slots[i]); + kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * + sizeof (refcount_t)); + kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * + sizeof (uint64_t)); mutex_destroy(&mc->mc_lock); kmem_free(mc, sizeof (metaslab_class_t)); } @@ -529,25 +541,40 @@ metaslab_class_expandable_space(metaslab_class_t *mc) static int metaslab_compare(const void *x1, const void *x2) { - const metaslab_t *m1 = x1; - const metaslab_t *m2 = x2; - - if (m1->ms_weight < m2->ms_weight) - return (1); - if (m1->ms_weight > m2->ms_weight) - return (-1); + const metaslab_t *m1 = (const metaslab_t *)x1; + const metaslab_t *m2 = (const metaslab_t *)x2; + + int sort1 = 0; + int sort2 = 0; + if (m1->ms_allocator != -1 && m1->ms_primary) + sort1 = 1; + else if (m1->ms_allocator != -1 && !m1->ms_primary) + sort1 = 2; + if (m2->ms_allocator != -1 && m2->ms_primary) + sort2 = 1; + else if (m2->ms_allocator != -1 && !m2->ms_primary) + sort2 = 2; /* - * If the weights are identical, use the offset to force uniqueness. + * Sort inactive metaslabs first, then primaries, then secondaries. When + * selecting a metaslab to allocate from, an allocator first tries its + * primary, then secondary active metaslab. If it doesn't have active + * metaslabs, or can't allocate from them, it searches for an inactive + * metaslab to activate. If it can't find a suitable one, it will steal + * a primary or secondary metaslab from another allocator. 
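
/*
 * A worked example of the comparator above, with two allocators and
 * illustrative weights. The sort class (0 = inactive, 1 = primary,
 * 2 = secondary) dominates; within a class, higher weight sorts earlier:
 *
 *	inactive, weight 900
 *	inactive, weight 300
 *	primary of allocator 0, weight 800
 *	primary of allocator 1, weight 200
 *	secondary of allocator 0, weight 700
 *
 * so avl_first() still yields the best inactive candidate, while the
 * already-claimed metaslabs cluster at the end of the tree.
 */
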
*/ - if (m1->ms_start < m2->ms_start) + if (sort1 < sort2) return (-1); - if (m1->ms_start > m2->ms_start) + if (sort1 > sort2) return (1); - ASSERT3P(m1, ==, m2); + int cmp = AVL_CMP(m2->ms_weight, m1->ms_weight); + if (likely(cmp)) + return (cmp); - return (0); + IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); + + return (AVL_CMP(m1->ms_start, m2->ms_start)); } /* @@ -683,12 +710,18 @@ metaslab_group_alloc_update(metaslab_group_t *mg) } metaslab_group_t * -metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) +metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { metaslab_group_t *mg; mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL); + mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), + KM_SLEEP); + mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), + KM_SLEEP); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); mg->mg_vd = vd; @@ -696,7 +729,16 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) mg->mg_activation_count = 0; mg->mg_initialized = B_FALSE; mg->mg_no_free_space = B_TRUE; - refcount_create_tracked(&mg->mg_alloc_queue_depth); + mg->mg_allocators = allocators; + + mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t), + KM_SLEEP); + mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators * + sizeof (uint64_t), KM_SLEEP); + for (int i = 0; i < allocators; i++) { + refcount_create_tracked(&mg->mg_alloc_queue_depth[i]); + mg->mg_cur_max_alloc_queue_depth[i] = 0; + } mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); @@ -718,8 +760,22 @@ metaslab_group_destroy(metaslab_group_t *mg) taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); + kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *)); + kmem_free(mg->mg_secondaries, mg->mg_allocators * + sizeof (metaslab_t *)); mutex_destroy(&mg->mg_lock); - refcount_destroy(&mg->mg_alloc_queue_depth); + mutex_destroy(&mg->mg_ms_initialize_lock); + cv_destroy(&mg->mg_ms_initialize_cv); + + for (int i = 0; i < mg->mg_allocators; i++) { + refcount_destroy(&mg->mg_alloc_queue_depth[i]); + mg->mg_cur_max_alloc_queue_depth[i] = 0; + } + kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators * + sizeof (refcount_t)); + kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators * + sizeof (uint64_t)); + kmem_free(mg, sizeof (metaslab_group_t)); } @@ -799,6 +855,22 @@ metaslab_group_passivate(metaslab_group_t *mg) taskq_wait(mg->mg_taskq); spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); metaslab_group_alloc_update(mg); + for (int i = 0; i < mg->mg_allocators; i++) { + metaslab_t *msp = mg->mg_primaries[i]; + if (msp != NULL) { + mutex_enter(&msp->ms_lock); + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + mutex_exit(&msp->ms_lock); + } + msp = mg->mg_secondaries[i]; + if (msp != NULL) { + mutex_enter(&msp->ms_lock); + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + mutex_exit(&msp->ms_lock); + } + } mgprev = mg->mg_prev; mgnext = mg->mg_next; @@ -940,6 +1012,17 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) } static void +metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) +{ + ASSERT(MUTEX_HELD(&mg->mg_lock)); + 
ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); + msp->ms_weight = weight; + avl_add(&mg->mg_metaslab_tree, msp); + +} + +static void metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { /* @@ -950,10 +1033,7 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); - ASSERT(msp->ms_group == mg); - avl_remove(&mg->mg_metaslab_tree, msp); - msp->ms_weight = weight; - avl_add(&mg->mg_metaslab_tree, msp); + metaslab_group_sort_impl(mg, msp, weight); mutex_exit(&mg->mg_lock); } @@ -1001,7 +1081,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg) */ static boolean_t metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, - uint64_t psize) + uint64_t psize, int allocator) { spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; @@ -1030,7 +1110,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, if (mg->mg_allocatable) { metaslab_group_t *mgp; int64_t qdepth; - uint64_t qmax = mg->mg_max_alloc_queue_depth; + uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator]; if (!mc->mc_alloc_throttle_enabled) return (B_TRUE); @@ -1042,7 +1122,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, if (mg->mg_no_free_space) return (B_FALSE); - qdepth = refcount_count(&mg->mg_alloc_queue_depth); + qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]); /* * If this metaslab group is below its qmax or it's @@ -1061,9 +1141,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * groups at the same time when we make this check. */ for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { - qmax = mgp->mg_max_alloc_queue_depth; + qmax = mgp->mg_cur_max_alloc_queue_depth[allocator]; - qdepth = refcount_count(&mgp->mg_alloc_queue_depth); + qdepth = refcount_count( + &mgp->mg_alloc_queue_depth[allocator]); /* * If there is another metaslab group that @@ -1105,18 +1186,14 @@ metaslab_rangesize_compare(const void *x1, const void *x2) uint64_t rs_size1 = r1->rs_end - r1->rs_start; uint64_t rs_size2 = r2->rs_end - r2->rs_start; - if (rs_size1 < rs_size2) - return (-1); - if (rs_size1 > rs_size2) - return (1); + int cmp = AVL_CMP(rs_size1, rs_size2); + if (likely(cmp)) + return (cmp); if (r1->rs_start < r2->rs_start) return (-1); - if (r1->rs_start > r2->rs_start) - return (1); - - return (0); + return (AVL_CMP(r1->rs_start, r2->rs_start)); } /* @@ -1468,9 +1545,12 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); + ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; + ms->ms_allocator = -1; + ms->ms_new = B_TRUE; /* * We only open space map objects that already exist. 
All others @@ -1567,6 +1647,7 @@ metaslab_fini(metaslab_t *msp) cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); mutex_destroy(&msp->ms_sync_lock); + ASSERT3U(msp->ms_allocator, ==, -1); kmem_free(msp, sizeof (metaslab_t)); } @@ -1658,7 +1739,7 @@ metaslab_set_fragmentation(metaslab_t *msp) if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) { msp->ms_condense_wanted = B_TRUE; vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); - spa_dbgmsg(spa, "txg %llu, requesting force condense: " + zfs_dbgmsg("txg %llu, requesting force condense: " "ms_id %llu, vdev_id %llu", txg, msp->ms_id, vd->vdev_id); } @@ -1963,19 +2044,59 @@ metaslab_weight(metaslab_t *msp) } static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight) +metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, + int allocator, uint64_t activation_weight) +{ + /* + * If we're activating for the claim code, we don't want to actually + * set the metaslab up for a specific allocator. + */ + if (activation_weight == METASLAB_WEIGHT_CLAIM) + return (0); + metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ? + mg->mg_primaries : mg->mg_secondaries); + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + mutex_enter(&mg->mg_lock); + if (arr[allocator] != NULL) { + mutex_exit(&mg->mg_lock); + return (EEXIST); + } + + arr[allocator] = msp; + ASSERT3S(msp->ms_allocator, ==, -1); + msp->ms_allocator = allocator; + msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); + mutex_exit(&mg->mg_lock); + + return (0); +} + +static int +metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) { ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { + int error = 0; metaslab_load_wait(msp); if (!msp->ms_loaded) { - int error = metaslab_load(msp); - if (error) { + if ((error = metaslab_load(msp)) != 0) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } } + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { + /* + * The metaslab was activated for another allocator + * while we were waiting, we should reselect. 
+ */ + return (EBUSY); + } + if ((error = metaslab_activate_allocator(msp->ms_group, msp, + allocator, activation_weight)) != 0) { + return (error); + } msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort(msp->ms_group, msp, @@ -1988,6 +2109,34 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight) } static void +metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp, + uint64_t weight) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { + metaslab_group_sort(mg, msp, weight); + return; + } + + mutex_enter(&mg->mg_lock); + ASSERT3P(msp->ms_group, ==, mg); + if (msp->ms_primary) { + ASSERT3U(0, <=, msp->ms_allocator); + ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); + ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp); + ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); + mg->mg_primaries[msp->ms_allocator] = NULL; + } else { + ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); + ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp); + mg->mg_secondaries[msp->ms_allocator] = NULL; + } + msp->ms_allocator = -1; + metaslab_group_sort_impl(mg, msp, weight); + mutex_exit(&mg->mg_lock); +} + +static void metaslab_passivate(metaslab_t *msp, uint64_t weight) { uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; @@ -2002,7 +2151,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight) ASSERT0(weight & METASLAB_ACTIVE_MASK); msp->ms_activation_weight = 0; - metaslab_group_sort(msp->ms_group, msp, weight); + metaslab_passivate_allocator(msp->ms_group, msp, weight); ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); } @@ -2105,17 +2254,6 @@ metaslab_group_preload(metaslab_group_t *mg) * * 3. The on-disk size of the space map should actually decrease. * - * Checking the first condition is tricky since we don't want to walk - * the entire AVL tree calculating the estimated on-disk size. Instead we - * use the size-ordered range tree in the metaslab and calculate the - * size required to write out the largest segment in our free tree. If the - * size required to represent that segment on disk is larger than the space - * map object then we avoid condensing this map. - * - * To determine the second criterion we use a best-case estimate and assume - * each segment can be represented on-disk as a single 64-bit entry. We refer - * to this best-case estimate as the space map's minimal form. - * * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. * Instead, we apply the heuristic described in the block comment for @@ -2126,9 +2264,6 @@ static boolean_t metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; - range_seg_t *rs; - uint64_t size, entries, segsz, object_size, optimal_size, record_size; - dmu_object_info_t doi; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1 << vd->vdev_ashift; uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); @@ -2154,34 +2289,22 @@ metaslab_should_condense(metaslab_t *msp) msp->ms_condense_checked_txg = current_txg; /* - * Use the ms_allocatable_by_size range tree, which is ordered by - * size, to obtain the largest segment in the free tree. We always - * condense metaslabs that are empty and metaslabs for which a - * condense request has been made. + * We always condense metaslabs that are empty and metaslabs for + * which a condense request has been made. 
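
/*
 * Caller-side contract sketch for metaslab_activate() above, condensed
 * from the call sites later in this patch (illustrative, not verbatim):
 * EBUSY means another allocator activated the metaslab while we waited for
 * it to load, so allocation callers pick a different one, while the claim
 * path can treat EBUSY as success because METASLAB_WEIGHT_CLAIM never
 * takes a per-allocator slot.
 */
int error = metaslab_activate(msp, allocator, activation_weight);
if (error == EBUSY && activation_weight == METASLAB_WEIGHT_CLAIM)
	error = 0;	/* already active; still usable for claiming */
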
*/ - rs = avl_last(&msp->ms_allocatable_by_size); - if (rs == NULL || msp->ms_condense_wanted) + if (avl_is_empty(&msp->ms_allocatable_by_size) || + msp->ms_condense_wanted) return (B_TRUE); - /* - * Calculate the number of 64-bit entries this segment would - * require when written to disk. If this single segment would be - * larger on-disk than the entire current on-disk structure, then - * clearly condensing will increase the on-disk structure size. - */ - size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - entries = size / (MIN(size, SM_RUN_MAX)); - segsz = entries * sizeof (uint64_t); - - optimal_size = - sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root); - object_size = space_map_length(msp->ms_sm); + uint64_t object_size = space_map_length(msp->ms_sm); + uint64_t optimal_size = space_map_estimate_optimal_size(sm, + msp->ms_allocatable, SM_NO_VDEVID); + dmu_object_info_t doi; dmu_object_info_from_db(sm->sm_dbuf, &doi); - record_size = MAX(doi.doi_data_block_size, vdev_blocksize); + uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); - return (segsz <= object_size && - object_size >= (optimal_size * zfs_condense_pct / 100) && + return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } @@ -2256,11 +2379,11 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) * optimal, this is typically close to optimal, and much cheaper to * compute. */ - space_map_write(sm, condense_tree, SM_ALLOC, tx); + space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - space_map_write(sm, msp->ms_allocatable, SM_FREE, tx); + space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; } @@ -2372,8 +2495,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_condense(msp, txg, tx); } else { mutex_exit(&msp->ms_lock); - space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); - space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx); + space_map_write(msp->ms_sm, alloctree, SM_ALLOC, + SM_NO_VDEVID, tx); + space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, + SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); } @@ -2388,7 +2513,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, - msp->ms_checkpointing, SM_FREE, tx); + msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); space_map_update(vd->vdev_checkpoint_sm); @@ -2580,22 +2705,34 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); } + if (msp->ms_new) { + msp->ms_new = B_FALSE; + mutex_enter(&mg->mg_lock); + mg->mg_ms_ready++; + mutex_exit(&mg->mg_lock); + } /* * Calculate the new weights before unloading any metaslabs. * This will give us the most accurate weighting. */ - metaslab_group_sort(mg, msp, metaslab_weight(msp)); + metaslab_group_sort(mg, msp, metaslab_weight(msp) | + (msp->ms_weight & METASLAB_ACTIVE_MASK)); /* * If the metaslab is loaded and we've not tried to load or allocate * from it in 'metaslab_unload_delay' txgs, then unload it. 
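
/*
 * Worked example of the rewritten condense test in metaslab_should_condense()
 * above, assuming zfs_condense_pct = 200 (its usual default; assumed, not
 * shown in this diff) and writing T for zfs_metaslab_condense_block_threshold:
 * with a 4K record_size and a space map whose on-disk length is 1M, the
 * metaslab is condensed only if
 *
 *	1M >= optimal_size * 200 / 100	(rewriting would at least halve it)
 *	1M > T * 4K			(the map is big enough to bother)
 */
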
*/ if (msp->ms_loaded && + msp->ms_initializing == 0 && msp->ms_selected_txg + metaslab_unload_delay < txg) { for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); } + if (msp->ms_allocator != -1) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } if (!metaslab_debug_unload) metaslab_unload(msp); @@ -2689,7 +2826,8 @@ metaslab_alloc_trace_fini(void) */ static void metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, - metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset) + metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, + int allocator) { if (!metaslab_trace_enabled) return; @@ -2722,6 +2860,7 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, mat->mat_dva_id = dva_id; mat->mat_offset = offset; mat->mat_weight = 0; + mat->mat_allocator = allocator; if (msp != NULL) mat->mat_weight = msp->ms_weight; @@ -2762,35 +2901,56 @@ metaslab_trace_fini(zio_alloc_list_t *zal) */ static void -metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) +metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, + int allocator) { if (!(flags & METASLAB_ASYNC_ALLOC) || - flags & METASLAB_DONT_THROTTLE) + (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; - (void) refcount_add(&mg->mg_alloc_queue_depth, tag); + (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag); +} + +static void +metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) +{ + uint64_t max = mg->mg_max_alloc_queue_depth; + uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator]; + while (cur < max) { + if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator], + cur, cur + 1) == cur) { + atomic_inc_64( + &mg->mg_class->mc_alloc_max_slots[allocator]); + return; + } + cur = mg->mg_cur_max_alloc_queue_depth[allocator]; + } } void -metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) +metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, + int allocator, boolean_t io_complete) { if (!(flags & METASLAB_ASYNC_ALLOC) || - flags & METASLAB_DONT_THROTTLE) + (flags & METASLAB_DONT_THROTTLE)) return; metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; if (!mg->mg_class->mc_alloc_throttle_enabled) return; - (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); + (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag); + if (io_complete) + metaslab_group_increment_qdepth(mg, allocator); } void -metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) +metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, + int allocator) { #ifdef ZFS_DEBUG const dva_t *dva = bp->blk_dva; @@ -2799,7 +2959,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) for (int d = 0; d < ndvas; d++) { uint64_t vdev = DVA_GET_VDEV(&dva[d]); metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; - VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); + VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator], + tag)); } #endif } @@ -2812,6 +2973,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) metaslab_class_t *mc = msp->ms_group->mg_class; VERIFY(!msp->ms_condensing); + VERIFY0(msp->ms_initializing); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { @@ -2841,91 +3003,147 @@ 
metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
 	return (start);
 }
 
+/*
+ * Find the metaslab with the highest weight that is less than what we've
+ * already tried. In the common case, this means that we will examine each
+ * metaslab at most once. Note that concurrent callers could reorder metaslabs
+ * by activation/passivation once we have dropped the mg_lock. If a metaslab is
+ * activated by another thread, and we fail to allocate from the metaslab we
+ * have selected, we may not try the newly-activated metaslab, and instead
+ * activate another metaslab. This is not optimal, but generally does not cause
+ * any problems (a possible exception being if every metaslab is completely
+ * full except for the newly-activated metaslab which we fail to examine).
+ */
+static metaslab_t *
+find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
+    dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
+    zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
+{
+	avl_index_t idx;
+	avl_tree_t *t = &mg->mg_metaslab_tree;
+	metaslab_t *msp = avl_find(t, search, &idx);
+	if (msp == NULL)
+		msp = avl_nearest(t, idx, AVL_AFTER);
+
+	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
+		int i;
+		if (!metaslab_should_allocate(msp, asize)) {
+			metaslab_trace_add(zal, mg, msp, asize, d,
+			    TRACE_TOO_SMALL, allocator);
+			continue;
+		}
+
+		/*
+		 * If the selected metaslab is condensing or being
+		 * initialized, skip it.
+		 */
+		if (msp->ms_condensing || msp->ms_initializing > 0)
+			continue;
+
+		*was_active = msp->ms_allocator != -1;
+		/*
+		 * If we're activating as primary, this is our first allocation
+		 * from this disk, so we don't need to check how close we are.
+		 * If the metaslab under consideration was already active,
+		 * we're getting desperate enough to steal another allocator's
+		 * metaslab, so we still don't care about distances.
+		 */
+		if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
+			break;
+
+		uint64_t target_distance = min_distance +
+		    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
+		    min_distance >> 1);
+
+		for (i = 0; i < d; i++) {
+			if (metaslab_distance(msp, &dva[i]) < target_distance)
+				break;
+		}
+		if (i == d)
+			break;
+	}
+
+	if (msp != NULL) {
+		search->ms_weight = msp->ms_weight;
+		search->ms_start = msp->ms_start + 1;
+		search->ms_allocator = msp->ms_allocator;
+		search->ms_primary = msp->ms_primary;
+	}
+	return (msp);
+}
+
+/* ARGSUSED */
 static uint64_t
 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
-    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
+    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
+    int allocator)
 {
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 	uint64_t activation_weight;
-	uint64_t target_distance;
-	int i;
+	boolean_t tertiary = B_FALSE;
 
 	activation_weight = METASLAB_WEIGHT_PRIMARY;
-	for (i = 0; i < d; i++) {
-		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+	for (int i = 0; i < d; i++) {
+		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_SECONDARY;
+		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+			tertiary = B_TRUE;
 			break;
 		}
 	}
+
+	/*
+	 * If we don't have enough metaslabs active to fill the entire array, we
+	 * just use the 0th slot.
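
/*
 * The atomic_cas_64() loop in metaslab_group_increment_qdepth() above is a
 * standard bounded atomic increment. The same shape in isolation, as a
 * userland C11 sketch with invented names:
 */
#include <stdatomic.h>
#include <stdint.h>

static int
bounded_inc_sketch(_Atomic uint64_t *cur, uint64_t max)
{
	uint64_t c = atomic_load(cur);

	while (c < max) {
		/* On failure, c is refreshed with the observed value. */
		if (atomic_compare_exchange_weak(cur, &c, c + 1))
			return (1);	/* raised the current ceiling by one */
	}
	return (0);	/* ceiling already at max; leave it alone */
}
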
+ */ + if (mg->mg_ms_ready < mg->mg_allocators * 2) { + tertiary = B_FALSE; + allocator = 0; + } + + ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); + metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); search->ms_weight = UINT64_MAX; search->ms_start = 0; + /* + * At the end of the metaslab tree are the already-active metaslabs, + * first the primaries, then the secondaries. When we resume searching + * through the tree, we need to consider ms_allocator and ms_primary so + * we start in the location right after where we left off, and don't + * accidentally loop forever considering the same metaslabs. + */ + search->ms_allocator = -1; + search->ms_primary = B_TRUE; for (;;) { - boolean_t was_active; - avl_tree_t *t = &mg->mg_metaslab_tree; - avl_index_t idx; + boolean_t was_active = B_FALSE; mutex_enter(&mg->mg_lock); - /* - * Find the metaslab with the highest weight that is less - * than what we've already tried. In the common case, this - * means that we will examine each metaslab at most once. - * Note that concurrent callers could reorder metaslabs - * by activation/passivation once we have dropped the mg_lock. - * If a metaslab is activated by another thread, and we fail - * to allocate from the metaslab we have selected, we may - * not try the newly-activated metaslab, and instead activate - * another metaslab. This is not optimal, but generally - * does not cause any problems (a possible exception being - * if every metaslab is completely full except for the - * the newly-activated metaslab which we fail to examine). - */ - msp = avl_find(t, search, &idx); - if (msp == NULL) - msp = avl_nearest(t, idx, AVL_AFTER); - for (; msp != NULL; msp = AVL_NEXT(t, msp)) { - - if (!metaslab_should_allocate(msp, asize)) { - metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_TOO_SMALL); - continue; - } - - /* - * If the selected metaslab is condensing, skip it. - */ - if (msp->ms_condensing) - continue; - - was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; - if (activation_weight == METASLAB_WEIGHT_PRIMARY) - break; - - target_distance = min_distance + - (space_map_allocated(msp->ms_sm) != 0 ? 0 : - min_distance >> 1); - - for (i = 0; i < d; i++) { - if (metaslab_distance(msp, &dva[i]) < - target_distance) - break; - } - if (i == d) - break; + if (activation_weight == METASLAB_WEIGHT_PRIMARY && + mg->mg_primaries[allocator] != NULL) { + msp = mg->mg_primaries[allocator]; + was_active = B_TRUE; + } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && + mg->mg_secondaries[allocator] != NULL && !tertiary) { + msp = mg->mg_secondaries[allocator]; + was_active = B_TRUE; + } else { + msp = find_valid_metaslab(mg, activation_weight, dva, d, + min_distance, asize, allocator, zal, search, + &was_active); } + mutex_exit(&mg->mg_lock); if (msp == NULL) { kmem_free(search, sizeof (*search)); return (-1ULL); } - search->ms_weight = msp->ms_weight; - search->ms_start = msp->ms_start + 1; mutex_enter(&msp->ms_lock); - /* * Ensure that the metaslab we have selected is still * capable of handling our request. 
It's possible that @@ -2939,18 +3157,32 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, continue; } - if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && - activation_weight == METASLAB_WEIGHT_PRIMARY) { - metaslab_passivate(msp, - msp->ms_weight & ~METASLAB_ACTIVE_MASK); + /* + * If the metaslab is freshly activated for an allocator that + * isn't the one we're allocating from, or if it's a primary and + * we're seeking a secondary (or vice versa), we go back and + * select a new metaslab. + */ + if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && + (msp->ms_allocator != -1) && + (msp->ms_allocator != allocator || ((activation_weight == + METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { mutex_exit(&msp->ms_lock); continue; } - if (metaslab_activate(msp, activation_weight) != 0) { + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_WEIGHT_CLAIM); mutex_exit(&msp->ms_lock); continue; } + + if (metaslab_activate(msp, allocator, activation_weight) != 0) { + mutex_exit(&msp->ms_lock); + continue; + } + msp->ms_selected_txg = txg; /* @@ -2963,24 +3195,35 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, if (!metaslab_should_allocate(msp, asize)) { /* Passivate this metaslab and select a new one. */ metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_TOO_SMALL); + TRACE_TOO_SMALL, allocator); goto next; } /* * If this metaslab is currently condensing then pick again as * we can't manipulate this metaslab until it's committed - * to disk. + * to disk. If this metaslab is being initialized, we shouldn't + * allocate from it since the allocated region might be + * overwritten after allocation. */ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, - TRACE_CONDENSING); + TRACE_CONDENSING, allocator); + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + mutex_exit(&msp->ms_lock); + continue; + } else if (msp->ms_initializing > 0) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_INITIALIZING, allocator); + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); continue; } offset = metaslab_block_alloc(msp, asize, txg); - metaslab_trace_add(zal, mg, msp, asize, d, offset); + metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); if (offset != -1ULL) { /* Proactively passivate the metaslab, if needed */ @@ -3036,19 +3279,20 @@ next: static uint64_t metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, - uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) + uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d, + int allocator) { uint64_t offset; ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, - min_distance, dva, d); + min_distance, dva, d, allocator); mutex_enter(&mg->mg_lock); if (offset == -1ULL) { mg->mg_failed_allocations++; metaslab_trace_add(zal, mg, NULL, asize, d, - TRACE_GROUP_FAILURE); + TRACE_GROUP_FAILURE, allocator); if (asize == SPA_GANGBLOCKSIZE) { /* * This metaslab group was unable to allocate @@ -3083,7 +3327,7 @@ int ditto_same_vdev_distance_shift = 3; int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, - zio_alloc_list_t *zal) + zio_alloc_list_t *zal, int allocator) { metaslab_group_t *mg, *rotor; vdev_t *vd; @@ -3095,7 +3339,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * For testing, make 
some blocks above a certain size be gang blocks. */ if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) { - metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG); + metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, + allocator); return (SET_ERROR(ENOSPC)); } @@ -3181,12 +3426,12 @@ top: */ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { allocatable = metaslab_group_allocatable(mg, rotor, - psize); + psize, allocator); } if (!allocatable) { metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_NOT_ALLOCATABLE); + TRACE_NOT_ALLOCATABLE, allocator); goto next; } @@ -3201,7 +3446,7 @@ top: vd->vdev_state < VDEV_STATE_HEALTHY) && d == 0 && !try_hard && vd->vdev_children == 0) { metaslab_trace_add(zal, mg, NULL, psize, d, - TRACE_VDEV_ERROR); + TRACE_VDEV_ERROR, allocator); goto next; } @@ -3225,7 +3470,7 @@ top: ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, - distance, dva, d); + distance, dva, d, allocator); if (offset != -1ULL) { /* @@ -3288,7 +3533,7 @@ next: bzero(&dva[d], sizeof (dva_t)); - metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC); + metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); return (SET_ERROR(ENOSPC)); } @@ -3355,7 +3600,7 @@ metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, return; if (spa->spa_vdev_removal != NULL && - spa->spa_vdev_removal->svr_vdev == vd && + spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && vdev_is_concrete(vd)) { /* * Note: we check if the vdev is concrete because when @@ -3589,18 +3834,20 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) * the reservation. */ boolean_t -metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, - int flags) +metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, + zio_t *zio, int flags) { uint64_t available_slots = 0; boolean_t slot_reserved = B_FALSE; + uint64_t max = mc->mc_alloc_max_slots[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); - uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots); - if (reserved_slots < mc->mc_alloc_max_slots) - available_slots = mc->mc_alloc_max_slots - reserved_slots; + uint64_t reserved_slots = + refcount_count(&mc->mc_alloc_slots[allocator]); + if (reserved_slots < max) + available_slots = max - reserved_slots; if (slots <= available_slots || GANG_ALLOCATION(flags)) { /* @@ -3608,7 +3855,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, * them individually when an I/O completes. 
*/ for (int d = 0; d < slots; d++) { - reserved_slots = refcount_add(&mc->mc_alloc_slots, zio); + reserved_slots = + refcount_add(&mc->mc_alloc_slots[allocator], + zio); } zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; slot_reserved = B_TRUE; @@ -3619,12 +3868,14 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, } void -metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) +metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, + int allocator, zio_t *zio) { ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); for (int d = 0; d < slots; d++) { - (void) refcount_remove(&mc->mc_alloc_slots, zio); + (void) refcount_remove(&mc->mc_alloc_slots[allocator], + zio); } mutex_exit(&mc->mc_lock); } @@ -3646,7 +3897,13 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, mutex_enter(&msp->ms_lock); if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); + /* + * No need to fail in that case; someone else has activated the + * metaslab, but that doesn't preclude us from using it. + */ + if (error == EBUSY) + error = 0; if (error == 0 && !range_tree_contains(msp->ms_allocatable, offset, size)) @@ -3751,7 +4008,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, - zio_alloc_list_t *zal, zio_t *zio) + zio_alloc_list_t *zal, zio_t *zio, int allocator) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; @@ -3774,12 +4031,13 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, for (int d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, - txg, flags, zal); + txg, flags, zal, allocator); if (error != 0) { for (d--; d >= 0; d--) { metaslab_unalloc_dva(spa, &dva[d], txg); metaslab_group_alloc_decrement(spa, - DVA_GET_VDEV(&dva[d]), zio, flags); + DVA_GET_VDEV(&dva[d]), zio, flags, + allocator, B_FALSE); bzero(&dva[d], sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); @@ -3790,7 +4048,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, * based on the newly allocated dva. */ metaslab_group_alloc_increment(spa, - DVA_GET_VDEV(&dva[d]), zio, flags); + DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c index 4ebadace742d..6359b72503ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c @@ -491,7 +491,6 @@ range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, static range_seg_t * range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) { - avl_index_t where; range_seg_t rsearch; uint64_t end = start + size; @@ -499,7 +498,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) rsearch.rs_start = start; rsearch.rs_end = end; - return (avl_find(&rt->rt_root, &rsearch, &where)); + return (avl_find(&rt->rt_root, &rsearch, NULL)); } range_seg_t * @@ -651,3 +650,23 @@ range_tree_is_empty(range_tree_t *rt) ASSERT(rt != NULL); return (range_tree_space(rt) == 0); } + +uint64_t +range_tree_min(range_tree_t *rt) +{ + range_seg_t *rs = avl_first(&rt->rt_root); + return (rs != NULL ? 
rs->rs_start : 0); +} + +uint64_t +range_tree_max(range_tree_t *rt) +{ + range_seg_t *rs = avl_last(&rt->rt_root); + return (rs != NULL ? rs->rs_end : 0); +} + +uint64_t +range_tree_span(range_tree_t *rt) +{ + return (range_tree_max(rt) - range_tree_min(rt)); +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c index dd6e90c7796b..50f3c0ad822f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c @@ -242,31 +242,23 @@ sa_cache_fini(void) static int layout_num_compare(const void *arg1, const void *arg2) { - const sa_lot_t *node1 = arg1; - const sa_lot_t *node2 = arg2; + const sa_lot_t *node1 = (const sa_lot_t *)arg1; + const sa_lot_t *node2 = (const sa_lot_t *)arg2; - if (node1->lot_num > node2->lot_num) - return (1); - else if (node1->lot_num < node2->lot_num) - return (-1); - return (0); + return (AVL_CMP(node1->lot_num, node2->lot_num)); } static int layout_hash_compare(const void *arg1, const void *arg2) { - const sa_lot_t *node1 = arg1; - const sa_lot_t *node2 = arg2; + const sa_lot_t *node1 = (const sa_lot_t *)arg1; + const sa_lot_t *node2 = (const sa_lot_t *)arg2; - if (node1->lot_hash > node2->lot_hash) - return (1); - if (node1->lot_hash < node2->lot_hash) - return (-1); - if (node1->lot_instance > node2->lot_instance) - return (1); - if (node1->lot_instance < node2->lot_instance) - return (-1); - return (0); + int cmp = AVL_CMP(node1->lot_hash, node2->lot_hash); + if (likely(cmp)) + return (cmp); + + return (AVL_CMP(node1->lot_instance, node2->lot_instance)); } boolean_t diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 489956f1857b..c8a635ae54f3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -55,6 +55,7 @@ #include <sys/vdev_removal.h> #include <sys/vdev_indirect_mapping.h> #include <sys/vdev_indirect_births.h> +#include <sys/vdev_initialize.h> #include <sys/metaslab.h> #include <sys/metaslab_impl.h> #include <sys/uberblock_impl.h> @@ -443,8 +444,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); - if (err = dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &ds)) { + err = dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &ds); + if (err != 0) { dsl_pool_config_exit(dp, FTAG); break; } @@ -599,7 +601,8 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) break; } - if (error = dmu_objset_hold(strval, FTAG, &os)) + error = dmu_objset_hold(strval, FTAG, &os); + if (error != 0) break; /* @@ -902,19 +905,14 @@ spa_change_guid(spa_t *spa) static int spa_error_entry_compare(const void *a, const void *b) { - spa_error_entry_t *sa = (spa_error_entry_t *)a; - spa_error_entry_t *sb = (spa_error_entry_t *)b; + const spa_error_entry_t *sa = (const spa_error_entry_t *)a; + const spa_error_entry_t *sb = (const spa_error_entry_t *)b; int ret; - ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, + ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); - if (ret < 0) - return (-1); - else if (ret > 0) - return (1); - else - return (0); + return (AVL_ISIGN(ret)); } /* @@ -1215,8 +1213,10 @@ spa_activate(spa_t *spa, int mode) */ trim_thread_create(spa); - for (size_t i = 0; i < TXG_SIZE; i++) - spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0); + for (size_t i = 0; i < TXG_SIZE; i++) { + spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 
+ ZIO_FLAG_CANFAIL); + } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); @@ -1388,6 +1388,11 @@ spa_unload(spa_t *spa) */ spa_async_suspend(spa); + if (spa->spa_root_vdev) { + vdev_initialize_stop_all(spa->spa_root_vdev, + VDEV_INITIALIZE_ACTIVE); + } + /* * Stop syncing. */ @@ -1403,10 +1408,10 @@ spa_unload(spa_t *spa) * calling taskq_wait(mg_taskq). */ if (spa->spa_root_vdev != NULL) { - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); - spa_config_exit(spa, SCL_ALL, FTAG); + spa_config_exit(spa, SCL_ALL, spa); } /* @@ -1440,7 +1445,7 @@ spa_unload(spa_t *spa) bpobj_close(&spa->spa_deferred_bpobj); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); /* * Close all vdevs. @@ -1502,7 +1507,7 @@ spa_unload(spa_t *spa) spa->spa_comment = NULL; } - spa_config_exit(spa, SCL_ALL, FTAG); + spa_config_exit(spa, SCL_ALL, spa); } /* @@ -3954,6 +3959,10 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) spa_restart_removal(spa); spa_spawn_aux_threads(spa); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_initialize_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_load_note(spa, "LOADED"); @@ -4362,18 +4371,14 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) } static void -spa_add_feature_stats(spa_t *spa, nvlist_t *config) +spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) { - nvlist_t *features; zap_cursor_t zc; zap_attribute_t za; - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); - VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); - /* We may be unable to read features if pool is suspended. */ if (spa_suspended(spa)) - goto out; + return; if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, @@ -4382,7 +4387,7 @@ spa_add_feature_stats(spa_t *spa, nvlist_t *config) zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); - VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); @@ -4395,16 +4400,62 @@ spa_add_feature_stats(spa_t *spa, nvlist_t *config) zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); - VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } +} -out: - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, - features) == 0); - nvlist_free(features); +static void +spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) +{ + int i; + + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t feature = spa_feature_table[i]; + uint64_t refcount; + + if (feature_get_refcount(spa, &feature, &refcount) != 0) + continue; + + VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); + } +} + +/* + * Store a list of pool features and their reference counts in the + * config. + * + * The first time this is called on a spa, allocate a new nvlist, fetch + * the pool features and reference counts from disk, then save the list + * in the spa. In subsequent calls on the same spa use the saved nvlist + * and refresh its values from the cached reference counts. 
This + * ensures we don't block here on I/O on a suspended pool so 'zpool + * clear' can resume the pool. + */ +static void +spa_add_feature_stats(spa_t *spa, nvlist_t *config) +{ + nvlist_t *features; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + mutex_enter(&spa->spa_feat_stats_lock); + features = spa->spa_feat_stats; + + if (features != NULL) { + spa_feature_stats_from_cache(spa, features); + } else { + VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); + spa->spa_feat_stats = features; + spa_feature_stats_from_disk(spa, features); + } + + VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, + features)); + + mutex_exit(&spa->spa_feat_stats_lock); } int @@ -5675,6 +5726,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, * in which case we can modify its state. */ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { + /* * Objsets may be open only because they're dirty, so we * have to force it to sync before checking spa_refcnt. @@ -5709,6 +5761,18 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, } /* + * We're about to export or destroy this pool. Make sure + * we stop all initialization activity here before we + * set the spa_final_txg. This will ensure that all + * dirty data resulting from the initialization is + * committed to disk before we unload the pool. + */ + if (spa->spa_root_vdev != NULL) { + vdev_initialize_stop_all(spa->spa_root_vdev, + VDEV_INITIALIZE_ACTIVE); + } + + /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. @@ -5837,8 +5901,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && - tvd->vdev_ashift != - spa->spa_vdev_removal->svr_vdev->vdev_ashift) { + tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* Fail if top level vdev is raidz */ @@ -5954,10 +6017,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) return (spa_vdev_exit(spa, NULL, txg, error)); } - if (spa->spa_vdev_removal != NULL || - spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - } if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -6401,6 +6462,86 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) return (error); } +int +spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type) +{ + /* + * We hold the namespace lock through the whole function + * to prevent any changes to the pool while we're starting or + * stopping initialization. The config and state locks are held so that + * we can properly assess the vdev state before we commit to + * the initializing operation. + */ + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + + /* Look up vdev and ensure it's a leaf.
*/ + vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_detached) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ENODEV)); + } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EINVAL)); + } else if (!vdev_writeable(vd)) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EROFS)); + } + mutex_enter(&vd->vdev_initialize_lock); + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + + /* + * When we activate an initialize action we check to see + * if the vdev_initialize_thread is NULL. We do this instead + * of using the vdev_initialize_state since there might be + * a previous initialization process which has completed but + * whose thread has not yet exited. + */ + if (cmd_type == POOL_INITIALIZE_DO && + (vd->vdev_initialize_thread != NULL || + vd->vdev_top->vdev_removing)) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EBUSY)); + } else if (cmd_type == POOL_INITIALIZE_CANCEL && + (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && + vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ESRCH)); + } else if (cmd_type == POOL_INITIALIZE_SUSPEND && + vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ESRCH)); + } + + switch (cmd_type) { + case POOL_INITIALIZE_DO: + vdev_initialize(vd); + break; + case POOL_INITIALIZE_CANCEL: + vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); + break; + case POOL_INITIALIZE_SUSPEND: + vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED); + break; + default: + panic("invalid cmd_type %llu", (unsigned long long)cmd_type); + } + mutex_exit(&vd->vdev_initialize_lock); + + /* Sync out the initializing state */ + txg_wait_synced(spa->spa_dsl_pool, 0); + mutex_exit(&spa_namespace_lock); + + return (0); +} + + /* * Split a set of devices from their mirrors, and create a new pool from them. */ @@ -6608,6 +6749,19 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); + for (c = 0; c < children; c++) { + if (vml[c] != NULL) { + /* + * Temporarily stop the initializing activity. We set + * the state to ACTIVE so that we know to resume + * the initializing once the split has completed.
+ */ + mutex_enter(&vml[c]->vdev_initialize_lock); + vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE); + mutex_exit(&vml[c]->vdev_initialize_lock); + } + } + #ifndef illumos /* mark that we are creating new spa by splitting */ newspa->spa_splitting_newspa = B_TRUE; @@ -6702,6 +6856,10 @@ out: if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } + + /* restart initializing disks as necessary */ + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); @@ -7066,6 +7224,14 @@ spa_async_thread(void *arg) if (tasks & SPA_ASYNC_RESILVER) dsl_resilver_restart(spa->spa_dsl_pool, 0); + if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_initialize_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&spa_namespace_lock); + } + /* * Let the world know that we're done. */ @@ -7765,8 +7931,9 @@ spa_sync(spa_t *spa, uint64_t txg) * Wait for i/os issued in open context that need to complete * before this txg syncs. */ - VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK])); - spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0); + (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); + spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); /* * Lock out configuration changes. @@ -7776,9 +7943,11 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; - mutex_enter(&spa->spa_alloc_lock); - VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); - mutex_exit(&spa->spa_alloc_lock); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } /* * If there are any pending vdev state changes, convert them @@ -7844,7 +8013,7 @@ spa_sync(spa_t *spa, uint64_t txg) * The max queue depth will not change in the middle of syncing * out this txg. */ - uint64_t queue_depth_total = 0; + uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; @@ -7858,18 +8027,23 @@ spa_sync(spa_t *spa, uint64_t txg) * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). 
*/ - ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); + for (int i = 0; i < spa->spa_alloc_count; i++) + ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i]))); mg->mg_max_alloc_queue_depth = max_queue_depth; - queue_depth_total += mg->mg_max_alloc_queue_depth; + + for (int i = 0; i < spa->spa_alloc_count; i++) { + mg->mg_cur_max_alloc_queue_depth[i] = + zfs_vdev_def_queue_depth; + } + slots_per_allocator += zfs_vdev_def_queue_depth; } metaslab_class_t *mc = spa_normal_class(spa); - ASSERT0(refcount_count(&mc->mc_alloc_slots)); - mc->mc_alloc_max_slots = queue_depth_total; + for (int i = 0; i < spa->spa_alloc_count; i++) { + ASSERT0(refcount_count(&mc->mc_alloc_slots[i])); + mc->mc_alloc_max_slots[i] = slots_per_allocator; + } mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - ASSERT3U(mc->mc_alloc_max_slots, <=, - max_queue_depth * rvd->vdev_children); - for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_indirect_state_sync_verify(vd); @@ -8052,14 +8226,17 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_sync_done(dp, txg); - mutex_enter(&spa->spa_alloc_lock); - VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); - mutex_exit(&spa->spa_alloc_lock); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } /* * Update usable space statistics. */ - while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) + while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) + != NULL) vdev_sync_done(vd, txg); spa_update_dspace(spa); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c index a4af48d8c58b..db0d2caa6107 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c @@ -203,13 +203,12 @@ typedef struct spa_checkpoint_discard_sync_callback_arg { } spa_checkpoint_discard_sync_callback_arg_t; static int -spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, - uint64_t size, void *arg) +spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) { spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; vdev_t *vd = sdc->sdc_vd; - metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - uint64_t end = offset + size; + metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; + uint64_t end = sme->sme_offset + sme->sme_run; if (sdc->sdc_entry_limit == 0) return (EINTR); @@ -224,8 +223,8 @@ spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, * metaslab boundaries. So if needed we could add code * that handles metaslab-crossing segments in the future. 
*/ - VERIFY3U(type, ==, SM_FREE); - VERIFY3U(offset, >=, ms->ms_start); + VERIFY3U(sme->sme_type, ==, SM_FREE); + VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* @@ -237,14 +236,15 @@ spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, mutex_enter(&ms->ms_lock); if (range_tree_is_empty(ms->ms_freeing)) vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); - range_tree_add(ms->ms_freeing, offset, size); + range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); - ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size); - ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size); + ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, + sme->sme_run); + ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run); - vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size; - vd->vdev_stat.vs_checkpoint_space -= size; + vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run; + vd->vdev_stat.vs_checkpoint_space -= sme->sme_run; sdc->sdc_entry_limit--; return (0); @@ -289,12 +289,13 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) * Thus, we set the maximum entries that the space map callback * will be applied to be half the entries that could fit in the * imposed memory limit. + * + * Note that since this is a conservative estimate we also + * assume the worst case scenario in our computation where each + * entry is two-word. */ uint64_t max_entry_limit = - (zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1; - - uint64_t entries_in_sm = - space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); + (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1; /* * Iterate from the end of the space map towards the beginning, @@ -318,14 +319,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) spa_checkpoint_discard_sync_callback_arg_t sdc; sdc.sdc_vd = vd; sdc.sdc_txg = tx->tx_txg; - sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit); + sdc.sdc_entry_limit = max_entry_limit; - uint64_t entries_before = entries_in_sm; + uint64_t words_before = + space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, spa_checkpoint_discard_sync_callback, &sdc, tx); - uint64_t entries_after = + uint64_t words_after = space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); #ifdef DEBUG @@ -333,9 +335,9 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) #endif zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, " - "deleted %llu entries - %llu entries are left", - tx->tx_txg, vd->vdev_id, (entries_before - entries_after), - entries_after); + "deleted %llu words - %llu words are left", + tx->tx_txg, vd->vdev_id, (words_before - words_after), + words_after); if (error != EINTR) { if (error != 0) { @@ -344,15 +346,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) "space map of vdev %llu\n", error, vd->vdev_id); } - ASSERT0(entries_after); + ASSERT0(words_after); ASSERT0(vd->vdev_checkpoint_sm->sm_alloc); - ASSERT0(vd->vdev_checkpoint_sm->sm_length); + ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); space_map_free(vd->vdev_checkpoint_sm, tx); space_map_close(vd->vdev_checkpoint_sm); vd->vdev_checkpoint_sm = NULL; - VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset, + VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c index ff1fcb4f0b21..6865dcff2212 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. */ @@ -559,6 +559,18 @@ spa_config_update(spa_t *spa, int what) */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; + + /* + * Explicitly skip vdevs that are indirect or + * log vdevs that are being removed. The reason + * is that both of those can have vdev_ms_array + * set to 0 and we wouldn't want to change their + * metaslab size nor call vdev_expand() on them. + */ + if (!vdev_is_concrete(tvd) || + (tvd->vdev_islog && tvd->vdev_removing)) + continue; + if (tvd->vdev_ms_array == 0) { vdev_ashift_optimize(tvd); vdev_metaslab_set_size(tvd); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index aaa3c310f1e8..2ceed8dd8040 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. @@ -41,6 +41,7 @@ #include <sys/zil.h> #include <sys/vdev_impl.h> #include <sys/vdev_file.h> +#include <sys/vdev_initialize.h> #include <sys/metaslab.h> #include <sys/uberblock_impl.h> #include <sys/txg.h> @@ -252,7 +253,7 @@ int spa_mode_global; * Everything except dprintf, spa, and indirect_remap is on by default * in debug builds. */ -int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA | ZFS_DEBUG_INDIRECT_REMAP); +int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_INDIRECT_REMAP); #else int zfs_flags = 0; #endif @@ -434,6 +435,8 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN, &spa_min_slop, 0, "Minimal value of reserved space"); +int spa_allocators = 4; + /*PRINTFLIKE2*/ void spa_load_failed(spa_t *spa, const char *fmt, ...) 
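The spa_allocators tunable introduced above sets how many independent allocator slots each pool gets; the spa_add() hunk below sizes the per-pool spa_alloc_locks and spa_alloc_trees arrays from it. A minimal sketch of the idea, assuming a hash-based slot choice — the pick_allocator() helper and its mixing constant are illustrative assumptions, not code from this commit:

	#include <stdint.h>

	/*
	 * Sketch only: map a key (e.g. one derived from a zio's bookmark)
	 * onto one of alloc_count allocator slots. The 64-bit mix spreads
	 * nearby keys across slots so each slot's mutex and AVL tree see
	 * roughly 1/N of the allocation traffic. The commit's actual
	 * selection logic lives in the zio pipeline, not in these hunks.
	 */
	static inline int
	pick_allocator(uint64_t key, int alloc_count)
	{
		key ^= key >> 33;
		key *= 0xff51afd7ed558ccdULL;	/* 64-bit mix constant */
		key ^= key >> 33;
		return ((int)(key % (uint64_t)alloc_count));
	}

With spa_allocators = 4 (the default above), allocation bookkeeping is spread over four mostly independent lock domains instead of the single spa_alloc_lock this change removes.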
@@ -705,7 +708,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); @@ -779,8 +782,16 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa_active_count++; } - avl_create(&spa->spa_alloc_tree, zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + spa->spa_alloc_count = spa_allocators; + spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * + sizeof (kmutex_t), KM_SLEEP); + spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * + sizeof (avl_tree_t), KM_SLEEP); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); + avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, + sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + } /* * Every pool starts with the default cachefile @@ -812,8 +823,6 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) KM_SLEEP) == 0); } - spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0); - spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; @@ -860,11 +869,20 @@ spa_remove(spa_t *spa) kmem_free(dp, sizeof (spa_config_dirent_t)); } - avl_destroy(&spa->spa_alloc_tree); + for (int i = 0; i < spa->spa_alloc_count; i++) { + avl_destroy(&spa->spa_alloc_trees[i]); + mutex_destroy(&spa->spa_alloc_locks[i]); + } + kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * + sizeof (kmutex_t)); + kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * + sizeof (avl_tree_t)); + list_destroy(&spa->spa_config_list); nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); + nvlist_free(spa->spa_feat_stats); spa_config_set(spa, NULL); #ifdef illumos @@ -895,7 +913,6 @@ spa_remove(spa_t *spa) cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); - mutex_destroy(&spa->spa_alloc_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); @@ -907,6 +924,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); + mutex_destroy(&spa->spa_feat_stats_lock); kmem_free(spa, sizeof (spa_t)); } @@ -1001,18 +1019,13 @@ typedef struct spa_aux { int aux_count; } spa_aux_t; -static int +static inline int spa_aux_compare(const void *a, const void *b) { - const spa_aux_t *sa = a; - const spa_aux_t *sb = b; + const spa_aux_t *sa = (const spa_aux_t *)a; + const spa_aux_t *sb = (const spa_aux_t *)b; - if (sa->aux_guid < sb->aux_guid) - return (-1); - else if (sa->aux_guid > sb->aux_guid) - return (1); - else - return (0); + return (AVL_CMP(sa->aux_guid, sb->aux_guid)); } void @@ -1299,6 +1312,12 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); + if (vd->vdev_ops->vdev_op_leaf) { + mutex_enter(&vd->vdev_initialize_lock); + vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); + mutex_exit(&vd->vdev_initialize_lock); + } + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_ALL, spa); @@ -1862,9 +1881,12 @@ 
spa_update_dspace(spa_t *spa) * allocated twice (on the old device and the new * device). */ - vdev_t *vd = spa->spa_vdev_removal->svr_vdev; + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vdev_t *vd = + vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); spa->spa_dspace -= spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + spa_config_exit(spa, SCL_VDEV, FTAG); } } @@ -2009,6 +2031,12 @@ bp_get_dsize(spa_t *spa, const blkptr_t *bp) return (dsize); } +uint64_t +spa_dirty_data(spa_t *spa) +{ + return (spa->spa_dsl_pool->dp_dirty_total); +} + /* * ========================================================================== * Initialization and Termination @@ -2023,11 +2051,8 @@ spa_name_compare(const void *a1, const void *a2) int s; s = strcmp(s1->spa_name, s2->spa_name); - if (s > 0) - return (1); - if (s < 0) - return (-1); - return (0); + + return (AVL_ISIGN(s)); } int @@ -2261,12 +2286,6 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) return (0); } -boolean_t -spa_debug_enabled(spa_t *spa) -{ - return (spa->spa_debug); -} - int spa_maxblocksize(spa_t *spa) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c index 2f15c5185c57..7356e3ceea75 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c @@ -43,68 +43,205 @@ SYSCTL_DECL(_vfs_zfs); * Note on space map block size: * * The data for a given space map can be kept on blocks of any size. - * Larger blocks entail fewer i/o operations, but they also cause the - * DMU to keep more data in-core, and also to waste more i/o bandwidth + * Larger blocks entail fewer I/O operations, but they also cause the + * DMU to keep more data in-core, and also to waste more I/O bandwidth * when only a few blocks have changed since the last transaction group. */ /* + * Enabled whenever we want to stress test the use of double-word + * space map entries. + */ +boolean_t zfs_force_some_double_word_sm_entries = B_FALSE; + +/* + * Override the default indirect block size of 128K, instead using 16K for + * spacemaps (2^14 bytes). This dramatically reduces write inflation since + * appending to a spacemap typically has to write one data block (4KB) and one + * or two indirect blocks (16K-32K, rather than 128K). + */ +int space_map_ibs = 14; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, + &space_map_ibs, 0, "Space map indirect block shift"); + +boolean_t +sm_entry_is_debug(uint64_t e) +{ + return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX); +} + +boolean_t +sm_entry_is_single_word(uint64_t e) +{ + uint8_t prefix = SM_PREFIX_DECODE(e); + return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX); +} + +boolean_t +sm_entry_is_double_word(uint64_t e) +{ + return (SM_PREFIX_DECODE(e) == SM2_PREFIX); +} + +/* * Iterate through the space map, invoking the callback on each (non-debug) * space map entry. 
*/ int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg) { - uint64_t *entry, *entry_map, *entry_map_end; - uint64_t bufsize, size, offset, end; + uint64_t sm_len = space_map_length(sm); + ASSERT3U(sm->sm_blksz, !=, 0); + + dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len, + ZIO_PRIORITY_SYNC_READ); + + uint64_t blksz = sm->sm_blksz; int error = 0; + for (uint64_t block_base = 0; block_base < sm_len && error == 0; + block_base += blksz) { + dmu_buf_t *db; + error = dmu_buf_hold(sm->sm_os, space_map_object(sm), + block_base, FTAG, &db, DMU_READ_PREFETCH); + if (error != 0) + return (error); - end = space_map_length(sm); + uint64_t *block_start = db->db_data; + uint64_t block_length = MIN(sm_len - block_base, blksz); + uint64_t *block_end = block_start + + (block_length / sizeof (uint64_t)); - bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); - entry_map = zio_buf_alloc(bufsize); + VERIFY0(P2PHASE(block_length, sizeof (uint64_t))); + VERIFY3U(block_length, !=, 0); + ASSERT3U(blksz, ==, db->db_size); - if (end > bufsize) { - dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize, - end - bufsize, ZIO_PRIORITY_SYNC_READ); - } + for (uint64_t *block_cursor = block_start; + block_cursor < block_end && error == 0; block_cursor++) { + uint64_t e = *block_cursor; + + if (sm_entry_is_debug(e)) /* Skip debug entries */ + continue; - for (offset = 0; offset < end && error == 0; offset += bufsize) { - size = MIN(end - offset, bufsize); - VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); - VERIFY(size != 0); - ASSERT3U(sm->sm_blksz, !=, 0); + uint64_t raw_offset, raw_run, vdev_id; + maptype_t type; + if (sm_entry_is_single_word(e)) { + type = SM_TYPE_DECODE(e); + vdev_id = SM_NO_VDEVID; + raw_offset = SM_OFFSET_DECODE(e); + raw_run = SM_RUN_DECODE(e); + } else { + /* it is a two-word entry */ + ASSERT(sm_entry_is_double_word(e)); + raw_run = SM2_RUN_DECODE(e); + vdev_id = SM2_VDEV_DECODE(e); + + /* move on to the second word */ + block_cursor++; + e = *block_cursor; + VERIFY3P(block_cursor, <=, block_end); + + type = SM2_TYPE_DECODE(e); + raw_offset = SM2_OFFSET_DECODE(e); + } - dprintf("object=%llu offset=%llx size=%llx\n", - space_map_object(sm), offset, size); + uint64_t entry_offset = (raw_offset << sm->sm_shift) + + sm->sm_start; + uint64_t entry_run = raw_run << sm->sm_shift; - error = dmu_read(sm->sm_os, space_map_object(sm), offset, size, - entry_map, DMU_READ_PREFETCH); - if (error != 0) - break; + VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); + VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); + ASSERT3U(entry_offset, >=, sm->sm_start); + ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size); + ASSERT3U(entry_run, <=, sm->sm_size); + ASSERT3U(entry_offset + entry_run, <=, + sm->sm_start + sm->sm_size); - entry_map_end = entry_map + (size / sizeof (uint64_t)); - for (entry = entry_map; entry < entry_map_end && error == 0; - entry++) { - uint64_t e = *entry; - uint64_t offset, size; + space_map_entry_t sme = { + .sme_type = type, + .sme_vdev = vdev_id, + .sme_offset = entry_offset, + .sme_run = entry_run + }; + error = callback(&sme, arg); + } + dmu_buf_rele(db, FTAG); + } + return (error); +} - if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ - continue; +/* + * Reads the entries from the last block of the space map into + * buf in reverse order. Populates nwords with number of words + * in the last block. + * + * Refer to block comment within space_map_incremental_destroy() + * to understand why this function is needed. 
+ */ +static int +space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf, + uint64_t bufsz, uint64_t *nwords) +{ + int error = 0; + dmu_buf_t *db; - offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) + - sm->sm_start; - size = SM_RUN_DECODE(e) << sm->sm_shift; + /* + * Find the offset of the last word in the space map and use + * that to read the last block of the space map with + * dmu_buf_hold(). + */ + uint64_t last_word_offset = + sm->sm_phys->smp_objsize - sizeof (uint64_t); + error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset, + FTAG, &db, DMU_READ_NO_PREFETCH); + if (error != 0) + return (error); - VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift)); - VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift)); - VERIFY3U(offset, >=, sm->sm_start); - VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size); - error = callback(SM_TYPE_DECODE(e), offset, size, arg); + ASSERT3U(sm->sm_object, ==, db->db_object); + ASSERT3U(sm->sm_blksz, ==, db->db_size); + ASSERT3U(bufsz, >=, db->db_size); + ASSERT(nwords != NULL); + + uint64_t *words = db->db_data; + *nwords = + (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + + ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t)); + + uint64_t n = *nwords; + uint64_t j = n - 1; + for (uint64_t i = 0; i < n; i++) { + uint64_t entry = words[i]; + if (sm_entry_is_double_word(entry)) { + /* + * Since we are populating the buffer backwards + * we have to be extra careful and add the two + * words of the double-word entry in the right + * order. + */ + ASSERT3U(j, >, 0); + buf[j - 1] = entry; + + i++; + ASSERT3U(i, <, n); + entry = words[i]; + buf[j] = entry; + j -= 2; + } else { + ASSERT(sm_entry_is_debug(entry) || + sm_entry_is_single_word(entry)); + buf[j] = entry; + j--; } } - zio_buf_free(entry_map, bufsize); + /* + * Assert that we wrote backwards all the + * way to the beginning of the buffer. + */ + ASSERT3S(j, ==, -1); + + dmu_buf_rele(db, FTAG); return (error); } @@ -118,124 +255,122 @@ int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, dmu_tx_t *tx) { - uint64_t bufsize, len; - uint64_t *entry_map; - int error = 0; - - len = space_map_length(sm); - bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); - entry_map = zio_buf_alloc(bufsize); + uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); + uint64_t *buf = zio_buf_alloc(bufsz); dmu_buf_will_dirty(sm->sm_dbuf, tx); /* - * Since we can't move the starting offset of the space map - * (e.g there are reference on-disk pointing to it), we destroy - * its entries incrementally starting from the end. + * Ideally we would want to iterate from the beginning of the + * space map to the end in incremental steps. The issue with this + * approach is that we don't have any field on-disk that points + * us where to start between each step. We could try zeroing out + * entries that we've destroyed, but this doesn't work either as + * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]). + * + * As a result, we destroy its entries incrementally starting from + * the end after applying the callback to each of them. * - * The logic that follows is basically the same as the one used - * in space_map_iterate() but it traverses the space map - * backwards: + * The problem with this approach is that we cannot literally + * iterate through the words in the space map backwards as we + * can't distinguish two-word space map entries from their second + * word. 
Thus we do the following: * - * 1] We figure out the size of the buffer that we want to use - * to read the on-disk space map entries. - * 2] We figure out the offset at the end of the space map where - * we will start reading entries into our buffer. - * 3] We read the on-disk entries into the buffer. - * 4] We iterate over the entries from end to beginning calling - * the callback function on each one. As we move from entry - * to entry we decrease the size of the space map, deleting - * effectively each entry. - * 5] If there are no more entries in the space map or the - * callback returns a value other than 0, we stop iterating - * over the space map. If there are entries remaining and - * the callback returned zero we go back to step [1]. + * 1] We get all the entries from the last block of the space map + * and put them into a buffer in reverse order. This way the + * last entry comes first in the buffer, the second to last is + * second, etc. + * 2] We iterate through the entries in the buffer and we apply + * the callback to each one. As we move from entry to entry + * we decrease the size of the space map, effectively deleting + * each entry. + * 3] If there are no more entries in the space map or the callback + * returns a value other than 0, we stop iterating over the + * space map. If there are entries remaining and the callback + * returned 0, we go back to step [1]. */ - uint64_t offset = 0, size = 0; - while (len > 0 && error == 0) { - size = MIN(bufsize, len); - - VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); - VERIFY3U(size, >, 0); - ASSERT3U(sm->sm_blksz, !=, 0); - - offset = len - size; - - IMPLY(bufsize > len, offset == 0); - IMPLY(bufsize == len, offset == 0); - IMPLY(bufsize < len, offset > 0); - - - EQUIV(size == len, offset == 0); - IMPLY(size < len, bufsize < len); - - dprintf("object=%llu offset=%llx size=%llx\n", - space_map_object(sm), offset, size); - - error = dmu_read(sm->sm_os, space_map_object(sm), - offset, size, entry_map, DMU_READ_PREFETCH); + int error = 0; + while (space_map_length(sm) > 0 && error == 0) { + uint64_t nwords = 0; + error = space_map_reversed_last_block_entries(sm, buf, bufsz, + &nwords); if (error != 0) break; - uint64_t num_entries = size / sizeof (uint64_t); - - ASSERT3U(num_entries, >, 0); - - while (num_entries > 0) { - uint64_t e, entry_offset, entry_size; - maptype_t type; - - e = entry_map[num_entries - 1]; + ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t)); - ASSERT3U(num_entries, >, 0); - ASSERT0(error); + for (uint64_t i = 0; i < nwords; i++) { + uint64_t e = buf[i]; - if (SM_DEBUG_DECODE(e)) { + if (sm_entry_is_debug(e)) { sm->sm_phys->smp_objsize -= sizeof (uint64_t); space_map_update(sm); - len -= sizeof (uint64_t); - num_entries--; continue; } - type = SM_TYPE_DECODE(e); - entry_offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) + - sm->sm_start; - entry_size = SM_RUN_DECODE(e) << sm->sm_shift; + int words = 1; + uint64_t raw_offset, raw_run, vdev_id; + maptype_t type; + if (sm_entry_is_single_word(e)) { + type = SM_TYPE_DECODE(e); + vdev_id = SM_NO_VDEVID; + raw_offset = SM_OFFSET_DECODE(e); + raw_run = SM_RUN_DECODE(e); + } else { + ASSERT(sm_entry_is_double_word(e)); + words = 2; + + raw_run = SM2_RUN_DECODE(e); + vdev_id = SM2_VDEV_DECODE(e); + + /* move to the second word */ + i++; + e = buf[i]; + + ASSERT3P(i, <=, nwords); + + type = SM2_TYPE_DECODE(e); + raw_offset = SM2_OFFSET_DECODE(e); + } + + uint64_t entry_offset = + (raw_offset << sm->sm_shift) + sm->sm_start; + uint64_t entry_run = raw_run << sm->sm_shift;
VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); - VERIFY0(P2PHASE(entry_size, 1ULL << sm->sm_shift)); + VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); VERIFY3U(entry_offset, >=, sm->sm_start); - VERIFY3U(entry_offset + entry_size, <=, + VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size); + VERIFY3U(entry_run, <=, sm->sm_size); + VERIFY3U(entry_offset + entry_run, <=, sm->sm_start + sm->sm_size); - error = callback(type, entry_offset, entry_size, arg); + space_map_entry_t sme = { + .sme_type = type, + .sme_vdev = vdev_id, + .sme_offset = entry_offset, + .sme_run = entry_run + }; + error = callback(&sme, arg); if (error != 0) break; if (type == SM_ALLOC) - sm->sm_phys->smp_alloc -= entry_size; + sm->sm_phys->smp_alloc -= entry_run; else - sm->sm_phys->smp_alloc += entry_size; - - sm->sm_phys->smp_objsize -= sizeof (uint64_t); + sm->sm_phys->smp_alloc += entry_run; + sm->sm_phys->smp_objsize -= words * sizeof (uint64_t); space_map_update(sm); - len -= sizeof (uint64_t); - num_entries--; } - IMPLY(error == 0, num_entries == 0); - EQUIV(offset == 0 && error == 0, len == 0 && num_entries == 0); } - if (len == 0) { + if (space_map_length(sm) == 0) { ASSERT0(error); - ASSERT0(offset); - ASSERT0(sm->sm_length); ASSERT0(sm->sm_phys->smp_objsize); ASSERT0(sm->sm_alloc); } - zio_buf_free(entry_map, bufsize); + zio_buf_free(buf, bufsz); return (error); } @@ -246,16 +381,15 @@ typedef struct space_map_load_arg { } space_map_load_arg_t; static int -space_map_load_callback(maptype_t type, uint64_t offset, uint64_t size, - void *arg) +space_map_load_callback(space_map_entry_t *sme, void *arg) { space_map_load_arg_t *smla = arg; - if (type == smla->smla_type) { - VERIFY3U(range_tree_space(smla->smla_rt) + size, <=, + if (sme->sme_type == smla->smla_type) { + VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=, smla->smla_sm->sm_size); - range_tree_add(smla->smla_rt, offset, size); + range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run); } else { - range_tree_remove(smla->smla_rt, offset, size); + range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run); } return (0); @@ -367,43 +501,239 @@ space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) } } -uint64_t -space_map_entries(space_map_t *sm, range_tree_t *rt) +static void +space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) { - avl_tree_t *t = &rt->rt_root; - range_seg_t *rs; - uint64_t size, entries; + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | + SM_DEBUG_ACTION_ENCODE(maptype) | + SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) | + SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); + + dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize, + sizeof (dentry), &dentry, tx); + + sm->sm_phys->smp_objsize += sizeof (dentry); +} + +/* + * Writes one or more entries given a segment. + * + * Note: The function may release the dbuf from the pointer initially + * passed to it, and return a different dbuf. Also, the space map's + * dbuf must be dirty for the changes in sm_phys to take effect. + */ +static void +space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, + uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx) +{ + ASSERT3U(words, !=, 0); + ASSERT3U(words, <=, 2); + + /* ensure the vdev_id can be represented by the space map */ + ASSERT3U(vdev_id, <=, SM_NO_VDEVID); + + /* + * if this is a single word entry, ensure that no vdev was + * specified. 
+ */ + IMPLY(words == 1, vdev_id == SM_NO_VDEVID); + + dmu_buf_t *db = *dbp; + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + uint64_t *block_base = db->db_data; + uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t)); + uint64_t *block_cursor = block_base + + (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + + ASSERT3P(block_cursor, <=, block_end); + + uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; + uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX; + + ASSERT3U(rs->rs_start, >=, sm->sm_start); + ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size); + ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size); + ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size); + + while (size != 0) { + ASSERT3P(block_cursor, <=, block_end); + + /* + * If we are at the end of this block, flush it and start + * writing again from the beginning. + */ + if (block_cursor == block_end) { + dmu_buf_rele(db, tag); + + uint64_t next_word_offset = sm->sm_phys->smp_objsize; + VERIFY0(dmu_buf_hold(sm->sm_os, + space_map_object(sm), next_word_offset, + tag, &db, DMU_READ_PREFETCH)); + dmu_buf_will_dirty(db, tx); + + /* update caller's dbuf */ + *dbp = db; + + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + block_base = db->db_data; + block_cursor = block_base; + block_end = block_base + + (db->db_size / sizeof (uint64_t)); + } + + /* + * If we are writing a two-word entry and we only have one + * word left on this block, just pad it with an empty debug + * entry and write the two-word entry in the next block. + */ + uint64_t *next_entry = block_cursor + 1; + if (next_entry == block_end && words > 1) { + ASSERT3U(words, ==, 2); + *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | + SM_DEBUG_ACTION_ENCODE(0) | + SM_DEBUG_SYNCPASS_ENCODE(0) | + SM_DEBUG_TXG_ENCODE(0); + block_cursor++; + sm->sm_phys->smp_objsize += sizeof (uint64_t); + ASSERT3P(block_cursor, ==, block_end); + continue; + } + + uint64_t run_len = MIN(size, run_max); + switch (words) { + case 1: + *block_cursor = SM_OFFSET_ENCODE(start) | + SM_TYPE_ENCODE(maptype) | + SM_RUN_ENCODE(run_len); + block_cursor++; + break; + case 2: + /* write the first word of the entry */ + *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) | + SM2_RUN_ENCODE(run_len) | + SM2_VDEV_ENCODE(vdev_id); + block_cursor++; + + /* move on to the second word of the entry */ + ASSERT3P(block_cursor, <, block_end); + *block_cursor = SM2_TYPE_ENCODE(maptype) | + SM2_OFFSET_ENCODE(start); + block_cursor++; + break; + default: + panic("%d-word space map entries are not supported", + words); + break; + } + sm->sm_phys->smp_objsize += words * sizeof (uint64_t); + + start += run_len; + size -= run_len; + } + ASSERT0(size); + +} + +/* + * Note: The space map's dbuf must be dirty for the changes in sm_phys to + * take effect. + */ +static void +space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t vdev_id, dmu_tx_t *tx) +{ + spa_t *spa = tx->tx_pool->dp_spa; + dmu_buf_t *db; + + space_map_write_intro_debug(sm, maptype, tx); +#ifdef DEBUG /* - * All space_maps always have a debug entry so account for it here. + * We do this right after we write the intro debug entry + * because the estimate does not take it into account. 
*/ - entries = 1; + uint64_t initial_objsize = sm->sm_phys->smp_objsize; + uint64_t estimated_growth = + space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID); + uint64_t estimated_final_objsize = initial_objsize + estimated_growth; +#endif /* - * Traverse the range tree and calculate the number of space map - * entries that would be required to write out the range tree. + * Find the offset right after the last word in the space map + * and use that to get a hold of the last block, so we can + * start appending to it. */ - for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { - size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - entries += howmany(size, SM_RUN_MAX); + uint64_t next_word_offset = sm->sm_phys->smp_objsize; + VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm), + next_word_offset, FTAG, &db, DMU_READ_PREFETCH)); + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + dmu_buf_will_dirty(db, tx); + + avl_tree_t *t = &rt->rt_root; + for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { + uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift; + uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + uint8_t words = 1; + + /* + * We only write two-word entries when both of the following + * are true: + * + * [1] The feature is enabled. + * [2] The offset or run is too big for a single-word entry, + * or the vdev_id is set (meaning not equal to + * SM_NO_VDEVID). + * + * Note that for purposes of testing we've added the case that + * we write two-word entries occasionally when the feature is + * enabled and zfs_force_some_double_word_sm_entries has been + * set. + */ + if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) && + (offset >= (1ULL << SM_OFFSET_BITS) || + length > SM_RUN_MAX || + vdev_id != SM_NO_VDEVID || + (zfs_force_some_double_word_sm_entries && + spa_get_random(100) == 0))) + words = 2; + + space_map_write_seg(sm, rs, maptype, vdev_id, words, + &db, FTAG, tx); } - return (entries); + + dmu_buf_rele(db, FTAG); + +#ifdef DEBUG + /* + * We expect our estimation to be based on the worst case + * scenario [see comment in space_map_estimate_optimal_size()]. + * Therefore we expect the actual objsize to be equal or less + * than whatever we estimated it to be. + */ + ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize); +#endif } +/* + * Note: This function manipulates the state of the given space map but + * does not hold any locks implicitly. Thus the caller is responsible + * for synchronizing writes to the space map. 
+ */ void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - dmu_tx_t *tx) + uint64_t vdev_id, dmu_tx_t *tx) { objset_t *os = sm->sm_os; - spa_t *spa = dmu_objset_spa(os); - avl_tree_t *t = &rt->rt_root; - range_seg_t *rs; - uint64_t size, total, rt_space, nodes; - uint64_t *entry, *entry_map, *entry_map_end; - uint64_t expected_entries, actual_entries = 1; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); VERIFY3U(space_map_object(sm), !=, 0); + dmu_buf_will_dirty(sm->sm_dbuf, tx); /* @@ -423,58 +753,10 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, else sm->sm_phys->smp_alloc -= range_tree_space(rt); - expected_entries = space_map_entries(sm, rt); - - entry_map = zio_buf_alloc(sm->sm_blksz); - entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t)); - entry = entry_map; - - *entry++ = SM_DEBUG_ENCODE(1) | - SM_DEBUG_ACTION_ENCODE(maptype) | - SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) | - SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); - - total = 0; - nodes = avl_numnodes(&rt->rt_root); - rt_space = range_tree_space(rt); - for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { - uint64_t start; + uint64_t nodes = avl_numnodes(&rt->rt_root); + uint64_t rt_space = range_tree_space(rt); - size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; - - total += size << sm->sm_shift; - - while (size != 0) { - uint64_t run_len; - - run_len = MIN(size, SM_RUN_MAX); - - if (entry == entry_map_end) { - dmu_write(os, space_map_object(sm), - sm->sm_phys->smp_objsize, sm->sm_blksz, - entry_map, tx); - sm->sm_phys->smp_objsize += sm->sm_blksz; - entry = entry_map; - } - - *entry++ = SM_OFFSET_ENCODE(start) | - SM_TYPE_ENCODE(maptype) | - SM_RUN_ENCODE(run_len); - - start += run_len; - size -= run_len; - actual_entries++; - } - } - - if (entry != entry_map) { - size = (entry - entry_map) * sizeof (uint64_t); - dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize, - size, entry_map, tx); - sm->sm_phys->smp_objsize += size; - } - ASSERT3U(expected_entries, ==, actual_entries); + space_map_write_impl(sm, rt, maptype, vdev_id, tx); /* * Ensure that the space_map's accounting wasn't changed @@ -482,9 +764,6 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, */ VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root)); VERIFY3U(range_tree_space(rt), ==, rt_space); - VERIFY3U(range_tree_space(rt), ==, total); - - zio_buf_free(entry_map, sm->sm_blksz); } static int @@ -526,7 +805,6 @@ space_map_open(space_map_t **smp, objset_t *os, uint64_t object, space_map_close(sm); return (error); } - *smp = sm; return (0); @@ -569,7 +847,8 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) */ if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && doi.doi_bonus_size != sizeof (space_map_phys_t)) || - doi.doi_data_block_size != blocksize) { + doi.doi_data_block_size != blocksize || + doi.doi_metadata_block_size != 1 << space_map_ibs) { zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating " "object[%llu]: old bonus %u, old blocksz %u", dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object, @@ -625,8 +904,8 @@ space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) bonuslen = SPACE_MAP_SIZE_V0; } - object = dmu_object_alloc(os, DMU_OT_SPACE_MAP, blocksize, - DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); + object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize, + space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); return (object); } @@ -658,6 +937,133 @@ 
space_map_free(space_map_t *sm, dmu_tx_t *tx) sm->sm_object = 0; } +/* + * Given a range tree, it makes a worst-case estimate of how much + * space the tree's segments would take if they were written to + * the given space map. + */ +uint64_t +space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, + uint64_t vdev_id) +{ + spa_t *spa = dmu_objset_spa(sm->sm_os); + uint64_t shift = sm->sm_shift; + uint64_t *histogram = rt->rt_histogram; + uint64_t entries_for_seg = 0; + + /* + * In order to get a quick estimate of the optimal size that this + * range tree would have on-disk as a space map, we iterate through + * its histogram buckets instead of iterating through its nodes. + * + * Note that this is a highest-bound/worst-case estimate for the + * following reasons: + * + * 1] We assume that we always add a debug padding for each block + * we write and we also assume that we start at the last word + * of a block attempting to write a two-word entry. + * 2] Rounding up errors due to the way segments are distributed + * in the buckets of the range tree's histogram. + * 3] The activation of zfs_force_some_double_word_sm_entries + * (tunable) when testing. + * + * = Math and Rounding Errors = + * + * rt_histogram[i] bucket of a range tree represents the number + * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given + * that, we want to divide the buckets into groups: Buckets that + * can be represented using a single-word entry, ones that can + * be represented with a double-word entry, and ones that can + * only be represented with multiple two-word entries. + * + * [Note that if the new encoding feature is not enabled there + * are only two groups: single-word entry buckets and multiple + * single-word entry buckets. The information below assumes + * two-word entries are enabled, but it can easily be applied when + * the feature is not enabled] + * + * To find the highest bucket that can be represented with a + * single-word entry we look at the maximum run that such entry + * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that + * the run of a space map entry is shifted by sm_shift, thus we + * add it to the exponent]. This way, excluding the value of the + * maximum run that can be represented by a single-word entry, + * all runs that are smaller exist in buckets 0 to + * SM_RUN_BITS + shift - 1. + * + * To find the highest bucket that can be represented with a + * double-word entry, we follow the same approach. Finally, any + * bucket higher than that is represented with multiple two-word + * entries. To be more specific, if the highest bucket whose + * segments can be represented with a single two-word entry is X, + * then bucket X+1 will need 2 two-word entries for each of its + * segments, X+2 will need 4, X+3 will need 8, ...etc. + * + * With all of the above we make our estimation based on bucket + * groups. There is a rounding error though. As we mentioned in + * the example with the one-word entry, the maximum run that can + * be represented in a one-word entry, 2^(SM_RUN_BITS + shift), is + * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of + * that length fall into the next bucket (and bucket group) where + * we start counting two-word entries and this is one more reason + * why the estimated size may end up being bigger than the actual + * size written.
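As a worked example of the bucket groups (assuming sm_shift = 9 and the SM_RUN_BITS = 15 / SM2_RUN_BITS = 36 constants from the space_map.h hunk later in this diff): a single-word entry can describe runs of up to 2^(15+9) bytes, so buckets 0 through 15 + 9 - 1 = 23 cost one word per segment; a two-word entry can describe runs of up to 2^(36+9) bytes, so buckets 24 through 36 + 9 - 1 = 44 cost two words per segment; and a segment in, say, bucket 46 is charged 2^(46-44) = 4 two-word entries, i.e. 8 words.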
+ */ + uint64_t size = 0; + uint64_t idx = 0; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) || + (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) { + + /* + * If we are trying to force some double word entries just + * assume the worst-case of every single word entry being + * written as a double word entry. + */ + uint64_t entry_size = + (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) && + zfs_force_some_double_word_sm_entries) ? + (2 * sizeof (uint64_t)) : sizeof (uint64_t); + + uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1; + for (; idx <= single_entry_max_bucket; idx++) + size += histogram[idx] * entry_size; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) { + for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + ASSERT3U(idx, >=, single_entry_max_bucket); + entries_for_seg = + 1ULL << (idx - single_entry_max_bucket); + size += histogram[idx] * + entries_for_seg * entry_size; + } + return (size); + } + } + + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)); + + uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1; + for (; idx <= double_entry_max_bucket; idx++) + size += histogram[idx] * 2 * sizeof (uint64_t); + + for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + ASSERT3U(idx, >=, double_entry_max_bucket); + entries_for_seg = 1ULL << (idx - double_entry_max_bucket); + size += histogram[idx] * + entries_for_seg * 2 * sizeof (uint64_t); + } + + /* + * Assume the worst case where we start with the padding at the end + * of the current block and we add an extra padding entry at the end + * of all subsequent blocks. + */ + size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t); + + return (size); +} + uint64_t space_map_object(space_map_t *sm) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c index a866e65d54f7..aa289ba1061d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_reftree.c @@ -54,20 +54,14 @@ static int space_reftree_compare(const void *x1, const void *x2) { - const space_ref_t *sr1 = x1; - const space_ref_t *sr2 = x2; + const space_ref_t *sr1 = (const space_ref_t *)x1; + const space_ref_t *sr2 = (const space_ref_t *)x2; - if (sr1->sr_offset < sr2->sr_offset) - return (-1); - if (sr1->sr_offset > sr2->sr_offset) - return (1); + int cmp = AVL_CMP(sr1->sr_offset, sr2->sr_offset); + if (likely(cmp)) + return (cmp); - if (sr1 < sr2) - return (-1); - if (sr1 > sr2) - return (1); - - return (0); + return (AVL_PCMP(sr1, sr2)); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h index cb1d4354579e..9b8e73b596fe 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h @@ -194,7 +194,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp); void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); -int arc_tempreserve_space(uint64_t reserve, uint64_t txg); +int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); uint64_t arc_max_bytes(void); void arc_init(void); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index 69617b3dca9c..ec966432f2ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ 
b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -83,6 +83,13 @@ typedef enum dbuf_states { DB_EVICTING } dbuf_states_t; +typedef enum dbuf_cached_state { + DB_NO_CACHE = -1, + DB_DBUF_CACHE, + DB_DBUF_METADATA_CACHE, + DB_CACHE_MAX +} dbuf_cached_state_t; + struct dnode; struct dmu_tx; @@ -229,11 +236,12 @@ typedef struct dmu_buf_impl { */ avl_node_t db_link; - /* - * Link in dbuf_cache. - */ + /* Link in dbuf_cache or dbuf_metadata_cache */ multilist_node_t db_cache_link; + /* Tells us which dbuf cache this dbuf is in, if any */ + dbuf_cached_state_t db_caching_status; + /* Data which is unique to data (leaf) blocks: */ /* User callback information. */ @@ -295,7 +303,7 @@ boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, uint64_t dbuf_refcount(dmu_buf_impl_t *db); void dbuf_rele(dmu_buf_impl_t *db, void *tag); -void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag); +void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting); dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, uint64_t blkid); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h index 0f7916e7d189..9bba698828fd 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h @@ -109,7 +109,8 @@ typedef enum dmu_object_byteswap { /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data - * (dmu_object_byteswap_t). + * (dmu_object_byteswap_t). All of the types created by this method + * are cached in the dbuf metadata cache. */ #define DMU_OT(byteswap, metadata) \ (DMU_OT_NEWTYPE | \ @@ -124,6 +125,9 @@ typedef enum dmu_object_byteswap { ((ot) & DMU_OT_METADATA) : \ dmu_ot[(ot)].ot_metadata) +#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) + /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. 
use a BP_IS_EMBEDDED() bp), because bp_fill @@ -352,6 +356,9 @@ typedef struct dmu_buf { */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); +uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, @@ -512,6 +519,9 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db); int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); +int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, + boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, + uint32_t flags); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); typedef void dmu_buf_evict_func_t(void *user_ptr); @@ -750,10 +760,13 @@ void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); +int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, dmu_tx_t *tx); +int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size, + dmu_tx_t *tx); #ifdef _KERNEL #ifdef illumos int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, @@ -767,6 +780,8 @@ int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, #endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); +void dmu_assign_arcbuf_dnode(dnode_t *handle, uint64_t offset, + struct arc_buf *buf, dmu_tx_t *tx); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_xuio_init(struct xuio *uio, int niov); @@ -810,6 +825,7 @@ typedef void arc_byteswap_func_t(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; + boolean_t ot_dbuf_metadata_cache; char *ot_name; } dmu_object_type_info_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h index 59e87aab8081..25ff8642177d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h @@ -39,6 +39,7 @@ #include <sys/zio.h> #include <sys/zil.h> #include <sys/sa.h> +#include <sys/zfs_ioctl.h> #ifdef __cplusplus extern "C" { @@ -69,6 +70,7 @@ typedef struct objset_phys { dnode_phys_t os_groupused_dnode; } objset_phys_t; +#define OBJSET_PROP_UNINITIALIZED ((uint64_t)-1) struct objset { /* Immutable: */ struct dsl_dataset *os_dsl_dataset; @@ -100,6 +102,16 @@ struct objset { zfs_sync_type_t os_sync; zfs_redundant_metadata_type_t os_redundant_metadata; int os_recordsize; + /* + * The next four values are used as a cache of whatever's on disk, and + * are initialized the first time these properties are queried. 
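A hypothetical sketch of the lazy-fill pattern this sentinel enables (objset_cacheable and lookup_on_disk are illustrative names, not the real objset/dsl_prop interface):

#include <stdint.h>

#define OBJSET_PROP_UNINITIALIZED	((uint64_t)-1)

struct objset_cacheable {
	uint64_t os_version;	/* cached copy of the on-disk property */
};

/*
 * The cached field starts life as the sentinel, so the (possibly
 * expensive) on-disk lookup runs at most once per objset.
 */
static uint64_t
objset_version(struct objset_cacheable *os, uint64_t (*lookup_on_disk)(void))
{
	if (os->os_version == OBJSET_PROP_UNINITIALIZED)
		os->os_version = lookup_on_disk();
	return (os->os_version);
}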
Before + * being initialized with their real values, their values are + * OBJSET_PROP_UNINITIALIZED. + */ + uint64_t os_version; + uint64_t os_normalization; + uint64_t os_utf8only; + uint64_t os_casesensitivity; /* * Pointer is constant; the blkptr it points to is protected by diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h index 69a834d877ee..1f4b1f2cde9f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h @@ -70,6 +70,7 @@ typedef struct dmu_recv_cookie { boolean_t drc_byteswap; boolean_t drc_force; boolean_t drc_resumable; + boolean_t drc_clone; struct avl_tree *drc_guid_to_ds_map; zio_cksum_t drc_cksum; uint64_t drc_newsnapobj; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h index 5566c70add13..89a7b2ef60e4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -291,7 +291,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, void *ref, dnode_t **dnp); boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); -void dnode_rele_and_unlock(dnode_t *dn, void *tag); +void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h index 8de77532ee75..0509e95b1587 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
*/ #ifndef _SYS_METASLAB_H @@ -65,9 +65,10 @@ uint64_t metaslab_block_maxsize(metaslab_t *); #define METASLAB_DONT_THROTTLE 0x10 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, - blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *); + blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, + int); int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t, - dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *); + dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int); void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t); void metaslab_free_dva(spa_t *, const dva_t *, boolean_t); @@ -88,9 +89,9 @@ int metaslab_class_validate(metaslab_class_t *); void metaslab_class_histogram_verify(metaslab_class_t *); uint64_t metaslab_class_fragmentation(metaslab_class_t *); uint64_t metaslab_class_expandable_space(metaslab_class_t *); -boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, +boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int, zio_t *, int); -void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *); +void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *); void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t, int64_t, int64_t); @@ -100,7 +101,7 @@ uint64_t metaslab_class_get_dspace(metaslab_class_t *); uint64_t metaslab_class_get_deferred(metaslab_class_t *); uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc); -metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *); +metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int); void metaslab_group_destroy(metaslab_group_t *); void metaslab_group_activate(metaslab_group_t *); void metaslab_group_passivate(metaslab_group_t *); @@ -109,8 +110,9 @@ uint64_t metaslab_group_get_space(metaslab_group_t *); void metaslab_group_histogram_verify(metaslab_group_t *); uint64_t metaslab_group_fragmentation(metaslab_group_t *); void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); -void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int); -void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *); +void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int, + boolean_t); +void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h index 939bcb30528b..5eb59df37e51 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
*/ #ifndef _SYS_METASLAB_IMPL_H @@ -52,6 +52,7 @@ typedef struct metaslab_alloc_trace { uint64_t mat_weight; uint32_t mat_dva_id; uint64_t mat_offset; + int mat_allocator; } metaslab_alloc_trace_t; /* @@ -67,14 +68,17 @@ typedef enum trace_alloc_type { TRACE_GROUP_FAILURE = -5ULL, TRACE_ENOSPC = -6ULL, TRACE_CONDENSING = -7ULL, - TRACE_VDEV_ERROR = -8ULL + TRACE_VDEV_ERROR = -8ULL, + TRACE_INITIALIZING = -9ULL } trace_alloc_type_t; #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) -#define METASLAB_WEIGHT_TYPE (1ULL << 61) +#define METASLAB_WEIGHT_CLAIM (1ULL << 61) +#define METASLAB_WEIGHT_TYPE (1ULL << 60) #define METASLAB_ACTIVE_MASK \ - (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) + (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ + METASLAB_WEIGHT_CLAIM) /* * The metaslab weight is used to encode the amount of free space in a @@ -97,37 +101,39 @@ typedef enum trace_alloc_type { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PS1| weighted-free space | + * |PSC1| weighted-free space | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation + * C - indicates activation for claimed block zio * space - the fragmentation-weighted space * * Segment-based weight: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * |PS0| idx| count of segments in region | + * |PSC0| idx| count of segments in region | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * PS - indicates primary and secondary activation + * C - indicates activation for claimed block zio * idx - index for the highest bucket in the histogram * count - number of segments in the specified bucket */ -#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2) -#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x) +#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3) +#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x) #define WEIGHT_IS_SPACEBASED(weight) \ - ((weight) == 0 || BF64_GET((weight), 61, 1)) -#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1) + ((weight) == 0 || BF64_GET((weight), 60, 1)) +#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1) /* * These macros are only applicable to segment-based weighting. */ -#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6) -#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x) -#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55) -#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x) +#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6) +#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x) +#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) +#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) /* * A metaslab class encompasses a category of allocatable top-level vdevs. @@ -178,8 +184,8 @@ struct metaslab_class { * allowed to reserve slots even if we've reached the maximum * number of allocations allowed. 
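*/

The widened activation field described above can be checked with a small self-contained program; the shift/mask is written out inline instead of using the real BF64 helpers from spa.h, and the bit positions follow the new METASLAB_WEIGHT_* definitions:

#include <stdint.h>

#define METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define METASLAB_WEIGHT_CLAIM		(1ULL << 61)

int
main(void)
{
	uint64_t weight = 0;

	/* activate the metaslab for a claim zio: the new C bit */
	weight |= METASLAB_WEIGHT_CLAIM;

	/* WEIGHT_GET_ACTIVE now reads 3 bits at position 61 (was 2 at 62) */
	uint64_t active = (weight >> 61) & 7;

	return (active == 1 ? 0 : 1);	/* claim-only activation */
}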
- uint64_t mc_alloc_max_slots; - refcount_t mc_alloc_slots; + uint64_t *mc_alloc_max_slots; + refcount_t *mc_alloc_slots; uint64_t mc_alloc_groups; /* # of allocatable groups */ @@ -202,9 +208,12 @@ */ struct metaslab_group { kmutex_t mg_lock; + metaslab_t **mg_primaries; + metaslab_t **mg_secondaries; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; boolean_t mg_allocatable; /* can we allocate? */ + uint64_t mg_ms_ready; /* * A metaslab group is considered to be initialized only after @@ -224,15 +233,33 @@ metaslab_group_t *mg_next; /* - * Each metaslab group can handle mg_max_alloc_queue_depth allocations - * which are tracked by mg_alloc_queue_depth. It's possible for a - * metaslab group to handle more allocations than its max. This - * can occur when gang blocks are required or when other groups - * are unable to handle their share of allocations. + * In order for the allocation throttle to function properly, we cannot + * have too many IOs going to each disk by default; the throttle + * operates by allocating more work to disks that finish quickly, so + * allocating larger chunks to each disk reduces its effectiveness. + * However, if the number of IOs going to each allocator is too small, + * we will not perform proper aggregation at the vdev_queue layer, + * also resulting in decreased performance. Therefore, we will use a + * ramp-up strategy. + * + * Each allocator in each metaslab group has a current queue depth + * (mg_alloc_queue_depth[allocator]) and a current max queue depth + * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group + * has an absolute max queue depth (mg_max_alloc_queue_depth). We + * add IOs to an allocator until the mg_alloc_queue_depth for that + * allocator hits the cur_max. Every time an IO completes for a given + * allocator on a given metaslab group, we increment its cur_max until + * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to + * help protect against disks that decrease in performance over time. + * + * It's possible for an allocator to handle more allocations than + * its max. This can occur when gang blocks are required or when other + * groups are unable to handle their share of allocations. */ uint64_t mg_max_alloc_queue_depth; - refcount_t mg_alloc_queue_depth; - + uint64_t *mg_cur_max_alloc_queue_depth; + refcount_t *mg_alloc_queue_depth; + int mg_allocators; /* * A metaslab group that can no longer allocate the minimum block * size will set mg_no_free_space. Once a metaslab group is out @@ -245,6 +272,11 @@ uint64_t mg_failed_allocations; uint64_t mg_fragmentation; uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + + int mg_ms_initializing; + boolean_t mg_initialize_updating; + kmutex_t mg_ms_initialize_lock; + kcondvar_t mg_ms_initialize_cv; }; /* @@ -335,6 +367,8 @@ struct metaslab { boolean_t ms_condense_wanted; uint64_t ms_condense_checked_txg; + uint64_t ms_initializing; /* leaves initializing this ms */ + /* * We must hold both ms_lock and ms_group->mg_lock in order to * modify ms_loaded. @@ -357,6 +391,13 @@ uint64_t ms_max_size; /* maximum allocatable size */ /* + * -1 if it's not active in an allocator, otherwise set to the allocator + * this metaslab is active for. + */ + int ms_allocator; + boolean_t ms_primary; /* Only valid if ms_allocator is not -1 */ + + /* + * The metaslab block allocators can optionally use a size-ordered + * range tree and/or an array of LBAs.
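Returning to the per-allocator queue-depth ramp described in the metaslab_group comment above, the following is a minimal userland model of the mechanism; the field names are abbreviations of the real ones, and the refcount_t bookkeeping and mg_lock are deliberately omitted:

#include <stdint.h>

typedef struct mg_allocator {
	uint64_t	qd;		/* outstanding IOs (mg_alloc_queue_depth) */
	uint64_t	cur_max;	/* ramping limit (mg_cur_max_...) */
} mg_allocator_t;

static int
mg_can_queue(const mg_allocator_t *a)
{
	return (a->qd < a->cur_max);
}

static void
mg_io_start(mg_allocator_t *a)
{
	a->qd++;
}

static void
mg_io_done(mg_allocator_t *a, uint64_t mg_max)
{
	a->qd--;
	if (a->cur_max < mg_max)	/* ramp up on every completion */
		a->cur_max++;
}

/* at txg open: reset the ramp to guard against disks that slow down */
static void
mg_txg_reset(mg_allocator_t *a, uint64_t initial)
{
	a->cur_max = initial;
}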
Not all allocators use * this functionality. The ms_allocatable_by_size should always @@ -370,6 +411,8 @@ struct metaslab { metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ + + boolean_t ms_new; }; #ifdef __cplusplus diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h index 244f35a3d6f3..feac5ae5fbf2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h @@ -95,6 +95,9 @@ boolean_t range_tree_is_empty(range_tree_t *rt); void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); void range_tree_stat_verify(range_tree_t *rt); +uint64_t range_tree_min(range_tree_t *rt); +uint64_t range_tree_max(range_tree_t *rt); +uint64_t range_tree_span(range_tree_t *rt); void range_tree_add(void *arg, uint64_t start, uint64_t size); void range_tree_remove(void *arg, uint64_t start, uint64_t size); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index fc4f90740efc..0b220362379e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -154,6 +154,7 @@ _NOTE(CONSTCOND) } while (0) #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ #define SPA_COMPRESSBITS 7 +#define SPA_VDEVBITS 24 /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). @@ -184,15 +185,15 @@ typedef struct zio_cksum_salt { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | vdev1 | GRID | ASIZE | + * 0 | pad | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | vdev2 | GRID | ASIZE | + * 2 | pad | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 4 | vdev3 | GRID | ASIZE | + * 4 | pad | vdev3 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -371,8 +372,9 @@ typedef struct blkptr { #define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) #define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) -#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) -#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) +#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) +#define DVA_SET_VDEV(dva, x) \ + BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) #define DVA_GET_OFFSET(dva) \ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) @@ -668,6 +670,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_AUTOEXPAND 0x20 #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 +#define SPA_ASYNC_INITIALIZE_RESTART 0x100 /* * Controls the behavior of spa_vdev_remove(). 
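The narrower DVA vdev field introduced by SPA_VDEVBITS can be illustrated with local stand-ins for the BF64 bitfield helpers (these mirror, but are not, the spa.h definitions):

#include <stdint.h>

#define BF64_GET(x, low, len)	(((x) >> (low)) & ((1ULL << (len)) - 1))
#define BF64_SET(x, low, len, val) \
	((x) = ((x) & ~(((1ULL << (len)) - 1) << (low))) | \
	    ((uint64_t)(val) << (low)))

#define SPA_VDEVBITS	24

typedef struct dva {
	uint64_t dva_word[2];
} dva_t;

/* vdev id now occupies bits 32-55 of word 0; bits 56-63 are pad */
#define DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
#define DVA_SET_VDEV(dva, x)	BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)

int
main(void)
{
	dva_t d = { { 0, 0 } };

	DVA_SET_VDEV(&d, 42);
	return (DVA_GET_VDEV(&d) == 42 ? 0 : 1);
}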
@@ -683,6 +686,7 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern boolean_t spa_vdev_remove_active(spa_t *spa); +extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, @@ -829,6 +833,7 @@ extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); +extern uint64_t spa_dirty_data(spa_t *spa); /* Miscellaneous support routines */ extern void spa_load_failed(spa_t *spa, const char *fmt, ...); @@ -945,13 +950,6 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_bp(bp, fmt, ...) #endif -extern boolean_t spa_debug_enabled(spa_t *spa); -#define spa_dbgmsg(spa, ...) \ -{ \ - if (spa_debug_enabled(spa)) \ - zfs_dbgmsg(__VA_ARGS__); \ -} - extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ #ifdef __cplusplus diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index 292fa5e96ac1..f69dde66dd9b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -238,8 +238,16 @@ struct spa { uint64_t spa_last_synced_guid; /* last synced guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ - kmutex_t spa_alloc_lock; - avl_tree_t spa_alloc_tree; + /* + * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are + * stored in spa_alloc_count. There is one tree and one lock for each + * allocator, to help improve allocation performance in write-heavy + * workloads. + */ + kmutex_t *spa_alloc_locks; + avl_tree_t *spa_alloc_trees; + int spa_alloc_count; + spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ nvlist_t *spa_label_features; /* Features for reading MOS */ @@ -324,7 +332,6 @@ struct spa { kcondvar_t spa_suspend_cv; /* notification of resume */ uint8_t spa_suspended; /* pool is suspended */ uint8_t spa_claiming; /* pool is doing zil_claim() */ - boolean_t spa_debug; /* debug enabled? 
*/ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ int spa_mode; /* FREAD | FWRITE */ @@ -353,6 +360,8 @@ struct spa { uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */ + kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */ + nvlist_t *spa_feat_stats; /* Cache of enabled features */ /* cache feature refcounts */ uint64_t spa_feat_refcount_cache[SPA_FEATURES]; #ifdef illumos @@ -381,6 +390,10 @@ struct spa { int spa_queued; } spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE]; #endif + /* arc_memory_throttle() parameters during low memory condition */ + uint64_t spa_lowmem_page_load; /* memory load during txg */ + uint64_t spa_lowmem_last_txg; /* txg window start */ + hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h index 98b87269cb6c..d3d852978a57 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h @@ -93,50 +93,100 @@ typedef struct space_map { /* * debug entry * - * 1 3 10 50 - * ,---+--------+------------+---------------------------------. - * | 1 | action | syncpass | txg (lower bits) | - * `---+--------+------------+---------------------------------' - * 63 62 60 59 50 49 0 + * 2 2 10 50 + * +-----+-----+------------+----------------------------------+ + * | 1 0 | act | syncpass | txg (lower bits) | + * +-----+-----+------------+----------------------------------+ + * 63 62 61 60 59 50 49 0 * * - * non-debug entry + * one-word entry * * 1 47 1 15 - * ,-----------------------------------------------------------. + * +-----------------------------------------------------------+ * | 0 | offset (sm_shift units) | type | run | - * `-----------------------------------------------------------' - * 63 62 17 16 15 0 + * +-----------------------------------------------------------+ + * 63 62 16 15 14 0 + * + * + * two-word entry + * + * 2 2 36 24 + * +-----+-----+---------------------------+-------------------+ + * | 1 1 | pad | run | vdev | + * +-----+-----+---------------------------+-------------------+ + * 63 62 61 60 59 24 23 0 + * + * 1 63 + * +------+----------------------------------------------------+ + * | type | offset | + * +------+----------------------------------------------------+ + * 63 62 0 + * + * Note that a two-word entry will not straddle a block boundary. + * If necessary, the last word of a block will be padded with a + * debug entry (with act = syncpass = txg = 0).
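A sketch of how a two-word entry would be assembled according to this layout; BF64_ENCODE is re-declared locally for illustration, and the authoritative SM2_* macros appear just below:

#include <stdint.h>

#define BF64_ENCODE(x, low, len) \
	(((uint64_t)(x) & ((1ULL << (len)) - 1)) << (low))

/*
 * words[0]: prefix 11 in bits 62-63, pad (zero) in bits 60-61, run in
 * bits 24-59 (stored biased by one), vdev in bits 0-23. words[1]:
 * type in bit 63, offset in bits 0-62. run and offset are in
 * sm_shift units.
 */
static void
sm2_encode(uint64_t run, uint32_t vdev, int is_free, uint64_t offset,
    uint64_t words[2])
{
	words[0] = BF64_ENCODE(3, 62, 2) |
	    BF64_ENCODE(run - 1, 24, 36) |
	    BF64_ENCODE(vdev, 0, 24);
	words[1] = BF64_ENCODE(is_free, 63, 1) |
	    BF64_ENCODE(offset, 0, 63);
}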
*/ -/* All this stuff takes and returns bytes */ -#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1) -#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15) -#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) -#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) -#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47) -#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47) -#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1) -#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1) +typedef enum { + SM_ALLOC, + SM_FREE +} maptype_t; + +typedef struct space_map_entry { + maptype_t sme_type; + uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */ + uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */ + uint64_t sme_run; /* max is 2^36; units of sm_shift */ +} space_map_entry_t; + +#define SM_NO_VDEVID (1 << SPA_VDEVBITS) -#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3) -#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3) +/* one-word entry constants */ +#define SM_DEBUG_PREFIX 2 +#define SM_OFFSET_BITS 47 +#define SM_RUN_BITS 15 +/* two-word entry constants */ +#define SM2_PREFIX 3 +#define SM2_OFFSET_BITS 63 +#define SM2_RUN_BITS 36 + +#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2) +#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2) + +#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2) +#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2) #define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) #define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) - #define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) #define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) -#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) - -typedef enum { - SM_ALLOC, - SM_FREE -} maptype_t; - -typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size, - void *arg); +#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS) +#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS) +#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) +#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) +#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1) +#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS) +#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) +#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL) + +#define SM2_RUN_DECODE(x) (BF64_DECODE(x, SPA_VDEVBITS, SM2_RUN_BITS) + 1) +#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, SPA_VDEVBITS, SM2_RUN_BITS) +#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, SPA_VDEVBITS) +#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, SPA_VDEVBITS) +#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1) +#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1) +#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS) +#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS) +#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL) +#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL) + +boolean_t sm_entry_is_debug(uint64_t e); +boolean_t sm_entry_is_single_word(uint64_t e); +boolean_t sm_entry_is_double_word(uint64_t e); + +typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg); int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg); @@ -154,7 +204,9 @@ uint64_t space_map_allocated(space_map_t *sm); uint64_t space_map_length(space_map_t *sm); void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - dmu_tx_t *tx); + uint64_t vdev_id, dmu_tx_t *tx); +uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, + uint64_t vdev_id); void 
space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx); uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx); void space_map_free(space_map_t *sm, dmu_tx_t *tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h index e583d61eac2f..bf3b269d707d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_TXG_IMPL_H @@ -92,6 +92,7 @@ typedef struct tx_state { kmutex_t tx_sync_lock; /* protects the rest of this struct */ uint64_t tx_open_txg; /* currently open txg id */ + uint64_t tx_quiescing_txg; /* currently quiescing txg id */ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ uint64_t tx_syncing_txg; /* currently syncing txg id */ uint64_t tx_synced_txg; /* last synced txg id */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index 16436b7c022f..4a3af854d465 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_IMPL_H @@ -59,6 +59,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t; struct abd; extern int zfs_vdev_queue_depth_pct; +extern int zfs_vdev_def_queue_depth; extern uint32_t zfs_vdev_async_write_max_active; /* @@ -79,6 +80,12 @@ typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg); typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, vdev_remap_cb_t callback, void *arg); +/* + * Given a target vdev, translates the logical range "in" to the physical + * range "res" + */ +typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in, + range_seg_t *res); typedef struct vdev_ops { vdev_open_func_t *vdev_op_open; @@ -91,6 +98,11 @@ typedef struct vdev_ops { vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; + /* + * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves. + * Used when initializing vdevs. Isn't used by leaf ops. + */ + vdev_xlation_func_t *vdev_op_xlate; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -251,6 +263,24 @@ struct vdev { /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ + + boolean_t vdev_initialize_exit_wanted; + vdev_initializing_state_t vdev_initialize_state; + kthread_t *vdev_initialize_thread; + /* Protects vdev_initialize_thread and vdev_initialize_state. 
*/ + kmutex_t vdev_initialize_lock; + kcondvar_t vdev_initialize_cv; + uint64_t vdev_initialize_offset[TXG_SIZE]; + uint64_t vdev_initialize_last_offset; + range_tree_t *vdev_initialize_tree; /* valid while initializing */ + uint64_t vdev_initialize_bytes_est; + uint64_t vdev_initialize_bytes_done; + time_t vdev_initialize_action_time; /* start and end time */ + + /* for limiting outstanding I/Os */ + kmutex_t vdev_initialize_io_lock; + kcondvar_t vdev_initialize_io_cv; + uint64_t vdev_initialize_inflight; /* * Values stored in the config for an indirect or removing vdev. @@ -470,6 +500,8 @@ extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ +extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, + range_seg_t *out); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h new file mode 100644 index 000000000000..db4b0572cd60 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_initialize.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_INITIALIZE_H +#define _SYS_VDEV_INITIALIZE_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vdev_initialize(vdev_t *vd); +extern void vdev_initialize_stop(vdev_t *vd, + vdev_initializing_state_t tgt_state); +extern void vdev_initialize_stop_all(vdev_t *vd, + vdev_initializing_state_t tgt_state); +extern void vdev_initialize_restart(vdev_t *vd); +extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, + range_seg_t *physical_rs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INITIALIZE_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h index a29ae586102e..3962237afdab 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h @@ -30,7 +30,7 @@ extern "C" { #endif typedef struct spa_vdev_removal { - vdev_t *svr_vdev; + uint64_t svr_vdev_id; uint64_t svr_max_offset_to_sync[TXG_SIZE]; /* Thread performing a vdev removal. 
*/ kthread_t *svr_thread; @@ -86,6 +86,9 @@ extern void spa_vdev_remove_suspend(spa_t *); extern int spa_vdev_remove_cancel(spa_t *); extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr); +extern int vdev_removal_max_span; +extern int zfs_remove_max_segment; + #ifdef __cplusplus } #endif diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h index 3ea0da4a1d33..04606bda48db 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h @@ -146,4 +146,7 @@ extern struct mtx zfs_debug_mtx; #define sys_shutdown rebooting +#define noinline __attribute__((noinline)) +#define likely(x) __builtin_expect((x), 1) + #endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h index b04b24f17f8b..9cbfc26b64e2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_DEBUG_H @@ -57,7 +57,7 @@ extern boolean_t zfs_free_leak_on_eio; #define ZFS_DEBUG_DNODE_VERIFY (1 << 2) #define ZFS_DEBUG_SNAPNAMES (1 << 3) #define ZFS_DEBUG_MODIFY (1 << 4) -#define ZFS_DEBUG_SPA (1 << 5) +/* 1<<5 was previously used, try not to reuse */ #define ZFS_DEBUG_ZIO_FREE (1 << 6) #define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7) #define ZFS_DEBUG_METASLAB_VERIFY (1 << 8) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h index 216b55b6c3ce..80a24b436a01 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
* Copyright 2016 Toomas Soome <tsoome@me.com> */ @@ -217,7 +217,7 @@ enum zio_child { #define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT) #define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL) #define ZIO_CHILD_ALL_BITS \ - (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ + (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT) enum zio_wait_type { @@ -356,7 +356,7 @@ typedef struct zio_transform { struct zio_transform *zt_next; } zio_transform_t; -typedef int zio_pipe_stage_t(zio_t *zio); +typedef zio_t *zio_pipe_stage_t(zio_t *zio); /* * The io_reexecute flags are distinct from io_flags because the child must @@ -489,6 +489,7 @@ struct zio { void *io_waiter; kmutex_t io_lock; kcondvar_t io_cv; + int io_allocator; /* FMA state */ zio_cksum_report_t *io_cksum_report; @@ -550,8 +551,8 @@ extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, uint64_t size, enum zio_flag flags); -extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, - blkptr_t *old_bp, uint64_t size, boolean_t *slog); +extern int zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, + blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t *slog); extern void zio_flush(zio_t *zio, vdev_t *vd); extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size); @@ -586,7 +587,7 @@ extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, zio_done_func_t *done, void *priv); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, - struct abd *data, uint64_t size, int type, zio_priority_t priority, + struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *priv); extern void zio_vdev_io_bypass(zio_t *zio); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h index 4db05ac77598..ebe05a09dc4e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_priority.h @@ -13,7 +13,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. */ #ifndef _ZIO_PRIORITY_H #define _ZIO_PRIORITY_H @@ -30,6 +30,7 @@ typedef enum zio_priority { ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ ZIO_PRIORITY_TRIM, /* free requests used for TRIM */ ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ + ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ ZIO_PRIORITY_NUM_QUEUEABLE, ZIO_PRIORITY_NOW /* non-queued i/os (e.g. 
free) */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c index 64b9c0cb3510..62d215aa4626 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c @@ -450,6 +450,30 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) } } +static boolean_t +txg_is_syncing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_syncing_txg != 0); +} + +static boolean_t +txg_is_quiescing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiescing_txg != 0); +} + +static boolean_t +txg_has_quiesced_to_sync(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiesced_txg != 0); +} + static void txg_sync_thread(void *arg) { @@ -476,7 +500,7 @@ txg_sync_thread(void *arg) while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - tx->tx_quiesced_txg == 0 && + !txg_has_quiesced_to_sync(dp) && dp->dp_dirty_total < zfs_dirty_data_sync) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); @@ -489,7 +513,7 @@ txg_sync_thread(void *arg) * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. */ - while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { + while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; cv_broadcast(&tx->tx_quiesce_more_cv); @@ -504,6 +528,7 @@ txg_sync_thread(void *arg) * us. This may cause the quiescing thread to now be * able to quiesce another txg, so we must signal it. */ + ASSERT(tx->tx_quiesced_txg != 0); txg = tx->tx_quiesced_txg; tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; @@ -552,7 +577,7 @@ txg_quiesce_thread(void *arg) */ while (!tx->tx_exiting && (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || - tx->tx_quiesced_txg != 0)) + txg_has_quiesced_to_sync(dp))) txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); if (tx->tx_exiting) @@ -562,6 +587,8 @@ txg_quiesce_thread(void *arg) dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + tx->tx_quiescing_txg = txg; + mutex_exit(&tx->tx_sync_lock); txg_quiesce(dp, txg); mutex_enter(&tx->tx_sync_lock); @@ -570,6 +597,7 @@ txg_quiesce_thread(void *arg) * Hand this txg off to the sync thread. 
*/ dprintf("quiesce done, handing off txg %llu\n", txg); + tx->tx_quiescing_txg = 0; tx->tx_quiesced_txg = txg; DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_sync_more_cv); @@ -667,7 +695,8 @@ txg_kick(dsl_pool_t *dp) ASSERT(!dsl_pool_config_held(dp)); mutex_enter(&tx->tx_sync_lock); - if (tx->tx_syncing_txg == 0 && + if (!txg_is_syncing(dp) && + !txg_is_quiescing(dp) && tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && tx->tx_sync_txg_waiting <= tx->tx_synced_txg && tx->tx_quiesced_txg <= tx->tx_synced_txg) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c index fbe7b619a29a..d33f451938b8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/unique.c @@ -42,14 +42,10 @@ typedef struct unique { static int unique_compare(const void *a, const void *b) { - const unique_t *una = a; - const unique_t *unb = b; - - if (una->un_value < unb->un_value) - return (-1); - if (una->un_value > unb->un_value) - return (+1); - return (0); + const unique_t *una = (const unique_t *)a; + const unique_t *unb = (const unique_t *)b; + + return (AVL_CMP(una->un_value, unb->un_value)); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index e17243a8c598..1baea65c5fa3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -51,6 +51,7 @@ #include <sys/dsl_scan.h> #include <sys/abd.h> #include <sys/trim_map.h> +#include <sys/vdev_initialize.h> SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); @@ -162,24 +163,30 @@ static vdev_ops_t *vdev_ops_table[] = { }; -/* maximum number of metaslabs per top-level vdev */ +/* target number of metaslabs per top-level vdev */ int vdev_max_ms_count = 200; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count, CTLFLAG_RDTUN, &vdev_max_ms_count, 0, "Maximum number of metaslabs per top-level vdev"); -/* minimum amount of metaslabs per top-level vdev */ +/* minimum number of metaslabs per top-level vdev */ int vdev_min_ms_count = 16; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RDTUN, &vdev_min_ms_count, 0, "Minimum number of metaslabs per top-level vdev"); -/* see comment in vdev_metaslab_set_size() */ +/* practical upper limit of total metaslabs per top-level vdev */ +int vdev_ms_count_limit = 1ULL << 17; + +/* lower limit for metaslab size (512M) */ int vdev_default_ms_shift = 29; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RDTUN, &vdev_default_ms_shift, 0, "Shift between vdev size and number of metaslabs"); +/* upper limit for metaslab size (256G) */ +int vdev_max_ms_shift = 38; + boolean_t vdev_validate_skip = B_FALSE; /* @@ -289,6 +296,14 @@ vdev_getops(const char *type) return (ops); } +/* ARGSUSED */ +void +vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) +{ + res->rs_start = in->rs_start; + res->rs_end = in->rs_end; +} + /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. 
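For reference, the AVL_CMP and AVL_PCMP helpers used by the rewritten comparators in this diff (space_reftree_compare, unique_compare) are presumably the branch-free three-way compares from sys/avl.h, along these lines:

/* presumed definitions, shown for illustration only */
#define AVL_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
#define AVL_PCMP(a, b) \
	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))

Each evaluates to -1, 0, or +1 without branching; likely(cmp) hints that keys usually differ, and AVL_PCMP falls back to address order so that entries with equal keys still sort into a stable total order.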
@@ -560,7 +575,11 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); - + mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); + for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL); } @@ -752,7 +771,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); vd->vdev_mg = metaslab_group_create(islog ? - spa_log_class(spa) : spa_normal_class(spa), vd); + spa_log_class(spa) : spa_normal_class(spa), vd, + spa->spa_alloc_count); } if (vd->vdev_ops->vdev_op_leaf && @@ -832,6 +852,7 @@ void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the @@ -862,6 +883,7 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + ASSERT(vd->vdev_initialize_thread == NULL); /* * Discard allocation state. @@ -935,6 +957,10 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); + mutex_destroy(&vd->vdev_initialize_lock); + mutex_destroy(&vd->vdev_initialize_io_lock); + cv_destroy(&vd->vdev_initialize_io_cv); + cv_destroy(&vd->vdev_initialize_cv); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; @@ -987,6 +1013,32 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; + /* + * State which may be set on a top-level vdev that's in the + * process of being removed. 
+ */ + ASSERT0(tvd->vdev_indirect_config.vic_births_object); + ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); + ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); + ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); + ASSERT3P(tvd->vdev_indirect_births, ==, NULL); + ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); + ASSERT0(tvd->vdev_removing); + tvd->vdev_removing = svd->vdev_removing; + tvd->vdev_indirect_config = svd->vdev_indirect_config; + tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; + tvd->vdev_indirect_births = svd->vdev_indirect_births; + range_tree_swap(&svd->vdev_obsolete_segments, + &tvd->vdev_obsolete_segments); + tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; + svd->vdev_indirect_config.vic_mapping_object = 0; + svd->vdev_indirect_config.vic_births_object = 0; + svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; + svd->vdev_indirect_mapping = NULL; + svd->vdev_indirect_births = NULL; + svd->vdev_obsolete_sm = NULL; + svd->vdev_removing = 0; + for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); @@ -1140,7 +1192,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) vd->vdev_ms = mspp; vd->vdev_ms_count = newc; - for (m = oldc; m < newc; m++) { uint64_t object = 0; @@ -1725,7 +1776,8 @@ vdev_validate(vdev_t *vd) if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); - vdev_dbgmsg(vd, "vdev_validate: failed reading config"); + vdev_dbgmsg(vd, "vdev_validate: failed reading config for " + "txg %llu", (u_longlong_t)txg); return (0); } @@ -2121,34 +2173,53 @@ void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; - uint64_t ms_shift = 0; + uint64_t ms_count = asize >> vdev_default_ms_shift; + uint64_t ms_shift; /* - * For vdevs that are bigger than 8G the metaslab size varies in - * a way that the number of metaslabs increases in powers of two, - * linearly in terms of vdev_asize, starting from 16 metaslabs. - * So for vdev_asize of 8G we get 16 metaslabs, for 16G, we get 32, - * and so on, until we hit the maximum metaslab count limit - * [vdev_max_ms_count] from which point the metaslab count stays - * the same. + * There are two dimensions to the metaslab sizing calculation: + * the size of the metaslab and the count of metaslabs per vdev. + * In general, we aim for vdev_max_ms_count (200) metaslabs. The + * ranges of the dimensions are as follows: + * + * 2^29 <= ms_size <= 2^38 + * 16 <= ms_count <= 131,072 + * + * On the lower end of vdev sizes, we aim for metaslab sizes of + * at least 512MB (2^29) to minimize fragmentation effects when + * testing with smaller devices. However, the count constraint + * of at least 16 metaslabs will override this minimum size goal. + * + * On the upper end of vdev sizes, we aim for a maximum metaslab + * size of 256GB. However, we will cap the total count to 2^17 + * metaslabs to keep our memory footprint in check. + * + * The net effect of applying the above constraints is summarized below. + * + * vdev size metaslab count + * -------------|----------------- + * < 8GB ~16 + * 8GB - 100GB one per 512MB + * 100GB - 50TB ~200 + * 50TB - 32PB one per 256GB + * > 32PB ~131,072 + * ------------------------------- */ - ms_shift = vdev_default_ms_shift; - if ((asize >> ms_shift) < vdev_min_ms_count) { - /* - * For devices that are less than 8G we want to have - * exactly 16 metaslabs.
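As a worked check of the new sizing logic and the table above (arithmetic under the quoted tunables, not text from the source): a 1 TB vdev has asize = 2^40, so ms_count = 2^40 >> 29 = 2048, which exceeds vdev_max_ms_count (200). ms_shift therefore becomes highbit64(2^40 / 200) = 33, i.e. 8 GB metaslabs, and the vdev ends up with 2^40 >> 33 = 128 of them; that is the ~200 target after rounding down to a power-of-two shift.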
We don't want less as integer - * division rounds down, so less metaslabs mean more - * wasted space. We don't want more as these vdevs are - * small and in the likely event that we are running - * out of space, the SPA will have a hard time finding - * space due to fragmentation. - */ + if (ms_count < vdev_min_ms_count) ms_shift = highbit64(asize / vdev_min_ms_count); - ms_shift = MAX(ms_shift, SPA_MAXBLOCKSHIFT); - - } else if ((asize >> ms_shift) > vdev_max_ms_count) { + else if (ms_count > vdev_max_ms_count) ms_shift = highbit64(asize / vdev_max_ms_count); + else + ms_shift = vdev_default_ms_shift; + + if (ms_shift < SPA_MAXBLOCKSHIFT) { + ms_shift = SPA_MAXBLOCKSHIFT; + } else if (ms_shift > vdev_max_ms_shift) { + ms_shift = vdev_max_ms_shift; + /* cap the total count to constrain memory footprint */ + if ((asize >> ms_shift) > vdev_ms_count_limit) + ms_shift = highbit64(asize / vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; @@ -2647,7 +2718,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); - space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); + space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); @@ -3003,7 +3074,8 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) ASSERT(vdev_is_concrete(vd)); - while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) + while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) + != NULL) metaslab_sync_done(msp, txg); if (reassess) @@ -3229,6 +3301,15 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } + /* Restart initializing if necessary */ + mutex_enter(&vd->vdev_initialize_lock); + if (vdev_writeable(vd) && + vd->vdev_initialize_thread == NULL && + vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { + (void) vdev_initialize(vd); + } + mutex_exit(&vd->vdev_initialize_lock); + if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) @@ -3531,8 +3612,18 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + /* + * Report initializing progress. Since we don't have the + * initializing locks held, this is only an estimate (although a + * fairly accurate one). + */ + vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; + vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; + vs->vs_initialize_state = vd->vdev_initialize_state; + vs->vs_initialize_action_time = vd->vdev_initialize_action_time; + } /* * Report expandable space on top-level, non-auxiliary devices only. 
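The sizing policy above reduces to a small pure function. The following standalone userland sketch mirrors the clamping logic in the hunk; the tunable values (vdev_min_ms_count = 16, vdev_max_ms_count = 200, vdev_default_ms_shift = 29, vdev_max_ms_shift = 38, vdev_ms_count_limit = 2^17, SPA_MAXBLOCKSHIFT = 24) are assumptions taken from the comment, and highbit64() is modeled on the illumos definition (position of the highest set bit, counting from 1):

#include <stdio.h>
#include <stdint.h>

#define SPA_MAXBLOCKSHIFT       24              /* 16MB max block */
static uint64_t vdev_min_ms_count = 16;
static uint64_t vdev_max_ms_count = 200;
static uint64_t vdev_default_ms_shift = 29;     /* 512MB */
static uint64_t vdev_max_ms_shift = 38;         /* 256GB */
static uint64_t vdev_ms_count_limit = 1ULL << 17;

static int
highbit64(uint64_t i)
{
        /* illumos semantics: highbit64(1) == 1, highbit64(0) == 0 */
        return (i == 0 ? 0 : 64 - __builtin_clzll(i));
}

static uint64_t
ms_shift_for(uint64_t asize)
{
        uint64_t ms_count = asize >> vdev_default_ms_shift;
        uint64_t ms_shift;

        if (ms_count < vdev_min_ms_count)
                ms_shift = highbit64(asize / vdev_min_ms_count);
        else if (ms_count > vdev_max_ms_count)
                ms_shift = highbit64(asize / vdev_max_ms_count);
        else
                ms_shift = vdev_default_ms_shift;

        if (ms_shift < SPA_MAXBLOCKSHIFT) {
                ms_shift = SPA_MAXBLOCKSHIFT;
        } else if (ms_shift > vdev_max_ms_shift) {
                ms_shift = vdev_max_ms_shift;
                /* cap the total count to constrain memory footprint */
                if ((asize >> ms_shift) > vdev_ms_count_limit)
                        ms_shift = highbit64(asize / vdev_ms_count_limit);
        }
        return (ms_shift);
}

int
main(void)
{
        /* 8GB, 128GB, 32TB, 64PB sample vdev sizes */
        uint64_t sizes[] = { 1ULL << 33, 1ULL << 37, 1ULL << 45, 1ULL << 56 };

        for (size_t i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++) {
                uint64_t shift = ms_shift_for(sizes[i]);
                printf("asize %llu: ms_shift %llu, ~%llu metaslabs\n",
                    (unsigned long long)sizes[i],
                    (unsigned long long)shift,
                    (unsigned long long)(sizes[i] >> shift));
        }
        return (0);
}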
* The expandable space is reported in terms of metaslab sized units @@ -4193,11 +4284,11 @@ vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); - if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && - vdev_is_concrete(vd)) { + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c index 26828e069d7d..be24cde54b9b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c @@ -841,6 +841,7 @@ vdev_ops_t vdev_disk_ops = { vdev_disk_hold, vdev_disk_rele, NULL, + vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c index b5caee2ec79e..c198d77e21d4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -271,6 +271,7 @@ vdev_ops_t vdev_file_ops = { vdev_file_hold, vdev_file_rele, NULL, + vdev_default_xlate, VDEV_TYPE_FILE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -291,6 +292,7 @@ vdev_ops_t vdev_disk_ops = { vdev_file_hold, vdev_file_rele, NULL, + vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index 29a649aceb00..aa8a400f2d78 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -1151,6 +1151,7 @@ vdev_ops_t vdev_geom_ops = { vdev_geom_hold, vdev_geom_rele, NULL, + vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c index 62b92c677292..c4e4835447f5 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c @@ -23,6 +23,7 @@ #include <sys/vdev_impl.h> #include <sys/fs/zfs.h> #include <sys/zio.h> +#include <sys/zio_checksum.h> #include <sys/metaslab.h> #include <sys/refcount.h> #include <sys/dmu.h> @@ -46,10 +47,11 @@ * "vdev_remap" operation that executes a callback on each contiguous * segment of the new location. This function is used in multiple ways: * - * - reads and repair writes to this device use the callback to create - * a child io for each mapped segment. + * - i/os to this vdev use the callback to determine where the + * data is now located, and issue child i/os for each segment's new + * location. * - * - frees and claims to this device use the callback to free or claim + * - frees and claims to this vdev use the callback to free or claim * each mapped segment. 
(Note that we don't actually need to claim * log blocks on indirect vdevs, because we don't allocate to * removing vdevs. However, zdb uses zio_claim() for its leak @@ -204,6 +206,94 @@ uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; int zfs_condense_indirect_commit_entry_delay_ticks = 0; /* + * If a split block contains more than this many segments, consider it too + * computationally expensive to check all (2^num_segments) possible + * combinations. Instead, try at most 2^_segments_max randomly-selected + * combinations. + * + * This is reasonable if only a few segment copies are damaged and the + * majority of segment copies are good. This allows all the segment copies to + * participate fairly in the reconstruction and prevents the repeated use of + * one bad copy. + */ +int zfs_reconstruct_indirect_segments_max = 10; + +/* + * The indirect_child_t represents the vdev that we will read from, when we + * need to read all copies of the data (e.g. for scrub or reconstruction). + * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), + * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, + * ic_vdev is a child of the mirror. + */ +typedef struct indirect_child { + abd_t *ic_data; + vdev_t *ic_vdev; +} indirect_child_t; + +/* + * The indirect_split_t represents one mapped segment of an i/o to the + * indirect vdev. For non-split (contiguously-mapped) blocks, there will be + * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. + * For split blocks, there will be several of these. + */ +typedef struct indirect_split { + list_node_t is_node; /* link on iv_splits */ + + /* + * is_split_offset is the offset into the i/o. + * This is the sum of the previous splits' is_size's. + */ + uint64_t is_split_offset; + + vdev_t *is_vdev; /* top-level vdev */ + uint64_t is_target_offset; /* offset on is_vdev */ + uint64_t is_size; + int is_children; /* number of entries in is_child[] */ + + /* + * is_good_child is the child that we are currently using to + * attempt reconstruction. + */ + int is_good_child; + + indirect_child_t is_child[1]; /* variable-length */ +} indirect_split_t; + +/* + * The indirect_vsd_t is associated with each i/o to the indirect vdev. + * It is the "Vdev-Specific Data" in the zio_t's io_vsd. + */ +typedef struct indirect_vsd { + boolean_t iv_split_block; + boolean_t iv_reconstruct; + + list_t iv_splits; /* list of indirect_split_t's */ +} indirect_vsd_t; + +static void +vdev_indirect_map_free(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + indirect_split_t *is; + while ((is = list_head(&iv->iv_splits)) != NULL) { + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + if (ic->ic_data != NULL) + abd_free(ic->ic_data); + } + list_remove(&iv->iv_splits, is); + kmem_free(is, + offsetof(indirect_split_t, is_child[is->is_children])); + } + kmem_free(iv, sizeof (*iv)); +} + +static const zio_vsd_ops_t vdev_indirect_vsd_ops = { + vdev_indirect_map_free, + zio_vsd_default_cksum_report +}; +/* * Mark the given offset and size as being obsolete. 
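The is_child[1] tail declared above is the pre-C99 flexible-array idiom: the structure carries a one-element array, and the allocation below sizes it with offsetof so that exactly is_children entries fit. A minimal userland sketch of the same pattern (malloc/calloc standing in for kmem_zalloc; offsetof with a runtime index is the same GNU-style usage the kernel code relies on):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

typedef struct child {
        void    *c_data;
} child_t;

typedef struct split {
        size_t  s_size;
        int     s_children;     /* number of entries in s_child[] */
        child_t s_child[1];     /* variable-length tail */
} split_t;

static split_t *
split_alloc(int n)
{
        /*
         * offsetof(split_t, s_child[n]) is the header size plus n tail
         * elements; the declared s_child[1] length is never relied on.
         */
        split_t *s = calloc(1, offsetof(split_t, s_child[n]));

        if (s != NULL)
                s->s_children = n;
        return (s);
}

int
main(void)
{
        split_t *s = split_alloc(3);

        if (s == NULL)
                return (1);
        printf("allocated %zu bytes for %d children\n",
            offsetof(split_t, s_child[s->s_children]), s->s_children);
        free(s);
        return (0);
}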
*/ void @@ -729,7 +819,7 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) space_map_object(vd->vdev_obsolete_sm)); space_map_write(vd->vdev_obsolete_sm, - vd->vdev_obsolete_segments, SM_ALLOC, tx); + vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); space_map_update(vd->vdev_obsolete_sm); range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); } @@ -818,12 +908,6 @@ vdev_indirect_close(vdev_t *vd) } /* ARGSUSED */ -static void -vdev_indirect_io_done(zio_t *zio) -{ -} - -/* ARGSUSED */ static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) @@ -1067,41 +1151,475 @@ vdev_indirect_child_io_done(zio_t *zio) abd_put(zio->io_abd); } +/* + * This is a callback for vdev_indirect_remap() which allocates an + * indirect_split_t for each split segment and adds it to iv_splits. + */ static void -vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset, +vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { zio_t *zio = arg; + indirect_vsd_t *iv = zio->io_vsd; ASSERT3P(vd, !=, NULL); if (vd->vdev_ops == &vdev_indirect_ops) return; - zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, - abd_get_offset(zio->io_abd, split_offset), - size, zio->io_type, zio->io_priority, - 0, vdev_indirect_child_io_done, zio)); + int n = 1; + if (vd->vdev_ops == &vdev_mirror_ops) + n = vd->vdev_children; + + indirect_split_t *is = + kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); + + is->is_children = n; + is->is_size = size; + is->is_split_offset = split_offset; + is->is_target_offset = offset; + is->is_vdev = vd; + + /* + * Note that we only consider multiple copies of the data for + * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even + * though they use the same ops as mirror, because there's only one + * "good" copy under the replacing/spare. + */ + if (vd->vdev_ops == &vdev_mirror_ops) { + for (int i = 0; i < n; i++) { + is->is_child[i].ic_vdev = vd->vdev_child[i]; + } + } else { + is->is_child[0].ic_vdev = vd; + } + + list_insert_tail(&iv->iv_splits, is); +} + +static void +vdev_indirect_read_split_done(zio_t *zio) +{ + indirect_child_t *ic = zio->io_private; + + if (zio->io_error != 0) { + /* + * Clear ic_data to indicate that we do not have data for this + * child. + */ + abd_free(ic->ic_data); + ic->ic_data = NULL; + } +} + +/* + * Issue reads for all copies (mirror children) of all splits. + */ +static void +vdev_indirect_read_all(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + for (int i = 0; i < is->is_children; i++) { + indirect_child_t *ic = &is->is_child[i]; + + if (!vdev_readable(ic->ic_vdev)) + continue; + + /* + * Note, we may read from a child whose DTL + * indicates that the data may not be present here. + * While this might result in a few i/os that will + * likely return incorrect data, it simplifies the + * code since we can treat scrub and resilver + * identically. (The incorrect data will be + * detected and ignored when we verify the + * checksum.) 
+ */ + + ic->ic_data = abd_alloc_sametype(zio->io_abd, + is->is_size); + + zio_nowait(zio_vdev_child_io(zio, NULL, + ic->ic_vdev, is->is_target_offset, ic->ic_data, + is->is_size, zio->io_type, zio->io_priority, 0, + vdev_indirect_read_split_done, ic)); + } + } + iv->iv_reconstruct = B_TRUE; } static void vdev_indirect_io_start(zio_t *zio) { spa_t *spa = zio->io_spa; + indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); + list_create(&iv->iv_splits, + sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); + + zio->io_vsd = iv; + zio->io_vsd_ops = &vdev_indirect_vsd_ops; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (zio->io_type != ZIO_TYPE_READ) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); - ASSERT((zio->io_flags & - (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0); + /* + * Note: this code can handle other kinds of writes, + * but we don't expect them. + */ + ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | + ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); } vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, - vdev_indirect_io_start_cb, zio); + vdev_indirect_gather_splits, zio); + + indirect_split_t *first = list_head(&iv->iv_splits); + if (first->is_size == zio->io_size) { + /* + * This is not a split block; we are pointing to the entire + * data, which will checksum the same as the original data. + * Pass the BP down so that the child i/o can verify the + * checksum, and try a different location if available + * (e.g. on a mirror). + * + * While this special case could be handled the same as the + * general (split block) case, doing it this way ensures + * that the vast majority of blocks on indirect vdevs + * (which are not split) are handled identically to blocks + * on non-indirect vdevs. This allows us to be less strict + * about performance in the general (but rare) case. + */ + ASSERT0(first->is_split_offset); + ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + first->is_vdev, first->is_target_offset, + abd_get_offset(zio->io_abd, 0), + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_indirect_child_io_done, zio)); + } else { + iv->iv_split_block = B_TRUE; + if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { + /* + * Read all copies. Note that for simplicity, + * we don't bother consulting the DTL in the + * resilver case. + */ + vdev_indirect_read_all(zio); + } else { + /* + * Read one copy of each split segment, from the + * top-level vdev. Since we don't know the + * checksum of each split individually, the child + * zio can't ensure that we get the right data. + * E.g. if it's a mirror, it will just read from a + * random (healthy) leaf vdev. We have to verify + * the checksum in vdev_indirect_io_done(). + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + zio_nowait(zio_vdev_child_io(zio, NULL, + is->is_vdev, is->is_target_offset, + abd_get_offset(zio->io_abd, + is->is_split_offset), + is->is_size, zio->io_type, + zio->io_priority, 0, + vdev_indirect_child_io_done, zio)); + } + } + } zio_execute(zio); } +/* + * Report a checksum error for a child. 
+ */ +static void +vdev_indirect_checksum_error(zio_t *zio, + indirect_split_t *is, indirect_child_t *ic) +{ + vdev_t *vd = ic->ic_vdev; + + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + + zio_bad_cksum_t zbc = { 0 }; + void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size); + abd_t *good_abd = is->is_child[is->is_good_child].ic_data; + void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size); + zfs_ereport_post_checksum(zio->io_spa, vd, zio, + is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc); + abd_return_buf(ic->ic_data, bad_buf, is->is_size); + abd_return_buf(good_abd, good_buf, is->is_size); +} + +/* + * Issue repair i/os for any incorrect copies. We do this by comparing + * each split segment's correct data (is_good_child's ic_data) with each + * other copy of the data. If they differ, then we overwrite the bad data + * with the good copy. Note that we do this without regard for the DTL's, + * which simplifies this code and also issues the optimal number of writes + * (based on which copies actually read bad data, as opposed to which we + * think might be wrong). For the same reason, we always use + * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). + */ +static void +vdev_indirect_repair(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + enum zio_flag flags = ZIO_FLAG_IO_REPAIR; + + if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) + flags |= ZIO_FLAG_SELF_HEAL; + + if (!spa_writeable(zio->io_spa)) + return; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + indirect_child_t *good_child = &is->is_child[is->is_good_child]; + + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + if (ic == good_child) + continue; + if (ic->ic_data == NULL) + continue; + if (abd_cmp(good_child->ic_data, ic->ic_data, + is->is_size) == 0) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, + ic->ic_vdev, is->is_target_offset, + good_child->ic_data, is->is_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, + NULL, NULL)); + + vdev_indirect_checksum_error(zio, is, ic); + } + } +} + +/* + * Report checksum errors on all children that we read from. + */ +static void +vdev_indirect_all_checksum_errors(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + + if (ic->ic_data == NULL) + continue; + + vdev_t *vd = ic->ic_vdev; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + + zfs_ereport_post_checksum(zio->io_spa, vd, zio, + is->is_target_offset, is->is_size, + NULL, NULL, NULL); + } + } +} + +/* + * This function is called when we have read all copies of the data and need + * to try to find a combination of copies that gives us the right checksum. + * + * If we pointed to any mirror vdevs, this effectively does the job of the + * mirror. The mirror vdev code can't do its own job because we don't know + * the checksum of each split segment individually. We have to try every + * combination of copies of split segments, until we find one that checksums + * correctly. 
(Or until we have tried all combinations, or have tried + * 2^zfs_reconstruct_indirect_segments_max combinations. In these cases we + * set io_error to ECKSUM to propagate the error up to the user.) + * + * For example, if we have 3 segments in the split, + * and each points to a 2-way mirror, we will have the following pieces of + * data: + * + * | mirror child + * split | [0] [1] + * ======|===================== + * A | data_A_0 data_A_1 + * B | data_B_0 data_B_1 + * C | data_C_0 data_C_1 + * + * We will try the following (mirror children)^(number of splits) (2^3=8) + * combinations, which is similar to bitwise-little-endian counting in + * binary. In general each "digit" corresponds to a split segment, and the + * base of each digit is is_children, which can be different for each + * digit. + * + * "low bit" "high bit" + * v v + * data_A_0 data_B_0 data_C_0 + * data_A_1 data_B_0 data_C_0 + * data_A_0 data_B_1 data_C_0 + * data_A_1 data_B_1 data_C_0 + * data_A_0 data_B_0 data_C_1 + * data_A_1 data_B_0 data_C_1 + * data_A_0 data_B_1 data_C_1 + * data_A_1 data_B_1 data_C_1 + * + * Note that the split segments may be on the same or different top-level + * vdevs. In either case, we try lots of combinations (see + * zfs_reconstruct_indirect_segments_max). This ensures that if a mirror has + * small silent errors on all of its children, we can still reconstruct the + * correct data, as long as those errors are at sufficiently-separated + * offsets (specifically, separated by the largest block size - default of + * 128KB, but up to 16MB). + */ +static void +vdev_indirect_reconstruct_io_done(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + uint64_t attempts = 0; + uint64_t attempts_max = 1ULL << zfs_reconstruct_indirect_segments_max; + int segments = 0; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) + segments++; + + for (;;) { + /* copy data from splits to main zio */ + int ret; + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + + /* + * If this child failed, its ic_data will be NULL. + * Skip this combination. + */ + if (is->is_child[is->is_good_child].ic_data == NULL) { + ret = EIO; + goto next; + } + + abd_copy_off(zio->io_abd, + is->is_child[is->is_good_child].ic_data, + is->is_split_offset, 0, is->is_size); + } + + /* See if this checksum matches. */ + zio_bad_cksum_t zbc; + ret = zio_checksum_error(zio, &zbc); + if (ret == 0) { + /* Found a matching checksum. Issue repair i/os. */ + vdev_indirect_repair(zio); + zio_checksum_verified(zio); + return; + } + + /* + * Checksum failed; try a different combination of split + * children. + */ + boolean_t more; +next: + more = B_FALSE; + if (segments <= zfs_reconstruct_indirect_segments_max) { + /* + * There are relatively few segments, so + * deterministically check all combinations. We do + * this by adding one to the first split's + * good_child. If it overflows, then "carry over" to + * the next split (like counting in base is_children, + * but each digit can have a different base). 
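The carry loop described above is just an odometer over digits with per-digit bases. A standalone sketch (bases invented for illustration: three segments with 2, 3, and 2 copies) that enumerates all 2*3*2 = 12 combinations exactly the way the good_child walk below does:

#include <stdio.h>

int
main(void)
{
        int bases[] = { 2, 3, 2 };      /* is_children of each segment */
        int digits[] = { 0, 0, 0 };     /* is_good_child of each segment */
        int nseg = 3;

        for (;;) {
                for (int i = 0; i < nseg; i++)
                        printf("%d ", digits[i]);
                printf("\n");

                /* Increment the low digit; carry into the next on overflow. */
                int i;
                for (i = 0; i < nseg; i++) {
                        if (++digits[i] < bases[i])
                                break;          /* no carry; keep counting */
                        digits[i] = 0;          /* overflowed: carry over */
                }
                if (i == nseg)
                        break;                  /* carried off the end: done */
        }
        return (0);
}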
+ */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + is->is_good_child++; + if (is->is_good_child < is->is_children) { + more = B_TRUE; + break; + } + is->is_good_child = 0; + } + } else if (++attempts < attempts_max) { + /* + * There are too many combinations to try all of them + * in a reasonable amount of time, so try a fixed + * number of random combinations, after which we'll + * consider the block unrecoverable. + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + is->is_good_child = + spa_get_random(is->is_children); + } + more = B_TRUE; + } + if (!more) { + /* All combinations failed. */ + zio->io_error = ret; + vdev_indirect_all_checksum_errors(zio); + zio_checksum_verified(zio); + return; + } + } +} + +static void +vdev_indirect_io_done(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + if (iv->iv_reconstruct) { + /* + * We have read all copies of the data (e.g. from mirrors), + * either because this was a scrub/resilver, or because the + * one-copy read didn't checksum correctly. + */ + vdev_indirect_reconstruct_io_done(zio); + return; + } + + if (!iv->iv_split_block) { + /* + * This was not a split block, so we passed the BP down, + * and the checksum was handled by the (one) child zio. + */ + return; + } + + zio_bad_cksum_t zbc; + int ret = zio_checksum_error(zio, &zbc); + if (ret == 0) { + zio_checksum_verified(zio); + return; + } + + /* + * The checksum didn't match. Read all copies of all splits, and + * then we will try to reconstruct. The next time + * vdev_indirect_io_done() is called, iv_reconstruct will be set. + */ + vdev_indirect_read_all(zio); + + zio_vdev_io_redone(zio); +} + vdev_ops_t vdev_indirect_ops = { vdev_indirect_open, vdev_indirect_close, @@ -1113,6 +1631,7 @@ vdev_ops_t vdev_indirect_ops = { NULL, NULL, vdev_indirect_remap, + NULL, VDEV_TYPE_INDIRECT, /* name of this vdev type */ B_FALSE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c index ea80fbc4733f..02999aae7274 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2015, 2017 by Delphix. All rights reserved. */ #include <sys/dmu_tx.h> @@ -536,14 +536,13 @@ typedef struct load_obsolete_space_map_arg { } load_obsolete_space_map_arg_t; static int -load_obsolete_sm_callback(maptype_t type, uint64_t offset, uint64_t size, - void *arg) +load_obsolete_sm_callback(space_map_entry_t *sme, void *arg) { load_obsolete_space_map_arg_t *losma = arg; - ASSERT3S(type, ==, SM_ALLOC); + ASSERT3S(sme->sme_type, ==, SM_ALLOC); vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, - offset, size, losma->losma_counts); + sme->sme_offset, sme->sme_run, losma->losma_counts); return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c new file mode 100644 index 000000000000..a2c39c2868e5 --- /dev/null +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c @@ -0,0 +1,792 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/txg.h> +#include <sys/vdev_impl.h> +#include <sys/refcount.h> +#include <sys/metaslab_impl.h> +#include <sys/dsl_synctask.h> +#include <sys/zap.h> +#include <sys/dmu_tx.h> + +/* + * Maximum number of metaslabs per group that can be initialized + * simultaneously. + */ +int max_initialize_ms = 3; + +/* + * Value that is written to disk during initialization. + */ +uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL; + +/* maximum number of I/Os outstanding per leaf vdev */ +int zfs_initialize_limit = 1; + +/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ +uint64_t zfs_initialize_chunk_size = 1024 * 1024; + +static boolean_t +vdev_initialize_should_stop(vdev_t *vd) +{ + return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || + vd->vdev_detached || vd->vdev_top->vdev_removing); +} + +static void +vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) +{ + /* + * We pass in the guid instead of the vdev_t since the vdev may + * have been freed prior to the sync task being processed. This + * happens when a vdev is detached as we call spa_config_vdev_exit(), + * stop the initializing thread, schedule the sync task, and free + * the vdev. Later when the scheduled sync task is invoked, it would + * find that the vdev has been freed. + */ + uint64_t guid = *(uint64_t *)arg; + uint64_t txg = dmu_tx_get_txg(tx); + kmem_free(arg, sizeof (uint64_t)); + + vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + return; + + uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; + vd->vdev_initialize_offset[txg & TXG_MASK] = 0; + + VERIFY(vd->vdev_leaf_zap != 0); + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + + if (last_offset > 0) { + vd->vdev_initialize_last_offset = last_offset; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, + sizeof (last_offset), 1, &last_offset, tx)); + } + if (vd->vdev_initialize_action_time > 0) { + uint64_t val = (uint64_t)vd->vdev_initialize_action_time; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val), + 1, &val, tx)); + } + + uint64_t initialize_state = vd->vdev_initialize_state; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1, + &initialize_state, tx)); +} + +static void +vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + spa_t *spa = vd->vdev_spa; + + if (new_state == vd->vdev_initialize_state) + return; + + /* + * Copy the vd's guid; it will be freed by the sync task. 
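The guid-instead-of-pointer convention above is worth seeing in isolation: deferred work must carry a stable identifier and re-resolve it when it finally runs, because the object may already be gone. A toy userland sketch of that pattern (the registry, names, and guids here are invented for illustration; spa_lookup_by_guid() is the real counterpart):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define NOBJ    4
static struct obj { uint64_t o_guid; int o_alive; } objs[NOBJ] = {
        { 101, 1 }, { 102, 1 }, { 103, 1 }, { 104, 1 }
};

static struct obj *
lookup_by_guid(uint64_t guid)
{
        for (int i = 0; i < NOBJ; i++)
                if (objs[i].o_alive && objs[i].o_guid == guid)
                        return (&objs[i]);
        return (NULL);
}

static void
deferred_update(void *arg)
{
        /* re-resolve by guid; the pointer may no longer be valid */
        uint64_t guid = *(uint64_t *)arg;
        free(arg);

        struct obj *o = lookup_by_guid(guid);
        if (o == NULL) {
                printf("guid %llu gone; nothing to do\n",
                    (unsigned long long)guid);
                return;
        }
        printf("updating guid %llu\n", (unsigned long long)guid);
}

int
main(void)
{
        uint64_t *arg = malloc(sizeof (uint64_t));

        *arg = 103;
        objs[2].o_alive = 0;    /* object freed before the task runs */
        deferred_update(arg);   /* safely finds nothing */
        return (0);
}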
+ */ + uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + *guid = vd->vdev_guid; + + /* + * If we're suspending, then preserve the original start time. + */ + if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { + vd->vdev_initialize_action_time = gethrestime_sec(); + } + vd->vdev_initialize_state = new_state; + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, + guid, 2, ZFS_SPACE_CHECK_RESERVED, tx); + + switch (new_state) { + case VDEV_INITIALIZE_ACTIVE: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s activated", vd->vdev_path); + break; + case VDEV_INITIALIZE_SUSPENDED: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s suspended", vd->vdev_path); + break; + case VDEV_INITIALIZE_CANCELED: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s canceled", vd->vdev_path); + break; + case VDEV_INITIALIZE_COMPLETE: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s complete", vd->vdev_path); + break; + default: + panic("invalid state %llu", (unsigned long long)new_state); + } + + dmu_tx_commit(tx); +} + +static void +vdev_initialize_cb(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + mutex_enter(&vd->vdev_initialize_io_lock); + if (zio->io_error == ENXIO && !vdev_writeable(vd)) { + /* + * The I/O failed because the vdev was unavailable; roll the + * last offset back. (This works because spa_sync waits on + * spa_txg_zio before it runs sync tasks.) + */ + uint64_t *off = + &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK]; + *off = MIN(*off, zio->io_offset); + } else { + /* + * Since initializing is best-effort, we ignore I/O errors and + * rely on vdev_probe to determine if the errors are more + * critical. + */ + if (zio->io_error != 0) + vd->vdev_stat.vs_initialize_errors++; + + vd->vdev_initialize_bytes_done += zio->io_orig_size; + } + ASSERT3U(vd->vdev_initialize_inflight, >, 0); + vd->vdev_initialize_inflight--; + cv_broadcast(&vd->vdev_initialize_io_cv); + mutex_exit(&vd->vdev_initialize_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} + +/* Takes care of physical writing and limiting # of concurrent ZIOs. */ +static int +vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) +{ + spa_t *spa = vd->vdev_spa; + + /* Limit inflight initializing I/Os */ + mutex_enter(&vd->vdev_initialize_io_lock); + while (vd->vdev_initialize_inflight >= zfs_initialize_limit) { + cv_wait(&vd->vdev_initialize_io_cv, + &vd->vdev_initialize_io_lock); + } + vd->vdev_initialize_inflight++; + mutex_exit(&vd->vdev_initialize_io_lock); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); + mutex_enter(&vd->vdev_initialize_lock); + + if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) { + uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + *guid = vd->vdev_guid; + + /* This is the first write of this txg. */ + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_update_sync, guid, 2, + ZFS_SPACE_CHECK_RESERVED, tx); + } + + /* + * We know the vdev struct will still be around since all + * consumers of vdev_free must stop the initialization first. 
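The inflight limiting in vdev_initialize_write()/vdev_initialize_cb() is a counter guarded by a mutex plus a condition variable: issuers sleep while the counter is at the limit, and completions decrement and broadcast. A minimal pthreads sketch of the same idiom (the limit of 1 mirrors the zfs_initialize_limit default above; the synchronous main() stands in for asynchronous zio completion):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t io_cv = PTHREAD_COND_INITIALIZER;
static int inflight;
static const int limit = 1;

static void
io_begin(void)
{
        pthread_mutex_lock(&io_lock);
        while (inflight >= limit)       /* sleep until a slot frees up */
                pthread_cond_wait(&io_cv, &io_lock);
        inflight++;
        pthread_mutex_unlock(&io_lock);
}

static void
io_done(void)
{
        pthread_mutex_lock(&io_lock);
        inflight--;
        pthread_cond_broadcast(&io_cv); /* wake any blocked issuer */
        pthread_mutex_unlock(&io_lock);
}

int
main(void)
{
        for (int i = 0; i < 3; i++) {
                io_begin();
                printf("write %d issued (inflight=%d)\n", i, inflight);
                io_done();      /* completion would normally be async */
        }
        return (0);
}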
+ */ + if (vdev_initialize_should_stop(vd)) { + mutex_enter(&vd->vdev_initialize_io_lock); + ASSERT3U(vd->vdev_initialize_inflight, >, 0); + vd->vdev_initialize_inflight--; + mutex_exit(&vd->vdev_initialize_io_lock); + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); + mutex_exit(&vd->vdev_initialize_lock); + dmu_tx_commit(tx); + return (SET_ERROR(EINTR)); + } + mutex_exit(&vd->vdev_initialize_lock); + + vd->vdev_initialize_offset[txg & TXG_MASK] = start + size; + zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start, + size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL, + ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE)); + /* vdev_initialize_cb releases SCL_STATE_ALL */ + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Translate a logical range to the physical range for the specified vdev_t. + * This function is initially called with a leaf vdev and will walk each + * parent vdev until it reaches a top-level vdev. Once the top-level is + * reached the physical range is initialized and the recursive function + * begins to unwind. As it unwinds it calls the parent's vdev specific + * translation function to do the real conversion. + */ +void +vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) +{ + /* + * Walk up the vdev tree + */ + if (vd != vd->vdev_top) { + vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); + } else { + /* + * We've reached the top-level vdev, initialize the + * physical range to the logical range and start to + * unwind. + */ + physical_rs->rs_start = logical_rs->rs_start; + physical_rs->rs_end = logical_rs->rs_end; + return; + } + + vdev_t *pvd = vd->vdev_parent; + ASSERT3P(pvd, !=, NULL); + ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); + + /* + * As this recursive function unwinds, translate the logical + * range into its physical components by calling the + * vdev specific translate function. + */ + range_seg_t intermediate = { 0 }; + pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); + + physical_rs->rs_start = intermediate.rs_start; + physical_rs->rs_end = intermediate.rs_end; +} + +/* + * Callback to fill each ABD chunk with zfs_initialize_value. len must be + * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD + * allocation will guarantee these for us. 
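The recursion in vdev_xlate() is easier to see with a toy tree: recurse to the top, seed the physical range equal to the logical range there, then apply each level's translation while unwinding back toward the leaf. In this sketch each node's translation is a plain offset, standing in for the per-vdev-type vdev_op_xlate callbacks; the structure, not the arithmetic, is the point:

#include <stdio.h>
#include <stdint.h>

typedef struct seg { uint64_t rs_start, rs_end; } seg_t;

typedef struct node {
        struct node     *parent;        /* NULL at the top level */
        uint64_t        shift;          /* this level's translation */
} node_t;

static void
xlate(node_t *n, const seg_t *logical, seg_t *physical)
{
        if (n->parent == NULL) {
                /* top level: physical range starts out equal to logical */
                *physical = *logical;
                return;
        }
        xlate(n->parent, logical, physical);

        /* unwinding: apply this level's translation */
        physical->rs_start += n->shift;
        physical->rs_end += n->shift;
}

int
main(void)
{
        node_t top = { NULL, 0 };
        node_t mid = { &top, 1 << 20 };
        node_t leaf = { &mid, 4096 };
        seg_t log = { 0, 1 << 19 };
        seg_t phys;

        xlate(&leaf, &log, &phys);
        printf("logical [%llu, %llu) -> physical [%llu, %llu)\n",
            (unsigned long long)log.rs_start, (unsigned long long)log.rs_end,
            (unsigned long long)phys.rs_start, (unsigned long long)phys.rs_end);
        return (0);
}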
+ */ +/* ARGSUSED */ +static int +vdev_initialize_block_fill(void *buf, size_t len, void *unused) +{ + ASSERT0(len % sizeof (uint64_t)); + for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { + *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; + } + return (0); +} + +static abd_t * +vdev_initialize_block_alloc() +{ + /* Allocate ABD for filler data */ + abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE); + + ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t)); + (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size, + vdev_initialize_block_fill, NULL); + + return (data); +} + +static void +vdev_initialize_block_free(abd_t *data) +{ + abd_free(data); +} + +static int +vdev_initialize_ranges(vdev_t *vd, abd_t *data) +{ + avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root; + + for (range_seg_t *rs = avl_first(rt); rs != NULL; + rs = AVL_NEXT(rt, rs)) { + uint64_t size = rs->rs_end - rs->rs_start; + + /* Split range into legally-sized physical chunks */ + uint64_t writes_required = + ((size - 1) / zfs_initialize_chunk_size) + 1; + + for (uint64_t w = 0; w < writes_required; w++) { + int error; + + error = vdev_initialize_write(vd, + VDEV_LABEL_START_SIZE + rs->rs_start + + (w * zfs_initialize_chunk_size), + MIN(size - (w * zfs_initialize_chunk_size), + zfs_initialize_chunk_size), data); + if (error != 0) + return (error); + } + } + return (0); +} + +static void +vdev_initialize_ms_load(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + metaslab_load_wait(msp); + if (!msp->ms_loaded) + VERIFY0(metaslab_load(msp)); +} + +static void +vdev_initialize_mg_wait(metaslab_group_t *mg) +{ + ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); + while (mg->mg_initialize_updating) { + cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); + } +} + +static void +vdev_initialize_mg_mark(metaslab_group_t *mg) +{ + ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); + ASSERT(mg->mg_initialize_updating); + + while (mg->mg_ms_initializing >= max_initialize_ms) { + cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); + } + mg->mg_ms_initializing++; + ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms); +} + +/* + * Mark the metaslab as being initialized to prevent any allocations + * on this metaslab. We must also track how many metaslabs are currently + * being initialized within a metaslab group and limit them to prevent + * allocation failures from occurring because all metaslabs are being + * initialized. + */ +static void +vdev_initialize_ms_mark(metaslab_t *msp) +{ + ASSERT(!MUTEX_HELD(&msp->ms_lock)); + metaslab_group_t *mg = msp->ms_group; + + mutex_enter(&mg->mg_ms_initialize_lock); + + /* + * To keep an accurate count of how many threads are initializing + * a specific metaslab group, we only allow one thread to mark + * the metaslab group at a time. This ensures that the value of + * ms_initializing will be accurate when we decide to mark a metaslab + * group as being initialized. To do this we force all other threads + * to wait till the metaslab's mg_initialize_updating flag is no + * longer set. 
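The writes_required arithmetic in vdev_initialize_ranges() above is ordinary ceiling division followed by a MIN() for the short tail write. A standalone sketch with an assumed 2.5MB free segment and the 1MB default chunk size:

#include <stdio.h>
#include <stdint.h>

#define CHUNK   (1024 * 1024)   /* zfs_initialize_chunk_size default */

int
main(void)
{
        uint64_t size = 5 * CHUNK / 2;  /* a 2.5MB free segment */
        /* ceiling division: ((size - 1) / CHUNK) + 1 */
        uint64_t writes_required = ((size - 1) / CHUNK) + 1;

        for (uint64_t w = 0; w < writes_required; w++) {
                uint64_t off = w * CHUNK;
                uint64_t len = (size - off < CHUNK) ? size - off : CHUNK;

                printf("write %llu: offset %llu, length %llu\n",
                    (unsigned long long)w, (unsigned long long)off,
                    (unsigned long long)len);
        }
        return (0);
}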
+ */ + vdev_initialize_mg_wait(mg); + mg->mg_initialize_updating = B_TRUE; + if (msp->ms_initializing == 0) { + vdev_initialize_mg_mark(mg); + } + mutex_enter(&msp->ms_lock); + msp->ms_initializing++; + mutex_exit(&msp->ms_lock); + + mg->mg_initialize_updating = B_FALSE; + cv_broadcast(&mg->mg_ms_initialize_cv); + mutex_exit(&mg->mg_ms_initialize_lock); +} + +static void +vdev_initialize_ms_unmark(metaslab_t *msp) +{ + ASSERT(!MUTEX_HELD(&msp->ms_lock)); + metaslab_group_t *mg = msp->ms_group; + mutex_enter(&mg->mg_ms_initialize_lock); + mutex_enter(&msp->ms_lock); + if (--msp->ms_initializing == 0) { + mg->mg_ms_initializing--; + cv_broadcast(&mg->mg_ms_initialize_cv); + } + mutex_exit(&msp->ms_lock); + mutex_exit(&mg->mg_ms_initialize_lock); +} + +static void +vdev_initialize_calculate_progress(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || + spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); + ASSERT(vd->vdev_leaf_zap != 0); + + vd->vdev_initialize_bytes_est = 0; + vd->vdev_initialize_bytes_done = 0; + + for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + mutex_enter(&msp->ms_lock); + + uint64_t ms_free = msp->ms_size - + space_map_allocated(msp->ms_sm); + + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) + ms_free /= vd->vdev_top->vdev_children; + + /* + * Convert the metaslab range to a physical range + * on our vdev. We use this to determine if we are + * in the middle of this metaslab range. + */ + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = msp->ms_start; + logical_rs.rs_end = msp->ms_start + msp->ms_size; + vdev_xlate(vd, &logical_rs, &physical_rs); + + if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { + vd->vdev_initialize_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } else if (vd->vdev_initialize_last_offset > + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += ms_free; + vd->vdev_initialize_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } + + /* + * If we get here, we're in the middle of initializing this + * metaslab. Load it and walk the free tree for more accurate + * progress estimation. 
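vdev_initialize_calculate_progress() classifies each metaslab's physical range against the last initialized offset in three ways, and only the "in the middle" case forces a metaslab load. A small sketch of that classification (boundary conditions copied from the code that follows; the strings are illustrative):

#include <stdio.h>
#include <stdint.h>

typedef struct seg { uint64_t rs_start, rs_end; } seg_t;

static const char *
classify(uint64_t last_offset, const seg_t *rs)
{
        if (last_offset <= rs->rs_start)
                return ("not started: count all free space as remaining");
        if (last_offset > rs->rs_end)
                return ("finished: count all free space as done");
        return ("in progress: load the metaslab for an exact count");
}

int
main(void)
{
        seg_t ms = { 1000, 2000 };
        uint64_t offsets[] = { 500, 1500, 2500 };

        for (int i = 0; i < 3; i++)
                printf("last_offset %llu: %s\n",
                    (unsigned long long)offsets[i],
                    classify(offsets[i], &ms));
        return (0);
}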
+ */ + vdev_initialize_ms_load(msp); + + for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs; + rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { + logical_rs.rs_start = rs->rs_start; + logical_rs.rs_end = rs->rs_end; + vdev_xlate(vd, &logical_rs, &physical_rs); + + uint64_t size = physical_rs.rs_end - + physical_rs.rs_start; + vd->vdev_initialize_bytes_est += size; + if (vd->vdev_initialize_last_offset > + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += size; + } else if (vd->vdev_initialize_last_offset > + physical_rs.rs_start && + vd->vdev_initialize_last_offset < + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += + vd->vdev_initialize_last_offset - + physical_rs.rs_start; + } + } + mutex_exit(&msp->ms_lock); + } +} + +static void +vdev_initialize_load(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || + spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); + ASSERT(vd->vdev_leaf_zap != 0); + + if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE || + vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) { + int err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, + sizeof (vd->vdev_initialize_last_offset), 1, + &vd->vdev_initialize_last_offset); + ASSERT(err == 0 || err == ENOENT); + } + + vdev_initialize_calculate_progress(vd); +} + + +/* + * Convert the logical range into a physical range + * and add it to our avl tree. + */ +void +vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) +{ + vdev_t *vd = arg; + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = start; + logical_rs.rs_end = start + size; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + vdev_xlate(vd, &logical_rs, &physical_rs); + + IMPLY(vd->vdev_top == vd, + logical_rs.rs_start == physical_rs.rs_start); + IMPLY(vd->vdev_top == vd, + logical_rs.rs_end == physical_rs.rs_end); + + /* Only add segments that we have not visited yet */ + if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) + return; + + /* Pick up where we left off mid-range. */ + if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { + zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " + "(%llu, %llu)", vd->vdev_path, + (u_longlong_t)physical_rs.rs_start, + (u_longlong_t)physical_rs.rs_end, + (u_longlong_t)vd->vdev_initialize_last_offset, + (u_longlong_t)physical_rs.rs_end); + ASSERT3U(physical_rs.rs_end, >, + vd->vdev_initialize_last_offset); + physical_rs.rs_start = vd->vdev_initialize_last_offset; + } + ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); + + /* + * With raidz, it's possible that the logical range does not live on + * this leaf vdev. We only add the physical range to this vdev's tree if it + * has a length greater than 0. 
+ */ + if (physical_rs.rs_end > physical_rs.rs_start) { + range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, + physical_rs.rs_end - physical_rs.rs_start); + } else { + ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); + } +} + +static void +vdev_initialize_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + int error = 0; + uint64_t ms_count = 0; + + ASSERT(vdev_is_concrete(vd)); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + vd->vdev_initialize_last_offset = 0; + vdev_initialize_load(vd); + + abd_t *deadbeef = vdev_initialize_block_alloc(); + + vd->vdev_initialize_tree = range_tree_create(NULL, NULL); + + for (uint64_t i = 0; !vd->vdev_detached && + i < vd->vdev_top->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + + /* + * If we've expanded the top-level vdev or it's our + * first pass, calculate our progress. + */ + if (vd->vdev_top->vdev_ms_count != ms_count) { + vdev_initialize_calculate_progress(vd); + ms_count = vd->vdev_top->vdev_ms_count; + } + + vdev_initialize_ms_mark(msp); + mutex_enter(&msp->ms_lock); + vdev_initialize_ms_load(msp); + + range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, + vd); + mutex_exit(&msp->ms_lock); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + error = vdev_initialize_ranges(vd, deadbeef); + vdev_initialize_ms_unmark(msp); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); + if (error != 0) + break; + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_enter(&vd->vdev_initialize_io_lock); + while (vd->vdev_initialize_inflight > 0) { + cv_wait(&vd->vdev_initialize_io_cv, + &vd->vdev_initialize_io_lock); + } + mutex_exit(&vd->vdev_initialize_io_lock); + + range_tree_destroy(vd->vdev_initialize_tree); + vdev_initialize_block_free(deadbeef); + vd->vdev_initialize_tree = NULL; + + mutex_enter(&vd->vdev_initialize_lock); + if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) { + vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE); + } + ASSERT(vd->vdev_initialize_thread != NULL || + vd->vdev_initialize_inflight == 0); + + /* + * Drop the vdev_initialize_lock while we sync out the + * txg since it's possible that a device might be trying to + * come online and must check to see if it needs to restart an + * initialization. That thread will be holding the spa_config_lock + * which would prevent the txg_wait_synced from completing. + */ + mutex_exit(&vd->vdev_initialize_lock); + txg_wait_synced(spa_get_dsl(spa), 0); + mutex_enter(&vd->vdev_initialize_lock); + + vd->vdev_initialize_thread = NULL; + cv_broadcast(&vd->vdev_initialize_cv); + mutex_exit(&vd->vdev_initialize_lock); + thread_exit(); +} + +/* + * Initiates initialization of a device. Caller must hold vdev_initialize_lock. + * Device must be a leaf and not already be initializing. + */ +void +vdev_initialize(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_initialize_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + + vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); + vd->vdev_initialize_thread = thread_create(NULL, 0, + vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri); +} + +/* + * Stop initializing a device, with the resultant initializing state being + * tgt_state. Blocks until the initializing thread has exited. 
+ * Caller must hold vdev_initialize_lock and must not be writing to the spa + * config, as the initializing thread may try to enter the config as a reader + * before exiting. + */ +void +vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state) +{ + spa_t *spa = vd->vdev_spa; + ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER)); + + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + + /* + * Allow cancel requests to proceed even if the initialize thread + * has stopped. + */ + if (vd->vdev_initialize_thread == NULL && + tgt_state != VDEV_INITIALIZE_CANCELED) { + return; + } + + vdev_initialize_change_state(vd, tgt_state); + vd->vdev_initialize_exit_wanted = B_TRUE; + while (vd->vdev_initialize_thread != NULL) + cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock); + + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + vd->vdev_initialize_exit_wanted = B_FALSE; +} + +static void +vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state) +{ + if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { + mutex_enter(&vd->vdev_initialize_lock); + vdev_initialize_stop(vd, tgt_state); + mutex_exit(&vd->vdev_initialize_lock); + return; + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state); + } +} + +/* + * Convenience function to stop initialization of a vdev tree and set all + * initialize thread pointers to NULL. + */ +void +vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) +{ + vdev_initialize_stop_all_impl(vd, tgt_state); + + if (vd->vdev_spa->spa_sync_on) { + /* Make sure that our state has been synced to disk */ + txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); + } +} + +void +vdev_initialize_restart(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + + if (vd->vdev_leaf_zap != 0) { + mutex_enter(&vd->vdev_initialize_lock); + uint64_t initialize_state = VDEV_INITIALIZE_NONE; + int err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE, + sizeof (initialize_state), 1, &initialize_state); + ASSERT(err == 0 || err == ENOENT); + vd->vdev_initialize_state = initialize_state; + + uint64_t timestamp = 0; + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, + sizeof (timestamp), 1, &timestamp); + ASSERT(err == 0 || err == ENOENT); + vd->vdev_initialize_action_time = (time_t)timestamp; + + if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vd->vdev_offline) { + /* load progress for reporting, but don't resume */ + vdev_initialize_load(vd); + } else if (vd->vdev_initialize_state == + VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) { + vdev_initialize(vd); + } + + mutex_exit(&vd->vdev_initialize_lock); + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_initialize_restart(vd->vdev_child[i]); + } +} diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c index d993d2aec8e1..d66fa4ef822f 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c @@ -33,15 +33,15 @@ * 1. Uniquely identify this device as part of a ZFS pool and confirm its * identity within the pool. * - * 2. 
Verify that all the devices given in a configuration are present * within the pool. * - * 3. Determine the uberblock for the pool. + * 3. Determine the uberblock for the pool. * - * 4. In case of an import operation, determine the configuration of the + * 4. In case of an import operation, determine the configuration of the * toplevel vdev of which it is a part. * - * 5. If an import operation cannot find all the devices in the pool, + * 5. If an import operation cannot find all the devices in the pool, * provide enough information to the administrator to determine which * devices are missing. * @@ -77,9 +77,9 @@ * In order to identify which labels are valid, the labels are written in the * following manner: * - * 1. For each vdev, update 'L1' to the new label - * 2. Update the uberblock - * 3. For each vdev, update 'L2' to the new label + * 1. For each vdev, update 'L1' to the new label + * 2. Update the uberblock + * 3. For each vdev, update 'L2' to the new label * * Given arbitrary failure, we can determine the correct label to use based on * the transaction group. If we fail after updating L1 but before updating the @@ -117,19 +117,19 @@ * * The nvlist describing the pool and vdev contains the following elements: * - * version ZFS on-disk version - * name Pool name - * state Pool state - * txg Transaction group in which this label was written - * pool_guid Unique identifier for this pool - * vdev_tree An nvlist describing vdev tree. + * version ZFS on-disk version + * name Pool name + * state Pool state + * txg Transaction group in which this label was written + * pool_guid Unique identifier for this pool + * vdev_tree An nvlist describing vdev tree. * features_for_read * An nvlist of the features necessary for reading the MOS. * * Each leaf device label also contains the following: * - * top_guid Unique ID for top-level vdev in which this is contained - * guid Unique ID for the leaf vdev + * top_guid Unique ID for top-level vdev in which this is contained + * guid Unique ID for the leaf vdev * * The 'vs' configuration follows the format described in 'spa_config.c'. */ @@ -396,22 +396,33 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, * histograms. */ uint64_t seg_count = 0; + uint64_t to_alloc = vd->vdev_stat.vs_alloc; /* * There are the same number of allocated segments * as free segments, so we will have at least one - * entry per free segment. + * entry per free segment. However, small free + * segments (smaller than vdev_removal_max_span) + * will be combined with adjacent allocated segments + * as a single mapping. */ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { - seg_count += vd->vdev_mg->mg_histogram[i]; + if (1ULL << (i + 1) < vdev_removal_max_span) { + to_alloc += + vd->vdev_mg->mg_histogram[i] << + i + 1; + } else { + seg_count += + vd->vdev_mg->mg_histogram[i]; + } } /* - * The maximum length of a mapping is SPA_MAXBLOCKSIZE, - * so we need at least one entry per SPA_MAXBLOCKSIZE - * of allocated data. + * The maximum length of a mapping is + * zfs_remove_max_segment, so we need at least one entry + * per zfs_remove_max_segment of allocated data. 
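The histogram walk above mixes two estimates: free segments smaller than vdev_removal_max_span are folded into the allocated-byte total (note that "mg_histogram[i] << i + 1" parses as "<< (i + 1)", since + binds tighter than <<), while larger free segments each imply one mapping entry of their own. A userland sketch of the same computation, with assumed defaults (vdev_removal_max_span = 32KB, zfs_remove_max_segment = 16MB) and invented histogram contents:

#include <stdio.h>
#include <stdint.h>

#define HISTO_SIZE              40
#define REMOVAL_MAX_SPAN        (32 * 1024)             /* assumed default */
#define REMOVE_MAX_SEGMENT      (16 * 1024 * 1024)      /* assumed default */

int
main(void)
{
        uint64_t histogram[HISTO_SIZE] = { 0 };
        uint64_t alloc = 100ULL << 30;          /* 100GB allocated */
        uint64_t to_alloc = alloc;
        uint64_t seg_count = 0;

        histogram[12] = 5000;   /* e.g. 5000 free segments of ~8K */
        histogram[20] = 300;    /* 300 free segments of ~2M */

        for (int i = 0; i < HISTO_SIZE; i++) {
                /* bucket i holds free segments of roughly 2^(i+1) bytes */
                if ((1ULL << (i + 1)) < REMOVAL_MAX_SPAN)
                        to_alloc += histogram[i] << (i + 1);
                else
                        seg_count += histogram[i];
        }
        seg_count += to_alloc / REMOVE_MAX_SEGMENT;
        printf("estimated mapping entries: %llu\n",
            (unsigned long long)seg_count);
        return (0);
}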
*/ - seg_count += vd->vdev_stat.vs_alloc / SPA_MAXBLOCKSIZE; + seg_count += to_alloc / zfs_remove_max_segment; fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, seg_count * @@ -546,6 +557,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) abd_t *vp_abd; zio_t *zio; uint64_t best_txg = 0; + uint64_t label_txg = 0; int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; @@ -571,8 +583,6 @@ retry: if (zio_wait(zio) == 0 && nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), &label, 0) == 0) { - uint64_t label_txg = 0; - /* * Auxiliary vdevs won't have txg values in their * labels and newly added vdevs may not have been @@ -603,6 +613,15 @@ retry: goto retry; } + /* + * We found a valid label but it didn't pass txg restrictions. + */ + if (config == NULL && label_txg != 0) { + vdev_dbgmsg(vd, "label discarded as txg is too large " + "(%llu > %llu)", (u_longlong_t)label_txg, + (u_longlong_t)txg); + } + abd_free(vp_abd); return (config); @@ -1028,19 +1047,13 @@ retry: * among uberblocks with equal txg, choose the one with the latest timestamp. */ static int -vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) +vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { - if (ub1->ub_txg < ub2->ub_txg) - return (-1); - if (ub1->ub_txg > ub2->ub_txg) - return (1); - - if (ub1->ub_timestamp < ub2->ub_timestamp) - return (-1); - if (ub1->ub_timestamp > ub2->ub_timestamp) - return (1); + int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); + if (likely(cmp)) + return (cmp); - return (0); + return (AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp)); } struct ubl_cbdata { @@ -1167,10 +1180,13 @@ vdev_uberblock_sync_done(zio_t *zio) * Write the uberblock to all labels of all leaves of the specified vdev. */ static void -vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) +vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, + uberblock_t *ub, vdev_t *vd, int flags) { - for (uint64_t c = 0; c < vd->vdev_children; c++) - vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags); + for (uint64_t c = 0; c < vd->vdev_children; c++) { + vdev_uberblock_sync(zio, good_writes, + ub, vd->vdev_child[c], flags); + } if (!vd->vdev_ops->vdev_op_leaf) return; @@ -1188,7 +1204,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) for (int l = 0; l < VDEV_LABELS; l++) vdev_label_write(zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), - vdev_uberblock_sync_done, zio->io_private, + vdev_uberblock_sync_done, good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); abd_free(ub_abd); @@ -1202,10 +1218,10 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) zio_t *zio; uint64_t good_writes = 0; - zio = zio_root(spa, NULL, &good_writes, flags); + zio = zio_root(spa, NULL, NULL, flags); for (int v = 0; v < svdcount; v++) - vdev_uberblock_sync(zio, ub, svd[v], flags); + vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); (void) zio_wait(zio); @@ -1266,7 +1282,8 @@ vdev_label_sync_ignore_done(zio_t *zio) * Write all even or odd labels to all leaves of the specified vdev. 
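The L1 / uberblock / L2 write ordering described earlier in this file's comment makes crash recovery a pure txg comparison. A toy model of that decision rule (not the on-disk logic; the txg values are invented):

#include <stdio.h>
#include <stdint.h>

/*
 * After a crash, trust L1 if its txg matches the uberblock's (both
 * made it out before the failure); otherwise fall back to L2, which
 * still holds the previous txg.
 */
static const char *
pick_label(uint64_t l1_txg, uint64_t ub_txg, uint64_t l2_txg)
{
        if (l1_txg == ub_txg)
                return ("L1");
        return (l2_txg == ub_txg ? "L2" : "no consistent label");
}

int
main(void)
{
        /* crash after L1 was rewritten but before the uberblock */
        printf("crash mid-update: use %s\n", pick_label(11, 10, 10));
        /* crash after the uberblock but before L2 */
        printf("crash late-update: use %s\n", pick_label(11, 11, 10));
        return (0);
}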
*/ static void -vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) +vdev_label_sync(zio_t *zio, uint64_t *good_writes, + vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; @@ -1274,8 +1291,10 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) char *buf; size_t buflen; - for (int c = 0; c < vd->vdev_children; c++) - vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_label_sync(zio, good_writes, + vd->vdev_child[c], l, txg, flags); + } if (!vd->vdev_ops->vdev_op_leaf) return; @@ -1300,7 +1319,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), - vdev_label_sync_done, zio->io_private, + vdev_label_sync_done, good_writes, flags | ZIO_FLAG_DONT_PROPAGATE); } } @@ -1332,7 +1351,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) (vd->vdev_islog || vd->vdev_aux != NULL) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags); - vdev_label_sync(vio, vd, l, txg, flags); + vdev_label_sync(vio, good_writes, vd, l, txg, flags); zio_nowait(vio); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c index 60cb7aa96fca..26be35fc3501 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -516,13 +516,16 @@ vdev_mirror_io_start(zio_t *zio) } if (zio->io_type == ZIO_TYPE_READ) { - if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering && + if (zio->io_bp != NULL && + (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering && mm->mm_children > 1) { /* - * For scrubbing reads we need to allocate a read - * buffer for each child and issue reads to all - * children. If any child succeeds, it will copy its - * data into zio->io_data in vdev_mirror_scrub_done. + * For scrubbing reads (if we can verify the + * checksum here, as indicated by io_bp being + * non-NULL) we need to allocate a read buffer for + * each child and issue reads to all children. If + * any child succeeds, it will copy its data into + * zio->io_data in vdev_mirror_scrub_done. */ for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; @@ -677,7 +680,21 @@ vdev_mirror_io_done(zio_t *zio) if (mc->mc_error == 0) { if (mc->mc_tried) continue; + /* + * We didn't try this child. We need to + * repair it if: + * 1. it's a scrub (in which case we have + * tried everything that was healthy) + * - or - + * 2. it's an indirect vdev (in which case + * it could point to any other vdev, which + * might have a bad DTL) + * - or - + * 3. 
the DTL indicates that this data is + * missing from this vdev + */ if (!(zio->io_flags & ZIO_FLAG_SCRUB) && + mc->mc_vd->vdev_ops != &vdev_indirect_ops && !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; @@ -723,6 +740,7 @@ vdev_ops_t vdev_mirror_ops = { NULL, NULL, NULL, + vdev_default_xlate, VDEV_TYPE_MIRROR, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -738,6 +756,7 @@ vdev_ops_t vdev_replacing_ops = { NULL, NULL, NULL, + vdev_default_xlate, VDEV_TYPE_REPLACING, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -753,6 +772,7 @@ vdev_ops_t vdev_spare_ops = { NULL, NULL, NULL, + vdev_default_xlate, VDEV_TYPE_SPARE, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c index 29194fc11065..6852de445049 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ /* @@ -91,6 +91,7 @@ vdev_ops_t vdev_missing_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_MISSING, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -106,6 +107,7 @@ vdev_ops_t vdev_hole_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_HOLE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c index 5a3ba1b3e983..78b725a37b68 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -157,6 +157,8 @@ uint32_t zfs_vdev_trim_min_active = 1; uint32_t zfs_vdev_trim_max_active = 64; uint32_t zfs_vdev_removal_min_active = 1; uint32_t zfs_vdev_removal_max_active = 2; +uint32_t zfs_vdev_initializing_min_active = 1; +uint32_t zfs_vdev_initializing_max_active = 1; /* @@ -195,6 +197,14 @@ int zfs_vdev_queue_depth_pct = 1000; int zfs_vdev_queue_depth_pct = 300; #endif +/* + * When performing allocations for a given metaslab, we want to make sure that + * there are enough IOs to aggregate together to improve throughput. We want to + * ensure that there are at least 128k worth of IOs that can be aggregated, and + * we assume that the average allocation size is 4k, so we need the queue depth + * to be 32 per allocator to get good aggregation of sequential writes. 
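The default of 32 defined just after this comment falls directly out of the arithmetic stated here: a 128K aggregation target divided by an assumed 4K average allocation size gives 128K / 4K = 32 queued I/Os per allocator.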
+ */ +int zfs_vdev_def_queue_depth = 32; #ifdef __FreeBSD__ #ifdef _KERNEL @@ -301,20 +311,15 @@ sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS) int vdev_queue_offset_compare(const void *x1, const void *x2) { - const zio_t *z1 = x1; - const zio_t *z2 = x2; + const zio_t *z1 = (const zio_t *)x1; + const zio_t *z2 = (const zio_t *)x2; - if (z1->io_offset < z2->io_offset) - return (-1); - if (z1->io_offset > z2->io_offset) - return (1); + int cmp = AVL_CMP(z1->io_offset, z2->io_offset); - if (z1 < z2) - return (-1); - if (z1 > z2) - return (1); + if (likely(cmp)) + return (cmp); - return (0); + return (AVL_PCMP(z1, z2)); } static inline avl_tree_t * @@ -534,6 +539,8 @@ vdev_queue_class_min_active(zio_priority_t p) return (zfs_vdev_trim_min_active); case ZIO_PRIORITY_REMOVAL: return (zfs_vdev_removal_min_active); + case ZIO_PRIORITY_INITIALIZING: + return (zfs_vdev_initializing_min_active); default: panic("invalid priority %u", p); return (0); @@ -597,6 +604,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) return (zfs_vdev_trim_max_active); case ZIO_PRIORITY_REMOVAL: return (zfs_vdev_removal_max_active); + case ZIO_PRIORITY_INITIALIZING: + return (zfs_vdev_initializing_max_active); default: panic("invalid priority %u", p); return (0); @@ -824,8 +833,8 @@ again: } /* - * For LBA-ordered queues (async / scrub), issue the i/o which follows - * the most recently issued i/o in LBA (offset) order. + * For LBA-ordered queues (async / scrub / initializing), issue the + * i/o which follows the most recently issued i/o in LBA (offset) order. * * For FIFO queues (sync), issue the i/o with the lowest timestamp. */ @@ -881,12 +890,14 @@ vdev_queue_io(zio_t *zio) if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB && - zio->io_priority != ZIO_PRIORITY_REMOVAL) + zio->io_priority != ZIO_PRIORITY_REMOVAL && + zio->io_priority != ZIO_PRIORITY_INITIALIZING) zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && - zio->io_priority != ZIO_PRIORITY_REMOVAL) + zio->io_priority != ZIO_PRIORITY_REMOVAL && + zio->io_priority != ZIO_PRIORITY_INITIALIZING) zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } else { ASSERT(zio->io_type == ZIO_TYPE_FREE); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c index d6f4bbc4156a..4df04c30aabf 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -41,6 +41,10 @@ #include <sys/fm/fs/zfs.h> #include <sys/bio.h> +#ifdef ZFS_DEBUG +#include <sys/vdev_initialize.h> /* vdev_xlate testing */ +#endif + /* * Virtual device vector for RAID-Z. 
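One detail of the vdev_queue_offset_compare() rewrite above is worth spelling out: the pointer fallback is not cosmetic. The per-priority queues are AVL trees, which need a total order, and two zios can legitimately share an io_offset; comparing the zio pointers themselves keeps distinct nodes distinct (any stable, unique per-node key would do). A minimal sketch of the pattern:

static int
offset_then_identity_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = (const zio_t *)x1;
	const zio_t *z2 = (const zio_t *)x2;
	int cmp = AVL_CMP(z1->io_offset, z2->io_offset);

	/* Never report two distinct zios as equal. */
	return (cmp != 0 ? cmp : AVL_PCMP(z1, z2));
}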
* @@ -1896,6 +1900,39 @@ vdev_raidz_child_done(zio_t *zio) rc->rc_skipped = 0; } +static void +vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) +{ +#ifdef ZFS_DEBUG + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = zio->io_offset; + logical_rs.rs_end = logical_rs.rs_start + + vdev_raidz_asize(zio->io_vd, zio->io_size); + + raidz_col_t *rc = &rm->rm_col[col]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + vdev_xlate(cvd, &logical_rs, &physical_rs); + ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); + ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); + /* + * It would be nice to assert that rs_end is equal + * to rc_offset + rc_size but there might be an + * optional I/O at the end that is not accounted in + * rc_size. + */ + if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + + rc->rc_size + (1 << tvd->vdev_ashift)); + } else { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); + } +#endif +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1953,6 +1990,12 @@ vdev_raidz_io_start(zio_t *zio) for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; + + /* + * Verify physical to logical translation. + */ + vdev_raidz_io_verify(zio, rm, c); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, @@ -2622,6 +2665,37 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) return (B_FALSE); } +static void +vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) +{ + vdev_t *raidvd = cvd->vdev_parent; + ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); + + uint64_t width = raidvd->vdev_children; + uint64_t tgt_col = cvd->vdev_id; + uint64_t ashift = raidvd->vdev_top->vdev_ashift; + + /* make sure the offsets are block-aligned */ + ASSERT0(in->rs_start % (1 << ashift)); + ASSERT0(in->rs_end % (1 << ashift)); + uint64_t b_start = in->rs_start >> ashift; + uint64_t b_end = in->rs_end >> ashift; + + uint64_t start_row = 0; + if (b_start > tgt_col) /* avoid underflow */ + start_row = ((b_start - tgt_col - 1) / width) + 1; + + uint64_t end_row = 0; + if (b_end > tgt_col) + end_row = ((b_end - tgt_col - 1) / width) + 1; + + res->rs_start = start_row << ashift; + res->rs_end = end_row << ashift; + + ASSERT3U(res->rs_start, <=, in->rs_start); + ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); +} + vdev_ops_t vdev_raidz_ops = { vdev_raidz_open, vdev_raidz_close, @@ -2633,6 +2707,7 @@ vdev_ops_t vdev_raidz_ops = { NULL, NULL, NULL, + vdev_raidz_xlate, VDEV_TYPE_RAIDZ, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c index c864ab1cb0c1..20fa9c24db24 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c @@ -44,6 +44,7 @@ #include <sys/vdev_indirect_births.h> #include <sys/vdev_indirect_mapping.h> #include <sys/abd.h> +#include <sys/vdev_initialize.h> /* * This file contains the necessary logic to remove vdevs from a @@ -83,18 +84,12 @@ typedef struct vdev_copy_arg { kmutex_t vca_lock; } vdev_copy_arg_t; -typedef struct vdev_copy_seg_arg { - vdev_copy_arg_t *vcsa_copy_arg; - uint64_t vcsa_txg; - dva_t *vcsa_dest_dva; - blkptr_t *vcsa_dest_bp; -} vdev_copy_seg_arg_t; - /* - * 
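A worked example makes the row arithmetic in vdev_raidz_xlate() concrete (the numbers are hypothetical, chosen for illustration): translate the logical range [0, 3072) for the child in column 1 of a 3-child raidz vdev with ashift = 9.

/*
 * width = 3, tgt_col = 1, ashift = 9, in = [0, 3072)
 *
 * b_start = 0 >> 9    = 0;  b_start <= tgt_col, so start_row = 0
 * b_end   = 3072 >> 9 = 6;  b_end > tgt_col, so
 *                           end_row = ((6 - 1 - 1) / 3) + 1 = 2
 *
 * res = [0 << 9, 2 << 9) = [0, 1024)
 */

Logical blocks 0..5 rotate over columns 0, 1, 2, 0, 1, 2, so column 1 holds exactly two of them (rows 0 and 1), and the translated child range [0, 1024) satisfies both trailing asserts.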
The maximum amount of allowed data we're allowed to copy from a device - * at a time when removing it. + * The maximum amount of memory we can use for outstanding i/o while + * doing a device removal. This determines how much i/o we can have + * in flight concurrently. */ -int zfs_remove_max_copy_bytes = 8 * 1024 * 1024; +int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; /* * The largest contiguous segment that we will attempt to allocate when @@ -111,6 +106,24 @@ int zfs_remove_max_copy_bytes = 8 * 1024 * 1024; int zfs_remove_max_segment = 1024 * 1024; /* + * Allow a remap segment to span free chunks of at most this size. The main + * impact of a larger span is that we will read and write larger, more + * contiguous chunks, with more "unnecessary" data -- trading off bandwidth + * for iops. The value here was chosen to align with + * zfs_vdev_read_gap_limit, which is a similar concept when doing regular + * reads (but there's no reason it has to be the same). + * + * Additionally, a higher span will have the following relatively minor + * effects: + * - the mapping will be smaller, since one entry can cover more allocated + * segments + * - more of the fragmentation in the removing device will be preserved + * - we'll do larger allocations, which may fail and fall back on smaller + * allocations + */ +int vdev_removal_max_span = 32 * 1024; + +/* * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a removal. */ @@ -176,7 +189,7 @@ spa_vdev_removal_create(vdev_t *vd) mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); svr->svr_allocd_segs = range_tree_create(NULL, NULL); - svr->svr_vdev = vd; + svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { svr->svr_frees[i] = range_tree_create(NULL, NULL); @@ -218,9 +231,10 @@ spa_vdev_removal_destroy(spa_vdev_removal_t *svr) static void vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) { - vdev_t *vd = arg; + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; - spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; spa_vdev_removal_t *svr = NULL; uint64_t txg = dmu_tx_get_txg(tx); @@ -342,7 +356,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) ASSERT3P(spa->spa_vdev_removal, ==, NULL); spa->spa_vdev_removal = svr; svr->svr_thread = thread_create(NULL, 0, - spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri); + spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } /* @@ -384,21 +398,24 @@ spa_remove_init(spa_t *spa) spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, spa->spa_removing_phys.sr_removing_vdev); - spa_config_exit(spa, SCL_STATE, FTAG); - if (vd == NULL) + if (vd == NULL) { + spa_config_exit(spa, SCL_STATE, FTAG); return (EINVAL); + } vdev_indirect_config_t *vic = &vd->vdev_indirect_config; ASSERT(vdev_is_concrete(vd)); spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); - ASSERT(svr->svr_vdev->vdev_removing); + ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id); + ASSERT(vd->vdev_removing); vd->vdev_indirect_mapping = vdev_indirect_mapping_open( spa->spa_meta_objset, vic->vic_mapping_object); vd->vdev_indirect_births = vdev_indirect_births_open( spa->spa_meta_objset, vic->vic_births_object); + spa_config_exit(spa, SCL_STATE, FTAG); spa->spa_vdev_removal = svr; } @@ -451,15 +468,8 @@ spa_restart_removal(spa_t *spa) if 
(!spa_writeable(spa)) return; - vdev_t *vd = svr->svr_vdev; - vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; - - ASSERT3P(vd, !=, NULL); - ASSERT(vd->vdev_removing); - - zfs_dbgmsg("restarting removal of %llu at count=%llu", - vd->vdev_id, vdev_indirect_mapping_num_entries(vim)); - svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd, + zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id); + svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } @@ -480,7 +490,7 @@ free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, vdev_indirect_mapping_object(vim)); - ASSERT3P(vd, ==, svr->svr_vdev); + ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); mutex_enter(&svr->svr_lock); @@ -663,7 +673,7 @@ spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) if (state == DSS_FINISHED) { spa_removing_phys_t *srp = &spa->spa_removing_phys; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (srp->sr_prev_indirect_vdev != UINT64_MAX) { @@ -706,7 +716,7 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; uint64_t txg = dmu_tx_get_txg(tx); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; @@ -734,85 +744,249 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx) spa_sync_removing_state(spa, tx); } +typedef struct vdev_copy_segment_arg { + spa_t *vcsa_spa; + dva_t *vcsa_dest_dva; + uint64_t vcsa_txg; + range_tree_t *vcsa_obsolete_segs; +} vdev_copy_segment_arg_t; + +static void +unalloc_seg(void *arg, uint64_t start, uint64_t size) +{ + vdev_copy_segment_arg_t *vcsa = arg; + spa_t *spa = vcsa->vcsa_spa; + blkptr_t bp = { 0 }; + + BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL); + BP_SET_LSIZE(&bp, size); + BP_SET_PSIZE(&bp, size); + BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(&bp, DMU_OT_NONE); + BP_SET_LEVEL(&bp, 0); + BP_SET_DEDUP(&bp, 0); + BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER); + + DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva)); + DVA_SET_OFFSET(&bp.blk_dva[0], + DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start); + DVA_SET_ASIZE(&bp.blk_dva[0], size); + + zio_free(spa, vcsa->vcsa_txg, &bp); +} + +/* + * All reads and writes associated with a call to spa_vdev_copy_segment() + * are done. + */ +static void +spa_vdev_copy_segment_done(zio_t *zio) +{ + vdev_copy_segment_arg_t *vcsa = zio->io_private; + + range_tree_vacate(vcsa->vcsa_obsolete_segs, + unalloc_seg, vcsa); + range_tree_destroy(vcsa->vcsa_obsolete_segs); + kmem_free(vcsa, sizeof (*vcsa)); + + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +/* + * The write of the new location is done. 
+ */ static void spa_vdev_copy_segment_write_done(zio_t *zio) { - vdev_copy_seg_arg_t *vcsa = zio->io_private; - vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg; - spa_config_exit(zio->io_spa, SCL_STATE, FTAG); + vdev_copy_arg_t *vca = zio->io_private; + abd_free(zio->io_abd); mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes -= zio->io_size; cv_signal(&vca->vca_cv); mutex_exit(&vca->vca_lock); - - ASSERT0(zio->io_error); - kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t)); - kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t)); } +/* + * The read of the old location is done. The parent zio is the write to + * the new location. Allow it to start. + */ static void spa_vdev_copy_segment_read_done(zio_t *zio) { - vdev_copy_seg_arg_t *vcsa = zio->io_private; - dva_t *dest_dva = vcsa->vcsa_dest_dva; - uint64_t txg = vcsa->vcsa_txg; - spa_t *spa = zio->io_spa; - vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva)); - blkptr_t *bp = NULL; - dva_t *dva = NULL; - uint64_t size = zio->io_size; - - ASSERT3P(dest_vd, !=, NULL); - ASSERT0(zio->io_error); - - vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); - bp = vcsa->vcsa_dest_bp; - dva = bp->blk_dva; - - BP_ZERO(bp); - - /* initialize with dest_dva */ - bcopy(dest_dva, dva, sizeof (dva_t)); - BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); - - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); - BP_SET_TYPE(bp, DMU_OT_NONE); - BP_SET_LEVEL(bp, 0); - BP_SET_DEDUP(bp, 0); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa, - txg, bp, zio->io_abd, size, - spa_vdev_copy_segment_write_done, vcsa, - ZIO_PRIORITY_REMOVAL, 0, NULL)); + zio_nowait(zio_unique_parent(zio)); +} + +/* + * If the old and new vdevs are mirrors, we will read both sides of the old + * mirror, and write each copy to the corresponding side of the new mirror. + * If the old and new vdevs have a different number of children, we will do + * this as best as possible. Since we aren't verifying checksums, this + * ensures that as long as there's a good copy of the data, we'll have a + * good copy after the removal, even if there's silent damage to one side + * of the mirror. If we're removing a mirror that has some silent damage, + * we'll have exactly the same damage in the new location (assuming that + * the new location is also a mirror). + * + * We accomplish this by creating a tree of zio_t's, with as many writes as + * there are "children" of the new vdev (a non-redundant vdev counts as one + * child, a 2-way mirror has 2 children, etc). Each write has an associated + * read from a child of the old vdev. Typically there will be the same + * number of children of the old and new vdevs. However, if there are more + * children of the new vdev, some child(ren) of the old vdev will be issued + * multiple reads. If there are more children of the old vdev, some copies + * will be dropped. + * + * For example, the tree of zio_t's for a 2-way mirror is: + * + * null + * / \ + * write(new vdev, child 0) write(new vdev, child 1) + * | | + * read(old vdev, child 0) read(old vdev, child 1) + * + * Child zio's complete before their parents complete. However, zio's + * created with zio_vdev_child_io() may be issued before their children + * complete. In this case we need to make sure that the children (reads) + * complete before the parents (writes) are *issued*. 
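To make the mismatched-width behavior concrete (hypothetical widths): copying from a 2-way mirror to a 3-way mirror, destination children 0, 1 and 2 read from source children 0 % 2 = 0, 1 % 2 = 1 and 2 % 2 = 0, so one side of the old mirror is read twice; copying from a 3-way to a 2-way mirror, source child 2 is never read and its (possibly unique good) copy is dropped. The wrap-around is the dest_id % source_vd->vdev_children selection in spa_vdev_copy_one_child() below.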
We do this by not + * calling zio_nowait() on each write until its corresponding read has + * completed. + * + * The spa_config_lock must be held while zio's created by + * zio_vdev_child_io() are in progress, to ensure that the vdev tree does + * not change (e.g. due to a concurrent "zpool attach/detach"). The "null" + * zio is needed to release the spa_config_lock after all the reads and + * writes complete. (Note that we can't grab the config lock for each read, + * because it is not reentrant - we could deadlock with a thread waiting + * for a write lock.) + */ +static void +spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, + vdev_t *source_vd, uint64_t source_offset, + vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size) +{ + ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0); + + mutex_enter(&vca->vca_lock); + vca->vca_outstanding_bytes += size; + mutex_exit(&vca->vca_lock); + + abd_t *abd = abd_alloc_for_io(size, B_FALSE); + + vdev_t *source_child_vd; + if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) { + /* + * Source and dest are both mirrors. Copy from the same + * child id as we are copying to (wrapping around if there + * are more dest children than source children). + */ + source_child_vd = + source_vd->vdev_child[dest_id % source_vd->vdev_children]; + } else { + source_child_vd = source_vd; + } + + zio_t *write_zio = zio_vdev_child_io(nzio, NULL, + dest_child_vd, dest_offset, abd, size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + spa_vdev_copy_segment_write_done, vca); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + source_child_vd, source_offset, abd, size, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + spa_vdev_copy_segment_read_done, vca)); } +/* + * Allocate a new location for this segment, and create the zio_t's to + * read from the old location and write to the new location. + */ static int -spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, +spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, + uint64_t maxalloc, uint64_t txg, vdev_copy_arg_t *vca, zio_alloc_list_t *zal) { metaslab_group_t *mg = vd->vdev_mg; spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_entry_t *entry; - vdev_copy_seg_arg_t *private; dva_t dst = { 0 }; - blkptr_t blk, *bp = &blk; - dva_t *dva = bp->blk_dva; + uint64_t start = range_tree_min(segs); - ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); + uint64_t size = range_tree_span(segs); + if (range_tree_span(segs) > maxalloc) { + /* + * We can't allocate all the segments. Prefer to end + * the allocation at the end of a segment, thus avoiding + * additional split blocks. + */ + range_seg_t search; + avl_index_t where; + search.rs_start = start + maxalloc; + search.rs_end = search.rs_start; + range_seg_t *rs = avl_find(&segs->rt_root, &search, &where); + if (rs == NULL) { + rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE); + } else { + rs = AVL_PREV(&segs->rt_root, rs); + } + if (rs != NULL) { + size = rs->rs_end - start; + } else { + /* + * There are no segments that end before maxalloc. + * I.e. the first segment is larger than maxalloc, + * so we must split it. + */ + size = maxalloc; + } + } + ASSERT3U(size, <=, maxalloc); + + /* + * We use allocator 0 for this I/O because we don't expect device remap + * to be the steady state of the system, so parallelizing is not as + * critical as it is for other allocation types. 
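Condensed to a skeleton (helper names and variables here are placeholders, and error paths are elided), the deferred-issue pattern described above looks like this:

zio_t *nzio = zio_null(pio, spa, NULL, all_done, arg, 0);
zio_t *wzio = zio_vdev_child_io(nzio, NULL, dest_vd, dst_off, abd, size,
    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
    write_done, arg);				/* created, but not issued */
zio_nowait(zio_vdev_child_io(wzio, NULL, src_vd, src_off, abd, size,
    ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
    read_done, arg));				/* issued immediately */
zio_nowait(nzio);

static void
read_done(zio_t *zio)
{
	/* The data is in the shared abd; only now release the write. */
	zio_nowait(zio_unique_parent(zio));
}

The write zio exists from the start so the read has a parent to attach to, but it is handed to zio_nowait() only from the read's done callback -- exactly what spa_vdev_copy_segment_read_done() does above.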
We also want to ensure + * that the IOs are allocated together as much as possible, to reduce + * mapping sizes. + */ int error = metaslab_alloc_dva(spa, mg->mg_class, size, - &dst, 0, NULL, txg, 0, zal); + &dst, 0, NULL, txg, 0, zal, 0); if (error != 0) return (error); /* + * Determine the ranges that are not actually needed. Offsets are + * relative to the start of the range to be copied (i.e. relative to the + * local variable "start"). + */ + range_tree_t *obsolete_segs = range_tree_create(NULL, NULL); + + range_seg_t *rs = avl_first(&segs->rt_root); + ASSERT3U(rs->rs_start, ==, start); + uint64_t prev_seg_end = rs->rs_end; + while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) { + if (rs->rs_start >= start + size) { + break; + } else { + range_tree_add(obsolete_segs, + prev_seg_end - start, + rs->rs_start - prev_seg_end); + } + prev_seg_end = rs->rs_end; + } + /* We don't end in the middle of an obsolete range */ + ASSERT3U(start + size, <=, prev_seg_end); + + range_tree_clear(segs, start, size); + + /* * We can't have any padding of the allocated size, otherwise we will * misunderstand what's allocated, and the size of the mapping. * The caller ensures this will be true by passing in a size that is @@ -820,51 +994,37 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg, */ ASSERT3U(DVA_GET_ASIZE(&dst), ==, size); - mutex_enter(&vca->vca_lock); - vca->vca_outstanding_bytes += size; - mutex_exit(&vca->vca_lock); - entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); entry->vime_mapping.vimep_dst = dst; + if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + entry->vime_obsolete_count = range_tree_space(obsolete_segs); + } - private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP); - private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; - private->vcsa_txg = txg; - private->vcsa_copy_arg = vca; + vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP); + vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; + vcsa->vcsa_obsolete_segs = obsolete_segs; + vcsa->vcsa_spa = spa; + vcsa->vcsa_txg = txg; /* - * This lock is eventually released by the donefunc for the - * zio_write_phys that finishes copying the data. + * See comment before spa_vdev_copy_one_child(). */ - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - - /* - * Do logical I/O, letting the redundancy vdevs (like mirror) - * handle their own I/O instead of duplicating that code here. 
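A short worked example of the obsolete-range bookkeeping above (hypothetical offsets): suppose segs holds allocated ranges [1024K, 1040K) and [1064K, 1072K), and the chunk being copied is the full 48K span with start = 1024K. The loop records a single obsolete range at offset 16K, length 24K -- the free gap between the two segments, expressed relative to start. That 24K becomes vime_obsolete_count on the new mapping entry (when the obsolete-counts feature is enabled), and it is the space unalloc_seg() hands back via zio_free() once the copy's zio tree completes.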
- */ - BP_ZERO(bp); - - DVA_SET_VDEV(&dva[0], vd->vdev_id); - DVA_SET_OFFSET(&dva[0], start); - DVA_SET_GANG(&dva[0], 0); - DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size)); - - BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); - - BP_SET_LSIZE(bp, size); - BP_SET_PSIZE(bp, size); - BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); - BP_SET_TYPE(bp, DMU_OT_NONE); - BP_SET_LEVEL(bp, 0); - BP_SET_DEDUP(bp, 0); - BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, - bp, abd_alloc_for_io(size, B_FALSE), size, - spa_vdev_copy_segment_read_done, private, - ZIO_PRIORITY_REMOVAL, 0, NULL)); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL, + spa_vdev_copy_segment_done, vcsa, 0); + vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst)); + if (dest_vd->vdev_ops == &vdev_mirror_ops) { + for (int i = 0; i < dest_vd->vdev_children; i++) { + vdev_t *child = dest_vd->vdev_child[i]; + spa_vdev_copy_one_child(vca, nzio, vd, start, + child, DVA_GET_OFFSET(&dst), i, size); + } + } else { + spa_vdev_copy_one_child(vca, nzio, vd, start, + dest_vd, DVA_GET_OFFSET(&dst), -1, size); + } + zio_nowait(nzio); list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); @@ -882,8 +1042,8 @@ static void vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; - vdev_t *vd = svr->svr_vdev; - spa_t *spa = vd->vdev_spa; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); @@ -912,37 +1072,6 @@ vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) } static void -vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd) -{ - ivd->vdev_indirect_config = vd->vdev_indirect_config; - - ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL); - ASSERT(vd->vdev_indirect_mapping != NULL); - ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping; - vd->vdev_indirect_mapping = NULL; - - ASSERT3P(ivd->vdev_indirect_births, ==, NULL); - ASSERT(vd->vdev_indirect_births != NULL); - ivd->vdev_indirect_births = vd->vdev_indirect_births; - vd->vdev_indirect_births = NULL; - - ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); - ASSERT0(range_tree_space(ivd->vdev_obsolete_segments)); - - if (vd->vdev_obsolete_sm != NULL) { - ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize); - - /* - * We cannot use space_map_{open,close} because we hold all - * the config locks as writer. - */ - ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL); - ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm; - vd->vdev_obsolete_sm = NULL; - } -} - -static void vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) { ASSERT3P(zlist, !=, NULL); @@ -977,17 +1106,13 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) vdev_remove_enlist_zaps(vd, svr->svr_zaplist); ivd = vdev_add_parent(vd, &vdev_indirect_ops); + ivd->vdev_removing = 0; vd->vdev_leaf_zap = 0; vdev_remove_child(ivd, vd); vdev_compact_children(ivd); - vdev_indirect_state_transfer(ivd, vd); - - svr->svr_vdev = ivd; - - ASSERT(!ivd->vdev_removing); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -1010,9 +1135,8 @@ vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) * context by the removal thread after we have copied all vdev's data. 
*/ static void -vdev_remove_complete(vdev_t *vd) +vdev_remove_complete(spa_t *spa) { - spa_t *spa = vd->vdev_spa; uint64_t txg; /* @@ -1020,8 +1144,13 @@ vdev_remove_complete(vdev_t *vd) * vdev_metaslab_fini() */ txg_wait_synced(spa->spa_dsl_pool, 0); - txg = spa_vdev_enter(spa); + vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + + sysevent_t *ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_DEV); + zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", vd->vdev_id, txg); @@ -1041,6 +1170,10 @@ vdev_remove_complete(vdev_t *vd) /* * We now release the locks, allowing spa_sync to run and finish the * removal via vdev_remove_complete_sync in syncing context. + * + * Note that we hold on to the vdev_t that has been replaced. Since + * it isn't part of the vdev tree any longer, it can't be concurrently + * manipulated, even while we don't have the config lock. */ (void) spa_vdev_exit(spa, NULL, txg, 0); @@ -1062,6 +1195,8 @@ vdev_remove_complete(vdev_t *vd) */ vdev_config_dirty(spa->spa_root_vdev); (void) spa_vdev_exit(spa, vd, txg, 0); + + spa_event_post(ev); } /* @@ -1072,7 +1207,7 @@ vdev_remove_complete(vdev_t *vd) * this size again this txg. */ static void -spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, +spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, uint64_t *max_alloc, dmu_tx_t *tx) { uint64_t txg = dmu_tx_get_txg(tx); @@ -1080,39 +1215,78 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, mutex_enter(&svr->svr_lock); - range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root); - if (rs == NULL) { + /* + * Determine how big of a chunk to copy. We can allocate up + * to max_alloc bytes, and we can span up to vdev_removal_max_span + * bytes of unallocated space at a time. "segs" will track the + * allocated segments that we are copying. We may also be copying + * free segments (of up to vdev_removal_max_span bytes). + */ + range_tree_t *segs = range_tree_create(NULL, NULL); + for (;;) { + range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root); + if (rs == NULL) + break; + + uint64_t seg_length; + + if (range_tree_is_empty(segs)) { + /* need to truncate the first seg based on max_alloc */ + seg_length = + MIN(rs->rs_end - rs->rs_start, *max_alloc); + } else { + if (rs->rs_start - range_tree_max(segs) > + vdev_removal_max_span) { + /* + * Including this segment would cause us to + * copy a larger unneeded chunk than is allowed. + */ + break; + } else if (rs->rs_end - range_tree_min(segs) > + *max_alloc) { + /* + * This additional segment would extend past + * max_alloc. Rather than splitting this + * segment, leave it for the next mapping. 
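Worked through with hypothetical numbers (vdev_removal_max_span = 32K, *max_alloc = 1M) and allocated segments [0, 100K), [120K, 130K), [200K, 210K): the loop takes [0, 100K) (truncated only by max_alloc, which it fits), then [120K, 130K) because the 20K gap is within the span limit and the 130K total extent is within max_alloc, then stops at [200K, 210K) because the 70K gap exceeds vdev_removal_max_span. The resulting chunk copies 110K of allocated data plus 20K of "unnecessary" free space as one contiguous 130K read/write -- the bandwidth-for-iops trade described at vdev_removal_max_span's definition.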
+ */ + break; + } else { + seg_length = rs->rs_end - rs->rs_start; + } + } + + range_tree_add(segs, rs->rs_start, seg_length); + range_tree_remove(svr->svr_allocd_segs, + rs->rs_start, seg_length); + } + + if (range_tree_is_empty(segs)) { mutex_exit(&svr->svr_lock); + range_tree_destroy(segs); return; } - uint64_t offset = rs->rs_start; - uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc); - - range_tree_remove(svr->svr_allocd_segs, offset, length); if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, svr, 0, ZFS_SPACE_CHECK_NONE, tx); } - svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length; + svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs); /* * Note: this is the amount of *allocated* space * that we are taking care of each txg. */ - svr->svr_bytes_done[txg & TXG_MASK] += length; + svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs); mutex_exit(&svr->svr_lock); zio_alloc_list_t zal; metaslab_trace_init(&zal); - uint64_t thismax = *max_alloc; - while (length > 0) { - uint64_t mylen = MIN(length, thismax); - - int error = spa_vdev_copy_segment(svr->svr_vdev, - offset, mylen, txg, vca, &zal); + uint64_t thismax = SPA_MAXBLOCKSIZE; + while (!range_tree_is_empty(segs)) { + int error = spa_vdev_copy_segment(vd, + segs, thismax, txg, vca, &zal); if (error == ENOSPC) { /* @@ -1126,18 +1300,17 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, */ ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); - thismax = P2ROUNDUP(mylen / 2, + uint64_t attempted = + MIN(range_tree_span(segs), thismax); + thismax = P2ROUNDUP(attempted / 2, 1 << spa->spa_max_ashift); - ASSERT3U(thismax, <, mylen); /* * The minimum-size allocation can not fail. */ - ASSERT3U(mylen, >, 1 << spa->spa_max_ashift); - *max_alloc = mylen - (1 << spa->spa_max_ashift); + ASSERT3U(attempted, >, 1 << spa->spa_max_ashift); + *max_alloc = attempted - (1 << spa->spa_max_ashift); } else { ASSERT0(error); - length -= mylen; - offset += mylen; /* * We've performed an allocation, so reset the @@ -1148,6 +1321,7 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, } } metaslab_trace_fini(&zal); + range_tree_destroy(segs); } /* @@ -1169,12 +1343,14 @@ spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, static void spa_vdev_remove_thread(void *arg) { - vdev_t *vd = arg; - spa_t *spa = vd->vdev_spa; + spa_t *spa = arg; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_copy_arg_t vca; uint64_t max_alloc = zfs_remove_max_segment; uint64_t last_txg = 0; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); @@ -1182,7 +1358,6 @@ spa_vdev_remove_thread(void *arg) ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_removing); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); - ASSERT3P(svr->svr_vdev, ==, vd); ASSERT(vim != NULL); mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); @@ -1264,6 +1439,17 @@ spa_vdev_remove_thread(void *arg) mutex_exit(&svr->svr_lock); /* + * We need to periodically drop the config lock so that + * writers can get in. Additionally, we can't wait + * for a txg to sync while holding a config lock + * (since a waiting writer could cause a 3-way deadlock + * with the sync thread, which also gets a config + * lock for reader). 
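The ENOSPC back-off above is geometric: with an assumed ashift of 12 (4K blocks), a failed 1M attempt retries with thismax = P2ROUNDUP(1M / 2, 4K) = 512K, then 256K, and so on down to a single 4K block, which the assert says cannot fail. *max_alloc is simultaneously clamped to attempted - 4K, so the chunks built in later txgs start strictly below the size that just failed.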
So we can't hold the config lock + * while calling dmu_tx_assign(). + */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* * This delay will pause the removal around the point * specified by zfs_remove_max_bytes_pause. We do this * solely from the test suite or during debugging. @@ -1289,11 +1475,19 @@ spa_vdev_remove_thread(void *arg) VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); + /* + * Reacquire the vdev_config lock. The vdev_t + * that we're removing may have changed, e.g. due + * to a vdev_attach or vdev_detach. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vd = vdev_lookup_top(spa, svr->svr_vdev_id); + if (txg != last_txg) max_alloc = zfs_remove_max_segment; last_txg = txg; - spa_vdev_copy_impl(svr, &vca, &max_alloc, tx); + spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); dmu_tx_commit(tx); mutex_enter(&svr->svr_lock); @@ -1301,6 +1495,9 @@ spa_vdev_remove_thread(void *arg) } mutex_exit(&svr->svr_lock); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + /* * Wait for all copies to finish before cleaning up the vca. */ @@ -1318,7 +1515,7 @@ spa_vdev_remove_thread(void *arg) mutex_exit(&svr->svr_lock); } else { ASSERT0(range_tree_space(svr->svr_allocd_segs)); - vdev_remove_complete(vd); + vdev_remove_complete(spa); } thread_exit(); } @@ -1360,7 +1557,7 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; - vdev_t *vd = svr->svr_vdev; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; objset_t *mos = spa->spa_meta_objset; @@ -1433,8 +1630,11 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) * because we have not allocated mappings for it yet. */ uint64_t syncd = vdev_indirect_mapping_max_offset(vim); - range_tree_clear(svr->svr_allocd_segs, syncd, - msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd); + uint64_t sm_end = msp->ms_sm->sm_start + + msp->ms_sm->sm_size; + if (sm_end > syncd) + range_tree_clear(svr->svr_allocd_segs, + syncd, sm_end - syncd); mutex_exit(&svr->svr_lock); } @@ -1495,7 +1695,7 @@ spa_vdev_remove_cancel(spa_t *spa) if (spa->spa_vdev_removal == NULL) return (ESRCH); - uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id; + uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, spa_vdev_remove_cancel_sync, NULL, 0, @@ -1625,6 +1825,9 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) /* Make sure these changes are sync'ed */ spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); + /* Stop initializing */ + (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); + *txg = spa_vdev_config_enter(spa); sysevent_t *ev = spa_event_create(spa, vd, NULL, @@ -1785,6 +1988,13 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) */ error = spa_reset_logs(spa); + /* + * We stop any initializing that is currently in progress but leave + * the state as "active". This will allow the initializing to resume + * if the removal is canceled sometime later. 
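One small fix in spa_vdev_remove_cancel_sync() above deserves a note: the added sm_end > syncd guard closes an unsigned-underflow hazard. syncd is a vdev-wide mapping offset while sm_end is one metaslab's end, so when the synced mapping already extends past the metaslab being visited, the old expression sm_end - syncd would wrap: with uint64_t arithmetic, e.g. (hypothetical sizes) sm_end = 1G and syncd = 1G + 4K yields 2^64 - 4096 rather than a negative length, and range_tree_clear() would be asked to clear nearly the whole address space.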
+ */ + vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); + *txg = spa_vdev_config_enter(spa); /* @@ -1796,6 +2006,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) if (error != 0) { metaslab_group_activate(mg); + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); return (error); } @@ -1806,7 +2017,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_initiate_sync, - vd, 0, ZFS_SPACE_CHECK_NONE, tx); + (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); dmu_tx_commit(tx); return (0); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c index 92c670d28b2c..a03d18704dfc 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -151,6 +151,7 @@ vdev_ops_t vdev_root_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_ROOT, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c index b40263fc981c..fc9ac80593ac 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c @@ -58,9 +58,7 @@ static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); void fzap_byteswap(void *vbuf, size_t size) { - uint64_t block_type; - - block_type = *(uint64_t *)vbuf; + uint64_t block_type = *(uint64_t *)vbuf; if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) zap_leaf_byteswap(vbuf, size); @@ -73,11 +71,6 @@ fzap_byteswap(void *vbuf, size_t size) void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) { - dmu_buf_t *db; - zap_leaf_t *l; - int i; - zap_phys_t *zp; - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); zap->zap_ismicro = FALSE; @@ -87,7 +80,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; - zp = zap_f_phys(zap); + zap_phys_t *zp = zap_f_phys(zap); /* * explicitly zero it since it might be coming from an * initialized microzap @@ -106,17 +99,18 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) zp->zap_flags = flags; /* block 1 will be the first leaf */ - for (i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++) + for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++) ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; /* * set up block 1 - the first leaf */ - VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + dmu_buf_t *db; + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db, tx); - l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); l->l_dbuf = db; zap_leaf_init(l, zp->zap_normflags != 0); @@ -146,9 +140,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), dmu_tx_t *tx) { - uint64_t b, newblk; - dmu_buf_t *db_old, *db_new; - int err; + uint64_t newblk; int bs = FZAP_BLOCK_SHIFT(zap); int hepb = 1<<(bs-4); /* hepb = half the number of entries in a block */ @@ -172,21 +164,23 @@ zap_table_grow(zap_t 
*zap, zap_table_phys_t *tbl, * Copy the ptrtbl from the old to new location. */ - b = tbl->zt_blks_copied; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + uint64_t b = tbl->zt_blks_copied; + dmu_buf_t *db_old; + int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); - if (err) + if (err != 0) return (err); /* first half of entries in old[b] go to new[2*b+0] */ - VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + dmu_buf_t *db_new; + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, @@ -221,22 +215,20 @@ static int zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, dmu_tx_t *tx) { - int err; - uint64_t blk, off; int bs = FZAP_BLOCK_SHIFT(zap); - dmu_buf_t *db; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(tbl->zt_blk != 0); dprintf("storing %llx at index %llx\n", val, idx); - blk = idx >> (bs-3); - off = idx & ((1<<(bs-3))-1); + uint64_t blk = idx >> (bs-3); + uint64_t off = idx & ((1<<(bs-3))-1); - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + dmu_buf_t *db; + int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - if (err) + if (err != 0) return (err); dmu_buf_will_dirty(db, tx); @@ -249,7 +241,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); - if (err) { + if (err != 0) { dmu_buf_rele(db, FTAG); return (err); } @@ -268,27 +260,24 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, static int zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) { - uint64_t blk, off; - int err; - dmu_buf_t *db; - dnode_t *dn; int bs = FZAP_BLOCK_SHIFT(zap); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - blk = idx >> (bs-3); - off = idx & ((1<<(bs-3))-1); + uint64_t blk = idx >> (bs-3); + uint64_t off = idx & ((1<<(bs-3))-1); /* * Note: this is equivalent to dmu_buf_hold(), but we use * _dnode_enter / _by_dnode because it's faster because we don't * have to hold the dnode. */ - dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, + dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); + dmu_buf_t *db; + int err = dmu_buf_hold_by_dnode(dn, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); dmu_buf_dnode_exit(zap->zap_dbuf); - if (err) + if (err != 0) return (err); *valp = ((uint64_t *)db->db_data)[off]; dmu_buf_rele(db, FTAG); @@ -319,11 +308,10 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) static void zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) { - int i; - for (i = 0; i < n; i++) { + for (int i = 0; i < n; i++) { uint64_t lb = src[i]; - dst[2*i+0] = lb; - dst[2*i+1] = lb; + dst[2 * i + 0] = lb; + dst[2 * i + 1] = lb; } } @@ -345,19 +333,16 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) * stored in the header block). 
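The copy scheme in zap_table_grow() is easiest to see with a tiny hypothetical block of four pointers: an old block holding [A, B, C, D] becomes two new blocks, [A, A, B, B] at new block index 2b and [C, C, D, D] at 2b + 1. Each old entry i reappears at new indices 2i and 2i + 1 via zap_ptrtbl_transfer(), because after the hash prefix grows by one bit, both extensions of an old prefix must still point at the same, not-yet-split leaf.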
Give it its own entire * block, which will double the size of the ptrtbl. */ - uint64_t newblk; - dmu_buf_t *db_new; - int err; - ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); - newblk = zap_allocate_blocks(zap, 1); - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + uint64_t newblk = zap_allocate_blocks(zap, 1); + dmu_buf_t *db_new; + int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); - if (err) + if (err != 0) return (err); dmu_buf_will_dirty(db_new, tx); zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), @@ -392,9 +377,8 @@ zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks) { - uint64_t newblk; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - newblk = zap_f_phys(zap)->zap_freeblk; + uint64_t newblk = zap_f_phys(zap)->zap_freeblk; zap_f_phys(zap)->zap_freeblk += nblocks; return (newblk); } @@ -411,7 +395,6 @@ zap_leaf_evict_sync(void *dbu) static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { - void *winner; zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -421,12 +404,11 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; - VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu); - ASSERT(winner == NULL); + VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu)); dmu_buf_will_dirty(l->l_dbuf, tx); zap_leaf_init(l, zap->zap_normflags != 0); @@ -460,11 +442,9 @@ zap_put_leaf(zap_leaf_t *l) static zap_leaf_t * zap_open_leaf(uint64_t blkid, dmu_buf_t *db) { - zap_leaf_t *l, *winner; - ASSERT(blkid != 0); - l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); rw_init(&l->l_rwlock, 0, 0, 0); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = blkid; @@ -472,7 +452,7 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db) l->l_dbuf = db; dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); - winner = dmu_buf_set_user(db, &l->l_dbu); + zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu); rw_exit(&l->l_rwlock); if (winner != NULL) { @@ -510,17 +490,15 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { dmu_buf_t *db; - zap_leaf_t *l; - int bs = FZAP_BLOCK_SHIFT(zap); - int err; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + int bs = FZAP_BLOCK_SHIFT(zap); dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(dn, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); dmu_buf_dnode_exit(zap->zap_dbuf); - if (err) + if (err != 0) return (err); ASSERT3U(db->db_object, ==, zap->zap_object); @@ -528,7 +506,7 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, ASSERT3U(db->db_size, ==, 1 << bs); ASSERT(blkid != 0); - l = dmu_buf_get_user(db); + zap_leaf_t *l = dmu_buf_get_user(db); if (l == NULL) l = zap_open_leaf(blkid, db); @@ -583,8 +561,7 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) static int zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { - uint64_t idx, 
blk; - int err; + uint64_t blk; ASSERT(zap->zap_dbuf == NULL || zap_f_phys(zap) == zap->zap_dbuf->db_data); @@ -596,8 +573,8 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) return (SET_ERROR(EIO)); } - idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); - err = zap_idx_to_blk(zap, idx, &blk); + uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + int err = zap_idx_to_blk(zap, idx, &blk); if (err != 0) return (err); err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); @@ -614,9 +591,7 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, { zap_t *zap = zn->zn_zap; uint64_t hash = zn->zn_hash; - zap_leaf_t *nl; - int prefix_diff, i, err; - uint64_t sibling; + int err; int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); @@ -636,19 +611,19 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; - if (err) + if (err != 0) return (err); ASSERT(!zap->zap_ismicro); while (old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { err = zap_grow_ptrtbl(zap, tx); - if (err) + if (err != 0) return (err); } err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); - if (err) + if (err != 0) return (err); if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { @@ -662,25 +637,26 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, zap_leaf_phys(l)->l_hdr.lh_prefix); - prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - (old_prefix_len + 1); - sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; + uint64_t sibling = + (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; /* check for i/o errors before doing zap_leaf_split */ - for (i = 0; i < (1ULL<<prefix_diff); i++) { + for (int i = 0; i < (1ULL << prefix_diff); i++) { uint64_t blk; - err = zap_idx_to_blk(zap, sibling+i, &blk); - if (err) + err = zap_idx_to_blk(zap, sibling + i, &blk); + if (err != 0) return (err); ASSERT3U(blk, ==, l->l_blkid); } - nl = zap_create_leaf(zap, tx); + zap_leaf_t *nl = zap_create_leaf(zap, tx); zap_leaf_split(l, nl, zap->zap_normflags != 0); /* set sibling pointers */ - for (i = 0; i < (1ULL << prefix_diff); i++) { - err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx); + for (int i = 0; i < (1ULL << prefix_diff); i++) { + err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx); ASSERT0(err); /* we checked for i/o errors above */ } @@ -708,8 +684,6 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, zap_put_leaf(l); if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { - int err; - /* * We are in the middle of growing the pointer table, or * this leaf will soon make us grow it. 
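The sibling arithmetic in zap_expand_leaf() above, worked with hypothetical values: let zt_shift = 5 and let the leaf being split have old_prefix_len = 3 and prefix 0b010. Then prefix_diff = 5 - (3 + 1) = 1, and for any hash that landed in this leaf, (ZAP_HASH_IDX(hash, 4) | 1) = 0b0101, so sibling = 0b0101 << 1 = 10. The 2^prefix_diff = 2 consecutive ptrtbl entries 10 and 11 all pointed at the old leaf -- which the i/o-error pre-check loop asserts -- and zap_set_idx_to_blk() redirects them to the new leaf, which takes over the entries whose fourth-most-significant hash bit is 1.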
@@ -719,10 +693,10 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, uint64_t zapobj = zap->zap_object; zap_unlockdir(zap, tag); - err = zap_lockdir(os, zapobj, tx, + int err = zap_lockdir(os, zapobj, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; - if (err) + if (err != 0) return; } @@ -763,9 +737,8 @@ fzap_checksize(uint64_t integer_size, uint64_t num_integers) static int fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) { - int err; - - if ((err = fzap_checkname(zn)) != 0) + int err = fzap_checkname(zn); + if (err != 0) return (err); return (fzap_checksize(integer_size, num_integers)); } @@ -779,10 +752,10 @@ fzap_lookup(zap_name_t *zn, char *realname, int rn_len, boolean_t *ncp) { zap_leaf_t *l; - int err; zap_entry_handle_t zeh; - if ((err = fzap_checkname(zn)) != 0) + int err = fzap_checkname(zn); + if (err != 0) return (err); err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); @@ -870,7 +843,8 @@ fzap_update(zap_name_t *zn, void *tag, dmu_tx_t *tx) { zap_leaf_t *l; - int err, create; + int err; + boolean_t create; zap_entry_handle_t zeh; zap_t *zap = zn->zn_zap; @@ -923,9 +897,9 @@ fzap_length(zap_name_t *zn, if (err != 0) goto out; - if (integer_size) + if (integer_size != 0) *integer_size = zeh.zeh_integer_size; - if (num_integers) + if (num_integers != 0) *num_integers = zeh.zeh_num_integers; out: zap_put_leaf(l); @@ -954,15 +928,14 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx) void fzap_prefetch(zap_name_t *zn) { - uint64_t idx, blk; + uint64_t blk; zap_t *zap = zn->zn_zap; - int bs; - idx = ZAP_HASH_IDX(zn->zn_hash, + uint64_t idx = ZAP_HASH_IDX(zn->zn_hash, zap_f_phys(zap)->zap_ptrtbl.zt_shift); if (zap_idx_to_blk(zap, idx, &blk) != 0) return; - bs = FZAP_BLOCK_SHIFT(zap); + int bs = FZAP_BLOCK_SHIFT(zap); dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, ZIO_PRIORITY_SYNC_READ); } @@ -975,9 +948,8 @@ uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx) { - uint64_t new_obj; - - VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0); + uint64_t new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx); + VERIFY(new_obj != 0); VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, tx)); @@ -989,13 +961,12 @@ zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name) { zap_cursor_t zc; - zap_attribute_t *za; int err; if (mask == 0) mask = -1ULL; - za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, zapobj); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { @@ -1005,7 +976,7 @@ zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, } } zap_cursor_fini(&zc); - kmem_free(za, sizeof (zap_attribute_t)); + kmem_free(za, sizeof (*za)); return (err); } @@ -1013,23 +984,23 @@ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) { zap_cursor_t zc; - zap_attribute_t za; - int err; + int err = 0; - err = 0; + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { - if (za.za_integer_length != 8 || za.za_num_integers != 1) { + if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } - err = zap_add(os, intoobj, za.za_name, - 8, 1, &za.za_first_integer, 
tx); - if (err) + err = zap_add(os, intoobj, za->za_name, + 8, 1, &za->za_first_integer, tx); + if (err != 0) break; } zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); return (err); } @@ -1038,23 +1009,23 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, uint64_t value, dmu_tx_t *tx) { zap_cursor_t zc; - zap_attribute_t za; - int err; + int err = 0; - err = 0; + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { - if (za.za_integer_length != 8 || za.za_num_integers != 1) { + if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } - err = zap_add(os, intoobj, za.za_name, + err = zap_add(os, intoobj, za->za_name, 8, 1, &value, tx); if (err != 0) break; } zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); return (err); } @@ -1063,29 +1034,29 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) { zap_cursor_t zc; - zap_attribute_t za; - int err; + int err = 0; - err = 0; + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, fromobj); - zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { uint64_t delta = 0; - if (za.za_integer_length != 8 || za.za_num_integers != 1) { + if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } - err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta); + err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta); if (err != 0 && err != ENOENT) break; - delta += za.za_first_integer; - err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx); - if (err) + delta += za->za_first_integer; + err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx); + if (err != 0) break; } zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); return (err); } @@ -1150,12 +1121,11 @@ zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, dmu_tx_t *tx) { uint64_t value = 0; - int err; if (delta == 0) return (0); - err = zap_lookup(os, obj, name, 8, 1, &value); + int err = zap_lookup(os, obj, name, 8, 1, &value); if (err != 0 && err != ENOENT) return (err); value += delta; @@ -1253,7 +1223,6 @@ again: static void zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) { - int i, err; uint64_t lastblk = 0; /* @@ -1261,14 +1230,14 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) * can hold, then it'll be accounted for more than once, since * we won't have lastblk. 
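The zap_join(), zap_join_key() and zap_join_increment() rewrites above all make the same change: zap_attribute_t embeds the attribute name buffer (on the order of a few hundred bytes), so keeping one on the stack is costly in kernel context, where ZFS call chains run deep. Allocating it with kmem_alloc(sizeof (*za), KM_SLEEP) and freeing it on every exit path trades a cheap allocation for that stack footprint.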
*/ - for (i = 0; i < len; i++) { + for (int i = 0; i < len; i++) { zap_leaf_t *l; if (tbl[i] == lastblk) continue; lastblk = tbl[i]; - err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); + int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); if (err == 0) { zap_leaf_stats(zap, l, zs); zap_put_leaf(l); @@ -1333,14 +1302,12 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { - int b; - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); - for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; + for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; b++) { dmu_buf_t *db; int err; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c index 35dca89728fb..1c7c736d8e97 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ @@ -107,7 +107,6 @@ ldv(int len, const void *addr) void zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) { - int i; zap_leaf_t l; dmu_buf_t l_dbuf; @@ -123,10 +122,10 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); - for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) + for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); - for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { + for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); struct zap_leaf_entry *le; @@ -162,14 +161,12 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) void zap_leaf_init(zap_leaf_t *l, boolean_t sort) { - int i; - l->l_bs = highbit64(l->l_dbuf->db_size) - 1; zap_memset(&zap_leaf_phys(l)->l_hdr, 0, sizeof (struct zap_leaf_header)); zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } @@ -188,11 +185,9 @@ zap_leaf_init(zap_leaf_t *l, boolean_t sort) static uint16_t zap_leaf_chunk_alloc(zap_leaf_t *l) { - int chunk; - ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); - chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; + int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); @@ -232,7 +227,7 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf, uint16_t *chunkp = &chunk_head; int byten = 0; uint64_t value = 0; - int shift = (integer_size-1)*8; + int shift = (integer_size - 1) * 8; int len = num_integers; ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES); @@ -240,10 +235,9 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf, while (len > 0) { uint16_t chunk = zap_leaf_chunk_alloc(l); struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int i; la->la_type = ZAP_CHUNK_ARRAY; - for (i = 0; i < 
ZAP_LEAF_ARRAY_BYTES; i++) { + for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { if (byten == 0) value = ldv(integer_size, buf); la->la_array[i] = value >> shift; @@ -321,10 +315,9 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, while (len > 0) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int i; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { + for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { value = (value << 8) | la->la_array[i]; byten++; if (byten == array_int_len) { @@ -347,16 +340,13 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int bseen = 0; if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { - uint64_t *thiskey; - boolean_t match; - + uint64_t *thiskey = + kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP); ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); - thiskey = kmem_alloc(array_numints * sizeof (*thiskey), - KM_SLEEP); zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, sizeof (*thiskey), array_numints, thiskey); - match = bcmp(thiskey, zn->zn_key_orig, + boolean_t match = bcmp(thiskey, zn->zn_key_orig, array_numints * sizeof (*thiskey)) == 0; kmem_free(thiskey, array_numints * sizeof (*thiskey)); return (match); @@ -365,11 +355,10 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, ASSERT(zn->zn_key_intlen == 1); if (zn->zn_matchtype & MT_NORMALIZE) { char *thisname = kmem_alloc(array_numints, KM_SLEEP); - boolean_t match; zap_leaf_array_read(l, chunk, sizeof (char), array_numints, sizeof (char), array_numints, thisname); - match = zap_match(zn, thisname); + boolean_t match = zap_match(zn, thisname); kmem_free(thisname, array_numints); return (match); } @@ -400,12 +389,11 @@ zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) { - uint16_t *chunkp; struct zap_leaf_entry *le; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); + for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); *chunkp != CHAIN_END; chunkp = &le->le_next) { uint16_t chunk = *chunkp; le = ZAP_LEAF_ENTRY(l, chunk); @@ -446,17 +434,15 @@ int zap_leaf_lookup_closest(zap_leaf_t *l, uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) { - uint16_t chunk; uint64_t besth = -1ULL; uint32_t bestcd = -1U; uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; - uint16_t lh; struct zap_leaf_entry *le; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); - for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { - for (chunk = zap_leaf_phys(l)->l_hash[lh]; + for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { + for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh]; chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); @@ -529,11 +515,10 @@ int zap_entry_update(zap_entry_handle_t *zeh, uint8_t integer_size, uint64_t num_integers, const void *buf) { - int delta_chunks; zap_leaf_t *l = zeh->zeh_leaf; struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); - delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - + int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks) @@ -550,14 +535,12 @@ zap_entry_update(zap_entry_handle_t *zeh, void zap_entry_remove(zap_entry_handle_t *zeh) { - uint16_t entry_chunk; - struct zap_leaf_entry *le; zap_leaf_t *l = zeh->zeh_leaf; 
ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); - entry_chunk = *zeh->zeh_chunkp; - le = ZAP_LEAF_ENTRY(l, entry_chunk); + uint16_t entry_chunk = *zeh->zeh_chunkp; + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); zap_leaf_array_free(l, &le->le_name_chunk); @@ -575,15 +558,12 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, zap_entry_handle_t *zeh) { uint16_t chunk; - uint16_t *chunkp; struct zap_leaf_entry *le; - uint64_t valuelen; - int numchunks; uint64_t h = zn->zn_hash; - valuelen = integer_size * num_integers; + uint64_t valuelen = integer_size * num_integers; - numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (E2BIG); @@ -645,7 +625,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, /* link it into the hash chain */ /* XXX if we did the search above, we could just use that */ - chunkp = zap_leaf_rehash_entry(l, chunk); + uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); zap_leaf_phys(l)->l_hdr.lh_nentries++; @@ -673,14 +653,13 @@ boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, const char *name, zap_t *zap) { - uint64_t chunk; struct zap_leaf_entry *le; boolean_t allocdzn = B_FALSE; if (zap->zap_normflags == 0) return (B_FALSE); - for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); + for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); if (le->le_hash != zeh->zeh_hash) @@ -763,14 +742,11 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) static void zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) { - struct zap_leaf_entry *le, *nle; - uint16_t chunk; - - le = ZAP_LEAF_ENTRY(l, entry); + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - chunk = zap_leaf_chunk_alloc(nl); - nle = ZAP_LEAF_ENTRY(nl, chunk); + uint16_t chunk = zap_leaf_chunk_alloc(nl); + struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ (void) zap_leaf_rehash_entry(nl, chunk); @@ -791,7 +767,6 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { - int i; int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ @@ -818,7 +793,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) * but this accesses memory more sequentially, and when we're * called, the block is usually pretty full. 
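The split loop that follows moves each entry based on a single bit of its hash. A sketch of that rule, assuming prefix_len is the leaf's prefix length before the split; the helper name is hypothetical:

#include <stdint.h>

typedef int boolean_t;

/*
 * After a split, the old leaf's prefix grows by one bit and the new
 * sibling takes every entry whose next undecided hash bit is set.
 * The hash prefix occupies the most-significant bits, so with
 * prefix_len bits already decided, the next one is bit (63 - prefix_len).
 */
static boolean_t
entry_goes_to_new_leaf(uint64_t hash, int prefix_len)
{
        int bit = 64 - 1 - prefix_len;

        return ((hash & (1ULL << bit)) != 0);
}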
*/ - for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); if (le->le_type != ZAP_CHUNK_ENTRY) continue; @@ -833,9 +808,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { - int i, n; - - n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; @@ -851,7 +824,7 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_n_tenths_full[n]++; - for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { + for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { int nentries = 0; int chunk = zap_leaf_phys(l)->l_hash[i]; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c index 31dce3b1723b..50d5fc48f0c8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. @@ -89,22 +89,20 @@ zap_hash(zap_name_t *zn) ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { - int i; const uint64_t *wp = zn->zn_key_norm; ASSERT(zn->zn_key_intlen == 8); - for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) { - int j; + for (int i = 0; i < zn->zn_key_norm_numints; + wp++, i++) { uint64_t word = *wp; - for (j = 0; j < zn->zn_key_intlen; j++) { + for (int j = 0; j < zn->zn_key_intlen; j++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ word) & 0xFF]; word >>= NBBY; } } } else { - int i, len; const uint8_t *cp = zn->zn_key_norm; /* @@ -114,10 +112,10 @@ zap_hash(zap_name_t *zn) * zn_key_*_numints includes the terminating * null for non-binary keys.) 
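Both hash loops above are the classic table-driven CRC-64 update. A standalone sketch using the reflected ECMA-182 polynomial that zfs_crc64_table is built from; unlike the real zap_hash(), this starts from a caller-supplied h rather than the per-ZAP salt:

#include <stdint.h>
#include <stddef.h>

#define CRC64_POLY      0xC96C5795D7870F42ULL   /* ECMA-182, reflected */

static uint64_t crc64_table[256];

static void
crc64_init(void)
{
        for (int i = 0; i < 256; i++) {
                uint64_t c = i;
                for (int j = 0; j < 8; j++)
                        c = (c >> 1) ^ (-(c & 1) & CRC64_POLY);
                crc64_table[i] = c;
        }
}

/* One byte at a time, the same update step as the loops above. */
static uint64_t
crc64_update(uint64_t h, const uint8_t *buf, size_t len)
{
        for (size_t i = 0; i < len; i++)
                h = (h >> 8) ^ crc64_table[(h ^ buf[i]) & 0xFF];
        return (h);
}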
*/ - len = zn->zn_key_norm_numints - 1; + int len = zn->zn_key_norm_numints - 1; ASSERT(zn->zn_key_intlen == 1); - for (i = 0; i < len; cp++, i++) { + for (int i = 0; i < len; cp++, i++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ *cp) & 0xFF]; } @@ -137,15 +135,12 @@ zap_hash(zap_name_t *zn) static int zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags) { - size_t inlen, outlen; - int err; - ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); - inlen = strlen(name) + 1; - outlen = ZAP_MAXNAMELEN; + size_t inlen = strlen(name) + 1; + size_t outlen = ZAP_MAXNAMELEN; - err = 0; + int err = 0; (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); @@ -255,12 +250,11 @@ zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) static void mzap_byteswap(mzap_phys_t *buf, size_t size) { - int i, max; buf->mz_block_type = BSWAP_64(buf->mz_block_type); buf->mz_salt = BSWAP_64(buf->mz_salt); buf->mz_normflags = BSWAP_64(buf->mz_normflags); - max = (size / MZAP_ENT_LEN) - 1; - for (i = 0; i < max; i++) { + int max = (size / MZAP_ENT_LEN) - 1; + for (int i = 0; i < max; i++) { buf->mz_chunk[i].mze_value = BSWAP_64(buf->mz_chunk[i].mze_value); buf->mz_chunk[i].mze_cd = @@ -271,9 +265,7 @@ mzap_byteswap(mzap_phys_t *buf, size_t size) void zap_byteswap(void *buf, size_t size) { - uint64_t block_type; - - block_type = *(uint64_t *)buf; + uint64_t block_type = *(uint64_t *)buf; if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { /* ASSERT(magic == ZAP_LEAF_MAGIC); */ @@ -289,27 +281,22 @@ mze_compare(const void *arg1, const void *arg2) const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; - if (mze1->mze_hash > mze2->mze_hash) - return (+1); - if (mze1->mze_hash < mze2->mze_hash) - return (-1); - if (mze1->mze_cd > mze2->mze_cd) - return (+1); - if (mze1->mze_cd < mze2->mze_cd) - return (-1); - return (0); + int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash); + if (likely(cmp)) + return (cmp); + + return (AVL_CMP(mze1->mze_cd, mze2->mze_cd)); } static int mze_insert(zap_t *zap, int chunkid, uint64_t hash) { - mzap_ent_t *mze; avl_index_t idx; ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); + mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; mze->mze_hash = hash; mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; @@ -352,10 +339,8 @@ static uint32_t mze_find_unused_cd(zap_t *zap, uint64_t hash) { mzap_ent_t mze_tofind; - mzap_ent_t *mze; avl_index_t idx; avl_tree_t *avl = &zap->zap_m.zap_avl; - uint32_t cd; ASSERT(zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); @@ -363,8 +348,8 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash) mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; - cd = 0; - for (mze = avl_find(avl, &mze_tofind, &idx); + uint32_t cd = 0; + for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { if (mze->mze_cd != cd) break; @@ -399,15 +384,13 @@ static zap_t * mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) { zap_t *winner; - zap_t *zap; - int i; uint64_t *zap_hdr = (uint64_t *)db->db_data; uint64_t zap_block_type = zap_hdr[0]; uint64_t zap_magic = zap_hdr[1]; ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); - zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); + zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, 0, 0, 0); rw_enter(&zap->zap_rwlock, 
RW_WRITER); zap->zap_objset = os; @@ -443,7 +426,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) avl_create(&zap->zap_m.zap_avl, mze_compare, sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); - for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { + for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { @@ -495,28 +478,21 @@ handle_winner: return (winner); } +/* + * This routine "consumes" the caller's hold on the dbuf, which must + * have the specified tag. + */ static int zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { - zap_t *zap; - krw_t lt; - ASSERT0(db->db_offset); objset_t *os = dmu_buf_get_objset(db); uint64_t obj = db->db_object; *zapp = NULL; -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - - zap = dmu_buf_get_user(db); + zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { zap = mzap_open(os, obj, db); if (zap == NULL) { @@ -535,7 +511,7 @@ zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, * can only be different if it was upgraded from micro to fat, * and micro wanted WRITER but fat only needs READER. */ - lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; + krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; rw_enter(&zap->zap_rwlock, lt); if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { /* it was upgraded, now we only need reader */ @@ -581,12 +557,19 @@ zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) { dmu_buf_t *db; - int err; - err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) { return (err); } +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + } +#endif + err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) { dmu_buf_rele(db, tag); @@ -599,11 +582,17 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) { dmu_buf_t *db; - int err; - err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + } +#endif err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) dmu_buf_rele(db, tag); @@ -620,22 +609,20 @@ zap_unlockdir(zap_t *zap, void *tag) static int mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) { - mzap_phys_t *mzp; - int i, sz, nchunks; int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - sz = zap->zap_dbuf->db_size; - mzp = zio_buf_alloc(sz); + int sz = zap->zap_dbuf->db_size; + mzap_phys_t *mzp = zio_buf_alloc(sz); bcopy(zap->zap_dbuf->db_data, mzp, sz); - nchunks = zap->zap_m.zap_num_chunks; + int nchunks = zap->zap_m.zap_num_chunks; if (!flags) { err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); - if (err) { + if (err != 0) { zio_buf_free(mzp, sz); return (err); } @@ -648,19 +635,18 @@ mzap_upgrade(zap_t 
**zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) fzap_upgrade(zap, tx, flags); - for (i = 0; i < nchunks; i++) { + for (int i = 0; i < nchunks; i++) { mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; - zap_name_t *zn; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, mze->mze_value); - zn = zap_name_alloc(zap, mze->mze_name, 0); + zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0); err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tag, tx); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ zap_name_free(zn); - if (err) + if (err != 0) break; } zio_buf_free(mzp, sz); @@ -690,32 +676,24 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, dmu_tx_t *tx) { dmu_buf_t *db; - mzap_phys_t *zp; - VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); - -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif + VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db, tx); - zp = db->db_data; + mzap_phys_t *zp = db->db_data; zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; zp->mz_normflags = normflags; - dmu_buf_rele(db, FTAG); if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ - VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER, - B_FALSE, B_FALSE, FTAG, &zap)); - VERIFY3U(0, ==, mzap_upgrade(&zap, FTAG, tx, flags)); + VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, + B_FALSE, B_FALSE, &zap)); + VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); + } else { + dmu_buf_rele(db, FTAG); } } @@ -732,9 +710,8 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - int err; - - err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); + int err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); if (err != 0) return (err); mzap_create_impl(os, obj, normflags, 0, tx); @@ -752,6 +729,7 @@ uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); mzap_create_impl(os, obj, normflags, 0, tx); @@ -763,6 +741,7 @@ zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && @@ -808,10 +787,10 @@ int zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) { zap_t *zap; - int err; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); if (!zap->zap_ismicro) { err = fzap_count(zap, count); @@ -829,7 +808,6 @@ zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) static boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) { - mzap_ent_t *other; int direction = AVL_BEFORE; boolean_t allocdzn = B_FALSE; @@ -837,7 +815,7 @@ mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, 
mzap_ent_t *mze) return (B_FALSE); again: - for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction); + for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction); other && other->mze_hash == mze->mze_hash; other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { @@ -882,10 +860,8 @@ zap_lookup_impl(zap_t *zap, const char *name, boolean_t *ncp) { int err = 0; - mzap_ent_t *mze; - zap_name_t *zn; - zn = zap_name_alloc(zap, name, mt); + zap_name_t *zn = zap_name_alloc(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); @@ -893,7 +869,7 @@ zap_lookup_impl(zap_t *zap, const char *name, err = fzap_lookup(zn, integer_size, num_integers, buf, realname, rn_len, ncp); } else { - mze = mze_find(zn); + mzap_ent_t *mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { @@ -924,9 +900,9 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, boolean_t *ncp) { zap_t *zap; - int err; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_impl(zap, name, integer_size, @@ -950,9 +926,8 @@ zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, boolean_t *ncp) { zap_t *zap; - int err; - err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, + int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); @@ -967,13 +942,12 @@ zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints) { zap_t *zap; - int err; - zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc_uint64(zap, key, key_numints); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -990,13 +964,12 @@ zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) { zap_t *zap; - int err; - zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc_uint64(zap, key, key_numints); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1024,14 +997,12 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; - int err; - mzap_ent_t *mze; - zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc(zap, name, 0); + zap_name_t *zn = zap_name_alloc(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1039,7 +1010,7 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, if (!zap->zap_ismicro) { err = fzap_length(zn, integer_size, num_integers); } else { - mze = mze_find(zn); + mzap_ent_t *mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { @@ -1059,13 +1030,12 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t *integer_size, 
uint64_t *num_integers) { zap_t *zap; - int err; - zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc_uint64(zap, key, key_numints); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1079,26 +1049,24 @@ zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, static void mzap_addent(zap_name_t *zn, uint64_t value) { - int i; zap_t *zap = zn->zn_zap; int start = zap->zap_m.zap_alloc_next; - uint32_t cd; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG - for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { + for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } #endif - cd = mze_find_unused_cd(zap, zn->zn_hash); + uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ ASSERT(cd < zap_maxcd(zap)); again: - for (i = start; i < zap->zap_m.zap_num_chunks; i++) { + for (int i = start; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0] == 0) { mze->mze_value = value; @@ -1125,12 +1093,10 @@ zap_add_impl(zap_t *zap, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, void *tag) { - int err = 0; - mzap_ent_t *mze; const uint64_t *intval = val; - zap_name_t *zn; + int err = 0; - zn = zap_name_alloc(zap, key, 0); + zap_name_t *zn = zap_name_alloc(zap, key, 0); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); @@ -1147,8 +1113,7 @@ zap_add_impl(zap_t *zap, const char *key, } zap = zn->zn_zap; /* fzap_add() may change zap */ } else { - mze = mze_find(zn); - if (mze != NULL) { + if (mze_find(zn) != NULL) { err = SET_ERROR(EEXIST); } else { mzap_addent(zn, *intval); @@ -1199,13 +1164,12 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, const void *val, dmu_tx_t *tx) { zap_t *zap; - int err; - zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc_uint64(zap, key, key_numints); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1223,11 +1187,8 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; - mzap_ent_t *mze; uint64_t oldval; const uint64_t *intval = val; - zap_name_t *zn; - int err; #ifdef ZFS_DEBUG /* @@ -1238,10 +1199,11 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); #endif - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc(zap, name, 0); + zap_name_t *zn = zap_name_alloc(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1261,7 +1223,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, } zap = zn->zn_zap; /* fzap_update() may change zap */ } else { - mze = mze_find(zn); + 
mzap_ent_t *mze = mze_find(zn); if (mze != NULL) { ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); MZE_PHYS(zap, mze)->mze_value = *intval; @@ -1282,13 +1244,12 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; - zap_name_t *zn; - int err; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc_uint64(zap, key, key_numints); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1311,17 +1272,15 @@ static int zap_remove_impl(zap_t *zap, const char *name, matchtype_t mt, dmu_tx_t *tx) { - mzap_ent_t *mze; - zap_name_t *zn; int err = 0; - zn = zap_name_alloc(zap, name, mt); + zap_name_t *zn = zap_name_alloc(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { - mze = mze_find(zn); + mzap_ent_t *mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { @@ -1369,13 +1328,12 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) { zap_t *zap; - int err; - zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); - zn = zap_name_alloc_uint64(zap, key, key_numints); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); @@ -1451,9 +1409,6 @@ int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) { int err; - avl_index_t idx; - mzap_ent_t mze_tofind; - mzap_ent_t *mze; if (zc->zc_hash == -1ULL) return (SET_ERROR(ENOENT)); @@ -1462,7 +1417,7 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); - if (err) + if (err != 0) return (err); /* @@ -1482,10 +1437,14 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { + avl_index_t idx; + mzap_ent_t mze_tofind; + mze_tofind.mze_hash = zc->zc_hash; mze_tofind.mze_cd = zc->zc_cd; - mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); + mzap_ent_t *mze = + avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); if (mze == NULL) { mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, idx, AVL_AFTER); @@ -1562,11 +1521,11 @@ out: int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) { - int err; zap_t *zap; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); - if (err) + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) return (err); bzero(zs, sizeof (zap_stats_t)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c index ea20891ec211..54bc638c6e98 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zcp.c @@ -433,7 +433,7 @@ zcp_lua_to_nvlist_impl(lua_State *state, int index, nvlist_t *nvl, /* * Convert a lua value to an nvpair, adding it to an nvlist with the given key. 
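The invariant behind the VERIFY3U() and lua_settop() calls added in the zcp.c hunks below is that exactly one return value sits on the Lua stack when it is converted to an nvlist, so error paths must clear the stack before pushing their message. A sketch against the stock Lua C API (ZFS embeds its own copy of Lua; this is only an illustration):

#include <lua.h>
#include <lauxlib.h>

int
main(void)
{
        lua_State *state = luaL_newstate();

        lua_pushinteger(state, 1);      /* leftover values ... */
        lua_pushinteger(state, 2);

        lua_settop(state, 0);           /* drop everything first */
        (void) lua_pushfstring(state, "Could not open pool: %s", "tank");

        /* the single-return-value invariant now holds */
        if (lua_gettop(state) != 1)
                return (1);

        lua_close(state);
        return (0);
}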
*/ -void +static void zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key) { /* @@ -445,7 +445,7 @@ zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key) (void) lua_error(state); } -int +static int zcp_lua_to_nvlist_helper(lua_State *state) { nvlist_t *nv = (nvlist_t *)lua_touserdata(state, 2); @@ -454,11 +454,12 @@ zcp_lua_to_nvlist_helper(lua_State *state) return (0); } -void +static void zcp_convert_return_values(lua_State *state, nvlist_t *nvl, const char *key, zcp_eval_arg_t *evalargs) { int err; + VERIFY3U(1, ==, lua_gettop(state)); lua_pushcfunction(state, zcp_lua_to_nvlist_helper); lua_pushlightuserdata(state, (char *)key); lua_pushlightuserdata(state, nvl); @@ -904,6 +905,7 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs) ZCP_RET_RETURN, evalargs); } else if (return_count > 1) { evalargs->ea_result = SET_ERROR(ECHRNG); + lua_settop(state, 0); (void) lua_pushfstring(state, "Multiple return " "values not supported"); zcp_convert_return_values(state, evalargs->ea_outnvl, @@ -965,6 +967,7 @@ static void zcp_pool_error(zcp_eval_arg_t *evalargs, const char *poolname) { evalargs->ea_result = SET_ERROR(ECHRNG); + lua_settop(evalargs->ea_state, 0); (void) lua_pushfstring(evalargs->ea_state, "Could not open pool: %s", poolname); zcp_convert_return_values(evalargs->ea_state, evalargs->ea_outnvl, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c index 78b2912df1d6..76003e3544f4 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c @@ -220,7 +220,7 @@ spa_features_check(spa_t *spa, boolean_t for_write, * * Note: well-designed features will not need to use this; they should * use spa_feature_is_enabled() and spa_feature_is_active() instead. - * However, this is non-static for zdb and zhack. + * However, this is non-static for zdb, zhack, and spa_add_feature_stats(). */ int feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c index e74799a70fe0..581b6b1bfb64 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c @@ -71,14 +71,10 @@ static char *nulldomain = ""; static int idx_compare(const void *arg1, const void *arg2) { - const fuid_domain_t *node1 = arg1; - const fuid_domain_t *node2 = arg2; + const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; + const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; - if (node1->f_idx < node2->f_idx) - return (-1); - else if (node1->f_idx > node2->f_idx) - return (1); - return (0); + return (AVL_CMP(node1->f_idx, node2->f_idx)); } /* @@ -87,14 +83,13 @@ idx_compare(const void *arg1, const void *arg2) static int domain_compare(const void *arg1, const void *arg2) { - const fuid_domain_t *node1 = arg1; - const fuid_domain_t *node2 = arg2; + const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; + const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; int val; val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); - if (val == 0) - return (0); - return (val > 0 ? 
1 : -1); + + return (AVL_ISIGN(val)); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c index ca6ac539f0d6..af73005c260e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c @@ -195,6 +195,8 @@ #include <sys/zcp.h> #include <sys/zio_checksum.h> #include <sys/vdev_removal.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_initialize.h> #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -3865,6 +3867,80 @@ zfs_ioc_destroy(zfs_cmd_t *zc) } /* + * innvl: { + * vdevs: { + * guid 1, guid 2, ... + * }, + * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND} + * } + * + * outnvl: { + * [func: EINVAL (if provided command type didn't make sense)], + * [vdevs: { + * guid1: errno, (see function body for possible errnos) + * ... + * }] + * } + * + */ +static int +zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + + error = spa_open(poolname, &spa, FTAG); + if (error != 0) + return (error); + + uint64_t cmd_type; + if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, + &cmd_type) != 0) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + if (!(cmd_type == POOL_INITIALIZE_CANCEL || + cmd_type == POOL_INITIALIZE_DO || + cmd_type == POOL_INITIALIZE_SUSPEND)) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + nvlist_t *vdev_guids; + if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, + &vdev_guids) != 0) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + nvlist_t *vdev_errlist = fnvlist_alloc(); + int total_errors = 0; + + for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); + pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { + uint64_t vdev_guid = fnvpair_value_uint64(pair); + + error = spa_vdev_initialize(spa, vdev_guid, cmd_type); + if (error != 0) { + char guid_as_str[MAXNAMELEN]; + + (void) snprintf(guid_as_str, sizeof (guid_as_str), + "%llu", (unsigned long long)vdev_guid); + fnvlist_add_int64(vdev_errlist, guid_as_str, error); + total_errors++; + } + } + if (fnvlist_size(vdev_errlist) > 0) { + fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, + vdev_errlist); + } + fnvlist_free(vdev_errlist); + + spa_close(spa, FTAG); + return (total_errors > 0 ? 
EINVAL : 0); +} + +/* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot @@ -6118,6 +6194,10 @@ zfs_ioctl_init(void) zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, + zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c index b40bdbea123c..7743e81dd5f1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c @@ -594,12 +594,8 @@ zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) int zfs_range_compare(const void *arg1, const void *arg2) { - const rl_t *rl1 = arg1; - const rl_t *rl2 = arg2; - - if (rl1->r_off > rl2->r_off) - return (1); - if (rl1->r_off < rl2->r_off) - return (-1); - return (0); + const rl_t *rl1 = (const rl_t *)arg1; + const rl_t *rl2 = (const rl_t *)arg2; + + return (AVL_CMP(rl1->r_off, rl2->r_off)); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c index 980c32820c3f..66d858081485 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c @@ -2630,6 +2630,7 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) dmu_tx_commit(tx); zfsvfs->z_version = newvers; + os->os_version = newvers; zfs_set_fuid_feature(zfsvfs); @@ -2642,17 +2643,47 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { - const char *pname; - int error = ENOENT; + uint64_t *cached_copy = NULL; /* - * Look up the file system's value for the property. For the - * version property, we look up a slightly different string. + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. */ - if (prop == ZFS_PROP_VERSION) + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { pname = ZPL_VERSION_STR; - else + } else { pname = zfs_prop_to_name(prop); + } if (os != NULL) { ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); @@ -2677,6 +2708,15 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) } error = 0; } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. 
+ */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + return (error); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c index 58c3807f6ae4..ca34a69a6553 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c @@ -1691,7 +1691,8 @@ zfs_trunc(znode_t *zp, uint64_t end) return (0); } - error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, + DMU_OBJECT_END); if (error) { zfs_range_unlock(rl); return (error); @@ -2102,6 +2103,17 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, *path = '\0'; sa_hdl = hdl; + uint64_t deleteq_obj; + VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, + ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); + error = zap_lookup_int(osp, deleteq_obj, obj); + if (error == 0) { + return (ESTALE); + } else if (error != ENOENT) { + return (error); + } + error = 0; + for (;;) { uint64_t pobj; char component[MAXNAMELEN + 2]; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c index 9571998347f2..8cc65d6f31f8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -131,17 +131,11 @@ zil_bp_compare(const void *x1, const void *x2) const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; - if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) - return (-1); - if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) - return (1); - - if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) - return (-1); - if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) - return (1); + int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + if (likely(cmp)) + return (cmp); - return (0); + return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void @@ -503,12 +497,7 @@ zil_lwb_vdev_compare(const void *x1, const void *x2) const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; - if (v1 < v2) - return (-1); - if (v1 > v2) - return (1); - - return (0); + return (AVL_CMP(v1, v2)); } static lwb_t * @@ -665,7 +654,8 @@ zil_create(zilog_t *zilog) BP_ZERO(&blk); } - error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL, + error = zio_alloc_zil(zilog->zl_spa, + zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL, ZIL_MIN_BLKSZ, &slog); if (error == 0) @@ -1342,7 +1332,8 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) BP_ZERO(bp); /* pass the old blkptr in order to spread log blocks across devs */ - error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog); + error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object, + txg, bp, &lwb->lwb_blk, zil_blksz, &slog); if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); bp->blk_cksum = lwb->lwb_blk.blk_cksum; @@ -1624,12 +1615,7 @@ zil_aitx_compare(const void *x1, const void *x2) const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; - if (o1 < o2) - return (-1); - if (o1 > o2) - return (1); - - return (0); + return 
(AVL_CMP(o1, o2)); } /* @@ -2297,7 +2283,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) */ lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); - ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); + IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED); /* * Since the lwb's zio hadn't been issued by the time this thread diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index 3eb8747619fa..53d0f4d27b08 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -44,6 +44,7 @@ #include <sys/dsl_scan.h> #include <sys/metaslab_impl.h> #include <sys/abd.h> +#include <sys/cityhash.h> SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); @@ -99,9 +100,6 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; extern vmem_t *zio_alloc_arena; #endif -#define ZIO_PIPELINE_CONTINUE 0x100 -#define ZIO_PIPELINE_STOP 0x101 - #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul @@ -538,7 +536,8 @@ zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) } static void -zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) +zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, + zio_t **next_to_executep) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; @@ -557,13 +556,33 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); + /* - * Dispatch the parent zio in its own taskq so that - * the child can continue to make progress. This also - * prevents overflowing the stack when we have deeply nested - * parent-child relationships. + * If we can tell the caller to execute this parent next, do + * so. Otherwise dispatch the parent zio as its own task. + * + * Having the caller execute the parent when possible reduces + * locking on the zio taskq's, reduces context switch + * overhead, and has no recursion penalty. Note that one + * read from disk typically causes at least 3 zio's: a + * zio_null(), the logical zio_read(), and then a physical + * zio. When the physical ZIO completes, we are able to call + * zio_done() on all 3 of these zio's from one invocation of + * zio_execute() by returning the parent back to + * zio_execute(). Since the parent isn't executed until this + * thread returns back to zio_execute(), the caller should do + * so promptly. + * + * In other cases, dispatching the parent prevents + * overflowing the stack when we have deeply nested + * parent-child relationships, as we do with the "mega zio" + * of writes for spa_sync(), and the chain of ZIL blocks. 
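The dispatch decision described in the comment above is generic enough to sketch on its own: when the last child completes, the parent is either handed back through an out-parameter for the caller's execute loop or dispatched to a task queue. work_t and dispatch_to_taskq() are illustrative names, not the ZIO types:

#include <stddef.h>

typedef struct work {
        struct work     *w_parent;
        int             w_pending;      /* outstanding children */
} work_t;

/* stand-in for zio_taskq_dispatch(): queue for a worker thread */
static void
dispatch_to_taskq(work_t *w)
{
        (void) w;
}

/*
 * Called as each child completes. When the last child finishes,
 * either hand the parent back to the caller's execute loop (no
 * context switch, no recursion) or dispatch it to a task queue
 * (bounded stack depth for deep parent/child chains).
 */
static void
notify_parent(work_t *child, work_t **next_to_runp)
{
        work_t *p = child->w_parent;

        if (p == NULL || --p->w_pending > 0)
                return;

        if (next_to_runp != NULL && *next_to_runp == NULL)
                *next_to_runp = p;      /* caller runs it next */
        else
                dispatch_to_taskq(p);
}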
*/ - zio_taskq_dispatch(pio, type, B_FALSE); + if (next_to_executep != NULL && *next_to_executep == NULL) { + *next_to_executep = pio; + } else { + zio_taskq_dispatch(pio, type, B_FALSE); + } } else { mutex_exit(&pio->io_lock); } @@ -1149,17 +1168,6 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || done != NULL); - /* - * In the common case, where the parent zio was to a normal vdev, - * the child zio must be to a child vdev of that vdev. Otherwise, - * the child zio must be to a top-level vdev. - */ - if (pio->io_vd != NULL && pio->io_vd->vdev_ops != &vdev_indirect_ops) { - ASSERT3P(vd->vdev_parent, ==, pio->io_vd); - } else { - ASSERT3P(vd, ==, vd->vdev_top); - } - if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the @@ -1223,7 +1231,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, - int type, zio_priority_t priority, enum zio_flag flags, + zio_type_t type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { zio_t *zio; @@ -1285,7 +1293,7 @@ zio_shrink(zio_t *zio, uint64_t size) * ========================================================================== */ -static int +static zio_t * zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -1322,14 +1330,14 @@ zio_read_bp_init(zio_t *zio) if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); @@ -1344,7 +1352,7 @@ zio_write_bp_init(zio_t *zio) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (BP_IS_EMBEDDED(bp)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); /* * If we've been overridden and nopwrite is set then @@ -1355,13 +1363,13 @@ zio_write_bp_init(zio_t *zio) ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) - return (ZIO_PIPELINE_CONTINUE); + return (zio); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); @@ -1369,7 +1377,7 @@ zio_write_bp_init(zio_t *zio) if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -1381,10 +1389,10 @@ zio_write_bp_init(zio_t *zio) zio->io_pipeline = zio->io_orig_pipeline; } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -1403,11 +1411,11 @@ zio_write_compress(zio_t *zio) */ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } if (!IO_IS_ALLOCATING(zio)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); if (zio->io_children_ready != NULL) { /* @@ -1466,7 +1474,7 @@ zio_write_compress(zio_t *zio) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } else { /* * Round up compressed size up to the ashift @@ 
-1554,10 +1562,10 @@ zio_write_compress(zio_t *zio) zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -1569,7 +1577,7 @@ zio_free_bp_init(zio_t *zio) ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -1643,12 +1651,12 @@ zio_taskq_member(zio_t *zio, zio_taskq_type_t q) return (B_FALSE); } -static int +static zio_t * zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (ZIO_PIPELINE_STOP); + return (NULL); } void @@ -1730,14 +1738,13 @@ static zio_pipe_stage_t *zio_pipeline[]; void zio_execute(zio_t *zio) { - zio->io_executor = curthread; - ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; - int rv; + + zio->io_executor = curthread; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); @@ -1768,12 +1775,16 @@ zio_execute(zio_t *zio) zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; - rv = zio_pipeline[highbit64(stage) - 1](zio); - if (rv == ZIO_PIPELINE_STOP) - return; + /* + * The zio pipeline stage returns the next zio to execute + * (typically the same as this one), or NULL if we should + * stop. + */ + zio = zio_pipeline[highbit64(stage) - 1](zio); - ASSERT(rv == ZIO_PIPELINE_CONTINUE); + if (zio == NULL) + return; } } @@ -2236,7 +2247,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, zio_nowait(zio); } -static int +static zio_t * zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -2248,16 +2259,16 @@ zio_gang_assemble(zio_t *zio) zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); @@ -2271,7 +2282,7 @@ zio_gang_issue(zio_t *zio) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } static void @@ -2310,7 +2321,7 @@ zio_write_gang_done(zio_t *zio) abd_put(zio->io_abd); } -static int +static zio_t * zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; @@ -2335,7 +2346,8 @@ zio_write_gang_block(zio_t *pio) ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); flags |= METASLAB_ASYNC_ALLOC; - VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); + VERIFY(refcount_held(&mc->mc_alloc_slots[pio->io_allocator], + pio)); /* * The logical zio has already placed a reservation for @@ -2346,12 +2358,12 @@ zio_write_gang_block(zio_t *pio) * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, - pio, flags)); + pio->io_allocator, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, - &pio->io_alloc_list, pio); + &pio->io_alloc_list, pio, pio->io_allocator); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); @@ -2365,10 +2377,10 @@ zio_write_gang_block(zio_t *pio) * stage. 
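Reduced to essentials, the new pipeline contract sketched in the zio_execute() hunk above looks like this; the types, stage count, and loop bound are illustrative, not the ZIO definitions:

#define NSTAGES 4       /* illustrative */

typedef struct item item_t;
typedef item_t *(*stage_fn_t)(item_t *);

struct item {
        int             i_stage;
        stage_fn_t      i_pipeline[NSTAGES];
};

static void
execute(item_t *it)
{
        while (it != NULL && it->i_stage < NSTAGES) {
                /*
                 * A stage returns the next item to run: usually its own
                 * argument, sometimes a parent made runnable by the
                 * notify step, or NULL if the item was handed off to a
                 * task queue or is waiting on children.
                 */
                it = it->i_pipeline[it->i_stage++](it);
        }
}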
*/ metaslab_class_throttle_unreserve(mc, - gbh_copies - copies, pio); + gbh_copies - copies, pio->io_allocator, pio); } pio->io_error = error; - return (ZIO_PIPELINE_CONTINUE); + return (pio); } if (pio == gio) { @@ -2423,7 +2435,7 @@ zio_write_gang_block(zio_t *pio) * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, - zp.zp_copies, cio, flags)); + zp.zp_copies, cio->io_allocator, cio, flags)); } zio_nowait(cio); } @@ -2435,7 +2447,7 @@ zio_write_gang_block(zio_t *pio) zio_nowait(zio); - return (ZIO_PIPELINE_CONTINUE); + return (pio); } /* @@ -2456,7 +2468,7 @@ zio_write_gang_block(zio_t *pio) * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ -static int +static zio_t * zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -2483,7 +2495,7 @@ zio_nop_write(zio_t *zio) BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); /* * If the checksums match then reset the pipeline so that we @@ -2503,7 +2515,7 @@ zio_nop_write(zio_t *zio) zio->io_flags |= ZIO_FLAG_NOPWRITE; } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -2531,7 +2543,7 @@ zio_ddt_child_read_done(zio_t *zio) mutex_exit(&pio->io_lock); } -static int +static zio_t * zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -2551,7 +2563,7 @@ zio_ddt_read_start(zio_t *zio) zio->io_vsd = dde; if (ddp_self == NULL) - return (ZIO_PIPELINE_CONTINUE); + return (zio); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) @@ -2564,23 +2576,23 @@ zio_ddt_read_start(zio_t *zio) zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } ASSERT(BP_GET_DEDUP(bp)); @@ -2592,12 +2604,12 @@ zio_ddt_read_done(zio_t *zio) ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); - return (ZIO_PIPELINE_STOP); + return (NULL); } if (dde->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_repair_abd, @@ -2610,7 +2622,7 @@ zio_ddt_read_done(zio_t *zio) ASSERT(zio->io_vsd == NULL); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } static boolean_t @@ -2768,7 +2780,7 @@ zio_ddt_ditto_write_done(zio_t *zio) ddt_exit(ddt); } -static int +static zio_t * zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -2812,7 +2824,7 @@ zio_ddt_write(zio_t *zio) ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); @@ -2838,7 +2850,7 @@ zio_ddt_write(zio_t *zio) zio->io_bp_override = NULL; BP_ZERO(bp); ddt_exit(ddt); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, @@ -2880,12 +2892,12 @@ zio_ddt_write(zio_t *zio) if (dio) 
zio_nowait(dio); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ddt_entry_t *freedde; /* for debugging */ -static int +static zio_t * zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -2903,7 +2915,7 @@ zio_ddt_free(zio_t *zio) ddt_phys_decref(ddp); ddt_exit(ddt); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -2913,13 +2925,13 @@ zio_ddt_free(zio_t *zio) */ static zio_t * -zio_io_to_allocate(spa_t *spa) +zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; - ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); + ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); - zio = avl_first(&spa->spa_alloc_tree); + zio = avl_first(&spa->spa_alloc_trees[allocator]); if (zio == NULL) return (NULL); @@ -2929,18 +2941,19 @@ zio_io_to_allocate(spa_t *spa) * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. */ + ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(spa_normal_class(spa), - zio->io_prop.zp_copies, zio, 0)) { + zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { return (NULL); } - avl_remove(&spa->spa_alloc_tree, zio); + avl_remove(&spa->spa_alloc_trees[allocator], zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); } -static int +static zio_t * zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -2950,7 +2963,7 @@ zio_dva_throttle(zio_t *zio) !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ASSERT(zio->io_child_type > ZIO_CHILD_GANG); @@ -2958,40 +2971,35 @@ zio_dva_throttle(zio_t *zio) ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); - mutex_enter(&spa->spa_alloc_lock); + zbookmark_phys_t *bm = &zio->io_bookmark; + /* + * We want to try to use as many allocators as possible to help improve + * performance, but we also want logically adjacent IOs to be physically + * adjacent to improve sequential read performance. We chunk each object + * into 2^20 block regions, and then hash based on the objset, object, + * level, and region to accomplish both of these goals. + */ + zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, + bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; + mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); ASSERT(zio->io_type == ZIO_TYPE_WRITE); - avl_add(&spa->spa_alloc_tree, zio); + avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); - nio = zio_io_to_allocate(zio->io_spa); - mutex_exit(&spa->spa_alloc_lock); + nio = zio_io_to_allocate(zio->io_spa, zio->io_allocator); + mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); - if (nio == zio) - return (ZIO_PIPELINE_CONTINUE); - - if (nio != NULL) { - ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE); - /* - * We are passing control to a new zio so make sure that - * it is processed by a different thread. We do this to - * avoid stack overflows that can occur when parents are - * throttled and children are making progress. We allow - * it to go to the head of the taskq since it's already - * been waiting. 
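The comment above describes the new allocator-selection scheme: hash (objset, object, level, blkid >> 20) and take the result modulo spa_alloc_count, so each 2^20-block region of an object sticks to one allocator. A stand-alone sketch of that computation, where hash4() is a stand-in mixer for cityhash4() and ALLOC_COUNT is a stand-in for spa->spa_alloc_count (both names are assumptions):

#include <stdint.h>

#define ALLOC_COUNT     4       /* stand-in for spa->spa_alloc_count */

/* Stand-in for cityhash4(); any good 4-word mixer would do here. */
static uint64_t
hash4(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
        uint64_t h = a * 0x9e3779b97f4a7c15ULL;

        h ^= b + (h << 6);
        h ^= c + (h >> 2);
        h ^= d + (h << 3);
        return (h);
}

static int
pick_allocator(uint64_t objset, uint64_t object, uint64_t level,
    uint64_t blkid)
{
        /* blkid >> 20: all blocks of one 2^20-block region hash alike */
        return ((int)(hash4(objset, object, level, blkid >> 20) %
            ALLOC_COUNT));
}

Because only blkid >> 20 enters the hash, pick_allocator(os, obj, 0, 100) and pick_allocator(os, obj, 0, 1000) are guaranteed equal, which keeps logically sequential writes physically adjacent, while unrelated objects spread across all allocators.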
- */ - zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE); - } - return (ZIO_PIPELINE_STOP); + return (nio); } void -zio_allocate_dispatch(spa_t *spa) +zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; - mutex_enter(&spa->spa_alloc_lock); - zio = zio_io_to_allocate(spa); - mutex_exit(&spa->spa_alloc_lock); + mutex_enter(&spa->spa_alloc_locks[allocator]); + zio = zio_io_to_allocate(spa, allocator); + mutex_exit(&spa->spa_alloc_locks[allocator]); if (zio == NULL) return; @@ -3000,7 +3008,7 @@ zio_allocate_dispatch(spa_t *spa) zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } -static int +static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -3032,10 +3040,10 @@ zio_dva_allocate(zio_t *zio) error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, - &zio->io_alloc_list, zio); + &zio->io_alloc_list, zio, zio->io_allocator); if (error != 0) { - spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " + zfs_dbgmsg("%s: metaslab allocation failure: zio %p, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, error); if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) @@ -3043,18 +3051,18 @@ zio_dva_allocate(zio_t *zio) zio->io_error = error; } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_dva_claim(zio_t *zio) { int error; @@ -3063,7 +3071,7 @@ zio_dva_claim(zio_t *zio) if (error) zio->io_error = error; - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -3092,8 +3100,8 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int -zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, - uint64_t size, boolean_t *slog) +zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp, + blkptr_t *old_bp, uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; @@ -3101,14 +3109,22 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); + /* + * When allocating a zil block, we don't have information about + * the final destination of the block except the objset it's part + * of, so we just hash the objset ID to pick the allocator to get + * some parallelism. + */ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, - txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL); + txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL, + cityhash4(0, 0, 0, objset) % spa->spa_alloc_count); if (error == 0) { *slog = TRUE; } else { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, - &io_alloc_list, NULL); + &io_alloc_list, NULL, cityhash4(0, 0, 0, objset) % + spa->spa_alloc_count); if (error == 0) *slog = FALSE; } @@ -3150,7 +3166,7 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ -static int +static zio_t * zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -3169,13 +3185,13 @@ zio_vdev_io_start(zio_t *zio) * The mirror_ops handle multiple DVAs in a single BP. 
*/ vdev_mirror_ops.vdev_op_io_start(zio); - return (ZIO_PIPELINE_STOP); + return (NULL); } if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && zio->io_priority == ZIO_PRIORITY_NOW) { trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } ASSERT3P(zio->io_logical, !=, zio); @@ -3183,9 +3199,13 @@ zio_vdev_io_start(zio_t *zio) ASSERT(spa->spa_trust_config); if (zio->io_vd->vdev_removing) { + /* + * Note: the code can handle other kinds of writes, + * but we don't expect them. + */ ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | - ZIO_FLAG_INDUCE_DAMAGE)); + ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } } @@ -3251,39 +3271,58 @@ zio_vdev_io_start(zio_t *zio) * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. - * This prevents spurious resilvering with nested replication. - * For example, given a mirror of mirrors, (A+B)+(C+D), if only - * A is out of date, we'll read from C+D, then use the data to - * resilver A+B -- but we don't actually want to resilver B, just A. - * The top-level mirror has no way to know this, so instead we just - * discard unnecessary repairs as we work our way down the vdev tree. - * The same logic applies to any form of nested replication: - * ditto + mirror, RAID-Z + replacing, etc. This covers them all. + * This prevents spurious resilvering. + * + * There are a few ways that we can end up creating these spurious + * resilver i/os: + * + * 1. A resilver i/o will be issued if any DVA in the BP has a + * dirty DTL. The mirror code will issue resilver writes to + * each DVA, including the one(s) that are not on vdevs with dirty + * DTLs. + * + * 2. With nested replication, which happens when we have a + * "replacing" or "spare" vdev that's a child of a mirror or raidz. + * For example, given mirror(replacing(A+B), C), it's likely that + * only A is out of date (it's the new device). In this case, we'll + * read from C, then use the data to resilver A+B -- but we don't + * actually want to resilver B, just A. The top-level mirror has no + * way to know this, so instead we just discard unnecessary repairs + * as we work our way down the vdev tree. + * + * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. + * The same logic applies to any form of nested replication: ditto + * + mirror, RAID-Z + replacing, etc. + * + * However, indirect vdevs point off to other vdevs which may have + * DTL's, so we never bypass them. The child i/os on concrete vdevs + * will be properly bypassed instead. 
*/ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ + vd->vdev_ops != &vdev_indirect_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } if (vd->vdev_ops->vdev_op_leaf) { switch (zio->io_type) { case ZIO_TYPE_READ: if (vdev_cache_read(zio)) - return (ZIO_PIPELINE_CONTINUE); + return (zio); /* FALLTHROUGH */ case ZIO_TYPE_WRITE: case ZIO_TYPE_FREE: if ((zio = vdev_queue_io(zio)) == NULL) - return (ZIO_PIPELINE_STOP); + return (NULL); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); - return (ZIO_PIPELINE_STOP); + return (NULL); } break; } @@ -3295,14 +3334,14 @@ zio_vdev_io_start(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE && !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && !trim_map_write_start(zio)) - return (ZIO_PIPELINE_STOP); + return (NULL); } vd->vdev_ops->vdev_op_io_start(zio); - return (ZIO_PIPELINE_STOP); + return (NULL); } -static int +static zio_t * zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -3310,7 +3349,7 @@ zio_vdev_io_done(zio_t *zio) boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } ASSERT(zio->io_type == ZIO_TYPE_READ || @@ -3353,7 +3392,7 @@ zio_vdev_io_done(zio_t *zio) if (unexpected_error) VERIFY(vdev_probe(vd, zio) == NULL); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -3411,13 +3450,13 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) zcr->zcr_free = zio_buf_free; } -static int +static zio_t * zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) @@ -3463,7 +3502,7 @@ zio_vdev_io_assess(zio_t *zio) zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); - return (ZIO_PIPELINE_STOP); + return (NULL); } /* @@ -3503,7 +3542,7 @@ zio_vdev_io_assess(zio_t *zio) zio->io_physdone(zio->io_logical); } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } void @@ -3538,7 +3577,7 @@ zio_vdev_io_bypass(zio_t *zio) * Generate and verify checksums * ========================================================================== */ -static int +static zio_t * zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -3552,7 +3591,7 @@ zio_checksum_generate(zio_t *zio) checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) - return (ZIO_PIPELINE_CONTINUE); + return (zio); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { @@ -3566,10 +3605,10 @@ zio_checksum_generate(zio_t *zio) zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } -static int +static zio_t * zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; @@ -3584,7 +3623,7 @@ zio_checksum_verify(zio_t *zio) * We're either verifying a label checksum, or nothing at all. 
*/ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) - return (ZIO_PIPELINE_CONTINUE); + return (zio); ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); } @@ -3599,7 +3638,7 @@ zio_checksum_verify(zio_t *zio) } } - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -3642,7 +3681,7 @@ zio_worst_error(int e1, int e2) * I/O completion * ========================================================================== */ -static int +static zio_t * zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; @@ -3651,7 +3690,7 @@ zio_ready(zio_t *zio) if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } if (zio->io_ready) { @@ -3678,8 +3717,8 @@ zio_ready(zio_t *zio) */ metaslab_class_throttle_unreserve( spa_normal_class(zio->io_spa), - zio->io_prop.zp_copies, zio); - zio_allocate_dispatch(zio->io_spa); + zio->io_prop.zp_copies, zio->io_allocator, zio); + zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } @@ -3697,7 +3736,7 @@ zio_ready(zio_t *zio) */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); - zio_notify_parent(pio, zio, ZIO_WAIT_READY); + zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); } if (zio->io_flags & ZIO_FLAG_NODATA) { @@ -3713,7 +3752,7 @@ zio_ready(zio_t *zio) zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); - return (ZIO_PIPELINE_CONTINUE); + return (zio); } /* @@ -3762,21 +3801,22 @@ zio_dva_throttle_done(zio_t *zio) ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); mutex_enter(&pio->io_lock); - metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags); + metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, + pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), - 1, pio); + 1, pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that * needs to be done. If there is work to be done it will be * dispatched to another taskq thread. */ - zio_allocate_dispatch(zio->io_spa); + zio_allocate_dispatch(zio->io_spa, pio->io_allocator); } -static int +static zio_t * zio_done(zio_t *zio) { spa_t *spa = zio->io_spa; @@ -3793,7 +3833,7 @@ zio_done(zio_t *zio) * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { - return (ZIO_PIPELINE_STOP); + return (NULL); } /* @@ -3816,8 +3856,10 @@ zio_done(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(bp != NULL); - metaslab_group_alloc_verify(spa, zio->io_bp, zio); - VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio)); + metaslab_group_alloc_verify(spa, zio->io_bp, zio, + zio->io_allocator); + VERIFY(refcount_not_held(&mc->mc_alloc_slots[zio->io_allocator], + zio)); } for (int c = 0; c < ZIO_CHILD_TYPES; c++) @@ -4005,7 +4047,12 @@ zio_done(zio_t *zio) if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); - zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + /* + * This is a rare code path, so we don't + * bother with "next_to_execute". + */ + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, + NULL); } } @@ -4017,7 +4064,11 @@ zio_done(zio_t *zio) */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; - zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + /* + * This is a rare code path, so we don't bother with + * "next_to_execute". 
+ */ + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend @@ -4038,7 +4089,7 @@ zio_done(zio_t *zio) ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent); } - return (ZIO_PIPELINE_STOP); + return (NULL); } ASSERT(zio->io_child_count == 0); @@ -4068,12 +4119,17 @@ zio_done(zio_t *zio) zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); + /* + * We are done executing this zio. We may want to execute a parent + * next. See the comment in zio_notify_parent(). + */ + zio_t *next_to_execute = NULL; zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); zio_remove_child(pio, zio, remove_zl); - zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); } if (zio->io_waiter != NULL) { @@ -4085,7 +4141,7 @@ zio_done(zio_t *zio) zio_destroy(zio); } - return (ZIO_PIPELINE_STOP); + return (next_to_execute); } /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c index 7e05b9212db0..b87303889ddb 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c @@ -25,7 +25,7 @@ */ /* * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -56,6 +56,12 @@ static zcomp_stats_t zcomp_stats = { kstat_t *zcomp_ksp; /* + * If nonzero, one in every X decompression attempts will fail, simulating + * an undetected memory error. + */ +uint64_t zio_decompress_fail_fraction = 0; + +/* * Compression vectors. */ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { @@ -172,6 +178,16 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); abd_return_buf(src, tmp, s_len); + /* + * Decompression shouldn't fail, because we've already verified + * the checksum. However, for extra protection (e.g. against bitflips + * in non-ECC RAM), we handle this error (and test it). 
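zio_decompress_fail_fraction, added above, is a fault-injection tunable: when set to a nonzero X, roughly one decompression in X is forced to report failure so that this otherwise nearly dead error path gets exercised. A sketch of the gating pattern, with rand64() as a stand-in for spa_get_random() and plain EINVAL standing in for the kernel's SET_ERROR(EINVAL):

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

static uint64_t fail_fraction = 0;      /* 0 disables injection */

/* Stand-in for spa_get_random(): uniform value in [0, range). */
static uint64_t
rand64(uint64_t range)
{
        return ((uint64_t)random() % range);
}

/*
 * Wrap a decompression result: with fail_fraction == X, the uniform
 * draw equals zero once in X calls on average, so about 1/X of calls
 * are turned into EINVAL. ret is expected to be 0 on entry, as the
 * ASSERT0 above notes.
 */
static int
maybe_inject_fault(int ret)
{
        if (fail_fraction != 0 && rand64(fail_fraction) == 0)
                ret = EINVAL;
        return (ret);
}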
+ */ + ASSERT0(ret); + if (zio_decompress_fail_fraction != 0 && + spa_get_random(zio_decompress_fail_fraction) == 0) + ret = SET_ERROR(EINVAL); + return (ret); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c index 805c63d09a01..3c7f669a351a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c @@ -174,7 +174,7 @@ typedef struct zvol_state { zilog_t *zv_zilog; /* ZIL handle */ list_t zv_extents; /* List of extents for dump */ znode_t zv_znode; /* for range locking */ - dmu_buf_t *zv_dbuf; /* bonus handle */ + dnode_t *zv_dn; /* dnode hold */ #ifndef illumos int zv_state; int zv_volmode; /* Provide GEOM or cdev */ @@ -868,7 +868,7 @@ zvol_first_open(zvol_state_t *zv) } zv->zv_volblocksize = doi.doi_data_block_size; - error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); + error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn); if (error) { dmu_objset_disown(os, zvol_tag); return (error); @@ -893,8 +893,8 @@ zvol_last_close(zvol_state_t *zv) zil_close(zv->zv_zilog); zv->zv_zilog = NULL; - dmu_buf_rele(zv->zv_dbuf, zvol_tag); - zv->zv_dbuf = NULL; + dnode_rele(zv->zv_dn, zvol_tag); + zv->zv_dn = NULL; /* * Evict cached data @@ -1342,8 +1342,6 @@ static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; - objset_t *os = zv->zv_objset; - uint64_t object = ZVOL_OBJ; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; /* length of user data */ dmu_buf_t *db; @@ -1367,7 +1365,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) if (buf != NULL) { /* immediate write */ zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); - error = dmu_read(os, object, offset, size, buf, + error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ /* @@ -1380,7 +1378,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) offset = P2ALIGN(offset, size); zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); - error = dmu_buf_hold(os, object, offset, zgd, &db, + error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; @@ -1451,8 +1449,8 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (wr_state == WR_COPIED ? 
len : 0)); lr = (lr_write_t *)&itx->itx_lr; - if (wr_state == WR_COPIED && dmu_read(zv->zv_objset, - ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, + off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; @@ -1874,7 +1872,7 @@ zvol_read(struct cdev *dev, struct uio *uio, int ioflag) if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; - error = dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes); + error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) @@ -1946,7 +1944,7 @@ zvol_write(struct cdev *dev, struct uio *uio, int ioflag) dmu_tx_abort(tx); break; } - error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); + error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx); if (error == 0) zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); @@ -2028,7 +2026,7 @@ zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs) int zvol_get_volume_params(minor_t minor, uint64_t *blksize, uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl, - void **rl_hdl, void **bonus_hdl) + void **rl_hdl, void **dnode_hdl) { zvol_state_t *zv; @@ -2039,7 +2037,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize, return (SET_ERROR(ENXIO)); ASSERT(blksize && max_xfer_len && minor_hdl && - objset_hdl && zil_hdl && rl_hdl && bonus_hdl); + objset_hdl && zil_hdl && rl_hdl && dnode_hdl); *blksize = zv->zv_volblocksize; *max_xfer_len = (uint64_t)zvol_maxphys; @@ -2047,7 +2045,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize, *objset_hdl = zv->zv_objset; *zil_hdl = zv->zv_zilog; *rl_hdl = &zv->zv_znode; - *bonus_hdl = zv->zv_dbuf; + *dnode_hdl = zv->zv_dn; return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h index 10e0ddaeef88..fea46c90481d 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h @@ -107,6 +107,14 @@ extern "C" { /* + * AVL comparator helpers + */ +#define AVL_ISIGN(a) (((a) > 0) - ((a) < 0)) +#define AVL_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define AVL_PCMP(a, b) \ + (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b))) + +/* * Type used for the root of the AVL tree. */ typedef struct avl_tree avl_tree_t; diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h index c347b63a1a6f..3fcda9e8965e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -118,7 +118,7 @@ typedef enum { ZFS_PROP_SNAPDIR, ZFS_PROP_ACLMODE, ZFS_PROP_ACLINHERIT, - ZFS_PROP_CREATETXG, /* not exposed to the user */ + ZFS_PROP_CREATETXG, ZFS_PROP_NAME, /* not exposed to the user */ ZFS_PROP_CANMOUNT, ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ @@ -637,6 +637,13 @@ typedef struct zpool_load_policy { #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" +#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ + "com.delphix:next_offset_to_initialize" +#define VDEV_LEAF_ZAP_INITIALIZE_STATE \ + "com.delphix:vdev_initialize_state" +#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ + "com.delphix:vdev_initialize_action_time" + /* * This is needed in userland to report the minimum necessary device size. 
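Among the hunks above, avl.h gains branch-free three-way comparison helpers: AVL_CMP(a, b) evaluates to -1, 0, or 1, and AVL_PCMP does the same on pointer values. A usage sketch for a comparator over a hypothetical offset-sorted node type (node_t is an assumption, not from this diff; the macros mirror the definitions added above):

#include <stdint.h>

#define AVL_CMP(a, b)   (((a) > (b)) - ((a) < (b)))
#define AVL_PCMP(a, b)  \
        (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))

typedef struct node {
        uint64_t        n_offset;
} node_t;

/* AVL comparators must return -1, 0, or 1; ties break on address. */
static int
node_compare(const void *x1, const void *x2)
{
        const node_t *n1 = x1;
        const node_t *n2 = x2;
        int cmp = AVL_CMP(n1->n_offset, n2->n_offset);

        if (cmp != 0)
                return (cmp);
        return (AVL_PCMP(x1, x2));
}

The subtraction trick avoids both branches and the overflow risk of the naive (int)(a - b) comparator when the operands are 64-bit.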
* @@ -735,6 +742,15 @@ typedef enum pool_scrub_cmd { POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; +/* + * Initialize functions. + */ +typedef enum pool_initialize_func { + POOL_INITIALIZE_DO, + POOL_INITIALIZE_CANCEL, + POOL_INITIALIZE_SUSPEND, + POOL_INITIALIZE_FUNCS +} pool_initialize_func_t; /* * ZIO types. Needed to interpret vdev statistics below. @@ -814,6 +830,14 @@ typedef struct pool_checkpoint_stat { uint64_t pcs_space; /* checkpointed space */ } pool_checkpoint_stat_t; +typedef enum { + VDEV_INITIALIZE_NONE, + VDEV_INITIALIZE_ACTIVE, + VDEV_INITIALIZE_CANCELED, + VDEV_INITIALIZE_SUSPENDED, + VDEV_INITIALIZE_COMPLETE +} vdev_initializing_state_t; + /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. @@ -840,6 +864,11 @@ typedef struct vdev_stat { uint64_t vs_physical_ashift; /* vdev_physical_ashift */ uint64_t vs_fragmentation; /* device fragmentation */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ + uint64_t vs_initialize_errors; /* initializing errors */ + uint64_t vs_initialize_bytes_done; /* bytes initialized */ + uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ + uint64_t vs_initialize_state; /* vdev_initializing_state_t */ + uint64_t vs_initialize_action_time; /* time_t */ } vdev_stat_t; #define VDEV_STAT_VALID(field, uint64_t_field_count) \ ((uint64_t_field_count * sizeof(uint64_t)) >= \ @@ -974,6 +1003,7 @@ typedef enum zfs_ioc { ZFS_IOC_REMAP, ZFS_IOC_POOL_CHECKPOINT, ZFS_IOC_POOL_DISCARD_CHECKPOINT, + ZFS_IOC_POOL_INITIALIZE, ZFS_IOC_LAST } zfs_ioc_t; @@ -1037,6 +1067,12 @@ typedef enum { #define ZPOOL_HIST_ERRNO "errno" /* + * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. + */ +#define ZPOOL_INITIALIZE_COMMAND "initialize_command" +#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" + +/* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h index e4545a96ee76..52d6aea0a364 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_NVPAIR_H @@ -39,6 +39,7 @@ extern "C" { #endif typedef enum { + DATA_TYPE_DONTCARE = -1, DATA_TYPE_UNKNOWN = 0, DATA_TYPE_BOOLEAN, DATA_TYPE_BYTE, diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h index f12dbbfe6ef5..c9874b3e4db7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h @@ -24,11 +24,13 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2017 by Delphix. All rights reserved. 
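ZPOOL_INITIALIZE_COMMAND and ZPOOL_INITIALIZE_VDEVS above name the nvlist arguments of the new ZFS_IOC_POOL_INITIALIZE ioctl. The sketch below shows one plausible way userland could pack them, assuming the fnvlist_*() convenience API is available and assuming the payload is a uint64 command plus an nvlist mapping vdev names to guids; the payload layout and the guid mapping are assumptions, not taken from this diff:

#include <libnvpair.h>
#include <stdint.h>

/* mirroring the definitions added above */
#define ZPOOL_INITIALIZE_COMMAND        "initialize_command"
#define ZPOOL_INITIALIZE_VDEVS          "initialize_vdevs"

static nvlist_t *
make_initialize_args(uint64_t cmd, const char *vdev_name, uint64_t guid)
{
        nvlist_t *args = fnvlist_alloc();
        nvlist_t *vdevs = fnvlist_alloc();

        /* assumed layout: one uint64 guid per target vdev */
        fnvlist_add_uint64(vdevs, vdev_name, guid);
        fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, cmd);
        fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs);
        fnvlist_free(vdevs);
        return (args);
}

A caller would pass POOL_INITIALIZE_DO, POOL_INITIALIZE_CANCEL, or POOL_INITIALIZE_SUSPEND from the enum above as cmd.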
+ */ + #ifndef _NVPAIR_IMPL_H #define _NVPAIR_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -47,16 +49,27 @@ typedef struct i_nvp i_nvp_t; struct i_nvp { union { - uint64_t _nvi_align; /* ensure alignment */ + /* ensure alignment */ + uint64_t _nvi_align; + struct { - i_nvp_t *_nvi_next; /* pointer to next nvpair */ - i_nvp_t *_nvi_prev; /* pointer to prev nvpair */ + /* pointer to next nvpair */ + i_nvp_t *_nvi_next; + + /* pointer to prev nvpair */ + i_nvp_t *_nvi_prev; + + /* next pair in table bucket */ + i_nvp_t *_nvi_hashtable_next; } _nvi; } _nvi_un; - nvpair_t nvi_nvp; /* nvpair */ + + /* nvpair */ + nvpair_t nvi_nvp; }; #define nvi_next _nvi_un._nvi._nvi_next #define nvi_prev _nvi_un._nvi._nvi_prev +#define nvi_hashtable_next _nvi_un._nvi._nvi_hashtable_next typedef struct { i_nvp_t *nvp_list; /* linked list of nvpairs */ @@ -64,6 +77,10 @@ typedef struct { i_nvp_t *nvp_curr; /* current walker nvpair */ nv_alloc_t *nvp_nva; /* pluggable allocator */ uint32_t nvp_stat; /* internal state */ + + i_nvp_t **nvp_hashtable; /* table of entries used for lookup */ + uint32_t nvp_nbuckets; /* # of buckets in hash table */ + uint32_t nvp_nentries; /* # of entries in hash table */ } nvpriv_t; #ifdef __cplusplus |
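The nvpair_impl.h changes at the end give every nvlist an optional hash index: pairs remain on the ordered nvi_next/nvi_prev list, and nvi_hashtable_next additionally threads each pair through one bucket of nvp_hashtable, with the bucket count kept a power of two so the index can be a simple mask of the name hash. A generic sketch of that shape with stand-in types (entry_t and table_t are assumptions for illustration, not the real i_nvp_t/nvpriv_t):

#include <stdint.h>
#include <string.h>

typedef struct entry {
        struct entry    *e_next;        /* insertion-ordered list */
        struct entry    *e_prev;
        struct entry    *e_hash_next;   /* bucket chain */
        const char      *e_name;
        uint32_t        e_hash;
} entry_t;

typedef struct table {
        entry_t         **t_buckets;
        uint32_t        t_nbuckets;     /* power of two */
} table_t;

static void
table_insert(table_t *t, entry_t *e)
{
        uint32_t idx = e->e_hash & (t->t_nbuckets - 1);

        e->e_hash_next = t->t_buckets[idx];
        t->t_buckets[idx] = e;
}

static entry_t *
table_lookup(table_t *t, const char *name, uint32_t hash)
{
        entry_t *e = t->t_buckets[hash & (t->t_nbuckets - 1)];

        for (; e != NULL; e = e->e_hash_next) {
                if (e->e_hash == hash && strcmp(e->e_name, name) == 0)
                        return (e);
        }
        return (NULL);
}

This turns name lookups on large nvlists from a linear scan of the pair list into an expected O(1) bucket walk, while iteration order is still defined by the doubly-linked list.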