diff options
author | Matt Macy <mmacy@FreeBSD.org> | 2021-01-07 23:19:59 +0000 |
---|---|---|
committer | Matt Macy <mmacy@FreeBSD.org> | 2021-01-07 23:19:59 +0000 |
commit | 42385c279bad198701badabdab661df49f2acb9a (patch) | |
tree | 529931aeb217bb13f5cbfb4eb38fd75163d251ba /cmd/zpool | |
parent | 0be360124f8f108f73365e31448e7550f877f3ac (diff) | |
download | src-42385c279bad198701badabdab661df49f2acb9a.tar.gz src-42385c279bad198701badabdab661df49f2acb9a.zip |
Update OpenZFS to master-f11b09vendor/openzfs/20210107
Diffstat (limited to 'cmd/zpool')
-rw-r--r-- | cmd/zpool/zpool_iter.c | 16 | ||||
-rw-r--r-- | cmd/zpool/zpool_main.c | 155 | ||||
-rw-r--r-- | cmd/zpool/zpool_util.h | 4 | ||||
-rw-r--r-- | cmd/zpool/zpool_vdev.c | 393 |
4 files changed, 483 insertions, 85 deletions
diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c index 5f3153bca2c2..d70d266699cf 100644 --- a/cmd/zpool/zpool_iter.c +++ b/cmd/zpool/zpool_iter.c @@ -56,6 +56,7 @@ typedef struct zpool_node { struct zpool_list { boolean_t zl_findall; + boolean_t zl_literal; uu_avl_t *zl_avl; uu_avl_pool_t *zl_pool; zprop_list_t **zl_proplist; @@ -88,7 +89,9 @@ add_pool(zpool_handle_t *zhp, void *data) uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool); if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) { if (zlp->zl_proplist && - zpool_expand_proplist(zhp, zlp->zl_proplist) != 0) { + zpool_expand_proplist(zhp, zlp->zl_proplist, + zlp->zl_literal) + != 0) { zpool_close(zhp); free(node); return (-1); @@ -110,7 +113,8 @@ add_pool(zpool_handle_t *zhp, void *data) * line. */ zpool_list_t * -pool_list_get(int argc, char **argv, zprop_list_t **proplist, int *err) +pool_list_get(int argc, char **argv, zprop_list_t **proplist, + boolean_t literal, int *err) { zpool_list_t *zlp; @@ -128,6 +132,8 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, int *err) zlp->zl_proplist = proplist; + zlp->zl_literal = literal; + if (argc == 0) { (void) zpool_iter(g_zfs, add_pool, zlp); zlp->zl_findall = B_TRUE; @@ -242,12 +248,12 @@ pool_list_count(zpool_list_t *zlp) */ int for_each_pool(int argc, char **argv, boolean_t unavail, - zprop_list_t **proplist, zpool_iter_f func, void *data) + zprop_list_t **proplist, boolean_t literal, zpool_iter_f func, void *data) { zpool_list_t *list; int ret = 0; - if ((list = pool_list_get(argc, argv, proplist, &ret)) == NULL) + if ((list = pool_list_get(argc, argv, proplist, literal, &ret)) == NULL) return (1); if (pool_list_iter(list, unavail, func, data) != 0) @@ -711,7 +717,7 @@ all_pools_for_each_vdev_run(int argc, char **argv, char *cmd, vcdl->g_zfs = g_zfs; /* Gather our list of all vdevs in all pools */ - for_each_pool(argc, argv, B_TRUE, NULL, + for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, all_pools_for_each_vdev_gather_cb, vcdl); /* Run command on all vdevs in all pools */ diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index f612db48d4f9..e00fdb7ae1b0 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -669,9 +669,16 @@ print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, } for (c = 0; c < children; c++) { - uint64_t is_log = B_FALSE; + uint64_t is_log = B_FALSE, is_hole = B_FALSE; char *class = ""; + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + + if (is_hole == B_TRUE) { + continue; + } + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) @@ -692,6 +699,54 @@ print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, } } +/* + * Print the list of l2cache devices for dry runs. + */ +static void +print_cache_list(nvlist_t *nv, int indent) +{ + nvlist_t **child; + uint_t c, children; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0 && children > 0) { + (void) printf("\t%*s%s\n", indent, "", "cache"); + } else { + return; + } + for (c = 0; c < children; c++) { + char *vname; + + vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); + (void) printf("\t%*s%s\n", indent + 2, "", vname); + free(vname); + } +} + +/* + * Print the list of spares for dry runs. + */ +static void +print_spare_list(nvlist_t *nv, int indent) +{ + nvlist_t **child; + uint_t c, children; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0 && children > 0) { + (void) printf("\t%*s%s\n", indent, "", "spares"); + } else { + return; + } + for (c = 0; c < children; c++) { + char *vname; + + vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); + (void) printf("\t%*s%s\n", indent + 2, "", vname); + free(vname); + } +} + static boolean_t prop_list_contains_feature(nvlist_t *proplist) { @@ -921,16 +976,16 @@ zpool_do_add(int argc, char **argv) if (dryrun) { nvlist_t *poolnvroot; - nvlist_t **l2child; - uint_t l2children, c; + nvlist_t **l2child, **sparechild; + uint_t l2children, sparechildren, c; char *vname; - boolean_t hadcache = B_FALSE; + boolean_t hadcache = B_FALSE, hadspare = B_FALSE; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &poolnvroot) == 0); (void) printf(gettext("would update '%s' to the following " - "configuration:\n"), zpool_get_name(zhp)); + "configuration:\n\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ print_vdev_tree(zhp, poolname, poolnvroot, 0, "", @@ -991,6 +1046,29 @@ zpool_do_add(int argc, char **argv) free(vname); } } + /* And finaly the spares */ + if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_SPARES, + &sparechild, &sparechildren) == 0 && sparechildren > 0) { + hadspare = B_TRUE; + (void) printf(gettext("\tspares\n")); + for (c = 0; c < sparechildren; c++) { + vname = zpool_vdev_name(g_zfs, NULL, + sparechild[c], name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &sparechild, &sparechildren) == 0 && sparechildren > 0) { + if (!hadspare) + (void) printf(gettext("\tspares\n")); + for (c = 0; c < sparechildren; c++) { + vname = zpool_vdev_name(g_zfs, NULL, + sparechild[c], name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } ret = 0; } else { @@ -1548,6 +1626,8 @@ zpool_do_create(int argc, char **argv) VDEV_ALLOC_BIAS_SPECIAL, 0); print_vdev_tree(NULL, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, 0); + print_cache_list(nvroot, 0); + print_spare_list(nvroot, 0); ret = 0; } else { @@ -1762,7 +1842,7 @@ zpool_do_export(int argc, char **argv) } return (for_each_pool(argc, argv, B_TRUE, NULL, - zpool_export_one, &cb)); + B_FALSE, zpool_export_one, &cb)); } /* check arguments */ @@ -1771,7 +1851,8 @@ zpool_do_export(int argc, char **argv) usage(B_FALSE); } - ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_export_one, &cb); + ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_export_one, + &cb); return (ret); } @@ -2254,6 +2335,13 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, break; } color_end(); + } else if (children == 0 && !isspare && + getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") == NULL && + VDEV_STAT_VALID(vs_physical_ashift, vsc) && + vs->vs_configured_ashift < vs->vs_physical_ashift) { + (void) printf( + gettext(" block size: %dB configured, %dB native"), + 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift); } /* The root vdev has the scrub/resilver stats */ @@ -2287,7 +2375,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } } - /* Display vdev initialization and trim status for leaves */ + /* Display vdev initialization and trim status for leaves. */ if (children == 0) { print_status_initialize(vs, cb->cb_print_vdev_init); print_status_trim(vs, cb->cb_print_vdev_trim); @@ -3606,7 +3694,8 @@ zpool_do_sync(int argc, char **argv) argv += optind; /* if argc == 0 we will execute zpool_sync_one on all pools */ - ret = for_each_pool(argc, argv, B_FALSE, NULL, zpool_sync_one, &force); + ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, zpool_sync_one, + &force); return (ret); } @@ -4951,7 +5040,7 @@ are_vdevs_in_pool(int argc, char **argv, char *pool_name, /* Is this name a vdev in our pools? */ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, - is_vdev, cb); + B_FALSE, is_vdev, cb); if (!ret) { /* No match */ break; @@ -4979,7 +5068,8 @@ is_pool_cb(zpool_handle_t *zhp, void *data) static int is_pool(char *name) { - return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name)); + return (for_each_pool(0, NULL, B_TRUE, NULL, B_FALSE, is_pool_cb, + name)); } /* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ @@ -5431,7 +5521,7 @@ zpool_do_iostat(int argc, char **argv) * Construct the list of all interesting pools. */ ret = 0; - if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL) + if ((list = pool_list_get(argc, argv, NULL, parsable, &ret)) == NULL) return (1); if (pool_list_count(list) == 0 && argc != 0) { @@ -6105,7 +6195,7 @@ zpool_do_list(int argc, char **argv) for (;;) { if ((list = pool_list_get(argc, argv, &cb.cb_proplist, - &ret)) == NULL) + cb.cb_literal, &ret)) == NULL) return (1); if (pool_list_count(list) == 0) @@ -6505,6 +6595,10 @@ zpool_do_split(int argc, char **argv) "following layout:\n\n"), newpool); print_vdev_tree(NULL, newpool, config, 0, "", flags.name_flags); + print_vdev_tree(NULL, "dedup", config, 0, + VDEV_ALLOC_BIAS_DEDUP, 0); + print_vdev_tree(NULL, "special", config, 0, + VDEV_ALLOC_BIAS_SPECIAL, 0); } } @@ -6857,7 +6951,7 @@ zpool_do_reopen(int argc, char **argv) argv += optind; /* if argc == 0 we will execute zpool_reopen_one on all pools */ - ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_reopen_one, + ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_reopen_one, &scrub_restart); return (ret); @@ -6987,12 +7081,13 @@ zpool_do_scrub(int argc, char **argv) usage(B_FALSE); } - error = for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb); + error = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, + scrub_callback, &cb); if (wait && !error) { zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB; - error = for_each_pool(argc, argv, B_TRUE, NULL, wait_callback, - &act); + error = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, + wait_callback, &act); } return (error); @@ -7030,7 +7125,8 @@ zpool_do_resilver(int argc, char **argv) usage(B_FALSE); } - return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); + return (for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, + scrub_callback, &cb)); } /* @@ -7583,7 +7679,7 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) vdev_name = zpool_vdev_name(g_zfs, zhp, child[prs->prs_removing_vdev], B_TRUE); - (void) printf(gettext("remove: ")); + printf_color(ANSI_BOLD, gettext("remove: ")); start = prs->prs_start_time; end = prs->prs_end_time; @@ -8424,7 +8520,7 @@ zpool_do_status(int argc, char **argv) cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, NULL, NULL, 0, 0); - ret = for_each_pool(argc, argv, B_TRUE, NULL, + ret = for_each_pool(argc, argv, B_TRUE, NULL, cb.cb_literal, status_callback, &cb); if (cb.vcdl != NULL) @@ -8943,7 +9039,7 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext("\n")); } } else { - ret = for_each_pool(argc, argv, B_FALSE, NULL, + ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, upgrade_one, &cb); } @@ -9029,6 +9125,12 @@ print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb) dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_OUTPUT_NVL), 8); } + if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_SIZE)) { + (void) printf(" output nvlist omitted; " + "original size: %lldKB\n", + (longlong_t)fnvlist_lookup_int64(rec, + ZPOOL_HIST_OUTPUT_SIZE) / 1024); + } if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(rec, @@ -9126,7 +9228,7 @@ zpool_do_history(int argc, char **argv) argc -= optind; argv += optind; - ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one, + ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, get_history_one, &cbdata); if (argc == 0 && cbdata.first == B_TRUE) { @@ -9689,7 +9791,7 @@ zpool_do_get(int argc, char **argv) cb.cb_proplist = &fake_name; } - ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, + ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, cb.cb_literal, get_callback, &cb); if (cb.cb_proplist == &fake_name) @@ -9759,7 +9861,7 @@ zpool_do_set(int argc, char **argv) *(cb.cb_value) = '\0'; cb.cb_value++; - error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, + error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, B_FALSE, set_callback, &cb); return (error); @@ -9842,7 +9944,8 @@ vdev_any_spare_replacing(nvlist_t *nv) (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || - strcmp(vdev_type, VDEV_TYPE_SPARE) == 0) { + strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 || + strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) { return (B_TRUE); } @@ -10044,7 +10147,7 @@ int zpool_do_wait(int argc, char **argv) { boolean_t verbose = B_FALSE; - char c; + int c; char *value; int i; unsigned long count; diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h index 265aa58953a0..abaa22d78c20 100644 --- a/cmd/zpool/zpool_util.h +++ b/cmd/zpool/zpool_util.h @@ -64,7 +64,7 @@ nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, * Pool list functions */ int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **, - zpool_iter_f, void *); + boolean_t, zpool_iter_f, void *); /* Vdev list functions */ typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *); @@ -72,7 +72,7 @@ int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data); typedef struct zpool_list zpool_list_t; -zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *); +zpool_list_t *pool_list_get(int, char **, zprop_list_t **, boolean_t, int *); void pool_list_update(zpool_list_t *); int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *); void pool_list_free(zpool_list_t *); diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 9aa09b18c4ae..c86081a8153a 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -86,9 +86,6 @@ boolean_t error_seen; boolean_t is_force; - - - /*PRINTFLIKE1*/ void vdev_error(const char *fmt, ...) @@ -222,6 +219,9 @@ is_spare(nvlist_t *config, const char *path) uint_t i, nspares; boolean_t inuse; + if (zpool_is_draid_spare(path)) + return (B_TRUE); + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) return (B_FALSE); @@ -267,9 +267,10 @@ is_spare(nvlist_t *config, const char *path) * /dev/xxx Complete disk path * /xxx Full path to file * xxx Shorthand for <zfs_vdev_paths>/xxx + * draid* Virtual dRAID spare */ static nvlist_t * -make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) +make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) { char path[MAXPATHLEN]; struct stat64 statbuf; @@ -309,6 +310,17 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) /* After whole disk check restore original passed path */ strlcpy(path, arg, sizeof (path)); + } else if (zpool_is_draid_spare(arg)) { + if (!is_primary) { + (void) fprintf(stderr, + gettext("cannot open '%s': dRAID spares can only " + "be used to replace primary vdevs\n"), arg); + return (NULL); + } + + wholedisk = B_TRUE; + strlcpy(path, arg, sizeof (path)); + type = VDEV_TYPE_DRAID_SPARE; } else { err = is_shorthand_path(arg, path, sizeof (path), &statbuf, &wholedisk); @@ -337,17 +349,19 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) } } - /* - * Determine whether this is a device or a file. - */ - if (wholedisk || S_ISBLK(statbuf.st_mode)) { - type = VDEV_TYPE_DISK; - } else if (S_ISREG(statbuf.st_mode)) { - type = VDEV_TYPE_FILE; - } else { - (void) fprintf(stderr, gettext("cannot use '%s': must be a " - "block device or regular file\n"), path); - return (NULL); + if (type == NULL) { + /* + * Determine whether this is a device or a file. + */ + if (wholedisk || S_ISBLK(statbuf.st_mode)) { + type = VDEV_TYPE_DISK; + } else if (S_ISREG(statbuf.st_mode)) { + type = VDEV_TYPE_FILE; + } else { + fprintf(stderr, gettext("cannot use '%s': must " + "be a block device or regular file\n"), path); + return (NULL); + } } /* @@ -358,10 +372,7 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); - verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); - if (is_log) - verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS, - VDEV_ALLOC_BIAS_LOG) == 0); + if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); @@ -432,11 +443,16 @@ typedef struct replication_level { #define ZPOOL_FUZZ (16 * 1024 * 1024) +/* + * N.B. For the purposes of comparing replication levels dRAID can be + * considered functionally equivilant to raidz. + */ static boolean_t is_raidz_mirror(replication_level_t *a, replication_level_t *b, replication_level_t **raidz, replication_level_t **mirror) { - if (strcmp(a->zprl_type, "raidz") == 0 && + if ((strcmp(a->zprl_type, "raidz") == 0 || + strcmp(a->zprl_type, "draid") == 0) && strcmp(b->zprl_type, "mirror") == 0) { *raidz = a; *mirror = b; @@ -446,6 +462,22 @@ is_raidz_mirror(replication_level_t *a, replication_level_t *b, } /* + * Comparison for determining if dRAID and raidz where passed in either order. + */ +static boolean_t +is_raidz_draid(replication_level_t *a, replication_level_t *b) +{ + if ((strcmp(a->zprl_type, "raidz") == 0 || + strcmp(a->zprl_type, "draid") == 0) && + (strcmp(b->zprl_type, "raidz") == 0 || + strcmp(b->zprl_type, "draid") == 0)) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* * Given a list of toplevel vdevs, return the current replication level. If * the config is inconsistent, then NULL is returned. If 'fatal' is set, then * an error message will be displayed for each self-inconsistent vdev. @@ -511,7 +543,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_type = type; rep.zprl_children = 0; - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); @@ -677,6 +710,29 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) else return (NULL); } + } else if (is_raidz_draid(&lastrep, &rep)) { + /* + * Accepted raidz and draid when they can + * handle the same number of disk failures. + */ + if (lastrep.zprl_parity != rep.zprl_parity) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication " + "level: %s and %s vdevs " + "with different " + "redundancy, %llu vs. " + "%llu are present\n"), + lastrep.zprl_type, + rep.zprl_type, + lastrep.zprl_parity, + rep.zprl_parity); + else + return (NULL); + } } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { if (ret != NULL) @@ -1103,31 +1159,87 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, return (anyinuse); } -static const char * -is_grouping(const char *type, int *mindev, int *maxdev) +/* + * Returns the parity level extracted from a raidz or draid type. + * If the parity cannot be determined zero is returned. + */ +static int +get_parity(const char *type) { - if (strncmp(type, "raidz", 5) == 0) { - const char *p = type + 5; - char *end; - long nparity; + long parity = 0; + const char *p; + + if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { + p = type + strlen(VDEV_TYPE_RAIDZ); if (*p == '\0') { - nparity = 1; + /* when unspecified default to single parity */ + return (1); } else if (*p == '0') { - return (NULL); /* no zero prefixes allowed */ + /* no zero prefixes allowed */ + return (0); } else { + /* 0-3, no suffixes allowed */ + char *end; errno = 0; - nparity = strtol(p, &end, 10); - if (errno != 0 || nparity < 1 || nparity >= 255 || - *end != '\0') - return (NULL); + parity = strtol(p, &end, 10); + if (errno != 0 || *end != '\0' || + parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) { + return (0); + } + } + } else if (strncmp(type, VDEV_TYPE_DRAID, + strlen(VDEV_TYPE_DRAID)) == 0) { + p = type + strlen(VDEV_TYPE_DRAID); + + if (*p == '\0' || *p == ':') { + /* when unspecified default to single parity */ + return (1); + } else if (*p == '0') { + /* no zero prefixes allowed */ + return (0); + } else { + /* 0-3, allowed suffixes: '\0' or ':' */ + char *end; + errno = 0; + parity = strtol(p, &end, 10); + if (errno != 0 || + parity < 1 || parity > VDEV_DRAID_MAXPARITY || + (*end != '\0' && *end != ':')) { + return (0); + } } + } + + return ((int)parity); +} + +/* + * Assign the minimum and maximum number of devices allowed for + * the specified type. On error NULL is returned, otherwise the + * type prefix is returned (raidz, mirror, etc). + */ +static const char * +is_grouping(const char *type, int *mindev, int *maxdev) +{ + int nparity; + if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || + strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { + nparity = get_parity(type); + if (nparity == 0) + return (NULL); if (mindev != NULL) *mindev = nparity + 1; if (maxdev != NULL) *maxdev = 255; - return (VDEV_TYPE_RAIDZ); + + if (strncmp(type, VDEV_TYPE_RAIDZ, + strlen(VDEV_TYPE_RAIDZ)) == 0) { + return (VDEV_TYPE_RAIDZ); + } else { + return (VDEV_TYPE_DRAID); + } } if (maxdev != NULL) @@ -1168,6 +1280,163 @@ is_grouping(const char *type, int *mindev, int *maxdev) } /* + * Extract the configuration parameters encoded in the dRAID type and + * use them to generate a dRAID configuration. The expected format is: + * + * draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>] + * + * The intent is to be able to generate a good configuration when no + * additional information is provided. The only mandatory component + * of the 'type' is the 'draid' prefix. If a value is not provided + * then reasonable defaults are used. The optional components may + * appear in any order but the d/s/c suffix is required. + * + * Valid inputs: + * - data: number of data devices per group (1-255) + * - parity: number of parity blocks per group (1-3) + * - spares: number of distributed spare (0-100) + * - children: total number of devices (1-255) + * + * Examples: + * - zpool create tank draid <devices...> + * - zpool create tank draid2:8d:51c:2s <devices...> + */ +static int +draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) +{ + uint64_t nparity = 1; + uint64_t nspares = 0; + uint64_t ndata = UINT64_MAX; + uint64_t ngroups = 1; + long value; + + if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) + return (EINVAL); + + nparity = (uint64_t)get_parity(type); + if (nparity == 0) + return (EINVAL); + + char *p = (char *)type; + while ((p = strchr(p, ':')) != NULL) { + char *end; + + p = p + 1; + errno = 0; + + if (!isdigit(p[0])) { + (void) fprintf(stderr, gettext("invalid dRAID " + "syntax; expected [:<number><c|d|s>] not '%s'\n"), + type); + return (EINVAL); + } + + /* Expected non-zero value with c/d/s suffix */ + value = strtol(p, &end, 10); + char suffix = tolower(*end); + if (errno != 0 || + (suffix != 'c' && suffix != 'd' && suffix != 's')) { + (void) fprintf(stderr, gettext("invalid dRAID " + "syntax; expected [:<number><c|d|s>] not '%s'\n"), + type); + return (EINVAL); + } + + if (suffix == 'c') { + if ((uint64_t)value != children) { + fprintf(stderr, + gettext("invalid number of dRAID children; " + "%llu required but %llu provided\n"), + (u_longlong_t)value, + (u_longlong_t)children); + return (EINVAL); + } + } else if (suffix == 'd') { + ndata = (uint64_t)value; + } else if (suffix == 's') { + nspares = (uint64_t)value; + } else { + verify(0); /* Unreachable */ + } + } + + /* + * When a specific number of data disks is not provided limit a + * redundancy group to 8 data disks. This value was selected to + * provide a reasonable tradeoff between capacity and performance. + */ + if (ndata == UINT64_MAX) { + if (children > nspares + nparity) { + ndata = MIN(children - nspares - nparity, 8); + } else { + fprintf(stderr, gettext("request number of " + "distributed spares %llu and parity level %llu\n" + "leaves no disks available for data\n"), + (u_longlong_t)nspares, (u_longlong_t)nparity); + return (EINVAL); + } + } + + /* Verify the maximum allowed group size is never exceeded. */ + if (ndata == 0 || (ndata + nparity > children - nspares)) { + fprintf(stderr, gettext("requested number of dRAID data " + "disks per group %llu is too high,\nat most %llu disks " + "are available for data\n"), (u_longlong_t)ndata, + (u_longlong_t)(children - nspares - nparity)); + return (EINVAL); + } + + if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { + fprintf(stderr, + gettext("invalid dRAID parity level %llu; must be " + "between 1 and %d\n"), (u_longlong_t)nparity, + VDEV_DRAID_MAXPARITY); + return (EINVAL); + } + + /* + * Verify the requested number of spares can be satisfied. + * An arbitrary limit of 100 distributed spares is applied. + */ + if (nspares > 100 || nspares > (children - (ndata + nparity))) { + fprintf(stderr, + gettext("invalid number of dRAID spares %llu; additional " + "disks would be required\n"), (u_longlong_t)nspares); + return (EINVAL); + } + + /* Verify the requested number children is sufficient. */ + if (children < (ndata + nparity + nspares)) { + fprintf(stderr, gettext("%llu disks were provided, but at " + "least %llu disks are required for this config\n"), + (u_longlong_t)children, + (u_longlong_t)(ndata + nparity + nspares)); + } + + if (children > VDEV_DRAID_MAX_CHILDREN) { + fprintf(stderr, gettext("%llu disks were provided, but " + "dRAID only supports up to %u disks"), + (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); + } + + /* + * Calculate the minimum number of groups required to fill a slice. + * This is the LCM of the stripe width (ndata + nparity) and the + * number of data drives (children - nspares). + */ + while (ngroups * (ndata + nparity) % (children - nspares) != 0) + ngroups++; + + /* Store the basic dRAID configuration. */ + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); + + return (0); +} + +/* * Construct a syntactically valid vdev specification, * and ensure that all devices and files exist and can be opened. * Note: we don't bother freeing anything in the error paths @@ -1178,8 +1447,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) { nvlist_t *nvroot, *nv, **top, **spares, **l2cache; int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; - const char *type; - uint64_t is_log, is_special, is_dedup; + const char *type, *fulltype; + boolean_t is_log, is_special, is_dedup, is_spare; boolean_t seen_logs; top = NULL; @@ -1189,18 +1458,20 @@ construct_spec(nvlist_t *props, int argc, char **argv) nspares = 0; nlogs = 0; nl2cache = 0; - is_log = is_special = is_dedup = B_FALSE; + is_log = is_special = is_dedup = is_spare = B_FALSE; seen_logs = B_FALSE; nvroot = NULL; while (argc > 0) { + fulltype = argv[0]; nv = NULL; /* - * If it's a mirror or raidz, the subsequent arguments are - * its leaves -- until we encounter the next mirror or raidz. + * If it's a mirror, raidz, or draid the subsequent arguments + * are its leaves -- until we encounter the next mirror, + * raidz or draid. */ - if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { + if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; @@ -1212,6 +1483,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) "specified only once\n")); goto spec_out; } + is_spare = B_TRUE; is_log = is_special = is_dedup = B_FALSE; } @@ -1225,8 +1497,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) } seen_logs = B_TRUE; is_log = B_TRUE; - is_special = B_FALSE; - is_dedup = B_FALSE; + is_special = is_dedup = is_spare = B_FALSE; argc--; argv++; /* @@ -1238,8 +1509,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { is_special = B_TRUE; - is_log = B_FALSE; - is_dedup = B_FALSE; + is_log = is_dedup = is_spare = B_FALSE; argc--; argv++; continue; @@ -1247,8 +1517,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { is_dedup = B_TRUE; - is_log = B_FALSE; - is_special = B_FALSE; + is_log = is_special = is_spare = B_FALSE; argc--; argv++; continue; @@ -1262,7 +1531,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) "specified only once\n")); goto spec_out; } - is_log = is_special = is_dedup = B_FALSE; + is_log = is_special = B_FALSE; + is_dedup = is_spare = B_FALSE; } if (is_log || is_special || is_dedup) { @@ -1280,13 +1550,15 @@ construct_spec(nvlist_t *props, int argc, char **argv) for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; + children++; child = realloc(child, children * sizeof (nvlist_t *)); if (child == NULL) zpool_no_memory(); if ((nv = make_leaf_vdev(props, argv[c], - B_FALSE)) == NULL) { + !(is_log || is_special || is_dedup || + is_spare))) == NULL) { for (c = 0; c < children - 1; c++) nvlist_free(child[c]); free(child); @@ -1335,10 +1607,11 @@ construct_spec(nvlist_t *props, int argc, char **argv) type) == 0); verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); - if (is_log) + if (is_log) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_LOG) == 0); + } if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, @@ -1354,6 +1627,15 @@ construct_spec(nvlist_t *props, int argc, char **argv) ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); } + if (strcmp(type, VDEV_TYPE_DRAID) == 0) { + if (draid_config_by_type(nv, + fulltype, children) != 0) { + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + } verify(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, children) == 0); @@ -1367,12 +1649,19 @@ construct_spec(nvlist_t *props, int argc, char **argv) * We have a device. Pass off to make_leaf_vdev() to * construct the appropriate nvlist describing the vdev. */ - if ((nv = make_leaf_vdev(props, argv[0], - is_log)) == NULL) + if ((nv = make_leaf_vdev(props, argv[0], !(is_log || + is_special || is_dedup || is_spare))) == NULL) goto spec_out; - if (is_log) + verify(nvlist_add_uint64(nv, + ZPOOL_CONFIG_IS_LOG, is_log) == 0); + if (is_log) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_LOG) == 0); nlogs++; + } + if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, |