aboutsummaryrefslogtreecommitdiff
path: root/sys/contrib/openzfs/cmd
diff options
context:
space:
mode:
Diffstat (limited to 'sys/contrib/openzfs/cmd')
-rw-r--r--sys/contrib/openzfs/cmd/zdb/zdb.c205
-rw-r--r--sys/contrib/openzfs/cmd/zfs/zfs_main.c38
-rw-r--r--sys/contrib/openzfs/cmd/zinject/zinject.c81
-rw-r--r--sys/contrib/openzfs/cmd/zpool/zpool_iter.c118
-rw-r--r--sys/contrib/openzfs/cmd/zpool/zpool_main.c55
-rw-r--r--sys/contrib/openzfs/cmd/zpool/zpool_util.h3
-rw-r--r--sys/contrib/openzfs/cmd/zpool/zpool_vdev.c90
7 files changed, 449 insertions, 141 deletions
diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c
index d655fa715e15..1ffb8ebc70f2 100644
--- a/sys/contrib/openzfs/cmd/zdb/zdb.c
+++ b/sys/contrib/openzfs/cmd/zdb/zdb.c
@@ -106,11 +106,15 @@ extern boolean_t spa_mode_readable_spacemaps;
extern uint_t zfs_reconstruct_indirect_combinations_max;
extern uint_t zfs_btree_verify_intensity;
+enum {
+ ARG_ALLOCATED = 256,
+ ARG_BLOCK_BIN_MODE,
+ ARG_BLOCK_CLASSES,
+};
+
static const char cmdname[] = "zdb";
uint8_t dump_opt[512];
-#define ALLOCATED_OPT 256
-
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
static uint64_t *zopt_metaslab = NULL;
@@ -131,6 +135,20 @@ static objset_t *os;
static boolean_t kernel_init_done;
static boolean_t corruption_found = B_FALSE;
+static enum {
+ BIN_AUTO = 0,
+ BIN_PSIZE,
+ BIN_LSIZE,
+ BIN_ASIZE,
+} block_bin_mode = BIN_AUTO;
+
+static enum {
+ CLASS_NORMAL = 1 << 1,
+ CLASS_SPECIAL = 1 << 2,
+ CLASS_DEDUP = 1 << 3,
+ CLASS_OTHER = 1 << 4,
+} block_classes = 0;
+
static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
boolean_t);
static void mos_obj_refd(uint64_t);
@@ -749,6 +767,12 @@ usage(void)
(void) fprintf(stderr, " Options to control amount of output:\n");
(void) fprintf(stderr, " -b --block-stats "
"block statistics\n");
+ (void) fprintf(stderr, " --bin=(lsize|psize|asize) "
+ "bin blocks based on this size in all three columns\n");
+ (void) fprintf(stderr,
+ " --class=(normal|special|dedup|other)[,...]\n"
+ " only consider blocks from "
+ "these allocation classes\n");
(void) fprintf(stderr, " -B --backup "
"backup stream\n");
(void) fprintf(stderr, " -c --checksum "
@@ -1694,7 +1718,7 @@ dump_metaslab(metaslab_t *msp)
(u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
(u_longlong_t)space_map_object(sm), freebuf);
- if (dump_opt[ALLOCATED_OPT] ||
+ if (dump_opt[ARG_ALLOCATED] ||
(dump_opt['m'] > 2 && !dump_opt['L'])) {
mutex_enter(&msp->ms_lock);
VERIFY0(metaslab_load(msp));
@@ -1705,7 +1729,7 @@ dump_metaslab(metaslab_t *msp)
dump_metaslab_stats(msp);
}
- if (dump_opt[ALLOCATED_OPT]) {
+ if (dump_opt[ARG_ALLOCATED]) {
uint64_t off = msp->ms_start;
zfs_range_tree_walk(msp->ms_allocatable, dump_allocated,
&off);
@@ -1726,7 +1750,7 @@ dump_metaslab(metaslab_t *msp)
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
}
- if (dump_opt[ALLOCATED_OPT] ||
+ if (dump_opt[ARG_ALLOCATED] ||
(dump_opt['m'] > 2 && !dump_opt['L'])) {
metaslab_unload(msp);
mutex_exit(&msp->ms_lock);
@@ -3301,6 +3325,7 @@ zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)
uint64_t keyformat, salt, iters;
int i;
unsigned char c;
+ FILE *f;
VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t),
@@ -3333,6 +3358,25 @@ zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)
break;
+ case ZFS_KEYFORMAT_RAW:
+ if ((f = fopen(key_material, "r")) == NULL)
+ return (B_FALSE);
+
+ if (fread(key_out, 1, WRAPPING_KEY_LEN, f) !=
+ WRAPPING_KEY_LEN) {
+ (void) fclose(f);
+ return (B_FALSE);
+ }
+
+ /* Check the key length */
+ if (fgetc(f) != EOF) {
+ (void) fclose(f);
+ return (B_FALSE);
+ }
+
+ (void) fclose(f);
+ break;
+
default:
fatal("no support for key format %u\n",
(unsigned int) keyformat);
@@ -5794,6 +5838,34 @@ dump_size_histograms(zdb_cb_t *zcb)
(void) printf("\nBlock Size Histogram\n");
+ switch (block_bin_mode) {
+ case BIN_PSIZE:
+ printf("(note: all categories are binned by %s)\n", "psize");
+ break;
+ case BIN_LSIZE:
+ printf("(note: all categories are binned by %s)\n", "lsize");
+ break;
+ case BIN_ASIZE:
+ printf("(note: all categories are binned by %s)\n", "asize");
+ break;
+ default:
+ printf("(note: all categories are binned separately)\n");
+ break;
+ }
+ if (block_classes != 0) {
+ char buf[256] = "";
+ if (block_classes & CLASS_NORMAL)
+ strlcat(buf, "\"normal\", ", sizeof (buf));
+ if (block_classes & CLASS_SPECIAL)
+ strlcat(buf, "\"special\", ", sizeof (buf));
+ if (block_classes & CLASS_DEDUP)
+ strlcat(buf, "\"dedup\", ", sizeof (buf));
+ if (block_classes & CLASS_OTHER)
+ strlcat(buf, "\"other\", ", sizeof (buf));
+ buf[strlen(buf)-2] = '\0';
+ printf("(note: only blocks in these classes are counted: %s)\n",
+ buf);
+ }
/*
* Print the first line titles
*/
@@ -6142,29 +6214,85 @@ skipped:
[BPE_GET_PSIZE(bp)]++;
return;
}
+
+ if (block_classes != 0) {
+ spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
+
+ uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[0]);
+ uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[0]);
+ vdev_t *vd = vdev_lookup_top(zcb->zcb_spa, vdev);
+ ASSERT(vd != NULL);
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ ASSERT(ms != NULL);
+ metaslab_group_t *mg = ms->ms_group;
+ ASSERT(mg != NULL);
+ metaslab_class_t *mc = mg->mg_class;
+ ASSERT(mc != NULL);
+
+ spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);
+
+ int class;
+ if (mc == spa_normal_class(zcb->zcb_spa)) {
+ class = CLASS_NORMAL;
+ } else if (mc == spa_special_class(zcb->zcb_spa)) {
+ class = CLASS_SPECIAL;
+ } else if (mc == spa_dedup_class(zcb->zcb_spa)) {
+ class = CLASS_DEDUP;
+ } else {
+ class = CLASS_OTHER;
+ }
+
+ if (!(block_classes & class)) {
+ goto hist_skipped;
+ }
+ }
+
/*
* The binning histogram bins by powers of two up to
* SPA_MAXBLOCKSIZE rather than creating bins for
* every possible blocksize found in the pool.
*/
- int bin = highbit64(BP_GET_PSIZE(bp)) - 1;
+ int bin;
+
+ /*
+ * Binning strategy: each bin includes blocks up to and including
+ * the given size (excluding blocks that fit into the previous bin).
+ * This way, the "4K" bin includes blocks within the (2K; 4K] range.
+ */
+#define BIN(size) (highbit64((size) - 1))
+
+ switch (block_bin_mode) {
+ case BIN_PSIZE: bin = BIN(BP_GET_PSIZE(bp)); break;
+ case BIN_LSIZE: bin = BIN(BP_GET_LSIZE(bp)); break;
+ case BIN_ASIZE: bin = BIN(BP_GET_ASIZE(bp)); break;
+ case BIN_AUTO: break;
+ default: PANIC("bad block_bin_mode"); abort();
+ }
+
+ if (block_bin_mode == BIN_AUTO)
+ bin = BIN(BP_GET_PSIZE(bp));
zcb->zcb_psize_count[bin]++;
zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp);
zcb->zcb_psize_total += BP_GET_PSIZE(bp);
- bin = highbit64(BP_GET_LSIZE(bp)) - 1;
+ if (block_bin_mode == BIN_AUTO)
+ bin = BIN(BP_GET_LSIZE(bp));
zcb->zcb_lsize_count[bin]++;
zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp);
zcb->zcb_lsize_total += BP_GET_LSIZE(bp);
- bin = highbit64(BP_GET_ASIZE(bp)) - 1;
+ if (block_bin_mode == BIN_AUTO)
+ bin = BIN(BP_GET_ASIZE(bp));
zcb->zcb_asize_count[bin]++;
zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
zcb->zcb_asize_total += BP_GET_ASIZE(bp);
+#undef BIN
+
+hist_skipped:
if (!do_claim)
return;
@@ -9406,7 +9534,11 @@ main(int argc, char **argv)
{"livelist", no_argument, NULL, 'y'},
{"zstd-headers", no_argument, NULL, 'Z'},
{"allocated-map", no_argument, NULL,
- ALLOCATED_OPT},
+ ARG_ALLOCATED},
+ {"bin", required_argument, NULL,
+ ARG_BLOCK_BIN_MODE},
+ {"class", required_argument, NULL,
+ ARG_BLOCK_CLASSES},
{0, 0, 0, 0}
};
@@ -9437,7 +9569,7 @@ main(int argc, char **argv)
case 'u':
case 'y':
case 'Z':
- case ALLOCATED_OPT:
+ case ARG_ALLOCATED:
dump_opt[c]++;
dump_all = 0;
break;
@@ -9520,6 +9652,59 @@ main(int argc, char **argv)
case 'x':
vn_dumpdir = optarg;
break;
+ case ARG_BLOCK_BIN_MODE:
+ if (strcmp(optarg, "lsize") == 0) {
+ block_bin_mode = BIN_LSIZE;
+ } else if (strcmp(optarg, "psize") == 0) {
+ block_bin_mode = BIN_PSIZE;
+ } else if (strcmp(optarg, "asize") == 0) {
+ block_bin_mode = BIN_ASIZE;
+ } else {
+ (void) fprintf(stderr,
+ "--bin=\"%s\" must be one of \"lsize\", "
+ "\"psize\" or \"asize\"\n", optarg);
+ usage();
+ }
+ break;
+
+ case ARG_BLOCK_CLASSES: {
+ char *buf = strdup(optarg), *tok = buf, *next,
+ *save = NULL;
+
+ while ((next = strtok_r(tok, ",", &save)) != NULL) {
+ tok = NULL;
+
+ if (strcmp(next, "normal") == 0) {
+ block_classes |= CLASS_NORMAL;
+ } else if (strcmp(next, "special") == 0) {
+ block_classes |= CLASS_SPECIAL;
+ } else if (strcmp(next, "dedup") == 0) {
+ block_classes |= CLASS_DEDUP;
+ } else if (strcmp(next, "other") == 0) {
+ block_classes |= CLASS_OTHER;
+ } else {
+ (void) fprintf(stderr,
+ "--class=\"%s\" must be a "
+ "comma-separated list of either "
+ "\"normal\", \"special\", "
+ "\"asize\" or \"other\"; "
+ "got \"%s\"\n",
+ optarg, next);
+ usage();
+ }
+ }
+
+ if (block_classes == 0) {
+ (void) fprintf(stderr,
+ "--class= must be a comma-separated "
+ "list of either \"normal\", \"special\", "
+ "\"asize\" or \"other\"; got empty\n");
+ usage();
+ }
+
+ free(buf);
+ break;
+ }
default:
usage();
break;
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c
index 484986bde719..ccdd5ffef8e6 100644
--- a/sys/contrib/openzfs/cmd/zfs/zfs_main.c
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c
@@ -914,7 +914,11 @@ zfs_do_clone(int argc, char **argv)
log_history = B_FALSE;
}
- ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET);
+ /*
+ * Dataset cloned successfully, mount/share failures are
+ * non-fatal.
+ */
+ (void) zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET);
}
zfs_close(zhp);
@@ -930,19 +934,15 @@ usage:
}
/*
- * Return a default volblocksize for the pool which always uses more than
- * half of the data sectors. This primarily applies to dRAID which always
- * writes full stripe widths.
+ * Calculate the minimum allocation size based on the top-level vdevs.
*/
static uint64_t
-default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)
+calculate_volblocksize(nvlist_t *config)
{
- uint64_t volblocksize, asize = SPA_MINBLOCKSIZE;
+ uint64_t asize = SPA_MINBLOCKSIZE;
nvlist_t *tree, **vdevs;
uint_t nvdevs;
- nvlist_t *config = zpool_get_config(zhp, NULL);
-
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
&vdevs, &nvdevs) != 0) {
@@ -973,6 +973,24 @@ default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)
}
}
+ return (asize);
+}
+
+/*
+ * Return a default volblocksize for the pool which always uses more than
+ * half of the data sectors. This primarily applies to dRAID which always
+ * writes full stripe widths.
+ */
+static uint64_t
+default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)
+{
+ uint64_t volblocksize, asize = SPA_MINBLOCKSIZE;
+
+ nvlist_t *config = zpool_get_config(zhp, NULL);
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, &asize) != 0)
+ asize = calculate_volblocksize(config);
+
/*
* Calculate the target volblocksize such that more than half
* of the asize is used. The following table is for 4k sectors.
@@ -1319,7 +1337,9 @@ zfs_do_create(int argc, char **argv)
goto error;
}
- ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET);
+ /* Dataset created successfully, mount/share failures are non-fatal */
+ ret = 0;
+ (void) zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET);
error:
nvlist_free(props);
return (ret);
diff --git a/sys/contrib/openzfs/cmd/zinject/zinject.c b/sys/contrib/openzfs/cmd/zinject/zinject.c
index 113797c878b9..c2f646f2567d 100644
--- a/sys/contrib/openzfs/cmd/zinject/zinject.c
+++ b/sys/contrib/openzfs/cmd/zinject/zinject.c
@@ -107,6 +107,8 @@
* zinject
* zinject <-a | -u pool>
* zinject -c <id|all>
+ * zinject -E <delay> [-a] [-m] [-f freq] [-l level] [-r range]
+ * [-T iotype] [-t type object | -b bookmark pool]
* zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
* [-r range] <object>
* zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
@@ -132,14 +134,18 @@
* The '-f' flag controls the frequency of errors injected, expressed as a
* real number percentage between 0.0001 and 100. The default is 100.
*
- * The this form is responsible for actually injecting the handler into the
+ * The <object> form is responsible for actually injecting the handler into the
* framework. It takes the arguments described above, translates them to the
* internal tuple using libzpool, and then issues an ioctl() to register the
* handler.
*
- * The final form can target a specific bookmark, regardless of whether a
+ * The '-b' option can target a specific bookmark, regardless of whether a
* human-readable interface has been designed. It allows developers to specify
* a particular block by number.
+ *
+ * The '-E' option injects pipeline ready stage delays for the given object or
+ * bookmark. The delay is specified in milliseconds, and it supports I/O type
+ * and range filters.
*/
#include <errno.h>
@@ -346,6 +352,13 @@ usage(void)
"\t\tsuch that the operation takes a minimum of supplied seconds\n"
"\t\tto complete.\n"
"\n"
+ "\tzinject -E <delay> [-a] [-m] [-f freq] [-l level] [-r range]\n"
+ "\t\t[-T iotype] [-t type object | -b bookmark pool]\n"
+ "\n"
+ "\t\tInject pipeline ready stage delays for the given object path\n"
+ "\t\t(data or dnode) or raw bookmark. The delay is specified in\n"
+ "\t\tmilliseconds.\n"
+ "\n"
"\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
"\t\tCause the pool to stop writing blocks yet not\n"
"\t\treport errors for a duration. Simulates buggy hardware\n"
@@ -724,12 +737,15 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
if (quiet) {
(void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
} else {
+ boolean_t show_object = B_FALSE;
+ boolean_t show_iotype = B_FALSE;
(void) printf("Added handler %llu with the following "
"properties:\n", (u_longlong_t)zc.zc_guid);
(void) printf(" pool: %s\n", pool);
if (record->zi_guid) {
(void) printf(" vdev: %llx\n",
(u_longlong_t)record->zi_guid);
+ show_iotype = B_TRUE;
} else if (record->zi_func[0] != '\0') {
(void) printf(" panic function: %s\n",
record->zi_func);
@@ -742,7 +758,18 @@ register_handler(const char *pool, int flags, zinject_record_t *record,
} else if (record->zi_timer > 0) {
(void) printf(" timer: %lld ms\n",
(u_longlong_t)NSEC2MSEC(record->zi_timer));
+ if (record->zi_cmd == ZINJECT_DELAY_READY) {
+ show_object = B_TRUE;
+ show_iotype = B_TRUE;
+ }
} else {
+ show_object = B_TRUE;
+ }
+ if (show_iotype) {
+ (void) printf("iotype: %s\n",
+ iotype_to_str(record->zi_iotype));
+ }
+ if (show_object) {
(void) printf("objset: %llu\n",
(u_longlong_t)record->zi_objset);
(void) printf("object: %llu\n",
@@ -910,6 +937,7 @@ main(int argc, char **argv)
int ret;
int flags = 0;
uint32_t dvas = 0;
+ hrtime_t ready_delay = -1;
if ((g_zfs = libzfs_init()) == NULL) {
(void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
@@ -940,7 +968,7 @@ main(int argc, char **argv)
}
while ((c = getopt(argc, argv,
- ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) {
+ ":aA:b:C:d:D:E:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) {
switch (c) {
case 'a':
flags |= ZINJECT_FLUSH_ARC;
@@ -1113,6 +1141,18 @@ main(int argc, char **argv)
case 'u':
flags |= ZINJECT_UNLOAD_SPA;
break;
+ case 'E':
+ ready_delay = MSEC2NSEC(strtol(optarg, &end, 10));
+ if (ready_delay <= 0 || *end != '\0') {
+ (void) fprintf(stderr, "invalid delay '%s': "
+ "must be a positive duration\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ record.zi_cmd = ZINJECT_DELAY_READY;
+ record.zi_timer = ready_delay;
+ break;
case 'L':
if ((label = name_to_type(optarg)) == TYPE_INVAL &&
!LABEL_TYPE(type)) {
@@ -1150,7 +1190,7 @@ main(int argc, char **argv)
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
- record.zi_freq > 0 || dvas != 0) {
+ record.zi_freq > 0 || dvas != 0 || ready_delay >= 0) {
(void) fprintf(stderr, "cancel (-c) incompatible with "
"any other options\n");
usage();
@@ -1186,7 +1226,7 @@ main(int argc, char **argv)
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
- dvas != 0) {
+ dvas != 0 || ready_delay >= 0) {
(void) fprintf(stderr, "device (-d) incompatible with "
"data error injection\n");
usage();
@@ -1276,13 +1316,23 @@ main(int argc, char **argv)
return (1);
}
- record.zi_cmd = ZINJECT_DATA_FAULT;
+ if (record.zi_cmd == ZINJECT_UNINITIALIZED) {
+ record.zi_cmd = ZINJECT_DATA_FAULT;
+ if (!error)
+ error = EIO;
+ } else if (error != 0) {
+ (void) fprintf(stderr, "error type -e incompatible "
+ "with delay injection\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ } else {
+ record.zi_iotype = io_type;
+ }
+
if (translate_raw(raw, &record) != 0) {
libzfs_fini(g_zfs);
return (1);
}
- if (!error)
- error = EIO;
} else if (record.zi_cmd == ZINJECT_PANIC) {
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || device != NULL || record.zi_freq > 0 ||
@@ -1410,6 +1460,13 @@ main(int argc, char **argv)
record.zi_dvas = dvas;
}
+ if (record.zi_cmd != ZINJECT_UNINITIALIZED && error != 0) {
+ (void) fprintf(stderr, "error type -e incompatible "
+ "with delay injection\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+
if (error == EACCES) {
if (type != TYPE_DATA) {
(void) fprintf(stderr, "decryption errors "
@@ -1425,8 +1482,12 @@ main(int argc, char **argv)
* not found.
*/
error = ECKSUM;
- } else {
+ } else if (record.zi_cmd == ZINJECT_UNINITIALIZED) {
record.zi_cmd = ZINJECT_DATA_FAULT;
+ if (!error)
+ error = EIO;
+ } else {
+ record.zi_iotype = io_type;
}
if (translate_record(type, argv[0], range, level, &record, pool,
@@ -1434,8 +1495,6 @@ main(int argc, char **argv)
libzfs_fini(g_zfs);
return (1);
}
- if (!error)
- error = EIO;
}
/*
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c
index 2eec9a95e24c..fef602736705 100644
--- a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c
@@ -26,6 +26,7 @@
/*
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ * Copyright (c) 2025, Klara, Inc.
*/
#include <libintl.h>
@@ -52,7 +53,7 @@
typedef struct zpool_node {
zpool_handle_t *zn_handle;
uu_avl_node_t zn_avlnode;
- int zn_mark;
+ hrtime_t zn_last_refresh;
} zpool_node_t;
struct zpool_list {
@@ -62,6 +63,7 @@ struct zpool_list {
uu_avl_pool_t *zl_pool;
zprop_list_t **zl_proplist;
zfs_type_t zl_type;
+ hrtime_t zl_last_refresh;
};
static int
@@ -81,26 +83,30 @@ zpool_compare(const void *larg, const void *rarg, void *unused)
* of known pools.
*/
static int
-add_pool(zpool_handle_t *zhp, void *data)
+add_pool(zpool_handle_t *zhp, zpool_list_t *zlp)
{
- zpool_list_t *zlp = data;
- zpool_node_t *node = safe_malloc(sizeof (zpool_node_t));
+ zpool_node_t *node, *new = safe_malloc(sizeof (zpool_node_t));
uu_avl_index_t idx;
- node->zn_handle = zhp;
- uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool);
- if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) {
+ new->zn_handle = zhp;
+ uu_avl_node_init(new, &new->zn_avlnode, zlp->zl_pool);
+
+ node = uu_avl_find(zlp->zl_avl, new, NULL, &idx);
+ if (node == NULL) {
if (zlp->zl_proplist &&
zpool_expand_proplist(zhp, zlp->zl_proplist,
zlp->zl_type, zlp->zl_literal) != 0) {
zpool_close(zhp);
- free(node);
+ free(new);
return (-1);
}
- uu_avl_insert(zlp->zl_avl, node, idx);
+ new->zn_last_refresh = zlp->zl_last_refresh;
+ uu_avl_insert(zlp->zl_avl, new, idx);
} else {
+ zpool_refresh_stats_from_handle(node->zn_handle, zhp);
+ node->zn_last_refresh = zlp->zl_last_refresh;
zpool_close(zhp);
- free(node);
+ free(new);
return (-1);
}
@@ -108,6 +114,18 @@ add_pool(zpool_handle_t *zhp, void *data)
}
/*
+ * add_pool(), but always returns 0. This allows zpool_iter() to continue
+ * even if a pool exists in the tree, or we fail to get the properties for
+ * a new one.
+ */
+static int
+add_pool_cb(zpool_handle_t *zhp, void *data)
+{
+ (void) add_pool(zhp, data);
+ return (0);
+}
+
+/*
* Create a list of pools based on the given arguments. If we're given no
* arguments, then iterate over all pools in the system and add them to the AVL
* tree. Otherwise, add only those pool explicitly specified on the command
@@ -135,9 +153,10 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type,
zlp->zl_type = type;
zlp->zl_literal = literal;
+ zlp->zl_last_refresh = gethrtime();
if (argc == 0) {
- (void) zpool_iter(g_zfs, add_pool, zlp);
+ (void) zpool_iter(g_zfs, add_pool_cb, zlp);
zlp->zl_findall = B_TRUE;
} else {
int i;
@@ -159,15 +178,61 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type,
}
/*
- * Search for any new pools, adding them to the list. We only add pools when no
- * options were given on the command line. Otherwise, we keep the list fixed as
- * those that were explicitly specified.
+ * Refresh the state of all pools on the list. Additionally, if no options were
+ * given on the command line, add any new pools and remove any that are no
+ * longer available.
*/
-void
-pool_list_update(zpool_list_t *zlp)
+int
+pool_list_refresh(zpool_list_t *zlp)
{
- if (zlp->zl_findall)
- (void) zpool_iter(g_zfs, add_pool, zlp);
+ zlp->zl_last_refresh = gethrtime();
+
+ if (!zlp->zl_findall) {
+ /*
+ * This list is a fixed list of pools, so we must not add
+ * or remove any. Just walk over them and refresh their
+ * state.
+ */
+ int navail = 0;
+ for (zpool_node_t *node = uu_avl_first(zlp->zl_avl);
+ node != NULL; node = uu_avl_next(zlp->zl_avl, node)) {
+ boolean_t missing;
+ zpool_refresh_stats(node->zn_handle, &missing);
+ navail += !missing;
+ node->zn_last_refresh = zlp->zl_last_refresh;
+ }
+ return (navail);
+ }
+
+ /* Search for any new pools and add them to the list. */
+ (void) zpool_iter(g_zfs, add_pool_cb, zlp);
+
+ /* Walk the list of existing pools, and update or remove them. */
+ zpool_node_t *node, *next;
+ for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next) {
+ next = uu_avl_next(zlp->zl_avl, node);
+
+ /*
+ * Skip any that were refreshed and are online; they were added
+ * by zpool_iter() and are already up to date.
+ */
+ if (node->zn_last_refresh == zlp->zl_last_refresh &&
+ zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL)
+ continue;
+
+ /* Refresh and remove if necessary. */
+ boolean_t missing;
+ zpool_refresh_stats(node->zn_handle, &missing);
+ if (missing) {
+ uu_avl_remove(zlp->zl_avl, node);
+ zpool_close(node->zn_handle);
+ free(node);
+ } else {
+ node->zn_last_refresh = zlp->zl_last_refresh;
+ }
+ }
+
+ return (uu_avl_numnodes(zlp->zl_avl));
}
/*
@@ -191,23 +256,6 @@ pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func,
}
/*
- * Remove the given pool from the list. When running iostat, we want to remove
- * those pools that no longer exist.
- */
-void
-pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp)
-{
- zpool_node_t search, *node;
-
- search.zn_handle = zhp;
- if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) {
- uu_avl_remove(zlp->zl_avl, node);
- zpool_close(node->zn_handle);
- free(node);
- }
-}
-
-/*
* Free all the handles associated with this list.
*/
void
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
index 2c46ad0df895..1feec55c0e8b 100644
--- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
@@ -33,7 +33,7 @@
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
- * Copyright (c) 2021, 2023, Klara Inc.
+ * Copyright (c) 2021, 2023, 2025, Klara, Inc.
* Copyright (c) 2021, 2025 Hewlett Packard Enterprise Development LP.
*/
@@ -5761,24 +5761,6 @@ children:
return (ret);
}
-static int
-refresh_iostat(zpool_handle_t *zhp, void *data)
-{
- iostat_cbdata_t *cb = data;
- boolean_t missing;
-
- /*
- * If the pool has disappeared, remove it from the list and continue.
- */
- if (zpool_refresh_stats(zhp, &missing) != 0)
- return (-1);
-
- if (missing)
- pool_list_remove(cb->cb_list, zhp);
-
- return (0);
-}
-
/*
* Callback to print out the iostats for the given pool.
*/
@@ -6359,15 +6341,14 @@ get_namewidth_iostat(zpool_handle_t *zhp, void *data)
* This command can be tricky because we want to be able to deal with pool
* creation/destruction as well as vdev configuration changes. The bulk of this
* processing is handled by the pool_list_* routines in zpool_iter.c. We rely
- * on pool_list_update() to detect the addition of new pools. Configuration
- * changes are all handled within libzfs.
+ * on pool_list_refresh() to detect the addition and removal of pools.
+ * Configuration changes are all handled within libzfs.
*/
int
zpool_do_iostat(int argc, char **argv)
{
int c;
int ret;
- int npools;
float interval = 0;
unsigned long count = 0;
zpool_list_t *list;
@@ -6618,10 +6599,24 @@ zpool_do_iostat(int argc, char **argv)
return (1);
}
+ int last_npools = 0;
for (;;) {
- if ((npools = pool_list_count(list)) == 0)
+ /*
+ * Refresh all pools in list, adding or removing pools as
+ * necessary.
+ */
+ int npools = pool_list_refresh(list);
+ if (npools == 0) {
(void) fprintf(stderr, gettext("no pools available\n"));
- else {
+ } else {
+ /*
+ * If the list of pools has changed since last time
+ * around, reset the iteration count to force the
+ * header to be redisplayed.
+ */
+ if (last_npools != npools)
+ cb.cb_iteration = 0;
+
/*
* If this is the first iteration and -y was supplied
* we skip any printing.
@@ -6630,15 +6625,6 @@ zpool_do_iostat(int argc, char **argv)
cb.cb_iteration == 0);
/*
- * Refresh all statistics. This is done as an
- * explicit step before calculating the maximum name
- * width, so that any * configuration changes are
- * properly accounted for.
- */
- (void) pool_list_iter(list, B_FALSE, refresh_iostat,
- &cb);
-
- /*
* Iterate over all pools to determine the maximum width
* for the pool / device name column across all pools.
*/
@@ -6691,6 +6677,7 @@ zpool_do_iostat(int argc, char **argv)
if (skip) {
(void) fflush(stdout);
(void) fsleep(interval);
+ last_npools = npools;
continue;
}
@@ -6728,6 +6715,8 @@ zpool_do_iostat(int argc, char **argv)
(void) fflush(stdout);
(void) fsleep(interval);
+
+ last_npools = npools;
}
pool_list_free(list);
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_util.h b/sys/contrib/openzfs/cmd/zpool/zpool_util.h
index 5ab7cb9750f1..3af23c52bd45 100644
--- a/sys/contrib/openzfs/cmd/zpool/zpool_util.h
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_util.h
@@ -76,11 +76,10 @@ typedef struct zpool_list zpool_list_t;
zpool_list_t *pool_list_get(int, char **, zprop_list_t **, zfs_type_t,
boolean_t, int *);
-void pool_list_update(zpool_list_t *);
+int pool_list_refresh(zpool_list_t *);
int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *);
void pool_list_free(zpool_list_t *);
int pool_list_count(zpool_list_t *);
-void pool_list_remove(zpool_list_t *, zpool_handle_t *);
extern libzfs_handle_t *g_zfs;
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c
index 684b46a2d673..222b5524669e 100644
--- a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c
@@ -270,14 +270,13 @@ is_spare(nvlist_t *config, const char *path)
* draid* Virtual dRAID spare
*/
static nvlist_t *
-make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary)
+make_leaf_vdev(const char *arg, boolean_t is_primary, uint64_t ashift)
{
char path[MAXPATHLEN];
struct stat64 statbuf;
nvlist_t *vdev = NULL;
const char *type = NULL;
boolean_t wholedisk = B_FALSE;
- uint64_t ashift = 0;
int err;
/*
@@ -382,31 +381,6 @@ make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary)
(uint64_t)wholedisk) == 0);
/*
- * Override defaults if custom properties are provided.
- */
- if (props != NULL) {
- const char *value = NULL;
-
- if (nvlist_lookup_string(props,
- zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
- if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
- (void) fprintf(stderr,
- gettext("ashift must be a number.\n"));
- return (NULL);
- }
- if (ashift != 0 &&
- (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
- (void) fprintf(stderr,
- gettext("invalid 'ashift=%" PRIu64 "' "
- "property: only values between %" PRId32 " "
- "and %" PRId32 " are allowed.\n"),
- ashift, ASHIFT_MIN, ASHIFT_MAX);
- return (NULL);
- }
- }
- }
-
- /*
* If the device is known to incorrectly report its physical sector
* size explicitly provide the known correct value.
*/
@@ -609,22 +583,28 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)
ZPOOL_CONFIG_PATH, &path) == 0);
/*
+ * Skip active spares they should never cause
+ * the pool to be evaluated as inconsistent.
+ */
+ if (is_spare(NULL, path))
+ continue;
+
+ /*
* If we have a raidz/mirror that combines disks
- * with files, report it as an error.
+ * with files, only report it as an error when
+ * fatal is set to ensure all the replication
+ * checks aren't skipped in check_replication().
*/
- if (!dontreport && type != NULL &&
+ if (fatal && !dontreport && type != NULL &&
strcmp(type, childtype) != 0) {
if (ret != NULL)
free(ret);
ret = NULL;
- if (fatal)
- vdev_error(gettext(
- "mismatched replication "
- "level: %s contains both "
- "files and devices\n"),
- rep.zprl_type);
- else
- return (NULL);
+ vdev_error(gettext(
+ "mismatched replication "
+ "level: %s contains both "
+ "files and devices\n"),
+ rep.zprl_type);
dontreport = B_TRUE;
}
@@ -1507,6 +1487,29 @@ construct_spec(nvlist_t *props, int argc, char **argv)
const char *type, *fulltype;
boolean_t is_log, is_special, is_dedup, is_spare;
boolean_t seen_logs;
+ uint64_t ashift = 0;
+
+ if (props != NULL) {
+ const char *value = NULL;
+
+ if (nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
+ if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
+ (void) fprintf(stderr,
+ gettext("ashift must be a number.\n"));
+ return (NULL);
+ }
+ if (ashift != 0 &&
+ (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
+ (void) fprintf(stderr,
+ gettext("invalid 'ashift=%" PRIu64 "' "
+ "property: only values between %" PRId32 " "
+ "and %" PRId32 " are allowed.\n"),
+ ashift, ASHIFT_MIN, ASHIFT_MAX);
+ return (NULL);
+ }
+ }
+ }
top = NULL;
toplevels = 0;
@@ -1612,9 +1615,9 @@ construct_spec(nvlist_t *props, int argc, char **argv)
children * sizeof (nvlist_t *));
if (child == NULL)
zpool_no_memory();
- if ((nv = make_leaf_vdev(props, argv[c],
+ if ((nv = make_leaf_vdev(argv[c],
!(is_log || is_special || is_dedup ||
- is_spare))) == NULL) {
+ is_spare), ashift)) == NULL) {
for (c = 0; c < children - 1; c++)
nvlist_free(child[c]);
free(child);
@@ -1678,6 +1681,10 @@ construct_spec(nvlist_t *props, int argc, char **argv)
ZPOOL_CONFIG_ALLOCATION_BIAS,
VDEV_ALLOC_BIAS_DEDUP) == 0);
}
+ if (ashift > 0) {
+ fnvlist_add_uint64(nv,
+ ZPOOL_CONFIG_ASHIFT, ashift);
+ }
if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
verify(nvlist_add_uint64(nv,
ZPOOL_CONFIG_NPARITY,
@@ -1705,8 +1712,9 @@ construct_spec(nvlist_t *props, int argc, char **argv)
* We have a device. Pass off to make_leaf_vdev() to
* construct the appropriate nvlist describing the vdev.
*/
- if ((nv = make_leaf_vdev(props, argv[0], !(is_log ||
- is_special || is_dedup || is_spare))) == NULL)
+ if ((nv = make_leaf_vdev(argv[0], !(is_log ||
+ is_special || is_dedup || is_spare),
+ ashift)) == NULL)
goto spec_out;
verify(nvlist_add_uint64(nv,