diff options
Diffstat (limited to 'sys/contrib/openzfs/cmd')
32 files changed, 1519 insertions, 589 deletions
| diff --git a/sys/contrib/openzfs/cmd/Makefile.am b/sys/contrib/openzfs/cmd/Makefile.am index 96040976e53e..ca94f6b77e06 100644 --- a/sys/contrib/openzfs/cmd/Makefile.am +++ b/sys/contrib/openzfs/cmd/Makefile.am @@ -98,17 +98,16 @@ endif  if USING_PYTHON -bin_SCRIPTS      += arc_summary     arcstat        dbufstat        zilstat -CLEANFILES       += arc_summary     arcstat        dbufstat        zilstat -dist_noinst_DATA += %D%/arc_summary %D%/arcstat.in %D%/dbufstat.in %D%/zilstat.in +bin_SCRIPTS      += zarcsummary     zarcstat        dbufstat        zilstat +CLEANFILES       += zarcsummary     zarcstat        dbufstat        zilstat +dist_noinst_DATA += %D%/zarcsummary %D%/zarcstat.in %D%/dbufstat.in %D%/zilstat.in -$(call SUBST,arcstat,%D%/) +$(call SUBST,zarcstat,%D%/)  $(call SUBST,dbufstat,%D%/)  $(call SUBST,zilstat,%D%/) -arc_summary: %D%/arc_summary +zarcsummary: %D%/zarcsummary  	$(AM_V_at)cp $< $@  endif -  PHONY += cmd  cmd: $(bin_SCRIPTS) $(bin_PROGRAMS) $(sbin_SCRIPTS) $(sbin_PROGRAMS) $(dist_bin_SCRIPTS) $(zfsexec_PROGRAMS) $(mounthelper_PROGRAMS) diff --git a/sys/contrib/openzfs/cmd/arcstat.in b/sys/contrib/openzfs/cmd/zarcstat.in index 6f9abb39c3fb..8ffd20481166 100755 --- a/sys/contrib/openzfs/cmd/arcstat.in +++ b/sys/contrib/openzfs/cmd/zarcstat.in @@ -2,7 +2,7 @@  # SPDX-License-Identifier: CDDL-1.0  #  # Print out ZFS ARC Statistics exported via kstat(1) -# For a definition of fields, or usage, use arcstat -v +# For a definition of fields, or usage, use zarcstat -v  #  # This script was originally a fork of the original arcstat.pl (0.1)  # by Neelakanth Nadgir, originally published on his Sun blog on @@ -56,6 +56,7 @@ import time  import getopt  import re  import copy +import os  from signal import signal, SIGINT, SIGWINCH, SIG_DFL @@ -171,7 +172,7 @@ cols = {      "zactive":    [7, 1000, "zfetch prefetches active per second"],  } -# ARC structural breakdown from arc_summary +# ARC structural breakdown from zarcsummary  structfields = {      "cmp":      ["compressed", "Compressed"],      "ovh":      ["overhead", "Overhead"], @@ -187,7 +188,7 @@ structstats = {                             # size stats      "sz":       ["_size", "size"],  } -# ARC types breakdown from arc_summary +# ARC types breakdown from zarcsummary  typefields = {      "data":     ["data", "ARC data"],      "meta":     ["metadata", "ARC metadata"], @@ -198,7 +199,7 @@ typestats = {                               # size stats      "sz":       ["_size", "size"],  } -# ARC states breakdown from arc_summary +# ARC states breakdown from zarcsummary  statefields = {      "ano":      ["anon", "Anonymous"],      "mfu":      ["mfu", "MFU"], @@ -261,7 +262,7 @@ hdr_intr = 20          # Print header every 20 lines of output  opfile = None  sep = "  "              # Default separator is 2 spaces  l2exist = False -cmd = ("Usage: arcstat [-havxp] [-f fields] [-o file] [-s string] [interval " +cmd = ("Usage: zarcstat [-havxp] [-f fields] [-o file] [-s string] [interval "         "[count]]\n")  cur = {}  d = {} @@ -348,10 +349,10 @@ def usage():                       "character or string\n")      sys.stderr.write("\t -p : Disable auto-scaling of numerical fields\n")      sys.stderr.write("\nExamples:\n") -    sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") -    sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") -    sys.stderr.write("\tarcstat -v\n") -    sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n") +    sys.stderr.write("\tzarcstat -o /tmp/a.log 2 10\n") +    sys.stderr.write("\tzarcstat -s \",\" -o /tmp/a.log 2 10\n") +    sys.stderr.write("\tzarcstat -v\n") +    sys.stderr.write("\tzarcstat -f time,hit%,dh%,ph%,mh% 1\n")      sys.stderr.write("\n")      sys.exit(1) @@ -366,7 +367,7 @@ def snap_stats():      cur = kstat -    # fill in additional values from arc_summary +    # fill in additional values from zarcsummary      cur["caches_size"] = caches_size = cur["anon_data"]+cur["anon_metadata"]+\          cur["mfu_data"]+cur["mfu_metadata"]+cur["mru_data"]+cur["mru_metadata"]+\          cur["uncached_data"]+cur["uncached_metadata"] @@ -766,6 +767,7 @@ def calculate():  def main(): +      global sint      global count      global hdr_intr diff --git a/sys/contrib/openzfs/cmd/arc_summary b/sys/contrib/openzfs/cmd/zarcsummary index c1319573220c..24a129d9ca70 100755 --- a/sys/contrib/openzfs/cmd/arc_summary +++ b/sys/contrib/openzfs/cmd/zarcsummary @@ -34,7 +34,7 @@ Provides basic information on the ARC, its efficiency, the L2ARC (if present),  the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See  the in-source documentation and code at  https://github.com/openzfs/zfs/blob/master/module/zfs/arc.c for details. -The original introduction to arc_summary can be found at +The original introduction to zarcsummary can be found at  http://cuddletech.com/?p=454  """ @@ -161,7 +161,7 @@ elif sys.platform.startswith('linux'):          return get_params(TUNABLES_PATH)      def get_version_impl(request): -        # The original arc_summary called /sbin/modinfo/{spl,zfs} to get +        # The original zarcsummary called /sbin/modinfo/{spl,zfs} to get          # the version information. We switch to /sys/module/{spl,zfs}/version          # to make sure we get what is really loaded in the kernel          try: @@ -439,7 +439,7 @@ def print_header():      """      # datetime is now recommended over time but we keep the exact formatting -    # from the older version of arc_summary in case there are scripts +    # from the older version of zarcsummary in case there are scripts      # that expect it in this way      daydate = time.strftime(DATE_FORMAT)      spc_date = LINE_LENGTH-len(daydate) @@ -559,6 +559,7 @@ def section_arc(kstats_dict):      print()      compressed_size = arc_stats['compressed_size'] +    uncompressed_size = arc_stats['uncompressed_size']      overhead_size = arc_stats['overhead_size']      bonus_size = arc_stats['bonus_size']      dnode_size = arc_stats['dnode_size'] @@ -671,6 +672,8 @@ def section_arc(kstats_dict):      print()      print('ARC misc:') +    prt_i2('Uncompressed size:', f_perc(uncompressed_size, compressed_size), +           f_bytes(uncompressed_size))      prt_i1('Memory throttles:', arc_stats['memory_throttle_count'])      prt_i1('Memory direct reclaims:', arc_stats['memory_direct_count'])      prt_i1('Memory indirect reclaims:', arc_stats['memory_indirect_count']) diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index 45eb9c783659..2560ad045db3 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -106,8 +106,14 @@ extern boolean_t spa_mode_readable_spacemaps;  extern uint_t zfs_reconstruct_indirect_combinations_max;  extern uint_t zfs_btree_verify_intensity; +enum { +	ARG_ALLOCATED = 256, +	ARG_BLOCK_BIN_MODE, +	ARG_BLOCK_CLASSES, +}; +  static const char cmdname[] = "zdb"; -uint8_t dump_opt[256]; +uint8_t dump_opt[512];  typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); @@ -127,6 +133,21 @@ static zfs_range_tree_t *mos_refd_objs;  static spa_t *spa;  static objset_t *os;  static boolean_t kernel_init_done; +static boolean_t corruption_found = B_FALSE; + +static enum { +	BIN_AUTO = 0, +	BIN_PSIZE, +	BIN_LSIZE, +	BIN_ASIZE, +} block_bin_mode = BIN_AUTO; + +static enum { +	CLASS_NORMAL = 1 << 1, +	CLASS_SPECIAL = 1 << 2, +	CLASS_DEDUP = 1 << 3, +	CLASS_OTHER = 1 << 4, +} block_classes = 0;  static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,      boolean_t); @@ -176,7 +197,7 @@ static int  sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,      dmu_tx_t *tx)  { -	ASSERT3P(tx, ==, NULL); +	ASSERT0P(tx);  	struct sublivelist_verify *sv = arg;  	sublivelist_verify_block_refcnt_t current = {  			.svbr_blk = *bp, @@ -208,7 +229,7 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,  				sublivelist_verify_block_t svb = {  				    .svb_dva = bp->blk_dva[i],  				    .svb_allocated_txg = -				    BP_GET_LOGICAL_BIRTH(bp) +				    BP_GET_BIRTH(bp)  				};  				if (zfs_btree_find(&sv->sv_leftover, &svb, @@ -250,6 +271,7 @@ sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)  		    &e->svbr_blk, B_TRUE);  		(void) printf("\tERROR: %d unmatched FREE(s): %s\n",  		    e->svbr_refcnt, blkbuf); +		corruption_found = B_TRUE;  	}  	zfs_btree_destroy(&sv->sv_pair); @@ -381,7 +403,7 @@ verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,  	sublivelist_verify_block_t svb = {{{0}}};  	DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);  	DVA_SET_OFFSET(&svb.svb_dva, offset); -	DVA_SET_ASIZE(&svb.svb_dva, size); +	DVA_SET_ASIZE(&svb.svb_dva, 0);  	zfs_btree_index_t where;  	uint64_t end_offset = offset + size; @@ -405,6 +427,7 @@ verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,  			    (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),  			    (u_longlong_t)found->svb_allocated_txg,  			    (u_longlong_t)txg); +			corruption_found = B_TRUE;  		}  	}  } @@ -426,6 +449,7 @@ metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)  			    (u_longlong_t)txg, (u_longlong_t)offset,  			    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,  			    (u_longlong_t)mv->mv_msid); +			corruption_found = B_TRUE;  		} else {  			zfs_range_tree_add(mv->mv_allocated,  			    offset, size); @@ -439,6 +463,7 @@ metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)  			    (u_longlong_t)txg, (u_longlong_t)offset,  			    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,  			    (u_longlong_t)mv->mv_msid); +			corruption_found = B_TRUE;  		} else {  			zfs_range_tree_remove(mv->mv_allocated,  			    offset, size); @@ -526,6 +551,7 @@ mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)  			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),  			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),  			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); +			corruption_found = B_TRUE;  			continue;  		} @@ -542,6 +568,7 @@ mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)  			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),  			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),  			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); +			corruption_found = B_TRUE;  			continue;  		} @@ -619,8 +646,9 @@ livelist_metaslab_validate(spa_t *spa)  			    metaslab_calculate_range_tree_type(vd, m,  			    &start, &shift);  			metaslab_verify_t mv; -			mv.mv_allocated = zfs_range_tree_create(NULL, -			    type, NULL, start, shift); +			mv.mv_allocated = zfs_range_tree_create_flags( +			    NULL, type, NULL, start, shift, +			    0, "livelist_metaslab_validate:mv_allocated");  			mv.mv_vdid = vd->vdev_id;  			mv.mv_msid = m->ms_id;  			mv.mv_start = m->ms_start; @@ -654,6 +682,7 @@ livelist_metaslab_validate(spa_t *spa)  	}  	(void) printf("ERROR: Found livelist blocks marked as allocated "  	    "for indirect vdevs:\n"); +	corruption_found = B_TRUE;  	zfs_btree_index_t *where = NULL;  	sublivelist_verify_block_t *svb; @@ -738,6 +767,12 @@ usage(void)  	(void) fprintf(stderr, "    Options to control amount of output:\n");  	(void) fprintf(stderr, "        -b --block-stats             "  	    "block statistics\n"); +	(void) fprintf(stderr, "           --bin=(lsize|psize|asize) " +	    "bin blocks based on this size in all three columns\n"); +	(void) fprintf(stderr, +	    "           --class=(normal|special|dedup|other)[,...]\n" +	    "                                     only consider blocks from " +	    "these allocation classes\n");  	(void) fprintf(stderr, "        -B --backup                  "  	    "backup stream\n");  	(void) fprintf(stderr, "        -c --checksum                " @@ -797,8 +832,8 @@ usage(void)  	    "[default is 200]\n");  	(void) fprintf(stderr, "        -K --key=KEY                 "  	    "decryption key for encrypted dataset\n"); -	(void) fprintf(stderr, "        -o --option=\"OPTION=INTEGER\" " -	    "set global variable to an unsigned 32-bit integer\n"); +	(void) fprintf(stderr, "        -o --option=\"NAME=VALUE\" " +	    "set the named tunable to the given value\n");  	(void) fprintf(stderr, "        -p --path==PATH              "  	    "use one or more with -e to specify path to vdev dir\n");  	(void) fprintf(stderr, "        -P --parseable               " @@ -826,7 +861,7 @@ usage(void)  	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "  	    "to make only that option verbose\n");  	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); -	zdb_exit(1); +	zdb_exit(2);  }  static void @@ -891,9 +926,9 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)  	size_t nvsize = *(uint64_t *)data;  	char *packed = umem_alloc(nvsize, UMEM_NOFAIL); -	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); +	VERIFY0(dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); -	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); +	VERIFY0(nvlist_unpack(packed, nvsize, &nv, 0));  	umem_free(packed, nvsize); @@ -1454,8 +1489,8 @@ get_obsolete_refcount(vdev_t *vd)  			refcount++;  		}  	} else { -		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); -		ASSERT3U(obsolete_sm_object, ==, 0); +		ASSERT0P(vd->vdev_obsolete_sm); +		ASSERT0(obsolete_sm_object);  	}  	for (unsigned c = 0; c < vd->vdev_children; c++) {  		refcount += get_obsolete_refcount(vd->vdev_child[c]); @@ -1577,9 +1612,8 @@ dump_spacemap(objset_t *os, space_map_t *sm)  			continue;  		} -		uint8_t words;  		char entry_type; -		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID; +		uint64_t entry_off, entry_run, entry_vdev;  		if (sm_entry_is_single_word(word)) {  			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? @@ -1587,35 +1621,43 @@ dump_spacemap(objset_t *os, space_map_t *sm)  			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +  			    sm->sm_start;  			entry_run = SM_RUN_DECODE(word) << mapshift; -			words = 1; + +			(void) printf("\t    [%6llu] %c " +			    "range: %012llx-%012llx size: %08llx\n", +			    (u_longlong_t)entry_id, entry_type, +			    (u_longlong_t)entry_off, +			    (u_longlong_t)(entry_off + entry_run - 1), +			    (u_longlong_t)entry_run);  		} else {  			/* it is a two-word entry so we read another word */  			ASSERT(sm_entry_is_double_word(word));  			uint64_t extra_word;  			offset += sizeof (extra_word); +			ASSERT3U(offset, <, space_map_length(sm));  			VERIFY0(dmu_read(os, space_map_object(sm), offset,  			    sizeof (extra_word), &extra_word,  			    DMU_READ_PREFETCH)); -			ASSERT3U(offset, <=, space_map_length(sm)); -  			entry_run = SM2_RUN_DECODE(word) << mapshift;  			entry_vdev = SM2_VDEV_DECODE(word);  			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?  			    'A' : 'F';  			entry_off = (SM2_OFFSET_DECODE(extra_word) <<  			    mapshift) + sm->sm_start; -			words = 2; -		} -		(void) printf("\t    [%6llu]    %c  range:" -		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n", -		    (u_longlong_t)entry_id, -		    entry_type, (u_longlong_t)entry_off, -		    (u_longlong_t)(entry_off + entry_run), -		    (u_longlong_t)entry_run, -		    (u_longlong_t)entry_vdev, words); +			if (zopt_metaslab_args == 0 || +			    zopt_metaslab[0] == entry_vdev) { +				(void) printf("\t    [%6llu] %c " +				    "range: %012llx-%012llx size: %08llx " +				    "vdev: %llu\n", +				    (u_longlong_t)entry_id, entry_type, +				    (u_longlong_t)entry_off, +				    (u_longlong_t)(entry_off + entry_run - 1), +				    (u_longlong_t)entry_run, +				    (u_longlong_t)entry_vdev); +			} +		}  		if (entry_type == 'A')  			alloc += entry_run; @@ -1651,6 +1693,16 @@ dump_metaslab_stats(metaslab_t *msp)  }  static void +dump_allocated(void *arg, uint64_t start, uint64_t size) +{ +	uint64_t *off = arg; +	if (*off != start) +		(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", *off, +		    start - *off); +	*off = start + size; +} + +static void  dump_metaslab(metaslab_t *msp)  {  	vdev_t *vd = msp->ms_group->mg_vd; @@ -1666,13 +1718,24 @@ dump_metaslab(metaslab_t *msp)  	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,  	    (u_longlong_t)space_map_object(sm), freebuf); -	if (dump_opt['m'] > 2 && !dump_opt['L']) { +	if (dump_opt[ARG_ALLOCATED] || +	    (dump_opt['m'] > 2 && !dump_opt['L'])) {  		mutex_enter(&msp->ms_lock);  		VERIFY0(metaslab_load(msp)); +	} + +	if (dump_opt['m'] > 2 && !dump_opt['L']) {  		zfs_range_tree_stat_verify(msp->ms_allocatable);  		dump_metaslab_stats(msp); -		metaslab_unload(msp); -		mutex_exit(&msp->ms_lock); +	} + +	if (dump_opt[ARG_ALLOCATED]) { +		uint64_t off = msp->ms_start; +		zfs_range_tree_walk(msp->ms_allocatable, dump_allocated, +		    &off); +		if (off != msp->ms_start + msp->ms_size) +			(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", off, +			    msp->ms_size - off);  	}  	if (dump_opt['m'] > 1 && sm != NULL && @@ -1687,6 +1750,12 @@ dump_metaslab(metaslab_t *msp)  		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);  	} +	if (dump_opt[ARG_ALLOCATED] || +	    (dump_opt['m'] > 2 && !dump_opt['L'])) { +		metaslab_unload(msp); +		mutex_exit(&msp->ms_lock); +	} +  	if (vd->vdev_ops == &vdev_draid_ops)  		ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);  	else @@ -1723,8 +1792,9 @@ print_vdev_metaslab_header(vdev_t *vd)  		}  	} -	(void) printf("\tvdev %10llu   %s", -	    (u_longlong_t)vd->vdev_id, bias_str); +	(void) printf("\tvdev %10llu\t%s  metaslab shift %4llu", +	    (u_longlong_t)vd->vdev_id, bias_str, +	    (u_longlong_t)vd->vdev_ms_shift);  	if (ms_flush_data_obj != 0) {  		(void) printf("   ms_unflushed_phys object %llu", @@ -1791,7 +1861,7 @@ print_vdev_indirect(vdev_t *vd)  	vdev_indirect_births_t *vib = vd->vdev_indirect_births;  	if (vim == NULL) { -		ASSERT3P(vib, ==, NULL); +		ASSERT0P(vib);  		return;  	} @@ -1864,7 +1934,7 @@ dump_metaslabs(spa_t *spa)  	(void) printf("\nMetaslabs:\n"); -	if (!dump_opt['d'] && zopt_metaslab_args > 0) { +	if (zopt_metaslab_args > 0) {  		c = zopt_metaslab[0];  		if (c >= children) @@ -1991,7 +2061,7 @@ dump_ddt_log(ddt_t *ddt)  				c += strlcpy(&flagstr[c], " UNKNOWN",  				    sizeof (flagstr) - c);  			flagstr[1] = '['; -			flagstr[c++] = ']'; +			flagstr[c] = ']';  		}  		uint64_t count = avl_numnodes(&ddl->ddl_tree); @@ -2042,10 +2112,10 @@ dump_ddt_object(ddt_t *ddt, ddt_type_t type, ddt_class_t class)  	if (error == ENOENT)  		return; -	ASSERT(error == 0); +	ASSERT0(error);  	error = ddt_object_count(ddt, type, class, &count); -	ASSERT(error == 0); +	ASSERT0(error);  	if (count == 0)  		return; @@ -2568,7 +2638,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,  		    (u_longlong_t)BP_GET_PSIZE(bp),  		    (u_longlong_t)BP_GET_FILL(bp),  		    (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), -		    (u_longlong_t)BP_GET_BIRTH(bp)); +		    (u_longlong_t)BP_GET_PHYSICAL_BIRTH(bp));  		if (bp_freed)  			(void) snprintf(blkbuf + strlen(blkbuf),  			    buflen - strlen(blkbuf), " %s", "FREE"); @@ -2582,19 +2652,17 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,  	}  } -static void +static u_longlong_t  print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,      const dnode_phys_t *dnp)  {  	char blkbuf[BP_SPRINTF_LEN]; +	u_longlong_t offset;  	int l; -	if (!BP_IS_EMBEDDED(bp)) { -		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); -		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); -	} +	offset = (u_longlong_t)blkid2offset(dnp, bp, zb); -	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); +	(void) printf("%16llx ", offset);  	ASSERT(zb->zb_level >= 0); @@ -2609,19 +2677,38 @@ print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,  	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);  	if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD)  		snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp); -	(void) printf("%s\n", blkbuf); +	(void) printf("%s", blkbuf); + +	if (!BP_IS_EMBEDDED(bp)) { +		if (BP_GET_TYPE(bp) != dnp->dn_type) { +			(void) printf(" (ERROR: Block pointer type " +			    "(%llu) does not match dnode type (%hhu))", +			    BP_GET_TYPE(bp), dnp->dn_type); +			corruption_found = B_TRUE; +		} +		if (BP_GET_LEVEL(bp) != zb->zb_level) { +			(void) printf(" (ERROR: Block pointer level " +			    "(%llu) does not match bookmark level (%lld))", +			    BP_GET_LEVEL(bp), (longlong_t)zb->zb_level); +			corruption_found = B_TRUE; +		} +	} +	(void) printf("\n"); + +	return (offset);  }  static int  visit_indirect(spa_t *spa, const dnode_phys_t *dnp,      blkptr_t *bp, const zbookmark_phys_t *zb)  { +	u_longlong_t offset;  	int err = 0; -	if (BP_GET_LOGICAL_BIRTH(bp) == 0) +	if (BP_GET_BIRTH(bp) == 0)  		return (0); -	print_indirect(spa, bp, zb, dnp); +	offset = print_indirect(spa, bp, zb, dnp);  	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {  		arc_flags_t flags = ARC_FLAG_WAIT; @@ -2651,8 +2738,15 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,  				break;  			fill += BP_GET_FILL(cbp);  		} -		if (!err) -			ASSERT3U(fill, ==, BP_GET_FILL(bp)); +		if (!err) { +			if (fill != BP_GET_FILL(bp)) { +				(void) printf("%16llx: Block pointer " +				    "fill (%llu) does not match calculated " +				    "value (%llu)\n", offset, BP_GET_FILL(bp), +				    (u_longlong_t)fill); +				corruption_found = B_TRUE; +			} +		}  		arc_buf_destroy(buf, &buf);  	} @@ -2806,7 +2900,7 @@ dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)  	(void) arg, (void) tx;  	char blkbuf[BP_SPRINTF_LEN]; -	if (BP_GET_LOGICAL_BIRTH(bp) != 0) { +	if (BP_GET_BIRTH(bp) != 0) {  		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);  		(void) printf("\t%s\n", blkbuf);  	} @@ -2847,7 +2941,7 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)  	(void) arg, (void) tx;  	char blkbuf[BP_SPRINTF_LEN]; -	ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0); +	ASSERT(BP_GET_BIRTH(bp) != 0);  	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);  	(void) printf("\t%s\n", blkbuf);  	return (0); @@ -2908,6 +3002,7 @@ dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)  				(void) printf("ERROR %u while trying to open "  				    "subobj id %llu\n",  				    error, (u_longlong_t)subobj); +				corruption_found = B_TRUE;  				continue;  			}  			dump_full_bpobj(&subbpo, "subobj", indent + 1); @@ -3087,6 +3182,7 @@ bpobj_count_refd(bpobj_t *bpo)  				(void) printf("ERROR %u while trying to open "  				    "subobj id %llu\n",  				    error, (u_longlong_t)subobj); +				corruption_found = B_TRUE;  				continue;  			}  			bpobj_count_refd(&subbpo); @@ -3108,7 +3204,7 @@ dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)  static int  dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)  { -	ASSERT(arg == NULL); +	ASSERT0P(arg);  	if (dump_opt['d'] >= 5) {  		char buf[128];  		(void) snprintf(buf, sizeof (buf), @@ -3229,6 +3325,7 @@ zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)  	uint64_t keyformat, salt, iters;  	int i;  	unsigned char c; +	FILE *f;  	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,  	    zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t), @@ -3261,6 +3358,25 @@ zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)  		break; +	case ZFS_KEYFORMAT_RAW: +		if ((f = fopen(key_material, "r")) == NULL) +			return (B_FALSE); + +		if (fread(key_out, 1, WRAPPING_KEY_LEN, f) != +		    WRAPPING_KEY_LEN) { +			(void) fclose(f); +			return (B_FALSE); +		} + +		/* Check the key length */ +		if (fgetc(f) != EOF) { +			(void) fclose(f); +			return (B_FALSE); +		} + +		(void) fclose(f); +		break; +  	default:  		fatal("no support for key format %u\n",  		    (unsigned int) keyformat); @@ -3346,7 +3462,7 @@ open_objset(const char *path, const void *tag, objset_t **osp)  	uint64_t sa_attrs = 0;  	uint64_t version = 0; -	VERIFY3P(sa_os, ==, NULL); +	VERIFY0P(sa_os);  	/*  	 * We can't own an objset if it's redacted.  Therefore, we do this @@ -3519,8 +3635,8 @@ dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)  		uint64_t fuid_obj;  		/* first find the fuid object.  It lives in the master node */ -		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, -		    8, 1, &fuid_obj) == 0); +		VERIFY0(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, +		    8, 1, &fuid_obj));  		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);  		(void) zfs_fuid_table_load(os, fuid_obj,  		    &idx_tree, &domain_tree); @@ -5722,6 +5838,34 @@ dump_size_histograms(zdb_cb_t *zcb)  	(void) printf("\nBlock Size Histogram\n"); +	switch (block_bin_mode) { +	case BIN_PSIZE: +		printf("(note: all categories are binned by %s)\n", "psize"); +		break; +	case BIN_LSIZE: +		printf("(note: all categories are binned by %s)\n", "lsize"); +		break; +	case BIN_ASIZE: +		printf("(note: all categories are binned by %s)\n", "asize"); +		break; +	default: +		printf("(note: all categories are binned separately)\n"); +		break; +	} +	if (block_classes != 0) { +		char buf[256] = ""; +		if (block_classes & CLASS_NORMAL) +			strlcat(buf, "\"normal\", ", sizeof (buf)); +		if (block_classes & CLASS_SPECIAL) +			strlcat(buf, "\"special\", ", sizeof (buf)); +		if (block_classes & CLASS_DEDUP) +			strlcat(buf, "\"dedup\", ", sizeof (buf)); +		if (block_classes & CLASS_OTHER) +			strlcat(buf, "\"other\", ", sizeof (buf)); +		buf[strlen(buf)-2] = '\0'; +		printf("(note: only blocks in these classes are counted: %s)\n", +		    buf); +	}  	/*  	 * Print the first line titles  	 */ @@ -5921,11 +6065,11 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,  		 * entry back to the block pointer before we claim it.  		 */  		if (v == DDT_PHYS_FLAT) { -			ASSERT3U(BP_GET_BIRTH(bp), ==, +			ASSERT3U(BP_GET_PHYSICAL_BIRTH(bp), ==,  			    ddt_phys_birth(dde->dde_phys, v));  			tempbp = *bp;  			ddt_bp_fill(dde->dde_phys, v, &tempbp, -			    BP_GET_BIRTH(bp)); +			    BP_GET_PHYSICAL_BIRTH(bp));  			bp = &tempbp;  		} @@ -6070,29 +6214,85 @@ skipped:  		    [BPE_GET_PSIZE(bp)]++;  		return;  	} + +	if (block_classes != 0) { +		spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); + +		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[0]); +		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[0]); +		vdev_t *vd = vdev_lookup_top(zcb->zcb_spa, vdev); +		ASSERT(vd != NULL); +		metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; +		ASSERT(ms != NULL); +		metaslab_group_t *mg = ms->ms_group; +		ASSERT(mg != NULL); +		metaslab_class_t *mc = mg->mg_class; +		ASSERT(mc != NULL); + +		spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); + +		int class; +		if (mc == spa_normal_class(zcb->zcb_spa)) { +			class = CLASS_NORMAL; +		} else if (mc == spa_special_class(zcb->zcb_spa)) { +			class = CLASS_SPECIAL; +		} else if (mc == spa_dedup_class(zcb->zcb_spa)) { +			class = CLASS_DEDUP; +		} else { +			class = CLASS_OTHER; +		} + +		if (!(block_classes & class)) { +			goto hist_skipped; +		} +	} +  	/*  	 * The binning histogram bins by powers of two up to  	 * SPA_MAXBLOCKSIZE rather than creating bins for  	 * every possible blocksize found in the pool.  	 */ -	int bin = highbit64(BP_GET_PSIZE(bp)) - 1; +	int bin; + +	/* +	 * Binning strategy: each bin includes blocks up to and including +	 * the given size (excluding blocks that fit into the previous bin). +	 * This way, the "4K" bin includes blocks within the (2K; 4K] range. +	 */ +#define	BIN(size) (highbit64((size) - 1)) + +	switch (block_bin_mode) { +	case BIN_PSIZE: bin = BIN(BP_GET_PSIZE(bp)); break; +	case BIN_LSIZE: bin = BIN(BP_GET_LSIZE(bp)); break; +	case BIN_ASIZE: bin = BIN(BP_GET_ASIZE(bp)); break; +	case BIN_AUTO: break; +	default: PANIC("bad block_bin_mode"); abort(); +	} + +	if (block_bin_mode == BIN_AUTO) +		bin = BIN(BP_GET_PSIZE(bp));  	zcb->zcb_psize_count[bin]++;  	zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp);  	zcb->zcb_psize_total += BP_GET_PSIZE(bp); -	bin = highbit64(BP_GET_LSIZE(bp)) - 1; +	if (block_bin_mode == BIN_AUTO) +		bin = BIN(BP_GET_LSIZE(bp));  	zcb->zcb_lsize_count[bin]++;  	zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp);  	zcb->zcb_lsize_total += BP_GET_LSIZE(bp); -	bin = highbit64(BP_GET_ASIZE(bp)) - 1; +	if (block_bin_mode == BIN_AUTO) +		bin = BIN(BP_GET_ASIZE(bp));  	zcb->zcb_asize_count[bin]++;  	zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);  	zcb->zcb_asize_total += BP_GET_ASIZE(bp); +#undef BIN + +hist_skipped:  	if (!do_claim)  		return; @@ -6151,7 +6351,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,  	if (zb->zb_level == ZB_DNODE_LEVEL)  		return (0); -	if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) { +	if (dump_opt['b'] >= 5 && BP_GET_BIRTH(bp) > 0) {  		char blkbuf[BP_SPRINTF_LEN];  		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);  		(void) printf("objset %llu object %llu " @@ -6322,8 +6522,9 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)  	ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs)); -	zfs_range_tree_t *allocs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, -	    NULL, 0, 0); +	zfs_range_tree_t *allocs = zfs_range_tree_create_flags( +	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0, +	    0, "zdb_claim_removing:allocs");  	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {  		metaslab_t *msp = vd->vdev_ms[msi]; @@ -6750,6 +6951,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)  	spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;  	spa->spa_log_class->mc_ops = &zdb_metaslab_ops;  	spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops; +	spa->spa_special_embedded_log_class->mc_ops = &zdb_metaslab_ops;  	zcb->zcb_vd_obsolete_counts =  	    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), @@ -6887,7 +7089,9 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)  		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {  			metaslab_t *msp = vd->vdev_ms[m];  			ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class == -			    spa_embedded_log_class(spa)) ? +			    spa_embedded_log_class(spa) || +			    msp->ms_group->mg_class == +			    spa_special_embedded_log_class(spa)) ?  			    vd->vdev_log_mg : vd->vdev_mg);  			/* @@ -7011,7 +7215,7 @@ deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)  static void  dump_livelist_cb(dsl_deadlist_t *ll, void *arg)  { -	ASSERT3P(arg, ==, NULL); +	ASSERT0P(arg);  	global_feature_count[SPA_FEATURE_LIVELIST]++;  	dump_blkptr_list(ll, "Deleted Livelist");  	dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL); @@ -7121,6 +7325,8 @@ dump_block_stats(spa_t *spa)  	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));  	zcb->zcb_totalasize +=  	    metaslab_class_get_alloc(spa_embedded_log_class(spa)); +	zcb->zcb_totalasize += +	    metaslab_class_get_alloc(spa_special_embedded_log_class(spa));  	zcb->zcb_start = zcb->zcb_lastprint = gethrtime();  	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb); @@ -7169,6 +7375,7 @@ dump_block_stats(spa_t *spa)  	total_alloc = norm_alloc +  	    metaslab_class_get_alloc(spa_log_class(spa)) +  	    metaslab_class_get_alloc(spa_embedded_log_class(spa)) + +	    metaslab_class_get_alloc(spa_special_embedded_log_class(spa)) +  	    metaslab_class_get_alloc(spa_special_class(spa)) +  	    metaslab_class_get_alloc(spa_dedup_class(spa)) +  	    get_unflushed_alloc_space(spa); @@ -7252,6 +7459,18 @@ dump_block_stats(spa_t *spa)  		    100.0 * alloc / space);  	} +	if (spa_special_embedded_log_class(spa)->mc_allocator[0].mca_rotor +	    != NULL) { +		uint64_t alloc = metaslab_class_get_alloc( +		    spa_special_embedded_log_class(spa)); +		uint64_t space = metaslab_class_get_space( +		    spa_special_embedded_log_class(spa)); + +		(void) printf("\t%-16s %14llu     used: %5.2f%%\n", +		    "Special embedded log", (u_longlong_t)alloc, +		    100.0 * alloc / space); +	} +  	for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {  		if (zcb->zcb_embedded_blocks[i] == 0)  			continue; @@ -7706,7 +7925,8 @@ zdb_set_skip_mmp(char *target)   * applies to the new_path parameter if allocated.   */  static char * -import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) +import_checkpointed_state(char *target, nvlist_t *cfg, boolean_t target_is_spa, +    char **new_path)  {  	int error = 0;  	char *poolname, *bogus_name = NULL; @@ -7714,11 +7934,11 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)  	/* If the target is not a pool, the extract the pool name */  	char *path_start = strchr(target, '/'); -	if (path_start != NULL) { +	if (target_is_spa || path_start == NULL) { +		poolname = target; +	} else {  		size_t poolname_len = path_start - target;  		poolname = strndup(target, poolname_len); -	} else { -		poolname = target;  	}  	if (cfg == NULL) { @@ -7749,10 +7969,11 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)  		    "with error %d\n", bogus_name, error);  	} -	if (new_path != NULL && path_start != NULL) { -		if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) { +	if (new_path != NULL && !target_is_spa) { +		if (asprintf(new_path, "%s%s", bogus_name, +		    path_start != NULL ? path_start : "") == -1) {  			free(bogus_name); -			if (path_start != NULL) +			if (!target_is_spa && path_start != NULL)  				free(poolname);  			return (NULL);  		} @@ -7891,7 +8112,7 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)  		for (uint64_t c = ckpoint_rvd->vdev_children;  		    c < current_rvd->vdev_children; c++) {  			vdev_t *current_vd = current_rvd->vdev_child[c]; -			VERIFY3P(current_vd->vdev_checkpoint_sm, ==, NULL); +			VERIFY0P(current_vd->vdev_checkpoint_sm);  		}  	} @@ -7981,7 +8202,7 @@ verify_checkpoint_blocks(spa_t *spa)  	 * name) so we can do verification on it against the current state  	 * of the pool.  	 */ -	checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, +	checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, B_TRUE,  	    NULL);  	ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); @@ -8451,8 +8672,9 @@ dump_zpool(spa_t *spa)  	if (dump_opt['d'] || dump_opt['i']) {  		spa_feature_t f; -		mos_refd_objs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, -		    NULL, 0, 0); +		mos_refd_objs = zfs_range_tree_create_flags( +		    NULL, ZFS_RANGE_SEG64, NULL, 0, 0, +		    0, "dump_zpool:mos_refd_objs");  		dump_objset(dp->dp_meta_objset);  		if (dump_opt['d'] >= 3) { @@ -8588,9 +8810,9 @@ zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)  }  static void -zdb_dump_gbh(void *buf, int flags) +zdb_dump_gbh(void *buf, uint64_t size, int flags)  { -	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); +	zdb_dump_indirect((blkptr_t *)buf, gbh_nblkptrs(size), flags);  }  static void @@ -8780,7 +9002,6 @@ zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,  	(void) buf;  	uint64_t orig_lsize = lsize;  	boolean_t tryzle = ((getenv("ZDB_NO_ZLE") == NULL)); -	boolean_t found = B_FALSE;  	/*  	 * We don't know how the data was compressed, so just try  	 * every decompress function at every inflated blocksize. @@ -8823,20 +9044,19 @@ zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,  		for (cfuncp = cfuncs; *cfuncp; cfuncp++) {  			if (try_decompress_block(pabd, lsize, psize, flags,  			    *cfuncp, lbuf, lbuf2)) { -				found = B_TRUE; +				tryzle = B_FALSE;  				break;  			}  		}  		if (*cfuncp != 0)  			break;  	} -	if (!found && tryzle) { +	if (tryzle) {  		for (lsize = orig_lsize; lsize <= maxlsize;  		    lsize += SPA_MINBLOCKSIZE) {  			if (try_decompress_block(pabd, lsize, psize, flags,  			    ZIO_COMPRESS_ZLE, lbuf, lbuf2)) {  				*cfuncp = ZIO_COMPRESS_ZLE; -				found = B_TRUE;  				break;  			}  		} @@ -9073,7 +9293,7 @@ zdb_read_block(char *thing, spa_t *spa)  		zdb_dump_indirect((blkptr_t *)buf,  		    orig_lsize / sizeof (blkptr_t), flags);  	else if (flags & ZDB_FLAG_GBH) -		zdb_dump_gbh(buf, flags); +		zdb_dump_gbh(buf, lsize, flags);  	else  		zdb_dump_block(thing, buf, lsize, flags); @@ -9120,7 +9340,7 @@ zdb_read_block(char *thing, spa_t *spa)  				ck_zio->io_offset =  				    DVA_GET_OFFSET(&bp->blk_dva[0]);  				ck_zio->io_bp = bp; -				zio_checksum_compute(ck_zio, ck, pabd, lsize); +				zio_checksum_compute(ck_zio, ck, pabd, psize);  				printf(  				    "%12s\t"  				    "cksum=%016llx:%016llx:%016llx:%016llx\n", @@ -9313,6 +9533,12 @@ main(int argc, char **argv)  		{"all-reconstruction",	no_argument,		NULL, 'Y'},  		{"livelist",		no_argument,		NULL, 'y'},  		{"zstd-headers",	no_argument,		NULL, 'Z'}, +		{"allocated-map",	no_argument,		NULL, +		    ARG_ALLOCATED}, +		{"bin",			required_argument,	NULL, +		    ARG_BLOCK_BIN_MODE}, +		{"class",		required_argument,	NULL, +		    ARG_BLOCK_CLASSES},  		{0, 0, 0, 0}  	}; @@ -9343,6 +9569,7 @@ main(int argc, char **argv)  		case 'u':  		case 'y':  		case 'Z': +		case ARG_ALLOCATED:  			dump_opt[c]++;  			dump_all = 0;  			break; @@ -9377,9 +9604,11 @@ main(int argc, char **argv)  			while (*optarg != '\0') { *optarg++ = '*'; }  			break;  		case 'o': -			error = set_global_var(optarg); +			dump_opt[c]++; +			dump_all = 0; +			error = handle_tunable_option(optarg, B_FALSE);  			if (error != 0) -				usage(); +				zdb_exit(1);  			break;  		case 'p':  			if (searchdirs == NULL) { @@ -9423,6 +9652,59 @@ main(int argc, char **argv)  		case 'x':  			vn_dumpdir = optarg;  			break; +		case ARG_BLOCK_BIN_MODE: +			if (strcmp(optarg, "lsize") == 0) { +				block_bin_mode = BIN_LSIZE; +			} else if (strcmp(optarg, "psize") == 0) { +				block_bin_mode = BIN_PSIZE; +			} else if (strcmp(optarg, "asize") == 0) { +				block_bin_mode = BIN_ASIZE; +			} else { +				(void) fprintf(stderr, +				    "--bin=\"%s\" must be one of \"lsize\", " +				    "\"psize\" or \"asize\"\n", optarg); +				usage(); +			} +			break; + +		case ARG_BLOCK_CLASSES: { +			char *buf = strdup(optarg), *tok = buf, *next, +			    *save = NULL; + +			while ((next = strtok_r(tok, ",", &save)) != NULL) { +				tok = NULL; + +				if (strcmp(next, "normal") == 0) { +					block_classes |= CLASS_NORMAL; +				} else if (strcmp(next, "special") == 0) { +					block_classes |= CLASS_SPECIAL; +				} else if (strcmp(next, "dedup") == 0) { +					block_classes |= CLASS_DEDUP; +				} else if (strcmp(next, "other") == 0) { +					block_classes |= CLASS_OTHER; +				} else { +					(void) fprintf(stderr, +					    "--class=\"%s\" must be a " +					    "comma-separated list of either " +					    "\"normal\", \"special\", " +					    "\"asize\" or \"other\"; " +					    "got \"%s\"\n", +					    optarg, next); +					usage(); +				} +			} + +			if (block_classes == 0) { +				(void) fprintf(stderr, +				    "--class= must be a comma-separated " +				    "list of either \"normal\", \"special\", " +				    "\"asize\" or \"other\"; got empty\n"); +				usage(); +			} + +			free(buf); +			break; +		}  		default:  			usage();  			break; @@ -9465,6 +9747,9 @@ main(int argc, char **argv)  	 */  	spa_mode_readable_spacemaps = B_TRUE; +	libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2)); +	zfs_recover = (dump_opt['A'] > 1); +  	if (dump_all)  		verbose = MAX(verbose, 1); @@ -9475,9 +9760,6 @@ main(int argc, char **argv)  			dump_opt[c] += verbose;  	} -	libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2)); -	zfs_recover = (dump_opt['A'] > 1); -  	argc -= optind;  	argv += optind;  	if (argc < 2 && dump_opt['R']) @@ -9545,6 +9827,12 @@ main(int argc, char **argv)  			error = 0;  			goto fini;  		} +		if (dump_opt['o']) +			/* +			 * Avoid blasting tunable options off the top of the +			 * screen. +			 */ +			zdb_exit(1);  		usage();  	} @@ -9605,7 +9893,7 @@ main(int argc, char **argv)  		} else if (objset_str && !zdb_numeric(objset_str + 1) &&  		    dump_opt['N']) {  			printf("Supply a numeric objset ID with -N\n"); -			error = 1; +			error = 2;  			goto fini;  		}  	} else { @@ -9697,7 +9985,7 @@ main(int argc, char **argv)  	char *checkpoint_target = NULL;  	if (dump_opt['k']) {  		checkpoint_pool = import_checkpointed_state(target, cfg, -		    &checkpoint_target); +		    target_is_spa, &checkpoint_target);  		if (checkpoint_target != NULL)  			target = checkpoint_target; @@ -9714,7 +10002,7 @@ main(int argc, char **argv)  	if (error == 0) {  		if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {  			ASSERT(checkpoint_pool != NULL); -			ASSERT(checkpoint_target == NULL); +			ASSERT0P(checkpoint_target);  			error = spa_open(checkpoint_pool, &spa, FTAG);  			if (error != 0) { @@ -9907,5 +10195,8 @@ fini:  	if (kernel_init_done)  		kernel_fini(); +	if (corruption_found && error == 0) +		error = 3; +  	return (error);  } diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.h b/sys/contrib/openzfs/cmd/zdb/zdb.h index 6b6c9169816b..48b561eb202c 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.h +++ b/sys/contrib/openzfs/cmd/zdb/zdb.h @@ -29,6 +29,6 @@  #define	_ZDB_H  void dump_intent_log(zilog_t *); -extern uint8_t dump_opt[256]; +extern uint8_t dump_opt[512];  #endif	/* _ZDB_H */ diff --git a/sys/contrib/openzfs/cmd/zdb/zdb_il.c b/sys/contrib/openzfs/cmd/zdb/zdb_il.c index 6b90b08ca1b1..3d91fb28a4c7 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb_il.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb_il.c @@ -48,8 +48,6 @@  #include "zdb.h" -extern uint8_t dump_opt[256]; -  static char tab_prefix[4] = "\t\t\t";  static void @@ -176,7 +174,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)  	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {  		(void) printf("%shas blkptr, %s\n", tab_prefix, -		    !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >= +		    !BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) >=  		    spa_min_claim_txg(zilog->zl_spa) ?  		    "will claim" : "won't claim");  		print_log_bp(bp, tab_prefix); @@ -189,7 +187,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)  			(void) printf("%s<hole>\n", tab_prefix);  			return;  		} -		if (BP_GET_LOGICAL_BIRTH(bp) < zilog->zl_header->zh_claim_txg) { +		if (BP_GET_BIRTH(bp) < zilog->zl_header->zh_claim_txg) {  			(void) printf("%s<block already committed>\n",  			    tab_prefix);  			return; @@ -240,7 +238,7 @@ zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg)  	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {  		(void) printf("%shas blkptr, %s\n", tab_prefix, -		    !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >= +		    !BP_IS_HOLE(bp) && BP_GET_BIRTH(bp) >=  		    spa_min_claim_txg(zilog->zl_spa) ?  		    "will claim" : "won't claim");  		print_log_bp(bp, tab_prefix); @@ -476,7 +474,7 @@ print_log_block(zilog_t *zilog, const blkptr_t *bp, void *arg,  	if (claim_txg != 0)  		claim = "already claimed"; -	else if (BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa)) +	else if (BP_GET_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa))  		claim = "will claim";  	else  		claim = "won't claim"; diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c index 8718dbde03b6..c0590edc7516 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c @@ -134,11 +134,13 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)  	 * of blkid cache and L2ARC VDEV does not contain pool guid in its  	 * blkid, so this is a special case for L2ARC VDEV.  	 */ -	else if (gsp->gs_vdev_guid != 0 && gsp->gs_devid == NULL && +	else if (gsp->gs_vdev_guid != 0 &&  	    nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &vdev_guid) == 0 &&  	    gsp->gs_vdev_guid == vdev_guid) { -		(void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, -		    &gsp->gs_devid); +		if (gsp->gs_devid == NULL) { +			(void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, +			    &gsp->gs_devid); +		}  		(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,  		    &gsp->gs_vdev_expandtime);  		return (B_TRUE); @@ -156,22 +158,28 @@ zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)  	/*  	 * For each vdev in this pool, look for a match by devid  	 */ -	if ((config = zpool_get_config(zhp, NULL)) != NULL) { -		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, -		    &nvl) == 0) { -			(void) zfs_agent_iter_vdev(zhp, nvl, gsp); -		} -	} -	/* -	 * if a match was found then grab the pool guid -	 */ -	if (gsp->gs_vdev_guid && gsp->gs_devid) { -		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, -		    &gsp->gs_pool_guid); -	} +	boolean_t found = B_FALSE; +	uint64_t pool_guid; +	/* Get pool configuration and extract pool GUID */ +	if ((config = zpool_get_config(zhp, NULL)) == NULL || +	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, +	    &pool_guid) != 0) +		goto out; + +	/* Skip this pool if we're looking for a specific pool */ +	if (gsp->gs_pool_guid != 0 && pool_guid != gsp->gs_pool_guid) +		goto out; + +	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) == 0) +		found = zfs_agent_iter_vdev(zhp, nvl, gsp); + +	if (found && gsp->gs_pool_guid == 0) +		gsp->gs_pool_guid = pool_guid; + +out:  	zpool_close(zhp); -	return (gsp->gs_devid != NULL && gsp->gs_vdev_guid != 0); +	return (found);  }  void @@ -233,20 +241,17 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)  		 * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or  		 * ZFS_EV_POOL_GUID may be missing so find them.  		 */ -		if (devid == NULL || pool_guid == 0 || vdev_guid == 0) { -			if (devid == NULL) -				search.gs_vdev_guid = vdev_guid; -			else -				search.gs_devid = devid; -			zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search); -			if (devid == NULL) -				devid = search.gs_devid; -			if (pool_guid == 0) -				pool_guid = search.gs_pool_guid; -			if (vdev_guid == 0) -				vdev_guid = search.gs_vdev_guid; -			devtype = search.gs_vdev_type; -		} +		search.gs_devid = devid; +		search.gs_vdev_guid = vdev_guid; +		search.gs_pool_guid = pool_guid; +		zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search); +		if (devid == NULL) +			devid = search.gs_devid; +		if (pool_guid == 0) +			pool_guid = search.gs_pool_guid; +		if (vdev_guid == 0) +			vdev_guid = search.gs_vdev_guid; +		devtype = search.gs_vdev_type;  		/*  		 * We want to avoid reporting "remove" events coming from diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am b/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am index 093a04c4636a..c0b161ecf248 100644 --- a/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am +++ b/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am @@ -9,18 +9,18 @@ dist_zedexec_SCRIPTS = \  	%D%/all-debug.sh \  	%D%/all-syslog.sh \  	%D%/data-notify.sh \ -	%D%/deadman-slot_off.sh \ +	%D%/deadman-sync-slot_off.sh \  	%D%/generic-notify.sh \ -	%D%/pool_import-led.sh \ +	%D%/pool_import-sync-led.sh \  	%D%/resilver_finish-notify.sh \  	%D%/resilver_finish-start-scrub.sh \  	%D%/scrub_finish-notify.sh \ -	%D%/statechange-led.sh \ +	%D%/statechange-sync-led.sh \  	%D%/statechange-notify.sh \ -	%D%/statechange-slot_off.sh \ +	%D%/statechange-sync-slot_off.sh \  	%D%/trim_finish-notify.sh \ -	%D%/vdev_attach-led.sh \ -	%D%/vdev_clear-led.sh +	%D%/vdev_attach-sync-led.sh \ +	%D%/vdev_clear-sync-led.sh  nodist_zedexec_SCRIPTS = \  	%D%/history_event-zfs-list-cacher.sh @@ -30,17 +30,17 @@ SUBSTFILES += $(nodist_zedexec_SCRIPTS)  zedconfdefaults = \  	all-syslog.sh \  	data-notify.sh \ -	deadman-slot_off.sh \ +	deadman-sync-slot_off.sh \  	history_event-zfs-list-cacher.sh \ -	pool_import-led.sh \ +	pool_import-sync-led.sh \  	resilver_finish-notify.sh \  	resilver_finish-start-scrub.sh \  	scrub_finish-notify.sh \ -	statechange-led.sh \ +	statechange-sync-led.sh \  	statechange-notify.sh \ -	statechange-slot_off.sh \ -	vdev_attach-led.sh \ -	vdev_clear-led.sh +	statechange-sync-slot_off.sh \ +	vdev_attach-sync-led.sh \ +	vdev_clear-sync-led.sh  dist_noinst_DATA += %D%/README diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/deadman-slot_off.sh b/sys/contrib/openzfs/cmd/zed/zed.d/deadman-sync-slot_off.sh index 7b339b3add01..7b339b3add01 100755 --- a/sys/contrib/openzfs/cmd/zed/zed.d/deadman-slot_off.sh +++ b/sys/contrib/openzfs/cmd/zed/zed.d/deadman-sync-slot_off.sh diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh deleted file mode 120000 index 7d7404398a4a..000000000000 --- a/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh +++ /dev/null @@ -1 +0,0 @@ -statechange-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-sync-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-sync-led.sh new file mode 120000 index 000000000000..8b9c10c11ebb --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-sync-led.sh @@ -0,0 +1 @@ +statechange-sync-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-sync-led.sh index 40cb61f17307..40cb61f17307 100755 --- a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh +++ b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-sync-led.sh diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-slot_off.sh b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-sync-slot_off.sh index 06acce93b8aa..06acce93b8aa 100755 --- a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-slot_off.sh +++ b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-sync-slot_off.sh diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh deleted file mode 120000 index 7d7404398a4a..000000000000 --- a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh +++ /dev/null @@ -1 +0,0 @@ -statechange-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-sync-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-sync-led.sh new file mode 120000 index 000000000000..8b9c10c11ebb --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-sync-led.sh @@ -0,0 +1 @@ +statechange-sync-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh deleted file mode 120000 index 7d7404398a4a..000000000000 --- a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh +++ /dev/null @@ -1 +0,0 @@ -statechange-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-sync-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-sync-led.sh new file mode 120000 index 000000000000..8b9c10c11ebb --- /dev/null +++ b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-sync-led.sh @@ -0,0 +1 @@ +statechange-sync-led.sh
\ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh b/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh index 6e00f153be1c..78d8f658ddd8 100644 --- a/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh +++ b/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh @@ -441,8 +441,9 @@ zed_notify_slack_webhook()          "${pathname}")"      # Construct the JSON message for posting. +    # shellcheck disable=SC2016      # -    msg_json="$(printf '{"text": "*%s*\\n%s"}' "${subject}" "${msg_body}" )" +    msg_json="$(printf '{"text": "*%s*\\n```%s```"}' "${subject}" "${msg_body}" )"      # Send the POST request and check for errors.      # diff --git a/sys/contrib/openzfs/cmd/zed/zed_event.c b/sys/contrib/openzfs/cmd/zed/zed_event.c index 296c222ca382..ba7cba304b1d 100644 --- a/sys/contrib/openzfs/cmd/zed/zed_event.c +++ b/sys/contrib/openzfs/cmd/zed/zed_event.c @@ -110,7 +110,7 @@ zed_event_fini(struct zed_conf *zcp)  static void  _bump_event_queue_length(void)  { -	int zzlm = -1, wr; +	int zzlm, wr;  	char qlen_buf[12] = {0}; /* parameter is int => max "-2147483647\n" */  	long int qlen, orig_qlen; diff --git a/sys/contrib/openzfs/cmd/zed/zed_exec.c b/sys/contrib/openzfs/cmd/zed/zed_exec.c index 036081decd64..a14af4f20a85 100644 --- a/sys/contrib/openzfs/cmd/zed/zed_exec.c +++ b/sys/contrib/openzfs/cmd/zed/zed_exec.c @@ -196,37 +196,29 @@ _nop(int sig)  	(void) sig;  } -static void * -_reap_children(void *arg) +static void +wait_for_children(boolean_t do_pause, boolean_t wait)  { -	(void) arg; -	struct launched_process_node node, *pnode;  	pid_t pid; -	int status;  	struct rusage usage; -	struct sigaction sa = {}; - -	(void) sigfillset(&sa.sa_mask); -	(void) sigdelset(&sa.sa_mask, SIGCHLD); -	(void) pthread_sigmask(SIG_SETMASK, &sa.sa_mask, NULL); - -	(void) sigemptyset(&sa.sa_mask); -	sa.sa_handler = _nop; -	sa.sa_flags = SA_NOCLDSTOP; -	(void) sigaction(SIGCHLD, &sa, NULL); +	int status; +	struct launched_process_node node, *pnode;  	for (_reap_children_stop = B_FALSE; !_reap_children_stop; ) {  		(void) pthread_mutex_lock(&_launched_processes_lock); -		pid = wait4(0, &status, WNOHANG, &usage); - +		pid = wait4(0, &status, wait ? 0 : WNOHANG, &usage);  		if (pid == 0 || pid == (pid_t)-1) {  			(void) pthread_mutex_unlock(&_launched_processes_lock); -			if (pid == 0 || errno == ECHILD) -				pause(); -			else if (errno != EINTR) +			if ((pid == 0) || (errno == ECHILD)) { +				if (do_pause) +					pause(); +			} else if (errno != EINTR)  				zed_log_msg(LOG_WARNING,  				    "Failed to wait for children: %s",  				    strerror(errno)); +			if (!do_pause) +				return; +  		} else {  			memset(&node, 0, sizeof (node));  			node.pid = pid; @@ -278,6 +270,25 @@ _reap_children(void *arg)  		}  	} +} + +static void * +_reap_children(void *arg) +{ +	(void) arg; +	struct sigaction sa = {}; + +	(void) sigfillset(&sa.sa_mask); +	(void) sigdelset(&sa.sa_mask, SIGCHLD); +	(void) pthread_sigmask(SIG_SETMASK, &sa.sa_mask, NULL); + +	(void) sigemptyset(&sa.sa_mask); +	sa.sa_handler = _nop; +	sa.sa_flags = SA_NOCLDSTOP; +	(void) sigaction(SIGCHLD, &sa, NULL); + +	wait_for_children(B_TRUE, B_FALSE); +  	return (NULL);  } @@ -307,6 +318,45 @@ zed_exec_fini(void)  }  /* + * Check if the zedlet name indicates if it is a synchronous zedlet + * + * Synchronous zedlets have a "-sync-" immediately following the event name in + * their zedlet filename, like: + * + * EVENT_NAME-sync-ZEDLETNAME.sh + * + * For example, if you wanted a synchronous statechange script: + * + * statechange-sync-myzedlet.sh + * + * Synchronous zedlets are guaranteed to be the only zedlet running.  No other + * zedlets may run in parallel with a synchronous zedlet.  A synchronous + * zedlet will wait for all previously spawned zedlets to finish before running. + * Users should be careful to only use synchronous zedlets when needed, since + * they decrease parallelism. + */ +static boolean_t +zedlet_is_sync(const char *zedlet, const char *event) +{ +	const char *sync_str = "-sync-"; +	size_t sync_str_len; +	size_t zedlet_len; +	size_t event_len; + +	sync_str_len = strlen(sync_str); +	zedlet_len = strlen(zedlet); +	event_len = strlen(event); + +	if (event_len + sync_str_len >= zedlet_len) +		return (B_FALSE); + +	if (strncmp(&zedlet[event_len], sync_str, sync_str_len) == 0) +		return (B_TRUE); + +	return (B_FALSE); +} + +/*   * Process the event [eid] by synchronously invoking all zedlets with a   * matching class prefix.   * @@ -368,9 +418,28 @@ zed_exec_process(uint64_t eid, const char *class, const char *subclass,  	    z = zed_strings_next(zcp->zedlets)) {  		for (csp = class_strings; *csp; csp++) {  			n = strlen(*csp); -			if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n])) +			if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n])) { +				boolean_t is_sync = zedlet_is_sync(z, *csp); + +				if (is_sync) { +					/* +					 * Wait for previous zedlets to +					 * finish +					 */ +					wait_for_children(B_FALSE, B_TRUE); +				} +  				_zed_exec_fork_child(eid, zcp->zedlet_dir,  				    z, e, zcp->zevent_fd, zcp->do_foreground); + +				if (is_sync) { +					/* +					 * Wait for sync zedlet we just launched +					 * to finish. +					 */ +					wait_for_children(B_FALSE, B_TRUE); +				} +			}  		}  	}  	free(e); diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c index 841c356508a5..ccdd5ffef8e6 100644 --- a/sys/contrib/openzfs/cmd/zfs/zfs_main.c +++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c @@ -440,7 +440,7 @@ get_usage(zfs_help_t idx)  		return (gettext("\tredact <snapshot> <bookmark> "  		    "<redaction_snapshot> ...\n"));  	case HELP_REWRITE: -		return (gettext("\trewrite [-rvx] [-o <offset>] [-l <length>] " +		return (gettext("\trewrite [-Prvx] [-o <offset>] [-l <length>] "  		    "<directory|file ...>\n"));  	case HELP_JAIL:  		return (gettext("\tjail <jailid|jailname> <filesystem>\n")); @@ -914,7 +914,11 @@ zfs_do_clone(int argc, char **argv)  			log_history = B_FALSE;  		} -		ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET); +		/* +		 * Dataset cloned successfully, mount/share failures are +		 * non-fatal. +		 */ +		(void) zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET);  	}  	zfs_close(zhp); @@ -923,26 +927,22 @@ zfs_do_clone(int argc, char **argv)  	return (!!ret);  usage: -	ASSERT3P(zhp, ==, NULL); +	ASSERT0P(zhp);  	nvlist_free(props);  	usage(B_FALSE);  	return (-1);  }  /* - * Return a default volblocksize for the pool which always uses more than - * half of the data sectors.  This primarily applies to dRAID which always - * writes full stripe widths. + * Calculate the minimum allocation size based on the top-level vdevs.   */  static uint64_t -default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +calculate_volblocksize(nvlist_t *config)  { -	uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; +	uint64_t asize = SPA_MINBLOCKSIZE;  	nvlist_t *tree, **vdevs;  	uint_t nvdevs; -	nvlist_t *config = zpool_get_config(zhp, NULL); -  	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||  	    nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,  	    &vdevs, &nvdevs) != 0) { @@ -973,6 +973,24 @@ default_volblocksize(zpool_handle_t *zhp, nvlist_t *props)  		}  	} +	return (asize); +} + +/* + * Return a default volblocksize for the pool which always uses more than + * half of the data sectors.  This primarily applies to dRAID which always + * writes full stripe widths. + */ +static uint64_t +default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +{ +	uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + +	nvlist_t *config = zpool_get_config(zhp, NULL); + +	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_MAX_ALLOC, &asize) != 0) +		asize = calculate_volblocksize(config); +  	/*  	 * Calculate the target volblocksize such that more than half  	 * of the asize is used. The following table is for 4k sectors. @@ -1319,7 +1337,9 @@ zfs_do_create(int argc, char **argv)  		goto error;  	} -	ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET); +	/* Dataset created successfully, mount/share failures are non-fatal */ +	ret = 0; +	(void) zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET);  error:  	nvlist_free(props);  	return (ret); @@ -1974,9 +1994,8 @@ fill_dataset_info(nvlist_t *list, zfs_handle_t *zhp, boolean_t as_int)  	}  	if (type == ZFS_TYPE_SNAPSHOT) { -		char *ds, *snap; -		ds = snap = strdup(zfs_get_name(zhp)); -		ds = strsep(&snap, "@"); +		char *snap = strdup(zfs_get_name(zhp)); +		char *ds = strsep(&snap, "@");  		fnvlist_add_string(list, "dataset", ds);  		fnvlist_add_string(list, "snapshot_name", snap);  		free(ds); @@ -2019,8 +2038,7 @@ get_callback(zfs_handle_t *zhp, void *data)  	nvlist_t *user_props = zfs_get_user_props(zhp);  	zprop_list_t *pl = cbp->cb_proplist;  	nvlist_t *propval; -	nvlist_t *item, *d, *props; -	item = d = props = NULL; +	nvlist_t *item, *d = NULL, *props = NULL;  	const char *strval;  	const char *sourceval;  	boolean_t received = is_recvd_column(cbp); @@ -5305,6 +5323,7 @@ zfs_do_receive(int argc, char **argv)  #define	ZFS_DELEG_PERM_MOUNT		"mount"  #define	ZFS_DELEG_PERM_SHARE		"share"  #define	ZFS_DELEG_PERM_SEND		"send" +#define	ZFS_DELEG_PERM_SEND_RAW		"send:raw"  #define	ZFS_DELEG_PERM_RECEIVE		"receive"  #define	ZFS_DELEG_PERM_RECEIVE_APPEND	"receive:append"  #define	ZFS_DELEG_PERM_ALLOW		"allow" @@ -5347,6 +5366,7 @@ static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = {  	{ ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },  	{ ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },  	{ ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, +	{ ZFS_DELEG_PERM_SEND_RAW, ZFS_DELEG_NOTE_SEND_RAW },  	{ ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },  	{ ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },  	{ ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, @@ -5879,7 +5899,7 @@ parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl)  static inline const char *  deleg_perm_comment(zfs_deleg_note_t note)  { -	const char *str = ""; +	const char *str;  	/* subcommands */  	switch (note) { @@ -5931,6 +5951,10 @@ deleg_perm_comment(zfs_deleg_note_t note)  	case ZFS_DELEG_NOTE_SEND:  		str = gettext("");  		break; +	case ZFS_DELEG_NOTE_SEND_RAW: +		str = gettext("Allow sending ONLY encrypted (raw) replication" +		    "\n\t\t\t\tstreams"); +		break;  	case ZFS_DELEG_NOTE_SHARE:  		str = gettext("Allows sharing file systems over NFS or SMB"  		    "\n\t\t\t\tprotocols"); @@ -6860,17 +6884,17 @@ print_holds(boolean_t scripted, int nwidth, int tagwidth, nvlist_t *nvl,  			if (scripted) {  				if (parsable) { -					(void) printf("%s\t%s\t%ld\n", zname, -					    tagname, (unsigned long)time); +					(void) printf("%s\t%s\t%lld\n", zname, +					    tagname, (long long)time);  				} else {  					(void) printf("%s\t%s\t%s\n", zname,  					    tagname, tsbuf);  				}  			} else {  				if (parsable) { -					(void) printf("%-*s  %-*s  %ld\n", +					(void) printf("%-*s  %-*s  %lld\n",  					    nwidth, zname, tagwidth, -					    tagname, (unsigned long)time); +					    tagname, (long long)time);  				} else {  					(void) printf("%-*s  %-*s  %s\n",  					    nwidth, zname, tagwidth, @@ -7729,6 +7753,7 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)  	struct extmnttab entry;  	const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";  	ino_t path_inode; +	char *zfs_mntpnt, *entry_mntpnt;  	/*  	 * Search for the given (major,minor) pair in the mount table. @@ -7770,6 +7795,24 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)  		goto out;  	} +	/* +	 * If the filesystem is mounted, check that the mountpoint matches +	 * the one in the mnttab entry w.r.t. provided path. If it doesn't, +	 * then we should not proceed further. +	 */ +	entry_mntpnt = strdup(entry.mnt_mountp); +	if (zfs_is_mounted(zhp, &zfs_mntpnt)) { +		if (strcmp(zfs_mntpnt, entry_mntpnt) != 0) { +			(void) fprintf(stderr, gettext("cannot %s '%s': " +			    "not an original mountpoint\n"), cmdname, path); +			free(zfs_mntpnt); +			free(entry_mntpnt); +			goto out; +		} +		free(zfs_mntpnt); +	} +	free(entry_mntpnt); +  	if (op == OP_SHARE) {  		char nfs_mnt_prop[ZFS_MAXPROPLEN];  		char smbshare_prop[ZFS_MAXPROPLEN]; @@ -9160,8 +9203,11 @@ zfs_do_rewrite(int argc, char **argv)  	zfs_rewrite_args_t args;  	memset(&args, 0, sizeof (args)); -	while ((c = getopt(argc, argv, "l:o:rvx")) != -1) { +	while ((c = getopt(argc, argv, "Pl:o:rvx")) != -1) {  		switch (c) { +		case 'P': +			args.flags |= ZFS_REWRITE_PHYSICAL; +			break;  		case 'l':  			args.len = strtoll(optarg, NULL, 0);  			break; diff --git a/sys/contrib/openzfs/cmd/zhack.c b/sys/contrib/openzfs/cmd/zhack.c index 8244bc83fa0d..8ffbf91ffb30 100644 --- a/sys/contrib/openzfs/cmd/zhack.c +++ b/sys/contrib/openzfs/cmd/zhack.c @@ -54,6 +54,7 @@  #include <sys/dmu_tx.h>  #include <zfeature_common.h>  #include <libzutil.h> +#include <sys/metaslab_impl.h>  static importargs_t g_importargs;  static char *g_pool; @@ -69,7 +70,8 @@ static __attribute__((noreturn)) void  usage(void)  {  	(void) fprintf(stderr, -	    "Usage: zhack [-c cachefile] [-d dir] <subcommand> <args> ...\n" +	    "Usage: zhack [-o tunable] [-c cachefile] [-d dir] <subcommand> " +	    "<args> ...\n"  	    "where <subcommand> <args> is one of the following:\n"  	    "\n"); @@ -93,7 +95,10 @@ usage(void)  	    "        -c repair corrupted label checksums\n"  	    "        -u restore the label on a detached device\n"  	    "\n" -	    "    <device> : path to vdev\n"); +	    "    <device> : path to vdev\n" +	    "\n" +	    "    metaslab leak <pool>\n" +	    "        apply allocation map from zdb to specified pool\n");  	exit(1);  } @@ -162,9 +167,9 @@ zhack_import(char *target, boolean_t readonly)  	props = NULL;  	if (readonly) { -		VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); -		VERIFY(nvlist_add_uint64(props, -		    zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0); +		VERIFY0(nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); +		VERIFY0(nvlist_add_uint64(props, +		    zpool_prop_to_name(ZPOOL_PROP_READONLY), 1));  	}  	zfeature_checks_disable = B_TRUE; @@ -218,8 +223,8 @@ dump_obj(objset_t *os, uint64_t obj, const char *name)  		} else {  			ASSERT(za->za_integer_length == 1);  			char val[1024]; -			VERIFY(zap_lookup(os, obj, za->za_name, -			    1, sizeof (val), val) == 0); +			VERIFY0(zap_lookup(os, obj, za->za_name, +			    1, sizeof (val), val));  			(void) printf("\t%s = %s\n", za->za_name, val);  		}  	} @@ -363,10 +368,12 @@ feature_incr_sync(void *arg, dmu_tx_t *tx)  	zfeature_info_t *feature = arg;  	uint64_t refcount; +	mutex_enter(&spa->spa_feat_stats_lock);  	VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));  	feature_sync(spa, feature, refcount + 1, tx);  	spa_history_log_internal(spa, "zhack feature incr", tx,  	    "name=%s", feature->fi_guid); +	mutex_exit(&spa->spa_feat_stats_lock);  }  static void @@ -376,10 +383,12 @@ feature_decr_sync(void *arg, dmu_tx_t *tx)  	zfeature_info_t *feature = arg;  	uint64_t refcount; +	mutex_enter(&spa->spa_feat_stats_lock);  	VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));  	feature_sync(spa, feature, refcount - 1, tx);  	spa_history_log_internal(spa, "zhack feature decr", tx,  	    "name=%s", feature->fi_guid); +	mutex_exit(&spa->spa_feat_stats_lock);  }  static void @@ -496,6 +505,186 @@ zhack_do_feature(int argc, char **argv)  	return (0);  } +static boolean_t +strstarts(const char *a, const char *b) +{ +	return (strncmp(a, b, strlen(b)) == 0); +} + +static void +metaslab_force_alloc(metaslab_t *msp, uint64_t start, uint64_t size, +    dmu_tx_t *tx) +{ +	ASSERT(msp->ms_disabled); +	ASSERT(MUTEX_HELD(&msp->ms_lock)); +	uint64_t txg = dmu_tx_get_txg(tx); + +	uint64_t off = start; +	while (off < start + size) { +		uint64_t ostart, osize; +		boolean_t found = zfs_range_tree_find_in(msp->ms_allocatable, +		    off, start + size - off, &ostart, &osize); +		if (!found) +			break; +		zfs_range_tree_remove(msp->ms_allocatable, ostart, osize); + +		if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) +			vdev_dirty(msp->ms_group->mg_vd, VDD_METASLAB, msp, +			    txg); + +		zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], ostart, +		    osize); +		msp->ms_allocating_total += osize; +		off = ostart + osize; +	} +} + +static void +zhack_do_metaslab_leak(int argc, char **argv) +{ +	int c; +	char *target; +	spa_t *spa; + +	optind = 1; +	boolean_t force = B_FALSE; +	while ((c = getopt(argc, argv, "f")) != -1) { +		switch (c) { +		case 'f': +			force = B_TRUE; +			break; +		default: +			usage(); +			break; +		} +	} + +	argc -= optind; +	argv += optind; + +	if (argc < 1) { +		(void) fprintf(stderr, "error: missing pool name\n"); +		usage(); +	} +	target = argv[0]; + +	zhack_spa_open(target, B_FALSE, FTAG, &spa); +	spa_config_enter(spa, SCL_VDEV | SCL_ALLOC, FTAG, RW_READER); + +	char *line = NULL; +	size_t cap = 0; + +	vdev_t *vd = NULL; +	metaslab_t *prev = NULL; +	dmu_tx_t *tx = NULL; +	while (getline(&line, &cap, stdin) > 0) { +		if (strstarts(line, "\tvdev ")) { +			uint64_t vdev_id, ms_shift; +			if (sscanf(line, +			    "\tvdev %10"PRIu64"\t%*s  metaslab shift %4"PRIu64, +			    &vdev_id, &ms_shift) == 1) { +				VERIFY3U(sscanf(line, "\tvdev %"PRIu64 +				    "\t  metaslab shift %4"PRIu64, +				    &vdev_id, &ms_shift), ==, 2); +			} +			vd = vdev_lookup_top(spa, vdev_id); +			if (vd == NULL) { +				fprintf(stderr, "error: no such vdev with " +				    "id %"PRIu64"\n", vdev_id); +				break; +			} +			if (tx) { +				dmu_tx_commit(tx); +				mutex_exit(&prev->ms_lock); +				metaslab_enable(prev, B_FALSE, B_FALSE); +				tx = NULL; +				prev = NULL; +			} +			if (vd->vdev_ms_shift != ms_shift) { +				fprintf(stderr, "error: ms_shift mismatch: %" +				    PRIu64" != %"PRIu64"\n", vd->vdev_ms_shift, +				    ms_shift); +				break; +			} +		} else if (strstarts(line, "\tmetaslabs ")) { +			uint64_t ms_count; +			VERIFY3U(sscanf(line, "\tmetaslabs %"PRIu64, &ms_count), +			    ==, 1); +			ASSERT(vd); +			if (!force && vd->vdev_ms_count != ms_count) { +				fprintf(stderr, "error: ms_count mismatch: %" +				    PRIu64" != %"PRIu64"\n", vd->vdev_ms_count, +				    ms_count); +				break; +			} +		} else if (strstarts(line, "ALLOC:")) { +			uint64_t start, size; +			VERIFY3U(sscanf(line, "ALLOC: %"PRIu64" %"PRIu64"\n", +			    &start, &size), ==, 2); + +			ASSERT(vd); +			metaslab_t *cur = +			    vd->vdev_ms[start >> vd->vdev_ms_shift]; +			if (prev != cur) { +				if (prev) { +					dmu_tx_commit(tx); +					mutex_exit(&prev->ms_lock); +					metaslab_enable(prev, B_FALSE, B_FALSE); +				} +				ASSERT(cur); +				metaslab_disable(cur); +				mutex_enter(&cur->ms_lock); +				metaslab_load(cur); +				prev = cur; +				tx = dmu_tx_create_dd( +				    spa_get_dsl(vd->vdev_spa)->dp_root_dir); +				dmu_tx_assign(tx, DMU_TX_WAIT); +			} + +			metaslab_force_alloc(cur, start, size, tx); +		} else { +			continue; +		} +	} +	if (tx) { +		dmu_tx_commit(tx); +		mutex_exit(&prev->ms_lock); +		metaslab_enable(prev, B_FALSE, B_FALSE); +		tx = NULL; +		prev = NULL; +	} +	if (line) +		free(line); + +	spa_config_exit(spa, SCL_VDEV | SCL_ALLOC, FTAG); +	spa_close(spa, FTAG); +} + +static int +zhack_do_metaslab(int argc, char **argv) +{ +	char *subcommand; + +	argc--; +	argv++; +	if (argc == 0) { +		(void) fprintf(stderr, +		    "error: no metaslab operation specified\n"); +		usage(); +	} + +	subcommand = argv[0]; +	if (strcmp(subcommand, "leak") == 0) { +		zhack_do_metaslab_leak(argc, argv); +	} else { +		(void) fprintf(stderr, "error: unknown subcommand: %s\n", +		    subcommand); +		usage(); +	} + +	return (0); +} +  #define	ASHIFT_UBERBLOCK_SHIFT(ashift)	\  	MIN(MAX(ashift, UBERBLOCK_SHIFT), \  	MAX_UBERBLOCK_SHIFT) @@ -525,6 +714,23 @@ zhack_repair_read_label(const int fd, vdev_label_t *vl,  	return (0);  } +static int +zhack_repair_get_byteswap(const zio_eck_t *vdev_eck, const int l, int *byteswap) +{ +	if (vdev_eck->zec_magic == ZEC_MAGIC) { +		*byteswap = B_FALSE; +	} else if (vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC)) { +		*byteswap = B_TRUE; +	} else { +		(void) fprintf(stderr, "error: label %d: " +		    "Expected the nvlist checksum magic number but instead got " +		    "0x%" PRIx64 "\n", +		    l, vdev_eck->zec_magic); +		return (1); +	} +	return (0); +} +  static void  zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset,      const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum) @@ -551,33 +757,10 @@ zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset,  }  static int -zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys, -    const size_t cfg_keys_len, nvlist_t *cfg, nvlist_t *vdev_tree_cfg, -    uint64_t *ashift) +zhack_repair_get_ashift(nvlist_t *cfg, const int l, uint64_t *ashift)  {  	int err; - -	if (ub->ub_txg != 0) { -		(void) fprintf(stderr, -		    "error: label %d: UB TXG of 0 expected, but got %" -		    PRIu64 "\n", -		    l, ub->ub_txg); -		(void) fprintf(stderr, "It would appear the device was not " -		    "properly removed.\n"); -		return (1); -	} - -	for (int i = 0; i < cfg_keys_len; i++) { -		uint64_t val; -		err = nvlist_lookup_uint64(cfg, cfg_keys[i], &val); -		if (err) { -			(void) fprintf(stderr, -			    "error: label %d, %d: " -			    "cannot find nvlist key %s\n", -			    l, i, cfg_keys[i]); -			return (err); -		} -	} +	nvlist_t *vdev_tree_cfg;  	err = nvlist_lookup_nvlist(cfg,  	    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree_cfg); @@ -601,7 +784,7 @@ zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys,  		(void) fprintf(stderr,  		    "error: label %d: nvlist key %s is zero\n",  		    l, ZPOOL_CONFIG_ASHIFT); -		return (err); +		return (1);  	}  	return (0); @@ -616,30 +799,35 @@ zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l)  	 */  	if (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) != 0) {  		const uint64_t txg = BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp); +		int err; +  		ub->ub_txg = txg; -		if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) { +		err = nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG); +		if (err) {  			(void) fprintf(stderr,  			    "error: label %d: "  			    "Failed to remove pool creation TXG\n",  			    l); -			return (1); +			return (err);  		} -		if (nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG) != 0) { +		err = nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG); +		if (err) {  			(void) fprintf(stderr,  			    "error: label %d: Failed to remove pool TXG to "  			    "be replaced.\n",  			    l); -			return (1); +			return (err);  		} -		if (nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg) != 0) { +		err = nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg); +		if (err) {  			(void) fprintf(stderr,  			    "error: label %d: "  			    "Failed to add pool TXG of %" PRIu64 "\n",  			    l, txg); -			return (1); +			return (err);  		}  	} @@ -733,6 +921,7 @@ zhack_repair_test_cksum(const int byteswap, void *vdev_data,  	    BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC;  	const uint64_t actual_magic = vdev_eck->zec_magic;  	int err = 0; +  	if (actual_magic != expected_magic) {  		(void) fprintf(stderr, "error: label %d: "  		    "Expected " @@ -754,6 +943,36 @@ zhack_repair_test_cksum(const int byteswap, void *vdev_data,  	return (err);  } +static int +zhack_repair_unpack_cfg(vdev_label_t *vl, const int l, nvlist_t **cfg) +{ +	const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION, +	    ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID }; +	int err; + +	err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, +	    VDEV_PHYS_SIZE - sizeof (zio_eck_t), cfg, 0); +	if (err) { +		(void) fprintf(stderr, +		    "error: cannot unpack nvlist label %d\n", l); +		return (err); +	} + +	for (int i = 0; i < ARRAY_SIZE(cfg_keys); i++) { +		uint64_t val; +		err = nvlist_lookup_uint64(*cfg, cfg_keys[i], &val); +		if (err) { +			(void) fprintf(stderr, +			    "error: label %d, %d: " +			    "cannot find nvlist key %s\n", +			    l, i, cfg_keys[i]); +			return (err); +		} +	} + +	return (0); +} +  static void  zhack_repair_one_label(const zhack_repair_op_t op, const int fd,      vdev_label_t *vl, const uint64_t label_offset, const int l, @@ -767,10 +986,7 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,  	    (zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1;  	const uint64_t vdev_phys_offset =  	    label_offset + offsetof(vdev_label_t, vl_vdev_phys); -	const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION, -	    ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID };  	nvlist_t *cfg; -	nvlist_t *vdev_tree_cfg = NULL;  	uint64_t ashift;  	int byteswap; @@ -778,18 +994,9 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,  	if (err)  		return; -	if (vdev_eck->zec_magic == 0) { -		(void) fprintf(stderr, "error: label %d: " -		    "Expected the nvlist checksum magic number to not be zero" -		    "\n", -		    l); -		(void) fprintf(stderr, "There should already be a checksum " -		    "for the label.\n"); +	err = zhack_repair_get_byteswap(vdev_eck, l, &byteswap); +	if (err)  		return; -	} - -	byteswap = -	    (vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC));  	if (byteswap) {  		byteswap_uint64_array(&vdev_eck->zec_cksum, @@ -805,16 +1012,7 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,  		return;  	} -	err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, -	    VDEV_PHYS_SIZE - sizeof (zio_eck_t), &cfg, 0); -	if (err) { -		(void) fprintf(stderr, -		    "error: cannot unpack nvlist label %d\n", l); -		return; -	} - -	err = zhack_repair_check_label(ub, -	    l, cfg_keys, ARRAY_SIZE(cfg_keys), cfg, vdev_tree_cfg, &ashift); +	err = zhack_repair_unpack_cfg(vl, l, &cfg);  	if (err)  		return; @@ -822,6 +1020,19 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,  		char *buf;  		size_t buflen; +		if (ub->ub_txg != 0) { +			(void) fprintf(stderr, +			    "error: label %d: UB TXG of 0 expected, but got %" +			    PRIu64 "\n", l, ub->ub_txg); +			(void) fprintf(stderr, "It would appear the device was " +			    "not properly detached.\n"); +			return; +		} + +		err = zhack_repair_get_ashift(cfg, l, &ashift); +		if (err) +			return; +  		err = zhack_repair_undetach(ub, cfg, l);  		if (err)  			return; @@ -981,7 +1192,7 @@ main(int argc, char **argv)  	dprintf_setup(&argc, argv);  	zfs_prop_init(); -	while ((c = getopt(argc, argv, "+c:d:")) != -1) { +	while ((c = getopt(argc, argv, "+c:d:o:")) != -1) {  		switch (c) {  		case 'c':  			g_importargs.cachefile = optarg; @@ -990,6 +1201,10 @@ main(int argc, char **argv)  			assert(g_importargs.paths < MAX_NUM_PATHS);  			g_importargs.path[g_importargs.paths++] = optarg;  			break; +		case 'o': +			if (handle_tunable_option(optarg, B_FALSE) != 0) +				exit(1); +			break;  		default:  			usage();  			break; @@ -1011,6 +1226,8 @@ main(int argc, char **argv)  		rv = zhack_do_feature(argc, argv);  	} else if (strcmp(subcommand, "label") == 0) {  		return (zhack_do_label(argc, argv)); +	} else if (strcmp(subcommand, "metaslab") == 0) { +		rv = zhack_do_metaslab(argc, argv);  	} else {  		(void) fprintf(stderr, "error: unknown subcommand: %s\n",  		    subcommand); diff --git a/sys/contrib/openzfs/cmd/zilstat.in b/sys/contrib/openzfs/cmd/zilstat.in index 4140398bf4a3..d01db9b0914b 100755 --- a/sys/contrib/openzfs/cmd/zilstat.in +++ b/sys/contrib/openzfs/cmd/zilstat.in @@ -47,6 +47,7 @@ cols = {  	"cec":       [5,         1000,       "zil_commit_error_count"],  	"csc":       [5,         1000,       "zil_commit_stall_count"],  	"cSc":       [5,         1000,       "zil_commit_suspend_count"], +	"cCc":       [5,         1000,       "zil_commit_crash_count"],  	"ic":        [5,         1000,       "zil_itx_count"],  	"iic":       [5,         1000,       "zil_itx_indirect_count"],  	"iib":       [5,         1024,       "zil_itx_indirect_bytes"], diff --git a/sys/contrib/openzfs/cmd/zinject/zinject.c b/sys/contrib/openzfs/cmd/zinject/zinject.c index 113797c878b9..c2f646f2567d 100644 --- a/sys/contrib/openzfs/cmd/zinject/zinject.c +++ b/sys/contrib/openzfs/cmd/zinject/zinject.c @@ -107,6 +107,8 @@   * 	zinject   * 	zinject <-a | -u pool>   * 	zinject -c <id|all> + * 	zinject -E <delay> [-a] [-m] [-f freq] [-l level] [-r range] + *	    [-T iotype] [-t type object | -b bookmark pool]   * 	zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]   *	    [-r range] <object>   * 	zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool @@ -132,14 +134,18 @@   * The '-f' flag controls the frequency of errors injected, expressed as a   * real number percentage between 0.0001 and 100.  The default is 100.   * - * The this form is responsible for actually injecting the handler into the + * The <object> form is responsible for actually injecting the handler into the   * framework.  It takes the arguments described above, translates them to the   * internal tuple using libzpool, and then issues an ioctl() to register the   * handler.   * - * The final form can target a specific bookmark, regardless of whether a + * The '-b' option can target a specific bookmark, regardless of whether a   * human-readable interface has been designed.  It allows developers to specify   * a particular block by number. + * + * The '-E' option injects pipeline ready stage delays for the given object or + * bookmark. The delay is specified in milliseconds, and it supports I/O type + * and range filters.   */  #include <errno.h> @@ -346,6 +352,13 @@ usage(void)  	    "\t\tsuch that the operation takes a minimum of supplied seconds\n"  	    "\t\tto complete.\n"  	    "\n" +	    "\tzinject -E <delay> [-a] [-m] [-f freq] [-l level] [-r range]\n" +	    "\t\t[-T iotype] [-t type object | -b bookmark pool]\n" +	    "\n" +	    "\t\tInject pipeline ready stage delays for the given object path\n" +	    "\t\t(data or dnode) or raw bookmark. The delay is specified in\n" +	    "\t\tmilliseconds.\n" +	    "\n"  	    "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"  	    "\t\tCause the pool to stop writing blocks yet not\n"  	    "\t\treport errors for a duration.  Simulates buggy hardware\n" @@ -724,12 +737,15 @@ register_handler(const char *pool, int flags, zinject_record_t *record,  	if (quiet) {  		(void) printf("%llu\n", (u_longlong_t)zc.zc_guid);  	} else { +		boolean_t show_object = B_FALSE; +		boolean_t show_iotype = B_FALSE;  		(void) printf("Added handler %llu with the following "  		    "properties:\n", (u_longlong_t)zc.zc_guid);  		(void) printf("  pool: %s\n", pool);  		if (record->zi_guid) {  			(void) printf("  vdev: %llx\n",  			    (u_longlong_t)record->zi_guid); +			show_iotype = B_TRUE;  		} else if (record->zi_func[0] != '\0') {  			(void) printf("  panic function: %s\n",  			    record->zi_func); @@ -742,7 +758,18 @@ register_handler(const char *pool, int flags, zinject_record_t *record,  		} else if (record->zi_timer > 0) {  			(void) printf(" timer: %lld ms\n",  			    (u_longlong_t)NSEC2MSEC(record->zi_timer)); +			if (record->zi_cmd == ZINJECT_DELAY_READY) { +				show_object = B_TRUE; +				show_iotype = B_TRUE; +			}  		} else { +			show_object = B_TRUE; +		} +		if (show_iotype) { +			(void) printf("iotype: %s\n", +			    iotype_to_str(record->zi_iotype)); +		} +		if (show_object) {  			(void) printf("objset: %llu\n",  			    (u_longlong_t)record->zi_objset);  			(void) printf("object: %llu\n", @@ -910,6 +937,7 @@ main(int argc, char **argv)  	int ret;  	int flags = 0;  	uint32_t dvas = 0; +	hrtime_t ready_delay = -1;  	if ((g_zfs = libzfs_init()) == NULL) {  		(void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); @@ -940,7 +968,7 @@ main(int argc, char **argv)  	}  	while ((c = getopt(argc, argv, -	    ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) { +	    ":aA:b:C:d:D:E:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:P:")) != -1) {  		switch (c) {  		case 'a':  			flags |= ZINJECT_FLUSH_ARC; @@ -1113,6 +1141,18 @@ main(int argc, char **argv)  		case 'u':  			flags |= ZINJECT_UNLOAD_SPA;  			break; +		case 'E': +			ready_delay = MSEC2NSEC(strtol(optarg, &end, 10)); +			if (ready_delay <= 0 || *end != '\0') { +				(void) fprintf(stderr, "invalid delay '%s': " +				    "must be a positive duration\n", optarg); +				usage(); +				libzfs_fini(g_zfs); +				return (1); +			} +			record.zi_cmd = ZINJECT_DELAY_READY; +			record.zi_timer = ready_delay; +			break;  		case 'L':  			if ((label = name_to_type(optarg)) == TYPE_INVAL &&  			    !LABEL_TYPE(type)) { @@ -1150,7 +1190,7 @@ main(int argc, char **argv)  		 */  		if (raw != NULL || range != NULL || type != TYPE_INVAL ||  		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || -		    record.zi_freq > 0 || dvas != 0) { +		    record.zi_freq > 0 || dvas != 0 || ready_delay >= 0) {  			(void) fprintf(stderr, "cancel (-c) incompatible with "  			    "any other options\n");  			usage(); @@ -1186,7 +1226,7 @@ main(int argc, char **argv)  		 */  		if (raw != NULL || range != NULL || type != TYPE_INVAL ||  		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || -		    dvas != 0) { +		    dvas != 0 || ready_delay >= 0) {  			(void) fprintf(stderr, "device (-d) incompatible with "  			    "data error injection\n");  			usage(); @@ -1276,13 +1316,23 @@ main(int argc, char **argv)  			return (1);  		} -		record.zi_cmd = ZINJECT_DATA_FAULT; +		if (record.zi_cmd == ZINJECT_UNINITIALIZED) { +			record.zi_cmd = ZINJECT_DATA_FAULT; +			if (!error) +				error = EIO; +		} else if (error != 0) { +			(void) fprintf(stderr, "error type -e incompatible " +			    "with delay injection\n"); +			libzfs_fini(g_zfs); +			return (1); +		} else { +			record.zi_iotype = io_type; +		} +  		if (translate_raw(raw, &record) != 0) {  			libzfs_fini(g_zfs);  			return (1);  		} -		if (!error) -			error = EIO;  	} else if (record.zi_cmd == ZINJECT_PANIC) {  		if (raw != NULL || range != NULL || type != TYPE_INVAL ||  		    level != 0 || device != NULL || record.zi_freq > 0 || @@ -1410,6 +1460,13 @@ main(int argc, char **argv)  			record.zi_dvas = dvas;  		} +		if (record.zi_cmd != ZINJECT_UNINITIALIZED && error != 0) { +			(void) fprintf(stderr, "error type -e incompatible " +			    "with delay injection\n"); +			libzfs_fini(g_zfs); +			return (1); +		} +  		if (error == EACCES) {  			if (type != TYPE_DATA) {  				(void) fprintf(stderr, "decryption errors " @@ -1425,8 +1482,12 @@ main(int argc, char **argv)  			 * not found.  			 */  			error = ECKSUM; -		} else { +		} else if (record.zi_cmd == ZINJECT_UNINITIALIZED) {  			record.zi_cmd = ZINJECT_DATA_FAULT; +			if (!error) +				error = EIO; +		} else { +			record.zi_iotype = io_type;  		}  		if (translate_record(type, argv[0], range, level, &record, pool, @@ -1434,8 +1495,6 @@ main(int argc, char **argv)  			libzfs_fini(g_zfs);  			return (1);  		} -		if (!error) -			error = EIO;  	}  	/* diff --git a/sys/contrib/openzfs/cmd/zpool/Makefile.am b/sys/contrib/openzfs/cmd/zpool/Makefile.am index 2f962408e5a3..5bb6d8160b18 100644 --- a/sys/contrib/openzfs/cmd/zpool/Makefile.am +++ b/sys/contrib/openzfs/cmd/zpool/Makefile.am @@ -148,6 +148,7 @@ dist_zpoolcompat_DATA = \  	%D%/compatibility.d/openzfs-2.1-linux \  	%D%/compatibility.d/openzfs-2.2 \  	%D%/compatibility.d/openzfs-2.3 \ +	%D%/compatibility.d/openzfs-2.4 \  	%D%/compatibility.d/openzfsonosx-1.7.0 \  	%D%/compatibility.d/openzfsonosx-1.8.1 \  	%D%/compatibility.d/openzfsonosx-1.9.3 \ @@ -187,7 +188,9 @@ zpoolcompatlinks = \  	"openzfs-2.2		openzfs-2.2-linux" \  	"openzfs-2.2		openzfs-2.2-freebsd" \  	"openzfs-2.3		openzfs-2.3-linux" \ -	"openzfs-2.3		openzfs-2.3-freebsd" +	"openzfs-2.3		openzfs-2.3-freebsd" \ +	"openzfs-2.4		openzfs-2.4-linux" \ +	"openzfs-2.4		openzfs-2.4-freebsd"  zpoolconfdir = $(sysconfdir)/zfs/zpool.d  INSTALL_DATA_HOOKS += zpool-install-data-hook diff --git a/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.4 b/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.4 new file mode 100644 index 000000000000..3fbd91014c95 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.4 @@ -0,0 +1,48 @@ +# Features supported by OpenZFS 2.4 on Linux and FreeBSD +allocation_classes +async_destroy +blake3 +block_cloning +block_cloning_endian +bookmark_v2 +bookmark_written +bookmarks +device_rebuild +device_removal +draid +dynamic_gang_header +edonr +embedded_data +empty_bpobj +enabled_txg +encryption +extensible_dataset +fast_dedup +filesystem_limits +head_errlog +hole_birth +large_blocks +large_dnode +large_microzap +livelist +log_spacemap +longname +lz4_compress +multi_vdev_crash_dump +obsolete_counts +physical_rewrite +project_quota +raidz_expansion +redacted_datasets +redaction_bookmarks +redaction_list_spill +resilver_defer +sha512 +skein +spacemap_histogram +spacemap_v2 +userobj_accounting +vdev_zaps_v2 +zilsaxattr +zpool_checkpoint +zstd_compress diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c index 2ec189b98653..fef602736705 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c @@ -26,6 +26,7 @@  /*   * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. + * Copyright (c) 2025, Klara, Inc.   */  #include <libintl.h> @@ -52,7 +53,7 @@  typedef struct zpool_node {  	zpool_handle_t	*zn_handle;  	uu_avl_node_t	zn_avlnode; -	int		zn_mark; +	hrtime_t	zn_last_refresh;  } zpool_node_t;  struct zpool_list { @@ -62,6 +63,7 @@ struct zpool_list {  	uu_avl_pool_t	*zl_pool;  	zprop_list_t	**zl_proplist;  	zfs_type_t	zl_type; +	hrtime_t	zl_last_refresh;  };  static int @@ -81,26 +83,30 @@ zpool_compare(const void *larg, const void *rarg, void *unused)   * of known pools.   */  static int -add_pool(zpool_handle_t *zhp, void *data) +add_pool(zpool_handle_t *zhp, zpool_list_t *zlp)  { -	zpool_list_t *zlp = data; -	zpool_node_t *node = safe_malloc(sizeof (zpool_node_t)); +	zpool_node_t *node, *new = safe_malloc(sizeof (zpool_node_t));  	uu_avl_index_t idx; -	node->zn_handle = zhp; -	uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool); -	if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) { +	new->zn_handle = zhp; +	uu_avl_node_init(new, &new->zn_avlnode, zlp->zl_pool); + +	node = uu_avl_find(zlp->zl_avl, new, NULL, &idx); +	if (node == NULL) {  		if (zlp->zl_proplist &&  		    zpool_expand_proplist(zhp, zlp->zl_proplist,  		    zlp->zl_type, zlp->zl_literal) != 0) {  			zpool_close(zhp); -			free(node); +			free(new);  			return (-1);  		} -		uu_avl_insert(zlp->zl_avl, node, idx); +		new->zn_last_refresh = zlp->zl_last_refresh; +		uu_avl_insert(zlp->zl_avl, new, idx);  	} else { +		zpool_refresh_stats_from_handle(node->zn_handle, zhp); +		node->zn_last_refresh = zlp->zl_last_refresh;  		zpool_close(zhp); -		free(node); +		free(new);  		return (-1);  	} @@ -108,6 +114,18 @@ add_pool(zpool_handle_t *zhp, void *data)  }  /* + * add_pool(), but always returns 0. This allows zpool_iter() to continue + * even if a pool exists in the tree, or we fail to get the properties for + * a new one. + */ +static int +add_pool_cb(zpool_handle_t *zhp, void *data) +{ +	(void) add_pool(zhp, data); +	return (0); +} + +/*   * Create a list of pools based on the given arguments.  If we're given no   * arguments, then iterate over all pools in the system and add them to the AVL   * tree.  Otherwise, add only those pool explicitly specified on the command @@ -135,9 +153,10 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type,  	zlp->zl_type = type;  	zlp->zl_literal = literal; +	zlp->zl_last_refresh = gethrtime();  	if (argc == 0) { -		(void) zpool_iter(g_zfs, add_pool, zlp); +		(void) zpool_iter(g_zfs, add_pool_cb, zlp);  		zlp->zl_findall = B_TRUE;  	} else {  		int i; @@ -159,15 +178,61 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, zfs_type_t type,  }  /* - * Search for any new pools, adding them to the list.  We only add pools when no - * options were given on the command line.  Otherwise, we keep the list fixed as - * those that were explicitly specified. + * Refresh the state of all pools on the list. Additionally, if no options were + * given on the command line, add any new pools and remove any that are no + * longer available.   */ -void -pool_list_update(zpool_list_t *zlp) +int +pool_list_refresh(zpool_list_t *zlp)  { -	if (zlp->zl_findall) -		(void) zpool_iter(g_zfs, add_pool, zlp); +	zlp->zl_last_refresh = gethrtime(); + +	if (!zlp->zl_findall) { +		/* +		 * This list is a fixed list of pools, so we must not add +		 * or remove any. Just walk over them and refresh their +		 * state. +		 */ +		int navail = 0; +		for (zpool_node_t *node = uu_avl_first(zlp->zl_avl); +		    node != NULL; node = uu_avl_next(zlp->zl_avl, node)) { +			boolean_t missing; +			zpool_refresh_stats(node->zn_handle, &missing); +			navail += !missing; +			node->zn_last_refresh = zlp->zl_last_refresh; +		} +		return (navail); +	} + +	/* Search for any new pools and add them to the list. */ +	(void) zpool_iter(g_zfs, add_pool_cb, zlp); + +	/* Walk the list of existing pools, and update or remove them. */ +	zpool_node_t *node, *next; +	for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next) { +		next = uu_avl_next(zlp->zl_avl, node); + +		/* +		 * Skip any that were refreshed and are online; they were added +		 * by zpool_iter() and are already up to date. +		 */ +		if (node->zn_last_refresh == zlp->zl_last_refresh && +		    zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL) +			continue; + +		/* Refresh and remove if necessary. */ +		boolean_t missing; +		zpool_refresh_stats(node->zn_handle, &missing); +		if (missing) { +			uu_avl_remove(zlp->zl_avl, node); +			zpool_close(node->zn_handle); +			free(node); +		} else { +			node->zn_last_refresh = zlp->zl_last_refresh; +		} +	} + +	return (uu_avl_numnodes(zlp->zl_avl));  }  /* @@ -191,23 +256,6 @@ pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func,  }  /* - * Remove the given pool from the list.  When running iostat, we want to remove - * those pools that no longer exist. - */ -void -pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp) -{ -	zpool_node_t search, *node; - -	search.zn_handle = zhp; -	if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) { -		uu_avl_remove(zlp->zl_avl, node); -		zpool_close(node->zn_handle); -		free(node); -	} -} - -/*   * Free all the handles associated with this list.   */  void @@ -379,8 +427,8 @@ process_unique_cmd_columns(vdev_cmd_data_list_t *vcdl)  static int  vdev_process_cmd_output(vdev_cmd_data_t *data, char *line)  { -	char *col = NULL; -	char *val = line; +	char *col; +	char *val;  	char *equals;  	char **tmp; @@ -397,6 +445,7 @@ vdev_process_cmd_output(vdev_cmd_data_t *data, char *line)  		col = line;  		val = equals + 1;  	} else { +		col = NULL;  		val = line;  	} diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index e62441894cd7..a6658a9c2800 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -33,8 +33,8 @@   * Copyright (c) 2017, Intel Corporation.   * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>   * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> - * Copyright (c) 2021, 2023, Klara Inc. - * Copyright [2021] Hewlett Packard Enterprise Development LP + * Copyright (c) 2021, 2023, 2025, Klara, Inc. + * Copyright (c) 2021, 2025 Hewlett Packard Enterprise Development LP.   */  #include <assert.h> @@ -456,7 +456,7 @@ get_usage(zpool_help_t idx)  		    "<pool> <vdev> ...\n"));  	case HELP_ATTACH:  		return (gettext("\tattach [-fsw] [-o property=value] " -		    "<pool> <device> <new-device>\n")); +		    "<pool> <vdev> <new-device>\n"));  	case HELP_CLEAR:  		return (gettext("\tclear [[--power]|[-nF]] <pool> [device]\n"));  	case HELP_CREATE: @@ -510,16 +510,16 @@ get_usage(zpool_help_t idx)  	case HELP_REOPEN:  		return (gettext("\treopen [-n] <pool>\n"));  	case HELP_INITIALIZE: -		return (gettext("\tinitialize [-c | -s | -u] [-w] <pool> " -		    "[<device> ...]\n")); +		return (gettext("\tinitialize [-c | -s | -u] [-w] <-a | <pool> " +		    "[<device> ...]>\n"));  	case HELP_SCRUB: -		return (gettext("\tscrub [-e | -s | -p | -C] [-w] " -		    "<pool> ...\n")); +		return (gettext("\tscrub [-e | -s | -p | -C | -E | -S] [-w] " +		    "<-a | <pool> [<pool> ...]>\n"));  	case HELP_RESILVER:  		return (gettext("\tresilver <pool> ...\n"));  	case HELP_TRIM: -		return (gettext("\ttrim [-dw] [-r <rate>] [-c | -s] <pool> " -		    "[<device> ...]\n")); +		return (gettext("\ttrim [-dw] [-r <rate>] [-c | -s] " +		    "<-a | <pool> [<device> ...]>\n"));  	case HELP_STATUS:  		return (gettext("\tstatus [-DdegiLPpstvx] "  		    "[-c script1[,script2,...]] ...\n" @@ -560,33 +560,6 @@ get_usage(zpool_help_t idx)  	}  } -static void -zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) -{ -	uint_t children = 0; -	nvlist_t **child; -	uint_t i; - -	(void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, -	    &child, &children); - -	if (children == 0) { -		char *path = zpool_vdev_name(g_zfs, zhp, nvroot, -		    VDEV_NAME_PATH); - -		if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 && -		    strcmp(path, VDEV_TYPE_HOLE) != 0) -			fnvlist_add_boolean(res, path); - -		free(path); -		return; -	} - -	for (i = 0; i < children; i++) { -		zpool_collect_leaves(zhp, child[i], res); -	} -} -  /*   * Callback routine that will print out a pool property value.   */ @@ -779,10 +752,11 @@ usage(boolean_t requested)  }  /* - * zpool initialize [-c | -s | -u] [-w] <pool> [<vdev> ...] + * zpool initialize [-c | -s | -u] [-w] <-a | pool> [<vdev> ...]   * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool   * if none specified.   * + *	-a	Use all pools.   *	-c	Cancel. Ends active initializing.   *	-s	Suspend. Initializing can then be restarted with no flags.   *	-u	Uninitialize. Clears initialization state. @@ -794,22 +768,26 @@ zpool_do_initialize(int argc, char **argv)  	int c;  	char *poolname;  	zpool_handle_t *zhp; -	nvlist_t *vdevs;  	int err = 0;  	boolean_t wait = B_FALSE; +	boolean_t initialize_all = B_FALSE;  	struct option long_options[] = {  		{"cancel",	no_argument,		NULL, 'c'},  		{"suspend",	no_argument,		NULL, 's'},  		{"uninit",	no_argument,		NULL, 'u'},  		{"wait",	no_argument,		NULL, 'w'}, +		{"all",		no_argument,		NULL, 'a'},  		{0, 0, 0, 0}  	};  	pool_initialize_func_t cmd_type = POOL_INITIALIZE_START; -	while ((c = getopt_long(argc, argv, "csuw", long_options, +	while ((c = getopt_long(argc, argv, "acsuw", long_options,  	    NULL)) != -1) {  		switch (c) { +		case 'a': +			initialize_all = B_TRUE; +			break;  		case 'c':  			if (cmd_type != POOL_INITIALIZE_START &&  			    cmd_type != POOL_INITIALIZE_CANCEL) { @@ -856,7 +834,18 @@ zpool_do_initialize(int argc, char **argv)  	argc -= optind;  	argv += optind; -	if (argc < 1) { +	initialize_cbdata_t cbdata = { +		.wait = wait, +		.cmd_type = cmd_type +	}; + +	if (initialize_all && argc > 0) { +		(void) fprintf(stderr, gettext("-a cannot be combined with " +		    "individual pools or vdevs\n")); +		usage(B_FALSE); +	} + +	if (argc < 1 && !initialize_all) {  		(void) fprintf(stderr, gettext("missing pool name argument\n"));  		usage(B_FALSE);  		return (-1); @@ -868,30 +857,35 @@ zpool_do_initialize(int argc, char **argv)  		usage(B_FALSE);  	} -	poolname = argv[0]; -	zhp = zpool_open(g_zfs, poolname); -	if (zhp == NULL) -		return (-1); - -	vdevs = fnvlist_alloc(); -	if (argc == 1) { -		/* no individual leaf vdevs specified, so add them all */ -		nvlist_t *config = zpool_get_config(zhp, NULL); -		nvlist_t *nvroot = fnvlist_lookup_nvlist(config, -		    ZPOOL_CONFIG_VDEV_TREE); -		zpool_collect_leaves(zhp, nvroot, vdevs); +	if (argc == 0 && initialize_all) { +		/* Initilize each pool recursively */ +		err = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, +		    B_FALSE, zpool_initialize_one, &cbdata); +		return (err); +	} else if (argc == 1) { +		/* no individual leaf vdevs specified, initialize the pool */ +		poolname = argv[0]; +		zhp = zpool_open(g_zfs, poolname); +		if (zhp == NULL) +			return (-1); +		err = zpool_initialize_one(zhp, &cbdata);  	} else { +		/* individual leaf vdevs specified, initialize them */ +		poolname = argv[0]; +		zhp = zpool_open(g_zfs, poolname); +		if (zhp == NULL) +			return (-1); +		nvlist_t *vdevs = fnvlist_alloc();  		for (int i = 1; i < argc; i++) {  			fnvlist_add_boolean(vdevs, argv[i]);  		} +		if (wait) +			err = zpool_initialize_wait(zhp, cmd_type, vdevs); +		else +			err = zpool_initialize(zhp, cmd_type, vdevs); +		fnvlist_free(vdevs);  	} -	if (wait) -		err = zpool_initialize_wait(zhp, cmd_type, vdevs); -	else -		err = zpool_initialize(zhp, cmd_type, vdevs); - -	fnvlist_free(vdevs);  	zpool_close(zhp);  	return (err); @@ -1788,7 +1782,7 @@ zpool_do_labelclear(int argc, char **argv)  {  	char vdev[MAXPATHLEN];  	char *name = NULL; -	int c, fd = -1, ret = 0; +	int c, fd, ret = 0;  	nvlist_t *config;  	pool_state_t state;  	boolean_t inuse = B_FALSE; @@ -5767,24 +5761,6 @@ children:  	return (ret);  } -static int -refresh_iostat(zpool_handle_t *zhp, void *data) -{ -	iostat_cbdata_t *cb = data; -	boolean_t missing; - -	/* -	 * If the pool has disappeared, remove it from the list and continue. -	 */ -	if (zpool_refresh_stats(zhp, &missing) != 0) -		return (-1); - -	if (missing) -		pool_list_remove(cb->cb_list, zhp); - -	return (0); -} -  /*   * Callback to print out the iostats for the given pool.   */ @@ -6157,7 +6133,6 @@ static void  get_interval_count_filter_guids(int *argc, char **argv, float *interval,      unsigned long *count, iostat_cbdata_t *cb)  { -	char **tmpargv = argv;  	int argc_for_interval = 0;  	/* Is the last arg an interval value?  Or a guid? */ @@ -6181,7 +6156,7 @@ get_interval_count_filter_guids(int *argc, char **argv, float *interval,  	}  	/* Point to our list of possible intervals */ -	tmpargv = &argv[*argc - argc_for_interval]; +	char **tmpargv = &argv[*argc - argc_for_interval];  	*argc = *argc - argc_for_interval;  	get_interval_count(&argc_for_interval, tmpargv, @@ -6366,18 +6341,16 @@ get_namewidth_iostat(zpool_handle_t *zhp, void *data)   * This command can be tricky because we want to be able to deal with pool   * creation/destruction as well as vdev configuration changes.  The bulk of this   * processing is handled by the pool_list_* routines in zpool_iter.c.  We rely - * on pool_list_update() to detect the addition of new pools.  Configuration - * changes are all handled within libzfs. + * on pool_list_refresh() to detect the addition and removal of pools. + * Configuration changes are all handled within libzfs.   */  int  zpool_do_iostat(int argc, char **argv)  {  	int c;  	int ret; -	int npools;  	float interval = 0;  	unsigned long count = 0; -	int winheight = 24;  	zpool_list_t *list;  	boolean_t verbose = B_FALSE;  	boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; @@ -6626,10 +6599,24 @@ zpool_do_iostat(int argc, char **argv)  		return (1);  	} +	int last_npools = 0;  	for (;;) { -		if ((npools = pool_list_count(list)) == 0) +		/* +		 * Refresh all pools in list, adding or removing pools as +		 * necessary. +		 */ +		int npools = pool_list_refresh(list); +		if (npools == 0) {  			(void) fprintf(stderr, gettext("no pools available\n")); -		else { +		} else { +			/* +			 * If the list of pools has changed since last time +			 * around, reset the iteration count to force the +			 * header to be redisplayed. +			 */ +			if (last_npools != npools) +				cb.cb_iteration = 0; +  			/*  			 * If this is the first iteration and -y was supplied  			 * we skip any printing. @@ -6638,15 +6625,6 @@ zpool_do_iostat(int argc, char **argv)  			    cb.cb_iteration == 0);  			/* -			 * Refresh all statistics.  This is done as an -			 * explicit step before calculating the maximum name -			 * width, so that any * configuration changes are -			 * properly accounted for. -			 */ -			(void) pool_list_iter(list, B_FALSE, refresh_iostat, -			    &cb); - -			/*  			 * Iterate over all pools to determine the maximum width  			 * for the pool / device name column across all pools.  			 */ @@ -6673,7 +6651,7 @@ zpool_do_iostat(int argc, char **argv)  			 * even when terminal window has its height  			 * changed.  			 */ -			winheight = terminal_height(); +			int winheight = terminal_height();  			/*  			 * Are we connected to TTY? If not, headers_once  			 * should be true, to avoid breaking scripts. @@ -6699,6 +6677,7 @@ zpool_do_iostat(int argc, char **argv)  			if (skip) {  				(void) fflush(stdout);  				(void) fsleep(interval); +				last_npools = npools;  				continue;  			} @@ -6736,6 +6715,8 @@ zpool_do_iostat(int argc, char **argv)  		(void) fflush(stdout);  		(void) fsleep(interval); + +		last_npools = npools;  	}  	pool_list_free(list); @@ -6994,7 +6975,6 @@ collect_vdev_prop(zpool_prop_t prop, uint64_t value, const char *str,  /*   * print static default line per vdev - * not compatible with '-o' <proplist> option   */  static void  collect_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, @@ -7050,48 +7030,98 @@ collect_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,  		 * 'toplevel' boolean value is passed to the print_one_column()  		 * to indicate that the value is valid.  		 */ -		if (VDEV_STAT_VALID(vs_pspace, c) && vs->vs_pspace) { -			collect_vdev_prop(ZPOOL_PROP_SIZE, vs->vs_pspace, NULL, -			    scripted, B_TRUE, format, cb->cb_json, props, -			    cb->cb_json_as_int); -		} else { -			collect_vdev_prop(ZPOOL_PROP_SIZE, vs->vs_space, NULL, -			    scripted, toplevel, format, cb->cb_json, props, -			    cb->cb_json_as_int); +		for (zprop_list_t *pl = cb->cb_proplist; pl != NULL; +		    pl = pl->pl_next) { +			switch (pl->pl_prop) { +			case ZPOOL_PROP_SIZE: +				if (VDEV_STAT_VALID(vs_pspace, c) && +				    vs->vs_pspace) { +					collect_vdev_prop( +					    ZPOOL_PROP_SIZE, vs->vs_pspace, +					    NULL, scripted, B_TRUE, format, +					    cb->cb_json, props, +					    cb->cb_json_as_int); +				} else { +					collect_vdev_prop( +					    ZPOOL_PROP_SIZE, vs->vs_space, NULL, +					    scripted, toplevel, format, +					    cb->cb_json, props, +					    cb->cb_json_as_int); +				} +				break; +			case ZPOOL_PROP_ALLOCATED: +				collect_vdev_prop(ZPOOL_PROP_ALLOCATED, +				    vs->vs_alloc, NULL, scripted, toplevel, +				    format, cb->cb_json, props, +				    cb->cb_json_as_int); +				break; + +			case ZPOOL_PROP_FREE: +				collect_vdev_prop(ZPOOL_PROP_FREE, +				    vs->vs_space - vs->vs_alloc, NULL, scripted, +				    toplevel, format, cb->cb_json, props, +				    cb->cb_json_as_int); +				break; + +			case ZPOOL_PROP_CHECKPOINT: +				collect_vdev_prop(ZPOOL_PROP_CHECKPOINT, +				    vs->vs_checkpoint_space, NULL, scripted, +				    toplevel, format, cb->cb_json, props, +				    cb->cb_json_as_int); +				break; + +			case ZPOOL_PROP_EXPANDSZ: +				collect_vdev_prop(ZPOOL_PROP_EXPANDSZ, +				    vs->vs_esize, NULL, scripted, B_TRUE, +				    format, cb->cb_json, props, +				    cb->cb_json_as_int); +				break; + +			case ZPOOL_PROP_FRAGMENTATION: +				collect_vdev_prop( +				    ZPOOL_PROP_FRAGMENTATION, +				    vs->vs_fragmentation, NULL, scripted, +				    (vs->vs_fragmentation != ZFS_FRAG_INVALID && +				    toplevel), +				    format, cb->cb_json, props, +				    cb->cb_json_as_int); +				break; + +			case ZPOOL_PROP_CAPACITY: +				cap = (vs->vs_space == 0) ? +				    0 : (vs->vs_alloc * 10000 / vs->vs_space); +				collect_vdev_prop(ZPOOL_PROP_CAPACITY, cap, +				    NULL, scripted, toplevel, format, +				    cb->cb_json, props, cb->cb_json_as_int); +				break; + +			case ZPOOL_PROP_HEALTH: +				state = zpool_state_to_name(vs->vs_state, +				    vs->vs_aux); +				if (isspare) { +					if (vs->vs_aux == VDEV_AUX_SPARED) +						state = "INUSE"; +					else if (vs->vs_state == +					    VDEV_STATE_HEALTHY) +						state = "AVAIL"; +				} +				collect_vdev_prop(ZPOOL_PROP_HEALTH, 0, state, +				    scripted, B_TRUE, format, cb->cb_json, +				    props, cb->cb_json_as_int); +				break; + +			case ZPOOL_PROP_NAME: +				break; + +			default: +				collect_vdev_prop(pl->pl_prop, 0, +				    NULL, scripted, B_FALSE, format, +				    cb->cb_json, props, cb->cb_json_as_int); + +			} + +  		} -		collect_vdev_prop(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL, -		    scripted, toplevel, format, cb->cb_json, props, -		    cb->cb_json_as_int); -		collect_vdev_prop(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, -		    NULL, scripted, toplevel, format, cb->cb_json, props, -		    cb->cb_json_as_int); -		collect_vdev_prop(ZPOOL_PROP_CHECKPOINT, -		    vs->vs_checkpoint_space, NULL, scripted, toplevel, format, -		    cb->cb_json, props, cb->cb_json_as_int); -		collect_vdev_prop(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, NULL, -		    scripted, B_TRUE, format, cb->cb_json, props, -		    cb->cb_json_as_int); -		collect_vdev_prop(ZPOOL_PROP_FRAGMENTATION, -		    vs->vs_fragmentation, NULL, scripted, -		    (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel), -		    format, cb->cb_json, props, cb->cb_json_as_int); -		cap = (vs->vs_space == 0) ? 0 : -		    (vs->vs_alloc * 10000 / vs->vs_space); -		collect_vdev_prop(ZPOOL_PROP_CAPACITY, cap, NULL, -		    scripted, toplevel, format, cb->cb_json, props, -		    cb->cb_json_as_int); -		collect_vdev_prop(ZPOOL_PROP_DEDUPRATIO, 0, NULL, -		    scripted, toplevel, format, cb->cb_json, props, -		    cb->cb_json_as_int); -		state = zpool_state_to_name(vs->vs_state, vs->vs_aux); -		if (isspare) { -			if (vs->vs_aux == VDEV_AUX_SPARED) -				state = "INUSE"; -			else if (vs->vs_state == VDEV_STATE_HEALTHY) -				state = "AVAIL"; -		} -		collect_vdev_prop(ZPOOL_PROP_HEALTH, 0, state, scripted, -		    B_TRUE, format, cb->cb_json, props, cb->cb_json_as_int);  		if (cb->cb_json) {  			fnvlist_add_nvlist(ent, "properties", props); @@ -7652,7 +7682,7 @@ zpool_do_replace(int argc, char **argv)  }  /* - * zpool attach [-fsw] [-o property=value] <pool> <device>|<vdev> <new_device> + * zpool attach [-fsw] [-o property=value] <pool> <vdev> <new_device>   *   *	-f	Force attach, even if <new_device> appears to be in use.   *	-s	Use sequential instead of healing reconstruction for resilver. @@ -7660,9 +7690,9 @@ zpool_do_replace(int argc, char **argv)   *	-w	Wait for resilvering (mirror) or expansion (raidz) to complete   *		before returning.   * - * Attach <new_device> to a <device> or <vdev>, where the vdev can be of type - * mirror or raidz. If <device> is not part of a mirror, then <device> will - * be transformed into a mirror of <device> and <new_device>. When a mirror + * Attach <new_device> to a <vdev>, where the vdev can be of type + * device, mirror or raidz. If <vdev> is not part of a mirror, then <vdev> will + * be transformed into a mirror of <vdev> and <new_device>. When a mirror   * is involved, <new_device> will begin life with a DTL of [0, now], and will   * immediately begin to resilver itself. For the raidz case, a expansion will   * commence and reflow the raidz data across all the disks including the @@ -8368,6 +8398,8 @@ zpool_do_reopen(int argc, char **argv)  typedef struct scrub_cbdata {  	int	cb_type;  	pool_scrub_cmd_t cb_scrub_cmd; +	time_t	cb_date_start; +	time_t	cb_date_end;  } scrub_cbdata_t;  static boolean_t @@ -8411,8 +8443,8 @@ scrub_callback(zpool_handle_t *zhp, void *data)  		return (1);  	} -	err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd); - +	err = zpool_scan_range(zhp, cb->cb_type, cb->cb_scrub_cmd, +	    cb->cb_date_start, cb->cb_date_end);  	if (err == 0 && zpool_has_checkpoint(zhp) &&  	    cb->cb_type == POOL_SCAN_SCRUB) {  		(void) printf(gettext("warning: will not scrub state that " @@ -8430,10 +8462,35 @@ wait_callback(zpool_handle_t *zhp, void *data)  	return (zpool_wait(zhp, *act));  } +static time_t +date_string_to_sec(const char *timestr, boolean_t rounding) +{ +	struct tm tm = {0}; +	int adjustment = rounding ? 1 : 0; + +	/* Allow mktime to determine timezone. */ +	tm.tm_isdst = -1; + +	if (strptime(timestr, "%Y-%m-%d %H:%M", &tm) == NULL) { +		if (strptime(timestr, "%Y-%m-%d", &tm) == NULL) { +			fprintf(stderr, gettext("Failed to parse the date.\n")); +			usage(B_FALSE); +		} +		adjustment *= 24 * 60 * 60; +	} else { +		adjustment *= 60; +	} + +	return (mktime(&tm) + adjustment); +} +  /* - * zpool scrub [-e | -s | -p | -C] [-w] <pool> ... + * zpool scrub [-e | -s | -p | -C | -E | -S] [-w] [-a | <pool> ...]   * + *	-a	Scrub all pools.   *	-e	Only scrub blocks in the error log. + *	-E	End date of scrub. + *	-S	Start date of scrub.   *	-s	Stop.  Stops any in-progress scrub.   *	-p	Pause. Pause in-progress scrub.   *	-w	Wait.  Blocks until scrub has completed. @@ -8449,21 +8506,36 @@ zpool_do_scrub(int argc, char **argv)  	cb.cb_type = POOL_SCAN_SCRUB;  	cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; +	cb.cb_date_start = cb.cb_date_end = 0;  	boolean_t is_error_scrub = B_FALSE;  	boolean_t is_pause = B_FALSE;  	boolean_t is_stop = B_FALSE;  	boolean_t is_txg_continue = B_FALSE; +	boolean_t scrub_all = B_FALSE;  	/* check options */ -	while ((c = getopt(argc, argv, "spweC")) != -1) { +	while ((c = getopt(argc, argv, "aspweCE:S:")) != -1) {  		switch (c) { +		case 'a': +			scrub_all = B_TRUE; +			break;  		case 'e':  			is_error_scrub = B_TRUE;  			break; +		case 'E': +			/* +			 * Round the date. It's better to scrub more data than +			 * less. This also makes the date inclusive. +			 */ +			cb.cb_date_end = date_string_to_sec(optarg, B_TRUE); +			break;  		case 's':  			is_stop = B_TRUE;  			break; +		case 'S': +			cb.cb_date_start = date_string_to_sec(optarg, B_FALSE); +			break;  		case 'p':  			is_pause = B_TRUE;  			break; @@ -8511,6 +8583,19 @@ zpool_do_scrub(int argc, char **argv)  		}  	} +	if ((cb.cb_date_start != 0 || cb.cb_date_end != 0) && +	    cb.cb_scrub_cmd != POOL_SCRUB_NORMAL) { +		(void) fprintf(stderr, gettext("invalid option combination: " +		    "start/end date is available only with normal scrub\n")); +		usage(B_FALSE); +	} +	if (cb.cb_date_start != 0 && cb.cb_date_end != 0 && +	    cb.cb_date_start > cb.cb_date_end) { +		(void) fprintf(stderr, gettext("invalid arguments: " +		    "end date has to be later than start date\n")); +		usage(B_FALSE); +	} +  	if (wait && (cb.cb_type == POOL_SCAN_NONE ||  	    cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) {  		(void) fprintf(stderr, gettext("invalid option combination: " @@ -8521,7 +8606,7 @@ zpool_do_scrub(int argc, char **argv)  	argc -= optind;  	argv += optind; -	if (argc < 1) { +	if (argc < 1 && !scrub_all) {  		(void) fprintf(stderr, gettext("missing pool name argument\n"));  		usage(B_FALSE);  	} @@ -8551,6 +8636,7 @@ zpool_do_resilver(int argc, char **argv)  	cb.cb_type = POOL_SCAN_RESILVER;  	cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; +	cb.cb_date_start = cb.cb_date_end = 0;  	/* check options */  	while ((c = getopt(argc, argv, "")) != -1) { @@ -8575,8 +8661,9 @@ zpool_do_resilver(int argc, char **argv)  }  /* - * zpool trim [-d] [-r <rate>] [-c | -s] <pool> [<device> ...] + * zpool trim [-d] [-r <rate>] [-c | -s] <-a | pool> [<device> ...]   * + *	-a		Trim all pools.   *	-c		Cancel. Ends any in-progress trim.   *	-d		Secure trim.  Requires kernel and device support.   *	-r <rate>	Sets the TRIM rate in bytes (per second). Supports @@ -8593,6 +8680,7 @@ zpool_do_trim(int argc, char **argv)  		{"rate",	required_argument,	NULL,	'r'},  		{"suspend",	no_argument,		NULL,	's'},  		{"wait",	no_argument,		NULL,	'w'}, +		{"all",		no_argument,		NULL,	'a'},  		{0, 0, 0, 0}  	}; @@ -8600,11 +8688,16 @@ zpool_do_trim(int argc, char **argv)  	uint64_t rate = 0;  	boolean_t secure = B_FALSE;  	boolean_t wait = B_FALSE; +	boolean_t trimall = B_FALSE; +	int error;  	int c; -	while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL)) +	while ((c = getopt_long(argc, argv, "acdr:sw", long_options, NULL))  	    != -1) {  		switch (c) { +		case 'a': +			trimall = B_TRUE; +			break;  		case 'c':  			if (cmd_type != POOL_TRIM_START &&  			    cmd_type != POOL_TRIM_CANCEL) { @@ -8663,7 +8756,18 @@ zpool_do_trim(int argc, char **argv)  	argc -= optind;  	argv += optind; -	if (argc < 1) { +	trimflags_t trim_flags = { +		.secure = secure, +		.rate = rate, +		.wait = wait, +	}; + +	trim_cbdata_t cbdata = { +		.trim_flags = trim_flags, +		.cmd_type = cmd_type +	}; + +	if (argc < 1 && !trimall) {  		(void) fprintf(stderr, gettext("missing pool name argument\n"));  		usage(B_FALSE);  		return (-1); @@ -8671,41 +8775,46 @@ zpool_do_trim(int argc, char **argv)  	if (wait && (cmd_type != POOL_TRIM_START)) {  		(void) fprintf(stderr, gettext("-w cannot be used with -c or " -		    "-s\n")); +		    "-s options\n"));  		usage(B_FALSE);  	} -	char *poolname = argv[0]; -	zpool_handle_t *zhp = zpool_open(g_zfs, poolname); -	if (zhp == NULL) -		return (-1); - -	trimflags_t trim_flags = { -		.secure = secure, -		.rate = rate, -		.wait = wait, -	}; +	if (trimall && argc > 0) { +		(void) fprintf(stderr, gettext("-a cannot be combined with " +		    "individual zpools or vdevs\n")); +		usage(B_FALSE); +	} -	nvlist_t *vdevs = fnvlist_alloc(); -	if (argc == 1) { +	if (argc == 0 && trimall) { +		cbdata.trim_flags.fullpool = B_TRUE; +		/* Trim each pool recursively */ +		error = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, +		    B_FALSE, zpool_trim_one, &cbdata); +	} else if (argc == 1) { +		char *poolname = argv[0]; +		zpool_handle_t *zhp = zpool_open(g_zfs, poolname); +		if (zhp == NULL) +			return (-1);  		/* no individual leaf vdevs specified, so add them all */ -		nvlist_t *config = zpool_get_config(zhp, NULL); -		nvlist_t *nvroot = fnvlist_lookup_nvlist(config, -		    ZPOOL_CONFIG_VDEV_TREE); -		zpool_collect_leaves(zhp, nvroot, vdevs); -		trim_flags.fullpool = B_TRUE; +		error = zpool_trim_one(zhp, &cbdata); +		zpool_close(zhp);  	} else { -		trim_flags.fullpool = B_FALSE; +		char *poolname = argv[0]; +		zpool_handle_t *zhp = zpool_open(g_zfs, poolname); +		if (zhp == NULL) +			return (-1); +		/* leaf vdevs specified, trim only those */ +		cbdata.trim_flags.fullpool = B_FALSE; +		nvlist_t *vdevs = fnvlist_alloc();  		for (int i = 1; i < argc; i++) {  			fnvlist_add_boolean(vdevs, argv[i]);  		} +		error = zpool_trim(zhp, cbdata.cmd_type, vdevs, +		    &cbdata.trim_flags); +		fnvlist_free(vdevs); +		zpool_close(zhp);  	} -	int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags); - -	fnvlist_free(vdevs); -	zpool_close(zhp); -  	return (error);  } @@ -10706,7 +10815,6 @@ status_callback_json(zpool_handle_t *zhp, void *data)  	uint_t c;  	vdev_stat_t *vs;  	nvlist_t *item, *d, *load_info, *vds; -	item = d = NULL;  	/* If dedup stats were requested, also fetch dedupcached. */  	if (cbp->cb_dedup_stats > 1) @@ -11330,7 +11438,8 @@ upgrade_enable_all(zpool_handle_t *zhp, int *countp)  		const char *fname = spa_feature_table[i].fi_uname;  		const char *fguid = spa_feature_table[i].fi_guid; -		if (!spa_feature_table[i].fi_zfs_mod_supported) +		if (!spa_feature_table[i].fi_zfs_mod_supported || +		    (spa_feature_table[i].fi_flags & ZFEATURE_FLAG_NO_UPGRADE))  			continue;  		if (!nvlist_exists(enabled, fguid) && requested_features[i]) { @@ -11485,7 +11594,11 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)  					    "Note that the pool "  					    "'compatibility' feature can be "  					    "used to inhibit\nfeature " -					    "upgrades.\n\n")); +					    "upgrades.\n\n" +					    "Features marked with (*) are not " +					    "applied automatically on upgrade, " +					    "and\nmust be applied explicitly " +					    "with zpool-set(7).\n\n"));  					(void) printf(gettext("POOL  "  					    "FEATURE\n"));  					(void) printf(gettext("------" @@ -11499,7 +11612,9 @@ upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)  					poolfirst = B_FALSE;  				} -				(void) printf(gettext("      %s\n"), fname); +				(void) printf(gettext("      %s%s\n"), fname, +				    spa_feature_table[i].fi_flags & +				    ZFEATURE_FLAG_NO_UPGRADE ? "(*)" : "");  			}  			/*  			 * If they did "zpool upgrade -a", then we could @@ -12300,7 +12415,7 @@ zpool_do_events_next(ev_opts_t *opts)  		nvlist_free(nvl);  	} -	VERIFY(0 == close(zevent_fd)); +	VERIFY0(close(zevent_fd));  	return (ret);  } diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_util.h b/sys/contrib/openzfs/cmd/zpool/zpool_util.h index 5ab7cb9750f1..3af23c52bd45 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_util.h +++ b/sys/contrib/openzfs/cmd/zpool/zpool_util.h @@ -76,11 +76,10 @@ typedef struct zpool_list zpool_list_t;  zpool_list_t *pool_list_get(int, char **, zprop_list_t **, zfs_type_t,      boolean_t, int *); -void pool_list_update(zpool_list_t *); +int pool_list_refresh(zpool_list_t *);  int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *);  void pool_list_free(zpool_list_t *);  int pool_list_count(zpool_list_t *); -void pool_list_remove(zpool_list_t *, zpool_handle_t *);  extern libzfs_handle_t *g_zfs; diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c index 07868a30d7e7..222b5524669e 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c @@ -270,14 +270,13 @@ is_spare(nvlist_t *config, const char *path)   *	draid*		Virtual dRAID spare   */  static nvlist_t * -make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) +make_leaf_vdev(const char *arg, boolean_t is_primary, uint64_t ashift)  {  	char path[MAXPATHLEN];  	struct stat64 statbuf;  	nvlist_t *vdev = NULL;  	const char *type = NULL;  	boolean_t wholedisk = B_FALSE; -	uint64_t ashift = 0;  	int err;  	/* @@ -382,31 +381,6 @@ make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary)  		    (uint64_t)wholedisk) == 0);  	/* -	 * Override defaults if custom properties are provided. -	 */ -	if (props != NULL) { -		const char *value = NULL; - -		if (nvlist_lookup_string(props, -		    zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { -			if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { -				(void) fprintf(stderr, -				    gettext("ashift must be a number.\n")); -				return (NULL); -			} -			if (ashift != 0 && -			    (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { -				(void) fprintf(stderr, -				    gettext("invalid 'ashift=%" PRIu64 "' " -				    "property: only values between %" PRId32 " " -				    "and %" PRId32 " are allowed.\n"), -				    ashift, ASHIFT_MIN, ASHIFT_MAX); -				return (NULL); -			} -		} -	} - -	/*  	 * If the device is known to incorrectly report its physical sector  	 * size explicitly provide the known correct value.  	 */ @@ -574,7 +548,6 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)  				nvlist_t *cnv = child[c];  				const char *path;  				struct stat64 statbuf; -				int64_t size = -1LL;  				const char *childtype;  				int fd, err; @@ -610,22 +583,28 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)  				    ZPOOL_CONFIG_PATH, &path) == 0);  				/* +				 * Skip active spares they should never cause +				 * the pool to be evaluated as inconsistent. +				 */ +				if (is_spare(NULL, path)) +					continue; + +				/*  				 * If we have a raidz/mirror that combines disks -				 * with files, report it as an error. +				 * with files, only report it as an error when +				 * fatal is set to ensure all the replication +				 * checks aren't skipped in check_replication().  				 */ -				if (!dontreport && type != NULL && +				if (fatal && !dontreport && type != NULL &&  				    strcmp(type, childtype) != 0) {  					if (ret != NULL)  						free(ret);  					ret = NULL; -					if (fatal) -						vdev_error(gettext( -						    "mismatched replication " -						    "level: %s contains both " -						    "files and devices\n"), -						    rep.zprl_type); -					else -						return (NULL); +					vdev_error(gettext( +					    "mismatched replication " +					    "level: %s contains both " +					    "files and devices\n"), +					    rep.zprl_type);  					dontreport = B_TRUE;  				} @@ -656,7 +635,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal)  				    statbuf.st_size == MAXOFFSET_T)  					continue; -				size = statbuf.st_size; +				int64_t size = statbuf.st_size;  				/*  				 * Also make sure that devices and @@ -876,6 +855,18 @@ check_replication(nvlist_t *config, nvlist_t *newroot)  				    (u_longlong_t)mirror->zprl_children);  				ret = -1;  			} +		} else if (is_raidz_draid(current, new)) { +			if (current->zprl_parity != new->zprl_parity) { +				vdev_error(gettext( +				    "mismatched replication level: pool and " +				    "new vdev with different redundancy, %s " +				    "and %s vdevs, %llu vs. %llu\n"), +				    current->zprl_type, +				    new->zprl_type, +				    (u_longlong_t)current->zprl_parity, +				    (u_longlong_t)new->zprl_parity); +				ret = -1; +			}  		} else if (strcmp(current->zprl_type, new->zprl_type) != 0) {  			vdev_error(gettext(  			    "mismatched replication level: pool uses %s " @@ -1353,7 +1344,7 @@ is_grouping(const char *type, int *mindev, int *maxdev)  static int  draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)  { -	uint64_t nparity = 1; +	uint64_t nparity;  	uint64_t nspares = 0;  	uint64_t ndata = UINT64_MAX;  	uint64_t ngroups = 1; @@ -1496,6 +1487,29 @@ construct_spec(nvlist_t *props, int argc, char **argv)  	const char *type, *fulltype;  	boolean_t is_log, is_special, is_dedup, is_spare;  	boolean_t seen_logs; +	uint64_t ashift = 0; + +	if (props != NULL) { +		const char *value = NULL; + +		if (nvlist_lookup_string(props, +		    zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { +			if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { +				(void) fprintf(stderr, +				    gettext("ashift must be a number.\n")); +				return (NULL); +			} +			if (ashift != 0 && +			    (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { +				(void) fprintf(stderr, +				    gettext("invalid 'ashift=%" PRIu64 "' " +				    "property: only values between %" PRId32 " " +				    "and %" PRId32 " are allowed.\n"), +				    ashift, ASHIFT_MIN, ASHIFT_MAX); +				return (NULL); +			} +		} +	}  	top = NULL;  	toplevels = 0; @@ -1581,13 +1595,12 @@ construct_spec(nvlist_t *props, int argc, char **argv)  				is_dedup = is_spare = B_FALSE;  			} -			if (is_log || is_special || is_dedup) { +			if (is_log) {  				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {  					(void) fprintf(stderr,  					    gettext("invalid vdev " -					    "specification: unsupported '%s' " -					    "device: %s\n"), is_log ? "log" : -					    "special", type); +					    "specification: unsupported 'log' " +					    "device: %s\n"), type);  					goto spec_out;  				}  				nlogs++; @@ -1602,9 +1615,9 @@ construct_spec(nvlist_t *props, int argc, char **argv)  				    children * sizeof (nvlist_t *));  				if (child == NULL)  					zpool_no_memory(); -				if ((nv = make_leaf_vdev(props, argv[c], +				if ((nv = make_leaf_vdev(argv[c],  				    !(is_log || is_special || is_dedup || -				    is_spare))) == NULL) { +				    is_spare), ashift)) == NULL) {  					for (c = 0; c < children - 1; c++)  						nvlist_free(child[c]);  					free(child); @@ -1668,6 +1681,10 @@ construct_spec(nvlist_t *props, int argc, char **argv)  					    ZPOOL_CONFIG_ALLOCATION_BIAS,  					    VDEV_ALLOC_BIAS_DEDUP) == 0);  				} +				if (ashift > 0) { +					fnvlist_add_uint64(nv, +					    ZPOOL_CONFIG_ASHIFT, ashift); +				}  				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {  					verify(nvlist_add_uint64(nv,  					    ZPOOL_CONFIG_NPARITY, @@ -1695,8 +1712,9 @@ construct_spec(nvlist_t *props, int argc, char **argv)  			 * We have a device.  Pass off to make_leaf_vdev() to  			 * construct the appropriate nvlist describing the vdev.  			 */ -			if ((nv = make_leaf_vdev(props, argv[0], !(is_log || -			    is_special || is_dedup || is_spare))) == NULL) +			if ((nv = make_leaf_vdev(argv[0], !(is_log || +			    is_special || is_dedup || is_spare), +			    ashift)) == NULL)  				goto spec_out;  			verify(nvlist_add_uint64(nv, diff --git a/sys/contrib/openzfs/cmd/zstream/Makefile.am b/sys/contrib/openzfs/cmd/zstream/Makefile.am index be3539fe905d..80ef1ea7ca11 100644 --- a/sys/contrib/openzfs/cmd/zstream/Makefile.am +++ b/sys/contrib/openzfs/cmd/zstream/Makefile.am @@ -18,6 +18,7 @@ zstream_LDADD = \  	libzpool.la \  	libnvpair.la -PHONY += install-exec-hook -install-exec-hook: +cmd-zstream-install-exec-hook:  	cd $(DESTDIR)$(sbindir) && $(LN_S) -f zstream zstreamdump + +INSTALL_EXEC_HOOKS += cmd-zstream-install-exec-hook diff --git a/sys/contrib/openzfs/cmd/ztest.c b/sys/contrib/openzfs/cmd/ztest.c index 89264c97ff10..89752dcb0f0f 100644 --- a/sys/contrib/openzfs/cmd/ztest.c +++ b/sys/contrib/openzfs/cmd/ztest.c @@ -273,7 +273,6 @@ extern int zfs_compressed_arc_enabled;  extern int zfs_abd_scatter_enabled;  extern uint_t dmu_object_alloc_chunk_shift;  extern boolean_t zfs_force_some_double_word_sm_entries; -extern unsigned long zio_decompress_fail_fraction;  extern unsigned long zfs_reconstruct_indirect_damage_fraction;  extern uint64_t raidz_expand_max_reflow_bytes;  extern uint_t raidz_expand_pause_point; @@ -809,8 +808,8 @@ static ztest_option_t option_table[] = {  	{ 'X', "raidz-expansion", NULL,  	    "Perform a dedicated raidz expansion test",  	    NO_DEFAULT, NULL}, -	{ 'o',	"option", "\"OPTION=INTEGER\"", -	    "Set global variable to an unsigned 32-bit integer value", +	{ 'o',	"option", "\"NAME=VALUE\"", +	    "Set the named tunable to the given value",  	    NO_DEFAULT, NULL},  	{ 'G',	"dump-debug-msg", NULL,  	    "Dump zfs_dbgmsg buffer before exiting due to an error", @@ -829,8 +828,8 @@ static char *short_opts = NULL;  static void  init_options(void)  { -	ASSERT3P(long_opts, ==, NULL); -	ASSERT3P(short_opts, ==, NULL); +	ASSERT0P(long_opts); +	ASSERT0P(short_opts);  	int count = sizeof (option_table) / sizeof (option_table[0]);  	long_opts = umem_alloc(sizeof (struct option) * count, UMEM_NOFAIL); @@ -919,7 +918,7 @@ ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)  {  	char name[32];  	char *value; -	int state = ZTEST_VDEV_CLASS_RND; +	int state;  	(void) strlcpy(name, input, sizeof (name)); @@ -1686,7 +1685,7 @@ ztest_rll_init(rll_t *rll)  static void  ztest_rll_destroy(rll_t *rll)  { -	ASSERT3P(rll->rll_writer, ==, NULL); +	ASSERT0P(rll->rll_writer);  	ASSERT0(rll->rll_readers);  	mutex_destroy(&rll->rll_lock);  	cv_destroy(&rll->rll_cv); @@ -1720,7 +1719,7 @@ ztest_rll_unlock(rll_t *rll)  		rll->rll_writer = NULL;  	} else {  		ASSERT3S(rll->rll_readers, >, 0); -		ASSERT3P(rll->rll_writer, ==, NULL); +		ASSERT0P(rll->rll_writer);  		rll->rll_readers--;  	} @@ -1996,7 +1995,7 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)  	    dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,  	    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH |  	    DMU_KEEP_CACHING) != 0) { -		zil_itx_destroy(itx); +		zil_itx_destroy(itx, 0);  		itx = zil_itx_create(TX_WRITE, sizeof (*lr));  		write_state = WR_NEED_COPY;  	} @@ -2278,8 +2277,8 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)  			ztest_block_tag_t rbt; -			VERIFY(dmu_read(os, lr->lr_foid, offset, -			    sizeof (rbt), &rbt, flags) == 0); +			VERIFY0(dmu_read(os, lr->lr_foid, offset, +			    sizeof (rbt), &rbt, flags));  			if (rbt.bt_magic == BT_MAGIC) {  				ztest_bt_verify(&rbt, os, lr->lr_foid, 0,  				    offset, gen, txg, crtxg); @@ -2966,7 +2965,7 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_t id)  	(void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); -	zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); +	VERIFY0(zil_commit(zilog, ztest_random(ZTEST_OBJECTS)));  	/*  	 * Remember the committed values in zd, which is in parent/child @@ -3882,7 +3881,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)  	 * If newvd is too small, it should fail with EOVERFLOW.  	 *  	 * If newvd is a distributed spare and it's being attached to a -	 * dRAID which is not its parent it should fail with EINVAL. +	 * dRAID which is not its parent it should fail with ENOTSUP.  	 */  	if (pvd->vdev_ops != &vdev_mirror_ops &&  	    pvd->vdev_ops != &vdev_root_ops && (!replacing || @@ -3901,7 +3900,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)  	else if (ashift > oldvd->vdev_top->vdev_ashift)  		expected_error = EDOM;  	else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) -		expected_error = EINVAL; +		expected_error = ENOTSUP;  	else  		expected_error = 0; @@ -4007,7 +4006,7 @@ raidz_scratch_verify(void)  		 * requested by user, but scratch object was not created.  		 */  		case RRSS_SCRATCH_NOT_IN_USE: -			ASSERT3U(offset, ==, 0); +			ASSERT0(offset);  			break;  		/* @@ -5537,8 +5536,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)  			}  			if (i == 1) { -				VERIFY(dmu_buf_hold(os, bigobj, off, -				    FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); +				VERIFY0(dmu_buf_hold(os, bigobj, off, +				    FTAG, &dbt, DMU_READ_NO_PREFETCH));  			}  			if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {  				VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, @@ -7069,7 +7068,7 @@ ztest_set_global_vars(void)  		char *kv = ztest_opts.zo_gvars[i];  		VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN);  		VERIFY3U(strlen(kv), >, 0); -		int err = set_global_var(kv); +		int err = handle_tunable_option(kv, B_TRUE);  		if (ztest_opts.zo_verbose > 0) {  			(void) printf("setting global var %s ... %s\n", kv,  			    err ? "failed" : "ok"); @@ -7813,6 +7812,9 @@ ztest_dataset_open(int d)  	ztest_dataset_name(name, ztest_opts.zo_pool, d); +	if (ztest_opts.zo_verbose >= 6) +		(void) printf("Opening %s\n", name); +  	(void) pthread_rwlock_rdlock(&ztest_name_lock);  	error = ztest_dataset_create(name); @@ -7934,7 +7936,7 @@ ztest_freeze(void)  	 */  	while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {  		ztest_dmu_object_alloc_free(zd, 0); -		zil_commit(zd->zd_zilog, 0); +		VERIFY0(zil_commit(zd->zd_zilog, 0));  	}  	txg_wait_synced(spa_get_dsl(spa), 0); @@ -7976,7 +7978,7 @@ ztest_freeze(void)  	/*  	 * Commit all of the changes we just generated.  	 */ -	zil_commit(zd->zd_zilog, 0); +	VERIFY0(zil_commit(zd->zd_zilog, 0));  	txg_wait_synced(spa_get_dsl(spa), 0);  	/* @@ -8308,41 +8310,44 @@ static void  ztest_generic_run(ztest_shared_t *zs, spa_t *spa)  {  	kthread_t **run_threads; -	int t; +	int i, ndatasets;  	run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *),  	    UMEM_NOFAIL);  	/* -	 * Kick off all the tests that run in parallel. +	 * Actual number of datasets to be used.  	 */ -	for (t = 0; t < ztest_opts.zo_threads; t++) { -		if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { -			umem_free(run_threads, ztest_opts.zo_threads * -			    sizeof (kthread_t *)); -			return; -		} +	ndatasets = MIN(ztest_opts.zo_datasets, ztest_opts.zo_threads); -		run_threads[t] = thread_create(NULL, 0, ztest_thread, -		    (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, +	/* +	 * Prepare the datasets first. +	 */ +	for (i = 0; i < ndatasets; i++) +		VERIFY0(ztest_dataset_open(i)); + +	/* +	 * Kick off all the tests that run in parallel. +	 */ +	for (i = 0; i < ztest_opts.zo_threads; i++) { +		run_threads[i] = thread_create(NULL, 0, ztest_thread, +		    (void *)(uintptr_t)i, 0, NULL, TS_RUN | TS_JOINABLE,  		    defclsyspri);  	}  	/*  	 * Wait for all of the tests to complete.  	 */ -	for (t = 0; t < ztest_opts.zo_threads; t++) -		VERIFY0(thread_join(run_threads[t])); +	for (i = 0; i < ztest_opts.zo_threads; i++) +		VERIFY0(thread_join(run_threads[i]));  	/*  	 * Close all datasets. This must be done after all the threads  	 * are joined so we can be sure none of the datasets are in-use  	 * by any of the threads.  	 */ -	for (t = 0; t < ztest_opts.zo_threads; t++) { -		if (t < ztest_opts.zo_datasets) -			ztest_dataset_close(t); -	} +	for (i = 0; i < ndatasets; i++) +		ztest_dataset_close(i);  	txg_wait_synced(spa_get_dsl(spa), 0); @@ -8465,6 +8470,7 @@ ztest_run(ztest_shared_t *zs)  		int d = ztest_random(ztest_opts.zo_datasets);  		ztest_dataset_destroy(d); +		txg_wait_synced(spa_get_dsl(spa), 0);  	}  	zs->zs_enospc_count = 0; @@ -8972,7 +8978,7 @@ main(int argc, char **argv)  		exit(EXIT_FAILURE);  	} else {  		/* children should not be spawned if setting gvars fails */ -		VERIFY3S(err, ==, 0); +		VERIFY0(err);  	}  	/* Override location of zpool.cache */ | 
