Diffstat (limited to 'sys/contrib/openzfs/cmd')
-rw-r--r--  sys/contrib/openzfs/cmd/Makefile.am | 10
-rw-r--r--  sys/contrib/openzfs/cmd/arc_summary/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/arc_summary/Makefile.am | 13
-rwxr-xr-x  sys/contrib/openzfs/cmd/arc_summary/arc_summary2 | 1093
-rwxr-xr-x  sys/contrib/openzfs/cmd/arc_summary/arc_summary3 | 943
-rw-r--r--  sys/contrib/openzfs/cmd/arcstat/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/arcstat/Makefile.am | 5
-rwxr-xr-x  sys/contrib/openzfs/cmd/arcstat/arcstat.in | 494
-rw-r--r--  sys/contrib/openzfs/cmd/dbufstat/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/dbufstat/Makefile.am | 5
-rwxr-xr-x  sys/contrib/openzfs/cmd/dbufstat/dbufstat.in | 669
-rw-r--r--  sys/contrib/openzfs/cmd/fsck_zfs/Makefile.am | 1
-rwxr-xr-x  sys/contrib/openzfs/cmd/fsck_zfs/fsck.zfs | 9
-rw-r--r--  sys/contrib/openzfs/cmd/mount_zfs/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/mount_zfs/Makefile.am | 20
-rw-r--r--  sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c | 408
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/Makefile.am | 20
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c | 227
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/raidz_test.c | 782
-rw-r--r--  sys/contrib/openzfs/cmd/raidz_test/raidz_test.h | 116
-rw-r--r--  sys/contrib/openzfs/cmd/vdev_id/Makefile.am | 1
-rwxr-xr-x  sys/contrib/openzfs/cmd/vdev_id/vdev_id | 605
-rw-r--r--  sys/contrib/openzfs/cmd/zdb/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/zdb/Makefile.am | 16
-rw-r--r--  sys/contrib/openzfs/cmd/zdb/zdb.c | 8606
-rw-r--r--  sys/contrib/openzfs/cmd/zdb/zdb.h | 33
-rw-r--r--  sys/contrib/openzfs/cmd/zdb/zdb_il.c | 431
-rw-r--r--  sys/contrib/openzfs/cmd/zed/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/zed/Makefile.am | 49
-rw-r--r--  sys/contrib/openzfs/cmd/zed/agents/README.md | 112
-rw-r--r--  sys/contrib/openzfs/cmd/zed/agents/fmd_api.c | 760
-rw-r--r--  sys/contrib/openzfs/cmd/zed/agents/fmd_api.h | 246
-rw-r--r--  sys/contrib/openzfs/cmd/zed/agents/fmd_serd.c | 316
-rw-r--r--  sys/contrib/openzfs/cmd/zed/agents/fmd_serd.h | 86
-rw-r--r--  sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c | 422
-rw-r--r--  sys/contrib/openzfs/cmd/zed/agents/zfs_agents.h | 46
-rw-r--r--  sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c | 981
-rw-r--r--  sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c | 956
-rw-r--r--  sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c | 557
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed.c | 306
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed.d/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am | 53
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed.d/README | 30
-rwxr-xr-x  sys/contrib/openzfs/cmd/zed/zed.d/all-debug.sh | 26
-rwxr-xr-x  sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh | 14
-rwxr-xr-x  sys/contrib/openzfs/cmd/zed/zed.d/data-notify.sh | 43
-rwxr-xr-x  sys/contrib/openzfs/cmd/zed/zed.d/generic-notify.sh | 54
-rwxr-xr-x  sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in | 85
l---------  sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh | 1
l---------  sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-notify.sh | 1
-rwxr-xr-x  sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-start-scrub.sh | 19
-rwxr-xr-x  sys/contrib/openzfs/cmd/zed/zed.d/scrub_finish-notify.sh | 59
-rwxr-xr-x  sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh | 177
-rwxr-xr-x  sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh | 74
-rwxr-xr-x  sys/contrib/openzfs/cmd/zed/zed.d/trim_finish-notify.sh | 37
l---------  sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh | 1
l---------  sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh | 1
-rwxr-xr-x  sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh | 538
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed.d/zed.rc | 122
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed.h | 58
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_conf.c | 735
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_conf.h | 62
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_disk_event.c | 416
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_disk_event.h | 31
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_event.c | 965
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_event.h | 29
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_exec.c | 232
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_exec.h | 25
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_file.c | 217
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_file.h | 35
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_log.c | 256
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_log.h | 44
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_strings.c | 247
-rw-r--r--  sys/contrib/openzfs/cmd/zed/zed_strings.h | 27
-rw-r--r--  sys/contrib/openzfs/cmd/zfs/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/zfs/Makefile.am | 23
-rw-r--r--  sys/contrib/openzfs/cmd/zfs/zfs_iter.c | 512
-rw-r--r--  sys/contrib/openzfs/cmd/zfs/zfs_iter.h | 61
-rw-r--r--  sys/contrib/openzfs/cmd/zfs/zfs_main.c | 8637
-rw-r--r--  sys/contrib/openzfs/cmd/zfs/zfs_project.c | 295
-rw-r--r--  sys/contrib/openzfs/cmd/zfs/zfs_projectutil.h | 49
-rw-r--r--  sys/contrib/openzfs/cmd/zfs/zfs_util.h | 42
-rw-r--r--  sys/contrib/openzfs/cmd/zfs_ids_to_path/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/zfs_ids_to_path/Makefile.am | 9
-rw-r--r--  sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c | 96
-rw-r--r--  sys/contrib/openzfs/cmd/zgenhostid/Makefile.am | 1
-rwxr-xr-x  sys/contrib/openzfs/cmd/zgenhostid/zgenhostid | 61
-rw-r--r--  sys/contrib/openzfs/cmd/zhack/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/zhack/Makefile.am | 14
-rw-r--r--  sys/contrib/openzfs/cmd/zhack/zhack.c | 532
-rw-r--r--  sys/contrib/openzfs/cmd/zinject/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/zinject/Makefile.am | 13
-rw-r--r--  sys/contrib/openzfs/cmd/zinject/translate.c | 397
-rw-r--r--  sys/contrib/openzfs/cmd/zinject/zinject.c | 1287
-rw-r--r--  sys/contrib/openzfs/cmd/zinject/zinject.h | 70
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/Makefile.am | 136
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/os/freebsd/zpool_vdev_os.c | 103
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c | 410
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/zpool.d/README | 9
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/ata_err | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/cmd_to | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/defect | 1
-rwxr-xr-x  sys/contrib/openzfs/cmd/zpool/zpool.d/dm-deps | 29
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/enc | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/encdev | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/fault_led | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/health | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/hours_on | 1
-rwxr-xr-x  sys/contrib/openzfs/cmd/zpool/zpool.d/iostat | 77
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-10s | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-1s | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/label | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/locate_led | 1
-rwxr-xr-x  sys/contrib/openzfs/cmd/zpool/zpool.d/lsblk | 83
-rwxr-xr-x  sys/contrib/openzfs/cmd/zpool/zpool.d/media | 27
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/model | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/nonmed | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/nvme_err | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/off_ucor | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/pend_sec | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/pwr_cyc | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/r_proc | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/r_ucor | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/realloc | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/rep_ucor | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/serial | 1
-rwxr-xr-x  sys/contrib/openzfs/cmd/zpool/zpool.d/ses | 52
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/size | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/slot | 1
-rwxr-xr-x  sys/contrib/openzfs/cmd/zpool/zpool.d/smart | 243
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/smart_test | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/smartx | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/temp | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/test_ended | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/test_progress | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/test_status | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/test_type | 1
-rwxr-xr-x  sys/contrib/openzfs/cmd/zpool/zpool.d/upath | 7
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/vendor | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/w_proc | 1
l---------  sys/contrib/openzfs/cmd/zpool/zpool.d/w_ucor | 1
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/zpool_iter.c | 757
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/zpool_main.c | 10329
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/zpool_util.c | 125
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/zpool_util.h | 137
-rw-r--r--  sys/contrib/openzfs/cmd/zpool/zpool_vdev.c | 1581
-rw-r--r--  sys/contrib/openzfs/cmd/zstream/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/zstream/Makefile.am | 15
-rw-r--r--  sys/contrib/openzfs/cmd/zstream/zstream.c | 66
-rw-r--r--  sys/contrib/openzfs/cmd/zstream/zstream.h | 36
-rw-r--r--  sys/contrib/openzfs/cmd/zstream/zstream_dump.c | 799
-rw-r--r--  sys/contrib/openzfs/cmd/zstream/zstream_redup.c | 469
-rw-r--r--  sys/contrib/openzfs/cmd/zstream/zstream_token.c | 78
-rw-r--r--  sys/contrib/openzfs/cmd/zstreamdump/Makefile.am | 1
-rwxr-xr-x  sys/contrib/openzfs/cmd/zstreamdump/zstreamdump | 3
-rw-r--r--  sys/contrib/openzfs/cmd/ztest/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/ztest/Makefile.am | 23
-rw-r--r--  sys/contrib/openzfs/cmd/ztest/ztest.c | 7818
-rw-r--r--  sys/contrib/openzfs/cmd/zvol_id/.gitignore | 1
-rw-r--r--  sys/contrib/openzfs/cmd/zvol_id/Makefile.am | 10
-rw-r--r--  sys/contrib/openzfs/cmd/zvol_id/zvol_id_main.c | 110
-rw-r--r--  sys/contrib/openzfs/cmd/zvol_wait/Makefile.am | 1
-rwxr-xr-x  sys/contrib/openzfs/cmd/zvol_wait/zvol_wait | 116
165 files changed, 59858 insertions, 0 deletions
diff --git a/sys/contrib/openzfs/cmd/Makefile.am b/sys/contrib/openzfs/cmd/Makefile.am
new file mode 100644
index 000000000000..88d32b1c538c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/Makefile.am
@@ -0,0 +1,10 @@
+SUBDIRS = zfs zpool zdb zhack zinject zstream zstreamdump ztest
+SUBDIRS += fsck_zfs vdev_id raidz_test zfs_ids_to_path
+
+if USING_PYTHON
+SUBDIRS += arcstat arc_summary dbufstat
+endif
+
+if BUILD_LINUX
+SUBDIRS += mount_zfs zed zgenhostid zvol_id zvol_wait
+endif
diff --git a/sys/contrib/openzfs/cmd/arc_summary/.gitignore b/sys/contrib/openzfs/cmd/arc_summary/.gitignore
new file mode 100644
index 000000000000..50ba15f034e2
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arc_summary/.gitignore
@@ -0,0 +1 @@
+arc_summary
diff --git a/sys/contrib/openzfs/cmd/arc_summary/Makefile.am b/sys/contrib/openzfs/cmd/arc_summary/Makefile.am
new file mode 100644
index 000000000000..1a26c2c199f8
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arc_summary/Makefile.am
@@ -0,0 +1,13 @@
+bin_SCRIPTS = arc_summary
+
+CLEANFILES = arc_summary
+EXTRA_DIST = arc_summary2 arc_summary3
+
+if USING_PYTHON_2
+SCRIPT = arc_summary2
+else
+SCRIPT = arc_summary3
+endif
+
+arc_summary: $(SCRIPT)
+ cp $< $@
diff --git a/sys/contrib/openzfs/cmd/arc_summary/arc_summary2 b/sys/contrib/openzfs/cmd/arc_summary/arc_summary2
new file mode 100755
index 000000000000..5dc40d759dce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arc_summary/arc_summary2
@@ -0,0 +1,1093 @@
+#!/usr/bin/env python2
+#
+# $Id: arc_summary.pl,v 388:e27800740aa2 2011-07-08 02:53:29Z jhell $
+#
+# Copyright (c) 2008 Ben Rockwood <benr@cuddletech.com>,
+# Copyright (c) 2010 Martin Matuska <mm@FreeBSD.org>,
+# Copyright (c) 2010-2011 Jason J. Hellenthal <jhell@DataIX.net>,
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# If you are having trouble when using this script from cron(8) please try
+# adjusting your PATH before reporting problems.
+#
+# Note some of this code uses older constructs (eg getopt instead of argparse,
+# subprocess.Popen() instead of subprocess.run()) because we need to support
+# some very old versions of Python.
+#
+
+"""Print statistics on the ZFS Adjustable Replacement Cache (ARC)
+
+Provides basic information on the ARC, its efficiency, the L2ARC (if present),
+the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See the
+in-source documentation and code at
+https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details.
+"""
+
+import getopt
+import os
+import sys
+import time
+import errno
+
+from subprocess import Popen, PIPE
+from decimal import Decimal as D
+
+
+if sys.platform.startswith('freebsd'):
+ # Requires py27-sysctl on FreeBSD
+ import sysctl
+
+ def load_kstats(namespace):
+ """Collect information on a specific subsystem of the ARC"""
+
+ base = 'kstat.zfs.misc.%s.' % namespace
+ return [(kstat.name, D(kstat.value)) for kstat in sysctl.filter(base)]
+
+ def load_tunables():
+ return dict((ctl.name, ctl.value) for ctl in sysctl.filter('vfs.zfs'))
+
+elif sys.platform.startswith('linux'):
+
+ def load_kstats(namespace):
+ """Collect information on a specific subsystem of the ARC"""
+
+ kstat = 'kstat.zfs.misc.%s.%%s' % namespace
+ path = '/proc/spl/kstat/zfs/%s' % namespace
+ with open(path) as f:
+ entries = [line.strip().split() for line in f][2:] # Skip header
+ return [(kstat % name, D(value)) for name, _, value in entries]
+
+ def load_tunables():
+ basepath = '/sys/module/zfs/parameters'
+ tunables = {}
+ for name in os.listdir(basepath):
+ if not name:
+ continue
+ path = '%s/%s' % (basepath, name)
+ with open(path) as f:
+ value = f.read()
+ tunables[name] = value.strip()
+ return tunables
+
+
+show_tunable_descriptions = False
+alternate_tunable_layout = False
+
+
+def handle_Exception(ex_cls, ex, tb):
+ if ex_cls is IOError:
+ if ex.errno == errno.EPIPE:
+ sys.exit()
+
+ if ex_cls is KeyboardInterrupt:
+ sys.exit()
+
+
+sys.excepthook = handle_Exception
+
+
+def get_Kstat():
+ """Collect information on the ZFS subsystem from the /proc virtual
+ file system. The name "kstat" is a holdover from the Solaris utility
+ of the same name.
+ """
+
+ Kstat = {}
+ Kstat.update(load_kstats('arcstats'))
+ Kstat.update(load_kstats('zfetchstats'))
+ Kstat.update(load_kstats('vdev_cache_stats'))
+ return Kstat
+
+
+def fBytes(b=0):
+ """Return human-readable representation of a byte value in
+ powers of 2 (eg "KiB" for "kibibytes", etc) to two decimal
+ points. Values smaller than one KiB are returned without
+ decimal points.
+ """
+
+ prefixes = [
+ [2**80, "YiB"], # yobibytes (yotta)
+ [2**70, "ZiB"], # zebibytes (zetta)
+ [2**60, "EiB"], # exbibytes (exa)
+ [2**50, "PiB"], # pebibytes (peta)
+ [2**40, "TiB"], # tebibytes (tera)
+ [2**30, "GiB"], # gibibytes (giga)
+ [2**20, "MiB"], # mebibytes (mega)
+ [2**10, "KiB"]] # kibibytes (kilo)
+
+ if b >= 2**10:
+
+ for limit, unit in prefixes:
+
+ if b >= limit:
+ value = b / limit
+ break
+
+ result = "%0.2f\t%s" % (value, unit)
+
+ else:
+
+ result = "%d\tBytes" % b
+
+ return result
+
+
+def fHits(hits=0):
+ """Create a human-readable representation of the number of hits.
+ The single-letter symbols used are SI to avoid the confusion caused
+ by the different "short scale" and "long scale" representations in
+ English, which use the same words for different values. See
+ https://en.wikipedia.org/wiki/Names_of_large_numbers and
+ https://physics.nist.gov/cuu/Units/prefixes.html
+ """
+
+ numbers = [
+ [10**24, 'Y'], # yotta (septillion)
+ [10**21, 'Z'], # zetta (sextillion)
+ [10**18, 'E'], # exa (quintrillion)
+ [10**15, 'P'], # peta (quadrillion)
+ [10**12, 'T'], # tera (trillion)
+ [10**9, 'G'], # giga (billion)
+ [10**6, 'M'], # mega (million)
+ [10**3, 'k']] # kilo (thousand)
+
+ if hits >= 1000:
+
+ for limit, symbol in numbers:
+
+ if hits >= limit:
+ value = hits/limit
+ break
+
+ result = "%0.2f%s" % (value, symbol)
+
+ else:
+
+ result = "%d" % hits
+
+ return result
+
+
+def fPerc(lVal=0, rVal=0, Decimal=2):
+ """Calculate percentage value and return in human-readable format"""
+
+ if rVal > 0:
+ return str("%0." + str(Decimal) + "f") % (100 * (lVal / rVal)) + "%"
+ else:
+ return str("%0." + str(Decimal) + "f") % 100 + "%"
+
+
+def get_arc_summary(Kstat):
+ """Collect general data on the ARC"""
+
+ output = {}
+ memory_throttle_count = Kstat[
+ "kstat.zfs.misc.arcstats.memory_throttle_count"
+ ]
+
+ if memory_throttle_count > 0:
+ output['health'] = 'THROTTLED'
+ else:
+ output['health'] = 'HEALTHY'
+
+ output['memory_throttle_count'] = fHits(memory_throttle_count)
+
+ # ARC Misc.
+ deleted = Kstat["kstat.zfs.misc.arcstats.deleted"]
+ mutex_miss = Kstat["kstat.zfs.misc.arcstats.mutex_miss"]
+ evict_skip = Kstat["kstat.zfs.misc.arcstats.evict_skip"]
+
+ # ARC Misc.
+ output["arc_misc"] = {}
+ output["arc_misc"]["deleted"] = fHits(deleted)
+ output["arc_misc"]['mutex_miss'] = fHits(mutex_miss)
+ output["arc_misc"]['evict_skips'] = fHits(evict_skip)
+
+ # ARC Sizing
+ arc_size = Kstat["kstat.zfs.misc.arcstats.size"]
+ mru_size = Kstat["kstat.zfs.misc.arcstats.mru_size"]
+ mfu_size = Kstat["kstat.zfs.misc.arcstats.mfu_size"]
+ meta_limit = Kstat["kstat.zfs.misc.arcstats.arc_meta_limit"]
+ meta_size = Kstat["kstat.zfs.misc.arcstats.arc_meta_used"]
+ dnode_limit = Kstat["kstat.zfs.misc.arcstats.arc_dnode_limit"]
+ dnode_size = Kstat["kstat.zfs.misc.arcstats.dnode_size"]
+ target_max_size = Kstat["kstat.zfs.misc.arcstats.c_max"]
+ target_min_size = Kstat["kstat.zfs.misc.arcstats.c_min"]
+ target_size = Kstat["kstat.zfs.misc.arcstats.c"]
+
+ target_size_ratio = (target_max_size / target_min_size)
+
+ # ARC Sizing
+ output['arc_sizing'] = {}
+ output['arc_sizing']['arc_size'] = {
+ 'per': fPerc(arc_size, target_max_size),
+ 'num': fBytes(arc_size),
+ }
+ output['arc_sizing']['target_max_size'] = {
+ 'ratio': target_size_ratio,
+ 'num': fBytes(target_max_size),
+ }
+ output['arc_sizing']['target_min_size'] = {
+ 'per': fPerc(target_min_size, target_max_size),
+ 'num': fBytes(target_min_size),
+ }
+ output['arc_sizing']['target_size'] = {
+ 'per': fPerc(target_size, target_max_size),
+ 'num': fBytes(target_size),
+ }
+ output['arc_sizing']['meta_limit'] = {
+ 'per': fPerc(meta_limit, target_max_size),
+ 'num': fBytes(meta_limit),
+ }
+ output['arc_sizing']['meta_size'] = {
+ 'per': fPerc(meta_size, meta_limit),
+ 'num': fBytes(meta_size),
+ }
+ output['arc_sizing']['dnode_limit'] = {
+ 'per': fPerc(dnode_limit, meta_limit),
+ 'num': fBytes(dnode_limit),
+ }
+ output['arc_sizing']['dnode_size'] = {
+ 'per': fPerc(dnode_size, dnode_limit),
+ 'num': fBytes(dnode_size),
+ }
+
+ # ARC Hash Breakdown
+ output['arc_hash_break'] = {}
+ output['arc_hash_break']['hash_chain_max'] = Kstat[
+ "kstat.zfs.misc.arcstats.hash_chain_max"
+ ]
+ output['arc_hash_break']['hash_chains'] = Kstat[
+ "kstat.zfs.misc.arcstats.hash_chains"
+ ]
+ output['arc_hash_break']['hash_collisions'] = Kstat[
+ "kstat.zfs.misc.arcstats.hash_collisions"
+ ]
+ output['arc_hash_break']['hash_elements'] = Kstat[
+ "kstat.zfs.misc.arcstats.hash_elements"
+ ]
+ output['arc_hash_break']['hash_elements_max'] = Kstat[
+ "kstat.zfs.misc.arcstats.hash_elements_max"
+ ]
+
+ output['arc_size_break'] = {}
+ output['arc_size_break']['recently_used_cache_size'] = {
+ 'per': fPerc(mru_size, mru_size + mfu_size),
+ 'num': fBytes(mru_size),
+ }
+ output['arc_size_break']['frequently_used_cache_size'] = {
+ 'per': fPerc(mfu_size, mru_size + mfu_size),
+ 'num': fBytes(mfu_size),
+ }
+
+ # ARC Hash Breakdown
+ hash_chain_max = Kstat["kstat.zfs.misc.arcstats.hash_chain_max"]
+ hash_chains = Kstat["kstat.zfs.misc.arcstats.hash_chains"]
+ hash_collisions = Kstat["kstat.zfs.misc.arcstats.hash_collisions"]
+ hash_elements = Kstat["kstat.zfs.misc.arcstats.hash_elements"]
+ hash_elements_max = Kstat["kstat.zfs.misc.arcstats.hash_elements_max"]
+
+ output['arc_hash_break'] = {}
+ output['arc_hash_break']['elements_max'] = fHits(hash_elements_max)
+ output['arc_hash_break']['elements_current'] = {
+ 'per': fPerc(hash_elements, hash_elements_max),
+ 'num': fHits(hash_elements),
+ }
+ output['arc_hash_break']['collisions'] = fHits(hash_collisions)
+ output['arc_hash_break']['chain_max'] = fHits(hash_chain_max)
+ output['arc_hash_break']['chains'] = fHits(hash_chains)
+
+ return output
+
+
+def _arc_summary(Kstat):
+ """Print information on the ARC"""
+
+ # ARC Sizing
+ arc = get_arc_summary(Kstat)
+
+ sys.stdout.write("ARC Summary: (%s)\n" % arc['health'])
+
+ sys.stdout.write("\tMemory Throttle Count:\t\t\t%s\n" %
+ arc['memory_throttle_count'])
+ sys.stdout.write("\n")
+
+ # ARC Misc.
+ sys.stdout.write("ARC Misc:\n")
+ sys.stdout.write("\tDeleted:\t\t\t\t%s\n" % arc['arc_misc']['deleted'])
+ sys.stdout.write("\tMutex Misses:\t\t\t\t%s\n" %
+ arc['arc_misc']['mutex_miss'])
+ sys.stdout.write("\tEvict Skips:\t\t\t\t%s\n" %
+ arc['arc_misc']['evict_skips'])
+ sys.stdout.write("\n")
+
+ # ARC Sizing
+ sys.stdout.write("ARC Size:\t\t\t\t%s\t%s\n" % (
+ arc['arc_sizing']['arc_size']['per'],
+ arc['arc_sizing']['arc_size']['num']
+ )
+ )
+ sys.stdout.write("\tTarget Size: (Adaptive)\t\t%s\t%s\n" % (
+ arc['arc_sizing']['target_size']['per'],
+ arc['arc_sizing']['target_size']['num'],
+ )
+ )
+
+ sys.stdout.write("\tMin Size (Hard Limit):\t\t%s\t%s\n" % (
+ arc['arc_sizing']['target_min_size']['per'],
+ arc['arc_sizing']['target_min_size']['num'],
+ )
+ )
+
+ sys.stdout.write("\tMax Size (High Water):\t\t%d:1\t%s\n" % (
+ arc['arc_sizing']['target_max_size']['ratio'],
+ arc['arc_sizing']['target_max_size']['num'],
+ )
+ )
+
+ sys.stdout.write("\nARC Size Breakdown:\n")
+ sys.stdout.write("\tRecently Used Cache Size:\t%s\t%s\n" % (
+ arc['arc_size_break']['recently_used_cache_size']['per'],
+ arc['arc_size_break']['recently_used_cache_size']['num'],
+ )
+ )
+ sys.stdout.write("\tFrequently Used Cache Size:\t%s\t%s\n" % (
+ arc['arc_size_break']['frequently_used_cache_size']['per'],
+ arc['arc_size_break']['frequently_used_cache_size']['num'],
+ )
+ )
+ sys.stdout.write("\tMetadata Size (Hard Limit):\t%s\t%s\n" % (
+ arc['arc_sizing']['meta_limit']['per'],
+ arc['arc_sizing']['meta_limit']['num'],
+ )
+ )
+ sys.stdout.write("\tMetadata Size:\t\t\t%s\t%s\n" % (
+ arc['arc_sizing']['meta_size']['per'],
+ arc['arc_sizing']['meta_size']['num'],
+ )
+ )
+ sys.stdout.write("\tDnode Size (Hard Limit):\t%s\t%s\n" % (
+ arc['arc_sizing']['dnode_limit']['per'],
+ arc['arc_sizing']['dnode_limit']['num'],
+ )
+ )
+ sys.stdout.write("\tDnode Size:\t\t\t%s\t%s\n" % (
+ arc['arc_sizing']['dnode_size']['per'],
+ arc['arc_sizing']['dnode_size']['num'],
+ )
+ )
+
+ sys.stdout.write("\n")
+
+ # ARC Hash Breakdown
+ sys.stdout.write("ARC Hash Breakdown:\n")
+ sys.stdout.write("\tElements Max:\t\t\t\t%s\n" %
+ arc['arc_hash_break']['elements_max'])
+ sys.stdout.write("\tElements Current:\t\t%s\t%s\n" % (
+ arc['arc_hash_break']['elements_current']['per'],
+ arc['arc_hash_break']['elements_current']['num'],
+ )
+ )
+ sys.stdout.write("\tCollisions:\t\t\t\t%s\n" %
+ arc['arc_hash_break']['collisions'])
+ sys.stdout.write("\tChain Max:\t\t\t\t%s\n" %
+ arc['arc_hash_break']['chain_max'])
+ sys.stdout.write("\tChains:\t\t\t\t\t%s\n" %
+ arc['arc_hash_break']['chains'])
+
+
+def get_arc_efficiency(Kstat):
+ """Collect information on the efficiency of the ARC"""
+
+ output = {}
+
+ arc_hits = Kstat["kstat.zfs.misc.arcstats.hits"]
+ arc_misses = Kstat["kstat.zfs.misc.arcstats.misses"]
+ demand_data_hits = Kstat["kstat.zfs.misc.arcstats.demand_data_hits"]
+ demand_data_misses = Kstat["kstat.zfs.misc.arcstats.demand_data_misses"]
+ demand_metadata_hits = Kstat[
+ "kstat.zfs.misc.arcstats.demand_metadata_hits"
+ ]
+ demand_metadata_misses = Kstat[
+ "kstat.zfs.misc.arcstats.demand_metadata_misses"
+ ]
+ mfu_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mfu_ghost_hits"]
+ mfu_hits = Kstat["kstat.zfs.misc.arcstats.mfu_hits"]
+ mru_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mru_ghost_hits"]
+ mru_hits = Kstat["kstat.zfs.misc.arcstats.mru_hits"]
+ prefetch_data_hits = Kstat["kstat.zfs.misc.arcstats.prefetch_data_hits"]
+ prefetch_data_misses = Kstat[
+ "kstat.zfs.misc.arcstats.prefetch_data_misses"
+ ]
+ prefetch_metadata_hits = Kstat[
+ "kstat.zfs.misc.arcstats.prefetch_metadata_hits"
+ ]
+ prefetch_metadata_misses = Kstat[
+ "kstat.zfs.misc.arcstats.prefetch_metadata_misses"
+ ]
+
+ anon_hits = arc_hits - (
+ mfu_hits + mru_hits + mfu_ghost_hits + mru_ghost_hits
+ )
+ arc_accesses_total = (arc_hits + arc_misses)
+ demand_data_total = (demand_data_hits + demand_data_misses)
+ prefetch_data_total = (prefetch_data_hits + prefetch_data_misses)
+ real_hits = (mfu_hits + mru_hits)
+
+ output["total_accesses"] = fHits(arc_accesses_total)
+ output["cache_hit_ratio"] = {
+ 'per': fPerc(arc_hits, arc_accesses_total),
+ 'num': fHits(arc_hits),
+ }
+ output["cache_miss_ratio"] = {
+ 'per': fPerc(arc_misses, arc_accesses_total),
+ 'num': fHits(arc_misses),
+ }
+ output["actual_hit_ratio"] = {
+ 'per': fPerc(real_hits, arc_accesses_total),
+ 'num': fHits(real_hits),
+ }
+ output["data_demand_efficiency"] = {
+ 'per': fPerc(demand_data_hits, demand_data_total),
+ 'num': fHits(demand_data_total),
+ }
+
+ if prefetch_data_total > 0:
+ output["data_prefetch_efficiency"] = {
+ 'per': fPerc(prefetch_data_hits, prefetch_data_total),
+ 'num': fHits(prefetch_data_total),
+ }
+
+ if anon_hits > 0:
+ output["cache_hits_by_cache_list"] = {}
+ output["cache_hits_by_cache_list"]["anonymously_used"] = {
+ 'per': fPerc(anon_hits, arc_hits),
+ 'num': fHits(anon_hits),
+ }
+
+ output["most_recently_used"] = {
+ 'per': fPerc(mru_hits, arc_hits),
+ 'num': fHits(mru_hits),
+ }
+ output["most_frequently_used"] = {
+ 'per': fPerc(mfu_hits, arc_hits),
+ 'num': fHits(mfu_hits),
+ }
+ output["most_recently_used_ghost"] = {
+ 'per': fPerc(mru_ghost_hits, arc_hits),
+ 'num': fHits(mru_ghost_hits),
+ }
+ output["most_frequently_used_ghost"] = {
+ 'per': fPerc(mfu_ghost_hits, arc_hits),
+ 'num': fHits(mfu_ghost_hits),
+ }
+
+ output["cache_hits_by_data_type"] = {}
+ output["cache_hits_by_data_type"]["demand_data"] = {
+ 'per': fPerc(demand_data_hits, arc_hits),
+ 'num': fHits(demand_data_hits),
+ }
+ output["cache_hits_by_data_type"]["prefetch_data"] = {
+ 'per': fPerc(prefetch_data_hits, arc_hits),
+ 'num': fHits(prefetch_data_hits),
+ }
+ output["cache_hits_by_data_type"]["demand_metadata"] = {
+ 'per': fPerc(demand_metadata_hits, arc_hits),
+ 'num': fHits(demand_metadata_hits),
+ }
+ output["cache_hits_by_data_type"]["prefetch_metadata"] = {
+ 'per': fPerc(prefetch_metadata_hits, arc_hits),
+ 'num': fHits(prefetch_metadata_hits),
+ }
+
+ output["cache_misses_by_data_type"] = {}
+ output["cache_misses_by_data_type"]["demand_data"] = {
+ 'per': fPerc(demand_data_misses, arc_misses),
+ 'num': fHits(demand_data_misses),
+ }
+ output["cache_misses_by_data_type"]["prefetch_data"] = {
+ 'per': fPerc(prefetch_data_misses, arc_misses),
+ 'num': fHits(prefetch_data_misses),
+ }
+ output["cache_misses_by_data_type"]["demand_metadata"] = {
+ 'per': fPerc(demand_metadata_misses, arc_misses),
+ 'num': fHits(demand_metadata_misses),
+ }
+ output["cache_misses_by_data_type"]["prefetch_metadata"] = {
+ 'per': fPerc(prefetch_metadata_misses, arc_misses),
+ 'num': fHits(prefetch_metadata_misses),
+ }
+
+ return output
+
+
+def _arc_efficiency(Kstat):
+ """Print information on the efficiency of the ARC"""
+
+ arc = get_arc_efficiency(Kstat)
+
+ sys.stdout.write("ARC Total accesses:\t\t\t\t\t%s\n" %
+ arc['total_accesses'])
+ sys.stdout.write("\tCache Hit Ratio:\t\t%s\t%s\n" % (
+ arc['cache_hit_ratio']['per'],
+ arc['cache_hit_ratio']['num'],
+ )
+ )
+ sys.stdout.write("\tCache Miss Ratio:\t\t%s\t%s\n" % (
+ arc['cache_miss_ratio']['per'],
+ arc['cache_miss_ratio']['num'],
+ )
+ )
+
+ sys.stdout.write("\tActual Hit Ratio:\t\t%s\t%s\n" % (
+ arc['actual_hit_ratio']['per'],
+ arc['actual_hit_ratio']['num'],
+ )
+ )
+
+ sys.stdout.write("\n")
+ sys.stdout.write("\tData Demand Efficiency:\t\t%s\t%s\n" % (
+ arc['data_demand_efficiency']['per'],
+ arc['data_demand_efficiency']['num'],
+ )
+ )
+
+ if 'data_prefetch_efficiency' in arc:
+ sys.stdout.write("\tData Prefetch Efficiency:\t%s\t%s\n" % (
+ arc['data_prefetch_efficiency']['per'],
+ arc['data_prefetch_efficiency']['num'],
+ )
+ )
+ sys.stdout.write("\n")
+
+ sys.stdout.write("\tCACHE HITS BY CACHE LIST:\n")
+ if 'cache_hits_by_cache_list' in arc:
+ sys.stdout.write("\t Anonymously Used:\t\t%s\t%s\n" % (
+ arc['cache_hits_by_cache_list']['anonymously_used']['per'],
+ arc['cache_hits_by_cache_list']['anonymously_used']['num'],
+ )
+ )
+ sys.stdout.write("\t Most Recently Used:\t\t%s\t%s\n" % (
+ arc['most_recently_used']['per'],
+ arc['most_recently_used']['num'],
+ )
+ )
+ sys.stdout.write("\t Most Frequently Used:\t\t%s\t%s\n" % (
+ arc['most_frequently_used']['per'],
+ arc['most_frequently_used']['num'],
+ )
+ )
+ sys.stdout.write("\t Most Recently Used Ghost:\t%s\t%s\n" % (
+ arc['most_recently_used_ghost']['per'],
+ arc['most_recently_used_ghost']['num'],
+ )
+ )
+ sys.stdout.write("\t Most Frequently Used Ghost:\t%s\t%s\n" % (
+ arc['most_frequently_used_ghost']['per'],
+ arc['most_frequently_used_ghost']['num'],
+ )
+ )
+
+ sys.stdout.write("\n\tCACHE HITS BY DATA TYPE:\n")
+ sys.stdout.write("\t Demand Data:\t\t\t%s\t%s\n" % (
+ arc["cache_hits_by_data_type"]['demand_data']['per'],
+ arc["cache_hits_by_data_type"]['demand_data']['num'],
+ )
+ )
+ sys.stdout.write("\t Prefetch Data:\t\t%s\t%s\n" % (
+ arc["cache_hits_by_data_type"]['prefetch_data']['per'],
+ arc["cache_hits_by_data_type"]['prefetch_data']['num'],
+ )
+ )
+ sys.stdout.write("\t Demand Metadata:\t\t%s\t%s\n" % (
+ arc["cache_hits_by_data_type"]['demand_metadata']['per'],
+ arc["cache_hits_by_data_type"]['demand_metadata']['num'],
+ )
+ )
+ sys.stdout.write("\t Prefetch Metadata:\t\t%s\t%s\n" % (
+ arc["cache_hits_by_data_type"]['prefetch_metadata']['per'],
+ arc["cache_hits_by_data_type"]['prefetch_metadata']['num'],
+ )
+ )
+
+ sys.stdout.write("\n\tCACHE MISSES BY DATA TYPE:\n")
+ sys.stdout.write("\t Demand Data:\t\t\t%s\t%s\n" % (
+ arc["cache_misses_by_data_type"]['demand_data']['per'],
+ arc["cache_misses_by_data_type"]['demand_data']['num'],
+ )
+ )
+ sys.stdout.write("\t Prefetch Data:\t\t%s\t%s\n" % (
+ arc["cache_misses_by_data_type"]['prefetch_data']['per'],
+ arc["cache_misses_by_data_type"]['prefetch_data']['num'],
+ )
+ )
+ sys.stdout.write("\t Demand Metadata:\t\t%s\t%s\n" % (
+ arc["cache_misses_by_data_type"]['demand_metadata']['per'],
+ arc["cache_misses_by_data_type"]['demand_metadata']['num'],
+ )
+ )
+ sys.stdout.write("\t Prefetch Metadata:\t\t%s\t%s\n" % (
+ arc["cache_misses_by_data_type"]['prefetch_metadata']['per'],
+ arc["cache_misses_by_data_type"]['prefetch_metadata']['num'],
+ )
+ )
+
+
+def get_l2arc_summary(Kstat):
+ """Collection information on the L2ARC"""
+
+ output = {}
+
+ l2_abort_lowmem = Kstat["kstat.zfs.misc.arcstats.l2_abort_lowmem"]
+ l2_cksum_bad = Kstat["kstat.zfs.misc.arcstats.l2_cksum_bad"]
+ l2_evict_lock_retry = Kstat["kstat.zfs.misc.arcstats.l2_evict_lock_retry"]
+ l2_evict_reading = Kstat["kstat.zfs.misc.arcstats.l2_evict_reading"]
+ l2_feeds = Kstat["kstat.zfs.misc.arcstats.l2_feeds"]
+ l2_free_on_write = Kstat["kstat.zfs.misc.arcstats.l2_free_on_write"]
+ l2_hdr_size = Kstat["kstat.zfs.misc.arcstats.l2_hdr_size"]
+ l2_hits = Kstat["kstat.zfs.misc.arcstats.l2_hits"]
+ l2_io_error = Kstat["kstat.zfs.misc.arcstats.l2_io_error"]
+ l2_misses = Kstat["kstat.zfs.misc.arcstats.l2_misses"]
+ l2_rw_clash = Kstat["kstat.zfs.misc.arcstats.l2_rw_clash"]
+ l2_size = Kstat["kstat.zfs.misc.arcstats.l2_size"]
+ l2_asize = Kstat["kstat.zfs.misc.arcstats.l2_asize"]
+ l2_writes_done = Kstat["kstat.zfs.misc.arcstats.l2_writes_done"]
+ l2_writes_error = Kstat["kstat.zfs.misc.arcstats.l2_writes_error"]
+ l2_writes_sent = Kstat["kstat.zfs.misc.arcstats.l2_writes_sent"]
+
+ l2_access_total = (l2_hits + l2_misses)
+ output['l2_health_count'] = (l2_writes_error + l2_cksum_bad + l2_io_error)
+
+ output['l2_access_total'] = l2_access_total
+ output['l2_size'] = l2_size
+ output['l2_asize'] = l2_asize
+
+ if l2_size > 0 and l2_access_total > 0:
+
+ if output['l2_health_count'] > 0:
+ output["health"] = "DEGRADED"
+ else:
+ output["health"] = "HEALTHY"
+
+ output["low_memory_aborts"] = fHits(l2_abort_lowmem)
+ output["free_on_write"] = fHits(l2_free_on_write)
+ output["rw_clashes"] = fHits(l2_rw_clash)
+ output["bad_checksums"] = fHits(l2_cksum_bad)
+ output["io_errors"] = fHits(l2_io_error)
+
+ output["l2_arc_size"] = {}
+ output["l2_arc_size"]["adative"] = fBytes(l2_size)
+ output["l2_arc_size"]["actual"] = {
+ 'per': fPerc(l2_asize, l2_size),
+ 'num': fBytes(l2_asize)
+ }
+ output["l2_arc_size"]["head_size"] = {
+ 'per': fPerc(l2_hdr_size, l2_size),
+ 'num': fBytes(l2_hdr_size),
+ }
+
+ output["l2_arc_evicts"] = {}
+ output["l2_arc_evicts"]['lock_retries'] = fHits(l2_evict_lock_retry)
+ output["l2_arc_evicts"]['reading'] = fHits(l2_evict_reading)
+
+ output['l2_arc_breakdown'] = {}
+ output['l2_arc_breakdown']['value'] = fHits(l2_access_total)
+ output['l2_arc_breakdown']['hit_ratio'] = {
+ 'per': fPerc(l2_hits, l2_access_total),
+ 'num': fHits(l2_hits),
+ }
+ output['l2_arc_breakdown']['miss_ratio'] = {
+ 'per': fPerc(l2_misses, l2_access_total),
+ 'num': fHits(l2_misses),
+ }
+ output['l2_arc_breakdown']['feeds'] = fHits(l2_feeds)
+
+ output['l2_arc_buffer'] = {}
+
+ output['l2_arc_writes'] = {}
+ output['l2_writes_done'] = l2_writes_done
+ output['l2_writes_sent'] = l2_writes_sent
+ if l2_writes_done != l2_writes_sent:
+ output['l2_arc_writes']['writes_sent'] = {
+ 'value': "FAULTED",
+ 'num': fHits(l2_writes_sent),
+ }
+ output['l2_arc_writes']['done_ratio'] = {
+ 'per': fPerc(l2_writes_done, l2_writes_sent),
+ 'num': fHits(l2_writes_done),
+ }
+ output['l2_arc_writes']['error_ratio'] = {
+ 'per': fPerc(l2_writes_error, l2_writes_sent),
+ 'num': fHits(l2_writes_error),
+ }
+ else:
+ output['l2_arc_writes']['writes_sent'] = {
+ 'per': fPerc(100),
+ 'num': fHits(l2_writes_sent),
+ }
+
+ return output
+
+
+def _l2arc_summary(Kstat):
+ """Print information on the L2ARC"""
+
+ arc = get_l2arc_summary(Kstat)
+
+ if arc['l2_size'] > 0 and arc['l2_access_total'] > 0:
+ sys.stdout.write("L2 ARC Summary: ")
+ if arc['l2_health_count'] > 0:
+ sys.stdout.write("(DEGRADED)\n")
+ else:
+ sys.stdout.write("(HEALTHY)\n")
+ sys.stdout.write("\tLow Memory Aborts:\t\t\t%s\n" %
+ arc['low_memory_aborts'])
+ sys.stdout.write("\tFree on Write:\t\t\t\t%s\n" % arc['free_on_write'])
+ sys.stdout.write("\tR/W Clashes:\t\t\t\t%s\n" % arc['rw_clashes'])
+ sys.stdout.write("\tBad Checksums:\t\t\t\t%s\n" % arc['bad_checksums'])
+ sys.stdout.write("\tIO Errors:\t\t\t\t%s\n" % arc['io_errors'])
+ sys.stdout.write("\n")
+
+ sys.stdout.write("L2 ARC Size: (Adaptive)\t\t\t\t%s\n" %
+ arc["l2_arc_size"]["adative"])
+ sys.stdout.write("\tCompressed:\t\t\t%s\t%s\n" % (
+ arc["l2_arc_size"]["actual"]["per"],
+ arc["l2_arc_size"]["actual"]["num"],
+ )
+ )
+ sys.stdout.write("\tHeader Size:\t\t\t%s\t%s\n" % (
+ arc["l2_arc_size"]["head_size"]["per"],
+ arc["l2_arc_size"]["head_size"]["num"],
+ )
+ )
+ sys.stdout.write("\n")
+
+ if arc["l2_arc_evicts"]['lock_retries'] != '0' or \
+ arc["l2_arc_evicts"]["reading"] != '0':
+ sys.stdout.write("L2 ARC Evicts:\n")
+ sys.stdout.write("\tLock Retries:\t\t\t\t%s\n" %
+ arc["l2_arc_evicts"]['lock_retries'])
+ sys.stdout.write("\tUpon Reading:\t\t\t\t%s\n" %
+ arc["l2_arc_evicts"]["reading"])
+ sys.stdout.write("\n")
+
+ sys.stdout.write("L2 ARC Breakdown:\t\t\t\t%s\n" %
+ arc['l2_arc_breakdown']['value'])
+ sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % (
+ arc['l2_arc_breakdown']['hit_ratio']['per'],
+ arc['l2_arc_breakdown']['hit_ratio']['num'],
+ )
+ )
+
+ sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % (
+ arc['l2_arc_breakdown']['miss_ratio']['per'],
+ arc['l2_arc_breakdown']['miss_ratio']['num'],
+ )
+ )
+
+ sys.stdout.write("\tFeeds:\t\t\t\t\t%s\n" %
+ arc['l2_arc_breakdown']['feeds'])
+ sys.stdout.write("\n")
+
+ sys.stdout.write("L2 ARC Writes:\n")
+ if arc['l2_writes_done'] != arc['l2_writes_sent']:
+ sys.stdout.write("\tWrites Sent: (%s)\t\t\t\t%s\n" % (
+ arc['l2_arc_writes']['writes_sent']['value'],
+ arc['l2_arc_writes']['writes_sent']['num'],
+ )
+ )
+ sys.stdout.write("\t Done Ratio:\t\t\t%s\t%s\n" % (
+ arc['l2_arc_writes']['done_ratio']['per'],
+ arc['l2_arc_writes']['done_ratio']['num'],
+ )
+ )
+ sys.stdout.write("\t Error Ratio:\t\t\t%s\t%s\n" % (
+ arc['l2_arc_writes']['error_ratio']['per'],
+ arc['l2_arc_writes']['error_ratio']['num'],
+ )
+ )
+ else:
+ sys.stdout.write("\tWrites Sent:\t\t\t%s\t%s\n" % (
+ arc['l2_arc_writes']['writes_sent']['per'],
+ arc['l2_arc_writes']['writes_sent']['num'],
+ )
+ )
+
+
+def get_dmu_summary(Kstat):
+ """Collect information on the DMU"""
+
+ output = {}
+
+ zfetch_hits = Kstat["kstat.zfs.misc.zfetchstats.hits"]
+ zfetch_misses = Kstat["kstat.zfs.misc.zfetchstats.misses"]
+
+ zfetch_access_total = (zfetch_hits + zfetch_misses)
+ output['zfetch_access_total'] = zfetch_access_total
+
+ if zfetch_access_total > 0:
+ output['dmu'] = {}
+ output['dmu']['efficiency'] = {}
+ output['dmu']['efficiency']['value'] = fHits(zfetch_access_total)
+ output['dmu']['efficiency']['hit_ratio'] = {
+ 'per': fPerc(zfetch_hits, zfetch_access_total),
+ 'num': fHits(zfetch_hits),
+ }
+ output['dmu']['efficiency']['miss_ratio'] = {
+ 'per': fPerc(zfetch_misses, zfetch_access_total),
+ 'num': fHits(zfetch_misses),
+ }
+
+ return output
+
+
+def _dmu_summary(Kstat):
+ """Print information on the DMU"""
+
+ arc = get_dmu_summary(Kstat)
+
+ if arc['zfetch_access_total'] > 0:
+ sys.stdout.write("DMU Prefetch Efficiency:\t\t\t\t\t%s\n" %
+ arc['dmu']['efficiency']['value'])
+ sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % (
+ arc['dmu']['efficiency']['hit_ratio']['per'],
+ arc['dmu']['efficiency']['hit_ratio']['num'],
+ )
+ )
+ sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % (
+ arc['dmu']['efficiency']['miss_ratio']['per'],
+ arc['dmu']['efficiency']['miss_ratio']['num'],
+ )
+ )
+
+ sys.stdout.write("\n")
+
+
+def get_vdev_summary(Kstat):
+ """Collect information on the VDEVs"""
+
+ output = {}
+
+ vdev_cache_delegations = \
+ Kstat["kstat.zfs.misc.vdev_cache_stats.delegations"]
+ vdev_cache_misses = Kstat["kstat.zfs.misc.vdev_cache_stats.misses"]
+ vdev_cache_hits = Kstat["kstat.zfs.misc.vdev_cache_stats.hits"]
+ vdev_cache_total = (vdev_cache_misses + vdev_cache_hits +
+ vdev_cache_delegations)
+
+ output['vdev_cache_total'] = vdev_cache_total
+
+ if vdev_cache_total > 0:
+ output['summary'] = fHits(vdev_cache_total)
+ output['hit_ratio'] = {
+ 'per': fPerc(vdev_cache_hits, vdev_cache_total),
+ 'num': fHits(vdev_cache_hits),
+ }
+ output['miss_ratio'] = {
+ 'per': fPerc(vdev_cache_misses, vdev_cache_total),
+ 'num': fHits(vdev_cache_misses),
+ }
+ output['delegations'] = {
+ 'per': fPerc(vdev_cache_delegations, vdev_cache_total),
+ 'num': fHits(vdev_cache_delegations),
+ }
+
+ return output
+
+
+def _vdev_summary(Kstat):
+ """Print information on the VDEVs"""
+
+ arc = get_vdev_summary(Kstat)
+
+ if arc['vdev_cache_total'] > 0:
+ sys.stdout.write("VDEV Cache Summary:\t\t\t\t%s\n" % arc['summary'])
+ sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % (
+ arc['hit_ratio']['per'],
+ arc['hit_ratio']['num'],
+ ))
+ sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % (
+ arc['miss_ratio']['per'],
+ arc['miss_ratio']['num'],
+ ))
+ sys.stdout.write("\tDelegations:\t\t\t%s\t%s\n" % (
+ arc['delegations']['per'],
+ arc['delegations']['num'],
+ ))
+
+
+def _tunable_summary(Kstat):
+ """Print information on tunables, including descriptions if requested"""
+
+ global show_tunable_descriptions
+ global alternate_tunable_layout
+
+ tunables = load_tunables()
+ descriptions = {}
+
+ if show_tunable_descriptions:
+
+ command = ["/sbin/modinfo", "zfs", "-0"]
+
+ try:
+ p = Popen(command, stdin=PIPE, stdout=PIPE,
+ stderr=PIPE, shell=False, close_fds=True)
+ p.wait()
+
+ # By default, Python 2 returns a string as the first element of the
+ # tuple from p.communicate(), while Python 3 returns bytes which
+ # must be decoded first. The better way to do this would be with
+ # subprocess.run() or at least .check_output(), but this fails on
+ # CentOS 6 because of its old version of Python 2
+ desc = bytes.decode(p.communicate()[0])
+ description_list = desc.strip().split('\0')
+
+ if p.returncode == 0:
+ for tunable in description_list:
+ if tunable[0:5] == 'parm:':
+ tunable = tunable[5:].strip()
+ name, description = tunable.split(':', 1)
+ if not description:
+ description = "Description unavailable"
+ descriptions[name] = description
+ else:
+ sys.stderr.write("%s: '%s' exited with code %i\n" %
+ (sys.argv[0], command[0], p.returncode))
+ sys.stderr.write("Tunable descriptions will be disabled.\n")
+ except OSError as e:
+ sys.stderr.write("%s: Cannot run '%s': %s\n" %
+ (sys.argv[0], command[0], e.strerror))
+ sys.stderr.write("Tunable descriptions will be disabled.\n")
+
+ sys.stdout.write("ZFS Tunables:\n")
+
+ if alternate_tunable_layout:
+ fmt = "\t%s=%s\n"
+ else:
+ fmt = "\t%-50s%s\n"
+
+ for name in sorted(tunables.keys()):
+ if show_tunable_descriptions and name in descriptions:
+ sys.stdout.write("\t# %s\n" % descriptions[name])
+
+ sys.stdout.write(fmt % (name, tunables[name]))
+
+
+unSub = [
+ _arc_summary,
+ _arc_efficiency,
+ _l2arc_summary,
+ _dmu_summary,
+ _vdev_summary,
+ _tunable_summary
+]
+
+
+def zfs_header():
+ """Print title string with date"""
+
+ daydate = time.strftime('%a %b %d %H:%M:%S %Y')
+
+ sys.stdout.write('\n'+'-'*72+'\n')
+ sys.stdout.write('ZFS Subsystem Report\t\t\t\t%s' % daydate)
+ sys.stdout.write('\n')
+
+
+def usage():
+ """Print usage information"""
+
+ sys.stdout.write("Usage: arc_summary [-h] [-a] [-d] [-p PAGE]\n\n")
+ sys.stdout.write("\t -h, --help : "
+ "Print this help message and exit\n")
+ sys.stdout.write("\t -a, --alternate : "
+ "Show an alternate sysctl layout\n")
+ sys.stdout.write("\t -d, --description : "
+ "Show the sysctl descriptions\n")
+ sys.stdout.write("\t -p PAGE, --page=PAGE : "
+ "Select a single output page to display,\n")
+ sys.stdout.write("\t "
+ "should be an integer between 1 and " +
+ str(len(unSub)) + "\n\n")
+ sys.stdout.write("Examples:\n")
+ sys.stdout.write("\tarc_summary -a\n")
+ sys.stdout.write("\tarc_summary -p 4\n")
+ sys.stdout.write("\tarc_summary -ad\n")
+ sys.stdout.write("\tarc_summary --page=2\n")
+
+
+def main():
+ """Main function"""
+
+ global show_tunable_descriptions
+ global alternate_tunable_layout
+
+ try:
+ opts, args = getopt.getopt(
+ sys.argv[1:],
+ "adp:h", ["alternate", "description", "page=", "help"]
+ )
+ except getopt.error as e:
+ sys.stderr.write("Error: %s\n" % e.msg)
+ usage()
+ sys.exit(1)
+
+ args = {}
+ for opt, arg in opts:
+ if opt in ('-a', '--alternate'):
+ args['a'] = True
+ if opt in ('-d', '--description'):
+ args['d'] = True
+ if opt in ('-p', '--page'):
+ args['p'] = arg
+ if opt in ('-h', '--help'):
+ usage()
+ sys.exit(0)
+
+ Kstat = get_Kstat()
+
+ alternate_tunable_layout = 'a' in args
+ show_tunable_descriptions = 'd' in args
+
+ pages = []
+
+ if 'p' in args:
+ try:
+ pages.append(unSub[int(args['p']) - 1])
+ except IndexError:
+ sys.stderr.write('the argument to -p must be between 1 and ' +
+ str(len(unSub)) + '\n')
+ sys.exit(1)
+ else:
+ pages = unSub
+
+ zfs_header()
+ for page in pages:
+ page(Kstat)
+ sys.stdout.write("\n")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/sys/contrib/openzfs/cmd/arc_summary/arc_summary3 b/sys/contrib/openzfs/cmd/arc_summary/arc_summary3
new file mode 100755
index 000000000000..c920b8e5395d
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arc_summary/arc_summary3
@@ -0,0 +1,943 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2008 Ben Rockwood <benr@cuddletech.com>,
+# Copyright (c) 2010 Martin Matuska <mm@FreeBSD.org>,
+# Copyright (c) 2010-2011 Jason J. Hellenthal <jhell@DataIX.net>,
+# Copyright (c) 2017 Scot W. Stevenson <scot.stevenson@gmail.com>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+"""Print statistics on the ZFS ARC Cache and other information
+
+Provides basic information on the ARC, its efficiency, the L2ARC (if present),
+the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See
+the in-source documentation and code at
+https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details.
+The original introduction to arc_summary can be found at
+http://cuddletech.com/?p=454
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+
+DESCRIPTION = 'Print ARC and other statistics for ZFS on Linux'
+INDENT = ' '*8
+LINE_LENGTH = 72
+DATE_FORMAT = '%a %b %d %H:%M:%S %Y'
+TITLE = 'ZFS Subsystem Report'
+
+SECTIONS = 'arc archits dmu l2arc spl tunables vdev zil'.split()
+SECTION_HELP = 'print info from one section ('+' '.join(SECTIONS)+')'
+
+# Tunables and SPL are handled separately because they come from
+# different sources
+SECTION_PATHS = {'arc': 'arcstats',
+ 'dmu': 'dmu_tx',
+ 'l2arc': 'arcstats', # L2ARC stuff lives in arcstats
+ 'vdev': 'vdev_cache_stats',
+ 'xuio': 'xuio_stats',
+ 'zfetch': 'zfetchstats',
+ 'zil': 'zil'}
+
+parser = argparse.ArgumentParser(description=DESCRIPTION)
+parser.add_argument('-a', '--alternate', action='store_true', default=False,
+ help='use alternate formatting for tunables and SPL',
+ dest='alt')
+parser.add_argument('-d', '--description', action='store_true', default=False,
+ help='print descriptions with tunables and SPL',
+ dest='desc')
+parser.add_argument('-g', '--graph', action='store_true', default=False,
+ help='print graph on ARC use and exit', dest='graph')
+parser.add_argument('-p', '--page', type=int, dest='page',
+ help='print page by number (DEPRECATED, use "-s")')
+parser.add_argument('-r', '--raw', action='store_true', default=False,
+ help='dump all available data with minimal formatting',
+ dest='raw')
+parser.add_argument('-s', '--section', dest='section', help=SECTION_HELP)
+ARGS = parser.parse_args()
+
+
+if sys.platform.startswith('freebsd'):
+ # Requires py36-sysctl on FreeBSD
+ import sysctl
+
+ VDEV_CACHE_SIZE = 'vdev.cache_size'
+
+ def load_kstats(section):
+ base = 'kstat.zfs.misc.{section}.'.format(section=section)
+ # base is removed from the name
+ fmt = lambda kstat: '{name} : {value}'.format(name=kstat.name[len(base):],
+ value=kstat.value)
+ return [fmt(kstat) for kstat in sysctl.filter(base)]
+
+ def get_params(base):
+ cut = 8 # = len('vfs.zfs.')
+ return {ctl.name[cut:]: str(ctl.value) for ctl in sysctl.filter(base)}
+
+ def get_tunable_params():
+ return get_params('vfs.zfs')
+
+ def get_vdev_params():
+ return get_params('vfs.zfs.vdev')
+
+ def get_version_impl(request):
+ # FreeBSD reports versions for zpl and spa instead of zfs and spl.
+ name = {'zfs': 'zpl',
+ 'spl': 'spa'}[request]
+ mib = 'vfs.zfs.version.{}'.format(name)
+ version = sysctl.filter(mib)[0].value
+ return '{} version {}'.format(name, version)
+
+ def get_descriptions(_request):
+ # py-sysctl doesn't give descriptions, so we have to shell out.
+ command = ['sysctl', '-d', 'vfs.zfs']
+
+ # The recommended way to do this is with subprocess.run(). However,
+ # some installed versions of Python are < 3.5, so we offer them
+ # the option of doing it the old way (for now)
+ if 'run' in dir(subprocess):
+ info = subprocess.run(command, stdout=subprocess.PIPE,
+ universal_newlines=True)
+ lines = info.stdout.split('\n')
+ else:
+ info = subprocess.check_output(command, universal_newlines=True)
+ lines = info.split('\n')
+
+ def fmt(line):
+ name, desc = line.split(':', 1)
+ return (name.strip(), desc.strip())
+
+ return dict([fmt(line) for line in lines if len(line) > 0])
+
+
+elif sys.platform.startswith('linux'):
+ KSTAT_PATH = '/proc/spl/kstat/zfs'
+ SPL_PATH = '/sys/module/spl/parameters'
+ TUNABLES_PATH = '/sys/module/zfs/parameters'
+
+ VDEV_CACHE_SIZE = 'zfs_vdev_cache_size'
+
+ def load_kstats(section):
+ path = os.path.join(KSTAT_PATH, section)
+ with open(path) as f:
+ return list(f)[2:] # Get rid of header
+
+ def get_params(basepath):
+ """Collect information on the Solaris Porting Layer (SPL) or the
+ tunables, depending on the PATH given. Does not check if PATH is
+ legal.
+ """
+ result = {}
+ for name in os.listdir(basepath):
+ path = os.path.join(basepath, name)
+ with open(path) as f:
+ value = f.read()
+ result[name] = value.strip()
+ return result
+
+ def get_spl_params():
+ return get_params(SPL_PATH)
+
+ def get_tunable_params():
+ return get_params(TUNABLES_PATH)
+
+ def get_vdev_params():
+ return get_params(TUNABLES_PATH)
+
+ def get_version_impl(request):
+ # The original arc_summary called /sbin/modinfo {spl,zfs} to get
+ # the version information. We switch to /sys/module/{spl,zfs}/version
+ # to make sure we get what is really loaded in the kernel
+ command = ["cat", "/sys/module/{0}/version".format(request)]
+ req = request.upper()
+
+ # The recommended way to do this is with subprocess.run(). However,
+ # some installed versions of Python are < 3.5, so we offer them
+ # the option of doing it the old way (for now)
+ if 'run' in dir(subprocess):
+ info = subprocess.run(command, stdout=subprocess.PIPE,
+ universal_newlines=True)
+ version = info.stdout.strip()
+ else:
+ info = subprocess.check_output(command, universal_newlines=True)
+ version = info.strip()
+
+ return version
+
+ def get_descriptions(request):
+ """Get the descriptions of the Solaris Porting Layer (SPL) or the
+ tunables, return with minimal formatting.
+ """
+
+ if request not in ('spl', 'zfs'):
+ print('ERROR: description of "{0}" requested'.format(request))
+ sys.exit(1)
+
+ descs = {}
+ target_prefix = 'parm:'
+
+ # We would prefer to do this with /sys/modules -- see the discussion at
+ # get_version() -- but there isn't a way to get the descriptions from
+ # there, so we fall back on modinfo
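+ # modinfo -0 prints NUL-separated "field:value" pairs; parameter
+ # entries look roughly like "parm:zfs_arc_max:Maximum ARC size (ulong)",
+ # and the trailing "(type)" hint is stripped off below.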
+ command = ["/sbin/modinfo", request, "-0"]
+
+ # The recommended way to do this is with subprocess.run(). However,
+ # some installed versions of Python are < 3.5, so we offer them
+ # the option of doing it the old way (for now)
+ info = ''
+
+ try:
+
+ if 'run' in dir(subprocess):
+ info = subprocess.run(command, stdout=subprocess.PIPE,
+ universal_newlines=True)
+ raw_output = info.stdout.split('\0')
+ else:
+ info = subprocess.check_output(command,
+ universal_newlines=True)
+ raw_output = info.split('\0')
+
+ except subprocess.CalledProcessError:
+ print("Error: Descriptions not available",
+ "(can't access kernel module)")
+ sys.exit(1)
+
+ for line in raw_output:
+
+ if not line.startswith(target_prefix):
+ continue
+
+ line = line[len(target_prefix):].strip()
+ name, raw_desc = line.split(':', 1)
+ desc = raw_desc.rsplit('(', 1)[0]
+
+ if desc == '':
+ desc = '(No description found)'
+
+ descs[name.strip()] = desc.strip()
+
+ return descs
+
+
+def cleanup_line(single_line):
+ """Format a raw line of data from /proc and isolate the name value
+ part, returning a tuple with each. Currently, this gets rid of the
+ middle '4'. For example "arc_no_grow 4 0" returns the tuple
+ ("arc_no_grow", "0").
+ """
+ name, _, value = single_line.split()
+
+ return name, value
+
+
+def draw_graph(kstats_dict):
+ """Draw a primitive graph representing the basic information on the
+ ARC -- its size and the proportion used by MFU and MRU -- and quit.
+ We use max size of the ARC to calculate how full it is. This is a
+ very rough representation.
+ """
+
+ arc_stats = isolate_section('arcstats', kstats_dict)
+
+ GRAPH_INDENT = ' '*4
+ GRAPH_WIDTH = 60
+ arc_size = f_bytes(arc_stats['size'])
+ arc_perc = f_perc(arc_stats['size'], arc_stats['c_max'])
+ mfu_size = f_bytes(arc_stats['mfu_size'])
+ mru_size = f_bytes(arc_stats['mru_size'])
+ meta_limit = f_bytes(arc_stats['arc_meta_limit'])
+ meta_size = f_bytes(arc_stats['arc_meta_used'])
+ dnode_limit = f_bytes(arc_stats['arc_dnode_limit'])
+ dnode_size = f_bytes(arc_stats['dnode_size'])
+
+ info_form = ('ARC: {0} ({1}) MFU: {2} MRU: {3} META: {4} ({5}) '
+ 'DNODE {6} ({7})')
+ info_line = info_form.format(arc_size, arc_perc, mfu_size, mru_size,
+ meta_size, meta_limit, dnode_size,
+ dnode_limit)
+ info_spc = ' '*int((GRAPH_WIDTH-len(info_line))/2)
+ info_line = GRAPH_INDENT+info_spc+info_line
+
+ graph_line = GRAPH_INDENT+'+'+('-'*(GRAPH_WIDTH-2))+'+'
+
+ mfu_perc = float(int(arc_stats['mfu_size'])/int(arc_stats['c_max']))
+ mru_perc = float(int(arc_stats['mru_size'])/int(arc_stats['c_max']))
+ arc_perc = float(int(arc_stats['size'])/int(arc_stats['c_max']))
+ total_ticks = float(arc_perc)*GRAPH_WIDTH
+ mfu_ticks = mfu_perc*GRAPH_WIDTH
+ mru_ticks = mru_perc*GRAPH_WIDTH
+ other_ticks = total_ticks-(mfu_ticks+mru_ticks)
+
+ core_form = 'F'*int(mfu_ticks)+'R'*int(mru_ticks)+'O'*int(other_ticks)
+ core_spc = ' '*(GRAPH_WIDTH-(2+len(core_form)))
+ core_line = GRAPH_INDENT+'|'+core_form+core_spc+'|'
+
+ for line in ('', info_line, graph_line, core_line, graph_line, ''):
+ print(line)
+
+
+def f_bytes(byte_string):
+ """Return human-readable representation of a byte value in
+ powers of 2 (eg "KiB" for "kibibytes", etc) to two decimal
+ points. Values smaller than one KiB are returned without
+ decimal points. Note "bytes" is a reserved keyword.
+ """
+
+ prefixes = ([2**80, "YiB"], # yobibytes (yotta)
+ [2**70, "ZiB"], # zebibytes (zetta)
+ [2**60, "EiB"], # exbibytes (exa)
+ [2**50, "PiB"], # pebibytes (peta)
+ [2**40, "TiB"], # tebibytes (tera)
+ [2**30, "GiB"], # gibibytes (giga)
+ [2**20, "MiB"], # mebibytes (mega)
+ [2**10, "KiB"]) # kibibytes (kilo)
+
+ bites = int(byte_string)
+
+ if bites >= 2**10:
+ for limit, unit in prefixes:
+
+ if bites >= limit:
+ value = bites / limit
+ break
+
+ result = '{0:.1f} {1}'.format(value, unit)
+ else:
+ result = '{0} Bytes'.format(bites)
+
+ return result
+
+
+def f_hits(hits_string):
+ """Create a human-readable representation of the number of hits.
+ The single-letter symbols used are SI to avoid the confusion caused
+ by the different "short scale" and "long scale" representations in
+ English, which use the same words for different values. See
+ https://en.wikipedia.org/wiki/Names_of_large_numbers and:
+ https://physics.nist.gov/cuu/Units/prefixes.html
+ """
+
+ numbers = ([10**24, 'Y'], # yotta (septillion)
+ [10**21, 'Z'], # zetta (sextillion)
+ [10**18, 'E'], # exa (quintrillion)
+ [10**15, 'P'], # peta (quadrillion)
+ [10**12, 'T'], # tera (trillion)
+ [10**9, 'G'], # giga (billion)
+ [10**6, 'M'], # mega (million)
+ [10**3, 'k']) # kilo (thousand)
+
+ hits = int(hits_string)
+
+ if hits >= 1000:
+ for limit, symbol in numbers:
+
+ if hits >= limit:
+ value = hits/limit
+ break
+
+ result = "%0.1f%s" % (value, symbol)
+ else:
+ result = "%d" % hits
+
+ return result
+
+
+def f_perc(value1, value2):
+ """Calculate percentage and return in human-readable form. If
+    rounding produces the result '0.0' even though the first number is
+    not zero, include a 'less-than' symbol to avoid confusion.
+    Division by zero is handled by returning 'n/a'; no error
+    is raised.
+ """
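+    # e.g. f_perc(512, 1024) -> '50.0 %', f_perc(1, 10000) -> '< 0.1 %',
+    # f_perc(1, 0) -> 'n/a'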
+
+ v1 = float(value1)
+ v2 = float(value2)
+
+ try:
+ perc = 100 * v1/v2
+ except ZeroDivisionError:
+ result = 'n/a'
+ else:
+ result = '{0:0.1f} %'.format(perc)
+
+ if result == '0.0 %' and v1 > 0:
+ result = '< 0.1 %'
+
+ return result
+
+
+def format_raw_line(name, value):
+ """For the --raw option for the tunable and SPL outputs, decide on the
+ correct formatting based on the --alternate flag.
+ """
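+    # e.g. with --alternate the line is '<INDENT>name=value'; otherwise the
+    # name is left-justified and the value right-aligned to LINE_LENGTH
+    # columns (assuming the name fits in the remaining space).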
+
+ if ARGS.alt:
+ result = '{0}{1}={2}'.format(INDENT, name, value)
+ else:
+ spc = LINE_LENGTH-(len(INDENT)+len(value))
+ result = '{0}{1:<{spc}}{2}'.format(INDENT, name, value, spc=spc)
+
+ return result
+
+
+def get_kstats():
+ """Collect information on the ZFS subsystem. The step does not perform any
+ further processing, giving us the option to only work on what is actually
+ needed. The name "kstat" is a holdover from the Solaris utility of the same
+ name.
+ """
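+    # The result maps each section name (e.g. 'arcstats', 'zfetchstats') to
+    # the unprocessed output of load_kstats() for that section.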
+
+ result = {}
+
+ for section in SECTION_PATHS.values():
+ if section not in result:
+ result[section] = load_kstats(section)
+
+ return result
+
+
+def get_version(request):
+ """Get the version number of ZFS or SPL on this machine for header.
+ Returns an error string, but does not raise an error, if we can't
+ get the ZFS/SPL version.
+ """
+
+ if request not in ('spl', 'zfs'):
+ error_msg = '(ERROR: "{0}" requested)'.format(request)
+ return error_msg
+
+ return get_version_impl(request)
+
+
+def print_header():
+ """Print the initial heading with date and time as well as info on the
+ kernel and ZFS versions. This is not called for the graph.
+ """
+
+ # datetime is now recommended over time but we keep the exact formatting
+ # from the older version of arc_summary in case there are scripts
+    # that expect this exact format
+ daydate = time.strftime(DATE_FORMAT)
+ spc_date = LINE_LENGTH-len(daydate)
+ sys_version = os.uname()
+
+ sys_msg = sys_version.sysname+' '+sys_version.release
+ zfs = get_version('zfs')
+ spc_zfs = LINE_LENGTH-len(zfs)
+
+ machine_msg = 'Machine: '+sys_version.nodename+' ('+sys_version.machine+')'
+ spl = get_version('spl')
+ spc_spl = LINE_LENGTH-len(spl)
+
+ print('\n'+('-'*LINE_LENGTH))
+ print('{0:<{spc}}{1}'.format(TITLE, daydate, spc=spc_date))
+ print('{0:<{spc}}{1}'.format(sys_msg, zfs, spc=spc_zfs))
+ print('{0:<{spc}}{1}\n'.format(machine_msg, spl, spc=spc_spl))
+
+
+def print_raw(kstats_dict):
+ """Print all available data from the system in a minimally sorted format.
+ This can be used as a source to be piped through 'grep'.
+ """
+
+ sections = sorted(kstats_dict.keys())
+
+ for section in sections:
+
+ print('\n{0}:'.format(section.upper()))
+ lines = sorted(kstats_dict[section])
+
+ for line in lines:
+ name, value = cleanup_line(line)
+ print(format_raw_line(name, value))
+
+ # Tunables and SPL must be handled separately because they come from a
+ # different source and have descriptions the user might request
+ print()
+ section_spl()
+ section_tunables()
+
+
+def isolate_section(section_name, kstats_dict):
+ """From the complete information on all sections, retrieve only those
+ for one section.
+ """
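+    # e.g. isolate_section('arcstats', kstats_dict) returns a plain dict of
+    # name/value pairs such as {'size': ..., 'c_max': ..., ...}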
+
+ try:
+ section_data = kstats_dict[section_name]
+ except KeyError:
+        print('ERROR: Data on {0} not available'.format(section_name))
+ sys.exit(1)
+
+ section_dict = dict(cleanup_line(l) for l in section_data)
+
+ return section_dict
+
+
+# Formatted output helper functions
+
+
+def prt_1(text, value):
+ """Print text and one value, no indent"""
+ spc = ' '*(LINE_LENGTH-(len(text)+len(value)))
+ print('{0}{spc}{1}'.format(text, value, spc=spc))
+
+
+def prt_i1(text, value):
+ """Print text and one value, with indent"""
+ spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(value)))
+ print(INDENT+'{0}{spc}{1}'.format(text, value, spc=spc))
+
+
+def prt_2(text, value1, value2):
+ """Print text and two values, no indent"""
+ values = '{0:>9} {1:>9}'.format(value1, value2)
+ spc = ' '*(LINE_LENGTH-(len(text)+len(values)+2))
+ print('{0}{spc} {1}'.format(text, values, spc=spc))
+
+
+def prt_i2(text, value1, value2):
+ """Print text and two values, with indent"""
+ values = '{0:>9} {1:>9}'.format(value1, value2)
+ spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(values)+2))
+ print(INDENT+'{0}{spc} {1}'.format(text, values, spc=spc))
+
+
+# The section output concentrates on important parameters instead of
+# being exhaustive (that is what the --raw parameter is for)
+
+
+def section_arc(kstats_dict):
+ """Give basic information on the ARC, MRU and MFU. This is the first
+ and most used section.
+ """
+
+ arc_stats = isolate_section('arcstats', kstats_dict)
+
+ throttle = arc_stats['memory_throttle_count']
+
+ if throttle == '0':
+ health = 'HEALTHY'
+ else:
+ health = 'THROTTLED'
+
+ prt_1('ARC status:', health)
+ prt_i1('Memory throttle count:', throttle)
+ print()
+
+ arc_size = arc_stats['size']
+ arc_target_size = arc_stats['c']
+ arc_max = arc_stats['c_max']
+ arc_min = arc_stats['c_min']
+ mfu_size = arc_stats['mfu_size']
+ mru_size = arc_stats['mru_size']
+ meta_limit = arc_stats['arc_meta_limit']
+ meta_size = arc_stats['arc_meta_used']
+ dnode_limit = arc_stats['arc_dnode_limit']
+ dnode_size = arc_stats['dnode_size']
+ target_size_ratio = '{0}:1'.format(int(arc_max) // int(arc_min))
+
+ prt_2('ARC size (current):',
+ f_perc(arc_size, arc_max), f_bytes(arc_size))
+ prt_i2('Target size (adaptive):',
+ f_perc(arc_target_size, arc_max), f_bytes(arc_target_size))
+ prt_i2('Min size (hard limit):',
+ f_perc(arc_min, arc_max), f_bytes(arc_min))
+ prt_i2('Max size (high water):',
+ target_size_ratio, f_bytes(arc_max))
+ caches_size = int(mfu_size)+int(mru_size)
+ prt_i2('Most Frequently Used (MFU) cache size:',
+ f_perc(mfu_size, caches_size), f_bytes(mfu_size))
+ prt_i2('Most Recently Used (MRU) cache size:',
+ f_perc(mru_size, caches_size), f_bytes(mru_size))
+ prt_i2('Metadata cache size (hard limit):',
+ f_perc(meta_limit, arc_max), f_bytes(meta_limit))
+ prt_i2('Metadata cache size (current):',
+ f_perc(meta_size, meta_limit), f_bytes(meta_size))
+ prt_i2('Dnode cache size (hard limit):',
+ f_perc(dnode_limit, meta_limit), f_bytes(dnode_limit))
+ prt_i2('Dnode cache size (current):',
+ f_perc(dnode_size, dnode_limit), f_bytes(dnode_size))
+ print()
+
+ print('ARC hash breakdown:')
+ prt_i1('Elements max:', f_hits(arc_stats['hash_elements_max']))
+ prt_i2('Elements current:',
+ f_perc(arc_stats['hash_elements'], arc_stats['hash_elements_max']),
+ f_hits(arc_stats['hash_elements']))
+ prt_i1('Collisions:', f_hits(arc_stats['hash_collisions']))
+
+ prt_i1('Chain max:', f_hits(arc_stats['hash_chain_max']))
+ prt_i1('Chains:', f_hits(arc_stats['hash_chains']))
+ print()
+
+ print('ARC misc:')
+ prt_i1('Deleted:', f_hits(arc_stats['deleted']))
+ prt_i1('Mutex misses:', f_hits(arc_stats['mutex_miss']))
+ prt_i1('Eviction skips:', f_hits(arc_stats['evict_skip']))
+ print()
+
+
+def section_archits(kstats_dict):
+ """Print information on how the caches are accessed ("arc hits").
+ """
+
+ arc_stats = isolate_section('arcstats', kstats_dict)
+ all_accesses = int(arc_stats['hits'])+int(arc_stats['misses'])
+ actual_hits = int(arc_stats['mfu_hits'])+int(arc_stats['mru_hits'])
+
+ prt_1('ARC total accesses (hits + misses):', f_hits(all_accesses))
+ ta_todo = (('Cache hit ratio:', arc_stats['hits']),
+ ('Cache miss ratio:', arc_stats['misses']),
+ ('Actual hit ratio (MFU + MRU hits):', actual_hits))
+
+ for title, value in ta_todo:
+ prt_i2(title, f_perc(value, all_accesses), f_hits(value))
+
+ dd_total = int(arc_stats['demand_data_hits']) +\
+ int(arc_stats['demand_data_misses'])
+ prt_i2('Data demand efficiency:',
+ f_perc(arc_stats['demand_data_hits'], dd_total),
+ f_hits(dd_total))
+
+ dp_total = int(arc_stats['prefetch_data_hits']) +\
+ int(arc_stats['prefetch_data_misses'])
+ prt_i2('Data prefetch efficiency:',
+ f_perc(arc_stats['prefetch_data_hits'], dp_total),
+ f_hits(dp_total))
+
+ known_hits = int(arc_stats['mfu_hits']) +\
+ int(arc_stats['mru_hits']) +\
+ int(arc_stats['mfu_ghost_hits']) +\
+ int(arc_stats['mru_ghost_hits'])
+
+ anon_hits = int(arc_stats['hits'])-known_hits
+
+ print()
+ print('Cache hits by cache type:')
+ cl_todo = (('Most frequently used (MFU):', arc_stats['mfu_hits']),
+ ('Most recently used (MRU):', arc_stats['mru_hits']),
+ ('Most frequently used (MFU) ghost:',
+ arc_stats['mfu_ghost_hits']),
+ ('Most recently used (MRU) ghost:',
+ arc_stats['mru_ghost_hits']))
+
+ for title, value in cl_todo:
+ prt_i2(title, f_perc(value, arc_stats['hits']), f_hits(value))
+
+ # For some reason, anon_hits can turn negative, which is weird. Until we
+ # have figured out why this happens, we just hide the problem, following
+ # the behavior of the original arc_summary.
+ if anon_hits >= 0:
+ prt_i2('Anonymously used:',
+ f_perc(anon_hits, arc_stats['hits']), f_hits(anon_hits))
+
+ print()
+ print('Cache hits by data type:')
+    dt_todo = (('Demand data:', arc_stats['demand_data_hits']),
+               ('Prefetch data:', arc_stats['prefetch_data_hits']),
+               ('Demand metadata:', arc_stats['demand_metadata_hits']),
+               ('Prefetch metadata:', arc_stats['prefetch_metadata_hits']))
+
+ for title, value in dt_todo:
+ prt_i2(title, f_perc(value, arc_stats['hits']), f_hits(value))
+
+ print()
+ print('Cache misses by data type:')
+    dm_todo = (('Demand data:', arc_stats['demand_data_misses']),
+               ('Prefetch data:', arc_stats['prefetch_data_misses']),
+               ('Demand metadata:', arc_stats['demand_metadata_misses']),
+               ('Prefetch metadata:',
+                arc_stats['prefetch_metadata_misses']))
+
+ for title, value in dm_todo:
+ prt_i2(title, f_perc(value, arc_stats['misses']), f_hits(value))
+
+ print()
+
+
+def section_dmu(kstats_dict):
+ """Collect information on the DMU"""
+
+ zfetch_stats = isolate_section('zfetchstats', kstats_dict)
+
+ zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses'])
+
+ prt_1('DMU prefetch efficiency:', f_hits(zfetch_access_total))
+ prt_i2('Hit ratio:', f_perc(zfetch_stats['hits'], zfetch_access_total),
+ f_hits(zfetch_stats['hits']))
+ prt_i2('Miss ratio:', f_perc(zfetch_stats['misses'], zfetch_access_total),
+ f_hits(zfetch_stats['misses']))
+ print()
+
+
+def section_l2arc(kstats_dict):
+ """Collect information on L2ARC device if present. If not, tell user
+ that we're skipping the section.
+ """
+
+ # The L2ARC statistics live in the same section as the normal ARC stuff
+ arc_stats = isolate_section('arcstats', kstats_dict)
+
+ if arc_stats['l2_size'] == '0':
+ print('L2ARC not detected, skipping section\n')
+ return
+
+ l2_errors = int(arc_stats['l2_writes_error']) +\
+ int(arc_stats['l2_cksum_bad']) +\
+ int(arc_stats['l2_io_error'])
+
+ l2_access_total = int(arc_stats['l2_hits'])+int(arc_stats['l2_misses'])
+ health = 'HEALTHY'
+
+ if l2_errors > 0:
+ health = 'DEGRADED'
+
+ prt_1('L2ARC status:', health)
+
+ l2_todo = (('Low memory aborts:', 'l2_abort_lowmem'),
+ ('Free on write:', 'l2_free_on_write'),
+ ('R/W clashes:', 'l2_rw_clash'),
+ ('Bad checksums:', 'l2_cksum_bad'),
+ ('I/O errors:', 'l2_io_error'))
+
+ for title, value in l2_todo:
+ prt_i1(title, f_hits(arc_stats[value]))
+
+ print()
+ prt_1('L2ARC size (adaptive):', f_bytes(arc_stats['l2_size']))
+ prt_i2('Compressed:', f_perc(arc_stats['l2_asize'], arc_stats['l2_size']),
+ f_bytes(arc_stats['l2_asize']))
+ prt_i2('Header size:',
+ f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']),
+ f_bytes(arc_stats['l2_hdr_size']))
+
+ print()
+ prt_1('L2ARC breakdown:', f_hits(l2_access_total))
+ prt_i2('Hit ratio:',
+ f_perc(arc_stats['l2_hits'], l2_access_total),
+ f_hits(arc_stats['l2_hits']))
+ prt_i2('Miss ratio:',
+ f_perc(arc_stats['l2_misses'], l2_access_total),
+ f_hits(arc_stats['l2_misses']))
+ prt_i1('Feeds:', f_hits(arc_stats['l2_feeds']))
+
+ print()
+ print('L2ARC writes:')
+
+ if arc_stats['l2_writes_done'] != arc_stats['l2_writes_sent']:
+ prt_i2('Writes sent:', 'FAULTED', f_hits(arc_stats['l2_writes_sent']))
+ prt_i2('Done ratio:',
+ f_perc(arc_stats['l2_writes_done'],
+ arc_stats['l2_writes_sent']),
+ f_hits(arc_stats['l2_writes_done']))
+ prt_i2('Error ratio:',
+ f_perc(arc_stats['l2_writes_error'],
+ arc_stats['l2_writes_sent']),
+ f_hits(arc_stats['l2_writes_error']))
+ else:
+ prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent']))
+
+ print()
+ print('L2ARC evicts:')
+ prt_i1('Lock retries:', f_hits(arc_stats['l2_evict_lock_retry']))
+ prt_i1('Upon reading:', f_hits(arc_stats['l2_evict_reading']))
+ print()
+
+
+def section_spl(*_):
+ """Print the SPL parameters, if requested with alternative format
+ and/or descriptions. This does not use kstats.
+ """
+
+ if sys.platform.startswith('freebsd'):
+ # No SPL support in FreeBSD
+ return
+
+ spls = get_spl_params()
+ keylist = sorted(spls.keys())
+ print('Solaris Porting Layer (SPL):')
+
+ if ARGS.desc:
+ descriptions = get_descriptions('spl')
+
+ for key in keylist:
+ value = spls[key]
+
+ if ARGS.desc:
+ try:
+ print(INDENT+'#', descriptions[key])
+ except KeyError:
+ print(INDENT+'# (No description found)') # paranoid
+
+ print(format_raw_line(key, value))
+
+ print()
+
+
+def section_tunables(*_):
+ """Print the tunables, if requested with alternative format and/or
+    descriptions. This does not use kstats.
+ """
+
+ tunables = get_tunable_params()
+ keylist = sorted(tunables.keys())
+ print('Tunables:')
+
+ if ARGS.desc:
+ descriptions = get_descriptions('zfs')
+
+ for key in keylist:
+ value = tunables[key]
+
+ if ARGS.desc:
+ try:
+ print(INDENT+'#', descriptions[key])
+ except KeyError:
+ print(INDENT+'# (No description found)') # paranoid
+
+ print(format_raw_line(key, value))
+
+ print()
+
+
+def section_vdev(kstats_dict):
+ """Collect information on VDEV caches"""
+
+ # Currently [Nov 2017] the VDEV cache is disabled, because it is actually
+ # harmful. When this is the case, we just skip the whole entry. See
+ # https://github.com/zfsonlinux/zfs/blob/master/module/zfs/vdev_cache.c
+ # for details
+ tunables = get_vdev_params()
+
+ if tunables[VDEV_CACHE_SIZE] == '0':
+ print('VDEV cache disabled, skipping section\n')
+ return
+
+ vdev_stats = isolate_section('vdev_cache_stats', kstats_dict)
+
+ vdev_cache_total = int(vdev_stats['hits']) +\
+ int(vdev_stats['misses']) +\
+ int(vdev_stats['delegations'])
+
+ prt_1('VDEV cache summary:', f_hits(vdev_cache_total))
+ prt_i2('Hit ratio:', f_perc(vdev_stats['hits'], vdev_cache_total),
+ f_hits(vdev_stats['hits']))
+ prt_i2('Miss ratio:', f_perc(vdev_stats['misses'], vdev_cache_total),
+ f_hits(vdev_stats['misses']))
+ prt_i2('Delegations:', f_perc(vdev_stats['delegations'], vdev_cache_total),
+ f_hits(vdev_stats['delegations']))
+ print()
+
+
+def section_zil(kstats_dict):
+ """Collect information on the ZFS Intent Log. Some of the information
+ taken from https://github.com/zfsonlinux/zfs/blob/master/include/sys/zil.h
+ """
+
+ zil_stats = isolate_section('zil', kstats_dict)
+
+ prt_1('ZIL committed transactions:',
+ f_hits(zil_stats['zil_itx_count']))
+ prt_i1('Commit requests:', f_hits(zil_stats['zil_commit_count']))
+ prt_i1('Flushes to stable storage:',
+ f_hits(zil_stats['zil_commit_writer_count']))
+ prt_i2('Transactions to SLOG storage pool:',
+ f_bytes(zil_stats['zil_itx_metaslab_slog_bytes']),
+ f_hits(zil_stats['zil_itx_metaslab_slog_count']))
+ prt_i2('Transactions to non-SLOG storage pool:',
+ f_bytes(zil_stats['zil_itx_metaslab_normal_bytes']),
+ f_hits(zil_stats['zil_itx_metaslab_normal_count']))
+ print()
+
+
+section_calls = {'arc': section_arc,
+ 'archits': section_archits,
+ 'dmu': section_dmu,
+ 'l2arc': section_l2arc,
+ 'spl': section_spl,
+ 'tunables': section_tunables,
+ 'vdev': section_vdev,
+ 'zil': section_zil}
+
+
+def main():
+ """Run program. The options to draw a graph and to print all data raw are
+ treated separately because they come with their own call.
+ """
+
+ kstats = get_kstats()
+
+ if ARGS.graph:
+ draw_graph(kstats)
+ sys.exit(0)
+
+ print_header()
+
+ if ARGS.raw:
+ print_raw(kstats)
+
+ elif ARGS.section:
+
+ try:
+ section_calls[ARGS.section](kstats)
+ except KeyError:
+ print('Error: Section "{0}" unknown'.format(ARGS.section))
+ sys.exit(1)
+
+ elif ARGS.page:
+ print('WARNING: Pages are deprecated, please use "--section"\n')
+
+ pages_to_calls = {1: 'arc',
+ 2: 'archits',
+ 3: 'l2arc',
+ 4: 'dmu',
+ 5: 'vdev',
+ 6: 'tunables'}
+
+ try:
+ call = pages_to_calls[ARGS.page]
+ except KeyError:
+ print('Error: Page "{0}" not supported'.format(ARGS.page))
+ sys.exit(1)
+ else:
+ section_calls[call](kstats)
+
+ else:
+ # If no parameters were given, we print all sections. We might want to
+ # change the sequence by hand
+ calls = sorted(section_calls.keys())
+
+ for section in calls:
+ section_calls[section](kstats)
+
+ sys.exit(0)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/sys/contrib/openzfs/cmd/arcstat/.gitignore b/sys/contrib/openzfs/cmd/arcstat/.gitignore
new file mode 100644
index 000000000000..6d6cd1ab75fc
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arcstat/.gitignore
@@ -0,0 +1 @@
+arcstat
diff --git a/sys/contrib/openzfs/cmd/arcstat/Makefile.am b/sys/contrib/openzfs/cmd/arcstat/Makefile.am
new file mode 100644
index 000000000000..d1ba989a0cd8
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arcstat/Makefile.am
@@ -0,0 +1,5 @@
+include $(top_srcdir)/config/Substfiles.am
+
+bin_SCRIPTS = arcstat
+
+SUBSTFILES += $(bin_SCRIPTS)
diff --git a/sys/contrib/openzfs/cmd/arcstat/arcstat.in b/sys/contrib/openzfs/cmd/arcstat/arcstat.in
new file mode 100755
index 000000000000..c83a1c74599e
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/arcstat/arcstat.in
@@ -0,0 +1,494 @@
+#!/usr/bin/env @PYTHON_SHEBANG@
+#
+# Print out ZFS ARC Statistics exported via kstat(1)
+# For a definition of fields, or usage, use arcstat -v
+#
+# This script was originally a fork of the original arcstat.pl (0.1)
+# by Neelakanth Nadgir, originally published on his Sun blog on
+# 09/18/2007
+# http://blogs.sun.com/realneel/entry/zfs_arc_statistics
+#
+# A new version aimed to improve upon the original by adding features
+# and fixing bugs as needed. This version was maintained by Mike
+# Harsch and was hosted in a public open source repository:
+# http://github.com/mharsch/arcstat
+#
+# but has since moved to the illumos-gate repository.
+#
+# This Python port was written by John Hixson for FreeNAS, introduced
+# in commit e2c29f:
+# https://github.com/freenas/freenas
+#
+# and has been improved by many people since.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Fields have a fixed width. Every interval, we fill the "v"
+# hash with its corresponding value (v[field]=value) using calculate().
+# @hdr is the array of fields that need to be printed, so we
+# just iterate over this array and print the values using our pretty printer.
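+#
+# For example, with hdr = ["time", "read", "miss"] each interval produces one
+# row with those three columns, each formatted by prettynum().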
+#
+# This script must remain compatible with Python 2.6+ and Python 3.4+.
+#
+
+import sys
+import time
+import getopt
+import re
+import copy
+
+from signal import signal, SIGINT, SIGWINCH, SIG_DFL
+
+
+cols = {
+ # HDR: [Size, Scale, Description]
+ "time": [8, -1, "Time"],
+ "hits": [4, 1000, "ARC reads per second"],
+ "miss": [4, 1000, "ARC misses per second"],
+ "read": [4, 1000, "Total ARC accesses per second"],
+ "hit%": [4, 100, "ARC hit percentage"],
+ "miss%": [5, 100, "ARC miss percentage"],
+ "dhit": [4, 1000, "Demand hits per second"],
+ "dmis": [4, 1000, "Demand misses per second"],
+ "dh%": [3, 100, "Demand hit percentage"],
+ "dm%": [3, 100, "Demand miss percentage"],
+ "phit": [4, 1000, "Prefetch hits per second"],
+ "pmis": [4, 1000, "Prefetch misses per second"],
+ "ph%": [3, 100, "Prefetch hits percentage"],
+ "pm%": [3, 100, "Prefetch miss percentage"],
+ "mhit": [4, 1000, "Metadata hits per second"],
+ "mmis": [4, 1000, "Metadata misses per second"],
+ "mread": [5, 1000, "Metadata accesses per second"],
+ "mh%": [3, 100, "Metadata hit percentage"],
+ "mm%": [3, 100, "Metadata miss percentage"],
+ "arcsz": [5, 1024, "ARC size"],
+ "size": [4, 1024, "ARC size"],
+ "c": [4, 1024, "ARC target size"],
+ "mfu": [4, 1000, "MFU list hits per second"],
+ "mru": [4, 1000, "MRU list hits per second"],
+ "mfug": [4, 1000, "MFU ghost list hits per second"],
+ "mrug": [4, 1000, "MRU ghost list hits per second"],
+ "eskip": [5, 1000, "evict_skip per second"],
+ "mtxmis": [6, 1000, "mutex_miss per second"],
+ "dread": [5, 1000, "Demand accesses per second"],
+ "pread": [5, 1000, "Prefetch accesses per second"],
+ "l2hits": [6, 1000, "L2ARC hits per second"],
+ "l2miss": [6, 1000, "L2ARC misses per second"],
+ "l2read": [6, 1000, "Total L2ARC accesses per second"],
+ "l2hit%": [6, 100, "L2ARC access hit percentage"],
+ "l2miss%": [7, 100, "L2ARC access miss percentage"],
+ "l2asize": [7, 1024, "Actual (compressed) size of the L2ARC"],
+ "l2size": [6, 1024, "Size of the L2ARC"],
+ "l2bytes": [7, 1024, "Bytes read per second from the L2ARC"],
+ "grow": [4, 1000, "ARC grow disabled"],
+ "need": [4, 1024, "ARC reclaim need"],
+ "free": [4, 1024, "ARC free memory"],
+ "avail": [5, 1024, "ARC available memory"],
+ "waste": [5, 1024, "Wasted memory due to round up to pagesize"],
+}
+
+v = {}
+hdr = ["time", "read", "miss", "miss%", "dmis", "dm%", "pmis", "pm%", "mmis",
+ "mm%", "size", "c", "avail"]
+xhdr = ["time", "mfu", "mru", "mfug", "mrug", "eskip", "mtxmis", "dread",
+ "pread", "read"]
+sint = 1 # Default interval is 1 second
+count = 1 # Default count is 1
+hdr_intr = 20 # Print header every 20 lines of output
+opfile = None
+sep = " " # Default separator is 2 spaces
+version = "0.4"
+l2exist = False
+cmd = ("Usage: arcstat [-hvx] [-f fields] [-o file] [-s string] [interval "
+ "[count]]\n")
+cur = {}
+d = {}
+out = None
+kstat = None
+
+
+if sys.platform.startswith('freebsd'):
+ # Requires py27-sysctl on FreeBSD
+ import sysctl
+
+ def kstat_update():
+ global kstat
+
+ k = sysctl.filter('kstat.zfs.misc.arcstats')
+
+ if not k:
+ sys.exit(1)
+
+ kstat = {}
+
+ for s in k:
+ if not s:
+ continue
+
+ name, value = s.name, s.value
+            # Trims the 'kstat.zfs.misc.arcstats.' prefix from the name
+ kstat[name[24:]] = int(value)
+
+elif sys.platform.startswith('linux'):
+ def kstat_update():
+ global kstat
+
+ k = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')]
+
+ if not k:
+ sys.exit(1)
+
+ del k[0:2]
+ kstat = {}
+
+ for s in k:
+ if not s:
+ continue
+
+ name, unused, value = s.split()
+ kstat[name] = int(value)
+
+
+def detailed_usage():
+ sys.stderr.write("%s\n" % cmd)
+ sys.stderr.write("Field definitions are as follows:\n")
+ for key in cols:
+ sys.stderr.write("%11s : %s\n" % (key, cols[key][2]))
+ sys.stderr.write("\n")
+
+ sys.exit(0)
+
+
+def usage():
+ sys.stderr.write("%s\n" % cmd)
+ sys.stderr.write("\t -h : Print this help message\n")
+ sys.stderr.write("\t -v : List all possible field headers and definitions"
+ "\n")
+ sys.stderr.write("\t -x : Print extended stats\n")
+ sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n")
+ sys.stderr.write("\t -o : Redirect output to the specified file\n")
+ sys.stderr.write("\t -s : Override default field separator with custom "
+ "character or string\n")
+ sys.stderr.write("\nExamples:\n")
+ sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n")
+ sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n")
+ sys.stderr.write("\tarcstat -v\n")
+ sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n")
+ sys.stderr.write("\n")
+
+ sys.exit(1)
+
+
+def snap_stats():
+ global cur
+ global kstat
+
+ prev = copy.deepcopy(cur)
+ kstat_update()
+
+ cur = kstat
+ for key in cur:
+ if re.match(key, "class"):
+ continue
+ if key in prev:
+ d[key] = cur[key] - prev[key]
+ else:
+ d[key] = cur[key]
+
+
+def prettynum(sz, scale, num=0):
+ suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
+ index = 0
+ save = 0
+
+ # Special case for date field
+ if scale == -1:
+ return "%s" % num
+
+ # Rounding error, return 0
+ elif 0 < num < 1:
+ num = 0
+
+ while abs(num) > scale and index < 5:
+ save = num
+ num = num / scale
+ index += 1
+
+ if index == 0:
+ return "%*d" % (sz, num)
+
+ if abs(save / scale) < 10:
+ return "%*.1f%s" % (sz - 1, num, suffix[index])
+ else:
+ return "%*d%s" % (sz - 1, num, suffix[index])
+
+
+def print_values():
+ global hdr
+ global sep
+ global v
+
+ sys.stdout.write(sep.join(
+ prettynum(cols[col][0], cols[col][1], v[col]) for col in hdr))
+
+ sys.stdout.write("\n")
+ sys.stdout.flush()
+
+
+def print_header():
+ global hdr
+ global sep
+
+ sys.stdout.write(sep.join("%*s" % (cols[col][0], col) for col in hdr))
+
+ sys.stdout.write("\n")
+
+
+def get_terminal_lines():
+ try:
+ import fcntl
+ import termios
+ import struct
+ data = fcntl.ioctl(sys.stdout.fileno(), termios.TIOCGWINSZ, '1234')
+ sz = struct.unpack('hh', data)
+ return sz[0]
+ except Exception:
+ pass
+
+
+def update_hdr_intr():
+ global hdr_intr
+
+ lines = get_terminal_lines()
+ if lines and lines > 3:
+ hdr_intr = lines - 3
+
+
+def resize_handler(signum, frame):
+ update_hdr_intr()
+
+
+def init():
+ global sint
+ global count
+ global hdr
+ global xhdr
+ global opfile
+ global sep
+ global out
+ global l2exist
+
+ desired_cols = None
+ xflag = False
+ hflag = False
+ vflag = False
+ i = 1
+
+ try:
+ opts, args = getopt.getopt(
+ sys.argv[1:],
+ "xo:hvs:f:",
+ [
+ "extended",
+ "outfile",
+ "help",
+ "verbose",
+ "separator",
+ "columns"
+ ]
+ )
+ except getopt.error as msg:
+ sys.stderr.write("Error: %s\n" % str(msg))
+ usage()
+ opts = None
+
+ for opt, arg in opts:
+ if opt in ('-x', '--extended'):
+ xflag = True
+ if opt in ('-o', '--outfile'):
+ opfile = arg
+ i += 1
+ if opt in ('-h', '--help'):
+ hflag = True
+ if opt in ('-v', '--verbose'):
+ vflag = True
+ if opt in ('-s', '--separator'):
+ sep = arg
+ i += 1
+ if opt in ('-f', '--columns'):
+ desired_cols = arg
+ i += 1
+ i += 1
+
+ argv = sys.argv[i:]
+ sint = int(argv[0]) if argv else sint
+ count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1)
+
+ if hflag or (xflag and desired_cols):
+ usage()
+
+ if vflag:
+ detailed_usage()
+
+ if xflag:
+ hdr = xhdr
+
+ update_hdr_intr()
+
+ # check if L2ARC exists
+ snap_stats()
+ l2_size = cur.get("l2_size")
+ if l2_size:
+ l2exist = True
+
+ if desired_cols:
+ hdr = desired_cols.split(",")
+
+ invalid = []
+ incompat = []
+ for ele in hdr:
+ if ele not in cols:
+ invalid.append(ele)
+ elif not l2exist and ele.startswith("l2"):
+ sys.stdout.write("No L2ARC Here\n%s\n" % ele)
+ incompat.append(ele)
+
+ if len(invalid) > 0:
+ sys.stderr.write("Invalid column definition! -- %s\n" % invalid)
+ usage()
+
+ if len(incompat) > 0:
+ sys.stderr.write("Incompatible field specified! -- %s\n" %
+ incompat)
+ usage()
+
+ if opfile:
+ try:
+ out = open(opfile, "w")
+ sys.stdout = out
+
+ except IOError:
+ sys.stderr.write("Cannot open %s for writing\n" % opfile)
+ sys.exit(1)
+
+
+def calculate():
+ global d
+ global v
+ global l2exist
+
+ v = dict()
+ v["time"] = time.strftime("%H:%M:%S", time.localtime())
+ v["hits"] = d["hits"] / sint
+ v["miss"] = d["misses"] / sint
+ v["read"] = v["hits"] + v["miss"]
+ v["hit%"] = 100 * v["hits"] / v["read"] if v["read"] > 0 else 0
+ v["miss%"] = 100 - v["hit%"] if v["read"] > 0 else 0
+
+ v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) / sint
+ v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) / sint
+
+ v["dread"] = v["dhit"] + v["dmis"]
+ v["dh%"] = 100 * v["dhit"] / v["dread"] if v["dread"] > 0 else 0
+ v["dm%"] = 100 - v["dh%"] if v["dread"] > 0 else 0
+
+ v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) / sint
+ v["pmis"] = (d["prefetch_data_misses"] +
+ d["prefetch_metadata_misses"]) / sint
+
+ v["pread"] = v["phit"] + v["pmis"]
+ v["ph%"] = 100 * v["phit"] / v["pread"] if v["pread"] > 0 else 0
+ v["pm%"] = 100 - v["ph%"] if v["pread"] > 0 else 0
+
+ v["mhit"] = (d["prefetch_metadata_hits"] +
+ d["demand_metadata_hits"]) / sint
+ v["mmis"] = (d["prefetch_metadata_misses"] +
+ d["demand_metadata_misses"]) / sint
+
+ v["mread"] = v["mhit"] + v["mmis"]
+ v["mh%"] = 100 * v["mhit"] / v["mread"] if v["mread"] > 0 else 0
+ v["mm%"] = 100 - v["mh%"] if v["mread"] > 0 else 0
+
+ v["arcsz"] = cur["size"]
+ v["size"] = cur["size"]
+ v["c"] = cur["c"]
+ v["mfu"] = d["mfu_hits"] / sint
+ v["mru"] = d["mru_hits"] / sint
+ v["mrug"] = d["mru_ghost_hits"] / sint
+ v["mfug"] = d["mfu_ghost_hits"] / sint
+ v["eskip"] = d["evict_skip"] / sint
+ v["mtxmis"] = d["mutex_miss"] / sint
+
+ if l2exist:
+ v["l2hits"] = d["l2_hits"] / sint
+ v["l2miss"] = d["l2_misses"] / sint
+ v["l2read"] = v["l2hits"] + v["l2miss"]
+ v["l2hit%"] = 100 * v["l2hits"] / v["l2read"] if v["l2read"] > 0 else 0
+
+ v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0
+ v["l2asize"] = cur["l2_asize"]
+ v["l2size"] = cur["l2_size"]
+ v["l2bytes"] = d["l2_read_bytes"] / sint
+
+ v["grow"] = 0 if cur["arc_no_grow"] else 1
+ v["need"] = cur["arc_need_free"]
+ v["free"] = cur["memory_free_bytes"]
+ v["avail"] = cur["memory_available_bytes"]
+ v["waste"] = cur["abd_chunk_waste_size"]
+
+
+def main():
+ global sint
+ global count
+ global hdr_intr
+
+ i = 0
+ count_flag = 0
+
+ init()
+ if count > 0:
+ count_flag = 1
+
+ signal(SIGINT, SIG_DFL)
+ signal(SIGWINCH, resize_handler)
+ while True:
+ if i == 0:
+ print_header()
+
+ snap_stats()
+ calculate()
+ print_values()
+
+ if count_flag == 1:
+ if count <= 1:
+ break
+ count -= 1
+
+ i = 0 if i >= hdr_intr else i + 1
+ time.sleep(sint)
+
+ if out:
+ out.close()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/sys/contrib/openzfs/cmd/dbufstat/.gitignore b/sys/contrib/openzfs/cmd/dbufstat/.gitignore
new file mode 100644
index 000000000000..2c2e913cef70
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/dbufstat/.gitignore
@@ -0,0 +1 @@
+dbufstat
diff --git a/sys/contrib/openzfs/cmd/dbufstat/Makefile.am b/sys/contrib/openzfs/cmd/dbufstat/Makefile.am
new file mode 100644
index 000000000000..e672a01a4227
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/dbufstat/Makefile.am
@@ -0,0 +1,5 @@
+include $(top_srcdir)/config/Substfiles.am
+
+bin_SCRIPTS = dbufstat
+
+SUBSTFILES += $(bin_SCRIPTS)
diff --git a/sys/contrib/openzfs/cmd/dbufstat/dbufstat.in b/sys/contrib/openzfs/cmd/dbufstat/dbufstat.in
new file mode 100755
index 000000000000..98eb79057388
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/dbufstat/dbufstat.in
@@ -0,0 +1,669 @@
+#!/usr/bin/env @PYTHON_SHEBANG@
+#
+# Print out statistics for all cached dmu buffers. This information
+# is available through the dbufs kstat and may be post-processed as
+# needed by the script.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (C) 2013 Lawrence Livermore National Security, LLC.
+# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+#
+# This script must remain compatible with Python 2.6+ and Python 3.4+.
+#
+
+import sys
+import getopt
+import errno
+import re
+
+bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"]
+bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize",
+ "meta", "state", "dbholds", "dbc", "list", "atype", "flags",
+ "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2",
+ "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype",
+ "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"]
+bincompat = ["cached", "direct", "indirect", "bonus", "spill"]
+
+dhdr = ["pool", "objset", "object", "dtype", "cached"]
+dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs",
+ "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct",
+ "indirect", "bonus", "spill"]
+dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds",
+ "dbc", "list", "atype", "flags", "count", "asize", "access",
+ "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
+ "l2_comp", "aholds"]
+
+thdr = ["pool", "objset", "dtype", "cached"]
+txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect",
+ "bonus", "spill"]
+tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state",
+ "dbc", "dbholds", "list", "atype", "flags", "count", "asize",
+ "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
+ "l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs",
+ "bsize", "lvls", "dholds", "blocks", "dsize"]
+
+cols = {
+ # hdr: [size, scale, description]
+ "pool": [15, -1, "pool name"],
+ "objset": [6, -1, "dataset identification number"],
+ "object": [10, -1, "object number"],
+ "level": [5, -1, "indirection level of buffer"],
+ "blkid": [8, -1, "block number of buffer"],
+ "offset": [12, 1024, "offset in object of buffer"],
+ "dbsize": [7, 1024, "size of buffer"],
+ "meta": [4, -1, "is this buffer metadata?"],
+ "state": [5, -1, "state of buffer (read, cached, etc)"],
+ "dbholds": [7, 1000, "number of holds on buffer"],
+ "dbc": [3, -1, "in dbuf cache"],
+ "list": [4, -1, "which ARC list contains this buffer"],
+ "atype": [7, -1, "ARC header type (data or metadata)"],
+ "flags": [9, -1, "ARC read flags"],
+ "count": [5, -1, "ARC data count"],
+ "asize": [7, 1024, "size of this ARC buffer"],
+ "access": [10, -1, "time this ARC buffer was last accessed"],
+ "mru": [5, 1000, "hits while on the ARC's MRU list"],
+ "gmru": [5, 1000, "hits while on the ARC's MRU ghost list"],
+ "mfu": [5, 1000, "hits while on the ARC's MFU list"],
+ "gmfu": [5, 1000, "hits while on the ARC's MFU ghost list"],
+ "l2": [5, 1000, "hits while on the L2ARC"],
+ "l2_dattr": [8, -1, "L2ARC disk address/offset"],
+ "l2_asize": [8, 1024, "L2ARC alloc'd size (depending on compression)"],
+ "l2_comp": [21, -1, "L2ARC compression algorithm for buffer"],
+ "aholds": [6, 1000, "number of holds on this ARC buffer"],
+ "dtype": [27, -1, "dnode type"],
+ "btype": [27, -1, "bonus buffer type"],
+ "data_bs": [7, 1024, "data block size"],
+ "meta_bs": [7, 1024, "metadata block size"],
+ "bsize": [6, 1024, "bonus buffer size"],
+ "lvls": [6, -1, "number of indirection levels"],
+ "dholds": [6, 1000, "number of holds on dnode"],
+ "blocks": [8, 1000, "number of allocated blocks"],
+ "dsize": [12, 1024, "size of dnode"],
+ "cached": [6, 1024, "bytes cached for all blocks"],
+ "direct": [6, 1024, "bytes cached for direct blocks"],
+ "indirect": [8, 1024, "bytes cached for indirect blocks"],
+ "bonus": [5, 1024, "bytes cached for bonus buffer"],
+ "spill": [5, 1024, "bytes cached for spill block"],
+}
+
+hdr = None
+xhdr = None
+sep = " " # Default separator is 2 spaces
+cmd = ("Usage: dbufstat [-bdhnrtvx] [-i file] [-f fields] [-o file] "
+ "[-s string] [-F filter]\n")
+raw = 0
+
+
+def print_incompat_helper(incompat):
+ cnt = 0
+ for key in sorted(incompat):
+        if cnt == 0:
+ sys.stderr.write("\t")
+ elif cnt > 8:
+ sys.stderr.write(",\n\t")
+ cnt = 0
+ else:
+ sys.stderr.write(", ")
+
+ sys.stderr.write("%s" % key)
+ cnt += 1
+
+ sys.stderr.write("\n\n")
+
+
+def detailed_usage():
+ sys.stderr.write("%s\n" % cmd)
+
+ sys.stderr.write("Field definitions incompatible with '-b' option:\n")
+ print_incompat_helper(bincompat)
+
+ sys.stderr.write("Field definitions incompatible with '-d' option:\n")
+ print_incompat_helper(dincompat)
+
+ sys.stderr.write("Field definitions incompatible with '-t' option:\n")
+ print_incompat_helper(tincompat)
+
+ sys.stderr.write("Field definitions are as follows:\n")
+ for key in sorted(cols.keys()):
+ sys.stderr.write("%11s : %s\n" % (key, cols[key][2]))
+ sys.stderr.write("\n")
+
+ sys.exit(0)
+
+
+def usage():
+ sys.stderr.write("%s\n" % cmd)
+ sys.stderr.write("\t -b : Print table of information for each dbuf\n")
+ sys.stderr.write("\t -d : Print table of information for each dnode\n")
+ sys.stderr.write("\t -h : Print this help message\n")
+ sys.stderr.write("\t -n : Exclude header from output\n")
+ sys.stderr.write("\t -r : Print raw values\n")
+ sys.stderr.write("\t -t : Print table of information for each dnode type"
+ "\n")
+ sys.stderr.write("\t -v : List all possible field headers and definitions"
+ "\n")
+ sys.stderr.write("\t -x : Print extended stats\n")
+ sys.stderr.write("\t -i : Redirect input from the specified file\n")
+ sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n")
+ sys.stderr.write("\t -o : Redirect output to the specified file\n")
+ sys.stderr.write("\t -s : Override default field separator with custom "
+ "character or string\n")
+ sys.stderr.write("\t -F : Filter output by value or regex\n")
+ sys.stderr.write("\nExamples:\n")
+ sys.stderr.write("\tdbufstat -d -o /tmp/d.log\n")
+ sys.stderr.write("\tdbufstat -t -s \",\" -o /tmp/t.log\n")
+ sys.stderr.write("\tdbufstat -v\n")
+ sys.stderr.write("\tdbufstat -d -f pool,object,objset,dsize,cached\n")
+ sys.stderr.write("\tdbufstat -bx -F dbc=1,objset=54,pool=testpool\n")
+ sys.stderr.write("\n")
+
+ sys.exit(1)
+
+
+def prettynum(sz, scale, num=0):
+ global raw
+
+ suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
+ index = 0
+ save = 0
+
+ if raw or scale == -1:
+ return "%*s" % (sz, num)
+
+ # Rounding error, return 0
+ elif 0 < num < 1:
+ num = 0
+
+ while num > scale and index < 5:
+ save = num
+ num = num / scale
+ index += 1
+
+ if index == 0:
+ return "%*d" % (sz, num)
+
+ if (save / scale) < 10:
+ return "%*.1f%s" % (sz - 1, num, suffix[index])
+ else:
+ return "%*d%s" % (sz - 1, num, suffix[index])
+
+
+def print_values(v):
+ global hdr
+ global sep
+
+ try:
+ for col in hdr:
+ sys.stdout.write("%s%s" % (
+ prettynum(cols[col][0], cols[col][1], v[col]), sep))
+ sys.stdout.write("\n")
+ except IOError as e:
+ if e.errno == errno.EPIPE:
+ sys.exit(1)
+
+
+def print_header():
+ global hdr
+ global sep
+
+ try:
+ for col in hdr:
+ sys.stdout.write("%*s%s" % (cols[col][0], col, sep))
+ sys.stdout.write("\n")
+ except IOError as e:
+ if e.errno == errno.EPIPE:
+ sys.exit(1)
+
+
+def get_typestring(t):
+ ot_strings = [
+ "DMU_OT_NONE",
+ # general:
+ "DMU_OT_OBJECT_DIRECTORY",
+ "DMU_OT_OBJECT_ARRAY",
+ "DMU_OT_PACKED_NVLIST",
+ "DMU_OT_PACKED_NVLIST_SIZE",
+ "DMU_OT_BPOBJ",
+ "DMU_OT_BPOBJ_HDR",
+ # spa:
+ "DMU_OT_SPACE_MAP_HEADER",
+ "DMU_OT_SPACE_MAP",
+ # zil:
+ "DMU_OT_INTENT_LOG",
+ # dmu:
+ "DMU_OT_DNODE",
+ "DMU_OT_OBJSET",
+ # dsl:
+ "DMU_OT_DSL_DIR",
+ "DMU_OT_DSL_DIR_CHILD_MAP",
+ "DMU_OT_DSL_DS_SNAP_MAP",
+ "DMU_OT_DSL_PROPS",
+ "DMU_OT_DSL_DATASET",
+ # zpl:
+ "DMU_OT_ZNODE",
+ "DMU_OT_OLDACL",
+ "DMU_OT_PLAIN_FILE_CONTENTS",
+ "DMU_OT_DIRECTORY_CONTENTS",
+ "DMU_OT_MASTER_NODE",
+ "DMU_OT_UNLINKED_SET",
+ # zvol:
+ "DMU_OT_ZVOL",
+ "DMU_OT_ZVOL_PROP",
+ # other; for testing only!
+ "DMU_OT_PLAIN_OTHER",
+ "DMU_OT_UINT64_OTHER",
+ "DMU_OT_ZAP_OTHER",
+ # new object types:
+ "DMU_OT_ERROR_LOG",
+ "DMU_OT_SPA_HISTORY",
+ "DMU_OT_SPA_HISTORY_OFFSETS",
+ "DMU_OT_POOL_PROPS",
+ "DMU_OT_DSL_PERMS",
+ "DMU_OT_ACL",
+ "DMU_OT_SYSACL",
+ "DMU_OT_FUID",
+ "DMU_OT_FUID_SIZE",
+ "DMU_OT_NEXT_CLONES",
+ "DMU_OT_SCAN_QUEUE",
+ "DMU_OT_USERGROUP_USED",
+ "DMU_OT_USERGROUP_QUOTA",
+ "DMU_OT_USERREFS",
+ "DMU_OT_DDT_ZAP",
+ "DMU_OT_DDT_STATS",
+ "DMU_OT_SA",
+ "DMU_OT_SA_MASTER_NODE",
+ "DMU_OT_SA_ATTR_REGISTRATION",
+ "DMU_OT_SA_ATTR_LAYOUTS",
+ "DMU_OT_SCAN_XLATE",
+ "DMU_OT_DEDUP",
+ "DMU_OT_DEADLIST",
+ "DMU_OT_DEADLIST_HDR",
+ "DMU_OT_DSL_CLONES",
+ "DMU_OT_BPOBJ_SUBOBJ"]
+ otn_strings = {
+ 0x80: "DMU_OTN_UINT8_DATA",
+ 0xc0: "DMU_OTN_UINT8_METADATA",
+ 0x81: "DMU_OTN_UINT16_DATA",
+ 0xc1: "DMU_OTN_UINT16_METADATA",
+ 0x82: "DMU_OTN_UINT32_DATA",
+ 0xc2: "DMU_OTN_UINT32_METADATA",
+ 0x83: "DMU_OTN_UINT64_DATA",
+ 0xc3: "DMU_OTN_UINT64_METADATA",
+ 0x84: "DMU_OTN_ZAP_DATA",
+ 0xc4: "DMU_OTN_ZAP_METADATA",
+ 0xa0: "DMU_OTN_UINT8_ENC_DATA",
+ 0xe0: "DMU_OTN_UINT8_ENC_METADATA",
+ 0xa1: "DMU_OTN_UINT16_ENC_DATA",
+ 0xe1: "DMU_OTN_UINT16_ENC_METADATA",
+ 0xa2: "DMU_OTN_UINT32_ENC_DATA",
+ 0xe2: "DMU_OTN_UINT32_ENC_METADATA",
+ 0xa3: "DMU_OTN_UINT64_ENC_DATA",
+ 0xe3: "DMU_OTN_UINT64_ENC_METADATA",
+ 0xa4: "DMU_OTN_ZAP_ENC_DATA",
+ 0xe4: "DMU_OTN_ZAP_ENC_METADATA"}
+
+ # If "-rr" option is used, don't convert to string representation
+ if raw > 1:
+ return "%i" % t
+
+ try:
+ if t < len(ot_strings):
+ return ot_strings[t]
+ else:
+ return otn_strings[t]
+ except (IndexError, KeyError):
+ return "(UNKNOWN)"
+
+
+def get_compstring(c):
+ comp_strings = ["ZIO_COMPRESS_INHERIT", "ZIO_COMPRESS_ON",
+ "ZIO_COMPRESS_OFF", "ZIO_COMPRESS_LZJB",
+ "ZIO_COMPRESS_EMPTY", "ZIO_COMPRESS_GZIP_1",
+ "ZIO_COMPRESS_GZIP_2", "ZIO_COMPRESS_GZIP_3",
+ "ZIO_COMPRESS_GZIP_4", "ZIO_COMPRESS_GZIP_5",
+ "ZIO_COMPRESS_GZIP_6", "ZIO_COMPRESS_GZIP_7",
+ "ZIO_COMPRESS_GZIP_8", "ZIO_COMPRESS_GZIP_9",
+ "ZIO_COMPRESS_ZLE", "ZIO_COMPRESS_LZ4",
+ "ZIO_COMPRESS_ZSTD", "ZIO_COMPRESS_FUNCTION"]
+
+ # If "-rr" option is used, don't convert to string representation
+ if raw > 1:
+ return "%i" % c
+
+ try:
+ return comp_strings[c]
+ except IndexError:
+ return "%i" % c
+
+
+def parse_line(line, labels):
+ global hdr
+
+ new = dict()
+ val = None
+ for col in hdr:
+ # These are "special" fields computed in the update_dict
+        # function; prevent a KeyError exception on labels[col] for these.
+ if col not in ['bonus', 'cached', 'direct', 'indirect', 'spill']:
+ val = line[labels[col]]
+
+ if col in ['pool', 'flags']:
+ new[col] = str(val)
+ elif col in ['dtype', 'btype']:
+ new[col] = get_typestring(int(val))
+ elif col in ['l2_comp']:
+ new[col] = get_compstring(int(val))
+ else:
+ new[col] = int(val)
+
+ return new
+
+
+def update_dict(d, k, line, labels):
+ pool = line[labels['pool']]
+ objset = line[labels['objset']]
+ key = line[labels[k]]
+
+ dbsize = int(line[labels['dbsize']])
+ blkid = int(line[labels['blkid']])
+ level = int(line[labels['level']])
+
+ if pool not in d:
+ d[pool] = dict()
+
+ if objset not in d[pool]:
+ d[pool][objset] = dict()
+
+ if key not in d[pool][objset]:
+ d[pool][objset][key] = parse_line(line, labels)
+ d[pool][objset][key]['bonus'] = 0
+ d[pool][objset][key]['cached'] = 0
+ d[pool][objset][key]['direct'] = 0
+ d[pool][objset][key]['indirect'] = 0
+ d[pool][objset][key]['spill'] = 0
+
+ d[pool][objset][key]['cached'] += dbsize
+
+ if blkid == -1:
+ d[pool][objset][key]['bonus'] += dbsize
+ elif blkid == -2:
+ d[pool][objset][key]['spill'] += dbsize
+ else:
+ if level == 0:
+ d[pool][objset][key]['direct'] += dbsize
+ else:
+ d[pool][objset][key]['indirect'] += dbsize
+
+ return d
+
+
+def skip_line(vals, filters):
+ '''
+ Determines if a line should be skipped during printing
+ based on a set of filters
+ '''
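+    # e.g. filters = {'pool': 'tank'} keeps only lines whose formatted 'pool'
+    # value fully matches the regular expression 'tank'.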
+ if len(filters) == 0:
+ return False
+
+ for key in vals:
+ if key in filters:
+ val = prettynum(cols[key][0], cols[key][1], vals[key]).strip()
+ # we want a full match here
+ if re.match("(?:" + filters[key] + r")\Z", val) is None:
+ return True
+
+ return False
+
+
+def print_dict(d, filters, noheader):
+ if not noheader:
+ print_header()
+ for pool in list(d.keys()):
+ for objset in list(d[pool].keys()):
+ for v in list(d[pool][objset].values()):
+ if not skip_line(v, filters):
+ print_values(v)
+
+
+def dnodes_build_dict(filehandle):
+ labels = dict()
+ dnodes = dict()
+
+ # First 3 lines are header information, skip the first two
+ for i in range(2):
+ next(filehandle)
+
+ # The third line contains the labels and index locations
+ for i, v in enumerate(next(filehandle).split()):
+ labels[v] = i
+
+ # The rest of the file is buffer information
+ for line in filehandle:
+ update_dict(dnodes, 'object', line.split(), labels)
+
+ return dnodes
+
+
+def types_build_dict(filehandle):
+ labels = dict()
+ types = dict()
+
+ # First 3 lines are header information, skip the first two
+ for i in range(2):
+ next(filehandle)
+
+ # The third line contains the labels and index locations
+ for i, v in enumerate(next(filehandle).split()):
+ labels[v] = i
+
+ # The rest of the file is buffer information
+ for line in filehandle:
+ update_dict(types, 'dtype', line.split(), labels)
+
+ return types
+
+
+def buffers_print_all(filehandle, filters, noheader):
+ labels = dict()
+
+ # First 3 lines are header information, skip the first two
+ for i in range(2):
+ next(filehandle)
+
+ # The third line contains the labels and index locations
+ for i, v in enumerate(next(filehandle).split()):
+ labels[v] = i
+
+ if not noheader:
+ print_header()
+
+ # The rest of the file is buffer information
+ for line in filehandle:
+ vals = parse_line(line.split(), labels)
+ if not skip_line(vals, filters):
+ print_values(vals)
+
+
+def main():
+ global hdr
+ global sep
+ global raw
+
+ desired_cols = None
+ bflag = False
+ dflag = False
+ hflag = False
+ ifile = None
+ ofile = None
+ tflag = False
+ vflag = False
+ xflag = False
+ nflag = False
+ filters = dict()
+
+ try:
+ opts, args = getopt.getopt(
+ sys.argv[1:],
+ "bdf:hi:o:rs:tvxF:n",
+ [
+ "buffers",
+ "dnodes",
+ "columns",
+ "help",
+ "infile",
+ "outfile",
+ "separator",
+ "types",
+ "verbose",
+ "extended",
+ "filter"
+ ]
+ )
+ except getopt.error:
+ usage()
+ opts = None
+
+ for opt, arg in opts:
+ if opt in ('-b', '--buffers'):
+ bflag = True
+ if opt in ('-d', '--dnodes'):
+ dflag = True
+ if opt in ('-f', '--columns'):
+ desired_cols = arg
+ if opt in ('-h', '--help'):
+ hflag = True
+ if opt in ('-i', '--infile'):
+ ifile = arg
+ if opt in ('-o', '--outfile'):
+ ofile = arg
+ if opt in ('-r', '--raw'):
+ raw += 1
+ if opt in ('-s', '--separator'):
+ sep = arg
+ if opt in ('-t', '--types'):
+ tflag = True
+ if opt in ('-v', '--verbose'):
+ vflag = True
+ if opt in ('-x', '--extended'):
+ xflag = True
+ if opt in ('-n', '--noheader'):
+ nflag = True
+ if opt in ('-F', '--filter'):
+ fils = [x.strip() for x in arg.split(",")]
+
+ for fil in fils:
+ f = [x.strip() for x in fil.split("=")]
+
+ if len(f) != 2:
+ sys.stderr.write("Invalid filter '%s'.\n" % fil)
+ sys.exit(1)
+
+ if f[0] not in cols:
+ sys.stderr.write("Invalid field '%s' in filter.\n" % f[0])
+ sys.exit(1)
+
+ if f[0] in filters:
+ sys.stderr.write("Field '%s' specified multiple times in "
+ "filter.\n" % f[0])
+ sys.exit(1)
+
+ try:
+ re.compile("(?:" + f[1] + r")\Z")
+ except re.error:
+ sys.stderr.write("Invalid regex for field '%s' in "
+ "filter.\n" % f[0])
+ sys.exit(1)
+
+ filters[f[0]] = f[1]
+
+ if hflag or (xflag and desired_cols):
+ usage()
+
+ if vflag:
+ detailed_usage()
+
+    # Ensure that at most one of the b, d, or t flags is set
+ if (bflag and dflag) or (bflag and tflag) or (dflag and tflag):
+ usage()
+
+ if bflag:
+ hdr = bxhdr if xflag else bhdr
+ elif tflag:
+ hdr = txhdr if xflag else thdr
+ else: # Even if dflag is False, it's the default if none set
+ dflag = True
+ hdr = dxhdr if xflag else dhdr
+
+ if desired_cols:
+ hdr = desired_cols.split(",")
+
+ invalid = []
+ incompat = []
+ for ele in hdr:
+ if ele not in cols:
+ invalid.append(ele)
+ elif ((bflag and bincompat and ele in bincompat) or
+ (dflag and dincompat and ele in dincompat) or
+ (tflag and tincompat and ele in tincompat)):
+ incompat.append(ele)
+
+ if len(invalid) > 0:
+ sys.stderr.write("Invalid column definition! -- %s\n" % invalid)
+ usage()
+
+ if len(incompat) > 0:
+ sys.stderr.write("Incompatible field specified! -- %s\n" %
+ incompat)
+ usage()
+
+ if ofile:
+ try:
+ tmp = open(ofile, "w")
+ sys.stdout = tmp
+
+ except IOError:
+ sys.stderr.write("Cannot open %s for writing\n" % ofile)
+ sys.exit(1)
+
+ if not ifile:
+ ifile = '/proc/spl/kstat/zfs/dbufs'
+
+    if ifile != "-":
+ try:
+ tmp = open(ifile, "r")
+ sys.stdin = tmp
+ except IOError:
+ sys.stderr.write("Cannot open %s for reading\n" % ifile)
+ sys.exit(1)
+
+ if bflag:
+ buffers_print_all(sys.stdin, filters, nflag)
+
+ if dflag:
+ print_dict(dnodes_build_dict(sys.stdin), filters, nflag)
+
+ if tflag:
+ print_dict(types_build_dict(sys.stdin), filters, nflag)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/sys/contrib/openzfs/cmd/fsck_zfs/Makefile.am b/sys/contrib/openzfs/cmd/fsck_zfs/Makefile.am
new file mode 100644
index 000000000000..2380f56fa4d4
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/fsck_zfs/Makefile.am
@@ -0,0 +1 @@
+dist_sbin_SCRIPTS = fsck.zfs
diff --git a/sys/contrib/openzfs/cmd/fsck_zfs/fsck.zfs b/sys/contrib/openzfs/cmd/fsck_zfs/fsck.zfs
new file mode 100755
index 000000000000..129a7f39c388
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/fsck_zfs/fsck.zfs
@@ -0,0 +1,9 @@
+#!/bin/sh
+#
+# fsck.zfs: A fsck helper to accommodate distributions that expect
+# to be able to execute a fsck on all filesystem types. Currently
+# this script does nothing, but it could be extended to act as a
+# compatibility wrapper for 'zpool scrub'.
+#
+
+exit 0
diff --git a/sys/contrib/openzfs/cmd/mount_zfs/.gitignore b/sys/contrib/openzfs/cmd/mount_zfs/.gitignore
new file mode 100644
index 000000000000..cd9254bde3da
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/mount_zfs/.gitignore
@@ -0,0 +1 @@
+mount.zfs
diff --git a/sys/contrib/openzfs/cmd/mount_zfs/Makefile.am b/sys/contrib/openzfs/cmd/mount_zfs/Makefile.am
new file mode 100644
index 000000000000..6c4d6ff79f16
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/mount_zfs/Makefile.am
@@ -0,0 +1,20 @@
+include $(top_srcdir)/config/Rules.am
+
+#
+# Ignore the prefix for the mount helper. It must be installed in /sbin/
+# because this path is hardcoded in mount(8) for security reasons.
+# However, if needed, the configure option --with-mounthelperdir= can be used
+# to override the default install location.
+#
+sbindir=$(mounthelperdir)
+sbin_PROGRAMS = mount.zfs
+
+mount_zfs_SOURCES = \
+ mount_zfs.c
+
+mount_zfs_LDADD = \
+ $(abs_top_builddir)/lib/libzfs/libzfs.la \
+ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+ $(abs_top_builddir)/lib/libnvpair/libnvpair.la
+
+mount_zfs_LDADD += $(LTLIBINTL)
diff --git a/sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c b/sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c
new file mode 100644
index 000000000000..87d2ccadcded
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c
@@ -0,0 +1,408 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Lawrence Livermore National Security, LLC.
+ */
+
+#include <libintl.h>
+#include <unistd.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <sys/mntent.h>
+#include <sys/stat.h>
+#include <libzfs.h>
+#include <libzutil.h>
+#include <locale.h>
+#include <getopt.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#define ZS_COMMENT 0x00000000 /* comment */
+#define ZS_ZFSUTIL 0x00000001 /* caller is zfs(8) */
+
+libzfs_handle_t *g_zfs;
+
+/*
+ * Return the pool/dataset to mount given the name passed to mount. This
+ * is expected to be of the form pool/dataset, however it may also refer to
+ * a block device if that device contains a valid zfs label.
+ */
+static char *
+parse_dataset(char *dataset)
+{
+ char cwd[PATH_MAX];
+ struct stat64 statbuf;
+ int error;
+ int len;
+
+ /*
+ * We expect a pool/dataset to be provided, however if we're
+ * given a device which is a member of a zpool we attempt to
+ * extract the pool name stored in the label. Given the pool
+ * name we can mount the root dataset.
+ */
+ error = stat64(dataset, &statbuf);
+ if (error == 0) {
+ nvlist_t *config;
+ char *name;
+ int fd;
+
+ fd = open(dataset, O_RDONLY);
+ if (fd < 0)
+ goto out;
+
+ error = zpool_read_label(fd, &config, NULL);
+ (void) close(fd);
+ if (error)
+ goto out;
+
+ error = nvlist_lookup_string(config,
+ ZPOOL_CONFIG_POOL_NAME, &name);
+ if (error) {
+ nvlist_free(config);
+ } else {
+ dataset = strdup(name);
+ nvlist_free(config);
+ return (dataset);
+ }
+ }
+out:
+ /*
+ * If a file or directory in your current working directory is
+ * named 'dataset' then mount(8) will prepend your current working
+ * directory to the dataset. There is no way to prevent this
+ * behavior so we simply check for it and strip the prepended
+	 * path when it is added.
+ */
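+	/*
+	 * For example, if mount(8) passes "/root/tank/home" and the cwd is
+	 * "/root", we return the suffix "tank/home".
+	 */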
+ if (getcwd(cwd, PATH_MAX) == NULL)
+ return (dataset);
+
+ len = strlen(cwd);
+
+ /* Do not add one when cwd already ends in a trailing '/' */
+ if (strncmp(cwd, dataset, len) == 0)
+ return (dataset + len + (cwd[len-1] != '/'));
+
+ return (dataset);
+}
+
+/*
+ * Update the mtab_* code to use the libmount library when it is commonly
+ * available; otherwise fall back to legacy mode. The mount(8) utility will
+ * manage the lock file for us to prevent racing updates to /etc/mtab.
+ */
+static int
+mtab_is_writeable(void)
+{
+ struct stat st;
+ int error, fd;
+
+ error = lstat("/etc/mtab", &st);
+ if (error || S_ISLNK(st.st_mode))
+ return (0);
+
+ fd = open("/etc/mtab", O_RDWR | O_CREAT, 0644);
+ if (fd < 0)
+ return (0);
+
+ close(fd);
+ return (1);
+}
+
+static int
+mtab_update(char *dataset, char *mntpoint, char *type, char *mntopts)
+{
+ struct mntent mnt;
+ FILE *fp;
+ int error;
+
+ mnt.mnt_fsname = dataset;
+ mnt.mnt_dir = mntpoint;
+ mnt.mnt_type = type;
+ mnt.mnt_opts = mntopts ? mntopts : "";
+ mnt.mnt_freq = 0;
+ mnt.mnt_passno = 0;
+
+ fp = setmntent("/etc/mtab", "a+");
+ if (!fp) {
+ (void) fprintf(stderr, gettext(
+ "filesystem '%s' was mounted, but /etc/mtab "
+ "could not be opened due to error %d\n"),
+ dataset, errno);
+ return (MOUNT_FILEIO);
+ }
+
+ error = addmntent(fp, &mnt);
+ if (error) {
+ (void) fprintf(stderr, gettext(
+ "filesystem '%s' was mounted, but /etc/mtab "
+ "could not be updated due to error %d\n"),
+ dataset, errno);
+ return (MOUNT_FILEIO);
+ }
+
+ (void) endmntent(fp);
+
+ return (MOUNT_SUCCESS);
+}
+
+int
+main(int argc, char **argv)
+{
+ zfs_handle_t *zhp;
+ char prop[ZFS_MAXPROPLEN];
+ uint64_t zfs_version = 0;
+ char mntopts[MNT_LINE_MAX] = { '\0' };
+ char badopt[MNT_LINE_MAX] = { '\0' };
+ char mtabopt[MNT_LINE_MAX] = { '\0' };
+ char mntpoint[PATH_MAX];
+ char *dataset;
+ unsigned long mntflags = 0, zfsflags = 0, remount = 0;
+ int sloppy = 0, fake = 0, verbose = 0, nomtab = 0, zfsutil = 0;
+ int error, c;
+
+ (void) setlocale(LC_ALL, "");
+ (void) textdomain(TEXT_DOMAIN);
+
+ opterr = 0;
+
+ /* check options */
+ while ((c = getopt_long(argc, argv, "sfnvo:h?", 0, 0)) != -1) {
+ switch (c) {
+ case 's':
+ sloppy = 1;
+ break;
+ case 'f':
+ fake = 1;
+ break;
+ case 'n':
+ nomtab = 1;
+ break;
+ case 'v':
+ verbose++;
+ break;
+ case 'o':
+ (void) strlcpy(mntopts, optarg, sizeof (mntopts));
+ break;
+ case 'h':
+ case '?':
+ (void) fprintf(stderr, gettext("Invalid option '%c'\n"),
+ optopt);
+ (void) fprintf(stderr, gettext("Usage: mount.zfs "
+ "[-sfnv] [-o options] <dataset> <mountpoint>\n"));
+ return (MOUNT_USAGE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check that we only have two arguments */
+ if (argc != 2) {
+ if (argc == 0)
+ (void) fprintf(stderr, gettext("missing dataset "
+ "argument\n"));
+ else if (argc == 1)
+ (void) fprintf(stderr,
+ gettext("missing mountpoint argument\n"));
+ else
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ (void) fprintf(stderr, "usage: mount <dataset> <mountpoint>\n");
+ return (MOUNT_USAGE);
+ }
+
+ dataset = parse_dataset(argv[0]);
+
+ /* canonicalize the mount point */
+ if (realpath(argv[1], mntpoint) == NULL) {
+ (void) fprintf(stderr, gettext("filesystem '%s' cannot be "
+ "mounted at '%s' due to canonicalization error %d.\n"),
+ dataset, argv[1], errno);
+ return (MOUNT_SYSERR);
+ }
+
+ /* validate mount options and set mntflags */
+ error = zfs_parse_mount_options(mntopts, &mntflags, &zfsflags, sloppy,
+ badopt, mtabopt);
+ if (error) {
+ switch (error) {
+ case ENOMEM:
+ (void) fprintf(stderr, gettext("filesystem '%s' "
+ "cannot be mounted due to a memory allocation "
+ "failure.\n"), dataset);
+ return (MOUNT_SYSERR);
+ case ENOENT:
+ (void) fprintf(stderr, gettext("filesystem '%s' "
+ "cannot be mounted due to invalid option "
+ "'%s'.\n"), dataset, badopt);
+ (void) fprintf(stderr, gettext("Use the '-s' option "
+ "to ignore the bad mount option.\n"));
+ return (MOUNT_USAGE);
+ default:
+ (void) fprintf(stderr, gettext("filesystem '%s' "
+ "cannot be mounted due to internal error %d.\n"),
+ dataset, error);
+ return (MOUNT_SOFTWARE);
+ }
+ }
+
+ if (verbose)
+ (void) fprintf(stdout, gettext("mount.zfs:\n"
+ " dataset: \"%s\"\n mountpoint: \"%s\"\n"
+ " mountflags: 0x%lx\n zfsflags: 0x%lx\n"
+ " mountopts: \"%s\"\n mtabopts: \"%s\"\n"),
+ dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt);
+
+ if (mntflags & MS_REMOUNT) {
+ nomtab = 1;
+ remount = 1;
+ }
+
+ if (zfsflags & ZS_ZFSUTIL)
+ zfsutil = 1;
+
+ if ((g_zfs = libzfs_init()) == NULL) {
+ (void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
+ return (MOUNT_SYSERR);
+ }
+
+ /* try to open the dataset to access the mount point */
+ if ((zhp = zfs_open(g_zfs, dataset,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT)) == NULL) {
+ (void) fprintf(stderr, gettext("filesystem '%s' cannot be "
+ "mounted, unable to open the dataset\n"), dataset);
+ libzfs_fini(g_zfs);
+ return (MOUNT_USAGE);
+ }
+
+ zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt);
+
+ /* treat all snapshots as legacy mount points */
+ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT)
+ (void) strlcpy(prop, ZFS_MOUNTPOINT_LEGACY, ZFS_MAXPROPLEN);
+ else
+ (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, prop,
+ sizeof (prop), NULL, NULL, 0, B_FALSE);
+
+ /*
+ * Fetch the max supported zfs version in case we get ENOTSUP
+ * back from the mount command, since we need the zfs handle
+ * to do so.
+ */
+ zfs_version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+ if (zfs_version == 0) {
+ fprintf(stderr, gettext("unable to fetch "
+ "ZFS version for filesystem '%s'\n"), dataset);
+ return (MOUNT_SYSERR);
+ }
+
+ zfs_close(zhp);
+ libzfs_fini(g_zfs);
+
+ /*
+ * Legacy mount points may only be mounted using 'mount', never using
+ * 'zfs mount'. However, since 'zfs mount' actually invokes 'mount'
+ * we differentiate the two cases using the 'zfsutil' mount option.
+ * This mount option should only be supplied by the 'zfs mount' util.
+ *
+ * The only exception to the above rule is '-o remount' which is
+ * always allowed for non-legacy datasets. This is done because when
+ * using zfs as your root file system both rc.sysinit/umountroot and
+ * systemd depend on 'mount -o remount <mountpoint>' to work.
+ */
+ if (zfsutil && (strcmp(prop, ZFS_MOUNTPOINT_LEGACY) == 0)) {
+ (void) fprintf(stderr, gettext(
+ "filesystem '%s' cannot be mounted using 'zfs mount'.\n"
+ "Use 'zfs set mountpoint=%s' or 'mount -t zfs %s %s'.\n"
+ "See zfs(8) for more information.\n"),
+ dataset, mntpoint, dataset, mntpoint);
+ return (MOUNT_USAGE);
+ }
+
+ if (!zfsutil && !(remount || fake) &&
+ strcmp(prop, ZFS_MOUNTPOINT_LEGACY)) {
+ (void) fprintf(stderr, gettext(
+ "filesystem '%s' cannot be mounted using 'mount'.\n"
+ "Use 'zfs set mountpoint=%s' or 'zfs mount %s'.\n"
+ "See zfs(8) for more information.\n"),
+ dataset, "legacy", dataset);
+ return (MOUNT_USAGE);
+ }
+
+ if (!fake) {
+ error = mount(dataset, mntpoint, MNTTYPE_ZFS,
+ mntflags, mntopts);
+ }
+
+ if (error) {
+ switch (errno) {
+ case ENOENT:
+ (void) fprintf(stderr, gettext("mount point "
+ "'%s' does not exist\n"), mntpoint);
+ return (MOUNT_SYSERR);
+ case EBUSY:
+ (void) fprintf(stderr, gettext("filesystem "
+ "'%s' is already mounted\n"), dataset);
+ return (MOUNT_BUSY);
+ case ENOTSUP:
+ if (zfs_version > ZPL_VERSION) {
+ (void) fprintf(stderr,
+ gettext("filesystem '%s' (v%d) is not "
+ "supported by this implementation of "
+ "ZFS (max v%d).\n"), dataset,
+ (int)zfs_version, (int)ZPL_VERSION);
+ } else {
+ (void) fprintf(stderr,
+ gettext("filesystem '%s' mount "
+ "failed for unknown reason.\n"), dataset);
+ }
+ return (MOUNT_SYSERR);
+#ifdef MS_MANDLOCK
+ case EPERM:
+ if (mntflags & MS_MANDLOCK) {
+ (void) fprintf(stderr, gettext("filesystem "
+ "'%s' has the 'nbmand=on' property set, "
+ "this mount\noption may be disabled in "
+ "your kernel. Use 'zfs set nbmand=off'\n"
+ "to disable this option and try to "
+ "mount the filesystem again.\n"), dataset);
+ return (MOUNT_SYSERR);
+ }
+ /* fallthru */
+#endif
+ default:
+ (void) fprintf(stderr, gettext("filesystem "
+ "'%s' can not be mounted: %s\n"), dataset,
+ strerror(errno));
+ return (MOUNT_USAGE);
+ }
+ }
+
+ if (!nomtab && mtab_is_writeable()) {
+ error = mtab_update(dataset, mntpoint, MNTTYPE_ZFS, mtabopt);
+ if (error)
+ return (error);
+ }
+
+ return (MOUNT_SUCCESS);
+}
diff --git a/sys/contrib/openzfs/cmd/raidz_test/.gitignore b/sys/contrib/openzfs/cmd/raidz_test/.gitignore
new file mode 100644
index 000000000000..f8b83d9cce03
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/raidz_test/.gitignore
@@ -0,0 +1 @@
+/raidz_test
diff --git a/sys/contrib/openzfs/cmd/raidz_test/Makefile.am b/sys/contrib/openzfs/cmd/raidz_test/Makefile.am
new file mode 100644
index 000000000000..72c914e641e4
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/raidz_test/Makefile.am
@@ -0,0 +1,20 @@
+include $(top_srcdir)/config/Rules.am
+
+# Includes kernel code, generate warnings for large stack frames
+AM_CFLAGS += $(FRAME_LARGER_THAN)
+
+# Unconditionally enable ASSERTs
+AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG
+
+bin_PROGRAMS = raidz_test
+
+raidz_test_SOURCES = \
+ raidz_test.h \
+ raidz_test.c \
+ raidz_bench.c
+
+raidz_test_LDADD = \
+ $(abs_top_builddir)/lib/libzpool/libzpool.la \
+ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la
+
+raidz_test_LDADD += -lm
diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c
new file mode 100644
index 000000000000..8a2cec4ca685
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c
@@ -0,0 +1,227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/zio.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
+#include <stdio.h>
+
+#include <sys/time.h>
+
+#include "raidz_test.h"
+
+#define GEN_BENCH_MEMORY (((uint64_t)1ULL)<<32)
+#define REC_BENCH_MEMORY (((uint64_t)1ULL)<<29)
+#define BENCH_ASHIFT 12
+#define MIN_CS_SHIFT BENCH_ASHIFT
+#define MAX_CS_SHIFT SPA_MAXBLOCKSHIFT
+
+static zio_t zio_bench;
+static raidz_map_t *rm_bench;
+static size_t max_data_size = SPA_MAXBLOCKSIZE;
+
+static void
+bench_init_raidz_map(void)
+{
+ zio_bench.io_offset = 0;
+ zio_bench.io_size = max_data_size;
+
+ /*
+ * To permit larger column sizes these buffers have to be
+ * allocated using aligned alloc instead of zio_abd_buf_alloc
+ */
+ zio_bench.io_abd = raidz_alloc(max_data_size);
+
+ init_zio_abd(&zio_bench);
+}
+
+static void
+bench_fini_raidz_maps(void)
+{
+ /* tear down golden zio */
+ raidz_free(zio_bench.io_abd, max_data_size);
+ bzero(&zio_bench, sizeof (zio_t));
+}
+
+static inline void
+run_gen_bench_impl(const char *impl)
+{
+ int fn, ncols;
+ uint64_t ds, iter_cnt, iter, disksize;
+ hrtime_t start;
+ double elapsed, d_bw;
+
+ /* Benchmark generate functions */
+ for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+
+ for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) {
+ /* create suitable raidz_map */
+ ncols = rto_opts.rto_dcols + fn + 1;
+ zio_bench.io_size = 1ULL << ds;
+ rm_bench = vdev_raidz_map_alloc(&zio_bench,
+ BENCH_ASHIFT, ncols, fn+1);
+
+ /* estimate iteration count */
+ iter_cnt = GEN_BENCH_MEMORY;
+ iter_cnt /= zio_bench.io_size;
+
+ start = gethrtime();
+ for (iter = 0; iter < iter_cnt; iter++)
+ vdev_raidz_generate_parity(rm_bench);
+ elapsed = NSEC2SEC((double)(gethrtime() - start));
+
+ disksize = (1ULL << ds) / rto_opts.rto_dcols;
+ d_bw = (double)iter_cnt * (double)disksize;
+ d_bw /= (1024.0 * 1024.0 * elapsed);
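+ /* d_bw: throughput per data column in MiB/s; total_bw = d_bw * ncols */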
+
+ LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n",
+ impl,
+ raidz_gen_name[fn],
+ rto_opts.rto_dcols,
+ (1ULL<<ds),
+ d_bw,
+ d_bw * (double)(ncols),
+ (unsigned)iter_cnt);
+
+ vdev_raidz_map_free(rm_bench);
+ }
+ }
+}
+
+static void
+run_gen_bench(void)
+{
+ char **impl_name;
+
+ LOG(D_INFO, DBLSEP "\nBenchmarking parity generation...\n\n");
+ LOG(D_ALL, "impl, math, dcols, iosize, disk_bw, total_bw, iter\n");
+
+ for (impl_name = (char **)raidz_impl_names; *impl_name != NULL;
+ impl_name++) {
+
+ if (vdev_raidz_impl_set(*impl_name) != 0)
+ continue;
+
+ run_gen_bench_impl(*impl_name);
+ }
+}
+
+static void
+run_rec_bench_impl(const char *impl)
+{
+ int fn, ncols, nbad;
+ uint64_t ds, iter_cnt, iter, disksize;
+ hrtime_t start;
+ double elapsed, d_bw;
+ static const int tgt[7][3] = {
+ {1, 2, 3}, /* rec_p: bad QR & D[0] */
+ {0, 2, 3}, /* rec_q: bad PR & D[0] */
+ {0, 1, 3}, /* rec_r: bad PQ & D[0] */
+ {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
+ {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
+ {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
+ {3, 4, 5} /* rec_pqr: bad D[0][1][2] */
+ };
+
+ for (fn = 0; fn < RAIDZ_REC_NUM; fn++) {
+ for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) {
+
+ /* create suitable raidz_map */
+ ncols = rto_opts.rto_dcols + PARITY_PQR;
+ zio_bench.io_size = 1ULL << ds;
+
+ /*
+ * raidz block is too short to test
+ * the requested method
+ */
+ if (zio_bench.io_size / rto_opts.rto_dcols <
+ (1ULL << BENCH_ASHIFT))
+ continue;
+
+ rm_bench = vdev_raidz_map_alloc(&zio_bench,
+ BENCH_ASHIFT, ncols, PARITY_PQR);
+
+ /* estimate iteration count */
+ iter_cnt = (REC_BENCH_MEMORY);
+ iter_cnt /= zio_bench.io_size;
+
+ /* calculate how many bad columns there are */
+ nbad = MIN(3, raidz_ncols(rm_bench) -
+ raidz_parity(rm_bench));
+
+ start = gethrtime();
+ for (iter = 0; iter < iter_cnt; iter++)
+ vdev_raidz_reconstruct(rm_bench, tgt[fn], nbad);
+ elapsed = NSEC2SEC((double)(gethrtime() - start));
+
+ disksize = (1ULL << ds) / rto_opts.rto_dcols;
+ d_bw = (double)iter_cnt * (double)(disksize);
+ d_bw /= (1024.0 * 1024.0 * elapsed);
+
+ LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n",
+ impl,
+ raidz_rec_name[fn],
+ rto_opts.rto_dcols,
+ (1ULL<<ds),
+ d_bw,
+ d_bw * (double)ncols,
+ (unsigned)iter_cnt);
+
+ vdev_raidz_map_free(rm_bench);
+ }
+ }
+}
+
+static void
+run_rec_bench(void)
+{
+ char **impl_name;
+
+ LOG(D_INFO, DBLSEP "\nBenchmarking data reconstruction...\n\n");
+ LOG(D_ALL, "impl, math, dcols, iosize, disk_bw, total_bw, iter\n");
+
+ for (impl_name = (char **)raidz_impl_names; *impl_name != NULL;
+ impl_name++) {
+
+ if (vdev_raidz_impl_set(*impl_name) != 0)
+ continue;
+
+ run_rec_bench_impl(*impl_name);
+ }
+}
+
+void
+run_raidz_benchmark(void)
+{
+ bench_init_raidz_map();
+
+ run_gen_bench();
+ run_rec_bench();
+
+ bench_fini_raidz_maps();
+}
diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c
new file mode 100644
index 000000000000..66f36b0d56ca
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c
@@ -0,0 +1,782 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/zio.h>
+#include <umem.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
+#include <assert.h>
+#include <stdio.h>
+#include "raidz_test.h"
+
+static int *rand_data;
+raidz_test_opts_t rto_opts;
+
+static char gdb[256];
+static const char gdb_tmpl[] = "gdb -ex \"set pagination 0\" -p %d";
+
+static void sig_handler(int signo)
+{
+ struct sigaction action;
+ /*
+ * Restore default action and re-raise signal so SIGSEGV and
+ * SIGABRT can trigger a core dump.
+ */
+ action.sa_handler = SIG_DFL;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+ (void) sigaction(signo, &action, NULL);
+
+ if (rto_opts.rto_gdb)
+ if (system(gdb)) { }
+
+ raise(signo);
+}
+
+static void print_opts(raidz_test_opts_t *opts, boolean_t force)
+{
+ char *verbose;
+ switch (opts->rto_v) {
+ case 0:
+ verbose = "no";
+ break;
+ case 1:
+ verbose = "info";
+ break;
+ default:
+ verbose = "debug";
+ break;
+ }
+
+ if (force || opts->rto_v >= D_INFO) {
+ (void) fprintf(stdout, DBLSEP "Running with options:\n"
+ " (-a) zio ashift : %zu\n"
+ " (-o) zio offset : 1 << %zu\n"
+ " (-d) number of raidz data columns : %zu\n"
+ " (-s) size of DATA : 1 << %zu\n"
+ " (-S) sweep parameters : %s \n"
+ " (-v) verbose : %s \n\n",
+ opts->rto_ashift, /* -a */
+ ilog2(opts->rto_offset), /* -o */
+ opts->rto_dcols, /* -d */
+ ilog2(opts->rto_dsize), /* -s */
+ opts->rto_sweep ? "yes" : "no", /* -S */
+ verbose); /* -v */
+ }
+}
+
+static void usage(boolean_t requested)
+{
+ const raidz_test_opts_t *o = &rto_opts_defaults;
+
+ FILE *fp = requested ? stdout : stderr;
+
+ (void) fprintf(fp, "Usage:\n"
+ "\t[-a zio ashift (default: %zu)]\n"
+ "\t[-o zio offset, exponent radix 2 (default: %zu)]\n"
+ "\t[-d number of raidz data columns (default: %zu)]\n"
+ "\t[-s zio size, exponent radix 2 (default: %zu)]\n"
+ "\t[-S parameter sweep (default: %s)]\n"
+ "\t[-t timeout for parameter sweep test]\n"
+ "\t[-B benchmark all raidz implementations]\n"
+ "\t[-v increase verbosity (default: %zu)]\n"
+ "\t[-h (print help)]\n"
+ "\t[-T test the test, see if failure would be detected]\n"
+ "\t[-D debug (attach gdb on SIGSEGV)]\n"
+ "",
+ o->rto_ashift, /* -a */
+ ilog2(o->rto_offset), /* -o */
+ o->rto_dcols, /* -d */
+ ilog2(o->rto_dsize), /* -s */
+ rto_opts.rto_sweep ? "yes" : "no", /* -S */
+ o->rto_v); /* -v */
+
+ exit(requested ? 0 : 1);
+}
+
+static void process_options(int argc, char **argv)
+{
+ size_t value;
+ int opt;
+
+ raidz_test_opts_t *o = &rto_opts;
+
+ bcopy(&rto_opts_defaults, o, sizeof (*o));
+
+ while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) {
+ value = 0;
+
+ switch (opt) {
+ case 'a':
+ value = strtoull(optarg, NULL, 0);
+ o->rto_ashift = MIN(13, MAX(9, value));
+ break;
+ case 'o':
+ value = strtoull(optarg, NULL, 0);
+ o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9;
+ break;
+ case 'd':
+ value = strtoull(optarg, NULL, 0);
+ o->rto_dcols = MIN(255, MAX(1, value));
+ break;
+ case 's':
+ value = strtoull(optarg, NULL, 0);
+ o->rto_dsize = 1ULL << MIN(SPA_MAXBLOCKSHIFT,
+ MAX(SPA_MINBLOCKSHIFT, value));
+ break;
+ case 't':
+ value = strtoull(optarg, NULL, 0);
+ o->rto_sweep_timeout = value;
+ break;
+ case 'v':
+ o->rto_v++;
+ break;
+ case 'S':
+ o->rto_sweep = 1;
+ break;
+ case 'B':
+ o->rto_benchmark = 1;
+ break;
+ case 'D':
+ o->rto_gdb = 1;
+ break;
+ case 'T':
+ o->rto_sanity = 1;
+ break;
+ case 'h':
+ usage(B_TRUE);
+ break;
+ case '?':
+ default:
+ usage(B_FALSE);
+ break;
+ }
+ }
+}
+
+#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd)
+#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size)
+
+#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd)
+#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size)
+
+static int
+cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
+{
+ int i, ret = 0;
+
+ VERIFY(parity >= 1 && parity <= 3);
+
+ for (i = 0; i < parity; i++) {
+ if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i))
+ != 0) {
+ ret++;
+ LOG_OPT(D_DEBUG, opts,
+ "\nParity block [%d] different!\n", i);
+ }
+ }
+ return (ret);
+}
+
+static int
+cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
+{
+ int i, ret = 0;
+ int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden);
+
+ for (i = 0; i < dcols; i++) {
+ if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i))
+ != 0) {
+ ret++;
+
+ LOG_OPT(D_DEBUG, opts,
+ "\nData block [%d] different!\n", i);
+ }
+ }
+ return (ret);
+}
+
+static int
+init_rand(void *data, size_t size, void *private)
+{
+ int i;
+ int *dst = (int *)data;
+
+ for (i = 0; i < size / sizeof (int); i++)
+ dst[i] = rand_data[i];
+
+ return (0);
+}
+
+static void
+corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt)
+{
+ int i;
+ raidz_col_t *col;
+
+ for (i = 0; i < cnt; i++) {
+ col = &rm->rm_col[tgts[i]];
+ abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL);
+ }
+}
+
+void
+init_zio_abd(zio_t *zio)
+{
+ abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL);
+}
+
+static void
+fini_raidz_map(zio_t **zio, raidz_map_t **rm)
+{
+ vdev_raidz_map_free(*rm);
+ raidz_free((*zio)->io_abd, (*zio)->io_size);
+ umem_free(*zio, sizeof (zio_t));
+
+ *zio = NULL;
+ *rm = NULL;
+}
+
+static int
+init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
+{
+ int err = 0;
+ zio_t *zio_test;
+ raidz_map_t *rm_test;
+ const size_t total_ncols = opts->rto_dcols + parity;
+
+ if (opts->rm_golden) {
+ fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
+ }
+
+ opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
+ zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
+
+ opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset;
+ opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize;
+
+ opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize);
+ zio_test->io_abd = raidz_alloc(opts->rto_dsize);
+
+ init_zio_abd(opts->zio_golden);
+ init_zio_abd(zio_test);
+
+ VERIFY0(vdev_raidz_impl_set("original"));
+
+ opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
+ opts->rto_ashift, total_ncols, parity);
+ rm_test = vdev_raidz_map_alloc(zio_test,
+ opts->rto_ashift, total_ncols, parity);
+
+ VERIFY(opts->zio_golden);
+ VERIFY(opts->rm_golden);
+
+ vdev_raidz_generate_parity(opts->rm_golden);
+ vdev_raidz_generate_parity(rm_test);
+
+ /* sanity check */
+ err |= cmp_data(opts, rm_test);
+ err |= cmp_code(opts, rm_test, parity);
+
+ if (err)
+ ERR("initializing the golden copy ... [FAIL]!\n");
+
+ /* tear down raidz_map of test zio */
+ fini_raidz_map(&zio_test, &rm_test);
+
+ return (err);
+}
+
+static raidz_map_t *
+init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
+{
+ raidz_map_t *rm = NULL;
+ const size_t alloc_dsize = opts->rto_dsize;
+ const size_t total_ncols = opts->rto_dcols + parity;
+ const int ccols[] = { 0, 1, 2 };
+
+ VERIFY(zio);
+ VERIFY(parity <= 3 && parity >= 1);
+
+ *zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
+
+ (*zio)->io_offset = 0;
+ (*zio)->io_size = alloc_dsize;
+ (*zio)->io_abd = raidz_alloc(alloc_dsize);
+ init_zio_abd(*zio);
+
+ rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
+ total_ncols, parity);
+ VERIFY(rm);
+
+ /* Make sure code columns are destroyed */
+ corrupt_colums(rm, ccols, parity);
+
+ return (rm);
+}
+
+static int
+run_gen_check(raidz_test_opts_t *opts)
+{
+ char **impl_name;
+ int fn, err = 0;
+ zio_t *zio_test;
+ raidz_map_t *rm_test;
+
+ err = init_raidz_golden_map(opts, PARITY_PQR);
+ if (0 != err)
+ return (err);
+
+ LOG(D_INFO, DBLSEP);
+ LOG(D_INFO, "Testing parity generation...\n");
+
+ for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
+ impl_name++) {
+
+ LOG(D_INFO, SEP);
+ LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);
+
+ if (0 != vdev_raidz_impl_set(*impl_name)) {
+ LOG(D_INFO, "[SKIP]\n");
+ continue;
+ } else {
+ LOG(D_INFO, "[SUPPORTED]\n");
+ }
+
+ for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+
+ /* Check if should stop */
+ if (rto_opts.rto_should_stop)
+ return (err);
+
+ /* create suitable raidz_map */
+ rm_test = init_raidz_map(opts, &zio_test, fn+1);
+ VERIFY(rm_test);
+
+ LOG(D_INFO, "\t\tTesting method [%s] ...",
+ raidz_gen_name[fn]);
+
+ if (!opts->rto_sanity)
+ vdev_raidz_generate_parity(rm_test);
+
+ if (cmp_code(opts, rm_test, fn+1) != 0) {
+ LOG(D_INFO, "[FAIL]\n");
+ err++;
+ } else
+ LOG(D_INFO, "[PASS]\n");
+
+ fini_raidz_map(&zio_test, &rm_test);
+ }
+ }
+
+ fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
+
+ return (err);
+}
+
+static int
+run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
+{
+ int x0, x1, x2;
+ int tgtidx[3];
+ int err = 0;
+ static const int rec_tgts[7][3] = {
+ {1, 2, 3}, /* rec_p: bad QR & D[0] */
+ {0, 2, 3}, /* rec_q: bad PR & D[0] */
+ {0, 1, 3}, /* rec_r: bad PQ & D[0] */
+ {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
+ {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
+ {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
+ {3, 4, 5} /* rec_pqr: bad D[0][1][2] */
+ };
+
+ memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx));
+
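+ /*
+ * tgtidx[] starts as the parity/data targets for this method; the
+ * trailing data entries are overwritten below with each tested
+ * combination of data columns.
+ */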
+ if (fn < RAIDZ_REC_PQ) {
+ /* can reconstruct 1 failed data disk */
+ for (x0 = 0; x0 < opts->rto_dcols; x0++) {
+ if (x0 >= rm->rm_cols - raidz_parity(rm))
+ continue;
+
+ /* Check if should stop */
+ if (rto_opts.rto_should_stop)
+ return (err);
+
+ LOG(D_DEBUG, "[%d] ", x0);
+
+ tgtidx[2] = x0 + raidz_parity(rm);
+
+ corrupt_colums(rm, tgtidx+2, 1);
+
+ if (!opts->rto_sanity)
+ vdev_raidz_reconstruct(rm, tgtidx, 3);
+
+ if (cmp_data(opts, rm) != 0) {
+ err++;
+ LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0);
+ }
+ }
+
+ } else if (fn < RAIDZ_REC_PQR) {
+ /* can reconstruct 2 failed data disks */
+ for (x0 = 0; x0 < opts->rto_dcols; x0++) {
+ if (x0 >= rm->rm_cols - raidz_parity(rm))
+ continue;
+ for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
+ if (x1 >= rm->rm_cols - raidz_parity(rm))
+ continue;
+
+ /* Check if should stop */
+ if (rto_opts.rto_should_stop)
+ return (err);
+
+ LOG(D_DEBUG, "[%d %d] ", x0, x1);
+
+ tgtidx[1] = x0 + raidz_parity(rm);
+ tgtidx[2] = x1 + raidz_parity(rm);
+
+ corrupt_colums(rm, tgtidx+1, 2);
+
+ if (!opts->rto_sanity)
+ vdev_raidz_reconstruct(rm, tgtidx, 3);
+
+ if (cmp_data(opts, rm) != 0) {
+ err++;
+ LOG(D_DEBUG, "\nREC D[%d %d]... "
+ "[FAIL]\n", x0, x1);
+ }
+ }
+ }
+ } else {
+ /* can reconstruct 3 failed data disks */
+ for (x0 = 0; x0 < opts->rto_dcols; x0++) {
+ if (x0 >= rm->rm_cols - raidz_parity(rm))
+ continue;
+ for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
+ if (x1 >= rm->rm_cols - raidz_parity(rm))
+ continue;
+ for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) {
+ if (x2 >=
+ rm->rm_cols - raidz_parity(rm))
+ continue;
+
+ /* Check if should stop */
+ if (rto_opts.rto_should_stop)
+ return (err);
+
+ LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2);
+
+ tgtidx[0] = x0 + raidz_parity(rm);
+ tgtidx[1] = x1 + raidz_parity(rm);
+ tgtidx[2] = x2 + raidz_parity(rm);
+
+ corrupt_colums(rm, tgtidx, 3);
+
+ if (!opts->rto_sanity)
+ vdev_raidz_reconstruct(rm,
+ tgtidx, 3);
+
+ if (cmp_data(opts, rm) != 0) {
+ err++;
+ LOG(D_DEBUG,
+ "\nREC D[%d %d %d]... "
+ "[FAIL]\n", x0, x1, x2);
+ }
+ }
+ }
+ }
+ }
+ return (err);
+}
+
+static int
+run_rec_check(raidz_test_opts_t *opts)
+{
+ char **impl_name;
+ unsigned fn, err = 0;
+ zio_t *zio_test;
+ raidz_map_t *rm_test;
+
+ err = init_raidz_golden_map(opts, PARITY_PQR);
+ if (0 != err)
+ return (err);
+
+ LOG(D_INFO, DBLSEP);
+ LOG(D_INFO, "Testing data reconstruction...\n");
+
+ for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
+ impl_name++) {
+
+ LOG(D_INFO, SEP);
+ LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);
+
+ if (vdev_raidz_impl_set(*impl_name) != 0) {
+ LOG(D_INFO, "[SKIP]\n");
+ continue;
+ } else
+ LOG(D_INFO, "[SUPPORTED]\n");
+
+
+ /* create suitable raidz_map */
+ rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR);
+ /* generate parity */
+ vdev_raidz_generate_parity(rm_test);
+
+ for (fn = 0; fn < RAIDZ_REC_NUM; fn++) {
+
+ LOG(D_INFO, "\t\tTesting method [%s] ...",
+ raidz_rec_name[fn]);
+
+ if (run_rec_check_impl(opts, rm_test, fn) != 0) {
+ LOG(D_INFO, "[FAIL]\n");
+ err++;
+
+ } else
+ LOG(D_INFO, "[PASS]\n");
+
+ }
+ /* tear down test raidz_map */
+ fini_raidz_map(&zio_test, &rm_test);
+ }
+
+ fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
+
+ return (err);
+}
+
+static int
+run_test(raidz_test_opts_t *opts)
+{
+ int err = 0;
+
+ if (opts == NULL)
+ opts = &rto_opts;
+
+ print_opts(opts, B_FALSE);
+
+ err |= run_gen_check(opts);
+ err |= run_rec_check(opts);
+
+ return (err);
+}
+
+#define SWEEP_RUNNING 0
+#define SWEEP_FINISHED 1
+#define SWEEP_ERROR 2
+#define SWEEP_TIMEOUT 3
+
+static int sweep_state = 0;
+static raidz_test_opts_t failed_opts;
+
+static kmutex_t sem_mtx;
+static kcondvar_t sem_cv;
+static int max_free_slots;
+static int free_slots;
+
+static void
+sweep_thread(void *arg)
+{
+ int err = 0;
+ raidz_test_opts_t *opts = (raidz_test_opts_t *)arg;
+ VERIFY(opts != NULL);
+
+ err = run_test(opts);
+
+ if (rto_opts.rto_sanity) {
+ /* 25% chance that a sweep test fails */
+ if (rand() < (RAND_MAX/4))
+ err = 1;
+ }
+
+ if (0 != err) {
+ mutex_enter(&sem_mtx);
+ memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t));
+ sweep_state = SWEEP_ERROR;
+ mutex_exit(&sem_mtx);
+ }
+
+ umem_free(opts, sizeof (raidz_test_opts_t));
+
+ /* signal the next thread */
+ mutex_enter(&sem_mtx);
+ free_slots++;
+ cv_signal(&sem_cv);
+ mutex_exit(&sem_mtx);
+
+ thread_exit();
+}
+
+static int
+run_sweep(void)
+{
+ static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 };
+ static const size_t ashift_v[] = { 9, 12, 14 };
+ static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12),
+ 1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE };
+
+ (void) setvbuf(stdout, NULL, _IONBF, 0);
+
+ ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) *
+ ARRAY_SIZE(dcols_v);
+ ulong_t tried_comb = 0;
+ hrtime_t time_diff, start_time = gethrtime();
+ raidz_test_opts_t *opts;
+ int a, d, s;
+
+ max_free_slots = free_slots = MAX(2, boot_ncpus);
+
+ mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&sem_cv, NULL, CV_DEFAULT, NULL);
+
+ for (s = 0; s < ARRAY_SIZE(size_v); s++)
+ for (a = 0; a < ARRAY_SIZE(ashift_v); a++)
+ for (d = 0; d < ARRAY_SIZE(dcols_v); d++) {
+
+ if (size_v[s] < (1 << ashift_v[a])) {
+ total_comb--;
+ continue;
+ }
+
+ if (++tried_comb % 20 == 0)
+ LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb);
+
+ /* wait for signal to start new thread */
+ mutex_enter(&sem_mtx);
+ while (cv_timedwait_sig(&sem_cv, &sem_mtx,
+ ddi_get_lbolt() + hz)) {
+
+ /* check if should stop the test (timeout) */
+ time_diff = (gethrtime() - start_time) / NANOSEC;
+ if (rto_opts.rto_sweep_timeout > 0 &&
+ time_diff >= rto_opts.rto_sweep_timeout) {
+ sweep_state = SWEEP_TIMEOUT;
+ rto_opts.rto_should_stop = B_TRUE;
+ mutex_exit(&sem_mtx);
+ goto exit;
+ }
+
+ /* check if should stop the test (error) */
+ if (sweep_state != SWEEP_RUNNING) {
+ mutex_exit(&sem_mtx);
+ goto exit;
+ }
+
+ /* exit loop if a slot is available */
+ if (free_slots > 0) {
+ break;
+ }
+ }
+
+ free_slots--;
+ mutex_exit(&sem_mtx);
+
+ opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL);
+ opts->rto_ashift = ashift_v[a];
+ opts->rto_dcols = dcols_v[d];
+ opts->rto_offset = (1 << ashift_v[a]) * rand();
+ opts->rto_dsize = size_v[s];
+ opts->rto_v = 0; /* be quiet */
+
+ VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts,
+ 0, NULL, TS_RUN, defclsyspri), !=, NULL);
+ }
+
+exit:
+ LOG(D_ALL, "\nWaiting for test threads to finish...\n");
+ mutex_enter(&sem_mtx);
+ VERIFY(free_slots <= max_free_slots);
+ while (free_slots < max_free_slots) {
+ (void) cv_wait(&sem_cv, &sem_mtx);
+ }
+ mutex_exit(&sem_mtx);
+
+ if (sweep_state == SWEEP_ERROR) {
+ ERR("Sweep test failed! Failed option: \n");
+ print_opts(&failed_opts, B_TRUE);
+ } else {
+ if (sweep_state == SWEEP_TIMEOUT)
+ LOG(D_ALL, "Test timeout (%lus). Stopping...\n",
+ (ulong_t)rto_opts.rto_sweep_timeout);
+
+ LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n",
+ (ulong_t)tried_comb);
+ }
+
+ mutex_destroy(&sem_mtx);
+
+ return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0);
+}
+
+int
+main(int argc, char **argv)
+{
+ size_t i;
+ struct sigaction action;
+ int err = 0;
+
+ /* init gdb string early */
+ (void) sprintf(gdb, gdb_tmpl, getpid());
+
+ action.sa_handler = sig_handler;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+
+ if (sigaction(SIGSEGV, &action, NULL) < 0) {
+ ERR("raidz_test: cannot catch SIGSEGV: %s.\n", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ (void) setvbuf(stdout, NULL, _IOLBF, 0);
+
+ dprintf_setup(&argc, argv);
+
+ process_options(argc, argv);
+
+ kernel_init(SPA_MODE_READ);
+
+ /* setup random data because rand() is not reentrant */
+ rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+ srand((unsigned)time(NULL) * getpid());
+ for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++)
+ rand_data[i] = rand();
+
+ mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ);
+
+ if (rto_opts.rto_benchmark) {
+ run_raidz_benchmark();
+ } else if (rto_opts.rto_sweep) {
+ err = run_sweep();
+ } else {
+ err = run_test(NULL);
+ }
+
+ umem_free(rand_data, SPA_MAXBLOCKSIZE);
+ kernel_fini();
+
+ return (err);
+}
diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h
new file mode 100644
index 000000000000..09c825ae43c7
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h
@@ -0,0 +1,116 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#ifndef RAIDZ_TEST_H
+#define RAIDZ_TEST_H
+
+#include <sys/spa.h>
+
+static const char *raidz_impl_names[] = {
+ "original",
+ "scalar",
+ "sse2",
+ "ssse3",
+ "avx2",
+ "avx512f",
+ "avx512bw",
+ "aarch64_neon",
+ "aarch64_neonx2",
+ "powerpc_altivec",
+ NULL
+};
+
+typedef struct raidz_test_opts {
+ size_t rto_ashift;
+ size_t rto_offset;
+ size_t rto_dcols;
+ size_t rto_dsize;
+ size_t rto_v;
+ size_t rto_sweep;
+ size_t rto_sweep_timeout;
+ size_t rto_benchmark;
+ size_t rto_sanity;
+ size_t rto_gdb;
+
+ /* non-user options */
+ boolean_t rto_should_stop;
+
+ zio_t *zio_golden;
+ raidz_map_t *rm_golden;
+} raidz_test_opts_t;
+
+static const raidz_test_opts_t rto_opts_defaults = {
+ .rto_ashift = 9,
+ .rto_offset = 1ULL << 0,
+ .rto_dcols = 8,
+ .rto_dsize = 1<<19,
+ .rto_v = 0,
+ .rto_sweep = 0,
+ .rto_benchmark = 0,
+ .rto_sanity = 0,
+ .rto_gdb = 0,
+ .rto_should_stop = B_FALSE
+};
+
+extern raidz_test_opts_t rto_opts;
+
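+/* Integer floor(log2(a)); e.g. ilog2(1 << 19) == 19. */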
+static inline size_t ilog2(size_t a)
+{
+ return (a > 1 ? 1 + ilog2(a >> 1) : 0);
+}
+
+
+#define D_ALL 0
+#define D_INFO 1
+#define D_DEBUG 2
+
+#define LOG(lvl, a...) \
+{ \
+ if (rto_opts.rto_v >= lvl) \
+ (void) fprintf(stdout, a); \
+} \
+
+#define LOG_OPT(lvl, opt, a...) \
+{ \
+ if (opt->rto_v >= lvl) \
+ (void) fprintf(stdout, a); \
+} \
+
+#define ERR(a...) (void) fprintf(stderr, a)
+
+
+#define DBLSEP "================\n"
+#define SEP "----------------\n"
+
+
+#define raidz_alloc(size) abd_alloc(size, B_FALSE)
+#define raidz_free(p, size) abd_free(p)
+
+
+void init_zio_abd(zio_t *zio);
+
+void run_raidz_benchmark(void);
+
+#endif /* RAIDZ_TEST_H */
diff --git a/sys/contrib/openzfs/cmd/vdev_id/Makefile.am b/sys/contrib/openzfs/cmd/vdev_id/Makefile.am
new file mode 100644
index 000000000000..fb815faad084
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/vdev_id/Makefile.am
@@ -0,0 +1 @@
+dist_udev_SCRIPTS = vdev_id
diff --git a/sys/contrib/openzfs/cmd/vdev_id/vdev_id b/sys/contrib/openzfs/cmd/vdev_id/vdev_id
new file mode 100755
index 000000000000..8a75e638b67e
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/vdev_id/vdev_id
@@ -0,0 +1,605 @@
+#!/bin/sh
+#
+# vdev_id: udev helper to generate user-friendly names for JBOD disks
+#
+# This script parses the file /etc/zfs/vdev_id.conf to map a
+# physical path in a storage topology to a channel name. The
+# channel name is combined with a disk enclosure slot number to
+# create an alias that reflects the physical location of the drive.
+# This is particularly helpful when it comes to tasks like replacing
+# failed drives. Slot numbers may also be re-mapped in case the
+# default numbering is unsatisfactory. The drive aliases will be
+# created as symbolic links in /dev/disk/by-vdev.
+#
+# The currently supported topologies are sas_direct and sas_switch.
+# A multipath mode is supported in which dm-mpath devices are
+# handled by examining the first-listed running component disk. In
+# multipath mode the configuration file should contain a channel
+# definition with the same name for each path to a given enclosure.
+#
+# The alias keyword provides a simple way to map already-existing
+# device symlinks to more convenient names. It is suitable for
+# small, static configurations or for sites that have some automated
+# way to generate the mapping file.
+#
+#
+# Some example configuration files are given below.
+
+# #
+# # Example vdev_id.conf - sas_direct.
+# #
+#
+# multipath no
+# topology sas_direct
+# phys_per_port 4
+# slot bay
+#
+# # PCI_ID HBA PORT CHANNEL NAME
+# channel 85:00.0 1 A
+# channel 85:00.0 0 B
+# channel 86:00.0 1 C
+# channel 86:00.0 0 D
+#
+# # Custom mapping for Channel A
+#
+# # Linux Mapped
+# # Slot Slot Channel
+# slot 1 7 A
+# slot 2 10 A
+# slot 3 3 A
+# slot 4 6 A
+#
+# # Default mapping for B, C, and D
+# slot 1 4
+# slot 2 2
+# slot 3 1
+# slot 4 3
+
+# #
+# # Example vdev_id.conf - sas_switch
+# #
+#
+# topology sas_switch
+#
+# # SWITCH PORT CHANNEL NAME
+# channel 1 A
+# channel 2 B
+# channel 3 C
+# channel 4 D
+
+# #
+# # Example vdev_id.conf - multipath
+# #
+#
+# multipath yes
+#
+# # PCI_ID HBA PORT CHANNEL NAME
+# channel 85:00.0 1 A
+# channel 85:00.0 0 B
+# channel 86:00.0 1 A
+# channel 86:00.0 0 B
+
+# #
+# # Example vdev_id.conf - alias
+# #
+#
+# # by-vdev
+# # name fully qualified or base name of device link
+# alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca
+# alias d2 wwn-0x5000c5002def789e
+
+PATH=/bin:/sbin:/usr/bin:/usr/sbin
+CONFIG=/etc/zfs/vdev_id.conf
+PHYS_PER_PORT=
+DEV=
+MULTIPATH=
+TOPOLOGY=
+BAY=
+
+usage() {
+ cat << EOF
+Usage: vdev_id [-h]
+ vdev_id <-d device> [-c config_file] [-p phys_per_port]
+ [-g sas_direct|sas_switch|scsi] [-m]
+
+ -c specify name of an alternative config file [default=$CONFIG]
+ -d specify basename of device (e.g. sda)
+ -e Create enclosure device symlinks only (/dev/by-enclosure)
+ -g Storage network topology [default="$TOPOLOGY"]
+ -m Run in multipath mode
+ -p number of phys per switch port [default=$PHYS_PER_PORT]
+ -h show this summary
+EOF
+ exit 0
+}
+
+map_slot() {
+ LINUX_SLOT=$1
+ CHANNEL=$2
+
+ MAPPED_SLOT=`awk "\\$1 == \"slot\" && \\$2 == ${LINUX_SLOT} && \
+ \\$4 ~ /^${CHANNEL}$|^$/ { print \\$3; exit }" $CONFIG`
+ if [ -z "$MAPPED_SLOT" ] ; then
+ MAPPED_SLOT=$LINUX_SLOT
+ fi
+ printf "%d" ${MAPPED_SLOT}
+}
+
+map_channel() {
+ MAPPED_CHAN=
+ PCI_ID=$1
+ PORT=$2
+
+ case $TOPOLOGY in
+ "sas_switch")
+ MAPPED_CHAN=`awk "\\$1 == \"channel\" && \\$2 == ${PORT} \
+ { print \\$3; exit }" $CONFIG`
+ ;;
+ "sas_direct"|"scsi")
+ MAPPED_CHAN=`awk "\\$1 == \"channel\" && \
+ \\$2 == \"${PCI_ID}\" && \\$3 == ${PORT} \
+ { print \\$4; exit }" $CONFIG`
+ ;;
+ esac
+ printf "%s" ${MAPPED_CHAN}
+}
+
+sas_handler() {
+ if [ -z "$PHYS_PER_PORT" ] ; then
+ PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \
+ {print \\$2; exit}" $CONFIG`
+ fi
+ PHYS_PER_PORT=${PHYS_PER_PORT:-4}
+ if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then
+ echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric"
+ exit 1
+ fi
+
+ if [ -z "$MULTIPATH_MODE" ] ; then
+ MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \
+ {print \\$2; exit}" $CONFIG`
+ fi
+
+ # Use first running component device if we're handling a dm-mpath device
+ if [ "$MULTIPATH_MODE" = "yes" ] ; then
+ # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper
+ if [ -z "$DM_NAME" ] ; then
+ DM_NAME=`ls -l --full-time /dev/mapper |
+ awk "/\/$DEV$/{print \\$9}"`
+ fi
+
+ # For raw disks udev exports DEVTYPE=partition when
+ # handling partitions, and the rules can be written to
+ # take advantage of this to append a -part suffix. For
+ # dm devices we get DEVTYPE=disk even for partitions so
+ # we have to append the -part suffix directly in the
+ # helper.
+ if [ "$DEVTYPE" != "partition" ] ; then
+ PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'`
+ fi
+
+ # Strip off partition information.
+ DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'`
+ if [ -z "$DM_NAME" ] ; then
+ return
+ fi
+
+ # Get the raw scsi device name from multipath -ll. Strip off
+ # leading pipe symbols to make field numbering consistent.
+ DEV=`multipath -ll $DM_NAME |
+ awk '/running/{gsub("^[|]"," "); print $3 ; exit}'`
+ if [ -z "$DEV" ] ; then
+ return
+ fi
+ fi
+
+ if echo $DEV | grep -q ^/devices/ ; then
+ sys_path=$DEV
+ else
+ sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null`
+ fi
+
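+ # A direct-attached SAS sys_path typically looks something like this
+ # (illustrative):
+ # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host2/port-2:2/end_device-2:2/target2:0:0/2:0:0:0/block/sdb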
+ # Use positional parameters as an ad-hoc array
+ set -- $(echo "$sys_path" | tr / ' ')
+ num_dirs=$#
+ scsi_host_dir="/sys"
+
+ # Get path up to /sys/.../hostX
+ i=1
+ while [ $i -le $num_dirs ] ; do
+ d=$(eval echo \${$i})
+ scsi_host_dir="$scsi_host_dir/$d"
+ echo $d | grep -q -E '^host[0-9]+$' && break
+ i=$(($i + 1))
+ done
+
+ if [ $i = $num_dirs ] ; then
+ return
+ fi
+
+ PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}')
+
+ # In sas_switch mode, the directory four levels beneath
+ # /sys/.../hostX contains symlinks to phy devices that reveal
+ # the switch port number. In sas_direct mode, the phy links one
+ # directory down reveal the HBA port.
+ port_dir=$scsi_host_dir
+ case $TOPOLOGY in
+ "sas_switch") j=$(($i + 4)) ;;
+ "sas_direct") j=$(($i + 1)) ;;
+ esac
+
+ i=$(($i + 1))
+ while [ $i -le $j ] ; do
+ port_dir="$port_dir/$(eval echo \${$i})"
+ i=$(($i + 1))
+ done
+
+ PHY=`ls -d $port_dir/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}'`
+ if [ -z "$PHY" ] ; then
+ PHY=0
+ fi
+ PORT=$(( $PHY / $PHYS_PER_PORT ))
+
+ # Look in /sys/.../sas_device/end_device-X for the bay_identifier
+ # attribute.
+ end_device_dir=$port_dir
+ while [ $i -lt $num_dirs ] ; do
+ d=$(eval echo \${$i})
+ end_device_dir="$end_device_dir/$d"
+ if echo $d | grep -q '^end_device' ; then
+ end_device_dir="$end_device_dir/sas_device/$d"
+ break
+ fi
+ i=$(($i + 1))
+ done
+
+ SLOT=
+ case $BAY in
+ "bay")
+ SLOT=`cat $end_device_dir/bay_identifier 2>/dev/null`
+ ;;
+ "phy")
+ SLOT=`cat $end_device_dir/phy_identifier 2>/dev/null`
+ ;;
+ "port")
+ d=$(eval echo \${$i})
+ SLOT=`echo $d | sed -e 's/^.*://'`
+ ;;
+ "id")
+ i=$(($i + 1))
+ d=$(eval echo \${$i})
+ SLOT=`echo $d | sed -e 's/^.*://'`
+ ;;
+ "lun")
+ i=$(($i + 2))
+ d=$(eval echo \${$i})
+ SLOT=`echo $d | sed -e 's/^.*://'`
+ ;;
+ "ses")
+ # look for this SAS path in all SCSI Enclosure Services
+ # (SES) enclosures
+ sas_address=`cat $end_device_dir/sas_address 2>/dev/null`
+ enclosures=`lsscsi -g | \
+ sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p'`
+ for enclosure in $enclosures; do
+ set -- $(sg_ses -p aes $enclosure | \
+ awk "/device slot number:/{slot=\$12} \
+ /SAS address: $sas_address/\
+ {print slot}")
+ SLOT=$1
+ if [ -n "$SLOT" ] ; then
+ break
+ fi
+ done
+ ;;
+ esac
+ if [ -z "$SLOT" ] ; then
+ return
+ fi
+
+ CHAN=`map_channel $PCI_ID $PORT`
+ SLOT=`map_slot $SLOT $CHAN`
+ if [ -z "$CHAN" ] ; then
+ return
+ fi
+ echo ${CHAN}${SLOT}${PART}
+}
+
+scsi_handler() {
+ if [ -z "$FIRST_BAY_NUMBER" ] ; then
+ FIRST_BAY_NUMBER=`awk "\\$1 == \"first_bay_number\" \
+ {print \\$2; exit}" $CONFIG`
+ fi
+ FIRST_BAY_NUMBER=${FIRST_BAY_NUMBER:-0}
+
+ if [ -z "$PHYS_PER_PORT" ] ; then
+ PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \
+ {print \\$2; exit}" $CONFIG`
+ fi
+ PHYS_PER_PORT=${PHYS_PER_PORT:-4}
+ if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then
+ echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric"
+ exit 1
+ fi
+
+ if [ -z "$MULTIPATH_MODE" ] ; then
+ MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \
+ {print \\$2; exit}" $CONFIG`
+ fi
+
+ # Use first running component device if we're handling a dm-mpath device
+ if [ "$MULTIPATH_MODE" = "yes" ] ; then
+ # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper
+ if [ -z "$DM_NAME" ] ; then
+ DM_NAME=`ls -l --full-time /dev/mapper |
+ awk "/\/$DEV$/{print \\$9}"`
+ fi
+
+ # For raw disks udev exports DEVTYPE=partition when
+ # handling partitions, and the rules can be written to
+ # take advantage of this to append a -part suffix. For
+ # dm devices we get DEVTYPE=disk even for partitions so
+ # we have to append the -part suffix directly in the
+ # helper.
+ if [ "$DEVTYPE" != "partition" ] ; then
+ PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'`
+ fi
+
+ # Strip off partition information.
+ DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'`
+ if [ -z "$DM_NAME" ] ; then
+ return
+ fi
+
+ # Get the raw scsi device name from multipath -ll. Strip off
+ # leading pipe symbols to make field numbering consistent.
+ DEV=`multipath -ll $DM_NAME |
+ awk '/running/{gsub("^[|]"," "); print $3 ; exit}'`
+ if [ -z "$DEV" ] ; then
+ return
+ fi
+ fi
+
+ if echo $DEV | grep -q ^/devices/ ; then
+ sys_path=$DEV
+ else
+ sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null`
+ fi
+
+ # expect sys_path like this, for example:
+ # /devices/pci0000:00/0000:00:0b.0/0000:09:00.0/0000:0a:05.0/0000:0c:00.0/host3/target3:1:0/3:1:0:21/block/sdv
+
+ # Use positional parameters as an ad-hoc array
+ set -- $(echo "$sys_path" | tr / ' ')
+ num_dirs=$#
+ scsi_host_dir="/sys"
+
+ # Get path up to /sys/.../hostX
+ i=1
+ while [ $i -le $num_dirs ] ; do
+ d=$(eval echo \${$i})
+ scsi_host_dir="$scsi_host_dir/$d"
+ echo $d | grep -q -E '^host[0-9]+$' && break
+ i=$(($i + 1))
+ done
+
+ if [ $i = $num_dirs ] ; then
+ return
+ fi
+
+ PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}')
+
+ # In scsi mode, the directory two levels beneath
+ # /sys/.../hostX reveals the port and slot.
+ port_dir=$scsi_host_dir
+ j=$(($i + 2))
+
+ i=$(($i + 1))
+ while [ $i -le $j ] ; do
+ port_dir="$port_dir/$(eval echo \${$i})"
+ i=$(($i + 1))
+ done
+
+ set -- $(echo $port_dir | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/')
+ PORT=$1
+ SLOT=$(($2 + $FIRST_BAY_NUMBER))
+
+ if [ -z "$SLOT" ] ; then
+ return
+ fi
+
+ CHAN=`map_channel $PCI_ID $PORT`
+ SLOT=`map_slot $SLOT $CHAN`
+ if [ -z "$CHAN" ] ; then
+ return
+ fi
+ echo ${CHAN}${SLOT}${PART}
+}
+
+# Figure out the name for the enclosure symlink
+enclosure_handler () {
+ # We get all the info we need from udev's DEVPATH variable:
+ #
+ # DEVPATH=/sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/subsystem/devices/0:0:0:0/scsi_generic/sg0
+
+ # Get the enclosure ID ("0:0:0:0")
+ ENC=$(basename $(readlink -m "/sys/$DEVPATH/../.."))
+ if [ ! -d /sys/class/enclosure/$ENC ] ; then
+ # Not an enclosure, bail out
+ return
+ fi
+
+ # Get the long sysfs device path to our enclosure. Looks like:
+ # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0/ ... /enclosure/0:0:0:0
+
+ ENC_DEVICE=$(readlink /sys/class/enclosure/$ENC)
+
+ # Grab the full path to the hosts port dir:
+ # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0
+ PORT_DIR=$(echo $ENC_DEVICE | grep -Eo '.+host[0-9]+/port-[0-9]+:[0-9]+')
+
+ # Get the port number
+ PORT_ID=$(echo $PORT_DIR | grep -Eo "[0-9]+$")
+
+ # The PCI directory is two directories up from the port directory
+ # /sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0
+ PCI_ID_LONG=$(basename $(readlink -m "/sys/$PORT_DIR/../.."))
+
+ # Strip down the PCI address from 0000:05:00.0 to 05:00.0
+ PCI_ID=$(echo "$PCI_ID_LONG" | sed -r 's/^[0-9]+://g')
+
+ # Name our device according to vdev_id.conf (like "L0" or "U1").
+ NAME=$(awk "/channel/{if (\$1 == \"channel\" && \$2 == \"$PCI_ID\" && \
+ \$3 == \"$PORT_ID\") {print \$4int(count[\$4])}; count[\$4]++}" $CONFIG)
+
+ echo "${NAME}"
+}
+
+alias_handler () {
+ # Special handling is needed to correctly append a -part suffix
+ # to partitions of device mapper devices. The DEVTYPE attribute
+ # is normally set to "disk" instead of "partition" in this case,
+ # so the udev rules won't handle that for us as they do for
+ # "plain" block devices.
+ #
+ # For example, we may have the following links for a device and its
+ # partitions,
+ #
+ # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0 -> ../../dm-0
+ # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p1 -> ../../dm-1
+ # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p2 -> ../../dm-3
+ #
+ # and the following alias in vdev_id.conf.
+ #
+ # alias A0 dm-name-isw_dibgbfcije_ARRAY0
+ #
+ # The desired outcome is for the following links to be created
+ # without having explicitly defined aliases for the partitions.
+ #
+ # /dev/disk/by-vdev/A0 -> ../../dm-0
+ # /dev/disk/by-vdev/A0-part1 -> ../../dm-1
+ # /dev/disk/by-vdev/A0-part2 -> ../../dm-3
+ #
+ # Warning: The following grep pattern will misidentify whole-disk
+ # devices whose names end with 'p' followed by a string of
+ # digits as partitions, causing alias creation to fail. This
+ # ambiguity seems unavoidable, so devices using this facility
+ # must not use such names.
+ DM_PART=
+ if echo $DM_NAME | grep -q -E 'p[0-9][0-9]*$' ; then
+ if [ "$DEVTYPE" != "partition" ] ; then
+ DM_PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'`
+ fi
+ fi
+
+ # DEVLINKS attribute must have been populated by already-run udev rules.
+ for link in $DEVLINKS ; do
+ # Remove partition information to match key of top-level device.
+ if [ -n "$DM_PART" ] ; then
+ link=`echo $link | sed 's/p[0-9][0-9]*$//'`
+ fi
+ # Check both the fully qualified and the base name of link.
+ for l in $link `basename $link` ; do
+ alias=`awk "\\$1 == \"alias\" && \\$3 == \"${l}\" \
+ { print \\$2; exit }" $CONFIG`
+ if [ -n "$alias" ] ; then
+ echo ${alias}${DM_PART}
+ return
+ fi
+ done
+ done
+}
+
+while getopts 'c:d:eg:mp:h' OPTION; do
+ case ${OPTION} in
+ c)
+ CONFIG=${OPTARG}
+ ;;
+ d)
+ DEV=${OPTARG}
+ ;;
+ e)
+ # When udev sees a scsi_generic device, it calls this script with -e to
+ # create the enclosure device symlinks only. We also need
+ # "enclosure_symlinks yes" set in vdev_id.config to actually create the
+ # symlink.
+ ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") print $2}' $CONFIG)
+ if [ "$ENCLOSURE_MODE" != "yes" ] ; then
+ exit 0
+ fi
+ ;;
+ g)
+ TOPOLOGY=$OPTARG
+ ;;
+ p)
+ PHYS_PER_PORT=${OPTARG}
+ ;;
+ m)
+ MULTIPATH_MODE=yes
+ ;;
+ h)
+ usage
+ ;;
+ esac
+done
+
+if [ ! -r $CONFIG ] ; then
+ exit 0
+fi
+
+if [ -z "$DEV" ] && [ -z "$ENCLOSURE_MODE" ] ; then
+ echo "Error: missing required option -d"
+ exit 1
+fi
+
+if [ -z "$TOPOLOGY" ] ; then
+ TOPOLOGY=`awk "\\$1 == \"topology\" {print \\$2; exit}" $CONFIG`
+fi
+
+if [ -z "$BAY" ] ; then
+ BAY=`awk "\\$1 == \"slot\" {print \\$2; exit}" $CONFIG`
+fi
+
+TOPOLOGY=${TOPOLOGY:-sas_direct}
+
+# Should we create /dev/by-enclosure symlinks?
+if [ "$ENCLOSURE_MODE" = "yes" ] && [ "$TOPOLOGY" = "sas_direct" ] ; then
+ ID_ENCLOSURE=$(enclosure_handler)
+ if [ -z "$ID_ENCLOSURE" ] ; then
+ exit 0
+ fi
+
+ # Just create the symlinks to the enclosure devices and then exit.
+ ENCLOSURE_PREFIX=$(awk '/enclosure_symlinks_prefix/{print $2}' $CONFIG)
+ if [ -z "$ENCLOSURE_PREFIX" ] ; then
+ ENCLOSURE_PREFIX="enc"
+ fi
+ echo "ID_ENCLOSURE=$ID_ENCLOSURE"
+ echo "ID_ENCLOSURE_PATH=by-enclosure/$ENCLOSURE_PREFIX-$ID_ENCLOSURE"
+ exit 0
+fi
+
+# First check if an alias was defined for this device.
+ID_VDEV=`alias_handler`
+
+if [ -z "$ID_VDEV" ] ; then
+ BAY=${BAY:-bay}
+ case $TOPOLOGY in
+ sas_direct|sas_switch)
+ ID_VDEV=`sas_handler`
+ ;;
+ scsi)
+ ID_VDEV=`scsi_handler`
+ ;;
+ *)
+ echo "Error: unknown topology $TOPOLOGY"
+ exit 1
+ ;;
+ esac
+fi
+
+if [ -n "$ID_VDEV" ] ; then
+ echo "ID_VDEV=${ID_VDEV}"
+ echo "ID_VDEV_PATH=disk/by-vdev/${ID_VDEV}"
+fi
diff --git a/sys/contrib/openzfs/cmd/zdb/.gitignore b/sys/contrib/openzfs/cmd/zdb/.gitignore
new file mode 100644
index 000000000000..f64a3fc5a160
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zdb/.gitignore
@@ -0,0 +1 @@
+/zdb
diff --git a/sys/contrib/openzfs/cmd/zdb/Makefile.am b/sys/contrib/openzfs/cmd/zdb/Makefile.am
new file mode 100644
index 000000000000..b325cb060bd2
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zdb/Makefile.am
@@ -0,0 +1,16 @@
+include $(top_srcdir)/config/Rules.am
+
+# Unconditionally enable debugging for zdb
+AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG
+
+sbin_PROGRAMS = zdb
+
+zdb_SOURCES = \
+ zdb.c \
+ zdb_il.c \
+ zdb.h
+
+zdb_LDADD = \
+ $(abs_top_builddir)/lib/libzpool/libzpool.la \
+ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+ $(abs_top_builddir)/lib/libnvpair/libnvpair.la
diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c
new file mode 100644
index 000000000000..e7211711a41c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zdb/zdb.c
@@ -0,0 +1,8606 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc.
+ * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015, 2017, Intel Corporation.
+ * Copyright (c) 2020 Datto Inc.
+ * Copyright (c) 2020, The FreeBSD Foundation [1]
+ *
+ * [1] Portions of this software were developed by Allan Jude
+ * under sponsorship from the FreeBSD Foundation.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dbuf.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_traverse.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/zfs_fuid.h>
+#include <sys/arc.h>
+#include <sys/arc_impl.h>
+#include <sys/ddt.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#include <sys/blkptr.h>
+#include <sys/dsl_crypt.h>
+#include <sys/dsl_scan.h>
+#include <sys/btree.h>
+#include <zfs_comutil.h>
+#include <sys/zstd/zstd.h>
+
+#include <libnvpair.h>
+#include <libzutil.h>
+
+#include "zdb.h"
+
+#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \
+ zio_compress_table[(idx)].ci_name : "UNKNOWN")
+#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \
+ zio_checksum_table[(idx)].ci_name : "UNKNOWN")
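+/*
+ * ZDB_OT_TYPE() collapses the DMU_OTN_* encodings into the generic
+ * DMU_OT_ZAP_OTHER / DMU_OT_UINT64_OTHER buckets and maps anything
+ * unrecognized to DMU_OT_NUMTYPES, yielding a small fixed range of
+ * type indices.
+ */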
+#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \
+ (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \
+ DMU_OT_ZAP_OTHER : \
+ (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \
+ DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES)
+
+static char *
+zdb_ot_name(dmu_object_type_t type)
+{
+ if (type < DMU_OT_NUMTYPES)
+ return (dmu_ot[type].ot_name);
+ else if ((type & DMU_OT_NEWTYPE) &&
+ ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS))
+ return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name);
+ else
+ return ("UNKNOWN");
+}
+
+extern int reference_tracking_enable;
+extern int zfs_recover;
+extern unsigned long zfs_arc_meta_min, zfs_arc_meta_limit;
+extern int zfs_vdev_async_read_max_active;
+extern boolean_t spa_load_verify_dryrun;
+extern int zfs_reconstruct_indirect_combinations_max;
+extern int zfs_btree_verify_intensity;
+
+static const char cmdname[] = "zdb";
+uint8_t dump_opt[256];
+
+typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
+
+uint64_t *zopt_metaslab = NULL;
+static unsigned zopt_metaslab_args = 0;
+
+typedef struct zopt_object_range {
+ uint64_t zor_obj_start;
+ uint64_t zor_obj_end;
+ uint64_t zor_flags;
+} zopt_object_range_t;
+zopt_object_range_t *zopt_object_ranges = NULL;
+static unsigned zopt_object_args = 0;
+
+static int flagbits[256];
+
+#define ZOR_FLAG_PLAIN_FILE 0x0001
+#define ZOR_FLAG_DIRECTORY 0x0002
+#define ZOR_FLAG_SPACE_MAP 0x0004
+#define ZOR_FLAG_ZAP 0x0008
+#define ZOR_FLAG_ALL_TYPES -1
+#define ZOR_SUPPORTED_FLAGS (ZOR_FLAG_PLAIN_FILE | \
+ ZOR_FLAG_DIRECTORY | \
+ ZOR_FLAG_SPACE_MAP | \
+ ZOR_FLAG_ZAP)
+
+#define ZDB_FLAG_CHECKSUM 0x0001
+#define ZDB_FLAG_DECOMPRESS 0x0002
+#define ZDB_FLAG_BSWAP 0x0004
+#define ZDB_FLAG_GBH 0x0008
+#define ZDB_FLAG_INDIRECT 0x0010
+#define ZDB_FLAG_RAW 0x0020
+#define ZDB_FLAG_PRINT_BLKPTR 0x0040
+#define ZDB_FLAG_VERBOSE 0x0080
+
+uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
+static int leaked_objects = 0;
+static range_tree_t *mos_refd_objs;
+
+static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
+ boolean_t);
+static void mos_obj_refd(uint64_t);
+static void mos_obj_refd_multiple(uint64_t);
+static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
+ dmu_tx_t *tx);
+
+typedef struct sublivelist_verify {
+ /* all ALLOC'd blkptr_t in one sub-livelist */
+ zfs_btree_t sv_all_allocs;
+
+ /* all FREE'd blkptr_t in one sub-livelist */
+ zfs_btree_t sv_all_frees;
+
+ /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */
+ zfs_btree_t sv_pair;
+
+ /* ALLOC's without a matching FREE, accumulates across sub-livelists */
+ zfs_btree_t sv_leftover;
+} sublivelist_verify_t;
+
+static int
+livelist_compare(const void *larg, const void *rarg)
+{
+ const blkptr_t *l = larg;
+ const blkptr_t *r = rarg;
+
+ /* Sort them according to dva[0] */
+ uint64_t l_dva0_vdev, r_dva0_vdev;
+ l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
+ r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
+ if (l_dva0_vdev < r_dva0_vdev)
+ return (-1);
+ else if (l_dva0_vdev > r_dva0_vdev)
+ return (+1);
+
+ /* if vdevs are equal, sort by offsets. */
+ uint64_t l_dva0_offset;
+ uint64_t r_dva0_offset;
+ l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
+ r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
+ if (l_dva0_offset < r_dva0_offset) {
+ return (-1);
+ } else if (l_dva0_offset > r_dva0_offset) {
+ return (+1);
+ }
+
+ /*
+ * Since we're storing blkptrs without cancelling FREE/ALLOC pairs,
+ * it's possible the offsets are equal. In that case, sort by txg
+ */
+ if (l->blk_birth < r->blk_birth) {
+ return (-1);
+ } else if (l->blk_birth > r->blk_birth) {
+ return (+1);
+ }
+ return (0);
+}
+
+typedef struct sublivelist_verify_block {
+ dva_t svb_dva;
+
+ /*
+ * We need this to check if the block marked as allocated
+ * in the livelist was freed (and potentially reallocated)
+ * in the metaslab spacemaps at a later TXG.
+ */
+ uint64_t svb_allocated_txg;
+} sublivelist_verify_block_t;
+
+static void zdb_print_blkptr(const blkptr_t *bp, int flags);
+
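+/*
+ * Examine one blkptr from a sub-livelist.  FREEs are queued in sv_pair
+ * and recorded in sv_all_frees (duplicates are reported); ALLOCs either
+ * cancel a matching queued FREE or have their DVAs added to sv_leftover,
+ * and duplicate ALLOCs are reported as well.
+ */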
+static int
+sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
+ dmu_tx_t *tx)
+{
+ ASSERT3P(tx, ==, NULL);
+ struct sublivelist_verify *sv = arg;
+ char blkbuf[BP_SPRINTF_LEN];
+ zfs_btree_index_t where;
+ if (free) {
+ zfs_btree_add(&sv->sv_pair, bp);
+ /* Check if the FREE is a duplicate */
+ if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) {
+ snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
+ free);
+ (void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf);
+ } else {
+ zfs_btree_add_idx(&sv->sv_all_frees, bp, &where);
+ }
+ } else {
+ /* Check if the ALLOC has been freed */
+ if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) {
+ zfs_btree_remove_idx(&sv->sv_pair, &where);
+ } else {
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ if (DVA_IS_EMPTY(&bp->blk_dva[i]))
+ break;
+ sublivelist_verify_block_t svb = {
+ .svb_dva = bp->blk_dva[i],
+ .svb_allocated_txg = bp->blk_birth
+ };
+
+ if (zfs_btree_find(&sv->sv_leftover, &svb,
+ &where) == NULL) {
+ zfs_btree_add_idx(&sv->sv_leftover,
+ &svb, &where);
+ }
+ }
+ }
+ /* Check if the ALLOC is a duplicate */
+ if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) {
+ snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
+ free);
+ (void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf);
+ } else {
+ zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where);
+ }
+ }
+ return (0);
+}
+
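+/*
+ * Verify a single sub-livelist: set up the tracking btrees, walk its
+ * bpobj with sublivelist_verify_blkptr(), and report any FREEs that
+ * never matched an ALLOC.
+ */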
+static int
+sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
+{
+ int err;
+ char blkbuf[BP_SPRINTF_LEN];
+ struct sublivelist_verify *sv = args;
+
+ zfs_btree_create(&sv->sv_all_allocs, livelist_compare,
+ sizeof (blkptr_t));
+
+ zfs_btree_create(&sv->sv_all_frees, livelist_compare,
+ sizeof (blkptr_t));
+
+ zfs_btree_create(&sv->sv_pair, livelist_compare,
+ sizeof (blkptr_t));
+
+ err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
+ sv, NULL);
+
+ zfs_btree_clear(&sv->sv_all_allocs);
+ zfs_btree_destroy(&sv->sv_all_allocs);
+
+ zfs_btree_clear(&sv->sv_all_frees);
+ zfs_btree_destroy(&sv->sv_all_frees);
+
+ blkptr_t *e;
+ zfs_btree_index_t *cookie = NULL;
+ while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
+ snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE);
+ (void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf);
+ }
+ zfs_btree_destroy(&sv->sv_pair);
+
+ return (err);
+}
+
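+/*
+ * Comparator for sublivelist_verify_block_t entries: order by the DVA's
+ * vdev, then offset, then asize.
+ */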
+static int
+livelist_block_compare(const void *larg, const void *rarg)
+{
+ const sublivelist_verify_block_t *l = larg;
+ const sublivelist_verify_block_t *r = rarg;
+
+ if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva))
+ return (-1);
+ else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva))
+ return (+1);
+
+ if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva))
+ return (-1);
+ else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva))
+ return (+1);
+
+ if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva))
+ return (-1);
+ else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva))
+ return (+1);
+
+ return (0);
+}
+
+/*
+ * Check for errors in a livelist while tracking all unfreed ALLOCs in the
+ * sublivelist_verify_t: sv->sv_leftover
+ */
+static void
+livelist_verify(dsl_deadlist_t *dl, void *arg)
+{
+ sublivelist_verify_t *sv = arg;
+ dsl_deadlist_iterate(dl, sublivelist_verify_func, sv);
+}
+
+/*
+ * Check for errors in the livelist entry and discard the intermediary
+ * data structures
+ */
+/* ARGSUSED */
+static int
+sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
+{
+ sublivelist_verify_t sv;
+ zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
+ sizeof (sublivelist_verify_block_t));
+ int err = sublivelist_verify_func(&sv, dle);
+ zfs_btree_clear(&sv.sv_leftover);
+ zfs_btree_destroy(&sv.sv_leftover);
+ return (err);
+}
+
+typedef struct metaslab_verify {
+ /*
+ * Tree containing all the leftover ALLOCs from the livelists
+ * that are part of this metaslab.
+ */
+ zfs_btree_t mv_livelist_allocs;
+
+ /*
+ * Metaslab information.
+ */
+ uint64_t mv_vdid;
+ uint64_t mv_msid;
+ uint64_t mv_start;
+ uint64_t mv_end;
+
+ /*
+ * What's currently allocated for this metaslab.
+ */
+ range_tree_t *mv_allocated;
+} metaslab_verify_t;
+
+typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);
+
+typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,
+ void *arg);
+
+typedef struct unflushed_iter_cb_arg {
+ spa_t *uic_spa;
+ uint64_t uic_txg;
+ void *uic_arg;
+ zdb_log_sm_cb_t uic_cb;
+} unflushed_iter_cb_arg_t;
+
+static int
+iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
+{
+ unflushed_iter_cb_arg_t *uic = arg;
+ return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
+}
+
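+/*
+ * Walk every log space map in the pool (in txg order, under the config
+ * lock) and invoke the callback on each entry along with the log's txg.
+ */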
+static void
+iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
+{
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+ sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+ space_map_t *sm = NULL;
+ VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
+ sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
+
+ unflushed_iter_cb_arg_t uic = {
+ .uic_spa = spa,
+ .uic_txg = sls->sls_txg,
+ .uic_arg = arg,
+ .uic_cb = cb
+ };
+ VERIFY0(space_map_iterate(sm, space_map_length(sm),
+ iterate_through_spacemap_logs_cb, &uic));
+ space_map_close(sm);
+ }
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
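+/*
+ * Given a FREE space map entry for this metaslab's vdev, report any
+ * livelist ALLOCs whose DVA falls within [offset, offset + size) and
+ * whose allocation txg is at or before the txg of the free, i.e. blocks
+ * the livelist still considers allocated but the spacemaps have freed.
+ */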
+static void
+verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
+ uint64_t offset, uint64_t size)
+{
+ sublivelist_verify_block_t svb;
+ DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);
+ DVA_SET_OFFSET(&svb.svb_dva, offset);
+ DVA_SET_ASIZE(&svb.svb_dva, size);
+ zfs_btree_index_t where;
+ uint64_t end_offset = offset + size;
+
+ /*
+ * Look for an exact match for the spacemap entry in the livelist
+ * entries.  Then look for other livelist entries that fall within the
+ * range of the spacemap entry, as it may have been condensed.
+ */
+ sublivelist_verify_block_t *found =
+ zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where);
+ if (found == NULL) {
+ found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
+ }
+ for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid &&
+ DVA_GET_OFFSET(&found->svb_dva) < end_offset;
+ found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
+ if (found->svb_allocated_txg <= txg) {
+ (void) printf("ERROR: Livelist ALLOC [%llx:%llx] "
+ "from TXG %llx FREED at TXG %llx\n",
+ (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva),
+ (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),
+ (u_longlong_t)found->svb_allocated_txg,
+ (u_longlong_t)txg);
+ }
+ }
+}
+
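+/*
+ * Replay one space map entry into mv->mv_allocated, reporting double
+ * ALLOCs and double FREEs.  FREEs are additionally cross-checked
+ * against the livelist ALLOCs via verify_livelist_allocs().
+ */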
+static int
+metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
+{
+ metaslab_verify_t *mv = arg;
+ uint64_t offset = sme->sme_offset;
+ uint64_t size = sme->sme_run;
+ uint64_t txg = sme->sme_txg;
+
+ if (sme->sme_type == SM_ALLOC) {
+ if (range_tree_contains(mv->mv_allocated,
+ offset, size)) {
+ (void) printf("ERROR: DOUBLE ALLOC: "
+ "%llu [%llx:%llx] "
+ "%llu:%llu LOG_SM\n",
+ (u_longlong_t)txg, (u_longlong_t)offset,
+ (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
+ (u_longlong_t)mv->mv_msid);
+ } else {
+ range_tree_add(mv->mv_allocated,
+ offset, size);
+ }
+ } else {
+ if (!range_tree_contains(mv->mv_allocated,
+ offset, size)) {
+ (void) printf("ERROR: DOUBLE FREE: "
+ "%llu [%llx:%llx] "
+ "%llu:%llu LOG_SM\n",
+ (u_longlong_t)txg, (u_longlong_t)offset,
+ (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
+ (u_longlong_t)mv->mv_msid);
+ } else {
+ range_tree_remove(mv->mv_allocated,
+ offset, size);
+ }
+ }
+
+ if (sme->sme_type != SM_ALLOC) {
+ /*
+ * If something is freed in the spacemap, verify that
+ * it is not listed as allocated in the livelist.
+ */
+ verify_livelist_allocs(mv, txg, offset, size);
+ }
+ return (0);
+}
+
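+/*
+ * Log space map callback: ignore entries for indirect vdevs, for other
+ * vdevs or metaslabs, and for txgs that were already flushed into the
+ * metaslab's space map, then validate the rest.
+ */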
+static int
+spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
+ uint64_t txg, void *arg)
+{
+ metaslab_verify_t *mv = arg;
+ uint64_t offset = sme->sme_offset;
+ uint64_t vdev_id = sme->sme_vdev;
+
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+
+ /* skip indirect vdevs */
+ if (!vdev_is_concrete(vd))
+ return (0);
+
+ if (vdev_id != mv->mv_vdid)
+ return (0);
+
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ if (ms->ms_id != mv->mv_msid)
+ return (0);
+
+ if (txg < metaslab_unflushed_txg(ms))
+ return (0);
+
+ ASSERT3U(txg, ==, sme->sme_txg);
+ return (metaslab_spacemap_validation_cb(sme, mv));
+}
+
+static void
+spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
+{
+ iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv);
+}
+
+static void
+spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv)
+{
+ if (sm == NULL)
+ return;
+
+ VERIFY0(space_map_iterate(sm, space_map_length(sm),
+ metaslab_spacemap_validation_cb, mv));
+}
+
+static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);
+
+/*
+ * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if
+ * they are part of that metaslab (mv_msid).
+ */
+static void
+mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
+{
+ zfs_btree_index_t where;
+ sublivelist_verify_block_t *svb;
+ ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);
+ for (svb = zfs_btree_first(&sv->sv_leftover, &where);
+ svb != NULL;
+ svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) {
+ if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid)
+ continue;
+
+ if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start &&
+ (DVA_GET_OFFSET(&svb->svb_dva) +
+ DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) {
+ (void) printf("ERROR: Found block that crosses "
+ "metaslab boundary: <%llu:%llx:%llx>\n",
+ (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
+ (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
+ (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
+ continue;
+ }
+
+ if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start)
+ continue;
+
+ if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end)
+ continue;
+
+ if ((DVA_GET_OFFSET(&svb->svb_dva) +
+ DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) {
+ (void) printf("ERROR: Found block that crosses "
+ "metaslab boundary: <%llu:%llx:%llx>\n",
+ (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
+ (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
+ (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
+ continue;
+ }
+
+ zfs_btree_add(&mv->mv_livelist_allocs, svb);
+ }
+
+ for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);
+ svb != NULL;
+ svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
+ zfs_btree_remove(&sv->sv_leftover, svb);
+ }
+}
+
+/*
+ * [Livelist Check]
+ * Iterate through all the sublivelists and:
+ * - report leftover frees
+ * - report double ALLOCs/FREEs
+ * - record leftover ALLOCs together with their TXG [see Cross Check]
+ *
+ * [Spacemap Check]
+ * for each metaslab:
+ * - iterate over spacemap and then the metaslab's entries in the
+ * spacemap log, then report any double FREEs and ALLOCs (do not
+ * blow up).
+ *
+ * [Cross Check]
+ * After finishing the Livelist Check phase and while being in the
+ * Spacemap Check phase, we find all the recorded leftover ALLOCs
+ * of the livelist check that are part of the metaslab that we are
+ * currently looking at in the Spacemap Check. We report any entries
+ * that are marked as ALLOCs in the livelists but have been actually
+ * freed (and potentially allocated again) after their TXG stamp in
+ * the spacemaps. Also report any ALLOCs from the livelists that
+ * belong to indirect vdevs (e.g. their vdev completed removal).
+ *
+ * Note that this will miss Log Spacemap entries that cancelled each other
+ * out before being flushed to the metaslab, so we are not guaranteed
+ * to match all erroneous ALLOCs.
+ */
+static void
+livelist_metaslab_validate(spa_t *spa)
+{
+ (void) printf("Verifying deleted livelist entries\n");
+
+ sublivelist_verify_t sv;
+ zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
+ sizeof (sublivelist_verify_block_t));
+ iterate_deleted_livelists(spa, livelist_verify, &sv);
+
+ (void) printf("Verifying metaslab entries\n");
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ if (!vdev_is_concrete(vd))
+ continue;
+
+ for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {
+ metaslab_t *m = vd->vdev_ms[mid];
+
+ (void) fprintf(stderr,
+ "\rverifying concrete vdev %llu, "
+ "metaslab %llu of %llu ...",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)mid,
+ (longlong_t)vd->vdev_ms_count);
+
+ uint64_t shift, start;
+ range_seg_type_t type =
+ metaslab_calculate_range_tree_type(vd, m,
+ &start, &shift);
+ metaslab_verify_t mv;
+ mv.mv_allocated = range_tree_create(NULL,
+ type, NULL, start, shift);
+ mv.mv_vdid = vd->vdev_id;
+ mv.mv_msid = m->ms_id;
+ mv.mv_start = m->ms_start;
+ mv.mv_end = m->ms_start + m->ms_size;
+ zfs_btree_create(&mv.mv_livelist_allocs,
+ livelist_block_compare,
+ sizeof (sublivelist_verify_block_t));
+
+ mv_populate_livelist_allocs(&mv, &sv);
+
+ spacemap_check_ms_sm(m->ms_sm, &mv);
+ spacemap_check_sm_log(spa, &mv);
+
+ range_tree_vacate(mv.mv_allocated, NULL, NULL);
+ range_tree_destroy(mv.mv_allocated);
+ zfs_btree_clear(&mv.mv_livelist_allocs);
+ zfs_btree_destroy(&mv.mv_livelist_allocs);
+ }
+ }
+ (void) fprintf(stderr, "\n");
+
+ /*
+ * If there are any segments in the leftover tree after we walked
+ * through all the metaslabs in the concrete vdevs then this means
+ * that we have segments in the livelists that belong to indirect
+ * vdevs and are marked as allocated.
+ */
+ if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {
+ zfs_btree_destroy(&sv.sv_leftover);
+ return;
+ }
+ (void) printf("ERROR: Found livelist blocks marked as allocated "
+ "for indirect vdevs:\n");
+
+ zfs_btree_index_t *where = NULL;
+ sublivelist_verify_block_t *svb;
+ while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=
+ NULL) {
+ int vdev_id = DVA_GET_VDEV(&svb->svb_dva);
+ ASSERT3U(vdev_id, <, rvd->vdev_children);
+ vdev_t *vd = rvd->vdev_child[vdev_id];
+ ASSERT(!vdev_is_concrete(vd));
+ (void) printf("<%d:%llx:%llx> TXG %llx\n",
+ vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
+ (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),
+ (u_longlong_t)svb->svb_allocated_txg);
+ }
+ (void) printf("\n");
+ zfs_btree_destroy(&sv.sv_leftover);
+}
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ */
+const char *
+_umem_debug_init(void)
+{
+ return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+ return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+
+static void
+usage(void)
+{
+ (void) fprintf(stderr,
+ "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
+ "[-I <inflight I/Os>]\n"
+ "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
+ "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
+ "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
+ "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]\n"
+ "\t%s [-v] <bookmark>\n"
+ "\t%s -C [-A] [-U <cache>]\n"
+ "\t%s -l [-Aqu] <device>\n"
+ "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
+ "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
+ "\t%s -O <dataset> <path>\n"
+ "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
+ "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
+ "\t%s -E [-A] word0:word1:...:word15\n"
+ "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
+ "<poolname>\n\n",
+ cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
+ cmdname, cmdname, cmdname);
+
+ (void) fprintf(stderr, " Dataset name must include at least one "
+ "separator character '/' or '@'\n");
+ (void) fprintf(stderr, " If dataset name is specified, only that "
+ "dataset is dumped\n");
+ (void) fprintf(stderr, " If object numbers or object number "
+ "ranges are specified, only those\n"
+ " objects or ranges are dumped.\n\n");
+ (void) fprintf(stderr,
+ " Object ranges take the form <start>:<end>[:<flags>]\n"
+ " start Starting object number\n"
+ " end Ending object number, or -1 for no upper bound\n"
+ " flags Optional flags to select object types:\n"
+ " A All objects (this is the default)\n"
+ " d ZFS directories\n"
+ " f ZFS files \n"
+ " m SPA space maps\n"
+ " z ZAPs\n"
+ " - Negate effect of next flag\n\n");
+ (void) fprintf(stderr, " Options to control amount of output:\n");
+ (void) fprintf(stderr, " -b block statistics\n");
+ (void) fprintf(stderr, " -c checksum all metadata (twice for "
+ "all data) blocks\n");
+ (void) fprintf(stderr, " -C config (or cachefile if alone)\n");
+ (void) fprintf(stderr, " -d dataset(s)\n");
+ (void) fprintf(stderr, " -D dedup statistics\n");
+ (void) fprintf(stderr, " -E decode and display block from an "
+ "embedded block pointer\n");
+ (void) fprintf(stderr, " -h pool history\n");
+ (void) fprintf(stderr, " -i intent logs\n");
+ (void) fprintf(stderr, " -l read label contents\n");
+ (void) fprintf(stderr, " -k examine the checkpointed state "
+ "of the pool\n");
+ (void) fprintf(stderr, " -L disable leak tracking (do not "
+ "load spacemaps)\n");
+ (void) fprintf(stderr, " -m metaslabs\n");
+ (void) fprintf(stderr, " -M metaslab groups\n");
+ (void) fprintf(stderr, " -O perform object lookups by path\n");
+ (void) fprintf(stderr, " -R read and display block from a "
+ "device\n");
+ (void) fprintf(stderr, " -s report stats on zdb's I/O\n");
+ (void) fprintf(stderr, " -S simulate dedup to measure effect\n");
+ (void) fprintf(stderr, " -v verbose (applies to all "
+ "others)\n");
+ (void) fprintf(stderr, " -y perform livelist and metaslab "
+ "validation on any livelists being deleted\n\n");
+ (void) fprintf(stderr, " Below options are intended for use "
+ "with other options:\n");
+ (void) fprintf(stderr, " -A ignore assertions (-A), enable "
+ "panic recovery (-AA) or both (-AAA)\n");
+ (void) fprintf(stderr, " -e pool is exported/destroyed/"
+ "has altroot/not in a cachefile\n");
+ (void) fprintf(stderr, " -F attempt automatic rewind within "
+ "safe range of transaction groups\n");
+ (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before "
+ "exiting\n");
+ (void) fprintf(stderr, " -I <number of inflight I/Os> -- "
+ "specify the maximum number of\n "
+ "checksumming I/Os [default is 200]\n");
+ (void) fprintf(stderr, " -o <variable>=<value> set global "
+ "variable to an unsigned 32-bit integer\n");
+ (void) fprintf(stderr, " -p <path> -- use one or more with "
+ "-e to specify path to vdev dir\n");
+ (void) fprintf(stderr, " -P print numbers in parseable form\n");
+ (void) fprintf(stderr, " -q don't print label contents\n");
+ (void) fprintf(stderr, " -t <txg> -- highest txg to use when "
+ "searching for uberblocks\n");
+ (void) fprintf(stderr, " -u uberblock\n");
+ (void) fprintf(stderr, " -U <cachefile_path> -- use alternate "
+ "cachefile\n");
+ (void) fprintf(stderr, " -V do verbatim import\n");
+ (void) fprintf(stderr, " -x <dumpdir> -- "
+ "dump all read blocks into specified directory\n");
+ (void) fprintf(stderr, " -X attempt extreme rewind (does not "
+ "work with dataset)\n");
+ (void) fprintf(stderr, " -Y attempt all reconstruction "
+ "combinations for split blocks\n");
+ (void) fprintf(stderr, " -Z show ZSTD headers \n");
+ (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
+ "to make only that option verbose\n");
+ (void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
+ exit(1);
+}
+
+static void
+dump_debug_buffer(void)
+{
+ if (dump_opt['G']) {
+ (void) printf("\n");
+ (void) fflush(stdout);
+ zfs_dbgmsg_print("zdb");
+ }
+}
+
+/*
+ * Called for usage errors that are discovered after a call to spa_open(),
+ * dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
+ */
+
+static void
+fatal(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ (void) fprintf(stderr, "%s: ", cmdname);
+ (void) vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ (void) fprintf(stderr, "\n");
+
+ dump_debug_buffer();
+
+ exit(1);
+}
+
+/* ARGSUSED */
+static void
+dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ nvlist_t *nv;
+ size_t nvsize = *(uint64_t *)data;
+ char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
+
+ VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
+
+ VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
+
+ umem_free(packed, nvsize);
+
+ dump_nvlist(nv, 8);
+
+ nvlist_free(nv);
+}
+
+/* ARGSUSED */
+static void
+dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ spa_history_phys_t *shp = data;
+
+ if (shp == NULL)
+ return;
+
+ (void) printf("\t\tpool_create_len = %llu\n",
+ (u_longlong_t)shp->sh_pool_create_len);
+ (void) printf("\t\tphys_max_off = %llu\n",
+ (u_longlong_t)shp->sh_phys_max_off);
+ (void) printf("\t\tbof = %llu\n",
+ (u_longlong_t)shp->sh_bof);
+ (void) printf("\t\teof = %llu\n",
+ (u_longlong_t)shp->sh_eof);
+ (void) printf("\t\trecords_lost = %llu\n",
+ (u_longlong_t)shp->sh_records_lost);
+}
+
+static void
+zdb_nicenum(uint64_t num, char *buf, size_t buflen)
+{
+ if (dump_opt['P'])
+ (void) snprintf(buf, buflen, "%llu", (u_longlong_t)num);
+ else
+ nicenum(num, buf, buflen);
+}
+
+static const char histo_stars[] = "****************************************";
+static const uint64_t histo_width = sizeof (histo_stars) - 1;
+
+static void
+dump_histogram(const uint64_t *histo, int size, int offset)
+{
+ int i;
+ int minidx = size - 1;
+ int maxidx = 0;
+ uint64_t max = 0;
+
+ for (i = 0; i < size; i++) {
+ if (histo[i] > max)
+ max = histo[i];
+ if (histo[i] > 0 && i > maxidx)
+ maxidx = i;
+ if (histo[i] > 0 && i < minidx)
+ minidx = i;
+ }
+
+ if (max < histo_width)
+ max = histo_width;
+
+ for (i = minidx; i <= maxidx; i++) {
+ (void) printf("\t\t\t%3u: %6llu %s\n",
+ i + offset, (u_longlong_t)histo[i],
+ &histo_stars[(max - histo[i]) * histo_width / max]);
+ }
+}
+
+static void
+dump_zap_stats(objset_t *os, uint64_t object)
+{
+ int error;
+ zap_stats_t zs;
+
+ error = zap_get_stats(os, object, &zs);
+ if (error)
+ return;
+
+ if (zs.zs_ptrtbl_len == 0) {
+ ASSERT(zs.zs_num_blocks == 1);
+ (void) printf("\tmicrozap: %llu bytes, %llu entries\n",
+ (u_longlong_t)zs.zs_blocksize,
+ (u_longlong_t)zs.zs_num_entries);
+ return;
+ }
+
+ (void) printf("\tFat ZAP stats:\n");
+
+ (void) printf("\t\tPointer table:\n");
+ (void) printf("\t\t\t%llu elements\n",
+ (u_longlong_t)zs.zs_ptrtbl_len);
+ (void) printf("\t\t\tzt_blk: %llu\n",
+ (u_longlong_t)zs.zs_ptrtbl_zt_blk);
+ (void) printf("\t\t\tzt_numblks: %llu\n",
+ (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
+ (void) printf("\t\t\tzt_shift: %llu\n",
+ (u_longlong_t)zs.zs_ptrtbl_zt_shift);
+ (void) printf("\t\t\tzt_blks_copied: %llu\n",
+ (u_longlong_t)zs.zs_ptrtbl_blks_copied);
+ (void) printf("\t\t\tzt_nextblk: %llu\n",
+ (u_longlong_t)zs.zs_ptrtbl_nextblk);
+
+ (void) printf("\t\tZAP entries: %llu\n",
+ (u_longlong_t)zs.zs_num_entries);
+ (void) printf("\t\tLeaf blocks: %llu\n",
+ (u_longlong_t)zs.zs_num_leafs);
+ (void) printf("\t\tTotal blocks: %llu\n",
+ (u_longlong_t)zs.zs_num_blocks);
+ (void) printf("\t\tzap_block_type: 0x%llx\n",
+ (u_longlong_t)zs.zs_block_type);
+ (void) printf("\t\tzap_magic: 0x%llx\n",
+ (u_longlong_t)zs.zs_magic);
+ (void) printf("\t\tzap_salt: 0x%llx\n",
+ (u_longlong_t)zs.zs_salt);
+
+ (void) printf("\t\tLeafs with 2^n pointers:\n");
+ dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
+
+ (void) printf("\t\tBlocks with n*5 entries:\n");
+ dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
+
+ (void) printf("\t\tBlocks n/10 full:\n");
+ dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
+
+ (void) printf("\t\tEntries with n chunks:\n");
+ dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
+
+ (void) printf("\t\tBuckets with n entries:\n");
+ dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
+}
+
+/*ARGSUSED*/
+static void
+dump_none(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*ARGSUSED*/
+static void
+dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ (void) printf("\tUNKNOWN OBJECT TYPE\n");
+}
+
+/*ARGSUSED*/
+static void
+dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*ARGSUSED*/
+static void
+dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ uint64_t *arr;
+ uint64_t oursize;
+ if (dump_opt['d'] < 6)
+ return;
+
+ if (data == NULL) {
+ dmu_object_info_t doi;
+
+ VERIFY0(dmu_object_info(os, object, &doi));
+ size = doi.doi_max_offset;
+ /*
+ * We cap the size at 1 mebibyte here to prevent
+ * allocation failures and nigh-infinite printing if the
+ * object is extremely large.
+ */
+ oursize = MIN(size, 1 << 20);
+ arr = kmem_alloc(oursize, KM_SLEEP);
+
+ int err = dmu_read(os, object, 0, oursize, arr, 0);
+ if (err != 0) {
+ (void) printf("got error %u from dmu_read\n", err);
+ kmem_free(arr, oursize);
+ return;
+ }
+ } else {
+ /*
+ * Even though the allocation is already done in this code path,
+ * we still cap the size to prevent excessive printing.
+ */
+ oursize = MIN(size, 1 << 20);
+ arr = data;
+ }
+
+ if (size == 0) {
+ (void) printf("\t\t[]\n");
+ return;
+ }
+
+ (void) printf("\t\t[%0llx", (u_longlong_t)arr[0]);
+ for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) {
+ if (i % 4 != 0)
+ (void) printf(", %0llx", (u_longlong_t)arr[i]);
+ else
+ (void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]);
+ }
+ if (oursize != size)
+ (void) printf(", ... ");
+ (void) printf("]\n");
+
+ if (data == NULL)
+ kmem_free(arr, oursize);
+}
+
+/*ARGSUSED*/
+static void
+dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ void *prop;
+ unsigned i;
+
+ dump_zap_stats(os, object);
+ (void) printf("\n");
+
+ for (zap_cursor_init(&zc, os, object);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ zap_cursor_advance(&zc)) {
+ (void) printf("\t\t%s = ", attr.za_name);
+ if (attr.za_num_integers == 0) {
+ (void) printf("\n");
+ continue;
+ }
+ prop = umem_zalloc(attr.za_num_integers *
+ attr.za_integer_length, UMEM_NOFAIL);
+ (void) zap_lookup(os, object, attr.za_name,
+ attr.za_integer_length, attr.za_num_integers, prop);
+ if (attr.za_integer_length == 1) {
+ (void) printf("%s", (char *)prop);
+ } else {
+ for (i = 0; i < attr.za_num_integers; i++) {
+ switch (attr.za_integer_length) {
+ case 2:
+ (void) printf("%u ",
+ ((uint16_t *)prop)[i]);
+ break;
+ case 4:
+ (void) printf("%u ",
+ ((uint32_t *)prop)[i]);
+ break;
+ case 8:
+ (void) printf("%lld ",
+ (longlong_t)((int64_t *)prop)[i]);
+ break;
+ }
+ }
+ }
+ (void) printf("\n");
+ umem_free(prop, attr.za_num_integers * attr.za_integer_length);
+ }
+ zap_cursor_fini(&zc);
+}
+
+static void
+dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ bpobj_phys_t *bpop = data;
+ uint64_t i;
+ char bytes[32], comp[32], uncomp[32];
+
+ /* make sure the output won't get truncated */
+ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
+
+ if (bpop == NULL)
+ return;
+
+ zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
+ zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
+ zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));
+
+ (void) printf("\t\tnum_blkptrs = %llu\n",
+ (u_longlong_t)bpop->bpo_num_blkptrs);
+ (void) printf("\t\tbytes = %s\n", bytes);
+ if (size >= BPOBJ_SIZE_V1) {
+ (void) printf("\t\tcomp = %s\n", comp);
+ (void) printf("\t\tuncomp = %s\n", uncomp);
+ }
+ if (size >= BPOBJ_SIZE_V2) {
+ (void) printf("\t\tsubobjs = %llu\n",
+ (u_longlong_t)bpop->bpo_subobjs);
+ (void) printf("\t\tnum_subobjs = %llu\n",
+ (u_longlong_t)bpop->bpo_num_subobjs);
+ }
+ if (size >= sizeof (*bpop)) {
+ (void) printf("\t\tnum_freed = %llu\n",
+ (u_longlong_t)bpop->bpo_num_freed);
+ }
+
+ if (dump_opt['d'] < 5)
+ return;
+
+ for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
+ char blkbuf[BP_SPRINTF_LEN];
+ blkptr_t bp;
+
+ int err = dmu_read(os, object,
+ i * sizeof (bp), sizeof (bp), &bp, 0);
+ if (err != 0) {
+ (void) printf("got error %u from dmu_read\n", err);
+ break;
+ }
+ snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
+ BP_GET_FREE(&bp));
+ (void) printf("\t%s\n", blkbuf);
+ }
+}
+
+/* ARGSUSED */
+static void
+dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ dmu_object_info_t doi;
+ int64_t i;
+
+ VERIFY0(dmu_object_info(os, object, &doi));
+ uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
+
+ int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
+ if (err != 0) {
+ (void) printf("got error %u from dmu_read\n", err);
+ kmem_free(subobjs, doi.doi_max_offset);
+ return;
+ }
+
+ int64_t last_nonzero = -1;
+ for (i = 0; i < doi.doi_max_offset / 8; i++) {
+ if (subobjs[i] != 0)
+ last_nonzero = i;
+ }
+
+ for (i = 0; i <= last_nonzero; i++) {
+ (void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
+ }
+ kmem_free(subobjs, doi.doi_max_offset);
+}
+
+/*ARGSUSED*/
+static void
+dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ dump_zap_stats(os, object);
+ /* contents are printed elsewhere, properly decoded */
+}
+
+/*ARGSUSED*/
+static void
+dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+
+ dump_zap_stats(os, object);
+ (void) printf("\n");
+
+ for (zap_cursor_init(&zc, os, object);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ zap_cursor_advance(&zc)) {
+ (void) printf("\t\t%s = ", attr.za_name);
+ if (attr.za_num_integers == 0) {
+ (void) printf("\n");
+ continue;
+ }
+ (void) printf(" %llx : [%d:%d:%d]\n",
+ (u_longlong_t)attr.za_first_integer,
+ (int)ATTR_LENGTH(attr.za_first_integer),
+ (int)ATTR_BSWAP(attr.za_first_integer),
+ (int)ATTR_NUM(attr.za_first_integer));
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*ARGSUSED*/
+static void
+dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ uint16_t *layout_attrs;
+ unsigned i;
+
+ dump_zap_stats(os, object);
+ (void) printf("\n");
+
+ for (zap_cursor_init(&zc, os, object);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ zap_cursor_advance(&zc)) {
+ (void) printf("\t\t%s = [", attr.za_name);
+ if (attr.za_num_integers == 0) {
+ (void) printf("\n");
+ continue;
+ }
+
+ VERIFY(attr.za_integer_length == 2);
+ layout_attrs = umem_zalloc(attr.za_num_integers *
+ attr.za_integer_length, UMEM_NOFAIL);
+
+ VERIFY(zap_lookup(os, object, attr.za_name,
+ attr.za_integer_length,
+ attr.za_num_integers, layout_attrs) == 0);
+
+ for (i = 0; i != attr.za_num_integers; i++)
+ (void) printf(" %d ", (int)layout_attrs[i]);
+ (void) printf("]\n");
+ umem_free(layout_attrs,
+ attr.za_num_integers * attr.za_integer_length);
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*ARGSUSED*/
+static void
+dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ const char *typenames[] = {
+ /* 0 */ "not specified",
+ /* 1 */ "FIFO",
+ /* 2 */ "Character Device",
+ /* 3 */ "3 (invalid)",
+ /* 4 */ "Directory",
+ /* 5 */ "5 (invalid)",
+ /* 6 */ "Block Device",
+ /* 7 */ "7 (invalid)",
+ /* 8 */ "Regular File",
+ /* 9 */ "9 (invalid)",
+ /* 10 */ "Symbolic Link",
+ /* 11 */ "11 (invalid)",
+ /* 12 */ "Socket",
+ /* 13 */ "Door",
+ /* 14 */ "Event Port",
+ /* 15 */ "15 (invalid)",
+ };
+
+ dump_zap_stats(os, object);
+ (void) printf("\n");
+
+ for (zap_cursor_init(&zc, os, object);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ zap_cursor_advance(&zc)) {
+ (void) printf("\t\t%s = %lld (type: %s)\n",
+ attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
+ typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
+ }
+ zap_cursor_fini(&zc);
+}
+
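+/*
+ * The get_*_refcount() helpers below count the space maps that should
+ * be accounted for in the spacemap_histogram feature refcount;
+ * verify_spacemap_refcounts() compares their sum against the feature's
+ * actual refcount.
+ */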
+static int
+get_dtl_refcount(vdev_t *vd)
+{
+ int refcount = 0;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ space_map_t *sm = vd->vdev_dtl_sm;
+
+ if (sm != NULL &&
+ sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
+ return (1);
+ return (0);
+ }
+
+ for (unsigned c = 0; c < vd->vdev_children; c++)
+ refcount += get_dtl_refcount(vd->vdev_child[c]);
+ return (refcount);
+}
+
+static int
+get_metaslab_refcount(vdev_t *vd)
+{
+ int refcount = 0;
+
+ if (vd->vdev_top == vd) {
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ space_map_t *sm = vd->vdev_ms[m]->ms_sm;
+
+ if (sm != NULL &&
+ sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
+ refcount++;
+ }
+ }
+ for (unsigned c = 0; c < vd->vdev_children; c++)
+ refcount += get_metaslab_refcount(vd->vdev_child[c]);
+
+ return (refcount);
+}
+
+static int
+get_obsolete_refcount(vdev_t *vd)
+{
+ uint64_t obsolete_sm_object;
+ int refcount = 0;
+
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ if (vd->vdev_top == vd && obsolete_sm_object != 0) {
+ dmu_object_info_t doi;
+ VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
+ obsolete_sm_object, &doi));
+ if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
+ refcount++;
+ }
+ } else {
+ ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
+ ASSERT3U(obsolete_sm_object, ==, 0);
+ }
+ for (unsigned c = 0; c < vd->vdev_children; c++) {
+ refcount += get_obsolete_refcount(vd->vdev_child[c]);
+ }
+
+ return (refcount);
+}
+
+static int
+get_prev_obsolete_spacemap_refcount(spa_t *spa)
+{
+ uint64_t prev_obj =
+ spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
+ if (prev_obj != 0) {
+ dmu_object_info_t doi;
+ VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
+ if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
+ return (1);
+ }
+ }
+ return (0);
+}
+
+static int
+get_checkpoint_refcount(vdev_t *vd)
+{
+ int refcount = 0;
+
+ if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
+ zap_contains(spa_meta_objset(vd->vdev_spa),
+ vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
+ refcount++;
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++)
+ refcount += get_checkpoint_refcount(vd->vdev_child[c]);
+
+ return (refcount);
+}
+
+static int
+get_log_spacemap_refcount(spa_t *spa)
+{
+ return (avl_numnodes(&spa->spa_sm_logs_by_txg));
+}
+
+static int
+verify_spacemap_refcounts(spa_t *spa)
+{
+ uint64_t expected_refcount = 0;
+ uint64_t actual_refcount;
+
+ (void) feature_get_refcount(spa,
+ &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
+ &expected_refcount);
+ actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
+ actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
+ actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
+ actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
+ actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
+ actual_refcount += get_log_spacemap_refcount(spa);
+
+ if (expected_refcount != actual_refcount) {
+ (void) printf("space map refcount mismatch: expected %lld != "
+ "actual %lld\n",
+ (longlong_t)expected_refcount,
+ (longlong_t)actual_refcount);
+ return (2);
+ }
+ return (0);
+}
+
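+/*
+ * Dump a space map: the header fields always, and with -dddddd or -mmmm
+ * also every entry, decoding both one- and two-word encodings and
+ * checking the running allocated total against the space map summary.
+ */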
+static void
+dump_spacemap(objset_t *os, space_map_t *sm)
+{
+ const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
+ "INVALID", "INVALID", "INVALID", "INVALID" };
+
+ if (sm == NULL)
+ return;
+
+ (void) printf("space map object %llu:\n",
+ (longlong_t)sm->sm_object);
+ (void) printf(" smp_length = 0x%llx\n",
+ (longlong_t)sm->sm_phys->smp_length);
+ (void) printf(" smp_alloc = 0x%llx\n",
+ (longlong_t)sm->sm_phys->smp_alloc);
+
+ if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
+ return;
+
+ /*
+ * Print out the freelist entries in both encoded and decoded form.
+ */
+ uint8_t mapshift = sm->sm_shift;
+ int64_t alloc = 0;
+ uint64_t word, entry_id = 0;
+ for (uint64_t offset = 0; offset < space_map_length(sm);
+ offset += sizeof (word)) {
+
+ VERIFY0(dmu_read(os, space_map_object(sm), offset,
+ sizeof (word), &word, DMU_READ_PREFETCH));
+
+ if (sm_entry_is_debug(word)) {
+ uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
+ uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
+ if (de_txg == 0) {
+ (void) printf(
+ "\t [%6llu] PADDING\n",
+ (u_longlong_t)entry_id);
+ } else {
+ (void) printf(
+ "\t [%6llu] %s: txg %llu pass %llu\n",
+ (u_longlong_t)entry_id,
+ ddata[SM_DEBUG_ACTION_DECODE(word)],
+ (u_longlong_t)de_txg,
+ (u_longlong_t)de_sync_pass);
+ }
+ entry_id++;
+ continue;
+ }
+
+ uint8_t words;
+ char entry_type;
+ uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;
+
+ if (sm_entry_is_single_word(word)) {
+ entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
+ 'A' : 'F';
+ entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
+ sm->sm_start;
+ entry_run = SM_RUN_DECODE(word) << mapshift;
+ words = 1;
+ } else {
+ /* it is a two-word entry so we read another word */
+ ASSERT(sm_entry_is_double_word(word));
+
+ uint64_t extra_word;
+ offset += sizeof (extra_word);
+ VERIFY0(dmu_read(os, space_map_object(sm), offset,
+ sizeof (extra_word), &extra_word,
+ DMU_READ_PREFETCH));
+
+ ASSERT3U(offset, <=, space_map_length(sm));
+
+ entry_run = SM2_RUN_DECODE(word) << mapshift;
+ entry_vdev = SM2_VDEV_DECODE(word);
+ entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
+ 'A' : 'F';
+ entry_off = (SM2_OFFSET_DECODE(extra_word) <<
+ mapshift) + sm->sm_start;
+ words = 2;
+ }
+
+ (void) printf("\t [%6llu] %c range:"
+ " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n",
+ (u_longlong_t)entry_id,
+ entry_type, (u_longlong_t)entry_off,
+ (u_longlong_t)(entry_off + entry_run),
+ (u_longlong_t)entry_run,
+ (u_longlong_t)entry_vdev, words);
+
+ if (entry_type == 'A')
+ alloc += entry_run;
+ else
+ alloc -= entry_run;
+ entry_id++;
+ }
+ if (alloc != space_map_allocated(sm)) {
+ (void) printf("space_map_object alloc (%lld) INCONSISTENT "
+ "with space map summary (%lld)\n",
+ (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
+ }
+}
+
+static void
+dump_metaslab_stats(metaslab_t *msp)
+{
+ char maxbuf[32];
+ range_tree_t *rt = msp->ms_allocatable;
+ zfs_btree_t *t = &msp->ms_allocatable_by_size;
+ int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ);
+
+ zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));
+
+ (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
+ "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
+ "freepct", free_pct);
+ (void) printf("\tIn-memory histogram:\n");
+ dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
+}
+
+static void
+dump_metaslab(metaslab_t *msp)
+{
+ vdev_t *vd = msp->ms_group->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ space_map_t *sm = msp->ms_sm;
+ char freebuf[32];
+
+ zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
+ sizeof (freebuf));
+
+ (void) printf(
+ "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
+ (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
+ (u_longlong_t)space_map_object(sm), freebuf);
+
+ if (dump_opt['m'] > 2 && !dump_opt['L']) {
+ mutex_enter(&msp->ms_lock);
+ VERIFY0(metaslab_load(msp));
+ range_tree_stat_verify(msp->ms_allocatable);
+ dump_metaslab_stats(msp);
+ metaslab_unload(msp);
+ mutex_exit(&msp->ms_lock);
+ }
+
+ if (dump_opt['m'] > 1 && sm != NULL &&
+ spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+ /*
+ * The space map histogram represents free space in chunks
+ * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
+ */
+ (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
+ (u_longlong_t)msp->ms_fragmentation);
+ dump_histogram(sm->sm_phys->smp_histogram,
+ SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
+ }
+
+ ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
+ dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
+ (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
+ (u_longlong_t)metaslab_unflushed_txg(msp));
+ }
+}
+
+static void
+print_vdev_metaslab_header(vdev_t *vd)
+{
+ vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
+ const char *bias_str = "";
+ if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) {
+ bias_str = VDEV_ALLOC_BIAS_LOG;
+ } else if (alloc_bias == VDEV_BIAS_SPECIAL) {
+ bias_str = VDEV_ALLOC_BIAS_SPECIAL;
+ } else if (alloc_bias == VDEV_BIAS_DEDUP) {
+ bias_str = VDEV_ALLOC_BIAS_DEDUP;
+ }
+
+ uint64_t ms_flush_data_obj = 0;
+ if (vd->vdev_top_zap != 0) {
+ int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
+ vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
+ sizeof (uint64_t), 1, &ms_flush_data_obj);
+ if (error != ENOENT) {
+ ASSERT0(error);
+ }
+ }
+
+ (void) printf("\tvdev %10llu %s",
+ (u_longlong_t)vd->vdev_id, bias_str);
+
+ if (ms_flush_data_obj != 0) {
+ (void) printf(" ms_unflushed_phys object %llu",
+ (u_longlong_t)ms_flush_data_obj);
+ }
+
+ (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n",
+ "metaslabs", (u_longlong_t)vd->vdev_ms_count,
+ "offset", "spacemap", "free");
+ (void) printf("\t%15s %19s %15s %12s\n",
+ "---------------", "-------------------",
+ "---------------", "------------");
+}
+
+static void
+dump_metaslab_groups(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ metaslab_class_t *mc = spa_normal_class(spa);
+ uint64_t fragmentation;
+
+ metaslab_class_histogram_verify(mc);
+
+ for (unsigned c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ if (mg == NULL || mg->mg_class != mc)
+ continue;
+
+ metaslab_group_histogram_verify(mg);
+ mg->mg_fragmentation = metaslab_group_fragmentation(mg);
+
+ (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
+ "fragmentation",
+ (u_longlong_t)tvd->vdev_id,
+ (u_longlong_t)tvd->vdev_ms_count);
+ if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
+ (void) printf("%3s\n", "-");
+ } else {
+ (void) printf("%3llu%%\n",
+ (u_longlong_t)mg->mg_fragmentation);
+ }
+ dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
+ }
+
+ (void) printf("\tpool %s\tfragmentation", spa_name(spa));
+ fragmentation = metaslab_class_fragmentation(mc);
+ if (fragmentation == ZFS_FRAG_INVALID)
+ (void) printf("\t%3s\n", "-");
+ else
+ (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
+ dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
+}
+
+static void
+print_vdev_indirect(vdev_t *vd)
+{
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ vdev_indirect_births_t *vib = vd->vdev_indirect_births;
+
+ if (vim == NULL) {
+ ASSERT3P(vib, ==, NULL);
+ return;
+ }
+
+ ASSERT3U(vdev_indirect_mapping_object(vim), ==,
+ vic->vic_mapping_object);
+ ASSERT3U(vdev_indirect_births_object(vib), ==,
+ vic->vic_births_object);
+
+ (void) printf("indirect births obj %llu:\n",
+ (longlong_t)vic->vic_births_object);
+ (void) printf(" vib_count = %llu\n",
+ (longlong_t)vdev_indirect_births_count(vib));
+ for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
+ vdev_indirect_birth_entry_phys_t *cur_vibe =
+ &vib->vib_entries[i];
+ (void) printf("\toffset %llx -> txg %llu\n",
+ (longlong_t)cur_vibe->vibe_offset,
+ (longlong_t)cur_vibe->vibe_phys_birth_txg);
+ }
+ (void) printf("\n");
+
+ (void) printf("indirect mapping obj %llu:\n",
+ (longlong_t)vic->vic_mapping_object);
+ (void) printf(" vim_max_offset = 0x%llx\n",
+ (longlong_t)vdev_indirect_mapping_max_offset(vim));
+ (void) printf(" vim_bytes_mapped = 0x%llx\n",
+ (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
+ (void) printf(" vim_count = %llu\n",
+ (longlong_t)vdev_indirect_mapping_num_entries(vim));
+
+ if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
+ return;
+
+ uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);
+
+ for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
+ vdev_indirect_mapping_entry_phys_t *vimep =
+ &vim->vim_entries[i];
+ (void) printf("\t<%llx:%llx:%llx> -> "
+ "<%llx:%llx:%llx> (%x obsolete)\n",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
+ (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
+ (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
+ (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
+ (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
+ counts[i]);
+ }
+ (void) printf("\n");
+
+ uint64_t obsolete_sm_object;
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ if (obsolete_sm_object != 0) {
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ (void) printf("obsolete space map object %llu:\n",
+ (u_longlong_t)obsolete_sm_object);
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
+ obsolete_sm_object);
+ dump_spacemap(mos, vd->vdev_obsolete_sm);
+ (void) printf("\n");
+ }
+}
+
+static void
+dump_metaslabs(spa_t *spa)
+{
+ vdev_t *vd, *rvd = spa->spa_root_vdev;
+ uint64_t m, c = 0, children = rvd->vdev_children;
+
+ (void) printf("\nMetaslabs:\n");
+
+ if (!dump_opt['d'] && zopt_metaslab_args > 0) {
+ c = zopt_metaslab[0];
+
+ if (c >= children)
+ (void) fatal("bad vdev id: %llu", (u_longlong_t)c);
+
+ if (zopt_metaslab_args > 1) {
+ vd = rvd->vdev_child[c];
+ print_vdev_metaslab_header(vd);
+
+ for (m = 1; m < zopt_metaslab_args; m++) {
+ if (zopt_metaslab[m] < vd->vdev_ms_count)
+ dump_metaslab(
+ vd->vdev_ms[zopt_metaslab[m]]);
+ else
+ (void) fprintf(stderr, "bad metaslab "
+ "number %llu\n",
+ (u_longlong_t)zopt_metaslab[m]);
+ }
+ (void) printf("\n");
+ return;
+ }
+ children = c + 1;
+ }
+ for (; c < children; c++) {
+ vd = rvd->vdev_child[c];
+ print_vdev_metaslab_header(vd);
+
+ print_vdev_indirect(vd);
+
+ for (m = 0; m < vd->vdev_ms_count; m++)
+ dump_metaslab(vd->vdev_ms[m]);
+ (void) printf("\n");
+ }
+}
+
+static void
+dump_log_spacemaps(spa_t *spa)
+{
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ (void) printf("\nLog Space Maps in Pool:\n");
+ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+ sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+ space_map_t *sm = NULL;
+ VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
+ sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
+
+ (void) printf("Log Spacemap object %llu txg %llu\n",
+ (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg);
+ dump_spacemap(spa->spa_meta_objset, sm);
+ space_map_close(sm);
+ }
+ (void) printf("\n");
+}
+
+static void
+dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+{
+ const ddt_phys_t *ddp = dde->dde_phys;
+ const ddt_key_t *ddk = &dde->dde_key;
+ const char *types[4] = { "ditto", "single", "double", "triple" };
+ char blkbuf[BP_SPRINTF_LEN];
+ blkptr_t blk;
+ int p;
+
+ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
+ (void) printf("index %llx refcnt %llu %s %s\n",
+ (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
+ types[p], blkbuf);
+ }
+}
+
+static void
+dump_dedup_ratio(const ddt_stat_t *dds)
+{
+ double rL, rP, rD, D, dedup, compress, copies;
+
+ if (dds->dds_blocks == 0)
+ return;
+
+ rL = (double)dds->dds_ref_lsize;
+ rP = (double)dds->dds_ref_psize;
+ rD = (double)dds->dds_ref_dsize;
+ D = (double)dds->dds_dsize;
+
+ dedup = rD / D;
+ compress = rL / rP;
+ copies = rD / rP;
+
+ (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+ "dedup * compress / copies = %.2f\n\n",
+ dedup, compress, copies, dedup * compress / copies);
+}
+
+static void
+dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ char name[DDT_NAMELEN];
+ ddt_entry_t dde;
+ uint64_t walk = 0;
+ dmu_object_info_t doi;
+ uint64_t count, dspace, mspace;
+ int error;
+
+ error = ddt_object_info(ddt, type, class, &doi);
+
+ if (error == ENOENT)
+ return;
+ ASSERT(error == 0);
+
+ error = ddt_object_count(ddt, type, class, &count);
+ ASSERT(error == 0);
+ if (count == 0)
+ return;
+
+ dspace = doi.doi_physical_blocks_512 << 9;
+ mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+ ddt_object_name(ddt, type, class, name);
+
+ (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+ name,
+ (u_longlong_t)count,
+ (u_longlong_t)(dspace / count),
+ (u_longlong_t)(mspace / count));
+
+ if (dump_opt['D'] < 3)
+ return;
+
+ zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
+
+ if (dump_opt['D'] < 4)
+ return;
+
+ if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
+ return;
+
+ (void) printf("%s contents:\n\n", name);
+
+ while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
+ dump_dde(ddt, &dde, walk);
+
+ ASSERT3U(error, ==, ENOENT);
+
+ (void) printf("\n");
+}
+
+static void
+dump_all_ddts(spa_t *spa)
+{
+ ddt_histogram_t ddh_total;
+ ddt_stat_t dds_total;
+
+ bzero(&ddh_total, sizeof (ddh_total));
+ bzero(&dds_total, sizeof (dds_total));
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ dump_ddt(ddt, type, class);
+ }
+ }
+ }
+
+ ddt_get_dedup_stats(spa, &dds_total);
+
+ if (dds_total.dds_blocks == 0) {
+ (void) printf("All DDTs are empty\n");
+ return;
+ }
+
+ (void) printf("\n");
+
+ if (dump_opt['D'] > 1) {
+ (void) printf("DDT histogram (aggregated over all DDTs):\n");
+ ddt_get_dedup_histogram(spa, &ddh_total);
+ zpool_dump_ddt(&dds_total, &ddh_total);
+ }
+
+ dump_dedup_ratio(&dds_total);
+}
+
+static void
+dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
+{
+ char *prefix = arg;
+
+ (void) printf("%s [%llu,%llu) length %llu\n",
+ prefix,
+ (u_longlong_t)start,
+ (u_longlong_t)(start + size),
+ (u_longlong_t)(size));
+}
+
+static void
+dump_dtl(vdev_t *vd, int indent)
+{
+ spa_t *spa = vd->vdev_spa;
+ boolean_t required;
+ const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
+ "outage" };
+ char prefix[256];
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+ required = vdev_dtl_required(vd);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+
+ if (indent == 0)
+ (void) printf("\nDirty time logs:\n\n");
+
+ (void) printf("\t%*s%s [%s]\n", indent, "",
+ vd->vdev_path ? vd->vdev_path :
+ vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
+ required ? "DTL-required" : "DTL-expendable");
+
+ for (int t = 0; t < DTL_TYPES; t++) {
+ range_tree_t *rt = vd->vdev_dtl[t];
+ if (range_tree_space(rt) == 0)
+ continue;
+ (void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
+ indent + 2, "", name[t]);
+ range_tree_walk(rt, dump_dtl_seg, prefix);
+ if (dump_opt['d'] > 5 && vd->vdev_children == 0)
+ dump_spacemap(spa->spa_meta_objset,
+ vd->vdev_dtl_sm);
+ }
+
+ for (unsigned c = 0; c < vd->vdev_children; c++)
+ dump_dtl(vd->vdev_child[c], indent + 4);
+}
+
+static void
+dump_history(spa_t *spa)
+{
+ nvlist_t **events = NULL;
+ char *buf;
+ uint64_t resid, len, off = 0;
+ uint_t num = 0;
+ int error;
+ time_t tsec;
+ struct tm t;
+ char tbuf[30];
+ char internalstr[MAXPATHLEN];
+
+ if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
+ (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
+ __func__);
+ return;
+ }
+
+ do {
+ len = SPA_OLD_MAXBLOCKSIZE;
+
+ if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
+ (void) fprintf(stderr, "Unable to read history: "
+ "error %d\n", error);
+ free(buf);
+ return;
+ }
+
+ if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
+ break;
+
+ off -= resid;
+ } while (len != 0);
+
+ (void) printf("\nHistory:\n");
+ for (unsigned i = 0; i < num; i++) {
+ uint64_t time, txg, ievent;
+ char *cmd, *intstr;
+ boolean_t printed = B_FALSE;
+
+ if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
+ &time) != 0)
+ goto next;
+ if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
+ &cmd) != 0) {
+ if (nvlist_lookup_uint64(events[i],
+ ZPOOL_HIST_INT_EVENT, &ievent) != 0)
+ goto next;
+ verify(nvlist_lookup_uint64(events[i],
+ ZPOOL_HIST_TXG, &txg) == 0);
+ verify(nvlist_lookup_string(events[i],
+ ZPOOL_HIST_INT_STR, &intstr) == 0);
+ if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
+ goto next;
+
+ (void) snprintf(internalstr,
+ sizeof (internalstr),
+ "[internal %s txg:%lld] %s",
+ zfs_history_event_names[ievent],
+ (longlong_t)txg, intstr);
+ cmd = internalstr;
+ }
+ tsec = time;
+ (void) localtime_r(&tsec, &t);
+ (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+ (void) printf("%s %s\n", tbuf, cmd);
+ printed = B_TRUE;
+
+next:
+ if (dump_opt['h'] > 1) {
+ if (!printed)
+ (void) printf("unrecognized record:\n");
+ dump_nvlist(events[i], 2);
+ }
+ }
+ free(buf);
+}
+
+/*ARGSUSED*/
+static void
+dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
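+/*
+ * Convert a bookmark's (level, blkid) pair into the byte offset of the
+ * first data byte it maps within the object.
+ */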
+static uint64_t
+blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
+ const zbookmark_phys_t *zb)
+{
+ if (dnp == NULL) {
+ ASSERT(zb->zb_level < 0);
+ if (zb->zb_object == 0)
+ return (zb->zb_blkid);
+ return (zb->zb_blkid * BP_GET_LSIZE(bp));
+ }
+
+ ASSERT(zb->zb_level >= 0);
+
+ return ((zb->zb_blkid <<
+ (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
+ dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+}
+
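+/*
+ * Append the ZSTD header of a ZSTD-compressed block (compressed size,
+ * version and level) to blkbuf.  Embedded blocks are decoded in place;
+ * otherwise the block is read back raw (decrypted but not decompressed)
+ * so the header can be examined.
+ */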
+static void
+snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
+ const blkptr_t *bp)
+{
+ abd_t *pabd;
+ void *buf;
+ zio_t *zio;
+ zfs_zstdhdr_t zstd_hdr;
+ int error;
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD)
+ return;
+
+ if (BP_IS_HOLE(bp))
+ return;
+
+ if (BP_IS_EMBEDDED(bp)) {
+ buf = malloc(SPA_MAXBLOCKSIZE);
+ if (buf == NULL) {
+ (void) fprintf(stderr, "out of memory\n");
+ exit(1);
+ }
+ decode_embedded_bp_compressed(bp, buf);
+ memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
+ free(buf);
+ zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
+ zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf),
+ " ZSTD:size=%u:version=%u:level=%u:EMBEDDED",
+ zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level);
+ return;
+ }
+
+ pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
+ zio = zio_root(spa, NULL, NULL, 0);
+
+ /* Decrypt but don't decompress so we can read the compression header */
+ zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS,
+ NULL));
+ error = zio_wait(zio);
+ if (error) {
+ (void) fprintf(stderr, "read failed: %d\n", error);
+ return;
+ }
+ buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp));
+ memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
+ zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
+ zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
+
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf),
+ " ZSTD:size=%u:version=%u:level=%u:NORMAL",
+ zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level);
+
+ abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));
+}
+
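+/*
+ * Format a block pointer on a single line: DVAs, logical/physical
+ * sizes, fill count, birth txgs and checksum, with " FREE" appended for
+ * freed bpobj entries.  At -bbbbbb and above the full snprintf_blkptr()
+ * output is used instead.
+ */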
+static void
+snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
+ boolean_t bp_freed)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
+ int i;
+
+ if (dump_opt['b'] >= 6) {
+ snprintf_blkptr(blkbuf, buflen, bp);
+ if (bp_freed) {
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf), " %s", "FREE");
+ }
+ return;
+ }
+
+ if (BP_IS_EMBEDDED(bp)) {
+ (void) sprintf(blkbuf,
+ "EMBEDDED et=%u %llxL/%llxP B=%llu",
+ (int)BPE_GET_ETYPE(bp),
+ (u_longlong_t)BPE_GET_LSIZE(bp),
+ (u_longlong_t)BPE_GET_PSIZE(bp),
+ (u_longlong_t)bp->blk_birth);
+ return;
+ }
+
+ blkbuf[0] = '\0';
+
+ for (i = 0; i < ndvas; i++)
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf), "%llu:%llx:%llx ",
+ (u_longlong_t)DVA_GET_VDEV(&dva[i]),
+ (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
+ (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
+
+ if (BP_IS_HOLE(bp)) {
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf),
+ "%llxL B=%llu",
+ (u_longlong_t)BP_GET_LSIZE(bp),
+ (u_longlong_t)bp->blk_birth);
+ } else {
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf),
+ "%llxL/%llxP F=%llu B=%llu/%llu",
+ (u_longlong_t)BP_GET_LSIZE(bp),
+ (u_longlong_t)BP_GET_PSIZE(bp),
+ (u_longlong_t)BP_GET_FILL(bp),
+ (u_longlong_t)bp->blk_birth,
+ (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
+ if (bp_freed)
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf), " %s", "FREE");
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf), " cksum=%llx:%llx:%llx:%llx",
+ (u_longlong_t)bp->blk_cksum.zc_word[0],
+ (u_longlong_t)bp->blk_cksum.zc_word[1],
+ (u_longlong_t)bp->blk_cksum.zc_word[2],
+ (u_longlong_t)bp->blk_cksum.zc_word[3]);
+ }
+}
+
+static void
+print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,
+ const dnode_phys_t *dnp)
+{
+ char blkbuf[BP_SPRINTF_LEN];
+ int l;
+
+ if (!BP_IS_EMBEDDED(bp)) {
+ ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
+ ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+ }
+
+ (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
+
+ ASSERT(zb->zb_level >= 0);
+
+ for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
+ if (l == zb->zb_level) {
+ (void) printf("L%llx", (u_longlong_t)zb->zb_level);
+ } else {
+ (void) printf(" ");
+ }
+ }
+
+ snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);
+ if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD)
+ snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp);
+ (void) printf("%s\n", blkbuf);
+}
+
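+/*
+ * Print one block pointer and, if it is an indirect block, read it through
+ * the ARC and recurse into each child bp, checking that the children's fill
+ * counts sum to the parent's BP_GET_FILL().
+ */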
+static int
+visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
+ blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+ int err = 0;
+
+ if (bp->blk_birth == 0)
+ return (0);
+
+ print_indirect(spa, bp, zb, dnp);
+
+ if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ arc_buf_t *buf;
+ uint64_t fill = 0;
+ ASSERT(!BP_IS_REDACTED(bp));
+
+ err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err)
+ return (err);
+ ASSERT(buf->b_data);
+
+ /* recursively visit blocks below this */
+ cbp = buf->b_data;
+ for (i = 0; i < epb; i++, cbp++) {
+ zbookmark_phys_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ err = visit_indirect(spa, dnp, cbp, &czb);
+ if (err)
+ break;
+ fill += BP_GET_FILL(cbp);
+ }
+ if (!err)
+ ASSERT3U(fill, ==, BP_GET_FILL(bp));
+ arc_buf_destroy(buf, &buf);
+ }
+
+ return (err);
+}
+
+/*ARGSUSED*/
+static void
+dump_indirect(dnode_t *dn)
+{
+ dnode_phys_t *dnp = dn->dn_phys;
+ int j;
+ zbookmark_phys_t czb;
+
+ (void) printf("Indirect blocks:\n");
+
+ SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
+ dn->dn_object, dnp->dn_nlevels - 1, 0);
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ czb.zb_blkid = j;
+ (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
+ &dnp->dn_blkptr[j], &czb);
+ }
+
+ (void) printf("\n");
+}
+
+/*ARGSUSED*/
+static void
+dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ dsl_dir_phys_t *dd = data;
+ time_t crtime;
+ char nice[32];
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ);
+
+ if (dd == NULL)
+ return;
+
+ ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
+
+ crtime = dd->dd_creation_time;
+ (void) printf("\t\tcreation_time = %s", ctime(&crtime));
+ (void) printf("\t\thead_dataset_obj = %llu\n",
+ (u_longlong_t)dd->dd_head_dataset_obj);
+ (void) printf("\t\tparent_dir_obj = %llu\n",
+ (u_longlong_t)dd->dd_parent_obj);
+ (void) printf("\t\torigin_obj = %llu\n",
+ (u_longlong_t)dd->dd_origin_obj);
+ (void) printf("\t\tchild_dir_zapobj = %llu\n",
+ (u_longlong_t)dd->dd_child_dir_zapobj);
+ zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
+ (void) printf("\t\tused_bytes = %s\n", nice);
+ zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
+ (void) printf("\t\tcompressed_bytes = %s\n", nice);
+ zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
+ (void) printf("\t\tuncompressed_bytes = %s\n", nice);
+ zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
+ (void) printf("\t\tquota = %s\n", nice);
+ zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
+ (void) printf("\t\treserved = %s\n", nice);
+ (void) printf("\t\tprops_zapobj = %llu\n",
+ (u_longlong_t)dd->dd_props_zapobj);
+ (void) printf("\t\tdeleg_zapobj = %llu\n",
+ (u_longlong_t)dd->dd_deleg_zapobj);
+ (void) printf("\t\tflags = %llx\n",
+ (u_longlong_t)dd->dd_flags);
+
+#define DO(which) \
+ zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
+ sizeof (nice)); \
+ (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
+ DO(HEAD);
+ DO(SNAP);
+ DO(CHILD);
+ DO(CHILD_RSRV);
+ DO(REFRSRV);
+#undef DO
+ (void) printf("\t\tclones = %llu\n",
+ (u_longlong_t)dd->dd_clones);
+}
+
+/*ARGSUSED*/
+static void
+dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ dsl_dataset_phys_t *ds = data;
+ time_t crtime;
+ char used[32], compressed[32], uncompressed[32], unique[32];
+ char blkbuf[BP_SPRINTF_LEN];
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (used) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ);
+
+ if (ds == NULL)
+ return;
+
+ ASSERT(size == sizeof (*ds));
+ crtime = ds->ds_creation_time;
+ zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
+ zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
+ zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
+ sizeof (uncompressed));
+ zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
+ snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
+
+ (void) printf("\t\tdir_obj = %llu\n",
+ (u_longlong_t)ds->ds_dir_obj);
+ (void) printf("\t\tprev_snap_obj = %llu\n",
+ (u_longlong_t)ds->ds_prev_snap_obj);
+ (void) printf("\t\tprev_snap_txg = %llu\n",
+ (u_longlong_t)ds->ds_prev_snap_txg);
+ (void) printf("\t\tnext_snap_obj = %llu\n",
+ (u_longlong_t)ds->ds_next_snap_obj);
+ (void) printf("\t\tsnapnames_zapobj = %llu\n",
+ (u_longlong_t)ds->ds_snapnames_zapobj);
+ (void) printf("\t\tnum_children = %llu\n",
+ (u_longlong_t)ds->ds_num_children);
+ (void) printf("\t\tuserrefs_obj = %llu\n",
+ (u_longlong_t)ds->ds_userrefs_obj);
+ (void) printf("\t\tcreation_time = %s", ctime(&crtime));
+ (void) printf("\t\tcreation_txg = %llu\n",
+ (u_longlong_t)ds->ds_creation_txg);
+ (void) printf("\t\tdeadlist_obj = %llu\n",
+ (u_longlong_t)ds->ds_deadlist_obj);
+ (void) printf("\t\tused_bytes = %s\n", used);
+ (void) printf("\t\tcompressed_bytes = %s\n", compressed);
+ (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
+ (void) printf("\t\tunique = %s\n", unique);
+ (void) printf("\t\tfsid_guid = %llu\n",
+ (u_longlong_t)ds->ds_fsid_guid);
+ (void) printf("\t\tguid = %llu\n",
+ (u_longlong_t)ds->ds_guid);
+ (void) printf("\t\tflags = %llx\n",
+ (u_longlong_t)ds->ds_flags);
+ (void) printf("\t\tnext_clones_obj = %llu\n",
+ (u_longlong_t)ds->ds_next_clones_obj);
+ (void) printf("\t\tprops_obj = %llu\n",
+ (u_longlong_t)ds->ds_props_obj);
+ (void) printf("\t\tbp = %s\n", blkbuf);
+}
+
+/* ARGSUSED */
+static int
+dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ char blkbuf[BP_SPRINTF_LEN];
+
+ if (bp->blk_birth != 0) {
+ snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+ (void) printf("\t%s\n", blkbuf);
+ }
+ return (0);
+}
+
+static void
+dump_bptree(objset_t *os, uint64_t obj, const char *name)
+{
+ char bytes[32];
+ bptree_phys_t *bt;
+ dmu_buf_t *db;
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
+
+ if (dump_opt['d'] < 3)
+ return;
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+ zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
+ (void) printf("\n %s: %llu datasets, %s\n",
+ name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
+ dmu_buf_rele(db, FTAG);
+
+ if (dump_opt['d'] < 5)
+ return;
+
+ (void) printf("\n");
+
+ (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
+}
+
+/* ARGSUSED */
+static int
+dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
+{
+ char blkbuf[BP_SPRINTF_LEN];
+
+ ASSERT(bp->blk_birth != 0);
+ snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
+ (void) printf("\t%s\n", blkbuf);
+ return (0);
+}
+
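+/*
+ * Print a bpobj recursively.  A bpobj using the subobj feature keeps the
+ * object numbers of its child bpobjs in a separate uint64 array object
+ * (bpo_subobjs); each child is opened and dumped at the next indent level,
+ * and at dump_opt['d'] >= 5 the top-level bpobj's individual block pointers
+ * are printed as well.
+ */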
+static void
+dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
+{
+ char bytes[32];
+ char comp[32];
+ char uncomp[32];
+ uint64_t i;
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
+
+ if (dump_opt['d'] < 3)
+ return;
+
+ zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
+ if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+ zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
+ zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
+ if (bpo->bpo_havefreed) {
+ (void) printf(" %*s: object %llu, %llu local "
+ "blkptrs, %llu freed, %llu subobjs in object %llu, "
+ "%s (%s/%s comp)\n",
+ indent * 8, name,
+ (u_longlong_t)bpo->bpo_object,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
+ (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
+ bytes, comp, uncomp);
+ } else {
+ (void) printf(" %*s: object %llu, %llu local "
+ "blkptrs, %llu subobjs in object %llu, "
+ "%s (%s/%s comp)\n",
+ indent * 8, name,
+ (u_longlong_t)bpo->bpo_object,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
+ (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
+ bytes, comp, uncomp);
+ }
+
+ for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
+ uint64_t subobj;
+ bpobj_t subbpo;
+ int error;
+ VERIFY0(dmu_read(bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs,
+ i * sizeof (subobj), sizeof (subobj), &subobj, 0));
+ error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
+ if (error != 0) {
+ (void) printf("ERROR %u while trying to open "
+ "subobj id %llu\n",
+ error, (u_longlong_t)subobj);
+ continue;
+ }
+ dump_full_bpobj(&subbpo, "subobj", indent + 1);
+ bpobj_close(&subbpo);
+ }
+ } else {
+ if (bpo->bpo_havefreed) {
+ (void) printf(" %*s: object %llu, %llu blkptrs, "
+ "%llu freed, %s\n",
+ indent * 8, name,
+ (u_longlong_t)bpo->bpo_object,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
+ bytes);
+ } else {
+ (void) printf(" %*s: object %llu, %llu blkptrs, "
+ "%s\n",
+ indent * 8, name,
+ (u_longlong_t)bpo->bpo_object,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+ bytes);
+ }
+ }
+
+ if (dump_opt['d'] < 5)
+ return;
+
+ if (indent == 0) {
+ (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
+ (void) printf("\n");
+ }
+}
+
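+/*
+ * Print one bookmark, looked up by its "pool/fs#name" string.  With
+ * print_redact set, a redaction bookmark's progress and snapshot list are
+ * shown; with print_list, every redact_block_phys entry is dumped too.
+ */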
+static int
+dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,
+ boolean_t print_list)
+{
+ int err = 0;
+ zfs_bookmark_phys_t prop;
+ objset_t *mos = dp->dp_spa->spa_meta_objset;
+ err = dsl_bookmark_lookup(dp, name, NULL, &prop);
+
+ if (err != 0) {
+ return (err);
+ }
+
+ (void) printf("\t#%s: ", strchr(name, '#') + 1);
+ (void) printf("{guid: %llx creation_txg: %llu creation_time: "
+ "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid,
+ (u_longlong_t)prop.zbm_creation_txg,
+ (u_longlong_t)prop.zbm_creation_time,
+ (u_longlong_t)prop.zbm_redaction_obj);
+
+ IMPLY(print_list, print_redact);
+ if (!print_redact || prop.zbm_redaction_obj == 0)
+ return (0);
+
+ redaction_list_t *rl;
+ VERIFY0(dsl_redaction_list_hold_obj(dp,
+ prop.zbm_redaction_obj, FTAG, &rl));
+
+ redaction_list_phys_t *rlp = rl->rl_phys;
+ (void) printf("\tRedacted:\n\t\tProgress: ");
+ if (rlp->rlp_last_object != UINT64_MAX ||
+ rlp->rlp_last_blkid != UINT64_MAX) {
+ (void) printf("%llu %llu (incomplete)\n",
+ (u_longlong_t)rlp->rlp_last_object,
+ (u_longlong_t)rlp->rlp_last_blkid);
+ } else {
+ (void) printf("complete\n");
+ }
+ (void) printf("\t\tSnapshots: [");
+ for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) {
+ if (i > 0)
+ (void) printf(", ");
+ (void) printf("%0llu",
+ (u_longlong_t)rlp->rlp_snaps[i]);
+ }
+ (void) printf("]\n\t\tLength: %llu\n",
+ (u_longlong_t)rlp->rlp_num_entries);
+
+ if (!print_list) {
+ dsl_redaction_list_rele(rl, FTAG);
+ return (0);
+ }
+
+ if (rlp->rlp_num_entries == 0) {
+ dsl_redaction_list_rele(rl, FTAG);
+ (void) printf("\t\tRedaction List: []\n\n");
+ return (0);
+ }
+
+ redact_block_phys_t *rbp_buf;
+ uint64_t size;
+ dmu_object_info_t doi;
+
+ VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi));
+ size = doi.doi_max_offset;
+ rbp_buf = kmem_alloc(size, KM_SLEEP);
+
+ err = dmu_read(mos, prop.zbm_redaction_obj, 0, size,
+ rbp_buf, 0);
+ if (err != 0) {
+ dsl_redaction_list_rele(rl, FTAG);
+ kmem_free(rbp_buf, size);
+ return (err);
+ }
+
+ (void) printf("\t\tRedaction List: [{object: %llx, offset: "
+ "%llx, blksz: %x, count: %llx}",
+ (u_longlong_t)rbp_buf[0].rbp_object,
+ (u_longlong_t)rbp_buf[0].rbp_blkid,
+ (uint_t)(redact_block_get_size(&rbp_buf[0])),
+ (u_longlong_t)redact_block_get_count(&rbp_buf[0]));
+
+ for (size_t i = 1; i < rlp->rlp_num_entries; i++) {
+ (void) printf(",\n\t\t{object: %llx, offset: %llx, "
+ "blksz: %x, count: %llx}",
+ (u_longlong_t)rbp_buf[i].rbp_object,
+ (u_longlong_t)rbp_buf[i].rbp_blkid,
+ (uint_t)(redact_block_get_size(&rbp_buf[i])),
+ (u_longlong_t)redact_block_get_count(&rbp_buf[i]));
+ }
+ dsl_redaction_list_rele(rl, FTAG);
+ kmem_free(rbp_buf, size);
+ (void) printf("]\n\n");
+ return (0);
+}
+
+static void
+dump_bookmarks(objset_t *os, int verbosity)
+{
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ dsl_dataset_t *ds = dmu_objset_ds(os);
+ dsl_pool_t *dp = spa_get_dsl(os->os_spa);
+ objset_t *mos = os->os_spa->spa_meta_objset;
+ if (verbosity < 4)
+ return;
+ dsl_pool_config_enter(dp, FTAG);
+
+ for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ zap_cursor_advance(&zc)) {
+ char osname[ZFS_MAX_DATASET_NAME_LEN];
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ dmu_objset_name(os, osname);
+ VERIFY3S(0, <=, snprintf(buf, sizeof (buf), "%s#%s", osname,
+ attr.za_name));
+ (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6);
+ }
+ zap_cursor_fini(&zc);
+ dsl_pool_config_exit(dp, FTAG);
+}
+
+static void
+bpobj_count_refd(bpobj_t *bpo)
+{
+ mos_obj_refd(bpo->bpo_object);
+
+ if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+ mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
+ for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
+ uint64_t subobj;
+ bpobj_t subbpo;
+ int error;
+ VERIFY0(dmu_read(bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs,
+ i * sizeof (subobj), sizeof (subobj), &subobj, 0));
+ error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
+ if (error != 0) {
+ (void) printf("ERROR %u while trying to open "
+ "subobj id %llu\n",
+ error, (u_longlong_t)subobj);
+ continue;
+ }
+ bpobj_count_refd(&subbpo);
+ bpobj_close(&subbpo);
+ }
+ }
+}
+
+static int
+dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
+{
+ spa_t *spa = arg;
+ uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
+ if (dle->dle_bpobj.bpo_object != empty_bpobj)
+ bpobj_count_refd(&dle->dle_bpobj);
+ return (0);
+}
+
+static int
+dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
+{
+ ASSERT(arg == NULL);
+ if (dump_opt['d'] >= 5) {
+ char buf[128];
+ (void) snprintf(buf, sizeof (buf),
+ "mintxg %llu -> obj %llu",
+ (longlong_t)dle->dle_mintxg,
+ (longlong_t)dle->dle_bpobj.bpo_object);
+
+ dump_full_bpobj(&dle->dle_bpobj, buf, 0);
+ } else {
+ (void) printf("mintxg %llu -> obj %llu\n",
+ (longlong_t)dle->dle_mintxg,
+ (longlong_t)dle->dle_bpobj.bpo_object);
+ }
+ return (0);
+}
+
+static void
+dump_blkptr_list(dsl_deadlist_t *dl, char *name)
+{
+ char bytes[32];
+ char comp[32];
+ char uncomp[32];
+ char entries[32];
+ spa_t *spa = dmu_objset_spa(dl->dl_os);
+ uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
+
+ if (dl->dl_oldfmt) {
+ if (dl->dl_bpobj.bpo_object != empty_bpobj)
+ bpobj_count_refd(&dl->dl_bpobj);
+ } else {
+ mos_obj_refd(dl->dl_object);
+ dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
+ }
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ);
+
+ if (dump_opt['d'] < 3)
+ return;
+
+ if (dl->dl_oldfmt) {
+ dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
+ return;
+ }
+
+ zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
+ zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
+ zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
+ zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
+ (void) printf("\n %s: %s (%s/%s comp), %s entries\n",
+ name, bytes, comp, uncomp, entries);
+
+ if (dump_opt['d'] < 4)
+ return;
+
+ (void) printf("\n");
+
+ dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
+}
+
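+/*
+ * Cross-check a dsl_dir's livelist space accounting: the bytes the livelist
+ * tracks must match what dsl_dataset_space_written() reports between the
+ * clone's origin and its head dataset.  The livelist's uncompressed total is
+ * allowed to be smaller, since embedded block pointers are not tracked there.
+ */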
+static int
+verify_dd_livelist(objset_t *os)
+{
+ uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
+ dsl_pool_t *dp = spa_get_dsl(os->os_spa);
+ dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
+
+ ASSERT(!dmu_objset_is_snapshot(os));
+ if (!dsl_deadlist_is_open(&dd->dd_livelist))
+ return (0);
+
+ /* Iterate through the livelist to check for duplicates */
+ dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
+ NULL);
+
+ dsl_pool_config_enter(dp, FTAG);
+ dsl_deadlist_space(&dd->dd_livelist, &ll_used,
+ &ll_comp, &ll_uncomp);
+
+ dsl_dataset_t *origin_ds;
+ ASSERT(dsl_pool_config_held(dp));
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
+ VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
+ &used, &comp, &uncomp));
+ dsl_dataset_rele(origin_ds, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ /*
+ * It's possible that the dataset's uncomp space is larger than the
+ * livelist's because livelists do not track embedded block pointers
+ */
+ if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) {
+ char nice_used[32], nice_comp[32], nice_uncomp[32];
+ (void) printf("Discrepancy in space accounting:\n");
+ zdb_nicenum(used, nice_used, sizeof (nice_used));
+ zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
+ zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
+ (void) printf("dir: used %s, comp %s, uncomp %s\n",
+ nice_used, nice_comp, nice_uncomp);
+ zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
+ zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
+ zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
+ (void) printf("livelist: used %s, comp %s, uncomp %s\n",
+ nice_used, nice_comp, nice_uncomp);
+ return (1);
+ }
+ return (0);
+}
+
+static avl_tree_t idx_tree;
+static avl_tree_t domain_tree;
+static boolean_t fuid_table_loaded;
+static objset_t *sa_os = NULL;
+static sa_attr_type_t *sa_attr_table = NULL;
+
+static int
+open_objset(const char *path, void *tag, objset_t **osp)
+{
+ int err;
+ uint64_t sa_attrs = 0;
+ uint64_t version = 0;
+
+ VERIFY3P(sa_os, ==, NULL);
+ /*
+ * We can't own an objset if it's redacted. Therefore, we do this
+ * dance: hold the objset, then acquire a long hold on its dataset, then
+ * release the pool (which is held as part of holding the objset).
+ */
+ err = dmu_objset_hold(path, tag, osp);
+ if (err != 0) {
+ (void) fprintf(stderr, "failed to hold dataset '%s': %s\n",
+ path, strerror(err));
+ return (err);
+ }
+ dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
+ dsl_pool_rele(dmu_objset_pool(*osp), tag);
+
+ if (dmu_objset_type(*osp) == DMU_OST_ZFS && !(*osp)->os_encrypted) {
+ (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ 8, 1, &version);
+ if (version >= ZPL_VERSION_SA) {
+ (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
+ 8, 1, &sa_attrs);
+ }
+ err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
+ &sa_attr_table);
+ if (err != 0) {
+ (void) fprintf(stderr, "sa_setup failed: %s\n",
+ strerror(err));
+ dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
+ dsl_dataset_rele(dmu_objset_ds(*osp), tag);
+ *osp = NULL;
+ }
+ }
+ sa_os = *osp;
+
+ return (0);
+}
+
+static void
+close_objset(objset_t *os, void *tag)
+{
+ VERIFY3P(os, ==, sa_os);
+ if (os->os_sa != NULL)
+ sa_tear_down(os);
+ dsl_dataset_long_rele(dmu_objset_ds(os), tag);
+ dsl_dataset_rele(dmu_objset_ds(os), tag);
+ sa_attr_table = NULL;
+ sa_os = NULL;
+}
+
+static void
+fuid_table_destroy(void)
+{
+ if (fuid_table_loaded) {
+ zfs_fuid_table_destroy(&idx_tree, &domain_tree);
+ fuid_table_loaded = B_FALSE;
+ }
+}
+
+/*
+ * Print uid or gid information.
+ * For a normal POSIX id, just the id is printed in decimal.
+ * For CIFS files with a FUID, the fuid is printed in hex followed by
+ * the domain-rid string.
+ */
+static void
+print_idstr(uint64_t id, const char *id_type)
+{
+ if (FUID_INDEX(id)) {
+ char *domain;
+
+ domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
+ (void) printf("\t%s %llx [%s-%d]\n", id_type,
+ (u_longlong_t)id, domain, (int)FUID_RID(id));
+ } else {
+ (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id);
+ }
+}
+
+static void
+dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
+{
+ uint32_t uid_idx, gid_idx;
+
+ uid_idx = FUID_INDEX(uid);
+ gid_idx = FUID_INDEX(gid);
+
+ /* Load domain table, if not already loaded */
+ if (!fuid_table_loaded && (uid_idx || gid_idx)) {
+ uint64_t fuid_obj;
+
+ /* first find the fuid object. It lives in the master node */
+ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
+ 8, 1, &fuid_obj) == 0);
+ zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
+ (void) zfs_fuid_table_load(os, fuid_obj,
+ &idx_tree, &domain_tree);
+ fuid_table_loaded = B_TRUE;
+ }
+
+ print_idstr(uid, "uid");
+ print_idstr(gid, "gid");
+}
+
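+/*
+ * Print a znode's SA-based xattrs.  ZPL_DXATTR stores them as a packed
+ * nvlist of name/value pairs; values are printed byte by byte with
+ * non-printable bytes escaped in octal.
+ */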
+static void
+dump_znode_sa_xattr(sa_handle_t *hdl)
+{
+ nvlist_t *sa_xattr;
+ nvpair_t *elem = NULL;
+ int sa_xattr_size = 0;
+ int sa_xattr_entries = 0;
+ int error;
+ char *sa_xattr_packed;
+
+ error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
+ if (error || sa_xattr_size == 0)
+ return;
+
+ sa_xattr_packed = malloc(sa_xattr_size);
+ if (sa_xattr_packed == NULL)
+ return;
+
+ error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
+ sa_xattr_packed, sa_xattr_size);
+ if (error) {
+ free(sa_xattr_packed);
+ return;
+ }
+
+ error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
+ if (error) {
+ free(sa_xattr_packed);
+ return;
+ }
+
+ while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
+ sa_xattr_entries++;
+
+ (void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
+ sa_xattr_size, sa_xattr_entries);
+ while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
+ uchar_t *value;
+ uint_t cnt, idx;
+
+ (void) printf("\t\t%s = ", nvpair_name(elem));
+ nvpair_value_byte_array(elem, &value, &cnt);
+ for (idx = 0; idx < cnt; ++idx) {
+ if (isprint(value[idx]))
+ (void) putchar(value[idx]);
+ else
+ (void) printf("\\%3.3o", value[idx]);
+ }
+ (void) putchar('\n');
+ }
+
+ nvlist_free(sa_xattr);
+ free(sa_xattr_packed);
+}
+
+static void
+dump_znode_symlink(sa_handle_t *hdl)
+{
+ int sa_symlink_size = 0;
+ char linktarget[MAXPATHLEN];
+ linktarget[0] = '\0';
+ int error;
+
+ error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size);
+ if (error || sa_symlink_size == 0) {
+ return;
+ }
+ if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK],
+ &linktarget, sa_symlink_size) == 0)
+ (void) printf("\ttarget %s\n", linktarget);
+}
+
+/*ARGSUSED*/
+static void
+dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */
+ sa_handle_t *hdl;
+ uint64_t xattr, rdev, gen;
+ uint64_t uid, gid, mode, fsize, parent, links;
+ uint64_t pflags;
+ uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
+ time_t z_crtime, z_atime, z_mtime, z_ctime;
+ sa_bulk_attr_t bulk[12];
+ int idx = 0;
+ int error;
+
+ VERIFY3P(os, ==, sa_os);
+ if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
+ (void) printf("Failed to get handle for SA znode\n");
+ return;
+ }
+
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
+ &links, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
+ &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
+ &fsize, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
+ acctm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
+ modtm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
+ crtm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
+ chgtm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
+ &pflags, 8);
+
+ if (sa_bulk_lookup(hdl, bulk, idx)) {
+ (void) sa_handle_destroy(hdl);
+ return;
+ }
+
+ z_crtime = (time_t)crtm[0];
+ z_atime = (time_t)acctm[0];
+ z_mtime = (time_t)modtm[0];
+ z_ctime = (time_t)chgtm[0];
+
+ if (dump_opt['d'] > 4) {
+ error = zfs_obj_to_path(os, object, path, sizeof (path));
+ if (error == ESTALE) {
+ (void) snprintf(path, sizeof (path), "on delete queue");
+ } else if (error != 0) {
+ leaked_objects++;
+ (void) snprintf(path, sizeof (path),
+ "path not found, possibly leaked");
+ }
+ (void) printf("\tpath %s\n", path);
+ }
+
+ if (S_ISLNK(mode))
+ dump_znode_symlink(hdl);
+ dump_uidgid(os, uid, gid);
+ (void) printf("\tatime %s", ctime(&z_atime));
+ (void) printf("\tmtime %s", ctime(&z_mtime));
+ (void) printf("\tctime %s", ctime(&z_ctime));
+ (void) printf("\tcrtime %s", ctime(&z_crtime));
+ (void) printf("\tgen %llu\n", (u_longlong_t)gen);
+ (void) printf("\tmode %llo\n", (u_longlong_t)mode);
+ (void) printf("\tsize %llu\n", (u_longlong_t)fsize);
+ (void) printf("\tparent %llu\n", (u_longlong_t)parent);
+ (void) printf("\tlinks %llu\n", (u_longlong_t)links);
+ (void) printf("\tpflags %llx\n", (u_longlong_t)pflags);
+ if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {
+ uint64_t projid;
+
+ if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,
+ sizeof (uint64_t)) == 0)
+ (void) printf("\tprojid %llu\n", (u_longlong_t)projid);
+ }
+ if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
+ sizeof (uint64_t)) == 0)
+ (void) printf("\txattr %llu\n", (u_longlong_t)xattr);
+ if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
+ sizeof (uint64_t)) == 0)
+ (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);
+ dump_znode_sa_xattr(hdl);
+ sa_handle_destroy(hdl);
+}
+
+/*ARGSUSED*/
+static void
+dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*ARGSUSED*/
+static void
+dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
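+/*
+ * Dump callbacks indexed by DMU object type; ZDB_OT_TYPE() maps any type
+ * outside the table to the final dump_unknown entry.
+ */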
+static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
+ dump_none, /* unallocated */
+ dump_zap, /* object directory */
+ dump_uint64, /* object array */
+ dump_none, /* packed nvlist */
+ dump_packed_nvlist, /* packed nvlist size */
+ dump_none, /* bpobj */
+ dump_bpobj, /* bpobj header */
+ dump_none, /* SPA space map header */
+ dump_none, /* SPA space map */
+ dump_none, /* ZIL intent log */
+ dump_dnode, /* DMU dnode */
+ dump_dmu_objset, /* DMU objset */
+ dump_dsl_dir, /* DSL directory */
+ dump_zap, /* DSL directory child map */
+ dump_zap, /* DSL dataset snap map */
+ dump_zap, /* DSL props */
+ dump_dsl_dataset, /* DSL dataset */
+ dump_znode, /* ZFS znode */
+ dump_acl, /* ZFS V0 ACL */
+ dump_uint8, /* ZFS plain file */
+ dump_zpldir, /* ZFS directory */
+ dump_zap, /* ZFS master node */
+ dump_zap, /* ZFS delete queue */
+ dump_uint8, /* zvol object */
+ dump_zap, /* zvol prop */
+ dump_uint8, /* other uint8[] */
+ dump_uint64, /* other uint64[] */
+ dump_zap, /* other ZAP */
+ dump_zap, /* persistent error log */
+ dump_uint8, /* SPA history */
+ dump_history_offsets, /* SPA history offsets */
+ dump_zap, /* Pool properties */
+ dump_zap, /* DSL permissions */
+ dump_acl, /* ZFS ACL */
+ dump_uint8, /* ZFS SYSACL */
+ dump_none, /* FUID nvlist */
+ dump_packed_nvlist, /* FUID nvlist size */
+ dump_zap, /* DSL dataset next clones */
+ dump_zap, /* DSL scrub queue */
+ dump_zap, /* ZFS user/group/project used */
+ dump_zap, /* ZFS user/group/project quota */
+ dump_zap, /* snapshot refcount tags */
+ dump_ddt_zap, /* DDT ZAP object */
+ dump_zap, /* DDT statistics */
+ dump_znode, /* SA object */
+ dump_zap, /* SA Master Node */
+ dump_sa_attrs, /* SA attribute registration */
+ dump_sa_layouts, /* SA attribute layouts */
+ dump_zap, /* DSL scrub translations */
+ dump_none, /* fake dedup BP */
+ dump_zap, /* deadlist */
+ dump_none, /* deadlist hdr */
+ dump_zap, /* dsl clones */
+ dump_bpobj_subobjs, /* bpobj subobjs */
+ dump_unknown, /* Unknown type, must be last */
+};
+
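+/*
+ * Return B_TRUE if an object of the given DMU type should be dumped under
+ * the object-range flags (directory, plain file, space map, ZAP, or the
+ * all-types flag optionally combined with negations).
+ */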
+static boolean_t
+match_object_type(dmu_object_type_t obj_type, uint64_t flags)
+{
+ boolean_t match = B_TRUE;
+
+ switch (obj_type) {
+ case DMU_OT_DIRECTORY_CONTENTS:
+ if (!(flags & ZOR_FLAG_DIRECTORY))
+ match = B_FALSE;
+ break;
+ case DMU_OT_PLAIN_FILE_CONTENTS:
+ if (!(flags & ZOR_FLAG_PLAIN_FILE))
+ match = B_FALSE;
+ break;
+ case DMU_OT_SPACE_MAP:
+ if (!(flags & ZOR_FLAG_SPACE_MAP))
+ match = B_FALSE;
+ break;
+ default:
+ if (strcmp(zdb_ot_name(obj_type), "zap") == 0) {
+ if (!(flags & ZOR_FLAG_ZAP))
+ match = B_FALSE;
+ break;
+ }
+
+ /*
+ * If all bits except some of the supported flags are
+ * set, the user combined the all-types flag (A) with
+ * a negated flag to exclude some types (e.g. A-f to
+ * show all object types except plain files).
+ */
+ if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES)
+ match = B_FALSE;
+
+ break;
+ }
+
+ return (match);
+}
+
+static void
+dump_object(objset_t *os, uint64_t object, int verbosity,
+ boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags)
+{
+ dmu_buf_t *db = NULL;
+ dmu_object_info_t doi;
+ dnode_t *dn;
+ boolean_t dnode_held = B_FALSE;
+ void *bonus = NULL;
+ size_t bsize = 0;
+ char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
+ char bonus_size[32];
+ char aux[50];
+ int error;
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ);
+
+ if (*print_header) {
+ (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n",
+ "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
+ "lsize", "%full", "type");
+ *print_header = 0;
+ }
+
+ if (object == 0) {
+ dn = DMU_META_DNODE(os);
+ dmu_object_info_from_dnode(dn, &doi);
+ } else {
+ /*
+ * Encrypted datasets will have sensitive bonus buffers
+ * encrypted. Therefore we cannot hold the bonus buffer and
+ * must hold the dnode itself instead.
+ */
+ error = dmu_object_info(os, object, &doi);
+ if (error)
+ fatal("dmu_object_info() failed, errno %u", error);
+
+ if (os->os_encrypted &&
+ DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {
+ error = dnode_hold(os, object, FTAG, &dn);
+ if (error)
+ fatal("dnode_hold() failed, errno %u", error);
+ dnode_held = B_TRUE;
+ } else {
+ error = dmu_bonus_hold(os, object, FTAG, &db);
+ if (error)
+ fatal("dmu_bonus_hold(%llu) failed, errno %u",
+ object, error);
+ bonus = db->db_data;
+ bsize = db->db_size;
+ dn = DB_DNODE((dmu_buf_impl_t *)db);
+ }
+ }
+
+ /*
+ * Default to showing all object types if no flags were specified.
+ */
+ if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES &&
+ !match_object_type(doi.doi_type, flags))
+ goto out;
+
+ if (dnode_slots_used)
+ *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
+
+ zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
+ zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
+ zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
+ zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
+ zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
+ zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
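+ /*
+ * %full is the fraction of the object's logical size (doi_max_offset)
+ * that is actually filled; for the meta-dnode (object 0) the fill count
+ * is in dnodes rather than blocks, so it is divided by DNODES_PER_BLOCK.
+ */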
+ (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
+ doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
+ doi.doi_max_offset);
+
+ aux[0] = '\0';
+
+ if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
+ (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
+ " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));
+ }
+
+ if (doi.doi_compress == ZIO_COMPRESS_INHERIT &&
+ ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) {
+ const char *compname = NULL;
+ if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION,
+ ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel),
+ &compname) == 0) {
+ (void) snprintf(aux + strlen(aux),
+ sizeof (aux) - strlen(aux), " (Z=inherit=%s)",
+ compname);
+ } else {
+ (void) snprintf(aux + strlen(aux),
+ sizeof (aux) - strlen(aux),
+ " (Z=inherit=%s-unknown)",
+ ZDB_COMPRESS_NAME(os->os_compress));
+ }
+ } else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) {
+ (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
+ " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress));
+ } else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
+ (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
+ " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));
+ }
+
+ (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n",
+ (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
+ asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux);
+
+ if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
+ (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n",
+ "", "", "", "", "", "", bonus_size, "bonus",
+ zdb_ot_name(doi.doi_bonus_type));
+ }
+
+ if (verbosity >= 4) {
+ (void) printf("\tdnode flags: %s%s%s%s\n",
+ (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
+ "USED_BYTES " : "",
+ (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
+ "USERUSED_ACCOUNTED " : "",
+ (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ?
+ "USEROBJUSED_ACCOUNTED " : "",
+ (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
+ "SPILL_BLKPTR" : "");
+ (void) printf("\tdnode maxblkid: %llu\n",
+ (longlong_t)dn->dn_phys->dn_maxblkid);
+
+ if (!dnode_held) {
+ object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os,
+ object, bonus, bsize);
+ } else {
+ (void) printf("\t\t(bonus encrypted)\n");
+ }
+
+ if (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type)) {
+ object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object,
+ NULL, 0);
+ } else {
+ (void) printf("\t\t(object encrypted)\n");
+ }
+
+ *print_header = B_TRUE;
+ }
+
+ if (verbosity >= 5)
+ dump_indirect(dn);
+
+ if (verbosity >= 5) {
+ /*
+ * Report the list of segments that comprise the object.
+ */
+ uint64_t start = 0;
+ uint64_t end;
+ uint64_t blkfill = 1;
+ int minlvl = 1;
+
+ if (dn->dn_type == DMU_OT_DNODE) {
+ minlvl = 0;
+ blkfill = DNODES_PER_BLOCK;
+ }
+
+ for (;;) {
+ char segsize[32];
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ);
+ error = dnode_next_offset(dn,
+ 0, &start, minlvl, blkfill, 0);
+ if (error)
+ break;
+ end = start;
+ error = dnode_next_offset(dn,
+ DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
+ zdb_nicenum(end - start, segsize, sizeof (segsize));
+ (void) printf("\t\tsegment [%016llx, %016llx)"
+ " size %5s\n", (u_longlong_t)start,
+ (u_longlong_t)end, segsize);
+ if (error)
+ break;
+ start = end;
+ }
+ }
+
+out:
+ if (db != NULL)
+ dmu_buf_rele(db, FTAG);
+ if (dnode_held)
+ dnode_rele(dn, FTAG);
+}
+
+static void
+count_dir_mos_objects(dsl_dir_t *dd)
+{
+ mos_obj_refd(dd->dd_object);
+ mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj);
+ mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj);
+ mos_obj_refd(dsl_dir_phys(dd)->dd_clones);
+
+ /*
+ * The dd_crypto_obj can be referenced by multiple dsl_dir's.
+ * Ignore the references after the first one.
+ */
+ mos_obj_refd_multiple(dd->dd_crypto_obj);
+}
+
+static void
+count_ds_mos_objects(dsl_dataset_t *ds)
+{
+ mos_obj_refd(ds->ds_object);
+ mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj);
+ mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj);
+ mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj);
+ mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj);
+ mos_obj_refd(ds->ds_bookmarks_obj);
+
+ if (!dsl_dataset_is_snapshot(ds)) {
+ count_dir_mos_objects(ds->ds_dir);
+ }
+}
+
+static const char *objset_types[DMU_OST_NUMTYPES] = {
+ "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
+
+/*
+ * Parse a string denoting a range of object IDs of the form
+ * <start>[:<end>[:flags]], and store the results in zor.
+ * Return 0 on success. On error, return 1 and update the msg
+ * pointer to point to a descriptive error message.
+ */
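+/*
+ * For example (assuming the flag letters registered in flagbits), a range
+ * of "0:100000:A-f" selects objects 0 through 100000 of every type except
+ * plain files.
+ */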
+static int
+parse_object_range(char *range, zopt_object_range_t *zor, char **msg)
+{
+ uint64_t flags = 0;
+ char *p, *s, *dup, *flagstr;
+ size_t len;
+ int i;
+ int rc = 0;
+
+ if (strchr(range, ':') == NULL) {
+ zor->zor_obj_start = strtoull(range, &p, 0);
+ if (*p != '\0') {
+ *msg = "Invalid characters in object ID";
+ rc = 1;
+ }
+ zor->zor_obj_end = zor->zor_obj_start;
+ return (rc);
+ }
+
+ if (strchr(range, ':') == range) {
+ *msg = "Invalid leading colon";
+ rc = 1;
+ return (rc);
+ }
+
+ len = strlen(range);
+ if (range[len - 1] == ':') {
+ *msg = "Invalid trailing colon";
+ rc = 1;
+ return (rc);
+ }
+
+ dup = strdup(range);
+ s = strtok(dup, ":");
+ zor->zor_obj_start = strtoull(s, &p, 0);
+
+ if (*p != '\0') {
+ *msg = "Invalid characters in start object ID";
+ rc = 1;
+ goto out;
+ }
+
+ s = strtok(NULL, ":");
+ zor->zor_obj_end = strtoull(s, &p, 0);
+
+ if (*p != '\0') {
+ *msg = "Invalid characters in end object ID";
+ rc = 1;
+ goto out;
+ }
+
+ if (zor->zor_obj_start > zor->zor_obj_end) {
+ *msg = "Start object ID may not exceed end object ID";
+ rc = 1;
+ goto out;
+ }
+
+ s = strtok(NULL, ":");
+ if (s == NULL) {
+ zor->zor_flags = ZOR_FLAG_ALL_TYPES;
+ goto out;
+ } else if (strtok(NULL, ":") != NULL) {
+ *msg = "Invalid colon-delimited field after flags";
+ rc = 1;
+ goto out;
+ }
+
+ flagstr = s;
+ for (i = 0; flagstr[i]; i++) {
+ int bit;
+ boolean_t negation = (flagstr[i] == '-');
+
+ if (negation) {
+ i++;
+ if (flagstr[i] == '\0') {
+ *msg = "Invalid trailing negation operator";
+ rc = 1;
+ goto out;
+ }
+ }
+ bit = flagbits[(uchar_t)flagstr[i]];
+ if (bit == 0) {
+ *msg = "Invalid flag";
+ rc = 1;
+ goto out;
+ }
+ if (negation)
+ flags &= ~bit;
+ else
+ flags |= bit;
+ }
+ zor->zor_flags = flags;
+
+out:
+ free(dup);
+ return (rc);
+}
+
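+/*
+ * Dump one dataset: its summary line, then either the explicitly requested
+ * object ranges or, as verbosity increases, the ZIL, deadlists/livelists,
+ * bookmarks, and every object in the objset, finishing with dnode slot
+ * usage statistics.
+ */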
+static void
+dump_objset(objset_t *os)
+{
+ dmu_objset_stats_t dds = { 0 };
+ uint64_t object, object_count;
+ uint64_t refdbytes, usedobjs, scratch;
+ char numbuf[32];
+ char blkbuf[BP_SPRINTF_LEN + 20];
+ char osname[ZFS_MAX_DATASET_NAME_LEN];
+ const char *type = "UNKNOWN";
+ int verbosity = dump_opt['d'];
+ boolean_t print_header;
+ unsigned i;
+ int error;
+ uint64_t total_slots_used = 0;
+ uint64_t max_slot_used = 0;
+ uint64_t dnode_slots;
+ uint64_t obj_start;
+ uint64_t obj_end;
+ uint64_t flags;
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ);
+
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+ dmu_objset_fast_stat(os, &dds);
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+
+ print_header = B_TRUE;
+
+ if (dds.dds_type < DMU_OST_NUMTYPES)
+ type = objset_types[dds.dds_type];
+
+ if (dds.dds_type == DMU_OST_META) {
+ dds.dds_creation_txg = TXG_INITIAL;
+ usedobjs = BP_GET_FILL(os->os_rootbp);
+ refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
+ dd_used_bytes;
+ } else {
+ dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
+ }
+
+ ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
+
+ zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));
+
+ if (verbosity >= 4) {
+ (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
+ (void) snprintf_blkptr(blkbuf + strlen(blkbuf),
+ sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
+ } else {
+ blkbuf[0] = '\0';
+ }
+
+ dmu_objset_name(os, osname);
+
+ (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
+ "%s, %llu objects%s%s\n",
+ osname, type, (u_longlong_t)dmu_objset_id(os),
+ (u_longlong_t)dds.dds_creation_txg,
+ numbuf, (u_longlong_t)usedobjs, blkbuf,
+ (dds.dds_inconsistent) ? " (inconsistent)" : "");
+
+ for (i = 0; i < zopt_object_args; i++) {
+ obj_start = zopt_object_ranges[i].zor_obj_start;
+ obj_end = zopt_object_ranges[i].zor_obj_end;
+ flags = zopt_object_ranges[i].zor_flags;
+
+ object = obj_start;
+ if (object == 0 || obj_start == obj_end)
+ dump_object(os, object, verbosity, &print_header, NULL,
+ flags);
+ else
+ object--;
+
+ while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) &&
+ object <= obj_end) {
+ dump_object(os, object, verbosity, &print_header, NULL,
+ flags);
+ }
+ }
+
+ if (zopt_object_args > 0) {
+ (void) printf("\n");
+ return;
+ }
+
+ if (dump_opt['i'] != 0 || verbosity >= 2)
+ dump_intent_log(dmu_objset_zil(os));
+
+ if (dmu_objset_ds(os) != NULL) {
+ dsl_dataset_t *ds = dmu_objset_ds(os);
+ dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+ !dmu_objset_is_snapshot(os)) {
+ dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist");
+ if (verify_dd_livelist(os) != 0)
+ fatal("livelist is incorrect");
+ }
+
+ if (dsl_dataset_remap_deadlist_exists(ds)) {
+ (void) printf("ds_remap_deadlist:\n");
+ dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist");
+ }
+ count_ds_mos_objects(ds);
+ }
+
+ if (dmu_objset_ds(os) != NULL)
+ dump_bookmarks(os, verbosity);
+
+ if (verbosity < 2)
+ return;
+
+ if (BP_IS_HOLE(os->os_rootbp))
+ return;
+
+ dump_object(os, 0, verbosity, &print_header, NULL, 0);
+ object_count = 0;
+ if (DMU_USERUSED_DNODE(os) != NULL &&
+ DMU_USERUSED_DNODE(os)->dn_type != 0) {
+ dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
+ NULL, 0);
+ dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
+ NULL, 0);
+ }
+
+ if (DMU_PROJECTUSED_DNODE(os) != NULL &&
+ DMU_PROJECTUSED_DNODE(os)->dn_type != 0)
+ dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity,
+ &print_header, NULL, 0);
+
+ object = 0;
+ while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
+ dump_object(os, object, verbosity, &print_header, &dnode_slots,
+ 0);
+ object_count++;
+ total_slots_used += dnode_slots;
+ max_slot_used = object + dnode_slots - 1;
+ }
+
+ (void) printf("\n");
+
+ (void) printf(" Dnode slots:\n");
+ (void) printf("\tTotal used: %10llu\n",
+ (u_longlong_t)total_slots_used);
+ (void) printf("\tMax used: %10llu\n",
+ (u_longlong_t)max_slot_used);
+ (void) printf("\tPercent empty: %10lf\n",
+ (double)(max_slot_used - total_slots_used)*100 /
+ (double)max_slot_used);
+ (void) printf("\n");
+
+ if (error != ESRCH) {
+ (void) fprintf(stderr, "dmu_object_next() = %d\n", error);
+ abort();
+ }
+
+ ASSERT3U(object_count, ==, usedobjs);
+
+ if (leaked_objects != 0) {
+ (void) printf("%d potentially leaked objects detected\n",
+ leaked_objects);
+ leaked_objects = 0;
+ }
+}
+
+static void
+dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
+{
+ time_t timestamp = ub->ub_timestamp;
+
+ (void) printf("%s", header ? header : "");
+ (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
+ (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
+ (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
+ (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
+ (void) printf("\ttimestamp = %llu UTC = %s",
+ (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
+
+ (void) printf("\tmmp_magic = %016llx\n",
+ (u_longlong_t)ub->ub_mmp_magic);
+ if (MMP_VALID(ub)) {
+ (void) printf("\tmmp_delay = %0llu\n",
+ (u_longlong_t)ub->ub_mmp_delay);
+ if (MMP_SEQ_VALID(ub))
+ (void) printf("\tmmp_seq = %u\n",
+ (unsigned int) MMP_SEQ(ub));
+ if (MMP_FAIL_INT_VALID(ub))
+ (void) printf("\tmmp_fail = %u\n",
+ (unsigned int) MMP_FAIL_INT(ub));
+ if (MMP_INTERVAL_VALID(ub))
+ (void) printf("\tmmp_write = %u\n",
+ (unsigned int) MMP_INTERVAL(ub));
+ /* After MMP_* to make summarize_uberblock_mmp cleaner */
+ (void) printf("\tmmp_valid = %x\n",
+ (unsigned int) ub->ub_mmp_config & 0xFF);
+ }
+
+ if (dump_opt['u'] >= 4) {
+ char blkbuf[BP_SPRINTF_LEN];
+ snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
+ (void) printf("\trootbp = %s\n", blkbuf);
+ }
+ (void) printf("\tcheckpoint_txg = %llu\n",
+ (u_longlong_t)ub->ub_checkpoint_txg);
+ (void) printf("%s", footer ? footer : "");
+}
+
+static void
+dump_config(spa_t *spa)
+{
+ dmu_buf_t *db;
+ size_t nvsize = 0;
+ int error = 0;
+
+ error = dmu_bonus_hold(spa->spa_meta_objset,
+ spa->spa_config_object, FTAG, &db);
+
+ if (error == 0) {
+ nvsize = *(uint64_t *)db->db_data;
+ dmu_buf_rele(db, FTAG);
+
+ (void) printf("\nMOS Configuration:\n");
+ dump_packed_nvlist(spa->spa_meta_objset,
+ spa->spa_config_object, (void *)&nvsize, 1);
+ } else {
+ (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
+ (u_longlong_t)spa->spa_config_object, error);
+ }
+}
+
+static void
+dump_cachefile(const char *cachefile)
+{
+ int fd;
+ struct stat64 statbuf;
+ char *buf;
+ nvlist_t *config;
+
+ if ((fd = open64(cachefile, O_RDONLY)) < 0) {
+ (void) printf("cannot open '%s': %s\n", cachefile,
+ strerror(errno));
+ exit(1);
+ }
+
+ if (fstat64(fd, &statbuf) != 0) {
+ (void) printf("failed to stat '%s': %s\n", cachefile,
+ strerror(errno));
+ exit(1);
+ }
+
+ if ((buf = malloc(statbuf.st_size)) == NULL) {
+ (void) fprintf(stderr, "failed to allocate %llu bytes\n",
+ (u_longlong_t)statbuf.st_size);
+ exit(1);
+ }
+
+ if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
+ (void) fprintf(stderr, "failed to read %llu bytes\n",
+ (u_longlong_t)statbuf.st_size);
+ exit(1);
+ }
+
+ (void) close(fd);
+
+ if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
+ (void) fprintf(stderr, "failed to unpack nvlist\n");
+ exit(1);
+ }
+
+ free(buf);
+
+ dump_nvlist(config, 0);
+
+ nvlist_free(config);
+}
+
+/*
+ * ZFS label nvlist stats
+ */
+typedef struct zdb_nvl_stats {
+ int zns_list_count;
+ int zns_leaf_count;
+ size_t zns_leaf_largest;
+ size_t zns_leaf_total;
+ nvlist_t *zns_string;
+ nvlist_t *zns_uint64;
+ nvlist_t *zns_boolean;
+} zdb_nvl_stats_t;
+
+static void
+collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats)
+{
+ nvlist_t *list, **array;
+ nvpair_t *nvp = NULL;
+ char *name;
+ uint_t i, items;
+
+ stats->zns_list_count++;
+
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ name = nvpair_name(nvp);
+
+ switch (nvpair_type(nvp)) {
+ case DATA_TYPE_STRING:
+ fnvlist_add_string(stats->zns_string, name,
+ fnvpair_value_string(nvp));
+ break;
+ case DATA_TYPE_UINT64:
+ fnvlist_add_uint64(stats->zns_uint64, name,
+ fnvpair_value_uint64(nvp));
+ break;
+ case DATA_TYPE_BOOLEAN:
+ fnvlist_add_boolean(stats->zns_boolean, name);
+ break;
+ case DATA_TYPE_NVLIST:
+ if (nvpair_value_nvlist(nvp, &list) == 0)
+ collect_nvlist_stats(list, stats);
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ if (nvpair_value_nvlist_array(nvp, &array, &items) != 0)
+ break;
+
+ for (i = 0; i < items; i++) {
+ collect_nvlist_stats(array[i], stats);
+
+ /* collect stats on leaf vdev */
+ if (strcmp(name, "children") == 0) {
+ size_t size;
+
+ (void) nvlist_size(array[i], &size,
+ NV_ENCODE_XDR);
+ stats->zns_leaf_total += size;
+ if (size > stats->zns_leaf_largest)
+ stats->zns_leaf_largest = size;
+ stats->zns_leaf_count++;
+ }
+ }
+ break;
+ default:
+ (void) printf("skip type %d!\n", (int)nvpair_type(nvp));
+ }
+ }
+}
+
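+/*
+ * Summarize how the label's nvlist space is spent.  The config is walked
+ * recursively, each pair is re-added to a per-type scratch nvlist, and the
+ * XDR-encoded size of each scratch list (minus the "noise" of an empty
+ * nvlist) gives the bytes used by integers, strings and booleans; whatever
+ * is left over is attributed to nvlist framing overhead.
+ */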
+static void
+dump_nvlist_stats(nvlist_t *nvl, size_t cap)
+{
+ zdb_nvl_stats_t stats = { 0 };
+ size_t size, sum = 0, total;
+ size_t noise;
+
+ /* requires nvlist with non-unique names for stat collection */
+ VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0));
+ VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0));
+ VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0));
+ VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR));
+
+ (void) printf("\n\nZFS Label NVList Config Stats:\n");
+
+ VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR));
+ (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n",
+ (int)total, (int)(cap - total), 100.0 * total / cap);
+
+ collect_nvlist_stats(nvl, &stats);
+
+ VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR));
+ size -= noise;
+ sum += size;
+ (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:",
+ (int)fnvlist_num_pairs(stats.zns_uint64),
+ (int)size, 100.0 * size / total);
+
+ VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR));
+ size -= noise;
+ sum += size;
+ (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:",
+ (int)fnvlist_num_pairs(stats.zns_string),
+ (int)size, 100.0 * size / total);
+
+ VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR));
+ size -= noise;
+ sum += size;
+ (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:",
+ (int)fnvlist_num_pairs(stats.zns_boolean),
+ (int)size, 100.0 * size / total);
+
+ size = total - sum; /* treat remainder as nvlist overhead */
+ (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:",
+ stats.zns_list_count, (int)size, 100.0 * size / total);
+
+ if (stats.zns_leaf_count > 0) {
+ size_t average = stats.zns_leaf_total / stats.zns_leaf_count;
+
+ (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:",
+ stats.zns_leaf_count, (int)average);
+ (void) printf("%24d bytes largest\n",
+ (int)stats.zns_leaf_largest);
+
+ if (dump_opt['l'] >= 3 && average > 0)
+ (void) printf(" space for %d additional leaf vdevs\n",
+ (int)((cap - total) / average));
+ }
+ (void) printf("\n");
+
+ nvlist_free(stats.zns_string);
+ nvlist_free(stats.zns_uint64);
+ nvlist_free(stats.zns_boolean);
+}
+
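+/*
+ * Label contents are deduplicated by checksum: each unique config or
+ * uberblock is kept once in an AVL tree along with a bitmap of the labels
+ * (0-3) it appeared in, so identical copies are printed a single time with
+ * a "labels = 0 1 2 3" style annotation.
+ */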
+typedef struct cksum_record {
+ zio_cksum_t cksum;
+ boolean_t labels[VDEV_LABELS];
+ avl_node_t link;
+} cksum_record_t;
+
+static int
+cksum_record_compare(const void *x1, const void *x2)
+{
+ const cksum_record_t *l = (cksum_record_t *)x1;
+ const cksum_record_t *r = (cksum_record_t *)x2;
+ int arraysize = ARRAY_SIZE(l->cksum.zc_word);
+ int difference;
+
+ for (int i = 0; i < arraysize; i++) {
+ difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]);
+ if (difference)
+ break;
+ }
+
+ return (difference);
+}
+
+static cksum_record_t *
+cksum_record_alloc(zio_cksum_t *cksum, int l)
+{
+ cksum_record_t *rec;
+
+ rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL);
+ rec->cksum = *cksum;
+ rec->labels[l] = B_TRUE;
+
+ return (rec);
+}
+
+static cksum_record_t *
+cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum)
+{
+ cksum_record_t lookup = { .cksum = *cksum };
+ avl_index_t where;
+
+ return (avl_find(tree, &lookup, &where));
+}
+
+static cksum_record_t *
+cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l)
+{
+ cksum_record_t *rec;
+
+ rec = cksum_record_lookup(tree, cksum);
+ if (rec) {
+ rec->labels[l] = B_TRUE;
+ } else {
+ rec = cksum_record_alloc(cksum, l);
+ avl_add(tree, rec);
+ }
+
+ return (rec);
+}
+
+static int
+first_label(cksum_record_t *rec)
+{
+ for (int i = 0; i < VDEV_LABELS; i++)
+ if (rec->labels[i])
+ return (i);
+
+ return (-1);
+}
+
+static void
+print_label_numbers(char *prefix, cksum_record_t *rec)
+{
+ printf("%s", prefix);
+ for (int i = 0; i < VDEV_LABELS; i++)
+ if (rec->labels[i] == B_TRUE)
+ printf("%d ", i);
+ printf("\n");
+}
+
+#define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT)
+
+typedef struct zdb_label {
+ vdev_label_t label;
+ nvlist_t *config_nv;
+ cksum_record_t *config;
+ cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT];
+ boolean_t header_printed;
+ boolean_t read_failed;
+} zdb_label_t;
+
+static void
+print_label_header(zdb_label_t *label, int l)
+{
+ if (dump_opt['q'])
+ return;
+
+ if (label->header_printed == B_TRUE)
+ return;
+
+ (void) printf("------------------------------------\n");
+ (void) printf("LABEL %d\n", l);
+ (void) printf("------------------------------------\n");
+
+ label->header_printed = B_TRUE;
+}
+
+static void
+print_l2arc_header(void)
+{
+ (void) printf("------------------------------------\n");
+ (void) printf("L2ARC device header\n");
+ (void) printf("------------------------------------\n");
+}
+
+static void
+print_l2arc_log_blocks(void)
+{
+ (void) printf("------------------------------------\n");
+ (void) printf("L2ARC device log blocks\n");
+ (void) printf("------------------------------------\n");
+}
+
+static void
+dump_l2arc_log_entries(uint64_t log_entries,
+ l2arc_log_ent_phys_t *le, uint64_t i)
+{
+ for (int j = 0; j < log_entries; j++) {
+ dva_t dva = le[j].le_dva;
+ (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, "
+ "vdev: %llu, offset: %llu\n",
+ (u_longlong_t)i, j + 1,
+ (u_longlong_t)DVA_GET_ASIZE(&dva),
+ (u_longlong_t)DVA_GET_VDEV(&dva),
+ (u_longlong_t)DVA_GET_OFFSET(&dva));
+ (void) printf("|\t\t\t\tbirth: %llu\n",
+ (u_longlong_t)le[j].le_birth);
+ (void) printf("|\t\t\t\tlsize: %llu\n",
+ (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop));
+ (void) printf("|\t\t\t\tpsize: %llu\n",
+ (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop));
+ (void) printf("|\t\t\t\tcompr: %llu\n",
+ (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop));
+ (void) printf("|\t\t\t\tcomplevel: %llu\n",
+ (u_longlong_t)(&le[j])->le_complevel);
+ (void) printf("|\t\t\t\ttype: %llu\n",
+ (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop));
+ (void) printf("|\t\t\t\tprotected: %llu\n",
+ (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop));
+ (void) printf("|\t\t\t\tprefetch: %llu\n",
+ (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop));
+ (void) printf("|\t\t\t\taddress: %llu\n",
+ (u_longlong_t)le[j].le_daddr);
+ (void) printf("|\n");
+ }
+ (void) printf("\n");
+}
+
+static void
+dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps)
+{
+ (void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps.lbp_daddr);
+ (void) printf("|\t\tpayload_asize: %llu\n",
+ (u_longlong_t)lbps.lbp_payload_asize);
+ (void) printf("|\t\tpayload_start: %llu\n",
+ (u_longlong_t)lbps.lbp_payload_start);
+ (void) printf("|\t\tlsize: %llu\n",
+ (u_longlong_t)L2BLK_GET_LSIZE((&lbps)->lbp_prop));
+ (void) printf("|\t\tasize: %llu\n",
+ (u_longlong_t)L2BLK_GET_PSIZE((&lbps)->lbp_prop));
+ (void) printf("|\t\tcompralgo: %llu\n",
+ (u_longlong_t)L2BLK_GET_COMPRESS((&lbps)->lbp_prop));
+ (void) printf("|\t\tcksumalgo: %llu\n",
+ (u_longlong_t)L2BLK_GET_CHECKSUM((&lbps)->lbp_prop));
+ (void) printf("|\n\n");
+}
+
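+/*
+ * Walk the L2ARC log block chain much as l2arc_rebuild() would: start from
+ * the two most recent log block pointers in the device header and follow
+ * each block's lb_prev_lbp backwards, verifying the fletcher-4 checksum and
+ * decompressing where needed, while accumulating a rebuilt block count and
+ * asize to compare against the header's own counters.
+ */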
+static void
+dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr,
+ l2arc_dev_hdr_phys_t *rebuild)
+{
+ l2arc_log_blk_phys_t this_lb;
+ uint64_t asize;
+ l2arc_log_blkptr_t lbps[2];
+ abd_t *abd;
+ zio_cksum_t cksum;
+ int failed = 0;
+ l2arc_dev_t dev;
+
+ if (!dump_opt['q'])
+ print_l2arc_log_blocks();
+ bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps));
+
+ dev.l2ad_evict = l2dhdr.dh_evict;
+ dev.l2ad_start = l2dhdr.dh_start;
+ dev.l2ad_end = l2dhdr.dh_end;
+
+ if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) {
+ /* no log blocks to read */
+ if (!dump_opt['q']) {
+ (void) printf("No log blocks to read\n");
+ (void) printf("\n");
+ }
+ return;
+ } else {
+ dev.l2ad_hand = lbps[0].lbp_daddr +
+ L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
+ }
+
+ dev.l2ad_first = !!(l2dhdr.dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
+
+ for (;;) {
+ if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))
+ break;
+
+ /* L2BLK_GET_PSIZE returns aligned size for log blocks */
+ asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
+ if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) {
+ if (!dump_opt['q']) {
+ (void) printf("Error while reading next log "
+ "block\n\n");
+ }
+ break;
+ }
+
+ fletcher_4_native_varsize(&this_lb, asize, &cksum);
+ if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {
+ failed++;
+ if (!dump_opt['q']) {
+ (void) printf("Invalid cksum\n");
+ dump_l2arc_log_blkptr(lbps[0]);
+ }
+ break;
+ }
+
+ switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
+ case ZIO_COMPRESS_OFF:
+ break;
+ default:
+ abd = abd_alloc_for_io(asize, B_TRUE);
+ abd_copy_from_buf_off(abd, &this_lb, 0, asize);
+ zio_decompress_data(L2BLK_GET_COMPRESS(
+ (&lbps[0])->lbp_prop), abd, &this_lb,
+ asize, sizeof (this_lb), NULL);
+ abd_free(abd);
+ break;
+ }
+
+ if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
+ byteswap_uint64_array(&this_lb, sizeof (this_lb));
+ if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {
+ if (!dump_opt['q'])
+ (void) printf("Invalid log block magic\n\n");
+ break;
+ }
+
+ rebuild->dh_lb_count++;
+ rebuild->dh_lb_asize += asize;
+ if (dump_opt['l'] > 1 && !dump_opt['q']) {
+ (void) printf("lb[%4llu]\tmagic: %llu\n",
+ (u_longlong_t)rebuild->dh_lb_count,
+ (u_longlong_t)this_lb.lb_magic);
+ dump_l2arc_log_blkptr(lbps[0]);
+ }
+
+ if (dump_opt['l'] > 2 && !dump_opt['q'])
+ dump_l2arc_log_entries(l2dhdr.dh_log_entries,
+ this_lb.lb_entries,
+ rebuild->dh_lb_count);
+
+ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
+ lbps[0].lbp_payload_start, dev.l2ad_evict) &&
+ !dev.l2ad_first)
+ break;
+
+ lbps[0] = lbps[1];
+ lbps[1] = this_lb.lb_prev_lbp;
+ }
+
+ if (!dump_opt['q']) {
+ (void) printf("log_blk_count:\t %llu with valid cksum\n",
+ (u_longlong_t)rebuild->dh_lb_count);
+ (void) printf("\t\t %d with invalid cksum\n", failed);
+ (void) printf("log_blk_asize:\t %llu\n\n",
+ (u_longlong_t)rebuild->dh_lb_asize);
+ }
+}
+
+static int
+dump_l2arc_header(int fd)
+{
+ l2arc_dev_hdr_phys_t l2dhdr, rebuild;
+ int error = B_FALSE;
+
+ bzero(&l2dhdr, sizeof (l2dhdr));
+ bzero(&rebuild, sizeof (rebuild));
+
+ if (pread64(fd, &l2dhdr, sizeof (l2dhdr),
+ VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) {
+ error = B_TRUE;
+ } else {
+ if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
+ byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr));
+
+ if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC)
+ error = B_TRUE;
+ }
+
+ if (error) {
+ (void) printf("L2ARC device header not found\n\n");
+ /* Do not return an error here for backward compatibility */
+ return (0);
+ } else if (!dump_opt['q']) {
+ print_l2arc_header();
+
+ (void) printf(" magic: %llu\n",
+ (u_longlong_t)l2dhdr.dh_magic);
+ (void) printf(" version: %llu\n",
+ (u_longlong_t)l2dhdr.dh_version);
+ (void) printf(" pool_guid: %llu\n",
+ (u_longlong_t)l2dhdr.dh_spa_guid);
+ (void) printf(" flags: %llu\n",
+ (u_longlong_t)l2dhdr.dh_flags);
+ (void) printf(" start_lbps[0]: %llu\n",
+ (u_longlong_t)
+ l2dhdr.dh_start_lbps[0].lbp_daddr);
+ (void) printf(" start_lbps[1]: %llu\n",
+ (u_longlong_t)
+ l2dhdr.dh_start_lbps[1].lbp_daddr);
+ (void) printf(" log_blk_ent: %llu\n",
+ (u_longlong_t)l2dhdr.dh_log_entries);
+ (void) printf(" start: %llu\n",
+ (u_longlong_t)l2dhdr.dh_start);
+ (void) printf(" end: %llu\n",
+ (u_longlong_t)l2dhdr.dh_end);
+ (void) printf(" evict: %llu\n",
+ (u_longlong_t)l2dhdr.dh_evict);
+ (void) printf(" lb_asize_refcount: %llu\n",
+ (u_longlong_t)l2dhdr.dh_lb_asize);
+ (void) printf(" lb_count_refcount: %llu\n",
+ (u_longlong_t)l2dhdr.dh_lb_count);
+ (void) printf(" trim_action_time: %llu\n",
+ (u_longlong_t)l2dhdr.dh_trim_action_time);
+ (void) printf(" trim_state: %llu\n\n",
+ (u_longlong_t)l2dhdr.dh_trim_state);
+ }
+
+ dump_l2arc_log_blocks(fd, l2dhdr, &rebuild);
+ /*
+ * The total aligned size of log blocks and the number of log blocks
+ * reported in the header of the device may be less than what zdb
+ * reports by dump_l2arc_log_blocks(), which emulates l2arc_rebuild().
+ * This happens because dump_l2arc_log_blocks() lacks the memory
+ * pressure valve that l2arc_rebuild() has. Thus, if we are on a system
+ * with low memory, l2arc_rebuild() will exit prematurely and dh_lb_asize
+ * and dh_lb_count will be lower to begin with than what exists on the
+ * device. This is normal and zdb should not exit with an error. The
+ * opposite case should never happen though: the values reported in the
+ * header should never be higher than what dump_l2arc_log_blocks() and
+ * l2arc_rebuild() report. If that happens, there is a leak in the
+ * accounting of log blocks.
+ */
+ if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize ||
+ l2dhdr.dh_lb_count > rebuild.dh_lb_count)
+ return (1);
+
+ return (0);
+}
+
+static void
+dump_config_from_label(zdb_label_t *label, size_t buflen, int l)
+{
+ if (dump_opt['q'])
+ return;
+
+ if ((dump_opt['l'] < 3) && (first_label(label->config) != l))
+ return;
+
+ print_label_header(label, l);
+ dump_nvlist(label->config_nv, 4);
+ print_label_numbers(" labels = ", label->config);
+
+ if (dump_opt['l'] >= 2)
+ dump_nvlist_stats(label->config_nv, buflen);
+}
+
+#define ZDB_MAX_UB_HEADER_SIZE 32
+
+static void
+dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num)
+{
+
+ vdev_t vd;
+ char header[ZDB_MAX_UB_HEADER_SIZE];
+
+ vd.vdev_ashift = ashift;
+ vd.vdev_top = &vd;
+
+ for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
+ uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
+ uberblock_t *ub = (void *)((char *)&label->label + uoff);
+ cksum_record_t *rec = label->uberblocks[i];
+
+ if (rec == NULL) {
+ if (dump_opt['u'] >= 2) {
+ print_label_header(label, label_num);
+ (void) printf(" Uberblock[%d] invalid\n", i);
+ }
+ continue;
+ }
+
+ if ((dump_opt['u'] < 3) && (first_label(rec) != label_num))
+ continue;
+
+ if ((dump_opt['u'] < 4) &&
+ (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay &&
+ (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL))
+ continue;
+
+ print_label_header(label, label_num);
+ (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
+ " Uberblock[%d]\n", i);
+ dump_uberblock(ub, header, "");
+ print_label_numbers(" labels = ", rec);
+ }
+}
+
+static char curpath[PATH_MAX];
+
+/*
+ * Iterate through the path components, recursively passing
+ * current one's obj and remaining path until we find the obj
+ * for the last one.
+ */
+static int
+dump_path_impl(objset_t *os, uint64_t obj, char *name)
+{
+ int err;
+ boolean_t header = B_TRUE;
+ uint64_t child_obj;
+ char *s;
+ dmu_buf_t *db;
+ dmu_object_info_t doi;
+
+ if ((s = strchr(name, '/')) != NULL)
+ *s = '\0';
+ err = zap_lookup(os, obj, name, 8, 1, &child_obj);
+
+ (void) strlcat(curpath, name, sizeof (curpath));
+
+ if (err != 0) {
+ (void) fprintf(stderr, "failed to lookup %s: %s\n",
+ curpath, strerror(err));
+ return (err);
+ }
+
+ child_obj = ZFS_DIRENT_OBJ(child_obj);
+ err = sa_buf_hold(os, child_obj, FTAG, &db);
+ if (err != 0) {
+ (void) fprintf(stderr,
+ "failed to get SA dbuf for obj %llu: %s\n",
+ (u_longlong_t)child_obj, strerror(err));
+ return (EINVAL);
+ }
+ dmu_object_info_from_db(db, &doi);
+ sa_buf_rele(db, FTAG);
+
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ doi.doi_bonus_type != DMU_OT_ZNODE) {
+ (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
+ doi.doi_bonus_type, (u_longlong_t)child_obj);
+ return (EINVAL);
+ }
+
+ if (dump_opt['v'] > 6) {
+ (void) printf("obj=%llu %s type=%d bonustype=%d\n",
+ (u_longlong_t)child_obj, curpath, doi.doi_type,
+ doi.doi_bonus_type);
+ }
+
+ (void) strlcat(curpath, "/", sizeof (curpath));
+
+ switch (doi.doi_type) {
+ case DMU_OT_DIRECTORY_CONTENTS:
+ if (s != NULL && *(s + 1) != '\0')
+ return (dump_path_impl(os, child_obj, s + 1));
+ /*FALLTHROUGH*/
+ case DMU_OT_PLAIN_FILE_CONTENTS:
+ dump_object(os, child_obj, dump_opt['v'], &header, NULL, 0);
+ return (0);
+ default:
+ (void) fprintf(stderr, "object %llu has non-file/directory "
+ "type %d\n", (u_longlong_t)obj, doi.doi_type);
+ break;
+ }
+
+ return (EINVAL);
+}
+
+/*
+ * Dump the blocks for the object specified by path inside the dataset.
+ */
+static int
+dump_path(char *ds, char *path)
+{
+ int err;
+ objset_t *os;
+ uint64_t root_obj;
+
+ err = open_objset(ds, FTAG, &os);
+ if (err != 0)
+ return (err);
+
+ err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
+ if (err != 0) {
+ (void) fprintf(stderr, "can't lookup root znode: %s\n",
+ strerror(err));
+ close_objset(os, FTAG);
+ return (EINVAL);
+ }
+
+ (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);
+
+ err = dump_path_impl(os, root_obj, path);
+
+ close_objset(os, FTAG);
+ return (err);
+}
+
+static int
+dump_label(const char *dev)
+{
+ char path[MAXPATHLEN];
+ zdb_label_t labels[VDEV_LABELS];
+ uint64_t psize, ashift, l2cache;
+ struct stat64 statbuf;
+ boolean_t config_found = B_FALSE;
+ boolean_t error = B_FALSE;
+ boolean_t read_l2arc_header = B_FALSE;
+ avl_tree_t config_tree;
+ avl_tree_t uberblock_tree;
+ void *node, *cookie;
+ int fd;
+
+ bzero(labels, sizeof (labels));
+
+ /*
+ * Check if we were given an absolute path and use it as is.
+ * Otherwise, if the provided vdev name doesn't point to a file,
+ * try prepending expected disk paths and partition numbers.
+ */
+ (void) strlcpy(path, dev, sizeof (path));
+ if (dev[0] != '/' && stat64(path, &statbuf) != 0) {
+ int error;
+
+ error = zfs_resolve_shortname(dev, path, MAXPATHLEN);
+ if (error == 0 && zfs_dev_is_whole_disk(path)) {
+ if (zfs_append_partition(path, MAXPATHLEN) == -1)
+ error = ENOENT;
+ }
+
+ if (error || (stat64(path, &statbuf) != 0)) {
+ (void) printf("failed to find device %s, try "
+ "specifying absolute path instead\n", dev);
+ return (1);
+ }
+ }
+
+ if ((fd = open64(path, O_RDONLY)) < 0) {
+ (void) printf("cannot open '%s': %s\n", path, strerror(errno));
+ exit(1);
+ }
+
+ if (fstat64_blk(fd, &statbuf) != 0) {
+ (void) printf("failed to stat '%s': %s\n", path,
+ strerror(errno));
+ (void) close(fd);
+ exit(1);
+ }
+
+ if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0)
+ (void) printf("failed to invalidate cache '%s' : %s\n", path,
+ strerror(errno));
+
+ avl_create(&config_tree, cksum_record_compare,
+ sizeof (cksum_record_t), offsetof(cksum_record_t, link));
+ avl_create(&uberblock_tree, cksum_record_compare,
+ sizeof (cksum_record_t), offsetof(cksum_record_t, link));
+
+ psize = statbuf.st_size;
+ psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
+ ashift = SPA_MINBLOCKSHIFT;
+
+ /*
+ * 1. Read the label from disk
+ * 2. Unpack the configuration and insert in config tree.
+ * 3. Traverse all uberblocks and insert in uberblock tree.
+ */
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ zdb_label_t *label = &labels[l];
+ char *buf = label->label.vl_vdev_phys.vp_nvlist;
+ size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
+ nvlist_t *config;
+ cksum_record_t *rec;
+ zio_cksum_t cksum;
+ vdev_t vd;
+
+ if (pread64(fd, &label->label, sizeof (label->label),
+ vdev_label_offset(psize, l, 0)) != sizeof (label->label)) {
+ if (!dump_opt['q'])
+ (void) printf("failed to read label %d\n", l);
+ label->read_failed = B_TRUE;
+ error = B_TRUE;
+ continue;
+ }
+
+ label->read_failed = B_FALSE;
+
+ if (nvlist_unpack(buf, buflen, &config, 0) == 0) {
+ nvlist_t *vdev_tree = NULL;
+ size_t size;
+
+ if ((nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
+ (nvlist_lookup_uint64(vdev_tree,
+ ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
+ ashift = SPA_MINBLOCKSHIFT;
+
+ if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0)
+ size = buflen;
+
+ /* If the device is a cache device, read its L2ARC header later. */
+ if (!read_l2arc_header) {
+ if (nvlist_lookup_uint64(config,
+ ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 &&
+ l2cache == POOL_STATE_L2CACHE) {
+ read_l2arc_header = B_TRUE;
+ }
+ }
+
+ fletcher_4_native_varsize(buf, size, &cksum);
+ rec = cksum_record_insert(&config_tree, &cksum, l);
+
+ label->config = rec;
+ label->config_nv = config;
+ config_found = B_TRUE;
+ } else {
+ error = B_TRUE;
+ }
+
+ vd.vdev_ashift = ashift;
+ vd.vdev_top = &vd;
+
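+ /*
+ * Collect every valid uberblock in this label, keyed by checksum,
+ * so identical copies found in other labels map to the same record.
+ */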
+ for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
+ uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
+ uberblock_t *ub = (void *)((char *)label + uoff);
+
+ if (uberblock_verify(ub))
+ continue;
+
+ fletcher_4_native_varsize(ub, sizeof (*ub), &cksum);
+ rec = cksum_record_insert(&uberblock_tree, &cksum, l);
+
+ label->uberblocks[i] = rec;
+ }
+ }
+
+ /*
+ * Dump the label and uberblocks.
+ */
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ zdb_label_t *label = &labels[l];
+ size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
+
+ if (label->read_failed == B_TRUE)
+ continue;
+
+ if (label->config_nv) {
+ dump_config_from_label(label, buflen, l);
+ } else {
+ if (!dump_opt['q'])
+ (void) printf("failed to unpack label %d\n", l);
+ }
+
+ if (dump_opt['u'])
+ dump_label_uberblocks(label, ashift, l);
+
+ nvlist_free(label->config_nv);
+ }
+
+ /*
+ * Dump the L2ARC header, if existent.
+ */
+ if (read_l2arc_header)
+ error |= dump_l2arc_header(fd);
+
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)
+ umem_free(node, sizeof (cksum_record_t));
+
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL)
+ umem_free(node, sizeof (cksum_record_t));
+
+ avl_destroy(&config_tree);
+ avl_destroy(&uberblock_tree);
+
+ (void) close(fd);
+
+ return (config_found == B_FALSE ? 2 :
+ (error == B_TRUE ? 1 : 0));
+}
+
+static uint64_t dataset_feature_count[SPA_FEATURES];
+static uint64_t global_feature_count[SPA_FEATURES];
+static uint64_t remap_deadlist_count = 0;
+
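+/*
+ * Per-dataset callback: tally per-dataset feature activity (redaction
+ * bookmarks, written bookmarks, livelists, remap deadlists) and then
+ * dump the objset itself.
+ */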
+/*ARGSUSED*/
+static int
+dump_one_objset(const char *dsname, void *arg)
+{
+ int error;
+ objset_t *os;
+ spa_feature_t f;
+
+ error = open_objset(dsname, FTAG, &os);
+ if (error != 0)
+ return (0);
+
+ for (f = 0; f < SPA_FEATURES; f++) {
+ if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f))
+ continue;
+ ASSERT(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET);
+ dataset_feature_count[f]++;
+ }
+
+ if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
+ remap_deadlist_count++;
+ }
+
+ for (dsl_bookmark_node_t *dbn =
+ avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL;
+ dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) {
+ mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj);
+ if (dbn->dbn_phys.zbm_redaction_obj != 0)
+ global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++;
+ if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)
+ global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;
+ }
+
+ if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) &&
+ !dmu_objset_is_snapshot(os)) {
+ global_feature_count[SPA_FEATURE_LIVELIST]++;
+ }
+
+ dump_objset(os);
+ close_objset(os, FTAG);
+ fuid_table_destroy();
+ return (0);
+}
+
+/*
+ * Block statistics.
+ */
+#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
+typedef struct zdb_blkstats {
+ uint64_t zb_asize;
+ uint64_t zb_lsize;
+ uint64_t zb_psize;
+ uint64_t zb_count;
+ uint64_t zb_gangs;
+ uint64_t zb_ditto_samevdev;
+ uint64_t zb_ditto_same_ms;
+ uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
+} zdb_blkstats_t;
+
+/*
+ * Extended object types to report deferred frees and dedup auto-ditto blocks.
+ */
+#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0)
+#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1)
+#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2)
+#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3)
+
+static const char *zdb_ot_extname[] = {
+ "deferred free",
+ "dedup ditto",
+ "other",
+ "Total",
+};
+
+#define ZB_TOTAL DN_MAX_LEVELS
+#define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1)
+
+typedef struct zdb_cb {
+ zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
+ uint64_t zcb_removing_size;
+ uint64_t zcb_checkpoint_size;
+ uint64_t zcb_dedup_asize;
+ uint64_t zcb_dedup_blocks;
+ uint64_t zcb_psize_count[SPA_MAX_FOR_16M];
+ uint64_t zcb_lsize_count[SPA_MAX_FOR_16M];
+ uint64_t zcb_asize_count[SPA_MAX_FOR_16M];
+ uint64_t zcb_psize_len[SPA_MAX_FOR_16M];
+ uint64_t zcb_lsize_len[SPA_MAX_FOR_16M];
+ uint64_t zcb_asize_len[SPA_MAX_FOR_16M];
+ uint64_t zcb_psize_total;
+ uint64_t zcb_lsize_total;
+ uint64_t zcb_asize_total;
+ uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
+ uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
+ [BPE_PAYLOAD_SIZE + 1];
+ uint64_t zcb_start;
+ hrtime_t zcb_lastprint;
+ uint64_t zcb_totalasize;
+ uint64_t zcb_errors[256];
+ int zcb_readfails;
+ int zcb_haderrors;
+ spa_t *zcb_spa;
+ uint32_t **zcb_vd_obsolete_counts;
+} zdb_cb_t;
+
+/* test if two DVA offsets from same vdev are within the same metaslab */
+static boolean_t
+same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
+{
+ vdev_t *vd = vdev_lookup_top(spa, vdev);
+ uint64_t ms_shift = vd->vdev_ms_shift;
+
+ return ((off1 >> ms_shift) == (off2 >> ms_shift));
+}
+
+/*
+ * Used to simplify reporting of the histogram data.
+ */
+typedef struct one_histo {
+ char *name;
+ uint64_t *count;
+ uint64_t *len;
+ uint64_t cumulative;
+} one_histo_t;
+
+/*
+ * The number of separate histograms processed for psize, lsize and asize.
+ */
+#define NUM_HISTO 3
+
+/*
+ * This routine creates fixed-column output of three histograms showing,
+ * for each block size from 512 to 2^SPA_MAX_FOR_16M, the count, length
+ * and cumulative length of the psize, lsize and asize blocks.
+ *
+ * All three types of blocks are listed on a single line.
+ *
+ * By default the table is printed in nicenum format (e.g. 123K), but if
+ * the '-P' parameter is specified then the full raw number (parseable)
+ * is printed out.
+ */
+static void
+dump_size_histograms(zdb_cb_t *zcb)
+{
+ /*
+ * A temporary buffer used by zdb_nicenum() to hold either the raw
+ * or the human-readable form of a number.
+ */
+ char numbuf[32];
+
+ /*
+ * Define titles which are used in the headers of the tables
+ * printed by this routine.
+ */
+ const char blocksize_title1[] = "block";
+ const char blocksize_title2[] = "size";
+ const char count_title[] = "Count";
+ const char length_title[] = "Size";
+ const char cumulative_title[] = "Cum.";
+
+ /*
+ * Setup the histogram arrays (psize, lsize, and asize).
+ */
+ one_histo_t parm_histo[NUM_HISTO];
+
+ parm_histo[0].name = "psize";
+ parm_histo[0].count = zcb->zcb_psize_count;
+ parm_histo[0].len = zcb->zcb_psize_len;
+ parm_histo[0].cumulative = 0;
+
+ parm_histo[1].name = "lsize";
+ parm_histo[1].count = zcb->zcb_lsize_count;
+ parm_histo[1].len = zcb->zcb_lsize_len;
+ parm_histo[1].cumulative = 0;
+
+ parm_histo[2].name = "asize";
+ parm_histo[2].count = zcb->zcb_asize_count;
+ parm_histo[2].len = zcb->zcb_asize_len;
+ parm_histo[2].cumulative = 0;
+
+
+ (void) printf("\nBlock Size Histogram\n");
+ /*
+ * Print the first line titles
+ */
+ if (dump_opt['P'])
+ (void) printf("\n%s\t", blocksize_title1);
+ else
+ (void) printf("\n%7s ", blocksize_title1);
+
+ for (int j = 0; j < NUM_HISTO; j++) {
+ if (dump_opt['P']) {
+ if (j < NUM_HISTO - 1) {
+ (void) printf("%s\t\t\t", parm_histo[j].name);
+ } else {
+ /* Don't print trailing spaces */
+ (void) printf(" %s", parm_histo[j].name);
+ }
+ } else {
+ if (j < NUM_HISTO - 1) {
+ /* Left aligned strings in the output */
+ (void) printf("%-7s ",
+ parm_histo[j].name);
+ } else {
+ /* Don't print trailing spaces */
+ (void) printf("%s", parm_histo[j].name);
+ }
+ }
+ }
+ (void) printf("\n");
+
+ /*
+ * Print the second line titles
+ */
+ if (dump_opt['P']) {
+ (void) printf("%s\t", blocksize_title2);
+ } else {
+ (void) printf("%7s ", blocksize_title2);
+ }
+
+ for (int i = 0; i < NUM_HISTO; i++) {
+ if (dump_opt['P']) {
+ (void) printf("%s\t%s\t%s\t",
+ count_title, length_title, cumulative_title);
+ } else {
+ (void) printf("%7s%7s%7s",
+ count_title, length_title, cumulative_title);
+ }
+ }
+ (void) printf("\n");
+
+ /*
+ * Print the rows
+ */
+ for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) {
+
+ /*
+ * Print the first column showing the blocksize
+ */
+ zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf));
+
+ if (dump_opt['P']) {
+ printf("%s", numbuf);
+ } else {
+ printf("%7s:", numbuf);
+ }
+
+ /*
+ * Print the remaining set of 3 columns per size:
+ * for psize, lsize and asize
+ */
+ for (int j = 0; j < NUM_HISTO; j++) {
+ parm_histo[j].cumulative += parm_histo[j].len[i];
+
+ zdb_nicenum(parm_histo[j].count[i],
+ numbuf, sizeof (numbuf));
+ if (dump_opt['P'])
+ (void) printf("\t%s", numbuf);
+ else
+ (void) printf("%7s", numbuf);
+
+ zdb_nicenum(parm_histo[j].len[i],
+ numbuf, sizeof (numbuf));
+ if (dump_opt['P'])
+ (void) printf("\t%s", numbuf);
+ else
+ (void) printf("%7s", numbuf);
+
+ zdb_nicenum(parm_histo[j].cumulative,
+ numbuf, sizeof (numbuf));
+ if (dump_opt['P'])
+ (void) printf("\t%s", numbuf);
+ else
+ (void) printf("%7s", numbuf);
+ }
+ (void) printf("\n");
+ }
+}
+
+static void
+zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
+ dmu_object_type_t type)
+{
+ uint64_t refcnt = 0;
+ int i;
+
+ ASSERT(type < ZDB_OT_TOTAL);
+
+ if (zilog && zil_bp_tree_add(zilog, bp) != 0)
+ return;
+
+ spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
+
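+ /*
+ * Tally this block four times: once for its own (level, type)
+ * bucket and once each for the per-level, per-type and grand
+ * totals.
+ */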
+ for (i = 0; i < 4; i++) {
+ int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
+ int t = (i & 1) ? type : ZDB_OT_TOTAL;
+ int equal;
+ zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
+
+ zb->zb_asize += BP_GET_ASIZE(bp);
+ zb->zb_lsize += BP_GET_LSIZE(bp);
+ zb->zb_psize += BP_GET_PSIZE(bp);
+ zb->zb_count++;
+
+ /*
+ * The histogram is only big enough to record blocks up to
+ * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
+ * "other", bucket.
+ */
+ unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
+ idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
+ zb->zb_psize_histogram[idx]++;
+
+ zb->zb_gangs += BP_COUNT_GANG(bp);
+
+ switch (BP_GET_NDVAS(bp)) {
+ case 2:
+ if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1])) {
+ zb->zb_ditto_samevdev++;
+
+ if (same_metaslab(zcb->zcb_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[1])))
+ zb->zb_ditto_same_ms++;
+ }
+ break;
+ case 3:
+ equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1])) +
+ (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2])) +
+ (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]));
+ if (equal != 0) {
+ zb->zb_ditto_samevdev++;
+
+ if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1]) &&
+ same_metaslab(zcb->zcb_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[1])))
+ zb->zb_ditto_same_ms++;
+ else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]) &&
+ same_metaslab(zcb->zcb_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[2])))
+ zb->zb_ditto_same_ms++;
+ else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]) &&
+ same_metaslab(zcb->zcb_spa,
+ DVA_GET_VDEV(&bp->blk_dva[1]),
+ DVA_GET_OFFSET(&bp->blk_dva[1]),
+ DVA_GET_OFFSET(&bp->blk_dva[2])))
+ zb->zb_ditto_same_ms++;
+ }
+ break;
+ }
+ }
+
+ spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);
+
+ if (BP_IS_EMBEDDED(bp)) {
+ zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
+ zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
+ [BPE_GET_PSIZE(bp)]++;
+ return;
+ }
+ /*
+ * The binning histogram bins by powers of two up to
+ * SPA_MAXBLOCKSIZE rather than creating bins for
+ * every possible blocksize found in the pool.
+ */
+ int bin = highbit64(BP_GET_PSIZE(bp)) - 1;
+
+ zcb->zcb_psize_count[bin]++;
+ zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp);
+ zcb->zcb_psize_total += BP_GET_PSIZE(bp);
+
+ bin = highbit64(BP_GET_LSIZE(bp)) - 1;
+
+ zcb->zcb_lsize_count[bin]++;
+ zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp);
+ zcb->zcb_lsize_total += BP_GET_LSIZE(bp);
+
+ bin = highbit64(BP_GET_ASIZE(bp)) - 1;
+
+ zcb->zcb_asize_count[bin]++;
+ zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
+ zcb->zcb_asize_total += BP_GET_ASIZE(bp);
+
+ if (dump_opt['L'])
+ return;
+
+ if (BP_GET_DEDUP(bp)) {
+ ddt_t *ddt;
+ ddt_entry_t *dde;
+
+ ddt = ddt_select(zcb->zcb_spa, bp);
+ ddt_enter(ddt);
+ dde = ddt_lookup(ddt, bp, B_FALSE);
+
+ if (dde == NULL) {
+ refcnt = 0;
+ } else {
+ ddt_phys_t *ddp = ddt_phys_select(dde, bp);
+ ddt_phys_decref(ddp);
+ refcnt = ddp->ddp_refcnt;
+ if (ddt_phys_total_refcnt(dde) == 0)
+ ddt_remove(ddt, dde);
+ }
+ ddt_exit(ddt);
+ }
+
+ VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+ refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
+ bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
+}
+
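+/*
+ * Completion callback for the verification reads issued by zdb_blkptr_cb():
+ * free the abd, retire the in-flight byte count, and record any I/O error
+ * against the bookmark that failed.
+ */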
+static void
+zdb_blkptr_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ int ioerr = zio->io_error;
+ zdb_cb_t *zcb = zio->io_private;
+ zbookmark_phys_t *zb = &zio->io_bookmark;
+
+ abd_free(zio->io_abd);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+
+ if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ char blkbuf[BP_SPRINTF_LEN];
+
+ zcb->zcb_haderrors = 1;
+ zcb->zcb_errors[ioerr]++;
+
+ if (dump_opt['b'] >= 2)
+ snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+ else
+ blkbuf[0] = '\0';
+
+ (void) printf("zdb_blkptr_cb: "
+ "Got error %d reading "
+ "<%llu, %llu, %lld, %llx> %s -- skipping\n",
+ ioerr,
+ (u_longlong_t)zb->zb_objset,
+ (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level,
+ (u_longlong_t)zb->zb_blkid,
+ blkbuf);
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+static int
+zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ zdb_cb_t *zcb = arg;
+ dmu_object_type_t type;
+ boolean_t is_metadata;
+
+ if (zb->zb_level == ZB_DNODE_LEVEL)
+ return (0);
+
+ if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
+ char blkbuf[BP_SPRINTF_LEN];
+ snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+ (void) printf("objset %llu object %llu "
+ "level %lld offset 0x%llx %s\n",
+ (u_longlong_t)zb->zb_objset,
+ (u_longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (u_longlong_t)blkid2offset(dnp, bp, zb),
+ blkbuf);
+ }
+
+ if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
+ return (0);
+
+ type = BP_GET_TYPE(bp);
+
+ zdb_count_block(zcb, zilog, bp,
+ (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
+
+ is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
+
+ if (!BP_IS_EMBEDDED(bp) &&
+ (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
+ size_t size = BP_GET_PSIZE(bp);
+ abd_t *abd = abd_alloc(size, B_FALSE);
+ int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
+
+ /* If it's an intent log block, failure is expected. */
+ if (zb->zb_level == ZB_ZIL_LEVEL)
+ flags |= ZIO_FLAG_SPECULATIVE;
+
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_load_verify_bytes > max_inflight_bytes)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_load_verify_bytes += size;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ zio_nowait(zio_read(NULL, spa, bp, abd, size,
+ zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
+ }
+
+ zcb->zcb_readfails = 0;
+
+ /* only call gethrtime() every 100 blocks */
+ static int iters;
+ if (++iters > 100)
+ iters = 0;
+ else
+ return (0);
+
+ if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
+ uint64_t now = gethrtime();
+ char buf[10];
+ uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
+ int kb_per_sec =
+ 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
+ int sec_remaining =
+ (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ);
+
+ zfs_nicebytes(bytes, buf, sizeof (buf));
+ (void) fprintf(stderr,
+ "\r%5s completed (%4dMB/s) "
+ "estimated time remaining: %uhr %02umin %02usec ",
+ buf, kb_per_sec / 1024,
+ sec_remaining / 60 / 60,
+ sec_remaining / 60 % 60,
+ sec_remaining % 60);
+
+ zcb->zcb_lastprint = now;
+ }
+
+ return (0);
+}
+
+static void
+zdb_leak(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_t *vd = arg;
+
+ (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
+}
+
+static metaslab_ops_t zdb_metaslab_ops = {
+ NULL /* alloc */
+};
+
+/* ARGSUSED */
+static int
+load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,
+ uint64_t txg, void *arg)
+{
+ spa_vdev_removal_t *svr = arg;
+
+ uint64_t offset = sme->sme_offset;
+ uint64_t size = sme->sme_run;
+
+ /* skip vdevs we don't care about */
+ if (sme->sme_vdev != svr->svr_vdev_id)
+ return (0);
+
+ vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
+
+ if (txg < metaslab_unflushed_txg(ms))
+ return (0);
+
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ ASSERT(vim != NULL);
+ if (offset >= vdev_indirect_mapping_max_offset(vim))
+ return (0);
+
+ if (sme->sme_type == SM_ALLOC)
+ range_tree_add(svr->svr_allocd_segs, offset, size);
+ else
+ range_tree_remove(svr->svr_allocd_segs, offset, size);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ /*
+ * This callback was called through a remap from
+ * a device being removed. Therefore, the vdev that
+ * this callback is applied to is a concrete
+ * vdev.
+ */
+ ASSERT(vdev_is_concrete(vd));
+
+ VERIFY0(metaslab_claim_impl(vd, offset, size,
+ spa_min_claim_txg(vd->vdev_spa)));
+}
+
+static void
+claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
+{
+ vdev_t *vd = arg;
+
+ vdev_indirect_ops.vdev_op_remap(vd, offset, size,
+ claim_segment_impl_cb, NULL);
+}
+
+/*
+ * After accounting for all allocated blocks that are directly referenced,
+ * we might have missed a reference to a block from a partially complete
+ * (and thus unused) indirect mapping object. We perform a secondary pass
+ * through the metaslabs we have already mapped and claim the destination
+ * blocks.
+ */
+static void
+zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
+{
+ if (dump_opt['L'])
+ return;
+
+ if (spa->spa_vdev_removal == NULL)
+ return;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+
+ range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
+ metaslab_t *msp = vd->vdev_ms[msi];
+
+ if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
+ break;
+
+ ASSERT0(range_tree_space(allocs));
+ if (msp->ms_sm != NULL)
+ VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC));
+ range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs);
+ }
+ range_tree_destroy(allocs);
+
+ iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr);
+
+ /*
+ * Clear everything past what has been synced,
+ * because we have not allocated mappings for
+ * it yet.
+ */
+ range_tree_clear(svr->svr_allocd_segs,
+ vdev_indirect_mapping_max_offset(vim),
+ vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));
+
+ zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs);
+ range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+/* ARGSUSED */
+static int
+increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ zdb_cb_t *zcb = arg;
+ spa_t *spa = zcb->zcb_spa;
+ vdev_t *vd;
+ const dva_t *dva = &bp->blk_dva[0];
+
+ ASSERT(!bp_freed);
+ ASSERT(!dump_opt['L']);
+ ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
+ ASSERT3P(vd, !=, NULL);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
+ ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
+
+ vdev_indirect_mapping_increment_obsolete_count(
+ vd->vdev_indirect_mapping,
+ DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
+ zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
+
+ return (0);
+}
+
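+/*
+ * Load the per-entry obsolete counts for an indirect vdev, folding in the
+ * vdev's obsolete space map and, if a condense was in progress, the
+ * previous obsolete space map as well.
+ */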
+static uint32_t *
+zdb_load_obsolete_counts(vdev_t *vd)
+{
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ spa_t *spa = vd->vdev_spa;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ uint64_t obsolete_sm_object;
+ uint32_t *counts;
+
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
+ counts = vdev_indirect_mapping_load_obsolete_counts(vim);
+ if (vd->vdev_obsolete_sm != NULL) {
+ vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
+ vd->vdev_obsolete_sm);
+ }
+ if (scip->scip_vdev == vd->vdev_id &&
+ scip->scip_prev_obsolete_sm_object != 0) {
+ space_map_t *prev_obsolete_sm = NULL;
+ VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
+ scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
+ vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
+ prev_obsolete_sm);
+ space_map_close(prev_obsolete_sm);
+ }
+ return (counts);
+}
+
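+/*
+ * Walk the DDT and prime the dedup statistics: for each entry with more
+ * than one reference, account the extra copies as deduped space and insert
+ * the entry into the in-core DDT so zdb_count_block() can decrement its
+ * refcount as the duplicates are traversed.
+ */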
+static void
+zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+ ddt_bookmark_t ddb;
+ ddt_entry_t dde;
+ int error;
+ int p;
+
+ ASSERT(!dump_opt['L']);
+
+ bzero(&ddb, sizeof (ddb));
+ while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
+ blkptr_t blk;
+ ddt_phys_t *ddp = dde.dde_phys;
+
+ if (ddb.ddb_class == DDT_CLASS_UNIQUE)
+ return;
+
+ ASSERT(ddt_phys_total_refcnt(&dde) > 1);
+
+ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+ ddt_bp_create(ddb.ddb_checksum,
+ &dde.dde_key, ddp, &blk);
+ if (p == DDT_PHYS_DITTO) {
+ zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
+ } else {
+ zcb->zcb_dedup_asize +=
+ BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
+ zcb->zcb_dedup_blocks++;
+ }
+ }
+ ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+ ddt_enter(ddt);
+ VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+ ddt_exit(ddt);
+ }
+
+ ASSERT(error == ENOENT);
+}
+
+typedef struct checkpoint_sm_exclude_entry_arg {
+ vdev_t *cseea_vd;
+ uint64_t cseea_checkpoint_size;
+} checkpoint_sm_exclude_entry_arg_t;
+
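+/*
+ * space_map_iterate() callback: remove each checkpointed segment from its
+ * metaslab's ms_allocatable tree so checkpoint-held space is not reported
+ * as leaked, and accumulate the total checkpoint size.
+ */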
+static int
+checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
+{
+ checkpoint_sm_exclude_entry_arg_t *cseea = arg;
+ vdev_t *vd = cseea->cseea_vd;
+ metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+ uint64_t end = sme->sme_offset + sme->sme_run;
+
+ ASSERT(sme->sme_type == SM_FREE);
+
+ /*
+ * Since the vdev_checkpoint_sm exists in the vdev level
+ * and the ms_sm space maps exist in the metaslab level,
+ * an entry in the checkpoint space map could theoretically
+ * cross the boundaries of the metaslab to which it belongs.
+ *
+ * In reality, because of the way that we populate and
+ * manipulate the checkpoint's space maps currently,
+ * there shouldn't be any entries that cross metaslabs.
+ * Hence the assertion below.
+ *
+ * That said, there is no fundamental requirement that
+ * the checkpoint's space map entries should not cross
+ * metaslab boundaries. So if needed we could add code
+ * that handles metaslab-crossing segments in the future.
+ */
+ VERIFY3U(sme->sme_offset, >=, ms->ms_start);
+ VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+ /*
+ * By removing the entry from the allocated segments we
+ * also verify that the entry is there to begin with.
+ */
+ mutex_enter(&ms->ms_lock);
+ range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
+ mutex_exit(&ms->ms_lock);
+
+ cseea->cseea_checkpoint_size += sme->sme_run;
+ return (0);
+}
+
+static void
+zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
+{
+ spa_t *spa = vd->vdev_spa;
+ space_map_t *checkpoint_sm = NULL;
+ uint64_t checkpoint_sm_obj;
+
+ /*
+ * If there is no vdev_top_zap, we are in a pool whose
+ * version predates the pool checkpoint feature.
+ */
+ if (vd->vdev_top_zap == 0)
+ return;
+
+ /*
+ * If there is no reference of the vdev_checkpoint_sm in
+ * the vdev_top_zap, then one of the following scenarios
+ * is true:
+ *
+ * 1] There is no checkpoint
+ * 2] There is a checkpoint, but no checkpointed blocks
+ * have been freed yet
+ * 3] The current vdev is indirect
+ *
+ * In these cases we return immediately.
+ */
+ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+ return;
+
+ VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
+ &checkpoint_sm_obj));
+
+ checkpoint_sm_exclude_entry_arg_t cseea;
+ cseea.cseea_vd = vd;
+ cseea.cseea_checkpoint_size = 0;
+
+ VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+ checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+
+ VERIFY0(space_map_iterate(checkpoint_sm,
+ space_map_length(checkpoint_sm),
+ checkpoint_sm_exclude_entry_cb, &cseea));
+ space_map_close(checkpoint_sm);
+
+ zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
+}
+
+static void
+zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
+{
+ ASSERT(!dump_opt['L']);
+
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
+ zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
+ }
+}
+
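+/*
+ * Spacemap-log callback that accumulates the net amount of space still
+ * allocated (SM_ALLOC minus SM_FREE) in entries that have not yet been
+ * flushed to the metaslabs' own space maps.
+ */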
+static int
+count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
+ uint64_t txg, void *arg)
+{
+ int64_t *ualloc_space = arg;
+
+ uint64_t offset = sme->sme_offset;
+ uint64_t vdev_id = sme->sme_vdev;
+
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ if (!vdev_is_concrete(vd))
+ return (0);
+
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
+
+ if (txg < metaslab_unflushed_txg(ms))
+ return (0);
+
+ if (sme->sme_type == SM_ALLOC)
+ *ualloc_space += sme->sme_run;
+ else
+ *ualloc_space -= sme->sme_run;
+
+ return (0);
+}
+
+static int64_t
+get_unflushed_alloc_space(spa_t *spa)
+{
+ if (dump_opt['L'])
+ return (0);
+
+ int64_t ualloc_space = 0;
+ iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
+ &ualloc_space);
+ return (ualloc_space);
+}
+
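+/*
+ * Spacemap-log callback that replays unflushed entries into each
+ * metaslab's ms_allocatable tree: entries matching the requested maptype
+ * are added, entries of the opposite type are removed.
+ */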
+static int
+load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
+{
+ maptype_t *uic_maptype = arg;
+
+ uint64_t offset = sme->sme_offset;
+ uint64_t size = sme->sme_run;
+ uint64_t vdev_id = sme->sme_vdev;
+
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+
+ /* skip indirect vdevs */
+ if (!vdev_is_concrete(vd))
+ return (0);
+
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
+ ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE);
+
+ if (txg < metaslab_unflushed_txg(ms))
+ return (0);
+
+ if (*uic_maptype == sme->sme_type)
+ range_tree_add(ms->ms_allocatable, offset, size);
+ else
+ range_tree_remove(ms->ms_allocatable, offset, size);
+
+ return (0);
+}
+
+static void
+load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)
+{
+ iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype);
+}
+
+static void
+load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+
+ ASSERT3U(i, ==, vd->vdev_id);
+
+ if (vd->vdev_ops == &vdev_indirect_ops)
+ continue;
+
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ (void) fprintf(stderr,
+ "\rloading concrete vdev %llu, "
+ "metaslab %llu of %llu ...",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)msp->ms_id,
+ (longlong_t)vd->vdev_ms_count);
+
+ mutex_enter(&msp->ms_lock);
+ range_tree_vacate(msp->ms_allocatable, NULL, NULL);
+
+ /*
+ * We don't want to spend the CPU manipulating the
+ * size-ordered tree, so clear the range_tree ops.
+ */
+ msp->ms_allocatable->rt_ops = NULL;
+
+ if (msp->ms_sm != NULL) {
+ VERIFY0(space_map_load(msp->ms_sm,
+ msp->ms_allocatable, maptype));
+ }
+ if (!msp->ms_loaded)
+ msp->ms_loaded = B_TRUE;
+ mutex_exit(&msp->ms_lock);
+ }
+ }
+
+ load_unflushed_to_ms_allocatables(spa, maptype);
+}
+
+/*
+ * vim_idxp is an in-out parameter which (for indirect vdevs) is the
+ * index in vim_entries that has the first entry in this metaslab.
+ * On return, it will be set to the first entry after this metaslab.
+ */
+static void
+load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
+ uint64_t *vim_idxp)
+{
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ mutex_enter(&msp->ms_lock);
+ range_tree_vacate(msp->ms_allocatable, NULL, NULL);
+
+ /*
+ * We don't want to spend the CPU manipulating the
+ * size-ordered tree, so clear the range_tree ops.
+ */
+ msp->ms_allocatable->rt_ops = NULL;
+
+ for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
+ (*vim_idxp)++) {
+ vdev_indirect_mapping_entry_phys_t *vimep =
+ &vim->vim_entries[*vim_idxp];
+ uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
+ uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
+ ASSERT3U(ent_offset, >=, msp->ms_start);
+ if (ent_offset >= msp->ms_start + msp->ms_size)
+ break;
+
+ /*
+ * Mappings do not cross metaslab boundaries,
+ * because we create them by walking the metaslabs.
+ */
+ ASSERT3U(ent_offset + ent_len, <=,
+ msp->ms_start + msp->ms_size);
+ range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
+ }
+
+ if (!msp->ms_loaded)
+ msp->ms_loaded = B_TRUE;
+ mutex_exit(&msp->ms_lock);
+}
+
+static void
+zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
+{
+ ASSERT(!dump_opt['L']);
+
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ ASSERT3U(c, ==, vd->vdev_id);
+
+ if (vd->vdev_ops != &vdev_indirect_ops)
+ continue;
+
+ /*
+ * Note: we don't check for mapping leaks on
+ * removing vdevs because their ms_allocatable's
+ * are used to look for leaks in allocated space.
+ */
+ zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
+
+ /*
+ * Normally, indirect vdevs don't have any
+ * metaslabs. We want to set them up for
+ * zio_claim().
+ */
+ VERIFY0(vdev_metaslab_init(vd, 0));
+
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t vim_idx = 0;
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+
+ (void) fprintf(stderr,
+ "\rloading indirect vdev %llu, "
+ "metaslab %llu of %llu ...",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)vd->vdev_ms[m]->ms_id,
+ (longlong_t)vd->vdev_ms_count);
+
+ load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
+ &vim_idx);
+ }
+ ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
+ }
+}
+
+static void
+zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+ zcb->zcb_spa = spa;
+
+ if (dump_opt['L'])
+ return;
+
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * We are going to be changing the meaning of the metaslab's
+ * ms_allocatable. Ensure that the allocator doesn't try to
+ * use the tree.
+ */
+ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
+ spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
+
+ zcb->zcb_vd_obsolete_counts =
+ umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
+ UMEM_NOFAIL);
+
+ /*
+ * For leak detection, we overload the ms_allocatable trees
+ * to contain allocated segments instead of free segments.
+ * As a result, we can't use the normal metaslab_load/unload
+ * interfaces.
+ */
+ zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
+ load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
+
+ /*
+ * On load_concrete_ms_allocatable_trees() we loaded all the
+ * allocated entries from the ms_sm to the ms_allocatable for
+ * each metaslab. If the pool has a checkpoint or is in the
+ * middle of discarding a checkpoint, some of these blocks
+ * may have been freed but their ms_sm may not have been
+ * updated because they are referenced by the checkpoint. In
+ * order to avoid false-positives during leak-detection, we
+ * go through the vdev's checkpoint space map and exclude all
+ * its entries from their relevant ms_allocatable.
+ *
+ * We also aggregate the space held by the checkpoint and add
+ * it to zcb_checkpoint_size.
+ *
+ * Note that at this point we are also verifying that all the
+ * entries on the checkpoint_sm are marked as allocated in
+ * the ms_sm of their relevant metaslab.
+ * [see comment in checkpoint_sm_exclude_entry_cb()]
+ */
+ zdb_leak_init_exclude_checkpoint(spa, zcb);
+ ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
+
+ /* for cleaner progress output */
+ (void) fprintf(stderr, "\n");
+
+ if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_DEVICE_REMOVAL));
+ (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
+ increment_indirect_mapping_cb, zcb, NULL);
+ }
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ zdb_ddt_leak_init(spa, zcb);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+static boolean_t
+zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
+{
+ boolean_t leaks = B_FALSE;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t total_leaked = 0;
+ boolean_t are_precise = B_FALSE;
+
+ ASSERT(vim != NULL);
+
+ for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
+ vdev_indirect_mapping_entry_phys_t *vimep =
+ &vim->vim_entries[i];
+ uint64_t obsolete_bytes = 0;
+ uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
+ metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ /*
+ * This is not very efficient but it's easy to
+ * verify correctness.
+ */
+ for (uint64_t inner_offset = 0;
+ inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
+ inner_offset += 1 << vd->vdev_ashift) {
+ if (range_tree_contains(msp->ms_allocatable,
+ offset + inner_offset, 1 << vd->vdev_ashift)) {
+ obsolete_bytes += 1 << vd->vdev_ashift;
+ }
+ }
+
+ int64_t bytes_leaked = obsolete_bytes -
+ zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
+ ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
+ zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
+
+ VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+ if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {
+ (void) printf("obsolete indirect mapping count "
+ "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
+ (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
+ (u_longlong_t)bytes_leaked);
+ }
+ total_leaked += ABS(bytes_leaked);
+ }
+
+ VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+ if (!are_precise && total_leaked > 0) {
+ int pct_leaked = total_leaked * 100 /
+ vdev_indirect_mapping_bytes_mapped(vim);
+ (void) printf("cannot verify obsolete indirect mapping "
+ "counts of vdev %llu because precise feature was not "
+ "enabled when it was removed: %d%% (%llx bytes) of mapping"
+ "unreferenced\n",
+ (u_longlong_t)vd->vdev_id, pct_leaked,
+ (u_longlong_t)total_leaked);
+ } else if (total_leaked > 0) {
+ (void) printf("obsolete indirect mapping count mismatch "
+ "for vdev %llu -- %llx total bytes mismatched\n",
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)total_leaked);
+ leaks |= B_TRUE;
+ }
+
+ vdev_indirect_mapping_free_obsolete_counts(vim,
+ zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
+ zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
+
+ return (leaks);
+}
+
+static boolean_t
+zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
+{
+ if (dump_opt['L'])
+ return (B_FALSE);
+
+ boolean_t leaks = B_FALSE;
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (unsigned c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ metaslab_group_t *mg __maybe_unused = vd->vdev_mg;
+
+ if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
+ leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
+ }
+
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ ASSERT3P(mg, ==, msp->ms_group);
+
+ /*
+ * ms_allocatable has been overloaded
+ * to contain allocated segments. Now that
+ * we finished traversing all blocks, any
+ * block that remains in the ms_allocatable
+ * represents an allocated block that we
+ * did not claim during the traversal.
+ * Claimed blocks would have been removed
+ * from the ms_allocatable. For indirect
+ * vdevs, space remaining in the tree
+ * represents parts of the mapping that are
+ * not referenced, which is not a bug.
+ */
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ range_tree_vacate(msp->ms_allocatable,
+ NULL, NULL);
+ } else {
+ range_tree_vacate(msp->ms_allocatable,
+ zdb_leak, vd);
+ }
+ if (msp->ms_loaded) {
+ msp->ms_loaded = B_FALSE;
+ }
+ }
+ }
+
+ umem_free(zcb->zcb_vd_obsolete_counts,
+ rvd->vdev_children * sizeof (uint32_t *));
+ zcb->zcb_vd_obsolete_counts = NULL;
+
+ return (leaks);
+}
+
+/* ARGSUSED */
+static int
+count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ zdb_cb_t *zcb = arg;
+
+ if (dump_opt['b'] >= 5) {
+ char blkbuf[BP_SPRINTF_LEN];
+ snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+ (void) printf("[%s] %s\n",
+ "deferred free", blkbuf);
+ }
+ zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
+ return (0);
+}
+
+/*
+ * Iterate over livelists which have been destroyed by the user but
+ * are still present in the MOS, waiting to be freed
+ */
+static void
+iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t zap_obj;
+ int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
+ if (err == ENOENT)
+ return;
+ ASSERT0(err);
+
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ dsl_deadlist_t ll;
+ /* NULL out dl_os prior to dsl_deadlist_open() in case it's garbage */
+ ll.dl_os = NULL;
+ for (zap_cursor_init(&zc, mos, zap_obj);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ dsl_deadlist_open(&ll, mos, attr.za_first_integer);
+ func(&ll, arg);
+ dsl_deadlist_close(&ll);
+ }
+ zap_cursor_fini(&zc);
+}
+
+static int
+bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ ASSERT(!bp_freed);
+ return (count_block_cb(arg, bp, tx));
+}
+
+static int
+livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
+{
+ zdb_cb_t *zbc = args;
+ bplist_t blks;
+ bplist_create(&blks);
+ /* determine which blocks have been alloc'd but not freed */
+ VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL));
+ /* count those blocks */
+ (void) bplist_iterate(&blks, count_block_cb, zbc, NULL);
+ bplist_destroy(&blks);
+ return (0);
+}
+
+static void
+livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
+{
+ dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg);
+}
+
+/*
+ * Count the blocks in the livelists that have been destroyed by the user
+ * but haven't yet been freed.
+ */
+static void
+deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
+{
+ iterate_deleted_livelists(spa, livelist_count_blocks, zbc);
+}
+
+static void
+dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
+{
+ ASSERT3P(arg, ==, NULL);
+ global_feature_count[SPA_FEATURE_LIVELIST]++;
+ dump_blkptr_list(ll, "Deleted Livelist");
+ dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);
+}
+
+/*
+ * Print out, register object references to, and increment feature counts for
+ * livelists that have been destroyed by the user but haven't yet been freed.
+ */
+static void
+deleted_livelists_dump_mos(spa_t *spa)
+{
+ uint64_t zap_obj;
+ objset_t *mos = spa->spa_meta_objset;
+ int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
+ if (err == ENOENT)
+ return;
+ mos_obj_refd(zap_obj);
+ iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
+}
+
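+/*
+ * Traverse every block in the pool, tallying it by type and level, and
+ * cross-check the totals against the space maps unless leak detection
+ * was disabled with -L.
+ */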
+static int
+dump_block_stats(spa_t *spa)
+{
+ zdb_cb_t zcb;
+ zdb_blkstats_t *zb, *tzb;
+ uint64_t norm_alloc, norm_space, total_alloc, total_found;
+ int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
+ TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;
+ boolean_t leaks = B_FALSE;
+ int e, c, err;
+ bp_embedded_type_t i;
+
+ bzero(&zcb, sizeof (zcb));
+ (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
+ (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+ (dump_opt['c'] == 1) ? "metadata " : "",
+ dump_opt['c'] ? "checksums " : "",
+ (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
+ !dump_opt['L'] ? "nothing leaked " : "");
+
+ /*
+ * When leak detection is enabled we load all space maps as SM_ALLOC
+ * maps, then traverse the pool claiming each block we discover. If
+ * the pool is perfectly consistent, the segment trees will be empty
+ * when we're done. Anything left over is a leak; any block we can't
+ * claim (because it's not part of any space map) is a double
+ * allocation, reference to a freed block, or an unclaimed log block.
+ *
+ * When leak detection is disabled (-L option) we still traverse the
+ * pool claiming each block we discover, but we skip opening any space
+ * maps.
+ */
+ bzero(&zcb, sizeof (zdb_cb_t));
+ zdb_leak_init(spa, &zcb);
+
+ /*
+ * If there's a deferred-free bplist, process that first.
+ */
+ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
+ bpobj_count_block_cb, &zcb, NULL);
+
+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+ bpobj_count_block_cb, &zcb, NULL);
+ }
+
+ zdb_claim_removing(spa, &zcb);
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+ VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
+ spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
+ &zcb, NULL));
+ }
+
+ deleted_livelists_count_blocks(spa, &zcb);
+
+ if (dump_opt['c'] > 1)
+ flags |= TRAVERSE_PREFETCH_DATA;
+
+ zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
+ zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
+ zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
+ zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
+ err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
+
+ /*
+ * If we've traversed the data blocks then we need to wait for those
+ * I/Os to complete. We leverage "The Godfather" zio to wait on
+ * all async I/Os to complete.
+ */
+ if (dump_opt['c']) {
+ for (c = 0; c < max_ncpus; c++) {
+ (void) zio_wait(spa->spa_async_zio_root[c]);
+ spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+ }
+ }
+ ASSERT0(spa->spa_load_verify_bytes);
+
+ /*
+ * Done after zio_wait() since zcb_haderrors is modified in
+ * zdb_blkptr_done()
+ */
+ zcb.zcb_haderrors |= err;
+
+ if (zcb.zcb_haderrors) {
+ (void) printf("\nError counts:\n\n");
+ (void) printf("\t%5s %s\n", "errno", "count");
+ for (e = 0; e < 256; e++) {
+ if (zcb.zcb_errors[e] != 0) {
+ (void) printf("\t%5d %llu\n",
+ e, (u_longlong_t)zcb.zcb_errors[e]);
+ }
+ }
+ }
+
+ /*
+ * Report any leaked segments.
+ */
+ leaks |= zdb_leak_fini(spa, &zcb);
+
+ tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
+
+ norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ norm_space = metaslab_class_get_space(spa_normal_class(spa));
+
+ total_alloc = norm_alloc +
+ metaslab_class_get_alloc(spa_log_class(spa)) +
+ metaslab_class_get_alloc(spa_special_class(spa)) +
+ metaslab_class_get_alloc(spa_dedup_class(spa)) +
+ get_unflushed_alloc_space(spa);
+ total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
+ zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
+
+ if (total_found == total_alloc && !dump_opt['L']) {
+ (void) printf("\n\tNo leaks (block sum matches space"
+ " maps exactly)\n");
+ } else if (!dump_opt['L']) {
+ (void) printf("block traversal size %llu != alloc %llu "
+ "(%s %lld)\n",
+ (u_longlong_t)total_found,
+ (u_longlong_t)total_alloc,
+ (dump_opt['L']) ? "unreachable" : "leaked",
+ (longlong_t)(total_alloc - total_found));
+ leaks = B_TRUE;
+ }
+
+ if (tzb->zb_count == 0)
+ return (2);
+
+ (void) printf("\n");
+ (void) printf("\t%-16s %14llu\n", "bp count:",
+ (u_longlong_t)tzb->zb_count);
+ (void) printf("\t%-16s %14llu\n", "ganged count:",
+ (longlong_t)tzb->zb_gangs);
+ (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:",
+ (u_longlong_t)tzb->zb_lsize,
+ (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
+ (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
+ "bp physical:", (u_longlong_t)tzb->zb_psize,
+ (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
+ (double)tzb->zb_lsize / tzb->zb_psize);
+ (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n",
+ "bp allocated:", (u_longlong_t)tzb->zb_asize,
+ (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
+ (double)tzb->zb_lsize / tzb->zb_asize);
+ (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n",
+ "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize,
+ (u_longlong_t)zcb.zcb_dedup_blocks,
+ (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
+ (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:",
+ (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
+
+ if (spa_special_class(spa)->mc_rotor != NULL) {
+ uint64_t alloc = metaslab_class_get_alloc(
+ spa_special_class(spa));
+ uint64_t space = metaslab_class_get_space(
+ spa_special_class(spa));
+
+ (void) printf("\t%-16s %14llu used: %5.2f%%\n",
+ "Special class", (u_longlong_t)alloc,
+ 100.0 * alloc / space);
+ }
+
+ if (spa_dedup_class(spa)->mc_rotor != NULL) {
+ uint64_t alloc = metaslab_class_get_alloc(
+ spa_dedup_class(spa));
+ uint64_t space = metaslab_class_get_space(
+ spa_dedup_class(spa));
+
+ (void) printf("\t%-16s %14llu used: %5.2f%%\n",
+ "Dedup class", (u_longlong_t)alloc,
+ 100.0 * alloc / space);
+ }
+
+ for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
+ if (zcb.zcb_embedded_blocks[i] == 0)
+ continue;
+ (void) printf("\n");
+ (void) printf("\tadditional, non-pointer bps of type %u: "
+ "%10llu\n",
+ i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
+
+ if (dump_opt['b'] >= 3) {
+ (void) printf("\t number of (compressed) bytes: "
+ "number of bps\n");
+ dump_histogram(zcb.zcb_embedded_histogram[i],
+ sizeof (zcb.zcb_embedded_histogram[i]) /
+ sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
+ }
+ }
+
+ if (tzb->zb_ditto_samevdev != 0) {
+ (void) printf("\tDittoed blocks on same vdev: %llu\n",
+ (longlong_t)tzb->zb_ditto_samevdev);
+ }
+ if (tzb->zb_ditto_same_ms != 0) {
+ (void) printf("\tDittoed blocks in same metaslab: %llu\n",
+ (longlong_t)tzb->zb_ditto_same_ms);
+ }
+
+ for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ if (vim == NULL) {
+ continue;
+ }
+
+ char mem[32];
+ zdb_nicenum(vdev_indirect_mapping_num_entries(vim),
+ mem, vdev_indirect_mapping_size(vim));
+
+ (void) printf("\tindirect vdev id %llu has %llu segments "
+ "(%s in memory)\n",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
+ }
+
+ if (dump_opt['b'] >= 2) {
+ int l, t, level;
+ (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
+ "\t avg\t comp\t%%Total\tType\n");
+
+ for (t = 0; t <= ZDB_OT_TOTAL; t++) {
+ char csize[32], lsize[32], psize[32], asize[32];
+ char avg[32], gang[32];
+ const char *typename;
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ);
+
+ if (t < DMU_OT_NUMTYPES)
+ typename = dmu_ot[t].ot_name;
+ else
+ typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
+
+ if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
+ (void) printf("%6s\t%5s\t%5s\t%5s"
+ "\t%5s\t%5s\t%6s\t%s\n",
+ "-",
+ "-",
+ "-",
+ "-",
+ "-",
+ "-",
+ "-",
+ typename);
+ continue;
+ }
+
+ for (l = ZB_TOTAL - 1; l >= -1; l--) {
+ level = (l == -1 ? ZB_TOTAL : l);
+ zb = &zcb.zcb_type[level][t];
+
+ if (zb->zb_asize == 0)
+ continue;
+
+ if (dump_opt['b'] < 3 && level != ZB_TOTAL)
+ continue;
+
+ if (level == 0 && zb->zb_asize ==
+ zcb.zcb_type[ZB_TOTAL][t].zb_asize)
+ continue;
+
+ zdb_nicenum(zb->zb_count, csize,
+ sizeof (csize));
+ zdb_nicenum(zb->zb_lsize, lsize,
+ sizeof (lsize));
+ zdb_nicenum(zb->zb_psize, psize,
+ sizeof (psize));
+ zdb_nicenum(zb->zb_asize, asize,
+ sizeof (asize));
+ zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
+ sizeof (avg));
+ zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
+
+ (void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
+ "\t%5.2f\t%6.2f\t",
+ csize, lsize, psize, asize, avg,
+ (double)zb->zb_lsize / zb->zb_psize,
+ 100.0 * zb->zb_asize / tzb->zb_asize);
+
+ if (level == ZB_TOTAL)
+ (void) printf("%s\n", typename);
+ else
+ (void) printf(" L%d %s\n",
+ level, typename);
+
+ if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
+ (void) printf("\t number of ganged "
+ "blocks: %s\n", gang);
+ }
+
+ if (dump_opt['b'] >= 4) {
+ (void) printf("psize "
+ "(in 512-byte sectors): "
+ "number of blocks\n");
+ dump_histogram(zb->zb_psize_histogram,
+ PSIZE_HISTO_SIZE, 0);
+ }
+ }
+ }
+
+ /* Output a table summarizing block sizes in the pool */
+ if (dump_opt['b'] >= 2) {
+ dump_size_histograms(&zcb);
+ }
+ }
+
+ (void) printf("\n");
+
+ if (leaks)
+ return (2);
+
+ if (zcb.zcb_haderrors)
+ return (3);
+
+ return (0);
+}
+
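+/*
+ * In-core entry used to simulate a DDT for 'zdb -S': one entry per unique
+ * block key, accumulating reference counts and sizes.
+ */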
+typedef struct zdb_ddt_entry {
+ ddt_key_t zdde_key;
+ uint64_t zdde_ref_blocks;
+ uint64_t zdde_ref_lsize;
+ uint64_t zdde_ref_psize;
+ uint64_t zdde_ref_dsize;
+ avl_node_t zdde_node;
+} zdb_ddt_entry_t;
+
+/* ARGSUSED */
+static int
+zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ avl_tree_t *t = arg;
+ avl_index_t where;
+ zdb_ddt_entry_t *zdde, zdde_search;
+
+ if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
+ BP_IS_EMBEDDED(bp))
+ return (0);
+
+ if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
+ (void) printf("traversing objset %llu, %llu objects, "
+ "%lu blocks so far\n",
+ (u_longlong_t)zb->zb_objset,
+ (u_longlong_t)BP_GET_FILL(bp),
+ avl_numnodes(t));
+ }
+
+ if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
+ BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
+ return (0);
+
+ ddt_key_fill(&zdde_search.zdde_key, bp);
+
+ zdde = avl_find(t, &zdde_search, &where);
+
+ if (zdde == NULL) {
+ zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
+ zdde->zdde_key = zdde_search.zdde_key;
+ avl_insert(t, zdde, where);
+ }
+
+ zdde->zdde_ref_blocks += 1;
+ zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
+ zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
+ zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
+
+ return (0);
+}
+
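+/*
+ * Traverse the pool, build the simulated DDT described above, and print the
+ * resulting histogram and dedup ratio.
+ */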
+static void
+dump_simulated_ddt(spa_t *spa)
+{
+ avl_tree_t t;
+ void *cookie = NULL;
+ zdb_ddt_entry_t *zdde;
+ ddt_histogram_t ddh_total;
+ ddt_stat_t dds_total;
+
+ bzero(&ddh_total, sizeof (ddh_total));
+ bzero(&dds_total, sizeof (dds_total));
+ avl_create(&t, ddt_entry_compare,
+ sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
+ TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
+ ddt_stat_t dds;
+ uint64_t refcnt = zdde->zdde_ref_blocks;
+ ASSERT(refcnt != 0);
+
+ dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
+ dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
+ dds.dds_psize = zdde->zdde_ref_psize / refcnt;
+ dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
+
+ dds.dds_ref_blocks = zdde->zdde_ref_blocks;
+ dds.dds_ref_lsize = zdde->zdde_ref_lsize;
+ dds.dds_ref_psize = zdde->zdde_ref_psize;
+ dds.dds_ref_dsize = zdde->zdde_ref_dsize;
+
+ ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
+ &dds, 0);
+
+ umem_free(zdde, sizeof (*zdde));
+ }
+
+ avl_destroy(&t);
+
+ ddt_histogram_stat(&dds_total, &ddh_total);
+
+ (void) printf("Simulated DDT histogram:\n");
+
+ zpool_dump_ddt(&dds_total, &ddh_total);
+
+ dump_dedup_ratio(&dds_total);
+}
+
+static int
+verify_device_removal_feature_counts(spa_t *spa)
+{
+ uint64_t dr_feature_refcount = 0;
+ uint64_t oc_feature_refcount = 0;
+ uint64_t indirect_vdev_count = 0;
+ uint64_t precise_vdev_count = 0;
+ uint64_t obsolete_counts_object_count = 0;
+ uint64_t obsolete_sm_count = 0;
+ uint64_t obsolete_counts_count = 0;
+ uint64_t scip_count = 0;
+ uint64_t obsolete_bpobj_count = 0;
+ int ret = 0;
+
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ if (scip->scip_next_mapping_object != 0) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
+ ASSERT(scip->scip_prev_obsolete_sm_object != 0);
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ (void) printf("Condensing indirect vdev %llu: new mapping "
+ "object %llu, prev obsolete sm %llu\n",
+ (u_longlong_t)scip->scip_vdev,
+ (u_longlong_t)scip->scip_next_mapping_object,
+ (u_longlong_t)scip->scip_prev_obsolete_sm_object);
+ if (scip->scip_prev_obsolete_sm_object != 0) {
+ space_map_t *prev_obsolete_sm = NULL;
+ VERIFY0(space_map_open(&prev_obsolete_sm,
+ spa->spa_meta_objset,
+ scip->scip_prev_obsolete_sm_object,
+ 0, vd->vdev_asize, 0));
+ dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
+ (void) printf("\n");
+ space_map_close(prev_obsolete_sm);
+ }
+
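+		/*
+		 * A condense in progress accounts for two objects: the new
+		 * mapping object and the previous obsolete space map.
+		 */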
+ scip_count += 2;
+ }
+
+ for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ if (vic->vic_mapping_object != 0) {
+ ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
+ vd->vdev_removing);
+ indirect_vdev_count++;
+
+ if (vd->vdev_indirect_mapping->vim_havecounts) {
+ obsolete_counts_count++;
+ }
+ }
+
+ boolean_t are_precise;
+ VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+ if (are_precise) {
+ ASSERT(vic->vic_mapping_object != 0);
+ precise_vdev_count++;
+ }
+
+ uint64_t obsolete_sm_object;
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ if (obsolete_sm_object != 0) {
+ ASSERT(vic->vic_mapping_object != 0);
+ obsolete_sm_count++;
+ }
+ }
+
+ (void) feature_get_refcount(spa,
+ &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
+ &dr_feature_refcount);
+ (void) feature_get_refcount(spa,
+ &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
+ &oc_feature_refcount);
+
+ if (dr_feature_refcount != indirect_vdev_count) {
+ ret = 1;
+ (void) printf("Number of indirect vdevs (%llu) " \
+ "does not match feature count (%llu)\n",
+ (u_longlong_t)indirect_vdev_count,
+ (u_longlong_t)dr_feature_refcount);
+ } else {
+ (void) printf("Verified device_removal feature refcount " \
+ "of %llu is correct\n",
+ (u_longlong_t)dr_feature_refcount);
+ }
+
+ if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ) == 0) {
+ obsolete_bpobj_count++;
+ }
+
+ obsolete_counts_object_count = precise_vdev_count;
+ obsolete_counts_object_count += obsolete_sm_count;
+ obsolete_counts_object_count += obsolete_counts_count;
+ obsolete_counts_object_count += scip_count;
+ obsolete_counts_object_count += obsolete_bpobj_count;
+ obsolete_counts_object_count += remap_deadlist_count;
+
+ if (oc_feature_refcount != obsolete_counts_object_count) {
+ ret = 1;
+ (void) printf("Number of obsolete counts objects (%llu) " \
+ "does not match feature count (%llu)\n",
+ (u_longlong_t)obsolete_counts_object_count,
+ (u_longlong_t)oc_feature_refcount);
+ (void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
+ "ob:%llu rd:%llu\n",
+ (u_longlong_t)precise_vdev_count,
+ (u_longlong_t)obsolete_sm_count,
+ (u_longlong_t)obsolete_counts_count,
+ (u_longlong_t)scip_count,
+ (u_longlong_t)obsolete_bpobj_count,
+ (u_longlong_t)remap_deadlist_count);
+ } else {
+ (void) printf("Verified indirect_refcount feature refcount " \
+ "of %llu is correct\n",
+ (u_longlong_t)oc_feature_refcount);
+ }
+ return (ret);
+}
+
+static void
+zdb_set_skip_mmp(char *target)
+{
+ spa_t *spa;
+
+ /*
+ * Disable the activity check to allow examination of
+ * active pools.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(target)) != NULL) {
+ spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+#define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
+/*
+ * Import the checkpointed state of the pool specified by the target
+ * parameter as readonly. The function also accepts a pool config
+ * as an optional parameter; otherwise it attempts to infer the config
+ * from the name of the target pool.
+ *
+ * Note that the checkpointed state's pool name will be the name of
+ * the original pool with the above suffix appended to it. In addition,
+ * if the target is not a pool name (e.g. a path to a dataset) then
+ * the new_path parameter is populated with the updated path to
+ * reflect the fact that we are looking into the checkpointed state.
+ *
+ * The function returns a newly-allocated copy of the name of the
+ * pool containing the checkpointed state. When this copy is no
+ * longer needed it should be freed with free(3C). Same thing
+ * applies to the new_path parameter if allocated.
+ */
+static char *
+import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
+{
+ int error = 0;
+ char *poolname, *bogus_name = NULL;
+
+	/* If the target is not a pool, then extract the pool name */
+ char *path_start = strchr(target, '/');
+ if (path_start != NULL) {
+ size_t poolname_len = path_start - target;
+ poolname = strndup(target, poolname_len);
+ } else {
+ poolname = target;
+ }
+
+ if (cfg == NULL) {
+ zdb_set_skip_mmp(poolname);
+ error = spa_get_stats(poolname, &cfg, NULL, 0);
+ if (error != 0) {
+ fatal("Tried to read config of pool \"%s\" but "
+ "spa_get_stats() failed with error %d\n",
+ poolname, error);
+ }
+ }
+
+ if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1)
+ return (NULL);
+ fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
+
+ error = spa_import(bogus_name, cfg, NULL,
+ ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
+ ZFS_IMPORT_SKIP_MMP);
+ if (error != 0) {
+ fatal("Tried to import pool \"%s\" but spa_import() failed "
+ "with error %d\n", bogus_name, error);
+ }
+
+ if (new_path != NULL && path_start != NULL) {
+ if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
+ if (path_start != NULL)
+ free(poolname);
+ return (NULL);
+ }
+ }
+
+ if (target != poolname)
+ free(poolname);
+
+ return (bogus_name);
+}
+
+typedef struct verify_checkpoint_sm_entry_cb_arg {
+ vdev_t *vcsec_vd;
+
+ /* the following fields are only used for printing progress */
+ uint64_t vcsec_entryid;
+ uint64_t vcsec_num_entries;
+} verify_checkpoint_sm_entry_cb_arg_t;
+
+#define ENTRIES_PER_PROGRESS_UPDATE 10000
+
+static int
+verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
+{
+ verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
+ vdev_t *vd = vcsec->vcsec_vd;
+ metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+ uint64_t end = sme->sme_offset + sme->sme_run;
+
+ ASSERT(sme->sme_type == SM_FREE);
+
+ if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
+ (void) fprintf(stderr,
+ "\rverifying vdev %llu, space map entry %llu of %llu ...",
+ (longlong_t)vd->vdev_id,
+ (longlong_t)vcsec->vcsec_entryid,
+ (longlong_t)vcsec->vcsec_num_entries);
+ }
+ vcsec->vcsec_entryid++;
+
+ /*
+ * See comment in checkpoint_sm_exclude_entry_cb()
+ */
+ VERIFY3U(sme->sme_offset, >=, ms->ms_start);
+ VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+ /*
+ * The entries in the vdev_checkpoint_sm should be marked as
+ * allocated in the checkpointed state of the pool, therefore
+	 * their respective ms_allocatable trees should not contain them.
+ */
+ mutex_enter(&ms->ms_lock);
+ range_tree_verify_not_present(ms->ms_allocatable,
+ sme->sme_offset, sme->sme_run);
+ mutex_exit(&ms->ms_lock);
+
+ return (0);
+}
+
+/*
+ * Verify that all segments in the vdev_checkpoint_sm are allocated
+ * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
+ * ms_allocatable).
+ *
+ * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
+ * each vdev in the current state of the pool to the metaslab space maps
+ * (ms_sm) of the checkpointed state of the pool.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of the current spa_t. The entries of these ms_allocatable
+ * trees are cleared out and then repopulated with the free
+ * entries of their respective ms_sm space maps.
+ */
+static void
+verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+ vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+ vdev_t *current_rvd = current->spa_root_vdev;
+
+ load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
+
+ for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
+ vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
+ vdev_t *current_vd = current_rvd->vdev_child[c];
+
+ space_map_t *checkpoint_sm = NULL;
+ uint64_t checkpoint_sm_obj;
+
+ if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * Since we don't allow device removal in a pool
+ * that has a checkpoint, we expect that all removed
+ * vdevs were removed from the pool before the
+ * checkpoint.
+ */
+ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+ continue;
+ }
+
+ /*
+ * If the checkpoint space map doesn't exist, then nothing
+ * here is checkpointed so there's nothing to verify.
+ */
+ if (current_vd->vdev_top_zap == 0 ||
+ zap_contains(spa_meta_objset(current),
+ current_vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+ continue;
+
+ VERIFY0(zap_lookup(spa_meta_objset(current),
+ current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+ sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+ VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
+ checkpoint_sm_obj, 0, current_vd->vdev_asize,
+ current_vd->vdev_ashift));
+
+ verify_checkpoint_sm_entry_cb_arg_t vcsec;
+ vcsec.vcsec_vd = ckpoint_vd;
+ vcsec.vcsec_entryid = 0;
+ vcsec.vcsec_num_entries =
+ space_map_length(checkpoint_sm) / sizeof (uint64_t);
+ VERIFY0(space_map_iterate(checkpoint_sm,
+ space_map_length(checkpoint_sm),
+ verify_checkpoint_sm_entry_cb, &vcsec));
+ if (dump_opt['m'] > 3)
+ dump_spacemap(current->spa_meta_objset, checkpoint_sm);
+ space_map_close(checkpoint_sm);
+ }
+
+ /*
+ * If we've added vdevs since we took the checkpoint, ensure
+ * that their checkpoint space maps are empty.
+ */
+ if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
+ for (uint64_t c = ckpoint_rvd->vdev_children;
+ c < current_rvd->vdev_children; c++) {
+ vdev_t *current_vd = current_rvd->vdev_child[c];
+ ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
+ }
+ }
+
+ /* for cleaner progress output */
+ (void) fprintf(stderr, "\n");
+}
+
+/*
+ * Verifies that all space that's allocated in the checkpoint is
+ * still allocated in the current version, by checking that everything
+ * in the checkpoint's ms_allocatable (which is actually allocated, not
+ * allocatable/free) is not present in the current state's ms_allocatable.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of both spas when called. The entries of all ms_allocatable
+ * trees are cleared out and then repopulated from their respective
+ * ms_sm space maps. In the checkpointed state we load the allocated
+ * entries, and in the current state we load the free entries.
+ */
+static void
+verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+ vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+ vdev_t *current_rvd = current->spa_root_vdev;
+
+ load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
+ load_concrete_ms_allocatable_trees(current, SM_FREE);
+
+ for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
+ vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
+ vdev_t *current_vd = current_rvd->vdev_child[i];
+
+ if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * See comment in verify_checkpoint_vdev_spacemaps()
+ */
+ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+ continue;
+ }
+
+ for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
+ metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
+ metaslab_t *current_msp = current_vd->vdev_ms[m];
+
+ (void) fprintf(stderr,
+ "\rverifying vdev %llu of %llu, "
+ "metaslab %llu of %llu ...",
+ (longlong_t)current_vd->vdev_id,
+ (longlong_t)current_rvd->vdev_children,
+ (longlong_t)current_vd->vdev_ms[m]->ms_id,
+ (longlong_t)current_vd->vdev_ms_count);
+
+ /*
+ * We walk through the ms_allocatable trees that
+ * are loaded with the allocated blocks from the
+			 * ms_sm spacemaps of the checkpoint. For each of
+			 * these ranges we ensure that it does not exist in
+			 * the ms_allocatable trees of the current state,
+			 * which are loaded with the ranges that are
+			 * currently free.
+ *
+ * This way we ensure that none of the blocks that
+ * are part of the checkpoint were freed by mistake.
+ */
+ range_tree_walk(ckpoint_msp->ms_allocatable,
+ (range_tree_func_t *)range_tree_verify_not_present,
+ current_msp->ms_allocatable);
+ }
+ }
+
+ /* for cleaner progress output */
+ (void) fprintf(stderr, "\n");
+}
+
+static void
+verify_checkpoint_blocks(spa_t *spa)
+{
+ ASSERT(!dump_opt['L']);
+
+ spa_t *checkpoint_spa;
+ char *checkpoint_pool;
+ nvlist_t *config = NULL;
+ int error = 0;
+
+ /*
+ * We import the checkpointed state of the pool (under a different
+	 * name) so we can verify it against the current state of the pool.
+ */
+ checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
+ NULL);
+ ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
+
+ error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
+ if (error != 0) {
+ fatal("Tried to open pool \"%s\" but spa_open() failed with "
+ "error %d\n", checkpoint_pool, error);
+ }
+
+ /*
+ * Ensure that ranges in the checkpoint space maps of each vdev
+ * are allocated according to the checkpointed state's metaslab
+ * space maps.
+ */
+ verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
+
+ /*
+ * Ensure that allocated ranges in the checkpoint's metaslab
+ * space maps remain allocated in the metaslab space maps of
+ * the current state.
+ */
+ verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
+
+ /*
+ * Once we are done, we get rid of the checkpointed state.
+ */
+ spa_close(checkpoint_spa, FTAG);
+ free(checkpoint_pool);
+}
+
+static void
+dump_leftover_checkpoint_blocks(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+
+ space_map_t *checkpoint_sm = NULL;
+ uint64_t checkpoint_sm_obj;
+
+ if (vd->vdev_top_zap == 0)
+ continue;
+
+ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+ continue;
+
+ VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+ sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+ VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+ checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+ dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
+ space_map_close(checkpoint_sm);
+ }
+}
+
+static int
+verify_checkpoint(spa_t *spa)
+{
+ uberblock_t checkpoint;
+ int error;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (0);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+ sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+ if (error == ENOENT && !dump_opt['L']) {
+ /*
+ * If the feature is active but the uberblock is missing
+ * then we must be in the middle of discarding the
+ * checkpoint.
+ */
+ (void) printf("\nPartially discarded checkpoint "
+ "state found:\n");
+ if (dump_opt['m'] > 3)
+ dump_leftover_checkpoint_blocks(spa);
+ return (0);
+ } else if (error != 0) {
+ (void) printf("lookup error %d when looking for "
+ "checkpointed uberblock in MOS\n", error);
+ return (error);
+ }
+ dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
+
+ if (checkpoint.ub_checkpoint_txg == 0) {
+ (void) printf("\nub_checkpoint_txg not set in checkpointed "
+ "uberblock\n");
+ error = 3;
+ }
+
+ if (error == 0 && !dump_opt['L'])
+ verify_checkpoint_blocks(spa);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static void
+mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
+{
+ for (uint64_t i = start; i < size; i++) {
+ (void) printf("MOS object %llu referenced but not allocated\n",
+ (u_longlong_t)i);
+ }
+}
+
+static void
+mos_obj_refd(uint64_t obj)
+{
+ if (obj != 0 && mos_refd_objs != NULL)
+ range_tree_add(mos_refd_objs, obj, 1);
+}
+
+/*
+ * Call on a MOS object that may already have been referenced.
+ */
+static void
+mos_obj_refd_multiple(uint64_t obj)
+{
+ if (obj != 0 && mos_refd_objs != NULL &&
+ !range_tree_contains(mos_refd_objs, obj, 1))
+ range_tree_add(mos_refd_objs, obj, 1);
+}
+
+static void
+mos_leak_vdev_top_zap(vdev_t *vd)
+{
+ uint64_t ms_flush_data_obj;
+ int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
+ vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
+ sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj);
+ if (error == ENOENT)
+ return;
+ ASSERT0(error);
+
+ mos_obj_refd(ms_flush_data_obj);
+}
+
+static void
+mos_leak_vdev(vdev_t *vd)
+{
+ mos_obj_refd(vd->vdev_dtl_object);
+ mos_obj_refd(vd->vdev_ms_array);
+ mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
+ mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
+ mos_obj_refd(vd->vdev_leaf_zap);
+ if (vd->vdev_checkpoint_sm != NULL)
+ mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
+ if (vd->vdev_indirect_mapping != NULL) {
+ mos_obj_refd(vd->vdev_indirect_mapping->
+ vim_phys->vimp_counts_object);
+ }
+ if (vd->vdev_obsolete_sm != NULL)
+ mos_obj_refd(vd->vdev_obsolete_sm->sm_object);
+
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *ms = vd->vdev_ms[m];
+ mos_obj_refd(space_map_object(ms->ms_sm));
+ }
+
+ if (vd->vdev_top_zap != 0) {
+ mos_obj_refd(vd->vdev_top_zap);
+ mos_leak_vdev_top_zap(vd);
+ }
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ mos_leak_vdev(vd->vdev_child[c]);
+ }
+}
+
+static void
+mos_leak_log_spacemaps(spa_t *spa)
+{
+ uint64_t spacemap_zap;
+ int error = zap_lookup(spa_meta_objset(spa),
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP,
+ sizeof (spacemap_zap), 1, &spacemap_zap);
+ if (error == ENOENT)
+ return;
+ ASSERT0(error);
+
+ mos_obj_refd(spacemap_zap);
+ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+ sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls))
+ mos_obj_refd(sls->sls_sm_obj);
+}
+
+static int
+dump_mos_leaks(spa_t *spa)
+{
+ int rv = 0;
+ objset_t *mos = spa->spa_meta_objset;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+
+ /* Visit and mark all referenced objects in the MOS */
+
+ mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
+ mos_obj_refd(spa->spa_pool_props_object);
+ mos_obj_refd(spa->spa_config_object);
+ mos_obj_refd(spa->spa_ddt_stat_object);
+ mos_obj_refd(spa->spa_feat_desc_obj);
+ mos_obj_refd(spa->spa_feat_enabled_txg_obj);
+ mos_obj_refd(spa->spa_feat_for_read_obj);
+ mos_obj_refd(spa->spa_feat_for_write_obj);
+ mos_obj_refd(spa->spa_history);
+ mos_obj_refd(spa->spa_errlog_last);
+ mos_obj_refd(spa->spa_errlog_scrub);
+ mos_obj_refd(spa->spa_all_vdev_zaps);
+ mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
+ mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
+ mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
+ bpobj_count_refd(&spa->spa_deferred_bpobj);
+ mos_obj_refd(dp->dp_empty_bpobj);
+ bpobj_count_refd(&dp->dp_obsolete_bpobj);
+ bpobj_count_refd(&dp->dp_free_bpobj);
+ mos_obj_refd(spa->spa_l2cache.sav_object);
+ mos_obj_refd(spa->spa_spares.sav_object);
+
+ if (spa->spa_syncing_log_sm != NULL)
+ mos_obj_refd(spa->spa_syncing_log_sm->sm_object);
+ mos_leak_log_spacemaps(spa);
+
+ mos_obj_refd(spa->spa_condensing_indirect_phys.
+ scip_next_mapping_object);
+ mos_obj_refd(spa->spa_condensing_indirect_phys.
+ scip_prev_obsolete_sm_object);
+ if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
+ vdev_indirect_mapping_t *vim =
+ vdev_indirect_mapping_open(mos,
+ spa->spa_condensing_indirect_phys.scip_next_mapping_object);
+ mos_obj_refd(vim->vim_phys->vimp_counts_object);
+ vdev_indirect_mapping_close(vim);
+ }
+ deleted_livelists_dump_mos(spa);
+
+ if (dp->dp_origin_snap != NULL) {
+ dsl_dataset_t *ds;
+
+ dsl_pool_config_enter(dp, FTAG);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
+ FTAG, &ds));
+ count_ds_mos_objects(ds);
+ dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+
+ count_ds_mos_objects(dp->dp_origin_snap);
+ dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");
+ }
+ count_dir_mos_objects(dp->dp_mos_dir);
+ if (dp->dp_free_dir != NULL)
+ count_dir_mos_objects(dp->dp_free_dir);
+ if (dp->dp_leak_dir != NULL)
+ count_dir_mos_objects(dp->dp_leak_dir);
+
+ mos_leak_vdev(spa->spa_root_vdev);
+
+ for (uint64_t class = 0; class < DDT_CLASSES; class++) {
+ for (uint64_t type = 0; type < DDT_TYPES; type++) {
+ for (uint64_t cksum = 0;
+ cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
+ ddt_t *ddt = spa->spa_ddt[cksum];
+ mos_obj_refd(ddt->ddt_object[type][class]);
+ }
+ }
+ }
+
+ /*
+ * Visit all allocated objects and make sure they are referenced.
+ */
+ uint64_t object = 0;
+ while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
+ if (range_tree_contains(mos_refd_objs, object, 1)) {
+ range_tree_remove(mos_refd_objs, object, 1);
+ } else {
+ dmu_object_info_t doi;
+ const char *name;
+ dmu_object_info(mos, object, &doi);
+ if (doi.doi_type & DMU_OT_NEWTYPE) {
+ dmu_object_byteswap_t bswap =
+ DMU_OT_BYTESWAP(doi.doi_type);
+ name = dmu_ot_byteswap[bswap].ob_name;
+ } else {
+ name = dmu_ot[doi.doi_type].ot_name;
+ }
+
+ (void) printf("MOS object %llu (%s) leaked\n",
+ (u_longlong_t)object, name);
+ rv = 2;
+ }
+ }
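+	/* Whatever remains in mos_refd_objs was referenced but never allocated. */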
+ (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
+ if (!range_tree_is_empty(mos_refd_objs))
+ rv = 2;
+ range_tree_vacate(mos_refd_objs, NULL, NULL);
+ range_tree_destroy(mos_refd_objs);
+ return (rv);
+}
+
+typedef struct log_sm_obsolete_stats_arg {
+ uint64_t lsos_current_txg;
+
+ uint64_t lsos_total_entries;
+ uint64_t lsos_valid_entries;
+
+ uint64_t lsos_sm_entries;
+ uint64_t lsos_valid_sm_entries;
+} log_sm_obsolete_stats_arg_t;
+
+static int
+log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme,
+ uint64_t txg, void *arg)
+{
+ log_sm_obsolete_stats_arg_t *lsos = arg;
+
+ uint64_t offset = sme->sme_offset;
+ uint64_t vdev_id = sme->sme_vdev;
+
+ if (lsos->lsos_current_txg == 0) {
+ /* this is the first log */
+ lsos->lsos_current_txg = txg;
+ } else if (lsos->lsos_current_txg < txg) {
+ /* we just changed log - print stats and reset */
+ (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
+ (u_longlong_t)lsos->lsos_valid_sm_entries,
+ (u_longlong_t)lsos->lsos_sm_entries,
+ (u_longlong_t)lsos->lsos_current_txg);
+ lsos->lsos_valid_sm_entries = 0;
+ lsos->lsos_sm_entries = 0;
+ lsos->lsos_current_txg = txg;
+ }
+ ASSERT3U(lsos->lsos_current_txg, ==, txg);
+
+ lsos->lsos_sm_entries++;
+ lsos->lsos_total_entries++;
+
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ if (!vdev_is_concrete(vd))
+ return (0);
+
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
+
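+	/*
+	 * Entries from logs older than the metaslab's unflushed txg have
+	 * already been flushed into ms_sm and are therefore obsolete.
+	 */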
+ if (txg < metaslab_unflushed_txg(ms))
+ return (0);
+ lsos->lsos_valid_sm_entries++;
+ lsos->lsos_valid_entries++;
+ return (0);
+}
+
+static void
+dump_log_spacemap_obsolete_stats(spa_t *spa)
+{
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ log_sm_obsolete_stats_arg_t lsos;
+ bzero(&lsos, sizeof (lsos));
+
+ (void) printf("Log Space Map Obsolete Entry Statistics:\n");
+
+ iterate_through_spacemap_logs(spa,
+ log_spacemap_obsolete_stats_cb, &lsos);
+
+ /* print stats for latest log */
+ (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
+ (u_longlong_t)lsos.lsos_valid_sm_entries,
+ (u_longlong_t)lsos.lsos_sm_entries,
+ (u_longlong_t)lsos.lsos_current_txg);
+
+ (void) printf("%-8llu valid entries out of %-8llu - total\n\n",
+ (u_longlong_t)lsos.lsos_valid_entries,
+ (u_longlong_t)lsos.lsos_total_entries);
+}
+
+static void
+dump_zpool(spa_t *spa)
+{
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ int rc = 0;
+
+ if (dump_opt['y']) {
+ livelist_metaslab_validate(spa);
+ }
+
+ if (dump_opt['S']) {
+ dump_simulated_ddt(spa);
+ return;
+ }
+
+ if (!dump_opt['e'] && dump_opt['C'] > 1) {
+ (void) printf("\nCached configuration:\n");
+ dump_nvlist(spa->spa_config, 8);
+ }
+
+ if (dump_opt['C'])
+ dump_config(spa);
+
+ if (dump_opt['u'])
+ dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
+
+ if (dump_opt['D'])
+ dump_all_ddts(spa);
+
+ if (dump_opt['d'] > 2 || dump_opt['m'])
+ dump_metaslabs(spa);
+ if (dump_opt['M'])
+ dump_metaslab_groups(spa);
+ if (dump_opt['d'] > 2 || dump_opt['m']) {
+ dump_log_spacemaps(spa);
+ dump_log_spacemap_obsolete_stats(spa);
+ }
+
+ if (dump_opt['d'] || dump_opt['i']) {
+ spa_feature_t f;
+ mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
+ 0);
+ dump_objset(dp->dp_meta_objset);
+
+ if (dump_opt['d'] >= 3) {
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dump_full_bpobj(&spa->spa_deferred_bpobj,
+ "Deferred frees", 0);
+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ dump_full_bpobj(&dp->dp_free_bpobj,
+ "Pool snapshot frees", 0);
+ }
+ if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_DEVICE_REMOVAL));
+ dump_full_bpobj(&dp->dp_obsolete_bpobj,
+ "Pool obsolete blocks", 0);
+ }
+
+ if (spa_feature_is_active(spa,
+ SPA_FEATURE_ASYNC_DESTROY)) {
+ dump_bptree(spa->spa_meta_objset,
+ dp->dp_bptree_obj,
+ "Pool dataset frees");
+ }
+ dump_dtl(spa->spa_root_vdev, 0);
+ }
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++)
+ global_feature_count[f] = UINT64_MAX;
+ global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
+ global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
+ global_feature_count[SPA_FEATURE_LIVELIST] = 0;
+
+ (void) dmu_objset_find(spa_name(spa), dump_one_objset,
+ NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+
+ if (rc == 0 && !dump_opt['L'])
+ rc = dump_mos_leaks(spa);
+
+ for (f = 0; f < SPA_FEATURES; f++) {
+ uint64_t refcount;
+
+ uint64_t *arr;
+ if (!(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET)) {
+ if (global_feature_count[f] == UINT64_MAX)
+ continue;
+ if (!spa_feature_is_enabled(spa, f)) {
+ ASSERT0(global_feature_count[f]);
+ continue;
+ }
+ arr = global_feature_count;
+ } else {
+ if (!spa_feature_is_enabled(spa, f)) {
+ ASSERT0(dataset_feature_count[f]);
+ continue;
+ }
+ arr = dataset_feature_count;
+ }
+ if (feature_get_refcount(spa, &spa_feature_table[f],
+ &refcount) == ENOTSUP)
+ continue;
+ if (arr[f] != refcount) {
+ (void) printf("%s feature refcount mismatch: "
+ "%lld consumers != %lld refcount\n",
+ spa_feature_table[f].fi_uname,
+ (longlong_t)arr[f], (longlong_t)refcount);
+ rc = 2;
+ } else {
+ (void) printf("Verified %s feature refcount "
+ "of %llu is correct\n",
+ spa_feature_table[f].fi_uname,
+ (longlong_t)refcount);
+ }
+ }
+
+ if (rc == 0)
+ rc = verify_device_removal_feature_counts(spa);
+ }
+
+ if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
+ rc = dump_block_stats(spa);
+
+ if (rc == 0)
+ rc = verify_spacemap_refcounts(spa);
+
+ if (dump_opt['s'])
+ show_pool_stats(spa);
+
+ if (dump_opt['h'])
+ dump_history(spa);
+
+ if (rc == 0)
+ rc = verify_checkpoint(spa);
+
+ if (rc != 0) {
+ dump_debug_buffer();
+ exit(rc);
+ }
+}
+
+#define ZDB_FLAG_CHECKSUM 0x0001
+#define ZDB_FLAG_DECOMPRESS 0x0002
+#define ZDB_FLAG_BSWAP 0x0004
+#define ZDB_FLAG_GBH 0x0008
+#define ZDB_FLAG_INDIRECT 0x0010
+#define ZDB_FLAG_RAW 0x0020
+#define ZDB_FLAG_PRINT_BLKPTR 0x0040
+#define ZDB_FLAG_VERBOSE 0x0080
+
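+/*
+ * Per-character flag bits, filled in by main() with either the -R block
+ * flags or the -d object-range flags.
+ */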
+static int flagbits[256];
+static char flagbitstr[16];
+
+static void
+zdb_print_blkptr(const blkptr_t *bp, int flags)
+{
+ char blkbuf[BP_SPRINTF_LEN];
+
+ if (flags & ZDB_FLAG_BSWAP)
+ byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
+
+ snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+ (void) printf("%s\n", blkbuf);
+}
+
+static void
+zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
+{
+ int i;
+
+ for (i = 0; i < nbps; i++)
+ zdb_print_blkptr(&bp[i], flags);
+}
+
+static void
+zdb_dump_gbh(void *buf, int flags)
+{
+ zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
+}
+
+static void
+zdb_dump_block_raw(void *buf, uint64_t size, int flags)
+{
+ if (flags & ZDB_FLAG_BSWAP)
+ byteswap_uint64_array(buf, size);
+ VERIFY(write(fileno(stdout), buf, size) == size);
+}
+
+static void
+zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
+{
+ uint64_t *d = (uint64_t *)buf;
+ unsigned nwords = size / sizeof (uint64_t);
+ int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
+ unsigned i, j;
+ const char *hdr;
+ char *c;
+
+ if (do_bswap)
+ hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8";
+ else
+ hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f";
+
+ (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr);
+
+#ifdef _LITTLE_ENDIAN
+ /* correct the endianness */
+ do_bswap = !do_bswap;
+#endif
+ for (i = 0; i < nwords; i += 2) {
+ (void) printf("%06llx: %016llx %016llx ",
+ (u_longlong_t)(i * sizeof (uint64_t)),
+ (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
+ (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
+
+ c = (char *)&d[i];
+ for (j = 0; j < 2 * sizeof (uint64_t); j++)
+ (void) printf("%c", isprint(c[j]) ? c[j] : '.');
+ (void) printf("\n");
+ }
+}
+
+/*
+ * There are two acceptable formats:
+ * leaf_name - For example: c1t0d0 or /tmp/ztest.0a
+ * child[.child]* - For example: 0.1.1
+ *
+ * The second form can be used to specify arbitrary vdevs anywhere
+ * in the hierarchy. For example, in a pool with a mirror of
+ * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1.
+ */
+static vdev_t *
+zdb_vdev_lookup(vdev_t *vdev, const char *path)
+{
+ char *s, *p, *q;
+ unsigned i;
+
+ if (vdev == NULL)
+ return (NULL);
+
+ /* First, assume the x.x.x.x format */
+ i = strtoul(path, &s, 10);
+ if (s == path || (s && *s != '.' && *s != '\0'))
+ goto name;
+ if (i >= vdev->vdev_children)
+ return (NULL);
+
+ vdev = vdev->vdev_child[i];
+ if (s && *s == '\0')
+ return (vdev);
+ return (zdb_vdev_lookup(vdev, s+1));
+
+name:
+ for (i = 0; i < vdev->vdev_children; i++) {
+ vdev_t *vc = vdev->vdev_child[i];
+
+ if (vc->vdev_path == NULL) {
+ vc = zdb_vdev_lookup(vc, path);
+ if (vc == NULL)
+ continue;
+ else
+ return (vc);
+ }
+
+ p = strrchr(vc->vdev_path, '/');
+ p = p ? p + 1 : vc->vdev_path;
+ q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
+
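+		/*
+		 * Match the full device path, its basename, or the basename
+		 * with a trailing "s0" slice suffix stripped.
+		 */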
+ if (strcmp(vc->vdev_path, path) == 0)
+ return (vc);
+ if (strcmp(p, path) == 0)
+ return (vc);
+ if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
+ return (vc);
+ }
+
+ return (NULL);
+}
+
+static int
+name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr)
+{
+ dsl_dataset_t *ds;
+
+ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
+ int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id,
+ NULL, &ds);
+ if (error != 0) {
+ (void) fprintf(stderr, "failed to hold objset %llu: %s\n",
+ (u_longlong_t)objset_id, strerror(error));
+ dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+ return (error);
+ }
+ dsl_dataset_name(ds, outstr);
+ dsl_dataset_rele(ds, NULL);
+ dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+ return (0);
+}
+
+static boolean_t
+zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize)
+{
+ char *s0, *s1;
+
+ if (sizes == NULL)
+ return (B_FALSE);
+
+ s0 = strtok(sizes, "/");
+ if (s0 == NULL)
+ return (B_FALSE);
+ s1 = strtok(NULL, "/");
+ *lsize = strtoull(s0, NULL, 16);
+ *psize = s1 ? strtoull(s1, NULL, 16) : *lsize;
+ return (*lsize >= *psize && *psize > 0);
+}
+
+#define ZIO_COMPRESS_MASK(alg) (1ULL << (ZIO_COMPRESS_##alg))
+
+static boolean_t
+zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
+ uint64_t psize, int flags)
+{
+ boolean_t exceeded = B_FALSE;
+ /*
+ * We don't know how the data was compressed, so just try
+ * every decompress function at every inflated blocksize.
+ */
+ void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+ int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 };
+ int *cfuncp = cfuncs;
+ uint64_t maxlsize = SPA_MAXBLOCKSIZE;
+ uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |
+ ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |
+ (getenv("ZDB_NO_ZLE") ? ZIO_COMPRESS_MASK(ZLE) : 0);
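+	/*
+	 * Try LZ4 and LZJB first, then every other compressor that isn't
+	 * excluded by the mask above.
+	 */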
+ *cfuncp++ = ZIO_COMPRESS_LZ4;
+ *cfuncp++ = ZIO_COMPRESS_LZJB;
+ mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);
+ for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++)
+ if (((1ULL << c) & mask) == 0)
+ *cfuncp++ = c;
+
+ /*
+ * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this
+ * could take a while and we should let the user know
+ * we are not stuck. On the other hand, printing progress
+	 * info gets old after a while. The user can specify the 'v'
+	 * flag to see the progress.
+ */
+ if (lsize == psize)
+ lsize += SPA_MINBLOCKSIZE;
+ else
+ maxlsize = lsize;
+ for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) {
+ for (cfuncp = cfuncs; *cfuncp; cfuncp++) {
+ if (flags & ZDB_FLAG_VERBOSE) {
+ (void) fprintf(stderr,
+ "Trying %05llx -> %05llx (%s)\n",
+ (u_longlong_t)psize,
+ (u_longlong_t)lsize,
+ zio_compress_table[*cfuncp].\
+ ci_name);
+ }
+
+ /*
+ * We randomize lbuf2, and decompress to both
+			 * lbuf and lbuf2. This way, we will know if the
+			 * decompression filled exactly to lsize.
+ */
+ VERIFY0(random_get_pseudo_bytes(lbuf2, lsize));
+
+ if (zio_decompress_data(*cfuncp, pabd,
+ lbuf, psize, lsize, NULL) == 0 &&
+ zio_decompress_data(*cfuncp, pabd,
+ lbuf2, psize, lsize, NULL) == 0 &&
+ bcmp(lbuf, lbuf2, lsize) == 0)
+ break;
+ }
+ if (*cfuncp != 0)
+ break;
+ }
+ umem_free(lbuf2, SPA_MAXBLOCKSIZE);
+
+ if (lsize > maxlsize) {
+ exceeded = B_TRUE;
+ }
+ buf = lbuf;
+ if (*cfuncp == ZIO_COMPRESS_ZLE) {
+ printf("\nZLE decompression was selected. If you "
+ "suspect the results are wrong,\ntry avoiding ZLE "
+ "by setting and exporting ZDB_NO_ZLE=\"true\"\n");
+ }
+
+ return (exceeded);
+}
+
+/*
+ * Read a block from a pool and print it out. The syntax of the
+ * block descriptor is:
+ *
+ * pool:vdev_specifier:offset:[lsize/]psize[:flags]
+ *
+ * pool - The name of the pool you wish to read from
+ * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
+ * offset - offset, in hex, in bytes
+ * size - Amount of data to read, in hex, in bytes
+ * flags - A string of characters specifying options
+ * b: Decode a blkptr at given offset within block
+ * c: Calculate and display checksums
+ * d: Decompress data before dumping
+ * e: Byteswap data before dumping
+ * g: Display data as a gang block header
+ * i: Display as an indirect block
+ * r: Dump raw data to stdout
+ * v: Verbose
+ *
+ */
+static void
+zdb_read_block(char *thing, spa_t *spa)
+{
+ blkptr_t blk, *bp = &blk;
+ dva_t *dva = bp->blk_dva;
+ int flags = 0;
+ uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0;
+ zio_t *zio;
+ vdev_t *vd;
+ abd_t *pabd;
+ void *lbuf, *buf;
+ char *s, *p, *dup, *vdev, *flagstr, *sizes;
+ int i, error;
+ boolean_t borrowed = B_FALSE, found = B_FALSE;
+
+ dup = strdup(thing);
+ s = strtok(dup, ":");
+ vdev = s ? s : "";
+ s = strtok(NULL, ":");
+ offset = strtoull(s ? s : "", NULL, 16);
+ sizes = strtok(NULL, ":");
+ s = strtok(NULL, ":");
+ flagstr = strdup(s ? s : "");
+
+ s = NULL;
+ if (!zdb_parse_block_sizes(sizes, &lsize, &psize))
+ s = "invalid size(s)";
+ if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE))
+ s = "size must be a multiple of sector size";
+ if (!IS_P2ALIGNED(offset, DEV_BSIZE))
+ s = "offset must be a multiple of sector size";
+ if (s) {
+ (void) printf("Invalid block specifier: %s - %s\n", thing, s);
+ goto done;
+ }
+
+ for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
+ for (i = 0; i < strlen(flagstr); i++) {
+ int bit = flagbits[(uchar_t)flagstr[i]];
+
+ if (bit == 0) {
+ (void) printf("***Ignoring flag: %c\n",
+ (uchar_t)flagstr[i]);
+ continue;
+ }
+ found = B_TRUE;
+ flags |= bit;
+
+ p = &flagstr[i + 1];
+ if (*p != ':' && *p != '\0') {
+ int j = 0, nextbit = flagbits[(uchar_t)*p];
+ char *end, offstr[8] = { 0 };
+ if ((bit == ZDB_FLAG_PRINT_BLKPTR) &&
+ (nextbit == 0)) {
+ /* look ahead to isolate the offset */
+ while (nextbit == 0 &&
+ strchr(flagbitstr, *p) == NULL) {
+ offstr[j] = *p;
+ j++;
+ if (i + j > strlen(flagstr))
+ break;
+ p++;
+ nextbit = flagbits[(uchar_t)*p];
+ }
+ blkptr_offset = strtoull(offstr, &end,
+ 16);
+ i += j;
+ } else if (nextbit == 0) {
+ (void) printf("***Ignoring flag arg:"
+ " '%c'\n", (uchar_t)*p);
+ }
+ }
+ }
+ }
+ if (blkptr_offset % sizeof (blkptr_t)) {
+ printf("Block pointer offset 0x%llx "
+ "must be divisible by 0x%x\n",
+ (longlong_t)blkptr_offset, (int)sizeof (blkptr_t));
+ goto done;
+ }
+ if (found == B_FALSE && strlen(flagstr) > 0) {
+ printf("Invalid flag arg: '%s'\n", flagstr);
+ goto done;
+ }
+
+ vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
+ if (vd == NULL) {
+ (void) printf("***Invalid vdev: %s\n", vdev);
+ free(dup);
+ return;
+ } else {
+ if (vd->vdev_path)
+ (void) fprintf(stderr, "Found vdev: %s\n",
+ vd->vdev_path);
+ else
+ (void) fprintf(stderr, "Found vdev type: %s\n",
+ vd->vdev_ops->vdev_op_type);
+ }
+
+ pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
+ lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+ BP_ZERO(bp);
+
+ DVA_SET_VDEV(&dva[0], vd->vdev_id);
+ DVA_SET_OFFSET(&dva[0], offset);
+ DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
+ DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
+
+ BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_PSIZE(bp, psize);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(bp, DMU_OT_NONE);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ zio = zio_root(spa, NULL, NULL, 0);
+
+ if (vd == vd->vdev_top) {
+ /*
+ * Treat this as a normal block read.
+ */
+ zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
+ } else {
+ /*
+ * Treat this as a vdev child I/O.
+ */
+ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
+ psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
+ ZIO_FLAG_OPTIONAL, NULL, NULL));
+ }
+
+ error = zio_wait(zio);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ if (error) {
+ (void) printf("Read of %s failed, error: %d\n", thing, error);
+ goto out;
+ }
+
+ uint64_t orig_lsize = lsize;
+ buf = lbuf;
+ if (flags & ZDB_FLAG_DECOMPRESS) {
+ boolean_t failed = zdb_decompress_block(pabd, buf, lbuf,
+ lsize, psize, flags);
+ if (failed) {
+ (void) printf("Decompress of %s failed\n", thing);
+ goto out;
+ }
+ } else {
+ buf = abd_borrow_buf_copy(pabd, lsize);
+ borrowed = B_TRUE;
+ }
+ /*
+	 * Try to detect an invalid block pointer. If it looks invalid,
+	 * try decompressing.
+ */
+ if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) &&
+ !(flags & ZDB_FLAG_DECOMPRESS)) {
+ const blkptr_t *b = (const blkptr_t *)(void *)
+ ((uintptr_t)buf + (uintptr_t)blkptr_offset);
+ if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) ==
+ B_FALSE) {
+ abd_return_buf_copy(pabd, buf, lsize);
+ borrowed = B_FALSE;
+ buf = lbuf;
+ boolean_t failed = zdb_decompress_block(pabd, buf,
+ lbuf, lsize, psize, flags);
+ b = (const blkptr_t *)(void *)
+ ((uintptr_t)buf + (uintptr_t)blkptr_offset);
+ if (failed || zfs_blkptr_verify(spa, b, B_FALSE,
+ BLK_VERIFY_LOG) == B_FALSE) {
+ printf("invalid block pointer at this DVA\n");
+ goto out;
+ }
+ }
+ }
+
+ if (flags & ZDB_FLAG_PRINT_BLKPTR)
+ zdb_print_blkptr((blkptr_t *)(void *)
+ ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
+ else if (flags & ZDB_FLAG_RAW)
+ zdb_dump_block_raw(buf, lsize, flags);
+ else if (flags & ZDB_FLAG_INDIRECT)
+ zdb_dump_indirect((blkptr_t *)buf,
+ orig_lsize / sizeof (blkptr_t), flags);
+ else if (flags & ZDB_FLAG_GBH)
+ zdb_dump_gbh(buf, flags);
+ else
+ zdb_dump_block(thing, buf, lsize, flags);
+
+ /*
+ * If :c was specified, iterate through the checksum table to
+ * calculate and display each checksum for our specified
+ * DVA and length.
+ */
+ if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) &&
+ !(flags & ZDB_FLAG_GBH)) {
+ zio_t *czio;
+ (void) printf("\n");
+ for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL;
+ ck < ZIO_CHECKSUM_FUNCTIONS; ck++) {
+
+ if ((zio_checksum_table[ck].ci_flags &
+ ZCHECKSUM_FLAG_EMBEDDED) ||
+ ck == ZIO_CHECKSUM_NOPARITY) {
+ continue;
+ }
+ BP_SET_CHECKSUM(bp, ck);
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ czio->io_bp = bp;
+
+ if (vd == vd->vdev_top) {
+ zio_nowait(zio_read(czio, spa, bp, pabd, psize,
+ NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
+ ZIO_FLAG_DONT_RETRY, NULL));
+ } else {
+ zio_nowait(zio_vdev_child_io(czio, bp, vd,
+ offset, pabd, psize, ZIO_TYPE_READ,
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY |
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
+ ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_OPTIONAL, NULL, NULL));
+ }
+ error = zio_wait(czio);
+ if (error == 0 || error == ECKSUM) {
+ zio_t *ck_zio = zio_root(spa, NULL, NULL, 0);
+ ck_zio->io_offset =
+ DVA_GET_OFFSET(&bp->blk_dva[0]);
+ ck_zio->io_bp = bp;
+ zio_checksum_compute(ck_zio, ck, pabd, lsize);
+ printf("%12s\tcksum=%llx:%llx:%llx:%llx\n",
+ zio_checksum_table[ck].ci_name,
+ (u_longlong_t)bp->blk_cksum.zc_word[0],
+ (u_longlong_t)bp->blk_cksum.zc_word[1],
+ (u_longlong_t)bp->blk_cksum.zc_word[2],
+ (u_longlong_t)bp->blk_cksum.zc_word[3]);
+ zio_wait(ck_zio);
+ } else {
+ printf("error %d reading block\n", error);
+ }
+ spa_config_exit(spa, SCL_STATE, FTAG);
+ }
+ }
+
+ if (borrowed)
+ abd_return_buf_copy(pabd, buf, lsize);
+
+out:
+ abd_free(pabd);
+ umem_free(lbuf, SPA_MAXBLOCKSIZE);
+done:
+ free(flagstr);
+ free(dup);
+}
+
+static void
+zdb_embedded_block(char *thing)
+{
+ blkptr_t bp;
+ unsigned long long *words = (void *)&bp;
+ char *buf;
+ int err;
+
+ bzero(&bp, sizeof (bp));
+ err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
+ "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
+ words + 0, words + 1, words + 2, words + 3,
+ words + 4, words + 5, words + 6, words + 7,
+ words + 8, words + 9, words + 10, words + 11,
+ words + 12, words + 13, words + 14, words + 15);
+ if (err != 16) {
+ (void) fprintf(stderr, "invalid input format\n");
+ exit(1);
+ }
+ ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
+ buf = malloc(SPA_MAXBLOCKSIZE);
+ if (buf == NULL) {
+ (void) fprintf(stderr, "out of memory\n");
+ exit(1);
+ }
+ err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
+ if (err != 0) {
+ (void) fprintf(stderr, "decode failed: %u\n", err);
+ exit(1);
+ }
+ zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
+ free(buf);
+}
+
+int
+main(int argc, char **argv)
+{
+ int c;
+ struct rlimit rl = { 1024, 1024 };
+ spa_t *spa = NULL;
+ objset_t *os = NULL;
+ int dump_all = 1;
+ int verbose = 0;
+ int error = 0;
+ char **searchdirs = NULL;
+ int nsearch = 0;
+ char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];
+ nvlist_t *policy = NULL;
+ uint64_t max_txg = UINT64_MAX;
+ int64_t objset_id = -1;
+ int flags = ZFS_IMPORT_MISSING_LOG;
+ int rewind = ZPOOL_NEVER_REWIND;
+ char *spa_config_path_env, *objset_str;
+ boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
+ nvlist_t *cfg = NULL;
+
+ (void) setrlimit(RLIMIT_NOFILE, &rl);
+ (void) enable_extended_FILE_stdio(-1, -1);
+
+ dprintf_setup(&argc, argv);
+
+ /*
+	 * If the SPA_CONFIG_PATH environment variable is set, it overrides the
+	 * default spa_config_path setting. If the -U flag is specified, it
+	 * overrides this environment variable setting once again.
+ */
+ spa_config_path_env = getenv("SPA_CONFIG_PATH");
+ if (spa_config_path_env != NULL)
+ spa_config_path = spa_config_path_env;
+
+ /*
+ * For performance reasons, we set this tunable down. We do so before
+ * the arg parsing section so that the user can override this value if
+ * they choose.
+ */
+ zfs_btree_verify_intensity = 3;
+
+ while ((c = getopt(argc, argv,
+ "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XYyZ")) != -1) {
+ switch (c) {
+ case 'b':
+ case 'c':
+ case 'C':
+ case 'd':
+ case 'D':
+ case 'E':
+ case 'G':
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'm':
+ case 'M':
+ case 'O':
+ case 'R':
+ case 's':
+ case 'S':
+ case 'u':
+ case 'y':
+ case 'Z':
+ dump_opt[c]++;
+ dump_all = 0;
+ break;
+ case 'A':
+ case 'e':
+ case 'F':
+ case 'k':
+ case 'L':
+ case 'P':
+ case 'q':
+ case 'X':
+ dump_opt[c]++;
+ break;
+ case 'Y':
+ zfs_reconstruct_indirect_combinations_max = INT_MAX;
+ zfs_deadman_enabled = 0;
+ break;
+ /* NB: Sort single match options below. */
+ case 'I':
+ max_inflight_bytes = strtoull(optarg, NULL, 0);
+ if (max_inflight_bytes == 0) {
+ (void) fprintf(stderr, "maximum number "
+ "of inflight bytes must be greater "
+ "than 0\n");
+ usage();
+ }
+ break;
+ case 'o':
+ error = set_global_var(optarg);
+ if (error != 0)
+ usage();
+ break;
+ case 'p':
+ if (searchdirs == NULL) {
+ searchdirs = umem_alloc(sizeof (char *),
+ UMEM_NOFAIL);
+ } else {
+ char **tmp = umem_alloc((nsearch + 1) *
+ sizeof (char *), UMEM_NOFAIL);
+ bcopy(searchdirs, tmp, nsearch *
+ sizeof (char *));
+ umem_free(searchdirs,
+ nsearch * sizeof (char *));
+ searchdirs = tmp;
+ }
+ searchdirs[nsearch++] = optarg;
+ break;
+ case 't':
+ max_txg = strtoull(optarg, NULL, 0);
+ if (max_txg < TXG_INITIAL) {
+ (void) fprintf(stderr, "incorrect txg "
+ "specified: %s\n", optarg);
+ usage();
+ }
+ break;
+ case 'U':
+ spa_config_path = optarg;
+ if (spa_config_path[0] != '/') {
+ (void) fprintf(stderr,
+ "cachefile must be an absolute path "
+ "(i.e. start with a slash)\n");
+ usage();
+ }
+ break;
+ case 'v':
+ verbose++;
+ break;
+ case 'V':
+ flags = ZFS_IMPORT_VERBATIM;
+ break;
+ case 'x':
+ vn_dumpdir = optarg;
+ break;
+ default:
+ usage();
+ break;
+ }
+ }
+
+ if (!dump_opt['e'] && searchdirs != NULL) {
+ (void) fprintf(stderr, "-p option requires use of -e\n");
+ usage();
+ }
+ if (dump_opt['d']) {
+		/* <pool>[/<dataset | objset id>] is accepted */
+ if (argv[2] && (objset_str = strchr(argv[2], '/')) != NULL &&
+ objset_str++ != NULL) {
+ char *endptr;
+ errno = 0;
+ objset_id = strtoull(objset_str, &endptr, 0);
+ /* dataset 0 is the same as opening the pool */
+ if (errno == 0 && endptr != objset_str &&
+ objset_id != 0) {
+ target_is_spa = B_FALSE;
+ dataset_lookup = B_TRUE;
+ } else if (objset_id != 0) {
+ printf("failed to open objset %s "
+ "%llu %s", objset_str,
+ (u_longlong_t)objset_id,
+ strerror(errno));
+ exit(1);
+ }
+ /* normal dataset name not an objset ID */
+ if (endptr == objset_str) {
+ objset_id = -1;
+ }
+ }
+ }
+
+#if defined(_LP64)
+ /*
+ * ZDB does not typically re-read blocks; therefore limit the ARC
+ * to 256 MB, which can be used entirely for metadata.
+ */
+ zfs_arc_min = zfs_arc_meta_min = 2ULL << SPA_MAXBLOCKSHIFT;
+ zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
+#endif
+
+ /*
+ * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
+ * "zdb -b" uses traversal prefetch which uses async reads.
+ * For good performance, let several of them be active at once.
+ */
+ zfs_vdev_async_read_max_active = 10;
+
+ /*
+ * Disable reference tracking for better performance.
+ */
+ reference_tracking_enable = B_FALSE;
+
+ /*
+ * Do not fail spa_load when spa_load_verify fails. This is needed
+ * to load non-idle pools.
+ */
+ spa_load_verify_dryrun = B_TRUE;
+
+ kernel_init(SPA_MODE_READ);
+
+ if (dump_all)
+ verbose = MAX(verbose, 1);
+
+ for (c = 0; c < 256; c++) {
+ if (dump_all && strchr("AeEFklLOPRSXy", c) == NULL)
+ dump_opt[c] = 1;
+ if (dump_opt[c])
+ dump_opt[c] += verbose;
+ }
+
+ aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
+ zfs_recover = (dump_opt['A'] > 1);
+
+ argc -= optind;
+ argv += optind;
+ if (argc < 2 && dump_opt['R'])
+ usage();
+
+ if (dump_opt['E']) {
+ if (argc != 1)
+ usage();
+ zdb_embedded_block(argv[0]);
+ return (0);
+ }
+
+ if (argc < 1) {
+ if (!dump_opt['e'] && dump_opt['C']) {
+ dump_cachefile(spa_config_path);
+ return (0);
+ }
+ usage();
+ }
+
+ if (dump_opt['l'])
+ return (dump_label(argv[0]));
+
+ if (dump_opt['O']) {
+ if (argc != 2)
+ usage();
+ dump_opt['v'] = verbose + 3;
+ return (dump_path(argv[0], argv[1]));
+ }
+
+ if (dump_opt['X'] || dump_opt['F'])
+ rewind = ZPOOL_DO_REWIND |
+ (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
+
+ if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
+ nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
+ nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
+ fatal("internal error: %s", strerror(ENOMEM));
+
+ error = 0;
+ target = argv[0];
+
+ if (strpbrk(target, "/@") != NULL) {
+ size_t targetlen;
+
+ target_pool = strdup(target);
+ *strpbrk(target_pool, "/@") = '\0';
+
+ target_is_spa = B_FALSE;
+ targetlen = strlen(target);
+ if (targetlen && target[targetlen - 1] == '/')
+ target[targetlen - 1] = '\0';
+ } else {
+ target_pool = target;
+ }
+
+ if (dump_opt['e']) {
+ importargs_t args = { 0 };
+
+ args.paths = nsearch;
+ args.path = searchdirs;
+ args.can_be_active = B_TRUE;
+
+ error = zpool_find_config(NULL, target_pool, &cfg, &args,
+ &libzpool_config_ops);
+
+ if (error == 0) {
+
+ if (nvlist_add_nvlist(cfg,
+ ZPOOL_LOAD_POLICY, policy) != 0) {
+ fatal("can't open '%s': %s",
+ target, strerror(ENOMEM));
+ }
+
+ if (dump_opt['C'] > 1) {
+ (void) printf("\nConfiguration for import:\n");
+ dump_nvlist(cfg, 8);
+ }
+
+ /*
+ * Disable the activity check to allow examination of
+ * active pools.
+ */
+ error = spa_import(target_pool, cfg, NULL,
+ flags | ZFS_IMPORT_SKIP_MMP);
+ }
+ }
+
+ /*
+	 * import_checkpointed_state assumes that the target pool we pass
+	 * it is already part of the spa namespace. Because of that we
+	 * make sure to call it only after the -e option has been
+	 * processed, which imports the pool into the namespace if it's
+	 * not in the cachefile.
+ */
+ char *checkpoint_pool = NULL;
+ char *checkpoint_target = NULL;
+ if (dump_opt['k']) {
+ checkpoint_pool = import_checkpointed_state(target, cfg,
+ &checkpoint_target);
+
+ if (checkpoint_target != NULL)
+ target = checkpoint_target;
+ }
+
+ if (target_pool != target)
+ free(target_pool);
+
+ if (error == 0) {
+ if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
+ ASSERT(checkpoint_pool != NULL);
+ ASSERT(checkpoint_target == NULL);
+
+ error = spa_open(checkpoint_pool, &spa, FTAG);
+ if (error != 0) {
+ fatal("Tried to open pool \"%s\" but "
+ "spa_open() failed with error %d\n",
+ checkpoint_pool, error);
+ }
+
+ } else if (target_is_spa || dump_opt['R'] || objset_id == 0) {
+ zdb_set_skip_mmp(target);
+ error = spa_open_rewind(target, &spa, FTAG, policy,
+ NULL);
+ if (error) {
+ /*
+ * If we're missing the log device then
+ * try opening the pool after clearing the
+ * log state.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(target)) != NULL &&
+ spa->spa_log_state == SPA_LOG_MISSING) {
+ spa->spa_log_state = SPA_LOG_CLEAR;
+ error = 0;
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ if (!error) {
+ error = spa_open_rewind(target, &spa,
+ FTAG, policy, NULL);
+ }
+ }
+ } else if (strpbrk(target, "#") != NULL) {
+ dsl_pool_t *dp;
+ error = dsl_pool_hold(target, FTAG, &dp);
+ if (error != 0) {
+ fatal("can't dump '%s': %s", target,
+ strerror(error));
+ }
+ error = dump_bookmark(dp, target, B_TRUE, verbose > 1);
+ dsl_pool_rele(dp, FTAG);
+ if (error != 0) {
+ fatal("can't dump '%s': %s", target,
+ strerror(error));
+ }
+ return (error);
+ } else {
+ zdb_set_skip_mmp(target);
+ if (dataset_lookup == B_TRUE) {
+ /*
+ * Use the supplied id to get the name
+ * for open_objset.
+ */
+ error = spa_open(target, &spa, FTAG);
+ if (error == 0) {
+ error = name_from_objset_id(spa,
+ objset_id, dsname);
+ spa_close(spa, FTAG);
+ if (error == 0)
+ target = dsname;
+ }
+ }
+ if (error == 0)
+ error = open_objset(target, FTAG, &os);
+ if (error == 0)
+ spa = dmu_objset_spa(os);
+ }
+ }
+ nvlist_free(policy);
+
+ if (error)
+ fatal("can't open '%s': %s", target, strerror(error));
+
+ /*
+ * Set the pool failure mode to panic in order to prevent the pool
+ * from suspending. A suspended I/O will have no way to resume and
+ * can prevent the zdb(8) command from terminating as expected.
+ */
+ if (spa != NULL)
+ spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
+
+ argv++;
+ argc--;
+ if (!dump_opt['R']) {
+ flagbits['d'] = ZOR_FLAG_DIRECTORY;
+ flagbits['f'] = ZOR_FLAG_PLAIN_FILE;
+ flagbits['m'] = ZOR_FLAG_SPACE_MAP;
+ flagbits['z'] = ZOR_FLAG_ZAP;
+ flagbits['A'] = ZOR_FLAG_ALL_TYPES;
+
+ if (argc > 0 && dump_opt['d']) {
+ zopt_object_args = argc;
+ zopt_object_ranges = calloc(zopt_object_args,
+ sizeof (zopt_object_range_t));
+ for (unsigned i = 0; i < zopt_object_args; i++) {
+ int err;
+ char *msg = NULL;
+
+ err = parse_object_range(argv[i],
+ &zopt_object_ranges[i], &msg);
+ if (err != 0)
+ fatal("Bad object or range: '%s': %s\n",
+ argv[i], msg ? msg : "");
+ }
+ } else if (argc > 0 && dump_opt['m']) {
+ zopt_metaslab_args = argc;
+ zopt_metaslab = calloc(zopt_metaslab_args,
+ sizeof (uint64_t));
+ for (unsigned i = 0; i < zopt_metaslab_args; i++) {
+ errno = 0;
+ zopt_metaslab[i] = strtoull(argv[i], NULL, 0);
+ if (zopt_metaslab[i] == 0 && errno != 0)
+ fatal("bad number %s: %s", argv[i],
+ strerror(errno));
+ }
+ }
+ if (os != NULL) {
+ dump_objset(os);
+ } else if (zopt_object_args > 0 && !dump_opt['m']) {
+ dump_objset(spa->spa_meta_objset);
+ } else {
+ dump_zpool(spa);
+ }
+ } else {
+ flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
+ flagbits['c'] = ZDB_FLAG_CHECKSUM;
+ flagbits['d'] = ZDB_FLAG_DECOMPRESS;
+ flagbits['e'] = ZDB_FLAG_BSWAP;
+ flagbits['g'] = ZDB_FLAG_GBH;
+ flagbits['i'] = ZDB_FLAG_INDIRECT;
+ flagbits['r'] = ZDB_FLAG_RAW;
+ flagbits['v'] = ZDB_FLAG_VERBOSE;
+
+ for (int i = 0; i < argc; i++)
+ zdb_read_block(argv[i], spa);
+ }
+
+ if (dump_opt['k']) {
+ free(checkpoint_pool);
+ if (!target_is_spa)
+ free(checkpoint_target);
+ }
+
+ if (os != NULL) {
+ close_objset(os, FTAG);
+ } else {
+ spa_close(spa, FTAG);
+ }
+
+ fuid_table_destroy();
+
+ dump_debug_buffer();
+
+ kernel_fini();
+
+ return (error);
+}
diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.h b/sys/contrib/openzfs/cmd/zdb/zdb.h
new file mode 100644
index 000000000000..49579811efbb
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zdb/zdb.h
@@ -0,0 +1,33 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2017 Spectra Logic Corp Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+#ifndef _ZDB_H
+#define _ZDB_H
+
+void dump_intent_log(zilog_t *);
+extern uint8_t dump_opt[256];
+
+#endif /* _ZDB_H */
diff --git a/sys/contrib/openzfs/cmd/zdb/zdb_il.c b/sys/contrib/openzfs/cmd/zdb/zdb_il.c
new file mode 100644
index 000000000000..c12178effae0
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zdb/zdb_il.c
@@ -0,0 +1,431 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2012 Cyril Plisko. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * Print intent log header and statistics.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/spa_impl.h>
+#include <sys/abd.h>
+
+#include "zdb.h"
+
+extern uint8_t dump_opt[256];
+
+static char tab_prefix[4] = "\t\t\t";
+
+static void
+print_log_bp(const blkptr_t *bp, const char *prefix)
+{
+ char blkbuf[BP_SPRINTF_LEN];
+
+ snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+ (void) printf("%s%s\n", prefix, blkbuf);
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_create_t *lr = arg;
+ time_t crtime = lr->lr_crtime[0];
+ char *name, *link;
+ lr_attr_t *lrattr;
+
+ name = (char *)(lr + 1);
+
+ if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR ||
+ lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) {
+ lrattr = (lr_attr_t *)(lr + 1);
+ name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ }
+
+ if (txtype == TX_SYMLINK) {
+ link = name + strlen(name) + 1;
+ (void) printf("%s%s -> %s\n", tab_prefix, name, link);
+ } else if (txtype != TX_MKXATTR) {
+ (void) printf("%s%s\n", tab_prefix, name);
+ }
+
+ (void) printf("%s%s", tab_prefix, ctime(&crtime));
+ (void) printf("%sdoid %llu, foid %llu, slots %llu, mode %llo\n",
+ tab_prefix, (u_longlong_t)lr->lr_doid,
+ (u_longlong_t)LR_FOID_GET_OBJ(lr->lr_foid),
+ (u_longlong_t)LR_FOID_GET_SLOTS(lr->lr_foid),
+ (longlong_t)lr->lr_mode);
+ (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n",
+ tab_prefix,
+ (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
+ (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev);
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_remove(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_remove_t *lr = arg;
+
+ (void) printf("%sdoid %llu, name %s\n", tab_prefix,
+ (u_longlong_t)lr->lr_doid, (char *)(lr + 1));
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_link(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_link_t *lr = arg;
+
+ (void) printf("%sdoid %llu, link_obj %llu, name %s\n", tab_prefix,
+ (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj,
+ (char *)(lr + 1));
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_rename(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_rename_t *lr = arg;
+ char *snm = (char *)(lr + 1);
+ char *tnm = snm + strlen(snm) + 1;
+
+ (void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix,
+ (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid);
+ (void) printf("%ssrc %s tgt %s\n", tab_prefix, snm, tnm);
+}
+
+/* ARGSUSED */
+static int
+zil_prt_rec_write_cb(void *data, size_t len, void *unused)
+{
+ char *cdata = data;
+
+ for (size_t i = 0; i < len; i++) {
+ if (isprint(*cdata))
+ (void) printf("%c ", *cdata);
+ else
+ (void) printf("%2X", *cdata);
+ cdata++;
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_write_t *lr = arg;
+ abd_t *data;
+ blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_phys_t zb;
+ int verbose = MAX(dump_opt['d'], dump_opt['i']);
+ int error;
+
+ (void) printf("%sfoid %llu, offset %llx, length %llx\n", tab_prefix,
+ (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
+ (u_longlong_t)lr->lr_length);
+
+ if (txtype == TX_WRITE2 || verbose < 5)
+ return;
+
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ (void) printf("%shas blkptr, %s\n", tab_prefix,
+ !BP_IS_HOLE(bp) &&
+ bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
+ "will claim" : "won't claim");
+ print_log_bp(bp, tab_prefix);
+
+ if (BP_IS_HOLE(bp)) {
+ (void) printf("\t\t\tLSIZE 0x%llx\n",
+ (u_longlong_t)BP_GET_LSIZE(bp));
+ (void) printf("%s<hole>\n", tab_prefix);
+ return;
+ }
+ if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
+ (void) printf("%s<block already committed>\n",
+ tab_prefix);
+ return;
+ }
+
+ SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
+ lr->lr_foid, ZB_ZIL_LEVEL,
+ lr->lr_offset / BP_GET_LSIZE(bp));
+
+ data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
+ error = zio_wait(zio_read(NULL, zilog->zl_spa,
+ bp, data, BP_GET_LSIZE(bp), NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
+ if (error)
+ goto out;
+ } else {
+ /* data is stored after the end of the lr_write record */
+ data = abd_alloc(lr->lr_length, B_FALSE);
+ abd_copy_from_buf(data, lr + 1, lr->lr_length);
+ }
+
+ (void) printf("%s", tab_prefix);
+ (void) abd_iterate_func(data,
+ 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
+ zil_prt_rec_write_cb, NULL);
+ (void) printf("\n");
+
+out:
+ abd_free(data);
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_truncate(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_truncate_t *lr = arg;
+
+ (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", tab_prefix,
+ (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
+ (u_longlong_t)lr->lr_length);
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_setattr(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_setattr_t *lr = arg;
+ time_t atime = (time_t)lr->lr_atime[0];
+ time_t mtime = (time_t)lr->lr_mtime[0];
+
+ (void) printf("%sfoid %llu, mask 0x%llx\n", tab_prefix,
+ (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);
+
+ if (lr->lr_mask & AT_MODE) {
+ (void) printf("%sAT_MODE %llo\n", tab_prefix,
+ (longlong_t)lr->lr_mode);
+ }
+
+ if (lr->lr_mask & AT_UID) {
+ (void) printf("%sAT_UID %llu\n", tab_prefix,
+ (u_longlong_t)lr->lr_uid);
+ }
+
+ if (lr->lr_mask & AT_GID) {
+ (void) printf("%sAT_GID %llu\n", tab_prefix,
+ (u_longlong_t)lr->lr_gid);
+ }
+
+ if (lr->lr_mask & AT_SIZE) {
+ (void) printf("%sAT_SIZE %llu\n", tab_prefix,
+ (u_longlong_t)lr->lr_size);
+ }
+
+ if (lr->lr_mask & AT_ATIME) {
+ (void) printf("%sAT_ATIME %llu.%09llu %s", tab_prefix,
+ (u_longlong_t)lr->lr_atime[0],
+ (u_longlong_t)lr->lr_atime[1],
+ ctime(&atime));
+ }
+
+ if (lr->lr_mask & AT_MTIME) {
+ (void) printf("%sAT_MTIME %llu.%09llu %s", tab_prefix,
+ (u_longlong_t)lr->lr_mtime[0],
+ (u_longlong_t)lr->lr_mtime[1],
+ ctime(&mtime));
+ }
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_acl(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_acl_t *lr = arg;
+
+ (void) printf("%sfoid %llu, aclcnt %llu\n", tab_prefix,
+ (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt);
+}
+
+typedef void (*zil_prt_rec_func_t)(zilog_t *, int, void *);
+typedef struct zil_rec_info {
+ zil_prt_rec_func_t zri_print;
+ const char *zri_name;
+ uint64_t zri_count;
+} zil_rec_info_t;
+
+static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
+ {.zri_print = NULL, .zri_name = "Total "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKXATTR "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_SYMLINK "},
+ {.zri_print = zil_prt_rec_remove, .zri_name = "TX_REMOVE "},
+ {.zri_print = zil_prt_rec_remove, .zri_name = "TX_RMDIR "},
+ {.zri_print = zil_prt_rec_link, .zri_name = "TX_LINK "},
+ {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME "},
+ {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE "},
+ {.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE "},
+ {.zri_print = zil_prt_rec_setattr, .zri_name = "TX_SETATTR "},
+ {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_V0 "},
+ {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_ACL "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ATTR "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL_ATTR "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ATTR "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL_ATTR "},
+ {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE2 "},
+};
+
+/* ARGSUSED */
+static int
+print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)
+{
+ int txtype;
+ int verbose = MAX(dump_opt['d'], dump_opt['i']);
+
+ /* reduce size of txtype to strip off TX_CI bit */
+ txtype = lr->lrc_txtype;
+
+ ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE);
+ ASSERT(lr->lrc_txg);
+
+ (void) printf("\t\t%s%s len %6llu, txg %llu, seq %llu\n",
+ (lr->lrc_txtype & TX_CI) ? "CI-" : "",
+ zil_rec_info[txtype].zri_name,
+ (u_longlong_t)lr->lrc_reclen,
+ (u_longlong_t)lr->lrc_txg,
+ (u_longlong_t)lr->lrc_seq);
+
+ if (txtype && verbose >= 3) {
+ if (!zilog->zl_os->os_encrypted) {
+ zil_rec_info[txtype].zri_print(zilog, txtype, lr);
+ } else {
+ (void) printf("%s(encrypted)\n", tab_prefix);
+ }
+ }
+
+ zil_rec_info[txtype].zri_count++;
+ zil_rec_info[0].zri_count++;
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ char blkbuf[BP_SPRINTF_LEN + 10];
+ int verbose = MAX(dump_opt['d'], dump_opt['i']);
+ const char *claim;
+
+ if (verbose <= 3)
+ return (0);
+
+ if (verbose >= 5) {
+ (void) strcpy(blkbuf, ", ");
+ snprintf_blkptr(blkbuf + strlen(blkbuf),
+ sizeof (blkbuf) - strlen(blkbuf), bp);
+ } else {
+ blkbuf[0] = '\0';
+ }
+
+ if (claim_txg != 0)
+ claim = "already claimed";
+ else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa))
+ claim = "will claim";
+ else
+ claim = "won't claim";
+
+ (void) printf("\tBlock seqno %llu, %s%s\n",
+ (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf);
+
+ return (0);
+}
+
+static void
+print_log_stats(int verbose)
+{
+ unsigned i, w, p10;
+
+ if (verbose > 3)
+ (void) printf("\n");
+
+ if (zil_rec_info[0].zri_count == 0)
+ return;
+
+ for (w = 1, p10 = 10; zil_rec_info[0].zri_count >= p10; p10 *= 10)
+ w++;
+
+ for (i = 0; i < TX_MAX_TYPE; i++)
+ if (zil_rec_info[i].zri_count || verbose >= 3)
+ (void) printf("\t\t%s %*llu\n",
+ zil_rec_info[i].zri_name, w,
+ (u_longlong_t)zil_rec_info[i].zri_count);
+ (void) printf("\n");
+}
+
+/* ARGSUSED */
+void
+dump_intent_log(zilog_t *zilog)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ int verbose = MAX(dump_opt['d'], dump_opt['i']);
+ int i;
+
+ if (BP_IS_HOLE(&zh->zh_log) || verbose < 1)
+ return;
+
+ (void) printf("\n ZIL header: claim_txg %llu, "
+ "claim_blk_seq %llu, claim_lr_seq %llu",
+ (u_longlong_t)zh->zh_claim_txg,
+ (u_longlong_t)zh->zh_claim_blk_seq,
+ (u_longlong_t)zh->zh_claim_lr_seq);
+ (void) printf(" replay_seq %llu, flags 0x%llx\n",
+ (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags);
+
+ for (i = 0; i < TX_MAX_TYPE; i++)
+ zil_rec_info[i].zri_count = 0;
+
+ /* see comment in zil_claim() or zil_check_log_chain() */
+ if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+ zh->zh_claim_txg == 0)
+ return;
+
+ if (verbose >= 2) {
+ (void) printf("\n");
+ (void) zil_parse(zilog, print_log_block, print_log_record, NULL,
+ zh->zh_claim_txg, B_FALSE);
+ print_log_stats(verbose);
+ }
+}
diff --git a/sys/contrib/openzfs/cmd/zed/.gitignore b/sys/contrib/openzfs/cmd/zed/.gitignore
new file mode 100644
index 000000000000..76557bb6bb3a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/.gitignore
@@ -0,0 +1 @@
+/zed
diff --git a/sys/contrib/openzfs/cmd/zed/Makefile.am b/sys/contrib/openzfs/cmd/zed/Makefile.am
new file mode 100644
index 000000000000..4bd8ac4a53e6
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/Makefile.am
@@ -0,0 +1,49 @@
+include $(top_srcdir)/config/Rules.am
+
+AM_CFLAGS += $(LIBUDEV_CFLAGS) $(LIBUUID_CFLAGS)
+
+SUBDIRS = zed.d
+
+sbin_PROGRAMS = zed
+
+ZED_SRC = \
+ zed.c \
+ zed.h \
+ zed_conf.c \
+ zed_conf.h \
+ zed_disk_event.c \
+ zed_disk_event.h \
+ zed_event.c \
+ zed_event.h \
+ zed_exec.c \
+ zed_exec.h \
+ zed_file.c \
+ zed_file.h \
+ zed_log.c \
+ zed_log.h \
+ zed_strings.c \
+ zed_strings.h
+
+FMA_SRC = \
+ agents/zfs_agents.c \
+ agents/zfs_agents.h \
+ agents/zfs_diagnosis.c \
+ agents/zfs_mod.c \
+ agents/zfs_retire.c \
+ agents/fmd_api.c \
+ agents/fmd_api.h \
+ agents/fmd_serd.c \
+ agents/fmd_serd.h
+
+zed_SOURCES = $(ZED_SRC) $(FMA_SRC)
+
+zed_LDADD = \
+ $(abs_top_builddir)/lib/libzfs/libzfs.la \
+ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+ $(abs_top_builddir)/lib/libnvpair/libnvpair.la \
+ $(abs_top_builddir)/lib/libuutil/libuutil.la
+
+zed_LDADD += -lrt $(LIBUDEV_LIBS) $(LIBUUID_LIBS)
+zed_LDFLAGS = -pthread
+
+EXTRA_DIST = agents/README.md
diff --git a/sys/contrib/openzfs/cmd/zed/agents/README.md b/sys/contrib/openzfs/cmd/zed/agents/README.md
new file mode 100644
index 000000000000..e35b97668a9d
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/README.md
@@ -0,0 +1,112 @@
+## Fault Management Logic for ZED ##
+
+The integration of Fault Management Daemon (FMD) logic from illumos
+is being deployed in three phases. This logic is encapsulated in
+several software modules inside ZED.
+
+### ZED+FM Phase 1 ###
+
+All of the phase 1 work is in the current master branch. Phase 1 work includes:
+
+* Add new paths to the persistent VDEV label for device matching.
+* Add a disk monitor for generating _disk-add_ and _disk-change_ events.
+* Add support for automated VDEV auto-online, auto-replace and auto-expand.
+* Expand the statechange event to include all VDEV state transitions.
+
+### ZED+FM Phase 2 (WIP) ###
+
+The phase 2 work primarily entails the _Diagnosis Engine_ and the
+_Retire Agent_ modules. It also includes infrastructure to support a
+crude FMD environment to host these modules. For additional
+information see the **FMD Components in ZED** and **Implementation
+Notes** sections below.
+
+### ZED+FM Phase 3 ###
+
+Future work will add additional functionality and will likely include:
+
+* Add FMD module garbage collection (periodically call `fmd_module_gc()`).
+* Add real module property retrieval (currently hard-coded in accessors).
+* Additional diagnosis telemetry (like latency outliers and SMART data).
+* Export FMD module statistics.
+* Zedlet parallel execution and resiliency (add watchdog).
+
+### ZFS Fault Management Overview ###
+
+The primary purpose of ZFS fault management is automated diagnosis
+and isolation of VDEV faults. A fault is something we can associate
+with an impact (e.g. loss of data redundancy) and a corrective action
+(e.g. offline or replace a disk). A typical ZFS fault management stack
+is composed of _error detectors_ (e.g. `zfs_ereport_post()`), a _disk
+monitor_, a _diagnosis engine_ and _response agents_.
+
+After detecting a software error, the ZFS kernel module sends error
+events to the ZED user daemon which in turn routes the events to its
+internal FMA modules based on their event subscriptions. Likewise, if
+a disk is added or changed in the system, the disk monitor sends disk
+events which are consumed by a response agent.
+
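+As a rough, hypothetical sketch of that routing (the shipped logic and its
+actual subscription table live in `zfs_agent_dispatch()` in
+`agents/zfs_agents.c`; the class patterns below are illustrative only), an
+incoming event could be handed to the built-in agents like this:
+
+```c
+static void
+dispatch_sketch(nvlist_t *event, const char *class)
+{
+	fmd_hdl_t *diagnosis = fmd_module_hdl("zfs-diagnosis");
+	fmd_hdl_t *retire = fmd_module_hdl("zfs-retire");
+
+	/* per-vdev ereports feed the diagnosis engine's SERD engines */
+	if (fmd_nvl_class_match(diagnosis, event, "ereport.fs.zfs.*"))
+		fmd_module_recv(diagnosis, event, class);
+
+	/* suspect lists from the diagnosis engine go to the retire agent */
+	if (fmd_nvl_class_match(retire, event, "list.suspect"))
+		fmd_module_recv(retire, event, class);
+}
+```
+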
+### FMD Components in ZED ###
+
+There are three FMD modules (aka agents) that are now built into ZED.
+
+ 1. A _Diagnosis Engine_ module (`agents/zfs_diagnosis.c`)
+ 2. A _Retire Agent_ module (`agents/zfs_retire.c`)
+ 3. A _Disk Add Agent_ module (`agents/zfs_mod.c`)
+
+To begin with, a **Diagnosis Engine** consumes per-vdev I/O and checksum
+ereports and feeds them into a Soft Error Rate Discrimination (SERD)
+algorithm, which generates a corresponding fault diagnosis when the
+tracked VDEV encounters **N** events in a given **T** time window (a rough
+sketch of this check follows). The initial N and T values for the SERD
+algorithm are estimates inherited from illumos (10 errors in 10 minutes).
+
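+At its core the SERD check is a sliding window over event timestamps. The
+following hypothetical sketch (not the engine code itself, which lives in
+`agents/fmd_serd.c`) shows the basic N-in-T test; with the inherited
+defaults this would be N = 10 events and T = 10 minutes:
+
+```c
+#include <stdbool.h>
+#include <stdint.h>
+
+#define	SERD_SKETCH_MAX_N	32	/* sketch assumes n <= 32 */
+
+typedef struct serd_sketch {
+	uint32_t n;				/* events required to fire */
+	int64_t t;				/* time window (nanoseconds) */
+	uint32_t count;				/* events recorded so far */
+	int64_t times[SERD_SKETCH_MAX_N];	/* ring of recent event times */
+} serd_sketch_t;
+
+static bool
+serd_sketch_record(serd_sketch_t *sg, int64_t now)
+{
+	/* remember this event, overwriting the oldest slot in the ring */
+	sg->times[sg->count % sg->n] = now;
+	sg->count++;
+
+	if (sg->count < sg->n)
+		return (false);		/* fewer than N events seen so far */
+
+	/* the oldest of the last N events sits in the next slot to reuse */
+	int64_t oldest = sg->times[sg->count % sg->n];
+	return (now - oldest <= sg->t);	/* fire if N events fell within T */
+}
+```
+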
+In turn, a **Retire Agent** responds to diagnosed faults by isolating
+the faulty VDEV. It will notify the ZFS kernel module of the new VDEV
+state (degraded or faulted). The retire agent is also responsible for
+managing hot spares across all pools. When it encounters a device fault
+or a device removal it will replace the device with an appropriate
+spare if available.
+
+Finally, a **Disk Add Agent** responds to events from a libudev disk
+monitor (`EC_DEV_ADD` or `EC_DEV_STATUS`) and will online, replace or
+expand the associated VDEV. This agent is also known as the `zfs_mod`
+or Sysevent Loadable Module (SLM) on the illumos platform. The added
+disk is matched to a specific VDEV using its device id, physical path
+or VDEV GUID.
+
+Note that the _auto-replace_ feature (aka hot plug) is opt-in and you
+must set the pool's `autoreplace` property to enable it. The new disk
+will be matched to the corresponding leaf VDEV by physical location
+and labeled with a GPT partition before replacing the original VDEV
+in the pool.
+
+### Implementation Notes ###
+
+* The FMD module API required for logic modules is emulated and implemented
+  in the `fmd_api.c` and `fmd_serd.c` source files. This support includes
+  module registration, memory allocation, module property accessors, basic
+  case management, one-shot timers and SERD engines; a minimal usage sketch
+  appears at the end of this README.
+  For detailed information on the FMD module API, see the document --
+  _"Fault Management Daemon Programmer's Reference Manual"_.
+
+* The event subscriptions for the modules (located in a module specific
+ configuration file on illumos) are currently hard-coded into the ZED
+ `zfs_agent_dispatch()` function.
+
+* The FMD modules are called one at a time from a single thread that
+ consumes events queued to the modules. These events are sourced from
+ the normal ZED events and also include events posted from the diagnosis
+ engine and the libudev disk event monitor.
+
+* The FMD code modules have minimal changes and were intentionally left
+ as similar as possible to their upstream source files.
+
+* The sysevent namespace in ZED differs from illumos. For example:
+ * illumos uses `"resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove"`
+ * Linux uses `"sysevent.fs.zfs.vdev_remove"`
+
+* The FMD Modules port was produced by Intel Federal, LLC under award
+ number B609815 between the U.S. Department of Energy (DOE) and Intel
+ Federal, LLC.
+
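+### Appendix: Minimal Module Sketch ###
+
+As referenced in the implementation notes above, the emulated API in
+`fmd_api.c` is sufficient to host a simple logic module. The sketch below is
+hypothetical and is not one of the shipped agents; the `example_*` names and
+the `fault.fs.zfs.example` class are made up for illustration only.
+
+```c
+#include "fmd_api.h"
+
+static void
+example_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
+{
+	/* only consider ZFS ereports (illustrative subscription) */
+	if (!fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.*"))
+		return;
+
+	/* lazily create a SERD engine using the hard-coded I/O properties */
+	if (!fmd_serd_exists(hdl, "example_serd"))
+		fmd_serd_create(hdl, "example_serd",
+		    fmd_prop_get_int32(hdl, "io_N"),
+		    fmd_prop_get_int64(hdl, "io_T"));
+
+	/* when N events land within T, open a case and post a suspect */
+	if (fmd_serd_record(hdl, "example_serd", ep)) {
+		fmd_case_t *cp = fmd_case_open(hdl, NULL);
+		nvlist_t *fault = fmd_nvl_create_fault(hdl,
+		    "fault.fs.zfs.example", 100, NULL, NULL, NULL);
+
+		/* fmd_case_add_suspect() posts the suspect list for ZED */
+		fmd_case_add_suspect(hdl, cp, fault);
+		fmd_case_solve(hdl, cp);
+	}
+}
+
+static const fmd_hdl_ops_t example_ops = {
+	.fmdo_recv	= example_recv,
+};
+
+static const fmd_hdl_info_t example_info = {
+	"ZFS Example Module", "1.0", &example_ops, NULL
+};
+
+void
+_example_init(fmd_hdl_t *hdl)
+{
+	(void) fmd_hdl_register(hdl, FMD_API_VERSION, &example_info);
+}
+```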
diff --git a/sys/contrib/openzfs/cmd/zed/agents/fmd_api.c b/sys/contrib/openzfs/cmd/zed/agents/fmd_api.c
new file mode 100644
index 000000000000..607b387ca3a8
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/fmd_api.c
@@ -0,0 +1,760 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+/*
+ * This file implements the minimal FMD module API required to support the
+ * fault logic modules in ZED. This support includes module registration,
+ * memory allocation, module property accessors, basic case management,
+ * one-shot timers and SERD engines.
+ *
+ * In the ZED runtime, the modules are called from a single thread so no
+ * locking is required in this emulated FMD environment.
+ */
+
+#include <sys/types.h>
+#include <sys/fm/protocol.h>
+#include <uuid/uuid.h>
+#include <signal.h>
+#include <strings.h>
+#include <time.h>
+
+#include "fmd_api.h"
+#include "fmd_serd.h"
+
+#include "zfs_agents.h"
+#include "../zed_log.h"
+
+typedef struct fmd_modstat {
+ fmd_stat_t ms_accepted; /* total events accepted by module */
+ fmd_stat_t ms_caseopen; /* cases currently open */
+ fmd_stat_t ms_casesolved; /* total cases solved by module */
+ fmd_stat_t ms_caseclosed; /* total cases closed by module */
+} fmd_modstat_t;
+
+typedef struct fmd_module {
+ const char *mod_name; /* basename of module (ro) */
+ const fmd_hdl_info_t *mod_info; /* module info registered with handle */
+ void *mod_spec; /* fmd_hdl_get/setspecific data value */
+ fmd_stat_t *mod_ustat; /* module specific custom stats */
+ uint_t mod_ustat_cnt; /* count of ustat stats */
+ fmd_modstat_t mod_stats; /* fmd built-in per-module statistics */
+ fmd_serd_hash_t mod_serds; /* hash of serd engs owned by module */
+ char *mod_vers; /* a copy of module version string */
+} fmd_module_t;
+
+/*
+ * ZED has two hardwired FMD module instances
+ */
+fmd_module_t zfs_retire_module;
+fmd_module_t zfs_diagnosis_module;
+
+/*
+ * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
+ */
+
+#ifdef DEBUG
+const char *
+_umem_debug_init(void)
+{
+ return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+ return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+#endif
+
+/*
+ * Register a module with fmd and finish module initialization.
+ * Returns an integer indicating whether it succeeded (zero) or
+ * failed (non-zero).
+ */
+int
+fmd_hdl_register(fmd_hdl_t *hdl, int version, const fmd_hdl_info_t *mip)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ mp->mod_info = mip;
+ mp->mod_name = mip->fmdi_desc + 4; /* drop 'ZFS ' prefix */
+ mp->mod_spec = NULL;
+
+ /* bare minimum module stats */
+ (void) strcpy(mp->mod_stats.ms_accepted.fmds_name, "fmd.accepted");
+ (void) strcpy(mp->mod_stats.ms_caseopen.fmds_name, "fmd.caseopen");
+ (void) strcpy(mp->mod_stats.ms_casesolved.fmds_name, "fmd.casesolved");
+ (void) strcpy(mp->mod_stats.ms_caseclosed.fmds_name, "fmd.caseclosed");
+
+ fmd_serd_hash_create(&mp->mod_serds);
+
+ fmd_hdl_debug(hdl, "register module");
+
+ return (0);
+}
+
+void
+fmd_hdl_unregister(fmd_hdl_t *hdl)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ fmd_modstat_t *msp = &mp->mod_stats;
+ const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
+
+ /* dump generic module stats */
+ fmd_hdl_debug(hdl, "%s: %llu", msp->ms_accepted.fmds_name,
+ msp->ms_accepted.fmds_value.ui64);
+ if (ops->fmdo_close != NULL) {
+ fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseopen.fmds_name,
+ msp->ms_caseopen.fmds_value.ui64);
+ fmd_hdl_debug(hdl, "%s: %llu", msp->ms_casesolved.fmds_name,
+ msp->ms_casesolved.fmds_value.ui64);
+ fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseclosed.fmds_name,
+ msp->ms_caseclosed.fmds_value.ui64);
+ }
+
+ /* dump module specific stats */
+ if (mp->mod_ustat != NULL) {
+ int i;
+
+ for (i = 0; i < mp->mod_ustat_cnt; i++) {
+ fmd_hdl_debug(hdl, "%s: %llu",
+ mp->mod_ustat[i].fmds_name,
+ mp->mod_ustat[i].fmds_value.ui64);
+ }
+ }
+
+ fmd_serd_hash_destroy(&mp->mod_serds);
+
+ fmd_hdl_debug(hdl, "unregister module");
+}
+
+/*
+ * fmd_hdl_setspecific() is used to associate a data pointer with
+ * the specified handle for the duration of the module's lifetime.
+ * This pointer can be retrieved using fmd_hdl_getspecific().
+ */
+void
+fmd_hdl_setspecific(fmd_hdl_t *hdl, void *spec)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ mp->mod_spec = spec;
+}
+
+/*
+ * Return the module-specific data pointer previously associated
+ * with the handle using fmd_hdl_setspecific().
+ */
+void *
+fmd_hdl_getspecific(fmd_hdl_t *hdl)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ return (mp->mod_spec);
+}
+
+void *
+fmd_hdl_alloc(fmd_hdl_t *hdl, size_t size, int flags)
+{
+ return (umem_alloc(size, flags));
+}
+
+void *
+fmd_hdl_zalloc(fmd_hdl_t *hdl, size_t size, int flags)
+{
+ return (umem_zalloc(size, flags));
+}
+
+void
+fmd_hdl_free(fmd_hdl_t *hdl, void *data, size_t size)
+{
+ umem_free(data, size);
+}
+
+/*
+ * Record a module debug message using the specified format.
+ */
+void
+fmd_hdl_debug(fmd_hdl_t *hdl, const char *format, ...)
+{
+ char message[256];
+ va_list vargs;
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ va_start(vargs, format);
+ (void) vsnprintf(message, sizeof (message), format, vargs);
+ va_end(vargs);
+
+ /* prefix message with module name */
+ zed_log_msg(LOG_INFO, "%s: %s", mp->mod_name, message);
+}
+
+/* Property Retrieval */
+
+int32_t
+fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name)
+{
+ /*
+	 * These can be looked up in mp->mod_info->fmdi_props
+ * For now we just hard code for phase 2. In the
+ * future, there can be a ZED based override.
+ */
+ if (strcmp(name, "spare_on_remove") == 0)
+ return (1);
+
+ if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0)
+ return (10); /* N = 10 events */
+
+ return (0);
+}
+
+int64_t
+fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name)
+{
+ /*
+	 * These can be looked up in mp->mod_info->fmdi_props
+ * For now we just hard code for phase 2. In the
+ * future, there can be a ZED based override.
+ */
+ if (strcmp(name, "remove_timeout") == 0)
+ return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */
+
+ if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0)
+ return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */
+
+ return (0);
+}
+
+/* FMD Statistics */
+
+fmd_stat_t *
+fmd_stat_create(fmd_hdl_t *hdl, uint_t flags, uint_t nstats, fmd_stat_t *statv)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ if (flags == FMD_STAT_NOALLOC) {
+ mp->mod_ustat = statv;
+ mp->mod_ustat_cnt = nstats;
+ }
+
+ return (statv);
+}
+
+/* Case Management */
+
+fmd_case_t *
+fmd_case_open(fmd_hdl_t *hdl, void *data)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ uuid_t uuid;
+
+ fmd_case_t *cp;
+
+ cp = fmd_hdl_zalloc(hdl, sizeof (fmd_case_t), FMD_SLEEP);
+ cp->ci_mod = hdl;
+ cp->ci_state = FMD_CASE_UNSOLVED;
+ cp->ci_flags = FMD_CF_DIRTY;
+ cp->ci_data = data;
+ cp->ci_bufptr = NULL;
+ cp->ci_bufsiz = 0;
+
+ uuid_generate(uuid);
+ uuid_unparse(uuid, cp->ci_uuid);
+
+ fmd_hdl_debug(hdl, "case opened (%s)", cp->ci_uuid);
+ mp->mod_stats.ms_caseopen.fmds_value.ui64++;
+
+ return (cp);
+}
+
+void
+fmd_case_solve(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ /*
+ * For ZED, the event was already sent from fmd_case_add_suspect()
+ */
+
+ if (cp->ci_state >= FMD_CASE_SOLVED)
+ fmd_hdl_debug(hdl, "case is already solved or closed");
+
+ cp->ci_state = FMD_CASE_SOLVED;
+
+ fmd_hdl_debug(hdl, "case solved (%s)", cp->ci_uuid);
+ mp->mod_stats.ms_casesolved.fmds_value.ui64++;
+}
+
+void
+fmd_case_close(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
+
+ fmd_hdl_debug(hdl, "case closed (%s)", cp->ci_uuid);
+
+ if (ops->fmdo_close != NULL)
+ ops->fmdo_close(hdl, cp);
+
+ mp->mod_stats.ms_caseopen.fmds_value.ui64--;
+ mp->mod_stats.ms_caseclosed.fmds_value.ui64++;
+
+ if (cp->ci_bufptr != NULL && cp->ci_bufsiz > 0)
+ fmd_hdl_free(hdl, cp->ci_bufptr, cp->ci_bufsiz);
+
+ fmd_hdl_free(hdl, cp, sizeof (fmd_case_t));
+}
+
+void
+fmd_case_uuresolved(fmd_hdl_t *hdl, const char *uuid)
+{
+ fmd_hdl_debug(hdl, "case resolved by uuid (%s)", uuid);
+}
+
+int
+fmd_case_solved(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+ return ((cp->ci_state >= FMD_CASE_SOLVED) ? FMD_B_TRUE : FMD_B_FALSE);
+}
+
+void
+fmd_case_add_ereport(fmd_hdl_t *hdl, fmd_case_t *cp, fmd_event_t *ep)
+{
+}
+
+static void
+zed_log_fault(nvlist_t *nvl, const char *uuid, const char *code)
+{
+ nvlist_t *rsrc;
+ char *strval;
+ uint64_t guid;
+ uint8_t byte;
+
+ zed_log_msg(LOG_INFO, "\nzed_fault_event:");
+
+ if (uuid != NULL)
+ zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_UUID, uuid);
+ if (nvlist_lookup_string(nvl, FM_CLASS, &strval) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %s", FM_CLASS, strval);
+ if (code != NULL)
+ zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_DIAG_CODE, code);
+ if (nvlist_lookup_uint8(nvl, FM_FAULT_CERTAINTY, &byte) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FAULT_CERTAINTY, byte);
+ if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
+ if (nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, &strval) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %s", FM_FMRI_SCHEME,
+ strval);
+ if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_POOL, &guid) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FMRI_ZFS_POOL,
+ guid);
+ if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_VDEV, &guid) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %llu \n", FM_FMRI_ZFS_VDEV,
+ guid);
+ }
+}
+
+static const char *
+fmd_fault_mkcode(nvlist_t *fault)
+{
+ char *class, *code = "-";
+
+ /*
+ * Note: message codes come from: openzfs/usr/src/cmd/fm/dicts/ZFS.po
+ */
+ if (nvlist_lookup_string(fault, FM_CLASS, &class) == 0) {
+ if (strcmp(class, "fault.fs.zfs.vdev.io") == 0)
+ code = "ZFS-8000-FD";
+ else if (strcmp(class, "fault.fs.zfs.vdev.checksum") == 0)
+ code = "ZFS-8000-GH";
+ else if (strcmp(class, "fault.fs.zfs.io_failure_wait") == 0)
+ code = "ZFS-8000-HC";
+ else if (strcmp(class, "fault.fs.zfs.io_failure_continue") == 0)
+ code = "ZFS-8000-JQ";
+ else if (strcmp(class, "fault.fs.zfs.log_replay") == 0)
+ code = "ZFS-8000-K4";
+ else if (strcmp(class, "fault.fs.zfs.pool") == 0)
+ code = "ZFS-8000-CS";
+ else if (strcmp(class, "fault.fs.zfs.device") == 0)
+ code = "ZFS-8000-D3";
+
+ }
+ return (code);
+}
+
+void
+fmd_case_add_suspect(fmd_hdl_t *hdl, fmd_case_t *cp, nvlist_t *fault)
+{
+ nvlist_t *nvl;
+ const char *code = fmd_fault_mkcode(fault);
+ int64_t tod[2];
+ int err = 0;
+
+ /*
+ * payload derived from fmd_protocol_list()
+ */
+
+ (void) gettimeofday(&cp->ci_tv, NULL);
+ tod[0] = cp->ci_tv.tv_sec;
+ tod[1] = cp->ci_tv.tv_usec;
+
+ nvl = fmd_nvl_alloc(hdl, FMD_SLEEP);
+
+ err |= nvlist_add_uint8(nvl, FM_VERSION, FM_SUSPECT_VERSION);
+ err |= nvlist_add_string(nvl, FM_CLASS, FM_LIST_SUSPECT_CLASS);
+ err |= nvlist_add_string(nvl, FM_SUSPECT_UUID, cp->ci_uuid);
+ err |= nvlist_add_string(nvl, FM_SUSPECT_DIAG_CODE, code);
+ err |= nvlist_add_int64_array(nvl, FM_SUSPECT_DIAG_TIME, tod, 2);
+ err |= nvlist_add_uint32(nvl, FM_SUSPECT_FAULT_SZ, 1);
+ err |= nvlist_add_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, &fault, 1);
+
+ if (err)
+ zed_log_die("failed to populate nvlist");
+
+ zed_log_fault(fault, cp->ci_uuid, code);
+ zfs_agent_post_event(FM_LIST_SUSPECT_CLASS, NULL, nvl);
+
+ nvlist_free(nvl);
+ nvlist_free(fault);
+}
+
+void
+fmd_case_setspecific(fmd_hdl_t *hdl, fmd_case_t *cp, void *data)
+{
+ cp->ci_data = data;
+}
+
+void *
+fmd_case_getspecific(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+ return (cp->ci_data);
+}
+
+void
+fmd_buf_create(fmd_hdl_t *hdl, fmd_case_t *cp, const char *name, size_t size)
+{
+ assert(strcmp(name, "data") == 0);
+ assert(cp->ci_bufptr == NULL);
+ assert(size < (1024 * 1024));
+
+ cp->ci_bufptr = fmd_hdl_alloc(hdl, size, FMD_SLEEP);
+ cp->ci_bufsiz = size;
+}
+
+void
+fmd_buf_read(fmd_hdl_t *hdl, fmd_case_t *cp,
+ const char *name, void *buf, size_t size)
+{
+ assert(strcmp(name, "data") == 0);
+ assert(cp->ci_bufptr != NULL);
+ assert(size <= cp->ci_bufsiz);
+
+ bcopy(cp->ci_bufptr, buf, size);
+}
+
+void
+fmd_buf_write(fmd_hdl_t *hdl, fmd_case_t *cp,
+ const char *name, const void *buf, size_t size)
+{
+ assert(strcmp(name, "data") == 0);
+ assert(cp->ci_bufptr != NULL);
+ assert(cp->ci_bufsiz >= size);
+
+ bcopy(buf, cp->ci_bufptr, size);
+}
+
+/* SERD Engines */
+
+void
+fmd_serd_create(fmd_hdl_t *hdl, const char *name, uint_t n, hrtime_t t)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ if (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL) {
+ zed_log_msg(LOG_ERR, "failed to create SERD engine '%s': "
+ " name already exists", name);
+ return;
+ }
+
+ (void) fmd_serd_eng_insert(&mp->mod_serds, name, n, t);
+}
+
+void
+fmd_serd_destroy(fmd_hdl_t *hdl, const char *name)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ fmd_serd_eng_delete(&mp->mod_serds, name);
+
+ fmd_hdl_debug(hdl, "serd_destroy %s", name);
+}
+
+int
+fmd_serd_exists(fmd_hdl_t *hdl, const char *name)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL);
+}
+
+void
+fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ fmd_serd_eng_t *sgp;
+
+ if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
+ zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
+ return;
+ }
+
+ fmd_serd_eng_reset(sgp);
+
+ fmd_hdl_debug(hdl, "serd_reset %s", name);
+}
+
+int
+fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ fmd_serd_eng_t *sgp;
+ int err;
+
+ if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
+ zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'",
+ name);
+ return (FMD_B_FALSE);
+ }
+ err = fmd_serd_eng_record(sgp, ep->ev_hrt);
+
+ return (err);
+}
+
+/* FMD Timers */
+
+static void
+_timer_notify(union sigval sv)
+{
+ fmd_timer_t *ftp = sv.sival_ptr;
+ fmd_hdl_t *hdl = ftp->ft_hdl;
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
+ struct itimerspec its;
+
+ fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid);
+
+ /* disarm the timer */
+ bzero(&its, sizeof (struct itimerspec));
+ timer_settime(ftp->ft_tid, 0, &its, NULL);
+
+ /* Note that the fmdo_timeout can remove this timer */
+ if (ops->fmdo_timeout != NULL)
+ ops->fmdo_timeout(hdl, ftp, ftp->ft_arg);
+}
+
+/*
+ * Install a new timer which will fire at least delta nanoseconds after the
+ * current time. After the timeout has expired, the module's fmdo_timeout
+ * entry point is called.
+ */
+fmd_timer_t *
+fmd_timer_install(fmd_hdl_t *hdl, void *arg, fmd_event_t *ep, hrtime_t delta)
+{
+ struct sigevent sev;
+ struct itimerspec its;
+ fmd_timer_t *ftp;
+
+ ftp = fmd_hdl_alloc(hdl, sizeof (fmd_timer_t), FMD_SLEEP);
+ ftp->ft_arg = arg;
+ ftp->ft_hdl = hdl;
+
+ its.it_value.tv_sec = delta / 1000000000;
+ its.it_value.tv_nsec = delta % 1000000000;
+ its.it_interval.tv_sec = its.it_value.tv_sec;
+ its.it_interval.tv_nsec = its.it_value.tv_nsec;
+
+ sev.sigev_notify = SIGEV_THREAD;
+ sev.sigev_notify_function = _timer_notify;
+ sev.sigev_notify_attributes = NULL;
+ sev.sigev_value.sival_ptr = ftp;
+
+ timer_create(CLOCK_REALTIME, &sev, &ftp->ft_tid);
+ timer_settime(ftp->ft_tid, 0, &its, NULL);
+
+ fmd_hdl_debug(hdl, "installing timer for %d secs (%p)",
+ (int)its.it_value.tv_sec, ftp->ft_tid);
+
+ return (ftp);
+}
+
+void
+fmd_timer_remove(fmd_hdl_t *hdl, fmd_timer_t *ftp)
+{
+ fmd_hdl_debug(hdl, "removing timer (%p)", ftp->ft_tid);
+
+ timer_delete(ftp->ft_tid);
+
+ fmd_hdl_free(hdl, ftp, sizeof (fmd_timer_t));
+}
+
+/* Name-Value Pair Lists */
+
+nvlist_t *
+fmd_nvl_create_fault(fmd_hdl_t *hdl, const char *class, uint8_t certainty,
+ nvlist_t *asru, nvlist_t *fru, nvlist_t *resource)
+{
+ nvlist_t *nvl;
+ int err = 0;
+
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+ zed_log_die("failed to xalloc fault nvlist");
+
+ err |= nvlist_add_uint8(nvl, FM_VERSION, FM_FAULT_VERSION);
+ err |= nvlist_add_string(nvl, FM_CLASS, class);
+ err |= nvlist_add_uint8(nvl, FM_FAULT_CERTAINTY, certainty);
+
+ if (asru != NULL)
+ err |= nvlist_add_nvlist(nvl, FM_FAULT_ASRU, asru);
+ if (fru != NULL)
+ err |= nvlist_add_nvlist(nvl, FM_FAULT_FRU, fru);
+ if (resource != NULL)
+ err |= nvlist_add_nvlist(nvl, FM_FAULT_RESOURCE, resource);
+
+ if (err)
+ zed_log_die("failed to populate nvlist: %s\n", strerror(err));
+
+ return (nvl);
+}
+
+/*
+ * sourced from fmd_string.c
+ */
+static int
+fmd_strmatch(const char *s, const char *p)
+{
+ char c;
+
+ if (p == NULL)
+ return (0);
+
+ if (s == NULL)
+ s = ""; /* treat NULL string as the empty string */
+
+ do {
+ if ((c = *p++) == '\0')
+ return (*s == '\0');
+
+ if (c == '*') {
+ while (*p == '*')
+ p++; /* consecutive *'s can be collapsed */
+
+ if (*p == '\0')
+ return (1);
+
+ while (*s != '\0') {
+ if (fmd_strmatch(s++, p) != 0)
+ return (1);
+ }
+
+ return (0);
+ }
+ } while (c == *s++);
+
+ return (0);
+}
+
+int
+fmd_nvl_class_match(fmd_hdl_t *hdl, nvlist_t *nvl, const char *pattern)
+{
+ char *class;
+
+ return (nvl != NULL &&
+ nvlist_lookup_string(nvl, FM_CLASS, &class) == 0 &&
+ fmd_strmatch(class, pattern));
+}
+
+nvlist_t *
+fmd_nvl_alloc(fmd_hdl_t *hdl, int flags)
+{
+ nvlist_t *nvl = NULL;
+
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+ return (NULL);
+
+ return (nvl);
+}
+
+
+/*
+ * ZED Agent specific APIs
+ */
+
+fmd_hdl_t *
+fmd_module_hdl(const char *name)
+{
+ if (strcmp(name, "zfs-retire") == 0)
+ return ((fmd_hdl_t *)&zfs_retire_module);
+ if (strcmp(name, "zfs-diagnosis") == 0)
+ return ((fmd_hdl_t *)&zfs_diagnosis_module);
+
+ return (NULL);
+}
+
+boolean_t
+fmd_module_initialized(fmd_hdl_t *hdl)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ return (mp->mod_info != NULL);
+}
+
+/*
+ * fmd_module_recv is called for each event that is received by
+ * the fault manager that has a class that matches one of the
+ * module's subscriptions.
+ */
+void
+fmd_module_recv(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
+ fmd_event_t faux_event = {0};
+ int64_t *tv;
+ uint_t n;
+
+ /*
+	 * Will need to normalize this if we persistently store the case data
+ */
+ if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0)
+ faux_event.ev_hrt = tv[0] * NANOSEC + tv[1];
+ else
+ faux_event.ev_hrt = 0;
+
+ ops->fmdo_recv(hdl, &faux_event, nvl, class);
+
+ mp->mod_stats.ms_accepted.fmds_value.ui64++;
+
+	/* TBD - should we initiate fmd_module_gc() periodically? */
+}
diff --git a/sys/contrib/openzfs/cmd/zed/agents/fmd_api.h b/sys/contrib/openzfs/cmd/zed/agents/fmd_api.h
new file mode 100644
index 000000000000..4f06fb244b7b
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/fmd_api.h
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef _FMD_API_H
+#define _FMD_API_H
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+#include <libnvpair.h>
+#include <stdarg.h>
+#include <umem.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Fault Management Daemon Client Interfaces
+ */
+
+#define FMD_API_VERSION 5
+
+typedef struct fmd_hdl fmd_hdl_t;
+
+typedef struct fmd_timer {
+ timer_t ft_tid;
+ void *ft_arg;
+ fmd_hdl_t *ft_hdl;
+} fmd_timer_t;
+
+#define id_t fmd_timer_t *
+
+
+typedef struct fmd_event {
+ hrtime_t ev_hrt; /* event time used by SERD engines */
+} fmd_event_t;
+
+typedef struct fmd_case {
+ char ci_uuid[48]; /* uuid string for this case */
+ fmd_hdl_t *ci_mod; /* module that owns this case */
+ void *ci_data; /* data from fmd_case_setspecific() */
+ ushort_t ci_state; /* case state (see below) */
+ ushort_t ci_flags; /* case flags (see below) */
+ struct timeval ci_tv; /* time of original diagnosis */
+ void *ci_bufptr; /* case data serialization buffer */
+ size_t ci_bufsiz;
+} fmd_case_t;
+
+
+#define FMD_B_FALSE 0 /* false value for booleans as int */
+#define FMD_B_TRUE 1 /* true value for booleans as int */
+
+
+#define FMD_CASE_UNSOLVED 0 /* case is not yet solved (waiting) */
+#define FMD_CASE_SOLVED 1 /* case is solved (suspects added) */
+#define FMD_CASE_CLOSE_WAIT 2 /* case is executing fmdo_close() */
+#define FMD_CASE_CLOSED 3 /* case is closed (reconfig done) */
+#define FMD_CASE_REPAIRED 4 /* case is repaired */
+#define FMD_CASE_RESOLVED 5 /* case is resolved (can be freed) */
+
+#define FMD_CF_DIRTY 0x01 /* case is in need of checkpoint */
+#define FMD_CF_SOLVED 0x02 /* case has been solved */
+#define FMD_CF_ISOLATED 0x04 /* case has been isolated */
+#define FMD_CF_REPAIRED 0x08 /* case has been repaired */
+#define FMD_CF_RESOLVED 0x10 /* case has been resolved */
+
+
+#define FMD_TYPE_BOOL 0 /* int */
+#define FMD_TYPE_INT32 1 /* int32_t */
+#define FMD_TYPE_UINT32 2 /* uint32_t */
+#define FMD_TYPE_INT64 3 /* int64_t */
+#define FMD_TYPE_UINT64 4 /* uint64_t */
+#define FMD_TYPE_TIME 5 /* uint64_t */
+#define FMD_TYPE_SIZE 6 /* uint64_t */
+
+typedef struct fmd_prop {
+ const char *fmdp_name; /* property name */
+ uint_t fmdp_type; /* property type (see above) */
+ const char *fmdp_defv; /* default value */
+} fmd_prop_t;
+
+typedef struct fmd_stat {
+ char fmds_name[32]; /* statistic name */
+ uint_t fmds_type; /* statistic type (see above) */
+ char fmds_desc[64]; /* statistic description */
+ union {
+ int bool; /* FMD_TYPE_BOOL */
+ int32_t i32; /* FMD_TYPE_INT32 */
+ uint32_t ui32; /* FMD_TYPE_UINT32 */
+ int64_t i64; /* FMD_TYPE_INT64 */
+ uint64_t ui64; /* FMD_TYPE_UINT64 */
+ } fmds_value;
+} fmd_stat_t;
+
+typedef struct fmd_hdl_ops {
+ void (*fmdo_recv)(fmd_hdl_t *, fmd_event_t *, nvlist_t *, const char *);
+ void (*fmdo_timeout)(fmd_hdl_t *, id_t, void *);
+ void (*fmdo_close)(fmd_hdl_t *, fmd_case_t *);
+ void (*fmdo_stats)(fmd_hdl_t *);
+ void (*fmdo_gc)(fmd_hdl_t *);
+} fmd_hdl_ops_t;
+
+#define FMD_SEND_SUCCESS 0 /* fmdo_send queued event */
+#define FMD_SEND_FAILED 1 /* fmdo_send unrecoverable error */
+#define FMD_SEND_RETRY 2 /* fmdo_send requests retry */
+
+typedef struct fmd_hdl_info {
+ const char *fmdi_desc; /* fmd client description string */
+ const char *fmdi_vers; /* fmd client version string */
+ const fmd_hdl_ops_t *fmdi_ops; /* ops vector for client */
+ const fmd_prop_t *fmdi_props; /* array of configuration props */
+} fmd_hdl_info_t;
+
+extern int fmd_hdl_register(fmd_hdl_t *, int, const fmd_hdl_info_t *);
+extern void fmd_hdl_unregister(fmd_hdl_t *);
+
+extern void fmd_hdl_setspecific(fmd_hdl_t *, void *);
+extern void *fmd_hdl_getspecific(fmd_hdl_t *);
+
+#define FMD_SLEEP UMEM_NOFAIL
+
+extern void *fmd_hdl_alloc(fmd_hdl_t *, size_t, int);
+extern void *fmd_hdl_zalloc(fmd_hdl_t *, size_t, int);
+extern void fmd_hdl_free(fmd_hdl_t *, void *, size_t);
+
+extern char *fmd_hdl_strdup(fmd_hdl_t *, const char *, int);
+extern void fmd_hdl_strfree(fmd_hdl_t *, char *);
+
+extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list);
+extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...);
+
+extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *);
+extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *);
+
+#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */
+#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */
+
+extern fmd_stat_t *fmd_stat_create(fmd_hdl_t *, uint_t, uint_t, fmd_stat_t *);
+extern void fmd_stat_destroy(fmd_hdl_t *, uint_t, fmd_stat_t *);
+extern void fmd_stat_setstr(fmd_hdl_t *, fmd_stat_t *, const char *);
+
+extern fmd_case_t *fmd_case_open(fmd_hdl_t *, void *);
+extern void fmd_case_reset(fmd_hdl_t *, fmd_case_t *);
+extern void fmd_case_solve(fmd_hdl_t *, fmd_case_t *);
+extern void fmd_case_close(fmd_hdl_t *, fmd_case_t *);
+
+extern const char *fmd_case_uuid(fmd_hdl_t *, fmd_case_t *);
+extern fmd_case_t *fmd_case_uulookup(fmd_hdl_t *, const char *);
+extern void fmd_case_uuclose(fmd_hdl_t *, const char *);
+extern int fmd_case_uuclosed(fmd_hdl_t *, const char *);
+extern int fmd_case_uuisresolved(fmd_hdl_t *, const char *);
+extern void fmd_case_uuresolved(fmd_hdl_t *, const char *);
+
+extern int fmd_case_solved(fmd_hdl_t *, fmd_case_t *);
+extern int fmd_case_closed(fmd_hdl_t *, fmd_case_t *);
+
+extern void fmd_case_add_ereport(fmd_hdl_t *, fmd_case_t *, fmd_event_t *);
+extern void fmd_case_add_serd(fmd_hdl_t *, fmd_case_t *, const char *);
+extern void fmd_case_add_suspect(fmd_hdl_t *, fmd_case_t *, nvlist_t *);
+
+extern void fmd_case_setspecific(fmd_hdl_t *, fmd_case_t *, void *);
+extern void *fmd_case_getspecific(fmd_hdl_t *, fmd_case_t *);
+
+extern fmd_case_t *fmd_case_next(fmd_hdl_t *, fmd_case_t *);
+extern fmd_case_t *fmd_case_prev(fmd_hdl_t *, fmd_case_t *);
+
+extern void fmd_buf_create(fmd_hdl_t *, fmd_case_t *, const char *, size_t);
+extern void fmd_buf_destroy(fmd_hdl_t *, fmd_case_t *, const char *);
+extern void fmd_buf_read(fmd_hdl_t *, fmd_case_t *,
+ const char *, void *, size_t);
+extern void fmd_buf_write(fmd_hdl_t *, fmd_case_t *,
+ const char *, const void *, size_t);
+extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *);
+
+extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t);
+extern void fmd_serd_destroy(fmd_hdl_t *, const char *);
+extern int fmd_serd_exists(fmd_hdl_t *, const char *);
+extern void fmd_serd_reset(fmd_hdl_t *, const char *);
+extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *);
+extern int fmd_serd_fired(fmd_hdl_t *, const char *);
+extern int fmd_serd_empty(fmd_hdl_t *, const char *);
+
+extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t);
+extern void fmd_timer_remove(fmd_hdl_t *, id_t);
+
+extern nvlist_t *fmd_nvl_create_fault(fmd_hdl_t *,
+ const char *, uint8_t, nvlist_t *, nvlist_t *, nvlist_t *);
+
+extern int fmd_nvl_class_match(fmd_hdl_t *, nvlist_t *, const char *);
+
+#define FMD_HAS_FAULT_FRU 0
+#define FMD_HAS_FAULT_ASRU 1
+#define FMD_HAS_FAULT_RESOURCE 2
+
+extern void fmd_repair_fru(fmd_hdl_t *, const char *);
+extern int fmd_repair_asru(fmd_hdl_t *, const char *);
+
+extern nvlist_t *fmd_nvl_alloc(fmd_hdl_t *, int);
+extern nvlist_t *fmd_nvl_dup(fmd_hdl_t *, nvlist_t *, int);
+
+/*
+ * ZED Specific Interfaces
+ */
+
+extern fmd_hdl_t *fmd_module_hdl(const char *);
+extern boolean_t fmd_module_initialized(fmd_hdl_t *);
+extern void fmd_module_recv(fmd_hdl_t *, nvlist_t *, const char *);
+
+/* ZFS FMA Retire Agent */
+extern void _zfs_retire_init(fmd_hdl_t *);
+extern void _zfs_retire_fini(fmd_hdl_t *);
+
+/* ZFS FMA Diagnosis Engine */
+extern void _zfs_diagnosis_init(fmd_hdl_t *);
+extern void _zfs_diagnosis_fini(fmd_hdl_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FMD_API_H */
diff --git a/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.c b/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.c
new file mode 100644
index 000000000000..d4ec37fb7691
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.c
@@ -0,0 +1,316 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/list.h>
+#include <sys/time.h>
+
+#include "fmd_api.h"
+#include "fmd_serd.h"
+#include "../zed_log.h"
+
+
+#define FMD_STR_BUCKETS 211
+
+
+#ifdef SERD_ENG_DEBUG
+#define serd_log_msg(fmt, ...) \
+ zed_log_msg(LOG_INFO, fmt, __VA_ARGS__)
+#else
+#define serd_log_msg(fmt, ...)
+#endif
+
+
+/*
+ * SERD Engine Backend
+ */
+
+/*
+ * Compute the delta between events in nanoseconds. To account for very old
+ * events which are replayed, we must handle the case where time is negative.
+ * We convert the hrtime_t's to unsigned 64-bit integers and then handle the
+ * case where 'old' is greater than 'new' (i.e. high-res time has wrapped).
+ */
+static hrtime_t
+fmd_event_delta(hrtime_t t1, hrtime_t t2)
+{
+ uint64_t old = t1;
+ uint64_t new = t2;
+
+ return (new >= old ? new - old : (UINT64_MAX - old) + new + 1);
+}
+
+static fmd_serd_eng_t *
+fmd_serd_eng_alloc(const char *name, uint64_t n, hrtime_t t)
+{
+ fmd_serd_eng_t *sgp;
+
+ sgp = malloc(sizeof (fmd_serd_eng_t));
+ bzero(sgp, sizeof (fmd_serd_eng_t));
+
+ sgp->sg_name = strdup(name);
+ sgp->sg_flags = FMD_SERD_DIRTY;
+ sgp->sg_n = n;
+ sgp->sg_t = t;
+
+ list_create(&sgp->sg_list, sizeof (fmd_serd_elem_t),
+ offsetof(fmd_serd_elem_t, se_list));
+
+ return (sgp);
+}
+
+static void
+fmd_serd_eng_free(fmd_serd_eng_t *sgp)
+{
+ fmd_serd_eng_reset(sgp);
+ free(sgp->sg_name);
+ list_destroy(&sgp->sg_list);
+ free(sgp);
+}
+
+/*
+ * sourced from fmd_string.c
+ */
+static ulong_t
+fmd_strhash(const char *key)
+{
+ ulong_t g, h = 0;
+ const char *p;
+
+ for (p = key; *p != '\0'; p++) {
+ h = (h << 4) + *p;
+
+ if ((g = (h & 0xf0000000)) != 0) {
+ h ^= (g >> 24);
+ h ^= g;
+ }
+ }
+
+ return (h);
+}
+
+void
+fmd_serd_hash_create(fmd_serd_hash_t *shp)
+{
+ shp->sh_hashlen = FMD_STR_BUCKETS;
+ shp->sh_hash = calloc(shp->sh_hashlen, sizeof (void *));
+ shp->sh_count = 0;
+}
+
+void
+fmd_serd_hash_destroy(fmd_serd_hash_t *shp)
+{
+ fmd_serd_eng_t *sgp, *ngp;
+ uint_t i;
+
+ for (i = 0; i < shp->sh_hashlen; i++) {
+ for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = ngp) {
+ ngp = sgp->sg_next;
+ fmd_serd_eng_free(sgp);
+ }
+ }
+
+ free(shp->sh_hash);
+ bzero(shp, sizeof (fmd_serd_hash_t));
+}
+
+void
+fmd_serd_hash_apply(fmd_serd_hash_t *shp, fmd_serd_eng_f *func, void *arg)
+{
+ fmd_serd_eng_t *sgp;
+ uint_t i;
+
+ for (i = 0; i < shp->sh_hashlen; i++) {
+ for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = sgp->sg_next)
+ func(sgp, arg);
+ }
+}
+
+fmd_serd_eng_t *
+fmd_serd_eng_insert(fmd_serd_hash_t *shp, const char *name,
+ uint_t n, hrtime_t t)
+{
+ uint_t h = fmd_strhash(name) % shp->sh_hashlen;
+ fmd_serd_eng_t *sgp = fmd_serd_eng_alloc(name, n, t);
+
+ serd_log_msg(" SERD Engine: inserting %s N %d T %llu",
+ name, (int)n, (long long unsigned)t);
+
+ sgp->sg_next = shp->sh_hash[h];
+ shp->sh_hash[h] = sgp;
+ shp->sh_count++;
+
+ return (sgp);
+}
+
+fmd_serd_eng_t *
+fmd_serd_eng_lookup(fmd_serd_hash_t *shp, const char *name)
+{
+ uint_t h = fmd_strhash(name) % shp->sh_hashlen;
+ fmd_serd_eng_t *sgp;
+
+ for (sgp = shp->sh_hash[h]; sgp != NULL; sgp = sgp->sg_next) {
+ if (strcmp(name, sgp->sg_name) == 0)
+ return (sgp);
+ }
+
+ return (NULL);
+}
+
+void
+fmd_serd_eng_delete(fmd_serd_hash_t *shp, const char *name)
+{
+ uint_t h = fmd_strhash(name) % shp->sh_hashlen;
+ fmd_serd_eng_t *sgp, **pp = &shp->sh_hash[h];
+
+ serd_log_msg(" SERD Engine: deleting %s", name);
+
+ for (sgp = *pp; sgp != NULL; sgp = sgp->sg_next) {
+ if (strcmp(sgp->sg_name, name) != 0)
+ pp = &sgp->sg_next;
+ else
+ break;
+ }
+
+ if (sgp != NULL) {
+ *pp = sgp->sg_next;
+ fmd_serd_eng_free(sgp);
+ assert(shp->sh_count != 0);
+ shp->sh_count--;
+ }
+}
+
+static void
+fmd_serd_eng_discard(fmd_serd_eng_t *sgp, fmd_serd_elem_t *sep)
+{
+ list_remove(&sgp->sg_list, sep);
+ sgp->sg_count--;
+
+ serd_log_msg(" SERD Engine: discarding %s, %d remaining",
+ sgp->sg_name, (int)sgp->sg_count);
+
+ free(sep);
+}
+
+int
+fmd_serd_eng_record(fmd_serd_eng_t *sgp, hrtime_t hrt)
+{
+ fmd_serd_elem_t *sep, *oep;
+
+ /*
+ * If the fired flag is already set, return false and discard the
+ * event. This means that the caller will only see the engine "fire"
+ * once until fmd_serd_eng_reset() is called. The fmd_serd_eng_fired()
+ * function can also be used in combination with fmd_serd_eng_record().
+ */
+ if (sgp->sg_flags & FMD_SERD_FIRED) {
+ serd_log_msg(" SERD Engine: record %s already fired!",
+ sgp->sg_name);
+ return (FMD_B_FALSE);
+ }
+
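+	/*
+	 * Make room so that, once the new event is added, the list holds at
+	 * most N events; the engine fires when N events fall within time T.
+	 */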
+ while (sgp->sg_count >= sgp->sg_n)
+ fmd_serd_eng_discard(sgp, list_tail(&sgp->sg_list));
+
+ sep = malloc(sizeof (fmd_serd_elem_t));
+ sep->se_hrt = hrt;
+
+ list_insert_head(&sgp->sg_list, sep);
+ sgp->sg_count++;
+
+ serd_log_msg(" SERD Engine: recording %s of %d (%llu)",
+ sgp->sg_name, (int)sgp->sg_count, (long long unsigned)hrt);
+
+ /*
+ * Pick up the oldest element pointer for comparison to 'sep'. We must
+ * do this after adding 'sep' because 'oep' and 'sep' can be the same.
+ */
+ oep = list_tail(&sgp->sg_list);
+
+ if (sgp->sg_count >= sgp->sg_n &&
+ fmd_event_delta(oep->se_hrt, sep->se_hrt) <= sgp->sg_t) {
+ sgp->sg_flags |= FMD_SERD_FIRED | FMD_SERD_DIRTY;
+ serd_log_msg(" SERD Engine: fired %s", sgp->sg_name);
+ return (FMD_B_TRUE);
+ }
+
+ sgp->sg_flags |= FMD_SERD_DIRTY;
+ return (FMD_B_FALSE);
+}
+
+int
+fmd_serd_eng_fired(fmd_serd_eng_t *sgp)
+{
+ return (sgp->sg_flags & FMD_SERD_FIRED);
+}
+
+int
+fmd_serd_eng_empty(fmd_serd_eng_t *sgp)
+{
+ return (sgp->sg_count == 0);
+}
+
+void
+fmd_serd_eng_reset(fmd_serd_eng_t *sgp)
+{
+ serd_log_msg(" SERD Engine: resetting %s", sgp->sg_name);
+
+ while (sgp->sg_count != 0)
+ fmd_serd_eng_discard(sgp, list_head(&sgp->sg_list));
+
+ sgp->sg_flags &= ~FMD_SERD_FIRED;
+ sgp->sg_flags |= FMD_SERD_DIRTY;
+}
+
+void
+fmd_serd_eng_gc(fmd_serd_eng_t *sgp)
+{
+ fmd_serd_elem_t *sep, *nep;
+ hrtime_t hrt;
+
+ if (sgp->sg_count == 0 || (sgp->sg_flags & FMD_SERD_FIRED))
+ return; /* no garbage collection needed if empty or fired */
+
+ sep = list_head(&sgp->sg_list);
+ if (sep == NULL)
+ return;
+
+ hrt = sep->se_hrt - sgp->sg_t;
+
+ for (sep = list_head(&sgp->sg_list); sep != NULL; sep = nep) {
+ if (sep->se_hrt >= hrt)
+ break; /* sep and subsequent events are all within T */
+
+ nep = list_next(&sgp->sg_list, sep);
+ fmd_serd_eng_discard(sgp, sep);
+ sgp->sg_flags |= FMD_SERD_DIRTY;
+ }
+}
diff --git a/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.h b/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.h
new file mode 100644
index 000000000000..c35c9acc7785
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/fmd_serd.h
@@ -0,0 +1,86 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef _FMD_SERD_H
+#define _FMD_SERD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/list.h>
+#include <sys/time.h>
+
+typedef struct fmd_serd_elem {
+ list_node_t se_list; /* linked list forward/back pointers */
+ hrtime_t se_hrt; /* upper bound on event hrtime */
+} fmd_serd_elem_t;
+
+typedef struct fmd_serd_eng {
+ char *sg_name; /* string name for this engine */
+ struct fmd_serd_eng *sg_next; /* next engine on hash chain */
+ list_t sg_list; /* list of fmd_serd_elem_t's */
+ uint_t sg_count; /* count of events in sg_list */
+ uint_t sg_flags; /* engine flags (see below) */
+ uint_t sg_n; /* engine N parameter (event count) */
+ hrtime_t sg_t; /* engine T parameter (nanoseconds) */
+} fmd_serd_eng_t;
+
+#define FMD_SERD_FIRED 0x1 /* error rate has exceeded threshold */
+#define FMD_SERD_DIRTY 0x2 /* engine needs to be checkpointed */
+
+typedef void fmd_serd_eng_f(fmd_serd_eng_t *, void *);
+
+typedef struct fmd_serd_hash {
+ fmd_serd_eng_t **sh_hash; /* hash bucket array for buffers */
+ uint_t sh_hashlen; /* length of hash bucket array */
+ uint_t sh_count; /* count of engines in hash */
+} fmd_serd_hash_t;
+
+extern void fmd_serd_hash_create(fmd_serd_hash_t *);
+extern void fmd_serd_hash_destroy(fmd_serd_hash_t *);
+extern void fmd_serd_hash_apply(fmd_serd_hash_t *, fmd_serd_eng_f *, void *);
+
+extern fmd_serd_eng_t *fmd_serd_eng_insert(fmd_serd_hash_t *,
+ const char *, uint32_t, hrtime_t);
+
+extern fmd_serd_eng_t *fmd_serd_eng_lookup(fmd_serd_hash_t *, const char *);
+extern void fmd_serd_eng_delete(fmd_serd_hash_t *, const char *);
+
+extern int fmd_serd_eng_record(fmd_serd_eng_t *, hrtime_t);
+extern int fmd_serd_eng_fired(fmd_serd_eng_t *);
+extern int fmd_serd_eng_empty(fmd_serd_eng_t *);
+
+extern void fmd_serd_eng_reset(fmd_serd_eng_t *);
+extern void fmd_serd_eng_gc(fmd_serd_eng_t *);
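+
+/*
+ * Illustrative use of this interface (a sketch of how the pieces fit
+ * together; the real callers are the fmd_serd_*() wrappers in fmd_api.c):
+ *
+ *	fmd_serd_hash_t hash;
+ *	fmd_serd_eng_t *eng;
+ *
+ *	fmd_serd_hash_create(&hash);
+ *	eng = fmd_serd_eng_insert(&hash, "zfs_<pool>_<vdev>_io", n, t);
+ *	if (fmd_serd_eng_record(eng, hrtime))
+ *		... engine fired: n events were recorded within time t ...
+ *	fmd_serd_hash_destroy(&hash);
+ */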
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FMD_SERD_H */
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c
new file mode 100644
index 000000000000..006e0ab99f47
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c
@@ -0,0 +1,422 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
+ */
+
+#include <libnvpair.h>
+#include <libzfs.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/list.h>
+#include <sys/time.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/fs/zfs.h>
+#include <pthread.h>
+#include <unistd.h>
+
+#include "zfs_agents.h"
+#include "fmd_api.h"
+#include "../zed_log.h"
+
+/*
+ * agent dispatch code
+ */
+
+static pthread_mutex_t agent_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t agent_cond = PTHREAD_COND_INITIALIZER;
+static list_t agent_events; /* list of pending events */
+static int agent_exiting;
+
+typedef struct agent_event {
+ char ae_class[64];
+ char ae_subclass[32];
+ nvlist_t *ae_nvl;
+ list_node_t ae_node;
+} agent_event_t;
+
+pthread_t g_agents_tid;
+
+libzfs_handle_t *g_zfs_hdl;
+
+/* guid search data */
+typedef enum device_type {
+ DEVICE_TYPE_L2ARC, /* l2arc device */
+ DEVICE_TYPE_SPARE, /* spare device */
+ DEVICE_TYPE_PRIMARY /* any primary pool storage device */
+} device_type_t;
+
+typedef struct guid_search {
+ uint64_t gs_pool_guid;
+ uint64_t gs_vdev_guid;
+ char *gs_devid;
+ device_type_t gs_vdev_type;
+ uint64_t gs_vdev_expandtime; /* vdev expansion time */
+} guid_search_t;
+
+/*
+ * Walks the vdev tree recursively looking for a matching devid.
+ * Returns B_TRUE as soon as a matching device is found, B_FALSE otherwise.
+ */
+static boolean_t
+zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
+{
+ guid_search_t *gsp = arg;
+ char *path = NULL;
+ uint_t c, children;
+ nvlist_t **child;
+
+ /*
+ * First iterate over any children.
+ */
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++) {
+ if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
+ gsp->gs_vdev_type = DEVICE_TYPE_PRIMARY;
+ return (B_TRUE);
+ }
+ }
+ }
+ /*
+ * Iterate over any spares and cache devices
+ */
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++) {
+ if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
+ gsp->gs_vdev_type = DEVICE_TYPE_L2ARC;
+ return (B_TRUE);
+ }
+ }
+ }
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++) {
+ if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
+ gsp->gs_vdev_type = DEVICE_TYPE_SPARE;
+ return (B_TRUE);
+ }
+ }
+ }
+ /*
+ * On a devid match, grab the vdev guid and expansion time, if any.
+ */
+ if (gsp->gs_devid != NULL &&
+ (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) &&
+ (strcmp(gsp->gs_devid, path) == 0)) {
+ (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
+ &gsp->gs_vdev_guid);
+ (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
+ &gsp->gs_vdev_expandtime);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+static int
+zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)
+{
+ guid_search_t *gsp = arg;
+ nvlist_t *config, *nvl;
+
+ /*
+ * For each vdev in this pool, look for a match by devid
+ */
+ if ((config = zpool_get_config(zhp, NULL)) != NULL) {
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvl) == 0) {
+ (void) zfs_agent_iter_vdev(zhp, nvl, gsp);
+ }
+ }
+ /*
+ * if a match was found then grab the pool guid
+ */
+ if (gsp->gs_vdev_guid) {
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &gsp->gs_pool_guid);
+ }
+
+ zpool_close(zhp);
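+	/* a nonzero return value stops zpool_iter() once a match has been found */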
+ return (gsp->gs_vdev_guid != 0);
+}
+
+void
+zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
+{
+ agent_event_t *event;
+
+ if (subclass == NULL)
+ subclass = "";
+
+ event = malloc(sizeof (agent_event_t));
+ if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) {
+ if (event)
+ free(event);
+ return;
+ }
+
+ if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) {
+ class = EC_ZFS;
+ subclass = ESC_ZFS_VDEV_CHECK;
+ }
+
+	/*
+	 * On ZFS on Linux, we don't get the expected FM_RESOURCE_REMOVED
+	 * ereport from the vdev_disk layer after a hot unplug. Fortunately we
+	 * get an EC_DEV_REMOVE from our disk monitor and it is a suitable
+	 * proxy, so we remap it here for the benefit of the diagnosis engine.
+	 */
+ if ((strcmp(class, EC_DEV_REMOVE) == 0) &&
+ (strcmp(subclass, ESC_DISK) == 0) &&
+ (nvlist_exists(nvl, ZFS_EV_VDEV_GUID) ||
+ nvlist_exists(nvl, DEV_IDENTIFIER))) {
+ nvlist_t *payload = event->ae_nvl;
+ struct timeval tv;
+ int64_t tod[2];
+ uint64_t pool_guid = 0, vdev_guid = 0;
+ guid_search_t search = { 0 };
+ device_type_t devtype = DEVICE_TYPE_PRIMARY;
+
+ class = "resource.fs.zfs.removed";
+ subclass = "";
+
+ (void) nvlist_add_string(payload, FM_CLASS, class);
+ (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
+ (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);
+
+ (void) gettimeofday(&tv, NULL);
+ tod[0] = tv.tv_sec;
+ tod[1] = tv.tv_usec;
+ (void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2);
+
+ /*
+		 * For multipath, spare, and l2arc devices, ZFS_EV_VDEV_GUID or
+		 * ZFS_EV_POOL_GUID may be missing, so look them up here.
+ */
+ (void) nvlist_lookup_string(nvl, DEV_IDENTIFIER,
+ &search.gs_devid);
+ (void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
+ pool_guid = search.gs_pool_guid;
+ vdev_guid = search.gs_vdev_guid;
+ devtype = search.gs_vdev_type;
+
+ /*
+ * We want to avoid reporting "remove" events coming from
+ * libudev for VDEVs which were expanded recently (10s) and
+ * avoid activating spares in response to partitions being
+ * deleted and created in rapid succession.
+ */
+ if (search.gs_vdev_expandtime != 0 &&
+ search.gs_vdev_expandtime + 10 > tv.tv_sec) {
+ zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' "
+ "for recently expanded device '%s'", EC_DEV_REMOVE,
+ search.gs_devid);
+ goto out;
+ }
+
+ (void) nvlist_add_uint64(payload,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid);
+ (void) nvlist_add_uint64(payload,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid);
+ switch (devtype) {
+ case DEVICE_TYPE_L2ARC:
+ (void) nvlist_add_string(payload,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ VDEV_TYPE_L2CACHE);
+ break;
+ case DEVICE_TYPE_SPARE:
+ (void) nvlist_add_string(payload,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_SPARE);
+ break;
+ case DEVICE_TYPE_PRIMARY:
+ (void) nvlist_add_string(payload,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_DISK);
+ break;
+ }
+
+ zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'",
+ EC_DEV_REMOVE, class);
+ }
+
+ (void) strlcpy(event->ae_class, class, sizeof (event->ae_class));
+ (void) strlcpy(event->ae_subclass, subclass,
+ sizeof (event->ae_subclass));
+
+ (void) pthread_mutex_lock(&agent_lock);
+ list_insert_tail(&agent_events, event);
+ (void) pthread_mutex_unlock(&agent_lock);
+
+out:
+ (void) pthread_cond_signal(&agent_cond);
+}
+
+static void
+zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl)
+{
+ /*
+ * The diagnosis engine subscribes to the following events.
+ * On illumos these subscriptions reside in:
+ * /usr/lib/fm/fmd/plugins/zfs-diagnosis.conf
+ */
+ if (strstr(class, "ereport.fs.zfs.") != NULL ||
+ strstr(class, "resource.fs.zfs.") != NULL ||
+ strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 ||
+ strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 ||
+ strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) {
+ fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class);
+ }
+
+ /*
+ * The retire agent subscribes to the following events.
+ * On illumos these subscriptions reside in:
+ * /usr/lib/fm/fmd/plugins/zfs-retire.conf
+ *
+	 * NOTE: fault events come directly from our diagnosis engine
+ * and will not pass through the zfs kernel module.
+ */
+ if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
+ strcmp(class, "resource.fs.zfs.removed") == 0 ||
+ strcmp(class, "resource.fs.zfs.statechange") == 0 ||
+ strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
+ fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class);
+ }
+
+ /*
+	 * The SLM module only consumes disk events and vdev check events.
+	 *
+	 * NOTE: disk events come directly from the disk monitor and will
+ * not pass through the zfs kernel module.
+ */
+ if (strstr(class, "EC_dev_") != NULL ||
+ strcmp(class, EC_ZFS) == 0) {
+ (void) zfs_slm_event(class, subclass, nvl);
+ }
+}
+
+/*
+ * Events are consumed and dispatched from this thread. An agent can also
+ * post an event, so the event list lock is not held while an agent is
+ * called. One event is consumed at a time.
+ */
+static void *
+zfs_agent_consumer_thread(void *arg)
+{
+ for (;;) {
+ agent_event_t *event;
+
+ (void) pthread_mutex_lock(&agent_lock);
+
+ /* wait for an event to show up */
+ while (!agent_exiting && list_is_empty(&agent_events))
+ (void) pthread_cond_wait(&agent_cond, &agent_lock);
+
+ if (agent_exiting) {
+ (void) pthread_mutex_unlock(&agent_lock);
+ zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: "
+ "exiting");
+ return (NULL);
+ }
+
+ if ((event = (list_head(&agent_events))) != NULL) {
+ list_remove(&agent_events, event);
+
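+			/* drop the list lock before dispatching so agents can post follow-up events */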
+ (void) pthread_mutex_unlock(&agent_lock);
+
+ /* dispatch to all event subscribers */
+ zfs_agent_dispatch(event->ae_class, event->ae_subclass,
+ event->ae_nvl);
+
+ nvlist_free(event->ae_nvl);
+ free(event);
+ continue;
+ }
+
+ (void) pthread_mutex_unlock(&agent_lock);
+ }
+
+ return (NULL);
+}
+
+void
+zfs_agent_init(libzfs_handle_t *zfs_hdl)
+{
+ fmd_hdl_t *hdl;
+
+ g_zfs_hdl = zfs_hdl;
+
+ if (zfs_slm_init() != 0)
+ zed_log_die("Failed to initialize zfs slm");
+ zed_log_msg(LOG_INFO, "Add Agent: init");
+
+ hdl = fmd_module_hdl("zfs-diagnosis");
+ _zfs_diagnosis_init(hdl);
+ if (!fmd_module_initialized(hdl))
+ zed_log_die("Failed to initialize zfs diagnosis");
+
+ hdl = fmd_module_hdl("zfs-retire");
+ _zfs_retire_init(hdl);
+ if (!fmd_module_initialized(hdl))
+ zed_log_die("Failed to initialize zfs retire");
+
+ list_create(&agent_events, sizeof (agent_event_t),
+ offsetof(struct agent_event, ae_node));
+
+ if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread,
+ NULL) != 0) {
+ list_destroy(&agent_events);
+ zed_log_die("Failed to initialize agents");
+ }
+}
+
+void
+zfs_agent_fini(void)
+{
+ fmd_hdl_t *hdl;
+ agent_event_t *event;
+
+ agent_exiting = 1;
+ (void) pthread_cond_signal(&agent_cond);
+
+	/* wait for the agent consumer thread to exit */
+ (void) pthread_join(g_agents_tid, NULL);
+
+ /* drain any pending events */
+ while ((event = (list_head(&agent_events))) != NULL) {
+ list_remove(&agent_events, event);
+ nvlist_free(event->ae_nvl);
+ free(event);
+ }
+
+ list_destroy(&agent_events);
+
+ if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) {
+ _zfs_retire_fini(hdl);
+ fmd_hdl_unregister(hdl);
+ }
+ if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) {
+ _zfs_diagnosis_fini(hdl);
+ fmd_hdl_unregister(hdl);
+ }
+
+ zed_log_msg(LOG_INFO, "Add Agent: fini");
+ zfs_slm_fini();
+
+ g_zfs_hdl = NULL;
+}
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.h b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.h
new file mode 100644
index 000000000000..d1a459139b1e
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef ZFS_AGENTS_H
+#define ZFS_AGENTS_H
+
+#include <libzfs.h>
+#include <libnvpair.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Agent abstraction presented to ZED
+ */
+extern void zfs_agent_init(libzfs_handle_t *);
+extern void zfs_agent_fini(void);
+extern void zfs_agent_post_event(const char *, const char *, nvlist_t *);
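+
+/*
+ * Illustrative call sequence from the ZED daemon (a sketch, not exact code):
+ *
+ *	zfs_agent_init(zfs_hdl);
+ *	...
+ *	zfs_agent_post_event(class, subclass, nvl);	(once per zevent)
+ *	...
+ *	zfs_agent_fini();
+ */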
+
+/*
+ * ZFS Sysevent Linkable Module (SLM)
+ */
+extern int zfs_slm_init(void);
+extern void zfs_slm_fini(void);
+extern void zfs_slm_event(const char *, const char *, nvlist_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !ZFS_AGENTS_H */
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c
new file mode 100644
index 000000000000..0b27f6702ee8
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_diagnosis.c
@@ -0,0 +1,981 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <strings.h>
+#include <libuutil.h>
+#include <libzfs.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/fs/zfs.h>
+
+#include "zfs_agents.h"
+#include "fmd_api.h"
+
+/*
+ * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This
+ * #define reserves enough space for two 64-bit hex values plus the length of
+ * the longest string.
+ */
+#define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum"))
+
+/*
+ * On-disk case structure. This must maintain backwards compatibility with
+ * previous versions of the DE. By default, any members appended to the end
+ * will be filled with zeros if they don't exist in a previous version.
+ */
+typedef struct zfs_case_data {
+ uint64_t zc_version;
+ uint64_t zc_ena;
+ uint64_t zc_pool_guid;
+ uint64_t zc_vdev_guid;
+ int zc_pool_state;
+ char zc_serd_checksum[MAX_SERDLEN];
+ char zc_serd_io[MAX_SERDLEN];
+ int zc_has_remove_timer;
+} zfs_case_data_t;
+
+/*
+ * Time-of-day
+ */
+typedef struct er_timeval {
+ uint64_t ertv_sec;
+ uint64_t ertv_nsec;
+} er_timeval_t;
+
+/*
+ * In-core case structure.
+ */
+typedef struct zfs_case {
+ boolean_t zc_present;
+ uint32_t zc_version;
+ zfs_case_data_t zc_data;
+ fmd_case_t *zc_case;
+ uu_list_node_t zc_node;
+ id_t zc_remove_timer;
+ char *zc_fru;
+ er_timeval_t zc_when;
+} zfs_case_t;
+
+#define CASE_DATA "data"
+#define CASE_FRU "fru"
+#define CASE_DATA_VERSION_INITIAL 1
+#define CASE_DATA_VERSION_SERD 2
+
+typedef struct zfs_de_stats {
+ fmd_stat_t old_drops;
+ fmd_stat_t dev_drops;
+ fmd_stat_t vdev_drops;
+ fmd_stat_t import_drops;
+ fmd_stat_t resource_drops;
+} zfs_de_stats_t;
+
+zfs_de_stats_t zfs_stats = {
+ { "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" },
+ { "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"},
+ { "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"},
+ { "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
+ { "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
+};
+
+static hrtime_t zfs_remove_timeout;
+
+uu_list_pool_t *zfs_case_pool;
+uu_list_t *zfs_cases;
+
+#define ZFS_MAKE_RSRC(type) \
+ FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
+#define ZFS_MAKE_EREPORT(type) \
+ FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
+
+/*
+ * Write out the persistent representation of an active case.
+ */
+static void
+zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp)
+{
+ zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
+}
+
+/*
+ * Read back the persistent representation of an active case.
+ */
+static zfs_case_t *
+zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+ zfs_case_t *zcp;
+
+ zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
+ zcp->zc_case = cp;
+
+ fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
+ sizeof (zcp->zc_data));
+
+ if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
+ fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
+ return (NULL);
+ }
+
+ /*
+ * fmd_buf_read() will have already zeroed out the remainder of the
+ * buffer, so we don't have to do anything special if the version
+ * doesn't include the SERD engine name.
+ */
+
+ if (zcp->zc_data.zc_has_remove_timer)
+ zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
+ NULL, zfs_remove_timeout);
+
+ uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool);
+ (void) uu_list_insert_before(zfs_cases, NULL, zcp);
+
+ fmd_case_setspecific(hdl, cp, zcp);
+
+ return (zcp);
+}
+
+/*
+ * The functions below implement case purging: mark the cases whose pool and
+ * vdev are still present on the system, then close any case that was not
+ * marked. zfs_mark_vdev() marks cases for a (pool, vdev) pair, recursing
+ * over all children, cache devices, and spares.
+ */
+static void
+zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded)
+{
+ uint64_t vdev_guid = 0;
+ uint_t c, children;
+ nvlist_t **child;
+ zfs_case_t *zcp;
+
+ (void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);
+
+ /*
+ * Mark any cases associated with this (pool, vdev) pair.
+ */
+ for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+ zcp = uu_list_next(zfs_cases, zcp)) {
+ if (zcp->zc_data.zc_pool_guid == pool_guid &&
+ zcp->zc_data.zc_vdev_guid == vdev_guid) {
+ zcp->zc_present = B_TRUE;
+ zcp->zc_when = *loaded;
+ }
+ }
+
+ /*
+ * Iterate over all children.
+ */
+ if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) == 0) {
+ for (c = 0; c < children; c++)
+ zfs_mark_vdev(pool_guid, child[c], loaded);
+ }
+
+ if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child,
+ &children) == 0) {
+ for (c = 0; c < children; c++)
+ zfs_mark_vdev(pool_guid, child[c], loaded);
+ }
+
+ if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child,
+ &children) == 0) {
+ for (c = 0; c < children; c++)
+ zfs_mark_vdev(pool_guid, child[c], loaded);
+ }
+}
+
+/*ARGSUSED*/
+static int
+zfs_mark_pool(zpool_handle_t *zhp, void *unused)
+{
+ zfs_case_t *zcp;
+ uint64_t pool_guid;
+ uint64_t *tod;
+ er_timeval_t loaded = { 0 };
+ nvlist_t *config, *vd;
+ uint_t nelem = 0;
+ int ret;
+
+ pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
+ /*
+ * Mark any cases associated with just this pool.
+ */
+ for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+ zcp = uu_list_next(zfs_cases, zcp)) {
+ if (zcp->zc_data.zc_pool_guid == pool_guid &&
+ zcp->zc_data.zc_vdev_guid == 0)
+ zcp->zc_present = B_TRUE;
+ }
+
+ if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+ zpool_close(zhp);
+ return (-1);
+ }
+
+ (void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
+ &tod, &nelem);
+ if (nelem == 2) {
+ loaded.ertv_sec = tod[0];
+ loaded.ertv_nsec = tod[1];
+ for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+ zcp = uu_list_next(zfs_cases, zcp)) {
+ if (zcp->zc_data.zc_pool_guid == pool_guid &&
+ zcp->zc_data.zc_vdev_guid == 0) {
+ zcp->zc_when = loaded;
+ }
+ }
+ }
+
+ ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
+ if (ret) {
+ zpool_close(zhp);
+ return (-1);
+ }
+
+ zfs_mark_vdev(pool_guid, vd, &loaded);
+
+ zpool_close(zhp);
+
+ return (0);
+}
+
+struct load_time_arg {
+ uint64_t lt_guid;
+ er_timeval_t *lt_time;
+ boolean_t lt_found;
+};
+
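+/* zpool_iter() callback: record the load time of the pool whose GUID matches lt_guid. */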
+static int
+zpool_find_load_time(zpool_handle_t *zhp, void *arg)
+{
+ struct load_time_arg *lta = arg;
+ uint64_t pool_guid;
+ uint64_t *tod;
+ nvlist_t *config;
+ uint_t nelem;
+
+ if (lta->lt_found) {
+ zpool_close(zhp);
+ return (0);
+ }
+
+ pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
+ if (pool_guid != lta->lt_guid) {
+ zpool_close(zhp);
+ return (0);
+ }
+
+ if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+ zpool_close(zhp);
+ return (-1);
+ }
+
+ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
+ &tod, &nelem) == 0 && nelem == 2) {
+ lta->lt_found = B_TRUE;
+ lta->lt_time->ertv_sec = tod[0];
+ lta->lt_time->ertv_nsec = tod[1];
+ }
+
+ zpool_close(zhp);
+
+ return (0);
+}
+
+static void
+zfs_purge_cases(fmd_hdl_t *hdl)
+{
+ zfs_case_t *zcp;
+ uu_list_walk_t *walk;
+ libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
+
+ /*
+ * There is no way to open a pool by GUID, or lookup a vdev by GUID. No
+ * matter what we do, we're going to have to stomach an O(vdevs * cases)
+ * algorithm. In reality, both quantities are likely so small that
+ * neither will matter. Given that iterating over pools is more
+ * expensive than iterating over the in-memory case list, we opt for a
+ * 'present' flag in each case that starts off cleared. We then iterate
+ * over all pools, marking those that are still present, and removing
+ * those that aren't found.
+ *
+ * Note that we could also construct an FMRI and rely on
+ * fmd_nvl_fmri_present(), but this would end up doing the same search.
+ */
+
+ /*
+ * Mark the cases as not present.
+ */
+ for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+ zcp = uu_list_next(zfs_cases, zcp))
+ zcp->zc_present = B_FALSE;
+
+ /*
+ * Iterate over all pools and mark the pools and vdevs found. If this
+ * fails (most probably because we're out of memory), then don't close
+ * any of the cases and we cannot be sure they are accurate.
+ */
+ if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
+ return;
+
+ /*
+ * Remove those cases which were not found.
+ */
+ walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
+ while ((zcp = uu_list_walk_next(walk)) != NULL) {
+ if (!zcp->zc_present)
+ fmd_case_close(hdl, zcp->zc_case);
+ }
+ uu_list_walk_end(walk);
+}
+
+/*
+ * Construct the name of a serd engine given the pool/vdev GUID and type (io or
+ * checksum).
+ */
+static void
+zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
+ const char *type)
+{
+ (void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s",
+ (long long unsigned int)pool_guid,
+ (long long unsigned int)vdev_guid, type);
+}
+
+/*
+ * Solve a given ZFS case. This first checks to make sure the diagnosis is
+ * still valid, as well as cleaning up any pending timer associated with the
+ * case.
+ */
+static void
+zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
+ boolean_t checkunusable)
+{
+ nvlist_t *detector, *fault;
+ boolean_t serialize;
+ nvlist_t *fru = NULL;
+ fmd_hdl_debug(hdl, "solving fault '%s'", faultname);
+
+ /*
+ * Construct the detector from the case data. The detector is in the
+ * ZFS scheme, and is either the pool or the vdev, depending on whether
+ * this is a vdev or pool fault.
+ */
+ detector = fmd_nvl_alloc(hdl, FMD_SLEEP);
+
+ (void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
+ (void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
+ (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
+ zcp->zc_data.zc_pool_guid);
+ if (zcp->zc_data.zc_vdev_guid != 0) {
+ (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
+ zcp->zc_data.zc_vdev_guid);
+ }
+
+ fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
+ fru, detector);
+ fmd_case_add_suspect(hdl, zcp->zc_case, fault);
+
+ nvlist_free(fru);
+
+ fmd_case_solve(hdl, zcp->zc_case);
+
+ serialize = B_FALSE;
+ if (zcp->zc_data.zc_has_remove_timer) {
+ fmd_timer_remove(hdl, zcp->zc_remove_timer);
+ zcp->zc_data.zc_has_remove_timer = 0;
+ serialize = B_TRUE;
+ }
+ if (serialize)
+ zfs_case_serialize(hdl, zcp);
+
+ nvlist_free(detector);
+}
+
+static boolean_t
+timeval_earlier(er_timeval_t *a, er_timeval_t *b)
+{
+ return (a->ertv_sec < b->ertv_sec ||
+ (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec));
+}
+
+/*ARGSUSED*/
+static void
+zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
+{
+ int64_t *tod;
+ uint_t nelem;
+
+ if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod,
+ &nelem) == 0 && nelem == 2) {
+ when->ertv_sec = tod[0];
+ when->ertv_nsec = tod[1];
+ } else {
+ when->ertv_sec = when->ertv_nsec = UINT64_MAX;
+ }
+}
+
+/*
+ * Main fmd entry point.
+ */
+/*ARGSUSED*/
+static void
+zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
+{
+ zfs_case_t *zcp, *dcp;
+ int32_t pool_state;
+ uint64_t ena, pool_guid, vdev_guid;
+ er_timeval_t pool_load;
+ er_timeval_t er_when;
+ nvlist_t *detector;
+ boolean_t pool_found = B_FALSE;
+ boolean_t isresource;
+ char *type;
+
+ /*
+	 * We subscribe to notifications for vdev or pool removal. In these
+	 * situations there may be open cases that no longer apply, so purge
+	 * any such cases.
+ */
+ if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) {
+ fmd_hdl_debug(hdl, "purging orphaned cases from %s",
+ strrchr(class, '.') + 1);
+ zfs_purge_cases(hdl);
+ zfs_stats.resource_drops.fmds_value.ui64++;
+ return;
+ }
+
+ isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");
+
+ if (isresource) {
+ /*
+ * For resources, we don't have a normal payload.
+ */
+ if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ &vdev_guid) != 0)
+ pool_state = SPA_LOAD_OPEN;
+ else
+ pool_state = SPA_LOAD_NONE;
+ detector = NULL;
+ } else {
+ (void) nvlist_lookup_nvlist(nvl,
+ FM_EREPORT_DETECTOR, &detector);
+ (void) nvlist_lookup_int32(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
+ }
+
+ /*
+ * We also ignore all ereports generated during an import of a pool,
+ * since the only possible fault (.pool) would result in import failure,
+ * and hence no persistent fault. Some day we may want to do something
+ * with these ereports, so we continue generating them internally.
+ */
+ if (pool_state == SPA_LOAD_IMPORT) {
+ zfs_stats.import_drops.fmds_value.ui64++;
+ fmd_hdl_debug(hdl, "ignoring '%s' during import", class);
+ return;
+ }
+
+ /*
+ * Device I/O errors are ignored during pool open.
+ */
+ if (pool_state == SPA_LOAD_OPEN &&
+ (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
+ fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
+ fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) {
+ fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class);
+ zfs_stats.dev_drops.fmds_value.ui64++;
+ return;
+ }
+
+ /*
+ * We ignore ereports for anything except disks and files.
+ */
+ if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ &type) == 0) {
+ if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
+ strcmp(type, VDEV_TYPE_FILE) != 0) {
+ zfs_stats.vdev_drops.fmds_value.ui64++;
+ return;
+ }
+ }
+
+ /*
+ * Determine if this ereport corresponds to an open case.
+ * Each vdev or pool can have a single case.
+ */
+ (void) nvlist_lookup_uint64(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
+ if (nvlist_lookup_uint64(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
+ vdev_guid = 0;
+ if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
+ ena = 0;
+
+ zfs_ereport_when(hdl, nvl, &er_when);
+
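+	/*
+	 * Look for an open case for this vdev; while scanning, also note the
+	 * load time recorded for any case that belongs to the same pool.
+	 */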
+ for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+ zcp = uu_list_next(zfs_cases, zcp)) {
+ if (zcp->zc_data.zc_pool_guid == pool_guid) {
+ pool_found = B_TRUE;
+ pool_load = zcp->zc_when;
+ }
+ if (zcp->zc_data.zc_vdev_guid == vdev_guid)
+ break;
+ }
+
+ /*
+ * Avoid falsely accusing a pool of being faulty. Do so by
+ * not replaying ereports that were generated prior to the
+ * current import. If the failure that generated them was
+ * transient because the device was actually removed but we
+ * didn't receive the normal asynchronous notification, we
+ * don't want to mark it as faulted and potentially panic. If
+ * there is still a problem we'd expect not to be able to
+ * import the pool, or that new ereports will be generated
+ * once the pool is used.
+ */
+ if (pool_found && timeval_earlier(&er_when, &pool_load)) {
+ fmd_hdl_debug(hdl, "ignoring pool %llx, "
+ "ereport time %lld.%lld, pool load time = %lld.%lld",
+ pool_guid, er_when.ertv_sec, er_when.ertv_nsec,
+ pool_load.ertv_sec, pool_load.ertv_nsec);
+ zfs_stats.old_drops.fmds_value.ui64++;
+ return;
+ }
+
+ if (!pool_found) {
+ /*
+ * Haven't yet seen this pool, but same situation
+ * may apply.
+ */
+ libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
+ struct load_time_arg la;
+
+ la.lt_guid = pool_guid;
+ la.lt_time = &pool_load;
+ la.lt_found = B_FALSE;
+
+ if (zhdl != NULL &&
+ zpool_iter(zhdl, zpool_find_load_time, &la) == 0 &&
+ la.lt_found == B_TRUE) {
+ pool_found = B_TRUE;
+
+ if (timeval_earlier(&er_when, &pool_load)) {
+ fmd_hdl_debug(hdl, "ignoring pool %llx, "
+ "ereport time %lld.%lld, "
+ "pool load time = %lld.%lld",
+ pool_guid, er_when.ertv_sec,
+ er_when.ertv_nsec, pool_load.ertv_sec,
+ pool_load.ertv_nsec);
+ zfs_stats.old_drops.fmds_value.ui64++;
+ return;
+ }
+ }
+ }
+
+ if (zcp == NULL) {
+ fmd_case_t *cs;
+ zfs_case_data_t data = { 0 };
+
+ /*
+ * If this is one of our 'fake' resource ereports, and there is
+ * no case open, simply discard it.
+ */
+ if (isresource) {
+ zfs_stats.resource_drops.fmds_value.ui64++;
+ fmd_hdl_debug(hdl, "discarding '%s for vdev %llu",
+ class, vdev_guid);
+ return;
+ }
+
+ /*
+ * Skip tracking some ereports
+ */
+ if (strcmp(class,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
+ strcmp(class,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
+ strcmp(class,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
+ zfs_stats.resource_drops.fmds_value.ui64++;
+ return;
+ }
+
+ /*
+ * Open a new case.
+ */
+ cs = fmd_case_open(hdl, NULL);
+
+ fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'",
+ vdev_guid, class);
+
+ /*
+ * Initialize the case buffer. To commonize code, we actually
+ * create the buffer with existing data, and then call
+ * zfs_case_unserialize() to instantiate the in-core structure.
+ */
+ fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t));
+
+ data.zc_version = CASE_DATA_VERSION_SERD;
+ data.zc_ena = ena;
+ data.zc_pool_guid = pool_guid;
+ data.zc_vdev_guid = vdev_guid;
+ data.zc_pool_state = (int)pool_state;
+
+ fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));
+
+ zcp = zfs_case_unserialize(hdl, cs);
+ assert(zcp != NULL);
+ if (pool_found)
+ zcp->zc_when = pool_load;
+ }
+
+ if (isresource) {
+ fmd_hdl_debug(hdl, "resource event '%s'", class);
+
+ if (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
+ /*
+ * The 'resource.fs.zfs.autoreplace' event indicates
+ * that the pool was loaded with the 'autoreplace'
+ * property set. In this case, any pending device
+ * failures should be ignored, as the asynchronous
+ * autoreplace handling will take care of them.
+ */
+ fmd_case_close(hdl, zcp->zc_case);
+ } else if (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
+ /*
+ * The 'resource.fs.zfs.removed' event indicates that
+ * device removal was detected, and the device was
+ * closed asynchronously. If this is the case, we
+ * assume that any recent I/O errors were due to the
+ * device removal, not any fault of the device itself.
+ * We reset the SERD engine, and cancel any pending
+ * timers.
+ */
+ if (zcp->zc_data.zc_has_remove_timer) {
+ fmd_timer_remove(hdl, zcp->zc_remove_timer);
+ zcp->zc_data.zc_has_remove_timer = 0;
+ zfs_case_serialize(hdl, zcp);
+ }
+ if (zcp->zc_data.zc_serd_io[0] != '\0')
+ fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io);
+ if (zcp->zc_data.zc_serd_checksum[0] != '\0')
+ fmd_serd_reset(hdl,
+ zcp->zc_data.zc_serd_checksum);
+ } else if (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
+ uint64_t state = 0;
+
+ if (zcp != NULL &&
+ nvlist_lookup_uint64(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 &&
+ state == VDEV_STATE_HEALTHY) {
+ fmd_hdl_debug(hdl, "closing case after a "
+ "device statechange to healthy");
+ fmd_case_close(hdl, zcp->zc_case);
+ }
+ }
+ zfs_stats.resource_drops.fmds_value.ui64++;
+ return;
+ }
+
+ /*
+ * Associate the ereport with this case.
+ */
+ fmd_case_add_ereport(hdl, zcp->zc_case, ep);
+
+ /*
+ * Don't do anything else if this case is already solved.
+ */
+ if (fmd_case_solved(hdl, zcp->zc_case))
+ return;
+
+ fmd_hdl_debug(hdl, "error event '%s'", class);
+
+ /*
+ * Determine if we should solve the case and generate a fault. We solve
+ * a case if:
+ *
+ * a. A pool failed to open (ereport.fs.zfs.pool)
+	 * b. A device failed to open (ereport.fs.zfs.vdev.*) while a pool
+ * was up and running.
+ *
+ * We may see a series of ereports associated with a pool open, all
+ * chained together by the same ENA. If the pool open succeeds, then
+ * we'll see no further ereports. To detect when a pool open has
+ * succeeded, we associate a timer with the event. When it expires, we
+ * close the case.
+ */
+ if (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
+ /*
+ * Pool level fault. Before solving the case, go through and
+ * close any open device cases that may be pending.
+ */
+ for (dcp = uu_list_first(zfs_cases); dcp != NULL;
+ dcp = uu_list_next(zfs_cases, dcp)) {
+ if (dcp->zc_data.zc_pool_guid ==
+ zcp->zc_data.zc_pool_guid &&
+ dcp->zc_data.zc_vdev_guid != 0)
+ fmd_case_close(hdl, dcp->zc_case);
+ }
+
+ zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE);
+ } else if (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
+ /*
+ * Pool level fault for reading the intent logs.
+ */
+ zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE);
+ } else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
+ /*
+ * Device fault.
+ */
+ zfs_case_solve(hdl, zcp, "fault.fs.zfs.device", B_TRUE);
+ } else if (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
+ fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
+ fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
+ fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
+ char *failmode = NULL;
+ boolean_t checkremove = B_FALSE;
+
+ /*
+ * If this is a checksum or I/O error, then toss it into the
+ * appropriate SERD engine and check to see if it has fired.
+ * Ideally, we want to do something more sophisticated,
+ * (persistent errors for a single data block, etc). For now,
+ * a single SERD engine is sufficient.
+ */
+ if (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
+ if (zcp->zc_data.zc_serd_io[0] == '\0') {
+ zfs_serd_name(zcp->zc_data.zc_serd_io,
+ pool_guid, vdev_guid, "io");
+ fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
+ fmd_prop_get_int32(hdl, "io_N"),
+ fmd_prop_get_int64(hdl, "io_T"));
+ zfs_case_serialize(hdl, zcp);
+ }
+ if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
+ checkremove = B_TRUE;
+ } else if (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
+ if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
+ zfs_serd_name(zcp->zc_data.zc_serd_checksum,
+ pool_guid, vdev_guid, "checksum");
+ fmd_serd_create(hdl,
+ zcp->zc_data.zc_serd_checksum,
+ fmd_prop_get_int32(hdl, "checksum_N"),
+ fmd_prop_get_int64(hdl, "checksum_T"));
+ zfs_case_serialize(hdl, zcp);
+ }
+ if (fmd_serd_record(hdl,
+ zcp->zc_data.zc_serd_checksum, ep)) {
+ zfs_case_solve(hdl, zcp,
+ "fault.fs.zfs.vdev.checksum", B_FALSE);
+ }
+ } else if (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
+ (nvlist_lookup_string(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
+ failmode != NULL) {
+ if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
+ strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
+ zfs_case_solve(hdl, zcp,
+ "fault.fs.zfs.io_failure_continue",
+ B_FALSE);
+ } else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
+ strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
+ zfs_case_solve(hdl, zcp,
+ "fault.fs.zfs.io_failure_wait", B_FALSE);
+ }
+ } else if (fmd_nvl_class_match(hdl, nvl,
+ ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
+#ifndef __linux__
+ /* This causes an unexpected fault diagnosis on linux */
+ checkremove = B_TRUE;
+#endif
+ }
+
+ /*
+ * Because I/O errors may be due to device removal, we postpone
+ * any diagnosis until we're sure that we aren't about to
+ * receive a 'resource.fs.zfs.removed' event.
+ */
+ if (checkremove) {
+ if (zcp->zc_data.zc_has_remove_timer)
+ fmd_timer_remove(hdl, zcp->zc_remove_timer);
+ zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL,
+ zfs_remove_timeout);
+ if (!zcp->zc_data.zc_has_remove_timer) {
+ zcp->zc_data.zc_has_remove_timer = 1;
+ zfs_case_serialize(hdl, zcp);
+ }
+ }
+ }
+}
+
+/*
+ * The timeout is fired when we diagnosed an I/O error, and it was not due to
+ * device removal (which would cause the timeout to be cancelled).
+ */
+/* ARGSUSED */
+static void
+zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
+{
+ zfs_case_t *zcp = data;
+
+ if (id == zcp->zc_remove_timer)
+ zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io", B_FALSE);
+}
+
+/*
+ * The specified case has been closed and any case-specific
+ * data structures should be deallocated.
+ */
+static void
+zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
+{
+ zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);
+
+ if (zcp->zc_data.zc_serd_checksum[0] != '\0')
+ fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
+ if (zcp->zc_data.zc_serd_io[0] != '\0')
+ fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
+ if (zcp->zc_data.zc_has_remove_timer)
+ fmd_timer_remove(hdl, zcp->zc_remove_timer);
+
+ uu_list_remove(zfs_cases, zcp);
+ uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
+ fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
+}
+
+/*
+ * We use the fmd gc entry point to look for old cases that no longer apply.
+ * This allows us to keep our set of case data small in a long running system.
+ */
+static void
+zfs_fm_gc(fmd_hdl_t *hdl)
+{
+ zfs_purge_cases(hdl);
+}
+
+static const fmd_hdl_ops_t fmd_ops = {
+ zfs_fm_recv, /* fmdo_recv */
+ zfs_fm_timeout, /* fmdo_timeout */
+ zfs_fm_close, /* fmdo_close */
+ NULL, /* fmdo_stats */
+ zfs_fm_gc, /* fmdo_gc */
+};
+
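+/*
+ * Default SERD parameters: diagnose a vdev fault after 10 I/O or 10 checksum
+ * errors within a 10 minute window, and wait 15 seconds for a possible
+ * device-removal notification before solving an I/O case.
+ */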
+static const fmd_prop_t fmd_props[] = {
+ { "checksum_N", FMD_TYPE_UINT32, "10" },
+ { "checksum_T", FMD_TYPE_TIME, "10min" },
+ { "io_N", FMD_TYPE_UINT32, "10" },
+ { "io_T", FMD_TYPE_TIME, "10min" },
+ { "remove_timeout", FMD_TYPE_TIME, "15sec" },
+ { NULL, 0, NULL }
+};
+
+static const fmd_hdl_info_t fmd_info = {
+ "ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
+};
+
+void
+_zfs_diagnosis_init(fmd_hdl_t *hdl)
+{
+ libzfs_handle_t *zhdl;
+
+ if ((zhdl = libzfs_init()) == NULL)
+ return;
+
+ if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
+ sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
+ NULL, UU_LIST_POOL_DEBUG)) == NULL) {
+ libzfs_fini(zhdl);
+ return;
+ }
+
+ if ((zfs_cases = uu_list_create(zfs_case_pool, NULL,
+ UU_LIST_DEBUG)) == NULL) {
+ uu_list_pool_destroy(zfs_case_pool);
+ libzfs_fini(zhdl);
+ return;
+ }
+
+ if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
+ uu_list_destroy(zfs_cases);
+ uu_list_pool_destroy(zfs_case_pool);
+ libzfs_fini(zhdl);
+ return;
+ }
+
+ fmd_hdl_setspecific(hdl, zhdl);
+
+ (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
+ sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
+
+ zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
+}
+
+void
+_zfs_diagnosis_fini(fmd_hdl_t *hdl)
+{
+ zfs_case_t *zcp;
+ uu_list_walk_t *walk;
+ libzfs_handle_t *zhdl;
+
+ /*
+ * Remove all active cases.
+ */
+ walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
+ while ((zcp = uu_list_walk_next(walk)) != NULL) {
+ fmd_hdl_debug(hdl, "removing case ena %llu",
+ (long long unsigned)zcp->zc_data.zc_ena);
+ uu_list_remove(zfs_cases, zcp);
+ uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
+ fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
+ }
+ uu_list_walk_end(walk);
+
+ uu_list_destroy(zfs_cases);
+ uu_list_pool_destroy(zfs_case_pool);
+
+ zhdl = fmd_hdl_getspecific(hdl);
+ libzfs_fini(zhdl);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c
new file mode 100644
index 000000000000..8d0a3b420086
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c
@@ -0,0 +1,956 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2016, 2017, Intel Corporation.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ */
+
+/*
+ * ZFS syseventd module.
+ *
+ * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c
+ *
+ * The purpose of this module is to identify when devices are added to the
+ * system, and appropriately online or replace the affected vdevs.
+ *
+ * When a device is added to the system:
+ *
+ * 1. Search for any vdevs whose devid matches that of the newly added
+ * device.
+ *
+ * 2. If no vdevs are found, then search for any vdevs whose udev path
+ * matches that of the new device.
+ *
+ * 3. If no vdevs match by either method, then ignore the event.
+ *
+ * 4. Attempt to online the device with a flag to indicate that it should
+ * be unspared when resilvering completes. If this succeeds, then the
+ * same device was inserted and we should continue normally.
+ *
+ * 5. If the pool does not have the 'autoreplace' property set, attempt to
+ * online the device again without the unspare flag, which will
+ * generate a FMA fault.
+ *
+ * 6. If the pool has the 'autoreplace' property set, and the matching vdev
+ * is a whole disk, then label the new disk and attempt a 'zpool
+ * replace'.
+ *
+ * The module responds to EC_DEV_ADD events. The special ESC_ZFS_VDEV_CHECK
+ * event indicates that a device failed to open during pool load, but the
+ * autoreplace property was set. In this case, we deferred the associated
+ * FMA fault until our module had a chance to process the autoreplace logic.
+ * If the device could not be replaced, then the second online attempt will
+ * trigger the FMA fault that we skipped earlier.
+ *
+ * ZFS on Linux porting notes:
+ * Linux udev provides a disk insert event for both the disk and the partition.
+ *
+ */
+
+#include <ctype.h>
+#include <fcntl.h>
+#include <libnvpair.h>
+#include <libzfs.h>
+#include <libzutil.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+#include <sys/list.h>
+#include <sys/sunddi.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
+#include <thread_pool.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <errno.h>
+#include "zfs_agents.h"
+#include "../zed_log.h"
+
+#define DEV_BYID_PATH "/dev/disk/by-id/"
+#define DEV_BYPATH_PATH "/dev/disk/by-path/"
+#define DEV_BYVDEV_PATH "/dev/disk/by-vdev/"
+
+typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);
+
+libzfs_handle_t *g_zfshdl;
+list_t g_pool_list; /* list of unavailable pools at initialization */
+list_t g_device_list; /* list of disks with asynchronous label request */
+tpool_t *g_tpool;
+boolean_t g_enumeration_done;
+pthread_t g_zfs_tid; /* zfs_enum_pools() thread */
+
+typedef struct unavailpool {
+ zpool_handle_t *uap_zhp;
+ list_node_t uap_node;
+} unavailpool_t;
+
+typedef struct pendingdev {
+ char pd_physpath[128];
+ list_node_t pd_node;
+} pendingdev_t;
+
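+/* Return the state of the pool's top-level (root) vdev. */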
+static int
+zfs_toplevel_state(zpool_handle_t *zhp)
+{
+ nvlist_t *nvroot;
+ vdev_stat_t *vs;
+ unsigned int c;
+
+ verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
+ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &c) == 0);
+ return (vs->vs_state);
+}
+
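+/*
+ * zpool_iter() callback: queue pools whose top-level vdev is below DEGRADED
+ * (i.e. unavailable at startup) onto the caller's list; other pools are
+ * closed immediately.
+ */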
+static int
+zfs_unavail_pool(zpool_handle_t *zhp, void *data)
+{
+ zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)",
+ zpool_get_name(zhp), (int)zfs_toplevel_state(zhp));
+
+ if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) {
+ unavailpool_t *uap;
+ uap = malloc(sizeof (unavailpool_t));
+ uap->uap_zhp = zhp;
+ list_insert_tail((list_t *)data, uap);
+ } else {
+ zpool_close(zhp);
+ }
+ return (0);
+}
+
+/*
+ * Two-stage replace on Linux: since we get disk notifications, we can wait
+ * for the partitioned disk slice to show up.
+ *
+ * The first stage tags the disk and initiates asynchronous partitioning,
+ * then returns. The second stage finds the tag and proceeds to the ZFS
+ * labeling/replace:
+ *
+ * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach
+ *
+ * 1. physical match with no filesystem, no partition:
+ *    tag it, partition the disk
+ *
+ * 2. physical match again: see the partition and the tag, proceed
+ */
+
+/*
+ * The device associated with the given vdev (either by devid or physical path)
+ * has been added to the system. If 'isdisk' is set, then we only attempt a
+ * replacement if it's a whole disk. This also implies that we should label the
+ * disk first.
+ *
+ * First, we attempt to online the device (making sure to undo any spare
+ * operation when finished). If this succeeds, then we're done. If it fails,
+ * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
+ * but that the label was not what we expected. If the 'autoreplace' property
+ * is enabled, then we relabel the disk (if specified), and attempt a 'zpool
+ * replace'. If the online is successful, but the new state is something else
+ * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
+ * race, and we should avoid attempting to relabel the disk.
+ *
+ * Also can arrive here from a ESC_ZFS_VDEV_CHECK event
+ */
+static void
+zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
+{
+ char *path;
+ vdev_state_t newstate;
+ nvlist_t *nvroot, *newvd;
+ pendingdev_t *device;
+ uint64_t wholedisk = 0ULL;
+ uint64_t offline = 0ULL;
+ uint64_t guid = 0ULL;
+ char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
+ char rawpath[PATH_MAX], fullpath[PATH_MAX];
+ char devpath[PATH_MAX];
+ int ret;
+ boolean_t is_dm = B_FALSE;
+ boolean_t is_sd = B_FALSE;
+ uint_t c;
+ vdev_stat_t *vs;
+
+ if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
+ return;
+
+ /* Skip healthy disks */
+ verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &c) == 0);
+ if (vs->vs_state == VDEV_STATE_HEALTHY) {
+ zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.",
+ __func__, path);
+ return;
+ }
+
+ (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
+ (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+ &enc_sysfs_path);
+ (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
+ (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
+ (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);
+
+ if (offline)
+ return; /* don't intervene if it was taken offline */
+
+ is_dm = zfs_dev_is_dm(path);
+ zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
+ " wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path,
+ physpath ? physpath : "NULL", wholedisk, is_dm ? "is" : "not",
+ (long long unsigned int)guid);
+
+ /*
+ * The VDEV guid is preferred for identification (gets passed in path)
+ */
+ if (guid != 0) {
+ (void) snprintf(fullpath, sizeof (fullpath), "%llu",
+ (long long unsigned int)guid);
+ } else {
+ /*
+ * otherwise use path sans partition suffix for whole disks
+ */
+ (void) strlcpy(fullpath, path, sizeof (fullpath));
+ if (wholedisk) {
+ char *spath = zfs_strip_partition(fullpath);
+ if (!spath) {
+ zed_log_msg(LOG_INFO, "%s: Can't alloc",
+ __func__);
+ return;
+ }
+
+ (void) strlcpy(fullpath, spath, sizeof (fullpath));
+ free(spath);
+ }
+ }
+
+ /*
+ * Attempt to online the device.
+ */
+ if (zpool_vdev_online(zhp, fullpath,
+ ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
+ (newstate == VDEV_STATE_HEALTHY ||
+ newstate == VDEV_STATE_DEGRADED)) {
+ zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev %s is %s",
+ fullpath, (newstate == VDEV_STATE_HEALTHY) ?
+ "HEALTHY" : "DEGRADED");
+ return;
+ }
+
+ /*
+ * vdev_id alias rule for using scsi_debug devices (FMA automated
+ * testing)
+ */
+ if (physpath != NULL && strcmp("scsidebug", physpath) == 0)
+ is_sd = B_TRUE;
+
+ /*
+ * If the pool doesn't have the autoreplace property set, then use
+ * vdev online to trigger a FMA fault by posting an ereport.
+ */
+ if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
+ !(wholedisk || is_dm) || (physpath == NULL)) {
+ (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
+ &newstate);
+ zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
+ "not a whole disk for '%s'", fullpath);
+ return;
+ }
+
+ /*
+ * Convert physical path into its current device node. Rawpath
+ * needs to be /dev/disk/by-vdev for a scsi_debug device since
+ * /dev/disk/by-path will not be present.
+ */
+ (void) snprintf(rawpath, sizeof (rawpath), "%s%s",
+ is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
+
+ if (realpath(rawpath, devpath) == NULL && !is_dm) {
+ zed_log_msg(LOG_INFO, " realpath: %s failed (%s)",
+ rawpath, strerror(errno));
+
+ (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
+ &newstate);
+
+ zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)",
+ fullpath, libzfs_error_description(g_zfshdl));
+ return;
+ }
+
+ /* Only autoreplace bad disks */
+ if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
+ (vs->vs_state != VDEV_STATE_FAULTED) &&
+ (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
+ return;
+ }
+
+ nvlist_lookup_string(vdev, "new_devid", &new_devid);
+
+ if (is_dm) {
+ /* Don't label device mapper or multipath disks. */
+ } else if (!labeled) {
+ /*
+ * we're auto-replacing a raw disk, so label it first
+ */
+ char *leafname;
+
+ /*
+ * If this is a request to label a whole disk, then attempt to
+ * write out the label. Before we can label the disk, we need
+		 * to map the physical string that was matched on to the
+		 * underlying device node.
+ *
+ * If any part of this process fails, then do a force online
+ * to trigger a ZFS fault for the device (and any hot spare
+ * replacement).
+ */
+ leafname = strrchr(devpath, '/') + 1;
+
+ /*
+ * If this is a request to label a whole disk, then attempt to
+ * write out the label.
+ */
+ if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) {
+ zed_log_msg(LOG_INFO, " zpool_label_disk: could not "
+ "label '%s' (%s)", leafname,
+ libzfs_error_description(g_zfshdl));
+
+ (void) zpool_vdev_online(zhp, fullpath,
+ ZFS_ONLINE_FORCEFAULT, &newstate);
+ return;
+ }
+
+ /*
+ * The disk labeling is asynchronous on Linux. Just record
+ * this label request and return as there will be another
+ * disk add event for the partition after the labeling is
+ * completed.
+ */
+		device = malloc(sizeof (pendingdev_t));
+		if (device == NULL) {
+			zed_log_msg(LOG_INFO, "%s: Can't alloc", __func__);
+			return;
+		}
+ (void) strlcpy(device->pd_physpath, physpath,
+ sizeof (device->pd_physpath));
+ list_insert_tail(&g_device_list, device);
+
+ zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)",
+ leafname, (u_longlong_t)guid);
+
+ return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */
+
+ } else /* labeled */ {
+ boolean_t found = B_FALSE;
+ /*
+ * match up with request above to label the disk
+ */
+ for (device = list_head(&g_device_list); device != NULL;
+ device = list_next(&g_device_list, device)) {
+ if (strcmp(physpath, device->pd_physpath) == 0) {
+ list_remove(&g_device_list, device);
+ free(device);
+ found = B_TRUE;
+ break;
+ }
+ zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s",
+ physpath, device->pd_physpath);
+ }
+ if (!found) {
+ /* unexpected partition slice encountered */
+ zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
+ fullpath);
+ (void) zpool_vdev_online(zhp, fullpath,
+ ZFS_ONLINE_FORCEFAULT, &newstate);
+ return;
+ }
+
+ zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)",
+ physpath, (u_longlong_t)guid);
+
+ (void) snprintf(devpath, sizeof (devpath), "%s%s",
+ DEV_BYID_PATH, new_devid);
+ }
+
+ /*
+ * Construct the root vdev to pass to zpool_vdev_attach(). While adding
+ * the entire vdev structure is harmless, we construct a reduced set of
+ * path/physpath/wholedisk to keep it simple.
+ */
+ if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) {
+ zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
+ return;
+ }
+ if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
+ zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
+ nvlist_free(nvroot);
+ return;
+ }
+
+ if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
+ nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
+ nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 ||
+ (physpath != NULL && nvlist_add_string(newvd,
+ ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
+ (enc_sysfs_path != NULL && nvlist_add_string(newvd,
+ ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) ||
+ nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
+ nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
+ nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
+ 1) != 0) {
+ zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs");
+ nvlist_free(newvd);
+ nvlist_free(nvroot);
+ return;
+ }
+
+ nvlist_free(newvd);
+
+ /*
+ * Wait for udev to verify the links exist, then auto-replace
+ * the leaf disk at same physical location.
+ */
+ if (zpool_label_disk_wait(path, 3000) != 0) {
+ zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
+ "disk %s is missing", path);
+ nvlist_free(nvroot);
+ return;
+ }
+
+ ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE);
+
+ zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)",
+ fullpath, path, (ret == 0) ? "no errors" :
+ libzfs_error_description(g_zfshdl));
+
+ nvlist_free(nvroot);
+}
+
+/*
+ * Utility functions to find a vdev matching given criteria.
+ */
+typedef struct dev_data {
+ const char *dd_compare;
+ const char *dd_prop;
+ zfs_process_func_t dd_func;
+ boolean_t dd_found;
+ boolean_t dd_islabeled;
+ uint64_t dd_pool_guid;
+ uint64_t dd_vdev_guid;
+ const char *dd_new_devid;
+} dev_data_t;
+
+static void
+zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
+{
+ dev_data_t *dp = data;
+ char *path = NULL;
+ uint_t c, children;
+ nvlist_t **child;
+
+ /*
+ * First iterate over any children.
+ */
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++)
+ zfs_iter_vdev(zhp, child[c], data);
+ }
+
+ /*
+ * Iterate over any spares and cache devices
+ */
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++)
+ zfs_iter_vdev(zhp, child[c], data);
+ }
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++)
+ zfs_iter_vdev(zhp, child[c], data);
+ }
+
+	/* once a vdev has been matched and processed there is nothing to do */
+ if (dp->dd_found)
+ return;
+
+	/*
+	 * Match by GUID if available; otherwise fall back to devid or
+	 * physical path.
+	 */
+ if (dp->dd_vdev_guid != 0) {
+ uint64_t guid;
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
+ &guid) != 0 || guid != dp->dd_vdev_guid) {
+ return;
+ }
+ zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched on %llu", guid);
+ dp->dd_found = B_TRUE;
+
+ } else if (dp->dd_compare != NULL) {
+ /*
+		 * NOTE: On Linux there is an event for each partition, so
+		 * unlike illumos, substring matching is not required to
+		 * accommodate the partition suffix. An exact match will be
+		 * present in the dp->dd_compare value.
+ */
+ if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
+ strcmp(dp->dd_compare, path) != 0)
+ return;
+
+ zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s",
+ dp->dd_prop, path);
+ dp->dd_found = B_TRUE;
+
+ /* pass the new devid for use by replacing code */
+ if (dp->dd_new_devid != NULL) {
+ (void) nvlist_add_string(nvl, "new_devid",
+ dp->dd_new_devid);
+ }
+ }
+
+ (dp->dd_func)(zhp, nvl, dp->dd_islabeled);
+}
+
+static void
+zfs_enable_ds(void *arg)
+{
+ unavailpool_t *pool = (unavailpool_t *)arg;
+
+ (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0);
+ zpool_close(pool->uap_zhp);
+ free(pool);
+}
+
+static int
+zfs_iter_pool(zpool_handle_t *zhp, void *data)
+{
+ nvlist_t *config, *nvl;
+ dev_data_t *dp = data;
+ uint64_t pool_guid;
+ unavailpool_t *pool;
+
+ zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)",
+ zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop);
+
+ /*
+ * For each vdev in this pool, look for a match to apply dd_func
+ */
+ if ((config = zpool_get_config(zhp, NULL)) != NULL) {
+ if (dp->dd_pool_guid == 0 ||
+ (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
+ (void) nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &nvl);
+ zfs_iter_vdev(zhp, nvl, data);
+ }
+ }
+
+ /*
+ * if this pool was originally unavailable,
+ * then enable its datasets asynchronously
+ */
+ if (g_enumeration_done) {
+ for (pool = list_head(&g_pool_list); pool != NULL;
+ pool = list_next(&g_pool_list, pool)) {
+
+ if (strcmp(zpool_get_name(zhp),
+ zpool_get_name(pool->uap_zhp)))
+ continue;
+ if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) {
+ list_remove(&g_pool_list, pool);
+ (void) tpool_dispatch(g_tpool, zfs_enable_ds,
+ pool);
+ break;
+ }
+ }
+ }
+
+ zpool_close(zhp);
+ return (dp->dd_found); /* cease iteration after a match */
+}
+
+/*
+ * Given a physical device location, iterate over all
+ * (pool, vdev) pairs which correspond to that location.
+ */
+static boolean_t
+devphys_iter(const char *physical, const char *devid, zfs_process_func_t func,
+ boolean_t is_slice)
+{
+ dev_data_t data = { 0 };
+
+ data.dd_compare = physical;
+ data.dd_func = func;
+ data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
+ data.dd_found = B_FALSE;
+ data.dd_islabeled = is_slice;
+ data.dd_new_devid = devid; /* used by auto replace code */
+
+ (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
+
+ return (data.dd_found);
+}
+
+/*
+ * Given a device identifier, find any vdevs with a matching devid.
+ * On Linux we can match the devid directly, which always corresponds to
+ * a whole disk.
+ */
+static boolean_t
+devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice)
+{
+ dev_data_t data = { 0 };
+
+ data.dd_compare = devid;
+ data.dd_func = func;
+ data.dd_prop = ZPOOL_CONFIG_DEVID;
+ data.dd_found = B_FALSE;
+ data.dd_islabeled = is_slice;
+ data.dd_new_devid = devid;
+
+ (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
+
+ return (data.dd_found);
+}
+
+/*
+ * Handle an EC_DEV_ADD.ESC_DISK event.
+ *
+ * illumos
+ * Expects: DEV_PHYS_PATH string in schema
+ * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
+ *
+ * path: '/dev/dsk/c0t1d0s0' (persistent)
+ * devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a'
+ * phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a'
+ *
+ * linux
+ * provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema
+ * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID
+ *
+ * path: '/dev/sdc1' (not persistent)
+ * devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1'
+ * phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0'
+ */
+static int
+zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
+{
+ char *devpath = NULL, *devid;
+ boolean_t is_slice;
+
+ /*
+ * Expecting a devid string and an optional physical location
+ */
+ if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0)
+ return (-1);
+
+ (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);
+
+ is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0);
+
+ zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)",
+ devid, devpath ? devpath : "NULL", is_slice);
+
+ /*
+ * Iterate over all vdevs looking for a match in the following order:
+ * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
+ * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
+ *
+ * For disks, we only want to pay attention to vdevs marked as whole
+	 * disks or as multipath devices.
+ */
+ if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL)
+ (void) devphys_iter(devpath, devid, zfs_process_add, is_slice);
+
+ return (0);
+}
+
+/*
+ * Called when we receive a VDEV_CHECK event, which indicates a device could not
+ * be opened during initial pool open, but the autoreplace property was set on
+ * the pool. In this case, we treat it as if it were an add event.
+ */
+static int
+zfs_deliver_check(nvlist_t *nvl)
+{
+ dev_data_t data = { 0 };
+
+ if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
+ &data.dd_pool_guid) != 0 ||
+ nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
+ &data.dd_vdev_guid) != 0 ||
+ data.dd_vdev_guid == 0)
+ return (0);
+
+ zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu",
+ data.dd_pool_guid, data.dd_vdev_guid);
+
+ data.dd_func = zfs_process_add;
+
+ (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
+
+ return (0);
+}
+
+static int
+zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
+{
+ char *devname = data;
+ boolean_t avail_spare, l2cache;
+ nvlist_t *tgt;
+ int error;
+
+ zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'",
+ devname, zpool_get_name(zhp));
+
+ if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
+ &avail_spare, &l2cache, NULL)) != NULL) {
+ char *path, fullpath[MAXPATHLEN];
+ uint64_t wholedisk;
+
+ error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path);
+ if (error) {
+ zpool_close(zhp);
+ return (0);
+ }
+
+ error = nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
+ &wholedisk);
+ if (error)
+ wholedisk = 0;
+
+ if (wholedisk) {
+ path = strrchr(path, '/');
+ if (path != NULL) {
+ path = zfs_strip_partition(path + 1);
+ if (path == NULL) {
+ zpool_close(zhp);
+ return (0);
+ }
+ } else {
+ zpool_close(zhp);
+ return (0);
+ }
+
+ (void) strlcpy(fullpath, path, sizeof (fullpath));
+ free(path);
+
+ /*
+ * We need to reopen the pool associated with this
+ * device so that the kernel can update the size of
+ * the expanded device. When expanding there is no
+ * need to restart the scrub from the beginning.
+ */
+ boolean_t scrub_restart = B_FALSE;
+ (void) zpool_reopen_one(zhp, &scrub_restart);
+ } else {
+ (void) strlcpy(fullpath, path, sizeof (fullpath));
+ }
+
+ if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
+ vdev_state_t newstate;
+
+ if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) {
+ error = zpool_vdev_online(zhp, fullpath, 0,
+ &newstate);
+ zed_log_msg(LOG_INFO, "zfsdle_vdev_online: "
+ "setting device '%s' to ONLINE state "
+ "in pool '%s': %d", fullpath,
+ zpool_get_name(zhp), error);
+ }
+ }
+ zpool_close(zhp);
+ return (1);
+ }
+ zpool_close(zhp);
+ return (0);
+}
+
+/*
+ * This function handles the ESC_DEV_DLE device change event. Use the
+ * provided vdev GUID when looking up a disk or partition. When the GUID
+ * is not present, assume the entire disk is owned by ZFS, append the
+ * expected -part1 partition suffix, and then look up by physical path.
+ */
+static int
+zfs_deliver_dle(nvlist_t *nvl)
+{
+ char *devname, name[MAXPATHLEN];
+ uint64_t guid;
+
+	if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) {
+		(void) snprintf(name, sizeof (name), "%llu",
+		    (u_longlong_t)guid);
+	} else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) {
+		(void) strlcpy(name, devname, sizeof (name));
+		zfs_append_partition(name, sizeof (name));
+	} else {
+		zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath");
+		return (-1);	/* nothing to search for */
+	}
+
+ if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) {
+ zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not "
+ "found", name);
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * syseventd daemon module event handler
+ *
+ * Handles syseventd daemon zfs device-related events:
+ *
+ * EC_DEV_ADD.ESC_DISK
+ * EC_DEV_STATUS.ESC_DEV_DLE
+ * EC_ZFS.ESC_ZFS_VDEV_CHECK
+ *
+ * Note: assumes only one thread active at a time (not thread safe)
+ */
+static int
+zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl)
+{
+ int ret;
+ boolean_t is_lofi = B_FALSE, is_check = B_FALSE, is_dle = B_FALSE;
+
+ if (strcmp(class, EC_DEV_ADD) == 0) {
+ /*
+ * We're mainly interested in disk additions, but we also listen
+ * for new loop devices, to allow for simplified testing.
+ */
+ if (strcmp(subclass, ESC_DISK) == 0)
+ is_lofi = B_FALSE;
+ else if (strcmp(subclass, ESC_LOFI) == 0)
+ is_lofi = B_TRUE;
+ else
+ return (0);
+
+ is_check = B_FALSE;
+ } else if (strcmp(class, EC_ZFS) == 0 &&
+ strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
+ /*
+ * This event signifies that a device failed to open
+ * during pool load, but the 'autoreplace' property was
+ * set, so we should pretend it's just been added.
+ */
+ is_check = B_TRUE;
+ } else if (strcmp(class, EC_DEV_STATUS) == 0 &&
+ strcmp(subclass, ESC_DEV_DLE) == 0) {
+ is_dle = B_TRUE;
+ } else {
+ return (0);
+ }
+
+ if (is_dle)
+ ret = zfs_deliver_dle(nvl);
+ else if (is_check)
+ ret = zfs_deliver_check(nvl);
+ else
+ ret = zfs_deliver_add(nvl, is_lofi);
+
+ return (ret);
+}
+
+/*ARGSUSED*/
+static void *
+zfs_enum_pools(void *arg)
+{
+ (void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list);
+ /*
+ * Linux - instead of using a thread pool, each list entry
+ * will spawn a thread when an unavailable pool transitions
+ * to available. zfs_slm_fini will wait for these threads.
+ */
+ g_enumeration_done = B_TRUE;
+ return (NULL);
+}
+
+/*
+ * Called from the zed daemon at startup.
+ *
+ * Messages are delivered from zevents or from the udev monitor.
+ *
+ * For now, each agent has its own libzfs instance.
+ */
+int
+zfs_slm_init()
+{
+ if ((g_zfshdl = libzfs_init()) == NULL)
+ return (-1);
+
+ /*
+ * collect a list of unavailable pools (asynchronously,
+ * since this can take a while)
+ */
+ list_create(&g_pool_list, sizeof (struct unavailpool),
+ offsetof(struct unavailpool, uap_node));
+
+ if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) {
+ list_destroy(&g_pool_list);
+ libzfs_fini(g_zfshdl);
+ return (-1);
+ }
+
+ list_create(&g_device_list, sizeof (struct pendingdev),
+ offsetof(struct pendingdev, pd_node));
+
+ return (0);
+}
+
+void
+zfs_slm_fini()
+{
+ unavailpool_t *pool;
+ pendingdev_t *device;
+
+ /* wait for zfs_enum_pools thread to complete */
+ (void) pthread_join(g_zfs_tid, NULL);
+ /* destroy the thread pool */
+ if (g_tpool != NULL) {
+ tpool_wait(g_tpool);
+ tpool_destroy(g_tpool);
+ }
+
+ while ((pool = (list_head(&g_pool_list))) != NULL) {
+ list_remove(&g_pool_list, pool);
+ zpool_close(pool->uap_zhp);
+ free(pool);
+ }
+ list_destroy(&g_pool_list);
+
+ while ((device = (list_head(&g_device_list))) != NULL) {
+ list_remove(&g_device_list, device);
+ free(device);
+ }
+ list_destroy(&g_device_list);
+
+ libzfs_fini(g_zfshdl);
+}
+
+void
+zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl)
+{
+ zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass);
+ (void) zfs_slm_deliver_event(class, subclass, nvl);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c
new file mode 100644
index 000000000000..9e95e20d5683
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c
@@ -0,0 +1,557 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
+ */
+
+/*
+ * The ZFS retire agent is responsible for managing hot spares across all pools.
+ * When we see a device fault or a device removal, we try to open the associated
+ * pool and look for any hot spares. We iterate over any available hot spares
+ * and attempt a 'zpool replace' for each one.
+ *
+ * For vdevs diagnosed as faulty, the agent is also responsible for proactively
+ * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors).
+ */
+
+#include <sys/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/fs/zfs.h>
+#include <libzfs.h>
+#include <string.h>
+
+#include "zfs_agents.h"
+#include "fmd_api.h"
+
+
+typedef struct zfs_retire_repaired {
+ struct zfs_retire_repaired *zrr_next;
+ uint64_t zrr_pool;
+ uint64_t zrr_vdev;
+} zfs_retire_repaired_t;
+
+typedef struct zfs_retire_data {
+ libzfs_handle_t *zrd_hdl;
+ zfs_retire_repaired_t *zrd_repaired;
+} zfs_retire_data_t;
+
+static void
+zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp)
+{
+ zfs_retire_repaired_t *zrp;
+
+ while ((zrp = zdp->zrd_repaired) != NULL) {
+ zdp->zrd_repaired = zrp->zrr_next;
+ fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t));
+ }
+}
+
+/*
+ * Find a pool with a matching GUID.
+ */
+typedef struct find_cbdata {
+ uint64_t cb_guid;
+ zpool_handle_t *cb_zhp;
+ nvlist_t *cb_vdev;
+} find_cbdata_t;
+
+static int
+find_pool(zpool_handle_t *zhp, void *data)
+{
+ find_cbdata_t *cbp = data;
+
+ if (cbp->cb_guid ==
+ zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) {
+ cbp->cb_zhp = zhp;
+ return (1);
+ }
+
+ zpool_close(zhp);
+ return (0);
+}
+
+/*
+ * Find a vdev within a tree with a matching GUID.
+ */
+static nvlist_t *
+find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid)
+{
+ uint64_t guid;
+ nvlist_t **child;
+ uint_t c, children;
+ nvlist_t *ret;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
+ guid == search_guid) {
+ fmd_hdl_debug(fmd_module_hdl("zfs-retire"),
+ "matched vdev %llu", guid);
+ return (nv);
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ return (NULL);
+
+ for (c = 0; c < children; c++) {
+ if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL)
+ return (ret);
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) != 0)
+ return (NULL);
+
+ for (c = 0; c < children; c++) {
+ if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL)
+ return (ret);
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+ &child, &children) != 0)
+ return (NULL);
+
+ for (c = 0; c < children; c++) {
+ if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL)
+ return (ret);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Given a (pool, vdev) GUID pair, find the matching pool and vdev.
+ */
+static zpool_handle_t *
+find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid,
+ nvlist_t **vdevp)
+{
+ find_cbdata_t cb;
+ zpool_handle_t *zhp;
+ nvlist_t *config, *nvroot;
+
+ /*
+ * Find the corresponding pool and make sure the vdev still exists.
+ */
+ cb.cb_guid = pool_guid;
+ if (zpool_iter(zhdl, find_pool, &cb) != 1)
+ return (NULL);
+
+ zhp = cb.cb_zhp;
+ config = zpool_get_config(zhp, NULL);
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) != 0) {
+ zpool_close(zhp);
+ return (NULL);
+ }
+
+ if (vdev_guid != 0) {
+ if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) {
+ zpool_close(zhp);
+ return (NULL);
+ }
+ }
+
+ return (zhp);
+}
+
+/*
+ * Given a vdev, attempt to replace it with every known spare until one
+ * succeeds or we run out of devices to try.
+ * Return whether we were successful or not in replacing the device.
+ */
+static boolean_t
+replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
+{
+ nvlist_t *config, *nvroot, *replacement;
+ nvlist_t **spares;
+ uint_t s, nspares;
+ char *dev_name;
+ zprop_source_t source;
+ int ashift;
+
+ config = zpool_get_config(zhp, NULL);
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) != 0)
+ return (B_FALSE);
+
+ /*
+ * Find out if there are any hot spares available in the pool.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) != 0)
+ return (B_FALSE);
+
+ /*
+	 * Look up the "ashift" pool property; we may need it for the replacement.
+ */
+ ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source);
+
+ replacement = fmd_nvl_alloc(hdl, FMD_SLEEP);
+
+ (void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT);
+
+ dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
+
+ /*
+	 * Try each hot spare in turn, stopping at the first one that
+	 * successfully replaces the device.
+ */
+ for (s = 0; s < nspares; s++) {
+ char *spare_name;
+
+ if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
+ &spare_name) != 0)
+ continue;
+
+ /* if set, add the "ashift" pool property to the spare nvlist */
+ if (source != ZPROP_SRC_DEFAULT)
+ (void) nvlist_add_uint64(spares[s],
+ ZPOOL_CONFIG_ASHIFT, ashift);
+
+ (void) nvlist_add_nvlist_array(replacement,
+ ZPOOL_CONFIG_CHILDREN, &spares[s], 1);
+
+ fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'",
+ dev_name, basename(spare_name));
+
+ if (zpool_vdev_attach(zhp, dev_name, spare_name,
+ replacement, B_TRUE, B_FALSE) == 0) {
+ free(dev_name);
+ nvlist_free(replacement);
+ return (B_TRUE);
+ }
+ }
+
+ free(dev_name);
+ nvlist_free(replacement);
+
+ return (B_FALSE);
+}
+
+/*
+ * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and the
+ * ASRU is now usable. ZFS has found the device to be present and
+ * functioning.
+ */
+/*ARGSUSED*/
+static void
+zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl)
+{
+ zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
+ zfs_retire_repaired_t *zrp;
+ uint64_t pool_guid, vdev_guid;
+ if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+ &pool_guid) != 0 || nvlist_lookup_uint64(nvl,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
+ return;
+
+ /*
+ * Before checking the state of the ASRU, go through and see if we've
+ * already made an attempt to repair this ASRU. This list is cleared
+ * whenever we receive any kind of list event, and is designed to
+ * prevent us from generating a feedback loop when we attempt repairs
+ * against a faulted pool. The problem is that checking the unusable
+ * state of the ASRU can involve opening the pool, which can post
+ * statechange events but otherwise leave the pool in the faulted
+ * state. This list allows us to detect when a statechange event is
+ * due to our own request.
+ */
+ for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) {
+ if (zrp->zrr_pool == pool_guid &&
+ zrp->zrr_vdev == vdev_guid)
+ return;
+ }
+
+ zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP);
+ zrp->zrr_next = zdp->zrd_repaired;
+ zrp->zrr_pool = pool_guid;
+ zrp->zrr_vdev = vdev_guid;
+ zdp->zrd_repaired = zrp;
+
+ fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu",
+ vdev_guid, pool_guid);
+}
+
+/*ARGSUSED*/
+static void
+zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
+ const char *class)
+{
+ uint64_t pool_guid, vdev_guid;
+ zpool_handle_t *zhp;
+ nvlist_t *resource, *fault;
+ nvlist_t **faults;
+ uint_t f, nfaults;
+ zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
+ libzfs_handle_t *zhdl = zdp->zrd_hdl;
+ boolean_t fault_device, degrade_device;
+ boolean_t is_repair;
+ char *scheme;
+ nvlist_t *vdev = NULL;
+ char *uuid;
+ int repair_done = 0;
+ boolean_t retire;
+ boolean_t is_disk;
+ vdev_aux_t aux;
+ uint64_t state = 0;
+
+ fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class);
+
+ nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state);
+
+ /*
+	 * If this is a resource notifying us of device removal, then simply
+	 * check for an available spare and continue, unless the device is an
+	 * l2arc vdev, in which case we just offline it.
+ */
+ if (strcmp(class, "resource.fs.zfs.removed") == 0 ||
+ (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
+ state == VDEV_STATE_REMOVED)) {
+ char *devtype;
+ char *devname;
+
+ if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+ &pool_guid) != 0 ||
+ nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ &vdev_guid) != 0)
+ return;
+
+ if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
+ &vdev)) == NULL)
+ return;
+
+ devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE);
+
+ /* Can't replace l2arc with a spare: offline the device */
+ if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ &devtype) == 0 && strcmp(devtype, VDEV_TYPE_L2CACHE) == 0) {
+ fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname);
+ zpool_vdev_offline(zhp, devname, B_TRUE);
+ } else if (!fmd_prop_get_int32(hdl, "spare_on_remove") ||
+ replace_with_spare(hdl, zhp, vdev) == B_FALSE) {
+ /* Could not handle with spare */
+ fmd_hdl_debug(hdl, "no spare for '%s'", devname);
+ }
+
+ free(devname);
+ zpool_close(zhp);
+ return;
+ }
+
+ if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0)
+ return;
+
+ /*
+	 * Note: on zfsonlinux, statechange events cover more than just
+	 * healthy transitions, so we need to confirm the actual state value.
+ */
+ if (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
+ state == VDEV_STATE_HEALTHY) {
+ zfs_vdev_repair(hdl, nvl);
+ return;
+ }
+ if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
+ zfs_vdev_repair(hdl, nvl);
+ return;
+ }
+
+ zfs_retire_clear_data(hdl, zdp);
+
+ if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0)
+ is_repair = B_TRUE;
+ else
+ is_repair = B_FALSE;
+
+ /*
+ * We subscribe to zfs faults as well as all repair events.
+ */
+ if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
+ &faults, &nfaults) != 0)
+ return;
+
+ for (f = 0; f < nfaults; f++) {
+ fault = faults[f];
+
+ fault_device = B_FALSE;
+ degrade_device = B_FALSE;
+ is_disk = B_FALSE;
+
+ if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE,
+ &retire) == 0 && retire == 0)
+ continue;
+
+ /*
+ * While we subscribe to fault.fs.zfs.*, we only take action
+ * for faults targeting a specific vdev (open failure or SERD
+ * failure). We also subscribe to fault.io.* events, so that
+ * faulty disks will be faulted in the ZFS configuration.
+ */
+ if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) {
+ fault_device = B_TRUE;
+ } else if (fmd_nvl_class_match(hdl, fault,
+ "fault.fs.zfs.vdev.checksum")) {
+ degrade_device = B_TRUE;
+ } else if (fmd_nvl_class_match(hdl, fault,
+ "fault.fs.zfs.device")) {
+ fault_device = B_FALSE;
+ } else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) {
+ is_disk = B_TRUE;
+ fault_device = B_TRUE;
+ } else {
+ continue;
+ }
+
+ if (is_disk) {
+ continue;
+ } else {
+ /*
+ * This is a ZFS fault. Lookup the resource, and
+ * attempt to find the matching vdev.
+ */
+ if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE,
+ &resource) != 0 ||
+ nvlist_lookup_string(resource, FM_FMRI_SCHEME,
+ &scheme) != 0)
+ continue;
+
+ if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0)
+ continue;
+
+ if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL,
+ &pool_guid) != 0)
+ continue;
+
+ if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV,
+ &vdev_guid) != 0) {
+ if (is_repair)
+ vdev_guid = 0;
+ else
+ continue;
+ }
+
+ if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid,
+ &vdev)) == NULL)
+ continue;
+
+ aux = VDEV_AUX_ERR_EXCEEDED;
+ }
+
+ if (vdev_guid == 0) {
+ /*
+ * For pool-level repair events, clear the entire pool.
+ */
+ fmd_hdl_debug(hdl, "zpool_clear of pool '%s'",
+ zpool_get_name(zhp));
+ (void) zpool_clear(zhp, NULL, NULL);
+ zpool_close(zhp);
+ continue;
+ }
+
+ /*
+ * If this is a repair event, then mark the vdev as repaired and
+ * continue.
+ */
+ if (is_repair) {
+ repair_done = 1;
+ fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu",
+ zpool_get_name(zhp), vdev_guid);
+ (void) zpool_vdev_clear(zhp, vdev_guid);
+ zpool_close(zhp);
+ continue;
+ }
+
+ /*
+ * Actively fault the device if needed.
+ */
+ if (fault_device)
+ (void) zpool_vdev_fault(zhp, vdev_guid, aux);
+ if (degrade_device)
+ (void) zpool_vdev_degrade(zhp, vdev_guid, aux);
+
+ if (fault_device || degrade_device)
+ fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'",
+ fault_device ? "fault" : "degrade", vdev_guid,
+ zpool_get_name(zhp));
+
+ /*
+ * Attempt to substitute a hot spare.
+ */
+ (void) replace_with_spare(hdl, zhp, vdev);
+ zpool_close(zhp);
+ }
+
+ if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done &&
+ nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0)
+ fmd_case_uuresolved(hdl, uuid);
+}
+
+static const fmd_hdl_ops_t fmd_ops = {
+ zfs_retire_recv, /* fmdo_recv */
+ NULL, /* fmdo_timeout */
+ NULL, /* fmdo_close */
+ NULL, /* fmdo_stats */
+ NULL, /* fmdo_gc */
+};
+
+static const fmd_prop_t fmd_props[] = {
+ { "spare_on_remove", FMD_TYPE_BOOL, "true" },
+ { NULL, 0, NULL }
+};
+
+static const fmd_hdl_info_t fmd_info = {
+ "ZFS Retire Agent", "1.0", &fmd_ops, fmd_props
+};
+
+void
+_zfs_retire_init(fmd_hdl_t *hdl)
+{
+ zfs_retire_data_t *zdp;
+ libzfs_handle_t *zhdl;
+
+ if ((zhdl = libzfs_init()) == NULL)
+ return;
+
+ if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
+ libzfs_fini(zhdl);
+ return;
+ }
+
+ zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP);
+ zdp->zrd_hdl = zhdl;
+
+ fmd_hdl_setspecific(hdl, zdp);
+}
+
+void
+_zfs_retire_fini(fmd_hdl_t *hdl)
+{
+ zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl);
+
+ if (zdp != NULL) {
+ zfs_retire_clear_data(hdl, zdp);
+ libzfs_fini(zdp->zrd_hdl);
+ fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t));
+ }
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed.c b/sys/contrib/openzfs/cmd/zed/zed.c
new file mode 100644
index 000000000000..0784e3834733
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.c
@@ -0,0 +1,306 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "zed.h"
+#include "zed_conf.h"
+#include "zed_event.h"
+#include "zed_file.h"
+#include "zed_log.h"
+
+static volatile sig_atomic_t _got_exit = 0;
+static volatile sig_atomic_t _got_hup = 0;
+
+/*
+ * Signal handler for SIGINT & SIGTERM.
+ */
+static void
+_exit_handler(int signum)
+{
+ _got_exit = 1;
+}
+
+/*
+ * Signal handler for SIGHUP.
+ */
+static void
+_hup_handler(int signum)
+{
+ _got_hup = 1;
+}
+
+/*
+ * Register signal handlers.
+ */
+static void
+_setup_sig_handlers(void)
+{
+ struct sigaction sa;
+
+ if (sigemptyset(&sa.sa_mask) < 0)
+ zed_log_die("Failed to initialize sigset");
+
+ sa.sa_flags = SA_RESTART;
+ sa.sa_handler = SIG_IGN;
+
+ if (sigaction(SIGPIPE, &sa, NULL) < 0)
+ zed_log_die("Failed to ignore SIGPIPE");
+
+ sa.sa_handler = _exit_handler;
+ if (sigaction(SIGINT, &sa, NULL) < 0)
+ zed_log_die("Failed to register SIGINT handler");
+
+ if (sigaction(SIGTERM, &sa, NULL) < 0)
+ zed_log_die("Failed to register SIGTERM handler");
+
+ sa.sa_handler = _hup_handler;
+ if (sigaction(SIGHUP, &sa, NULL) < 0)
+ zed_log_die("Failed to register SIGHUP handler");
+}
+
+/*
+ * Lock all current and future pages in the virtual memory address space.
+ * Access to locked pages will never be delayed by a page fault.
+ *
+ * EAGAIN is tested up to max_tries in case this is a transient error.
+ *
+ * Note that memory locks are not inherited by a child created via fork()
+ * and are automatically removed during an execve(). As such, this must
+ * be called after the daemon fork()s (when running in the background).
+ */
+static void
+_lock_memory(void)
+{
+#if HAVE_MLOCKALL
+ int i = 0;
+ const int max_tries = 10;
+
+ for (i = 0; i < max_tries; i++) {
+ if (mlockall(MCL_CURRENT | MCL_FUTURE) == 0) {
+ zed_log_msg(LOG_INFO, "Locked all pages in memory");
+ return;
+ }
+ if (errno != EAGAIN)
+ break;
+ }
+ zed_log_die("Failed to lock memory pages: %s", strerror(errno));
+
+#else /* HAVE_MLOCKALL */
+ zed_log_die("Failed to lock memory pages: mlockall() not supported");
+#endif /* HAVE_MLOCKALL */
+}
+
+/*
+ * Start daemonization of the process including the double fork().
+ *
+ * The parent process will block here until _finish_daemonize() is called
+ * (in the grandchild process), at which point the parent process will exit.
+ * This prevents the parent process from exiting until initialization is
+ * complete.
+ */
+static void
+_start_daemonize(void)
+{
+ pid_t pid;
+ struct sigaction sa;
+
+ /* Create pipe for communicating with child during daemonization. */
+ zed_log_pipe_open();
+
+ /* Background process and ensure child is not process group leader. */
+ pid = fork();
+ if (pid < 0) {
+ zed_log_die("Failed to create child process: %s",
+ strerror(errno));
+ } else if (pid > 0) {
+
+ /* Close writes since parent will only read from pipe. */
+ zed_log_pipe_close_writes();
+
+ /* Wait for notification that daemonization is complete. */
+ zed_log_pipe_wait();
+
+ zed_log_pipe_close_reads();
+ _exit(EXIT_SUCCESS);
+ }
+
+ /* Close reads since child will only write to pipe. */
+ zed_log_pipe_close_reads();
+
+ /* Create independent session and detach from terminal. */
+ if (setsid() < 0)
+ zed_log_die("Failed to create new session: %s",
+ strerror(errno));
+
+ /* Prevent child from terminating on HUP when session leader exits. */
+ if (sigemptyset(&sa.sa_mask) < 0)
+ zed_log_die("Failed to initialize sigset");
+
+ sa.sa_flags = 0;
+ sa.sa_handler = SIG_IGN;
+
+ if (sigaction(SIGHUP, &sa, NULL) < 0)
+ zed_log_die("Failed to ignore SIGHUP");
+
+ /* Ensure process cannot re-acquire terminal. */
+ pid = fork();
+ if (pid < 0) {
+ zed_log_die("Failed to create grandchild process: %s",
+ strerror(errno));
+ } else if (pid > 0) {
+ _exit(EXIT_SUCCESS);
+ }
+}
+
+/*
+ * Finish daemonization of the process by closing stdin/stdout/stderr.
+ *
+ * This must be called at the end of initialization after all external
+ * communication channels are established and accessible.
+ */
+static void
+_finish_daemonize(void)
+{
+ int devnull;
+
+ /* Preserve fd 0/1/2, but discard data to/from stdin/stdout/stderr. */
+ devnull = open("/dev/null", O_RDWR);
+ if (devnull < 0)
+ zed_log_die("Failed to open /dev/null: %s", strerror(errno));
+
+ if (dup2(devnull, STDIN_FILENO) < 0)
+ zed_log_die("Failed to dup /dev/null onto stdin: %s",
+ strerror(errno));
+
+ if (dup2(devnull, STDOUT_FILENO) < 0)
+ zed_log_die("Failed to dup /dev/null onto stdout: %s",
+ strerror(errno));
+
+ if (dup2(devnull, STDERR_FILENO) < 0)
+ zed_log_die("Failed to dup /dev/null onto stderr: %s",
+ strerror(errno));
+
+ if ((devnull > STDERR_FILENO) && (close(devnull) < 0))
+ zed_log_die("Failed to close /dev/null: %s", strerror(errno));
+
+ /* Notify parent that daemonization is complete. */
+ zed_log_pipe_close_writes();
+}
+
+/*
+ * ZFS Event Daemon (ZED).
+ */
+int
+main(int argc, char *argv[])
+{
+ struct zed_conf *zcp;
+ uint64_t saved_eid;
+ int64_t saved_etime[2];
+
+ zed_log_init(argv[0]);
+ zed_log_stderr_open(LOG_NOTICE);
+ zcp = zed_conf_create();
+ zed_conf_parse_opts(zcp, argc, argv);
+ if (zcp->do_verbose)
+ zed_log_stderr_open(LOG_INFO);
+
+ if (geteuid() != 0)
+ zed_log_die("Must be run as root");
+
+ zed_conf_parse_file(zcp);
+
+ zed_file_close_from(STDERR_FILENO + 1);
+
+ (void) umask(0);
+
+ if (chdir("/") < 0)
+ zed_log_die("Failed to change to root directory");
+
+ if (zed_conf_scan_dir(zcp) < 0)
+ exit(EXIT_FAILURE);
+
+ if (!zcp->do_foreground) {
+ _start_daemonize();
+ zed_log_syslog_open(LOG_DAEMON);
+ }
+ _setup_sig_handlers();
+
+ if (zcp->do_memlock)
+ _lock_memory();
+
+ if ((zed_conf_write_pid(zcp) < 0) && (!zcp->do_force))
+ exit(EXIT_FAILURE);
+
+ if (!zcp->do_foreground)
+ _finish_daemonize();
+
+ zed_log_msg(LOG_NOTICE,
+ "ZFS Event Daemon %s-%s (PID %d)",
+ ZFS_META_VERSION, ZFS_META_RELEASE, (int)getpid());
+
+ if (zed_conf_open_state(zcp) < 0)
+ exit(EXIT_FAILURE);
+
+ if (zed_conf_read_state(zcp, &saved_eid, saved_etime) < 0)
+ exit(EXIT_FAILURE);
+
+idle:
+ /*
+ * If -I is specified, attempt to open /dev/zfs repeatedly until
+ * successful.
+ */
+ do {
+ if (!zed_event_init(zcp))
+ break;
+ /* Wait for some time and try again. tunable? */
+ sleep(30);
+ } while (!_got_exit && zcp->do_idle);
+
+ if (_got_exit)
+ goto out;
+
+ zed_event_seek(zcp, saved_eid, saved_etime);
+
+ while (!_got_exit) {
+ int rv;
+ if (_got_hup) {
+ _got_hup = 0;
+ (void) zed_conf_scan_dir(zcp);
+ }
+ rv = zed_event_service(zcp);
+
+ /* ENODEV: When kernel module is unloaded (osx) */
+ if (rv == ENODEV)
+ break;
+ }
+
+ zed_log_msg(LOG_NOTICE, "Exiting");
+ zed_event_fini(zcp);
+
+ if (zcp->do_idle && !_got_exit)
+ goto idle;
+
+out:
+ zed_conf_destroy(zcp);
+ zed_log_fini();
+ exit(EXIT_SUCCESS);
+}
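For interactive debugging of the daemon and its ZEDLETs, it can help to run
zed without daemonizing. A minimal sketch, assuming the -F (foreground),
-v (verbose), and -I (idle) options match the zed(8) flags referenced in the
code above:

    zed -F -v        # stay in the foreground and log verbosely to stderr
    zed -F -v -I     # additionally keep retrying if /dev/zfs is unavailable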
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/.gitignore b/sys/contrib/openzfs/cmd/zed/zed.d/.gitignore
new file mode 100644
index 000000000000..46a00945aa7c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/.gitignore
@@ -0,0 +1 @@
+history_event-zfs-list-cacher.sh
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am b/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am
new file mode 100644
index 000000000000..8b2d0c200286
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/Makefile.am
@@ -0,0 +1,53 @@
+include $(top_srcdir)/config/Rules.am
+include $(top_srcdir)/config/Substfiles.am
+
+EXTRA_DIST += README
+
+zedconfdir = $(sysconfdir)/zfs/zed.d
+
+dist_zedconf_DATA = \
+ zed-functions.sh \
+ zed.rc
+
+zedexecdir = $(zfsexecdir)/zed.d
+
+dist_zedexec_SCRIPTS = \
+ all-debug.sh \
+ all-syslog.sh \
+ data-notify.sh \
+ generic-notify.sh \
+ resilver_finish-notify.sh \
+ scrub_finish-notify.sh \
+ statechange-led.sh \
+ statechange-notify.sh \
+ vdev_clear-led.sh \
+ vdev_attach-led.sh \
+ pool_import-led.sh \
+ resilver_finish-start-scrub.sh \
+ trim_finish-notify.sh
+
+nodist_zedexec_SCRIPTS = history_event-zfs-list-cacher.sh
+
+SUBSTFILES += $(nodist_zedexec_SCRIPTS)
+
+zedconfdefaults = \
+ all-syslog.sh \
+ data-notify.sh \
+ history_event-zfs-list-cacher.sh \
+ resilver_finish-notify.sh \
+ scrub_finish-notify.sh \
+ statechange-led.sh \
+ statechange-notify.sh \
+ vdev_clear-led.sh \
+ vdev_attach-led.sh \
+ pool_import-led.sh \
+ resilver_finish-start-scrub.sh
+
+install-data-hook:
+ $(MKDIR_P) "$(DESTDIR)$(zedconfdir)"
+ for f in $(zedconfdefaults); do \
+ test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \
+ -L "$(DESTDIR)$(zedconfdir)/$${f}" || \
+ ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \
+ done
+ chmod 0600 "$(DESTDIR)$(zedconfdir)/zed.rc"
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/README b/sys/contrib/openzfs/cmd/zed/zed.d/README
new file mode 100644
index 000000000000..7279b93704e2
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/README
@@ -0,0 +1,30 @@
+Shell scripts are the recommended choice for ZEDLETs that mostly call
+other utilities and do relatively little data manipulation.
+
+Shell scripts MUST work on both bash and dash.
+
+Shell scripts MUST run cleanly through ShellCheck:
+ http://www.shellcheck.net/
+
+General functions reside in "zed-functions.sh". Use them where applicable.
+
+Additional references that may be of use:
+
+ Google Shell Style Guide
+ https://github.com/google/styleguide/blob/gh-pages/shell.xml
+
+ Dash as /bin/sh
+ https://wiki.ubuntu.com/DashAsBinSh
+
+ Common shell script mistakes
+ http://www.pixelbeat.org/programming/shell_script_mistakes.html
+
+ Filenames and Pathnames in Shell: How to do it Correctly
+ http://www.dwheeler.com/essays/filenames-in-shell.html
+
+ Autoconf: Portable Shell Programming
+ https://www.gnu.org/software/autoconf/manual/autoconf.html#Portable-Shell
+
+Please BE CONSISTENT with the existing style, check for errors,
+minimize dependencies where possible, try to be portable,
+and comment anything non-obvious. Festina lente.
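A minimal ZEDLET skeleton following the conventions above; the filename
"hypothetical_event-notify.sh" is illustrative only, and the helpers come
from the zed.rc and zed-functions.sh files used by the other scripts in this
directory:

    #!/bin/sh
    # hypothetical_event-notify.sh -- minimal ZEDLET skeleton (illustrative)
    [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
    . "${ZED_ZEDLET_DIR}/zed-functions.sh"
    zed_log_msg "eid=${ZEVENT_EID}" "class=${ZEVENT_SUBCLASS}"
    exit 0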
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/all-debug.sh b/sys/contrib/openzfs/cmd/zed/zed.d/all-debug.sh
new file mode 100755
index 000000000000..14b39caacd9d
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/all-debug.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+#
+# Log all environment variables to ZED_DEBUG_LOG.
+#
+# This can be a useful aid when developing/debugging ZEDLETs since it shows the
+# environment variables defined for each zevent.
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+: "${ZED_DEBUG_LOG:="${TMPDIR:="/tmp"}/zed.debug.log"}"
+
+zed_exit_if_ignoring_this_event
+
+lockfile="$(basename -- "${ZED_DEBUG_LOG}").lock"
+
+umask 077
+zed_lock "${lockfile}"
+exec >> "${ZED_DEBUG_LOG}"
+
+printenv | sort
+echo
+
+exec >&-
+zed_unlock "${lockfile}"
+exit 0
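A short usage sketch for the ZEDLET above: point ZED_DEBUG_LOG at a
convenient location in zed.rc (the script's default is
"${TMPDIR:-/tmp}/zed.debug.log") and tail it while zevents arrive; the
zed.rc path is whichever zedconfdir was configured above:

    # in zed.rc
    ZED_DEBUG_LOG="/tmp/zed.debug.log"

    # then watch the per-event environment dumps
    tail -f /tmp/zed.debug.log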
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh b/sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh
new file mode 100755
index 000000000000..cb9286500136
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+#
+# Log the zevent via syslog.
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+zed_exit_if_ignoring_this_event
+
+zed_log_msg "eid=${ZEVENT_EID}" "class=${ZEVENT_SUBCLASS}" \
+ "${ZEVENT_POOL_GUID:+"pool_guid=${ZEVENT_POOL_GUID}"}" \
+ "${ZEVENT_VDEV_PATH:+"vdev_path=${ZEVENT_VDEV_PATH}"}" \
+ "${ZEVENT_VDEV_STATE_STR:+"vdev_state=${ZEVENT_VDEV_STATE_STR}"}"
+exit 0
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/data-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/data-notify.sh
new file mode 100755
index 000000000000..639b459bdd3b
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/data-notify.sh
@@ -0,0 +1,43 @@
+#!/bin/sh
+#
+# Send notification in response to a DATA error.
+#
+# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given
+# class/pool/[vdev] combination. This protects against spamming the recipient
+# should multiple events occur together in time for the same pool/[vdev].
+#
+# Exit codes:
+# 0: notification sent
+# 1: notification failed
+# 2: notification not configured
+# 3: notification suppressed
+# 9: internal error
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ -n "${ZEVENT_POOL}" ] || exit 9
+[ -n "${ZEVENT_SUBCLASS}" ] || exit 9
+[ -n "${ZED_NOTIFY_DATA}" ] || exit 3
+
+rate_limit_tag="${ZEVENT_POOL};${ZEVENT_VDEV_GUID:-0};${ZEVENT_SUBCLASS};notify"
+zed_rate_limit "${rate_limit_tag}" || exit 3
+
+umask 077
+note_subject="ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on $(hostname)"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+ echo "ZFS has detected a data error:"
+ echo
+ echo " eid: ${ZEVENT_EID}"
+ echo " class: ${ZEVENT_SUBCLASS}"
+ echo " host: $(hostname)"
+ echo " time: ${ZEVENT_TIME_STRING}"
+ echo " error: ${ZEVENT_ZIO_ERR}"
+ echo " objid: ${ZEVENT_ZIO_OBJSET}:${ZEVENT_ZIO_OBJECT}"
+ echo " pool: ${ZEVENT_POOL}"
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+rm -f "${note_pathname}"
+exit "${rv}"
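As the exit-code comment above notes, this ZEDLET suppresses itself (exit 3)
unless data notifications are enabled. A minimal sketch of enabling it,
assuming ZED_NOTIFY_DATA is set in the same zed.rc sourced at the top of the
script (any non-empty value passes the check):

    # in zed.rc
    ZED_NOTIFY_DATA=1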
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/generic-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/generic-notify.sh
new file mode 100755
index 000000000000..e438031a088a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/generic-notify.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+#
+# Send notification in response to a given zevent.
+#
+# This is a generic script that can be symlinked to a file in the
+# enabled-zedlets directory to have a notification sent when a particular
+# class of zevents occurs. The symlink filename must begin with the zevent
+# (sub)class string (e.g., "probe_failure-notify.sh" for the "probe_failure"
+# subclass). Refer to the zed(8) manpage for details.
+#
+# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given
+# class/pool combination. This protects against spamming the recipient
+# should multiple events occur together in time for the same pool.
+#
+# Exit codes:
+# 0: notification sent
+# 1: notification failed
+# 2: notification not configured
+# 3: notification suppressed
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+# Rate-limit the notification based in part on the filename.
+#
+rate_limit_tag="${ZEVENT_POOL};${ZEVENT_SUBCLASS};$(basename -- "$0")"
+rate_limit_interval="${ZED_NOTIFY_INTERVAL_SECS}"
+zed_rate_limit "${rate_limit_tag}" "${rate_limit_interval}" || exit 3
+
+umask 077
+pool_str="${ZEVENT_POOL:+" for ${ZEVENT_POOL}"}"
+host_str=" on $(hostname)"
+note_subject="ZFS ${ZEVENT_SUBCLASS} event${pool_str}${host_str}"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+ echo "ZFS has posted the following event:"
+ echo
+ echo " eid: ${ZEVENT_EID}"
+ echo " class: ${ZEVENT_SUBCLASS}"
+ echo " host: $(hostname)"
+ echo " time: ${ZEVENT_TIME_STRING}"
+
+ [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}"
+ [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}"
+ [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}"
+
+ [ -n "${ZEVENT_POOL}" ] && [ -x "${ZPOOL}" ] \
+ && "${ZPOOL}" status "${ZEVENT_POOL}"
+
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+rm -f "${note_pathname}"
+exit "${rv}"
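The header comment above describes the symlink convention; a hedged example
of wiring this script up for the "probe_failure" subclass it mentions. The
directories are illustrative: substitute the zedexecdir and zedconfdir
($(sysconfdir)/zfs/zed.d) configured in the Makefile.am above.

    ln -s /usr/libexec/zfs/zed.d/generic-notify.sh \
        /etc/zfs/zed.d/probe_failure-notify.sh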
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in b/sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in
new file mode 100755
index 000000000000..053b4414a768
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in
@@ -0,0 +1,85 @@
+#!/bin/sh
+#
+# Track changes to enumerated pools for use in early-boot
+set -ef
+
+FSLIST_DIR="@sysconfdir@/zfs/zfs-list.cache"
+FSLIST_TMP="@runstatedir@/zfs-list.cache.new"
+FSLIST="${FSLIST_DIR}/${ZEVENT_POOL}"
+
+# If the pool-specific cache file is not writable, abort
+[ -w "${FSLIST}" ] || exit 0
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+zed_exit_if_ignoring_this_event
+zed_check_cmd "${ZFS}" sort diff grep
+
+# If we are acting on a snapshot, we have nothing to do
+printf '%s' "${ZEVENT_HISTORY_DSNAME}" | grep '@' && exit 0
+
+# We obtain a lock on zfs-list to avoid any simultaneous writes.
+# If we run into trouble, log and drop the lock
+abort_alter() {
+ zed_log_msg "Error updating zfs-list.cache!"
+ zed_unlock zfs-list
+}
+
+finished() {
+ zed_unlock zfs-list
+ trap - EXIT
+ exit 0
+}
+
+case "${ZEVENT_HISTORY_INTERNAL_NAME}" in
+ create|"finish receiving"|import|destroy|rename)
+ ;;
+
+ export)
+ zed_lock zfs-list
+ trap abort_alter EXIT
+ echo > "${FSLIST}"
+ finished
+ ;;
+
+ set|inherit)
+ # Only act if one of the tracked properties is altered.
+ case "${ZEVENT_HISTORY_INTERNAL_STR%%=*}" in
+ canmount|mountpoint|atime|relatime|devices|exec|readonly| \
+ setuid|nbmand|encroot|keylocation|org.openzfs.systemd:requires| \
+ org.openzfs.systemd:requires-mounts-for| \
+ org.openzfs.systemd:before|org.openzfs.systemd:after| \
+ org.openzfs.systemd:wanted-by|org.openzfs.systemd:required-by| \
+ org.openzfs.systemd:nofail|org.openzfs.systemd:ignore \
+ ) ;;
+ *) exit 0 ;;
+ esac
+ ;;
+
+ *)
+ # Ignore all other events.
+ exit 0
+ ;;
+esac
+
+zed_lock zfs-list
+trap abort_alter EXIT
+
+PROPS="name,mountpoint,canmount,atime,relatime,devices,exec\
+,readonly,setuid,nbmand,encroot,keylocation\
+,org.openzfs.systemd:requires,org.openzfs.systemd:requires-mounts-for\
+,org.openzfs.systemd:before,org.openzfs.systemd:after\
+,org.openzfs.systemd:wanted-by,org.openzfs.systemd:required-by\
+,org.openzfs.systemd:nofail,org.openzfs.systemd:ignore"
+
+"${ZFS}" list -H -t filesystem -o $PROPS -r "${ZEVENT_POOL}" > "${FSLIST_TMP}"
+
+# Sort the output so that it is stable
+sort "${FSLIST_TMP}" -o "${FSLIST_TMP}"
+
+# Don't modify the file if it hasn't changed
+diff -q "${FSLIST_TMP}" "${FSLIST}" || mv "${FSLIST_TMP}" "${FSLIST}"
+rm -f "${FSLIST_TMP}"
+
+finished
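Because of the writability guard on "${FSLIST}" at the top, this ZEDLET is a
no-op until a writable per-pool cache file exists. A sketch of enabling it
for one pool; the pool name "tank" is illustrative, and the directory is the
@sysconfdir@/zfs/zfs-list.cache path substituted above (typically /etc/zfs):

    mkdir -p /etc/zfs/zfs-list.cache
    touch /etc/zfs/zfs-list.cache/tank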
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh
new file mode 120000
index 000000000000..7d7404398a4a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/pool_import-led.sh
@@ -0,0 +1 @@
+statechange-led.sh \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-notify.sh
new file mode 120000
index 000000000000..e4c56bc5f816
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-notify.sh
@@ -0,0 +1 @@
+scrub_finish-notify.sh \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-start-scrub.sh b/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-start-scrub.sh
new file mode 100755
index 000000000000..c7cfd1ddba80
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/resilver_finish-start-scrub.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+# resilver_finish-start-scrub.sh
+# Run a scrub after a resilver
+#
+# Exit codes:
+# 1: Internal error
+# 2: Script wasn't enabled in zed.rc
+# 3: Scrubs are automatically started for sequential resilvers
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ "${ZED_SCRUB_AFTER_RESILVER}" = "1" ] || exit 2
+[ "${ZEVENT_RESILVER_TYPE}" != "sequential" ] || exit 3
+[ -n "${ZEVENT_POOL}" ] || exit 1
+[ -n "${ZEVENT_SUBCLASS}" ] || exit 1
+zed_check_cmd "${ZPOOL}" || exit 1
+
+zed_log_msg "Starting scrub after resilver on ${ZEVENT_POOL}"
+"${ZPOOL}" scrub "${ZEVENT_POOL}"
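This ZEDLET exits with status 2 unless it has been enabled; per the check
above, that means setting ZED_SCRUB_AFTER_RESILVER in the zed.rc sourced at
the top of the script:

    # in zed.rc
    ZED_SCRUB_AFTER_RESILVER="1"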
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/scrub_finish-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/scrub_finish-notify.sh
new file mode 100755
index 000000000000..2145a100a3fa
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/scrub_finish-notify.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+#
+# Send notification in response to a RESILVER_FINISH or SCRUB_FINISH.
+#
+# By default, "zpool status" output will only be included for a scrub_finish
+# zevent if the pool is not healthy; to always include its output, set
+# ZED_NOTIFY_VERBOSE=1.
+#
+# Exit codes:
+# 0: notification sent
+# 1: notification failed
+# 2: notification not configured
+# 3: notification suppressed
+# 9: internal error
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ -n "${ZEVENT_POOL}" ] || exit 9
+[ -n "${ZEVENT_SUBCLASS}" ] || exit 9
+
+if [ "${ZEVENT_SUBCLASS}" = "resilver_finish" ]; then
+ action="resilver"
+elif [ "${ZEVENT_SUBCLASS}" = "scrub_finish" ]; then
+ action="scrub"
+else
+ zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\""
+ exit 9
+fi
+
+zed_check_cmd "${ZPOOL}" || exit 9
+
+# For scrub, suppress notification if the pool is healthy
+# and verbosity is not enabled.
+#
+if [ "${ZEVENT_SUBCLASS}" = "scrub_finish" ]; then
+ healthy="$("${ZPOOL}" status -x "${ZEVENT_POOL}" \
+ | grep "'${ZEVENT_POOL}' is healthy")"
+ [ -n "${healthy}" ] && [ "${ZED_NOTIFY_VERBOSE}" -eq 0 ] && exit 3
+fi
+
+umask 077
+note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+ echo "ZFS has finished a ${action}:"
+ echo
+ echo " eid: ${ZEVENT_EID}"
+ echo " class: ${ZEVENT_SUBCLASS}"
+ echo " host: $(hostname)"
+ echo " time: ${ZEVENT_TIME_STRING}"
+
+ "${ZPOOL}" status "${ZEVENT_POOL}"
+
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+rm -f "${note_pathname}"
+exit "${rv}"
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh
new file mode 100755
index 000000000000..e656e125d378
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-led.sh
@@ -0,0 +1,177 @@
+#!/bin/sh
+#
+# Turn off/on the VDEV's enclosure fault LEDs when the pool's state changes.
+#
+# Turn the VDEV's fault LED on if it becomes FAULTED, DEGRADED or UNAVAIL.
+# Turn the LED off when it's back ONLINE again.
+#
+# This script runs in two basic modes:
+#
+# 1. If $ZEVENT_VDEV_ENC_SYSFS_PATH and $ZEVENT_VDEV_STATE_STR are set, then
+# only set the LED for that particular VDEV. This is the case for statechange
+# events and some vdev_* events.
+#
+# 2. If those vars are not set, then check the state of all VDEVs in the pool
+# and set the LEDs accordingly. This is the case for pool_import events.
+#
+# Note that this script requires that your enclosure be supported by the
+# Linux SCSI enclosure services (ses) driver. The script will do nothing
+# if you have no enclosure, or if your enclosure isn't supported.
+#
+# Exit codes:
+# 0: enclosure led successfully set
+# 1: enclosure leds not available
+# 2: enclosure leds administratively disabled
+# 3: The led sysfs path passed from ZFS does not exist
+# 4: $ZPOOL not set
+# 5: awk is not installed
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+if [ ! -d /sys/class/enclosure ] ; then
+ exit 1
+fi
+
+if [ "${ZED_USE_ENCLOSURE_LEDS}" != "1" ] ; then
+ exit 2
+fi
+
+zed_check_cmd "$ZPOOL" || exit 4
+zed_check_cmd awk || exit 5
+
+# Global used in set_led debug print
+vdev=""
+
+# check_and_set_led (file, val)
+#
+# Read an enclosure sysfs file, and write it if it's not already set to 'val'
+#
+# Arguments
+# file: sysfs file to set (like /sys/class/enclosure/0:0:1:0/SLOT 10/fault)
+# val: value to set it to
+#
+# Return
+# 0 on success, 3 on missing sysfs path
+#
+check_and_set_led()
+{
+ file="$1"
+ val="$2"
+
+ if [ ! -e "$file" ] ; then
+ return 3
+ fi
+
+	# If another process is accessing the LED when we attempt to update it,
+	# the update will be lost, so retry until the LED actually changes or
+	# we time out.
+ for _ in $(seq 1 5); do
+ # We want to check the current state first, since writing to the
+ # 'fault' entry always causes a SES command, even if the
+ # current state is already what you want.
+ current=$(cat "${file}")
+
+ # On some enclosures if you write 1 to fault, and read it back,
+ # it will return 2. Treat all non-zero values as 1 for
+ # simplicity.
+ if [ "$current" != "0" ] ; then
+ current=1
+ fi
+
+ if [ "$current" != "$val" ] ; then
+ echo "$val" > "$file"
+ zed_log_msg "vdev $vdev set '$file' LED to $val"
+ else
+ break
+ fi
+ done
+}
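+
+# Example (illustrative; the enclosure slot path is hypothetical):
+#   check_and_set_led "/sys/class/enclosure/0:0:1:0/SLOT 10/fault" 1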
+
+state_to_val()
+{
+ state="$1"
+ if [ "$state" = "FAULTED" ] || [ "$state" = "DEGRADED" ] || \
+ [ "$state" = "UNAVAIL" ] ; then
+ echo 1
+ elif [ "$state" = "ONLINE" ] ; then
+ echo 0
+ fi
+}
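+
+# Example (illustrative):
+#   state_to_val "FAULTED"   # prints 1
+#   state_to_val "ONLINE"    # prints 0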
+
+# process_pool ([pool])
+#
+# Iterate through a pool (or pools) and set the VDEV's enclosure slot LEDs to
+# the VDEV's state.
+#
+# Arguments
+#   pool: Optional pool name. If not specified, iterate through all pools.
+#
+# Return
+# 0 on success, 3 on missing sysfs path
+#
+process_pool()
+{
+ pool="$1"
+ rc=0
+
+	# Look up all the current LED values and paths in parallel
+ #shellcheck disable=SC2016
+ cmd='echo led_token=$(cat "$VDEV_ENC_SYSFS_PATH/fault"),"$VDEV_ENC_SYSFS_PATH",'
+ out=$($ZPOOL status -vc "$cmd" "$pool" | grep 'led_token=')
+
+ #shellcheck disable=SC2034
+ echo "$out" | while read -r vdev state read write chksum therest; do
+ # Read out current LED value and path
+ tmp=$(echo "$therest" | sed 's/^.*led_token=//g')
+ vdev_enc_sysfs_path=$(echo "$tmp" | awk -F ',' '{print $2}')
+ current_val=$(echo "$tmp" | awk -F ',' '{print $1}')
+
+ if [ "$current_val" != "0" ] ; then
+ current_val=1
+ fi
+
+ if [ -z "$vdev_enc_sysfs_path" ] ; then
+ # Skip anything with no sysfs LED entries
+ continue
+ fi
+
+ if [ ! -e "$vdev_enc_sysfs_path/fault" ] ; then
+ #shellcheck disable=SC2030
+ rc=1
+			zed_log_msg "vdev $vdev '$vdev_enc_sysfs_path/fault' doesn't exist"
+ continue;
+ fi
+
+ val=$(state_to_val "$state")
+
+ if [ "$current_val" = "$val" ] ; then
+ # LED is already set correctly
+ continue;
+ fi
+
+ if ! check_and_set_led "$vdev_enc_sysfs_path/fault" "$val"; then
+ rc=1
+ fi
+
+ done
+
+ #shellcheck disable=SC2031
+ if [ "$rc" = "0" ] ; then
+ return 0
+ else
+ # We didn't see a sysfs entry that we wanted to set
+ return 3
+ fi
+}
+
+if [ -n "$ZEVENT_VDEV_ENC_SYSFS_PATH" ] && [ -n "$ZEVENT_VDEV_STATE_STR" ] ; then
+ # Got a statechange for an individual VDEV
+ val=$(state_to_val "$ZEVENT_VDEV_STATE_STR")
+ vdev=$(basename "$ZEVENT_VDEV_PATH")
+ check_and_set_led "$ZEVENT_VDEV_ENC_SYSFS_PATH/fault" "$val"
+else
+ # Process the entire pool
+ poolname=$(zed_guid_to_pool "$ZEVENT_POOL_GUID")
+ process_pool "$poolname"
+fi
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh
new file mode 100755
index 000000000000..f46080a03239
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/statechange-notify.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+# You may not use this file except in compliance with the license.
+#
+# CDDL HEADER END
+#
+
+#
+# Send notification in response to a fault induced statechange
+#
+# ZEVENT_SUBCLASS: 'statechange'
+# ZEVENT_VDEV_STATE_STR: 'DEGRADED', 'FAULTED' or 'REMOVED'
+#
+# Exit codes:
+# 0: notification sent
+# 1: notification failed
+# 2: notification not configured
+# 3: statechange not relevant
+# 4: statechange string missing (unexpected)
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ -n "${ZEVENT_VDEV_STATE_STR}" ] || exit 4
+
+if [ "${ZEVENT_VDEV_STATE_STR}" != "FAULTED" ] \
+ && [ "${ZEVENT_VDEV_STATE_STR}" != "DEGRADED" ] \
+ && [ "${ZEVENT_VDEV_STATE_STR}" != "REMOVED" ]; then
+ exit 3
+fi
+
+umask 077
+note_subject="ZFS device fault for pool ${ZEVENT_POOL_GUID} on $(hostname)"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+ if [ "${ZEVENT_VDEV_STATE_STR}" = "FAULTED" ] ; then
+ echo "The number of I/O errors associated with a ZFS device exceeded"
+ echo "acceptable levels. ZFS has marked the device as faulted."
+ elif [ "${ZEVENT_VDEV_STATE_STR}" = "DEGRADED" ] ; then
+ echo "The number of checksum errors associated with a ZFS device"
+ echo "exceeded acceptable levels. ZFS has marked the device as"
+ echo "degraded."
+ else
+ echo "ZFS has detected that a device was removed."
+ fi
+
+ echo
+ echo " impact: Fault tolerance of the pool may be compromised."
+ echo " eid: ${ZEVENT_EID}"
+ echo " class: ${ZEVENT_SUBCLASS}"
+ echo " state: ${ZEVENT_VDEV_STATE_STR}"
+ echo " host: $(hostname)"
+ echo " time: ${ZEVENT_TIME_STRING}"
+
+ [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}"
+ [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}"
+ [ -n "${ZEVENT_VDEV_PHYSPATH}" ] && echo " vphys: ${ZEVENT_VDEV_PHYSPATH}"
+ [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}"
+ [ -n "${ZEVENT_VDEV_DEVID}" ] && echo " devid: ${ZEVENT_VDEV_DEVID}"
+
+ echo " pool: ${ZEVENT_POOL_GUID}"
+
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+
+rm -f "${note_pathname}"
+exit "${rv}"
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/trim_finish-notify.sh b/sys/contrib/openzfs/cmd/zed/zed.d/trim_finish-notify.sh
new file mode 100755
index 000000000000..5075302997e3
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/trim_finish-notify.sh
@@ -0,0 +1,37 @@
+#!/bin/sh
+#
+# Send notification in response to a TRIM_FINISH. The event
+# will be received for each vdev in the pool which was trimmed.
+#
+# Exit codes:
+# 0: notification sent
+# 1: notification failed
+# 2: notification not configured
+# 9: internal error
+
+[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
+. "${ZED_ZEDLET_DIR}/zed-functions.sh"
+
+[ -n "${ZEVENT_POOL}" ] || exit 9
+[ -n "${ZEVENT_SUBCLASS}" ] || exit 9
+
+zed_check_cmd "${ZPOOL}" || exit 9
+
+umask 077
+note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)"
+note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$"
+{
+ echo "ZFS has finished a trim:"
+ echo
+ echo " eid: ${ZEVENT_EID}"
+ echo " class: ${ZEVENT_SUBCLASS}"
+ echo " host: $(hostname)"
+ echo " time: ${ZEVENT_TIME_STRING}"
+
+ "${ZPOOL}" status -t "${ZEVENT_POOL}"
+
+} > "${note_pathname}"
+
+zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+rm -f "${note_pathname}"
+exit "${rv}"
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh
new file mode 120000
index 000000000000..7d7404398a4a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_attach-led.sh
@@ -0,0 +1 @@
+statechange-led.sh \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh
new file mode 120000
index 000000000000..7d7404398a4a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/vdev_clear-led.sh
@@ -0,0 +1 @@
+statechange-led.sh \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh b/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh
new file mode 100755
index 000000000000..44a9b8d23303
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/zed-functions.sh
@@ -0,0 +1,538 @@
+#!/bin/sh
+# shellcheck disable=SC2039
+# zed-functions.sh
+#
+# ZED helper functions for use in ZEDLETs
+
+
+# Variable Defaults
+#
+: "${ZED_LOCKDIR:="/var/lock"}"
+: "${ZED_NOTIFY_INTERVAL_SECS:=3600}"
+: "${ZED_NOTIFY_VERBOSE:=0}"
+: "${ZED_RUNDIR:="/var/run"}"
+: "${ZED_SYSLOG_PRIORITY:="daemon.notice"}"
+: "${ZED_SYSLOG_TAG:="zed"}"
+
+ZED_FLOCK_FD=8
+
+
+# zed_check_cmd (cmd, ...)
+#
+# For each argument given, search PATH for the executable command [cmd].
+# Log a message if [cmd] is not found.
+#
+# Arguments
+# cmd: name of executable command for which to search
+#
+# Return
+# 0 if all commands are found in PATH and are executable
+# n for a count of the command executables that are not found
+#
+zed_check_cmd()
+{
+ local cmd
+ local rv=0
+
+ for cmd; do
+ if ! command -v "${cmd}" >/dev/null 2>&1; then
+ zed_log_err "\"${cmd}\" not installed"
+ rv=$((rv + 1))
+ fi
+ done
+ return "${rv}"
+}
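+
+# Typical usage (as in the notify zedlets and statechange-led.sh):
+#   zed_check_cmd "${ZPOOL}" || exit 9
+#   zed_check_cmd "awk" "curl" "sed" || return 1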
+
+
+# zed_log_msg (msg, ...)
+#
+# Write all argument strings to the system log.
+#
+# Globals
+# ZED_SYSLOG_PRIORITY
+# ZED_SYSLOG_TAG
+#
+# Return
+# nothing
+#
+zed_log_msg()
+{
+ logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "$@"
+}
+
+
+# zed_log_err (msg, ...)
+#
+# Write an error message to the system log. This message will contain the
+# script name, EID, and all argument strings.
+#
+# Globals
+# ZED_SYSLOG_PRIORITY
+# ZED_SYSLOG_TAG
+# ZEVENT_EID
+#
+# Return
+# nothing
+#
+zed_log_err()
+{
+ logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "error:" \
+ "$(basename -- "$0"):""${ZEVENT_EID:+" eid=${ZEVENT_EID}:"}" "$@"
+}
+
+
+# zed_lock (lockfile, [fd])
+#
+# Obtain an exclusive (write) lock on [lockfile]. If the lock cannot be
+# immediately acquired, wait until it becomes available.
+#
+# Every zed_lock() must be paired with a corresponding zed_unlock().
+#
+# By default, flock-style locks associate the lockfile with file descriptor 8.
+# The bash manpage warns that file descriptors >9 should be used with care as
+# they may conflict with file descriptors used internally by the shell. File
+# descriptor 9 is reserved for zed_rate_limit(). If concurrent locks are held
+# within the same process, they must use different file descriptors (preferably
+# decrementing from 8); otherwise, obtaining a new lock with a given file
+# descriptor will release the previous lock associated with that descriptor.
+#
+# Arguments
+# lockfile: pathname of the lock file; the lock will be stored in
+# ZED_LOCKDIR unless the pathname contains a "/".
+# fd: integer for the file descriptor used by flock (OPTIONAL unless holding
+# concurrent locks)
+#
+# Globals
+# ZED_FLOCK_FD
+# ZED_LOCKDIR
+#
+# Return
+# nothing
+#
+zed_lock()
+{
+ local lockfile="$1"
+ local fd="${2:-${ZED_FLOCK_FD}}"
+ local umask_bak
+ local err
+
+ [ -n "${lockfile}" ] || return
+ if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then
+ lockfile="${ZED_LOCKDIR}/${lockfile}"
+ fi
+
+ umask_bak="$(umask)"
+ umask 077
+
+ # Obtain a lock on the file bound to the given file descriptor.
+ #
+ eval "exec ${fd}> '${lockfile}'"
+ err="$(flock --exclusive "${fd}" 2>&1)"
+ # shellcheck disable=SC2181
+ if [ $? -ne 0 ]; then
+ zed_log_err "failed to lock \"${lockfile}\": ${err}"
+ fi
+
+ umask "${umask_bak}"
+}
+
+
+# zed_unlock (lockfile, [fd])
+#
+# Release the lock on [lockfile].
+#
+# Arguments
+# lockfile: pathname of the lock file
+# fd: integer for the file descriptor used by flock (must match the file
+# descriptor passed to the zed_lock function call)
+#
+# Globals
+# ZED_FLOCK_FD
+# ZED_LOCKDIR
+#
+# Return
+# nothing
+#
+zed_unlock()
+{
+ local lockfile="$1"
+ local fd="${2:-${ZED_FLOCK_FD}}"
+ local err
+
+ [ -n "${lockfile}" ] || return
+ if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then
+ lockfile="${ZED_LOCKDIR}/${lockfile}"
+ fi
+
+ # Release the lock and close the file descriptor.
+ err="$(flock --unlock "${fd}" 2>&1)"
+ # shellcheck disable=SC2181
+ if [ $? -ne 0 ]; then
+ zed_log_err "failed to unlock \"${lockfile}\": ${err}"
+ fi
+ eval "exec ${fd}>&-"
+}
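+
+# Example usage (illustrative; the lockfile name is hypothetical):
+#   zed_lock "my-zedlet.lock"
+#   ... critical section ...
+#   zed_unlock "my-zedlet.lock"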
+
+
+# zed_notify (subject, pathname)
+#
+# Send a notification via all available methods.
+#
+# Arguments
+# subject: notification subject
+# pathname: pathname containing the notification message (OPTIONAL)
+#
+# Return
+# 0: notification succeeded via at least one method
+# 1: notification failed
+# 2: no notification methods configured
+#
+zed_notify()
+{
+ local subject="$1"
+ local pathname="$2"
+ local num_success=0
+ local num_failure=0
+
+ zed_notify_email "${subject}" "${pathname}"; rv=$?
+ [ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
+ [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
+
+ zed_notify_pushbullet "${subject}" "${pathname}"; rv=$?
+ [ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
+ [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
+
+ zed_notify_slack_webhook "${subject}" "${pathname}"; rv=$?
+ [ "${rv}" -eq 0 ] && num_success=$((num_success + 1))
+ [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1))
+
+ [ "${num_success}" -gt 0 ] && return 0
+ [ "${num_failure}" -gt 0 ] && return 1
+ return 2
+}
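+
+# Typical usage (as in the *-notify.sh zedlets):
+#   zed_notify "${note_subject}" "${note_pathname}"; rv=$?
+#   rm -f "${note_pathname}"
+#   exit "${rv}"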
+
+
+# zed_notify_email (subject, pathname)
+#
+# Send a notification via email to the address specified by ZED_EMAIL_ADDR.
+#
+# Requires the mail executable to be installed in the standard PATH, or
+# ZED_EMAIL_PROG to be defined with the pathname of an executable capable of
+# reading a message body from stdin.
+#
+# Command-line options to the mail executable can be specified in
+# ZED_EMAIL_OPTS. This undergoes the following keyword substitutions:
+# - @ADDRESS@ is replaced with the space-delimited recipient email address(es)
+# - @SUBJECT@ is replaced with the notification subject
+#
+# Arguments
+# subject: notification subject
+# pathname: pathname containing the notification message (OPTIONAL)
+#
+# Globals
+# ZED_EMAIL_PROG
+# ZED_EMAIL_OPTS
+# ZED_EMAIL_ADDR
+#
+# Return
+# 0: notification sent
+# 1: notification failed
+# 2: not configured
+#
+zed_notify_email()
+{
+ local subject="$1"
+ local pathname="${2:-"/dev/null"}"
+
+ : "${ZED_EMAIL_PROG:="mail"}"
+ : "${ZED_EMAIL_OPTS:="-s '@SUBJECT@' @ADDRESS@"}"
+
+ # For backward compatibility with ZED_EMAIL.
+ if [ -n "${ZED_EMAIL}" ] && [ -z "${ZED_EMAIL_ADDR}" ]; then
+ ZED_EMAIL_ADDR="${ZED_EMAIL}"
+ fi
+ [ -n "${ZED_EMAIL_ADDR}" ] || return 2
+
+ zed_check_cmd "${ZED_EMAIL_PROG}" || return 1
+
+ [ -n "${subject}" ] || return 1
+ if [ ! -r "${pathname}" ]; then
+ zed_log_err \
+ "$(basename "${ZED_EMAIL_PROG}") cannot read \"${pathname}\""
+ return 1
+ fi
+
+ ZED_EMAIL_OPTS="$(echo "${ZED_EMAIL_OPTS}" \
+ | sed -e "s/@ADDRESS@/${ZED_EMAIL_ADDR}/g" \
+ -e "s/@SUBJECT@/${subject}/g")"
+
+ # shellcheck disable=SC2086
+ eval "${ZED_EMAIL_PROG}" ${ZED_EMAIL_OPTS} < "${pathname}" >/dev/null 2>&1
+ rv=$?
+ if [ "${rv}" -ne 0 ]; then
+ zed_log_err "$(basename "${ZED_EMAIL_PROG}") exit=${rv}"
+ return 1
+ fi
+ return 0
+}
+
+
+# zed_notify_pushbullet (subject, pathname)
+#
+# Send a notification via Pushbullet <https://www.pushbullet.com/>.
+# The access token (ZED_PUSHBULLET_ACCESS_TOKEN) identifies this client to the
+# Pushbullet server. The optional channel tag (ZED_PUSHBULLET_CHANNEL_TAG) is
+# for pushing to notification feeds that can be subscribed to; if a channel is
+# not defined, push notifications will instead be sent to all devices
+# associated with the account specified by the access token.
+#
+# Requires awk, curl, and sed executables to be installed in the standard PATH.
+#
+# References
+# https://docs.pushbullet.com/
+# https://www.pushbullet.com/security
+#
+# Arguments
+# subject: notification subject
+# pathname: pathname containing the notification message (OPTIONAL)
+#
+# Globals
+# ZED_PUSHBULLET_ACCESS_TOKEN
+# ZED_PUSHBULLET_CHANNEL_TAG
+#
+# Return
+# 0: notification sent
+# 1: notification failed
+# 2: not configured
+#
+zed_notify_pushbullet()
+{
+ local subject="$1"
+ local pathname="${2:-"/dev/null"}"
+ local msg_body
+ local msg_tag
+ local msg_json
+ local msg_out
+ local msg_err
+ local url="https://api.pushbullet.com/v2/pushes"
+
+ [ -n "${ZED_PUSHBULLET_ACCESS_TOKEN}" ] || return 2
+
+ [ -n "${subject}" ] || return 1
+ if [ ! -r "${pathname}" ]; then
+ zed_log_err "pushbullet cannot read \"${pathname}\""
+ return 1
+ fi
+
+ zed_check_cmd "awk" "curl" "sed" || return 1
+
+ # Escape the following characters in the message body for JSON:
+ # newline, backslash, double quote, horizontal tab, vertical tab,
+ # and carriage return.
+ #
+ msg_body="$(awk '{ ORS="\\n" } { gsub(/\\/, "\\\\"); gsub(/"/, "\\\"");
+ gsub(/\t/, "\\t"); gsub(/\f/, "\\f"); gsub(/\r/, "\\r"); print }' \
+ "${pathname}")"
+
+ # Push to a channel if one is configured.
+ #
+ [ -n "${ZED_PUSHBULLET_CHANNEL_TAG}" ] && msg_tag="$(printf \
+ '"channel_tag": "%s", ' "${ZED_PUSHBULLET_CHANNEL_TAG}")"
+
+ # Construct the JSON message for pushing a note.
+ #
+ msg_json="$(printf '{%s"type": "note", "title": "%s", "body": "%s"}' \
+ "${msg_tag}" "${subject}" "${msg_body}")"
+
+ # Send the POST request and check for errors.
+ #
+ msg_out="$(curl -u "${ZED_PUSHBULLET_ACCESS_TOKEN}:" -X POST "${url}" \
+ --header "Content-Type: application/json" --data-binary "${msg_json}" \
+ 2>/dev/null)"; rv=$?
+ if [ "${rv}" -ne 0 ]; then
+ zed_log_err "curl exit=${rv}"
+ return 1
+ fi
+ msg_err="$(echo "${msg_out}" \
+ | sed -n -e 's/.*"error" *:.*"message" *: *"\([^"]*\)".*/\1/p')"
+ if [ -n "${msg_err}" ]; then
+        zed_log_err "pushbullet \"${msg_err}\""
+ return 1
+ fi
+ return 0
+}
+
+
+# zed_notify_slack_webhook (subject, pathname)
+#
+# Notification via Slack Webhook <https://api.slack.com/incoming-webhooks>.
+# The Webhook URL (ZED_SLACK_WEBHOOK_URL) identifies this client to the
+# Slack channel.
+#
+# Requires awk, curl, and sed executables to be installed in the standard PATH.
+#
+# References
+# https://api.slack.com/incoming-webhooks
+#
+# Arguments
+# subject: notification subject
+# pathname: pathname containing the notification message (OPTIONAL)
+#
+# Globals
+# ZED_SLACK_WEBHOOK_URL
+#
+# Return
+# 0: notification sent
+# 1: notification failed
+# 2: not configured
+#
+zed_notify_slack_webhook()
+{
+ [ -n "${ZED_SLACK_WEBHOOK_URL}" ] || return 2
+
+ local subject="$1"
+ local pathname="${2:-"/dev/null"}"
+ local msg_body
+ local msg_tag
+ local msg_json
+ local msg_out
+ local msg_err
+ local url="${ZED_SLACK_WEBHOOK_URL}"
+
+ [ -n "${subject}" ] || return 1
+ if [ ! -r "${pathname}" ]; then
+ zed_log_err "slack webhook cannot read \"${pathname}\""
+ return 1
+ fi
+
+ zed_check_cmd "awk" "curl" "sed" || return 1
+
+ # Escape the following characters in the message body for JSON:
+ # newline, backslash, double quote, horizontal tab, vertical tab,
+ # and carriage return.
+ #
+ msg_body="$(awk '{ ORS="\\n" } { gsub(/\\/, "\\\\"); gsub(/"/, "\\\"");
+ gsub(/\t/, "\\t"); gsub(/\f/, "\\f"); gsub(/\r/, "\\r"); print }' \
+ "${pathname}")"
+
+ # Construct the JSON message for posting.
+ #
+ msg_json="$(printf '{"text": "*%s*\n%s"}' "${subject}" "${msg_body}" )"
+
+ # Send the POST request and check for errors.
+ #
+ msg_out="$(curl -X POST "${url}" \
+ --header "Content-Type: application/json" --data-binary "${msg_json}" \
+ 2>/dev/null)"; rv=$?
+ if [ "${rv}" -ne 0 ]; then
+ zed_log_err "curl exit=${rv}"
+ return 1
+ fi
+ msg_err="$(echo "${msg_out}" \
+ | sed -n -e 's/.*"error" *:.*"message" *: *"\([^"]*\)".*/\1/p')"
+ if [ -n "${msg_err}" ]; then
+        zed_log_err "slack webhook \"${msg_err}\""
+ return 1
+ fi
+ return 0
+}
+
+# zed_rate_limit (tag, [interval])
+#
+# Check whether an event of a given type [tag] has already occurred within the
+# last [interval] seconds.
+#
+# This function obtains a lock on the statefile using file descriptor 9.
+#
+# Arguments
+# tag: arbitrary string for grouping related events to rate-limit
+# interval: time interval in seconds (OPTIONAL)
+#
+# Globals
+# ZED_NOTIFY_INTERVAL_SECS
+# ZED_RUNDIR
+#
+# Return
+# 0 if the event should be processed
+# 1 if the event should be dropped
+#
+# State File Format
+# time;tag
+#
+zed_rate_limit()
+{
+ local tag="$1"
+ local interval="${2:-${ZED_NOTIFY_INTERVAL_SECS}}"
+ local lockfile="zed.zedlet.state.lock"
+ local lockfile_fd=9
+ local statefile="${ZED_RUNDIR}/zed.zedlet.state"
+ local time_now
+ local time_prev
+ local umask_bak
+ local rv=0
+
+ [ -n "${tag}" ] || return 0
+
+ zed_lock "${lockfile}" "${lockfile_fd}"
+ time_now="$(date +%s)"
+ time_prev="$(grep -E "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \
+ | tail -1 | cut -d\; -f1)"
+
+ if [ -n "${time_prev}" ] \
+ && [ "$((time_now - time_prev))" -lt "${interval}" ]; then
+ rv=1
+ else
+ umask_bak="$(umask)"
+ umask 077
+ grep -E -v "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \
+ > "${statefile}.$$"
+ echo "${time_now};${tag}" >> "${statefile}.$$"
+ mv -f "${statefile}.$$" "${statefile}"
+ umask "${umask_bak}"
+ fi
+
+ zed_unlock "${lockfile}" "${lockfile_fd}"
+ return "${rv}"
+}
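+
+# Example usage (illustrative; the tag string is arbitrary):
+#   zed_rate_limit "${ZEVENT_POOL};${ZEVENT_SUBCLASS};notify" || exit 3
+# This drops the event when a matching one was already processed within
+# the last ZED_NOTIFY_INTERVAL_SECS seconds.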
+
+
+# zed_guid_to_pool (guid)
+#
+# Convert a pool GUID into its pool name (like "tank").
+#
+# Arguments
+# guid: pool GUID (decimal or hex)
+#
+# Return
+# Pool name
+#
+zed_guid_to_pool()
+{
+ if [ -z "$1" ] ; then
+ return
+ fi
+
+ guid=$(printf "%llu" "$1")
+ if [ -n "$guid" ] ; then
+ $ZPOOL get -H -ovalue,name guid | awk '$1=='"$guid"' {print $2}'
+ fi
+}
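+
+# Typical usage (as in statechange-led.sh):
+#   poolname=$(zed_guid_to_pool "$ZEVENT_POOL_GUID")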
+
+# zed_exit_if_ignoring_this_event
+#
+# Exit the script if we should ignore this event, as determined by
+# $ZED_SYSLOG_SUBCLASS_INCLUDE and $ZED_SYSLOG_SUBCLASS_EXCLUDE in zed.rc.
+# This function assumes you've imported the normal zed variables.
+zed_exit_if_ignoring_this_event()
+{
+ if [ -n "${ZED_SYSLOG_SUBCLASS_INCLUDE}" ]; then
+ eval "case ${ZEVENT_SUBCLASS} in
+ ${ZED_SYSLOG_SUBCLASS_INCLUDE});;
+ *) exit 0;;
+ esac"
+ elif [ -n "${ZED_SYSLOG_SUBCLASS_EXCLUDE}" ]; then
+ eval "case ${ZEVENT_SUBCLASS} in
+ ${ZED_SYSLOG_SUBCLASS_EXCLUDE}) exit 0;;
+ *);;
+ esac"
+ fi
+}
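+
+# Typical usage (illustrative): call this near the top of a zedlet, after
+# sourcing zed.rc and zed-functions.sh:
+#   zed_exit_if_ignoring_this_event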
diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc b/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc
new file mode 100644
index 000000000000..1b220d28db20
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc
@@ -0,0 +1,122 @@
+##
+# zed.rc
+#
+# This file should be owned by root and have permissions 0600.
+##
+
+##
+# Absolute path to the debug output file.
+#
+#ZED_DEBUG_LOG="/tmp/zed.debug.log"
+
+##
+# Email address of the zpool administrator for receipt of notifications;
+# multiple addresses can be specified if they are delimited by whitespace.
+# Email will only be sent if ZED_EMAIL_ADDR is defined.
+# Disabled by default; uncomment to enable.
+#
+#ZED_EMAIL_ADDR="root"
+
+##
+# Name or path of executable responsible for sending notifications via email;
+# the mail program must be capable of reading a message body from stdin.
+# Email will only be sent if ZED_EMAIL_ADDR is defined.
+#
+#ZED_EMAIL_PROG="mail"
+
+##
+# Command-line options for ZED_EMAIL_PROG.
+# The string @ADDRESS@ will be replaced with the recipient email address(es).
+# The string @SUBJECT@ will be replaced with the notification subject;
+# this should be protected with quotes to prevent word-splitting.
+# Email will only be sent if ZED_EMAIL_ADDR is defined.
+#
+#ZED_EMAIL_OPTS="-s '@SUBJECT@' @ADDRESS@"
+
+##
+# Default directory for zed lock files.
+#
+#ZED_LOCKDIR="/var/lock"
+
+##
+# Minimum number of seconds between notifications for a similar event.
+#
+#ZED_NOTIFY_INTERVAL_SECS=3600
+
+##
+# Notification verbosity.
+# If set to 0, suppress notification if the pool is healthy.
+# If set to 1, send notification regardless of pool health.
+#
+#ZED_NOTIFY_VERBOSE=0
+
+##
+# Send notifications for 'ereport.fs.zfs.data' events.
+# Disabled by default; any non-empty value will enable the feature.
+#
+#ZED_NOTIFY_DATA=
+
+##
+# Pushbullet access token.
+# This grants full access to your account -- protect it accordingly!
+# <https://www.pushbullet.com/get-started>
+# <https://www.pushbullet.com/account>
+# Disabled by default; uncomment to enable.
+#
+#ZED_PUSHBULLET_ACCESS_TOKEN=""
+
+##
+# Pushbullet channel tag for push notification feeds that can be subscribed to.
+# <https://www.pushbullet.com/my-channel>
+# If not defined, push notifications will instead be sent to all devices
+# associated with the account specified by the access token.
+# Disabled by default; uncomment to enable.
+#
+#ZED_PUSHBULLET_CHANNEL_TAG=""
+
+##
+# Slack Webhook URL.
+# This allows posting to the given channel and includes an access token.
+# <https://api.slack.com/incoming-webhooks>
+# Disabled by default; uncomment to enable.
+#
+#ZED_SLACK_WEBHOOK_URL=""
+
+##
+# Default directory for zed state files.
+#
+#ZED_RUNDIR="/var/run"
+
+##
+# Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for
+# device mapper and multipath devices as well. Your enclosure must be
+# supported by the Linux SES driver for this to work.
+#
+ZED_USE_ENCLOSURE_LEDS=1
+
+##
+# Run a scrub after every resilver.
+# Disabled by default; 1 to enable and 0 to disable.
+#
+#ZED_SCRUB_AFTER_RESILVER=0
+
+##
+# The syslog priority (e.g., specified as a "facility.level" pair).
+#
+#ZED_SYSLOG_PRIORITY="daemon.notice"
+
+##
+# The syslog tag for marking zed events.
+#
+#ZED_SYSLOG_TAG="zed"
+
+##
+# Which set of event subclasses to log
+# By default, events from all subclasses are logged.
+# If ZED_SYSLOG_SUBCLASS_INCLUDE is set, only subclasses
+# matching the pattern are logged. Use the pipe symbol (|)
+# or shell wildcards (*, ?) to match multiple subclasses.
+# Otherwise, if ZED_SYSLOG_SUBCLASS_EXCLUDE is set, the
+# matching subclasses are excluded from logging.
+#ZED_SYSLOG_SUBCLASS_INCLUDE="checksum|scrub_*|vdev.*"
+#ZED_SYSLOG_SUBCLASS_EXCLUDE="statechange|config_*|history_event"
+
diff --git a/sys/contrib/openzfs/cmd/zed/zed.h b/sys/contrib/openzfs/cmd/zed/zed.h
new file mode 100644
index 000000000000..3ac0e63141e8
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed.h
@@ -0,0 +1,58 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef ZED_H
+#define ZED_H
+
+/*
+ * Absolute path for the default zed configuration file.
+ */
+#define ZED_CONF_FILE SYSCONFDIR "/zfs/zed.conf"
+
+/*
+ * Absolute path for the default zed pid file.
+ */
+#define ZED_PID_FILE RUNSTATEDIR "/zed.pid"
+
+/*
+ * Absolute path for the default zed state file.
+ */
+#define ZED_STATE_FILE RUNSTATEDIR "/zed.state"
+
+/*
+ * Absolute path for the default zed zedlet directory.
+ */
+#define ZED_ZEDLET_DIR SYSCONFDIR "/zfs/zed.d"
+
+/*
+ * Reserved for future use.
+ */
+#define ZED_MAX_EVENTS 0
+
+/*
+ * Reserved for future use.
+ */
+#define ZED_MIN_EVENTS 0
+
+/*
+ * String prefix for ZED variables passed via environment variables.
+ */
+#define ZED_VAR_PREFIX "ZED_"
+
+/*
+ * String prefix for ZFS event names passed via environment variables.
+ */
+#define ZEVENT_VAR_PREFIX "ZEVENT_"
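+
+/*
+ * For example, zedlets see event data as environment variables with this
+ * prefix, such as ZEVENT_EID, ZEVENT_POOL, and ZEVENT_SUBCLASS.
+ */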
+
+#endif /* !ZED_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_conf.c b/sys/contrib/openzfs/cmd/zed/zed_conf.c
new file mode 100644
index 000000000000..52370eb87b29
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_conf.c
@@ -0,0 +1,735 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <unistd.h>
+#include "zed.h"
+#include "zed_conf.h"
+#include "zed_file.h"
+#include "zed_log.h"
+#include "zed_strings.h"
+
+/*
+ * Return a new configuration with default values.
+ */
+struct zed_conf *
+zed_conf_create(void)
+{
+ struct zed_conf *zcp;
+
+ zcp = calloc(1, sizeof (*zcp));
+ if (!zcp)
+ goto nomem;
+
+ zcp->syslog_facility = LOG_DAEMON;
+ zcp->min_events = ZED_MIN_EVENTS;
+ zcp->max_events = ZED_MAX_EVENTS;
+ zcp->pid_fd = -1;
+ zcp->zedlets = NULL; /* created via zed_conf_scan_dir() */
+ zcp->state_fd = -1; /* opened via zed_conf_open_state() */
+ zcp->zfs_hdl = NULL; /* opened via zed_event_init() */
+ zcp->zevent_fd = -1; /* opened via zed_event_init() */
+
+ if (!(zcp->conf_file = strdup(ZED_CONF_FILE)))
+ goto nomem;
+
+ if (!(zcp->pid_file = strdup(ZED_PID_FILE)))
+ goto nomem;
+
+ if (!(zcp->zedlet_dir = strdup(ZED_ZEDLET_DIR)))
+ goto nomem;
+
+ if (!(zcp->state_file = strdup(ZED_STATE_FILE)))
+ goto nomem;
+
+ return (zcp);
+
+nomem:
+ zed_log_die("Failed to create conf: %s", strerror(errno));
+ return (NULL);
+}
+
+/*
+ * Destroy the configuration [zcp].
+ *
+ * Note: zfs_hdl & zevent_fd are destroyed via zed_event_fini().
+ */
+void
+zed_conf_destroy(struct zed_conf *zcp)
+{
+ if (!zcp)
+ return;
+
+ if (zcp->state_fd >= 0) {
+ if (close(zcp->state_fd) < 0)
+ zed_log_msg(LOG_WARNING,
+ "Failed to close state file \"%s\": %s",
+ zcp->state_file, strerror(errno));
+ zcp->state_fd = -1;
+ }
+ if (zcp->pid_file) {
+ if ((unlink(zcp->pid_file) < 0) && (errno != ENOENT))
+ zed_log_msg(LOG_WARNING,
+ "Failed to remove PID file \"%s\": %s",
+ zcp->pid_file, strerror(errno));
+ }
+ if (zcp->pid_fd >= 0) {
+ if (close(zcp->pid_fd) < 0)
+ zed_log_msg(LOG_WARNING,
+ "Failed to close PID file \"%s\": %s",
+ zcp->pid_file, strerror(errno));
+ zcp->pid_fd = -1;
+ }
+ if (zcp->conf_file) {
+ free(zcp->conf_file);
+ zcp->conf_file = NULL;
+ }
+ if (zcp->pid_file) {
+ free(zcp->pid_file);
+ zcp->pid_file = NULL;
+ }
+ if (zcp->zedlet_dir) {
+ free(zcp->zedlet_dir);
+ zcp->zedlet_dir = NULL;
+ }
+ if (zcp->state_file) {
+ free(zcp->state_file);
+ zcp->state_file = NULL;
+ }
+ if (zcp->zedlets) {
+ zed_strings_destroy(zcp->zedlets);
+ zcp->zedlets = NULL;
+ }
+ free(zcp);
+}
+
+/*
+ * Display command-line help and exit.
+ *
+ * If [got_err] is 0, output to stdout and exit normally;
+ * otherwise, output to stderr and exit with a failure status.
+ */
+static void
+_zed_conf_display_help(const char *prog, int got_err)
+{
+ FILE *fp = got_err ? stderr : stdout;
+ int w1 = 4; /* width of leading whitespace */
+ int w2 = 8; /* width of L-justified option field */
+
+ fprintf(fp, "Usage: %s [OPTION]...\n", (prog ? prog : "zed"));
+ fprintf(fp, "\n");
+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-h",
+ "Display help.");
+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-L",
+ "Display license information.");
+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-V",
+ "Display version information.");
+ fprintf(fp, "\n");
+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-v",
+ "Be verbose.");
+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-f",
+ "Force daemon to run.");
+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-F",
+ "Run daemon in the foreground.");
+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-I",
+ "Idle daemon until kernel module is (re)loaded.");
+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-M",
+ "Lock all pages in memory.");
+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-P",
+ "$PATH for ZED to use (only used by ZTS).");
+ fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-Z",
+ "Zero state file.");
+ fprintf(fp, "\n");
+#if 0
+ fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-c FILE",
+ "Read configuration from FILE.", ZED_CONF_FILE);
+#endif
+ fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-d DIR",
+ "Read enabled ZEDLETs from DIR.", ZED_ZEDLET_DIR);
+ fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-p FILE",
+ "Write daemon's PID to FILE.", ZED_PID_FILE);
+ fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-s FILE",
+ "Write daemon's state to FILE.", ZED_STATE_FILE);
+ fprintf(fp, "\n");
+
+ exit(got_err ? EXIT_FAILURE : EXIT_SUCCESS);
+}
+
+/*
+ * Display license information to stdout and exit.
+ */
+static void
+_zed_conf_display_license(void)
+{
+ const char **pp;
+ const char *text[] = {
+ "The ZFS Event Daemon (ZED) is distributed under the terms of the",
+ " Common Development and Distribution License (CDDL-1.0)",
+ " <http://opensource.org/licenses/CDDL-1.0>.",
+ "",
+ "Developed at Lawrence Livermore National Laboratory"
+ " (LLNL-CODE-403049).",
+ "",
+ NULL
+ };
+
+ for (pp = text; *pp; pp++)
+ printf("%s\n", *pp);
+
+ exit(EXIT_SUCCESS);
+}
+
+/*
+ * Display version information to stdout and exit.
+ */
+static void
+_zed_conf_display_version(void)
+{
+ printf("%s-%s-%s\n",
+ ZFS_META_NAME, ZFS_META_VERSION, ZFS_META_RELEASE);
+
+ exit(EXIT_SUCCESS);
+}
+
+/*
+ * Copy the [path] string to the [resultp] ptr.
+ * If [path] is not an absolute path, prefix it with the current working dir.
+ * If [resultp] is non-null, free its existing string before assignment.
+ */
+static void
+_zed_conf_parse_path(char **resultp, const char *path)
+{
+ char buf[PATH_MAX];
+
+ assert(resultp != NULL);
+ assert(path != NULL);
+
+ if (*resultp)
+ free(*resultp);
+
+ if (path[0] == '/') {
+ *resultp = strdup(path);
+ } else if (!getcwd(buf, sizeof (buf))) {
+ zed_log_die("Failed to get current working dir: %s",
+ strerror(errno));
+ } else if (strlcat(buf, "/", sizeof (buf)) >= sizeof (buf)) {
+ zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG));
+ } else if (strlcat(buf, path, sizeof (buf)) >= sizeof (buf)) {
+ zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG));
+ } else {
+ *resultp = strdup(buf);
+ }
+ if (!*resultp)
+ zed_log_die("Failed to copy path: %s", strerror(ENOMEM));
+}
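+
+/*
+ * For example (illustrative), with a current working directory of "/etc/zfs",
+ * _zed_conf_parse_path(&zcp->pid_file, "zed.pid") stores "/etc/zfs/zed.pid",
+ * whereas an absolute argument such as "/run/zed.pid" is copied unchanged.
+ */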
+
+/*
+ * Parse the command-line options into the configuration [zcp].
+ */
+void
+zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv)
+{
+ const char * const opts = ":hLVc:d:p:P:s:vfFMZI";
+ int opt;
+
+ if (!zcp || !argv || !argv[0])
+ zed_log_die("Failed to parse options: Internal error");
+
+ opterr = 0; /* suppress default getopt err msgs */
+
+ while ((opt = getopt(argc, argv, opts)) != -1) {
+ switch (opt) {
+ case 'h':
+ _zed_conf_display_help(argv[0], EXIT_SUCCESS);
+ break;
+ case 'L':
+ _zed_conf_display_license();
+ break;
+ case 'V':
+ _zed_conf_display_version();
+ break;
+ case 'c':
+ _zed_conf_parse_path(&zcp->conf_file, optarg);
+ break;
+ case 'd':
+ _zed_conf_parse_path(&zcp->zedlet_dir, optarg);
+ break;
+ case 'I':
+ zcp->do_idle = 1;
+ break;
+ case 'p':
+ _zed_conf_parse_path(&zcp->pid_file, optarg);
+ break;
+ case 'P':
+ _zed_conf_parse_path(&zcp->path, optarg);
+ break;
+ case 's':
+ _zed_conf_parse_path(&zcp->state_file, optarg);
+ break;
+ case 'v':
+ zcp->do_verbose = 1;
+ break;
+ case 'f':
+ zcp->do_force = 1;
+ break;
+ case 'F':
+ zcp->do_foreground = 1;
+ break;
+ case 'M':
+ zcp->do_memlock = 1;
+ break;
+ case 'Z':
+ zcp->do_zero = 1;
+ break;
+ case '?':
+ default:
+ if (optopt == '?')
+ _zed_conf_display_help(argv[0], EXIT_SUCCESS);
+
+ fprintf(stderr, "%s: %s '-%c'\n\n", argv[0],
+ "Invalid option", optopt);
+ _zed_conf_display_help(argv[0], EXIT_FAILURE);
+ break;
+ }
+ }
+}
+
+/*
+ * Parse the configuration file into the configuration [zcp].
+ *
+ * FIXME: Not yet implemented.
+ */
+void
+zed_conf_parse_file(struct zed_conf *zcp)
+{
+ if (!zcp)
+ zed_log_die("Failed to parse config: %s", strerror(EINVAL));
+}
+
+/*
+ * Scan the [zcp] zedlet_dir for files to exec based on the event class.
+ * Files must be executable by user, but not writable by group or other.
+ * Dotfiles are ignored.
+ *
+ * Return 0 on success with an updated set of zedlets,
+ * or -1 on error with errno set.
+ *
+ * FIXME: Check if zedlet_dir and all parent dirs are secure.
+ */
+int
+zed_conf_scan_dir(struct zed_conf *zcp)
+{
+ zed_strings_t *zedlets;
+ DIR *dirp;
+ struct dirent *direntp;
+ char pathname[PATH_MAX];
+ struct stat st;
+ int n;
+
+ if (!zcp) {
+ errno = EINVAL;
+ zed_log_msg(LOG_ERR, "Failed to scan zedlet dir: %s",
+ strerror(errno));
+ return (-1);
+ }
+ zedlets = zed_strings_create();
+ if (!zedlets) {
+ errno = ENOMEM;
+ zed_log_msg(LOG_WARNING, "Failed to scan dir \"%s\": %s",
+ zcp->zedlet_dir, strerror(errno));
+ return (-1);
+ }
+ dirp = opendir(zcp->zedlet_dir);
+ if (!dirp) {
+ int errno_bak = errno;
+ zed_log_msg(LOG_WARNING, "Failed to open dir \"%s\": %s",
+ zcp->zedlet_dir, strerror(errno));
+ zed_strings_destroy(zedlets);
+ errno = errno_bak;
+ return (-1);
+ }
+ while ((direntp = readdir(dirp))) {
+ if (direntp->d_name[0] == '.')
+ continue;
+
+ n = snprintf(pathname, sizeof (pathname),
+ "%s/%s", zcp->zedlet_dir, direntp->d_name);
+ if ((n < 0) || (n >= sizeof (pathname))) {
+ zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s",
+ direntp->d_name, strerror(ENAMETOOLONG));
+ continue;
+ }
+ if (stat(pathname, &st) < 0) {
+ zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s",
+ pathname, strerror(errno));
+ continue;
+ }
+ if (!S_ISREG(st.st_mode)) {
+ zed_log_msg(LOG_INFO,
+ "Ignoring \"%s\": not a regular file",
+ direntp->d_name);
+ continue;
+ }
+ if ((st.st_uid != 0) && !zcp->do_force) {
+ zed_log_msg(LOG_NOTICE,
+ "Ignoring \"%s\": not owned by root",
+ direntp->d_name);
+ continue;
+ }
+ if (!(st.st_mode & S_IXUSR)) {
+ zed_log_msg(LOG_INFO,
+ "Ignoring \"%s\": not executable by user",
+ direntp->d_name);
+ continue;
+ }
+ if ((st.st_mode & S_IWGRP) && !zcp->do_force) {
+ zed_log_msg(LOG_NOTICE,
+ "Ignoring \"%s\": writable by group",
+ direntp->d_name);
+ continue;
+ }
+ if ((st.st_mode & S_IWOTH) && !zcp->do_force) {
+ zed_log_msg(LOG_NOTICE,
+ "Ignoring \"%s\": writable by other",
+ direntp->d_name);
+ continue;
+ }
+ if (zed_strings_add(zedlets, NULL, direntp->d_name) < 0) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to register \"%s\": %s",
+ direntp->d_name, strerror(errno));
+ continue;
+ }
+ if (zcp->do_verbose)
+ zed_log_msg(LOG_INFO,
+ "Registered zedlet \"%s\"", direntp->d_name);
+ }
+ if (closedir(dirp) < 0) {
+ int errno_bak = errno;
+ zed_log_msg(LOG_WARNING, "Failed to close dir \"%s\": %s",
+ zcp->zedlet_dir, strerror(errno));
+ zed_strings_destroy(zedlets);
+ errno = errno_bak;
+ return (-1);
+ }
+ if (zcp->zedlets)
+ zed_strings_destroy(zcp->zedlets);
+
+ zcp->zedlets = zedlets;
+ return (0);
+}
+
+/*
+ * Write the PID file specified in [zcp].
+ * Return 0 on success, -1 on error.
+ *
+ * This must be called after fork()ing to become a daemon (so the correct PID
+ * is recorded), but before daemonization is complete and the parent process
+ * exits (for synchronization with systemd).
+ */
+int
+zed_conf_write_pid(struct zed_conf *zcp)
+{
+ const mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
+ const mode_t filemode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+ char buf[PATH_MAX];
+ int n;
+ char *p;
+ mode_t mask;
+ int rv;
+
+ if (!zcp || !zcp->pid_file) {
+ errno = EINVAL;
+ zed_log_msg(LOG_ERR, "Failed to create PID file: %s",
+ strerror(errno));
+ return (-1);
+ }
+ assert(zcp->pid_fd == -1);
+ /*
+ * Create PID file directory if needed.
+ */
+ n = strlcpy(buf, zcp->pid_file, sizeof (buf));
+ if (n >= sizeof (buf)) {
+ errno = ENAMETOOLONG;
+ zed_log_msg(LOG_ERR, "Failed to create PID file: %s",
+ strerror(errno));
+ goto err;
+ }
+ p = strrchr(buf, '/');
+ if (p)
+ *p = '\0';
+
+ if ((mkdirp(buf, dirmode) < 0) && (errno != EEXIST)) {
+ zed_log_msg(LOG_ERR, "Failed to create directory \"%s\": %s",
+ buf, strerror(errno));
+ goto err;
+ }
+ /*
+ * Obtain PID file lock.
+ */
+ mask = umask(0);
+ umask(mask | 022);
+ zcp->pid_fd = open(zcp->pid_file, (O_RDWR | O_CREAT), filemode);
+ umask(mask);
+ if (zcp->pid_fd < 0) {
+ zed_log_msg(LOG_ERR, "Failed to open PID file \"%s\": %s",
+ zcp->pid_file, strerror(errno));
+ goto err;
+ }
+ rv = zed_file_lock(zcp->pid_fd);
+ if (rv < 0) {
+ zed_log_msg(LOG_ERR, "Failed to lock PID file \"%s\": %s",
+ zcp->pid_file, strerror(errno));
+ goto err;
+ } else if (rv > 0) {
+ pid_t pid = zed_file_is_locked(zcp->pid_fd);
+ if (pid < 0) {
+ zed_log_msg(LOG_ERR,
+ "Failed to test lock on PID file \"%s\"",
+ zcp->pid_file);
+ } else if (pid > 0) {
+ zed_log_msg(LOG_ERR,
+ "Found PID %d bound to PID file \"%s\"",
+ pid, zcp->pid_file);
+ } else {
+ zed_log_msg(LOG_ERR,
+ "Inconsistent lock state on PID file \"%s\"",
+ zcp->pid_file);
+ }
+ goto err;
+ }
+ /*
+ * Write PID file.
+ */
+ n = snprintf(buf, sizeof (buf), "%d\n", (int)getpid());
+ if ((n < 0) || (n >= sizeof (buf))) {
+ errno = ERANGE;
+ zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s",
+ zcp->pid_file, strerror(errno));
+ } else if (zed_file_write_n(zcp->pid_fd, buf, n) != n) {
+ zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s",
+ zcp->pid_file, strerror(errno));
+ } else if (fdatasync(zcp->pid_fd) < 0) {
+ zed_log_msg(LOG_ERR, "Failed to sync PID file \"%s\": %s",
+ zcp->pid_file, strerror(errno));
+ } else {
+ return (0);
+ }
+
+err:
+ if (zcp->pid_fd >= 0) {
+ (void) close(zcp->pid_fd);
+ zcp->pid_fd = -1;
+ }
+ return (-1);
+}
+
+/*
+ * Open and lock the [zcp] state_file.
+ * Return 0 on success, -1 on error.
+ *
+ * FIXME: Move state information into kernel.
+ */
+int
+zed_conf_open_state(struct zed_conf *zcp)
+{
+ char dirbuf[PATH_MAX];
+ mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
+ int n;
+ char *p;
+ int rv;
+
+ if (!zcp || !zcp->state_file) {
+ errno = EINVAL;
+ zed_log_msg(LOG_ERR, "Failed to open state file: %s",
+ strerror(errno));
+ return (-1);
+ }
+ n = strlcpy(dirbuf, zcp->state_file, sizeof (dirbuf));
+ if (n >= sizeof (dirbuf)) {
+ errno = ENAMETOOLONG;
+ zed_log_msg(LOG_WARNING, "Failed to open state file: %s",
+ strerror(errno));
+ return (-1);
+ }
+ p = strrchr(dirbuf, '/');
+ if (p)
+ *p = '\0';
+
+ if ((mkdirp(dirbuf, dirmode) < 0) && (errno != EEXIST)) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to create directory \"%s\": %s",
+ dirbuf, strerror(errno));
+ return (-1);
+ }
+ if (zcp->state_fd >= 0) {
+ if (close(zcp->state_fd) < 0) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to close state file \"%s\": %s",
+ zcp->state_file, strerror(errno));
+ return (-1);
+ }
+ }
+ if (zcp->do_zero)
+ (void) unlink(zcp->state_file);
+
+ zcp->state_fd = open(zcp->state_file,
+ (O_RDWR | O_CREAT), (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH));
+ if (zcp->state_fd < 0) {
+ zed_log_msg(LOG_WARNING, "Failed to open state file \"%s\": %s",
+ zcp->state_file, strerror(errno));
+ return (-1);
+ }
+ rv = zed_file_lock(zcp->state_fd);
+ if (rv < 0) {
+ zed_log_msg(LOG_WARNING, "Failed to lock state file \"%s\": %s",
+ zcp->state_file, strerror(errno));
+ return (-1);
+ }
+ if (rv > 0) {
+ pid_t pid = zed_file_is_locked(zcp->state_fd);
+ if (pid < 0) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to test lock on state file \"%s\"",
+ zcp->state_file);
+ } else if (pid > 0) {
+ zed_log_msg(LOG_WARNING,
+ "Found PID %d bound to state file \"%s\"",
+ pid, zcp->state_file);
+ } else {
+ zed_log_msg(LOG_WARNING,
+ "Inconsistent lock state on state file \"%s\"",
+ zcp->state_file);
+ }
+ return (-1);
+ }
+ return (0);
+}
+
+/*
+ * Read the opened [zcp] state_file to obtain the eid & etime of the last event
+ * processed. Write the state from the last event to the [eidp] & [etime] args
+ * passed by reference. Note that etime[] is an array of size 2.
+ * Return 0 on success, -1 on error.
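+ *
+ * The state file layout matches the writev() in zed_conf_write_state():
+ * a uint64_t eid followed by int64_t etime[0] and int64_t etime[1],
+ * stored contiguously in the host's native byte order.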
+ */
+int
+zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[])
+{
+ ssize_t len;
+ struct iovec iov[3];
+ ssize_t n;
+
+ if (!zcp || !eidp || !etime) {
+ errno = EINVAL;
+ zed_log_msg(LOG_ERR,
+ "Failed to read state file: %s", strerror(errno));
+ return (-1);
+ }
+ if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to reposition state file offset: %s",
+ strerror(errno));
+ return (-1);
+ }
+ len = 0;
+ iov[0].iov_base = eidp;
+ len += iov[0].iov_len = sizeof (*eidp);
+ iov[1].iov_base = &etime[0];
+ len += iov[1].iov_len = sizeof (etime[0]);
+ iov[2].iov_base = &etime[1];
+ len += iov[2].iov_len = sizeof (etime[1]);
+
+ n = readv(zcp->state_fd, iov, 3);
+ if (n == 0) {
+ *eidp = 0;
+ } else if (n < 0) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to read state file \"%s\": %s",
+ zcp->state_file, strerror(errno));
+ return (-1);
+ } else if (n != len) {
+ errno = EIO;
+ zed_log_msg(LOG_WARNING,
+ "Failed to read state file \"%s\": Read %d of %d bytes",
+ zcp->state_file, n, len);
+ return (-1);
+ }
+ return (0);
+}
+
+/*
+ * Write the [eid] & [etime] of the last processed event to the opened
+ * [zcp] state_file. Note that etime[] is an array of size 2.
+ * Return 0 on success, -1 on error.
+ */
+int
+zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[])
+{
+ ssize_t len;
+ struct iovec iov[3];
+ ssize_t n;
+
+ if (!zcp) {
+ errno = EINVAL;
+ zed_log_msg(LOG_ERR,
+ "Failed to write state file: %s", strerror(errno));
+ return (-1);
+ }
+ if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to reposition state file offset: %s",
+ strerror(errno));
+ return (-1);
+ }
+ len = 0;
+ iov[0].iov_base = &eid;
+ len += iov[0].iov_len = sizeof (eid);
+ iov[1].iov_base = &etime[0];
+ len += iov[1].iov_len = sizeof (etime[0]);
+ iov[2].iov_base = &etime[1];
+ len += iov[2].iov_len = sizeof (etime[1]);
+
+ n = writev(zcp->state_fd, iov, 3);
+ if (n < 0) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to write state file \"%s\": %s",
+ zcp->state_file, strerror(errno));
+ return (-1);
+ }
+ if (n != len) {
+ errno = EIO;
+ zed_log_msg(LOG_WARNING,
+ "Failed to write state file \"%s\": Wrote %d of %d bytes",
+ zcp->state_file, n, len);
+ return (-1);
+ }
+ if (fdatasync(zcp->state_fd) < 0) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to sync state file \"%s\": %s",
+ zcp->state_file, strerror(errno));
+ return (-1);
+ }
+ return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed_conf.h b/sys/contrib/openzfs/cmd/zed/zed_conf.h
new file mode 100644
index 000000000000..424cb2c01c8c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_conf.h
@@ -0,0 +1,62 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef ZED_CONF_H
+#define ZED_CONF_H
+
+#include <libzfs.h>
+#include <stdint.h>
+#include "zed_strings.h"
+
+struct zed_conf {
+ unsigned do_force:1; /* true if force enabled */
+ unsigned do_foreground:1; /* true if run in foreground */
+ unsigned do_memlock:1; /* true if locking memory */
+ unsigned do_verbose:1; /* true if verbosity enabled */
+ unsigned do_zero:1; /* true if zeroing state */
+ unsigned do_idle:1; /* true if idle enabled */
+ int syslog_facility; /* syslog facility value */
+ int min_events; /* RESERVED FOR FUTURE USE */
+ int max_events; /* RESERVED FOR FUTURE USE */
+ char *conf_file; /* abs path to config file */
+ char *pid_file; /* abs path to pid file */
+ int pid_fd; /* fd to pid file for lock */
+ char *zedlet_dir; /* abs path to zedlet dir */
+ zed_strings_t *zedlets; /* names of enabled zedlets */
+ char *state_file; /* abs path to state file */
+ int state_fd; /* fd to state file */
+ libzfs_handle_t *zfs_hdl; /* handle to libzfs */
+ int zevent_fd; /* fd for access to zevents */
+ char *path; /* custom $PATH for zedlets to use */
+};
+
+struct zed_conf *zed_conf_create(void);
+
+void zed_conf_destroy(struct zed_conf *zcp);
+
+void zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv);
+
+void zed_conf_parse_file(struct zed_conf *zcp);
+
+int zed_conf_scan_dir(struct zed_conf *zcp);
+
+int zed_conf_write_pid(struct zed_conf *zcp);
+
+int zed_conf_open_state(struct zed_conf *zcp);
+
+int zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]);
+
+int zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]);
+
+#endif /* !ZED_CONF_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_disk_event.c b/sys/contrib/openzfs/cmd/zed/zed_disk_event.c
new file mode 100644
index 000000000000..174d24523253
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_disk_event.c
@@ -0,0 +1,416 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017, Intel Corporation.
+ */
+
+#ifdef HAVE_LIBUDEV
+
+#include <errno.h>
+#include <fcntl.h>
+#include <libnvpair.h>
+#include <libudev.h>
+#include <libzfs.h>
+#include <libzutil.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
+
+#include "zed_log.h"
+#include "zed_disk_event.h"
+#include "agents/zfs_agents.h"
+
+/*
+ * Portions of ZED need to see disk events for disks belonging to ZFS pools.
+ * A libudev monitor is established to monitor block device actions and pass
+ * them on to internal ZED logic modules. Initially, zfs_mod.c is the only
+ * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM
+ * consumer and is the Linux equivalent of the illumos syseventd ZFS SLM
+ */
+
+pthread_t g_mon_tid;
+struct udev *g_udev;
+struct udev_monitor *g_mon;
+
+
+#define DEV_BYID_PATH "/dev/disk/by-id/"
+
+/* 64MB is the minimum usable disk size for ZFS */
+#define MINIMUM_SECTORS 131072
+
+
+/*
+ * Post disk event to SLM module
+ *
+ * occurs in the context of monitor thread
+ */
+static void
+zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl)
+{
+ char *strval;
+ uint64_t numval;
+
+ zed_log_msg(LOG_INFO, "zed_disk_event:");
+ zed_log_msg(LOG_INFO, "\tclass: %s", class);
+ zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass);
+ if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval);
+ if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval);
+ if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval);
+ if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval);
+ if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval);
+ if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval);
+ if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval);
+
+ (void) zfs_agent_post_event(class, subclass, nvl);
+}
+
+/*
+ * dev_event_nvlist: place event schema into an nv pair list
+ *
+ * NAME VALUE (example)
+ * -------------- --------------------------------------------------------
+ * DEV_NAME /dev/sdl
+ * DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/...
+ * DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC
+ * DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0
+ * DEV_IS_PART ---
+ * DEV_SIZE 500107862016
+ * ZFS_EV_POOL_GUID 17523635698032189180
+ * ZFS_EV_VDEV_GUID 14663607734290803088
+ */
+static nvlist_t *
+dev_event_nvlist(struct udev_device *dev)
+{
+ nvlist_t *nvl;
+ char strval[128];
+ const char *value, *path;
+ uint64_t guid;
+
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+ return (NULL);
+
+ if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0)
+ (void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval);
+ if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0)
+ (void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval);
+ if ((path = udev_device_get_devnode(dev)) != NULL)
+ (void) nvlist_add_string(nvl, DEV_NAME, path);
+ if ((value = udev_device_get_devpath(dev)) != NULL)
+ (void) nvlist_add_string(nvl, DEV_PATH, value);
+ value = udev_device_get_devtype(dev);
+ if ((value != NULL && strcmp("partition", value) == 0) ||
+ (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER")
+ != NULL)) {
+ (void) nvlist_add_boolean(nvl, DEV_IS_PART);
+ }
+ if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) {
+ uint64_t numval = DEV_BSIZE;
+
+ numval *= strtoull(value, NULL, 10);
+ (void) nvlist_add_uint64(nvl, DEV_SIZE, numval);
+ }
+
+ /*
+ * Grab the pool and vdev guids from blkid cache
+ */
+ value = udev_device_get_property_value(dev, "ID_FS_UUID");
+ if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
+ (void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid);
+
+ value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB");
+ if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0)
+ (void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid);
+
+ /*
+ * Either a vdev guid or a devid must be present for matching
+ */
+ if (!nvlist_exists(nvl, DEV_IDENTIFIER) &&
+ !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) {
+ nvlist_free(nvl);
+ return (NULL);
+ }
+
+ return (nvl);
+}
+
+/*
+ * Listen for block device uevents
+ */
+static void *
+zed_udev_monitor(void *arg)
+{
+ struct udev_monitor *mon = arg;
+ char *tmp, *tmp2;
+
+ zed_log_msg(LOG_INFO, "Waiting for new udev disk events...");
+
+ while (1) {
+ struct udev_device *dev;
+ const char *action, *type, *part, *sectors;
+ const char *bus, *uuid;
+ const char *class, *subclass;
+ nvlist_t *nvl;
+ boolean_t is_zfs = B_FALSE;
+
+ /* allow a cancellation while blocked (recvmsg) */
+ pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
+
+ /* blocks at recvmsg until an event occurs */
+ if ((dev = udev_monitor_receive_device(mon)) == NULL) {
+ zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive "
+ "device error %d", errno);
+ continue;
+ }
+
+ /* allow all steps to complete before a cancellation */
+ pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
+
+ /*
+ * Strongly typed device is the preferred filter
+ */
+ type = udev_device_get_property_value(dev, "ID_FS_TYPE");
+ if (type != NULL && type[0] != '\0') {
+ if (strcmp(type, "zfs_member") == 0) {
+ is_zfs = B_TRUE;
+ } else {
+ /* not ours, so skip */
+ zed_log_msg(LOG_INFO, "zed_udev_monitor: skip "
+ "%s (in use by %s)",
+ udev_device_get_devnode(dev), type);
+ udev_device_unref(dev);
+ continue;
+ }
+ }
+
+ /*
+ * if this is a disk and it is partitioned, then the
+ * zfs label will reside in a DEVTYPE=partition and
+ * we can skip passing this event
+ */
+ type = udev_device_get_property_value(dev, "DEVTYPE");
+ part = udev_device_get_property_value(dev,
+ "ID_PART_TABLE_TYPE");
+ if (type != NULL && type[0] != '\0' &&
+ strcmp(type, "disk") == 0 &&
+ part != NULL && part[0] != '\0') {
+ /* skip and wait for partition event */
+ udev_device_unref(dev);
+ continue;
+ }
+
+ /*
+ * ignore small partitions
+ */
+ sectors = udev_device_get_property_value(dev,
+ "ID_PART_ENTRY_SIZE");
+ if (sectors == NULL)
+ sectors = udev_device_get_sysattr_value(dev, "size");
+ if (sectors != NULL &&
+ strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
+ udev_device_unref(dev);
+ continue;
+ }
+
+ /*
+ * If the blkid probe didn't find ZFS, then a persistent
+ * device id string is required in the message schema
+ * for matching with vdevs. Preflight here for expected
+ * udev information.
+ */
+ bus = udev_device_get_property_value(dev, "ID_BUS");
+ uuid = udev_device_get_property_value(dev, "DM_UUID");
+ if (!is_zfs && (bus == NULL && uuid == NULL)) {
+ zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid "
+ "source", udev_device_get_devnode(dev));
+ udev_device_unref(dev);
+ continue;
+ }
+
+ action = udev_device_get_action(dev);
+ if (strcmp(action, "add") == 0) {
+ class = EC_DEV_ADD;
+ subclass = ESC_DISK;
+ } else if (strcmp(action, "remove") == 0) {
+ class = EC_DEV_REMOVE;
+ subclass = ESC_DISK;
+ } else if (strcmp(action, "change") == 0) {
+ class = EC_DEV_STATUS;
+ subclass = ESC_DEV_DLE;
+ } else {
+ zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown",
+ action);
+ udev_device_unref(dev);
+ continue;
+ }
+
+ /*
+ * Special case an EC_DEV_ADD for multipath devices
+ *
+ * When a multipath device is created, udev reports the
+ * following:
+ *
+ * 1. "add" event of the dm device for the multipath device
+ * (like /dev/dm-3).
+ * 2. "change" event to create the actual multipath device
+ * symlink (like /dev/mapper/mpatha). The event also
+ * passes back the relevant DM vars we care about, like
+ * DM_UUID.
+ * 3. Another "change" event identical to #2 (that we ignore).
+ *
+ * To get the behavior we want, we treat the "change" event
+ * in #2 as an "add" event, as if "/dev/mapper/mpatha" were
+ * a new disk being added.
+ */
+ if (strcmp(class, EC_DEV_STATUS) == 0 &&
+ udev_device_get_property_value(dev, "DM_UUID") &&
+ udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) {
+ tmp = (char *)udev_device_get_devnode(dev);
+ tmp2 = zfs_get_underlying_path(tmp);
+ if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) {
+ /*
+ * We have a real underlying device, which
+ * means that this multipath "change" event is
+ * an "add" event.
+ *
+ * If the multipath device and the underlying
+ * dev are the same name (i.e. /dev/dm-5), then
+ * there is no real underlying disk for this
+ * multipath device, and so this "change" event
+ * really is a multipath removal.
+ */
+ class = EC_DEV_ADD;
+ subclass = ESC_DISK;
+ } else {
+ tmp = (char *)
+ udev_device_get_property_value(dev,
+ "DM_NR_VALID_PATHS");
+ /* treat as a multipath remove */
+ if (tmp != NULL && strcmp(tmp, "0") == 0) {
+ class = EC_DEV_REMOVE;
+ subclass = ESC_DISK;
+ }
+ }
+ free(tmp2);
+ }
+
+ /*
+ * Special case an EC_DEV_ADD for scsi_debug devices
+ *
+ * These devices require a udevadm trigger command after
+ * creation in order to register the vdev_id scsidebug alias
+ * rule (which adds a persistent path (phys_path) used for
+ * automated fault-management tests in the ZFS test suite).
+ *
+ * After the udevadm trigger command, the event registers as a
+ * "change" event but needs to be handled as another "add" event
+ * instead, to allow disk labeling and partitioning to occur.
+ */
+ if (strcmp(class, EC_DEV_STATUS) == 0 &&
+ udev_device_get_property_value(dev, "ID_VDEV") &&
+ udev_device_get_property_value(dev, "ID_MODEL")) {
+ const char *id_model, *id_model_sd = "scsi_debug";
+
+ id_model = udev_device_get_property_value(dev,
+ "ID_MODEL");
+ if (strcmp(id_model, id_model_sd) == 0) {
+ class = EC_DEV_ADD;
+ subclass = ESC_DISK;
+ }
+ }
+
+ if ((nvl = dev_event_nvlist(dev)) != NULL) {
+ zed_udev_event(class, subclass, nvl);
+ nvlist_free(nvl);
+ }
+
+ udev_device_unref(dev);
+ }
+
+ return (NULL);
+}
+
+int
+zed_disk_event_init()
+{
+ int fd, fflags;
+
+ if ((g_udev = udev_new()) == NULL) {
+ zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno);
+ return (-1);
+ }
+
+ /* Set up a udev monitor for block devices */
+ g_mon = udev_monitor_new_from_netlink(g_udev, "udev");
+ udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk");
+ udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block",
+ "partition");
+ udev_monitor_enable_receiving(g_mon);
+
+ /* Make sure monitoring socket is blocking */
+ fd = udev_monitor_get_fd(g_mon);
+ if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK)
+ (void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK);
+
+ /* spawn a thread to monitor events */
+ if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) {
+ udev_monitor_unref(g_mon);
+ udev_unref(g_udev);
+ zed_log_msg(LOG_WARNING, "pthread_create failed");
+ return (-1);
+ }
+
+ zed_log_msg(LOG_INFO, "zed_disk_event_init");
+
+ return (0);
+}
+
+void
+zed_disk_event_fini()
+{
+ /* cancel monitor thread at recvmsg() */
+ (void) pthread_cancel(g_mon_tid);
+ (void) pthread_join(g_mon_tid, NULL);
+
+ /* cleanup udev resources */
+ udev_monitor_unref(g_mon);
+ udev_unref(g_udev);
+
+ zed_log_msg(LOG_INFO, "zed_disk_event_fini");
+}
+
+#else
+
+#include "zed_disk_event.h"
+
+int
+zed_disk_event_init()
+{
+ return (0);
+}
+
+void
+zed_disk_event_fini()
+{
+}
+
+#endif /* HAVE_LIBUDEV */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_disk_event.h b/sys/contrib/openzfs/cmd/zed/zed_disk_event.h
new file mode 100644
index 000000000000..ea9813d0a595
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_disk_event.h
@@ -0,0 +1,31 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef ZED_DISK_EVENT_H
+#define ZED_DISK_EVENT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int zed_disk_event_init(void);
+extern void zed_disk_event_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !ZED_DISK_EVENT_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_event.c b/sys/contrib/openzfs/cmd/zed/zed_event.c
new file mode 100644
index 000000000000..1c5d00e297ff
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_event.c
@@ -0,0 +1,965 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libzfs.h> /* FIXME: Replace with libzfs_core. */
+#include <paths.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/zfs_ioctl.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/fm/fs/zfs.h>
+#include "zed.h"
+#include "zed_conf.h"
+#include "zed_disk_event.h"
+#include "zed_event.h"
+#include "zed_exec.h"
+#include "zed_file.h"
+#include "zed_log.h"
+#include "zed_strings.h"
+
+#include "agents/zfs_agents.h"
+
+#define MAXBUF 4096
+
+/*
+ * Open the libzfs interface.
+ */
+int
+zed_event_init(struct zed_conf *zcp)
+{
+ if (!zcp)
+ zed_log_die("Failed zed_event_init: %s", strerror(EINVAL));
+
+ zcp->zfs_hdl = libzfs_init();
+ if (!zcp->zfs_hdl) {
+ if (zcp->do_idle)
+ return (-1);
+ zed_log_die("Failed to initialize libzfs");
+ }
+
+ zcp->zevent_fd = open(ZFS_DEV, O_RDWR);
+ if (zcp->zevent_fd < 0) {
+ if (zcp->do_idle)
+ return (-1);
+ zed_log_die("Failed to open \"%s\": %s",
+ ZFS_DEV, strerror(errno));
+ }
+
+ zfs_agent_init(zcp->zfs_hdl);
+
+ if (zed_disk_event_init() != 0) {
+ if (zcp->do_idle)
+ return (-1);
+ zed_log_die("Failed to initialize disk events");
+ }
+
+ return (0);
+}
+
+/*
+ * Close the libzfs interface.
+ */
+void
+zed_event_fini(struct zed_conf *zcp)
+{
+ if (!zcp)
+ zed_log_die("Failed zed_event_fini: %s", strerror(EINVAL));
+
+ zed_disk_event_fini();
+ zfs_agent_fini();
+
+ if (zcp->zevent_fd >= 0) {
+ if (close(zcp->zevent_fd) < 0)
+ zed_log_msg(LOG_WARNING, "Failed to close \"%s\": %s",
+ ZFS_DEV, strerror(errno));
+
+ zcp->zevent_fd = -1;
+ }
+ if (zcp->zfs_hdl) {
+ libzfs_fini(zcp->zfs_hdl);
+ zcp->zfs_hdl = NULL;
+ }
+}
+
+/*
+ * Seek to the event specified by [saved_eid] and [saved_etime].
+ * This protects against processing a given event more than once.
+ * Return 0 upon a successful seek to the specified event, or -1 otherwise.
+ *
+ * A zevent is considered to be uniquely specified by its (eid,time) tuple.
+ * The unsigned 64b eid is set to 1 when the kernel module is loaded, and
+ * incremented by 1 for each new event. Since the state file can persist
+ * across a kernel module reload, the time must be checked to ensure a match.
+ */
+int
+zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, int64_t saved_etime[])
+{
+ uint64_t eid;
+ int found;
+ nvlist_t *nvl;
+ int n_dropped;
+ int64_t *etime;
+ uint_t nelem;
+ int rv;
+
+ if (!zcp) {
+ errno = EINVAL;
+ zed_log_msg(LOG_ERR, "Failed to seek zevent: %s",
+ strerror(errno));
+ return (-1);
+ }
+ eid = 0;
+ found = 0;
+ while ((eid < saved_eid) && !found) {
+ rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped,
+ ZEVENT_NONBLOCK, zcp->zevent_fd);
+
+ if ((rv != 0) || !nvl)
+ break;
+
+ if (n_dropped > 0) {
+ zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped);
+ /*
+ * FIXME: Increase max size of event nvlist in
+ * /sys/module/zfs/parameters/zfs_zevent_len_max ?
+ */
+ }
+ if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) {
+ zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid");
+ } else if (nvlist_lookup_int64_array(nvl, "time",
+ &etime, &nelem) != 0) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to lookup zevent time (eid=%llu)", eid);
+ } else if (nelem != 2) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to lookup zevent time (eid=%llu, nelem=%u)",
+ eid, nelem);
+ } else if ((eid != saved_eid) ||
+ (etime[0] != saved_etime[0]) ||
+ (etime[1] != saved_etime[1])) {
+ /* no-op */
+ } else {
+ found = 1;
+ }
+ nvlist_free(nvl);
+ }
+ if (!found && (saved_eid > 0)) {
+ if (zpool_events_seek(zcp->zfs_hdl, ZEVENT_SEEK_START,
+ zcp->zevent_fd) < 0)
+ zed_log_msg(LOG_WARNING, "Failed to seek to eid=0");
+ else
+ eid = 0;
+ }
+ zed_log_msg(LOG_NOTICE, "Processing events since eid=%llu", eid);
+ return (found ? 0 : -1);
+}
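+
+/*
+ * Editorial sketch (not part of the original source), assuming the usual
+ * caller behavior: the daemon restores the saved (eid, time) cursor from
+ * its state file and then seeks past events that were already processed:
+ *
+ *     uint64_t saved_eid;
+ *     int64_t saved_etime[2];
+ *
+ *     if (zed_conf_read_state(zcp, &saved_eid, saved_etime) == 0)
+ *         (void) zed_event_seek(zcp, saved_eid, saved_etime);
+ *
+ * Here zed_conf_read_state() is the presumed counterpart of the
+ * zed_conf_write_state() call used in zed_event_service() below.
+ */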
+
+/*
+ * Return non-zero if nvpair [name] should be formatted in hex; o/w, return 0.
+ */
+static int
+_zed_event_value_is_hex(const char *name)
+{
+ const char *hex_suffix[] = {
+ "_guid",
+ "_guids",
+ NULL
+ };
+ const char **pp;
+ char *p;
+
+ if (!name)
+ return (0);
+
+ for (pp = hex_suffix; *pp; pp++) {
+ p = strstr(name, *pp);
+ if (p && strlen(p) == strlen(*pp))
+ return (1);
+ }
+ return (0);
+}
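+
+/*
+ * Editorial sketch (not part of the original source): with the suffix list
+ * above,
+ *
+ *     _zed_event_value_is_hex("vdev_guid")   returns 1 (format in hex)
+ *     _zed_event_value_is_hex("guid_count")  returns 0 (format in decimal)
+ */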
+
+/*
+ * Add an environment variable for [eid] to the container [zsp].
+ *
+ * The variable name is the concatenation of [prefix] and [name] converted to
+ * uppercase with non-alphanumeric characters converted to underscores;
+ * [prefix] is optional, and [name] must begin with an alphabetic character.
+ * If the converted variable name already exists within the container [zsp],
+ * its existing value will be replaced with the new value.
+ *
+ * The variable value is specified by the format string [fmt].
+ *
+ * Returns 0 on success, and -1 on error (with errno set).
+ *
+ * All environment variables in [zsp] should be added through this function.
+ */
+static int
+_zed_event_add_var(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, const char *name, const char *fmt, ...)
+{
+ char keybuf[MAXBUF];
+ char valbuf[MAXBUF];
+ char *dstp;
+ const char *srcp;
+ const char *lastp;
+ int n;
+ int buflen;
+ va_list vargs;
+
+ assert(zsp != NULL);
+ assert(fmt != NULL);
+
+ if (!name) {
+ errno = EINVAL;
+ zed_log_msg(LOG_WARNING,
+ "Failed to add variable for eid=%llu: Name is empty", eid);
+ return (-1);
+ } else if (!isalpha(name[0])) {
+ errno = EINVAL;
+ zed_log_msg(LOG_WARNING,
+ "Failed to add variable for eid=%llu: "
+ "Name \"%s\" is invalid", eid, name);
+ return (-1);
+ }
+ /*
+ * Construct the string key by converting PREFIX (if present) and NAME.
+ */
+ dstp = keybuf;
+ lastp = keybuf + sizeof (keybuf);
+ if (prefix) {
+ for (srcp = prefix; *srcp && (dstp < lastp); srcp++)
+ *dstp++ = isalnum(*srcp) ? toupper(*srcp) : '_';
+ }
+ for (srcp = name; *srcp && (dstp < lastp); srcp++)
+ *dstp++ = isalnum(*srcp) ? toupper(*srcp) : '_';
+
+ if (dstp == lastp) {
+ errno = ENAMETOOLONG;
+ zed_log_msg(LOG_WARNING,
+ "Failed to add variable for eid=%llu: Name too long", eid);
+ return (-1);
+ }
+ *dstp = '\0';
+ /*
+ * Construct the string specified by "[PREFIX][NAME]=[FMT]".
+ */
+ dstp = valbuf;
+ buflen = sizeof (valbuf);
+ n = strlcpy(dstp, keybuf, buflen);
+ if (n >= sizeof (valbuf)) {
+ errno = EMSGSIZE;
+ zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s",
+ keybuf, eid, "Exceeded buffer size");
+ return (-1);
+ }
+ dstp += n;
+ buflen -= n;
+
+ *dstp++ = '=';
+ buflen--;
+
+ if (buflen <= 0) {
+ errno = EMSGSIZE;
+ zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s",
+ keybuf, eid, "Exceeded buffer size");
+ return (-1);
+ }
+
+ va_start(vargs, fmt);
+ n = vsnprintf(dstp, buflen, fmt, vargs);
+ va_end(vargs);
+
+ if ((n < 0) || (n >= buflen)) {
+ errno = EMSGSIZE;
+ zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s",
+ keybuf, eid, "Exceeded buffer size");
+ return (-1);
+ } else if (zed_strings_add(zsp, keybuf, valbuf) < 0) {
+ zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s",
+ keybuf, eid, strerror(errno));
+ return (-1);
+ }
+ return (0);
+}
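+
+/*
+ * Editorial sketch (not part of the original source): with the conversion
+ * rules above, a hypothetical call such as
+ *
+ *     _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "pool_name",
+ *         "%s", "tank");
+ *
+ * stores the string "ZEVENT_POOL_NAME=tank" in [zsp], assuming
+ * ZEVENT_VAR_PREFIX expands to "ZEVENT_"; any non-alphanumeric characters
+ * in the name would be converted to underscores.
+ */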
+
+static int
+_zed_event_add_array_err(uint64_t eid, const char *name)
+{
+ errno = EMSGSIZE;
+ zed_log_msg(LOG_WARNING,
+ "Failed to convert nvpair \"%s\" for eid=%llu: "
+ "Exceeded buffer size", name, eid);
+ return (-1);
+}
+
+static int
+_zed_event_add_int8_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ int8_t *i8p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT8_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_int8_array(nvp, &i8p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%d ", i8p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_uint8_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ uint8_t *u8p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT8_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_uint8_array(nvp, &u8p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%u ", u8p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_int16_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ int16_t *i16p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT16_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_int16_array(nvp, &i16p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%d ", i16p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_uint16_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ uint16_t *u16p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT16_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_uint16_array(nvp, &u16p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%u ", u16p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_int32_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ int32_t *i32p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT32_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_int32_array(nvp, &i32p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%d ", i32p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_uint32_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ uint32_t *u32p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_uint32_array(nvp, &u32p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%u ", u32p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_int64_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ int64_t *i64p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT64_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_int64_array(nvp, &i64p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%lld ", (u_longlong_t)i64p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_uint64_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ const char *fmt;
+ uint64_t *u64p;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT64_ARRAY));
+
+ name = nvpair_name(nvp);
+ fmt = _zed_event_value_is_hex(name) ? "0x%.16llX " : "%llu ";
+ (void) nvpair_value_uint64_array(nvp, &u64p, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, fmt, (u_longlong_t)u64p[i]);
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
+
+static int
+_zed_event_add_string_array(uint64_t eid, zed_strings_t *zsp,
+ const char *prefix, nvpair_t *nvp)
+{
+ char buf[MAXBUF];
+ int buflen = sizeof (buf);
+ const char *name;
+ char **strp;
+ uint_t nelem;
+ uint_t i;
+ char *p;
+ int n;
+
+ assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_STRING_ARRAY));
+
+ name = nvpair_name(nvp);
+ (void) nvpair_value_string_array(nvp, &strp, &nelem);
+ for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) {
+ n = snprintf(p, buflen, "%s ", strp[i] ? strp[i] : "<NULL>");
+ if ((n < 0) || (n >= buflen))
+ return (_zed_event_add_array_err(eid, name));
+ p += n;
+ buflen -= n;
+ }
+ if (nelem > 0)
+ *--p = '\0';
+
+ return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf));
+}
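+
+/*
+ * Editorial sketch (not part of the original source): each of the array
+ * helpers above flattens its nvpair into one space-separated value.  For
+ * example, a DATA_TYPE_UINT64_ARRAY pair named "vdev_guids" holding
+ * { 0x1122, 0x3344 } would be exported (via the hex rule) roughly as
+ *
+ *     ZEVENT_VDEV_GUIDS="0x0000000000001122 0x0000000000003344"
+ */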
+
+/*
+ * Convert the nvpair [nvp] to a string which is added to the environment
+ * of the child process.
+ *
+ * FIXME: Refactor with cmd/zpool/zpool_main.c:zpool_do_events_nvprint()?
+ */
+static void
+_zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp)
+{
+ const char *name;
+ data_type_t type;
+ const char *prefix = ZEVENT_VAR_PREFIX;
+ boolean_t b;
+ double d;
+ uint8_t i8;
+ uint16_t i16;
+ uint32_t i32;
+ uint64_t i64;
+ char *str;
+
+ assert(zsp != NULL);
+ assert(nvp != NULL);
+
+ name = nvpair_name(nvp);
+ type = nvpair_type(nvp);
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ _zed_event_add_var(eid, zsp, prefix, name, "%s", "1");
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ (void) nvpair_value_boolean_value(nvp, &b);
+ _zed_event_add_var(eid, zsp, prefix, name, "%s", b ? "1" : "0");
+ break;
+ case DATA_TYPE_BYTE:
+ (void) nvpair_value_byte(nvp, &i8);
+ _zed_event_add_var(eid, zsp, prefix, name, "%d", i8);
+ break;
+ case DATA_TYPE_INT8:
+ (void) nvpair_value_int8(nvp, (int8_t *)&i8);
+ _zed_event_add_var(eid, zsp, prefix, name, "%d", i8);
+ break;
+ case DATA_TYPE_UINT8:
+ (void) nvpair_value_uint8(nvp, &i8);
+ _zed_event_add_var(eid, zsp, prefix, name, "%u", i8);
+ break;
+ case DATA_TYPE_INT16:
+ (void) nvpair_value_int16(nvp, (int16_t *)&i16);
+ _zed_event_add_var(eid, zsp, prefix, name, "%d", i16);
+ break;
+ case DATA_TYPE_UINT16:
+ (void) nvpair_value_uint16(nvp, &i16);
+ _zed_event_add_var(eid, zsp, prefix, name, "%u", i16);
+ break;
+ case DATA_TYPE_INT32:
+ (void) nvpair_value_int32(nvp, (int32_t *)&i32);
+ _zed_event_add_var(eid, zsp, prefix, name, "%d", i32);
+ break;
+ case DATA_TYPE_UINT32:
+ (void) nvpair_value_uint32(nvp, &i32);
+ _zed_event_add_var(eid, zsp, prefix, name, "%u", i32);
+ break;
+ case DATA_TYPE_INT64:
+ (void) nvpair_value_int64(nvp, (int64_t *)&i64);
+ _zed_event_add_var(eid, zsp, prefix, name,
+ "%lld", (longlong_t)i64);
+ break;
+ case DATA_TYPE_UINT64:
+ (void) nvpair_value_uint64(nvp, &i64);
+ _zed_event_add_var(eid, zsp, prefix, name,
+ (_zed_event_value_is_hex(name) ? "0x%.16llX" : "%llu"),
+ (u_longlong_t)i64);
+ /*
+ * shadow readable strings for vdev state pairs
+ */
+ if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 ||
+ strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) {
+ char alt[32];
+
+ (void) snprintf(alt, sizeof (alt), "%s_str", name);
+ _zed_event_add_var(eid, zsp, prefix, alt, "%s",
+ zpool_state_to_name(i64, VDEV_AUX_NONE));
+ } else
+ /*
+ * shadow readable strings for pool state
+ */
+ if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_POOL_STATE) == 0) {
+ char alt[32];
+
+ (void) snprintf(alt, sizeof (alt), "%s_str", name);
+ _zed_event_add_var(eid, zsp, prefix, alt, "%s",
+ zpool_pool_state_to_name(i64));
+ }
+ break;
+ case DATA_TYPE_DOUBLE:
+ (void) nvpair_value_double(nvp, &d);
+ _zed_event_add_var(eid, zsp, prefix, name, "%g", d);
+ break;
+ case DATA_TYPE_HRTIME:
+ (void) nvpair_value_hrtime(nvp, (hrtime_t *)&i64);
+ _zed_event_add_var(eid, zsp, prefix, name,
+ "%llu", (u_longlong_t)i64);
+ break;
+ case DATA_TYPE_NVLIST:
+ _zed_event_add_var(eid, zsp, prefix, name,
+ "%s", "_NOT_IMPLEMENTED_"); /* FIXME */
+ break;
+ case DATA_TYPE_STRING:
+ (void) nvpair_value_string(nvp, &str);
+ _zed_event_add_var(eid, zsp, prefix, name,
+ "%s", (str ? str : "<NULL>"));
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ _zed_event_add_var(eid, zsp, prefix, name,
+ "%s", "_NOT_IMPLEMENTED_"); /* FIXME */
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ _zed_event_add_var(eid, zsp, prefix, name,
+ "%s", "_NOT_IMPLEMENTED_"); /* FIXME */
+ break;
+ case DATA_TYPE_INT8_ARRAY:
+ _zed_event_add_int8_array(eid, zsp, prefix, nvp);
+ break;
+ case DATA_TYPE_UINT8_ARRAY:
+ _zed_event_add_uint8_array(eid, zsp, prefix, nvp);
+ break;
+ case DATA_TYPE_INT16_ARRAY:
+ _zed_event_add_int16_array(eid, zsp, prefix, nvp);
+ break;
+ case DATA_TYPE_UINT16_ARRAY:
+ _zed_event_add_uint16_array(eid, zsp, prefix, nvp);
+ break;
+ case DATA_TYPE_INT32_ARRAY:
+ _zed_event_add_int32_array(eid, zsp, prefix, nvp);
+ break;
+ case DATA_TYPE_UINT32_ARRAY:
+ _zed_event_add_uint32_array(eid, zsp, prefix, nvp);
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ _zed_event_add_int64_array(eid, zsp, prefix, nvp);
+ break;
+ case DATA_TYPE_UINT64_ARRAY:
+ _zed_event_add_uint64_array(eid, zsp, prefix, nvp);
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ _zed_event_add_string_array(eid, zsp, prefix, nvp);
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ _zed_event_add_var(eid, zsp, prefix, name,
+ "%s", "_NOT_IMPLEMENTED_"); /* FIXME */
+ break;
+ default:
+ errno = EINVAL;
+ zed_log_msg(LOG_WARNING,
+ "Failed to convert nvpair \"%s\" for eid=%llu: "
+ "Unrecognized type=%u", name, eid, (unsigned int) type);
+ break;
+ }
+}
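+
+/*
+ * Editorial sketch (not part of the original source): for a DATA_TYPE_UINT64
+ * pair named "vdev_state" with the value VDEV_STATE_HEALTHY (7), the switch
+ * above would add both the numeric variable and its readable shadow,
+ * approximately:
+ *
+ *     ZEVENT_VDEV_STATE="7"
+ *     ZEVENT_VDEV_STATE_STR="ONLINE"
+ */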
+
+/*
+ * Restrict various environment variables to safe and sane values
+ * when constructing the environment for the child process, unless
+ * we're running with a custom $PATH (like under the ZFS test suite).
+ *
+ * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1.
+ */
+static void
+_zed_event_add_env_restrict(uint64_t eid, zed_strings_t *zsp,
+ const char *path)
+{
+ const char *env_restrict[][2] = {
+ { "IFS", " \t\n" },
+ { "PATH", _PATH_STDPATH },
+ { "ZDB", SBINDIR "/zdb" },
+ { "ZED", SBINDIR "/zed" },
+ { "ZFS", SBINDIR "/zfs" },
+ { "ZINJECT", SBINDIR "/zinject" },
+ { "ZPOOL", SBINDIR "/zpool" },
+ { "ZFS_ALIAS", ZFS_META_ALIAS },
+ { "ZFS_VERSION", ZFS_META_VERSION },
+ { "ZFS_RELEASE", ZFS_META_RELEASE },
+ { NULL, NULL }
+ };
+
+ /*
+ * If we have a custom $PATH, use the default ZFS binary locations
+ * instead of the hard-coded ones.
+ */
+ const char *env_path[][2] = {
+ { "IFS", " \t\n" },
+ { "PATH", NULL }, /* $PATH copied in later on */
+ { "ZDB", "zdb" },
+ { "ZED", "zed" },
+ { "ZFS", "zfs" },
+ { "ZINJECT", "zinject" },
+ { "ZPOOL", "zpool" },
+ { "ZFS_ALIAS", ZFS_META_ALIAS },
+ { "ZFS_VERSION", ZFS_META_VERSION },
+ { "ZFS_RELEASE", ZFS_META_RELEASE },
+ { NULL, NULL }
+ };
+ const char *(*pa)[2];
+
+ assert(zsp != NULL);
+
+ pa = path != NULL ? env_path : env_restrict;
+
+ for (; *(*pa); pa++) {
+ /* Use our custom $PATH if we have one */
+ if (path != NULL && strcmp((*pa)[0], "PATH") == 0)
+ (*pa)[1] = path;
+
+ _zed_event_add_var(eid, zsp, NULL, (*pa)[0], "%s", (*pa)[1]);
+ }
+}
+
+/*
+ * Preserve specified variables from the parent environment
+ * when constructing the environment for the child process.
+ *
+ * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1.
+ */
+static void
+_zed_event_add_env_preserve(uint64_t eid, zed_strings_t *zsp)
+{
+ const char *env_preserve[] = {
+ "TZ",
+ NULL
+ };
+ const char **keyp;
+ const char *val;
+
+ assert(zsp != NULL);
+
+ for (keyp = env_preserve; *keyp; keyp++) {
+ if ((val = getenv(*keyp)))
+ _zed_event_add_var(eid, zsp, NULL, *keyp, "%s", val);
+ }
+}
+
+/*
+ * Compute the "subclass" by removing the first 3 components of [class]
+ * (which will always be of the form "*.fs.zfs"). Return a pointer inside
+ * the string [class], or NULL if insufficient components exist.
+ */
+static const char *
+_zed_event_get_subclass(const char *class)
+{
+ const char *p;
+ int i;
+
+ if (!class)
+ return (NULL);
+
+ p = class;
+ for (i = 0; i < 3; i++) {
+ p = strchr(p, '.');
+ if (!p)
+ break;
+ p++;
+ }
+ return (p);
+}
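+
+/*
+ * Editorial sketch (not part of the original source): for example,
+ *
+ *     _zed_event_get_subclass("ereport.fs.zfs.checksum")  returns "checksum"
+ *     _zed_event_get_subclass("sysevent.fs.zfs")          returns NULL
+ */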
+
+/*
+ * Convert the zevent time from a 2-element array of 64b integers
+ * into a more convenient form:
+ * - TIME_SECS is the second component of the time.
+ * - TIME_NSECS is the nanosecond component of the time.
+ * - TIME_STRING is an almost-RFC3339-compliant string representation.
+ */
+static void
+_zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[])
+{
+ struct tm *stp;
+ char buf[32];
+
+ assert(zsp != NULL);
+ assert(etime != NULL);
+
+ _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_SECS",
+ "%lld", (long long int) etime[0]);
+ _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_NSECS",
+ "%lld", (long long int) etime[1]);
+
+ if (!(stp = localtime((const time_t *) &etime[0]))) {
+ zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s",
+ ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "localtime error");
+ } else if (!strftime(buf, sizeof (buf), "%Y-%m-%d %H:%M:%S%z", stp)) {
+ zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s",
+ ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "strftime error");
+ } else {
+ _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_STRING",
+ "%s", buf);
+ }
+}
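+
+/*
+ * Editorial sketch (not part of the original source): an etime of
+ * { 1609459200, 123456789 } would be exported roughly as
+ *
+ *     ZEVENT_TIME_SECS="1609459200"
+ *     ZEVENT_TIME_NSECS="123456789"
+ *     ZEVENT_TIME_STRING="2021-01-01 00:00:00+0000"
+ *
+ * with TIME_STRING rendered in the daemon's local time zone.
+ */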
+
+/*
+ * Service the next zevent, blocking until one is available.
+ */
+int
+zed_event_service(struct zed_conf *zcp)
+{
+ nvlist_t *nvl;
+ nvpair_t *nvp;
+ int n_dropped;
+ zed_strings_t *zsp;
+ uint64_t eid;
+ int64_t *etime;
+ uint_t nelem;
+ char *class;
+ const char *subclass;
+ int rv;
+
+ if (!zcp) {
+ errno = EINVAL;
+ zed_log_msg(LOG_ERR, "Failed to service zevent: %s",
+ strerror(errno));
+ return (EINVAL);
+ }
+ rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, ZEVENT_NONE,
+ zcp->zevent_fd);
+
+ if ((rv != 0) || !nvl)
+ return (errno);
+
+ if (n_dropped > 0) {
+ zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped);
+ /*
+ * FIXME: Increase max size of event nvlist in
+ * /sys/module/zfs/parameters/zfs_zevent_len_max ?
+ */
+ }
+ if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) {
+ zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid");
+ } else if (nvlist_lookup_int64_array(
+ nvl, "time", &etime, &nelem) != 0) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to lookup zevent time (eid=%llu)", eid);
+ } else if (nelem != 2) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to lookup zevent time (eid=%llu, nelem=%u)",
+ eid, nelem);
+ } else if (nvlist_lookup_string(nvl, "class", &class) != 0) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to lookup zevent class (eid=%llu)", eid);
+ } else {
+ /* let internal modules see this event first */
+ zfs_agent_post_event(class, NULL, nvl);
+
+ zsp = zed_strings_create();
+
+ nvp = NULL;
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)))
+ _zed_event_add_nvpair(eid, zsp, nvp);
+
+ _zed_event_add_env_restrict(eid, zsp, zcp->path);
+ _zed_event_add_env_preserve(eid, zsp);
+
+ _zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "PID",
+ "%d", (int)getpid());
+ _zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "ZEDLET_DIR",
+ "%s", zcp->zedlet_dir);
+ subclass = _zed_event_get_subclass(class);
+ _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "SUBCLASS",
+ "%s", (subclass ? subclass : class));
+
+ _zed_event_add_time_strings(eid, zsp, etime);
+
+ zed_exec_process(eid, class, subclass,
+ zcp->zedlet_dir, zcp->zedlets, zsp, zcp->zevent_fd);
+
+ zed_conf_write_state(zcp, eid, etime);
+
+ zed_strings_destroy(zsp);
+ }
+ nvlist_free(nvl);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed_event.h b/sys/contrib/openzfs/cmd/zed/zed_event.h
new file mode 100644
index 000000000000..c1455c3a0629
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_event.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef ZED_EVENT_H
+#define ZED_EVENT_H
+
+#include <stdint.h>
+
+int zed_event_init(struct zed_conf *zcp);
+
+void zed_event_fini(struct zed_conf *zcp);
+
+int zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid,
+ int64_t saved_etime[]);
+
+int zed_event_service(struct zed_conf *zcp);
+
+#endif /* !ZED_EVENT_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_exec.c b/sys/contrib/openzfs/cmd/zed/zed_exec.c
new file mode 100644
index 000000000000..08b7b5568362
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_exec.c
@@ -0,0 +1,232 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+#include "zed_exec.h"
+#include "zed_file.h"
+#include "zed_log.h"
+#include "zed_strings.h"
+
+#define ZEVENT_FILENO 3
+
+/*
+ * Create an environment string array for passing to execve() using the
+ * NAME=VALUE strings in container [zsp].
+ * Return a newly-allocated environment, or NULL on error.
+ */
+static char **
+_zed_exec_create_env(zed_strings_t *zsp)
+{
+ int num_ptrs;
+ int buflen;
+ char *buf;
+ char **pp;
+ char *p;
+ const char *q;
+ int i;
+ int len;
+
+ num_ptrs = zed_strings_count(zsp) + 1;
+ buflen = num_ptrs * sizeof (char *);
+ for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp))
+ buflen += strlen(q) + 1;
+
+ buf = calloc(1, buflen);
+ if (!buf)
+ return (NULL);
+
+ pp = (char **)buf;
+ p = buf + (num_ptrs * sizeof (char *));
+ i = 0;
+ for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp)) {
+ pp[i] = p;
+ len = strlen(q) + 1;
+ memcpy(p, q, len);
+ p += len;
+ i++;
+ }
+ pp[i] = NULL;
+ assert(buf + buflen == p);
+ return ((char **)buf);
+}
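+
+/*
+ * Editorial sketch (not part of the original source): the single allocation
+ * above is laid out as an argv-style pointer array followed by the packed
+ * NAME=VALUE strings, e.g. for two strings:
+ *
+ *     [ptr0][ptr1][NULL]["ZED_PID=123\0"]["ZEVENT_EID=42\0"]
+ *
+ * so the entire environment can later be released with a single free(),
+ * as done in zed_exec_process() below.
+ */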
+
+/*
+ * Fork a child process to handle event [eid]. The program [prog]
+ * in directory [dir] is executed with the environment [env].
+ *
+ * The file descriptor [zfd] is the zevent_fd used to track the
+ * current cursor location within the zevent nvlist.
+ */
+static void
+_zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog,
+ char *env[], int zfd)
+{
+ char path[PATH_MAX];
+ int n;
+ pid_t pid;
+ int fd;
+ pid_t wpid;
+ int status;
+
+ assert(dir != NULL);
+ assert(prog != NULL);
+ assert(env != NULL);
+ assert(zfd >= 0);
+
+ n = snprintf(path, sizeof (path), "%s/%s", dir, prog);
+ if ((n < 0) || (n >= sizeof (path))) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to fork \"%s\" for eid=%llu: %s",
+ prog, eid, strerror(ENAMETOOLONG));
+ return;
+ }
+ pid = fork();
+ if (pid < 0) {
+ zed_log_msg(LOG_WARNING,
+ "Failed to fork \"%s\" for eid=%llu: %s",
+ prog, eid, strerror(errno));
+ return;
+ } else if (pid == 0) {
+ (void) umask(022);
+ if ((fd = open("/dev/null", O_RDWR)) != -1) {
+ (void) dup2(fd, STDIN_FILENO);
+ (void) dup2(fd, STDOUT_FILENO);
+ (void) dup2(fd, STDERR_FILENO);
+ }
+ (void) dup2(zfd, ZEVENT_FILENO);
+ zed_file_close_from(ZEVENT_FILENO + 1);
+ execle(path, prog, NULL, env);
+ _exit(127);
+ }
+
+ /* parent process */
+
+ zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d",
+ prog, eid, pid);
+
+ /* FIXME: Timeout rogue child processes with sigalarm? */
+
+ /*
+ * Wait for child process using WNOHANG to limit
+ * the time spent waiting to 10 seconds (10,000ms).
+ */
+ for (n = 0; n < 1000; n++) {
+ wpid = waitpid(pid, &status, WNOHANG);
+ if (wpid == (pid_t)-1) {
+ if (errno == EINTR)
+ continue;
+ zed_log_msg(LOG_WARNING,
+ "Failed to wait for \"%s\" eid=%llu pid=%d",
+ prog, eid, pid);
+ break;
+ } else if (wpid == 0) {
+ struct timespec t;
+
+ /* child still running */
+ t.tv_sec = 0;
+ t.tv_nsec = 10000000; /* 10ms */
+ (void) nanosleep(&t, NULL);
+ continue;
+ }
+
+ if (WIFEXITED(status)) {
+ zed_log_msg(LOG_INFO,
+ "Finished \"%s\" eid=%llu pid=%d exit=%d",
+ prog, eid, pid, WEXITSTATUS(status));
+ } else if (WIFSIGNALED(status)) {
+ zed_log_msg(LOG_INFO,
+ "Finished \"%s\" eid=%llu pid=%d sig=%d/%s",
+ prog, eid, pid, WTERMSIG(status),
+ strsignal(WTERMSIG(status)));
+ } else {
+ zed_log_msg(LOG_INFO,
+ "Finished \"%s\" eid=%llu pid=%d status=0x%X",
+ prog, eid, pid, (unsigned int) status);
+ }
+ break;
+ }
+
+ /*
+ * kill child process after 10 seconds
+ */
+ if (wpid == 0) {
+ zed_log_msg(LOG_WARNING, "Killing hung \"%s\" pid=%d",
+ prog, pid);
+ (void) kill(pid, SIGKILL);
+ }
+}
+
+/*
+ * Process the event [eid] by synchronously invoking all zedlets with a
+ * matching class prefix.
+ *
+ * Each executable in [zedlets] from the directory [dir] is matched against
+ * the event's [class], [subclass], and the "all" class (which matches
+ * all events). Every zedlet with a matching class prefix is invoked.
+ * The NAME=VALUE strings in [envs] will be passed to the zedlet as
+ * environment variables.
+ *
+ * The file descriptor [zfd] is the zevent_fd used to track the
+ * current cursor location within the zevent nvlist.
+ *
+ * Return 0 on success, -1 on error.
+ */
+int
+zed_exec_process(uint64_t eid, const char *class, const char *subclass,
+ const char *dir, zed_strings_t *zedlets, zed_strings_t *envs, int zfd)
+{
+ const char *class_strings[4];
+ const char *allclass = "all";
+ const char **csp;
+ const char *z;
+ char **e;
+ int n;
+
+ if (!dir || !zedlets || !envs || zfd < 0)
+ return (-1);
+
+ csp = class_strings;
+
+ if (class)
+ *csp++ = class;
+
+ if (subclass)
+ *csp++ = subclass;
+
+ if (allclass)
+ *csp++ = allclass;
+
+ *csp = NULL;
+
+ e = _zed_exec_create_env(envs);
+
+ for (z = zed_strings_first(zedlets); z; z = zed_strings_next(zedlets)) {
+ for (csp = class_strings; *csp; csp++) {
+ n = strlen(*csp);
+ if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n]))
+ _zed_exec_fork_child(eid, dir, z, e, zfd);
+ }
+ }
+ free(e);
+ return (0);
+}
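+
+/*
+ * Editorial sketch (not part of the original source): with the prefix match
+ * above, an event whose subclass is "scrub_finish" would run zedlets such as
+ *
+ *     scrub_finish-notify.sh   matches (the next character '-' is not alpha)
+ *     all-syslog.sh            matches the catch-all "all" class
+ *
+ * while a hypothetical zedlet named "scrub_finished.sh" would not match,
+ * since an alphabetic character immediately follows the "scrub_finish"
+ * prefix.
+ */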
diff --git a/sys/contrib/openzfs/cmd/zed/zed_exec.h b/sys/contrib/openzfs/cmd/zed/zed_exec.h
new file mode 100644
index 000000000000..4153e5519a46
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_exec.h
@@ -0,0 +1,25 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef ZED_EXEC_H
+#define ZED_EXEC_H
+
+#include <stdint.h>
+#include "zed_strings.h"
+
+int zed_exec_process(uint64_t eid, const char *class, const char *subclass,
+ const char *dir, zed_strings_t *zedlets, zed_strings_t *envs,
+ int zevent_fd);
+
+#endif /* !ZED_EXEC_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_file.c b/sys/contrib/openzfs/cmd/zed/zed_file.c
new file mode 100644
index 000000000000..c3cf3d421c6f
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_file.c
@@ -0,0 +1,217 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "zed_file.h"
+#include "zed_log.h"
+
+/*
+ * Read up to [n] bytes from [fd] into [buf].
+ * Return the number of bytes read, 0 on EOF, or -1 on error.
+ */
+ssize_t
+zed_file_read_n(int fd, void *buf, size_t n)
+{
+ unsigned char *p;
+ size_t n_left;
+ ssize_t n_read;
+
+ p = buf;
+ n_left = n;
+ while (n_left > 0) {
+ if ((n_read = read(fd, p, n_left)) < 0) {
+ if (errno == EINTR)
+ continue;
+ else
+ return (-1);
+
+ } else if (n_read == 0) {
+ break;
+ }
+ n_left -= n_read;
+ p += n_read;
+ }
+ return (n - n_left);
+}
+
+/*
+ * Write [n] bytes from [buf] out to [fd].
+ * Return the number of bytes written, or -1 on error.
+ */
+ssize_t
+zed_file_write_n(int fd, void *buf, size_t n)
+{
+ const unsigned char *p;
+ size_t n_left;
+ ssize_t n_written;
+
+ p = buf;
+ n_left = n;
+ while (n_left > 0) {
+ if ((n_written = write(fd, p, n_left)) < 0) {
+ if (errno == EINTR)
+ continue;
+ else
+ return (-1);
+
+ }
+ n_left -= n_written;
+ p += n_written;
+ }
+ return (n);
+}
+
+/*
+ * Set an exclusive advisory lock on the open file descriptor [fd].
+ * Return 0 on success, 1 if a conflicting lock is held by another process,
+ * or -1 on error (with errno set).
+ */
+int
+zed_file_lock(int fd)
+{
+ struct flock lock;
+
+ if (fd < 0) {
+ errno = EBADF;
+ return (-1);
+ }
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+
+ if (fcntl(fd, F_SETLK, &lock) < 0) {
+ if ((errno == EACCES) || (errno == EAGAIN))
+ return (1);
+
+ return (-1);
+ }
+ return (0);
+}
+
+/*
+ * Release an advisory lock held on the open file descriptor [fd].
+ * Return 0 on success, or -1 on error (with errno set).
+ */
+int
+zed_file_unlock(int fd)
+{
+ struct flock lock;
+
+ if (fd < 0) {
+ errno = EBADF;
+ return (-1);
+ }
+ lock.l_type = F_UNLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+
+ if (fcntl(fd, F_SETLK, &lock) < 0)
+ return (-1);
+
+ return (0);
+}
+
+/*
+ * Test whether an exclusive advisory lock could be obtained for the open
+ * file descriptor [fd].
+ * Return 0 if the file is not locked, >0 for the PID of another process
+ * holding a conflicting lock, or -1 on error (with errno set).
+ */
+pid_t
+zed_file_is_locked(int fd)
+{
+ struct flock lock;
+
+ if (fd < 0) {
+ errno = EBADF;
+ return (-1);
+ }
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+
+ if (fcntl(fd, F_GETLK, &lock) < 0)
+ return (-1);
+
+ if (lock.l_type == F_UNLCK)
+ return (0);
+
+ return (lock.l_pid);
+}
+
+/*
+ * Close all open file descriptors greater than or equal to [lowfd].
+ * Any errors encountered while closing file descriptors are ignored.
+ */
+void
+zed_file_close_from(int lowfd)
+{
+ const int maxfd_def = 256;
+ int errno_bak;
+ struct rlimit rl;
+ int maxfd;
+ int fd;
+
+ errno_bak = errno;
+
+ if (getrlimit(RLIMIT_NOFILE, &rl) < 0) {
+ maxfd = maxfd_def;
+ } else if (rl.rlim_max == RLIM_INFINITY) {
+ maxfd = maxfd_def;
+ } else {
+ maxfd = rl.rlim_max;
+ }
+ for (fd = lowfd; fd < maxfd; fd++)
+ (void) close(fd);
+
+ errno = errno_bak;
+}
+
+/*
+ * Set the CLOEXEC flag on file descriptor [fd] so it will be automatically
+ * closed upon successful execution of one of the exec functions.
+ * Return 0 on success, or -1 on error.
+ *
+ * FIXME: No longer needed?
+ */
+int
+zed_file_close_on_exec(int fd)
+{
+ int flags;
+
+ if (fd < 0) {
+ errno = EBADF;
+ return (-1);
+ }
+ flags = fcntl(fd, F_GETFD);
+ if (flags == -1)
+ return (-1);
+
+ flags |= FD_CLOEXEC;
+
+ if (fcntl(fd, F_SETFD, flags) == -1)
+ return (-1);
+
+ return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed_file.h b/sys/contrib/openzfs/cmd/zed/zed_file.h
new file mode 100644
index 000000000000..05f360d20efd
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_file.h
@@ -0,0 +1,35 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef ZED_FILE_H
+#define ZED_FILE_H
+
+#include <sys/types.h>
+#include <unistd.h>
+
+ssize_t zed_file_read_n(int fd, void *buf, size_t n);
+
+ssize_t zed_file_write_n(int fd, void *buf, size_t n);
+
+int zed_file_lock(int fd);
+
+int zed_file_unlock(int fd);
+
+pid_t zed_file_is_locked(int fd);
+
+void zed_file_close_from(int fd);
+
+int zed_file_close_on_exec(int fd);
+
+#endif /* !ZED_FILE_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_log.c b/sys/contrib/openzfs/cmd/zed/zed_log.c
new file mode 100644
index 000000000000..5a3f2dbdb832
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_log.c
@@ -0,0 +1,256 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <syslog.h>
+#include <unistd.h>
+#include "zed_log.h"
+
+#define ZED_LOG_MAX_LOG_LEN 1024
+
+static struct {
+ unsigned do_stderr:1;
+ unsigned do_syslog:1;
+ const char *identity;
+ int priority;
+ int pipe_fd[2];
+} _ctx;
+
+/*
+ * Initialize the logging subsystem.
+ */
+void
+zed_log_init(const char *identity)
+{
+ if (identity) {
+ const char *p = strrchr(identity, '/');
+ _ctx.identity = (p != NULL) ? p + 1 : identity;
+ } else {
+ _ctx.identity = NULL;
+ }
+ _ctx.pipe_fd[0] = -1;
+ _ctx.pipe_fd[1] = -1;
+}
+
+/*
+ * Shutdown the logging subsystem.
+ */
+void
+zed_log_fini(void)
+{
+ zed_log_stderr_close();
+ zed_log_syslog_close();
+}
+
+/*
+ * Create pipe for communicating daemonization status between the parent and
+ * child processes across the double-fork().
+ */
+void
+zed_log_pipe_open(void)
+{
+ if ((_ctx.pipe_fd[0] != -1) || (_ctx.pipe_fd[1] != -1))
+ zed_log_die("Invalid use of zed_log_pipe_open in PID %d",
+ (int)getpid());
+
+ if (pipe(_ctx.pipe_fd) < 0)
+ zed_log_die("Failed to create daemonize pipe in PID %d: %s",
+ (int)getpid(), strerror(errno));
+}
+
+/*
+ * Close the read-half of the daemonize pipe.
+ *
+ * This should be called by the child after fork()ing from the parent since
+ * the child will never read from this pipe.
+ */
+void
+zed_log_pipe_close_reads(void)
+{
+ if (_ctx.pipe_fd[0] < 0)
+ zed_log_die(
+ "Invalid use of zed_log_pipe_close_reads in PID %d",
+ (int)getpid());
+
+ if (close(_ctx.pipe_fd[0]) < 0)
+ zed_log_die(
+ "Failed to close reads on daemonize pipe in PID %d: %s",
+ (int)getpid(), strerror(errno));
+
+ _ctx.pipe_fd[0] = -1;
+}
+
+/*
+ * Close the write-half of the daemonize pipe.
+ *
+ * This should be called by the parent after fork()ing its child since the
+ * parent will never write to this pipe.
+ *
+ * This should also be called by the child once initialization is complete
+ * in order to signal the parent that it can safely exit.
+ */
+void
+zed_log_pipe_close_writes(void)
+{
+ if (_ctx.pipe_fd[1] < 0)
+ zed_log_die(
+ "Invalid use of zed_log_pipe_close_writes in PID %d",
+ (int)getpid());
+
+ if (close(_ctx.pipe_fd[1]) < 0)
+ zed_log_die(
+ "Failed to close writes on daemonize pipe in PID %d: %s",
+ (int)getpid(), strerror(errno));
+
+ _ctx.pipe_fd[1] = -1;
+}
+
+/*
+ * Block on reading from the daemonize pipe until signaled by the child
+ * (via zed_log_pipe_close_writes()) that initialization is complete.
+ *
+ * This should only be called by the parent while waiting to exit after
+ * fork()ing the child.
+ */
+void
+zed_log_pipe_wait(void)
+{
+ ssize_t n;
+ char c;
+
+ if (_ctx.pipe_fd[0] < 0)
+ zed_log_die("Invalid use of zed_log_pipe_wait in PID %d",
+ (int)getpid());
+
+ for (;;) {
+ n = read(_ctx.pipe_fd[0], &c, sizeof (c));
+ if (n < 0) {
+ if (errno == EINTR)
+ continue;
+ zed_log_die(
+ "Failed to read from daemonize pipe in PID %d: %s",
+ (int)getpid(), strerror(errno));
+ }
+ if (n == 0) {
+ break;
+ }
+ }
+}
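+
+/*
+ * Editorial sketch (not part of the original source), assuming the usual
+ * daemonization sequence in the caller:
+ *
+ *     zed_log_pipe_open();
+ *     if (fork() > 0) {
+ *         zed_log_pipe_close_writes();    (parent)
+ *         zed_log_pipe_wait();            (returns on EOF from the child)
+ *         zed_log_pipe_close_reads();
+ *         _exit(EXIT_SUCCESS);
+ *     }
+ *     zed_log_pipe_close_reads();         (child)
+ *     ... finish initialization ...
+ *     zed_log_pipe_close_writes();        (signals the parent to exit)
+ */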
+
+/*
+ * Start logging messages at the syslog [priority] level or higher to stderr.
+ * Refer to syslog(3) for valid priority values.
+ */
+void
+zed_log_stderr_open(int priority)
+{
+ _ctx.do_stderr = 1;
+ _ctx.priority = priority;
+}
+
+/*
+ * Stop logging messages to stderr.
+ */
+void
+zed_log_stderr_close(void)
+{
+ if (_ctx.do_stderr)
+ _ctx.do_stderr = 0;
+}
+
+/*
+ * Start logging messages to syslog.
+ * Refer to syslog(3) for valid option/facility values.
+ */
+void
+zed_log_syslog_open(int facility)
+{
+ _ctx.do_syslog = 1;
+ openlog(_ctx.identity, LOG_NDELAY | LOG_PID, facility);
+}
+
+/*
+ * Stop logging messages to syslog.
+ */
+void
+zed_log_syslog_close(void)
+{
+ if (_ctx.do_syslog) {
+ _ctx.do_syslog = 0;
+ closelog();
+ }
+}
+
+/*
+ * Auxiliary function to log a message to syslog and/or stderr.
+ */
+static void
+_zed_log_aux(int priority, const char *fmt, va_list vargs)
+{
+ char buf[ZED_LOG_MAX_LOG_LEN];
+ int n;
+
+ if (!fmt)
+ return;
+
+ n = vsnprintf(buf, sizeof (buf), fmt, vargs);
+ if ((n < 0) || (n >= sizeof (buf))) {
+ buf[sizeof (buf) - 2] = '+';
+ buf[sizeof (buf) - 1] = '\0';
+ }
+
+ if (_ctx.do_syslog)
+ syslog(priority, "%s", buf);
+
+ if (_ctx.do_stderr && (priority <= _ctx.priority))
+ fprintf(stderr, "%s\n", buf);
+}
+
+/*
+ * Log a message at the given [priority] level specified by the printf-style
+ * format string [fmt].
+ */
+void
+zed_log_msg(int priority, const char *fmt, ...)
+{
+ va_list vargs;
+
+ if (fmt) {
+ va_start(vargs, fmt);
+ _zed_log_aux(priority, fmt, vargs);
+ va_end(vargs);
+ }
+}
+
+/*
+ * Log a fatal error message specified by the printf-style format string [fmt].
+ */
+void
+zed_log_die(const char *fmt, ...)
+{
+ va_list vargs;
+
+ if (fmt) {
+ va_start(vargs, fmt);
+ _zed_log_aux(LOG_ERR, fmt, vargs);
+ va_end(vargs);
+ }
+ exit(EXIT_FAILURE);
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed_log.h b/sys/contrib/openzfs/cmd/zed/zed_log.h
new file mode 100644
index 000000000000..a03a4f53967c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_log.h
@@ -0,0 +1,44 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef ZED_LOG_H
+#define ZED_LOG_H
+
+#include <syslog.h>
+
+void zed_log_init(const char *identity);
+
+void zed_log_fini(void);
+
+void zed_log_pipe_open(void);
+
+void zed_log_pipe_close_reads(void);
+
+void zed_log_pipe_close_writes(void);
+
+void zed_log_pipe_wait(void);
+
+void zed_log_stderr_open(int priority);
+
+void zed_log_stderr_close(void);
+
+void zed_log_syslog_open(int facility);
+
+void zed_log_syslog_close(void);
+
+void zed_log_msg(int priority, const char *fmt, ...);
+
+void zed_log_die(const char *fmt, ...);
+
+#endif /* !ZED_LOG_H */
diff --git a/sys/contrib/openzfs/cmd/zed/zed_strings.c b/sys/contrib/openzfs/cmd/zed/zed_strings.c
new file mode 100644
index 000000000000..6b1c669d71f4
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_strings.c
@@ -0,0 +1,247 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/avl.h>
+#include <sys/sysmacros.h>
+#include "zed_strings.h"
+
+struct zed_strings {
+ avl_tree_t tree;
+ avl_node_t *iteratorp;
+};
+
+struct zed_strings_node {
+ avl_node_t node;
+ char *key;
+ char *val;
+};
+
+typedef struct zed_strings_node zed_strings_node_t;
+
+/*
+ * Compare zed_strings_node_t nodes [x1] and [x2].
+ * As required for the AVL tree, return -1 for <, 0 for ==, and +1 for >.
+ */
+static int
+_zed_strings_node_compare(const void *x1, const void *x2)
+{
+ const char *s1;
+ const char *s2;
+ int rv;
+
+ assert(x1 != NULL);
+ assert(x2 != NULL);
+
+ s1 = ((const zed_strings_node_t *) x1)->key;
+ assert(s1 != NULL);
+ s2 = ((const zed_strings_node_t *) x2)->key;
+ assert(s2 != NULL);
+ rv = strcmp(s1, s2);
+
+ if (rv < 0)
+ return (-1);
+
+ if (rv > 0)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Return a new string container, or NULL on error.
+ */
+zed_strings_t *
+zed_strings_create(void)
+{
+ zed_strings_t *zsp;
+
+ zsp = calloc(1, sizeof (*zsp));
+ if (!zsp)
+ return (NULL);
+
+ avl_create(&zsp->tree, _zed_strings_node_compare,
+ sizeof (zed_strings_node_t), offsetof(zed_strings_node_t, node));
+
+ zsp->iteratorp = NULL;
+ return (zsp);
+}
+
+/*
+ * Destroy the string node [np].
+ */
+static void
+_zed_strings_node_destroy(zed_strings_node_t *np)
+{
+ if (!np)
+ return;
+
+ if (np->key) {
+ if (np->key != np->val)
+ free(np->key);
+ np->key = NULL;
+ }
+ if (np->val) {
+ free(np->val);
+ np->val = NULL;
+ }
+ free(np);
+}
+
+/*
+ * Return a new string node for storing the string [val], or NULL on error.
+ * If [key] is specified, it will be used to index the node; otherwise,
+ * the string [val] will be used.
+ */
+static zed_strings_node_t *
+_zed_strings_node_create(const char *key, const char *val)
+{
+ zed_strings_node_t *np;
+
+ assert(val != NULL);
+
+ np = calloc(1, sizeof (*np));
+ if (!np)
+ return (NULL);
+
+ np->val = strdup(val);
+ if (!np->val)
+ goto nomem;
+
+ if (key) {
+ np->key = strdup(key);
+ if (!np->key)
+ goto nomem;
+ } else {
+ np->key = np->val;
+ }
+ return (np);
+
+nomem:
+ _zed_strings_node_destroy(np);
+ return (NULL);
+}
+
+/*
+ * Destroy the string container [zsp] and all nodes within.
+ */
+void
+zed_strings_destroy(zed_strings_t *zsp)
+{
+ void *cookie;
+ zed_strings_node_t *np;
+
+ if (!zsp)
+ return;
+
+ cookie = NULL;
+ while ((np = avl_destroy_nodes(&zsp->tree, &cookie)))
+ _zed_strings_node_destroy(np);
+
+ avl_destroy(&zsp->tree);
+ free(zsp);
+}
+
+/*
+ * Add a copy of the string [s] indexed by [key] to the container [zsp].
+ * If [key] already exists within the container [zsp], it will be replaced
+ * with the new string [s].
+ * If [key] is NULL, the string [s] will be used as the key.
+ * Return 0 on success, or -1 on error.
+ */
+int
+zed_strings_add(zed_strings_t *zsp, const char *key, const char *s)
+{
+ zed_strings_node_t *newp, *oldp;
+
+ if (!zsp || !s) {
+ errno = EINVAL;
+ return (-1);
+ }
+ if (key == s)
+ key = NULL;
+
+ newp = _zed_strings_node_create(key, s);
+ if (!newp)
+ return (-1);
+
+ oldp = avl_find(&zsp->tree, newp, NULL);
+ if (oldp) {
+ avl_remove(&zsp->tree, oldp);
+ _zed_strings_node_destroy(oldp);
+ }
+ avl_add(&zsp->tree, newp);
+ return (0);
+}
+
+/*
+ * Return the first string in container [zsp].
+ * Return NULL if there are no strings, or on error.
+ * This can be called multiple times to re-traverse [zsp].
+ * XXX: Not thread-safe.
+ */
+const char *
+zed_strings_first(zed_strings_t *zsp)
+{
+ if (!zsp) {
+ errno = EINVAL;
+ return (NULL);
+ }
+ zsp->iteratorp = avl_first(&zsp->tree);
+ if (!zsp->iteratorp)
+ return (NULL);
+
+ return (((zed_strings_node_t *)zsp->iteratorp)->val);
+}
+
+/*
+ * Return the next string in container [zsp].
+ * Return NULL after the last string, or on error.
+ * This must be called after zed_strings_first().
+ * XXX: Not thread-safe.
+ */
+const char *
+zed_strings_next(zed_strings_t *zsp)
+{
+ if (!zsp) {
+ errno = EINVAL;
+ return (NULL);
+ }
+ if (!zsp->iteratorp)
+ return (NULL);
+
+ zsp->iteratorp = AVL_NEXT(&zsp->tree, zsp->iteratorp);
+ if (!zsp->iteratorp)
+ return (NULL);
+
+ return (((zed_strings_node_t *)zsp->iteratorp)->val);
+}
+
+/*
+ * Return the number of strings in container [zsp], or -1 on error.
+ */
+int
+zed_strings_count(zed_strings_t *zsp)
+{
+ if (!zsp) {
+ errno = EINVAL;
+ return (-1);
+ }
+ return (avl_numnodes(&zsp->tree));
+}
diff --git a/sys/contrib/openzfs/cmd/zed/zed_strings.h b/sys/contrib/openzfs/cmd/zed/zed_strings.h
new file mode 100644
index 000000000000..37a84cad7ffc
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zed/zed_strings.h
@@ -0,0 +1,27 @@
+/*
+ * This file is part of the ZFS Event Daemon (ZED)
+ * for ZFS on Linux (ZoL) <http://zfsonlinux.org/>.
+ * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+ * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+ * Refer to the ZoL git commit log for authoritative copyright attribution.
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ */
+
+#ifndef ZED_STRINGS_H
+#define ZED_STRINGS_H
+
+typedef struct zed_strings zed_strings_t;
+
+zed_strings_t *zed_strings_create(void);
+void zed_strings_destroy(zed_strings_t *zsp);
+int zed_strings_add(zed_strings_t *zsp, const char *key, const char *s);
+const char *zed_strings_first(zed_strings_t *zsp);
+const char *zed_strings_next(zed_strings_t *zsp);
+int zed_strings_count(zed_strings_t *zsp);
+
+#endif /* !ZED_STRINGS_H */
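The zed_strings API declared above is a small keyed string container backed by an AVL tree; the key is optional and defaults to the stored value itself. A minimal usage sketch (illustrative only, using just the functions declared in this header; keys and values here are made up):

	#include <stdio.h>
	#include "zed_strings.h"

	static void
	zed_strings_example(void)
	{
		zed_strings_t *zsp = zed_strings_create();
		const char *s;

		if (zsp == NULL)
			return;

		/* Index a value by an explicit key... */
		(void) zed_strings_add(zsp, "alpha", "first value");
		/* ...or pass a NULL key to index the value by itself. */
		(void) zed_strings_add(zsp, NULL, "second value");

		/* Walk the container in sorted key order. */
		for (s = zed_strings_first(zsp); s != NULL;
		    s = zed_strings_next(zsp))
			(void) printf("%s\n", s);

		(void) printf("count=%d\n", zed_strings_count(zsp));
		zed_strings_destroy(zsp);
	}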
diff --git a/sys/contrib/openzfs/cmd/zfs/.gitignore b/sys/contrib/openzfs/cmd/zfs/.gitignore
new file mode 100644
index 000000000000..0fd9cc63af2a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/.gitignore
@@ -0,0 +1 @@
+/zfs
diff --git a/sys/contrib/openzfs/cmd/zfs/Makefile.am b/sys/contrib/openzfs/cmd/zfs/Makefile.am
new file mode 100644
index 000000000000..dec5920381d5
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/Makefile.am
@@ -0,0 +1,23 @@
+include $(top_srcdir)/config/Rules.am
+
+sbin_PROGRAMS = zfs
+
+zfs_SOURCES = \
+ zfs_iter.c \
+ zfs_iter.h \
+ zfs_main.c \
+ zfs_util.h \
+ zfs_project.c \
+ zfs_projectutil.h
+
+zfs_LDADD = \
+ $(abs_top_builddir)/lib/libzfs/libzfs.la \
+ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+ $(abs_top_builddir)/lib/libnvpair/libnvpair.la \
+ $(abs_top_builddir)/lib/libuutil/libuutil.la
+
+zfs_LDADD += $(LTLIBINTL)
+
+if BUILD_FREEBSD
+zfs_LDADD += -lgeom -ljail
+endif
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_iter.c b/sys/contrib/openzfs/cmd/zfs/zfs_iter.c
new file mode 100644
index 000000000000..f2359508c16d
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_iter.c
@@ -0,0 +1,512 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#include <libintl.h>
+#include <libuutil.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+#include <libzfs.h>
+
+#include "zfs_util.h"
+#include "zfs_iter.h"
+
+/*
+ * This is a private interface used to gather up all the datasets specified on
+ * the command line so that we can iterate over them in order.
+ *
+ * First, we iterate over all filesystems, gathering them together into an
+ * AVL tree. We report errors for any explicitly specified datasets
+ * that we couldn't open.
+ *
+ * When finished, we have an AVL tree of ZFS handles. We go through and execute
+ * the provided callback for each one, passing whatever data the user supplied.
+ */
+
+typedef struct zfs_node {
+ zfs_handle_t *zn_handle;
+ uu_avl_node_t zn_avlnode;
+} zfs_node_t;
+
+typedef struct callback_data {
+ uu_avl_t *cb_avl;
+ int cb_flags;
+ zfs_type_t cb_types;
+ zfs_sort_column_t *cb_sortcol;
+ zprop_list_t **cb_proplist;
+ int cb_depth_limit;
+ int cb_depth;
+ uint8_t cb_props_table[ZFS_NUM_PROPS];
+} callback_data_t;
+
+uu_avl_pool_t *avl_pool;
+
+/*
+ * Include snaps if they were requested or if this is a zfs list where types
+ * were not specified and the "listsnapshots" property is set on this pool.
+ */
+static boolean_t
+zfs_include_snapshots(zfs_handle_t *zhp, callback_data_t *cb)
+{
+ zpool_handle_t *zph;
+
+ if ((cb->cb_flags & ZFS_ITER_PROP_LISTSNAPS) == 0)
+ return (cb->cb_types & ZFS_TYPE_SNAPSHOT);
+
+ zph = zfs_get_pool_handle(zhp);
+ return (zpool_get_prop_int(zph, ZPOOL_PROP_LISTSNAPS, NULL));
+}
+
+/*
+ * Called for each dataset. If the object is of an appropriate type,
+ * add it to the avl tree and recurse over any children as necessary.
+ */
+static int
+zfs_callback(zfs_handle_t *zhp, void *data)
+{
+ callback_data_t *cb = data;
+ boolean_t should_close = B_TRUE;
+ boolean_t include_snaps = zfs_include_snapshots(zhp, cb);
+ boolean_t include_bmarks = (cb->cb_types & ZFS_TYPE_BOOKMARK);
+
+ if ((zfs_get_type(zhp) & cb->cb_types) ||
+ ((zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) && include_snaps)) {
+ uu_avl_index_t idx;
+ zfs_node_t *node = safe_malloc(sizeof (zfs_node_t));
+
+ node->zn_handle = zhp;
+ uu_avl_node_init(node, &node->zn_avlnode, avl_pool);
+ if (uu_avl_find(cb->cb_avl, node, cb->cb_sortcol,
+ &idx) == NULL) {
+ if (cb->cb_proplist) {
+ if ((*cb->cb_proplist) &&
+ !(*cb->cb_proplist)->pl_all)
+ zfs_prune_proplist(zhp,
+ cb->cb_props_table);
+
+ if (zfs_expand_proplist(zhp, cb->cb_proplist,
+ (cb->cb_flags & ZFS_ITER_RECVD_PROPS),
+ (cb->cb_flags & ZFS_ITER_LITERAL_PROPS))
+ != 0) {
+ free(node);
+ return (-1);
+ }
+ }
+ uu_avl_insert(cb->cb_avl, node, idx);
+ should_close = B_FALSE;
+ } else {
+ free(node);
+ }
+ }
+
+ /*
+ * Recurse if necessary.
+ */
+ if (cb->cb_flags & ZFS_ITER_RECURSE &&
+ ((cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 ||
+ cb->cb_depth < cb->cb_depth_limit)) {
+ cb->cb_depth++;
+
+ /*
+ * If we are not looking for filesystems, we don't need to
+ * recurse into filesystems when we are at our depth limit.
+ */
+ if ((cb->cb_depth < cb->cb_depth_limit ||
+ (cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 ||
+ (cb->cb_types &
+ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME))) &&
+ zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
+ (void) zfs_iter_filesystems(zhp, zfs_callback, data);
+ }
+
+ if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT |
+ ZFS_TYPE_BOOKMARK)) == 0) && include_snaps) {
+ (void) zfs_iter_snapshots(zhp,
+ (cb->cb_flags & ZFS_ITER_SIMPLE) != 0,
+ zfs_callback, data, 0, 0);
+ }
+
+ if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT |
+ ZFS_TYPE_BOOKMARK)) == 0) && include_bmarks) {
+ (void) zfs_iter_bookmarks(zhp, zfs_callback, data);
+ }
+
+ cb->cb_depth--;
+ }
+
+ if (should_close)
+ zfs_close(zhp);
+
+ return (0);
+}
+
+int
+zfs_add_sort_column(zfs_sort_column_t **sc, const char *name,
+ boolean_t reverse)
+{
+ zfs_sort_column_t *col;
+ zfs_prop_t prop;
+
+ if ((prop = zfs_name_to_prop(name)) == ZPROP_INVAL &&
+ !zfs_prop_user(name))
+ return (-1);
+
+ col = safe_malloc(sizeof (zfs_sort_column_t));
+
+ col->sc_prop = prop;
+ col->sc_reverse = reverse;
+ if (prop == ZPROP_INVAL) {
+ col->sc_user_prop = safe_malloc(strlen(name) + 1);
+ (void) strcpy(col->sc_user_prop, name);
+ }
+
+ if (*sc == NULL) {
+ col->sc_last = col;
+ *sc = col;
+ } else {
+ (*sc)->sc_last->sc_next = col;
+ (*sc)->sc_last = col;
+ }
+
+ return (0);
+}
+
+void
+zfs_free_sort_columns(zfs_sort_column_t *sc)
+{
+ zfs_sort_column_t *col;
+
+ while (sc != NULL) {
+ col = sc->sc_next;
+ free(sc->sc_user_prop);
+ free(sc);
+ sc = col;
+ }
+}
+
+int
+zfs_sort_only_by_name(const zfs_sort_column_t *sc)
+{
+ return (sc != NULL && sc->sc_next == NULL &&
+ sc->sc_prop == ZFS_PROP_NAME);
+}
+
+/* ARGSUSED */
+static int
+zfs_compare(const void *larg, const void *rarg, void *unused)
+{
+ zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle;
+ zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle;
+ const char *lname = zfs_get_name(l);
+ const char *rname = zfs_get_name(r);
+ char *lat, *rat;
+ uint64_t lcreate, rcreate;
+ int ret;
+
+ lat = (char *)strchr(lname, '@');
+ rat = (char *)strchr(rname, '@');
+
+ if (lat != NULL)
+ *lat = '\0';
+ if (rat != NULL)
+ *rat = '\0';
+
+ ret = strcmp(lname, rname);
+ if (ret == 0 && (lat != NULL || rat != NULL)) {
+ /*
+ * If we're comparing a dataset to one of its snapshots, we
+ * always make the full dataset first.
+ */
+ if (lat == NULL) {
+ ret = -1;
+ } else if (rat == NULL) {
+ ret = 1;
+ } else {
+ /*
+ * If we have two snapshots from the same dataset, then
+ * we want to sort them according to creation time. We
+ * use the hidden CREATETXG property to get an absolute
+ * ordering of snapshots.
+ */
+ lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG);
+ rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG);
+
+ /*
+ * Both lcreate and rcreate being 0 means we don't have
+ * properties and we should compare full name.
+ */
+ if (lcreate == 0 && rcreate == 0)
+ ret = strcmp(lat + 1, rat + 1);
+ else if (lcreate < rcreate)
+ ret = -1;
+ else if (lcreate > rcreate)
+ ret = 1;
+ }
+ }
+
+ if (lat != NULL)
+ *lat = '@';
+ if (rat != NULL)
+ *rat = '@';
+
+ return (ret);
+}
+
+/*
+ * Sort datasets by specified columns.
+ *
+ * o Numeric types sort in ascending order.
+ * o String types sort in alphabetical order.
+ * o Rows for which a property's type is inappropriate sort to the
+ *   literal bottom, regardless of the specified ordering.
+ *
+ * If no sort columns are specified, or two datasets compare equally
+ * across all specified columns, they are sorted alphabetically by name
+ * with snapshots grouped under their parents.
+ */
+static int
+zfs_sort(const void *larg, const void *rarg, void *data)
+{
+ zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle;
+ zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle;
+ zfs_sort_column_t *sc = (zfs_sort_column_t *)data;
+ zfs_sort_column_t *psc;
+
+ for (psc = sc; psc != NULL; psc = psc->sc_next) {
+ char lbuf[ZFS_MAXPROPLEN], rbuf[ZFS_MAXPROPLEN];
+ char *lstr, *rstr;
+ uint64_t lnum, rnum;
+ boolean_t lvalid, rvalid;
+ int ret = 0;
+
+ /*
+ * The type-specific checks below feed the generic comparison that
+ * follows: if 'lstr' and 'rstr' are non-NULL, we do a string-based
+ * comparison; otherwise, we compare 'lnum' and 'rnum'.
+ */
+ lstr = rstr = NULL;
+ if (psc->sc_prop == ZPROP_INVAL) {
+ nvlist_t *luser, *ruser;
+ nvlist_t *lval, *rval;
+
+ luser = zfs_get_user_props(l);
+ ruser = zfs_get_user_props(r);
+
+ lvalid = (nvlist_lookup_nvlist(luser,
+ psc->sc_user_prop, &lval) == 0);
+ rvalid = (nvlist_lookup_nvlist(ruser,
+ psc->sc_user_prop, &rval) == 0);
+
+ if (lvalid)
+ verify(nvlist_lookup_string(lval,
+ ZPROP_VALUE, &lstr) == 0);
+ if (rvalid)
+ verify(nvlist_lookup_string(rval,
+ ZPROP_VALUE, &rstr) == 0);
+ } else if (psc->sc_prop == ZFS_PROP_NAME) {
+ lvalid = rvalid = B_TRUE;
+
+ (void) strlcpy(lbuf, zfs_get_name(l), sizeof (lbuf));
+ (void) strlcpy(rbuf, zfs_get_name(r), sizeof (rbuf));
+
+ lstr = lbuf;
+ rstr = rbuf;
+ } else if (zfs_prop_is_string(psc->sc_prop)) {
+ lvalid = (zfs_prop_get(l, psc->sc_prop, lbuf,
+ sizeof (lbuf), NULL, NULL, 0, B_TRUE) == 0);
+ rvalid = (zfs_prop_get(r, psc->sc_prop, rbuf,
+ sizeof (rbuf), NULL, NULL, 0, B_TRUE) == 0);
+
+ lstr = lbuf;
+ rstr = rbuf;
+ } else {
+ lvalid = zfs_prop_valid_for_type(psc->sc_prop,
+ zfs_get_type(l), B_FALSE);
+ rvalid = zfs_prop_valid_for_type(psc->sc_prop,
+ zfs_get_type(r), B_FALSE);
+
+ if (lvalid)
+ (void) zfs_prop_get_numeric(l, psc->sc_prop,
+ &lnum, NULL, NULL, 0);
+ if (rvalid)
+ (void) zfs_prop_get_numeric(r, psc->sc_prop,
+ &rnum, NULL, NULL, 0);
+ }
+
+ if (!lvalid && !rvalid)
+ continue;
+ else if (!lvalid)
+ return (1);
+ else if (!rvalid)
+ return (-1);
+
+ if (lstr)
+ ret = strcmp(lstr, rstr);
+ else if (lnum < rnum)
+ ret = -1;
+ else if (lnum > rnum)
+ ret = 1;
+
+ if (ret != 0) {
+ if (psc->sc_reverse == B_TRUE)
+ ret = (ret < 0) ? 1 : -1;
+ return (ret);
+ }
+ }
+
+ return (zfs_compare(larg, rarg, NULL));
+}
+
+int
+zfs_for_each(int argc, char **argv, int flags, zfs_type_t types,
+ zfs_sort_column_t *sortcol, zprop_list_t **proplist, int limit,
+ zfs_iter_f callback, void *data)
+{
+ callback_data_t cb = {0};
+ int ret = 0;
+ zfs_node_t *node;
+ uu_avl_walk_t *walk;
+
+ avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t),
+ offsetof(zfs_node_t, zn_avlnode), zfs_sort, UU_DEFAULT);
+
+ if (avl_pool == NULL)
+ nomem();
+
+ cb.cb_sortcol = sortcol;
+ cb.cb_flags = flags;
+ cb.cb_proplist = proplist;
+ cb.cb_types = types;
+ cb.cb_depth_limit = limit;
+ /*
+ * If cb_proplist is provided then in the zfs_handles created we
+ * retain only those properties listed in cb_proplist and sortcol.
+ * The rest are pruned. So, the caller should make sure that no
+ * properties other than those listed in cb_proplist/sortcol are
+ * accessed.
+ *
+ * If cb_proplist is NULL then we retain all the properties. We
+ * always retain the zoned property, which some other properties
+ * need (userquota & friends), and the createtxg property, which
+ * we need to sort snapshots.
+ */
+ if (cb.cb_proplist && *cb.cb_proplist) {
+ zprop_list_t *p = *cb.cb_proplist;
+
+ while (p) {
+ if (p->pl_prop >= ZFS_PROP_TYPE &&
+ p->pl_prop < ZFS_NUM_PROPS) {
+ cb.cb_props_table[p->pl_prop] = B_TRUE;
+ }
+ p = p->pl_next;
+ }
+
+ while (sortcol) {
+ if (sortcol->sc_prop >= ZFS_PROP_TYPE &&
+ sortcol->sc_prop < ZFS_NUM_PROPS) {
+ cb.cb_props_table[sortcol->sc_prop] = B_TRUE;
+ }
+ sortcol = sortcol->sc_next;
+ }
+
+ cb.cb_props_table[ZFS_PROP_ZONED] = B_TRUE;
+ cb.cb_props_table[ZFS_PROP_CREATETXG] = B_TRUE;
+ } else {
+ (void) memset(cb.cb_props_table, B_TRUE,
+ sizeof (cb.cb_props_table));
+ }
+
+ if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
+ nomem();
+
+ if (argc == 0) {
+ /*
+ * If given no arguments, iterate over all datasets.
+ */
+ cb.cb_flags |= ZFS_ITER_RECURSE;
+ ret = zfs_iter_root(g_zfs, zfs_callback, &cb);
+ } else {
+ int i;
+ zfs_handle_t *zhp;
+ zfs_type_t argtype;
+
+ /*
+ * If we're recursive, then we always allow filesystems as
+ * arguments. If we also are interested in snapshots or
+ * bookmarks, then we can take volumes as well.
+ */
+ argtype = types;
+ if (flags & ZFS_ITER_RECURSE) {
+ argtype |= ZFS_TYPE_FILESYSTEM;
+ if (types & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK))
+ argtype |= ZFS_TYPE_VOLUME;
+ }
+
+ for (i = 0; i < argc; i++) {
+ if (flags & ZFS_ITER_ARGS_CAN_BE_PATHS) {
+ zhp = zfs_path_to_zhandle(g_zfs, argv[i],
+ argtype);
+ } else {
+ zhp = zfs_open(g_zfs, argv[i], argtype);
+ }
+ if (zhp != NULL)
+ ret |= zfs_callback(zhp, &cb);
+ else
+ ret = 1;
+ }
+ }
+
+ /*
+ * At this point we've got our AVL tree full of zfs handles, so iterate
+ * over each one and execute the real user callback.
+ */
+ for (node = uu_avl_first(cb.cb_avl); node != NULL;
+ node = uu_avl_next(cb.cb_avl, node))
+ ret |= callback(node->zn_handle, data);
+
+ /*
+ * Finally, clean up the AVL tree.
+ */
+ if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
+ nomem();
+
+ while ((node = uu_avl_walk_next(walk)) != NULL) {
+ uu_avl_remove(cb.cb_avl, node);
+ zfs_close(node->zn_handle);
+ free(node);
+ }
+
+ uu_avl_walk_end(walk);
+ uu_avl_destroy(cb.cb_avl);
+ uu_avl_pool_destroy(avl_pool);
+
+ return (ret);
+}
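zfs_for_each() above is the common entry point the zfs subcommands use to collect datasets into a sorted AVL tree and then run a callback over them. A minimal caller sketch (illustrative only; it assumes g_zfs has already been initialized by the command driver, as zfs_main.c does):

	#include <stdio.h>
	#include <libzfs.h>
	#include "zfs_iter.h"

	/* Invoked once per dataset after the AVL tree has been built. */
	static int
	print_name_cb(zfs_handle_t *zhp, void *data)
	{
		(void) data;
		(void) printf("%s\n", zfs_get_name(zhp));
		return (0);
	}

	static int
	list_all_filesystems(void)
	{
		/* argc == 0 means "iterate over all datasets", recursively. */
		return (zfs_for_each(0, NULL, 0, ZFS_TYPE_FILESYSTEM,
		    NULL, NULL, 0, print_name_cb, NULL));
	}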
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_iter.h b/sys/contrib/openzfs/cmd/zfs/zfs_iter.h
new file mode 100644
index 000000000000..2697fbdca1df
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_iter.h
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#ifndef ZFS_ITER_H
+#define ZFS_ITER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zfs_sort_column {
+ struct zfs_sort_column *sc_next;
+ struct zfs_sort_column *sc_last;
+ zfs_prop_t sc_prop;
+ char *sc_user_prop;
+ boolean_t sc_reverse;
+} zfs_sort_column_t;
+
+#define ZFS_ITER_RECURSE (1 << 0)
+#define ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1)
+#define ZFS_ITER_PROP_LISTSNAPS (1 << 2)
+#define ZFS_ITER_DEPTH_LIMIT (1 << 3)
+#define ZFS_ITER_RECVD_PROPS (1 << 4)
+#define ZFS_ITER_LITERAL_PROPS (1 << 5)
+#define ZFS_ITER_SIMPLE (1 << 6)
+
+int zfs_for_each(int, char **, int options, zfs_type_t,
+ zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *);
+int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t);
+void zfs_free_sort_columns(zfs_sort_column_t *);
+int zfs_sort_only_by_name(const zfs_sort_column_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZFS_ITER_H */
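Sort columns for zfs_for_each() are accumulated with zfs_add_sort_column() and released with zfs_free_sort_columns(). A short sketch of how a caller might request an ascending sort by "used" with a descending tie-break on "name" (illustrative only):

	#include <libzfs.h>
	#include "zfs_iter.h"

	static zfs_sort_column_t *
	build_example_sort(void)
	{
		zfs_sort_column_t *sortcol = NULL;

		(void) zfs_add_sort_column(&sortcol, "used", B_FALSE);	/* ascending */
		(void) zfs_add_sort_column(&sortcol, "name", B_TRUE);	/* descending */

		/* Pass to zfs_for_each(); free later with zfs_free_sort_columns(). */
		return (sortcol);
	}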
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c
new file mode 100644
index 000000000000..1a113c5c0382
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c
@@ -0,0 +1,8637 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright 2012 Milan Jurik. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ * Copyright 2016 Nexenta Systems, Inc.
+ * Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
+ * Copyright 2019 Joyent, Inc.
+ * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <sys/debug.h>
+#include <errno.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <libuutil.h>
+#include <libnvpair.h>
+#include <locale.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <zone.h>
+#include <grp.h>
+#include <pwd.h>
+#include <signal.h>
+#include <sys/debug.h>
+#include <sys/list.h>
+#include <sys/mkdev.h>
+#include <sys/mntent.h>
+#include <sys/mnttab.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/fs/zfs.h>
+#include <sys/systeminfo.h>
+#include <sys/types.h>
+#include <time.h>
+#include <sys/zfs_project.h>
+
+#include <libzfs.h>
+#include <libzfs_core.h>
+#include <zfs_prop.h>
+#include <zfs_deleg.h>
+#include <libzutil.h>
+#include <libuutil.h>
+#ifdef HAVE_IDMAP
+#include <aclutils.h>
+#include <directory.h>
+#endif /* HAVE_IDMAP */
+
+#include "zfs_iter.h"
+#include "zfs_util.h"
+#include "zfs_comutil.h"
+#include "libzfs_impl.h"
+#include "zfs_projectutil.h"
+
+libzfs_handle_t *g_zfs;
+
+static FILE *mnttab_file;
+static char history_str[HIS_MAX_RECORD_LEN];
+static boolean_t log_history = B_TRUE;
+
+static int zfs_do_clone(int argc, char **argv);
+static int zfs_do_create(int argc, char **argv);
+static int zfs_do_destroy(int argc, char **argv);
+static int zfs_do_get(int argc, char **argv);
+static int zfs_do_inherit(int argc, char **argv);
+static int zfs_do_list(int argc, char **argv);
+static int zfs_do_mount(int argc, char **argv);
+static int zfs_do_rename(int argc, char **argv);
+static int zfs_do_rollback(int argc, char **argv);
+static int zfs_do_set(int argc, char **argv);
+static int zfs_do_upgrade(int argc, char **argv);
+static int zfs_do_snapshot(int argc, char **argv);
+static int zfs_do_unmount(int argc, char **argv);
+static int zfs_do_share(int argc, char **argv);
+static int zfs_do_unshare(int argc, char **argv);
+static int zfs_do_send(int argc, char **argv);
+static int zfs_do_receive(int argc, char **argv);
+static int zfs_do_promote(int argc, char **argv);
+static int zfs_do_userspace(int argc, char **argv);
+static int zfs_do_allow(int argc, char **argv);
+static int zfs_do_unallow(int argc, char **argv);
+static int zfs_do_hold(int argc, char **argv);
+static int zfs_do_holds(int argc, char **argv);
+static int zfs_do_release(int argc, char **argv);
+static int zfs_do_diff(int argc, char **argv);
+static int zfs_do_bookmark(int argc, char **argv);
+static int zfs_do_channel_program(int argc, char **argv);
+static int zfs_do_load_key(int argc, char **argv);
+static int zfs_do_unload_key(int argc, char **argv);
+static int zfs_do_change_key(int argc, char **argv);
+static int zfs_do_project(int argc, char **argv);
+static int zfs_do_version(int argc, char **argv);
+static int zfs_do_redact(int argc, char **argv);
+static int zfs_do_wait(int argc, char **argv);
+
+#ifdef __FreeBSD__
+static int zfs_do_jail(int argc, char **argv);
+static int zfs_do_unjail(int argc, char **argv);
+#endif
+
+/*
+ * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
+ */
+
+#ifdef DEBUG
+const char *
+_umem_debug_init(void)
+{
+ return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+ return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+#endif
+
+typedef enum {
+ HELP_CLONE,
+ HELP_CREATE,
+ HELP_DESTROY,
+ HELP_GET,
+ HELP_INHERIT,
+ HELP_UPGRADE,
+ HELP_LIST,
+ HELP_MOUNT,
+ HELP_PROMOTE,
+ HELP_RECEIVE,
+ HELP_RENAME,
+ HELP_ROLLBACK,
+ HELP_SEND,
+ HELP_SET,
+ HELP_SHARE,
+ HELP_SNAPSHOT,
+ HELP_UNMOUNT,
+ HELP_UNSHARE,
+ HELP_ALLOW,
+ HELP_UNALLOW,
+ HELP_USERSPACE,
+ HELP_GROUPSPACE,
+ HELP_PROJECTSPACE,
+ HELP_PROJECT,
+ HELP_HOLD,
+ HELP_HOLDS,
+ HELP_RELEASE,
+ HELP_DIFF,
+ HELP_BOOKMARK,
+ HELP_CHANNEL_PROGRAM,
+ HELP_LOAD_KEY,
+ HELP_UNLOAD_KEY,
+ HELP_CHANGE_KEY,
+ HELP_VERSION,
+ HELP_REDACT,
+ HELP_JAIL,
+ HELP_UNJAIL,
+ HELP_WAIT,
+} zfs_help_t;
+
+typedef struct zfs_command {
+ const char *name;
+ int (*func)(int argc, char **argv);
+ zfs_help_t usage;
+} zfs_command_t;
+
+/*
+ * Master command table. Each ZFS command has a name, associated function, and
+ * usage message. The usage messages need to be internationalized, so we
+ * have a function that returns the usage message based on a command index.
+ *
+ * These commands are organized according to how they are displayed in the usage
+ * message. An empty command (one with a NULL name) indicates an empty line in
+ * the generic usage message.
+ */
+static zfs_command_t command_table[] = {
+ { "version", zfs_do_version, HELP_VERSION },
+ { NULL },
+ { "create", zfs_do_create, HELP_CREATE },
+ { "destroy", zfs_do_destroy, HELP_DESTROY },
+ { NULL },
+ { "snapshot", zfs_do_snapshot, HELP_SNAPSHOT },
+ { "rollback", zfs_do_rollback, HELP_ROLLBACK },
+ { "clone", zfs_do_clone, HELP_CLONE },
+ { "promote", zfs_do_promote, HELP_PROMOTE },
+ { "rename", zfs_do_rename, HELP_RENAME },
+ { "bookmark", zfs_do_bookmark, HELP_BOOKMARK },
+ { "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM },
+ { NULL },
+ { "list", zfs_do_list, HELP_LIST },
+ { NULL },
+ { "set", zfs_do_set, HELP_SET },
+ { "get", zfs_do_get, HELP_GET },
+ { "inherit", zfs_do_inherit, HELP_INHERIT },
+ { "upgrade", zfs_do_upgrade, HELP_UPGRADE },
+ { NULL },
+ { "userspace", zfs_do_userspace, HELP_USERSPACE },
+ { "groupspace", zfs_do_userspace, HELP_GROUPSPACE },
+ { "projectspace", zfs_do_userspace, HELP_PROJECTSPACE },
+ { NULL },
+ { "project", zfs_do_project, HELP_PROJECT },
+ { NULL },
+ { "mount", zfs_do_mount, HELP_MOUNT },
+ { "unmount", zfs_do_unmount, HELP_UNMOUNT },
+ { "share", zfs_do_share, HELP_SHARE },
+ { "unshare", zfs_do_unshare, HELP_UNSHARE },
+ { NULL },
+ { "send", zfs_do_send, HELP_SEND },
+ { "receive", zfs_do_receive, HELP_RECEIVE },
+ { NULL },
+ { "allow", zfs_do_allow, HELP_ALLOW },
+ { NULL },
+ { "unallow", zfs_do_unallow, HELP_UNALLOW },
+ { NULL },
+ { "hold", zfs_do_hold, HELP_HOLD },
+ { "holds", zfs_do_holds, HELP_HOLDS },
+ { "release", zfs_do_release, HELP_RELEASE },
+ { "diff", zfs_do_diff, HELP_DIFF },
+ { "load-key", zfs_do_load_key, HELP_LOAD_KEY },
+ { "unload-key", zfs_do_unload_key, HELP_UNLOAD_KEY },
+ { "change-key", zfs_do_change_key, HELP_CHANGE_KEY },
+ { "redact", zfs_do_redact, HELP_REDACT },
+ { "wait", zfs_do_wait, HELP_WAIT },
+
+#ifdef __FreeBSD__
+ { "jail", zfs_do_jail, HELP_JAIL },
+ { "unjail", zfs_do_unjail, HELP_UNJAIL },
+#endif
+};
+
+#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0]))
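The table above drives command dispatch: the driver looks the subcommand name up by string and calls the associated handler. A hypothetical lookup sketch (the real driver in this file may differ; it relies on command_table, zfs_command_t, and NCOMMAND defined above):

	static int
	find_command_example(const char *name, zfs_command_t **cmdp)
	{
		int i;

		for (i = 0; i < NCOMMAND; i++) {
			/* Skip the NULL spacer entries used for usage formatting. */
			if (command_table[i].name != NULL &&
			    strcmp(name, command_table[i].name) == 0) {
				*cmdp = &command_table[i];
				return (0);
			}
		}
		return (-1);
	}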
+
+zfs_command_t *current_command;
+
+static const char *
+get_usage(zfs_help_t idx)
+{
+ switch (idx) {
+ case HELP_CLONE:
+ return (gettext("\tclone [-p] [-o property=value] ... "
+ "<snapshot> <filesystem|volume>\n"));
+ case HELP_CREATE:
+ return (gettext("\tcreate [-Pnpv] [-o property=value] ... "
+ "<filesystem>\n"
+ "\tcreate [-Pnpsv] [-b blocksize] [-o property=value] ... "
+ "-V <size> <volume>\n"));
+ case HELP_DESTROY:
+ return (gettext("\tdestroy [-fnpRrv] <filesystem|volume>\n"
+ "\tdestroy [-dnpRrv] "
+ "<filesystem|volume>@<snap>[%<snap>][,...]\n"
+ "\tdestroy <filesystem|volume>#<bookmark>\n"));
+ case HELP_GET:
+ return (gettext("\tget [-rHp] [-d max] "
+ "[-o \"all\" | field[,...]]\n"
+ "\t [-t type[,...]] [-s source[,...]]\n"
+ "\t <\"all\" | property[,...]> "
+ "[filesystem|volume|snapshot|bookmark] ...\n"));
+ case HELP_INHERIT:
+ return (gettext("\tinherit [-rS] <property> "
+ "<filesystem|volume|snapshot> ...\n"));
+ case HELP_UPGRADE:
+ return (gettext("\tupgrade [-v]\n"
+ "\tupgrade [-r] [-V version] <-a | filesystem ...>\n"));
+ case HELP_LIST:
+ return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] "
+ "[-s property]...\n\t [-S property]... [-t type[,...]] "
+ "[filesystem|volume|snapshot] ...\n"));
+ case HELP_MOUNT:
+ return (gettext("\tmount\n"
+ "\tmount [-flvO] [-o opts] <-a | filesystem>\n"));
+ case HELP_PROMOTE:
+ return (gettext("\tpromote <clone-filesystem>\n"));
+ case HELP_RECEIVE:
+ return (gettext("\treceive [-vMnsFhu] "
+ "[-o <property>=<value>] ... [-x <property>] ...\n"
+ "\t <filesystem|volume|snapshot>\n"
+ "\treceive [-vMnsFhu] [-o <property>=<value>] ... "
+ "[-x <property>] ... \n"
+ "\t [-d | -e] <filesystem>\n"
+ "\treceive -A <filesystem|volume>\n"));
+ case HELP_RENAME:
+ return (gettext("\trename [-f] <filesystem|volume|snapshot> "
+ "<filesystem|volume|snapshot>\n"
+ "\trename -p [-f] <filesystem|volume> <filesystem|volume>\n"
+ "\trename -u [-f] <filesystem> <filesystem>\n"
+ "\trename -r <snapshot> <snapshot>\n"));
+ case HELP_ROLLBACK:
+ return (gettext("\trollback [-rRf] <snapshot>\n"));
+ case HELP_SEND:
+ return (gettext("\tsend [-DnPpRvLecwhb] [-[i|I] snapshot] "
+ "<snapshot>\n"
+ "\tsend [-nvPLecw] [-i snapshot|bookmark] "
+ "<filesystem|volume|snapshot>\n"
+ "\tsend [-DnPpvLec] [-i bookmark|snapshot] "
+ "--redact <bookmark> <snapshot>\n"
+ "\tsend [-nvPe] -t <receive_resume_token>\n"
+ "\tsend [-Pnv] --saved filesystem\n"));
+ case HELP_SET:
+ return (gettext("\tset <property=value> ... "
+ "<filesystem|volume|snapshot> ...\n"));
+ case HELP_SHARE:
+ return (gettext("\tshare [-l] <-a [nfs|smb] | filesystem>\n"));
+ case HELP_SNAPSHOT:
+ return (gettext("\tsnapshot [-r] [-o property=value] ... "
+ "<filesystem|volume>@<snap> ...\n"));
+ case HELP_UNMOUNT:
+ return (gettext("\tunmount [-fu] "
+ "<-a | filesystem|mountpoint>\n"));
+ case HELP_UNSHARE:
+ return (gettext("\tunshare "
+ "<-a [nfs|smb] | filesystem|mountpoint>\n"));
+ case HELP_ALLOW:
+ return (gettext("\tallow <filesystem|volume>\n"
+ "\tallow [-ldug] "
+ "<\"everyone\"|user|group>[,...] <perm|@setname>[,...]\n"
+ "\t <filesystem|volume>\n"
+ "\tallow [-ld] -e <perm|@setname>[,...] "
+ "<filesystem|volume>\n"
+ "\tallow -c <perm|@setname>[,...] <filesystem|volume>\n"
+ "\tallow -s @setname <perm|@setname>[,...] "
+ "<filesystem|volume>\n"));
+ case HELP_UNALLOW:
+ return (gettext("\tunallow [-rldug] "
+ "<\"everyone\"|user|group>[,...]\n"
+ "\t [<perm|@setname>[,...]] <filesystem|volume>\n"
+ "\tunallow [-rld] -e [<perm|@setname>[,...]] "
+ "<filesystem|volume>\n"
+ "\tunallow [-r] -c [<perm|@setname>[,...]] "
+ "<filesystem|volume>\n"
+ "\tunallow [-r] -s @setname [<perm|@setname>[,...]] "
+ "<filesystem|volume>\n"));
+ case HELP_USERSPACE:
+ return (gettext("\tuserspace [-Hinp] [-o field[,...]] "
+ "[-s field] ...\n"
+ "\t [-S field] ... [-t type[,...]] "
+ "<filesystem|snapshot>\n"));
+ case HELP_GROUPSPACE:
+ return (gettext("\tgroupspace [-Hinp] [-o field[,...]] "
+ "[-s field] ...\n"
+ "\t [-S field] ... [-t type[,...]] "
+ "<filesystem|snapshot>\n"));
+ case HELP_PROJECTSPACE:
+ return (gettext("\tprojectspace [-Hp] [-o field[,...]] "
+ "[-s field] ... \n"
+ "\t [-S field] ... <filesystem|snapshot>\n"));
+ case HELP_PROJECT:
+ return (gettext("\tproject [-d|-r] <directory|file ...>\n"
+ "\tproject -c [-0] [-d|-r] [-p id] <directory|file ...>\n"
+ "\tproject -C [-k] [-r] <directory ...>\n"
+ "\tproject [-p id] [-r] [-s] <directory ...>\n"));
+ case HELP_HOLD:
+ return (gettext("\thold [-r] <tag> <snapshot> ...\n"));
+ case HELP_HOLDS:
+ return (gettext("\tholds [-rH] <snapshot> ...\n"));
+ case HELP_RELEASE:
+ return (gettext("\trelease [-r] <tag> <snapshot> ...\n"));
+ case HELP_DIFF:
+ return (gettext("\tdiff [-FHt] <snapshot> "
+ "[snapshot|filesystem]\n"));
+ case HELP_BOOKMARK:
+ return (gettext("\tbookmark <snapshot|bookmark> "
+ "<newbookmark>\n"));
+ case HELP_CHANNEL_PROGRAM:
+ return (gettext("\tprogram [-jn] [-t <instruction limit>] "
+ "[-m <memory limit (b)>]\n"
+ "\t <pool> <program file> [lua args...]\n"));
+ case HELP_LOAD_KEY:
+ return (gettext("\tload-key [-rn] [-L <keylocation>] "
+ "<-a | filesystem|volume>\n"));
+ case HELP_UNLOAD_KEY:
+ return (gettext("\tunload-key [-r] "
+ "<-a | filesystem|volume>\n"));
+ case HELP_CHANGE_KEY:
+ return (gettext("\tchange-key [-l] [-o keyformat=<value>]\n"
+ "\t [-o keylocation=<value>] [-o pbkdf2iters=<value>]\n"
+ "\t <filesystem|volume>\n"
+ "\tchange-key -i [-l] <filesystem|volume>\n"));
+ case HELP_VERSION:
+ return (gettext("\tversion\n"));
+ case HELP_REDACT:
+ return (gettext("\tredact <snapshot> <bookmark> "
+ "<redaction_snapshot> ...\n"));
+ case HELP_JAIL:
+ return (gettext("\tjail <jailid|jailname> <filesystem>\n"));
+ case HELP_UNJAIL:
+ return (gettext("\tunjail <jailid|jailname> <filesystem>\n"));
+ case HELP_WAIT:
+ return (gettext("\twait [-t <activity>] <filesystem>\n"));
+ }
+
+ abort();
+ /* NOTREACHED */
+}
+
+void
+nomem(void)
+{
+ (void) fprintf(stderr, gettext("internal error: out of memory\n"));
+ exit(1);
+}
+
+/*
+ * Utility function to guarantee malloc() success.
+ */
+
+void *
+safe_malloc(size_t size)
+{
+ void *data;
+
+ if ((data = calloc(1, size)) == NULL)
+ nomem();
+
+ return (data);
+}
+
+static void *
+safe_realloc(void *data, size_t size)
+{
+ void *newp;
+ if ((newp = realloc(data, size)) == NULL) {
+ free(data);
+ nomem();
+ }
+
+ return (newp);
+}
+
+static char *
+safe_strdup(char *str)
+{
+ char *dupstr = strdup(str);
+
+ if (dupstr == NULL)
+ nomem();
+
+ return (dupstr);
+}
+
+/*
+ * Callback routine that will print out information for each of
+ * the properties.
+ */
+static int
+usage_prop_cb(int prop, void *cb)
+{
+ FILE *fp = cb;
+
+ (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop));
+
+ if (zfs_prop_readonly(prop))
+ (void) fprintf(fp, " NO ");
+ else
+ (void) fprintf(fp, "YES ");
+
+ if (zfs_prop_inheritable(prop))
+ (void) fprintf(fp, " YES ");
+ else
+ (void) fprintf(fp, " NO ");
+
+ if (zfs_prop_values(prop) == NULL)
+ (void) fprintf(fp, "-\n");
+ else
+ (void) fprintf(fp, "%s\n", zfs_prop_values(prop));
+
+ return (ZPROP_CONT);
+}
+
+/*
+ * Display usage message. If we're inside a command, display only the usage for
+ * that command. Otherwise, iterate over the entire command table and display
+ * a complete usage message.
+ */
+static void
+usage(boolean_t requested)
+{
+ int i;
+ boolean_t show_properties = B_FALSE;
+ FILE *fp = requested ? stdout : stderr;
+
+ if (current_command == NULL) {
+
+ (void) fprintf(fp, gettext("usage: zfs command args ...\n"));
+ (void) fprintf(fp,
+ gettext("where 'command' is one of the following:\n\n"));
+
+ for (i = 0; i < NCOMMAND; i++) {
+ if (command_table[i].name == NULL)
+ (void) fprintf(fp, "\n");
+ else
+ (void) fprintf(fp, "%s",
+ get_usage(command_table[i].usage));
+ }
+
+ (void) fprintf(fp, gettext("\nEach dataset is of the form: "
+ "pool/[dataset/]*dataset[@name]\n"));
+ } else {
+ (void) fprintf(fp, gettext("usage:\n"));
+ (void) fprintf(fp, "%s", get_usage(current_command->usage));
+ }
+
+ if (current_command != NULL &&
+ (strcmp(current_command->name, "set") == 0 ||
+ strcmp(current_command->name, "get") == 0 ||
+ strcmp(current_command->name, "inherit") == 0 ||
+ strcmp(current_command->name, "list") == 0))
+ show_properties = B_TRUE;
+
+ if (show_properties) {
+ (void) fprintf(fp,
+ gettext("\nThe following properties are supported:\n"));
+
+ (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n",
+ "PROPERTY", "EDIT", "INHERIT", "VALUES");
+
+ /* Iterate over all properties */
+ (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE,
+ ZFS_TYPE_DATASET);
+
+ (void) fprintf(fp, "\t%-15s ", "userused@...");
+ (void) fprintf(fp, " NO NO <size>\n");
+ (void) fprintf(fp, "\t%-15s ", "groupused@...");
+ (void) fprintf(fp, " NO NO <size>\n");
+ (void) fprintf(fp, "\t%-15s ", "projectused@...");
+ (void) fprintf(fp, " NO NO <size>\n");
+ (void) fprintf(fp, "\t%-15s ", "userobjused@...");
+ (void) fprintf(fp, " NO NO <size>\n");
+ (void) fprintf(fp, "\t%-15s ", "groupobjused@...");
+ (void) fprintf(fp, " NO NO <size>\n");
+ (void) fprintf(fp, "\t%-15s ", "projectobjused@...");
+ (void) fprintf(fp, " NO NO <size>\n");
+ (void) fprintf(fp, "\t%-15s ", "userquota@...");
+ (void) fprintf(fp, "YES NO <size> | none\n");
+ (void) fprintf(fp, "\t%-15s ", "groupquota@...");
+ (void) fprintf(fp, "YES NO <size> | none\n");
+ (void) fprintf(fp, "\t%-15s ", "projectquota@...");
+ (void) fprintf(fp, "YES NO <size> | none\n");
+ (void) fprintf(fp, "\t%-15s ", "userobjquota@...");
+ (void) fprintf(fp, "YES NO <size> | none\n");
+ (void) fprintf(fp, "\t%-15s ", "groupobjquota@...");
+ (void) fprintf(fp, "YES NO <size> | none\n");
+ (void) fprintf(fp, "\t%-15s ", "projectobjquota@...");
+ (void) fprintf(fp, "YES NO <size> | none\n");
+ (void) fprintf(fp, "\t%-15s ", "written@<snap>");
+ (void) fprintf(fp, " NO NO <size>\n");
+ (void) fprintf(fp, "\t%-15s ", "written#<bookmark>");
+ (void) fprintf(fp, " NO NO <size>\n");
+
+ (void) fprintf(fp, gettext("\nSizes are specified in bytes "
+ "with standard units such as K, M, G, etc.\n"));
+ (void) fprintf(fp, gettext("\nUser-defined properties can "
+ "be specified by using a name containing a colon (:).\n"));
+ (void) fprintf(fp, gettext("\nThe {user|group|project}"
+ "[obj]{used|quota}@ properties must be appended with\n"
+ "a user|group|project specifier of one of these forms:\n"
+ " POSIX name (eg: \"matt\")\n"
+ " POSIX id (eg: \"126829\")\n"
+ " SMB name@domain (eg: \"matt@sun\")\n"
+ " SMB SID (eg: \"S-1-234-567-89\")\n"));
+ } else {
+ (void) fprintf(fp,
+ gettext("\nFor the property list, run: %s\n"),
+ "zfs set|get");
+ (void) fprintf(fp,
+ gettext("\nFor the delegated permission list, run: %s\n"),
+ "zfs allow|unallow");
+ }
+
+ /*
+ * See comments at end of main().
+ */
+ if (getenv("ZFS_ABORT") != NULL) {
+ (void) printf("dumping core by request\n");
+ abort();
+ }
+
+ exit(requested ? 0 : 2);
+}
+
+/*
+ * Take a property=value argument string and add it to the given nvlist.
+ * Modifies the argument inplace.
+ */
+static boolean_t
+parseprop(nvlist_t *props, char *propname)
+{
+ char *propval;
+
+ if ((propval = strchr(propname, '=')) == NULL) {
+ (void) fprintf(stderr, gettext("missing "
+ "'=' for property=value argument\n"));
+ return (B_FALSE);
+ }
+ *propval = '\0';
+ propval++;
+ if (nvlist_exists(props, propname)) {
+ (void) fprintf(stderr, gettext("property '%s' "
+ "specified multiple times\n"), propname);
+ return (B_FALSE);
+ }
+ if (nvlist_add_string(props, propname, propval) != 0)
+ nomem();
+ return (B_TRUE);
+}
+
+/*
+ * Take a property name argument and add it to the given nvlist.
+ * Modifies the argument inplace.
+ */
+static boolean_t
+parsepropname(nvlist_t *props, char *propname)
+{
+ if (strchr(propname, '=') != NULL) {
+ (void) fprintf(stderr, gettext("invalid character "
+ "'=' in property argument\n"));
+ return (B_FALSE);
+ }
+ if (nvlist_exists(props, propname)) {
+ (void) fprintf(stderr, gettext("property '%s' "
+ "specified multiple times\n"), propname);
+ return (B_FALSE);
+ }
+ if (nvlist_add_boolean(props, propname) != 0)
+ nomem();
+ return (B_TRUE);
+}
+
+static int
+parse_depth(char *opt, int *flags)
+{
+ char *tmp;
+ int depth;
+
+ depth = (int)strtol(opt, &tmp, 0);
+ if (*tmp) {
+ (void) fprintf(stderr,
+ gettext("%s is not an integer\n"), optarg);
+ usage(B_FALSE);
+ }
+ if (depth < 0) {
+ (void) fprintf(stderr,
+ gettext("Depth can not be negative.\n"));
+ usage(B_FALSE);
+ }
+ *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE);
+ return (depth);
+}
+
+#define PROGRESS_DELAY 2 /* seconds */
+
+static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
+static time_t pt_begin;
+static char *pt_header = NULL;
+static boolean_t pt_shown;
+
+static void
+start_progress_timer(void)
+{
+ pt_begin = time(NULL) + PROGRESS_DELAY;
+ pt_shown = B_FALSE;
+}
+
+static void
+set_progress_header(char *header)
+{
+ assert(pt_header == NULL);
+ pt_header = safe_strdup(header);
+ if (pt_shown) {
+ (void) printf("%s: ", header);
+ (void) fflush(stdout);
+ }
+}
+
+static void
+update_progress(char *update)
+{
+ if (!pt_shown && time(NULL) > pt_begin) {
+ int len = strlen(update);
+
+ (void) printf("%s: %s%*.*s", pt_header, update, len, len,
+ pt_reverse);
+ (void) fflush(stdout);
+ pt_shown = B_TRUE;
+ } else if (pt_shown) {
+ int len = strlen(update);
+
+ (void) printf("%s%*.*s", update, len, len, pt_reverse);
+ (void) fflush(stdout);
+ }
+}
+
+static void
+finish_progress(char *done)
+{
+ if (pt_shown) {
+ (void) printf("%s\n", done);
+ (void) fflush(stdout);
+ }
+ free(pt_header);
+ pt_header = NULL;
+}
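The four progress helpers above are designed to be used as a unit: nothing is printed until PROGRESS_DELAY seconds have elapsed, and update_progress() overwrites its previous text via the pt_reverse backspace string. A sketch of the intended call sequence (the header text and loop are illustrative only):

	static void
	progress_example(char **names, int count)
	{
		int i;

		start_progress_timer();
		set_progress_header("Applying settings");

		for (i = 0; i < count; i++) {
			update_progress(names[i]);
			/* ... perform the real per-item work here ... */
		}

		finish_progress("done");
	}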
+
+static int
+zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type)
+{
+ zfs_handle_t *zhp = NULL;
+ int ret = 0;
+
+ zhp = zfs_open(hdl, dataset, type);
+ if (zhp == NULL)
+ return (1);
+
+ /*
+ * Volumes may neither be mounted nor shared. Potentially, in the
+ * future, filesystems detected on these volumes could be mounted.
+ */
+ if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) {
+ zfs_close(zhp);
+ return (0);
+ }
+
+ /*
+ * Mount and/or share the new filesystem as appropriate. We provide a
+ * verbose error message to let the user know that their filesystem was
+ * in fact created, even if we failed to mount or share it.
+ *
+ * If the user doesn't want the dataset automatically mounted, then
+ * skip the mount/share step.
+ */
+ if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type, B_FALSE) &&
+ zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON) {
+ if (zfs_mount_delegation_check()) {
+ (void) fprintf(stderr, gettext("filesystem "
+ "successfully created, but it may only be "
+ "mounted by root\n"));
+ ret = 1;
+ } else if (zfs_mount(zhp, NULL, 0) != 0) {
+ (void) fprintf(stderr, gettext("filesystem "
+ "successfully created, but not mounted\n"));
+ ret = 1;
+ } else if (zfs_share(zhp) != 0) {
+ (void) fprintf(stderr, gettext("filesystem "
+ "successfully created, but not shared\n"));
+ ret = 1;
+ }
+ zfs_commit_all_shares();
+ }
+
+ zfs_close(zhp);
+
+ return (ret);
+}
+
+/*
+ * zfs clone [-p] [-o prop=value] ... <snap> <fs | vol>
+ *
+ * Given an existing dataset, create a writable copy whose initial contents
+ * are the same as the source. The newly created dataset maintains a
+ * dependency on the original; the original cannot be destroyed so long as
+ * the clone exists.
+ *
+ * The '-p' flag creates all the non-existing ancestors of the target first.
+ */
+static int
+zfs_do_clone(int argc, char **argv)
+{
+ zfs_handle_t *zhp = NULL;
+ boolean_t parents = B_FALSE;
+ nvlist_t *props;
+ int ret = 0;
+ int c;
+
+ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+
+ /* check options */
+ while ((c = getopt(argc, argv, "o:p")) != -1) {
+ switch (c) {
+ case 'o':
+ if (!parseprop(props, optarg)) {
+ nvlist_free(props);
+ return (1);
+ }
+ break;
+ case 'p':
+ parents = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ goto usage;
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing source dataset "
+ "argument\n"));
+ goto usage;
+ }
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing target dataset "
+ "argument\n"));
+ goto usage;
+ }
+ if (argc > 2) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ goto usage;
+ }
+
+ /* open the source dataset */
+ if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) {
+ nvlist_free(props);
+ return (1);
+ }
+
+ if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM |
+ ZFS_TYPE_VOLUME)) {
+ /*
+ * Now create the ancestors of the target dataset. If the
+ * target already exists and the '-p' option was used, we should
+ * not complain.
+ */
+ if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM |
+ ZFS_TYPE_VOLUME)) {
+ zfs_close(zhp);
+ nvlist_free(props);
+ return (0);
+ }
+ if (zfs_create_ancestors(g_zfs, argv[1]) != 0) {
+ zfs_close(zhp);
+ nvlist_free(props);
+ return (1);
+ }
+ }
+
+ /* pass to libzfs */
+ ret = zfs_clone(zhp, argv[1], props);
+
+ /* create the mountpoint if necessary */
+ if (ret == 0) {
+ if (log_history) {
+ (void) zpool_log_history(g_zfs, history_str);
+ log_history = B_FALSE;
+ }
+
+ ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET);
+ }
+
+ zfs_close(zhp);
+ nvlist_free(props);
+
+ return (!!ret);
+
+usage:
+ ASSERT3P(zhp, ==, NULL);
+ nvlist_free(props);
+ usage(B_FALSE);
+ return (-1);
+}
+
+/*
+ * zfs create [-Pnpv] [-o prop=value] ... fs
+ * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size
+ *
+ * Create a new dataset. This command can be used to create filesystems
+ * and volumes. Snapshot creation is handled by 'zfs snapshot'.
+ * For volumes, the user must specify a size to be used.
+ *
+ * The '-s' flag applies only to volumes, and indicates that we should not try
+ * to set the reservation for this volume. By default we set a reservation
+ * equal to the size for any volume. For pools with SPA_VERSION >=
+ * SPA_VERSION_REFRESERVATION, we set a refreservation instead.
+ *
+ * The '-p' flag creates all the non-existing ancestors of the target first.
+ *
+ * The '-n' flag is no-op (dry run) mode. This will perform a user-space sanity
+ * check of arguments and properties, but does not check for permissions,
+ * available space, etc.
+ *
+ * The '-v' flag is for verbose output.
+ *
+ * The '-P' flag is used for parseable output. It implies '-v'.
+ */
+static int
+zfs_do_create(int argc, char **argv)
+{
+ zfs_type_t type = ZFS_TYPE_FILESYSTEM;
+ zpool_handle_t *zpool_handle = NULL;
+ nvlist_t *real_props = NULL;
+ uint64_t volsize = 0;
+ int c;
+ boolean_t noreserve = B_FALSE;
+ boolean_t bflag = B_FALSE;
+ boolean_t parents = B_FALSE;
+ boolean_t dryrun = B_FALSE;
+ boolean_t verbose = B_FALSE;
+ boolean_t parseable = B_FALSE;
+ int ret = 1;
+ nvlist_t *props;
+ uint64_t intval;
+
+ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+
+ /* check options */
+ while ((c = getopt(argc, argv, ":PV:b:nso:pv")) != -1) {
+ switch (c) {
+ case 'V':
+ type = ZFS_TYPE_VOLUME;
+ if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
+ (void) fprintf(stderr, gettext("bad volume "
+ "size '%s': %s\n"), optarg,
+ libzfs_error_description(g_zfs));
+ goto error;
+ }
+
+ if (nvlist_add_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0)
+ nomem();
+ volsize = intval;
+ break;
+ case 'P':
+ verbose = B_TRUE;
+ parseable = B_TRUE;
+ break;
+ case 'p':
+ parents = B_TRUE;
+ break;
+ case 'b':
+ bflag = B_TRUE;
+ if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) {
+ (void) fprintf(stderr, gettext("bad volume "
+ "block size '%s': %s\n"), optarg,
+ libzfs_error_description(g_zfs));
+ goto error;
+ }
+
+ if (nvlist_add_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+ intval) != 0)
+ nomem();
+ break;
+ case 'n':
+ dryrun = B_TRUE;
+ break;
+ case 'o':
+ if (!parseprop(props, optarg))
+ goto error;
+ break;
+ case 's':
+ noreserve = B_TRUE;
+ break;
+ case 'v':
+ verbose = B_TRUE;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing size "
+ "argument\n"));
+ goto badusage;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ goto badusage;
+ }
+ }
+
+ if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) {
+ (void) fprintf(stderr, gettext("'-s' and '-b' can only be "
+ "used when creating a volume\n"));
+ goto badusage;
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc == 0) {
+ (void) fprintf(stderr, gettext("missing %s argument\n"),
+ zfs_type_to_name(type));
+ goto badusage;
+ }
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ goto badusage;
+ }
+
+ if (dryrun || (type == ZFS_TYPE_VOLUME && !noreserve)) {
+ char msg[ZFS_MAX_DATASET_NAME_LEN * 2];
+ char *p;
+
+ if ((p = strchr(argv[0], '/')) != NULL)
+ *p = '\0';
+ zpool_handle = zpool_open(g_zfs, argv[0]);
+ if (p != NULL)
+ *p = '/';
+ if (zpool_handle == NULL)
+ goto error;
+
+ (void) snprintf(msg, sizeof (msg),
+ dryrun ? gettext("cannot verify '%s'") :
+ gettext("cannot create '%s'"), argv[0]);
+ if (props && (real_props = zfs_valid_proplist(g_zfs, type,
+ props, 0, NULL, zpool_handle, B_TRUE, msg)) == NULL) {
+ zpool_close(zpool_handle);
+ goto error;
+ }
+ }
+
+ /*
+ * if volsize is not a multiple of volblocksize, round it up to the
+ * nearest multiple of the volblocksize
+ */
+ if (type == ZFS_TYPE_VOLUME) {
+ uint64_t volblocksize;
+
+ if (nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+ &volblocksize) != 0)
+ volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
+
+ if (volsize % volblocksize) {
+ volsize = P2ROUNDUP_TYPED(volsize, volblocksize,
+ uint64_t);
+
+ if (nvlist_add_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), volsize) != 0) {
+ nvlist_free(props);
+ nomem();
+ }
+ }
+ }
+
+
+ if (type == ZFS_TYPE_VOLUME && !noreserve) {
+ uint64_t spa_version;
+ zfs_prop_t resv_prop;
+ char *strval;
+
+ spa_version = zpool_get_prop_int(zpool_handle,
+ ZPOOL_PROP_VERSION, NULL);
+ if (spa_version >= SPA_VERSION_REFRESERVATION)
+ resv_prop = ZFS_PROP_REFRESERVATION;
+ else
+ resv_prop = ZFS_PROP_RESERVATION;
+
+ volsize = zvol_volsize_to_reservation(zpool_handle, volsize,
+ real_props);
+
+ if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
+ &strval) != 0) {
+ if (nvlist_add_uint64(props,
+ zfs_prop_to_name(resv_prop), volsize) != 0) {
+ nvlist_free(props);
+ nomem();
+ }
+ }
+ }
+ if (zpool_handle != NULL) {
+ zpool_close(zpool_handle);
+ nvlist_free(real_props);
+ }
+
+ if (parents && zfs_name_valid(argv[0], type)) {
+ /*
+ * Now create the ancestors of the target dataset. If the target
+ * already exists and the '-p' option was used, we should not
+ * complain.
+ */
+ if (zfs_dataset_exists(g_zfs, argv[0], type)) {
+ ret = 0;
+ goto error;
+ }
+ if (verbose) {
+ (void) printf(parseable ? "create_ancestors\t%s\n" :
+ dryrun ? "would create ancestors of %s\n" :
+ "create ancestors of %s\n", argv[0]);
+ }
+ if (!dryrun) {
+ if (zfs_create_ancestors(g_zfs, argv[0]) != 0) {
+ goto error;
+ }
+ }
+ }
+
+ if (verbose) {
+ nvpair_t *nvp = NULL;
+ (void) printf(parseable ? "create\t%s\n" :
+ dryrun ? "would create %s\n" : "create %s\n", argv[0]);
+ while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) {
+ uint64_t uval;
+ char *sval;
+
+ switch (nvpair_type(nvp)) {
+ case DATA_TYPE_UINT64:
+ VERIFY0(nvpair_value_uint64(nvp, &uval));
+ (void) printf(parseable ?
+ "property\t%s\t%llu\n" : "\t%s=%llu\n",
+ nvpair_name(nvp), (u_longlong_t)uval);
+ break;
+ case DATA_TYPE_STRING:
+ VERIFY0(nvpair_value_string(nvp, &sval));
+ (void) printf(parseable ?
+ "property\t%s\t%s\n" : "\t%s=%s\n",
+ nvpair_name(nvp), sval);
+ break;
+ default:
+ (void) fprintf(stderr, "property '%s' "
+ "has illegal type %d\n",
+ nvpair_name(nvp), nvpair_type(nvp));
+ abort();
+ }
+ }
+ }
+ if (dryrun) {
+ ret = 0;
+ goto error;
+ }
+
+ /* pass to libzfs */
+ if (zfs_create(g_zfs, argv[0], type, props) != 0)
+ goto error;
+
+ if (log_history) {
+ (void) zpool_log_history(g_zfs, history_str);
+ log_history = B_FALSE;
+ }
+
+ ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET);
+error:
+ nvlist_free(props);
+ return (ret);
+badusage:
+ nvlist_free(props);
+ usage(B_FALSE);
+ return (2);
+}
+
+/*
+ * zfs destroy [-rRf] <fs, vol>
+ * zfs destroy [-rRd] <snap>
+ *
+ * -r Recursively destroy all children
+ * -R Recursively destroy all dependents, including clones
+ * -f Force unmounting of any dependents
+ * -d If we can't destroy now, mark for deferred destruction
+ *
+ * Destroys the given dataset. By default, it will unmount any filesystems,
+ * and refuse to destroy a dataset that has any dependents. A dependent can
+ * either be a child, or a clone of a child.
+ */
+typedef struct destroy_cbdata {
+ boolean_t cb_first;
+ boolean_t cb_force;
+ boolean_t cb_recurse;
+ boolean_t cb_error;
+ boolean_t cb_doclones;
+ zfs_handle_t *cb_target;
+ boolean_t cb_defer_destroy;
+ boolean_t cb_verbose;
+ boolean_t cb_parsable;
+ boolean_t cb_dryrun;
+ nvlist_t *cb_nvl;
+ nvlist_t *cb_batchedsnaps;
+
+ /* first snap in contiguous run */
+ char *cb_firstsnap;
+ /* previous snap in contiguous run */
+ char *cb_prevsnap;
+ int64_t cb_snapused;
+ char *cb_snapspec;
+ char *cb_bookmark;
+ uint64_t cb_snap_count;
+} destroy_cbdata_t;
+
+/*
+ * Check for any dependents based on the '-r' or '-R' flags.
+ */
+static int
+destroy_check_dependent(zfs_handle_t *zhp, void *data)
+{
+ destroy_cbdata_t *cbp = data;
+ const char *tname = zfs_get_name(cbp->cb_target);
+ const char *name = zfs_get_name(zhp);
+
+ if (strncmp(tname, name, strlen(tname)) == 0 &&
+ (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) {
+ /*
+ * This is a direct descendant, not a clone somewhere else in
+ * the hierarchy.
+ */
+ if (cbp->cb_recurse)
+ goto out;
+
+ if (cbp->cb_first) {
+ (void) fprintf(stderr, gettext("cannot destroy '%s': "
+ "%s has children\n"),
+ zfs_get_name(cbp->cb_target),
+ zfs_type_to_name(zfs_get_type(cbp->cb_target)));
+ (void) fprintf(stderr, gettext("use '-r' to destroy "
+ "the following datasets:\n"));
+ cbp->cb_first = B_FALSE;
+ cbp->cb_error = B_TRUE;
+ }
+
+ (void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+ } else {
+ /*
+ * This is a clone. We only want to report this if the '-r'
+ * wasn't specified, or the target is a snapshot.
+ */
+ if (!cbp->cb_recurse &&
+ zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT)
+ goto out;
+
+ if (cbp->cb_first) {
+ (void) fprintf(stderr, gettext("cannot destroy '%s': "
+ "%s has dependent clones\n"),
+ zfs_get_name(cbp->cb_target),
+ zfs_type_to_name(zfs_get_type(cbp->cb_target)));
+ (void) fprintf(stderr, gettext("use '-R' to destroy "
+ "the following datasets:\n"));
+ cbp->cb_first = B_FALSE;
+ cbp->cb_error = B_TRUE;
+ cbp->cb_dryrun = B_TRUE;
+ }
+
+ (void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+ }
+
+out:
+ zfs_close(zhp);
+ return (0);
+}
+
+static int
+destroy_batched(destroy_cbdata_t *cb)
+{
+ int error = zfs_destroy_snaps_nvl(g_zfs,
+ cb->cb_batchedsnaps, B_FALSE);
+ fnvlist_free(cb->cb_batchedsnaps);
+ cb->cb_batchedsnaps = fnvlist_alloc();
+ return (error);
+}
+
+static int
+destroy_callback(zfs_handle_t *zhp, void *data)
+{
+ destroy_cbdata_t *cb = data;
+ const char *name = zfs_get_name(zhp);
+ int error;
+
+ if (cb->cb_verbose) {
+ if (cb->cb_parsable) {
+ (void) printf("destroy\t%s\n", name);
+ } else if (cb->cb_dryrun) {
+ (void) printf(gettext("would destroy %s\n"),
+ name);
+ } else {
+ (void) printf(gettext("will destroy %s\n"),
+ name);
+ }
+ }
+
+ /*
+ * Ignore pools (which we've already flagged as an error before getting
+ * here).
+ */
+ if (strchr(zfs_get_name(zhp), '/') == NULL &&
+ zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
+ zfs_close(zhp);
+ return (0);
+ }
+ if (cb->cb_dryrun) {
+ zfs_close(zhp);
+ return (0);
+ }
+
+ /*
+ * We batch up all contiguous snapshots (even of different
+ * filesystems) and destroy them with one ioctl. We can't
+ * simply do all snap deletions and then all fs deletions,
+ * because we must delete a clone before its origin.
+ */
+ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) {
+ cb->cb_snap_count++;
+ fnvlist_add_boolean(cb->cb_batchedsnaps, name);
+ if (cb->cb_snap_count % 10 == 0 && cb->cb_defer_destroy)
+ error = destroy_batched(cb);
+ } else {
+ error = destroy_batched(cb);
+ if (error != 0 ||
+ zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 ||
+ zfs_destroy(zhp, cb->cb_defer_destroy) != 0) {
+ zfs_close(zhp);
+ /*
+ * When performing a recursive destroy we ignore errors
+ * so that the recursive destroy can continue
+ * destroying past problem datasets.
+ */
+ if (cb->cb_recurse) {
+ cb->cb_error = B_TRUE;
+ return (0);
+ }
+ return (-1);
+ }
+ }
+
+ zfs_close(zhp);
+ return (0);
+}
+
+static int
+destroy_print_cb(zfs_handle_t *zhp, void *arg)
+{
+ destroy_cbdata_t *cb = arg;
+ const char *name = zfs_get_name(zhp);
+ int err = 0;
+
+ if (nvlist_exists(cb->cb_nvl, name)) {
+ if (cb->cb_firstsnap == NULL)
+ cb->cb_firstsnap = strdup(name);
+ if (cb->cb_prevsnap != NULL)
+ free(cb->cb_prevsnap);
+ /* this snap continues the current range */
+ cb->cb_prevsnap = strdup(name);
+ if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL)
+ nomem();
+ if (cb->cb_verbose) {
+ if (cb->cb_parsable) {
+ (void) printf("destroy\t%s\n", name);
+ } else if (cb->cb_dryrun) {
+ (void) printf(gettext("would destroy %s\n"),
+ name);
+ } else {
+ (void) printf(gettext("will destroy %s\n"),
+ name);
+ }
+ }
+ } else if (cb->cb_firstsnap != NULL) {
+ /* end of this range */
+ uint64_t used = 0;
+ err = lzc_snaprange_space(cb->cb_firstsnap,
+ cb->cb_prevsnap, &used);
+ cb->cb_snapused += used;
+ free(cb->cb_firstsnap);
+ cb->cb_firstsnap = NULL;
+ free(cb->cb_prevsnap);
+ cb->cb_prevsnap = NULL;
+ }
+ zfs_close(zhp);
+ return (err);
+}
+
+static int
+destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb)
+{
+ int err;
+ assert(cb->cb_firstsnap == NULL);
+ assert(cb->cb_prevsnap == NULL);
+ err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb, 0, 0);
+ if (cb->cb_firstsnap != NULL) {
+ uint64_t used = 0;
+ if (err == 0) {
+ err = lzc_snaprange_space(cb->cb_firstsnap,
+ cb->cb_prevsnap, &used);
+ }
+ cb->cb_snapused += used;
+ free(cb->cb_firstsnap);
+ cb->cb_firstsnap = NULL;
+ free(cb->cb_prevsnap);
+ cb->cb_prevsnap = NULL;
+ }
+ return (err);
+}
+
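+/*
+ * Record a matched snapshot in cb_nvl, first checking for dependent
+ * clones unless '-R' or '-d' was specified.
+ */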
+static int
+snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg)
+{
+ destroy_cbdata_t *cb = arg;
+ int err = 0;
+
+ /* Check for clones. */
+ if (!cb->cb_doclones && !cb->cb_defer_destroy) {
+ cb->cb_target = zhp;
+ cb->cb_first = B_TRUE;
+ err = zfs_iter_dependents(zhp, B_TRUE,
+ destroy_check_dependent, cb);
+ }
+
+ if (err == 0) {
+ if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp)))
+ nomem();
+ }
+ zfs_close(zhp);
+ return (err);
+}
+
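+/*
+ * Expand the snapshot specification for this filesystem (and, with '-r',
+ * for each descendant filesystem), adding every matching snapshot to
+ * cb_nvl and printing the verbose space estimate if requested.
+ */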
+static int
+gather_snapshots(zfs_handle_t *zhp, void *arg)
+{
+ destroy_cbdata_t *cb = arg;
+ int err = 0;
+
+ err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb);
+ if (err == ENOENT)
+ err = 0;
+ if (err != 0)
+ goto out;
+
+ if (cb->cb_verbose) {
+ err = destroy_print_snapshots(zhp, cb);
+ if (err != 0)
+ goto out;
+ }
+
+ if (cb->cb_recurse)
+ err = zfs_iter_filesystems(zhp, gather_snapshots, cb);
+
+out:
+ zfs_close(zhp);
+ return (err);
+}
+
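+/*
+ * For each snapshot gathered in cb_nvl, destroy its dependent clones.
+ * Deferred destroy is temporarily disabled here because only snapshots
+ * can be destroyed deferred.
+ */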
+static int
+destroy_clones(destroy_cbdata_t *cb)
+{
+ nvpair_t *pair;
+ for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(cb->cb_nvl, pair)) {
+ zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair),
+ ZFS_TYPE_SNAPSHOT);
+ if (zhp != NULL) {
+ boolean_t defer = cb->cb_defer_destroy;
+ int err;
+
+ /*
+ * We can't defer destroy non-snapshots, so set it to
+ * false while destroying the clones.
+ */
+ cb->cb_defer_destroy = B_FALSE;
+ err = zfs_iter_dependents(zhp, B_FALSE,
+ destroy_callback, cb);
+ cb->cb_defer_destroy = defer;
+ zfs_close(zhp);
+ if (err != 0)
+ return (err);
+ }
+ }
+ return (0);
+}
+
+static int
+zfs_do_destroy(int argc, char **argv)
+{
+ destroy_cbdata_t cb = { 0 };
+ int rv = 0;
+ int err = 0;
+ int c;
+ zfs_handle_t *zhp = NULL;
+ char *at, *pound;
+ zfs_type_t type = ZFS_TYPE_DATASET;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "vpndfrR")) != -1) {
+ switch (c) {
+ case 'v':
+ cb.cb_verbose = B_TRUE;
+ break;
+ case 'p':
+ cb.cb_verbose = B_TRUE;
+ cb.cb_parsable = B_TRUE;
+ break;
+ case 'n':
+ cb.cb_dryrun = B_TRUE;
+ break;
+ case 'd':
+ cb.cb_defer_destroy = B_TRUE;
+ type = ZFS_TYPE_SNAPSHOT;
+ break;
+ case 'f':
+ cb.cb_force = B_TRUE;
+ break;
+ case 'r':
+ cb.cb_recurse = B_TRUE;
+ break;
+ case 'R':
+ cb.cb_recurse = B_TRUE;
+ cb.cb_doclones = B_TRUE;
+ break;
+ case '?':
+ default:
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc == 0) {
+ (void) fprintf(stderr, gettext("missing dataset argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ at = strchr(argv[0], '@');
+ pound = strchr(argv[0], '#');
+ if (at != NULL) {
+
+ /* Build the list of snaps to destroy in cb_nvl. */
+ cb.cb_nvl = fnvlist_alloc();
+
+ *at = '\0';
+ zhp = zfs_open(g_zfs, argv[0],
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+ if (zhp == NULL) {
+ nvlist_free(cb.cb_nvl);
+ return (1);
+ }
+
+ cb.cb_snapspec = at + 1;
+ if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 ||
+ cb.cb_error) {
+ rv = 1;
+ goto out;
+ }
+
+ if (nvlist_empty(cb.cb_nvl)) {
+ (void) fprintf(stderr, gettext("could not find any "
+ "snapshots to destroy; check snapshot names.\n"));
+ rv = 1;
+ goto out;
+ }
+
+ if (cb.cb_verbose) {
+ char buf[16];
+ zfs_nicebytes(cb.cb_snapused, buf, sizeof (buf));
+ if (cb.cb_parsable) {
+ (void) printf("reclaim\t%llu\n",
+ (u_longlong_t)cb.cb_snapused);
+ } else if (cb.cb_dryrun) {
+ (void) printf(gettext("would reclaim %s\n"),
+ buf);
+ } else {
+ (void) printf(gettext("will reclaim %s\n"),
+ buf);
+ }
+ }
+
+ if (!cb.cb_dryrun) {
+ if (cb.cb_doclones) {
+ cb.cb_batchedsnaps = fnvlist_alloc();
+ err = destroy_clones(&cb);
+ if (err == 0) {
+ err = zfs_destroy_snaps_nvl(g_zfs,
+ cb.cb_batchedsnaps, B_FALSE);
+ }
+ if (err != 0) {
+ rv = 1;
+ goto out;
+ }
+ }
+ if (err == 0) {
+ err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl,
+ cb.cb_defer_destroy);
+ }
+ }
+
+ if (err != 0)
+ rv = 1;
+ } else if (pound != NULL) {
+ int err;
+ nvlist_t *nvl;
+
+ if (cb.cb_dryrun) {
+ (void) fprintf(stderr,
+ "dryrun is not supported with bookmark\n");
+ return (-1);
+ }
+
+ if (cb.cb_defer_destroy) {
+ (void) fprintf(stderr,
+ "defer destroy is not supported with bookmark\n");
+ return (-1);
+ }
+
+ if (cb.cb_recurse) {
+ (void) fprintf(stderr,
+ "recursive is not supported with bookmark\n");
+ return (-1);
+ }
+
+ /*
+ * Unfortunately, zfs_bookmark() doesn't honor the
+ * casesensitivity setting. However, we can't simply
+ * remove this check, because lzc_destroy_bookmarks()
+ * ignores non-existent bookmarks, so this is necessary
+ * to get a proper error message.
+ */
+ if (!zfs_bookmark_exists(argv[0])) {
+ (void) fprintf(stderr, gettext("bookmark '%s' "
+ "does not exist.\n"), argv[0]);
+ return (1);
+ }
+
+ nvl = fnvlist_alloc();
+ fnvlist_add_boolean(nvl, argv[0]);
+
+ err = lzc_destroy_bookmarks(nvl, NULL);
+ if (err != 0) {
+ (void) zfs_standard_error(g_zfs, err,
+ "cannot destroy bookmark");
+ }
+
+ nvlist_free(nvl);
+
+ return (err);
+ } else {
+ /* Open the given dataset */
+ if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL)
+ return (1);
+
+ cb.cb_target = zhp;
+
+ /*
+ * Perform an explicit check for pools before going any further.
+ */
+ if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL &&
+ zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
+ (void) fprintf(stderr, gettext("cannot destroy '%s': "
+ "operation does not apply to pools\n"),
+ zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use 'zfs destroy -r "
+ "%s' to destroy all datasets in the pool\n"),
+ zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use 'zpool destroy %s' "
+ "to destroy the pool itself\n"), zfs_get_name(zhp));
+ rv = 1;
+ goto out;
+ }
+
+ /*
+ * Check for any dependents and/or clones.
+ */
+ cb.cb_first = B_TRUE;
+ if (!cb.cb_doclones &&
+ zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
+ &cb) != 0) {
+ rv = 1;
+ goto out;
+ }
+
+ if (cb.cb_error) {
+ rv = 1;
+ goto out;
+ }
+ cb.cb_batchedsnaps = fnvlist_alloc();
+ if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback,
+ &cb) != 0) {
+ rv = 1;
+ goto out;
+ }
+
+ /*
+ * Do the real thing. The callback will close the
+ * handle regardless of whether it succeeds or not.
+ */
+ err = destroy_callback(zhp, &cb);
+ zhp = NULL;
+ if (err == 0) {
+ err = zfs_destroy_snaps_nvl(g_zfs,
+ cb.cb_batchedsnaps, cb.cb_defer_destroy);
+ }
+ if (err != 0 || cb.cb_error == B_TRUE)
+ rv = 1;
+ }
+
+out:
+ fnvlist_free(cb.cb_batchedsnaps);
+ fnvlist_free(cb.cb_nvl);
+ if (zhp != NULL)
+ zfs_close(zhp);
+ return (rv);
+}
+
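+/*
+ * Return B_TRUE if the RECEIVED column was included in the requested
+ * output columns.
+ */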
+static boolean_t
+is_recvd_column(zprop_get_cbdata_t *cbp)
+{
+ int i;
+ zfs_get_column_t col;
+
+ for (i = 0; i < ZFS_GET_NCOLS &&
+ (col = cbp->cb_columns[i]) != GET_COL_NONE; i++)
+ if (col == GET_COL_RECVD)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+/*
+ * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...]
+ * < all | property[,property]... > < fs | snap | vol > ...
+ *
+ * -r recurse over any child datasets
+ * -H scripted mode. Headers are stripped, and fields are separated
+ * by tabs instead of spaces.
+ * -o Set of fields to display. One of "name,property,value,
+ * received,source". Default is "name,property,value,source".
+ * "all" is an alias for all five.
+ * -s Set of sources to allow. One of
+ * "local,default,inherited,received,temporary,none". Default is
+ * all six.
+ * -p Display values in parsable (literal) format.
+ *
+ * Prints properties for the given datasets. The user can control which
+ * columns to display as well as which property types to allow.
+ */
+
+/*
+ * Invoked to display the properties for a single dataset.
+ */
+static int
+get_callback(zfs_handle_t *zhp, void *data)
+{
+ char buf[ZFS_MAXPROPLEN];
+ char rbuf[ZFS_MAXPROPLEN];
+ zprop_source_t sourcetype;
+ char source[ZFS_MAX_DATASET_NAME_LEN];
+ zprop_get_cbdata_t *cbp = data;
+ nvlist_t *user_props = zfs_get_user_props(zhp);
+ zprop_list_t *pl = cbp->cb_proplist;
+ nvlist_t *propval;
+ char *strval;
+ char *sourceval;
+ boolean_t received = is_recvd_column(cbp);
+
+ for (; pl != NULL; pl = pl->pl_next) {
+ char *recvdval = NULL;
+ /*
+ * Skip the special fake placeholder. This will also skip over
+ * the name property when 'all' is specified.
+ */
+ if (pl->pl_prop == ZFS_PROP_NAME &&
+ pl == cbp->cb_proplist)
+ continue;
+
+ if (pl->pl_prop != ZPROP_INVAL) {
+ if (zfs_prop_get(zhp, pl->pl_prop, buf,
+ sizeof (buf), &sourcetype, source,
+ sizeof (source),
+ cbp->cb_literal) != 0) {
+ if (pl->pl_all)
+ continue;
+ if (!zfs_prop_valid_for_type(pl->pl_prop,
+ ZFS_TYPE_DATASET, B_FALSE)) {
+ (void) fprintf(stderr,
+ gettext("No such property '%s'\n"),
+ zfs_prop_to_name(pl->pl_prop));
+ continue;
+ }
+ sourcetype = ZPROP_SRC_NONE;
+ (void) strlcpy(buf, "-", sizeof (buf));
+ }
+
+ if (received && (zfs_prop_get_recvd(zhp,
+ zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf),
+ cbp->cb_literal) == 0))
+ recvdval = rbuf;
+
+ zprop_print_one_property(zfs_get_name(zhp), cbp,
+ zfs_prop_to_name(pl->pl_prop),
+ buf, sourcetype, source, recvdval);
+ } else if (zfs_prop_userquota(pl->pl_user_prop)) {
+ sourcetype = ZPROP_SRC_LOCAL;
+
+ if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
+ buf, sizeof (buf), cbp->cb_literal) != 0) {
+ sourcetype = ZPROP_SRC_NONE;
+ (void) strlcpy(buf, "-", sizeof (buf));
+ }
+
+ zprop_print_one_property(zfs_get_name(zhp), cbp,
+ pl->pl_user_prop, buf, sourcetype, source, NULL);
+ } else if (zfs_prop_written(pl->pl_user_prop)) {
+ sourcetype = ZPROP_SRC_LOCAL;
+
+ if (zfs_prop_get_written(zhp, pl->pl_user_prop,
+ buf, sizeof (buf), cbp->cb_literal) != 0) {
+ sourcetype = ZPROP_SRC_NONE;
+ (void) strlcpy(buf, "-", sizeof (buf));
+ }
+
+ zprop_print_one_property(zfs_get_name(zhp), cbp,
+ pl->pl_user_prop, buf, sourcetype, source, NULL);
+ } else {
+ if (nvlist_lookup_nvlist(user_props,
+ pl->pl_user_prop, &propval) != 0) {
+ if (pl->pl_all)
+ continue;
+ sourcetype = ZPROP_SRC_NONE;
+ strval = "-";
+ } else {
+ verify(nvlist_lookup_string(propval,
+ ZPROP_VALUE, &strval) == 0);
+ verify(nvlist_lookup_string(propval,
+ ZPROP_SOURCE, &sourceval) == 0);
+
+ if (strcmp(sourceval,
+ zfs_get_name(zhp)) == 0) {
+ sourcetype = ZPROP_SRC_LOCAL;
+ } else if (strcmp(sourceval,
+ ZPROP_SOURCE_VAL_RECVD) == 0) {
+ sourcetype = ZPROP_SRC_RECEIVED;
+ } else {
+ sourcetype = ZPROP_SRC_INHERITED;
+ (void) strlcpy(source,
+ sourceval, sizeof (source));
+ }
+ }
+
+ if (received && (zfs_prop_get_recvd(zhp,
+ pl->pl_user_prop, rbuf, sizeof (rbuf),
+ cbp->cb_literal) == 0))
+ recvdval = rbuf;
+
+ zprop_print_one_property(zfs_get_name(zhp), cbp,
+ pl->pl_user_prop, strval, sourcetype,
+ source, recvdval);
+ }
+ }
+
+ return (0);
+}
+
+static int
+zfs_do_get(int argc, char **argv)
+{
+ zprop_get_cbdata_t cb = { 0 };
+ int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
+ int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK;
+ char *value, *fields;
+ int ret = 0;
+ int limit = 0;
+ zprop_list_t fake_name = { 0 };
+
+ /*
+ * Set up default columns and sources.
+ */
+ cb.cb_sources = ZPROP_SRC_ALL;
+ cb.cb_columns[0] = GET_COL_NAME;
+ cb.cb_columns[1] = GET_COL_PROPERTY;
+ cb.cb_columns[2] = GET_COL_VALUE;
+ cb.cb_columns[3] = GET_COL_SOURCE;
+ cb.cb_type = ZFS_TYPE_DATASET;
+
+ /* check options */
+ while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) {
+ switch (c) {
+ case 'p':
+ cb.cb_literal = B_TRUE;
+ break;
+ case 'd':
+ limit = parse_depth(optarg, &flags);
+ break;
+ case 'r':
+ flags |= ZFS_ITER_RECURSE;
+ break;
+ case 'H':
+ cb.cb_scripted = B_TRUE;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case 'o':
+ /*
+ * Process the set of columns to display. We zero out
+ * the structure to give us a blank slate.
+ */
+ bzero(&cb.cb_columns, sizeof (cb.cb_columns));
+ i = 0;
+ while (*optarg != '\0') {
+ static char *col_subopts[] =
+ { "name", "property", "value", "received",
+ "source", "all", NULL };
+
+ if (i == ZFS_GET_NCOLS) {
+ (void) fprintf(stderr, gettext("too "
+ "many fields given to -o "
+ "option\n"));
+ usage(B_FALSE);
+ }
+
+ switch (getsubopt(&optarg, col_subopts,
+ &value)) {
+ case 0:
+ cb.cb_columns[i++] = GET_COL_NAME;
+ break;
+ case 1:
+ cb.cb_columns[i++] = GET_COL_PROPERTY;
+ break;
+ case 2:
+ cb.cb_columns[i++] = GET_COL_VALUE;
+ break;
+ case 3:
+ cb.cb_columns[i++] = GET_COL_RECVD;
+ flags |= ZFS_ITER_RECVD_PROPS;
+ break;
+ case 4:
+ cb.cb_columns[i++] = GET_COL_SOURCE;
+ break;
+ case 5:
+ if (i > 0) {
+ (void) fprintf(stderr,
+ gettext("\"all\" conflicts "
+ "with specific fields "
+ "given to -o option\n"));
+ usage(B_FALSE);
+ }
+ cb.cb_columns[0] = GET_COL_NAME;
+ cb.cb_columns[1] = GET_COL_PROPERTY;
+ cb.cb_columns[2] = GET_COL_VALUE;
+ cb.cb_columns[3] = GET_COL_RECVD;
+ cb.cb_columns[4] = GET_COL_SOURCE;
+ flags |= ZFS_ITER_RECVD_PROPS;
+ i = ZFS_GET_NCOLS;
+ break;
+ default:
+ (void) fprintf(stderr,
+ gettext("invalid column name "
+ "'%s'\n"), value);
+ usage(B_FALSE);
+ }
+ }
+ break;
+
+ case 's':
+ cb.cb_sources = 0;
+ while (*optarg != '\0') {
+ static char *source_subopts[] = {
+ "local", "default", "inherited",
+ "received", "temporary", "none",
+ NULL };
+
+ switch (getsubopt(&optarg, source_subopts,
+ &value)) {
+ case 0:
+ cb.cb_sources |= ZPROP_SRC_LOCAL;
+ break;
+ case 1:
+ cb.cb_sources |= ZPROP_SRC_DEFAULT;
+ break;
+ case 2:
+ cb.cb_sources |= ZPROP_SRC_INHERITED;
+ break;
+ case 3:
+ cb.cb_sources |= ZPROP_SRC_RECEIVED;
+ break;
+ case 4:
+ cb.cb_sources |= ZPROP_SRC_TEMPORARY;
+ break;
+ case 5:
+ cb.cb_sources |= ZPROP_SRC_NONE;
+ break;
+ default:
+ (void) fprintf(stderr,
+ gettext("invalid source "
+ "'%s'\n"), value);
+ usage(B_FALSE);
+ }
+ }
+ break;
+
+ case 't':
+ types = 0;
+ flags &= ~ZFS_ITER_PROP_LISTSNAPS;
+ while (*optarg != '\0') {
+ static char *type_subopts[] = { "filesystem",
+ "volume", "snapshot", "snap", "bookmark",
+ "all", NULL };
+
+ switch (getsubopt(&optarg, type_subopts,
+ &value)) {
+ case 0:
+ types |= ZFS_TYPE_FILESYSTEM;
+ break;
+ case 1:
+ types |= ZFS_TYPE_VOLUME;
+ break;
+ case 2:
+ case 3:
+ types |= ZFS_TYPE_SNAPSHOT;
+ break;
+ case 4:
+ types |= ZFS_TYPE_BOOKMARK;
+ break;
+ case 5:
+ types = ZFS_TYPE_DATASET |
+ ZFS_TYPE_BOOKMARK;
+ break;
+
+ default:
+ (void) fprintf(stderr,
+ gettext("invalid type '%s'\n"),
+ value);
+ usage(B_FALSE);
+ }
+ }
+ break;
+
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing property "
+ "argument\n"));
+ usage(B_FALSE);
+ }
+
+ fields = argv[0];
+
+ /*
+ * Handle users who want to get all snapshots or bookmarks
+ * of a dataset (ex. 'zfs get -t snapshot refer <dataset>').
+ */
+ if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) &&
+ argc > 1 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) {
+ flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE);
+ limit = 1;
+ }
+
+ if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
+ != 0)
+ usage(B_FALSE);
+
+ argc--;
+ argv++;
+
+ /*
+ * As part of zfs_expand_proplist(), we keep track of the maximum column
+ * width for each property. For the 'NAME' (and 'SOURCE') columns, we
+ * need to know the maximum name length. However, the user likely did
+ * not specify 'name' as one of the properties to fetch, so we need to
+ * make sure we always include at least this property for
+ * print_get_headers() to work properly.
+ */
+ if (cb.cb_proplist != NULL) {
+ fake_name.pl_prop = ZFS_PROP_NAME;
+ fake_name.pl_width = strlen(gettext("NAME"));
+ fake_name.pl_next = cb.cb_proplist;
+ cb.cb_proplist = &fake_name;
+ }
+
+ cb.cb_first = B_TRUE;
+
+ /* run for each object */
+ ret = zfs_for_each(argc, argv, flags, types, NULL,
+ &cb.cb_proplist, limit, get_callback, &cb);
+
+ if (cb.cb_proplist == &fake_name)
+ zprop_free_list(fake_name.pl_next);
+ else
+ zprop_free_list(cb.cb_proplist);
+
+ return (ret);
+}
+
+/*
+ * inherit [-rS] <property> <fs|vol> ...
+ *
+ * -r Recurse over all children
+ * -S Revert to received value, if any
+ *
+ * For each dataset specified on the command line, inherit the given property
+ * from its parent. Inheriting a property at the pool level will cause it to
+ * use the default value. The '-r' flag will recurse over all children, and is
+ * useful for setting a property on a hierarchy-wide basis, regardless of any
+ * local modifications for each dataset.
+ */
+
+typedef struct inherit_cbdata {
+ const char *cb_propname;
+ boolean_t cb_received;
+} inherit_cbdata_t;
+
+static int
+inherit_recurse_cb(zfs_handle_t *zhp, void *data)
+{
+ inherit_cbdata_t *cb = data;
+ zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname);
+
+ /*
+ * If we're doing it recursively, then ignore properties that
+ * are not valid for this type of dataset.
+ */
+ if (prop != ZPROP_INVAL &&
+ !zfs_prop_valid_for_type(prop, zfs_get_type(zhp), B_FALSE))
+ return (0);
+
+ return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
+}
+
+static int
+inherit_cb(zfs_handle_t *zhp, void *data)
+{
+ inherit_cbdata_t *cb = data;
+
+ return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0);
+}
+
+static int
+zfs_do_inherit(int argc, char **argv)
+{
+ int c;
+ zfs_prop_t prop;
+ inherit_cbdata_t cb = { 0 };
+ char *propname;
+ int ret = 0;
+ int flags = 0;
+ boolean_t received = B_FALSE;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "rS")) != -1) {
+ switch (c) {
+ case 'r':
+ flags |= ZFS_ITER_RECURSE;
+ break;
+ case 'S':
+ received = B_TRUE;
+ break;
+ case '?':
+ default:
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing property argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing dataset argument\n"));
+ usage(B_FALSE);
+ }
+
+ propname = argv[0];
+ argc--;
+ argv++;
+
+ if ((prop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
+ if (zfs_prop_readonly(prop)) {
+ (void) fprintf(stderr, gettext(
+ "%s property is read-only\n"),
+ propname);
+ return (1);
+ }
+ if (!zfs_prop_inheritable(prop) && !received) {
+ (void) fprintf(stderr, gettext("'%s' property cannot "
+ "be inherited\n"), propname);
+ if (prop == ZFS_PROP_QUOTA ||
+ prop == ZFS_PROP_RESERVATION ||
+ prop == ZFS_PROP_REFQUOTA ||
+ prop == ZFS_PROP_REFRESERVATION) {
+ (void) fprintf(stderr, gettext("use 'zfs set "
+ "%s=none' to clear\n"), propname);
+ (void) fprintf(stderr, gettext("use 'zfs "
+ "inherit -S %s' to revert to received "
+ "value\n"), propname);
+ }
+ return (1);
+ }
+ if (received && (prop == ZFS_PROP_VOLSIZE ||
+ prop == ZFS_PROP_VERSION)) {
+ (void) fprintf(stderr, gettext("'%s' property cannot "
+ "be reverted to a received value\n"), propname);
+ return (1);
+ }
+ } else if (!zfs_prop_user(propname)) {
+ (void) fprintf(stderr, gettext("invalid property '%s'\n"),
+ propname);
+ usage(B_FALSE);
+ }
+
+ cb.cb_propname = propname;
+ cb.cb_received = received;
+
+ if (flags & ZFS_ITER_RECURSE) {
+ ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
+ NULL, NULL, 0, inherit_recurse_cb, &cb);
+ } else {
+ ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET,
+ NULL, NULL, 0, inherit_cb, &cb);
+ }
+
+ return (ret);
+}
+
+typedef struct upgrade_cbdata {
+ uint64_t cb_numupgraded;
+ uint64_t cb_numsamegraded;
+ uint64_t cb_numfailed;
+ uint64_t cb_version;
+ boolean_t cb_newer;
+ boolean_t cb_foundone;
+ char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN];
+} upgrade_cbdata_t;
+
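+/*
+ * Return B_TRUE if the two dataset names belong to the same pool.
+ */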
+static int
+same_pool(zfs_handle_t *zhp, const char *name)
+{
+ int len1 = strcspn(name, "/@");
+ const char *zhname = zfs_get_name(zhp);
+ int len2 = strcspn(zhname, "/@");
+
+ if (len1 != len2)
+ return (B_FALSE);
+ return (strncmp(name, zhname, len1) == 0);
+}
+
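+/*
+ * Print each filesystem whose version is older (or, when cb_newer is set,
+ * newer) than the ZPL version supported by the running system.
+ */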
+static int
+upgrade_list_callback(zfs_handle_t *zhp, void *data)
+{
+ upgrade_cbdata_t *cb = data;
+ int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+
+ /* list if it's old/new */
+ if ((!cb->cb_newer && version < ZPL_VERSION) ||
+ (cb->cb_newer && version > ZPL_VERSION)) {
+ char *str;
+ if (cb->cb_newer) {
+ str = gettext("The following filesystems are "
+ "formatted using a newer software version and\n"
+ "cannot be accessed on the current system.\n\n");
+ } else {
+ str = gettext("The following filesystems are "
+ "out of date, and can be upgraded. After being\n"
+ "upgraded, these filesystems (and any 'zfs send' "
+ "streams generated from\n"
+ "subsequent snapshots) will no longer be "
+ "accessible by older software versions.\n\n");
+ }
+
+ if (!cb->cb_foundone) {
+ (void) puts(str);
+ (void) printf(gettext("VER FILESYSTEM\n"));
+ (void) printf(gettext("--- ------------\n"));
+ cb->cb_foundone = B_TRUE;
+ }
+
+ (void) printf("%2u %s\n", version, zfs_get_name(zhp));
+ }
+
+ return (0);
+}
+
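+/*
+ * Upgrade a single filesystem to cb_version, provided the containing
+ * pool's version is new enough, and count upgraded, unchanged, and
+ * failed filesystems.
+ */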
+static int
+upgrade_set_callback(zfs_handle_t *zhp, void *data)
+{
+ upgrade_cbdata_t *cb = data;
+ int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+ int needed_spa_version;
+ int spa_version;
+
+ if (zfs_spa_version(zhp, &spa_version) < 0)
+ return (-1);
+
+ needed_spa_version = zfs_spa_version_map(cb->cb_version);
+
+ if (needed_spa_version < 0)
+ return (-1);
+
+ if (spa_version < needed_spa_version) {
+ /* can't upgrade */
+ (void) printf(gettext("%s: can not be "
+ "upgraded; the pool version needs to first "
+ "be upgraded\nto version %d\n\n"),
+ zfs_get_name(zhp), needed_spa_version);
+ cb->cb_numfailed++;
+ return (0);
+ }
+
+ /* upgrade */
+ if (version < cb->cb_version) {
+ char verstr[16];
+ (void) snprintf(verstr, sizeof (verstr),
+ "%llu", (u_longlong_t)cb->cb_version);
+ if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) {
+ /*
+ * If they did "zfs upgrade -a", then we could
+ * be doing ioctls to different pools. We need
+ * to log this history once to each pool, and bypass
+ * the normal history logging that happens in main().
+ */
+ (void) zpool_log_history(g_zfs, history_str);
+ log_history = B_FALSE;
+ }
+ if (zfs_prop_set(zhp, "version", verstr) == 0)
+ cb->cb_numupgraded++;
+ else
+ cb->cb_numfailed++;
+ (void) strcpy(cb->cb_lastfs, zfs_get_name(zhp));
+ } else if (version > cb->cb_version) {
+ /* can't downgrade */
+ (void) printf(gettext("%s: can not be downgraded; "
+ "it is already at version %u\n"),
+ zfs_get_name(zhp), version);
+ cb->cb_numfailed++;
+ } else {
+ cb->cb_numsamegraded++;
+ }
+ return (0);
+}
+
+/*
+ * zfs upgrade
+ * zfs upgrade -v
+ * zfs upgrade [-r] [-V <version>] <-a | filesystem>
+ */
+static int
+zfs_do_upgrade(int argc, char **argv)
+{
+ boolean_t all = B_FALSE;
+ boolean_t showversions = B_FALSE;
+ int ret = 0;
+ upgrade_cbdata_t cb = { 0 };
+ int c;
+ int flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "rvV:a")) != -1) {
+ switch (c) {
+ case 'r':
+ flags |= ZFS_ITER_RECURSE;
+ break;
+ case 'v':
+ showversions = B_TRUE;
+ break;
+ case 'V':
+ if (zfs_prop_string_to_index(ZFS_PROP_VERSION,
+ optarg, &cb.cb_version) != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid version %s\n"), optarg);
+ usage(B_FALSE);
+ }
+ break;
+ case 'a':
+ all = B_TRUE;
+ break;
+ case '?':
+ default:
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version))
+ usage(B_FALSE);
+ if (showversions && (flags & ZFS_ITER_RECURSE || all ||
+ cb.cb_version || argc))
+ usage(B_FALSE);
+ if ((all || argc) && (showversions))
+ usage(B_FALSE);
+ if (all && argc)
+ usage(B_FALSE);
+
+ if (showversions) {
+ /* Show info on available versions. */
+ (void) printf(gettext("The following filesystem versions are "
+ "supported:\n\n"));
+ (void) printf(gettext("VER DESCRIPTION\n"));
+ (void) printf("--- -----------------------------------------"
+ "---------------\n");
+ (void) printf(gettext(" 1 Initial ZFS filesystem version\n"));
+ (void) printf(gettext(" 2 Enhanced directory entries\n"));
+ (void) printf(gettext(" 3 Case insensitive and filesystem "
+ "user identifier (FUID)\n"));
+ (void) printf(gettext(" 4 userquota, groupquota "
+ "properties\n"));
+ (void) printf(gettext(" 5 System attributes\n"));
+ (void) printf(gettext("\nFor more information on a particular "
+ "version, including supported releases,\n"));
+ (void) printf("see the ZFS Administration Guide.\n\n");
+ ret = 0;
+ } else if (argc || all) {
+ /* Upgrade filesystems */
+ if (cb.cb_version == 0)
+ cb.cb_version = ZPL_VERSION;
+ ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM,
+ NULL, NULL, 0, upgrade_set_callback, &cb);
+ (void) printf(gettext("%llu filesystems upgraded\n"),
+ (u_longlong_t)cb.cb_numupgraded);
+ if (cb.cb_numsamegraded) {
+ (void) printf(gettext("%llu filesystems already at "
+ "this version\n"),
+ (u_longlong_t)cb.cb_numsamegraded);
+ }
+ if (cb.cb_numfailed != 0)
+ ret = 1;
+ } else {
+ /* List old-version filesystems */
+ boolean_t found;
+ (void) printf(gettext("This system is currently running "
+ "ZFS filesystem version %llu.\n\n"), ZPL_VERSION);
+
+ flags |= ZFS_ITER_RECURSE;
+ ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
+ NULL, NULL, 0, upgrade_list_callback, &cb);
+
+ found = cb.cb_foundone;
+ cb.cb_foundone = B_FALSE;
+ cb.cb_newer = B_TRUE;
+
+ ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM,
+ NULL, NULL, 0, upgrade_list_callback, &cb);
+
+ if (!cb.cb_foundone && !found) {
+ (void) printf(gettext("All filesystems are "
+ "formatted with the current version.\n"));
+ }
+ }
+
+ return (ret);
+}
+
+/*
+ * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
+ * [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
+ * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
+ * [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
+ * zfs projectspace [-Hp] [-o field[,...]] [-s field [-s field]...]
+ * [-S field [-S field]...] filesystem | snapshot
+ *
+ * -H Scripted mode; elide headers and separate columns by tabs.
+ * -i Translate SID to POSIX ID.
+ * -n Print numeric ID instead of user/group name.
+ * -o Control which fields to display.
+ * -p Use exact (parsable) numeric output.
+ * -s Specify sort columns, descending order.
+ * -S Specify sort columns, ascending order.
+ * -t Control which object types to display.
+ *
+ * Displays space consumed by, and quotas on, each user in the specified
+ * filesystem or snapshot.
+ */
+
+/* us_field_types, us_field_hdr and us_field_names should be kept in sync */
+enum us_field_types {
+ USFIELD_TYPE,
+ USFIELD_NAME,
+ USFIELD_USED,
+ USFIELD_QUOTA,
+ USFIELD_OBJUSED,
+ USFIELD_OBJQUOTA
+};
+static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA",
+ "OBJUSED", "OBJQUOTA" };
+static char *us_field_names[] = { "type", "name", "used", "quota",
+ "objused", "objquota" };
+#define USFIELD_LAST (sizeof (us_field_names) / sizeof (char *))
+
+#define USTYPE_PSX_GRP (1 << 0)
+#define USTYPE_PSX_USR (1 << 1)
+#define USTYPE_SMB_GRP (1 << 2)
+#define USTYPE_SMB_USR (1 << 3)
+#define USTYPE_PROJ (1 << 4)
+#define USTYPE_ALL \
+ (USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR | \
+ USTYPE_PROJ)
+
+static int us_type_bits[] = {
+ USTYPE_PSX_GRP,
+ USTYPE_PSX_USR,
+ USTYPE_SMB_GRP,
+ USTYPE_SMB_USR,
+ USTYPE_ALL
+};
+static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup",
+ "smbuser", "all" };
+
+typedef struct us_node {
+ nvlist_t *usn_nvl;
+ uu_avl_node_t usn_avlnode;
+ uu_list_node_t usn_listnode;
+} us_node_t;
+
+typedef struct us_cbdata {
+ nvlist_t **cb_nvlp;
+ uu_avl_pool_t *cb_avl_pool;
+ uu_avl_t *cb_avl;
+ boolean_t cb_numname;
+ boolean_t cb_nicenum;
+ boolean_t cb_sid2posix;
+ zfs_userquota_prop_t cb_prop;
+ zfs_sort_column_t *cb_sortcol;
+ size_t cb_width[USFIELD_LAST];
+} us_cbdata_t;
+
+static boolean_t us_populated = B_FALSE;
+
+typedef struct {
+ zfs_sort_column_t *si_sortcol;
+ boolean_t si_numname;
+} us_sort_info_t;
+
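+/*
+ * Map an output field name to its index in us_field_names[], or return -1
+ * if the name is not recognized.
+ */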
+static int
+us_field_index(char *field)
+{
+ int i;
+
+ for (i = 0; i < USFIELD_LAST; i++) {
+ if (strcmp(field, us_field_names[i]) == 0)
+ return (i);
+ }
+
+ return (-1);
+}
+
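+/*
+ * AVL comparison function: order userspace entries according to the
+ * requested sort columns, falling back to the "smbentity" flag so that
+ * SMB and POSIX entries with the same type/name remain distinct.
+ */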
+static int
+us_compare(const void *larg, const void *rarg, void *unused)
+{
+ const us_node_t *l = larg;
+ const us_node_t *r = rarg;
+ us_sort_info_t *si = (us_sort_info_t *)unused;
+ zfs_sort_column_t *sortcol = si->si_sortcol;
+ boolean_t numname = si->si_numname;
+ nvlist_t *lnvl = l->usn_nvl;
+ nvlist_t *rnvl = r->usn_nvl;
+ int rc = 0;
+ boolean_t lvb, rvb;
+
+ for (; sortcol != NULL; sortcol = sortcol->sc_next) {
+ char *lvstr = "";
+ char *rvstr = "";
+ uint32_t lv32 = 0;
+ uint32_t rv32 = 0;
+ uint64_t lv64 = 0;
+ uint64_t rv64 = 0;
+ zfs_prop_t prop = sortcol->sc_prop;
+ const char *propname = NULL;
+ boolean_t reverse = sortcol->sc_reverse;
+
+ switch (prop) {
+ case ZFS_PROP_TYPE:
+ propname = "type";
+ (void) nvlist_lookup_uint32(lnvl, propname, &lv32);
+ (void) nvlist_lookup_uint32(rnvl, propname, &rv32);
+ if (rv32 != lv32)
+ rc = (rv32 < lv32) ? 1 : -1;
+ break;
+ case ZFS_PROP_NAME:
+ propname = "name";
+ if (numname) {
+compare_nums:
+ (void) nvlist_lookup_uint64(lnvl, propname,
+ &lv64);
+ (void) nvlist_lookup_uint64(rnvl, propname,
+ &rv64);
+ if (rv64 != lv64)
+ rc = (rv64 < lv64) ? 1 : -1;
+ } else {
+ if ((nvlist_lookup_string(lnvl, propname,
+ &lvstr) == ENOENT) ||
+ (nvlist_lookup_string(rnvl, propname,
+ &rvstr) == ENOENT)) {
+ goto compare_nums;
+ }
+ rc = strcmp(lvstr, rvstr);
+ }
+ break;
+ case ZFS_PROP_USED:
+ case ZFS_PROP_QUOTA:
+ if (!us_populated)
+ break;
+ if (prop == ZFS_PROP_USED)
+ propname = "used";
+ else
+ propname = "quota";
+ (void) nvlist_lookup_uint64(lnvl, propname, &lv64);
+ (void) nvlist_lookup_uint64(rnvl, propname, &rv64);
+ if (rv64 != lv64)
+ rc = (rv64 < lv64) ? 1 : -1;
+ break;
+
+ default:
+ break;
+ }
+
+ if (rc != 0) {
+ if (rc < 0)
+ return (reverse ? 1 : -1);
+ else
+ return (reverse ? -1 : 1);
+ }
+ }
+
+ /*
+ * If entries still seem to be the same, check if they are of the same
+ * type (smbentity is added only if we are doing SID to POSIX ID
+ * translation where we can have duplicate type/name combinations).
+ */
+ if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 &&
+ nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 &&
+ lvb != rvb)
+ return (lvb < rvb ? -1 : 1);
+
+ return (0);
+}
+
+static boolean_t
+zfs_prop_is_user(unsigned p)
+{
+ return (p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA ||
+ p == ZFS_PROP_USEROBJUSED || p == ZFS_PROP_USEROBJQUOTA);
+}
+
+static boolean_t
+zfs_prop_is_group(unsigned p)
+{
+ return (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA ||
+ p == ZFS_PROP_GROUPOBJUSED || p == ZFS_PROP_GROUPOBJQUOTA);
+}
+
+static boolean_t
+zfs_prop_is_project(unsigned p)
+{
+ return (p == ZFS_PROP_PROJECTUSED || p == ZFS_PROP_PROJECTQUOTA ||
+ p == ZFS_PROP_PROJECTOBJUSED || p == ZFS_PROP_PROJECTOBJQUOTA);
+}
+
+static inline const char *
+us_type2str(unsigned field_type)
+{
+ switch (field_type) {
+ case USTYPE_PSX_USR:
+ return ("POSIX User");
+ case USTYPE_PSX_GRP:
+ return ("POSIX Group");
+ case USTYPE_SMB_USR:
+ return ("SMB User");
+ case USTYPE_SMB_GRP:
+ return ("SMB Group");
+ case USTYPE_PROJ:
+ return ("Project");
+ default:
+ return ("Undefined");
+ }
+}
+
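+/*
+ * Callback invoked by zfs_userspace() for each {domain, id, space} tuple.
+ * Resolves the ID to a name (optionally translating SIDs to POSIX IDs),
+ * updates the column widths, and inserts or updates the corresponding
+ * node in the AVL tree.
+ */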
+static int
+userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
+{
+ us_cbdata_t *cb = (us_cbdata_t *)arg;
+ zfs_userquota_prop_t prop = cb->cb_prop;
+ char *name = NULL;
+ char *propname;
+ char sizebuf[32];
+ us_node_t *node;
+ uu_avl_pool_t *avl_pool = cb->cb_avl_pool;
+ uu_avl_t *avl = cb->cb_avl;
+ uu_avl_index_t idx;
+ nvlist_t *props;
+ us_node_t *n;
+ zfs_sort_column_t *sortcol = cb->cb_sortcol;
+ unsigned type = 0;
+ const char *typestr;
+ size_t namelen;
+ size_t typelen;
+ size_t sizelen;
+ int typeidx, nameidx, sizeidx;
+ us_sort_info_t sortinfo = { sortcol, cb->cb_numname };
+ boolean_t smbentity = B_FALSE;
+
+ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+ node = safe_malloc(sizeof (us_node_t));
+ uu_avl_node_init(node, &node->usn_avlnode, avl_pool);
+ node->usn_nvl = props;
+
+ if (domain != NULL && domain[0] != '\0') {
+#ifdef HAVE_IDMAP
+ /* SMB */
+ char sid[MAXNAMELEN + 32];
+ uid_t id;
+ uint64_t classes;
+ int err;
+ directory_error_t e;
+
+ smbentity = B_TRUE;
+
+ (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid);
+
+ if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
+ type = USTYPE_SMB_GRP;
+ err = sid_to_id(sid, B_FALSE, &id);
+ } else {
+ type = USTYPE_SMB_USR;
+ err = sid_to_id(sid, B_TRUE, &id);
+ }
+
+ if (err == 0) {
+ rid = id;
+ if (!cb->cb_sid2posix) {
+ e = directory_name_from_sid(NULL, sid, &name,
+ &classes);
+ if (e != NULL)
+ directory_error_free(e);
+ if (name == NULL)
+ name = sid;
+ }
+ }
+#else
+ nvlist_free(props);
+ free(node);
+
+ return (-1);
+#endif /* HAVE_IDMAP */
+ }
+
+ if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') {
+ /* POSIX or -i */
+ if (zfs_prop_is_group(prop)) {
+ type = USTYPE_PSX_GRP;
+ if (!cb->cb_numname) {
+ struct group *g;
+
+ if ((g = getgrgid(rid)) != NULL)
+ name = g->gr_name;
+ }
+ } else if (zfs_prop_is_user(prop)) {
+ type = USTYPE_PSX_USR;
+ if (!cb->cb_numname) {
+ struct passwd *p;
+
+ if ((p = getpwuid(rid)) != NULL)
+ name = p->pw_name;
+ }
+ } else {
+ type = USTYPE_PROJ;
+ }
+ }
+
+ /*
+ * Make sure that the type/name combination is unique when doing
+ * SID to POSIX ID translation (hence changing the type from SMB to
+ * POSIX).
+ */
+ if (cb->cb_sid2posix &&
+ nvlist_add_boolean_value(props, "smbentity", smbentity) != 0)
+ nomem();
+
+ /* Calculate/update width of TYPE field */
+ typestr = us_type2str(type);
+ typelen = strlen(gettext(typestr));
+ typeidx = us_field_index("type");
+ if (typelen > cb->cb_width[typeidx])
+ cb->cb_width[typeidx] = typelen;
+ if (nvlist_add_uint32(props, "type", type) != 0)
+ nomem();
+
+ /* Calculate/update width of NAME field */
+ if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) {
+ if (nvlist_add_uint64(props, "name", rid) != 0)
+ nomem();
+ namelen = snprintf(NULL, 0, "%u", rid);
+ } else {
+ if (nvlist_add_string(props, "name", name) != 0)
+ nomem();
+ namelen = strlen(name);
+ }
+ nameidx = us_field_index("name");
+ if (nameidx >= 0 && namelen > cb->cb_width[nameidx])
+ cb->cb_width[nameidx] = namelen;
+
+ /*
+ * Check if this type/name combination is in the list and update it;
+ * otherwise add a new node to the list.
+ */
+ if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) {
+ uu_avl_insert(avl, node, idx);
+ } else {
+ nvlist_free(props);
+ free(node);
+ node = n;
+ props = node->usn_nvl;
+ }
+
+ /* Calculate/update width of USED/QUOTA fields */
+ if (cb->cb_nicenum) {
+ if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED ||
+ prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA ||
+ prop == ZFS_PROP_PROJECTUSED ||
+ prop == ZFS_PROP_PROJECTQUOTA) {
+ zfs_nicebytes(space, sizebuf, sizeof (sizebuf));
+ } else {
+ zfs_nicenum(space, sizebuf, sizeof (sizebuf));
+ }
+ } else {
+ (void) snprintf(sizebuf, sizeof (sizebuf), "%llu",
+ (u_longlong_t)space);
+ }
+ sizelen = strlen(sizebuf);
+ if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED ||
+ prop == ZFS_PROP_PROJECTUSED) {
+ propname = "used";
+ if (!nvlist_exists(props, "quota"))
+ (void) nvlist_add_uint64(props, "quota", 0);
+ } else if (prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA ||
+ prop == ZFS_PROP_PROJECTQUOTA) {
+ propname = "quota";
+ if (!nvlist_exists(props, "used"))
+ (void) nvlist_add_uint64(props, "used", 0);
+ } else if (prop == ZFS_PROP_USEROBJUSED ||
+ prop == ZFS_PROP_GROUPOBJUSED || prop == ZFS_PROP_PROJECTOBJUSED) {
+ propname = "objused";
+ if (!nvlist_exists(props, "objquota"))
+ (void) nvlist_add_uint64(props, "objquota", 0);
+ } else if (prop == ZFS_PROP_USEROBJQUOTA ||
+ prop == ZFS_PROP_GROUPOBJQUOTA ||
+ prop == ZFS_PROP_PROJECTOBJQUOTA) {
+ propname = "objquota";
+ if (!nvlist_exists(props, "objused"))
+ (void) nvlist_add_uint64(props, "objused", 0);
+ } else {
+ return (-1);
+ }
+ sizeidx = us_field_index(propname);
+ if (sizeidx >= 0 && sizelen > cb->cb_width[sizeidx])
+ cb->cb_width[sizeidx] = sizelen;
+
+ if (nvlist_add_uint64(props, propname, space) != 0)
+ nomem();
+
+ return (0);
+}
+
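+/*
+ * Print a single userspace entry, formatting each requested field
+ * according to the computed column widths.
+ */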
+static void
+print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types,
+ size_t *width, us_node_t *node)
+{
+ nvlist_t *nvl = node->usn_nvl;
+ char valstr[MAXNAMELEN];
+ boolean_t first = B_TRUE;
+ int cfield = 0;
+ int field;
+ uint32_t ustype;
+
+ /* Check type */
+ (void) nvlist_lookup_uint32(nvl, "type", &ustype);
+ if (!(ustype & types))
+ return;
+
+ while ((field = fields[cfield]) != USFIELD_LAST) {
+ nvpair_t *nvp = NULL;
+ data_type_t type;
+ uint32_t val32;
+ uint64_t val64;
+ char *strval = "-";
+
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ if (strcmp(nvpair_name(nvp),
+ us_field_names[field]) == 0)
+ break;
+ }
+
+ type = nvp == NULL ? DATA_TYPE_UNKNOWN : nvpair_type(nvp);
+ switch (type) {
+ case DATA_TYPE_UINT32:
+ (void) nvpair_value_uint32(nvp, &val32);
+ break;
+ case DATA_TYPE_UINT64:
+ (void) nvpair_value_uint64(nvp, &val64);
+ break;
+ case DATA_TYPE_STRING:
+ (void) nvpair_value_string(nvp, &strval);
+ break;
+ case DATA_TYPE_UNKNOWN:
+ break;
+ default:
+ (void) fprintf(stderr, "invalid data type\n");
+ }
+
+ switch (field) {
+ case USFIELD_TYPE:
+ if (type == DATA_TYPE_UINT32)
+ strval = (char *)us_type2str(val32);
+ break;
+ case USFIELD_NAME:
+ if (type == DATA_TYPE_UINT64) {
+ (void) sprintf(valstr, "%llu",
+ (u_longlong_t)val64);
+ strval = valstr;
+ }
+ break;
+ case USFIELD_USED:
+ case USFIELD_QUOTA:
+ if (type == DATA_TYPE_UINT64) {
+ if (parsable) {
+ (void) sprintf(valstr, "%llu",
+ (u_longlong_t)val64);
+ strval = valstr;
+ } else if (field == USFIELD_QUOTA &&
+ val64 == 0) {
+ strval = "none";
+ } else {
+ zfs_nicebytes(val64, valstr,
+ sizeof (valstr));
+ strval = valstr;
+ }
+ }
+ break;
+ case USFIELD_OBJUSED:
+ case USFIELD_OBJQUOTA:
+ if (type == DATA_TYPE_UINT64) {
+ if (parsable) {
+ (void) sprintf(valstr, "%llu",
+ (u_longlong_t)val64);
+ strval = valstr;
+ } else if (field == USFIELD_OBJQUOTA &&
+ val64 == 0) {
+ strval = "none";
+ } else {
+ zfs_nicenum(val64, valstr,
+ sizeof (valstr));
+ strval = valstr;
+ }
+ }
+ break;
+ }
+
+ if (!first) {
+ if (scripted)
+ (void) printf("\t");
+ else
+ (void) printf(" ");
+ }
+ if (scripted)
+ (void) printf("%s", strval);
+ else if (field == USFIELD_TYPE || field == USFIELD_NAME)
+ (void) printf("%-*s", (int)width[field], strval);
+ else
+ (void) printf("%*s", (int)width[field], strval);
+
+ first = B_FALSE;
+ cfield++;
+ }
+
+ (void) printf("\n");
+}
+
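+/*
+ * Print the optional header line followed by every node in the sorted
+ * AVL tree, freeing each node's nvlist when rmnode is set.
+ */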
+static void
+print_us(boolean_t scripted, boolean_t parsable, int *fields, int types,
+ size_t *width, boolean_t rmnode, uu_avl_t *avl)
+{
+ us_node_t *node;
+ const char *col;
+ int cfield = 0;
+ int field;
+
+ if (!scripted) {
+ boolean_t first = B_TRUE;
+
+ while ((field = fields[cfield]) != USFIELD_LAST) {
+ col = gettext(us_field_hdr[field]);
+ if (field == USFIELD_TYPE || field == USFIELD_NAME) {
+ (void) printf(first ? "%-*s" : " %-*s",
+ (int)width[field], col);
+ } else {
+ (void) printf(first ? "%*s" : " %*s",
+ (int)width[field], col);
+ }
+ first = B_FALSE;
+ cfield++;
+ }
+ (void) printf("\n");
+ }
+
+ for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) {
+ print_us_node(scripted, parsable, fields, types, width, node);
+ if (rmnode)
+ nvlist_free(node->usn_nvl);
+ }
+}
+
+static int
+zfs_do_userspace(int argc, char **argv)
+{
+ zfs_handle_t *zhp;
+ zfs_userquota_prop_t p;
+ uu_avl_pool_t *avl_pool;
+ uu_avl_t *avl_tree;
+ uu_avl_walk_t *walk;
+ char *delim;
+ char deffields[] = "type,name,used,quota,objused,objquota";
+ char *ofield = NULL;
+ char *tfield = NULL;
+ int cfield = 0;
+ int fields[256];
+ int i;
+ boolean_t scripted = B_FALSE;
+ boolean_t prtnum = B_FALSE;
+ boolean_t parsable = B_FALSE;
+ boolean_t sid2posix = B_FALSE;
+ int ret = 0;
+ int c;
+ zfs_sort_column_t *sortcol = NULL;
+ int types = USTYPE_PSX_USR | USTYPE_SMB_USR;
+ us_cbdata_t cb;
+ us_node_t *node;
+ us_node_t *rmnode;
+ uu_list_pool_t *listpool;
+ uu_list_t *list;
+ uu_avl_index_t idx = 0;
+ uu_list_index_t idx2 = 0;
+
+ if (argc < 2)
+ usage(B_FALSE);
+
+ if (strcmp(argv[0], "groupspace") == 0) {
+ /* Toggle default group types */
+ types = USTYPE_PSX_GRP | USTYPE_SMB_GRP;
+ } else if (strcmp(argv[0], "projectspace") == 0) {
+ types = USTYPE_PROJ;
+ prtnum = B_TRUE;
+ }
+
+ while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) {
+ switch (c) {
+ case 'n':
+ if (types == USTYPE_PROJ) {
+ (void) fprintf(stderr,
+ gettext("invalid option 'n'\n"));
+ usage(B_FALSE);
+ }
+ prtnum = B_TRUE;
+ break;
+ case 'H':
+ scripted = B_TRUE;
+ break;
+ case 'p':
+ parsable = B_TRUE;
+ break;
+ case 'o':
+ ofield = optarg;
+ break;
+ case 's':
+ case 'S':
+ if (zfs_add_sort_column(&sortcol, optarg,
+ c == 's' ? B_FALSE : B_TRUE) != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid field '%s'\n"), optarg);
+ usage(B_FALSE);
+ }
+ break;
+ case 't':
+ if (types == USTYPE_PROJ) {
+ (void) fprintf(stderr,
+ gettext("invalid option 't'\n"));
+ usage(B_FALSE);
+ }
+ tfield = optarg;
+ break;
+ case 'i':
+ if (types == USTYPE_PROJ) {
+ (void) fprintf(stderr,
+ gettext("invalid option 'i'\n"));
+ usage(B_FALSE);
+ }
+ sid2posix = B_TRUE;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing dataset name\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ /* Use default output fields if not specified using -o */
+ if (ofield == NULL)
+ ofield = deffields;
+ do {
+ if ((delim = strchr(ofield, ',')) != NULL)
+ *delim = '\0';
+ if ((fields[cfield++] = us_field_index(ofield)) == -1) {
+ (void) fprintf(stderr, gettext("invalid type '%s' "
+ "for -o option\n"), ofield);
+ return (-1);
+ }
+ if (delim != NULL)
+ ofield = delim + 1;
+ } while (delim != NULL);
+ fields[cfield] = USFIELD_LAST;
+
+ /* Override output types (-t option) */
+ if (tfield != NULL) {
+ types = 0;
+
+ do {
+ boolean_t found = B_FALSE;
+
+ if ((delim = strchr(tfield, ',')) != NULL)
+ *delim = '\0';
+ for (i = 0; i < sizeof (us_type_bits) / sizeof (int);
+ i++) {
+ if (strcmp(tfield, us_type_names[i]) == 0) {
+ found = B_TRUE;
+ types |= us_type_bits[i];
+ break;
+ }
+ }
+ if (!found) {
+ (void) fprintf(stderr, gettext("invalid type "
+ "'%s' for -t option\n"), tfield);
+ return (-1);
+ }
+ if (delim != NULL)
+ tfield = delim + 1;
+ } while (delim != NULL);
+ }
+
+ if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM |
+ ZFS_TYPE_SNAPSHOT)) == NULL)
+ return (1);
+ if (zhp->zfs_head_type != ZFS_TYPE_FILESYSTEM) {
+ (void) fprintf(stderr, gettext("operation is only applicable "
+ "to filesystems and their snapshots\n"));
+ zfs_close(zhp);
+ return (1);
+ }
+
+ if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t),
+ offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL)
+ nomem();
+ if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
+ nomem();
+
+ /* Always add default sorting columns */
+ (void) zfs_add_sort_column(&sortcol, "type", B_FALSE);
+ (void) zfs_add_sort_column(&sortcol, "name", B_FALSE);
+
+ cb.cb_sortcol = sortcol;
+ cb.cb_numname = prtnum;
+ cb.cb_nicenum = !parsable;
+ cb.cb_avl_pool = avl_pool;
+ cb.cb_avl = avl_tree;
+ cb.cb_sid2posix = sid2posix;
+
+ for (i = 0; i < USFIELD_LAST; i++)
+ cb.cb_width[i] = strlen(gettext(us_field_hdr[i]));
+
+ for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
+ if ((zfs_prop_is_user(p) &&
+ !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) ||
+ (zfs_prop_is_group(p) &&
+ !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))) ||
+ (zfs_prop_is_project(p) && types != USTYPE_PROJ))
+ continue;
+
+ cb.cb_prop = p;
+ if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) {
+ zfs_close(zhp);
+ return (ret);
+ }
+ }
+ zfs_close(zhp);
+
+ /* Sort the list */
+ if ((node = uu_avl_first(avl_tree)) == NULL)
+ return (0);
+
+ us_populated = B_TRUE;
+
+ listpool = uu_list_pool_create("tmplist", sizeof (us_node_t),
+ offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT);
+ list = uu_list_create(listpool, NULL, UU_DEFAULT);
+ uu_list_node_init(node, &node->usn_listnode, listpool);
+
+ while (node != NULL) {
+ rmnode = node;
+ node = uu_avl_next(avl_tree, node);
+ uu_avl_remove(avl_tree, rmnode);
+ if (uu_list_find(list, rmnode, NULL, &idx2) == NULL)
+ uu_list_insert(list, rmnode, idx2);
+ }
+
+ for (node = uu_list_first(list); node != NULL;
+ node = uu_list_next(list, node)) {
+ us_sort_info_t sortinfo = { sortcol, cb.cb_numname };
+
+ if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL)
+ uu_avl_insert(avl_tree, node, idx);
+ }
+
+ uu_list_destroy(list);
+ uu_list_pool_destroy(listpool);
+
+ /* Print and free node nvlist memory */
+ print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE,
+ cb.cb_avl);
+
+ zfs_free_sort_columns(sortcol);
+
+ /* Clean up the AVL tree */
+ if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
+ nomem();
+
+ while ((node = uu_avl_walk_next(walk)) != NULL) {
+ uu_avl_remove(cb.cb_avl, node);
+ free(node);
+ }
+
+ uu_avl_walk_end(walk);
+ uu_avl_destroy(avl_tree);
+ uu_avl_pool_destroy(avl_pool);
+
+ return (ret);
+}
+
+/*
+ * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property]
+ * [-t type[,...]] [filesystem|volume|snapshot] ...
+ *
+ * -H Scripted mode; elide headers and separate columns by tabs
+ * -p Display values in parsable (literal) format.
+ * -r Recurse over all children
+ * -d Limit recursion by depth.
+ * -o Control which fields to display.
+ * -s Specify sort columns, descending order.
+ * -S Specify sort columns, ascending order.
+ * -t Control which object types to display.
+ *
+ * When given no arguments, list all filesystems in the system.
+ * Otherwise, list the specified datasets, optionally recursing down them if
+ * '-r' is specified.
+ */
+typedef struct list_cbdata {
+ boolean_t cb_first;
+ boolean_t cb_literal;
+ boolean_t cb_scripted;
+ zprop_list_t *cb_proplist;
+} list_cbdata_t;
+
+/*
+ * Given a list of columns to display, output appropriate headers for each one.
+ */
+static void
+print_header(list_cbdata_t *cb)
+{
+ zprop_list_t *pl = cb->cb_proplist;
+ char headerbuf[ZFS_MAXPROPLEN];
+ const char *header;
+ int i;
+ boolean_t first = B_TRUE;
+ boolean_t right_justify;
+
+ for (; pl != NULL; pl = pl->pl_next) {
+ if (!first) {
+ (void) printf(" ");
+ } else {
+ first = B_FALSE;
+ }
+
+ right_justify = B_FALSE;
+ if (pl->pl_prop != ZPROP_INVAL) {
+ header = zfs_prop_column_name(pl->pl_prop);
+ right_justify = zfs_prop_align_right(pl->pl_prop);
+ } else {
+ for (i = 0; pl->pl_user_prop[i] != '\0'; i++)
+ headerbuf[i] = toupper(pl->pl_user_prop[i]);
+ headerbuf[i] = '\0';
+ header = headerbuf;
+ }
+
+ if (pl->pl_next == NULL && !right_justify)
+ (void) printf("%s", header);
+ else if (right_justify)
+ (void) printf("%*s", (int)pl->pl_width, header);
+ else
+ (void) printf("%-*s", (int)pl->pl_width, header);
+ }
+
+ (void) printf("\n");
+}
+
+/*
+ * Given a dataset and a list of fields, print out all the properties according
+ * to the described layout.
+ */
+static void
+print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb)
+{
+ zprop_list_t *pl = cb->cb_proplist;
+ boolean_t first = B_TRUE;
+ char property[ZFS_MAXPROPLEN];
+ nvlist_t *userprops = zfs_get_user_props(zhp);
+ nvlist_t *propval;
+ char *propstr;
+ boolean_t right_justify;
+
+ for (; pl != NULL; pl = pl->pl_next) {
+ if (!first) {
+ if (cb->cb_scripted)
+ (void) printf("\t");
+ else
+ (void) printf(" ");
+ } else {
+ first = B_FALSE;
+ }
+
+ if (pl->pl_prop == ZFS_PROP_NAME) {
+ (void) strlcpy(property, zfs_get_name(zhp),
+ sizeof (property));
+ propstr = property;
+ right_justify = zfs_prop_align_right(pl->pl_prop);
+ } else if (pl->pl_prop != ZPROP_INVAL) {
+ if (zfs_prop_get(zhp, pl->pl_prop, property,
+ sizeof (property), NULL, NULL, 0,
+ cb->cb_literal) != 0)
+ propstr = "-";
+ else
+ propstr = property;
+ right_justify = zfs_prop_align_right(pl->pl_prop);
+ } else if (zfs_prop_userquota(pl->pl_user_prop)) {
+ if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
+ property, sizeof (property), cb->cb_literal) != 0)
+ propstr = "-";
+ else
+ propstr = property;
+ right_justify = B_TRUE;
+ } else if (zfs_prop_written(pl->pl_user_prop)) {
+ if (zfs_prop_get_written(zhp, pl->pl_user_prop,
+ property, sizeof (property), cb->cb_literal) != 0)
+ propstr = "-";
+ else
+ propstr = property;
+ right_justify = B_TRUE;
+ } else {
+ if (nvlist_lookup_nvlist(userprops,
+ pl->pl_user_prop, &propval) != 0)
+ propstr = "-";
+ else
+ verify(nvlist_lookup_string(propval,
+ ZPROP_VALUE, &propstr) == 0);
+ right_justify = B_FALSE;
+ }
+
+ /*
+ * If this is being called in scripted mode, or if this is the
+ * last column and it is left-justified, don't include a width
+ * format specifier.
+ */
+ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify))
+ (void) printf("%s", propstr);
+ else if (right_justify)
+ (void) printf("%*s", (int)pl->pl_width, propstr);
+ else
+ (void) printf("%-*s", (int)pl->pl_width, propstr);
+ }
+
+ (void) printf("\n");
+}
+
+/*
+ * Generic callback function to list a dataset or snapshot.
+ */
+static int
+list_callback(zfs_handle_t *zhp, void *data)
+{
+ list_cbdata_t *cbp = data;
+
+ if (cbp->cb_first) {
+ if (!cbp->cb_scripted)
+ print_header(cbp);
+ cbp->cb_first = B_FALSE;
+ }
+
+ print_dataset(zhp, cbp);
+
+ return (0);
+}
+
+static int
+zfs_do_list(int argc, char **argv)
+{
+ int c;
+ static char default_fields[] =
+ "name,used,available,referenced,mountpoint";
+ int types = ZFS_TYPE_DATASET;
+ boolean_t types_specified = B_FALSE;
+ char *fields = NULL;
+ list_cbdata_t cb = { 0 };
+ char *value;
+ int limit = 0;
+ int ret = 0;
+ zfs_sort_column_t *sortcol = NULL;
+ int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) {
+ switch (c) {
+ case 'o':
+ fields = optarg;
+ break;
+ case 'p':
+ cb.cb_literal = B_TRUE;
+ flags |= ZFS_ITER_LITERAL_PROPS;
+ break;
+ case 'd':
+ limit = parse_depth(optarg, &flags);
+ break;
+ case 'r':
+ flags |= ZFS_ITER_RECURSE;
+ break;
+ case 'H':
+ cb.cb_scripted = B_TRUE;
+ break;
+ case 's':
+ if (zfs_add_sort_column(&sortcol, optarg,
+ B_FALSE) != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid property '%s'\n"), optarg);
+ usage(B_FALSE);
+ }
+ break;
+ case 'S':
+ if (zfs_add_sort_column(&sortcol, optarg,
+ B_TRUE) != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid property '%s'\n"), optarg);
+ usage(B_FALSE);
+ }
+ break;
+ case 't':
+ types = 0;
+ types_specified = B_TRUE;
+ flags &= ~ZFS_ITER_PROP_LISTSNAPS;
+ while (*optarg != '\0') {
+ static char *type_subopts[] = { "filesystem",
+ "volume", "snapshot", "snap", "bookmark",
+ "all", NULL };
+
+ switch (getsubopt(&optarg, type_subopts,
+ &value)) {
+ case 0:
+ types |= ZFS_TYPE_FILESYSTEM;
+ break;
+ case 1:
+ types |= ZFS_TYPE_VOLUME;
+ break;
+ case 2:
+ case 3:
+ types |= ZFS_TYPE_SNAPSHOT;
+ break;
+ case 4:
+ types |= ZFS_TYPE_BOOKMARK;
+ break;
+ case 5:
+ types = ZFS_TYPE_DATASET |
+ ZFS_TYPE_BOOKMARK;
+ break;
+ default:
+ (void) fprintf(stderr,
+ gettext("invalid type '%s'\n"),
+ value);
+ usage(B_FALSE);
+ }
+ }
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (fields == NULL)
+ fields = default_fields;
+
+ /*
+ * If we are only going to list snapshot names and sort by name,
+ * then we can use a faster version.
+ */
+ if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol))
+ flags |= ZFS_ITER_SIMPLE;
+
+ /*
+ * If "-o space" and no types were specified, don't display snapshots.
+ */
+ if (strcmp(fields, "space") == 0 && types_specified == B_FALSE)
+ types &= ~ZFS_TYPE_SNAPSHOT;
+
+ /*
+ * Handle users who want to list all snapshots or bookmarks
+ * of the current dataset (ex. 'zfs list -t snapshot <dataset>').
+ */
+ if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) &&
+ argc > 0 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) {
+ flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE);
+ limit = 1;
+ }
+
+ /*
+ * If the user specifies '-o all', zprop_get_list() doesn't
+ * normally include the name of the dataset. For 'zfs list', we always
+ * want this property to be first.
+ */
+ if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET)
+ != 0)
+ usage(B_FALSE);
+
+ cb.cb_first = B_TRUE;
+
+ ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist,
+ limit, list_callback, &cb);
+
+ zprop_free_list(cb.cb_proplist);
+ zfs_free_sort_columns(sortcol);
+
+ if (ret == 0 && cb.cb_first && !cb.cb_scripted)
+ (void) fprintf(stderr, gettext("no datasets available\n"));
+
+ return (ret);
+}
+
+/*
+ * zfs rename [-fu] <fs | snap | vol> <fs | snap | vol>
+ * zfs rename [-f] -p <fs | vol> <fs | vol>
+ * zfs rename [-u] -r <snap> <snap>
+ *
+ * Renames the given dataset to another of the same type.
+ *
+ * The '-p' flag creates all the non-existing ancestors of the target first.
+ * The '-u' flag prevents file systems from being remounted during rename.
+ */
+/* ARGSUSED */
+static int
+zfs_do_rename(int argc, char **argv)
+{
+ zfs_handle_t *zhp;
+ renameflags_t flags = { 0 };
+ int c;
+ int ret = 0;
+ int types;
+ boolean_t parents = B_FALSE;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "pruf")) != -1) {
+ switch (c) {
+ case 'p':
+ parents = B_TRUE;
+ break;
+ case 'r':
+ flags.recursive = B_TRUE;
+ break;
+ case 'u':
+ flags.nounmount = B_TRUE;
+ break;
+ case 'f':
+ flags.forceunmount = B_TRUE;
+ break;
+ case '?':
+ default:
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing source dataset "
+ "argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing target dataset "
+ "argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 2) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ if (flags.recursive && parents) {
+ (void) fprintf(stderr, gettext("-p and -r options are mutually "
+ "exclusive\n"));
+ usage(B_FALSE);
+ }
+
+ if (flags.nounmount && parents) {
+ (void) fprintf(stderr, gettext("-u and -p options are mutually "
+ "exclusive\n"));
+ usage(B_FALSE);
+ }
+
+ if (flags.recursive && strchr(argv[0], '@') == 0) {
+ (void) fprintf(stderr, gettext("source dataset for recursive "
+ "rename must be a snapshot\n"));
+ usage(B_FALSE);
+ }
+
+ if (flags.nounmount)
+ types = ZFS_TYPE_FILESYSTEM;
+ else if (parents)
+ types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
+ else
+ types = ZFS_TYPE_DATASET;
+
+ if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL)
+ return (1);
+
+ /* If we were asked and the name looks good, try to create ancestors. */
+ if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) &&
+ zfs_create_ancestors(g_zfs, argv[1]) != 0) {
+ zfs_close(zhp);
+ return (1);
+ }
+
+ ret = (zfs_rename(zhp, argv[1], flags) != 0);
+
+ zfs_close(zhp);
+ return (ret);
+}
+
+/*
+ * zfs promote <fs>
+ *
+ * Promotes the given clone fs to be the parent
+ */
+/* ARGSUSED */
+static int
+zfs_do_promote(int argc, char **argv)
+{
+ zfs_handle_t *zhp;
+ int ret = 0;
+
+ /* check options */
+ if (argc > 1 && argv[1][0] == '-') {
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ argv[1][1]);
+ usage(B_FALSE);
+ }
+
+ /* check number of arguments */
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing clone filesystem"
+ " argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 2) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+ if (zhp == NULL)
+ return (1);
+
+ ret = (zfs_promote(zhp) != 0);
+
+ zfs_close(zhp);
+ return (ret);
+}
+
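+/*
+ * zfs redact <snapshot> <bookmark> <redaction snapshot> ...
+ *
+ * Creates a redaction bookmark on <snapshot> from the list of redaction
+ * snapshots; the bookmark can later be passed to 'zfs send --redact' to
+ * generate a redacted stream.
+ */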
+static int
+zfs_do_redact(int argc, char **argv)
+{
+ char *snap = NULL;
+ char *bookname = NULL;
+ char **rsnaps = NULL;
+ int numrsnaps = 0;
+ argv++;
+ argc--;
+ if (argc < 3) {
+ (void) fprintf(stderr, gettext("too few arguments\n"));
+ usage(B_FALSE);
+ }
+
+ snap = argv[0];
+ bookname = argv[1];
+ rsnaps = argv + 2;
+ numrsnaps = argc - 2;
+
+ nvlist_t *rsnapnv = fnvlist_alloc();
+
+ for (int i = 0; i < numrsnaps; i++) {
+ fnvlist_add_boolean(rsnapnv, rsnaps[i]);
+ }
+
+ int err = lzc_redact(snap, bookname, rsnapnv);
+ fnvlist_free(rsnapnv);
+
+ switch (err) {
+ case 0:
+ break;
+ case ENOENT:
+ (void) fprintf(stderr,
+ gettext("provided snapshot %s does not exist\n"), snap);
+ break;
+ case EEXIST:
+ (void) fprintf(stderr, gettext("specified redaction bookmark "
+ "(%s) provided already exists\n"), bookname);
+ break;
+ case ENAMETOOLONG:
+ (void) fprintf(stderr, gettext("provided bookmark name cannot "
+ "be used, final name would be too long\n"));
+ break;
+ case E2BIG:
+ (void) fprintf(stderr, gettext("too many redaction snapshots "
+ "specified\n"));
+ break;
+ case EINVAL:
+ if (strchr(bookname, '#') != NULL)
+ (void) fprintf(stderr, gettext(
+ "redaction bookmark name must not contain '#'\n"));
+ else
+ (void) fprintf(stderr, gettext(
+ "redaction snapshot must be descendent of "
+ "snapshot being redacted\n"));
+ break;
+ case EALREADY:
+ (void) fprintf(stderr, gettext("attempted to redact redacted "
+ "dataset or with respect to redacted dataset\n"));
+ break;
+ case ENOTSUP:
+ (void) fprintf(stderr, gettext("redaction bookmarks feature "
+ "not enabled\n"));
+ break;
+ case EXDEV:
+ (void) fprintf(stderr, gettext("potentially invalid redaction "
+ "snapshot; full dataset names required\n"));
+ break;
+ default:
+ (void) fprintf(stderr, gettext("internal error: %s\n"),
+ strerror(errno));
+ }
+
+ return (err);
+}
+
+/*
+ * zfs rollback [-rRf] <snapshot>
+ *
+ * -r Delete any intervening snapshots before doing rollback
+ * -R Delete any snapshots and their clones
+ * -f Ignored for backwards compatibility
+ *
+ * Given a filesystem, rollback to a specific snapshot, discarding any changes
+ * since then and making it the active dataset. If more recent snapshots exist,
+ * the command will complain unless the '-r' flag is given.
+ */
+typedef struct rollback_cbdata {
+ uint64_t cb_create;
+ uint8_t cb_younger_ds_printed;
+ boolean_t cb_first;
+ int cb_doclones;
+ char *cb_target;
+ int cb_error;
+ boolean_t cb_recurse;
+} rollback_cbdata_t;
+
+static int
+rollback_check_dependent(zfs_handle_t *zhp, void *data)
+{
+ rollback_cbdata_t *cbp = data;
+
+ if (cbp->cb_first && cbp->cb_recurse) {
+ (void) fprintf(stderr, gettext("cannot rollback to "
+ "'%s': clones of previous snapshots exist\n"),
+ cbp->cb_target);
+ (void) fprintf(stderr, gettext("use '-R' to "
+ "force deletion of the following clones and "
+ "dependents:\n"));
+ cbp->cb_first = 0;
+ cbp->cb_error = 1;
+ }
+
+ (void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+
+ zfs_close(zhp);
+ return (0);
+}
+
+/*
+ * Report snapshots and bookmarks more recent than the rollback target.
+ * Without '-r' the offending snapshots and bookmarks themselves are
+ * listed; with '-r' their clones and other dependents are listed instead
+ * via rollback_check_dependent().  With '-R' the check is skipped
+ * entirely.
+ */
+static int
+rollback_check(zfs_handle_t *zhp, void *data)
+{
+ rollback_cbdata_t *cbp = data;
+ /*
+ * Max number of younger snapshots and/or bookmarks to display before
+ * we stop the iteration.
+ */
+ const uint8_t max_younger = 32;
+
+ if (cbp->cb_doclones) {
+ zfs_close(zhp);
+ return (0);
+ }
+
+ if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) {
+ if (cbp->cb_first && !cbp->cb_recurse) {
+ (void) fprintf(stderr, gettext("cannot "
+ "rollback to '%s': more recent snapshots "
+ "or bookmarks exist\n"),
+ cbp->cb_target);
+ (void) fprintf(stderr, gettext("use '-r' to "
+ "force deletion of the following "
+ "snapshots and bookmarks:\n"));
+ cbp->cb_first = 0;
+ cbp->cb_error = 1;
+ }
+
+ if (cbp->cb_recurse) {
+ if (zfs_iter_dependents(zhp, B_TRUE,
+ rollback_check_dependent, cbp) != 0) {
+ zfs_close(zhp);
+ return (-1);
+ }
+ } else {
+ (void) fprintf(stderr, "%s\n",
+ zfs_get_name(zhp));
+ cbp->cb_younger_ds_printed++;
+ }
+ }
+ zfs_close(zhp);
+
+ if (cbp->cb_younger_ds_printed == max_younger) {
+ /*
+ * This non-recursive rollback is going to fail due to the
+ * presence of snapshots and/or bookmarks that are younger than
+ * the rollback target.  We have printed some of the offending
+ * objects; stop the zfs_iter_snapshot/bookmark iteration here so
+ * we can fail fast instead of walking the rest of the younger
+ * objects.
+ */
+ (void) fprintf(stderr, gettext("Output limited to %d "
+ "snapshots/bookmarks\n"), max_younger);
+ return (-1);
+ }
+ return (0);
+}
+
+static int
+zfs_do_rollback(int argc, char **argv)
+{
+ int ret = 0;
+ int c;
+ boolean_t force = B_FALSE;
+ rollback_cbdata_t cb = { 0 };
+ zfs_handle_t *zhp, *snap;
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ char *delim;
+ uint64_t min_txg = 0;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "rRf")) != -1) {
+ switch (c) {
+ case 'r':
+ cb.cb_recurse = 1;
+ break;
+ case 'R':
+ cb.cb_recurse = 1;
+ cb.cb_doclones = 1;
+ break;
+ case 'f':
+ force = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing dataset argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ /* open the snapshot */
+ if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL)
+ return (1);
+
+ /* open the parent dataset */
+ (void) strlcpy(parentname, argv[0], sizeof (parentname));
+ verify((delim = strrchr(parentname, '@')) != NULL);
+ *delim = '\0';
+ if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) {
+ zfs_close(snap);
+ return (1);
+ }
+
+ /*
+ * Check for more recent snapshots and/or clones based on the presence
+ * of '-r' and '-R'.
+ */
+ cb.cb_target = argv[0];
+ cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
+ cb.cb_first = B_TRUE;
+ cb.cb_error = 0;
+
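+ /*
+ * Passing the target's createtxg as min_txg lets zfs_iter_snapshots()
+ * skip snapshots created before the rollback target, so the callback
+ * only sees potentially conflicting ones.
+ */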
+ if (cb.cb_create > 0)
+ min_txg = cb.cb_create;
+
+ if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb,
+ min_txg, 0)) != 0)
+ goto out;
+ if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0)
+ goto out;
+
+ if ((ret = cb.cb_error) != 0)
+ goto out;
+
+ /*
+ * Rollback parent to the given snapshot.
+ */
+ ret = zfs_rollback(zhp, snap, force);
+
+out:
+ zfs_close(snap);
+ zfs_close(zhp);
+
+ if (ret == 0)
+ return (0);
+ else
+ return (1);
+}
+
+/*
+ * zfs set property=value ... { fs | snap | vol } ...
+ *
+ * Sets the given properties for all datasets specified on the command line.
+ */
+
+static int
+set_callback(zfs_handle_t *zhp, void *data)
+{
+ nvlist_t *props = data;
+
+ if (zfs_prop_set_list(zhp, props) != 0) {
+ switch (libzfs_errno(g_zfs)) {
+ case EZFS_MOUNTFAILED:
+ (void) fprintf(stderr, gettext("property may be set "
+ "but unable to remount filesystem\n"));
+ break;
+ case EZFS_SHARENFSFAILED:
+ (void) fprintf(stderr, gettext("property may be set "
+ "but unable to reshare filesystem\n"));
+ break;
+ }
+ return (1);
+ }
+ return (0);
+}
+
+static int
+zfs_do_set(int argc, char **argv)
+{
+ nvlist_t *props = NULL;
+ int ds_start = -1; /* argv idx of first dataset arg */
+ int ret = 0;
+ int i;
+
+ /* check for options */
+ if (argc > 1 && argv[1][0] == '-') {
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ argv[1][1]);
+ usage(B_FALSE);
+ }
+
+ /* check number of arguments */
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing arguments\n"));
+ usage(B_FALSE);
+ }
+ if (argc < 3) {
+ if (strchr(argv[1], '=') == NULL) {
+ (void) fprintf(stderr, gettext("missing property=value "
+ "argument(s)\n"));
+ } else {
+ (void) fprintf(stderr, gettext("missing dataset "
+ "name(s)\n"));
+ }
+ usage(B_FALSE);
+ }
+
+ /* validate argument order: prop=val args followed by dataset args */
+ for (i = 1; i < argc; i++) {
+ if (strchr(argv[i], '=') != NULL) {
+ if (ds_start > 0) {
+ /* out-of-order prop=val argument */
+ (void) fprintf(stderr, gettext("invalid "
+ "argument order\n"));
+ usage(B_FALSE);
+ }
+ } else if (ds_start < 0) {
+ ds_start = i;
+ }
+ }
+ if (ds_start < 0) {
+ (void) fprintf(stderr, gettext("missing dataset name(s)\n"));
+ usage(B_FALSE);
+ }
+
+ /* Populate a list of property settings */
+ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+ for (i = 1; i < ds_start; i++) {
+ if (!parseprop(props, argv[i])) {
+ ret = -1;
+ goto error;
+ }
+ }
+
+ ret = zfs_for_each(argc - ds_start, argv + ds_start, 0,
+ ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props);
+
+error:
+ nvlist_free(props);
+ return (ret);
+}
+
+typedef struct snap_cbdata {
+ nvlist_t *sd_nvl;
+ boolean_t sd_recursive;
+ const char *sd_snapname;
+} snap_cbdata_t;
+
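+/*
+ * Add "<dataset>@<snapname>" to sd_nvl for this dataset and, when -r was
+ * given, for each descendant filesystem as well, so that all of the
+ * snapshots can be created in a single zfs_snapshot_nvl() call.
+ * Datasets that are currently inconsistent (e.g. mid-receive) are
+ * skipped during recursion.
+ */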
+static int
+zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
+{
+ snap_cbdata_t *sd = arg;
+ char *name;
+ int rv = 0;
+ int error;
+
+ if (sd->sd_recursive &&
+ zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) {
+ zfs_close(zhp);
+ return (0);
+ }
+
+ error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname);
+ if (error == -1)
+ nomem();
+ fnvlist_add_boolean(sd->sd_nvl, name);
+ free(name);
+
+ if (sd->sd_recursive)
+ rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd);
+ zfs_close(zhp);
+ return (rv);
+}
+
+/*
+ * zfs snapshot [-r] [-o prop=value] ... <fs@snap>
+ *
+ * Creates a snapshot with the given name. While functionally equivalent to
+ * 'zfs create', it is a separate command to differentiate intent.
+ */
+static int
+zfs_do_snapshot(int argc, char **argv)
+{
+ int ret = 0;
+ int c;
+ nvlist_t *props;
+ snap_cbdata_t sd = { 0 };
+ boolean_t multiple_snaps = B_FALSE;
+
+ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+ if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+
+ /* check options */
+ while ((c = getopt(argc, argv, "ro:")) != -1) {
+ switch (c) {
+ case 'o':
+ if (!parseprop(props, optarg)) {
+ nvlist_free(sd.sd_nvl);
+ nvlist_free(props);
+ return (1);
+ }
+ break;
+ case 'r':
+ sd.sd_recursive = B_TRUE;
+ multiple_snaps = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ goto usage;
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing snapshot argument\n"));
+ goto usage;
+ }
+
+ if (argc > 1)
+ multiple_snaps = B_TRUE;
+ for (; argc > 0; argc--, argv++) {
+ char *atp;
+ zfs_handle_t *zhp;
+
+ atp = strchr(argv[0], '@');
+ if (atp == NULL)
+ goto usage;
+ *atp = '\0';
+ sd.sd_snapname = atp + 1;
+ zhp = zfs_open(g_zfs, argv[0],
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+ if (zhp == NULL)
+ goto usage;
+ if (zfs_snapshot_cb(zhp, &sd) != 0)
+ goto usage;
+ }
+
+ ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props);
+ nvlist_free(sd.sd_nvl);
+ nvlist_free(props);
+ if (ret != 0 && multiple_snaps)
+ (void) fprintf(stderr, gettext("no snapshots were created\n"));
+ return (ret != 0);
+
+usage:
+ nvlist_free(sd.sd_nvl);
+ nvlist_free(props);
+ usage(B_FALSE);
+ return (-1);
+}
+
+/*
+ * Send a backup stream to stdout.
+ */
+static int
+zfs_do_send(int argc, char **argv)
+{
+ char *fromname = NULL;
+ char *toname = NULL;
+ char *resume_token = NULL;
+ char *cp;
+ zfs_handle_t *zhp;
+ sendflags_t flags = { 0 };
+ int c, err;
+ nvlist_t *dbgnv = NULL;
+ char *redactbook = NULL;
+
+ struct option long_options[] = {
+ {"replicate", no_argument, NULL, 'R'},
+ {"redact", required_argument, NULL, 'd'},
+ {"props", no_argument, NULL, 'p'},
+ {"parsable", no_argument, NULL, 'P'},
+ {"dedup", no_argument, NULL, 'D'},
+ {"verbose", no_argument, NULL, 'v'},
+ {"dryrun", no_argument, NULL, 'n'},
+ {"large-block", no_argument, NULL, 'L'},
+ {"embed", no_argument, NULL, 'e'},
+ {"resume", required_argument, NULL, 't'},
+ {"compressed", no_argument, NULL, 'c'},
+ {"raw", no_argument, NULL, 'w'},
+ {"backup", no_argument, NULL, 'b'},
+ {"holds", no_argument, NULL, 'h'},
+ {"saved", no_argument, NULL, 'S'},
+ {0, 0, 0, 0}
+ };
+
+ /* check options */
+ while ((c = getopt_long(argc, argv, ":i:I:RDpvnPLeht:cwbd:S",
+ long_options, NULL)) != -1) {
+ switch (c) {
+ case 'i':
+ if (fromname)
+ usage(B_FALSE);
+ fromname = optarg;
+ break;
+ case 'I':
+ if (fromname)
+ usage(B_FALSE);
+ fromname = optarg;
+ flags.doall = B_TRUE;
+ break;
+ case 'R':
+ flags.replicate = B_TRUE;
+ break;
+ case 'd':
+ redactbook = optarg;
+ break;
+ case 'p':
+ flags.props = B_TRUE;
+ break;
+ case 'b':
+ flags.backup = B_TRUE;
+ break;
+ case 'h':
+ flags.holds = B_TRUE;
+ break;
+ case 'P':
+ flags.parsable = B_TRUE;
+ break;
+ case 'v':
+ flags.verbosity++;
+ flags.progress = B_TRUE;
+ break;
+ case 'D':
+ (void) fprintf(stderr,
+ gettext("WARNING: deduplicated send is no "
+ "longer supported. A regular,\n"
+ "non-deduplicated stream will be generated.\n\n"));
+ break;
+ case 'n':
+ flags.dryrun = B_TRUE;
+ break;
+ case 'L':
+ flags.largeblock = B_TRUE;
+ break;
+ case 'e':
+ flags.embed_data = B_TRUE;
+ break;
+ case 't':
+ resume_token = optarg;
+ break;
+ case 'c':
+ flags.compress = B_TRUE;
+ break;
+ case 'w':
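+ /*
+ * Raw sends transmit blocks exactly as stored on disk, so
+ * they imply compressed, embedded-data, and large-block
+ * streams.
+ */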
+ flags.raw = B_TRUE;
+ flags.compress = B_TRUE;
+ flags.embed_data = B_TRUE;
+ flags.largeblock = B_TRUE;
+ break;
+ case 'S':
+ flags.saved = B_TRUE;
+ break;
+ case ':':
+ /*
+ * If a parameter was not passed, optopt contains the value
+ * that would normally lead us into the appropriate case
+ * statement.  If it's greater than UINT8_MAX, this must be a
+ * longopt and we should look at argv to get the string;
+ * otherwise it's just the short option character, so use it
+ * directly.
+ */
+ if (optopt <= UINT8_MAX) {
+ (void) fprintf(stderr,
+ gettext("missing argument for '%c' "
+ "option\n"), optopt);
+ } else {
+ (void) fprintf(stderr,
+ gettext("missing argument for '%s' "
+ "option\n"), argv[optind - 1]);
+ }
+ usage(B_FALSE);
+ break;
+ case '?':
+ /*FALLTHROUGH*/
+ default:
+ /*
+ * If an invalid flag was passed, optopt contains the
+ * character if it was a short flag, or 0 if it was a
+ * longopt.
+ */
+ if (optopt != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid option '%c'\n"), optopt);
+ } else {
+ (void) fprintf(stderr,
+ gettext("invalid option '%s'\n"),
+ argv[optind - 1]);
+
+ }
+ usage(B_FALSE);
+ }
+ }
+
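+ /* parsable (-P) output is printed by the verbose path, so imply -v */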
+ if (flags.parsable && flags.verbosity == 0)
+ flags.verbosity = 1;
+
+ argc -= optind;
+ argv += optind;
+
+ if (resume_token != NULL) {
+ if (fromname != NULL || flags.replicate || flags.props ||
+ flags.backup || flags.holds ||
+ flags.saved || redactbook != NULL) {
+ (void) fprintf(stderr,
+ gettext("invalid flags combined with -t\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 0) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+ } else {
+ if (argc < 1) {
+ (void) fprintf(stderr,
+ gettext("missing snapshot argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+ }
+
+ if (flags.saved) {
+ if (fromname != NULL || flags.replicate || flags.props ||
+ flags.doall || flags.backup ||
+ flags.holds || flags.largeblock || flags.embed_data ||
+ flags.compress || flags.raw || redactbook != NULL) {
+ (void) fprintf(stderr, gettext("incompatible flags "
+ "combined with saved send flag\n"));
+ usage(B_FALSE);
+ }
+ if (strchr(argv[0], '@') != NULL) {
+ (void) fprintf(stderr, gettext("saved send must "
+ "specify the dataset with partially-received "
+ "state\n"));
+ usage(B_FALSE);
+ }
+ }
+
+ if (flags.raw && redactbook != NULL) {
+ (void) fprintf(stderr,
+ gettext("Error: raw sends may not be redacted.\n"));
+ return (1);
+ }
+
+ if (!flags.dryrun && isatty(STDOUT_FILENO)) {
+ (void) fprintf(stderr,
+ gettext("Error: Stream can not be written to a terminal.\n"
+ "You must redirect standard output.\n"));
+ return (1);
+ }
+
+ if (flags.saved) {
+ zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET);
+ if (zhp == NULL)
+ return (1);
+
+ err = zfs_send_saved(zhp, &flags, STDOUT_FILENO,
+ resume_token);
+ zfs_close(zhp);
+ return (err != 0);
+ } else if (resume_token != NULL) {
+ return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO,
+ resume_token));
+ }
+
+ /*
+ * For everything except -R and -I, use the new, cleaner code path.
+ */
+ if (!(flags.replicate || flags.doall)) {
+ char frombuf[ZFS_MAX_DATASET_NAME_LEN];
+
+ if (fromname != NULL && (strchr(fromname, '#') == NULL &&
+ strchr(fromname, '@') == NULL)) {
+ /*
+ * Neither a bookmark nor a snapshot was specified. Print a
+ * warning, and assume snapshot.
+ */
+ (void) fprintf(stderr, "Warning: incremental source "
+ "didn't specify type, assuming snapshot. Use '@' "
+ "or '#' prefix to avoid ambiguity.\n");
+ (void) snprintf(frombuf, sizeof (frombuf), "@%s",
+ fromname);
+ fromname = frombuf;
+ }
+ if (fromname != NULL &&
+ (fromname[0] == '#' || fromname[0] == '@')) {
+ /*
+ * Incremental source name begins with # or @.
+ * Default to same fs as target.
+ */
+ char tmpbuf[ZFS_MAX_DATASET_NAME_LEN];
+ (void) strlcpy(tmpbuf, fromname, sizeof (tmpbuf));
+ (void) strlcpy(frombuf, argv[0], sizeof (frombuf));
+ cp = strchr(frombuf, '@');
+ if (cp != NULL)
+ *cp = '\0';
+ (void) strlcat(frombuf, tmpbuf, sizeof (frombuf));
+ fromname = frombuf;
+ }
+
+ zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET);
+ if (zhp == NULL)
+ return (1);
+ err = zfs_send_one(zhp, fromname, STDOUT_FILENO, &flags,
+ redactbook);
+ zfs_close(zhp);
+ return (err != 0);
+ }
+
+ if (fromname != NULL && strchr(fromname, '#')) {
+ (void) fprintf(stderr,
+ gettext("Error: multiple snapshots cannot be "
+ "sent from a bookmark.\n"));
+ return (1);
+ }
+
+ if (redactbook != NULL) {
+ (void) fprintf(stderr, gettext("Error: multiple snapshots "
+ "cannot be sent redacted.\n"));
+ return (1);
+ }
+
+ if ((cp = strchr(argv[0], '@')) == NULL) {
+ (void) fprintf(stderr, gettext("Error: "
+ "Unsupported flag with filesystem or bookmark.\n"));
+ return (1);
+ }
+ *cp = '\0';
+ toname = cp + 1;
+ zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+ if (zhp == NULL)
+ return (1);
+
+ /*
+ * If they specified the full path to the snapshot, chop off
+ * everything except the short name of the snapshot, but special
+ * case if they specify the origin.
+ */
+ if (fromname && (cp = strchr(fromname, '@')) != NULL) {
+ char origin[ZFS_MAX_DATASET_NAME_LEN];
+ zprop_source_t src;
+
+ (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN,
+ origin, sizeof (origin), &src, NULL, 0, B_FALSE);
+
+ if (strcmp(origin, fromname) == 0) {
+ fromname = NULL;
+ flags.fromorigin = B_TRUE;
+ } else {
+ *cp = '\0';
+ if (cp != fromname && strcmp(argv[0], fromname)) {
+ (void) fprintf(stderr,
+ gettext("incremental source must be "
+ "in same filesystem\n"));
+ usage(B_FALSE);
+ }
+ fromname = cp + 1;
+ if (strchr(fromname, '@') || strchr(fromname, '/')) {
+ (void) fprintf(stderr,
+ gettext("invalid incremental source\n"));
+ usage(B_FALSE);
+ }
+ }
+ }
+
+ if (flags.replicate && fromname == NULL)
+ flags.doall = B_TRUE;
+
+ err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0,
+ flags.verbosity >= 3 ? &dbgnv : NULL);
+
+ if (flags.verbosity >= 3 && dbgnv != NULL) {
+ /*
+ * dump_nvlist prints to stdout, but that's been
+ * redirected to a file. Make it print to stderr
+ * instead.
+ */
+ (void) dup2(STDERR_FILENO, STDOUT_FILENO);
+ dump_nvlist(dbgnv, 0);
+ nvlist_free(dbgnv);
+ }
+ zfs_close(zhp);
+
+ return (err != 0);
+}
+
+/*
+ * Restore a backup stream from stdin.
+ */
+static int
+zfs_do_receive(int argc, char **argv)
+{
+ int c, err = 0;
+ recvflags_t flags = { 0 };
+ boolean_t abort_resumable = B_FALSE;
+ nvlist_t *props;
+
+ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+
+ /* check options */
+ while ((c = getopt(argc, argv, ":o:x:dehMnuvFsA")) != -1) {
+ switch (c) {
+ case 'o':
+ if (!parseprop(props, optarg)) {
+ nvlist_free(props);
+ usage(B_FALSE);
+ }
+ break;
+ case 'x':
+ if (!parsepropname(props, optarg)) {
+ nvlist_free(props);
+ usage(B_FALSE);
+ }
+ break;
+ case 'd':
+ if (flags.istail) {
+ (void) fprintf(stderr, gettext("invalid option "
+ "combination: -d and -e are mutually "
+ "exclusive\n"));
+ usage(B_FALSE);
+ }
+ flags.isprefix = B_TRUE;
+ break;
+ case 'e':
+ if (flags.isprefix) {
+ (void) fprintf(stderr, gettext("invalid option "
+ "combination: -d and -e are mutually "
+ "exclusive\n"));
+ usage(B_FALSE);
+ }
+ flags.istail = B_TRUE;
+ break;
+ case 'h':
+ flags.skipholds = B_TRUE;
+ break;
+ case 'M':
+ flags.forceunmount = B_TRUE;
+ break;
+ case 'n':
+ flags.dryrun = B_TRUE;
+ break;
+ case 'u':
+ flags.nomount = B_TRUE;
+ break;
+ case 'v':
+ flags.verbose = B_TRUE;
+ break;
+ case 's':
+ flags.resumable = B_TRUE;
+ break;
+ case 'F':
+ flags.force = B_TRUE;
+ break;
+ case 'A':
+ abort_resumable = B_TRUE;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* zfs recv -e (use "tail" name) implies -d (remove dataset "head") */
+ if (flags.istail)
+ flags.isprefix = B_TRUE;
+
+ /* check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing snapshot argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ if (abort_resumable) {
+ if (flags.isprefix || flags.istail || flags.dryrun ||
+ flags.resumable || flags.nomount) {
+ (void) fprintf(stderr, gettext("invalid option\n"));
+ usage(B_FALSE);
+ }
+
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+ (void) snprintf(namebuf, sizeof (namebuf),
+ "%s/%%recv", argv[0]);
+
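+ /*
+ * Resumable receive state lives either in a hidden "<fs>/%recv"
+ * child (incremental into an existing dataset) or in the target
+ * dataset itself (a brand-new dataset); destroy whichever one
+ * holds the partially received state.
+ */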
+ if (zfs_dataset_exists(g_zfs, namebuf,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) {
+ zfs_handle_t *zhp = zfs_open(g_zfs,
+ namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+ if (zhp == NULL) {
+ nvlist_free(props);
+ return (1);
+ }
+ err = zfs_destroy(zhp, B_FALSE);
+ zfs_close(zhp);
+ } else {
+ zfs_handle_t *zhp = zfs_open(g_zfs,
+ argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+ if (zhp == NULL)
+ usage(B_FALSE);
+ if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) ||
+ zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+ NULL, 0, NULL, NULL, 0, B_TRUE) == -1) {
+ (void) fprintf(stderr,
+ gettext("'%s' does not have any "
+ "resumable receive state to abort\n"),
+ argv[0]);
+ nvlist_free(props);
+ zfs_close(zhp);
+ return (1);
+ }
+ err = zfs_destroy(zhp, B_FALSE);
+ zfs_close(zhp);
+ }
+ nvlist_free(props);
+ return (err != 0);
+ }
+
+ if (isatty(STDIN_FILENO)) {
+ (void) fprintf(stderr,
+ gettext("Error: Backup stream can not be read "
+ "from a terminal.\n"
+ "You must redirect standard input.\n"));
+ nvlist_free(props);
+ return (1);
+ }
+ err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
+ nvlist_free(props);
+
+ return (err != 0);
+}
+
+/*
+ * allow/unallow stuff
+ */
+/* copied from zfs/sys/dsl_deleg.h */
+#define ZFS_DELEG_PERM_CREATE "create"
+#define ZFS_DELEG_PERM_DESTROY "destroy"
+#define ZFS_DELEG_PERM_SNAPSHOT "snapshot"
+#define ZFS_DELEG_PERM_ROLLBACK "rollback"
+#define ZFS_DELEG_PERM_CLONE "clone"
+#define ZFS_DELEG_PERM_PROMOTE "promote"
+#define ZFS_DELEG_PERM_RENAME "rename"
+#define ZFS_DELEG_PERM_MOUNT "mount"
+#define ZFS_DELEG_PERM_SHARE "share"
+#define ZFS_DELEG_PERM_SEND "send"
+#define ZFS_DELEG_PERM_RECEIVE "receive"
+#define ZFS_DELEG_PERM_ALLOW "allow"
+#define ZFS_DELEG_PERM_USERPROP "userprop"
+#define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? */
+#define ZFS_DELEG_PERM_USERQUOTA "userquota"
+#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota"
+#define ZFS_DELEG_PERM_USERUSED "userused"
+#define ZFS_DELEG_PERM_GROUPUSED "groupused"
+#define ZFS_DELEG_PERM_USEROBJQUOTA "userobjquota"
+#define ZFS_DELEG_PERM_GROUPOBJQUOTA "groupobjquota"
+#define ZFS_DELEG_PERM_USEROBJUSED "userobjused"
+#define ZFS_DELEG_PERM_GROUPOBJUSED "groupobjused"
+
+#define ZFS_DELEG_PERM_HOLD "hold"
+#define ZFS_DELEG_PERM_RELEASE "release"
+#define ZFS_DELEG_PERM_DIFF "diff"
+#define ZFS_DELEG_PERM_BOOKMARK "bookmark"
+#define ZFS_DELEG_PERM_LOAD_KEY "load-key"
+#define ZFS_DELEG_PERM_CHANGE_KEY "change-key"
+
+#define ZFS_DELEG_PERM_PROJECTUSED "projectused"
+#define ZFS_DELEG_PERM_PROJECTQUOTA "projectquota"
+#define ZFS_DELEG_PERM_PROJECTOBJUSED "projectobjused"
+#define ZFS_DELEG_PERM_PROJECTOBJQUOTA "projectobjquota"
+
+#define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE
+
+static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = {
+ { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW },
+ { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE },
+ { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE },
+ { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY },
+ { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF},
+ { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
+ { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT },
+ { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE },
+ { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE },
+ { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
+ { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },
+ { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
+ { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND },
+ { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
+ { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
+ { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK },
+ { ZFS_DELEG_PERM_LOAD_KEY, ZFS_DELEG_NOTE_LOAD_KEY },
+ { ZFS_DELEG_PERM_CHANGE_KEY, ZFS_DELEG_NOTE_CHANGE_KEY },
+
+ { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
+ { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
+ { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
+ { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
+ { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
+ { ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_NOTE_USEROBJQUOTA },
+ { ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_NOTE_USEROBJUSED },
+ { ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_NOTE_GROUPOBJQUOTA },
+ { ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_NOTE_GROUPOBJUSED },
+ { ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_NOTE_PROJECTUSED },
+ { ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_NOTE_PROJECTQUOTA },
+ { ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_NOTE_PROJECTOBJUSED },
+ { ZFS_DELEG_PERM_PROJECTOBJQUOTA, ZFS_DELEG_NOTE_PROJECTOBJQUOTA },
+ { NULL, ZFS_DELEG_NOTE_NONE }
+};
+
+/* permission structure */
+typedef struct deleg_perm {
+ zfs_deleg_who_type_t dp_who_type;
+ const char *dp_name;
+ boolean_t dp_local;
+ boolean_t dp_descend;
+} deleg_perm_t;
+
+/* AVL tree node wrapping a single delegated permission */
+typedef struct deleg_perm_node {
+ deleg_perm_t dpn_perm;
+
+ uu_avl_node_t dpn_avl_node;
+} deleg_perm_node_t;
+
+typedef struct fs_perm fs_perm_t;
+
+/* permissions set */
+typedef struct who_perm {
+ zfs_deleg_who_type_t who_type;
+ const char *who_name; /* id */
+ char who_ug_name[256]; /* user/group name */
+ fs_perm_t *who_fsperm; /* uplink */
+
+ uu_avl_t *who_deleg_perm_avl; /* permissions */
+} who_perm_t;
+
+/* AVL tree node wrapping the permissions granted to a single "who" */
+typedef struct who_perm_node {
+ who_perm_t who_perm;
+ uu_avl_node_t who_avl_node;
+} who_perm_node_t;
+
+typedef struct fs_perm_set fs_perm_set_t;
+/* fs permissions */
+struct fs_perm {
+ const char *fsp_name;
+
+ uu_avl_t *fsp_sc_avl; /* sets,create */
+ uu_avl_t *fsp_uge_avl; /* user,group,everyone */
+
+ fs_perm_set_t *fsp_set; /* uplink */
+};
+
+/* list node wrapping the permissions of a single file system */
+typedef struct fs_perm_node {
+ fs_perm_t fspn_fsperm;
+ uu_avl_t *fspn_avl;
+
+ uu_list_node_t fspn_list_node;
+} fs_perm_node_t;
+
+/* top level structure */
+struct fs_perm_set {
+ uu_list_pool_t *fsps_list_pool;
+ uu_list_t *fsps_list; /* list of fs_perms */
+
+ uu_avl_pool_t *fsps_named_set_avl_pool;
+ uu_avl_pool_t *fsps_who_perm_avl_pool;
+ uu_avl_pool_t *fsps_deleg_perm_avl_pool;
+};
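+
+/*
+ * Overview of the structures above: a fs_perm_set_t holds a list of
+ * fs_perm_t, one per file system.  Each fs_perm_t keeps two AVL trees of
+ * who_perm_t entries (one for named sets and create-time permissions,
+ * one for user/group/everyone), and each who_perm_t in turn keeps an AVL
+ * tree of the individual deleg_perm_t permissions granted to that who.
+ */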
+
+static inline const char *
+deleg_perm_type(zfs_deleg_note_t note)
+{
+ switch (note) {
+ /* property-like permissions are reported as "other" */
+ case ZFS_DELEG_NOTE_GROUPQUOTA:
+ case ZFS_DELEG_NOTE_GROUPUSED:
+ case ZFS_DELEG_NOTE_USERPROP:
+ case ZFS_DELEG_NOTE_USERQUOTA:
+ case ZFS_DELEG_NOTE_USERUSED:
+ case ZFS_DELEG_NOTE_USEROBJQUOTA:
+ case ZFS_DELEG_NOTE_USEROBJUSED:
+ case ZFS_DELEG_NOTE_GROUPOBJQUOTA:
+ case ZFS_DELEG_NOTE_GROUPOBJUSED:
+ case ZFS_DELEG_NOTE_PROJECTUSED:
+ case ZFS_DELEG_NOTE_PROJECTQUOTA:
+ case ZFS_DELEG_NOTE_PROJECTOBJUSED:
+ case ZFS_DELEG_NOTE_PROJECTOBJQUOTA:
+ /* other */
+ return (gettext("other"));
+ default:
+ return (gettext("subcommand"));
+ }
+}
+
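+/*
+ * Relative ordering used when printing who entries: named sets first,
+ * then create-time permissions, then user, group, and everyone.
+ */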
+static int
+who_type2weight(zfs_deleg_who_type_t who_type)
+{
+ int res;
+ switch (who_type) {
+ case ZFS_DELEG_NAMED_SET_SETS:
+ case ZFS_DELEG_NAMED_SET:
+ res = 0;
+ break;
+ case ZFS_DELEG_CREATE_SETS:
+ case ZFS_DELEG_CREATE:
+ res = 1;
+ break;
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_USER:
+ res = 2;
+ break;
+ case ZFS_DELEG_GROUP_SETS:
+ case ZFS_DELEG_GROUP:
+ res = 3;
+ break;
+ case ZFS_DELEG_EVERYONE_SETS:
+ case ZFS_DELEG_EVERYONE:
+ res = 4;
+ break;
+ default:
+ res = -1;
+ }
+
+ return (res);
+}
+
+/* ARGSUSED */
+static int
+who_perm_compare(const void *larg, const void *rarg, void *unused)
+{
+ const who_perm_node_t *l = larg;
+ const who_perm_node_t *r = rarg;
+ zfs_deleg_who_type_t ltype = l->who_perm.who_type;
+ zfs_deleg_who_type_t rtype = r->who_perm.who_type;
+ int lweight = who_type2weight(ltype);
+ int rweight = who_type2weight(rtype);
+ int res = lweight - rweight;
+ if (res == 0)
+ res = strncmp(l->who_perm.who_name, r->who_perm.who_name,
+ ZFS_MAX_DELEG_NAME-1);
+
+ if (res == 0)
+ return (0);
+ if (res > 0)
+ return (1);
+ else
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+deleg_perm_compare(const void *larg, const void *rarg, void *unused)
+{
+ const deleg_perm_node_t *l = larg;
+ const deleg_perm_node_t *r = rarg;
+ int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name,
+ ZFS_MAX_DELEG_NAME-1);
+
+ if (res == 0)
+ return (0);
+
+ if (res > 0)
+ return (1);
+ else
+ return (-1);
+}
+
+static inline void
+fs_perm_set_init(fs_perm_set_t *fspset)
+{
+ bzero(fspset, sizeof (fs_perm_set_t));
+
+ if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool",
+ sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node),
+ NULL, UU_DEFAULT)) == NULL)
+ nomem();
+ if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL,
+ UU_DEFAULT)) == NULL)
+ nomem();
+
+ if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create(
+ "named_set_avl_pool", sizeof (who_perm_node_t), offsetof(
+ who_perm_node_t, who_avl_node), who_perm_compare,
+ UU_DEFAULT)) == NULL)
+ nomem();
+
+ if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create(
+ "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof(
+ who_perm_node_t, who_avl_node), who_perm_compare,
+ UU_DEFAULT)) == NULL)
+ nomem();
+
+ if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create(
+ "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof(
+ deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT))
+ == NULL)
+ nomem();
+}
+
+static inline void fs_perm_fini(fs_perm_t *);
+static inline void who_perm_fini(who_perm_t *);
+
+static inline void
+fs_perm_set_fini(fs_perm_set_t *fspset)
+{
+ fs_perm_node_t *node = uu_list_first(fspset->fsps_list);
+
+ while (node != NULL) {
+ fs_perm_node_t *next_node =
+ uu_list_next(fspset->fsps_list, node);
+ fs_perm_t *fsperm = &node->fspn_fsperm;
+ fs_perm_fini(fsperm);
+ uu_list_remove(fspset->fsps_list, node);
+ free(node);
+ node = next_node;
+ }
+
+ uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool);
+ uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool);
+ uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool);
+}
+
+static inline void
+deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type,
+ const char *name)
+{
+ deleg_perm->dp_who_type = type;
+ deleg_perm->dp_name = name;
+}
+
+static inline void
+who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm,
+ zfs_deleg_who_type_t type, const char *name)
+{
+ uu_avl_pool_t *pool;
+ pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool;
+
+ bzero(who_perm, sizeof (who_perm_t));
+
+ if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL,
+ UU_DEFAULT)) == NULL)
+ nomem();
+
+ who_perm->who_type = type;
+ who_perm->who_name = name;
+ who_perm->who_fsperm = fsperm;
+}
+
+static inline void
+who_perm_fini(who_perm_t *who_perm)
+{
+ deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl);
+
+ while (node != NULL) {
+ deleg_perm_node_t *next_node =
+ uu_avl_next(who_perm->who_deleg_perm_avl, node);
+
+ uu_avl_remove(who_perm->who_deleg_perm_avl, node);
+ free(node);
+ node = next_node;
+ }
+
+ uu_avl_destroy(who_perm->who_deleg_perm_avl);
+}
+
+static inline void
+fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname)
+{
+ uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool;
+ uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool;
+
+ bzero(fsperm, sizeof (fs_perm_t));
+
+ if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT))
+ == NULL)
+ nomem();
+
+ if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT))
+ == NULL)
+ nomem();
+
+ fsperm->fsp_set = fspset;
+ fsperm->fsp_name = fsname;
+}
+
+static inline void
+fs_perm_fini(fs_perm_t *fsperm)
+{
+ who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl);
+ while (node != NULL) {
+ who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl,
+ node);
+ who_perm_t *who_perm = &node->who_perm;
+ who_perm_fini(who_perm);
+ uu_avl_remove(fsperm->fsp_sc_avl, node);
+ free(node);
+ node = next_node;
+ }
+
+ node = uu_avl_first(fsperm->fsp_uge_avl);
+ while (node != NULL) {
+ who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl,
+ node);
+ who_perm_t *who_perm = &node->who_perm;
+ who_perm_fini(who_perm);
+ uu_avl_remove(fsperm->fsp_uge_avl, node);
+ free(node);
+ node = next_node;
+ }
+
+ uu_avl_destroy(fsperm->fsp_sc_avl);
+ uu_avl_destroy(fsperm->fsp_uge_avl);
+}
+
+static void
+set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node,
+ zfs_deleg_who_type_t who_type, const char *name, char locality)
+{
+ uu_avl_index_t idx = 0;
+
+ deleg_perm_node_t *found_node = NULL;
+ deleg_perm_t *deleg_perm = &node->dpn_perm;
+
+ deleg_perm_init(deleg_perm, who_type, name);
+
+ if ((found_node = uu_avl_find(avl, node, NULL, &idx))
+ == NULL)
+ uu_avl_insert(avl, node, idx);
+ else {
+ node = found_node;
+ deleg_perm = &node->dpn_perm;
+ }
+
+ switch (locality) {
+ case ZFS_DELEG_LOCAL:
+ deleg_perm->dp_local = B_TRUE;
+ break;
+ case ZFS_DELEG_DESCENDENT:
+ deleg_perm->dp_descend = B_TRUE;
+ break;
+ case ZFS_DELEG_NA:
+ break;
+ default:
+ assert(B_FALSE); /* invalid locality */
+ }
+}
+
+static inline int
+parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality)
+{
+ nvpair_t *nvp = NULL;
+ fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set;
+ uu_avl_t *avl = who_perm->who_deleg_perm_avl;
+ zfs_deleg_who_type_t who_type = who_perm->who_type;
+
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ const char *name = nvpair_name(nvp);
+ data_type_t type = nvpair_type(nvp);
+ uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool;
+ deleg_perm_node_t *node =
+ safe_malloc(sizeof (deleg_perm_node_t));
+
+ VERIFY(type == DATA_TYPE_BOOLEAN);
+
+ uu_avl_node_init(node, &node->dpn_avl_node, avl_pool);
+ set_deleg_perm_node(avl, node, who_type, name, locality);
+ }
+
+ return (0);
+}
+
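+/*
+ * The fsacl nvlist keys are encoded as "<who type><locality>$<name>"
+ * (see store_allow_perm() below); e.g. a key such as "ul$1000" would
+ * describe local permissions for uid 1000.  Decode each entry into the
+ * who_perm/deleg_perm AVL trees.
+ */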
+static inline int
+parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl)
+{
+ nvpair_t *nvp = NULL;
+ fs_perm_set_t *fspset = fsperm->fsp_set;
+
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ nvlist_t *nvl2 = NULL;
+ const char *name = nvpair_name(nvp);
+ uu_avl_t *avl = NULL;
+ uu_avl_pool_t *avl_pool = NULL;
+ zfs_deleg_who_type_t perm_type = name[0];
+ char perm_locality = name[1];
+ const char *perm_name = name + 3;
+ who_perm_t *who_perm = NULL;
+
+ assert('$' == name[2]);
+
+ if (nvpair_value_nvlist(nvp, &nvl2) != 0)
+ return (-1);
+
+ switch (perm_type) {
+ case ZFS_DELEG_CREATE:
+ case ZFS_DELEG_CREATE_SETS:
+ case ZFS_DELEG_NAMED_SET:
+ case ZFS_DELEG_NAMED_SET_SETS:
+ avl_pool = fspset->fsps_named_set_avl_pool;
+ avl = fsperm->fsp_sc_avl;
+ break;
+ case ZFS_DELEG_USER:
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_GROUP:
+ case ZFS_DELEG_GROUP_SETS:
+ case ZFS_DELEG_EVERYONE:
+ case ZFS_DELEG_EVERYONE_SETS:
+ avl_pool = fspset->fsps_who_perm_avl_pool;
+ avl = fsperm->fsp_uge_avl;
+ break;
+
+ default:
+ assert(!"unhandled zfs_deleg_who_type_t");
+ }
+
+ who_perm_node_t *found_node = NULL;
+ who_perm_node_t *node = safe_malloc(
+ sizeof (who_perm_node_t));
+ who_perm = &node->who_perm;
+ uu_avl_index_t idx = 0;
+
+ uu_avl_node_init(node, &node->who_avl_node, avl_pool);
+ who_perm_init(who_perm, fsperm, perm_type, perm_name);
+
+ if ((found_node = uu_avl_find(avl, node, NULL, &idx))
+ == NULL) {
+ if (avl == fsperm->fsp_uge_avl) {
+ uid_t rid = 0;
+ struct passwd *p = NULL;
+ struct group *g = NULL;
+ const char *nice_name = NULL;
+
+ switch (perm_type) {
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_USER:
+ rid = atoi(perm_name);
+ p = getpwuid(rid);
+ if (p)
+ nice_name = p->pw_name;
+ break;
+ case ZFS_DELEG_GROUP_SETS:
+ case ZFS_DELEG_GROUP:
+ rid = atoi(perm_name);
+ g = getgrgid(rid);
+ if (g)
+ nice_name = g->gr_name;
+ break;
+
+ default:
+ break;
+ }
+
+ if (nice_name != NULL) {
+ (void) strlcpy(
+ node->who_perm.who_ug_name,
+ nice_name, 256);
+ } else {
+ /* User or group unknown */
+ (void) snprintf(
+ node->who_perm.who_ug_name,
+ sizeof (node->who_perm.who_ug_name),
+ "(unknown: %d)", rid);
+ }
+ }
+
+ uu_avl_insert(avl, node, idx);
+ } else {
+ node = found_node;
+ who_perm = &node->who_perm;
+ }
+
+ assert(who_perm != NULL);
+ (void) parse_who_perm(who_perm, nvl2, perm_locality);
+ }
+
+ return (0);
+}
+
+static inline int
+parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl)
+{
+ nvpair_t *nvp = NULL;
+ uu_avl_index_t idx = 0;
+
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ nvlist_t *nvl2 = NULL;
+ const char *fsname = nvpair_name(nvp);
+ data_type_t type = nvpair_type(nvp);
+ fs_perm_t *fsperm = NULL;
+ fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t));
+ if (node == NULL)
+ nomem();
+
+ fsperm = &node->fspn_fsperm;
+
+ VERIFY(DATA_TYPE_NVLIST == type);
+
+ uu_list_node_init(node, &node->fspn_list_node,
+ fspset->fsps_list_pool);
+
+ idx = uu_list_numnodes(fspset->fsps_list);
+ fs_perm_init(fsperm, fspset, fsname);
+
+ if (nvpair_value_nvlist(nvp, &nvl2) != 0)
+ return (-1);
+
+ (void) parse_fs_perm(fsperm, nvl2);
+
+ uu_list_insert(fspset->fsps_list, node, idx);
+ }
+
+ return (0);
+}
+
+static inline const char *
+deleg_perm_comment(zfs_deleg_note_t note)
+{
+ const char *str = "";
+
+ /* subcommands */
+ switch (note) {
+ /* SUBCOMMANDS */
+ case ZFS_DELEG_NOTE_ALLOW:
+ str = gettext("Must also have the permission that is being"
+ "\n\t\t\t\tallowed");
+ break;
+ case ZFS_DELEG_NOTE_CLONE:
+ str = gettext("Must also have the 'create' ability and 'mount'"
+ "\n\t\t\t\tability in the origin file system");
+ break;
+ case ZFS_DELEG_NOTE_CREATE:
+ str = gettext("Must also have the 'mount' ability");
+ break;
+ case ZFS_DELEG_NOTE_DESTROY:
+ str = gettext("Must also have the 'mount' ability");
+ break;
+ case ZFS_DELEG_NOTE_DIFF:
+ str = gettext("Allows lookup of paths within a dataset;"
+ "\n\t\t\t\tgiven an object number. Ordinary users need this"
+ "\n\t\t\t\tin order to use zfs diff");
+ break;
+ case ZFS_DELEG_NOTE_HOLD:
+ str = gettext("Allows adding a user hold to a snapshot");
+ break;
+ case ZFS_DELEG_NOTE_MOUNT:
+ str = gettext("Allows mount/umount of ZFS datasets");
+ break;
+ case ZFS_DELEG_NOTE_PROMOTE:
+ str = gettext("Must also have the 'mount'\n\t\t\t\tand"
+ " 'promote' ability in the origin file system");
+ break;
+ case ZFS_DELEG_NOTE_RECEIVE:
+ str = gettext("Must also have the 'mount' and 'create'"
+ " ability");
+ break;
+ case ZFS_DELEG_NOTE_RELEASE:
+ str = gettext("Allows releasing a user hold which\n\t\t\t\t"
+ "might destroy the snapshot");
+ break;
+ case ZFS_DELEG_NOTE_RENAME:
+ str = gettext("Must also have the 'mount' and 'create'"
+ "\n\t\t\t\tability in the new parent");
+ break;
+ case ZFS_DELEG_NOTE_ROLLBACK:
+ str = gettext("");
+ break;
+ case ZFS_DELEG_NOTE_SEND:
+ str = gettext("");
+ break;
+ case ZFS_DELEG_NOTE_SHARE:
+ str = gettext("Allows sharing file systems over NFS or SMB"
+ "\n\t\t\t\tprotocols");
+ break;
+ case ZFS_DELEG_NOTE_SNAPSHOT:
+ str = gettext("");
+ break;
+ case ZFS_DELEG_NOTE_LOAD_KEY:
+ str = gettext("Allows loading or unloading an encryption key");
+ break;
+ case ZFS_DELEG_NOTE_CHANGE_KEY:
+ str = gettext("Allows changing or adding an encryption key");
+ break;
+/*
+ * case ZFS_DELEG_NOTE_VSCAN:
+ * str = gettext("");
+ * break;
+ */
+ /* OTHER */
+ case ZFS_DELEG_NOTE_GROUPQUOTA:
+ str = gettext("Allows accessing any groupquota@... property");
+ break;
+ case ZFS_DELEG_NOTE_GROUPUSED:
+ str = gettext("Allows reading any groupused@... property");
+ break;
+ case ZFS_DELEG_NOTE_USERPROP:
+ str = gettext("Allows changing any user property");
+ break;
+ case ZFS_DELEG_NOTE_USERQUOTA:
+ str = gettext("Allows accessing any userquota@... property");
+ break;
+ case ZFS_DELEG_NOTE_USERUSED:
+ str = gettext("Allows reading any userused@... property");
+ break;
+ case ZFS_DELEG_NOTE_USEROBJQUOTA:
+ str = gettext("Allows accessing any userobjquota@... property");
+ break;
+ case ZFS_DELEG_NOTE_GROUPOBJQUOTA:
+ str = gettext("Allows accessing any \n\t\t\t\t"
+ "groupobjquota@... property");
+ break;
+ case ZFS_DELEG_NOTE_GROUPOBJUSED:
+ str = gettext("Allows reading any groupobjused@... property");
+ break;
+ case ZFS_DELEG_NOTE_USEROBJUSED:
+ str = gettext("Allows reading any userobjused@... property");
+ break;
+ case ZFS_DELEG_NOTE_PROJECTQUOTA:
+ str = gettext("Allows accessing any projectquota@... property");
+ break;
+ case ZFS_DELEG_NOTE_PROJECTOBJQUOTA:
+ str = gettext("Allows accessing any \n\t\t\t\t"
+ "projectobjquota@... property");
+ break;
+ case ZFS_DELEG_NOTE_PROJECTUSED:
+ str = gettext("Allows reading any projectused@... property");
+ break;
+ case ZFS_DELEG_NOTE_PROJECTOBJUSED:
+ str = gettext("Allows accessing any \n\t\t\t\t"
+ "projectobjused@... property");
+ break;
+ /* other */
+ default:
+ str = "";
+ }
+
+ return (str);
+}
+
+struct allow_opts {
+ boolean_t local;
+ boolean_t descend;
+ boolean_t user;
+ boolean_t group;
+ boolean_t everyone;
+ boolean_t create;
+ boolean_t set;
+ boolean_t recursive; /* unallow only */
+ boolean_t prt_usage;
+
+ boolean_t prt_perms;
+ char *who;
+ char *perms;
+ const char *dataset;
+};
+
+static inline int
+prop_cmp(const void *a, const void *b)
+{
+ const char *str1 = *(const char **)a;
+ const char *str2 = *(const char **)b;
+ return (strcmp(str1, str2));
+}
+
+static void
+allow_usage(boolean_t un, boolean_t requested, const char *msg)
+{
+ const char *opt_desc[] = {
+ "-h", gettext("show this help message and exit"),
+ "-l", gettext("set permission locally"),
+ "-d", gettext("set permission for descents"),
+ "-u", gettext("set permission for user"),
+ "-g", gettext("set permission for group"),
+ "-e", gettext("set permission for everyone"),
+ "-c", gettext("set create time permission"),
+ "-s", gettext("define permission set"),
+ /* unallow only */
+ "-r", gettext("remove permissions recursively"),
+ };
+ size_t unallow_size = sizeof (opt_desc) / sizeof (char *);
+ size_t allow_size = unallow_size - 2;
+ const char *props[ZFS_NUM_PROPS];
+ int i;
+ size_t count = 0;
+ FILE *fp = requested ? stdout : stderr;
+ zprop_desc_t *pdtbl = zfs_prop_get_table();
+ const char *fmt = gettext("%-16s %-14s\t%s\n");
+
+ (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW :
+ HELP_ALLOW));
+ (void) fprintf(fp, gettext("Options:\n"));
+ for (i = 0; i < (un ? unallow_size : allow_size); i += 2) {
+ const char *opt = opt_desc[i];
+ const char *optdsc = opt_desc[i + 1];
+ (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc);
+ }
+
+ (void) fprintf(fp, gettext("\nThe following permissions are "
+ "supported:\n\n"));
+ (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"),
+ gettext("NOTES"));
+ for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) {
+ const char *perm_name = zfs_deleg_perm_tbl[i].z_perm;
+ zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note;
+ const char *perm_type = deleg_perm_type(perm_note);
+ const char *perm_comment = deleg_perm_comment(perm_note);
+ (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment);
+ }
+
+ for (i = 0; i < ZFS_NUM_PROPS; i++) {
+ zprop_desc_t *pd = &pdtbl[i];
+ if (pd->pd_visible != B_TRUE)
+ continue;
+
+ if (pd->pd_attr == PROP_READONLY)
+ continue;
+
+ props[count++] = pd->pd_name;
+ }
+ props[count] = NULL;
+
+ qsort(props, count, sizeof (char *), prop_cmp);
+
+ for (i = 0; i < count; i++)
+ (void) fprintf(fp, fmt, props[i], gettext("property"), "");
+
+ if (msg != NULL)
+ (void) fprintf(fp, gettext("\nzfs: error: %s"), msg);
+
+ exit(requested ? 0 : 2);
+}
+
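+/*
+ * Return the dataset name (always the final argument) and set *permsp to
+ * the permission list immediately preceding it.  For 'zfs unallow' the
+ * permission list may be omitted, in which case *permsp is set to NULL.
+ */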
+static inline const char *
+munge_args(int argc, char **argv, boolean_t un, size_t expected_argc,
+ char **permsp)
+{
+ if (un && argc == expected_argc - 1)
+ *permsp = NULL;
+ else if (argc == expected_argc)
+ *permsp = argv[argc - 2];
+ else
+ allow_usage(un, B_FALSE,
+ gettext("wrong number of parameters\n"));
+
+ return (argv[argc - 1]);
+}
+
+static void
+parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts)
+{
+ int uge_sum = opts->user + opts->group + opts->everyone;
+ int csuge_sum = opts->create + opts->set + uge_sum;
+ int ldcsuge_sum = csuge_sum + opts->local + opts->descend;
+ int all_sum = un ? ldcsuge_sum + opts->recursive : ldcsuge_sum;
+
+ if (uge_sum > 1)
+ allow_usage(un, B_FALSE,
+ gettext("-u, -g, and -e are mutually exclusive\n"));
+
+ if (opts->prt_usage) {
+ if (argc == 0 && all_sum == 0)
+ allow_usage(un, B_TRUE, NULL);
+ else
+ usage(B_FALSE);
+ }
+
+ if (opts->set) {
+ if (csuge_sum > 1)
+ allow_usage(un, B_FALSE,
+ gettext("invalid options combined with -s\n"));
+
+ opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
+ if (argv[0][0] != '@')
+ allow_usage(un, B_FALSE,
+ gettext("invalid set name: missing '@' prefix\n"));
+ opts->who = argv[0];
+ } else if (opts->create) {
+ if (ldcsuge_sum > 1)
+ allow_usage(un, B_FALSE,
+ gettext("invalid options combined with -c\n"));
+ opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+ } else if (opts->everyone) {
+ if (csuge_sum > 1)
+ allow_usage(un, B_FALSE,
+ gettext("invalid options combined with -e\n"));
+ opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+ } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone")
+ == 0) {
+ opts->everyone = B_TRUE;
+ argc--;
+ argv++;
+ opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+ } else if (argc == 1 && !un) {
+ opts->prt_perms = B_TRUE;
+ opts->dataset = argv[argc-1];
+ } else {
+ opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
+ opts->who = argv[0];
+ }
+
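+ /* with neither -l nor -d given, apply to both local and descendent */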
+ if (!opts->local && !opts->descend) {
+ opts->local = B_TRUE;
+ opts->descend = B_TRUE;
+ }
+}
+
+static void
+store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend,
+ const char *who, char *perms, nvlist_t *top_nvl)
+{
+ int i;
+ char ld[2] = { '\0', '\0' };
+ char who_buf[MAXNAMELEN + 32];
+ char base_type = '\0';
+ char set_type = '\0';
+ nvlist_t *base_nvl = NULL;
+ nvlist_t *set_nvl = NULL;
+ nvlist_t *nvl;
+
+ if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+ if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+
+ switch (type) {
+ case ZFS_DELEG_NAMED_SET_SETS:
+ case ZFS_DELEG_NAMED_SET:
+ set_type = ZFS_DELEG_NAMED_SET_SETS;
+ base_type = ZFS_DELEG_NAMED_SET;
+ ld[0] = ZFS_DELEG_NA;
+ break;
+ case ZFS_DELEG_CREATE_SETS:
+ case ZFS_DELEG_CREATE:
+ set_type = ZFS_DELEG_CREATE_SETS;
+ base_type = ZFS_DELEG_CREATE;
+ ld[0] = ZFS_DELEG_NA;
+ break;
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_USER:
+ set_type = ZFS_DELEG_USER_SETS;
+ base_type = ZFS_DELEG_USER;
+ if (local)
+ ld[0] = ZFS_DELEG_LOCAL;
+ if (descend)
+ ld[1] = ZFS_DELEG_DESCENDENT;
+ break;
+ case ZFS_DELEG_GROUP_SETS:
+ case ZFS_DELEG_GROUP:
+ set_type = ZFS_DELEG_GROUP_SETS;
+ base_type = ZFS_DELEG_GROUP;
+ if (local)
+ ld[0] = ZFS_DELEG_LOCAL;
+ if (descend)
+ ld[1] = ZFS_DELEG_DESCENDENT;
+ break;
+ case ZFS_DELEG_EVERYONE_SETS:
+ case ZFS_DELEG_EVERYONE:
+ set_type = ZFS_DELEG_EVERYONE_SETS;
+ base_type = ZFS_DELEG_EVERYONE;
+ if (local)
+ ld[0] = ZFS_DELEG_LOCAL;
+ if (descend)
+ ld[1] = ZFS_DELEG_DESCENDENT;
+ break;
+
+ default:
+ assert(set_type != '\0' && base_type != '\0');
+ }
+
+ if (perms != NULL) {
+ char *curr = perms;
+ char *end = curr + strlen(perms);
+
+ while (curr < end) {
+ char *delim = strchr(curr, ',');
+ if (delim == NULL)
+ delim = end;
+ else
+ *delim = '\0';
+
+ if (curr[0] == '@')
+ nvl = set_nvl;
+ else
+ nvl = base_nvl;
+
+ (void) nvlist_add_boolean(nvl, curr);
+ if (delim != end)
+ *delim = ',';
+ curr = delim + 1;
+ }
+
+ for (i = 0; i < 2; i++) {
+ char locality = ld[i];
+ if (locality == 0)
+ continue;
+
+ if (!nvlist_empty(base_nvl)) {
+ if (who != NULL)
+ (void) snprintf(who_buf,
+ sizeof (who_buf), "%c%c$%s",
+ base_type, locality, who);
+ else
+ (void) snprintf(who_buf,
+ sizeof (who_buf), "%c%c$",
+ base_type, locality);
+
+ (void) nvlist_add_nvlist(top_nvl, who_buf,
+ base_nvl);
+ }
+
+ if (!nvlist_empty(set_nvl)) {
+ if (who != NULL)
+ (void) snprintf(who_buf,
+ sizeof (who_buf), "%c%c$%s",
+ set_type, locality, who);
+ else
+ (void) snprintf(who_buf,
+ sizeof (who_buf), "%c%c$",
+ set_type, locality);
+
+ (void) nvlist_add_nvlist(top_nvl, who_buf,
+ set_nvl);
+ }
+ }
+ } else {
+ for (i = 0; i < 2; i++) {
+ char locality = ld[i];
+ if (locality == 0)
+ continue;
+
+ if (who != NULL)
+ (void) snprintf(who_buf, sizeof (who_buf),
+ "%c%c$%s", base_type, locality, who);
+ else
+ (void) snprintf(who_buf, sizeof (who_buf),
+ "%c%c$", base_type, locality);
+ (void) nvlist_add_boolean(top_nvl, who_buf);
+
+ if (who != NULL)
+ (void) snprintf(who_buf, sizeof (who_buf),
+ "%c%c$%s", set_type, locality, who);
+ else
+ (void) snprintf(who_buf, sizeof (who_buf),
+ "%c%c$", set_type, locality);
+ (void) nvlist_add_boolean(top_nvl, who_buf);
+ }
+ }
+}
+
+static int
+construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
+{
+ if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+
+ if (opts->set) {
+ store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local,
+ opts->descend, opts->who, opts->perms, *nvlp);
+ } else if (opts->create) {
+ store_allow_perm(ZFS_DELEG_CREATE, opts->local,
+ opts->descend, NULL, opts->perms, *nvlp);
+ } else if (opts->everyone) {
+ store_allow_perm(ZFS_DELEG_EVERYONE, opts->local,
+ opts->descend, NULL, opts->perms, *nvlp);
+ } else {
+ char *curr = opts->who;
+ char *end = curr + strlen(curr);
+
+ while (curr < end) {
+ const char *who;
+ zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN;
+ char *endch;
+ char *delim = strchr(curr, ',');
+ char errbuf[256];
+ char id[64];
+ struct passwd *p = NULL;
+ struct group *g = NULL;
+
+ uid_t rid;
+ if (delim == NULL)
+ delim = end;
+ else
+ *delim = '\0';
+
+ rid = (uid_t)strtol(curr, &endch, 0);
+ if (opts->user) {
+ who_type = ZFS_DELEG_USER;
+ if (*endch != '\0')
+ p = getpwnam(curr);
+ else
+ p = getpwuid(rid);
+
+ if (p != NULL)
+ rid = p->pw_uid;
+ else if (*endch != '\0') {
+ (void) snprintf(errbuf, 256, gettext(
+ "invalid user %s\n"), curr);
+ allow_usage(un, B_TRUE, errbuf);
+ }
+ } else if (opts->group) {
+ who_type = ZFS_DELEG_GROUP;
+ if (*endch != '\0')
+ g = getgrnam(curr);
+ else
+ g = getgrgid(rid);
+
+ if (g != NULL)
+ rid = g->gr_gid;
+ else if (*endch != '\0') {
+ (void) snprintf(errbuf, 256, gettext(
+ "invalid group %s\n"), curr);
+ allow_usage(un, B_TRUE, errbuf);
+ }
+ } else {
+ if (*endch != '\0') {
+ p = getpwnam(curr);
+ } else {
+ p = getpwuid(rid);
+ }
+
+ if (p == NULL) {
+ if (*endch != '\0') {
+ g = getgrnam(curr);
+ } else {
+ g = getgrgid(rid);
+ }
+ }
+
+ if (p != NULL) {
+ who_type = ZFS_DELEG_USER;
+ rid = p->pw_uid;
+ } else if (g != NULL) {
+ who_type = ZFS_DELEG_GROUP;
+ rid = g->gr_gid;
+ } else {
+ (void) snprintf(errbuf, 256, gettext(
+ "invalid user/group %s\n"), curr);
+ allow_usage(un, B_TRUE, errbuf);
+ }
+ }
+
+ (void) sprintf(id, "%u", rid);
+ who = id;
+
+ store_allow_perm(who_type, opts->local,
+ opts->descend, who, opts->perms, *nvlp);
+ curr = delim + 1;
+ }
+ }
+
+ return (0);
+}
+
+static void
+print_set_creat_perms(uu_avl_t *who_avl)
+{
+ const char *sc_title[] = {
+ gettext("Permission sets:\n"),
+ gettext("Create time permissions:\n"),
+ NULL
+ };
+ who_perm_node_t *who_node = NULL;
+ int prev_weight = -1;
+
+ for (who_node = uu_avl_first(who_avl); who_node != NULL;
+ who_node = uu_avl_next(who_avl, who_node)) {
+ uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
+ zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
+ const char *who_name = who_node->who_perm.who_name;
+ int weight = who_type2weight(who_type);
+ boolean_t first = B_TRUE;
+ deleg_perm_node_t *deleg_node;
+
+ if (prev_weight != weight) {
+ (void) printf("%s", sc_title[weight]);
+ prev_weight = weight;
+ }
+
+ if (who_name == NULL || strnlen(who_name, 1) == 0)
+ (void) printf("\t");
+ else
+ (void) printf("\t%s ", who_name);
+
+ for (deleg_node = uu_avl_first(avl); deleg_node != NULL;
+ deleg_node = uu_avl_next(avl, deleg_node)) {
+ if (first) {
+ (void) printf("%s",
+ deleg_node->dpn_perm.dp_name);
+ first = B_FALSE;
+ } else
+ (void) printf(",%s",
+ deleg_node->dpn_perm.dp_name);
+ }
+
+ (void) printf("\n");
+ }
+}
+
+static void
+print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend,
+ const char *title)
+{
+ who_perm_node_t *who_node = NULL;
+ boolean_t prt_title = B_TRUE;
+ uu_avl_walk_t *walk;
+
+ if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL)
+ nomem();
+
+ while ((who_node = uu_avl_walk_next(walk)) != NULL) {
+ const char *who_name = who_node->who_perm.who_name;
+ const char *nice_who_name = who_node->who_perm.who_ug_name;
+ uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
+ zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
+ char delim = ' ';
+ deleg_perm_node_t *deleg_node;
+ boolean_t prt_who = B_TRUE;
+
+ for (deleg_node = uu_avl_first(avl);
+ deleg_node != NULL;
+ deleg_node = uu_avl_next(avl, deleg_node)) {
+ if (local != deleg_node->dpn_perm.dp_local ||
+ descend != deleg_node->dpn_perm.dp_descend)
+ continue;
+
+ if (prt_who) {
+ const char *who = NULL;
+ if (prt_title) {
+ prt_title = B_FALSE;
+ (void) printf("%s", title);
+ }
+
+ switch (who_type) {
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_USER:
+ who = gettext("user");
+ if (nice_who_name)
+ who_name = nice_who_name;
+ break;
+ case ZFS_DELEG_GROUP_SETS:
+ case ZFS_DELEG_GROUP:
+ who = gettext("group");
+ if (nice_who_name)
+ who_name = nice_who_name;
+ break;
+ case ZFS_DELEG_EVERYONE_SETS:
+ case ZFS_DELEG_EVERYONE:
+ who = gettext("everyone");
+ who_name = NULL;
+ break;
+
+ default:
+ assert(who != NULL);
+ }
+
+ prt_who = B_FALSE;
+ if (who_name == NULL)
+ (void) printf("\t%s", who);
+ else
+ (void) printf("\t%s %s", who, who_name);
+ }
+
+ (void) printf("%c%s", delim,
+ deleg_node->dpn_perm.dp_name);
+ delim = ',';
+ }
+
+ if (!prt_who)
+ (void) printf("\n");
+ }
+
+ uu_avl_walk_end(walk);
+}
+
+static void
+print_fs_perms(fs_perm_set_t *fspset)
+{
+ fs_perm_node_t *node = NULL;
+ char buf[MAXNAMELEN + 32];
+ const char *dsname = buf;
+
+ for (node = uu_list_first(fspset->fsps_list); node != NULL;
+ node = uu_list_next(fspset->fsps_list, node)) {
+ uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl;
+ uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl;
+ int left = 0;
+
+ (void) snprintf(buf, sizeof (buf),
+ gettext("---- Permissions on %s "),
+ node->fspn_fsperm.fsp_name);
+ (void) printf("%s", dsname);
+ left = 70 - strlen(buf);
+ while (left-- > 0)
+ (void) printf("-");
+ (void) printf("\n");
+
+ print_set_creat_perms(sc_avl);
+ print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE,
+ gettext("Local permissions:\n"));
+ print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE,
+ gettext("Descendent permissions:\n"));
+ print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE,
+ gettext("Local+Descendent permissions:\n"));
+ }
+}
+
+static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL };
+
+struct deleg_perms {
+ boolean_t un;
+ nvlist_t *nvl;
+};
+
+static int
+set_deleg_perms(zfs_handle_t *zhp, void *data)
+{
+ struct deleg_perms *perms = (struct deleg_perms *)data;
+ zfs_type_t zfs_type = zfs_get_type(zhp);
+
+ if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME)
+ return (0);
+
+ return (zfs_set_fsacl(zhp, perms->un, perms->nvl));
+}
+
+static int
+zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un)
+{
+ zfs_handle_t *zhp;
+ nvlist_t *perm_nvl = NULL;
+ nvlist_t *update_perm_nvl = NULL;
+ int error = 1;
+ int c;
+ struct allow_opts opts = { 0 };
+
+ const char *optstr = un ? "ldugecsrh" : "ldugecsh";
+
+ /* check opts */
+ while ((c = getopt(argc, argv, optstr)) != -1) {
+ switch (c) {
+ case 'l':
+ opts.local = B_TRUE;
+ break;
+ case 'd':
+ opts.descend = B_TRUE;
+ break;
+ case 'u':
+ opts.user = B_TRUE;
+ break;
+ case 'g':
+ opts.group = B_TRUE;
+ break;
+ case 'e':
+ opts.everyone = B_TRUE;
+ break;
+ case 's':
+ opts.set = B_TRUE;
+ break;
+ case 'c':
+ opts.create = B_TRUE;
+ break;
+ case 'r':
+ opts.recursive = B_TRUE;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case 'h':
+ opts.prt_usage = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check arguments */
+ parse_allow_args(argc, argv, un, &opts);
+
+ /* try to open the dataset */
+ if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM |
+ ZFS_TYPE_VOLUME)) == NULL) {
+ (void) fprintf(stderr, "Failed to open dataset: %s\n",
+ opts.dataset);
+ return (-1);
+ }
+
+ if (zfs_get_fsacl(zhp, &perm_nvl) != 0)
+ goto cleanup2;
+
+ fs_perm_set_init(&fs_perm_set);
+ if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) {
+ (void) fprintf(stderr, "Failed to parse fsacl permissions\n");
+ goto cleanup1;
+ }
+
+ if (opts.prt_perms)
+ print_fs_perms(&fs_perm_set);
+ else {
+ (void) construct_fsacl_list(un, &opts, &update_perm_nvl);
+ if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0)
+ goto cleanup0;
+
+ if (un && opts.recursive) {
+ struct deleg_perms data = { un, update_perm_nvl };
+ if (zfs_iter_filesystems(zhp, set_deleg_perms,
+ &data) != 0)
+ goto cleanup0;
+ }
+ }
+
+ error = 0;
+
+cleanup0:
+ nvlist_free(perm_nvl);
+ nvlist_free(update_perm_nvl);
+cleanup1:
+ fs_perm_set_fini(&fs_perm_set);
+cleanup2:
+ zfs_close(zhp);
+
+ return (error);
+}
+
+static int
+zfs_do_allow(int argc, char **argv)
+{
+ return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE));
+}
+
+static int
+zfs_do_unallow(int argc, char **argv)
+{
+ return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE));
+}
+
+static int
+zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
+{
+ int errors = 0;
+ int i;
+ const char *tag;
+ boolean_t recursive = B_FALSE;
+ const char *opts = holding ? "rt" : "r";
+ int c;
+
+ /* check options */
+ while ((c = getopt(argc, argv, opts)) != -1) {
+ switch (c) {
+ case 'r':
+ recursive = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc < 2)
+ usage(B_FALSE);
+
+ tag = argv[0];
+ --argc;
+ ++argv;
+
+ if (holding && tag[0] == '.') {
+ /* tags starting with '.' are reserved for libzfs */
+ (void) fprintf(stderr, gettext("tag may not start with '.'\n"));
+ usage(B_FALSE);
+ }
+
+ for (i = 0; i < argc; ++i) {
+ zfs_handle_t *zhp;
+ char parent[ZFS_MAX_DATASET_NAME_LEN];
+ const char *delim;
+ char *path = argv[i];
+
+ delim = strchr(path, '@');
+ if (delim == NULL) {
+ (void) fprintf(stderr,
+ gettext("'%s' is not a snapshot\n"), path);
+ ++errors;
+ continue;
+ }
+ (void) strncpy(parent, path, delim - path);
+ parent[delim - path] = '\0';
+
+ zhp = zfs_open(g_zfs, parent,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+ if (zhp == NULL) {
+ ++errors;
+ continue;
+ }
+ if (holding) {
+ if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0)
+ ++errors;
+ } else {
+ if (zfs_release(zhp, delim+1, tag, recursive) != 0)
+ ++errors;
+ }
+ zfs_close(zhp);
+ }
+
+ return (errors != 0);
+}
+
+/*
+ * zfs hold [-r] [-t] <tag> <snap> ...
+ *
+ * -r Recursively hold
+ *
+ * Apply a user-hold with the given tag to the list of snapshots.
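+ *
+ * For illustration (hypothetical tag and snapshot names), a typical
+ * invocation might be:
+ *
+ *     zfs hold -r keep tank/home@monday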
+ */
+static int
+zfs_do_hold(int argc, char **argv)
+{
+ return (zfs_do_hold_rele_impl(argc, argv, B_TRUE));
+}
+
+/*
+ * zfs release [-r] <tag> <snap> ...
+ *
+ * -r Recursively release
+ *
+ * Release a user-hold with the given tag from the list of snapshots.
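+ *
+ * For illustration (hypothetical tag and snapshot names), a matching
+ * release might be:
+ *
+ *     zfs release -r keep tank/home@monday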
+ */
+static int
+zfs_do_release(int argc, char **argv)
+{
+ return (zfs_do_hold_rele_impl(argc, argv, B_FALSE));
+}
+
+typedef struct holds_cbdata {
+ boolean_t cb_recursive;
+ const char *cb_snapname;
+ nvlist_t **cb_nvlp;
+ size_t cb_max_namelen;
+ size_t cb_max_taglen;
+} holds_cbdata_t;
+
+#define STRFTIME_FMT_STR "%a %b %e %H:%M %Y"
+#define DATETIME_BUF_LEN (32)
+/*
+ * Print the collected holds, one line per hold: dataset name, tag, and
+ * creation time. A column header is printed first unless scripted mode
+ * was requested.
+ */
+static void
+print_holds(boolean_t scripted, int nwidth, int tagwidth, nvlist_t *nvl)
+{
+ int i;
+ nvpair_t *nvp = NULL;
+ char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" };
+ const char *col;
+
+ if (!scripted) {
+ for (i = 0; i < 3; i++) {
+ col = gettext(hdr_cols[i]);
+ if (i < 2)
+ (void) printf("%-*s ", i ? tagwidth : nwidth,
+ col);
+ else
+ (void) printf("%s\n", col);
+ }
+ }
+
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ char *zname = nvpair_name(nvp);
+ nvlist_t *nvl2;
+ nvpair_t *nvp2 = NULL;
+ (void) nvpair_value_nvlist(nvp, &nvl2);
+ while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) {
+ char tsbuf[DATETIME_BUF_LEN];
+ char *tagname = nvpair_name(nvp2);
+ uint64_t val = 0;
+ time_t time;
+ struct tm t;
+
+ (void) nvpair_value_uint64(nvp2, &val);
+ time = (time_t)val;
+ (void) localtime_r(&time, &t);
+ (void) strftime(tsbuf, DATETIME_BUF_LEN,
+ gettext(STRFTIME_FMT_STR), &t);
+
+ if (scripted) {
+ (void) printf("%s\t%s\t%s\n", zname,
+ tagname, tsbuf);
+ } else {
+ (void) printf("%-*s %-*s %s\n", nwidth,
+ zname, tagwidth, tagname, tsbuf);
+ }
+ }
+ }
+}
+
+/*
+ * Generic callback function to list a dataset or snapshot.
+ */
+static int
+holds_callback(zfs_handle_t *zhp, void *data)
+{
+ holds_cbdata_t *cbp = data;
+ nvlist_t *top_nvl = *cbp->cb_nvlp;
+ nvlist_t *nvl = NULL;
+ nvpair_t *nvp = NULL;
+ const char *zname = zfs_get_name(zhp);
+ size_t znamelen = strlen(zname);
+
+ if (cbp->cb_recursive) {
+ const char *snapname;
+ char *delim = strchr(zname, '@');
+ if (delim == NULL)
+ return (0);
+
+ snapname = delim + 1;
+ if (strcmp(cbp->cb_snapname, snapname))
+ return (0);
+ }
+
+ if (zfs_get_holds(zhp, &nvl) != 0)
+ return (-1);
+
+ if (znamelen > cbp->cb_max_namelen)
+ cbp->cb_max_namelen = znamelen;
+
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ const char *tag = nvpair_name(nvp);
+ size_t taglen = strlen(tag);
+ if (taglen > cbp->cb_max_taglen)
+ cbp->cb_max_taglen = taglen;
+ }
+
+ return (nvlist_add_nvlist(top_nvl, zname, nvl));
+}
+
+/*
+ * zfs holds [-rH] <snap> ...
+ *
+ * -r Lists holds that are set on the named snapshots recursively.
+ * -H Scripted mode; elide headers and separate columns by tabs.
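+ *
+ * For illustration (hypothetical snapshot name), "zfs holds -rH
+ * tank/home@monday" prints one tab-separated line per hold:
+ * dataset name, tag, and timestamp.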
+ */
+static int
+zfs_do_holds(int argc, char **argv)
+{
+ int errors = 0;
+ int c;
+ int i;
+ boolean_t scripted = B_FALSE;
+ boolean_t recursive = B_FALSE;
+ const char *opts = "rH";
+ nvlist_t *nvl;
+
+ int types = ZFS_TYPE_SNAPSHOT;
+ holds_cbdata_t cb = { 0 };
+
+ int limit = 0;
+ int ret = 0;
+ int flags = 0;
+
+ /* check options */
+ while ((c = getopt(argc, argv, opts)) != -1) {
+ switch (c) {
+ case 'r':
+ recursive = B_TRUE;
+ break;
+ case 'H':
+ scripted = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ if (recursive) {
+ types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
+ flags |= ZFS_ITER_RECURSE;
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc < 1)
+ usage(B_FALSE);
+
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+ nomem();
+
+ for (i = 0; i < argc; ++i) {
+ char *snapshot = argv[i];
+ const char *delim;
+ const char *snapname;
+
+ delim = strchr(snapshot, '@');
+ if (delim == NULL) {
+ (void) fprintf(stderr,
+ gettext("'%s' is not a snapshot\n"), snapshot);
+ ++errors;
+ continue;
+ }
+ snapname = delim + 1;
+ if (recursive)
+ snapshot[delim - snapshot] = '\0';
+
+ cb.cb_recursive = recursive;
+ cb.cb_snapname = snapname;
+ cb.cb_nvlp = &nvl;
+
+ /*
+ * 1. collect holds data, set format options
+ */
+ ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit,
+ holds_callback, &cb);
+ if (ret != 0)
+ ++errors;
+ }
+
+ /*
+ * 2. print holds data
+ */
+ print_holds(scripted, cb.cb_max_namelen, cb.cb_max_taglen, nvl);
+
+ if (nvlist_empty(nvl))
+ (void) fprintf(stderr, gettext("no datasets available\n"));
+
+ nvlist_free(nvl);
+
+ return (0 != errors);
+}
+
+#define CHECK_SPINNER 30
+#define SPINNER_TIME 3 /* seconds */
+#define MOUNT_TIME 1 /* seconds */
+
+typedef struct get_all_state {
+ boolean_t ga_verbose;
+ get_all_cb_t *ga_cbp;
+} get_all_state_t;
+
+static int
+get_one_dataset(zfs_handle_t *zhp, void *data)
+{
+ static char *spin[] = { "-", "\\", "|", "/" };
+ static int spinval = 0;
+ static int spincheck = 0;
+ static time_t last_spin_time = (time_t)0;
+ get_all_state_t *state = data;
+ zfs_type_t type = zfs_get_type(zhp);
+
+ if (state->ga_verbose) {
+ if (--spincheck < 0) {
+ time_t now = time(NULL);
+ if (last_spin_time + SPINNER_TIME < now) {
+ update_progress(spin[spinval++ % 4]);
+ last_spin_time = now;
+ }
+ spincheck = CHECK_SPINNER;
+ }
+ }
+
+ /*
+ * Iterate over any nested datasets.
+ */
+ if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
+ zfs_close(zhp);
+ return (1);
+ }
+
+ /*
+ * Skip any datasets whose type does not match.
+ */
+ if ((type & ZFS_TYPE_FILESYSTEM) == 0) {
+ zfs_close(zhp);
+ return (0);
+ }
+ libzfs_add_handle(state->ga_cbp, zhp);
+ assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc);
+
+ return (0);
+}
+
+static void
+get_all_datasets(get_all_cb_t *cbp, boolean_t verbose)
+{
+ get_all_state_t state = {
+ .ga_verbose = verbose,
+ .ga_cbp = cbp
+ };
+
+ if (verbose)
+ set_progress_header(gettext("Reading ZFS config"));
+ (void) zfs_iter_root(g_zfs, get_one_dataset, &state);
+
+ if (verbose)
+ finish_progress(gettext("done."));
+}
+
+/*
+ * Generic callback for sharing or mounting filesystems. Because the code is so
+ * similar, we have a common function with an extra parameter to determine which
+ * mode we are using.
+ */
+typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t;
+
+typedef struct share_mount_state {
+ share_mount_op_t sm_op;
+ boolean_t sm_verbose;
+ int sm_flags;
+ char *sm_options;
+ char *sm_proto; /* only valid for OP_SHARE */
+ pthread_mutex_t sm_lock; /* protects the remaining fields */
+ uint_t sm_total; /* number of filesystems to process */
+ uint_t sm_done; /* number of filesystems processed */
+ int sm_status; /* -1 if any of the share/mount operations failed */
+} share_mount_state_t;
+
+/*
+ * Share or mount a dataset.
+ */
+static int
+share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
+ boolean_t explicit, const char *options)
+{
+ char mountpoint[ZFS_MAXPROPLEN];
+ char shareopts[ZFS_MAXPROPLEN];
+ char smbshareopts[ZFS_MAXPROPLEN];
+ const char *cmdname = op == OP_SHARE ? "share" : "mount";
+ struct mnttab mnt;
+ uint64_t zoned, canmount;
+ boolean_t shared_nfs, shared_smb;
+
+ assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM);
+
+ /*
+ * Check to make sure we can mount/share this dataset. If we
+ * are in the global zone and the filesystem is exported to a
+ * local zone, or if we are in a local zone and the
+ * filesystem is not exported, then it is an error.
+ */
+ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+
+ if (zoned && getzoneid() == GLOBAL_ZONEID) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "dataset is exported to a local zone\n"), cmdname,
+ zfs_get_name(zhp));
+ return (1);
+
+ } else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "permission denied\n"), cmdname,
+ zfs_get_name(zhp));
+ return (1);
+ }
+
+ /*
+ * Ignore any filesystems which don't apply to us. This
+ * includes those with a legacy mountpoint, or those with
+ * legacy share options.
+ */
+ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
+ sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
+ sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
+ sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
+
+ if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
+ strcmp(smbshareopts, "off") == 0) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot share '%s': "
+ "legacy share\n"), zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use share(1M) to "
+ "share this filesystem, or set "
+ "sharenfs property on\n"));
+ return (1);
+ }
+
+ /*
+ * We cannot share or mount legacy filesystems. If the
+ * shareopts is non-legacy but the mountpoint is legacy, we
+ * treat it as a legacy share.
+ */
+ if (strcmp(mountpoint, "legacy") == 0) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use %s(1M) to "
+ "%s this filesystem\n"), cmdname, cmdname);
+ return (1);
+ }
+
+ if (strcmp(mountpoint, "none") == 0) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': no "
+ "mountpoint set\n"), cmdname, zfs_get_name(zhp));
+ return (1);
+ }
+
+ /*
+ * canmount explicit outcome
+ * on no pass through
+ * on yes pass through
+ * off no return 0
+ * off yes display error, return 1
+ * noauto no return 0
+ * noauto yes pass through
+ */
+ canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
+ if (canmount == ZFS_CANMOUNT_OFF) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "'canmount' property is set to 'off'\n"), cmdname,
+ zfs_get_name(zhp));
+ return (1);
+ } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) {
+ /*
+ * When performing a 'zfs mount -a', we skip any mounts for
+ * datasets that have 'noauto' set. Sharing a dataset with
+ * 'noauto' set is only allowed if it's mounted.
+ */
+ if (op == OP_MOUNT)
+ return (0);
+ if (op == OP_SHARE && !zfs_is_mounted(zhp, NULL)) {
+ /* also purge it from existing exports */
+ zfs_unshareall_bypath(zhp, mountpoint);
+ return (0);
+ }
+ }
+
+ /*
+ * If this filesystem is encrypted and does not have
+ * a loaded key, we can not mount it.
+ */
+ if ((flags & MS_CRYPT) == 0 &&
+ zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF &&
+ zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) ==
+ ZFS_KEYSTATUS_UNAVAILABLE) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "encryption key not loaded\n"), cmdname, zfs_get_name(zhp));
+ return (1);
+ }
+
+ /*
+ * If this filesystem is inconsistent and has a receive resume
+ * token, we can not mount it.
+ */
+ if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) &&
+ zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+ NULL, 0, NULL, NULL, 0, B_TRUE) == 0) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "Contains partially-completed state from "
+ "\"zfs receive -s\", which can be resumed with "
+ "\"zfs send -t\"\n"),
+ cmdname, zfs_get_name(zhp));
+ return (1);
+ }
+
+ if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot %s '%s': "
+ "Dataset is not complete, was created by receiving "
+ "a redacted zfs send stream.\n"), cmdname,
+ zfs_get_name(zhp));
+ return (1);
+ }
+
+ /*
+ * At this point, we have verified that the mountpoint and/or
+ * shareopts are appropriate for auto management. If the
+ * filesystem is already mounted or shared, return (failing
+ * for explicit requests); otherwise mount or share the
+ * filesystem.
+ */
+ switch (op) {
+ case OP_SHARE:
+
+ shared_nfs = zfs_is_shared_nfs(zhp, NULL);
+ shared_smb = zfs_is_shared_smb(zhp, NULL);
+
+ if ((shared_nfs && shared_smb) ||
+ (shared_nfs && strcmp(shareopts, "on") == 0 &&
+ strcmp(smbshareopts, "off") == 0) ||
+ (shared_smb && strcmp(smbshareopts, "on") == 0 &&
+ strcmp(shareopts, "off") == 0)) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot share "
+ "'%s': filesystem already shared\n"),
+ zfs_get_name(zhp));
+ return (1);
+ }
+
+ if (!zfs_is_mounted(zhp, NULL) &&
+ zfs_mount(zhp, NULL, flags) != 0)
+ return (1);
+
+ if (protocol == NULL) {
+ if (zfs_shareall(zhp) != 0)
+ return (1);
+ } else if (strcmp(protocol, "nfs") == 0) {
+ if (zfs_share_nfs(zhp))
+ return (1);
+ } else if (strcmp(protocol, "smb") == 0) {
+ if (zfs_share_smb(zhp))
+ return (1);
+ } else {
+ (void) fprintf(stderr, gettext("cannot share "
+ "'%s': invalid share type '%s' "
+ "specified\n"),
+ zfs_get_name(zhp), protocol);
+ return (1);
+ }
+
+ break;
+
+ case OP_MOUNT:
+ if (options == NULL)
+ mnt.mnt_mntopts = "";
+ else
+ mnt.mnt_mntopts = (char *)options;
+
+ if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
+ zfs_is_mounted(zhp, NULL)) {
+ if (!explicit)
+ return (0);
+
+ (void) fprintf(stderr, gettext("cannot mount "
+ "'%s': filesystem already mounted\n"),
+ zfs_get_name(zhp));
+ return (1);
+ }
+
+ if (zfs_mount(zhp, options, flags) != 0)
+ return (1);
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * Reports progress in the form "(current/total)". Not thread-safe.
+ */
+static void
+report_mount_progress(int current, int total)
+{
+ static time_t last_progress_time = 0;
+ time_t now = time(NULL);
+ char info[32];
+
+ /* report 1..n instead of 0..n-1 */
+ ++current;
+
+ /* display header if we're here for the first time */
+ if (current == 1) {
+ set_progress_header(gettext("Mounting ZFS filesystems"));
+ } else if (current != total && last_progress_time + MOUNT_TIME >= now) {
+ /* too soon to report again */
+ return;
+ }
+
+ last_progress_time = now;
+
+ (void) sprintf(info, "(%d/%d)", current, total);
+
+ if (current == total)
+ finish_progress(info);
+ else
+ update_progress(info);
+}
+
+/*
+ * zfs_foreach_mountpoint() callback that mounts or shares one filesystem and
+ * updates the progress meter.
+ */
+static int
+share_mount_one_cb(zfs_handle_t *zhp, void *arg)
+{
+ share_mount_state_t *sms = arg;
+ int ret;
+
+ ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto,
+ B_FALSE, sms->sm_options);
+
+ pthread_mutex_lock(&sms->sm_lock);
+ if (ret != 0)
+ sms->sm_status = ret;
+ sms->sm_done++;
+ if (sms->sm_verbose)
+ report_mount_progress(sms->sm_done, sms->sm_total);
+ pthread_mutex_unlock(&sms->sm_lock);
+ return (ret);
+}
+
+static void
+append_options(char *mntopts, char *newopts)
+{
+ int len = strlen(mntopts);
+
+ /* original length plus new string to append plus 1 for the comma */
+ if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) {
+ (void) fprintf(stderr, gettext("the opts argument for "
+ "'%s' option is too long (more than %d chars)\n"),
+ "-o", MNT_LINE_MAX);
+ usage(B_FALSE);
+ }
+
+ if (*mntopts)
+ mntopts[len++] = ',';
+
+ (void) strcpy(&mntopts[len], newopts);
+}
+
+static int
+share_mount(int op, int argc, char **argv)
+{
+ int do_all = 0;
+ boolean_t verbose = B_FALSE;
+ int c, ret = 0;
+ char *options = NULL;
+ int flags = 0;
+
+ /* check options */
+ while ((c = getopt(argc, argv, op == OP_MOUNT ? ":alvo:Of" : "al"))
+ != -1) {
+ switch (c) {
+ case 'a':
+ do_all = 1;
+ break;
+ case 'v':
+ verbose = B_TRUE;
+ break;
+ case 'l':
+ flags |= MS_CRYPT;
+ break;
+ case 'o':
+ if (*optarg == '\0') {
+ (void) fprintf(stderr, gettext("empty mount "
+ "options (-o) specified\n"));
+ usage(B_FALSE);
+ }
+
+ if (options == NULL)
+ options = safe_malloc(MNT_LINE_MAX + 1);
+
+ /* option validation is done later */
+ append_options(options, optarg);
+ break;
+ case 'O':
+ flags |= MS_OVERLAY;
+ break;
+ case 'f':
+ flags |= MS_FORCE;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (do_all) {
+ char *protocol = NULL;
+
+ if (op == OP_SHARE && argc > 0) {
+ if (strcmp(argv[0], "nfs") != 0 &&
+ strcmp(argv[0], "smb") != 0) {
+ (void) fprintf(stderr, gettext("share type "
+ "must be 'nfs' or 'smb'\n"));
+ usage(B_FALSE);
+ }
+ protocol = argv[0];
+ argc--;
+ argv++;
+ }
+
+ if (argc != 0) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ start_progress_timer();
+ get_all_cb_t cb = { 0 };
+ get_all_datasets(&cb, verbose);
+
+ if (cb.cb_used == 0) {
+ if (options != NULL)
+ free(options);
+ return (0);
+ }
+
+ share_mount_state_t share_mount_state = { 0 };
+ share_mount_state.sm_op = op;
+ share_mount_state.sm_verbose = verbose;
+ share_mount_state.sm_flags = flags;
+ share_mount_state.sm_options = options;
+ share_mount_state.sm_proto = protocol;
+ share_mount_state.sm_total = cb.cb_used;
+ pthread_mutex_init(&share_mount_state.sm_lock, NULL);
+
+ /*
+ * libshare isn't mt-safe, so only do the operation in parallel
+ * if we're mounting. Additionally, the key-loading option must
+ * be serialized so that we can prompt the user for their keys
+ * in a consistent manner.
+ */
+ zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used,
+ share_mount_one_cb, &share_mount_state,
+ op == OP_MOUNT && !(flags & MS_CRYPT));
+ zfs_commit_all_shares();
+
+ ret = share_mount_state.sm_status;
+
+ for (int i = 0; i < cb.cb_used; i++)
+ zfs_close(cb.cb_handles[i]);
+ free(cb.cb_handles);
+ } else if (argc == 0) {
+ struct mnttab entry;
+
+ if ((op == OP_SHARE) || (options != NULL)) {
+ (void) fprintf(stderr, gettext("missing filesystem "
+ "argument (specify -a for all)\n"));
+ usage(B_FALSE);
+ }
+
+ /*
+ * When mount is given no arguments, go through
+ * /proc/self/mounts and display any active ZFS mounts.
+ * We hide any snapshots, since they are controlled
+ * automatically.
+ */
+
+ /* Reopen MNTTAB to prevent reading stale data from open file */
+ if (freopen(MNTTAB, "r", mnttab_file) == NULL) {
+ if (options != NULL)
+ free(options);
+ return (ENOENT);
+ }
+
+ while (getmntent(mnttab_file, &entry) == 0) {
+ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 ||
+ strchr(entry.mnt_special, '@') != NULL)
+ continue;
+
+ (void) printf("%-30s %s\n", entry.mnt_special,
+ entry.mnt_mountp);
+ }
+
+ } else {
+ zfs_handle_t *zhp;
+
+ if (argc > 1) {
+ (void) fprintf(stderr,
+ gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ if ((zhp = zfs_open(g_zfs, argv[0],
+ ZFS_TYPE_FILESYSTEM)) == NULL) {
+ ret = 1;
+ } else {
+ ret = share_mount_one(zhp, op, flags, NULL, B_TRUE,
+ options);
+ zfs_commit_all_shares();
+ zfs_close(zhp);
+ }
+ }
+
+ if (options != NULL)
+ free(options);
+
+ return (ret);
+}
+
+/*
+ * zfs mount -a
+ * zfs mount filesystem
+ *
+ * Mount all filesystems, or mount the given filesystem.
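+ *
+ * For illustration (hypothetical dataset name), "zfs mount tank/home"
+ * mounts a single filesystem, and "zfs mount -a" mounts every eligible
+ * filesystem in all imported pools.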
+ */
+static int
+zfs_do_mount(int argc, char **argv)
+{
+ return (share_mount(OP_MOUNT, argc, argv));
+}
+
+/*
+ * zfs share -a [nfs | smb]
+ * zfs share filesystem
+ *
+ * Share all filesystems, or share the given filesystem.
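+ *
+ * For illustration (hypothetical dataset name), "zfs share tank/export"
+ * shares a single filesystem, and "zfs share -a nfs" shares all
+ * filesystems over NFS only.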
+ */
+static int
+zfs_do_share(int argc, char **argv)
+{
+ return (share_mount(OP_SHARE, argc, argv));
+}
+
+typedef struct unshare_unmount_node {
+ zfs_handle_t *un_zhp;
+ char *un_mountp;
+ uu_avl_node_t un_avlnode;
+} unshare_unmount_node_t;
+
+/* ARGSUSED */
+static int
+unshare_unmount_compare(const void *larg, const void *rarg, void *unused)
+{
+ const unshare_unmount_node_t *l = larg;
+ const unshare_unmount_node_t *r = rarg;
+
+ return (strcmp(l->un_mountp, r->un_mountp));
+}
+
+/*
+ * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an
+ * absolute path, find the entry in /proc/self/mounts, verify that it's a
+ * ZFS filesystem, and unmount it appropriately.
+ */
+static int
+unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
+{
+ zfs_handle_t *zhp;
+ int ret = 0;
+ struct stat64 statbuf;
+ struct extmnttab entry;
+ const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
+ ino_t path_inode;
+
+ /*
+ * Search for the given (major,minor) pair in the mount table.
+ */
+
+ /* Reopen MNTTAB to prevent reading stale data from open file */
+ if (freopen(MNTTAB, "r", mnttab_file) == NULL)
+ return (ENOENT);
+
+ if (getextmntent(path, &entry, &statbuf) != 0) {
+ if (op == OP_SHARE) {
+ (void) fprintf(stderr, gettext("cannot %s '%s': not "
+ "currently mounted\n"), cmdname, path);
+ return (1);
+ }
+ (void) fprintf(stderr, gettext("warning: %s not in"
+ "/proc/self/mounts\n"), path);
+ if ((ret = umount2(path, flags)) != 0)
+ (void) fprintf(stderr, gettext("%s: %s\n"), path,
+ strerror(errno));
+ return (ret != 0);
+ }
+ path_inode = statbuf.st_ino;
+
+ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
+ (void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS "
+ "filesystem\n"), cmdname, path);
+ return (1);
+ }
+
+ if ((zhp = zfs_open(g_zfs, entry.mnt_special,
+ ZFS_TYPE_FILESYSTEM)) == NULL)
+ return (1);
+
+ ret = 1;
+ if (stat64(entry.mnt_mountp, &statbuf) != 0) {
+ (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"),
+ cmdname, path, strerror(errno));
+ goto out;
+ } else if (statbuf.st_ino != path_inode) {
+ (void) fprintf(stderr, gettext("cannot "
+ "%s '%s': not a mountpoint\n"), cmdname, path);
+ goto out;
+ }
+
+ if (op == OP_SHARE) {
+ char nfs_mnt_prop[ZFS_MAXPROPLEN];
+ char smbshare_prop[ZFS_MAXPROPLEN];
+
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop,
+ sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0);
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop,
+ sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0);
+
+ if (strcmp(nfs_mnt_prop, "off") == 0 &&
+ strcmp(smbshare_prop, "off") == 0) {
+ (void) fprintf(stderr, gettext("cannot unshare "
+ "'%s': legacy share\n"), path);
+ (void) fprintf(stderr, gettext("use exportfs(8) "
+ "or smbcontrol(1) to unshare this filesystem\n"));
+ } else if (!zfs_is_shared(zhp)) {
+ (void) fprintf(stderr, gettext("cannot unshare '%s': "
+ "not currently shared\n"), path);
+ } else {
+ ret = zfs_unshareall_bypath(zhp, path);
+ zfs_commit_all_shares();
+ }
+ } else {
+ char mtpt_prop[ZFS_MAXPROPLEN];
+
+ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop,
+ sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0);
+
+ if (is_manual) {
+ ret = zfs_unmount(zhp, NULL, flags);
+ } else if (strcmp(mtpt_prop, "legacy") == 0) {
+ (void) fprintf(stderr, gettext("cannot unmount "
+ "'%s': legacy mountpoint\n"),
+ zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use umount(8) "
+ "to unmount this filesystem\n"));
+ } else {
+ ret = zfs_unmountall(zhp, flags);
+ }
+ }
+
+out:
+ zfs_close(zhp);
+
+ return (ret != 0);
+}
+
+/*
+ * Generic callback for unsharing or unmounting a filesystem.
+ */
+static int
+unshare_unmount(int op, int argc, char **argv)
+{
+ int do_all = 0;
+ int flags = 0;
+ int ret = 0;
+ int c;
+ zfs_handle_t *zhp;
+ char nfs_mnt_prop[ZFS_MAXPROPLEN];
+ char sharesmb[ZFS_MAXPROPLEN];
+
+ /* check options */
+ while ((c = getopt(argc, argv, op == OP_SHARE ? ":a" : "afu")) != -1) {
+ switch (c) {
+ case 'a':
+ do_all = 1;
+ break;
+ case 'f':
+ flags |= MS_FORCE;
+ break;
+ case 'u':
+ flags |= MS_CRYPT;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (do_all) {
+ /*
+ * We could make use of zfs_for_each() to walk all datasets in
+ * the system, but this would be very inefficient, especially
+ * since we would have to linearly search /proc/self/mounts for
+ * each one. Instead, do one pass through /proc/self/mounts
+ * looking for zfs entries and call zfs_unmount() for each one.
+ *
+ * Things get a little tricky if the administrator has created
+ * mountpoints beneath other ZFS filesystems. In this case, we
+ * have to unmount the deepest filesystems first. To accomplish
+ * this, we place all the mountpoints in an AVL tree sorted by
+ * mountpoint path, and walk the result in reverse so that the
+ * deepest mountpoints are unmounted before their parents.
+ */
+ struct mnttab entry;
+ uu_avl_pool_t *pool;
+ uu_avl_t *tree = NULL;
+ unshare_unmount_node_t *node;
+ uu_avl_index_t idx;
+ uu_avl_walk_t *walk;
+ char *protocol = NULL;
+
+ if (op == OP_SHARE && argc > 0) {
+ if (strcmp(argv[0], "nfs") != 0 &&
+ strcmp(argv[0], "smb") != 0) {
+ (void) fprintf(stderr, gettext("share type "
+ "must be 'nfs' or 'smb'\n"));
+ usage(B_FALSE);
+ }
+ protocol = argv[0];
+ argc--;
+ argv++;
+ }
+
+ if (argc != 0) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ if (((pool = uu_avl_pool_create("unmount_pool",
+ sizeof (unshare_unmount_node_t),
+ offsetof(unshare_unmount_node_t, un_avlnode),
+ unshare_unmount_compare, UU_DEFAULT)) == NULL) ||
+ ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL))
+ nomem();
+
+ /* Reopen MNTTAB to prevent reading stale data from open file */
+ if (freopen(MNTTAB, "r", mnttab_file) == NULL)
+ return (ENOENT);
+
+ while (getmntent(mnttab_file, &entry) == 0) {
+
+ /* ignore non-ZFS entries */
+ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
+ continue;
+
+ /* ignore snapshots */
+ if (strchr(entry.mnt_special, '@') != NULL)
+ continue;
+
+ if ((zhp = zfs_open(g_zfs, entry.mnt_special,
+ ZFS_TYPE_FILESYSTEM)) == NULL) {
+ ret = 1;
+ continue;
+ }
+
+ /*
+ * Ignore datasets that are excluded/restricted by
+ * parent pool name.
+ */
+ if (zpool_skip_pool(zfs_get_pool_name(zhp))) {
+ zfs_close(zhp);
+ continue;
+ }
+
+ switch (op) {
+ case OP_SHARE:
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
+ nfs_mnt_prop,
+ sizeof (nfs_mnt_prop),
+ NULL, NULL, 0, B_FALSE) == 0);
+ if (strcmp(nfs_mnt_prop, "off") != 0)
+ break;
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
+ nfs_mnt_prop,
+ sizeof (nfs_mnt_prop),
+ NULL, NULL, 0, B_FALSE) == 0);
+ if (strcmp(nfs_mnt_prop, "off") == 0)
+ continue;
+ break;
+ case OP_MOUNT:
+ /* Ignore legacy mounts */
+ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
+ nfs_mnt_prop,
+ sizeof (nfs_mnt_prop),
+ NULL, NULL, 0, B_FALSE) == 0);
+ if (strcmp(nfs_mnt_prop, "legacy") == 0)
+ continue;
+ /* Ignore canmount=noauto mounts */
+ if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) ==
+ ZFS_CANMOUNT_NOAUTO)
+ continue;
+ default:
+ break;
+ }
+
+ node = safe_malloc(sizeof (unshare_unmount_node_t));
+ node->un_zhp = zhp;
+ node->un_mountp = safe_strdup(entry.mnt_mountp);
+
+ uu_avl_node_init(node, &node->un_avlnode, pool);
+
+ if (uu_avl_find(tree, node, NULL, &idx) == NULL) {
+ uu_avl_insert(tree, node, idx);
+ } else {
+ zfs_close(node->un_zhp);
+ free(node->un_mountp);
+ free(node);
+ }
+ }
+
+ /*
+ * Walk the AVL tree in reverse, unmounting each filesystem and
+ * removing it from the AVL tree in the process.
+ */
+ if ((walk = uu_avl_walk_start(tree,
+ UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL)
+ nomem();
+
+ while ((node = uu_avl_walk_next(walk)) != NULL) {
+ const char *mntarg = NULL;
+
+ uu_avl_remove(tree, node);
+ switch (op) {
+ case OP_SHARE:
+ if (zfs_unshareall_bytype(node->un_zhp,
+ node->un_mountp, protocol) != 0)
+ ret = 1;
+ break;
+
+ case OP_MOUNT:
+ if (zfs_unmount(node->un_zhp,
+ mntarg, flags) != 0)
+ ret = 1;
+ break;
+ }
+
+ zfs_close(node->un_zhp);
+ free(node->un_mountp);
+ free(node);
+ }
+
+ if (op == OP_SHARE)
+ zfs_commit_shares(protocol);
+
+ uu_avl_walk_end(walk);
+ uu_avl_destroy(tree);
+ uu_avl_pool_destroy(pool);
+
+ } else {
+ if (argc != 1) {
+ if (argc == 0)
+ (void) fprintf(stderr,
+ gettext("missing filesystem argument\n"));
+ else
+ (void) fprintf(stderr,
+ gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ /*
+ * We have an argument, but it may be a full path or a ZFS
+ * filesystem. Pass full paths off to unshare_unmount_path()
+ * (shared with manual unmounts); otherwise open the filesystem
+ * and pass it to zfs_unmount().
+ */
+ if (argv[0][0] == '/')
+ return (unshare_unmount_path(op, argv[0],
+ flags, B_FALSE));
+
+ if ((zhp = zfs_open(g_zfs, argv[0],
+ ZFS_TYPE_FILESYSTEM)) == NULL)
+ return (1);
+
+ verify(zfs_prop_get(zhp, op == OP_SHARE ?
+ ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
+ nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL,
+ NULL, 0, B_FALSE) == 0);
+
+ switch (op) {
+ case OP_SHARE:
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
+ nfs_mnt_prop,
+ sizeof (nfs_mnt_prop),
+ NULL, NULL, 0, B_FALSE) == 0);
+ verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
+ sharesmb, sizeof (sharesmb), NULL, NULL,
+ 0, B_FALSE) == 0);
+
+ if (strcmp(nfs_mnt_prop, "off") == 0 &&
+ strcmp(sharesmb, "off") == 0) {
+ (void) fprintf(stderr, gettext("cannot "
+ "unshare '%s': legacy share\n"),
+ zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use "
+ "unshare(1M) to unshare this "
+ "filesystem\n"));
+ ret = 1;
+ } else if (!zfs_is_shared(zhp)) {
+ (void) fprintf(stderr, gettext("cannot "
+ "unshare '%s': not currently "
+ "shared\n"), zfs_get_name(zhp));
+ ret = 1;
+ } else if (zfs_unshareall(zhp) != 0) {
+ ret = 1;
+ }
+ break;
+
+ case OP_MOUNT:
+ if (strcmp(nfs_mnt_prop, "legacy") == 0) {
+ (void) fprintf(stderr, gettext("cannot "
+ "unmount '%s': legacy "
+ "mountpoint\n"), zfs_get_name(zhp));
+ (void) fprintf(stderr, gettext("use "
+ "umount(1M) to unmount this "
+ "filesystem\n"));
+ ret = 1;
+ } else if (!zfs_is_mounted(zhp, NULL)) {
+ (void) fprintf(stderr, gettext("cannot "
+ "unmount '%s': not currently "
+ "mounted\n"),
+ zfs_get_name(zhp));
+ ret = 1;
+ } else if (zfs_unmountall(zhp, flags) != 0) {
+ ret = 1;
+ }
+ break;
+ }
+
+ zfs_close(zhp);
+ }
+
+ return (ret);
+}
+
+/*
+ * zfs unmount [-fu] -a
+ * zfs unmount [-fu] filesystem
+ *
+ * Unmount all filesystems, or a specific ZFS filesystem.
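+ *
+ * For illustration (hypothetical dataset name), "zfs unmount tank/home"
+ * unmounts one filesystem, "zfs unmount -f tank/home" forces it, and
+ * "zfs unmount -a" unmounts every ZFS filesystem found in the mount table.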
+ */
+static int
+zfs_do_unmount(int argc, char **argv)
+{
+ return (unshare_unmount(OP_MOUNT, argc, argv));
+}
+
+/*
+ * zfs unshare -a
+ * zfs unshare filesystem
+ *
+ * Unshare all filesystems, or a specific ZFS filesystem.
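+ *
+ * For illustration (hypothetical dataset name), "zfs unshare tank/export"
+ * unshares one filesystem and "zfs unshare -a" unshares every shared
+ * ZFS filesystem.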
+ */
+static int
+zfs_do_unshare(int argc, char **argv)
+{
+ return (unshare_unmount(OP_SHARE, argc, argv));
+}
+
+static int
+find_command_idx(char *command, int *idx)
+{
+ int i;
+
+ for (i = 0; i < NCOMMAND; i++) {
+ if (command_table[i].name == NULL)
+ continue;
+
+ if (strcmp(command, command_table[i].name) == 0) {
+ *idx = i;
+ return (0);
+ }
+ }
+ return (1);
+}
+
+static int
+zfs_do_diff(int argc, char **argv)
+{
+ zfs_handle_t *zhp;
+ int flags = 0;
+ char *tosnap = NULL;
+ char *fromsnap = NULL;
+ char *atp, *copy;
+ int err = 0;
+ int c;
+ struct sigaction sa;
+
+ while ((c = getopt(argc, argv, "FHt")) != -1) {
+ switch (c) {
+ case 'F':
+ flags |= ZFS_DIFF_CLASSIFY;
+ break;
+ case 'H':
+ flags |= ZFS_DIFF_PARSEABLE;
+ break;
+ case 't':
+ flags |= ZFS_DIFF_TIMESTAMP;
+ break;
+ default:
+ (void) fprintf(stderr,
+ gettext("invalid option '%c'\n"), optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr,
+ gettext("must provide at least one snapshot name\n"));
+ usage(B_FALSE);
+ }
+
+ if (argc > 2) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ fromsnap = argv[0];
+ tosnap = (argc == 2) ? argv[1] : NULL;
+
+ copy = NULL;
+ if (*fromsnap != '@')
+ copy = strdup(fromsnap);
+ else if (tosnap)
+ copy = strdup(tosnap);
+ if (copy == NULL)
+ usage(B_FALSE);
+
+ if ((atp = strchr(copy, '@')) != NULL)
+ *atp = '\0';
+
+ if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL) {
+ free(copy);
+ return (1);
+ }
+ free(copy);
+
+ /*
+ * Ignore SIGPIPE so that the library can give us
+ * information on any failure
+ */
+ if (sigemptyset(&sa.sa_mask) == -1) {
+ err = errno;
+ goto out;
+ }
+ sa.sa_flags = 0;
+ sa.sa_handler = SIG_IGN;
+ if (sigaction(SIGPIPE, &sa, NULL) == -1) {
+ err = errno;
+ goto out;
+ }
+
+ err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags);
+out:
+ zfs_close(zhp);
+
+ return (err != 0);
+}
+
+/*
+ * zfs bookmark <fs@source>|<fs#source> <fs#bookmark>
+ *
+ * Creates a bookmark with the given name from the source snapshot
+ * or creates a copy of an existing source bookmark.
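+ *
+ * For illustration (hypothetical names):
+ *
+ *     zfs bookmark rpool/fs@snap1 rpool/fs#mark1   (from a snapshot)
+ *     zfs bookmark rpool/fs#mark1 rpool/fs#mark2   (copy of a bookmark)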
+ */
+static int
+zfs_do_bookmark(int argc, char **argv)
+{
+ char *source, *bookname;
+ char expbuf[ZFS_MAX_DATASET_NAME_LEN];
+ int source_type;
+ nvlist_t *nvl;
+ int ret = 0;
+ int c;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "")) != -1) {
+ switch (c) {
+ case '?':
+ (void) fprintf(stderr,
+ gettext("invalid option '%c'\n"), optopt);
+ goto usage;
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing source argument\n"));
+ goto usage;
+ }
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing bookmark argument\n"));
+ goto usage;
+ }
+
+ source = argv[0];
+ bookname = argv[1];
+
+ if (strchr(source, '@') == NULL && strchr(source, '#') == NULL) {
+ (void) fprintf(stderr,
+ gettext("invalid source name '%s': "
+ "must contain a '@' or '#'\n"), source);
+ goto usage;
+ }
+ if (strchr(bookname, '#') == NULL) {
+ (void) fprintf(stderr,
+ gettext("invalid bookmark name '%s': "
+ "must contain a '#'\n"), bookname);
+ goto usage;
+ }
+
+ /*
+ * Expand source or bookname to a full path:
+ * one of them may be specified as a short name.
+ */
+ {
+ char **expand;
+ char *source_short, *bookname_short;
+ source_short = strpbrk(source, "@#");
+ bookname_short = strpbrk(bookname, "#");
+ if (source_short == source &&
+ bookname_short == bookname) {
+ (void) fprintf(stderr, gettext(
+ "either source or bookmark must be specified as "
+ "a full dataset path\n"));
+ goto usage;
+ } else if (source_short != source &&
+ bookname_short != bookname) {
+ expand = NULL;
+ } else if (source_short != source) {
+ strlcpy(expbuf, source, sizeof (expbuf));
+ expand = &bookname;
+ } else if (bookname_short != bookname) {
+ strlcpy(expbuf, bookname, sizeof (expbuf));
+ expand = &source;
+ } else {
+ abort();
+ }
+ if (expand != NULL) {
+ *strpbrk(expbuf, "@#") = '\0'; /* dataset name in buf */
+ (void) strlcat(expbuf, *expand, sizeof (expbuf));
+ *expand = expbuf;
+ }
+ }
+
+ /* determine source type */
+ switch (*strpbrk(source, "@#")) {
+ case '@': source_type = ZFS_TYPE_SNAPSHOT; break;
+ case '#': source_type = ZFS_TYPE_BOOKMARK; break;
+ default: abort();
+ }
+
+ /* test the source exists */
+ zfs_handle_t *zhp;
+ zhp = zfs_open(g_zfs, source, source_type);
+ if (zhp == NULL)
+ goto usage;
+ zfs_close(zhp);
+
+ nvl = fnvlist_alloc();
+ fnvlist_add_string(nvl, bookname, source);
+ ret = lzc_bookmark(nvl, NULL);
+ fnvlist_free(nvl);
+
+ if (ret != 0) {
+ const char *err_msg = NULL;
+ char errbuf[1024];
+
+ (void) snprintf(errbuf, sizeof (errbuf),
+ dgettext(TEXT_DOMAIN,
+ "cannot create bookmark '%s'"), bookname);
+
+ switch (ret) {
+ case EXDEV:
+ err_msg = "bookmark is in a different pool";
+ break;
+ case ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR:
+ err_msg = "source is not an ancestor of the "
+ "new bookmark's dataset";
+ break;
+ case EEXIST:
+ err_msg = "bookmark exists";
+ break;
+ case EINVAL:
+ err_msg = "invalid argument";
+ break;
+ case ENOTSUP:
+ err_msg = "bookmark feature not enabled";
+ break;
+ case ENOSPC:
+ err_msg = "out of space";
+ break;
+ case ENOENT:
+ err_msg = "dataset does not exist";
+ break;
+ default:
+ (void) zfs_standard_error(g_zfs, ret, errbuf);
+ break;
+ }
+ if (err_msg != NULL) {
+ (void) fprintf(stderr, "%s: %s\n", errbuf,
+ dgettext(TEXT_DOMAIN, err_msg));
+ }
+ }
+
+ return (ret != 0);
+
+usage:
+ usage(B_FALSE);
+ return (-1);
+}
+
+static int
+zfs_do_channel_program(int argc, char **argv)
+{
+ int ret, fd, c;
+ char *progbuf, *filename, *poolname;
+ size_t progsize, progread;
+ nvlist_t *outnvl = NULL;
+ uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT;
+ uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT;
+ boolean_t sync_flag = B_TRUE, json_output = B_FALSE;
+ zpool_handle_t *zhp;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "nt:m:j")) != -1) {
+ switch (c) {
+ case 't':
+ case 'm': {
+ uint64_t arg;
+ char *endp;
+
+ errno = 0;
+ arg = strtoull(optarg, &endp, 0);
+ if (errno != 0 || *endp != '\0') {
+ (void) fprintf(stderr, gettext(
+ "invalid argument "
+ "'%s': expected integer\n"), optarg);
+ goto usage;
+ }
+
+ if (c == 't') {
+ instrlimit = arg;
+ } else {
+ ASSERT3U(c, ==, 'm');
+ memlimit = arg;
+ }
+ break;
+ }
+ case 'n': {
+ sync_flag = B_FALSE;
+ break;
+ }
+ case 'j': {
+ json_output = B_TRUE;
+ break;
+ }
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ goto usage;
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 2) {
+ (void) fprintf(stderr,
+ gettext("invalid number of arguments\n"));
+ goto usage;
+ }
+
+ poolname = argv[0];
+ filename = argv[1];
+ if (strcmp(filename, "-") == 0) {
+ fd = 0;
+ filename = "standard input";
+ } else if ((fd = open(filename, O_RDONLY)) < 0) {
+ (void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
+ filename, strerror(errno));
+ return (1);
+ }
+
+ if ((zhp = zpool_open(g_zfs, poolname)) == NULL) {
+ (void) fprintf(stderr, gettext("cannot open pool '%s'\n"),
+ poolname);
+ if (fd != 0)
+ (void) close(fd);
+ return (1);
+ }
+ zpool_close(zhp);
+
+ /*
+ * Read in the channel program, expanding the program buffer as
+ * necessary.
+ */
+ progread = 0;
+ progsize = 1024;
+ progbuf = safe_malloc(progsize);
+ do {
+ ret = read(fd, progbuf + progread, progsize - progread);
+ progread += ret;
+ if (progread == progsize && ret > 0) {
+ progsize *= 2;
+ progbuf = safe_realloc(progbuf, progsize);
+ }
+ } while (ret > 0);
+
+ if (fd != 0)
+ (void) close(fd);
+ if (ret < 0) {
+ free(progbuf);
+ (void) fprintf(stderr,
+ gettext("cannot read '%s': %s\n"),
+ filename, strerror(errno));
+ return (1);
+ }
+ progbuf[progread] = '\0';
+
+ /*
+ * Any remaining arguments are passed as arguments to the lua script as
+ * a string array:
+ * {
+ * "argv" -> [ "arg 1", ... "arg n" ],
+ * }
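+ *
+ * For example, two trailing command-line arguments "one" and "two"
+ * (hypothetical values) would be delivered to the script as
+ * { "argv" -> [ "one", "two" ] }.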
+ */
+ nvlist_t *argnvl = fnvlist_alloc();
+ fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, argv + 2, argc - 2);
+
+ if (sync_flag) {
+ ret = lzc_channel_program(poolname, progbuf,
+ instrlimit, memlimit, argnvl, &outnvl);
+ } else {
+ ret = lzc_channel_program_nosync(poolname, progbuf,
+ instrlimit, memlimit, argnvl, &outnvl);
+ }
+
+ if (ret != 0) {
+ /*
+ * On error, report the error message handed back by lua if one
+ * exists. Otherwise, generate an appropriate error message,
+ * falling back on strerror() for an unexpected return code.
+ */
+ char *errstring = NULL;
+ const char *msg = gettext("Channel program execution failed");
+ uint64_t instructions = 0;
+ if (outnvl != NULL && nvlist_exists(outnvl, ZCP_RET_ERROR)) {
+ (void) nvlist_lookup_string(outnvl,
+ ZCP_RET_ERROR, &errstring);
+ if (errstring == NULL)
+ errstring = strerror(ret);
+ if (ret == ETIME) {
+ (void) nvlist_lookup_uint64(outnvl,
+ ZCP_ARG_INSTRLIMIT, &instructions);
+ }
+ } else {
+ switch (ret) {
+ case EINVAL:
+ errstring =
+ "Invalid instruction or memory limit.";
+ break;
+ case ENOMEM:
+ errstring = "Return value too large.";
+ break;
+ case ENOSPC:
+ errstring = "Memory limit exhausted.";
+ break;
+ case ETIME:
+ errstring = "Timed out.";
+ break;
+ case EPERM:
+ errstring = "Permission denied. Channel "
+ "programs must be run as root.";
+ break;
+ default:
+ (void) zfs_standard_error(g_zfs, ret, msg);
+ }
+ }
+ if (errstring != NULL)
+ (void) fprintf(stderr, "%s:\n%s\n", msg, errstring);
+
+ if (ret == ETIME && instructions != 0)
+ (void) fprintf(stderr,
+ gettext("%llu Lua instructions\n"),
+ (u_longlong_t)instructions);
+ } else {
+ if (json_output) {
+ (void) nvlist_print_json(stdout, outnvl);
+ } else if (nvlist_empty(outnvl)) {
+ (void) fprintf(stdout, gettext("Channel program fully "
+ "executed and did not produce output.\n"));
+ } else {
+ (void) fprintf(stdout, gettext("Channel program fully "
+ "executed and produced output:\n"));
+ dump_nvlist(outnvl, 4);
+ }
+ }
+
+ free(progbuf);
+ fnvlist_free(outnvl);
+ fnvlist_free(argnvl);
+ return (ret != 0);
+
+usage:
+ usage(B_FALSE);
+ return (-1);
+}
+
+typedef struct loadkey_cbdata {
+ boolean_t cb_loadkey;
+ boolean_t cb_recursive;
+ boolean_t cb_noop;
+ char *cb_keylocation;
+ uint64_t cb_numfailed;
+ uint64_t cb_numattempted;
+} loadkey_cbdata_t;
+
+static int
+load_key_callback(zfs_handle_t *zhp, void *data)
+{
+ int ret;
+ boolean_t is_encroot;
+ loadkey_cbdata_t *cb = data;
+ uint64_t keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS);
+
+ /*
+ * If we are working recursively, we want to skip loading / unloading
+ * keys for non-encryption roots and datasets whose keys are already
+ * in the desired end-state.
+ */
+ if (cb->cb_recursive) {
+ ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL);
+ if (ret != 0)
+ return (ret);
+ if (!is_encroot)
+ return (0);
+
+ if ((cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_AVAILABLE) ||
+ (!cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_UNAVAILABLE))
+ return (0);
+ }
+
+ cb->cb_numattempted++;
+
+ if (cb->cb_loadkey)
+ ret = zfs_crypto_load_key(zhp, cb->cb_noop, cb->cb_keylocation);
+ else
+ ret = zfs_crypto_unload_key(zhp);
+
+ if (ret != 0) {
+ cb->cb_numfailed++;
+ return (ret);
+ }
+
+ return (0);
+}
+
+static int
+load_unload_keys(int argc, char **argv, boolean_t loadkey)
+{
+ int c, ret = 0, flags = 0;
+ boolean_t do_all = B_FALSE;
+ loadkey_cbdata_t cb = { 0 };
+
+ cb.cb_loadkey = loadkey;
+
+ while ((c = getopt(argc, argv, "anrL:")) != -1) {
+ /* noop and alternate keylocations only apply to zfs load-key */
+ if (loadkey) {
+ switch (c) {
+ case 'n':
+ cb.cb_noop = B_TRUE;
+ continue;
+ case 'L':
+ cb.cb_keylocation = optarg;
+ continue;
+ default:
+ break;
+ }
+ }
+
+ switch (c) {
+ case 'a':
+ do_all = B_TRUE;
+ cb.cb_recursive = B_TRUE;
+ break;
+ case 'r':
+ flags |= ZFS_ITER_RECURSE;
+ cb.cb_recursive = B_TRUE;
+ break;
+ default:
+ (void) fprintf(stderr,
+ gettext("invalid option '%c'\n"), optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (!do_all && argc == 0) {
+ (void) fprintf(stderr,
+ gettext("Missing dataset argument or -a option\n"));
+ usage(B_FALSE);
+ }
+
+ if (do_all && argc != 0) {
+ (void) fprintf(stderr,
+ gettext("Cannot specify dataset with -a option\n"));
+ usage(B_FALSE);
+ }
+
+ if (cb.cb_recursive && cb.cb_keylocation != NULL &&
+ strcmp(cb.cb_keylocation, "prompt") != 0) {
+ (void) fprintf(stderr, gettext("alternate keylocation may only "
+ "be 'prompt' with -r or -a\n"));
+ usage(B_FALSE);
+ }
+
+ ret = zfs_for_each(argc, argv, flags,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, NULL, NULL, 0,
+ load_key_callback, &cb);
+
+ if (cb.cb_noop || (cb.cb_recursive && cb.cb_numattempted != 0)) {
+ (void) printf(gettext("%llu / %llu key(s) successfully %s\n"),
+ (u_longlong_t)(cb.cb_numattempted - cb.cb_numfailed),
+ (u_longlong_t)cb.cb_numattempted,
+ loadkey ? (cb.cb_noop ? "verified" : "loaded") :
+ "unloaded");
+ }
+
+ if (cb.cb_numfailed != 0)
+ ret = -1;
+
+ return (ret);
+}
+
+static int
+zfs_do_load_key(int argc, char **argv)
+{
+ return (load_unload_keys(argc, argv, B_TRUE));
+}
+
+static int
+zfs_do_unload_key(int argc, char **argv)
+{
+ return (load_unload_keys(argc, argv, B_FALSE));
+}
+
+static int
+zfs_do_change_key(int argc, char **argv)
+{
+ int c, ret;
+ uint64_t keystatus;
+ boolean_t loadkey = B_FALSE, inheritkey = B_FALSE;
+ zfs_handle_t *zhp = NULL;
+ nvlist_t *props = fnvlist_alloc();
+
+ while ((c = getopt(argc, argv, "lio:")) != -1) {
+ switch (c) {
+ case 'l':
+ loadkey = B_TRUE;
+ break;
+ case 'i':
+ inheritkey = B_TRUE;
+ break;
+ case 'o':
+ if (!parseprop(props, optarg)) {
+ nvlist_free(props);
+ return (1);
+ }
+ break;
+ default:
+ (void) fprintf(stderr,
+ gettext("invalid option '%c'\n"), optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ if (inheritkey && !nvlist_empty(props)) {
+ (void) fprintf(stderr,
+ gettext("Properties not allowed for inheriting\n"));
+ usage(B_FALSE);
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("Missing dataset argument\n"));
+ usage(B_FALSE);
+ }
+
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("Too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ zhp = zfs_open(g_zfs, argv[argc - 1],
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+ if (zhp == NULL)
+ usage(B_FALSE);
+
+ if (loadkey) {
+ keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS);
+ if (keystatus != ZFS_KEYSTATUS_AVAILABLE) {
+ ret = zfs_crypto_load_key(zhp, B_FALSE, NULL);
+ if (ret != 0) {
+ nvlist_free(props);
+ zfs_close(zhp);
+ return (-1);
+ }
+ }
+
+ /* refresh the properties so the new keystatus is visible */
+ zfs_refresh_properties(zhp);
+ }
+
+ ret = zfs_crypto_rewrap(zhp, props, inheritkey);
+ if (ret != 0) {
+ nvlist_free(props);
+ zfs_close(zhp);
+ return (-1);
+ }
+
+ nvlist_free(props);
+ zfs_close(zhp);
+ return (0);
+}
+
+/*
+ * 1) zfs project [-d|-r] <file|directory ...>
+ * List project ID and inherit flag of file(s) or directories.
+ * -d: List the directory itself, not its children.
+ * -r: List subdirectories recursively.
+ *
+ * 2) zfs project -C [-k] [-r] <file|directory ...>
+ * Clear project inherit flag and/or ID on the file(s) or directories.
+ * -k: Keep the project ID unchanged. If not specified, the project ID
+ * will be reset to zero.
+ * -r: Clear on subdirectories recursively.
+ *
+ * 3) zfs project -c [-0] [-d|-r] [-p id] <file|directory ...>
+ * Check project ID and inherit flag on the file(s) or directories,
+ * report the outliers.
+ * -0: Print file name followed by a NUL instead of newline.
+ * -d: Check the directory itself, not its children.
+ * -p: Specify the referenced ID for comparing with the target file(s)
+ * or directories' project IDs. If not specified, the target (top)
+ * directory's project ID will be used as the referenced one.
+ * -r: Check subdirectories recursively.
+ *
+ * 4) zfs project [-p id] [-r] [-s] <file|directory ...>
+ * Set project ID and/or inherit flag on the file(s) or directories.
+ * -p: Set the project ID as the given id.
+ * -r: Set on subdirectories recursively. If the "-p" option is not
+ * specified, the top-level directory's project ID is used as the
+ * given id, and both the project ID and the inherit flag are set
+ * on all descendants of the top-level directory.
+ * -s: Set project inherit flag.
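+ *
+ * For illustration (hypothetical ID and path), "zfs project -s -p 1001 -r
+ * /tank/dir" sets project ID 1001 and the inherit flag on /tank/dir and
+ * all of its descendants, while "zfs project -d /tank/dir" lists the
+ * directory's own project ID and inherit flag.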
+ */
+static int
+zfs_do_project(int argc, char **argv)
+{
+ zfs_project_control_t zpc = {
+ .zpc_expected_projid = ZFS_INVALID_PROJID,
+ .zpc_op = ZFS_PROJECT_OP_DEFAULT,
+ .zpc_dironly = B_FALSE,
+ .zpc_keep_projid = B_FALSE,
+ .zpc_newline = B_TRUE,
+ .zpc_recursive = B_FALSE,
+ .zpc_set_flag = B_FALSE,
+ };
+ int ret = 0, c;
+
+ if (argc < 2)
+ usage(B_FALSE);
+
+ while ((c = getopt(argc, argv, "0Ccdkp:rs")) != -1) {
+ switch (c) {
+ case '0':
+ zpc.zpc_newline = B_FALSE;
+ break;
+ case 'C':
+ if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) {
+ (void) fprintf(stderr, gettext("cannot "
+ "specify '-C' '-c' '-s' together\n"));
+ usage(B_FALSE);
+ }
+
+ zpc.zpc_op = ZFS_PROJECT_OP_CLEAR;
+ break;
+ case 'c':
+ if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) {
+ (void) fprintf(stderr, gettext("cannot "
+ "specify '-C' '-c' '-s' together\n"));
+ usage(B_FALSE);
+ }
+
+ zpc.zpc_op = ZFS_PROJECT_OP_CHECK;
+ break;
+ case 'd':
+ zpc.zpc_dironly = B_TRUE;
+ /* overwrite "-r" option */
+ zpc.zpc_recursive = B_FALSE;
+ break;
+ case 'k':
+ zpc.zpc_keep_projid = B_TRUE;
+ break;
+ case 'p': {
+ char *endptr;
+
+ errno = 0;
+ zpc.zpc_expected_projid = strtoull(optarg, &endptr, 0);
+ if (errno != 0 || *endptr != '\0') {
+ (void) fprintf(stderr,
+ gettext("project ID must be less than "
+ "%u\n"), UINT32_MAX);
+ usage(B_FALSE);
+ }
+ if (zpc.zpc_expected_projid >= UINT32_MAX) {
+ (void) fprintf(stderr,
+ gettext("invalid project ID\n"));
+ usage(B_FALSE);
+ }
+ break;
+ }
+ case 'r':
+ zpc.zpc_recursive = B_TRUE;
+ /* overwrite "-d" option */
+ zpc.zpc_dironly = B_FALSE;
+ break;
+ case 's':
+ if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) {
+ (void) fprintf(stderr, gettext("cannot "
+ "specify '-C' '-c' '-s' together\n"));
+ usage(B_FALSE);
+ }
+
+ zpc.zpc_set_flag = B_TRUE;
+ zpc.zpc_op = ZFS_PROJECT_OP_SET;
+ break;
+ default:
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ if (zpc.zpc_op == ZFS_PROJECT_OP_DEFAULT) {
+ if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID)
+ zpc.zpc_op = ZFS_PROJECT_OP_SET;
+ else
+ zpc.zpc_op = ZFS_PROJECT_OP_LIST;
+ }
+
+ switch (zpc.zpc_op) {
+ case ZFS_PROJECT_OP_LIST:
+ if (zpc.zpc_keep_projid) {
+ (void) fprintf(stderr,
+ gettext("'-k' is only valid together with '-C'\n"));
+ usage(B_FALSE);
+ }
+ if (!zpc.zpc_newline) {
+ (void) fprintf(stderr,
+ gettext("'-0' is only valid together with '-c'\n"));
+ usage(B_FALSE);
+ }
+ break;
+ case ZFS_PROJECT_OP_CHECK:
+ if (zpc.zpc_keep_projid) {
+ (void) fprintf(stderr,
+ gettext("'-k' is only valid together with '-C'\n"));
+ usage(B_FALSE);
+ }
+ break;
+ case ZFS_PROJECT_OP_CLEAR:
+ if (zpc.zpc_dironly) {
+ (void) fprintf(stderr,
+ gettext("'-d' is useless together with '-C'\n"));
+ usage(B_FALSE);
+ }
+ if (!zpc.zpc_newline) {
+ (void) fprintf(stderr,
+ gettext("'-0' is only valid together with '-c'\n"));
+ usage(B_FALSE);
+ }
+ if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) {
+ (void) fprintf(stderr,
+ gettext("'-p' is useless together with '-C'\n"));
+ usage(B_FALSE);
+ }
+ break;
+ case ZFS_PROJECT_OP_SET:
+ if (zpc.zpc_dironly) {
+ (void) fprintf(stderr,
+ gettext("'-d' is useless when setting project ID and/or "
+ "inherit flag\n"));
+ usage(B_FALSE);
+ }
+ if (zpc.zpc_keep_projid) {
+ (void) fprintf(stderr,
+ gettext("'-k' is only valid together with '-C'\n"));
+ usage(B_FALSE);
+ }
+ if (!zpc.zpc_newline) {
+ (void) fprintf(stderr,
+ gettext("'-0' is only valid together with '-c'\n"));
+ usage(B_FALSE);
+ }
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ argv += optind;
+ argc -= optind;
+ if (argc == 0) {
+ (void) fprintf(stderr,
+ gettext("missing file or directory target(s)\n"));
+ usage(B_FALSE);
+ }
+
+ for (int i = 0; i < argc; i++) {
+ int err;
+
+ err = zfs_project_handle(argv[i], &zpc);
+ if (err && !ret)
+ ret = err;
+ }
+
+ return (ret);
+}
+
+static int
+zfs_do_wait(int argc, char **argv)
+{
+ boolean_t enabled[ZFS_WAIT_NUM_ACTIVITIES];
+ int error = 0, i;
+ int c;
+
+ /* By default, wait for all types of activity. */
+ for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++)
+ enabled[i] = B_TRUE;
+
+ while ((c = getopt(argc, argv, "t:")) != -1) {
+ switch (c) {
+ case 't':
+ {
+ static char *col_subopts[] = { "deleteq", NULL };
+ char *value;
+
+ /* Reset activities array */
+ bzero(&enabled, sizeof (enabled));
+ while (*optarg != '\0') {
+ int activity = getsubopt(&optarg, col_subopts,
+ &value);
+
+ if (activity < 0) {
+ (void) fprintf(stderr,
+ gettext("invalid activity '%s'\n"),
+ value);
+ usage(B_FALSE);
+ }
+
+ enabled[activity] = B_TRUE;
+ }
+ break;
+ }
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argv += optind;
+ argc -= optind;
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing 'filesystem' "
+ "argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM);
+ if (zhp == NULL)
+ return (1);
+
+ for (;;) {
+ boolean_t missing = B_FALSE;
+ boolean_t any_waited = B_FALSE;
+
+ for (int i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) {
+ boolean_t waited;
+
+ if (!enabled[i])
+ continue;
+
+ error = zfs_wait_status(zhp, i, &missing, &waited);
+ if (error != 0 || missing)
+ break;
+
+ any_waited = (any_waited || waited);
+ }
+
+ if (error != 0 || missing || !any_waited)
+ break;
+ }
+
+ zfs_close(zhp);
+
+ return (error);
+}
+
+/*
+ * Display version message
+ */
+static int
+zfs_do_version(int argc, char **argv)
+{
+ if (zfs_version_print() == -1)
+ return (1);
+
+ return (0);
+}
+
+int
+main(int argc, char **argv)
+{
+ int ret = 0;
+ int i = 0;
+ char *cmdname;
+ char **newargv;
+
+ (void) setlocale(LC_ALL, "");
+ (void) textdomain(TEXT_DOMAIN);
+
+ opterr = 0;
+
+ /*
+ * Make sure the user has specified some command.
+ */
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing command\n"));
+ usage(B_FALSE);
+ }
+
+ cmdname = argv[1];
+
+ /*
+ * The 'umount' command is an alias for 'unmount'
+ */
+ if (strcmp(cmdname, "umount") == 0)
+ cmdname = "unmount";
+
+ /*
+ * The 'recv' command is an alias for 'receive'
+ */
+ if (strcmp(cmdname, "recv") == 0)
+ cmdname = "receive";
+
+ /*
+ * The 'snap' command is an alias for 'snapshot'
+ */
+ if (strcmp(cmdname, "snap") == 0)
+ cmdname = "snapshot";
+
+ /*
+ * Special case '-?'
+ */
+ if ((strcmp(cmdname, "-?") == 0) ||
+ (strcmp(cmdname, "--help") == 0))
+ usage(B_TRUE);
+
+ /*
+ * Special case '-V|--version'
+ */
+ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0))
+ return (zfs_do_version(argc, argv));
+
+ if ((g_zfs = libzfs_init()) == NULL) {
+ (void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
+ return (1);
+ }
+
+ mnttab_file = g_zfs->libzfs_mnttab;
+
+ zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
+
+ libzfs_print_on_error(g_zfs, B_TRUE);
+
+ /*
+ * Many commands modify input strings for string parsing reasons.
+ * We create a copy to protect the original argv.
+ */
+ newargv = malloc((argc + 1) * sizeof (newargv[0]));
+ for (i = 0; i < argc; i++)
+ newargv[i] = strdup(argv[i]);
+ newargv[argc] = NULL;
+
+ /*
+ * Run the appropriate command.
+ */
+ libzfs_mnttab_cache(g_zfs, B_TRUE);
+ if (find_command_idx(cmdname, &i) == 0) {
+ current_command = &command_table[i];
+ ret = command_table[i].func(argc - 1, newargv + 1);
+ } else if (strchr(cmdname, '=') != NULL) {
+ verify(find_command_idx("set", &i) == 0);
+ current_command = &command_table[i];
+ ret = command_table[i].func(argc, newargv);
+ } else {
+ (void) fprintf(stderr, gettext("unrecognized "
+ "command '%s'\n"), cmdname);
+ usage(B_FALSE);
+ ret = 1;
+ }
+
+ for (i = 0; i < argc; i++)
+ free(newargv[i]);
+ free(newargv);
+
+ if (ret == 0 && log_history)
+ (void) zpool_log_history(g_zfs, history_str);
+
+ libzfs_fini(g_zfs);
+
+ /*
+ * The 'ZFS_ABORT' environment variable causes us to dump core on exit
+ * for the purposes of running ::findleaks.
+ */
+ if (getenv("ZFS_ABORT") != NULL) {
+ (void) printf("dumping core by request\n");
+ abort();
+ }
+
+ return (ret);
+}
+
+#ifdef __FreeBSD__
+#include <sys/jail.h>
+#include <jail.h>
+/*
+ * Attach/detach the given dataset to/from the given jail
+ */
+/* ARGSUSED */
+static int
+zfs_do_jail_impl(int argc, char **argv, boolean_t attach)
+{
+ zfs_handle_t *zhp;
+ int jailid, ret;
+
+ /* check number of arguments */
+ if (argc < 3) {
+ (void) fprintf(stderr, gettext("missing argument(s)\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 3) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ jailid = jail_getid(argv[1]);
+ if (jailid < 0) {
+ (void) fprintf(stderr, gettext("invalid jail id or name\n"));
+ usage(B_FALSE);
+ }
+
+ zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM);
+ if (zhp == NULL)
+ return (1);
+
+ ret = (zfs_jail(zhp, jailid, attach) != 0);
+
+ zfs_close(zhp);
+ return (ret);
+}
+
+/*
+ * zfs jail jailid filesystem
+ *
+ * Attach the given dataset to the given jail
+ */
+/* ARGSUSED */
+static int
+zfs_do_jail(int argc, char **argv)
+{
+ return (zfs_do_jail_impl(argc, argv, B_TRUE));
+}
+
+/*
+ * zfs unjail jailid filesystem
+ *
+ * Detach the given dataset from the given jail
+ */
+/* ARGSUSED */
+static int
+zfs_do_unjail(int argc, char **argv)
+{
+ return (zfs_do_jail_impl(argc, argv, B_FALSE));
+}
+#endif
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_project.c b/sys/contrib/openzfs/cmd/zfs/zfs_project.c
new file mode 100644
index 000000000000..341cc005de48
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_project.c
@@ -0,0 +1,295 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Intel Corporation. All rights reserved.
+ */
+
+#include <errno.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <stddef.h>
+#include <libintl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/list.h>
+#include <sys/zfs_project.h>
+
+#include "zfs_util.h"
+#include "zfs_projectutil.h"
+
+typedef struct zfs_project_item {
+ list_node_t zpi_list;
+ char zpi_name[0];
+} zfs_project_item_t;
+
+static void
+zfs_project_item_alloc(list_t *head, const char *name)
+{
+ zfs_project_item_t *zpi;
+
+ zpi = safe_malloc(sizeof (zfs_project_item_t) + strlen(name) + 1);
+ strcpy(zpi->zpi_name, name);
+ list_insert_tail(head, zpi);
+}
+
+static int
+zfs_project_sanity_check(const char *name, zfs_project_control_t *zpc,
+ struct stat *st)
+{
+ int ret;
+
+ ret = stat(name, st);
+ if (ret) {
+ (void) fprintf(stderr, gettext("failed to stat %s: %s\n"),
+ name, strerror(errno));
+ return (ret);
+ }
+
+ if (!S_ISREG(st->st_mode) && !S_ISDIR(st->st_mode)) {
+		(void) fprintf(stderr, gettext("project quota is only "
+		    "supported on regular files and directories\n"));
+ return (-1);
+ }
+
+ if (!S_ISDIR(st->st_mode)) {
+ if (zpc->zpc_dironly) {
+ (void) fprintf(stderr, gettext(
+ "'-d' option on non-dir target %s\n"), name);
+ return (-1);
+ }
+
+ if (zpc->zpc_recursive) {
+ (void) fprintf(stderr, gettext(
+ "'-r' option on non-dir target %s\n"), name);
+ return (-1);
+ }
+ }
+
+ return (0);
+}
+
+static int
+zfs_project_load_projid(const char *name, zfs_project_control_t *zpc)
+{
+ zfsxattr_t fsx;
+ int ret, fd;
+
+ fd = open(name, O_RDONLY | O_NOCTTY);
+ if (fd < 0) {
+ (void) fprintf(stderr, gettext("failed to open %s: %s\n"),
+ name, strerror(errno));
+ return (fd);
+ }
+
+ ret = ioctl(fd, ZFS_IOC_FSGETXATTR, &fsx);
+ if (ret)
+ (void) fprintf(stderr,
+ gettext("failed to get xattr for %s: %s\n"),
+ name, strerror(errno));
+ else
+ zpc->zpc_expected_projid = fsx.fsx_projid;
+
+ close(fd);
+ return (ret);
+}
+
+static int
+zfs_project_handle_one(const char *name, zfs_project_control_t *zpc)
+{
+ zfsxattr_t fsx;
+ int ret, fd;
+
+ fd = open(name, O_RDONLY | O_NOCTTY);
+ if (fd < 0) {
+ if (errno == ENOENT && zpc->zpc_ignore_noent)
+ return (0);
+
+ (void) fprintf(stderr, gettext("failed to open %s: %s\n"),
+ name, strerror(errno));
+ return (fd);
+ }
+
+ ret = ioctl(fd, ZFS_IOC_FSGETXATTR, &fsx);
+ if (ret) {
+ (void) fprintf(stderr,
+ gettext("failed to get xattr for %s: %s\n"),
+ name, strerror(errno));
+ goto out;
+ }
+
+ switch (zpc->zpc_op) {
+ case ZFS_PROJECT_OP_LIST:
+ (void) printf("%5u %c %s\n", fsx.fsx_projid,
+ (fsx.fsx_xflags & ZFS_PROJINHERIT_FL) ? 'P' : '-', name);
+ goto out;
+ case ZFS_PROJECT_OP_CHECK:
+ if (fsx.fsx_projid == zpc->zpc_expected_projid &&
+ fsx.fsx_xflags & ZFS_PROJINHERIT_FL)
+ goto out;
+
+ if (!zpc->zpc_newline) {
+ char c = '\0';
+
+ (void) printf("%s%c", name, c);
+ goto out;
+ }
+
+ if (fsx.fsx_projid != zpc->zpc_expected_projid)
+ (void) printf("%s - project ID is not set properly "
+ "(%u/%u)\n", name, fsx.fsx_projid,
+ (uint32_t)zpc->zpc_expected_projid);
+
+ if (!(fsx.fsx_xflags & ZFS_PROJINHERIT_FL))
+ (void) printf("%s - project inherit flag is not set\n",
+ name);
+
+ goto out;
+ case ZFS_PROJECT_OP_CLEAR:
+ if (!(fsx.fsx_xflags & ZFS_PROJINHERIT_FL) &&
+ (zpc->zpc_keep_projid ||
+ fsx.fsx_projid == ZFS_DEFAULT_PROJID))
+ goto out;
+
+ fsx.fsx_xflags &= ~ZFS_PROJINHERIT_FL;
+ if (!zpc->zpc_keep_projid)
+ fsx.fsx_projid = ZFS_DEFAULT_PROJID;
+ break;
+ case ZFS_PROJECT_OP_SET:
+ if (fsx.fsx_projid == zpc->zpc_expected_projid &&
+ (!zpc->zpc_set_flag || fsx.fsx_xflags & ZFS_PROJINHERIT_FL))
+ goto out;
+
+ fsx.fsx_projid = zpc->zpc_expected_projid;
+ if (zpc->zpc_set_flag)
+ fsx.fsx_xflags |= ZFS_PROJINHERIT_FL;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ ret = ioctl(fd, ZFS_IOC_FSSETXATTR, &fsx);
+ if (ret)
+ (void) fprintf(stderr,
+ gettext("failed to set xattr for %s: %s\n"),
+ name, strerror(errno));
+
+out:
+ close(fd);
+ return (ret);
+}
+
+static int
+zfs_project_handle_dir(const char *name, zfs_project_control_t *zpc,
+ list_t *head)
+{
+ char fullname[PATH_MAX];
+ struct dirent *ent;
+ DIR *dir;
+ int ret = 0;
+
+ dir = opendir(name);
+ if (dir == NULL) {
+ if (errno == ENOENT && zpc->zpc_ignore_noent)
+ return (0);
+
+ ret = -errno;
+ (void) fprintf(stderr, gettext("failed to opendir %s: %s\n"),
+ name, strerror(errno));
+ return (ret);
+ }
+
+	/*
+	 * Non-top-level item: ignore entries removed or renamed by a race.
+	 */
+ zpc->zpc_ignore_noent = B_TRUE;
+ errno = 0;
+ while (!ret && (ent = readdir(dir)) != NULL) {
+ /* skip "." and ".." */
+ if (strcmp(ent->d_name, ".") == 0 ||
+ strcmp(ent->d_name, "..") == 0)
+ continue;
+
+ if (strlen(ent->d_name) + strlen(name) >=
+		    sizeof (fullname) - 1) {
+ errno = ENAMETOOLONG;
+ break;
+ }
+
+ sprintf(fullname, "%s/%s", name, ent->d_name);
+ ret = zfs_project_handle_one(fullname, zpc);
+ if (!ret && zpc->zpc_recursive && ent->d_type == DT_DIR)
+ zfs_project_item_alloc(head, fullname);
+ }
+
+ if (errno && !ret) {
+ ret = -errno;
+ (void) fprintf(stderr, gettext("failed to readdir %s: %s\n"),
+ name, strerror(errno));
+ }
+
+ closedir(dir);
+ return (ret);
+}
+
+int
+zfs_project_handle(const char *name, zfs_project_control_t *zpc)
+{
+ zfs_project_item_t *zpi;
+ struct stat st;
+ list_t head;
+ int ret;
+
+ ret = zfs_project_sanity_check(name, zpc, &st);
+ if (ret)
+ return (ret);
+
+ if ((zpc->zpc_op == ZFS_PROJECT_OP_SET ||
+ zpc->zpc_op == ZFS_PROJECT_OP_CHECK) &&
+ zpc->zpc_expected_projid == ZFS_INVALID_PROJID) {
+ ret = zfs_project_load_projid(name, zpc);
+ if (ret)
+ return (ret);
+ }
+
+ zpc->zpc_ignore_noent = B_FALSE;
+ ret = zfs_project_handle_one(name, zpc);
+ if (ret || !S_ISDIR(st.st_mode) || zpc->zpc_dironly ||
+ (!zpc->zpc_recursive &&
+ zpc->zpc_op != ZFS_PROJECT_OP_LIST &&
+ zpc->zpc_op != ZFS_PROJECT_OP_CHECK))
+ return (ret);
+
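+	/*
+	 * Walk the directory tree breadth-first: directories discovered during
+	 * a recursive walk are appended to 'head' and processed in turn until
+	 * the work list drains.
+	 */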
+ list_create(&head, sizeof (zfs_project_item_t),
+ offsetof(zfs_project_item_t, zpi_list));
+ zfs_project_item_alloc(&head, name);
+ while ((zpi = list_remove_head(&head)) != NULL) {
+ if (!ret)
+ ret = zfs_project_handle_dir(zpi->zpi_name, zpc, &head);
+ free(zpi);
+ }
+
+ return (ret);
+}
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_projectutil.h b/sys/contrib/openzfs/cmd/zfs/zfs_projectutil.h
new file mode 100644
index 000000000000..1792a3383a03
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_projectutil.h
@@ -0,0 +1,49 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Intel Corporation. All rights reserved.
+ */
+
+#ifndef _ZFS_PROJECTUTIL_H
+#define _ZFS_PROJECTUTIL_H
+
+typedef enum {
+ ZFS_PROJECT_OP_DEFAULT = 0,
+ ZFS_PROJECT_OP_LIST = 1,
+ ZFS_PROJECT_OP_CHECK = 2,
+ ZFS_PROJECT_OP_CLEAR = 3,
+ ZFS_PROJECT_OP_SET = 4,
+} zfs_project_ops_t;
+
+typedef struct zfs_project_control {
+ uint64_t zpc_expected_projid;
+ zfs_project_ops_t zpc_op;
+ boolean_t zpc_dironly;
+ boolean_t zpc_ignore_noent;
+ boolean_t zpc_keep_projid;
+ boolean_t zpc_newline;
+ boolean_t zpc_recursive;
+ boolean_t zpc_set_flag;
+} zfs_project_control_t;
+
+int zfs_project_handle(const char *name, zfs_project_control_t *zpc);
+
+#endif /* _ZFS_PROJECTUTIL_H */
diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_util.h b/sys/contrib/openzfs/cmd/zfs/zfs_util.h
new file mode 100644
index 000000000000..a56af59adb15
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_util.h
@@ -0,0 +1,42 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _ZFS_UTIL_H
+#define _ZFS_UTIL_H
+
+#include <libzfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void * safe_malloc(size_t size);
+void nomem(void);
+extern libzfs_handle_t *g_zfs;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_UTIL_H */
diff --git a/sys/contrib/openzfs/cmd/zfs_ids_to_path/.gitignore b/sys/contrib/openzfs/cmd/zfs_ids_to_path/.gitignore
new file mode 100644
index 000000000000..f95f853e48c2
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs_ids_to_path/.gitignore
@@ -0,0 +1 @@
+zfs_ids_to_path
diff --git a/sys/contrib/openzfs/cmd/zfs_ids_to_path/Makefile.am b/sys/contrib/openzfs/cmd/zfs_ids_to_path/Makefile.am
new file mode 100644
index 000000000000..176eeb3c72c5
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs_ids_to_path/Makefile.am
@@ -0,0 +1,9 @@
+include $(top_srcdir)/config/Rules.am
+
+sbin_PROGRAMS = zfs_ids_to_path
+
+zfs_ids_to_path_SOURCES = \
+ zfs_ids_to_path.c
+
+zfs_ids_to_path_LDADD = \
+ $(abs_top_builddir)/lib/libzfs/libzfs.la
diff --git a/sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c b/sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c
new file mode 100644
index 000000000000..6cfaa6f41fa5
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c
@@ -0,0 +1,96 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+#include <libintl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <libzfs.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+
+libzfs_handle_t *g_zfs;
+
+static void
+usage(int err)
+{
+	fprintf(stderr, "Usage: zfs_ids_to_path [-v] <pool> <objset id> "
+ "<object id>\n");
+ exit(err);
+}
+
+int
+main(int argc, char **argv)
+{
+ boolean_t verbose = B_FALSE;
+	int c;
+ while ((c = getopt(argc, argv, "v")) != -1) {
+ switch (c) {
+ case 'v':
+ verbose = B_TRUE;
+ break;
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 3) {
+ (void) fprintf(stderr, "Incorrect number of arguments: %d\n",
+ argc);
+ usage(1);
+ }
+
+ uint64_t objset, object;
+ if (sscanf(argv[1], "%llu", (u_longlong_t *)&objset) != 1) {
+		(void) fprintf(stderr, "Invalid objset id: %s\n", argv[1]);
+ usage(2);
+ }
+ if (sscanf(argv[2], "%llu", (u_longlong_t *)&object) != 1) {
+		(void) fprintf(stderr, "Invalid object id: %s\n", argv[2]);
+ usage(3);
+ }
+ if ((g_zfs = libzfs_init()) == NULL) {
+ (void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
+ return (4);
+ }
+ zpool_handle_t *pool = zpool_open(g_zfs, argv[0]);
+ if (pool == NULL) {
+		fprintf(stderr, "Could not open pool %s\n", argv[0]);
+ libzfs_fini(g_zfs);
+ return (5);
+ }
+
+ char pathname[PATH_MAX * 2];
+ if (verbose) {
+ zpool_obj_to_path_ds(pool, objset, object, pathname,
+ sizeof (pathname));
+ } else {
+ zpool_obj_to_path(pool, objset, object, pathname,
+ sizeof (pathname));
+ }
+ printf("%s\n", pathname);
+ zpool_close(pool);
+ libzfs_fini(g_zfs);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zgenhostid/Makefile.am b/sys/contrib/openzfs/cmd/zgenhostid/Makefile.am
new file mode 100644
index 000000000000..69c99ca9d828
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zgenhostid/Makefile.am
@@ -0,0 +1 @@
+dist_bin_SCRIPTS = zgenhostid
diff --git a/sys/contrib/openzfs/cmd/zgenhostid/zgenhostid b/sys/contrib/openzfs/cmd/zgenhostid/zgenhostid
new file mode 100755
index 000000000000..8b468740c72b
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zgenhostid/zgenhostid
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+# Emulate genhostid(1) available on RHEL/CENTOS, for use on distros
+# which do not provide that utility.
+#
+# Usage:
+# zgenhostid
+# zgenhostid <value>
+#
+# If /etc/hostid already exists and is non-empty, the script exits immediately
+# and changes nothing. Unlike genhostid, it prints an error message in that
+# case.
+#
+# The first form generates a random hostid and stores it in /etc/hostid.
+# The second form checks that the provided value is between 0x1 and 0xFFFFFFFF
+# and if so, stores it in /etc/hostid. This form is not supported by
+# genhostid(1).
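+#
+# Illustrative examples (values are arbitrary):
+#   zgenhostid             # store a random hostid in /etc/hostid
+#   zgenhostid deadbeef     # store the 8-digit hex value deadbeef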
+
+hostid_file=/etc/hostid
+
+function usage {
+ echo "$0 [value]"
+ echo "If $hostid_file is not present, store a hostid in it." >&2
+ echo "The optional value must be an 8-digit hex number between" >&2
+ echo "1 and 2^32-1. If no value is provided, a random one will" >&2
+ echo "be generated. The value must be unique among your systems." >&2
+}
+
+# hostid(1) ignores contents of /etc/hostid if size < 4 bytes. It would
+# be better if this checked size >= 4 bytes, but the method must be
+# widely portable.
+if [ -s $hostid_file ]; then
+ echo "$hostid_file already exists. No change made." >&2
+ exit 1
+fi
+
+if [ -n "$1" ]; then
+ host_id=$1
+else
+ # $RANDOM goes from 0..32k-1
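+	# Compose a 32-bit value from three draws: 2 bits + 15 bits + 15 bits.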
+ number=$((((RANDOM % 4) * 32768 + RANDOM) * 32768 + RANDOM))
+ host_id=$(printf "%08x" $number)
+fi
+
+if egrep -o '^0{8}$' <<< $host_id >/dev/null 2>&1; then
+ usage
+ exit 2
+fi
+
+if ! egrep -o '^[a-fA-F0-9]{8}$' <<< $host_id >/dev/null 2>&1; then
+ usage
+ exit 3
+fi
+
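+# Write the value least-significant byte first (little-endian); this is an
+# assumption about how hostid(1) reads /etc/hostid on common platforms.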
+a=${host_id:6:2}
+b=${host_id:4:2}
+c=${host_id:2:2}
+d=${host_id:0:2}
+
+echo -ne \\x$a\\x$b\\x$c\\x$d > $hostid_file
+
+exit 0
diff --git a/sys/contrib/openzfs/cmd/zhack/.gitignore b/sys/contrib/openzfs/cmd/zhack/.gitignore
new file mode 100644
index 000000000000..763a18898b88
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zhack/.gitignore
@@ -0,0 +1 @@
+/zhack
diff --git a/sys/contrib/openzfs/cmd/zhack/Makefile.am b/sys/contrib/openzfs/cmd/zhack/Makefile.am
new file mode 100644
index 000000000000..5cddac32b5ac
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zhack/Makefile.am
@@ -0,0 +1,14 @@
+include $(top_srcdir)/config/Rules.am
+
+# Unconditionally enable debugging for zhack
+AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG
+
+sbin_PROGRAMS = zhack
+
+zhack_SOURCES = \
+ zhack.c
+
+zhack_LDADD = \
+ $(abs_top_builddir)/lib/libzpool/libzpool.la \
+ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+ $(abs_top_builddir)/lib/libnvpair/libnvpair.la
diff --git a/sys/contrib/openzfs/cmd/zhack/zhack.c b/sys/contrib/openzfs/cmd/zhack/zhack.c
new file mode 100644
index 000000000000..4d958fe4365a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zhack/zhack.c
@@ -0,0 +1,532 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ */
+
+/*
+ * zhack is a debugging tool that can write changes to a ZFS pool using
+ * libzpool for testing purposes. Altering pools with zhack is unsupported and
+ * may result in corrupted pools.
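+ *
+ * For example (illustrative), "zhack feature stat tank" dumps the feature
+ * ZAPs and label features of a pool named "tank".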
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#include <sys/dsl_synctask.h>
+#include <sys/vdev.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/zfeature.h>
+#include <sys/dmu_tx.h>
+#include <libzutil.h>
+
+extern boolean_t zfeature_checks_disable;
+
+const char cmdname[] = "zhack";
+static importargs_t g_importargs;
+static char *g_pool;
+static boolean_t g_readonly;
+
+static void
+usage(void)
+{
+ (void) fprintf(stderr,
+ "Usage: %s [-c cachefile] [-d dir] <subcommand> <args> ...\n"
+ "where <subcommand> <args> is one of the following:\n"
+ "\n", cmdname);
+
+ (void) fprintf(stderr,
+ " feature stat <pool>\n"
+ " print information about enabled features\n"
+ " feature enable [-r] [-d desc] <pool> <feature>\n"
+ " add a new enabled feature to the pool\n"
+ " -d <desc> sets the feature's description\n"
+ " -r set read-only compatible flag for feature\n"
+ " feature ref [-md] <pool> <feature>\n"
+ " change the refcount on the given feature\n"
+ " -d decrease instead of increase the refcount\n"
+ " -m add the feature to the label if increasing refcount\n"
+ "\n"
+ " <feature> : should be a feature guid\n");
+ exit(1);
+}
+
+
+static void
+fatal(spa_t *spa, void *tag, const char *fmt, ...)
+{
+ va_list ap;
+
+ if (spa != NULL) {
+ spa_close(spa, tag);
+ (void) spa_export(g_pool, NULL, B_TRUE, B_FALSE);
+ }
+
+ va_start(ap, fmt);
+ (void) fprintf(stderr, "%s: ", cmdname);
+ (void) vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ (void) fprintf(stderr, "\n");
+
+ exit(1);
+}
+
+/* ARGSUSED */
+static int
+space_delta_cb(dmu_object_type_t bonustype, const void *data,
+ zfs_file_info_t *zoi)
+{
+ /*
+ * Is it a valid type of object to track?
+ */
+ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
+ return (ENOENT);
+ (void) fprintf(stderr, "modifying object that needs user accounting");
+ abort();
+ /* NOTREACHED */
+}
+
+/*
+ * Target is the dataset whose pool we want to open.
+ */
+static void
+zhack_import(char *target, boolean_t readonly)
+{
+ nvlist_t *config;
+ nvlist_t *props;
+ int error;
+
+ kernel_init(readonly ? SPA_MODE_READ :
+ (SPA_MODE_READ | SPA_MODE_WRITE));
+
+ dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb);
+
+ g_readonly = readonly;
+ g_importargs.can_be_active = readonly;
+ g_pool = strdup(target);
+
+ error = zpool_find_config(NULL, target, &config, &g_importargs,
+ &libzpool_config_ops);
+ if (error)
+ fatal(NULL, FTAG, "cannot import '%s'", target);
+
+ props = NULL;
+ if (readonly) {
+ VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0);
+ }
+
+ zfeature_checks_disable = B_TRUE;
+ error = spa_import(target, config, props,
+ (readonly ? ZFS_IMPORT_SKIP_MMP : ZFS_IMPORT_NORMAL));
+ zfeature_checks_disable = B_FALSE;
+ if (error == EEXIST)
+ error = 0;
+
+ if (error)
+ fatal(NULL, FTAG, "can't import '%s': %s", target,
+ strerror(error));
+}
+
+static void
+zhack_spa_open(char *target, boolean_t readonly, void *tag, spa_t **spa)
+{
+ int err;
+
+ zhack_import(target, readonly);
+
+ zfeature_checks_disable = B_TRUE;
+ err = spa_open(target, spa, tag);
+ zfeature_checks_disable = B_FALSE;
+
+ if (err != 0)
+ fatal(*spa, FTAG, "cannot open '%s': %s", target,
+ strerror(err));
+ if (spa_version(*spa) < SPA_VERSION_FEATURES) {
+ fatal(*spa, FTAG, "'%s' has version %d, features not enabled",
+ target, (int)spa_version(*spa));
+ }
+}
+
+static void
+dump_obj(objset_t *os, uint64_t obj, const char *name)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ (void) printf("%s_obj:\n", name);
+
+ for (zap_cursor_init(&zc, os, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ if (za.za_integer_length == 8) {
+ ASSERT(za.za_num_integers == 1);
+ (void) printf("\t%s = %llu\n",
+ za.za_name, (u_longlong_t)za.za_first_integer);
+ } else {
+ ASSERT(za.za_integer_length == 1);
+ char val[1024];
+ VERIFY(zap_lookup(os, obj, za.za_name,
+ 1, sizeof (val), val) == 0);
+ (void) printf("\t%s = %s\n", za.za_name, val);
+ }
+ }
+ zap_cursor_fini(&zc);
+}
+
+static void
+dump_mos(spa_t *spa)
+{
+ nvlist_t *nv = spa->spa_label_features;
+ nvpair_t *pair;
+
+ (void) printf("label config:\n");
+ for (pair = nvlist_next_nvpair(nv, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(nv, pair)) {
+ (void) printf("\t%s\n", nvpair_name(pair));
+ }
+}
+
+static void
+zhack_do_feature_stat(int argc, char **argv)
+{
+ spa_t *spa;
+ objset_t *os;
+ char *target;
+
+ argc--;
+ argv++;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, "error: missing pool name\n");
+ usage();
+ }
+ target = argv[0];
+
+ zhack_spa_open(target, B_TRUE, FTAG, &spa);
+ os = spa->spa_meta_objset;
+
+ dump_obj(os, spa->spa_feat_for_read_obj, "for_read");
+ dump_obj(os, spa->spa_feat_for_write_obj, "for_write");
+ dump_obj(os, spa->spa_feat_desc_obj, "descriptions");
+ if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
+ dump_obj(os, spa->spa_feat_enabled_txg_obj, "enabled_txg");
+ }
+ dump_mos(spa);
+
+ spa_close(spa, FTAG);
+}
+
+static void
+zhack_feature_enable_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ zfeature_info_t *feature = arg;
+
+ feature_enable_sync(spa, feature, tx);
+
+ spa_history_log_internal(spa, "zhack enable feature", tx,
+ "name=%s flags=%u",
+ feature->fi_guid, feature->fi_flags);
+}
+
+static void
+zhack_do_feature_enable(int argc, char **argv)
+{
+ int c;
+ char *desc, *target;
+ spa_t *spa;
+ objset_t *mos;
+ zfeature_info_t feature;
+ spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
+
+ /*
+ * Features are not added to the pool's label until their refcounts
+ * are incremented, so fi_mos can just be left as false for now.
+ */
+ desc = NULL;
+ feature.fi_uname = "zhack";
+ feature.fi_flags = 0;
+ feature.fi_depends = nodeps;
+ feature.fi_feature = SPA_FEATURE_NONE;
+
+ optind = 1;
+ while ((c = getopt(argc, argv, "+rd:")) != -1) {
+ switch (c) {
+ case 'r':
+ feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT;
+ break;
+ case 'd':
+ desc = strdup(optarg);
+ break;
+ default:
+ usage();
+ break;
+ }
+ }
+
+ if (desc == NULL)
+ desc = strdup("zhack injected");
+ feature.fi_desc = desc;
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 2) {
+ (void) fprintf(stderr, "error: missing feature or pool name\n");
+ usage();
+ }
+ target = argv[0];
+ feature.fi_guid = argv[1];
+
+ if (!zfeature_is_valid_guid(feature.fi_guid))
+ fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid);
+
+ zhack_spa_open(target, B_FALSE, FTAG, &spa);
+ mos = spa->spa_meta_objset;
+
+ if (zfeature_is_supported(feature.fi_guid))
+		fatal(spa, FTAG, "'%s' is a real feature, will not enable",
+		    feature.fi_guid);
+ if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid))
+ fatal(spa, FTAG, "feature already enabled: %s",
+ feature.fi_guid);
+
+ VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+ zhack_feature_enable_sync, &feature, 5, ZFS_SPACE_CHECK_NORMAL));
+
+ spa_close(spa, FTAG);
+
+ free(desc);
+}
+
+static void
+feature_incr_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ zfeature_info_t *feature = arg;
+ uint64_t refcount;
+
+ VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));
+ feature_sync(spa, feature, refcount + 1, tx);
+ spa_history_log_internal(spa, "zhack feature incr", tx,
+ "name=%s", feature->fi_guid);
+}
+
+static void
+feature_decr_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ zfeature_info_t *feature = arg;
+ uint64_t refcount;
+
+ VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));
+ feature_sync(spa, feature, refcount - 1, tx);
+ spa_history_log_internal(spa, "zhack feature decr", tx,
+ "name=%s", feature->fi_guid);
+}
+
+static void
+zhack_do_feature_ref(int argc, char **argv)
+{
+ int c;
+ char *target;
+ boolean_t decr = B_FALSE;
+ spa_t *spa;
+ objset_t *mos;
+ zfeature_info_t feature;
+ spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
+
+ /*
+ * fi_desc does not matter here because it was written to disk
+ * when the feature was enabled, but we need to properly set the
+ * feature for read or write based on the information we read off
+ * disk later.
+ */
+ feature.fi_uname = "zhack";
+ feature.fi_flags = 0;
+ feature.fi_desc = NULL;
+ feature.fi_depends = nodeps;
+ feature.fi_feature = SPA_FEATURE_NONE;
+
+ optind = 1;
+ while ((c = getopt(argc, argv, "+md")) != -1) {
+ switch (c) {
+ case 'm':
+ feature.fi_flags |= ZFEATURE_FLAG_MOS;
+ break;
+ case 'd':
+ decr = B_TRUE;
+ break;
+ default:
+ usage();
+ break;
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 2) {
+ (void) fprintf(stderr, "error: missing feature or pool name\n");
+ usage();
+ }
+ target = argv[0];
+ feature.fi_guid = argv[1];
+
+ if (!zfeature_is_valid_guid(feature.fi_guid))
+ fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid);
+
+ zhack_spa_open(target, B_FALSE, FTAG, &spa);
+ mos = spa->spa_meta_objset;
+
+ if (zfeature_is_supported(feature.fi_guid)) {
+ fatal(spa, FTAG,
+		    "'%s' is a real feature, will not change refcount",
+		    feature.fi_guid);
+ }
+
+ if (0 == zap_contains(mos, spa->spa_feat_for_read_obj,
+ feature.fi_guid)) {
+ feature.fi_flags &= ~ZFEATURE_FLAG_READONLY_COMPAT;
+ } else if (0 == zap_contains(mos, spa->spa_feat_for_write_obj,
+ feature.fi_guid)) {
+ feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT;
+ } else {
+ fatal(spa, FTAG, "feature is not enabled: %s", feature.fi_guid);
+ }
+
+ if (decr) {
+ uint64_t count;
+ if (feature_get_refcount_from_disk(spa, &feature,
+ &count) == 0 && count == 0) {
+ fatal(spa, FTAG, "feature refcount already 0: %s",
+ feature.fi_guid);
+ }
+ }
+
+ VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+ decr ? feature_decr_sync : feature_incr_sync, &feature,
+ 5, ZFS_SPACE_CHECK_NORMAL));
+
+ spa_close(spa, FTAG);
+}
+
+static int
+zhack_do_feature(int argc, char **argv)
+{
+ char *subcommand;
+
+ argc--;
+ argv++;
+ if (argc == 0) {
+ (void) fprintf(stderr,
+ "error: no feature operation specified\n");
+ usage();
+ }
+
+ subcommand = argv[0];
+ if (strcmp(subcommand, "stat") == 0) {
+ zhack_do_feature_stat(argc, argv);
+ } else if (strcmp(subcommand, "enable") == 0) {
+ zhack_do_feature_enable(argc, argv);
+ } else if (strcmp(subcommand, "ref") == 0) {
+ zhack_do_feature_ref(argc, argv);
+ } else {
+ (void) fprintf(stderr, "error: unknown subcommand: %s\n",
+ subcommand);
+ usage();
+ }
+
+ return (0);
+}
+
+#define MAX_NUM_PATHS 1024
+
+int
+main(int argc, char **argv)
+{
+ extern void zfs_prop_init(void);
+
+ char *path[MAX_NUM_PATHS];
+ const char *subcommand;
+ int rv = 0;
+ int c;
+
+ g_importargs.path = path;
+
+ dprintf_setup(&argc, argv);
+ zfs_prop_init();
+
+ while ((c = getopt(argc, argv, "+c:d:")) != -1) {
+ switch (c) {
+ case 'c':
+ g_importargs.cachefile = optarg;
+ break;
+ case 'd':
+ assert(g_importargs.paths < MAX_NUM_PATHS);
+ g_importargs.path[g_importargs.paths++] = optarg;
+ break;
+ default:
+ usage();
+ break;
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+ optind = 1;
+
+ if (argc == 0) {
+ (void) fprintf(stderr, "error: no command specified\n");
+ usage();
+ }
+
+ subcommand = argv[0];
+
+ if (strcmp(subcommand, "feature") == 0) {
+ rv = zhack_do_feature(argc, argv);
+ } else {
+ (void) fprintf(stderr, "error: unknown subcommand: %s\n",
+ subcommand);
+ usage();
+ }
+
+ if (!g_readonly && spa_export(g_pool, NULL, B_TRUE, B_FALSE) != 0) {
+ fatal(NULL, FTAG, "pool export failed; "
+ "changes may not be committed to disk\n");
+ }
+
+ kernel_fini();
+
+ return (rv);
+}
diff --git a/sys/contrib/openzfs/cmd/zinject/.gitignore b/sys/contrib/openzfs/cmd/zinject/.gitignore
new file mode 100644
index 000000000000..bded8400996c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zinject/.gitignore
@@ -0,0 +1 @@
+/zinject
diff --git a/sys/contrib/openzfs/cmd/zinject/Makefile.am b/sys/contrib/openzfs/cmd/zinject/Makefile.am
new file mode 100644
index 000000000000..091d92cd6026
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zinject/Makefile.am
@@ -0,0 +1,13 @@
+include $(top_srcdir)/config/Rules.am
+
+sbin_PROGRAMS = zinject
+
+zinject_SOURCES = \
+ translate.c \
+ zinject.c \
+ zinject.h
+
+zinject_LDADD = \
+ $(abs_top_builddir)/lib/libzfs/libzfs.la \
+ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+ $(abs_top_builddir)/lib/libnvpair/libnvpair.la
diff --git a/sys/contrib/openzfs/cmd/zinject/translate.c b/sys/contrib/openzfs/cmd/zinject/translate.c
new file mode 100644
index 000000000000..4939c0b85b5f
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zinject/translate.c
@@ -0,0 +1,397 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ */
+
+#include <libzfs.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/file.h>
+#include <sys/mntent.h>
+#include <sys/mnttab.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dnode.h>
+#include <sys/vdev_impl.h>
+
+#include <sys/mkdev.h>
+
+#include "zinject.h"
+
+static int debug;
+
+static void
+ziprintf(const char *fmt, ...)
+{
+ va_list ap;
+
+ if (!debug)
+ return;
+
+ va_start(ap, fmt);
+ (void) vprintf(fmt, ap);
+ va_end(ap);
+}
+
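+/*
+ * Copy 'src' into 'dest', collapsing any run of consecutive '/' characters
+ * into a single '/'. 'dest' must be at least as large as 'src'.
+ */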
+static void
+compress_slashes(const char *src, char *dest)
+{
+ while (*src != '\0') {
+ *dest = *src++;
+ while (*dest == '/' && *src == '/')
+ ++src;
+ ++dest;
+ }
+ *dest = '\0';
+}
+
+/*
+ * Given a full path to a file, translate into a dataset name and a relative
+ * path within the dataset. 'dataset' must be at least MAXNAMELEN characters,
+ * and 'relpath' must be at least MAXPATHLEN characters. We also pass a stat64
+ * buffer, which we need later to get the object ID.
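+ *
+ * For example (illustrative), if dataset "tank/fs" is mounted on /tank/fs,
+ * then "/tank/fs/dir/file" yields dataset "tank/fs" and relpath "dir/file".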
+ */
+static int
+parse_pathname(const char *inpath, char *dataset, char *relpath,
+ struct stat64 *statbuf)
+{
+ struct extmnttab mp;
+ const char *rel;
+ char fullpath[MAXPATHLEN];
+
+ compress_slashes(inpath, fullpath);
+
+ if (fullpath[0] != '/') {
+ (void) fprintf(stderr, "invalid object '%s': must be full "
+ "path\n", fullpath);
+ usage();
+ return (-1);
+ }
+
+ if (getextmntent(fullpath, &mp, statbuf) != 0) {
+ (void) fprintf(stderr, "cannot find mountpoint for '%s'\n",
+ fullpath);
+ return (-1);
+ }
+
+ if (strcmp(mp.mnt_fstype, MNTTYPE_ZFS) != 0) {
+ (void) fprintf(stderr, "invalid path '%s': not a ZFS "
+ "filesystem\n", fullpath);
+ return (-1);
+ }
+
+ if (strncmp(fullpath, mp.mnt_mountp, strlen(mp.mnt_mountp)) != 0) {
+ (void) fprintf(stderr, "invalid path '%s': mountpoint "
+ "doesn't match path\n", fullpath);
+ return (-1);
+ }
+
+ (void) strcpy(dataset, mp.mnt_special);
+
+ rel = fullpath + strlen(mp.mnt_mountp);
+ if (rel[0] == '/')
+ rel++;
+ (void) strcpy(relpath, rel);
+
+ return (0);
+}
+
+/*
+ * Convert from a dataset name to an objset id. Note that
+ * we grab the object number from the inode number.
+ */
+static int
+object_from_path(const char *dataset, uint64_t object, zinject_record_t *record)
+{
+ zfs_handle_t *zhp;
+
+ if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL)
+ return (-1);
+
+ record->zi_objset = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
+ record->zi_object = object;
+
+ zfs_close(zhp);
+
+ return (0);
+}
+
+/*
+ * Initialize the range based on the type, level, and range given.
+ */
+static int
+initialize_range(err_type_t type, int level, char *range,
+ zinject_record_t *record)
+{
+ /*
+ * Determine the numeric range from the string.
+ */
+ if (range == NULL) {
+ /*
+ * If range is unspecified, set the range to [0,-1], which
+ * indicates that the whole object should be treated as an
+ * error.
+ */
+ record->zi_start = 0;
+ record->zi_end = -1ULL;
+ } else {
+ char *end;
+
+ /* XXX add support for suffixes */
+ record->zi_start = strtoull(range, &end, 10);
+
+ if (*end == '\0')
+ record->zi_end = record->zi_start + 1;
+ else if (*end == ',')
+ record->zi_end = strtoull(end + 1, &end, 10);
+
+ if (*end != '\0') {
+ (void) fprintf(stderr, "invalid range '%s': must be "
+ "a numeric range of the form 'start[,end]'\n",
+ range);
+ return (-1);
+ }
+ }
+
+ switch (type) {
+ default:
+ break;
+ case TYPE_DATA:
+ break;
+
+ case TYPE_DNODE:
+ /*
+ * If this is a request to inject faults into the dnode, then we
+ * must translate the current (objset,object) pair into an
+ * offset within the metadnode for the objset. Specifying any
+ * kind of range with type 'dnode' is illegal.
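+		 *
+		 * For example, object 5 maps to the byte range
+		 * [5 * sizeof (dnode_phys_t), 6 * sizeof (dnode_phys_t))
+		 * within object 0 of the objset (the metadnode).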
+ */
+ if (range != NULL) {
+ (void) fprintf(stderr, "range cannot be specified when "
+ "type is 'dnode'\n");
+ return (-1);
+ }
+
+ record->zi_start = record->zi_object * sizeof (dnode_phys_t);
+ record->zi_end = record->zi_start + sizeof (dnode_phys_t);
+ record->zi_object = 0;
+ break;
+ }
+
+ record->zi_level = level;
+
+ return (0);
+}
+
+int
+translate_record(err_type_t type, const char *object, const char *range,
+ int level, zinject_record_t *record, char *poolname, char *dataset)
+{
+ char path[MAXPATHLEN];
+ char *slash;
+ struct stat64 statbuf;
+ int ret = -1;
+
+ debug = (getenv("ZINJECT_DEBUG") != NULL);
+
+ ziprintf("translating: %s\n", object);
+
+ if (MOS_TYPE(type)) {
+ /*
+ * MOS objects are treated specially.
+ */
+ switch (type) {
+ default:
+ break;
+ case TYPE_MOS:
+ record->zi_type = 0;
+ break;
+ case TYPE_MOSDIR:
+ record->zi_type = DMU_OT_OBJECT_DIRECTORY;
+ break;
+ case TYPE_METASLAB:
+ record->zi_type = DMU_OT_OBJECT_ARRAY;
+ break;
+ case TYPE_CONFIG:
+ record->zi_type = DMU_OT_PACKED_NVLIST;
+ break;
+ case TYPE_BPOBJ:
+ record->zi_type = DMU_OT_BPOBJ;
+ break;
+ case TYPE_SPACEMAP:
+ record->zi_type = DMU_OT_SPACE_MAP;
+ break;
+ case TYPE_ERRLOG:
+ record->zi_type = DMU_OT_ERROR_LOG;
+ break;
+ }
+
+ dataset[0] = '\0';
+ (void) strcpy(poolname, object);
+ return (0);
+ }
+
+ /*
+ * Convert a full path into a (dataset, file) pair.
+ */
+ if (parse_pathname(object, dataset, path, &statbuf) != 0)
+ goto err;
+
+ ziprintf(" dataset: %s\n", dataset);
+ ziprintf(" path: %s\n", path);
+
+ /*
+ * Convert (dataset, file) into (objset, object)
+ */
+ if (object_from_path(dataset, statbuf.st_ino, record) != 0)
+ goto err;
+
+ ziprintf("raw objset: %llu\n", record->zi_objset);
+ ziprintf("raw object: %llu\n", record->zi_object);
+
+ /*
+ * For the given object, initialize the range in bytes
+ */
+ if (initialize_range(type, level, (char *)range, record) != 0)
+ goto err;
+
+ ziprintf(" objset: %llu\n", record->zi_objset);
+ ziprintf(" object: %llu\n", record->zi_object);
+ if (record->zi_start == 0 &&
+ record->zi_end == -1ULL)
+ ziprintf(" range: all\n");
+ else
+ ziprintf(" range: [%llu, %llu]\n", record->zi_start,
+ record->zi_end);
+
+ /*
+ * Copy the pool name
+ */
+ (void) strcpy(poolname, dataset);
+ if ((slash = strchr(poolname, '/')) != NULL)
+ *slash = '\0';
+
+ ret = 0;
+
+err:
+ return (ret);
+}
+
+int
+translate_raw(const char *str, zinject_record_t *record)
+{
+ /*
+ * A raw bookmark of the form objset:object:level:blkid, where each
+ * number is a hexadecimal value.
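+	 *
+	 * For example (illustrative), "0x36:0x2:0:0x0" selects objset 0x36,
+	 * object 0x2, level 0, blkid 0x0.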
+ */
+ if (sscanf(str, "%llx:%llx:%x:%llx", (u_longlong_t *)&record->zi_objset,
+ (u_longlong_t *)&record->zi_object, &record->zi_level,
+ (u_longlong_t *)&record->zi_start) != 4) {
+ (void) fprintf(stderr, "bad raw spec '%s': must be of the form "
+ "'objset:object:level:blkid'\n", str);
+ return (-1);
+ }
+
+ record->zi_end = record->zi_start;
+
+ return (0);
+}
+
+int
+translate_device(const char *pool, const char *device, err_type_t label_type,
+ zinject_record_t *record)
+{
+ char *end;
+ zpool_handle_t *zhp;
+ nvlist_t *tgt;
+ boolean_t isspare, iscache;
+
+ /*
+ * Given a device name or GUID, create an appropriate injection record
+ * with zi_guid set.
+ */
+ if ((zhp = zpool_open(g_zfs, pool)) == NULL)
+ return (-1);
+
+ record->zi_guid = strtoull(device, &end, 0);
+ if (record->zi_guid == 0 || *end != '\0') {
+ tgt = zpool_find_vdev(zhp, device, &isspare, &iscache, NULL);
+
+ if (tgt == NULL) {
+ (void) fprintf(stderr, "cannot find device '%s' in "
+ "pool '%s'\n", device, pool);
+ zpool_close(zhp);
+ return (-1);
+ }
+
+ verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID,
+ &record->zi_guid) == 0);
+ }
+
+ /*
+ * Device faults can take on three different forms:
+ * 1). delayed or hanging I/O
+ * 2). zfs label faults
+ * 3). generic disk faults
+ */
+ if (record->zi_timer != 0) {
+ record->zi_cmd = ZINJECT_DELAY_IO;
+ } else if (label_type != TYPE_INVAL) {
+ record->zi_cmd = ZINJECT_LABEL_FAULT;
+ } else {
+ record->zi_cmd = ZINJECT_DEVICE_FAULT;
+ }
+
+ switch (label_type) {
+ default:
+ break;
+ case TYPE_LABEL_UBERBLOCK:
+ record->zi_start = offsetof(vdev_label_t, vl_uberblock[0]);
+ record->zi_end = record->zi_start + VDEV_UBERBLOCK_RING - 1;
+ break;
+ case TYPE_LABEL_NVLIST:
+ record->zi_start = offsetof(vdev_label_t, vl_vdev_phys);
+ record->zi_end = record->zi_start + VDEV_PHYS_SIZE - 1;
+ break;
+ case TYPE_LABEL_PAD1:
+ record->zi_start = offsetof(vdev_label_t, vl_pad1);
+ record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
+ break;
+ case TYPE_LABEL_PAD2:
+ record->zi_start = offsetof(vdev_label_t, vl_be);
+ record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1;
+ break;
+ }
+ zpool_close(zhp);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zinject/zinject.c b/sys/contrib/openzfs/cmd/zinject/zinject.c
new file mode 100644
index 000000000000..bf97b0d68713
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zinject/zinject.c
@@ -0,0 +1,1287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+/*
+ * ZFS Fault Injector
+ *
+ * This userland component takes a set of options and uses libzpool to translate
+ * from a user-visible object type and name to an internal representation.
+ * There are two basic types of faults: device faults and data faults.
+ *
+ *
+ * DEVICE FAULTS
+ *
+ * Errors can be injected into a particular vdev using the '-d' option. This
+ * option takes a path or vdev GUID to uniquely identify the device within a
+ * pool. There are four types of errors that can be injected, IO, ENXIO,
+ * ECHILD, and EILSEQ. These can be controlled through the '-e' option and the
+ * default is ENXIO. For EIO failures, any attempt to read data from the device
+ * will return EIO, but a subsequent attempt to reopen the device will succeed.
+ * For ENXIO failures, any attempt to read from the device will return EIO, but
+ * any attempt to reopen the device will also return ENXIO. The EILSEQ failures
+ * only apply to read operations (-T read) and will flip a bit after the device
+ * has read the original data.
+ *
+ * For label faults, the -L option must be specified. This allows faults
+ * to be injected into either the nvlist, uberblock, pad1, or pad2 region
+ * of all the labels for the specified device.
+ *
+ * This form of the command looks like:
+ *
+ * zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
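+ *
+ * For example (illustrative), "zinject -d sda -e io -T read tank" makes
+ * reads from vdev 'sda' in pool 'tank' return EIO.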
+ *
+ *
+ * DATA FAULTS
+ *
+ * We begin with a tuple of the form:
+ *
+ * <type,level,range,object>
+ *
+ * type A string describing the type of data to target. Each type
+ * implicitly describes how to interpret 'object'. Currently,
+ * the following values are supported:
+ *
+ * data User data for a file
+ * dnode Dnode for a file or directory
+ *
+ * The following MOS objects are special. Instead of injecting
+ * errors on a particular object or blkid, we inject errors across
+ * all objects of the given type.
+ *
+ * mos Any data in the MOS
+ * mosdir object directory
+ * config pool configuration
+ * bpobj blkptr list
+ * spacemap spacemap
+ * metaslab metaslab
+ * errlog persistent error log
+ *
+ * level Object level. Defaults to '0', not applicable to all types. If
+ * a range is given, this corresponds to the indirect block
+ * corresponding to the specific range.
+ *
+ * range A numerical range [start,end) within the object. Defaults to
+ * the full size of the file.
+ *
+ * object A string describing the logical location of the object. For
+ * files and directories (currently the only supported types),
+ * this is the path of the object on disk.
+ *
+ * This is translated, via libzpool, into the following internal representation:
+ *
+ * <type,objset,object,level,range>
+ *
+ * These types should be self-explanatory. This tuple is then passed to the
+ * kernel via a special ioctl() to initiate fault injection for the given
+ * object. Note that 'type' is not strictly necessary for fault injection, but
+ * is used when translating existing faults into a human-readable string.
+ *
+ *
+ * The command itself takes one of the forms:
+ *
+ * zinject
+ * zinject <-a | -u pool>
+ * zinject -c <id|all>
+ * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
+ * [-r range] <object>
+ * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
+ *
+ * With no arguments, the command prints all currently registered injection
+ * handlers, with their numeric identifiers.
+ *
+ * The '-c' option will clear the given handler, or all handlers if 'all' is
+ * specified.
+ *
+ * The '-e' option takes a string describing the errno to simulate. This must
+ * be one of 'io', 'checksum', 'decompress', or 'decrypt'. In most cases this
+ * will result in the same behavior, but RAID-Z will produce a different set of
+ * ereports for this situation.
+ *
+ * The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is
+ * specified, then the ARC cache is flushed appropriately. If '-u' is
+ * specified, then the underlying SPA is unloaded. Either of these flags can be
+ * specified independently of any other handlers. The '-m' flag automatically
+ * does an unmount and remount of the underlying dataset to aid in flushing the
+ * cache.
+ *
+ * The '-f' flag controls the frequency of errors injected, expressed as a
+ * real number percentage between 0.0001 and 100. The default is 100.
+ *
+ * This form is responsible for actually injecting the handler into the
+ * framework. It takes the arguments described above, translates them to the
+ * internal tuple using libzpool, and then issues an ioctl() to register the
+ * handler.
+ *
+ * The final form can target a specific bookmark, regardless of whether a
+ * human-readable interface has been designed. It allows developers to specify
+ * a particular block by number.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <sys/fs/zfs.h>
+#include <sys/mount.h>
+
+#include <libzfs.h>
+
+#undef verify /* both libzfs.h and zfs_context.h want to define this */
+
+#include "zinject.h"
+
+libzfs_handle_t *g_zfs;
+int zfs_fd;
+
+static const char *errtable[TYPE_INVAL] = {
+ "data",
+ "dnode",
+ "mos",
+ "mosdir",
+ "metaslab",
+ "config",
+ "bpobj",
+ "spacemap",
+ "errlog",
+ "uber",
+ "nvlist",
+ "pad1",
+ "pad2"
+};
+
+static err_type_t
+name_to_type(const char *arg)
+{
+ int i;
+ for (i = 0; i < TYPE_INVAL; i++)
+ if (strcmp(errtable[i], arg) == 0)
+ return (i);
+
+ return (TYPE_INVAL);
+}
+
+static const char *
+type_to_name(uint64_t type)
+{
+ switch (type) {
+ case DMU_OT_OBJECT_DIRECTORY:
+ return ("mosdir");
+ case DMU_OT_OBJECT_ARRAY:
+ return ("metaslab");
+ case DMU_OT_PACKED_NVLIST:
+ return ("config");
+ case DMU_OT_BPOBJ:
+ return ("bpobj");
+ case DMU_OT_SPACE_MAP:
+ return ("spacemap");
+ case DMU_OT_ERROR_LOG:
+ return ("errlog");
+ default:
+ return ("-");
+ }
+}
+
+
+/*
+ * Print usage message.
+ */
+void
+usage(void)
+{
+ (void) printf(
+ "usage:\n"
+ "\n"
+ "\tzinject\n"
+ "\n"
+ "\t\tList all active injection records.\n"
+ "\n"
+ "\tzinject -c <id|all>\n"
+ "\n"
+ "\t\tClear the particular record (if given a numeric ID), or\n"
+ "\t\tall records if 'all' is specified.\n"
+ "\n"
+ "\tzinject -p <function name> pool\n"
+ "\t\tInject a panic fault at the specified function. Only \n"
+ "\t\tfunctions which call spa_vdev_config_exit(), or \n"
+ "\t\tspa_vdev_exit() will trigger a panic.\n"
+ "\n"
+ "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
+ "\t\t[-T <read|write|free|claim|all>] [-f frequency] pool\n\n"
+ "\t\tInject a fault into a particular device or the device's\n"
+	    "\t\tlabel. Label injection can either be 'nvlist', 'uber',\n"
+ "\t\t'pad1', or 'pad2'.\n"
+ "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl', or\n"
+ "\t\t'corrupt' (bit flip).\n"
+ "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n"
+ "\t\tdevice error injection to a percentage of the IOs.\n"
+ "\n"
+ "\tzinject -d device -A <degrade|fault> -D <delay secs> pool\n"
+ "\t\tPerform a specific action on a particular device.\n"
+ "\n"
+ "\tzinject -d device -D latency:lanes pool\n"
+ "\n"
+ "\t\tAdd an artificial delay to IO requests on a particular\n"
+ "\t\tdevice, such that the requests take a minimum of 'latency'\n"
+ "\t\tmilliseconds to complete. Each delay has an associated\n"
+ "\t\tnumber of 'lanes' which defines the number of concurrent\n"
+ "\t\tIO requests that can be processed.\n"
+ "\n"
+ "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"
+ "\t\tthe device will only be able to service a single IO request\n"
+ "\t\tat a time with each request taking 10 ms to complete. So,\n"
+ "\t\tif only a single request is submitted every 10 ms, the\n"
+ "\t\taverage latency will be 10 ms; but if more than one request\n"
+ "\t\tis submitted every 10 ms, the average latency will be more\n"
+ "\t\tthan 10 ms.\n"
+ "\n"
+ "\t\tSimilarly, if a delay of 10 ms is specified to have two\n"
+ "\t\tlanes (-D 10:2), then the device will be able to service\n"
+ "\t\ttwo requests at a time, each with a minimum latency of\n"
+ "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"
+ "\t\tthe average latency will be 10 ms; but if more than two\n"
+ "\t\trequests are submitted every 10 ms, the average latency\n"
+ "\t\twill be more than 10 ms.\n"
+ "\n"
+ "\t\tAlso note, these delays are additive. So two invocations\n"
+ "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"
+ "\t\tof '-D 10:2'. This also means, one can specify multiple\n"
+ "\t\tlanes with differing target latencies. For example, an\n"
+ "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"
+ "\t\tcreate 3 lanes on the device; one lane with a latency\n"
+ "\t\tof 10 ms and two lanes with a 25 ms latency.\n"
+ "\n"
+ "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
+ "\t\tCause the pool to stop writing blocks yet not\n"
+ "\t\treport errors for a duration. Simulates buggy hardware\n"
+ "\t\tthat fails to honor cache flush requests.\n"
+ "\t\tDefault duration is 30 seconds. The machine is panicked\n"
+ "\t\tat the end of the duration.\n"
+ "\n"
+ "\tzinject -b objset:object:level:blkid pool\n"
+ "\n"
+ "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
+ "\t\tspecified by the remaining tuple. Each number is in\n"
+ "\t\thexadecimal, and only one block can be specified.\n"
+ "\n"
+ "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"
+ "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"
+ "\n"
+ "\t\tInject an error into the object specified by the '-t' option\n"
+ "\t\tand the object descriptor. The 'object' parameter is\n"
+ "\t\tinterpreted depending on the '-t' option.\n"
+ "\n"
+ "\t\t-q\tQuiet mode. Only print out the handler number added.\n"
+ "\t\t-e\tInject a specific error. Must be one of 'io',\n"
+ "\t\t\t'checksum', 'decompress', or 'decrypt'. Default is 'io'.\n"
+ "\t\t-C\tInject the given error only into specific DVAs. The\n"
+ "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"
+ "\t\t\tseparated by commas (ex. '0,2').\n"
+ "\t\t-l\tInject error at a particular block level. Default is "
+ "0.\n"
+ "\t\t-m\tAutomatically remount underlying filesystem.\n"
+ "\t\t-r\tInject error over a particular logical range of an\n"
+ "\t\t\tobject. Will be translated to the appropriate blkid\n"
+ "\t\t\trange according to the object's properties.\n"
+ "\t\t-a\tFlush the ARC cache. Can be specified without any\n"
+ "\t\t\tassociated object.\n"
+ "\t\t-u\tUnload the associated pool. Can be specified with only\n"
+ "\t\t\ta pool object.\n"
+ "\t\t-f\tOnly inject errors a fraction of the time. Expressed as\n"
+ "\t\t\ta percentage between 0.0001 and 100.\n"
+ "\n"
+ "\t-t data\t\tInject an error into the plain file contents of a\n"
+ "\t\t\tfile. The object must be specified as a complete path\n"
+ "\t\t\tto a file on a ZFS filesystem.\n"
+ "\n"
+ "\t-t dnode\tInject an error into the metadnode in the block\n"
+ "\t\t\tcorresponding to the dnode for a file or directory. The\n"
+ "\t\t\t'-r' option is incompatible with this mode. The object\n"
+ "\t\t\tis specified as a complete path to a file or directory\n"
+ "\t\t\ton a ZFS filesystem.\n"
+ "\n"
+ "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
+ "\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n"
+ "\t\t\tspacemap, metaslab, errlog. The only valid <object> is\n"
+ "\t\t\tthe poolname.\n");
+}
+
+static int
+iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
+ void *data)
+{
+ zfs_cmd_t zc = {"\0"};
+ int ret;
+
+ while (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0)
+ if ((ret = func((int)zc.zc_guid, zc.zc_name,
+ &zc.zc_inject_record, data)) != 0)
+ return (ret);
+
+ if (errno != ENOENT) {
+ (void) fprintf(stderr, "Unable to list handlers: %s\n",
+ strerror(errno));
+ return (-1);
+ }
+
+ return (0);
+}
+
+static int
+print_data_handler(int id, const char *pool, zinject_record_t *record,
+ void *data)
+{
+ int *count = data;
+
+ if (record->zi_guid != 0 || record->zi_func[0] != '\0')
+ return (0);
+
+ if (*count == 0) {
+ (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s "
+ "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE",
+ "LVL", "DVAs", "RANGE");
+ (void) printf("--- --------------- ------ "
+ "------ -------- --- ---- ---------------\n");
+ }
+
+ *count += 1;
+
+ (void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x ",
+ id, pool, (u_longlong_t)record->zi_objset,
+ (u_longlong_t)record->zi_object, type_to_name(record->zi_type),
+ record->zi_level, record->zi_dvas);
+
+
+ if (record->zi_start == 0 &&
+ record->zi_end == -1ULL)
+ (void) printf("all\n");
+ else
+ (void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start,
+ (u_longlong_t)record->zi_end);
+
+ return (0);
+}
+
+static int
+print_device_handler(int id, const char *pool, zinject_record_t *record,
+ void *data)
+{
+ int *count = data;
+
+ if (record->zi_guid == 0 || record->zi_func[0] != '\0')
+ return (0);
+
+ if (record->zi_cmd == ZINJECT_DELAY_IO)
+ return (0);
+
+ if (*count == 0) {
+ (void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID");
+ (void) printf("--- --------------- ----------------\n");
+ }
+
+ *count += 1;
+
+ (void) printf("%3d %-15s %llx\n", id, pool,
+ (u_longlong_t)record->zi_guid);
+
+ return (0);
+}
+
+static int
+print_delay_handler(int id, const char *pool, zinject_record_t *record,
+ void *data)
+{
+ int *count = data;
+
+ if (record->zi_guid == 0 || record->zi_func[0] != '\0')
+ return (0);
+
+ if (record->zi_cmd != ZINJECT_DELAY_IO)
+ return (0);
+
+ if (*count == 0) {
+ (void) printf("%3s %-15s %-15s %-15s %s\n",
+ "ID", "POOL", "DELAY (ms)", "LANES", "GUID");
+ (void) printf("--- --------------- --------------- "
+ "--------------- ----------------\n");
+ }
+
+ *count += 1;
+
+ (void) printf("%3d %-15s %-15llu %-15llu %llx\n", id, pool,
+ (u_longlong_t)NSEC2MSEC(record->zi_timer),
+ (u_longlong_t)record->zi_nlanes,
+ (u_longlong_t)record->zi_guid);
+
+ return (0);
+}
+
+static int
+print_panic_handler(int id, const char *pool, zinject_record_t *record,
+ void *data)
+{
+ int *count = data;
+
+ if (record->zi_func[0] == '\0')
+ return (0);
+
+ if (*count == 0) {
+ (void) printf("%3s %-15s %s\n", "ID", "POOL", "FUNCTION");
+ (void) printf("--- --------------- ----------------\n");
+ }
+
+ *count += 1;
+
+ (void) printf("%3d %-15s %s\n", id, pool, record->zi_func);
+
+ return (0);
+}
+
+/*
+ * Print all registered error handlers. Returns the number of handlers
+ * registered.
+ */
+static int
+print_all_handlers(void)
+{
+ int count = 0, total = 0;
+
+ (void) iter_handlers(print_device_handler, &count);
+ if (count > 0) {
+ total += count;
+ (void) printf("\n");
+ count = 0;
+ }
+
+ (void) iter_handlers(print_delay_handler, &count);
+ if (count > 0) {
+ total += count;
+ (void) printf("\n");
+ count = 0;
+ }
+
+ (void) iter_handlers(print_data_handler, &count);
+ if (count > 0) {
+ total += count;
+ (void) printf("\n");
+ count = 0;
+ }
+
+ (void) iter_handlers(print_panic_handler, &count);
+
+ return (count + total);
+}
+
+/* ARGSUSED */
+static int
+cancel_one_handler(int id, const char *pool, zinject_record_t *record,
+ void *data)
+{
+ zfs_cmd_t zc = {"\0"};
+
+ zc.zc_guid = (uint64_t)id;
+
+ if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
+ (void) fprintf(stderr, "failed to remove handler %d: %s\n",
+ id, strerror(errno));
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * Remove all fault injection handlers.
+ */
+static int
+cancel_all_handlers(void)
+{
+ int ret = iter_handlers(cancel_one_handler, NULL);
+
+ if (ret == 0)
+ (void) printf("removed all registered handlers\n");
+
+ return (ret);
+}
+
+/*
+ * Remove a specific fault injection handler.
+ */
+static int
+cancel_handler(int id)
+{
+ zfs_cmd_t zc = {"\0"};
+
+ zc.zc_guid = (uint64_t)id;
+
+ if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
+ (void) fprintf(stderr, "failed to remove handler %d: %s\n",
+ id, strerror(errno));
+ return (1);
+ }
+
+ (void) printf("removed handler %d\n", id);
+
+ return (0);
+}
+
+/*
+ * Register a new fault injection handler.
+ */
+static int
+register_handler(const char *pool, int flags, zinject_record_t *record,
+ int quiet)
+{
+ zfs_cmd_t zc = {"\0"};
+
+ (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
+ zc.zc_inject_record = *record;
+ zc.zc_guid = flags;
+
+ if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
+ (void) fprintf(stderr, "failed to add handler: %s\n",
+ errno == EDOM ? "block level exceeds max level of object" :
+ strerror(errno));
+ return (1);
+ }
+
+ if (flags & ZINJECT_NULL)
+ return (0);
+
+ if (quiet) {
+ (void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
+ } else {
+ (void) printf("Added handler %llu with the following "
+ "properties:\n", (u_longlong_t)zc.zc_guid);
+ (void) printf(" pool: %s\n", pool);
+ if (record->zi_guid) {
+ (void) printf(" vdev: %llx\n",
+ (u_longlong_t)record->zi_guid);
+ } else if (record->zi_func[0] != '\0') {
+ (void) printf(" panic function: %s\n",
+ record->zi_func);
+ } else if (record->zi_duration > 0) {
+ (void) printf(" time: %lld seconds\n",
+ (u_longlong_t)record->zi_duration);
+ } else if (record->zi_duration < 0) {
+ (void) printf(" txgs: %lld \n",
+ (u_longlong_t)-record->zi_duration);
+ } else {
+ (void) printf("objset: %llu\n",
+ (u_longlong_t)record->zi_objset);
+ (void) printf("object: %llu\n",
+ (u_longlong_t)record->zi_object);
+ (void) printf(" type: %llu\n",
+ (u_longlong_t)record->zi_type);
+ (void) printf(" level: %d\n", record->zi_level);
+ if (record->zi_start == 0 &&
+ record->zi_end == -1ULL)
+ (void) printf(" range: all\n");
+ else
+ (void) printf(" range: [%llu, %llu)\n",
+ (u_longlong_t)record->zi_start,
+ (u_longlong_t)record->zi_end);
+ (void) printf(" dvas: 0x%x\n", record->zi_dvas);
+ }
+ }
+
+ return (0);
+}
+
+static int
+perform_action(const char *pool, zinject_record_t *record, int cmd)
+{
+ zfs_cmd_t zc = {"\0"};
+
+ ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
+ (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
+ zc.zc_guid = record->zi_guid;
+ zc.zc_cookie = cmd;
+
+ if (zfs_ioctl(g_zfs, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+ return (0);
+
+ return (1);
+}
+
+static int
+parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
+{
+ unsigned long scan_delay;
+ unsigned long scan_nlanes;
+
+ if (sscanf(str, "%lu:%lu", &scan_delay, &scan_nlanes) != 2)
+ return (1);
+
+ /*
+ * We explicitly disallow a delay of zero here, because we key
+ * off this value being non-zero in translate_device(), to
+ * determine if the fault is a ZINJECT_DELAY_IO fault or not.
+ */
+ if (scan_delay == 0)
+ return (1);
+
+ /*
+ * The unit for the CLI delay parameter is milliseconds, but
+ * the data passed to the kernel is interpreted as nanoseconds.
+ * Thus we scale the milliseconds to nanoseconds here, and this
+ * nanosecond value is used to pass the delay to the kernel.
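+ *
+ * For example (illustrative values), '-D 25:2' parses to scan_delay = 25
+ * and scan_nlanes = 2, so *delay becomes 25,000,000 ns and *nlanes 2.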
+ */
+ *delay = MSEC2NSEC(scan_delay);
+ *nlanes = scan_nlanes;
+
+ return (0);
+}
+
+static int
+parse_frequency(const char *str, uint32_t *percent)
+{
+ double val;
+ char *post;
+
+ val = strtod(str, &post);
+ if (post == NULL || *post != '\0')
+ return (EINVAL);
+
+ /* valid range is [0.0001, 100.0] */
+ val /= 100.0f;
+ if (val < 0.000001f || val > 1.0f)
+ return (ERANGE);
+
+ /* convert to an integer for use by kernel */
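+ /* e.g. (illustrative) "0.5" yields val = 0.005, i.e. 0.5% of ZI_PERCENTAGE_MAX */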
+ *percent = ((uint32_t)(val * ZI_PERCENTAGE_MAX));
+
+ return (0);
+}
+
+/*
+ * This function converts a string specifier for DVAs into a bit mask.
+ * The DVAs provided by the user should be 0-indexed and separated by
+ * a comma. For example:
+ * "1" -> 0b0010 (0x2)
+ * "0,1" -> 0b0011 (0x3)
+ * "0,1,2" -> 0b0111 (0x7)
+ */
+static int
+parse_dvas(const char *str, uint32_t *dvas_out)
+{
+ const char *c = str;
+ uint32_t mask = 0;
+ boolean_t need_delim = B_FALSE;
+
+ /* max string length is 5 ("0,1,2") */
+ if (strlen(str) > 5 || strlen(str) == 0)
+ return (EINVAL);
+
+ while (*c != '\0') {
+ switch (*c) {
+ case '0':
+ case '1':
+ case '2':
+ /* consecutive DVAs must be separated by a comma */
+ if (need_delim)
+ return (EINVAL);
+
+ /* check if this DVA has been set already */
+ if (mask & (1 << ((*c) - '0')))
+ return (EINVAL);
+
+ mask |= (1 << ((*c) - '0'));
+ need_delim = B_TRUE;
+ break;
+ case ',':
+ need_delim = B_FALSE;
+ break;
+ default:
+ /* check for invalid character */
+ return (EINVAL);
+ }
+ c++;
+ }
+
+ /* check for dangling delimiter */
+ if (!need_delim)
+ return (EINVAL);
+
+ *dvas_out = mask;
+ return (0);
+}
+
+int
+main(int argc, char **argv)
+{
+ int c;
+ char *range = NULL;
+ char *cancel = NULL;
+ char *end;
+ char *raw = NULL;
+ char *device = NULL;
+ int level = 0;
+ int quiet = 0;
+ int error = 0;
+ int domount = 0;
+ int io_type = ZIO_TYPES;
+ int action = VDEV_STATE_UNKNOWN;
+ err_type_t type = TYPE_INVAL;
+ err_type_t label = TYPE_INVAL;
+ zinject_record_t record = { 0 };
+ char pool[MAXNAMELEN] = "";
+ char dataset[MAXNAMELEN] = "";
+ zfs_handle_t *zhp = NULL;
+ int nowrites = 0;
+ int dur_txg = 0;
+ int dur_secs = 0;
+ int ret;
+ int flags = 0;
+ uint32_t dvas = 0;
+
+ if ((g_zfs = libzfs_init()) == NULL) {
+ (void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
+ return (1);
+ }
+
+ libzfs_print_on_error(g_zfs, B_TRUE);
+
+ if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
+ (void) fprintf(stderr, "failed to open ZFS device\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+
+ if (argc == 1) {
+ /*
+ * No arguments. Print the available handlers. If there are no
+ * available handlers, direct the user to '-h' for help
+ * information.
+ */
+ if (print_all_handlers() == 0) {
+ (void) printf("No handlers registered.\n");
+ (void) printf("Run 'zinject -h' for usage "
+ "information.\n");
+ }
+ libzfs_fini(g_zfs);
+ return (0);
+ }
+
+ while ((c = getopt(argc, argv,
+ ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
+ switch (c) {
+ case 'a':
+ flags |= ZINJECT_FLUSH_ARC;
+ break;
+ case 'A':
+ if (strcasecmp(optarg, "degrade") == 0) {
+ action = VDEV_STATE_DEGRADED;
+ } else if (strcasecmp(optarg, "fault") == 0) {
+ action = VDEV_STATE_FAULTED;
+ } else {
+ (void) fprintf(stderr, "invalid action '%s': "
+ "must be 'degrade' or 'fault'\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ break;
+ case 'b':
+ raw = optarg;
+ break;
+ case 'c':
+ cancel = optarg;
+ break;
+ case 'C':
+ ret = parse_dvas(optarg, &dvas);
+ if (ret != 0) {
+ (void) fprintf(stderr, "invalid DVA list '%s': "
+ "DVAs should be 0 indexed and separated by "
+ "commas.\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ break;
+ case 'd':
+ device = optarg;
+ break;
+ case 'D':
+ errno = 0;
+ ret = parse_delay(optarg, &record.zi_timer,
+ &record.zi_nlanes);
+ if (ret != 0) {
+ (void) fprintf(stderr, "invalid i/o delay "
+ "value: '%s'\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ break;
+ case 'e':
+ if (strcasecmp(optarg, "io") == 0) {
+ error = EIO;
+ } else if (strcasecmp(optarg, "checksum") == 0) {
+ error = ECKSUM;
+ } else if (strcasecmp(optarg, "decompress") == 0) {
+ error = EINVAL;
+ } else if (strcasecmp(optarg, "decrypt") == 0) {
+ error = EACCES;
+ } else if (strcasecmp(optarg, "nxio") == 0) {
+ error = ENXIO;
+ } else if (strcasecmp(optarg, "dtl") == 0) {
+ error = ECHILD;
+ } else if (strcasecmp(optarg, "corrupt") == 0) {
+ error = EILSEQ;
+ } else {
+ (void) fprintf(stderr, "invalid error type "
+ "'%s': must be 'io', 'checksum' or "
+ "'nxio'\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ break;
+ case 'f':
+ ret = parse_frequency(optarg, &record.zi_freq);
+ if (ret != 0) {
+ (void) fprintf(stderr, "%sfrequency value must "
+ "be in the range [0.0001, 100.0]\n",
+ ret == EINVAL ? "invalid value: " :
+ ret == ERANGE ? "out of range: " : "");
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ break;
+ case 'F':
+ record.zi_failfast = B_TRUE;
+ break;
+ case 'g':
+ dur_txg = 1;
+ record.zi_duration = (int)strtol(optarg, &end, 10);
+ if (record.zi_duration <= 0 || *end != '\0') {
+ (void) fprintf(stderr, "invalid duration '%s': "
+ "must be a positive integer\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ /* store duration of txgs as its negative */
+ record.zi_duration *= -1;
+ break;
+ case 'h':
+ usage();
+ libzfs_fini(g_zfs);
+ return (0);
+ case 'I':
+ /* default duration, if one hasn't yet been defined */
+ nowrites = 1;
+ if (dur_secs == 0 && dur_txg == 0)
+ record.zi_duration = 30;
+ break;
+ case 'l':
+ level = (int)strtol(optarg, &end, 10);
+ if (*end != '\0') {
+ (void) fprintf(stderr, "invalid level '%s': "
+ "must be an integer\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ break;
+ case 'm':
+ domount = 1;
+ break;
+ case 'p':
+ (void) strlcpy(record.zi_func, optarg,
+ sizeof (record.zi_func));
+ record.zi_cmd = ZINJECT_PANIC;
+ break;
+ case 'q':
+ quiet = 1;
+ break;
+ case 'r':
+ range = optarg;
+ flags |= ZINJECT_CALC_RANGE;
+ break;
+ case 's':
+ dur_secs = 1;
+ record.zi_duration = (int)strtol(optarg, &end, 10);
+ if (record.zi_duration <= 0 || *end != '\0') {
+ (void) fprintf(stderr, "invalid duration '%s': "
+ "must be a positive integer\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ break;
+ case 'T':
+ if (strcasecmp(optarg, "read") == 0) {
+ io_type = ZIO_TYPE_READ;
+ } else if (strcasecmp(optarg, "write") == 0) {
+ io_type = ZIO_TYPE_WRITE;
+ } else if (strcasecmp(optarg, "free") == 0) {
+ io_type = ZIO_TYPE_FREE;
+ } else if (strcasecmp(optarg, "claim") == 0) {
+ io_type = ZIO_TYPE_CLAIM;
+ } else if (strcasecmp(optarg, "all") == 0) {
+ io_type = ZIO_TYPES;
+ } else {
+ (void) fprintf(stderr, "invalid I/O type "
+ "'%s': must be 'read', 'write', 'free', "
+ "'claim' or 'all'\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ break;
+ case 't':
+ if ((type = name_to_type(optarg)) == TYPE_INVAL &&
+ !MOS_TYPE(type)) {
+ (void) fprintf(stderr, "invalid type '%s'\n",
+ optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ break;
+ case 'u':
+ flags |= ZINJECT_UNLOAD_SPA;
+ break;
+ case 'L':
+ if ((label = name_to_type(optarg)) == TYPE_INVAL &&
+ !LABEL_TYPE(type)) {
+ (void) fprintf(stderr, "invalid label type "
+ "'%s'\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ break;
+ case ':':
+ (void) fprintf(stderr, "option -%c requires an "
+ "operand\n", optopt);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ case '?':
+ (void) fprintf(stderr, "invalid option '%c'\n",
+ optopt);
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (record.zi_duration != 0)
+ record.zi_cmd = ZINJECT_IGNORED_WRITES;
+
+ if (cancel != NULL) {
+ /*
+ * '-c' is invalid with any other options.
+ */
+ if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+ level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
+ record.zi_freq > 0 || dvas != 0) {
+ (void) fprintf(stderr, "cancel (-c) incompatible with "
+ "any other options\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+ if (argc != 0) {
+ (void) fprintf(stderr, "extraneous argument to '-c'\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
+ if (strcmp(cancel, "all") == 0) {
+ return (cancel_all_handlers());
+ } else {
+ int id = (int)strtol(cancel, &end, 10);
+ if (*end != '\0') {
+ (void) fprintf(stderr, "invalid handle id '%s':"
+ " must be an integer or 'all'\n", cancel);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ return (cancel_handler(id));
+ }
+ }
+
+ if (device != NULL) {
+ /*
+ * Device (-d) injection uses a completely different mechanism
+ * for doing injection, so handle it separately here.
+ */
+ if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+ level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
+ dvas != 0) {
+ (void) fprintf(stderr, "device (-d) incompatible with "
+ "data error injection\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
+ if (argc != 1) {
+ (void) fprintf(stderr, "device (-d) injection requires "
+ "a single pool name\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
+ (void) strlcpy(pool, argv[0], sizeof (pool));
+ dataset[0] = '\0';
+
+ if (error == ECKSUM) {
+ (void) fprintf(stderr, "device error type must be "
+ "'io', 'nxio' or 'corrupt'\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+
+ if (error == EILSEQ &&
+ (record.zi_freq == 0 || io_type != ZIO_TYPE_READ)) {
+ (void) fprintf(stderr, "device corrupt errors require "
+ "io type read and a frequency value\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+
+ record.zi_iotype = io_type;
+ if (translate_device(pool, device, label, &record) != 0) {
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ if (!error)
+ error = ENXIO;
+
+ if (action != VDEV_STATE_UNKNOWN)
+ return (perform_action(pool, &record, action));
+
+ } else if (raw != NULL) {
+ if (range != NULL || type != TYPE_INVAL || level != 0 ||
+ record.zi_cmd != ZINJECT_UNINITIALIZED ||
+ record.zi_freq > 0 || dvas != 0) {
+ (void) fprintf(stderr, "raw (-b) format with "
+ "any other options\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
+ if (argc != 1) {
+ (void) fprintf(stderr, "raw (-b) format expects a "
+ "single pool name\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
+ (void) strlcpy(pool, argv[0], sizeof (pool));
+ dataset[0] = '\0';
+
+ if (error == ENXIO) {
+ (void) fprintf(stderr, "data error type must be "
+ "'checksum' or 'io'\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+
+ record.zi_cmd = ZINJECT_DATA_FAULT;
+ if (translate_raw(raw, &record) != 0) {
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ if (!error)
+ error = EIO;
+ } else if (record.zi_cmd == ZINJECT_PANIC) {
+ if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+ level != 0 || device != NULL || record.zi_freq > 0 ||
+ dvas != 0) {
+ (void) fprintf(stderr, "panic (-p) incompatible with "
+ "other options\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
+ if (argc < 1 || argc > 2) {
+ (void) fprintf(stderr, "panic (-p) injection requires "
+ "a single pool name and an optional id\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
+ (void) strlcpy(pool, argv[0], sizeof (pool));
+ if (argv[1] != NULL)
+ record.zi_type = atoi(argv[1]);
+ dataset[0] = '\0';
+ } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
+ if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+ level != 0 || record.zi_freq > 0 || dvas != 0) {
+ (void) fprintf(stderr, "hardware failure (-I) "
+ "incompatible with other options\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
+ if (nowrites == 0) {
+ (void) fprintf(stderr, "-s or -g meaningless "
+ "without -I (ignore writes)\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ } else if (dur_secs && dur_txg) {
+ (void) fprintf(stderr, "choose a duration either "
+ "in seconds (-s) or a number of txgs (-g) "
+ "but not both\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ } else if (argc != 1) {
+ (void) fprintf(stderr, "ignore writes (-I) "
+ "injection requires a single pool name\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
+ (void) strlcpy(pool, argv[0], sizeof (pool));
+ dataset[0] = '\0';
+ } else if (type == TYPE_INVAL) {
+ if (flags == 0) {
+ (void) fprintf(stderr, "at least one of '-b', '-d', "
+ "'-t', '-a', '-p', '-I' or '-u' "
+ "must be specified\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
+ if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) {
+ (void) strlcpy(pool, argv[0], sizeof (pool));
+ dataset[0] = '\0';
+ } else if (argc != 0) {
+ (void) fprintf(stderr, "extraneous argument for "
+ "'-f'\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
+ flags |= ZINJECT_NULL;
+ } else {
+ if (argc != 1) {
+ (void) fprintf(stderr, "missing object\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
+ if (error == ENXIO || error == EILSEQ) {
+ (void) fprintf(stderr, "data error type must be "
+ "'checksum' or 'io'\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+
+ if (dvas != 0) {
+ if (error == EACCES || error == EINVAL) {
+ (void) fprintf(stderr, "the '-C' option may "
+ "not be used with logical data errors "
+ "'decrypt' and 'decompress'\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+
+ record.zi_dvas = dvas;
+ }
+
+ if (error == EACCES) {
+ if (type != TYPE_DATA) {
+ (void) fprintf(stderr, "decryption errors "
+ "may only be injected for 'data' types\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+
+ record.zi_cmd = ZINJECT_DECRYPT_FAULT;
+ /*
+ * Internally, ZFS actually uses ECKSUM for decryption
+ * errors since EACCES is used to indicate the key was
+ * not found.
+ */
+ error = ECKSUM;
+ } else {
+ record.zi_cmd = ZINJECT_DATA_FAULT;
+ }
+
+ if (translate_record(type, argv[0], range, level, &record, pool,
+ dataset) != 0) {
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ if (!error)
+ error = EIO;
+ }
+
+ /*
+ * If this is pool-wide metadata, unmount everything. The ioctl() will
+ * unload the pool, so that we trigger spa-wide reopen of metadata next
+ * time we access the pool.
+ */
+ if (dataset[0] != '\0' && domount) {
+ if ((zhp = zfs_open(g_zfs, dataset,
+ ZFS_TYPE_DATASET)) == NULL) {
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ if (zfs_unmount(zhp, NULL, 0) != 0) {
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ }
+
+ record.zi_error = error;
+
+ ret = register_handler(pool, flags, &record, quiet);
+
+ if (dataset[0] != '\0' && domount)
+ ret = (zfs_mount(zhp, NULL, 0) != 0);
+
+ libzfs_fini(g_zfs);
+
+ return (ret);
+}
diff --git a/sys/contrib/openzfs/cmd/zinject/zinject.h b/sys/contrib/openzfs/cmd/zinject/zinject.h
new file mode 100644
index 000000000000..46fdcad8b31f
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zinject/zinject.h
@@ -0,0 +1,70 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _ZINJECT_H
+#define _ZINJECT_H
+
+#include <sys/zfs_ioctl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ TYPE_DATA, /* plain file contents */
+ TYPE_DNODE, /* metadnode contents */
+ TYPE_MOS, /* all MOS data */
+ TYPE_MOSDIR, /* MOS object directory */
+ TYPE_METASLAB, /* metaslab objects */
+ TYPE_CONFIG, /* MOS config */
+ TYPE_BPOBJ, /* block pointer list */
+ TYPE_SPACEMAP, /* space map objects */
+ TYPE_ERRLOG, /* persistent error log */
+ TYPE_LABEL_UBERBLOCK, /* label specific uberblock */
+ TYPE_LABEL_NVLIST, /* label specific nvlist */
+ TYPE_LABEL_PAD1, /* label specific 8K pad1 area */
+ TYPE_LABEL_PAD2, /* label specific 8K pad2 area */
+ TYPE_INVAL
+} err_type_t;
+
+#define MOS_TYPE(t) \
+ ((t) >= TYPE_MOS && (t) < TYPE_LABEL_UBERBLOCK)
+
+#define LABEL_TYPE(t) \
+ ((t) >= TYPE_LABEL_UBERBLOCK && (t) < TYPE_INVAL)
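+
+/*
+ * Both macros rely on the ordering of err_type_t above: for example,
+ * MOS_TYPE(TYPE_CONFIG) and LABEL_TYPE(TYPE_LABEL_PAD1) are true, while
+ * MOS_TYPE(TYPE_DATA) and LABEL_TYPE(TYPE_INVAL) are false.
+ */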
+
+int translate_record(err_type_t type, const char *object, const char *range,
+ int level, zinject_record_t *record, char *poolname, char *dataset);
+int translate_raw(const char *raw, zinject_record_t *record);
+int translate_device(const char *pool, const char *device,
+ err_type_t label_type, zinject_record_t *record);
+void usage(void);
+
+extern libzfs_handle_t *g_zfs;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZINJECT_H */
diff --git a/sys/contrib/openzfs/cmd/zpool/.gitignore b/sys/contrib/openzfs/cmd/zpool/.gitignore
new file mode 100644
index 000000000000..8ea518af78e5
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/.gitignore
@@ -0,0 +1 @@
+/zpool
diff --git a/sys/contrib/openzfs/cmd/zpool/Makefile.am b/sys/contrib/openzfs/cmd/zpool/Makefile.am
new file mode 100644
index 000000000000..c0378b136901
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/Makefile.am
@@ -0,0 +1,136 @@
+include $(top_srcdir)/config/Rules.am
+
+AM_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUUID_CFLAGS)
+
+DEFAULT_INCLUDES += -I$(srcdir)
+
+sbin_PROGRAMS = zpool
+
+zpool_SOURCES = \
+ zpool_iter.c \
+ zpool_main.c \
+ zpool_util.c \
+ zpool_util.h \
+ zpool_vdev.c
+
+if BUILD_FREEBSD
+zpool_SOURCES += os/freebsd/zpool_vdev_os.c
+endif
+
+if BUILD_LINUX
+zpool_SOURCES += os/linux/zpool_vdev_os.c
+endif
+
+zpool_LDADD = \
+ $(abs_top_builddir)/lib/libzfs/libzfs.la \
+ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+ $(abs_top_builddir)/lib/libnvpair/libnvpair.la \
+ $(abs_top_builddir)/lib/libuutil/libuutil.la
+
+zpool_LDADD += $(LTLIBINTL)
+
+if BUILD_FREEBSD
+zpool_LDADD += -lgeom
+endif
+zpool_LDADD += -lm $(LIBBLKID_LIBS) $(LIBUUID_LIBS)
+
+zpoolconfdir = $(sysconfdir)/zfs/zpool.d
+zpoolexecdir = $(zfsexecdir)/zpool.d
+
+EXTRA_DIST = zpool.d/README
+
+dist_zpoolexec_SCRIPTS = \
+ zpool.d/dm-deps \
+ zpool.d/enc \
+ zpool.d/encdev \
+ zpool.d/fault_led \
+ zpool.d/iostat \
+ zpool.d/iostat-1s \
+ zpool.d/iostat-10s \
+ zpool.d/label \
+ zpool.d/locate_led \
+ zpool.d/lsblk \
+ zpool.d/media \
+ zpool.d/model \
+ zpool.d/serial \
+ zpool.d/ses \
+ zpool.d/size \
+ zpool.d/slot \
+ zpool.d/smart \
+ zpool.d/smartx \
+ zpool.d/temp \
+ zpool.d/health \
+ zpool.d/r_proc \
+ zpool.d/w_proc \
+ zpool.d/r_ucor \
+ zpool.d/w_ucor \
+ zpool.d/nonmed \
+ zpool.d/defect \
+ zpool.d/hours_on \
+ zpool.d/realloc \
+ zpool.d/rep_ucor \
+ zpool.d/cmd_to \
+ zpool.d/pend_sec \
+ zpool.d/off_ucor \
+ zpool.d/ata_err \
+ zpool.d/nvme_err \
+ zpool.d/pwr_cyc \
+ zpool.d/upath \
+ zpool.d/vendor \
+ zpool.d/smart_test \
+ zpool.d/test_type \
+ zpool.d/test_status \
+ zpool.d/test_progress \
+ zpool.d/test_ended
+
+zpoolconfdefaults = \
+ dm-deps \
+ enc \
+ encdev \
+ fault_led \
+ iostat \
+ iostat-1s \
+ iostat-10s \
+ label \
+ locate_led \
+ lsblk \
+ media \
+ model \
+ serial \
+ ses \
+ size \
+ slot \
+ smart \
+ smartx \
+ temp \
+ health \
+ r_proc \
+ w_proc \
+ r_ucor \
+ w_ucor \
+ nonmed \
+ defect \
+ hours_on \
+ realloc \
+ rep_ucor \
+ cmd_to \
+ pend_sec \
+ off_ucor \
+ ata_err \
+ nvme_err \
+ pwr_cyc \
+ upath \
+ vendor \
+ smart_test \
+ test_type \
+ test_status \
+ test_progress \
+ test_ended
+
+install-data-hook:
+ $(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)"
+ for f in $(zpoolconfdefaults); do \
+ test -f "$(DESTDIR)$(zpoolconfdir)/$${f}" -o \
+ -L "$(DESTDIR)$(zpoolconfdir)/$${f}" || \
+ ln -s "$(zpoolexecdir)/$${f}" "$(DESTDIR)$(zpoolconfdir)"; \
+ done
diff --git a/sys/contrib/openzfs/cmd/zpool/os/freebsd/zpool_vdev_os.c b/sys/contrib/openzfs/cmd/zpool/os/freebsd/zpool_vdev_os.c
new file mode 100644
index 000000000000..7d48f61a0ee7
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/os/freebsd/zpool_vdev_os.c
@@ -0,0 +1,103 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2017 Intel Corporation.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ */
+
+/*
+ * Functions to convert between a list of vdevs and an nvlist representing the
+ * configuration. Each entry in the list can be one of:
+ *
+ * Device vdevs
+ * disk=(path=..., devid=...)
+ * file=(path=...)
+ *
+ * Group vdevs
+ * raidz[1|2]=(...)
+ * mirror=(...)
+ *
+ * Hot spares
+ *
+ * While the underlying implementation supports it, group vdevs cannot contain
+ * other group vdevs. All userland verification of devices is contained within
+ * this file. If successful, the nvlist returned can be passed directly to the
+ * kernel; we've done as much verification as possible in userland.
+ *
+ * Hot spares are a special case, and passed down as an array of disk vdevs, at
+ * the same level as the root of the vdev tree.
+ *
+ * The only function exported by this file is 'make_root_vdev'. The
+ * function performs several passes:
+ *
+ * 1. Construct the vdev specification. Performs syntax validation and
+ * makes sure each device is valid.
+ * 2. Check for devices in use. Uses libdiskmgt to make sure that no
+ * devices are already in use. Some can be overridden using the 'force'
+ * flag, others cannot.
+ * 3. Check for replication errors if the 'force' flag is not specified.
+ * Validates that the replication level is consistent across the
+ * entire pool.
+ * 4. Call libzfs to label any whole disks with an EFI label.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libintl.h>
+#include <libnvpair.h>
+#include <libzutil.h>
+#include <limits.h>
+#include <sys/spa.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <paths.h>
+#include <sys/stat.h>
+#include <sys/disk.h>
+#include <sys/mntent.h>
+#include <libgeom.h>
+
+#include "zpool_util.h"
+#include <sys/zfs_context.h>
+
+int
+check_device(const char *name, boolean_t force, boolean_t isspare,
+ boolean_t iswholedisk)
+{
+ char path[MAXPATHLEN];
+
+ if (strncmp(name, _PATH_DEV, sizeof (_PATH_DEV) - 1) != 0)
+ snprintf(path, sizeof (path), "%s%s", _PATH_DEV, name);
+ else
+ strlcpy(path, name, sizeof (path));
+
+ return (check_file(path, force, isspare));
+}
+
+boolean_t
+check_sector_size_database(char *path, int *sector_size)
+{
+ return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c b/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c
new file mode 100644
index 000000000000..d087c4c14dac
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/os/linux/zpool_vdev_os.c
@@ -0,0 +1,410 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2017 Intel Corporation.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ */
+
+/*
+ * Functions to convert between a list of vdevs and an nvlist representing the
+ * configuration. Each entry in the list can be one of:
+ *
+ * Device vdevs
+ * disk=(path=..., devid=...)
+ * file=(path=...)
+ *
+ * Group vdevs
+ * raidz[1|2]=(...)
+ * mirror=(...)
+ *
+ * Hot spares
+ *
+ * While the underlying implementation supports it, group vdevs cannot contain
+ * other group vdevs. All userland verification of devices is contained within
+ * this file. If successful, the nvlist returned can be passed directly to the
+ * kernel; we've done as much verification as possible in userland.
+ *
+ * Hot spares are a special case, and passed down as an array of disk vdevs, at
+ * the same level as the root of the vdev tree.
+ *
+ * The only function exported by this file is 'make_root_vdev'. The
+ * function performs several passes:
+ *
+ * 1. Construct the vdev specification. Performs syntax validation and
+ * makes sure each device is valid.
+ * 2. Check for devices in use. Uses libblkid to make sure that no
+ * devices are already in use. Some can be overridden using the 'force'
+ * flag, others cannot.
+ * 3. Check for replication errors if the 'force' flag is not specified.
+ * Validates that the replication level is consistent across the
+ * entire pool.
+ * 4. Call libzfs to label any whole disks with an EFI label.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libintl.h>
+#include <libnvpair.h>
+#include <libzutil.h>
+#include <limits.h>
+#include <sys/spa.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "zpool_util.h"
+#include <sys/zfs_context.h>
+
+#include <scsi/scsi.h>
+#include <scsi/sg.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/efi_partition.h>
+#include <sys/stat.h>
+#include <sys/vtoc.h>
+#include <sys/mntent.h>
+#include <uuid/uuid.h>
+#include <blkid/blkid.h>
+
+typedef struct vdev_disk_db_entry
+{
+ char id[24];
+ int sector_size;
+} vdev_disk_db_entry_t;
+
+/*
+ * Database of block devices that lie about physical sector sizes. The
+ * identification string must be precisely 24 characters to avoid false
+ * negatives
+ */
+static vdev_disk_db_entry_t vdev_disk_database[] = {
+ {"ATA ADATA SSD S396 3", 8192},
+ {"ATA APPLE SSD SM128E", 8192},
+ {"ATA APPLE SSD SM256E", 8192},
+ {"ATA APPLE SSD SM512E", 8192},
+ {"ATA APPLE SSD SM768E", 8192},
+ {"ATA C400-MTFDDAC064M", 8192},
+ {"ATA C400-MTFDDAC128M", 8192},
+ {"ATA C400-MTFDDAC256M", 8192},
+ {"ATA C400-MTFDDAC512M", 8192},
+ {"ATA Corsair Force 3 ", 8192},
+ {"ATA Corsair Force GS", 8192},
+ {"ATA INTEL SSDSA2CT04", 8192},
+ {"ATA INTEL SSDSA2BZ10", 8192},
+ {"ATA INTEL SSDSA2BZ20", 8192},
+ {"ATA INTEL SSDSA2BZ30", 8192},
+ {"ATA INTEL SSDSA2CW04", 8192},
+ {"ATA INTEL SSDSA2CW08", 8192},
+ {"ATA INTEL SSDSA2CW12", 8192},
+ {"ATA INTEL SSDSA2CW16", 8192},
+ {"ATA INTEL SSDSA2CW30", 8192},
+ {"ATA INTEL SSDSA2CW60", 8192},
+ {"ATA INTEL SSDSC2CT06", 8192},
+ {"ATA INTEL SSDSC2CT12", 8192},
+ {"ATA INTEL SSDSC2CT18", 8192},
+ {"ATA INTEL SSDSC2CT24", 8192},
+ {"ATA INTEL SSDSC2CW06", 8192},
+ {"ATA INTEL SSDSC2CW12", 8192},
+ {"ATA INTEL SSDSC2CW18", 8192},
+ {"ATA INTEL SSDSC2CW24", 8192},
+ {"ATA INTEL SSDSC2CW48", 8192},
+ {"ATA KINGSTON SH100S3", 8192},
+ {"ATA KINGSTON SH103S3", 8192},
+ {"ATA M4-CT064M4SSD2 ", 8192},
+ {"ATA M4-CT128M4SSD2 ", 8192},
+ {"ATA M4-CT256M4SSD2 ", 8192},
+ {"ATA M4-CT512M4SSD2 ", 8192},
+ {"ATA OCZ-AGILITY2 ", 8192},
+ {"ATA OCZ-AGILITY3 ", 8192},
+ {"ATA OCZ-VERTEX2 3.5 ", 8192},
+ {"ATA OCZ-VERTEX3 ", 8192},
+ {"ATA OCZ-VERTEX3 LT ", 8192},
+ {"ATA OCZ-VERTEX3 MI ", 8192},
+ {"ATA OCZ-VERTEX4 ", 8192},
+ {"ATA SAMSUNG MZ7WD120", 8192},
+ {"ATA SAMSUNG MZ7WD240", 8192},
+ {"ATA SAMSUNG MZ7WD480", 8192},
+ {"ATA SAMSUNG MZ7WD960", 8192},
+ {"ATA SAMSUNG SSD 830 ", 8192},
+ {"ATA Samsung SSD 840 ", 8192},
+ {"ATA SanDisk SSD U100", 8192},
+ {"ATA TOSHIBA THNSNH06", 8192},
+ {"ATA TOSHIBA THNSNH12", 8192},
+ {"ATA TOSHIBA THNSNH25", 8192},
+ {"ATA TOSHIBA THNSNH51", 8192},
+ {"ATA APPLE SSD TS064C", 4096},
+ {"ATA APPLE SSD TS128C", 4096},
+ {"ATA APPLE SSD TS256C", 4096},
+ {"ATA APPLE SSD TS512C", 4096},
+ {"ATA INTEL SSDSA2M040", 4096},
+ {"ATA INTEL SSDSA2M080", 4096},
+ {"ATA INTEL SSDSA2M160", 4096},
+ {"ATA INTEL SSDSC2MH12", 4096},
+ {"ATA INTEL SSDSC2MH25", 4096},
+ {"ATA OCZ CORE_SSD ", 4096},
+ {"ATA OCZ-VERTEX ", 4096},
+ {"ATA SAMSUNG MCCOE32G", 4096},
+ {"ATA SAMSUNG MCCOE64G", 4096},
+ {"ATA SAMSUNG SSD PM80", 4096},
+ /* Flash drives optimized for 4KB IOs on larger pages */
+ {"ATA INTEL SSDSC2BA10", 4096},
+ {"ATA INTEL SSDSC2BA20", 4096},
+ {"ATA INTEL SSDSC2BA40", 4096},
+ {"ATA INTEL SSDSC2BA80", 4096},
+ {"ATA INTEL SSDSC2BB08", 4096},
+ {"ATA INTEL SSDSC2BB12", 4096},
+ {"ATA INTEL SSDSC2BB16", 4096},
+ {"ATA INTEL SSDSC2BB24", 4096},
+ {"ATA INTEL SSDSC2BB30", 4096},
+ {"ATA INTEL SSDSC2BB40", 4096},
+ {"ATA INTEL SSDSC2BB48", 4096},
+ {"ATA INTEL SSDSC2BB60", 4096},
+ {"ATA INTEL SSDSC2BB80", 4096},
+ {"ATA INTEL SSDSC2BW24", 4096},
+ {"ATA INTEL SSDSC2BW48", 4096},
+ {"ATA INTEL SSDSC2BP24", 4096},
+ {"ATA INTEL SSDSC2BP48", 4096},
+ {"NA SmrtStorSDLKAE9W", 4096},
+ {"NVMe Amazon EC2 NVMe ", 4096},
+ /* Imported from Open Solaris */
+ {"ATA MARVELL SD88SA02", 4096},
+ /* Advanced format Hard drives */
+ {"ATA Hitachi HDS5C303", 4096},
+ {"ATA SAMSUNG HD204UI ", 4096},
+ {"ATA ST2000DL004 HD20", 4096},
+ {"ATA WDC WD10EARS-00M", 4096},
+ {"ATA WDC WD10EARS-00S", 4096},
+ {"ATA WDC WD10EARS-00Z", 4096},
+ {"ATA WDC WD15EARS-00M", 4096},
+ {"ATA WDC WD15EARS-00S", 4096},
+ {"ATA WDC WD15EARS-00Z", 4096},
+ {"ATA WDC WD20EARS-00M", 4096},
+ {"ATA WDC WD20EARS-00S", 4096},
+ {"ATA WDC WD20EARS-00Z", 4096},
+ {"ATA WDC WD1600BEVT-0", 4096},
+ {"ATA WDC WD2500BEVT-0", 4096},
+ {"ATA WDC WD3200BEVT-0", 4096},
+ {"ATA WDC WD5000BEVT-0", 4096},
+};
+
+
+#define INQ_REPLY_LEN 96
+#define INQ_CMD_LEN 6
+
+static const int vdev_disk_database_size =
+ sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]);
+
+boolean_t
+check_sector_size_database(char *path, int *sector_size)
+{
+ unsigned char inq_buff[INQ_REPLY_LEN];
+ unsigned char sense_buffer[32];
+ unsigned char inq_cmd_blk[INQ_CMD_LEN] =
+ {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0};
+ sg_io_hdr_t io_hdr;
+ int error;
+ int fd;
+ int i;
+
+ /* Prepare INQUIRY command */
+ memset(&io_hdr, 0, sizeof (sg_io_hdr_t));
+ io_hdr.interface_id = 'S';
+ io_hdr.cmd_len = sizeof (inq_cmd_blk);
+ io_hdr.mx_sb_len = sizeof (sense_buffer);
+ io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+ io_hdr.dxfer_len = INQ_REPLY_LEN;
+ io_hdr.dxferp = inq_buff;
+ io_hdr.cmdp = inq_cmd_blk;
+ io_hdr.sbp = sense_buffer;
+ io_hdr.timeout = 10; /* 10 milliseconds is ample time */
+
+ if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
+ return (B_FALSE);
+
+ error = ioctl(fd, SG_IO, (unsigned long) &io_hdr);
+
+ (void) close(fd);
+
+ if (error < 0)
+ return (B_FALSE);
+
+ if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
+ return (B_FALSE);
+
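+ /*
+ * Bytes 8-31 of standard INQUIRY data hold the 8-byte vendor
+ * identification followed by the 16-byte product identification,
+ * which matches the 24-character strings stored in vdev_disk_database.
+ */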
+ for (i = 0; i < vdev_disk_database_size; i++) {
+ if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24))
+ continue;
+
+ *sector_size = vdev_disk_database[i].sector_size;
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+static int
+check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare)
+{
+ int err;
+ char *value;
+
+ /* No valid type detected; the device is safe to use */
+ value = blkid_get_tag_value(cache, "TYPE", path);
+ if (value == NULL)
+ return (0);
+
+ /*
+ * If libblkid detects a ZFS device, we check the device
+ * using check_file() to see if it's safe. The one safe
+ * case is a spare device shared between multiple pools.
+ */
+ if (strcmp(value, "zfs_member") == 0) {
+ err = check_file(path, force, isspare);
+ } else {
+ if (force) {
+ err = 0;
+ } else {
+ err = -1;
+ vdev_error(gettext("%s contains a filesystem of "
+ "type '%s'\n"), path, value);
+ }
+ }
+
+ free(value);
+
+ return (err);
+}
+
+/*
+ * Validate that a disk, including all of its partitions, is safe to use.
+ *
+ * For EFI labeled disks this can be done relatively easily with the libefi
+ * library. The partition numbers are extracted from the label and used
+ * to generate the expected /dev/ paths. Each partition can then be
+ * checked for conflicts.
+ *
+ * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible
+ * but due to the lack of readily available libraries this scanning is
+ * not implemented. Instead only the device path as given is checked.
+ */
+static int
+check_disk(const char *path, blkid_cache cache, int force,
+ boolean_t isspare, boolean_t iswholedisk)
+{
+ struct dk_gpt *vtoc;
+ char slice_path[MAXPATHLEN];
+ int err = 0;
+ int fd, i;
+ int flags = O_RDONLY|O_DIRECT;
+
+ if (!iswholedisk)
+ return (check_slice(path, cache, force, isspare));
+
+ /* only spares can be shared, other devices require exclusive access */
+ if (!isspare)
+ flags |= O_EXCL;
+
+ if ((fd = open(path, flags)) < 0) {
+ char *value = blkid_get_tag_value(cache, "TYPE", path);
+ (void) fprintf(stderr, gettext("%s is in use and contains "
+ "a %s filesystem.\n"), path, value ? value : "unknown");
+ free(value);
+ return (-1);
+ }
+
+ /*
+ * Expected to fail for non-EFI labeled disks. Just check the device
+ * as given and do not attempt to detect and scan partitions.
+ */
+ err = efi_alloc_and_read(fd, &vtoc);
+ if (err) {
+ (void) close(fd);
+ return (check_slice(path, cache, force, isspare));
+ }
+
+ /*
+ * The primary EFI partition label is damaged, but the secondary
+ * label at the end of the device is intact. Rather than use this
+ * label, we play it safe and treat this as a non-EFI device.
+ */
+ if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) {
+ efi_free(vtoc);
+ (void) close(fd);
+
+ if (force) {
+ /* Partitions will now be created using the backup */
+ return (0);
+ } else {
+ vdev_error(gettext("%s contains a corrupt primary "
+ "EFI label.\n"), path);
+ return (-1);
+ }
+ }
+
+ for (i = 0; i < vtoc->efi_nparts; i++) {
+
+ if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED ||
+ uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid))
+ continue;
+
+ if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0)
+ (void) snprintf(slice_path, sizeof (slice_path),
+ "%s%s%d", path, "-part", i+1);
+ else
+ (void) snprintf(slice_path, sizeof (slice_path),
+ "%s%s%d", path, isdigit(path[strlen(path)-1]) ?
+ "p" : "", i+1);
+
+ err = check_slice(slice_path, cache, force, isspare);
+ if (err)
+ break;
+ }
+
+ efi_free(vtoc);
+ (void) close(fd);
+
+ return (err);
+}
+
+int
+check_device(const char *path, boolean_t force,
+ boolean_t isspare, boolean_t iswholedisk)
+{
+ blkid_cache cache;
+ int error;
+
+ error = blkid_get_cache(&cache, NULL);
+ if (error != 0) {
+ (void) fprintf(stderr, gettext("unable to access the blkid "
+ "cache.\n"));
+ return (-1);
+ }
+
+ error = check_disk(path, cache, force, isspare, iswholedisk);
+ blkid_put_cache(cache);
+
+ return (error);
+}
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/README b/sys/contrib/openzfs/cmd/zpool/zpool.d/README
new file mode 100644
index 000000000000..033b7c363f5a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/README
@@ -0,0 +1,9 @@
+This directory contains scripts that can be run with the zpool status/iostat
+-c option:
+
+ zpool status -c script1,script2, ...
+
+ zpool iostat -vc script1,script2, ...
+
+Some scripts output different values depending on the symlink name that is
+used to run them. See the zpool(8) man page for more details.
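+
+For example (illustrative), 'zpool status -c model,size,vendor' adds model,
+size, and vendor columns for each vdev, all produced by the same lsblk
+helper script through its different symlink names.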
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/ata_err b/sys/contrib/openzfs/cmd/zpool/zpool.d/ata_err
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/ata_err
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/cmd_to b/sys/contrib/openzfs/cmd/zpool/zpool.d/cmd_to
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/cmd_to
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/defect b/sys/contrib/openzfs/cmd/zpool/zpool.d/defect
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/defect
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/dm-deps b/sys/contrib/openzfs/cmd/zpool/zpool.d/dm-deps
new file mode 100755
index 000000000000..ee39514e4d92
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/dm-deps
@@ -0,0 +1,29 @@
+#!/bin/sh
+#
+# Show device mapper dependent / underlying devices. This is useful for
+# looking up the /dev/sd* devices associated with a dm or multipath device.
+#
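+# Typically run through the -c option, e.g. (illustrative):
+#   zpool status -c dm-deps
+#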
+
+if [ "$1" = "-h" ] ; then
+ echo "Show device mapper dependent (underlying) devices."
+ exit
+fi
+
+dev="$VDEV_PATH"
+
+# If the VDEV path is a symlink, resolve it to a real device
+if [ -L "$dev" ] ; then
+ dev=$(readlink "$dev")
+fi
+
+dev=$(basename "$dev")
+val=""
+if [ -d "/sys/class/block/$dev/slaves" ] ; then
+ # ls -C: output in columns, no newlines
+ val=$(ls -C "/sys/class/block/$dev/slaves")
+
+ # ls -C will print two spaces between files; change to one space.
+ val=$(echo "$val" | sed -r 's/[[:blank:]]+/ /g')
+fi
+
+echo "dm-deps=$val"
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/enc b/sys/contrib/openzfs/cmd/zpool/zpool.d/enc
new file mode 120000
index 000000000000..478d1e8967a1
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/enc
@@ -0,0 +1 @@
+ses \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/encdev b/sys/contrib/openzfs/cmd/zpool/zpool.d/encdev
new file mode 120000
index 000000000000..478d1e8967a1
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/encdev
@@ -0,0 +1 @@
+ses \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/fault_led b/sys/contrib/openzfs/cmd/zpool/zpool.d/fault_led
new file mode 120000
index 000000000000..478d1e8967a1
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/fault_led
@@ -0,0 +1 @@
+ses \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/health b/sys/contrib/openzfs/cmd/zpool/zpool.d/health
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/health
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/hours_on b/sys/contrib/openzfs/cmd/zpool/zpool.d/hours_on
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/hours_on
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat
new file mode 100755
index 000000000000..41a3acfae7a4
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat
@@ -0,0 +1,77 @@
+#!/bin/sh
+#
+# Display most relevant iostat bandwidth/latency numbers. The output is
+# dependent on the name of the script/symlink used to call it.
+#
+
+helpstr="
+iostat: Show iostat values since boot (summary page).
+iostat-1s: Do a single 1-second iostat sample and show values.
+iostat-10s: Do a single 10-second iostat sample and show values."
+
+script=$(basename "$0")
+if [ "$1" = "-h" ] ; then
+ echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
+ exit
+fi
+
+if [ "$script" = "iostat-1s" ] ; then
+ # Do a single one-second sample
+ interval=1
+ # Don't show summary stats
+ brief="yes"
+elif [ "$script" = "iostat-10s" ] ; then
+ # Do a single ten-second sample
+ interval=10
+ # Don't show summary stats
+ brief="yes"
+fi
+
+if [ -f "$VDEV_UPATH" ] ; then
+ # We're a file-based vdev; iostat doesn't work on us. Do nothing.
+ exit
+fi
+
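+# The ${interval:+...} expansions below add the sampling arguments only when
+# $interval was set above (the iostat-1s/iostat-10s symlinks); plain 'iostat'
+# reports the since-boot summary instead.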
+if [ "$(uname)" = "FreeBSD" ]; then
+ out=$(iostat -dKx \
+ ${interval:+"-w $interval"} \
+ ${interval:+"-c 1"} \
+ "$VDEV_UPATH" | tail -n 2)
+else
+ out=$(iostat -kx \
+ ${brief:+"-y"} \
+ ${interval:+"$interval"} \
+ ${interval:+"1"} \
+ "$VDEV_UPATH" | awk NF | tail -n 2)
+fi
+
+
+# Sample output (we want the last two lines):
+#
+# Linux 2.6.32-642.13.1.el6.x86_64 (centos68) 03/09/2017 _x86_64_ (6 CPU)
+#
+# avg-cpu: %user %nice %system %iowait %steal %idle
+# 0.00 0.00 0.00 0.00 0.00 100.00
+#
+# Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util
+# sdb 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
+#
+
+# Get the column names
+cols=$(echo "$out" | head -n 1)
+
+# Get the values and tab separate them to make them cut-able.
+vals=$(echo "$out" | tail -n 1 | sed -r 's/[[:blank:]]+/\t/g')
+
+i=0
+for col in $cols ; do
+ i=$((i+1))
+ # Skip the first column since it's just the device name
+ if [ $i -eq 1 ]; then
+ continue
+ fi
+
+ # Get i'th value
+ val=$(echo "$vals" | cut -f "$i")
+ echo "$col=$val"
+done
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-10s b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-10s
new file mode 120000
index 000000000000..084278d99f0f
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-10s
@@ -0,0 +1 @@
+iostat \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-1s b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-1s
new file mode 120000
index 000000000000..084278d99f0f
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/iostat-1s
@@ -0,0 +1 @@
+iostat \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/label b/sys/contrib/openzfs/cmd/zpool/zpool.d/label
new file mode 120000
index 000000000000..7d1e766add99
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/label
@@ -0,0 +1 @@
+lsblk \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/locate_led b/sys/contrib/openzfs/cmd/zpool/zpool.d/locate_led
new file mode 120000
index 000000000000..478d1e8967a1
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/locate_led
@@ -0,0 +1 @@
+ses \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/lsblk b/sys/contrib/openzfs/cmd/zpool/zpool.d/lsblk
new file mode 100755
index 000000000000..1cdef40494fe
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/lsblk
@@ -0,0 +1,83 @@
+#!/bin/sh
+#
+# Print some common lsblk values
+#
+# Any (lowercased) name symlinked to the lsblk script will be passed to lsblk
+# as one of its --output names. Here's a partial list of --output names
+# from the lsblk binary:
+#
+# Available columns (for --output):
+# NAME device name
+# KNAME internal kernel device name
+# MAJ:MIN major:minor device number
+# FSTYPE filesystem type
+# MOUNTPOINT where the device is mounted
+# LABEL filesystem LABEL
+# UUID filesystem UUID
+# RA read-ahead of the device
+# RO read-only device
+# RM removable device
+# MODEL device identifier
+# SIZE size of the device
+# STATE state of the device
+# OWNER user name
+# GROUP group name
+# MODE device node permissions
+# ALIGNMENT alignment offset
+# MIN-IO minimum I/O size
+# OPT-IO optimal I/O size
+# PHY-SEC physical sector size
+# LOG-SEC logical sector size
+# ROTA rotational device
+# SCHED I/O scheduler name
+# RQ-SIZE request queue size
+# TYPE device type
+# DISC-ALN discard alignment offset
+# DISC-GRAN discard granularity
+# DISC-MAX discard max bytes
+# DISC-ZERO discard zeroes data
+#
+# If the script is run as just 'lsblk' then print out disk size, vendor,
+# and model number.
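+#
+# For example (illustrative), the 'model' symlink makes this script run
+# roughly 'lsblk -dl -n -o model "$VDEV_UPATH"' and print 'model=<value>'.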
+
+
+helpstr="
+label: Show filesystem label.
+model: Show disk model number.
+size: Show the disk capacity.
+vendor: Show the disk vendor.
+lsblk: Show the disk size, vendor, and model number."
+
+script=$(basename "$0")
+
+if [ "$1" = "-h" ] ; then
+ echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
+ exit
+fi
+
+if [ "$script" = "lsblk" ] ; then
+ list="size vendor model"
+else
+ list=$(echo "$script" | tr '[:upper:]' '[:lower:]')
+fi
+
+# Older versions of lsblk don't support all these values (like SERIAL).
+for i in $list ; do
+
+ # Special case: Looking up the size of a file-based vdev can't
+ # be done with lsblk.
+ if [ "$i" = "size" ] && [ -f "$VDEV_UPATH" ] ; then
+ size=$(du -h --apparent-size "$VDEV_UPATH" | cut -f 1)
+ echo "size=$size"
+ continue
+ fi
+
+
+ val=""
+ if val=$(eval "lsblk -dl -n -o $i $VDEV_UPATH 2>/dev/null") ; then
+ # Remove leading/trailing whitespace from value
+ val=$(echo "$val" | sed -e 's/^[[:space:]]*//' \
+ -e 's/[[:space:]]*$//')
+ fi
+ echo "$i=$val"
+done
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/media b/sys/contrib/openzfs/cmd/zpool/zpool.d/media
new file mode 100755
index 000000000000..05bc15918bc9
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/media
@@ -0,0 +1,27 @@
+#!/bin/sh
+#
+# Print out the type of device
+#
+
+if [ "$1" = "-h" ] ; then
+ echo "Show whether a vdev is a file, hdd, or ssd."
+ exit
+fi
+
+if [ -b "$VDEV_UPATH" ]; then
+ device=$(basename "$VDEV_UPATH")
+ val=$(cat "/sys/block/$device/queue/rotational" 2>/dev/null)
+ if [ "$val" = "0" ]; then
+ MEDIA="ssd"
+ fi
+
+ if [ "$val" = "1" ]; then
+ MEDIA="hdd"
+ fi
+else
+ if [ -f "$VDEV_UPATH" ]; then
+ MEDIA="file"
+ fi
+fi
+
+echo "media=$MEDIA"
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/model b/sys/contrib/openzfs/cmd/zpool/zpool.d/model
new file mode 120000
index 000000000000..7d1e766add99
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/model
@@ -0,0 +1 @@
+lsblk \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/nonmed b/sys/contrib/openzfs/cmd/zpool/zpool.d/nonmed
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/nonmed
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/nvme_err b/sys/contrib/openzfs/cmd/zpool/zpool.d/nvme_err
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/nvme_err
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/off_ucor b/sys/contrib/openzfs/cmd/zpool/zpool.d/off_ucor
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/off_ucor
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/pend_sec b/sys/contrib/openzfs/cmd/zpool/zpool.d/pend_sec
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/pend_sec
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/pwr_cyc b/sys/contrib/openzfs/cmd/zpool/zpool.d/pwr_cyc
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/pwr_cyc
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/r_proc b/sys/contrib/openzfs/cmd/zpool/zpool.d/r_proc
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/r_proc
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/r_ucor b/sys/contrib/openzfs/cmd/zpool/zpool.d/r_ucor
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/r_ucor
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/realloc b/sys/contrib/openzfs/cmd/zpool/zpool.d/realloc
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/realloc
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/rep_ucor b/sys/contrib/openzfs/cmd/zpool/zpool.d/rep_ucor
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/rep_ucor
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/serial b/sys/contrib/openzfs/cmd/zpool/zpool.d/serial
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/serial
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/ses b/sys/contrib/openzfs/cmd/zpool/zpool.d/ses
new file mode 100755
index 000000000000..f6b7520dfb6c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/ses
@@ -0,0 +1,52 @@
+#!/bin/sh
+#
+# Print SCSI Enclosure Services (SES) info. The output is dependent on the name
+# of the script/symlink used to call it.
+#
+helpstr="
+enc: Show disk enclosure w:x:y:z value.
+slot: Show disk slot number as reported by the enclosure.
+encdev: Show /dev/sg* device associated with the enclosure disk slot.
+fault_led: Show value of the disk enclosure slot fault LED.
+locate_led: Show value of the disk enclosure slot locate LED.
+ses: Show disk's enc, enc device, slot, and fault/locate LED values."
+
+script=$(basename "$0")
+if [ "$1" = "-h" ] ; then
+ echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
+ exit
+fi
+
+if [ "$script" = "ses" ] ; then
+ scripts='enc encdev slot fault_led locate_led'
+else
+ scripts="$script"
+fi
+
+for i in $scripts ; do
+ if [ -z "$VDEV_ENC_SYSFS_PATH" ] ; then
+ echo "$i="
+ continue
+ fi
+
+ val=""
+ case $i in
+ enc)
+ val=$(ls "$VDEV_ENC_SYSFS_PATH/../../" 2>/dev/null)
+ ;;
+ slot)
+ val=$(cat "$VDEV_ENC_SYSFS_PATH/slot" 2>/dev/null)
+ ;;
+ encdev)
+ val=$(ls "$VDEV_ENC_SYSFS_PATH/../device/scsi_generic" 2>/dev/null)
+ ;;
+ fault_led)
+ val=$(cat "$VDEV_ENC_SYSFS_PATH/fault" 2>/dev/null)
+ ;;
+ locate_led)
+ val=$(cat "$VDEV_ENC_SYSFS_PATH/locate" 2>/dev/null)
+ ;;
+ esac
+ echo "$i=$val"
+done
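+
+# Illustrative example (hypothetical sysfs path): with
+# VDEV_ENC_SYSFS_PATH=/sys/class/enclosure/0:0:0:0/Slot01 exported by zpool,
+# invoking this script via the 'slot' symlink prints "slot=" followed by the
+# contents of that directory's 'slot' attribute, while invoking it as 'ses'
+# prints all five key=value pairs. With no enclosure path set, only empty
+# values are printed.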
+
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/size b/sys/contrib/openzfs/cmd/zpool/zpool.d/size
new file mode 120000
index 000000000000..7d1e766add99
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/size
@@ -0,0 +1 @@
+lsblk \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/slot b/sys/contrib/openzfs/cmd/zpool/zpool.d/slot
new file mode 120000
index 000000000000..478d1e8967a1
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/slot
@@ -0,0 +1 @@
+ses \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/smart b/sys/contrib/openzfs/cmd/zpool/zpool.d/smart
new file mode 100755
index 000000000000..f8854b75227c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/smart
@@ -0,0 +1,243 @@
+#!/bin/sh
+#
+# Show SMART stats
+#
+
+helpstr="
+smart: Show SMART temperature and error stats (specific to drive type).
+smartx: Show SMART extended drive stats (specific to drive type).
+temp: Show SMART drive temperature in Celsius (all drives).
+health: Show reported SMART status (all drives).
+r_proc: Show SMART read GBytes processed over drive lifetime (SAS).
+w_proc: Show SMART write GBytes processed over drive lifetime (SAS).
+r_ucor: Show SMART read uncorrectable errors (SAS).
+w_ucor: Show SMART write uncorrectable errors (SAS).
+nonmed: Show SMART non-medium errors (SAS).
+defect: Show SMART grown defect list (SAS).
+hours_on: Show number of hours drive powered on (all drives).
+realloc: Show SMART reallocated sectors count (ATA).
+rep_ucor: Show SMART reported uncorrectable count (ATA).
+cmd_to: Show SMART command timeout count (ATA).
+pend_sec: Show SMART current pending sector count (ATA).
+off_ucor: Show SMART offline uncorrectable errors (ATA).
+ata_err: Show SMART ATA errors (ATA).
+pwr_cyc: Show SMART power cycle count (ATA).
+serial: Show disk serial number.
+nvme_err: Show SMART NVMe errors (NVMe).
+smart_test: Show SMART self-test results summary.
+test_type: Show SMART self-test type (short, long, ...).
+test_status: Show SMART self-test status.
+test_progress: Show SMART self-test percentage done.
+test_ended: Show when the last SMART self-test ended (if supported).
+"
+
+# Hack for developer testing
+#
+# If you set $samples to a directory containing smartctl output text files,
+# we will use them instead of running smartctl on the vdevs. This can be
+# useful if you want to test a bunch of different smartctl outputs. Also, if
+# $samples is set, an additional 'file' column is added to the zpool output
+# showing the filename.
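+#
+# For instance (hypothetical directory), setting
+#     samples=/tmp/smartctl_samples
+# where that directory holds plain-text captures made with e.g.
+#     smartctl -a /dev/sda > /tmp/smartctl_samples/sda.txt
+# makes each vdev report stats parsed from one of those files instead of
+# from a live smartctl invocation.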
+samples=
+
+# get_filename_from_dir DIR
+#
+# Look in directory DIR and return a filename from it. The filename returned
+# is chosen quasi-sequentially (based off our PID). This allows us to return
+# a different filename every time this script is invoked (which we do for each
+# vdev), without having to maintain state.
+get_filename_from_dir()
+{
+ dir=$1
+ pid="$$"
+ num_files=$(find "$dir" -maxdepth 1 -type f | wc -l)
+ mod=$((pid % num_files))
+ i=0
+ find "$dir" -type f -printf "%f\n" | while read -r file ; do
+ if [ "$mod" = "$i" ] ; then
+ echo "$file"
+ break
+ fi
+ i=$((i+1))
+ done
+}
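+
+# Worked example: with 4 files in $dir and PID 12347, mod = 12347 % 4 = 3, so
+# the loop echoes the 4th filename listed by find. Because every per-vdev
+# invocation of this script is a new process, different vdevs tend to pick
+# different sample files without any shared state.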
+
+script=$(basename "$0")
+
+if [ "$1" = "-h" ] ; then
+ echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2-
+ exit
+fi
+
+smartctl_path=$(command -v smartctl)
+
+# shellcheck disable=SC2015
+if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then
+ if [ -n "$samples" ] ; then
+ # cat a smartctl output text file instead of running smartctl
+ # on a vdev (only used for developer testing).
+ file=$(get_filename_from_dir "$samples")
+ echo "file=$file"
+ raw_out=$(cat "$samples/$file")
+ else
+ raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH")
+ fi
+
+ # What kind of drive are we? Look for the right line in smartctl:
+ #
+ # SAS:
+ # Transport protocol: SAS
+ #
+ # SATA:
+ # ATA Version is: 8
+ #
+ # NVMe:
+ # SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
+ #
+ out=$(echo "$raw_out" | awk '
+# SAS specific
+/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
+/write:/{print "rwr="$4"\nw_cor="$5"\nw_proc="$7"\nw_ucor="$8}
+/Non-medium error count/{print "nonmed="$4}
+/Elements in grown defect list/{print "defect="$6}
+
+# SAS common
+/SAS/{type="sas"}
+/Drive Temperature:/{print "temp="$4}
+# Status can be a long string, substitute spaces for '_'
+/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
+/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
+/Serial number:/{print "serial="$3}
+
+# SATA specific
+/Reallocated_Sector_Ct/{print "realloc="$10}
+/Reported_Uncorrect/{print "rep_ucor="$10}
+/Command_Timeout/{print "cmd_to="$10}
+/Current_Pending_Sector/{print "pend_sec="$10}
+/Offline_Uncorrectable/{print "off_ucor="$10}
+/ATA Error Count:/{print "ata_err="$4}
+/Power_Cycle_Count/{print "pwr_cyc="$10}
+
+# SATA common
+/SATA/{type="sata"}
+/Temperature_Celsius/{print "temp="$10}
+/Airflow_Temperature_Cel/{print "temp="$10}
+/Current Temperature:/{print "temp="$3}
+/SMART overall-health self-assessment test result:/{print "health="$6}
+/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}
+/Serial Number:/{print "serial="$3}
+
+# NVMe common
+/NVMe/{type="nvme"}
+/Temperature:/{print "temp="$2}
+/SMART overall-health self-assessment test result:/{print "health="$6}
+/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
+/Serial Number:/{print "serial="$3}
+/Power Cycles:/{print "pwr_cyc="$3}
+
+# NVMe specific
+/Media and Data Integrity Errors:/{print "nvme_err="$6}
+
+# SMART self-test info
+/Self-test execution status:/{progress=tolower($4)} # SAS
+/SMART Self-test log/{test_seen=1} # SAS
+/SMART Extended Self-test Log/{test_seen=1} # SATA
+/# 1/{
+ test_type=tolower($3"_"$4);
+ # Status could be one word ("Completed") or multiple ("Completed: read
+ # failure"). Look for the ":" to see if we need to grab more words.
+
+ if ($5 ~ ":")
+ status=tolower($5""$6"_"$7)
+ else
+ status=tolower($5)
+ if (status=="self")
+ status="running";
+
+ if (type == "sas") {
+ hours=int($(NF-4))
+ } else {
+ hours=int($(NF-1))
+ # SATA reports percent remaining, rather than percent done
+ # Convert it to percent done.
+ progress=(100-int($(NF-2)))"%"
+ }
+ # When we int()-ify "hours", it converts stuff like "NOW" and "-" into
+ # 0. In those cases, set it to hours_on, so they will cancel out in
+ # the "hours_ago" calculation later on.
+ if (hours == 0)
+ hours=hours_on
+
+ if (test_seen) {
+ print "test="hours_on
+ print "test_type="test_type
+ print "test_status="status
+ print "test_progress="progress
+ }
+ # Not all drives report hours_on
+ if (hours_on && hours) {
+ total_hours_ago=(hours_on-hours)
+ days_ago=int(total_hours_ago/24)
+ hours_ago=(total_hours_ago % 24)
+ if (days_ago != 0)
+ ago_str=days_ago"d"
+ if (hours_ago !=0)
+ ago_str=ago_str""hours_ago"h"
+ print "test_ended="ago_str
+ }
+}
+
+END {print "type="type; ORS="\n"; print ""}
+');
+fi
+type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2)
+
+# If type is not set by now, either we don't have a block device
+# or smartctl failed. Either way, default to ATA and set $out to
+# nothing.
+if [ -z "$type" ]; then
+ type="sata"
+ out=
+fi
+
+case $script in
+smart)
+ # Print temperature plus common predictors of drive failure
+ if [ "$type" = "sas" ] ; then
+ scripts="temp|health|r_ucor|w_ucor"
+ elif [ "$type" = "sata" ] ; then
+ scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor"
+ elif [ "$type" = "nvme" ] ; then
+ scripts="temp|health|nvme_err"
+ fi
+ ;;
+smartx)
+ # Print some other interesting stats
+ if [ "$type" = "sas" ] ; then
+ scripts="hours_on|defect|nonmed|r_proc|w_proc"
+ elif [ "$type" = "sata" ] ; then
+ scripts="hours_on|pwr_cyc"
+ elif [ "$type" = "nvme" ] ; then
+ scripts="hours_on|pwr_cyc"
+ fi
+ ;;
+smart_test)
+ scripts="test_type|test_status|test_progress|test_ended"
+ ;;
+*)
+ scripts="$script"
+esac
+
+with_vals=$(echo "$out" | grep -E "$scripts")
+if [ -n "$with_vals" ]; then
+ echo "$with_vals"
+ without_vals=$(echo "$scripts" | tr "|" "\n" |
+ grep -v -E "$(echo "$with_vals" |
+ awk -F "=" '{print $1}')" | awk '{print $0"="}')
+else
+ without_vals=$(echo "$scripts" | tr "|" "\n" | awk '{print $0"="}')
+fi
+
+if [ -n "$without_vals" ]; then
+ echo "$without_vals"
+fi
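+
+# Illustrative output (values are made up): called as 'smart' against a SATA
+# drive, this might print lines such as
+#   temp=34
+#   health=PASSED
+#   ata_err=
+# i.e. every selected column is echoed, with an empty value for any stat the
+# drive did not report.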
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/smart_test b/sys/contrib/openzfs/cmd/zpool/zpool.d/smart_test
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/smart_test
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/smartx b/sys/contrib/openzfs/cmd/zpool/zpool.d/smartx
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/smartx
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/temp b/sys/contrib/openzfs/cmd/zpool/zpool.d/temp
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/temp
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/test_ended b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_ended
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_ended
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/test_progress b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_progress
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_progress
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/test_status b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_status
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_status
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/test_type b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_type
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/test_type
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/upath b/sys/contrib/openzfs/cmd/zpool/zpool.d/upath
new file mode 100755
index 000000000000..16a4327d4850
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/upath
@@ -0,0 +1,7 @@
+#!/bin/sh
+if [ "$1" = "-h" ] ; then
+ echo "Show the underlying path for a device."
+ exit
+fi
+
+echo upath="$VDEV_UPATH"
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/vendor b/sys/contrib/openzfs/cmd/zpool/zpool.d/vendor
new file mode 120000
index 000000000000..7d1e766add99
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/vendor
@@ -0,0 +1 @@
+lsblk \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/w_proc b/sys/contrib/openzfs/cmd/zpool/zpool.d/w_proc
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/w_proc
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool.d/w_ucor b/sys/contrib/openzfs/cmd/zpool/zpool.d/w_ucor
new file mode 120000
index 000000000000..94f22861f0ce
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool.d/w_ucor
@@ -0,0 +1 @@
+smart \ No newline at end of file
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c
new file mode 100644
index 000000000000..5f3153bca2c2
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c
@@ -0,0 +1,757 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ */
+
+#include <libintl.h>
+#include <libuutil.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <thread_pool.h>
+
+#include <libzfs.h>
+#include <libzutil.h>
+#include <sys/zfs_context.h>
+#include <sys/wait.h>
+
+#include "zpool_util.h"
+
+/*
+ * Private interface for iterating over pools specified on the command line.
+ * Most consumers will call for_each_pool, but in order to support iostat, we
+ * allow fine-grained control through the zpool_list_t interface.
+ */
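+
+/*
+ * A minimal usage sketch of this interface (iostat-style callers create the
+ * list once and then iterate it repeatedly); my_callback and my_data stand in
+ * for a caller-supplied zpool_iter_f function and its argument:
+ *
+ *	int err = 0;
+ *	zpool_list_t *list = pool_list_get(argc, argv, NULL, &err);
+ *	(void) pool_list_iter(list, B_FALSE, my_callback, my_data);
+ *	pool_list_free(list);
+ */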
+
+typedef struct zpool_node {
+ zpool_handle_t *zn_handle;
+ uu_avl_node_t zn_avlnode;
+ int zn_mark;
+} zpool_node_t;
+
+struct zpool_list {
+ boolean_t zl_findall;
+ uu_avl_t *zl_avl;
+ uu_avl_pool_t *zl_pool;
+ zprop_list_t **zl_proplist;
+};
+
+/* ARGSUSED */
+static int
+zpool_compare(const void *larg, const void *rarg, void *unused)
+{
+ zpool_handle_t *l = ((zpool_node_t *)larg)->zn_handle;
+ zpool_handle_t *r = ((zpool_node_t *)rarg)->zn_handle;
+ const char *lname = zpool_get_name(l);
+ const char *rname = zpool_get_name(r);
+
+ return (strcmp(lname, rname));
+}
+
+/*
+ * Callback function for pool_list_get(). Adds the given pool to the AVL tree
+ * of known pools.
+ */
+static int
+add_pool(zpool_handle_t *zhp, void *data)
+{
+ zpool_list_t *zlp = data;
+ zpool_node_t *node = safe_malloc(sizeof (zpool_node_t));
+ uu_avl_index_t idx;
+
+ node->zn_handle = zhp;
+ uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool);
+ if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) {
+ if (zlp->zl_proplist &&
+ zpool_expand_proplist(zhp, zlp->zl_proplist) != 0) {
+ zpool_close(zhp);
+ free(node);
+ return (-1);
+ }
+ uu_avl_insert(zlp->zl_avl, node, idx);
+ } else {
+ zpool_close(zhp);
+ free(node);
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * Create a list of pools based on the given arguments. If we're given no
+ * arguments, then iterate over all pools in the system and add them to the AVL
+ * tree. Otherwise, add only those pools explicitly specified on the command
+ * line.
+ */
+zpool_list_t *
+pool_list_get(int argc, char **argv, zprop_list_t **proplist, int *err)
+{
+ zpool_list_t *zlp;
+
+ zlp = safe_malloc(sizeof (zpool_list_t));
+
+ zlp->zl_pool = uu_avl_pool_create("zfs_pool", sizeof (zpool_node_t),
+ offsetof(zpool_node_t, zn_avlnode), zpool_compare, UU_DEFAULT);
+
+ if (zlp->zl_pool == NULL)
+ zpool_no_memory();
+
+ if ((zlp->zl_avl = uu_avl_create(zlp->zl_pool, NULL,
+ UU_DEFAULT)) == NULL)
+ zpool_no_memory();
+
+ zlp->zl_proplist = proplist;
+
+ if (argc == 0) {
+ (void) zpool_iter(g_zfs, add_pool, zlp);
+ zlp->zl_findall = B_TRUE;
+ } else {
+ int i;
+
+ for (i = 0; i < argc; i++) {
+ zpool_handle_t *zhp;
+
+ if ((zhp = zpool_open_canfail(g_zfs, argv[i])) !=
+ NULL) {
+ if (add_pool(zhp, zlp) != 0)
+ *err = B_TRUE;
+ } else {
+ *err = B_TRUE;
+ }
+ }
+ }
+
+ return (zlp);
+}
+
+/*
+ * Search for any new pools, adding them to the list. We only add pools when
+ * no pool arguments were given on the command line. Otherwise, we keep the
+ * list fixed as those that were explicitly specified.
+ */
+void
+pool_list_update(zpool_list_t *zlp)
+{
+ if (zlp->zl_findall)
+ (void) zpool_iter(g_zfs, add_pool, zlp);
+}
+
+/*
+ * Iterate over all pools in the list, executing the callback for each
+ */
+int
+pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func,
+ void *data)
+{
+ zpool_node_t *node, *next_node;
+ int ret = 0;
+
+ for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next_node) {
+ next_node = uu_avl_next(zlp->zl_avl, node);
+ if (zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL ||
+ unavail)
+ ret |= func(node->zn_handle, data);
+ }
+
+ return (ret);
+}
+
+/*
+ * Remove the given pool from the list. When running iostat, we want to remove
+ * those pools that no longer exist.
+ */
+void
+pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp)
+{
+ zpool_node_t search, *node;
+
+ search.zn_handle = zhp;
+ if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) {
+ uu_avl_remove(zlp->zl_avl, node);
+ zpool_close(node->zn_handle);
+ free(node);
+ }
+}
+
+/*
+ * Free all the handles associated with this list.
+ */
+void
+pool_list_free(zpool_list_t *zlp)
+{
+ uu_avl_walk_t *walk;
+ zpool_node_t *node;
+
+ if ((walk = uu_avl_walk_start(zlp->zl_avl, UU_WALK_ROBUST)) == NULL) {
+ (void) fprintf(stderr,
+ gettext("internal error: out of memory"));
+ exit(1);
+ }
+
+ while ((node = uu_avl_walk_next(walk)) != NULL) {
+ uu_avl_remove(zlp->zl_avl, node);
+ zpool_close(node->zn_handle);
+ free(node);
+ }
+
+ uu_avl_walk_end(walk);
+ uu_avl_destroy(zlp->zl_avl);
+ uu_avl_pool_destroy(zlp->zl_pool);
+
+ free(zlp);
+}
+
+/*
+ * Returns the number of elements in the pool list.
+ */
+int
+pool_list_count(zpool_list_t *zlp)
+{
+ return (uu_avl_numnodes(zlp->zl_avl));
+}
+
+/*
+ * High level function which iterates over all pools given on the command line,
+ * using the pool_list_* interfaces.
+ */
+int
+for_each_pool(int argc, char **argv, boolean_t unavail,
+ zprop_list_t **proplist, zpool_iter_f func, void *data)
+{
+ zpool_list_t *list;
+ int ret = 0;
+
+ if ((list = pool_list_get(argc, argv, proplist, &ret)) == NULL)
+ return (1);
+
+ if (pool_list_iter(list, unavail, func, data) != 0)
+ ret = 1;
+
+ pool_list_free(list);
+
+ return (ret);
+}
+
+static int
+for_each_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, pool_vdev_iter_f func,
+ void *data)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ int ret = 0;
+ int i;
+ char *type;
+
+ const char *list[] = {
+ ZPOOL_CONFIG_SPARES,
+ ZPOOL_CONFIG_L2CACHE,
+ ZPOOL_CONFIG_CHILDREN
+ };
+
+ for (i = 0; i < ARRAY_SIZE(list); i++) {
+ if (nvlist_lookup_nvlist_array(nv, list[i], &child,
+ &children) == 0) {
+ for (c = 0; c < children; c++) {
+ uint64_t ishole = 0;
+
+ (void) nvlist_lookup_uint64(child[c],
+ ZPOOL_CONFIG_IS_HOLE, &ishole);
+
+ if (ishole)
+ continue;
+
+ ret |= for_each_vdev_cb(zhp, child[c], func,
+ data);
+ }
+ }
+ }
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+ return (ret);
+
+ /* Don't run our function on root vdevs */
+ if (strcmp(type, VDEV_TYPE_ROOT) != 0) {
+ ret |= func(zhp, nv, data);
+ }
+
+ return (ret);
+}
+
+/*
+ * This is the equivalent of for_each_pool() for vdevs. It iterates through
+ * all vdevs in the pool, ignoring root vdevs and holes, calling func() on
+ * each one.
+ *
+ * @zhp: Zpool handle
+ * @func: Function to call on each vdev
+ * @data: Custom data to pass to the function
+ */
+int
+for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data)
+{
+ nvlist_t *config, *nvroot = NULL;
+
+ if ((config = zpool_get_config(zhp, NULL)) != NULL) {
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ }
+ return (for_each_vdev_cb(zhp, nvroot, func, data));
+}
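+
+/*
+ * Example (hypothetical callback): counting a pool's non-root, non-hole vdevs
+ * with this iterator.
+ *
+ *	static int
+ *	count_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *data)
+ *	{
+ *		(*(int *)data)++;
+ *		return (0);
+ *	}
+ *
+ *	int count = 0;
+ *	(void) for_each_vdev(zhp, count_vdev_cb, &count);
+ */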
+
+/*
+ * Process the vcdl->data[] array (of vdev_cmd_data_t) to figure out all the
+ * unique column names and their widths. When this function is done,
+ * vcdl->uniq_cols, vcdl->uniq_cols_cnt, and vcdl->uniq_cols_width will be
+ * filled in.
+ */
+static void
+process_unique_cmd_columns(vdev_cmd_data_list_t *vcdl)
+{
+ char **uniq_cols = NULL, **tmp = NULL;
+ int *uniq_cols_width;
+ vdev_cmd_data_t *data;
+ int cnt = 0;
+ int k;
+
+ /* For each vdev */
+ for (int i = 0; i < vcdl->count; i++) {
+ data = &vcdl->data[i];
+ /* For each column the vdev reported */
+ for (int j = 0; j < data->cols_cnt; j++) {
+ /* Is this column in our list of unique column names? */
+ for (k = 0; k < cnt; k++) {
+ if (strcmp(data->cols[j], uniq_cols[k]) == 0)
+ break; /* yes it is */
+ }
+ if (k == cnt) {
+ /* No entry for column, add to list */
+ tmp = realloc(uniq_cols, sizeof (*uniq_cols) *
+ (cnt + 1));
+ if (tmp == NULL)
+ break; /* Nothing we can do... */
+ uniq_cols = tmp;
+ uniq_cols[cnt] = data->cols[j];
+ cnt++;
+ }
+ }
+ }
+
+ /*
+ * We now have a list of all the unique column names. Figure out the
+ * max width of each column by looking at the column name and all its
+ * values.
+ */
+ uniq_cols_width = safe_malloc(sizeof (*uniq_cols_width) * cnt);
+ for (int i = 0; i < cnt; i++) {
+ /* Start off with the column title's width */
+ uniq_cols_width[i] = strlen(uniq_cols[i]);
+ /* For each vdev */
+ for (int j = 0; j < vcdl->count; j++) {
+ /* For each of the vdev's values in a column */
+ data = &vcdl->data[j];
+ for (k = 0; k < data->cols_cnt; k++) {
+ /* Does this vdev have a value for this col? */
+ if (strcmp(data->cols[k], uniq_cols[i]) == 0) {
+ /* Is the value width larger? */
+ uniq_cols_width[i] =
+ MAX(uniq_cols_width[i],
+ strlen(data->lines[k]));
+ }
+ }
+ }
+ }
+
+ vcdl->uniq_cols = uniq_cols;
+ vcdl->uniq_cols_cnt = cnt;
+ vcdl->uniq_cols_width = uniq_cols_width;
+}
+
+
+/*
+ * Process a line of command output
+ *
+ * When running 'zpool iostat|status -c' the lines of output can either be
+ * in the form of:
+ *
+ * column_name=value
+ *
+ * Or just:
+ *
+ * value
+ *
+ * Process the column_name (if any) and value.
+ *
+ * Returns 0 if the line was processed and there are more lines that can still
+ * be processed.
+ *
+ * Returns 1 if this was the last line to process, or on error.
+ */
+static int
+vdev_process_cmd_output(vdev_cmd_data_t *data, char *line)
+{
+ char *col = NULL;
+ char *val = line;
+ char *equals;
+ char **tmp;
+
+ if (line == NULL)
+ return (1);
+
+ equals = strchr(line, '=');
+ if (equals != NULL) {
+ /*
+ * We have a 'column=value' type line. Split it into the
+ * column and value strings by turning the '=' into a '\0'.
+ */
+ *equals = '\0';
+ col = line;
+ val = equals + 1;
+ } else {
+ val = line;
+ }
+
+ /* Do we already have a column by this name? If so, skip it. */
+ if (col != NULL) {
+ for (int i = 0; i < data->cols_cnt; i++) {
+ if (strcmp(col, data->cols[i]) == 0)
+ return (0); /* Duplicate, skip */
+ }
+ }
+
+ if (val != NULL) {
+ tmp = realloc(data->lines,
+ (data->lines_cnt + 1) * sizeof (*data->lines));
+ if (tmp == NULL)
+ return (1);
+
+ data->lines = tmp;
+ data->lines[data->lines_cnt] = strdup(val);
+ data->lines_cnt++;
+ }
+
+ if (col != NULL) {
+ tmp = realloc(data->cols,
+ (data->cols_cnt + 1) * sizeof (*data->cols));
+ if (tmp == NULL)
+ return (1);
+
+ data->cols = tmp;
+ data->cols[data->cols_cnt] = strdup(col);
+ data->cols_cnt++;
+ }
+
+ if (val != NULL && col == NULL)
+ return (1);
+
+ return (0);
+}
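+
+/*
+ * For example, feeding this function the line "temp=34" appends "temp" to
+ * data->cols and "34" to data->lines and returns 0; feeding it the bare line
+ * "34" stores only the value and returns 1, since a value with no column name
+ * is treated as the last (or only) line of script output.
+ */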
+
+/*
+ * Run the cmd and store results in *data.
+ */
+static void
+vdev_run_cmd(vdev_cmd_data_t *data, char *cmd)
+{
+ int rc;
+ char *argv[2] = {cmd, 0};
+ char *env[5] = {"PATH=/bin:/sbin:/usr/bin:/usr/sbin", NULL, NULL, NULL,
+ NULL};
+ char **lines = NULL;
+ int lines_cnt = 0;
+ int i;
+
+ /* Setup our custom environment variables */
+ rc = asprintf(&env[1], "VDEV_PATH=%s",
+ data->path ? data->path : "");
+ if (rc == -1)
+ goto out;
+
+ rc = asprintf(&env[2], "VDEV_UPATH=%s",
+ data->upath ? data->upath : "");
+ if (rc == -1)
+ goto out;
+
+ rc = asprintf(&env[3], "VDEV_ENC_SYSFS_PATH=%s",
+ data->vdev_enc_sysfs_path ?
+ data->vdev_enc_sysfs_path : "");
+ if (rc == -1)
+ goto out;
+
+ /* Run the command */
+ rc = libzfs_run_process_get_stdout_nopath(cmd, argv, env, &lines,
+ &lines_cnt);
+ if (rc != 0)
+ goto out;
+
+ /* Process the output we got */
+ for (i = 0; i < lines_cnt; i++)
+ if (vdev_process_cmd_output(data, lines[i]) != 0)
+ break;
+
+out:
+ if (lines != NULL)
+ libzfs_free_str_array(lines, lines_cnt);
+
+ /* Start with i = 1 since env[0] was statically allocated */
+ for (i = 1; i < ARRAY_SIZE(env); i++)
+ if (env[i] != NULL)
+ free(env[i]);
+}
+
+/*
+ * Generate the search path for zpool iostat/status -c scripts.
+ * The string returned must be freed.
+ */
+char *
+zpool_get_cmd_search_path(void)
+{
+ const char *env;
+ char *sp = NULL;
+
+ env = getenv("ZPOOL_SCRIPTS_PATH");
+ if (env != NULL)
+ return (strdup(env));
+
+ env = getenv("HOME");
+ if (env != NULL) {
+ if (asprintf(&sp, "%s/.zpool.d:%s",
+ env, ZPOOL_SCRIPTS_DIR) != -1) {
+ return (sp);
+ }
+ }
+
+ if (asprintf(&sp, "%s", ZPOOL_SCRIPTS_DIR) != -1)
+ return (sp);
+
+ return (NULL);
+}
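+
+/*
+ * For example (illustrative values): with ZPOOL_SCRIPTS_PATH unset and
+ * HOME=/root, this returns "/root/.zpool.d:" followed by ZPOOL_SCRIPTS_DIR;
+ * running
+ *
+ *	export ZPOOL_SCRIPTS_PATH=/opt/myscripts
+ *
+ * before 'zpool status -c' makes that directory the sole search path.
+ */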
+
+/* Thread function run for each vdev */
+static void
+vdev_run_cmd_thread(void *cb_cmd_data)
+{
+ vdev_cmd_data_t *data = cb_cmd_data;
+ char *cmd = NULL, *cmddup, *cmdrest;
+
+ cmddup = strdup(data->cmd);
+ if (cmddup == NULL)
+ return;
+
+ cmdrest = cmddup;
+ while ((cmd = strtok_r(cmdrest, ",", &cmdrest))) {
+ char *dir = NULL, *sp, *sprest;
+ char fullpath[MAXPATHLEN];
+
+ if (strchr(cmd, '/') != NULL)
+ continue;
+
+ sp = zpool_get_cmd_search_path();
+ if (sp == NULL)
+ continue;
+
+ sprest = sp;
+ while ((dir = strtok_r(sprest, ":", &sprest))) {
+ if (snprintf(fullpath, sizeof (fullpath),
+ "%s/%s", dir, cmd) == -1)
+ continue;
+
+ if (access(fullpath, X_OK) == 0) {
+ vdev_run_cmd(data, fullpath);
+ break;
+ }
+ }
+ free(sp);
+ }
+ free(cmddup);
+}
+
+/* For each vdev in the pool run a command */
+static int
+for_each_vdev_run_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_vcdl)
+{
+ vdev_cmd_data_list_t *vcdl = cb_vcdl;
+ vdev_cmd_data_t *data;
+ char *path = NULL;
+ char *vname = NULL;
+ char *vdev_enc_sysfs_path = NULL;
+ int i, match = 0;
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
+ return (1);
+
+ nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+ &vdev_enc_sysfs_path);
+
+ /* Spares can show up more than once if they're in use, so skip duplicates */
+ for (i = 0; i < vcdl->count; i++) {
+ if ((strcmp(vcdl->data[i].path, path) == 0) &&
+ (strcmp(vcdl->data[i].pool, zpool_get_name(zhp)) == 0)) {
+ /* vdev already exists, skip it */
+ return (0);
+ }
+ }
+
+ /* Check for selected vdevs here, if any */
+ for (i = 0; i < vcdl->vdev_names_count; i++) {
+ vname = zpool_vdev_name(g_zfs, zhp, nv, vcdl->cb_name_flags);
+ if (strcmp(vcdl->vdev_names[i], vname) == 0) {
+ free(vname);
+ match = 1;
+ break; /* match */
+ }
+ free(vname);
+ }
+
+ /* If we selected vdevs, and this isn't one of them, then bail out */
+ if (!match && vcdl->vdev_names_count)
+ return (0);
+
+ /*
+ * Resize our array and add in the new element.
+ */
+ if (!(vcdl->data = realloc(vcdl->data,
+ sizeof (*vcdl->data) * (vcdl->count + 1))))
+ return (ENOMEM); /* couldn't realloc */
+
+ data = &vcdl->data[vcdl->count];
+
+ data->pool = strdup(zpool_get_name(zhp));
+ data->path = strdup(path);
+ data->upath = zfs_get_underlying_path(path);
+ data->cmd = vcdl->cmd;
+ data->lines = data->cols = NULL;
+ data->lines_cnt = data->cols_cnt = 0;
+ if (vdev_enc_sysfs_path)
+ data->vdev_enc_sysfs_path = strdup(vdev_enc_sysfs_path);
+ else
+ data->vdev_enc_sysfs_path = NULL;
+
+ vcdl->count++;
+
+ return (0);
+}
+
+/* Get the names and count of the vdevs */
+static int
+all_pools_for_each_vdev_gather_cb(zpool_handle_t *zhp, void *cb_vcdl)
+{
+ return (for_each_vdev(zhp, for_each_vdev_run_cb, cb_vcdl));
+}
+
+/*
+ * Now that vcdl is populated with our complete list of vdevs, spawn
+ * off the commands.
+ */
+static void
+all_pools_for_each_vdev_run_vcdl(vdev_cmd_data_list_t *vcdl)
+{
+ tpool_t *t;
+
+ t = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL);
+ if (t == NULL)
+ return;
+
+ /* Spawn off the command for each vdev */
+ for (int i = 0; i < vcdl->count; i++) {
+ (void) tpool_dispatch(t, vdev_run_cmd_thread,
+ (void *) &vcdl->data[i]);
+ }
+
+ /* Wait for threads to finish */
+ tpool_wait(t);
+ tpool_destroy(t);
+}
+
+/*
+ * Run command 'cmd' on all vdevs in all pools in argv. Saves the command's
+ * output lines in vcdl->data[].lines for each vdev. If you want
+ * to run the command on only certain vdevs, fill in g_zfs, vdev_names,
+ * vdev_names_count, and cb_name_flags. Otherwise leave them as zero.
+ *
+ * Returns a vdev_cmd_data_list_t that must be freed with
+ * free_vdev_cmd_data_list();
+ */
+vdev_cmd_data_list_t *
+all_pools_for_each_vdev_run(int argc, char **argv, char *cmd,
+ libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count,
+ int cb_name_flags)
+{
+ vdev_cmd_data_list_t *vcdl;
+ vcdl = safe_malloc(sizeof (vdev_cmd_data_list_t));
+ vcdl->cmd = cmd;
+
+ vcdl->vdev_names = vdev_names;
+ vcdl->vdev_names_count = vdev_names_count;
+ vcdl->cb_name_flags = cb_name_flags;
+ vcdl->g_zfs = g_zfs;
+
+ /* Gather our list of all vdevs in all pools */
+ for_each_pool(argc, argv, B_TRUE, NULL,
+ all_pools_for_each_vdev_gather_cb, vcdl);
+
+ /* Run command on all vdevs in all pools */
+ all_pools_for_each_vdev_run_vcdl(vcdl);
+
+ /*
+ * vcdl->data[] now contains all the column names and values for each
+ * vdev. We need to process that into a master list of unique column
+ * names, and figure out the width of each column.
+ */
+ process_unique_cmd_columns(vcdl);
+
+ return (vcdl);
+}
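+
+/*
+ * Typical call sequence (sketch) for a 'zpool status/iostat -c' style caller,
+ * where cmd_string stands in for the user's comma-separated script list:
+ *
+ *	vdev_cmd_data_list_t *vcdl;
+ *	vcdl = all_pools_for_each_vdev_run(argc, argv, cmd_string, g_zfs,
+ *	    NULL, 0, 0);
+ *	(print vcdl->data[] using vcdl->uniq_cols[] and uniq_cols_width[])
+ *	free_vdev_cmd_data_list(vcdl);
+ */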
+
+/*
+ * Free the vdev_cmd_data_list_t created by all_pools_for_each_vdev_run()
+ */
+void
+free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl)
+{
+ free(vcdl->uniq_cols);
+ free(vcdl->uniq_cols_width);
+
+ for (int i = 0; i < vcdl->count; i++) {
+ free(vcdl->data[i].path);
+ free(vcdl->data[i].pool);
+ free(vcdl->data[i].upath);
+
+ for (int j = 0; j < vcdl->data[i].lines_cnt; j++)
+ free(vcdl->data[i].lines[j]);
+
+ free(vcdl->data[i].lines);
+
+ for (int j = 0; j < vcdl->data[i].cols_cnt; j++)
+ free(vcdl->data[i].cols[j]);
+
+ free(vcdl->data[i].cols);
+ free(vcdl->data[i].vdev_enc_sysfs_path);
+ }
+ free(vcdl->data);
+ free(vcdl);
+}
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
new file mode 100644
index 000000000000..adbb78a8effd
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c
@@ -0,0 +1,10329 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Frederik Wessels. All rights reserved.
+ * Copyright (c) 2012 by Cyril Plisko. All rights reserved.
+ * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <libuutil.h>
+#include <locale.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <time.h>
+#include <unistd.h>
+#include <pwd.h>
+#include <zone.h>
+#include <sys/wait.h>
+#include <zfs_prop.h>
+#include <sys/fs/zfs.h>
+#include <sys/stat.h>
+#include <sys/systeminfo.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/fm/util.h>
+#include <sys/fm/protocol.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/mount.h>
+#include <sys/sysmacros.h>
+
+#include <math.h>
+
+#include <libzfs.h>
+#include <libzutil.h>
+
+#include "zpool_util.h"
+#include "zfs_comutil.h"
+#include "zfeature_common.h"
+
+#include "statcommon.h"
+
+libzfs_handle_t *g_zfs;
+
+static int zpool_do_create(int, char **);
+static int zpool_do_destroy(int, char **);
+
+static int zpool_do_add(int, char **);
+static int zpool_do_remove(int, char **);
+static int zpool_do_labelclear(int, char **);
+
+static int zpool_do_checkpoint(int, char **);
+
+static int zpool_do_list(int, char **);
+static int zpool_do_iostat(int, char **);
+static int zpool_do_status(int, char **);
+
+static int zpool_do_online(int, char **);
+static int zpool_do_offline(int, char **);
+static int zpool_do_clear(int, char **);
+static int zpool_do_reopen(int, char **);
+
+static int zpool_do_reguid(int, char **);
+
+static int zpool_do_attach(int, char **);
+static int zpool_do_detach(int, char **);
+static int zpool_do_replace(int, char **);
+static int zpool_do_split(int, char **);
+
+static int zpool_do_initialize(int, char **);
+static int zpool_do_scrub(int, char **);
+static int zpool_do_resilver(int, char **);
+static int zpool_do_trim(int, char **);
+
+static int zpool_do_import(int, char **);
+static int zpool_do_export(int, char **);
+
+static int zpool_do_upgrade(int, char **);
+
+static int zpool_do_history(int, char **);
+static int zpool_do_events(int, char **);
+
+static int zpool_do_get(int, char **);
+static int zpool_do_set(int, char **);
+
+static int zpool_do_sync(int, char **);
+
+static int zpool_do_version(int, char **);
+
+static int zpool_do_wait(int, char **);
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ */
+
+#ifdef DEBUG
+const char *
+_umem_debug_init(void)
+{
+ return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+ return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+#endif
+
+typedef enum {
+ HELP_ADD,
+ HELP_ATTACH,
+ HELP_CLEAR,
+ HELP_CREATE,
+ HELP_CHECKPOINT,
+ HELP_DESTROY,
+ HELP_DETACH,
+ HELP_EXPORT,
+ HELP_HISTORY,
+ HELP_IMPORT,
+ HELP_IOSTAT,
+ HELP_LABELCLEAR,
+ HELP_LIST,
+ HELP_OFFLINE,
+ HELP_ONLINE,
+ HELP_REPLACE,
+ HELP_REMOVE,
+ HELP_INITIALIZE,
+ HELP_SCRUB,
+ HELP_RESILVER,
+ HELP_TRIM,
+ HELP_STATUS,
+ HELP_UPGRADE,
+ HELP_EVENTS,
+ HELP_GET,
+ HELP_SET,
+ HELP_SPLIT,
+ HELP_SYNC,
+ HELP_REGUID,
+ HELP_REOPEN,
+ HELP_VERSION,
+ HELP_WAIT
+} zpool_help_t;
+
+
+/*
+ * Flags for stats to display with "zpool iostat"
+ */
+enum iostat_type {
+ IOS_DEFAULT = 0,
+ IOS_LATENCY = 1,
+ IOS_QUEUES = 2,
+ IOS_L_HISTO = 3,
+ IOS_RQ_HISTO = 4,
+ IOS_COUNT, /* always last element */
+};
+
+/* iostat_type entries as bitmasks */
+#define IOS_DEFAULT_M (1ULL << IOS_DEFAULT)
+#define IOS_LATENCY_M (1ULL << IOS_LATENCY)
+#define IOS_QUEUES_M (1ULL << IOS_QUEUES)
+#define IOS_L_HISTO_M (1ULL << IOS_L_HISTO)
+#define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO)
+
+/* Mask of all the histo bits */
+#define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M)
+
+/*
+ * Lookup table for iostat flags to nvlist names. Basically a list
+ * of all the nvlists a flag requires. Also specifies the order in
+ * which data gets printed in zpool iostat.
+ */
+static const char *vsx_type_to_nvlist[IOS_COUNT][13] = {
+ [IOS_L_HISTO] = {
+ ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+ NULL},
+ [IOS_LATENCY] = {
+ ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+ NULL},
+ [IOS_QUEUES] = {
+ ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
+ NULL},
+ [IOS_RQ_HISTO] = {
+ ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO,
+ ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO,
+ ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO,
+ ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,
+ ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,
+ NULL},
+};
+
+
+/*
+ * Given a cb->cb_flags with a histogram bit set, return the iostat_type.
+ * Right now, only one histo bit is ever set at one time, so we can
+ * just do a highbit64(a)
+ */
+#define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1)
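+
+/*
+ * Worked example: if cb_flags has only IOS_L_HISTO_M set (bit 3),
+ * highbit64() returns 4, so IOS_HISTO_IDX yields 3 == IOS_L_HISTO, which
+ * indexes the corresponding row of vsx_type_to_nvlist[] above.
+ */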
+
+typedef struct zpool_command {
+ const char *name;
+ int (*func)(int, char **);
+ zpool_help_t usage;
+} zpool_command_t;
+
+/*
+ * Master command table. Each ZFS command has a name, associated function, and
+ * usage message. The usage messages need to be internationalized, so we have
+ * to have a function to return the usage message based on a command index.
+ *
+ * These commands are organized according to how they are displayed in the usage
+ * message. An empty command (one with a NULL name) indicates an empty line in
+ * the generic usage message.
+ */
+static zpool_command_t command_table[] = {
+ { "version", zpool_do_version, HELP_VERSION },
+ { NULL },
+ { "create", zpool_do_create, HELP_CREATE },
+ { "destroy", zpool_do_destroy, HELP_DESTROY },
+ { NULL },
+ { "add", zpool_do_add, HELP_ADD },
+ { "remove", zpool_do_remove, HELP_REMOVE },
+ { NULL },
+ { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR },
+ { NULL },
+ { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT },
+ { NULL },
+ { "list", zpool_do_list, HELP_LIST },
+ { "iostat", zpool_do_iostat, HELP_IOSTAT },
+ { "status", zpool_do_status, HELP_STATUS },
+ { NULL },
+ { "online", zpool_do_online, HELP_ONLINE },
+ { "offline", zpool_do_offline, HELP_OFFLINE },
+ { "clear", zpool_do_clear, HELP_CLEAR },
+ { "reopen", zpool_do_reopen, HELP_REOPEN },
+ { NULL },
+ { "attach", zpool_do_attach, HELP_ATTACH },
+ { "detach", zpool_do_detach, HELP_DETACH },
+ { "replace", zpool_do_replace, HELP_REPLACE },
+ { "split", zpool_do_split, HELP_SPLIT },
+ { NULL },
+ { "initialize", zpool_do_initialize, HELP_INITIALIZE },
+ { "resilver", zpool_do_resilver, HELP_RESILVER },
+ { "scrub", zpool_do_scrub, HELP_SCRUB },
+ { "trim", zpool_do_trim, HELP_TRIM },
+ { NULL },
+ { "import", zpool_do_import, HELP_IMPORT },
+ { "export", zpool_do_export, HELP_EXPORT },
+ { "upgrade", zpool_do_upgrade, HELP_UPGRADE },
+ { "reguid", zpool_do_reguid, HELP_REGUID },
+ { NULL },
+ { "history", zpool_do_history, HELP_HISTORY },
+ { "events", zpool_do_events, HELP_EVENTS },
+ { NULL },
+ { "get", zpool_do_get, HELP_GET },
+ { "set", zpool_do_set, HELP_SET },
+ { "sync", zpool_do_sync, HELP_SYNC },
+ { NULL },
+ { "wait", zpool_do_wait, HELP_WAIT },
+};
+
+#define NCOMMAND (ARRAY_SIZE(command_table))
+
+#define VDEV_ALLOC_CLASS_LOGS "logs"
+
+static zpool_command_t *current_command;
+static char history_str[HIS_MAX_RECORD_LEN];
+static boolean_t log_history = B_TRUE;
+static uint_t timestamp_fmt = NODATE;
+
+static const char *
+get_usage(zpool_help_t idx)
+{
+ switch (idx) {
+ case HELP_ADD:
+ return (gettext("\tadd [-fgLnP] [-o property=value] "
+ "<pool> <vdev> ...\n"));
+ case HELP_ATTACH:
+ return (gettext("\tattach [-fsw] [-o property=value] "
+ "<pool> <device> <new-device>\n"));
+ case HELP_CLEAR:
+ return (gettext("\tclear [-nF] <pool> [device]\n"));
+ case HELP_CREATE:
+ return (gettext("\tcreate [-fnd] [-o property=value] ... \n"
+ "\t [-O file-system-property=value] ... \n"
+ "\t [-m mountpoint] [-R root] <pool> <vdev> ...\n"));
+ case HELP_CHECKPOINT:
+ return (gettext("\tcheckpoint [-d [-w]] <pool> ...\n"));
+ case HELP_DESTROY:
+ return (gettext("\tdestroy [-f] <pool>\n"));
+ case HELP_DETACH:
+ return (gettext("\tdetach <pool> <device>\n"));
+ case HELP_EXPORT:
+ return (gettext("\texport [-af] <pool> ...\n"));
+ case HELP_HISTORY:
+ return (gettext("\thistory [-il] [<pool>] ...\n"));
+ case HELP_IMPORT:
+ return (gettext("\timport [-d dir] [-D]\n"
+ "\timport [-o mntopts] [-o property=value] ... \n"
+ "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] "
+ "[-R root] [-F [-n]] -a\n"
+ "\timport [-o mntopts] [-o property=value] ... \n"
+ "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] "
+ "[-R root] [-F [-n]]\n"
+ "\t [--rewind-to-checkpoint] <pool | id> [newpool]\n"));
+ case HELP_IOSTAT:
+ return (gettext("\tiostat [[[-c [script1,script2,...]"
+ "[-lq]]|[-rw]] [-T d | u] [-ghHLpPvy]\n"
+ "\t [[pool ...]|[pool vdev ...]|[vdev ...]]"
+ " [[-n] interval [count]]\n"));
+ case HELP_LABELCLEAR:
+ return (gettext("\tlabelclear [-f] <vdev>\n"));
+ case HELP_LIST:
+ return (gettext("\tlist [-gHLpPv] [-o property[,...]] "
+ "[-T d|u] [pool] ... \n"
+ "\t [interval [count]]\n"));
+ case HELP_OFFLINE:
+ return (gettext("\toffline [-f] [-t] <pool> <device> ...\n"));
+ case HELP_ONLINE:
+ return (gettext("\tonline [-e] <pool> <device> ...\n"));
+ case HELP_REPLACE:
+ return (gettext("\treplace [-fsw] [-o property=value] "
+ "<pool> <device> [new-device]\n"));
+ case HELP_REMOVE:
+ return (gettext("\tremove [-npsw] <pool> <device> ...\n"));
+ case HELP_REOPEN:
+ return (gettext("\treopen [-n] <pool>\n"));
+ case HELP_INITIALIZE:
+ return (gettext("\tinitialize [-c | -s] [-w] <pool> "
+ "[<device> ...]\n"));
+ case HELP_SCRUB:
+ return (gettext("\tscrub [-s | -p] [-w] <pool> ...\n"));
+ case HELP_RESILVER:
+ return (gettext("\tresilver <pool> ...\n"));
+ case HELP_TRIM:
+ return (gettext("\ttrim [-dw] [-r <rate>] [-c | -s] <pool> "
+ "[<device> ...]\n"));
+ case HELP_STATUS:
+ return (gettext("\tstatus [-c [script1,script2,...]] "
+ "[-igLpPstvxD] [-T d|u] [pool] ... \n"
+ "\t [interval [count]]\n"));
+ case HELP_UPGRADE:
+ return (gettext("\tupgrade\n"
+ "\tupgrade -v\n"
+ "\tupgrade [-V version] <-a | pool ...>\n"));
+ case HELP_EVENTS:
+ return (gettext("\tevents [-vHf [pool] | -c]\n"));
+ case HELP_GET:
+ return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] "
+ "<\"all\" | property[,...]> <pool> ...\n"));
+ case HELP_SET:
+ return (gettext("\tset <property=value> <pool> \n"));
+ case HELP_SPLIT:
+ return (gettext("\tsplit [-gLnPl] [-R altroot] [-o mntopts]\n"
+ "\t [-o property=value] <pool> <newpool> "
+ "[<device> ...]\n"));
+ case HELP_REGUID:
+ return (gettext("\treguid <pool>\n"));
+ case HELP_SYNC:
+ return (gettext("\tsync [pool] ...\n"));
+ case HELP_VERSION:
+ return (gettext("\tversion\n"));
+ case HELP_WAIT:
+ return (gettext("\twait [-Hp] [-T d|u] [-t <activity>[,...]] "
+ "<pool> [interval]\n"));
+ }
+
+ abort();
+ /* NOTREACHED */
+}
+
+static void
+zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res)
+{
+ uint_t children = 0;
+ nvlist_t **child;
+ uint_t i;
+
+ (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &child, &children);
+
+ if (children == 0) {
+ char *path = zpool_vdev_name(g_zfs, zhp, nvroot,
+ VDEV_NAME_PATH);
+
+ if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 &&
+ strcmp(path, VDEV_TYPE_HOLE) != 0)
+ fnvlist_add_boolean(res, path);
+
+ free(path);
+ return;
+ }
+
+ for (i = 0; i < children; i++) {
+ zpool_collect_leaves(zhp, child[i], res);
+ }
+}
+
+/*
+ * Callback routine that will print out a pool property value.
+ */
+static int
+print_prop_cb(int prop, void *cb)
+{
+ FILE *fp = cb;
+
+ (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop));
+
+ if (zpool_prop_readonly(prop))
+ (void) fprintf(fp, " NO ");
+ else
+ (void) fprintf(fp, " YES ");
+
+ if (zpool_prop_values(prop) == NULL)
+ (void) fprintf(fp, "-\n");
+ else
+ (void) fprintf(fp, "%s\n", zpool_prop_values(prop));
+
+ return (ZPROP_CONT);
+}
+
+/*
+ * Display usage message. If we're inside a command, display only the usage for
+ * that command. Otherwise, iterate over the entire command table and display
+ * a complete usage message.
+ */
+static void
+usage(boolean_t requested)
+{
+ FILE *fp = requested ? stdout : stderr;
+
+ if (current_command == NULL) {
+ int i;
+
+ (void) fprintf(fp, gettext("usage: zpool command args ...\n"));
+ (void) fprintf(fp,
+ gettext("where 'command' is one of the following:\n\n"));
+
+ for (i = 0; i < NCOMMAND; i++) {
+ if (command_table[i].name == NULL)
+ (void) fprintf(fp, "\n");
+ else
+ (void) fprintf(fp, "%s",
+ get_usage(command_table[i].usage));
+ }
+ } else {
+ (void) fprintf(fp, gettext("usage:\n"));
+ (void) fprintf(fp, "%s", get_usage(current_command->usage));
+ }
+
+ if (current_command != NULL &&
+ ((strcmp(current_command->name, "set") == 0) ||
+ (strcmp(current_command->name, "get") == 0) ||
+ (strcmp(current_command->name, "list") == 0))) {
+
+ (void) fprintf(fp,
+ gettext("\nthe following properties are supported:\n"));
+
+ (void) fprintf(fp, "\n\t%-19s %s %s\n\n",
+ "PROPERTY", "EDIT", "VALUES");
+
+ /* Iterate over all properties */
+ (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE,
+ ZFS_TYPE_POOL);
+
+ (void) fprintf(fp, "\t%-19s ", "feature@...");
+ (void) fprintf(fp, "YES disabled | enabled | active\n");
+
+ (void) fprintf(fp, gettext("\nThe feature@ properties must be "
+ "appended with a feature name.\nSee zpool-features(5).\n"));
+ }
+
+ /*
+ * See comments at end of main().
+ */
+ if (getenv("ZFS_ABORT") != NULL) {
+ (void) printf("dumping core by request\n");
+ abort();
+ }
+
+ exit(requested ? 0 : 2);
+}
+
+/*
+ * zpool initialize [-c | -s] [-w] <pool> [<vdev> ...]
+ * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool
+ * if none specified.
+ *
+ * -c Cancel. Ends active initializing.
+ * -s Suspend. Initializing can then be restarted with no flags.
+ * -w Wait. Blocks until initializing has completed.
+ */
+int
+zpool_do_initialize(int argc, char **argv)
+{
+ int c;
+ char *poolname;
+ zpool_handle_t *zhp;
+ nvlist_t *vdevs;
+ int err = 0;
+ boolean_t wait = B_FALSE;
+
+ struct option long_options[] = {
+ {"cancel", no_argument, NULL, 'c'},
+ {"suspend", no_argument, NULL, 's'},
+ {"wait", no_argument, NULL, 'w'},
+ {0, 0, 0, 0}
+ };
+
+ pool_initialize_func_t cmd_type = POOL_INITIALIZE_START;
+ while ((c = getopt_long(argc, argv, "csw", long_options, NULL)) != -1) {
+ switch (c) {
+ case 'c':
+ if (cmd_type != POOL_INITIALIZE_START &&
+ cmd_type != POOL_INITIALIZE_CANCEL) {
+ (void) fprintf(stderr, gettext("-c cannot be "
+ "combined with other options\n"));
+ usage(B_FALSE);
+ }
+ cmd_type = POOL_INITIALIZE_CANCEL;
+ break;
+ case 's':
+ if (cmd_type != POOL_INITIALIZE_START &&
+ cmd_type != POOL_INITIALIZE_SUSPEND) {
+ (void) fprintf(stderr, gettext("-s cannot be "
+ "combined with other options\n"));
+ usage(B_FALSE);
+ }
+ cmd_type = POOL_INITIALIZE_SUSPEND;
+ break;
+ case 'w':
+ wait = B_TRUE;
+ break;
+ case '?':
+ if (optopt != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid option '%c'\n"), optopt);
+ } else {
+ (void) fprintf(stderr,
+ gettext("invalid option '%s'\n"),
+ argv[optind - 1]);
+ }
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ usage(B_FALSE);
+ return (-1);
+ }
+
+ if (wait && (cmd_type != POOL_INITIALIZE_START)) {
+ (void) fprintf(stderr, gettext("-w cannot be used with -c or "
+ "-s\n"));
+ usage(B_FALSE);
+ }
+
+ poolname = argv[0];
+ zhp = zpool_open(g_zfs, poolname);
+ if (zhp == NULL)
+ return (-1);
+
+ vdevs = fnvlist_alloc();
+ if (argc == 1) {
+ /* no individual leaf vdevs specified, so add them all */
+ nvlist_t *config = zpool_get_config(zhp, NULL);
+ nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE);
+ zpool_collect_leaves(zhp, nvroot, vdevs);
+ } else {
+ for (int i = 1; i < argc; i++) {
+ fnvlist_add_boolean(vdevs, argv[i]);
+ }
+ }
+
+ if (wait)
+ err = zpool_initialize_wait(zhp, cmd_type, vdevs);
+ else
+ err = zpool_initialize(zhp, cmd_type, vdevs);
+
+ fnvlist_free(vdevs);
+ zpool_close(zhp);
+
+ return (err);
+}
+
+/*
+ * print a pool vdev config for dry runs
+ */
+static void
+print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent,
+ const char *match, int name_flags)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ char *vname;
+ boolean_t printed = B_FALSE;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+ if (name != NULL)
+ (void) printf("\t%*s%s\n", indent, "", name);
+ return;
+ }
+
+ for (c = 0; c < children; c++) {
+ uint64_t is_log = B_FALSE;
+ char *class = "";
+
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ &is_log);
+ if (is_log)
+ class = VDEV_ALLOC_BIAS_LOG;
+ (void) nvlist_lookup_string(child[c],
+ ZPOOL_CONFIG_ALLOCATION_BIAS, &class);
+ if (strcmp(match, class) != 0)
+ continue;
+
+ if (!printed && name != NULL) {
+ (void) printf("\t%*s%s\n", indent, "", name);
+ printed = B_TRUE;
+ }
+ vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags);
+ print_vdev_tree(zhp, vname, child[c], indent + 2, "",
+ name_flags);
+ free(vname);
+ }
+}
+
+static boolean_t
+prop_list_contains_feature(nvlist_t *proplist)
+{
+ nvpair_t *nvp;
+ for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp;
+ nvp = nvlist_next_nvpair(proplist, nvp)) {
+ if (zpool_prop_feature(nvpair_name(nvp)))
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Add a property pair (name, string-value) into a property nvlist.
+ */
+static int
+add_prop_list(const char *propname, char *propval, nvlist_t **props,
+ boolean_t poolprop)
+{
+ zpool_prop_t prop = ZPOOL_PROP_INVAL;
+ nvlist_t *proplist;
+ const char *normnm;
+ char *strval;
+
+ if (*props == NULL &&
+ nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) {
+ (void) fprintf(stderr,
+ gettext("internal error: out of memory\n"));
+ return (1);
+ }
+
+ proplist = *props;
+
+ if (poolprop) {
+ const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION);
+
+ if ((prop = zpool_name_to_prop(propname)) == ZPOOL_PROP_INVAL &&
+ !zpool_prop_feature(propname)) {
+ (void) fprintf(stderr, gettext("property '%s' is "
+ "not a valid pool property\n"), propname);
+ return (2);
+ }
+
+ /*
+ * feature@ properties and version should not be specified
+ * at the same time.
+ */
+ if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) &&
+ nvlist_exists(proplist, vname)) ||
+ (prop == ZPOOL_PROP_VERSION &&
+ prop_list_contains_feature(proplist))) {
+ (void) fprintf(stderr, gettext("'feature@' and "
+ "'version' properties cannot be specified "
+ "together\n"));
+ return (2);
+ }
+
+
+ if (zpool_prop_feature(propname))
+ normnm = propname;
+ else
+ normnm = zpool_prop_to_name(prop);
+ } else {
+ zfs_prop_t fsprop = zfs_name_to_prop(propname);
+
+ if (zfs_prop_valid_for_type(fsprop, ZFS_TYPE_FILESYSTEM,
+ B_FALSE)) {
+ normnm = zfs_prop_to_name(fsprop);
+ } else if (zfs_prop_user(propname) ||
+ zfs_prop_userquota(propname)) {
+ normnm = propname;
+ } else {
+ (void) fprintf(stderr, gettext("property '%s' is "
+ "not a valid filesystem property\n"), propname);
+ return (2);
+ }
+ }
+
+ if (nvlist_lookup_string(proplist, normnm, &strval) == 0 &&
+ prop != ZPOOL_PROP_CACHEFILE) {
+ (void) fprintf(stderr, gettext("property '%s' "
+ "specified multiple times\n"), propname);
+ return (2);
+ }
+
+ if (nvlist_add_string(proplist, normnm, propval) != 0) {
+ (void) fprintf(stderr, gettext("internal "
+ "error: out of memory\n"));
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * Set a default property pair (name, string-value) in a property nvlist
+ */
+static int
+add_prop_list_default(const char *propname, char *propval, nvlist_t **props,
+ boolean_t poolprop)
+{
+ char *pval;
+
+ if (nvlist_lookup_string(*props, propname, &pval) == 0)
+ return (0);
+
+ return (add_prop_list(propname, propval, props, B_TRUE));
+}
+
+/*
+ * zpool add [-fgLnP] [-o property=value] <pool> <vdev> ...
+ *
+ * -f Force addition of devices, even if they appear in use
+ * -g Display guid for individual vdev name.
+ * -L Follow links when resolving vdev path name.
+ * -n Do not add the devices, but display the resulting layout if
+ * they were to be added.
+ * -o Set property=value.
+ * -P Display full path for vdev name.
+ *
+ * Adds the given vdevs to 'pool'. As with create, the bulk of this work is
+ * handled by make_root_vdev(), which constructs the nvlist needed to pass to
+ * libzfs.
+ */
+int
+zpool_do_add(int argc, char **argv)
+{
+ boolean_t force = B_FALSE;
+ boolean_t dryrun = B_FALSE;
+ int name_flags = 0;
+ int c;
+ nvlist_t *nvroot;
+ char *poolname;
+ int ret;
+ zpool_handle_t *zhp;
+ nvlist_t *config;
+ nvlist_t *props = NULL;
+ char *propval;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "fgLno:P")) != -1) {
+ switch (c) {
+ case 'f':
+ force = B_TRUE;
+ break;
+ case 'g':
+ name_flags |= VDEV_NAME_GUID;
+ break;
+ case 'L':
+ name_flags |= VDEV_NAME_FOLLOW_LINKS;
+ break;
+ case 'n':
+ dryrun = B_TRUE;
+ break;
+ case 'o':
+ if ((propval = strchr(optarg, '=')) == NULL) {
+ (void) fprintf(stderr, gettext("missing "
+ "'=' for -o option\n"));
+ usage(B_FALSE);
+ }
+ *propval = '\0';
+ propval++;
+
+ if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) ||
+ (add_prop_list(optarg, propval, &props, B_TRUE)))
+ usage(B_FALSE);
+ break;
+ case 'P':
+ name_flags |= VDEV_NAME_PATH;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* get pool name and check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing vdev specification\n"));
+ usage(B_FALSE);
+ }
+
+ poolname = argv[0];
+
+ argc--;
+ argv++;
+
+ if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+ return (1);
+
+ if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+ (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
+ poolname);
+ zpool_close(zhp);
+ return (1);
+ }
+
+ /* unless manually specified, use the "ashift" pool property (if set) */
+ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) {
+ int intval;
+ zprop_source_t src;
+ char strval[ZPOOL_MAXPROPLEN];
+
+ intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src);
+ if (src != ZPROP_SRC_DEFAULT) {
+ (void) sprintf(strval, "%" PRId32, intval);
+ verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval,
+ &props, B_TRUE) == 0);
+ }
+ }
+
+ /* pass off to make_root_vdev for processing */
+ nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun,
+ argc, argv);
+ if (nvroot == NULL) {
+ zpool_close(zhp);
+ return (1);
+ }
+
+ if (dryrun) {
+ nvlist_t *poolnvroot;
+ nvlist_t **l2child;
+ uint_t l2children, c;
+ char *vname;
+ boolean_t hadcache = B_FALSE;
+
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &poolnvroot) == 0);
+
+ (void) printf(gettext("would update '%s' to the following "
+ "configuration:\n"), zpool_get_name(zhp));
+
+ /* print original main pool and new tree */
+ print_vdev_tree(zhp, poolname, poolnvroot, 0, "",
+ name_flags | VDEV_NAME_TYPE_ID);
+ print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags);
+
+ /* print other classes: 'dedup', 'special', and 'log' */
+ if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_DEDUP)) {
+ print_vdev_tree(zhp, "dedup", poolnvroot, 0,
+ VDEV_ALLOC_BIAS_DEDUP, name_flags);
+ print_vdev_tree(zhp, NULL, nvroot, 0,
+ VDEV_ALLOC_BIAS_DEDUP, name_flags);
+ } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_DEDUP)) {
+ print_vdev_tree(zhp, "dedup", nvroot, 0,
+ VDEV_ALLOC_BIAS_DEDUP, name_flags);
+ }
+
+ if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_SPECIAL)) {
+ print_vdev_tree(zhp, "special", poolnvroot, 0,
+ VDEV_ALLOC_BIAS_SPECIAL, name_flags);
+ print_vdev_tree(zhp, NULL, nvroot, 0,
+ VDEV_ALLOC_BIAS_SPECIAL, name_flags);
+ } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_SPECIAL)) {
+ print_vdev_tree(zhp, "special", nvroot, 0,
+ VDEV_ALLOC_BIAS_SPECIAL, name_flags);
+ }
+
+ if (num_logs(poolnvroot) > 0) {
+ print_vdev_tree(zhp, "logs", poolnvroot, 0,
+ VDEV_ALLOC_BIAS_LOG, name_flags);
+ print_vdev_tree(zhp, NULL, nvroot, 0,
+ VDEV_ALLOC_BIAS_LOG, name_flags);
+ } else if (num_logs(nvroot) > 0) {
+ print_vdev_tree(zhp, "logs", nvroot, 0,
+ VDEV_ALLOC_BIAS_LOG, name_flags);
+ }
+
+ /* Do the same for the caches */
+ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2child, &l2children) == 0 && l2children) {
+ hadcache = B_TRUE;
+ (void) printf(gettext("\tcache\n"));
+ for (c = 0; c < l2children; c++) {
+ vname = zpool_vdev_name(g_zfs, NULL,
+ l2child[c], name_flags);
+ (void) printf("\t %s\n", vname);
+ free(vname);
+ }
+ }
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2child, &l2children) == 0 && l2children) {
+ if (!hadcache)
+ (void) printf(gettext("\tcache\n"));
+ for (c = 0; c < l2children; c++) {
+ vname = zpool_vdev_name(g_zfs, NULL,
+ l2child[c], name_flags);
+ (void) printf("\t %s\n", vname);
+ free(vname);
+ }
+ }
+
+ ret = 0;
+ } else {
+ ret = (zpool_add(zhp, nvroot) != 0);
+ }
+
+ nvlist_free(props);
+ nvlist_free(nvroot);
+ zpool_close(zhp);
+
+ return (ret);
+}
+
+/*
+ * zpool remove [-npsw] <pool> <vdev> ...
+ *
+ * Removes the given vdev from the pool.
+ */
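+/*
+ * Illustrative invocations (editorial note, not part of the original source;
+ * the pool and vdev names are hypothetical):
+ *
+ *   zpool remove tank mirror-1         # begin evacuating a top-level vdev
+ *   zpool remove -n -p tank mirror-1   # estimate mapping memory, parsable
+ *   zpool remove -s tank               # cancel an in-progress removal
+ */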
+int
+zpool_do_remove(int argc, char **argv)
+{
+ char *poolname;
+ int i, ret = 0;
+ zpool_handle_t *zhp = NULL;
+ boolean_t stop = B_FALSE;
+ int c;
+ boolean_t noop = B_FALSE;
+ boolean_t parsable = B_FALSE;
+ boolean_t wait = B_FALSE;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "npsw")) != -1) {
+ switch (c) {
+ case 'n':
+ noop = B_TRUE;
+ break;
+ case 'p':
+ parsable = B_TRUE;
+ break;
+ case 's':
+ stop = B_TRUE;
+ break;
+ case 'w':
+ wait = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* get pool name and check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ usage(B_FALSE);
+ }
+
+ poolname = argv[0];
+
+ if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+ return (1);
+
+ if (stop && noop) {
+ zpool_close(zhp);
+ (void) fprintf(stderr, gettext("stop request ignored\n"));
+ return (0);
+ }
+
+ if (stop) {
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+ if (zpool_vdev_remove_cancel(zhp) != 0)
+ ret = 1;
+ if (wait) {
+ (void) fprintf(stderr, gettext("invalid option "
+ "combination: -w cannot be used with -s\n"));
+ usage(B_FALSE);
+ }
+ } else {
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing device\n"));
+ usage(B_FALSE);
+ }
+
+ for (i = 1; i < argc; i++) {
+ if (noop) {
+ uint64_t size;
+
+ if (zpool_vdev_indirect_size(zhp, argv[i],
+ &size) != 0) {
+ ret = 1;
+ break;
+ }
+ if (parsable) {
+ (void) printf("%s %llu\n",
+ argv[i], (unsigned long long)size);
+ } else {
+ char valstr[32];
+ zfs_nicenum(size, valstr,
+ sizeof (valstr));
+ (void) printf("Memory that will be "
+ "used after removing %s: %s\n",
+ argv[i], valstr);
+ }
+ } else {
+ if (zpool_vdev_remove(zhp, argv[i]) != 0)
+ ret = 1;
+ }
+ }
+
+ if (ret == 0 && wait)
+ ret = zpool_wait(zhp, ZPOOL_WAIT_REMOVE);
+ }
+ zpool_close(zhp);
+
+ return (ret);
+}
+
+/*
+ * zpool labelclear [-f] <vdev>
+ *
+ * -f Force clearing the label for vdevs which are members of
+ * exported or foreign pools.
+ *
+ * Verifies that the vdev is not active and zeros out the label information
+ * on the device.
+ */
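+/*
+ * Illustrative invocation (editorial note, not part of the original source;
+ * the device name is hypothetical):
+ *
+ *   zpool labelclear -f /dev/sdb1      # wipe the label of a device that
+ *                                      # belonged to an exported pool
+ */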
+int
+zpool_do_labelclear(int argc, char **argv)
+{
+ char vdev[MAXPATHLEN];
+ char *name = NULL;
+ struct stat st;
+ int c, fd = -1, ret = 0;
+ nvlist_t *config;
+ pool_state_t state;
+ boolean_t inuse = B_FALSE;
+ boolean_t force = B_FALSE;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "f")) != -1) {
+ switch (c) {
+ case 'f':
+ force = B_TRUE;
+ break;
+ default:
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* get vdev name */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing vdev name\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ /*
+ * Check if we were given an absolute path and use it as is.
+ * Otherwise, if the provided vdev name doesn't point to a file,
+ * try prepending expected disk paths and partition numbers.
+ */
+ (void) strlcpy(vdev, argv[0], sizeof (vdev));
+ if (vdev[0] != '/' && stat(vdev, &st) != 0) {
+ int error;
+
+ error = zfs_resolve_shortname(argv[0], vdev, MAXPATHLEN);
+ if (error == 0 && zfs_dev_is_whole_disk(vdev)) {
+ if (zfs_append_partition(vdev, MAXPATHLEN) == -1)
+ error = ENOENT;
+ }
+
+ if (error || (stat(vdev, &st) != 0)) {
+ (void) fprintf(stderr, gettext(
+ "failed to find device %s, try specifying absolute "
+ "path instead\n"), argv[0]);
+ return (1);
+ }
+ }
+
+ if ((fd = open(vdev, O_RDWR)) < 0) {
+ (void) fprintf(stderr, gettext("failed to open %s: %s\n"),
+ vdev, strerror(errno));
+ return (1);
+ }
+
+ /*
+ * Flush all dirty pages for the block device. This should not be
+ * fatal when the device does not support BLKFLSBUF as would be the
+ * case for a file vdev.
+ */
+ if ((zfs_dev_flush(fd) != 0) && (errno != ENOTTY))
+ (void) fprintf(stderr, gettext("failed to invalidate "
+ "cache for %s: %s\n"), vdev, strerror(errno));
+
+ if (zpool_read_label(fd, &config, NULL) != 0) {
+ (void) fprintf(stderr,
+ gettext("failed to read label from %s\n"), vdev);
+ ret = 1;
+ goto errout;
+ }
+ nvlist_free(config);
+
+ ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse);
+ if (ret != 0) {
+ (void) fprintf(stderr,
+ gettext("failed to check state for %s\n"), vdev);
+ ret = 1;
+ goto errout;
+ }
+
+ if (!inuse)
+ goto wipe_label;
+
+ switch (state) {
+ default:
+ case POOL_STATE_ACTIVE:
+ case POOL_STATE_SPARE:
+ case POOL_STATE_L2CACHE:
+ (void) fprintf(stderr, gettext(
+ "%s is a member (%s) of pool \"%s\"\n"),
+ vdev, zpool_pool_state_to_name(state), name);
+ ret = 1;
+ goto errout;
+
+ case POOL_STATE_EXPORTED:
+ if (force)
+ break;
+ (void) fprintf(stderr, gettext(
+ "use '-f' to override the following error:\n"
+ "%s is a member of exported pool \"%s\"\n"),
+ vdev, name);
+ ret = 1;
+ goto errout;
+
+ case POOL_STATE_POTENTIALLY_ACTIVE:
+ if (force)
+ break;
+ (void) fprintf(stderr, gettext(
+ "use '-f' to override the following error:\n"
+ "%s is a member of potentially active pool \"%s\"\n"),
+ vdev, name);
+ ret = 1;
+ goto errout;
+
+ case POOL_STATE_DESTROYED:
+ /* inuse should never be set for a destroyed pool */
+ assert(0);
+ break;
+ }
+
+wipe_label:
+ ret = zpool_clear_label(fd);
+ if (ret != 0) {
+ (void) fprintf(stderr,
+ gettext("failed to clear label for %s\n"), vdev);
+ }
+
+errout:
+ free(name);
+ (void) close(fd);
+
+ return (ret);
+}
+
+/*
+ * zpool create [-fnd] [-o property=value] ...
+ * [-O file-system-property=value] ...
+ * [-R root] [-m mountpoint] <pool> <dev> ...
+ *
+ * -f Force creation, even if devices appear in use
+ * -n Do not create the pool, but display the resulting layout if it
+ * were to be created.
+ * -R Create a pool under an alternate root
+ * -m Set default mountpoint for the root dataset. By default it's
+ * '/<pool>'
+ * -o Set property=value.
+ * -o Set feature@feature=enabled|disabled.
+ * -d Don't automatically enable all supported pool features
+ * (individual features can be enabled with -o).
+ * -O Set fsproperty=value in the pool's root file system
+ *
+ * Creates the named pool according to the given vdev specification. The
+ * bulk of the vdev processing is done in make_root_vdev() in zpool_vdev.c.
+ * Once we get the nvlist back from make_root_vdev(), we either print out the
+ * contents (if '-n' was specified), or pass it to libzfs to do the creation.
+ */
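+/*
+ * Illustrative invocations (editorial note, not part of the original source;
+ * pool, device, and mountpoint names are hypothetical):
+ *
+ *   zpool create tank mirror sda sdb
+ *   zpool create -n -o ashift=12 -O compression=lz4 -m /export/tank \
+ *       tank raidz2 sda sdb sdc sdd
+ */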
+int
+zpool_do_create(int argc, char **argv)
+{
+ boolean_t force = B_FALSE;
+ boolean_t dryrun = B_FALSE;
+ boolean_t enable_all_pool_feat = B_TRUE;
+ int c;
+ nvlist_t *nvroot = NULL;
+ char *poolname;
+ char *tname = NULL;
+ int ret = 1;
+ char *altroot = NULL;
+ char *mountpoint = NULL;
+ nvlist_t *fsprops = NULL;
+ nvlist_t *props = NULL;
+ char *propval;
+
+ /* check options */
+ while ((c = getopt(argc, argv, ":fndR:m:o:O:t:")) != -1) {
+ switch (c) {
+ case 'f':
+ force = B_TRUE;
+ break;
+ case 'n':
+ dryrun = B_TRUE;
+ break;
+ case 'd':
+ enable_all_pool_feat = B_FALSE;
+ break;
+ case 'R':
+ altroot = optarg;
+ if (add_prop_list(zpool_prop_to_name(
+ ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE))
+ goto errout;
+ if (add_prop_list_default(zpool_prop_to_name(
+ ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
+ goto errout;
+ break;
+ case 'm':
+ /* Equivalent to -O mountpoint=optarg */
+ mountpoint = optarg;
+ break;
+ case 'o':
+ if ((propval = strchr(optarg, '=')) == NULL) {
+ (void) fprintf(stderr, gettext("missing "
+ "'=' for -o option\n"));
+ goto errout;
+ }
+ *propval = '\0';
+ propval++;
+
+ if (add_prop_list(optarg, propval, &props, B_TRUE))
+ goto errout;
+
+ /*
+ * If the user is creating a pool that doesn't support
+ * feature flags, don't enable any features.
+ */
+ if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) {
+ char *end;
+ u_longlong_t ver;
+
+ ver = strtoull(propval, &end, 10);
+ if (*end == '\0' &&
+ ver < SPA_VERSION_FEATURES) {
+ enable_all_pool_feat = B_FALSE;
+ }
+ }
+ if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT)
+ altroot = propval;
+ break;
+ case 'O':
+ if ((propval = strchr(optarg, '=')) == NULL) {
+ (void) fprintf(stderr, gettext("missing "
+ "'=' for -O option\n"));
+ goto errout;
+ }
+ *propval = '\0';
+ propval++;
+
+ /*
+ * Mountpoints are checked and then added later.
+ * Uniquely among properties, they can be specified
+ * more than once, to avoid conflict with -m.
+ */
+ if (0 == strcmp(optarg,
+ zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) {
+ mountpoint = propval;
+ } else if (add_prop_list(optarg, propval, &fsprops,
+ B_FALSE)) {
+ goto errout;
+ }
+ break;
+ case 't':
+ /*
+ * Sanity check temporary pool name.
+ */
+ if (strchr(optarg, '/') != NULL) {
+ (void) fprintf(stderr, gettext("cannot create "
+ "'%s': invalid character '/' in temporary "
+ "name\n"), optarg);
+ (void) fprintf(stderr, gettext("use 'zfs "
+ "create' to create a dataset\n"));
+ goto errout;
+ }
+
+ if (add_prop_list(zpool_prop_to_name(
+ ZPOOL_PROP_TNAME), optarg, &props, B_TRUE))
+ goto errout;
+ if (add_prop_list_default(zpool_prop_to_name(
+ ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
+ goto errout;
+ tname = optarg;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ goto badusage;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ goto badusage;
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* get pool name and check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ goto badusage;
+ }
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing vdev specification\n"));
+ goto badusage;
+ }
+
+ poolname = argv[0];
+
+ /*
+ * As a special case, check for use of '/' in the name, and direct the
+ * user to use 'zfs create' instead.
+ */
+ if (strchr(poolname, '/') != NULL) {
+ (void) fprintf(stderr, gettext("cannot create '%s': invalid "
+ "character '/' in pool name\n"), poolname);
+ (void) fprintf(stderr, gettext("use 'zfs create' to "
+ "create a dataset\n"));
+ goto errout;
+ }
+
+ /* pass off to make_root_vdev for bulk processing */
+ nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun,
+ argc - 1, argv + 1);
+ if (nvroot == NULL)
+ goto errout;
+
+ /* make_root_vdev() allows 0 toplevel children if there are spares */
+ if (!zfs_allocatable_devs(nvroot)) {
+ (void) fprintf(stderr, gettext("invalid vdev "
+ "specification: at least one toplevel vdev must be "
+ "specified\n"));
+ goto errout;
+ }
+
+ if (altroot != NULL && altroot[0] != '/') {
+ (void) fprintf(stderr, gettext("invalid alternate root '%s': "
+ "must be an absolute path\n"), altroot);
+ goto errout;
+ }
+
+ /*
+ * Check the validity of the mountpoint and direct the user to use the
+ * '-m' mountpoint option if it looks like it's in use.
+ */
+ if (mountpoint == NULL ||
+ (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 &&
+ strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) {
+ char buf[MAXPATHLEN];
+ DIR *dirp;
+
+ if (mountpoint && mountpoint[0] != '/') {
+ (void) fprintf(stderr, gettext("invalid mountpoint "
+ "'%s': must be an absolute path, 'legacy', or "
+ "'none'\n"), mountpoint);
+ goto errout;
+ }
+
+ if (mountpoint == NULL) {
+ if (altroot != NULL)
+ (void) snprintf(buf, sizeof (buf), "%s/%s",
+ altroot, poolname);
+ else
+ (void) snprintf(buf, sizeof (buf), "/%s",
+ poolname);
+ } else {
+ if (altroot != NULL)
+ (void) snprintf(buf, sizeof (buf), "%s%s",
+ altroot, mountpoint);
+ else
+ (void) snprintf(buf, sizeof (buf), "%s",
+ mountpoint);
+ }
+
+ if ((dirp = opendir(buf)) == NULL && errno != ENOENT) {
+ (void) fprintf(stderr, gettext("mountpoint '%s' : "
+ "%s\n"), buf, strerror(errno));
+ (void) fprintf(stderr, gettext("use '-m' "
+ "option to provide a different default\n"));
+ goto errout;
+ } else if (dirp) {
+ int count = 0;
+
+ while (count < 3 && readdir(dirp) != NULL)
+ count++;
+ (void) closedir(dirp);
+
+ if (count > 2) {
+ (void) fprintf(stderr, gettext("mountpoint "
+ "'%s' exists and is not empty\n"), buf);
+ (void) fprintf(stderr, gettext("use '-m' "
+ "option to provide a "
+ "different default\n"));
+ goto errout;
+ }
+ }
+ }
+
+ /*
+ * Now that the mountpoint's validity has been checked, ensure that
+ * the property is set appropriately prior to creating the pool.
+ */
+ if (mountpoint != NULL) {
+ ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT),
+ mountpoint, &fsprops, B_FALSE);
+ if (ret != 0)
+ goto errout;
+ }
+
+ ret = 1;
+ if (dryrun) {
+ /*
+ * For a dry run invocation, print out a basic message and run
+ * through all the vdevs in the list and print out in an
+ * appropriate hierarchy.
+ */
+ (void) printf(gettext("would create '%s' with the "
+ "following layout:\n\n"), poolname);
+
+ print_vdev_tree(NULL, poolname, nvroot, 0, "", 0);
+ print_vdev_tree(NULL, "dedup", nvroot, 0,
+ VDEV_ALLOC_BIAS_DEDUP, 0);
+ print_vdev_tree(NULL, "special", nvroot, 0,
+ VDEV_ALLOC_BIAS_SPECIAL, 0);
+ print_vdev_tree(NULL, "logs", nvroot, 0,
+ VDEV_ALLOC_BIAS_LOG, 0);
+
+ ret = 0;
+ } else {
+ /*
+ * Hand off to libzfs.
+ */
+ spa_feature_t i;
+ for (i = 0; i < SPA_FEATURES; i++) {
+ char propname[MAXPATHLEN];
+ char *propval;
+ zfeature_info_t *feat = &spa_feature_table[i];
+
+ (void) snprintf(propname, sizeof (propname),
+ "feature@%s", feat->fi_uname);
+
+ /*
+ * Only features contained in props will be enabled:
+ * remove from the nvlist every ZFS_FEATURE_DISABLED
+ * value and add every missing ZFS_FEATURE_ENABLED if
+ * enable_all_pool_feat is set.
+ */
+ if (!nvlist_lookup_string(props, propname, &propval)) {
+ if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0)
+ (void) nvlist_remove_all(props,
+ propname);
+ } else if (enable_all_pool_feat) {
+ ret = add_prop_list(propname,
+ ZFS_FEATURE_ENABLED, &props, B_TRUE);
+ if (ret != 0)
+ goto errout;
+ }
+ }
+
+ ret = 1;
+ if (zpool_create(g_zfs, poolname,
+ nvroot, props, fsprops) == 0) {
+ zfs_handle_t *pool = zfs_open(g_zfs,
+ tname ? tname : poolname, ZFS_TYPE_FILESYSTEM);
+ if (pool != NULL) {
+ if (zfs_mount(pool, NULL, 0) == 0) {
+ ret = zfs_shareall(pool);
+ zfs_commit_all_shares();
+ }
+ zfs_close(pool);
+ }
+ } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) {
+ (void) fprintf(stderr, gettext("pool name may have "
+ "been omitted\n"));
+ }
+ }
+
+errout:
+ nvlist_free(nvroot);
+ nvlist_free(fsprops);
+ nvlist_free(props);
+ return (ret);
+badusage:
+ nvlist_free(fsprops);
+ nvlist_free(props);
+ usage(B_FALSE);
+ return (2);
+}
+
+/*
+ * zpool destroy <pool>
+ *
+ * -f Forcefully unmount any datasets
+ *
+ * Destroy the given pool. Automatically unmounts any datasets in the pool.
+ */
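+/*
+ * Illustrative invocation (editorial note, not part of the original source;
+ * the pool name is hypothetical):
+ *
+ *   zpool destroy -f tank              # forcefully unmount datasets, then
+ *                                      # destroy the pool
+ */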
+int
+zpool_do_destroy(int argc, char **argv)
+{
+ boolean_t force = B_FALSE;
+ int c;
+ char *pool;
+ zpool_handle_t *zhp;
+ int ret;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "f")) != -1) {
+ switch (c) {
+ case 'f':
+ force = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ pool = argv[0];
+
+ if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
+ /*
+ * As a special case, check for use of '/' in the name, and
+ * direct the user to use 'zfs destroy' instead.
+ */
+ if (strchr(pool, '/') != NULL)
+ (void) fprintf(stderr, gettext("use 'zfs destroy' to "
+ "destroy a dataset\n"));
+ return (1);
+ }
+
+ if (zpool_disable_datasets(zhp, force) != 0) {
+ (void) fprintf(stderr, gettext("could not destroy '%s': "
+ "could not unmount datasets\n"), zpool_get_name(zhp));
+ zpool_close(zhp);
+ return (1);
+ }
+
+ /* The history must be logged as part of the export */
+ log_history = B_FALSE;
+
+ ret = (zpool_destroy(zhp, history_str) != 0);
+
+ zpool_close(zhp);
+
+ return (ret);
+}
+
+typedef struct export_cbdata {
+ boolean_t force;
+ boolean_t hardforce;
+} export_cbdata_t;
+
+/*
+ * Export one pool
+ */
+static int
+zpool_export_one(zpool_handle_t *zhp, void *data)
+{
+ export_cbdata_t *cb = data;
+
+ if (zpool_disable_datasets(zhp, cb->force) != 0)
+ return (1);
+
+ /* The history must be logged as part of the export */
+ log_history = B_FALSE;
+
+ if (cb->hardforce) {
+ if (zpool_export_force(zhp, history_str) != 0)
+ return (1);
+ } else if (zpool_export(zhp, cb->force, history_str) != 0) {
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * zpool export [-f] <pool> ...
+ *
+ * -a Export all pools
+ * -f Forcefully unmount datasets
+ *
+ * Export the given pools. By default, the command will attempt to cleanly
+ * unmount any active datasets within the pool. If the '-f' flag is specified,
+ * then the datasets will be forcefully unmounted.
+ */
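+/*
+ * Illustrative invocations (editorial note, not part of the original source;
+ * the pool name is hypothetical):
+ *
+ *   zpool export tank                  # cleanly unmount and export one pool
+ *   zpool export -a                    # export every imported pool
+ *   zpool export -f tank               # force-unmount busy datasets first
+ */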
+int
+zpool_do_export(int argc, char **argv)
+{
+ export_cbdata_t cb;
+ boolean_t do_all = B_FALSE;
+ boolean_t force = B_FALSE;
+ boolean_t hardforce = B_FALSE;
+ int c, ret;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "afF")) != -1) {
+ switch (c) {
+ case 'a':
+ do_all = B_TRUE;
+ break;
+ case 'f':
+ force = B_TRUE;
+ break;
+ case 'F':
+ hardforce = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ cb.force = force;
+ cb.hardforce = hardforce;
+ argc -= optind;
+ argv += optind;
+
+ if (do_all) {
+ if (argc != 0) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ return (for_each_pool(argc, argv, B_TRUE, NULL,
+ zpool_export_one, &cb));
+ }
+
+ /* check arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool argument\n"));
+ usage(B_FALSE);
+ }
+
+ ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_export_one, &cb);
+
+ return (ret);
+}
+
+/*
+ * Given a vdev configuration, determine the maximum width needed for the device
+ * name column.
+ */
+static int
+max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max,
+ int name_flags)
+{
+ char *name;
+ nvlist_t **child;
+ uint_t c, children;
+ int ret;
+
+ name = zpool_vdev_name(g_zfs, zhp, nv, name_flags);
+ if (strlen(name) + depth > max)
+ max = strlen(name) + depth;
+
+ free(name);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++)
+ if ((ret = max_width(zhp, child[c], depth + 2,
+ max, name_flags)) > max)
+ max = ret;
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++)
+ if ((ret = max_width(zhp, child[c], depth + 2,
+ max, name_flags)) > max)
+ max = ret;
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++)
+ if ((ret = max_width(zhp, child[c], depth + 2,
+ max, name_flags)) > max)
+ max = ret;
+ }
+
+ return (max);
+}
+
+typedef struct spare_cbdata {
+ uint64_t cb_guid;
+ zpool_handle_t *cb_zhp;
+} spare_cbdata_t;
+
+static boolean_t
+find_vdev(nvlist_t *nv, uint64_t search)
+{
+ uint64_t guid;
+ nvlist_t **child;
+ uint_t c, children;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 &&
+ search == guid)
+ return (B_TRUE);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++)
+ if (find_vdev(child[c], search))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+static int
+find_spare(zpool_handle_t *zhp, void *data)
+{
+ spare_cbdata_t *cbp = data;
+ nvlist_t *config, *nvroot;
+
+ config = zpool_get_config(zhp, NULL);
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+
+ if (find_vdev(nvroot, cbp->cb_guid)) {
+ cbp->cb_zhp = zhp;
+ return (1);
+ }
+
+ zpool_close(zhp);
+ return (0);
+}
+
+typedef struct status_cbdata {
+ int cb_count;
+ int cb_name_flags;
+ int cb_namewidth;
+ boolean_t cb_allpools;
+ boolean_t cb_verbose;
+ boolean_t cb_literal;
+ boolean_t cb_explain;
+ boolean_t cb_first;
+ boolean_t cb_dedup_stats;
+ boolean_t cb_print_status;
+ boolean_t cb_print_slow_ios;
+ boolean_t cb_print_vdev_init;
+ boolean_t cb_print_vdev_trim;
+ vdev_cmd_data_list_t *vcdl;
+} status_cbdata_t;
+
+/* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. */
+static int
+is_blank_str(char *str)
+{
+ while (str != NULL && *str != '\0') {
+ if (!isblank(*str))
+ return (0);
+ str++;
+ }
+ return (1);
+}
+
+/* Print command output lines for specific vdev in a specific pool */
+static void
+zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path)
+{
+ vdev_cmd_data_t *data;
+ int i, j;
+ char *val;
+
+ for (i = 0; i < vcdl->count; i++) {
+ if ((strcmp(vcdl->data[i].path, path) != 0) ||
+ (strcmp(vcdl->data[i].pool, pool) != 0)) {
+ /* Not the vdev we're looking for */
+ continue;
+ }
+
+ data = &vcdl->data[i];
+ /* Print out all the output values for this vdev */
+ for (j = 0; j < vcdl->uniq_cols_cnt; j++) {
+ val = NULL;
+ /* Does this vdev have values for this column? */
+ for (int k = 0; k < data->cols_cnt; k++) {
+ if (strcmp(data->cols[k],
+ vcdl->uniq_cols[j]) == 0) {
+ /* yes it does, record the value */
+ val = data->lines[k];
+ break;
+ }
+ }
+ /*
+ * Mark empty values with dashes to make output
+ * awk-able.
+ */
+ if (is_blank_str(val))
+ val = "-";
+
+ printf("%*s", vcdl->uniq_cols_width[j], val);
+ if (j < vcdl->uniq_cols_cnt - 1)
+ printf(" ");
+ }
+
+ /* Print out any values that aren't in a column at the end */
+ for (j = data->cols_cnt; j < data->lines_cnt; j++) {
+ /* Did we have any columns? If so print a spacer. */
+ if (vcdl->uniq_cols_cnt > 0)
+ printf(" ");
+
+ val = data->lines[j];
+ printf("%s", val ? val : "");
+ }
+ break;
+ }
+}
+
+/*
+ * Print vdev initialization status for leaves
+ */
+static void
+print_status_initialize(vdev_stat_t *vs, boolean_t verbose)
+{
+ if (verbose) {
+ if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE ||
+ vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+ vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) &&
+ !vs->vs_scan_removing) {
+ char zbuf[1024];
+ char tbuf[256];
+ struct tm zaction_ts;
+
+ time_t t = vs->vs_initialize_action_time;
+ int initialize_pct = 100;
+ if (vs->vs_initialize_state !=
+ VDEV_INITIALIZE_COMPLETE) {
+ initialize_pct = (vs->vs_initialize_bytes_done *
+ 100 / (vs->vs_initialize_bytes_est + 1));
+ }
+
+ (void) localtime_r(&t, &zaction_ts);
+ (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+
+ switch (vs->vs_initialize_state) {
+ case VDEV_INITIALIZE_SUSPENDED:
+ (void) snprintf(zbuf, sizeof (zbuf), ", %s %s",
+ gettext("suspended, started at"), tbuf);
+ break;
+ case VDEV_INITIALIZE_ACTIVE:
+ (void) snprintf(zbuf, sizeof (zbuf), ", %s %s",
+ gettext("started at"), tbuf);
+ break;
+ case VDEV_INITIALIZE_COMPLETE:
+ (void) snprintf(zbuf, sizeof (zbuf), ", %s %s",
+ gettext("completed at"), tbuf);
+ break;
+ }
+
+ (void) printf(gettext(" (%d%% initialized%s)"),
+ initialize_pct, zbuf);
+ } else {
+ (void) printf(gettext(" (uninitialized)"));
+ }
+ } else if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) {
+ (void) printf(gettext(" (initializing)"));
+ }
+}
+
+/*
+ * Print vdev TRIM status for leaves
+ */
+static void
+print_status_trim(vdev_stat_t *vs, boolean_t verbose)
+{
+ if (verbose) {
+ if ((vs->vs_trim_state == VDEV_TRIM_ACTIVE ||
+ vs->vs_trim_state == VDEV_TRIM_SUSPENDED ||
+ vs->vs_trim_state == VDEV_TRIM_COMPLETE) &&
+ !vs->vs_scan_removing) {
+ char zbuf[1024];
+ char tbuf[256];
+ struct tm zaction_ts;
+
+ time_t t = vs->vs_trim_action_time;
+ int trim_pct = 100;
+ if (vs->vs_trim_state != VDEV_TRIM_COMPLETE) {
+ trim_pct = (vs->vs_trim_bytes_done *
+ 100 / (vs->vs_trim_bytes_est + 1));
+ }
+
+ (void) localtime_r(&t, &zaction_ts);
+ (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts);
+
+ switch (vs->vs_trim_state) {
+ case VDEV_TRIM_SUSPENDED:
+ (void) snprintf(zbuf, sizeof (zbuf), ", %s %s",
+ gettext("suspended, started at"), tbuf);
+ break;
+ case VDEV_TRIM_ACTIVE:
+ (void) snprintf(zbuf, sizeof (zbuf), ", %s %s",
+ gettext("started at"), tbuf);
+ break;
+ case VDEV_TRIM_COMPLETE:
+ (void) snprintf(zbuf, sizeof (zbuf), ", %s %s",
+ gettext("completed at"), tbuf);
+ break;
+ }
+
+ (void) printf(gettext(" (%d%% trimmed%s)"),
+ trim_pct, zbuf);
+ } else if (vs->vs_trim_notsup) {
+ (void) printf(gettext(" (trim unsupported)"));
+ } else {
+ (void) printf(gettext(" (untrimmed)"));
+ }
+ } else if (vs->vs_trim_state == VDEV_TRIM_ACTIVE) {
+ (void) printf(gettext(" (trimming)"));
+ }
+}
+
+/*
+ * Return the color associated with a health string. This includes returning
+ * NULL for no color change.
+ */
+static char *
+health_str_to_color(const char *health)
+{
+ if (strcmp(health, gettext("FAULTED")) == 0 ||
+ strcmp(health, gettext("SUSPENDED")) == 0 ||
+ strcmp(health, gettext("UNAVAIL")) == 0) {
+ return (ANSI_RED);
+ }
+
+ if (strcmp(health, gettext("OFFLINE")) == 0 ||
+ strcmp(health, gettext("DEGRADED")) == 0 ||
+ strcmp(health, gettext("REMOVED")) == 0) {
+ return (ANSI_YELLOW);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Print out configuration state as requested by status_callback.
+ */
+static void
+print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
+ nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs)
+{
+ nvlist_t **child, *root;
+ uint_t c, i, vsc, children;
+ pool_scan_stat_t *ps = NULL;
+ vdev_stat_t *vs;
+ char rbuf[6], wbuf[6], cbuf[6];
+ char *vname;
+ uint64_t notpresent;
+ spare_cbdata_t spare_cb;
+ const char *state;
+ char *type;
+ char *path = NULL;
+ char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ children = 0;
+
+ verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &vsc) == 0);
+
+ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+ if (strcmp(type, VDEV_TYPE_INDIRECT) == 0)
+ return;
+
+ state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
+
+ if (isspare) {
+ /*
+ * For hot spares, we use the terms 'INUSE' and 'AVAIL' for
+ * online drives.
+ */
+ if (vs->vs_aux == VDEV_AUX_SPARED)
+ state = gettext("INUSE");
+ else if (vs->vs_state == VDEV_STATE_HEALTHY)
+ state = gettext("AVAIL");
+ }
+
+ printf_color(health_str_to_color(state),
+ "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth,
+ name, state);
+
+ if (!isspare) {
+ if (vs->vs_read_errors)
+ rcolor = ANSI_RED;
+
+ if (vs->vs_write_errors)
+ wcolor = ANSI_RED;
+
+ if (vs->vs_checksum_errors)
+ ccolor = ANSI_RED;
+
+ if (cb->cb_literal) {
+ printf(" ");
+ printf_color(rcolor, "%5llu",
+ (u_longlong_t)vs->vs_read_errors);
+ printf(" ");
+ printf_color(wcolor, "%5llu",
+ (u_longlong_t)vs->vs_write_errors);
+ printf(" ");
+ printf_color(ccolor, "%5llu",
+ (u_longlong_t)vs->vs_checksum_errors);
+ } else {
+ zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
+ zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
+ zfs_nicenum(vs->vs_checksum_errors, cbuf,
+ sizeof (cbuf));
+ printf(" ");
+ printf_color(rcolor, "%5s", rbuf);
+ printf(" ");
+ printf_color(wcolor, "%5s", wbuf);
+ printf(" ");
+ printf_color(ccolor, "%5s", cbuf);
+ }
+ if (cb->cb_print_slow_ios) {
+ if (children == 0) {
+ /* Only leaf vdevs have slow IOs */
+ zfs_nicenum(vs->vs_slow_ios, rbuf,
+ sizeof (rbuf));
+ } else {
+ snprintf(rbuf, sizeof (rbuf), "-");
+ }
+
+ if (cb->cb_literal)
+ printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
+ else
+ printf(" %5s", rbuf);
+ }
+ }
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &notpresent) == 0) {
+ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
+ (void) printf(" %s %s", gettext("was"), path);
+ } else if (vs->vs_aux != 0) {
+ (void) printf(" ");
+ color_start(ANSI_RED);
+ switch (vs->vs_aux) {
+ case VDEV_AUX_OPEN_FAILED:
+ (void) printf(gettext("cannot open"));
+ break;
+
+ case VDEV_AUX_BAD_GUID_SUM:
+ (void) printf(gettext("missing device"));
+ break;
+
+ case VDEV_AUX_NO_REPLICAS:
+ (void) printf(gettext("insufficient replicas"));
+ break;
+
+ case VDEV_AUX_VERSION_NEWER:
+ (void) printf(gettext("newer version"));
+ break;
+
+ case VDEV_AUX_UNSUP_FEAT:
+ (void) printf(gettext("unsupported feature(s)"));
+ break;
+
+ case VDEV_AUX_ASHIFT_TOO_BIG:
+ (void) printf(gettext("unsupported minimum blocksize"));
+ break;
+
+ case VDEV_AUX_SPARED:
+ verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+ &spare_cb.cb_guid) == 0);
+ if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) {
+ if (strcmp(zpool_get_name(spare_cb.cb_zhp),
+ zpool_get_name(zhp)) == 0)
+ (void) printf(gettext("currently in "
+ "use"));
+ else
+ (void) printf(gettext("in use by "
+ "pool '%s'"),
+ zpool_get_name(spare_cb.cb_zhp));
+ zpool_close(spare_cb.cb_zhp);
+ } else {
+ (void) printf(gettext("currently in use"));
+ }
+ break;
+
+ case VDEV_AUX_ERR_EXCEEDED:
+ (void) printf(gettext("too many errors"));
+ break;
+
+ case VDEV_AUX_IO_FAILURE:
+ (void) printf(gettext("experienced I/O failures"));
+ break;
+
+ case VDEV_AUX_BAD_LOG:
+ (void) printf(gettext("bad intent log"));
+ break;
+
+ case VDEV_AUX_EXTERNAL:
+ (void) printf(gettext("external device fault"));
+ break;
+
+ case VDEV_AUX_SPLIT_POOL:
+ (void) printf(gettext("split into new pool"));
+ break;
+
+ case VDEV_AUX_ACTIVE:
+ (void) printf(gettext("currently in use"));
+ break;
+
+ case VDEV_AUX_CHILDREN_OFFLINE:
+ (void) printf(gettext("all children offline"));
+ break;
+
+ default:
+ (void) printf(gettext("corrupted data"));
+ break;
+ }
+ color_end();
+ }
+
+ /* The root vdev has the scrub/resilver stats */
+ root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
+ ZPOOL_CONFIG_VDEV_TREE);
+ (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS,
+ (uint64_t **)&ps, &c);
+
+ if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) {
+ if (vs->vs_scan_processed != 0) {
+ (void) printf(gettext(" (%s)"),
+ (ps->pss_func == POOL_SCAN_RESILVER) ?
+ "resilvering" : "repairing");
+ } else if (vs->vs_resilver_deferred) {
+ (void) printf(gettext(" (awaiting resilver)"));
+ }
+ }
+
+ /* The top-level vdevs have the rebuild stats */
+ if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE &&
+ children == 0) {
+ if (vs->vs_rebuild_processed != 0) {
+ (void) printf(gettext(" (resilvering)"));
+ }
+ }
+
+ if (cb->vcdl != NULL) {
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
+ printf(" ");
+ zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path);
+ }
+ }
+
+ /* Display vdev initialization and trim status for leaves */
+ if (children == 0) {
+ print_status_initialize(vs, cb->cb_print_vdev_init);
+ print_status_trim(vs, cb->cb_print_vdev_trim);
+ }
+
+ (void) printf("\n");
+
+ for (c = 0; c < children; c++) {
+ uint64_t islog = B_FALSE, ishole = B_FALSE;
+
+ /* Don't print logs or holes here */
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ &islog);
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+ &ishole);
+ if (islog || ishole)
+ continue;
+ /* Only print normal classes here */
+ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
+ continue;
+
+ /* Provide vdev_rebuild_stats to children if available */
+ if (vrs == NULL) {
+ (void) nvlist_lookup_uint64_array(nv,
+ ZPOOL_CONFIG_REBUILD_STATS,
+ (uint64_t **)&vrs, &i);
+ }
+
+ vname = zpool_vdev_name(g_zfs, zhp, child[c],
+ cb->cb_name_flags | VDEV_NAME_TYPE_ID);
+ print_status_config(zhp, cb, vname, child[c], depth + 2,
+ isspare, vrs);
+ free(vname);
+ }
+}
+
+/*
+ * Print the configuration of an exported pool. Iterate over all vdevs in the
+ * pool, printing out the name and status for each one.
+ */
+static void
+print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv,
+ int depth)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ vdev_stat_t *vs;
+ char *type, *vname;
+
+ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+ if (strcmp(type, VDEV_TYPE_MISSING) == 0 ||
+ strcmp(type, VDEV_TYPE_HOLE) == 0)
+ return;
+
+ verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &c) == 0);
+
+ (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name);
+ (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux));
+
+ if (vs->vs_aux != 0) {
+ (void) printf(" ");
+
+ switch (vs->vs_aux) {
+ case VDEV_AUX_OPEN_FAILED:
+ (void) printf(gettext("cannot open"));
+ break;
+
+ case VDEV_AUX_BAD_GUID_SUM:
+ (void) printf(gettext("missing device"));
+ break;
+
+ case VDEV_AUX_NO_REPLICAS:
+ (void) printf(gettext("insufficient replicas"));
+ break;
+
+ case VDEV_AUX_VERSION_NEWER:
+ (void) printf(gettext("newer version"));
+ break;
+
+ case VDEV_AUX_UNSUP_FEAT:
+ (void) printf(gettext("unsupported feature(s)"));
+ break;
+
+ case VDEV_AUX_ERR_EXCEEDED:
+ (void) printf(gettext("too many errors"));
+ break;
+
+ case VDEV_AUX_ACTIVE:
+ (void) printf(gettext("currently in use"));
+ break;
+
+ case VDEV_AUX_CHILDREN_OFFLINE:
+ (void) printf(gettext("all children offline"));
+ break;
+
+ default:
+ (void) printf(gettext("corrupted data"));
+ break;
+ }
+ }
+ (void) printf("\n");
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ return;
+
+ for (c = 0; c < children; c++) {
+ uint64_t is_log = B_FALSE;
+
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ &is_log);
+ if (is_log)
+ continue;
+ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
+ continue;
+
+ vname = zpool_vdev_name(g_zfs, NULL, child[c],
+ cb->cb_name_flags | VDEV_NAME_TYPE_ID);
+ print_import_config(cb, vname, child[c], depth + 2);
+ free(vname);
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0) {
+ (void) printf(gettext("\tcache\n"));
+ for (c = 0; c < children; c++) {
+ vname = zpool_vdev_name(g_zfs, NULL, child[c],
+ cb->cb_name_flags);
+ (void) printf("\t %s\n", vname);
+ free(vname);
+ }
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+ &child, &children) == 0) {
+ (void) printf(gettext("\tspares\n"));
+ for (c = 0; c < children; c++) {
+ vname = zpool_vdev_name(g_zfs, NULL, child[c],
+ cb->cb_name_flags);
+ (void) printf("\t %s\n", vname);
+ free(vname);
+ }
+ }
+}
+
+/*
+ * Print specialized class vdevs.
+ *
+ * These are recorded as top level vdevs in the main pool child array
+ * but with "is_log" set to 1 or an "alloc_bias" string. We use either
+ * print_status_config() or print_import_config() to print the top level
+ * class vdevs; any of their children (e.g. mirrored slogs) are then printed
+ * recursively, which works because only the top level vdev is marked.
+ */
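+/*
+ * For illustration only (editorial note, not part of the original source;
+ * device names are hypothetical): a pool created with
+ *
+ *   zpool create tank sda log sdb special mirror sdc sdd
+ *
+ * records sdb as a top level child with "is_log" set, and the sdc/sdd mirror
+ * as a top level child with an "alloc_bias" of "special"; both are skipped by
+ * the normal-class walk above and printed here for their matching class.
+ */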
+static void
+print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv,
+ const char *class)
+{
+ uint_t c, children;
+ nvlist_t **child;
+ boolean_t printed = B_FALSE;
+
+ assert(zhp != NULL || !cb->cb_verbose);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) != 0)
+ return;
+
+ for (c = 0; c < children; c++) {
+ uint64_t is_log = B_FALSE;
+ char *bias = NULL;
+ char *type = NULL;
+
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ &is_log);
+
+ if (is_log) {
+ bias = VDEV_ALLOC_CLASS_LOGS;
+ } else {
+ (void) nvlist_lookup_string(child[c],
+ ZPOOL_CONFIG_ALLOCATION_BIAS, &bias);
+ (void) nvlist_lookup_string(child[c],
+ ZPOOL_CONFIG_TYPE, &type);
+ }
+
+ if (bias == NULL || strcmp(bias, class) != 0)
+ continue;
+ if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0)
+ continue;
+
+ if (!printed) {
+ (void) printf("\t%s\t\n", gettext(class));
+ printed = B_TRUE;
+ }
+
+ char *name = zpool_vdev_name(g_zfs, zhp, child[c],
+ cb->cb_name_flags | VDEV_NAME_TYPE_ID);
+ if (cb->cb_print_status)
+ print_status_config(zhp, cb, name, child[c], 2,
+ B_FALSE, NULL);
+ else
+ print_import_config(cb, name, child[c], 2);
+ free(name);
+ }
+}
+
+/*
+ * Display the status for the given pool.
+ */
+static void
+show_import(nvlist_t *config)
+{
+ uint64_t pool_state;
+ vdev_stat_t *vs;
+ char *name;
+ uint64_t guid;
+ uint64_t hostid = 0;
+ char *msgid;
+ char *hostname = "unknown";
+ nvlist_t *nvroot, *nvinfo;
+ zpool_status_t reason;
+ zpool_errata_t errata;
+ const char *health;
+ uint_t vsc;
+ char *comment;
+ status_cbdata_t cb = { 0 };
+
+ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+ &name) == 0);
+ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &guid) == 0);
+ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ &pool_state) == 0);
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+
+ verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &vsc) == 0);
+ health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
+
+ reason = zpool_import_status(config, &msgid, &errata);
+
+ (void) printf(gettext(" pool: %s\n"), name);
+ (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid);
+ (void) printf(gettext(" state: %s"), health);
+ if (pool_state == POOL_STATE_DESTROYED)
+ (void) printf(gettext(" (DESTROYED)"));
+ (void) printf("\n");
+
+ switch (reason) {
+ case ZPOOL_STATUS_MISSING_DEV_R:
+ case ZPOOL_STATUS_MISSING_DEV_NR:
+ case ZPOOL_STATUS_BAD_GUID_SUM:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices are "
+ "missing from the system.\n"));
+ break;
+
+ case ZPOOL_STATUS_CORRUPT_LABEL_R:
+ case ZPOOL_STATUS_CORRUPT_LABEL_NR:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices contains"
+ " corrupted data.\n"));
+ break;
+
+ case ZPOOL_STATUS_CORRUPT_DATA:
+ (void) printf(
+ gettext(" status: The pool data is corrupted.\n"));
+ break;
+
+ case ZPOOL_STATUS_OFFLINE_DEV:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices "
+ "are offlined.\n"));
+ break;
+
+ case ZPOOL_STATUS_CORRUPT_POOL:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool metadata is "
+ "corrupted.\n"));
+ break;
+
+ case ZPOOL_STATUS_VERSION_OLDER:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool is formatted using "
+ "a legacy on-disk version.\n"));
+ break;
+
+ case ZPOOL_STATUS_VERSION_NEWER:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool is formatted using "
+ "an incompatible version.\n"));
+ break;
+
+ case ZPOOL_STATUS_FEAT_DISABLED:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("Some supported features are "
+ "not enabled on the pool.\n"));
+ break;
+
+ case ZPOOL_STATUS_UNSUP_FEAT_READ:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool uses the following "
+ "feature(s) not supported on this system:\n"));
+ color_start(ANSI_YELLOW);
+ zpool_print_unsup_feat(config);
+ color_end();
+ break;
+
+ case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool can only be "
+ "accessed in read-only mode on this system. It\n\tcannot be"
+ " accessed in read-write mode because it uses the "
+ "following\n\tfeature(s) not supported on this system:\n"));
+ color_start(ANSI_YELLOW);
+ zpool_print_unsup_feat(config);
+ color_end();
+ break;
+
+ case ZPOOL_STATUS_HOSTID_ACTIVE:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool is currently "
+ "imported by another system.\n"));
+ break;
+
+ case ZPOOL_STATUS_HOSTID_REQUIRED:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool has the "
+ "multihost property on. It cannot\n\tbe safely imported "
+ "when the system hostid is not set.\n"));
+ break;
+
+ case ZPOOL_STATUS_HOSTID_MISMATCH:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool was last accessed "
+ "by another system.\n"));
+ break;
+
+ case ZPOOL_STATUS_FAULTED_DEV_R:
+ case ZPOOL_STATUS_FAULTED_DEV_NR:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices are "
+ "faulted.\n"));
+ break;
+
+ case ZPOOL_STATUS_BAD_LOG:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("An intent log record cannot "
+ "be read.\n"));
+ break;
+
+ case ZPOOL_STATUS_RESILVERING:
+ case ZPOOL_STATUS_REBUILDING:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices were "
+ "being resilvered.\n"));
+ break;
+
+ case ZPOOL_STATUS_ERRATA:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"),
+ errata);
+ break;
+
+ default:
+ /*
+ * No other status can be seen when importing pools.
+ */
+ assert(reason == ZPOOL_STATUS_OK);
+ }
+
+ /*
+ * Print out an action according to the overall state of the pool.
+ */
+ if (vs->vs_state == VDEV_STATE_HEALTHY) {
+ if (reason == ZPOOL_STATUS_VERSION_OLDER ||
+ reason == ZPOOL_STATUS_FEAT_DISABLED) {
+ (void) printf(gettext(" action: The pool can be "
+ "imported using its name or numeric identifier, "
+ "though\n\tsome features will not be available "
+ "without an explicit 'zpool upgrade'.\n"));
+ } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) {
+ (void) printf(gettext(" action: The pool can be "
+ "imported using its name or numeric "
+ "identifier and\n\tthe '-f' flag.\n"));
+ } else if (reason == ZPOOL_STATUS_ERRATA) {
+ switch (errata) {
+ case ZPOOL_ERRATA_NONE:
+ break;
+
+ case ZPOOL_ERRATA_ZOL_2094_SCRUB:
+ (void) printf(gettext(" action: The pool can "
+ "be imported using its name or numeric "
+ "identifier,\n\thowever there is a compat"
+ "ibility issue which should be corrected"
+ "\n\tby running 'zpool scrub'\n"));
+ break;
+
+ case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY:
+ (void) printf(gettext(" action: The pool can"
+ "not be imported with this version of ZFS "
+ "due to\n\tan active asynchronous destroy. "
+ "Revert to an earlier version\n\tand "
+ "allow the destroy to complete before "
+ "updating.\n"));
+ break;
+
+ case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION:
+ (void) printf(gettext(" action: Existing "
+ "encrypted datasets contain an on-disk "
+ "incompatibility, which\n\tneeds to be "
+ "corrected. Backup these datasets to new "
+ "encrypted datasets\n\tand destroy the "
+ "old ones.\n"));
+ break;
+
+ case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION:
+ (void) printf(gettext(" action: Existing "
+ "encrypted snapshots and bookmarks contain "
+ "an on-disk\n\tincompatibility. This may "
+ "cause on-disk corruption if they are used"
+ "\n\twith 'zfs recv'. To correct the "
+ "issue, enable the bookmark_v2 feature.\n\t"
+ "No additional action is needed if there "
+ "are no encrypted snapshots or\n\t"
+ "bookmarks. If preserving the encrypted "
+ "snapshots and bookmarks is\n\trequired, "
+ "use a non-raw send to backup and restore "
+ "them. Alternately,\n\tthey may be removed"
+ " to resolve the incompatibility.\n"));
+ break;
+ default:
+ /*
+ * All errata must contain an action message.
+ */
+ assert(0);
+ }
+ } else {
+ (void) printf(gettext(" action: The pool can be "
+ "imported using its name or numeric "
+ "identifier.\n"));
+ }
+ } else if (vs->vs_state == VDEV_STATE_DEGRADED) {
+ (void) printf(gettext(" action: The pool can be imported "
+ "despite missing or damaged devices. The\n\tfault "
+ "tolerance of the pool may be compromised if imported.\n"));
+ } else {
+ switch (reason) {
+ case ZPOOL_STATUS_VERSION_NEWER:
+ (void) printf(gettext(" action: The pool cannot be "
+ "imported. Access the pool on a system running "
+ "newer\n\tsoftware, or recreate the pool from "
+ "backup.\n"));
+ break;
+ case ZPOOL_STATUS_UNSUP_FEAT_READ:
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("The pool cannot be "
+ "imported. Access the pool on a system that "
+ "supports\n\tthe required feature(s), or recreate "
+ "the pool from backup.\n"));
+ break;
+ case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("The pool cannot be "
+ "imported in read-write mode. Import the pool "
+ "with\n"
+ "\t\"-o readonly=on\", access the pool on a system "
+ "that supports the\n\trequired feature(s), or "
+ "recreate the pool from backup.\n"));
+ break;
+ case ZPOOL_STATUS_MISSING_DEV_R:
+ case ZPOOL_STATUS_MISSING_DEV_NR:
+ case ZPOOL_STATUS_BAD_GUID_SUM:
+ (void) printf(gettext(" action: The pool cannot be "
+ "imported. Attach the missing\n\tdevices and try "
+ "again.\n"));
+ break;
+ case ZPOOL_STATUS_HOSTID_ACTIVE:
+ VERIFY0(nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_LOAD_INFO, &nvinfo));
+
+ if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME))
+ hostname = fnvlist_lookup_string(nvinfo,
+ ZPOOL_CONFIG_MMP_HOSTNAME);
+
+ if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID))
+ hostid = fnvlist_lookup_uint64(nvinfo,
+ ZPOOL_CONFIG_MMP_HOSTID);
+
+ (void) printf(gettext(" action: The pool must be "
+ "exported from %s (hostid=%lx)\n\tbefore it "
+ "can be safely imported.\n"), hostname,
+ (unsigned long) hostid);
+ break;
+ case ZPOOL_STATUS_HOSTID_REQUIRED:
+ (void) printf(gettext(" action: Set a unique system "
+ "hostid with the zgenhostid(8) command.\n"));
+ break;
+ default:
+ (void) printf(gettext(" action: The pool cannot be "
+ "imported due to damaged devices or data.\n"));
+ }
+ }
+
+ /* Print the comment attached to the pool. */
+ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
+ (void) printf(gettext("comment: %s\n"), comment);
+
+ /*
+ * If the state is "closed" or "can't open", and the aux state
+ * is "corrupt data":
+ */
+ if (((vs->vs_state == VDEV_STATE_CLOSED) ||
+ (vs->vs_state == VDEV_STATE_CANT_OPEN)) &&
+ (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) {
+ if (pool_state == POOL_STATE_DESTROYED)
+ (void) printf(gettext("\tThe pool was destroyed, "
+ "but can be imported using the '-Df' flags.\n"));
+ else if (pool_state != POOL_STATE_EXPORTED)
+ (void) printf(gettext("\tThe pool may be active on "
+ "another system, but can be imported using\n\t"
+ "the '-f' flag.\n"));
+ }
+
+ if (msgid != NULL) {
+ (void) printf(gettext(
+ " see: https://openzfs.github.io/openzfs-docs/msg/%s\n"),
+ msgid);
+ }
+
+ (void) printf(gettext(" config:\n\n"));
+
+ cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name),
+ VDEV_NAME_TYPE_ID);
+ if (cb.cb_namewidth < 10)
+ cb.cb_namewidth = 10;
+
+ print_import_config(&cb, name, nvroot, 0);
+
+ print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP);
+ print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL);
+ print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS);
+
+ if (reason == ZPOOL_STATUS_BAD_GUID_SUM) {
+ (void) printf(gettext("\n\tAdditional devices are known to "
+ "be part of this pool, though their\n\texact "
+ "configuration cannot be determined.\n"));
+ }
+}
+
+static boolean_t
+zfs_force_import_required(nvlist_t *config)
+{
+ uint64_t state;
+ uint64_t hostid = 0;
+ nvlist_t *nvinfo;
+
+ state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE);
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
+
+ if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid())
+ return (B_TRUE);
+
+ nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
+ if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) {
+ mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo,
+ ZPOOL_CONFIG_MMP_STATE);
+
+ if (mmp_state != MMP_STATE_INACTIVE)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Perform the import for the given configuration. This passes the heavy
+ * lifting off to zpool_import_props(), and then mounts the datasets contained
+ * within the pool.
+ */
+static int
+do_import(nvlist_t *config, const char *newname, const char *mntopts,
+ nvlist_t *props, int flags)
+{
+ int ret = 0;
+ zpool_handle_t *zhp;
+ char *name;
+ uint64_t version;
+
+ name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME);
+ version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION);
+
+ if (!SPA_VERSION_IS_SUPPORTED(version)) {
+ (void) fprintf(stderr, gettext("cannot import '%s': pool "
+ "is formatted using an unsupported ZFS version\n"), name);
+ return (1);
+ } else if (zfs_force_import_required(config) &&
+ !(flags & ZFS_IMPORT_ANY_HOST)) {
+ mmp_state_t mmp_state = MMP_STATE_INACTIVE;
+ nvlist_t *nvinfo;
+
+ nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
+ if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE))
+ mmp_state = fnvlist_lookup_uint64(nvinfo,
+ ZPOOL_CONFIG_MMP_STATE);
+
+ if (mmp_state == MMP_STATE_ACTIVE) {
+ char *hostname = "<unknown>";
+ uint64_t hostid = 0;
+
+ if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME))
+ hostname = fnvlist_lookup_string(nvinfo,
+ ZPOOL_CONFIG_MMP_HOSTNAME);
+
+ if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID))
+ hostid = fnvlist_lookup_uint64(nvinfo,
+ ZPOOL_CONFIG_MMP_HOSTID);
+
+ (void) fprintf(stderr, gettext("cannot import '%s': "
+ "pool is imported on %s (hostid: "
+ "0x%lx)\nExport the pool on the other system, "
+ "then run 'zpool import'.\n"),
+ name, hostname, (unsigned long) hostid);
+ } else if (mmp_state == MMP_STATE_NO_HOSTID) {
+ (void) fprintf(stderr, gettext("Cannot import '%s': "
+ "pool has the multihost property on and the\n"
+ "system's hostid is not set. Set a unique hostid "
+ "with the zgenhostid(8) command.\n"), name);
+ } else {
+ char *hostname = "<unknown>";
+ uint64_t timestamp = 0;
+ uint64_t hostid = 0;
+
+ if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME))
+ hostname = fnvlist_lookup_string(config,
+ ZPOOL_CONFIG_HOSTNAME);
+
+ if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP))
+ timestamp = fnvlist_lookup_uint64(config,
+ ZPOOL_CONFIG_TIMESTAMP);
+
+ if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID))
+ hostid = fnvlist_lookup_uint64(config,
+ ZPOOL_CONFIG_HOSTID);
+
+ (void) fprintf(stderr, gettext("cannot import '%s': "
+ "pool was previously in use from another system.\n"
+ "Last accessed by %s (hostid=%lx) at %s"
+ "The pool can be imported, use 'zpool import -f' "
+ "to import the pool.\n"), name, hostname,
+ (unsigned long)hostid, ctime((time_t *)&timestamp));
+ }
+
+ return (1);
+ }
+
+ if (zpool_import_props(g_zfs, config, newname, props, flags) != 0)
+ return (1);
+
+ if (newname != NULL)
+ name = (char *)newname;
+
+ if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL)
+ return (1);
+
+ /*
+ * Loading keys is best effort. We don't want to return immediately
+ * if it fails but we do want to give the error to the caller.
+ */
+ if (flags & ZFS_IMPORT_LOAD_KEYS) {
+ ret = zfs_crypto_attempt_load_keys(g_zfs, name);
+ if (ret != 0)
+ ret = 1;
+ }
+
+ if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
+ !(flags & ZFS_IMPORT_ONLY) &&
+ zpool_enable_datasets(zhp, mntopts, 0) != 0) {
+ zpool_close(zhp);
+ return (1);
+ }
+
+ zpool_close(zhp);
+ return (ret);
+}
+
+typedef struct target_exists_args {
+ const char *poolname;
+ uint64_t poolguid;
+} target_exists_args_t;
+
+static int
+name_or_guid_exists(zpool_handle_t *zhp, void *data)
+{
+ target_exists_args_t *args = data;
+ nvlist_t *config = zpool_get_config(zhp, NULL);
+ int found = 0;
+
+ if (config == NULL)
+ return (0);
+
+ if (args->poolname != NULL) {
+ char *pool_name;
+
+ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+ &pool_name) == 0);
+ if (strcmp(pool_name, args->poolname) == 0)
+ found = 1;
+ } else {
+ uint64_t pool_guid;
+
+ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &pool_guid) == 0);
+ if (pool_guid == args->poolguid)
+ found = 1;
+ }
+ zpool_close(zhp);
+
+ return (found);
+}
+
+/*
+ * zpool checkpoint <pool>
+ * checkpoint --discard <pool>
+ *
+ * -d Discard the checkpoint from a checkpointed
+ * --discard pool.
+ *
+ * -w Wait for discarding a checkpoint to complete.
+ * --wait
+ *
+ * Checkpoints the specified pool by taking a "snapshot" of its
+ * current state. A pool can only have one checkpoint at a time.
+ */
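+/*
+ * Illustrative invocations (editorial note, not part of the original source;
+ * the pool name is hypothetical):
+ *
+ *   zpool checkpoint tank              # take a checkpoint of the pool
+ *   zpool checkpoint -d -w tank        # discard it and wait for completion
+ */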
+int
+zpool_do_checkpoint(int argc, char **argv)
+{
+ boolean_t discard, wait;
+ char *pool;
+ zpool_handle_t *zhp;
+ int c, err;
+
+ struct option long_options[] = {
+ {"discard", no_argument, NULL, 'd'},
+ {"wait", no_argument, NULL, 'w'},
+ {0, 0, 0, 0}
+ };
+
+ discard = B_FALSE;
+ wait = B_FALSE;
+ while ((c = getopt_long(argc, argv, ":dw", long_options, NULL)) != -1) {
+ switch (c) {
+ case 'd':
+ discard = B_TRUE;
+ break;
+ case 'w':
+ wait = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ if (wait && !discard) {
+ (void) fprintf(stderr, gettext("--wait only valid when "
+ "--discard also specified\n"));
+ usage(B_FALSE);
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool argument\n"));
+ usage(B_FALSE);
+ }
+
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ pool = argv[0];
+
+ if ((zhp = zpool_open(g_zfs, pool)) == NULL) {
+ /* As a special case, check for use of '/' in the name */
+ if (strchr(pool, '/') != NULL)
+ (void) fprintf(stderr, gettext("'zpool checkpoint' "
+ "doesn't work on datasets. To save the state "
+ "of a dataset from a specific point in time "
+ "please use 'zfs snapshot'\n"));
+ return (1);
+ }
+
+ if (discard) {
+ err = (zpool_discard_checkpoint(zhp) != 0);
+ if (err == 0 && wait)
+ err = zpool_wait(zhp, ZPOOL_WAIT_CKPT_DISCARD);
+ } else {
+ err = (zpool_checkpoint(zhp) != 0);
+ }
+
+ zpool_close(zhp);
+
+ return (err);
+}
+
+#define CHECKPOINT_OPT 1024
+
+/*
+ * zpool import [-d dir] [-D]
+ * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l]
+ * [-d dir | -c cachefile] [-f] -a
+ * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l]
+ * [-d dir | -c cachefile] [-f] [-n] [-F] <pool | id> [newpool]
+ *
+ * -c Read pool information from a cachefile instead of searching
+ * devices.
+ *
+ * -d Scan in a specific directory, other than /dev/. More than
+ * one directory can be specified using multiple '-d' options.
+ *
+ * -D Scan for previously destroyed pools or import all or only
+ * specified destroyed pools.
+ *
+ * -R Temporarily import the pool, with all mountpoints relative to
+ * the given root. The pool will remain exported when the machine
+ * is rebooted.
+ *
+ * -V Import even in the presence of faulted vdevs. This is an
+ * intentionally undocumented option for testing purposes, and
+ * treats the pool configuration as complete, leaving any bad
+ * vdevs in the FAULTED state. In other words, it does a verbatim
+ * import.
+ *
+ * -f Force import, even if it appears that the pool is active.
+ *
+ * -F Attempt rewind if necessary.
+ *
+ * -n See if rewind would work, but don't actually rewind.
+ *
+ * -N Import the pool but don't mount datasets.
+ *
+ * -T Specify a starting txg to use for import. This option is
+ * intentionally undocumented for testing purposes.
+ *
+ * -a Import all pools found.
+ *
+ * -l Load encryption keys while importing.
+ *
+ * -o Set property=value and/or temporary mount options (without '=').
+ *
+ * -s Scan using the default search path, the libblkid cache will
+ * not be consulted.
+ *
+ * --rewind-to-checkpoint
+ * Import the pool and revert back to the checkpoint.
+ *
+ * The import command scans for pools to import, and imports pools based on pool
+ * name and GUID. The pool can also be renamed as part of the import process.
+ */
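+/*
+ * Illustrative invocations (editorial note, not part of the original source;
+ * pool names, directories, and properties are hypothetical):
+ *
+ *   zpool import                       # list importable pools
+ *   zpool import -d /dev/disk/by-id tank
+ *   zpool import -o readonly=on -R /mnt tank tank2   # rename on import
+ *   zpool import -Fn tank              # check whether rewind would succeed
+ */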
+int
+zpool_do_import(int argc, char **argv)
+{
+ char **searchdirs = NULL;
+ char *env, *envdup = NULL;
+ int nsearch = 0;
+ int c;
+ int err = 0;
+ nvlist_t *pools = NULL;
+ boolean_t do_all = B_FALSE;
+ boolean_t do_destroyed = B_FALSE;
+ char *mntopts = NULL;
+ nvpair_t *elem;
+ nvlist_t *config;
+ uint64_t searchguid = 0;
+ char *searchname = NULL;
+ char *propval;
+ nvlist_t *found_config;
+ nvlist_t *policy = NULL;
+ nvlist_t *props = NULL;
+ boolean_t first;
+ int flags = ZFS_IMPORT_NORMAL;
+ uint32_t rewind_policy = ZPOOL_NO_REWIND;
+ boolean_t dryrun = B_FALSE;
+ boolean_t do_rewind = B_FALSE;
+ boolean_t xtreme_rewind = B_FALSE;
+ boolean_t do_scan = B_FALSE;
+ boolean_t pool_exists = B_FALSE;
+ uint64_t pool_state, txg = -1ULL;
+ char *cachefile = NULL;
+ importargs_t idata = { 0 };
+ char *endptr;
+
+ struct option long_options[] = {
+ {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT},
+ {0, 0, 0, 0}
+ };
+
+ /* check options */
+ while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX",
+ long_options, NULL)) != -1) {
+ switch (c) {
+ case 'a':
+ do_all = B_TRUE;
+ break;
+ case 'c':
+ cachefile = optarg;
+ break;
+ case 'd':
+ if (searchdirs == NULL) {
+ searchdirs = safe_malloc(sizeof (char *));
+ } else {
+ char **tmp = safe_malloc((nsearch + 1) *
+ sizeof (char *));
+ bcopy(searchdirs, tmp, nsearch *
+ sizeof (char *));
+ free(searchdirs);
+ searchdirs = tmp;
+ }
+ searchdirs[nsearch++] = optarg;
+ break;
+ case 'D':
+ do_destroyed = B_TRUE;
+ break;
+ case 'f':
+ flags |= ZFS_IMPORT_ANY_HOST;
+ break;
+ case 'F':
+ do_rewind = B_TRUE;
+ break;
+ case 'l':
+ flags |= ZFS_IMPORT_LOAD_KEYS;
+ break;
+ case 'm':
+ flags |= ZFS_IMPORT_MISSING_LOG;
+ break;
+ case 'n':
+ dryrun = B_TRUE;
+ break;
+ case 'N':
+ flags |= ZFS_IMPORT_ONLY;
+ break;
+ case 'o':
+ if ((propval = strchr(optarg, '=')) != NULL) {
+ *propval = '\0';
+ propval++;
+ if (add_prop_list(optarg, propval,
+ &props, B_TRUE))
+ goto error;
+ } else {
+ mntopts = optarg;
+ }
+ break;
+ case 'R':
+ if (add_prop_list(zpool_prop_to_name(
+ ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE))
+ goto error;
+ if (add_prop_list_default(zpool_prop_to_name(
+ ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
+ goto error;
+ break;
+ case 's':
+ do_scan = B_TRUE;
+ break;
+ case 't':
+ flags |= ZFS_IMPORT_TEMP_NAME;
+ if (add_prop_list_default(zpool_prop_to_name(
+ ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
+ goto error;
+ break;
+
+ case 'T':
+ errno = 0;
+ txg = strtoull(optarg, &endptr, 0);
+ if (errno != 0 || *endptr != '\0') {
+ (void) fprintf(stderr,
+ gettext("invalid txg value\n"));
+ usage(B_FALSE);
+ }
+ rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND;
+ break;
+ case 'V':
+ flags |= ZFS_IMPORT_VERBATIM;
+ break;
+ case 'X':
+ xtreme_rewind = B_TRUE;
+ break;
+ case CHECKPOINT_OPT:
+ flags |= ZFS_IMPORT_CHECKPOINT;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (cachefile && nsearch != 0) {
+ (void) fprintf(stderr, gettext("-c is incompatible with -d\n"));
+ usage(B_FALSE);
+ }
+
+ if ((flags & ZFS_IMPORT_LOAD_KEYS) && (flags & ZFS_IMPORT_ONLY)) {
+ (void) fprintf(stderr, gettext("-l is incompatible with -N\n"));
+ usage(B_FALSE);
+ }
+
+ if ((flags & ZFS_IMPORT_LOAD_KEYS) && !do_all && argc == 0) {
+ (void) fprintf(stderr, gettext("-l is only meaningful during "
+ "an import\n"));
+ usage(B_FALSE);
+ }
+
+ if ((dryrun || xtreme_rewind) && !do_rewind) {
+ (void) fprintf(stderr,
+ gettext("-n or -X only meaningful with -F\n"));
+ usage(B_FALSE);
+ }
+ if (dryrun)
+ rewind_policy = ZPOOL_TRY_REWIND;
+ else if (do_rewind)
+ rewind_policy = ZPOOL_DO_REWIND;
+ if (xtreme_rewind)
+ rewind_policy |= ZPOOL_EXTREME_REWIND;
+
+ /* In the future, we can capture further policy and include it here */
+ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+ nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 ||
+ nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY,
+ rewind_policy) != 0)
+ goto error;
+
+ /* check argument count */
+ if (do_all) {
+ if (argc != 0) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+ } else {
+ if (argc > 2) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+ }
+
+ /*
+ * Check for the effective uid. We do this explicitly here because
+ * otherwise any attempt to discover pools will silently fail.
+ */
+ if (argc == 0 && geteuid() != 0) {
+ (void) fprintf(stderr, gettext("cannot "
+ "discover pools: permission denied\n"));
+ if (searchdirs != NULL)
+ free(searchdirs);
+
+ nvlist_free(props);
+ nvlist_free(policy);
+ return (1);
+ }
+
+ /*
+ * Depending on the arguments given, we do one of the following:
+ *
+ * <none> Iterate through all pools and display information about
+ * each one.
+ *
+ * -a Iterate through all pools and try to import each one.
+ *
+ * <id> Find the pool that corresponds to the given GUID/pool
+ * name and import that one.
+ *
+ * -D Above options applies only to destroyed pools.
+ */
+ if (argc != 0) {
+ char *endptr;
+
+ errno = 0;
+ searchguid = strtoull(argv[0], &endptr, 10);
+ if (errno != 0 || *endptr != '\0') {
+ searchname = argv[0];
+ searchguid = 0;
+ }
+ found_config = NULL;
+
+ /*
+ * User specified a name or guid. Ensure it's unique.
+ */
+ target_exists_args_t search = {searchname, searchguid};
+ pool_exists = zpool_iter(g_zfs, name_or_guid_exists, &search);
+ }
+
+ /*
+ * Check the environment for the preferred search path.
+ */
+ if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) {
+ char *dir;
+
+ envdup = strdup(env);
+
+ dir = strtok(envdup, ":");
+ while (dir != NULL) {
+ if (searchdirs == NULL) {
+ searchdirs = safe_malloc(sizeof (char *));
+ } else {
+ char **tmp = safe_malloc((nsearch + 1) *
+ sizeof (char *));
+ bcopy(searchdirs, tmp, nsearch *
+ sizeof (char *));
+ free(searchdirs);
+ searchdirs = tmp;
+ }
+ searchdirs[nsearch++] = dir;
+ dir = strtok(NULL, ":");
+ }
+ }
+
+ idata.path = searchdirs;
+ idata.paths = nsearch;
+ idata.poolname = searchname;
+ idata.guid = searchguid;
+ idata.cachefile = cachefile;
+ idata.scan = do_scan;
+ idata.policy = policy;
+
+ pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops);
+
+ if (pools != NULL && pool_exists &&
+ (argc == 1 || strcmp(argv[0], argv[1]) == 0)) {
+ (void) fprintf(stderr, gettext("cannot import '%s': "
+ "a pool with that name already exists\n"),
+ argv[0]);
+ (void) fprintf(stderr, gettext("use the form '%s "
+ "<pool | id> <newpool>' to give it a new name\n"),
+ "zpool import");
+ err = 1;
+ } else if (pools == NULL && pool_exists) {
+ (void) fprintf(stderr, gettext("cannot import '%s': "
+ "a pool with that name is already created/imported,\n"),
+ argv[0]);
+ (void) fprintf(stderr, gettext("and no additional pools "
+ "with that name were found\n"));
+ err = 1;
+ } else if (pools == NULL) {
+ if (argc != 0) {
+ (void) fprintf(stderr, gettext("cannot import '%s': "
+ "no such pool available\n"), argv[0]);
+ }
+ err = 1;
+ }
+
+ if (err == 1) {
+ if (searchdirs != NULL)
+ free(searchdirs);
+ if (envdup != NULL)
+ free(envdup);
+ nvlist_free(policy);
+ nvlist_free(pools);
+ nvlist_free(props);
+ return (1);
+ }
+
+ /*
+ * At this point we have a list of import candidate configs. Even if
+ * we were searching by pool name or guid, we still need to
+ * post-process the list to deal with pool state and possible
+ * duplicate names.
+ */
+ err = 0;
+ elem = NULL;
+ first = B_TRUE;
+ while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
+
+ verify(nvpair_value_nvlist(elem, &config) == 0);
+
+ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ &pool_state) == 0);
+ if (!do_destroyed && pool_state == POOL_STATE_DESTROYED)
+ continue;
+ if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
+ continue;
+
+ verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
+ policy) == 0);
+
+ if (argc == 0) {
+ if (first)
+ first = B_FALSE;
+ else if (!do_all)
+ (void) printf("\n");
+
+ if (do_all) {
+ err |= do_import(config, NULL, mntopts,
+ props, flags);
+ } else {
+ show_import(config);
+ }
+ } else if (searchname != NULL) {
+ char *name;
+
+ /*
+ * We are searching for a pool based on name.
+ */
+ verify(nvlist_lookup_string(config,
+ ZPOOL_CONFIG_POOL_NAME, &name) == 0);
+
+ if (strcmp(name, searchname) == 0) {
+ if (found_config != NULL) {
+ (void) fprintf(stderr, gettext(
+ "cannot import '%s': more than "
+ "one matching pool\n"), searchname);
+ (void) fprintf(stderr, gettext(
+ "import by numeric ID instead\n"));
+ err = B_TRUE;
+ }
+ found_config = config;
+ }
+ } else {
+ uint64_t guid;
+
+ /*
+ * Search for a pool by guid.
+ */
+ verify(nvlist_lookup_uint64(config,
+ ZPOOL_CONFIG_POOL_GUID, &guid) == 0);
+
+ if (guid == searchguid)
+ found_config = config;
+ }
+ }
+
+ /*
+ * If we were searching for a specific pool, verify that we found a
+ * pool, and then do the import.
+ */
+ if (argc != 0 && err == 0) {
+ if (found_config == NULL) {
+ (void) fprintf(stderr, gettext("cannot import '%s': "
+ "no such pool available\n"), argv[0]);
+ err = B_TRUE;
+ } else {
+ err |= do_import(found_config, argc == 1 ? NULL :
+ argv[1], mntopts, props, flags);
+ }
+ }
+
+ /*
+ * If we were just looking for pools, report an error if none were
+ * found.
+ */
+ if (argc == 0 && first)
+ (void) fprintf(stderr,
+ gettext("no pools available to import\n"));
+
+error:
+ nvlist_free(props);
+ nvlist_free(pools);
+ nvlist_free(policy);
+ if (searchdirs != NULL)
+ free(searchdirs);
+ if (envdup != NULL)
+ free(envdup);
+
+ return (err ? 1 : 0);
+}
+
+/*
+ * zpool sync [-f] [pool] ...
+ *
+ * -f (undocumented) force uberblock (and config including zpool cache file)
+ * update.
+ *
+ * Sync the specified pool(s).
+ * Without arguments "zpool sync" will sync all pools.
+ * This command initiates TXG sync(s) and will return after the TXG(s) commit.
+ *
+ */
+static int
+zpool_do_sync(int argc, char **argv)
+{
+ int ret;
+ boolean_t force = B_FALSE;
+
+ /* check options */
+ while ((ret = getopt(argc, argv, "f")) != -1) {
+ switch (ret) {
+ case 'f':
+ force = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* if argc == 0 we will execute zpool_sync_one on all pools */
+ ret = for_each_pool(argc, argv, B_FALSE, NULL, zpool_sync_one, &force);
+
+ return (ret);
+}
+
+typedef struct iostat_cbdata {
+ uint64_t cb_flags;
+ int cb_name_flags;
+ int cb_namewidth;
+ int cb_iteration;
+ char **cb_vdev_names; /* Only show these vdevs */
+ unsigned int cb_vdev_names_count;
+ boolean_t cb_verbose;
+ boolean_t cb_literal;
+ boolean_t cb_scripted;
+ zpool_list_t *cb_list;
+ vdev_cmd_data_list_t *vcdl;
+} iostat_cbdata_t;
+
+/* iostat labels */
+typedef struct name_and_columns {
+ const char *name; /* Column name */
+ unsigned int columns; /* Center name to this number of columns */
+} name_and_columns_t;
+
+#define IOSTAT_MAX_LABELS 13 /* Max number of labels on one line */
+
+static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] =
+{
+ [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2},
+ {NULL}},
+ [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2},
+ {"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {NULL}},
+ [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2},
+ {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2},
+ {"trimq_write", 2}, {NULL}},
+ [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2},
+ {"asyncq_wait", 2}, {NULL}},
+ [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2},
+ {"async_read", 2}, {"async_write", 2}, {"scrub", 2},
+ {"trim", 2}, {NULL}},
+};
+
+/* Shorthand - if "columns" field not set, default to 1 column */
+static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] =
+{
+ [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"},
+ {"write"}, {NULL}},
+ [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"},
+ {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {NULL}},
+ [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"},
+ {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"},
+ {"pend"}, {"activ"}, {NULL}},
+ [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"},
+ {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {NULL}},
+ [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"},
+ {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}},
+};
+
+static const char *histo_to_title[] = {
+ [IOS_L_HISTO] = "latency",
+ [IOS_RQ_HISTO] = "req_size",
+};
+
+/*
+ * Return the number of labels in a null-terminated name_and_columns_t
+ * array.
+ *
+ */
+static unsigned int
+label_array_len(const name_and_columns_t *labels)
+{
+ int i = 0;
+
+ while (labels[i].name)
+ i++;
+
+ return (i);
+}
+
+/*
+ * Return the number of strings in a null-terminated string array.
+ * For example:
+ *
+ * const char *foo[] = {"bar", "baz", NULL}
+ *
+ * returns 2
+ */
+static uint64_t
+str_array_len(const char *array[])
+{
+ uint64_t i = 0;
+ while (array[i])
+ i++;
+
+ return (i);
+}
+
+
+/*
+ * Return a default column width for default/latency/queue columns. This does
+ * not include histograms, which have their columns autosized.
+ */
+static unsigned int
+default_column_width(iostat_cbdata_t *cb, enum iostat_type type)
+{
+ unsigned long column_width = 5; /* Normal niceprint */
+ static unsigned long widths[] = {
+ /*
+ * Choose some sane default column sizes for printing the
+ * raw numbers.
+ */
+ [IOS_DEFAULT] = 15, /* 1PB capacity */
+ [IOS_LATENCY] = 10, /* 1B ns = 10sec */
+ [IOS_QUEUES] = 6, /* 1M queue entries */
+ [IOS_L_HISTO] = 10, /* 1B ns = 10sec */
+ [IOS_RQ_HISTO] = 6, /* 1M queue entries */
+ };
+
+ if (cb->cb_literal)
+ column_width = widths[type];
+
+ return (column_width);
+}
+
+/*
+ * Print the column labels, i.e.:
+ *
+ * capacity operations bandwidth
+ * alloc free read write read write ...
+ *
+ * If force_column_width is set, use it for the column width. If not set, use
+ * the default column width.
+ */
+static void
+print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width,
+ const name_and_columns_t labels[][IOSTAT_MAX_LABELS])
+{
+ int i, idx, s;
+ int text_start, rw_column_width, spaces_to_end;
+ uint64_t flags = cb->cb_flags;
+ uint64_t f;
+ unsigned int column_width = force_column_width;
+
+ /* For each bit set in flags */
+ for (f = flags; f; f &= ~(1ULL << idx)) {
+ idx = lowbit64(f) - 1;
+ if (!force_column_width)
+ column_width = default_column_width(cb, idx);
+ /* Print our top labels centered over "read write" label. */
+ for (i = 0; i < label_array_len(labels[idx]); i++) {
+ const char *name = labels[idx][i].name;
+ /*
+ * We treat labels[][].columns == 0 as shorthand
+ * for one column. It makes writing out the label
+ * tables more concise.
+ */
+ unsigned int columns = MAX(1, labels[idx][i].columns);
+ unsigned int slen = strlen(name);
+
+ rw_column_width = (column_width * columns) +
+ (2 * (columns - 1));
+
+ text_start = (int)((rw_column_width) / columns -
+ slen / columns);
+ if (text_start < 0)
+ text_start = 0;
+
+ printf(" "); /* Two spaces between columns */
+
+ /* Space from beginning of column to label */
+ for (s = 0; s < text_start; s++)
+ printf(" ");
+
+ printf("%s", name);
+
+ /* Print space after label to end of column */
+ spaces_to_end = rw_column_width - text_start - slen;
+ if (spaces_to_end < 0)
+ spaces_to_end = 0;
+
+ for (s = 0; s < spaces_to_end; s++)
+ printf(" ");
+ }
+ }
+}
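+
+/*
+ * Worked example of the centering math above (illustrative; not part of the
+ * original code): with column_width = 5 and a two-column label such as
+ * "operations" (slen = 10, columns = 2), rw_column_width = (5 * 2) +
+ * (2 * (2 - 1)) = 12, text_start = 12/2 - 10/2 = 1 and spaces_to_end =
+ * 12 - 1 - 10 = 1. Together with the two leading spaces that makes 14
+ * characters, exactly matching the two 5-wide "read write" fields plus
+ * their separators printed underneath.
+ */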
+
+
+/*
+ * print_cmd_columns - Print custom column titles from -c
+ *
+ * If the user specified "zpool status|iostat -c", then print their custom
+ * column titles in the header. For example, print_cmd_columns() would print
+ * the " col1 col2" part of this:
+ *
+ * $ zpool iostat -vc 'echo col1=val1; echo col2=val2'
+ * ...
+ * capacity operations bandwidth
+ * pool alloc free read write read write col1 col2
+ * ---------- ----- ----- ----- ----- ----- ----- ---- ----
+ * mypool 269K 1008M 0 0 107 946
+ * mirror 269K 1008M 0 0 107 946
+ * sdb - - 0 0 102 473 val1 val2
+ * sdc - - 0 0 5 473 val1 val2
+ * ---------- ----- ----- ----- ----- ----- ----- ---- ----
+ */
+static void
+print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes)
+{
+ int i, j;
+ vdev_cmd_data_t *data = &vcdl->data[0];
+
+ if (vcdl->count == 0 || data == NULL)
+ return;
+
+ /*
+ * Each vdev cmd should have the same column names unless the user did
+ * something weird with their cmd. Just take the column names from the
+ * first vdev and assume it works for all of them.
+ */
+ for (i = 0; i < vcdl->uniq_cols_cnt; i++) {
+ printf(" ");
+ if (use_dashes) {
+ for (j = 0; j < vcdl->uniq_cols_width[i]; j++)
+ printf("-");
+ } else {
+ printf_color(ANSI_BOLD, "%*s", vcdl->uniq_cols_width[i],
+ vcdl->uniq_cols[i]);
+ }
+ }
+}
+
+
+/*
+ * Utility function to print out a line of dashes like:
+ *
+ * -------------------------------- ----- ----- ----- ----- -----
+ *
+ * ...or a dashed named-row line like:
+ *
+ * logs - - - - -
+ *
+ * @cb: iostat data
+ *
+ * @force_column_width If non-zero, use the value as the column width.
+ * Otherwise use the default column widths.
+ *
+ * @name: Print a dashed named-row line starting
+ * with @name. Otherwise, print a regular
+ * dashed line.
+ */
+static void
+print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width,
+ const char *name)
+{
+ int i;
+ unsigned int namewidth;
+ uint64_t flags = cb->cb_flags;
+ uint64_t f;
+ int idx;
+ const name_and_columns_t *labels;
+ const char *title;
+
+
+ if (cb->cb_flags & IOS_ANYHISTO_M) {
+ title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)];
+ } else if (cb->cb_vdev_names_count) {
+ title = "vdev";
+ } else {
+ title = "pool";
+ }
+
+ namewidth = MAX(MAX(strlen(title), cb->cb_namewidth),
+ name ? strlen(name) : 0);
+
+
+ if (name) {
+ printf("%-*s", namewidth, name);
+ } else {
+ for (i = 0; i < namewidth; i++)
+ (void) printf("-");
+ }
+
+ /* For each bit in flags */
+ for (f = flags; f; f &= ~(1ULL << idx)) {
+ unsigned int column_width;
+ idx = lowbit64(f) - 1;
+ if (force_column_width)
+ column_width = force_column_width;
+ else
+ column_width = default_column_width(cb, idx);
+
+ labels = iostat_bottom_labels[idx];
+ for (i = 0; i < label_array_len(labels); i++) {
+ if (name)
+ printf(" %*s-", column_width - 1, " ");
+ else
+ printf(" %.*s", column_width,
+ "--------------------");
+ }
+ }
+}
+
+
+static void
+print_iostat_separator_impl(iostat_cbdata_t *cb,
+ unsigned int force_column_width)
+{
+ print_iostat_dashes(cb, force_column_width, NULL);
+}
+
+static void
+print_iostat_separator(iostat_cbdata_t *cb)
+{
+ print_iostat_separator_impl(cb, 0);
+}
+
+static void
+print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width,
+ const char *histo_vdev_name)
+{
+ unsigned int namewidth;
+ const char *title;
+
+ if (cb->cb_flags & IOS_ANYHISTO_M) {
+ title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)];
+ } else if (cb->cb_vdev_names_count) {
+ title = "vdev";
+ } else {
+ title = "pool";
+ }
+
+ namewidth = MAX(MAX(strlen(title), cb->cb_namewidth),
+ histo_vdev_name ? strlen(histo_vdev_name) : 0);
+
+ if (histo_vdev_name)
+ printf("%-*s", namewidth, histo_vdev_name);
+ else
+ printf("%*s", namewidth, "");
+
+
+ print_iostat_labels(cb, force_column_width, iostat_top_labels);
+ printf("\n");
+
+ printf("%-*s", namewidth, title);
+
+ print_iostat_labels(cb, force_column_width, iostat_bottom_labels);
+ if (cb->vcdl != NULL)
+ print_cmd_columns(cb->vcdl, 0);
+
+ printf("\n");
+
+ print_iostat_separator_impl(cb, force_column_width);
+
+ if (cb->vcdl != NULL)
+ print_cmd_columns(cb->vcdl, 1);
+
+ printf("\n");
+}
+
+static void
+print_iostat_header(iostat_cbdata_t *cb)
+{
+ print_iostat_header_impl(cb, 0, NULL);
+}
+
+
+/*
+ * Display a single statistic.
+ */
+static void
+print_one_stat(uint64_t value, enum zfs_nicenum_format format,
+ unsigned int column_size, boolean_t scripted)
+{
+ char buf[64];
+
+ zfs_nicenum_format(value, buf, sizeof (buf), format);
+
+ if (scripted)
+ printf("\t%s", buf);
+ else
+ printf(" %*s", column_size, buf);
+}
+
+/*
+ * Calculate the default vdev stats
+ *
+ * Subtract oldvs from newvs and save the resulting deltas into calcvs; the
+ * caller applies any scaling factor when the stats are printed.
+ */
+static void
+calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs,
+ vdev_stat_t *calcvs)
+{
+ int i;
+
+ memcpy(calcvs, newvs, sizeof (*calcvs));
+ for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++)
+ calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]);
+
+ for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++)
+ calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]);
+}
+
+/*
+ * Internal representation of the extended iostats data.
+ *
+ * The extended iostat stats are exported in nvlists as either uint64_t arrays
+ * or single uint64_t's. We make both look like arrays to make them easier
+ * to process. In order to make single uint64_t's look like arrays, we set
+ * __data to the stat data, and then set *data = &__data with count = 1. Then,
+ * we can just use *data and count.
+ */
+struct stat_array {
+ uint64_t *data;
+ uint_t count; /* Number of entries in data[] */
+ uint64_t __data; /* Only used when data is a single uint64_t */
+};
+
+static uint64_t
+stat_histo_max(struct stat_array *nva, unsigned int len)
+{
+ uint64_t max = 0;
+ int i;
+ for (i = 0; i < len; i++)
+ max = MAX(max, array64_max(nva[i].data, nva[i].count));
+
+ return (max);
+}
+
+/*
+ * Helper function to lookup a uint64_t array or uint64_t value and store its
+ * data as a stat_array. If the nvpair is a single uint64_t value, then we make
+ * it look like a one element array to make it easier to process.
+ */
+static int
+nvpair64_to_stat_array(nvlist_t *nvl, const char *name,
+ struct stat_array *nva)
+{
+ nvpair_t *tmp;
+ int ret;
+
+ verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0);
+ switch (nvpair_type(tmp)) {
+ case DATA_TYPE_UINT64_ARRAY:
+ ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count);
+ break;
+ case DATA_TYPE_UINT64:
+ ret = nvpair_value_uint64(tmp, &nva->__data);
+ nva->data = &nva->__data;
+ nva->count = 1;
+ break;
+ default:
+ /* Not a uint64_t */
+ ret = EINVAL;
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * Given a list of nvlist names, look up the extended stats in newnv and oldnv,
+ * subtract them, and return the results in a newly allocated stat_array.
+ * You must free the returned array after you are done with it with
+ * free_calc_stats().
+ *
+ * Additionally, you can set "oldnv" to NULL if you simply want the newnv
+ * values.
+ */
+static struct stat_array *
+calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv,
+ nvlist_t *newnv)
+{
+ nvlist_t *oldnvx = NULL, *newnvx;
+ struct stat_array *oldnva, *newnva, *calcnva;
+ int i, j;
+ unsigned int alloc_size = (sizeof (struct stat_array)) * len;
+
+ /* Extract our extended stats nvlist from the main list */
+ verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX,
+ &newnvx) == 0);
+ if (oldnv) {
+ verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX,
+ &oldnvx) == 0);
+ }
+
+ newnva = safe_malloc(alloc_size);
+ oldnva = safe_malloc(alloc_size);
+ calcnva = safe_malloc(alloc_size);
+
+ for (j = 0; j < len; j++) {
+ verify(nvpair64_to_stat_array(newnvx, names[j],
+ &newnva[j]) == 0);
+ calcnva[j].count = newnva[j].count;
+ alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]);
+ calcnva[j].data = safe_malloc(alloc_size);
+ memcpy(calcnva[j].data, newnva[j].data, alloc_size);
+
+ if (oldnvx) {
+ verify(nvpair64_to_stat_array(oldnvx, names[j],
+ &oldnva[j]) == 0);
+ for (i = 0; i < oldnva[j].count; i++)
+ calcnva[j].data[i] -= oldnva[j].data[i];
+ }
+ }
+ free(newnva);
+ free(oldnva);
+ return (calcnva);
+}
+
+static void
+free_calc_stats(struct stat_array *nva, unsigned int len)
+{
+ int i;
+ for (i = 0; i < len; i++)
+ free(nva[i].data);
+
+ free(nva);
+}
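+
+/*
+ * Illustrative usage sketch for the two helpers above (not part of the
+ * original code): compute the delta of one extended stat between an old and
+ * a new vdev nvlist (both assumed to contain ZPOOL_CONFIG_VDEV_STATS_EX)
+ * and release it when done.
+ *
+ * const char *names[] = { ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO };
+ * struct stat_array *nva;
+ *
+ * nva = calc_and_alloc_stats_ex(names, 1, oldnv, newnv);
+ * (use nva[0].data and nva[0].count here)
+ * free_calc_stats(nva, 1);
+ */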
+
+static void
+print_iostat_histo(struct stat_array *nva, unsigned int len,
+ iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth,
+ double scale)
+{
+ int i, j;
+ char buf[6];
+ uint64_t val;
+ enum zfs_nicenum_format format;
+ unsigned int buckets;
+ unsigned int start_bucket;
+
+ if (cb->cb_literal)
+ format = ZFS_NICENUM_RAW;
+ else
+ format = ZFS_NICENUM_1024;
+
+ /* All these histos are the same size, so just use nva[0].count */
+ buckets = nva[0].count;
+
+ if (cb->cb_flags & IOS_RQ_HISTO_M) {
+ /* Start at 512 - req size should never be lower than this */
+ start_bucket = 9;
+ } else {
+ start_bucket = 0;
+ }
+
+ for (j = start_bucket; j < buckets; j++) {
+ /* Print histogram bucket label */
+ if (cb->cb_flags & IOS_L_HISTO_M) {
+ /* Ending range of this bucket */
+ val = (1UL << (j + 1)) - 1;
+ zfs_nicetime(val, buf, sizeof (buf));
+ } else {
+ /* Request size (starting range of bucket) */
+ val = (1UL << j);
+ zfs_nicenum(val, buf, sizeof (buf));
+ }
+
+ if (cb->cb_scripted)
+ printf("%llu", (u_longlong_t)val);
+ else
+ printf("%-*s", namewidth, buf);
+
+ /* Print the values on the line */
+ for (i = 0; i < len; i++) {
+ print_one_stat(nva[i].data[j] * scale, format,
+ column_width, cb->cb_scripted);
+ }
+ printf("\n");
+ }
+}
+
+static void
+print_solid_separator(unsigned int length)
+{
+ while (length--)
+ printf("-");
+ printf("\n");
+}
+
+static void
+print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv,
+ nvlist_t *newnv, double scale, const char *name)
+{
+ unsigned int column_width;
+ unsigned int namewidth;
+ unsigned int entire_width;
+ enum iostat_type type;
+ struct stat_array *nva;
+ const char **names;
+ unsigned int names_len;
+
+ /* What type of histo are we? */
+ type = IOS_HISTO_IDX(cb->cb_flags);
+
+ /* Get NULL-terminated array of nvlist names for our histo */
+ names = vsx_type_to_nvlist[type];
+ names_len = str_array_len(names); /* num of names */
+
+ nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv);
+
+ if (cb->cb_literal) {
+ column_width = MAX(5,
+ (unsigned int) log10(stat_histo_max(nva, names_len)) + 1);
+ } else {
+ column_width = 5;
+ }
+
+ namewidth = MAX(cb->cb_namewidth,
+ strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]));
+
+ /*
+ * Calculate the entire line width of what we're printing. The
+ * +2 is for the two spaces between columns:
+ */
+ /* read write */
+ /* ----- ----- */
+ /* |___| <---------- column_width */
+ /* */
+ /* |__________| <--- entire_width */
+ /* */
+ entire_width = namewidth + (column_width + 2) *
+ label_array_len(iostat_bottom_labels[type]);
+
+ if (cb->cb_scripted)
+ printf("%s\n", name);
+ else
+ print_iostat_header_impl(cb, column_width, name);
+
+ print_iostat_histo(nva, names_len, cb, column_width,
+ namewidth, scale);
+
+ free_calc_stats(nva, names_len);
+ if (!cb->cb_scripted)
+ print_solid_separator(entire_width);
+}
+
+/*
+ * Calculate the average latency of a power-of-two latency histogram
+ */
+static uint64_t
+single_histo_average(uint64_t *histo, unsigned int buckets)
+{
+ int i;
+ uint64_t count = 0, total = 0;
+
+ for (i = 0; i < buckets; i++) {
+ /*
+ * Our buckets are power-of-two latency ranges. Use the
+ * midpoint latency of each bucket to calculate the average.
+ * For example:
+ *
+ * Bucket Midpoint
+ * 8ns-15ns: 12ns
+ * 16ns-31ns: 24ns
+ * ...
+ */
+ if (histo[i] != 0) {
+ total += histo[i] * (((1UL << i) + ((1UL << i)/2)));
+ count += histo[i];
+ }
+ }
+
+ /* Prevent divide by zero */
+ return (count == 0 ? 0 : total / count);
+}
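+
+/*
+ * Worked example for single_histo_average() (illustrative; not part of the
+ * original code): histo[3] = 2 is two I/Os in the 8ns-15ns bucket, whose
+ * midpoint is (1 << 3) + (1 << 3)/2 = 12ns, and histo[4] = 1 is one I/O in
+ * the 16ns-31ns bucket (midpoint 24ns). The average latency is therefore
+ * (2 * 12 + 1 * 24) / 3 = 16ns.
+ */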
+
+static void
+print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv,
+ nvlist_t *newnv)
+{
+ int i;
+ uint64_t val;
+ const char *names[] = {
+ ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+ ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE,
+ ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
+ };
+
+ struct stat_array *nva;
+
+ unsigned int column_width = default_column_width(cb, IOS_QUEUES);
+ enum zfs_nicenum_format format;
+
+ nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv);
+
+ if (cb->cb_literal)
+ format = ZFS_NICENUM_RAW;
+ else
+ format = ZFS_NICENUM_1024;
+
+ for (i = 0; i < ARRAY_SIZE(names); i++) {
+ val = nva[i].data[0];
+ print_one_stat(val, format, column_width, cb->cb_scripted);
+ }
+
+ free_calc_stats(nva, ARRAY_SIZE(names));
+}
+
+static void
+print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv,
+ nvlist_t *newnv)
+{
+ int i;
+ uint64_t val;
+ const char *names[] = {
+ ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+ ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+ };
+ struct stat_array *nva;
+
+ unsigned int column_width = default_column_width(cb, IOS_LATENCY);
+ enum zfs_nicenum_format format;
+
+ nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv);
+
+ if (cb->cb_literal)
+ format = ZFS_NICENUM_RAWTIME;
+ else
+ format = ZFS_NICENUM_TIME;
+
+ /* Print our avg latencies on the line */
+ for (i = 0; i < ARRAY_SIZE(names); i++) {
+ /* Compute average latency for a latency histo */
+ val = single_histo_average(nva[i].data, nva[i].count);
+ print_one_stat(val, format, column_width, cb->cb_scripted);
+ }
+ free_calc_stats(nva, ARRAY_SIZE(names));
+}
+
+/*
+ * Print default statistics (capacity/operations/bandwidth)
+ */
+static void
+print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale)
+{
+ unsigned int column_width = default_column_width(cb, IOS_DEFAULT);
+ enum zfs_nicenum_format format;
+ char na; /* char to print for "not applicable" values */
+
+ if (cb->cb_literal) {
+ format = ZFS_NICENUM_RAW;
+ na = '0';
+ } else {
+ format = ZFS_NICENUM_1024;
+ na = '-';
+ }
+
+ /* only toplevel vdevs have capacity stats */
+ if (vs->vs_space == 0) {
+ if (cb->cb_scripted)
+ printf("\t%c\t%c", na, na);
+ else
+ printf(" %*c %*c", column_width, na, column_width,
+ na);
+ } else {
+ print_one_stat(vs->vs_alloc, format, column_width,
+ cb->cb_scripted);
+ print_one_stat(vs->vs_space - vs->vs_alloc, format,
+ column_width, cb->cb_scripted);
+ }
+
+ print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale),
+ format, column_width, cb->cb_scripted);
+ print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale),
+ format, column_width, cb->cb_scripted);
+ print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale),
+ format, column_width, cb->cb_scripted);
+ print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale),
+ format, column_width, cb->cb_scripted);
+}
+
+static const char *class_name[] = {
+ VDEV_ALLOC_BIAS_DEDUP,
+ VDEV_ALLOC_BIAS_SPECIAL,
+ VDEV_ALLOC_CLASS_LOGS
+};
+
+/*
+ * Print out all the statistics for the given vdev. This can either be the
+ * toplevel configuration, or called recursively. If 'name' is NULL, then this
+ * is a verbose output, and we don't want to display the toplevel pool stats.
+ *
+ * Returns the number of stat lines printed.
+ */
+static unsigned int
+print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
+ nvlist_t *newnv, iostat_cbdata_t *cb, int depth)
+{
+ nvlist_t **oldchild, **newchild;
+ uint_t c, children, oldchildren;
+ vdev_stat_t *oldvs, *newvs, *calcvs;
+ vdev_stat_t zerovs = { 0 };
+ char *vname;
+ int i;
+ int ret = 0;
+ uint64_t tdelta;
+ double scale;
+
+ if (strcmp(name, VDEV_TYPE_INDIRECT) == 0)
+ return (ret);
+
+ calcvs = safe_malloc(sizeof (*calcvs));
+
+ if (oldnv != NULL) {
+ verify(nvlist_lookup_uint64_array(oldnv,
+ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0);
+ } else {
+ oldvs = &zerovs;
+ }
+
+ /* Do we only want to see a specific vdev? */
+ for (i = 0; i < cb->cb_vdev_names_count; i++) {
+ /* Yes we do. Is this the vdev? */
+ if (strcmp(name, cb->cb_vdev_names[i]) == 0) {
+ /*
+ * This is our vdev. Since it is the only vdev we
+ * will be displaying, make depth = 0 so that it
+ * doesn't get indented.
+ */
+ depth = 0;
+ break;
+ }
+ }
+
+ if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) {
+ /* Couldn't match the name */
+ goto children;
+ }
+
+
+ verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&newvs, &c) == 0);
+
+ /*
+ * Print the vdev name unless it's a histogram. Histograms
+ * display the vdev name in the header itself.
+ */
+ if (!(cb->cb_flags & IOS_ANYHISTO_M)) {
+ if (cb->cb_scripted) {
+ printf("%s", name);
+ } else {
+ if (strlen(name) + depth > cb->cb_namewidth)
+ (void) printf("%*s%s", depth, "", name);
+ else
+ (void) printf("%*s%s%*s", depth, "", name,
+ (int)(cb->cb_namewidth - strlen(name) -
+ depth), "");
+ }
+ }
+
+ /* Calculate our scaling factor */
+ tdelta = newvs->vs_timestamp - oldvs->vs_timestamp;
+ if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) {
+ /*
+ * If we specify printing histograms with no time interval, then
+ * print the histogram numbers over the entire lifetime of the
+ * vdev.
+ */
+ scale = 1;
+ } else {
+ if (tdelta == 0)
+ scale = 1.0;
+ else
+ scale = (double)NANOSEC / tdelta;
+ }
+
+ if (cb->cb_flags & IOS_DEFAULT_M) {
+ calc_default_iostats(oldvs, newvs, calcvs);
+ print_iostat_default(calcvs, cb, scale);
+ }
+ if (cb->cb_flags & IOS_LATENCY_M)
+ print_iostat_latency(cb, oldnv, newnv);
+ if (cb->cb_flags & IOS_QUEUES_M)
+ print_iostat_queues(cb, oldnv, newnv);
+ if (cb->cb_flags & IOS_ANYHISTO_M) {
+ printf("\n");
+ print_iostat_histos(cb, oldnv, newnv, scale, name);
+ }
+
+ if (cb->vcdl != NULL) {
+ char *path;
+ if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH,
+ &path) == 0) {
+ printf(" ");
+ zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path);
+ }
+ }
+
+ if (!(cb->cb_flags & IOS_ANYHISTO_M))
+ printf("\n");
+
+ ret++;
+
+children:
+
+ free(calcvs);
+
+ if (!cb->cb_verbose)
+ return (ret);
+
+ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN,
+ &newchild, &children) != 0)
+ return (ret);
+
+ if (oldnv) {
+ if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN,
+ &oldchild, &oldchildren) != 0)
+ return (ret);
+
+ children = MIN(oldchildren, children);
+ }
+
+ /*
+ * print normal top-level devices
+ */
+ for (c = 0; c < children; c++) {
+ uint64_t ishole = B_FALSE, islog = B_FALSE;
+
+ (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE,
+ &ishole);
+
+ (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG,
+ &islog);
+
+ if (ishole || islog)
+ continue;
+
+ if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
+ continue;
+
+ vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
+ cb->cb_name_flags);
+ ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
+ newchild[c], cb, depth + 2);
+ free(vname);
+ }
+
+ /*
+ * print all other top-level devices
+ */
+ for (uint_t n = 0; n < 3; n++) {
+ boolean_t printed = B_FALSE;
+
+ for (c = 0; c < children; c++) {
+ uint64_t islog = B_FALSE;
+ char *bias = NULL;
+ char *type = NULL;
+
+ (void) nvlist_lookup_uint64(newchild[c],
+ ZPOOL_CONFIG_IS_LOG, &islog);
+ if (islog) {
+ bias = VDEV_ALLOC_CLASS_LOGS;
+ } else {
+ (void) nvlist_lookup_string(newchild[c],
+ ZPOOL_CONFIG_ALLOCATION_BIAS, &bias);
+ (void) nvlist_lookup_string(newchild[c],
+ ZPOOL_CONFIG_TYPE, &type);
+ }
+ if (bias == NULL || strcmp(bias, class_name[n]) != 0)
+ continue;
+ if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0)
+ continue;
+
+ if (!printed) {
+ if ((!(cb->cb_flags & IOS_ANYHISTO_M)) &&
+ !cb->cb_scripted && !cb->cb_vdev_names) {
+ print_iostat_dashes(cb, 0,
+ class_name[n]);
+ }
+ printf("\n");
+ printed = B_TRUE;
+ }
+
+ vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
+ cb->cb_name_flags);
+ ret += print_vdev_stats(zhp, vname, oldnv ?
+ oldchild[c] : NULL, newchild[c], cb, depth + 2);
+ free(vname);
+ }
+ }
+
+ /*
+ * Include level 2 ARC devices in iostat output
+ */
+ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE,
+ &newchild, &children) != 0)
+ return (ret);
+
+ if (oldnv) {
+ if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE,
+ &oldchild, &oldchildren) != 0)
+ return (ret);
+
+ children = MIN(oldchildren, children);
+ }
+
+ if (children > 0) {
+ if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted &&
+ !cb->cb_vdev_names) {
+ print_iostat_dashes(cb, 0, "cache");
+ }
+ printf("\n");
+
+ for (c = 0; c < children; c++) {
+ vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
+ cb->cb_name_flags);
+ ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c]
+ : NULL, newchild[c], cb, depth + 2);
+ free(vname);
+ }
+ }
+
+ return (ret);
+}
+
+static int
+refresh_iostat(zpool_handle_t *zhp, void *data)
+{
+ iostat_cbdata_t *cb = data;
+ boolean_t missing;
+
+ /*
+ * If the pool has disappeared, remove it from the list and continue.
+ */
+ if (zpool_refresh_stats(zhp, &missing) != 0)
+ return (-1);
+
+ if (missing)
+ pool_list_remove(cb->cb_list, zhp);
+
+ return (0);
+}
+
+/*
+ * Callback to print out the iostats for the given pool.
+ */
+static int
+print_iostat(zpool_handle_t *zhp, void *data)
+{
+ iostat_cbdata_t *cb = data;
+ nvlist_t *oldconfig, *newconfig;
+ nvlist_t *oldnvroot, *newnvroot;
+ int ret;
+
+ newconfig = zpool_get_config(zhp, &oldconfig);
+
+ if (cb->cb_iteration == 1)
+ oldconfig = NULL;
+
+ verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE,
+ &newnvroot) == 0);
+
+ if (oldconfig == NULL)
+ oldnvroot = NULL;
+ else
+ verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE,
+ &oldnvroot) == 0);
+
+ ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot,
+ cb, 0);
+ if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) &&
+ !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdev_names_count) {
+ print_iostat_separator(cb);
+ if (cb->vcdl != NULL) {
+ print_cmd_columns(cb->vcdl, 1);
+ }
+ printf("\n");
+ }
+
+ return (ret);
+}
+
+static int
+get_columns(void)
+{
+ struct winsize ws;
+ int columns = 80;
+ int error;
+
+ if (isatty(STDOUT_FILENO)) {
+ error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws);
+ if (error == 0)
+ columns = ws.ws_col;
+ } else {
+ columns = 999;
+ }
+
+ return (columns);
+}
+
+/*
+ * Return the required length of the pool/vdev name column. The minimum
+ * allowed width and output formatting flags must be provided.
+ */
+static int
+get_namewidth(zpool_handle_t *zhp, int min_width, int flags, boolean_t verbose)
+{
+ nvlist_t *config, *nvroot;
+ int width = min_width;
+
+ if ((config = zpool_get_config(zhp, NULL)) != NULL) {
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ unsigned int poolname_len = strlen(zpool_get_name(zhp));
+ if (verbose == B_FALSE) {
+ width = MAX(poolname_len, min_width);
+ } else {
+ width = MAX(poolname_len,
+ max_width(zhp, nvroot, 0, min_width, flags));
+ }
+ }
+
+ return (width);
+}
+
+/*
+ * Parse the trailing arguments and extract the 'interval' and 'count' values, if present.
+ */
+static void
+get_interval_count(int *argcp, char **argv, float *iv,
+ unsigned long *cnt)
+{
+ float interval = 0;
+ unsigned long count = 0;
+ int argc = *argcp;
+
+ /*
+ * Determine if the last argument is an integer or a pool name
+ */
+ if (argc > 0 && zfs_isnumber(argv[argc - 1])) {
+ char *end;
+
+ errno = 0;
+ interval = strtof(argv[argc - 1], &end);
+
+ if (*end == '\0' && errno == 0) {
+ if (interval == 0) {
+ (void) fprintf(stderr, gettext("interval "
+ "cannot be zero\n"));
+ usage(B_FALSE);
+ }
+ /*
+ * Ignore the last parameter
+ */
+ argc--;
+ } else {
+ /*
+ * If this is not a valid number, just plow on. The
+ * user will get a more informative error message later
+ * on.
+ */
+ interval = 0;
+ }
+ }
+
+ /*
+ * If the last argument is also an integer, then we have both a count
+ * and an interval.
+ */
+ if (argc > 0 && zfs_isnumber(argv[argc - 1])) {
+ char *end;
+
+ errno = 0;
+ count = interval;
+ interval = strtof(argv[argc - 1], &end);
+
+ if (*end == '\0' && errno == 0) {
+ if (interval == 0) {
+ (void) fprintf(stderr, gettext("interval "
+ "cannot be zero\n"));
+ usage(B_FALSE);
+ }
+
+ /*
+ * Ignore the last parameter
+ */
+ argc--;
+ } else {
+ interval = 0;
+ }
+ }
+
+ *iv = interval;
+ *cnt = count;
+ *argcp = argc;
+}
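+
+/*
+ * Examples of how the trailing arguments are interpreted (illustrative; not
+ * part of the original code; "tank" is a placeholder pool name):
+ *
+ * zpool iostat tank -> interval = 0, count = 0 (one report)
+ * zpool iostat tank 5 -> interval = 5, count = 0 (repeat forever)
+ * zpool iostat tank 5 10 -> interval = 5, count = 10
+ */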
+
+static void
+get_timestamp_arg(char c)
+{
+ if (c == 'u')
+ timestamp_fmt = UDATE;
+ else if (c == 'd')
+ timestamp_fmt = DDATE;
+ else
+ usage(B_FALSE);
+}
+
+/*
+ * Return stat flags that are supported by all pools by both the module and
+ * zpool iostat. "*data" should be initialized to all 0xFFs before running.
+ * It will get ANDed down until only the flags that are supported on all pools
+ * remain.
+ */
+static int
+get_stat_flags_cb(zpool_handle_t *zhp, void *data)
+{
+ uint64_t *mask = data;
+ nvlist_t *config, *nvroot, *nvx;
+ uint64_t flags = 0;
+ int i, j;
+
+ config = zpool_get_config(zhp, NULL);
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+
+ /* Default stats are always supported, but for completeness.. */
+ if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS))
+ flags |= IOS_DEFAULT_M;
+
+ /* Get our extended stats nvlist from the main list */
+ if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX,
+ &nvx) != 0) {
+ /*
+ * No extended stats; they're probably running an older
+ * module. No big deal, we support that too.
+ */
+ goto end;
+ }
+
+ /* For each extended stat, make sure all its nvpairs are supported */
+ for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) {
+ if (!vsx_type_to_nvlist[j][0])
+ continue;
+
+ /* Start off by assuming the flag is supported, then check */
+ flags |= (1ULL << j);
+ for (i = 0; vsx_type_to_nvlist[j][i]; i++) {
+ if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) {
+ /* flag isn't supported */
+ flags = flags & ~(1ULL << j);
+ break;
+ }
+ }
+ }
+end:
+ *mask = *mask & flags;
+ return (0);
+}
+
+/*
+ * Return a bitmask of stats that are supported on all pools by both the module
+ * and zpool iostat.
+ */
+static uint64_t
+get_stat_flags(zpool_list_t *list)
+{
+ uint64_t mask = -1;
+
+ /*
+ * get_stat_flags_cb() will lop off bits from "mask" until only the
+ * flags that are supported on all pools remain.
+ */
+ pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask);
+ return (mask);
+}
+
+/*
+ * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise.
+ */
+static int
+is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data)
+{
+ iostat_cbdata_t *cb = cb_data;
+ char *name = NULL;
+ int ret = 0;
+
+ name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags);
+
+ if (strcmp(name, cb->cb_vdev_names[0]) == 0)
+ ret = 1; /* match */
+ free(name);
+
+ return (ret);
+}
+
+/*
+ * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise.
+ */
+static int
+is_vdev(zpool_handle_t *zhp, void *cb_data)
+{
+ return (for_each_vdev(zhp, is_vdev_cb, cb_data));
+}
+
+/*
+ * Check if vdevs are in a pool
+ *
+ * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise
+ * return 0. If pool_name is NULL, then search all pools.
+ */
+static int
+are_vdevs_in_pool(int argc, char **argv, char *pool_name,
+ iostat_cbdata_t *cb)
+{
+ char **tmp_name;
+ int ret = 0;
+ int i;
+ int pool_count = 0;
+
+ if ((argc == 0) || !*argv)
+ return (0);
+
+ if (pool_name)
+ pool_count = 1;
+
+ /* Temporarily hijack cb_vdev_names for a second... */
+ tmp_name = cb->cb_vdev_names;
+
+ /* Go through our list of prospective vdev names */
+ for (i = 0; i < argc; i++) {
+ cb->cb_vdev_names = argv + i;
+
+ /* Is this name a vdev in our pools? */
+ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL,
+ is_vdev, cb);
+ if (!ret) {
+ /* No match */
+ break;
+ }
+ }
+
+ cb->cb_vdev_names = tmp_name;
+
+ return (ret);
+}
+
+static int
+is_pool_cb(zpool_handle_t *zhp, void *data)
+{
+ char *name = data;
+ if (strcmp(name, zpool_get_name(zhp)) == 0)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Do we have a pool named *name? If so, return 1, otherwise 0.
+ */
+static int
+is_pool(char *name)
+{
+ return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name));
+}
+
+/* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */
+static int
+are_all_pools(int argc, char **argv)
+{
+ if ((argc == 0) || !*argv)
+ return (0);
+
+ while (--argc >= 0)
+ if (!is_pool(argv[argc]))
+ return (0);
+
+ return (1);
+}
+
+/*
+ * Helper function to print out vdev/pool names we can't resolve. Used for an
+ * error message.
+ */
+static void
+error_list_unresolved_vdevs(int argc, char **argv, char *pool_name,
+ iostat_cbdata_t *cb)
+{
+ int i;
+ char *name;
+ char *str;
+ for (i = 0; i < argc; i++) {
+ name = argv[i];
+
+ if (is_pool(name))
+ str = gettext("pool");
+ else if (are_vdevs_in_pool(1, &name, pool_name, cb))
+ str = gettext("vdev in this pool");
+ else if (are_vdevs_in_pool(1, &name, NULL, cb))
+ str = gettext("vdev in another pool");
+ else
+ str = gettext("unknown");
+
+ fprintf(stderr, "\t%s (%s)\n", name, str);
+ }
+}
+
+/*
+ * Same as get_interval_count(), but with additional checks to not misinterpret
+ * guids as interval/count values. Assumes VDEV_NAME_GUID is set in
+ * cb.cb_name_flags.
+ */
+static void
+get_interval_count_filter_guids(int *argc, char **argv, float *interval,
+ unsigned long *count, iostat_cbdata_t *cb)
+{
+ char **tmpargv = argv;
+ int argc_for_interval = 0;
+
+ /* Is the last arg an interval value? Or a guid? */
+ if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) {
+ /*
+ * The last arg is not a guid, so it's probably an
+ * interval value.
+ */
+ argc_for_interval++;
+
+ if (*argc >= 2 &&
+ !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) {
+ /*
+ * The 2nd to last arg is not a guid, so it's probably
+ * an interval value.
+ */
+ argc_for_interval++;
+ }
+ }
+
+ /* Point to our list of possible intervals */
+ tmpargv = &argv[*argc - argc_for_interval];
+
+ *argc = *argc - argc_for_interval;
+ get_interval_count(&argc_for_interval, tmpargv,
+ interval, count);
+}
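+
+/*
+ * Illustrative note (not part of the original code): with -g a trailing
+ * all-numeric argument could be a vdev GUID rather than an interval, e.g.
+ * "zpool iostat -g 7091664599966425000 2". Only trailing arguments that do
+ * NOT resolve to a vdev via are_vdevs_in_pool() are passed on to
+ * get_interval_count() as interval/count candidates.
+ */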
+
+/*
+ * Floating point sleep(). Allows you to pass in a floating point value for
+ * seconds.
+ */
+static void
+fsleep(float sec)
+{
+ struct timespec req;
+ req.tv_sec = floor(sec);
+ req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC;
+ nanosleep(&req, NULL);
+}
+
+/*
+ * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or
+ * if we were unable to determine its size.
+ */
+static int
+terminal_height(void)
+{
+ struct winsize win;
+
+ if (isatty(STDOUT_FILENO) == 0)
+ return (-1);
+
+ if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &win) != -1 && win.ws_row > 0)
+ return (win.ws_row);
+
+ return (-1);
+}
+
+/*
+ * Run one of the zpool status/iostat -c scripts with the help (-h) option and
+ * print the result.
+ *
+ * name: Short name of the script ('iostat').
+ * path: Full path to the script ('/usr/local/etc/zfs/zpool.d/iostat').
+ */
+static void
+print_zpool_script_help(char *name, char *path)
+{
+ char *argv[] = {path, "-h", NULL};
+ char **lines = NULL;
+ int lines_cnt = 0;
+ int rc;
+
+ rc = libzfs_run_process_get_stdout_nopath(path, argv, NULL, &lines,
+ &lines_cnt);
+ if (rc != 0 || lines == NULL || lines_cnt <= 0) {
+ if (lines != NULL)
+ libzfs_free_str_array(lines, lines_cnt);
+ return;
+ }
+
+ for (int i = 0; i < lines_cnt; i++)
+ if (!is_blank_str(lines[i]))
+ printf(" %-14s %s\n", name, lines[i]);
+
+ libzfs_free_str_array(lines, lines_cnt);
+}
+
+/*
+ * Go through the zpool status/iostat -c scripts in the user's path, run their
+ * help option (-h), and print out the results.
+ */
+static void
+print_zpool_dir_scripts(char *dirpath)
+{
+ DIR *dir;
+ struct dirent *ent;
+ char fullpath[MAXPATHLEN];
+ struct stat dir_stat;
+
+ if ((dir = opendir(dirpath)) != NULL) {
+ /* iterate over the directory entries and print help for each script */
+ while ((ent = readdir(dir)) != NULL) {
+ /* use snprintf() so a long dirpath/d_name cannot overflow fullpath */
+ (void) snprintf(fullpath, sizeof (fullpath), "%s/%s", dirpath, ent->d_name);
+
+ /* Print the scripts */
+ if (stat(fullpath, &dir_stat) == 0)
+ if (dir_stat.st_mode & S_IXUSR &&
+ S_ISREG(dir_stat.st_mode))
+ print_zpool_script_help(ent->d_name,
+ fullpath);
+ }
+ closedir(dir);
+ }
+}
+
+/*
+ * Print out help text for all zpool status/iostat -c scripts.
+ */
+static void
+print_zpool_script_list(char *subcommand)
+{
+ char *dir, *sp;
+
+ printf(gettext("Available 'zpool %s -c' commands:\n"), subcommand);
+
+ sp = zpool_get_cmd_search_path();
+ if (sp == NULL)
+ return;
+
+ dir = strtok(sp, ":");
+ while (dir != NULL) {
+ print_zpool_dir_scripts(dir);
+ dir = strtok(NULL, ":");
+ }
+
+ free(sp);
+}
+
+/*
+ * Set the minimum pool/vdev name column width. The width must be at least 10,
+ * but at most the terminal width - 42, so the name and stats still fit on one line.
+ * NOTE: 42 is the width of the default capacity/operations/bandwidth output
+ */
+static int
+get_namewidth_iostat(zpool_handle_t *zhp, void *data)
+{
+ iostat_cbdata_t *cb = data;
+ int width, available_width;
+
+ /*
+ * get_namewidth() returns the maximum width of any name in that column
+ * for any pool/vdev/device line that will be output.
+ */
+ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags,
+ cb->cb_verbose);
+
+ /*
+ * The width we are calculating is the width of the header and also the
+ * padding width for names that are less than maximum width. The stats
+ * take up 42 characters, so the width available for names is:
+ */
+ available_width = get_columns() - 42;
+
+ /*
+ * If the maximum width fits on a screen, then great! Make everything
+ * line up by justifying all lines to the same width. If that max
+ * width is larger than what's available, the name plus stats won't fit
+ * on one line, and justifying to that width would cause every line to
+ * wrap on the screen. We only want lines with long names to wrap.
+ * Limit the padding to what won't wrap.
+ */
+ if (width > available_width)
+ width = available_width;
+
+ /*
+ * Regardless of the screen width (get_columns() may report 0 when the
+ * width is unknown, or a value less than 42 on a narrow terminal),
+ * enforce a minimum name-column width of 10.
+ */
+ if (width < 10)
+ width = 10;
+
+ /* Save the calculated width */
+ cb->cb_namewidth = width;
+
+ return (0);
+}
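+
+/*
+ * Worked example (illustrative; not part of the original code): on an
+ * 80-column terminal available_width is 80 - 42 = 38. A pool whose longest
+ * vdev name is 20 characters gets cb_namewidth = 20, while one whose longest
+ * name is 50 characters is clamped to 38, so only those over-long lines wrap.
+ */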
+
+/*
+ * zpool iostat [[-c [script1,script2,...]] [-lq]|[-rw]] [-ghHLpPvy] [-n]
+ * [-T d|u] [[ pool ...]|[pool vdev ...]|[vdev ...]]
+ * [interval [count]]
+ *
+ * -c CMD For each vdev, run command CMD
+ * -g Display guid for individual vdev name.
+ * -L Follow links when resolving vdev path name.
+ * -P Display full path for vdev name.
+ * -v Display statistics for individual vdevs
+ * -h Display help
+ * -p Display values in parsable (exact) format.
+ * -H Scripted mode. Don't display headers, and separate properties
+ * by a single tab.
+ * -l Display average latency
+ * -q Display queue depths
+ * -w Display latency histograms
+ * -r Display request size histogram
+ * -T Display a timestamp in date(1) or Unix format
+ * -n Only print headers once
+ *
+ * This command can be tricky because we want to be able to deal with pool
+ * creation/destruction as well as vdev configuration changes. The bulk of this
+ * processing is handled by the pool_list_* routines in zpool_iter.c. We rely
+ * on pool_list_update() to detect the addition of new pools. Configuration
+ * changes are all handled within libzfs.
+ */
+int
+zpool_do_iostat(int argc, char **argv)
+{
+ int c;
+ int ret;
+ int npools;
+ float interval = 0;
+ unsigned long count = 0;
+ int winheight = 24;
+ zpool_list_t *list;
+ boolean_t verbose = B_FALSE;
+ boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE;
+ boolean_t queues = B_FALSE, parsable = B_FALSE, scripted = B_FALSE;
+ boolean_t omit_since_boot = B_FALSE;
+ boolean_t guid = B_FALSE;
+ boolean_t follow_links = B_FALSE;
+ boolean_t full_name = B_FALSE;
+ boolean_t headers_once = B_FALSE;
+ iostat_cbdata_t cb = { 0 };
+ char *cmd = NULL;
+
+ /* Used for printing error message */
+ const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q',
+ [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'};
+
+ uint64_t unsupported_flags;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwnH")) != -1) {
+ switch (c) {
+ case 'c':
+ if (cmd != NULL) {
+ fprintf(stderr,
+ gettext("Can't set -c flag twice\n"));
+ exit(1);
+ }
+
+ if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL &&
+ !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) {
+ fprintf(stderr, gettext(
+ "Can't run -c, disabled by "
+ "ZPOOL_SCRIPTS_ENABLED.\n"));
+ exit(1);
+ }
+
+ if ((getuid() <= 0 || geteuid() <= 0) &&
+ !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) {
+ fprintf(stderr, gettext(
+ "Can't run -c with root privileges "
+ "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n"));
+ exit(1);
+ }
+ cmd = optarg;
+ verbose = B_TRUE;
+ break;
+ case 'g':
+ guid = B_TRUE;
+ break;
+ case 'L':
+ follow_links = B_TRUE;
+ break;
+ case 'P':
+ full_name = B_TRUE;
+ break;
+ case 'T':
+ get_timestamp_arg(*optarg);
+ break;
+ case 'v':
+ verbose = B_TRUE;
+ break;
+ case 'p':
+ parsable = B_TRUE;
+ break;
+ case 'l':
+ latency = B_TRUE;
+ break;
+ case 'q':
+ queues = B_TRUE;
+ break;
+ case 'H':
+ scripted = B_TRUE;
+ break;
+ case 'w':
+ l_histo = B_TRUE;
+ break;
+ case 'r':
+ rq_histo = B_TRUE;
+ break;
+ case 'y':
+ omit_since_boot = B_TRUE;
+ break;
+ case 'n':
+ headers_once = B_TRUE;
+ break;
+ case 'h':
+ usage(B_FALSE);
+ break;
+ case '?':
+ if (optopt == 'c') {
+ print_zpool_script_list("iostat");
+ exit(0);
+ } else {
+ fprintf(stderr,
+ gettext("invalid option '%c'\n"), optopt);
+ }
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ cb.cb_literal = parsable;
+ cb.cb_scripted = scripted;
+
+ if (guid)
+ cb.cb_name_flags |= VDEV_NAME_GUID;
+ if (follow_links)
+ cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
+ if (full_name)
+ cb.cb_name_flags |= VDEV_NAME_PATH;
+ cb.cb_iteration = 0;
+ cb.cb_namewidth = 0;
+ cb.cb_verbose = verbose;
+
+ /* Get our interval and count values (if any) */
+ if (guid) {
+ get_interval_count_filter_guids(&argc, argv, &interval,
+ &count, &cb);
+ } else {
+ get_interval_count(&argc, argv, &interval, &count);
+ }
+
+ if (argc == 0) {
+ /* No args, so just print the defaults. */
+ } else if (are_all_pools(argc, argv)) {
+ /* All the args are pool names */
+ } else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) {
+ /* All the args are vdevs */
+ cb.cb_vdev_names = argv;
+ cb.cb_vdev_names_count = argc;
+ argc = 0; /* No pools to process */
+ } else if (are_all_pools(1, argv)) {
+ /* The first arg is a pool name */
+ if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) {
+ /* ...and the rest are vdev names */
+ cb.cb_vdev_names = argv + 1;
+ cb.cb_vdev_names_count = argc - 1;
+ argc = 1; /* One pool to process */
+ } else {
+ fprintf(stderr, gettext("Expected either a list of "));
+ fprintf(stderr, gettext("pools, or a list of vdevs in"));
+ fprintf(stderr, " \"%s\", ", argv[0]);
+ fprintf(stderr, gettext("but got:\n"));
+ error_list_unresolved_vdevs(argc - 1, argv + 1,
+ argv[0], &cb);
+ fprintf(stderr, "\n");
+ usage(B_FALSE);
+ return (1);
+ }
+ } else {
+ /*
+ * The args don't make sense. The first arg isn't a pool name,
+ * nor are all the args vdevs.
+ */
+ fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n"));
+ fprintf(stderr, "\n");
+ return (1);
+ }
+
+ if (cb.cb_vdev_names_count != 0) {
+ /*
+ * If user specified vdevs, it implies verbose.
+ */
+ cb.cb_verbose = B_TRUE;
+ }
+
+ /*
+ * Construct the list of all interesting pools.
+ */
+ ret = 0;
+ if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL)
+ return (1);
+
+ if (pool_list_count(list) == 0 && argc != 0) {
+ pool_list_free(list);
+ return (1);
+ }
+
+ if (pool_list_count(list) == 0 && interval == 0) {
+ pool_list_free(list);
+ (void) fprintf(stderr, gettext("no pools available\n"));
+ return (1);
+ }
+
+ if ((l_histo || rq_histo) && (cmd != NULL || latency || queues)) {
+ pool_list_free(list);
+ (void) fprintf(stderr,
+ gettext("[-r|-w] isn't allowed with [-c|-l|-q]\n"));
+ usage(B_FALSE);
+ return (1);
+ }
+
+ if (l_histo && rq_histo) {
+ pool_list_free(list);
+ (void) fprintf(stderr,
+ gettext("Only one of [-r|-w] can be passed at a time\n"));
+ usage(B_FALSE);
+ return (1);
+ }
+
+ /*
+ * Enter the main iostat loop.
+ */
+ cb.cb_list = list;
+
+ if (l_histo) {
+ /*
+ * Histograms tables look out of place when you try to display
+ * them with the other stats, so make a rule that you can only
+ * print histograms by themselves.
+ */
+ cb.cb_flags = IOS_L_HISTO_M;
+ } else if (rq_histo) {
+ cb.cb_flags = IOS_RQ_HISTO_M;
+ } else {
+ cb.cb_flags = IOS_DEFAULT_M;
+ if (latency)
+ cb.cb_flags |= IOS_LATENCY_M;
+ if (queues)
+ cb.cb_flags |= IOS_QUEUES_M;
+ }
+
+ /*
+ * See if the module supports all the stats we want to display.
+ */
+ unsupported_flags = cb.cb_flags & ~get_stat_flags(list);
+ if (unsupported_flags) {
+ uint64_t f;
+ int idx;
+ fprintf(stderr,
+ gettext("The loaded zfs module doesn't support:"));
+
+ /* for each bit set in unsupported_flags */
+ for (f = unsupported_flags; f; f &= ~(1ULL << idx)) {
+ idx = lowbit64(f) - 1;
+ fprintf(stderr, " -%c", flag_to_arg[idx]);
+ }
+
+ fprintf(stderr, ". Try running a newer module.\n");
+ pool_list_free(list);
+
+ return (1);
+ }
+
+ for (;;) {
+ if ((npools = pool_list_count(list)) == 0)
+ (void) fprintf(stderr, gettext("no pools available\n"));
+ else {
+ /*
+ * If this is the first iteration and -y was supplied
+ * we skip any printing.
+ */
+ boolean_t skip = (omit_since_boot &&
+ cb.cb_iteration == 0);
+
+ /*
+ * Refresh all statistics. This is done as an
+ * explicit step before calculating the maximum name
+			 * width, so that any configuration changes are
+ * properly accounted for.
+ */
+ (void) pool_list_iter(list, B_FALSE, refresh_iostat,
+ &cb);
+
+ /*
+ * Iterate over all pools to determine the maximum width
+ * for the pool / device name column across all pools.
+ */
+ cb.cb_namewidth = 0;
+ (void) pool_list_iter(list, B_FALSE,
+ get_namewidth_iostat, &cb);
+
+ if (timestamp_fmt != NODATE)
+ print_timestamp(timestamp_fmt);
+
+ if (cmd != NULL && cb.cb_verbose &&
+ !(cb.cb_flags & IOS_ANYHISTO_M)) {
+ cb.vcdl = all_pools_for_each_vdev_run(argc,
+ argv, cmd, g_zfs, cb.cb_vdev_names,
+ cb.cb_vdev_names_count, cb.cb_name_flags);
+ } else {
+ cb.vcdl = NULL;
+ }
+
+
+ /*
+			 * Check the terminal size so that headers can still
+			 * be printed correctly when the terminal window's
+			 * height changes.
+ */
+ winheight = terminal_height();
+ /*
+			 * Are we connected to a TTY? If not, headers_once
+ * should be true, to avoid breaking scripts.
+ */
+ if (winheight < 0)
+ headers_once = B_TRUE;
+
+ /*
+			 * If it's the first time and we're not skipping it,
+			 * or if exactly one of skip or verbose mode is set,
+			 * print the header.
+ *
+ * The histogram code explicitly prints its header on
+ * every vdev, so skip this for histograms.
+ */
+ if (((++cb.cb_iteration == 1 && !skip) ||
+ (skip != verbose) ||
+ (!headers_once &&
+ (cb.cb_iteration % winheight) == 0)) &&
+ (!(cb.cb_flags & IOS_ANYHISTO_M)) &&
+ !cb.cb_scripted)
+ print_iostat_header(&cb);
+
+ if (skip) {
+ (void) fsleep(interval);
+ continue;
+ }
+
+ pool_list_iter(list, B_FALSE, print_iostat, &cb);
+
+ /*
+ * If there's more than one pool, and we're not in
+ * verbose mode (which prints a separator for us),
+ * then print a separator.
+ *
+ * In addition, if we're printing specific vdevs then
+ * we also want an ending separator.
+ */
+ if (((npools > 1 && !verbose &&
+ !(cb.cb_flags & IOS_ANYHISTO_M)) ||
+ (!(cb.cb_flags & IOS_ANYHISTO_M) &&
+ cb.cb_vdev_names_count)) &&
+ !cb.cb_scripted) {
+ print_iostat_separator(&cb);
+ if (cb.vcdl != NULL)
+ print_cmd_columns(cb.vcdl, 1);
+ printf("\n");
+ }
+
+ if (cb.vcdl != NULL)
+ free_vdev_cmd_data_list(cb.vcdl);
+
+ }
+
+ /*
+ * Flush the output so that redirection to a file isn't buffered
+ * indefinitely.
+ */
+ (void) fflush(stdout);
+
+ if (interval == 0)
+ break;
+
+ if (count != 0 && --count == 0)
+ break;
+
+ (void) fsleep(interval);
+ }
+
+ pool_list_free(list);
+
+ return (ret);
+}
+
+typedef struct list_cbdata {
+ boolean_t cb_verbose;
+ int cb_name_flags;
+ int cb_namewidth;
+ boolean_t cb_scripted;
+ zprop_list_t *cb_proplist;
+ boolean_t cb_literal;
+} list_cbdata_t;
+
+
+/*
+ * Given a list of columns to display, output appropriate headers for each one.
+ */
+static void
+print_header(list_cbdata_t *cb)
+{
+ zprop_list_t *pl = cb->cb_proplist;
+ char headerbuf[ZPOOL_MAXPROPLEN];
+ const char *header;
+ boolean_t first = B_TRUE;
+ boolean_t right_justify;
+ size_t width = 0;
+
+ for (; pl != NULL; pl = pl->pl_next) {
+ width = pl->pl_width;
+ if (first && cb->cb_verbose) {
+ /*
+ * Reset the width to accommodate the verbose listing
+ * of devices.
+ */
+ width = cb->cb_namewidth;
+ }
+
+ if (!first)
+ (void) printf(" ");
+ else
+ first = B_FALSE;
+
+ right_justify = B_FALSE;
+ if (pl->pl_prop != ZPROP_INVAL) {
+ header = zpool_prop_column_name(pl->pl_prop);
+ right_justify = zpool_prop_align_right(pl->pl_prop);
+ } else {
+ int i;
+
+ for (i = 0; pl->pl_user_prop[i] != '\0'; i++)
+ headerbuf[i] = toupper(pl->pl_user_prop[i]);
+ headerbuf[i] = '\0';
+ header = headerbuf;
+ }
+
+ if (pl->pl_next == NULL && !right_justify)
+ (void) printf("%s", header);
+ else if (right_justify)
+ (void) printf("%*s", (int)width, header);
+ else
+ (void) printf("%-*s", (int)width, header);
+ }
+
+ (void) printf("\n");
+}
+
+/*
+ * Given a pool and a list of properties, print out all the properties
+ * according to the described layout. Used by zpool_do_list().
+ */
+static void
+print_pool(zpool_handle_t *zhp, list_cbdata_t *cb)
+{
+ zprop_list_t *pl = cb->cb_proplist;
+ boolean_t first = B_TRUE;
+ char property[ZPOOL_MAXPROPLEN];
+ char *propstr;
+ boolean_t right_justify;
+ size_t width;
+
+ for (; pl != NULL; pl = pl->pl_next) {
+
+ width = pl->pl_width;
+ if (first && cb->cb_verbose) {
+ /*
+ * Reset the width to accommodate the verbose listing
+ * of devices.
+ */
+ width = cb->cb_namewidth;
+ }
+
+ if (!first) {
+ if (cb->cb_scripted)
+ (void) printf("\t");
+ else
+ (void) printf(" ");
+ } else {
+ first = B_FALSE;
+ }
+
+ right_justify = B_FALSE;
+ if (pl->pl_prop != ZPROP_INVAL) {
+ if (zpool_get_prop(zhp, pl->pl_prop, property,
+ sizeof (property), NULL, cb->cb_literal) != 0)
+ propstr = "-";
+ else
+ propstr = property;
+
+ right_justify = zpool_prop_align_right(pl->pl_prop);
+ } else if ((zpool_prop_feature(pl->pl_user_prop) ||
+ zpool_prop_unsupported(pl->pl_user_prop)) &&
+ zpool_prop_get_feature(zhp, pl->pl_user_prop, property,
+ sizeof (property)) == 0) {
+ propstr = property;
+ } else {
+ propstr = "-";
+ }
+
+
+ /*
+ * If this is being called in scripted mode, or if this is the
+ * last column and it is left-justified, don't include a width
+ * format specifier.
+ */
+ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify))
+ (void) printf("%s", propstr);
+ else if (right_justify)
+ (void) printf("%*s", (int)width, propstr);
+ else
+ (void) printf("%-*s", (int)width, propstr);
+ }
+
+ (void) printf("\n");
+}
+
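+/*
+ * Print a single column of 'zpool list' output for the given property.
+ * Values are formatted per property (e.g. capacity as a percentage and
+ * health as a fixed-width state string), "-" is substituted when the
+ * value is not valid for this vdev, and output is tab-separated in
+ * scripted mode or right-aligned otherwise.
+ */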
+static void
+print_one_column(zpool_prop_t prop, uint64_t value, const char *str,
+ boolean_t scripted, boolean_t valid, enum zfs_nicenum_format format)
+{
+ char propval[64];
+ boolean_t fixed;
+ size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL);
+
+ switch (prop) {
+ case ZPOOL_PROP_EXPANDSZ:
+ case ZPOOL_PROP_CHECKPOINT:
+ case ZPOOL_PROP_DEDUPRATIO:
+ if (value == 0)
+ (void) strlcpy(propval, "-", sizeof (propval));
+ else
+ zfs_nicenum_format(value, propval, sizeof (propval),
+ format);
+ break;
+ case ZPOOL_PROP_FRAGMENTATION:
+ if (value == ZFS_FRAG_INVALID) {
+ (void) strlcpy(propval, "-", sizeof (propval));
+ } else if (format == ZFS_NICENUM_RAW) {
+ (void) snprintf(propval, sizeof (propval), "%llu",
+ (unsigned long long)value);
+ } else {
+ (void) snprintf(propval, sizeof (propval), "%llu%%",
+ (unsigned long long)value);
+ }
+ break;
+ case ZPOOL_PROP_CAPACITY:
+ /* capacity value is in parts-per-10,000 (aka permyriad) */
+ if (format == ZFS_NICENUM_RAW)
+ (void) snprintf(propval, sizeof (propval), "%llu",
+ (unsigned long long)value / 100);
+ else
+ (void) snprintf(propval, sizeof (propval),
+ value < 1000 ? "%1.2f%%" : value < 10000 ?
+ "%2.1f%%" : "%3.0f%%", value / 100.0);
+ break;
+ case ZPOOL_PROP_HEALTH:
+ width = 8;
+ snprintf(propval, sizeof (propval), "%-*s", (int)width, str);
+ break;
+ default:
+ zfs_nicenum_format(value, propval, sizeof (propval), format);
+ }
+
+ if (!valid)
+ (void) strlcpy(propval, "-", sizeof (propval));
+
+ if (scripted)
+ (void) printf("\t%s", propval);
+ else
+ (void) printf(" %*s", (int)width, propval);
+}
+
+/*
+ * Print the static default line per vdev.
+ * Not compatible with the '-o' <proplist> option.
+ */
+static void
+print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
+ list_cbdata_t *cb, int depth, boolean_t isspare)
+{
+ nvlist_t **child;
+ vdev_stat_t *vs;
+ uint_t c, children;
+ char *vname;
+ boolean_t scripted = cb->cb_scripted;
+ uint64_t islog = B_FALSE;
+ char *dashes = "%-*s - - - - "
+ "- - - - -\n";
+
+ verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &c) == 0);
+
+ if (name != NULL) {
+ boolean_t toplevel = (vs->vs_space != 0);
+ uint64_t cap;
+ enum zfs_nicenum_format format;
+ const char *state;
+
+ if (cb->cb_literal)
+ format = ZFS_NICENUM_RAW;
+ else
+ format = ZFS_NICENUM_1024;
+
+ if (strcmp(name, VDEV_TYPE_INDIRECT) == 0)
+ return;
+
+ if (scripted)
+ (void) printf("\t%s", name);
+ else if (strlen(name) + depth > cb->cb_namewidth)
+ (void) printf("%*s%s", depth, "", name);
+ else
+ (void) printf("%*s%s%*s", depth, "", name,
+ (int)(cb->cb_namewidth - strlen(name) - depth), "");
+
+ /*
+ * Print the properties for the individual vdevs. Some
+		 * properties are only applicable to toplevel vdevs. The
+		 * 'toplevel' boolean value is passed to print_one_column()
+		 * to indicate whether the value is valid.
+ */
+ print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted,
+ toplevel, format);
+ print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL,
+ scripted, toplevel, format);
+ print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc,
+ NULL, scripted, toplevel, format);
+ print_one_column(ZPOOL_PROP_CHECKPOINT,
+ vs->vs_checkpoint_space, NULL, scripted, toplevel, format);
+ print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, NULL,
+ scripted, B_TRUE, format);
+ print_one_column(ZPOOL_PROP_FRAGMENTATION,
+ vs->vs_fragmentation, NULL, scripted,
+ (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel),
+ format);
+ cap = (vs->vs_space == 0) ? 0 :
+ (vs->vs_alloc * 10000 / vs->vs_space);
+ print_one_column(ZPOOL_PROP_CAPACITY, cap, NULL,
+ scripted, toplevel, format);
+ print_one_column(ZPOOL_PROP_DEDUPRATIO, 0, NULL,
+ scripted, toplevel, format);
+ state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
+ if (isspare) {
+ if (vs->vs_aux == VDEV_AUX_SPARED)
+ state = "INUSE";
+ else if (vs->vs_state == VDEV_STATE_HEALTHY)
+ state = "AVAIL";
+ }
+ print_one_column(ZPOOL_PROP_HEALTH, 0, state, scripted,
+ B_TRUE, format);
+ (void) printf("\n");
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ return;
+
+ /* list the normal vdevs first */
+ for (c = 0; c < children; c++) {
+ uint64_t ishole = B_FALSE;
+
+ if (nvlist_lookup_uint64(child[c],
+ ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole)
+ continue;
+
+ if (nvlist_lookup_uint64(child[c],
+ ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog)
+ continue;
+
+ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
+ continue;
+
+ vname = zpool_vdev_name(g_zfs, zhp, child[c],
+ cb->cb_name_flags);
+ print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE);
+ free(vname);
+ }
+
+ /* list the classes: 'logs', 'dedup', and 'special' */
+ for (uint_t n = 0; n < 3; n++) {
+ boolean_t printed = B_FALSE;
+
+ for (c = 0; c < children; c++) {
+ char *bias = NULL;
+ char *type = NULL;
+
+ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ &islog) == 0 && islog) {
+ bias = VDEV_ALLOC_CLASS_LOGS;
+ } else {
+ (void) nvlist_lookup_string(child[c],
+ ZPOOL_CONFIG_ALLOCATION_BIAS, &bias);
+ (void) nvlist_lookup_string(child[c],
+ ZPOOL_CONFIG_TYPE, &type);
+ }
+ if (bias == NULL || strcmp(bias, class_name[n]) != 0)
+ continue;
+ if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0)
+ continue;
+
+ if (!printed) {
+ /* LINTED E_SEC_PRINTF_VAR_FMT */
+ (void) printf(dashes, cb->cb_namewidth,
+ class_name[n]);
+ printed = B_TRUE;
+ }
+ vname = zpool_vdev_name(g_zfs, zhp, child[c],
+ cb->cb_name_flags);
+ print_list_stats(zhp, vname, child[c], cb, depth + 2,
+ B_FALSE);
+ free(vname);
+ }
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0 && children > 0) {
+ /* LINTED E_SEC_PRINTF_VAR_FMT */
+ (void) printf(dashes, cb->cb_namewidth, "cache");
+ for (c = 0; c < children; c++) {
+ vname = zpool_vdev_name(g_zfs, zhp, child[c],
+ cb->cb_name_flags);
+ print_list_stats(zhp, vname, child[c], cb, depth + 2,
+ B_FALSE);
+ free(vname);
+ }
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child,
+ &children) == 0 && children > 0) {
+ /* LINTED E_SEC_PRINTF_VAR_FMT */
+ (void) printf(dashes, cb->cb_namewidth, "spare");
+ for (c = 0; c < children; c++) {
+ vname = zpool_vdev_name(g_zfs, zhp, child[c],
+ cb->cb_name_flags);
+ print_list_stats(zhp, vname, child[c], cb, depth + 2,
+ B_TRUE);
+ free(vname);
+ }
+ }
+}
+
+/*
+ * Generic callback function to list a pool.
+ */
+static int
+list_callback(zpool_handle_t *zhp, void *data)
+{
+ list_cbdata_t *cbp = data;
+
+ print_pool(zhp, cbp);
+
+ if (cbp->cb_verbose) {
+ nvlist_t *config, *nvroot;
+
+ config = zpool_get_config(zhp, NULL);
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ print_list_stats(zhp, NULL, nvroot, cbp, 0, B_FALSE);
+ }
+
+ return (0);
+}
+
+/*
+ * Set the minimum pool/vdev name column width. The width must be at least 9,
+ * but may be as large as needed.
+ */
+static int
+get_namewidth_list(zpool_handle_t *zhp, void *data)
+{
+ list_cbdata_t *cb = data;
+ int width;
+
+ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags,
+ cb->cb_verbose);
+
+ if (width < 9)
+ width = 9;
+
+ cb->cb_namewidth = width;
+
+ return (0);
+}
+
+/*
+ * zpool list [-gHLpP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]]
+ *
+ * -g Display guid for individual vdev name.
+ * -H Scripted mode. Don't display headers, and separate properties
+ * by a single tab.
+ * -L Follow links when resolving vdev path name.
+ * -o List of properties to display. Defaults to
+ *		"name,size,allocated,free,checkpoint,expandsize,fragmentation,"
+ *		"capacity,dedupratio,health,altroot"
+ * -p Display values in parsable (exact) format.
+ * -P Display full path for vdev name.
+ * -T Display a timestamp in date(1) or Unix format
+ *
+ * List all pools in the system, whether or not they're healthy. Output space
+ * statistics for each one, as well as health status summary.
+ */
+int
+zpool_do_list(int argc, char **argv)
+{
+ int c;
+ int ret = 0;
+ list_cbdata_t cb = { 0 };
+ static char default_props[] =
+ "name,size,allocated,free,checkpoint,expandsize,fragmentation,"
+ "capacity,dedupratio,health,altroot";
+ char *props = default_props;
+ float interval = 0;
+ unsigned long count = 0;
+ zpool_list_t *list;
+ boolean_t first = B_TRUE;
+
+ /* check options */
+ while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) {
+ switch (c) {
+ case 'g':
+ cb.cb_name_flags |= VDEV_NAME_GUID;
+ break;
+ case 'H':
+ cb.cb_scripted = B_TRUE;
+ break;
+ case 'L':
+ cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
+ break;
+ case 'o':
+ props = optarg;
+ break;
+ case 'P':
+ cb.cb_name_flags |= VDEV_NAME_PATH;
+ break;
+ case 'p':
+ cb.cb_literal = B_TRUE;
+ break;
+ case 'T':
+ get_timestamp_arg(*optarg);
+ break;
+ case 'v':
+ cb.cb_verbose = B_TRUE;
+ cb.cb_namewidth = 8; /* 8 until precalc is avail */
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ get_interval_count(&argc, argv, &interval, &count);
+
+ if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0)
+ usage(B_FALSE);
+
+ for (;;) {
+ if ((list = pool_list_get(argc, argv, &cb.cb_proplist,
+ &ret)) == NULL)
+ return (1);
+
+ if (pool_list_count(list) == 0)
+ break;
+
+ cb.cb_namewidth = 0;
+ (void) pool_list_iter(list, B_FALSE, get_namewidth_list, &cb);
+
+ if (timestamp_fmt != NODATE)
+ print_timestamp(timestamp_fmt);
+
+ if (!cb.cb_scripted && (first || cb.cb_verbose)) {
+ print_header(&cb);
+ first = B_FALSE;
+ }
+ ret = pool_list_iter(list, B_TRUE, list_callback, &cb);
+
+ if (interval == 0)
+ break;
+
+ if (count != 0 && --count == 0)
+ break;
+
+ pool_list_free(list);
+ (void) fsleep(interval);
+ }
+
+ if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) {
+ (void) printf(gettext("no pools available\n"));
+ ret = 0;
+ }
+
+ pool_list_free(list);
+ zprop_free_list(cb.cb_proplist);
+ return (ret);
+}
+
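+/*
+ * Shared implementation for 'zpool attach' and 'zpool replace'.  Parses the
+ * common options, inherits the pool's ashift property unless one was given
+ * with -o, builds the new vdev specification with make_root_vdev(), and
+ * performs the attach/replace via zpool_vdev_attach(), optionally waiting
+ * for the resilver or replacement to complete.
+ */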
+static int
+zpool_do_attach_or_replace(int argc, char **argv, int replacing)
+{
+ boolean_t force = B_FALSE;
+ boolean_t rebuild = B_FALSE;
+ boolean_t wait = B_FALSE;
+ int c;
+ nvlist_t *nvroot;
+ char *poolname, *old_disk, *new_disk;
+ zpool_handle_t *zhp;
+ nvlist_t *props = NULL;
+ char *propval;
+ int ret;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "fo:sw")) != -1) {
+ switch (c) {
+ case 'f':
+ force = B_TRUE;
+ break;
+ case 'o':
+ if ((propval = strchr(optarg, '=')) == NULL) {
+ (void) fprintf(stderr, gettext("missing "
+ "'=' for -o option\n"));
+ usage(B_FALSE);
+ }
+ *propval = '\0';
+ propval++;
+
+ if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) ||
+ (add_prop_list(optarg, propval, &props, B_TRUE)))
+ usage(B_FALSE);
+ break;
+ case 's':
+ rebuild = B_TRUE;
+ break;
+ case 'w':
+ wait = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* get pool name and check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ usage(B_FALSE);
+ }
+
+ poolname = argv[0];
+
+ if (argc < 2) {
+ (void) fprintf(stderr,
+ gettext("missing <device> specification\n"));
+ usage(B_FALSE);
+ }
+
+ old_disk = argv[1];
+
+ if (argc < 3) {
+ if (!replacing) {
+ (void) fprintf(stderr,
+ gettext("missing <new_device> specification\n"));
+ usage(B_FALSE);
+ }
+ new_disk = old_disk;
+ argc -= 1;
+ argv += 1;
+ } else {
+ new_disk = argv[2];
+ argc -= 2;
+ argv += 2;
+ }
+
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ if ((zhp = zpool_open(g_zfs, poolname)) == NULL) {
+ nvlist_free(props);
+ return (1);
+ }
+
+ if (zpool_get_config(zhp, NULL) == NULL) {
+ (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
+ poolname);
+ zpool_close(zhp);
+ nvlist_free(props);
+ return (1);
+ }
+
+ /* unless manually specified use "ashift" pool property (if set) */
+ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) {
+ int intval;
+ zprop_source_t src;
+ char strval[ZPOOL_MAXPROPLEN];
+
+ intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src);
+ if (src != ZPROP_SRC_DEFAULT) {
+ (void) sprintf(strval, "%" PRId32, intval);
+ verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval,
+ &props, B_TRUE) == 0);
+ }
+ }
+
+ nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE,
+ argc, argv);
+ if (nvroot == NULL) {
+ zpool_close(zhp);
+ nvlist_free(props);
+ return (1);
+ }
+
+ ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing,
+ rebuild);
+
+ if (ret == 0 && wait)
+ ret = zpool_wait(zhp,
+ replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER);
+
+ nvlist_free(props);
+ nvlist_free(nvroot);
+ zpool_close(zhp);
+
+ return (ret);
+}
+
+/*
+ * zpool replace [-fsw] [-o property=value] <pool> <device> <new_device>
+ *
+ * -f Force attach, even if <new_device> appears to be in use.
+ * -s Use sequential instead of healing reconstruction for resilver.
+ * -o Set property=value.
+ * -w Wait for replacing to complete before returning
+ *
+ * Replace <device> with <new_device>.
+ */
+/* ARGSUSED */
+int
+zpool_do_replace(int argc, char **argv)
+{
+ return (zpool_do_attach_or_replace(argc, argv, B_TRUE));
+}
+
+/*
+ * zpool attach [-fsw] [-o property=value] <pool> <device> <new_device>
+ *
+ * -f Force attach, even if <new_device> appears to be in use.
+ * -s Use sequential instead of healing reconstruction for resilver.
+ * -o Set property=value.
+ * -w Wait for resilvering to complete before returning
+ *
+ * Attach <new_device> to the mirror containing <device>. If <device> is not
+ * part of a mirror, then <device> will be transformed into a mirror of
+ * <device> and <new_device>. In either case, <new_device> will begin life
+ * with a DTL of [0, now], and will immediately begin to resilver itself.
+ */
+int
+zpool_do_attach(int argc, char **argv)
+{
+ return (zpool_do_attach_or_replace(argc, argv, B_FALSE));
+}
+
+/*
+ * zpool detach [-f] <pool> <device>
+ *
+ * -f Force detach of <device>, even if DTLs argue against it
+ * (not supported yet)
+ *
+ * Detach a device from a mirror. The operation will be refused if <device>
+ * is the last device in the mirror, or if the DTLs indicate that this device
+ * has the only valid copy of some data.
+ */
+/* ARGSUSED */
+int
+zpool_do_detach(int argc, char **argv)
+{
+ int c;
+ char *poolname, *path;
+ zpool_handle_t *zhp;
+ int ret;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "")) != -1) {
+ switch (c) {
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* get pool name and check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ usage(B_FALSE);
+ }
+
+ if (argc < 2) {
+ (void) fprintf(stderr,
+ gettext("missing <device> specification\n"));
+ usage(B_FALSE);
+ }
+
+ poolname = argv[0];
+ path = argv[1];
+
+ if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+ return (1);
+
+ ret = zpool_vdev_detach(zhp, path);
+
+ zpool_close(zhp);
+
+ return (ret);
+}
+
+/*
+ * zpool split [-gLlnP] [-o prop=val] ...
+ * [-o mntopt] ...
+ * [-R altroot] <pool> <newpool> [<device> ...]
+ *
+ * -g Display guid for individual vdev name.
+ * -L Follow links when resolving vdev path name.
+ * -n Do not split the pool, but display the resulting layout if
+ * it were to be split.
+ * -o Set property=value, or set mount options.
+ * -P Display full path for vdev name.
+ * -R Mount the split-off pool under an alternate root.
+ * -l Load encryption keys while importing.
+ *
+ * Splits the named pool and gives it the new pool name. Devices to be split
+ * off may be listed, provided that no more than one device is specified
+ * per top-level vdev mirror. The newly split pool is left in an exported
+ * state unless -R is specified.
+ *
+ * Restrictions: the top level of the pool must only be made up of
+ * mirrors; all devices in the pool must be healthy; no device may be
+ * undergoing a resilvering operation.
+ */
+int
+zpool_do_split(int argc, char **argv)
+{
+ char *srcpool, *newpool, *propval;
+ char *mntopts = NULL;
+ splitflags_t flags;
+ int c, ret = 0;
+ boolean_t loadkeys = B_FALSE;
+ zpool_handle_t *zhp;
+ nvlist_t *config, *props = NULL;
+
+ flags.dryrun = B_FALSE;
+ flags.import = B_FALSE;
+ flags.name_flags = 0;
+
+ /* check options */
+ while ((c = getopt(argc, argv, ":gLR:lno:P")) != -1) {
+ switch (c) {
+ case 'g':
+ flags.name_flags |= VDEV_NAME_GUID;
+ break;
+ case 'L':
+ flags.name_flags |= VDEV_NAME_FOLLOW_LINKS;
+ break;
+ case 'R':
+ flags.import = B_TRUE;
+ if (add_prop_list(
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg,
+ &props, B_TRUE) != 0) {
+ nvlist_free(props);
+ usage(B_FALSE);
+ }
+ break;
+ case 'l':
+ loadkeys = B_TRUE;
+ break;
+ case 'n':
+ flags.dryrun = B_TRUE;
+ break;
+ case 'o':
+ if ((propval = strchr(optarg, '=')) != NULL) {
+ *propval = '\0';
+ propval++;
+ if (add_prop_list(optarg, propval,
+ &props, B_TRUE) != 0) {
+ nvlist_free(props);
+ usage(B_FALSE);
+ }
+ } else {
+ mntopts = optarg;
+ }
+ break;
+ case 'P':
+ flags.name_flags |= VDEV_NAME_PATH;
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ break;
+ }
+ }
+
+ if (!flags.import && mntopts != NULL) {
+ (void) fprintf(stderr, gettext("setting mntopts is only "
+ "valid when importing the pool\n"));
+ usage(B_FALSE);
+ }
+
+ if (!flags.import && loadkeys) {
+ (void) fprintf(stderr, gettext("loading keys is only "
+ "valid when importing the pool\n"));
+ usage(B_FALSE);
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("Missing pool name\n"));
+ usage(B_FALSE);
+ }
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("Missing new pool name\n"));
+ usage(B_FALSE);
+ }
+
+ srcpool = argv[0];
+ newpool = argv[1];
+
+ argc -= 2;
+ argv += 2;
+
+ if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) {
+ nvlist_free(props);
+ return (1);
+ }
+
+ config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv);
+ if (config == NULL) {
+ ret = 1;
+ } else {
+ if (flags.dryrun) {
+ (void) printf(gettext("would create '%s' with the "
+ "following layout:\n\n"), newpool);
+ print_vdev_tree(NULL, newpool, config, 0, "",
+ flags.name_flags);
+ }
+ }
+
+ zpool_close(zhp);
+
+ if (ret != 0 || flags.dryrun || !flags.import) {
+ nvlist_free(config);
+ nvlist_free(props);
+ return (ret);
+ }
+
+ /*
+ * The split was successful. Now we need to open the new
+ * pool and import it.
+ */
+ if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) {
+ nvlist_free(config);
+ nvlist_free(props);
+ return (1);
+ }
+
+ if (loadkeys) {
+ ret = zfs_crypto_attempt_load_keys(g_zfs, newpool);
+ if (ret != 0)
+ ret = 1;
+ }
+
+ if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
+ zpool_enable_datasets(zhp, mntopts, 0) != 0) {
+ ret = 1;
+ (void) fprintf(stderr, gettext("Split was successful, but "
+ "the datasets could not all be mounted\n"));
+ (void) fprintf(stderr, gettext("Try doing '%s' with a "
+ "different altroot\n"), "zpool import");
+ }
+ zpool_close(zhp);
+ nvlist_free(config);
+ nvlist_free(props);
+
+ return (ret);
+}
+
+
+
+/*
+ * zpool online <pool> <device> ...
+ */
+int
+zpool_do_online(int argc, char **argv)
+{
+ int c, i;
+ char *poolname;
+ zpool_handle_t *zhp;
+ int ret = 0;
+ vdev_state_t newstate;
+ int flags = 0;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "e")) != -1) {
+ switch (c) {
+ case 'e':
+ flags |= ZFS_ONLINE_EXPAND;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* get pool name and check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name\n"));
+ usage(B_FALSE);
+ }
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing device name\n"));
+ usage(B_FALSE);
+ }
+
+ poolname = argv[0];
+
+ if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+ return (1);
+
+ for (i = 1; i < argc; i++) {
+ if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) {
+ if (newstate != VDEV_STATE_HEALTHY) {
+ (void) printf(gettext("warning: device '%s' "
+ "onlined, but remains in faulted state\n"),
+ argv[i]);
+ if (newstate == VDEV_STATE_FAULTED)
+ (void) printf(gettext("use 'zpool "
+ "clear' to restore a faulted "
+ "device\n"));
+ else
+ (void) printf(gettext("use 'zpool "
+ "replace' to replace devices "
+ "that are no longer present\n"));
+ }
+ } else {
+ ret = 1;
+ }
+ }
+
+ zpool_close(zhp);
+
+ return (ret);
+}
+
+/*
+ * zpool offline [-ft] <pool> <device> ...
+ *
+ * -f Force the device into a faulted state.
+ *
+ * -t Only take the device off-line temporarily. The offline/faulted
+ * state will not be persistent across reboots.
+ */
+/* ARGSUSED */
+int
+zpool_do_offline(int argc, char **argv)
+{
+ int c, i;
+ char *poolname;
+ zpool_handle_t *zhp;
+ int ret = 0;
+ boolean_t istmp = B_FALSE;
+ boolean_t fault = B_FALSE;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "ft")) != -1) {
+ switch (c) {
+ case 'f':
+ fault = B_TRUE;
+ break;
+ case 't':
+ istmp = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* get pool name and check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name\n"));
+ usage(B_FALSE);
+ }
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing device name\n"));
+ usage(B_FALSE);
+ }
+
+ poolname = argv[0];
+
+ if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+ return (1);
+
+ for (i = 1; i < argc; i++) {
+ if (fault) {
+ uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]);
+ vdev_aux_t aux;
+ if (istmp == B_FALSE) {
+ /* Force the fault to persist across imports */
+ aux = VDEV_AUX_EXTERNAL_PERSIST;
+ } else {
+ aux = VDEV_AUX_EXTERNAL;
+ }
+
+ if (guid == 0 || zpool_vdev_fault(zhp, guid, aux) != 0)
+ ret = 1;
+ } else {
+ if (zpool_vdev_offline(zhp, argv[i], istmp) != 0)
+ ret = 1;
+ }
+ }
+
+ zpool_close(zhp);
+
+ return (ret);
+}
+
+/*
+ * zpool clear <pool> [device]
+ *
+ * Clear all errors associated with a pool or a particular device.
+ */
+int
+zpool_do_clear(int argc, char **argv)
+{
+ int c;
+ int ret = 0;
+ boolean_t dryrun = B_FALSE;
+ boolean_t do_rewind = B_FALSE;
+ boolean_t xtreme_rewind = B_FALSE;
+ uint32_t rewind_policy = ZPOOL_NO_REWIND;
+ nvlist_t *policy = NULL;
+ zpool_handle_t *zhp;
+ char *pool, *device;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "FnX")) != -1) {
+ switch (c) {
+ case 'F':
+ do_rewind = B_TRUE;
+ break;
+ case 'n':
+ dryrun = B_TRUE;
+ break;
+ case 'X':
+ xtreme_rewind = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name\n"));
+ usage(B_FALSE);
+ }
+
+ if (argc > 2) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ if ((dryrun || xtreme_rewind) && !do_rewind) {
+ (void) fprintf(stderr,
+ gettext("-n or -X only meaningful with -F\n"));
+ usage(B_FALSE);
+ }
+ if (dryrun)
+ rewind_policy = ZPOOL_TRY_REWIND;
+ else if (do_rewind)
+ rewind_policy = ZPOOL_DO_REWIND;
+ if (xtreme_rewind)
+ rewind_policy |= ZPOOL_EXTREME_REWIND;
+
+ /* In future, further rewind policy choices can be passed along here */
+ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+ nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY,
+ rewind_policy) != 0) {
+ return (1);
+ }
+
+ pool = argv[0];
+ device = argc == 2 ? argv[1] : NULL;
+
+ if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
+ nvlist_free(policy);
+ return (1);
+ }
+
+ if (zpool_clear(zhp, device, policy) != 0)
+ ret = 1;
+
+ zpool_close(zhp);
+
+ nvlist_free(policy);
+
+ return (ret);
+}
+
+/*
+ * zpool reguid <pool>
+ */
+int
+zpool_do_reguid(int argc, char **argv)
+{
+ int c;
+ char *poolname;
+ zpool_handle_t *zhp;
+ int ret = 0;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "")) != -1) {
+ switch (c) {
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* get pool name and check number of arguments */
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name\n"));
+ usage(B_FALSE);
+ }
+
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ poolname = argv[0];
+ if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+ return (1);
+
+ ret = zpool_reguid(zhp);
+
+ zpool_close(zhp);
+ return (ret);
+}
+
+
+/*
+ * zpool reopen <pool>
+ *
+ * Reopen the pool so that the kernel can update the sizes of all vdevs.
+ */
+int
+zpool_do_reopen(int argc, char **argv)
+{
+ int c;
+ int ret = 0;
+ boolean_t scrub_restart = B_TRUE;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "n")) != -1) {
+ switch (c) {
+ case 'n':
+ scrub_restart = B_FALSE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* if argc == 0 we will execute zpool_reopen_one on all pools */
+ ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_reopen_one,
+ &scrub_restart);
+
+ return (ret);
+}
+
+typedef struct scrub_cbdata {
+ int cb_type;
+ pool_scrub_cmd_t cb_scrub_cmd;
+} scrub_cbdata_t;
+
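+/*
+ * Returns B_TRUE if the pool has a checkpoint that either exists or is
+ * currently being discarded, based on the checkpoint stats in the config.
+ */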
+static boolean_t
+zpool_has_checkpoint(zpool_handle_t *zhp)
+{
+ nvlist_t *config, *nvroot;
+
+ config = zpool_get_config(zhp, NULL);
+
+ if (config != NULL) {
+ pool_checkpoint_stat_t *pcs = NULL;
+ uint_t c;
+
+ nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
+
+ if (pcs == NULL || pcs->pcs_state == CS_NONE)
+ return (B_FALSE);
+
+ assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS ||
+ pcs->pcs_state == CS_CHECKPOINT_DISCARDING);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
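+/*
+ * Per-pool callback for 'zpool scrub' and 'zpool resilver'.  Skips pools
+ * that are unavailable, issues the requested scan operation, and warns
+ * when a checkpoint prevents some blocks from being scrubbed.
+ */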
+static int
+scrub_callback(zpool_handle_t *zhp, void *data)
+{
+ scrub_cbdata_t *cb = data;
+ int err;
+
+ /*
+ * Ignore faulted pools.
+ */
+ if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+ (void) fprintf(stderr, gettext("cannot scan '%s': pool is "
+ "currently unavailable\n"), zpool_get_name(zhp));
+ return (1);
+ }
+
+ err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd);
+
+ if (err == 0 && zpool_has_checkpoint(zhp) &&
+ cb->cb_type == POOL_SCAN_SCRUB) {
+ (void) printf(gettext("warning: will not scrub state that "
+ "belongs to the checkpoint of pool '%s'\n"),
+ zpool_get_name(zhp));
+ }
+
+ return (err != 0);
+}
+
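+/*
+ * Per-pool callback used to implement -w: blocks until the given
+ * activity has completed on the pool.
+ */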
+static int
+wait_callback(zpool_handle_t *zhp, void *data)
+{
+ zpool_wait_activity_t *act = data;
+ return (zpool_wait(zhp, *act));
+}
+
+/*
+ * zpool scrub [-s | -p] [-w] <pool> ...
+ *
+ * -s Stop. Stops any in-progress scrub.
+ * -p Pause. Pause in-progress scrub.
+ * -w Wait. Blocks until scrub has completed.
+ */
+int
+zpool_do_scrub(int argc, char **argv)
+{
+ int c;
+ scrub_cbdata_t cb;
+ boolean_t wait = B_FALSE;
+ int error;
+
+ cb.cb_type = POOL_SCAN_SCRUB;
+ cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "spw")) != -1) {
+ switch (c) {
+ case 's':
+ cb.cb_type = POOL_SCAN_NONE;
+ break;
+ case 'p':
+ cb.cb_scrub_cmd = POOL_SCRUB_PAUSE;
+ break;
+ case 'w':
+ wait = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ if (cb.cb_type == POOL_SCAN_NONE &&
+ cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) {
+ (void) fprintf(stderr, gettext("invalid option combination: "
+ "-s and -p are mutually exclusive\n"));
+ usage(B_FALSE);
+ }
+
+ if (wait && (cb.cb_type == POOL_SCAN_NONE ||
+ cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) {
+ (void) fprintf(stderr, gettext("invalid option combination: "
+ "-w cannot be used with -p or -s\n"));
+ usage(B_FALSE);
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ usage(B_FALSE);
+ }
+
+ error = for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb);
+
+ if (wait && !error) {
+ zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB;
+ error = for_each_pool(argc, argv, B_TRUE, NULL, wait_callback,
+ &act);
+ }
+
+ return (error);
+}
+
+/*
+ * zpool resilver <pool> ...
+ *
+ * Restarts any in-progress resilver
+ */
+int
+zpool_do_resilver(int argc, char **argv)
+{
+ int c;
+ scrub_cbdata_t cb;
+
+ cb.cb_type = POOL_SCAN_RESILVER;
+ cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "")) != -1) {
+ switch (c) {
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ usage(B_FALSE);
+ }
+
+ return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
+}
+
+/*
+ * zpool trim [-d] [-r <rate>] [-c | -s] <pool> [<device> ...]
+ *
+ * -c Cancel. Ends any in-progress trim.
+ * -d Secure trim. Requires kernel and device support.
+ * -r <rate> Sets the TRIM rate in bytes (per second). Supports
+ * adding a multiplier suffix such as 'k' or 'm'.
+ * -s Suspend. TRIM can then be restarted with no flags.
+ * -w Wait. Blocks until trimming has completed.
+ */
+int
+zpool_do_trim(int argc, char **argv)
+{
+ struct option long_options[] = {
+ {"cancel", no_argument, NULL, 'c'},
+ {"secure", no_argument, NULL, 'd'},
+ {"rate", required_argument, NULL, 'r'},
+ {"suspend", no_argument, NULL, 's'},
+ {"wait", no_argument, NULL, 'w'},
+ {0, 0, 0, 0}
+ };
+
+ pool_trim_func_t cmd_type = POOL_TRIM_START;
+ uint64_t rate = 0;
+ boolean_t secure = B_FALSE;
+ boolean_t wait = B_FALSE;
+
+ int c;
+ while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL))
+ != -1) {
+ switch (c) {
+ case 'c':
+ if (cmd_type != POOL_TRIM_START &&
+ cmd_type != POOL_TRIM_CANCEL) {
+ (void) fprintf(stderr, gettext("-c cannot be "
+ "combined with other options\n"));
+ usage(B_FALSE);
+ }
+ cmd_type = POOL_TRIM_CANCEL;
+ break;
+ case 'd':
+ if (cmd_type != POOL_TRIM_START) {
+ (void) fprintf(stderr, gettext("-d cannot be "
+ "combined with the -c or -s options\n"));
+ usage(B_FALSE);
+ }
+ secure = B_TRUE;
+ break;
+ case 'r':
+ if (cmd_type != POOL_TRIM_START) {
+ (void) fprintf(stderr, gettext("-r cannot be "
+ "combined with the -c or -s options\n"));
+ usage(B_FALSE);
+ }
+ if (zfs_nicestrtonum(NULL, optarg, &rate) == -1) {
+ (void) fprintf(stderr,
+ gettext("invalid value for rate\n"));
+ usage(B_FALSE);
+ }
+ break;
+ case 's':
+ if (cmd_type != POOL_TRIM_START &&
+ cmd_type != POOL_TRIM_SUSPEND) {
+ (void) fprintf(stderr, gettext("-s cannot be "
+ "combined with other options\n"));
+ usage(B_FALSE);
+ }
+ cmd_type = POOL_TRIM_SUSPEND;
+ break;
+ case 'w':
+ wait = B_TRUE;
+ break;
+ case '?':
+ if (optopt != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid option '%c'\n"), optopt);
+ } else {
+ (void) fprintf(stderr,
+ gettext("invalid option '%s'\n"),
+ argv[optind - 1]);
+ }
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing pool name argument\n"));
+ usage(B_FALSE);
+ return (-1);
+ }
+
+ if (wait && (cmd_type != POOL_TRIM_START)) {
+ (void) fprintf(stderr, gettext("-w cannot be used with -c or "
+ "-s\n"));
+ usage(B_FALSE);
+ }
+
+ char *poolname = argv[0];
+ zpool_handle_t *zhp = zpool_open(g_zfs, poolname);
+ if (zhp == NULL)
+ return (-1);
+
+ trimflags_t trim_flags = {
+ .secure = secure,
+ .rate = rate,
+ .wait = wait,
+ };
+
+ nvlist_t *vdevs = fnvlist_alloc();
+ if (argc == 1) {
+ /* no individual leaf vdevs specified, so add them all */
+ nvlist_t *config = zpool_get_config(zhp, NULL);
+ nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE);
+ zpool_collect_leaves(zhp, nvroot, vdevs);
+ trim_flags.fullpool = B_TRUE;
+ } else {
+ trim_flags.fullpool = B_FALSE;
+ for (int i = 1; i < argc; i++) {
+ fnvlist_add_boolean(vdevs, argv[i]);
+ }
+ }
+
+ int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags);
+
+ fnvlist_free(vdevs);
+ zpool_close(zhp);
+
+ return (error);
+}
+
+/*
+ * Converts a total number of seconds to a human-readable string broken
+ * down into days/hours/minutes/seconds.
+ */
+static void
+secs_to_dhms(uint64_t total, char *buf)
+{
+ uint64_t days = total / 60 / 60 / 24;
+ uint64_t hours = (total / 60 / 60) % 24;
+ uint64_t mins = (total / 60) % 60;
+ uint64_t secs = (total % 60);
+
+ if (days > 0) {
+ (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu",
+ (u_longlong_t)days, (u_longlong_t)hours,
+ (u_longlong_t)mins, (u_longlong_t)secs);
+ } else {
+ (void) sprintf(buf, "%02llu:%02llu:%02llu",
+ (u_longlong_t)hours, (u_longlong_t)mins,
+ (u_longlong_t)secs);
+ }
+}
+
+/*
+ * Print out detailed scrub status.
+ */
+static void
+print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
+{
+ time_t start, end, pause;
+ uint64_t pass_scanned, scanned, pass_issued, issued, total;
+ uint64_t elapsed, scan_rate, issue_rate;
+ double fraction_done;
+ char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
+ char srate_buf[7], irate_buf[7], time_buf[32];
+
+ printf(" ");
+ printf_color(ANSI_BOLD, gettext("scan:"));
+ printf(" ");
+
+ /* If there's never been a scan, there's not much to say. */
+ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE ||
+ ps->pss_func >= POOL_SCAN_FUNCS) {
+ (void) printf(gettext("none requested\n"));
+ return;
+ }
+
+ start = ps->pss_start_time;
+ end = ps->pss_end_time;
+ pause = ps->pss_pass_scrub_pause;
+
+ zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf));
+
+ assert(ps->pss_func == POOL_SCAN_SCRUB ||
+ ps->pss_func == POOL_SCAN_RESILVER);
+
+ /* Scan is finished or canceled. */
+ if (ps->pss_state == DSS_FINISHED) {
+ secs_to_dhms(end - start, time_buf);
+
+ if (ps->pss_func == POOL_SCAN_SCRUB) {
+ (void) printf(gettext("scrub repaired %s "
+ "in %s with %llu errors on %s"), processed_buf,
+ time_buf, (u_longlong_t)ps->pss_errors,
+ ctime(&end));
+ } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+ (void) printf(gettext("resilvered %s "
+ "in %s with %llu errors on %s"), processed_buf,
+ time_buf, (u_longlong_t)ps->pss_errors,
+ ctime(&end));
+ }
+ return;
+ } else if (ps->pss_state == DSS_CANCELED) {
+ if (ps->pss_func == POOL_SCAN_SCRUB) {
+ (void) printf(gettext("scrub canceled on %s"),
+ ctime(&end));
+ } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+ (void) printf(gettext("resilver canceled on %s"),
+ ctime(&end));
+ }
+ return;
+ }
+
+ assert(ps->pss_state == DSS_SCANNING);
+
+ /* Scan is in progress. Resilvers can't be paused. */
+ if (ps->pss_func == POOL_SCAN_SCRUB) {
+ if (pause == 0) {
+ (void) printf(gettext("scrub in progress since %s"),
+ ctime(&start));
+ } else {
+ (void) printf(gettext("scrub paused since %s"),
+ ctime(&pause));
+ (void) printf(gettext("\tscrub started on %s"),
+ ctime(&start));
+ }
+ } else if (ps->pss_func == POOL_SCAN_RESILVER) {
+ (void) printf(gettext("resilver in progress since %s"),
+ ctime(&start));
+ }
+
+ scanned = ps->pss_examined;
+ pass_scanned = ps->pss_pass_exam;
+ issued = ps->pss_issued;
+ pass_issued = ps->pss_pass_issued;
+ total = ps->pss_to_examine;
+
+ /* we are only done with a block once we have issued the IO for it */
+ fraction_done = (double)issued / total;
+
+ /* elapsed time for this pass, rounding up to 1 if it's 0 */
+ elapsed = time(NULL) - ps->pss_pass_start;
+ elapsed -= ps->pss_pass_scrub_spent_paused;
+ elapsed = (elapsed != 0) ? elapsed : 1;
+
+ scan_rate = pass_scanned / elapsed;
+ issue_rate = pass_issued / elapsed;
+ uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ?
+ ((total - issued) / issue_rate) : UINT64_MAX;
+ secs_to_dhms(total_secs_left, time_buf);
+
+ /* format all of the numbers we will be reporting */
+ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf));
+ zfs_nicebytes(issued, issued_buf, sizeof (issued_buf));
+ zfs_nicebytes(total, total_buf, sizeof (total_buf));
+ zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf));
+ zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf));
+
+ /* do not print estimated time if we have a paused scrub */
+ if (pause == 0) {
+ (void) printf(gettext("\t%s scanned at %s/s, "
+ "%s issued at %s/s, %s total\n"),
+ scanned_buf, srate_buf, issued_buf, irate_buf, total_buf);
+ } else {
+ (void) printf(gettext("\t%s scanned, %s issued, %s total\n"),
+ scanned_buf, issued_buf, total_buf);
+ }
+
+ if (ps->pss_func == POOL_SCAN_RESILVER) {
+ (void) printf(gettext("\t%s resilvered, %.2f%% done"),
+ processed_buf, 100 * fraction_done);
+ } else if (ps->pss_func == POOL_SCAN_SCRUB) {
+ (void) printf(gettext("\t%s repaired, %.2f%% done"),
+ processed_buf, 100 * fraction_done);
+ }
+
+ if (pause == 0) {
+ if (total_secs_left != UINT64_MAX &&
+ issue_rate >= 10 * 1024 * 1024) {
+ (void) printf(gettext(", %s to go\n"), time_buf);
+ } else {
+ (void) printf(gettext(", no estimated "
+ "completion time\n"));
+ }
+ } else {
+ (void) printf(gettext("\n"));
+ }
+}
+
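+/*
+ * Print the detailed status of a sequential resilver (rebuild) for a single
+ * top-level vdev: a completion or cancellation summary, or the current
+ * scan/issue rates and an estimated time to completion.
+ */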
+static void
+print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
+{
+ if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE)
+ return;
+
+ printf(" ");
+ printf_color(ANSI_BOLD, gettext("scan:"));
+ printf(" ");
+
+ uint64_t bytes_scanned = vrs->vrs_bytes_scanned;
+ uint64_t bytes_issued = vrs->vrs_bytes_issued;
+ uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt;
+ uint64_t bytes_est = vrs->vrs_bytes_est;
+ uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned /
+ (vrs->vrs_pass_time_ms + 1)) * 1000;
+ uint64_t issue_rate = (vrs->vrs_pass_bytes_issued /
+ (vrs->vrs_pass_time_ms + 1)) * 1000;
+ double scan_pct = MIN((double)bytes_scanned * 100 /
+ (bytes_est + 1), 100);
+
+ /* Format all of the numbers we will be reporting */
+ char bytes_scanned_buf[7], bytes_issued_buf[7];
+ char bytes_rebuilt_buf[7], bytes_est_buf[7];
+ char scan_rate_buf[7], issue_rate_buf[7], time_buf[32];
+ zfs_nicebytes(bytes_scanned, bytes_scanned_buf,
+ sizeof (bytes_scanned_buf));
+ zfs_nicebytes(bytes_issued, bytes_issued_buf,
+ sizeof (bytes_issued_buf));
+ zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf,
+ sizeof (bytes_rebuilt_buf));
+ zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf));
+ zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf));
+ zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf));
+
+ time_t start = vrs->vrs_start_time;
+ time_t end = vrs->vrs_end_time;
+
+ /* Rebuild is finished or canceled. */
+ if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) {
+ secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf);
+ (void) printf(gettext("resilvered (%s) %s in %s "
+ "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf,
+ time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end));
+ return;
+ } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) {
+ (void) printf(gettext("resilver (%s) canceled on %s"),
+ vdev_name, ctime(&end));
+ return;
+ } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+ (void) printf(gettext("resilver (%s) in progress since %s"),
+ vdev_name, ctime(&start));
+ }
+
+ assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE);
+
+ secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) /
+ MAX(scan_rate, 1), time_buf);
+
+ (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, "
+ "%s total\n"), bytes_scanned_buf, scan_rate_buf,
+ bytes_issued_buf, issue_rate_buf, bytes_est_buf);
+ (void) printf(gettext("\t%s resilvered, %.2f%% done"),
+ bytes_rebuilt_buf, scan_pct);
+
+ if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+ if (scan_rate >= 10 * 1024 * 1024) {
+ (void) printf(gettext(", %s to go\n"), time_buf);
+ } else {
+ (void) printf(gettext(", no estimated "
+ "completion time\n"));
+ }
+ } else {
+ (void) printf(gettext("\n"));
+ }
+}
+
+/*
+ * Print rebuild status for top-level vdevs.
+ */
+static void
+print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot)
+{
+ nvlist_t **child;
+ uint_t children;
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ children = 0;
+
+ for (uint_t c = 0; c < children; c++) {
+ vdev_rebuild_stat_t *vrs;
+ uint_t i;
+
+ if (nvlist_lookup_uint64_array(child[c],
+ ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) {
+ char *name = zpool_vdev_name(g_zfs, zhp,
+ child[c], VDEV_NAME_TYPE_ID);
+ print_rebuild_status_impl(vrs, name);
+ free(name);
+ }
+ }
+}
+
+/*
+ * As we don't scrub checkpointed blocks, we want to warn the user that we
+ * skipped scanning some blocks if a checkpoint exists or existed at any
+ * time during the scan. If a sequential (rather than healing) reconstruction
+ * was performed, the blocks were reconstructed; however, their checksums
+ * have not been verified, so we still print the warning.
+ */
+static void
+print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs)
+{
+ if (ps == NULL || pcs == NULL)
+ return;
+
+ if (pcs->pcs_state == CS_NONE ||
+ pcs->pcs_state == CS_CHECKPOINT_DISCARDING)
+ return;
+
+ assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS);
+
+ if (ps->pss_state == DSS_NONE)
+ return;
+
+ if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) &&
+ ps->pss_end_time < pcs->pcs_start_time)
+ return;
+
+ if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) {
+ (void) printf(gettext(" scan warning: skipped blocks "
+ "that are only referenced by the checkpoint.\n"));
+ } else {
+ assert(ps->pss_state == DSS_SCANNING);
+ (void) printf(gettext(" scan warning: skipping blocks "
+ "that are only referenced by the checkpoint.\n"));
+ }
+}
+
+/*
+ * Returns B_TRUE if there is an active rebuild in progress. Otherwise,
+ * B_FALSE is returned and 'rebuild_end_time' is set to the end time for
+ * the last completed (or cancelled) rebuild.
+ */
+static boolean_t
+check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time)
+{
+ nvlist_t **child;
+ uint_t children;
+ boolean_t rebuilding = B_FALSE;
+ uint64_t end_time = 0;
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ children = 0;
+
+ for (uint_t c = 0; c < children; c++) {
+ vdev_rebuild_stat_t *vrs;
+ uint_t i;
+
+ if (nvlist_lookup_uint64_array(child[c],
+ ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) {
+
+ if (vrs->vrs_end_time > end_time)
+ end_time = vrs->vrs_end_time;
+
+ if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+ rebuilding = B_TRUE;
+ end_time = 0;
+ break;
+ }
+ }
+ }
+
+ if (rebuild_end_time != NULL)
+ *rebuild_end_time = end_time;
+
+ return (rebuilding);
+}
+
+/*
+ * Print the scan status.
+ */
+static void
+print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot)
+{
+ uint64_t rebuild_end_time = 0, resilver_end_time = 0;
+ boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE;
+ boolean_t active_resilver = B_FALSE;
+ pool_checkpoint_stat_t *pcs = NULL;
+ pool_scan_stat_t *ps = NULL;
+ uint_t c;
+
+ if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
+ (uint64_t **)&ps, &c) == 0) {
+ if (ps->pss_func == POOL_SCAN_RESILVER) {
+ resilver_end_time = ps->pss_end_time;
+ active_resilver = (ps->pss_state == DSS_SCANNING);
+ }
+
+ have_resilver = (ps->pss_func == POOL_SCAN_RESILVER);
+ have_scrub = (ps->pss_func == POOL_SCAN_SCRUB);
+ }
+
+ boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time);
+ boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0));
+
+ /* Always print the scrub status when available. */
+ if (have_scrub)
+ print_scan_scrub_resilver_status(ps);
+
+ /*
+ * When there is an active resilver or rebuild print its status.
+ * Otherwise print the status of the last resilver or rebuild.
+ */
+ if (active_resilver || (!active_rebuild && have_resilver &&
+ resilver_end_time && resilver_end_time > rebuild_end_time)) {
+ print_scan_scrub_resilver_status(ps);
+ } else if (active_rebuild || (!active_resilver && have_rebuild &&
+ rebuild_end_time && rebuild_end_time > resilver_end_time)) {
+ print_rebuild_status(zhp, nvroot);
+ }
+
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
+ print_checkpoint_scan_warning(ps, pcs);
+}
+
+/*
+ * Print out detailed removal status.
+ */
+static void
+print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs)
+{
+ char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
+ time_t start, end;
+ nvlist_t *config, *nvroot;
+ nvlist_t **child;
+ uint_t children;
+ char *vdev_name;
+
+ if (prs == NULL || prs->prs_state == DSS_NONE)
+ return;
+
+ /*
+ * Determine name of vdev.
+ */
+ config = zpool_get_config(zhp, NULL);
+ nvroot = fnvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE);
+ verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0);
+ assert(prs->prs_removing_vdev < children);
+ vdev_name = zpool_vdev_name(g_zfs, zhp,
+ child[prs->prs_removing_vdev], B_TRUE);
+
+ (void) printf(gettext("remove: "));
+
+ start = prs->prs_start_time;
+ end = prs->prs_end_time;
+ zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf));
+
+ /*
+ * Removal is finished or canceled.
+ */
+ if (prs->prs_state == DSS_FINISHED) {
+ uint64_t minutes_taken = (end - start) / 60;
+
+ (void) printf(gettext("Removal of vdev %llu copied %s "
+ "in %lluh%um, completed on %s"),
+ (longlong_t)prs->prs_removing_vdev,
+ copied_buf,
+ (u_longlong_t)(minutes_taken / 60),
+ (uint_t)(minutes_taken % 60),
+ ctime((time_t *)&end));
+ } else if (prs->prs_state == DSS_CANCELED) {
+ (void) printf(gettext("Removal of %s canceled on %s"),
+ vdev_name, ctime(&end));
+ } else {
+ uint64_t copied, total, elapsed, mins_left, hours_left;
+ double fraction_done;
+ uint_t rate;
+
+ assert(prs->prs_state == DSS_SCANNING);
+
+ /*
+ * Removal is in progress.
+ */
+ (void) printf(gettext(
+ "Evacuation of %s in progress since %s"),
+ vdev_name, ctime(&start));
+
+ copied = prs->prs_copied > 0 ? prs->prs_copied : 1;
+ total = prs->prs_to_copy;
+ fraction_done = (double)copied / total;
+
+ /* elapsed time for this pass */
+ elapsed = time(NULL) - prs->prs_start_time;
+ elapsed = elapsed > 0 ? elapsed : 1;
+ rate = copied / elapsed;
+ rate = rate > 0 ? rate : 1;
+ mins_left = ((total - copied) / rate) / 60;
+ hours_left = mins_left / 60;
+
+ zfs_nicenum(copied, examined_buf, sizeof (examined_buf));
+ zfs_nicenum(total, total_buf, sizeof (total_buf));
+ zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
+
+ /*
+ * do not print estimated time if hours_left is more than
+ * 30 days
+ */
+ (void) printf(gettext(" %s copied out of %s at %s/s, "
+ "%.2f%% done"),
+ examined_buf, total_buf, rate_buf, 100 * fraction_done);
+ if (hours_left < (30 * 24)) {
+ (void) printf(gettext(", %lluh%um to go\n"),
+ (u_longlong_t)hours_left, (uint_t)(mins_left % 60));
+ } else {
+ (void) printf(gettext(
+ ", (copy is slow, no estimated time)\n"));
+ }
+ }
+ free(vdev_name);
+
+ if (prs->prs_mapping_memory > 0) {
+ char mem_buf[7];
+ zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf));
+ (void) printf(gettext(" %s memory used for "
+ "removed device mappings\n"),
+ mem_buf);
+ }
+}
+
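+/*
+ * Print when the pool checkpoint was created and how much space it
+ * consumes, or how much space remains if it is being discarded.
+ */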
+static void
+print_checkpoint_status(pool_checkpoint_stat_t *pcs)
+{
+ time_t start;
+ char space_buf[7];
+
+ if (pcs == NULL || pcs->pcs_state == CS_NONE)
+ return;
+
+ (void) printf(gettext("checkpoint: "));
+
+ start = pcs->pcs_start_time;
+ zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf));
+
+ if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) {
+ char *date = ctime(&start);
+
+ /*
+ * ctime() adds a newline at the end of the generated
+ * string, thus the weird format specifier and the
+ * strlen() call used to chop it off from the output.
+ */
+ (void) printf(gettext("created %.*s, consumes %s\n"),
+ (int)(strlen(date) - 1), date, space_buf);
+ return;
+ }
+
+ assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING);
+
+ (void) printf(gettext("discarding, %s remaining.\n"),
+ space_buf);
+}
+
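+/*
+ * Print the list of files with permanent errors recorded in the pool's
+ * error log, resolving each <dataset, object> pair to a pathname.
+ */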
+static void
+print_error_log(zpool_handle_t *zhp)
+{
+ nvlist_t *nverrlist = NULL;
+ nvpair_t *elem;
+ char *pathname;
+ size_t len = MAXPATHLEN * 2;
+
+ if (zpool_get_errlog(zhp, &nverrlist) != 0)
+ return;
+
+ (void) printf("errors: Permanent errors have been "
+ "detected in the following files:\n\n");
+
+ pathname = safe_malloc(len);
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) {
+ nvlist_t *nv;
+ uint64_t dsobj, obj;
+
+ verify(nvpair_value_nvlist(elem, &nv) == 0);
+ verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET,
+ &dsobj) == 0);
+ verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT,
+ &obj) == 0);
+ zpool_obj_to_path(zhp, dsobj, obj, pathname, len);
+ (void) printf("%7s %s\n", "", pathname);
+ }
+ free(pathname);
+ nvlist_free(nverrlist);
+}
+
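+/*
+ * Print the configuration and status of the pool's hot spares.
+ */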
+static void
+print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares,
+ uint_t nspares)
+{
+ uint_t i;
+ char *name;
+
+ if (nspares == 0)
+ return;
+
+ (void) printf(gettext("\tspares\n"));
+
+ for (i = 0; i < nspares; i++) {
+ name = zpool_vdev_name(g_zfs, zhp, spares[i],
+ cb->cb_name_flags);
+ print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL);
+ free(name);
+ }
+}
+
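+/*
+ * Print the configuration and status of the pool's cache (L2ARC) devices.
+ */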
+static void
+print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache,
+ uint_t nl2cache)
+{
+ uint_t i;
+ char *name;
+
+ if (nl2cache == 0)
+ return;
+
+ (void) printf(gettext("\tcache\n"));
+
+ for (i = 0; i < nl2cache; i++) {
+ name = zpool_vdev_name(g_zfs, zhp, l2cache[i],
+ cb->cb_name_flags);
+ print_status_config(zhp, cb, name, l2cache[i], 2,
+ B_FALSE, NULL);
+ free(name);
+ }
+}
+
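+/*
+ * Print a summary of the pool's deduplication table: the number of DDT
+ * entries, their on-disk and in-core sizes, and the DDT histogram.
+ */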
+static void
+print_dedup_stats(nvlist_t *config)
+{
+ ddt_histogram_t *ddh;
+ ddt_stat_t *dds;
+ ddt_object_t *ddo;
+ uint_t c;
+ char dspace[6], mspace[6];
+
+ /*
+ * If the pool was faulted then we may not have been able to
+	 * obtain the config. Otherwise, if we have anything in the dedup
+	 * table, continue processing the stats.
+ */
+ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS,
+ (uint64_t **)&ddo, &c) != 0)
+ return;
+
+ (void) printf("\n");
+ (void) printf(gettext(" dedup: "));
+ if (ddo->ddo_count == 0) {
+ (void) printf(gettext("no DDT entries\n"));
+ return;
+ }
+
+ zfs_nicebytes(ddo->ddo_dspace, dspace, sizeof (dspace));
+ zfs_nicebytes(ddo->ddo_mspace, mspace, sizeof (mspace));
+ (void) printf("DDT entries %llu, size %s on disk, %s in core\n",
+ (u_longlong_t)ddo->ddo_count,
+ dspace,
+ mspace);
+
+ verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS,
+ (uint64_t **)&dds, &c) == 0);
+ verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM,
+ (uint64_t **)&ddh, &c) == 0);
+ zpool_dump_ddt(dds, ddh);
+}
+
+/*
+ * Display a summary of pool status, such as:
+ *
+ * pool: tank
+ * status: DEGRADED
+ * reason: One or more devices ...
+ * see: https://openzfs.github.io/openzfs-docs/msg/ZFS-xxxx-01
+ * config:
+ * mirror DEGRADED
+ * c1t0d0 OK
+ * c2t0d0 UNAVAIL
+ *
+ * When given the '-v' option, we print out the complete config. If the '-e'
+ * option is specified, then we print out error rate information as well.
+ */
+static int
+status_callback(zpool_handle_t *zhp, void *data)
+{
+ status_cbdata_t *cbp = data;
+ nvlist_t *config, *nvroot;
+ char *msgid;
+ zpool_status_t reason;
+ zpool_errata_t errata;
+ const char *health;
+ uint_t c;
+ vdev_stat_t *vs;
+
+ config = zpool_get_config(zhp, NULL);
+ reason = zpool_get_status(zhp, &msgid, &errata);
+
+ cbp->cb_count++;
+
+ /*
+ * If we were given 'zpool status -x', only report those pools with
+ * problems.
+ */
+ if (cbp->cb_explain &&
+ (reason == ZPOOL_STATUS_OK ||
+ reason == ZPOOL_STATUS_VERSION_OLDER ||
+ reason == ZPOOL_STATUS_FEAT_DISABLED)) {
+ if (!cbp->cb_allpools) {
+ (void) printf(gettext("pool '%s' is healthy\n"),
+ zpool_get_name(zhp));
+ if (cbp->cb_first)
+ cbp->cb_first = B_FALSE;
+ }
+ return (0);
+ }
+
+ if (cbp->cb_first)
+ cbp->cb_first = B_FALSE;
+ else
+ (void) printf("\n");
+
+ nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
+ verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &c) == 0);
+
+ health = zpool_get_state_str(zhp);
+
+ printf(" ");
+ printf_color(ANSI_BOLD, gettext("pool:"));
+ printf(" %s\n", zpool_get_name(zhp));
+ printf(" ");
+ printf_color(ANSI_BOLD, gettext("state: "));
+
+ printf_color(health_str_to_color(health), "%s", health);
+
+ printf("\n");
+
+ switch (reason) {
+ case ZPOOL_STATUS_MISSING_DEV_R:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices could "
+ "not be opened. Sufficient replicas exist for\n\tthe pool "
+ "to continue functioning in a degraded state.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Attach the missing device "
+ "and online it using 'zpool online'.\n"));
+ break;
+
+ case ZPOOL_STATUS_MISSING_DEV_NR:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices could "
+ "not be opened. There are insufficient\n\treplicas for the"
+ " pool to continue functioning.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Attach the missing device "
+ "and online it using 'zpool online'.\n"));
+ break;
+
+ case ZPOOL_STATUS_CORRUPT_LABEL_R:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices could "
+ "not be used because the label is missing or\n\tinvalid. "
+ "Sufficient replicas exist for the pool to continue\n\t"
+ "functioning in a degraded state.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Replace the device using "
+ "'zpool replace'.\n"));
+ break;
+
+ case ZPOOL_STATUS_CORRUPT_LABEL_NR:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices could "
+ "not be used because the label is missing \n\tor invalid. "
+ "There are insufficient replicas for the pool to "
+ "continue\n\tfunctioning.\n"));
+ zpool_explain_recover(zpool_get_handle(zhp),
+ zpool_get_name(zhp), reason, config);
+ break;
+
+ case ZPOOL_STATUS_FAILING_DEV:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices has "
+ "experienced an unrecoverable error. An\n\tattempt was "
+ "made to correct the error. Applications are "
+ "unaffected.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Determine if the "
+ "device needs to be replaced, and clear the errors\n\tusing"
+ " 'zpool clear' or replace the device with 'zpool "
+ "replace'.\n"));
+ break;
+
+ case ZPOOL_STATUS_OFFLINE_DEV:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices has "
+ "been taken offline by the administrator.\n\tSufficient "
+ "replicas exist for the pool to continue functioning in "
+ "a\n\tdegraded state.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Online the device "
+ "using 'zpool online' or replace the device with\n\t'zpool "
+ "replace'.\n"));
+ break;
+
+ case ZPOOL_STATUS_REMOVED_DEV:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices has "
+ "been removed by the administrator.\n\tSufficient "
+ "replicas exist for the pool to continue functioning in "
+ "a\n\tdegraded state.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Online the device "
+ "using zpool online' or replace the device with\n\t'zpool "
+ "replace'.\n"));
+ break;
+
+ case ZPOOL_STATUS_RESILVERING:
+ case ZPOOL_STATUS_REBUILDING:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices is "
+ "currently being resilvered. The pool will\n\tcontinue "
+ "to function, possibly in a degraded state.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Wait for the resilver to "
+ "complete.\n"));
+ break;
+
+ case ZPOOL_STATUS_REBUILD_SCRUB:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices have "
+ "been sequentially resilvered, scrubbing\n\tthe pool "
+ "is recommended.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to "
+ "verify all data checksums.\n"));
+ break;
+
+ case ZPOOL_STATUS_CORRUPT_DATA:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices has "
+ "experienced an error resulting in data\n\tcorruption. "
+ "Applications may be affected.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Restore the file in question"
+ " if possible. Otherwise restore the\n\tentire pool from "
+ "backup.\n"));
+ break;
+
+ case ZPOOL_STATUS_CORRUPT_POOL:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool metadata is "
+ "corrupted and the pool cannot be opened.\n"));
+ zpool_explain_recover(zpool_get_handle(zhp),
+ zpool_get_name(zhp), reason, config);
+ break;
+
+ case ZPOOL_STATUS_VERSION_OLDER:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool is formatted using "
+ "a legacy on-disk format. The pool can\n\tstill be used, "
+ "but some features are unavailable.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Upgrade the pool using "
+ "'zpool upgrade'. Once this is done, the\n\tpool will no "
+ "longer be accessible on software that does not support\n\t"
+ "feature flags.\n"));
+ break;
+
+ case ZPOOL_STATUS_VERSION_NEWER:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool has been upgraded "
+ "to a newer, incompatible on-disk version.\n\tThe pool "
+ "cannot be accessed on this system.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Access the pool from a "
+ "system running more recent software, or\n\trestore the "
+ "pool from backup.\n"));
+ break;
+
+ case ZPOOL_STATUS_FEAT_DISABLED:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("Some supported features are "
+ "not enabled on the pool. The pool can\n\tstill be used, "
+ "but some features are unavailable.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Enable all features using "
+ "'zpool upgrade'. Once this is done,\n\tthe pool may no "
+ "longer be accessible by software that does not support\n\t"
+ "the features. See zpool-features(5) for details.\n"));
+ break;
+
+ case ZPOOL_STATUS_UNSUP_FEAT_READ:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed "
+ "on this system because it uses the\n\tfollowing feature(s)"
+ " not supported on this system:\n"));
+ zpool_print_unsup_feat(config);
+ (void) printf("\n");
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Access the pool from a "
+ "system that supports the required feature(s),\n\tor "
+ "restore the pool from backup.\n"));
+ break;
+
+ case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool can only be "
+ "accessed in read-only mode on this system. It\n\tcannot be"
+ " accessed in read-write mode because it uses the "
+ "following\n\tfeature(s) not supported on this system:\n"));
+ zpool_print_unsup_feat(config);
+ (void) printf("\n");
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed "
+ "in read-write mode. Import the pool with\n"
+ "\t\"-o readonly=on\", access the pool from a system that "
+ "supports the\n\trequired feature(s), or restore the "
+ "pool from backup.\n"));
+ break;
+
+ case ZPOOL_STATUS_FAULTED_DEV_R:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices are "
+ "faulted in response to persistent errors.\n\tSufficient "
+ "replicas exist for the pool to continue functioning "
+ "in a\n\tdegraded state.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Replace the faulted device, "
+ "or use 'zpool clear' to mark the device\n\trepaired.\n"));
+ break;
+
+ case ZPOOL_STATUS_FAULTED_DEV_NR:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices are "
+ "faulted in response to persistent errors. There are "
+ "insufficient replicas for the pool to\n\tcontinue "
+ "functioning.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Destroy and re-create the "
+ "pool from a backup source. Manually marking the device\n"
+ "\trepaired using 'zpool clear' may allow some data "
+ "to be recovered.\n"));
+ break;
+
+ case ZPOOL_STATUS_IO_FAILURE_MMP:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("The pool is suspended "
+ "because multihost writes failed or were delayed;\n\t"
+ "another system could import the pool undetected.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices"
+ " are connected, then reboot your system and\n\timport the "
+ "pool.\n"));
+ break;
+
+ case ZPOOL_STATUS_IO_FAILURE_WAIT:
+ case ZPOOL_STATUS_IO_FAILURE_CONTINUE:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("One or more devices are "
+ "faulted in response to IO failures.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Make sure the affected "
+ "devices are connected, then run 'zpool clear'.\n"));
+ break;
+
+ case ZPOOL_STATUS_BAD_LOG:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("An intent log record "
+ "could not be read.\n"
+ "\tWaiting for administrator intervention to fix the "
+ "faulted pool.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Either restore the affected "
+ "device(s) and run 'zpool online',\n"
+ "\tor ignore the intent log records by running "
+ "'zpool clear'.\n"));
+ break;
+
+ case ZPOOL_STATUS_NON_NATIVE_ASHIFT:
+ (void) printf(gettext("status: One or more devices are "
+ "configured to use a non-native block size.\n"
+ "\tExpect reduced performance.\n"));
+ (void) printf(gettext("action: Replace affected devices with "
+ "devices that support the\n\tconfigured block size, or "
+ "migrate data to a properly configured\n\tpool.\n"));
+ break;
+
+ case ZPOOL_STATUS_HOSTID_MISMATCH:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("Mismatch between pool hostid"
+ " and system hostid on imported pool.\n\tThis pool was "
+ "previously imported into a system with a different "
+ "hostid,\n\tand then was verbatim imported into this "
+ "system.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("Export this pool on all "
+ "systems on which it is imported.\n"
+ "\tThen import it to correct the mismatch.\n"));
+ break;
+
+ case ZPOOL_STATUS_ERRATA:
+ printf_color(ANSI_BOLD, gettext("status: "));
+ printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"),
+ errata);
+
+ switch (errata) {
+ case ZPOOL_ERRATA_NONE:
+ break;
+
+ case ZPOOL_ERRATA_ZOL_2094_SCRUB:
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("To correct the issue"
+ " run 'zpool scrub'.\n"));
+ break;
+
+ case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION:
+ (void) printf(gettext("\tExisting encrypted datasets "
+ "contain an on-disk incompatibility\n\twhich "
+ "needs to be corrected.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("To correct the issue"
+ " backup existing encrypted datasets to new\n\t"
+ "encrypted datasets and destroy the old ones. "
+ "'zfs mount -o ro' can\n\tbe used to temporarily "
+ "mount existing encrypted datasets readonly.\n"));
+ break;
+
+ case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION:
+ (void) printf(gettext("\tExisting encrypted snapshots "
+ "and bookmarks contain an on-disk\n\tincompat"
+ "ibility. This may cause on-disk corruption if "
+ "they are used\n\twith 'zfs recv'.\n"));
+ printf_color(ANSI_BOLD, gettext("action: "));
+ printf_color(ANSI_YELLOW, gettext("To correct the"
+ "issue, enable the bookmark_v2 feature. No "
+ "additional\n\taction is needed if there are no "
+ "encrypted snapshots or bookmarks.\n\tIf preserving"
+ "the encrypted snapshots and bookmarks is required,"
+ " use\n\ta non-raw send to backup and restore them."
+ " Alternately, they may be\n\tremoved to resolve "
+ "the incompatibility.\n"));
+ break;
+
+ default:
+ /*
+ * All errata which allow the pool to be imported
+ * must contain an action message.
+ */
+ assert(0);
+ }
+ break;
+
+ default:
+ /*
+ * The remaining errors can't actually be generated, yet.
+ */
+ assert(reason == ZPOOL_STATUS_OK);
+ }
+
+ if (msgid != NULL) {
+ printf(" ");
+ printf_color(ANSI_BOLD, gettext("see:"));
+ printf(gettext(
+ " https://openzfs.github.io/openzfs-docs/msg/%s\n"),
+ msgid);
+ }
+
+ if (config != NULL) {
+ uint64_t nerr;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+ pool_checkpoint_stat_t *pcs = NULL;
+ pool_removal_stat_t *prs = NULL;
+
+ print_scan_status(zhp, nvroot);
+
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
+ print_removal_status(zhp, prs);
+
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
+ print_checkpoint_status(pcs);
+
+ cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0,
+ cbp->cb_name_flags | VDEV_NAME_TYPE_ID);
+ if (cbp->cb_namewidth < 10)
+ cbp->cb_namewidth = 10;
+
+ color_start(ANSI_BOLD);
+ (void) printf(gettext("config:\n\n"));
+ (void) printf(gettext("\t%-*s %-8s %5s %5s %5s"),
+ cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE",
+ "CKSUM");
+ color_end();
+
+ if (cbp->cb_print_slow_ios) {
+ printf_color(ANSI_BOLD, " %5s", gettext("SLOW"));
+ }
+
+ if (cbp->vcdl != NULL)
+ print_cmd_columns(cbp->vcdl, 0);
+
+ printf("\n");
+
+ print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0,
+ B_FALSE, NULL);
+
+ print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP);
+ print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL);
+ print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS);
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0)
+ print_l2cache(zhp, cbp, l2cache, nl2cache);
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0)
+ print_spares(zhp, cbp, spares, nspares);
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
+ &nerr) == 0) {
+ nvlist_t *nverrlist = NULL;
+
+ /*
+ * If the approximate error count is small, get a
+ * precise count by fetching the entire log and
+ * uniquifying the results.
+ */
+ if (nerr > 0 && nerr < 100 && !cbp->cb_verbose &&
+ zpool_get_errlog(zhp, &nverrlist) == 0) {
+ nvpair_t *elem;
+
+ elem = NULL;
+ nerr = 0;
+ while ((elem = nvlist_next_nvpair(nverrlist,
+ elem)) != NULL) {
+ nerr++;
+ }
+ }
+ nvlist_free(nverrlist);
+
+ (void) printf("\n");
+
+ if (nerr == 0)
+ (void) printf(gettext("errors: No known data "
+ "errors\n"));
+ else if (!cbp->cb_verbose)
+ (void) printf(gettext("errors: %llu data "
+ "errors, use '-v' for a list\n"),
+ (u_longlong_t)nerr);
+ else
+ print_error_log(zhp);
+ }
+
+ if (cbp->cb_dedup_stats)
+ print_dedup_stats(config);
+ } else {
+ (void) printf(gettext("config: The configuration cannot be "
+ "determined.\n"));
+ }
+
+ return (0);
+}
+
+/*
+ * zpool status [-c [script1,script2,...]] [-igLpPstvx] [-T d|u] [pool] ...
+ * [interval [count]]
+ *
+ * -c CMD For each vdev, run command CMD
+ * -i Display vdev initialization status.
+ * -g Display guid for individual vdev name.
+ * -L Follow links when resolving vdev path name.
+ * -p Display values in parsable (exact) format.
+ * -P Display full path for vdev name.
+ * -s Display slow IOs column.
+ * -v Display complete error logs
+ * -x Display only pools with potential problems
+ * -D Display dedup status (undocumented)
+ * -t Display vdev TRIM status.
+ * -T Display a timestamp in date(1) or Unix format
+ *
+ * Describes the health status of all pools or some subset.
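+ *
+ * Illustrative invocations (for reference):
+ * zpool status -x Show only pools with problems.
+ * zpool status -v tank 5 Show verbose status of 'tank' every 5 seconds.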
+ */
+int
+zpool_do_status(int argc, char **argv)
+{
+ int c;
+ int ret;
+ float interval = 0;
+ unsigned long count = 0;
+ status_cbdata_t cb = { 0 };
+ char *cmd = NULL;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "c:igLpPsvxDtT:")) != -1) {
+ switch (c) {
+ case 'c':
+ if (cmd != NULL) {
+ fprintf(stderr,
+ gettext("Can't set -c flag twice\n"));
+ exit(1);
+ }
+
+ if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL &&
+ !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) {
+ fprintf(stderr, gettext(
+ "Can't run -c, disabled by "
+ "ZPOOL_SCRIPTS_ENABLED.\n"));
+ exit(1);
+ }
+
+ if ((getuid() <= 0 || geteuid() <= 0) &&
+ !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) {
+ fprintf(stderr, gettext(
+ "Can't run -c with root privileges "
+ "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n"));
+ exit(1);
+ }
+ cmd = optarg;
+ break;
+ case 'i':
+ cb.cb_print_vdev_init = B_TRUE;
+ break;
+ case 'g':
+ cb.cb_name_flags |= VDEV_NAME_GUID;
+ break;
+ case 'L':
+ cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
+ break;
+ case 'p':
+ cb.cb_literal = B_TRUE;
+ break;
+ case 'P':
+ cb.cb_name_flags |= VDEV_NAME_PATH;
+ break;
+ case 's':
+ cb.cb_print_slow_ios = B_TRUE;
+ break;
+ case 'v':
+ cb.cb_verbose = B_TRUE;
+ break;
+ case 'x':
+ cb.cb_explain = B_TRUE;
+ break;
+ case 'D':
+ cb.cb_dedup_stats = B_TRUE;
+ break;
+ case 't':
+ cb.cb_print_vdev_trim = B_TRUE;
+ break;
+ case 'T':
+ get_timestamp_arg(*optarg);
+ break;
+ case '?':
+ if (optopt == 'c') {
+ print_zpool_script_list("status");
+ exit(0);
+ } else {
+ fprintf(stderr,
+ gettext("invalid option '%c'\n"), optopt);
+ }
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ get_interval_count(&argc, argv, &interval, &count);
+
+ if (argc == 0)
+ cb.cb_allpools = B_TRUE;
+
+ cb.cb_first = B_TRUE;
+ cb.cb_print_status = B_TRUE;
+
+ for (;;) {
+ if (timestamp_fmt != NODATE)
+ print_timestamp(timestamp_fmt);
+
+ if (cmd != NULL)
+ cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd,
+ NULL, NULL, 0, 0);
+
+ ret = for_each_pool(argc, argv, B_TRUE, NULL,
+ status_callback, &cb);
+
+ if (cb.vcdl != NULL)
+ free_vdev_cmd_data_list(cb.vcdl);
+
+ if (argc == 0 && cb.cb_count == 0)
+ (void) fprintf(stderr, gettext("no pools available\n"));
+ else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
+ (void) printf(gettext("all pools are healthy\n"));
+
+ if (ret != 0)
+ return (ret);
+
+ if (interval == 0)
+ break;
+
+ if (count != 0 && --count == 0)
+ break;
+
+ (void) fsleep(interval);
+ }
+
+ return (0);
+}
+
+typedef struct upgrade_cbdata {
+ int cb_first;
+ int cb_argc;
+ uint64_t cb_version;
+ char **cb_argv;
+} upgrade_cbdata_t;
+
+static int
+check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs)
+{
+ int zfs_version = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+ int *count = (int *)unsupp_fs;
+
+ if (zfs_version > ZPL_VERSION) {
+ (void) printf(gettext("%s (v%d) is not supported by this "
+ "implementation of ZFS.\n"),
+ zfs_get_name(zhp), zfs_version);
+ (*count)++;
+ }
+
+ zfs_iter_filesystems(zhp, check_unsupp_fs, unsupp_fs);
+
+ zfs_close(zhp);
+
+ return (0);
+}
+
+static int
+upgrade_version(zpool_handle_t *zhp, uint64_t version)
+{
+ int ret;
+ nvlist_t *config;
+ uint64_t oldversion;
+ int unsupp_fs = 0;
+
+ config = zpool_get_config(zhp, NULL);
+ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &oldversion) == 0);
+
+ assert(SPA_VERSION_IS_SUPPORTED(oldversion));
+ assert(oldversion < version);
+
+ ret = zfs_iter_root(zpool_get_handle(zhp), check_unsupp_fs, &unsupp_fs);
+ if (ret != 0)
+ return (ret);
+
+ if (unsupp_fs) {
+ (void) fprintf(stderr, gettext("Upgrade not performed due "
+ "to %d unsupported filesystems (max v%d).\n"),
+ unsupp_fs, (int)ZPL_VERSION);
+ return (1);
+ }
+
+ ret = zpool_upgrade(zhp, version);
+ if (ret != 0)
+ return (ret);
+
+ if (version >= SPA_VERSION_FEATURES) {
+ (void) printf(gettext("Successfully upgraded "
+ "'%s' from version %llu to feature flags.\n"),
+ zpool_get_name(zhp), (u_longlong_t)oldversion);
+ } else {
+ (void) printf(gettext("Successfully upgraded "
+ "'%s' from version %llu to version %llu.\n"),
+ zpool_get_name(zhp), (u_longlong_t)oldversion,
+ (u_longlong_t)version);
+ }
+
+ return (0);
+}
+
+static int
+upgrade_enable_all(zpool_handle_t *zhp, int *countp)
+{
+ int i, ret, count;
+ boolean_t firstff = B_TRUE;
+ nvlist_t *enabled = zpool_get_features(zhp);
+
+ count = 0;
+ for (i = 0; i < SPA_FEATURES; i++) {
+ const char *fname = spa_feature_table[i].fi_uname;
+ const char *fguid = spa_feature_table[i].fi_guid;
+ if (!nvlist_exists(enabled, fguid)) {
+ char *propname;
+ verify(-1 != asprintf(&propname, "feature@%s", fname));
+ ret = zpool_set_prop(zhp, propname,
+ ZFS_FEATURE_ENABLED);
+ if (ret != 0) {
+ free(propname);
+ return (ret);
+ }
+ count++;
+
+ if (firstff) {
+ (void) printf(gettext("Enabled the "
+ "following features on '%s':\n"),
+ zpool_get_name(zhp));
+ firstff = B_FALSE;
+ }
+ (void) printf(gettext(" %s\n"), fname);
+ free(propname);
+ }
+ }
+
+ if (countp != NULL)
+ *countp = count;
+ return (0);
+}
+
+static int
+upgrade_cb(zpool_handle_t *zhp, void *arg)
+{
+ upgrade_cbdata_t *cbp = arg;
+ nvlist_t *config;
+ uint64_t version;
+ boolean_t printnl = B_FALSE;
+ int ret;
+
+ config = zpool_get_config(zhp, NULL);
+ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &version) == 0);
+
+ assert(SPA_VERSION_IS_SUPPORTED(version));
+
+ if (version < cbp->cb_version) {
+ cbp->cb_first = B_FALSE;
+ ret = upgrade_version(zhp, cbp->cb_version);
+ if (ret != 0)
+ return (ret);
+ printnl = B_TRUE;
+
+ /*
+ * If they did "zpool upgrade -a", then we could
+ * be doing ioctls to different pools. We need
+ * to log this history once to each pool, and bypass
+ * the normal history logging that happens in main().
+ */
+ (void) zpool_log_history(g_zfs, history_str);
+ log_history = B_FALSE;
+ }
+
+ if (cbp->cb_version >= SPA_VERSION_FEATURES) {
+ int count;
+ ret = upgrade_enable_all(zhp, &count);
+ if (ret != 0)
+ return (ret);
+
+ if (count > 0) {
+ cbp->cb_first = B_FALSE;
+ printnl = B_TRUE;
+ }
+ }
+
+ if (printnl) {
+ (void) printf(gettext("\n"));
+ }
+
+ return (0);
+}
+
+static int
+upgrade_list_older_cb(zpool_handle_t *zhp, void *arg)
+{
+ upgrade_cbdata_t *cbp = arg;
+ nvlist_t *config;
+ uint64_t version;
+
+ config = zpool_get_config(zhp, NULL);
+ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &version) == 0);
+
+ assert(SPA_VERSION_IS_SUPPORTED(version));
+
+ if (version < SPA_VERSION_FEATURES) {
+ if (cbp->cb_first) {
+ (void) printf(gettext("The following pools are "
+ "formatted with legacy version numbers and can\n"
+ "be upgraded to use feature flags. After "
+ "being upgraded, these pools\nwill no "
+ "longer be accessible by software that does not "
+ "support feature\nflags.\n\n"));
+ (void) printf(gettext("VER POOL\n"));
+ (void) printf(gettext("--- ------------\n"));
+ cbp->cb_first = B_FALSE;
+ }
+
+ (void) printf("%2llu %s\n", (u_longlong_t)version,
+ zpool_get_name(zhp));
+ }
+
+ return (0);
+}
+
+static int
+upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)
+{
+ upgrade_cbdata_t *cbp = arg;
+ nvlist_t *config;
+ uint64_t version;
+
+ config = zpool_get_config(zhp, NULL);
+ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &version) == 0);
+
+ if (version >= SPA_VERSION_FEATURES) {
+ int i;
+ boolean_t poolfirst = B_TRUE;
+ nvlist_t *enabled = zpool_get_features(zhp);
+
+ for (i = 0; i < SPA_FEATURES; i++) {
+ const char *fguid = spa_feature_table[i].fi_guid;
+ const char *fname = spa_feature_table[i].fi_uname;
+ if (!nvlist_exists(enabled, fguid)) {
+ if (cbp->cb_first) {
+ (void) printf(gettext("\nSome "
+ "supported features are not "
+ "enabled on the following pools. "
+ "Once a\nfeature is enabled the "
+ "pool may become incompatible with "
+ "software\nthat does not support "
+ "the feature. See "
+ "zpool-features(5) for "
+ "details.\n\n"));
+ (void) printf(gettext("POOL "
+ "FEATURE\n"));
+ (void) printf(gettext("------"
+ "---------\n"));
+ cbp->cb_first = B_FALSE;
+ }
+
+ if (poolfirst) {
+ (void) printf(gettext("%s\n"),
+ zpool_get_name(zhp));
+ poolfirst = B_FALSE;
+ }
+
+ (void) printf(gettext(" %s\n"), fname);
+ }
+ /*
+ * If they did "zpool upgrade -a", then we could
+ * be doing ioctls to different pools. We need
+ * to log this history once to each pool, and bypass
+ * the normal history logging that happens in main().
+ */
+ (void) zpool_log_history(g_zfs, history_str);
+ log_history = B_FALSE;
+ }
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+upgrade_one(zpool_handle_t *zhp, void *data)
+{
+ boolean_t printnl = B_FALSE;
+ upgrade_cbdata_t *cbp = data;
+ uint64_t cur_version;
+ int ret;
+
+ if (strcmp("log", zpool_get_name(zhp)) == 0) {
+ (void) fprintf(stderr, gettext("'log' is now a reserved word\n"
+ "Pool 'log' must be renamed using export and import"
+ " to upgrade.\n"));
+ return (1);
+ }
+
+ cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
+ if (cur_version > cbp->cb_version) {
+ (void) printf(gettext("Pool '%s' is already formatted "
+ "using more current version '%llu'.\n\n"),
+ zpool_get_name(zhp), (u_longlong_t)cur_version);
+ return (0);
+ }
+
+ if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) {
+ (void) printf(gettext("Pool '%s' is already formatted "
+ "using version %llu.\n\n"), zpool_get_name(zhp),
+ (u_longlong_t)cbp->cb_version);
+ return (0);
+ }
+
+ if (cur_version != cbp->cb_version) {
+ printnl = B_TRUE;
+ ret = upgrade_version(zhp, cbp->cb_version);
+ if (ret != 0)
+ return (ret);
+ }
+
+ if (cbp->cb_version >= SPA_VERSION_FEATURES) {
+ int count = 0;
+ ret = upgrade_enable_all(zhp, &count);
+ if (ret != 0)
+ return (ret);
+
+ if (count != 0) {
+ printnl = B_TRUE;
+ } else if (cur_version == SPA_VERSION) {
+ (void) printf(gettext("Pool '%s' already has all "
+ "supported features enabled.\n"),
+ zpool_get_name(zhp));
+ }
+ }
+
+ if (printnl) {
+ (void) printf(gettext("\n"));
+ }
+
+ return (0);
+}
+
+/*
+ * zpool upgrade
+ * zpool upgrade -v
+ * zpool upgrade [-V version] <-a | pool ...>
+ *
+ * With no arguments, display downrev'd ZFS pools available for upgrade.
+ * Individual pools can be upgraded by specifying the pool, and '-a' will
+ * upgrade all pools.
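+ *
+ * Illustrative invocations (for reference):
+ * zpool upgrade -v List supported feature flags and legacy versions.
+ * zpool upgrade -a Upgrade all pools and enable all supported features.
+ * zpool upgrade -V 28 tank Upgrade pool 'tank' to legacy version 28.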
+ */
+int
+zpool_do_upgrade(int argc, char **argv)
+{
+ int c;
+ upgrade_cbdata_t cb = { 0 };
+ int ret = 0;
+ boolean_t showversions = B_FALSE;
+ boolean_t upgradeall = B_FALSE;
+ char *end;
+
+ /* check options */
+ while ((c = getopt(argc, argv, ":avV:")) != -1) {
+ switch (c) {
+ case 'a':
+ upgradeall = B_TRUE;
+ break;
+ case 'v':
+ showversions = B_TRUE;
+ break;
+ case 'V':
+ cb.cb_version = strtoll(optarg, &end, 10);
+ if (*end != '\0' ||
+ !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) {
+ (void) fprintf(stderr,
+ gettext("invalid version '%s'\n"), optarg);
+ usage(B_FALSE);
+ }
+ break;
+ case ':':
+ (void) fprintf(stderr, gettext("missing argument for "
+ "'%c' option\n"), optopt);
+ usage(B_FALSE);
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ cb.cb_argc = argc;
+ cb.cb_argv = argv;
+ argc -= optind;
+ argv += optind;
+
+ if (cb.cb_version == 0) {
+ cb.cb_version = SPA_VERSION;
+ } else if (!upgradeall && argc == 0) {
+ (void) fprintf(stderr, gettext("-V option is "
+ "incompatible with other arguments\n"));
+ usage(B_FALSE);
+ }
+
+ if (showversions) {
+ if (upgradeall || argc != 0) {
+ (void) fprintf(stderr, gettext("-v option is "
+ "incompatible with other arguments\n"));
+ usage(B_FALSE);
+ }
+ } else if (upgradeall) {
+ if (argc != 0) {
+ (void) fprintf(stderr, gettext("-a option should not "
+ "be used along with a pool name\n"));
+ usage(B_FALSE);
+ }
+ }
+
+ (void) printf(gettext("This system supports ZFS pool feature "
+ "flags.\n\n"));
+ if (showversions) {
+ int i;
+
+ (void) printf(gettext("The following features are "
+ "supported:\n\n"));
+ (void) printf(gettext("FEAT DESCRIPTION\n"));
+ (void) printf("----------------------------------------------"
+ "---------------\n");
+ for (i = 0; i < SPA_FEATURES; i++) {
+ zfeature_info_t *fi = &spa_feature_table[i];
+ const char *ro =
+ (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ " (read-only compatible)" : "";
+
+ (void) printf("%-37s%s\n", fi->fi_uname, ro);
+ (void) printf(" %s\n", fi->fi_desc);
+ }
+ (void) printf("\n");
+
+ (void) printf(gettext("The following legacy versions are also "
+ "supported:\n\n"));
+ (void) printf(gettext("VER DESCRIPTION\n"));
+ (void) printf("--- -----------------------------------------"
+ "---------------\n");
+ (void) printf(gettext(" 1 Initial ZFS version\n"));
+ (void) printf(gettext(" 2 Ditto blocks "
+ "(replicated metadata)\n"));
+ (void) printf(gettext(" 3 Hot spares and double parity "
+ "RAID-Z\n"));
+ (void) printf(gettext(" 4 zpool history\n"));
+ (void) printf(gettext(" 5 Compression using the gzip "
+ "algorithm\n"));
+ (void) printf(gettext(" 6 bootfs pool property\n"));
+ (void) printf(gettext(" 7 Separate intent log devices\n"));
+ (void) printf(gettext(" 8 Delegated administration\n"));
+ (void) printf(gettext(" 9 refquota and refreservation "
+ "properties\n"));
+ (void) printf(gettext(" 10 Cache devices\n"));
+ (void) printf(gettext(" 11 Improved scrub performance\n"));
+ (void) printf(gettext(" 12 Snapshot properties\n"));
+ (void) printf(gettext(" 13 snapused property\n"));
+ (void) printf(gettext(" 14 passthrough-x aclinherit\n"));
+ (void) printf(gettext(" 15 user/group space accounting\n"));
+ (void) printf(gettext(" 16 stmf property support\n"));
+ (void) printf(gettext(" 17 Triple-parity RAID-Z\n"));
+ (void) printf(gettext(" 18 Snapshot user holds\n"));
+ (void) printf(gettext(" 19 Log device removal\n"));
+ (void) printf(gettext(" 20 Compression using zle "
+ "(zero-length encoding)\n"));
+ (void) printf(gettext(" 21 Deduplication\n"));
+ (void) printf(gettext(" 22 Received properties\n"));
+ (void) printf(gettext(" 23 Slim ZIL\n"));
+ (void) printf(gettext(" 24 System attributes\n"));
+ (void) printf(gettext(" 25 Improved scrub stats\n"));
+ (void) printf(gettext(" 26 Improved snapshot deletion "
+ "performance\n"));
+ (void) printf(gettext(" 27 Improved snapshot creation "
+ "performance\n"));
+ (void) printf(gettext(" 28 Multiple vdev replacements\n"));
+ (void) printf(gettext("\nFor more information on a particular "
+ "version, including supported releases,\n"));
+ (void) printf(gettext("see the ZFS Administration Guide.\n\n"));
+ } else if (argc == 0 && upgradeall) {
+ cb.cb_first = B_TRUE;
+ ret = zpool_iter(g_zfs, upgrade_cb, &cb);
+ if (ret == 0 && cb.cb_first) {
+ if (cb.cb_version == SPA_VERSION) {
+ (void) printf(gettext("All pools are already "
+ "formatted using feature flags.\n\n"));
+ (void) printf(gettext("Every feature flags "
+ "pool already has all supported features "
+ "enabled.\n"));
+ } else {
+ (void) printf(gettext("All pools are already "
+ "formatted with version %llu or higher.\n"),
+ (u_longlong_t)cb.cb_version);
+ }
+ }
+ } else if (argc == 0) {
+ cb.cb_first = B_TRUE;
+ ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb);
+ assert(ret == 0);
+
+ if (cb.cb_first) {
+ (void) printf(gettext("All pools are formatted "
+ "using feature flags.\n\n"));
+ } else {
+ (void) printf(gettext("\nUse 'zpool upgrade -v' "
+ "for a list of available legacy versions.\n"));
+ }
+
+ cb.cb_first = B_TRUE;
+ ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb);
+ assert(ret == 0);
+
+ if (cb.cb_first) {
+ (void) printf(gettext("Every feature flags pool has "
+ "all supported features enabled.\n"));
+ } else {
+ (void) printf(gettext("\n"));
+ }
+ } else {
+ ret = for_each_pool(argc, argv, B_FALSE, NULL,
+ upgrade_one, &cb);
+ }
+
+ return (ret);
+}
+
+typedef struct hist_cbdata {
+ boolean_t first;
+ boolean_t longfmt;
+ boolean_t internal;
+} hist_cbdata_t;
+
+static void
+print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb)
+{
+ nvlist_t **records;
+ uint_t numrecords;
+ int i;
+
+ verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD,
+ &records, &numrecords) == 0);
+ for (i = 0; i < numrecords; i++) {
+ nvlist_t *rec = records[i];
+ char tbuf[30] = "";
+
+ if (nvlist_exists(rec, ZPOOL_HIST_TIME)) {
+ time_t tsec;
+ struct tm t;
+
+ tsec = fnvlist_lookup_uint64(records[i],
+ ZPOOL_HIST_TIME);
+ (void) localtime_r(&tsec, &t);
+ (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+ }
+
+ if (nvlist_exists(rec, ZPOOL_HIST_CMD)) {
+ (void) printf("%s %s", tbuf,
+ fnvlist_lookup_string(rec, ZPOOL_HIST_CMD));
+ } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) {
+ int ievent =
+ fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT);
+ if (!cb->internal)
+ continue;
+ if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) {
+ (void) printf("%s unrecognized record:\n",
+ tbuf);
+ dump_nvlist(rec, 4);
+ continue;
+ }
+ (void) printf("%s [internal %s txg:%lld] %s", tbuf,
+ zfs_history_event_names[ievent],
+ (longlong_t)fnvlist_lookup_uint64(
+ rec, ZPOOL_HIST_TXG),
+ fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR));
+ } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) {
+ if (!cb->internal)
+ continue;
+ (void) printf("%s [txg:%lld] %s", tbuf,
+ (longlong_t)fnvlist_lookup_uint64(
+ rec, ZPOOL_HIST_TXG),
+ fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME));
+ if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) {
+ (void) printf(" %s (%llu)",
+ fnvlist_lookup_string(rec,
+ ZPOOL_HIST_DSNAME),
+ (u_longlong_t)fnvlist_lookup_uint64(rec,
+ ZPOOL_HIST_DSID));
+ }
+ (void) printf(" %s", fnvlist_lookup_string(rec,
+ ZPOOL_HIST_INT_STR));
+ } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) {
+ if (!cb->internal)
+ continue;
+ (void) printf("%s ioctl %s\n", tbuf,
+ fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL));
+ if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) {
+ (void) printf(" input:\n");
+ dump_nvlist(fnvlist_lookup_nvlist(rec,
+ ZPOOL_HIST_INPUT_NVL), 8);
+ }
+ if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) {
+ (void) printf(" output:\n");
+ dump_nvlist(fnvlist_lookup_nvlist(rec,
+ ZPOOL_HIST_OUTPUT_NVL), 8);
+ }
+ if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) {
+ (void) printf(" errno: %lld\n",
+ (longlong_t)fnvlist_lookup_int64(rec,
+ ZPOOL_HIST_ERRNO));
+ }
+ } else {
+ if (!cb->internal)
+ continue;
+ (void) printf("%s unrecognized record:\n", tbuf);
+ dump_nvlist(rec, 4);
+ }
+
+ if (!cb->longfmt) {
+ (void) printf("\n");
+ continue;
+ }
+ (void) printf(" [");
+ if (nvlist_exists(rec, ZPOOL_HIST_WHO)) {
+ uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO);
+ struct passwd *pwd = getpwuid(who);
+ (void) printf("user %d ", (int)who);
+ if (pwd != NULL)
+ (void) printf("(%s) ", pwd->pw_name);
+ }
+ if (nvlist_exists(rec, ZPOOL_HIST_HOST)) {
+ (void) printf("on %s",
+ fnvlist_lookup_string(rec, ZPOOL_HIST_HOST));
+ }
+ if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) {
+ (void) printf(":%s",
+ fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE));
+ }
+
+ (void) printf("]");
+ (void) printf("\n");
+ }
+}
+
+/*
+ * Print out the command history for a specific pool.
+ */
+static int
+get_history_one(zpool_handle_t *zhp, void *data)
+{
+ nvlist_t *nvhis;
+ int ret;
+ hist_cbdata_t *cb = (hist_cbdata_t *)data;
+ uint64_t off = 0;
+ boolean_t eof = B_FALSE;
+
+ cb->first = B_FALSE;
+
+ (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp));
+
+ while (!eof) {
+ if ((ret = zpool_get_history(zhp, &nvhis, &off, &eof)) != 0)
+ return (ret);
+
+ print_history_records(nvhis, cb);
+ nvlist_free(nvhis);
+ }
+ (void) printf("\n");
+
+ return (ret);
+}
+
+/*
+ * zpool history [-il] [pool] ...
+ *
+ * Displays the history of commands that modified pools.
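+ *
+ * -l Display log records in long format (user, hostname, zone).
+ * -i Also display internally logged ZFS events.
+ *
+ * Illustrative invocation: 'zpool history -il tank'.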
+ */
+int
+zpool_do_history(int argc, char **argv)
+{
+ hist_cbdata_t cbdata = { 0 };
+ int ret;
+ int c;
+
+ cbdata.first = B_TRUE;
+ /* check options */
+ while ((c = getopt(argc, argv, "li")) != -1) {
+ switch (c) {
+ case 'l':
+ cbdata.longfmt = B_TRUE;
+ break;
+ case 'i':
+ cbdata.internal = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one,
+ &cbdata);
+
+ if (argc == 0 && cbdata.first == B_TRUE) {
+ (void) fprintf(stderr, gettext("no pools available\n"));
+ return (0);
+ }
+
+ return (ret);
+}
+
+typedef struct ev_opts {
+ int verbose;
+ int scripted;
+ int follow;
+ int clear;
+ char poolname[ZFS_MAX_DATASET_NAME_LEN];
+} ev_opts_t;
+
+static void
+zpool_do_events_short(nvlist_t *nvl, ev_opts_t *opts)
+{
+ char ctime_str[26], str[32], *ptr;
+ int64_t *tv;
+ uint_t n;
+
+ verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0);
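+ /*
+ * Reassemble the ctime_r() output ("Wed Jun 30 21:49:08 1993\n") into
+ * a fixed-width "Jun 30 1993 21:49:08.123456789" style timestamp.
+ */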
+ memset(str, ' ', 32);
+ (void) ctime_r((const time_t *)&tv[0], ctime_str);
+ (void) memcpy(str, ctime_str+4, 6); /* 'Jun 30' */
+ (void) memcpy(str+7, ctime_str+20, 4); /* '1993' */
+ (void) memcpy(str+12, ctime_str+11, 8); /* '21:49:08' */
+ (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */
+ if (opts->scripted)
+ (void) printf(gettext("%s\t"), str);
+ else
+ (void) printf(gettext("%s "), str);
+
+ verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0);
+ (void) printf(gettext("%s\n"), ptr);
+}
+
+static void
+zpool_do_events_nvprint(nvlist_t *nvl, int depth)
+{
+ nvpair_t *nvp;
+
+ for (nvp = nvlist_next_nvpair(nvl, NULL);
+ nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {
+
+ data_type_t type = nvpair_type(nvp);
+ const char *name = nvpair_name(nvp);
+
+ boolean_t b;
+ uint8_t i8;
+ uint16_t i16;
+ uint32_t i32;
+ uint64_t i64;
+ char *str;
+ nvlist_t *cnv;
+
+ printf(gettext("%*s%s = "), depth, "", name);
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ printf(gettext("%s"), "1");
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ (void) nvpair_value_boolean_value(nvp, &b);
+ printf(gettext("%s"), b ? "1" : "0");
+ break;
+
+ case DATA_TYPE_BYTE:
+ (void) nvpair_value_byte(nvp, &i8);
+ printf(gettext("0x%x"), i8);
+ break;
+
+ case DATA_TYPE_INT8:
+ (void) nvpair_value_int8(nvp, (void *)&i8);
+ printf(gettext("0x%x"), i8);
+ break;
+
+ case DATA_TYPE_UINT8:
+ (void) nvpair_value_uint8(nvp, &i8);
+ printf(gettext("0x%x"), i8);
+ break;
+
+ case DATA_TYPE_INT16:
+ (void) nvpair_value_int16(nvp, (void *)&i16);
+ printf(gettext("0x%x"), i16);
+ break;
+
+ case DATA_TYPE_UINT16:
+ (void) nvpair_value_uint16(nvp, &i16);
+ printf(gettext("0x%x"), i16);
+ break;
+
+ case DATA_TYPE_INT32:
+ (void) nvpair_value_int32(nvp, (void *)&i32);
+ printf(gettext("0x%x"), i32);
+ break;
+
+ case DATA_TYPE_UINT32:
+ (void) nvpair_value_uint32(nvp, &i32);
+ printf(gettext("0x%x"), i32);
+ break;
+
+ case DATA_TYPE_INT64:
+ (void) nvpair_value_int64(nvp, (void *)&i64);
+ printf(gettext("0x%llx"), (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_UINT64:
+ (void) nvpair_value_uint64(nvp, &i64);
+ /*
+ * translate vdev state values to readable
+ * strings to aid zpool events consumers
+ */
+ if (strcmp(name,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 ||
+ strcmp(name,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) {
+ printf(gettext("\"%s\" (0x%llx)"),
+ zpool_state_to_name(i64, VDEV_AUX_NONE),
+ (u_longlong_t)i64);
+ } else {
+ printf(gettext("0x%llx"), (u_longlong_t)i64);
+ }
+ break;
+
+ case DATA_TYPE_HRTIME:
+ (void) nvpair_value_hrtime(nvp, (void *)&i64);
+ printf(gettext("0x%llx"), (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_STRING:
+ (void) nvpair_value_string(nvp, &str);
+ printf(gettext("\"%s\""), str ? str : "<NULL>");
+ break;
+
+ case DATA_TYPE_NVLIST:
+ printf(gettext("(embedded nvlist)\n"));
+ (void) nvpair_value_nvlist(nvp, &cnv);
+ zpool_do_events_nvprint(cnv, depth + 8);
+ printf(gettext("%*s(end %s)"), depth, "", name);
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **val;
+ uint_t i, nelem;
+
+ (void) nvpair_value_nvlist_array(nvp, &val, &nelem);
+ printf(gettext("(%d embedded nvlists)\n"), nelem);
+ for (i = 0; i < nelem; i++) {
+ printf(gettext("%*s%s[%d] = %s\n"),
+ depth, "", name, i, "(embedded nvlist)");
+ zpool_do_events_nvprint(val[i], depth + 8);
+ printf(gettext("%*s(end %s[%i])\n"),
+ depth, "", name, i);
+ }
+ printf(gettext("%*s(end %s)\n"), depth, "", name);
+ }
+ break;
+
+ case DATA_TYPE_INT8_ARRAY: {
+ int8_t *val;
+ uint_t i, nelem;
+
+ (void) nvpair_value_int8_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ printf(gettext("0x%x "), val[i]);
+
+ break;
+ }
+
+ case DATA_TYPE_UINT8_ARRAY: {
+ uint8_t *val;
+ uint_t i, nelem;
+
+ (void) nvpair_value_uint8_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ printf(gettext("0x%x "), val[i]);
+
+ break;
+ }
+
+ case DATA_TYPE_INT16_ARRAY: {
+ int16_t *val;
+ uint_t i, nelem;
+
+ (void) nvpair_value_int16_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ printf(gettext("0x%x "), val[i]);
+
+ break;
+ }
+
+ case DATA_TYPE_UINT16_ARRAY: {
+ uint16_t *val;
+ uint_t i, nelem;
+
+ (void) nvpair_value_uint16_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ printf(gettext("0x%x "), val[i]);
+
+ break;
+ }
+
+ case DATA_TYPE_INT32_ARRAY: {
+ int32_t *val;
+ uint_t i, nelem;
+
+ (void) nvpair_value_int32_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ printf(gettext("0x%x "), val[i]);
+
+ break;
+ }
+
+ case DATA_TYPE_UINT32_ARRAY: {
+ uint32_t *val;
+ uint_t i, nelem;
+
+ (void) nvpair_value_uint32_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ printf(gettext("0x%x "), val[i]);
+
+ break;
+ }
+
+ case DATA_TYPE_INT64_ARRAY: {
+ int64_t *val;
+ uint_t i, nelem;
+
+ (void) nvpair_value_int64_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ printf(gettext("0x%llx "),
+ (u_longlong_t)val[i]);
+
+ break;
+ }
+
+ case DATA_TYPE_UINT64_ARRAY: {
+ uint64_t *val;
+ uint_t i, nelem;
+
+ (void) nvpair_value_uint64_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ printf(gettext("0x%llx "),
+ (u_longlong_t)val[i]);
+
+ break;
+ }
+
+ case DATA_TYPE_STRING_ARRAY: {
+ char **str;
+ uint_t i, nelem;
+
+ (void) nvpair_value_string_array(nvp, &str, &nelem);
+ for (i = 0; i < nelem; i++)
+ printf(gettext("\"%s\" "),
+ str[i] ? str[i] : "<NULL>");
+
+ break;
+ }
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_DOUBLE:
+ case DATA_TYPE_DONTCARE:
+ case DATA_TYPE_UNKNOWN:
+ printf(gettext("<unknown>"));
+ break;
+ }
+
+ printf(gettext("\n"));
+ }
+}
+
+static int
+zpool_do_events_next(ev_opts_t *opts)
+{
+ nvlist_t *nvl;
+ int zevent_fd, ret, dropped;
+ char *pool;
+
+ zevent_fd = open(ZFS_DEV, O_RDWR);
+ VERIFY(zevent_fd >= 0);
+
+ if (!opts->scripted)
+ (void) printf(gettext("%-30s %s\n"), "TIME", "CLASS");
+
+ while (1) {
+ ret = zpool_events_next(g_zfs, &nvl, &dropped,
+ (opts->follow ? ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd);
+ if (ret || nvl == NULL)
+ break;
+
+ if (dropped > 0)
+ (void) printf(gettext("dropped %d events\n"), dropped);
+
+ if (strlen(opts->poolname) > 0 &&
+ nvlist_lookup_string(nvl, FM_FMRI_ZFS_POOL, &pool) == 0 &&
+ strcmp(opts->poolname, pool) != 0)
+ continue;
+
+ zpool_do_events_short(nvl, opts);
+
+ if (opts->verbose) {
+ zpool_do_events_nvprint(nvl, 8);
+ printf(gettext("\n"));
+ }
+ (void) fflush(stdout);
+
+ nvlist_free(nvl);
+ }
+
+ VERIFY(0 == close(zevent_fd));
+
+ return (ret);
+}
+
+static int
+zpool_do_events_clear(ev_opts_t *opts)
+{
+ int count, ret;
+
+ ret = zpool_events_clear(g_zfs, &count);
+ if (!ret)
+ (void) printf(gettext("cleared %d events\n"), count);
+
+ return (ret);
+}
+
+/*
+ * zpool events [-vHf [pool] | -c]
+ *
+ * Displays event logs generated by ZFS.
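+ *
+ * -v Print all payload nvpairs for each event.
+ * -H Scripted mode; suppress headers and tab-separate fields.
+ * -f Follow mode; block and keep waiting for new events.
+ * -c Clear all cached events.
+ *
+ * Illustrative invocation: 'zpool events -vf tank'.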
+ */
+int
+zpool_do_events(int argc, char **argv)
+{
+ ev_opts_t opts = { 0 };
+ int ret;
+ int c;
+
+ /* check options */
+ while ((c = getopt(argc, argv, "vHfc")) != -1) {
+ switch (c) {
+ case 'v':
+ opts.verbose = 1;
+ break;
+ case 'H':
+ opts.scripted = 1;
+ break;
+ case 'f':
+ opts.follow = 1;
+ break;
+ case 'c':
+ opts.clear = 1;
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ } else if (argc == 1) {
+ (void) strlcpy(opts.poolname, argv[0], sizeof (opts.poolname));
+ if (!zfs_name_valid(opts.poolname, ZFS_TYPE_POOL)) {
+ (void) fprintf(stderr,
+ gettext("invalid pool name '%s'\n"), opts.poolname);
+ usage(B_FALSE);
+ }
+ }
+
+ if ((argc == 1 || opts.verbose || opts.scripted || opts.follow) &&
+ opts.clear) {
+ (void) fprintf(stderr,
+ gettext("invalid options combined with -c\n"));
+ usage(B_FALSE);
+ }
+
+ if (opts.clear)
+ ret = zpool_do_events_clear(&opts);
+ else
+ ret = zpool_do_events_next(&opts);
+
+ return (ret);
+}
+
+static int
+get_callback(zpool_handle_t *zhp, void *data)
+{
+ zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data;
+ char value[MAXNAMELEN];
+ zprop_source_t srctype;
+ zprop_list_t *pl;
+
+ for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) {
+
+ /*
+ * Skip the special fake placeholder. This will also skip
+ * over the name property when 'all' is specified.
+ */
+ if (pl->pl_prop == ZPOOL_PROP_NAME &&
+ pl == cbp->cb_proplist)
+ continue;
+
+ if (pl->pl_prop == ZPROP_INVAL &&
+ (zpool_prop_feature(pl->pl_user_prop) ||
+ zpool_prop_unsupported(pl->pl_user_prop))) {
+ srctype = ZPROP_SRC_LOCAL;
+
+ if (zpool_prop_get_feature(zhp, pl->pl_user_prop,
+ value, sizeof (value)) == 0) {
+ zprop_print_one_property(zpool_get_name(zhp),
+ cbp, pl->pl_user_prop, value, srctype,
+ NULL, NULL);
+ }
+ } else {
+ if (zpool_get_prop(zhp, pl->pl_prop, value,
+ sizeof (value), &srctype, cbp->cb_literal) != 0)
+ continue;
+
+ zprop_print_one_property(zpool_get_name(zhp), cbp,
+ zpool_prop_to_name(pl->pl_prop), value, srctype,
+ NULL, NULL);
+ }
+ }
+ return (0);
+}
+
+/*
+ * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> <pool> ...
+ *
+ * -H Scripted mode. Don't display headers, and separate properties
+ * by a single tab.
+ * -o List of columns to display. Defaults to
+ * "name,property,value,source".
+ * -p Display values in parsable (exact) format.
+ *
+ * Get properties of pools in the system. Output space statistics
+ * for each one as well as other attributes.
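+ *
+ * Illustrative invocations (for reference):
+ * zpool get all tank Show every property of pool 'tank'.
+ * zpool get -Hp size,capacity tank Scripted, exact values for two properties.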
+ */
+int
+zpool_do_get(int argc, char **argv)
+{
+ zprop_get_cbdata_t cb = { 0 };
+ zprop_list_t fake_name = { 0 };
+ int ret;
+ int c, i;
+ char *value;
+
+ cb.cb_first = B_TRUE;
+
+ /*
+ * Set up default columns and sources.
+ */
+ cb.cb_sources = ZPROP_SRC_ALL;
+ cb.cb_columns[0] = GET_COL_NAME;
+ cb.cb_columns[1] = GET_COL_PROPERTY;
+ cb.cb_columns[2] = GET_COL_VALUE;
+ cb.cb_columns[3] = GET_COL_SOURCE;
+ cb.cb_type = ZFS_TYPE_POOL;
+
+ /* check options */
+ while ((c = getopt(argc, argv, ":Hpo:")) != -1) {
+ switch (c) {
+ case 'p':
+ cb.cb_literal = B_TRUE;
+ break;
+ case 'H':
+ cb.cb_scripted = B_TRUE;
+ break;
+ case 'o':
+ bzero(&cb.cb_columns, sizeof (cb.cb_columns));
+ i = 0;
+ while (*optarg != '\0') {
+ static char *col_subopts[] =
+ { "name", "property", "value", "source",
+ "all", NULL };
+
+ if (i == ZFS_GET_NCOLS) {
+ (void) fprintf(stderr, gettext("too "
+ "many fields given to -o "
+ "option\n"));
+ usage(B_FALSE);
+ }
+
+ switch (getsubopt(&optarg, col_subopts,
+ &value)) {
+ case 0:
+ cb.cb_columns[i++] = GET_COL_NAME;
+ break;
+ case 1:
+ cb.cb_columns[i++] = GET_COL_PROPERTY;
+ break;
+ case 2:
+ cb.cb_columns[i++] = GET_COL_VALUE;
+ break;
+ case 3:
+ cb.cb_columns[i++] = GET_COL_SOURCE;
+ break;
+ case 4:
+ if (i > 0) {
+ (void) fprintf(stderr,
+ gettext("\"all\" conflicts "
+ "with specific fields "
+ "given to -o option\n"));
+ usage(B_FALSE);
+ }
+ cb.cb_columns[0] = GET_COL_NAME;
+ cb.cb_columns[1] = GET_COL_PROPERTY;
+ cb.cb_columns[2] = GET_COL_VALUE;
+ cb.cb_columns[3] = GET_COL_SOURCE;
+ i = ZFS_GET_NCOLS;
+ break;
+ default:
+ (void) fprintf(stderr,
+ gettext("invalid column name "
+ "'%s'\n"), value);
+ usage(B_FALSE);
+ }
+ }
+ break;
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing property "
+ "argument\n"));
+ usage(B_FALSE);
+ }
+
+ if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist,
+ ZFS_TYPE_POOL) != 0)
+ usage(B_FALSE);
+
+ argc--;
+ argv++;
+
+ if (cb.cb_proplist != NULL) {
+ fake_name.pl_prop = ZPOOL_PROP_NAME;
+ fake_name.pl_width = strlen(gettext("NAME"));
+ fake_name.pl_next = cb.cb_proplist;
+ cb.cb_proplist = &fake_name;
+ }
+
+ ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist,
+ get_callback, &cb);
+
+ if (cb.cb_proplist == &fake_name)
+ zprop_free_list(fake_name.pl_next);
+ else
+ zprop_free_list(cb.cb_proplist);
+
+ return (ret);
+}
+
+typedef struct set_cbdata {
+ char *cb_propname;
+ char *cb_value;
+ boolean_t cb_any_successful;
+} set_cbdata_t;
+
+static int
+set_callback(zpool_handle_t *zhp, void *data)
+{
+ int error;
+ set_cbdata_t *cb = (set_cbdata_t *)data;
+
+ error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value);
+
+ if (!error)
+ cb->cb_any_successful = B_TRUE;
+
+ return (error);
+}
+
+int
+zpool_do_set(int argc, char **argv)
+{
+ set_cbdata_t cb = { 0 };
+ int error;
+
+ if (argc > 1 && argv[1][0] == '-') {
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ argv[1][1]);
+ usage(B_FALSE);
+ }
+
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing property=value "
+ "argument\n"));
+ usage(B_FALSE);
+ }
+
+ if (argc < 3) {
+ (void) fprintf(stderr, gettext("missing pool name\n"));
+ usage(B_FALSE);
+ }
+
+ if (argc > 3) {
+ (void) fprintf(stderr, gettext("too many pool names\n"));
+ usage(B_FALSE);
+ }
+
+ cb.cb_propname = argv[1];
+ cb.cb_value = strchr(cb.cb_propname, '=');
+ if (cb.cb_value == NULL) {
+ (void) fprintf(stderr, gettext("missing value in "
+ "property=value argument\n"));
+ usage(B_FALSE);
+ }
+
+ *(cb.cb_value) = '\0';
+ cb.cb_value++;
+
+ error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL,
+ set_callback, &cb);
+
+ return (error);
+}
+
+/* Add up the total number of bytes left to initialize/trim across all vdevs */
+static uint64_t
+vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity)
+{
+ uint64_t bytes_remaining;
+ nvlist_t **child;
+ uint_t c, children;
+ vdev_stat_t *vs;
+
+ assert(activity == ZPOOL_WAIT_INITIALIZE ||
+ activity == ZPOOL_WAIT_TRIM);
+
+ verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &c) == 0);
+
+ if (activity == ZPOOL_WAIT_INITIALIZE &&
+ vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE)
+ bytes_remaining = vs->vs_initialize_bytes_est -
+ vs->vs_initialize_bytes_done;
+ else if (activity == ZPOOL_WAIT_TRIM &&
+ vs->vs_trim_state == VDEV_TRIM_ACTIVE)
+ bytes_remaining = vs->vs_trim_bytes_est -
+ vs->vs_trim_bytes_done;
+ else
+ bytes_remaining = 0;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ children = 0;
+
+ for (c = 0; c < children; c++)
+ bytes_remaining += vdev_activity_remaining(child[c], activity);
+
+ return (bytes_remaining);
+}
+
+/* Add up the total number of bytes left to rebuild across top-level vdevs */
+static uint64_t
+vdev_activity_top_remaining(nvlist_t *nv)
+{
+ uint64_t bytes_remaining = 0;
+ nvlist_t **child;
+ uint_t children;
+ int error;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ children = 0;
+
+ for (uint_t c = 0; c < children; c++) {
+ vdev_rebuild_stat_t *vrs;
+ uint_t i;
+
+ error = nvlist_lookup_uint64_array(child[c],
+ ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i);
+ if (error == 0) {
+ if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+ bytes_remaining += (vrs->vrs_bytes_est -
+ vrs->vrs_bytes_rebuilt);
+ }
+ }
+ }
+
+ return (bytes_remaining);
+}
+
+/* Whether any vdevs are 'spare' or 'replacing' vdevs */
+static boolean_t
+vdev_any_spare_replacing(nvlist_t *nv)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ char *vdev_type;
+
+ (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type);
+
+ if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 ||
+ strcmp(vdev_type, VDEV_TYPE_SPARE) == 0) {
+ return (B_TRUE);
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ children = 0;
+
+ for (c = 0; c < children; c++) {
+ if (vdev_any_spare_replacing(child[c]))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+typedef struct wait_data {
+ char *wd_poolname;
+ boolean_t wd_scripted;
+ boolean_t wd_exact;
+ boolean_t wd_headers_once;
+ boolean_t wd_should_exit;
+ /* Which activities to wait for */
+ boolean_t wd_enabled[ZPOOL_WAIT_NUM_ACTIVITIES];
+ float wd_interval;
+ pthread_cond_t wd_cv;
+ pthread_mutex_t wd_mutex;
+} wait_data_t;
+
+/*
+ * Print to stdout a single line containing one column for each activity we
+ * are waiting for, specifying how many bytes of work are left for that
+ * activity.
+ */
+static void
+print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
+{
+ nvlist_t *config, *nvroot;
+ uint_t c;
+ int i;
+ pool_checkpoint_stat_t *pcs = NULL;
+ pool_scan_stat_t *pss = NULL;
+ pool_removal_stat_t *prs = NULL;
+ char *headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE",
+ "REMOVE", "RESILVER", "SCRUB", "TRIM"};
+ int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES];
+
+ /* Calculate the width of each column */
+ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) {
+ /*
+ * Make sure we have enough space in the col for pretty-printed
+ * numbers and for the column header, and then leave a couple
+ * spaces between cols for readability.
+ */
+ col_widths[i] = MAX(strlen(headers[i]), 6) + 2;
+ }
+
+ /* Print header if appropriate */
+ int term_height = terminal_height();
+ boolean_t reprint_header = (!wd->wd_headers_once && term_height > 0 &&
+ row % (term_height-1) == 0);
+ if (!wd->wd_scripted && (row == 0 || reprint_header)) {
+ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) {
+ if (wd->wd_enabled[i])
+ (void) printf("%*s", col_widths[i], headers[i]);
+ }
+ (void) printf("\n");
+ }
+
+ /* Bytes of work remaining in each activity */
+ int64_t bytes_rem[ZPOOL_WAIT_NUM_ACTIVITIES] = {0};
+
+ bytes_rem[ZPOOL_WAIT_FREE] =
+ zpool_get_prop_int(zhp, ZPOOL_PROP_FREEING, NULL);
+
+ config = zpool_get_config(zhp, NULL);
+ nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
+
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
+ if (pcs != NULL && pcs->pcs_state == CS_CHECKPOINT_DISCARDING)
+ bytes_rem[ZPOOL_WAIT_CKPT_DISCARD] = pcs->pcs_space;
+
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
+ if (prs != NULL && prs->prs_state == DSS_SCANNING)
+ bytes_rem[ZPOOL_WAIT_REMOVE] = prs->prs_to_copy -
+ prs->prs_copied;
+
+ (void) nvlist_lookup_uint64_array(nvroot,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&pss, &c);
+ if (pss != NULL && pss->pss_state == DSS_SCANNING &&
+ pss->pss_pass_scrub_pause == 0) {
+ int64_t rem = pss->pss_to_examine - pss->pss_issued;
+ if (pss->pss_func == POOL_SCAN_SCRUB)
+ bytes_rem[ZPOOL_WAIT_SCRUB] = rem;
+ else
+ bytes_rem[ZPOOL_WAIT_RESILVER] = rem;
+ } else if (check_rebuilding(nvroot, NULL)) {
+ bytes_rem[ZPOOL_WAIT_RESILVER] =
+ vdev_activity_top_remaining(nvroot);
+ }
+
+ bytes_rem[ZPOOL_WAIT_INITIALIZE] =
+ vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE);
+ bytes_rem[ZPOOL_WAIT_TRIM] =
+ vdev_activity_remaining(nvroot, ZPOOL_WAIT_TRIM);
+
+ /*
+ * A replace finishes after resilvering finishes, so the amount of work
+ * left for a replace is the same as for resilvering.
+ *
+ * It isn't quite correct to say that if we have any 'spare' or
+ * 'replacing' vdevs and a resilver is happening, then a replace is in
+ * progress, like we do here. When a hot spare is used, the faulted vdev
+ * is not removed after the hot spare is resilvered, so the parent 'spare'
+ * vdev is not removed either. So we could have a 'spare' vdev, but be
+ * resilvering for a different reason. However, we use it as a heuristic
+ * because we don't have access to the DTLs, which could tell us whether
+ * or not we have really finished resilvering a hot spare.
+ */
+ if (vdev_any_spare_replacing(nvroot))
+ bytes_rem[ZPOOL_WAIT_REPLACE] = bytes_rem[ZPOOL_WAIT_RESILVER];
+
+ if (timestamp_fmt != NODATE)
+ print_timestamp(timestamp_fmt);
+
+ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) {
+ char buf[64];
+ if (!wd->wd_enabled[i])
+ continue;
+
+ if (wd->wd_exact)
+ (void) snprintf(buf, sizeof (buf), "%" PRIi64,
+ bytes_rem[i]);
+ else
+ zfs_nicenum(bytes_rem[i], buf, sizeof (buf));
+
+ if (wd->wd_scripted)
+ (void) printf(i == 0 ? "%s" : "\t%s", buf);
+ else
+ (void) printf(" %*s", col_widths[i] - 1, buf);
+ }
+ (void) printf("\n");
+ (void) fflush(stdout);
+}
+
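+/*
+ * Body of the status-printing thread: refresh the pool's stats, print one
+ * row of per-activity byte counts, then wait up to wd_interval seconds and
+ * repeat, exiting early if the main thread signals wd_should_exit or the
+ * pool disappears.
+ */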
+static void *
+wait_status_thread(void *arg)
+{
+ wait_data_t *wd = (wait_data_t *)arg;
+ zpool_handle_t *zhp;
+
+ if ((zhp = zpool_open(g_zfs, wd->wd_poolname)) == NULL)
+ return (void *)(1);
+
+ for (int row = 0; ; row++) {
+ boolean_t missing;
+ struct timespec timeout;
+ int ret = 0;
+ (void) clock_gettime(CLOCK_REALTIME, &timeout);
+
+ if (zpool_refresh_stats(zhp, &missing) != 0 || missing ||
+ zpool_props_refresh(zhp) != 0) {
+ zpool_close(zhp);
+ return (void *)(uintptr_t)(missing ? 0 : 1);
+ }
+
+ print_wait_status_row(wd, zhp, row);
+
+ timeout.tv_sec += floor(wd->wd_interval);
+ long nanos = timeout.tv_nsec +
+ (wd->wd_interval - floor(wd->wd_interval)) * NANOSEC;
+ if (nanos >= NANOSEC) {
+ timeout.tv_sec++;
+ timeout.tv_nsec = nanos - NANOSEC;
+ } else {
+ timeout.tv_nsec = nanos;
+ }
+ pthread_mutex_lock(&wd->wd_mutex);
+ if (!wd->wd_should_exit)
+ ret = pthread_cond_timedwait(&wd->wd_cv, &wd->wd_mutex,
+ &timeout);
+ pthread_mutex_unlock(&wd->wd_mutex);
+ if (ret == 0) {
+ break; /* signaled by main thread */
+ } else if (ret != ETIMEDOUT) {
+ (void) fprintf(stderr, gettext("pthread_cond_timedwait "
+ "failed: %s\n"), strerror(ret));
+ zpool_close(zhp);
+ return (void *)(uintptr_t)(1);
+ }
+ }
+
+ zpool_close(zhp);
+ return (void *)(0);
+}
+
+int
+zpool_do_wait(int argc, char **argv)
+{
+ boolean_t verbose = B_FALSE;
+ int c;
+ char *value;
+ int i;
+ unsigned long count;
+ pthread_t status_thr;
+ int error = 0;
+ zpool_handle_t *zhp;
+
+ wait_data_t wd;
+ wd.wd_scripted = B_FALSE;
+ wd.wd_exact = B_FALSE;
+ wd.wd_headers_once = B_FALSE;
+ wd.wd_should_exit = B_FALSE;
+
+ pthread_mutex_init(&wd.wd_mutex, NULL);
+ pthread_cond_init(&wd.wd_cv, NULL);
+
+ /* By default, wait for all types of activity. */
+ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++)
+ wd.wd_enabled[i] = B_TRUE;
+
+ while ((c = getopt(argc, argv, "HpT:t:")) != -1) {
+ switch (c) {
+ case 'H':
+ wd.wd_scripted = B_TRUE;
+ break;
+ case 'n':
+ wd.wd_headers_once = B_TRUE;
+ break;
+ case 'p':
+ wd.wd_exact = B_TRUE;
+ break;
+ case 'T':
+ get_timestamp_arg(*optarg);
+ break;
+ case 't':
+ {
+ static char *col_subopts[] = { "discard", "free",
+ "initialize", "replace", "remove", "resilver",
+ "scrub", "trim", NULL };
+
+ /* Reset activities array */
+ bzero(&wd.wd_enabled, sizeof (wd.wd_enabled));
+ while (*optarg != '\0') {
+ int activity = getsubopt(&optarg, col_subopts,
+ &value);
+
+ if (activity < 0) {
+ (void) fprintf(stderr,
+ gettext("invalid activity '%s'\n"),
+ value);
+ usage(B_FALSE);
+ }
+
+ wd.wd_enabled[activity] = B_TRUE;
+ }
+ break;
+ }
+ case '?':
+ (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+ optopt);
+ usage(B_FALSE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ get_interval_count(&argc, argv, &wd.wd_interval, &count);
+ if (count != 0) {
+ /* This subcmd only accepts an interval, not a count */
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ if (wd.wd_interval != 0)
+ verbose = B_TRUE;
+
+ if (argc < 1) {
+ (void) fprintf(stderr, gettext("missing 'pool' argument\n"));
+ usage(B_FALSE);
+ }
+ if (argc > 1) {
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ usage(B_FALSE);
+ }
+
+ wd.wd_poolname = argv[0];
+
+ if ((zhp = zpool_open(g_zfs, wd.wd_poolname)) == NULL)
+ return (1);
+
+ if (verbose) {
+ /*
+ * We use a separate thread for printing status updates because
+ * the main thread will call lzc_wait(), which blocks as long
+ * as an activity is in progress, which can be a long time.
+ */
+ if (pthread_create(&status_thr, NULL, wait_status_thread, &wd)
+ != 0) {
+ (void) fprintf(stderr, gettext("failed to create status"
+ "thread: %s\n"), strerror(errno));
+ zpool_close(zhp);
+ return (1);
+ }
+ }
+
+ /*
+ * Loop over all activities that we are supposed to wait for until none
+ * of them are in progress. Note that this means we can end up waiting
+ * for more activities to complete than just those that were in progress
+ * when we began waiting; if an activity we are interested in begins
+ * while we are waiting for another activity, we will wait for both to
+ * complete before exiting.
+ */
+ for (;;) {
+ boolean_t missing = B_FALSE;
+ boolean_t any_waited = B_FALSE;
+
+ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) {
+ boolean_t waited;
+
+ if (!wd.wd_enabled[i])
+ continue;
+
+ error = zpool_wait_status(zhp, i, &missing, &waited);
+ if (error != 0 || missing)
+ break;
+
+ any_waited = (any_waited || waited);
+ }
+
+ if (error != 0 || missing || !any_waited)
+ break;
+ }
+
+ zpool_close(zhp);
+
+ if (verbose) {
+ uintptr_t status;
+ pthread_mutex_lock(&wd.wd_mutex);
+ wd.wd_should_exit = B_TRUE;
+ pthread_cond_signal(&wd.wd_cv);
+ pthread_mutex_unlock(&wd.wd_mutex);
+ (void) pthread_join(status_thr, (void *)&status);
+ if (status != 0)
+ error = status;
+ }
+
+ pthread_mutex_destroy(&wd.wd_mutex);
+ pthread_cond_destroy(&wd.wd_cv);
+ return (error);
+}
+
+static int
+find_command_idx(char *command, int *idx)
+{
+ int i;
+
+ for (i = 0; i < NCOMMAND; i++) {
+ if (command_table[i].name == NULL)
+ continue;
+
+ if (strcmp(command, command_table[i].name) == 0) {
+ *idx = i;
+ return (0);
+ }
+ }
+ return (1);
+}
+
+/*
+ * Display version message
+ */
+static int
+zpool_do_version(int argc, char **argv)
+{
+ if (zfs_version_print() == -1)
+ return (1);
+
+ return (0);
+}
+
+int
+main(int argc, char **argv)
+{
+ int ret = 0;
+ int i = 0;
+ char *cmdname;
+ char **newargv;
+
+ (void) setlocale(LC_ALL, "");
+ (void) textdomain(TEXT_DOMAIN);
+ srand(time(NULL));
+
+ opterr = 0;
+
+ /*
+ * Make sure the user has specified some command.
+ */
+ if (argc < 2) {
+ (void) fprintf(stderr, gettext("missing command\n"));
+ usage(B_FALSE);
+ }
+
+ cmdname = argv[1];
+
+ /*
+ * Special case '-?'
+ */
+ if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0)
+ usage(B_TRUE);
+
+ /*
+ * Special case '-V|--version'
+ */
+ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0))
+ return (zpool_do_version(argc, argv));
+
+ if ((g_zfs = libzfs_init()) == NULL) {
+ (void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
+ return (1);
+ }
+
+ libzfs_print_on_error(g_zfs, B_TRUE);
+
+ zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
+
+ /*
+ * Many commands modify input strings for string parsing reasons.
+ * We create a copy to protect the original argv.
+ */
+ newargv = malloc((argc + 1) * sizeof (newargv[0]));
+ for (i = 0; i < argc; i++)
+ newargv[i] = strdup(argv[i]);
+ newargv[argc] = NULL;
+
+ /*
+ * Run the appropriate command.
+ */
+ if (find_command_idx(cmdname, &i) == 0) {
+ current_command = &command_table[i];
+ ret = command_table[i].func(argc - 1, newargv + 1);
+ } else if (strchr(cmdname, '=')) {
+ verify(find_command_idx("set", &i) == 0);
+ current_command = &command_table[i];
+ ret = command_table[i].func(argc, newargv);
+ } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) {
+ /*
+ * 'freeze' is a vile debugging abomination, so we treat
+ * it as such.
+ */
+ zfs_cmd_t zc = {"\0"};
+
+ (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name));
+ ret = zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc);
+ if (ret != 0) {
+ (void) fprintf(stderr,
+ gettext("failed to freeze pool: %d\n"), errno);
+ ret = 1;
+ }
+
+ log_history = 0;
+ } else {
+ (void) fprintf(stderr, gettext("unrecognized "
+ "command '%s'\n"), cmdname);
+ usage(B_FALSE);
+ ret = 1;
+ }
+
+ for (i = 0; i < argc; i++)
+ free(newargv[i]);
+ free(newargv);
+
+ if (ret == 0 && log_history)
+ (void) zpool_log_history(g_zfs, history_str);
+
+ libzfs_fini(g_zfs);
+
+ /*
+ * The 'ZFS_ABORT' environment variable causes us to dump core on exit
+ * for the purposes of running ::findleaks.
+ */
+ if (getenv("ZFS_ABORT") != NULL) {
+ (void) printf("dumping core by request\n");
+ abort();
+ }
+
+ return (ret);
+}
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_util.c b/sys/contrib/openzfs/cmd/zpool/zpool_util.c
new file mode 100644
index 000000000000..1c1eb024f365
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_util.c
@@ -0,0 +1,125 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <errno.h>
+#include <libgen.h>
+#include <libintl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <ctype.h>
+
+#include "zpool_util.h"
+
+/*
+ * Utility function to guarantee allocation success; exits on failure.
+ */
+void *
+safe_malloc(size_t size)
+{
+ void *data;
+
+ if ((data = calloc(1, size)) == NULL) {
+ (void) fprintf(stderr, "internal error: out of memory\n");
+ exit(1);
+ }
+
+ return (data);
+}
+
+/*
+ * Display an out of memory error message and abort the current program.
+ */
+void
+zpool_no_memory(void)
+{
+ assert(errno == ENOMEM);
+ (void) fprintf(stderr,
+ gettext("internal error: out of memory\n"));
+ exit(1);
+}
+
+/*
+ * Return the number of log vdevs in the supplied nvlist
+ */
+uint_t
+num_logs(nvlist_t *nv)
+{
+ uint_t nlogs = 0;
+ uint_t c, children;
+ nvlist_t **child;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0)
+ return (0);
+
+ for (c = 0; c < children; c++) {
+ uint64_t is_log = B_FALSE;
+
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ &is_log);
+ if (is_log)
+ nlogs++;
+ }
+ return (nlogs);
+}
+
+/* Find the max element in an array of uint64_t values */
+uint64_t
+array64_max(uint64_t array[], unsigned int len)
+{
+ uint64_t max = 0;
+ int i;
+ for (i = 0; i < len; i++)
+ max = MAX(max, array[i]);
+
+ return (max);
+}
+
+/*
+ * Find highest one bit set.
+ * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ */
+int
+highbit64(uint64_t i)
+{
+ if (i == 0)
+ return (0);
+
+ return (NBBY * sizeof (uint64_t) - __builtin_clzll(i));
+}
+
+/*
+ * Find lowest one bit set.
+ * Returns bit number + 1 of lowest bit that is set, otherwise returns 0.
+ */
+int
+lowbit64(uint64_t i)
+{
+ if (i == 0)
+ return (0);
+
+ return (__builtin_ffsll(i));
+}
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_util.h b/sys/contrib/openzfs/cmd/zpool/zpool_util.h
new file mode 100644
index 000000000000..265aa58953a0
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_util.h
@@ -0,0 +1,137 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef ZPOOL_UTIL_H
+#define ZPOOL_UTIL_H
+
+#include <libnvpair.h>
+#include <libzfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Path to scripts you can run with "zpool status/iostat -c" */
+#define ZPOOL_SCRIPTS_DIR SYSCONFDIR"/zfs/zpool.d"
+
+/*
+ * Basic utility functions
+ */
+void *safe_malloc(size_t);
+void zpool_no_memory(void);
+uint_t num_logs(nvlist_t *nv);
+uint64_t array64_max(uint64_t array[], unsigned int len);
+int highbit64(uint64_t i);
+int lowbit64(uint64_t i);
+
+/*
+ * Misc utility functions
+ */
+char *zpool_get_cmd_search_path(void);
+
+/*
+ * Virtual device functions
+ */
+
+nvlist_t *make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force,
+ int check_rep, boolean_t replacing, boolean_t dryrun, int argc,
+ char **argv);
+nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
+ nvlist_t *props, splitflags_t flags, int argc, char **argv);
+
+/*
+ * Pool list functions
+ */
+int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **,
+ zpool_iter_f, void *);
+
+/* Vdev list functions */
+typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *);
+int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data);
+
+typedef struct zpool_list zpool_list_t;
+
+zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *);
+void pool_list_update(zpool_list_t *);
+int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *);
+void pool_list_free(zpool_list_t *);
+int pool_list_count(zpool_list_t *);
+void pool_list_remove(zpool_list_t *, zpool_handle_t *);
+
+extern libzfs_handle_t *g_zfs;
+
+
+typedef struct vdev_cmd_data
+{
+ char **lines; /* Array of lines of output, minus the column name */
+ int lines_cnt; /* Number of lines in the array */
+
+ char **cols; /* Array of column names */
+ int cols_cnt; /* Number of column names */
+
+
+ char *path; /* vdev path */
+ char *upath; /* vdev underlying path */
+ char *pool; /* Pool name */
+ char *cmd; /* backpointer to cmd */
+ char *vdev_enc_sysfs_path; /* enclosure sysfs path (if any) */
+} vdev_cmd_data_t;
+
+typedef struct vdev_cmd_data_list
+{
+ char *cmd; /* Command to run */
+ unsigned int count; /* Number of vdev_cmd_data items (vdevs) */
+
+ /* fields used to select only certain vdevs, if requested */
+ libzfs_handle_t *g_zfs;
+ char **vdev_names;
+ int vdev_names_count;
+ int cb_name_flags;
+
+ vdev_cmd_data_t *data; /* Array of vdevs */
+
+ /* List of unique column names and widths */
+ char **uniq_cols;
+ int uniq_cols_cnt;
+ int *uniq_cols_width;
+
+} vdev_cmd_data_list_t;
+
+vdev_cmd_data_list_t *all_pools_for_each_vdev_run(int argc, char **argv,
+ char *cmd, libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count,
+ int cb_name_flags);
+
+void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl);
+
+int check_device(const char *path, boolean_t force,
+ boolean_t isspare, boolean_t iswholedisk);
+boolean_t check_sector_size_database(char *path, int *sector_size);
+void vdev_error(const char *fmt, ...);
+int check_file(const char *file, boolean_t force, boolean_t isspare);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZPOOL_UTIL_H */
diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c
new file mode 100644
index 000000000000..9aa09b18c4ae
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c
@@ -0,0 +1,1581 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2017 Intel Corporation.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ */
+
+/*
+ * Functions to convert between a list of vdevs and an nvlist representing the
+ * configuration. Each entry in the list can be one of:
+ *
+ * Device vdevs
+ * disk=(path=..., devid=...)
+ * file=(path=...)
+ *
+ * Group vdevs
+ * raidz[1|2]=(...)
+ * mirror=(...)
+ *
+ * Hot spares
+ *
+ * While the underlying implementation supports it, group vdevs cannot contain
+ * other group vdevs. All userland verification of devices is contained within
+ * this file. If successful, the nvlist returned can be passed directly to the
+ * kernel; we've done as much verification as possible in userland.
+ *
+ * Hot spares are a special case, and passed down as an array of disk vdevs, at
+ * the same level as the root of the vdev tree.
+ *
+ * The only function exported by this file is 'make_root_vdev'. The
+ * function performs several passes:
+ *
+ * 1. Construct the vdev specification. Performs syntax validation and
+ * makes sure each device is valid.
+ * 2. Check for devices in use. We use libblkid to make sure that no
+ * devices are already in use. Some can be overridden using the 'force'
+ * flag, others cannot.
+ * 3. Check for replication errors if the 'force' flag is not specified.
+ * This validates that the replication level is consistent across the
+ * entire pool.
+ * 4. Call libzfs to label any whole disks with an EFI label.
+ */
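+
+/*
+ * For example (illustrative only; device names are placeholders), the
+ * argument list
+ *
+ *   mirror sda sdb log mirror sdc sdd spare sde
+ *
+ * yields a root nvlist with two top-level 'mirror' vdevs, one holding data
+ * (sda, sdb) and one flagged as a log (sdc, sdd), plus a single hot spare
+ * (sde) stored in the root's spares array.
+ */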
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libintl.h>
+#include <libnvpair.h>
+#include <libzutil.h>
+#include <limits.h>
+#include <sys/spa.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "zpool_util.h"
+#include <sys/zfs_context.h>
+#include <sys/stat.h>
+
+/*
+ * For any given vdev specification, we can have multiple errors. The
+ * vdev_error() function keeps track of whether we have seen an error yet, and
+ * prints out a header if its the first error we've seen.
+ */
+boolean_t error_seen;
+boolean_t is_force;
+
+/*PRINTFLIKE1*/
+void
+vdev_error(const char *fmt, ...)
+{
+ va_list ap;
+
+ if (!error_seen) {
+ (void) fprintf(stderr, gettext("invalid vdev specification\n"));
+ if (!is_force)
+ (void) fprintf(stderr, gettext("use '-f' to override "
+ "the following errors:\n"));
+ else
+ (void) fprintf(stderr, gettext("the following errors "
+ "must be manually repaired:\n"));
+ error_seen = B_TRUE;
+ }
+
+ va_start(ap, fmt);
+ (void) vfprintf(stderr, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Check that a file is valid. All we can do in this case is check that it's
+ * not in use by another pool, and not in use by swap.
+ */
+int
+check_file(const char *file, boolean_t force, boolean_t isspare)
+{
+ char *name;
+ int fd;
+ int ret = 0;
+ pool_state_t state;
+ boolean_t inuse;
+
+ if ((fd = open(file, O_RDONLY)) < 0)
+ return (0);
+
+ if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
+ const char *desc;
+
+ switch (state) {
+ case POOL_STATE_ACTIVE:
+ desc = gettext("active");
+ break;
+
+ case POOL_STATE_EXPORTED:
+ desc = gettext("exported");
+ break;
+
+ case POOL_STATE_POTENTIALLY_ACTIVE:
+ desc = gettext("potentially active");
+ break;
+
+ default:
+ desc = gettext("unknown");
+ break;
+ }
+
+ /*
+ * Allow hot spares to be shared between pools.
+ */
+ if (state == POOL_STATE_SPARE && isspare) {
+ free(name);
+ (void) close(fd);
+ return (0);
+ }
+
+ if (state == POOL_STATE_ACTIVE ||
+ state == POOL_STATE_SPARE || !force) {
+ switch (state) {
+ case POOL_STATE_SPARE:
+ vdev_error(gettext("%s is reserved as a hot "
+ "spare for pool %s\n"), file, name);
+ break;
+ default:
+ vdev_error(gettext("%s is part of %s pool "
+ "'%s'\n"), file, desc, name);
+ break;
+ }
+ ret = -1;
+ }
+
+ free(name);
+ }
+
+ (void) close(fd);
+ return (ret);
+}
+
+/*
+ * This may be a shorthand device path or it could be total gibberish.
+ * Check to see if it is a known device available in zfs_vdev_paths.
+ * As part of this check, see if we've been given an entire disk
+ * (minus the slice number).
+ */
+static int
+is_shorthand_path(const char *arg, char *path, size_t path_size,
+ struct stat64 *statbuf, boolean_t *wholedisk)
+{
+ int error;
+
+ error = zfs_resolve_shortname(arg, path, path_size);
+ if (error == 0) {
+ *wholedisk = zfs_dev_is_whole_disk(path);
+ if (*wholedisk || (stat64(path, statbuf) == 0))
+ return (0);
+ }
+
+ strlcpy(path, arg, path_size);
+ memset(statbuf, 0, sizeof (*statbuf));
+ *wholedisk = B_FALSE;
+
+ return (error);
+}
+
+/*
+ * Determine if the given path is a hot spare within the given configuration.
+ * If no configuration is given we rely solely on the label.
+ */
+static boolean_t
+is_spare(nvlist_t *config, const char *path)
+{
+ int fd;
+ pool_state_t state;
+ char *name = NULL;
+ nvlist_t *label;
+ uint64_t guid, spareguid;
+ nvlist_t *nvroot;
+ nvlist_t **spares;
+ uint_t i, nspares;
+ boolean_t inuse;
+
+ if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
+ return (B_FALSE);
+
+ if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
+ !inuse ||
+ state != POOL_STATE_SPARE ||
+ zpool_read_label(fd, &label, NULL) != 0) {
+ free(name);
+ (void) close(fd);
+ return (B_FALSE);
+ }
+ free(name);
+ (void) close(fd);
+
+ if (config == NULL) {
+ nvlist_free(label);
+ return (B_TRUE);
+ }
+
+ verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
+ nvlist_free(label);
+
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ for (i = 0; i < nspares; i++) {
+ verify(nvlist_lookup_uint64(spares[i],
+ ZPOOL_CONFIG_GUID, &spareguid) == 0);
+ if (spareguid == guid)
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Create a leaf vdev. Determine if this is a file or a device. If it's a
+ * device, fill in the device id to make a complete nvlist. Valid forms for a
+ * leaf vdev are:
+ *
+ * /dev/xxx Complete disk path
+ * /xxx Full path to file
+ * xxx Shorthand for <zfs_vdev_paths>/xxx
+ */
+static nvlist_t *
+make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
+{
+ char path[MAXPATHLEN];
+ struct stat64 statbuf;
+ nvlist_t *vdev = NULL;
+ char *type = NULL;
+ boolean_t wholedisk = B_FALSE;
+ uint64_t ashift = 0;
+ int err;
+
+ /*
+ * Determine what type of vdev this is, and put the full path into
+ * 'path'. We detect whether this is a device or file afterwards by
+ * checking the st_mode of the file.
+ */
+ if (arg[0] == '/') {
+ /*
+ * Complete device or file path. Exact type is determined by
+ * examining the file descriptor afterwards. Symbolic links
+ * are resolved to their real paths to determine whole disk
+ * and S_ISBLK/S_ISREG type checks. However, we are careful
+ * to store the given path as ZPOOL_CONFIG_PATH to ensure we
+ * can leverage udev's persistent device labels.
+ */
+ if (realpath(arg, path) == NULL) {
+ (void) fprintf(stderr,
+ gettext("cannot resolve path '%s'\n"), arg);
+ return (NULL);
+ }
+
+ wholedisk = zfs_dev_is_whole_disk(path);
+ if (!wholedisk && (stat64(path, &statbuf) != 0)) {
+ (void) fprintf(stderr,
+ gettext("cannot open '%s': %s\n"),
+ path, strerror(errno));
+ return (NULL);
+ }
+
+ /* After whole disk check restore original passed path */
+ strlcpy(path, arg, sizeof (path));
+ } else {
+ err = is_shorthand_path(arg, path, sizeof (path),
+ &statbuf, &wholedisk);
+ if (err != 0) {
+ /*
+ * If we got ENOENT, then the user gave us
+ * gibberish, so try to direct them with a
+ * reasonable error message. Otherwise,
+ * regurgitate strerror() since it's the best we
+ * can do.
+ */
+ if (err == ENOENT) {
+ (void) fprintf(stderr,
+ gettext("cannot open '%s': no such "
+ "device in %s\n"), arg, DISK_ROOT);
+ (void) fprintf(stderr,
+ gettext("must be a full path or "
+ "shorthand device name\n"));
+ return (NULL);
+ } else {
+ (void) fprintf(stderr,
+ gettext("cannot open '%s': %s\n"),
+ path, strerror(errno));
+ return (NULL);
+ }
+ }
+ }
+
+ /*
+ * Determine whether this is a device or a file.
+ */
+ if (wholedisk || S_ISBLK(statbuf.st_mode)) {
+ type = VDEV_TYPE_DISK;
+ } else if (S_ISREG(statbuf.st_mode)) {
+ type = VDEV_TYPE_FILE;
+ } else {
+ (void) fprintf(stderr, gettext("cannot use '%s': must be a "
+ "block device or regular file\n"), path);
+ return (NULL);
+ }
+
+ /*
+ * Finally, we have the complete device or file, and we know that it is
+ * acceptable to use. Construct the nvlist to describe this vdev. All
+ * vdevs have a 'path' element, and devices also have a 'devid' element.
+ */
+ verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
+ verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
+ verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
+ verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
+ if (is_log)
+ verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ VDEV_ALLOC_BIAS_LOG) == 0);
+ if (strcmp(type, VDEV_TYPE_DISK) == 0)
+ verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
+ (uint64_t)wholedisk) == 0);
+
+ /*
+ * Override defaults if custom properties are provided.
+ */
+ if (props != NULL) {
+ char *value = NULL;
+
+ if (nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
+ if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
+ (void) fprintf(stderr,
+ gettext("ashift must be a number.\n"));
+ return (NULL);
+ }
+ if (ashift != 0 &&
+ (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
+ (void) fprintf(stderr,
+ gettext("invalid 'ashift=%" PRIu64 "' "
+ "property: only values between %" PRId32 " "
+ "and %" PRId32 " are allowed.\n"),
+ ashift, ASHIFT_MIN, ASHIFT_MAX);
+ return (NULL);
+ }
+ }
+ }
+
+ /*
+ * If the device is known to incorrectly report its physical sector
+ * size, explicitly provide the known correct value.
+ */
+ if (ashift == 0) {
+ int sector_size;
+
+ if (check_sector_size_database(path, &sector_size) == B_TRUE)
+ ashift = highbit64(sector_size) - 1;
+ }
+
+ if (ashift > 0)
+ (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);
+
+ return (vdev);
+}
+
+/*
+ * Go through and verify the replication level of the pool is consistent.
+ * Performs the following checks:
+ *
+ * For the new spec, verifies that devices in mirrors and raidz are the
+ * same size.
+ *
+ * If the current configuration already has inconsistent replication
+ * levels, ignore any other potential problems in the new spec.
+ *
+ * Otherwise, make sure that the current spec (if there is one) and the new
+ * spec have consistent replication levels.
+ *
+ * If there is no current spec (create), make sure new spec has at least
+ * one general purpose vdev.
+ */
+typedef struct replication_level {
+ char *zprl_type;
+ uint64_t zprl_children;
+ uint64_t zprl_parity;
+} replication_level_t;
+
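+/* Tolerated size difference between devices in the same top-level vdev */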
+#define ZPOOL_FUZZ (16 * 1024 * 1024)
+
+static boolean_t
+is_raidz_mirror(replication_level_t *a, replication_level_t *b,
+ replication_level_t **raidz, replication_level_t **mirror)
+{
+ if (strcmp(a->zprl_type, "raidz") == 0 &&
+ strcmp(b->zprl_type, "mirror") == 0) {
+ *raidz = a;
+ *mirror = b;
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Given a list of toplevel vdevs, return the current replication level. If
+ * the config is inconsistent, then NULL is returned. If 'fatal' is set, then
+ * an error message will be displayed for each self-inconsistent vdev.
+ */
+static replication_level_t *
+get_replication(nvlist_t *nvroot, boolean_t fatal)
+{
+ nvlist_t **top;
+ uint_t t, toplevels;
+ nvlist_t **child;
+ uint_t c, children;
+ nvlist_t *nv;
+ char *type;
+ replication_level_t lastrep = {0};
+ replication_level_t rep;
+ replication_level_t *ret;
+ replication_level_t *raidz, *mirror;
+ boolean_t dontreport;
+
+ ret = safe_malloc(sizeof (replication_level_t));
+
+ verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &top, &toplevels) == 0);
+
+ for (t = 0; t < toplevels; t++) {
+ uint64_t is_log = B_FALSE;
+
+ nv = top[t];
+
+ /*
+ * For separate logs we ignore the top level vdev replication
+ * constraints.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
+ if (is_log)
+ continue;
+
+ /* Ignore holes introduced by removing aux devices */
+ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+ if (strcmp(type, VDEV_TYPE_HOLE) == 0)
+ continue;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+ /*
+ * This is a 'file' or 'disk' vdev.
+ */
+ rep.zprl_type = type;
+ rep.zprl_children = 1;
+ rep.zprl_parity = 0;
+ } else {
+ int64_t vdev_size;
+
+ /*
+ * This is a mirror or RAID-Z vdev. Go through and make
+ * sure the contents are all the same (files vs. disks),
+ * keeping track of the number of elements in the
+ * process.
+ *
+ * We also check that the size of each vdev (if it can
+ * be determined) is the same.
+ */
+ rep.zprl_type = type;
+ rep.zprl_children = 0;
+
+ if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
+ verify(nvlist_lookup_uint64(nv,
+ ZPOOL_CONFIG_NPARITY,
+ &rep.zprl_parity) == 0);
+ assert(rep.zprl_parity != 0);
+ } else {
+ rep.zprl_parity = 0;
+ }
+
+ /*
+ * The 'dontreport' variable indicates that we've
+ * already reported an error for this spec, so don't
+ * bother doing it again.
+ */
+ type = NULL;
+ dontreport = 0;
+ vdev_size = -1LL;
+ for (c = 0; c < children; c++) {
+ nvlist_t *cnv = child[c];
+ char *path;
+ struct stat64 statbuf;
+ int64_t size = -1LL;
+ char *childtype;
+ int fd, err;
+
+ rep.zprl_children++;
+
+ verify(nvlist_lookup_string(cnv,
+ ZPOOL_CONFIG_TYPE, &childtype) == 0);
+
+ /*
+ * If this is a replacing or spare vdev, then
+ * get the real first child of the vdev: do this
+ * in a loop because replacing and spare vdevs
+ * can be nested.
+ */
+ while (strcmp(childtype,
+ VDEV_TYPE_REPLACING) == 0 ||
+ strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
+ nvlist_t **rchild;
+ uint_t rchildren;
+
+ verify(nvlist_lookup_nvlist_array(cnv,
+ ZPOOL_CONFIG_CHILDREN, &rchild,
+ &rchildren) == 0);
+ assert(rchildren == 2);
+ cnv = rchild[0];
+
+ verify(nvlist_lookup_string(cnv,
+ ZPOOL_CONFIG_TYPE,
+ &childtype) == 0);
+ }
+
+ verify(nvlist_lookup_string(cnv,
+ ZPOOL_CONFIG_PATH, &path) == 0);
+
+ /*
+ * If we have a raidz/mirror that combines disks
+ * with files, report it as an error.
+ */
+ if (!dontreport && type != NULL &&
+ strcmp(type, childtype) != 0) {
+ if (ret != NULL)
+ free(ret);
+ ret = NULL;
+ if (fatal)
+ vdev_error(gettext(
+ "mismatched replication "
+ "level: %s contains both "
+ "files and devices\n"),
+ rep.zprl_type);
+ else
+ return (NULL);
+ dontreport = B_TRUE;
+ }
+
+ /*
+ * According to stat(2), the value of 'st_size'
+ * is undefined for block devices and character
+ * devices. But there is no effective way to
+ * determine the real size in userland.
+ *
+ * Instead, we'll take advantage of an
+ * implementation detail of spec_size(). If the
+ * device is currently open, then we (should)
+ * return a valid size.
+ *
+ * If we still don't get a valid size (indicated
+ * by a size of 0 or MAXOFFSET_T), then ignore
+ * this device altogether.
+ */
+ if ((fd = open(path, O_RDONLY)) >= 0) {
+ err = fstat64_blk(fd, &statbuf);
+ (void) close(fd);
+ } else {
+ err = stat64(path, &statbuf);
+ }
+
+ if (err != 0 ||
+ statbuf.st_size == 0 ||
+ statbuf.st_size == MAXOFFSET_T)
+ continue;
+
+ size = statbuf.st_size;
+
+ /*
+ * Also make sure that devices and
+ * slices have a consistent size. If
+ * they differ by a significant amount
+ * (~16MB) then report an error.
+ */
+ if (!dontreport &&
+ (vdev_size != -1LL &&
+ (llabs(size - vdev_size) >
+ ZPOOL_FUZZ))) {
+ if (ret != NULL)
+ free(ret);
+ ret = NULL;
+ if (fatal)
+ vdev_error(gettext(
+ "%s contains devices of "
+ "different sizes\n"),
+ rep.zprl_type);
+ else
+ return (NULL);
+ dontreport = B_TRUE;
+ }
+
+ type = childtype;
+ vdev_size = size;
+ }
+ }
+
+ /*
+ * At this point, we have the replication of the last toplevel
+ * vdev in 'rep'. Compare it to 'lastrep' to see if it is
+ * different.
+ */
+ if (lastrep.zprl_type != NULL) {
+ if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
+ is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
+ /*
+ * Accept raidz and mirror vdevs when they can
+ * handle the same number of disk failures.
+ */
+ if (raidz->zprl_parity !=
+ mirror->zprl_children - 1) {
+ if (ret != NULL)
+ free(ret);
+ ret = NULL;
+ if (fatal)
+ vdev_error(gettext(
+ "mismatched replication "
+ "level: "
+ "%s and %s vdevs with "
+ "different redundancy, "
+ "%llu vs. %llu (%llu-way) "
+ "are present\n"),
+ raidz->zprl_type,
+ mirror->zprl_type,
+ raidz->zprl_parity,
+ mirror->zprl_children - 1,
+ mirror->zprl_children);
+ else
+ return (NULL);
+ }
+ } else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
+ 0) {
+ if (ret != NULL)
+ free(ret);
+ ret = NULL;
+ if (fatal)
+ vdev_error(gettext(
+ "mismatched replication level: "
+ "both %s and %s vdevs are "
+ "present\n"),
+ lastrep.zprl_type, rep.zprl_type);
+ else
+ return (NULL);
+ } else if (lastrep.zprl_parity != rep.zprl_parity) {
+ if (ret)
+ free(ret);
+ ret = NULL;
+ if (fatal)
+ vdev_error(gettext(
+ "mismatched replication level: "
+ "both %llu and %llu device parity "
+ "%s vdevs are present\n"),
+ lastrep.zprl_parity,
+ rep.zprl_parity,
+ rep.zprl_type);
+ else
+ return (NULL);
+ } else if (lastrep.zprl_children != rep.zprl_children) {
+ if (ret)
+ free(ret);
+ ret = NULL;
+ if (fatal)
+ vdev_error(gettext(
+ "mismatched replication level: "
+ "both %llu-way and %llu-way %s "
+ "vdevs are present\n"),
+ lastrep.zprl_children,
+ rep.zprl_children,
+ rep.zprl_type);
+ else
+ return (NULL);
+ }
+ }
+ lastrep = rep;
+ }
+
+ if (ret != NULL)
+ *ret = rep;
+
+ return (ret);
+}
+
+/*
+ * Check the replication level of the vdev spec against the current pool. Calls
+ * get_replication() to make sure the new spec is self-consistent. If the
+ * current pool does not have a consistent replication level, then we ignore
+ * any errors in the new spec. Otherwise, report any difference between the two.
+ */
+static int
+check_replication(nvlist_t *config, nvlist_t *newroot)
+{
+ nvlist_t **child;
+ uint_t children;
+ replication_level_t *current = NULL, *new;
+ replication_level_t *raidz, *mirror;
+ int ret;
+
+ /*
+ * If we have a current pool configuration, check to see if it's
+ * self-consistent. If not, simply return success.
+ */
+ if (config != NULL) {
+ nvlist_t *nvroot;
+
+ verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ if ((current = get_replication(nvroot, B_FALSE)) == NULL)
+ return (0);
+ }
+ /*
+ * for spares there may be no children, and therefore no
+ * replication level to check
+ */
+ if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) || (children == 0)) {
+ free(current);
+ return (0);
+ }
+
+ /*
+ * If all we have is logs then there's no replication level to check.
+ */
+ if (num_logs(newroot) == children) {
+ free(current);
+ return (0);
+ }
+
+ /*
+ * Get the replication level of the new vdev spec, reporting any
+ * inconsistencies found.
+ */
+ if ((new = get_replication(newroot, B_TRUE)) == NULL) {
+ free(current);
+ return (-1);
+ }
+
+ /*
+ * Check to see if the new vdev spec matches the replication level of
+ * the current pool.
+ */
+ ret = 0;
+ if (current != NULL) {
+ if (is_raidz_mirror(current, new, &raidz, &mirror) ||
+ is_raidz_mirror(new, current, &raidz, &mirror)) {
+ if (raidz->zprl_parity != mirror->zprl_children - 1) {
+ vdev_error(gettext(
+ "mismatched replication level: pool and "
+ "new vdev with different redundancy, %s "
+ "and %s vdevs, %llu vs. %llu (%llu-way)\n"),
+ raidz->zprl_type,
+ mirror->zprl_type,
+ raidz->zprl_parity,
+ mirror->zprl_children - 1,
+ mirror->zprl_children);
+ ret = -1;
+ }
+ } else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
+ vdev_error(gettext(
+ "mismatched replication level: pool uses %s "
+ "and new vdev is %s\n"),
+ current->zprl_type, new->zprl_type);
+ ret = -1;
+ } else if (current->zprl_parity != new->zprl_parity) {
+ vdev_error(gettext(
+ "mismatched replication level: pool uses %llu "
+ "device parity and new vdev uses %llu\n"),
+ current->zprl_parity, new->zprl_parity);
+ ret = -1;
+ } else if (current->zprl_children != new->zprl_children) {
+ vdev_error(gettext(
+ "mismatched replication level: pool uses %llu-way "
+ "%s and new vdev uses %llu-way %s\n"),
+ current->zprl_children, current->zprl_type,
+ new->zprl_children, new->zprl_type);
+ ret = -1;
+ }
+ }
+
+ free(new);
+ if (current != NULL)
+ free(current);
+
+ return (ret);
+}
+
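+/*
+ * Zero the first 4k of the device at 'path' so that any stale filesystem or
+ * partition magic left behind cannot be misidentified later.
+ */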
+static int
+zero_label(char *path)
+{
+ const int size = 4096;
+ char buf[size];
+ int err, fd;
+
+ if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) {
+ (void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
+ path, strerror(errno));
+ return (-1);
+ }
+
+ memset(buf, 0, size);
+ err = write(fd, buf, size);
+ (void) fdatasync(fd);
+ (void) close(fd);
+
+ if (err == -1) {
+ (void) fprintf(stderr, gettext("cannot zero first %d bytes "
+ "of '%s': %s\n"), size, path, strerror(errno));
+ return (-1);
+ }
+
+ if (err != size) {
+ (void) fprintf(stderr, gettext("could only zero %d/%d bytes "
+ "of '%s'\n"), err, size, path);
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * Go through and find any whole disks in the vdev specification, labeling them
+ * as appropriate. When constructing the vdev spec, we were unable to open this
+ * device in order to provide a devid. Now that we have labeled the disk and
+ * know that slice 0 is valid, we can construct the devid.
+ *
+ * If the disk was already labeled with an EFI label, we will have gotten the
+ * devid already (because we were able to open the whole disk). Otherwise, we
+ * need to get the devid after we label the disk.
+ */
+static int
+make_disks(zpool_handle_t *zhp, nvlist_t *nv)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ char *type, *path;
+ char devpath[MAXPATHLEN];
+ char udevpath[MAXPATHLEN];
+ uint64_t wholedisk;
+ struct stat64 statbuf;
+ int is_exclusive = 0;
+ int fd;
+ int ret;
+
+ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+
+ if (strcmp(type, VDEV_TYPE_DISK) != 0)
+ return (0);
+
+ /*
+ * We have a disk device. If this is a whole disk, write
+ * out the EFI partition table; otherwise write zeros to
+ * the first 4k of the partition. This is to ensure that
+ * libblkid will not misidentify the partition due to a
+ * magic value left by the previous filesystem.
+ */
+ verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
+ verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ &wholedisk));
+
+ if (!wholedisk) {
+ /*
+ * Update device id string for mpath nodes (Linux only)
+ */
+ if (is_mpath_whole_disk(path))
+ update_vdev_config_dev_strs(nv);
+
+ if (!is_spare(NULL, path))
+ (void) zero_label(path);
+ return (0);
+ }
+
+ if (realpath(path, devpath) == NULL) {
+ ret = errno;
+ (void) fprintf(stderr,
+ gettext("cannot resolve path '%s'\n"), path);
+ return (ret);
+ }
+
+ /*
+ * Remove any previously existing symlink from a udev path to
+ * the device before labeling the disk. This ensures that
+ * only newly created links are used. Otherwise there is a
+ * window between when udev deletes and recreates the link
+ * during which access attempts will fail with ENOENT.
+ */
+ strlcpy(udevpath, path, MAXPATHLEN);
+ (void) zfs_append_partition(udevpath, MAXPATHLEN);
+
+ fd = open(devpath, O_RDWR|O_EXCL);
+ if (fd == -1) {
+ if (errno == EBUSY)
+ is_exclusive = 1;
+#ifdef __FreeBSD__
+ if (errno == EPERM)
+ is_exclusive = 1;
+#endif
+ } else {
+ (void) close(fd);
+ }
+
+ /*
+ * If the partition exists, contains a valid spare label,
+ * and is opened exclusively, there is no need to partition
+ * it. Hot spares have already been partitioned and are
+ * held open exclusively by the kernel as a safety measure.
+ *
+ * If the provided path is for a /dev/disk/ device its
+ * symbolic link will be removed, partition table created,
+ * and then block until udev creates the new link.
+ */
+ if (!is_exclusive && !is_spare(NULL, udevpath)) {
+ char *devnode = strrchr(devpath, '/') + 1;
+
+ ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
+ if (ret == 0) {
+ ret = lstat64(udevpath, &statbuf);
+ if (ret == 0 && S_ISLNK(statbuf.st_mode))
+ (void) unlink(udevpath);
+ }
+
+ /*
+ * When labeling a pool the raw device node name
+ * is provided as it appears under /dev/.
+ */
+ if (zpool_label_disk(g_zfs, zhp, devnode) == -1)
+ return (-1);
+
+ /*
+ * Wait for udev to signal the device is available
+ * by the provided path.
+ */
+ ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
+ if (ret) {
+ (void) fprintf(stderr,
+ gettext("missing link: %s was "
+ "partitioned but %s is missing\n"),
+ devnode, udevpath);
+ return (ret);
+ }
+
+ ret = zero_label(udevpath);
+ if (ret)
+ return (ret);
+ }
+
+ /*
+ * Update the path to refer to the partition. The presence of
+ * the 'whole_disk' field indicates to the CLI that we should
+ * chop off the partition number when displaying the device in
+ * future output.
+ */
+ verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);
+
+ /*
+ * Update device id strings for whole disks (Linux only)
+ */
+ update_vdev_config_dev_strs(nv);
+
+ return (0);
+ }
+
+ for (c = 0; c < children; c++)
+ if ((ret = make_disks(zhp, child[c])) != 0)
+ return (ret);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+ &child, &children) == 0)
+ for (c = 0; c < children; c++)
+ if ((ret = make_disks(zhp, child[c])) != 0)
+ return (ret);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0)
+ for (c = 0; c < children; c++)
+ if ((ret = make_disks(zhp, child[c])) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * Go through and find any devices that are in use. We rely on libdiskmgt for
+ * the majority of this task.
+ */
+static boolean_t
+is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
+ boolean_t replacing, boolean_t isspare)
+{
+ nvlist_t **child;
+ uint_t c, children;
+ char *type, *path;
+ int ret = 0;
+ char buf[MAXPATHLEN];
+ uint64_t wholedisk = B_FALSE;
+ boolean_t anyinuse = B_FALSE;
+
+ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+
+ verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
+ if (strcmp(type, VDEV_TYPE_DISK) == 0)
+ verify(!nvlist_lookup_uint64(nv,
+ ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));
+
+ /*
+ * As a generic check, we look to see if this is a replace of a
+ * hot spare within the same pool. If so, we allow it
+ * regardless of what libblkid or zpool_in_use() says.
+ */
+ if (replacing) {
+ (void) strlcpy(buf, path, sizeof (buf));
+ if (wholedisk) {
+ ret = zfs_append_partition(buf, sizeof (buf));
+ if (ret == -1)
+ return (-1);
+ }
+
+ if (is_spare(config, buf))
+ return (B_FALSE);
+ }
+
+ if (strcmp(type, VDEV_TYPE_DISK) == 0)
+ ret = check_device(path, force, isspare, wholedisk);
+
+ else if (strcmp(type, VDEV_TYPE_FILE) == 0)
+ ret = check_file(path, force, isspare);
+
+ return (ret != 0);
+ }
+
+ for (c = 0; c < children; c++)
+ if (is_device_in_use(config, child[c], force, replacing,
+ B_FALSE))
+ anyinuse = B_TRUE;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+ &child, &children) == 0)
+ for (c = 0; c < children; c++)
+ if (is_device_in_use(config, child[c], force, replacing,
+ B_TRUE))
+ anyinuse = B_TRUE;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0)
+ for (c = 0; c < children; c++)
+ if (is_device_in_use(config, child[c], force, replacing,
+ B_FALSE))
+ anyinuse = B_TRUE;
+
+ return (anyinuse);
+}
+
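+/*
+ * If 'type' is one of the grouping keywords (mirror, raidzN, spare, log,
+ * special, dedup, cache), return the corresponding vdev type string and set
+ * the minimum and maximum number of leaf devices it accepts; otherwise
+ * return NULL. For example, "raidz2" maps to VDEV_TYPE_RAIDZ with
+ * *mindev = 3 (parity + 1) and *maxdev = 255.
+ */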
+static const char *
+is_grouping(const char *type, int *mindev, int *maxdev)
+{
+ if (strncmp(type, "raidz", 5) == 0) {
+ const char *p = type + 5;
+ char *end;
+ long nparity;
+
+ if (*p == '\0') {
+ nparity = 1;
+ } else if (*p == '0') {
+ return (NULL); /* no zero prefixes allowed */
+ } else {
+ errno = 0;
+ nparity = strtol(p, &end, 10);
+ if (errno != 0 || nparity < 1 || nparity >= 255 ||
+ *end != '\0')
+ return (NULL);
+ }
+
+ if (mindev != NULL)
+ *mindev = nparity + 1;
+ if (maxdev != NULL)
+ *maxdev = 255;
+ return (VDEV_TYPE_RAIDZ);
+ }
+
+ if (maxdev != NULL)
+ *maxdev = INT_MAX;
+
+ if (strcmp(type, "mirror") == 0) {
+ if (mindev != NULL)
+ *mindev = 2;
+ return (VDEV_TYPE_MIRROR);
+ }
+
+ if (strcmp(type, "spare") == 0) {
+ if (mindev != NULL)
+ *mindev = 1;
+ return (VDEV_TYPE_SPARE);
+ }
+
+ if (strcmp(type, "log") == 0) {
+ if (mindev != NULL)
+ *mindev = 1;
+ return (VDEV_TYPE_LOG);
+ }
+
+ if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
+ strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
+ if (mindev != NULL)
+ *mindev = 1;
+ return (type);
+ }
+
+ if (strcmp(type, "cache") == 0) {
+ if (mindev != NULL)
+ *mindev = 1;
+ return (VDEV_TYPE_L2CACHE);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Construct a syntactically valid vdev specification,
+ * and ensure that all devices and files exist and can be opened.
+ * Note: we don't bother freeing anything in the error paths
+ * because the program is just going to exit anyway.
+ */
+static nvlist_t *
+construct_spec(nvlist_t *props, int argc, char **argv)
+{
+ nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
+ int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
+ const char *type;
+ uint64_t is_log, is_special, is_dedup;
+ boolean_t seen_logs;
+
+ top = NULL;
+ toplevels = 0;
+ spares = NULL;
+ l2cache = NULL;
+ nspares = 0;
+ nlogs = 0;
+ nl2cache = 0;
+ is_log = is_special = is_dedup = B_FALSE;
+ seen_logs = B_FALSE;
+ nvroot = NULL;
+
+ while (argc > 0) {
+ nv = NULL;
+
+ /*
+ * If it's a mirror or raidz, the subsequent arguments are
+ * its leaves -- until we encounter the next mirror or raidz.
+ */
+ if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
+ nvlist_t **child = NULL;
+ int c, children = 0;
+
+ if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
+ if (spares != NULL) {
+ (void) fprintf(stderr,
+ gettext("invalid vdev "
+ "specification: 'spare' can be "
+ "specified only once\n"));
+ goto spec_out;
+ }
+ is_log = is_special = is_dedup = B_FALSE;
+ }
+
+ if (strcmp(type, VDEV_TYPE_LOG) == 0) {
+ if (seen_logs) {
+ (void) fprintf(stderr,
+ gettext("invalid vdev "
+ "specification: 'log' can be "
+ "specified only once\n"));
+ goto spec_out;
+ }
+ seen_logs = B_TRUE;
+ is_log = B_TRUE;
+ is_special = B_FALSE;
+ is_dedup = B_FALSE;
+ argc--;
+ argv++;
+ /*
+ * A log is not a real grouping device.
+ * We just set is_log and continue.
+ */
+ continue;
+ }
+
+ if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
+ is_special = B_TRUE;
+ is_log = B_FALSE;
+ is_dedup = B_FALSE;
+ argc--;
+ argv++;
+ continue;
+ }
+
+ if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
+ is_dedup = B_TRUE;
+ is_log = B_FALSE;
+ is_special = B_FALSE;
+ argc--;
+ argv++;
+ continue;
+ }
+
+ if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
+ if (l2cache != NULL) {
+ (void) fprintf(stderr,
+ gettext("invalid vdev "
+ "specification: 'cache' can be "
+ "specified only once\n"));
+ goto spec_out;
+ }
+ is_log = is_special = is_dedup = B_FALSE;
+ }
+
+ if (is_log || is_special || is_dedup) {
+ if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
+ (void) fprintf(stderr,
+ gettext("invalid vdev "
+ "specification: unsupported '%s' "
+ "device: %s\n"), is_log ? "log" :
+ "special", type);
+ goto spec_out;
+ }
+ nlogs++;
+ }
+
+ for (c = 1; c < argc; c++) {
+ if (is_grouping(argv[c], NULL, NULL) != NULL)
+ break;
+ children++;
+ child = realloc(child,
+ children * sizeof (nvlist_t *));
+ if (child == NULL)
+ zpool_no_memory();
+ if ((nv = make_leaf_vdev(props, argv[c],
+ B_FALSE)) == NULL) {
+ for (c = 0; c < children - 1; c++)
+ nvlist_free(child[c]);
+ free(child);
+ goto spec_out;
+ }
+
+ child[children - 1] = nv;
+ }
+
+ if (children < mindev) {
+ (void) fprintf(stderr, gettext("invalid vdev "
+ "specification: %s requires at least %d "
+ "devices\n"), argv[0], mindev);
+ for (c = 0; c < children; c++)
+ nvlist_free(child[c]);
+ free(child);
+ goto spec_out;
+ }
+
+ if (children > maxdev) {
+ (void) fprintf(stderr, gettext("invalid vdev "
+ "specification: %s supports no more than "
+ "%d devices\n"), argv[0], maxdev);
+ for (c = 0; c < children; c++)
+ nvlist_free(child[c]);
+ free(child);
+ goto spec_out;
+ }
+
+ argc -= c;
+ argv += c;
+
+ if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
+ spares = child;
+ nspares = children;
+ continue;
+ } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
+ l2cache = child;
+ nl2cache = children;
+ continue;
+ } else {
+ /* create a top-level vdev with children */
+ verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
+ 0) == 0);
+ verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
+ type) == 0);
+ verify(nvlist_add_uint64(nv,
+ ZPOOL_CONFIG_IS_LOG, is_log) == 0);
+ if (is_log)
+ verify(nvlist_add_string(nv,
+ ZPOOL_CONFIG_ALLOCATION_BIAS,
+ VDEV_ALLOC_BIAS_LOG) == 0);
+ if (is_special) {
+ verify(nvlist_add_string(nv,
+ ZPOOL_CONFIG_ALLOCATION_BIAS,
+ VDEV_ALLOC_BIAS_SPECIAL) == 0);
+ }
+ if (is_dedup) {
+ verify(nvlist_add_string(nv,
+ ZPOOL_CONFIG_ALLOCATION_BIAS,
+ VDEV_ALLOC_BIAS_DEDUP) == 0);
+ }
+ if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
+ verify(nvlist_add_uint64(nv,
+ ZPOOL_CONFIG_NPARITY,
+ mindev - 1) == 0);
+ }
+ verify(nvlist_add_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, child,
+ children) == 0);
+
+ for (c = 0; c < children; c++)
+ nvlist_free(child[c]);
+ free(child);
+ }
+ } else {
+ /*
+ * We have a device. Pass off to make_leaf_vdev() to
+ * construct the appropriate nvlist describing the vdev.
+ */
+ if ((nv = make_leaf_vdev(props, argv[0],
+ is_log)) == NULL)
+ goto spec_out;
+
+ if (is_log)
+ nlogs++;
+ if (is_special) {
+ verify(nvlist_add_string(nv,
+ ZPOOL_CONFIG_ALLOCATION_BIAS,
+ VDEV_ALLOC_BIAS_SPECIAL) == 0);
+ }
+ if (is_dedup) {
+ verify(nvlist_add_string(nv,
+ ZPOOL_CONFIG_ALLOCATION_BIAS,
+ VDEV_ALLOC_BIAS_DEDUP) == 0);
+ }
+ argc--;
+ argv++;
+ }
+
+ toplevels++;
+ top = realloc(top, toplevels * sizeof (nvlist_t *));
+ if (top == NULL)
+ zpool_no_memory();
+ top[toplevels - 1] = nv;
+ }
+
+ if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
+ (void) fprintf(stderr, gettext("invalid vdev "
+ "specification: at least one toplevel vdev must be "
+ "specified\n"));
+ goto spec_out;
+ }
+
+ if (seen_logs && nlogs == 0) {
+ (void) fprintf(stderr, gettext("invalid vdev specification: "
+ "log requires at least 1 device\n"));
+ goto spec_out;
+ }
+
+ /*
+ * Finally, create nvroot and add all top-level vdevs to it.
+ */
+ verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
+ verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
+ verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ top, toplevels) == 0);
+ if (nspares != 0)
+ verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ spares, nspares) == 0);
+ if (nl2cache != 0)
+ verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ l2cache, nl2cache) == 0);
+
+spec_out:
+ for (t = 0; t < toplevels; t++)
+ nvlist_free(top[t]);
+ for (t = 0; t < nspares; t++)
+ nvlist_free(spares[t]);
+ for (t = 0; t < nl2cache; t++)
+ nvlist_free(l2cache[t]);
+
+ free(spares);
+ free(l2cache);
+ free(top);
+
+ return (nvroot);
+}
+
+nvlist_t *
+split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
+ splitflags_t flags, int argc, char **argv)
+{
+ nvlist_t *newroot = NULL, **child;
+ uint_t c, children;
+
+ if (argc > 0) {
+ if ((newroot = construct_spec(props, argc, argv)) == NULL) {
+ (void) fprintf(stderr, gettext("Unable to build a "
+ "pool from the specified devices\n"));
+ return (NULL);
+ }
+
+ if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
+ nvlist_free(newroot);
+ return (NULL);
+ }
+
+ /* avoid any tricks in the spec */
+ verify(nvlist_lookup_nvlist_array(newroot,
+ ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
+ for (c = 0; c < children; c++) {
+ char *path;
+ const char *type;
+ int min, max;
+
+ verify(nvlist_lookup_string(child[c],
+ ZPOOL_CONFIG_PATH, &path) == 0);
+ if ((type = is_grouping(path, &min, &max)) != NULL) {
+ (void) fprintf(stderr, gettext("Cannot use "
+ "'%s' as a device for splitting\n"), type);
+ nvlist_free(newroot);
+ return (NULL);
+ }
+ }
+ }
+
+ if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
+ nvlist_free(newroot);
+ return (NULL);
+ }
+
+ return (newroot);
+}
+
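+/*
+ * Count the top-level vdevs in 'nvroot' that are neither log vdevs nor
+ * members of an allocation class (special/dedup), i.e. the general purpose
+ * data vdevs.
+ */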
+static int
+num_normal_vdevs(nvlist_t *nvroot)
+{
+ nvlist_t **top;
+ uint_t t, toplevels, normal = 0;
+
+ verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ &top, &toplevels) == 0);
+
+ for (t = 0; t < toplevels; t++) {
+ uint64_t log = B_FALSE;
+
+ (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log);
+ if (log)
+ continue;
+ if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS))
+ continue;
+
+ normal++;
+ }
+
+ return (normal);
+}
+
+/*
+ * Get and validate the contents of the given vdev specification. This ensures
+ * that the nvlist returned is well-formed, that all the devices exist, and that
+ * they are not currently in use by any other known consumer. The 'poolconfig'
+ * parameter is the current configuration of the pool when adding devices
+ * existing pool, and is used to perform additional checks, such as changing the
+ * replication level of the pool. It can be 'NULL' to indicate that this is a
+ * new pool. The 'force' flag controls whether devices should be forcefully
+ * added, even if they appear in use.
+ */
+nvlist_t *
+make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
+ boolean_t replacing, boolean_t dryrun, int argc, char **argv)
+{
+ nvlist_t *newroot;
+ nvlist_t *poolconfig = NULL;
+ is_force = force;
+
+ /*
+ * Construct the vdev specification. If this is successful, we know
+ * that we have a valid specification, and that all devices can be
+ * opened.
+ */
+ if ((newroot = construct_spec(props, argc, argv)) == NULL)
+ return (NULL);
+
+ if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) {
+ nvlist_free(newroot);
+ return (NULL);
+ }
+
+ /*
+ * Validate each device to make sure that it's not shared with another
+ * subsystem. We do this even if 'force' is set, because there are some
+ * uses (such as a dedicated dump device) that even '-f' cannot
+ * override.
+ */
+ if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
+ nvlist_free(newroot);
+ return (NULL);
+ }
+
+ /*
+ * Check the replication level of the given vdevs and report any errors
+ * found. We include the existing pool spec, if any, as we need to
+ * catch changes against the existing replication level.
+ */
+ if (check_rep && check_replication(poolconfig, newroot) != 0) {
+ nvlist_free(newroot);
+ return (NULL);
+ }
+
+ /*
+ * On pool create the new vdev spec must have one normal vdev.
+ */
+ if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {
+ vdev_error(gettext("at least one general top-level vdev must "
+ "be specified\n"));
+ nvlist_free(newroot);
+ return (NULL);
+ }
+
+ /*
+ * Run through the vdev specification and label any whole disks found.
+ */
+ if (!dryrun && make_disks(zhp, newroot) != 0) {
+ nvlist_free(newroot);
+ return (NULL);
+ }
+
+ return (newroot);
+}
diff --git a/sys/contrib/openzfs/cmd/zstream/.gitignore b/sys/contrib/openzfs/cmd/zstream/.gitignore
new file mode 100644
index 000000000000..fd1240d55c4b
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/.gitignore
@@ -0,0 +1 @@
+zstream
diff --git a/sys/contrib/openzfs/cmd/zstream/Makefile.am b/sys/contrib/openzfs/cmd/zstream/Makefile.am
new file mode 100644
index 000000000000..5e2ac5d69f1a
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/Makefile.am
@@ -0,0 +1,15 @@
+include $(top_srcdir)/config/Rules.am
+
+sbin_PROGRAMS = zstream
+
+zstream_SOURCES = \
+ zstream.c \
+ zstream.h \
+ zstream_dump.c \
+ zstream_redup.c \
+ zstream_token.c
+
+zstream_LDADD = \
+ $(abs_top_builddir)/lib/libzfs/libzfs.la \
+ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+ $(abs_top_builddir)/lib/libnvpair/libnvpair.la
diff --git a/sys/contrib/openzfs/cmd/zstream/zstream.c b/sys/contrib/openzfs/cmd/zstream/zstream.c
new file mode 100644
index 000000000000..cbcb560a8638
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/zstream.c
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2020 by Datto Inc. All rights reserved.
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <libintl.h>
+#include <stddef.h>
+#include <libzfs.h>
+#include "zstream.h"
+
+void
+zstream_usage(void)
+{
+ (void) fprintf(stderr,
+ "usage: zstream command args ...\n"
+ "Available commands are:\n"
+ "\n"
+ "\tzstream dump [-vCd] FILE\n"
+ "\t... | zstream dump [-vCd]\n"
+ "\n"
+ "\tzstream token resume_token\n"
+ "\n"
+ "\tzstream redup [-v] FILE | ...\n");
+ exit(1);
+}
+
+int
+main(int argc, char *argv[])
+{
+ if (argc < 2)
+ zstream_usage();
+
+ char *subcommand = argv[1];
+
+ if (strcmp(subcommand, "dump") == 0) {
+ return (zstream_do_dump(argc - 1, argv + 1));
+ } else if (strcmp(subcommand, "token") == 0) {
+ return (zstream_do_token(argc - 1, argv + 1));
+ } else if (strcmp(subcommand, "redup") == 0) {
+ return (zstream_do_redup(argc - 1, argv + 1));
+ } else {
+ zstream_usage();
+ }
+}
diff --git a/sys/contrib/openzfs/cmd/zstream/zstream.h b/sys/contrib/openzfs/cmd/zstream/zstream.h
new file mode 100644
index 000000000000..319fecb2876b
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/zstream.h
@@ -0,0 +1,36 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020 by Delphix. All rights reserved.
+ */
+
+#ifndef _ZSTREAM_H
+#define _ZSTREAM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int zstream_do_redup(int, char *[]);
+extern int zstream_do_dump(int, char *[]);
+extern int zstream_do_token(int, char *[]);
+extern void zstream_usage(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZSTREAM_H */
diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_dump.c b/sys/contrib/openzfs/cmd/zstream/zstream_dump.c
new file mode 100644
index 000000000000..45cf7b97a147
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/zstream_dump.c
@@ -0,0 +1,799 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Portions Copyright 2012 Martin Matuska <martin@matuska.org>
+ */
+
+/*
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
+
+#include <ctype.h>
+#include <libnvpair.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <stddef.h>
+
+#include <sys/dmu.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zio.h>
+#include <zfs_fletcher.h>
+#include "zstream.h"
+
+/*
+ * If dump mode is enabled, the number of bytes to print per line
+ */
+#define BYTES_PER_LINE 16
+/*
+ * If dump mode is enabled, the number of bytes to group together, separated
+ * by newlines or spaces
+ */
+#define DUMP_GROUPING 4
+
+uint64_t total_stream_len = 0;
+FILE *send_stream = 0;
+boolean_t do_byteswap = B_FALSE;
+boolean_t do_cksum = B_TRUE;
+
+static void *
+safe_malloc(size_t size)
+{
+ void *rv = malloc(size);
+ if (rv == NULL) {
+ (void) fprintf(stderr, "ERROR: failed to allocate %zu bytes\n",
+ size);
+ abort();
+ }
+ return (rv);
+}
+
+/*
+ * ssread - send stream read.
+ *
+ * Read while computing incremental checksum
+ */
+static size_t
+ssread(void *buf, size_t len, zio_cksum_t *cksum)
+{
+ size_t outlen;
+
+ if ((outlen = fread(buf, len, 1, send_stream)) == 0)
+ return (0);
+
+ if (do_cksum) {
+ if (do_byteswap)
+ fletcher_4_incremental_byteswap(buf, len, cksum);
+ else
+ fletcher_4_incremental_native(buf, len, cksum);
+ }
+ total_stream_len += len;
+ return (outlen);
+}
+
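+/*
+ * Read a single replay record header while updating the running stream
+ * checksum; the checksum stored in the header is verified against the
+ * checksum computed over everything that preceded it. Returns the full
+ * header size on success, or 0 on EOF or checksum mismatch.
+ */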
+static size_t
+read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum)
+{
+ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+ size_t r = ssread(drr, sizeof (*drr) - sizeof (zio_cksum_t), cksum);
+ if (r == 0)
+ return (0);
+ zio_cksum_t saved_cksum = *cksum;
+ r = ssread(&drr->drr_u.drr_checksum.drr_checksum,
+ sizeof (zio_cksum_t), cksum);
+ if (r == 0)
+ return (0);
+ if (do_cksum &&
+ !ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.drr_checksum.drr_checksum) &&
+ !ZIO_CHECKSUM_EQUAL(saved_cksum,
+ drr->drr_u.drr_checksum.drr_checksum)) {
+ fprintf(stderr, "invalid checksum\n");
+ (void) printf("Incorrect checksum in record header.\n");
+ (void) printf("Expected checksum = %llx/%llx/%llx/%llx\n",
+ (longlong_t)saved_cksum.zc_word[0],
+ (longlong_t)saved_cksum.zc_word[1],
+ (longlong_t)saved_cksum.zc_word[2],
+ (longlong_t)saved_cksum.zc_word[3]);
+ return (0);
+ }
+ return (sizeof (*drr));
+}
+
+/*
+ * Print part of a block in ASCII characters
+ */
+static void
+print_ascii_block(char *subbuf, int length)
+{
+ int i;
+
+ for (i = 0; i < length; i++) {
+ char char_print = isprint(subbuf[i]) ? subbuf[i] : '.';
+ if (i != 0 && i % DUMP_GROUPING == 0) {
+ (void) printf(" ");
+ }
+ (void) printf("%c", char_print);
+ }
+ (void) printf("\n");
+}
+
+/*
+ * print_block - Dump the contents of a modified block to STDOUT
+ *
+ * Assume that buf has capacity evenly divisible by BYTES_PER_LINE
+ */
+static void
+print_block(char *buf, int length)
+{
+ int i;
+ /*
+ * Start printing ASCII characters at a constant offset, after
+ * the hex prints. Leave 3 characters per byte on a line (2 digit
+ * hex number plus 1 space) plus spaces between characters and
+ * groupings.
+ */
+ int ascii_start = BYTES_PER_LINE * 3 +
+ BYTES_PER_LINE / DUMP_GROUPING + 2;
+
+ for (i = 0; i < length; i += BYTES_PER_LINE) {
+ int j;
+ int this_line_length = MIN(BYTES_PER_LINE, length - i);
+ int print_offset = 0;
+
+ for (j = 0; j < this_line_length; j++) {
+ int buf_offset = i + j;
+
+ /*
+ * Separate every DUMP_GROUPING bytes by a space.
+ */
+ if (buf_offset % DUMP_GROUPING == 0) {
+ print_offset += printf(" ");
+ }
+
+ /*
+ * Print the two-digit hex value for this byte.
+ */
+ unsigned char hex_print = buf[buf_offset];
+ print_offset += printf("%02x ", hex_print);
+ }
+
+ (void) printf("%*s", ascii_start - print_offset, " ");
+
+ print_ascii_block(buf + i, this_line_length);
+ }
+}
+
+/*
+ * Print an array of bytes to stdout as hexadecimal characters. str must
+ * have buf_len * 2 + 1 bytes of space.
+ */
+static void
+sprintf_bytes(char *str, uint8_t *buf, uint_t buf_len)
+{
+ int i, n;
+
+ for (i = 0; i < buf_len; i++) {
+ n = sprintf(str, "%02x", buf[i] & 0xff);
+ str += n;
+ }
+
+ str[0] = '\0';
+}
+
+int
+zstream_do_dump(int argc, char *argv[])
+{
+ char *buf = safe_malloc(SPA_MAXBLOCKSIZE);
+ uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
+ uint64_t total_payload_size = 0;
+ uint64_t total_overhead_size = 0;
+ uint64_t drr_byte_count[DRR_NUMTYPES] = { 0 };
+ char salt[ZIO_DATA_SALT_LEN * 2 + 1];
+ char iv[ZIO_DATA_IV_LEN * 2 + 1];
+ char mac[ZIO_DATA_MAC_LEN * 2 + 1];
+ uint64_t total_records = 0;
+ uint64_t payload_size;
+ dmu_replay_record_t thedrr;
+ dmu_replay_record_t *drr = &thedrr;
+ struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
+ struct drr_end *drre = &thedrr.drr_u.drr_end;
+ struct drr_object *drro = &thedrr.drr_u.drr_object;
+ struct drr_freeobjects *drrfo = &thedrr.drr_u.drr_freeobjects;
+ struct drr_write *drrw = &thedrr.drr_u.drr_write;
+ struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref;
+ struct drr_free *drrf = &thedrr.drr_u.drr_free;
+ struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
+ struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
+ struct drr_object_range *drror = &thedrr.drr_u.drr_object_range;
+ struct drr_redact *drrr = &thedrr.drr_u.drr_redact;
+ struct drr_checksum *drrc = &thedrr.drr_u.drr_checksum;
+ int c;
+ boolean_t verbose = B_FALSE;
+ boolean_t very_verbose = B_FALSE;
+ boolean_t first = B_TRUE;
+ /*
+ * dump flag controls whether the contents of any modified data blocks
+ * are printed to the console during processing of the stream. Warning:
+ * for large streams, this can produce an enormous amount of output.
+ */
+ boolean_t dump = B_FALSE;
+ int err;
+ zio_cksum_t zc = { { 0 } };
+ zio_cksum_t pcksum = { { 0 } };
+
+ while ((c = getopt(argc, argv, ":vCd")) != -1) {
+ switch (c) {
+ case 'C':
+ do_cksum = B_FALSE;
+ break;
+ case 'v':
+ if (verbose)
+ very_verbose = B_TRUE;
+ verbose = B_TRUE;
+ break;
+ case 'd':
+ dump = B_TRUE;
+ verbose = B_TRUE;
+ very_verbose = B_TRUE;
+ break;
+ case ':':
+ (void) fprintf(stderr,
+ "missing argument for '%c' option\n", optopt);
+ zstream_usage();
+ break;
+ case '?':
+ (void) fprintf(stderr, "invalid option '%c'\n",
+ optopt);
+ zstream_usage();
+ break;
+ }
+ }
+
+ if (argc > optind) {
+ const char *filename = argv[optind];
+ send_stream = fopen(filename, "r");
+ if (send_stream == NULL) {
+ (void) fprintf(stderr,
+ "Error while opening file '%s': %s\n",
+ filename, strerror(errno));
+ exit(1);
+ }
+ } else {
+ if (isatty(STDIN_FILENO)) {
+ (void) fprintf(stderr,
+ "Error: The send stream is a binary format "
+ "and can not be read from a\n"
+ "terminal. Standard input must be redirected, "
+ "or a file must be\n"
+ "specified as a command-line argument.\n");
+ exit(1);
+ }
+ send_stream = stdin;
+ }
+
+ fletcher_4_init();
+ while (read_hdr(drr, &zc)) {
+
+ /*
+ * If this is the first DMU record being processed, check for
+ * the magic bytes and figure out the endian-ness based on them.
+ */
+ if (first) {
+ if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+ do_byteswap = B_TRUE;
+ if (do_cksum) {
+ ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
+ /*
+ * recalculate header checksum now
+ * that we know it needs to be
+ * byteswapped.
+ */
+ fletcher_4_incremental_byteswap(drr,
+ sizeof (dmu_replay_record_t), &zc);
+ }
+ } else if (drrb->drr_magic != DMU_BACKUP_MAGIC) {
+ (void) fprintf(stderr, "Invalid stream "
+ "(bad magic number)\n");
+ exit(1);
+ }
+ first = B_FALSE;
+ }
+ if (do_byteswap) {
+ drr->drr_type = BSWAP_32(drr->drr_type);
+ drr->drr_payloadlen =
+ BSWAP_32(drr->drr_payloadlen);
+ }
+
+ /*
+ * At this point, the leading fields of the replay record
+ * (drr_type and drr_payloadlen) have been byte-swapped if
+ * necessary, but the rest of the data structure (the
+ * union of type-specific structures) is still in its
+ * original state.
+ */
+ if (drr->drr_type >= DRR_NUMTYPES) {
+ (void) printf("INVALID record found: type 0x%x\n",
+ drr->drr_type);
+ (void) printf("Aborting.\n");
+ exit(1);
+ }
+
+ drr_record_count[drr->drr_type]++;
+ total_overhead_size += sizeof (*drr);
+ total_records++;
+ payload_size = 0;
+
+ switch (drr->drr_type) {
+ case DRR_BEGIN:
+ if (do_byteswap) {
+ drrb->drr_magic = BSWAP_64(drrb->drr_magic);
+ drrb->drr_versioninfo =
+ BSWAP_64(drrb->drr_versioninfo);
+ drrb->drr_creation_time =
+ BSWAP_64(drrb->drr_creation_time);
+ drrb->drr_type = BSWAP_32(drrb->drr_type);
+ drrb->drr_flags = BSWAP_32(drrb->drr_flags);
+ drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
+ drrb->drr_fromguid =
+ BSWAP_64(drrb->drr_fromguid);
+ }
+
+ (void) printf("BEGIN record\n");
+ (void) printf("\thdrtype = %lld\n",
+ DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo));
+ (void) printf("\tfeatures = %llx\n",
+ DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo));
+ (void) printf("\tmagic = %llx\n",
+ (u_longlong_t)drrb->drr_magic);
+ (void) printf("\tcreation_time = %llx\n",
+ (u_longlong_t)drrb->drr_creation_time);
+ (void) printf("\ttype = %u\n", drrb->drr_type);
+ (void) printf("\tflags = 0x%x\n", drrb->drr_flags);
+ (void) printf("\ttoguid = %llx\n",
+ (u_longlong_t)drrb->drr_toguid);
+ (void) printf("\tfromguid = %llx\n",
+ (u_longlong_t)drrb->drr_fromguid);
+ (void) printf("\ttoname = %s\n", drrb->drr_toname);
+ (void) printf("\tpayloadlen = %u\n",
+ drr->drr_payloadlen);
+ if (verbose)
+ (void) printf("\n");
+
+ if (drr->drr_payloadlen != 0) {
+ nvlist_t *nv;
+ int sz = drr->drr_payloadlen;
+
+ if (sz > SPA_MAXBLOCKSIZE) {
+ free(buf);
+ buf = safe_malloc(sz);
+ }
+ (void) ssread(buf, sz, &zc);
+ if (ferror(send_stream))
+ perror("fread");
+ err = nvlist_unpack(buf, sz, &nv, 0);
+ if (err) {
+ perror(strerror(err));
+ } else {
+ nvlist_print(stdout, nv);
+ nvlist_free(nv);
+ }
+ payload_size = sz;
+ }
+ break;
+
+ case DRR_END:
+ if (do_byteswap) {
+ drre->drr_checksum.zc_word[0] =
+ BSWAP_64(drre->drr_checksum.zc_word[0]);
+ drre->drr_checksum.zc_word[1] =
+ BSWAP_64(drre->drr_checksum.zc_word[1]);
+ drre->drr_checksum.zc_word[2] =
+ BSWAP_64(drre->drr_checksum.zc_word[2]);
+ drre->drr_checksum.zc_word[3] =
+ BSWAP_64(drre->drr_checksum.zc_word[3]);
+ }
+ /*
+ * We compare against the *previous* checksum
+ * value, because the stored checksum is of
+ * everything before the DRR_END record.
+ */
+ if (do_cksum && !ZIO_CHECKSUM_EQUAL(drre->drr_checksum,
+ pcksum)) {
+ (void) printf("Expected checksum differs from "
+ "checksum in stream.\n");
+ (void) printf("Expected checksum = "
+ "%llx/%llx/%llx/%llx\n",
+ (long long unsigned int)pcksum.zc_word[0],
+ (long long unsigned int)pcksum.zc_word[1],
+ (long long unsigned int)pcksum.zc_word[2],
+ (long long unsigned int)pcksum.zc_word[3]);
+ }
+ (void) printf("END checksum = %llx/%llx/%llx/%llx\n",
+ (long long unsigned int)
+ drre->drr_checksum.zc_word[0],
+ (long long unsigned int)
+ drre->drr_checksum.zc_word[1],
+ (long long unsigned int)
+ drre->drr_checksum.zc_word[2],
+ (long long unsigned int)
+ drre->drr_checksum.zc_word[3]);
+
+ ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
+ break;
+
+ case DRR_OBJECT:
+ if (do_byteswap) {
+ drro->drr_object = BSWAP_64(drro->drr_object);
+ drro->drr_type = BSWAP_32(drro->drr_type);
+ drro->drr_bonustype =
+ BSWAP_32(drro->drr_bonustype);
+ drro->drr_blksz = BSWAP_32(drro->drr_blksz);
+ drro->drr_bonuslen =
+ BSWAP_32(drro->drr_bonuslen);
+ drro->drr_raw_bonuslen =
+ BSWAP_32(drro->drr_raw_bonuslen);
+ drro->drr_toguid = BSWAP_64(drro->drr_toguid);
+ drro->drr_maxblkid =
+ BSWAP_64(drro->drr_maxblkid);
+ }
+
+ payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
+
+ if (verbose) {
+ (void) printf("OBJECT object = %llu type = %u "
+ "bonustype = %u blksz = %u bonuslen = %u "
+ "dn_slots = %u raw_bonuslen = %u "
+ "flags = %u maxblkid = %llu "
+ "indblkshift = %u nlevels = %u "
+ "nblkptr = %u\n",
+ (u_longlong_t)drro->drr_object,
+ drro->drr_type,
+ drro->drr_bonustype,
+ drro->drr_blksz,
+ drro->drr_bonuslen,
+ drro->drr_dn_slots,
+ drro->drr_raw_bonuslen,
+ drro->drr_flags,
+ (u_longlong_t)drro->drr_maxblkid,
+ drro->drr_indblkshift,
+ drro->drr_nlevels,
+ drro->drr_nblkptr);
+ }
+ if (drro->drr_bonuslen > 0) {
+ (void) ssread(buf, payload_size, &zc);
+ if (dump)
+ print_block(buf, payload_size);
+ }
+ break;
+
+ case DRR_FREEOBJECTS:
+ if (do_byteswap) {
+ drrfo->drr_firstobj =
+ BSWAP_64(drrfo->drr_firstobj);
+ drrfo->drr_numobjs =
+ BSWAP_64(drrfo->drr_numobjs);
+ drrfo->drr_toguid = BSWAP_64(drrfo->drr_toguid);
+ }
+ if (verbose) {
+ (void) printf("FREEOBJECTS firstobj = %llu "
+ "numobjs = %llu\n",
+ (u_longlong_t)drrfo->drr_firstobj,
+ (u_longlong_t)drrfo->drr_numobjs);
+ }
+ break;
+
+ case DRR_WRITE:
+ if (do_byteswap) {
+ drrw->drr_object = BSWAP_64(drrw->drr_object);
+ drrw->drr_type = BSWAP_32(drrw->drr_type);
+ drrw->drr_offset = BSWAP_64(drrw->drr_offset);
+ drrw->drr_logical_size =
+ BSWAP_64(drrw->drr_logical_size);
+ drrw->drr_toguid = BSWAP_64(drrw->drr_toguid);
+ drrw->drr_key.ddk_prop =
+ BSWAP_64(drrw->drr_key.ddk_prop);
+ drrw->drr_compressed_size =
+ BSWAP_64(drrw->drr_compressed_size);
+ }
+
+ payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+
+ /*
+ * If this is verbose and/or dump output,
+ * print info on the modified block
+ */
+ if (verbose) {
+ sprintf_bytes(salt, drrw->drr_salt,
+ ZIO_DATA_SALT_LEN);
+ sprintf_bytes(iv, drrw->drr_iv,
+ ZIO_DATA_IV_LEN);
+ sprintf_bytes(mac, drrw->drr_mac,
+ ZIO_DATA_MAC_LEN);
+
+ (void) printf("WRITE object = %llu type = %u "
+ "checksum type = %u compression type = %u "
+ "flags = %u offset = %llu "
+ "logical_size = %llu "
+ "compressed_size = %llu "
+ "payload_size = %llu props = %llx "
+ "salt = %s iv = %s mac = %s\n",
+ (u_longlong_t)drrw->drr_object,
+ drrw->drr_type,
+ drrw->drr_checksumtype,
+ drrw->drr_compressiontype,
+ drrw->drr_flags,
+ (u_longlong_t)drrw->drr_offset,
+ (u_longlong_t)drrw->drr_logical_size,
+ (u_longlong_t)drrw->drr_compressed_size,
+ (u_longlong_t)payload_size,
+ (u_longlong_t)drrw->drr_key.ddk_prop,
+ salt,
+ iv,
+ mac);
+ }
+
+ /*
+ * Read the contents of the block in from STDIN to buf
+ */
+ (void) ssread(buf, payload_size, &zc);
+ /*
+ * If in dump mode
+ */
+ if (dump) {
+ print_block(buf, payload_size);
+ }
+ break;
+
+ case DRR_WRITE_BYREF:
+ if (do_byteswap) {
+ drrwbr->drr_object =
+ BSWAP_64(drrwbr->drr_object);
+ drrwbr->drr_offset =
+ BSWAP_64(drrwbr->drr_offset);
+ drrwbr->drr_length =
+ BSWAP_64(drrwbr->drr_length);
+ drrwbr->drr_toguid =
+ BSWAP_64(drrwbr->drr_toguid);
+ drrwbr->drr_refguid =
+ BSWAP_64(drrwbr->drr_refguid);
+ drrwbr->drr_refobject =
+ BSWAP_64(drrwbr->drr_refobject);
+ drrwbr->drr_refoffset =
+ BSWAP_64(drrwbr->drr_refoffset);
+ drrwbr->drr_key.ddk_prop =
+ BSWAP_64(drrwbr->drr_key.ddk_prop);
+ }
+ if (verbose) {
+ (void) printf("WRITE_BYREF object = %llu "
+ "checksum type = %u props = %llx "
+ "offset = %llu length = %llu "
+ "toguid = %llx refguid = %llx "
+ "refobject = %llu refoffset = %llu\n",
+ (u_longlong_t)drrwbr->drr_object,
+ drrwbr->drr_checksumtype,
+ (u_longlong_t)drrwbr->drr_key.ddk_prop,
+ (u_longlong_t)drrwbr->drr_offset,
+ (u_longlong_t)drrwbr->drr_length,
+ (u_longlong_t)drrwbr->drr_toguid,
+ (u_longlong_t)drrwbr->drr_refguid,
+ (u_longlong_t)drrwbr->drr_refobject,
+ (u_longlong_t)drrwbr->drr_refoffset);
+ }
+ break;
+
+ case DRR_FREE:
+ if (do_byteswap) {
+ drrf->drr_object = BSWAP_64(drrf->drr_object);
+ drrf->drr_offset = BSWAP_64(drrf->drr_offset);
+ drrf->drr_length = BSWAP_64(drrf->drr_length);
+ }
+ if (verbose) {
+ (void) printf("FREE object = %llu "
+ "offset = %llu length = %lld\n",
+ (u_longlong_t)drrf->drr_object,
+ (u_longlong_t)drrf->drr_offset,
+ (longlong_t)drrf->drr_length);
+ }
+ break;
+ case DRR_SPILL:
+ if (do_byteswap) {
+ drrs->drr_object = BSWAP_64(drrs->drr_object);
+ drrs->drr_length = BSWAP_64(drrs->drr_length);
+ drrs->drr_compressed_size =
+ BSWAP_64(drrs->drr_compressed_size);
+ drrs->drr_type = BSWAP_32(drrs->drr_type);
+ }
+
+ payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
+
+ if (verbose) {
+ sprintf_bytes(salt, drrs->drr_salt,
+ ZIO_DATA_SALT_LEN);
+ sprintf_bytes(iv, drrs->drr_iv,
+ ZIO_DATA_IV_LEN);
+ sprintf_bytes(mac, drrs->drr_mac,
+ ZIO_DATA_MAC_LEN);
+
+ (void) printf("SPILL block for object = %llu "
+ "length = %llu flags = %u "
+ "compression type = %u "
+ "compressed_size = %llu "
+ "payload_size = %llu "
+ "salt = %s iv = %s mac = %s\n",
+ (u_longlong_t)drrs->drr_object,
+ (u_longlong_t)drrs->drr_length,
+ drrs->drr_flags,
+ drrs->drr_compressiontype,
+ (u_longlong_t)drrs->drr_compressed_size,
+ (u_longlong_t)payload_size,
+ salt,
+ iv,
+ mac);
+ }
+ (void) ssread(buf, payload_size, &zc);
+ if (dump) {
+ print_block(buf, payload_size);
+ }
+ break;
+ case DRR_WRITE_EMBEDDED:
+ if (do_byteswap) {
+ drrwe->drr_object =
+ BSWAP_64(drrwe->drr_object);
+ drrwe->drr_offset =
+ BSWAP_64(drrwe->drr_offset);
+ drrwe->drr_length =
+ BSWAP_64(drrwe->drr_length);
+ drrwe->drr_toguid =
+ BSWAP_64(drrwe->drr_toguid);
+ drrwe->drr_lsize =
+ BSWAP_32(drrwe->drr_lsize);
+ drrwe->drr_psize =
+ BSWAP_32(drrwe->drr_psize);
+ }
+ if (verbose) {
+ (void) printf("WRITE_EMBEDDED object = %llu "
+ "offset = %llu length = %llu "
+ "toguid = %llx comp = %u etype = %u "
+ "lsize = %u psize = %u\n",
+ (u_longlong_t)drrwe->drr_object,
+ (u_longlong_t)drrwe->drr_offset,
+ (u_longlong_t)drrwe->drr_length,
+ (u_longlong_t)drrwe->drr_toguid,
+ drrwe->drr_compression,
+ drrwe->drr_etype,
+ drrwe->drr_lsize,
+ drrwe->drr_psize);
+ }
+ (void) ssread(buf,
+ P2ROUNDUP(drrwe->drr_psize, 8), &zc);
+ if (dump) {
+ print_block(buf,
+ P2ROUNDUP(drrwe->drr_psize, 8));
+ }
+ payload_size = P2ROUNDUP(drrwe->drr_psize, 8);
+ break;
+ case DRR_OBJECT_RANGE:
+ if (do_byteswap) {
+ drror->drr_firstobj =
+ BSWAP_64(drror->drr_firstobj);
+ drror->drr_numslots =
+ BSWAP_64(drror->drr_numslots);
+ drror->drr_toguid = BSWAP_64(drror->drr_toguid);
+ }
+ if (verbose) {
+ sprintf_bytes(salt, drror->drr_salt,
+ ZIO_DATA_SALT_LEN);
+ sprintf_bytes(iv, drror->drr_iv,
+ ZIO_DATA_IV_LEN);
+ sprintf_bytes(mac, drror->drr_mac,
+ ZIO_DATA_MAC_LEN);
+
+ (void) printf("OBJECT_RANGE firstobj = %llu "
+ "numslots = %llu flags = %u "
+ "salt = %s iv = %s mac = %s\n",
+ (u_longlong_t)drror->drr_firstobj,
+ (u_longlong_t)drror->drr_numslots,
+ drror->drr_flags,
+ salt,
+ iv,
+ mac);
+ }
+ break;
+ case DRR_REDACT:
+ if (do_byteswap) {
+ drrr->drr_object = BSWAP_64(drrr->drr_object);
+ drrr->drr_offset = BSWAP_64(drrr->drr_offset);
+ drrr->drr_length = BSWAP_64(drrr->drr_length);
+ drrr->drr_toguid = BSWAP_64(drrr->drr_toguid);
+ }
+ if (verbose) {
+ (void) printf("REDACT object = %llu offset = "
+ "%llu length = %llu\n",
+ (u_longlong_t)drrr->drr_object,
+ (u_longlong_t)drrr->drr_offset,
+ (u_longlong_t)drrr->drr_length);
+ }
+ break;
+ case DRR_NUMTYPES:
+ /* should never be reached */
+ exit(1);
+ }
+ if (drr->drr_type != DRR_BEGIN && very_verbose) {
+ (void) printf(" checksum = %llx/%llx/%llx/%llx\n",
+ (longlong_t)drrc->drr_checksum.zc_word[0],
+ (longlong_t)drrc->drr_checksum.zc_word[1],
+ (longlong_t)drrc->drr_checksum.zc_word[2],
+ (longlong_t)drrc->drr_checksum.zc_word[3]);
+ }
+ pcksum = zc;
+ drr_byte_count[drr->drr_type] += payload_size;
+ total_payload_size += payload_size;
+ }
+ free(buf);
+ fletcher_4_fini();
+
+ /* Print final summary */
+
+ (void) printf("SUMMARY:\n");
+ (void) printf("\tTotal DRR_BEGIN records = %lld (%llu bytes)\n",
+ (u_longlong_t)drr_record_count[DRR_BEGIN],
+ (u_longlong_t)drr_byte_count[DRR_BEGIN]);
+ (void) printf("\tTotal DRR_END records = %lld (%llu bytes)\n",
+ (u_longlong_t)drr_record_count[DRR_END],
+ (u_longlong_t)drr_byte_count[DRR_END]);
+ (void) printf("\tTotal DRR_OBJECT records = %lld (%llu bytes)\n",
+ (u_longlong_t)drr_record_count[DRR_OBJECT],
+ (u_longlong_t)drr_byte_count[DRR_OBJECT]);
+ (void) printf("\tTotal DRR_FREEOBJECTS records = %lld (%llu bytes)\n",
+ (u_longlong_t)drr_record_count[DRR_FREEOBJECTS],
+ (u_longlong_t)drr_byte_count[DRR_FREEOBJECTS]);
+ (void) printf("\tTotal DRR_WRITE records = %lld (%llu bytes)\n",
+ (u_longlong_t)drr_record_count[DRR_WRITE],
+ (u_longlong_t)drr_byte_count[DRR_WRITE]);
+ (void) printf("\tTotal DRR_WRITE_BYREF records = %lld (%llu bytes)\n",
+ (u_longlong_t)drr_record_count[DRR_WRITE_BYREF],
+ (u_longlong_t)drr_byte_count[DRR_WRITE_BYREF]);
+ (void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld (%llu "
+ "bytes)\n", (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED],
+ (u_longlong_t)drr_byte_count[DRR_WRITE_EMBEDDED]);
+ (void) printf("\tTotal DRR_FREE records = %lld (%llu bytes)\n",
+ (u_longlong_t)drr_record_count[DRR_FREE],
+ (u_longlong_t)drr_byte_count[DRR_FREE]);
+ (void) printf("\tTotal DRR_SPILL records = %lld (%llu bytes)\n",
+ (u_longlong_t)drr_record_count[DRR_SPILL],
+ (u_longlong_t)drr_byte_count[DRR_SPILL]);
+ (void) printf("\tTotal records = %lld\n",
+ (u_longlong_t)total_records);
+ (void) printf("\tTotal payload size = %lld (0x%llx)\n",
+ (u_longlong_t)total_payload_size, (u_longlong_t)total_payload_size);
+ (void) printf("\tTotal header overhead = %lld (0x%llx)\n",
+ (u_longlong_t)total_overhead_size,
+ (u_longlong_t)total_overhead_size);
+ (void) printf("\tTotal stream length = %lld (0x%llx)\n",
+ (u_longlong_t)total_stream_len, (u_longlong_t)total_stream_len);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_redup.c b/sys/contrib/openzfs/cmd/zstream/zstream_redup.c
new file mode 100644
index 000000000000..379025ce59e5
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/zstream_redup.c
@@ -0,0 +1,469 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020 by Delphix. All rights reserved.
+ */
+
+#include <assert.h>
+#include <cityhash.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libzfs_impl.h>
+#include <libzfs.h>
+#include <libzutil.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <umem.h>
+#include <unistd.h>
+#include <sys/debug.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zio_checksum.h>
+#include "zfs_fletcher.h"
+#include "zstream.h"
+
+
+#define MAX_RDT_PHYSMEM_PERCENT 20
+#define SMALLEST_POSSIBLE_MAX_RDT_MB 128
+
+typedef struct redup_entry {
+ struct redup_entry *rde_next;
+ uint64_t rde_guid;
+ uint64_t rde_object;
+ uint64_t rde_offset;
+ uint64_t rde_stream_offset;
+} redup_entry_t;
+
+typedef struct redup_table {
+ redup_entry_t **redup_hash_array;
+ umem_cache_t *ddecache;
+ uint64_t ddt_count;
+ int numhashbits;
+} redup_table_t;
+
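+/*
+ * Return the 1-based index of the highest set bit in i (so the result for a
+ * power of two n is log2(n) + 1), or 0 when i is 0.
+ */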
+int
+highbit64(uint64_t i)
+{
+ if (i == 0)
+ return (0);
+
+ return (NBBY * sizeof (uint64_t) - __builtin_clzll(i));
+}
+
+static void *
+safe_calloc(size_t n)
+{
+ void *rv = calloc(1, n);
+ if (rv == NULL) {
+ fprintf(stderr,
+ "Error: could not allocate %u bytes of memory\n",
+ (int)n);
+ exit(1);
+ }
+ return (rv);
+}
+
+/*
+ * Safe version of fread(), exits on error.
+ */
+static int
+sfread(void *buf, size_t size, FILE *fp)
+{
+ int rv = fread(buf, size, 1, fp);
+ if (rv == 0 && ferror(fp)) {
+ (void) fprintf(stderr, "Error while reading file: %s\n",
+ strerror(errno));
+ exit(1);
+ }
+ return (rv);
+}
+
+/*
+ * Safe version of pread(), exits on error.
+ */
+static void
+spread(int fd, void *buf, size_t count, off_t offset)
+{
+ ssize_t err = pread(fd, buf, count, offset);
+ if (err == -1) {
+ (void) fprintf(stderr,
+ "Error while reading file: %s\n",
+ strerror(errno));
+ exit(1);
+ } else if (err != count) {
+ (void) fprintf(stderr,
+ "Error while reading file: short read\n");
+ exit(1);
+ }
+}
+
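+/*
+ * Recompute the fletcher-4 checksum over the record header and any payload,
+ * storing the running checksum in the record (for all types except
+ * DRR_BEGIN), and write both to outfd. Returns 0 on success or errno if a
+ * write fails.
+ */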
+static int
+dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
+ zio_cksum_t *zc, int outfd)
+{
+ assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum)
+ == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+ fletcher_4_incremental_native(drr,
+ offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
+ if (drr->drr_type != DRR_BEGIN) {
+ assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
+ drr_checksum.drr_checksum));
+ drr->drr_u.drr_checksum.drr_checksum = *zc;
+ }
+ fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
+ sizeof (zio_cksum_t), zc);
+ if (write(outfd, drr, sizeof (*drr)) == -1)
+ return (errno);
+ if (payload_len != 0) {
+ fletcher_4_incremental_native(payload, payload_len, zc);
+ if (write(outfd, payload, payload_len) == -1)
+ return (errno);
+ }
+ return (0);
+}
+
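+/*
+ * Insert a (guid, object, offset) -> stream offset mapping at the head of
+ * the hash bucket selected by cityhash4() of the key.
+ */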
+static void
+rdt_insert(redup_table_t *rdt,
+ uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
+{
+ uint64_t ch = cityhash4(guid, object, offset, 0);
+ uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
+ redup_entry_t **rdepp;
+
+ rdepp = &(rdt->redup_hash_array[hashcode]);
+ redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);
+ rde->rde_next = *rdepp;
+ rde->rde_guid = guid;
+ rde->rde_object = object;
+ rde->rde_offset = offset;
+ rde->rde_stream_offset = stream_offset;
+ *rdepp = rde;
+ rdt->ddt_count++;
+}
+
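+/*
+ * Look up the stream offset previously inserted for (guid, object, offset).
+ * Every entry is expected to exist; a missing one trips an assertion.
+ */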
+static void
+rdt_lookup(redup_table_t *rdt,
+ uint64_t guid, uint64_t object, uint64_t offset,
+ uint64_t *stream_offsetp)
+{
+ uint64_t ch = cityhash4(guid, object, offset, 0);
+ uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
+
+ for (redup_entry_t *rde = rdt->redup_hash_array[hashcode];
+ rde != NULL; rde = rde->rde_next) {
+ if (rde->rde_guid == guid &&
+ rde->rde_object == object &&
+ rde->rde_offset == offset) {
+ *stream_offsetp = rde->rde_stream_offset;
+ return;
+ }
+ }
+ assert(!"could not find expected redup table entry");
+}
+
+/*
+ * Convert a dedup stream (generated by "zfs send -D") to a
+ * non-deduplicated stream. The entire infd will be converted, including
+ * any substreams in a stream package (generated by "zfs send -RD"). The
+ * infd must be seekable.
+ */
+static void
+zfs_redup_stream(int infd, int outfd, boolean_t verbose)
+{
+ int bufsz = SPA_MAXBLOCKSIZE;
+ dmu_replay_record_t thedrr = { 0 };
+ dmu_replay_record_t *drr = &thedrr;
+ redup_table_t rdt;
+ zio_cksum_t stream_cksum;
+ uint64_t numbuckets;
+ uint64_t num_records = 0;
+ uint64_t num_write_byref_records = 0;
+
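+ /*
+ * Size the in-memory redup table at MAX_RDT_PHYSMEM_PERCENT of physical
+ * memory, but never below SMALLEST_POSSIBLE_MAX_RDT_MB; 32-bit builds
+ * always use the minimum.
+ */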
+#ifdef _ILP32
+ uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
+#else
+ uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
+ uint64_t max_rde_size =
+ MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100,
+ SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
+#endif
+
+ numbuckets = max_rde_size / (sizeof (redup_entry_t));
+
+ /*
+ * numbuckets must be a power of 2. Increase number to
+ * a power of 2 if necessary.
+ */
+ if (!ISP2(numbuckets))
+ numbuckets = 1ULL << highbit64(numbuckets);
+
+ rdt.redup_hash_array =
+ safe_calloc(numbuckets * sizeof (redup_entry_t *));
+ rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
+ NULL, NULL, NULL, NULL, NULL, 0);
+ rdt.numhashbits = highbit64(numbuckets) - 1;
+ rdt.ddt_count = 0;
+
+ char *buf = safe_calloc(bufsz);
+ FILE *ofp = fdopen(infd, "r");
+ long offset = ftell(ofp);
+ while (sfread(drr, sizeof (*drr), ofp) != 0) {
+ num_records++;
+
+ /*
+ * We need to regenerate the checksum.
+ */
+ if (drr->drr_type != DRR_BEGIN) {
+ bzero(&drr->drr_u.drr_checksum.drr_checksum,
+ sizeof (drr->drr_u.drr_checksum.drr_checksum));
+ }
+
+ uint64_t payload_size = 0;
+ switch (drr->drr_type) {
+ case DRR_BEGIN:
+ {
+ struct drr_begin *drrb = &drr->drr_u.drr_begin;
+ int fflags;
+ ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
+
+ assert(drrb->drr_magic == DMU_BACKUP_MAGIC);
+
+ /* clear the DEDUP feature flag for this stream */
+ fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+ fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
+ DMU_BACKUP_FEATURE_DEDUPPROPS);
+ DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
+
+ int sz = drr->drr_payloadlen;
+ if (sz != 0) {
+ if (sz > bufsz) {
+ free(buf);
+ buf = safe_calloc(sz);
+ bufsz = sz;
+ }
+ (void) sfread(buf, sz, ofp);
+ }
+ payload_size = sz;
+ break;
+ }
+
+ case DRR_END:
+ {
+ struct drr_end *drre = &drr->drr_u.drr_end;
+ /*
+ * Use the recalculated checksum, unless this is
+ * the END record of a stream package, which has
+ * no checksum.
+ */
+ if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
+ drre->drr_checksum = stream_cksum;
+ break;
+ }
+
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro = &drr->drr_u.drr_object;
+
+ if (drro->drr_bonuslen > 0) {
+ payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
+ (void) sfread(buf, payload_size, ofp);
+ }
+ break;
+ }
+
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &drr->drr_u.drr_spill;
+ payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
+ (void) sfread(buf, payload_size, ofp);
+ break;
+ }
+
+ case DRR_WRITE_BYREF:
+ {
+ struct drr_write_byref drrwb =
+ drr->drr_u.drr_write_byref;
+
+ num_write_byref_records++;
+
+ /*
+ * Look up in hash table by drrwb->drr_refguid,
+ * drr_refobject, drr_refoffset. Replace this
+ * record with the found WRITE record, but with
+ * drr_object,drr_offset,drr_toguid replaced with ours.
+ */
+ uint64_t stream_offset = 0;
+ rdt_lookup(&rdt, drrwb.drr_refguid,
+ drrwb.drr_refobject, drrwb.drr_refoffset,
+ &stream_offset);
+
+ spread(infd, drr, sizeof (*drr), stream_offset);
+
+ assert(drr->drr_type == DRR_WRITE);
+ struct drr_write *drrw = &drr->drr_u.drr_write;
+ assert(drrw->drr_toguid == drrwb.drr_refguid);
+ assert(drrw->drr_object == drrwb.drr_refobject);
+ assert(drrw->drr_offset == drrwb.drr_refoffset);
+
+ payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+ spread(infd, buf, payload_size,
+ stream_offset + sizeof (*drr));
+
+ drrw->drr_toguid = drrwb.drr_toguid;
+ drrw->drr_object = drrwb.drr_object;
+ drrw->drr_offset = drrwb.drr_offset;
+ break;
+ }
+
+ case DRR_WRITE:
+ {
+ struct drr_write *drrw = &drr->drr_u.drr_write;
+ payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+ (void) sfread(buf, payload_size, ofp);
+
+ rdt_insert(&rdt, drrw->drr_toguid,
+ drrw->drr_object, drrw->drr_offset, offset);
+ break;
+ }
+
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &drr->drr_u.drr_write_embedded;
+ payload_size =
+ P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
+ (void) sfread(buf, payload_size, ofp);
+ break;
+ }
+
+ case DRR_FREEOBJECTS:
+ case DRR_FREE:
+ case DRR_OBJECT_RANGE:
+ break;
+
+ default:
+ (void) fprintf(stderr, "INVALID record type 0x%x\n",
+ drr->drr_type);
+ /* should never happen, so assert */
+ assert(B_FALSE);
+ }
+
+ if (feof(ofp)) {
+ fprintf(stderr, "Error: unexpected end-of-file\n");
+ exit(1);
+ }
+ if (ferror(ofp)) {
+ fprintf(stderr, "Error while reading file: %s\n",
+ strerror(errno));
+ exit(1);
+ }
+
+ /*
+ * We need to recalculate the checksum, and it needs to be
+ * initially zero to do that. BEGIN records don't have
+ * a checksum.
+ */
+ if (drr->drr_type != DRR_BEGIN) {
+ bzero(&drr->drr_u.drr_checksum.drr_checksum,
+ sizeof (drr->drr_u.drr_checksum.drr_checksum));
+ }
+ if (dump_record(drr, buf, payload_size,
+ &stream_cksum, outfd) != 0)
+ break;
+ if (drr->drr_type == DRR_END) {
+ /*
+ * Typically the END record is either the last
+ * thing in the stream, or it is followed
+ * by a BEGIN record (which also zeros the checksum).
+ * However, a stream package ends with two END
+ * records. The last END record's checksum starts
+ * from zero.
+ */
+ ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
+ }
+ offset = ftell(ofp);
+ }
+
+ if (verbose) {
+ char mem_str[16];
+ zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
+ mem_str, sizeof (mem_str));
+ fprintf(stderr, "converted stream with %llu total records, "
+ "including %llu dedup records, using %sB memory.\n",
+ (long long)num_records,
+ (long long)num_write_byref_records,
+ mem_str);
+ }
+
+ umem_cache_destroy(rdt.ddecache);
+ free(rdt.redup_hash_array);
+ free(buf);
+ (void) fclose(ofp);
+}
+
+int
+zstream_do_redup(int argc, char *argv[])
+{
+ boolean_t verbose = B_FALSE;
+ int c;
+
+ while ((c = getopt(argc, argv, "v")) != -1) {
+ switch (c) {
+ case 'v':
+ verbose = B_TRUE;
+ break;
+ case '?':
+ (void) fprintf(stderr, "invalid option '%c'\n",
+ optopt);
+ zstream_usage();
+ break;
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1)
+ zstream_usage();
+
+ const char *filename = argv[0];
+
+ if (isatty(STDOUT_FILENO)) {
+ (void) fprintf(stderr,
+ "Error: Stream can not be written to a terminal.\n"
+ "You must redirect standard output.\n");
+ return (1);
+ }
+
+ int fd = open(filename, O_RDONLY);
+ if (fd == -1) {
+ (void) fprintf(stderr,
+ "Error while opening file '%s': %s\n",
+ filename, strerror(errno));
+ exit(1);
+ }
+
+ fletcher_4_init();
+ zfs_redup_stream(fd, STDOUT_FILENO, verbose);
+ fletcher_4_fini();
+
+ close(fd);
+
+ return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_token.c b/sys/contrib/openzfs/cmd/zstream/zstream_token.c
new file mode 100644
index 000000000000..36a76a4bb851
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstream/zstream_token.c
@@ -0,0 +1,78 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Portions Copyright 2012 Martin Matuska <martin@matuska.org>
+ */
+
+/*
+ * Copyright (c) 2020 by Datto Inc. All rights reserved.
+ */
+
+#include <ctype.h>
+#include <libnvpair.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <stddef.h>
+
+#include <libzfs.h>
+#include <libzfs_core.h>
+
+#include <sys/dmu.h>
+#include <sys/zfs_ioctl.h>
+#include "zstream.h"
+
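+/*
+ * Parse a ZFS send-stream resume token into an nvlist and print its
+ * contents.
+ */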
+int
+zstream_do_token(int argc, char *argv[])
+{
+ char *resume_token = NULL;
+
+ if (argc < 2) {
+ (void) fprintf(stderr, "Need to pass the resume token\n");
+ zstream_usage();
+ }
+
+ resume_token = argv[1];
+
+ libzfs_handle_t *hdl = libzfs_init();
+
+ nvlist_t *resume_nvl =
+ zfs_send_resume_token_to_nvlist(hdl, resume_token);
+
+ if (resume_nvl == NULL) {
+ (void) fprintf(stderr,
+ "Unable to parse resume token: %s\n",
+ libzfs_error_description(hdl));
+ libzfs_fini(hdl);
+ return (1);
+ }
+
+ dump_nvlist(resume_nvl, 5);
+ nvlist_free(resume_nvl);
+
+ libzfs_fini(hdl);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zstreamdump/Makefile.am b/sys/contrib/openzfs/cmd/zstreamdump/Makefile.am
new file mode 100644
index 000000000000..2c04d8513150
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstreamdump/Makefile.am
@@ -0,0 +1 @@
+dist_sbin_SCRIPTS = zstreamdump
diff --git a/sys/contrib/openzfs/cmd/zstreamdump/zstreamdump b/sys/contrib/openzfs/cmd/zstreamdump/zstreamdump
new file mode 100755
index 000000000000..fbf02ee687f6
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zstreamdump/zstreamdump
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+zstream dump "$@"
diff --git a/sys/contrib/openzfs/cmd/ztest/.gitignore b/sys/contrib/openzfs/cmd/ztest/.gitignore
new file mode 100644
index 000000000000..d3d498dae693
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/ztest/.gitignore
@@ -0,0 +1 @@
+/ztest
diff --git a/sys/contrib/openzfs/cmd/ztest/Makefile.am b/sys/contrib/openzfs/cmd/ztest/Makefile.am
new file mode 100644
index 000000000000..6042b44d1dde
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/ztest/Makefile.am
@@ -0,0 +1,23 @@
+include $(top_srcdir)/config/Rules.am
+
+# Get rid of compiler warning for unchecked truncating snprintfs on gcc 7.1.1
+AM_CFLAGS += $(NO_FORMAT_TRUNCATION)
+
+# Includes kernel code, generate warnings for large stack frames
+AM_CFLAGS += $(FRAME_LARGER_THAN)
+
+# Unconditionally enable ASSERTs
+AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG
+
+sbin_PROGRAMS = ztest
+
+ztest_SOURCES = \
+ ztest.c
+
+ztest_LDADD = \
+ $(abs_top_builddir)/lib/libzpool/libzpool.la \
+ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+ $(abs_top_builddir)/lib/libnvpair/libnvpair.la
+
+ztest_LDADD += -lm
+ztest_LDFLAGS = -pthread
diff --git a/sys/contrib/openzfs/cmd/ztest/ztest.c b/sys/contrib/openzfs/cmd/ztest/ztest.c
new file mode 100644
index 000000000000..31205a5bf8cf
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/ztest/ztest.c
@@ -0,0 +1,7818 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+/*
+ * The objective of this program is to provide a DMU/ZAP/SPA stress test
+ * that runs entirely in userland, is easy to use, and easy to extend.
+ *
+ * The overall design of the ztest program is as follows:
+ *
+ * (1) For each major functional area (e.g. adding vdevs to a pool,
+ * creating and destroying datasets, reading and writing objects, etc)
+ * we have a simple routine to test that functionality. These
+ * individual routines do not have to do anything "stressful".
+ *
+ * (2) We turn these simple functionality tests into a stress test by
+ * running them all in parallel, with as many threads as desired,
+ * and spread across as many datasets, objects, and vdevs as desired.
+ *
+ * (3) While all this is happening, we inject faults into the pool to
+ * verify that self-healing data really works.
+ *
+ * (4) Every time we open a dataset, we change its checksum and compression
+ * functions. Thus even individual objects vary from block to block
+ * in which checksum they use and whether they're compressed.
+ *
+ * (5) To verify that we never lose on-disk consistency after a crash,
+ * we run the entire test in a child of the main process.
+ * At random times, the child self-immolates with a SIGKILL.
+ * This is the software equivalent of pulling the power cord.
+ * The parent then runs the test again, using the existing
+ * storage pool, as many times as desired. If backwards compatibility
+ * testing is enabled, ztest will sometimes run the "older" version
+ * of ztest after a SIGKILL.
+ *
+ * (6) To verify that we don't have future leaks or temporal incursions,
+ * many of the functional tests record the transaction group number
+ * as part of their data. When reading old data, they verify that
+ * the transaction group number is less than the current, open txg.
+ * If you add a new test, please do this if applicable.
+ *
+ * (7) Threads are created with a reduced stack size, for sanity checking.
+ * Therefore, it's important not to allocate huge buffers on the stack.
+ *
+ * When run with no arguments, ztest runs for about five minutes and
+ * produces no output if successful. To get a little bit of information,
+ * specify -V. To get more information, specify -VV, and so on.
+ *
+ * To turn this into an overnight stress test, use -T to specify run time.
+ *
+ * You can ask more vdevs [-v], datasets [-d], or threads [-t]
+ * to increase the pool capacity, fanout, and overall stress level.
+ *
+ * Use the -k option to set the desired frequency of kills.
+ *
+ * When ztest invokes itself it passes all relevant information through a
+ * temporary file which is mmap-ed in the child process. This allows shared
+ * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
+ * stored at offset 0 of this file and contains information on the size and
+ * number of shared structures in the file. The information stored in this file
+ * must remain backwards compatible with older versions of ztest so that
+ * ztest can invoke them during backwards compatibility testing (-B).
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/dmu_objset.h>
+#include <sys/poll.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_trim.h>
+#include <sys/spa_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_scan.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_refcount.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_userhold.h>
+#include <sys/abd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <umem.h>
+#include <ctype.h>
+#include <math.h>
+#include <sys/fs/zfs.h>
+#include <zfs_fletcher.h>
+#include <libnvpair.h>
+#include <libzutil.h>
+#include <sys/crypto/icp.h>
+#ifdef __GLIBC__
+#include <execinfo.h> /* for backtrace() */
+#endif
+
+static int ztest_fd_data = -1;
+static int ztest_fd_rand = -1;
+
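+/*
+ * Header stored at offset 0 of the mmap-ed temporary file shared with child
+ * processes; it records the size and count of each shared structure that
+ * follows (see the comment at the top of this file).
+ */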
+typedef struct ztest_shared_hdr {
+ uint64_t zh_hdr_size;
+ uint64_t zh_opts_size;
+ uint64_t zh_size;
+ uint64_t zh_stats_size;
+ uint64_t zh_stats_count;
+ uint64_t zh_ds_size;
+ uint64_t zh_ds_count;
+} ztest_shared_hdr_t;
+
+static ztest_shared_hdr_t *ztest_shared_hdr;
+
+enum ztest_class_state {
+ ZTEST_VDEV_CLASS_OFF,
+ ZTEST_VDEV_CLASS_ON,
+ ZTEST_VDEV_CLASS_RND
+};
+
+typedef struct ztest_shared_opts {
+ char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
+ char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
+ char zo_alt_ztest[MAXNAMELEN];
+ char zo_alt_libpath[MAXNAMELEN];
+ uint64_t zo_vdevs;
+ uint64_t zo_vdevtime;
+ size_t zo_vdev_size;
+ int zo_ashift;
+ int zo_mirrors;
+ int zo_raidz;
+ int zo_raidz_parity;
+ int zo_datasets;
+ int zo_threads;
+ uint64_t zo_passtime;
+ uint64_t zo_killrate;
+ int zo_verbose;
+ int zo_init;
+ uint64_t zo_time;
+ uint64_t zo_maxloops;
+ uint64_t zo_metaslab_force_ganging;
+ int zo_mmp_test;
+ int zo_special_vdevs;
+ int zo_dump_dbgmsg;
+} ztest_shared_opts_t;
+
+static const ztest_shared_opts_t ztest_opts_defaults = {
+ .zo_pool = "ztest",
+ .zo_dir = "/tmp",
+ .zo_alt_ztest = { '\0' },
+ .zo_alt_libpath = { '\0' },
+ .zo_vdevs = 5,
+ .zo_ashift = SPA_MINBLOCKSHIFT,
+ .zo_mirrors = 2,
+ .zo_raidz = 4,
+ .zo_raidz_parity = 1,
+ .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */
+ .zo_datasets = 7,
+ .zo_threads = 23,
+ .zo_passtime = 60, /* 60 seconds */
+ .zo_killrate = 70, /* 70% kill rate */
+ .zo_verbose = 0,
+ .zo_mmp_test = 0,
+ .zo_init = 1,
+ .zo_time = 300, /* 5 minutes */
+ .zo_maxloops = 50, /* max loops during spa_freeze() */
+ .zo_metaslab_force_ganging = 64 << 10,
+ .zo_special_vdevs = ZTEST_VDEV_CLASS_RND,
+};
+
+extern uint64_t metaslab_force_ganging;
+extern uint64_t metaslab_df_alloc_threshold;
+extern unsigned long zfs_deadman_synctime_ms;
+extern int metaslab_preload_limit;
+extern boolean_t zfs_compressed_arc_enabled;
+extern int zfs_abd_scatter_enabled;
+extern int dmu_object_alloc_chunk_shift;
+extern boolean_t zfs_force_some_double_word_sm_entries;
+extern unsigned long zio_decompress_fail_fraction;
+extern unsigned long zfs_reconstruct_indirect_damage_fraction;
+
+
+static ztest_shared_opts_t *ztest_shared_opts;
+static ztest_shared_opts_t ztest_opts;
+static char *ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345";
+
+typedef struct ztest_shared_ds {
+ uint64_t zd_seq;
+} ztest_shared_ds_t;
+
+static ztest_shared_ds_t *ztest_shared_ds;
+#define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])
+
+#define BT_MAGIC 0x123456789abcdefULL
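+/*
+ * Upper bound on the number of simultaneous device faults the configured
+ * redundancy is expected to tolerate: mirror copies times (raidz parity + 1),
+ * minus one.
+ */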
+#define MAXFAULTS(zs) \
+ (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)
+
+enum ztest_io_type {
+ ZTEST_IO_WRITE_TAG,
+ ZTEST_IO_WRITE_PATTERN,
+ ZTEST_IO_WRITE_ZEROES,
+ ZTEST_IO_TRUNCATE,
+ ZTEST_IO_SETATTR,
+ ZTEST_IO_REWRITE,
+ ZTEST_IO_TYPES
+};
+
+typedef struct ztest_block_tag {
+ uint64_t bt_magic;
+ uint64_t bt_objset;
+ uint64_t bt_object;
+ uint64_t bt_dnodesize;
+ uint64_t bt_offset;
+ uint64_t bt_gen;
+ uint64_t bt_txg;
+ uint64_t bt_crtxg;
+} ztest_block_tag_t;
+
+typedef struct bufwad {
+ uint64_t bw_index;
+ uint64_t bw_txg;
+ uint64_t bw_data;
+} bufwad_t;
+
+/*
+ * It would be better to use a rangelock_t per object. Unfortunately
+ * the rangelock_t is not a drop-in replacement for rl_t, because we
+ * still need to map from object ID to rangelock_t.
+ */
+typedef enum {
+ RL_READER,
+ RL_WRITER,
+ RL_APPEND
+} rl_type_t;
+
+typedef struct rll {
+ void *rll_writer;
+ int rll_readers;
+ kmutex_t rll_lock;
+ kcondvar_t rll_cv;
+} rll_t;
+
+typedef struct rl {
+ uint64_t rl_object;
+ uint64_t rl_offset;
+ uint64_t rl_size;
+ rll_t *rl_lock;
+} rl_t;
+
+#define ZTEST_RANGE_LOCKS 64
+#define ZTEST_OBJECT_LOCKS 64
+
+/*
+ * Object descriptor. Used as a template for object lookup/create/remove.
+ */
+typedef struct ztest_od {
+ uint64_t od_dir;
+ uint64_t od_object;
+ dmu_object_type_t od_type;
+ dmu_object_type_t od_crtype;
+ uint64_t od_blocksize;
+ uint64_t od_crblocksize;
+ uint64_t od_crdnodesize;
+ uint64_t od_gen;
+ uint64_t od_crgen;
+ char od_name[ZFS_MAX_DATASET_NAME_LEN];
+} ztest_od_t;
+
+/*
+ * Per-dataset state.
+ */
+typedef struct ztest_ds {
+ ztest_shared_ds_t *zd_shared;
+ objset_t *zd_os;
+ pthread_rwlock_t zd_zilog_lock;
+ zilog_t *zd_zilog;
+ ztest_od_t *zd_od; /* debugging aid */
+ char zd_name[ZFS_MAX_DATASET_NAME_LEN];
+ kmutex_t zd_dirobj_lock;
+ rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
+ rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
+} ztest_ds_t;
+
+/*
+ * Per-iteration state.
+ */
+typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);
+
+typedef struct ztest_info {
+ ztest_func_t *zi_func; /* test function */
+ uint64_t zi_iters; /* iterations per execution */
+ uint64_t *zi_interval; /* execute every <interval> seconds */
+ const char *zi_funcname; /* name of test function */
+} ztest_info_t;
+
+typedef struct ztest_shared_callstate {
+ uint64_t zc_count; /* per-pass count */
+ uint64_t zc_time; /* per-pass time */
+ uint64_t zc_next; /* next time to call this function */
+} ztest_shared_callstate_t;
+
+static ztest_shared_callstate_t *ztest_shared_callstate;
+#define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])
+
+ztest_func_t ztest_dmu_read_write;
+ztest_func_t ztest_dmu_write_parallel;
+ztest_func_t ztest_dmu_object_alloc_free;
+ztest_func_t ztest_dmu_object_next_chunk;
+ztest_func_t ztest_dmu_commit_callbacks;
+ztest_func_t ztest_zap;
+ztest_func_t ztest_zap_parallel;
+ztest_func_t ztest_zil_commit;
+ztest_func_t ztest_zil_remount;
+ztest_func_t ztest_dmu_read_write_zcopy;
+ztest_func_t ztest_dmu_objset_create_destroy;
+ztest_func_t ztest_dmu_prealloc;
+ztest_func_t ztest_fzap;
+ztest_func_t ztest_dmu_snapshot_create_destroy;
+ztest_func_t ztest_dsl_prop_get_set;
+ztest_func_t ztest_spa_prop_get_set;
+ztest_func_t ztest_spa_create_destroy;
+ztest_func_t ztest_fault_inject;
+ztest_func_t ztest_dmu_snapshot_hold;
+ztest_func_t ztest_mmp_enable_disable;
+ztest_func_t ztest_scrub;
+ztest_func_t ztest_dsl_dataset_promote_busy;
+ztest_func_t ztest_vdev_attach_detach;
+ztest_func_t ztest_vdev_LUN_growth;
+ztest_func_t ztest_vdev_add_remove;
+ztest_func_t ztest_vdev_class_add;
+ztest_func_t ztest_vdev_aux_add_remove;
+ztest_func_t ztest_split_pool;
+ztest_func_t ztest_reguid;
+ztest_func_t ztest_spa_upgrade;
+ztest_func_t ztest_device_removal;
+ztest_func_t ztest_spa_checkpoint_create_discard;
+ztest_func_t ztest_initialize;
+ztest_func_t ztest_trim;
+ztest_func_t ztest_fletcher;
+ztest_func_t ztest_fletcher_incr;
+ztest_func_t ztest_verify_dnode_bt;
+
+uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
+uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
+uint64_t zopt_often = 1ULL * NANOSEC; /* every second */
+uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */
+uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */
+
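+/*
+ * Convenience initializer for a ztest_info_t entry; # func stringifies the
+ * test function name so it can be reported by name.
+ */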
+#define ZTI_INIT(func, iters, interval) \
+ { .zi_func = (func), \
+ .zi_iters = (iters), \
+ .zi_interval = (interval), \
+ .zi_funcname = # func }
+
+ztest_info_t ztest_info[] = {
+ ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
+ ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
+ ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
+ ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
+ ZTI_INIT(ztest_zap, 30, &zopt_always),
+ ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
+ ZTI_INIT(ztest_split_pool, 1, &zopt_always),
+ ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant),
+ ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often),
+ ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often),
+ ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often),
+ ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes),
+#if 0
+ ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes),
+#endif
+ ZTI_INIT(ztest_fzap, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_reguid, 1, &zopt_rarely),
+ ZTI_INIT(ztest_scrub, 1, &zopt_rarely),
+ ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely),
+ ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely),
+ ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely),
+ ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime),
+ ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime),
+ ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime),
+ ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
+ ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
+ ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
+ ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
+};
+
+#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
+
+/*
+ * The following struct is used to hold a list of uncalled commit callbacks.
+ * The callbacks are ordered by txg number.
+ */
+typedef struct ztest_cb_list {
+ kmutex_t zcl_callbacks_lock;
+ list_t zcl_callbacks;
+} ztest_cb_list_t;
+
+/*
+ * Stuff we need to share writably between parent and child.
+ */
+typedef struct ztest_shared {
+ boolean_t zs_do_init;
+ hrtime_t zs_proc_start;
+ hrtime_t zs_proc_stop;
+ hrtime_t zs_thread_start;
+ hrtime_t zs_thread_stop;
+ hrtime_t zs_thread_kill;
+ uint64_t zs_enospc_count;
+ uint64_t zs_vdev_next_leaf;
+ uint64_t zs_vdev_aux;
+ uint64_t zs_alloc;
+ uint64_t zs_space;
+ uint64_t zs_splits;
+ uint64_t zs_mirrors;
+ uint64_t zs_metaslab_sz;
+ uint64_t zs_metaslab_df_alloc_threshold;
+ uint64_t zs_guid;
+} ztest_shared_t;
+
+#define ID_PARALLEL -1ULL
+
+static char ztest_dev_template[] = "%s/%s.%llua";
+static char ztest_aux_template[] = "%s/%s.%s.%llu";
+ztest_shared_t *ztest_shared;
+
+static spa_t *ztest_spa = NULL;
+static ztest_ds_t *ztest_ds;
+
+static kmutex_t ztest_vdev_lock;
+static boolean_t ztest_device_removal_active = B_FALSE;
+static boolean_t ztest_pool_scrubbed = B_FALSE;
+static kmutex_t ztest_checkpoint_lock;
+
+/*
+ * The ztest_name_lock protects the pool and dataset namespace used by
+ * the individual tests. To modify the namespace, consumers must grab
+ * this lock as writer. Grabbing the lock as reader will ensure that the
+ * namespace does not change while the lock is held.
+ */
+static pthread_rwlock_t ztest_name_lock;
+
+static boolean_t ztest_dump_core = B_TRUE;
+static boolean_t ztest_exiting;
+
+/* Global commit callback list */
+static ztest_cb_list_t zcl;
+/* Commit cb delay */
+static uint64_t zc_min_txg_delay = UINT64_MAX;
+static int zc_cb_counter = 0;
+
+/*
+ * Minimum number of commit callbacks that need to be registered for us to check
+ * whether the minimum txg delay is acceptable.
+ */
+#define ZTEST_COMMIT_CB_MIN_REG 100
+
+/*
+ * If a number of txgs equal to this threshold have been created after a commit
+ * callback has been registered but not called, then we assume there is an
+ * implementation bug.
+ */
+#define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000)
+
+enum ztest_object {
+ ZTEST_META_DNODE = 0,
+ ZTEST_DIROBJ,
+ ZTEST_OBJECTS
+};
+
+static void usage(boolean_t) __NORETURN;
+static int ztest_scrub_impl(spa_t *spa);
+
+/*
+ * These libumem hooks provide a reasonable set of defaults for the allocator's
+ * debugging facilities.
+ */
+const char *
+_umem_debug_init(void)
+{
+ return ("default,verbose"); /* $UMEM_DEBUG setting */
+}
+
+const char *
+_umem_logging_init(void)
+{
+ return ("fail,contents"); /* $UMEM_LOGGING setting */
+}
+
+static void
+dump_debug_buffer(void)
+{
+ ssize_t ret __attribute__((unused));
+
+ if (!ztest_opts.zo_dump_dbgmsg)
+ return;
+
+ /*
+ * We use write() instead of printf() so that this function
+ * is safe to call from a signal handler.
+ */
+ ret = write(STDOUT_FILENO, "\n", 1);
+ zfs_dbgmsg_print("ztest");
+}
+
+#define BACKTRACE_SZ 100
+
+static void
+sig_handler(int signo)
+{
+ struct sigaction action;
+#ifdef __GLIBC__ /* backtrace() is a GNU extension */
+ int nptrs;
+ void *buffer[BACKTRACE_SZ];
+
+ nptrs = backtrace(buffer, BACKTRACE_SZ);
+ backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO);
+#endif
+ dump_debug_buffer();
+
+ /*
+ * Restore default action and re-raise signal so SIGSEGV and
+ * SIGABRT can trigger a core dump.
+ */
+ action.sa_handler = SIG_DFL;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+ (void) sigaction(signo, &action, NULL);
+ raise(signo);
+}
+
+#define FATAL_MSG_SZ 1024
+
+char *fatal_msg;
+
+static void
+fatal(int do_perror, char *message, ...)
+{
+ va_list args;
+ int save_errno = errno;
+ char *buf;
+
+ (void) fflush(stdout);
+ buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL);
+
+ va_start(args, message);
+ (void) snprintf(buf, FATAL_MSG_SZ, "ztest: ");
+ /* LINTED */
+ (void) vsnprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
+ message, args);
+ va_end(args);
+ if (do_perror) {
+ (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
+ ": %s", strerror(save_errno));
+ }
+ (void) fprintf(stderr, "%s\n", buf);
+ fatal_msg = buf; /* to ease debugging */
+
+ if (ztest_dump_core)
+ abort();
+ else
+ dump_debug_buffer();
+
+ exit(3);
+}
+
+static int
+str2shift(const char *buf)
+{
+ const char *ends = "BKMGTPEZ";
+ int i;
+
+ if (buf[0] == '\0')
+ return (0);
+ for (i = 0; i < strlen(ends); i++) {
+ if (toupper(buf[0]) == ends[i])
+ break;
+ }
+ if (i == strlen(ends)) {
+ (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
+ buf);
+ usage(B_FALSE);
+ }
+ if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
+ return (10*i);
+ }
+ (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
+ usage(B_FALSE);
+ /* NOTREACHED */
+}
+
+static uint64_t
+nicenumtoull(const char *buf)
+{
+ char *end;
+ uint64_t val;
+
+ val = strtoull(buf, &end, 0);
+ if (end == buf) {
+ (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
+ usage(B_FALSE);
+ } else if (end[0] == '.') {
+ double fval = strtod(buf, &end);
+ fval *= pow(2, str2shift(end));
+ /*
+ * UINT64_MAX is not exactly representable as a double.
+ * The closest representation is UINT64_MAX + 1, so we
+ * use a >= comparison instead of > for the bounds check.
+ */
+ if (fval >= (double)UINT64_MAX) {
+ (void) fprintf(stderr, "ztest: value too large: %s\n",
+ buf);
+ usage(B_FALSE);
+ }
+ val = (uint64_t)fval;
+ } else {
+ int shift = str2shift(end);
+ if (shift >= 64 || (val << shift) >> shift != val) {
+ (void) fprintf(stderr, "ztest: value too large: %s\n",
+ buf);
+ usage(B_FALSE);
+ }
+ val <<= shift;
+ }
+ return (val);
+}
+
+static void
+usage(boolean_t requested)
+{
+ const ztest_shared_opts_t *zo = &ztest_opts_defaults;
+
+ char nice_vdev_size[NN_NUMBUF_SZ];
+ char nice_force_ganging[NN_NUMBUF_SZ];
+ FILE *fp = requested ? stdout : stderr;
+
+ nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size));
+ nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging,
+ sizeof (nice_force_ganging));
+
+ (void) fprintf(fp, "Usage: %s\n"
+ "\t[-v vdevs (default: %llu)]\n"
+ "\t[-s size_of_each_vdev (default: %s)]\n"
+ "\t[-a alignment_shift (default: %d)] use 0 for random\n"
+ "\t[-m mirror_copies (default: %d)]\n"
+ "\t[-r raidz_disks (default: %d)]\n"
+ "\t[-R raidz_parity (default: %d)]\n"
+ "\t[-d datasets (default: %d)]\n"
+ "\t[-t threads (default: %d)]\n"
+ "\t[-g gang_block_threshold (default: %s)]\n"
+ "\t[-i init_count (default: %d)] initialize pool i times\n"
+ "\t[-k kill_percentage (default: %llu%%)]\n"
+ "\t[-p pool_name (default: %s)]\n"
+ "\t[-f dir (default: %s)] file directory for vdev files\n"
+ "\t[-M] Multi-host simulate pool imported on remote host\n"
+ "\t[-V] verbose (use multiple times for ever more blather)\n"
+ "\t[-E] use existing pool instead of creating new one\n"
+ "\t[-T time (default: %llu sec)] total run time\n"
+ "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
+ "\t[-P passtime (default: %llu sec)] time per pass\n"
+ "\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
+ "\t[-C vdev class state (default: random)] special=on|off|random\n"
+ "\t[-o variable=value] ... set global variable to an unsigned\n"
+ "\t 32-bit integer value\n"
+ "\t[-G dump zfs_dbgmsg buffer before exiting due to an error\n"
+ "\t[-h] (print help)\n"
+ "",
+ zo->zo_pool,
+ (u_longlong_t)zo->zo_vdevs, /* -v */
+ nice_vdev_size, /* -s */
+ zo->zo_ashift, /* -a */
+ zo->zo_mirrors, /* -m */
+ zo->zo_raidz, /* -r */
+ zo->zo_raidz_parity, /* -R */
+ zo->zo_datasets, /* -d */
+ zo->zo_threads, /* -t */
+ nice_force_ganging, /* -g */
+ zo->zo_init, /* -i */
+ (u_longlong_t)zo->zo_killrate, /* -k */
+ zo->zo_pool, /* -p */
+ zo->zo_dir, /* -f */
+ (u_longlong_t)zo->zo_time, /* -T */
+ (u_longlong_t)zo->zo_maxloops, /* -F */
+ (u_longlong_t)zo->zo_passtime);
+ exit(requested ? 0 : 1);
+}
+
+
+static void
+ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo)
+{
+ char name[32];
+ char *value;
+ int state = ZTEST_VDEV_CLASS_RND;
+
+ (void) strlcpy(name, input, sizeof (name));
+
+ value = strchr(name, '=');
+ if (value == NULL) {
+ (void) fprintf(stderr, "missing value in property=value "
+ "'-C' argument (%s)\n", input);
+ usage(B_FALSE);
+ }
+ *(value) = '\0';
+ value++;
+
+ if (strcmp(value, "on") == 0) {
+ state = ZTEST_VDEV_CLASS_ON;
+ } else if (strcmp(value, "off") == 0) {
+ state = ZTEST_VDEV_CLASS_OFF;
+ } else if (strcmp(value, "random") == 0) {
+ state = ZTEST_VDEV_CLASS_RND;
+ } else {
+ (void) fprintf(stderr, "invalid property value '%s'\n", value);
+ usage(B_FALSE);
+ }
+
+ if (strcmp(name, "special") == 0) {
+ zo->zo_special_vdevs = state;
+ } else {
+ (void) fprintf(stderr, "invalid property name '%s'\n", name);
+ usage(B_FALSE);
+ }
+ if (zo->zo_verbose >= 3)
+ (void) printf("%s vdev state is '%s'\n", name, value);
+}
+
+static void
+process_options(int argc, char **argv)
+{
+ char *path;
+ ztest_shared_opts_t *zo = &ztest_opts;
+
+ int opt;
+ uint64_t value;
+ char altdir[MAXNAMELEN] = { 0 };
+
+ bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
+
+ while ((opt = getopt(argc, argv,
+ "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) {
+ value = 0;
+ switch (opt) {
+ case 'v':
+ case 's':
+ case 'a':
+ case 'm':
+ case 'r':
+ case 'R':
+ case 'd':
+ case 't':
+ case 'g':
+ case 'i':
+ case 'k':
+ case 'T':
+ case 'P':
+ case 'F':
+ value = nicenumtoull(optarg);
+ }
+ switch (opt) {
+ case 'v':
+ zo->zo_vdevs = value;
+ break;
+ case 's':
+ zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
+ break;
+ case 'a':
+ zo->zo_ashift = value;
+ break;
+ case 'm':
+ zo->zo_mirrors = value;
+ break;
+ case 'r':
+ zo->zo_raidz = MAX(1, value);
+ break;
+ case 'R':
+ zo->zo_raidz_parity = MIN(MAX(value, 1), 3);
+ break;
+ case 'd':
+ zo->zo_datasets = MAX(1, value);
+ break;
+ case 't':
+ zo->zo_threads = MAX(1, value);
+ break;
+ case 'g':
+ zo->zo_metaslab_force_ganging =
+ MAX(SPA_MINBLOCKSIZE << 1, value);
+ break;
+ case 'i':
+ zo->zo_init = value;
+ break;
+ case 'k':
+ zo->zo_killrate = value;
+ break;
+ case 'p':
+ (void) strlcpy(zo->zo_pool, optarg,
+ sizeof (zo->zo_pool));
+ break;
+ case 'f':
+ path = realpath(optarg, NULL);
+ if (path == NULL) {
+ (void) fprintf(stderr, "error: %s: %s\n",
+ optarg, strerror(errno));
+ usage(B_FALSE);
+ } else {
+ (void) strlcpy(zo->zo_dir, path,
+ sizeof (zo->zo_dir));
+ free(path);
+ }
+ break;
+ case 'M':
+ zo->zo_mmp_test = 1;
+ break;
+ case 'V':
+ zo->zo_verbose++;
+ break;
+ case 'E':
+ zo->zo_init = 0;
+ break;
+ case 'T':
+ zo->zo_time = value;
+ break;
+ case 'P':
+ zo->zo_passtime = MAX(1, value);
+ break;
+ case 'F':
+ zo->zo_maxloops = MAX(1, value);
+ break;
+ case 'B':
+ (void) strlcpy(altdir, optarg, sizeof (altdir));
+ break;
+ case 'C':
+ ztest_parse_name_value(optarg, zo);
+ break;
+ case 'o':
+ if (set_global_var(optarg) != 0)
+ usage(B_FALSE);
+ break;
+ case 'G':
+ zo->zo_dump_dbgmsg = 1;
+ break;
+ case 'h':
+ usage(B_TRUE);
+ break;
+ case '?':
+ default:
+ usage(B_FALSE);
+ break;
+ }
+ }
+
+ zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1);
+
+ zo->zo_vdevtime =
+ (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
+ UINT64_MAX >> 2);
+
+ if (strlen(altdir) > 0) {
+ char *cmd;
+ char *realaltdir;
+ char *bin;
+ char *ztest;
+ char *isa;
+ int isalen;
+
+ cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+ realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+ VERIFY(NULL != realpath(getexecname(), cmd));
+ if (0 != access(altdir, F_OK)) {
+ ztest_dump_core = B_FALSE;
+ fatal(B_TRUE, "invalid alternate ztest path: %s",
+ altdir);
+ }
+ VERIFY(NULL != realpath(altdir, realaltdir));
+
+ /*
+ * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest".
+ * We want to extract <isa> to determine if we should use
+ * 32 or 64 bit binaries.
+ */
+ bin = strstr(cmd, "/usr/bin/");
+ ztest = strstr(bin, "/ztest");
+ isa = bin + 9;
+ isalen = ztest - isa;
+ (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest),
+ "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa);
+ (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath),
+ "%s/usr/lib/%.*s", realaltdir, isalen, isa);
+
+ if (0 != access(zo->zo_alt_ztest, X_OK)) {
+ ztest_dump_core = B_FALSE;
+ fatal(B_TRUE, "invalid alternate ztest: %s",
+ zo->zo_alt_ztest);
+ } else if (0 != access(zo->zo_alt_libpath, X_OK)) {
+ ztest_dump_core = B_FALSE;
+ fatal(B_TRUE, "invalid alternate lib directory %s",
+ zo->zo_alt_libpath);
+ }
+
+ umem_free(cmd, MAXPATHLEN);
+ umem_free(realaltdir, MAXPATHLEN);
+ }
+}
+
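+/*
+ * Record the pool's allocation stats in shared memory, force the cached
+ * config out to disk, and then SIGKILL ourselves to simulate a crash.
+ */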
+static void
+ztest_kill(ztest_shared_t *zs)
+{
+ zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
+ zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));
+
+ /*
+ * Before we kill off ztest, make sure that the config is updated.
+ * See comment above spa_write_cachefile().
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE);
+ mutex_exit(&spa_namespace_lock);
+
+ (void) kill(getpid(), SIGKILL);
+}
+
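+/*
+ * Return a random value in the range [0, range), read from /dev/urandom.
+ */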
+static uint64_t
+ztest_random(uint64_t range)
+{
+ uint64_t r;
+
+ ASSERT3S(ztest_fd_rand, >=, 0);
+
+ if (range == 0)
+ return (0);
+
+ if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
+ fatal(1, "short read from /dev/urandom");
+
+ return (r % range);
+}
+
+/* ARGSUSED */
+static void
+ztest_record_enospc(const char *s)
+{
+ ztest_shared->zs_enospc_count++;
+}
+
+static uint64_t
+ztest_get_ashift(void)
+{
+ if (ztest_opts.zo_ashift == 0)
+ return (SPA_MINBLOCKSHIFT + ztest_random(5));
+ return (ztest_opts.zo_ashift);
+}
+
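+/*
+ * Build the nvlist for a single file vdev. When no path is supplied,
+ * one is generated from the dev/aux templates; when size is nonzero,
+ * the backing file is created (or truncated) to that size.
+ */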
+static nvlist_t *
+make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
+{
+ char *pathbuf;
+ uint64_t vdev;
+ nvlist_t *file;
+
+ pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+ if (ashift == 0)
+ ashift = ztest_get_ashift();
+
+ if (path == NULL) {
+ path = pathbuf;
+
+ if (aux != NULL) {
+ vdev = ztest_shared->zs_vdev_aux;
+ (void) snprintf(path, MAXPATHLEN,
+ ztest_aux_template, ztest_opts.zo_dir,
+ pool == NULL ? ztest_opts.zo_pool : pool,
+ aux, vdev);
+ } else {
+ vdev = ztest_shared->zs_vdev_next_leaf++;
+ (void) snprintf(path, MAXPATHLEN,
+ ztest_dev_template, ztest_opts.zo_dir,
+ pool == NULL ? ztest_opts.zo_pool : pool, vdev);
+ }
+ }
+
+ if (size != 0) {
+ int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
+ if (fd == -1)
+ fatal(1, "can't open %s", path);
+ if (ftruncate(fd, size) != 0)
+ fatal(1, "can't ftruncate %s", path);
+ (void) close(fd);
+ }
+
+ VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
+ VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
+ VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
+ umem_free(pathbuf, MAXPATHLEN);
+
+ return (file);
+}
+
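+/*
+ * Build a raidz vdev from 'r' file vdevs, or a plain file vdev when r < 2.
+ */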
+static nvlist_t *
+make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
+ uint64_t ashift, int r)
+{
+ nvlist_t *raidz, **child;
+ int c;
+
+ if (r < 2)
+ return (make_vdev_file(path, aux, pool, size, ashift));
+ child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);
+
+ for (c = 0; c < r; c++)
+ child[c] = make_vdev_file(path, aux, pool, size, ashift);
+
+ VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_RAIDZ) == 0);
+ VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
+ ztest_opts.zo_raidz_parity) == 0);
+ VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
+ child, r) == 0);
+
+ for (c = 0; c < r; c++)
+ nvlist_free(child[c]);
+
+ umem_free(child, r * sizeof (nvlist_t *));
+
+ return (raidz);
+}
+
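+/*
+ * Build an m-way mirror of raidz vdevs, or a single raidz vdev when m < 1.
+ */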
+static nvlist_t *
+make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
+ uint64_t ashift, int r, int m)
+{
+ nvlist_t *mirror, **child;
+ int c;
+
+ if (m < 1)
+ return (make_vdev_raidz(path, aux, pool, size, ashift, r));
+
+ child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);
+
+ for (c = 0; c < m; c++)
+ child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r);
+
+ VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_MIRROR) == 0);
+ VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
+ child, m) == 0);
+
+ for (c = 0; c < m; c++)
+ nvlist_free(child[c]);
+
+ umem_free(child, m * sizeof (nvlist_t *));
+
+ return (mirror);
+}
+
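+/*
+ * Build a root vdev with 't' top-level children, optionally marked as
+ * log devices or assigned to an allocation class.
+ */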
+static nvlist_t *
+make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
+ const char *class, int r, int m, int t)
+{
+ nvlist_t *root, **child;
+ int c;
+ boolean_t log;
+
+ ASSERT(t > 0);
+
+ log = (class != NULL && strcmp(class, "log") == 0);
+
+ child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);
+
+ for (c = 0; c < t; c++) {
+ child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
+ r, m);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ log) == 0);
+
+ if (class != NULL && class[0] != '\0') {
+ ASSERT(m > 1 || log); /* expecting a mirror */
+ VERIFY(nvlist_add_string(child[c],
+ ZPOOL_CONFIG_ALLOCATION_BIAS, class) == 0);
+ }
+ }
+
+ VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
+ VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
+ child, t) == 0);
+
+ for (c = 0; c < t; c++)
+ nvlist_free(child[c]);
+
+ umem_free(child, t * sizeof (nvlist_t *));
+
+ return (root);
+}
+
+/*
+ * Return a random spa version in the range
+ * [initial_version, SPA_VERSION_FEATURES].
+ */
+static uint64_t
+ztest_random_spa_version(uint64_t initial_version)
+{
+ uint64_t version = initial_version;
+
+ if (version <= SPA_VERSION_BEFORE_FEATURES) {
+ version = version +
+ ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
+ }
+
+ if (version > SPA_VERSION_BEFORE_FEATURES)
+ version = SPA_VERSION_FEATURES;
+
+ ASSERT(SPA_VERSION_IS_SUPPORTED(version));
+ return (version);
+}
+
+static int
+ztest_random_blocksize(void)
+{
+ ASSERT(ztest_spa->spa_max_ashift != 0);
+
+ /*
+ * Choose a block size >= the ashift.
+ * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
+ */
+ int maxbs = SPA_OLD_MAXBLOCKSHIFT;
+ if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
+ maxbs = 20;
+ uint64_t block_shift =
+ ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
+ return (1 << (SPA_MINBLOCKSHIFT + block_shift));
+}
+
+static int
+ztest_random_dnodesize(void)
+{
+ int slots;
+ int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT;
+
+ if (max_slots == DNODE_MIN_SLOTS)
+ return (DNODE_MIN_SIZE);
+
+ /*
+ * Weight the random distribution more heavily toward smaller
+ * dnode sizes since that is more likely to reflect real-world
+ * usage.
+ */
+ ASSERT3U(max_slots, >, 4);
+ switch (ztest_random(10)) {
+ case 0:
+ slots = 5 + ztest_random(max_slots - 4);
+ break;
+ case 1 ... 4:
+ slots = 2 + ztest_random(3);
+ break;
+ default:
+ slots = 1;
+ break;
+ }
+
+ return (slots << DNODE_SHIFT);
+}
+
+static int
+ztest_random_ibshift(void)
+{
+ return (DN_MIN_INDBLKSHIFT +
+ ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
+}
+
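+/*
+ * Pick a random top-level vdev index, skipping vdevs that are not
+ * concrete and, unless log_ok is set, log devices.
+ */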
+static uint64_t
+ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
+{
+ uint64_t top;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *tvd;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ do {
+ top = ztest_random(rvd->vdev_children);
+ tvd = rvd->vdev_child[top];
+ } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) ||
+ tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);
+
+ return (top);
+}
+
+static uint64_t
+ztest_random_dsl_prop(zfs_prop_t prop)
+{
+ uint64_t value;
+
+ do {
+ value = zfs_prop_random_value(prop, ztest_random(-1ULL));
+ } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);
+
+ return (value);
+}
+
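+/*
+ * Set (or inherit) a dataset property and read back the resulting value.
+ * ENOSPC is tolerated and recorded; any other error trips an assertion.
+ */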
+static int
+ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
+ boolean_t inherit)
+{
+ const char *propname = zfs_prop_to_name(prop);
+ const char *valname;
+ char *setpoint;
+ uint64_t curval;
+ int error;
+
+ error = dsl_prop_set_int(osname, propname,
+ (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);
+
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ return (error);
+ }
+ ASSERT0(error);
+
+ setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+ VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));
+
+ if (ztest_opts.zo_verbose >= 6) {
+ int err;
+
+ err = zfs_prop_index_to_string(prop, curval, &valname);
+ if (err)
+ (void) printf("%s %s = %llu at '%s'\n", osname,
+ propname, (unsigned long long)curval, setpoint);
+ else
+ (void) printf("%s %s = %s at '%s'\n",
+ osname, propname, valname, setpoint);
+ }
+ umem_free(setpoint, MAXPATHLEN);
+
+ return (error);
+}
+
+static int
+ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
+{
+ spa_t *spa = ztest_spa;
+ nvlist_t *props = NULL;
+ int error;
+
+ VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);
+
+ error = spa_prop_set(spa, props);
+
+ nvlist_free(props);
+
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ return (error);
+ }
+ ASSERT0(error);
+
+ return (error);
+}
+
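+/*
+ * Own an objset, loading wrapping keys for encrypted datasets as needed.
+ * On EACCES we walk up toward the encryption root, loading keys until
+ * the own succeeds or we run out of parents.
+ */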
+static int
+ztest_dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
+{
+ int err;
+ char *cp = NULL;
+ char ddname[ZFS_MAX_DATASET_NAME_LEN];
+
+ strcpy(ddname, name);
+ cp = strchr(ddname, '@');
+ if (cp != NULL)
+ *cp = '\0';
+
+ err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
+ while (decrypt && err == EACCES) {
+ dsl_crypto_params_t *dcp;
+ nvlist_t *crypto_args = fnvlist_alloc();
+
+ fnvlist_add_uint8_array(crypto_args, "wkeydata",
+ (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN);
+ VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
+ crypto_args, &dcp));
+ err = spa_keystore_load_wkey(ddname, dcp, B_FALSE);
+ dsl_crypto_params_free(dcp, B_FALSE);
+ fnvlist_free(crypto_args);
+
+ if (err == EINVAL) {
+ /*
+ * We couldn't load a key for this dataset so try
+ * the parent. This loop will eventually hit the
+ * encryption root since ztest only makes clones
+ * as children of their origin datasets.
+ */
+ cp = strrchr(ddname, '/');
+ if (cp == NULL)
+ return (err);
+
+ *cp = '\0';
+ err = EACCES;
+ continue;
+ } else if (err != 0) {
+ break;
+ }
+
+ err = dmu_objset_own(name, type, readonly, decrypt, tag, osp);
+ break;
+ }
+
+ return (err);
+}
+
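+/*
+ * Simple reader/writer lock (rll_t) helpers backing the per-object and
+ * per-range locks of a ztest dataset.
+ */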
+static void
+ztest_rll_init(rll_t *rll)
+{
+ rll->rll_writer = NULL;
+ rll->rll_readers = 0;
+ mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL);
+}
+
+static void
+ztest_rll_destroy(rll_t *rll)
+{
+ ASSERT(rll->rll_writer == NULL);
+ ASSERT(rll->rll_readers == 0);
+ mutex_destroy(&rll->rll_lock);
+ cv_destroy(&rll->rll_cv);
+}
+
+static void
+ztest_rll_lock(rll_t *rll, rl_type_t type)
+{
+ mutex_enter(&rll->rll_lock);
+
+ if (type == RL_READER) {
+ while (rll->rll_writer != NULL)
+ (void) cv_wait(&rll->rll_cv, &rll->rll_lock);
+ rll->rll_readers++;
+ } else {
+ while (rll->rll_writer != NULL || rll->rll_readers)
+ (void) cv_wait(&rll->rll_cv, &rll->rll_lock);
+ rll->rll_writer = curthread;
+ }
+
+ mutex_exit(&rll->rll_lock);
+}
+
+static void
+ztest_rll_unlock(rll_t *rll)
+{
+ mutex_enter(&rll->rll_lock);
+
+ if (rll->rll_writer) {
+ ASSERT(rll->rll_readers == 0);
+ rll->rll_writer = NULL;
+ } else {
+ ASSERT(rll->rll_readers != 0);
+ ASSERT(rll->rll_writer == NULL);
+ rll->rll_readers--;
+ }
+
+ if (rll->rll_writer == NULL && rll->rll_readers == 0)
+ cv_broadcast(&rll->rll_cv);
+
+ mutex_exit(&rll->rll_lock);
+}
+
+static void
+ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
+{
+ rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
+
+ ztest_rll_lock(rll, type);
+}
+
+static void
+ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
+{
+ rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
+
+ ztest_rll_unlock(rll);
+}
+
+static rl_t *
+ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
+ uint64_t size, rl_type_t type)
+{
+ uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
+ rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
+ rl_t *rl;
+
+ rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
+ rl->rl_object = object;
+ rl->rl_offset = offset;
+ rl->rl_size = size;
+ rl->rl_lock = rll;
+
+ ztest_rll_lock(rll, type);
+
+ return (rl);
+}
+
+static void
+ztest_range_unlock(rl_t *rl)
+{
+ rll_t *rll = rl->rl_lock;
+
+ ztest_rll_unlock(rll);
+
+ umem_free(rl, sizeof (*rl));
+}
+
+static void
+ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
+{
+ zd->zd_os = os;
+ zd->zd_zilog = dmu_objset_zil(os);
+ zd->zd_shared = szd;
+ dmu_objset_name(os, zd->zd_name);
+ int l;
+
+ if (zd->zd_shared != NULL)
+ zd->zd_shared->zd_seq = 0;
+
+ VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL));
+ mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+ ztest_rll_init(&zd->zd_object_lock[l]);
+
+ for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
+ ztest_rll_init(&zd->zd_range_lock[l]);
+}
+
+static void
+ztest_zd_fini(ztest_ds_t *zd)
+{
+ int l;
+
+ mutex_destroy(&zd->zd_dirobj_lock);
+ (void) pthread_rwlock_destroy(&zd->zd_zilog_lock);
+
+ for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
+ ztest_rll_destroy(&zd->zd_object_lock[l]);
+
+ for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
+ ztest_rll_destroy(&zd->zd_range_lock[l]);
+}
+
+#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
+
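+/*
+ * Assign a tx to a transaction group and return its txg, or 0 if the
+ * assignment failed (ERESTART or ENOSPC), in which case the tx has
+ * already been aborted.
+ */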
+static uint64_t
+ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
+{
+ uint64_t txg;
+ int error;
+
+ /*
+ * Attempt to assign tx to some transaction group.
+ */
+ error = dmu_tx_assign(tx, txg_how);
+ if (error) {
+ if (error == ERESTART) {
+ ASSERT(txg_how == TXG_NOWAIT);
+ dmu_tx_wait(tx);
+ } else {
+ ASSERT3U(error, ==, ENOSPC);
+ ztest_record_enospc(tag);
+ }
+ dmu_tx_abort(tx);
+ return (0);
+ }
+ txg = dmu_tx_get_txg(tx);
+ ASSERT(txg != 0);
+ return (txg);
+}
+
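+/*
+ * Block tags embed the objset, object, offset, generation, and txg of a
+ * write so that later reads can verify the data they find.
+ */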
+static void
+ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+ uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
+ uint64_t crtxg)
+{
+ bt->bt_magic = BT_MAGIC;
+ bt->bt_objset = dmu_objset_id(os);
+ bt->bt_object = object;
+ bt->bt_dnodesize = dnodesize;
+ bt->bt_offset = offset;
+ bt->bt_gen = gen;
+ bt->bt_txg = txg;
+ bt->bt_crtxg = crtxg;
+}
+
+static void
+ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
+ uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg,
+ uint64_t crtxg)
+{
+ ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
+ ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
+ ASSERT3U(bt->bt_object, ==, object);
+ ASSERT3U(bt->bt_dnodesize, ==, dnodesize);
+ ASSERT3U(bt->bt_offset, ==, offset);
+ ASSERT3U(bt->bt_gen, <=, gen);
+ ASSERT3U(bt->bt_txg, <=, txg);
+ ASSERT3U(bt->bt_crtxg, ==, crtxg);
+}
+
+static ztest_block_tag_t *
+ztest_bt_bonus(dmu_buf_t *db)
+{
+ dmu_object_info_t doi;
+ ztest_block_tag_t *bt;
+
+ dmu_object_info_from_db(db, &doi);
+ ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
+ ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
+ bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));
+
+ return (bt);
+}
+
+/*
+ * Generate a token to fill up unused bonus buffer space. Try to make
+ * it unique to the object, generation, and offset to verify that data
+ * is not getting overwritten by data from other dnodes.
+ */
+#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \
+ (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset))
+
+/*
+ * Fill up the unused bonus buffer region before the block tag with a
+ * verifiable pattern. Filling the whole bonus area with non-zero data
+ * helps ensure that all dnode traversal code properly skips the
+ * interior regions of large dnodes.
+ */
+static void
+ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
+ objset_t *os, uint64_t gen)
+{
+ uint64_t *bonusp;
+
+ ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8));
+
+ for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
+ uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
+ gen, bonusp - (uint64_t *)db->db_data);
+ *bonusp = token;
+ }
+}
+
+/*
+ * Verify that the unused area of a bonus buffer is filled with the
+ * expected tokens.
+ */
+static void
+ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj,
+ objset_t *os, uint64_t gen)
+{
+ uint64_t *bonusp;
+
+ for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) {
+ uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os),
+ gen, bonusp - (uint64_t *)db->db_data);
+ VERIFY3U(*bonusp, ==, token);
+ }
+}
+
+/*
+ * ZIL logging ops
+ */
+
+#define lrz_type lr_mode
+#define lrz_blocksize lr_uid
+#define lrz_ibshift lr_gid
+#define lrz_bonustype lr_rdev
+#define lrz_dnodesize lr_crtime[1]
+
+static void
+ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
+{
+ char *name = (void *)(lr + 1); /* name follows lr */
+ size_t namesize = strlen(name) + 1;
+ itx_t *itx;
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return;
+
+ itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) + namesize - sizeof (lr_t));
+
+ zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+static void
+ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
+{
+ char *name = (void *)(lr + 1); /* name follows lr */
+ size_t namesize = strlen(name) + 1;
+ itx_t *itx;
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return;
+
+ itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) + namesize - sizeof (lr_t));
+
+ itx->itx_oid = object;
+ zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+static void
+ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
+{
+ itx_t *itx;
+ itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return;
+
+ if (lr->lr_length > zil_max_log_data(zd->zd_zilog))
+ write_state = WR_INDIRECT;
+
+ itx = zil_itx_create(TX_WRITE,
+ sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));
+
+ if (write_state == WR_COPIED &&
+ dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
+ ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
+ zil_itx_destroy(itx);
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ write_state = WR_NEED_COPY;
+ }
+ itx->itx_private = zd;
+ itx->itx_wr_state = write_state;
+ itx->itx_sync = (ztest_random(8) == 0);
+
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) - sizeof (lr_t));
+
+ zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+static void
+ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
+{
+ itx_t *itx;
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return;
+
+ itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) - sizeof (lr_t));
+
+ itx->itx_sync = B_FALSE;
+ zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+static void
+ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
+{
+ itx_t *itx;
+
+ if (zil_replaying(zd->zd_zilog, tx))
+ return;
+
+ itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
+ bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
+ sizeof (*lr) - sizeof (lr_t));
+
+ itx->itx_sync = B_FALSE;
+ zil_itx_assign(zd->zd_zilog, itx, tx);
+}
+
+/*
+ * ZIL replay ops
+ */
+static int
+ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap)
+{
+ ztest_ds_t *zd = arg1;
+ lr_create_t *lr = arg2;
+ char *name = (void *)(lr + 1); /* name follows lr */
+ objset_t *os = zd->zd_os;
+ ztest_block_tag_t *bbt;
+ dmu_buf_t *db;
+ dmu_tx_t *tx;
+ uint64_t txg;
+ int error = 0;
+ int bonuslen;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+ ASSERT(name[0] != '\0');
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);
+
+ if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ } else {
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ }
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0)
+ return (ENOSPC);
+
+ ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);
+ bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize);
+
+ if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
+ if (lr->lr_foid == 0) {
+ lr->lr_foid = zap_create_dnsize(os,
+ lr->lrz_type, lr->lrz_bonustype,
+ bonuslen, lr->lrz_dnodesize, tx);
+ } else {
+ error = zap_create_claim_dnsize(os, lr->lr_foid,
+ lr->lrz_type, lr->lrz_bonustype,
+ bonuslen, lr->lrz_dnodesize, tx);
+ }
+ } else {
+ if (lr->lr_foid == 0) {
+ lr->lr_foid = dmu_object_alloc_dnsize(os,
+ lr->lrz_type, 0, lr->lrz_bonustype,
+ bonuslen, lr->lrz_dnodesize, tx);
+ } else {
+ error = dmu_object_claim_dnsize(os, lr->lr_foid,
+ lr->lrz_type, 0, lr->lrz_bonustype,
+ bonuslen, lr->lrz_dnodesize, tx);
+ }
+ }
+
+ if (error) {
+ ASSERT3U(error, ==, EEXIST);
+ ASSERT(zd->zd_zilog->zl_replay);
+ dmu_tx_commit(tx);
+ return (error);
+ }
+
+ ASSERT(lr->lr_foid != 0);
+
+ if (lr->lrz_type != DMU_OT_ZAP_OTHER)
+ VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
+ lr->lrz_blocksize, lr->lrz_ibshift, tx));
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+ bbt = ztest_bt_bonus(db);
+ dmu_buf_will_dirty(db, tx);
+ ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL,
+ lr->lr_gen, txg, txg);
+ ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen);
+ dmu_buf_rele(db, FTAG);
+
+ VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
+ &lr->lr_foid, tx));
+
+ (void) ztest_log_create(zd, tx, lr);
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+static int
+ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
+{
+ ztest_ds_t *zd = arg1;
+ lr_remove_t *lr = arg2;
+ char *name = (void *)(lr + 1); /* name follows lr */
+ objset_t *os = zd->zd_os;
+ dmu_object_info_t doi;
+ dmu_tx_t *tx;
+ uint64_t object, txg;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ASSERT(lr->lr_doid == ZTEST_DIROBJ);
+ ASSERT(name[0] != '\0');
+
+ VERIFY3U(0, ==,
+ zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
+ ASSERT(object != 0);
+
+ ztest_object_lock(zd, object, RL_WRITER);
+
+ VERIFY3U(0, ==, dmu_object_info(os, object, &doi));
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
+ dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ ztest_object_unlock(zd, object);
+ return (ENOSPC);
+ }
+
+ if (doi.doi_type == DMU_OT_ZAP_OTHER) {
+ VERIFY3U(0, ==, zap_destroy(os, object, tx));
+ } else {
+ VERIFY3U(0, ==, dmu_object_free(os, object, tx));
+ }
+
+ VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
+
+ (void) ztest_log_remove(zd, tx, lr, object);
+
+ dmu_tx_commit(tx);
+
+ ztest_object_unlock(zd, object);
+
+ return (0);
+}
+
+static int
+ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap)
+{
+ ztest_ds_t *zd = arg1;
+ lr_write_t *lr = arg2;
+ objset_t *os = zd->zd_os;
+ void *data = lr + 1; /* data follows lr */
+ uint64_t offset, length;
+ ztest_block_tag_t *bt = data;
+ ztest_block_tag_t *bbt;
+ uint64_t gen, txg, lrtxg, crtxg;
+ dmu_object_info_t doi;
+ dmu_tx_t *tx;
+ dmu_buf_t *db;
+ arc_buf_t *abuf = NULL;
+ rl_t *rl;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ }
+
+ if (bt->bt_magic == BSWAP_64(BT_MAGIC))
+ byteswap_uint64_array(bt, sizeof (*bt));
+
+ if (bt->bt_magic != BT_MAGIC)
+ bt = NULL;
+
+ ztest_object_lock(zd, lr->lr_foid, RL_READER);
+ rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+ dmu_object_info_from_db(db, &doi);
+
+ bbt = ztest_bt_bonus(db);
+ ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ gen = bbt->bt_gen;
+ crtxg = bbt->bt_crtxg;
+ lrtxg = lr->lr_common.lrc_txg;
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_write(tx, lr->lr_foid, offset, length);
+
+ if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
+ P2PHASE(offset, length) == 0)
+ abuf = dmu_request_arcbuf(db, length);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ if (abuf != NULL)
+ dmu_return_arcbuf(abuf);
+ dmu_buf_rele(db, FTAG);
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+ return (ENOSPC);
+ }
+
+ if (bt != NULL) {
+ /*
+ * Usually, verify the old data before writing new data --
+ * but not always, because we also want to verify correct
+ * behavior when the data was not recently read into cache.
+ */
+ ASSERT(offset % doi.doi_data_block_size == 0);
+ if (ztest_random(4) != 0) {
+ int prefetch = ztest_random(2) ?
+ DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
+ ztest_block_tag_t rbt;
+
+ VERIFY(dmu_read(os, lr->lr_foid, offset,
+ sizeof (rbt), &rbt, prefetch) == 0);
+ if (rbt.bt_magic == BT_MAGIC) {
+ ztest_bt_verify(&rbt, os, lr->lr_foid, 0,
+ offset, gen, txg, crtxg);
+ }
+ }
+
+ /*
+ * Writes can appear to be newer than the bonus buffer because
+ * the ztest_get_data() callback does a dmu_read() of the
+ * open-context data, which may be different than the data
+ * as it was when the write was generated.
+ */
+ if (zd->zd_zilog->zl_replay) {
+ ztest_bt_verify(bt, os, lr->lr_foid, 0, offset,
+ MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
+ bt->bt_crtxg);
+ }
+
+ /*
+ * Set the bt's gen/txg to the bonus buffer's gen/txg
+ * so that all of the usual ASSERTs will work.
+ */
+ ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg,
+ crtxg);
+ }
+
+ if (abuf == NULL) {
+ dmu_write(os, lr->lr_foid, offset, length, data, tx);
+ } else {
+ bcopy(data, abuf->b_data, length);
+ dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx);
+ }
+
+ (void) ztest_log_write(zd, tx, lr);
+
+ dmu_buf_rele(db, FTAG);
+
+ dmu_tx_commit(tx);
+
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+
+ return (0);
+}
+
+static int
+ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
+{
+ ztest_ds_t *zd = arg1;
+ lr_truncate_t *lr = arg2;
+ objset_t *os = zd->zd_os;
+ dmu_tx_t *tx;
+ uint64_t txg;
+ rl_t *rl;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ztest_object_lock(zd, lr->lr_foid, RL_READER);
+ rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
+ RL_WRITER);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+ return (ENOSPC);
+ }
+
+ VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
+ lr->lr_length, tx) == 0);
+
+ (void) ztest_log_truncate(zd, tx, lr);
+
+ dmu_tx_commit(tx);
+
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, lr->lr_foid);
+
+ return (0);
+}
+
+static int
+ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
+{
+ ztest_ds_t *zd = arg1;
+ lr_setattr_t *lr = arg2;
+ objset_t *os = zd->zd_os;
+ dmu_tx_t *tx;
+ dmu_buf_t *db;
+ ztest_block_tag_t *bbt;
+ uint64_t txg, lrtxg, crtxg, dnodesize;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ ztest_object_lock(zd, lr->lr_foid, RL_WRITER);
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, lr->lr_foid);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ if (txg == 0) {
+ dmu_buf_rele(db, FTAG);
+ ztest_object_unlock(zd, lr->lr_foid);
+ return (ENOSPC);
+ }
+
+ bbt = ztest_bt_bonus(db);
+ ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ crtxg = bbt->bt_crtxg;
+ lrtxg = lr->lr_common.lrc_txg;
+ dnodesize = bbt->bt_dnodesize;
+
+ if (zd->zd_zilog->zl_replay) {
+ ASSERT(lr->lr_size != 0);
+ ASSERT(lr->lr_mode != 0);
+ ASSERT(lrtxg != 0);
+ } else {
+ /*
+ * Randomly change the size and increment the generation.
+ */
+ lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
+ sizeof (*bbt);
+ lr->lr_mode = bbt->bt_gen + 1;
+ ASSERT(lrtxg == 0);
+ }
+
+ /*
+ * Verify that the current bonus buffer is not newer than our txg.
+ */
+ ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
+ MAX(txg, lrtxg), crtxg);
+
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
+ ASSERT3U(lr->lr_size, <=, db->db_size);
+ VERIFY0(dmu_set_bonus(db, lr->lr_size, tx));
+ bbt = ztest_bt_bonus(db);
+
+ ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode,
+ txg, crtxg);
+ ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen);
+ dmu_buf_rele(db, FTAG);
+
+ (void) ztest_log_setattr(zd, tx, lr);
+
+ dmu_tx_commit(tx);
+
+ ztest_object_unlock(zd, lr->lr_foid);
+
+ return (0);
+}
+
+zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
+ NULL, /* 0 no such transaction type */
+ ztest_replay_create, /* TX_CREATE */
+ NULL, /* TX_MKDIR */
+ NULL, /* TX_MKXATTR */
+ NULL, /* TX_SYMLINK */
+ ztest_replay_remove, /* TX_REMOVE */
+ NULL, /* TX_RMDIR */
+ NULL, /* TX_LINK */
+ NULL, /* TX_RENAME */
+ ztest_replay_write, /* TX_WRITE */
+ ztest_replay_truncate, /* TX_TRUNCATE */
+ ztest_replay_setattr, /* TX_SETATTR */
+ NULL, /* TX_ACL */
+ NULL, /* TX_CREATE_ACL */
+ NULL, /* TX_CREATE_ATTR */
+ NULL, /* TX_CREATE_ACL_ATTR */
+ NULL, /* TX_MKDIR_ACL */
+ NULL, /* TX_MKDIR_ATTR */
+ NULL, /* TX_MKDIR_ACL_ATTR */
+ NULL, /* TX_WRITE2 */
+};
+
+/*
+ * ZIL get_data callbacks
+ */
+
+/* ARGSUSED */
+static void
+ztest_get_done(zgd_t *zgd, int error)
+{
+ ztest_ds_t *zd = zgd->zgd_private;
+ uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object;
+
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ ztest_range_unlock((rl_t *)zgd->zgd_lr);
+ ztest_object_unlock(zd, object);
+
+ umem_free(zgd, sizeof (*zgd));
+}
+
+static int
+ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
+ zio_t *zio)
+{
+ ztest_ds_t *zd = arg;
+ objset_t *os = zd->zd_os;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ uint64_t txg = lr->lr_common.lrc_txg;
+ uint64_t crtxg;
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error;
+
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(zio, !=, NULL);
+ ASSERT3U(size, !=, 0);
+
+ ztest_object_lock(zd, object, RL_READER);
+ error = dmu_bonus_hold(os, object, FTAG, &db);
+ if (error) {
+ ztest_object_unlock(zd, object);
+ return (error);
+ }
+
+ crtxg = ztest_bt_bonus(db)->bt_crtxg;
+
+ if (crtxg == 0 || crtxg > txg) {
+ dmu_buf_rele(db, FTAG);
+ ztest_object_unlock(zd, object);
+ return (ENOENT);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ dmu_buf_rele(db, FTAG);
+ db = NULL;
+
+ zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
+ zgd->zgd_lwb = lwb;
+ zgd->zgd_private = zd;
+
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd,
+ object, offset, size, RL_READER);
+
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ ASSERT(error == 0);
+ } else {
+ size = doi.doi_data_block_size;
+ if (ISP2(size)) {
+ offset = P2ALIGN(offset, size);
+ } else {
+ ASSERT(offset < size);
+ offset = 0;
+ }
+
+ zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd,
+ object, offset, size, RL_READER);
+
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+
+ if (error == 0) {
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ ztest_get_done, zgd);
+
+ if (error == 0)
+ return (0);
+ }
+ }
+
+ ztest_get_done(zgd, error);
+
+ return (error);
+}
+
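+/*
+ * Allocate and free log records with an optional trailing name.
+ */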
+static void *
+ztest_lr_alloc(size_t lrsize, char *name)
+{
+ char *lr;
+ size_t namesize = name ? strlen(name) + 1 : 0;
+
+ lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
+
+ if (name)
+ bcopy(name, lr + lrsize, namesize);
+
+ return (lr);
+}
+
+static void
+ztest_lr_free(void *lr, size_t lrsize, char *name)
+{
+ size_t namesize = name ? strlen(name) + 1 : 0;
+
+ umem_free(lr, lrsize + namesize);
+}
+
+/*
+ * Look up a bunch of objects. Returns the number of objects not found.
+ */
+static int
+ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+ int missing = 0;
+ int error;
+ int i;
+
+ ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
+
+ for (i = 0; i < count; i++, od++) {
+ od->od_object = 0;
+ error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
+ sizeof (uint64_t), 1, &od->od_object);
+ if (error) {
+ ASSERT(error == ENOENT);
+ ASSERT(od->od_object == 0);
+ missing++;
+ } else {
+ dmu_buf_t *db;
+ ztest_block_tag_t *bbt;
+ dmu_object_info_t doi;
+
+ ASSERT(od->od_object != 0);
+ ASSERT(missing == 0); /* there should be no gaps */
+
+ ztest_object_lock(zd, od->od_object, RL_READER);
+ VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
+ od->od_object, FTAG, &db));
+ dmu_object_info_from_db(db, &doi);
+ bbt = ztest_bt_bonus(db);
+ ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
+ od->od_type = doi.doi_type;
+ od->od_blocksize = doi.doi_data_block_size;
+ od->od_gen = bbt->bt_gen;
+ dmu_buf_rele(db, FTAG);
+ ztest_object_unlock(zd, od->od_object);
+ }
+ }
+
+ return (missing);
+}
+
+static int
+ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+ int missing = 0;
+ int i;
+
+ ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
+
+ for (i = 0; i < count; i++, od++) {
+ if (missing) {
+ od->od_object = 0;
+ missing++;
+ continue;
+ }
+
+ lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+ lr->lr_doid = od->od_dir;
+ lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */
+ lr->lrz_type = od->od_crtype;
+ lr->lrz_blocksize = od->od_crblocksize;
+ lr->lrz_ibshift = ztest_random_ibshift();
+ lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
+ lr->lrz_dnodesize = od->od_crdnodesize;
+ lr->lr_gen = od->od_crgen;
+ lr->lr_crtime[0] = time(NULL);
+
+ if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
+ ASSERT(missing == 0);
+ od->od_object = 0;
+ missing++;
+ } else {
+ od->od_object = lr->lr_foid;
+ od->od_type = od->od_crtype;
+ od->od_blocksize = od->od_crblocksize;
+ od->od_gen = od->od_crgen;
+ ASSERT(od->od_object != 0);
+ }
+
+ ztest_lr_free(lr, sizeof (*lr), od->od_name);
+ }
+
+ return (missing);
+}
+
+static int
+ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
+{
+ int missing = 0;
+ int error;
+ int i;
+
+ ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
+
+ od += count - 1;
+
+ for (i = count - 1; i >= 0; i--, od--) {
+ if (missing) {
+ missing++;
+ continue;
+ }
+
+ /*
+ * No object was found.
+ */
+ if (od->od_object == 0)
+ continue;
+
+ lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
+
+ lr->lr_doid = od->od_dir;
+
+ if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
+ ASSERT3U(error, ==, ENOSPC);
+ missing++;
+ } else {
+ od->od_object = 0;
+ }
+ ztest_lr_free(lr, sizeof (*lr), od->od_name);
+ }
+
+ return (missing);
+}
+
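+/*
+ * ztest_write(), ztest_truncate(), and ztest_setattr() build a log record
+ * for the operation and feed it straight through the corresponding replay
+ * function, so open-context callers and ZIL replay exercise the same code.
+ */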
+static int
+ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
+ void *data)
+{
+ lr_write_t *lr;
+ int error;
+
+ lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);
+
+ lr->lr_foid = object;
+ lr->lr_offset = offset;
+ lr->lr_length = size;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr);
+
+ bcopy(data, lr + 1, size);
+
+ error = ztest_replay_write(zd, lr, B_FALSE);
+
+ ztest_lr_free(lr, sizeof (*lr) + size, NULL);
+
+ return (error);
+}
+
+static int
+ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+ lr_truncate_t *lr;
+ int error;
+
+ lr = ztest_lr_alloc(sizeof (*lr), NULL);
+
+ lr->lr_foid = object;
+ lr->lr_offset = offset;
+ lr->lr_length = size;
+
+ error = ztest_replay_truncate(zd, lr, B_FALSE);
+
+ ztest_lr_free(lr, sizeof (*lr), NULL);
+
+ return (error);
+}
+
+static int
+ztest_setattr(ztest_ds_t *zd, uint64_t object)
+{
+ lr_setattr_t *lr;
+ int error;
+
+ lr = ztest_lr_alloc(sizeof (*lr), NULL);
+
+ lr->lr_foid = object;
+ lr->lr_size = 0;
+ lr->lr_mode = 0;
+
+ error = ztest_replay_setattr(zd, lr, B_FALSE);
+
+ ztest_lr_free(lr, sizeof (*lr), NULL);
+
+ return (error);
+}
+
+static void
+ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
+{
+ objset_t *os = zd->zd_os;
+ dmu_tx_t *tx;
+ uint64_t txg;
+ rl_t *rl;
+
+ txg_wait_synced(dmu_objset_pool(os), 0);
+
+ ztest_object_lock(zd, object, RL_READER);
+ rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_write(tx, object, offset, size);
+
+ txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+
+ if (txg != 0) {
+ dmu_prealloc(os, object, offset, size, tx);
+ dmu_tx_commit(tx);
+ txg_wait_synced(dmu_objset_pool(os), txg);
+ } else {
+ (void) dmu_free_long_range(os, object, offset, size);
+ }
+
+ ztest_range_unlock(rl);
+ ztest_object_unlock(zd, object);
+}
+
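+/*
+ * Perform a randomly chosen I/O (block-tag write, pattern write, zeroes,
+ * truncate, setattr, or rewrite) against the given object and offset.
+ */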
+static void
+ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
+{
+ int err;
+ ztest_block_tag_t wbt;
+ dmu_object_info_t doi;
+ enum ztest_io_type io_type;
+ uint64_t blocksize;
+ void *data;
+
+ VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
+ blocksize = doi.doi_data_block_size;
+ data = umem_alloc(blocksize, UMEM_NOFAIL);
+
+ /*
+ * Pick an i/o type at random, biased toward writing block tags.
+ */
+ io_type = ztest_random(ZTEST_IO_TYPES);
+ if (ztest_random(2) == 0)
+ io_type = ZTEST_IO_WRITE_TAG;
+
+ (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock);
+
+ switch (io_type) {
+
+ case ZTEST_IO_WRITE_TAG:
+ ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize,
+ offset, 0, 0, 0);
+ (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
+ break;
+
+ case ZTEST_IO_WRITE_PATTERN:
+ (void) memset(data, 'a' + (object + offset) % 5, blocksize);
+ if (ztest_random(2) == 0) {
+ /*
+ * Induce fletcher2 collisions to ensure that
+ * zio_ddt_collision() detects and resolves them
+ * when using fletcher2-verify for deduplication.
+ */
+ ((uint64_t *)data)[0] ^= 1ULL << 63;
+ ((uint64_t *)data)[4] ^= 1ULL << 63;
+ }
+ (void) ztest_write(zd, object, offset, blocksize, data);
+ break;
+
+ case ZTEST_IO_WRITE_ZEROES:
+ bzero(data, blocksize);
+ (void) ztest_write(zd, object, offset, blocksize, data);
+ break;
+
+ case ZTEST_IO_TRUNCATE:
+ (void) ztest_truncate(zd, object, offset, blocksize);
+ break;
+
+ case ZTEST_IO_SETATTR:
+ (void) ztest_setattr(zd, object);
+ break;
+ default:
+ break;
+
+ case ZTEST_IO_REWRITE:
+ (void) pthread_rwlock_rdlock(&ztest_name_lock);
+ err = ztest_dsl_prop_set_uint64(zd->zd_name,
+ ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
+ B_FALSE);
+ VERIFY(err == 0 || err == ENOSPC);
+ err = ztest_dsl_prop_set_uint64(zd->zd_name,
+ ZFS_PROP_COMPRESSION,
+ ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
+ B_FALSE);
+ VERIFY(err == 0 || err == ENOSPC);
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+
+ VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
+ DMU_READ_NO_PREFETCH));
+
+ (void) ztest_write(zd, object, offset, blocksize, data);
+ break;
+ }
+
+ (void) pthread_rwlock_unlock(&zd->zd_zilog_lock);
+
+ umem_free(data, blocksize);
+}
+
+/*
+ * Initialize an object description template.
+ */
+static void
+ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
+ dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize,
+ uint64_t gen)
+{
+ od->od_dir = ZTEST_DIROBJ;
+ od->od_object = 0;
+
+ od->od_crtype = type;
+ od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
+ od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize();
+ od->od_crgen = gen;
+
+ od->od_type = DMU_OT_NONE;
+ od->od_blocksize = 0;
+ od->od_gen = 0;
+
+ (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
+ tag, (longlong_t)id, (u_longlong_t)index);
+}
+
+/*
+ * Look up or create the objects for a test using the od template.
+ * If the objects do not all exist, or if 'remove' is specified,
+ * remove any existing objects and create new ones. Otherwise,
+ * use the existing objects.
+ */
+static int
+ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
+{
+ int count = size / sizeof (*od);
+ int rv = 0;
+
+ mutex_enter(&zd->zd_dirobj_lock);
+ if ((ztest_lookup(zd, od, count) != 0 || remove) &&
+ (ztest_remove(zd, od, count) != 0 ||
+ ztest_create(zd, od, count) != 0))
+ rv = -1;
+ zd->zd_od = od;
+ mutex_exit(&zd->zd_dirobj_lock);
+
+ return (rv);
+}
+
+/* ARGSUSED */
+void
+ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
+{
+ zilog_t *zilog = zd->zd_zilog;
+
+ (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock);
+
+ zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
+
+ /*
+ * Remember the committed values in zd, which is in parent/child
+ * shared memory. If we die, the next iteration of ztest_run()
+ * will verify that the log really does contain this record.
+ */
+ mutex_enter(&zilog->zl_lock);
+ ASSERT(zd->zd_shared != NULL);
+ ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq);
+ zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq;
+ mutex_exit(&zilog->zl_lock);
+
+ (void) pthread_rwlock_unlock(&zd->zd_zilog_lock);
+}
+
+/*
+ * This function is designed to simulate the operations that occur during a
+ * mount/unmount operation. We hold the dataset across these operations in an
+ * attempt to expose any implicit assumptions about ZIL management.
+ */
+/* ARGSUSED */
+void
+ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os = zd->zd_os;
+
+ /*
+ * We hold the ztest_vdev_lock so we don't cause problems with
+ * other threads that wish to remove a log device, such as
+ * ztest_device_removal().
+ */
+ mutex_enter(&ztest_vdev_lock);
+
+ /*
+ * We grab the zd_dirobj_lock to ensure that no other thread is
+ * updating the zil (i.e. adding in-memory log records) and the
+ * zd_zilog_lock to block any I/O.
+ */
+ mutex_enter(&zd->zd_dirobj_lock);
+ (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock);
+
+ /* zfsvfs_teardown() */
+ zil_close(zd->zd_zilog);
+
+ /* zfsvfs_setup() */
+ VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
+ zil_replay(os, zd, ztest_replay_vector);
+
+ (void) pthread_rwlock_unlock(&zd->zd_zilog_lock);
+ mutex_exit(&zd->zd_dirobj_lock);
+ mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Verify that we can't destroy an active pool, create an existing pool,
+ * or create a pool with a bad vdev spec.
+ */
+/* ARGSUSED */
+void
+ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_opts_t *zo = &ztest_opts;
+ spa_t *spa;
+ nvlist_t *nvroot;
+
+ if (zo->zo_mmp_test)
+ return;
+
+ /*
+ * Attempt to create using a bad file.
+ */
+ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
+ VERIFY3U(ENOENT, ==,
+ spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
+ nvlist_free(nvroot);
+
+ /*
+ * Attempt to create using a bad mirror.
+ */
+ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1);
+ VERIFY3U(ENOENT, ==,
+ spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
+ nvlist_free(nvroot);
+
+ /*
+ * Attempt to create an existing pool. It shouldn't matter
+ * what's in the nvroot; we should fail with EEXIST.
+ */
+ (void) pthread_rwlock_rdlock(&ztest_name_lock);
+ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
+ VERIFY3U(EEXIST, ==,
+ spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL));
+ nvlist_free(nvroot);
+
+ /*
+ * We open a reference to the spa and then we try to export it
+ * expecting one of the following errors:
+ *
+ * EBUSY
+ * Because of the reference we just opened.
+ *
+ * ZFS_ERR_EXPORT_IN_PROGRESS
+ * For the case that there is another ztest thread doing
+ * an export concurrently.
+ */
+ VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
+ int error = spa_destroy(zo->zo_pool);
+ if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) {
+ fatal(0, "spa_destroy(%s) returned unexpected value %d",
+ spa->spa_name, error);
+ }
+ spa_close(spa, FTAG);
+
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+}
+
+/*
+ * Start and then stop the MMP threads to ensure the startup and shutdown code
+ * works properly. The actual multihost protection and property handling are
+ * tested by the ZFS Test Suite (ZTS).
+ */
+/* ARGSUSED */
+void
+ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_opts_t *zo = &ztest_opts;
+ spa_t *spa = ztest_spa;
+
+ if (zo->zo_mmp_test)
+ return;
+
+ /*
+ * Since enabling MMP involves setting a property, it cannot be done
+ * while the pool is suspended.
+ */
+ if (spa_suspended(spa))
+ return;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ mutex_enter(&spa->spa_props_lock);
+
+ zfs_multihost_fail_intervals = 0;
+
+ if (!spa_multihost(spa)) {
+ spa->spa_multihost = B_TRUE;
+ mmp_thread_start(spa);
+ }
+
+ mutex_exit(&spa->spa_props_lock);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
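+ /* Wake the MMP threads and wait for a txg to sync so they get a chance to issue writes. */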
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mmp_signal_all_threads();
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ mutex_enter(&spa->spa_props_lock);
+
+ if (spa_multihost(spa)) {
+ mmp_thread_stop(spa);
+ spa->spa_multihost = B_FALSE;
+ }
+
+ mutex_exit(&spa->spa_props_lock);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+/* ARGSUSED */
+void
+ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
+{
+ spa_t *spa;
+ uint64_t initial_version = SPA_VERSION_INITIAL;
+ uint64_t version, newversion;
+ nvlist_t *nvroot, *props;
+ char *name;
+
+ if (ztest_opts.zo_mmp_test)
+ return;
+
+ mutex_enter(&ztest_vdev_lock);
+ name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
+
+ /*
+ * Clean up from previous runs.
+ */
+ (void) spa_destroy(name);
+
+ nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
+ NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
+
+ /*
+ * If we're configuring a RAIDZ device, then make sure that the
+ * initial version is capable of supporting that feature.
+ */
+ switch (ztest_opts.zo_raidz_parity) {
+ case 0:
+ case 1:
+ initial_version = SPA_VERSION_INITIAL;
+ break;
+ case 2:
+ initial_version = SPA_VERSION_RAIDZ2;
+ break;
+ case 3:
+ initial_version = SPA_VERSION_RAIDZ3;
+ break;
+ }
+
+ /*
+ * Create a pool with a spa version that can be upgraded. Pick
+ * a value between initial_version and SPA_VERSION_BEFORE_FEATURES.
+ */
+ do {
+ version = ztest_random_spa_version(initial_version);
+ } while (version > SPA_VERSION_BEFORE_FEATURES);
+
+ props = fnvlist_alloc();
+ fnvlist_add_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_VERSION), version);
+ VERIFY3S(spa_create(name, nvroot, props, NULL, NULL), ==, 0);
+ fnvlist_free(nvroot);
+ fnvlist_free(props);
+
+ VERIFY3S(spa_open(name, &spa, FTAG), ==, 0);
+ VERIFY3U(spa_version(spa), ==, version);
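+ /* Pick a random target version strictly newer than the current one. */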
+ newversion = ztest_random_spa_version(version + 1);
+
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("upgrading spa version from %llu to %llu\n",
+ (u_longlong_t)version, (u_longlong_t)newversion);
+ }
+
+ spa_upgrade(spa, newversion);
+ VERIFY3U(spa_version(spa), >, version);
+ VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
+ zpool_prop_to_name(ZPOOL_PROP_VERSION)));
+ spa_close(spa, FTAG);
+
+ kmem_strfree(name);
+ mutex_exit(&ztest_vdev_lock);
+}
+
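+/*
+ * Attempt to checkpoint the pool, tolerating errors that are expected
+ * while other ztest threads remove devices or discard the checkpoint.
+ */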
+static void
+ztest_spa_checkpoint(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));
+
+ int error = spa_checkpoint(spa->spa_name);
+
+ switch (error) {
+ case 0:
+ case ZFS_ERR_DEVRM_IN_PROGRESS:
+ case ZFS_ERR_DISCARDING_CHECKPOINT:
+ case ZFS_ERR_CHECKPOINT_EXISTS:
+ break;
+ case ENOSPC:
+ ztest_record_enospc(FTAG);
+ break;
+ default:
+ fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error);
+ }
+}
+
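+/*
+ * Attempt to discard the pool checkpoint, tolerating the cases where a
+ * discard is already in progress or no checkpoint exists.
+ */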
+static void
+ztest_spa_discard_checkpoint(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&ztest_checkpoint_lock));
+
+ int error = spa_checkpoint_discard(spa->spa_name);
+
+ switch (error) {
+ case 0:
+ case ZFS_ERR_DISCARDING_CHECKPOINT:
+ case ZFS_ERR_NO_CHECKPOINT:
+ break;
+ default:
+ fatal(0, "spa_discard_checkpoint(%s) = %d",
+ spa->spa_name, error);
+ }
+}
+
+/* ARGSUSED */
+void
+ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id)
+{
+ spa_t *spa = ztest_spa;
+
+ mutex_enter(&ztest_checkpoint_lock);
+ if (ztest_random(2) == 0) {
+ ztest_spa_checkpoint(spa);
+ } else {
+ ztest_spa_discard_checkpoint(spa);
+ }
+ mutex_exit(&ztest_checkpoint_lock);
+}
+
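+/*
+ * Recursively search the vdev tree rooted at 'vd' for a vdev whose
+ * vdev_path matches 'path'; return NULL if no match is found.
+ */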
+static vdev_t *
+vdev_lookup_by_path(vdev_t *vd, const char *path)
+{
+ vdev_t *mvd;
+ int c;
+
+ if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
+ return (vd);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
+ NULL)
+ return (mvd);
+
+ return (NULL);
+}
+
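+/*
+ * Return the number of top-level vdevs; the caller must hold SCL_VDEV.
+ */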
+static int
+spa_num_top_vdevs(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV);
+ return (rvd->vdev_children);
+}
+
+/*
+ * Verify that vdev_add() works as expected.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = ztest_spa;
+ uint64_t leaves;
+ uint64_t guid;
+ nvlist_t *nvroot;
+ int error;
+
+ if (ztest_opts.zo_mmp_test)
+ return;
+
+ mutex_enter(&ztest_vdev_lock);
+ leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
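+ /* Start numbering any new leaf devices after the ones already in the pool. */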
+ ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;
+
+ /*
+ * If we have slogs then remove them 1/4 of the time.
+ */
+ if (spa_has_slogs(spa) && ztest_random(4) == 0) {
+ metaslab_group_t *mg;
+
+ /*
+ * Find the first real slog in the log allocation class.
+ */
+ mg = spa_log_class(spa)->mc_rotor;
+ while (!mg->mg_vd->vdev_islog)
+ mg = mg->mg_next;
+
+ guid = mg->mg_vd->vdev_guid;
+
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ /*
+ * We have to grab the ztest_name_lock as writer to
+ * prevent a race between removing a slog (dmu_objset_find)
+ * and destroying a dataset. Removing the slog will
+ * grab a reference on the dataset, which may cause
+ * dsl_destroy_head() to fail with EBUSY, thus
+ * leaving the dataset in an inconsistent state.
+ */
+ pthread_rwlock_wrlock(&ztest_name_lock);
+ error = spa_vdev_remove(spa, guid, B_FALSE);
+ pthread_rwlock_unlock(&ztest_name_lock);
+
+ switch (error) {
+ case 0:
+ case EEXIST: /* Generic zil_reset() error */
+ case EBUSY: /* Replay required */
+ case EACCES: /* Crypto key not loaded */
+ case ZFS_ERR_CHECKPOINT_EXISTS:
+ case ZFS_ERR_DISCARDING_CHECKPOINT:
+ break;
+ default:
+ fatal(0, "spa_vdev_remove() = %d", error);
+ }
+ } else {
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ /*
+ * Make 1/4 of the devices be log devices
+ */
+ nvroot = make_vdev_root(NULL, NULL, NULL,
+ ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ?
+ "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+
+ error = spa_vdev_add(spa, nvroot);
+ nvlist_free(nvroot);
+
+ switch (error) {
+ case 0:
+ break;
+ case ENOSPC:
+ ztest_record_enospc("spa_vdev_add");
+ break;
+ default:
+ fatal(0, "spa_vdev_add() = %d", error);
+ }
+ }
+
+ mutex_exit(&ztest_vdev_lock);
+}
+
+/* ARGSUSED */
+void
+ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = ztest_spa;
+ uint64_t leaves;
+ nvlist_t *nvroot;
+ const char *class = (ztest_random(2) == 0) ?
+ VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP;
+ int error;
+
+ /*
+ * By default add a special vdev 50% of the time
+ */
+ if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) ||
+ (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND &&
+ ztest_random(2) == 0)) {
+ return;
+ }
+
+ mutex_enter(&ztest_vdev_lock);
+
+ /* Only test with mirrors */
+ if (zs->zs_mirrors < 2) {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /* requires feature@allocation_classes */
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
+ class, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+
+ error = spa_vdev_add(spa, nvroot);
+ nvlist_free(nvroot);
+
+ if (error == ENOSPC)
+ ztest_record_enospc("spa_vdev_add");
+ else if (error != 0)
+ fatal(0, "spa_vdev_add() = %d", error);
+
+ /*
+ * 50% of the time allow small blocks in the special class
+ */
+ if (error == 0 &&
+ spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) {
+ if (ztest_opts.zo_verbose >= 3)
+ (void) printf("Enabling special VDEV small blocks\n");
+ (void) ztest_dsl_prop_set_uint64(zd->zd_name,
+ ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE);
+ }
+
+ mutex_exit(&ztest_vdev_lock);
+
+ if (ztest_opts.zo_verbose >= 3) {
+ metaslab_class_t *mc;
+
+ if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0)
+ mc = spa_special_class(spa);
+ else
+ mc = spa_dedup_class(spa);
+ (void) printf("Added a %s mirrored vdev (of %d)\n",
+ class, (int)mc->mc_groups);
+ }
+}
+
+/*
+ * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = ztest_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ spa_aux_vdev_t *sav;
+ char *aux;
+ char *path;
+ uint64_t guid = 0;
+ int error;
+
+ if (ztest_opts.zo_mmp_test)
+ return;
+
+ path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
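+ /* With equal probability, exercise either the hot spares or the L2ARC devices. */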
+ if (ztest_random(2) == 0) {
+ sav = &spa->spa_spares;
+ aux = ZPOOL_CONFIG_SPARES;
+ } else {
+ sav = &spa->spa_l2cache;
+ aux = ZPOOL_CONFIG_L2CACHE;
+ }
+
+ mutex_enter(&ztest_vdev_lock);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ if (sav->sav_count != 0 && ztest_random(4) == 0) {
+ /*
+ * Pick a random device to remove.
+ */
+ guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
+ } else {
+ /*
+ * Find an unused device we can add.
+ */
+ zs->zs_vdev_aux = 0;
+ for (;;) {
+ int c;
+ (void) snprintf(path, MAXPATHLEN, ztest_aux_template,
+ ztest_opts.zo_dir, ztest_opts.zo_pool, aux,
+ zs->zs_vdev_aux);
+ for (c = 0; c < sav->sav_count; c++)
+ if (strcmp(sav->sav_vdevs[c]->vdev_path,
+ path) == 0)
+ break;
+ if (c == sav->sav_count &&
+ vdev_lookup_by_path(rvd, path) == NULL)
+ break;
+ zs->zs_vdev_aux++;
+ }
+ }
+
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ if (guid == 0) {
+ /*
+ * Add a new device.
+ */
+ nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
+ (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1);
+ error = spa_vdev_add(spa, nvroot);
+
+ switch (error) {
+ case 0:
+ break;
+ default:
+ fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
+ }
+ nvlist_free(nvroot);
+ } else {
+ /*
+ * Remove an existing device. Sometimes, dirty its
+ * vdev state first to make sure we handle removal
+ * of devices that have pending state changes.
+ */
+ if (ztest_random(2) == 0)
+ (void) vdev_online(spa, guid, 0, NULL);
+
+ error = spa_vdev_remove(spa, guid, B_FALSE);
+
+ switch (error) {
+ case 0:
+ case EBUSY:
+ case ZFS_ERR_CHECKPOINT_EXISTS:
+ case ZFS_ERR_DISCARDING_CHECKPOINT:
+ break;
+ default:
+ fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
+ }
+ }
+
+ mutex_exit(&ztest_vdev_lock);
+
+ umem_free(path, MAXPATHLEN);
+}
+
+/*
+ * Split a pool if it has mirror top-level vdevs.
+ */
+/* ARGSUSED */
+void
+ztest_split_pool(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = ztest_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ nvlist_t *tree, **child, *config, *split, **schild;
+ uint_t c, children, schildren = 0, lastlogid = 0;
+ int error = 0;
+
+ if (ztest_opts.zo_mmp_test)
+ return;
+
+ mutex_enter(&ztest_vdev_lock);
+
+ /* ensure we have a usable config; mirrors of raidz aren't supported */
+ if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /* clean up the old pool, if any */
+ (void) spa_destroy("splitp");
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ /* generate a config from the existing config */
+ mutex_enter(&spa->spa_props_lock);
+ VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
+ &tree) == 0);
+ mutex_exit(&spa->spa_props_lock);
+
+ VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) == 0);
+
+ schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
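+ /*
+ * Build the list of children for the split: take the first side of
+ * each mirror, and replace log and hole vdevs with hole place-holders.
+ */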
+ for (c = 0; c < children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ nvlist_t **mchild;
+ uint_t mchildren;
+
+ if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) {
+ VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME,
+ 0) == 0);
+ VERIFY(nvlist_add_string(schild[schildren],
+ ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0);
+ VERIFY(nvlist_add_uint64(schild[schildren],
+ ZPOOL_CONFIG_IS_HOLE, 1) == 0);
+ if (lastlogid == 0)
+ lastlogid = schildren;
+ ++schildren;
+ continue;
+ }
+ lastlogid = 0;
+ VERIFY(nvlist_lookup_nvlist_array(child[c],
+ ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
+ VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
+ }
+
+ /* OK, create a config that can be used to split */
+ VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
+ VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
+ lastlogid != 0 ? lastlogid : schildren) == 0);
+
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);
+
+ for (c = 0; c < schildren; c++)
+ nvlist_free(schild[c]);
+ free(schild);
+ nvlist_free(split);
+
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ (void) pthread_rwlock_wrlock(&ztest_name_lock);
+ error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+
+ nvlist_free(config);
+
+ if (error == 0) {
+ (void) printf("successful split - results:\n");
+ mutex_enter(&spa_namespace_lock);
+ show_pool_stats(spa);
+ show_pool_stats(spa_lookup("splitp"));
+ mutex_exit(&spa_namespace_lock);
+ ++zs->zs_splits;
+ --zs->zs_mirrors;
+ }
+ mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Verify that we can attach and detach devices.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = ztest_spa;
+ spa_aux_vdev_t *sav = &spa->spa_spares;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *oldvd, *newvd, *pvd;
+ nvlist_t *root;
+ uint64_t leaves;
+ uint64_t leaf, top;
+ uint64_t ashift = ztest_get_ashift();
+ uint64_t oldguid, pguid;
+ uint64_t oldsize, newsize;
+ char *oldpath, *newpath;
+ int replacing;
+ int oldvd_has_siblings = B_FALSE;
+ int newvd_is_spare = B_FALSE;
+ int oldvd_is_log;
+ int error, expected_error;
+
+ if (ztest_opts.zo_mmp_test)
+ return;
+
+ oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+ newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+ mutex_enter(&ztest_vdev_lock);
+ leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ /*
+ * If a vdev is in the process of being removed, its removal may
+ * finish while we are in progress, leading to an unexpected error
+ * value. Don't bother trying to attach while we are in the middle
+ * of removal.
+ */
+ if (ztest_device_removal_active) {
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /*
+ * Decide whether to do an attach or a replace.
+ */
+ replacing = ztest_random(2);
+
+ /*
+ * Pick a random top-level vdev.
+ */
+ top = ztest_random_vdev_top(spa, B_TRUE);
+
+ /*
+ * Pick a random leaf within it.
+ */
+ leaf = ztest_random(leaves);
+
+ /*
+ * Locate this vdev.
+ */
+ oldvd = rvd->vdev_child[top];
+
+ /* pick a child from the mirror */
+ if (zs->zs_mirrors >= 1) {
+ ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
+ ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
+ oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
+ }
+
+ /* pick a child out of the raidz group */
+ if (ztest_opts.zo_raidz > 1) {
+ ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
+ ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
+ oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
+ }
+
+ /*
+ * If we're already doing an attach or replace, oldvd may be a
+ * mirror vdev -- in which case, pick a random child.
+ */
+ while (oldvd->vdev_children != 0) {
+ oldvd_has_siblings = B_TRUE;
+ ASSERT(oldvd->vdev_children >= 2);
+ oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
+ }
+
+ oldguid = oldvd->vdev_guid;
+ oldsize = vdev_get_min_asize(oldvd);
+ oldvd_is_log = oldvd->vdev_top->vdev_islog;
+ (void) strcpy(oldpath, oldvd->vdev_path);
+ pvd = oldvd->vdev_parent;
+ pguid = pvd->vdev_guid;
+
+ /*
+ * If oldvd has siblings, then half of the time, detach it. Prior
+ * to the detach the pool is scrubbed in order to prevent creating
+ * unrepairable blocks as a result of the data corruption injection.
+ */
+ if (oldvd_has_siblings && ztest_random(2) == 0) {
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ error = ztest_scrub_impl(spa);
+ if (error)
+ goto out;
+
+ error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
+ if (error != 0 && error != ENODEV && error != EBUSY &&
+ error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS &&
+ error != ZFS_ERR_DISCARDING_CHECKPOINT)
+ fatal(0, "detach (%s) returned %d", oldpath, error);
+ goto out;
+ }
+
+ /*
+ * For the new vdev, choose with equal probability between the two
+ * standard paths (ending in either 'a' or 'b') or a random hot spare.
+ */
+ if (sav->sav_count != 0 && ztest_random(3) == 0) {
+ newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
+ newvd_is_spare = B_TRUE;
+ (void) strcpy(newpath, newvd->vdev_path);
+ } else {
+ (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template,
+ ztest_opts.zo_dir, ztest_opts.zo_pool,
+ top * leaves + leaf);
+ if (ztest_random(2) == 0)
+ newpath[strlen(newpath) - 1] = 'b';
+ newvd = vdev_lookup_by_path(rvd, newpath);
+ }
+
+ if (newvd) {
+ /*
+ * Reopen to ensure the vdev's asize field isn't stale.
+ */
+ vdev_reopen(newvd);
+ newsize = vdev_get_min_asize(newvd);
+ } else {
+ /*
+ * Make newsize a little bigger or smaller than oldsize.
+ * If it's smaller, the attach should fail.
+ * If it's larger, and we're doing a replace,
+ * we should get dynamic LUN growth when we're done.
+ */
+ newsize = 10 * oldsize / (9 + ztest_random(3));
+ }
+
+ /*
+ * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
+ * unless it's a replace; in that case any non-replacing parent is OK.
+ *
+ * If newvd is already part of the pool, it should fail with EBUSY.
+ *
+ * If newvd is too small, it should fail with EOVERFLOW.
+ */
+ if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_root_ops && (!replacing ||
+ pvd->vdev_ops == &vdev_replacing_ops ||
+ pvd->vdev_ops == &vdev_spare_ops))
+ expected_error = ENOTSUP;
+ else if (newvd_is_spare && (!replacing || oldvd_is_log))
+ expected_error = ENOTSUP;
+ else if (newvd == oldvd)
+ expected_error = replacing ? 0 : EBUSY;
+ else if (vdev_lookup_by_path(rvd, newpath) != NULL)
+ expected_error = EBUSY;
+ else if (newsize < oldsize)
+ expected_error = EOVERFLOW;
+ else if (ashift > oldvd->vdev_top->vdev_ashift)
+ expected_error = EDOM;
+ else
+ expected_error = 0;
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * Build the nvlist describing newpath.
+ */
+ root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
+ ashift, NULL, 0, 0, 1);
+
+ /*
+ * When supported, select either a healing or a sequential resilver.
+ */
+ boolean_t rebuilding = B_FALSE;
+ if (pvd->vdev_ops == &vdev_mirror_ops ||
+ pvd->vdev_ops == &vdev_root_ops) {
+ rebuilding = !!ztest_random(2);
+ }
+
+ error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding);
+
+ nvlist_free(root);
+
+ /*
+ * If our parent was the replacing vdev, but the replace completed,
+ * then instead of failing with ENOTSUP we may either succeed,
+ * fail with ENODEV, or fail with EOVERFLOW.
+ */
+ if (expected_error == ENOTSUP &&
+ (error == 0 || error == ENODEV || error == EOVERFLOW))
+ expected_error = error;
+
+ /*
+ * If someone grew the LUN, the replacement may be too small.
+ */
+ if (error == EOVERFLOW || error == EBUSY)
+ expected_error = error;
+
+ if (error == ZFS_ERR_CHECKPOINT_EXISTS ||
+ error == ZFS_ERR_DISCARDING_CHECKPOINT ||
+ error == ZFS_ERR_RESILVER_IN_PROGRESS ||
+ error == ZFS_ERR_REBUILD_IN_PROGRESS)
+ expected_error = error;
+
+ if (error != expected_error && expected_error != EBUSY) {
+ fatal(0, "attach (%s %llu, %s %llu, %d) "
+ "returned %d, expected %d",
+ oldpath, oldsize, newpath,
+ newsize, replacing, error, expected_error);
+ }
+out:
+ mutex_exit(&ztest_vdev_lock);
+
+ umem_free(oldpath, MAXPATHLEN);
+ umem_free(newpath, MAXPATHLEN);
+}
+
+/* ARGSUSED */
+void
+ztest_device_removal(ztest_ds_t *zd, uint64_t id)
+{
+ spa_t *spa = ztest_spa;
+ vdev_t *vd;
+ uint64_t guid;
+ int error;
+
+ mutex_enter(&ztest_vdev_lock);
+
+ if (ztest_device_removal_active) {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /*
+ * Remove a random top-level vdev and wait for removal to finish.
+ */
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
+ guid = vd->vdev_guid;
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ error = spa_vdev_remove(spa, guid, B_FALSE);
+ if (error == 0) {
+ ztest_device_removal_active = B_TRUE;
+ mutex_exit(&ztest_vdev_lock);
+
+ /*
+ * spa->spa_vdev_removal is created in a sync task that
+ * is initiated via dsl_sync_task_nowait(). Since the
+ * task may not run before spa_vdev_remove() returns, we
+ * must wait at least 1 txg to ensure that the removal
+ * struct has been created.
+ */
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ while (spa->spa_removing_phys.sr_state == DSS_SCANNING)
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ } else {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /*
+ * The pool needs to be scrubbed after completing device removal.
+ * Failure to do so may result in checksum errors due to the
+ * strategy employed by ztest_fault_inject() when selecting which
+ * offsets are redundant and can be damaged.
+ */
+ error = spa_scan(spa, POOL_SCAN_SCRUB);
+ if (error == 0) {
+ while (dsl_scan_scrubbing(spa_get_dsl(spa)))
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ }
+
+ mutex_enter(&ztest_vdev_lock);
+ ztest_device_removal_active = B_FALSE;
+ mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Callback function which expands the physical size of the vdev.
+ */
+static vdev_t *
+grow_vdev(vdev_t *vd, void *arg)
+{
+ spa_t *spa __maybe_unused = vd->vdev_spa;
+ size_t *newsize = arg;
+ size_t fsize;
+ int fd;
+
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
+ return (vd);
+
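+ /* Record the current size, then grow the backing file to the new size. */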
+ fsize = lseek(fd, 0, SEEK_END);
+ VERIFY(ftruncate(fd, *newsize) == 0);
+
+ if (ztest_opts.zo_verbose >= 6) {
+ (void) printf("%s grew from %lu to %lu bytes\n",
+ vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
+ }
+ (void) close(fd);
+ return (NULL);
+}
+
+/*
+ * Callback function which expands a given vdev by calling vdev_online().
+ */
+/* ARGSUSED */
+static vdev_t *
+online_vdev(vdev_t *vd, void *arg)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *tvd = vd->vdev_top;
+ uint64_t guid = vd->vdev_guid;
+ uint64_t generation = spa->spa_config_generation + 1;
+ vdev_state_t newstate = VDEV_STATE_UNKNOWN;
+ int error;
+
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ /* Calling vdev_online will initialize the new metaslabs */
+ spa_config_exit(spa, SCL_STATE, spa);
+ error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate);
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+ /*
+ * If vdev_online returned an error or the underlying vdev_open
+ * failed then we abort the expand. The only way to know that
+ * vdev_open fails is by checking the returned newstate.
+ */
+ if (error || newstate != VDEV_STATE_HEALTHY) {
+ if (ztest_opts.zo_verbose >= 5) {
+ (void) printf("Unable to expand vdev, state %llu, "
+ "error %d\n", (u_longlong_t)newstate, error);
+ }
+ return (vd);
+ }
+ ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);
+
+ /*
+ * Since we dropped the lock we need to ensure that we're
+ * still talking to the original vdev. It's possible this
+ * vdev may have been detached/replaced while we were
+ * trying to online it.
+ */
+ if (generation != spa->spa_config_generation) {
+ if (ztest_opts.zo_verbose >= 5) {
+ (void) printf("vdev configuration has changed, "
+ "guid %llu, state %llu, expected gen %llu, "
+ "got gen %llu\n",
+ (u_longlong_t)guid,
+ (u_longlong_t)tvd->vdev_state,
+ (u_longlong_t)generation,
+ (u_longlong_t)spa->spa_config_generation);
+ }
+ return (vd);
+ }
+ return (NULL);
+}
+
+/*
+ * Traverse the vdev tree calling the supplied function.
+ * We continue to walk the tree until we either have walked all
+ * children or we receive a non-NULL return from the callback.
+ * If a NULL callback is passed, then we just return the first
+ * leaf vdev we encounter.
+ */
+static vdev_t *
+vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
+{
+ uint_t c;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ if (func == NULL)
+ return (vd);
+ else
+ return (func(vd, arg));
+ }
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
+ return (cvd);
+ }
+ return (NULL);
+}
+
+/*
+ * Verify that dynamic LUN growth works as expected.
+ */
+/* ARGSUSED */
+void
+ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
+{
+ spa_t *spa = ztest_spa;
+ vdev_t *vd, *tvd;
+ metaslab_class_t *mc;
+ metaslab_group_t *mg;
+ size_t psize, newsize;
+ uint64_t top;
+ uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
+
+ mutex_enter(&ztest_checkpoint_lock);
+ mutex_enter(&ztest_vdev_lock);
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+ /*
+ * If there is a vdev removal in progress, it could complete while
+ * we are running, in which case we would not be able to verify
+ * that the metaslab_class space increased (because it decreases
+ * when the device removal completes).
+ */
+ if (ztest_device_removal_active) {
+ spa_config_exit(spa, SCL_STATE, spa);
+ mutex_exit(&ztest_vdev_lock);
+ mutex_exit(&ztest_checkpoint_lock);
+ return;
+ }
+
+ top = ztest_random_vdev_top(spa, B_TRUE);
+
+ tvd = spa->spa_root_vdev->vdev_child[top];
+ mg = tvd->vdev_mg;
+ mc = mg->mg_class;
+ old_ms_count = tvd->vdev_ms_count;
+ old_class_space = metaslab_class_get_space(mc);
+
+ /*
+ * Determine the size of the first leaf vdev associated with
+ * our top-level device.
+ */
+ vd = vdev_walk_tree(tvd, NULL, NULL);
+ ASSERT3P(vd, !=, NULL);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ psize = vd->vdev_psize;
+
+ /*
+ * We only try to expand the vdev if it's healthy, less than 4x its
+ * original size, and it has a valid psize.
+ */
+ if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
+ psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
+ spa_config_exit(spa, SCL_STATE, spa);
+ mutex_exit(&ztest_vdev_lock);
+ mutex_exit(&ztest_checkpoint_lock);
+ return;
+ }
+ ASSERT(psize > 0);
+ newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE);
+ ASSERT3U(newsize, >, psize);
+
+ if (ztest_opts.zo_verbose >= 6) {
+ (void) printf("Expanding LUN %s from %lu to %lu\n",
+ vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
+ }
+
+ /*
+ * Growing the vdev is a two-step process:
+ * 1) expand the physical size (i.e. relabel)
+ * 2) online the vdev to create the new metaslabs
+ */
+ if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
+ vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
+ tvd->vdev_state != VDEV_STATE_HEALTHY) {
+ if (ztest_opts.zo_verbose >= 5) {
+ (void) printf("Could not expand LUN because "
+ "the vdev configuration changed.\n");
+ }
+ spa_config_exit(spa, SCL_STATE, spa);
+ mutex_exit(&ztest_vdev_lock);
+ mutex_exit(&ztest_checkpoint_lock);
+ return;
+ }
+
+ spa_config_exit(spa, SCL_STATE, spa);
+
+ /*
+ * Expanding the LUN will update the config asynchronously,
+ * thus we must wait for the async thread to complete any
+ * pending tasks before proceeding.
+ */
+ for (;;) {
+ boolean_t done;
+ mutex_enter(&spa->spa_async_lock);
+ done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
+ mutex_exit(&spa->spa_async_lock);
+ if (done)
+ break;
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ (void) poll(NULL, 0, 100);
+ }
+
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+
+ tvd = spa->spa_root_vdev->vdev_child[top];
+ new_ms_count = tvd->vdev_ms_count;
+ new_class_space = metaslab_class_get_space(mc);
+
+ if (tvd->vdev_mg != mg || mg->mg_class != mc) {
+ if (ztest_opts.zo_verbose >= 5) {
+ (void) printf("Could not verify LUN expansion due to "
+ "intervening vdev offline or remove.\n");
+ }
+ spa_config_exit(spa, SCL_STATE, spa);
+ mutex_exit(&ztest_vdev_lock);
+ mutex_exit(&ztest_checkpoint_lock);
+ return;
+ }
+
+ /*
+ * Make sure we were able to grow the vdev.
+ */
+ if (new_ms_count <= old_ms_count) {
+ fatal(0, "LUN expansion failed: ms_count %llu < %llu\n",
+ old_ms_count, new_ms_count);
+ }
+
+ /*
+ * Make sure we were able to grow the pool.
+ */
+ if (new_class_space <= old_class_space) {
+ fatal(0, "LUN expansion failed: class_space %llu < %llu\n",
+ old_class_space, new_class_space);
+ }
+
+ if (ztest_opts.zo_verbose >= 5) {
+ char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ];
+
+ nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf));
+ nicenum(new_class_space, newnumbuf, sizeof (newnumbuf));
+ (void) printf("%s grew from %s to %s\n",
+ spa->spa_name, oldnumbuf, newnumbuf);
+ }
+
+ spa_config_exit(spa, SCL_STATE, spa);
+ mutex_exit(&ztest_vdev_lock);
+ mutex_exit(&ztest_checkpoint_lock);
+}
+
+/*
+ * Verify that dmu_objset_{create,destroy,open,close} work as expected.
+ */
+/* ARGSUSED */
+static void
+ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+ /*
+ * Create the objects common to all ztest datasets.
+ */
+ VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
+ DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
+}
+
+static int
+ztest_dataset_create(char *dsname)
+{
+ int err;
+ uint64_t rand;
+ dsl_crypto_params_t *dcp = NULL;
+
+ /*
+ * 50% of the time, we create encrypted datasets
+ * using a random cipher suite and a hard-coded
+ * wrapping key.
+ */
+ rand = ztest_random(2);
+ if (rand != 0) {
+ nvlist_t *crypto_args = fnvlist_alloc();
+ nvlist_t *props = fnvlist_alloc();
+
+ /* slight bias towards the default cipher suite */
+ rand = ztest_random(ZIO_CRYPT_FUNCTIONS);
+ if (rand < ZIO_CRYPT_AES_128_CCM)
+ rand = ZIO_CRYPT_ON;
+
+ fnvlist_add_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand);
+ fnvlist_add_uint8_array(crypto_args, "wkeydata",
+ (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN);
+
+ /*
+ * These parameters aren't really used by the kernel. They
+ * are simply stored so that userspace knows how to load
+ * the wrapping key.
+ */
+ fnvlist_add_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW);
+ fnvlist_add_string(props,
+ zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt");
+ fnvlist_add_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL);
+ fnvlist_add_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL);
+
+ VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props,
+ crypto_args, &dcp));
+
+ /*
+ * Cycle through all available encryption implementations
+ * to verify interoperability.
+ */
+ VERIFY0(gcm_impl_set("cycle"));
+ VERIFY0(aes_impl_set("cycle"));
+
+ fnvlist_free(crypto_args);
+ fnvlist_free(props);
+ }
+
+ err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp,
+ ztest_objset_create_cb, NULL);
+ dsl_crypto_params_free(dcp, !!err);
+
+ rand = ztest_random(100);
+ if (err || rand < 80)
+ return (err);
+
+ if (ztest_opts.zo_verbose >= 5)
+ (void) printf("Setting dataset %s to sync always\n", dsname);
+ return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
+ ZFS_SYNC_ALWAYS, B_FALSE));
+}
+
+/* ARGSUSED */
+static int
+ztest_objset_destroy_cb(const char *name, void *arg)
+{
+ objset_t *os;
+ dmu_object_info_t doi;
+ int error;
+
+ /*
+ * Verify that the dataset contains a directory object.
+ */
+ VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
+ B_TRUE, FTAG, &os));
+ error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
+ if (error != ENOENT) {
+ /* We could have crashed in the middle of destroying it */
+ ASSERT0(error);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
+ ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
+ }
+ dmu_objset_disown(os, B_TRUE, FTAG);
+
+ /*
+ * Destroy the dataset.
+ */
+ if (strchr(name, '@') != NULL) {
+ VERIFY0(dsl_destroy_snapshot(name, B_TRUE));
+ } else {
+ error = dsl_destroy_head(name);
+ if (error == ENOSPC) {
+ /* There could be a checkpoint or insufficient slop */
+ ztest_record_enospc(FTAG);
+ } else if (error != EBUSY) {
+ /* There could be a hold on this dataset */
+ ASSERT0(error);
+ }
+ }
+ return (0);
+}
+
+static boolean_t
+ztest_snapshot_create(char *osname, uint64_t id)
+{
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ int error;
+
+ (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id);
+
+ error = dmu_objset_snapshot_one(osname, snapname);
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ return (B_FALSE);
+ }
+ if (error != 0 && error != EEXIST) {
+ fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname,
+ snapname, error);
+ }
+ return (B_TRUE);
+}
+
+static boolean_t
+ztest_snapshot_destroy(char *osname, uint64_t id)
+{
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ int error;
+
+ (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname,
+ (u_longlong_t)id);
+
+ error = dsl_destroy_snapshot(snapname, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
+ return (B_TRUE);
+}
+
+/* ARGSUSED */
+void
+ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_ds_t *zdtmp;
+ int iters;
+ int error;
+ objset_t *os, *os2;
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ zilog_t *zilog;
+ int i;
+
+ zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL);
+
+ (void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+ (void) snprintf(name, sizeof (name), "%s/temp_%llu",
+ ztest_opts.zo_pool, (u_longlong_t)id);
+
+ /*
+ * If this dataset exists from a previous run, process its replay log
+ * half of the time. If we don't replay it, then dsl_destroy_head()
+ * (invoked from ztest_objset_destroy_cb()) should just throw it away.
+ */
+ if (ztest_random(2) == 0 &&
+ ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE,
+ B_TRUE, FTAG, &os) == 0) {
+ ztest_zd_init(zdtmp, NULL, os);
+ zil_replay(os, zdtmp, ztest_replay_vector);
+ ztest_zd_fini(zdtmp);
+ dmu_objset_disown(os, B_TRUE, FTAG);
+ }
+
+ /*
+ * There may be an old instance of the dataset we're about to
+ * create lying around from a previous run. If so, destroy it
+ * and all of its snapshots.
+ */
+ (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+ DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+
+ /*
+ * Verify that the destroyed dataset is no longer in the namespace.
+ */
+ VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
+ B_TRUE, FTAG, &os));
+
+ /*
+ * Verify that we can create a new dataset.
+ */
+ error = ztest_dataset_create(name);
+ if (error) {
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ goto out;
+ }
+ fatal(0, "dmu_objset_create(%s) = %d", name, error);
+ }
+
+ VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE,
+ FTAG, &os));
+
+ ztest_zd_init(zdtmp, NULL, os);
+
+ /*
+ * Open the intent log for it.
+ */
+ zilog = zil_open(os, ztest_get_data);
+
+ /*
+ * Put some objects in there, do a little I/O to them,
+ * and randomly take a couple of snapshots along the way.
+ */
+ iters = ztest_random(5);
+ for (i = 0; i < iters; i++) {
+ ztest_dmu_object_alloc_free(zdtmp, id);
+ if (ztest_random(iters) == 0)
+ (void) ztest_snapshot_create(name, i);
+ }
+
+ /*
+ * Verify that we cannot create an existing dataset.
+ */
+ VERIFY3U(EEXIST, ==,
+ dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL));
+
+ /*
+ * Verify that we can hold an objset that is also owned.
+ */
+ VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
+ dmu_objset_rele(os2, FTAG);
+
+ /*
+ * Verify that we cannot own an objset that is already owned.
+ */
+ VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER,
+ B_FALSE, B_TRUE, FTAG, &os2));
+
+ zil_close(zilog);
+ dmu_objset_disown(os, B_TRUE, FTAG);
+ ztest_zd_fini(zdtmp);
+out:
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+
+ umem_free(zdtmp, sizeof (ztest_ds_t));
+}
+
+/*
+ * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
+ */
+void
+ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+ (void) pthread_rwlock_rdlock(&ztest_name_lock);
+ (void) ztest_snapshot_destroy(zd->zd_name, id);
+ (void) ztest_snapshot_create(zd->zd_name, id);
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+}
+
+/*
+ * Cleanup non-standard snapshots and clones.
+ */
+static void
+ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
+{
+ char *snap1name;
+ char *clone1name;
+ char *snap2name;
+ char *clone2name;
+ char *snap3name;
+ int error;
+
+ snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+ clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+ snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+ clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+ snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+
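+ /* Regenerate the snapshot and clone names used by the promote test. */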
+ (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN,
+ "%s@s1_%llu", osname, (u_longlong_t)id);
+ (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN,
+ "%s/c1_%llu", osname, (u_longlong_t)id);
+ (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN,
+ "%s@s2_%llu", clone1name, (u_longlong_t)id);
+ (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN,
+ "%s/c2_%llu", osname, (u_longlong_t)id);
+ (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN,
+ "%s@s3_%llu", clone1name, (u_longlong_t)id);
+
+ error = dsl_destroy_head(clone2name);
+ if (error && error != ENOENT)
+ fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error);
+ error = dsl_destroy_snapshot(snap3name, B_FALSE);
+ if (error && error != ENOENT)
+ fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error);
+ error = dsl_destroy_snapshot(snap2name, B_FALSE);
+ if (error && error != ENOENT)
+ fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error);
+ error = dsl_destroy_head(clone1name);
+ if (error && error != ENOENT)
+ fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error);
+ error = dsl_destroy_snapshot(snap1name, B_FALSE);
+ if (error && error != ENOENT)
+ fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error);
+
+ umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN);
+ umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN);
+ umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN);
+ umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN);
+ umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN);
+}
+
+/*
+ * Verify dsl_dataset_promote handles EBUSY
+ */
+void
+ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os;
+ char *snap1name;
+ char *clone1name;
+ char *snap2name;
+ char *clone2name;
+ char *snap3name;
+ char *osname = zd->zd_name;
+ int error;
+
+ snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+ clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+ snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+ clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+ snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL);
+
+ (void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+ ztest_dsl_dataset_cleanup(osname, id);
+
+ (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN,
+ "%s@s1_%llu", osname, (u_longlong_t)id);
+ (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN,
+ "%s/c1_%llu", osname, (u_longlong_t)id);
+ (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN,
+ "%s@s2_%llu", clone1name, (u_longlong_t)id);
+ (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN,
+ "%s/c2_%llu", osname, (u_longlong_t)id);
+ (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN,
+ "%s@s3_%llu", clone1name, (u_longlong_t)id);
+
+ error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1);
+ if (error && error != EEXIST) {
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ goto out;
+ }
+ fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
+ }
+
+ error = dmu_objset_clone(clone1name, snap1name);
+ if (error) {
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ goto out;
+ }
+ fatal(0, "dmu_objset_create(%s) = %d", clone1name, error);
+ }
+
+ error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1);
+ if (error && error != EEXIST) {
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ goto out;
+ }
+ fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
+ }
+
+ error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1);
+ if (error && error != EEXIST) {
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ goto out;
+ }
+ fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
+ }
+
+ error = dmu_objset_clone(clone2name, snap3name);
+ if (error) {
+ if (error == ENOSPC) {
+ ztest_record_enospc(FTAG);
+ goto out;
+ }
+ fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
+ }
+
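+ /*
+ * Keep snap2 owned across the promote so that dsl_dataset_promote()
+ * is expected to fail with EBUSY.
+ */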
+ error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE,
+ FTAG, &os);
+ if (error)
+ fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
+ error = dsl_dataset_promote(clone2name, NULL);
+ if (error == ENOSPC) {
+ dmu_objset_disown(os, B_TRUE, FTAG);
+ ztest_record_enospc(FTAG);
+ goto out;
+ }
+ if (error != EBUSY)
+ fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
+ error);
+ dmu_objset_disown(os, B_TRUE, FTAG);
+
+out:
+ ztest_dsl_dataset_cleanup(osname, id);
+
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+
+ umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN);
+ umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN);
+ umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN);
+ umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN);
+ umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN);
+}
+
+#undef OD_ARRAY_SIZE
+#define OD_ARRAY_SIZE 4
+
+/*
+ * Verify that dmu_object_{alloc,free} work as expected.
+ */
+void
+ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_od_t *od;
+ int batchsize;
+ int size;
+ int b;
+
+ size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
+ od = umem_alloc(size, UMEM_NOFAIL);
+ batchsize = OD_ARRAY_SIZE;
+
+ for (b = 0; b < batchsize; b++)
+ ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER,
+ 0, 0, 0);
+
+ /*
+ * Destroy the previous batch of objects, create a new batch,
+ * and do some I/O on the new objects.
+ */
+ if (ztest_object_init(zd, od, size, B_TRUE) != 0)
+ return;
+
+ while (ztest_random(4 * batchsize) != 0)
+ ztest_io(zd, od[ztest_random(batchsize)].od_object,
+ ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+
+ umem_free(od, size);
+}
+
+/*
+ * Rewind the global allocator to verify object allocation backfilling.
+ */
+void
+ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os = zd->zd_os;
+ int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+ uint64_t object;
+
+ /*
+ * Rewind the global allocator randomly back to a lower object number
+ * to force backfilling and reclamation of recently freed dnodes.
+ */
+ mutex_enter(&os->os_obj_lock);
+ object = ztest_random(os->os_obj_next_chunk);
+ os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk);
+ mutex_exit(&os->os_obj_lock);
+}
+
+#undef OD_ARRAY_SIZE
+#define OD_ARRAY_SIZE 2
+
+/*
+ * Verify that dmu_{read,write} work as expected.
+ */
+void
+ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
+{
+ int size;
+ ztest_od_t *od;
+
+ objset_t *os = zd->zd_os;
+ size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
+ od = umem_alloc(size, UMEM_NOFAIL);
+ dmu_tx_t *tx;
+ int i, freeit, error;
+ uint64_t n, s, txg;
+ bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
+ uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+ uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
+ uint64_t regions = 997;
+ uint64_t stride = 123456789ULL;
+ uint64_t width = 40;
+ int free_percent = 5;
+
+ /*
+ * This test uses two objects, packobj and bigobj, that are always
+ * updated together (i.e. in the same tx) so that their contents are
+ * in sync and can be compared. Their contents relate to each other
+ * in a simple way: packobj is a dense array of 'bufwad' structures,
+ * while bigobj is a sparse array of the same bufwads. Specifically,
+ * for any index n, there are three bufwads that should be identical:
+ *
+ * packobj, at offset n * sizeof (bufwad_t)
+ * bigobj, at the head of the nth chunk
+ * bigobj, at the tail of the nth chunk
+ *
+ * The chunk size is arbitrary. It doesn't have to be a power of two,
+ * and it doesn't have any relation to the object blocksize.
+ * The only requirement is that it can hold at least two bufwads.
+ *
+ * Normally, we write the bufwad to each of these locations.
+ * However, free_percent of the time we instead write zeroes to
+ * packobj and perform a dmu_free_range() on bigobj. By comparing
+ * bigobj to packobj, we can verify that the DMU is correctly
+ * tracking which parts of an object are allocated and free,
+ * and that the contents of the allocated blocks are correct.
+ */
+
+ /*
+ * Read the directory info. If it's the first time, set things up.
+ */
+ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize);
+ ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
+ chunksize);
+
+ if (ztest_object_init(zd, od, size, B_FALSE) != 0) {
+ umem_free(od, size);
+ return;
+ }
+
+ bigobj = od[0].od_object;
+ packobj = od[1].od_object;
+ chunksize = od[0].od_gen;
+ ASSERT(chunksize == od[1].od_gen);
+
+ /*
+ * Prefetch a random chunk of the big object.
+ * Our aim here is to get some async reads in flight
+ * for blocks that we may free below; the DMU should
+ * handle this race correctly.
+ */
+ n = ztest_random(regions) * stride + ztest_random(width);
+ s = 1 + ztest_random(2 * width - 1);
+ dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize,
+ ZIO_PRIORITY_SYNC_READ);
+
+ /*
+ * Pick a random index and compute the offsets into packobj and bigobj.
+ */
+ n = ztest_random(regions) * stride + ztest_random(width);
+ s = 1 + ztest_random(width - 1);
+
+ packoff = n * sizeof (bufwad_t);
+ packsize = s * sizeof (bufwad_t);
+
+ bigoff = n * chunksize;
+ bigsize = s * chunksize;
+
+ packbuf = umem_alloc(packsize, UMEM_NOFAIL);
+ bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
+
+ /*
+ * free_percent of the time, free a range of bigobj rather than
+ * overwriting it.
+ */
+ freeit = (ztest_random(100) < free_percent);
+
+ /*
+ * Read the current contents of our objects.
+ */
+ error = dmu_read(os, packobj, packoff, packsize, packbuf,
+ DMU_READ_PREFETCH);
+ ASSERT0(error);
+ error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
+ DMU_READ_PREFETCH);
+ ASSERT0(error);
+
+ /*
+ * Get a tx for the mods to both packobj and bigobj.
+ */
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_write(tx, packobj, packoff, packsize);
+
+ if (freeit)
+ dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
+ else
+ dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
+
+ /* This accounts for setting the checksum/compression. */
+ dmu_tx_hold_bonus(tx, bigobj);
+
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0) {
+ umem_free(packbuf, packsize);
+ umem_free(bigbuf, bigsize);
+ umem_free(od, size);
+ return;
+ }
+
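+ /*
+ * Randomly vary the checksum and compression algorithms used for
+ * bigobj, limited to values the object-level setters accept.
+ */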
+ enum zio_checksum cksum;
+ do {
+ cksum = (enum zio_checksum)
+ ztest_random_dsl_prop(ZFS_PROP_CHECKSUM);
+ } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+ dmu_object_set_checksum(os, bigobj, cksum, tx);
+
+ enum zio_compress comp;
+ do {
+ comp = (enum zio_compress)
+ ztest_random_dsl_prop(ZFS_PROP_COMPRESSION);
+ } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS);
+ dmu_object_set_compress(os, bigobj, comp, tx);
+
+ /*
+ * For each index from n to n + s, verify that the existing bufwad
+ * in packobj matches the bufwads at the head and tail of the
+ * corresponding chunk in bigobj. Then update all three bufwads
+ * with the new values we want to write out.
+ */
+ for (i = 0; i < s; i++) {
+ /* LINTED */
+ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
+ /* LINTED */
+ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
+ /* LINTED */
+ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
+
+ ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
+ ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
+
+ if (pack->bw_txg > txg)
+ fatal(0, "future leak: got %llx, open txg is %llx",
+ pack->bw_txg, txg);
+
+ if (pack->bw_data != 0 && pack->bw_index != n + i)
+ fatal(0, "wrong index: got %llx, wanted %llx+%llx",
+ pack->bw_index, n, i);
+
+ if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
+ fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
+
+ if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
+ fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
+
+ if (freeit) {
+ bzero(pack, sizeof (bufwad_t));
+ } else {
+ pack->bw_index = n + i;
+ pack->bw_txg = txg;
+ pack->bw_data = 1 + ztest_random(-2ULL);
+ }
+ *bigH = *pack;
+ *bigT = *pack;
+ }
+
+ /*
+ * We've verified all the old bufwads, and made new ones.
+ * Now write them out.
+ */
+ dmu_write(os, packobj, packoff, packsize, packbuf, tx);
+
+ if (freeit) {
+ if (ztest_opts.zo_verbose >= 7) {
+ (void) printf("freeing offset %llx size %llx"
+ " txg %llx\n",
+ (u_longlong_t)bigoff,
+ (u_longlong_t)bigsize,
+ (u_longlong_t)txg);
+ }
+ VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
+ } else {
+ if (ztest_opts.zo_verbose >= 7) {
+ (void) printf("writing offset %llx size %llx"
+ " txg %llx\n",
+ (u_longlong_t)bigoff,
+ (u_longlong_t)bigsize,
+ (u_longlong_t)txg);
+ }
+ dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
+ }
+
+ dmu_tx_commit(tx);
+
+ /*
+ * Sanity check the stuff we just wrote.
+ */
+ {
+ void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
+ void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
+
+ VERIFY(0 == dmu_read(os, packobj, packoff,
+ packsize, packcheck, DMU_READ_PREFETCH));
+ VERIFY(0 == dmu_read(os, bigobj, bigoff,
+ bigsize, bigcheck, DMU_READ_PREFETCH));
+
+ ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
+ ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
+
+ umem_free(packcheck, packsize);
+ umem_free(bigcheck, bigsize);
+ }
+
+ umem_free(packbuf, packsize);
+ umem_free(bigbuf, bigsize);
+ umem_free(od, size);
+}
+
+static void
+compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
+ uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
+{
+ uint64_t i;
+ bufwad_t *pack;
+ bufwad_t *bigH;
+ bufwad_t *bigT;
+
+ /*
+ * For each index from n to n + s, verify that the existing bufwad
+ * in packobj matches the bufwads at the head and tail of the
+ * corresponding chunk in bigobj. Then update all three bufwads
+ * with the new values we want to write out.
+ */
+ for (i = 0; i < s; i++) {
+ /* LINTED */
+ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
+ /* LINTED */
+ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
+ /* LINTED */
+ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
+
+ ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
+ ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
+
+ if (pack->bw_txg > txg)
+ fatal(0, "future leak: got %llx, open txg is %llx",
+ pack->bw_txg, txg);
+
+ if (pack->bw_data != 0 && pack->bw_index != n + i)
+ fatal(0, "wrong index: got %llx, wanted %llx+%llx",
+ pack->bw_index, n, i);
+
+ if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
+ fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
+
+ if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
+ fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
+
+ pack->bw_index = n + i;
+ pack->bw_txg = txg;
+ pack->bw_data = 1 + ztest_random(-2ULL);
+
+ *bigH = *pack;
+ *bigT = *pack;
+ }
+}
+
+#undef OD_ARRAY_SIZE
+#define OD_ARRAY_SIZE 2
+
+void
+ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os = zd->zd_os;
+ ztest_od_t *od;
+ dmu_tx_t *tx;
+ uint64_t i;
+ int error;
+ int size;
+ uint64_t n, s, txg;
+ bufwad_t *packbuf, *bigbuf;
+ uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
+ uint64_t blocksize = ztest_random_blocksize();
+ uint64_t chunksize = blocksize;
+ uint64_t regions = 997;
+ uint64_t stride = 123456789ULL;
+ uint64_t width = 9;
+ dmu_buf_t *bonus_db;
+ arc_buf_t **bigbuf_arcbufs;
+ dmu_object_info_t doi;
+
+ size = sizeof (ztest_od_t) * OD_ARRAY_SIZE;
+ od = umem_alloc(size, UMEM_NOFAIL);
+
+ /*
+ * This test uses two objects, packobj and bigobj, that are always
+ * updated together (i.e. in the same tx) so that their contents are
+ * in sync and can be compared. Their contents relate to each other
+ * in a simple way: packobj is a dense array of 'bufwad' structures,
+ * while bigobj is a sparse array of the same bufwads. Specifically,
+ * for any index n, there are three bufwads that should be identical:
+ *
+ * packobj, at offset n * sizeof (bufwad_t)
+ * bigobj, at the head of the nth chunk
+ * bigobj, at the tail of the nth chunk
+ *
+ * The chunk size is set equal to bigobj block size so that
+ * dmu_assign_arcbuf_by_dbuf() can be tested for object updates.
+ */
+
+ /*
+ * Read the directory info. If it's the first time, set things up.
+ */
+ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0);
+ ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0,
+ chunksize);
+
+ if (ztest_object_init(zd, od, size, B_FALSE) != 0) {
+ umem_free(od, size);
+ return;
+ }
+
+ bigobj = od[0].od_object;
+ packobj = od[1].od_object;
+ blocksize = od[0].od_blocksize;
+ chunksize = blocksize;
+ ASSERT(chunksize == od[1].od_gen);
+
+ VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
+ VERIFY(ISP2(doi.doi_data_block_size));
+ VERIFY(chunksize == doi.doi_data_block_size);
+ VERIFY(chunksize >= 2 * sizeof (bufwad_t));
+
+ /*
+ * Pick a random index and compute the offsets into packobj and bigobj.
+ */
+ n = ztest_random(regions) * stride + ztest_random(width);
+ s = 1 + ztest_random(width - 1);
+
+ packoff = n * sizeof (bufwad_t);
+ packsize = s * sizeof (bufwad_t);
+
+ bigoff = n * chunksize;
+ bigsize = s * chunksize;
+
+ packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
+ bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
+
+ bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
+
+ /*
+ * Iteration 0 tests zcopy for DB_UNCACHED dbufs.
+ * Iteration 1 tests zcopy to already referenced dbufs.
+ * Iteration 2 tests zcopy to a dirty dbuf in the same txg.
+ * Iteration 3 tests zcopy to a dbuf dirtied in a previous txg.
+ * Iteration 4 tests zcopy when the dbuf is no longer dirty.
+ * Iteration 5 tests zcopy when it can't be done.
+ * Iteration 6 performs one more zcopy write.
+ */
+ for (i = 0; i < 7; i++) {
+ uint64_t j;
+ uint64_t off;
+
+ /*
+ * In iteration 5 (i == 5) use arcbufs
+ * that don't match bigobj blksz to test
+ * dmu_assign_arcbuf_by_dbuf() when it can't directly
+ * assign an arcbuf to a dbuf.
+ */
+ for (j = 0; j < s; j++) {
+ if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
+ bigbuf_arcbufs[j] =
+ dmu_request_arcbuf(bonus_db, chunksize);
+ } else {
+ bigbuf_arcbufs[2 * j] =
+ dmu_request_arcbuf(bonus_db, chunksize / 2);
+ bigbuf_arcbufs[2 * j + 1] =
+ dmu_request_arcbuf(bonus_db, chunksize / 2);
+ }
+ }
+
+ /*
+ * Get a tx for the mods to both packobj and bigobj.
+ */
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_write(tx, packobj, packoff, packsize);
+ dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
+
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0) {
+ umem_free(packbuf, packsize);
+ umem_free(bigbuf, bigsize);
+ for (j = 0; j < s; j++) {
+ if (i != 5 ||
+ chunksize < (SPA_MINBLOCKSIZE * 2)) {
+ dmu_return_arcbuf(bigbuf_arcbufs[j]);
+ } else {
+ dmu_return_arcbuf(
+ bigbuf_arcbufs[2 * j]);
+ dmu_return_arcbuf(
+ bigbuf_arcbufs[2 * j + 1]);
+ }
+ }
+ umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
+ umem_free(od, size);
+ dmu_buf_rele(bonus_db, FTAG);
+ return;
+ }
+
+ /*
+ * 50% of the time don't read objects in the 1st iteration to
+ * test dmu_assign_arcbuf_by_dbuf() for the case when there are
+ * no existing dbufs for the specified offsets.
+ */
+ if (i != 0 || ztest_random(2) != 0) {
+ error = dmu_read(os, packobj, packoff,
+ packsize, packbuf, DMU_READ_PREFETCH);
+ ASSERT0(error);
+ error = dmu_read(os, bigobj, bigoff, bigsize,
+ bigbuf, DMU_READ_PREFETCH);
+ ASSERT0(error);
+ }
+ compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
+ n, chunksize, txg);
+
+ /*
+ * We've verified all the old bufwads, and made new ones.
+ * Now write them out.
+ */
+ dmu_write(os, packobj, packoff, packsize, packbuf, tx);
+ if (ztest_opts.zo_verbose >= 7) {
+ (void) printf("writing offset %llx size %llx"
+ " txg %llx\n",
+ (u_longlong_t)bigoff,
+ (u_longlong_t)bigsize,
+ (u_longlong_t)txg);
+ }
+ for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
+ dmu_buf_t *dbt;
+ if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
+ bcopy((caddr_t)bigbuf + (off - bigoff),
+ bigbuf_arcbufs[j]->b_data, chunksize);
+ } else {
+ bcopy((caddr_t)bigbuf + (off - bigoff),
+ bigbuf_arcbufs[2 * j]->b_data,
+ chunksize / 2);
+ bcopy((caddr_t)bigbuf + (off - bigoff) +
+ chunksize / 2,
+ bigbuf_arcbufs[2 * j + 1]->b_data,
+ chunksize / 2);
+ }
+
+ if (i == 1) {
+ VERIFY(dmu_buf_hold(os, bigobj, off,
+ FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
+ }
+ if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
+ VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
+ off, bigbuf_arcbufs[j], tx));
+ } else {
+ VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
+ off, bigbuf_arcbufs[2 * j], tx));
+ VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db,
+ off + chunksize / 2,
+ bigbuf_arcbufs[2 * j + 1], tx));
+ }
+ if (i == 1) {
+ dmu_buf_rele(dbt, FTAG);
+ }
+ }
+ dmu_tx_commit(tx);
+
+ /*
+ * Sanity check the stuff we just wrote.
+ */
+ {
+ void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
+ void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
+
+ VERIFY(0 == dmu_read(os, packobj, packoff,
+ packsize, packcheck, DMU_READ_PREFETCH));
+ VERIFY(0 == dmu_read(os, bigobj, bigoff,
+ bigsize, bigcheck, DMU_READ_PREFETCH));
+
+ ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
+ ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
+
+ umem_free(packcheck, packsize);
+ umem_free(bigcheck, bigsize);
+ }
+ if (i == 2) {
+ txg_wait_open(dmu_objset_pool(os), 0, B_TRUE);
+ } else if (i == 3) {
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ }
+ }
+
+ dmu_buf_rele(bonus_db, FTAG);
+ umem_free(packbuf, packsize);
+ umem_free(bigbuf, bigsize);
+ umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
+ umem_free(od, size);
+}
+
+/* ARGSUSED */
+void
+ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_od_t *od;
+
+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+ uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
+ (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+
+ /*
+ * Have multiple threads write to large offsets in an object
+ * to verify that parallel writes to an object -- even to the
+ * same blocks within the object -- don't cause any trouble.
+ */
+ ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0)
+ return;
+
+ while (ztest_random(10) != 0)
+ ztest_io(zd, od->od_object, offset);
+
+ umem_free(od, sizeof (ztest_od_t));
+}
+
+void
+ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_od_t *od;
+ uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
+ (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+ uint64_t count = ztest_random(20) + 1;
+ uint64_t blocksize = ztest_random_blocksize();
+ void *data;
+
+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+
+ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (ztest_od_t),
+ !ztest_random(2)) != 0) {
+ umem_free(od, sizeof (ztest_od_t));
+ return;
+ }
+
+ if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) {
+ umem_free(od, sizeof (ztest_od_t));
+ return;
+ }
+
+ ztest_prealloc(zd, od->od_object, offset, count * blocksize);
+
+ data = umem_zalloc(blocksize, UMEM_NOFAIL);
+
+ while (ztest_random(count) != 0) {
+ uint64_t randoff = offset + (ztest_random(count) * blocksize);
+ if (ztest_write(zd, od->od_object, randoff, blocksize,
+ data) != 0)
+ break;
+ while (ztest_random(4) != 0)
+ ztest_io(zd, od->od_object, randoff);
+ }
+
+ umem_free(data, blocksize);
+ umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Verify that zap_{create,destroy,add,remove,update} work as expected.
+ */
+#define ZTEST_ZAP_MIN_INTS 1
+#define ZTEST_ZAP_MAX_INTS 4
+#define ZTEST_ZAP_MAX_PROPS 1000
+
+void
+ztest_zap(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os = zd->zd_os;
+ ztest_od_t *od;
+ uint64_t object;
+ uint64_t txg, last_txg;
+ uint64_t value[ZTEST_ZAP_MAX_INTS];
+ uint64_t zl_ints, zl_intsize, prop;
+ int i, ints;
+ dmu_tx_t *tx;
+ char propname[100], txgname[100];
+ int error;
+ char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
+
+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+ ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (ztest_od_t),
+ !ztest_random(2)) != 0)
+ goto out;
+
+ object = od->od_object;
+
+ /*
+ * Generate a known hash collision, and verify that
+ * we can lookup and remove both entries.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
+ goto out;
+ for (i = 0; i < 2; i++) {
+ value[i] = i;
+ VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
+ 1, &value[i], tx));
+ }
+ for (i = 0; i < 2; i++) {
+ VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
+ sizeof (uint64_t), 1, &value[i], tx));
+ VERIFY3U(0, ==,
+ zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
+ ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+ ASSERT3U(zl_ints, ==, 1);
+ }
+ for (i = 0; i < 2; i++) {
+ VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
+ }
+ dmu_tx_commit(tx);
+
+ /*
+ * Generate a bunch of random entries.
+ */
+ ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
+
+ prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+ (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+ (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
+ bzero(value, sizeof (value));
+ last_txg = 0;
+
+ /*
+ * If these zap entries already exist, validate their contents.
+ */
+ error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+ if (error == 0) {
+ ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+ ASSERT3U(zl_ints, ==, 1);
+
+ VERIFY(zap_lookup(os, object, txgname, zl_intsize,
+ zl_ints, &last_txg) == 0);
+
+ VERIFY(zap_length(os, object, propname, &zl_intsize,
+ &zl_ints) == 0);
+
+ ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
+ ASSERT3U(zl_ints, ==, ints);
+
+ VERIFY(zap_lookup(os, object, propname, zl_intsize,
+ zl_ints, value) == 0);
+
+ for (i = 0; i < ints; i++) {
+ ASSERT3U(value[i], ==, last_txg + object + i);
+ }
+ } else {
+ ASSERT3U(error, ==, ENOENT);
+ }
+
+ /*
+ * Atomically update two entries in our zap object.
+ * The first is named txg_%llu, and contains the txg
+ * in which the property was last updated. The second
+ * is named prop_%llu, and the nth element of its value
+ * should be txg + object + n.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
+ goto out;
+
+ if (last_txg > txg)
+ fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
+
+ for (i = 0; i < ints; i++)
+ value[i] = txg + object + i;
+
+ VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
+ 1, &txg, tx));
+ VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
+ ints, value, tx));
+
+ dmu_tx_commit(tx);
+
+ /*
+ * Remove a random pair of entries.
+ */
+ prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
+ (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+ (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
+
+ error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
+
+ if (error == ENOENT)
+ goto out;
+
+ ASSERT0(error);
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
+ goto out;
+ VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
+ VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
+ dmu_tx_commit(tx);
+out:
+ umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Test case to test the upgrading of a microzap to fatzap.
+ */
+void
+ztest_fzap(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os = zd->zd_os;
+ ztest_od_t *od;
+ uint64_t object, txg;
+ int i;
+
+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+ ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (ztest_od_t),
+ !ztest_random(2)) != 0)
+ goto out;
+ object = od->od_object;
+
+ /*
+ * Add entries to this ZAP and make sure it spills over
+ * and gets upgraded to a fatzap. Also, since we are adding
+ * 2050 entries we should see ptrtbl growth and leaf-block splits.
+ */
+ for (i = 0; i < 2050; i++) {
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t value = i;
+ dmu_tx_t *tx;
+ int error;
+
+ (void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
+ (u_longlong_t)id, (u_longlong_t)value);
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, B_TRUE, name);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0)
+ goto out;
+ error = zap_add(os, object, name, sizeof (uint64_t), 1,
+ &value, tx);
+ ASSERT(error == 0 || error == EEXIST);
+ dmu_tx_commit(tx);
+ }
+out:
+ umem_free(od, sizeof (ztest_od_t));
+}
+
+/* ARGSUSED */
+void
+ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os = zd->zd_os;
+ ztest_od_t *od;
+ uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
+ dmu_tx_t *tx;
+ int i, namelen, error;
+ int micro = ztest_random(2);
+ char name[20], string_value[20];
+ void *data;
+
+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+ ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
+ umem_free(od, sizeof (ztest_od_t));
+ return;
+ }
+
+ object = od->od_object;
+
+ /*
+ * Generate a random name of the form 'xxx.....' where each
+ * x is a random printable character and the dots are dots.
+ * There are 94 such characters, and the name length goes from
+ * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
+ */
+ namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
+
+ for (i = 0; i < 3; i++)
+ name[i] = '!' + ztest_random('~' - '!' + 1);
+ for (; i < namelen - 1; i++)
+ name[i] = '.';
+ name[i] = '\0';
+
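+ /*
+ * If the name length is odd, or this is a microzap object, store
+ * the txg as a single 8-byte integer; otherwise store the name
+ * itself as namelen single-byte values.
+ */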
+ if ((namelen & 1) || micro) {
+ wsize = sizeof (txg);
+ wc = 1;
+ data = &txg;
+ } else {
+ wsize = 1;
+ wc = namelen;
+ data = string_value;
+ }
+
+ count = -1ULL;
+ VERIFY0(zap_count(os, object, &count));
+ ASSERT(count != -1ULL);
+
+ /*
+ * Select an operation: length, lookup, add, update, remove.
+ */
+ i = ztest_random(5);
+
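+ /*
+ * Operations 2-4 (add, update, remove) modify the ZAP and therefore
+ * need a transaction; length and lookup do not.
+ */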
+ if (i >= 2) {
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
+ txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
+ if (txg == 0) {
+ umem_free(od, sizeof (ztest_od_t));
+ return;
+ }
+ bcopy(name, string_value, namelen);
+ } else {
+ tx = NULL;
+ txg = 0;
+ bzero(string_value, namelen);
+ }
+
+ switch (i) {
+
+ case 0:
+ error = zap_length(os, object, name, &zl_wsize, &zl_wc);
+ if (error == 0) {
+ ASSERT3U(wsize, ==, zl_wsize);
+ ASSERT3U(wc, ==, zl_wc);
+ } else {
+ ASSERT3U(error, ==, ENOENT);
+ }
+ break;
+
+ case 1:
+ error = zap_lookup(os, object, name, wsize, wc, data);
+ if (error == 0) {
+ if (data == string_value &&
+ bcmp(name, data, namelen) != 0)
+ fatal(0, "name '%s' != val '%s' len %d",
+ name, data, namelen);
+ } else {
+ ASSERT3U(error, ==, ENOENT);
+ }
+ break;
+
+ case 2:
+ error = zap_add(os, object, name, wsize, wc, data, tx);
+ ASSERT(error == 0 || error == EEXIST);
+ break;
+
+ case 3:
+ VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
+ break;
+
+ case 4:
+ error = zap_remove(os, object, name, tx);
+ ASSERT(error == 0 || error == ENOENT);
+ break;
+ }
+
+ if (tx != NULL)
+ dmu_tx_commit(tx);
+
+ umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Commit callback data.
+ */
+typedef struct ztest_cb_data {
+ list_node_t zcd_node;
+ uint64_t zcd_txg;
+ int zcd_expected_err;
+ boolean_t zcd_added;
+ boolean_t zcd_called;
+ spa_t *zcd_spa;
+} ztest_cb_data_t;
+
+/* This is the actual commit callback function */
+static void
+ztest_commit_callback(void *arg, int error)
+{
+ ztest_cb_data_t *data = arg;
+ uint64_t synced_txg;
+
+ VERIFY(data != NULL);
+ VERIFY3S(data->zcd_expected_err, ==, error);
+ VERIFY(!data->zcd_called);
+
+ synced_txg = spa_last_synced_txg(data->zcd_spa);
+ if (data->zcd_txg > synced_txg)
+ fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
+ ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
+ synced_txg);
+
+ data->zcd_called = B_TRUE;
+
+ if (error == ECANCELED) {
+ ASSERT0(data->zcd_txg);
+ ASSERT(!data->zcd_added);
+
+ /*
+ * The private callback data should be destroyed here, but
+ * since we are going to check the zcd_called field after
+ * dmu_tx_abort(), we will destroy it there.
+ */
+ return;
+ }
+
+ ASSERT(data->zcd_added);
+ ASSERT3U(data->zcd_txg, !=, 0);
+
+ (void) mutex_enter(&zcl.zcl_callbacks_lock);
+
+ /* Track the smallest delay between a txg and its commit callback */
+ if ((synced_txg - data->zcd_txg) < zc_min_txg_delay)
+ zc_min_txg_delay = synced_txg - data->zcd_txg;
+
+ /* Remove our callback from the list */
+ list_remove(&zcl.zcl_callbacks, data);
+
+ (void) mutex_exit(&zcl.zcl_callbacks_lock);
+
+ umem_free(data, sizeof (ztest_cb_data_t));
+}
+
+/* Allocate and initialize callback data structure */
+static ztest_cb_data_t *
+ztest_create_cb_data(objset_t *os, uint64_t txg)
+{
+ ztest_cb_data_t *cb_data;
+
+ cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
+
+ cb_data->zcd_txg = txg;
+ cb_data->zcd_spa = dmu_objset_spa(os);
+ list_link_init(&cb_data->zcd_node);
+
+ return (cb_data);
+}
+
+/*
+ * Commit callback test.
+ */
+void
+ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os = zd->zd_os;
+ ztest_od_t *od;
+ dmu_tx_t *tx;
+ ztest_cb_data_t *cb_data[3], *tmp_cb;
+ uint64_t old_txg, txg;
+ int i, error = 0;
+
+ od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL);
+ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
+
+ if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
+ umem_free(od, sizeof (ztest_od_t));
+ return;
+ }
+
+ tx = dmu_tx_create(os);
+
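+ /*
+ * Register the first callback before the tx is assigned; its txg
+ * is filled in below once (and if) the assignment succeeds.
+ */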
+ cb_data[0] = ztest_create_cb_data(os, 0);
+ dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
+
+ dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t));
+
+ /* Every once in a while, abort the transaction on purpose */
+ if (ztest_random(100) == 0)
+ error = -1;
+
+ if (!error)
+ error = dmu_tx_assign(tx, TXG_NOWAIT);
+
+ txg = error ? 0 : dmu_tx_get_txg(tx);
+
+ cb_data[0]->zcd_txg = txg;
+ cb_data[1] = ztest_create_cb_data(os, txg);
+ dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
+
+ if (error) {
+ /*
+ * It's not a strict requirement to call the registered
+ * callbacks from inside dmu_tx_abort(), but that is what is
+ * supposed to happen in the current implementation, so we will
+ * check for that.
+ */
+ for (i = 0; i < 2; i++) {
+ cb_data[i]->zcd_expected_err = ECANCELED;
+ VERIFY(!cb_data[i]->zcd_called);
+ }
+
+ dmu_tx_abort(tx);
+
+ for (i = 0; i < 2; i++) {
+ VERIFY(cb_data[i]->zcd_called);
+ umem_free(cb_data[i], sizeof (ztest_cb_data_t));
+ }
+
+ umem_free(od, sizeof (ztest_od_t));
+ return;
+ }
+
+ cb_data[2] = ztest_create_cb_data(os, txg);
+ dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
+
+ /*
+ * Read existing data to make sure there isn't a future leak.
+ */
+ VERIFY(0 == dmu_read(os, od->od_object, 0, sizeof (uint64_t),
+ &old_txg, DMU_READ_PREFETCH));
+
+ if (old_txg > txg)
+ fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
+ old_txg, txg);
+
+ dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx);
+
+ (void) mutex_enter(&zcl.zcl_callbacks_lock);
+
+ /*
+ * Since commit callbacks don't have any ordering requirement and since
+ * it is theoretically possible for a commit callback to be called
+ * after an arbitrary amount of time has elapsed since its txg has been
+ * synced, it is difficult to reliably determine whether a commit
+ * callback hasn't been called due to high load or due to a flawed
+ * implementation.
+ *
+ * In practice, we will assume that if after a certain number of txgs a
+ * commit callback hasn't been called, then most likely there's an
+ * implementation bug.
+ */
+ tmp_cb = list_head(&zcl.zcl_callbacks);
+ if (tmp_cb != NULL &&
+ tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) {
+ fatal(0, "Commit callback threshold exceeded, oldest txg: %"
+ PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
+ }
+
+ /*
+ * Let's find the place to insert our callbacks.
+ *
+ * Even though the list is ordered by txg, it is possible for the
+ * insertion point to not be the end because our txg may already be
+ * quiescing at this point and other callbacks in the open txg
+ * (from other objsets) may have sneaked in.
+ */
+ tmp_cb = list_tail(&zcl.zcl_callbacks);
+ while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
+ tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
+
+ /* Add the 3 callbacks to the list */
+ for (i = 0; i < 3; i++) {
+ if (tmp_cb == NULL)
+ list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
+ else
+ list_insert_after(&zcl.zcl_callbacks, tmp_cb,
+ cb_data[i]);
+
+ cb_data[i]->zcd_added = B_TRUE;
+ VERIFY(!cb_data[i]->zcd_called);
+
+ tmp_cb = cb_data[i];
+ }
+
+ zc_cb_counter += 3;
+
+ (void) mutex_exit(&zcl.zcl_callbacks_lock);
+
+ dmu_tx_commit(tx);
+
+ umem_free(od, sizeof (ztest_od_t));
+}
+
+/*
+ * Visit each object in the dataset. Verify that its properties
+ * are consistent with what was stored in the block tag when it was created,
+ * and that its unused bonus buffer space has not been overwritten.
+ */
+/* ARGSUSED */
+void
+ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os = zd->zd_os;
+ uint64_t obj;
+ int err = 0;
+
+ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
+ ztest_block_tag_t *bt = NULL;
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+
+ ztest_object_lock(zd, obj, RL_READER);
+ if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) {
+ ztest_object_unlock(zd, obj);
+ continue;
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_size >= sizeof (*bt))
+ bt = ztest_bt_bonus(db);
+
+ if (bt && bt->bt_magic == BT_MAGIC) {
+ ztest_bt_verify(bt, os, obj, doi.doi_dnodesize,
+ bt->bt_offset, bt->bt_gen, bt->bt_txg,
+ bt->bt_crtxg);
+ ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen);
+ }
+
+ dmu_buf_rele(db, FTAG);
+ ztest_object_unlock(zd, obj);
+ }
+}
+
+/* ARGSUSED */
+void
+ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
+{
+ zfs_prop_t proplist[] = {
+ ZFS_PROP_CHECKSUM,
+ ZFS_PROP_COMPRESSION,
+ ZFS_PROP_COPIES,
+ ZFS_PROP_DEDUP
+ };
+ int p;
+
+ (void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+ for (p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
+ (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
+ ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
+
+ VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE,
+ ztest_random_blocksize(), (int)ztest_random(2)));
+
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+}
+
+/* ARGSUSED */
+void
+ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
+{
+ nvlist_t *props = NULL;
+
+ (void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+ (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2));
+
+ VERIFY0(spa_prop_get(ztest_spa, &props));
+
+ if (ztest_opts.zo_verbose >= 6)
+ dump_nvlist(props, 4);
+
+ nvlist_free(props);
+
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+}
+
+static int
+user_release_one(const char *snapname, const char *holdname)
+{
+ nvlist_t *snaps, *holds;
+ int error;
+
+ snaps = fnvlist_alloc();
+ holds = fnvlist_alloc();
+ fnvlist_add_boolean(holds, holdname);
+ fnvlist_add_nvlist(snaps, snapname, holds);
+ fnvlist_free(holds);
+ error = dsl_dataset_user_release(snaps, NULL);
+ fnvlist_free(snaps);
+ return (error);
+}
+
+/*
+ * Test snapshot hold/release and deferred destroy.
+ */
+void
+ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
+{
+ int error;
+ objset_t *os = zd->zd_os;
+ objset_t *origin;
+ char snapname[100];
+ char fullname[100];
+ char clonename[100];
+ char tag[100];
+ char osname[ZFS_MAX_DATASET_NAME_LEN];
+ nvlist_t *holds;
+
+ (void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+ dmu_objset_name(os, osname);
+
+ (void) snprintf(snapname, sizeof (snapname), "sh1_%llu",
+ (u_longlong_t)id);
+ (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
+ (void) snprintf(clonename, sizeof (clonename),
+ "%s/ch1_%llu", osname, (u_longlong_t)id);
+ (void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id);
+
+ /*
+ * Clean up from any previous run.
+ */
+ error = dsl_destroy_head(clonename);
+ if (error != ENOENT)
+ ASSERT0(error);
+ error = user_release_one(fullname, tag);
+ if (error != ESRCH && error != ENOENT)
+ ASSERT0(error);
+ error = dsl_destroy_snapshot(fullname, B_FALSE);
+ if (error != ENOENT)
+ ASSERT0(error);
+
+ /*
+ * Create snapshot, clone it, mark snap for deferred destroy,
+ * destroy clone, verify snap was also destroyed.
+ */
+ error = dmu_objset_snapshot_one(osname, snapname);
+ if (error) {
+ if (error == ENOSPC) {
+ ztest_record_enospc("dmu_objset_snapshot");
+ goto out;
+ }
+ fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
+ }
+
+ error = dmu_objset_clone(clonename, fullname);
+ if (error) {
+ if (error == ENOSPC) {
+ ztest_record_enospc("dmu_objset_clone");
+ goto out;
+ }
+ fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
+ }
+
+ error = dsl_destroy_snapshot(fullname, B_TRUE);
+ if (error) {
+ fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
+ fullname, error);
+ }
+
+ error = dsl_destroy_head(clonename);
+ if (error)
+ fatal(0, "dsl_destroy_head(%s) = %d", clonename, error);
+
+ error = dmu_objset_hold(fullname, FTAG, &origin);
+ if (error != ENOENT)
+ fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
+
+ /*
+ * Create snapshot, add temporary hold, verify that we can't
+ * destroy a held snapshot, mark for deferred destroy,
+ * release hold, verify snapshot was destroyed.
+ */
+ error = dmu_objset_snapshot_one(osname, snapname);
+ if (error) {
+ if (error == ENOSPC) {
+ ztest_record_enospc("dmu_objset_snapshot");
+ goto out;
+ }
+ fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
+ }
+
+ holds = fnvlist_alloc();
+ fnvlist_add_string(holds, fullname, tag);
+ error = dsl_dataset_user_hold(holds, 0, NULL);
+ fnvlist_free(holds);
+
+ if (error == ENOSPC) {
+ ztest_record_enospc("dsl_dataset_user_hold");
+ goto out;
+ } else if (error) {
+ fatal(0, "dsl_dataset_user_hold(%s, %s) = %u",
+ fullname, tag, error);
+ }
+
+ error = dsl_destroy_snapshot(fullname, B_FALSE);
+ if (error != EBUSY) {
+ fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
+ fullname, error);
+ }
+
+ error = dsl_destroy_snapshot(fullname, B_TRUE);
+ if (error) {
+ fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
+ fullname, error);
+ }
+
+ error = user_release_one(fullname, tag);
+ if (error)
+ fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error);
+
+ VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);
+
+out:
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+}
+
+/*
+ * Inject random faults into the on-disk data.
+ */
+/* ARGSUSED */
+void
+ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_t *zs = ztest_shared;
+ spa_t *spa = ztest_spa;
+ int fd;
+ uint64_t offset;
+ uint64_t leaves;
+ uint64_t bad = 0x1990c0ffeedecadeull;
+ uint64_t top, leaf;
+ char *path0;
+ char *pathrand;
+ size_t fsize;
+ int bshift = SPA_MAXBLOCKSHIFT + 2;
+ int iters = 1000;
+ int maxfaults;
+ int mirror_save;
+ vdev_t *vd0 = NULL;
+ uint64_t guid0 = 0;
+ boolean_t islog = B_FALSE;
+
+ path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+ pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+ mutex_enter(&ztest_vdev_lock);
+
+ /*
+ * Device removal is in progress, fault injection must be disabled
+ * until it completes and the pool is scrubbed. The fault injection
+ * strategy for damaging blocks does not take in to account evacuated
+ * blocks which may have already been damaged.
+ */
+ if (ztest_device_removal_active) {
+ mutex_exit(&ztest_vdev_lock);
+ goto out;
+ }
+
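+ /*
+ * "leaves" is the number of leaf vdevs per top-level vdev: the
+ * mirror width times the raidz width.
+ */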
+ maxfaults = MAXFAULTS(zs);
+ leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
+ mirror_save = zs->zs_mirrors;
+ mutex_exit(&ztest_vdev_lock);
+
+ ASSERT(leaves >= 1);
+
+ /*
+ * While ztest is running the number of leaves will not change. This
+ * is critical for the fault injection logic as it determines where
+ * errors can be safely injected such that they are always repairable.
+ *
+ * When restarting ztest a different number of leaves may be requested
+ * which will shift the regions to be damaged. This is fine as long
+ * as the pool has been scrubbed prior to using the new mapping.
+ * Failure to do so can result in non-repairable damage being injected.
+ */
+ if (ztest_pool_scrubbed == B_FALSE)
+ goto out;
+
+ /*
+ * Grab the name lock as reader. There are some operations
+ * which don't like to have their vdevs changed while
+ * they are in progress (i.e. spa_change_guid). Those
+ * operations will have grabbed the name lock as writer.
+ */
+ (void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+ /*
+ * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+ if (ztest_random(2) == 0) {
+ /*
+ * Inject errors on a normal data device or slog device.
+ */
+ top = ztest_random_vdev_top(spa, B_TRUE);
+ leaf = ztest_random(leaves) + zs->zs_splits;
+
+ /*
+ * Generate paths to the first leaf in this top-level vdev,
+ * and to the random leaf we selected. We'll induce transient
+ * write failures and random online/offline activity on leaf 0,
+ * and we'll write random garbage to the randomly chosen leaf.
+ */
+ (void) snprintf(path0, MAXPATHLEN, ztest_dev_template,
+ ztest_opts.zo_dir, ztest_opts.zo_pool,
+ top * leaves + zs->zs_splits);
+ (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template,
+ ztest_opts.zo_dir, ztest_opts.zo_pool,
+ top * leaves + leaf);
+
+ vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
+ if (vd0 != NULL && vd0->vdev_top->vdev_islog)
+ islog = B_TRUE;
+
+ /*
+ * If the top-level vdev needs to be resilvered
+ * then we only allow faults on the device that is
+ * resilvering.
+ */
+ if (vd0 != NULL && maxfaults != 1 &&
+ (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
+ vd0->vdev_resilver_txg != 0)) {
+ /*
+ * Make vd0 explicitly claim to be unreadable,
+ * or unwriteable, or reach behind its back
+ * and close the underlying fd. We can do this if
+ * maxfaults == 0 because we'll fail and reexecute,
+ * and we can do it if maxfaults >= 2 because we'll
+ * have enough redundancy. If maxfaults == 1, the
+ * combination of this with injection of random data
+ * corruption below exceeds the pool's fault tolerance.
+ */
+ vdev_file_t *vf = vd0->vdev_tsd;
+
+ zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d",
+ (long long)vd0->vdev_id, (int)maxfaults);
+
+ if (vf != NULL && ztest_random(3) == 0) {
+ (void) close(vf->vf_file->f_fd);
+ vf->vf_file->f_fd = -1;
+ } else if (ztest_random(2) == 0) {
+ vd0->vdev_cant_read = B_TRUE;
+ } else {
+ vd0->vdev_cant_write = B_TRUE;
+ }
+ guid0 = vd0->vdev_guid;
+ }
+ } else {
+ /*
+ * Inject errors on an l2cache device.
+ */
+ spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+ if (sav->sav_count == 0) {
+ spa_config_exit(spa, SCL_STATE, FTAG);
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+ goto out;
+ }
+ vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
+ guid0 = vd0->vdev_guid;
+ (void) strcpy(path0, vd0->vdev_path);
+ (void) strcpy(pathrand, vd0->vdev_path);
+
+ leaf = 0;
+ leaves = 1;
+ maxfaults = INT_MAX; /* no limit on cache devices */
+ }
+
+ spa_config_exit(spa, SCL_STATE, FTAG);
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+
+ /*
+ * If we can tolerate two or more faults, or we're dealing
+ * with a slog, randomly online/offline vd0.
+ */
+ if ((maxfaults >= 2 || islog) && guid0 != 0) {
+ if (ztest_random(10) < 6) {
+ int flags = (ztest_random(2) == 0 ?
+ ZFS_OFFLINE_TEMPORARY : 0);
+
+ /*
+ * We have to grab the zs_name_lock as writer to
+ * prevent a race between offlining a slog and
+ * destroying a dataset. Offlining the slog will
+ * grab a reference on the dataset which may cause
+ * dsl_destroy_head() to fail with EBUSY thus
+ * leaving the dataset in an inconsistent state.
+ */
+ if (islog)
+ (void) pthread_rwlock_wrlock(&ztest_name_lock);
+
+ VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
+
+ if (islog)
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+ } else {
+ /*
+ * Ideally we would like to be able to randomly
+ * call vdev_[on|off]line without holding locks
+ * to force unpredictable failures but the side
+ * effects of vdev_[on|off]line prevent us from
+ * doing so. We grab the ztest_vdev_lock here to
+ * prevent a race between injection testing and
+ * aux_vdev removal.
+ */
+ mutex_enter(&ztest_vdev_lock);
+ (void) vdev_online(spa, guid0, 0, NULL);
+ mutex_exit(&ztest_vdev_lock);
+ }
+ }
+
+ if (maxfaults == 0)
+ goto out;
+
+ /*
+ * We have at least single-fault tolerance, so inject data corruption.
+ */
+ fd = open(pathrand, O_RDWR);
+
+ if (fd == -1) /* we hit a gap in the device namespace */
+ goto out;
+
+ fsize = lseek(fd, 0, SEEK_END);
+
+ while (--iters != 0) {
+ /*
+ * The offset must be chosen carefully to ensure that
+ * we do not inject a given logical block with errors
+ * on two different leaf devices, because ZFS can not
+ * tolerate that (if maxfaults==1).
+ *
+ * To achieve this we divide each leaf device into
+ * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4).
+ * Each chunk is further divided into error-injection
+ * ranges (can accept errors) and clear ranges (we do
+ * not inject errors in those). Each error-injection
+ * range can accept errors only for a single leaf vdev.
+ * Error-injection ranges are separated by clear ranges.
+ *
+ * For example, with 3 leaves, each chunk looks like:
+ * 0 to 32M: injection range for leaf 0
+ * 32M to 64M: clear range - no injection allowed
+ * 64M to 96M: injection range for leaf 1
+ * 96M to 128M: clear range - no injection allowed
+ * 128M to 160M: injection range for leaf 2
+ * 160M to 192M: clear range - no injection allowed
+ *
+ * Each clear range must be large enough such that a
+ * single block cannot straddle it. This way a block
+ * can't be a target in two different injection ranges
+ * (on different leaf vdevs).
+ */
+ offset = ztest_random(fsize / (leaves << bshift)) *
+ (leaves << bshift) + (leaf << bshift) +
+ (ztest_random(1ULL << (bshift - 1)) & -8ULL);
+
+ /*
+ * Only allow damage to the labels at one end of the vdev.
+ *
+ * If all labels are damaged, the device will be totally
+ * inaccessible, which will result in loss of data,
+ * because we also damage (parts of) the other side of
+ * the mirror/raidz.
+ *
+ * Additionally, we will always have both an even and an
+ * odd label, so that we can handle crashes in the
+ * middle of vdev_config_sync().
+ */
+ if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE)
+ continue;
+
+ /*
+ * The two end labels are stored at the "end" of the disk, but
+ * the end of the disk (vdev_psize) is aligned to
+ * sizeof (vdev_label_t).
+ */
+ uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
+ if ((leaf & 1) == 1 &&
+ offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
+ continue;
+
+ mutex_enter(&ztest_vdev_lock);
+ if (mirror_save != zs->zs_mirrors) {
+ mutex_exit(&ztest_vdev_lock);
+ (void) close(fd);
+ goto out;
+ }
+
+ if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
+ fatal(1, "can't inject bad word at 0x%llx in %s",
+ offset, pathrand);
+
+ mutex_exit(&ztest_vdev_lock);
+
+ if (ztest_opts.zo_verbose >= 7)
+ (void) printf("injected bad word into %s,"
+ " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
+ }
+
+ (void) close(fd);
+out:
+ umem_free(path0, MAXPATHLEN);
+ umem_free(pathrand, MAXPATHLEN);
+}
+
+/*
+ * By design ztest will never inject uncorrectable damage into the pool.
+ * Issue a scrub, wait for it to complete, and verify there is never any
+ * persistent damage.
+ *
+ * Only after a full scrub has been completed is it safe to start injecting
+ * data corruption. See the comment in ztest_fault_inject().
+ */
+static int
+ztest_scrub_impl(spa_t *spa)
+{
+ int error = spa_scan(spa, POOL_SCAN_SCRUB);
+ if (error)
+ return (error);
+
+ while (dsl_scan_scrubbing(spa_get_dsl(spa)))
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ if (spa_get_errlog_size(spa) > 0)
+ return (ECKSUM);
+
+ ztest_pool_scrubbed = B_TRUE;
+
+ return (0);
+}
+
+/*
+ * Scrub the pool.
+ */
+/* ARGSUSED */
+void
+ztest_scrub(ztest_ds_t *zd, uint64_t id)
+{
+ spa_t *spa = ztest_spa;
+ int error;
+
+ /*
+ * A scrub is already in progress as part of device removal.
+ */
+ if (ztest_device_removal_active)
+ return;
+
+ /*
+ * Start a scrub, wait a moment, then force a restart.
+ */
+ (void) spa_scan(spa, POOL_SCAN_SCRUB);
+ (void) poll(NULL, 0, 100);
+
+ error = ztest_scrub_impl(spa);
+ if (error == EBUSY)
+ error = 0;
+ ASSERT0(error);
+}
+
+/*
+ * Change the guid for the pool.
+ */
+/* ARGSUSED */
+void
+ztest_reguid(ztest_ds_t *zd, uint64_t id)
+{
+ spa_t *spa = ztest_spa;
+ uint64_t orig, load;
+ int error;
+
+ if (ztest_opts.zo_mmp_test)
+ return;
+
+ orig = spa_guid(spa);
+ load = spa_load_guid(spa);
+
+ (void) pthread_rwlock_wrlock(&ztest_name_lock);
+ error = spa_change_guid(spa);
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+
+ if (error != 0)
+ return;
+
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("Changed guid old %llu -> %llu\n",
+ (u_longlong_t)orig, (u_longlong_t)spa_guid(spa));
+ }
+
+ VERIFY3U(orig, !=, spa_guid(spa));
+ VERIFY3U(load, ==, spa_load_guid(spa));
+}
+
+void
+ztest_fletcher(ztest_ds_t *zd, uint64_t id)
+{
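+ /* Run the checksum comparisons for roughly one second. */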
+ hrtime_t end = gethrtime() + NANOSEC;
+
+ while (gethrtime() <= end) {
+ int run_count = 100;
+ void *buf;
+ struct abd *abd_data, *abd_meta;
+ uint32_t size;
+ int *ptr;
+ int i;
+ zio_cksum_t zc_ref;
+ zio_cksum_t zc_ref_byteswap;
+
+ size = ztest_random_blocksize();
+
+ buf = umem_alloc(size, UMEM_NOFAIL);
+ abd_data = abd_alloc(size, B_FALSE);
+ abd_meta = abd_alloc(size, B_TRUE);
+
+ for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++)
+ *ptr = ztest_random(UINT_MAX);
+
+ abd_copy_from_buf_off(abd_data, buf, 0, size);
+ abd_copy_from_buf_off(abd_meta, buf, 0, size);
+
+ VERIFY0(fletcher_4_impl_set("scalar"));
+ fletcher_4_native(buf, size, NULL, &zc_ref);
+ fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap);
+
+ VERIFY0(fletcher_4_impl_set("cycle"));
+ while (run_count-- > 0) {
+ zio_cksum_t zc;
+ zio_cksum_t zc_byteswap;
+
+ fletcher_4_byteswap(buf, size, NULL, &zc_byteswap);
+ fletcher_4_native(buf, size, NULL, &zc);
+
+ VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc)));
+ VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap,
+ sizeof (zc_byteswap)));
+
+ /* Test ABD - data */
+ abd_fletcher_4_byteswap(abd_data, size, NULL,
+ &zc_byteswap);
+ abd_fletcher_4_native(abd_data, size, NULL, &zc);
+
+ VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc)));
+ VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap,
+ sizeof (zc_byteswap)));
+
+ /* Test ABD - metadata */
+ abd_fletcher_4_byteswap(abd_meta, size, NULL,
+ &zc_byteswap);
+ abd_fletcher_4_native(abd_meta, size, NULL, &zc);
+
+ VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc)));
+ VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap,
+ sizeof (zc_byteswap)));
+
+ }
+
+ umem_free(buf, size);
+ abd_free(abd_data);
+ abd_free(abd_meta);
+ }
+}
+
+void
+ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id)
+{
+ void *buf;
+ size_t size;
+ int *ptr;
+ int i;
+ zio_cksum_t zc_ref;
+ zio_cksum_t zc_ref_bswap;
+
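+ /* Exercise the incremental fletcher-4 interfaces for about one second. */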
+ hrtime_t end = gethrtime() + NANOSEC;
+
+ while (gethrtime() <= end) {
+ int run_count = 100;
+
+ size = ztest_random_blocksize();
+ buf = umem_alloc(size, UMEM_NOFAIL);
+
+ for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++)
+ *ptr = ztest_random(UINT_MAX);
+
+ VERIFY0(fletcher_4_impl_set("scalar"));
+ fletcher_4_native(buf, size, NULL, &zc_ref);
+ fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap);
+
+ VERIFY0(fletcher_4_impl_set("cycle"));
+
+ while (run_count-- > 0) {
+ zio_cksum_t zc;
+ zio_cksum_t zc_bswap;
+ size_t pos = 0;
+
+ ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
+ ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0);
+
+ while (pos < size) {
+ size_t inc = 64 * ztest_random(size / 67);
+ /* sometimes add a few bytes to test the non-SIMD path */
+ if (ztest_random(100) < 10)
+ inc += P2ALIGN(ztest_random(64),
+ sizeof (uint32_t));
+
+ if (inc > (size - pos))
+ inc = size - pos;
+
+ fletcher_4_incremental_native(buf + pos, inc,
+ &zc);
+ fletcher_4_incremental_byteswap(buf + pos, inc,
+ &zc_bswap);
+
+ pos += inc;
+ }
+
+ VERIFY3U(pos, ==, size);
+
+ VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
+ VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));
+
+ /*
+ * Verify that computing the checksum incrementally over the
+ * whole buffer matches the non-incremental version.
+ */
+ ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
+ ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0);
+
+ fletcher_4_incremental_native(buf, size, &zc);
+ fletcher_4_incremental_byteswap(buf, size, &zc_bswap);
+
+ VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
+ VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));
+ }
+
+ umem_free(buf, size);
+ }
+}
+
+static int
+ztest_check_path(char *path)
+{
+ struct stat s;
+ /* return true on success */
+ return (!stat(path, &s));
+}
+
+static void
+ztest_get_zdb_bin(char *bin, int len)
+{
+ char *zdb_path;
+ /*
+ * Try ZDB_PATH, then the in-tree zdb path. If neither works, just
+ * let popen() search PATH.
+ */
+ if ((zdb_path = getenv("ZDB_PATH"))) {
+ strlcpy(bin, zdb_path, len); /* In env */
+ if (!ztest_check_path(bin)) {
+ ztest_dump_core = 0;
+ fatal(1, "invalid ZDB_PATH '%s'", bin);
+ }
+ return;
+ }
+
+ VERIFY(realpath(getexecname(), bin) != NULL);
+ if (strstr(bin, "/ztest/")) {
+ strstr(bin, "/ztest/")[0] = '\0'; /* In-tree */
+ strcat(bin, "/zdb/zdb");
+ if (ztest_check_path(bin))
+ return;
+ }
+ strcpy(bin, "zdb");
+}
+
+static vdev_t *
+ztest_random_concrete_vdev_leaf(vdev_t *vd)
+{
+ if (vd == NULL)
+ return (NULL);
+
+ if (vd->vdev_children == 0)
+ return (vd);
+
+ vdev_t *eligible[vd->vdev_children];
+ int eligible_idx = 0, i;
+ for (i = 0; i < vd->vdev_children; i++) {
+ vdev_t *cvd = vd->vdev_child[i];
+ if (cvd->vdev_top->vdev_removing)
+ continue;
+ if (cvd->vdev_children > 0 ||
+ (vdev_is_concrete(cvd) && !cvd->vdev_detached)) {
+ eligible[eligible_idx++] = cvd;
+ }
+ }
+ VERIFY(eligible_idx > 0);
+
+ uint64_t child_no = ztest_random(eligible_idx);
+ return (ztest_random_concrete_vdev_leaf(eligible[child_no]));
+}
+
+/* ARGSUSED */
+void
+ztest_initialize(ztest_ds_t *zd, uint64_t id)
+{
+ spa_t *spa = ztest_spa;
+ int error = 0;
+
+ mutex_enter(&ztest_vdev_lock);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ /* Random leaf vdev */
+ vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev);
+ if (rand_vd == NULL) {
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /*
+ * The random vdev we've selected may change as soon as we
+ * drop the spa_config_lock. We create local copies of things
+ * we're interested in.
+ */
+ uint64_t guid = rand_vd->vdev_guid;
+ char *path = strdup(rand_vd->vdev_path);
+ boolean_t active = rand_vd->vdev_initialize_thread != NULL;
+
+ zfs_dbgmsg("vd %px, guid %llu", rand_vd, guid);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
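+ /* Pick a random initialize command: start, cancel, or suspend. */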
+ uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS);
+
+ nvlist_t *vdev_guids = fnvlist_alloc();
+ nvlist_t *vdev_errlist = fnvlist_alloc();
+ fnvlist_add_uint64(vdev_guids, path, guid);
+ error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist);
+ fnvlist_free(vdev_guids);
+ fnvlist_free(vdev_errlist);
+
+ switch (cmd) {
+ case POOL_INITIALIZE_CANCEL:
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("Cancel initialize %s", path);
+ if (!active)
+ (void) printf(" failed (no initialize active)");
+ (void) printf("\n");
+ }
+ break;
+ case POOL_INITIALIZE_START:
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("Start initialize %s", path);
+ if (active && error == 0)
+ (void) printf(" failed (already active)");
+ else if (error != 0)
+ (void) printf(" failed (error %d)", error);
+ (void) printf("\n");
+ }
+ break;
+ case POOL_INITIALIZE_SUSPEND:
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("Suspend initialize %s", path);
+ if (!active)
+ (void) printf(" failed (no initialize active)");
+ (void) printf("\n");
+ }
+ break;
+ }
+ free(path);
+ mutex_exit(&ztest_vdev_lock);
+}
+
+/* ARGSUSED */
+void
+ztest_trim(ztest_ds_t *zd, uint64_t id)
+{
+ spa_t *spa = ztest_spa;
+ int error = 0;
+
+ mutex_enter(&ztest_vdev_lock);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ /* Random leaf vdev */
+ vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev);
+ if (rand_vd == NULL) {
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /*
+ * The random vdev we've selected may change as soon as we
+ * drop the spa_config_lock. We create local copies of things
+ * we're interested in.
+ */
+ uint64_t guid = rand_vd->vdev_guid;
+ char *path = strdup(rand_vd->vdev_path);
+ boolean_t active = rand_vd->vdev_trim_thread != NULL;
+
+ zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
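+ /*
+ * Pick a random TRIM command and rate (1 byte/s up to roughly
+ * 512 MB/s); request partial and secure TRIM most of the time.
+ */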
+ uint64_t cmd = ztest_random(POOL_TRIM_FUNCS);
+ uint64_t rate = 1 << ztest_random(30);
+ boolean_t partial = (ztest_random(5) > 0);
+ boolean_t secure = (ztest_random(5) > 0);
+
+ nvlist_t *vdev_guids = fnvlist_alloc();
+ nvlist_t *vdev_errlist = fnvlist_alloc();
+ fnvlist_add_uint64(vdev_guids, path, guid);
+ error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial,
+ secure, vdev_errlist);
+ fnvlist_free(vdev_guids);
+ fnvlist_free(vdev_errlist);
+
+ switch (cmd) {
+ case POOL_TRIM_CANCEL:
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("Cancel TRIM %s", path);
+ if (!active)
+ (void) printf(" failed (no TRIM active)");
+ (void) printf("\n");
+ }
+ break;
+ case POOL_TRIM_START:
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("Start TRIM %s", path);
+ if (active && error == 0)
+ (void) printf(" failed (already active)");
+ else if (error != 0)
+ (void) printf(" failed (error %d)", error);
+ (void) printf("\n");
+ }
+ break;
+ case POOL_TRIM_SUSPEND:
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("Suspend TRIM %s", path);
+ if (!active)
+ (void) printf(" failed (no TRIM active)");
+ (void) printf("\n");
+ }
+ break;
+ }
+ free(path);
+ mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Verify pool integrity by running zdb.
+ */
+static void
+ztest_run_zdb(char *pool)
+{
+ int status;
+ char *bin;
+ char *zdb;
+ char *zbuf;
+ const int len = MAXPATHLEN + MAXNAMELEN + 20;
+ FILE *fp;
+
+ bin = umem_alloc(len, UMEM_NOFAIL);
+ zdb = umem_alloc(len, UMEM_NOFAIL);
+ zbuf = umem_alloc(1024, UMEM_NOFAIL);
+
+ ztest_get_zdb_bin(bin, len);
+
+ (void) sprintf(zdb,
+ "%s -bcc%s%s -G -d -Y -e -y -p %s %s",
+ bin,
+ ztest_opts.zo_verbose >= 3 ? "s" : "",
+ ztest_opts.zo_verbose >= 4 ? "v" : "",
+ ztest_opts.zo_dir,
+ pool);
+
+ if (ztest_opts.zo_verbose >= 5)
+ (void) printf("Executing %s\n", strstr(zdb, "zdb "));
+
+ fp = popen(zdb, "r");
+
+ while (fgets(zbuf, 1024, fp) != NULL)
+ if (ztest_opts.zo_verbose >= 3)
+ (void) printf("%s", zbuf);
+
+ status = pclose(fp);
+
+ if (status == 0)
+ goto out;
+
+ ztest_dump_core = 0;
+ if (WIFEXITED(status))
+ fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
+ else
+ fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
+out:
+ umem_free(bin, len);
+ umem_free(zdb, len);
+ umem_free(zbuf, 1024);
+}
+
+static void
+ztest_walk_pool_directory(char *header)
+{
+ spa_t *spa = NULL;
+
+ if (ztest_opts.zo_verbose >= 6)
+ (void) printf("%s\n", header);
+
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL)
+ if (ztest_opts.zo_verbose >= 6)
+ (void) printf("\t%s\n", spa_name(spa));
+ mutex_exit(&spa_namespace_lock);
+}
+
+static void
+ztest_spa_import_export(char *oldname, char *newname)
+{
+ nvlist_t *config, *newconfig;
+ uint64_t pool_guid;
+ spa_t *spa;
+ int error;
+
+ if (ztest_opts.zo_verbose >= 4) {
+ (void) printf("import/export: old = %s, new = %s\n",
+ oldname, newname);
+ }
+
+ /*
+ * Clean up from previous runs.
+ */
+ (void) spa_destroy(newname);
+
+ /*
+ * Get the pool's configuration and guid.
+ */
+ VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
+
+ /*
+ * Kick off a scrub to tickle scrub/export races.
+ */
+ if (ztest_random(2) == 0)
+ (void) spa_scan(spa, POOL_SCAN_SCRUB);
+
+ pool_guid = spa_guid(spa);
+ spa_close(spa, FTAG);
+
+ ztest_walk_pool_directory("pools before export");
+
+ /*
+ * Export it.
+ */
+ VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));
+
+ ztest_walk_pool_directory("pools after export");
+
+ /*
+ * Try to import it.
+ */
+ newconfig = spa_tryimport(config);
+ ASSERT(newconfig != NULL);
+ nvlist_free(newconfig);
+
+ /*
+ * Import it under the new name.
+ */
+ error = spa_import(newname, config, NULL, 0);
+ if (error != 0) {
+ dump_nvlist(config, 0);
+ fatal(B_FALSE, "couldn't import pool %s as %s: error %u",
+ oldname, newname, error);
+ }
+
+ ztest_walk_pool_directory("pools after import");
+
+ /*
+ * Try to import it again -- should fail with EEXIST.
+ */
+ VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0));
+
+ /*
+ * Try to import it under a different name -- should fail with EEXIST.
+ */
+ VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0));
+
+ /*
+ * Verify that the pool is no longer visible under the old name.
+ */
+ VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
+
+ /*
+ * Verify that we can open and close the pool using the new name.
+ */
+ VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
+ ASSERT(pool_guid == spa_guid(spa));
+ spa_close(spa, FTAG);
+
+ nvlist_free(config);
+}
+
+static void
+ztest_resume(spa_t *spa)
+{
+ if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6)
+ (void) printf("resuming from suspended state\n");
+ spa_vdev_state_enter(spa, SCL_NONE);
+ vdev_clear(spa, NULL);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ (void) zio_resume(spa);
+}
+
+static void
+ztest_resume_thread(void *arg)
+{
+ spa_t *spa = arg;
+
+ while (!ztest_exiting) {
+ if (spa_suspended(spa))
+ ztest_resume(spa);
+ (void) poll(NULL, 0, 100);
+
+ /*
+ * Periodically change the zfs_compressed_arc_enabled setting.
+ */
+ if (ztest_random(10) == 0)
+ zfs_compressed_arc_enabled = ztest_random(2);
+
+ /*
+ * Periodically change the zfs_abd_scatter_enabled setting.
+ */
+ if (ztest_random(10) == 0)
+ zfs_abd_scatter_enabled = ztest_random(2);
+ }
+
+ thread_exit();
+}
+
+static void
+ztest_deadman_thread(void *arg)
+{
+ ztest_shared_t *zs = arg;
+ spa_t *spa = ztest_spa;
+ hrtime_t delay, overdue, last_run = gethrtime();
+
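+ /*
+ * The first check is deferred until the expected duration of the
+ * test pass has elapsed, plus the deadman synctime grace period.
+ */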
+ delay = (zs->zs_thread_stop - zs->zs_thread_start) +
+ MSEC2NSEC(zfs_deadman_synctime_ms);
+
+ while (!ztest_exiting) {
+ /*
+ * Wait for the delay timer while checking occasionally
+ * if we should stop.
+ */
+ if (gethrtime() < last_run + delay) {
+ (void) poll(NULL, 0, 1000);
+ continue;
+ }
+
+ /*
+ * If the pool is suspended then fail immediately. Otherwise,
+ * check to see if the pool is making any progress. If
+ * vdev_deadman() discovers that there haven't been any recent
+ * I/Os then it will end up aborting the tests.
+ */
+ if (spa_suspended(spa) || spa->spa_root_vdev == NULL) {
+ fatal(0, "aborting test after %llu seconds because "
+ "pool has transitioned to a suspended state.",
+ zfs_deadman_synctime_ms / 1000);
+ }
+ vdev_deadman(spa->spa_root_vdev, FTAG);
+
+ /*
+ * If the process doesn't complete within a grace period of
+ * zfs_deadman_synctime_ms over the expected finish time,
+ * then it may be hung and is terminated.
+ */
+ overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms);
+ if (gethrtime() > overdue) {
+ fatal(0, "aborting test after %llu seconds because "
+ "the process is overdue for termination.",
+ (gethrtime() - zs->zs_proc_start) / NANOSEC);
+ }
+
+ (void) printf("ztest has been running for %lld seconds\n",
+ (gethrtime() - zs->zs_proc_start) / NANOSEC);
+
+ last_run = gethrtime();
+ delay = MSEC2NSEC(zfs_deadman_checktime_ms);
+ }
+
+ thread_exit();
+}
+
+static void
+ztest_execute(int test, ztest_info_t *zi, uint64_t id)
+{
+ ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets];
+ ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test);
+ hrtime_t functime = gethrtime();
+ int i;
+
+ for (i = 0; i < zi->zi_iters; i++)
+ zi->zi_func(zd, id);
+
+ functime = gethrtime() - functime;
+
+ atomic_add_64(&zc->zc_count, 1);
+ atomic_add_64(&zc->zc_time, functime);
+
+ if (ztest_opts.zo_verbose >= 4)
+ (void) printf("%6.2f sec in %s\n",
+ (double)functime / NANOSEC, zi->zi_funcname);
+}
+
+static void
+ztest_thread(void *arg)
+{
+ int rand;
+ uint64_t id = (uintptr_t)arg;
+ ztest_shared_t *zs = ztest_shared;
+ uint64_t call_next;
+ hrtime_t now;
+ ztest_info_t *zi;
+ ztest_shared_callstate_t *zc;
+
+ while ((now = gethrtime()) < zs->zs_thread_stop) {
+ /*
+ * See if it's time to force a crash.
+ */
+ if (now > zs->zs_thread_kill)
+ ztest_kill(zs);
+
+ /*
+ * If we're getting ENOSPC with some regularity, stop.
+ */
+ if (zs->zs_enospc_count > 10)
+ break;
+
+ /*
+ * Pick a random function to execute.
+ */
+ rand = ztest_random(ZTEST_FUNCS);
+ zi = &ztest_info[rand];
+ zc = ZTEST_GET_SHARED_CALLSTATE(rand);
+ call_next = zc->zc_next;
+
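+ /*
+ * Atomically claim this function's next scheduled run; only the
+ * thread that wins the compare-and-swap executes it now, and the
+ * next run is pushed out by a random interval.
+ */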
+ if (now >= call_next &&
+ atomic_cas_64(&zc->zc_next, call_next, call_next +
+ ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) {
+ ztest_execute(rand, zi, id);
+ }
+ }
+
+ thread_exit();
+}
+
+static void
+ztest_dataset_name(char *dsname, char *pool, int d)
+{
+ (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d);
+}
+
+static void
+ztest_dataset_destroy(int d)
+{
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ int t;
+
+ ztest_dataset_name(name, ztest_opts.zo_pool, d);
+
+ if (ztest_opts.zo_verbose >= 3)
+ (void) printf("Destroying %s to free up space\n", name);
+
+ /*
+ * Clean up any non-standard clones and snapshots. In general,
+ * ztest thread t operates on dataset (t % zopt_datasets),
+ * so there may be more than one thing to clean up.
+ */
+ for (t = d; t < ztest_opts.zo_threads;
+ t += ztest_opts.zo_datasets)
+ ztest_dsl_dataset_cleanup(name, t);
+
+ (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
+ DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+}
+
+static void
+ztest_dataset_dirobj_verify(ztest_ds_t *zd)
+{
+ uint64_t usedobjs, dirobjs, scratch;
+
+ /*
+ * ZTEST_DIROBJ is the object directory for the entire dataset.
+ * Therefore, the number of objects in use should equal the
+ * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
+ * If not, we have an object leak.
+ *
+ * Note that we can only check this in ztest_dataset_open(),
+ * when the open-context and syncing-context values agree.
+ * That's because zap_count() returns the open-context value,
+ * while dmu_objset_space() returns the rootbp fill count.
+ */
+ VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
+ dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
+ ASSERT3U(dirobjs + 1, ==, usedobjs);
+}
+
+static int
+ztest_dataset_open(int d)
+{
+ ztest_ds_t *zd = &ztest_ds[d];
+ uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq;
+ objset_t *os;
+ zilog_t *zilog;
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ int error;
+
+ ztest_dataset_name(name, ztest_opts.zo_pool, d);
+
+ (void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+ error = ztest_dataset_create(name);
+ if (error == ENOSPC) {
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+ ztest_record_enospc(FTAG);
+ return (error);
+ }
+ ASSERT(error == 0 || error == EEXIST);
+
+ VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE,
+ B_TRUE, zd, &os));
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+
+ ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);
+
+ zilog = zd->zd_zilog;
+
+ if (zilog->zl_header->zh_claim_lr_seq != 0 &&
+ zilog->zl_header->zh_claim_lr_seq < committed_seq)
+ fatal(0, "missing log records: claimed %llu < committed %llu",
+ zilog->zl_header->zh_claim_lr_seq, committed_seq);
+
+ ztest_dataset_dirobj_verify(zd);
+
+ zil_replay(os, zd, ztest_replay_vector);
+
+ ztest_dataset_dirobj_verify(zd);
+
+ if (ztest_opts.zo_verbose >= 6)
+ (void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
+ zd->zd_name,
+ (u_longlong_t)zilog->zl_parse_blk_count,
+ (u_longlong_t)zilog->zl_parse_lr_count,
+ (u_longlong_t)zilog->zl_replaying_seq);
+
+ zilog = zil_open(os, ztest_get_data);
+
+ if (zilog->zl_replaying_seq != 0 &&
+ zilog->zl_replaying_seq < committed_seq)
+ fatal(0, "missing log records: replayed %llu < committed %llu",
+ zilog->zl_replaying_seq, committed_seq);
+
+ return (0);
+}
+
+static void
+ztest_dataset_close(int d)
+{
+ ztest_ds_t *zd = &ztest_ds[d];
+
+ zil_close(zd->zd_zilog);
+ dmu_objset_disown(zd->zd_os, B_TRUE, zd);
+
+ ztest_zd_fini(zd);
+}
+
+/* ARGSUSED */
+static int
+ztest_replay_zil_cb(const char *name, void *arg)
+{
+ objset_t *os;
+ ztest_ds_t *zdtmp;
+
+ VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE,
+ B_TRUE, FTAG, &os));
+
+ zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL);
+
+ ztest_zd_init(zdtmp, NULL, os);
+ zil_replay(os, zdtmp, ztest_replay_vector);
+ ztest_zd_fini(zdtmp);
+
+ if (dmu_objset_zil(os)->zl_parse_lr_count != 0 &&
+ ztest_opts.zo_verbose >= 6) {
+ zilog_t *zilog = dmu_objset_zil(os);
+
+ (void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
+ name,
+ (u_longlong_t)zilog->zl_parse_blk_count,
+ (u_longlong_t)zilog->zl_parse_lr_count,
+ (u_longlong_t)zilog->zl_replaying_seq);
+ }
+
+ umem_free(zdtmp, sizeof (ztest_ds_t));
+
+ dmu_objset_disown(os, B_TRUE, FTAG);
+ return (0);
+}
+
+static void
+ztest_freeze(void)
+{
+ ztest_ds_t *zd = &ztest_ds[0];
+ spa_t *spa;
+ int numloops = 0;
+
+ if (ztest_opts.zo_verbose >= 3)
+ (void) printf("testing spa_freeze()...\n");
+
+ kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
+ VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
+ VERIFY3U(0, ==, ztest_dataset_open(0));
+ ztest_spa = spa;
+
+ /*
+ * Force the first log block to be transactionally allocated.
+ * We have to do this before we freeze the pool -- otherwise
+ * the log chain won't be anchored.
+ */
+ while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
+ ztest_dmu_object_alloc_free(zd, 0);
+ zil_commit(zd->zd_zilog, 0);
+ }
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ /*
+ * Freeze the pool. This stops spa_sync() from doing anything,
+ * so that the only way to record changes from now on is the ZIL.
+ */
+ spa_freeze(spa);
+
+ /*
+ * Because it is hard to predict how much space a write will actually
+ * require beforehand, we only count half of the pool's space as our
+ * capacity limit, leaving ourselves some fudge room if we go over.
+ */
+ uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2;
+
+ /*
+ * Run tests that generate log records but don't alter the pool config
+ * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
+ * We do a txg_wait_synced() after each iteration to force the txg
+ * to increase well beyond the last synced value in the uberblock.
+ * The ZIL should be OK with that.
+ *
+ * Run a random number of times less than zo_maxloops and ensure we do
+ * not run out of space on the pool.
+ */
+ while (ztest_random(10) != 0 &&
+ numloops++ < ztest_opts.zo_maxloops &&
+ metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
+ ztest_od_t od;
+ ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0);
+ VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
+ ztest_io(zd, od.od_object,
+ ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ }
+
+ /*
+ * Commit all of the changes we just generated.
+ */
+ zil_commit(zd->zd_zilog, 0);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ /*
+ * Close our dataset and close the pool.
+ */
+ ztest_dataset_close(0);
+ spa_close(spa, FTAG);
+ kernel_fini();
+
+ /*
+ * Open and close the pool and dataset to induce log replay.
+ */
+ kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
+ VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
+ ASSERT(spa_freeze_txg(spa) == UINT64_MAX);
+ VERIFY3U(0, ==, ztest_dataset_open(0));
+ ztest_spa = spa;
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ ztest_dataset_close(0);
+ ztest_reguid(NULL, 0);
+
+ spa_close(spa, FTAG);
+ kernel_fini();
+}
+
+static void
+ztest_import_impl(ztest_shared_t *zs)
+{
+ importargs_t args = { 0 };
+ nvlist_t *cfg = NULL;
+ int nsearch = 1;
+ char *searchdirs[nsearch];
+ int flags = ZFS_IMPORT_MISSING_LOG;
+
+ searchdirs[0] = ztest_opts.zo_dir;
+ args.paths = nsearch;
+ args.path = searchdirs;
+ args.can_be_active = B_FALSE;
+
+ VERIFY0(zpool_find_config(NULL, ztest_opts.zo_pool, &cfg, &args,
+ &libzpool_config_ops));
+ VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags));
+}
+
+/*
+ * Import a storage pool with the given name.
+ */
+static void
+ztest_import(ztest_shared_t *zs)
+{
+ spa_t *spa;
+
+ mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL);
+ VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL));
+
+ kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
+
+ ztest_import_impl(zs);
+
+ VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
+ zs->zs_metaslab_sz =
+ 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
+ spa_close(spa, FTAG);
+
+ kernel_fini();
+
+ if (!ztest_opts.zo_mmp_test) {
+ ztest_run_zdb(ztest_opts.zo_pool);
+ ztest_freeze();
+ ztest_run_zdb(ztest_opts.zo_pool);
+ }
+
+ (void) pthread_rwlock_destroy(&ztest_name_lock);
+ mutex_destroy(&ztest_vdev_lock);
+ mutex_destroy(&ztest_checkpoint_lock);
+}
+
+/*
+ * Kick off threads to run tests on all datasets in parallel.
+ */
+static void
+ztest_run(ztest_shared_t *zs)
+{
+ spa_t *spa;
+ objset_t *os;
+ kthread_t *resume_thread, *deadman_thread;
+ kthread_t **run_threads;
+ uint64_t object;
+ int error;
+ int t, d;
+
+ ztest_exiting = B_FALSE;
+
+ /*
+ * Initialize parent/child shared state.
+ */
+ mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL);
+ VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL));
+
+ zs->zs_thread_start = gethrtime();
+ zs->zs_thread_stop =
+ zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC;
+ zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
+ zs->zs_thread_kill = zs->zs_thread_stop;
+ if (ztest_random(100) < ztest_opts.zo_killrate) {
+ zs->zs_thread_kill -=
+ ztest_random(ztest_opts.zo_passtime * NANOSEC);
+ }
+
+ mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
+ offsetof(ztest_cb_data_t, zcd_node));
+
+ /*
+ * Open our pool. It may need to be imported first depending on
+ * what tests were running when the previous pass was terminated.
+ */
+ kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
+ error = spa_open(ztest_opts.zo_pool, &spa, FTAG);
+ if (error) {
+ VERIFY3S(error, ==, ENOENT);
+ ztest_import_impl(zs);
+ VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
+ zs->zs_metaslab_sz =
+ 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
+ }
+
+ metaslab_preload_limit = ztest_random(20) + 1;
+ ztest_spa = spa;
+
+ VERIFY0(vdev_raidz_impl_set("cycle"));
+
+ dmu_objset_stats_t dds;
+ VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool,
+ DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os));
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+ dmu_objset_fast_stat(os, &dds);
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ zs->zs_guid = dds.dds_guid;
+ dmu_objset_disown(os, B_TRUE, FTAG);
+
+ /*
+ * Create a thread to periodically resume suspended I/O.
+ */
+ resume_thread = thread_create(NULL, 0, ztest_resume_thread,
+ spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
+
+ /*
+ * Create a deadman thread and set it to panic if we hang.
+ */
+ deadman_thread = thread_create(NULL, 0, ztest_deadman_thread,
+ zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri);
+
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
+
+ /*
+ * Verify that we can safely inquire about any object,
+ * whether it's allocated or not. To make it interesting,
+ * we probe a 5-wide window around each power of two.
+ * This hits all edge cases, including zero and the max.
+ */
+ for (t = 0; t < 64; t++) {
+ for (d = -5; d <= 5; d++) {
+ error = dmu_object_info(spa->spa_meta_objset,
+ (1ULL << t) + d, NULL);
+ ASSERT(error == 0 || error == ENOENT ||
+ error == EINVAL);
+ }
+ }
+
+ /*
+ * If we got any ENOSPC errors on the previous run, destroy something.
+ */
+ if (zs->zs_enospc_count != 0) {
+ int d = ztest_random(ztest_opts.zo_datasets);
+ ztest_dataset_destroy(d);
+ }
+ zs->zs_enospc_count = 0;
+
+ /*
+ * If we were in the middle of ztest_device_removal() and were killed
+ * we need to ensure the removal and scrub complete before running
+ * any tests that check ztest_device_removal_active. The removal will
+ * be restarted automatically when the spa is opened, but we need to
+ * initiate the scrub manually if it is not already in progress. Note
+ * that we always run the scrub whenever an indirect vdev exists
+ * because we have no way of knowing for sure if ztest_device_removal()
+ * fully completed its scrub before the pool was reimported.
+ */
+ if (spa->spa_removing_phys.sr_state == DSS_SCANNING ||
+ spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+ while (spa->spa_removing_phys.sr_state == DSS_SCANNING)
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ error = ztest_scrub_impl(spa);
+ if (error == EBUSY)
+ error = 0;
+ ASSERT0(error);
+ }
+
+ run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *),
+ UMEM_NOFAIL);
+
+ if (ztest_opts.zo_verbose >= 4)
+ (void) printf("starting main threads...\n");
+
+ /*
+ * Replay all logs of all datasets in the pool. This is primarily for
+ * temporary datasets, which would not otherwise get replayed; leaving
+ * their logs unreplayed can trigger failures when attempting to
+ * offline a SLOG in ztest_fault_inject().
+ */
+ (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb,
+ NULL, DS_FIND_CHILDREN);
+
+ /*
+ * Kick off all the tests that run in parallel.
+ */
+ for (t = 0; t < ztest_opts.zo_threads; t++) {
+ if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) {
+ umem_free(run_threads, ztest_opts.zo_threads *
+ sizeof (kthread_t *));
+ return;
+ }
+
+ run_threads[t] = thread_create(NULL, 0, ztest_thread,
+ (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE,
+ defclsyspri);
+ }
+
+ /*
+ * Wait for all of the tests to complete.
+ */
+ for (t = 0; t < ztest_opts.zo_threads; t++)
+ VERIFY0(thread_join(run_threads[t]));
+
+ /*
+ * Close all datasets. This must be done after all the threads
+ * are joined so we can be sure none of the datasets are in-use
+ * by any of the threads.
+ */
+ for (t = 0; t < ztest_opts.zo_threads; t++) {
+ if (t < ztest_opts.zo_datasets)
+ ztest_dataset_close(t);
+ }
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+
+ umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *));
+
+ /* Kill the resume and deadman threads */
+ ztest_exiting = B_TRUE;
+ VERIFY0(thread_join(resume_thread));
+ VERIFY0(thread_join(deadman_thread));
+ ztest_resume(spa);
+
+ /*
+ * Right before closing the pool, kick off a bunch of async I/O;
+ * spa_close() should wait for it to complete.
+ */
+ for (object = 1; object < 50; object++) {
+ dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+
+ /* Verify that at least one commit cb was called in a timely fashion */
+ if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG)
+ VERIFY0(zc_min_txg_delay);
+
+ spa_close(spa, FTAG);
+
+ /*
+ * Verify that we can loop over all pools.
+ */
+ mutex_enter(&spa_namespace_lock);
+ for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
+ if (ztest_opts.zo_verbose > 3)
+ (void) printf("spa_next: found %s\n", spa_name(spa));
+ mutex_exit(&spa_namespace_lock);
+
+ /*
+ * Verify that we can export the pool and reimport it under a
+ * different name.
+ */
+ if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) {
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ (void) snprintf(name, sizeof (name), "%s_import",
+ ztest_opts.zo_pool);
+ ztest_spa_import_export(ztest_opts.zo_pool, name);
+ ztest_spa_import_export(name, ztest_opts.zo_pool);
+ }
+
+ kernel_fini();
+
+ list_destroy(&zcl.zcl_callbacks);
+ mutex_destroy(&zcl.zcl_callbacks_lock);
+ (void) pthread_rwlock_destroy(&ztest_name_lock);
+ mutex_destroy(&ztest_vdev_lock);
+ mutex_destroy(&ztest_checkpoint_lock);
+}
+
+static void
+print_time(hrtime_t t, char *timebuf)
+{
+ hrtime_t s = t / NANOSEC;
+ hrtime_t m = s / 60;
+ hrtime_t h = m / 60;
+ hrtime_t d = h / 24;
+
+ s -= m * 60;
+ m -= h * 60;
+ h -= d * 24;
+
+ timebuf[0] = '\0';
+
+ if (d)
+ (void) sprintf(timebuf,
+ "%llud%02lluh%02llum%02llus", d, h, m, s);
+ else if (h)
+ (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s);
+ else if (m)
+ (void) sprintf(timebuf, "%llum%02llus", m, s);
+ else
+ (void) sprintf(timebuf, "%llus", s);
+}
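
A quick worked example of the format print_time() produces (the buffer size and duration below are illustrative only): 93,784 seconds is one day, two hours, three minutes and four seconds, so the buffer ends up holding "1d02h03m04s".

	char timebuf[32];

	/* 93784 s = 1 day + 2 h + 3 min + 4 s */
	print_time((hrtime_t)93784 * NANOSEC, timebuf);
	/* timebuf now contains "1d02h03m04s" */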
+
+static nvlist_t *
+make_random_props(void)
+{
+ nvlist_t *props;
+
+ VERIFY0(nvlist_alloc(&props, NV_UNIQUE_NAME, 0));
+
+ if (ztest_random(2) == 0)
+ return (props);
+
+ VERIFY0(nvlist_add_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1));
+
+ return (props);
+}
+
+/*
+ * Create a storage pool with the given name and initial vdev size.
+ * Then test spa_freeze() functionality.
+ */
+static void
+ztest_init(ztest_shared_t *zs)
+{
+ spa_t *spa;
+ nvlist_t *nvroot, *props;
+ int i;
+
+ mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL);
+ VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL));
+
+ kernel_init(SPA_MODE_READ | SPA_MODE_WRITE);
+
+ /*
+ * Create the storage pool.
+ */
+ (void) spa_destroy(ztest_opts.zo_pool);
+ ztest_shared->zs_vdev_next_leaf = 0;
+ zs->zs_splits = 0;
+ zs->zs_mirrors = ztest_opts.zo_mirrors;
+ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
+ NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
+ props = make_random_props();
+
+ /*
+ * We don't expect the pool to suspend unless maxfaults == 0,
+ * in which case ztest_fault_inject() temporarily takes away
+ * the only valid replica.
+ */
+ VERIFY0(nvlist_add_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
+ MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT));
+
+ for (i = 0; i < SPA_FEATURES; i++) {
+ char *buf;
+
+ /*
+ * 75% chance of using the log space map feature. We want ztest
+ * to exercise both the code paths that use the log space map
+ * feature and the ones that don't.
+ */
+ if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0)
+ continue;
+
+ VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
+ spa_feature_table[i].fi_uname));
+ VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
+ free(buf);
+ }
+
+ VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL));
+ nvlist_free(nvroot);
+ nvlist_free(props);
+
+ VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
+ zs->zs_metaslab_sz =
+ 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
+ spa_close(spa, FTAG);
+
+ kernel_fini();
+
+ if (!ztest_opts.zo_mmp_test) {
+ ztest_run_zdb(ztest_opts.zo_pool);
+ ztest_freeze();
+ ztest_run_zdb(ztest_opts.zo_pool);
+ }
+
+ (void) pthread_rwlock_destroy(&ztest_name_lock);
+ mutex_destroy(&ztest_vdev_lock);
+ mutex_destroy(&ztest_checkpoint_lock);
+}
+
+static void
+setup_data_fd(void)
+{
+ static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
+
+ ztest_fd_data = mkstemp(ztest_name_data);
+ ASSERT3S(ztest_fd_data, >=, 0);
+ (void) unlink(ztest_name_data);
+}
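
setup_data_fd() uses the usual anonymous-temp-file idiom: mkstemp() creates and opens a unique file, and the immediate unlink() removes its name while the data stays reachable through ztest_fd_data, which is later handed to child processes via the ZTEST_FD_DATA environment variable (see exec_child()). A minimal sketch of the same idiom, with an example template name:

	char template[] = "/tmp/example.XXXXXX";
	int fd = mkstemp(template);	/* create and open a unique file */

	(void) unlink(template);	/* drop the name; the data persists
					 * for as long as fd stays open */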
+
+static int
+shared_data_size(ztest_shared_hdr_t *hdr)
+{
+ int size;
+
+ size = hdr->zh_hdr_size;
+ size += hdr->zh_opts_size;
+ size += hdr->zh_size;
+ size += hdr->zh_stats_size * hdr->zh_stats_count;
+ size += hdr->zh_ds_size * hdr->zh_ds_count;
+
+ return (size);
+}
+
+static void
+setup_hdr(void)
+{
+ int size;
+ ztest_shared_hdr_t *hdr;
+
+ hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
+ PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
+ ASSERT(hdr != MAP_FAILED);
+
+ VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));
+
+ hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
+ hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
+ hdr->zh_size = sizeof (ztest_shared_t);
+ hdr->zh_stats_size = sizeof (ztest_shared_callstate_t);
+ hdr->zh_stats_count = ZTEST_FUNCS;
+ hdr->zh_ds_size = sizeof (ztest_shared_ds_t);
+ hdr->zh_ds_count = ztest_opts.zo_datasets;
+
+ size = shared_data_size(hdr);
+ VERIFY3U(0, ==, ftruncate(ztest_fd_data, size));
+
+ (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
+}
+
+static void
+setup_data(void)
+{
+ int size, offset;
+ ztest_shared_hdr_t *hdr;
+ uint8_t *buf;
+
+ hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
+ PROT_READ, MAP_SHARED, ztest_fd_data, 0);
+ ASSERT(hdr != MAP_FAILED);
+
+ size = shared_data_size(hdr);
+
+ (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
+ hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
+ PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
+ ASSERT(hdr != MAP_FAILED);
+ buf = (uint8_t *)hdr;
+
+ offset = hdr->zh_hdr_size;
+ ztest_shared_opts = (void *)&buf[offset];
+ offset += hdr->zh_opts_size;
+ ztest_shared = (void *)&buf[offset];
+ offset += hdr->zh_size;
+ ztest_shared_callstate = (void *)&buf[offset];
+ offset += hdr->zh_stats_size * hdr->zh_stats_count;
+ ztest_shared_ds = (void *)&buf[offset];
+}
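
setup_hdr() and setup_data() give the parent and its children a single shared, mmap()ed view of the test state; a child only needs the inherited ZTEST_FD_DATA descriptor to rebuild every pointer from the sizes recorded in the header. A sketch of the resulting layout (region order as derived above; the sizes vary with the options):

	/*
	 * offset 0 of the shared data file:
	 *
	 *   ztest_shared_hdr_t
	 *   ztest_shared_opts_t
	 *   ztest_shared_t
	 *   ztest_shared_callstate_t x zh_stats_count (ZTEST_FUNCS)
	 *   ztest_shared_ds_t        x zh_ds_count (zo_datasets)
	 */

	/*
	 * With ztest_shared_callstate pointing at the start of the
	 * call-state region, the slot for test function f is simply
	 * indexed; the ZTEST_GET_SHARED_CALLSTATE() macro used in main()
	 * presumably reduces to this.
	 */
	ztest_shared_callstate_t *zc = &ztest_shared_callstate[f];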
+
+static boolean_t
+exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp)
+{
+ pid_t pid;
+ int status;
+ char *cmdbuf = NULL;
+
+ pid = fork();
+
+ if (cmd == NULL) {
+ cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+ (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
+ cmd = cmdbuf;
+ }
+
+ if (pid == -1)
+ fatal(1, "fork failed");
+
+ if (pid == 0) { /* child */
+ char *emptyargv[2] = { cmd, NULL };
+ char fd_data_str[12];
+
+ struct rlimit rl = { 1024, 1024 };
+ (void) setrlimit(RLIMIT_NOFILE, &rl);
+
+ (void) close(ztest_fd_rand);
+ VERIFY(11 >= snprintf(fd_data_str, 12, "%d", ztest_fd_data));
+ VERIFY(0 == setenv("ZTEST_FD_DATA", fd_data_str, 1));
+
+ (void) enable_extended_FILE_stdio(-1, -1);
+ if (libpath != NULL)
+ VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
+ (void) execv(cmd, emptyargv);
+ ztest_dump_core = B_FALSE;
+ fatal(B_TRUE, "exec failed: %s", cmd);
+ }
+
+ if (cmdbuf != NULL) {
+ umem_free(cmdbuf, MAXPATHLEN);
+ cmd = NULL;
+ }
+
+ while (waitpid(pid, &status, 0) != pid)
+ continue;
+ if (statusp != NULL)
+ *statusp = status;
+
+ if (WIFEXITED(status)) {
+ if (WEXITSTATUS(status) != 0) {
+ (void) fprintf(stderr, "child exited with code %d\n",
+ WEXITSTATUS(status));
+ exit(2);
+ }
+ return (B_FALSE);
+ } else if (WIFSIGNALED(status)) {
+ if (!ignorekill || WTERMSIG(status) != SIGKILL) {
+ (void) fprintf(stderr, "child died with signal %d\n",
+ WTERMSIG(status));
+ exit(3);
+ }
+ return (B_TRUE);
+ } else {
+ (void) fprintf(stderr, "something strange happened to child\n");
+ exit(4);
+ /* NOTREACHED */
+ }
+}
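
exec_child()'s return value encodes how the child ended: B_FALSE for a clean zero exit, B_TRUE only when the child died from SIGKILL and ignorekill was set; any other outcome terminates the parent. A minimal caller, mirroring how main() drives the test passes below:

	int status;
	boolean_t killed;

	/* re-exec ourselves (cmd == NULL) and tolerate a SIGKILLed child */
	killed = exec_child(NULL, NULL, B_TRUE, &status);
	if (killed)
		kills++;	/* the pass was deliberately cut short */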
+
+static void
+ztest_run_init(void)
+{
+ int i;
+
+ ztest_shared_t *zs = ztest_shared;
+
+ /*
+ * Blow away any existing copy of zpool.cache
+ */
+ (void) remove(spa_config_path);
+
+ if (ztest_opts.zo_init == 0) {
+ if (ztest_opts.zo_verbose >= 1)
+ (void) printf("Importing pool %s\n",
+ ztest_opts.zo_pool);
+ ztest_import(zs);
+ return;
+ }
+
+ /*
+ * Create and initialize our storage pool.
+ */
+ for (i = 1; i <= ztest_opts.zo_init; i++) {
+ bzero(zs, sizeof (ztest_shared_t));
+ if (ztest_opts.zo_verbose >= 3 &&
+ ztest_opts.zo_init != 1) {
+ (void) printf("ztest_init(), pass %d\n", i);
+ }
+ ztest_init(zs);
+ }
+}
+
+int
+main(int argc, char **argv)
+{
+ int kills = 0;
+ int iters = 0;
+ int older = 0;
+ int newer = 0;
+ ztest_shared_t *zs;
+ ztest_info_t *zi;
+ ztest_shared_callstate_t *zc;
+ char timebuf[100];
+ char numbuf[NN_NUMBUF_SZ];
+ char *cmd;
+ boolean_t hasalt;
+ int f;
+ char *fd_data_str = getenv("ZTEST_FD_DATA");
+ struct sigaction action;
+
+ (void) setvbuf(stdout, NULL, _IOLBF, 0);
+
+ dprintf_setup(&argc, argv);
+ zfs_deadman_synctime_ms = 300000;
+ zfs_deadman_checktime_ms = 30000;
+ /*
+ * As two-word space map entries may not come up often (especially
+ * if pool and vdev sizes are small), we want to force at least some
+ * of them so the feature gets tested.
+ */
+ zfs_force_some_double_word_sm_entries = B_TRUE;
+
+ /*
+ * Verify that even extensively damaged split blocks with many
+ * segments can be reconstructed in a reasonable amount of time
+ * when reconstruction is known to be possible.
+ *
+ * Note: the lower this value is, the more damage we inflict, and
+ * the more time ztest spends in recovering that damage. We chose
+ * to induce damage 1/100th of the time so recovery is tested but
+ * not so frequently that ztest doesn't get to test other code paths.
+ */
+ zfs_reconstruct_indirect_damage_fraction = 100;
+
+ action.sa_handler = sig_handler;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+
+ if (sigaction(SIGSEGV, &action, NULL) < 0) {
+ (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ if (sigaction(SIGABRT, &action, NULL) < 0) {
+ (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * Force random_get_bytes() to use /dev/urandom in order to prevent
+ * ztest from needlessly depleting the system entropy pool.
+ */
+ random_path = "/dev/urandom";
+ ztest_fd_rand = open(random_path, O_RDONLY);
+ ASSERT3S(ztest_fd_rand, >=, 0);
+
+ if (!fd_data_str) {
+ process_options(argc, argv);
+
+ setup_data_fd();
+ setup_hdr();
+ setup_data();
+ bcopy(&ztest_opts, ztest_shared_opts,
+ sizeof (*ztest_shared_opts));
+ } else {
+ ztest_fd_data = atoi(fd_data_str);
+ setup_data();
+ bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
+ }
+ ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);
+
+ /* Override location of zpool.cache */
+ VERIFY(asprintf((char **)&spa_config_path, "%s/zpool.cache",
+ ztest_opts.zo_dir) != -1);
+
+ ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
+ UMEM_NOFAIL);
+ zs = ztest_shared;
+
+ if (fd_data_str) {
+ metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging;
+ metaslab_df_alloc_threshold =
+ zs->zs_metaslab_df_alloc_threshold;
+
+ if (zs->zs_do_init)
+ ztest_run_init();
+ else
+ ztest_run(zs);
+ exit(0);
+ }
+
+ hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);
+
+ if (ztest_opts.zo_verbose >= 1) {
+ (void) printf("%llu vdevs, %d datasets, %d threads,"
+ " %llu seconds...\n",
+ (u_longlong_t)ztest_opts.zo_vdevs,
+ ztest_opts.zo_datasets,
+ ztest_opts.zo_threads,
+ (u_longlong_t)ztest_opts.zo_time);
+ }
+
+ cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+ (void) strlcpy(cmd, getexecname(), MAXNAMELEN);
+
+ zs->zs_do_init = B_TRUE;
+ if (strlen(ztest_opts.zo_alt_ztest) != 0) {
+ if (ztest_opts.zo_verbose >= 1) {
+ (void) printf("Executing older ztest for "
+ "initialization: %s\n", ztest_opts.zo_alt_ztest);
+ }
+ VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
+ ztest_opts.zo_alt_libpath, B_FALSE, NULL));
+ } else {
+ VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
+ }
+ zs->zs_do_init = B_FALSE;
+
+ zs->zs_proc_start = gethrtime();
+ zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;
+
+ for (f = 0; f < ZTEST_FUNCS; f++) {
+ zi = &ztest_info[f];
+ zc = ZTEST_GET_SHARED_CALLSTATE(f);
+ if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
+ zc->zc_next = UINT64_MAX;
+ else
+ zc->zc_next = zs->zs_proc_start +
+ ztest_random(2 * zi->zi_interval[0] + 1);
+ }
+
+ /*
+ * Run the tests in a loop. These tests include fault injection
+ * to verify that self-healing data works, and forced crashes
+ * to verify that we never lose on-disk consistency.
+ */
+ while (gethrtime() < zs->zs_proc_stop) {
+ int status;
+ boolean_t killed;
+
+ /*
+ * Initialize the workload counters for each function.
+ */
+ for (f = 0; f < ZTEST_FUNCS; f++) {
+ zc = ZTEST_GET_SHARED_CALLSTATE(f);
+ zc->zc_count = 0;
+ zc->zc_time = 0;
+ }
+
+ /* Set the allocation switch size */
+ zs->zs_metaslab_df_alloc_threshold =
+ ztest_random(zs->zs_metaslab_sz / 4) + 1;
+
+ if (!hasalt || ztest_random(2) == 0) {
+ if (hasalt && ztest_opts.zo_verbose >= 1) {
+ (void) printf("Executing newer ztest: %s\n",
+ cmd);
+ }
+ newer++;
+ killed = exec_child(cmd, NULL, B_TRUE, &status);
+ } else {
+ if (hasalt && ztest_opts.zo_verbose >= 1) {
+ (void) printf("Executing older ztest: %s\n",
+ ztest_opts.zo_alt_ztest);
+ }
+ older++;
+ killed = exec_child(ztest_opts.zo_alt_ztest,
+ ztest_opts.zo_alt_libpath, B_TRUE, &status);
+ }
+
+ if (killed)
+ kills++;
+ iters++;
+
+ if (ztest_opts.zo_verbose >= 1) {
+ hrtime_t now = gethrtime();
+
+ now = MIN(now, zs->zs_proc_stop);
+ print_time(zs->zs_proc_stop - now, timebuf);
+ nicenum(zs->zs_space, numbuf, sizeof (numbuf));
+
+ (void) printf("Pass %3d, %8s, %3llu ENOSPC, "
+ "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
+ iters,
+ WIFEXITED(status) ? "Complete" : "SIGKILL",
+ (u_longlong_t)zs->zs_enospc_count,
+ 100.0 * zs->zs_alloc / zs->zs_space,
+ numbuf,
+ 100.0 * (now - zs->zs_proc_start) /
+ (ztest_opts.zo_time * NANOSEC), timebuf);
+ }
+
+ if (ztest_opts.zo_verbose >= 2) {
+ (void) printf("\nWorkload summary:\n\n");
+ (void) printf("%7s %9s %s\n",
+ "Calls", "Time", "Function");
+ (void) printf("%7s %9s %s\n",
+ "-----", "----", "--------");
+ for (f = 0; f < ZTEST_FUNCS; f++) {
+ zi = &ztest_info[f];
+ zc = ZTEST_GET_SHARED_CALLSTATE(f);
+ print_time(zc->zc_time, timebuf);
+ (void) printf("%7llu %9s %s\n",
+ (u_longlong_t)zc->zc_count, timebuf,
+ zi->zi_funcname);
+ }
+ (void) printf("\n");
+ }
+
+ if (!ztest_opts.zo_mmp_test)
+ ztest_run_zdb(ztest_opts.zo_pool);
+ }
+
+ if (ztest_opts.zo_verbose >= 1) {
+ if (hasalt) {
+ (void) printf("%d runs of older ztest: %s\n", older,
+ ztest_opts.zo_alt_ztest);
+ (void) printf("%d runs of newer ztest: %s\n", newer,
+ cmd);
+ }
+ (void) printf("%d killed, %d completed, %.0f%% kill rate\n",
+ kills, iters - kills, (100.0 * kills) / MAX(1, iters));
+ }
+
+ umem_free(cmd, MAXNAMELEN);
+
+ return (0);
+}
diff --git a/sys/contrib/openzfs/cmd/zvol_id/.gitignore b/sys/contrib/openzfs/cmd/zvol_id/.gitignore
new file mode 100644
index 000000000000..8b757a2d6781
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zvol_id/.gitignore
@@ -0,0 +1 @@
+zvol_id
diff --git a/sys/contrib/openzfs/cmd/zvol_id/Makefile.am b/sys/contrib/openzfs/cmd/zvol_id/Makefile.am
new file mode 100644
index 000000000000..a584875081eb
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zvol_id/Makefile.am
@@ -0,0 +1,10 @@
+include $(top_srcdir)/config/Rules.am
+
+# Disable GCC stack protection for zvol_id. This is a kludge and should be
+# removed once https://github.com/zfsonlinux/zfs/issues/569 is resolved.
+AM_CFLAGS += -fno-stack-protector
+
+udev_PROGRAMS = zvol_id
+
+zvol_id_SOURCES = \
+ zvol_id_main.c
diff --git a/sys/contrib/openzfs/cmd/zvol_id/zvol_id_main.c b/sys/contrib/openzfs/cmd/zvol_id/zvol_id_main.c
new file mode 100644
index 000000000000..4a2d74cc203c
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zvol_id/zvol_id_main.c
@@ -0,0 +1,110 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Fajar A. Nugraha. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <ctype.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <linux/ioctl.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
+
+static int
+ioctl_get_msg(char *var, int fd)
+{
+ int error = 0;
+ char msg[ZFS_MAX_DATASET_NAME_LEN];
+
+ error = ioctl(fd, BLKZNAME, msg);
+ if (error < 0) {
+ return (error);
+ }
+
+ snprintf(var, ZFS_MAX_DATASET_NAME_LEN, "%s", msg);
+ return (error);
+}
+
+int
+main(int argc, char **argv)
+{
+ int fd, error = 0;
+ char zvol_name[ZFS_MAX_DATASET_NAME_LEN];
+ char *zvol_name_part = NULL;
+ char *dev_name;
+ struct stat64 statbuf;
+ int dev_minor, dev_part;
+ int i;
+ int rc;
+
+ if (argc < 2) {
+ printf("Usage: %s /dev/zvol_device_node\n", argv[0]);
+ return (EINVAL);
+ }
+
+ dev_name = argv[1];
+ error = stat64(dev_name, &statbuf);
+ if (error != 0) {
+ printf("Unable to access device file: %s\n", dev_name);
+ return (errno);
+ }
+
+ dev_minor = minor(statbuf.st_rdev);
+ dev_part = dev_minor % ZVOL_MINORS;
+
+ fd = open(dev_name, O_RDONLY);
+ if (fd < 0) {
+ printf("Unable to open device file: %s\n", dev_name);
+ return (errno);
+ }
+
+ error = ioctl_get_msg(zvol_name, fd);
+ if (error < 0) {
+ printf("ioctl_get_msg failed: %s\n", strerror(errno));
+ return (errno);
+ }
+ if (dev_part > 0)
+ rc = asprintf(&zvol_name_part, "%s-part%d", zvol_name,
+ dev_part);
+ else
+ rc = asprintf(&zvol_name_part, "%s", zvol_name);
+
+ if (rc == -1 || zvol_name_part == NULL)
+ goto error;
+
+ for (i = 0; i < strlen(zvol_name_part); i++) {
+ if (isblank(zvol_name_part[i]))
+ zvol_name_part[i] = '+';
+ }
+
+ printf("%s\n", zvol_name_part);
+ free(zvol_name_part);
+error:
+ close(fd);
+ return (error);
+}
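
The printed name follows directly from the minor-number arithmetic above: minors are grouped per zvol, the group leader is the whole volume, and the remainder within the group is the partition index. A worked example, assuming ZVOL_MINORS is 16 and a zvol named "tank/vm swap" (both values are illustrative assumptions only):

	int dev_part = 17 % ZVOL_MINORS;	/* 1 when ZVOL_MINORS == 16 */

	/*
	 * /dev/zd16: minor 16, dev_part = 0  ->  prints "tank/vm+swap"
	 * /dev/zd17: minor 17, dev_part = 1  ->  prints "tank/vm+swap-part1"
	 *
	 * Blanks are rewritten to '+' so the result can be used as a single
	 * path component, e.g. for a udev-created symlink under /dev/zvol.
	 */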
diff --git a/sys/contrib/openzfs/cmd/zvol_wait/Makefile.am b/sys/contrib/openzfs/cmd/zvol_wait/Makefile.am
new file mode 100644
index 000000000000..564031c9799d
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zvol_wait/Makefile.am
@@ -0,0 +1 @@
+dist_bin_SCRIPTS = zvol_wait
diff --git a/sys/contrib/openzfs/cmd/zvol_wait/zvol_wait b/sys/contrib/openzfs/cmd/zvol_wait/zvol_wait
new file mode 100755
index 000000000000..9a3948da5564
--- /dev/null
+++ b/sys/contrib/openzfs/cmd/zvol_wait/zvol_wait
@@ -0,0 +1,116 @@
+#!/bin/sh
+
+count_zvols() {
+ if [ -z "$zvols" ]; then
+ echo 0
+ else
+ echo "$zvols" | wc -l
+ fi
+}
+
+filter_out_zvols_with_links() {
+ while read -r zvol; do
+ if [ ! -L "/dev/zvol/$zvol" ]; then
+ echo "$zvol"
+ fi
+ done
+}
+
+filter_out_deleted_zvols() {
+ while read -r zvol; do
+ if zfs list "$zvol" >/dev/null 2>&1; then
+ echo "$zvol"
+ fi
+ done
+}
+
+list_zvols() {
+ zfs list -t volume -H -o \
+ name,volmode,receive_resume_token,redact_snaps |
+ while read -r zvol_line; do
+ name=$(echo "$zvol_line" | awk '{print $1}')
+ volmode=$(echo "$zvol_line" | awk '{print $2}')
+ token=$(echo "$zvol_line" | awk '{print $3}')
+ redacted=$(echo "$zvol_line" | awk '{print $4}')
+ #
+ # /dev links are not created for zvols with volmode = "none"
+ # or for redacted zvols.
+ #
+ [ "$volmode" = "none" ] && continue
+ [ "$redacted" = "-" ] || continue
+ #
+ # We also ignore partially received zvols if it is
+ # not an incremental receive, as those won't even have a block
+ # device minor node created yet.
+ #
+ if [ "$token" != "-" ]; then
+ #
+ # Incremental receives create an invisible clone that
+ # is not automatically displayed by zfs list.
+ #
+ if ! zfs list "$name/%recv" >/dev/null 2>&1; then
+ continue
+ fi
+ fi
+ echo "$name"
+ done
+}
+
+zvols=$(list_zvols)
+zvols_count=$(count_zvols)
+if [ "$zvols_count" -eq 0 ]; then
+ echo "No zvols found, nothing to do."
+ exit 0
+fi
+
+echo "Testing $zvols_count zvol links"
+
+outer_loop=0
+while [ "$outer_loop" -lt 20 ]; do
+ outer_loop=$((outer_loop + 1))
+
+ old_zvols_count=$(count_zvols)
+
+ inner_loop=0
+ while [ "$inner_loop" -lt 30 ]; do
+ inner_loop=$((inner_loop + 1))
+
+ zvols="$(echo "$zvols" | filter_out_zvols_with_links)"
+
+ zvols_count=$(count_zvols)
+ if [ "$zvols_count" -eq 0 ]; then
+ echo "All zvol links are now present."
+ exit 0
+ fi
+ sleep 1
+ done
+
+ echo "Still waiting on $zvols_count zvol links ..."
+ #
+ # Although zvols should normally not be deleted at boot time,
+ # if one has been deleted its link will never appear and we
+ # would stall waiting for it.
+ #
+ if [ "$old_zvols_count" -eq "$zvols_count" ]; then
+ echo "No progress since last loop."
+ echo "Checking if any zvols were deleted."
+
+ zvols=$(echo "$zvols" | filter_out_deleted_zvols)
+ zvols_count=$(count_zvols)
+
+ if [ "$old_zvols_count" -ne "$zvols_count" ]; then
+ echo "$((old_zvols_count - zvols_count)) zvol(s) deleted."
+ fi
+
+ if [ "$zvols_count" -ne 0 ]; then
+ echo "Remaining zvols:"
+ echo "$zvols"
+ else
+ echo "All zvol links are now present."
+ exit 0
+ fi
+ fi
+done
+
+echo "Timed out waiting on zvol links"
+exit 1