aboutsummaryrefslogtreecommitdiff
path: root/cmd
diff options
context:
space:
mode:
authorMatt Macy <mmacy@FreeBSD.org>2020-08-24 22:48:19 +0000
committerMatt Macy <mmacy@FreeBSD.org>2020-08-24 22:48:19 +0000
commit3b0ce0e28db46d0403929aba45c682285e1ac217 (patch)
tree91721e6e5518bd0d8113dee535898f2225443411 /cmd
downloadsrc-3b0ce0e28db46d0403929aba45c682285e1ac217.tar.gz
src-3b0ce0e28db46d0403929aba45c682285e1ac217.zip
Vendor import of openzfs master @ 184df27eef0abdc7ab2105b21257f753834b936bvendor/openzfs/2.0-rc0-g184df27
Sponsored by: iX Systems, Inc.
Notes
Notes: svn path=/vendor-sys/openzfs/dist/; revision=364736 svn path=/vendor-sys/openzfs/2.0-rc0-g184df27/; revision=364741; tag=vendor/openzfs/2.0-rc0-g184df27
Diffstat (limited to 'cmd')
-rw-r--r--cmd/Makefile.am10
-rw-r--r--cmd/arc_summary/.gitignore1
-rw-r--r--cmd/arc_summary/Makefile.am13
-rwxr-xr-xcmd/arc_summary/arc_summary21093
-rwxr-xr-xcmd/arc_summary/arc_summary3943
-rw-r--r--cmd/arcstat/.gitignore1
-rw-r--r--cmd/arcstat/Makefile.am5
-rwxr-xr-xcmd/arcstat/arcstat.in494
-rw-r--r--cmd/dbufstat/.gitignore1
-rw-r--r--cmd/dbufstat/Makefile.am5
-rwxr-xr-xcmd/dbufstat/dbufstat.in669
-rw-r--r--cmd/fsck_zfs/Makefile.am1
-rwxr-xr-xcmd/fsck_zfs/fsck.zfs9
-rw-r--r--cmd/mount_zfs/.gitignore1
-rw-r--r--cmd/mount_zfs/Makefile.am20
-rw-r--r--cmd/mount_zfs/mount_zfs.c408
-rw-r--r--cmd/raidz_test/.gitignore1
-rw-r--r--cmd/raidz_test/Makefile.am20
-rw-r--r--cmd/raidz_test/raidz_bench.c227
-rw-r--r--cmd/raidz_test/raidz_test.c782
-rw-r--r--cmd/raidz_test/raidz_test.h116
-rw-r--r--cmd/vdev_id/Makefile.am1
-rwxr-xr-xcmd/vdev_id/vdev_id605
-rw-r--r--cmd/zdb/.gitignore1
-rw-r--r--cmd/zdb/Makefile.am16
-rw-r--r--cmd/zdb/zdb.c8606
-rw-r--r--cmd/zdb/zdb.h33
-rw-r--r--cmd/zdb/zdb_il.c431
-rw-r--r--cmd/zed/.gitignore1
-rw-r--r--cmd/zed/Makefile.am49
-rw-r--r--cmd/zed/agents/README.md112
-rw-r--r--cmd/zed/agents/fmd_api.c760
-rw-r--r--cmd/zed/agents/fmd_api.h246
-rw-r--r--cmd/zed/agents/fmd_serd.c316
-rw-r--r--cmd/zed/agents/fmd_serd.h86
-rw-r--r--cmd/zed/agents/zfs_agents.c422
-rw-r--r--cmd/zed/agents/zfs_agents.h46
-rw-r--r--cmd/zed/agents/zfs_diagnosis.c981
-rw-r--r--cmd/zed/agents/zfs_mod.c956
-rw-r--r--cmd/zed/agents/zfs_retire.c557
-rw-r--r--cmd/zed/zed.c306
-rw-r--r--cmd/zed/zed.d/.gitignore1
-rw-r--r--cmd/zed/zed.d/Makefile.am53
-rw-r--r--cmd/zed/zed.d/README30
-rwxr-xr-xcmd/zed/zed.d/all-debug.sh26
-rwxr-xr-xcmd/zed/zed.d/all-syslog.sh14
-rwxr-xr-xcmd/zed/zed.d/data-notify.sh43
-rwxr-xr-xcmd/zed/zed.d/generic-notify.sh54
-rwxr-xr-xcmd/zed/zed.d/history_event-zfs-list-cacher.sh.in85
l---------cmd/zed/zed.d/pool_import-led.sh1
l---------cmd/zed/zed.d/resilver_finish-notify.sh1
-rwxr-xr-xcmd/zed/zed.d/resilver_finish-start-scrub.sh19
-rwxr-xr-xcmd/zed/zed.d/scrub_finish-notify.sh59
-rwxr-xr-xcmd/zed/zed.d/statechange-led.sh177
-rwxr-xr-xcmd/zed/zed.d/statechange-notify.sh74
-rwxr-xr-xcmd/zed/zed.d/trim_finish-notify.sh37
l---------cmd/zed/zed.d/vdev_attach-led.sh1
l---------cmd/zed/zed.d/vdev_clear-led.sh1
-rwxr-xr-xcmd/zed/zed.d/zed-functions.sh538
-rw-r--r--cmd/zed/zed.d/zed.rc122
-rw-r--r--cmd/zed/zed.h58
-rw-r--r--cmd/zed/zed_conf.c735
-rw-r--r--cmd/zed/zed_conf.h62
-rw-r--r--cmd/zed/zed_disk_event.c416
-rw-r--r--cmd/zed/zed_disk_event.h31
-rw-r--r--cmd/zed/zed_event.c965
-rw-r--r--cmd/zed/zed_event.h29
-rw-r--r--cmd/zed/zed_exec.c232
-rw-r--r--cmd/zed/zed_exec.h25
-rw-r--r--cmd/zed/zed_file.c217
-rw-r--r--cmd/zed/zed_file.h35
-rw-r--r--cmd/zed/zed_log.c256
-rw-r--r--cmd/zed/zed_log.h44
-rw-r--r--cmd/zed/zed_strings.c247
-rw-r--r--cmd/zed/zed_strings.h27
-rw-r--r--cmd/zfs/.gitignore1
-rw-r--r--cmd/zfs/Makefile.am23
-rw-r--r--cmd/zfs/zfs_iter.c512
-rw-r--r--cmd/zfs/zfs_iter.h61
-rw-r--r--cmd/zfs/zfs_main.c8620
-rw-r--r--cmd/zfs/zfs_project.c295
-rw-r--r--cmd/zfs/zfs_projectutil.h49
-rw-r--r--cmd/zfs/zfs_util.h42
-rw-r--r--cmd/zfs_ids_to_path/.gitignore1
-rw-r--r--cmd/zfs_ids_to_path/Makefile.am9
-rw-r--r--cmd/zfs_ids_to_path/zfs_ids_to_path.c96
-rw-r--r--cmd/zgenhostid/Makefile.am1
-rwxr-xr-xcmd/zgenhostid/zgenhostid61
-rw-r--r--cmd/zhack/.gitignore1
-rw-r--r--cmd/zhack/Makefile.am14
-rw-r--r--cmd/zhack/zhack.c532
-rw-r--r--cmd/zinject/.gitignore1
-rw-r--r--cmd/zinject/Makefile.am13
-rw-r--r--cmd/zinject/translate.c397
-rw-r--r--cmd/zinject/zinject.c1287
-rw-r--r--cmd/zinject/zinject.h70
-rw-r--r--cmd/zpool/.gitignore1
-rw-r--r--cmd/zpool/Makefile.am136
-rw-r--r--cmd/zpool/os/freebsd/zpool_vdev_os.c103
-rw-r--r--cmd/zpool/os/linux/zpool_vdev_os.c410
-rw-r--r--cmd/zpool/zpool.d/README9
l---------cmd/zpool/zpool.d/ata_err1
l---------cmd/zpool/zpool.d/cmd_to1
l---------cmd/zpool/zpool.d/defect1
-rwxr-xr-xcmd/zpool/zpool.d/dm-deps29
l---------cmd/zpool/zpool.d/enc1
l---------cmd/zpool/zpool.d/encdev1
l---------cmd/zpool/zpool.d/fault_led1
l---------cmd/zpool/zpool.d/health1
l---------cmd/zpool/zpool.d/hours_on1
-rwxr-xr-xcmd/zpool/zpool.d/iostat77
l---------cmd/zpool/zpool.d/iostat-10s1
l---------cmd/zpool/zpool.d/iostat-1s1
l---------cmd/zpool/zpool.d/label1
l---------cmd/zpool/zpool.d/locate_led1
-rwxr-xr-xcmd/zpool/zpool.d/lsblk83
-rwxr-xr-xcmd/zpool/zpool.d/media27
l---------cmd/zpool/zpool.d/model1
l---------cmd/zpool/zpool.d/nonmed1
l---------cmd/zpool/zpool.d/nvme_err1
l---------cmd/zpool/zpool.d/off_ucor1
l---------cmd/zpool/zpool.d/pend_sec1
l---------cmd/zpool/zpool.d/pwr_cyc1
l---------cmd/zpool/zpool.d/r_proc1
l---------cmd/zpool/zpool.d/r_ucor1
l---------cmd/zpool/zpool.d/realloc1
l---------cmd/zpool/zpool.d/rep_ucor1
l---------cmd/zpool/zpool.d/serial1
-rwxr-xr-xcmd/zpool/zpool.d/ses52
l---------cmd/zpool/zpool.d/size1
l---------cmd/zpool/zpool.d/slot1
-rwxr-xr-xcmd/zpool/zpool.d/smart243
l---------cmd/zpool/zpool.d/smart_test1
l---------cmd/zpool/zpool.d/smartx1
l---------cmd/zpool/zpool.d/temp1
l---------cmd/zpool/zpool.d/test_ended1
l---------cmd/zpool/zpool.d/test_progress1
l---------cmd/zpool/zpool.d/test_status1
l---------cmd/zpool/zpool.d/test_type1
-rwxr-xr-xcmd/zpool/zpool.d/upath7
l---------cmd/zpool/zpool.d/vendor1
l---------cmd/zpool/zpool.d/w_proc1
l---------cmd/zpool/zpool.d/w_ucor1
-rw-r--r--cmd/zpool/zpool_iter.c757
-rw-r--r--cmd/zpool/zpool_main.c10326
-rw-r--r--cmd/zpool/zpool_util.c125
-rw-r--r--cmd/zpool/zpool_util.h137
-rw-r--r--cmd/zpool/zpool_vdev.c1581
-rw-r--r--cmd/zstream/.gitignore1
-rw-r--r--cmd/zstream/Makefile.am15
-rw-r--r--cmd/zstream/zstream.c66
-rw-r--r--cmd/zstream/zstream.h36
-rw-r--r--cmd/zstream/zstream_dump.c799
-rw-r--r--cmd/zstream/zstream_redup.c469
-rw-r--r--cmd/zstream/zstream_token.c78
-rw-r--r--cmd/zstreamdump/Makefile.am1
-rwxr-xr-xcmd/zstreamdump/zstreamdump3
-rw-r--r--cmd/ztest/.gitignore1
-rw-r--r--cmd/ztest/Makefile.am23
-rw-r--r--cmd/ztest/ztest.c7818
-rw-r--r--cmd/zvol_id/.gitignore1
-rw-r--r--cmd/zvol_id/Makefile.am10
-rw-r--r--cmd/zvol_id/zvol_id_main.c110
-rw-r--r--cmd/zvol_wait/Makefile.am1
-rwxr-xr-xcmd/zvol_wait/zvol_wait116
165 files changed, 59838 insertions, 0 deletions
diff --git a/cmd/Makefile.am b/cmd/Makefile.am
new file mode 100644
index 000000000000..88d32b1c538c
--- /dev/null
+++ b/cmd/Makefile.am
@@ -0,0 +1,10 @@
+SUBDIRS = zfs zpool zdb zhack zinject zstream zstreamdump ztest
+SUBDIRS += fsck_zfs vdev_id raidz_test zfs_ids_to_path
+
+if USING_PYTHON
+SUBDIRS += arcstat arc_summary dbufstat
+endif
+
+if BUILD_LINUX
+SUBDIRS += mount_zfs zed zgenhostid zvol_id zvol_wait
+endif
diff --git a/cmd/arc_summary/.gitignore b/cmd/arc_summary/.gitignore
new file mode 100644
index 000000000000..50ba15f034e2
--- /dev/null
+++ b/cmd/arc_summary/.gitignore
@@ -0,0 +1 @@
+arc_summary
diff --git a/cmd/arc_summary/Makefile.am b/cmd/arc_summary/Makefile.am
new file mode 100644
index 000000000000..1a26c2c199f8
--- /dev/null
+++ b/cmd/arc_summary/Makefile.am
@@ -0,0 +1,13 @@
+bin_SCRIPTS = arc_summary
+
+CLEANFILES = arc_summary
+EXTRA_DIST = arc_summary2 arc_summary3
+
+if USING_PYTHON_2
+SCRIPT = arc_summary2
+else
+SCRIPT = arc_summary3
+endif
+
+arc_summary: $(SCRIPT)
+ cp $< $@
diff --git a/cmd/arc_summary/arc_summary2 b/cmd/arc_summary/arc_summary2
new file mode 100755
index 000000000000..5dc40d759dce
--- /dev/null
+++ b/cmd/arc_summary/arc_summary2
@@ -0,0 +1,1093 @@
+#!/usr/bin/env python2
+#
+# $Id: arc_summary.pl,v 388:e27800740aa2 2011-07-08 02:53:29Z jhell $
+#
+# Copyright (c) 2008 Ben Rockwood <benr@cuddletech.com>,
+# Copyright (c) 2010 Martin Matuska <mm@FreeBSD.org>,
+# Copyright (c) 2010-2011 Jason J. Hellenthal <jhell@DataIX.net>,
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# If you are having troubles when using this script from cron(8) please try
+# adjusting your PATH before reporting problems.
+#
+# Note some of this code uses older code (eg getopt instead of argparse,
+# subprocess.Popen() instead of subprocess.run()) because we need to support
+# some very old versions of Python.
+#
+
+"""Print statistics on the ZFS Adjustable Replacement Cache (ARC)
+
+Provides basic information on the ARC, its efficiency, the L2ARC (if present),
+the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See the
+in-source documentation and code at
+https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details.
+"""
+
+import getopt
+import os
+import sys
+import time
+import errno
+
+from subprocess import Popen, PIPE
+from decimal import Decimal as D
+
+
+if sys.platform.startswith('freebsd'):
+ # Requires py27-sysctl on FreeBSD
+ import sysctl
+
+ def load_kstats(namespace):
+ """Collect information on a specific subsystem of the ARC"""
+
+ base = 'kstat.zfs.misc.%s.' % namespace
+ return [(kstat.name, D(kstat.value)) for kstat in sysctl.filter(base)]
+
+ def load_tunables():
+ return dict((ctl.name, ctl.value) for ctl in sysctl.filter('vfs.zfs'))
+
+elif sys.platform.startswith('linux'):
+
+ def load_kstats(namespace):
+ """Collect information on a specific subsystem of the ARC"""
+
+ kstat = 'kstat.zfs.misc.%s.%%s' % namespace
+ path = '/proc/spl/kstat/zfs/%s' % namespace
+ with open(path) as f:
+ entries = [line.strip().split() for line in f][2:] # Skip header
+ return [(kstat % name, D(value)) for name, _, value in entries]
+
+ def load_tunables():
+ basepath = '/sys/module/zfs/parameters'
+ tunables = {}
+ for name in os.listdir(basepath):
+ if not name:
+ continue
+ path = '%s/%s' % (basepath, name)
+ with open(path) as f:
+ value = f.read()
+ tunables[name] = value.strip()
+ return tunables
+
+
+show_tunable_descriptions = False
+alternate_tunable_layout = False
+
+
+def handle_Exception(ex_cls, ex, tb):
+ if ex is IOError:
+ if ex.errno == errno.EPIPE:
+ sys.exit()
+
+ if ex is KeyboardInterrupt:
+ sys.exit()
+
+
+sys.excepthook = handle_Exception
+
+
+def get_Kstat():
+ """Collect information on the ZFS subsystem from the /proc virtual
+ file system. The name "kstat" is a holdover from the Solaris utility
+ of the same name.
+ """
+
+ Kstat = {}
+ Kstat.update(load_kstats('arcstats'))
+ Kstat.update(load_kstats('zfetchstats'))
+ Kstat.update(load_kstats('vdev_cache_stats'))
+ return Kstat
+
+
+def fBytes(b=0):
+ """Return human-readable representation of a byte value in
+ powers of 2 (eg "KiB" for "kibibytes", etc) to two decimal
+ points. Values smaller than one KiB are returned without
+ decimal points.
+ """
+
+ prefixes = [
+ [2**80, "YiB"], # yobibytes (yotta)
+ [2**70, "ZiB"], # zebibytes (zetta)
+ [2**60, "EiB"], # exbibytes (exa)
+ [2**50, "PiB"], # pebibytes (peta)
+ [2**40, "TiB"], # tebibytes (tera)
+ [2**30, "GiB"], # gibibytes (giga)
+ [2**20, "MiB"], # mebibytes (mega)
+ [2**10, "KiB"]] # kibibytes (kilo)
+
+ if b >= 2**10:
+
+ for limit, unit in prefixes:
+
+ if b >= limit:
+ value = b / limit
+ break
+
+ result = "%0.2f\t%s" % (value, unit)
+
+ else:
+
+ result = "%d\tBytes" % b
+
+ return result
+
+
+def fHits(hits=0):
+ """Create a human-readable representation of the number of hits.
+ The single-letter symbols used are SI to avoid the confusion caused
+ by the different "short scale" and "long scale" representations in
+ English, which use the same words for different values. See
+ https://en.wikipedia.org/wiki/Names_of_large_numbers and
+ https://physics.nist.gov/cuu/Units/prefixes.html
+ """
+
+ numbers = [
+ [10**24, 'Y'], # yotta (septillion)
+ [10**21, 'Z'], # zetta (sextillion)
+ [10**18, 'E'], # exa (quintrillion)
+ [10**15, 'P'], # peta (quadrillion)
+ [10**12, 'T'], # tera (trillion)
+ [10**9, 'G'], # giga (billion)
+ [10**6, 'M'], # mega (million)
+ [10**3, 'k']] # kilo (thousand)
+
+ if hits >= 1000:
+
+ for limit, symbol in numbers:
+
+ if hits >= limit:
+ value = hits/limit
+ break
+
+ result = "%0.2f%s" % (value, symbol)
+
+ else:
+
+ result = "%d" % hits
+
+ return result
+
+
+def fPerc(lVal=0, rVal=0, Decimal=2):
+ """Calculate percentage value and return in human-readable format"""
+
+ if rVal > 0:
+ return str("%0." + str(Decimal) + "f") % (100 * (lVal / rVal)) + "%"
+ else:
+ return str("%0." + str(Decimal) + "f") % 100 + "%"
+
+
+def get_arc_summary(Kstat):
+ """Collect general data on the ARC"""
+
+ output = {}
+ memory_throttle_count = Kstat[
+ "kstat.zfs.misc.arcstats.memory_throttle_count"
+ ]
+
+ if memory_throttle_count > 0:
+ output['health'] = 'THROTTLED'
+ else:
+ output['health'] = 'HEALTHY'
+
+ output['memory_throttle_count'] = fHits(memory_throttle_count)
+
+ # ARC Misc.
+ deleted = Kstat["kstat.zfs.misc.arcstats.deleted"]
+ mutex_miss = Kstat["kstat.zfs.misc.arcstats.mutex_miss"]
+ evict_skip = Kstat["kstat.zfs.misc.arcstats.evict_skip"]
+
+ # ARC Misc.
+ output["arc_misc"] = {}
+ output["arc_misc"]["deleted"] = fHits(deleted)
+ output["arc_misc"]['mutex_miss'] = fHits(mutex_miss)
+ output["arc_misc"]['evict_skips'] = fHits(evict_skip)
+
+ # ARC Sizing
+ arc_size = Kstat["kstat.zfs.misc.arcstats.size"]
+ mru_size = Kstat["kstat.zfs.misc.arcstats.mru_size"]
+ mfu_size = Kstat["kstat.zfs.misc.arcstats.mfu_size"]
+ meta_limit = Kstat["kstat.zfs.misc.arcstats.arc_meta_limit"]
+ meta_size = Kstat["kstat.zfs.misc.arcstats.arc_meta_used"]
+ dnode_limit = Kstat["kstat.zfs.misc.arcstats.arc_dnode_limit"]
+ dnode_size = Kstat["kstat.zfs.misc.arcstats.dnode_size"]
+ target_max_size = Kstat["kstat.zfs.misc.arcstats.c_max"]
+ target_min_size = Kstat["kstat.zfs.misc.arcstats.c_min"]
+ target_size = Kstat["kstat.zfs.misc.arcstats.c"]
+
+ target_size_ratio = (target_max_size / target_min_size)
+
+ # ARC Sizing
+ output['arc_sizing'] = {}
+ output['arc_sizing']['arc_size'] = {
+ 'per': fPerc(arc_size, target_max_size),
+ 'num': fBytes(arc_size),
+ }
+ output['arc_sizing']['target_max_size'] = {
+ 'ratio': target_size_ratio,
+ 'num': fBytes(target_max_size),
+ }
+ output['arc_sizing']['target_min_size'] = {
+ 'per': fPerc(target_min_size, target_max_size),
+ 'num': fBytes(target_min_size),
+ }
+ output['arc_sizing']['target_size'] = {
+ 'per': fPerc(target_size, target_max_size),
+ 'num': fBytes(target_size),
+ }
+ output['arc_sizing']['meta_limit'] = {
+ 'per': fPerc(meta_limit, target_max_size),
+ 'num': fBytes(meta_limit),
+ }
+ output['arc_sizing']['meta_size'] = {
+ 'per': fPerc(meta_size, meta_limit),
+ 'num': fBytes(meta_size),
+ }
+ output['arc_sizing']['dnode_limit'] = {
+ 'per': fPerc(dnode_limit, meta_limit),
+ 'num': fBytes(dnode_limit),
+ }
+ output['arc_sizing']['dnode_size'] = {
+ 'per': fPerc(dnode_size, dnode_limit),
+ 'num': fBytes(dnode_size),
+ }
+
+ # ARC Hash Breakdown
+ output['arc_hash_break'] = {}
+ output['arc_hash_break']['hash_chain_max'] = Kstat[
+ "kstat.zfs.misc.arcstats.hash_chain_max"
+ ]
+ output['arc_hash_break']['hash_chains'] = Kstat[
+ "kstat.zfs.misc.arcstats.hash_chains"
+ ]
+ output['arc_hash_break']['hash_collisions'] = Kstat[
+ "kstat.zfs.misc.arcstats.hash_collisions"
+ ]
+ output['arc_hash_break']['hash_elements'] = Kstat[
+ "kstat.zfs.misc.arcstats.hash_elements"
+ ]
+ output['arc_hash_break']['hash_elements_max'] = Kstat[
+ "kstat.zfs.misc.arcstats.hash_elements_max"
+ ]
+
+ output['arc_size_break'] = {}
+ output['arc_size_break']['recently_used_cache_size'] = {
+ 'per': fPerc(mru_size, mru_size + mfu_size),
+ 'num': fBytes(mru_size),
+ }
+ output['arc_size_break']['frequently_used_cache_size'] = {
+ 'per': fPerc(mfu_size, mru_size + mfu_size),
+ 'num': fBytes(mfu_size),
+ }
+
+ # ARC Hash Breakdown
+ hash_chain_max = Kstat["kstat.zfs.misc.arcstats.hash_chain_max"]
+ hash_chains = Kstat["kstat.zfs.misc.arcstats.hash_chains"]
+ hash_collisions = Kstat["kstat.zfs.misc.arcstats.hash_collisions"]
+ hash_elements = Kstat["kstat.zfs.misc.arcstats.hash_elements"]
+ hash_elements_max = Kstat["kstat.zfs.misc.arcstats.hash_elements_max"]
+
+ output['arc_hash_break'] = {}
+ output['arc_hash_break']['elements_max'] = fHits(hash_elements_max)
+ output['arc_hash_break']['elements_current'] = {
+ 'per': fPerc(hash_elements, hash_elements_max),
+ 'num': fHits(hash_elements),
+ }
+ output['arc_hash_break']['collisions'] = fHits(hash_collisions)
+ output['arc_hash_break']['chain_max'] = fHits(hash_chain_max)
+ output['arc_hash_break']['chains'] = fHits(hash_chains)
+
+ return output
+
+
+def _arc_summary(Kstat):
+ """Print information on the ARC"""
+
+ # ARC Sizing
+ arc = get_arc_summary(Kstat)
+
+ sys.stdout.write("ARC Summary: (%s)\n" % arc['health'])
+
+ sys.stdout.write("\tMemory Throttle Count:\t\t\t%s\n" %
+ arc['memory_throttle_count'])
+ sys.stdout.write("\n")
+
+ # ARC Misc.
+ sys.stdout.write("ARC Misc:\n")
+ sys.stdout.write("\tDeleted:\t\t\t\t%s\n" % arc['arc_misc']['deleted'])
+ sys.stdout.write("\tMutex Misses:\t\t\t\t%s\n" %
+ arc['arc_misc']['mutex_miss'])
+ sys.stdout.write("\tEvict Skips:\t\t\t\t%s\n" %
+ arc['arc_misc']['evict_skips'])
+ sys.stdout.write("\n")
+
+ # ARC Sizing
+ sys.stdout.write("ARC Size:\t\t\t\t%s\t%s\n" % (
+ arc['arc_sizing']['arc_size']['per'],
+ arc['arc_sizing']['arc_size']['num']
+ )
+ )
+ sys.stdout.write("\tTarget Size: (Adaptive)\t\t%s\t%s\n" % (
+ arc['arc_sizing']['target_size']['per'],
+ arc['arc_sizing']['target_size']['num'],
+ )
+ )
+
+ sys.stdout.write("\tMin Size (Hard Limit):\t\t%s\t%s\n" % (
+ arc['arc_sizing']['target_min_size']['per'],
+ arc['arc_sizing']['target_min_size']['num'],
+ )
+ )
+
+ sys.stdout.write("\tMax Size (High Water):\t\t%d:1\t%s\n" % (
+ arc['arc_sizing']['target_max_size']['ratio'],
+ arc['arc_sizing']['target_max_size']['num'],
+ )
+ )
+
+ sys.stdout.write("\nARC Size Breakdown:\n")
+ sys.stdout.write("\tRecently Used Cache Size:\t%s\t%s\n" % (
+ arc['arc_size_break']['recently_used_cache_size']['per'],
+ arc['arc_size_break']['recently_used_cache_size']['num'],
+ )
+ )
+ sys.stdout.write("\tFrequently Used Cache Size:\t%s\t%s\n" % (
+ arc['arc_size_break']['frequently_used_cache_size']['per'],
+ arc['arc_size_break']['frequently_used_cache_size']['num'],
+ )
+ )
+ sys.stdout.write("\tMetadata Size (Hard Limit):\t%s\t%s\n" % (
+ arc['arc_sizing']['meta_limit']['per'],
+ arc['arc_sizing']['meta_limit']['num'],
+ )
+ )
+ sys.stdout.write("\tMetadata Size:\t\t\t%s\t%s\n" % (
+ arc['arc_sizing']['meta_size']['per'],
+ arc['arc_sizing']['meta_size']['num'],
+ )
+ )
+ sys.stdout.write("\tDnode Size (Hard Limit):\t%s\t%s\n" % (
+ arc['arc_sizing']['dnode_limit']['per'],
+ arc['arc_sizing']['dnode_limit']['num'],
+ )
+ )
+ sys.stdout.write("\tDnode Size:\t\t\t%s\t%s\n" % (
+ arc['arc_sizing']['dnode_size']['per'],
+ arc['arc_sizing']['dnode_size']['num'],
+ )
+ )
+
+ sys.stdout.write("\n")
+
+ # ARC Hash Breakdown
+ sys.stdout.write("ARC Hash Breakdown:\n")
+ sys.stdout.write("\tElements Max:\t\t\t\t%s\n" %
+ arc['arc_hash_break']['elements_max'])
+ sys.stdout.write("\tElements Current:\t\t%s\t%s\n" % (
+ arc['arc_hash_break']['elements_current']['per'],
+ arc['arc_hash_break']['elements_current']['num'],
+ )
+ )
+ sys.stdout.write("\tCollisions:\t\t\t\t%s\n" %
+ arc['arc_hash_break']['collisions'])
+ sys.stdout.write("\tChain Max:\t\t\t\t%s\n" %
+ arc['arc_hash_break']['chain_max'])
+ sys.stdout.write("\tChains:\t\t\t\t\t%s\n" %
+ arc['arc_hash_break']['chains'])
+
+
+def get_arc_efficiency(Kstat):
+ """Collect information on the efficiency of the ARC"""
+
+ output = {}
+
+ arc_hits = Kstat["kstat.zfs.misc.arcstats.hits"]
+ arc_misses = Kstat["kstat.zfs.misc.arcstats.misses"]
+ demand_data_hits = Kstat["kstat.zfs.misc.arcstats.demand_data_hits"]
+ demand_data_misses = Kstat["kstat.zfs.misc.arcstats.demand_data_misses"]
+ demand_metadata_hits = Kstat[
+ "kstat.zfs.misc.arcstats.demand_metadata_hits"
+ ]
+ demand_metadata_misses = Kstat[
+ "kstat.zfs.misc.arcstats.demand_metadata_misses"
+ ]
+ mfu_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mfu_ghost_hits"]
+ mfu_hits = Kstat["kstat.zfs.misc.arcstats.mfu_hits"]
+ mru_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mru_ghost_hits"]
+ mru_hits = Kstat["kstat.zfs.misc.arcstats.mru_hits"]
+ prefetch_data_hits = Kstat["kstat.zfs.misc.arcstats.prefetch_data_hits"]
+ prefetch_data_misses = Kstat[
+ "kstat.zfs.misc.arcstats.prefetch_data_misses"
+ ]
+ prefetch_metadata_hits = Kstat[
+ "kstat.zfs.misc.arcstats.prefetch_metadata_hits"
+ ]
+ prefetch_metadata_misses = Kstat[
+ "kstat.zfs.misc.arcstats.prefetch_metadata_misses"
+ ]
+
+ anon_hits = arc_hits - (
+ mfu_hits + mru_hits + mfu_ghost_hits + mru_ghost_hits
+ )
+ arc_accesses_total = (arc_hits + arc_misses)
+ demand_data_total = (demand_data_hits + demand_data_misses)
+ prefetch_data_total = (prefetch_data_hits + prefetch_data_misses)
+ real_hits = (mfu_hits + mru_hits)
+
+ output["total_accesses"] = fHits(arc_accesses_total)
+ output["cache_hit_ratio"] = {
+ 'per': fPerc(arc_hits, arc_accesses_total),
+ 'num': fHits(arc_hits),
+ }
+ output["cache_miss_ratio"] = {
+ 'per': fPerc(arc_misses, arc_accesses_total),
+ 'num': fHits(arc_misses),
+ }
+ output["actual_hit_ratio"] = {
+ 'per': fPerc(real_hits, arc_accesses_total),
+ 'num': fHits(real_hits),
+ }
+ output["data_demand_efficiency"] = {
+ 'per': fPerc(demand_data_hits, demand_data_total),
+ 'num': fHits(demand_data_total),
+ }
+
+ if prefetch_data_total > 0:
+ output["data_prefetch_efficiency"] = {
+ 'per': fPerc(prefetch_data_hits, prefetch_data_total),
+ 'num': fHits(prefetch_data_total),
+ }
+
+ if anon_hits > 0:
+ output["cache_hits_by_cache_list"] = {}
+ output["cache_hits_by_cache_list"]["anonymously_used"] = {
+ 'per': fPerc(anon_hits, arc_hits),
+ 'num': fHits(anon_hits),
+ }
+
+ output["most_recently_used"] = {
+ 'per': fPerc(mru_hits, arc_hits),
+ 'num': fHits(mru_hits),
+ }
+ output["most_frequently_used"] = {
+ 'per': fPerc(mfu_hits, arc_hits),
+ 'num': fHits(mfu_hits),
+ }
+ output["most_recently_used_ghost"] = {
+ 'per': fPerc(mru_ghost_hits, arc_hits),
+ 'num': fHits(mru_ghost_hits),
+ }
+ output["most_frequently_used_ghost"] = {
+ 'per': fPerc(mfu_ghost_hits, arc_hits),
+ 'num': fHits(mfu_ghost_hits),
+ }
+
+ output["cache_hits_by_data_type"] = {}
+ output["cache_hits_by_data_type"]["demand_data"] = {
+ 'per': fPerc(demand_data_hits, arc_hits),
+ 'num': fHits(demand_data_hits),
+ }
+ output["cache_hits_by_data_type"]["prefetch_data"] = {
+ 'per': fPerc(prefetch_data_hits, arc_hits),
+ 'num': fHits(prefetch_data_hits),
+ }
+ output["cache_hits_by_data_type"]["demand_metadata"] = {
+ 'per': fPerc(demand_metadata_hits, arc_hits),
+ 'num': fHits(demand_metadata_hits),
+ }
+ output["cache_hits_by_data_type"]["prefetch_metadata"] = {
+ 'per': fPerc(prefetch_metadata_hits, arc_hits),
+ 'num': fHits(prefetch_metadata_hits),
+ }
+
+ output["cache_misses_by_data_type"] = {}
+ output["cache_misses_by_data_type"]["demand_data"] = {
+ 'per': fPerc(demand_data_misses, arc_misses),
+ 'num': fHits(demand_data_misses),
+ }
+ output["cache_misses_by_data_type"]["prefetch_data"] = {
+ 'per': fPerc(prefetch_data_misses, arc_misses),
+ 'num': fHits(prefetch_data_misses),
+ }
+ output["cache_misses_by_data_type"]["demand_metadata"] = {
+ 'per': fPerc(demand_metadata_misses, arc_misses),
+ 'num': fHits(demand_metadata_misses),
+ }
+ output["cache_misses_by_data_type"]["prefetch_metadata"] = {
+ 'per': fPerc(prefetch_metadata_misses, arc_misses),
+ 'num': fHits(prefetch_metadata_misses),
+ }
+
+ return output
+
+
+def _arc_efficiency(Kstat):
+ """Print information on the efficiency of the ARC"""
+
+ arc = get_arc_efficiency(Kstat)
+
+ sys.stdout.write("ARC Total accesses:\t\t\t\t\t%s\n" %
+ arc['total_accesses'])
+ sys.stdout.write("\tCache Hit Ratio:\t\t%s\t%s\n" % (
+ arc['cache_hit_ratio']['per'],
+ arc['cache_hit_ratio']['num'],
+ )
+ )
+ sys.stdout.write("\tCache Miss Ratio:\t\t%s\t%s\n" % (
+ arc['cache_miss_ratio']['per'],
+ arc['cache_miss_ratio']['num'],
+ )
+ )
+
+ sys.stdout.write("\tActual Hit Ratio:\t\t%s\t%s\n" % (
+ arc['actual_hit_ratio']['per'],
+ arc['actual_hit_ratio']['num'],
+ )
+ )
+
+ sys.stdout.write("\n")
+ sys.stdout.write("\tData Demand Efficiency:\t\t%s\t%s\n" % (
+ arc['data_demand_efficiency']['per'],
+ arc['data_demand_efficiency']['num'],
+ )
+ )
+
+ if 'data_prefetch_efficiency' in arc:
+ sys.stdout.write("\tData Prefetch Efficiency:\t%s\t%s\n" % (
+ arc['data_prefetch_efficiency']['per'],
+ arc['data_prefetch_efficiency']['num'],
+ )
+ )
+ sys.stdout.write("\n")
+
+ sys.stdout.write("\tCACHE HITS BY CACHE LIST:\n")
+ if 'cache_hits_by_cache_list' in arc:
+ sys.stdout.write("\t Anonymously Used:\t\t%s\t%s\n" % (
+ arc['cache_hits_by_cache_list']['anonymously_used']['per'],
+ arc['cache_hits_by_cache_list']['anonymously_used']['num'],
+ )
+ )
+ sys.stdout.write("\t Most Recently Used:\t\t%s\t%s\n" % (
+ arc['most_recently_used']['per'],
+ arc['most_recently_used']['num'],
+ )
+ )
+ sys.stdout.write("\t Most Frequently Used:\t\t%s\t%s\n" % (
+ arc['most_frequently_used']['per'],
+ arc['most_frequently_used']['num'],
+ )
+ )
+ sys.stdout.write("\t Most Recently Used Ghost:\t%s\t%s\n" % (
+ arc['most_recently_used_ghost']['per'],
+ arc['most_recently_used_ghost']['num'],
+ )
+ )
+ sys.stdout.write("\t Most Frequently Used Ghost:\t%s\t%s\n" % (
+ arc['most_frequently_used_ghost']['per'],
+ arc['most_frequently_used_ghost']['num'],
+ )
+ )
+
+ sys.stdout.write("\n\tCACHE HITS BY DATA TYPE:\n")
+ sys.stdout.write("\t Demand Data:\t\t\t%s\t%s\n" % (
+ arc["cache_hits_by_data_type"]['demand_data']['per'],
+ arc["cache_hits_by_data_type"]['demand_data']['num'],
+ )
+ )
+ sys.stdout.write("\t Prefetch Data:\t\t%s\t%s\n" % (
+ arc["cache_hits_by_data_type"]['prefetch_data']['per'],
+ arc["cache_hits_by_data_type"]['prefetch_data']['num'],
+ )
+ )
+ sys.stdout.write("\t Demand Metadata:\t\t%s\t%s\n" % (
+ arc["cache_hits_by_data_type"]['demand_metadata']['per'],
+ arc["cache_hits_by_data_type"]['demand_metadata']['num'],
+ )
+ )
+ sys.stdout.write("\t Prefetch Metadata:\t\t%s\t%s\n" % (
+ arc["cache_hits_by_data_type"]['prefetch_metadata']['per'],
+ arc["cache_hits_by_data_type"]['prefetch_metadata']['num'],
+ )
+ )
+
+ sys.stdout.write("\n\tCACHE MISSES BY DATA TYPE:\n")
+ sys.stdout.write("\t Demand Data:\t\t\t%s\t%s\n" % (
+ arc["cache_misses_by_data_type"]['demand_data']['per'],
+ arc["cache_misses_by_data_type"]['demand_data']['num'],
+ )
+ )
+ sys.stdout.write("\t Prefetch Data:\t\t%s\t%s\n" % (
+ arc["cache_misses_by_data_type"]['prefetch_data']['per'],
+ arc["cache_misses_by_data_type"]['prefetch_data']['num'],
+ )
+ )
+ sys.stdout.write("\t Demand Metadata:\t\t%s\t%s\n" % (
+ arc["cache_misses_by_data_type"]['demand_metadata']['per'],
+ arc["cache_misses_by_data_type"]['demand_metadata']['num'],
+ )
+ )
+ sys.stdout.write("\t Prefetch Metadata:\t\t%s\t%s\n" % (
+ arc["cache_misses_by_data_type"]['prefetch_metadata']['per'],
+ arc["cache_misses_by_data_type"]['prefetch_metadata']['num'],
+ )
+ )
+
+
+def get_l2arc_summary(Kstat):
+ """Collection information on the L2ARC"""
+
+ output = {}
+
+ l2_abort_lowmem = Kstat["kstat.zfs.misc.arcstats.l2_abort_lowmem"]
+ l2_cksum_bad = Kstat["kstat.zfs.misc.arcstats.l2_cksum_bad"]
+ l2_evict_lock_retry = Kstat["kstat.zfs.misc.arcstats.l2_evict_lock_retry"]
+ l2_evict_reading = Kstat["kstat.zfs.misc.arcstats.l2_evict_reading"]
+ l2_feeds = Kstat["kstat.zfs.misc.arcstats.l2_feeds"]
+ l2_free_on_write = Kstat["kstat.zfs.misc.arcstats.l2_free_on_write"]
+ l2_hdr_size = Kstat["kstat.zfs.misc.arcstats.l2_hdr_size"]
+ l2_hits = Kstat["kstat.zfs.misc.arcstats.l2_hits"]
+ l2_io_error = Kstat["kstat.zfs.misc.arcstats.l2_io_error"]
+ l2_misses = Kstat["kstat.zfs.misc.arcstats.l2_misses"]
+ l2_rw_clash = Kstat["kstat.zfs.misc.arcstats.l2_rw_clash"]
+ l2_size = Kstat["kstat.zfs.misc.arcstats.l2_size"]
+ l2_asize = Kstat["kstat.zfs.misc.arcstats.l2_asize"]
+ l2_writes_done = Kstat["kstat.zfs.misc.arcstats.l2_writes_done"]
+ l2_writes_error = Kstat["kstat.zfs.misc.arcstats.l2_writes_error"]
+ l2_writes_sent = Kstat["kstat.zfs.misc.arcstats.l2_writes_sent"]
+
+ l2_access_total = (l2_hits + l2_misses)
+ output['l2_health_count'] = (l2_writes_error + l2_cksum_bad + l2_io_error)
+
+ output['l2_access_total'] = l2_access_total
+ output['l2_size'] = l2_size
+ output['l2_asize'] = l2_asize
+
+ if l2_size > 0 and l2_access_total > 0:
+
+ if output['l2_health_count'] > 0:
+ output["health"] = "DEGRADED"
+ else:
+ output["health"] = "HEALTHY"
+
+ output["low_memory_aborts"] = fHits(l2_abort_lowmem)
+ output["free_on_write"] = fHits(l2_free_on_write)
+ output["rw_clashes"] = fHits(l2_rw_clash)
+ output["bad_checksums"] = fHits(l2_cksum_bad)
+ output["io_errors"] = fHits(l2_io_error)
+
+ output["l2_arc_size"] = {}
+ output["l2_arc_size"]["adative"] = fBytes(l2_size)
+ output["l2_arc_size"]["actual"] = {
+ 'per': fPerc(l2_asize, l2_size),
+ 'num': fBytes(l2_asize)
+ }
+ output["l2_arc_size"]["head_size"] = {
+ 'per': fPerc(l2_hdr_size, l2_size),
+ 'num': fBytes(l2_hdr_size),
+ }
+
+ output["l2_arc_evicts"] = {}
+ output["l2_arc_evicts"]['lock_retries'] = fHits(l2_evict_lock_retry)
+ output["l2_arc_evicts"]['reading'] = fHits(l2_evict_reading)
+
+ output['l2_arc_breakdown'] = {}
+ output['l2_arc_breakdown']['value'] = fHits(l2_access_total)
+ output['l2_arc_breakdown']['hit_ratio'] = {
+ 'per': fPerc(l2_hits, l2_access_total),
+ 'num': fHits(l2_hits),
+ }
+ output['l2_arc_breakdown']['miss_ratio'] = {
+ 'per': fPerc(l2_misses, l2_access_total),
+ 'num': fHits(l2_misses),
+ }
+ output['l2_arc_breakdown']['feeds'] = fHits(l2_feeds)
+
+ output['l2_arc_buffer'] = {}
+
+ output['l2_arc_writes'] = {}
+ output['l2_writes_done'] = l2_writes_done
+ output['l2_writes_sent'] = l2_writes_sent
+ if l2_writes_done != l2_writes_sent:
+ output['l2_arc_writes']['writes_sent'] = {
+ 'value': "FAULTED",
+ 'num': fHits(l2_writes_sent),
+ }
+ output['l2_arc_writes']['done_ratio'] = {
+ 'per': fPerc(l2_writes_done, l2_writes_sent),
+ 'num': fHits(l2_writes_done),
+ }
+ output['l2_arc_writes']['error_ratio'] = {
+ 'per': fPerc(l2_writes_error, l2_writes_sent),
+ 'num': fHits(l2_writes_error),
+ }
+ else:
+ output['l2_arc_writes']['writes_sent'] = {
+ 'per': fPerc(100),
+ 'num': fHits(l2_writes_sent),
+ }
+
+ return output
+
+
+def _l2arc_summary(Kstat):
+    """Print information on the L2ARC"""
+
+    # get_l2arc_summary() returns pre-formatted display strings plus the
+    # raw counters (l2_size, l2_access_total, l2_writes_*) used for the
+    # branch decisions below.
+    arc = get_l2arc_summary(Kstat)
+
+    # Only print the section when an L2ARC device exists and has been
+    # accessed at least once; otherwise the whole section is skipped.
+    if arc['l2_size'] > 0 and arc['l2_access_total'] > 0:
+        sys.stdout.write("L2 ARC Summary: ")
+        if arc['l2_health_count'] > 0:
+            sys.stdout.write("(DEGRADED)\n")
+        else:
+            sys.stdout.write("(HEALTHY)\n")
+        sys.stdout.write("\tLow Memory Aborts:\t\t\t%s\n" %
+                         arc['low_memory_aborts'])
+        sys.stdout.write("\tFree on Write:\t\t\t\t%s\n" % arc['free_on_write'])
+        sys.stdout.write("\tR/W Clashes:\t\t\t\t%s\n" % arc['rw_clashes'])
+        sys.stdout.write("\tBad Checksums:\t\t\t\t%s\n" % arc['bad_checksums'])
+        sys.stdout.write("\tIO Errors:\t\t\t\t%s\n" % arc['io_errors'])
+        sys.stdout.write("\n")
+
+        # NOTE(review): the key "adative" is a typo of "adaptive" carried
+        # in get_l2arc_summary(); both sides must stay in sync.
+        sys.stdout.write("L2 ARC Size: (Adaptive)\t\t\t\t%s\n" %
+                         arc["l2_arc_size"]["adative"])
+        sys.stdout.write("\tCompressed:\t\t\t%s\t%s\n" % (
+            arc["l2_arc_size"]["actual"]["per"],
+            arc["l2_arc_size"]["actual"]["num"],
+            )
+        )
+        sys.stdout.write("\tHeader Size:\t\t\t%s\t%s\n" % (
+            arc["l2_arc_size"]["head_size"]["per"],
+            arc["l2_arc_size"]["head_size"]["num"],
+            )
+        )
+        sys.stdout.write("\n")
+
+        # Evict stats are only interesting when non-zero (the values are
+        # pre-formatted strings, hence the comparison against '0').
+        if arc["l2_arc_evicts"]['lock_retries'] != '0' or \
+           arc["l2_arc_evicts"]["reading"] != '0':
+            sys.stdout.write("L2 ARC Evicts:\n")
+            sys.stdout.write("\tLock Retries:\t\t\t\t%s\n" %
+                             arc["l2_arc_evicts"]['lock_retries'])
+            sys.stdout.write("\tUpon Reading:\t\t\t\t%s\n" %
+                             arc["l2_arc_evicts"]["reading"])
+            sys.stdout.write("\n")
+
+        sys.stdout.write("L2 ARC Breakdown:\t\t\t\t%s\n" %
+                         arc['l2_arc_breakdown']['value'])
+        sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % (
+            arc['l2_arc_breakdown']['hit_ratio']['per'],
+            arc['l2_arc_breakdown']['hit_ratio']['num'],
+            )
+        )
+
+        sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % (
+            arc['l2_arc_breakdown']['miss_ratio']['per'],
+            arc['l2_arc_breakdown']['miss_ratio']['num'],
+            )
+        )
+
+        sys.stdout.write("\tFeeds:\t\t\t\t\t%s\n" %
+                         arc['l2_arc_breakdown']['feeds'])
+        sys.stdout.write("\n")
+
+        # A mismatch between writes done and writes sent indicates lost
+        # writes, reported as FAULTED with done/error ratios.
+        sys.stdout.write("L2 ARC Writes:\n")
+        if arc['l2_writes_done'] != arc['l2_writes_sent']:
+            sys.stdout.write("\tWrites Sent: (%s)\t\t\t\t%s\n" % (
+                arc['l2_arc_writes']['writes_sent']['value'],
+                arc['l2_arc_writes']['writes_sent']['num'],
+                )
+            )
+            sys.stdout.write("\t  Done Ratio:\t\t\t%s\t%s\n" % (
+                arc['l2_arc_writes']['done_ratio']['per'],
+                arc['l2_arc_writes']['done_ratio']['num'],
+                )
+            )
+            sys.stdout.write("\t  Error Ratio:\t\t\t%s\t%s\n" % (
+                arc['l2_arc_writes']['error_ratio']['per'],
+                arc['l2_arc_writes']['error_ratio']['num'],
+                )
+            )
+        else:
+            sys.stdout.write("\tWrites Sent:\t\t\t%s\t%s\n" % (
+                arc['l2_arc_writes']['writes_sent']['per'],
+                arc['l2_arc_writes']['writes_sent']['num'],
+                )
+            )
+
+
+def get_dmu_summary(Kstat):
+    """Collect information on the DMU.
+
+    Reads the zfetch (prefetch) hit/miss counters from the Kstat dict and
+    returns a dict with the raw access total plus formatted hit/miss
+    ratios.  The 'dmu' sub-dict is only present when there was at least
+    one zfetch access.
+    """
+
+    output = {}
+
+    zfetch_hits = Kstat["kstat.zfs.misc.zfetchstats.hits"]
+    zfetch_misses = Kstat["kstat.zfs.misc.zfetchstats.misses"]
+
+    zfetch_access_total = (zfetch_hits + zfetch_misses)
+    output['zfetch_access_total'] = zfetch_access_total
+
+    if zfetch_access_total > 0:
+        output['dmu'] = {}
+        output['dmu']['efficiency'] = {}
+        output['dmu']['efficiency']['value'] = fHits(zfetch_access_total)
+        output['dmu']['efficiency']['hit_ratio'] = {
+            'per': fPerc(zfetch_hits, zfetch_access_total),
+            'num': fHits(zfetch_hits),
+        }
+        output['dmu']['efficiency']['miss_ratio'] = {
+            'per': fPerc(zfetch_misses, zfetch_access_total),
+            'num': fHits(zfetch_misses),
+        }
+
+    return output
+
+
+def _dmu_summary(Kstat):
+    """Print information on the DMU"""
+
+    arc = get_dmu_summary(Kstat)
+
+    # The 'dmu' sub-dict only exists when zfetch_access_total > 0, so the
+    # guard below also protects the key accesses.
+    if arc['zfetch_access_total'] > 0:
+        sys.stdout.write("DMU Prefetch Efficiency:\t\t\t\t\t%s\n" %
+                         arc['dmu']['efficiency']['value'])
+        sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % (
+            arc['dmu']['efficiency']['hit_ratio']['per'],
+            arc['dmu']['efficiency']['hit_ratio']['num'],
+            )
+        )
+        sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % (
+            arc['dmu']['efficiency']['miss_ratio']['per'],
+            arc['dmu']['efficiency']['miss_ratio']['num'],
+            )
+        )
+
+        sys.stdout.write("\n")
+
+
+def get_vdev_summary(Kstat):
+    """Collect information on the VDEVs.
+
+    Reads the vdev cache counters (delegations, hits, misses) from the
+    Kstat dict and returns a dict with the raw total plus formatted
+    ratios.  The ratio keys only exist when the total is non-zero.
+    """
+
+    output = {}
+
+    vdev_cache_delegations = \
+        Kstat["kstat.zfs.misc.vdev_cache_stats.delegations"]
+    vdev_cache_misses = Kstat["kstat.zfs.misc.vdev_cache_stats.misses"]
+    vdev_cache_hits = Kstat["kstat.zfs.misc.vdev_cache_stats.hits"]
+    vdev_cache_total = (vdev_cache_misses + vdev_cache_hits +
+                        vdev_cache_delegations)
+
+    output['vdev_cache_total'] = vdev_cache_total
+
+    if vdev_cache_total > 0:
+        output['summary'] = fHits(vdev_cache_total)
+        output['hit_ratio'] = {
+            'per': fPerc(vdev_cache_hits, vdev_cache_total),
+            'num': fHits(vdev_cache_hits),
+        }
+        output['miss_ratio'] = {
+            'per': fPerc(vdev_cache_misses, vdev_cache_total),
+            'num': fHits(vdev_cache_misses),
+        }
+        output['delegations'] = {
+            'per': fPerc(vdev_cache_delegations, vdev_cache_total),
+            'num': fHits(vdev_cache_delegations),
+        }
+
+    return output
+
+
+def _vdev_summary(Kstat):
+    """Print information on the VDEVs"""
+
+    arc = get_vdev_summary(Kstat)
+
+    # Section is skipped entirely when the vdev cache saw no activity
+    # (the ratio keys do not exist in that case).
+    if arc['vdev_cache_total'] > 0:
+        sys.stdout.write("VDEV Cache Summary:\t\t\t\t%s\n" % arc['summary'])
+        sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % (
+            arc['hit_ratio']['per'],
+            arc['hit_ratio']['num'],
+        ))
+        sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % (
+            arc['miss_ratio']['per'],
+            arc['miss_ratio']['num'],
+        ))
+        sys.stdout.write("\tDelegations:\t\t\t%s\t%s\n" % (
+            arc['delegations']['per'],
+            arc['delegations']['num'],
+        ))
+
+
+def _tunable_summary(Kstat):
+    """Print information on tunables, including descriptions if requested"""
+
+    global show_tunable_descriptions
+    global alternate_tunable_layout
+
+    tunables = load_tunables()
+    descriptions = {}
+
+    # Descriptions come from the kernel module metadata, not from Kstat,
+    # so they require shelling out to modinfo.
+    if show_tunable_descriptions:
+
+        command = ["/sbin/modinfo", "zfs", "-0"]
+
+        try:
+            p = Popen(command, stdin=PIPE, stdout=PIPE,
+                      stderr=PIPE, shell=False, close_fds=True)
+            p.wait()
+
+            # By default, Python 2 returns a string as the first element of the
+            # tuple from p.communicate(), while Python 3 returns bytes which
+            # must be decoded first. The better way to do this would be with
+            # subprocess.run() or at least .check_output(), but this fails on
+            # CentOS 6 because of its old version of Python 2
+            desc = bytes.decode(p.communicate()[0])
+            description_list = desc.strip().split('\0')
+
+            if p.returncode == 0:
+                # modinfo -0 emits NUL-separated records; tunable
+                # descriptions are the ones prefixed with "parm:".
+                for tunable in description_list:
+                    if tunable[0:5] == 'parm:':
+                        tunable = tunable[5:].strip()
+                        name, description = tunable.split(':', 1)
+                        if not description:
+                            description = "Description unavailable"
+                        descriptions[name] = description
+            else:
+                sys.stderr.write("%s: '%s' exited with code %i\n" %
+                                 (sys.argv[0], command[0], p.returncode))
+                sys.stderr.write("Tunable descriptions will be disabled.\n")
+        except OSError as e:
+            # modinfo missing or not executable: degrade gracefully to a
+            # description-less listing rather than aborting the report.
+            sys.stderr.write("%s: Cannot run '%s': %s\n" %
+                             (sys.argv[0], command[0], e.strerror))
+            sys.stderr.write("Tunable descriptions will be disabled.\n")
+
+    sys.stdout.write("ZFS Tunables:\n")
+
+    # -a/--alternate selects "name=value" layout; default is a padded
+    # two-column layout.
+    if alternate_tunable_layout:
+        fmt = "\t%s=%s\n"
+    else:
+        fmt = "\t%-50s%s\n"
+
+    for name in sorted(tunables.keys()):
+        if show_tunable_descriptions and name in descriptions:
+            sys.stdout.write("\t# %s\n" % descriptions[name])
+
+        sys.stdout.write(fmt % (name, tunables[name]))
+
+
+# Ordered list of report sections.  The -p/--page option indexes into
+# this list one-based, so the order here defines the page numbering
+# shown in usage().
+unSub = [
+    _arc_summary,
+    _arc_efficiency,
+    _l2arc_summary,
+    _dmu_summary,
+    _vdev_summary,
+    _tunable_summary
+]
+
+
+def zfs_header():
+    """Print title string with date.
+
+    Emits a 72-column separator line followed by the report title and
+    the current local time.
+    """
+
+    daydate = time.strftime('%a %b %d %H:%M:%S %Y')
+
+    sys.stdout.write('\n'+'-'*72+'\n')
+    sys.stdout.write('ZFS Subsystem Report\t\t\t\t%s' % daydate)
+    sys.stdout.write('\n')
+
+
+def usage():
+    """Print usage information.
+
+    The valid -p range is derived from len(unSub) so it stays correct
+    if sections are added or removed.
+    """
+
+    sys.stdout.write("Usage: arc_summary [-h] [-a] [-d] [-p PAGE]\n\n")
+    sys.stdout.write("\t -h, --help           : "
+                     "Print this help message and exit\n")
+    sys.stdout.write("\t -a, --alternate      : "
+                     "Show an alternate sysctl layout\n")
+    sys.stdout.write("\t -d, --description    : "
+                     "Show the sysctl descriptions\n")
+    sys.stdout.write("\t -p PAGE, --page=PAGE : "
+                     "Select a single output page to display,\n")
+    sys.stdout.write("\t                        "
+                     "should be an integer between 1 and " +
+                     str(len(unSub)) + "\n\n")
+    sys.stdout.write("Examples:\n")
+    sys.stdout.write("\tarc_summary -a\n")
+    sys.stdout.write("\tarc_summary -p 4\n")
+    sys.stdout.write("\tarc_summary -ad\n")
+    sys.stdout.write("\tarc_summary --page=2\n")
+
+
+def main():
+ """Main function"""
+
+ global show_tunable_descriptions
+ global alternate_tunable_layout
+
+ try:
+ opts, args = getopt.getopt(
+ sys.argv[1:],
+ "adp:h", ["alternate", "description", "page=", "help"]
+ )
+ except getopt.error as e:
+ sys.stderr.write("Error: %s\n" % e.msg)
+ usage()
+ sys.exit(1)
+
+ args = {}
+ for opt, arg in opts:
+ if opt in ('-a', '--alternate'):
+ args['a'] = True
+ if opt in ('-d', '--description'):
+ args['d'] = True
+ if opt in ('-p', '--page'):
+ args['p'] = arg
+ if opt in ('-h', '--help'):
+ usage()
+ sys.exit(0)
+
+ Kstat = get_Kstat()
+
+ alternate_tunable_layout = 'a' in args
+ show_tunable_descriptions = 'd' in args
+
+ pages = []
+
+ if 'p' in args:
+ try:
+ pages.append(unSub[int(args['p']) - 1])
+ except IndexError:
+ sys.stderr.write('the argument to -p must be between 1 and ' +
+ str(len(unSub)) + '\n')
+ sys.exit(1)
+ else:
+ pages = unSub
+
+ zfs_header()
+ for page in pages:
+ page(Kstat)
+ sys.stdout.write("\n")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/cmd/arc_summary/arc_summary3 b/cmd/arc_summary/arc_summary3
new file mode 100755
index 000000000000..c920b8e5395d
--- /dev/null
+++ b/cmd/arc_summary/arc_summary3
@@ -0,0 +1,943 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2008 Ben Rockwood <benr@cuddletech.com>,
+# Copyright (c) 2010 Martin Matuska <mm@FreeBSD.org>,
+# Copyright (c) 2010-2011 Jason J. Hellenthal <jhell@DataIX.net>,
+# Copyright (c) 2017 Scot W. Stevenson <scot.stevenson@gmail.com>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+"""Print statistics on the ZFS ARC Cache and other information
+
+Provides basic information on the ARC, its efficiency, the L2ARC (if present),
+the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See
+the in-source documentation and code at
+https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details.
+The original introduction to arc_summary can be found at
+http://cuddletech.com/?p=454
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+
+# Module-wide formatting constants shared by all print helpers.
+DESCRIPTION = 'Print ARC and other statistics for ZFS on Linux'
+INDENT = ' '*8
+LINE_LENGTH = 72
+DATE_FORMAT = '%a %b %d %H:%M:%S %Y'
+TITLE = 'ZFS Subsystem Report'
+
+SECTIONS = 'arc archits dmu l2arc spl tunables vdev zil'.split()
+SECTION_HELP = 'print info from one section ('+' '.join(SECTIONS)+')'
+
+# Tunables and SPL are handled separately because they come from
+# different sources
+SECTION_PATHS = {'arc': 'arcstats',
+                 'dmu': 'dmu_tx',
+                 'l2arc': 'arcstats',   # L2ARC stuff lives in arcstats
+                 'vdev': 'vdev_cache_stats',
+                 'xuio': 'xuio_stats',
+                 'zfetch': 'zfetchstats',
+                 'zil': 'zil'}
+
+# Command-line interface; ARGS is parsed once at import time and read
+# by the formatting helpers (e.g. format_raw_line checks ARGS.alt).
+parser = argparse.ArgumentParser(description=DESCRIPTION)
+parser.add_argument('-a', '--alternate', action='store_true', default=False,
+                    help='use alternate formatting for tunables and SPL',
+                    dest='alt')
+parser.add_argument('-d', '--description', action='store_true', default=False,
+                    help='print descriptions with tunables and SPL',
+                    dest='desc')
+parser.add_argument('-g', '--graph', action='store_true', default=False,
+                    help='print graph on ARC use and exit', dest='graph')
+parser.add_argument('-p', '--page', type=int, dest='page',
+                    help='print page by number (DEPRECATED, use "-s")')
+parser.add_argument('-r', '--raw', action='store_true', default=False,
+                    help='dump all available data with minimal formatting',
+                    dest='raw')
+parser.add_argument('-s', '--section', dest='section', help=SECTION_HELP)
+ARGS = parser.parse_args()
+
+
+# Platform abstraction layer: define the same set of accessor functions
+# (load_kstats, get_tunable_params, get_vdev_params, get_version_impl,
+# get_descriptions, ...) for FreeBSD (via py-sysctl) and Linux (via
+# /proc and /sys), so the report code below is platform-independent.
+if sys.platform.startswith('freebsd'):
+    # Requires py36-sysctl on FreeBSD
+    import sysctl
+
+    VDEV_CACHE_SIZE = 'vdev.cache_size'
+
+    def load_kstats(section):
+        base = 'kstat.zfs.misc.{section}.'.format(section=section)
+        # base is removed from the name
+        fmt = lambda kstat: '{name} : {value}'.format(name=kstat.name[len(base):],
+                                                      value=kstat.value)
+        return [fmt(kstat) for kstat in sysctl.filter(base)]
+
+    def get_params(base):
+        cut = 8  # = len('vfs.zfs.')
+        return {ctl.name[cut:]: str(ctl.value) for ctl in sysctl.filter(base)}
+
+    def get_tunable_params():
+        return get_params('vfs.zfs')
+
+    def get_vdev_params():
+        return get_params('vfs.zfs.vdev')
+
+    def get_version_impl(request):
+        # FreeBSD reports versions for zpl and spa instead of zfs and spl.
+        name = {'zfs': 'zpl',
+                'spl': 'spa'}[request]
+        mib = 'vfs.zfs.version.{}'.format(name)
+        version = sysctl.filter(mib)[0].value
+        return '{} version {}'.format(name, version)
+
+    def get_descriptions(_request):
+        # py-sysctl doesn't give descriptions, so we have to shell out.
+        command = ['sysctl', '-d', 'vfs.zfs']
+
+        # The recommended way to do this is with subprocess.run(). However,
+        # some installed versions of Python are < 3.5, so we offer them
+        # the option of doing it the old way (for now)
+        if 'run' in dir(subprocess):
+            info = subprocess.run(command, stdout=subprocess.PIPE,
+                                  universal_newlines=True)
+            lines = info.stdout.split('\n')
+        else:
+            info = subprocess.check_output(command, universal_newlines=True)
+            lines = info.split('\n')
+
+        def fmt(line):
+            name, desc = line.split(':', 1)
+            return (name.strip(), desc.strip())
+
+        return dict([fmt(line) for line in lines if len(line) > 0])
+
+
+elif sys.platform.startswith('linux'):
+    KSTAT_PATH = '/proc/spl/kstat/zfs'
+    SPL_PATH = '/sys/module/spl/parameters'
+    TUNABLES_PATH = '/sys/module/zfs/parameters'
+
+    VDEV_CACHE_SIZE = 'zfs_vdev_cache_size'
+
+    def load_kstats(section):
+        path = os.path.join(KSTAT_PATH, section)
+        with open(path) as f:
+            return list(f)[2:] # Get rid of header
+
+    def get_params(basepath):
+        """Collect information on the Solaris Porting Layer (SPL) or the
+        tunables, depending on the PATH given. Does not check if PATH is
+        legal.
+        """
+        result = {}
+        for name in os.listdir(basepath):
+            path = os.path.join(basepath, name)
+            with open(path) as f:
+                value = f.read()
+            result[name] = value.strip()
+        return result
+
+    def get_spl_params():
+        return get_params(SPL_PATH)
+
+    def get_tunable_params():
+        return get_params(TUNABLES_PATH)
+
+    def get_vdev_params():
+        return get_params(TUNABLES_PATH)
+
+    def get_version_impl(request):
+        # The original arc_summary called /sbin/modinfo/{spl,zfs} to get
+        # the version information. We switch to /sys/module/{spl,zfs}/version
+        # to make sure we get what is really loaded in the kernel
+        command = ["cat", "/sys/module/{0}/version".format(request)]
+        req = request.upper()
+
+        # The recommended way to do this is with subprocess.run(). However,
+        # some installed versions of Python are < 3.5, so we offer them
+        # the option of doing it the old way (for now)
+        if 'run' in dir(subprocess):
+            info = subprocess.run(command, stdout=subprocess.PIPE,
+                                  universal_newlines=True)
+            version = info.stdout.strip()
+        else:
+            info = subprocess.check_output(command, universal_newlines=True)
+            version = info.strip()
+
+        return version
+
+    def get_descriptions(request):
+        """Get the descriptions of the Solaris Porting Layer (SPL) or the
+        tunables, return with minimal formatting.
+        """
+
+        if request not in ('spl', 'zfs'):
+            print('ERROR: description of "{0}" requested)'.format(request))
+            sys.exit(1)
+
+        descs = {}
+        target_prefix = 'parm:'
+
+        # We would prefer to do this with /sys/modules -- see the discussion at
+        # get_version() -- but there isn't a way to get the descriptions from
+        # there, so we fall back on modinfo
+        command = ["/sbin/modinfo", request, "-0"]
+
+        # The recommended way to do this is with subprocess.run(). However,
+        # some installed versions of Python are < 3.5, so we offer them
+        # the option of doing it the old way (for now)
+        info = ''
+
+        try:
+
+            if 'run' in dir(subprocess):
+                info = subprocess.run(command, stdout=subprocess.PIPE,
+                                      universal_newlines=True)
+                raw_output = info.stdout.split('\0')
+            else:
+                info = subprocess.check_output(command,
+                                               universal_newlines=True)
+                raw_output = info.split('\0')
+
+        except subprocess.CalledProcessError:
+            print("Error: Descriptions not available",
+                  "(can't access kernel module)")
+            sys.exit(1)
+
+        # modinfo -0 emits NUL-separated records; keep only the "parm:"
+        # entries and strip the trailing "(type)" annotation.
+        for line in raw_output:
+
+            if not line.startswith(target_prefix):
+                continue
+
+            line = line[len(target_prefix):].strip()
+            name, raw_desc = line.split(':', 1)
+            desc = raw_desc.rsplit('(', 1)[0]
+
+            if desc == '':
+                desc = '(No description found)'
+
+            descs[name.strip()] = desc.strip()
+
+        return descs
+
+
+def cleanup_line(single_line):
+    """Format a raw line of data from /proc and isolate the name value
+    part, returning a tuple with each. Currently, this gets rid of the
+    middle '4'. For example "arc_no_grow 4 0" returns the tuple
+    ("arc_no_grow", "0").
+
+    NOTE(review): assumes the line has exactly three whitespace-separated
+    fields; a malformed line raises ValueError from the unpacking.
+    """
+    name, _, value = single_line.split()
+
+    return name, value
+
+
+def draw_graph(kstats_dict):
+    """Draw a primitive graph representing the basic information on the
+    ARC -- its size and the proportion used by MFU and MRU -- and quit.
+    We use max size of the ARC to calculate how full it is. This is a
+    very rough representation.
+    """
+
+    arc_stats = isolate_section('arcstats', kstats_dict)
+
+    GRAPH_INDENT = ' '*4
+    GRAPH_WIDTH = 60
+    # Human-readable values for the caption line above the graph
+    arc_size = f_bytes(arc_stats['size'])
+    arc_perc = f_perc(arc_stats['size'], arc_stats['c_max'])
+    mfu_size = f_bytes(arc_stats['mfu_size'])
+    mru_size = f_bytes(arc_stats['mru_size'])
+    meta_limit = f_bytes(arc_stats['arc_meta_limit'])
+    meta_size = f_bytes(arc_stats['arc_meta_used'])
+    dnode_limit = f_bytes(arc_stats['arc_dnode_limit'])
+    dnode_size = f_bytes(arc_stats['dnode_size'])
+
+    info_form = ('ARC: {0} ({1})  MFU: {2}  MRU: {3}  META: {4} ({5}) '
+                 'DNODE {6} ({7})')
+    info_line = info_form.format(arc_size, arc_perc, mfu_size, mru_size,
+                                 meta_size, meta_limit, dnode_size,
+                                 dnode_limit)
+    # Center the caption over the graph box
+    info_spc = ' '*int((GRAPH_WIDTH-len(info_line))/2)
+    info_line = GRAPH_INDENT+info_spc+info_line
+
+    graph_line = GRAPH_INDENT+'+'+('-'*(GRAPH_WIDTH-2))+'+'
+
+    # Proportions relative to c_max decide how many character cells
+    # ("ticks") each category fills: F = MFU, R = MRU, O = other
+    mfu_perc = float(int(arc_stats['mfu_size'])/int(arc_stats['c_max']))
+    mru_perc = float(int(arc_stats['mru_size'])/int(arc_stats['c_max']))
+    arc_perc = float(int(arc_stats['size'])/int(arc_stats['c_max']))
+    total_ticks = float(arc_perc)*GRAPH_WIDTH
+    mfu_ticks = mfu_perc*GRAPH_WIDTH
+    mru_ticks = mru_perc*GRAPH_WIDTH
+    other_ticks = total_ticks-(mfu_ticks+mru_ticks)
+
+    core_form = 'F'*int(mfu_ticks)+'R'*int(mru_ticks)+'O'*int(other_ticks)
+    core_spc = ' '*(GRAPH_WIDTH-(2+len(core_form)))
+    core_line = GRAPH_INDENT+'|'+core_form+core_spc+'|'
+
+    for line in ('', info_line, graph_line, core_line, graph_line, ''):
+        print(line)
+
+
+def f_bytes(byte_string):
+    """Return human-readable representation of a byte value in
+    powers of 2 (eg "KiB" for "kibibytes", etc) to two decimal
+    points. Values smaller than one KiB are returned without
+    decimal points. Note "bytes" is a reserved keyword.
+
+    byte_string may be an int or a numeric string (it is passed
+    through int() first).
+    """
+
+    prefixes = ([2**80, "YiB"],   # yobibytes (yotta)
+                [2**70, "ZiB"],   # zebibytes (zetta)
+                [2**60, "EiB"],   # exbibytes (exa)
+                [2**50, "PiB"],   # pebibytes (peta)
+                [2**40, "TiB"],   # tebibytes (tera)
+                [2**30, "GiB"],   # gibibytes (giga)
+                [2**20, "MiB"],   # mebibytes (mega)
+                [2**10, "KiB"])   # kibibytes (kilo)
+
+    bites = int(byte_string)
+
+    if bites >= 2**10:
+        # Largest prefix first, so the first match is the right one
+        for limit, unit in prefixes:
+
+            if bites >= limit:
+                value = bites / limit
+                break
+
+        result = '{0:.1f} {1}'.format(value, unit)
+    else:
+        result = '{0} Bytes'.format(bites)
+
+    return result
+
+
+def f_hits(hits_string):
+    """Create a human-readable representation of the number of hits.
+    The single-letter symbols used are SI to avoid the confusion caused
+    by the different "short scale" and "long scale" representations in
+    English, which use the same words for different values. See
+    https://en.wikipedia.org/wiki/Names_of_large_numbers and:
+    https://physics.nist.gov/cuu/Units/prefixes.html
+
+    hits_string may be an int or a numeric string (it is passed
+    through int() first).
+    """
+
+    numbers = ([10**24, 'Y'],  # yotta (septillion)
+               [10**21, 'Z'],  # zetta (sextillion)
+               [10**18, 'E'],  # exa (quintrillion)
+               [10**15, 'P'],  # peta (quadrillion)
+               [10**12, 'T'],  # tera (trillion)
+               [10**9, 'G'],   # giga (billion)
+               [10**6, 'M'],   # mega (million)
+               [10**3, 'k'])   # kilo (thousand)
+
+    hits = int(hits_string)
+
+    if hits >= 1000:
+        # Largest prefix first, so the first match is the right one
+        for limit, symbol in numbers:
+
+            if hits >= limit:
+                value = hits/limit
+                break
+
+        result = "%0.1f%s" % (value, symbol)
+    else:
+        result = "%d" % hits
+
+    return result
+
+
+def f_perc(value1, value2):
+    """Calculate percentage and return in human-readable form. If
+    rounding produces the result '0.0' though the first number is
+    not zero, include a 'less-than' symbol to avoid confusion.
+    Division by zero is handled by returning 'n/a'; no error
+    is called.
+
+    Both arguments may be ints or numeric strings (they are passed
+    through float() first).
+    """
+
+    v1 = float(value1)
+    v2 = float(value2)
+
+    try:
+        perc = 100 * v1/v2
+    except ZeroDivisionError:
+        result = 'n/a'
+    else:
+        result = '{0:0.1f} %'.format(perc)
+
+    # Distinguish "truly zero" from "rounded down to zero"
+    if result == '0.0 %' and v1 > 0:
+        result = '< 0.1 %'
+
+    return result
+
+
+def format_raw_line(name, value):
+    """For the --raw option for the tunable and SPL outputs, decide on the
+    correct formatting based on the --alternate flag.
+
+    Reads the module-level ARGS namespace: with --alternate the output is
+    "name=value"; otherwise the value is right-aligned to LINE_LENGTH.
+    """
+
+    if ARGS.alt:
+        result = '{0}{1}={2}'.format(INDENT, name, value)
+    else:
+        # Pad so the value column ends at LINE_LENGTH
+        spc = LINE_LENGTH-(len(INDENT)+len(value))
+        result = '{0}{1:<{spc}}{2}'.format(INDENT, name, value, spc=spc)
+
+    return result
+
+
+def get_kstats():
+    """Collect information on the ZFS subsystem. The step does not perform any
+    further processing, giving us the option to only work on what is actually
+    needed. The name "kstat" is a holdover from the Solaris utility of the same
+    name.
+
+    Returns a dict mapping each kstat section name to the raw list of
+    lines returned by the platform-specific load_kstats().
+    """
+
+    result = {}
+
+    # Several sections share a source (e.g. arc and l2arc both map to
+    # 'arcstats'), so guard against loading the same file twice.
+    for section in SECTION_PATHS.values():
+        if section not in result:
+            result[section] = load_kstats(section)
+
+    return result
+
+
+def get_version(request):
+    """Get the version number of ZFS or SPL on this machine for header.
+    Returns an error string, but does not raise an error, if we can't
+    get the ZFS/SPL version.
+
+    request must be 'zfs' or 'spl'; anything else yields an error
+    string instead of a version.
+    """
+
+    if request not in ('spl', 'zfs'):
+        error_msg = '(ERROR: "{0}" requested)'.format(request)
+        return error_msg
+
+    return get_version_impl(request)
+
+
+def print_header():
+    """Print the initial heading with date and time as well as info on the
+    kernel and ZFS versions. This is not called for the graph.
+    """
+
+    # datetime is now recommended over time but we keep the exact formatting
+    # from the older version of arc_summary in case there are scripts
+    # that expect it in this way
+    daydate = time.strftime(DATE_FORMAT)
+    spc_date = LINE_LENGTH-len(daydate)
+    sys_version = os.uname()
+
+    sys_msg = sys_version.sysname+' '+sys_version.release
+    zfs = get_version('zfs')
+    spc_zfs = LINE_LENGTH-len(zfs)
+
+    machine_msg = 'Machine: '+sys_version.nodename+' ('+sys_version.machine+')'
+    spl = get_version('spl')
+    spc_spl = LINE_LENGTH-len(spl)
+
+    # Left-justify the text columns so the right-hand values (date,
+    # zfs/spl versions) line up at LINE_LENGTH
+    print('\n'+('-'*LINE_LENGTH))
+    print('{0:<{spc}}{1}'.format(TITLE, daydate, spc=spc_date))
+    print('{0:<{spc}}{1}'.format(sys_msg, zfs, spc=spc_zfs))
+    print('{0:<{spc}}{1}\n'.format(machine_msg, spl, spc=spc_spl))
+
+
+def print_raw(kstats_dict):
+    """Print all available data from the system in a minimally sorted format.
+    This can be used as a source to be piped through 'grep'.
+    """
+
+    sections = sorted(kstats_dict.keys())
+
+    for section in sections:
+
+        print('\n{0}:'.format(section.upper()))
+        lines = sorted(kstats_dict[section])
+
+        for line in lines:
+            name, value = cleanup_line(line)
+            print(format_raw_line(name, value))
+
+    # Tunables and SPL must be handled separately because they come from a
+    # different source and have descriptions the user might request
+    print()
+    section_spl()
+    section_tunables()
+
+
+def isolate_section(section_name, kstats_dict):
+ """From the complete information on all sections, retrieve only those
+ for one section.
+ """
+
+ try:
+ section_data = kstats_dict[section_name]
+ except KeyError:
+ print('ERROR: Data on {0} not available'.format(section_data))
+ sys.exit(1)
+
+ section_dict = dict(cleanup_line(l) for l in section_data)
+
+ return section_dict
+
+
+# Formatted output helper functions
+
+
+def prt_1(text, value):
+    """Print text and one value, no indent; value right-aligned to
+    LINE_LENGTH."""
+    spc = ' '*(LINE_LENGTH-(len(text)+len(value)))
+    print('{0}{spc}{1}'.format(text, value, spc=spc))
+
+
+def prt_i1(text, value):
+    """Print text and one value, with indent; value right-aligned to
+    LINE_LENGTH."""
+    spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(value)))
+    print(INDENT+'{0}{spc}{1}'.format(text, value, spc=spc))
+
+
+def prt_2(text, value1, value2):
+    """Print text and two values, no indent; each value right-aligned
+    in a 9-character column ending at LINE_LENGTH."""
+    values = '{0:>9}  {1:>9}'.format(value1, value2)
+    spc = ' '*(LINE_LENGTH-(len(text)+len(values)+2))
+    print('{0}{spc}  {1}'.format(text, values, spc=spc))
+
+
+def prt_i2(text, value1, value2):
+    """Print text and two values, with indent; each value right-aligned
+    in a 9-character column ending at LINE_LENGTH."""
+    values = '{0:>9}  {1:>9}'.format(value1, value2)
+    spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(values)+2))
+    print(INDENT+'{0}{spc}  {1}'.format(text, values, spc=spc))
+
+
+# The section output concentrates on important parameters instead of
+# being exhaustive (that is what the --raw parameter is for)
+
+
+def section_arc(kstats_dict):
+    """Give basic information on the ARC, MRU and MFU. This is the first
+    and most used section.
+    """
+
+    arc_stats = isolate_section('arcstats', kstats_dict)
+
+    # A non-zero throttle count means the ARC had to shrink because the
+    # system was low on memory.
+    throttle = arc_stats['memory_throttle_count']
+
+    if throttle == '0':
+        health = 'HEALTHY'
+    else:
+        health = 'THROTTLED'
+
+    prt_1('ARC status:', health)
+    prt_i1('Memory throttle count:', throttle)
+    print()
+
+    arc_size = arc_stats['size']
+    arc_target_size = arc_stats['c']
+    arc_max = arc_stats['c_max']
+    arc_min = arc_stats['c_min']
+    mfu_size = arc_stats['mfu_size']
+    mru_size = arc_stats['mru_size']
+    meta_limit = arc_stats['arc_meta_limit']
+    meta_size = arc_stats['arc_meta_used']
+    dnode_limit = arc_stats['arc_dnode_limit']
+    dnode_size = arc_stats['dnode_size']
+    # Shown as "N:1" next to the max-size line
+    target_size_ratio = '{0}:1'.format(int(arc_max) // int(arc_min))
+
+    prt_2('ARC size (current):',
+          f_perc(arc_size, arc_max), f_bytes(arc_size))
+    prt_i2('Target size (adaptive):',
+           f_perc(arc_target_size, arc_max), f_bytes(arc_target_size))
+    prt_i2('Min size (hard limit):',
+           f_perc(arc_min, arc_max), f_bytes(arc_min))
+    prt_i2('Max size (high water):',
+           target_size_ratio, f_bytes(arc_max))
+    # MFU/MRU percentages are relative to their combined size, not c_max
+    caches_size = int(mfu_size)+int(mru_size)
+    prt_i2('Most Frequently Used (MFU) cache size:',
+           f_perc(mfu_size, caches_size), f_bytes(mfu_size))
+    prt_i2('Most Recently Used (MRU) cache size:',
+           f_perc(mru_size, caches_size), f_bytes(mru_size))
+    prt_i2('Metadata cache size (hard limit):',
+           f_perc(meta_limit, arc_max), f_bytes(meta_limit))
+    prt_i2('Metadata cache size (current):',
+           f_perc(meta_size, meta_limit), f_bytes(meta_size))
+    prt_i2('Dnode cache size (hard limit):',
+           f_perc(dnode_limit, meta_limit), f_bytes(dnode_limit))
+    prt_i2('Dnode cache size (current):',
+           f_perc(dnode_size, dnode_limit), f_bytes(dnode_size))
+    print()
+
+    print('ARC hash breakdown:')
+    prt_i1('Elements max:', f_hits(arc_stats['hash_elements_max']))
+    prt_i2('Elements current:',
+           f_perc(arc_stats['hash_elements'], arc_stats['hash_elements_max']),
+           f_hits(arc_stats['hash_elements']))
+    prt_i1('Collisions:', f_hits(arc_stats['hash_collisions']))
+
+    prt_i1('Chain max:', f_hits(arc_stats['hash_chain_max']))
+    prt_i1('Chains:', f_hits(arc_stats['hash_chains']))
+    print()
+
+    print('ARC misc:')
+    prt_i1('Deleted:', f_hits(arc_stats['deleted']))
+    prt_i1('Mutex misses:', f_hits(arc_stats['mutex_miss']))
+    prt_i1('Eviction skips:', f_hits(arc_stats['evict_skip']))
+    print()
+
+
+def section_archits(kstats_dict):
+    """Print information on how the caches are accessed ("arc hits").
+    """
+
+    arc_stats = isolate_section('arcstats', kstats_dict)
+    all_accesses = int(arc_stats['hits'])+int(arc_stats['misses'])
+    # "Actual" hits are those served directly from MFU or MRU
+    actual_hits = int(arc_stats['mfu_hits'])+int(arc_stats['mru_hits'])
+
+    prt_1('ARC total accesses (hits + misses):', f_hits(all_accesses))
+    ta_todo = (('Cache hit ratio:', arc_stats['hits']),
+               ('Cache miss ratio:', arc_stats['misses']),
+               ('Actual hit ratio (MFU + MRU hits):', actual_hits))
+
+    for title, value in ta_todo:
+        prt_i2(title, f_perc(value, all_accesses), f_hits(value))
+
+    dd_total = int(arc_stats['demand_data_hits']) +\
+        int(arc_stats['demand_data_misses'])
+    prt_i2('Data demand efficiency:',
+           f_perc(arc_stats['demand_data_hits'], dd_total),
+           f_hits(dd_total))
+
+    dp_total = int(arc_stats['prefetch_data_hits']) +\
+        int(arc_stats['prefetch_data_misses'])
+    prt_i2('Data prefetch efficiency:',
+           f_perc(arc_stats['prefetch_data_hits'], dp_total),
+           f_hits(dp_total))
+
+    # Hits not attributable to MFU/MRU or their ghost lists are counted
+    # as "anonymous"
+    known_hits = int(arc_stats['mfu_hits']) +\
+        int(arc_stats['mru_hits']) +\
+        int(arc_stats['mfu_ghost_hits']) +\
+        int(arc_stats['mru_ghost_hits'])
+
+    anon_hits = int(arc_stats['hits'])-known_hits
+
+    print()
+    print('Cache hits by cache type:')
+    cl_todo = (('Most frequently used (MFU):', arc_stats['mfu_hits']),
+               ('Most recently used (MRU):', arc_stats['mru_hits']),
+               ('Most frequently used (MFU) ghost:',
+                arc_stats['mfu_ghost_hits']),
+               ('Most recently used (MRU) ghost:',
+                arc_stats['mru_ghost_hits']))
+
+    for title, value in cl_todo:
+        prt_i2(title, f_perc(value, arc_stats['hits']), f_hits(value))
+
+    # For some reason, anon_hits can turn negative, which is weird. Until we
+    # have figured out why this happens, we just hide the problem, following
+    # the behavior of the original arc_summary.
+    if anon_hits >= 0:
+        prt_i2('Anonymously used:',
+               f_perc(anon_hits, arc_stats['hits']), f_hits(anon_hits))
+
+    print()
+    print('Cache hits by data type:')
+    dt_todo = (('Demand data:', arc_stats['demand_data_hits']),
+               ('Demand prefetch data:', arc_stats['prefetch_data_hits']),
+               ('Demand metadata:', arc_stats['demand_metadata_hits']),
+               ('Demand prefetch metadata:',
+                arc_stats['prefetch_metadata_hits']))
+
+    for title, value in dt_todo:
+        prt_i2(title, f_perc(value, arc_stats['hits']), f_hits(value))
+
+    print()
+    print('Cache misses by data type:')
+    dm_todo = (('Demand data:', arc_stats['demand_data_misses']),
+               ('Demand prefetch data:',
+                arc_stats['prefetch_data_misses']),
+               ('Demand metadata:', arc_stats['demand_metadata_misses']),
+               ('Demand prefetch metadata:',
+                arc_stats['prefetch_metadata_misses']))
+
+    for title, value in dm_todo:
+        prt_i2(title, f_perc(value, arc_stats['misses']), f_hits(value))
+
+    print()
+
+
+def section_dmu(kstats_dict):
+    """Print DMU prefetch (zfetch) efficiency: total accesses plus
+    hit and miss ratios."""
+
+    zfetch_stats = isolate_section('zfetchstats', kstats_dict)
+
+    zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses'])
+
+    prt_1('DMU prefetch efficiency:', f_hits(zfetch_access_total))
+    prt_i2('Hit ratio:', f_perc(zfetch_stats['hits'], zfetch_access_total),
+           f_hits(zfetch_stats['hits']))
+    prt_i2('Miss ratio:', f_perc(zfetch_stats['misses'], zfetch_access_total),
+           f_hits(zfetch_stats['misses']))
+    print()
+
+
+def section_l2arc(kstats_dict):
+ """Collect information on L2ARC device if present. If not, tell user
+ that we're skipping the section.
+ """
+
+ # The L2ARC statistics live in the same section as the normal ARC stuff
+ arc_stats = isolate_section('arcstats', kstats_dict)
+
+ if arc_stats['l2_size'] == '0':
+ print('L2ARC not detected, skipping section\n')
+ return
+
+ l2_errors = int(arc_stats['l2_writes_error']) +\
+ int(arc_stats['l2_cksum_bad']) +\
+ int(arc_stats['l2_io_error'])
+
+ l2_access_total = int(arc_stats['l2_hits'])+int(arc_stats['l2_misses'])
+ health = 'HEALTHY'
+
+ if l2_errors > 0:
+ health = 'DEGRADED'
+
+ prt_1('L2ARC status:', health)
+
+ l2_todo = (('Low memory aborts:', 'l2_abort_lowmem'),
+ ('Free on write:', 'l2_free_on_write'),
+ ('R/W clashes:', 'l2_rw_clash'),
+ ('Bad checksums:', 'l2_cksum_bad'),
+ ('I/O errors:', 'l2_io_error'))
+
+ for title, value in l2_todo:
+ prt_i1(title, f_hits(arc_stats[value]))
+
+ print()
+ prt_1('L2ARC size (adaptive):', f_bytes(arc_stats['l2_size']))
+ prt_i2('Compressed:', f_perc(arc_stats['l2_asize'], arc_stats['l2_size']),
+ f_bytes(arc_stats['l2_asize']))
+ prt_i2('Header size:',
+ f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']),
+ f_bytes(arc_stats['l2_hdr_size']))
+
+ print()
+ prt_1('L2ARC breakdown:', f_hits(l2_access_total))
+ prt_i2('Hit ratio:',
+ f_perc(arc_stats['l2_hits'], l2_access_total),
+ f_hits(arc_stats['l2_hits']))
+ prt_i2('Miss ratio:',
+ f_perc(arc_stats['l2_misses'], l2_access_total),
+ f_hits(arc_stats['l2_misses']))
+ prt_i1('Feeds:', f_hits(arc_stats['l2_feeds']))
+
+ print()
+ print('L2ARC writes:')
+
+ if arc_stats['l2_writes_done'] != arc_stats['l2_writes_sent']:
+ prt_i2('Writes sent:', 'FAULTED', f_hits(arc_stats['l2_writes_sent']))
+ prt_i2('Done ratio:',
+ f_perc(arc_stats['l2_writes_done'],
+ arc_stats['l2_writes_sent']),
+ f_hits(arc_stats['l2_writes_done']))
+ prt_i2('Error ratio:',
+ f_perc(arc_stats['l2_writes_error'],
+ arc_stats['l2_writes_sent']),
+ f_hits(arc_stats['l2_writes_error']))
+ else:
+ prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent']))
+
+ print()
+ print('L2ARC evicts:')
+ prt_i1('Lock retries:', f_hits(arc_stats['l2_evict_lock_retry']))
+ prt_i1('Upon reading:', f_hits(arc_stats['l2_evict_reading']))
+ print()
+
+
+def section_spl(*_):
+ """Print the SPL parameters, if requested with alternative format
+ and/or descriptions. This does not use kstats.
+ """
+
+ if sys.platform.startswith('freebsd'):
+ # No SPL support in FreeBSD
+ return
+
+ spls = get_spl_params()
+ keylist = sorted(spls.keys())
+ print('Solaris Porting Layer (SPL):')
+
+ if ARGS.desc:
+ descriptions = get_descriptions('spl')
+
+ for key in keylist:
+ value = spls[key]
+
+ if ARGS.desc:
+ try:
+ print(INDENT+'#', descriptions[key])
+ except KeyError:
+ print(INDENT+'# (No description found)') # paranoid
+
+ print(format_raw_line(key, value))
+
+ print()
+
+
+def section_tunables(*_):
+    """Print the tunables, if requested with alternative format and/or
+    descriptions. This does not use kstats.
+    """
+
+    tunables = get_tunable_params()
+    keylist = sorted(tunables.keys())
+    print('Tunables:')
+
+    if ARGS.desc:
+        descriptions = get_descriptions('zfs')
+
+    for key in keylist:
+        value = tunables[key]
+
+        if ARGS.desc:
+            try:
+                print(INDENT+'#', descriptions[key])
+            except KeyError:
+                print(INDENT+'# (No description found)')  # paranoid
+
+        print(format_raw_line(key, value))
+
+    print()
+
+
+def section_vdev(kstats_dict):
+ """Collect information on VDEV caches"""
+
+ # Currently [Nov 2017] the VDEV cache is disabled, because it is actually
+ # harmful. When this is the case, we just skip the whole entry. See
+ # https://github.com/zfsonlinux/zfs/blob/master/module/zfs/vdev_cache.c
+ # for details
+ tunables = get_vdev_params()
+
+ if tunables[VDEV_CACHE_SIZE] == '0':
+ print('VDEV cache disabled, skipping section\n')
+ return
+
+ vdev_stats = isolate_section('vdev_cache_stats', kstats_dict)
+
+ vdev_cache_total = int(vdev_stats['hits']) +\
+ int(vdev_stats['misses']) +\
+ int(vdev_stats['delegations'])
+
+ prt_1('VDEV cache summary:', f_hits(vdev_cache_total))
+ prt_i2('Hit ratio:', f_perc(vdev_stats['hits'], vdev_cache_total),
+ f_hits(vdev_stats['hits']))
+ prt_i2('Miss ratio:', f_perc(vdev_stats['misses'], vdev_cache_total),
+ f_hits(vdev_stats['misses']))
+ prt_i2('Delegations:', f_perc(vdev_stats['delegations'], vdev_cache_total),
+ f_hits(vdev_stats['delegations']))
+ print()
+
+
+def section_zil(kstats_dict):
+ """Collect information on the ZFS Intent Log. Some of the information
+ taken from https://github.com/zfsonlinux/zfs/blob/master/include/sys/zil.h
+ """
+
+ zil_stats = isolate_section('zil', kstats_dict)
+
+ prt_1('ZIL committed transactions:',
+ f_hits(zil_stats['zil_itx_count']))
+ prt_i1('Commit requests:', f_hits(zil_stats['zil_commit_count']))
+ prt_i1('Flushes to stable storage:',
+ f_hits(zil_stats['zil_commit_writer_count']))
+ prt_i2('Transactions to SLOG storage pool:',
+ f_bytes(zil_stats['zil_itx_metaslab_slog_bytes']),
+ f_hits(zil_stats['zil_itx_metaslab_slog_count']))
+ prt_i2('Transactions to non-SLOG storage pool:',
+ f_bytes(zil_stats['zil_itx_metaslab_normal_bytes']),
+ f_hits(zil_stats['zil_itx_metaslab_normal_count']))
+ print()
+
+
+section_calls = {'arc': section_arc,
+ 'archits': section_archits,
+ 'dmu': section_dmu,
+ 'l2arc': section_l2arc,
+ 'spl': section_spl,
+ 'tunables': section_tunables,
+ 'vdev': section_vdev,
+ 'zil': section_zil}
+
+
+def main():
+ """Run program. The options to draw a graph and to print all data raw are
+ treated separately because they come with their own call.
+ """
+
+ kstats = get_kstats()
+
+ if ARGS.graph:
+ draw_graph(kstats)
+ sys.exit(0)
+
+ print_header()
+
+ if ARGS.raw:
+ print_raw(kstats)
+
+ elif ARGS.section:
+
+ try:
+ section_calls[ARGS.section](kstats)
+ except KeyError:
+ print('Error: Section "{0}" unknown'.format(ARGS.section))
+ sys.exit(1)
+
+ elif ARGS.page:
+ print('WARNING: Pages are deprecated, please use "--section"\n')
+
+ pages_to_calls = {1: 'arc',
+ 2: 'archits',
+ 3: 'l2arc',
+ 4: 'dmu',
+ 5: 'vdev',
+ 6: 'tunables'}
+
+ try:
+ call = pages_to_calls[ARGS.page]
+ except KeyError:
+ print('Error: Page "{0}" not supported'.format(ARGS.page))
+ sys.exit(1)
+ else:
+ section_calls[call](kstats)
+
+ else:
+ # If no parameters were given, we print all sections. We might want to
+ # change the sequence by hand
+ calls = sorted(section_calls.keys())
+
+ for section in calls:
+ section_calls[section](kstats)
+
+ sys.exit(0)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/cmd/arcstat/.gitignore b/cmd/arcstat/.gitignore
new file mode 100644
index 000000000000..6d6cd1ab75fc
--- /dev/null
+++ b/cmd/arcstat/.gitignore
@@ -0,0 +1 @@
+arcstat
diff --git a/cmd/arcstat/Makefile.am b/cmd/arcstat/Makefile.am
new file mode 100644
index 000000000000..d1ba989a0cd8
--- /dev/null
+++ b/cmd/arcstat/Makefile.am
@@ -0,0 +1,5 @@
+include $(top_srcdir)/config/Substfiles.am
+
+bin_SCRIPTS = arcstat
+
+SUBSTFILES += $(bin_SCRIPTS)
diff --git a/cmd/arcstat/arcstat.in b/cmd/arcstat/arcstat.in
new file mode 100755
index 000000000000..c83a1c74599e
--- /dev/null
+++ b/cmd/arcstat/arcstat.in
@@ -0,0 +1,494 @@
+#!/usr/bin/env @PYTHON_SHEBANG@
+#
+# Print out ZFS ARC Statistics exported via kstat(1)
+# For a definition of fields, or usage, use arcstat -v
+#
+# This script was originally a fork of the original arcstat.pl (0.1)
+# by Neelakanth Nadgir, originally published on his Sun blog on
+# 09/18/2007
+# http://blogs.sun.com/realneel/entry/zfs_arc_statistics
+#
+# A new version aimed to improve upon the original by adding features
+# and fixing bugs as needed. This version was maintained by Mike
+# Harsch and was hosted in a public open source repository:
+# http://github.com/mharsch/arcstat
+#
+# but has since moved to the illumos-gate repository.
+#
+# This Python port was written by John Hixson for FreeNAS, introduced
+# in commit e2c29f:
+# https://github.com/freenas/freenas
+#
+# and has been improved by many people since.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Fields have a fixed width. Every interval, we fill the "v"
+# hash with its corresponding value (v[field]=value) using calculate().
+# @hdr is the array of fields that needs to be printed, so we
+# just iterate over this array and print the values using our pretty printer.
+#
+# This script must remain compatible with Python 2.6+ and Python 3.4+.
+#
+
+import sys
+import time
+import getopt
+import re
+import copy
+
+from signal import signal, SIGINT, SIGWINCH, SIG_DFL
+
+
+cols = {
+ # HDR: [Size, Scale, Description]
+ "time": [8, -1, "Time"],
+ "hits": [4, 1000, "ARC reads per second"],
+ "miss": [4, 1000, "ARC misses per second"],
+ "read": [4, 1000, "Total ARC accesses per second"],
+ "hit%": [4, 100, "ARC hit percentage"],
+ "miss%": [5, 100, "ARC miss percentage"],
+ "dhit": [4, 1000, "Demand hits per second"],
+ "dmis": [4, 1000, "Demand misses per second"],
+ "dh%": [3, 100, "Demand hit percentage"],
+ "dm%": [3, 100, "Demand miss percentage"],
+ "phit": [4, 1000, "Prefetch hits per second"],
+ "pmis": [4, 1000, "Prefetch misses per second"],
+ "ph%": [3, 100, "Prefetch hits percentage"],
+ "pm%": [3, 100, "Prefetch miss percentage"],
+ "mhit": [4, 1000, "Metadata hits per second"],
+ "mmis": [4, 1000, "Metadata misses per second"],
+ "mread": [5, 1000, "Metadata accesses per second"],
+ "mh%": [3, 100, "Metadata hit percentage"],
+ "mm%": [3, 100, "Metadata miss percentage"],
+ "arcsz": [5, 1024, "ARC size"],
+ "size": [4, 1024, "ARC size"],
+ "c": [4, 1024, "ARC target size"],
+ "mfu": [4, 1000, "MFU list hits per second"],
+ "mru": [4, 1000, "MRU list hits per second"],
+ "mfug": [4, 1000, "MFU ghost list hits per second"],
+ "mrug": [4, 1000, "MRU ghost list hits per second"],
+ "eskip": [5, 1000, "evict_skip per second"],
+ "mtxmis": [6, 1000, "mutex_miss per second"],
+ "dread": [5, 1000, "Demand accesses per second"],
+ "pread": [5, 1000, "Prefetch accesses per second"],
+ "l2hits": [6, 1000, "L2ARC hits per second"],
+ "l2miss": [6, 1000, "L2ARC misses per second"],
+ "l2read": [6, 1000, "Total L2ARC accesses per second"],
+ "l2hit%": [6, 100, "L2ARC access hit percentage"],
+ "l2miss%": [7, 100, "L2ARC access miss percentage"],
+ "l2asize": [7, 1024, "Actual (compressed) size of the L2ARC"],
+ "l2size": [6, 1024, "Size of the L2ARC"],
+ "l2bytes": [7, 1024, "Bytes read per second from the L2ARC"],
+ "grow": [4, 1000, "ARC grow disabled"],
+ "need": [4, 1024, "ARC reclaim need"],
+ "free": [4, 1024, "ARC free memory"],
+ "avail": [5, 1024, "ARC available memory"],
+ "waste": [5, 1024, "Wasted memory due to round up to pagesize"],
+}
+
+v = {}
+hdr = ["time", "read", "miss", "miss%", "dmis", "dm%", "pmis", "pm%", "mmis",
+ "mm%", "size", "c", "avail"]
+xhdr = ["time", "mfu", "mru", "mfug", "mrug", "eskip", "mtxmis", "dread",
+ "pread", "read"]
+sint = 1 # Default interval is 1 second
+count = 1 # Default count is 1
+hdr_intr = 20 # Print header every 20 lines of output
+opfile = None
+sep = " " # Default separator is 2 spaces
+version = "0.4"
+l2exist = False
+cmd = ("Usage: arcstat [-hvx] [-f fields] [-o file] [-s string] [interval "
+ "[count]]\n")
+cur = {}
+d = {}
+out = None
+kstat = None
+
+
+if sys.platform.startswith('freebsd'):
+ # Requires py27-sysctl on FreeBSD
+ import sysctl
+
+ def kstat_update():
+ global kstat
+
+ k = sysctl.filter('kstat.zfs.misc.arcstats')
+
+ if not k:
+ sys.exit(1)
+
+ kstat = {}
+
+ for s in k:
+ if not s:
+ continue
+
+ name, value = s.name, s.value
+ # Trims 'kstat.zfs.misc.arcstats' from the name
+ kstat[name[24:]] = int(value)
+
+elif sys.platform.startswith('linux'):
+ def kstat_update():
+ global kstat
+
+ k = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')]
+
+ if not k:
+ sys.exit(1)
+
+ del k[0:2]
+ kstat = {}
+
+ for s in k:
+ if not s:
+ continue
+
+ name, unused, value = s.split()
+ kstat[name] = int(value)
+
+
+def detailed_usage():
+ sys.stderr.write("%s\n" % cmd)
+ sys.stderr.write("Field definitions are as follows:\n")
+ for key in cols:
+ sys.stderr.write("%11s : %s\n" % (key, cols[key][2]))
+ sys.stderr.write("\n")
+
+ sys.exit(0)
+
+
+def usage():
+ sys.stderr.write("%s\n" % cmd)
+ sys.stderr.write("\t -h : Print this help message\n")
+ sys.stderr.write("\t -v : List all possible field headers and definitions"
+ "\n")
+ sys.stderr.write("\t -x : Print extended stats\n")
+ sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n")
+ sys.stderr.write("\t -o : Redirect output to the specified file\n")
+ sys.stderr.write("\t -s : Override default field separator with custom "
+ "character or string\n")
+ sys.stderr.write("\nExamples:\n")
+ sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n")
+ sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n")
+ sys.stderr.write("\tarcstat -v\n")
+ sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n")
+ sys.stderr.write("\n")
+
+ sys.exit(1)
+
+
+def snap_stats():
+ global cur
+ global kstat
+
+ prev = copy.deepcopy(cur)
+ kstat_update()
+
+ cur = kstat
+ for key in cur:
+ if re.match(key, "class"):
+ continue
+ if key in prev:
+ d[key] = cur[key] - prev[key]
+ else:
+ d[key] = cur[key]
+
+
+def prettynum(sz, scale, num=0):
+ suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
+ index = 0
+ save = 0
+
+ # Special case for date field
+ if scale == -1:
+ return "%s" % num
+
+ # Rounding error, return 0
+ elif 0 < num < 1:
+ num = 0
+
+ while abs(num) > scale and index < 5:
+ save = num
+ num = num / scale
+ index += 1
+
+ if index == 0:
+ return "%*d" % (sz, num)
+
+ if abs(save / scale) < 10:
+ return "%*.1f%s" % (sz - 1, num, suffix[index])
+ else:
+ return "%*d%s" % (sz - 1, num, suffix[index])
+
+
+def print_values():
+ global hdr
+ global sep
+ global v
+
+ sys.stdout.write(sep.join(
+ prettynum(cols[col][0], cols[col][1], v[col]) for col in hdr))
+
+ sys.stdout.write("\n")
+ sys.stdout.flush()
+
+
+def print_header():
+ global hdr
+ global sep
+
+ sys.stdout.write(sep.join("%*s" % (cols[col][0], col) for col in hdr))
+
+ sys.stdout.write("\n")
+
+
+def get_terminal_lines():
+ try:
+ import fcntl
+ import termios
+ import struct
+ data = fcntl.ioctl(sys.stdout.fileno(), termios.TIOCGWINSZ, '1234')
+ sz = struct.unpack('hh', data)
+ return sz[0]
+ except Exception:
+ pass
+
+
+def update_hdr_intr():
+ global hdr_intr
+
+ lines = get_terminal_lines()
+ if lines and lines > 3:
+ hdr_intr = lines - 3
+
+
+def resize_handler(signum, frame):
+ update_hdr_intr()
+
+
+def init():
+ global sint
+ global count
+ global hdr
+ global xhdr
+ global opfile
+ global sep
+ global out
+ global l2exist
+
+ desired_cols = None
+ xflag = False
+ hflag = False
+ vflag = False
+ i = 1
+
+ try:
+ opts, args = getopt.getopt(
+ sys.argv[1:],
+ "xo:hvs:f:",
+ [
+ "extended",
+ "outfile",
+ "help",
+ "verbose",
+ "separator",
+ "columns"
+ ]
+ )
+ except getopt.error as msg:
+ sys.stderr.write("Error: %s\n" % str(msg))
+ usage()
+ opts = None
+
+ for opt, arg in opts:
+ if opt in ('-x', '--extended'):
+ xflag = True
+ if opt in ('-o', '--outfile'):
+ opfile = arg
+ i += 1
+ if opt in ('-h', '--help'):
+ hflag = True
+ if opt in ('-v', '--verbose'):
+ vflag = True
+ if opt in ('-s', '--separator'):
+ sep = arg
+ i += 1
+ if opt in ('-f', '--columns'):
+ desired_cols = arg
+ i += 1
+ i += 1
+
+ argv = sys.argv[i:]
+ sint = int(argv[0]) if argv else sint
+ count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1)
+
+ if hflag or (xflag and desired_cols):
+ usage()
+
+ if vflag:
+ detailed_usage()
+
+ if xflag:
+ hdr = xhdr
+
+ update_hdr_intr()
+
+ # check if L2ARC exists
+ snap_stats()
+ l2_size = cur.get("l2_size")
+ if l2_size:
+ l2exist = True
+
+ if desired_cols:
+ hdr = desired_cols.split(",")
+
+ invalid = []
+ incompat = []
+ for ele in hdr:
+ if ele not in cols:
+ invalid.append(ele)
+ elif not l2exist and ele.startswith("l2"):
+ sys.stdout.write("No L2ARC Here\n%s\n" % ele)
+ incompat.append(ele)
+
+ if len(invalid) > 0:
+ sys.stderr.write("Invalid column definition! -- %s\n" % invalid)
+ usage()
+
+ if len(incompat) > 0:
+ sys.stderr.write("Incompatible field specified! -- %s\n" %
+ incompat)
+ usage()
+
+ if opfile:
+ try:
+ out = open(opfile, "w")
+ sys.stdout = out
+
+ except IOError:
+ sys.stderr.write("Cannot open %s for writing\n" % opfile)
+ sys.exit(1)
+
+
+def calculate():
+ global d
+ global v
+ global l2exist
+
+ v = dict()
+ v["time"] = time.strftime("%H:%M:%S", time.localtime())
+ v["hits"] = d["hits"] / sint
+ v["miss"] = d["misses"] / sint
+ v["read"] = v["hits"] + v["miss"]
+ v["hit%"] = 100 * v["hits"] / v["read"] if v["read"] > 0 else 0
+ v["miss%"] = 100 - v["hit%"] if v["read"] > 0 else 0
+
+ v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) / sint
+ v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) / sint
+
+ v["dread"] = v["dhit"] + v["dmis"]
+ v["dh%"] = 100 * v["dhit"] / v["dread"] if v["dread"] > 0 else 0
+ v["dm%"] = 100 - v["dh%"] if v["dread"] > 0 else 0
+
+ v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) / sint
+ v["pmis"] = (d["prefetch_data_misses"] +
+ d["prefetch_metadata_misses"]) / sint
+
+ v["pread"] = v["phit"] + v["pmis"]
+ v["ph%"] = 100 * v["phit"] / v["pread"] if v["pread"] > 0 else 0
+ v["pm%"] = 100 - v["ph%"] if v["pread"] > 0 else 0
+
+ v["mhit"] = (d["prefetch_metadata_hits"] +
+ d["demand_metadata_hits"]) / sint
+ v["mmis"] = (d["prefetch_metadata_misses"] +
+ d["demand_metadata_misses"]) / sint
+
+ v["mread"] = v["mhit"] + v["mmis"]
+ v["mh%"] = 100 * v["mhit"] / v["mread"] if v["mread"] > 0 else 0
+ v["mm%"] = 100 - v["mh%"] if v["mread"] > 0 else 0
+
+ v["arcsz"] = cur["size"]
+ v["size"] = cur["size"]
+ v["c"] = cur["c"]
+ v["mfu"] = d["mfu_hits"] / sint
+ v["mru"] = d["mru_hits"] / sint
+ v["mrug"] = d["mru_ghost_hits"] / sint
+ v["mfug"] = d["mfu_ghost_hits"] / sint
+ v["eskip"] = d["evict_skip"] / sint
+ v["mtxmis"] = d["mutex_miss"] / sint
+
+ if l2exist:
+ v["l2hits"] = d["l2_hits"] / sint
+ v["l2miss"] = d["l2_misses"] / sint
+ v["l2read"] = v["l2hits"] + v["l2miss"]
+ v["l2hit%"] = 100 * v["l2hits"] / v["l2read"] if v["l2read"] > 0 else 0
+
+ v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0
+ v["l2asize"] = cur["l2_asize"]
+ v["l2size"] = cur["l2_size"]
+ v["l2bytes"] = d["l2_read_bytes"] / sint
+
+ v["grow"] = 0 if cur["arc_no_grow"] else 1
+ v["need"] = cur["arc_need_free"]
+ v["free"] = cur["memory_free_bytes"]
+ v["avail"] = cur["memory_available_bytes"]
+ v["waste"] = cur["abd_chunk_waste_size"]
+
+
+def main():
+ global sint
+ global count
+ global hdr_intr
+
+ i = 0
+ count_flag = 0
+
+ init()
+ if count > 0:
+ count_flag = 1
+
+ signal(SIGINT, SIG_DFL)
+ signal(SIGWINCH, resize_handler)
+ while True:
+ if i == 0:
+ print_header()
+
+ snap_stats()
+ calculate()
+ print_values()
+
+ if count_flag == 1:
+ if count <= 1:
+ break
+ count -= 1
+
+ i = 0 if i >= hdr_intr else i + 1
+ time.sleep(sint)
+
+ if out:
+ out.close()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/cmd/dbufstat/.gitignore b/cmd/dbufstat/.gitignore
new file mode 100644
index 000000000000..2c2e913cef70
--- /dev/null
+++ b/cmd/dbufstat/.gitignore
@@ -0,0 +1 @@
+dbufstat
diff --git a/cmd/dbufstat/Makefile.am b/cmd/dbufstat/Makefile.am
new file mode 100644
index 000000000000..e672a01a4227
--- /dev/null
+++ b/cmd/dbufstat/Makefile.am
@@ -0,0 +1,5 @@
+include $(top_srcdir)/config/Substfiles.am
+
+bin_SCRIPTS = dbufstat
+
+SUBSTFILES += $(bin_SCRIPTS)
diff --git a/cmd/dbufstat/dbufstat.in b/cmd/dbufstat/dbufstat.in
new file mode 100755
index 000000000000..98eb79057388
--- /dev/null
+++ b/cmd/dbufstat/dbufstat.in
@@ -0,0 +1,669 @@
+#!/usr/bin/env @PYTHON_SHEBANG@
+#
+# Print out statistics for all cached dmu buffers. This information
+# is available through the dbufs kstat and may be post-processed as
+# needed by the script.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (C) 2013 Lawrence Livermore National Security, LLC.
+# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+#
+# This script must remain compatible with Python 2.6+ and Python 3.4+.
+#
+
+import sys
+import getopt
+import errno
+import re
+
+bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"]
+bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize",
+ "meta", "state", "dbholds", "dbc", "list", "atype", "flags",
+ "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2",
+ "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype",
+ "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"]
+bincompat = ["cached", "direct", "indirect", "bonus", "spill"]
+
+dhdr = ["pool", "objset", "object", "dtype", "cached"]
+dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs",
+ "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct",
+ "indirect", "bonus", "spill"]
+dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds",
+ "dbc", "list", "atype", "flags", "count", "asize", "access",
+ "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
+ "l2_comp", "aholds"]
+
+thdr = ["pool", "objset", "dtype", "cached"]
+txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect",
+ "bonus", "spill"]
+tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state",
+ "dbc", "dbholds", "list", "atype", "flags", "count", "asize",
+ "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr",
+ "l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs",
+ "bsize", "lvls", "dholds", "blocks", "dsize"]
+
+cols = {
+ # hdr: [size, scale, description]
+ "pool": [15, -1, "pool name"],
+ "objset": [6, -1, "dataset identification number"],
+ "object": [10, -1, "object number"],
+ "level": [5, -1, "indirection level of buffer"],
+ "blkid": [8, -1, "block number of buffer"],
+ "offset": [12, 1024, "offset in object of buffer"],
+ "dbsize": [7, 1024, "size of buffer"],
+ "meta": [4, -1, "is this buffer metadata?"],
+ "state": [5, -1, "state of buffer (read, cached, etc)"],
+ "dbholds": [7, 1000, "number of holds on buffer"],
+ "dbc": [3, -1, "in dbuf cache"],
+ "list": [4, -1, "which ARC list contains this buffer"],
+ "atype": [7, -1, "ARC header type (data or metadata)"],
+ "flags": [9, -1, "ARC read flags"],
+ "count": [5, -1, "ARC data count"],
+ "asize": [7, 1024, "size of this ARC buffer"],
+ "access": [10, -1, "time this ARC buffer was last accessed"],
+ "mru": [5, 1000, "hits while on the ARC's MRU list"],
+ "gmru": [5, 1000, "hits while on the ARC's MRU ghost list"],
+ "mfu": [5, 1000, "hits while on the ARC's MFU list"],
+ "gmfu": [5, 1000, "hits while on the ARC's MFU ghost list"],
+ "l2": [5, 1000, "hits while on the L2ARC"],
+ "l2_dattr": [8, -1, "L2ARC disk address/offset"],
+ "l2_asize": [8, 1024, "L2ARC alloc'd size (depending on compression)"],
+ "l2_comp": [21, -1, "L2ARC compression algorithm for buffer"],
+ "aholds": [6, 1000, "number of holds on this ARC buffer"],
+ "dtype": [27, -1, "dnode type"],
+ "btype": [27, -1, "bonus buffer type"],
+ "data_bs": [7, 1024, "data block size"],
+ "meta_bs": [7, 1024, "metadata block size"],
+ "bsize": [6, 1024, "bonus buffer size"],
+ "lvls": [6, -1, "number of indirection levels"],
+ "dholds": [6, 1000, "number of holds on dnode"],
+ "blocks": [8, 1000, "number of allocated blocks"],
+ "dsize": [12, 1024, "size of dnode"],
+ "cached": [6, 1024, "bytes cached for all blocks"],
+ "direct": [6, 1024, "bytes cached for direct blocks"],
+ "indirect": [8, 1024, "bytes cached for indirect blocks"],
+ "bonus": [5, 1024, "bytes cached for bonus buffer"],
+ "spill": [5, 1024, "bytes cached for spill block"],
+}
+
+hdr = None
+xhdr = None
+sep = " " # Default separator is 2 spaces
+cmd = ("Usage: dbufstat [-bdhnrtvx] [-i file] [-f fields] [-o file] "
+ "[-s string] [-F filter]\n")
+raw = 0
+
+
+def print_incompat_helper(incompat):
+ cnt = 0
+ for key in sorted(incompat):
+ if cnt is 0:
+ sys.stderr.write("\t")
+ elif cnt > 8:
+ sys.stderr.write(",\n\t")
+ cnt = 0
+ else:
+ sys.stderr.write(", ")
+
+ sys.stderr.write("%s" % key)
+ cnt += 1
+
+ sys.stderr.write("\n\n")
+
+
+def detailed_usage():
+ sys.stderr.write("%s\n" % cmd)
+
+ sys.stderr.write("Field definitions incompatible with '-b' option:\n")
+ print_incompat_helper(bincompat)
+
+ sys.stderr.write("Field definitions incompatible with '-d' option:\n")
+ print_incompat_helper(dincompat)
+
+ sys.stderr.write("Field definitions incompatible with '-t' option:\n")
+ print_incompat_helper(tincompat)
+
+ sys.stderr.write("Field definitions are as follows:\n")
+ for key in sorted(cols.keys()):
+ sys.stderr.write("%11s : %s\n" % (key, cols[key][2]))
+ sys.stderr.write("\n")
+
+ sys.exit(0)
+
+
+def usage():
+ sys.stderr.write("%s\n" % cmd)
+ sys.stderr.write("\t -b : Print table of information for each dbuf\n")
+ sys.stderr.write("\t -d : Print table of information for each dnode\n")
+ sys.stderr.write("\t -h : Print this help message\n")
+ sys.stderr.write("\t -n : Exclude header from output\n")
+ sys.stderr.write("\t -r : Print raw values\n")
+ sys.stderr.write("\t -t : Print table of information for each dnode type"
+ "\n")
+ sys.stderr.write("\t -v : List all possible field headers and definitions"
+ "\n")
+ sys.stderr.write("\t -x : Print extended stats\n")
+ sys.stderr.write("\t -i : Redirect input from the specified file\n")
+ sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n")
+ sys.stderr.write("\t -o : Redirect output to the specified file\n")
+ sys.stderr.write("\t -s : Override default field separator with custom "
+ "character or string\n")
+ sys.stderr.write("\t -F : Filter output by value or regex\n")
+ sys.stderr.write("\nExamples:\n")
+ sys.stderr.write("\tdbufstat -d -o /tmp/d.log\n")
+ sys.stderr.write("\tdbufstat -t -s \",\" -o /tmp/t.log\n")
+ sys.stderr.write("\tdbufstat -v\n")
+ sys.stderr.write("\tdbufstat -d -f pool,object,objset,dsize,cached\n")
+ sys.stderr.write("\tdbufstat -bx -F dbc=1,objset=54,pool=testpool\n")
+ sys.stderr.write("\n")
+
+ sys.exit(1)
+
+
+def prettynum(sz, scale, num=0):
+ global raw
+
+ suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
+ index = 0
+ save = 0
+
+ if raw or scale == -1:
+ return "%*s" % (sz, num)
+
+ # Rounding error, return 0
+ elif 0 < num < 1:
+ num = 0
+
+ while num > scale and index < 5:
+ save = num
+ num = num / scale
+ index += 1
+
+ if index == 0:
+ return "%*d" % (sz, num)
+
+ if (save / scale) < 10:
+ return "%*.1f%s" % (sz - 1, num, suffix[index])
+ else:
+ return "%*d%s" % (sz - 1, num, suffix[index])
+
+
+def print_values(v):
+ global hdr
+ global sep
+
+ try:
+ for col in hdr:
+ sys.stdout.write("%s%s" % (
+ prettynum(cols[col][0], cols[col][1], v[col]), sep))
+ sys.stdout.write("\n")
+ except IOError as e:
+ if e.errno == errno.EPIPE:
+ sys.exit(1)
+
+
+def print_header():
+ global hdr
+ global sep
+
+ try:
+ for col in hdr:
+ sys.stdout.write("%*s%s" % (cols[col][0], col, sep))
+ sys.stdout.write("\n")
+ except IOError as e:
+ if e.errno == errno.EPIPE:
+ sys.exit(1)
+
+
+def get_typestring(t):
+ ot_strings = [
+ "DMU_OT_NONE",
+ # general:
+ "DMU_OT_OBJECT_DIRECTORY",
+ "DMU_OT_OBJECT_ARRAY",
+ "DMU_OT_PACKED_NVLIST",
+ "DMU_OT_PACKED_NVLIST_SIZE",
+ "DMU_OT_BPOBJ",
+ "DMU_OT_BPOBJ_HDR",
+ # spa:
+ "DMU_OT_SPACE_MAP_HEADER",
+ "DMU_OT_SPACE_MAP",
+ # zil:
+ "DMU_OT_INTENT_LOG",
+ # dmu:
+ "DMU_OT_DNODE",
+ "DMU_OT_OBJSET",
+ # dsl:
+ "DMU_OT_DSL_DIR",
+ "DMU_OT_DSL_DIR_CHILD_MAP",
+ "DMU_OT_DSL_DS_SNAP_MAP",
+ "DMU_OT_DSL_PROPS",
+ "DMU_OT_DSL_DATASET",
+ # zpl:
+ "DMU_OT_ZNODE",
+ "DMU_OT_OLDACL",
+ "DMU_OT_PLAIN_FILE_CONTENTS",
+ "DMU_OT_DIRECTORY_CONTENTS",
+ "DMU_OT_MASTER_NODE",
+ "DMU_OT_UNLINKED_SET",
+ # zvol:
+ "DMU_OT_ZVOL",
+ "DMU_OT_ZVOL_PROP",
+ # other; for testing only!
+ "DMU_OT_PLAIN_OTHER",
+ "DMU_OT_UINT64_OTHER",
+ "DMU_OT_ZAP_OTHER",
+ # new object types:
+ "DMU_OT_ERROR_LOG",
+ "DMU_OT_SPA_HISTORY",
+ "DMU_OT_SPA_HISTORY_OFFSETS",
+ "DMU_OT_POOL_PROPS",
+ "DMU_OT_DSL_PERMS",
+ "DMU_OT_ACL",
+ "DMU_OT_SYSACL",
+ "DMU_OT_FUID",
+ "DMU_OT_FUID_SIZE",
+ "DMU_OT_NEXT_CLONES",
+ "DMU_OT_SCAN_QUEUE",
+ "DMU_OT_USERGROUP_USED",
+ "DMU_OT_USERGROUP_QUOTA",
+ "DMU_OT_USERREFS",
+ "DMU_OT_DDT_ZAP",
+ "DMU_OT_DDT_STATS",
+ "DMU_OT_SA",
+ "DMU_OT_SA_MASTER_NODE",
+ "DMU_OT_SA_ATTR_REGISTRATION",
+ "DMU_OT_SA_ATTR_LAYOUTS",
+ "DMU_OT_SCAN_XLATE",
+ "DMU_OT_DEDUP",
+ "DMU_OT_DEADLIST",
+ "DMU_OT_DEADLIST_HDR",
+ "DMU_OT_DSL_CLONES",
+ "DMU_OT_BPOBJ_SUBOBJ"]
+ otn_strings = {
+ 0x80: "DMU_OTN_UINT8_DATA",
+ 0xc0: "DMU_OTN_UINT8_METADATA",
+ 0x81: "DMU_OTN_UINT16_DATA",
+ 0xc1: "DMU_OTN_UINT16_METADATA",
+ 0x82: "DMU_OTN_UINT32_DATA",
+ 0xc2: "DMU_OTN_UINT32_METADATA",
+ 0x83: "DMU_OTN_UINT64_DATA",
+ 0xc3: "DMU_OTN_UINT64_METADATA",
+ 0x84: "DMU_OTN_ZAP_DATA",
+ 0xc4: "DMU_OTN_ZAP_METADATA",
+ 0xa0: "DMU_OTN_UINT8_ENC_DATA",
+ 0xe0: "DMU_OTN_UINT8_ENC_METADATA",
+ 0xa1: "DMU_OTN_UINT16_ENC_DATA",
+ 0xe1: "DMU_OTN_UINT16_ENC_METADATA",
+ 0xa2: "DMU_OTN_UINT32_ENC_DATA",
+ 0xe2: "DMU_OTN_UINT32_ENC_METADATA",
+ 0xa3: "DMU_OTN_UINT64_ENC_DATA",
+ 0xe3: "DMU_OTN_UINT64_ENC_METADATA",
+ 0xa4: "DMU_OTN_ZAP_ENC_DATA",
+ 0xe4: "DMU_OTN_ZAP_ENC_METADATA"}
+
+ # If "-rr" option is used, don't convert to string representation
+ if raw > 1:
+ return "%i" % t
+
+ try:
+ if t < len(ot_strings):
+ return ot_strings[t]
+ else:
+ return otn_strings[t]
+ except (IndexError, KeyError):
+ return "(UNKNOWN)"
+
+
+def get_compstring(c):
+ comp_strings = ["ZIO_COMPRESS_INHERIT", "ZIO_COMPRESS_ON",
+ "ZIO_COMPRESS_OFF", "ZIO_COMPRESS_LZJB",
+ "ZIO_COMPRESS_EMPTY", "ZIO_COMPRESS_GZIP_1",
+ "ZIO_COMPRESS_GZIP_2", "ZIO_COMPRESS_GZIP_3",
+ "ZIO_COMPRESS_GZIP_4", "ZIO_COMPRESS_GZIP_5",
+ "ZIO_COMPRESS_GZIP_6", "ZIO_COMPRESS_GZIP_7",
+ "ZIO_COMPRESS_GZIP_8", "ZIO_COMPRESS_GZIP_9",
+ "ZIO_COMPRESS_ZLE", "ZIO_COMPRESS_LZ4",
+ "ZIO_COMPRESS_ZSTD", "ZIO_COMPRESS_FUNCTION"]
+
+ # If "-rr" option is used, don't convert to string representation
+ if raw > 1:
+ return "%i" % c
+
+ try:
+ return comp_strings[c]
+ except IndexError:
+ return "%i" % c
+
+
+def parse_line(line, labels):
+ global hdr
+
+ new = dict()
+ val = None
+ for col in hdr:
+ # These are "special" fields computed in the update_dict
+ # function, prevent KeyError exception on labels[col] for these.
+ if col not in ['bonus', 'cached', 'direct', 'indirect', 'spill']:
+ val = line[labels[col]]
+
+ if col in ['pool', 'flags']:
+ new[col] = str(val)
+ elif col in ['dtype', 'btype']:
+ new[col] = get_typestring(int(val))
+ elif col in ['l2_comp']:
+ new[col] = get_compstring(int(val))
+ else:
+ new[col] = int(val)
+
+ return new
+
+
+def update_dict(d, k, line, labels):
+ pool = line[labels['pool']]
+ objset = line[labels['objset']]
+ key = line[labels[k]]
+
+ dbsize = int(line[labels['dbsize']])
+ blkid = int(line[labels['blkid']])
+ level = int(line[labels['level']])
+
+ if pool not in d:
+ d[pool] = dict()
+
+ if objset not in d[pool]:
+ d[pool][objset] = dict()
+
+ if key not in d[pool][objset]:
+ d[pool][objset][key] = parse_line(line, labels)
+ d[pool][objset][key]['bonus'] = 0
+ d[pool][objset][key]['cached'] = 0
+ d[pool][objset][key]['direct'] = 0
+ d[pool][objset][key]['indirect'] = 0
+ d[pool][objset][key]['spill'] = 0
+
+ d[pool][objset][key]['cached'] += dbsize
+
+ if blkid == -1:
+ d[pool][objset][key]['bonus'] += dbsize
+ elif blkid == -2:
+ d[pool][objset][key]['spill'] += dbsize
+ else:
+ if level == 0:
+ d[pool][objset][key]['direct'] += dbsize
+ else:
+ d[pool][objset][key]['indirect'] += dbsize
+
+ return d
+
+
+def skip_line(vals, filters):
+ '''
+ Determines if a line should be skipped during printing
+ based on a set of filters
+ '''
+ if len(filters) == 0:
+ return False
+
+ for key in vals:
+ if key in filters:
+ val = prettynum(cols[key][0], cols[key][1], vals[key]).strip()
+ # we want a full match here
+ if re.match("(?:" + filters[key] + r")\Z", val) is None:
+ return True
+
+ return False
+
+
+def print_dict(d, filters, noheader):
+ if not noheader:
+ print_header()
+ for pool in list(d.keys()):
+ for objset in list(d[pool].keys()):
+ for v in list(d[pool][objset].values()):
+ if not skip_line(v, filters):
+ print_values(v)
+
+
+def dnodes_build_dict(filehandle):
+ labels = dict()
+ dnodes = dict()
+
+ # First 3 lines are header information, skip the first two
+ for i in range(2):
+ next(filehandle)
+
+ # The third line contains the labels and index locations
+ for i, v in enumerate(next(filehandle).split()):
+ labels[v] = i
+
+ # The rest of the file is buffer information
+ for line in filehandle:
+ update_dict(dnodes, 'object', line.split(), labels)
+
+ return dnodes
+
+
+def types_build_dict(filehandle):
+ labels = dict()
+ types = dict()
+
+ # First 3 lines are header information, skip the first two
+ for i in range(2):
+ next(filehandle)
+
+ # The third line contains the labels and index locations
+ for i, v in enumerate(next(filehandle).split()):
+ labels[v] = i
+
+ # The rest of the file is buffer information
+ for line in filehandle:
+ update_dict(types, 'dtype', line.split(), labels)
+
+ return types
+
+
+def buffers_print_all(filehandle, filters, noheader):
+ labels = dict()
+
+ # First 3 lines are header information, skip the first two
+ for i in range(2):
+ next(filehandle)
+
+ # The third line contains the labels and index locations
+ for i, v in enumerate(next(filehandle).split()):
+ labels[v] = i
+
+ if not noheader:
+ print_header()
+
+ # The rest of the file is buffer information
+ for line in filehandle:
+ vals = parse_line(line.split(), labels)
+ if not skip_line(vals, filters):
+ print_values(vals)
+
+
+def main():
+ global hdr
+ global sep
+ global raw
+
+ desired_cols = None
+ bflag = False
+ dflag = False
+ hflag = False
+ ifile = None
+ ofile = None
+ tflag = False
+ vflag = False
+ xflag = False
+ nflag = False
+ filters = dict()
+
+ try:
+ opts, args = getopt.getopt(
+ sys.argv[1:],
+ "bdf:hi:o:rs:tvxF:n",
+ [
+ "buffers",
+ "dnodes",
+ "columns",
+ "help",
+ "infile",
+ "outfile",
+ "separator",
+ "types",
+ "verbose",
+ "extended",
+ "filter"
+ ]
+ )
+ except getopt.error:
+ usage()
+ opts = None
+
+ for opt, arg in opts:
+ if opt in ('-b', '--buffers'):
+ bflag = True
+ if opt in ('-d', '--dnodes'):
+ dflag = True
+ if opt in ('-f', '--columns'):
+ desired_cols = arg
+ if opt in ('-h', '--help'):
+ hflag = True
+ if opt in ('-i', '--infile'):
+ ifile = arg
+ if opt in ('-o', '--outfile'):
+ ofile = arg
+ if opt in ('-r', '--raw'):
+ raw += 1
+ if opt in ('-s', '--separator'):
+ sep = arg
+ if opt in ('-t', '--types'):
+ tflag = True
+ if opt in ('-v', '--verbose'):
+ vflag = True
+ if opt in ('-x', '--extended'):
+ xflag = True
+ if opt in ('-n', '--noheader'):
+ nflag = True
+ if opt in ('-F', '--filter'):
+ fils = [x.strip() for x in arg.split(",")]
+
+ for fil in fils:
+ f = [x.strip() for x in fil.split("=")]
+
+ if len(f) != 2:
+ sys.stderr.write("Invalid filter '%s'.\n" % fil)
+ sys.exit(1)
+
+ if f[0] not in cols:
+ sys.stderr.write("Invalid field '%s' in filter.\n" % f[0])
+ sys.exit(1)
+
+ if f[0] in filters:
+ sys.stderr.write("Field '%s' specified multiple times in "
+ "filter.\n" % f[0])
+ sys.exit(1)
+
+ try:
+ re.compile("(?:" + f[1] + r")\Z")
+ except re.error:
+ sys.stderr.write("Invalid regex for field '%s' in "
+ "filter.\n" % f[0])
+ sys.exit(1)
+
+ filters[f[0]] = f[1]
+
+ if hflag or (xflag and desired_cols):
+ usage()
+
+ if vflag:
+ detailed_usage()
+
+ # Ensure at most only one of b, d, or t flags are set
+ if (bflag and dflag) or (bflag and tflag) or (dflag and tflag):
+ usage()
+
+ if bflag:
+ hdr = bxhdr if xflag else bhdr
+ elif tflag:
+ hdr = txhdr if xflag else thdr
+ else: # Even if dflag is False, it's the default if none set
+ dflag = True
+ hdr = dxhdr if xflag else dhdr
+
+ if desired_cols:
+ hdr = desired_cols.split(",")
+
+ invalid = []
+ incompat = []
+ for ele in hdr:
+ if ele not in cols:
+ invalid.append(ele)
+ elif ((bflag and bincompat and ele in bincompat) or
+ (dflag and dincompat and ele in dincompat) or
+ (tflag and tincompat and ele in tincompat)):
+ incompat.append(ele)
+
+ if len(invalid) > 0:
+ sys.stderr.write("Invalid column definition! -- %s\n" % invalid)
+ usage()
+
+ if len(incompat) > 0:
+ sys.stderr.write("Incompatible field specified! -- %s\n" %
+ incompat)
+ usage()
+
+ if ofile:
+ try:
+ tmp = open(ofile, "w")
+ sys.stdout = tmp
+
+ except IOError:
+ sys.stderr.write("Cannot open %s for writing\n" % ofile)
+ sys.exit(1)
+
+ if not ifile:
+ ifile = '/proc/spl/kstat/zfs/dbufs'
+
+ if ifile != "-":
+ try:
+ tmp = open(ifile, "r")
+ sys.stdin = tmp
+ except IOError:
+ sys.stderr.write("Cannot open %s for reading\n" % ifile)
+ sys.exit(1)
+
+ if bflag:
+ buffers_print_all(sys.stdin, filters, nflag)
+
+ if dflag:
+ print_dict(dnodes_build_dict(sys.stdin), filters, nflag)
+
+ if tflag:
+ print_dict(types_build_dict(sys.stdin), filters, nflag)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/cmd/fsck_zfs/Makefile.am b/cmd/fsck_zfs/Makefile.am
new file mode 100644
index 000000000000..2380f56fa4d4
--- /dev/null
+++ b/cmd/fsck_zfs/Makefile.am
@@ -0,0 +1 @@
+dist_sbin_SCRIPTS = fsck.zfs
diff --git a/cmd/fsck_zfs/fsck.zfs b/cmd/fsck_zfs/fsck.zfs
new file mode 100755
index 000000000000..129a7f39c388
--- /dev/null
+++ b/cmd/fsck_zfs/fsck.zfs
@@ -0,0 +1,9 @@
+#!/bin/sh
+#
+# fsck.zfs: A fsck helper to accommodate distributions that expect
+# to be able to execute a fsck on all filesystem types. Currently
+# this script does nothing but it could be extended to act as a
+# compatibility wrapper for 'zpool scrub'.
+#
+
+exit 0
diff --git a/cmd/mount_zfs/.gitignore b/cmd/mount_zfs/.gitignore
new file mode 100644
index 000000000000..cd9254bde3da
--- /dev/null
+++ b/cmd/mount_zfs/.gitignore
@@ -0,0 +1 @@
+mount.zfs
diff --git a/cmd/mount_zfs/Makefile.am b/cmd/mount_zfs/Makefile.am
new file mode 100644
index 000000000000..6c4d6ff79f16
--- /dev/null
+++ b/cmd/mount_zfs/Makefile.am
@@ -0,0 +1,20 @@
+include $(top_srcdir)/config/Rules.am
+
+#
+# Ignore the prefix for the mount helper. It must be installed in /sbin/
+# because this path is hardcoded in the mount(8) for security reasons.
+# However, if needed, the configure option --with-mounthelperdir= can be used
+# to override the default install location.
+#
+sbindir=$(mounthelperdir)
+sbin_PROGRAMS = mount.zfs
+
+mount_zfs_SOURCES = \
+ mount_zfs.c
+
+mount_zfs_LDADD = \
+ $(abs_top_builddir)/lib/libzfs/libzfs.la \
+ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+ $(abs_top_builddir)/lib/libnvpair/libnvpair.la
+
+mount_zfs_LDADD += $(LTLIBINTL)
diff --git a/cmd/mount_zfs/mount_zfs.c b/cmd/mount_zfs/mount_zfs.c
new file mode 100644
index 000000000000..87d2ccadcded
--- /dev/null
+++ b/cmd/mount_zfs/mount_zfs.c
@@ -0,0 +1,408 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Lawrence Livermore National Security, LLC.
+ */
+
+#include <libintl.h>
+#include <unistd.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <sys/mntent.h>
+#include <sys/stat.h>
+#include <libzfs.h>
+#include <libzutil.h>
+#include <locale.h>
+#include <getopt.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#define ZS_COMMENT 0x00000000 /* comment */
+#define ZS_ZFSUTIL 0x00000001 /* caller is zfs(8) */
+
+libzfs_handle_t *g_zfs;
+
+/*
+ * Return the pool/dataset to mount given the name passed to mount. This
+ * is expected to be of the form pool/dataset, however may also refer to
+ * a block device if that device contains a valid zfs label.
+ */
+static char *
+parse_dataset(char *dataset)
+{
+ char cwd[PATH_MAX];
+ struct stat64 statbuf;
+ int error;
+ int len;
+
+ /*
+ * We expect a pool/dataset to be provided, however if we're
+ * given a device which is a member of a zpool we attempt to
+ * extract the pool name stored in the label. Given the pool
+ * name we can mount the root dataset.
+ */
+ error = stat64(dataset, &statbuf);
+ if (error == 0) {
+ nvlist_t *config;
+ char *name;
+ int fd;
+
+ fd = open(dataset, O_RDONLY);
+ if (fd < 0)
+ goto out;
+
+ error = zpool_read_label(fd, &config, NULL);
+ (void) close(fd);
+ if (error)
+ goto out;
+
+ error = nvlist_lookup_string(config,
+ ZPOOL_CONFIG_POOL_NAME, &name);
+ if (error) {
+ nvlist_free(config);
+ } else {
+ dataset = strdup(name);
+ nvlist_free(config);
+ return (dataset);
+ }
+ }
+out:
+ /*
+ * If a file or directory in your current working directory is
+ * named 'dataset' then mount(8) will prepend your current working
+ * directory to the dataset. There is no way to prevent this
+ * behavior so we simply check for it and strip the prepended
+ * path when it is added.
+ */
+ if (getcwd(cwd, PATH_MAX) == NULL)
+ return (dataset);
+
+ len = strlen(cwd);
+
+ /* Do not add one when cwd already ends in a trailing '/' */
+ if (strncmp(cwd, dataset, len) == 0)
+ return (dataset + len + (cwd[len-1] != '/'));
+
+ return (dataset);
+}
+
+/*
+ * Update the mtab_* code to use the libmount library when it is commonly
+ * available otherwise fallback to legacy mode. The mount(8) utility will
+ * manage the lock file for us to prevent racing updates to /etc/mtab.
+ */
+static int
+mtab_is_writeable(void)
+{
+ struct stat st;
+ int error, fd;
+
+ error = lstat("/etc/mtab", &st);
+ if (error || S_ISLNK(st.st_mode))
+ return (0);
+
+ fd = open("/etc/mtab", O_RDWR | O_CREAT, 0644);
+ if (fd < 0)
+ return (0);
+
+ close(fd);
+ return (1);
+}
+
+static int
+mtab_update(char *dataset, char *mntpoint, char *type, char *mntopts)
+{
+ struct mntent mnt;
+ FILE *fp;
+ int error;
+
+ mnt.mnt_fsname = dataset;
+ mnt.mnt_dir = mntpoint;
+ mnt.mnt_type = type;
+ mnt.mnt_opts = mntopts ? mntopts : "";
+ mnt.mnt_freq = 0;
+ mnt.mnt_passno = 0;
+
+ fp = setmntent("/etc/mtab", "a+");
+ if (!fp) {
+ (void) fprintf(stderr, gettext(
+ "filesystem '%s' was mounted, but /etc/mtab "
+ "could not be opened due to error %d\n"),
+ dataset, errno);
+ return (MOUNT_FILEIO);
+ }
+
+ error = addmntent(fp, &mnt);
+ if (error) {
+ (void) fprintf(stderr, gettext(
+ "filesystem '%s' was mounted, but /etc/mtab "
+ "could not be updated due to error %d\n"),
+ dataset, errno);
+ return (MOUNT_FILEIO);
+ }
+
+ (void) endmntent(fp);
+
+ return (MOUNT_SUCCESS);
+}
+
+int
+main(int argc, char **argv)
+{
+ zfs_handle_t *zhp;
+ char prop[ZFS_MAXPROPLEN];
+ uint64_t zfs_version = 0;
+ char mntopts[MNT_LINE_MAX] = { '\0' };
+ char badopt[MNT_LINE_MAX] = { '\0' };
+ char mtabopt[MNT_LINE_MAX] = { '\0' };
+ char mntpoint[PATH_MAX];
+ char *dataset;
+ unsigned long mntflags = 0, zfsflags = 0, remount = 0;
+ int sloppy = 0, fake = 0, verbose = 0, nomtab = 0, zfsutil = 0;
+ int error, c;
+
+ (void) setlocale(LC_ALL, "");
+ (void) textdomain(TEXT_DOMAIN);
+
+ opterr = 0;
+
+ /* check options */
+ while ((c = getopt_long(argc, argv, "sfnvo:h?", 0, 0)) != -1) {
+ switch (c) {
+ case 's':
+ sloppy = 1;
+ break;
+ case 'f':
+ fake = 1;
+ break;
+ case 'n':
+ nomtab = 1;
+ break;
+ case 'v':
+ verbose++;
+ break;
+ case 'o':
+ (void) strlcpy(mntopts, optarg, sizeof (mntopts));
+ break;
+ case 'h':
+ case '?':
+ (void) fprintf(stderr, gettext("Invalid option '%c'\n"),
+ optopt);
+ (void) fprintf(stderr, gettext("Usage: mount.zfs "
+ "[-sfnv] [-o options] <dataset> <mountpoint>\n"));
+ return (MOUNT_USAGE);
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ /* check that we only have two arguments */
+ if (argc != 2) {
+ if (argc == 0)
+ (void) fprintf(stderr, gettext("missing dataset "
+ "argument\n"));
+ else if (argc == 1)
+ (void) fprintf(stderr,
+ gettext("missing mountpoint argument\n"));
+ else
+ (void) fprintf(stderr, gettext("too many arguments\n"));
+ (void) fprintf(stderr, "usage: mount <dataset> <mountpoint>\n");
+ return (MOUNT_USAGE);
+ }
+
+ dataset = parse_dataset(argv[0]);
+
+ /* canonicalize the mount point */
+ if (realpath(argv[1], mntpoint) == NULL) {
+ (void) fprintf(stderr, gettext("filesystem '%s' cannot be "
+ "mounted at '%s' due to canonicalization error %d.\n"),
+ dataset, argv[1], errno);
+ return (MOUNT_SYSERR);
+ }
+
+ /* validate mount options and set mntflags */
+ error = zfs_parse_mount_options(mntopts, &mntflags, &zfsflags, sloppy,
+ badopt, mtabopt);
+ if (error) {
+ switch (error) {
+ case ENOMEM:
+ (void) fprintf(stderr, gettext("filesystem '%s' "
+ "cannot be mounted due to a memory allocation "
+ "failure.\n"), dataset);
+ return (MOUNT_SYSERR);
+ case ENOENT:
+ (void) fprintf(stderr, gettext("filesystem '%s' "
+ "cannot be mounted due to invalid option "
+ "'%s'.\n"), dataset, badopt);
+ (void) fprintf(stderr, gettext("Use the '-s' option "
+ "to ignore the bad mount option.\n"));
+ return (MOUNT_USAGE);
+ default:
+ (void) fprintf(stderr, gettext("filesystem '%s' "
+ "cannot be mounted due to internal error %d.\n"),
+ dataset, error);
+ return (MOUNT_SOFTWARE);
+ }
+ }
+
+ if (verbose)
+ (void) fprintf(stdout, gettext("mount.zfs:\n"
+ " dataset: \"%s\"\n mountpoint: \"%s\"\n"
+ " mountflags: 0x%lx\n zfsflags: 0x%lx\n"
+ " mountopts: \"%s\"\n mtabopts: \"%s\"\n"),
+ dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt);
+
+ if (mntflags & MS_REMOUNT) {
+ nomtab = 1;
+ remount = 1;
+ }
+
+ if (zfsflags & ZS_ZFSUTIL)
+ zfsutil = 1;
+
+ if ((g_zfs = libzfs_init()) == NULL) {
+ (void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
+ return (MOUNT_SYSERR);
+ }
+
+ /* try to open the dataset to access the mount point */
+ if ((zhp = zfs_open(g_zfs, dataset,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT)) == NULL) {
+ (void) fprintf(stderr, gettext("filesystem '%s' cannot be "
+ "mounted, unable to open the dataset\n"), dataset);
+ libzfs_fini(g_zfs);
+ return (MOUNT_USAGE);
+ }
+
+ zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt);
+
+ /* treat all snapshots as legacy mount points */
+ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT)
+ (void) strlcpy(prop, ZFS_MOUNTPOINT_LEGACY, ZFS_MAXPROPLEN);
+ else
+ (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, prop,
+ sizeof (prop), NULL, NULL, 0, B_FALSE);
+
+ /*
+ * Fetch the max supported zfs version in case we get ENOTSUP
+ * back from the mount command, since we need the zfs handle
+ * to do so.
+ */
+ zfs_version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+ if (zfs_version == 0) {
+ fprintf(stderr, gettext("unable to fetch "
+ "ZFS version for filesystem '%s'\n"), dataset);
+ return (MOUNT_SYSERR);
+ }
+
+ zfs_close(zhp);
+ libzfs_fini(g_zfs);
+
+ /*
+ * Legacy mount points may only be mounted using 'mount', never using
+ * 'zfs mount'. However, since 'zfs mount' actually invokes 'mount'
+ * we differentiate the two cases using the 'zfsutil' mount option.
+ * This mount option should only be supplied by the 'zfs mount' util.
+ *
+ * The only exception to the above rule is '-o remount' which is
+ * always allowed for non-legacy datasets. This is done because when
+ * using zfs as your root file system both rc.sysinit/umountroot and
+ * systemd depend on 'mount -o remount <mountpoint>' to work.
+ */
+ if (zfsutil && (strcmp(prop, ZFS_MOUNTPOINT_LEGACY) == 0)) {
+ (void) fprintf(stderr, gettext(
+ "filesystem '%s' cannot be mounted using 'zfs mount'.\n"
+ "Use 'zfs set mountpoint=%s' or 'mount -t zfs %s %s'.\n"
+ "See zfs(8) for more information.\n"),
+ dataset, mntpoint, dataset, mntpoint);
+ return (MOUNT_USAGE);
+ }
+
+ if (!zfsutil && !(remount || fake) &&
+ strcmp(prop, ZFS_MOUNTPOINT_LEGACY)) {
+ (void) fprintf(stderr, gettext(
+ "filesystem '%s' cannot be mounted using 'mount'.\n"
+ "Use 'zfs set mountpoint=%s' or 'zfs mount %s'.\n"
+ "See zfs(8) for more information.\n"),
+ dataset, "legacy", dataset);
+ return (MOUNT_USAGE);
+ }
+
+ if (!fake) {
+ error = mount(dataset, mntpoint, MNTTYPE_ZFS,
+ mntflags, mntopts);
+ }
+
+ if (error) {
+ switch (errno) {
+ case ENOENT:
+ (void) fprintf(stderr, gettext("mount point "
+ "'%s' does not exist\n"), mntpoint);
+ return (MOUNT_SYSERR);
+ case EBUSY:
+ (void) fprintf(stderr, gettext("filesystem "
+ "'%s' is already mounted\n"), dataset);
+ return (MOUNT_BUSY);
+ case ENOTSUP:
+ if (zfs_version > ZPL_VERSION) {
+ (void) fprintf(stderr,
+ gettext("filesystem '%s' (v%d) is not "
+ "supported by this implementation of "
+ "ZFS (max v%d).\n"), dataset,
+ (int)zfs_version, (int)ZPL_VERSION);
+ } else {
+ (void) fprintf(stderr,
+ gettext("filesystem '%s' mount "
+ "failed for unknown reason.\n"), dataset);
+ }
+ return (MOUNT_SYSERR);
+#ifdef MS_MANDLOCK
+ case EPERM:
+ if (mntflags & MS_MANDLOCK) {
+ (void) fprintf(stderr, gettext("filesystem "
+ "'%s' has the 'nbmand=on' property set, "
+ "this mount\noption may be disabled in "
+ "your kernel. Use 'zfs set nbmand=off'\n"
+ "to disable this option and try to "
+ "mount the filesystem again.\n"), dataset);
+ return (MOUNT_SYSERR);
+ }
+ /* fallthru */
+#endif
+ default:
+ (void) fprintf(stderr, gettext("filesystem "
+ "'%s' can not be mounted: %s\n"), dataset,
+ strerror(errno));
+ return (MOUNT_USAGE);
+ }
+ }
+
+ if (!nomtab && mtab_is_writeable()) {
+ error = mtab_update(dataset, mntpoint, MNTTYPE_ZFS, mtabopt);
+ if (error)
+ return (error);
+ }
+
+ return (MOUNT_SUCCESS);
+}
diff --git a/cmd/raidz_test/.gitignore b/cmd/raidz_test/.gitignore
new file mode 100644
index 000000000000..f8b83d9cce03
--- /dev/null
+++ b/cmd/raidz_test/.gitignore
@@ -0,0 +1 @@
+/raidz_test
diff --git a/cmd/raidz_test/Makefile.am b/cmd/raidz_test/Makefile.am
new file mode 100644
index 000000000000..72c914e641e4
--- /dev/null
+++ b/cmd/raidz_test/Makefile.am
@@ -0,0 +1,20 @@
+include $(top_srcdir)/config/Rules.am
+
+# Includes kernel code, generate warnings for large stack frames
+AM_CFLAGS += $(FRAME_LARGER_THAN)
+
+# Unconditionally enable ASSERTs
+AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG
+
+bin_PROGRAMS = raidz_test
+
+raidz_test_SOURCES = \
+ raidz_test.h \
+ raidz_test.c \
+ raidz_bench.c
+
+raidz_test_LDADD = \
+ $(abs_top_builddir)/lib/libzpool/libzpool.la \
+ $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la
+
+raidz_test_LDADD += -lm
diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c
new file mode 100644
index 000000000000..8a2cec4ca685
--- /dev/null
+++ b/cmd/raidz_test/raidz_bench.c
@@ -0,0 +1,227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/zio.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
+#include <stdio.h>
+
+#include <sys/time.h>
+
+#include "raidz_test.h"
+
+#define GEN_BENCH_MEMORY (((uint64_t)1ULL)<<32)
+#define REC_BENCH_MEMORY (((uint64_t)1ULL)<<29)
+#define BENCH_ASHIFT 12
+#define MIN_CS_SHIFT BENCH_ASHIFT
+#define MAX_CS_SHIFT SPA_MAXBLOCKSHIFT
+
+static zio_t zio_bench;
+static raidz_map_t *rm_bench;
+static size_t max_data_size = SPA_MAXBLOCKSIZE;
+
+static void
+bench_init_raidz_map(void)
+{
+ zio_bench.io_offset = 0;
+ zio_bench.io_size = max_data_size;
+
+ /*
+ * To permit larger column sizes these have to be
+ * allocated using aligned alloc instead of zio_abd_buf_alloc
+ */
+ zio_bench.io_abd = raidz_alloc(max_data_size);
+
+ init_zio_abd(&zio_bench);
+}
+
+static void
+bench_fini_raidz_maps(void)
+{
+ /* tear down golden zio */
+ raidz_free(zio_bench.io_abd, max_data_size);
+ bzero(&zio_bench, sizeof (zio_t));
+}
+
+static inline void
+run_gen_bench_impl(const char *impl)
+{
+ int fn, ncols;
+ uint64_t ds, iter_cnt, iter, disksize;
+ hrtime_t start;
+ double elapsed, d_bw;
+
+ /* Benchmark generate functions */
+ for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+
+ for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) {
+ /* create suitable raidz_map */
+ ncols = rto_opts.rto_dcols + fn + 1;
+ zio_bench.io_size = 1ULL << ds;
+ rm_bench = vdev_raidz_map_alloc(&zio_bench,
+ BENCH_ASHIFT, ncols, fn+1);
+
+ /* estimate iteration count */
+ iter_cnt = GEN_BENCH_MEMORY;
+ iter_cnt /= zio_bench.io_size;
+
+ start = gethrtime();
+ for (iter = 0; iter < iter_cnt; iter++)
+ vdev_raidz_generate_parity(rm_bench);
+ elapsed = NSEC2SEC((double)(gethrtime() - start));
+
+ disksize = (1ULL << ds) / rto_opts.rto_dcols;
+ d_bw = (double)iter_cnt * (double)disksize;
+ d_bw /= (1024.0 * 1024.0 * elapsed);
+
+ LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n",
+ impl,
+ raidz_gen_name[fn],
+ rto_opts.rto_dcols,
+ (1ULL<<ds),
+ d_bw,
+ d_bw * (double)(ncols),
+ (unsigned)iter_cnt);
+
+ vdev_raidz_map_free(rm_bench);
+ }
+ }
+}
+
+static void
+run_gen_bench(void)
+{
+ char **impl_name;
+
+ LOG(D_INFO, DBLSEP "\nBenchmarking parity generation...\n\n");
+ LOG(D_ALL, "impl, math, dcols, iosize, disk_bw, total_bw, iter\n");
+
+ for (impl_name = (char **)raidz_impl_names; *impl_name != NULL;
+ impl_name++) {
+
+ if (vdev_raidz_impl_set(*impl_name) != 0)
+ continue;
+
+ run_gen_bench_impl(*impl_name);
+ }
+}
+
+static void
+run_rec_bench_impl(const char *impl)
+{
+ int fn, ncols, nbad;
+ uint64_t ds, iter_cnt, iter, disksize;
+ hrtime_t start;
+ double elapsed, d_bw;
+ static const int tgt[7][3] = {
+ {1, 2, 3}, /* rec_p: bad QR & D[0] */
+ {0, 2, 3}, /* rec_q: bad PR & D[0] */
+ {0, 1, 3}, /* rec_r: bad PQ & D[0] */
+ {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
+ {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
+ {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
+ {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */
+ };
+
+ for (fn = 0; fn < RAIDZ_REC_NUM; fn++) {
+ for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) {
+
+ /* create suitable raidz_map */
+ ncols = rto_opts.rto_dcols + PARITY_PQR;
+ zio_bench.io_size = 1ULL << ds;
+
+ /*
+ * raidz block is too short to test
+ * the requested method
+ */
+ if (zio_bench.io_size / rto_opts.rto_dcols <
+ (1ULL << BENCH_ASHIFT))
+ continue;
+
+ rm_bench = vdev_raidz_map_alloc(&zio_bench,
+ BENCH_ASHIFT, ncols, PARITY_PQR);
+
+ /* estimate iteration count */
+ iter_cnt = (REC_BENCH_MEMORY);
+ iter_cnt /= zio_bench.io_size;
+
+ /* calculate how many bad columns there are */
+ nbad = MIN(3, raidz_ncols(rm_bench) -
+ raidz_parity(rm_bench));
+
+ start = gethrtime();
+ for (iter = 0; iter < iter_cnt; iter++)
+ vdev_raidz_reconstruct(rm_bench, tgt[fn], nbad);
+ elapsed = NSEC2SEC((double)(gethrtime() - start));
+
+ disksize = (1ULL << ds) / rto_opts.rto_dcols;
+ d_bw = (double)iter_cnt * (double)(disksize);
+ d_bw /= (1024.0 * 1024.0 * elapsed);
+
+ LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n",
+ impl,
+ raidz_rec_name[fn],
+ rto_opts.rto_dcols,
+ (1ULL<<ds),
+ d_bw,
+ d_bw * (double)ncols,
+ (unsigned)iter_cnt);
+
+ vdev_raidz_map_free(rm_bench);
+ }
+ }
+}
+
+static void
+run_rec_bench(void)
+{
+ char **impl_name;
+
+ LOG(D_INFO, DBLSEP "\nBenchmarking data reconstruction...\n\n");
+ LOG(D_ALL, "impl, math, dcols, iosize, disk_bw, total_bw, iter\n");
+
+ for (impl_name = (char **)raidz_impl_names; *impl_name != NULL;
+ impl_name++) {
+
+ if (vdev_raidz_impl_set(*impl_name) != 0)
+ continue;
+
+ run_rec_bench_impl(*impl_name);
+ }
+}
+
+void
+run_raidz_benchmark(void)
+{
+ bench_init_raidz_map();
+
+ run_gen_bench();
+ run_rec_bench();
+
+ bench_fini_raidz_maps();
+}
diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c
new file mode 100644
index 000000000000..66f36b0d56ca
--- /dev/null
+++ b/cmd/raidz_test/raidz_test.c
@@ -0,0 +1,782 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/zio.h>
+#include <umem.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
+#include <assert.h>
+#include <stdio.h>
+#include "raidz_test.h"
+
/* Shared random source data; filled once in main(), then read-only. */
static int *rand_data;
/* Global test options, populated by process_options(). */
raidz_test_opts_t rto_opts;

/* Pre-formatted gdb attach command, built early so the signal handler
 * can use it without allocating. */
static char gdb[256];
static const char gdb_tmpl[] = "gdb -ex \"set pagination 0\" -p %d";
+
/*
 * Fatal-signal handler: restore the default disposition and re-raise
 * the signal so a core dump is still produced; with -D, attach gdb to
 * the dying process first.
 *
 * NOTE(review): system() is not async-signal-safe; this is tolerated
 * only because the process is about to terminate anyway.
 */
static void sig_handler(int signo)
{
	struct sigaction action;
	/*
	 * Restore default action and re-raise signal so SIGSEGV and
	 * SIGABRT can trigger a core dump.
	 */
	action.sa_handler = SIG_DFL;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	(void) sigaction(signo, &action, NULL);

	if (rto_opts.rto_gdb)
		if (system(gdb)) { }	/* exit status deliberately ignored */

	raise(signo);
}
+
+static void print_opts(raidz_test_opts_t *opts, boolean_t force)
+{
+ char *verbose;
+ switch (opts->rto_v) {
+ case 0:
+ verbose = "no";
+ break;
+ case 1:
+ verbose = "info";
+ break;
+ default:
+ verbose = "debug";
+ break;
+ }
+
+ if (force || opts->rto_v >= D_INFO) {
+ (void) fprintf(stdout, DBLSEP "Running with options:\n"
+ " (-a) zio ashift : %zu\n"
+ " (-o) zio offset : 1 << %zu\n"
+ " (-d) number of raidz data columns : %zu\n"
+ " (-s) size of DATA : 1 << %zu\n"
+ " (-S) sweep parameters : %s \n"
+ " (-v) verbose : %s \n\n",
+ opts->rto_ashift, /* -a */
+ ilog2(opts->rto_offset), /* -o */
+ opts->rto_dcols, /* -d */
+ ilog2(opts->rto_dsize), /* -s */
+ opts->rto_sweep ? "yes" : "no", /* -S */
+ verbose); /* -v */
+ }
+}
+
+static void usage(boolean_t requested)
+{
+ const raidz_test_opts_t *o = &rto_opts_defaults;
+
+ FILE *fp = requested ? stdout : stderr;
+
+ (void) fprintf(fp, "Usage:\n"
+ "\t[-a zio ashift (default: %zu)]\n"
+ "\t[-o zio offset, exponent radix 2 (default: %zu)]\n"
+ "\t[-d number of raidz data columns (default: %zu)]\n"
+ "\t[-s zio size, exponent radix 2 (default: %zu)]\n"
+ "\t[-S parameter sweep (default: %s)]\n"
+ "\t[-t timeout for parameter sweep test]\n"
+ "\t[-B benchmark all raidz implementations]\n"
+ "\t[-v increase verbosity (default: %zu)]\n"
+ "\t[-h (print help)]\n"
+ "\t[-T test the test, see if failure would be detected]\n"
+ "\t[-D debug (attach gdb on SIGSEGV)]\n"
+ "",
+ o->rto_ashift, /* -a */
+ ilog2(o->rto_offset), /* -o */
+ o->rto_dcols, /* -d */
+ ilog2(o->rto_dsize), /* -s */
+ rto_opts.rto_sweep ? "yes" : "no", /* -S */
+ o->rto_v); /* -d */
+
+ exit(requested ? 0 : 1);
+}
+
+static void process_options(int argc, char **argv)
+{
+ size_t value;
+ int opt;
+
+ raidz_test_opts_t *o = &rto_opts;
+
+ bcopy(&rto_opts_defaults, o, sizeof (*o));
+
+ while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) {
+ value = 0;
+
+ switch (opt) {
+ case 'a':
+ value = strtoull(optarg, NULL, 0);
+ o->rto_ashift = MIN(13, MAX(9, value));
+ break;
+ case 'o':
+ value = strtoull(optarg, NULL, 0);
+ o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9;
+ break;
+ case 'd':
+ value = strtoull(optarg, NULL, 0);
+ o->rto_dcols = MIN(255, MAX(1, value));
+ break;
+ case 's':
+ value = strtoull(optarg, NULL, 0);
+ o->rto_dsize = 1ULL << MIN(SPA_MAXBLOCKSHIFT,
+ MAX(SPA_MINBLOCKSHIFT, value));
+ break;
+ case 't':
+ value = strtoull(optarg, NULL, 0);
+ o->rto_sweep_timeout = value;
+ break;
+ case 'v':
+ o->rto_v++;
+ break;
+ case 'S':
+ o->rto_sweep = 1;
+ break;
+ case 'B':
+ o->rto_benchmark = 1;
+ break;
+ case 'D':
+ o->rto_gdb = 1;
+ break;
+ case 'T':
+ o->rto_sanity = 1;
+ break;
+ case 'h':
+ usage(B_TRUE);
+ break;
+ case '?':
+ default:
+ usage(B_FALSE);
+ break;
+ }
+ }
+}
+
/* ABD buffer / size of the i-th *data* column of a raidz map. */
#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd)
#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size)

/* ABD buffer / size of the i-th *parity* (code) column. */
#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd)
#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size)
+
+static int
+cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity)
+{
+ int i, ret = 0;
+
+ VERIFY(parity >= 1 && parity <= 3);
+
+ for (i = 0; i < parity; i++) {
+ if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i))
+ != 0) {
+ ret++;
+ LOG_OPT(D_DEBUG, opts,
+ "\nParity block [%d] different!\n", i);
+ }
+ }
+ return (ret);
+}
+
+static int
+cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm)
+{
+ int i, ret = 0;
+ int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden);
+
+ for (i = 0; i < dcols; i++) {
+ if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i))
+ != 0) {
+ ret++;
+
+ LOG_OPT(D_DEBUG, opts,
+ "\nData block [%d] different!\n", i);
+ }
+ }
+ return (ret);
+}
+
+static int
+init_rand(void *data, size_t size, void *private)
+{
+ int i;
+ int *dst = (int *)data;
+
+ for (i = 0; i < size / sizeof (int); i++)
+ dst[i] = rand_data[i];
+
+ return (0);
+}
+
/*
 * Overwrite `cnt` columns of the raidz map (indices in `tgts`) with
 * the random pattern, simulating on-disk corruption.
 *
 * (The historical "colums" spelling is kept: it is this function's
 * public name and callers throughout the file use it.)
 */
static void
corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt)
{
	int i;
	raidz_col_t *col;

	for (i = 0; i < cnt; i++) {
		col = &rm->rm_col[tgts[i]];
		abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL);
	}
}
+
/* Fill a zio's data buffer with the shared random pattern. */
void
init_zio_abd(zio_t *zio)
{
	abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL);
}
+
+static void
+fini_raidz_map(zio_t **zio, raidz_map_t **rm)
+{
+ vdev_raidz_map_free(*rm);
+ raidz_free((*zio)->io_abd, (*zio)->io_size);
+ umem_free(*zio, sizeof (zio_t));
+
+ *zio = NULL;
+ *rm = NULL;
+}
+
/*
 * (Re)build the golden reference raidz map all later comparisons are
 * made against.  Two identical zios are allocated and filled with the
 * same random data; both are mapped and have parity generated by the
 * "original" (scalar reference) implementation, and are then
 * cross-checked as a self-test of the golden copy.
 *
 * Returns 0 on success, non-zero if the self-check failed.
 */
static int
init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
{
	int err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;
	const size_t total_ncols = opts->rto_dcols + parity;

	/* Drop any golden map left over from a previous run. */
	if (opts->rm_golden) {
		fini_raidz_map(&opts->zio_golden, &opts->rm_golden);
	}

	opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);
	zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);

	opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset;
	opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize;

	opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize);
	zio_test->io_abd = raidz_alloc(opts->rto_dsize);

	init_zio_abd(opts->zio_golden);
	init_zio_abd(zio_test);

	/* The golden copy always comes from the reference code path. */
	VERIFY0(vdev_raidz_impl_set("original"));

	opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
	    opts->rto_ashift, total_ncols, parity);
	rm_test = vdev_raidz_map_alloc(zio_test,
	    opts->rto_ashift, total_ncols, parity);

	VERIFY(opts->zio_golden);
	VERIFY(opts->rm_golden);

	vdev_raidz_generate_parity(opts->rm_golden);
	vdev_raidz_generate_parity(rm_test);

	/* sanity check: identical inputs must yield identical maps */
	err |= cmp_data(opts, rm_test);
	err |= cmp_code(opts, rm_test, parity);

	if (err)
		ERR("initializing the golden copy ... [FAIL]!\n");

	/* tear down raidz_map of test zio */
	fini_raidz_map(&zio_test, &rm_test);

	return (err);
}
+
/*
 * Allocate a zio and raidz map for one test: the data columns are
 * filled with the shared random pattern and the parity columns are
 * deliberately scribbled over, so a test cannot pass on stale parity.
 *
 * On return *zio owns the data buffer; release both with
 * fini_raidz_map().
 */
static raidz_map_t *
init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
{
	raidz_map_t *rm = NULL;
	const size_t alloc_dsize = opts->rto_dsize;
	const size_t total_ncols = opts->rto_dcols + parity;
	const int ccols[] = { 0, 1, 2 };

	VERIFY(zio);
	VERIFY(parity <= 3 && parity >= 1);

	*zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL);

	(*zio)->io_offset = 0;
	(*zio)->io_size = alloc_dsize;
	(*zio)->io_abd = raidz_alloc(alloc_dsize);
	init_zio_abd(*zio);

	rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
	    total_ncols, parity);
	VERIFY(rm);

	/* Make sure code columns are destroyed */
	corrupt_colums(rm, ccols, parity);

	return (rm);
}
+
/*
 * Parity-generation test: for every supported implementation and every
 * generation method (P, PQ, PQR), generate parity on a fresh map and
 * compare the parity columns against the golden copy.
 *
 * Returns the number of failed checks (0 == success).
 */
static int
run_gen_check(raidz_test_opts_t *opts)
{
	char **impl_name;
	int fn, err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;

	err = init_raidz_golden_map(opts, PARITY_PQR);
	if (0 != err)
		return (err);

	LOG(D_INFO, DBLSEP);
	LOG(D_INFO, "Testing parity generation...\n");

	/* +1 skips "original" -- it produced the golden copy itself. */
	for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
	    impl_name++) {

		LOG(D_INFO, SEP);
		LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);

		if (0 != vdev_raidz_impl_set(*impl_name)) {
			LOG(D_INFO, "[SKIP]\n");
			continue;
		} else {
			LOG(D_INFO, "[SUPPORTED]\n");
		}

		for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {

			/* Check if should stop (sweep timeout/failure) */
			if (rto_opts.rto_should_stop)
				return (err);

			/* create suitable raidz_map */
			rm_test = init_raidz_map(opts, &zio_test, fn+1);
			VERIFY(rm_test);

			LOG(D_INFO, "\t\tTesting method [%s] ...",
			    raidz_gen_name[fn]);

			/* -T: skip generation so the check must fail */
			if (!opts->rto_sanity)
				vdev_raidz_generate_parity(rm_test);

			if (cmp_code(opts, rm_test, fn+1) != 0) {
				LOG(D_INFO, "[FAIL]\n");
				err++;
			} else
				LOG(D_INFO, "[PASS]\n");

			fini_raidz_map(&zio_test, &rm_test);
		}
	}

	fini_raidz_map(&opts->zio_golden, &opts->rm_golden);

	return (err);
}
+
/*
 * Exercise one reconstruction method (`fn` indexes RAIDZ_REC_*) on map
 * rm.  rec_tgts[fn] pins the parity columns that method treats as bad;
 * the remaining target slot(s) sweep over every combination of 1, 2 or
 * 3 data columns, which are corrupted, reconstructed, and compared
 * against the golden data.
 *
 * Returns the number of failed reconstructions (0 == success).
 */
static int
run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn)
{
	int x0, x1, x2;
	int tgtidx[3];
	int err = 0;
	static const int rec_tgts[7][3] = {
		{1, 2, 3},	/* rec_p: bad QR & D[0] */
		{0, 2, 3},	/* rec_q: bad PR & D[0] */
		{0, 1, 3},	/* rec_r: bad PQ & D[0] */
		{2, 3, 4},	/* rec_pq: bad R & D[0][1] */
		{1, 3, 4},	/* rec_pr: bad Q & D[0][1] */
		{0, 3, 4},	/* rec_qr: bad P & D[0][1] */
		{3, 4, 5}	/* rec_pqr: bad & D[0][1][2] */
	};

	memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx));

	if (fn < RAIDZ_REC_PQ) {
		/* can reconstruct 1 failed data disk */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_cols - raidz_parity(rm))
				continue;

			/* Check if should stop */
			if (rto_opts.rto_should_stop)
				return (err);

			LOG(D_DEBUG, "[%d] ", x0);

			/* last target slot is the data column under test */
			tgtidx[2] = x0 + raidz_parity(rm);

			corrupt_colums(rm, tgtidx+2, 1);

			if (!opts->rto_sanity)
				vdev_raidz_reconstruct(rm, tgtidx, 3);

			if (cmp_data(opts, rm) != 0) {
				err++;
				LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0);
			}
		}

	} else if (fn < RAIDZ_REC_PQR) {
		/* can reconstruct 2 failed data disk */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_cols - raidz_parity(rm))
				continue;
			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
				if (x1 >= rm->rm_cols - raidz_parity(rm))
					continue;

				/* Check if should stop */
				if (rto_opts.rto_should_stop)
					return (err);

				LOG(D_DEBUG, "[%d %d] ", x0, x1);

				tgtidx[1] = x0 + raidz_parity(rm);
				tgtidx[2] = x1 + raidz_parity(rm);

				corrupt_colums(rm, tgtidx+1, 2);

				if (!opts->rto_sanity)
					vdev_raidz_reconstruct(rm, tgtidx, 3);

				if (cmp_data(opts, rm) != 0) {
					err++;
					LOG(D_DEBUG, "\nREC D[%d %d]... "
					    "[FAIL]\n", x0, x1);
				}
			}
		}
	} else {
		/* can reconstruct 3 failed data disk */
		for (x0 = 0; x0 < opts->rto_dcols; x0++) {
			if (x0 >= rm->rm_cols - raidz_parity(rm))
				continue;
			for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) {
				if (x1 >= rm->rm_cols - raidz_parity(rm))
					continue;
				for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) {
					if (x2 >=
					    rm->rm_cols - raidz_parity(rm))
						continue;

					/* Check if should stop */
					if (rto_opts.rto_should_stop)
						return (err);

					LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2);

					tgtidx[0] = x0 + raidz_parity(rm);
					tgtidx[1] = x1 + raidz_parity(rm);
					tgtidx[2] = x2 + raidz_parity(rm);

					corrupt_colums(rm, tgtidx, 3);

					if (!opts->rto_sanity)
						vdev_raidz_reconstruct(rm,
						    tgtidx, 3);

					if (cmp_data(opts, rm) != 0) {
						err++;
						LOG(D_DEBUG,
						    "\nREC D[%d %d %d]... "
						    "[FAIL]\n", x0, x1, x2);
					}
				}
			}
		}
	}
	return (err);
}
+
/*
 * Data-reconstruction test: for every supported implementation, build
 * one PQR map with valid parity and run every reconstruction method
 * over it via run_rec_check_impl().
 *
 * Returns the number of failed methods (0 == success).
 */
static int
run_rec_check(raidz_test_opts_t *opts)
{
	char **impl_name;
	unsigned fn, err = 0;
	zio_t *zio_test;
	raidz_map_t *rm_test;

	err = init_raidz_golden_map(opts, PARITY_PQR);
	if (0 != err)
		return (err);

	LOG(D_INFO, DBLSEP);
	LOG(D_INFO, "Testing data reconstruction...\n");

	/* +1 skips "original", the golden reference implementation. */
	for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL;
	    impl_name++) {

		LOG(D_INFO, SEP);
		LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name);

		if (vdev_raidz_impl_set(*impl_name) != 0) {
			LOG(D_INFO, "[SKIP]\n");
			continue;
		} else
			LOG(D_INFO, "[SUPPORTED]\n");


		/* create suitable raidz_map */
		rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR);
		/* generate parity */
		vdev_raidz_generate_parity(rm_test);

		for (fn = 0; fn < RAIDZ_REC_NUM; fn++) {

			LOG(D_INFO, "\t\tTesting method [%s] ...",
			    raidz_rec_name[fn]);

			if (run_rec_check_impl(opts, rm_test, fn) != 0) {
				LOG(D_INFO, "[FAIL]\n");
				err++;

			} else
				LOG(D_INFO, "[PASS]\n");

		}
		/* tear down test raidz_map */
		fini_raidz_map(&zio_test, &rm_test);
	}

	fini_raidz_map(&opts->zio_golden, &opts->rm_golden);

	return (err);
}
+
+static int
+run_test(raidz_test_opts_t *opts)
+{
+ int err = 0;
+
+ if (opts == NULL)
+ opts = &rto_opts;
+
+ print_opts(opts, B_FALSE);
+
+ err |= run_gen_check(opts);
+ err |= run_rec_check(opts);
+
+ return (err);
+}
+
/* Sweep status values for sweep_state. */
#define SWEEP_RUNNING 0
#define SWEEP_FINISHED 1
#define SWEEP_ERROR 2
#define SWEEP_TIMEOUT 3

/* Current sweep status; transitions are made under sem_mtx. */
static int sweep_state = 0;
/* Snapshot of the option set that produced the first failure. */
static raidz_test_opts_t failed_opts;

/* Counting semaphore (mutex + condvar) limiting how many sweep worker
 * threads run concurrently. */
static kmutex_t sem_mtx;
static kcondvar_t sem_cv;
static int max_free_slots;
static int free_slots;
+
/*
 * Sweep worker: run one full test with the heap-allocated option set
 * passed in arg (ownership is transferred here -- this thread frees
 * it), record the options on failure, then return the concurrency
 * slot and wake the dispatcher in run_sweep().
 */
static void
sweep_thread(void *arg)
{
	int err = 0;
	raidz_test_opts_t *opts = (raidz_test_opts_t *)arg;
	VERIFY(opts != NULL);

	err = run_test(opts);

	if (rto_opts.rto_sanity) {
		/* 25% chance that a sweep test fails */
		if (rand() < (RAND_MAX/4))
			err = 1;
	}

	if (0 != err) {
		mutex_enter(&sem_mtx);
		memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t));
		sweep_state = SWEEP_ERROR;
		mutex_exit(&sem_mtx);
	}

	umem_free(opts, sizeof (raidz_test_opts_t));

	/* signal the next thread */
	mutex_enter(&sem_mtx);
	free_slots++;
	cv_signal(&sem_cv);
	mutex_exit(&sem_mtx);

	thread_exit();
}
+
/*
 * Sweep mode (-S): run the full test over the cross product of data
 * column counts, ashifts and zio sizes, dispatching up to
 * MAX(2, boot_ncpus) worker threads.  Honors -t as a wall-clock
 * timeout and stops early after the first failing combination.
 *
 * Returns 0 on success, SWEEP_ERROR if any combination failed.
 */
static int
run_sweep(void)
{
	static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 };
	static const size_t ashift_v[] = { 9, 12, 14 };
	static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12),
		1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE };

	(void) setvbuf(stdout, NULL, _IONBF, 0);

	ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) *
	    ARRAY_SIZE(dcols_v);
	ulong_t tried_comb = 0;
	hrtime_t time_diff, start_time = gethrtime();
	raidz_test_opts_t *opts;
	int a, d, s;

	max_free_slots = free_slots = MAX(2, boot_ncpus);

	mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&sem_cv, NULL, CV_DEFAULT, NULL);

	for (s = 0; s < ARRAY_SIZE(size_v); s++)
	for (a = 0; a < ARRAY_SIZE(ashift_v); a++)
	for (d = 0; d < ARRAY_SIZE(dcols_v); d++) {

		/* combinations where the zio is smaller than one sector */
		if (size_v[s] < (1 << ashift_v[a])) {
			total_comb--;
			continue;
		}

		if (++tried_comb % 20 == 0)
			LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb);

		/* wait for signal to start new thread */
		mutex_enter(&sem_mtx);
		while (cv_timedwait_sig(&sem_cv, &sem_mtx,
		    ddi_get_lbolt() + hz)) {

			/* check if should stop the test (timeout) */
			time_diff = (gethrtime() - start_time) / NANOSEC;
			if (rto_opts.rto_sweep_timeout > 0 &&
			    time_diff >= rto_opts.rto_sweep_timeout) {
				sweep_state = SWEEP_TIMEOUT;
				rto_opts.rto_should_stop = B_TRUE;
				mutex_exit(&sem_mtx);
				goto exit;
			}

			/* check if should stop the test (error) */
			if (sweep_state != SWEEP_RUNNING) {
				mutex_exit(&sem_mtx);
				goto exit;
			}

			/* exit loop if a slot is available */
			if (free_slots > 0) {
				break;
			}
		}

		free_slots--;
		mutex_exit(&sem_mtx);

		/* sweep_thread() takes ownership of opts and frees it */
		opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL);
		opts->rto_ashift = ashift_v[a];
		opts->rto_dcols = dcols_v[d];
		opts->rto_offset = (1 << ashift_v[a]) * rand();
		opts->rto_dsize = size_v[s];
		opts->rto_v = 0; /* be quiet */

		VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts,
		    0, NULL, TS_RUN, defclsyspri), !=, NULL);
	}

exit:
	LOG(D_ALL, "\nWaiting for test threads to finish...\n");
	mutex_enter(&sem_mtx);
	VERIFY(free_slots <= max_free_slots);
	while (free_slots < max_free_slots) {
		(void) cv_wait(&sem_cv, &sem_mtx);
	}
	mutex_exit(&sem_mtx);

	if (sweep_state == SWEEP_ERROR) {
		ERR("Sweep test failed! Failed option: \n");
		print_opts(&failed_opts, B_TRUE);
	} else {
		if (sweep_state == SWEEP_TIMEOUT)
			LOG(D_ALL, "Test timeout (%lus). Stopping...\n",
			    (ulong_t)rto_opts.rto_sweep_timeout);

		LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n",
		    (ulong_t)tried_comb);
	}

	mutex_destroy(&sem_mtx);

	return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0);
}
+
+int
+main(int argc, char **argv)
+{
+ size_t i;
+ struct sigaction action;
+ int err = 0;
+
+ /* init gdb string early */
+ (void) sprintf(gdb, gdb_tmpl, getpid());
+
+ action.sa_handler = sig_handler;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+
+ if (sigaction(SIGSEGV, &action, NULL) < 0) {
+ ERR("raidz_test: cannot catch SIGSEGV: %s.\n", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ (void) setvbuf(stdout, NULL, _IOLBF, 0);
+
+ dprintf_setup(&argc, argv);
+
+ process_options(argc, argv);
+
+ kernel_init(SPA_MODE_READ);
+
+ /* setup random data because rand() is not reentrant */
+ rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+ srand((unsigned)time(NULL) * getpid());
+ for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++)
+ rand_data[i] = rand();
+
+ mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ);
+
+ if (rto_opts.rto_benchmark) {
+ run_raidz_benchmark();
+ } else if (rto_opts.rto_sweep) {
+ err = run_sweep();
+ } else {
+ err = run_test(NULL);
+ }
+
+ umem_free(rand_data, SPA_MAXBLOCKSIZE);
+ kernel_fini();
+
+ return (err);
+}
diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h
new file mode 100644
index 000000000000..09c825ae43c7
--- /dev/null
+++ b/cmd/raidz_test/raidz_test.h
@@ -0,0 +1,116 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#ifndef RAIDZ_TEST_H
+#define RAIDZ_TEST_H
+
+#include <sys/spa.h>
+
/*
 * All known raidz implementation names, NULL-terminated.  The first
 * entry ("original") is the scalar reference implementation used for
 * the golden copy; implementations the running CPU does not support
 * are skipped at runtime via vdev_raidz_impl_set().
 */
static const char *raidz_impl_names[] = {
	"original",
	"scalar",
	"sse2",
	"ssse3",
	"avx2",
	"avx512f",
	"avx512bw",
	"aarch64_neon",
	"aarch64_neonx2",
	"powerpc_altivec",
	NULL
};
+
/* Option set for one test run; see usage() for the flag semantics. */
typedef struct raidz_test_opts {
	size_t rto_ashift;		/* -a: zio ashift */
	size_t rto_offset;		/* -o: zio offset (bytes) */
	size_t rto_dcols;		/* -d: number of data columns */
	size_t rto_dsize;		/* -s: size of test data (bytes) */
	size_t rto_v;			/* -v: verbosity (D_ALL..D_DEBUG) */
	size_t rto_sweep;		/* -S: sweep parameter combinations */
	size_t rto_sweep_timeout;	/* -t: sweep timeout (seconds) */
	size_t rto_benchmark;		/* -B: benchmark implementations */
	size_t rto_sanity;		/* -T: verify failures are detected */
	size_t rto_gdb;			/* -D: attach gdb on fatal signal */

	/* non-user options */
	boolean_t rto_should_stop;	/* set on sweep timeout/failure */

	zio_t *zio_golden;		/* zio backing the golden map */
	raidz_map_t *rm_golden;		/* reference map ("original" impl) */
} raidz_test_opts_t;
+
/* Defaults applied before command-line parsing (see process_options()). */
static const raidz_test_opts_t rto_opts_defaults = {
	.rto_ashift = 9,
	.rto_offset = 1ULL << 0,
	.rto_dcols = 8,
	.rto_dsize = 1<<19,
	.rto_v = 0,
	.rto_sweep = 0,
	.rto_benchmark = 0,
	.rto_sanity = 0,
	.rto_gdb = 0,
	.rto_should_stop = B_FALSE
};
+
+extern raidz_test_opts_t rto_opts;
+
/* Integer floor(log2(a)); returns 0 for both a == 0 and a == 1. */
static inline size_t ilog2(size_t a)
{
	size_t r = 0;

	while (a > 1) {
		a >>= 1;
		r++;
	}
	return (r);
}
+
+
/* Verbosity levels for LOG()/LOG_OPT(). */
#define D_ALL 0
#define D_INFO 1
#define D_DEBUG 2

/*
 * Print to stdout when the global (LOG) or supplied (LOG_OPT)
 * verbosity is at least lvl.
 *
 * Fix: wrapped in do { } while (0) so each macro expands to a single
 * statement that is safe inside unbraced if/else; the previous bare
 * block ended with a stray line continuation that could splice the
 * following source line into the macro definition.
 */
#define LOG(lvl, a...) \
	do { \
		if (rto_opts.rto_v >= lvl) \
			(void) fprintf(stdout, a); \
	} while (0)

#define LOG_OPT(lvl, opt, a...) \
	do { \
		if (opt->rto_v >= lvl) \
			(void) fprintf(stdout, a); \
	} while (0)

#define ERR(a...) (void) fprintf(stderr, a)


#define DBLSEP "================\n"
#define SEP "----------------\n"


/* Test data always lives in linear (non-scattered) ABD buffers. */
#define raidz_alloc(size) abd_alloc(size, B_FALSE)
#define raidz_free(p, size) abd_free(p)
+
+
+void init_zio_abd(zio_t *zio);
+
+void run_raidz_benchmark(void);
+
+#endif /* RAIDZ_TEST_H */
diff --git a/cmd/vdev_id/Makefile.am b/cmd/vdev_id/Makefile.am
new file mode 100644
index 000000000000..fb815faad084
--- /dev/null
+++ b/cmd/vdev_id/Makefile.am
@@ -0,0 +1 @@
+dist_udev_SCRIPTS = vdev_id
diff --git a/cmd/vdev_id/vdev_id b/cmd/vdev_id/vdev_id
new file mode 100755
index 000000000000..8a75e638b67e
--- /dev/null
+++ b/cmd/vdev_id/vdev_id
@@ -0,0 +1,605 @@
+#!/bin/sh
+#
+# vdev_id: udev helper to generate user-friendly names for JBOD disks
+#
+# This script parses the file /etc/zfs/vdev_id.conf to map a
+# physical path in a storage topology to a channel name. The
+# channel name is combined with a disk enclosure slot number to
+# create an alias that reflects the physical location of the drive.
+# This is particularly helpful when it comes to tasks like replacing
+# failed drives. Slot numbers may also be re-mapped in case the
+# default numbering is unsatisfactory. The drive aliases will be
+# created as symbolic links in /dev/disk/by-vdev.
+#
+# The currently supported topologies are sas_direct and sas_switch.
+# A multipath mode is supported in which dm-mpath devices are
+# handled by examining the first-listed running component disk. In
+# multipath mode the configuration file should contain a channel
+# definition with the same name for each path to a given enclosure.
+#
+# The alias keyword provides a simple way to map already-existing
+# device symlinks to more convenient names. It is suitable for
+# small, static configurations or for sites that have some automated
+# way to generate the mapping file.
+#
+#
+# Some example configuration files are given below.
+
+# #
+# # Example vdev_id.conf - sas_direct.
+# #
+#
+# multipath no
+# topology sas_direct
+# phys_per_port 4
+# slot bay
+#
+# # PCI_ID HBA PORT CHANNEL NAME
+# channel 85:00.0 1 A
+# channel 85:00.0 0 B
+# channel 86:00.0 1 C
+# channel 86:00.0 0 D
+#
+# # Custom mapping for Channel A
+#
+# # Linux Mapped
+# # Slot Slot Channel
+# slot 1 7 A
+# slot 2 10 A
+# slot 3 3 A
+# slot 4 6 A
+#
+# # Default mapping for B, C, and D
+# slot 1 4
+# slot 2 2
+# slot 3 1
+# slot 4 3
+
+# #
+# # Example vdev_id.conf - sas_switch
+# #
+#
+# topology sas_switch
+#
+# # SWITCH PORT CHANNEL NAME
+# channel 1 A
+# channel 2 B
+# channel 3 C
+# channel 4 D
+
+# #
+# # Example vdev_id.conf - multipath
+# #
+#
+# multipath yes
+#
+# # PCI_ID HBA PORT CHANNEL NAME
+# channel 85:00.0 1 A
+# channel 85:00.0 0 B
+# channel 86:00.0 1 A
+# channel 86:00.0 0 B
+
+# #
+# # Example vdev_id.conf - alias
+# #
+#
+# # by-vdev
+# # name fully qualified or base name of device link
+# alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca
+# alias d2 wwn-0x5000c5002def789e
+
# Defaults; most may be overridden by command-line flags or by
# settings in $CONFIG (see usage()).
PATH=/bin:/sbin:/usr/bin:/usr/sbin
CONFIG=/etc/zfs/vdev_id.conf
PHYS_PER_PORT=
DEV=
MULTIPATH=
TOPOLOGY=
BAY=
+
# Print the help text and exit successfully.
# Fix: "-e" description said "enclose device symlinks"; the option
# creates *enclosure* symlinks (see enclosure_handler / /dev/by-enclosure).
usage() {
	cat << EOF
Usage: vdev_id [-h]
       vdev_id <-d device> [-c config_file] [-p phys_per_port]
               [-g sas_direct|sas_switch|scsi] [-m]

  -c    specify name of an alternative config file [default=$CONFIG]
  -d    specify basename of device (i.e. sda)
  -e    Create enclosure device symlinks only (/dev/by-enclosure)
  -g    Storage network topology [default="$TOPOLOGY"]
  -m    Run in multipath mode
  -p    number of phy's per switch port [default=$PHYS_PER_PORT]
  -h    show this summary
EOF
	exit 0
}
+
# Translate a Linux slot number into the administrator-defined slot
# for the given channel, as configured by "slot" lines in $CONFIG.
# Falls back to the Linux number when no mapping matches.
map_slot() {
	ms_linux=$1
	ms_chan=$2

	ms_mapped=$(awk "\$1 == \"slot\" && \$2 == ${ms_linux} && \
	            \$4 ~ /^${ms_chan}$|^$/ { print \$3; exit }" $CONFIG)
	printf "%d" "${ms_mapped:-$ms_linux}"
}
+
# Look up the channel name for a physical path in $CONFIG.
#   sas_switch:        keyed on the switch port number ($2)
#   sas_direct/scsi:   keyed on the HBA PCI ID ($1) and port ($2)
# Prints the channel name, or nothing when unmapped.
map_channel() {
	MAPPED_CHAN=
	PCI_ID=$1
	PORT=$2

	case $TOPOLOGY in
	"sas_switch")
		MAPPED_CHAN=`awk "\\$1 == \"channel\" && \\$2 == ${PORT} \
			{ print \\$3; exit }" $CONFIG`
		;;
	"sas_direct"|"scsi")
		MAPPED_CHAN=`awk "\\$1 == \"channel\" && \
			\\$2 == \"${PCI_ID}\" && \\$3 == ${PORT} \
			{ print \\$4; exit }" $CONFIG`
		;;
	esac
	# Quote the expansion so printf always receives exactly one
	# argument, even when the lookup produced nothing.
	printf "%s" "${MAPPED_CHAN}"
}
+
# Build the alias (CHANNEL + SLOT [+ -partN]) for a disk reached over
# SAS, in either sas_direct or sas_switch topology.  Walks the sysfs
# path of $DEV to find the HBA PCI ID, the phy/port number, and the
# enclosure slot, then maps them through $CONFIG.  Prints nothing if
# any step cannot be resolved (callers treat that as "no alias").
sas_handler() {
	if [ -z "$PHYS_PER_PORT" ] ; then
		PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \
			{print \\$2; exit}" $CONFIG`
	fi
	PHYS_PER_PORT=${PHYS_PER_PORT:-4}
	if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then
		echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric"
		exit 1
	fi

	if [ -z "$MULTIPATH_MODE" ] ; then
		MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \
			{print \\$2; exit}" $CONFIG`
	fi

	# Use first running component device if we're handling a dm-mpath device
	if [ "$MULTIPATH_MODE" = "yes" ] ; then
		# If udev didn't tell us the UUID via DM_NAME, check /dev/mapper
		if [ -z "$DM_NAME" ] ; then
			DM_NAME=`ls -l --full-time /dev/mapper |
				awk "/\/$DEV$/{print \\$9}"`
		fi

		# For raw disks udev exports DEVTYPE=partition when
		# handling partitions, and the rules can be written to
		# take advantage of this to append a -part suffix.  For
		# dm devices we get DEVTYPE=disk even for partitions so
		# we have to append the -part suffix directly in the
		# helper.
		if [ "$DEVTYPE" != "partition" ] ; then
			PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'`
		fi

		# Strip off partition information.
		DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'`
		if [ -z "$DM_NAME" ] ; then
			return
		fi

		# Get the raw scsi device name from multipath -ll. Strip off
		# leading pipe symbols to make field numbering consistent.
		DEV=`multipath -ll $DM_NAME |
			awk '/running/{gsub("^[|]"," "); print $3 ; exit}'`
		if [ -z "$DEV" ] ; then
			return
		fi
	fi

	if echo $DEV | grep -q ^/devices/ ; then
		sys_path=$DEV
	else
		sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null`
	fi

	# Use positional parameters as an ad-hoc array
	set -- $(echo "$sys_path" | tr / ' ')
	num_dirs=$#
	scsi_host_dir="/sys"

	# Get path up to /sys/.../hostX
	i=1
	while [ $i -le $num_dirs ] ; do
		d=$(eval echo \${$i})
		scsi_host_dir="$scsi_host_dir/$d"
		echo $d | grep -q -E '^host[0-9]+$' && break
		i=$(($i + 1))
	done

	# No hostX component found in the path: not a SAS device.
	if [ $i = $num_dirs ] ; then
		return
	fi

	# The PCI address precedes hostX; keep only bus:dev.fn.
	PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}')

	# In sas_switch mode, the directory four levels beneath
	# /sys/.../hostX contains symlinks to phy devices that reveal
	# the switch port number.  In sas_direct mode, the phy links one
	# directory down reveal the HBA port.
	port_dir=$scsi_host_dir
	case $TOPOLOGY in
	"sas_switch") j=$(($i + 4)) ;;
	"sas_direct") j=$(($i + 1)) ;;
	esac

	i=$(($i + 1))
	while [ $i -le $j ] ; do
		port_dir="$port_dir/$(eval echo \${$i})"
		i=$(($i + 1))
	done

	PHY=`ls -d $port_dir/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}'`
	if [ -z "$PHY" ] ; then
		PHY=0
	fi
	PORT=$(( $PHY / $PHYS_PER_PORT ))

	# Look in /sys/.../sas_device/end_device-X for the bay_identifier
	# attribute.
	end_device_dir=$port_dir
	while [ $i -lt $num_dirs ] ; do
		d=$(eval echo \${$i})
		end_device_dir="$end_device_dir/$d"
		if echo $d | grep -q '^end_device' ; then
			end_device_dir="$end_device_dir/sas_device/$d"
			break
		fi
		i=$(($i + 1))
	done

	# Derive the slot number according to the configured "slot" scheme.
	SLOT=
	case $BAY in
	"bay")
		SLOT=`cat $end_device_dir/bay_identifier 2>/dev/null`
		;;
	"phy")
		SLOT=`cat $end_device_dir/phy_identifier 2>/dev/null`
		;;
	"port")
		d=$(eval echo \${$i})
		SLOT=`echo $d | sed -e 's/^.*://'`
		;;
	"id")
		i=$(($i + 1))
		d=$(eval echo \${$i})
		SLOT=`echo $d | sed -e 's/^.*://'`
		;;
	"lun")
		i=$(($i + 2))
		d=$(eval echo \${$i})
		SLOT=`echo $d | sed -e 's/^.*://'`
		;;
	"ses")
		# look for this SAS path in all SCSI Enclosure Services
		# (SES) enclosures
		sas_address=`cat $end_device_dir/sas_address 2>/dev/null`
		enclosures=`lsscsi -g | \
			sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p'`
		for enclosure in $enclosures; do
			set -- $(sg_ses -p aes $enclosure | \
				awk "/device slot number:/{slot=\$12} \
					/SAS address: $sas_address/\
					{print slot}")
			SLOT=$1
			if [ -n "$SLOT" ] ; then
				break
			fi
		done
		;;
	esac
	if [ -z "$SLOT" ] ; then
		return
	fi

	CHAN=`map_channel $PCI_ID $PORT`
	SLOT=`map_slot $SLOT $CHAN`
	if [ -z "$CHAN" ] ; then
		return
	fi
	echo ${CHAN}${SLOT}${PART}
}
+
# Build the alias (CHANNEL + SLOT [+ -partN]) for a disk in "scsi"
# topology, where the port and slot are parsed directly from the
# SCSI target address (hostX/targetH:C:T) in the sysfs path rather
# than from SAS phy/end_device entries.  Prints nothing if the path
# cannot be resolved.
scsi_handler() {
	if [ -z "$FIRST_BAY_NUMBER" ] ; then
		FIRST_BAY_NUMBER=`awk "\\$1 == \"first_bay_number\" \
			{print \\$2; exit}" $CONFIG`
	fi
	FIRST_BAY_NUMBER=${FIRST_BAY_NUMBER:-0}

	if [ -z "$PHYS_PER_PORT" ] ; then
		PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \
			{print \\$2; exit}" $CONFIG`
	fi
	PHYS_PER_PORT=${PHYS_PER_PORT:-4}
	if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then
		echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric"
		exit 1
	fi

	if [ -z "$MULTIPATH_MODE" ] ; then
		MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \
			{print \\$2; exit}" $CONFIG`
	fi

	# Use first running component device if we're handling a dm-mpath device
	if [ "$MULTIPATH_MODE" = "yes" ] ; then
		# If udev didn't tell us the UUID via DM_NAME, check /dev/mapper
		if [ -z "$DM_NAME" ] ; then
			DM_NAME=`ls -l --full-time /dev/mapper |
				awk "/\/$DEV$/{print \\$9}"`
		fi

		# For raw disks udev exports DEVTYPE=partition when
		# handling partitions, and the rules can be written to
		# take advantage of this to append a -part suffix.  For
		# dm devices we get DEVTYPE=disk even for partitions so
		# we have to append the -part suffix directly in the
		# helper.
		if [ "$DEVTYPE" != "partition" ] ; then
			PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'`
		fi

		# Strip off partition information.
		DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'`
		if [ -z "$DM_NAME" ] ; then
			return
		fi

		# Get the raw scsi device name from multipath -ll. Strip off
		# leading pipe symbols to make field numbering consistent.
		DEV=`multipath -ll $DM_NAME |
			awk '/running/{gsub("^[|]"," "); print $3 ; exit}'`
		if [ -z "$DEV" ] ; then
			return
		fi
	fi

	if echo $DEV | grep -q ^/devices/ ; then
		sys_path=$DEV
	else
		sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null`
	fi

	# expect sys_path like this, for example:
	# /devices/pci0000:00/0000:00:0b.0/0000:09:00.0/0000:0a:05.0/0000:0c:00.0/host3/target3:1:0/3:1:0:21/block/sdv

	# Use positional parameters as an ad-hoc array
	set -- $(echo "$sys_path" | tr / ' ')
	num_dirs=$#
	scsi_host_dir="/sys"

	# Get path up to /sys/.../hostX
	i=1
	while [ $i -le $num_dirs ] ; do
		d=$(eval echo \${$i})
		scsi_host_dir="$scsi_host_dir/$d"
		echo $d | grep -q -E '^host[0-9]+$' && break
		i=$(($i + 1))
	done

	# No hostX component found: not a device we can handle.
	if [ $i = $num_dirs ] ; then
		return
	fi

	# The PCI address precedes hostX; keep only bus:dev.fn.
	PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}')

	# In scsi mode, the directory two levels beneath
	# /sys/.../hostX reveals the port and slot.
	port_dir=$scsi_host_dir
	j=$(($i + 2))

	i=$(($i + 1))
	while [ $i -le $j ] ; do
		port_dir="$port_dir/$(eval echo \${$i})"
		i=$(($i + 1))
	done

	# targetH:C:T -> PORT=C, SLOT=T (+ configured bay offset)
	set -- $(echo $port_dir | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/')
	PORT=$1
	SLOT=$(($2 + $FIRST_BAY_NUMBER))

	if [ -z "$SLOT" ] ; then
		return
	fi

	CHAN=`map_channel $PCI_ID $PORT`
	SLOT=`map_slot $SLOT $CHAN`
	if [ -z "$CHAN" ] ; then
		return
	fi
	echo ${CHAN}${SLOT}${PART}
}
+
# Figure out the name for the enclosure symlink (-e mode): resolve the
# udev DEVPATH of an sg node to its enclosure, extract the HBA PCI ID
# and port, and print the matching channel name from $CONFIG (with a
# per-channel occurrence counter appended, e.g. "L0", "L1").  Prints
# nothing when the device is not an enclosure.
enclosure_handler () {
	# We get all the info we need from udev's DEVPATH variable:
	#
	# DEVPATH=/sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/subsystem/devices/0:0:0:0/scsi_generic/sg0

	# Get the enclosure ID ("0:0:0:0")
	ENC=$(basename $(readlink -m "/sys/$DEVPATH/../.."))
	if [ ! -d /sys/class/enclosure/$ENC ] ; then
		# Not an enclosure, bail out
		return
	fi

	# Get the long sysfs device path to our enclosure. Looks like:
	# /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0/ ... /enclosure/0:0:0:0

	ENC_DEVICE=$(readlink /sys/class/enclosure/$ENC)

	# Grab the full path to the hosts port dir:
	# /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0
	PORT_DIR=$(echo $ENC_DEVICE | grep -Eo '.+host[0-9]+/port-[0-9]+:[0-9]+')

	# Get the port number
	PORT_ID=$(echo $PORT_DIR | grep -Eo "[0-9]+$")

	# The PCI directory is two directories up from the port directory
	# /sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0
	PCI_ID_LONG=$(basename $(readlink -m "/sys/$PORT_DIR/../.."))

	# Strip down the PCI address from 0000:05:00.0 to 05:00.0
	PCI_ID=$(echo "$PCI_ID_LONG" | sed -r 's/^[0-9]+://g')

	# Name our device according to vdev_id.conf (like "L0" or "U1").
	NAME=$(awk "/channel/{if (\$1 == \"channel\" && \$2 == \"$PCI_ID\" && \
	    \$3 == \"$PORT_ID\") {print \$4int(count[\$4])}; count[\$4]++}" $CONFIG)

	echo "${NAME}"
}
+
# Map a device to a user-defined alias from vdev_id.conf, emitting the alias
# name (plus a -part suffix for device-mapper partitions) on stdout.  Emits
# nothing when no alias matches.  Reads udev environment: DM_NAME, DEVTYPE,
# DEVLINKS; reads $CONFIG.
alias_handler () {
	# Special handling is needed to correctly append a -part suffix
	# to partitions of device mapper devices. The DEVTYPE attribute
	# is normally set to "disk" instead of "partition" in this case,
	# so the udev rules won't handle that for us as they do for
	# "plain" block devices.
	#
	# For example, we may have the following links for a device and its
	# partitions,
	#
	#  /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0   -> ../../dm-0
	#  /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p1 -> ../../dm-1
	#  /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p2 -> ../../dm-3
	#
	# and the following alias in vdev_id.conf.
	#
	#   alias A0 dm-name-isw_dibgbfcije_ARRAY0
	#
	# The desired outcome is for the following links to be created
	# without having explicitly defined aliases for the partitions.
	#
	#  /dev/disk/by-vdev/A0       -> ../../dm-0
	#  /dev/disk/by-vdev/A0-part1 -> ../../dm-1
	#  /dev/disk/by-vdev/A0-part2 -> ../../dm-3
	#
	# Warning: The following grep pattern will misidentify whole-disk
	#          devices whose names end with 'p' followed by a string of
	#          digits as partitions, causing alias creation to fail. This
	#          ambiguity seems unavoidable, so devices using this facility
	#          must not use such names.
	DM_PART=
	if echo $DM_NAME | grep -q -E 'p[0-9][0-9]*$' ; then
		if [ "$DEVTYPE" != "partition" ] ; then
			# Extract the digits after the 'p' as "-partN".
			DM_PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'`
		fi
	fi

	# DEVLINKS attribute must have been populated by already-run udev rules.
	for link in $DEVLINKS ; do
		# Remove partition information to match key of top-level device.
		if [ -n "$DM_PART" ] ; then
			link=`echo $link | sed 's/p[0-9][0-9]*$//'`
		fi
		# Check both the fully qualified and the base name of link.
		# The escaped \$ keep $1..$3 as awk fields while ${l} is
		# interpolated by the shell.
		for l in $link `basename $link` ; do
			alias=`awk "\\$1 == \"alias\" && \\$3 == \"${l}\" \
			    { print \\$2; exit }" $CONFIG`
			if [ -n "$alias" ] ; then
				echo ${alias}${DM_PART}
				return
			fi
		done
	done
}
+
# Parse command-line options.  Invalid options fall through silently and
# rely on getopts' own diagnostic.
while getopts 'c:d:eg:mp:h' OPTION; do
	case ${OPTION} in
	c)
		# Alternate configuration file (default set elsewhere).
		CONFIG=${OPTARG}
		;;
	d)
		# Device (kernel name) to map.
		DEV=${OPTARG}
		;;
	e)
		# When udev sees a scsi_generic device, it calls this script with -e to
		# create the enclosure device symlinks only.  We also need
		# "enclosure_symlinks yes" set in vdev_id.config to actually create the
		# symlink.
		ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") print $2}' $CONFIG)
		if [ "$ENCLOSURE_MODE" != "yes" ] ; then
			exit 0
		fi
		;;
	g)
		# Topology (e.g. sas_direct, sas_switch, scsi); overrides $CONFIG.
		TOPOLOGY=$OPTARG
		;;
	p)
		# PHYs per port.
		PHYS_PER_PORT=${OPTARG}
		;;
	m)
		# Enable multipath handling.
		MULTIPATH_MODE=yes
		;;
	h)
		usage
		;;
	esac
done
+
# Main: validate inputs, load defaults from $CONFIG, then print the
# ID_VDEV/ID_ENCLOSURE key=value pairs that the udev rule consumes.

# No config file is not an error: just produce no mapping.
if [ ! -r $CONFIG ] ; then
	exit 0
fi

# -d is mandatory except in enclosure mode (-e), which derives everything
# from udev's DEVPATH instead.
if [ -z "$DEV" ] && [ -z "$ENCLOSURE_MODE" ] ; then
	echo "Error: missing required option -d"
	exit 1
fi

# Fall back to the config file for topology/slot when not given via options.
if [ -z "$TOPOLOGY" ] ; then
	TOPOLOGY=`awk "\\$1 == \"topology\" {print \\$2; exit}" $CONFIG`
fi

if [ -z "$BAY" ] ; then
	BAY=`awk "\\$1 == \"slot\" {print \\$2; exit}" $CONFIG`
fi

TOPOLOGY=${TOPOLOGY:-sas_direct}

# Should we create /dev/by-enclosure symlinks?
if [ "$ENCLOSURE_MODE" = "yes" ] && [ "$TOPOLOGY" = "sas_direct" ] ; then
	ID_ENCLOSURE=$(enclosure_handler)
	if [ -z "$ID_ENCLOSURE" ] ; then
		exit 0
	fi

	# Just create the symlinks to the enclosure devices and then exit.
	ENCLOSURE_PREFIX=$(awk '/enclosure_symlinks_prefix/{print $2}' $CONFIG)
	if [ -z "$ENCLOSURE_PREFIX" ] ; then
		ENCLOSURE_PREFIX="enc"
	fi
	echo "ID_ENCLOSURE=$ID_ENCLOSURE"
	echo "ID_ENCLOSURE_PATH=by-enclosure/$ENCLOSURE_PREFIX-$ID_ENCLOSURE"
	exit 0
fi

# First check if an alias was defined for this device.
ID_VDEV=`alias_handler`

# No alias: derive the name from the physical topology instead.
if [ -z "$ID_VDEV" ] ; then
	BAY=${BAY:-bay}
	case $TOPOLOGY in
		sas_direct|sas_switch)
			ID_VDEV=`sas_handler`
			;;
		scsi)
			ID_VDEV=`scsi_handler`
			;;
		*)
			echo "Error: unknown topology $TOPOLOGY"
			exit 1
			;;
	esac
fi

if [ -n "$ID_VDEV" ] ; then
	echo "ID_VDEV=${ID_VDEV}"
	echo "ID_VDEV_PATH=disk/by-vdev/${ID_VDEV}"
fi
diff --git a/cmd/zdb/.gitignore b/cmd/zdb/.gitignore
new file mode 100644
index 000000000000..f64a3fc5a160
--- /dev/null
+++ b/cmd/zdb/.gitignore
@@ -0,0 +1 @@
+/zdb
diff --git a/cmd/zdb/Makefile.am b/cmd/zdb/Makefile.am
new file mode 100644
index 000000000000..b325cb060bd2
--- /dev/null
+++ b/cmd/zdb/Makefile.am
@@ -0,0 +1,16 @@
include $(top_srcdir)/config/Rules.am

# Unconditionally enable debugging for zdb
AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG

sbin_PROGRAMS = zdb

zdb_SOURCES = \
	zdb.c \
	zdb_il.c \
	zdb.h

# Link against the userland pool implementation and its support libraries.
zdb_LDADD = \
	$(abs_top_builddir)/lib/libzpool/libzpool.la \
	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
	$(abs_top_builddir)/lib/libnvpair/libnvpair.la
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
new file mode 100644
index 000000000000..e7211711a41c
--- /dev/null
+++ b/cmd/zdb/zdb.c
@@ -0,0 +1,8606 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc.
+ * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015, 2017, Intel Corporation.
+ * Copyright (c) 2020 Datto Inc.
+ * Copyright (c) 2020, The FreeBSD Foundation [1]
+ *
+ * [1] Portions of this software were developed by Allan Jude
+ * under sponsorship from the FreeBSD Foundation.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dbuf.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_traverse.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/zfs_fuid.h>
+#include <sys/arc.h>
+#include <sys/arc_impl.h>
+#include <sys/ddt.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#include <sys/blkptr.h>
+#include <sys/dsl_crypt.h>
+#include <sys/dsl_scan.h>
+#include <sys/btree.h>
+#include <zfs_comutil.h>
+#include <sys/zstd/zstd.h>
+
+#include <libnvpair.h>
+#include <libzutil.h>
+
+#include "zdb.h"
+
/*
 * Translate on-disk enum indices into printable names, tolerating
 * out-of-range values that may be read from a damaged pool.
 */
#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
	zio_compress_table[(idx)].ci_name : "UNKNOWN")
#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \
	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
/* Collapse the DMU_OTN_* "new type" encodings onto legacy display types. */
#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \
	(idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \
	DMU_OT_ZAP_OTHER : \
	(idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \
	DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES)
+
+static char *
+zdb_ot_name(dmu_object_type_t type)
+{
+ if (type < DMU_OT_NUMTYPES)
+ return (dmu_ot[type].ot_name);
+ else if ((type & DMU_OT_NEWTYPE) &&
+ ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS))
+ return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name);
+ else
+ return ("UNKNOWN");
+}
+
/* Tunables defined in libzpool/module code, adjustable here via -o. */
extern int reference_tracking_enable;
extern int zfs_recover;
extern unsigned long zfs_arc_meta_min, zfs_arc_meta_limit;
extern int zfs_vdev_async_read_max_active;
extern boolean_t spa_load_verify_dryrun;
extern int zfs_reconstruct_indirect_combinations_max;
extern int zfs_btree_verify_intensity;

static const char cmdname[] = "zdb";
/*
 * Per-option-letter counters; giving a flag more than once raises only
 * that option's verbosity (see usage()).
 */
uint8_t dump_opt[256];

/* Signature of the per-object-type dump functions. */
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);

/* Metaslab numbers collected from the command line (used with -m). */
uint64_t *zopt_metaslab = NULL;
static unsigned zopt_metaslab_args = 0;

/* One object-number range parsed from <start>:<end>[:<flags>]. */
typedef struct zopt_object_range {
	uint64_t zor_obj_start;
	uint64_t zor_obj_end;
	uint64_t zor_flags;	/* ZOR_FLAG_* type-selection bits */
} zopt_object_range_t;
zopt_object_range_t *zopt_object_ranges = NULL;
static unsigned zopt_object_args = 0;

static int flagbits[256];

/* Object-type selection flags for object ranges (see usage()). */
#define ZOR_FLAG_PLAIN_FILE	0x0001
#define ZOR_FLAG_DIRECTORY	0x0002
#define ZOR_FLAG_SPACE_MAP	0x0004
#define ZOR_FLAG_ZAP		0x0008
#define ZOR_FLAG_ALL_TYPES	-1
#define ZOR_SUPPORTED_FLAGS	(ZOR_FLAG_PLAIN_FILE | \
				ZOR_FLAG_DIRECTORY | \
				ZOR_FLAG_SPACE_MAP | \
				ZOR_FLAG_ZAP)

/* Flags controlling how a single block is read/shown (-R). */
#define ZDB_FLAG_CHECKSUM	0x0001
#define ZDB_FLAG_DECOMPRESS	0x0002
#define ZDB_FLAG_BSWAP		0x0004
#define ZDB_FLAG_GBH		0x0008
#define ZDB_FLAG_INDIRECT	0x0010
#define ZDB_FLAG_RAW		0x0020
#define ZDB_FLAG_PRINT_BLKPTR	0x0040
#define ZDB_FLAG_VERBOSE	0x0080

uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
static int leaked_objects = 0;
static range_tree_t *mos_refd_objs;

/* Forward declarations for helpers defined later in this file. */
static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
    boolean_t);
static void mos_obj_refd(uint64_t);
static void mos_obj_refd_multiple(uint64_t);
static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx);

/* Working state for verifying one sub-livelist (see -y). */
typedef struct sublivelist_verify {
	/* all ALLOC'd blkptr_t in one sub-livelist */
	zfs_btree_t sv_all_allocs;

	/* all FREE'd blkptr_t in one sub-livelist */
	zfs_btree_t sv_all_frees;

	/* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */
	zfs_btree_t sv_pair;

	/* ALLOC's without a matching FREE, accumulates across sub-livelists */
	zfs_btree_t sv_leftover;
} sublivelist_verify_t;
+
+static int
+livelist_compare(const void *larg, const void *rarg)
+{
+ const blkptr_t *l = larg;
+ const blkptr_t *r = rarg;
+
+ /* Sort them according to dva[0] */
+ uint64_t l_dva0_vdev, r_dva0_vdev;
+ l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
+ r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
+ if (l_dva0_vdev < r_dva0_vdev)
+ return (-1);
+ else if (l_dva0_vdev > r_dva0_vdev)
+ return (+1);
+
+ /* if vdevs are equal, sort by offsets. */
+ uint64_t l_dva0_offset;
+ uint64_t r_dva0_offset;
+ l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
+ r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
+ if (l_dva0_offset < r_dva0_offset) {
+ return (-1);
+ } else if (l_dva0_offset > r_dva0_offset) {
+ return (+1);
+ }
+
+ /*
+ * Since we're storing blkptrs without cancelling FREE/ALLOC pairs,
+ * it's possible the offsets are equal. In that case, sort by txg
+ */
+ if (l->blk_birth < r->blk_birth) {
+ return (-1);
+ } else if (l->blk_birth > r->blk_birth) {
+ return (+1);
+ }
+ return (0);
+}
+
/* A single livelist ALLOC that never found a matching FREE. */
typedef struct sublivelist_verify_block {
	dva_t svb_dva;

	/*
	 * We need this to check if the block marked as allocated
	 * in the livelist was freed (and potentially reallocated)
	 * in the metaslab spacemaps at a later TXG.
	 */
	uint64_t svb_allocated_txg;
} sublivelist_verify_block_t;

static void zdb_print_blkptr(const blkptr_t *bp, int flags);
+
/*
 * bpobj iteration callback for one sub-livelist: record every FREE and
 * ALLOC entry, report duplicates, cancel an ALLOC against a previously
 * seen FREE, and stash unmatched ALLOC DVAs in sv->sv_leftover for the
 * later metaslab cross-check.  Always returns 0 so iteration continues.
 */
static int
sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx)
{
	/* Only used via bpobj_iterate_nofree(), which passes no tx. */
	ASSERT3P(tx, ==, NULL);
	struct sublivelist_verify *sv = arg;
	char blkbuf[BP_SPRINTF_LEN];
	zfs_btree_index_t where;
	if (free) {
		zfs_btree_add(&sv->sv_pair, bp);
		/* Check if the FREE is a duplicate */
		if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) {
			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
			    free);
			(void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf);
		} else {
			zfs_btree_add_idx(&sv->sv_all_frees, bp, &where);
		}
	} else {
		/* Check if the ALLOC has been freed */
		if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) {
			/* Matching FREE seen earlier: cancel the pair. */
			zfs_btree_remove_idx(&sv->sv_pair, &where);
		} else {
			/* Unmatched ALLOC: record each non-empty DVA. */
			for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
				if (DVA_IS_EMPTY(&bp->blk_dva[i]))
					break;
				sublivelist_verify_block_t svb = {
					.svb_dva = bp->blk_dva[i],
					.svb_allocated_txg = bp->blk_birth
				};

				if (zfs_btree_find(&sv->sv_leftover, &svb,
				    &where) == NULL) {
					zfs_btree_add_idx(&sv->sv_leftover,
					    &svb, &where);
				}
			}
		}
		/* Check if the ALLOC is a duplicate */
		if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) {
			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp,
			    free);
			(void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf);
		} else {
			zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where);
		}
	}
	return (0);
}
+
/*
 * Verify one deadlist entry (sub-livelist): build the bookkeeping trees,
 * walk the entry's bpobj through sublivelist_verify_blkptr(), then report
 * any FREEs left without a matching ALLOC.  Unmatched ALLOCs remain in
 * sv->sv_leftover, which the caller created and owns.  Returns the bpobj
 * iteration error, if any.
 */
static int
sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
{
	int err;
	char blkbuf[BP_SPRINTF_LEN];
	struct sublivelist_verify *sv = args;

	zfs_btree_create(&sv->sv_all_allocs, livelist_compare,
	    sizeof (blkptr_t));

	zfs_btree_create(&sv->sv_all_frees, livelist_compare,
	    sizeof (blkptr_t));

	zfs_btree_create(&sv->sv_pair, livelist_compare,
	    sizeof (blkptr_t));

	err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
	    sv, NULL);

	/* The per-entry trees are only needed during the walk above. */
	zfs_btree_clear(&sv->sv_all_allocs);
	zfs_btree_destroy(&sv->sv_all_allocs);

	zfs_btree_clear(&sv->sv_all_frees);
	zfs_btree_destroy(&sv->sv_all_frees);

	/* Whatever is still paired up is a FREE with no ALLOC. */
	blkptr_t *e;
	zfs_btree_index_t *cookie = NULL;
	while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE);
		(void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf);
	}
	zfs_btree_destroy(&sv->sv_pair);

	return (err);
}
+
+static int
+livelist_block_compare(const void *larg, const void *rarg)
+{
+ const sublivelist_verify_block_t *l = larg;
+ const sublivelist_verify_block_t *r = rarg;
+
+ if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva))
+ return (-1);
+ else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva))
+ return (+1);
+
+ if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva))
+ return (-1);
+ else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva))
+ return (+1);
+
+ if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva))
+ return (-1);
+ else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva))
+ return (+1);
+
+ return (0);
+}
+
/*
 * Check for errors in a livelist while tracking all unfreed ALLOCs in the
 * sublivelist_verify_t: sv->sv_leftover
 */
static void
livelist_verify(dsl_deadlist_t *dl, void *arg)
{
	sublivelist_verify_t *sv = arg;
	/* Runs sublivelist_verify_func() once per deadlist entry. */
	dsl_deadlist_iterate(dl, sublivelist_verify_func, sv);
}
+
/*
 * Check for errors in the livelist entry and discard the intermediary
 * data structures
 */
/* ARGSUSED */
static int
sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
{
	/* Throwaway state: leftovers are not carried across entries here. */
	sublivelist_verify_t sv;
	zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
	    sizeof (sublivelist_verify_block_t));
	int err = sublivelist_verify_func(&sv, dle);
	zfs_btree_clear(&sv.sv_leftover);
	zfs_btree_destroy(&sv.sv_leftover);
	return (err);
}
+
/* Per-metaslab state for the spacemap/livelist cross-check. */
typedef struct metaslab_verify {
	/*
	 * Tree containing all the leftover ALLOCs from the livelists
	 * that are part of this metaslab.
	 */
	zfs_btree_t mv_livelist_allocs;

	/*
	 * Metaslab information.
	 */
	uint64_t mv_vdid;
	uint64_t mv_msid;
	uint64_t mv_start;
	uint64_t mv_end;

	/*
	 * What's currently allocated for this metaslab.
	 */
	range_tree_t *mv_allocated;
} metaslab_verify_t;

/* Callback applied to each deleted livelist (see iterate_deleted_livelists). */
typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);

/* Callback applied to each unflushed log spacemap entry with its log txg. */
typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,
    void *arg);

/* Context wrapper threading a zdb_log_sm_cb_t through space_map_iterate(). */
typedef struct unflushed_iter_cb_arg {
	spa_t *uic_spa;
	uint64_t uic_txg;
	void *uic_arg;
	zdb_log_sm_cb_t uic_cb;
} unflushed_iter_cb_arg_t;
+
/*
 * space_map_iterate() trampoline: unwrap the context and forward the entry
 * to the user's callback along with the log's txg.
 */
static int
iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
{
	unflushed_iter_cb_arg_t *uic = arg;
	return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
}
+
/*
 * Apply cb to every entry of every unflushed log space map, walking the
 * logs in txg order.  Holds SCL_CONFIG as reader for the duration; no-op
 * when the log-spacemap feature is not active.
 */
static void
iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return;

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
		space_map_t *sm = NULL;
		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));

		unflushed_iter_cb_arg_t uic = {
			.uic_spa = spa,
			.uic_txg = sls->sls_txg,
			.uic_arg = arg,
			.uic_cb = cb
		};
		VERIFY0(space_map_iterate(sm, space_map_length(sm),
		    iterate_through_spacemap_logs_cb, &uic));
		space_map_close(sm);
	}
	spa_config_exit(spa, SCL_CONFIG, FTAG);
}
+
/*
 * Given a spacemap FREE of [offset, offset+size) at txg, report every
 * livelist ALLOC in that range whose allocation txg is <= txg: such a
 * block is marked allocated in a livelist yet was later freed on disk.
 */
static void
verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
    uint64_t offset, uint64_t size)
{
	sublivelist_verify_block_t svb;
	DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);
	DVA_SET_OFFSET(&svb.svb_dva, offset);
	DVA_SET_ASIZE(&svb.svb_dva, size);
	zfs_btree_index_t where;
	uint64_t end_offset = offset + size;

	/*
	 * Look for an exact match for spacemap entry in the livelist entries.
	 * Then, look for other livelist entries that fall within the range
	 * of the spacemap entry as it may have been condensed
	 */
	sublivelist_verify_block_t *found =
	    zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where);
	if (found == NULL) {
		/* No exact match: start from the insertion point. */
		found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
	}
	for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid &&
	    DVA_GET_OFFSET(&found->svb_dva) < end_offset;
	    found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
		if (found->svb_allocated_txg <= txg) {
			(void) printf("ERROR: Livelist ALLOC [%llx:%llx] "
			    "from TXG %llx FREED at TXG %llx\n",
			    (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva),
			    (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),
			    (u_longlong_t)found->svb_allocated_txg,
			    (u_longlong_t)txg);
		}
	}
}
+
+static int
+metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
+{
+ metaslab_verify_t *mv = arg;
+ uint64_t offset = sme->sme_offset;
+ uint64_t size = sme->sme_run;
+ uint64_t txg = sme->sme_txg;
+
+ if (sme->sme_type == SM_ALLOC) {
+ if (range_tree_contains(mv->mv_allocated,
+ offset, size)) {
+ (void) printf("ERROR: DOUBLE ALLOC: "
+ "%llu [%llx:%llx] "
+ "%llu:%llu LOG_SM\n",
+ (u_longlong_t)txg, (u_longlong_t)offset,
+ (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
+ (u_longlong_t)mv->mv_msid);
+ } else {
+ range_tree_add(mv->mv_allocated,
+ offset, size);
+ }
+ } else {
+ if (!range_tree_contains(mv->mv_allocated,
+ offset, size)) {
+ (void) printf("ERROR: DOUBLE FREE: "
+ "%llu [%llx:%llx] "
+ "%llu:%llu LOG_SM\n",
+ (u_longlong_t)txg, (u_longlong_t)offset,
+ (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
+ (u_longlong_t)mv->mv_msid);
+ } else {
+ range_tree_remove(mv->mv_allocated,
+ offset, size);
+ }
+ }
+
+ if (sme->sme_type != SM_ALLOC) {
+ /*
+ * If something is freed in the spacemap, verify that
+ * it is not listed as allocated in the livelist.
+ */
+ verify_livelist_allocs(mv, txg, offset, size);
+ }
+ return (0);
+}
+
/*
 * Log-spacemap iteration callback: filter entries down to the metaslab
 * under verification and to entries newer than its last flush, then hand
 * them to metaslab_spacemap_validation_cb().
 */
static int
spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
    uint64_t txg, void *arg)
{
	metaslab_verify_t *mv = arg;
	uint64_t offset = sme->sme_offset;
	uint64_t vdev_id = sme->sme_vdev;

	vdev_t *vd = vdev_lookup_top(spa, vdev_id);

	/* skip indirect vdevs */
	if (!vdev_is_concrete(vd))
		return (0);

	if (vdev_id != mv->mv_vdid)
		return (0);

	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	if (ms->ms_id != mv->mv_msid)
		return (0);

	/* Entries older than the last flush are already in ms_sm. */
	if (txg < metaslab_unflushed_txg(ms))
		return (0);


	ASSERT3U(txg, ==, sme->sme_txg);
	return (metaslab_spacemap_validation_cb(sme, mv));
}
+
/* Validate mv against all unflushed log spacemap entries for its metaslab. */
static void
spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
{
	iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv);
}
+
+static void
+spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv)
+{
+ if (sm == NULL)
+ return;
+
+ VERIFY0(space_map_iterate(sm, space_map_length(sm),
+ metaslab_spacemap_validation_cb, mv));
+}
+
static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);

/*
 * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if
 * they are part of that metaslab (mv_msid).
 */
static void
mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
{
	zfs_btree_index_t where;
	sublivelist_verify_block_t *svb;
	ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);
	for (svb = zfs_btree_first(&sv->sv_leftover, &where);
	    svb != NULL;
	    svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) {
		if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid)
			continue;

		/* Straddles the metaslab's lower boundary: report, skip. */
		if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start &&
		    (DVA_GET_OFFSET(&svb->svb_dva) +
		    DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) {
			(void) printf("ERROR: Found block that crosses "
			    "metaslab boundary: <%llu:%llx:%llx>\n",
			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
			continue;
		}

		/* Entirely outside [mv_start, mv_end): not ours. */
		if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start)
			continue;

		if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end)
			continue;

		/* Straddles the metaslab's upper boundary: report, skip. */
		if ((DVA_GET_OFFSET(&svb->svb_dva) +
		    DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) {
			(void) printf("ERROR: Found block that crosses "
			    "metaslab boundary: <%llu:%llx:%llx>\n",
			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
			continue;
		}

		zfs_btree_add(&mv->mv_livelist_allocs, svb);
	}

	/*
	 * Remove the transferred blocks from sv_leftover in a second pass
	 * so the first loop's iterator is not invalidated.
	 */
	for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);
	    svb != NULL;
	    svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
		zfs_btree_remove(&sv->sv_leftover, svb);
	}
}
+
/*
 * [Livelist Check]
 * Iterate through all the sublivelists and:
 * - report leftover frees
 * - report double ALLOCs/FREEs
 * - record leftover ALLOCs together with their TXG [see Cross Check]
 *
 * [Spacemap Check]
 * for each metaslab:
 * - iterate over spacemap and then the metaslab's entries in the
 *   spacemap log, then report any double FREEs and ALLOCs (do not
 *   blow up).
 *
 * [Cross Check]
 * After finishing the Livelist Check phase and while being in the
 * Spacemap Check phase, we find all the recorded leftover ALLOCs
 * of the livelist check that are part of the metaslab that we are
 * currently looking at in the Spacemap Check. We report any entries
 * that are marked as ALLOCs in the livelists but have been actually
 * freed (and potentially allocated again) after their TXG stamp in
 * the spacemaps. Also report any ALLOCs from the livelists that
 * belong to indirect vdevs (e.g. their vdev completed removal).
 *
 * Note that this will miss Log Spacemap entries that cancelled each other
 * out before being flushed to the metaslab, so we are not guaranteed
 * to match all erroneous ALLOCs.
 */
static void
livelist_metaslab_validate(spa_t *spa)
{
	(void) printf("Verifying deleted livelist entries\n");

	sublivelist_verify_t sv;
	zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
	    sizeof (sublivelist_verify_block_t));
	/* Livelist Check: fills sv.sv_leftover with unmatched ALLOCs. */
	iterate_deleted_livelists(spa, livelist_verify, &sv);

	(void) printf("Verifying metaslab entries\n");
	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		if (!vdev_is_concrete(vd))
			continue;

		for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {
			metaslab_t *m = vd->vdev_ms[mid];

			/* Progress indicator, overwritten in place. */
			(void) fprintf(stderr,
			    "\rverifying concrete vdev %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)vd->vdev_id,
			    (longlong_t)mid,
			    (longlong_t)vd->vdev_ms_count);

			uint64_t shift, start;
			range_seg_type_t type =
			    metaslab_calculate_range_tree_type(vd, m,
			    &start, &shift);
			metaslab_verify_t mv;
			mv.mv_allocated = range_tree_create(NULL,
			    type, NULL, start, shift);
			mv.mv_vdid = vd->vdev_id;
			mv.mv_msid = m->ms_id;
			mv.mv_start = m->ms_start;
			mv.mv_end = m->ms_start + m->ms_size;
			zfs_btree_create(&mv.mv_livelist_allocs,
			    livelist_block_compare,
			    sizeof (sublivelist_verify_block_t));

			/* Cross Check: claim this metaslab's leftovers. */
			mv_populate_livelist_allocs(&mv, &sv);

			/* Spacemap Check: flushed map, then the log. */
			spacemap_check_ms_sm(m->ms_sm, &mv);
			spacemap_check_sm_log(spa, &mv);

			range_tree_vacate(mv.mv_allocated, NULL, NULL);
			range_tree_destroy(mv.mv_allocated);
			zfs_btree_clear(&mv.mv_livelist_allocs);
			zfs_btree_destroy(&mv.mv_livelist_allocs);
		}
	}
	(void) fprintf(stderr, "\n");

	/*
	 * If there are any segments in the leftover tree after we walked
	 * through all the metaslabs in the concrete vdevs then this means
	 * that we have segments in the livelists that belong to indirect
	 * vdevs and are marked as allocated.
	 */
	if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {
		zfs_btree_destroy(&sv.sv_leftover);
		return;
	}
	(void) printf("ERROR: Found livelist blocks marked as allocated "
	    "for indirect vdevs:\n");

	zfs_btree_index_t *where = NULL;
	sublivelist_verify_block_t *svb;
	while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=
	    NULL) {
		int vdev_id = DVA_GET_VDEV(&svb->svb_dva);
		ASSERT3U(vdev_id, <, rvd->vdev_children);
		vdev_t *vd = rvd->vdev_child[vdev_id];
		ASSERT(!vdev_is_concrete(vd));
		(void) printf("<%d:%llx:%llx> TXG %llx\n",
		    vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
		    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),
		    (u_longlong_t)svb->svb_allocated_txg);
	}
	(void) printf("\n");
	zfs_btree_destroy(&sv.sv_leftover);
}
+
/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}
+
/* libumem hook: default transaction/content logging settings. */
const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}
+
/* Print zdb's full command-line help to stderr and exit(1). */
static void
usage(void)
{
	(void) fprintf(stderr,
	    "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
	    "[-I <inflight I/Os>]\n"
	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]\n"
	    "\t%s [-v] <bookmark>\n"
	    "\t%s -C [-A] [-U <cache>]\n"
	    "\t%s -l [-Aqu] <device>\n"
	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
	    "\t%s -O <dataset> <path>\n"
	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
	    "\t%s -E [-A] word0:word1:...:word15\n"
	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
	    "<poolname>\n\n",
	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
	    cmdname, cmdname, cmdname);

	(void) fprintf(stderr, "    Dataset name must include at least one "
	    "separator character '/' or '@'\n");
	(void) fprintf(stderr, "    If dataset name is specified, only that "
	    "dataset is dumped\n");
	(void) fprintf(stderr, "    If object numbers or object number "
	    "ranges are specified, only those\n"
	    "    objects or ranges are dumped.\n\n");
	(void) fprintf(stderr,
	    "    Object ranges take the form <start>:<end>[:<flags>]\n"
	    "        start    Starting object number\n"
	    "        end      Ending object number, or -1 for no upper bound\n"
	    "        flags    Optional flags to select object types:\n"
	    "            A     All objects (this is the default)\n"
	    "            d     ZFS directories\n"
	    "            f     ZFS files \n"
	    "            m     SPA space maps\n"
	    "            z     ZAPs\n"
	    "            -     Negate effect of next flag\n\n");
	(void) fprintf(stderr, "    Options to control amount of output:\n");
	(void) fprintf(stderr, "        -b block statistics\n");
	(void) fprintf(stderr, "        -c checksum all metadata (twice for "
	    "all data) blocks\n");
	(void) fprintf(stderr, "        -C config (or cachefile if alone)\n");
	(void) fprintf(stderr, "        -d dataset(s)\n");
	(void) fprintf(stderr, "        -D dedup statistics\n");
	(void) fprintf(stderr, "        -E decode and display block from an "
	    "embedded block pointer\n");
	(void) fprintf(stderr, "        -h pool history\n");
	(void) fprintf(stderr, "        -i intent logs\n");
	(void) fprintf(stderr, "        -l read label contents\n");
	(void) fprintf(stderr, "        -k examine the checkpointed state "
	    "of the pool\n");
	(void) fprintf(stderr, "        -L disable leak tracking (do not "
	    "load spacemaps)\n");
	(void) fprintf(stderr, "        -m metaslabs\n");
	(void) fprintf(stderr, "        -M metaslab groups\n");
	(void) fprintf(stderr, "        -O perform object lookups by path\n");
	(void) fprintf(stderr, "        -R read and display block from a "
	    "device\n");
	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
	(void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
	(void) fprintf(stderr, "        -v verbose (applies to all "
	    "others)\n");
	(void) fprintf(stderr, "        -y perform livelist and metaslab "
	    "validation on any livelists being deleted\n\n");
	(void) fprintf(stderr, "    Below options are intended for use "
	    "with other options:\n");
	(void) fprintf(stderr, "        -A ignore assertions (-A), enable "
	    "panic recovery (-AA) or both (-AAA)\n");
	(void) fprintf(stderr, "        -e pool is exported/destroyed/"
	    "has altroot/not in a cachefile\n");
	(void) fprintf(stderr, "        -F attempt automatic rewind within "
	    "safe range of transaction groups\n");
	(void) fprintf(stderr, "        -G dump zfs_dbgmsg buffer before "
	    "exiting\n");
	(void) fprintf(stderr, "        -I <number of inflight I/Os> -- "
	    "specify the maximum number of\n           "
	    "checksumming I/Os [default is 200]\n");
	(void) fprintf(stderr, "        -o <variable>=<value> set global "
	    "variable to an unsigned 32-bit integer\n");
	(void) fprintf(stderr, "        -p <path> -- use one or more with "
	    "-e to specify path to vdev dir\n");
	(void) fprintf(stderr, "        -P print numbers in parseable form\n");
	(void) fprintf(stderr, "        -q don't print label contents\n");
	(void) fprintf(stderr, "        -t <txg> -- highest txg to use when "
	    "searching for uberblocks\n");
	(void) fprintf(stderr, "        -u uberblock\n");
	(void) fprintf(stderr, "        -U <cachefile_path> -- use alternate "
	    "cachefile\n");
	(void) fprintf(stderr, "        -V do verbatim import\n");
	(void) fprintf(stderr, "        -x <dumpdir> -- "
	    "dump all read blocks into specified directory\n");
	(void) fprintf(stderr, "        -X attempt extreme rewind (does not "
	    "work with dataset)\n");
	(void) fprintf(stderr, "        -Y attempt all reconstruction "
	    "combinations for split blocks\n");
	(void) fprintf(stderr, "        -Z show ZSTD headers \n");
	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
	    "to make only that option verbose\n");
	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
	exit(1);
}
+
+static void
+dump_debug_buffer(void)
+{
+ if (dump_opt['G']) {
+ (void) printf("\n");
+ (void) fflush(stdout);
+ zfs_dbgmsg_print("zdb");
+ }
+}
+
+/*
+ * Called for usage errors that are discovered after a call to spa_open(),
+ * dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
+ */
+
+static void
+fatal(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ (void) fprintf(stderr, "%s: ", cmdname);
+ (void) vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ (void) fprintf(stderr, "\n");
+
+ dump_debug_buffer();
+
+ exit(1);
+}
+
/* ARGSUSED */
/*
 * Object callback: read a packed nvlist from 'object' and pretty-print it.
 * The packed size is taken from the first word of the bonus data ('data').
 */
static void
dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
{
	nvlist_t *nv;
	size_t nvsize = *(uint64_t *)data;	/* bonus holds the packed size */
	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);

	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));

	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);

	/* the packed buffer is no longer needed once unpacked */
	umem_free(packed, nvsize);

	dump_nvlist(nv, 8);

	nvlist_free(nv);
}
+
+/* ARGSUSED */
+static void
+dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ spa_history_phys_t *shp = data;
+
+ if (shp == NULL)
+ return;
+
+ (void) printf("\t\tpool_create_len = %llu\n",
+ (u_longlong_t)shp->sh_pool_create_len);
+ (void) printf("\t\tphys_max_off = %llu\n",
+ (u_longlong_t)shp->sh_phys_max_off);
+ (void) printf("\t\tbof = %llu\n",
+ (u_longlong_t)shp->sh_bof);
+ (void) printf("\t\teof = %llu\n",
+ (u_longlong_t)shp->sh_eof);
+ (void) printf("\t\trecords_lost = %llu\n",
+ (u_longlong_t)shp->sh_records_lost);
+}
+
+static void
+zdb_nicenum(uint64_t num, char *buf, size_t buflen)
+{
+ if (dump_opt['P'])
+ (void) snprintf(buf, buflen, "%llu", (longlong_t)num);
+ else
+ nicenum(num, buf, sizeof (buf));
+}
+
/* Bar of stars used to render histogram rows; width excludes the NUL. */
static const char histo_stars[] = "****************************************";
static const uint64_t histo_width = sizeof (histo_stars) - 1;
+
+static void
+dump_histogram(const uint64_t *histo, int size, int offset)
+{
+ int i;
+ int minidx = size - 1;
+ int maxidx = 0;
+ uint64_t max = 0;
+
+ for (i = 0; i < size; i++) {
+ if (histo[i] > max)
+ max = histo[i];
+ if (histo[i] > 0 && i > maxidx)
+ maxidx = i;
+ if (histo[i] > 0 && i < minidx)
+ minidx = i;
+ }
+
+ if (max < histo_width)
+ max = histo_width;
+
+ for (i = minidx; i <= maxidx; i++) {
+ (void) printf("\t\t\t%3u: %6llu %s\n",
+ i + offset, (u_longlong_t)histo[i],
+ &histo_stars[(max - histo[i]) * histo_width / max]);
+ }
+}
+
/*
 * Print ZAP usage statistics for 'object': a one-line summary for a
 * microzap, or the full fat-ZAP breakdown (pointer table, entry/leaf/
 * block counts, and distribution histograms).  Returns silently if the
 * stats cannot be obtained.
 */
static void
dump_zap_stats(objset_t *os, uint64_t object)
{
	int error;
	zap_stats_t zs;

	error = zap_get_stats(os, object, &zs);
	if (error)
		return;

	/* a zero-length pointer table means this is a microzap */
	if (zs.zs_ptrtbl_len == 0) {
		ASSERT(zs.zs_num_blocks == 1);
		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
		    (u_longlong_t)zs.zs_blocksize,
		    (u_longlong_t)zs.zs_num_entries);
		return;
	}

	(void) printf("\tFat ZAP stats:\n");

	(void) printf("\t\tPointer table:\n");
	(void) printf("\t\t\t%llu elements\n",
	    (u_longlong_t)zs.zs_ptrtbl_len);
	(void) printf("\t\t\tzt_blk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
	(void) printf("\t\t\tzt_numblks: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
	(void) printf("\t\t\tzt_shift: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
	(void) printf("\t\t\tzt_blks_copied: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
	(void) printf("\t\t\tzt_nextblk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_nextblk);

	(void) printf("\t\tZAP entries: %llu\n",
	    (u_longlong_t)zs.zs_num_entries);
	(void) printf("\t\tLeaf blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_leafs);
	(void) printf("\t\tTotal blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_blocks);
	(void) printf("\t\tzap_block_type: 0x%llx\n",
	    (u_longlong_t)zs.zs_block_type);
	(void) printf("\t\tzap_magic: 0x%llx\n",
	    (u_longlong_t)zs.zs_magic);
	(void) printf("\t\tzap_salt: 0x%llx\n",
	    (u_longlong_t)zs.zs_salt);

	(void) printf("\t\tLeafs with 2^n pointers:\n");
	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks with n*5 entries:\n");
	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks n/10 full:\n");
	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tEntries with n chunks:\n");
	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBuckets with n entries:\n");
	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
}
+
/*ARGSUSED*/
/* Object callback for object types whose contents are not dumped. */
static void
dump_none(objset_t *os, uint64_t object, void *data, size_t size)
{
}
+
/*ARGSUSED*/
/* Object callback for object types zdb does not recognize. */
static void
dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) printf("\tUNKNOWN OBJECT TYPE\n");
}
+
/*ARGSUSED*/
/* Object callback for uint8 arrays; contents are intentionally skipped. */
static void
dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
{
}
+
+/*ARGSUSED*/
+static void
+dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ uint64_t *arr;
+ uint64_t oursize;
+ if (dump_opt['d'] < 6)
+ return;
+
+ if (data == NULL) {
+ dmu_object_info_t doi;
+
+ VERIFY0(dmu_object_info(os, object, &doi));
+ size = doi.doi_max_offset;
+ /*
+ * We cap the size at 1 mebibyte here to prevent
+ * allocation failures and nigh-infinite printing if the
+ * object is extremely large.
+ */
+ oursize = MIN(size, 1 << 20);
+ arr = kmem_alloc(oursize, KM_SLEEP);
+
+ int err = dmu_read(os, object, 0, oursize, arr, 0);
+ if (err != 0) {
+ (void) printf("got error %u from dmu_read\n", err);
+ kmem_free(arr, oursize);
+ return;
+ }
+ } else {
+ /*
+ * Even though the allocation is already done in this code path,
+ * we still cap the size to prevent excessive printing.
+ */
+ oursize = MIN(size, 1 << 20);
+ arr = data;
+ }
+
+ if (size == 0) {
+ (void) printf("\t\t[]\n");
+ return;
+ }
+
+ (void) printf("\t\t[%0llx", (u_longlong_t)arr[0]);
+ for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) {
+ if (i % 4 != 0)
+ (void) printf(", %0llx", (u_longlong_t)arr[i]);
+ else
+ (void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]);
+ }
+ if (oursize != size)
+ (void) printf(", ... ");
+ (void) printf("]\n");
+
+ if (data == NULL)
+ kmem_free(arr, oursize);
+}
+
+/*ARGSUSED*/
+static void
+dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ void *prop;
+ unsigned i;
+
+ dump_zap_stats(os, object);
+ (void) printf("\n");
+
+ for (zap_cursor_init(&zc, os, object);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ zap_cursor_advance(&zc)) {
+ (void) printf("\t\t%s = ", attr.za_name);
+ if (attr.za_num_integers == 0) {
+ (void) printf("\n");
+ continue;
+ }
+ prop = umem_zalloc(attr.za_num_integers *
+ attr.za_integer_length, UMEM_NOFAIL);
+ (void) zap_lookup(os, object, attr.za_name,
+ attr.za_integer_length, attr.za_num_integers, prop);
+ if (attr.za_integer_length == 1) {
+ (void) printf("%s", (char *)prop);
+ } else {
+ for (i = 0; i < attr.za_num_integers; i++) {
+ switch (attr.za_integer_length) {
+ case 2:
+ (void) printf("%u ",
+ ((uint16_t *)prop)[i]);
+ break;
+ case 4:
+ (void) printf("%u ",
+ ((uint32_t *)prop)[i]);
+ break;
+ case 8:
+ (void) printf("%lld ",
+ (u_longlong_t)((int64_t *)prop)[i]);
+ break;
+ }
+ }
+ }
+ (void) printf("\n");
+ umem_free(prop, attr.za_num_integers * attr.za_integer_length);
+ }
+ zap_cursor_fini(&zc);
+}
+
/*
 * Object callback: print the bpobj_phys_t in 'data' and, at verbosity
 * -ddddd or higher, every block pointer stored in 'object'.  The bpobj
 * has grown over time, so fields beyond the original layout are only
 * printed when 'size' shows the on-disk struct is new enough to have them.
 */
static void
dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
{
	bpobj_phys_t *bpop = data;
	uint64_t i;
	char bytes[32], comp[32], uncomp[32];

	/* make sure the output won't get truncated */
	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);

	if (bpop == NULL)
		return;

	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
	zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
	zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));

	(void) printf("\t\tnum_blkptrs = %llu\n",
	    (u_longlong_t)bpop->bpo_num_blkptrs);
	(void) printf("\t\tbytes = %s\n", bytes);
	/* compression accounting was added in the V1 layout */
	if (size >= BPOBJ_SIZE_V1) {
		(void) printf("\t\tcomp = %s\n", comp);
		(void) printf("\t\tuncomp = %s\n", uncomp);
	}
	/* sub-bpobj support was added in the V2 layout */
	if (size >= BPOBJ_SIZE_V2) {
		(void) printf("\t\tsubobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_subobjs);
		(void) printf("\t\tnum_subobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_num_subobjs);
	}
	/* freed-block accounting only exists in the current layout */
	if (size >= sizeof (*bpop)) {
		(void) printf("\t\tnum_freed = %llu\n",
		    (u_longlong_t)bpop->bpo_num_freed);
	}

	if (dump_opt['d'] < 5)
		return;

	/* read and print each stored block pointer individually */
	for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
		char blkbuf[BP_SPRINTF_LEN];
		blkptr_t bp;

		int err = dmu_read(os, object,
		    i * sizeof (bp), sizeof (bp), &bp, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			break;
		}
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
		    BP_GET_FREE(&bp));
		(void) printf("\t%s\n", blkbuf);
	}
}
+
/* ARGSUSED */
/*
 * Object callback: print the sub-bpobj object numbers stored in a bpobj
 * subobjs array.  The whole array is read into memory, then printed up
 * to the last nonzero entry (trailing zeroes are unused slots).
 */
static void
dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
{
	dmu_object_info_t doi;
	int64_t i;

	VERIFY0(dmu_object_info(os, object, &doi));
	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);

	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
	if (err != 0) {
		(void) printf("got error %u from dmu_read\n", err);
		kmem_free(subobjs, doi.doi_max_offset);
		return;
	}

	/* find the last populated slot so we don't print trailing zeroes */
	int64_t last_nonzero = -1;
	for (i = 0; i < doi.doi_max_offset / 8; i++) {
		if (subobjs[i] != 0)
			last_nonzero = i;
	}

	for (i = 0; i <= last_nonzero; i++) {
		(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
	}
	kmem_free(subobjs, doi.doi_max_offset);
}
+
/*ARGSUSED*/
/*
 * Object callback for DDT ZAPs: only the ZAP statistics are printed
 * here; the entries themselves are decoded and printed by dump_ddt().
 */
static void
dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
	dump_zap_stats(os, object);
	/* contents are printed elsewhere, properly decoded */
}
+
/*ARGSUSED*/
/*
 * Object callback: print the system-attribute (SA) registration ZAP.
 * Each entry's first integer packs the attribute's length, byteswap
 * function index, and attribute number, decoded via the ATTR_* macros.
 */
static void
dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
{
	zap_cursor_t zc;
	zap_attribute_t attr;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = ", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}
		/* decode the packed attr word: [length:byteswap:number] */
		(void) printf(" %llx : [%d:%d:%d]\n",
		    (u_longlong_t)attr.za_first_integer,
		    (int)ATTR_LENGTH(attr.za_first_integer),
		    (int)ATTR_BSWAP(attr.za_first_integer),
		    (int)ATTR_NUM(attr.za_first_integer));
	}
	zap_cursor_fini(&zc);
}
+
/*ARGSUSED*/
/*
 * Object callback: print the SA layout ZAP.  Each entry maps a layout
 * name to an array of 16-bit attribute numbers, printed as a bracketed
 * list.  NOTE(review): an entry with za_num_integers == 0 leaves the
 * opening '[' unbalanced in the output — cosmetic only; confirm intent.
 */
static void
dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
{
	zap_cursor_t zc;
	zap_attribute_t attr;
	uint16_t *layout_attrs;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = [", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}

		/* layouts are always stored as 16-bit attribute numbers */
		VERIFY(attr.za_integer_length == 2);
		layout_attrs = umem_zalloc(attr.za_num_integers *
		    attr.za_integer_length, UMEM_NOFAIL);

		VERIFY(zap_lookup(os, object, attr.za_name,
		    attr.za_integer_length,
		    attr.za_num_integers, layout_attrs) == 0);

		for (i = 0; i != attr.za_num_integers; i++)
			(void) printf(" %d ", (int)layout_attrs[i]);
		(void) printf("]\n");
		umem_free(layout_attrs,
		    attr.za_num_integers * attr.za_integer_length);
	}
	zap_cursor_fini(&zc);
}
+
/*ARGSUSED*/
/*
 * Object callback: print a ZPL directory ZAP.  Each entry's first
 * integer packs the child object number and a 4-bit file type, decoded
 * via ZFS_DIRENT_OBJ()/ZFS_DIRENT_TYPE() and the table below.
 */
static void
dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
{
	zap_cursor_t zc;
	zap_attribute_t attr;
	/* indexed by the 4-bit dirent type field */
	const char *typenames[] = {
		/* 0 */ "not specified",
		/* 1 */ "FIFO",
		/* 2 */ "Character Device",
		/* 3 */ "3 (invalid)",
		/* 4 */ "Directory",
		/* 5 */ "5 (invalid)",
		/* 6 */ "Block Device",
		/* 7 */ "7 (invalid)",
		/* 8 */ "Regular File",
		/* 9 */ "9 (invalid)",
		/* 10 */ "Symbolic Link",
		/* 11 */ "11 (invalid)",
		/* 12 */ "Socket",
		/* 13 */ "Door",
		/* 14 */ "Event Port",
		/* 15 */ "15 (invalid)",
	};

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = %lld (type: %s)\n",
		    attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
	}
	zap_cursor_fini(&zc);
}
+
+static int
+get_dtl_refcount(vdev_t *vd)
+{
+ int refcount = 0;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ space_map_t *sm = vd->vdev_dtl_sm;
+
+ if (sm != NULL &&
+ sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
+ return (1);
+ return (0);
+ }
+
+ for (unsigned c = 0; c < vd->vdev_children; c++)
+ refcount += get_dtl_refcount(vd->vdev_child[c]);
+ return (refcount);
+}
+
+static int
+get_metaslab_refcount(vdev_t *vd)
+{
+ int refcount = 0;
+
+ if (vd->vdev_top == vd) {
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ space_map_t *sm = vd->vdev_ms[m]->ms_sm;
+
+ if (sm != NULL &&
+ sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
+ refcount++;
+ }
+ }
+ for (unsigned c = 0; c < vd->vdev_children; c++)
+ refcount += get_metaslab_refcount(vd->vdev_child[c]);
+
+ return (refcount);
+}
+
/*
 * Count, over the vdev tree rooted at 'vd', the obsolete space maps
 * (from device removal) whose bonus size matches the current
 * space_map_phys_t layout.  Non-top-level vdevs are asserted to have
 * no obsolete space map at all.
 */
static int
get_obsolete_refcount(vdev_t *vd)
{
	uint64_t obsolete_sm_object;
	int refcount = 0;

	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (vd->vdev_top == vd && obsolete_sm_object != 0) {
		dmu_object_info_t doi;
		VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
		    obsolete_sm_object, &doi));
		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
			refcount++;
		}
	} else {
		/* only top-level vdevs may carry an obsolete space map */
		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
		ASSERT3U(obsolete_sm_object, ==, 0);
	}
	for (unsigned c = 0; c < vd->vdev_children; c++) {
		refcount += get_obsolete_refcount(vd->vdev_child[c]);
	}

	return (refcount);
}
+
+static int
+get_prev_obsolete_spacemap_refcount(spa_t *spa)
+{
+ uint64_t prev_obj =
+ spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
+ if (prev_obj != 0) {
+ dmu_object_info_t doi;
+ VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
+ if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
+ return (1);
+ }
+ }
+ return (0);
+}
+
+static int
+get_checkpoint_refcount(vdev_t *vd)
+{
+ int refcount = 0;
+
+ if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
+ zap_contains(spa_meta_objset(vd->vdev_spa),
+ vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
+ refcount++;
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++)
+ refcount += get_checkpoint_refcount(vd->vdev_child[c]);
+
+ return (refcount);
+}
+
/* Return the number of log space maps currently tracked by the pool. */
static int
get_log_spacemap_refcount(spa_t *spa)
{
	return (avl_numnodes(&spa->spa_sm_logs_by_txg));
}
+
/*
 * Compare the SPACEMAP_HISTOGRAM feature refcount recorded in the pool
 * against the number of histogram-capable space maps actually found by
 * walking the vdev tree and pool state.  Returns 0 on match, 2 on
 * mismatch (reported to stdout).
 */
static int
verify_spacemap_refcounts(spa_t *spa)
{
	uint64_t expected_refcount = 0;
	uint64_t actual_refcount;

	(void) feature_get_refcount(spa,
	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
	    &expected_refcount);
	/* sum every kind of space map that bumps the feature refcount */
	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
	actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
	actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
	actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
	actual_refcount += get_log_spacemap_refcount(spa);

	if (expected_refcount != actual_refcount) {
		(void) printf("space map refcount mismatch: expected %lld != "
		    "actual %lld\n",
		    (longlong_t)expected_refcount,
		    (longlong_t)actual_refcount);
		return (2);
	}
	return (0);
}
+
/*
 * Print the header of space map 'sm' and, at sufficient verbosity
 * (-dddddd or -mmmm), decode and print every on-disk entry.  Entries
 * may be debug words, one-word alloc/free records, or two-word records
 * (with vdev id); a running allocation total is kept and checked
 * against the space map's own summary at the end.
 */
static void
dump_spacemap(objset_t *os, space_map_t *sm)
{
	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
	    "INVALID", "INVALID", "INVALID", "INVALID" };

	if (sm == NULL)
		return;

	(void) printf("space map object %llu:\n",
	    (longlong_t)sm->sm_object);
	(void) printf("  smp_length = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_length);
	(void) printf("  smp_alloc = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_alloc);

	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
		return;

	/*
	 * Print out the freelist entries in both encoded and decoded form.
	 */
	uint8_t mapshift = sm->sm_shift;
	int64_t alloc = 0;
	uint64_t word, entry_id = 0;
	for (uint64_t offset = 0; offset < space_map_length(sm);
	    offset += sizeof (word)) {

		VERIFY0(dmu_read(os, space_map_object(sm), offset,
		    sizeof (word), &word, DMU_READ_PREFETCH));

		if (sm_entry_is_debug(word)) {
			uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
			uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
			/* txg 0 marks padding rather than a real record */
			if (de_txg == 0) {
				(void) printf(
				    "\t    [%6llu] PADDING\n",
				    (u_longlong_t)entry_id);
			} else {
				(void) printf(
				    "\t    [%6llu] %s: txg %llu pass %llu\n",
				    (u_longlong_t)entry_id,
				    ddata[SM_DEBUG_ACTION_DECODE(word)],
				    (u_longlong_t)de_txg,
				    (u_longlong_t)de_sync_pass);
			}
			entry_id++;
			continue;
		}

		uint8_t words;
		char entry_type;
		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;

		if (sm_entry_is_single_word(word)) {
			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
			    'A' : 'F';
			/* offsets are stored relative to sm_start, scaled */
			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
			    sm->sm_start;
			entry_run = SM_RUN_DECODE(word) << mapshift;
			words = 1;
		} else {
			/* it is a two-word entry so we read another word */
			ASSERT(sm_entry_is_double_word(word));

			uint64_t extra_word;
			offset += sizeof (extra_word);
			VERIFY0(dmu_read(os, space_map_object(sm), offset,
			    sizeof (extra_word), &extra_word,
			    DMU_READ_PREFETCH));

			ASSERT3U(offset, <=, space_map_length(sm));

			entry_run = SM2_RUN_DECODE(word) << mapshift;
			entry_vdev = SM2_VDEV_DECODE(word);
			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
			    mapshift) + sm->sm_start;
			words = 2;
		}

		(void) printf("\t    [%6llu]    %c  range:"
		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
		    (u_longlong_t)entry_id,
		    entry_type, (u_longlong_t)entry_off,
		    (u_longlong_t)(entry_off + entry_run),
		    (u_longlong_t)entry_run,
		    (u_longlong_t)entry_vdev, words);

		/* track net allocation to verify against the summary */
		if (entry_type == 'A')
			alloc += entry_run;
		else
			alloc -= entry_run;
		entry_id++;
	}
	if (alloc != space_map_allocated(sm)) {
		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
		    "with space map summary (%lld)\n",
		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
	}
}
+
/*
 * Print in-memory statistics for a loaded metaslab: segment count,
 * largest allocatable segment, free percentage, and the in-memory
 * range-tree histogram.  Caller must have loaded the metaslab.
 */
static void
dump_metaslab_stats(metaslab_t *msp)
{
	char maxbuf[32];
	range_tree_t *rt = msp->ms_allocatable;
	zfs_btree_t *t = &msp->ms_allocatable_by_size;
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	/* make sure nicenum has enough space */
	CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ);

	zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));

	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
	    "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
	    "freepct", free_pct);
	(void) printf("\tIn-memory histogram:\n");
	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}
+
/*
 * Print one metaslab: its id/offset/spacemap/free summary line and,
 * depending on the -m verbosity level, in-memory stats (requires
 * loading the metaslab under ms_lock), the on-disk spacemap histogram,
 * the raw spacemap, and log-spacemap flush data.
 */
static void
dump_metaslab(metaslab_t *msp)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	space_map_t *sm = msp->ms_sm;
	char freebuf[32];

	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
	    sizeof (freebuf));

	(void) printf(
	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
	    (u_longlong_t)space_map_object(sm), freebuf);

	/* -mmm: load the metaslab to show in-memory stats (not with -L) */
	if (dump_opt['m'] > 2 && !dump_opt['L']) {
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));
		range_tree_stat_verify(msp->ms_allocatable);
		dump_metaslab_stats(msp);
		metaslab_unload(msp);
		mutex_exit(&msp->ms_lock);
	}

	if (dump_opt['m'] > 1 && sm != NULL &&
	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		/*
		 * The space map histogram represents free space in chunks
		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
		 */
		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
		    (u_longlong_t)msp->ms_fragmentation);
		dump_histogram(sm->sm_phys->smp_histogram,
		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
	}

	ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);

	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
		(void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
		    (u_longlong_t)metaslab_unflushed_txg(msp));
	}
}
+
/*
 * Print the per-vdev header for metaslab listings: vdev id, allocation
 * bias (log/special/dedup), the unflushed-phys object when the vdev's
 * top ZAP has one, and the column headings for dump_metaslab() rows.
 */
static void
print_vdev_metaslab_header(vdev_t *vd)
{
	vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
	const char *bias_str = "";
	if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) {
		bias_str = VDEV_ALLOC_BIAS_LOG;
	} else if (alloc_bias == VDEV_BIAS_SPECIAL) {
		bias_str = VDEV_ALLOC_BIAS_SPECIAL;
	} else if (alloc_bias == VDEV_BIAS_DEDUP) {
		bias_str = VDEV_ALLOC_BIAS_DEDUP;
	}

	/* look up the object tracking unflushed metaslab phys txgs */
	uint64_t ms_flush_data_obj = 0;
	if (vd->vdev_top_zap != 0) {
		int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
		    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
		    sizeof (uint64_t), 1, &ms_flush_data_obj);
		/* a missing entry is fine; anything else is unexpected */
		if (error != ENOENT) {
			ASSERT0(error);
		}
	}

	(void) printf("\tvdev %10llu   %s",
	    (u_longlong_t)vd->vdev_id, bias_str);

	if (ms_flush_data_obj != 0) {
		(void) printf("   ms_unflushed_phys object %llu",
		    (u_longlong_t)ms_flush_data_obj);
	}

	(void) printf("\n\t%-10s%5llu   %-19s   %-15s   %-12s\n",
	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
	    "offset", "spacemap", "free");
	(void) printf("\t%15s   %19s   %15s   %12s\n",
	    "---------------", "-------------------",
	    "---------------", "------------");
}
+
/*
 * Print fragmentation information (-M): per-top-level-vdev metaslab
 * group fragmentation and histogram for the normal allocation class,
 * followed by the pool-wide class fragmentation and histogram.
 */
static void
dump_metaslab_groups(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_class_t *mc = spa_normal_class(spa);
	uint64_t fragmentation;

	metaslab_class_histogram_verify(mc);

	for (unsigned c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/* only report groups belonging to the normal class */
		if (mg == NULL || mg->mg_class != mc)
			continue;

		metaslab_group_histogram_verify(mg);
		mg->mg_fragmentation = metaslab_group_fragmentation(mg);

		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
		    "fragmentation",
		    (u_longlong_t)tvd->vdev_id,
		    (u_longlong_t)tvd->vdev_ms_count);
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			(void) printf("%3s\n", "-");
		} else {
			(void) printf("%3llu%%\n",
			    (u_longlong_t)mg->mg_fragmentation);
		}
		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
	}

	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
	fragmentation = metaslab_class_fragmentation(mc);
	if (fragmentation == ZFS_FRAG_INVALID)
		(void) printf("\t%3s\n", "-");
	else
		(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}
+
/*
 * Print the indirect (removed-device) state of 'vd': the indirect
 * births object, the indirect mapping object, and — at sufficient
 * verbosity — every mapping entry with its obsolete count, plus the
 * obsolete space map if one exists.  No-op for non-indirect vdevs.
 */
static void
print_vdev_indirect(vdev_t *vd)
{
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	vdev_indirect_births_t *vib = vd->vdev_indirect_births;

	if (vim == NULL) {
		ASSERT3P(vib, ==, NULL);
		return;
	}

	/* in-core state must agree with the on-disk config */
	ASSERT3U(vdev_indirect_mapping_object(vim), ==,
	    vic->vic_mapping_object);
	ASSERT3U(vdev_indirect_births_object(vib), ==,
	    vic->vic_births_object);

	(void) printf("indirect births obj %llu:\n",
	    (longlong_t)vic->vic_births_object);
	(void) printf("    vib_count = %llu\n",
	    (longlong_t)vdev_indirect_births_count(vib));
	for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
		vdev_indirect_birth_entry_phys_t *cur_vibe =
		    &vib->vib_entries[i];
		(void) printf("\toffset %llx -> txg %llu\n",
		    (longlong_t)cur_vibe->vibe_offset,
		    (longlong_t)cur_vibe->vibe_phys_birth_txg);
	}
	(void) printf("\n");

	(void) printf("indirect mapping obj %llu:\n",
	    (longlong_t)vic->vic_mapping_object);
	(void) printf("    vim_max_offset = 0x%llx\n",
	    (longlong_t)vdev_indirect_mapping_max_offset(vim));
	(void) printf("    vim_bytes_mapped = 0x%llx\n",
	    (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
	(void) printf("    vim_count = %llu\n",
	    (longlong_t)vdev_indirect_mapping_num_entries(vim));

	if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
		return;

	uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);

	/* print each mapping as <src dva> -> <dst dva> (obsolete count) */
	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
		vdev_indirect_mapping_entry_phys_t *vimep =
		    &vim->vim_entries[i];
		(void) printf("\t<%llx:%llx:%llx> -> "
		    "<%llx:%llx:%llx> (%x obsolete)\n",
		    (longlong_t)vd->vdev_id,
		    (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
		    counts[i]);
	}
	(void) printf("\n");

	uint64_t obsolete_sm_object;
	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (obsolete_sm_object != 0) {
		objset_t *mos = vd->vdev_spa->spa_meta_objset;
		(void) printf("obsolete space map object %llu:\n",
		    (u_longlong_t)obsolete_sm_object);
		ASSERT(vd->vdev_obsolete_sm != NULL);
		ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
		    obsolete_sm_object);
		dump_spacemap(mos, vd->vdev_obsolete_sm);
		(void) printf("\n");
	}
}
+
/*
 * Print metaslabs (-m).  With no -d and explicit "-m vdev [metaslab...]"
 * arguments, only the requested vdev (and optionally only the listed
 * metaslab ids) is shown; otherwise every top-level vdev is dumped,
 * including its indirect state.
 */
static void
dump_metaslabs(spa_t *spa)
{
	vdev_t *vd, *rvd = spa->spa_root_vdev;
	uint64_t m, c = 0, children = rvd->vdev_children;

	(void) printf("\nMetaslabs:\n");

	if (!dump_opt['d'] && zopt_metaslab_args > 0) {
		c = zopt_metaslab[0];	/* first arg selects the vdev */

		if (c >= children)
			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);

		if (zopt_metaslab_args > 1) {
			/* remaining args are specific metaslab ids */
			vd = rvd->vdev_child[c];
			print_vdev_metaslab_header(vd);

			for (m = 1; m < zopt_metaslab_args; m++) {
				if (zopt_metaslab[m] < vd->vdev_ms_count)
					dump_metaslab(
					    vd->vdev_ms[zopt_metaslab[m]]);
				else
					(void) fprintf(stderr, "bad metaslab "
					    "number %llu\n",
					    (u_longlong_t)zopt_metaslab[m]);
			}
			(void) printf("\n");
			return;
		}
		/* only the selected vdev: restrict the loop below to it */
		children = c + 1;
	}
	for (; c < children; c++) {
		vd = rvd->vdev_child[c];
		print_vdev_metaslab_header(vd);

		print_vdev_indirect(vd);

		for (m = 0; m < vd->vdev_ms_count; m++)
			dump_metaslab(vd->vdev_ms[m]);
		(void) printf("\n");
	}
}
+
/*
 * Print every log space map in the pool, in txg order.  No-op unless
 * the LOG_SPACEMAP feature is active.  Each map is opened, dumped via
 * dump_spacemap(), and closed again.
 */
static void
dump_log_spacemaps(spa_t *spa)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return;

	(void) printf("\nLog Space Maps in Pool:\n");
	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
		space_map_t *sm = NULL;
		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));

		(void) printf("Log Spacemap object %llu txg %llu\n",
		    (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg);
		dump_spacemap(spa->spa_meta_objset, sm);
		space_map_close(sm);
	}
	(void) printf("\n");
}
+
/*
 * Print one dedup-table entry: for each phys slot with a nonzero birth
 * txg, reconstruct its block pointer and print the table index,
 * reference count, copies class, and block pointer.
 */
static void
dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
{
	const ddt_phys_t *ddp = dde->dde_phys;
	const ddt_key_t *ddk = &dde->dde_key;
	/* indexed by phys slot: number of DVAs stored for the entry */
	const char *types[4] = { "ditto", "single", "double", "triple" };
	char blkbuf[BP_SPRINTF_LEN];
	blkptr_t blk;
	int p;

	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		/* a zero birth txg means this phys slot is unused */
		if (ddp->ddp_phys_birth == 0)
			continue;
		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
		(void) printf("index %llx refcnt %llu %s %s\n",
		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
		    types[p], blkbuf);
	}
}
+
+static void
+dump_dedup_ratio(const ddt_stat_t *dds)
+{
+ double rL, rP, rD, D, dedup, compress, copies;
+
+ if (dds->dds_blocks == 0)
+ return;
+
+ rL = (double)dds->dds_ref_lsize;
+ rP = (double)dds->dds_ref_psize;
+ rD = (double)dds->dds_ref_dsize;
+ D = (double)dds->dds_dsize;
+
+ dedup = rD / D;
+ compress = rL / rP;
+ copies = rD / rP;
+
+ (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+ "dedup * compress / copies = %.2f\n\n",
+ dedup, compress, copies, dedup * compress / copies);
+}
+
/*
 * Print one on-disk DDT object (per checksum/type/class): a summary
 * line with entry count and per-entry disk/core sizes, then — as the
 * -D verbosity increases — the histogram and finally each entry.
 * Silently returns if the object does not exist or is empty.
 */
static void
dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
	char name[DDT_NAMELEN];
	ddt_entry_t dde;
	uint64_t walk = 0;
	dmu_object_info_t doi;
	uint64_t count, dspace, mspace;
	int error;

	error = ddt_object_info(ddt, type, class, &doi);

	if (error == ENOENT)
		return;
	ASSERT(error == 0);

	error = ddt_object_count(ddt, type, class, &count);
	ASSERT(error == 0);
	if (count == 0)
		return;

	/* on-disk bytes and estimated in-core bytes for the object */
	dspace = doi.doi_physical_blocks_512 << 9;
	mspace = doi.doi_fill_count * doi.doi_data_block_size;

	ddt_object_name(ddt, type, class, name);

	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
	    name,
	    (u_longlong_t)count,
	    (u_longlong_t)(dspace / count),
	    (u_longlong_t)(mspace / count));

	if (dump_opt['D'] < 3)
		return;

	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);

	if (dump_opt['D'] < 4)
		return;

	/* unique entries are only itemized at -DDDDD */
	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
		return;

	(void) printf("%s contents:\n\n", name);

	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
		dump_dde(ddt, &dde, walk);

	ASSERT3U(error, ==, ENOENT);

	(void) printf("\n");
}
+
/*
 * Print every DDT in the pool (-D): each checksum/type/class object via
 * dump_ddt(), then the aggregated histogram (at -DD) and the overall
 * dedup ratio.  Prints a notice and returns if all DDTs are empty.
 */
static void
dump_all_ddts(spa_t *spa)
{
	ddt_histogram_t ddh_total;
	ddt_stat_t dds_total;

	bzero(&ddh_total, sizeof (ddh_total));
	bzero(&dds_total, sizeof (dds_total));

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = 0; class < DDT_CLASSES;
			    class++) {
				dump_ddt(ddt, type, class);
			}
		}
	}

	ddt_get_dedup_stats(spa, &dds_total);

	if (dds_total.dds_blocks == 0) {
		(void) printf("All DDTs are empty\n");
		return;
	}

	(void) printf("\n");

	if (dump_opt['D'] > 1) {
		(void) printf("DDT histogram (aggregated over all DDTs):\n");
		ddt_get_dedup_histogram(spa, &ddh_total);
		zpool_dump_ddt(&dds_total, &ddh_total);
	}

	dump_dedup_ratio(&dds_total);
}
+
+static void
+dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
+{
+ char *prefix = arg;
+
+ (void) printf("%s [%llu,%llu) length %llu\n",
+ prefix,
+ (u_longlong_t)start,
+ (u_longlong_t)(start + size),
+ (u_longlong_t)(size));
+}
+
/*
 * Recursively print the dirty time logs for 'vd' and its children,
 * indenting by tree depth.  Whether the vdev's DTL is required for
 * pool health is determined under a briefly-held vdev state lock.
 */
static void
dump_dtl(vdev_t *vd, int indent)
{
	spa_t *spa = vd->vdev_spa;
	boolean_t required;
	/* indexed by DTL type */
	const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
	    "outage" };
	char prefix[256];

	/* vdev_dtl_required() must be called with the state lock held */
	spa_vdev_state_enter(spa, SCL_NONE);
	required = vdev_dtl_required(vd);
	(void) spa_vdev_state_exit(spa, NULL, 0);

	if (indent == 0)
		(void) printf("\nDirty time logs:\n\n");

	(void) printf("\t%*s%s [%s]\n", indent, "",
	    vd->vdev_path ? vd->vdev_path :
	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
	    required ? "DTL-required" : "DTL-expendable");

	for (int t = 0; t < DTL_TYPES; t++) {
		range_tree_t *rt = vd->vdev_dtl[t];
		if (range_tree_space(rt) == 0)
			continue;
		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
		    indent + 2, "", name[t]);
		range_tree_walk(rt, dump_dtl_seg, prefix);
		/* at -dddddd, also show the raw on-disk DTL of leaves */
		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
			dump_spacemap(spa->spa_meta_objset,
			    vd->vdev_dtl_sm);
	}

	for (unsigned c = 0; c < vd->vdev_children; c++)
		dump_dtl(vd->vdev_child[c], indent + 4);
}
+
/*
 * Print the pool history (-h): read the history stream in
 * SPA_OLD_MAXBLOCKSIZE chunks, unpack it into nvlists, and print each
 * record as "timestamp command".  Internal events are synthesized into
 * a "[internal <event> txg:N] <string>" line.  At -hh, unrecognized
 * records are dumped raw as nvlists.
 * NOTE(review): the unpacked 'events' nvlists appear never to be freed
 * before return — acceptable for a one-shot CLI, but verify upstream.
 */
static void
dump_history(spa_t *spa)
{
	nvlist_t **events = NULL;
	char *buf;
	uint64_t resid, len, off = 0;
	uint_t num = 0;
	int error;
	time_t tsec;
	struct tm t;
	char tbuf[30];
	char internalstr[MAXPATHLEN];

	if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
		(void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
		    __func__);
		return;
	}

	/* pull the whole history stream, chunk by chunk */
	do {
		len = SPA_OLD_MAXBLOCKSIZE;

		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
			(void) fprintf(stderr, "Unable to read history: "
			    "error %d\n", error);
			free(buf);
			return;
		}

		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
			break;

		/* rewind past any partially-consumed record */
		off -= resid;
	} while (len != 0);

	(void) printf("\nHistory:\n");
	for (unsigned i = 0; i < num; i++) {
		uint64_t time, txg, ievent;
		char *cmd, *intstr;
		boolean_t printed = B_FALSE;

		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
		    &time) != 0)
			goto next;
		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
		    &cmd) != 0) {
			/* no command string: try an internal event record */
			if (nvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
				goto next;
			verify(nvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TXG, &txg) == 0);
			verify(nvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_STR, &intstr) == 0);
			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
				goto next;

			(void) snprintf(internalstr,
			    sizeof (internalstr),
			    "[internal %s txg:%lld] %s",
			    zfs_history_event_names[ievent],
			    (longlong_t)txg, intstr);
			cmd = internalstr;
		}
		tsec = time;
		(void) localtime_r(&tsec, &t);
		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
		(void) printf("%s %s\n", tbuf, cmd);
		printed = B_TRUE;

next:
		if (dump_opt['h'] > 1) {
			if (!printed)
				(void) printf("unrecognized record:\n");
			dump_nvlist(events[i], 2);
		}
	}
	free(buf);
}
+
+/*ARGSUSED*/
+/* Intentionally empty: no type-specific detail is printed for dnodes. */
+static void
+dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Translate a bookmark's (level, blkid) into a byte offset within the
+ * object.  With no dnode (zb_level < 0 is asserted), the blkid appears
+ * to be either an object number (object 0) or already block-scaled.
+ */
+static uint64_t
+blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
+ const zbookmark_phys_t *zb)
+{
+ if (dnp == NULL) {
+ ASSERT(zb->zb_level < 0);
+ if (zb->zb_object == 0)
+ return (zb->zb_blkid);
+ return (zb->zb_blkid * BP_GET_LSIZE(bp));
+ }
+
+ ASSERT(zb->zb_level >= 0);
+
+ /* Scale blkid up through the indirect levels, then to bytes. */
+ return ((zb->zb_blkid <<
+ (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
+ dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+}
+
+/*
+ * Append the zstd compression header (size/version/level) of bp to
+ * blkbuf.  Non-zstd, hole, and unreadable blocks are skipped.  For
+ * non-embedded blocks the raw (still-compressed) data is read so the
+ * header can be inspected.
+ */
+static void
+snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
+ const blkptr_t *bp)
+{
+ abd_t *pabd;
+ void *buf;
+ zio_t *zio;
+ zfs_zstdhdr_t zstd_hdr;
+ int error;
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD)
+ return;
+
+ if (BP_IS_HOLE(bp))
+ return;
+
+ if (BP_IS_EMBEDDED(bp)) {
+ buf = malloc(SPA_MAXBLOCKSIZE);
+ if (buf == NULL) {
+ (void) fprintf(stderr, "out of memory\n");
+ exit(1);
+ }
+ decode_embedded_bp_compressed(bp, buf);
+ memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
+ free(buf);
+ /* Header fields are stored big-endian on disk. */
+ zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
+ zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf),
+ " ZSTD:size=%u:version=%u:level=%u:EMBEDDED",
+ zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level);
+ return;
+ }
+
+ pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
+ zio = zio_root(spa, NULL, NULL, 0);
+
+ /* Decrypt but don't decompress so we can read the compression header */
+ zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS,
+ NULL));
+ error = zio_wait(zio);
+ if (error) {
+ (void) fprintf(stderr, "read failed: %d\n", error);
+ /* Fix: pabd was previously leaked on this error path. */
+ abd_free(pabd);
+ return;
+ }
+ buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp));
+ memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
+ zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
+ zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
+
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf),
+ " ZSTD:size=%u:version=%u:level=%u:NORMAL",
+ zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level);
+
+ abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));
+ /* Fix: release the I/O ABD (previously leaked on every call). */
+ abd_free(pabd);
+}
+
+/*
+ * Format a compact one-line description of bp into blkbuf (at most
+ * buflen bytes).  Verbosity of the 'b' and 'd' options controls how
+ * much detail (full blkptr vs. number of DVAs) is included; bp_freed
+ * appends a FREE marker.
+ */
+static void
+snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
+ boolean_t bp_freed)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
+ int i;
+
+ if (dump_opt['b'] >= 6) {
+ snprintf_blkptr(blkbuf, buflen, bp);
+ if (bp_freed) {
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf), " %s", "FREE");
+ }
+ return;
+ }
+
+ if (BP_IS_EMBEDDED(bp)) {
+ /* Fix: bound the write (was an unbounded sprintf). */
+ (void) snprintf(blkbuf, buflen,
+ "EMBEDDED et=%u %llxL/%llxP B=%llu",
+ (int)BPE_GET_ETYPE(bp),
+ (u_longlong_t)BPE_GET_LSIZE(bp),
+ (u_longlong_t)BPE_GET_PSIZE(bp),
+ (u_longlong_t)bp->blk_birth);
+ return;
+ }
+
+ blkbuf[0] = '\0';
+
+ for (i = 0; i < ndvas; i++)
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf), "%llu:%llx:%llx ",
+ (u_longlong_t)DVA_GET_VDEV(&dva[i]),
+ (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
+ (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
+
+ if (BP_IS_HOLE(bp)) {
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf),
+ "%llxL B=%llu",
+ (u_longlong_t)BP_GET_LSIZE(bp),
+ (u_longlong_t)bp->blk_birth);
+ } else {
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf),
+ "%llxL/%llxP F=%llu B=%llu/%llu",
+ (u_longlong_t)BP_GET_LSIZE(bp),
+ (u_longlong_t)BP_GET_PSIZE(bp),
+ (u_longlong_t)BP_GET_FILL(bp),
+ (u_longlong_t)bp->blk_birth,
+ (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
+ if (bp_freed)
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf), " %s", "FREE");
+ (void) snprintf(blkbuf + strlen(blkbuf),
+ buflen - strlen(blkbuf), " cksum=%llx:%llx:%llx:%llx",
+ (u_longlong_t)bp->blk_cksum.zc_word[0],
+ (u_longlong_t)bp->blk_cksum.zc_word[1],
+ (u_longlong_t)bp->blk_cksum.zc_word[2],
+ (u_longlong_t)bp->blk_cksum.zc_word[3]);
+ }
+}
+
+/*
+ * Print one line describing bp at its bookmark position: the byte
+ * offset, an L<n> column marking the indirection level, and the
+ * compact blkptr description (plus zstd header with -Z).
+ */
+static void
+print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,
+ const dnode_phys_t *dnp)
+{
+ char blkbuf[BP_SPRINTF_LEN];
+ int l;
+
+ if (!BP_IS_EMBEDDED(bp)) {
+ ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
+ ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+ }
+
+ (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
+
+ ASSERT(zb->zb_level >= 0);
+
+ /* One column per level; mark only the level this bp lives at. */
+ for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
+ if (l == zb->zb_level) {
+ (void) printf("L%llx", (u_longlong_t)zb->zb_level);
+ } else {
+ (void) printf(" ");
+ }
+ }
+
+ snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);
+ if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD)
+ snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp);
+ (void) printf("%s\n", blkbuf);
+}
+
+/*
+ * Print bp and recursively descend into its children via the ARC.
+ * While descending, accumulate the children's fill counts and verify
+ * they match the parent's.  Returns 0 or the first arc_read/recursion
+ * error encountered.
+ */
+static int
+visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
+ blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+ int err = 0;
+
+ /* Birth 0 means a hole; nothing to print or descend into. */
+ if (bp->blk_birth == 0)
+ return (0);
+
+ print_indirect(spa, bp, zb, dnp);
+
+ if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ arc_buf_t *buf;
+ uint64_t fill = 0;
+ ASSERT(!BP_IS_REDACTED(bp));
+
+ err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err)
+ return (err);
+ ASSERT(buf->b_data);
+
+ /* recursively visit blocks below this */
+ cbp = buf->b_data;
+ for (i = 0; i < epb; i++, cbp++) {
+ zbookmark_phys_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ err = visit_indirect(spa, dnp, cbp, &czb);
+ if (err)
+ break;
+ fill += BP_GET_FILL(cbp);
+ }
+ if (!err)
+ ASSERT3U(fill, ==, BP_GET_FILL(bp));
+ arc_buf_destroy(buf, &buf);
+ }
+
+ return (err);
+}
+
+/*ARGSUSED*/
+/*
+ * Print the full indirect-block tree of a dnode: walk each top-level
+ * block pointer at the dnode's highest level and recurse via
+ * visit_indirect().  Per-bp errors are ignored so the dump continues.
+ */
+static void
+dump_indirect(dnode_t *dn)
+{
+ dnode_phys_t *dnp = dn->dn_phys;
+ int j;
+ zbookmark_phys_t czb;
+
+ (void) printf("Indirect blocks:\n");
+
+ SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
+ dn->dn_object, dnp->dn_nlevels - 1, 0);
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ czb.zb_blkid = j;
+ (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
+ &dnp->dn_blkptr[j], &czb);
+ }
+
+ (void) printf("\n");
+}
+
+/*ARGSUSED*/
+/*
+ * Object-viewer callback: print every field of a DSL directory's
+ * on-disk phys structure, with byte counts humanized via zdb_nicenum.
+ */
+static void
+dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ dsl_dir_phys_t *dd = data;
+ time_t crtime;
+ char nice[32];
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ);
+
+ if (dd == NULL)
+ return;
+
+ ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
+
+ crtime = dd->dd_creation_time;
+ (void) printf("\t\tcreation_time = %s", ctime(&crtime));
+ (void) printf("\t\thead_dataset_obj = %llu\n",
+ (u_longlong_t)dd->dd_head_dataset_obj);
+ (void) printf("\t\tparent_dir_obj = %llu\n",
+ (u_longlong_t)dd->dd_parent_obj);
+ (void) printf("\t\torigin_obj = %llu\n",
+ (u_longlong_t)dd->dd_origin_obj);
+ (void) printf("\t\tchild_dir_zapobj = %llu\n",
+ (u_longlong_t)dd->dd_child_dir_zapobj);
+ zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
+ (void) printf("\t\tused_bytes = %s\n", nice);
+ zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
+ (void) printf("\t\tcompressed_bytes = %s\n", nice);
+ zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
+ (void) printf("\t\tuncompressed_bytes = %s\n", nice);
+ zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
+ (void) printf("\t\tquota = %s\n", nice);
+ zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
+ (void) printf("\t\treserved = %s\n", nice);
+ (void) printf("\t\tprops_zapobj = %llu\n",
+ (u_longlong_t)dd->dd_props_zapobj);
+ (void) printf("\t\tdeleg_zapobj = %llu\n",
+ (u_longlong_t)dd->dd_deleg_zapobj);
+ (void) printf("\t\tflags = %llx\n",
+ (u_longlong_t)dd->dd_flags);
+
+/* Helper macro: print one humanized used_breakdown[] slot. */
+#define DO(which) \
+ zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
+ sizeof (nice)); \
+ (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
+ DO(HEAD);
+ DO(SNAP);
+ DO(CHILD);
+ DO(CHILD_RSRV);
+ DO(REFRSRV);
+#undef DO
+ (void) printf("\t\tclones = %llu\n",
+ (u_longlong_t)dd->dd_clones);
+}
+
+/*ARGSUSED*/
+/*
+ * Object-viewer callback: print every field of a DSL dataset's
+ * on-disk phys structure, including its root block pointer.
+ */
+static void
+dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ dsl_dataset_phys_t *ds = data;
+ time_t crtime;
+ char used[32], compressed[32], uncompressed[32], unique[32];
+ char blkbuf[BP_SPRINTF_LEN];
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (used) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ);
+
+ if (ds == NULL)
+ return;
+
+ ASSERT(size == sizeof (*ds));
+ crtime = ds->ds_creation_time;
+ zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
+ zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
+ zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
+ sizeof (uncompressed));
+ zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
+ snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
+
+ (void) printf("\t\tdir_obj = %llu\n",
+ (u_longlong_t)ds->ds_dir_obj);
+ (void) printf("\t\tprev_snap_obj = %llu\n",
+ (u_longlong_t)ds->ds_prev_snap_obj);
+ (void) printf("\t\tprev_snap_txg = %llu\n",
+ (u_longlong_t)ds->ds_prev_snap_txg);
+ (void) printf("\t\tnext_snap_obj = %llu\n",
+ (u_longlong_t)ds->ds_next_snap_obj);
+ (void) printf("\t\tsnapnames_zapobj = %llu\n",
+ (u_longlong_t)ds->ds_snapnames_zapobj);
+ (void) printf("\t\tnum_children = %llu\n",
+ (u_longlong_t)ds->ds_num_children);
+ (void) printf("\t\tuserrefs_obj = %llu\n",
+ (u_longlong_t)ds->ds_userrefs_obj);
+ (void) printf("\t\tcreation_time = %s", ctime(&crtime));
+ (void) printf("\t\tcreation_txg = %llu\n",
+ (u_longlong_t)ds->ds_creation_txg);
+ (void) printf("\t\tdeadlist_obj = %llu\n",
+ (u_longlong_t)ds->ds_deadlist_obj);
+ (void) printf("\t\tused_bytes = %s\n", used);
+ (void) printf("\t\tcompressed_bytes = %s\n", compressed);
+ (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
+ (void) printf("\t\tunique = %s\n", unique);
+ (void) printf("\t\tfsid_guid = %llu\n",
+ (u_longlong_t)ds->ds_fsid_guid);
+ (void) printf("\t\tguid = %llu\n",
+ (u_longlong_t)ds->ds_guid);
+ (void) printf("\t\tflags = %llx\n",
+ (u_longlong_t)ds->ds_flags);
+ (void) printf("\t\tnext_clones_obj = %llu\n",
+ (u_longlong_t)ds->ds_next_clones_obj);
+ (void) printf("\t\tprops_obj = %llu\n",
+ (u_longlong_t)ds->ds_props_obj);
+ (void) printf("\t\tbp = %s\n", blkbuf);
+}
+
+/* ARGSUSED */
+/*
+ * bptree_iterate() callback: print each non-hole block pointer on its
+ * own indented line.  Always returns 0 so iteration continues.
+ */
+static int
+dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ char buf[BP_SPRINTF_LEN];
+
+ if (bp->blk_birth == 0)
+ return (0);
+
+ snprintf_blkptr(buf, sizeof (buf), bp);
+ (void) printf("\t%s\n", buf);
+ return (0);
+}
+
+/*
+ * Print a bptree summary (dataset count and humanized byte total) at
+ * -ddd, and every block pointer it contains at -ddddd.
+ */
+static void
+dump_bptree(objset_t *os, uint64_t obj, const char *name)
+{
+ char bytes[32];
+ bptree_phys_t *bt;
+ dmu_buf_t *db;
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
+
+ if (dump_opt['d'] < 3)
+ return;
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+ zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
+ (void) printf("\n    %s: %llu datasets, %s\n",
+ name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
+ dmu_buf_rele(db, FTAG);
+
+ if (dump_opt['d'] < 5)
+ return;
+
+ (void) printf("\n");
+
+ (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
+}
+
+/* ARGSUSED */
+/*
+ * bpobj iteration callback: print one block pointer in compact form,
+ * tagging freed entries.  bpobjs never contain holes, hence the assert.
+ */
+static int
+dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
+{
+ char blkbuf[BP_SPRINTF_LEN];
+
+ ASSERT(bp->blk_birth != 0);
+ snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
+ (void) printf("\t%s\n", blkbuf);
+ return (0);
+}
+
+/*
+ * Print a bpobj summary (at -ddd) and, for bpobjs with sub-objects,
+ * recurse into each sub-bpobj at increasing indent.  At -ddddd the
+ * top-level call also prints every non-free block pointer.
+ */
+static void
+dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
+{
+ char bytes[32];
+ char comp[32];
+ char uncomp[32];
+ uint64_t i;
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
+
+ if (dump_opt['d'] < 3)
+ return;
+
+ zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
+ if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+ /* comp/uncomp totals only exist for subobj-capable bpobjs. */
+ zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
+ zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
+ if (bpo->bpo_havefreed) {
+ (void) printf("    %*s: object %llu, %llu local "
+ "blkptrs, %llu freed, %llu subobjs in object %llu, "
+ "%s (%s/%s comp)\n",
+ indent * 8, name,
+ (u_longlong_t)bpo->bpo_object,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
+ (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
+ bytes, comp, uncomp);
+ } else {
+ (void) printf("    %*s: object %llu, %llu local "
+ "blkptrs, %llu subobjs in object %llu, "
+ "%s (%s/%s comp)\n",
+ indent * 8, name,
+ (u_longlong_t)bpo->bpo_object,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
+ (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
+ bytes, comp, uncomp);
+ }
+
+ /* Recurse into every sub-bpobj listed in bpo_subobjs. */
+ for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
+ uint64_t subobj;
+ bpobj_t subbpo;
+ int error;
+ VERIFY0(dmu_read(bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs,
+ i * sizeof (subobj), sizeof (subobj), &subobj, 0));
+ error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
+ if (error != 0) {
+ (void) printf("ERROR %u while trying to open "
+ "subobj id %llu\n",
+ error, (u_longlong_t)subobj);
+ continue;
+ }
+ dump_full_bpobj(&subbpo, "subobj", indent + 1);
+ bpobj_close(&subbpo);
+ }
+ } else {
+ if (bpo->bpo_havefreed) {
+ (void) printf("    %*s: object %llu, %llu blkptrs, "
+ "%llu freed, %s\n",
+ indent * 8, name,
+ (u_longlong_t)bpo->bpo_object,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
+ bytes);
+ } else {
+ (void) printf("    %*s: object %llu, %llu blkptrs, "
+ "%s\n",
+ indent * 8, name,
+ (u_longlong_t)bpo->bpo_object,
+ (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+ bytes);
+ }
+ }
+
+ if (dump_opt['d'] < 5)
+ return;
+
+
+ /* Only the top-level call walks the blkptrs, to avoid duplicates. */
+ if (indent == 0) {
+ (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
+ (void) printf("\n");
+ }
+}
+
+/*
+ * Print one bookmark by name.  With print_redact, also summarize its
+ * redaction list; with print_list (which implies print_redact), print
+ * every redaction entry.  Returns 0 on success or an errno-style
+ * error from the lookup/read.
+ */
+static int
+dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,
+ boolean_t print_list)
+{
+ int err = 0;
+ zfs_bookmark_phys_t prop;
+ objset_t *mos = dp->dp_spa->spa_meta_objset;
+ err = dsl_bookmark_lookup(dp, name, NULL, &prop);
+
+ if (err != 0) {
+ return (err);
+ }
+
+ (void) printf("\t#%s: ", strchr(name, '#') + 1);
+ (void) printf("{guid: %llx creation_txg: %llu creation_time: "
+ "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid,
+ (u_longlong_t)prop.zbm_creation_txg,
+ (u_longlong_t)prop.zbm_creation_time,
+ (u_longlong_t)prop.zbm_redaction_obj);
+
+ IMPLY(print_list, print_redact);
+ if (!print_redact || prop.zbm_redaction_obj == 0)
+ return (0);
+
+ redaction_list_t *rl;
+ VERIFY0(dsl_redaction_list_hold_obj(dp,
+ prop.zbm_redaction_obj, FTAG, &rl));
+
+ redaction_list_phys_t *rlp = rl->rl_phys;
+ (void) printf("\tRedacted:\n\t\tProgress: ");
+ /* UINT64_MAX in both cursors marks a completed redaction send. */
+ if (rlp->rlp_last_object != UINT64_MAX ||
+ rlp->rlp_last_blkid != UINT64_MAX) {
+ (void) printf("%llu %llu (incomplete)\n",
+ (u_longlong_t)rlp->rlp_last_object,
+ (u_longlong_t)rlp->rlp_last_blkid);
+ } else {
+ (void) printf("complete\n");
+ }
+ (void) printf("\t\tSnapshots: [");
+ for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) {
+ if (i > 0)
+ (void) printf(", ");
+ (void) printf("%0llu",
+ (u_longlong_t)rlp->rlp_snaps[i]);
+ }
+ (void) printf("]\n\t\tLength: %llu\n",
+ (u_longlong_t)rlp->rlp_num_entries);
+
+ if (!print_list) {
+ dsl_redaction_list_rele(rl, FTAG);
+ return (0);
+ }
+
+ if (rlp->rlp_num_entries == 0) {
+ dsl_redaction_list_rele(rl, FTAG);
+ (void) printf("\t\tRedaction List: []\n\n");
+ return (0);
+ }
+
+ /* Read the whole redaction object into one buffer, then print it. */
+ redact_block_phys_t *rbp_buf;
+ uint64_t size;
+ dmu_object_info_t doi;
+
+ VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi));
+ size = doi.doi_max_offset;
+ rbp_buf = kmem_alloc(size, KM_SLEEP);
+
+ err = dmu_read(mos, prop.zbm_redaction_obj, 0, size,
+ rbp_buf, 0);
+ if (err != 0) {
+ dsl_redaction_list_rele(rl, FTAG);
+ kmem_free(rbp_buf, size);
+ return (err);
+ }
+
+ (void) printf("\t\tRedaction List: [{object: %llx, offset: "
+ "%llx, blksz: %x, count: %llx}",
+ (u_longlong_t)rbp_buf[0].rbp_object,
+ (u_longlong_t)rbp_buf[0].rbp_blkid,
+ (uint_t)(redact_block_get_size(&rbp_buf[0])),
+ (u_longlong_t)redact_block_get_count(&rbp_buf[0]));
+
+ for (size_t i = 1; i < rlp->rlp_num_entries; i++) {
+ (void) printf(",\n\t\t{object: %llx, offset: %llx, "
+ "blksz: %x, count: %llx}",
+ (u_longlong_t)rbp_buf[i].rbp_object,
+ (u_longlong_t)rbp_buf[i].rbp_blkid,
+ (uint_t)(redact_block_get_size(&rbp_buf[i])),
+ (u_longlong_t)redact_block_get_count(&rbp_buf[i]));
+ }
+ dsl_redaction_list_rele(rl, FTAG);
+ kmem_free(rbp_buf, size);
+ (void) printf("]\n\n");
+ return (0);
+}
+
+/*
+ * Print every bookmark of the given objset (at verbosity >= 4) by
+ * walking its bookmarks ZAP; verbosity 5/6 add redaction summaries
+ * and full redaction lists respectively.
+ */
+static void
+dump_bookmarks(objset_t *os, int verbosity)
+{
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+ dsl_dataset_t *ds = dmu_objset_ds(os);
+ dsl_pool_t *dp = spa_get_dsl(os->os_spa);
+ objset_t *mos = os->os_spa->spa_meta_objset;
+ if (verbosity < 4)
+ return;
+ dsl_pool_config_enter(dp, FTAG);
+
+ for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
+ zap_cursor_retrieve(&zc, &attr) == 0;
+ zap_cursor_advance(&zc)) {
+ char osname[ZFS_MAX_DATASET_NAME_LEN];
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ dmu_objset_name(os, osname);
+ /* Bookmark names are "<dataset>#<bookmark>". */
+ VERIFY3S(0, <=, snprintf(buf, sizeof (buf), "%s#%s", osname,
+ attr.za_name));
+ (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6);
+ }
+ zap_cursor_fini(&zc);
+ dsl_pool_config_exit(dp, FTAG);
+}
+
+/*
+ * Mark a bpobj (and, recursively, all of its sub-bpobjs) as referenced
+ * in the MOS object accounting via mos_obj_refd().
+ */
+static void
+bpobj_count_refd(bpobj_t *bpo)
+{
+ mos_obj_refd(bpo->bpo_object);
+
+ if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+ mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
+ for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
+ uint64_t subobj;
+ bpobj_t subbpo;
+ int error;
+ VERIFY0(dmu_read(bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs,
+ i * sizeof (subobj), sizeof (subobj), &subobj, 0));
+ error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
+ if (error != 0) {
+ (void) printf("ERROR %u while trying to open "
+ "subobj id %llu\n",
+ error, (u_longlong_t)subobj);
+ continue;
+ }
+ bpobj_count_refd(&subbpo);
+ bpobj_close(&subbpo);
+ }
+ }
+}
+
+/*
+ * Deadlist iteration callback: count the entry's bpobj as referenced,
+ * skipping the pool-wide shared empty bpobj.  Always returns 0.
+ */
+static int
+dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
+{
+ spa_t *spa = arg;
+
+ if (dle->dle_bpobj.bpo_object !=
+ spa->spa_dsl_pool->dp_empty_bpobj)
+ bpobj_count_refd(&dle->dle_bpobj);
+ return (0);
+}
+
+/*
+ * Deadlist iteration callback: print one "mintxg -> obj" entry; at
+ * -ddddd dump the entry's whole bpobj as well.  Always returns 0 so
+ * iteration continues.
+ */
+static int
+dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
+{
+ ASSERT(arg == NULL);
+ if (dump_opt['d'] >= 5) {
+ char buf[128];
+ /*
+ * Fix: cast to u_longlong_t to match the unsigned %llu
+ * conversions (the signed longlong_t casts mismatched).
+ */
+ (void) snprintf(buf, sizeof (buf),
+ "mintxg %llu -> obj %llu",
+ (u_longlong_t)dle->dle_mintxg,
+ (u_longlong_t)dle->dle_bpobj.bpo_object);
+
+ dump_full_bpobj(&dle->dle_bpobj, buf, 0);
+ } else {
+ (void) printf("mintxg %llu -> obj %llu\n",
+ (u_longlong_t)dle->dle_mintxg,
+ (u_longlong_t)dle->dle_bpobj.bpo_object);
+ }
+ return (0);
+}
+
+/*
+ * Print a deadlist/livelist: always record its objects as referenced
+ * for MOS leak accounting, then (at -ddd) print a summary line and
+ * (at -dddd) every entry.
+ */
+static void
+dump_blkptr_list(dsl_deadlist_t *dl, char *name)
+{
+ char bytes[32];
+ char comp[32];
+ char uncomp[32];
+ char entries[32];
+ spa_t *spa = dmu_objset_spa(dl->dl_os);
+ uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
+
+ /* Reference accounting happens even below the print threshold. */
+ if (dl->dl_oldfmt) {
+ if (dl->dl_bpobj.bpo_object != empty_bpobj)
+ bpobj_count_refd(&dl->dl_bpobj);
+ } else {
+ mos_obj_refd(dl->dl_object);
+ dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
+ }
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
+ CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ);
+
+ if (dump_opt['d'] < 3)
+ return;
+
+ /* Old-format deadlists are a single flat bpobj. */
+ if (dl->dl_oldfmt) {
+ dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
+ return;
+ }
+
+ zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
+ zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
+ zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
+ zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
+ (void) printf("\n    %s: %s (%s/%s comp), %s entries\n",
+ name, bytes, comp, uncomp, entries);
+
+ if (dump_opt['d'] < 4)
+ return;
+
+ (void) printf("\n");
+
+ dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
+}
+
+/*
+ * Cross-check a dsl_dir's livelist space accounting against the space
+ * written since its origin snapshot.  Returns 0 when consistent (or no
+ * livelist is open), 1 when a discrepancy is printed.
+ */
+static int
+verify_dd_livelist(objset_t *os)
+{
+ uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
+ dsl_pool_t *dp = spa_get_dsl(os->os_spa);
+ dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
+
+ ASSERT(!dmu_objset_is_snapshot(os));
+ if (!dsl_deadlist_is_open(&dd->dd_livelist))
+ return (0);
+
+ /* Iterate through the livelist to check for duplicates */
+ dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
+ NULL);
+
+ dsl_pool_config_enter(dp, FTAG);
+ dsl_deadlist_space(&dd->dd_livelist, &ll_used,
+ &ll_comp, &ll_uncomp);
+
+ dsl_dataset_t *origin_ds;
+ ASSERT(dsl_pool_config_held(dp));
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
+ VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
+ &used, &comp, &uncomp));
+ dsl_dataset_rele(origin_ds, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ /*
+ * It's possible that the dataset's uncomp space is larger than the
+ * livelist's because livelists do not track embedded block pointers
+ */
+ if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) {
+ char nice_used[32], nice_comp[32], nice_uncomp[32];
+ (void) printf("Discrepancy in space accounting:\n");
+ zdb_nicenum(used, nice_used, sizeof (nice_used));
+ zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
+ zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
+ (void) printf("dir: used %s, comp %s, uncomp %s\n",
+ nice_used, nice_comp, nice_uncomp);
+ zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
+ zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
+ zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
+ (void) printf("livelist: used %s, comp %s, uncomp %s\n",
+ nice_used, nice_comp, nice_uncomp);
+ return (1);
+ }
+ return (0);
+}
+
+/* FUID domain/index tables, loaded lazily by dump_uidgid(). */
+static avl_tree_t idx_tree;
+static avl_tree_t domain_tree;
+static boolean_t fuid_table_loaded;
+/* The single objset currently set up for system-attribute access. */
+static objset_t *sa_os = NULL;
+static sa_attr_type_t *sa_attr_table = NULL;
+
+/*
+ * Hold the objset named by path and, for unencrypted ZPL datasets, set
+ * up system-attribute (SA) access.  On success *osp is set and sa_os
+ * records the open objset; on failure *osp is NULL and an errno-style
+ * error is returned.
+ */
+static int
+open_objset(const char *path, void *tag, objset_t **osp)
+{
+ int err;
+ uint64_t sa_attrs = 0;
+ uint64_t version = 0;
+
+ VERIFY3P(sa_os, ==, NULL);
+ /*
+ * We can't own an objset if it's redacted. Therefore, we do this
+ * dance: hold the objset, then acquire a long hold on its dataset, then
+ * release the pool (which is held as part of holding the objset).
+ */
+ err = dmu_objset_hold(path, tag, osp);
+ if (err != 0) {
+ (void) fprintf(stderr, "failed to hold dataset '%s': %s\n",
+ path, strerror(err));
+ return (err);
+ }
+ dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
+ dsl_pool_rele(dmu_objset_pool(*osp), tag);
+
+ if (dmu_objset_type(*osp) == DMU_OST_ZFS && !(*osp)->os_encrypted) {
+ (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ 8, 1, &version);
+ if (version >= ZPL_VERSION_SA) {
+ (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
+ 8, 1, &sa_attrs);
+ }
+ err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
+ &sa_attr_table);
+ if (err != 0) {
+ (void) fprintf(stderr, "sa_setup failed: %s\n",
+ strerror(err));
+ dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
+ dsl_dataset_rele(dmu_objset_ds(*osp), tag);
+ *osp = NULL;
+ }
+ }
+ sa_os = *osp;
+
+ /*
+ * Fix: propagate the sa_setup() error.  Previously this returned
+ * 0 unconditionally, handing callers a NULL *osp with a success
+ * status.
+ */
+ return (err);
+}
+
+/*
+ * Undo open_objset(): tear down SA state, release the dataset holds,
+ * and clear the module-level sa_os/sa_attr_table bookkeeping.
+ */
+static void
+close_objset(objset_t *os, void *tag)
+{
+ VERIFY3P(os, ==, sa_os);
+ if (os->os_sa != NULL)
+ sa_tear_down(os);
+ dsl_dataset_long_rele(dmu_objset_ds(os), tag);
+ dsl_dataset_rele(dmu_objset_ds(os), tag);
+ sa_attr_table = NULL;
+ sa_os = NULL;
+}
+
+/*
+ * Free the lazily-loaded FUID domain tables, if they were ever loaded.
+ */
+static void
+fuid_table_destroy(void)
+{
+ if (!fuid_table_loaded)
+ return;
+
+ zfs_fuid_table_destroy(&idx_tree, &domain_tree);
+ fuid_table_loaded = B_FALSE;
+}
+
+/*
+ * Print a uid or gid.  A plain POSIX id (FUID index 0) prints in
+ * decimal; a CIFS FUID prints in hex followed by its domain-rid pair.
+ */
+static void
+print_idstr(uint64_t id, const char *id_type)
+{
+ if (FUID_INDEX(id) == 0) {
+ (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id);
+ return;
+ }
+
+ char *domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
+ (void) printf("\t%s %llx [%s-%d]\n", id_type,
+ (u_longlong_t)id, domain, (int)FUID_RID(id));
+}
+
+/*
+ * Print a znode's uid and gid, lazily loading the FUID domain tables
+ * from the master node the first time a FUID-indexed id is seen.
+ */
+static void
+dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
+{
+ uint32_t uid_idx, gid_idx;
+
+ uid_idx = FUID_INDEX(uid);
+ gid_idx = FUID_INDEX(gid);
+
+ /* Load domain table, if not already loaded */
+ if (!fuid_table_loaded && (uid_idx || gid_idx)) {
+ uint64_t fuid_obj;
+
+ /* first find the fuid object.  It lives in the master node */
+ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
+ 8, 1, &fuid_obj) == 0);
+ zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
+ (void) zfs_fuid_table_load(os, fuid_obj,
+ &idx_tree, &domain_tree);
+ fuid_table_loaded = B_TRUE;
+ }
+
+ print_idstr(uid, "uid");
+ print_idstr(gid, "gid");
+}
+
+/*
+ * Print a znode's SA-packed extended attributes: unpack the DXATTR
+ * nvlist and print each name=value pair, escaping non-printable value
+ * bytes in octal.  All failures are silent (nothing to print).
+ */
+static void
+dump_znode_sa_xattr(sa_handle_t *hdl)
+{
+ nvlist_t *sa_xattr;
+ nvpair_t *elem = NULL;
+ int sa_xattr_size = 0;
+ int sa_xattr_entries = 0;
+ int error;
+ char *sa_xattr_packed;
+
+ error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
+ if (error || sa_xattr_size == 0)
+ return;
+
+ sa_xattr_packed = malloc(sa_xattr_size);
+ if (sa_xattr_packed == NULL)
+ return;
+
+ error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
+ sa_xattr_packed, sa_xattr_size);
+ if (error) {
+ free(sa_xattr_packed);
+ return;
+ }
+
+ error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
+ if (error) {
+ free(sa_xattr_packed);
+ return;
+ }
+
+ /* First pass: count entries for the summary line. */
+ while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
+ sa_xattr_entries++;
+
+ (void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
+ sa_xattr_size, sa_xattr_entries);
+ /* Second pass (elem is NULL again): print each pair. */
+ while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
+ uchar_t *value;
+ uint_t cnt, idx;
+
+ (void) printf("\t\t%s = ", nvpair_name(elem));
+ nvpair_value_byte_array(elem, &value, &cnt);
+ for (idx = 0; idx < cnt; ++idx) {
+ if (isprint(value[idx]))
+ (void) putchar(value[idx]);
+ else
+ (void) printf("\\%3.3o", value[idx]);
+ }
+ (void) putchar('\n');
+ }
+
+ nvlist_free(sa_xattr);
+ free(sa_xattr_packed);
+}
+
+/*
+ * Print a symlink znode's target, read from its SA SYMLINK attribute.
+ * Silently returns if the attribute is absent or empty.
+ */
+static void
+dump_znode_symlink(sa_handle_t *hdl)
+{
+ int sa_symlink_size = 0;
+ char linktarget[MAXPATHLEN];
+ int error;
+
+ error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size);
+ if (error || sa_symlink_size == 0) {
+ return;
+ }
+ /*
+ * Fix: bound the SA read.  sa_lookup() writes sa_symlink_size
+ * bytes; a target of MAXPATHLEN or more would have overflowed the
+ * stack buffer.  Reserve one byte for the NUL terminator.
+ */
+ if (sa_symlink_size >= (int)sizeof (linktarget)) {
+ (void) printf("symlink size %d is too large\n",
+ sa_symlink_size);
+ return;
+ }
+ linktarget[0] = '\0';
+ if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK],
+ &linktarget, sa_symlink_size) == 0) {
+ /* SA data is not guaranteed NUL-terminated; terminate it. */
+ linktarget[sa_symlink_size] = '\0';
+ (void) printf("\ttarget %s\n", linktarget);
+ }
+}
+
+/*ARGSUSED*/
+/*
+ * Object-viewer callback for znodes: bulk-look-up the standard system
+ * attributes and print path, ownership, timestamps, mode, size, and
+ * optional attributes (projid, xattr dir, rdev, SA xattrs).
+ */
+static void
+dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
+{
+ char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
+ sa_handle_t *hdl;
+ uint64_t xattr, rdev, gen;
+ uint64_t uid, gid, mode, fsize, parent, links;
+ uint64_t pflags;
+ uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
+ time_t z_crtime, z_atime, z_mtime, z_ctime;
+ sa_bulk_attr_t bulk[12];
+ int idx = 0;
+ int error;
+
+ VERIFY3P(os, ==, sa_os);
+ if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
+ (void) printf("Failed to get handle for SA znode\n");
+ return;
+ }
+
+ /* One bulk lookup fetches all 12 standard attributes at once. */
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
+ &links, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
+ &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
+ &fsize, 8);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
+ acctm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
+ modtm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
+ crtm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
+ chgtm, 16);
+ SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
+ &pflags, 8);
+
+ if (sa_bulk_lookup(hdl, bulk, idx)) {
+ (void) sa_handle_destroy(hdl);
+ return;
+ }
+
+ /* Timestamps are stored as [seconds, nanoseconds] pairs. */
+ z_crtime = (time_t)crtm[0];
+ z_atime = (time_t)acctm[0];
+ z_mtime = (time_t)modtm[0];
+ z_ctime = (time_t)chgtm[0];
+
+ if (dump_opt['d'] > 4) {
+ error = zfs_obj_to_path(os, object, path, sizeof (path));
+ if (error == ESTALE) {
+ (void) snprintf(path, sizeof (path), "on delete queue");
+ } else if (error != 0) {
+ leaked_objects++;
+ (void) snprintf(path, sizeof (path),
+ "path not found, possibly leaked");
+ }
+ (void) printf("\tpath	%s\n", path);
+ }
+
+ if (S_ISLNK(mode))
+ dump_znode_symlink(hdl);
+ dump_uidgid(os, uid, gid);
+ (void) printf("\tatime	%s", ctime(&z_atime));
+ (void) printf("\tmtime	%s", ctime(&z_mtime));
+ (void) printf("\tctime	%s", ctime(&z_ctime));
+ (void) printf("\tcrtime	%s", ctime(&z_crtime));
+ (void) printf("\tgen	%llu\n", (u_longlong_t)gen);
+ (void) printf("\tmode	%llo\n", (u_longlong_t)mode);
+ (void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
+ (void) printf("\tparent	%llu\n", (u_longlong_t)parent);
+ (void) printf("\tlinks	%llu\n", (u_longlong_t)links);
+ (void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
+ if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {
+ uint64_t projid;
+
+ if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,
+ sizeof (uint64_t)) == 0)
+ (void) printf("\tprojid	%llu\n", (u_longlong_t)projid);
+ }
+ if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
+ sizeof (uint64_t)) == 0)
+ (void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
+ if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
+ sizeof (uint64_t)) == 0)
+ (void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
+ dump_znode_sa_xattr(hdl);
+ sa_handle_destroy(hdl);
+}
+
+/*ARGSUSED*/
+/* Intentionally empty: no type-specific detail is printed for ACLs. */
+static void
+dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*ARGSUSED*/
+/* Intentionally empty: no type-specific detail for DMU objset objects. */
+static void
+dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
+{
+}
+
+/*
+ * Per-object-type dump callbacks, indexed by dmu_object_type_t.  The
+ * entry order must match the DMU_OT_* enum exactly, and the unknown-
+ * type fallback must remain the last entry.
+ */
+static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
+ dump_none,		/* unallocated			*/
+ dump_zap,		/* object directory		*/
+ dump_uint64,		/* object array			*/
+ dump_none,		/* packed nvlist		*/
+ dump_packed_nvlist,	/* packed nvlist size		*/
+ dump_none,		/* bpobj			*/
+ dump_bpobj,		/* bpobj header			*/
+ dump_none,		/* SPA space map header		*/
+ dump_none,		/* SPA space map		*/
+ dump_none,		/* ZIL intent log		*/
+ dump_dnode,		/* DMU dnode			*/
+ dump_dmu_objset,	/* DMU objset			*/
+ dump_dsl_dir,		/* DSL directory		*/
+ dump_zap,		/* DSL directory child map	*/
+ dump_zap,		/* DSL dataset snap map		*/
+ dump_zap,		/* DSL props			*/
+ dump_dsl_dataset,	/* DSL dataset			*/
+ dump_znode,		/* ZFS znode			*/
+ dump_acl,		/* ZFS V0 ACL			*/
+ dump_uint8,		/* ZFS plain file		*/
+ dump_zpldir,		/* ZFS directory		*/
+ dump_zap,		/* ZFS master node		*/
+ dump_zap,		/* ZFS delete queue		*/
+ dump_uint8,		/* zvol object			*/
+ dump_zap,		/* zvol prop			*/
+ dump_uint8,		/* other uint8[]		*/
+ dump_uint64,		/* other uint64[]		*/
+ dump_zap,		/* other ZAP			*/
+ dump_zap,		/* persistent error log		*/
+ dump_uint8,		/* SPA history			*/
+ dump_history_offsets,	/* SPA history offsets		*/
+ dump_zap,		/* Pool properties		*/
+ dump_zap,		/* DSL permissions		*/
+ dump_acl,		/* ZFS ACL			*/
+ dump_uint8,		/* ZFS SYSACL			*/
+ dump_none,		/* FUID nvlist			*/
+ dump_packed_nvlist,	/* FUID nvlist size		*/
+ dump_zap,		/* DSL dataset next clones	*/
+ dump_zap,		/* DSL scrub queue		*/
+ dump_zap,		/* ZFS user/group/project used	*/
+ dump_zap,		/* ZFS user/group/project quota	*/
+ dump_zap,		/* snapshot refcount tags	*/
+ dump_ddt_zap,		/* DDT ZAP object		*/
+ dump_zap,		/* DDT statistics		*/
+ dump_znode,		/* SA object			*/
+ dump_zap,		/* SA Master Node		*/
+ dump_sa_attrs,		/* SA attribute registration	*/
+ dump_sa_layouts,	/* SA attribute layouts		*/
+ dump_zap,		/* DSL scrub translations	*/
+ dump_none,		/* fake dedup BP		*/
+ dump_zap,		/* deadlist			*/
+ dump_none,		/* deadlist hdr			*/
+ dump_zap,		/* dsl clones			*/
+ dump_bpobj_subobjs,	/* bpobj subobjs		*/
+ dump_unknown,		/* Unknown type, must be last	*/
+};
+
+static boolean_t
+match_object_type(dmu_object_type_t obj_type, uint64_t flags)
+{
+ boolean_t match = B_TRUE;
+
+ switch (obj_type) {
+ case DMU_OT_DIRECTORY_CONTENTS:
+ if (!(flags & ZOR_FLAG_DIRECTORY))
+ match = B_FALSE;
+ break;
+ case DMU_OT_PLAIN_FILE_CONTENTS:
+ if (!(flags & ZOR_FLAG_PLAIN_FILE))
+ match = B_FALSE;
+ break;
+ case DMU_OT_SPACE_MAP:
+ if (!(flags & ZOR_FLAG_SPACE_MAP))
+ match = B_FALSE;
+ break;
+ default:
+ if (strcmp(zdb_ot_name(obj_type), "zap") == 0) {
+ if (!(flags & ZOR_FLAG_ZAP))
+ match = B_FALSE;
+ break;
+ }
+
+ /*
+ * If all bits except some of the supported flags are
+ * set, the user combined the all-types flag (A) with
+ * a negated flag to exclude some types (e.g. A-f to
+ * show all object types except plain files).
+ */
+ if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES)
+ match = B_FALSE;
+
+ break;
+ }
+
+ return (match);
+}
+
/*
 * Print one summary line for the given object and, at higher verbosity,
 * its bonus buffer, dnode flags, indirect blocks, and allocated segments.
 *
 * object 0 denotes the objset's meta dnode.  *print_header is cleared
 * once the column header has been printed, and set back to B_TRUE after
 * detailed (verbosity >= 4) output so the header is re-emitted before
 * the next summary line.  If dnode_slots_used is non-NULL it receives
 * the number of dnode slots this object occupies.  flags is a
 * ZOR_FLAG_* mask limiting which object types are displayed (0 shows
 * every type).
 */
static void
dump_object(objset_t *os, uint64_t object, int verbosity,
    boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags)
{
	dmu_buf_t *db = NULL;
	dmu_object_info_t doi;
	dnode_t *dn;
	boolean_t dnode_held = B_FALSE;
	void *bonus = NULL;
	size_t bsize = 0;
	char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
	char bonus_size[32];
	char aux[50];
	int error;

	/* make sure nicenum has enough space */
	CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ);

	if (*print_header) {
		(void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n",
		    "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
		    "lsize", "%full", "type");
		*print_header = 0;
	}

	if (object == 0) {
		/* Object 0 is the meta dnode; no hold is needed. */
		dn = DMU_META_DNODE(os);
		dmu_object_info_from_dnode(dn, &doi);
	} else {
		/*
		 * Encrypted datasets will have sensitive bonus buffers
		 * encrypted. Therefore we cannot hold the bonus buffer and
		 * must hold the dnode itself instead.
		 */
		error = dmu_object_info(os, object, &doi);
		if (error)
			fatal("dmu_object_info() failed, errno %u", error);

		if (os->os_encrypted &&
		    DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {
			error = dnode_hold(os, object, FTAG, &dn);
			if (error)
				fatal("dnode_hold() failed, errno %u", error);
			dnode_held = B_TRUE;
		} else {
			error = dmu_bonus_hold(os, object, FTAG, &db);
			if (error)
				fatal("dmu_bonus_hold(%llu) failed, errno %u",
				    object, error);
			bonus = db->db_data;
			bsize = db->db_size;
			/* dn is borrowed from db; released via dmu_buf_rele */
			dn = DB_DNODE((dmu_buf_impl_t *)db);
		}
	}

	/*
	 * Default to showing all object types if no flags were specified.
	 */
	if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES &&
	    !match_object_type(doi.doi_type, flags))
		goto out;

	if (dnode_slots_used)
		*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;

	zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
	zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
	zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
	zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
	zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
	/*
	 * %full: fill count scaled by block size; the meta dnode's fill
	 * counts dnodes, so divide by DNODES_PER_BLOCK in that case.
	 */
	(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
	    doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
	    doi.doi_max_offset);

	/* aux accumulates checksum/compression annotations for the line */
	aux[0] = '\0';

	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
		    " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));
	}

	if (doi.doi_compress == ZIO_COMPRESS_INHERIT &&
	    ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) {
		const char *compname = NULL;
		if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION,
		    ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel),
		    &compname) == 0) {
			(void) snprintf(aux + strlen(aux),
			    sizeof (aux) - strlen(aux), " (Z=inherit=%s)",
			    compname);
		} else {
			(void) snprintf(aux + strlen(aux),
			    sizeof (aux) - strlen(aux),
			    " (Z=inherit=%s-unknown)",
			    ZDB_COMPRESS_NAME(os->os_compress));
		}
	} else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) {
		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
		    " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress));
	} else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
		    " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));
	}

	(void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n",
	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
	    asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux);

	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
		(void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n",
		    "", "", "", "", "", "", bonus_size, "bonus",
		    zdb_ot_name(doi.doi_bonus_type));
	}

	if (verbosity >= 4) {
		(void) printf("\tdnode flags: %s%s%s%s\n",
		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
		    "USED_BYTES " : "",
		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
		    "USERUSED_ACCOUNTED " : "",
		    (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ?
		    "USEROBJUSED_ACCOUNTED " : "",
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
		    "SPILL_BLKPTR" : "");
		(void) printf("\tdnode maxblkid: %llu\n",
		    (longlong_t)dn->dn_phys->dn_maxblkid);

		/* Bonus contents can only be shown when not encrypted. */
		if (!dnode_held) {
			object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os,
			    object, bonus, bsize);
		} else {
			(void) printf("\t\t(bonus encrypted)\n");
		}

		if (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type)) {
			object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object,
			    NULL, 0);
		} else {
			(void) printf("\t\t(object encrypted)\n");
		}

		/* Re-print the column header before the next summary line. */
		*print_header = B_TRUE;
	}

	if (verbosity >= 5)
		dump_indirect(dn);

	if (verbosity >= 5) {
		/*
		 * Report the list of segments that comprise the object.
		 */
		uint64_t start = 0;
		uint64_t end;
		uint64_t blkfill = 1;
		int minlvl = 1;

		if (dn->dn_type == DMU_OT_DNODE) {
			minlvl = 0;
			blkfill = DNODES_PER_BLOCK;
		}

		for (;;) {
			char segsize[32];
			/* make sure nicenum has enough space */
			CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ);
			error = dnode_next_offset(dn,
			    0, &start, minlvl, blkfill, 0);
			if (error)
				break;
			end = start;
			error = dnode_next_offset(dn,
			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
			zdb_nicenum(end - start, segsize, sizeof (segsize));
			(void) printf("\t\tsegment [%016llx, %016llx)"
			    " size %5s\n", (u_longlong_t)start,
			    (u_longlong_t)end, segsize);
			if (error)
				break;
			start = end;
		}
	}

out:
	if (db != NULL)
		dmu_buf_rele(db, FTAG);
	if (dnode_held)
		dnode_rele(dn, FTAG);
}
+
/*
 * Record (via mos_obj_refd()) every MOS object referenced by this
 * dsl_dir, so that unreferenced MOS objects can later be reported.
 */
static void
count_dir_mos_objects(dsl_dir_t *dd)
{
	mos_obj_refd(dd->dd_object);
	mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj);
	mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj);
	mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj);
	mos_obj_refd(dsl_dir_phys(dd)->dd_clones);

	/*
	 * The dd_crypto_obj can be referenced by multiple dsl_dir's.
	 * Ignore the references after the first one.
	 */
	mos_obj_refd_multiple(dd->dd_crypto_obj);
}
+
/*
 * Record every MOS object referenced by this dataset; for a head (non-
 * snapshot) dataset, also count the objects of its containing dsl_dir.
 */
static void
count_ds_mos_objects(dsl_dataset_t *ds)
{
	mos_obj_refd(ds->ds_object);
	mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj);
	mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj);
	mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj);
	mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj);
	mos_obj_refd(ds->ds_bookmarks_obj);

	if (!dsl_dataset_is_snapshot(ds)) {
		count_dir_mos_objects(ds->ds_dir);
	}
}
+
/* Human-readable names for dmu_objset_type_t values, indexed by type. */
static const char *objset_types[DMU_OST_NUMTYPES] = {
	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
+
+/*
+ * Parse a string denoting a range of object IDs of the form
+ * <start>[:<end>[:flags]], and store the results in zor.
+ * Return 0 on success. On error, return 1 and update the msg
+ * pointer to point to a descriptive error message.
+ */
static int
parse_object_range(char *range, zopt_object_range_t *zor, char **msg)
{
	uint64_t flags = 0;
	char *p, *s, *dup, *flagstr;
	size_t len;
	int i;
	int rc = 0;

	/* A lone object ID (no colon) is a degenerate one-object range. */
	if (strchr(range, ':') == NULL) {
		zor->zor_obj_start = strtoull(range, &p, 0);
		if (*p != '\0') {
			*msg = "Invalid characters in object ID";
			rc = 1;
		}
		zor->zor_obj_end = zor->zor_obj_start;
		return (rc);
	}

	if (strchr(range, ':') == range) {
		*msg = "Invalid leading colon";
		rc = 1;
		return (rc);
	}

	len = strlen(range);
	if (range[len - 1] == ':') {
		*msg = "Invalid trailing colon";
		rc = 1;
		return (rc);
	}

	/* strtok() writes NULs into its argument, so work on a copy. */
	/* NOTE(review): strdup() result is not checked for NULL */
	dup = strdup(range);
	s = strtok(dup, ":");
	zor->zor_obj_start = strtoull(s, &p, 0);

	if (*p != '\0') {
		*msg = "Invalid characters in start object ID";
		rc = 1;
		goto out;
	}

	/*
	 * The leading/trailing-colon checks above guarantee a second
	 * token exists here.
	 */
	s = strtok(NULL, ":");
	zor->zor_obj_end = strtoull(s, &p, 0);

	if (*p != '\0') {
		*msg = "Invalid characters in end object ID";
		rc = 1;
		goto out;
	}

	if (zor->zor_obj_start > zor->zor_obj_end) {
		*msg = "Start object ID may not exceed end object ID";
		rc = 1;
		goto out;
	}

	/* No flags field: show every object type within the range. */
	s = strtok(NULL, ":");
	if (s == NULL) {
		zor->zor_flags = ZOR_FLAG_ALL_TYPES;
		goto out;
	} else if (strtok(NULL, ":") != NULL) {
		*msg = "Invalid colon-delimited field after flags";
		rc = 1;
		goto out;
	}

	/*
	 * Translate each flag character to its bit via flagbits[];
	 * a '-' prefix negates (clears) the following flag.
	 */
	flagstr = s;
	for (i = 0; flagstr[i]; i++) {
		int bit;
		boolean_t negation = (flagstr[i] == '-');

		if (negation) {
			i++;
			if (flagstr[i] == '\0') {
				*msg = "Invalid trailing negation operator";
				rc = 1;
				goto out;
			}
		}
		bit = flagbits[(uchar_t)flagstr[i]];
		if (bit == 0) {
			*msg = "Invalid flag";
			rc = 1;
			goto out;
		}
		if (negation)
			flags &= ~bit;
		else
			flags |= bit;
	}
	zor->zor_flags = flags;

out:
	free(dup);
	return (rc);
}
+
/*
 * Print a summary line for the objset followed by, depending on the -d
 * verbosity and other options, its intent log, deadlists/livelists,
 * bookmarks, per-object details, and dnode slot usage statistics.  When
 * object ranges were given on the command line (zopt_object_ranges),
 * only the objects in those ranges are dumped.
 */
static void
dump_objset(objset_t *os)
{
	dmu_objset_stats_t dds = { 0 };
	uint64_t object, object_count;
	uint64_t refdbytes, usedobjs, scratch;
	char numbuf[32];
	char blkbuf[BP_SPRINTF_LEN + 20];
	char osname[ZFS_MAX_DATASET_NAME_LEN];
	const char *type = "UNKNOWN";
	int verbosity = dump_opt['d'];
	boolean_t print_header;
	unsigned i;
	int error;
	uint64_t total_slots_used = 0;
	uint64_t max_slot_used = 0;
	uint64_t dnode_slots;
	uint64_t obj_start;
	uint64_t obj_end;
	uint64_t flags;

	/* make sure nicenum has enough space */
	CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ);

	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
	dmu_objset_fast_stat(os, &dds);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);

	print_header = B_TRUE;

	if (dds.dds_type < DMU_OST_NUMTYPES)
		type = objset_types[dds.dds_type];

	if (dds.dds_type == DMU_OST_META) {
		/* The MOS has no dataset; derive its stats directly. */
		dds.dds_creation_txg = TXG_INITIAL;
		usedobjs = BP_GET_FILL(os->os_rootbp);
		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
		    dd_used_bytes;
	} else {
		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
	}

	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));

	zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));

	if (verbosity >= 4) {
		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
	} else {
		blkbuf[0] = '\0';
	}

	dmu_objset_name(os, osname);

	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
	    "%s, %llu objects%s%s\n",
	    osname, type, (u_longlong_t)dmu_objset_id(os),
	    (u_longlong_t)dds.dds_creation_txg,
	    numbuf, (u_longlong_t)usedobjs, blkbuf,
	    (dds.dds_inconsistent) ? " (inconsistent)" : "");

	/* Explicit object ranges from the command line, if any. */
	for (i = 0; i < zopt_object_args; i++) {
		obj_start = zopt_object_ranges[i].zor_obj_start;
		obj_end = zopt_object_ranges[i].zor_obj_end;
		flags = zopt_object_ranges[i].zor_flags;

		object = obj_start;
		if (object == 0 || obj_start == obj_end)
			dump_object(os, object, verbosity, &print_header, NULL,
			    flags);
		else
			/*
			 * dmu_object_next() finds the next object after
			 * "object", so back up one to include obj_start.
			 */
			object--;

		while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) &&
		    object <= obj_end) {
			dump_object(os, object, verbosity, &print_header, NULL,
			    flags);
		}
	}

	/* When explicit ranges were requested, skip the full walk below. */
	if (zopt_object_args > 0) {
		(void) printf("\n");
		return;
	}

	if (dump_opt['i'] != 0 || verbosity >= 2)
		dump_intent_log(dmu_objset_zil(os));

	if (dmu_objset_ds(os) != NULL) {
		dsl_dataset_t *ds = dmu_objset_ds(os);
		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
		if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
		    !dmu_objset_is_snapshot(os)) {
			dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist");
			if (verify_dd_livelist(os) != 0)
				fatal("livelist is incorrect");
		}

		if (dsl_dataset_remap_deadlist_exists(ds)) {
			(void) printf("ds_remap_deadlist:\n");
			dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist");
		}
		count_ds_mos_objects(ds);
	}

	if (dmu_objset_ds(os) != NULL)
		dump_bookmarks(os, verbosity);

	if (verbosity < 2)
		return;

	if (BP_IS_HOLE(os->os_rootbp))
		return;

	/* Walk every object, starting with the meta dnode and quotas. */
	dump_object(os, 0, verbosity, &print_header, NULL, 0);
	object_count = 0;
	if (DMU_USERUSED_DNODE(os) != NULL &&
	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
		    NULL, 0);
		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
		    NULL, 0);
	}

	if (DMU_PROJECTUSED_DNODE(os) != NULL &&
	    DMU_PROJECTUSED_DNODE(os)->dn_type != 0)
		dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity,
		    &print_header, NULL, 0);

	object = 0;
	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
		dump_object(os, object, verbosity, &print_header, &dnode_slots,
		    0);
		object_count++;
		total_slots_used += dnode_slots;
		max_slot_used = object + dnode_slots - 1;
	}

	(void) printf("\n");

	(void) printf("    Dnode slots:\n");
	(void) printf("\tTotal used:    %10llu\n",
	    (u_longlong_t)total_slots_used);
	(void) printf("\tMax used:      %10llu\n",
	    (u_longlong_t)max_slot_used);
	(void) printf("\tPercent empty: %10lf\n",
	    (double)(max_slot_used - total_slots_used)*100 /
	    (double)max_slot_used);
	(void) printf("\n");

	/* ESRCH marks the normal end of iteration; anything else is fatal. */
	if (error != ESRCH) {
		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
		abort();
	}

	ASSERT3U(object_count, ==, usedobjs);

	if (leaked_objects != 0) {
		(void) printf("%d potentially leaked objects detected\n",
		    leaked_objects);
		leaked_objects = 0;
	}
}
+
/*
 * Print the fields of an uberblock, bracketed by the optional header
 * and footer strings.  MMP fields are shown only when the uberblock
 * carries valid MMP state, and the root block pointer only when -u was
 * given four or more times.
 */
static void
dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
{
	time_t timestamp = ub->ub_timestamp;

	(void) printf("%s", header ? header : "");
	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
	(void) printf("\ttimestamp = %llu UTC = %s",
	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));

	(void) printf("\tmmp_magic = %016llx\n",
	    (u_longlong_t)ub->ub_mmp_magic);
	if (MMP_VALID(ub)) {
		(void) printf("\tmmp_delay = %0llu\n",
		    (u_longlong_t)ub->ub_mmp_delay);
		if (MMP_SEQ_VALID(ub))
			(void) printf("\tmmp_seq = %u\n",
			    (unsigned int) MMP_SEQ(ub));
		if (MMP_FAIL_INT_VALID(ub))
			(void) printf("\tmmp_fail = %u\n",
			    (unsigned int) MMP_FAIL_INT(ub));
		if (MMP_INTERVAL_VALID(ub))
			(void) printf("\tmmp_write = %u\n",
			    (unsigned int) MMP_INTERVAL(ub));
		/* After MMP_* to make summarize_uberblock_mmp cleaner */
		(void) printf("\tmmp_valid = %x\n",
		    (unsigned int) ub->ub_mmp_config & 0xFF);
	}

	if (dump_opt['u'] >= 4) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
		(void) printf("\trootbp = %s\n", blkbuf);
	}
	(void) printf("\tcheckpoint_txg = %llu\n",
	    (u_longlong_t)ub->ub_checkpoint_txg);
	(void) printf("%s", footer ? footer : "");
}
+
+static void
+dump_config(spa_t *spa)
+{
+ dmu_buf_t *db;
+ size_t nvsize = 0;
+ int error = 0;
+
+
+ error = dmu_bonus_hold(spa->spa_meta_objset,
+ spa->spa_config_object, FTAG, &db);
+
+ if (error == 0) {
+ nvsize = *(uint64_t *)db->db_data;
+ dmu_buf_rele(db, FTAG);
+
+ (void) printf("\nMOS Configuration:\n");
+ dump_packed_nvlist(spa->spa_meta_objset,
+ spa->spa_config_object, (void *)&nvsize, 1);
+ } else {
+ (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
+ (u_longlong_t)spa->spa_config_object, error);
+ }
+}
+
+static void
+dump_cachefile(const char *cachefile)
+{
+ int fd;
+ struct stat64 statbuf;
+ char *buf;
+ nvlist_t *config;
+
+ if ((fd = open64(cachefile, O_RDONLY)) < 0) {
+ (void) printf("cannot open '%s': %s\n", cachefile,
+ strerror(errno));
+ exit(1);
+ }
+
+ if (fstat64(fd, &statbuf) != 0) {
+ (void) printf("failed to stat '%s': %s\n", cachefile,
+ strerror(errno));
+ exit(1);
+ }
+
+ if ((buf = malloc(statbuf.st_size)) == NULL) {
+ (void) fprintf(stderr, "failed to allocate %llu bytes\n",
+ (u_longlong_t)statbuf.st_size);
+ exit(1);
+ }
+
+ if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
+ (void) fprintf(stderr, "failed to read %llu bytes\n",
+ (u_longlong_t)statbuf.st_size);
+ exit(1);
+ }
+
+ (void) close(fd);
+
+ if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
+ (void) fprintf(stderr, "failed to unpack nvlist\n");
+ exit(1);
+ }
+
+ free(buf);
+
+ dump_nvlist(config, 0);
+
+ nvlist_free(config);
+}
+
+/*
+ * ZFS label nvlist stats
+ */
+typedef struct zdb_nvl_stats {
+ int zns_list_count;
+ int zns_leaf_count;
+ size_t zns_leaf_largest;
+ size_t zns_leaf_total;
+ nvlist_t *zns_string;
+ nvlist_t *zns_uint64;
+ nvlist_t *zns_boolean;
+} zdb_nvl_stats_t;
+
/*
 * Recursively walk an nvlist, copying each string/uint64/boolean pair
 * into the matching stats nvlist (allocated with non-unique names so
 * duplicates accumulate) and counting every nvlist visited.  Members of
 * an nvlist array named "children" are treated as leaf-vdev configs and
 * have their XDR-encoded sizes tallied.
 */
static void
collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats)
{
	nvlist_t *list, **array;
	nvpair_t *nvp = NULL;
	char *name;
	uint_t i, items;

	stats->zns_list_count++;

	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		name = nvpair_name(nvp);

		switch (nvpair_type(nvp)) {
		case DATA_TYPE_STRING:
			fnvlist_add_string(stats->zns_string, name,
			    fnvpair_value_string(nvp));
			break;
		case DATA_TYPE_UINT64:
			fnvlist_add_uint64(stats->zns_uint64, name,
			    fnvpair_value_uint64(nvp));
			break;
		case DATA_TYPE_BOOLEAN:
			fnvlist_add_boolean(stats->zns_boolean, name);
			break;
		case DATA_TYPE_NVLIST:
			if (nvpair_value_nvlist(nvp, &list) == 0)
				collect_nvlist_stats(list, stats);
			break;
		case DATA_TYPE_NVLIST_ARRAY:
			if (nvpair_value_nvlist_array(nvp, &array, &items) != 0)
				break;

			for (i = 0; i < items; i++) {
				collect_nvlist_stats(array[i], stats);

				/* collect stats on leaf vdev */
				if (strcmp(name, "children") == 0) {
					size_t size;

					(void) nvlist_size(array[i], &size,
					    NV_ENCODE_XDR);
					stats->zns_leaf_total += size;
					if (size > stats->zns_leaf_largest)
						stats->zns_leaf_largest = size;
					stats->zns_leaf_count++;
				}
			}
			break;
		default:
			/* Other pair types are not included in the stats. */
			(void) printf("skip type %d!\n", (int)nvpair_type(nvp));
		}
	}
}
+
/*
 * Print a size breakdown of a label config nvlist: bytes consumed by
 * integer, string, and boolean pairs, with the remainder attributed to
 * nvlist framing overhead, plus leaf-vdev averages.  cap is the
 * capacity of the on-disk buffer holding the nvlist and is used to
 * report free space.  Per-type sizes are measured by XDR-encoding the
 * collected pairs and subtracting the empty-nvlist overhead ("noise").
 */
static void
dump_nvlist_stats(nvlist_t *nvl, size_t cap)
{
	zdb_nvl_stats_t stats = { 0 };
	size_t size, sum = 0, total;
	size_t noise;

	/* requires nvlist with non-unique names for stat collection */
	VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0));
	VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0));
	VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0));
	VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR));

	(void) printf("\n\nZFS Label NVList Config Stats:\n");

	VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR));
	(void) printf("  %d bytes used, %d bytes free (using %4.1f%%)\n\n",
	    (int)total, (int)(cap - total), 100.0 * total / cap);

	collect_nvlist_stats(nvl, &stats);

	VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR));
	size -= noise;
	sum += size;
	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:",
	    (int)fnvlist_num_pairs(stats.zns_uint64),
	    (int)size, 100.0 * size / total);

	VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR));
	size -= noise;
	sum += size;
	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:",
	    (int)fnvlist_num_pairs(stats.zns_string),
	    (int)size, 100.0 * size / total);

	VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR));
	size -= noise;
	sum += size;
	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:",
	    (int)fnvlist_num_pairs(stats.zns_boolean),
	    (int)size, 100.0 * size / total);

	size = total - sum;	/* treat remainder as nvlist overhead */
	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:",
	    stats.zns_list_count, (int)size, 100.0 * size / total);

	if (stats.zns_leaf_count > 0) {
		size_t average = stats.zns_leaf_total / stats.zns_leaf_count;

		(void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:",
		    stats.zns_leaf_count, (int)average);
		(void) printf("%24d bytes largest\n",
		    (int)stats.zns_leaf_largest);

		if (dump_opt['l'] >= 3 && average > 0)
			(void) printf("  space for %d additional leaf vdevs\n",
			    (int)((cap - total) / average));
	}
	(void) printf("\n");

	nvlist_free(stats.zns_string);
	nvlist_free(stats.zns_uint64);
	nvlist_free(stats.zns_boolean);
}
+
/*
 * AVL node tracking a unique checksum and which of the vdev labels
 * contained data with that checksum.
 */
typedef struct cksum_record {
	zio_cksum_t cksum;		/* checksum being tracked */
	boolean_t labels[VDEV_LABELS];	/* labels[i] set if seen in label i */
	avl_node_t link;		/* AVL tree linkage */
} cksum_record_t;
+
+static int
+cksum_record_compare(const void *x1, const void *x2)
+{
+ const cksum_record_t *l = (cksum_record_t *)x1;
+ const cksum_record_t *r = (cksum_record_t *)x2;
+ int arraysize = ARRAY_SIZE(l->cksum.zc_word);
+ int difference;
+
+ for (int i = 0; i < arraysize; i++) {
+ difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]);
+ if (difference)
+ break;
+ }
+
+ return (difference);
+}
+
+static cksum_record_t *
+cksum_record_alloc(zio_cksum_t *cksum, int l)
+{
+ cksum_record_t *rec;
+
+ rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL);
+ rec->cksum = *cksum;
+ rec->labels[l] = B_TRUE;
+
+ return (rec);
+}
+
+static cksum_record_t *
+cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum)
+{
+ cksum_record_t lookup = { .cksum = *cksum };
+ avl_index_t where;
+
+ return (avl_find(tree, &lookup, &where));
+}
+
+static cksum_record_t *
+cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l)
+{
+ cksum_record_t *rec;
+
+ rec = cksum_record_lookup(tree, cksum);
+ if (rec) {
+ rec->labels[l] = B_TRUE;
+ } else {
+ rec = cksum_record_alloc(cksum, l);
+ avl_add(tree, rec);
+ }
+
+ return (rec);
+}
+
+static int
+first_label(cksum_record_t *rec)
+{
+ for (int i = 0; i < VDEV_LABELS; i++)
+ if (rec->labels[i])
+ return (i);
+
+ return (-1);
+}
+
+static void
+print_label_numbers(char *prefix, cksum_record_t *rec)
+{
+ printf("%s", prefix);
+ for (int i = 0; i < VDEV_LABELS; i++)
+ if (rec->labels[i] == B_TRUE)
+ printf("%d ", i);
+ printf("\n");
+}
+
/* Number of uberblock slots in a label's uberblock ring. */
#define	MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT)

/*
 * Per-label state gathered while scanning a device: the raw label, its
 * unpacked config nvlist, and checksum records used to detect identical
 * configs/uberblocks repeated across the labels.
 */
typedef struct zdb_label {
	vdev_label_t label;			/* raw on-disk label */
	nvlist_t *config_nv;			/* unpacked config nvlist */
	cksum_record_t *config;			/* cksum record for the config */
	cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT];
	boolean_t header_printed;		/* "LABEL n" banner emitted? */
	boolean_t read_failed;			/* label could not be read */
} zdb_label_t;
+
+static void
+print_label_header(zdb_label_t *label, int l)
+{
+
+ if (dump_opt['q'])
+ return;
+
+ if (label->header_printed == B_TRUE)
+ return;
+
+ (void) printf("------------------------------------\n");
+ (void) printf("LABEL %d\n", l);
+ (void) printf("------------------------------------\n");
+
+ label->header_printed = B_TRUE;
+}
+
/*
 * Emit the banner preceding the L2ARC device header dump.
 */
static void
print_l2arc_header(void)
{
	(void) printf(
	    "------------------------------------\n"
	    "L2ARC device header\n"
	    "------------------------------------\n");
}
+
/*
 * Emit the banner preceding the L2ARC log block dump.
 */
static void
print_l2arc_log_blocks(void)
{
	(void) printf(
	    "------------------------------------\n"
	    "L2ARC device log blocks\n"
	    "------------------------------------\n");
}
+
+static void
+dump_l2arc_log_entries(uint64_t log_entries,
+ l2arc_log_ent_phys_t *le, uint64_t i)
+{
+ for (int j = 0; j < log_entries; j++) {
+ dva_t dva = le[j].le_dva;
+ (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, "
+ "vdev: %llu, offset: %llu\n",
+ (u_longlong_t)i, j + 1,
+ (u_longlong_t)DVA_GET_ASIZE(&dva),
+ (u_longlong_t)DVA_GET_VDEV(&dva),
+ (u_longlong_t)DVA_GET_OFFSET(&dva));
+ (void) printf("|\t\t\t\tbirth: %llu\n",
+ (u_longlong_t)le[j].le_birth);
+ (void) printf("|\t\t\t\tlsize: %llu\n",
+ (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop));
+ (void) printf("|\t\t\t\tpsize: %llu\n",
+ (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop));
+ (void) printf("|\t\t\t\tcompr: %llu\n",
+ (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop));
+ (void) printf("|\t\t\t\tcomplevel: %llu\n",
+ (u_longlong_t)(&le[j])->le_complevel);
+ (void) printf("|\t\t\t\ttype: %llu\n",
+ (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop));
+ (void) printf("|\t\t\t\tprotected: %llu\n",
+ (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop));
+ (void) printf("|\t\t\t\tprefetch: %llu\n",
+ (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop));
+ (void) printf("|\t\t\t\taddress: %llu\n",
+ (u_longlong_t)le[j].le_daddr);
+ (void) printf("|\n");
+ }
+ (void) printf("\n");
+}
+
+static void
+dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps)
+{
+ (void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps.lbp_daddr);
+ (void) printf("|\t\tpayload_asize: %llu\n",
+ (u_longlong_t)lbps.lbp_payload_asize);
+ (void) printf("|\t\tpayload_start: %llu\n",
+ (u_longlong_t)lbps.lbp_payload_start);
+ (void) printf("|\t\tlsize: %llu\n",
+ (u_longlong_t)L2BLK_GET_LSIZE((&lbps)->lbp_prop));
+ (void) printf("|\t\tasize: %llu\n",
+ (u_longlong_t)L2BLK_GET_PSIZE((&lbps)->lbp_prop));
+ (void) printf("|\t\tcompralgo: %llu\n",
+ (u_longlong_t)L2BLK_GET_COMPRESS((&lbps)->lbp_prop));
+ (void) printf("|\t\tcksumalgo: %llu\n",
+ (u_longlong_t)L2BLK_GET_CHECKSUM((&lbps)->lbp_prop));
+ (void) printf("|\n\n");
+}
+
/*
 * Walk the chain of L2ARC log blocks starting from the two head block
 * pointers in the device header: verify each block's fletcher-4
 * checksum and magic, decompress when necessary, and (at higher -l
 * verbosity) print the block pointers and log entries.  The count and
 * aligned size of every valid block found are accumulated into
 * *rebuild so the caller can compare them with the values recorded in
 * the on-disk header.  This mirrors l2arc_rebuild(), minus its memory
 * pressure handling.
 */
static void
dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr,
    l2arc_dev_hdr_phys_t *rebuild)
{
	l2arc_log_blk_phys_t this_lb;
	uint64_t asize;
	l2arc_log_blkptr_t lbps[2];
	abd_t *abd;
	zio_cksum_t cksum;
	int failed = 0;
	l2arc_dev_t dev;

	if (!dump_opt['q'])
		print_l2arc_log_blocks();
	bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps));

	/*
	 * Populate a minimal l2arc_dev_t so l2arc_log_blkptr_valid() can
	 * bounds-check the block pointers we follow.
	 */
	dev.l2ad_evict = l2dhdr.dh_evict;
	dev.l2ad_start = l2dhdr.dh_start;
	dev.l2ad_end = l2dhdr.dh_end;

	if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) {
		/* no log blocks to read */
		if (!dump_opt['q']) {
			(void) printf("No log blocks to read\n");
			(void) printf("\n");
		}
		return;
	} else {
		dev.l2ad_hand = lbps[0].lbp_daddr +
		    L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
	}

	dev.l2ad_first = !!(l2dhdr.dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);

	for (;;) {
		if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))
			break;

		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
		asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
		if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) {
			if (!dump_opt['q']) {
				(void) printf("Error while reading next log "
				    "block\n\n");
			}
			break;
		}

		/* A bad checksum terminates the walk. */
		fletcher_4_native_varsize(&this_lb, asize, &cksum);
		if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {
			failed++;
			if (!dump_opt['q']) {
				(void) printf("Invalid cksum\n");
				dump_l2arc_log_blkptr(lbps[0]);
			}
			break;
		}

		/* Decompress in place unless the block is uncompressed. */
		switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
		case ZIO_COMPRESS_OFF:
			break;
		default:
			abd = abd_alloc_for_io(asize, B_TRUE);
			abd_copy_from_buf_off(abd, &this_lb, 0, asize);
			zio_decompress_data(L2BLK_GET_COMPRESS(
			    (&lbps[0])->lbp_prop), abd, &this_lb,
			    asize, sizeof (this_lb), NULL);
			abd_free(abd);
			break;
		}

		if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
			byteswap_uint64_array(&this_lb, sizeof (this_lb));
		if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {
			if (!dump_opt['q'])
				(void) printf("Invalid log block magic\n\n");
			break;
		}

		rebuild->dh_lb_count++;
		rebuild->dh_lb_asize += asize;
		if (dump_opt['l'] > 1 && !dump_opt['q']) {
			(void) printf("lb[%4llu]\tmagic: %llu\n",
			    (u_longlong_t)rebuild->dh_lb_count,
			    (u_longlong_t)this_lb.lb_magic);
			dump_l2arc_log_blkptr(lbps[0]);
		}

		if (dump_opt['l'] > 2 && !dump_opt['q'])
			dump_l2arc_log_entries(l2dhdr.dh_log_entries,
			    this_lb.lb_entries,
			    rebuild->dh_lb_count);

		/*
		 * Stop once the next block's payload overlaps the evicted
		 * region, unless the device has not wrapped yet
		 * (l2ad_first).
		 */
		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
		    lbps[0].lbp_payload_start, dev.l2ad_evict) &&
		    !dev.l2ad_first)
			break;

		/* Follow the chain to the previous log block. */
		lbps[0] = lbps[1];
		lbps[1] = this_lb.lb_prev_lbp;
	}

	if (!dump_opt['q']) {
		(void) printf("log_blk_count:\t %llu with valid cksum\n",
		    (u_longlong_t)rebuild->dh_lb_count);
		(void) printf("\t\t %d with invalid cksum\n", failed);
		(void) printf("log_blk_asize:\t %llu\n\n",
		    (u_longlong_t)rebuild->dh_lb_asize);
	}
}
+
/*
 * Read and display the L2ARC device header (stored just past the vdev
 * labels, at VDEV_LABEL_START_SIZE), byte-swapping it if it was written
 * with the opposite endianness, and then walk its log blocks via
 * dump_l2arc_log_blocks().  A missing or unrecognized header is
 * reported but still returns 0 for backward compatibility.  Returns 1
 * only if the header claims more log blocks or aligned bytes than the
 * walk actually found, which indicates an accounting leak (see the
 * comment below).
 */
static int
dump_l2arc_header(int fd)
{
	l2arc_dev_hdr_phys_t l2dhdr, rebuild;
	int error = B_FALSE;

	bzero(&l2dhdr, sizeof (l2dhdr));
	bzero(&rebuild, sizeof (rebuild));

	if (pread64(fd, &l2dhdr, sizeof (l2dhdr),
	    VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) {
		error = B_TRUE;
	} else {
		if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
			byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr));

		if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC)
			error = B_TRUE;
	}

	if (error) {
		(void) printf("L2ARC device header not found\n\n");
		/* Do not return an error here for backward compatibility */
		return (0);
	} else if (!dump_opt['q']) {
		print_l2arc_header();

		(void) printf("    magic: %llu\n",
		    (u_longlong_t)l2dhdr.dh_magic);
		(void) printf("    version: %llu\n",
		    (u_longlong_t)l2dhdr.dh_version);
		(void) printf("    pool_guid: %llu\n",
		    (u_longlong_t)l2dhdr.dh_spa_guid);
		(void) printf("    flags: %llu\n",
		    (u_longlong_t)l2dhdr.dh_flags);
		(void) printf("    start_lbps[0]: %llu\n",
		    (u_longlong_t)
		    l2dhdr.dh_start_lbps[0].lbp_daddr);
		(void) printf("    start_lbps[1]: %llu\n",
		    (u_longlong_t)
		    l2dhdr.dh_start_lbps[1].lbp_daddr);
		(void) printf("    log_blk_ent: %llu\n",
		    (u_longlong_t)l2dhdr.dh_log_entries);
		(void) printf("    start: %llu\n",
		    (u_longlong_t)l2dhdr.dh_start);
		(void) printf("    end: %llu\n",
		    (u_longlong_t)l2dhdr.dh_end);
		(void) printf("    evict: %llu\n",
		    (u_longlong_t)l2dhdr.dh_evict);
		(void) printf("    lb_asize_refcount: %llu\n",
		    (u_longlong_t)l2dhdr.dh_lb_asize);
		(void) printf("    lb_count_refcount: %llu\n",
		    (u_longlong_t)l2dhdr.dh_lb_count);
		(void) printf("    trim_action_time: %llu\n",
		    (u_longlong_t)l2dhdr.dh_trim_action_time);
		(void) printf("    trim_state: %llu\n\n",
		    (u_longlong_t)l2dhdr.dh_trim_state);
	}

	dump_l2arc_log_blocks(fd, l2dhdr, &rebuild);
	/*
	 * The total aligned size of log blocks and the number of log blocks
	 * reported in the header of the device may be less than what zdb
	 * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild().
	 * This happens because dump_l2arc_log_blocks() lacks the memory
	 * pressure valve that l2arc_rebuild() has. Thus, if we are on a system
	 * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize
	 * and dh_lb_count will be lower to begin with than what exists on the
	 * device. This is normal and zdb should not exit with an error. The
	 * opposite case should never happen though, the values reported in the
	 * header should never be higher than what dump_l2arc_log_blocks() and
	 * l2arc_rebuild() report. If this happens there is a leak in the
	 * accounting of log blocks.
	 */
	if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize ||
	    l2dhdr.dh_lb_count > rebuild.dh_lb_count)
		return (1);

	return (0);
}
+
+/*
+ * Print the nvlist configuration carried by label l.  Quiet mode (-q)
+ * prints nothing; below verbosity 3 a config shared by several labels is
+ * shown only for the first label that carries it, and verbosity 2 and up
+ * additionally prints nvlist size statistics.
+ */
+static void
+dump_config_from_label(zdb_label_t *label, size_t buflen, int l)
+{
+ if (dump_opt['q'] ||
+ (dump_opt['l'] < 3 && first_label(label->config) != l))
+ return;
+
+ print_label_header(label, l);
+ dump_nvlist(label->config_nv, 4);
+ print_label_numbers(" labels = ", label->config);
+
+ if (dump_opt['l'] > 1)
+ dump_nvlist_stats(label->config_nv, buflen);
+}
+
+/* Buffer size for the per-uberblock header string below. */
+#define ZDB_MAX_UB_HEADER_SIZE 32
+
+/*
+ * Print the uberblocks recorded for one vdev label.  A stack-local fake
+ * vdev supplies the ashift that the VDEV_UBERBLOCK_* macros need to
+ * compute the ring size and per-slot offsets.  Verbosity levels of -u
+ * control filtering: <2 hides invalid slots, <3 shows duplicates only on
+ * their first label, <4 hides the MMP slots at the end of the ring.
+ */
+static void
+dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num)
+{
+
+ vdev_t vd;
+ char header[ZDB_MAX_UB_HEADER_SIZE];
+
+ /* Only the fields consulted by the VDEV_UBERBLOCK_* macros are set. */
+ vd.vdev_ashift = ashift;
+ vd.vdev_top = &vd;
+
+ for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
+ uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
+ uberblock_t *ub = (void *)((char *)&label->label + uoff);
+ cksum_record_t *rec = label->uberblocks[i];
+
+ /* NULL record means the slot failed uberblock_verify(). */
+ if (rec == NULL) {
+ if (dump_opt['u'] >= 2) {
+ print_label_header(label, label_num);
+ (void) printf(" Uberblock[%d] invalid\n", i);
+ }
+ continue;
+ }
+
+ /* Below -uuu, show a duplicate uberblock only once. */
+ if ((dump_opt['u'] < 3) && (first_label(rec) != label_num))
+ continue;
+
+ /* Below -uuuu, skip the MMP slots at the tail of the ring. */
+ if ((dump_opt['u'] < 4) &&
+ (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay &&
+ (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL))
+ continue;
+
+ print_label_header(label, label_num);
+ (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
+ " Uberblock[%d]\n", i);
+ dump_uberblock(ub, header, "");
+ print_label_numbers(" labels = ", rec);
+ }
+}
+
+/* Accumulates the "dataset=... path=..." string for error/verbose output. */
+static char curpath[PATH_MAX];
+
+/*
+ * Iterate through the path components, recursively passing
+ * current one's obj and remaining path until we find the obj
+ * for the last one.
+ *
+ * "name" is consumed destructively: the first '/' is overwritten with
+ * '\0' so the leading component can be looked up in the directory ZAP
+ * of "obj".  Each component is appended to the global curpath as we
+ * descend.  Returns 0 after dumping the final object, an errno from the
+ * ZAP lookup, or EINVAL for unexpected object/bonus types.
+ */
+static int
+dump_path_impl(objset_t *os, uint64_t obj, char *name)
+{
+ int err;
+ boolean_t header = B_TRUE;
+ uint64_t child_obj;
+ char *s;
+ dmu_buf_t *db;
+ dmu_object_info_t doi;
+
+ /* Split off the leading path component, remembering the cut point. */
+ if ((s = strchr(name, '/')) != NULL)
+ *s = '\0';
+ err = zap_lookup(os, obj, name, 8, 1, &child_obj);
+
+ (void) strlcat(curpath, name, sizeof (curpath));
+
+ if (err != 0) {
+ (void) fprintf(stderr, "failed to lookup %s: %s\n",
+ curpath, strerror(err));
+ return (err);
+ }
+
+ /* Extract the object number from the raw directory-entry value. */
+ child_obj = ZFS_DIRENT_OBJ(child_obj);
+ err = sa_buf_hold(os, child_obj, FTAG, &db);
+ if (err != 0) {
+ (void) fprintf(stderr,
+ "failed to get SA dbuf for obj %llu: %s\n",
+ (u_longlong_t)child_obj, strerror(err));
+ return (EINVAL);
+ }
+ dmu_object_info_from_db(db, &doi);
+ sa_buf_rele(db, FTAG);
+
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ doi.doi_bonus_type != DMU_OT_ZNODE) {
+ (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
+ doi.doi_bonus_type, (u_longlong_t)child_obj);
+ return (EINVAL);
+ }
+
+ if (dump_opt['v'] > 6) {
+ (void) printf("obj=%llu %s type=%d bonustype=%d\n",
+ (u_longlong_t)child_obj, curpath, doi.doi_type,
+ doi.doi_bonus_type);
+ }
+
+ (void) strlcat(curpath, "/", sizeof (curpath));
+
+ switch (doi.doi_type) {
+ case DMU_OT_DIRECTORY_CONTENTS:
+ /* More components remain: recurse into the subdirectory. */
+ if (s != NULL && *(s + 1) != '\0')
+ return (dump_path_impl(os, child_obj, s + 1));
+ /*FALLTHROUGH*/
+ case DMU_OT_PLAIN_FILE_CONTENTS:
+ /* Final component (file, or a directory named directly). */
+ dump_object(os, child_obj, dump_opt['v'], &header, NULL, 0);
+ return (0);
+ default:
+ (void) fprintf(stderr, "object %llu has non-file/directory "
+ "type %d\n", (u_longlong_t)obj, doi.doi_type);
+ break;
+ }
+
+ return (EINVAL);
+}
+
+/*
+ * Dump the blocks for the object specified by path inside the dataset.
+ * Opens the objset, resolves the root znode from the master node ZAP,
+ * seeds the global curpath, and delegates to dump_path_impl().
+ */
+static int
+dump_path(char *ds, char *path)
+{
+ objset_t *os;
+ uint64_t rootobj;
+ int error = open_objset(ds, FTAG, &os);
+
+ if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &rootobj);
+ if (error != 0) {
+ (void) fprintf(stderr, "can't lookup root znode: %s\n",
+ strerror(error));
+ close_objset(os, FTAG);
+ return (EINVAL);
+ }
+
+ (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);
+
+ error = dump_path_impl(os, rootobj, path);
+
+ close_objset(os, FTAG);
+ return (error);
+}
+
+/*
+ * Read and display all four vdev labels from a single device (zdb -l).
+ * Pass 1 reads each label, unpacks its config, and checksums configs and
+ * uberblocks into AVL trees so copies shared across labels are reported
+ * only once.  Pass 2 prints the configs and (with -u) the uberblocks.
+ * Cache devices additionally get their L2ARC header dumped.  Returns
+ * 2 if no config was found at all, 1 on any read/unpack/L2ARC-accounting
+ * failure, 0 otherwise.
+ */
+static int
+dump_label(const char *dev)
+{
+ char path[MAXPATHLEN];
+ zdb_label_t labels[VDEV_LABELS];
+ uint64_t psize, ashift, l2cache;
+ struct stat64 statbuf;
+ boolean_t config_found = B_FALSE;
+ boolean_t error = B_FALSE;
+ boolean_t read_l2arc_header = B_FALSE;
+ avl_tree_t config_tree;
+ avl_tree_t uberblock_tree;
+ void *node, *cookie;
+ int fd;
+
+ bzero(labels, sizeof (labels));
+
+ /*
+ * Check if we were given absolute path and use it as is.
+ * Otherwise if the provided vdev name doesn't point to a file,
+ * try prepending expected disk paths and partition numbers.
+ */
+ (void) strlcpy(path, dev, sizeof (path));
+ if (dev[0] != '/' && stat64(path, &statbuf) != 0) {
+ /* NB: this int shadows the function-scope boolean "error". */
+ int error;
+
+ error = zfs_resolve_shortname(dev, path, MAXPATHLEN);
+ if (error == 0 && zfs_dev_is_whole_disk(path)) {
+ if (zfs_append_partition(path, MAXPATHLEN) == -1)
+ error = ENOENT;
+ }
+
+ if (error || (stat64(path, &statbuf) != 0)) {
+ (void) printf("failed to find device %s, try "
+ "specifying absolute path instead\n", dev);
+ return (1);
+ }
+ }
+
+ if ((fd = open64(path, O_RDONLY)) < 0) {
+ (void) printf("cannot open '%s': %s\n", path, strerror(errno));
+ exit(1);
+ }
+
+ if (fstat64_blk(fd, &statbuf) != 0) {
+ (void) printf("failed to stat '%s': %s\n", path,
+ strerror(errno));
+ (void) close(fd);
+ exit(1);
+ }
+
+ /* Drop any cached data so we read what is actually on disk. */
+ if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0)
+ (void) printf("failed to invalidate cache '%s' : %s\n", path,
+ strerror(errno));
+
+ avl_create(&config_tree, cksum_record_compare,
+ sizeof (cksum_record_t), offsetof(cksum_record_t, link));
+ avl_create(&uberblock_tree, cksum_record_compare,
+ sizeof (cksum_record_t), offsetof(cksum_record_t, link));
+
+ /* Label placement depends on a label-aligned device size. */
+ psize = statbuf.st_size;
+ psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
+ ashift = SPA_MINBLOCKSHIFT;
+
+ /*
+ * 1. Read the label from disk
+ * 2. Unpack the configuration and insert in config tree.
+ * 3. Traverse all uberblocks and insert in uberblock tree.
+ */
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ zdb_label_t *label = &labels[l];
+ char *buf = label->label.vl_vdev_phys.vp_nvlist;
+ size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
+ nvlist_t *config;
+ cksum_record_t *rec;
+ zio_cksum_t cksum;
+ vdev_t vd;
+
+ if (pread64(fd, &label->label, sizeof (label->label),
+ vdev_label_offset(psize, l, 0)) != sizeof (label->label)) {
+ if (!dump_opt['q'])
+ (void) printf("failed to read label %d\n", l);
+ label->read_failed = B_TRUE;
+ error = B_TRUE;
+ continue;
+ }
+
+ label->read_failed = B_FALSE;
+
+ if (nvlist_unpack(buf, buflen, &config, 0) == 0) {
+ nvlist_t *vdev_tree = NULL;
+ size_t size;
+
+ if ((nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
+ (nvlist_lookup_uint64(vdev_tree,
+ ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
+ ashift = SPA_MINBLOCKSHIFT;
+
+ if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0)
+ size = buflen;
+
+ /* If the device is a cache device clear the header. */
+ if (!read_l2arc_header) {
+ if (nvlist_lookup_uint64(config,
+ ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 &&
+ l2cache == POOL_STATE_L2CACHE) {
+ read_l2arc_header = B_TRUE;
+ }
+ }
+
+ /* Checksum the packed config to detect duplicates. */
+ fletcher_4_native_varsize(buf, size, &cksum);
+ rec = cksum_record_insert(&config_tree, &cksum, l);
+
+ label->config = rec;
+ label->config_nv = config;
+ config_found = B_TRUE;
+ } else {
+ error = B_TRUE;
+ }
+
+ /* ashift here is from the most recently unpacked config. */
+ vd.vdev_ashift = ashift;
+ vd.vdev_top = &vd;
+
+ for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
+ uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
+ /*
+ * NOTE(review): uoff is applied to the zdb_label_t
+ * itself, relying on the embedded on-disk label being
+ * its first member -- confirm against zdb.h.
+ */
+ uberblock_t *ub = (void *)((char *)label + uoff);
+
+ if (uberblock_verify(ub))
+ continue;
+
+ fletcher_4_native_varsize(ub, sizeof (*ub), &cksum);
+ rec = cksum_record_insert(&uberblock_tree, &cksum, l);
+
+ label->uberblocks[i] = rec;
+ }
+ }
+
+ /*
+ * Dump the label and uberblocks.
+ */
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ zdb_label_t *label = &labels[l];
+ size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
+
+ if (label->read_failed == B_TRUE)
+ continue;
+
+ if (label->config_nv) {
+ dump_config_from_label(label, buflen, l);
+ } else {
+ if (!dump_opt['q'])
+ (void) printf("failed to unpack label %d\n", l);
+ }
+
+ if (dump_opt['u'])
+ dump_label_uberblocks(label, ashift, l);
+
+ nvlist_free(label->config_nv);
+ }
+
+ /*
+ * Dump the L2ARC header, if existent.
+ */
+ if (read_l2arc_header)
+ error |= dump_l2arc_header(fd);
+
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)
+ umem_free(node, sizeof (cksum_record_t));
+
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL)
+ umem_free(node, sizeof (cksum_record_t));
+
+ avl_destroy(&config_tree);
+ avl_destroy(&uberblock_tree);
+
+ (void) close(fd);
+
+ return (config_found == B_FALSE ? 2 :
+ (error == B_TRUE ? 1 : 0));
+}
+
+/* Pool-wide tallies accumulated while iterating datasets. */
+static uint64_t dataset_feature_count[SPA_FEATURES];
+static uint64_t global_feature_count[SPA_FEATURES];
+static uint64_t remap_deadlist_count = 0;
+
+/*
+ * dmu_objset_find() callback: dump one dataset and update the feature
+ * counters above (per-dataset features, remap deadlists, redaction and
+ * written bookmarks, livelists).  Always returns 0 so iteration over the
+ * remaining datasets continues even if this one could not be opened.
+ */
+/*ARGSUSED*/
+static int
+dump_one_objset(const char *dsname, void *arg)
+{
+ int error;
+ objset_t *os;
+ spa_feature_t f;
+
+ error = open_objset(dsname, FTAG, &os);
+ if (error != 0)
+ return (0);
+
+ for (f = 0; f < SPA_FEATURES; f++) {
+ if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f))
+ continue;
+ ASSERT(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET);
+ dataset_feature_count[f]++;
+ }
+
+ if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
+ remap_deadlist_count++;
+ }
+
+ /* Walk this dataset's bookmarks for redaction/written-bookmark use. */
+ for (dsl_bookmark_node_t *dbn =
+ avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL;
+ dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) {
+ mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj);
+ if (dbn->dbn_phys.zbm_redaction_obj != 0)
+ global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++;
+ if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)
+ global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;
+ }
+
+ if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) &&
+ !dmu_objset_is_snapshot(os)) {
+ global_feature_count[SPA_FEATURE_LIVELIST]++;
+ }
+
+ dump_objset(os);
+ close_objset(os, FTAG);
+ fuid_table_destroy();
+ return (0);
+}
+
+/*
+ * Block statistics.
+ */
+/* One bucket per SPA_MINBLOCKSIZE step up to SPA_OLD_MAXBLOCKSIZE, +1 */
+/* overflow bucket for larger blocks, +1 for the zero-size slot. */
+#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
+typedef struct zdb_blkstats {
+ uint64_t zb_asize;
+ uint64_t zb_lsize;
+ uint64_t zb_psize;
+ uint64_t zb_count;
+ uint64_t zb_gangs;
+ uint64_t zb_ditto_samevdev;
+ uint64_t zb_ditto_same_ms;
+ uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
+} zdb_blkstats_t;
+
+/*
+ * Extended object types to report deferred frees and dedup auto-ditto blocks.
+ */
+#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0)
+#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1)
+#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2)
+#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3)
+
+/* Display names for the extended types above, indexed from ZDB_OT_DEFERRED. */
+static const char *zdb_ot_extname[] = {
+ "deferred free",
+ "dedup ditto",
+ "other",
+ "Total",
+};
+
+/* Pseudo-level used to aggregate statistics across all block levels. */
+#define ZB_TOTAL DN_MAX_LEVELS
+/* Number of power-of-two size buckets: shifts 0..SPA_MAXBLOCKSHIFT. */
+#define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1)
+
+/*
+ * Aggregate state threaded through the block traversal callbacks:
+ * per-(level, type) stats, dedup/removal/checkpoint accounting,
+ * psize/lsize/asize histograms, embedded-bp histograms, progress
+ * bookkeeping, per-errno error counts, and obsolete-count scratch space.
+ */
+typedef struct zdb_cb {
+ zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
+ uint64_t zcb_removing_size;
+ uint64_t zcb_checkpoint_size;
+ uint64_t zcb_dedup_asize;
+ uint64_t zcb_dedup_blocks;
+ uint64_t zcb_psize_count[SPA_MAX_FOR_16M];
+ uint64_t zcb_lsize_count[SPA_MAX_FOR_16M];
+ uint64_t zcb_asize_count[SPA_MAX_FOR_16M];
+ uint64_t zcb_psize_len[SPA_MAX_FOR_16M];
+ uint64_t zcb_lsize_len[SPA_MAX_FOR_16M];
+ uint64_t zcb_asize_len[SPA_MAX_FOR_16M];
+ uint64_t zcb_psize_total;
+ uint64_t zcb_lsize_total;
+ uint64_t zcb_asize_total;
+ uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
+ uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
+ [BPE_PAYLOAD_SIZE + 1];
+ uint64_t zcb_start;
+ hrtime_t zcb_lastprint;
+ uint64_t zcb_totalasize;
+ uint64_t zcb_errors[256];
+ int zcb_readfails;
+ int zcb_haderrors;
+ spa_t *zcb_spa;
+ uint32_t **zcb_vd_obsolete_counts;
+} zdb_cb_t;
+
+/* test if two DVA offsets from same vdev are within the same metaslab */
+static boolean_t
+same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
+{
+ uint64_t shift = vdev_lookup_top(spa, vdev)->vdev_ms_shift;
+
+ /* Offsets land in the same metaslab iff they share the same index. */
+ return ((off1 >> shift) == (off2 >> shift));
+}
+
+/*
+ * Used to simplify reporting of the histogram data.
+ */
+typedef struct one_histo {
+ char *name; /* column group title (psize/lsize/asize) */
+ uint64_t *count; /* per-shift block counts from zdb_cb_t */
+ uint64_t *len; /* per-shift byte totals from zdb_cb_t */
+ uint64_t cumulative; /* running byte total while printing rows */
+} one_histo_t;
+
+/*
+ * The number of separate histograms processed for psize, lsize and asize.
+ */
+#define NUM_HISTO 3
+
+/*
+ * This routine will create a fixed column size output of three different
+ * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M
+ * the count, length and cumulative length of the psize, lsize and
+ * asize blocks.
+ *
+ * All three types of blocks are listed on a single line
+ *
+ * By default the table is printed in nicenumber format (e.g. 123K) but
+ * if the '-P' parameter is specified then the full raw number (parseable)
+ * is printed out.
+ */
+static void
+dump_size_histograms(zdb_cb_t *zcb)
+{
+ /*
+ * A temporary buffer that allows us to convert a number into
+ * a string using zdb_nicenumber to allow either raw or human
+ * readable numbers to be output.
+ */
+ char numbuf[32];
+
+ /*
+ * Define titles which are used in the headers of the tables
+ * printed by this routine.
+ */
+ const char blocksize_title1[] = "block";
+ const char blocksize_title2[] = "size";
+ const char count_title[] = "Count";
+ const char length_title[] = "Size";
+ const char cumulative_title[] = "Cum.";
+
+ /*
+ * Setup the histogram arrays (psize, lsize, and asize).
+ */
+ one_histo_t parm_histo[NUM_HISTO];
+
+ parm_histo[0].name = "psize";
+ parm_histo[0].count = zcb->zcb_psize_count;
+ parm_histo[0].len = zcb->zcb_psize_len;
+ parm_histo[0].cumulative = 0;
+
+ parm_histo[1].name = "lsize";
+ parm_histo[1].count = zcb->zcb_lsize_count;
+ parm_histo[1].len = zcb->zcb_lsize_len;
+ parm_histo[1].cumulative = 0;
+
+ parm_histo[2].name = "asize";
+ parm_histo[2].count = zcb->zcb_asize_count;
+ parm_histo[2].len = zcb->zcb_asize_len;
+ parm_histo[2].cumulative = 0;
+
+
+ (void) printf("\nBlock Size Histogram\n");
+ /*
+ * Print the first line titles
+ */
+ if (dump_opt['P'])
+ (void) printf("\n%s\t", blocksize_title1);
+ else
+ (void) printf("\n%7s ", blocksize_title1);
+
+ for (int j = 0; j < NUM_HISTO; j++) {
+ if (dump_opt['P']) {
+ if (j < NUM_HISTO - 1) {
+ (void) printf("%s\t\t\t", parm_histo[j].name);
+ } else {
+ /* Don't print trailing spaces */
+ (void) printf(" %s", parm_histo[j].name);
+ }
+ } else {
+ if (j < NUM_HISTO - 1) {
+ /* Left aligned strings in the output */
+ (void) printf("%-7s ",
+ parm_histo[j].name);
+ } else {
+ /* Don't print trailing spaces */
+ (void) printf("%s", parm_histo[j].name);
+ }
+ }
+ }
+ (void) printf("\n");
+
+ /*
+ * Print the second line titles
+ */
+ if (dump_opt['P']) {
+ (void) printf("%s\t", blocksize_title2);
+ } else {
+ (void) printf("%7s ", blocksize_title2);
+ }
+
+ for (int i = 0; i < NUM_HISTO; i++) {
+ if (dump_opt['P']) {
+ (void) printf("%s\t%s\t%s\t",
+ count_title, length_title, cumulative_title);
+ } else {
+ (void) printf("%7s%7s%7s",
+ count_title, length_title, cumulative_title);
+ }
+ }
+ (void) printf("\n");
+
+ /*
+ * Print the rows
+ */
+ for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) {
+
+ /*
+ * Print the first column showing the blocksize
+ */
+ zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf));
+
+ if (dump_opt['P']) {
+ printf("%s", numbuf);
+ } else {
+ printf("%7s:", numbuf);
+ }
+
+ /*
+ * Print the remaining set of 3 columns per size:
+ * for psize, lsize and asize
+ */
+ for (int j = 0; j < NUM_HISTO; j++) {
+ parm_histo[j].cumulative += parm_histo[j].len[i];
+
+ zdb_nicenum(parm_histo[j].count[i],
+ numbuf, sizeof (numbuf));
+ if (dump_opt['P'])
+ (void) printf("\t%s", numbuf);
+ else
+ (void) printf("%7s", numbuf);
+
+ zdb_nicenum(parm_histo[j].len[i],
+ numbuf, sizeof (numbuf));
+ if (dump_opt['P'])
+ (void) printf("\t%s", numbuf);
+ else
+ (void) printf("%7s", numbuf);
+
+ zdb_nicenum(parm_histo[j].cumulative,
+ numbuf, sizeof (numbuf));
+ if (dump_opt['P'])
+ (void) printf("\t%s", numbuf);
+ else
+ (void) printf("%7s", numbuf);
+ }
+ (void) printf("\n");
+ }
+}
+
+/*
+ * Account one block pointer into the traversal statistics: per-(level,
+ * type) size/count/gang/ditto tallies, the power-of-two psize/lsize/asize
+ * histograms, and embedded-bp histograms.  Unless -L (no leak tracking)
+ * was given, also emulate a free of the block: decrement its DDT refcount
+ * if dedup'd, and claim it so later leak detection can account for it.
+ */
+static void
+zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
+ dmu_object_type_t type)
+{
+ uint64_t refcnt = 0;
+ int i;
+
+ ASSERT(type < ZDB_OT_TOTAL);
+
+ /* Skip blocks zil_bp_tree_add() reports as already recorded. */
+ if (zilog && zil_bp_tree_add(zilog, bp) != 0)
+ return;
+
+ spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
+
+ /* Four passes: {level, ZB_TOTAL} x {ZDB_OT_TOTAL, type}. */
+ for (i = 0; i < 4; i++) {
+ int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
+ int t = (i & 1) ? type : ZDB_OT_TOTAL;
+ int equal;
+ zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
+
+ zb->zb_asize += BP_GET_ASIZE(bp);
+ zb->zb_lsize += BP_GET_LSIZE(bp);
+ zb->zb_psize += BP_GET_PSIZE(bp);
+ zb->zb_count++;
+
+ /*
+ * The histogram is only big enough to record blocks up to
+ * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
+ * "other", bucket.
+ */
+ unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
+ idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
+ zb->zb_psize_histogram[idx]++;
+
+ zb->zb_gangs += BP_COUNT_GANG(bp);
+
+ /* Track copies that landed on the same vdev/metaslab. */
+ switch (BP_GET_NDVAS(bp)) {
+ case 2:
+ if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1])) {
+ zb->zb_ditto_samevdev++;
+
+ if (same_metaslab(zcb->zcb_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[1])))
+ zb->zb_ditto_same_ms++;
+ }
+ break;
+ case 3:
+ equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1])) +
+ (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2])) +
+ (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]));
+ if (equal != 0) {
+ zb->zb_ditto_samevdev++;
+
+ if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1]) &&
+ same_metaslab(zcb->zcb_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[1])))
+ zb->zb_ditto_same_ms++;
+ else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]) &&
+ same_metaslab(zcb->zcb_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[0]),
+ DVA_GET_OFFSET(&bp->blk_dva[2])))
+ zb->zb_ditto_same_ms++;
+ else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]) &&
+ same_metaslab(zcb->zcb_spa,
+ DVA_GET_VDEV(&bp->blk_dva[1]),
+ DVA_GET_OFFSET(&bp->blk_dva[1]),
+ DVA_GET_OFFSET(&bp->blk_dva[2])))
+ zb->zb_ditto_same_ms++;
+ }
+ break;
+ }
+ }
+
+ spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);
+
+ /* Embedded blocks occupy no DVAs; record them separately and stop. */
+ if (BP_IS_EMBEDDED(bp)) {
+ zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
+ zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
+ [BPE_GET_PSIZE(bp)]++;
+ return;
+ }
+ /*
+ * The binning histogram bins by powers of two up to
+ * SPA_MAXBLOCKSIZE rather than creating bins for
+ * every possible blocksize found in the pool.
+ */
+ int bin = highbit64(BP_GET_PSIZE(bp)) - 1;
+
+ zcb->zcb_psize_count[bin]++;
+ zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp);
+ zcb->zcb_psize_total += BP_GET_PSIZE(bp);
+
+ bin = highbit64(BP_GET_LSIZE(bp)) - 1;
+
+ zcb->zcb_lsize_count[bin]++;
+ zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp);
+ zcb->zcb_lsize_total += BP_GET_LSIZE(bp);
+
+ bin = highbit64(BP_GET_ASIZE(bp)) - 1;
+
+ zcb->zcb_asize_count[bin]++;
+ zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
+ zcb->zcb_asize_total += BP_GET_ASIZE(bp);
+
+ /* With -L we only gather statistics; no leak tracking below. */
+ if (dump_opt['L'])
+ return;
+
+ /*
+ * Emulate a free of a dedup'd block: decref this phys and drop
+ * the entry once no references remain.
+ */
+ if (BP_GET_DEDUP(bp)) {
+ ddt_t *ddt;
+ ddt_entry_t *dde;
+
+ ddt = ddt_select(zcb->zcb_spa, bp);
+ ddt_enter(ddt);
+ dde = ddt_lookup(ddt, bp, B_FALSE);
+
+ if (dde == NULL) {
+ refcnt = 0;
+ } else {
+ ddt_phys_t *ddp = ddt_phys_select(dde, bp);
+ ddt_phys_decref(ddp);
+ refcnt = ddp->ddp_refcnt;
+ if (ddt_phys_total_refcnt(dde) == 0)
+ ddt_remove(ddt, dde);
+ }
+ ddt_exit(ddt);
+ }
+
+ /*
+ * Claim the block for leak accounting; txg 0 is passed while the
+ * dedup refcount is still non-zero.
+ */
+ VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+ refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
+ bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
+}
+
+/*
+ * Completion callback for the verification reads issued by
+ * zdb_blkptr_cb(): free the data buffer, retire this I/O from the
+ * inflight-bytes throttle (waking any throttled issuer), and record the
+ * error unless the read was speculative (e.g. an intent log block).
+ */
+static void
+zdb_blkptr_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ int ioerr = zio->io_error;
+ zdb_cb_t *zcb = zio->io_private;
+ zbookmark_phys_t *zb = &zio->io_bookmark;
+
+ abd_free(zio->io_abd);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+
+ if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ char blkbuf[BP_SPRINTF_LEN];
+
+ zcb->zcb_haderrors = 1;
+ zcb->zcb_errors[ioerr]++;
+
+ if (dump_opt['b'] >= 2)
+ snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+ else
+ blkbuf[0] = '\0';
+
+ (void) printf("zdb_blkptr_cb: "
+ "Got error %d reading "
+ "<%llu, %llu, %lld, %llx> %s -- skipping\n",
+ ioerr,
+ (u_longlong_t)zb->zb_objset,
+ (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level,
+ (u_longlong_t)zb->zb_blkid,
+ blkbuf);
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Block traversal callback for zdb's pool scan (-b/-c): print the bp at
+ * high verbosity, account it via zdb_count_block(), and with -c issue an
+ * asynchronous checksum-verification read, throttled so at most
+ * max_inflight_bytes are outstanding.  Also prints a once-per-second
+ * progress line.  Always returns 0 to continue the traversal.
+ */
+static int
+zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ zdb_cb_t *zcb = arg;
+ dmu_object_type_t type;
+ boolean_t is_metadata;
+
+ if (zb->zb_level == ZB_DNODE_LEVEL)
+ return (0);
+
+ if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
+ char blkbuf[BP_SPRINTF_LEN];
+ snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+ (void) printf("objset %llu object %llu "
+ "level %lld offset 0x%llx %s\n",
+ (u_longlong_t)zb->zb_objset,
+ (u_longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (u_longlong_t)blkid2offset(dnp, bp, zb),
+ blkbuf);
+ }
+
+ if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
+ return (0);
+
+ type = BP_GET_TYPE(bp);
+
+ /* Fold types zdb doesn't know about into the "other" bucket. */
+ zdb_count_block(zcb, zilog, bp,
+ (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
+
+ is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
+
+ /* -c verifies metadata checksums; -cc verifies all blocks. */
+ if (!BP_IS_EMBEDDED(bp) &&
+ (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
+ size_t size = BP_GET_PSIZE(bp);
+ abd_t *abd = abd_alloc(size, B_FALSE);
+ int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
+
+ /* If it's an intent log block, failure is expected. */
+ if (zb->zb_level == ZB_ZIL_LEVEL)
+ flags |= ZIO_FLAG_SPECULATIVE;
+
+ /* Throttle: wait until inflight bytes drop below the cap. */
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_load_verify_bytes > max_inflight_bytes)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_load_verify_bytes += size;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ zio_nowait(zio_read(NULL, spa, bp, abd, size,
+ zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
+ }
+
+ zcb->zcb_readfails = 0;
+
+ /* only call gethrtime() every 100 blocks */
+ static int iters;
+ if (++iters > 100)
+ iters = 0;
+ else
+ return (0);
+
+ if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
+ uint64_t now = gethrtime();
+ char buf[10];
+ uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
+ int kb_per_sec =
+ 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
+ int sec_remaining =
+ (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
+
+ /* make sure nicenum has enough space */
+ CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ);
+
+ zfs_nicebytes(bytes, buf, sizeof (buf));
+ (void) fprintf(stderr,
+ "\r%5s completed (%4dMB/s) "
+ "estimated time remaining: %uhr %02umin %02usec ",
+ buf, kb_per_sec / 1024,
+ sec_remaining / 60 / 60,
+ sec_remaining / 60 % 60,
+ sec_remaining % 60);
+
+ zcb->zcb_lastprint = now;
+ }
+
+ return (0);
+}
+
+/*
+ * range_tree vacate callback: report one segment of space that is marked
+ * allocated on disk but was never referenced during traversal.
+ */
+static void
+zdb_leak(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_t *vd = arg;
+
+ (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
+}
+
+/* zdb never allocates, so no allocator is supplied. */
+static metaslab_ops_t zdb_metaslab_ops = {
+ NULL /* alloc */
+};
+
+/*
+ * Spacemap-log replay callback: fold unflushed ALLOC/FREE entries for the
+ * vdev being removed into svr_allocd_segs.  Entries for other vdevs,
+ * entries already flushed to the metaslab (txg below its unflushed txg),
+ * and offsets past the synced indirect mapping are ignored.
+ */
+/* ARGSUSED */
+static int
+load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,
+ uint64_t txg, void *arg)
+{
+ spa_vdev_removal_t *svr = arg;
+
+ uint64_t offset = sme->sme_offset;
+ uint64_t size = sme->sme_run;
+
+ /* skip vdevs we don't care about */
+ if (sme->sme_vdev != svr->svr_vdev_id)
+ return (0);
+
+ vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
+
+ if (txg < metaslab_unflushed_txg(ms))
+ return (0);
+
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ ASSERT(vim != NULL);
+ if (offset >= vdev_indirect_mapping_max_offset(vim))
+ return (0);
+
+ if (sme->sme_type == SM_ALLOC)
+ range_tree_add(svr->svr_allocd_segs, offset, size);
+ else
+ range_tree_remove(svr->svr_allocd_segs, offset, size);
+
+ return (0);
+}
+
+/*
+ * Per-destination callback of the indirect remap below: claim one mapped
+ * extent on its concrete vdev.
+ */
+/* ARGSUSED */
+static void
+claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ /*
+ * This callback was called through a remap from
+ * a device being removed. Therefore, the vdev that
+ * this callback is applied to is a concrete
+ * vdev.
+ */
+ ASSERT(vdev_is_concrete(vd));
+
+ VERIFY0(metaslab_claim_impl(vd, offset, size,
+ spa_min_claim_txg(vd->vdev_spa)));
+}
+
+/*
+ * range_tree vacate callback: remap each allocated segment of the removing
+ * vdev through its indirect mapping and claim the destination extents.
+ */
+static void
+claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
+{
+ vdev_t *vd = arg;
+
+ vdev_indirect_ops.vdev_op_remap(vd, offset, size,
+ claim_segment_impl_cb, NULL);
+}
+
+/*
+ * After accounting for all allocated blocks that are directly referenced,
+ * we might have missed a reference to a block from a partially complete
+ * (and thus unused) indirect mapping object. We perform a secondary pass
+ * through the metaslabs we have already mapped and claim the destination
+ * blocks.
+ */
+static void
+zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
+{
+ /* No leak tracking with -L, and nothing to do without a removal. */
+ if (dump_opt['L'])
+ return;
+
+ if (spa->spa_vdev_removal == NULL)
+ return;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+
+ /* Rebuild the allocated segments from the on-disk space maps. */
+ range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
+ metaslab_t *msp = vd->vdev_ms[msi];
+
+ if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
+ break;
+
+ ASSERT0(range_tree_space(allocs));
+ if (msp->ms_sm != NULL)
+ VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC));
+ range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs);
+ }
+ range_tree_destroy(allocs);
+
+ /* Fold in entries still sitting in the spacemap log. */
+ iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr);
+
+ /*
+ * Clear everything past what has been synced,
+ * because we have not allocated mappings for
+ * it yet.
+ */
+ range_tree_clear(svr->svr_allocd_segs,
+ vdev_indirect_mapping_max_offset(vim),
+ vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));
+
+ zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs);
+ range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+/*
+ * bpobj iteration callback: for each obsolete bp, bump the in-memory
+ * obsolete count of the indirect-mapping entry covering its single DVA,
+ * using the scratch counts array held in zcb_vd_obsolete_counts.
+ */
+/* ARGSUSED */
+static int
+increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ zdb_cb_t *zcb = arg;
+ spa_t *spa = zcb->zcb_spa;
+ vdev_t *vd;
+ const dva_t *dva = &bp->blk_dva[0];
+
+ ASSERT(!bp_freed);
+ ASSERT(!dump_opt['L']);
+ ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
+ ASSERT3P(vd, !=, NULL);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
+ ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
+
+ vdev_indirect_mapping_increment_obsolete_count(
+ vd->vdev_indirect_mapping,
+ DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
+ zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
+
+ return (0);
+}
+
+/*
+ * Load the per-mapping-entry obsolete counts for an indirect vdev:
+ * start from the counts object, overlay the vdev's obsolete space map
+ * (if any), and, when a condense is in progress for this vdev, overlay
+ * the previous obsolete space map as well.  Caller owns the returned
+ * array (allocated by vdev_indirect_mapping_load_obsolete_counts()).
+ */
+static uint32_t *
+zdb_load_obsolete_counts(vdev_t *vd)
+{
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ spa_t *spa = vd->vdev_spa;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ uint64_t obsolete_sm_object;
+ uint32_t *counts;
+
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
+ counts = vdev_indirect_mapping_load_obsolete_counts(vim);
+ if (vd->vdev_obsolete_sm != NULL) {
+ vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
+ vd->vdev_obsolete_sm);
+ }
+ if (scip->scip_vdev == vd->vdev_id &&
+ scip->scip_prev_obsolete_sm_object != 0) {
+ space_map_t *prev_obsolete_sm = NULL;
+ VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
+ scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
+ vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
+ prev_obsolete_sm);
+ space_map_close(prev_obsolete_sm);
+ }
+ return (counts);
+}
+
+/*
+ * Walk the on-disk DDT and prime zdb's leak tracking: tally auto-ditto
+ * blocks (DDT_PHYS_DITTO) and the space saved by dedup, and re-insert
+ * each duplicate entry into the in-core DDT so zdb_count_block() can
+ * later decref it as blocks are traversed.  Stops at DDT_CLASS_UNIQUE.
+ */
+static void
+zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+ ddt_bookmark_t ddb;
+ ddt_entry_t dde;
+ int error;
+ int p;
+
+ ASSERT(!dump_opt['L']);
+
+ bzero(&ddb, sizeof (ddb));
+ while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
+ blkptr_t blk;
+ ddt_phys_t *ddp = dde.dde_phys;
+
+ /*
+ * NOTE(review): presumably the walk visits classes in order,
+ * so the first unique entry means no further duplicates --
+ * confirm against ddt_walk().
+ */
+ if (ddb.ddb_class == DDT_CLASS_UNIQUE)
+ return;
+
+ ASSERT(ddt_phys_total_refcnt(&dde) > 1);
+
+ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+ ddt_bp_create(ddb.ddb_checksum,
+ &dde.dde_key, ddp, &blk);
+ if (p == DDT_PHYS_DITTO) {
+ zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
+ } else {
+ /* Space the extra references avoid allocating. */
+ zcb->zcb_dedup_asize +=
+ BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
+ zcb->zcb_dedup_blocks++;
+ }
+ }
+ /*
+ * Instantiate the entry in the in-core DDT (blk holds the
+ * last bp created above, which maps to the same dde_key).
+ */
+ ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
+ ddt_enter(ddt);
+ VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
+ ddt_exit(ddt);
+ }
+
+ ASSERT(error == ENOENT);
+}
+
+/* State for checkpoint_sm_exclude_entry_cb(): target vdev and a running */
+/* total of checkpointed (freed-but-held) space found on it. */
+typedef struct checkpoint_sm_exclude_entry_arg {
+ vdev_t *cseea_vd;
+ uint64_t cseea_checkpoint_size;
+} checkpoint_sm_exclude_entry_arg_t;
+
+/*
+ * space_map_iterate() callback over a vdev's checkpoint space map:
+ * remove each checkpointed (SM_FREE) segment from the owning metaslab's
+ * ms_allocatable so it is not reported as leaked, and accumulate its
+ * size into cseea_checkpoint_size.
+ */
+static int
+checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
+{
+ checkpoint_sm_exclude_entry_arg_t *cseea = arg;
+ vdev_t *vd = cseea->cseea_vd;
+ metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+ uint64_t end = sme->sme_offset + sme->sme_run;
+
+ ASSERT(sme->sme_type == SM_FREE);
+
+ /*
+ * Since the vdev_checkpoint_sm exists in the vdev level
+ * and the ms_sm space maps exist in the metaslab level,
+ * an entry in the checkpoint space map could theoretically
+ * cross the boundaries of the metaslab that it belongs.
+ *
+ * In reality, because of the way that we populate and
+ * manipulate the checkpoint's space maps currently,
+ * there shouldn't be any entries that cross metaslabs.
+ * Hence the assertion below.
+ *
+ * That said, there is no fundamental requirement that
+ * the checkpoint's space map entries should not cross
+ * metaslab boundaries. So if needed we could add code
+ * that handles metaslab-crossing segments in the future.
+ */
+ VERIFY3U(sme->sme_offset, >=, ms->ms_start);
+ VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+ /*
+ * By removing the entry from the allocated segments we
+ * also verify that the entry is there to begin with.
+ */
+ mutex_enter(&ms->ms_lock);
+ range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
+ mutex_exit(&ms->ms_lock);
+
+ cseea->cseea_checkpoint_size += sme->sme_run;
+ return (0);
+}
+
+/*
+ * Exclude one top-level vdev's checkpointed space from leak detection:
+ * locate its checkpoint space map via the vdev_top_zap (if any) and
+ * iterate it with checkpoint_sm_exclude_entry_cb(), adding the total to
+ * zcb_checkpoint_size.
+ */
+static void
+zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
+{
+ spa_t *spa = vd->vdev_spa;
+ space_map_t *checkpoint_sm = NULL;
+ uint64_t checkpoint_sm_obj;
+
+ /*
+ * If there is no vdev_top_zap, we are in a pool whose
+ * version predates the pool checkpoint feature.
+ */
+ if (vd->vdev_top_zap == 0)
+ return;
+
+ /*
+ * If there is no reference of the vdev_checkpoint_sm in
+ * the vdev_top_zap, then one of the following scenarios
+ * is true:
+ *
+ * 1] There is no checkpoint
+ * 2] There is a checkpoint, but no checkpointed blocks
+ * have been freed yet
+ * 3] The current vdev is indirect
+ *
+ * In these cases we return immediately.
+ */
+ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+ return;
+
+ VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
+ &checkpoint_sm_obj));
+
+ checkpoint_sm_exclude_entry_arg_t cseea;
+ cseea.cseea_vd = vd;
+ cseea.cseea_checkpoint_size = 0;
+
+ VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+ checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+
+ VERIFY0(space_map_iterate(checkpoint_sm,
+ space_map_length(checkpoint_sm),
+ checkpoint_sm_exclude_entry_cb, &cseea));
+ space_map_close(checkpoint_sm);
+
+ zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
+}
+
+/*
+ * Walk every top-level vdev and exclude its checkpointed space (if any)
+ * from the ms_allocatable trees.  Only called when leak detection is
+ * enabled.
+ */
+static void
+zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
+{
+	ASSERT(!dump_opt['L']);
+
+	vdev_t *root = spa->spa_root_vdev;
+	for (uint64_t i = 0; i < root->vdev_children; i++) {
+		vdev_t *tvd = root->vdev_child[i];
+
+		ASSERT3U(tvd->vdev_id, ==, i);
+		zdb_leak_init_vdev_exclude_checkpoint(tvd, zcb);
+	}
+}
+
+/*
+ * Spacemap-log iteration callback: accumulate into *arg (an int64_t)
+ * the net space (allocations minus frees) recorded at or after each
+ * metaslab's unflushed txg.  Entries on non-concrete vdevs and entries
+ * already reflected in the metaslab's space map are ignored.
+ */
+static int
+count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
+    uint64_t txg, void *arg)
+{
+	int64_t *net_alloc = arg;
+	vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);
+
+	/* Indirect (removed) vdevs have no metaslabs to account for. */
+	if (!vdev_is_concrete(vd))
+		return (0);
+
+	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
+
+	/* Entries older than the unflushed txg are already in ms_sm. */
+	if (txg < metaslab_unflushed_txg(ms))
+		return (0);
+
+	if (sme->sme_type == SM_ALLOC)
+		*net_alloc += sme->sme_run;
+	else
+		*net_alloc -= sme->sme_run;
+
+	return (0);
+}
+
+/*
+ * Return the net allocated space recorded in the spacemap logs that has
+ * not yet been flushed into the per-metaslab space maps.  When leak
+ * detection is disabled (-L) the logs are not consulted and this is 0.
+ */
+static int64_t
+get_unflushed_alloc_space(spa_t *spa)
+{
+	int64_t net_alloc = 0;
+
+	if (dump_opt['L'])
+		return (0);
+
+	iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
+	    &net_alloc);
+	return (net_alloc);
+}
+
+/*
+ * Spacemap-log iteration callback: replay an unflushed log entry into
+ * the ms_allocatable tree of the affected metaslab.  Entries whose type
+ * matches the requested maptype (*arg) are added to the tree; entries
+ * of the opposite type are removed from it.
+ */
+static int
+load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
+{
+	maptype_t *wanted = arg;
+	uint64_t off = sme->sme_offset;
+	uint64_t run = sme->sme_run;
+	vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);
+
+	/* skip indirect vdevs */
+	if (!vdev_is_concrete(vd))
+		return (0);
+
+	metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift];
+
+	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
+	ASSERT(*wanted == SM_ALLOC || *wanted == SM_FREE);
+
+	/* Entries before the unflushed txg are already reflected in ms_sm. */
+	if (txg < metaslab_unflushed_txg(ms))
+		return (0);
+
+	if (sme->sme_type == *wanted)
+		range_tree_add(ms->ms_allocatable, off, run);
+	else
+		range_tree_remove(ms->ms_allocatable, off, run);
+
+	return (0);
+}
+
+/*
+ * Replay every unflushed spacemap-log entry of the given type into the
+ * per-metaslab ms_allocatable trees.
+ */
+static void
+load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)
+{
+	maptype_t mt = maptype;
+
+	iterate_through_spacemap_logs(spa, load_unflushed_cb, &mt);
+}
+
+/*
+ * Rebuild every concrete metaslab's ms_allocatable tree from its space
+ * map, loading entries of the given maptype (for leak detection this is
+ * SM_ALLOC, inverting the tree's usual free-segment meaning), and then
+ * replay any unflushed spacemap-log entries on top.
+ */
+static void
+load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *vd = rvd->vdev_child[i];
+
+		ASSERT3U(i, ==, vd->vdev_id);
+
+		/* Indirect vdevs are handled separately. */
+		if (vd->vdev_ops == &vdev_indirect_ops)
+			continue;
+
+		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+			metaslab_t *msp = vd->vdev_ms[m];
+
+			(void) fprintf(stderr,
+			    "\rloading concrete vdev %llu, "
+			    "metaslab %llu of %llu ...",
+			    (longlong_t)vd->vdev_id,
+			    (longlong_t)msp->ms_id,
+			    (longlong_t)vd->vdev_ms_count);
+
+			mutex_enter(&msp->ms_lock);
+			range_tree_vacate(msp->ms_allocatable, NULL, NULL);
+
+			/*
+			 * We don't want to spend the CPU manipulating the
+			 * size-ordered tree, so clear the range_tree ops.
+			 */
+			msp->ms_allocatable->rt_ops = NULL;
+
+			if (msp->ms_sm != NULL) {
+				VERIFY0(space_map_load(msp->ms_sm,
+				    msp->ms_allocatable, maptype));
+			}
+			/* Mark loaded so later code treats the tree as valid. */
+			if (!msp->ms_loaded)
+				msp->ms_loaded = B_TRUE;
+			mutex_exit(&msp->ms_lock);
+		}
+	}
+
+	load_unflushed_to_ms_allocatables(spa, maptype);
+}
+
+/*
+ * Populate an indirect vdev's metaslab ms_allocatable tree with the
+ * segments that are still mapped by the vdev's indirect mapping.
+ *
+ * vim_idxp is an in-out parameter which (for indirect vdevs) is the
+ * index in vim_entries that has the first entry in this metaslab.
+ * On return, it will be set to the first entry after this metaslab.
+ */
+static void
+load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
+    uint64_t *vim_idxp)
+{
+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+	mutex_enter(&msp->ms_lock);
+	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
+
+	/*
+	 * We don't want to spend the CPU manipulating the
+	 * size-ordered tree, so clear the range_tree ops.
+	 */
+	msp->ms_allocatable->rt_ops = NULL;
+
+	for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
+	    (*vim_idxp)++) {
+		vdev_indirect_mapping_entry_phys_t *vimep =
+		    &vim->vim_entries[*vim_idxp];
+		uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
+		uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
+		ASSERT3U(ent_offset, >=, msp->ms_start);
+		/* First entry past this metaslab; leave it for the next one. */
+		if (ent_offset >= msp->ms_start + msp->ms_size)
+			break;
+
+		/*
+		 * Mappings do not cross metaslab boundaries,
+		 * because we create them by walking the metaslabs.
+		 */
+		ASSERT3U(ent_offset + ent_len, <=,
+		    msp->ms_start + msp->ms_size);
+		range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
+	}
+
+	if (!msp->ms_loaded)
+		msp->ms_loaded = B_TRUE;
+	mutex_exit(&msp->ms_lock);
+}
+
+/*
+ * Prepare indirect (removed) vdevs for leak detection: load their
+ * obsolete reference counts into zcb_vd_obsolete_counts, instantiate
+ * metaslabs so zio_claim() can operate on them, and fill each
+ * metaslab's ms_allocatable with the still-mapped segments.
+ */
+static void
+zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
+{
+	ASSERT(!dump_opt['L']);
+
+	vdev_t *rvd = spa->spa_root_vdev;
+	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *vd = rvd->vdev_child[c];
+
+		ASSERT3U(c, ==, vd->vdev_id);
+
+		if (vd->vdev_ops != &vdev_indirect_ops)
+			continue;
+
+		/*
+		 * Note: we don't check for mapping leaks on
+		 * removing vdevs because their ms_allocatable's
+		 * are used to look for leaks in allocated space.
+		 */
+		zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);
+
+		/*
+		 * Normally, indirect vdevs don't have any
+		 * metaslabs. We want to set them up for
+		 * zio_claim().
+		 */
+		VERIFY0(vdev_metaslab_init(vd, 0));
+
+		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+		uint64_t vim_idx = 0;
+		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+
+			(void) fprintf(stderr,
+			    "\rloading indirect vdev %llu, "
+			    "metaslab %llu of %llu ...",
+			    (longlong_t)vd->vdev_id,
+			    (longlong_t)vd->vdev_ms[m]->ms_id,
+			    (longlong_t)vd->vdev_ms_count);
+
+			load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
+			    &vim_idx);
+		}
+		/* Every mapping entry must have landed in some metaslab. */
+		ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
+	}
+}
+
+/*
+ * Set up leak detection state prior to the block traversal: repurpose
+ * every metaslab's ms_allocatable tree to hold *allocated* segments
+ * (from ms_sm, the spacemap logs, and indirect mappings), exclude
+ * checkpointed space, account for obsolete mappings, and prime the DDT.
+ * No-op when -L disables leak detection.
+ */
+static void
+zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
+{
+	zcb->zcb_spa = spa;
+
+	if (dump_opt['L'])
+		return;
+
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	/*
+	 * We are going to be changing the meaning of the metaslab's
+	 * ms_allocatable.  Ensure that the allocator doesn't try to
+	 * use the tree.
+	 */
+	spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
+	spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
+
+	zcb->zcb_vd_obsolete_counts =
+	    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
+	    UMEM_NOFAIL);
+
+	/*
+	 * For leak detection, we overload the ms_allocatable trees
+	 * to contain allocated segments instead of free segments.
+	 * As a result, we can't use the normal metaslab_load/unload
+	 * interfaces.
+	 */
+	zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
+	load_concrete_ms_allocatable_trees(spa, SM_ALLOC);
+
+	/*
+	 * On load_concrete_ms_allocatable_trees() we loaded all the
+	 * allocated entries from the ms_sm to the ms_allocatable for
+	 * each metaslab. If the pool has a checkpoint or is in the
+	 * middle of discarding a checkpoint, some of these blocks
+	 * may have been freed but their ms_sm may not have been
+	 * updated because they are referenced by the checkpoint. In
+	 * order to avoid false-positives during leak-detection, we
+	 * go through the vdev's checkpoint space map and exclude all
+	 * its entries from their relevant ms_allocatable.
+	 *
+	 * We also aggregate the space held by the checkpoint and add
+	 * it to zcb_checkpoint_size.
+	 *
+	 * Note that at this point we are also verifying that all the
+	 * entries on the checkpoint_sm are marked as allocated in
+	 * the ms_sm of their relevant metaslab.
+	 * [see comment in checkpoint_sm_exclude_entry_cb()]
+	 */
+	zdb_leak_init_exclude_checkpoint(spa, zcb);
+	ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));
+
+	/* for cleaner progress output */
+	(void) fprintf(stderr, "\n");
+
+	if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+		ASSERT(spa_feature_is_enabled(spa,
+		    SPA_FEATURE_DEVICE_REMOVAL));
+		(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
+		    increment_indirect_mapping_cb, zcb, NULL);
+	}
+
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+	zdb_ddt_leak_init(spa, zcb);
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+}
+
+/*
+ * Compare an indirect vdev's mapping against its recorded obsolete
+ * counts.  Mapped space still present in ms_allocatable after the
+ * traversal should match the obsolete counts; a mismatch with precise
+ * counts is a leak.  Frees (and NULLs) the vdev's obsolete count array.
+ * Returns B_TRUE if a definite leak was found, B_FALSE otherwise.
+ */
+static boolean_t
+zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
+{
+	boolean_t leaks = B_FALSE;
+	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+	uint64_t total_leaked = 0;
+	boolean_t are_precise = B_FALSE;
+
+	ASSERT(vim != NULL);
+
+	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
+		vdev_indirect_mapping_entry_phys_t *vimep =
+		    &vim->vim_entries[i];
+		uint64_t obsolete_bytes = 0;
+		uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
+		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+		/*
+		 * This is not very efficient but it's easy to
+		 * verify correctness.  Use 1ULL for the shifted
+		 * sector size so a large ashift cannot overflow
+		 * a 32-bit int.
+		 */
+		for (uint64_t inner_offset = 0;
+		    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
+		    inner_offset += 1ULL << vd->vdev_ashift) {
+			if (range_tree_contains(msp->ms_allocatable,
+			    offset + inner_offset, 1ULL << vd->vdev_ashift)) {
+				obsolete_bytes += 1ULL << vd->vdev_ashift;
+			}
+		}
+
+		int64_t bytes_leaked = obsolete_bytes -
+		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
+		ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
+		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
+
+		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+		if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {
+			(void) printf("obsolete indirect mapping count "
+			    "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
+			    (u_longlong_t)vd->vdev_id,
+			    (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
+			    (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
+			    (u_longlong_t)bytes_leaked);
+		}
+		total_leaked += ABS(bytes_leaked);
+	}
+
+	VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+	if (!are_precise && total_leaked > 0) {
+		int pct_leaked = total_leaked * 100 /
+		    vdev_indirect_mapping_bytes_mapped(vim);
+		(void) printf("cannot verify obsolete indirect mapping "
+		    "counts of vdev %llu because precise feature was not "
+		    "enabled when it was removed: %d%% (%llx bytes) of "
+		    "mapping unreferenced\n",
+		    (u_longlong_t)vd->vdev_id, pct_leaked,
+		    (u_longlong_t)total_leaked);
+	} else if (total_leaked > 0) {
+		(void) printf("obsolete indirect mapping count mismatch "
+		    "for vdev %llu -- %llx total bytes mismatched\n",
+		    (u_longlong_t)vd->vdev_id,
+		    (u_longlong_t)total_leaked);
+		leaks |= B_TRUE;
+	}
+
+	vdev_indirect_mapping_free_obsolete_counts(vim,
+	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
+	zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
+
+	return (leaks);
+}
+
+/*
+ * Finish leak detection after the traversal: anything still present in
+ * a concrete metaslab's ms_allocatable is a leaked block and is reported
+ * via zdb_leak(); indirect vdevs are instead checked against their
+ * obsolete counts.  Returns B_TRUE if any leak was found (always
+ * B_FALSE under -L, where no leak state was set up).
+ */
+static boolean_t
+zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
+{
+	if (dump_opt['L'])
+		return (B_FALSE);
+
+	boolean_t leaks = B_FALSE;
+	vdev_t *rvd = spa->spa_root_vdev;
+	for (unsigned c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *vd = rvd->vdev_child[c];
+		metaslab_group_t *mg __maybe_unused = vd->vdev_mg;
+
+		if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
+			leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
+		}
+
+		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+			metaslab_t *msp = vd->vdev_ms[m];
+			ASSERT3P(mg, ==, msp->ms_group);
+
+			/*
+			 * ms_allocatable has been overloaded
+			 * to contain allocated segments. Now that
+			 * we finished traversing all blocks, any
+			 * block that remains in the ms_allocatable
+			 * represents an allocated block that we
+			 * did not claim during the traversal.
+			 * Claimed blocks would have been removed
+			 * from the ms_allocatable.  For indirect
+			 * vdevs, space remaining in the tree
+			 * represents parts of the mapping that are
+			 * not referenced, which is not a bug.
+			 */
+			if (vd->vdev_ops == &vdev_indirect_ops) {
+				range_tree_vacate(msp->ms_allocatable,
+				    NULL, NULL);
+			} else {
+				range_tree_vacate(msp->ms_allocatable,
+				    zdb_leak, vd);
+			}
+			if (msp->ms_loaded) {
+				msp->ms_loaded = B_FALSE;
+			}
+		}
+	}
+
+	umem_free(zcb->zcb_vd_obsolete_counts,
+	    rvd->vdev_children * sizeof (uint32_t *));
+	zcb->zcb_vd_obsolete_counts = NULL;
+
+	return (leaks);
+}
+
+/* ARGSUSED */
+/*
+ * bplist/bptree iteration callback: account one deferred-free block in
+ * the traversal statistics, printing it first at high verbosity.
+ */
+static int
+count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	zdb_cb_t *zcb = arg;
+
+	if (dump_opt['b'] >= 5) {
+		char blkbuf[BP_SPRINTF_LEN];
+
+		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+		(void) printf("[%s] %s\n", "deferred free", blkbuf);
+	}
+	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
+	return (0);
+}
+
+/*
+ * Iterate over livelists which have been destroyed by the user but
+ * are still present in the MOS, waiting to be freed.  Invokes func(ll,
+ * arg) once per livelist; silently returns if the pool has no
+ * deleted-clones ZAP at all (ENOENT).
+ */
+static void
+iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
+{
+	objset_t *mos = spa->spa_meta_objset;
+	uint64_t zap_obj;
+	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
+	if (err == ENOENT)
+		return;
+	ASSERT0(err);
+
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	dsl_deadlist_t ll;
+	/* NULL out os prior to dsl_deadlist_open in case it's garbage */
+	ll.dl_os = NULL;
+	for (zap_cursor_init(&zc, mos, zap_obj);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    (void) zap_cursor_advance(&zc)) {
+		dsl_deadlist_open(&ll, mos, attr.za_first_integer);
+		func(&ll, arg);
+		dsl_deadlist_close(&ll);
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * bpobj iteration shim for count_block_cb(): bpobj callbacks receive a
+ * bp_freed flag, but the bpobjs counted here never contain freed
+ * entries, hence the assertion.
+ */
+static int
+bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
+{
+	ASSERT(!bp_freed);
+	return (count_block_cb(arg, bp, tx));
+}
+
+/*
+ * Per-livelist-entry callback: compute the set of blocks that were
+ * allocated but never freed by this entry's sub-livelist, and fold them
+ * into the traversal statistics (args is the zdb_cb_t).
+ */
+static int
+livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
+{
+	zdb_cb_t *zbc = args;
+	bplist_t blks;
+	bplist_create(&blks);
+	/* determine which blocks have been alloc'd but not freed */
+	VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL));
+	/* count those blocks */
+	(void) bplist_iterate(&blks, count_block_cb, zbc, NULL);
+	bplist_destroy(&blks);
+	return (0);
+}
+
+/* Count the still-allocated blocks of every entry in one livelist. */
+static void
+livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
+{
+	dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg);
+}
+
+/*
+ * Count the blocks in the livelists that have been destroyed by the user
+ * but haven't yet been freed.  The counts land in zbc's statistics.
+ */
+static void
+deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
+{
+	iterate_deleted_livelists(spa, livelist_count_blocks, zbc);
+}
+
+/*
+ * Per-deleted-livelist callback: bump the livelist feature count, dump
+ * the livelist's block pointers, and sanity-check its sub-livelists.
+ * arg is unused and must be NULL.
+ */
+static void
+dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
+{
+	ASSERT3P(arg, ==, NULL);
+	global_feature_count[SPA_FEATURE_LIVELIST]++;
+	dump_blkptr_list(ll, "Deleted Livelist");
+	dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);
+}
+
+/*
+ * Print out, register object references to, and increment feature counts
+ * for livelists that have been destroyed by the user but haven't yet
+ * been freed.  No-op if the pool has no deleted-clones ZAP.
+ */
+static void
+deleted_livelists_dump_mos(spa_t *spa)
+{
+	uint64_t zap_obj;
+	objset_t *mos = spa->spa_meta_objset;
+	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
+	if (err == ENOENT)
+		return;
+	mos_obj_refd(zap_obj);
+	iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
+}
+
+/*
+ * Traverse every block in the pool, accumulating per-type statistics
+ * and (unless -L) claiming each block against the repurposed
+ * ms_allocatable trees to detect leaked or double-allocated space.
+ * Prints the full statistics report.  Returns 0 on success, 2 if leaks
+ * were detected or the pool is empty, and 3 on traversal errors.
+ *
+ * Note: the zcb is zeroed exactly once below; a second redundant
+ * bzero() of the same structure was removed.
+ */
+static int
+dump_block_stats(spa_t *spa)
+{
+	zdb_cb_t zcb;
+	zdb_blkstats_t *zb, *tzb;
+	uint64_t norm_alloc, norm_space, total_alloc, total_found;
+	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
+	    TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;
+	boolean_t leaks = B_FALSE;
+	int e, c, err;
+	bp_embedded_type_t i;
+
+	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
+	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
+	    (dump_opt['c'] == 1) ? "metadata " : "",
+	    dump_opt['c'] ? "checksums " : "",
+	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
+	    !dump_opt['L'] ? "nothing leaked " : "");
+
+	/*
+	 * When leak detection is enabled we load all space maps as SM_ALLOC
+	 * maps, then traverse the pool claiming each block we discover. If
+	 * the pool is perfectly consistent, the segment trees will be empty
+	 * when we're done. Anything left over is a leak; any block we can't
+	 * claim (because it's not part of any space map) is a double
+	 * allocation, reference to a freed block, or an unclaimed log block.
+	 *
+	 * When leak detection is disabled (-L option) we still traverse the
+	 * pool claiming each block we discover, but we skip opening any space
+	 * maps.
+	 */
+	bzero(&zcb, sizeof (zdb_cb_t));
+	zdb_leak_init(spa, &zcb);
+
+	/*
+	 * If there's a deferred-free bplist, process that first.
+	 */
+	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
+	    bpobj_count_block_cb, &zcb, NULL);
+
+	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+		    bpobj_count_block_cb, &zcb, NULL);
+	}
+
+	zdb_claim_removing(spa, &zcb);
+
+	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
+		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
+		    &zcb, NULL));
+	}
+
+	deleted_livelists_count_blocks(spa, &zcb);
+
+	if (dump_opt['c'] > 1)
+		flags |= TRAVERSE_PREFETCH_DATA;
+
+	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
+	zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
+	zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
+	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
+	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
+
+	/*
+	 * If we've traversed the data blocks then we need to wait for those
+	 * I/Os to complete. We leverage "The Godfather" zio to wait on
+	 * all async I/Os to complete.
+	 */
+	if (dump_opt['c']) {
+		for (c = 0; c < max_ncpus; c++) {
+			(void) zio_wait(spa->spa_async_zio_root[c]);
+			spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,
+			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+			    ZIO_FLAG_GODFATHER);
+		}
+	}
+	ASSERT0(spa->spa_load_verify_bytes);
+
+	/*
+	 * Done after zio_wait() since zcb_haderrors is modified in
+	 * zdb_blkptr_done()
+	 */
+	zcb.zcb_haderrors |= err;
+
+	if (zcb.zcb_haderrors) {
+		(void) printf("\nError counts:\n\n");
+		(void) printf("\t%5s  %s\n", "errno", "count");
+		for (e = 0; e < 256; e++) {
+			if (zcb.zcb_errors[e] != 0) {
+				(void) printf("\t%5d  %llu\n",
+				    e, (u_longlong_t)zcb.zcb_errors[e]);
+			}
+		}
+	}
+
+	/*
+	 * Report any leaked segments.
+	 */
+	leaks |= zdb_leak_fini(spa, &zcb);
+
+	tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
+
+	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+	norm_space = metaslab_class_get_space(spa_normal_class(spa));
+
+	total_alloc = norm_alloc +
+	    metaslab_class_get_alloc(spa_log_class(spa)) +
+	    metaslab_class_get_alloc(spa_special_class(spa)) +
+	    metaslab_class_get_alloc(spa_dedup_class(spa)) +
+	    get_unflushed_alloc_space(spa);
+	total_found = tzb->zb_asize - zcb.zcb_dedup_asize +
+	    zcb.zcb_removing_size + zcb.zcb_checkpoint_size;
+
+	if (total_found == total_alloc && !dump_opt['L']) {
+		(void) printf("\n\tNo leaks (block sum matches space"
+		    " maps exactly)\n");
+	} else if (!dump_opt['L']) {
+		(void) printf("block traversal size %llu != alloc %llu "
+		    "(%s %lld)\n",
+		    (u_longlong_t)total_found,
+		    (u_longlong_t)total_alloc,
+		    (dump_opt['L']) ? "unreachable" : "leaked",
+		    (longlong_t)(total_alloc - total_found));
+		leaks = B_TRUE;
+	}
+
+	if (tzb->zb_count == 0)
+		return (2);
+
+	(void) printf("\n");
+	(void) printf("\t%-16s %14llu\n", "bp count:",
+	    (u_longlong_t)tzb->zb_count);
+	(void) printf("\t%-16s %14llu\n", "ganged count:",
+	    (longlong_t)tzb->zb_gangs);
+	(void) printf("\t%-16s %14llu      avg: %6llu\n", "bp logical:",
+	    (u_longlong_t)tzb->zb_lsize,
+	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
+	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
+	    "bp physical:", (u_longlong_t)tzb->zb_psize,
+	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
+	    (double)tzb->zb_lsize / tzb->zb_psize);
+	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
+	    "bp allocated:", (u_longlong_t)tzb->zb_asize,
+	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
+	    (double)tzb->zb_lsize / tzb->zb_asize);
+	(void) printf("\t%-16s %14llu    ref>1: %6llu   deduplication: %6.2f\n",
+	    "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize,
+	    (u_longlong_t)zcb.zcb_dedup_blocks,
+	    (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
+	(void) printf("\t%-16s %14llu     used: %5.2f%%\n", "Normal class:",
+	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
+
+	if (spa_special_class(spa)->mc_rotor != NULL) {
+		uint64_t alloc = metaslab_class_get_alloc(
+		    spa_special_class(spa));
+		uint64_t space = metaslab_class_get_space(
+		    spa_special_class(spa));
+
+		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
+		    "Special class", (u_longlong_t)alloc,
+		    100.0 * alloc / space);
+	}
+
+	if (spa_dedup_class(spa)->mc_rotor != NULL) {
+		uint64_t alloc = metaslab_class_get_alloc(
+		    spa_dedup_class(spa));
+		uint64_t space = metaslab_class_get_space(
+		    spa_dedup_class(spa));
+
+		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
+		    "Dedup class", (u_longlong_t)alloc,
+		    100.0 * alloc / space);
+	}
+
+	for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
+		if (zcb.zcb_embedded_blocks[i] == 0)
+			continue;
+		(void) printf("\n");
+		(void) printf("\tadditional, non-pointer bps of type %u: "
+		    "%10llu\n",
+		    i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
+
+		if (dump_opt['b'] >= 3) {
+			(void) printf("\t number of (compressed) bytes:  "
+			    "number of bps\n");
+			dump_histogram(zcb.zcb_embedded_histogram[i],
+			    sizeof (zcb.zcb_embedded_histogram[i]) /
+			    sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
+		}
+	}
+
+	if (tzb->zb_ditto_samevdev != 0) {
+		(void) printf("\tDittoed blocks on same vdev: %llu\n",
+		    (longlong_t)tzb->zb_ditto_samevdev);
+	}
+	if (tzb->zb_ditto_same_ms != 0) {
+		(void) printf("\tDittoed blocks in same metaslab: %llu\n",
+		    (longlong_t)tzb->zb_ditto_same_ms);
+	}
+
+	for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
+		vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
+		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+		if (vim == NULL) {
+			continue;
+		}
+
+		char mem[32];
+		zdb_nicenum(vdev_indirect_mapping_num_entries(vim),
+		    mem, vdev_indirect_mapping_size(vim));
+
+		(void) printf("\tindirect vdev id %llu has %llu segments "
+		    "(%s in memory)\n",
+		    (longlong_t)vd->vdev_id,
+		    (longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
+	}
+
+	if (dump_opt['b'] >= 2) {
+		int l, t, level;
+		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
+		    "\t  avg\t comp\t%%Total\tType\n");
+
+		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
+			char csize[32], lsize[32], psize[32], asize[32];
+			char avg[32], gang[32];
+			const char *typename;
+
+			/* make sure nicenum has enough space */
+			CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ);
+			CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ);
+			CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ);
+			CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ);
+			CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ);
+			CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ);
+
+			if (t < DMU_OT_NUMTYPES)
+				typename = dmu_ot[t].ot_name;
+			else
+				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
+
+			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
+				(void) printf("%6s\t%5s\t%5s\t%5s"
+				    "\t%5s\t%5s\t%6s\t%s\n",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    "-",
+				    typename);
+				continue;
+			}
+
+			for (l = ZB_TOTAL - 1; l >= -1; l--) {
+				level = (l == -1 ? ZB_TOTAL : l);
+				zb = &zcb.zcb_type[level][t];
+
+				if (zb->zb_asize == 0)
+					continue;
+
+				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
+					continue;
+
+				if (level == 0 && zb->zb_asize ==
+				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
+					continue;
+
+				zdb_nicenum(zb->zb_count, csize,
+				    sizeof (csize));
+				zdb_nicenum(zb->zb_lsize, lsize,
+				    sizeof (lsize));
+				zdb_nicenum(zb->zb_psize, psize,
+				    sizeof (psize));
+				zdb_nicenum(zb->zb_asize, asize,
+				    sizeof (asize));
+				zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
+				    sizeof (avg));
+				zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
+
+				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
+				    "\t%5.2f\t%6.2f\t",
+				    csize, lsize, psize, asize, avg,
+				    (double)zb->zb_lsize / zb->zb_psize,
+				    100.0 * zb->zb_asize / tzb->zb_asize);
+
+				if (level == ZB_TOTAL)
+					(void) printf("%s\n", typename);
+				else
+					(void) printf("    L%d %s\n",
+					    level, typename);
+
+				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
+					(void) printf("\t number of ganged "
+					    "blocks: %s\n", gang);
+				}
+
+				if (dump_opt['b'] >= 4) {
+					(void) printf("psize "
+					    "(in 512-byte sectors): "
+					    "number of blocks\n");
+					dump_histogram(zb->zb_psize_histogram,
+					    PSIZE_HISTO_SIZE, 0);
+				}
+			}
+		}
+
+		/* Output a table summarizing block sizes in the pool */
+		if (dump_opt['b'] >= 2) {
+			dump_size_histograms(&zcb);
+		}
+	}
+
+	(void) printf("\n");
+
+	if (leaks)
+		return (2);
+
+	if (zcb.zcb_haderrors)
+		return (3);
+
+	return (0);
+}
+
+/*
+ * One entry of the simulated DDT built by zdb_ddt_add_cb(): aggregate
+ * reference counts and sizes for all blocks sharing the same ddt key.
+ */
+typedef struct zdb_ddt_entry {
+	ddt_key_t	zdde_key;	/* checksum/size identity of block */
+	uint64_t	zdde_ref_blocks; /* number of references seen */
+	uint64_t	zdde_ref_lsize;	/* total logical size referenced */
+	uint64_t	zdde_ref_psize;	/* total physical size referenced */
+	uint64_t	zdde_ref_dsize;	/* total deflated size referenced */
+	avl_node_t	zdde_node;	/* linkage in the simulation AVL */
+} zdb_ddt_entry_t;
+
+/* ARGSUSED */
+/*
+ * Pool-traversal callback for dedup simulation: record each eligible
+ * data block in the AVL tree keyed by its ddt key, accumulating
+ * reference counts and logical/physical/deflated sizes.
+ */
+static int
+zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	avl_tree_t *t = arg;
+	avl_index_t where;
+	zdb_ddt_entry_t *zdde, zdde_search;
+
+	if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
+	    BP_IS_EMBEDDED(bp))
+		return (0);
+
+	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
+		(void) printf("traversing objset %llu, %llu objects, "
+		    "%lu blocks so far\n",
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)BP_GET_FILL(bp),
+		    avl_numnodes(t));
+	}
+
+	/*
+	 * Only level-0 data blocks with a real checksum can dedup.
+	 * (Holes were already filtered out above, so that redundant
+	 * BP_IS_HOLE() test has been dropped here.)
+	 */
+	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
+	    BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
+		return (0);
+
+	ddt_key_fill(&zdde_search.zdde_key, bp);
+
+	zdde = avl_find(t, &zdde_search, &where);
+
+	if (zdde == NULL) {
+		zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
+		zdde->zdde_key = zdde_search.zdde_key;
+		avl_insert(t, zdde, where);
+	}
+
+	zdde->zdde_ref_blocks += 1;
+	zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
+	zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
+	zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
+
+	return (0);
+}
+
+/*
+ * Simulate what the DDT would contain if dedup were enabled for every
+ * block: traverse the whole pool accumulating per-unique-block refcounts
+ * into an AVL tree, then collapse the tree into a histogram and print
+ * the simulated DDT statistics and dedup ratio.
+ */
+static void
+dump_simulated_ddt(spa_t *spa)
+{
+	avl_tree_t t;
+	void *cookie = NULL;
+	zdb_ddt_entry_t *zdde;
+	ddt_histogram_t ddh_total;
+	ddt_stat_t dds_total;
+
+	bzero(&ddh_total, sizeof (ddh_total));
+	bzero(&dds_total, sizeof (dds_total));
+	avl_create(&t, ddt_entry_compare,
+	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
+
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
+	    TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t);
+
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
+		ddt_stat_t dds;
+		uint64_t refcnt = zdde->zdde_ref_blocks;
+		ASSERT(refcnt != 0);
+
+		/* Per-unique-block averages for this entry. */
+		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
+		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
+		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
+		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
+
+		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
+		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
+		dds.dds_ref_psize = zdde->zdde_ref_psize;
+		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
+
+		/* Bucket by floor(log2(refcnt)), as the real DDT does. */
+		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
+		    &dds, 0);
+
+		umem_free(zdde, sizeof (*zdde));
+	}
+
+	avl_destroy(&t);
+
+	ddt_histogram_stat(&dds_total, &ddh_total);
+
+	(void) printf("Simulated DDT histogram:\n");
+
+	zpool_dump_ddt(&dds_total, &ddh_total);
+
+	dump_dedup_ratio(&dds_total);
+}
+
+/*
+ * Cross-check the device_removal and obsolete_counts feature refcounts
+ * against the on-disk objects that should account for them: indirect
+ * vdevs, precise obsolete counts, obsolete space maps, condensing
+ * state, the obsolete bpobj, and remap deadlists.  Prints verification
+ * results and returns 0 on success, 1 on any mismatch.
+ */
+static int
+verify_device_removal_feature_counts(spa_t *spa)
+{
+	uint64_t dr_feature_refcount = 0;
+	uint64_t oc_feature_refcount = 0;
+	uint64_t indirect_vdev_count = 0;
+	uint64_t precise_vdev_count = 0;
+	uint64_t obsolete_counts_object_count = 0;
+	uint64_t obsolete_sm_count = 0;
+	uint64_t obsolete_counts_count = 0;
+	uint64_t scip_count = 0;
+	uint64_t obsolete_bpobj_count = 0;
+	int ret = 0;
+
+	spa_condensing_indirect_phys_t *scip =
+	    &spa->spa_condensing_indirect_phys;
+	if (scip->scip_next_mapping_object != 0) {
+		vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
+		ASSERT(scip->scip_prev_obsolete_sm_object != 0);
+		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+		(void) printf("Condensing indirect vdev %llu: new mapping "
+		    "object %llu, prev obsolete sm %llu\n",
+		    (u_longlong_t)scip->scip_vdev,
+		    (u_longlong_t)scip->scip_next_mapping_object,
+		    (u_longlong_t)scip->scip_prev_obsolete_sm_object);
+		if (scip->scip_prev_obsolete_sm_object != 0) {
+			space_map_t *prev_obsolete_sm = NULL;
+			VERIFY0(space_map_open(&prev_obsolete_sm,
+			    spa->spa_meta_objset,
+			    scip->scip_prev_obsolete_sm_object,
+			    0, vd->vdev_asize, 0));
+			dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
+			(void) printf("\n");
+			space_map_close(prev_obsolete_sm);
+		}
+
+		/* Condensing holds two obsolete_counts references. */
+		scip_count += 2;
+	}
+
+	for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+		if (vic->vic_mapping_object != 0) {
+			ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
+			    vd->vdev_removing);
+			indirect_vdev_count++;
+
+			if (vd->vdev_indirect_mapping->vim_havecounts) {
+				obsolete_counts_count++;
+			}
+		}
+
+		boolean_t are_precise;
+		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+		if (are_precise) {
+			ASSERT(vic->vic_mapping_object != 0);
+			precise_vdev_count++;
+		}
+
+		uint64_t obsolete_sm_object;
+		VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+		if (obsolete_sm_object != 0) {
+			ASSERT(vic->vic_mapping_object != 0);
+			obsolete_sm_count++;
+		}
+	}
+
+	(void) feature_get_refcount(spa,
+	    &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
+	    &dr_feature_refcount);
+	(void) feature_get_refcount(spa,
+	    &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
+	    &oc_feature_refcount);
+
+	if (dr_feature_refcount != indirect_vdev_count) {
+		ret = 1;
+		(void) printf("Number of indirect vdevs (%llu) " \
+		    "does not match feature count (%llu)\n",
+		    (u_longlong_t)indirect_vdev_count,
+		    (u_longlong_t)dr_feature_refcount);
+	} else {
+		(void) printf("Verified device_removal feature refcount " \
+		    "of %llu is correct\n",
+		    (u_longlong_t)dr_feature_refcount);
+	}
+
+	if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_OBSOLETE_BPOBJ) == 0) {
+		obsolete_bpobj_count++;
+	}
+
+
+	obsolete_counts_object_count = precise_vdev_count;
+	obsolete_counts_object_count += obsolete_sm_count;
+	obsolete_counts_object_count += obsolete_counts_count;
+	obsolete_counts_object_count += scip_count;
+	obsolete_counts_object_count += obsolete_bpobj_count;
+	obsolete_counts_object_count += remap_deadlist_count;
+
+	if (oc_feature_refcount != obsolete_counts_object_count) {
+		ret = 1;
+		(void) printf("Number of obsolete counts objects (%llu) " \
+		    "does not match feature count (%llu)\n",
+		    (u_longlong_t)obsolete_counts_object_count,
+		    (u_longlong_t)oc_feature_refcount);
+		(void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
+		    "ob:%llu rd:%llu\n",
+		    (u_longlong_t)precise_vdev_count,
+		    (u_longlong_t)obsolete_sm_count,
+		    (u_longlong_t)obsolete_counts_count,
+		    (u_longlong_t)scip_count,
+		    (u_longlong_t)obsolete_bpobj_count,
+		    (u_longlong_t)remap_deadlist_count);
+	} else {
+		(void) printf("Verified indirect_refcount feature refcount " \
+		    "of %llu is correct\n",
+		    (u_longlong_t)oc_feature_refcount);
+	}
+	return (ret);
+}
+
+/*
+ * If the named pool is present in the spa namespace, set
+ * ZFS_IMPORT_SKIP_MMP on it so that zdb can examine it without
+ * tripping the multihost (MMP) activity check.
+ */
+static void
+zdb_set_skip_mmp(char *target)
+{
+	spa_t *spa;
+
+	/*
+	 * Disable the activity check to allow examination of
+	 * active pools.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	if ((spa = spa_lookup(target)) != NULL) {
+		spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
+	}
+	mutex_exit(&spa_namespace_lock);
+}
+
+#define	BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
+/*
+ * Import the checkpointed state of the pool specified by the target
+ * parameter as readonly. The function also accepts a pool config
+ * as an optional parameter, else it attempts to infer the config by
+ * the name of the target pool.
+ *
+ * Note that the checkpointed state's pool name will be the name of
+ * the original pool with the above suffix appended to it. In addition,
+ * if the target is not a pool name (e.g. a path to a dataset) then
+ * the new_path parameter is populated with the updated path to
+ * reflect the fact that we are looking into the checkpointed state.
+ *
+ * The function returns a newly-allocated copy of the name of the
+ * pool containing the checkpointed state. When this copy is no
+ * longer needed it should be freed with free(3C). Same thing
+ * applies to the new_path parameter if allocated.
+ */
+static char *
+import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
+{
+	int error = 0;
+	char *poolname, *bogus_name = NULL;
+
+	/* If the target is not a pool, extract the pool name */
+	char *path_start = strchr(target, '/');
+	if (path_start != NULL) {
+		size_t poolname_len = path_start - target;
+		poolname = strndup(target, poolname_len);
+	} else {
+		poolname = target;
+	}
+
+	if (cfg == NULL) {
+		zdb_set_skip_mmp(poolname);
+		error = spa_get_stats(poolname, &cfg, NULL, 0);
+		if (error != 0) {
+			fatal("Tried to read config of pool \"%s\" but "
+			    "spa_get_stats() failed with error %d\n",
+			    poolname, error);
+		}
+	}
+
+	/*
+	 * NOTE(review): on asprintf() failure a strndup()'d poolname
+	 * leaks here; presumably acceptable since callers treat a NULL
+	 * return as fatal.
+	 */
+	if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1)
+		return (NULL);
+	fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
+
+	error = spa_import(bogus_name, cfg, NULL,
+	    ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
+	    ZFS_IMPORT_SKIP_MMP);
+	if (error != 0) {
+		fatal("Tried to import pool \"%s\" but spa_import() failed "
+		    "with error %d\n", bogus_name, error);
+	}
+
+	if (new_path != NULL && path_start != NULL) {
+		if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
+			/*
+			 * NOTE(review): path_start is known non-NULL in this
+			 * branch, so the check below is redundant; bogus_name
+			 * also leaks on this error path.
+			 */
+			if (path_start != NULL)
+				free(poolname);
+			return (NULL);
+		}
+	}
+
+	if (target != poolname)
+		free(poolname);
+
+	return (bogus_name);
+}
+
+typedef struct verify_checkpoint_sm_entry_cb_arg {
+	vdev_t *vcsec_vd;
+
+	/* the following fields are only used for printing progress */
+	uint64_t vcsec_entryid;
+	uint64_t vcsec_num_entries;
+} verify_checkpoint_sm_entry_cb_arg_t;
+
+#define	ENTRIES_PER_PROGRESS_UPDATE 10000
+
+/*
+ * space_map_iterate() callback: verify that one free segment of a
+ * vdev's checkpoint space map lies within its metaslab and is not
+ * present in that metaslab's ms_allocatable tree.
+ */
+static int
+verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
+{
+	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
+	vdev_t *vd = vcsec->vcsec_vd;
+	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+	uint64_t end = sme->sme_offset + sme->sme_run;
+
+	ASSERT(sme->sme_type == SM_FREE);
+
+	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
+		(void) fprintf(stderr,
+		    "\rverifying vdev %llu, space map entry %llu of %llu ...",
+		    (longlong_t)vd->vdev_id,
+		    (longlong_t)vcsec->vcsec_entryid,
+		    (longlong_t)vcsec->vcsec_num_entries);
+	}
+	vcsec->vcsec_entryid++;
+
+	/*
+	 * See comment in checkpoint_sm_exclude_entry_cb()
+	 */
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
+	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+	/*
+	 * The entries in the vdev_checkpoint_sm should be marked as
+	 * allocated in the checkpointed state of the pool, therefore
+	 * their respective ms_allocatable trees should not contain them.
+	 */
+	mutex_enter(&ms->ms_lock);
+	range_tree_verify_not_present(ms->ms_allocatable,
+	    sme->sme_offset, sme->sme_run);
+	mutex_exit(&ms->ms_lock);
+
+	return (0);
+}
+
+/*
+ * Verify that all segments in the vdev_checkpoint_sm are allocated
+ * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
+ * ms_allocatable).
+ *
+ * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
+ * each vdev in the current state of the pool to the metaslab space maps
+ * (ms_sm) of the checkpointed state of the pool.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of the current spa_t. The entries of these ms_allocatable
+ * trees are cleared out and then repopulated with the free
+ * entries of their respective ms_sm space maps.
+ */
+static void
+verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+	vdev_t *current_rvd = current->spa_root_vdev;
+
+	load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);
+
+	/* top-level vdevs of the two states are matched by child index */
+	for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
+		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
+		vdev_t *current_vd = current_rvd->vdev_child[c];
+
+		space_map_t *checkpoint_sm = NULL;
+		uint64_t checkpoint_sm_obj;
+
+		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+			/*
+			 * Since we don't allow device removal in a pool
+			 * that has a checkpoint, we expect that all removed
+			 * vdevs were removed from the pool before the
+			 * checkpoint.
+			 */
+			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+			continue;
+		}
+
+		/*
+		 * If the checkpoint space map doesn't exist, then nothing
+		 * here is checkpointed so there's nothing to verify.
+		 */
+		if (current_vd->vdev_top_zap == 0 ||
+		    zap_contains(spa_meta_objset(current),
+		    current_vd->vdev_top_zap,
+		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+			continue;
+
+		VERIFY0(zap_lookup(spa_meta_objset(current),
+		    current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
+		    checkpoint_sm_obj, 0, current_vd->vdev_asize,
+		    current_vd->vdev_ashift));
+
+		verify_checkpoint_sm_entry_cb_arg_t vcsec;
+		vcsec.vcsec_vd = ckpoint_vd;
+		vcsec.vcsec_entryid = 0;
+		vcsec.vcsec_num_entries =
+		    space_map_length(checkpoint_sm) / sizeof (uint64_t);
+		VERIFY0(space_map_iterate(checkpoint_sm,
+		    space_map_length(checkpoint_sm),
+		    verify_checkpoint_sm_entry_cb, &vcsec));
+		if (dump_opt['m'] > 3)
+			dump_spacemap(current->spa_meta_objset, checkpoint_sm);
+		space_map_close(checkpoint_sm);
+	}
+
+	/*
+	 * If we've added vdevs since we took the checkpoint, ensure
+	 * that their checkpoint space maps are empty.
+	 */
+	if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
+		for (uint64_t c = ckpoint_rvd->vdev_children;
+		    c < current_rvd->vdev_children; c++) {
+			vdev_t *current_vd = current_rvd->vdev_child[c];
+			ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL);
+		}
+	}
+
+	/* for cleaner progress output */
+	(void) fprintf(stderr, "\n");
+}
+
+/*
+ * Verifies that all space that's allocated in the checkpoint is
+ * still allocated in the current version, by checking that everything
+ * in checkpoint's ms_allocatable (which is actually allocated, not
+ * allocatable/free) is not present in current's ms_allocatable.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of both spas when called. The entries of all ms_allocatable
+ * trees are cleared out and then repopulated from their respective
+ * ms_sm space maps. In the checkpointed state we load the allocated
+ * entries, and in the current state we load the free entries.
+ */
+static void
+verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
+{
+	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
+	vdev_t *current_rvd = current->spa_root_vdev;
+
+	load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
+	load_concrete_ms_allocatable_trees(current, SM_FREE);
+
+	/* top-level vdevs and their metaslabs are matched by index */
+	for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
+		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
+		vdev_t *current_vd = current_rvd->vdev_child[i];
+
+		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
+			/*
+			 * See comment in verify_checkpoint_vdev_spacemaps()
+			 */
+			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
+			continue;
+		}
+
+		for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
+			metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
+			metaslab_t *current_msp = current_vd->vdev_ms[m];
+
+			(void) fprintf(stderr,
+			    "\rverifying vdev %llu of %llu, "
+			    "metaslab %llu of %llu ...",
+			    (longlong_t)current_vd->vdev_id,
+			    (longlong_t)current_rvd->vdev_children,
+			    (longlong_t)current_vd->vdev_ms[m]->ms_id,
+			    (longlong_t)current_vd->vdev_ms_count);
+
+			/*
+			 * We walk through the ms_allocatable trees that
+			 * are loaded with the allocated blocks from the
+			 * ms_sm spacemaps of the checkpoint. For each
+			 * one of these ranges we ensure that none of them
+			 * exists in the ms_allocatable trees of the
+			 * current state which are loaded with the ranges
+			 * that are currently free.
+			 *
+			 * This way we ensure that none of the blocks that
+			 * are part of the checkpoint were freed by mistake.
+			 */
+			range_tree_walk(ckpoint_msp->ms_allocatable,
+			    (range_tree_func_t *)range_tree_verify_not_present,
+			    current_msp->ms_allocatable);
+		}
+	}
+
+	/* for cleaner progress output */
+	(void) fprintf(stderr, "\n");
+}
+
+/*
+ * Import the pool's checkpointed state under a bogus name, verify its
+ * space accounting against the current state using
+ * verify_checkpoint_vdev_spacemaps() and verify_checkpoint_ms_spacemaps(),
+ * then release the imported checkpoint.
+ */
+static void
+verify_checkpoint_blocks(spa_t *spa)
+{
+	ASSERT(!dump_opt['L']);
+
+	spa_t *checkpoint_spa;
+	char *checkpoint_pool;
+	nvlist_t *config = NULL;
+	int error = 0;
+
+	/*
+	 * We import the checkpointed state of the pool (under a different
+	 * name) so we can do verification on it against the current state
+	 * of the pool.
+	 */
+	checkpoint_pool = import_checkpointed_state(spa->spa_name, config,
+	    NULL);
+	/*
+	 * NOTE(review): import_checkpointed_state() can return NULL on
+	 * asprintf() failure, which the strcmp() below would not survive.
+	 */
+	ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
+
+	error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
+	if (error != 0) {
+		fatal("Tried to open pool \"%s\" but spa_open() failed with "
+		    "error %d\n", checkpoint_pool, error);
+	}
+
+	/*
+	 * Ensure that ranges in the checkpoint space maps of each vdev
+	 * are allocated according to the checkpointed state's metaslab
+	 * space maps.
+	 */
+	verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
+
+	/*
+	 * Ensure that allocated ranges in the checkpoint's metaslab
+	 * space maps remain allocated in the metaslab space maps of
+	 * the current state.
+	 */
+	verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
+
+	/*
+	 * Once we are done, we get rid of the checkpointed state.
+	 */
+	spa_close(checkpoint_spa, FTAG);
+	free(checkpoint_pool);
+}
+
+/*
+ * Dump the checkpoint space maps that still exist on each top-level
+ * vdev.  Called when the checkpoint feature is active but the
+ * checkpointed uberblock is already gone (i.e. a partially discarded
+ * checkpoint).
+ */
+static void
+dump_leftover_checkpoint_blocks(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *vd = rvd->vdev_child[i];
+
+		space_map_t *checkpoint_sm = NULL;
+		uint64_t checkpoint_sm_obj;
+
+		/* skip vdevs with no top-level ZAP or no checkpoint sm */
+		if (vd->vdev_top_zap == 0)
+			continue;
+
+		if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+			continue;
+
+		VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+		    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+		dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
+		space_map_close(checkpoint_sm);
+	}
+}
+
+/*
+ * Verify the pool's checkpoint, if the pool_checkpoint feature is
+ * active: look up the checkpointed uberblock in the MOS, report a
+ * partially-discarded checkpoint, and (unless the 'L' option was
+ * given) cross-check the checkpointed state against the current one.
+ * Returns 0 when there is nothing to verify or all checks pass.
+ */
+static int
+verify_checkpoint(spa_t *spa)
+{
+	uberblock_t checkpoint;
+	int error;
+
+	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+		return (0);
+
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+	if (error == ENOENT && !dump_opt['L']) {
+		/*
+		 * If the feature is active but the uberblock is missing
+		 * then we must be in the middle of discarding the
+		 * checkpoint.
+		 */
+		(void) printf("\nPartially discarded checkpoint "
+		    "state found:\n");
+		if (dump_opt['m'] > 3)
+			dump_leftover_checkpoint_blocks(spa);
+		return (0);
+	} else if (error != 0) {
+		(void) printf("lookup error %d when looking for "
+		    "checkpointed uberblock in MOS\n", error);
+		return (error);
+	}
+	dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
+
+	if (checkpoint.ub_checkpoint_txg == 0) {
+		(void) printf("\nub_checkpoint_txg not set in checkpointed "
+		    "uberblock\n");
+		error = 3;
+	}
+
+	if (error == 0 && !dump_opt['L'])
+		verify_checkpoint_blocks(spa);
+
+	return (error);
+}
+
+/* ARGSUSED */
+/*
+ * range_tree_walk() callback for reporting MOS objects that were
+ * referenced but never found allocated.  The walk hands us each
+ * remaining segment as (start, size), so the objects covered are
+ * [start, start + size); the previous bound of "i < size" silently
+ * skipped the report whenever start >= size.
+ */
+static void
+mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
+{
+	for (uint64_t i = start; i < start + size; i++) {
+		(void) printf("MOS object %llu referenced but not allocated\n",
+		    (u_longlong_t)i);
+	}
+}
+
+/*
+ * Mark MOS object "obj" as referenced for leak detection; obj == 0
+ * (no object) is ignored, as are calls before mos_refd_objs exists.
+ */
+static void
+mos_obj_refd(uint64_t obj)
+{
+	if (obj != 0 && mos_refd_objs != NULL)
+		range_tree_add(mos_refd_objs, obj, 1);
+}
+
+/*
+ * Call on a MOS object that may already have been referenced.
+ * Unlike mos_obj_refd(), this checks membership first so the same
+ * object can safely be visited from more than one place.
+ */
+static void
+mos_obj_refd_multiple(uint64_t obj)
+{
+	if (obj != 0 && mos_refd_objs != NULL &&
+	    !range_tree_contains(mos_refd_objs, obj, 1))
+		range_tree_add(mos_refd_objs, obj, 1);
+}
+
+/*
+ * Mark the vdev's unflushed-metaslab-txgs object as referenced, if the
+ * vdev's top-level ZAP has one.  ENOENT simply means the entry does
+ * not exist, so there is nothing to mark.
+ */
+static void
+mos_leak_vdev_top_zap(vdev_t *vd)
+{
+	uint64_t ms_flush_data_obj;
+	int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
+	    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
+	    sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj);
+	if (error == ENOENT)
+		return;
+	ASSERT0(error);
+
+	mos_obj_refd(ms_flush_data_obj);
+}
+
+/*
+ * Recursively mark all MOS objects reachable from this vdev as
+ * referenced: DTL, metaslab array, indirect mapping/births objects,
+ * leaf ZAP, checkpoint and obsolete space maps, per-metaslab space
+ * maps, and the top-level ZAP (plus its log-spacemap entry).
+ */
+static void
+mos_leak_vdev(vdev_t *vd)
+{
+	mos_obj_refd(vd->vdev_dtl_object);
+	mos_obj_refd(vd->vdev_ms_array);
+	mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
+	mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
+	mos_obj_refd(vd->vdev_leaf_zap);
+	if (vd->vdev_checkpoint_sm != NULL)
+		mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
+	if (vd->vdev_indirect_mapping != NULL) {
+		mos_obj_refd(vd->vdev_indirect_mapping->
+		    vim_phys->vimp_counts_object);
+	}
+	if (vd->vdev_obsolete_sm != NULL)
+		mos_obj_refd(vd->vdev_obsolete_sm->sm_object);
+
+	for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+		metaslab_t *ms = vd->vdev_ms[m];
+		mos_obj_refd(space_map_object(ms->ms_sm));
+	}
+
+	if (vd->vdev_top_zap != 0) {
+		mos_obj_refd(vd->vdev_top_zap);
+		mos_leak_vdev_top_zap(vd);
+	}
+
+	/* recurse into children */
+	for (uint64_t c = 0; c < vd->vdev_children; c++) {
+		mos_leak_vdev(vd->vdev_child[c]);
+	}
+}
+
+/*
+ * Mark the log spacemap ZAP and every log space map object as
+ * referenced.  ENOENT on the pool-directory lookup means there is no
+ * log spacemap ZAP, so there is nothing to mark.
+ */
+static void
+mos_leak_log_spacemaps(spa_t *spa)
+{
+	uint64_t spacemap_zap;
+	int error = zap_lookup(spa_meta_objset(spa),
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP,
+	    sizeof (spacemap_zap), 1, &spacemap_zap);
+	if (error == ENOENT)
+		return;
+	ASSERT0(error);
+
+	mos_obj_refd(spacemap_zap);
+	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls))
+		mos_obj_refd(sls->sls_sm_obj);
+}
+
+/*
+ * MOS leak detection: mark every MOS object that the pool structures
+ * reference, then walk every allocated MOS object and cross-check the
+ * two sets.  Returns 0 when they match, 2 when an allocated-but-
+ * unreferenced ("leaked") or referenced-but-unallocated object is
+ * found.  Consumes and destroys mos_refd_objs.
+ */
+static int
+dump_mos_leaks(spa_t *spa)
+{
+	int rv = 0;
+	objset_t *mos = spa->spa_meta_objset;
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+
+	/* Visit and mark all referenced objects in the MOS */
+
+	mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
+	mos_obj_refd(spa->spa_pool_props_object);
+	mos_obj_refd(spa->spa_config_object);
+	mos_obj_refd(spa->spa_ddt_stat_object);
+	mos_obj_refd(spa->spa_feat_desc_obj);
+	mos_obj_refd(spa->spa_feat_enabled_txg_obj);
+	mos_obj_refd(spa->spa_feat_for_read_obj);
+	mos_obj_refd(spa->spa_feat_for_write_obj);
+	mos_obj_refd(spa->spa_history);
+	mos_obj_refd(spa->spa_errlog_last);
+	mos_obj_refd(spa->spa_errlog_scrub);
+	mos_obj_refd(spa->spa_all_vdev_zaps);
+	mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
+	mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
+	mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
+	bpobj_count_refd(&spa->spa_deferred_bpobj);
+	mos_obj_refd(dp->dp_empty_bpobj);
+	bpobj_count_refd(&dp->dp_obsolete_bpobj);
+	bpobj_count_refd(&dp->dp_free_bpobj);
+	mos_obj_refd(spa->spa_l2cache.sav_object);
+	mos_obj_refd(spa->spa_spares.sav_object);
+
+	if (spa->spa_syncing_log_sm != NULL)
+		mos_obj_refd(spa->spa_syncing_log_sm->sm_object);
+	mos_leak_log_spacemaps(spa);
+
+	/* indirect-vdev condensing state and its mapping counts object */
+	mos_obj_refd(spa->spa_condensing_indirect_phys.
+	    scip_next_mapping_object);
+	mos_obj_refd(spa->spa_condensing_indirect_phys.
+	    scip_prev_obsolete_sm_object);
+	if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
+		vdev_indirect_mapping_t *vim =
+		    vdev_indirect_mapping_open(mos,
+		    spa->spa_condensing_indirect_phys.scip_next_mapping_object);
+		mos_obj_refd(vim->vim_phys->vimp_counts_object);
+		vdev_indirect_mapping_close(vim);
+	}
+	deleted_livelists_dump_mos(spa);
+
+	if (dp->dp_origin_snap != NULL) {
+		dsl_dataset_t *ds;
+
+		dsl_pool_config_enter(dp, FTAG);
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
+		    FTAG, &ds));
+		count_ds_mos_objects(ds);
+		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
+		dsl_dataset_rele(ds, FTAG);
+		dsl_pool_config_exit(dp, FTAG);
+
+		count_ds_mos_objects(dp->dp_origin_snap);
+		dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");
+	}
+	count_dir_mos_objects(dp->dp_mos_dir);
+	if (dp->dp_free_dir != NULL)
+		count_dir_mos_objects(dp->dp_free_dir);
+	if (dp->dp_leak_dir != NULL)
+		count_dir_mos_objects(dp->dp_leak_dir);
+
+	mos_leak_vdev(spa->spa_root_vdev);
+
+	for (uint64_t class = 0; class < DDT_CLASSES; class++) {
+		for (uint64_t type = 0; type < DDT_TYPES; type++) {
+			for (uint64_t cksum = 0;
+			    cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
+				ddt_t *ddt = spa->spa_ddt[cksum];
+				mos_obj_refd(ddt->ddt_object[type][class]);
+			}
+		}
+	}
+
+	/*
+	 * Visit all allocated objects and make sure they are referenced.
+	 */
+	uint64_t object = 0;
+	while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
+		if (range_tree_contains(mos_refd_objs, object, 1)) {
+			range_tree_remove(mos_refd_objs, object, 1);
+		} else {
+			dmu_object_info_t doi;
+			const char *name;
+			dmu_object_info(mos, object, &doi);
+			if (doi.doi_type & DMU_OT_NEWTYPE) {
+				dmu_object_byteswap_t bswap =
+				    DMU_OT_BYTESWAP(doi.doi_type);
+				name = dmu_ot_byteswap[bswap].ob_name;
+			} else {
+				name = dmu_ot[doi.doi_type].ot_name;
+			}
+
+			(void) printf("MOS object %llu (%s) leaked\n",
+			    (u_longlong_t)object, name);
+			rv = 2;
+		}
+	}
+	/* anything left in mos_refd_objs was referenced but not allocated */
+	(void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
+	if (!range_tree_is_empty(mos_refd_objs))
+		rv = 2;
+	range_tree_vacate(mos_refd_objs, NULL, NULL);
+	range_tree_destroy(mos_refd_objs);
+	return (rv);
+}
+
+typedef struct log_sm_obsolete_stats_arg {
+	uint64_t lsos_current_txg;
+
+	uint64_t lsos_total_entries;
+	uint64_t lsos_valid_entries;
+
+	uint64_t lsos_sm_entries;
+	uint64_t lsos_valid_sm_entries;
+} log_sm_obsolete_stats_arg_t;
+
+/*
+ * iterate_through_spacemap_logs() callback: count, per log (txg) and
+ * in total, how many log space map entries are still valid.  Logs are
+ * visited in increasing txg order; when the txg changes we print the
+ * finished log's stats and reset the per-log counters.
+ */
+static int
+log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme,
+    uint64_t txg, void *arg)
+{
+	log_sm_obsolete_stats_arg_t *lsos = arg;
+
+	uint64_t offset = sme->sme_offset;
+	uint64_t vdev_id = sme->sme_vdev;
+
+	if (lsos->lsos_current_txg == 0) {
+		/* this is the first log */
+		lsos->lsos_current_txg = txg;
+	} else if (lsos->lsos_current_txg < txg) {
+		/* we just changed log - print stats and reset */
+		(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
+		    (u_longlong_t)lsos->lsos_valid_sm_entries,
+		    (u_longlong_t)lsos->lsos_sm_entries,
+		    (u_longlong_t)lsos->lsos_current_txg);
+		lsos->lsos_valid_sm_entries = 0;
+		lsos->lsos_sm_entries = 0;
+		lsos->lsos_current_txg = txg;
+	}
+	ASSERT3U(lsos->lsos_current_txg, ==, txg);
+
+	lsos->lsos_sm_entries++;
+	lsos->lsos_total_entries++;
+
+	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+	if (!vdev_is_concrete(vd))
+		return (0);
+
+	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
+
+	/* entries older than the metaslab's unflushed txg are obsolete */
+	if (txg < metaslab_unflushed_txg(ms))
+		return (0);
+	lsos->lsos_valid_sm_entries++;
+	lsos->lsos_valid_entries++;
+	return (0);
+}
+
+/*
+ * Print per-txg and total valid/obsolete entry statistics for the
+ * pool's log space maps.  No-op unless the log_spacemap feature is
+ * active.
+ */
+static void
+dump_log_spacemap_obsolete_stats(spa_t *spa)
+{
+	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+		return;
+
+	log_sm_obsolete_stats_arg_t lsos;
+	bzero(&lsos, sizeof (lsos));
+
+	(void) printf("Log Space Map Obsolete Entry Statistics:\n");
+
+	iterate_through_spacemap_logs(spa,
+	    log_spacemap_obsolete_stats_cb, &lsos);
+
+	/* print stats for latest log */
+	(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
+	    (u_longlong_t)lsos.lsos_valid_sm_entries,
+	    (u_longlong_t)lsos.lsos_sm_entries,
+	    (u_longlong_t)lsos.lsos_current_txg);
+
+	(void) printf("%-8llu valid entries out of %-8llu - total\n\n",
+	    (u_longlong_t)lsos.lsos_valid_entries,
+	    (u_longlong_t)lsos.lsos_total_entries);
+}
+
+/*
+ * Top-level per-pool dump/verify driver.  Each dump_opt[] flag set on
+ * the command line selects a section to print or a verification pass
+ * to run.  On the first non-zero verification status, the debug
+ * buffer is dumped and the process exits with that status.
+ */
+static void
+dump_zpool(spa_t *spa)
+{
+	dsl_pool_t *dp = spa_get_dsl(spa);
+	int rc = 0;
+
+	if (dump_opt['y']) {
+		livelist_metaslab_validate(spa);
+	}
+
+	if (dump_opt['S']) {
+		dump_simulated_ddt(spa);
+		return;
+	}
+
+	if (!dump_opt['e'] && dump_opt['C'] > 1) {
+		(void) printf("\nCached configuration:\n");
+		dump_nvlist(spa->spa_config, 8);
+	}
+
+	if (dump_opt['C'])
+		dump_config(spa);
+
+	if (dump_opt['u'])
+		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
+
+	if (dump_opt['D'])
+		dump_all_ddts(spa);
+
+	if (dump_opt['d'] > 2 || dump_opt['m'])
+		dump_metaslabs(spa);
+	if (dump_opt['M'])
+		dump_metaslab_groups(spa);
+	if (dump_opt['d'] > 2 || dump_opt['m']) {
+		dump_log_spacemaps(spa);
+		dump_log_spacemap_obsolete_stats(spa);
+	}
+
+	if (dump_opt['d'] || dump_opt['i']) {
+		spa_feature_t f;
+		mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
+		    0);
+		dump_objset(dp->dp_meta_objset);
+
+		if (dump_opt['d'] >= 3) {
+			/* NOTE(review): this 'dp' shadows the outer one */
+			dsl_pool_t *dp = spa->spa_dsl_pool;
+			dump_full_bpobj(&spa->spa_deferred_bpobj,
+			    "Deferred frees", 0);
+			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+				dump_full_bpobj(&dp->dp_free_bpobj,
+				    "Pool snapshot frees", 0);
+			}
+			if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+				ASSERT(spa_feature_is_enabled(spa,
+				    SPA_FEATURE_DEVICE_REMOVAL));
+				dump_full_bpobj(&dp->dp_obsolete_bpobj,
+				    "Pool obsolete blocks", 0);
+			}
+
+			if (spa_feature_is_active(spa,
+			    SPA_FEATURE_ASYNC_DESTROY)) {
+				dump_bptree(spa->spa_meta_objset,
+				    dp->dp_bptree_obj,
+				    "Pool dataset frees");
+			}
+			dump_dtl(spa->spa_root_vdev, 0);
+		}
+
+		/*
+		 * UINT64_MAX marks features we do not count; per-dataset
+		 * counters start at 0 and are filled by dump_one_objset().
+		 */
+		for (spa_feature_t f = 0; f < SPA_FEATURES; f++)
+			global_feature_count[f] = UINT64_MAX;
+		global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
+		global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
+		global_feature_count[SPA_FEATURE_LIVELIST] = 0;
+
+		(void) dmu_objset_find(spa_name(spa), dump_one_objset,
+		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+
+		if (rc == 0 && !dump_opt['L'])
+			rc = dump_mos_leaks(spa);
+
+		/* cross-check counted feature consumers vs. on-disk refcounts */
+		for (f = 0; f < SPA_FEATURES; f++) {
+			uint64_t refcount;
+
+			uint64_t *arr;
+			if (!(spa_feature_table[f].fi_flags &
+			    ZFEATURE_FLAG_PER_DATASET)) {
+				if (global_feature_count[f] == UINT64_MAX)
+					continue;
+				if (!spa_feature_is_enabled(spa, f)) {
+					ASSERT0(global_feature_count[f]);
+					continue;
+				}
+				arr = global_feature_count;
+			} else {
+				if (!spa_feature_is_enabled(spa, f)) {
+					ASSERT0(dataset_feature_count[f]);
+					continue;
+				}
+				arr = dataset_feature_count;
+			}
+			if (feature_get_refcount(spa, &spa_feature_table[f],
+			    &refcount) == ENOTSUP)
+				continue;
+			if (arr[f] != refcount) {
+				(void) printf("%s feature refcount mismatch: "
+				    "%lld consumers != %lld refcount\n",
+				    spa_feature_table[f].fi_uname,
+				    (longlong_t)arr[f], (longlong_t)refcount);
+				rc = 2;
+			} else {
+				(void) printf("Verified %s feature refcount "
+				    "of %llu is correct\n",
+				    spa_feature_table[f].fi_uname,
+				    (longlong_t)refcount);
+			}
+		}
+
+		if (rc == 0)
+			rc = verify_device_removal_feature_counts(spa);
+	}
+
+	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
+		rc = dump_block_stats(spa);
+
+	if (rc == 0)
+		rc = verify_spacemap_refcounts(spa);
+
+	if (dump_opt['s'])
+		show_pool_stats(spa);
+
+	if (dump_opt['h'])
+		dump_history(spa);
+
+	if (rc == 0)
+		rc = verify_checkpoint(spa);
+
+	if (rc != 0) {
+		dump_debug_buffer();
+		exit(rc);
+	}
+}
+
+#define ZDB_FLAG_CHECKSUM 0x0001
+#define ZDB_FLAG_DECOMPRESS 0x0002
+#define ZDB_FLAG_BSWAP 0x0004
+#define ZDB_FLAG_GBH 0x0008
+#define ZDB_FLAG_INDIRECT 0x0010
+#define ZDB_FLAG_RAW 0x0020
+#define ZDB_FLAG_PRINT_BLKPTR 0x0040
+#define ZDB_FLAG_VERBOSE 0x0080
+
+static int flagbits[256];
+static char flagbitstr[16];
+
+/*
+ * Print one block pointer.  With ZDB_FLAG_BSWAP the blkptr is
+ * byteswapped in place first (note the const is cast away to do so).
+ */
+static void
+zdb_print_blkptr(const blkptr_t *bp, int flags)
+{
+	char blkbuf[BP_SPRINTF_LEN];
+
+	if (flags & ZDB_FLAG_BSWAP)
+		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
+
+	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+	(void) printf("%s\n", blkbuf);
+}
+
+/* Print an array of nbps block pointers (an indirect block). */
+static void
+zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
+{
+	int i;
+
+	for (i = 0; i < nbps; i++)
+		zdb_print_blkptr(&bp[i], flags);
+}
+
+/* Print the block pointers of a gang block header. */
+static void
+zdb_dump_gbh(void *buf, int flags)
+{
+	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
+}
+
+/*
+ * Write the buffer verbatim to stdout, optionally byteswapping it
+ * first; a short write trips the VERIFY.
+ */
+static void
+zdb_dump_block_raw(void *buf, uint64_t size, int flags)
+{
+	if (flags & ZDB_FLAG_BSWAP)
+		byteswap_uint64_array(buf, size);
+	VERIFY(write(fileno(stdout), buf, size) == size);
+}
+
+/*
+ * Hex-dump a buffer under the given label: two 64-bit words per line
+ * followed by their ASCII rendering.
+ * NOTE(review): each iteration reads d[i + 1], so size is assumed to
+ * be a multiple of 16 bytes (callers enforce sector alignment).
+ */
+static void
+zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
+{
+	uint64_t *d = (uint64_t *)buf;
+	unsigned nwords = size / sizeof (uint64_t);
+	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
+	unsigned i, j;
+	const char *hdr;
+	char *c;
+
+
+	if (do_bswap)
+		hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8";
+	else
+		hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f";
+
+	(void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr);
+
+#ifdef _LITTLE_ENDIAN
+	/* correct the endianness */
+	do_bswap = !do_bswap;
+#endif
+	for (i = 0; i < nwords; i += 2) {
+		(void) printf("%06llx: %016llx %016llx ",
+		    (u_longlong_t)(i * sizeof (uint64_t)),
+		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
+		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
+
+		c = (char *)&d[i];
+		for (j = 0; j < 2 * sizeof (uint64_t); j++)
+			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
+		(void) printf("\n");
+	}
+}
+
+/*
+ * There are two acceptable formats:
+ *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
+ *	child[.child]*	  - For example: 0.1.1
+ *
+ * The second form can be used to specify arbitrary vdevs anywhere
+ * in the hierarchy. For example, in a pool with a mirror of
+ * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
+ *
+ * Returns NULL when no vdev matches.
+ */
+static vdev_t *
+zdb_vdev_lookup(vdev_t *vdev, const char *path)
+{
+	char *s, *p, *q;
+	unsigned i;
+
+	if (vdev == NULL)
+		return (NULL);
+
+	/* First, assume the x.x.x.x format */
+	i = strtoul(path, &s, 10);
+	if (s == path || (s && *s != '.' && *s != '\0'))
+		goto name;
+	if (i >= vdev->vdev_children)
+		return (NULL);
+
+	vdev = vdev->vdev_child[i];
+	if (s && *s == '\0')
+		return (vdev);
+	/* recurse on the remainder past the '.' */
+	return (zdb_vdev_lookup(vdev, s+1));
+
+name:
+	for (i = 0; i < vdev->vdev_children; i++) {
+		vdev_t *vc = vdev->vdev_child[i];
+
+		if (vc->vdev_path == NULL) {
+			vc = zdb_vdev_lookup(vc, path);
+			if (vc == NULL)
+				continue;
+			else
+				return (vc);
+		}
+
+		p = strrchr(vc->vdev_path, '/');
+		p = p ? p + 1 : vc->vdev_path;
+		/*
+		 * NOTE(review): q points 2 bytes before the terminator;
+		 * for a vdev_path shorter than 2 characters this would
+		 * point before the string — presumably paths are always
+		 * longer in practice.
+		 */
+		q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
+
+		/* match full path, basename, or basename minus "s0" slice */
+		if (strcmp(vc->vdev_path, path) == 0)
+			return (vc);
+		if (strcmp(p, path) == 0)
+			return (vc);
+		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
+			return (vc);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Resolve an objset (dataset) id to its dataset name.  outstr must be
+ * large enough for a maximum-length dataset name.  Returns 0 on
+ * success, else the dsl_dataset_hold_obj() error (already reported to
+ * stderr).
+ * NOTE(review): the hold and rele both use a NULL tag rather than the
+ * conventional FTAG; they are at least consistent with each other.
+ */
+static int
+name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr)
+{
+	dsl_dataset_t *ds;
+
+	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
+	int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id,
+	    NULL, &ds);
+	if (error != 0) {
+		(void) fprintf(stderr, "failed to hold objset %llu: %s\n",
+		    (u_longlong_t)objset_id, strerror(error));
+		dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+		return (error);
+	}
+	dsl_dataset_name(ds, outstr);
+	dsl_dataset_rele(ds, NULL);
+	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+	return (0);
+}
+
+/*
+ * Parse a "lsize[/psize]" specifier (both hex); psize defaults to
+ * lsize when the '/' part is absent.  Mutates "sizes" via strtok().
+ * Returns B_TRUE only when the parse succeeds and lsize >= psize > 0.
+ */
+static boolean_t
+zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize)
+{
+	char *s0, *s1;
+
+	if (sizes == NULL)
+		return (B_FALSE);
+
+	s0 = strtok(sizes, "/");
+	if (s0 == NULL)
+		return (B_FALSE);
+	s1 = strtok(NULL, "/");
+	*lsize = strtoull(s0, NULL, 16);
+	*psize = s1 ? strtoull(s1, NULL, 16) : *lsize;
+	return (*lsize >= *psize && *psize > 0);
+}
+
+#define ZIO_COMPRESS_MASK(alg) (1ULL << (ZIO_COMPRESS_##alg))
+
+/*
+ * Try to decompress the physical block in pabd at every plausible
+ * (algorithm, lsize) combination until one round-trips consistently
+ * into both lbuf and lbuf2.  Returns B_TRUE when the lsize search
+ * range was exhausted without a successful decompression.
+ */
+static boolean_t
+zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
+    uint64_t psize, int flags)
+{
+	boolean_t exceeded = B_FALSE;
+	/*
+	 * We don't know how the data was compressed, so just try
+	 * every decompress function at every inflated blocksize.
+	 */
+	void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+	int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 };
+	int *cfuncp = cfuncs;
+	uint64_t maxlsize = SPA_MAXBLOCKSIZE;
+	uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |
+	    ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |
+	    (getenv("ZDB_NO_ZLE") ? ZIO_COMPRESS_MASK(ZLE) : 0);
+	/* try LZ4 and LZJB first, then every algorithm not masked out */
+	*cfuncp++ = ZIO_COMPRESS_LZ4;
+	*cfuncp++ = ZIO_COMPRESS_LZJB;
+	mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);
+	for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++)
+		if (((1ULL << c) & mask) == 0)
+			*cfuncp++ = c;
+
+	/*
+	 * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this
+	 * could take a while and we should let the user know
+	 * we are not stuck. On the other hand, printing progress
+	 * info gets old after a while. User can specify 'v' flag
+	 * to see the progression.
+	 */
+	if (lsize == psize)
+		lsize += SPA_MINBLOCKSIZE;
+	else
+		maxlsize = lsize;
+	for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) {
+		for (cfuncp = cfuncs; *cfuncp; cfuncp++) {
+			if (flags & ZDB_FLAG_VERBOSE) {
+				(void) fprintf(stderr,
+				    "Trying %05llx -> %05llx (%s)\n",
+				    (u_longlong_t)psize,
+				    (u_longlong_t)lsize,
+				    zio_compress_table[*cfuncp].\
+				    ci_name);
+			}
+
+			/*
+			 * We randomize lbuf2, and decompress to both
+			 * lbuf and lbuf2. This way, we will know if
+			 * decompression filled exactly to lsize.
+			 */
+			VERIFY0(random_get_pseudo_bytes(lbuf2, lsize));
+
+			if (zio_decompress_data(*cfuncp, pabd,
+			    lbuf, psize, lsize, NULL) == 0 &&
+			    zio_decompress_data(*cfuncp, pabd,
+			    lbuf2, psize, lsize, NULL) == 0 &&
+			    bcmp(lbuf, lbuf2, lsize) == 0)
+				break;
+		}
+		if (*cfuncp != 0)
+			break;
+	}
+	umem_free(lbuf2, SPA_MAXBLOCKSIZE);
+
+	if (lsize > maxlsize) {
+		exceeded = B_TRUE;
+	}
+	/*
+	 * NOTE(review): this assigns only the local parameter; the
+	 * caller's pointer is unchanged (presumably callers read the
+	 * decompressed data from lbuf directly).
+	 */
+	buf = lbuf;
+	if (*cfuncp == ZIO_COMPRESS_ZLE) {
+		printf("\nZLE decompression was selected. If you "
+		    "suspect the results are wrong,\ntry avoiding ZLE "
+		    "by setting and exporting ZDB_NO_ZLE=\"true\"\n");
+	}
+
+	return (exceeded);
+}
+
+/*
+ * Read a block from a pool and print it out. The syntax of the
+ * block descriptor is:
+ *
+ * pool:vdev_specifier:offset:[lsize/]psize[:flags]
+ *
+ * pool - The name of the pool you wish to read from
+ * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
+ * offset - offset, in hex, in bytes
+ * size - Amount of data to read, in hex, in bytes
+ * flags - A string of characters specifying options
+ * b: Decode a blkptr at given offset within block
+ * c: Calculate and display checksums
+ * d: Decompress data before dumping
+ * e: Byteswap data before dumping
+ * g: Display data as a gang block header
+ * i: Display as an indirect block
+ * r: Dump raw data to stdout
+ * v: Verbose
+ *
+ */
+static void
+zdb_read_block(char *thing, spa_t *spa)
+{
+ blkptr_t blk, *bp = &blk;
+ dva_t *dva = bp->blk_dva;
+ int flags = 0;
+ uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0;
+ zio_t *zio;
+ vdev_t *vd;
+ abd_t *pabd;
+ void *lbuf, *buf;
+ char *s, *p, *dup, *vdev, *flagstr, *sizes;
+ int i, error;
+ boolean_t borrowed = B_FALSE, found = B_FALSE;
+
+ dup = strdup(thing);
+ s = strtok(dup, ":");
+ vdev = s ? s : "";
+ s = strtok(NULL, ":");
+ offset = strtoull(s ? s : "", NULL, 16);
+ sizes = strtok(NULL, ":");
+ s = strtok(NULL, ":");
+ flagstr = strdup(s ? s : "");
+
+ s = NULL;
+ if (!zdb_parse_block_sizes(sizes, &lsize, &psize))
+ s = "invalid size(s)";
+ if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE))
+ s = "size must be a multiple of sector size";
+ if (!IS_P2ALIGNED(offset, DEV_BSIZE))
+ s = "offset must be a multiple of sector size";
+ if (s) {
+ (void) printf("Invalid block specifier: %s - %s\n", thing, s);
+ goto done;
+ }
+
+ for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
+ for (i = 0; i < strlen(flagstr); i++) {
+ int bit = flagbits[(uchar_t)flagstr[i]];
+
+ if (bit == 0) {
+ (void) printf("***Ignoring flag: %c\n",
+ (uchar_t)flagstr[i]);
+ continue;
+ }
+ found = B_TRUE;
+ flags |= bit;
+
+ p = &flagstr[i + 1];
+ if (*p != ':' && *p != '\0') {
+ int j = 0, nextbit = flagbits[(uchar_t)*p];
+ char *end, offstr[8] = { 0 };
+ if ((bit == ZDB_FLAG_PRINT_BLKPTR) &&
+ (nextbit == 0)) {
+ /* look ahead to isolate the offset */
+ while (nextbit == 0 &&
+ strchr(flagbitstr, *p) == NULL) {
+ offstr[j] = *p;
+ j++;
+ if (i + j > strlen(flagstr))
+ break;
+ p++;
+ nextbit = flagbits[(uchar_t)*p];
+ }
+ blkptr_offset = strtoull(offstr, &end,
+ 16);
+ i += j;
+ } else if (nextbit == 0) {
+ (void) printf("***Ignoring flag arg:"
+ " '%c'\n", (uchar_t)*p);
+ }
+ }
+ }
+ }
+ if (blkptr_offset % sizeof (blkptr_t)) {
+ printf("Block pointer offset 0x%llx "
+ "must be divisible by 0x%x\n",
+ (longlong_t)blkptr_offset, (int)sizeof (blkptr_t));
+ goto done;
+ }
+ if (found == B_FALSE && strlen(flagstr) > 0) {
+ printf("Invalid flag arg: '%s'\n", flagstr);
+ goto done;
+ }
+
+ vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
+ if (vd == NULL) {
+ (void) printf("***Invalid vdev: %s\n", vdev);
+ free(dup);
+ return;
+ } else {
+ if (vd->vdev_path)
+ (void) fprintf(stderr, "Found vdev: %s\n",
+ vd->vdev_path);
+ else
+ (void) fprintf(stderr, "Found vdev type: %s\n",
+ vd->vdev_ops->vdev_op_type);
+ }
+
+ pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
+ lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
+ BP_ZERO(bp);
+
+ DVA_SET_VDEV(&dva[0], vd->vdev_id);
+ DVA_SET_OFFSET(&dva[0], offset);
+ DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
+ DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
+
+ BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_PSIZE(bp, psize);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(bp, DMU_OT_NONE);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ zio = zio_root(spa, NULL, NULL, 0);
+
+ if (vd == vd->vdev_top) {
+ /*
+ * Treat this as a normal block read.
+ */
+ zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
+ } else {
+ /*
+ * Treat this as a vdev child I/O.
+ */
+ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
+ psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
+ ZIO_FLAG_OPTIONAL, NULL, NULL));
+ }
+
+ error = zio_wait(zio);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ if (error) {
+ (void) printf("Read of %s failed, error: %d\n", thing, error);
+ goto out;
+ }
+
+ uint64_t orig_lsize = lsize;
+ buf = lbuf;
+ if (flags & ZDB_FLAG_DECOMPRESS) {
+ boolean_t failed = zdb_decompress_block(pabd, buf, lbuf,
+ lsize, psize, flags);
+ if (failed) {
+ (void) printf("Decompress of %s failed\n", thing);
+ goto out;
+ }
+ } else {
+ buf = abd_borrow_buf_copy(pabd, lsize);
+ borrowed = B_TRUE;
+ }
+ /*
+ * Try to detect invalid block pointer. If invalid, try
+ * decompressing.
+ */
+ if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) &&
+ !(flags & ZDB_FLAG_DECOMPRESS)) {
+ const blkptr_t *b = (const blkptr_t *)(void *)
+ ((uintptr_t)buf + (uintptr_t)blkptr_offset);
+ if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) ==
+ B_FALSE) {
+ abd_return_buf_copy(pabd, buf, lsize);
+ borrowed = B_FALSE;
+ buf = lbuf;
+ boolean_t failed = zdb_decompress_block(pabd, buf,
+ lbuf, lsize, psize, flags);
+ b = (const blkptr_t *)(void *)
+ ((uintptr_t)buf + (uintptr_t)blkptr_offset);
+ if (failed || zfs_blkptr_verify(spa, b, B_FALSE,
+ BLK_VERIFY_LOG) == B_FALSE) {
+ printf("invalid block pointer at this DVA\n");
+ goto out;
+ }
+ }
+ }
+
+ if (flags & ZDB_FLAG_PRINT_BLKPTR)
+ zdb_print_blkptr((blkptr_t *)(void *)
+ ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
+ else if (flags & ZDB_FLAG_RAW)
+ zdb_dump_block_raw(buf, lsize, flags);
+ else if (flags & ZDB_FLAG_INDIRECT)
+ zdb_dump_indirect((blkptr_t *)buf,
+ orig_lsize / sizeof (blkptr_t), flags);
+ else if (flags & ZDB_FLAG_GBH)
+ zdb_dump_gbh(buf, flags);
+ else
+ zdb_dump_block(thing, buf, lsize, flags);
+
+ /*
+ * If :c was specified, iterate through the checksum table to
+ * calculate and display each checksum for our specified
+ * DVA and length.
+ */
+ if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) &&
+ !(flags & ZDB_FLAG_GBH)) {
+ zio_t *czio;
+ (void) printf("\n");
+ for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL;
+ ck < ZIO_CHECKSUM_FUNCTIONS; ck++) {
+
+ if ((zio_checksum_table[ck].ci_flags &
+ ZCHECKSUM_FLAG_EMBEDDED) ||
+ ck == ZIO_CHECKSUM_NOPARITY) {
+ continue;
+ }
+ BP_SET_CHECKSUM(bp, ck);
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ czio->io_bp = bp;
+
+ if (vd == vd->vdev_top) {
+ zio_nowait(zio_read(czio, spa, bp, pabd, psize,
+ NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
+ ZIO_FLAG_DONT_RETRY, NULL));
+ } else {
+ zio_nowait(zio_vdev_child_io(czio, bp, vd,
+ offset, pabd, psize, ZIO_TYPE_READ,
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY |
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
+ ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_OPTIONAL, NULL, NULL));
+ }
+ error = zio_wait(czio);
+ if (error == 0 || error == ECKSUM) {
+ zio_t *ck_zio = zio_root(spa, NULL, NULL, 0);
+ ck_zio->io_offset =
+ DVA_GET_OFFSET(&bp->blk_dva[0]);
+ ck_zio->io_bp = bp;
+ zio_checksum_compute(ck_zio, ck, pabd, lsize);
+ printf("%12s\tcksum=%llx:%llx:%llx:%llx\n",
+ zio_checksum_table[ck].ci_name,
+ (u_longlong_t)bp->blk_cksum.zc_word[0],
+ (u_longlong_t)bp->blk_cksum.zc_word[1],
+ (u_longlong_t)bp->blk_cksum.zc_word[2],
+ (u_longlong_t)bp->blk_cksum.zc_word[3]);
+ zio_wait(ck_zio);
+ } else {
+ printf("error %d reading block\n", error);
+ }
+ spa_config_exit(spa, SCL_STATE, FTAG);
+ }
+ }
+
+ if (borrowed)
+ abd_return_buf_copy(pabd, buf, lsize);
+
+out:
+ abd_free(pabd);
+ umem_free(lbuf, SPA_MAXBLOCKSIZE);
+done:
+ free(flagstr);
+ free(dup);
+}
+
+static void
+zdb_embedded_block(char *thing)
+{
+ blkptr_t bp;
+ unsigned long long *words = (void *)&bp;
+ char *buf;
+ int err;
+
+ bzero(&bp, sizeof (bp));
+ err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
+ "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
+ words + 0, words + 1, words + 2, words + 3,
+ words + 4, words + 5, words + 6, words + 7,
+ words + 8, words + 9, words + 10, words + 11,
+ words + 12, words + 13, words + 14, words + 15);
+ if (err != 16) {
+ (void) fprintf(stderr, "invalid input format\n");
+ exit(1);
+ }
+ ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
+ buf = malloc(SPA_MAXBLOCKSIZE);
+ if (buf == NULL) {
+ (void) fprintf(stderr, "out of memory\n");
+ exit(1);
+ }
+ err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
+ if (err != 0) {
+ (void) fprintf(stderr, "decode failed: %u\n", err);
+ exit(1);
+ }
+ zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
+ free(buf);
+}
+
+int
+main(int argc, char **argv)
+{
+ int c;
+ struct rlimit rl = { 1024, 1024 };
+ spa_t *spa = NULL;
+ objset_t *os = NULL;
+ int dump_all = 1;
+ int verbose = 0;
+ int error = 0;
+ char **searchdirs = NULL;
+ int nsearch = 0;
+ char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];
+ nvlist_t *policy = NULL;
+ uint64_t max_txg = UINT64_MAX;
+ int64_t objset_id = -1;
+ int flags = ZFS_IMPORT_MISSING_LOG;
+ int rewind = ZPOOL_NEVER_REWIND;
+ char *spa_config_path_env, *objset_str;
+ boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
+ nvlist_t *cfg = NULL;
+
+ (void) setrlimit(RLIMIT_NOFILE, &rl);
+ (void) enable_extended_FILE_stdio(-1, -1);
+
+ dprintf_setup(&argc, argv);
+
+ /*
+ * If there is an environment variable SPA_CONFIG_PATH it overrides
+ * default spa_config_path setting. If -U flag is specified it will
+ * override this environment variable settings once again.
+ */
+ spa_config_path_env = getenv("SPA_CONFIG_PATH");
+ if (spa_config_path_env != NULL)
+ spa_config_path = spa_config_path_env;
+
+ /*
+ * For performance reasons, we set this tunable down. We do so before
+ * the arg parsing section so that the user can override this value if
+ * they choose.
+ */
+ zfs_btree_verify_intensity = 3;
+
+ while ((c = getopt(argc, argv,
+ "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XYyZ")) != -1) {
+ switch (c) {
+ case 'b':
+ case 'c':
+ case 'C':
+ case 'd':
+ case 'D':
+ case 'E':
+ case 'G':
+ case 'h':
+ case 'i':
+ case 'l':
+ case 'm':
+ case 'M':
+ case 'O':
+ case 'R':
+ case 's':
+ case 'S':
+ case 'u':
+ case 'y':
+ case 'Z':
+ dump_opt[c]++;
+ dump_all = 0;
+ break;
+ case 'A':
+ case 'e':
+ case 'F':
+ case 'k':
+ case 'L':
+ case 'P':
+ case 'q':
+ case 'X':
+ dump_opt[c]++;
+ break;
+ case 'Y':
+ zfs_reconstruct_indirect_combinations_max = INT_MAX;
+ zfs_deadman_enabled = 0;
+ break;
+ /* NB: Sort single match options below. */
+ case 'I':
+ max_inflight_bytes = strtoull(optarg, NULL, 0);
+ if (max_inflight_bytes == 0) {
+ (void) fprintf(stderr, "maximum number "
+ "of inflight bytes must be greater "
+ "than 0\n");
+ usage();
+ }
+ break;
+ case 'o':
+ error = set_global_var(optarg);
+ if (error != 0)
+ usage();
+ break;
+ case 'p':
+ if (searchdirs == NULL) {
+ searchdirs = umem_alloc(sizeof (char *),
+ UMEM_NOFAIL);
+ } else {
+ char **tmp = umem_alloc((nsearch + 1) *
+ sizeof (char *), UMEM_NOFAIL);
+ bcopy(searchdirs, tmp, nsearch *
+ sizeof (char *));
+ umem_free(searchdirs,
+ nsearch * sizeof (char *));
+ searchdirs = tmp;
+ }
+ searchdirs[nsearch++] = optarg;
+ break;
+ case 't':
+ max_txg = strtoull(optarg, NULL, 0);
+ if (max_txg < TXG_INITIAL) {
+ (void) fprintf(stderr, "incorrect txg "
+ "specified: %s\n", optarg);
+ usage();
+ }
+ break;
+ case 'U':
+ spa_config_path = optarg;
+ if (spa_config_path[0] != '/') {
+ (void) fprintf(stderr,
+ "cachefile must be an absolute path "
+ "(i.e. start with a slash)\n");
+ usage();
+ }
+ break;
+ case 'v':
+ verbose++;
+ break;
+ case 'V':
+ flags = ZFS_IMPORT_VERBATIM;
+ break;
+ case 'x':
+ vn_dumpdir = optarg;
+ break;
+ default:
+ usage();
+ break;
+ }
+ }
+
+ if (!dump_opt['e'] && searchdirs != NULL) {
+ (void) fprintf(stderr, "-p option requires use of -e\n");
+ usage();
+ }
+ if (dump_opt['d']) {
+ /* <pool>[/<dataset | objset id> is accepted */
+ if (argv[2] && (objset_str = strchr(argv[2], '/')) != NULL &&
+ objset_str++ != NULL) {
+ char *endptr;
+ errno = 0;
+ objset_id = strtoull(objset_str, &endptr, 0);
+ /* dataset 0 is the same as opening the pool */
+ if (errno == 0 && endptr != objset_str &&
+ objset_id != 0) {
+ target_is_spa = B_FALSE;
+ dataset_lookup = B_TRUE;
+ } else if (objset_id != 0) {
+ printf("failed to open objset %s "
+ "%llu %s", objset_str,
+ (u_longlong_t)objset_id,
+ strerror(errno));
+ exit(1);
+ }
+ /* normal dataset name not an objset ID */
+ if (endptr == objset_str) {
+ objset_id = -1;
+ }
+ }
+ }
+
+#if defined(_LP64)
+ /*
+ * ZDB does not typically re-read blocks; therefore limit the ARC
+ * to 256 MB, which can be used entirely for metadata.
+ */
+ zfs_arc_min = zfs_arc_meta_min = 2ULL << SPA_MAXBLOCKSHIFT;
+ zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
+#endif
+
+ /*
+ * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
+ * "zdb -b" uses traversal prefetch which uses async reads.
+ * For good performance, let several of them be active at once.
+ */
+ zfs_vdev_async_read_max_active = 10;
+
+ /*
+ * Disable reference tracking for better performance.
+ */
+ reference_tracking_enable = B_FALSE;
+
+ /*
+ * Do not fail spa_load when spa_load_verify fails. This is needed
+ * to load non-idle pools.
+ */
+ spa_load_verify_dryrun = B_TRUE;
+
+ kernel_init(SPA_MODE_READ);
+
+ if (dump_all)
+ verbose = MAX(verbose, 1);
+
+ for (c = 0; c < 256; c++) {
+ if (dump_all && strchr("AeEFklLOPRSXy", c) == NULL)
+ dump_opt[c] = 1;
+ if (dump_opt[c])
+ dump_opt[c] += verbose;
+ }
+
+ aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
+ zfs_recover = (dump_opt['A'] > 1);
+
+ argc -= optind;
+ argv += optind;
+ if (argc < 2 && dump_opt['R'])
+ usage();
+
+ if (dump_opt['E']) {
+ if (argc != 1)
+ usage();
+ zdb_embedded_block(argv[0]);
+ return (0);
+ }
+
+ if (argc < 1) {
+ if (!dump_opt['e'] && dump_opt['C']) {
+ dump_cachefile(spa_config_path);
+ return (0);
+ }
+ usage();
+ }
+
+ if (dump_opt['l'])
+ return (dump_label(argv[0]));
+
+ if (dump_opt['O']) {
+ if (argc != 2)
+ usage();
+ dump_opt['v'] = verbose + 3;
+ return (dump_path(argv[0], argv[1]));
+ }
+
+ if (dump_opt['X'] || dump_opt['F'])
+ rewind = ZPOOL_DO_REWIND |
+ (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
+
+ if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
+ nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
+ nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
+ fatal("internal error: %s", strerror(ENOMEM));
+
+ error = 0;
+ target = argv[0];
+
+ if (strpbrk(target, "/@") != NULL) {
+ size_t targetlen;
+
+ target_pool = strdup(target);
+ *strpbrk(target_pool, "/@") = '\0';
+
+ target_is_spa = B_FALSE;
+ targetlen = strlen(target);
+ if (targetlen && target[targetlen - 1] == '/')
+ target[targetlen - 1] = '\0';
+ } else {
+ target_pool = target;
+ }
+
+ if (dump_opt['e']) {
+ importargs_t args = { 0 };
+
+ args.paths = nsearch;
+ args.path = searchdirs;
+ args.can_be_active = B_TRUE;
+
+ error = zpool_find_config(NULL, target_pool, &cfg, &args,
+ &libzpool_config_ops);
+
+ if (error == 0) {
+
+ if (nvlist_add_nvlist(cfg,
+ ZPOOL_LOAD_POLICY, policy) != 0) {
+ fatal("can't open '%s': %s",
+ target, strerror(ENOMEM));
+ }
+
+ if (dump_opt['C'] > 1) {
+ (void) printf("\nConfiguration for import:\n");
+ dump_nvlist(cfg, 8);
+ }
+
+ /*
+ * Disable the activity check to allow examination of
+ * active pools.
+ */
+ error = spa_import(target_pool, cfg, NULL,
+ flags | ZFS_IMPORT_SKIP_MMP);
+ }
+ }
+
+ /*
+ * import_checkpointed_state makes the assumption that the
+ * target pool that we pass it is already part of the spa
+ * namespace. Because of that we need to make sure to call
+ * it always after the -e option has been processed, which
+ * imports the pool to the namespace if it's not in the
+ * cachefile.
+ */
+ char *checkpoint_pool = NULL;
+ char *checkpoint_target = NULL;
+ if (dump_opt['k']) {
+ checkpoint_pool = import_checkpointed_state(target, cfg,
+ &checkpoint_target);
+
+ if (checkpoint_target != NULL)
+ target = checkpoint_target;
+ }
+
+ if (target_pool != target)
+ free(target_pool);
+
+ if (error == 0) {
+ if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
+ ASSERT(checkpoint_pool != NULL);
+ ASSERT(checkpoint_target == NULL);
+
+ error = spa_open(checkpoint_pool, &spa, FTAG);
+ if (error != 0) {
+ fatal("Tried to open pool \"%s\" but "
+ "spa_open() failed with error %d\n",
+ checkpoint_pool, error);
+ }
+
+ } else if (target_is_spa || dump_opt['R'] || objset_id == 0) {
+ zdb_set_skip_mmp(target);
+ error = spa_open_rewind(target, &spa, FTAG, policy,
+ NULL);
+ if (error) {
+ /*
+ * If we're missing the log device then
+ * try opening the pool after clearing the
+ * log state.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(target)) != NULL &&
+ spa->spa_log_state == SPA_LOG_MISSING) {
+ spa->spa_log_state = SPA_LOG_CLEAR;
+ error = 0;
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ if (!error) {
+ error = spa_open_rewind(target, &spa,
+ FTAG, policy, NULL);
+ }
+ }
+ } else if (strpbrk(target, "#") != NULL) {
+ dsl_pool_t *dp;
+ error = dsl_pool_hold(target, FTAG, &dp);
+ if (error != 0) {
+ fatal("can't dump '%s': %s", target,
+ strerror(error));
+ }
+ error = dump_bookmark(dp, target, B_TRUE, verbose > 1);
+ dsl_pool_rele(dp, FTAG);
+ if (error != 0) {
+ fatal("can't dump '%s': %s", target,
+ strerror(error));
+ }
+ return (error);
+ } else {
+ zdb_set_skip_mmp(target);
+ if (dataset_lookup == B_TRUE) {
+ /*
+ * Use the supplied id to get the name
+ * for open_objset.
+ */
+ error = spa_open(target, &spa, FTAG);
+ if (error == 0) {
+ error = name_from_objset_id(spa,
+ objset_id, dsname);
+ spa_close(spa, FTAG);
+ if (error == 0)
+ target = dsname;
+ }
+ }
+ if (error == 0)
+ error = open_objset(target, FTAG, &os);
+ if (error == 0)
+ spa = dmu_objset_spa(os);
+ }
+ }
+ nvlist_free(policy);
+
+ if (error)
+ fatal("can't open '%s': %s", target, strerror(error));
+
+ /*
+ * Set the pool failure mode to panic in order to prevent the pool
+ * from suspending. A suspended I/O will have no way to resume and
+ * can prevent the zdb(8) command from terminating as expected.
+ */
+ if (spa != NULL)
+ spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
+
+ argv++;
+ argc--;
+ if (!dump_opt['R']) {
+ flagbits['d'] = ZOR_FLAG_DIRECTORY;
+ flagbits['f'] = ZOR_FLAG_PLAIN_FILE;
+ flagbits['m'] = ZOR_FLAG_SPACE_MAP;
+ flagbits['z'] = ZOR_FLAG_ZAP;
+ flagbits['A'] = ZOR_FLAG_ALL_TYPES;
+
+ if (argc > 0 && dump_opt['d']) {
+ zopt_object_args = argc;
+ zopt_object_ranges = calloc(zopt_object_args,
+ sizeof (zopt_object_range_t));
+ for (unsigned i = 0; i < zopt_object_args; i++) {
+ int err;
+ char *msg = NULL;
+
+ err = parse_object_range(argv[i],
+ &zopt_object_ranges[i], &msg);
+ if (err != 0)
+ fatal("Bad object or range: '%s': %s\n",
+ argv[i], msg ? msg : "");
+ }
+ } else if (argc > 0 && dump_opt['m']) {
+ zopt_metaslab_args = argc;
+ zopt_metaslab = calloc(zopt_metaslab_args,
+ sizeof (uint64_t));
+ for (unsigned i = 0; i < zopt_metaslab_args; i++) {
+ errno = 0;
+ zopt_metaslab[i] = strtoull(argv[i], NULL, 0);
+ if (zopt_metaslab[i] == 0 && errno != 0)
+ fatal("bad number %s: %s", argv[i],
+ strerror(errno));
+ }
+ }
+ if (os != NULL) {
+ dump_objset(os);
+ } else if (zopt_object_args > 0 && !dump_opt['m']) {
+ dump_objset(spa->spa_meta_objset);
+ } else {
+ dump_zpool(spa);
+ }
+ } else {
+ flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
+ flagbits['c'] = ZDB_FLAG_CHECKSUM;
+ flagbits['d'] = ZDB_FLAG_DECOMPRESS;
+ flagbits['e'] = ZDB_FLAG_BSWAP;
+ flagbits['g'] = ZDB_FLAG_GBH;
+ flagbits['i'] = ZDB_FLAG_INDIRECT;
+ flagbits['r'] = ZDB_FLAG_RAW;
+ flagbits['v'] = ZDB_FLAG_VERBOSE;
+
+ for (int i = 0; i < argc; i++)
+ zdb_read_block(argv[i], spa);
+ }
+
+ if (dump_opt['k']) {
+ free(checkpoint_pool);
+ if (!target_is_spa)
+ free(checkpoint_target);
+ }
+
+ if (os != NULL) {
+ close_objset(os, FTAG);
+ } else {
+ spa_close(spa, FTAG);
+ }
+
+ fuid_table_destroy();
+
+ dump_debug_buffer();
+
+ kernel_fini();
+
+ return (error);
+}
diff --git a/cmd/zdb/zdb.h b/cmd/zdb/zdb.h
new file mode 100644
index 000000000000..49579811efbb
--- /dev/null
+++ b/cmd/zdb/zdb.h
@@ -0,0 +1,33 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2017 Spectra Logic Corp Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+#ifndef _ZDB_H
+#define _ZDB_H
+
+void dump_intent_log(zilog_t *);
+extern uint8_t dump_opt[256];
+
+#endif /* _ZDB_H */
diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c
new file mode 100644
index 000000000000..c12178effae0
--- /dev/null
+++ b/cmd/zdb/zdb_il.c
@@ -0,0 +1,431 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2012 Cyril Plisko. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * Print intent log header and statistics.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/spa_impl.h>
+#include <sys/abd.h>
+
+#include "zdb.h"
+
+extern uint8_t dump_opt[256];
+
+static char tab_prefix[4] = "\t\t\t";
+
+static void
+print_log_bp(const blkptr_t *bp, const char *prefix)
+{
+ char blkbuf[BP_SPRINTF_LEN];
+
+ snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+ (void) printf("%s%s\n", prefix, blkbuf);
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_create_t *lr = arg;
+ time_t crtime = lr->lr_crtime[0];
+ char *name, *link;
+ lr_attr_t *lrattr;
+
+ name = (char *)(lr + 1);
+
+ if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR ||
+ lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) {
+ lrattr = (lr_attr_t *)(lr + 1);
+ name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ }
+
+ if (txtype == TX_SYMLINK) {
+ link = name + strlen(name) + 1;
+ (void) printf("%s%s -> %s\n", tab_prefix, name, link);
+ } else if (txtype != TX_MKXATTR) {
+ (void) printf("%s%s\n", tab_prefix, name);
+ }
+
+ (void) printf("%s%s", tab_prefix, ctime(&crtime));
+ (void) printf("%sdoid %llu, foid %llu, slots %llu, mode %llo\n",
+ tab_prefix, (u_longlong_t)lr->lr_doid,
+ (u_longlong_t)LR_FOID_GET_OBJ(lr->lr_foid),
+ (u_longlong_t)LR_FOID_GET_SLOTS(lr->lr_foid),
+ (longlong_t)lr->lr_mode);
+ (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n",
+ tab_prefix,
+ (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid,
+ (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev);
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_remove(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_remove_t *lr = arg;
+
+ (void) printf("%sdoid %llu, name %s\n", tab_prefix,
+ (u_longlong_t)lr->lr_doid, (char *)(lr + 1));
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_link(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_link_t *lr = arg;
+
+ (void) printf("%sdoid %llu, link_obj %llu, name %s\n", tab_prefix,
+ (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj,
+ (char *)(lr + 1));
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_rename(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_rename_t *lr = arg;
+ char *snm = (char *)(lr + 1);
+ char *tnm = snm + strlen(snm) + 1;
+
+ (void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix,
+ (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid);
+ (void) printf("%ssrc %s tgt %s\n", tab_prefix, snm, tnm);
+}
+
+/* ARGSUSED */
+static int
+zil_prt_rec_write_cb(void *data, size_t len, void *unused)
+{
+ char *cdata = data;
+
+ for (size_t i = 0; i < len; i++) {
+ if (isprint(*cdata))
+ (void) printf("%c ", *cdata);
+ else
+ (void) printf("%2X", *cdata);
+ cdata++;
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_write_t *lr = arg;
+ abd_t *data;
+ blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_phys_t zb;
+ int verbose = MAX(dump_opt['d'], dump_opt['i']);
+ int error;
+
+ (void) printf("%sfoid %llu, offset %llx, length %llx\n", tab_prefix,
+ (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
+ (u_longlong_t)lr->lr_length);
+
+ if (txtype == TX_WRITE2 || verbose < 5)
+ return;
+
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ (void) printf("%shas blkptr, %s\n", tab_prefix,
+ !BP_IS_HOLE(bp) &&
+ bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
+ "will claim" : "won't claim");
+ print_log_bp(bp, tab_prefix);
+
+ if (BP_IS_HOLE(bp)) {
+ (void) printf("\t\t\tLSIZE 0x%llx\n",
+ (u_longlong_t)BP_GET_LSIZE(bp));
+ (void) printf("%s<hole>\n", tab_prefix);
+ return;
+ }
+ if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
+ (void) printf("%s<block already committed>\n",
+ tab_prefix);
+ return;
+ }
+
+ SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
+ lr->lr_foid, ZB_ZIL_LEVEL,
+ lr->lr_offset / BP_GET_LSIZE(bp));
+
+ data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
+ error = zio_wait(zio_read(NULL, zilog->zl_spa,
+ bp, data, BP_GET_LSIZE(bp), NULL, NULL,
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
+ if (error)
+ goto out;
+ } else {
+ /* data is stored after the end of the lr_write record */
+ data = abd_alloc(lr->lr_length, B_FALSE);
+ abd_copy_from_buf(data, lr + 1, lr->lr_length);
+ }
+
+ (void) printf("%s", tab_prefix);
+ (void) abd_iterate_func(data,
+ 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
+ zil_prt_rec_write_cb, NULL);
+ (void) printf("\n");
+
+out:
+ abd_free(data);
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_truncate(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_truncate_t *lr = arg;
+
+ (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", tab_prefix,
+ (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset,
+ (u_longlong_t)lr->lr_length);
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_setattr(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_setattr_t *lr = arg;
+ time_t atime = (time_t)lr->lr_atime[0];
+ time_t mtime = (time_t)lr->lr_mtime[0];
+
+ (void) printf("%sfoid %llu, mask 0x%llx\n", tab_prefix,
+ (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask);
+
+ if (lr->lr_mask & AT_MODE) {
+ (void) printf("%sAT_MODE %llo\n", tab_prefix,
+ (longlong_t)lr->lr_mode);
+ }
+
+ if (lr->lr_mask & AT_UID) {
+ (void) printf("%sAT_UID %llu\n", tab_prefix,
+ (u_longlong_t)lr->lr_uid);
+ }
+
+ if (lr->lr_mask & AT_GID) {
+ (void) printf("%sAT_GID %llu\n", tab_prefix,
+ (u_longlong_t)lr->lr_gid);
+ }
+
+ if (lr->lr_mask & AT_SIZE) {
+ (void) printf("%sAT_SIZE %llu\n", tab_prefix,
+ (u_longlong_t)lr->lr_size);
+ }
+
+ if (lr->lr_mask & AT_ATIME) {
+ (void) printf("%sAT_ATIME %llu.%09llu %s", tab_prefix,
+ (u_longlong_t)lr->lr_atime[0],
+ (u_longlong_t)lr->lr_atime[1],
+ ctime(&atime));
+ }
+
+ if (lr->lr_mask & AT_MTIME) {
+ (void) printf("%sAT_MTIME %llu.%09llu %s", tab_prefix,
+ (u_longlong_t)lr->lr_mtime[0],
+ (u_longlong_t)lr->lr_mtime[1],
+ ctime(&mtime));
+ }
+}
+
+/* ARGSUSED */
+static void
+zil_prt_rec_acl(zilog_t *zilog, int txtype, void *arg)
+{
+ lr_acl_t *lr = arg;
+
+ (void) printf("%sfoid %llu, aclcnt %llu\n", tab_prefix,
+ (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt);
+}
+
+typedef void (*zil_prt_rec_func_t)(zilog_t *, int, void *);
+typedef struct zil_rec_info {
+ zil_prt_rec_func_t zri_print;
+ const char *zri_name;
+ uint64_t zri_count;
+} zil_rec_info_t;
+
+static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
+ {.zri_print = NULL, .zri_name = "Total "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKXATTR "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_SYMLINK "},
+ {.zri_print = zil_prt_rec_remove, .zri_name = "TX_REMOVE "},
+ {.zri_print = zil_prt_rec_remove, .zri_name = "TX_RMDIR "},
+ {.zri_print = zil_prt_rec_link, .zri_name = "TX_LINK "},
+ {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME "},
+ {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE "},
+ {.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE "},
+ {.zri_print = zil_prt_rec_setattr, .zri_name = "TX_SETATTR "},
+ {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_V0 "},
+ {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_ACL "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ATTR "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL_ATTR "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ATTR "},
+ {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL_ATTR "},
+ {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE2 "},
+};
+
+/* ARGSUSED */
+static int
+print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg)
+{
+ int txtype;
+ int verbose = MAX(dump_opt['d'], dump_opt['i']);
+
+ /* reduce size of txtype to strip off TX_CI bit */
+ txtype = lr->lrc_txtype;
+
+ ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE);
+ ASSERT(lr->lrc_txg);
+
+ (void) printf("\t\t%s%s len %6llu, txg %llu, seq %llu\n",
+ (lr->lrc_txtype & TX_CI) ? "CI-" : "",
+ zil_rec_info[txtype].zri_name,
+ (u_longlong_t)lr->lrc_reclen,
+ (u_longlong_t)lr->lrc_txg,
+ (u_longlong_t)lr->lrc_seq);
+
+ if (txtype && verbose >= 3) {
+ if (!zilog->zl_os->os_encrypted) {
+ zil_rec_info[txtype].zri_print(zilog, txtype, lr);
+ } else {
+ (void) printf("%s(encrypted)\n", tab_prefix);
+ }
+ }
+
+ zil_rec_info[txtype].zri_count++;
+ zil_rec_info[0].zri_count++;
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ char blkbuf[BP_SPRINTF_LEN + 10];
+ int verbose = MAX(dump_opt['d'], dump_opt['i']);
+ const char *claim;
+
+ if (verbose <= 3)
+ return (0);
+
+ if (verbose >= 5) {
+ (void) strcpy(blkbuf, ", ");
+ snprintf_blkptr(blkbuf + strlen(blkbuf),
+ sizeof (blkbuf) - strlen(blkbuf), bp);
+ } else {
+ blkbuf[0] = '\0';
+ }
+
+ if (claim_txg != 0)
+ claim = "already claimed";
+ else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa))
+ claim = "will claim";
+ else
+ claim = "won't claim";
+
+ (void) printf("\tBlock seqno %llu, %s%s\n",
+ (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf);
+
+ return (0);
+}
+
+static void
+print_log_stats(int verbose)
+{
+ unsigned i, w, p10;
+
+ if (verbose > 3)
+ (void) printf("\n");
+
+ if (zil_rec_info[0].zri_count == 0)
+ return;
+
+ for (w = 1, p10 = 10; zil_rec_info[0].zri_count >= p10; p10 *= 10)
+ w++;
+
+ for (i = 0; i < TX_MAX_TYPE; i++)
+ if (zil_rec_info[i].zri_count || verbose >= 3)
+ (void) printf("\t\t%s %*llu\n",
+ zil_rec_info[i].zri_name, w,
+ (u_longlong_t)zil_rec_info[i].zri_count);
+ (void) printf("\n");
+}
+
+/* ARGSUSED */
+void
+dump_intent_log(zilog_t *zilog)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ int verbose = MAX(dump_opt['d'], dump_opt['i']);
+ int i;
+
+ if (BP_IS_HOLE(&zh->zh_log) || verbose < 1)
+ return;
+
+ (void) printf("\n ZIL header: claim_txg %llu, "
+ "claim_blk_seq %llu, claim_lr_seq %llu",
+ (u_longlong_t)zh->zh_claim_txg,
+ (u_longlong_t)zh->zh_claim_blk_seq,
+ (u_longlong_t)zh->zh_claim_lr_seq);
+ (void) printf(" replay_seq %llu, flags 0x%llx\n",
+ (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags);
+
+ for (i = 0; i < TX_MAX_TYPE; i++)
+ zil_rec_info[i].zri_count = 0;
+
+ /* see comment in zil_claim() or zil_check_log_chain() */
+ if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+ zh->zh_claim_txg == 0)
+ return;
+
+ if (verbose >= 2) {
+ (void) printf("\n");
+ (void) zil_parse(zilog, print_log_block, print_log_record, NULL,
+ zh->zh_claim_txg, B_FALSE);
+ print_log_stats(verbose);
+ }
+}
diff --git a/cmd/zed/.gitignore b/cmd/zed/.gitignore
new file mode 100644
index 000000000000..76557bb6bb3a
--- /dev/null
+++ b/cmd/zed/.gitignore
@@ -0,0 +1 @@
+/zed
diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am
new file mode 100644
index 000000000000..4bd8ac4a53e6
--- /dev/null
+++ b/cmd/zed/Makefile.am
@@ -0,0 +1,49 @@
include $(top_srcdir)/config/Rules.am

# zed links against libudev (disk events) and libuuid (fault case UUIDs)
AM_CFLAGS += $(LIBUDEV_CFLAGS) $(LIBUUID_CFLAGS)

# recurse into zed.d as well
SUBDIRS = zed.d

sbin_PROGRAMS = zed

# Core daemon: event loop, configuration, logging, zedlet execution
ZED_SRC = \
	zed.c \
	zed.h \
	zed_conf.c \
	zed_conf.h \
	zed_disk_event.c \
	zed_disk_event.h \
	zed_event.c \
	zed_event.h \
	zed_exec.c \
	zed_exec.h \
	zed_file.c \
	zed_file.h \
	zed_log.c \
	zed_log.h \
	zed_strings.c \
	zed_strings.h

# Built-in FMA agents (diagnosis, retire, disk-add) plus the emulated
# FMD module API they run against -- see agents/README.md
FMA_SRC = \
	agents/zfs_agents.c \
	agents/zfs_agents.h \
	agents/zfs_diagnosis.c \
	agents/zfs_mod.c \
	agents/zfs_retire.c \
	agents/fmd_api.c \
	agents/fmd_api.h \
	agents/fmd_serd.c \
	agents/fmd_serd.h

zed_SOURCES = $(ZED_SRC) $(FMA_SRC)

zed_LDADD = \
	$(abs_top_builddir)/lib/libzfs/libzfs.la \
	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
	$(abs_top_builddir)/lib/libnvpair/libnvpair.la \
	$(abs_top_builddir)/lib/libuutil/libuutil.la

# -lrt for the POSIX timer_create()/timer_settime() calls in fmd_api.c
zed_LDADD += -lrt $(LIBUDEV_LIBS) $(LIBUUID_LIBS)
zed_LDFLAGS = -pthread

EXTRA_DIST = agents/README.md
diff --git a/cmd/zed/agents/README.md b/cmd/zed/agents/README.md
new file mode 100644
index 000000000000..e35b97668a9d
--- /dev/null
+++ b/cmd/zed/agents/README.md
@@ -0,0 +1,112 @@
+## Fault Management Logic for ZED ##
+
+The integration of Fault Management Daemon (FMD) logic from illumos
+is being deployed in three phases. This logic is encapsulated in
+several software modules inside ZED.
+
+### ZED+FM Phase 1 ###
+
+All the phase 1 work is in the current master branch. Phase 1 work includes:
+
+* Add new paths to the persistent VDEV label for device matching.
+* Add a disk monitor for generating _disk-add_ and _disk-change_ events.
+* Add support for automated VDEV auto-online, auto-replace and auto-expand.
+* Expand the statechange event to include all VDEV state transitions.
+
+### ZED+FM Phase 2 (WIP) ###
+
+The phase 2 work primarily entails the _Diagnosis Engine_ and the
+_Retire Agent_ modules. It also includes infrastructure to support a
+crude FMD environment to host these modules. For additional
+information see the **FMD Components in ZED** and **Implementation
+Notes** sections below.
+
+### ZED+FM Phase 3 ###
+
+Future work will add additional functionality and will likely include:
+
+* Add FMD module garbage collection (periodically call `fmd_module_gc()`).
+* Add real module property retrieval (currently hard-coded in accessors).
+* Additional diagnosis telemetry (like latency outliers and SMART data).
+* Export FMD module statistics.
+* Zedlet parallel execution and resiliency (add watchdog).
+
+### ZFS Fault Management Overview ###
+
+The primary purpose of ZFS fault management is automated diagnosis
+and isolation of VDEV faults. A fault is something we can associate
+with an impact (e.g. loss of data redundancy) and a corrective action
+(e.g. offline or replace a disk). A typical ZFS fault management stack
+is comprised of _error detectors_ (e.g. `zfs_ereport_post()`), a _disk
+monitor_, a _diagnosis engine_ and _response agents_.
+
+After detecting a software error, the ZFS kernel module sends error
+events to the ZED user daemon which in turn routes the events to its
+internal FMA modules based on their event subscriptions. Likewise, if
+a disk is added or changed in the system, the disk monitor sends disk
+events which are consumed by a response agent.
+
+### FMD Components in ZED ###
+
+There are three FMD modules (aka agents) that are now built into ZED.
+
+ 1. A _Diagnosis Engine_ module (`agents/zfs_diagnosis.c`)
+ 2. A _Retire Agent_ module (`agents/zfs_retire.c`)
+ 3. A _Disk Add Agent_ module (`agents/zfs_mod.c`)
+
+To begin with, a **Diagnosis Engine** consumes per-vdev I/O and checksum
+ereports and feeds them into a Soft Error Rate Discrimination (SERD)
+algorithm which will generate a corresponding fault diagnosis when the
+tracked VDEV encounters **N** events in a given **T** time window. The
+initial N and T values for the SERD algorithm are estimates inherited
+from illumos (10 errors in 10 minutes).
+
+In turn, a **Retire Agent** responds to diagnosed faults by isolating
+the faulty VDEV. It will notify the ZFS kernel module of the new VDEV
+state (degraded or faulted). The retire agent is also responsible for
+managing hot spares across all pools. When it encounters a device fault
+or a device removal it will replace the device with an appropriate
+spare if available.
+
+Finally, a **Disk Add Agent** responds to events from a libudev disk
+monitor (`EC_DEV_ADD` or `EC_DEV_STATUS`) and will online, replace or
+expand the associated VDEV. This agent is also known as the `zfs_mod`
+or Sysevent Loadable Module (SLM) on the illumos platform. The added
+disk is matched to a specific VDEV using its device id, physical path
+or VDEV GUID.
+
+Note that the _auto-replace_ feature (aka hot plug) is opt-in and you
+must set the pool's `autoreplace` property to enable it. The new disk
+will be matched to the corresponding leaf VDEV by physical location
+and labeled with a GPT partition before replacing the original VDEV
+in the pool.
+
+### Implementation Notes ###
+
+* The FMD module API required for logic modules is emulated and implemented
+ in the `fmd_api.c` and `fmd_serd.c` source files. This support includes
+ module registration, memory allocation, module property accessors, basic
+ case management, one-shot timers and SERD engines.
+ For detailed information on the FMD module API, see the document --
+ _"Fault Management Daemon Programmer's Reference Manual"_.
+
+* The event subscriptions for the modules (located in a module specific
+ configuration file on illumos) are currently hard-coded into the ZED
+ `zfs_agent_dispatch()` function.
+
+* The FMD modules are called one at a time from a single thread that
+ consumes events queued to the modules. These events are sourced from
+ the normal ZED events and also include events posted from the diagnosis
+ engine and the libudev disk event monitor.
+
+* The FMD code modules have minimal changes and were intentionally left
+ as similar as possible to their upstream source files.
+
+* The sysevent namespace in ZED differs from illumos. For example:
+ * illumos uses `"resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove"`
+ * Linux uses `"sysevent.fs.zfs.vdev_remove"`
+
+* The FMD Modules port was produced by Intel Federal, LLC under award
+ number B609815 between the U.S. Department of Energy (DOE) and Intel
+ Federal, LLC.
+
diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c
new file mode 100644
index 000000000000..607b387ca3a8
--- /dev/null
+++ b/cmd/zed/agents/fmd_api.c
@@ -0,0 +1,760 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+/*
+ * This file implements the minimal FMD module API required to support the
+ * fault logic modules in ZED. This support includes module registration,
+ * memory allocation, module property accessors, basic case management,
+ * one-shot timers and SERD engines.
+ *
+ * In the ZED runtime, the modules are called from a single thread so no
+ * locking is required in this emulated FMD environment.
+ */
+
+#include <sys/types.h>
+#include <sys/fm/protocol.h>
+#include <uuid/uuid.h>
+#include <signal.h>
+#include <strings.h>
+#include <time.h>
+
+#include "fmd_api.h"
+#include "fmd_serd.h"
+
+#include "zfs_agents.h"
+#include "../zed_log.h"
+
/*
 * Minimal subset of FMD's built-in per-module statistics; dumped to the
 * log by fmd_hdl_unregister().
 */
typedef struct fmd_modstat {
	fmd_stat_t ms_accepted;		/* total events accepted by module */
	fmd_stat_t ms_caseopen;		/* cases currently open */
	fmd_stat_t ms_casesolved;	/* total cases solved by module */
	fmd_stat_t ms_caseclosed;	/* total cases closed by module */
} fmd_modstat_t;

/*
 * Emulated FMD module state.  An opaque fmd_hdl_t handle is simply a
 * pointer to one of these, cast back by the fmd_hdl_*() accessors.
 */
typedef struct fmd_module {
	const char *mod_name;		/* basename of module (ro) */
	const fmd_hdl_info_t *mod_info;	/* module info registered with handle */
	void *mod_spec;			/* fmd_hdl_get/setspecific data value */
	fmd_stat_t *mod_ustat;		/* module specific custom stats */
	uint_t mod_ustat_cnt;		/* count of ustat stats */
	fmd_modstat_t mod_stats;	/* fmd built-in per-module statistics */
	fmd_serd_hash_t mod_serds;	/* hash of serd engs owned by module */
	char *mod_vers;			/* a copy of module version string */
} fmd_module_t;

/*
 * ZED has two FMD hardwired module instances
 */
fmd_module_t zfs_retire_module;
fmd_module_t zfs_diagnosis_module;
+
+/*
+ * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
+ */
+
#ifdef DEBUG
/*
 * libumem debugging hooks: returning these strings here is equivalent
 * to setting $UMEM_DEBUG / $UMEM_LOGGING in the environment.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}
#endif
+
+/*
+ * Register a module with fmd and finish module initialization.
+ * Returns an integer indicating whether it succeeded (zero) or
+ * failed (non-zero).
+ */
+int
+fmd_hdl_register(fmd_hdl_t *hdl, int version, const fmd_hdl_info_t *mip)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ mp->mod_info = mip;
+ mp->mod_name = mip->fmdi_desc + 4; /* drop 'ZFS ' prefix */
+ mp->mod_spec = NULL;
+
+ /* bare minimum module stats */
+ (void) strcpy(mp->mod_stats.ms_accepted.fmds_name, "fmd.accepted");
+ (void) strcpy(mp->mod_stats.ms_caseopen.fmds_name, "fmd.caseopen");
+ (void) strcpy(mp->mod_stats.ms_casesolved.fmds_name, "fmd.casesolved");
+ (void) strcpy(mp->mod_stats.ms_caseclosed.fmds_name, "fmd.caseclosed");
+
+ fmd_serd_hash_create(&mp->mod_serds);
+
+ fmd_hdl_debug(hdl, "register module");
+
+ return (0);
+}
+
void
fmd_hdl_unregister(fmd_hdl_t *hdl)
{
	/*
	 * Tear down a module: dump its statistics to the log, then destroy
	 * its SERD engines.  Mirrors fmd_hdl_register().
	 */
	fmd_module_t *mp = (fmd_module_t *)hdl;
	fmd_modstat_t *msp = &mp->mod_stats;
	const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;

	/* dump generic module stats */
	fmd_hdl_debug(hdl, "%s: %llu", msp->ms_accepted.fmds_name,
	    msp->ms_accepted.fmds_value.ui64);
	/*
	 * Case counters are only dumped for modules that provide an
	 * fmdo_close entry point -- presumably only those manage cases;
	 * TODO confirm against the agent implementations.
	 */
	if (ops->fmdo_close != NULL) {
		fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseopen.fmds_name,
		    msp->ms_caseopen.fmds_value.ui64);
		fmd_hdl_debug(hdl, "%s: %llu", msp->ms_casesolved.fmds_name,
		    msp->ms_casesolved.fmds_value.ui64);
		fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseclosed.fmds_name,
		    msp->ms_caseclosed.fmds_value.ui64);
	}

	/* dump module specific stats */
	if (mp->mod_ustat != NULL) {
		int i;	/* NOTE(review): int compared to uint_t mod_ustat_cnt */

		for (i = 0; i < mp->mod_ustat_cnt; i++) {
			fmd_hdl_debug(hdl, "%s: %llu",
			    mp->mod_ustat[i].fmds_name,
			    mp->mod_ustat[i].fmds_value.ui64);
		}
	}

	fmd_serd_hash_destroy(&mp->mod_serds);

	fmd_hdl_debug(hdl, "unregister module");
}
+
+/*
+ * fmd_hdl_setspecific() is used to associate a data pointer with
+ * the specified handle for the duration of the module's lifetime.
+ * This pointer can be retrieved using fmd_hdl_getspecific().
+ */
+void
+fmd_hdl_setspecific(fmd_hdl_t *hdl, void *spec)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ mp->mod_spec = spec;
+}
+
+/*
+ * Return the module-specific data pointer previously associated
+ * with the handle using fmd_hdl_setspecific().
+ */
+void *
+fmd_hdl_getspecific(fmd_hdl_t *hdl)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ return (mp->mod_spec);
+}
+
/*
 * Memory allocation wrappers: in ZED these map directly onto libumem.
 * With FMD_SLEEP (== UMEM_NOFAIL) the allocation never returns NULL.
 */
void *
fmd_hdl_alloc(fmd_hdl_t *hdl, size_t size, int flags)
{
	return (umem_alloc(size, flags));
}

/* Same as fmd_hdl_alloc() but the returned memory is zero-filled. */
void *
fmd_hdl_zalloc(fmd_hdl_t *hdl, size_t size, int flags)
{
	return (umem_zalloc(size, flags));
}

/* Release memory obtained from fmd_hdl_alloc()/fmd_hdl_zalloc(). */
void
fmd_hdl_free(fmd_hdl_t *hdl, void *data, size_t size)
{
	umem_free(data, size);
}
+
+/*
+ * Record a module debug message using the specified format.
+ */
void
fmd_hdl_debug(fmd_hdl_t *hdl, const char *format, ...)
{
	/*
	 * Format a debug message and forward it to the ZED log.  Messages
	 * longer than the local buffer (256 bytes) are silently truncated
	 * by vsnprintf().
	 */
	char message[256];
	va_list vargs;
	fmd_module_t *mp = (fmd_module_t *)hdl;

	va_start(vargs, format);
	(void) vsnprintf(message, sizeof (message), format, vargs);
	va_end(vargs);

	/* prefix message with module name */
	zed_log_msg(LOG_INFO, "%s: %s", mp->mod_name, message);
}
+
+/* Property Retrieval */
+
+int32_t
+fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name)
+{
+ /*
+ * These can be looked up in mp->modinfo->fmdi_props
+ * For now we just hard code for phase 2. In the
+ * future, there can be a ZED based override.
+ */
+ if (strcmp(name, "spare_on_remove") == 0)
+ return (1);
+
+ if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0)
+ return (10); /* N = 10 events */
+
+ return (0);
+}
+
+int64_t
+fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name)
+{
+ /*
+ * These can be looked up in mp->modinfo->fmdi_props
+ * For now we just hard code for phase 2. In the
+ * future, there can be a ZED based override.
+ */
+ if (strcmp(name, "remove_timeout") == 0)
+ return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */
+
+ if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0)
+ return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */
+
+ return (0);
+}
+
+/* FMD Statistics */
+
fmd_stat_t *
fmd_stat_create(fmd_hdl_t *hdl, uint_t flags, uint_t nstats, fmd_stat_t *statv)
{
	/*
	 * Register a module-specific statistics array.  Only the
	 * FMD_STAT_NOALLOC flavor is supported here: the caller's array is
	 * recorded as-is (no copy is made).  Any other flags value leaves
	 * the module untouched and simply echoes back 'statv'.
	 */
	fmd_module_t *mp = (fmd_module_t *)hdl;

	if (flags == FMD_STAT_NOALLOC) {
		mp->mod_ustat = statv;
		mp->mod_ustat_cnt = nstats;
	}

	return (statv);
}
+
+/* Case Management */
+
fmd_case_t *
fmd_case_open(fmd_hdl_t *hdl, void *data)
{
	/*
	 * Allocate a new case in the UNSOLVED state, tag it with a freshly
	 * generated UUID and bump the module's open-case counter.  The
	 * FMD_SLEEP allocation cannot fail, so this never returns NULL.
	 */
	fmd_module_t *mp = (fmd_module_t *)hdl;
	uuid_t uuid;

	fmd_case_t *cp;

	cp = fmd_hdl_zalloc(hdl, sizeof (fmd_case_t), FMD_SLEEP);
	cp->ci_mod = hdl;
	cp->ci_state = FMD_CASE_UNSOLVED;
	cp->ci_flags = FMD_CF_DIRTY;
	cp->ci_data = data;
	cp->ci_bufptr = NULL;
	cp->ci_bufsiz = 0;

	/* 36-character UUID string plus NUL fits within ci_uuid[48] */
	uuid_generate(uuid);
	uuid_unparse(uuid, cp->ci_uuid);

	fmd_hdl_debug(hdl, "case opened (%s)", cp->ci_uuid);
	mp->mod_stats.ms_caseopen.fmds_value.ui64++;

	return (cp);
}
+
void
fmd_case_solve(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	/*
	 * Mark a case solved and count it.  Unlike real FMD, no
	 * list.suspect event is emitted here -- see below.
	 */
	fmd_module_t *mp = (fmd_module_t *)hdl;

	/*
	 * For ZED, the event was already sent from fmd_case_add_suspect()
	 */

	/* a redundant call is only logged; the counter still increments */
	if (cp->ci_state >= FMD_CASE_SOLVED)
		fmd_hdl_debug(hdl, "case is already solved or closed");

	cp->ci_state = FMD_CASE_SOLVED;

	fmd_hdl_debug(hdl, "case solved (%s)", cp->ci_uuid);
	mp->mod_stats.ms_casesolved.fmds_value.ui64++;
}
+
void
fmd_case_close(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	/*
	 * Close and free a case.  The module's fmdo_close() callback (if
	 * any) runs first, while the case memory is still valid; the
	 * serialization buffer and the case itself are freed afterwards.
	 */
	fmd_module_t *mp = (fmd_module_t *)hdl;
	const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;

	fmd_hdl_debug(hdl, "case closed (%s)", cp->ci_uuid);

	if (ops->fmdo_close != NULL)
		ops->fmdo_close(hdl, cp);

	mp->mod_stats.ms_caseopen.fmds_value.ui64--;
	mp->mod_stats.ms_caseclosed.fmds_value.ui64++;

	/* release the buffer attached by fmd_buf_create(), if any */
	if (cp->ci_bufptr != NULL && cp->ci_bufsiz > 0)
		fmd_hdl_free(hdl, cp->ci_bufptr, cp->ci_bufsiz);

	fmd_hdl_free(hdl, cp, sizeof (fmd_case_t));
}
+
void
fmd_case_uuresolved(fmd_hdl_t *hdl, const char *uuid)
{
	/* Stub: resolution by UUID is only logged in this emulation. */
	fmd_hdl_debug(hdl, "case resolved by uuid (%s)", uuid);
}

int
fmd_case_solved(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	/* A case counts as solved once it reaches SOLVED or a later state. */
	return ((cp->ci_state >= FMD_CASE_SOLVED) ? FMD_B_TRUE : FMD_B_FALSE);
}

void
fmd_case_add_ereport(fmd_hdl_t *hdl, fmd_case_t *cp, fmd_event_t *ep)
{
	/* Stub: ereports are not tracked per-case in this emulation. */
}
+
+static void
+zed_log_fault(nvlist_t *nvl, const char *uuid, const char *code)
+{
+ nvlist_t *rsrc;
+ char *strval;
+ uint64_t guid;
+ uint8_t byte;
+
+ zed_log_msg(LOG_INFO, "\nzed_fault_event:");
+
+ if (uuid != NULL)
+ zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_UUID, uuid);
+ if (nvlist_lookup_string(nvl, FM_CLASS, &strval) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %s", FM_CLASS, strval);
+ if (code != NULL)
+ zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_DIAG_CODE, code);
+ if (nvlist_lookup_uint8(nvl, FM_FAULT_CERTAINTY, &byte) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FAULT_CERTAINTY, byte);
+ if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) {
+ if (nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, &strval) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %s", FM_FMRI_SCHEME,
+ strval);
+ if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_POOL, &guid) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FMRI_ZFS_POOL,
+ guid);
+ if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_VDEV, &guid) == 0)
+ zed_log_msg(LOG_INFO, "\t%s: %llu \n", FM_FMRI_ZFS_VDEV,
+ guid);
+ }
+}
+
+static const char *
+fmd_fault_mkcode(nvlist_t *fault)
+{
+ char *class, *code = "-";
+
+ /*
+ * Note: message codes come from: openzfs/usr/src/cmd/fm/dicts/ZFS.po
+ */
+ if (nvlist_lookup_string(fault, FM_CLASS, &class) == 0) {
+ if (strcmp(class, "fault.fs.zfs.vdev.io") == 0)
+ code = "ZFS-8000-FD";
+ else if (strcmp(class, "fault.fs.zfs.vdev.checksum") == 0)
+ code = "ZFS-8000-GH";
+ else if (strcmp(class, "fault.fs.zfs.io_failure_wait") == 0)
+ code = "ZFS-8000-HC";
+ else if (strcmp(class, "fault.fs.zfs.io_failure_continue") == 0)
+ code = "ZFS-8000-JQ";
+ else if (strcmp(class, "fault.fs.zfs.log_replay") == 0)
+ code = "ZFS-8000-K4";
+ else if (strcmp(class, "fault.fs.zfs.pool") == 0)
+ code = "ZFS-8000-CS";
+ else if (strcmp(class, "fault.fs.zfs.device") == 0)
+ code = "ZFS-8000-D3";
+
+ }
+ return (code);
+}
+
void
fmd_case_add_suspect(fmd_hdl_t *hdl, fmd_case_t *cp, nvlist_t *fault)
{
	/*
	 * Wrap 'fault' in a list.suspect event and post it to the agents.
	 * Ownership: this function consumes 'fault' -- both the wrapper
	 * nvlist and the fault itself are freed before returning, so the
	 * caller must not touch 'fault' afterwards.
	 */
	nvlist_t *nvl;
	const char *code = fmd_fault_mkcode(fault);
	int64_t tod[2];
	int err = 0;

	/*
	 * payload derived from fmd_protocol_list()
	 */

	/* the diagnosis time is also recorded on the case itself */
	(void) gettimeofday(&cp->ci_tv, NULL);
	tod[0] = cp->ci_tv.tv_sec;
	tod[1] = cp->ci_tv.tv_usec;

	nvl = fmd_nvl_alloc(hdl, FMD_SLEEP);

	err |= nvlist_add_uint8(nvl, FM_VERSION, FM_SUSPECT_VERSION);
	err |= nvlist_add_string(nvl, FM_CLASS, FM_LIST_SUSPECT_CLASS);
	err |= nvlist_add_string(nvl, FM_SUSPECT_UUID, cp->ci_uuid);
	err |= nvlist_add_string(nvl, FM_SUSPECT_DIAG_CODE, code);
	err |= nvlist_add_int64_array(nvl, FM_SUSPECT_DIAG_TIME, tod, 2);
	err |= nvlist_add_uint32(nvl, FM_SUSPECT_FAULT_SZ, 1);
	err |= nvlist_add_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, &fault, 1);

	/* any accumulated nvlist error is fatal for the daemon */
	if (err)
		zed_log_die("failed to populate nvlist");

	zed_log_fault(fault, cp->ci_uuid, code);
	zfs_agent_post_event(FM_LIST_SUSPECT_CLASS, NULL, nvl);

	nvlist_free(nvl);
	nvlist_free(fault);
}
+
void
fmd_case_setspecific(fmd_hdl_t *hdl, fmd_case_t *cp, void *data)
{
	/* Attach caller data to the case (see fmd_case_getspecific()). */
	cp->ci_data = data;
}

void *
fmd_case_getspecific(fmd_hdl_t *hdl, fmd_case_t *cp)
{
	/* Return the pointer set by fmd_case_setspecific()/fmd_case_open(). */
	return (cp->ci_data);
}
+
void
fmd_buf_create(fmd_hdl_t *hdl, fmd_case_t *cp, const char *name, size_t size)
{
	/*
	 * Attach a named serialization buffer to a case.  This emulation
	 * supports exactly one buffer, which must be named "data", and
	 * caps its size below 1 MiB.
	 */
	assert(strcmp(name, "data") == 0);
	assert(cp->ci_bufptr == NULL);
	assert(size < (1024 * 1024));

	cp->ci_bufptr = fmd_hdl_alloc(hdl, size, FMD_SLEEP);
	cp->ci_bufsiz = size;
}

void
fmd_buf_read(fmd_hdl_t *hdl, fmd_case_t *cp,
    const char *name, void *buf, size_t size)
{
	/* Copy 'size' bytes out of the case's "data" buffer into 'buf'. */
	assert(strcmp(name, "data") == 0);
	assert(cp->ci_bufptr != NULL);
	assert(size <= cp->ci_bufsiz);

	/* NOTE(review): bcopy() is legacy POSIX; memcpy() is preferred */
	bcopy(cp->ci_bufptr, buf, size);
}

void
fmd_buf_write(fmd_hdl_t *hdl, fmd_case_t *cp,
    const char *name, const void *buf, size_t size)
{
	/* Copy 'size' bytes from 'buf' into the case's "data" buffer. */
	assert(strcmp(name, "data") == 0);
	assert(cp->ci_bufptr != NULL);
	assert(cp->ci_bufsiz >= size);

	bcopy(buf, cp->ci_bufptr, size);
}
+
+/* SERD Engines */
+
+void
+fmd_serd_create(fmd_hdl_t *hdl, const char *name, uint_t n, hrtime_t t)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ if (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL) {
+ zed_log_msg(LOG_ERR, "failed to create SERD engine '%s': "
+ " name already exists", name);
+ return;
+ }
+
+ (void) fmd_serd_eng_insert(&mp->mod_serds, name, n, t);
+}
+
void
fmd_serd_destroy(fmd_hdl_t *hdl, const char *name)
{
	/* Remove the named SERD engine from this module's hash. */
	fmd_module_t *mp = (fmd_module_t *)hdl;

	fmd_serd_eng_delete(&mp->mod_serds, name);

	fmd_hdl_debug(hdl, "serd_destroy %s", name);
}

int
fmd_serd_exists(fmd_hdl_t *hdl, const char *name)
{
	/* Return nonzero if this module owns a SERD engine named 'name'. */
	fmd_module_t *mp = (fmd_module_t *)hdl;

	return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL);
}
+
+void
+fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ fmd_serd_eng_t *sgp;
+
+ if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
+ zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
+ return;
+ }
+
+ fmd_serd_eng_reset(sgp);
+
+ fmd_hdl_debug(hdl, "serd_reset %s", name);
+}
+
+int
+fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+ fmd_serd_eng_t *sgp;
+ int err;
+
+ if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
+ zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'",
+ name);
+ return (FMD_B_FALSE);
+ }
+ err = fmd_serd_eng_record(sgp, ep->ev_hrt);
+
+ return (err);
+}
+
+/* FMD Timers */
+
static void
_timer_notify(union sigval sv)
{
	/*
	 * SIGEV_THREAD callback invoked when a timer installed by
	 * fmd_timer_install() expires.  Disarms the timer (the interval
	 * programmed at install time would otherwise re-fire it) and then
	 * dispatches the module's fmdo_timeout() entry point.
	 */
	fmd_timer_t *ftp = sv.sival_ptr;
	fmd_hdl_t *hdl = ftp->ft_hdl;
	fmd_module_t *mp = (fmd_module_t *)hdl;
	const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
	struct itimerspec its;

	fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid);

	/* disarm the timer by loading an all-zero itimerspec */
	bzero(&its, sizeof (struct itimerspec));
	timer_settime(ftp->ft_tid, 0, &its, NULL);

	/* Note that the fmdo_timeout can remove this timer */
	if (ops->fmdo_timeout != NULL)
		ops->fmdo_timeout(hdl, ftp, ftp->ft_arg);
}
+
+/*
+ * Install a new timer which will fire at least delta nanoseconds after the
+ * current time. After the timeout has expired, the module's fmdo_timeout
+ * entry point is called.
+ */
+fmd_timer_t *
+fmd_timer_install(fmd_hdl_t *hdl, void *arg, fmd_event_t *ep, hrtime_t delta)
+{
+ struct sigevent sev;
+ struct itimerspec its;
+ fmd_timer_t *ftp;
+
+ ftp = fmd_hdl_alloc(hdl, sizeof (fmd_timer_t), FMD_SLEEP);
+ ftp->ft_arg = arg;
+ ftp->ft_hdl = hdl;
+
+ its.it_value.tv_sec = delta / 1000000000;
+ its.it_value.tv_nsec = delta % 1000000000;
+ its.it_interval.tv_sec = its.it_value.tv_sec;
+ its.it_interval.tv_nsec = its.it_value.tv_nsec;
+
+ sev.sigev_notify = SIGEV_THREAD;
+ sev.sigev_notify_function = _timer_notify;
+ sev.sigev_notify_attributes = NULL;
+ sev.sigev_value.sival_ptr = ftp;
+
+ timer_create(CLOCK_REALTIME, &sev, &ftp->ft_tid);
+ timer_settime(ftp->ft_tid, 0, &its, NULL);
+
+ fmd_hdl_debug(hdl, "installing timer for %d secs (%p)",
+ (int)its.it_value.tv_sec, ftp->ft_tid);
+
+ return (ftp);
+}
+
void
fmd_timer_remove(fmd_hdl_t *hdl, fmd_timer_t *ftp)
{
	/* Delete the POSIX timer and free its tracking structure. */
	fmd_hdl_debug(hdl, "removing timer (%p)", ftp->ft_tid);

	timer_delete(ftp->ft_tid);

	fmd_hdl_free(hdl, ftp, sizeof (fmd_timer_t));
}
+
+/* Name-Value Pair Lists */
+
nvlist_t *
fmd_nvl_create_fault(fmd_hdl_t *hdl, const char *class, uint8_t certainty,
    nvlist_t *asru, nvlist_t *fru, nvlist_t *resource)
{
	/*
	 * Build a fault.* nvlist with the given class and certainty.  The
	 * optional asru/fru/resource nvlists are copied in by
	 * nvlist_add_nvlist(), so the caller keeps ownership of its
	 * originals.  Any failure aborts the daemon via zed_log_die().
	 */
	nvlist_t *nvl;
	int err = 0;

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
		zed_log_die("failed to xalloc fault nvlist");

	err |= nvlist_add_uint8(nvl, FM_VERSION, FM_FAULT_VERSION);
	err |= nvlist_add_string(nvl, FM_CLASS, class);
	err |= nvlist_add_uint8(nvl, FM_FAULT_CERTAINTY, certainty);

	if (asru != NULL)
		err |= nvlist_add_nvlist(nvl, FM_FAULT_ASRU, asru);
	if (fru != NULL)
		err |= nvlist_add_nvlist(nvl, FM_FAULT_FRU, fru);
	if (resource != NULL)
		err |= nvlist_add_nvlist(nvl, FM_FAULT_RESOURCE, resource);

	/*
	 * NOTE(review): 'err' is an OR of several return codes, so the
	 * strerror() text below may not name the actual failing errno.
	 */
	if (err)
		zed_log_die("failed to populate nvlist: %s\n", strerror(err));

	return (nvl);
}
+
+/*
+ * sourced from fmd_string.c
+ */
static int
fmd_strmatch(const char *s, const char *p)
{
	/*
	 * Glob-style match of string 's' against pattern 'p', where '*'
	 * matches any (possibly empty) run of characters.  Returns 1 on
	 * match, 0 otherwise.  A NULL pattern never matches; a NULL
	 * string is treated as the empty string.
	 */
	if (p == NULL)
		return (0);

	if (s == NULL)
		s = "";

	for (;;) {
		char pc = *p;

		if (pc == '\0')
			return (*s == '\0');

		if (pc == '*') {
			/* collapse consecutive '*'s into one */
			do {
				p++;
			} while (*p == '*');

			/* a trailing '*' matches whatever remains of 's' */
			if (*p == '\0')
				return (1);

			/* try the rest of the pattern at every suffix */
			for (; *s != '\0'; s++) {
				if (fmd_strmatch(s, p) != 0)
					return (1);
			}

			return (0);
		}

		if (pc != *s)
			return (0);

		p++;
		s++;
	}
}
+
int
fmd_nvl_class_match(fmd_hdl_t *hdl, nvlist_t *nvl, const char *pattern)
{
	/*
	 * Return nonzero when the event's FM_CLASS string matches
	 * 'pattern' ('*' wildcards -- see fmd_strmatch()).  A NULL nvl
	 * or a missing class member never matches.
	 */
	char *class;

	return (nvl != NULL &&
	    nvlist_lookup_string(nvl, FM_CLASS, &class) == 0 &&
	    fmd_strmatch(class, pattern));
}
+
+nvlist_t *
+fmd_nvl_alloc(fmd_hdl_t *hdl, int flags)
+{
+ nvlist_t *nvl = NULL;
+
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+ return (NULL);
+
+ return (nvl);
+}
+
+
+/*
+ * ZED Agent specific APIs
+ */
+
fmd_hdl_t *
fmd_module_hdl(const char *name)
{
	/*
	 * Map a well-known agent name to its statically allocated module
	 * instance; returns NULL for any other name.
	 */
	if (strcmp(name, "zfs-retire") == 0)
		return ((fmd_hdl_t *)&zfs_retire_module);
	if (strcmp(name, "zfs-diagnosis") == 0)
		return ((fmd_hdl_t *)&zfs_diagnosis_module);

	return (NULL);
}
+
+boolean_t
+fmd_module_initialized(fmd_hdl_t *hdl)
+{
+ fmd_module_t *mp = (fmd_module_t *)hdl;
+
+ return (mp->mod_info != NULL);
+}
+
+/*
+ * fmd_module_recv is called for each event that is received by
+ * the fault manager that has a class that matches one of the
+ * module's subscriptions.
+ */
void
fmd_module_recv(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class)
{
	/*
	 * Deliver one subscribed event to a module: synthesize a minimal
	 * fmd_event_t carrying the ereport timestamp (used by the SERD
	 * engines) and invoke the module's fmdo_recv() entry point.
	 */
	fmd_module_t *mp = (fmd_module_t *)hdl;
	const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
	fmd_event_t faux_event = {0};
	int64_t *tv;
	uint_t n;

	/*
	 * Will need to normalize this if we persistently store the case data
	 */
	/* presumably tv[0] = seconds and tv[1] = nanoseconds -- TODO confirm */
	if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0)
		faux_event.ev_hrt = tv[0] * NANOSEC + tv[1];
	else
		faux_event.ev_hrt = 0;

	ops->fmdo_recv(hdl, &faux_event, nvl, class);

	mp->mod_stats.ms_accepted.fmds_value.ui64++;

	/* TBD - should we initiate fm_module_gc() periodically? */
}
diff --git a/cmd/zed/agents/fmd_api.h b/cmd/zed/agents/fmd_api.h
new file mode 100644
index 000000000000..4f06fb244b7b
--- /dev/null
+++ b/cmd/zed/agents/fmd_api.h
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef _FMD_API_H
+#define _FMD_API_H
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+#include <libnvpair.h>
+#include <stdarg.h>
+#include <umem.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Fault Management Daemon Client Interfaces
+ */
+
+#define FMD_API_VERSION 5
+
+typedef struct fmd_hdl fmd_hdl_t;
+
+typedef struct fmd_timer {
+ timer_t ft_tid;
+ void *ft_arg;
+ fmd_hdl_t *ft_hdl;
+} fmd_timer_t;
+
+#define id_t fmd_timer_t *
+
+
+typedef struct fmd_event {
+ hrtime_t ev_hrt; /* event time used by SERD engines */
+} fmd_event_t;
+
+typedef struct fmd_case {
+ char ci_uuid[48]; /* uuid string for this case */
+ fmd_hdl_t *ci_mod; /* module that owns this case */
+ void *ci_data; /* data from fmd_case_setspecific() */
+ ushort_t ci_state; /* case state (see below) */
+ ushort_t ci_flags; /* case flags (see below) */
+ struct timeval ci_tv; /* time of original diagnosis */
+ void *ci_bufptr; /* case data serialization buffer */
+ size_t ci_bufsiz;
+} fmd_case_t;
+
+
+#define FMD_B_FALSE 0 /* false value for booleans as int */
+#define FMD_B_TRUE 1 /* true value for booleans as int */
+
+
+#define FMD_CASE_UNSOLVED 0 /* case is not yet solved (waiting) */
+#define FMD_CASE_SOLVED 1 /* case is solved (suspects added) */
+#define FMD_CASE_CLOSE_WAIT 2 /* case is executing fmdo_close() */
+#define FMD_CASE_CLOSED 3 /* case is closed (reconfig done) */
+#define FMD_CASE_REPAIRED 4 /* case is repaired */
+#define FMD_CASE_RESOLVED 5 /* case is resolved (can be freed) */
+
+#define FMD_CF_DIRTY 0x01 /* case is in need of checkpoint */
+#define FMD_CF_SOLVED 0x02 /* case has been solved */
+#define FMD_CF_ISOLATED 0x04 /* case has been isolated */
+#define FMD_CF_REPAIRED 0x08 /* case has been repaired */
+#define FMD_CF_RESOLVED 0x10 /* case has been resolved */
+
+
+#define FMD_TYPE_BOOL 0 /* int */
+#define FMD_TYPE_INT32 1 /* int32_t */
+#define FMD_TYPE_UINT32 2 /* uint32_t */
+#define FMD_TYPE_INT64 3 /* int64_t */
+#define FMD_TYPE_UINT64 4 /* uint64_t */
+#define FMD_TYPE_TIME 5 /* uint64_t */
+#define FMD_TYPE_SIZE 6 /* uint64_t */
+
+typedef struct fmd_prop {
+ const char *fmdp_name; /* property name */
+ uint_t fmdp_type; /* property type (see above) */
+ const char *fmdp_defv; /* default value */
+} fmd_prop_t;
+
+typedef struct fmd_stat {
+ char fmds_name[32]; /* statistic name */
+ uint_t fmds_type; /* statistic type (see above) */
+ char fmds_desc[64]; /* statistic description */
+ union {
+ int bool; /* FMD_TYPE_BOOL */
+ int32_t i32; /* FMD_TYPE_INT32 */
+ uint32_t ui32; /* FMD_TYPE_UINT32 */
+ int64_t i64; /* FMD_TYPE_INT64 */
+ uint64_t ui64; /* FMD_TYPE_UINT64 */
+ } fmds_value;
+} fmd_stat_t;
+
+typedef struct fmd_hdl_ops {
+ void (*fmdo_recv)(fmd_hdl_t *, fmd_event_t *, nvlist_t *, const char *);
+ void (*fmdo_timeout)(fmd_hdl_t *, id_t, void *);
+ void (*fmdo_close)(fmd_hdl_t *, fmd_case_t *);
+ void (*fmdo_stats)(fmd_hdl_t *);
+ void (*fmdo_gc)(fmd_hdl_t *);
+} fmd_hdl_ops_t;
+
+#define FMD_SEND_SUCCESS 0 /* fmdo_send queued event */
+#define FMD_SEND_FAILED 1 /* fmdo_send unrecoverable error */
+#define FMD_SEND_RETRY 2 /* fmdo_send requests retry */
+
+typedef struct fmd_hdl_info {
+ const char *fmdi_desc; /* fmd client description string */
+ const char *fmdi_vers; /* fmd client version string */
+ const fmd_hdl_ops_t *fmdi_ops; /* ops vector for client */
+ const fmd_prop_t *fmdi_props; /* array of configuration props */
+} fmd_hdl_info_t;
+
+extern int fmd_hdl_register(fmd_hdl_t *, int, const fmd_hdl_info_t *);
+extern void fmd_hdl_unregister(fmd_hdl_t *);
+
+extern void fmd_hdl_setspecific(fmd_hdl_t *, void *);
+extern void *fmd_hdl_getspecific(fmd_hdl_t *);
+
+#define FMD_SLEEP UMEM_NOFAIL
+
+extern void *fmd_hdl_alloc(fmd_hdl_t *, size_t, int);
+extern void *fmd_hdl_zalloc(fmd_hdl_t *, size_t, int);
+extern void fmd_hdl_free(fmd_hdl_t *, void *, size_t);
+
+extern char *fmd_hdl_strdup(fmd_hdl_t *, const char *, int);
+extern void fmd_hdl_strfree(fmd_hdl_t *, char *);
+
+extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list);
+extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...);
+
+extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *);
+extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *);
+
+#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */
+#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */
+
+extern fmd_stat_t *fmd_stat_create(fmd_hdl_t *, uint_t, uint_t, fmd_stat_t *);
+extern void fmd_stat_destroy(fmd_hdl_t *, uint_t, fmd_stat_t *);
+extern void fmd_stat_setstr(fmd_hdl_t *, fmd_stat_t *, const char *);
+
+extern fmd_case_t *fmd_case_open(fmd_hdl_t *, void *);
+extern void fmd_case_reset(fmd_hdl_t *, fmd_case_t *);
+extern void fmd_case_solve(fmd_hdl_t *, fmd_case_t *);
+extern void fmd_case_close(fmd_hdl_t *, fmd_case_t *);
+
+extern const char *fmd_case_uuid(fmd_hdl_t *, fmd_case_t *);
+extern fmd_case_t *fmd_case_uulookup(fmd_hdl_t *, const char *);
+extern void fmd_case_uuclose(fmd_hdl_t *, const char *);
+extern int fmd_case_uuclosed(fmd_hdl_t *, const char *);
+extern int fmd_case_uuisresolved(fmd_hdl_t *, const char *);
+extern void fmd_case_uuresolved(fmd_hdl_t *, const char *);
+
+extern int fmd_case_solved(fmd_hdl_t *, fmd_case_t *);
+extern int fmd_case_closed(fmd_hdl_t *, fmd_case_t *);
+
+extern void fmd_case_add_ereport(fmd_hdl_t *, fmd_case_t *, fmd_event_t *);
+extern void fmd_case_add_serd(fmd_hdl_t *, fmd_case_t *, const char *);
+extern void fmd_case_add_suspect(fmd_hdl_t *, fmd_case_t *, nvlist_t *);
+
+extern void fmd_case_setspecific(fmd_hdl_t *, fmd_case_t *, void *);
+extern void *fmd_case_getspecific(fmd_hdl_t *, fmd_case_t *);
+
+extern fmd_case_t *fmd_case_next(fmd_hdl_t *, fmd_case_t *);
+extern fmd_case_t *fmd_case_prev(fmd_hdl_t *, fmd_case_t *);
+
+extern void fmd_buf_create(fmd_hdl_t *, fmd_case_t *, const char *, size_t);
+extern void fmd_buf_destroy(fmd_hdl_t *, fmd_case_t *, const char *);
+extern void fmd_buf_read(fmd_hdl_t *, fmd_case_t *,
+ const char *, void *, size_t);
+extern void fmd_buf_write(fmd_hdl_t *, fmd_case_t *,
+ const char *, const void *, size_t);
+extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *);
+
+extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t);
+extern void fmd_serd_destroy(fmd_hdl_t *, const char *);
+extern int fmd_serd_exists(fmd_hdl_t *, const char *);
+extern void fmd_serd_reset(fmd_hdl_t *, const char *);
+extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *);
+extern int fmd_serd_fired(fmd_hdl_t *, const char *);
+extern int fmd_serd_empty(fmd_hdl_t *, const char *);
+
+extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t);
+extern void fmd_timer_remove(fmd_hdl_t *, id_t);
+
+extern nvlist_t *fmd_nvl_create_fault(fmd_hdl_t *,
+ const char *, uint8_t, nvlist_t *, nvlist_t *, nvlist_t *);
+
+extern int fmd_nvl_class_match(fmd_hdl_t *, nvlist_t *, const char *);
+
+#define FMD_HAS_FAULT_FRU 0
+#define FMD_HAS_FAULT_ASRU 1
+#define FMD_HAS_FAULT_RESOURCE 2
+
+extern void fmd_repair_fru(fmd_hdl_t *, const char *);
+extern int fmd_repair_asru(fmd_hdl_t *, const char *);
+
+extern nvlist_t *fmd_nvl_alloc(fmd_hdl_t *, int);
+extern nvlist_t *fmd_nvl_dup(fmd_hdl_t *, nvlist_t *, int);
+
+/*
+ * ZED Specific Interfaces
+ */
+
+extern fmd_hdl_t *fmd_module_hdl(const char *);
+extern boolean_t fmd_module_initialized(fmd_hdl_t *);
+extern void fmd_module_recv(fmd_hdl_t *, nvlist_t *, const char *);
+
+/* ZFS FMA Retire Agent */
+extern void _zfs_retire_init(fmd_hdl_t *);
+extern void _zfs_retire_fini(fmd_hdl_t *);
+
+/* ZFS FMA Diagnosis Engine */
+extern void _zfs_diagnosis_init(fmd_hdl_t *);
+extern void _zfs_diagnosis_fini(fmd_hdl_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FMD_API_H */
diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c
new file mode 100644
index 000000000000..d4ec37fb7691
--- /dev/null
+++ b/cmd/zed/agents/fmd_serd.c
@@ -0,0 +1,316 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/list.h>
+#include <sys/time.h>
+
+#include "fmd_api.h"
+#include "fmd_serd.h"
+#include "../zed_log.h"
+
+
+#define FMD_STR_BUCKETS 211
+
+
+#ifdef SERD_ENG_DEBUG
+#define serd_log_msg(fmt, ...) \
+ zed_log_msg(LOG_INFO, fmt, __VA_ARGS__)
+#else
+#define serd_log_msg(fmt, ...)
+#endif
+
+
+/*
+ * SERD Engine Backend
+ */
+
+/*
+ * Compute the delta between events in nanoseconds. To account for very old
+ * events which are replayed, we must handle the case where time is negative.
+ * We convert the hrtime_t's to unsigned 64-bit integers and then handle the
+ * case where 'old' is greater than 'new' (i.e. high-res time has wrapped).
+ */
+static hrtime_t
+fmd_event_delta(hrtime_t t1, hrtime_t t2)
+{
+	uint64_t old = t1;	/* earlier event timestamp */
+	uint64_t new = t2;	/* later event timestamp */
+
+	/* unsigned arithmetic yields the correct span when time has wrapped */
+	return (new >= old ? new - old : (UINT64_MAX - old) + new + 1);
+}
+
+/*
+ * Allocate and zero a new SERD engine named 'name' with threshold of 'n'
+ * events within time 't' (nanoseconds).  The engine starts DIRTY with an
+ * empty event list.
+ * NOTE(review): malloc() and strdup() results are not checked here.
+ */
+static fmd_serd_eng_t *
+fmd_serd_eng_alloc(const char *name, uint64_t n, hrtime_t t)
+{
+	fmd_serd_eng_t *sgp;
+
+	sgp = malloc(sizeof (fmd_serd_eng_t));
+	bzero(sgp, sizeof (fmd_serd_eng_t));
+
+	sgp->sg_name = strdup(name);
+	sgp->sg_flags = FMD_SERD_DIRTY;
+	sgp->sg_n = n;
+	sgp->sg_t = t;
+
+	list_create(&sgp->sg_list, sizeof (fmd_serd_elem_t),
+	    offsetof(fmd_serd_elem_t, se_list));
+
+	return (sgp);
+}
+
+/*
+ * Release an engine: discard all recorded events (via reset), then free
+ * the name, the list and the engine structure itself.
+ */
+static void
+fmd_serd_eng_free(fmd_serd_eng_t *sgp)
+{
+	fmd_serd_eng_reset(sgp);
+	free(sgp->sg_name);
+	list_destroy(&sgp->sg_list);
+	free(sgp);
+}
+
+/*
+ * sourced from fmd_string.c
+ */
+/*
+ * Iterative shift/xor string hash; the high nibble is folded back into the
+ * low bits so long keys stay well distributed.
+ */
+static ulong_t
+fmd_strhash(const char *key)
+{
+	ulong_t g, h = 0;	/* g: top-nibble scratch, h: running hash */
+	const char *p;
+
+	for (p = key; *p != '\0'; p++) {
+		h = (h << 4) + *p;
+
+		if ((g = (h & 0xf0000000)) != 0) {
+			h ^= (g >> 24);
+			h ^= g;
+		}
+	}
+
+	return (h);
+}
+
+/*
+ * Initialize a SERD engine hash with a fixed number of buckets
+ * (FMD_STR_BUCKETS, a prime).
+ * NOTE(review): the calloc() result is not checked.
+ */
+void
+fmd_serd_hash_create(fmd_serd_hash_t *shp)
+{
+	shp->sh_hashlen = FMD_STR_BUCKETS;
+	shp->sh_hash = calloc(shp->sh_hashlen, sizeof (void *));
+	shp->sh_count = 0;
+}
+
+/*
+ * Free every engine on every hash chain, then the bucket array itself,
+ * and finally zero the hash descriptor.
+ */
+void
+fmd_serd_hash_destroy(fmd_serd_hash_t *shp)
+{
+	fmd_serd_eng_t *sgp, *ngp;
+	uint_t i;
+
+	for (i = 0; i < shp->sh_hashlen; i++) {
+		for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = ngp) {
+			ngp = sgp->sg_next;	/* save link before freeing */
+			fmd_serd_eng_free(sgp);
+		}
+	}
+
+	free(shp->sh_hash);
+	bzero(shp, sizeof (fmd_serd_hash_t));
+}
+
+/*
+ * Invoke func(engine, arg) for every engine in the hash, in bucket order.
+ * 'func' must not insert or delete engines while the walk is in progress.
+ */
+void
+fmd_serd_hash_apply(fmd_serd_hash_t *shp, fmd_serd_eng_f *func, void *arg)
+{
+	fmd_serd_eng_t *sgp;
+	uint_t i;
+
+	for (i = 0; i < shp->sh_hashlen; i++) {
+		for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = sgp->sg_next)
+			func(sgp, arg);
+	}
+}
+
+/*
+ * Allocate a new engine and insert it at the head of its hash chain.
+ * No duplicate check is performed; callers are expected to consult
+ * fmd_serd_eng_lookup() first.
+ */
+fmd_serd_eng_t *
+fmd_serd_eng_insert(fmd_serd_hash_t *shp, const char *name,
+    uint_t n, hrtime_t t)
+{
+	uint_t h = fmd_strhash(name) % shp->sh_hashlen;
+	fmd_serd_eng_t *sgp = fmd_serd_eng_alloc(name, n, t);
+
+	serd_log_msg(" SERD Engine: inserting %s N %d T %llu",
+	    name, (int)n, (long long unsigned)t);
+
+	sgp->sg_next = shp->sh_hash[h];
+	shp->sh_hash[h] = sgp;
+	shp->sh_count++;
+
+	return (sgp);
+}
+
+/*
+ * Return the engine named 'name', or NULL if it does not exist.
+ */
+fmd_serd_eng_t *
+fmd_serd_eng_lookup(fmd_serd_hash_t *shp, const char *name)
+{
+	uint_t h = fmd_strhash(name) % shp->sh_hashlen;
+	fmd_serd_eng_t *sgp;
+
+	for (sgp = shp->sh_hash[h]; sgp != NULL; sgp = sgp->sg_next) {
+		if (strcmp(name, sgp->sg_name) == 0)
+			return (sgp);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Unlink and free the engine named 'name', if present.  A miss is a no-op.
+ */
+void
+fmd_serd_eng_delete(fmd_serd_hash_t *shp, const char *name)
+{
+	uint_t h = fmd_strhash(name) % shp->sh_hashlen;
+	fmd_serd_eng_t *sgp, **pp = &shp->sh_hash[h];
+
+	serd_log_msg(" SERD Engine: deleting %s", name);
+
+	/* walk the chain keeping pp aimed at the link that points to sgp */
+	for (sgp = *pp; sgp != NULL; sgp = sgp->sg_next) {
+		if (strcmp(sgp->sg_name, name) != 0)
+			pp = &sgp->sg_next;
+		else
+			break;
+	}
+
+	if (sgp != NULL) {
+		*pp = sgp->sg_next;
+		fmd_serd_eng_free(sgp);
+		assert(shp->sh_count != 0);
+		shp->sh_count--;
+	}
+}
+
+/*
+ * Remove one recorded event from the engine's list, decrement the count
+ * and free it.
+ */
+static void
+fmd_serd_eng_discard(fmd_serd_eng_t *sgp, fmd_serd_elem_t *sep)
+{
+	list_remove(&sgp->sg_list, sep);
+	sgp->sg_count--;
+
+	serd_log_msg(" SERD Engine: discarding %s, %d remaining",
+	    sgp->sg_name, (int)sgp->sg_count);
+
+	free(sep);
+}
+
+/*
+ * Record one event at hrtime 'hrt'.  Returns FMD_B_TRUE exactly once, at
+ * the moment the engine fires (>= sg_n events within sg_t nanoseconds);
+ * FMD_B_FALSE otherwise, including while the engine remains fired.
+ */
+int
+fmd_serd_eng_record(fmd_serd_eng_t *sgp, hrtime_t hrt)
+{
+	fmd_serd_elem_t *sep, *oep;
+
+	/*
+	 * If the fired flag is already set, return false and discard the
+	 * event. This means that the caller will only see the engine "fire"
+	 * once until fmd_serd_eng_reset() is called. The fmd_serd_eng_fired()
+	 * function can also be used in combination with fmd_serd_eng_record().
+	 */
+	if (sgp->sg_flags & FMD_SERD_FIRED) {
+		serd_log_msg(" SERD Engine: record %s already fired!",
+		    sgp->sg_name);
+		return (FMD_B_FALSE);
+	}
+
+	/* cap the list at sg_n events, dropping the oldest (tail) first */
+	while (sgp->sg_count >= sgp->sg_n)
+		fmd_serd_eng_discard(sgp, list_tail(&sgp->sg_list));
+
+	/* NOTE(review): this malloc() result is not checked */
+	sep = malloc(sizeof (fmd_serd_elem_t));
+	sep->se_hrt = hrt;
+
+	/* newest events live at the head of the list */
+	list_insert_head(&sgp->sg_list, sep);
+	sgp->sg_count++;
+
+	serd_log_msg(" SERD Engine: recording %s of %d (%llu)",
+	    sgp->sg_name, (int)sgp->sg_count, (long long unsigned)hrt);
+
+	/*
+	 * Pick up the oldest element pointer for comparison to 'sep'. We must
+	 * do this after adding 'sep' because 'oep' and 'sep' can be the same.
+	 */
+	oep = list_tail(&sgp->sg_list);
+
+	if (sgp->sg_count >= sgp->sg_n &&
+	    fmd_event_delta(oep->se_hrt, sep->se_hrt) <= sgp->sg_t) {
+		sgp->sg_flags |= FMD_SERD_FIRED | FMD_SERD_DIRTY;
+		serd_log_msg(" SERD Engine: fired %s", sgp->sg_name);
+		return (FMD_B_TRUE);
+	}
+
+	sgp->sg_flags |= FMD_SERD_DIRTY;
+	return (FMD_B_FALSE);
+}
+
+/* Return non-zero if the engine has fired (threshold exceeded). */
+int
+fmd_serd_eng_fired(fmd_serd_eng_t *sgp)
+{
+	return (sgp->sg_flags & FMD_SERD_FIRED);
+}
+
+/* Return non-zero if the engine has no recorded events. */
+int
+fmd_serd_eng_empty(fmd_serd_eng_t *sgp)
+{
+	return (sgp->sg_count == 0);
+}
+
+/*
+ * Discard all recorded events and clear the FIRED flag so the engine can
+ * fire again; the engine is marked DIRTY.
+ */
+void
+fmd_serd_eng_reset(fmd_serd_eng_t *sgp)
+{
+	serd_log_msg(" SERD Engine: resetting %s", sgp->sg_name);
+
+	while (sgp->sg_count != 0)
+		fmd_serd_eng_discard(sgp, list_head(&sgp->sg_list));
+
+	sgp->sg_flags &= ~FMD_SERD_FIRED;
+	sgp->sg_flags |= FMD_SERD_DIRTY;
+}
+
+/*
+ * Garbage-collect events older than T relative to the newest event.
+ *
+ * NOTE(review): fmd_serd_eng_record() inserts new elements at the list
+ * head, so list_head() here is the *newest* event and list_tail() the
+ * oldest.  This scan starts at the head and breaks on the first element
+ * with se_hrt >= hrt, which the newest element always satisfies, so the
+ * loop appears to break immediately and collect nothing.  Confirm the
+ * intended ordering against the original illumos fmd_serd.c.
+ */
+void
+fmd_serd_eng_gc(fmd_serd_eng_t *sgp)
+{
+	fmd_serd_elem_t *sep, *nep;
+	hrtime_t hrt;
+
+	if (sgp->sg_count == 0 || (sgp->sg_flags & FMD_SERD_FIRED))
+		return; /* no garbage collection needed if empty or fired */
+
+	sep = list_head(&sgp->sg_list);
+	if (sep == NULL)
+		return;
+
+	/* cutoff: anything older than (newest - T) is stale */
+	hrt = sep->se_hrt - sgp->sg_t;
+
+	for (sep = list_head(&sgp->sg_list); sep != NULL; sep = nep) {
+		if (sep->se_hrt >= hrt)
+			break; /* sep and subsequent events are all within T */
+
+		nep = list_next(&sgp->sg_list, sep);
+		fmd_serd_eng_discard(sgp, sep);
+		sgp->sg_flags |= FMD_SERD_DIRTY;
+	}
+}
diff --git a/cmd/zed/agents/fmd_serd.h b/cmd/zed/agents/fmd_serd.h
new file mode 100644
index 000000000000..c35c9acc7785
--- /dev/null
+++ b/cmd/zed/agents/fmd_serd.h
@@ -0,0 +1,86 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef _FMD_SERD_H
+#define _FMD_SERD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/list.h>
+#include <sys/time.h>
+
+typedef struct fmd_serd_elem {
+ list_node_t se_list; /* linked list forward/back pointers */
+ hrtime_t se_hrt; /* upper bound on event hrtime */
+} fmd_serd_elem_t;
+
+typedef struct fmd_serd_eng {
+ char *sg_name; /* string name for this engine */
+ struct fmd_serd_eng *sg_next; /* next engine on hash chain */
+ list_t sg_list; /* list of fmd_serd_elem_t's */
+ uint_t sg_count; /* count of events in sg_list */
+ uint_t sg_flags; /* engine flags (see below) */
+ uint_t sg_n; /* engine N parameter (event count) */
+ hrtime_t sg_t; /* engine T parameter (nanoseconds) */
+} fmd_serd_eng_t;
+
+#define FMD_SERD_FIRED 0x1 /* error rate has exceeded threshold */
+#define FMD_SERD_DIRTY 0x2 /* engine needs to be checkpointed */
+
+typedef void fmd_serd_eng_f(fmd_serd_eng_t *, void *);
+
+typedef struct fmd_serd_hash {
+ fmd_serd_eng_t **sh_hash; /* hash bucket array for buffers */
+ uint_t sh_hashlen; /* length of hash bucket array */
+ uint_t sh_count; /* count of engines in hash */
+} fmd_serd_hash_t;
+
+extern void fmd_serd_hash_create(fmd_serd_hash_t *);
+extern void fmd_serd_hash_destroy(fmd_serd_hash_t *);
+extern void fmd_serd_hash_apply(fmd_serd_hash_t *, fmd_serd_eng_f *, void *);
+
+extern fmd_serd_eng_t *fmd_serd_eng_insert(fmd_serd_hash_t *,
+ const char *, uint32_t, hrtime_t);
+
+extern fmd_serd_eng_t *fmd_serd_eng_lookup(fmd_serd_hash_t *, const char *);
+extern void fmd_serd_eng_delete(fmd_serd_hash_t *, const char *);
+
+extern int fmd_serd_eng_record(fmd_serd_eng_t *, hrtime_t);
+extern int fmd_serd_eng_fired(fmd_serd_eng_t *);
+extern int fmd_serd_eng_empty(fmd_serd_eng_t *);
+
+extern void fmd_serd_eng_reset(fmd_serd_eng_t *);
+extern void fmd_serd_eng_gc(fmd_serd_eng_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FMD_SERD_H */
diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c
new file mode 100644
index 000000000000..006e0ab99f47
--- /dev/null
+++ b/cmd/zed/agents/zfs_agents.c
@@ -0,0 +1,422 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>
+ */
+
+#include <libnvpair.h>
+#include <libzfs.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/list.h>
+#include <sys/time.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/fs/zfs.h>
+#include <pthread.h>
+#include <unistd.h>
+
+#include "zfs_agents.h"
+#include "fmd_api.h"
+#include "../zed_log.h"
+
+/*
+ * agent dispatch code
+ */
+
+static pthread_mutex_t agent_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t agent_cond = PTHREAD_COND_INITIALIZER;
+static list_t agent_events; /* list of pending events */
+static int agent_exiting;
+
+typedef struct agent_event {
+ char ae_class[64];
+ char ae_subclass[32];
+ nvlist_t *ae_nvl;
+ list_node_t ae_node;
+} agent_event_t;
+
+pthread_t g_agents_tid;
+
+libzfs_handle_t *g_zfs_hdl;
+
+/* guid search data */
+typedef enum device_type {
+ DEVICE_TYPE_L2ARC, /* l2arc device */
+ DEVICE_TYPE_SPARE, /* spare device */
+ DEVICE_TYPE_PRIMARY /* any primary pool storage device */
+} device_type_t;
+
+typedef struct guid_search {
+ uint64_t gs_pool_guid;
+ uint64_t gs_vdev_guid;
+ char *gs_devid;
+ device_type_t gs_vdev_type;
+ uint64_t gs_vdev_expandtime; /* vdev expansion time */
+} guid_search_t;
+
+/*
+ * Walks the vdev tree recursively looking for a matching devid.
+ * Returns B_TRUE as soon as a matching device is found, B_FALSE otherwise.
+ */
+static boolean_t
+zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg)
+{
+	guid_search_t *gsp = arg;
+	char *path = NULL;
+	uint_t c, children;
+	nvlist_t **child;
+
+	/*
+	 * First iterate over any children.
+	 */
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
+				gsp->gs_vdev_type = DEVICE_TYPE_PRIMARY;
+				return (B_TRUE);
+			}
+		}
+	}
+	/*
+	 * Iterate over any spares and cache devices.
+	 *
+	 * Fix: a match found under ZPOOL_CONFIG_SPARES is a spare device
+	 * and a match under ZPOOL_CONFIG_L2CACHE is an l2arc device; the
+	 * two labels were previously swapped, so removed spares were
+	 * reported as l2arc devices and vice versa.
+	 */
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
+				gsp->gs_vdev_type = DEVICE_TYPE_SPARE;
+				return (B_TRUE);
+			}
+		}
+	}
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			if (zfs_agent_iter_vdev(zhp, child[c], gsp)) {
+				gsp->gs_vdev_type = DEVICE_TYPE_L2ARC;
+				return (B_TRUE);
+			}
+		}
+	}
+	/*
+	 * On a devid match, grab the vdev guid and expansion time, if any.
+	 */
+	if (gsp->gs_devid != NULL &&
+	    (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) &&
+	    (strcmp(gsp->gs_devid, path) == 0)) {
+		(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
+		    &gsp->gs_vdev_guid);
+		(void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME,
+		    &gsp->gs_vdev_expandtime);
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * zpool_iter() callback: search one pool's vdev tree for a devid match
+ * (see zfs_agent_iter_vdev()) and, on a hit, record the pool guid.
+ * Always closes 'zhp'; returns non-zero (stopping iteration) once a
+ * matching vdev guid has been found.
+ */
+static int
+zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg)
+{
+	guid_search_t *gsp = arg;
+	nvlist_t *config, *nvl;
+
+	/*
+	 * For each vdev in this pool, look for a match by devid
+	 */
+	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
+		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+		    &nvl) == 0) {
+			(void) zfs_agent_iter_vdev(zhp, nvl, gsp);
+		}
+	}
+	/*
+	 * if a match was found then grab the pool guid
+	 */
+	if (gsp->gs_vdev_guid) {
+		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+		    &gsp->gs_pool_guid);
+	}
+
+	zpool_close(zhp);
+	return (gsp->gs_vdev_guid != 0);
+}
+
+/*
+ * Duplicate 'nvl' and queue it for the agent consumer thread; the caller
+ * retains ownership of 'nvl'.  EC_DEV_REMOVE/ESC_DISK disk events are
+ * remapped here to a synthesized 'resource.fs.zfs.removed' event for the
+ * benefit of the diagnosis engine.
+ */
+void
+zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl)
+{
+	agent_event_t *event;
+
+	if (subclass == NULL)
+		subclass = "";
+
+	event = malloc(sizeof (agent_event_t));
+	if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) {
+		if (event)
+			free(event);
+		return;
+	}
+
+	if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) {
+		class = EC_ZFS;
+		subclass = ESC_ZFS_VDEV_CHECK;
+	}
+
+	/*
+	 * On ZFS on Linux, we don't get the expected FM_RESOURCE_REMOVED
+	 * ereport from vdev_disk layer after a hot unplug. Fortunately we
+	 * get a EC_DEV_REMOVE from our disk monitor and it is a suitable
+	 * proxy so we remap it here for the benefit of the diagnosis engine.
+	 */
+	if ((strcmp(class, EC_DEV_REMOVE) == 0) &&
+	    (strcmp(subclass, ESC_DISK) == 0) &&
+	    (nvlist_exists(nvl, ZFS_EV_VDEV_GUID) ||
+	    nvlist_exists(nvl, DEV_IDENTIFIER))) {
+		nvlist_t *payload = event->ae_nvl;
+		struct timeval tv;
+		int64_t tod[2];
+		uint64_t pool_guid = 0, vdev_guid = 0;
+		guid_search_t search = { 0 };
+		device_type_t devtype = DEVICE_TYPE_PRIMARY;
+
+		class = "resource.fs.zfs.removed";
+		subclass = "";
+
+		(void) nvlist_add_string(payload, FM_CLASS, class);
+		(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
+		(void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid);
+
+		(void) gettimeofday(&tv, NULL);
+		tod[0] = tv.tv_sec;
+		tod[1] = tv.tv_usec;
+		(void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2);
+
+		/*
+		 * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or
+		 * ZFS_EV_POOL_GUID may be missing so find them.
+		 */
+		(void) nvlist_lookup_string(nvl, DEV_IDENTIFIER,
+		    &search.gs_devid);
+		(void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search);
+		pool_guid = search.gs_pool_guid;
+		vdev_guid = search.gs_vdev_guid;
+		devtype = search.gs_vdev_type;
+
+		/*
+		 * We want to avoid reporting "remove" events coming from
+		 * libudev for VDEVs which were expanded recently (10s) and
+		 * avoid activating spares in response to partitions being
+		 * deleted and created in rapid succession.
+		 */
+		if (search.gs_vdev_expandtime != 0 &&
+		    search.gs_vdev_expandtime + 10 > tv.tv_sec) {
+			zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' "
+			    "for recently expanded device '%s'", EC_DEV_REMOVE,
+			    search.gs_devid);
+			/*
+			 * Fix: this early exit previously leaked both the
+			 * duplicated nvlist and the event structure, since
+			 * the event is only consumed (and freed) once it is
+			 * placed on the agent_events list below.
+			 */
+			nvlist_free(event->ae_nvl);
+			free(event);
+			goto out;
+		}
+
+		(void) nvlist_add_uint64(payload,
+		    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid);
+		(void) nvlist_add_uint64(payload,
+		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid);
+		switch (devtype) {
+		case DEVICE_TYPE_L2ARC:
+			(void) nvlist_add_string(payload,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+			    VDEV_TYPE_L2CACHE);
+			break;
+		case DEVICE_TYPE_SPARE:
+			(void) nvlist_add_string(payload,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_SPARE);
+			break;
+		case DEVICE_TYPE_PRIMARY:
+			(void) nvlist_add_string(payload,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_DISK);
+			break;
+		}
+
+		zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'",
+		    EC_DEV_REMOVE, class);
+	}
+
+	(void) strlcpy(event->ae_class, class, sizeof (event->ae_class));
+	(void) strlcpy(event->ae_subclass, subclass,
+	    sizeof (event->ae_subclass));
+
+	(void) pthread_mutex_lock(&agent_lock);
+	list_insert_tail(&agent_events, event);
+	(void) pthread_mutex_unlock(&agent_lock);
+
+out:
+	(void) pthread_cond_signal(&agent_cond);
+}
+
+/*
+ * Fan one event out to each subscribing agent: the diagnosis engine, the
+ * retire agent and the SLM module.  Subscriptions are matched on the event
+ * class string; an event may be delivered to more than one agent.
+ */
+static void
+zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl)
+{
+	/*
+	 * The diagnosis engine subscribes to the following events.
+	 * On illumos these subscriptions reside in:
+	 * 	/usr/lib/fm/fmd/plugins/zfs-diagnosis.conf
+	 */
+	if (strstr(class, "ereport.fs.zfs.") != NULL ||
+	    strstr(class, "resource.fs.zfs.") != NULL ||
+	    strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 ||
+	    strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 ||
+	    strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) {
+		fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class);
+	}
+
+	/*
+	 * The retire agent subscribes to the following events.
+	 * On illumos these subscriptions reside in:
+	 * 	/usr/lib/fm/fmd/plugins/zfs-retire.conf
+	 *
+	 * NOTE: faults events come directly from our diagnosis engine
+	 * and will not pass through the zfs kernel module.
+	 */
+	if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 ||
+	    strcmp(class, "resource.fs.zfs.removed") == 0 ||
+	    strcmp(class, "resource.fs.zfs.statechange") == 0 ||
+	    strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) {
+		fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class);
+	}
+
+	/*
+	 * The SLM module only consumes disk events and vdev check events
+	 *
+	 * NOTE: disk events come directly from disk monitor and will
+	 * not pass through the zfs kernel module.
+	 */
+	if (strstr(class, "EC_dev_") != NULL ||
+	    strcmp(class, EC_ZFS) == 0) {
+		(void) zfs_slm_event(class, subclass, nvl);
+	}
+}
+
+/*
+ * Events are consumed and dispatched from this thread
+ * An agent can also post an event so event list lock
+ * is not held when calling an agent.
+ * One event is consumed at a time.
+ */
+/*
+ * Consumer loop: blocks on agent_cond until an event is queued or
+ * agent_exiting is set, then dispatches one event at a time with
+ * agent_lock dropped (agents may themselves post new events).
+ */
+static void *
+zfs_agent_consumer_thread(void *arg)
+{
+	for (;;) {
+		agent_event_t *event;
+
+		(void) pthread_mutex_lock(&agent_lock);
+
+		/* wait for an event to show up */
+		while (!agent_exiting && list_is_empty(&agent_events))
+			(void) pthread_cond_wait(&agent_cond, &agent_lock);
+
+		if (agent_exiting) {
+			(void) pthread_mutex_unlock(&agent_lock);
+			zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: "
+			    "exiting");
+			return (NULL);
+		}
+
+		if ((event = (list_head(&agent_events))) != NULL) {
+			list_remove(&agent_events, event);
+
+			/* unlock before dispatching so agents can post */
+			(void) pthread_mutex_unlock(&agent_lock);
+
+			/* dispatch to all event subscribers */
+			zfs_agent_dispatch(event->ae_class, event->ae_subclass,
+			    event->ae_nvl);
+
+			nvlist_free(event->ae_nvl);
+			free(event);
+			continue;
+		}
+
+		(void) pthread_mutex_unlock(&agent_lock);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Start the agent subsystem: the SLM module, the diagnosis engine and
+ * retire agent fmd modules, the pending-event list, and the consumer
+ * thread.  Any initialization failure is fatal (zed_log_die()).
+ */
+void
+zfs_agent_init(libzfs_handle_t *zfs_hdl)
+{
+	fmd_hdl_t *hdl;
+
+	g_zfs_hdl = zfs_hdl;
+
+	if (zfs_slm_init() != 0)
+		zed_log_die("Failed to initialize zfs slm");
+	zed_log_msg(LOG_INFO, "Add Agent: init");
+
+	hdl = fmd_module_hdl("zfs-diagnosis");
+	_zfs_diagnosis_init(hdl);
+	if (!fmd_module_initialized(hdl))
+		zed_log_die("Failed to initialize zfs diagnosis");
+
+	hdl = fmd_module_hdl("zfs-retire");
+	_zfs_retire_init(hdl);
+	if (!fmd_module_initialized(hdl))
+		zed_log_die("Failed to initialize zfs retire");
+
+	list_create(&agent_events, sizeof (agent_event_t),
+	    offsetof(struct agent_event, ae_node));
+
+	if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread,
+	    NULL) != 0) {
+		list_destroy(&agent_events);
+		zed_log_die("Failed to initialize agents");
+	}
+}
+
+/*
+ * Shut down the agents: stop the consumer thread, drain pending events,
+ * and tear down the retire agent, diagnosis engine and SLM module.
+ */
+void
+zfs_agent_fini(void)
+{
+	fmd_hdl_t *hdl;
+	agent_event_t *event;
+
+	/*
+	 * Fix: set the exit flag and signal while holding agent_lock so the
+	 * consumer thread cannot test agent_exiting and then block in
+	 * pthread_cond_wait() between the store and the signal (a classic
+	 * lost-wakeup race that would hang the pthread_join() below).
+	 */
+	(void) pthread_mutex_lock(&agent_lock);
+	agent_exiting = 1;
+	(void) pthread_cond_signal(&agent_cond);
+	(void) pthread_mutex_unlock(&agent_lock);
+
+	/* wait for zfs_enum_pools thread to complete */
+	(void) pthread_join(g_agents_tid, NULL);
+
+	/* drain any pending events */
+	while ((event = (list_head(&agent_events))) != NULL) {
+		list_remove(&agent_events, event);
+		nvlist_free(event->ae_nvl);
+		free(event);
+	}
+
+	list_destroy(&agent_events);
+
+	if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) {
+		_zfs_retire_fini(hdl);
+		fmd_hdl_unregister(hdl);
+	}
+	if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) {
+		_zfs_diagnosis_fini(hdl);
+		fmd_hdl_unregister(hdl);
+	}
+
+	zed_log_msg(LOG_INFO, "Add Agent: fini");
+	zfs_slm_fini();
+
+	g_zfs_hdl = NULL;
+}
diff --git a/cmd/zed/agents/zfs_agents.h b/cmd/zed/agents/zfs_agents.h
new file mode 100644
index 000000000000..d1a459139b1e
--- /dev/null
+++ b/cmd/zed/agents/zfs_agents.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License Version 1.0 (CDDL-1.0).
+ * You can obtain a copy of the license from the top-level file
+ * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
+ * You may not use this file except in compliance with the license.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#ifndef ZFS_AGENTS_H
+#define ZFS_AGENTS_H
+
+#include <libzfs.h>
+#include <libnvpair.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Agent abstraction presented to ZED
+ */
+extern void zfs_agent_init(libzfs_handle_t *);
+extern void zfs_agent_fini(void);
+extern void zfs_agent_post_event(const char *, const char *, nvlist_t *);
+
+/*
+ * ZFS Sysevent Linkable Module (SLM)
+ */
+extern int zfs_slm_init(void);
+extern void zfs_slm_fini(void);
+extern void zfs_slm_event(const char *, const char *, nvlist_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !ZFS_AGENTS_H */
diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c
new file mode 100644
index 000000000000..0b27f6702ee8
--- /dev/null
+++ b/cmd/zed/agents/zfs_diagnosis.c
@@ -0,0 +1,981 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2016, Intel Corporation.
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <strings.h>
+#include <libuutil.h>
+#include <libzfs.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/fs/zfs.h>
+
+#include "zfs_agents.h"
+#include "fmd_api.h"
+
+/*
+ * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This
+ * #define reserves enough space for two 64-bit hex values plus the length of
+ * the longest string.
+ */
+#define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum"))
+
+/*
+ * On-disk case structure. This must maintain backwards compatibility with
+ * previous versions of the DE. By default, any members appended to the end
+ * will be filled with zeros if they don't exist in a previous version.
+ */
+typedef struct zfs_case_data {
+ uint64_t zc_version;
+ uint64_t zc_ena;
+ uint64_t zc_pool_guid;
+ uint64_t zc_vdev_guid;
+ int zc_pool_state;
+ char zc_serd_checksum[MAX_SERDLEN];
+ char zc_serd_io[MAX_SERDLEN];
+ int zc_has_remove_timer;
+} zfs_case_data_t;
+
+/*
+ * Time-of-day
+ */
+typedef struct er_timeval {
+ uint64_t ertv_sec;
+ uint64_t ertv_nsec;
+} er_timeval_t;
+
+/*
+ * In-core case structure.
+ */
+typedef struct zfs_case {
+ boolean_t zc_present;
+ uint32_t zc_version;
+ zfs_case_data_t zc_data;
+ fmd_case_t *zc_case;
+ uu_list_node_t zc_node;
+ id_t zc_remove_timer;
+ char *zc_fru;
+ er_timeval_t zc_when;
+} zfs_case_t;
+
+#define CASE_DATA "data"
+#define CASE_FRU "fru"
+#define CASE_DATA_VERSION_INITIAL 1
+#define CASE_DATA_VERSION_SERD 2
+
+typedef struct zfs_de_stats {
+ fmd_stat_t old_drops;
+ fmd_stat_t dev_drops;
+ fmd_stat_t vdev_drops;
+ fmd_stat_t import_drops;
+ fmd_stat_t resource_drops;
+} zfs_de_stats_t;
+
+zfs_de_stats_t zfs_stats = {
+ { "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" },
+ { "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"},
+ { "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"},
+ { "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" },
+ { "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
+};
+
+static hrtime_t zfs_remove_timeout;
+
+uu_list_pool_t *zfs_case_pool;
+uu_list_t *zfs_cases;
+
+#define ZFS_MAKE_RSRC(type) \
+ FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type
+#define ZFS_MAKE_EREPORT(type) \
+ FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
+
+/*
+ * Write out the persistent representation of an active case.
+ */
+static void
+zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp)
+{
+	/*
+	 * Only the on-disk format version is stamped here; 'hdl' is unused.
+	 * NOTE(review): despite the "persistent representation" comment
+	 * above, no fmd_buf_write() call occurs in this function —
+	 * presumably case data is kept in-core only in ZED; confirm.
+	 */
+	zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD;
+}
+
+/*
+ * Read back the persistent representation of an active case.
+ */
+static zfs_case_t *
+zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
+{
+	zfs_case_t *zcp;
+
+	zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP);
+	zcp->zc_case = cp;
+
+	fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data,
+	    sizeof (zcp->zc_data));
+
+	/* reject case data written by a newer, unknown format version */
+	if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) {
+		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
+		return (NULL);
+	}
+
+	/*
+	 * fmd_buf_read() will have already zeroed out the remainder of the
+	 * buffer, so we don't have to do anything special if the version
+	 * doesn't include the SERD engine name.
+	 */
+
+	/* re-arm the vdev-remove timer if it was pending at save time */
+	if (zcp->zc_data.zc_has_remove_timer)
+		zcp->zc_remove_timer = fmd_timer_install(hdl, zcp,
+		    NULL, zfs_remove_timeout);
+
+	uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool);
+	(void) uu_list_insert_before(zfs_cases, NULL, zcp);
+
+	fmd_case_setspecific(hdl, cp, zcp);
+
+	return (zcp);
+}
+
+/*
+ * Iterate over any active cases. If any cases are associated with a pool or
+ * vdev which is no longer present on the system, close the associated case.
+ */
+/*
+ * Mark every case associated with this (pool, vdev) pair as present and
+ * stamp it with the pool load time; recurses over children, l2cache
+ * devices and spares.
+ */
+static void
+zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded)
+{
+	uint64_t vdev_guid = 0;
+	uint_t c, children;
+	nvlist_t **child;
+	zfs_case_t *zcp;
+
+	(void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid);
+
+	/*
+	 * Mark any cases associated with this (pool, vdev) pair.
+	 */
+	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+	    zcp = uu_list_next(zfs_cases, zcp)) {
+		if (zcp->zc_data.zc_pool_guid == pool_guid &&
+		    zcp->zc_data.zc_vdev_guid == vdev_guid) {
+			zcp->zc_present = B_TRUE;
+			zcp->zc_when = *loaded;
+		}
+	}
+
+	/*
+	 * Iterate over all children.
+	 */
+	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child,
+	    &children) == 0) {
+		for (c = 0; c < children; c++)
+			zfs_mark_vdev(pool_guid, child[c], loaded);
+	}
+
+	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child,
+	    &children) == 0) {
+		for (c = 0; c < children; c++)
+			zfs_mark_vdev(pool_guid, child[c], loaded);
+	}
+
+	if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child,
+	    &children) == 0) {
+		for (c = 0; c < children; c++)
+			zfs_mark_vdev(pool_guid, child[c], loaded);
+	}
+}
+
+/*ARGSUSED*/
+/*
+ * zpool_iter() callback: mark all cases belonging to this pool (and, via
+ * zfs_mark_vdev(), to its vdevs) as present, recording the pool load
+ * time.  Always closes 'zhp'; returns 0 on success or -1 if the pool
+ * config or vdev tree is unavailable.
+ */
+static int
+zfs_mark_pool(zpool_handle_t *zhp, void *unused)
+{
+	zfs_case_t *zcp;
+	uint64_t pool_guid;
+	uint64_t *tod;
+	er_timeval_t loaded = { 0 };
+	nvlist_t *config, *vd;
+	uint_t nelem = 0;
+	int ret;
+
+	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
+	/*
+	 * Mark any cases associated with just this pool.
+	 */
+	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+	    zcp = uu_list_next(zfs_cases, zcp)) {
+		if (zcp->zc_data.zc_pool_guid == pool_guid &&
+		    zcp->zc_data.zc_vdev_guid == 0)
+			zcp->zc_present = B_TRUE;
+	}
+
+	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+		zpool_close(zhp);
+		return (-1);
+	}
+
+	/* pool load time is stored as a [sec, nsec] uint64 pair */
+	(void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
+	    &tod, &nelem);
+	if (nelem == 2) {
+		loaded.ertv_sec = tod[0];
+		loaded.ertv_nsec = tod[1];
+		for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+		    zcp = uu_list_next(zfs_cases, zcp)) {
+			if (zcp->zc_data.zc_pool_guid == pool_guid &&
+			    zcp->zc_data.zc_vdev_guid == 0) {
+				zcp->zc_when = loaded;
+			}
+		}
+	}
+
+	ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd);
+	if (ret) {
+		zpool_close(zhp);
+		return (-1);
+	}
+
+	zfs_mark_vdev(pool_guid, vd, &loaded);
+
+	zpool_close(zhp);
+
+	return (0);
+}
+
+struct load_time_arg {
+	uint64_t lt_guid;	/* pool guid to search for */
+	er_timeval_t *lt_time;	/* out: time the pool was loaded */
+	boolean_t lt_found;	/* set once the pool has been located */
+};
+
+/*
+ * zpool_iter() callback: locate the pool whose guid matches lt_guid and
+ * record its load time in *lt_time.  Always closes 'zhp'; returns 0 to
+ * continue iterating (matching is signalled through lt_found).
+ */
+static int
+zpool_find_load_time(zpool_handle_t *zhp, void *arg)
+{
+	struct load_time_arg *lta = arg;
+	uint64_t pool_guid;
+	uint64_t *tod;
+	nvlist_t *config;
+	uint_t nelem;
+
+	/* already found on an earlier iteration; nothing to do */
+	if (lta->lt_found) {
+		zpool_close(zhp);
+		return (0);
+	}
+
+	pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL);
+	if (pool_guid != lta->lt_guid) {
+		zpool_close(zhp);
+		return (0);
+	}
+
+	if ((config = zpool_get_config(zhp, NULL)) == NULL) {
+		zpool_close(zhp);
+		return (-1);
+	}
+
+	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME,
+	    &tod, &nelem) == 0 && nelem == 2) {
+		lta->lt_found = B_TRUE;
+		lta->lt_time->ertv_sec = tod[0];
+		lta->lt_time->ertv_nsec = tod[1];
+	}
+
+	zpool_close(zhp);
+
+	return (0);
+}
+
+/*
+ * Close any active cases whose pool or vdev is no longer present on the
+ * system (mark-and-sweep over the in-core case list).
+ */
+static void
+zfs_purge_cases(fmd_hdl_t *hdl)
+{
+	zfs_case_t *zcp;
+	uu_list_walk_t *walk;
+	libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
+
+	/*
+	 * There is no way to open a pool by GUID, or lookup a vdev by GUID. No
+	 * matter what we do, we're going to have to stomach an O(vdevs * cases)
+	 * algorithm. In reality, both quantities are likely so small that
+	 * neither will matter. Given that iterating over pools is more
+	 * expensive than iterating over the in-memory case list, we opt for a
+	 * 'present' flag in each case that starts off cleared. We then iterate
+	 * over all pools, marking those that are still present, and removing
+	 * those that aren't found.
+	 *
+	 * Note that we could also construct an FMRI and rely on
+	 * fmd_nvl_fmri_present(), but this would end up doing the same search.
+	 */
+
+	/*
+	 * Mark the cases as not present.
+	 */
+	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+	    zcp = uu_list_next(zfs_cases, zcp))
+		zcp->zc_present = B_FALSE;
+
+	/*
+	 * Iterate over all pools and mark the pools and vdevs found. If this
+	 * fails (most probably because we're out of memory), then don't close
+	 * any of the cases and we cannot be sure they are accurate.
+	 */
+	if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0)
+		return;
+
+	/*
+	 * Remove those cases which were not found.
+	 */
+	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
+	while ((zcp = uu_list_walk_next(walk)) != NULL) {
+		if (!zcp->zc_present)
+			fmd_case_close(hdl, zcp->zc_case);
+	}
+	uu_list_walk_end(walk);
+}
+
+/*
+ * Construct the name of a serd engine given the pool/vdev GUID and type (io or
+ * checksum).  'buf' must be at least MAX_SERDLEN bytes; longer names are
+ * silently truncated by snprintf().
+ */
+static void
+zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
+    const char *type)
+{
+	(void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s",
+	    (long long unsigned int)pool_guid,
+	    (long long unsigned int)vdev_guid, type);
+}
+
+/*
+ * Solve a given ZFS case. This first checks to make sure the diagnosis is
+ * still valid, as well as cleaning up any pending timer associated with the
+ * case.
+ */
+static void
+zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname,
+    boolean_t checkunusable)
+{
+	nvlist_t *detector, *fault;
+	boolean_t serialize;
+	nvlist_t *fru = NULL;	/* no FRU information available here */
+	fmd_hdl_debug(hdl, "solving fault '%s'", faultname);
+
+	/*
+	 * Construct the detector from the case data. The detector is in the
+	 * ZFS scheme, and is either the pool or the vdev, depending on whether
+	 * this is a vdev or pool fault.
+	 */
+	detector = fmd_nvl_alloc(hdl, FMD_SLEEP);
+
+	(void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0);
+	(void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS);
+	(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL,
+	    zcp->zc_data.zc_pool_guid);
+	/* A non-zero vdev GUID marks this as a vdev-level (not pool) fault. */
+	if (zcp->zc_data.zc_vdev_guid != 0) {
+		(void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV,
+		    zcp->zc_data.zc_vdev_guid);
+	}
+
+	/* 100% certainty; the detector doubles as the resource argument. */
+	fault = fmd_nvl_create_fault(hdl, faultname, 100, detector,
+	    fru, detector);
+	fmd_case_add_suspect(hdl, zcp->zc_case, fault);
+
+	nvlist_free(fru);
+
+	fmd_case_solve(hdl, zcp->zc_case);
+
+	/*
+	 * Cancel the outstanding removal timer, if any; clearing the flag
+	 * changes the persisted case data, so re-serialize it.
+	 */
+	serialize = B_FALSE;
+	if (zcp->zc_data.zc_has_remove_timer) {
+		fmd_timer_remove(hdl, zcp->zc_remove_timer);
+		zcp->zc_data.zc_has_remove_timer = 0;
+		serialize = B_TRUE;
+	}
+	if (serialize)
+		zfs_case_serialize(hdl, zcp);
+
+	nvlist_free(detector);
+}
+
+/*
+ * Return B_TRUE if timestamp *a is strictly earlier than *b.
+ */
+static boolean_t
+timeval_earlier(er_timeval_t *a, er_timeval_t *b)
+{
+	return (a->ertv_sec < b->ertv_sec ||
+	    (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec));
+}
+
+/*
+ * Extract the timestamp of an ereport from its FM_EREPORT_TIME payload
+ * (a { seconds, nanoseconds } int64 pair).  If the payload is missing or
+ * malformed, set both fields to UINT64_MAX so that timeval_earlier()
+ * treats the ereport as newer than any pool load time (i.e. never
+ * discarded as stale).
+ */
+/*ARGSUSED*/
+static void
+zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when)
+{
+	int64_t *tod;
+	uint_t nelem;
+
+	if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod,
+	    &nelem) == 0 && nelem == 2) {
+		/* Signed payload copied into unsigned fields; assumes
+		 * non-negative wall-clock values. */
+		when->ertv_sec = tod[0];
+		when->ertv_nsec = tod[1];
+	} else {
+		when->ertv_sec = when->ertv_nsec = UINT64_MAX;
+	}
+}
+
+/*
+ * Main fmd entry point.  Receives every subscribed event (sysevents,
+ * resources, and ereports), maintains the per-pool/per-vdev case list,
+ * feeds I/O and checksum ereports into SERD engines, and solves cases
+ * when a diagnosis threshold is reached.
+ */
+/*ARGSUSED*/
+static void
+zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
+{
+	zfs_case_t *zcp, *dcp;
+	int32_t pool_state;
+	uint64_t ena, pool_guid, vdev_guid;
+	er_timeval_t pool_load;
+	er_timeval_t er_when;
+	nvlist_t *detector;
+	boolean_t pool_found = B_FALSE;
+	boolean_t isresource;
+	char *type;
+
+	/*
+	 * We subscribe to notifications for vdev or pool removal. In these
+	 * cases, there may be cases that no longer apply. Purge any cases
+	 * that no longer apply.
+	 */
+	if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) {
+		fmd_hdl_debug(hdl, "purging orphaned cases from %s",
+		    strrchr(class, '.') + 1);
+		zfs_purge_cases(hdl);
+		zfs_stats.resource_drops.fmds_value.ui64++;
+		return;
+	}
+
+	isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*");
+
+	if (isresource) {
+		/*
+		 * For resources, we don't have a normal payload.
+		 */
+		if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+		    &vdev_guid) != 0)
+			pool_state = SPA_LOAD_OPEN;
+		else
+			pool_state = SPA_LOAD_NONE;
+		detector = NULL;
+	} else {
+		(void) nvlist_lookup_nvlist(nvl,
+		    FM_EREPORT_DETECTOR, &detector);
+		(void) nvlist_lookup_int32(nvl,
+		    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state);
+	}
+
+	/*
+	 * We also ignore all ereports generated during an import of a pool,
+	 * since the only possible fault (.pool) would result in import failure,
+	 * and hence no persistent fault. Some day we may want to do something
+	 * with these ereports, so we continue generating them internally.
+	 */
+	if (pool_state == SPA_LOAD_IMPORT) {
+		zfs_stats.import_drops.fmds_value.ui64++;
+		fmd_hdl_debug(hdl, "ignoring '%s' during import", class);
+		return;
+	}
+
+	/*
+	 * Device I/O errors are ignored during pool open.
+	 */
+	if (pool_state == SPA_LOAD_OPEN &&
+	    (fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) {
+		fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class);
+		zfs_stats.dev_drops.fmds_value.ui64++;
+		return;
+	}
+
+	/*
+	 * We ignore ereports for anything except disks and files.
+	 */
+	if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+	    &type) == 0) {
+		if (strcmp(type, VDEV_TYPE_DISK) != 0 &&
+		    strcmp(type, VDEV_TYPE_FILE) != 0) {
+			zfs_stats.vdev_drops.fmds_value.ui64++;
+			return;
+		}
+	}
+
+	/*
+	 * Determine if this ereport corresponds to an open case.
+	 * Each vdev or pool can have a single case.
+	 *
+	 * NOTE(review): the pool GUID lookup result is discarded; if the
+	 * payload ever lacks FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid
+	 * is used uninitialized below -- confirm all subscribed classes
+	 * carry this member.
+	 */
+	(void) nvlist_lookup_uint64(nvl,
+	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid);
+	if (nvlist_lookup_uint64(nvl,
+	    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
+		vdev_guid = 0;
+	if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0)
+		ena = 0;
+
+	zfs_ereport_when(hdl, nvl, &er_when);
+
+	/* Find a matching case; also remember the pool's load time. */
+	for (zcp = uu_list_first(zfs_cases); zcp != NULL;
+	    zcp = uu_list_next(zfs_cases, zcp)) {
+		if (zcp->zc_data.zc_pool_guid == pool_guid) {
+			pool_found = B_TRUE;
+			pool_load = zcp->zc_when;
+		}
+		if (zcp->zc_data.zc_vdev_guid == vdev_guid)
+			break;
+	}
+
+	/*
+	 * Avoid falsely accusing a pool of being faulty. Do so by
+	 * not replaying ereports that were generated prior to the
+	 * current import. If the failure that generated them was
+	 * transient because the device was actually removed but we
+	 * didn't receive the normal asynchronous notification, we
+	 * don't want to mark it as faulted and potentially panic. If
+	 * there is still a problem we'd expect not to be able to
+	 * import the pool, or that new ereports will be generated
+	 * once the pool is used.
+	 */
+	if (pool_found && timeval_earlier(&er_when, &pool_load)) {
+		fmd_hdl_debug(hdl, "ignoring pool %llx, "
+		    "ereport time %lld.%lld, pool load time = %lld.%lld",
+		    pool_guid, er_when.ertv_sec, er_when.ertv_nsec,
+		    pool_load.ertv_sec, pool_load.ertv_nsec);
+		zfs_stats.old_drops.fmds_value.ui64++;
+		return;
+	}
+
+	if (!pool_found) {
+		/*
+		 * Haven't yet seen this pool, but same situation
+		 * may apply.
+		 */
+		libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl);
+		struct load_time_arg la;
+
+		la.lt_guid = pool_guid;
+		la.lt_time = &pool_load;
+		la.lt_found = B_FALSE;
+
+		if (zhdl != NULL &&
+		    zpool_iter(zhdl, zpool_find_load_time, &la) == 0 &&
+		    la.lt_found == B_TRUE) {
+			pool_found = B_TRUE;
+
+			if (timeval_earlier(&er_when, &pool_load)) {
+				fmd_hdl_debug(hdl, "ignoring pool %llx, "
+				    "ereport time %lld.%lld, "
+				    "pool load time = %lld.%lld",
+				    pool_guid, er_when.ertv_sec,
+				    er_when.ertv_nsec, pool_load.ertv_sec,
+				    pool_load.ertv_nsec);
+				zfs_stats.old_drops.fmds_value.ui64++;
+				return;
+			}
+		}
+	}
+
+	if (zcp == NULL) {
+		fmd_case_t *cs;
+		zfs_case_data_t data = { 0 };
+
+		/*
+		 * If this is one of our 'fake' resource ereports, and there is
+		 * no case open, simply discard it.
+		 */
+		if (isresource) {
+			zfs_stats.resource_drops.fmds_value.ui64++;
+			/*
+			 * NOTE(review): format string is missing the closing
+			 * quote after %s.
+			 */
+			fmd_hdl_debug(hdl, "discarding '%s for vdev %llu",
+			    class, vdev_guid);
+			return;
+		}
+
+		/*
+		 * Skip tracking some ereports
+		 */
+		if (strcmp(class,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
+		    strcmp(class,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
+		    strcmp(class,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
+			zfs_stats.resource_drops.fmds_value.ui64++;
+			return;
+		}
+
+		/*
+		 * Open a new case.
+		 */
+		cs = fmd_case_open(hdl, NULL);
+
+		fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'",
+		    vdev_guid, class);
+
+		/*
+		 * Initialize the case buffer. To commonize code, we actually
+		 * create the buffer with existing data, and then call
+		 * zfs_case_unserialize() to instantiate the in-core structure.
+		 */
+		fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t));
+
+		data.zc_version = CASE_DATA_VERSION_SERD;
+		data.zc_ena = ena;
+		data.zc_pool_guid = pool_guid;
+		data.zc_vdev_guid = vdev_guid;
+		data.zc_pool_state = (int)pool_state;
+
+		fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data));
+
+		zcp = zfs_case_unserialize(hdl, cs);
+		assert(zcp != NULL);
+		if (pool_found)
+			zcp->zc_when = pool_load;
+	}
+
+	if (isresource) {
+		fmd_hdl_debug(hdl, "resource event '%s'", class);
+
+		if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) {
+			/*
+			 * The 'resource.fs.zfs.autoreplace' event indicates
+			 * that the pool was loaded with the 'autoreplace'
+			 * property set. In this case, any pending device
+			 * failures should be ignored, as the asynchronous
+			 * autoreplace handling will take care of them.
+			 */
+			fmd_case_close(hdl, zcp->zc_case);
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) {
+			/*
+			 * The 'resource.fs.zfs.removed' event indicates that
+			 * device removal was detected, and the device was
+			 * closed asynchronously. If this is the case, we
+			 * assume that any recent I/O errors were due to the
+			 * device removal, not any fault of the device itself.
+			 * We reset the SERD engine, and cancel any pending
+			 * timers.
+			 */
+			if (zcp->zc_data.zc_has_remove_timer) {
+				fmd_timer_remove(hdl, zcp->zc_remove_timer);
+				zcp->zc_data.zc_has_remove_timer = 0;
+				zfs_case_serialize(hdl, zcp);
+			}
+			if (zcp->zc_data.zc_serd_io[0] != '\0')
+				fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io);
+			if (zcp->zc_data.zc_serd_checksum[0] != '\0')
+				fmd_serd_reset(hdl,
+				    zcp->zc_data.zc_serd_checksum);
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
+			uint64_t state = 0;
+
+			/*
+			 * zcp cannot be NULL here (handled above); the test
+			 * is kept as a defensive check.
+			 */
+			if (zcp != NULL &&
+			    nvlist_lookup_uint64(nvl,
+			    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 &&
+			    state == VDEV_STATE_HEALTHY) {
+				fmd_hdl_debug(hdl, "closing case after a "
+				    "device statechange to healthy");
+				fmd_case_close(hdl, zcp->zc_case);
+			}
+		}
+		zfs_stats.resource_drops.fmds_value.ui64++;
+		return;
+	}
+
+	/*
+	 * Associate the ereport with this case.
+	 */
+	fmd_case_add_ereport(hdl, zcp->zc_case, ep);
+
+	/*
+	 * Don't do anything else if this case is already solved.
+	 */
+	if (fmd_case_solved(hdl, zcp->zc_case))
+		return;
+
+	fmd_hdl_debug(hdl, "error event '%s'", class);
+
+	/*
+	 * Determine if we should solve the case and generate a fault. We solve
+	 * a case if:
+	 *
+	 * 	a. A pool failed to open (ereport.fs.zfs.pool)
+	 * 	b. A device failed to open (ereport.fs.zfs.pool) while a pool
+	 * 	   was up and running.
+	 *
+	 * We may see a series of ereports associated with a pool open, all
+	 * chained together by the same ENA. If the pool open succeeds, then
+	 * we'll see no further ereports. To detect when a pool open has
+	 * succeeded, we associate a timer with the event. When it expires, we
+	 * close the case.
+	 */
+	if (fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) {
+		/*
+		 * Pool level fault. Before solving the case, go through and
+		 * close any open device cases that may be pending.
+		 */
+		for (dcp = uu_list_first(zfs_cases); dcp != NULL;
+		    dcp = uu_list_next(zfs_cases, dcp)) {
+			if (dcp->zc_data.zc_pool_guid ==
+			    zcp->zc_data.zc_pool_guid &&
+			    dcp->zc_data.zc_vdev_guid != 0)
+				fmd_case_close(hdl, dcp->zc_case);
+		}
+
+		zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE);
+	} else if (fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) {
+		/*
+		 * Pool level fault for reading the intent logs.
+		 */
+		zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE);
+	} else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) {
+		/*
+		 * Device fault.
+		 */
+		zfs_case_solve(hdl, zcp, "fault.fs.zfs.device", B_TRUE);
+	} else if (fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
+	    fmd_nvl_class_match(hdl, nvl,
+	    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
+		char *failmode = NULL;
+		boolean_t checkremove = B_FALSE;
+
+		/*
+		 * If this is a checksum or I/O error, then toss it into the
+		 * appropriate SERD engine and check to see if it has fired.
+		 * Ideally, we want to do something more sophisticated,
+		 * (persistent errors for a single data block, etc). For now,
+		 * a single SERD engine is sufficient.
+		 */
+		if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) {
+			/* Lazily create the engine on the first I/O error. */
+			if (zcp->zc_data.zc_serd_io[0] == '\0') {
+				zfs_serd_name(zcp->zc_data.zc_serd_io,
+				    pool_guid, vdev_guid, "io");
+				fmd_serd_create(hdl, zcp->zc_data.zc_serd_io,
+				    fmd_prop_get_int32(hdl, "io_N"),
+				    fmd_prop_get_int64(hdl, "io_T"));
+				zfs_case_serialize(hdl, zcp);
+			}
+			if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
+				checkremove = B_TRUE;
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
+			if (zcp->zc_data.zc_serd_checksum[0] == '\0') {
+				zfs_serd_name(zcp->zc_data.zc_serd_checksum,
+				    pool_guid, vdev_guid, "checksum");
+				fmd_serd_create(hdl,
+				    zcp->zc_data.zc_serd_checksum,
+				    fmd_prop_get_int32(hdl, "checksum_N"),
+				    fmd_prop_get_int64(hdl, "checksum_T"));
+				zfs_case_serialize(hdl, zcp);
+			}
+			if (fmd_serd_record(hdl,
+			    zcp->zc_data.zc_serd_checksum, ep)) {
+				zfs_case_solve(hdl, zcp,
+				    "fault.fs.zfs.vdev.checksum", B_FALSE);
+			}
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) &&
+		    (nvlist_lookup_string(nvl,
+		    FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) &&
+		    failmode != NULL) {
+			if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE,
+			    strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) {
+				zfs_case_solve(hdl, zcp,
+				    "fault.fs.zfs.io_failure_continue",
+				    B_FALSE);
+			} else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT,
+			    strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) {
+				zfs_case_solve(hdl, zcp,
+				    "fault.fs.zfs.io_failure_wait", B_FALSE);
+			}
+		} else if (fmd_nvl_class_match(hdl, nvl,
+		    ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
+#ifndef __linux__
+			/* This causes an unexpected fault diagnosis on linux */
+			checkremove = B_TRUE;
+#endif
+		}
+
+		/*
+		 * Because I/O errors may be due to device removal, we postpone
+		 * any diagnosis until we're sure that we aren't about to
+		 * receive a 'resource.fs.zfs.removed' event.
+		 */
+		if (checkremove) {
+			if (zcp->zc_data.zc_has_remove_timer)
+				fmd_timer_remove(hdl, zcp->zc_remove_timer);
+			zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL,
+			    zfs_remove_timeout);
+			if (!zcp->zc_data.zc_has_remove_timer) {
+				zcp->zc_data.zc_has_remove_timer = 1;
+				zfs_case_serialize(hdl, zcp);
+			}
+		}
+	}
+}
+
+/*
+ * The timeout is fired when we diagnosed an I/O error, and it was not due to
+ * device removal (which would cause the timeout to be cancelled).
+ */
+/* ARGSUSED */
+static void
+zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data)
+{
+	zfs_case_t *zcp = data;
+
+	/* Only act on the case's current timer; stale IDs are ignored. */
+	if (id == zcp->zc_remove_timer)
+		zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io", B_FALSE);
+}
+
+/*
+ * The specified case has been closed and any case-specific
+ * data structures should be deallocated: SERD engines, the pending
+ * removal timer, and the in-core zfs_case_t itself.
+ */
+static void
+zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
+{
+	zfs_case_t *zcp = fmd_case_getspecific(hdl, cs);
+
+	/* Empty engine-name strings mean the engine was never created. */
+	if (zcp->zc_data.zc_serd_checksum[0] != '\0')
+		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
+	if (zcp->zc_data.zc_serd_io[0] != '\0')
+		fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
+	if (zcp->zc_data.zc_has_remove_timer)
+		fmd_timer_remove(hdl, zcp->zc_remove_timer);
+
+	uu_list_remove(zfs_cases, zcp);
+	uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
+	fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
+}
+
+/*
+ * We use the fmd gc entry point to look for old cases that no longer apply.
+ * This allows us to keep our set of case data small in a long running system.
+ */
+static void
+zfs_fm_gc(fmd_hdl_t *hdl)
+{
+	zfs_purge_cases(hdl);
+}
+
+/* Module entry points registered with fmd via fmd_hdl_register(). */
+static const fmd_hdl_ops_t fmd_ops = {
+	zfs_fm_recv,	/* fmdo_recv */
+	zfs_fm_timeout,	/* fmdo_timeout */
+	zfs_fm_close,	/* fmdo_close */
+	NULL,		/* fmdo_stats */
+	zfs_fm_gc,	/* fmdo_gc */
+};
+
+/*
+ * Tunables: SERD thresholds (N errors within time T) for the io and
+ * checksum engines, plus the grace period before an undiagnosed I/O
+ * error is promoted to a fault (see zfs_fm_timeout()).
+ */
+static const fmd_prop_t fmd_props[] = {
+	{ "checksum_N", FMD_TYPE_UINT32, "10" },
+	{ "checksum_T", FMD_TYPE_TIME, "10min" },
+	{ "io_N", FMD_TYPE_UINT32, "10" },
+	{ "io_T", FMD_TYPE_TIME, "10min" },
+	{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
+	{ NULL, 0, NULL }
+};
+
+static const fmd_hdl_info_t fmd_info = {
+	"ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props
+};
+
+/*
+ * Module initialization: set up libzfs, the case list, and register with
+ * fmd.  On any failure everything acquired so far is torn down and the
+ * module silently declines to register.
+ */
+void
+_zfs_diagnosis_init(fmd_hdl_t *hdl)
+{
+	libzfs_handle_t *zhdl;
+
+	if ((zhdl = libzfs_init()) == NULL)
+		return;
+
+	if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool",
+	    sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node),
+	    NULL, UU_LIST_POOL_DEBUG)) == NULL) {
+		libzfs_fini(zhdl);
+		return;
+	}
+
+	if ((zfs_cases = uu_list_create(zfs_case_pool, NULL,
+	    UU_LIST_DEBUG)) == NULL) {
+		uu_list_pool_destroy(zfs_case_pool);
+		libzfs_fini(zhdl);
+		return;
+	}
+
+	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
+		uu_list_destroy(zfs_cases);
+		uu_list_pool_destroy(zfs_case_pool);
+		libzfs_fini(zhdl);
+		return;
+	}
+
+	/* Stash the libzfs handle for zfs_purge_cases()/zfs_fm_recv(). */
+	fmd_hdl_setspecific(hdl, zhdl);
+
+	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
+	    sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
+
+	zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
+}
+
+/*
+ * Module teardown: free every in-core case (without closing the fmd
+ * cases themselves), destroy the list and its pool, and shut down libzfs.
+ */
+void
+_zfs_diagnosis_fini(fmd_hdl_t *hdl)
+{
+	zfs_case_t *zcp;
+	uu_list_walk_t *walk;
+	libzfs_handle_t *zhdl;
+
+	/*
+	 * Remove all active cases.
+	 */
+	walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST);
+	while ((zcp = uu_list_walk_next(walk)) != NULL) {
+		fmd_hdl_debug(hdl, "removing case ena %llu",
+		    (long long unsigned)zcp->zc_data.zc_ena);
+		uu_list_remove(zfs_cases, zcp);
+		uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool);
+		fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
+	}
+	uu_list_walk_end(walk);
+
+	uu_list_destroy(zfs_cases);
+	uu_list_pool_destroy(zfs_case_pool);
+
+	zhdl = fmd_hdl_getspecific(hdl);
+	libzfs_fini(zhdl);
+}
diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c
new file mode 100644
index 000000000000..8d0a3b420086
--- /dev/null
+++ b/cmd/zed/agents/zfs_mod.c
@@ -0,0 +1,956 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2016, 2017, Intel Corporation.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ */
+
+/*
+ * ZFS syseventd module.
+ *
+ * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c
+ *
+ * The purpose of this module is to identify when devices are added to the
+ * system, and appropriately online or replace the affected vdevs.
+ *
+ * When a device is added to the system:
+ *
+ * 1. Search for any vdevs whose devid matches that of the newly added
+ * device.
+ *
+ * 2. If no vdevs are found, then search for any vdevs whose udev path
+ * matches that of the new device.
+ *
+ * 3. If no vdevs match by either method, then ignore the event.
+ *
+ * 4. Attempt to online the device with a flag to indicate that it should
+ * be unspared when resilvering completes. If this succeeds, then the
+ * same device was inserted and we should continue normally.
+ *
+ * 5. If the pool does not have the 'autoreplace' property set, attempt to
+ * online the device again without the unspare flag, which will
+ * generate a FMA fault.
+ *
+ * 6. If the pool has the 'autoreplace' property set, and the matching vdev
+ * is a whole disk, then label the new disk and attempt a 'zpool
+ * replace'.
+ *
+ * The module responds to EC_DEV_ADD events. The special ESC_ZFS_VDEV_CHECK
+ * event indicates that a device failed to open during pool load, but the
+ * autoreplace property was set. In this case, we deferred the associated
+ * FMA fault until our module had a chance to process the autoreplace logic.
+ * If the device could not be replaced, then the second online attempt will
+ * trigger the FMA fault that we skipped earlier.
+ *
+ * ZFS on Linux porting notes:
+ * Linux udev provides a disk insert for both the disk and the partition
+ *
+ */
+
+#include <ctype.h>
+#include <fcntl.h>
+#include <libnvpair.h>
+#include <libzfs.h>
+#include <libzutil.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+#include <sys/list.h>
+#include <sys/sunddi.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
+#include <thread_pool.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <errno.h>
+#include "zfs_agents.h"
+#include "../zed_log.h"
+
+#define DEV_BYID_PATH "/dev/disk/by-id/"
+#define DEV_BYPATH_PATH "/dev/disk/by-path/"
+#define DEV_BYVDEV_PATH "/dev/disk/by-vdev/"
+
+/* Callback applied to each vdev matched by zfs_iter_vdev() and friends. */
+typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);
+
+/* Module-global state shared by the event handlers in this file. */
+libzfs_handle_t *g_zfshdl;
+list_t g_pool_list;	/* list of unavailable pools at initialization */
+list_t g_device_list;	/* list of disks with asynchronous label request */
+tpool_t *g_tpool;
+boolean_t g_enumeration_done;
+pthread_t g_zfs_tid;	/* zfs_enum_pools() thread */
+
+/* Pool kept open because it was unavailable at startup (see g_pool_list). */
+typedef struct unavailpool {
+	zpool_handle_t	*uap_zhp;
+	list_node_t	uap_node;
+} unavailpool_t;
+
+/* Disk awaiting its post-label partition-add event (see g_device_list). */
+typedef struct pendingdev {
+	char		pd_physpath[128];
+	list_node_t	pd_node;
+} pendingdev_t;
+
+/*
+ * Return the state (vdev_state_t) of the pool's top-level vdev, taken
+ * from the VDEV_STATS array of the root of the config's vdev tree.
+ */
+static int
+zfs_toplevel_state(zpool_handle_t *zhp)
+{
+	nvlist_t *nvroot;
+	vdev_stat_t *vs;
+	unsigned int c;
+
+	verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
+	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) == 0);
+	return (vs->vs_state);
+}
+
+/*
+ * zpool_iter() callback used during initialization: pools whose top-level
+ * vdev is below DEGRADED (i.e. unavailable) keep their handle open and are
+ * queued on the caller-supplied list for later processing; healthy pools
+ * are closed immediately.  Always returns 0 so iteration continues.
+ */
+static int
+zfs_unavail_pool(zpool_handle_t *zhp, void *data)
+{
+	zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)",
+	    zpool_get_name(zhp), (int)zfs_toplevel_state(zhp));
+
+	if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) {
+		unavailpool_t *uap;
+		uap = malloc(sizeof (unavailpool_t));
+		if (uap == NULL) {
+			/*
+			 * Out of memory: log and drop this pool rather than
+			 * dereference NULL; the handle must still be closed.
+			 */
+			zed_log_msg(LOG_WARNING, "zfs_unavail_pool: "
+			    "out of memory");
+			zpool_close(zhp);
+			return (0);
+		}
+		uap->uap_zhp = zhp;
+		list_insert_tail((list_t *)data, uap);
+	} else {
+		zpool_close(zhp);
+	}
+	return (0);
+}
+
+/*
+ * Two stage replace on Linux
+ * since we get disk notifications
+ * we can wait for partitioned disk slice to show up!
+ *
+ * First stage tags the disk, initiates async partitioning, and returns
+ * Second stage finds the tag and proceeds to ZFS labeling/replace
+ *
+ * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach
+ *
+ * 1. physical match with no fs, no partition
+ * tag it top, partition disk
+ *
+ * 2. physical match again, see partition and tag
+ *
+ */
+
+/*
+ * The device associated with the given vdev (either by devid or physical path)
+ * has been added to the system. If 'isdisk' is set, then we only attempt a
+ * replacement if it's a whole disk. This also implies that we should label the
+ * disk first.
+ *
+ * First, we attempt to online the device (making sure to undo any spare
+ * operation when finished). If this succeeds, then we're done. If it fails,
+ * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
+ * but that the label was not what we expected. If the 'autoreplace' property
+ * is enabled, then we relabel the disk (if specified), and attempt a 'zpool
+ * replace'. If the online is successful, but the new state is something else
+ * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
+ * race, and we should avoid attempting to relabel the disk.
+ *
+ * Also can arrive here from a ESC_ZFS_VDEV_CHECK event
+ */
+static void
+zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
+{
+	char *path;
+	vdev_state_t newstate;
+	nvlist_t *nvroot, *newvd;
+	pendingdev_t *device;
+	uint64_t wholedisk = 0ULL;
+	uint64_t offline = 0ULL;
+	uint64_t guid = 0ULL;
+	char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
+	char rawpath[PATH_MAX], fullpath[PATH_MAX];
+	char devpath[PATH_MAX];
+	int ret;
+	boolean_t is_dm = B_FALSE;
+	boolean_t is_sd = B_FALSE;
+	uint_t c;
+	vdev_stat_t *vs;
+
+	/* A vdev without a path cannot be matched to the new device. */
+	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
+		return;
+
+	/* Skip healthy disks */
+	verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) == 0);
+	if (vs->vs_state == VDEV_STATE_HEALTHY) {
+		zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.",
+		    __func__, path);
+		return;
+	}
+
+	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
+	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+	    &enc_sysfs_path);
+	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
+	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
+	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);
+
+	if (offline)
+		return;  /* don't intervene if it was taken offline */
+
+	is_dm = zfs_dev_is_dm(path);
+	zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
+	    " wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path,
+	    physpath ? physpath : "NULL", wholedisk, is_dm ? "is" : "not",
+	    (long long unsigned int)guid);
+
+	/*
+	 * The VDEV guid is preferred for identification (gets passed in path)
+	 */
+	if (guid != 0) {
+		(void) snprintf(fullpath, sizeof (fullpath), "%llu",
+		    (long long unsigned int)guid);
+	} else {
+		/*
+		 * otherwise use path sans partition suffix for whole disks
+		 */
+		(void) strlcpy(fullpath, path, sizeof (fullpath));
+		if (wholedisk) {
+			char *spath = zfs_strip_partition(fullpath);
+			if (!spath) {
+				zed_log_msg(LOG_INFO, "%s: Can't alloc",
+				    __func__);
+				return;
+			}
+
+			(void) strlcpy(fullpath, spath, sizeof (fullpath));
+			free(spath);
+		}
+	}
+
+	/*
+	 * Attempt to online the device.
+	 */
+	if (zpool_vdev_online(zhp, fullpath,
+	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
+	    (newstate == VDEV_STATE_HEALTHY ||
+	    newstate == VDEV_STATE_DEGRADED)) {
+		zed_log_msg(LOG_INFO, "  zpool_vdev_online: vdev %s is %s",
+		    fullpath, (newstate == VDEV_STATE_HEALTHY) ?
+		    "HEALTHY" : "DEGRADED");
+		return;
+	}
+
+	/*
+	 * vdev_id alias rule for using scsi_debug devices (FMA automated
+	 * testing)
+	 */
+	if (physpath != NULL && strcmp("scsidebug", physpath) == 0)
+		is_sd = B_TRUE;
+
+	/*
+	 * If the pool doesn't have the autoreplace property set, then use
+	 * vdev online to trigger a FMA fault by posting an ereport.
+	 */
+	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
+	    !(wholedisk || is_dm) || (physpath == NULL)) {
+		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
+		    &newstate);
+		zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
+		    "not a whole disk for '%s'", fullpath);
+		return;
+	}
+
+	/*
+	 * Convert physical path into its current device node.  Rawpath
+	 * needs to be /dev/disk/by-vdev for a scsi_debug device since
+	 * /dev/disk/by-path will not be present.
+	 */
+	(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
+	    is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
+
+	if (realpath(rawpath, devpath) == NULL && !is_dm) {
+		zed_log_msg(LOG_INFO, "  realpath: %s failed (%s)",
+		    rawpath, strerror(errno));
+
+		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
+		    &newstate);
+
+		zed_log_msg(LOG_INFO, "  zpool_vdev_online: %s FORCEFAULT (%s)",
+		    fullpath, libzfs_error_description(g_zfshdl));
+		return;
+	}
+
+	/* Only autoreplace bad disks */
+	if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
+	    (vs->vs_state != VDEV_STATE_FAULTED) &&
+	    (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
+		return;
+	}
+
+	/*
+	 * NOTE(review): lookup result is unchecked; if "new_devid" is
+	 * absent, new_devid stays NULL and is later passed to
+	 * nvlist_add_string()/snprintf() -- confirm callers always set it.
+	 */
+	nvlist_lookup_string(vdev, "new_devid", &new_devid);
+
+	if (is_dm) {
+		/* Don't label device mapper or multipath disks. */
+	} else if (!labeled) {
+		/*
+		 * we're auto-replacing a raw disk, so label it first
+		 */
+		char *leafname;
+
+		/*
+		 * If this is a request to label a whole disk, then attempt to
+		 * write out the label. Before we can label the disk, we need
+		 * to map the physical string that was matched on to the under
+		 * lying device node.
+		 *
+		 * If any part of this process fails, then do a force online
+		 * to trigger a ZFS fault for the device (and any hot spare
+		 * replacement).
+		 */
+		leafname = strrchr(devpath, '/') + 1;
+
+		/*
+		 * If this is a request to label a whole disk, then attempt to
+		 * write out the label.
+		 */
+		if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) {
+			zed_log_msg(LOG_INFO, "  zpool_label_disk: could not "
+			    "label '%s' (%s)", leafname,
+			    libzfs_error_description(g_zfshdl));
+
+			(void) zpool_vdev_online(zhp, fullpath,
+			    ZFS_ONLINE_FORCEFAULT, &newstate);
+			return;
+		}
+
+		/*
+		 * The disk labeling is asynchronous on Linux. Just record
+		 * this label request and return as there will be another
+		 * disk add event for the partition after the labeling is
+		 * completed.
+		 *
+		 * NOTE(review): malloc() result is not checked before the
+		 * strlcpy() into device->pd_physpath.
+		 */
+		device = malloc(sizeof (pendingdev_t));
+		(void) strlcpy(device->pd_physpath, physpath,
+		    sizeof (device->pd_physpath));
+		list_insert_tail(&g_device_list, device);
+
+		zed_log_msg(LOG_INFO, "  zpool_label_disk: async '%s' (%llu)",
+		    leafname, (u_longlong_t)guid);
+
+		return;	/* resumes at EC_DEV_ADD.ESC_DISK for partition */
+
+	} else /* labeled */ {
+		boolean_t found = B_FALSE;
+		/*
+		 * match up with request above to label the disk
+		 */
+		for (device = list_head(&g_device_list); device != NULL;
+		    device = list_next(&g_device_list, device)) {
+			if (strcmp(physpath, device->pd_physpath) == 0) {
+				list_remove(&g_device_list, device);
+				free(device);
+				found = B_TRUE;
+				break;
+			}
+			zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s",
+			    physpath, device->pd_physpath);
+		}
+		if (!found) {
+			/* unexpected partition slice encountered */
+			zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
+			    fullpath);
+			(void) zpool_vdev_online(zhp, fullpath,
+			    ZFS_ONLINE_FORCEFAULT, &newstate);
+			return;
+		}
+
+		zed_log_msg(LOG_INFO, "  zpool_label_disk: resume '%s' (%llu)",
+		    physpath, (u_longlong_t)guid);
+
+		(void) snprintf(devpath, sizeof (devpath), "%s%s",
+		    DEV_BYID_PATH, new_devid);
+	}
+
+	/*
+	 * Construct the root vdev to pass to zpool_vdev_attach(). While adding
+	 * the entire vdev structure is harmless, we construct a reduced set of
+	 * path/physpath/wholedisk to keep it simple.
+	 */
+	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) {
+		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
+		return;
+	}
+	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
+		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
+		nvlist_free(nvroot);
+		return;
+	}
+
+	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
+	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
+	    nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 ||
+	    (physpath != NULL && nvlist_add_string(newvd,
+	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
+	    (enc_sysfs_path != NULL && nvlist_add_string(newvd,
+	    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) ||
+	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
+	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
+	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
+	    1) != 0) {
+		zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs");
+		nvlist_free(newvd);
+		nvlist_free(nvroot);
+		return;
+	}
+
+	/* nvroot holds its own copy of newvd; the local can go. */
+	nvlist_free(newvd);
+
+	/*
+	 * Wait for udev to verify the links exist, then auto-replace
+	 * the leaf disk at same physical location.
+	 */
+	if (zpool_label_disk_wait(path, 3000) != 0) {
+		zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
+		    "disk %s is missing", path);
+		nvlist_free(nvroot);
+		return;
+	}
+
+	ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE);
+
+	zed_log_msg(LOG_INFO, "  zpool_vdev_replace: %s with %s (%s)",
+	    fullpath, path, (ret == 0) ? "no errors" :
+	    libzfs_error_description(g_zfshdl));
+
+	nvlist_free(nvroot);
+}
+
+/*
+ * Utility functions to find a vdev matching given criteria.
+ */
+typedef struct dev_data {
+ const char *dd_compare; /* value matched (exact) against dd_prop */
+ const char *dd_prop; /* nvlist config key to compare against */
+ zfs_process_func_t dd_func; /* callback invoked on a visited vdev */
+ boolean_t dd_found; /* set once a vdev has matched; stops further work */
+ boolean_t dd_islabeled; /* forwarded to dd_func as its third argument */
+ uint64_t dd_pool_guid; /* if nonzero, restrict search to this pool */
+ uint64_t dd_vdev_guid; /* if nonzero, match by vdev GUID (takes priority) */
+ const char *dd_new_devid; /* if set, recorded as "new_devid" on the match */
+} dev_data_t;
+
+/*
+ * Recursively walk a vdev tree looking for a vdev that matches the
+ * criteria in the dev_data_t (*data): by vdev GUID if dd_vdev_guid is
+ * nonzero, otherwise by exact string match of config key dd_prop against
+ * dd_compare.  Children, spares, and L2 cache devices are all visited.
+ * On a match, dd_found is set and (optionally) dd_new_devid is stashed
+ * into the vdev nvlist under "new_devid" for the replace path.
+ *
+ * NOTE(review): if neither dd_vdev_guid nor dd_compare is set, control
+ * falls through to the dd_func call at the bottom, so dd_func runs for
+ * every vdev visited — presumably intentional for "process all" callers;
+ * confirm against the call sites.
+ */
+static void
+zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
+{
+ dev_data_t *dp = data;
+ char *path = NULL;
+ uint_t c, children;
+ nvlist_t **child;
+
+ /*
+ * First iterate over any children.
+ */
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++)
+ zfs_iter_vdev(zhp, child[c], data);
+ }
+
+ /*
+ * Iterate over any spares and cache devices
+ */
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++)
+ zfs_iter_vdev(zhp, child[c], data);
+ }
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++)
+ zfs_iter_vdev(zhp, child[c], data);
+ }
+
+ /* once a vdev was matched and processed there is nothing left to do */
+ if (dp->dd_found)
+ return;
+
+ /*
+ * Match by GUID if available otherwise fallback to devid or physical
+ */
+ if (dp->dd_vdev_guid != 0) {
+ uint64_t guid;
+
+ /* mismatch or missing GUID: skip this vdev (no dd_func call) */
+ if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
+ &guid) != 0 || guid != dp->dd_vdev_guid) {
+ return;
+ }
+ zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched on %llu", guid);
+ dp->dd_found = B_TRUE;
+
+ } else if (dp->dd_compare != NULL) {
+ /*
+ * NOTE: On Linux there is an event for partition, so unlike
+ * illumos, substring matching is not required to accommodate
+ * the partition suffix. An exact match will be present in
+ * the dp->dd_compare value.
+ */
+ if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
+ strcmp(dp->dd_compare, path) != 0)
+ return;
+
+ zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s",
+ dp->dd_prop, path);
+ dp->dd_found = B_TRUE;
+
+ /* pass the new devid for use by replacing code */
+ if (dp->dd_new_devid != NULL) {
+ (void) nvlist_add_string(nvl, "new_devid",
+ dp->dd_new_devid);
+ }
+ }
+
+ /* invoke the per-vdev callback on the matched (or every) vdev */
+ (dp->dd_func)(zhp, nvl, dp->dd_islabeled);
+}
+
+/*
+ * Thread-pool worker: mount/share the datasets of a pool that was
+ * previously unavailable, then release the pool handle and the
+ * unavailpool_t container.  Takes ownership of (and frees) arg, which
+ * was allocated by the code that queued this work item.
+ */
+static void
+zfs_enable_ds(void *arg)
+{
+ unavailpool_t *pool = (unavailpool_t *)arg;
+
+ /* best-effort: mount errors are intentionally ignored here */
+ (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0);
+ zpool_close(pool->uap_zhp);
+ free(pool);
+}
+
+static int
+zfs_iter_pool(zpool_handle_t *zhp, void *data)
+{
+ nvlist_t *config, *nvl;
+ dev_data_t *dp = data;
+ uint64_t pool_guid;
+ unavailpool_t *pool;
+
+ zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)",
+ zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop);
+
+ /*
+ * For each vdev in this pool, look for a match to apply dd_func
+ */
+ if ((config = zpool_get_config(zhp, NULL)) != NULL) {
+ if (dp->dd_pool_guid == 0 ||
+ (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
+ (void) nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &nvl);
+ zfs_iter_vdev(zhp, nvl, data);
+ }
+ }
+
+ /*
+ * if this pool was originally unavailable,
+ * then enable its datasets asynchronously
+ */
+ if (g_enumeration_done) {
+ for (pool = list_head(&g_pool_list); pool != NULL;
+ pool = list_next(&g_pool_list, pool)) {
+
+ if (strcmp(zpool_get_name(zhp),
+ zpool_get_name(pool->uap_zhp)))
+ continue;
+ if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) {
+ list_remove(&g_pool_list, pool);
+ (void) tpool_dispatch(g_tpool, zfs_enable_ds,
+ pool);
+ break;
+ }
+ }
+ }
+
+ zpool_close(zhp);
+ return (dp->dd_found); /* cease