diff options
author | Matt Macy <mmacy@FreeBSD.org> | 2020-08-24 22:48:19 +0000 |
---|---|---|
committer | Matt Macy <mmacy@FreeBSD.org> | 2020-08-24 22:48:19 +0000 |
commit | 3b0ce0e28db46d0403929aba45c682285e1ac217 (patch) | |
tree | 91721e6e5518bd0d8113dee535898f2225443411 /cmd | |
download | src-3b0ce0e28db46d0403929aba45c682285e1ac217.tar.gz src-3b0ce0e28db46d0403929aba45c682285e1ac217.zip |
Vendor import of openzfs master @ 184df27eef0abdc7ab2105b21257f753834b936bvendor/openzfs/2.0-rc0-g184df27
Sponsored by: iX Systems, Inc.
Notes
Notes:
svn path=/vendor-sys/openzfs/dist/; revision=364736
svn path=/vendor-sys/openzfs/2.0-rc0-g184df27/; revision=364741; tag=vendor/openzfs/2.0-rc0-g184df27
Diffstat (limited to 'cmd')
165 files changed, 59838 insertions, 0 deletions
diff --git a/cmd/Makefile.am b/cmd/Makefile.am new file mode 100644 index 000000000000..88d32b1c538c --- /dev/null +++ b/cmd/Makefile.am @@ -0,0 +1,10 @@ +SUBDIRS = zfs zpool zdb zhack zinject zstream zstreamdump ztest +SUBDIRS += fsck_zfs vdev_id raidz_test zfs_ids_to_path + +if USING_PYTHON +SUBDIRS += arcstat arc_summary dbufstat +endif + +if BUILD_LINUX +SUBDIRS += mount_zfs zed zgenhostid zvol_id zvol_wait +endif diff --git a/cmd/arc_summary/.gitignore b/cmd/arc_summary/.gitignore new file mode 100644 index 000000000000..50ba15f034e2 --- /dev/null +++ b/cmd/arc_summary/.gitignore @@ -0,0 +1 @@ +arc_summary diff --git a/cmd/arc_summary/Makefile.am b/cmd/arc_summary/Makefile.am new file mode 100644 index 000000000000..1a26c2c199f8 --- /dev/null +++ b/cmd/arc_summary/Makefile.am @@ -0,0 +1,13 @@ +bin_SCRIPTS = arc_summary + +CLEANFILES = arc_summary +EXTRA_DIST = arc_summary2 arc_summary3 + +if USING_PYTHON_2 +SCRIPT = arc_summary2 +else +SCRIPT = arc_summary3 +endif + +arc_summary: $(SCRIPT) + cp $< $@ diff --git a/cmd/arc_summary/arc_summary2 b/cmd/arc_summary/arc_summary2 new file mode 100755 index 000000000000..5dc40d759dce --- /dev/null +++ b/cmd/arc_summary/arc_summary2 @@ -0,0 +1,1093 @@ +#!/usr/bin/env python2 +# +# $Id: arc_summary.pl,v 388:e27800740aa2 2011-07-08 02:53:29Z jhell $ +# +# Copyright (c) 2008 Ben Rockwood <benr@cuddletech.com>, +# Copyright (c) 2010 Martin Matuska <mm@FreeBSD.org>, +# Copyright (c) 2010-2011 Jason J. Hellenthal <jhell@DataIX.net>, +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# If you are having troubles when using this script from cron(8) please try +# adjusting your PATH before reporting problems. +# +# Note some of this code uses older code (eg getopt instead of argparse, +# subprocess.Popen() instead of subprocess.run()) because we need to support +# some very old versions of Python. +# + +"""Print statistics on the ZFS Adjustable Replacement Cache (ARC) + +Provides basic information on the ARC, its efficiency, the L2ARC (if present), +the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See the +in-source documentation and code at +https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details. 
+""" + +import getopt +import os +import sys +import time +import errno + +from subprocess import Popen, PIPE +from decimal import Decimal as D + + +if sys.platform.startswith('freebsd'): + # Requires py27-sysctl on FreeBSD + import sysctl + + def load_kstats(namespace): + """Collect information on a specific subsystem of the ARC""" + + base = 'kstat.zfs.misc.%s.' % namespace + return [(kstat.name, D(kstat.value)) for kstat in sysctl.filter(base)] + + def load_tunables(): + return dict((ctl.name, ctl.value) for ctl in sysctl.filter('vfs.zfs')) + +elif sys.platform.startswith('linux'): + + def load_kstats(namespace): + """Collect information on a specific subsystem of the ARC""" + + kstat = 'kstat.zfs.misc.%s.%%s' % namespace + path = '/proc/spl/kstat/zfs/%s' % namespace + with open(path) as f: + entries = [line.strip().split() for line in f][2:] # Skip header + return [(kstat % name, D(value)) for name, _, value in entries] + + def load_tunables(): + basepath = '/sys/module/zfs/parameters' + tunables = {} + for name in os.listdir(basepath): + if not name: + continue + path = '%s/%s' % (basepath, name) + with open(path) as f: + value = f.read() + tunables[name] = value.strip() + return tunables + + +show_tunable_descriptions = False +alternate_tunable_layout = False + + +def handle_Exception(ex_cls, ex, tb): + if ex is IOError: + if ex.errno == errno.EPIPE: + sys.exit() + + if ex is KeyboardInterrupt: + sys.exit() + + +sys.excepthook = handle_Exception + + +def get_Kstat(): + """Collect information on the ZFS subsystem from the /proc virtual + file system. The name "kstat" is a holdover from the Solaris utility + of the same name. + """ + + Kstat = {} + Kstat.update(load_kstats('arcstats')) + Kstat.update(load_kstats('zfetchstats')) + Kstat.update(load_kstats('vdev_cache_stats')) + return Kstat + + +def fBytes(b=0): + """Return human-readable representation of a byte value in + powers of 2 (eg "KiB" for "kibibytes", etc) to two decimal + points. 
Values smaller than one KiB are returned without + decimal points. + """ + + prefixes = [ + [2**80, "YiB"], # yobibytes (yotta) + [2**70, "ZiB"], # zebibytes (zetta) + [2**60, "EiB"], # exbibytes (exa) + [2**50, "PiB"], # pebibytes (peta) + [2**40, "TiB"], # tebibytes (tera) + [2**30, "GiB"], # gibibytes (giga) + [2**20, "MiB"], # mebibytes (mega) + [2**10, "KiB"]] # kibibytes (kilo) + + if b >= 2**10: + + for limit, unit in prefixes: + + if b >= limit: + value = b / limit + break + + result = "%0.2f\t%s" % (value, unit) + + else: + + result = "%d\tBytes" % b + + return result + + +def fHits(hits=0): + """Create a human-readable representation of the number of hits. + The single-letter symbols used are SI to avoid the confusion caused + by the different "short scale" and "long scale" representations in + English, which use the same words for different values. See + https://en.wikipedia.org/wiki/Names_of_large_numbers and + https://physics.nist.gov/cuu/Units/prefixes.html + """ + + numbers = [ + [10**24, 'Y'], # yotta (septillion) + [10**21, 'Z'], # zetta (sextillion) + [10**18, 'E'], # exa (quintrillion) + [10**15, 'P'], # peta (quadrillion) + [10**12, 'T'], # tera (trillion) + [10**9, 'G'], # giga (billion) + [10**6, 'M'], # mega (million) + [10**3, 'k']] # kilo (thousand) + + if hits >= 1000: + + for limit, symbol in numbers: + + if hits >= limit: + value = hits/limit + break + + result = "%0.2f%s" % (value, symbol) + + else: + + result = "%d" % hits + + return result + + +def fPerc(lVal=0, rVal=0, Decimal=2): + """Calculate percentage value and return in human-readable format""" + + if rVal > 0: + return str("%0." + str(Decimal) + "f") % (100 * (lVal / rVal)) + "%" + else: + return str("%0." 
+ str(Decimal) + "f") % 100 + "%" + + +def get_arc_summary(Kstat): + """Collect general data on the ARC""" + + output = {} + memory_throttle_count = Kstat[ + "kstat.zfs.misc.arcstats.memory_throttle_count" + ] + + if memory_throttle_count > 0: + output['health'] = 'THROTTLED' + else: + output['health'] = 'HEALTHY' + + output['memory_throttle_count'] = fHits(memory_throttle_count) + + # ARC Misc. + deleted = Kstat["kstat.zfs.misc.arcstats.deleted"] + mutex_miss = Kstat["kstat.zfs.misc.arcstats.mutex_miss"] + evict_skip = Kstat["kstat.zfs.misc.arcstats.evict_skip"] + + # ARC Misc. + output["arc_misc"] = {} + output["arc_misc"]["deleted"] = fHits(deleted) + output["arc_misc"]['mutex_miss'] = fHits(mutex_miss) + output["arc_misc"]['evict_skips'] = fHits(evict_skip) + + # ARC Sizing + arc_size = Kstat["kstat.zfs.misc.arcstats.size"] + mru_size = Kstat["kstat.zfs.misc.arcstats.mru_size"] + mfu_size = Kstat["kstat.zfs.misc.arcstats.mfu_size"] + meta_limit = Kstat["kstat.zfs.misc.arcstats.arc_meta_limit"] + meta_size = Kstat["kstat.zfs.misc.arcstats.arc_meta_used"] + dnode_limit = Kstat["kstat.zfs.misc.arcstats.arc_dnode_limit"] + dnode_size = Kstat["kstat.zfs.misc.arcstats.dnode_size"] + target_max_size = Kstat["kstat.zfs.misc.arcstats.c_max"] + target_min_size = Kstat["kstat.zfs.misc.arcstats.c_min"] + target_size = Kstat["kstat.zfs.misc.arcstats.c"] + + target_size_ratio = (target_max_size / target_min_size) + + # ARC Sizing + output['arc_sizing'] = {} + output['arc_sizing']['arc_size'] = { + 'per': fPerc(arc_size, target_max_size), + 'num': fBytes(arc_size), + } + output['arc_sizing']['target_max_size'] = { + 'ratio': target_size_ratio, + 'num': fBytes(target_max_size), + } + output['arc_sizing']['target_min_size'] = { + 'per': fPerc(target_min_size, target_max_size), + 'num': fBytes(target_min_size), + } + output['arc_sizing']['target_size'] = { + 'per': fPerc(target_size, target_max_size), + 'num': fBytes(target_size), + } + output['arc_sizing']['meta_limit'] = { + 
'per': fPerc(meta_limit, target_max_size), + 'num': fBytes(meta_limit), + } + output['arc_sizing']['meta_size'] = { + 'per': fPerc(meta_size, meta_limit), + 'num': fBytes(meta_size), + } + output['arc_sizing']['dnode_limit'] = { + 'per': fPerc(dnode_limit, meta_limit), + 'num': fBytes(dnode_limit), + } + output['arc_sizing']['dnode_size'] = { + 'per': fPerc(dnode_size, dnode_limit), + 'num': fBytes(dnode_size), + } + + # ARC Hash Breakdown + output['arc_hash_break'] = {} + output['arc_hash_break']['hash_chain_max'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_chain_max" + ] + output['arc_hash_break']['hash_chains'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_chains" + ] + output['arc_hash_break']['hash_collisions'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_collisions" + ] + output['arc_hash_break']['hash_elements'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_elements" + ] + output['arc_hash_break']['hash_elements_max'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_elements_max" + ] + + output['arc_size_break'] = {} + output['arc_size_break']['recently_used_cache_size'] = { + 'per': fPerc(mru_size, mru_size + mfu_size), + 'num': fBytes(mru_size), + } + output['arc_size_break']['frequently_used_cache_size'] = { + 'per': fPerc(mfu_size, mru_size + mfu_size), + 'num': fBytes(mfu_size), + } + + # ARC Hash Breakdown + hash_chain_max = Kstat["kstat.zfs.misc.arcstats.hash_chain_max"] + hash_chains = Kstat["kstat.zfs.misc.arcstats.hash_chains"] + hash_collisions = Kstat["kstat.zfs.misc.arcstats.hash_collisions"] + hash_elements = Kstat["kstat.zfs.misc.arcstats.hash_elements"] + hash_elements_max = Kstat["kstat.zfs.misc.arcstats.hash_elements_max"] + + output['arc_hash_break'] = {} + output['arc_hash_break']['elements_max'] = fHits(hash_elements_max) + output['arc_hash_break']['elements_current'] = { + 'per': fPerc(hash_elements, hash_elements_max), + 'num': fHits(hash_elements), + } + output['arc_hash_break']['collisions'] = fHits(hash_collisions) + output['arc_hash_break']['chain_max'] = 
fHits(hash_chain_max) + output['arc_hash_break']['chains'] = fHits(hash_chains) + + return output + + +def _arc_summary(Kstat): + """Print information on the ARC""" + + # ARC Sizing + arc = get_arc_summary(Kstat) + + sys.stdout.write("ARC Summary: (%s)\n" % arc['health']) + + sys.stdout.write("\tMemory Throttle Count:\t\t\t%s\n" % + arc['memory_throttle_count']) + sys.stdout.write("\n") + + # ARC Misc. + sys.stdout.write("ARC Misc:\n") + sys.stdout.write("\tDeleted:\t\t\t\t%s\n" % arc['arc_misc']['deleted']) + sys.stdout.write("\tMutex Misses:\t\t\t\t%s\n" % + arc['arc_misc']['mutex_miss']) + sys.stdout.write("\tEvict Skips:\t\t\t\t%s\n" % + arc['arc_misc']['evict_skips']) + sys.stdout.write("\n") + + # ARC Sizing + sys.stdout.write("ARC Size:\t\t\t\t%s\t%s\n" % ( + arc['arc_sizing']['arc_size']['per'], + arc['arc_sizing']['arc_size']['num'] + ) + ) + sys.stdout.write("\tTarget Size: (Adaptive)\t\t%s\t%s\n" % ( + arc['arc_sizing']['target_size']['per'], + arc['arc_sizing']['target_size']['num'], + ) + ) + + sys.stdout.write("\tMin Size (Hard Limit):\t\t%s\t%s\n" % ( + arc['arc_sizing']['target_min_size']['per'], + arc['arc_sizing']['target_min_size']['num'], + ) + ) + + sys.stdout.write("\tMax Size (High Water):\t\t%d:1\t%s\n" % ( + arc['arc_sizing']['target_max_size']['ratio'], + arc['arc_sizing']['target_max_size']['num'], + ) + ) + + sys.stdout.write("\nARC Size Breakdown:\n") + sys.stdout.write("\tRecently Used Cache Size:\t%s\t%s\n" % ( + arc['arc_size_break']['recently_used_cache_size']['per'], + arc['arc_size_break']['recently_used_cache_size']['num'], + ) + ) + sys.stdout.write("\tFrequently Used Cache Size:\t%s\t%s\n" % ( + arc['arc_size_break']['frequently_used_cache_size']['per'], + arc['arc_size_break']['frequently_used_cache_size']['num'], + ) + ) + sys.stdout.write("\tMetadata Size (Hard Limit):\t%s\t%s\n" % ( + arc['arc_sizing']['meta_limit']['per'], + arc['arc_sizing']['meta_limit']['num'], + ) + ) + sys.stdout.write("\tMetadata Size:\t\t\t%s\t%s\n" 
% ( + arc['arc_sizing']['meta_size']['per'], + arc['arc_sizing']['meta_size']['num'], + ) + ) + sys.stdout.write("\tDnode Size (Hard Limit):\t%s\t%s\n" % ( + arc['arc_sizing']['dnode_limit']['per'], + arc['arc_sizing']['dnode_limit']['num'], + ) + ) + sys.stdout.write("\tDnode Size:\t\t\t%s\t%s\n" % ( + arc['arc_sizing']['dnode_size']['per'], + arc['arc_sizing']['dnode_size']['num'], + ) + ) + + sys.stdout.write("\n") + + # ARC Hash Breakdown + sys.stdout.write("ARC Hash Breakdown:\n") + sys.stdout.write("\tElements Max:\t\t\t\t%s\n" % + arc['arc_hash_break']['elements_max']) + sys.stdout.write("\tElements Current:\t\t%s\t%s\n" % ( + arc['arc_hash_break']['elements_current']['per'], + arc['arc_hash_break']['elements_current']['num'], + ) + ) + sys.stdout.write("\tCollisions:\t\t\t\t%s\n" % + arc['arc_hash_break']['collisions']) + sys.stdout.write("\tChain Max:\t\t\t\t%s\n" % + arc['arc_hash_break']['chain_max']) + sys.stdout.write("\tChains:\t\t\t\t\t%s\n" % + arc['arc_hash_break']['chains']) + + +def get_arc_efficiency(Kstat): + """Collect information on the efficiency of the ARC""" + + output = {} + + arc_hits = Kstat["kstat.zfs.misc.arcstats.hits"] + arc_misses = Kstat["kstat.zfs.misc.arcstats.misses"] + demand_data_hits = Kstat["kstat.zfs.misc.arcstats.demand_data_hits"] + demand_data_misses = Kstat["kstat.zfs.misc.arcstats.demand_data_misses"] + demand_metadata_hits = Kstat[ + "kstat.zfs.misc.arcstats.demand_metadata_hits" + ] + demand_metadata_misses = Kstat[ + "kstat.zfs.misc.arcstats.demand_metadata_misses" + ] + mfu_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mfu_ghost_hits"] + mfu_hits = Kstat["kstat.zfs.misc.arcstats.mfu_hits"] + mru_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mru_ghost_hits"] + mru_hits = Kstat["kstat.zfs.misc.arcstats.mru_hits"] + prefetch_data_hits = Kstat["kstat.zfs.misc.arcstats.prefetch_data_hits"] + prefetch_data_misses = Kstat[ + "kstat.zfs.misc.arcstats.prefetch_data_misses" + ] + prefetch_metadata_hits = Kstat[ + 
"kstat.zfs.misc.arcstats.prefetch_metadata_hits" + ] + prefetch_metadata_misses = Kstat[ + "kstat.zfs.misc.arcstats.prefetch_metadata_misses" + ] + + anon_hits = arc_hits - ( + mfu_hits + mru_hits + mfu_ghost_hits + mru_ghost_hits + ) + arc_accesses_total = (arc_hits + arc_misses) + demand_data_total = (demand_data_hits + demand_data_misses) + prefetch_data_total = (prefetch_data_hits + prefetch_data_misses) + real_hits = (mfu_hits + mru_hits) + + output["total_accesses"] = fHits(arc_accesses_total) + output["cache_hit_ratio"] = { + 'per': fPerc(arc_hits, arc_accesses_total), + 'num': fHits(arc_hits), + } + output["cache_miss_ratio"] = { + 'per': fPerc(arc_misses, arc_accesses_total), + 'num': fHits(arc_misses), + } + output["actual_hit_ratio"] = { + 'per': fPerc(real_hits, arc_accesses_total), + 'num': fHits(real_hits), + } + output["data_demand_efficiency"] = { + 'per': fPerc(demand_data_hits, demand_data_total), + 'num': fHits(demand_data_total), + } + + if prefetch_data_total > 0: + output["data_prefetch_efficiency"] = { + 'per': fPerc(prefetch_data_hits, prefetch_data_total), + 'num': fHits(prefetch_data_total), + } + + if anon_hits > 0: + output["cache_hits_by_cache_list"] = {} + output["cache_hits_by_cache_list"]["anonymously_used"] = { + 'per': fPerc(anon_hits, arc_hits), + 'num': fHits(anon_hits), + } + + output["most_recently_used"] = { + 'per': fPerc(mru_hits, arc_hits), + 'num': fHits(mru_hits), + } + output["most_frequently_used"] = { + 'per': fPerc(mfu_hits, arc_hits), + 'num': fHits(mfu_hits), + } + output["most_recently_used_ghost"] = { + 'per': fPerc(mru_ghost_hits, arc_hits), + 'num': fHits(mru_ghost_hits), + } + output["most_frequently_used_ghost"] = { + 'per': fPerc(mfu_ghost_hits, arc_hits), + 'num': fHits(mfu_ghost_hits), + } + + output["cache_hits_by_data_type"] = {} + output["cache_hits_by_data_type"]["demand_data"] = { + 'per': fPerc(demand_data_hits, arc_hits), + 'num': fHits(demand_data_hits), + } + 
output["cache_hits_by_data_type"]["prefetch_data"] = { + 'per': fPerc(prefetch_data_hits, arc_hits), + 'num': fHits(prefetch_data_hits), + } + output["cache_hits_by_data_type"]["demand_metadata"] = { + 'per': fPerc(demand_metadata_hits, arc_hits), + 'num': fHits(demand_metadata_hits), + } + output["cache_hits_by_data_type"]["prefetch_metadata"] = { + 'per': fPerc(prefetch_metadata_hits, arc_hits), + 'num': fHits(prefetch_metadata_hits), + } + + output["cache_misses_by_data_type"] = {} + output["cache_misses_by_data_type"]["demand_data"] = { + 'per': fPerc(demand_data_misses, arc_misses), + 'num': fHits(demand_data_misses), + } + output["cache_misses_by_data_type"]["prefetch_data"] = { + 'per': fPerc(prefetch_data_misses, arc_misses), + 'num': fHits(prefetch_data_misses), + } + output["cache_misses_by_data_type"]["demand_metadata"] = { + 'per': fPerc(demand_metadata_misses, arc_misses), + 'num': fHits(demand_metadata_misses), + } + output["cache_misses_by_data_type"]["prefetch_metadata"] = { + 'per': fPerc(prefetch_metadata_misses, arc_misses), + 'num': fHits(prefetch_metadata_misses), + } + + return output + + +def _arc_efficiency(Kstat): + """Print information on the efficiency of the ARC""" + + arc = get_arc_efficiency(Kstat) + + sys.stdout.write("ARC Total accesses:\t\t\t\t\t%s\n" % + arc['total_accesses']) + sys.stdout.write("\tCache Hit Ratio:\t\t%s\t%s\n" % ( + arc['cache_hit_ratio']['per'], + arc['cache_hit_ratio']['num'], + ) + ) + sys.stdout.write("\tCache Miss Ratio:\t\t%s\t%s\n" % ( + arc['cache_miss_ratio']['per'], + arc['cache_miss_ratio']['num'], + ) + ) + + sys.stdout.write("\tActual Hit Ratio:\t\t%s\t%s\n" % ( + arc['actual_hit_ratio']['per'], + arc['actual_hit_ratio']['num'], + ) + ) + + sys.stdout.write("\n") + sys.stdout.write("\tData Demand Efficiency:\t\t%s\t%s\n" % ( + arc['data_demand_efficiency']['per'], + arc['data_demand_efficiency']['num'], + ) + ) + + if 'data_prefetch_efficiency' in arc: + sys.stdout.write("\tData Prefetch 
Efficiency:\t%s\t%s\n" % ( + arc['data_prefetch_efficiency']['per'], + arc['data_prefetch_efficiency']['num'], + ) + ) + sys.stdout.write("\n") + + sys.stdout.write("\tCACHE HITS BY CACHE LIST:\n") + if 'cache_hits_by_cache_list' in arc: + sys.stdout.write("\t Anonymously Used:\t\t%s\t%s\n" % ( + arc['cache_hits_by_cache_list']['anonymously_used']['per'], + arc['cache_hits_by_cache_list']['anonymously_used']['num'], + ) + ) + sys.stdout.write("\t Most Recently Used:\t\t%s\t%s\n" % ( + arc['most_recently_used']['per'], + arc['most_recently_used']['num'], + ) + ) + sys.stdout.write("\t Most Frequently Used:\t\t%s\t%s\n" % ( + arc['most_frequently_used']['per'], + arc['most_frequently_used']['num'], + ) + ) + sys.stdout.write("\t Most Recently Used Ghost:\t%s\t%s\n" % ( + arc['most_recently_used_ghost']['per'], + arc['most_recently_used_ghost']['num'], + ) + ) + sys.stdout.write("\t Most Frequently Used Ghost:\t%s\t%s\n" % ( + arc['most_frequently_used_ghost']['per'], + arc['most_frequently_used_ghost']['num'], + ) + ) + + sys.stdout.write("\n\tCACHE HITS BY DATA TYPE:\n") + sys.stdout.write("\t Demand Data:\t\t\t%s\t%s\n" % ( + arc["cache_hits_by_data_type"]['demand_data']['per'], + arc["cache_hits_by_data_type"]['demand_data']['num'], + ) + ) + sys.stdout.write("\t Prefetch Data:\t\t%s\t%s\n" % ( + arc["cache_hits_by_data_type"]['prefetch_data']['per'], + arc["cache_hits_by_data_type"]['prefetch_data']['num'], + ) + ) + sys.stdout.write("\t Demand Metadata:\t\t%s\t%s\n" % ( + arc["cache_hits_by_data_type"]['demand_metadata']['per'], + arc["cache_hits_by_data_type"]['demand_metadata']['num'], + ) + ) + sys.stdout.write("\t Prefetch Metadata:\t\t%s\t%s\n" % ( + arc["cache_hits_by_data_type"]['prefetch_metadata']['per'], + arc["cache_hits_by_data_type"]['prefetch_metadata']['num'], + ) + ) + + sys.stdout.write("\n\tCACHE MISSES BY DATA TYPE:\n") + sys.stdout.write("\t Demand Data:\t\t\t%s\t%s\n" % ( + arc["cache_misses_by_data_type"]['demand_data']['per'], + 
arc["cache_misses_by_data_type"]['demand_data']['num'], + ) + ) + sys.stdout.write("\t Prefetch Data:\t\t%s\t%s\n" % ( + arc["cache_misses_by_data_type"]['prefetch_data']['per'], + arc["cache_misses_by_data_type"]['prefetch_data']['num'], + ) + ) + sys.stdout.write("\t Demand Metadata:\t\t%s\t%s\n" % ( + arc["cache_misses_by_data_type"]['demand_metadata']['per'], + arc["cache_misses_by_data_type"]['demand_metadata']['num'], + ) + ) + sys.stdout.write("\t Prefetch Metadata:\t\t%s\t%s\n" % ( + arc["cache_misses_by_data_type"]['prefetch_metadata']['per'], + arc["cache_misses_by_data_type"]['prefetch_metadata']['num'], + ) + ) + + +def get_l2arc_summary(Kstat): + """Collection information on the L2ARC""" + + output = {} + + l2_abort_lowmem = Kstat["kstat.zfs.misc.arcstats.l2_abort_lowmem"] + l2_cksum_bad = Kstat["kstat.zfs.misc.arcstats.l2_cksum_bad"] + l2_evict_lock_retry = Kstat["kstat.zfs.misc.arcstats.l2_evict_lock_retry"] + l2_evict_reading = Kstat["kstat.zfs.misc.arcstats.l2_evict_reading"] + l2_feeds = Kstat["kstat.zfs.misc.arcstats.l2_feeds"] + l2_free_on_write = Kstat["kstat.zfs.misc.arcstats.l2_free_on_write"] + l2_hdr_size = Kstat["kstat.zfs.misc.arcstats.l2_hdr_size"] + l2_hits = Kstat["kstat.zfs.misc.arcstats.l2_hits"] + l2_io_error = Kstat["kstat.zfs.misc.arcstats.l2_io_error"] + l2_misses = Kstat["kstat.zfs.misc.arcstats.l2_misses"] + l2_rw_clash = Kstat["kstat.zfs.misc.arcstats.l2_rw_clash"] + l2_size = Kstat["kstat.zfs.misc.arcstats.l2_size"] + l2_asize = Kstat["kstat.zfs.misc.arcstats.l2_asize"] + l2_writes_done = Kstat["kstat.zfs.misc.arcstats.l2_writes_done"] + l2_writes_error = Kstat["kstat.zfs.misc.arcstats.l2_writes_error"] + l2_writes_sent = Kstat["kstat.zfs.misc.arcstats.l2_writes_sent"] + + l2_access_total = (l2_hits + l2_misses) + output['l2_health_count'] = (l2_writes_error + l2_cksum_bad + l2_io_error) + + output['l2_access_total'] = l2_access_total + output['l2_size'] = l2_size + output['l2_asize'] = l2_asize + + if l2_size > 0 and 
l2_access_total > 0: + + if output['l2_health_count'] > 0: + output["health"] = "DEGRADED" + else: + output["health"] = "HEALTHY" + + output["low_memory_aborts"] = fHits(l2_abort_lowmem) + output["free_on_write"] = fHits(l2_free_on_write) + output["rw_clashes"] = fHits(l2_rw_clash) + output["bad_checksums"] = fHits(l2_cksum_bad) + output["io_errors"] = fHits(l2_io_error) + + output["l2_arc_size"] = {} + output["l2_arc_size"]["adative"] = fBytes(l2_size) + output["l2_arc_size"]["actual"] = { + 'per': fPerc(l2_asize, l2_size), + 'num': fBytes(l2_asize) + } + output["l2_arc_size"]["head_size"] = { + 'per': fPerc(l2_hdr_size, l2_size), + 'num': fBytes(l2_hdr_size), + } + + output["l2_arc_evicts"] = {} + output["l2_arc_evicts"]['lock_retries'] = fHits(l2_evict_lock_retry) + output["l2_arc_evicts"]['reading'] = fHits(l2_evict_reading) + + output['l2_arc_breakdown'] = {} + output['l2_arc_breakdown']['value'] = fHits(l2_access_total) + output['l2_arc_breakdown']['hit_ratio'] = { + 'per': fPerc(l2_hits, l2_access_total), + 'num': fHits(l2_hits), + } + output['l2_arc_breakdown']['miss_ratio'] = { + 'per': fPerc(l2_misses, l2_access_total), + 'num': fHits(l2_misses), + } + output['l2_arc_breakdown']['feeds'] = fHits(l2_feeds) + + output['l2_arc_buffer'] = {} + + output['l2_arc_writes'] = {} + output['l2_writes_done'] = l2_writes_done + output['l2_writes_sent'] = l2_writes_sent + if l2_writes_done != l2_writes_sent: + output['l2_arc_writes']['writes_sent'] = { + 'value': "FAULTED", + 'num': fHits(l2_writes_sent), + } + output['l2_arc_writes']['done_ratio'] = { + 'per': fPerc(l2_writes_done, l2_writes_sent), + 'num': fHits(l2_writes_done), + } + output['l2_arc_writes']['error_ratio'] = { + 'per': fPerc(l2_writes_error, l2_writes_sent), + 'num': fHits(l2_writes_error), + } + else: + output['l2_arc_writes']['writes_sent'] = { + 'per': fPerc(100), + 'num': fHits(l2_writes_sent), + } + + return output + + +def _l2arc_summary(Kstat): + """Print information on the L2ARC""" + + arc = 
get_l2arc_summary(Kstat) + + if arc['l2_size'] > 0 and arc['l2_access_total'] > 0: + sys.stdout.write("L2 ARC Summary: ") + if arc['l2_health_count'] > 0: + sys.stdout.write("(DEGRADED)\n") + else: + sys.stdout.write("(HEALTHY)\n") + sys.stdout.write("\tLow Memory Aborts:\t\t\t%s\n" % + arc['low_memory_aborts']) + sys.stdout.write("\tFree on Write:\t\t\t\t%s\n" % arc['free_on_write']) + sys.stdout.write("\tR/W Clashes:\t\t\t\t%s\n" % arc['rw_clashes']) + sys.stdout.write("\tBad Checksums:\t\t\t\t%s\n" % arc['bad_checksums']) + sys.stdout.write("\tIO Errors:\t\t\t\t%s\n" % arc['io_errors']) + sys.stdout.write("\n") + + sys.stdout.write("L2 ARC Size: (Adaptive)\t\t\t\t%s\n" % + arc["l2_arc_size"]["adative"]) + sys.stdout.write("\tCompressed:\t\t\t%s\t%s\n" % ( + arc["l2_arc_size"]["actual"]["per"], + arc["l2_arc_size"]["actual"]["num"], + ) + ) + sys.stdout.write("\tHeader Size:\t\t\t%s\t%s\n" % ( + arc["l2_arc_size"]["head_size"]["per"], + arc["l2_arc_size"]["head_size"]["num"], + ) + ) + sys.stdout.write("\n") + + if arc["l2_arc_evicts"]['lock_retries'] != '0' or \ + arc["l2_arc_evicts"]["reading"] != '0': + sys.stdout.write("L2 ARC Evicts:\n") + sys.stdout.write("\tLock Retries:\t\t\t\t%s\n" % + arc["l2_arc_evicts"]['lock_retries']) + sys.stdout.write("\tUpon Reading:\t\t\t\t%s\n" % + arc["l2_arc_evicts"]["reading"]) + sys.stdout.write("\n") + + sys.stdout.write("L2 ARC Breakdown:\t\t\t\t%s\n" % + arc['l2_arc_breakdown']['value']) + sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % ( + arc['l2_arc_breakdown']['hit_ratio']['per'], + arc['l2_arc_breakdown']['hit_ratio']['num'], + ) + ) + + sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % ( + arc['l2_arc_breakdown']['miss_ratio']['per'], + arc['l2_arc_breakdown']['miss_ratio']['num'], + ) + ) + + sys.stdout.write("\tFeeds:\t\t\t\t\t%s\n" % + arc['l2_arc_breakdown']['feeds']) + sys.stdout.write("\n") + + sys.stdout.write("L2 ARC Writes:\n") + if arc['l2_writes_done'] != arc['l2_writes_sent']: + 
sys.stdout.write("\tWrites Sent: (%s)\t\t\t\t%s\n" % ( + arc['l2_arc_writes']['writes_sent']['value'], + arc['l2_arc_writes']['writes_sent']['num'], + ) + ) + sys.stdout.write("\t Done Ratio:\t\t\t%s\t%s\n" % ( + arc['l2_arc_writes']['done_ratio']['per'], + arc['l2_arc_writes']['done_ratio']['num'], + ) + ) + sys.stdout.write("\t Error Ratio:\t\t\t%s\t%s\n" % ( + arc['l2_arc_writes']['error_ratio']['per'], + arc['l2_arc_writes']['error_ratio']['num'], + ) + ) + else: + sys.stdout.write("\tWrites Sent:\t\t\t%s\t%s\n" % ( + arc['l2_arc_writes']['writes_sent']['per'], + arc['l2_arc_writes']['writes_sent']['num'], + ) + ) + + +def get_dmu_summary(Kstat): + """Collect information on the DMU""" + + output = {} + + zfetch_hits = Kstat["kstat.zfs.misc.zfetchstats.hits"] + zfetch_misses = Kstat["kstat.zfs.misc.zfetchstats.misses"] + + zfetch_access_total = (zfetch_hits + zfetch_misses) + output['zfetch_access_total'] = zfetch_access_total + + if zfetch_access_total > 0: + output['dmu'] = {} + output['dmu']['efficiency'] = {} + output['dmu']['efficiency']['value'] = fHits(zfetch_access_total) + output['dmu']['efficiency']['hit_ratio'] = { + 'per': fPerc(zfetch_hits, zfetch_access_total), + 'num': fHits(zfetch_hits), + } + output['dmu']['efficiency']['miss_ratio'] = { + 'per': fPerc(zfetch_misses, zfetch_access_total), + 'num': fHits(zfetch_misses), + } + + return output + + +def _dmu_summary(Kstat): + """Print information on the DMU""" + + arc = get_dmu_summary(Kstat) + + if arc['zfetch_access_total'] > 0: + sys.stdout.write("DMU Prefetch Efficiency:\t\t\t\t\t%s\n" % + arc['dmu']['efficiency']['value']) + sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % ( + arc['dmu']['efficiency']['hit_ratio']['per'], + arc['dmu']['efficiency']['hit_ratio']['num'], + ) + ) + sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % ( + arc['dmu']['efficiency']['miss_ratio']['per'], + arc['dmu']['efficiency']['miss_ratio']['num'], + ) + ) + + sys.stdout.write("\n") + + +def get_vdev_summary(Kstat): + 
"""Collect information on the VDEVs""" + + output = {} + + vdev_cache_delegations = \ + Kstat["kstat.zfs.misc.vdev_cache_stats.delegations"] + vdev_cache_misses = Kstat["kstat.zfs.misc.vdev_cache_stats.misses"] + vdev_cache_hits = Kstat["kstat.zfs.misc.vdev_cache_stats.hits"] + vdev_cache_total = (vdev_cache_misses + vdev_cache_hits + + vdev_cache_delegations) + + output['vdev_cache_total'] = vdev_cache_total + + if vdev_cache_total > 0: + output['summary'] = fHits(vdev_cache_total) + output['hit_ratio'] = { + 'per': fPerc(vdev_cache_hits, vdev_cache_total), + 'num': fHits(vdev_cache_hits), + } + output['miss_ratio'] = { + 'per': fPerc(vdev_cache_misses, vdev_cache_total), + 'num': fHits(vdev_cache_misses), + } + output['delegations'] = { + 'per': fPerc(vdev_cache_delegations, vdev_cache_total), + 'num': fHits(vdev_cache_delegations), + } + + return output + + +def _vdev_summary(Kstat): + """Print information on the VDEVs""" + + arc = get_vdev_summary(Kstat) + + if arc['vdev_cache_total'] > 0: + sys.stdout.write("VDEV Cache Summary:\t\t\t\t%s\n" % arc['summary']) + sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % ( + arc['hit_ratio']['per'], + arc['hit_ratio']['num'], + )) + sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % ( + arc['miss_ratio']['per'], + arc['miss_ratio']['num'], + )) + sys.stdout.write("\tDelegations:\t\t\t%s\t%s\n" % ( + arc['delegations']['per'], + arc['delegations']['num'], + )) + + +def _tunable_summary(Kstat): + """Print information on tunables, including descriptions if requested""" + + global show_tunable_descriptions + global alternate_tunable_layout + + tunables = load_tunables() + descriptions = {} + + if show_tunable_descriptions: + + command = ["/sbin/modinfo", "zfs", "-0"] + + try: + p = Popen(command, stdin=PIPE, stdout=PIPE, + stderr=PIPE, shell=False, close_fds=True) + p.wait() + + # By default, Python 2 returns a string as the first element of the + # tuple from p.communicate(), while Python 3 returns bytes which + # must be 
decoded first. The better way to do this would be with + # subprocess.run() or at least .check_output(), but this fails on + # CentOS 6 because of its old version of Python 2 + desc = bytes.decode(p.communicate()[0]) + description_list = desc.strip().split('\0') + + if p.returncode == 0: + for tunable in description_list: + if tunable[0:5] == 'parm:': + tunable = tunable[5:].strip() + name, description = tunable.split(':', 1) + if not description: + description = "Description unavailable" + descriptions[name] = description + else: + sys.stderr.write("%s: '%s' exited with code %i\n" % + (sys.argv[0], command[0], p.returncode)) + sys.stderr.write("Tunable descriptions will be disabled.\n") + except OSError as e: + sys.stderr.write("%s: Cannot run '%s': %s\n" % + (sys.argv[0], command[0], e.strerror)) + sys.stderr.write("Tunable descriptions will be disabled.\n") + + sys.stdout.write("ZFS Tunables:\n") + + if alternate_tunable_layout: + fmt = "\t%s=%s\n" + else: + fmt = "\t%-50s%s\n" + + for name in sorted(tunables.keys()): + if show_tunable_descriptions and name in descriptions: + sys.stdout.write("\t# %s\n" % descriptions[name]) + + sys.stdout.write(fmt % (name, tunables[name])) + + +unSub = [ + _arc_summary, + _arc_efficiency, + _l2arc_summary, + _dmu_summary, + _vdev_summary, + _tunable_summary +] + + +def zfs_header(): + """Print title string with date""" + + daydate = time.strftime('%a %b %d %H:%M:%S %Y') + + sys.stdout.write('\n'+'-'*72+'\n') + sys.stdout.write('ZFS Subsystem Report\t\t\t\t%s' % daydate) + sys.stdout.write('\n') + + +def usage(): + """Print usage information""" + + sys.stdout.write("Usage: arc_summary [-h] [-a] [-d] [-p PAGE]\n\n") + sys.stdout.write("\t -h, --help : " + "Print this help message and exit\n") + sys.stdout.write("\t -a, --alternate : " + "Show an alternate sysctl layout\n") + sys.stdout.write("\t -d, --description : " + "Show the sysctl descriptions\n") + sys.stdout.write("\t -p PAGE, --page=PAGE : " + "Select a single output 
page to display,\n") + sys.stdout.write("\t " + "should be an integer between 1 and " + + str(len(unSub)) + "\n\n") + sys.stdout.write("Examples:\n") + sys.stdout.write("\tarc_summary -a\n") + sys.stdout.write("\tarc_summary -p 4\n") + sys.stdout.write("\tarc_summary -ad\n") + sys.stdout.write("\tarc_summary --page=2\n") + + +def main(): + """Main function""" + + global show_tunable_descriptions + global alternate_tunable_layout + + try: + opts, args = getopt.getopt( + sys.argv[1:], + "adp:h", ["alternate", "description", "page=", "help"] + ) + except getopt.error as e: + sys.stderr.write("Error: %s\n" % e.msg) + usage() + sys.exit(1) + + args = {} + for opt, arg in opts: + if opt in ('-a', '--alternate'): + args['a'] = True + if opt in ('-d', '--description'): + args['d'] = True + if opt in ('-p', '--page'): + args['p'] = arg + if opt in ('-h', '--help'): + usage() + sys.exit(0) + + Kstat = get_Kstat() + + alternate_tunable_layout = 'a' in args + show_tunable_descriptions = 'd' in args + + pages = [] + + if 'p' in args: + try: + pages.append(unSub[int(args['p']) - 1]) + except IndexError: + sys.stderr.write('the argument to -p must be between 1 and ' + + str(len(unSub)) + '\n') + sys.exit(1) + else: + pages = unSub + + zfs_header() + for page in pages: + page(Kstat) + sys.stdout.write("\n") + + +if __name__ == '__main__': + main() diff --git a/cmd/arc_summary/arc_summary3 b/cmd/arc_summary/arc_summary3 new file mode 100755 index 000000000000..c920b8e5395d --- /dev/null +++ b/cmd/arc_summary/arc_summary3 @@ -0,0 +1,943 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2008 Ben Rockwood <benr@cuddletech.com>, +# Copyright (c) 2010 Martin Matuska <mm@FreeBSD.org>, +# Copyright (c) 2010-2011 Jason J. Hellenthal <jhell@DataIX.net>, +# Copyright (c) 2017 Scot W. Stevenson <scot.stevenson@gmail.com> +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +"""Print statistics on the ZFS ARC Cache and other information + +Provides basic information on the ARC, its efficiency, the L2ARC (if present), +the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See +the in-source documentation and code at +https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details. 
+The original introduction to arc_summary can be found at +http://cuddletech.com/?p=454 +""" + +import argparse +import os +import subprocess +import sys +import time + +DESCRIPTION = 'Print ARC and other statistics for ZFS on Linux' +INDENT = ' '*8 +LINE_LENGTH = 72 +DATE_FORMAT = '%a %b %d %H:%M:%S %Y' +TITLE = 'ZFS Subsystem Report' + +SECTIONS = 'arc archits dmu l2arc spl tunables vdev zil'.split() +SECTION_HELP = 'print info from one section ('+' '.join(SECTIONS)+')' + +# Tunables and SPL are handled separately because they come from +# different sources +SECTION_PATHS = {'arc': 'arcstats', + 'dmu': 'dmu_tx', + 'l2arc': 'arcstats', # L2ARC stuff lives in arcstats + 'vdev': 'vdev_cache_stats', + 'xuio': 'xuio_stats', + 'zfetch': 'zfetchstats', + 'zil': 'zil'} + +parser = argparse.ArgumentParser(description=DESCRIPTION) +parser.add_argument('-a', '--alternate', action='store_true', default=False, + help='use alternate formatting for tunables and SPL', + dest='alt') +parser.add_argument('-d', '--description', action='store_true', default=False, + help='print descriptions with tunables and SPL', + dest='desc') +parser.add_argument('-g', '--graph', action='store_true', default=False, + help='print graph on ARC use and exit', dest='graph') +parser.add_argument('-p', '--page', type=int, dest='page', + help='print page by number (DEPRECATED, use "-s")') +parser.add_argument('-r', '--raw', action='store_true', default=False, + help='dump all available data with minimal formatting', + dest='raw') +parser.add_argument('-s', '--section', dest='section', help=SECTION_HELP) +ARGS = parser.parse_args() + + +if sys.platform.startswith('freebsd'): + # Requires py36-sysctl on FreeBSD + import sysctl + + VDEV_CACHE_SIZE = 'vdev.cache_size' + + def load_kstats(section): + base = 'kstat.zfs.misc.{section}.'.format(section=section) + # base is removed from the name + fmt = lambda kstat: '{name} : {value}'.format(name=kstat.name[len(base):], + value=kstat.value) + return [fmt(kstat) 
for kstat in sysctl.filter(base)] + + def get_params(base): + cut = 8 # = len('vfs.zfs.') + return {ctl.name[cut:]: str(ctl.value) for ctl in sysctl.filter(base)} + + def get_tunable_params(): + return get_params('vfs.zfs') + + def get_vdev_params(): + return get_params('vfs.zfs.vdev') + + def get_version_impl(request): + # FreeBSD reports versions for zpl and spa instead of zfs and spl. + name = {'zfs': 'zpl', + 'spl': 'spa'}[request] + mib = 'vfs.zfs.version.{}'.format(name) + version = sysctl.filter(mib)[0].value + return '{} version {}'.format(name, version) + + def get_descriptions(_request): + # py-sysctl doesn't give descriptions, so we have to shell out. + command = ['sysctl', '-d', 'vfs.zfs'] + + # The recommended way to do this is with subprocess.run(). However, + # some installed versions of Python are < 3.5, so we offer them + # the option of doing it the old way (for now) + if 'run' in dir(subprocess): + info = subprocess.run(command, stdout=subprocess.PIPE, + universal_newlines=True) + lines = info.stdout.split('\n') + else: + info = subprocess.check_output(command, universal_newlines=True) + lines = info.split('\n') + + def fmt(line): + name, desc = line.split(':', 1) + return (name.strip(), desc.strip()) + + return dict([fmt(line) for line in lines if len(line) > 0]) + + +elif sys.platform.startswith('linux'): + KSTAT_PATH = '/proc/spl/kstat/zfs' + SPL_PATH = '/sys/module/spl/parameters' + TUNABLES_PATH = '/sys/module/zfs/parameters' + + VDEV_CACHE_SIZE = 'zfs_vdev_cache_size' + + def load_kstats(section): + path = os.path.join(KSTAT_PATH, section) + with open(path) as f: + return list(f)[2:] # Get rid of header + + def get_params(basepath): + """Collect information on the Solaris Porting Layer (SPL) or the + tunables, depending on the PATH given. Does not check if PATH is + legal. 
+ """ + result = {} + for name in os.listdir(basepath): + path = os.path.join(basepath, name) + with open(path) as f: + value = f.read() + result[name] = value.strip() + return result + + def get_spl_params(): + return get_params(SPL_PATH) + + def get_tunable_params(): + return get_params(TUNABLES_PATH) + + def get_vdev_params(): + return get_params(TUNABLES_PATH) + + def get_version_impl(request): + # The original arc_summary called /sbin/modinfo/{spl,zfs} to get + # the version information. We switch to /sys/module/{spl,zfs}/version + # to make sure we get what is really loaded in the kernel + command = ["cat", "/sys/module/{0}/version".format(request)] + req = request.upper() + + # The recommended way to do this is with subprocess.run(). However, + # some installed versions of Python are < 3.5, so we offer them + # the option of doing it the old way (for now) + if 'run' in dir(subprocess): + info = subprocess.run(command, stdout=subprocess.PIPE, + universal_newlines=True) + version = info.stdout.strip() + else: + info = subprocess.check_output(command, universal_newlines=True) + version = info.strip() + + return version + + def get_descriptions(request): + """Get the descriptions of the Solaris Porting Layer (SPL) or the + tunables, return with minimal formatting. + """ + + if request not in ('spl', 'zfs'): + print('ERROR: description of "{0}" requested)'.format(request)) + sys.exit(1) + + descs = {} + target_prefix = 'parm:' + + # We would prefer to do this with /sys/modules -- see the discussion at + # get_version() -- but there isn't a way to get the descriptions from + # there, so we fall back on modinfo + command = ["/sbin/modinfo", request, "-0"] + + # The recommended way to do this is with subprocess.run(). 
def cleanup_line(single_line):
    """Split a raw three-column line from /proc into a (name, value) tuple,
    discarding the middle column (the kstat data-type field, usually '4').
    For example "arc_no_grow 4 0" yields ("arc_no_grow", "0").

    Exactly three whitespace-separated fields are expected; anything else
    raises ValueError, matching the strictness of the original unpacking.
    """
    first, _middle, last = single_line.split()

    return first, last
+ """ + + arc_stats = isolate_section('arcstats', kstats_dict) + + GRAPH_INDENT = ' '*4 + GRAPH_WIDTH = 60 + arc_size = f_bytes(arc_stats['size']) + arc_perc = f_perc(arc_stats['size'], arc_stats['c_max']) + mfu_size = f_bytes(arc_stats['mfu_size']) + mru_size = f_bytes(arc_stats['mru_size']) + meta_limit = f_bytes(arc_stats['arc_meta_limit']) + meta_size = f_bytes(arc_stats['arc_meta_used']) + dnode_limit = f_bytes(arc_stats['arc_dnode_limit']) + dnode_size = f_bytes(arc_stats['dnode_size']) + + info_form = ('ARC: {0} ({1}) MFU: {2} MRU: {3} META: {4} ({5}) ' + 'DNODE {6} ({7})') + info_line = info_form.format(arc_size, arc_perc, mfu_size, mru_size, + meta_size, meta_limit, dnode_size, + dnode_limit) + info_spc = ' '*int((GRAPH_WIDTH-len(info_line))/2) + info_line = GRAPH_INDENT+info_spc+info_line + + graph_line = GRAPH_INDENT+'+'+('-'*(GRAPH_WIDTH-2))+'+' + + mfu_perc = float(int(arc_stats['mfu_size'])/int(arc_stats['c_max'])) + mru_perc = float(int(arc_stats['mru_size'])/int(arc_stats['c_max'])) + arc_perc = float(int(arc_stats['size'])/int(arc_stats['c_max'])) + total_ticks = float(arc_perc)*GRAPH_WIDTH + mfu_ticks = mfu_perc*GRAPH_WIDTH + mru_ticks = mru_perc*GRAPH_WIDTH + other_ticks = total_ticks-(mfu_ticks+mru_ticks) + + core_form = 'F'*int(mfu_ticks)+'R'*int(mru_ticks)+'O'*int(other_ticks) + core_spc = ' '*(GRAPH_WIDTH-(2+len(core_form))) + core_line = GRAPH_INDENT+'|'+core_form+core_spc+'|' + + for line in ('', info_line, graph_line, core_line, graph_line, ''): + print(line) + + +def f_bytes(byte_string): + """Return human-readable representation of a byte value in + powers of 2 (eg "KiB" for "kibibytes", etc) to two decimal + points. Values smaller than one KiB are returned without + decimal points. Note "bytes" is a reserved keyword. 
+ """ + + prefixes = ([2**80, "YiB"], # yobibytes (yotta) + [2**70, "ZiB"], # zebibytes (zetta) + [2**60, "EiB"], # exbibytes (exa) + [2**50, "PiB"], # pebibytes (peta) + [2**40, "TiB"], # tebibytes (tera) + [2**30, "GiB"], # gibibytes (giga) + [2**20, "MiB"], # mebibytes (mega) + [2**10, "KiB"]) # kibibytes (kilo) + + bites = int(byte_string) + + if bites >= 2**10: + for limit, unit in prefixes: + + if bites >= limit: + value = bites / limit + break + + result = '{0:.1f} {1}'.format(value, unit) + else: + result = '{0} Bytes'.format(bites) + + return result + + +def f_hits(hits_string): + """Create a human-readable representation of the number of hits. + The single-letter symbols used are SI to avoid the confusion caused + by the different "short scale" and "long scale" representations in + English, which use the same words for different values. See + https://en.wikipedia.org/wiki/Names_of_large_numbers and: + https://physics.nist.gov/cuu/Units/prefixes.html + """ + + numbers = ([10**24, 'Y'], # yotta (septillion) + [10**21, 'Z'], # zetta (sextillion) + [10**18, 'E'], # exa (quintrillion) + [10**15, 'P'], # peta (quadrillion) + [10**12, 'T'], # tera (trillion) + [10**9, 'G'], # giga (billion) + [10**6, 'M'], # mega (million) + [10**3, 'k']) # kilo (thousand) + + hits = int(hits_string) + + if hits >= 1000: + for limit, symbol in numbers: + + if hits >= limit: + value = hits/limit + break + + result = "%0.1f%s" % (value, symbol) + else: + result = "%d" % hits + + return result + + +def f_perc(value1, value2): + """Calculate percentage and return in human-readable form. If + rounding produces the result '0.0' though the first number is + not zero, include a 'less-than' symbol to avoid confusion. + Division by zero is handled by returning 'n/a'; no error + is called. 
+ """ + + v1 = float(value1) + v2 = float(value2) + + try: + perc = 100 * v1/v2 + except ZeroDivisionError: + result = 'n/a' + else: + result = '{0:0.1f} %'.format(perc) + + if result == '0.0 %' and v1 > 0: + result = '< 0.1 %' + + return result + + +def format_raw_line(name, value): + """For the --raw option for the tunable and SPL outputs, decide on the + correct formatting based on the --alternate flag. + """ + + if ARGS.alt: + result = '{0}{1}={2}'.format(INDENT, name, value) + else: + spc = LINE_LENGTH-(len(INDENT)+len(value)) + result = '{0}{1:<{spc}}{2}'.format(INDENT, name, value, spc=spc) + + return result + + +def get_kstats(): + """Collect information on the ZFS subsystem. The step does not perform any + further processing, giving us the option to only work on what is actually + needed. The name "kstat" is a holdover from the Solaris utility of the same + name. + """ + + result = {} + + for section in SECTION_PATHS.values(): + if section not in result: + result[section] = load_kstats(section) + + return result + + +def get_version(request): + """Get the version number of ZFS or SPL on this machine for header. + Returns an error string, but does not raise an error, if we can't + get the ZFS/SPL version. + """ + + if request not in ('spl', 'zfs'): + error_msg = '(ERROR: "{0}" requested)'.format(request) + return error_msg + + return get_version_impl(request) + + +def print_header(): + """Print the initial heading with date and time as well as info on the + kernel and ZFS versions. This is not called for the graph. 
+ """ + + # datetime is now recommended over time but we keep the exact formatting + # from the older version of arc_summary in case there are scripts + # that expect it in this way + daydate = time.strftime(DATE_FORMAT) + spc_date = LINE_LENGTH-len(daydate) + sys_version = os.uname() + + sys_msg = sys_version.sysname+' '+sys_version.release + zfs = get_version('zfs') + spc_zfs = LINE_LENGTH-len(zfs) + + machine_msg = 'Machine: '+sys_version.nodename+' ('+sys_version.machine+')' + spl = get_version('spl') + spc_spl = LINE_LENGTH-len(spl) + + print('\n'+('-'*LINE_LENGTH)) + print('{0:<{spc}}{1}'.format(TITLE, daydate, spc=spc_date)) + print('{0:<{spc}}{1}'.format(sys_msg, zfs, spc=spc_zfs)) + print('{0:<{spc}}{1}\n'.format(machine_msg, spl, spc=spc_spl)) + + +def print_raw(kstats_dict): + """Print all available data from the system in a minimally sorted format. + This can be used as a source to be piped through 'grep'. + """ + + sections = sorted(kstats_dict.keys()) + + for section in sections: + + print('\n{0}:'.format(section.upper())) + lines = sorted(kstats_dict[section]) + + for line in lines: + name, value = cleanup_line(line) + print(format_raw_line(name, value)) + + # Tunables and SPL must be handled separately because they come from a + # different source and have descriptions the user might request + print() + section_spl() + section_tunables() + + +def isolate_section(section_name, kstats_dict): + """From the complete information on all sections, retrieve only those + for one section. 
+ """ + + try: + section_data = kstats_dict[section_name] + except KeyError: + print('ERROR: Data on {0} not available'.format(section_data)) + sys.exit(1) + + section_dict = dict(cleanup_line(l) for l in section_data) + + return section_dict + + +# Formatted output helper functions + + +def prt_1(text, value): + """Print text and one value, no indent""" + spc = ' '*(LINE_LENGTH-(len(text)+len(value))) + print('{0}{spc}{1}'.format(text, value, spc=spc)) + + +def prt_i1(text, value): + """Print text and one value, with indent""" + spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(value))) + print(INDENT+'{0}{spc}{1}'.format(text, value, spc=spc)) + + +def prt_2(text, value1, value2): + """Print text and two values, no indent""" + values = '{0:>9} {1:>9}'.format(value1, value2) + spc = ' '*(LINE_LENGTH-(len(text)+len(values)+2)) + print('{0}{spc} {1}'.format(text, values, spc=spc)) + + +def prt_i2(text, value1, value2): + """Print text and two values, with indent""" + values = '{0:>9} {1:>9}'.format(value1, value2) + spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(values)+2)) + print(INDENT+'{0}{spc} {1}'.format(text, values, spc=spc)) + + +# The section output concentrates on important parameters instead of +# being exhaustive (that is what the --raw parameter is for) + + +def section_arc(kstats_dict): + """Give basic information on the ARC, MRU and MFU. This is the first + and most used section. 
+ """ + + arc_stats = isolate_section('arcstats', kstats_dict) + + throttle = arc_stats['memory_throttle_count'] + + if throttle == '0': + health = 'HEALTHY' + else: + health = 'THROTTLED' + + prt_1('ARC status:', health) + prt_i1('Memory throttle count:', throttle) + print() + + arc_size = arc_stats['size'] + arc_target_size = arc_stats['c'] + arc_max = arc_stats['c_max'] + arc_min = arc_stats['c_min'] + mfu_size = arc_stats['mfu_size'] + mru_size = arc_stats['mru_size'] + meta_limit = arc_stats['arc_meta_limit'] + meta_size = arc_stats['arc_meta_used'] + dnode_limit = arc_stats['arc_dnode_limit'] + dnode_size = arc_stats['dnode_size'] + target_size_ratio = '{0}:1'.format(int(arc_max) // int(arc_min)) + + prt_2('ARC size (current):', + f_perc(arc_size, arc_max), f_bytes(arc_size)) + prt_i2('Target size (adaptive):', + f_perc(arc_target_size, arc_max), f_bytes(arc_target_size)) + prt_i2('Min size (hard limit):', + f_perc(arc_min, arc_max), f_bytes(arc_min)) + prt_i2('Max size (high water):', + target_size_ratio, f_bytes(arc_max)) + caches_size = int(mfu_size)+int(mru_size) + prt_i2('Most Frequently Used (MFU) cache size:', + f_perc(mfu_size, caches_size), f_bytes(mfu_size)) + prt_i2('Most Recently Used (MRU) cache size:', + f_perc(mru_size, caches_size), f_bytes(mru_size)) + prt_i2('Metadata cache size (hard limit):', + f_perc(meta_limit, arc_max), f_bytes(meta_limit)) + prt_i2('Metadata cache size (current):', + f_perc(meta_size, meta_limit), f_bytes(meta_size)) + prt_i2('Dnode cache size (hard limit):', + f_perc(dnode_limit, meta_limit), f_bytes(dnode_limit)) + prt_i2('Dnode cache size (current):', + f_perc(dnode_size, dnode_limit), f_bytes(dnode_size)) + print() + + print('ARC hash breakdown:') + prt_i1('Elements max:', f_hits(arc_stats['hash_elements_max'])) + prt_i2('Elements current:', + f_perc(arc_stats['hash_elements'], arc_stats['hash_elements_max']), + f_hits(arc_stats['hash_elements'])) + prt_i1('Collisions:', f_hits(arc_stats['hash_collisions'])) + + 
prt_i1('Chain max:', f_hits(arc_stats['hash_chain_max'])) + prt_i1('Chains:', f_hits(arc_stats['hash_chains'])) + print() + + print('ARC misc:') + prt_i1('Deleted:', f_hits(arc_stats['deleted'])) + prt_i1('Mutex misses:', f_hits(arc_stats['mutex_miss'])) + prt_i1('Eviction skips:', f_hits(arc_stats['evict_skip'])) + print() + + +def section_archits(kstats_dict): + """Print information on how the caches are accessed ("arc hits"). + """ + + arc_stats = isolate_section('arcstats', kstats_dict) + all_accesses = int(arc_stats['hits'])+int(arc_stats['misses']) + actual_hits = int(arc_stats['mfu_hits'])+int(arc_stats['mru_hits']) + + prt_1('ARC total accesses (hits + misses):', f_hits(all_accesses)) + ta_todo = (('Cache hit ratio:', arc_stats['hits']), + ('Cache miss ratio:', arc_stats['misses']), + ('Actual hit ratio (MFU + MRU hits):', actual_hits)) + + for title, value in ta_todo: + prt_i2(title, f_perc(value, all_accesses), f_hits(value)) + + dd_total = int(arc_stats['demand_data_hits']) +\ + int(arc_stats['demand_data_misses']) + prt_i2('Data demand efficiency:', + f_perc(arc_stats['demand_data_hits'], dd_total), + f_hits(dd_total)) + + dp_total = int(arc_stats['prefetch_data_hits']) +\ + int(arc_stats['prefetch_data_misses']) + prt_i2('Data prefetch efficiency:', + f_perc(arc_stats['prefetch_data_hits'], dp_total), + f_hits(dp_total)) + + known_hits = int(arc_stats['mfu_hits']) +\ + int(arc_stats['mru_hits']) +\ + int(arc_stats['mfu_ghost_hits']) +\ + int(arc_stats['mru_ghost_hits']) + + anon_hits = int(arc_stats['hits'])-known_hits + + print() + print('Cache hits by cache type:') + cl_todo = (('Most frequently used (MFU):', arc_stats['mfu_hits']), + ('Most recently used (MRU):', arc_stats['mru_hits']), + ('Most frequently used (MFU) ghost:', + arc_stats['mfu_ghost_hits']), + ('Most recently used (MRU) ghost:', + arc_stats['mru_ghost_hits'])) + + for title, value in cl_todo: + prt_i2(title, f_perc(value, arc_stats['hits']), f_hits(value)) + + # For some reason, 
anon_hits can turn negative, which is weird. Until we + # have figured out why this happens, we just hide the problem, following + # the behavior of the original arc_summary. + if anon_hits >= 0: + prt_i2('Anonymously used:', + f_perc(anon_hits, arc_stats['hits']), f_hits(anon_hits)) + + print() + print('Cache hits by data type:') + dt_todo = (('Demand data:', arc_stats['demand_data_hits']), + ('Demand prefetch data:', arc_stats['prefetch_data_hits']), + ('Demand metadata:', arc_stats['demand_metadata_hits']), + ('Demand prefetch metadata:', + arc_stats['prefetch_metadata_hits'])) + + for title, value in dt_todo: + prt_i2(title, f_perc(value, arc_stats['hits']), f_hits(value)) + + print() + print('Cache misses by data type:') + dm_todo = (('Demand data:', arc_stats['demand_data_misses']), + ('Demand prefetch data:', + arc_stats['prefetch_data_misses']), + ('Demand metadata:', arc_stats['demand_metadata_misses']), + ('Demand prefetch metadata:', + arc_stats['prefetch_metadata_misses'])) + + for title, value in dm_todo: + prt_i2(title, f_perc(value, arc_stats['misses']), f_hits(value)) + + print() + + +def section_dmu(kstats_dict): + """Collect information on the DMU""" + + zfetch_stats = isolate_section('zfetchstats', kstats_dict) + + zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses']) + + prt_1('DMU prefetch efficiency:', f_hits(zfetch_access_total)) + prt_i2('Hit ratio:', f_perc(zfetch_stats['hits'], zfetch_access_total), + f_hits(zfetch_stats['hits'])) + prt_i2('Miss ratio:', f_perc(zfetch_stats['misses'], zfetch_access_total), + f_hits(zfetch_stats['misses'])) + print() + + +def section_l2arc(kstats_dict): + """Collect information on L2ARC device if present. If not, tell user + that we're skipping the section. 
+ """ + + # The L2ARC statistics live in the same section as the normal ARC stuff + arc_stats = isolate_section('arcstats', kstats_dict) + + if arc_stats['l2_size'] == '0': + print('L2ARC not detected, skipping section\n') + return + + l2_errors = int(arc_stats['l2_writes_error']) +\ + int(arc_stats['l2_cksum_bad']) +\ + int(arc_stats['l2_io_error']) + + l2_access_total = int(arc_stats['l2_hits'])+int(arc_stats['l2_misses']) + health = 'HEALTHY' + + if l2_errors > 0: + health = 'DEGRADED' + + prt_1('L2ARC status:', health) + + l2_todo = (('Low memory aborts:', 'l2_abort_lowmem'), + ('Free on write:', 'l2_free_on_write'), + ('R/W clashes:', 'l2_rw_clash'), + ('Bad checksums:', 'l2_cksum_bad'), + ('I/O errors:', 'l2_io_error')) + + for title, value in l2_todo: + prt_i1(title, f_hits(arc_stats[value])) + + print() + prt_1('L2ARC size (adaptive):', f_bytes(arc_stats['l2_size'])) + prt_i2('Compressed:', f_perc(arc_stats['l2_asize'], arc_stats['l2_size']), + f_bytes(arc_stats['l2_asize'])) + prt_i2('Header size:', + f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']), + f_bytes(arc_stats['l2_hdr_size'])) + + print() + prt_1('L2ARC breakdown:', f_hits(l2_access_total)) + prt_i2('Hit ratio:', + f_perc(arc_stats['l2_hits'], l2_access_total), + f_hits(arc_stats['l2_hits'])) + prt_i2('Miss ratio:', + f_perc(arc_stats['l2_misses'], l2_access_total), + f_hits(arc_stats['l2_misses'])) + prt_i1('Feeds:', f_hits(arc_stats['l2_feeds'])) + + print() + print('L2ARC writes:') + + if arc_stats['l2_writes_done'] != arc_stats['l2_writes_sent']: + prt_i2('Writes sent:', 'FAULTED', f_hits(arc_stats['l2_writes_sent'])) + prt_i2('Done ratio:', + f_perc(arc_stats['l2_writes_done'], + arc_stats['l2_writes_sent']), + f_hits(arc_stats['l2_writes_done'])) + prt_i2('Error ratio:', + f_perc(arc_stats['l2_writes_error'], + arc_stats['l2_writes_sent']), + f_hits(arc_stats['l2_writes_error'])) + else: + prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent'])) + + print() + print('L2ARC 
evicts:') + prt_i1('Lock retries:', f_hits(arc_stats['l2_evict_lock_retry'])) + prt_i1('Upon reading:', f_hits(arc_stats['l2_evict_reading'])) + print() + + +def section_spl(*_): + """Print the SPL parameters, if requested with alternative format + and/or descriptions. This does not use kstats. + """ + + if sys.platform.startswith('freebsd'): + # No SPL support in FreeBSD + return + + spls = get_spl_params() + keylist = sorted(spls.keys()) + print('Solaris Porting Layer (SPL):') + + if ARGS.desc: + descriptions = get_descriptions('spl') + + for key in keylist: + value = spls[key] + + if ARGS.desc: + try: + print(INDENT+'#', descriptions[key]) + except KeyError: + print(INDENT+'# (No description found)') # paranoid + + print(format_raw_line(key, value)) + + print() + + +def section_tunables(*_): + """Print the tunables, if requested with alternative format and/or + descriptions. This does not use kstasts. + """ + + tunables = get_tunable_params() + keylist = sorted(tunables.keys()) + print('Tunables:') + + if ARGS.desc: + descriptions = get_descriptions('zfs') + + for key in keylist: + value = tunables[key] + + if ARGS.desc: + try: + print(INDENT+'#', descriptions[key]) + except KeyError: + print(INDENT+'# (No description found)') # paranoid + + print(format_raw_line(key, value)) + + print() + + +def section_vdev(kstats_dict): + """Collect information on VDEV caches""" + + # Currently [Nov 2017] the VDEV cache is disabled, because it is actually + # harmful. When this is the case, we just skip the whole entry. 
See + # https://github.com/zfsonlinux/zfs/blob/master/module/zfs/vdev_cache.c + # for details + tunables = get_vdev_params() + + if tunables[VDEV_CACHE_SIZE] == '0': + print('VDEV cache disabled, skipping section\n') + return + + vdev_stats = isolate_section('vdev_cache_stats', kstats_dict) + + vdev_cache_total = int(vdev_stats['hits']) +\ + int(vdev_stats['misses']) +\ + int(vdev_stats['delegations']) + + prt_1('VDEV cache summary:', f_hits(vdev_cache_total)) + prt_i2('Hit ratio:', f_perc(vdev_stats['hits'], vdev_cache_total), + f_hits(vdev_stats['hits'])) + prt_i2('Miss ratio:', f_perc(vdev_stats['misses'], vdev_cache_total), + f_hits(vdev_stats['misses'])) + prt_i2('Delegations:', f_perc(vdev_stats['delegations'], vdev_cache_total), + f_hits(vdev_stats['delegations'])) + print() + + +def section_zil(kstats_dict): + """Collect information on the ZFS Intent Log. Some of the information + taken from https://github.com/zfsonlinux/zfs/blob/master/include/sys/zil.h + """ + + zil_stats = isolate_section('zil', kstats_dict) + + prt_1('ZIL committed transactions:', + f_hits(zil_stats['zil_itx_count'])) + prt_i1('Commit requests:', f_hits(zil_stats['zil_commit_count'])) + prt_i1('Flushes to stable storage:', + f_hits(zil_stats['zil_commit_writer_count'])) + prt_i2('Transactions to SLOG storage pool:', + f_bytes(zil_stats['zil_itx_metaslab_slog_bytes']), + f_hits(zil_stats['zil_itx_metaslab_slog_count'])) + prt_i2('Transactions to non-SLOG storage pool:', + f_bytes(zil_stats['zil_itx_metaslab_normal_bytes']), + f_hits(zil_stats['zil_itx_metaslab_normal_count'])) + print() + + +section_calls = {'arc': section_arc, + 'archits': section_archits, + 'dmu': section_dmu, + 'l2arc': section_l2arc, + 'spl': section_spl, + 'tunables': section_tunables, + 'vdev': section_vdev, + 'zil': section_zil} + + +def main(): + """Run program. The options to draw a graph and to print all data raw are + treated separately because they come with their own call. 
+ """ + + kstats = get_kstats() + + if ARGS.graph: + draw_graph(kstats) + sys.exit(0) + + print_header() + + if ARGS.raw: + print_raw(kstats) + + elif ARGS.section: + + try: + section_calls[ARGS.section](kstats) + except KeyError: + print('Error: Section "{0}" unknown'.format(ARGS.section)) + sys.exit(1) + + elif ARGS.page: + print('WARNING: Pages are deprecated, please use "--section"\n') + + pages_to_calls = {1: 'arc', + 2: 'archits', + 3: 'l2arc', + 4: 'dmu', + 5: 'vdev', + 6: 'tunables'} + + try: + call = pages_to_calls[ARGS.page] + except KeyError: + print('Error: Page "{0}" not supported'.format(ARGS.page)) + sys.exit(1) + else: + section_calls[call](kstats) + + else: + # If no parameters were given, we print all sections. We might want to + # change the sequence by hand + calls = sorted(section_calls.keys()) + + for section in calls: + section_calls[section](kstats) + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/cmd/arcstat/.gitignore b/cmd/arcstat/.gitignore new file mode 100644 index 000000000000..6d6cd1ab75fc --- /dev/null +++ b/cmd/arcstat/.gitignore @@ -0,0 +1 @@ +arcstat diff --git a/cmd/arcstat/Makefile.am b/cmd/arcstat/Makefile.am new file mode 100644 index 000000000000..d1ba989a0cd8 --- /dev/null +++ b/cmd/arcstat/Makefile.am @@ -0,0 +1,5 @@ +include $(top_srcdir)/config/Substfiles.am + +bin_SCRIPTS = arcstat + +SUBSTFILES += $(bin_SCRIPTS) diff --git a/cmd/arcstat/arcstat.in b/cmd/arcstat/arcstat.in new file mode 100755 index 000000000000..c83a1c74599e --- /dev/null +++ b/cmd/arcstat/arcstat.in @@ -0,0 +1,494 @@ +#!/usr/bin/env @PYTHON_SHEBANG@ +# +# Print out ZFS ARC Statistics exported via kstat(1) +# For a definition of fields, or usage, use arcstat -v +# +# This script was originally a fork of the original arcstat.pl (0.1) +# by Neelakanth Nadgir, originally published on his Sun blog on +# 09/18/2007 +# http://blogs.sun.com/realneel/entry/zfs_arc_statistics +# +# A new version aimed to improve upon the original by adding 
features +# and fixing bugs as needed. This version was maintained by Mike +# Harsch and was hosted in a public open source repository: +# http://github.com/mharsch/arcstat +# +# but has since moved to the illumos-gate repository. +# +# This Python port was written by John Hixson for FreeNAS, introduced +# in commit e2c29f: +# https://github.com/freenas/freenas +# +# and has been improved by many people since. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Fields have a fixed width. Every interval, we fill the "v" +# hash with its corresponding value (v[field]=value) using calculate(). +# @hdr is the array of fields that needs to be printed, so we +# just iterate over this array and print the values using our pretty printer. +# +# This script must remain compatible with Python 2.6+ and Python 3.4+. 
+# + +import sys +import time +import getopt +import re +import copy + +from signal import signal, SIGINT, SIGWINCH, SIG_DFL + + +cols = { + # HDR: [Size, Scale, Description] + "time": [8, -1, "Time"], + "hits": [4, 1000, "ARC reads per second"], + "miss": [4, 1000, "ARC misses per second"], + "read": [4, 1000, "Total ARC accesses per second"], + "hit%": [4, 100, "ARC hit percentage"], + "miss%": [5, 100, "ARC miss percentage"], + "dhit": [4, 1000, "Demand hits per second"], + "dmis": [4, 1000, "Demand misses per second"], + "dh%": [3, 100, "Demand hit percentage"], + "dm%": [3, 100, "Demand miss percentage"], + "phit": [4, 1000, "Prefetch hits per second"], + "pmis": [4, 1000, "Prefetch misses per second"], + "ph%": [3, 100, "Prefetch hits percentage"], + "pm%": [3, 100, "Prefetch miss percentage"], + "mhit": [4, 1000, "Metadata hits per second"], + "mmis": [4, 1000, "Metadata misses per second"], + "mread": [5, 1000, "Metadata accesses per second"], + "mh%": [3, 100, "Metadata hit percentage"], + "mm%": [3, 100, "Metadata miss percentage"], + "arcsz": [5, 1024, "ARC size"], + "size": [4, 1024, "ARC size"], + "c": [4, 1024, "ARC target size"], + "mfu": [4, 1000, "MFU list hits per second"], + "mru": [4, 1000, "MRU list hits per second"], + "mfug": [4, 1000, "MFU ghost list hits per second"], + "mrug": [4, 1000, "MRU ghost list hits per second"], + "eskip": [5, 1000, "evict_skip per second"], + "mtxmis": [6, 1000, "mutex_miss per second"], + "dread": [5, 1000, "Demand accesses per second"], + "pread": [5, 1000, "Prefetch accesses per second"], + "l2hits": [6, 1000, "L2ARC hits per second"], + "l2miss": [6, 1000, "L2ARC misses per second"], + "l2read": [6, 1000, "Total L2ARC accesses per second"], + "l2hit%": [6, 100, "L2ARC access hit percentage"], + "l2miss%": [7, 100, "L2ARC access miss percentage"], + "l2asize": [7, 1024, "Actual (compressed) size of the L2ARC"], + "l2size": [6, 1024, "Size of the L2ARC"], + "l2bytes": [7, 1024, "Bytes read per second from the 
L2ARC"], + "grow": [4, 1000, "ARC grow disabled"], + "need": [4, 1024, "ARC reclaim need"], + "free": [4, 1024, "ARC free memory"], + "avail": [5, 1024, "ARC available memory"], + "waste": [5, 1024, "Wasted memory due to round up to pagesize"], +} + +v = {} +hdr = ["time", "read", "miss", "miss%", "dmis", "dm%", "pmis", "pm%", "mmis", + "mm%", "size", "c", "avail"] +xhdr = ["time", "mfu", "mru", "mfug", "mrug", "eskip", "mtxmis", "dread", + "pread", "read"] +sint = 1 # Default interval is 1 second +count = 1 # Default count is 1 +hdr_intr = 20 # Print header every 20 lines of output +opfile = None +sep = " " # Default separator is 2 spaces +version = "0.4" +l2exist = False +cmd = ("Usage: arcstat [-hvx] [-f fields] [-o file] [-s string] [interval " + "[count]]\n") +cur = {} +d = {} +out = None +kstat = None + + +if sys.platform.startswith('freebsd'): + # Requires py27-sysctl on FreeBSD + import sysctl + + def kstat_update(): + global kstat + + k = sysctl.filter('kstat.zfs.misc.arcstats') + + if not k: + sys.exit(1) + + kstat = {} + + for s in k: + if not s: + continue + + name, value = s.name, s.value + # Trims 'kstat.zfs.misc.arcstats' from the name + kstat[name[24:]] = int(value) + +elif sys.platform.startswith('linux'): + def kstat_update(): + global kstat + + k = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')] + + if not k: + sys.exit(1) + + del k[0:2] + kstat = {} + + for s in k: + if not s: + continue + + name, unused, value = s.split() + kstat[name] = int(value) + + +def detailed_usage(): + sys.stderr.write("%s\n" % cmd) + sys.stderr.write("Field definitions are as follows:\n") + for key in cols: + sys.stderr.write("%11s : %s\n" % (key, cols[key][2])) + sys.stderr.write("\n") + + sys.exit(0) + + +def usage(): + sys.stderr.write("%s\n" % cmd) + sys.stderr.write("\t -h : Print this help message\n") + sys.stderr.write("\t -v : List all possible field headers and definitions" + "\n") + sys.stderr.write("\t -x : Print extended stats\n") + 
sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") + sys.stderr.write("\t -o : Redirect output to the specified file\n") + sys.stderr.write("\t -s : Override default field separator with custom " + "character or string\n") + sys.stderr.write("\nExamples:\n") + sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") + sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") + sys.stderr.write("\tarcstat -v\n") + sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n") + sys.stderr.write("\n") + + sys.exit(1) + + +def snap_stats(): + global cur + global kstat + + prev = copy.deepcopy(cur) + kstat_update() + + cur = kstat + for key in cur: + if re.match(key, "class"): + continue + if key in prev: + d[key] = cur[key] - prev[key] + else: + d[key] = cur[key] + + +def prettynum(sz, scale, num=0): + suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'] + index = 0 + save = 0 + + # Special case for date field + if scale == -1: + return "%s" % num + + # Rounding error, return 0 + elif 0 < num < 1: + num = 0 + + while abs(num) > scale and index < 5: + save = num + num = num / scale + index += 1 + + if index == 0: + return "%*d" % (sz, num) + + if abs(save / scale) < 10: + return "%*.1f%s" % (sz - 1, num, suffix[index]) + else: + return "%*d%s" % (sz - 1, num, suffix[index]) + + +def print_values(): + global hdr + global sep + global v + + sys.stdout.write(sep.join( + prettynum(cols[col][0], cols[col][1], v[col]) for col in hdr)) + + sys.stdout.write("\n") + sys.stdout.flush() + + +def print_header(): + global hdr + global sep + + sys.stdout.write(sep.join("%*s" % (cols[col][0], col) for col in hdr)) + + sys.stdout.write("\n") + + +def get_terminal_lines(): + try: + import fcntl + import termios + import struct + data = fcntl.ioctl(sys.stdout.fileno(), termios.TIOCGWINSZ, '1234') + sz = struct.unpack('hh', data) + return sz[0] + except Exception: + pass + + +def update_hdr_intr(): + global hdr_intr + + lines = get_terminal_lines() + if lines and lines > 3: + 
hdr_intr = lines - 3 + + +def resize_handler(signum, frame): + update_hdr_intr() + + +def init(): + global sint + global count + global hdr + global xhdr + global opfile + global sep + global out + global l2exist + + desired_cols = None + xflag = False + hflag = False + vflag = False + i = 1 + + try: + opts, args = getopt.getopt( + sys.argv[1:], + "xo:hvs:f:", + [ + "extended", + "outfile", + "help", + "verbose", + "separator", + "columns" + ] + ) + except getopt.error as msg: + sys.stderr.write("Error: %s\n" % str(msg)) + usage() + opts = None + + for opt, arg in opts: + if opt in ('-x', '--extended'): + xflag = True + if opt in ('-o', '--outfile'): + opfile = arg + i += 1 + if opt in ('-h', '--help'): + hflag = True + if opt in ('-v', '--verbose'): + vflag = True + if opt in ('-s', '--separator'): + sep = arg + i += 1 + if opt in ('-f', '--columns'): + desired_cols = arg + i += 1 + i += 1 + + argv = sys.argv[i:] + sint = int(argv[0]) if argv else sint + count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1) + + if hflag or (xflag and desired_cols): + usage() + + if vflag: + detailed_usage() + + if xflag: + hdr = xhdr + + update_hdr_intr() + + # check if L2ARC exists + snap_stats() + l2_size = cur.get("l2_size") + if l2_size: + l2exist = True + + if desired_cols: + hdr = desired_cols.split(",") + + invalid = [] + incompat = [] + for ele in hdr: + if ele not in cols: + invalid.append(ele) + elif not l2exist and ele.startswith("l2"): + sys.stdout.write("No L2ARC Here\n%s\n" % ele) + incompat.append(ele) + + if len(invalid) > 0: + sys.stderr.write("Invalid column definition! -- %s\n" % invalid) + usage() + + if len(incompat) > 0: + sys.stderr.write("Incompatible field specified! 
-- %s\n" % + incompat) + usage() + + if opfile: + try: + out = open(opfile, "w") + sys.stdout = out + + except IOError: + sys.stderr.write("Cannot open %s for writing\n" % opfile) + sys.exit(1) + + +def calculate(): + global d + global v + global l2exist + + v = dict() + v["time"] = time.strftime("%H:%M:%S", time.localtime()) + v["hits"] = d["hits"] / sint + v["miss"] = d["misses"] / sint + v["read"] = v["hits"] + v["miss"] + v["hit%"] = 100 * v["hits"] / v["read"] if v["read"] > 0 else 0 + v["miss%"] = 100 - v["hit%"] if v["read"] > 0 else 0 + + v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) / sint + v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) / sint + + v["dread"] = v["dhit"] + v["dmis"] + v["dh%"] = 100 * v["dhit"] / v["dread"] if v["dread"] > 0 else 0 + v["dm%"] = 100 - v["dh%"] if v["dread"] > 0 else 0 + + v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) / sint + v["pmis"] = (d["prefetch_data_misses"] + + d["prefetch_metadata_misses"]) / sint + + v["pread"] = v["phit"] + v["pmis"] + v["ph%"] = 100 * v["phit"] / v["pread"] if v["pread"] > 0 else 0 + v["pm%"] = 100 - v["ph%"] if v["pread"] > 0 else 0 + + v["mhit"] = (d["prefetch_metadata_hits"] + + d["demand_metadata_hits"]) / sint + v["mmis"] = (d["prefetch_metadata_misses"] + + d["demand_metadata_misses"]) / sint + + v["mread"] = v["mhit"] + v["mmis"] + v["mh%"] = 100 * v["mhit"] / v["mread"] if v["mread"] > 0 else 0 + v["mm%"] = 100 - v["mh%"] if v["mread"] > 0 else 0 + + v["arcsz"] = cur["size"] + v["size"] = cur["size"] + v["c"] = cur["c"] + v["mfu"] = d["mfu_hits"] / sint + v["mru"] = d["mru_hits"] / sint + v["mrug"] = d["mru_ghost_hits"] / sint + v["mfug"] = d["mfu_ghost_hits"] / sint + v["eskip"] = d["evict_skip"] / sint + v["mtxmis"] = d["mutex_miss"] / sint + + if l2exist: + v["l2hits"] = d["l2_hits"] / sint + v["l2miss"] = d["l2_misses"] / sint + v["l2read"] = v["l2hits"] + v["l2miss"] + v["l2hit%"] = 100 * v["l2hits"] / v["l2read"] if 
v["l2read"] > 0 else 0 + + v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0 + v["l2asize"] = cur["l2_asize"] + v["l2size"] = cur["l2_size"] + v["l2bytes"] = d["l2_read_bytes"] / sint + + v["grow"] = 0 if cur["arc_no_grow"] else 1 + v["need"] = cur["arc_need_free"] + v["free"] = cur["memory_free_bytes"] + v["avail"] = cur["memory_available_bytes"] + v["waste"] = cur["abd_chunk_waste_size"] + + +def main(): + global sint + global count + global hdr_intr + + i = 0 + count_flag = 0 + + init() + if count > 0: + count_flag = 1 + + signal(SIGINT, SIG_DFL) + signal(SIGWINCH, resize_handler) + while True: + if i == 0: + print_header() + + snap_stats() + calculate() + print_values() + + if count_flag == 1: + if count <= 1: + break + count -= 1 + + i = 0 if i >= hdr_intr else i + 1 + time.sleep(sint) + + if out: + out.close() + + +if __name__ == '__main__': + main() diff --git a/cmd/dbufstat/.gitignore b/cmd/dbufstat/.gitignore new file mode 100644 index 000000000000..2c2e913cef70 --- /dev/null +++ b/cmd/dbufstat/.gitignore @@ -0,0 +1 @@ +dbufstat diff --git a/cmd/dbufstat/Makefile.am b/cmd/dbufstat/Makefile.am new file mode 100644 index 000000000000..e672a01a4227 --- /dev/null +++ b/cmd/dbufstat/Makefile.am @@ -0,0 +1,5 @@ +include $(top_srcdir)/config/Substfiles.am + +bin_SCRIPTS = dbufstat + +SUBSTFILES += $(bin_SCRIPTS) diff --git a/cmd/dbufstat/dbufstat.in b/cmd/dbufstat/dbufstat.in new file mode 100755 index 000000000000..98eb79057388 --- /dev/null +++ b/cmd/dbufstat/dbufstat.in @@ -0,0 +1,669 @@ +#!/usr/bin/env @PYTHON_SHEBANG@ +# +# Print out statistics for all cached dmu buffers. This information +# is available through the dbufs kstat and may be post-processed as +# needed by the script. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (C) 2013 Lawrence Livermore National Security, LLC. +# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). +# +# This script must remain compatible with Python 2.6+ and Python 3.4+. +# + +import sys +import getopt +import errno +import re + +bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"] +bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize", + "meta", "state", "dbholds", "dbc", "list", "atype", "flags", + "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", + "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", + "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"] +bincompat = ["cached", "direct", "indirect", "bonus", "spill"] + +dhdr = ["pool", "objset", "object", "dtype", "cached"] +dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs", + "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct", + "indirect", "bonus", "spill"] +dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds", + "dbc", "list", "atype", "flags", "count", "asize", "access", + "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", + "l2_comp", "aholds"] + +thdr = ["pool", "objset", "dtype", "cached"] +txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect", + "bonus", "spill"] +tincompat = ["object", "level", "blkid", 
"offset", "dbsize", "meta", "state", + "dbc", "dbholds", "list", "atype", "flags", "count", "asize", + "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", + "l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs", + "bsize", "lvls", "dholds", "blocks", "dsize"] + +cols = { + # hdr: [size, scale, description] + "pool": [15, -1, "pool name"], + "objset": [6, -1, "dataset identification number"], + "object": [10, -1, "object number"], + "level": [5, -1, "indirection level of buffer"], + "blkid": [8, -1, "block number of buffer"], + "offset": [12, 1024, "offset in object of buffer"], + "dbsize": [7, 1024, "size of buffer"], + "meta": [4, -1, "is this buffer metadata?"], + "state": [5, -1, "state of buffer (read, cached, etc)"], + "dbholds": [7, 1000, "number of holds on buffer"], + "dbc": [3, -1, "in dbuf cache"], + "list": [4, -1, "which ARC list contains this buffer"], + "atype": [7, -1, "ARC header type (data or metadata)"], + "flags": [9, -1, "ARC read flags"], + "count": [5, -1, "ARC data count"], + "asize": [7, 1024, "size of this ARC buffer"], + "access": [10, -1, "time this ARC buffer was last accessed"], + "mru": [5, 1000, "hits while on the ARC's MRU list"], + "gmru": [5, 1000, "hits while on the ARC's MRU ghost list"], + "mfu": [5, 1000, "hits while on the ARC's MFU list"], + "gmfu": [5, 1000, "hits while on the ARC's MFU ghost list"], + "l2": [5, 1000, "hits while on the L2ARC"], + "l2_dattr": [8, -1, "L2ARC disk address/offset"], + "l2_asize": [8, 1024, "L2ARC alloc'd size (depending on compression)"], + "l2_comp": [21, -1, "L2ARC compression algorithm for buffer"], + "aholds": [6, 1000, "number of holds on this ARC buffer"], + "dtype": [27, -1, "dnode type"], + "btype": [27, -1, "bonus buffer type"], + "data_bs": [7, 1024, "data block size"], + "meta_bs": [7, 1024, "metadata block size"], + "bsize": [6, 1024, "bonus buffer size"], + "lvls": [6, -1, "number of indirection levels"], + "dholds": [6, 1000, "number of holds on dnode"], + "blocks": 
[8, 1000, "number of allocated blocks"], + "dsize": [12, 1024, "size of dnode"], + "cached": [6, 1024, "bytes cached for all blocks"], + "direct": [6, 1024, "bytes cached for direct blocks"], + "indirect": [8, 1024, "bytes cached for indirect blocks"], + "bonus": [5, 1024, "bytes cached for bonus buffer"], + "spill": [5, 1024, "bytes cached for spill block"], +} + +hdr = None +xhdr = None +sep = " " # Default separator is 2 spaces +cmd = ("Usage: dbufstat [-bdhnrtvx] [-i file] [-f fields] [-o file] " + "[-s string] [-F filter]\n") +raw = 0 + + +def print_incompat_helper(incompat): + cnt = 0 + for key in sorted(incompat): + if cnt is 0: + sys.stderr.write("\t") + elif cnt > 8: + sys.stderr.write(",\n\t") + cnt = 0 + else: + sys.stderr.write(", ") + + sys.stderr.write("%s" % key) + cnt += 1 + + sys.stderr.write("\n\n") + + +def detailed_usage(): + sys.stderr.write("%s\n" % cmd) + + sys.stderr.write("Field definitions incompatible with '-b' option:\n") + print_incompat_helper(bincompat) + + sys.stderr.write("Field definitions incompatible with '-d' option:\n") + print_incompat_helper(dincompat) + + sys.stderr.write("Field definitions incompatible with '-t' option:\n") + print_incompat_helper(tincompat) + + sys.stderr.write("Field definitions are as follows:\n") + for key in sorted(cols.keys()): + sys.stderr.write("%11s : %s\n" % (key, cols[key][2])) + sys.stderr.write("\n") + + sys.exit(0) + + +def usage(): + sys.stderr.write("%s\n" % cmd) + sys.stderr.write("\t -b : Print table of information for each dbuf\n") + sys.stderr.write("\t -d : Print table of information for each dnode\n") + sys.stderr.write("\t -h : Print this help message\n") + sys.stderr.write("\t -n : Exclude header from output\n") + sys.stderr.write("\t -r : Print raw values\n") + sys.stderr.write("\t -t : Print table of information for each dnode type" + "\n") + sys.stderr.write("\t -v : List all possible field headers and definitions" + "\n") + sys.stderr.write("\t -x : Print extended stats\n") + 
sys.stderr.write("\t -i : Redirect input from the specified file\n") + sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") + sys.stderr.write("\t -o : Redirect output to the specified file\n") + sys.stderr.write("\t -s : Override default field separator with custom " + "character or string\n") + sys.stderr.write("\t -F : Filter output by value or regex\n") + sys.stderr.write("\nExamples:\n") + sys.stderr.write("\tdbufstat -d -o /tmp/d.log\n") + sys.stderr.write("\tdbufstat -t -s \",\" -o /tmp/t.log\n") + sys.stderr.write("\tdbufstat -v\n") + sys.stderr.write("\tdbufstat -d -f pool,object,objset,dsize,cached\n") + sys.stderr.write("\tdbufstat -bx -F dbc=1,objset=54,pool=testpool\n") + sys.stderr.write("\n") + + sys.exit(1) + + +def prettynum(sz, scale, num=0): + global raw + + suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'] + index = 0 + save = 0 + + if raw or scale == -1: + return "%*s" % (sz, num) + + # Rounding error, return 0 + elif 0 < num < 1: + num = 0 + + while num > scale and index < 5: + save = num + num = num / scale + index += 1 + + if index == 0: + return "%*d" % (sz, num) + + if (save / scale) < 10: + return "%*.1f%s" % (sz - 1, num, suffix[index]) + else: + return "%*d%s" % (sz - 1, num, suffix[index]) + + +def print_values(v): + global hdr + global sep + + try: + for col in hdr: + sys.stdout.write("%s%s" % ( + prettynum(cols[col][0], cols[col][1], v[col]), sep)) + sys.stdout.write("\n") + except IOError as e: + if e.errno == errno.EPIPE: + sys.exit(1) + + +def print_header(): + global hdr + global sep + + try: + for col in hdr: + sys.stdout.write("%*s%s" % (cols[col][0], col, sep)) + sys.stdout.write("\n") + except IOError as e: + if e.errno == errno.EPIPE: + sys.exit(1) + + +def get_typestring(t): + ot_strings = [ + "DMU_OT_NONE", + # general: + "DMU_OT_OBJECT_DIRECTORY", + "DMU_OT_OBJECT_ARRAY", + "DMU_OT_PACKED_NVLIST", + "DMU_OT_PACKED_NVLIST_SIZE", + "DMU_OT_BPOBJ", + "DMU_OT_BPOBJ_HDR", + # spa: + 
"DMU_OT_SPACE_MAP_HEADER", + "DMU_OT_SPACE_MAP", + # zil: + "DMU_OT_INTENT_LOG", + # dmu: + "DMU_OT_DNODE", + "DMU_OT_OBJSET", + # dsl: + "DMU_OT_DSL_DIR", + "DMU_OT_DSL_DIR_CHILD_MAP", + "DMU_OT_DSL_DS_SNAP_MAP", + "DMU_OT_DSL_PROPS", + "DMU_OT_DSL_DATASET", + # zpl: + "DMU_OT_ZNODE", + "DMU_OT_OLDACL", + "DMU_OT_PLAIN_FILE_CONTENTS", + "DMU_OT_DIRECTORY_CONTENTS", + "DMU_OT_MASTER_NODE", + "DMU_OT_UNLINKED_SET", + # zvol: + "DMU_OT_ZVOL", + "DMU_OT_ZVOL_PROP", + # other; for testing only! + "DMU_OT_PLAIN_OTHER", + "DMU_OT_UINT64_OTHER", + "DMU_OT_ZAP_OTHER", + # new object types: + "DMU_OT_ERROR_LOG", + "DMU_OT_SPA_HISTORY", + "DMU_OT_SPA_HISTORY_OFFSETS", + "DMU_OT_POOL_PROPS", + "DMU_OT_DSL_PERMS", + "DMU_OT_ACL", + "DMU_OT_SYSACL", + "DMU_OT_FUID", + "DMU_OT_FUID_SIZE", + "DMU_OT_NEXT_CLONES", + "DMU_OT_SCAN_QUEUE", + "DMU_OT_USERGROUP_USED", + "DMU_OT_USERGROUP_QUOTA", + "DMU_OT_USERREFS", + "DMU_OT_DDT_ZAP", + "DMU_OT_DDT_STATS", + "DMU_OT_SA", + "DMU_OT_SA_MASTER_NODE", + "DMU_OT_SA_ATTR_REGISTRATION", + "DMU_OT_SA_ATTR_LAYOUTS", + "DMU_OT_SCAN_XLATE", + "DMU_OT_DEDUP", + "DMU_OT_DEADLIST", + "DMU_OT_DEADLIST_HDR", + "DMU_OT_DSL_CLONES", + "DMU_OT_BPOBJ_SUBOBJ"] + otn_strings = { + 0x80: "DMU_OTN_UINT8_DATA", + 0xc0: "DMU_OTN_UINT8_METADATA", + 0x81: "DMU_OTN_UINT16_DATA", + 0xc1: "DMU_OTN_UINT16_METADATA", + 0x82: "DMU_OTN_UINT32_DATA", + 0xc2: "DMU_OTN_UINT32_METADATA", + 0x83: "DMU_OTN_UINT64_DATA", + 0xc3: "DMU_OTN_UINT64_METADATA", + 0x84: "DMU_OTN_ZAP_DATA", + 0xc4: "DMU_OTN_ZAP_METADATA", + 0xa0: "DMU_OTN_UINT8_ENC_DATA", + 0xe0: "DMU_OTN_UINT8_ENC_METADATA", + 0xa1: "DMU_OTN_UINT16_ENC_DATA", + 0xe1: "DMU_OTN_UINT16_ENC_METADATA", + 0xa2: "DMU_OTN_UINT32_ENC_DATA", + 0xe2: "DMU_OTN_UINT32_ENC_METADATA", + 0xa3: "DMU_OTN_UINT64_ENC_DATA", + 0xe3: "DMU_OTN_UINT64_ENC_METADATA", + 0xa4: "DMU_OTN_ZAP_ENC_DATA", + 0xe4: "DMU_OTN_ZAP_ENC_METADATA"} + + # If "-rr" option is used, don't convert to string representation + if raw > 1: + return "%i" % t + + 
try: + if t < len(ot_strings): + return ot_strings[t] + else: + return otn_strings[t] + except (IndexError, KeyError): + return "(UNKNOWN)" + + +def get_compstring(c): + comp_strings = ["ZIO_COMPRESS_INHERIT", "ZIO_COMPRESS_ON", + "ZIO_COMPRESS_OFF", "ZIO_COMPRESS_LZJB", + "ZIO_COMPRESS_EMPTY", "ZIO_COMPRESS_GZIP_1", + "ZIO_COMPRESS_GZIP_2", "ZIO_COMPRESS_GZIP_3", + "ZIO_COMPRESS_GZIP_4", "ZIO_COMPRESS_GZIP_5", + "ZIO_COMPRESS_GZIP_6", "ZIO_COMPRESS_GZIP_7", + "ZIO_COMPRESS_GZIP_8", "ZIO_COMPRESS_GZIP_9", + "ZIO_COMPRESS_ZLE", "ZIO_COMPRESS_LZ4", + "ZIO_COMPRESS_ZSTD", "ZIO_COMPRESS_FUNCTION"] + + # If "-rr" option is used, don't convert to string representation + if raw > 1: + return "%i" % c + + try: + return comp_strings[c] + except IndexError: + return "%i" % c + + +def parse_line(line, labels): + global hdr + + new = dict() + val = None + for col in hdr: + # These are "special" fields computed in the update_dict + # function, prevent KeyError exception on labels[col] for these. + if col not in ['bonus', 'cached', 'direct', 'indirect', 'spill']: + val = line[labels[col]] + + if col in ['pool', 'flags']: + new[col] = str(val) + elif col in ['dtype', 'btype']: + new[col] = get_typestring(int(val)) + elif col in ['l2_comp']: + new[col] = get_compstring(int(val)) + else: + new[col] = int(val) + + return new + + +def update_dict(d, k, line, labels): + pool = line[labels['pool']] + objset = line[labels['objset']] + key = line[labels[k]] + + dbsize = int(line[labels['dbsize']]) + blkid = int(line[labels['blkid']]) + level = int(line[labels['level']]) + + if pool not in d: + d[pool] = dict() + + if objset not in d[pool]: + d[pool][objset] = dict() + + if key not in d[pool][objset]: + d[pool][objset][key] = parse_line(line, labels) + d[pool][objset][key]['bonus'] = 0 + d[pool][objset][key]['cached'] = 0 + d[pool][objset][key]['direct'] = 0 + d[pool][objset][key]['indirect'] = 0 + d[pool][objset][key]['spill'] = 0 + + d[pool][objset][key]['cached'] += dbsize + + if blkid 
== -1: + d[pool][objset][key]['bonus'] += dbsize + elif blkid == -2: + d[pool][objset][key]['spill'] += dbsize + else: + if level == 0: + d[pool][objset][key]['direct'] += dbsize + else: + d[pool][objset][key]['indirect'] += dbsize + + return d + + +def skip_line(vals, filters): + ''' + Determines if a line should be skipped during printing + based on a set of filters + ''' + if len(filters) == 0: + return False + + for key in vals: + if key in filters: + val = prettynum(cols[key][0], cols[key][1], vals[key]).strip() + # we want a full match here + if re.match("(?:" + filters[key] + r")\Z", val) is None: + return True + + return False + + +def print_dict(d, filters, noheader): + if not noheader: + print_header() + for pool in list(d.keys()): + for objset in list(d[pool].keys()): + for v in list(d[pool][objset].values()): + if not skip_line(v, filters): + print_values(v) + + +def dnodes_build_dict(filehandle): + labels = dict() + dnodes = dict() + + # First 3 lines are header information, skip the first two + for i in range(2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + # The rest of the file is buffer information + for line in filehandle: + update_dict(dnodes, 'object', line.split(), labels) + + return dnodes + + +def types_build_dict(filehandle): + labels = dict() + types = dict() + + # First 3 lines are header information, skip the first two + for i in range(2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + # The rest of the file is buffer information + for line in filehandle: + update_dict(types, 'dtype', line.split(), labels) + + return types + + +def buffers_print_all(filehandle, filters, noheader): + labels = dict() + + # First 3 lines are header information, skip the first two + for i in range(2): + next(filehandle) + + # The third line contains the 
labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + if not noheader: + print_header() + + # The rest of the file is buffer information + for line in filehandle: + vals = parse_line(line.split(), labels) + if not skip_line(vals, filters): + print_values(vals) + + +def main(): + global hdr + global sep + global raw + + desired_cols = None + bflag = False + dflag = False + hflag = False + ifile = None + ofile = None + tflag = False + vflag = False + xflag = False + nflag = False + filters = dict() + + try: + opts, args = getopt.getopt( + sys.argv[1:], + "bdf:hi:o:rs:tvxF:n", + [ + "buffers", + "dnodes", + "columns", + "help", + "infile", + "outfile", + "separator", + "types", + "verbose", + "extended", + "filter" + ] + ) + except getopt.error: + usage() + opts = None + + for opt, arg in opts: + if opt in ('-b', '--buffers'): + bflag = True + if opt in ('-d', '--dnodes'): + dflag = True + if opt in ('-f', '--columns'): + desired_cols = arg + if opt in ('-h', '--help'): + hflag = True + if opt in ('-i', '--infile'): + ifile = arg + if opt in ('-o', '--outfile'): + ofile = arg + if opt in ('-r', '--raw'): + raw += 1 + if opt in ('-s', '--separator'): + sep = arg + if opt in ('-t', '--types'): + tflag = True + if opt in ('-v', '--verbose'): + vflag = True + if opt in ('-x', '--extended'): + xflag = True + if opt in ('-n', '--noheader'): + nflag = True + if opt in ('-F', '--filter'): + fils = [x.strip() for x in arg.split(",")] + + for fil in fils: + f = [x.strip() for x in fil.split("=")] + + if len(f) != 2: + sys.stderr.write("Invalid filter '%s'.\n" % fil) + sys.exit(1) + + if f[0] not in cols: + sys.stderr.write("Invalid field '%s' in filter.\n" % f[0]) + sys.exit(1) + + if f[0] in filters: + sys.stderr.write("Field '%s' specified multiple times in " + "filter.\n" % f[0]) + sys.exit(1) + + try: + re.compile("(?:" + f[1] + r")\Z") + except re.error: + sys.stderr.write("Invalid regex for field '%s' in " + "filter.\n" % f[0]) 
+ sys.exit(1) + + filters[f[0]] = f[1] + + if hflag or (xflag and desired_cols): + usage() + + if vflag: + detailed_usage() + + # Ensure at most only one of b, d, or t flags are set + if (bflag and dflag) or (bflag and tflag) or (dflag and tflag): + usage() + + if bflag: + hdr = bxhdr if xflag else bhdr + elif tflag: + hdr = txhdr if xflag else thdr + else: # Even if dflag is False, it's the default if none set + dflag = True + hdr = dxhdr if xflag else dhdr + + if desired_cols: + hdr = desired_cols.split(",") + + invalid = [] + incompat = [] + for ele in hdr: + if ele not in cols: + invalid.append(ele) + elif ((bflag and bincompat and ele in bincompat) or + (dflag and dincompat and ele in dincompat) or + (tflag and tincompat and ele in tincompat)): + incompat.append(ele) + + if len(invalid) > 0: + sys.stderr.write("Invalid column definition! -- %s\n" % invalid) + usage() + + if len(incompat) > 0: + sys.stderr.write("Incompatible field specified! -- %s\n" % + incompat) + usage() + + if ofile: + try: + tmp = open(ofile, "w") + sys.stdout = tmp + + except IOError: + sys.stderr.write("Cannot open %s for writing\n" % ofile) + sys.exit(1) + + if not ifile: + ifile = '/proc/spl/kstat/zfs/dbufs' + + if ifile is not "-": + try: + tmp = open(ifile, "r") + sys.stdin = tmp + except IOError: + sys.stderr.write("Cannot open %s for reading\n" % ifile) + sys.exit(1) + + if bflag: + buffers_print_all(sys.stdin, filters, nflag) + + if dflag: + print_dict(dnodes_build_dict(sys.stdin), filters, nflag) + + if tflag: + print_dict(types_build_dict(sys.stdin), filters, nflag) + + +if __name__ == '__main__': + main() diff --git a/cmd/fsck_zfs/Makefile.am b/cmd/fsck_zfs/Makefile.am new file mode 100644 index 000000000000..2380f56fa4d4 --- /dev/null +++ b/cmd/fsck_zfs/Makefile.am @@ -0,0 +1 @@ +dist_sbin_SCRIPTS = fsck.zfs diff --git a/cmd/fsck_zfs/fsck.zfs b/cmd/fsck_zfs/fsck.zfs new file mode 100755 index 000000000000..129a7f39c388 --- /dev/null +++ b/cmd/fsck_zfs/fsck.zfs @@ -0,0 +1,9 @@ 
+#!/bin/sh +# +# fsck.zfs: A fsck helper to accommodate distributions that expect +# to be able to execute a fsck on all filesystem types. Currently +# this script does nothing but it could be extended to act as a +# compatibility wrapper for 'zpool scrub'. +# + +exit 0 diff --git a/cmd/mount_zfs/.gitignore b/cmd/mount_zfs/.gitignore new file mode 100644 index 000000000000..cd9254bde3da --- /dev/null +++ b/cmd/mount_zfs/.gitignore @@ -0,0 +1 @@ +mount.zfs diff --git a/cmd/mount_zfs/Makefile.am b/cmd/mount_zfs/Makefile.am new file mode 100644 index 000000000000..6c4d6ff79f16 --- /dev/null +++ b/cmd/mount_zfs/Makefile.am @@ -0,0 +1,20 @@ +include $(top_srcdir)/config/Rules.am + +# +# Ignore the prefix for the mount helper. It must be installed in /sbin/ +# because this path is hardcoded in the mount(8) for security reasons. +# However, if needed, the configure option --with-mounthelperdir= can be used +# to override the default install location. +# +sbindir=$(mounthelperdir) +sbin_PROGRAMS = mount.zfs + +mount_zfs_SOURCES = \ + mount_zfs.c + +mount_zfs_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la + +mount_zfs_LDADD += $(LTLIBINTL) diff --git a/cmd/mount_zfs/mount_zfs.c b/cmd/mount_zfs/mount_zfs.c new file mode 100644 index 000000000000..87d2ccadcded --- /dev/null +++ b/cmd/mount_zfs/mount_zfs.c @@ -0,0 +1,408 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Lawrence Livermore National Security, LLC. + */ + +#include <libintl.h> +#include <unistd.h> +#include <sys/file.h> +#include <sys/mount.h> +#include <sys/mntent.h> +#include <sys/stat.h> +#include <libzfs.h> +#include <libzutil.h> +#include <locale.h> +#include <getopt.h> +#include <fcntl.h> +#include <errno.h> + +#define ZS_COMMENT 0x00000000 /* comment */ +#define ZS_ZFSUTIL 0x00000001 /* caller is zfs(8) */ + +libzfs_handle_t *g_zfs; + +/* + * Return the pool/dataset to mount given the name passed to mount. This + * is expected to be of the form pool/dataset, however may also refer to + * a block device if that device contains a valid zfs label. + */ +static char * +parse_dataset(char *dataset) +{ + char cwd[PATH_MAX]; + struct stat64 statbuf; + int error; + int len; + + /* + * We expect a pool/dataset to be provided, however if we're + * given a device which is a member of a zpool we attempt to + * extract the pool name stored in the label. Given the pool + * name we can mount the root dataset. 
/*
 * Report whether /etc/mtab can be updated: returns 1 when it is a
 * regular (non-symlink) file that can be opened read-write, creating
 * it if absent; 0 otherwise.  On systems where /etc/mtab is a symlink
 * to /proc/self/mounts no manual update is possible or necessary.
 */
static int
mtab_is_writeable(void)
{
	struct stat sb;
	int probe;

	/* A symlinked mtab (e.g. -> /proc/self/mounts) is never updated. */
	if (lstat("/etc/mtab", &sb) != 0 || S_ISLNK(sb.st_mode))
		return (0);

	/* Probe write access; O_CREAT preserves the legacy behavior. */
	probe = open("/etc/mtab", O_RDWR | O_CREAT, 0644);
	if (probe < 0)
		return (0);

	(void) close(probe);
	return (1);
}
mntopts : ""; + mnt.mnt_freq = 0; + mnt.mnt_passno = 0; + + fp = setmntent("/etc/mtab", "a+"); + if (!fp) { + (void) fprintf(stderr, gettext( + "filesystem '%s' was mounted, but /etc/mtab " + "could not be opened due to error %d\n"), + dataset, errno); + return (MOUNT_FILEIO); + } + + error = addmntent(fp, &mnt); + if (error) { + (void) fprintf(stderr, gettext( + "filesystem '%s' was mounted, but /etc/mtab " + "could not be updated due to error %d\n"), + dataset, errno); + return (MOUNT_FILEIO); + } + + (void) endmntent(fp); + + return (MOUNT_SUCCESS); +} + +int +main(int argc, char **argv) +{ + zfs_handle_t *zhp; + char prop[ZFS_MAXPROPLEN]; + uint64_t zfs_version = 0; + char mntopts[MNT_LINE_MAX] = { '\0' }; + char badopt[MNT_LINE_MAX] = { '\0' }; + char mtabopt[MNT_LINE_MAX] = { '\0' }; + char mntpoint[PATH_MAX]; + char *dataset; + unsigned long mntflags = 0, zfsflags = 0, remount = 0; + int sloppy = 0, fake = 0, verbose = 0, nomtab = 0, zfsutil = 0; + int error, c; + + (void) setlocale(LC_ALL, ""); + (void) textdomain(TEXT_DOMAIN); + + opterr = 0; + + /* check options */ + while ((c = getopt_long(argc, argv, "sfnvo:h?", 0, 0)) != -1) { + switch (c) { + case 's': + sloppy = 1; + break; + case 'f': + fake = 1; + break; + case 'n': + nomtab = 1; + break; + case 'v': + verbose++; + break; + case 'o': + (void) strlcpy(mntopts, optarg, sizeof (mntopts)); + break; + case 'h': + case '?': + (void) fprintf(stderr, gettext("Invalid option '%c'\n"), + optopt); + (void) fprintf(stderr, gettext("Usage: mount.zfs " + "[-sfnv] [-o options] <dataset> <mountpoint>\n")); + return (MOUNT_USAGE); + } + } + + argc -= optind; + argv += optind; + + /* check that we only have two arguments */ + if (argc != 2) { + if (argc == 0) + (void) fprintf(stderr, gettext("missing dataset " + "argument\n")); + else if (argc == 1) + (void) fprintf(stderr, + gettext("missing mountpoint argument\n")); + else + (void) fprintf(stderr, gettext("too many arguments\n")); + (void) fprintf(stderr, "usage: 
mount <dataset> <mountpoint>\n"); + return (MOUNT_USAGE); + } + + dataset = parse_dataset(argv[0]); + + /* canonicalize the mount point */ + if (realpath(argv[1], mntpoint) == NULL) { + (void) fprintf(stderr, gettext("filesystem '%s' cannot be " + "mounted at '%s' due to canonicalization error %d.\n"), + dataset, argv[1], errno); + return (MOUNT_SYSERR); + } + + /* validate mount options and set mntflags */ + error = zfs_parse_mount_options(mntopts, &mntflags, &zfsflags, sloppy, + badopt, mtabopt); + if (error) { + switch (error) { + case ENOMEM: + (void) fprintf(stderr, gettext("filesystem '%s' " + "cannot be mounted due to a memory allocation " + "failure.\n"), dataset); + return (MOUNT_SYSERR); + case ENOENT: + (void) fprintf(stderr, gettext("filesystem '%s' " + "cannot be mounted due to invalid option " + "'%s'.\n"), dataset, badopt); + (void) fprintf(stderr, gettext("Use the '-s' option " + "to ignore the bad mount option.\n")); + return (MOUNT_USAGE); + default: + (void) fprintf(stderr, gettext("filesystem '%s' " + "cannot be mounted due to internal error %d.\n"), + dataset, error); + return (MOUNT_SOFTWARE); + } + } + + if (verbose) + (void) fprintf(stdout, gettext("mount.zfs:\n" + " dataset: \"%s\"\n mountpoint: \"%s\"\n" + " mountflags: 0x%lx\n zfsflags: 0x%lx\n" + " mountopts: \"%s\"\n mtabopts: \"%s\"\n"), + dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt); + + if (mntflags & MS_REMOUNT) { + nomtab = 1; + remount = 1; + } + + if (zfsflags & ZS_ZFSUTIL) + zfsutil = 1; + + if ((g_zfs = libzfs_init()) == NULL) { + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); + return (MOUNT_SYSERR); + } + + /* try to open the dataset to access the mount point */ + if ((zhp = zfs_open(g_zfs, dataset, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT)) == NULL) { + (void) fprintf(stderr, gettext("filesystem '%s' cannot be " + "mounted, unable to open the dataset\n"), dataset); + libzfs_fini(g_zfs); + return (MOUNT_USAGE); + } + + zfs_adjust_mount_options(zhp, 
mntpoint, mntopts, mtabopt); + + /* treat all snapshots as legacy mount points */ + if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) + (void) strlcpy(prop, ZFS_MOUNTPOINT_LEGACY, ZFS_MAXPROPLEN); + else + (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, prop, + sizeof (prop), NULL, NULL, 0, B_FALSE); + + /* + * Fetch the max supported zfs version in case we get ENOTSUP + * back from the mount command, since we need the zfs handle + * to do so. + */ + zfs_version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + if (zfs_version == 0) { + fprintf(stderr, gettext("unable to fetch " + "ZFS version for filesystem '%s'\n"), dataset); + return (MOUNT_SYSERR); + } + + zfs_close(zhp); + libzfs_fini(g_zfs); + + /* + * Legacy mount points may only be mounted using 'mount', never using + * 'zfs mount'. However, since 'zfs mount' actually invokes 'mount' + * we differentiate the two cases using the 'zfsutil' mount option. + * This mount option should only be supplied by the 'zfs mount' util. + * + * The only exception to the above rule is '-o remount' which is + * always allowed for non-legacy datasets. This is done because when + * using zfs as your root file system both rc.sysinit/umountroot and + * systemd depend on 'mount -o remount <mountpoint>' to work. 
+ */ + if (zfsutil && (strcmp(prop, ZFS_MOUNTPOINT_LEGACY) == 0)) { + (void) fprintf(stderr, gettext( + "filesystem '%s' cannot be mounted using 'zfs mount'.\n" + "Use 'zfs set mountpoint=%s' or 'mount -t zfs %s %s'.\n" + "See zfs(8) for more information.\n"), + dataset, mntpoint, dataset, mntpoint); + return (MOUNT_USAGE); + } + + if (!zfsutil && !(remount || fake) && + strcmp(prop, ZFS_MOUNTPOINT_LEGACY)) { + (void) fprintf(stderr, gettext( + "filesystem '%s' cannot be mounted using 'mount'.\n" + "Use 'zfs set mountpoint=%s' or 'zfs mount %s'.\n" + "See zfs(8) for more information.\n"), + dataset, "legacy", dataset); + return (MOUNT_USAGE); + } + + if (!fake) { + error = mount(dataset, mntpoint, MNTTYPE_ZFS, + mntflags, mntopts); + } + + if (error) { + switch (errno) { + case ENOENT: + (void) fprintf(stderr, gettext("mount point " + "'%s' does not exist\n"), mntpoint); + return (MOUNT_SYSERR); + case EBUSY: + (void) fprintf(stderr, gettext("filesystem " + "'%s' is already mounted\n"), dataset); + return (MOUNT_BUSY); + case ENOTSUP: + if (zfs_version > ZPL_VERSION) { + (void) fprintf(stderr, + gettext("filesystem '%s' (v%d) is not " + "supported by this implementation of " + "ZFS (max v%d).\n"), dataset, + (int)zfs_version, (int)ZPL_VERSION); + } else { + (void) fprintf(stderr, + gettext("filesystem '%s' mount " + "failed for unknown reason.\n"), dataset); + } + return (MOUNT_SYSERR); +#ifdef MS_MANDLOCK + case EPERM: + if (mntflags & MS_MANDLOCK) { + (void) fprintf(stderr, gettext("filesystem " + "'%s' has the 'nbmand=on' property set, " + "this mount\noption may be disabled in " + "your kernel. 
Use 'zfs set nbmand=off'\n" + "to disable this option and try to " + "mount the filesystem again.\n"), dataset); + return (MOUNT_SYSERR); + } + /* fallthru */ +#endif + default: + (void) fprintf(stderr, gettext("filesystem " + "'%s' can not be mounted: %s\n"), dataset, + strerror(errno)); + return (MOUNT_USAGE); + } + } + + if (!nomtab && mtab_is_writeable()) { + error = mtab_update(dataset, mntpoint, MNTTYPE_ZFS, mtabopt); + if (error) + return (error); + } + + return (MOUNT_SUCCESS); +} diff --git a/cmd/raidz_test/.gitignore b/cmd/raidz_test/.gitignore new file mode 100644 index 000000000000..f8b83d9cce03 --- /dev/null +++ b/cmd/raidz_test/.gitignore @@ -0,0 +1 @@ +/raidz_test diff --git a/cmd/raidz_test/Makefile.am b/cmd/raidz_test/Makefile.am new file mode 100644 index 000000000000..72c914e641e4 --- /dev/null +++ b/cmd/raidz_test/Makefile.am @@ -0,0 +1,20 @@ +include $(top_srcdir)/config/Rules.am + +# Includes kernel code, generate warnings for large stack frames +AM_CFLAGS += $(FRAME_LARGER_THAN) + +# Unconditionally enable ASSERTs +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG + +bin_PROGRAMS = raidz_test + +raidz_test_SOURCES = \ + raidz_test.h \ + raidz_test.c \ + raidz_bench.c + +raidz_test_LDADD = \ + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la + +raidz_test_LDADD += -lm diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c new file mode 100644 index 000000000000..8a2cec4ca685 --- /dev/null +++ b/cmd/raidz_test/raidz_bench.c @@ -0,0 +1,227 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2016 Gvozden NeÅ¡ković. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <sys/zio.h> +#include <sys/vdev_raidz.h> +#include <sys/vdev_raidz_impl.h> +#include <stdio.h> + +#include <sys/time.h> + +#include "raidz_test.h" + +#define GEN_BENCH_MEMORY (((uint64_t)1ULL)<<32) +#define REC_BENCH_MEMORY (((uint64_t)1ULL)<<29) +#define BENCH_ASHIFT 12 +#define MIN_CS_SHIFT BENCH_ASHIFT +#define MAX_CS_SHIFT SPA_MAXBLOCKSHIFT + +static zio_t zio_bench; +static raidz_map_t *rm_bench; +static size_t max_data_size = SPA_MAXBLOCKSIZE; + +static void +bench_init_raidz_map(void) +{ + zio_bench.io_offset = 0; + zio_bench.io_size = max_data_size; + + /* + * To permit larger column sizes these have to be done + * allocated using aligned alloc instead of zio_abd_buf_alloc + */ + zio_bench.io_abd = raidz_alloc(max_data_size); + + init_zio_abd(&zio_bench); +} + +static void +bench_fini_raidz_maps(void) +{ + /* tear down golden zio */ + raidz_free(zio_bench.io_abd, max_data_size); + bzero(&zio_bench, sizeof (zio_t)); +} + +static inline void +run_gen_bench_impl(const char *impl) +{ + int fn, ncols; + uint64_t ds, iter_cnt, iter, disksize; + hrtime_t start; + double elapsed, d_bw; + + /* Benchmark generate functions */ + for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + + for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) { + /* create suitable raidz_map */ + ncols = rto_opts.rto_dcols + fn + 1; + zio_bench.io_size = 1ULL << ds; 
+ rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, fn+1); + + /* estimate iteration count */ + iter_cnt = GEN_BENCH_MEMORY; + iter_cnt /= zio_bench.io_size; + + start = gethrtime(); + for (iter = 0; iter < iter_cnt; iter++) + vdev_raidz_generate_parity(rm_bench); + elapsed = NSEC2SEC((double)(gethrtime() - start)); + + disksize = (1ULL << ds) / rto_opts.rto_dcols; + d_bw = (double)iter_cnt * (double)disksize; + d_bw /= (1024.0 * 1024.0 * elapsed); + + LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n", + impl, + raidz_gen_name[fn], + rto_opts.rto_dcols, + (1ULL<<ds), + d_bw, + d_bw * (double)(ncols), + (unsigned)iter_cnt); + + vdev_raidz_map_free(rm_bench); + } + } +} + +static void +run_gen_bench(void) +{ + char **impl_name; + + LOG(D_INFO, DBLSEP "\nBenchmarking parity generation...\n\n"); + LOG(D_ALL, "impl, math, dcols, iosize, disk_bw, total_bw, iter\n"); + + for (impl_name = (char **)raidz_impl_names; *impl_name != NULL; + impl_name++) { + + if (vdev_raidz_impl_set(*impl_name) != 0) + continue; + + run_gen_bench_impl(*impl_name); + } +} + +static void +run_rec_bench_impl(const char *impl) +{ + int fn, ncols, nbad; + uint64_t ds, iter_cnt, iter, disksize; + hrtime_t start; + double elapsed, d_bw; + static const int tgt[7][3] = { + {1, 2, 3}, /* rec_p: bad QR & D[0] */ + {0, 2, 3}, /* rec_q: bad PR & D[0] */ + {0, 1, 3}, /* rec_r: bad PQ & D[0] */ + {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ + {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ + {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ + {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ + }; + + for (fn = 0; fn < RAIDZ_REC_NUM; fn++) { + for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) { + + /* create suitable raidz_map */ + ncols = rto_opts.rto_dcols + PARITY_PQR; + zio_bench.io_size = 1ULL << ds; + + /* + * raidz block is too short to test + * the requested method + */ + if (zio_bench.io_size / rto_opts.rto_dcols < + (1ULL << BENCH_ASHIFT)) + continue; + + rm_bench = vdev_raidz_map_alloc(&zio_bench, + 
BENCH_ASHIFT, ncols, PARITY_PQR); + + /* estimate iteration count */ + iter_cnt = (REC_BENCH_MEMORY); + iter_cnt /= zio_bench.io_size; + + /* calculate how many bad columns there are */ + nbad = MIN(3, raidz_ncols(rm_bench) - + raidz_parity(rm_bench)); + + start = gethrtime(); + for (iter = 0; iter < iter_cnt; iter++) + vdev_raidz_reconstruct(rm_bench, tgt[fn], nbad); + elapsed = NSEC2SEC((double)(gethrtime() - start)); + + disksize = (1ULL << ds) / rto_opts.rto_dcols; + d_bw = (double)iter_cnt * (double)(disksize); + d_bw /= (1024.0 * 1024.0 * elapsed); + + LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n", + impl, + raidz_rec_name[fn], + rto_opts.rto_dcols, + (1ULL<<ds), + d_bw, + d_bw * (double)ncols, + (unsigned)iter_cnt); + + vdev_raidz_map_free(rm_bench); + } + } +} + +static void +run_rec_bench(void) +{ + char **impl_name; + + LOG(D_INFO, DBLSEP "\nBenchmarking data reconstruction...\n\n"); + LOG(D_ALL, "impl, math, dcols, iosize, disk_bw, total_bw, iter\n"); + + for (impl_name = (char **)raidz_impl_names; *impl_name != NULL; + impl_name++) { + + if (vdev_raidz_impl_set(*impl_name) != 0) + continue; + + run_rec_bench_impl(*impl_name); + } +} + +void +run_raidz_benchmark(void) +{ + bench_init_raidz_map(); + + run_gen_bench(); + run_rec_bench(); + + bench_fini_raidz_maps(); +} diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c new file mode 100644 index 000000000000..66f36b0d56ca --- /dev/null +++ b/cmd/raidz_test/raidz_test.c @@ -0,0 +1,782 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2016 Gvozden NeÅ¡ković. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <sys/zio.h> +#include <umem.h> +#include <sys/vdev_raidz.h> +#include <sys/vdev_raidz_impl.h> +#include <assert.h> +#include <stdio.h> +#include "raidz_test.h" + +static int *rand_data; +raidz_test_opts_t rto_opts; + +static char gdb[256]; +static const char gdb_tmpl[] = "gdb -ex \"set pagination 0\" -p %d"; + +static void sig_handler(int signo) +{ + struct sigaction action; + /* + * Restore default action and re-raise signal so SIGSEGV and + * SIGABRT can trigger a core dump. + */ + action.sa_handler = SIG_DFL; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + (void) sigaction(signo, &action, NULL); + + if (rto_opts.rto_gdb) + if (system(gdb)) { } + + raise(signo); +} + +static void print_opts(raidz_test_opts_t *opts, boolean_t force) +{ + char *verbose; + switch (opts->rto_v) { + case 0: + verbose = "no"; + break; + case 1: + verbose = "info"; + break; + default: + verbose = "debug"; + break; + } + + if (force || opts->rto_v >= D_INFO) { + (void) fprintf(stdout, DBLSEP "Running with options:\n" + " (-a) zio ashift : %zu\n" + " (-o) zio offset : 1 << %zu\n" + " (-d) number of raidz data columns : %zu\n" + " (-s) size of DATA : 1 << %zu\n" + " (-S) sweep parameters : %s \n" + " (-v) verbose : %s \n\n", + opts->rto_ashift, /* -a */ + ilog2(opts->rto_offset), /* -o */ + opts->rto_dcols, /* -d */ + ilog2(opts->rto_dsize), /* -s */ + opts->rto_sweep ? 
"yes" : "no", /* -S */ + verbose); /* -v */ + } +} + +static void usage(boolean_t requested) +{ + const raidz_test_opts_t *o = &rto_opts_defaults; + + FILE *fp = requested ? stdout : stderr; + + (void) fprintf(fp, "Usage:\n" + "\t[-a zio ashift (default: %zu)]\n" + "\t[-o zio offset, exponent radix 2 (default: %zu)]\n" + "\t[-d number of raidz data columns (default: %zu)]\n" + "\t[-s zio size, exponent radix 2 (default: %zu)]\n" + "\t[-S parameter sweep (default: %s)]\n" + "\t[-t timeout for parameter sweep test]\n" + "\t[-B benchmark all raidz implementations]\n" + "\t[-v increase verbosity (default: %zu)]\n" + "\t[-h (print help)]\n" + "\t[-T test the test, see if failure would be detected]\n" + "\t[-D debug (attach gdb on SIGSEGV)]\n" + "", + o->rto_ashift, /* -a */ + ilog2(o->rto_offset), /* -o */ + o->rto_dcols, /* -d */ + ilog2(o->rto_dsize), /* -s */ + rto_opts.rto_sweep ? "yes" : "no", /* -S */ + o->rto_v); /* -d */ + + exit(requested ? 0 : 1); +} + +static void process_options(int argc, char **argv) +{ + size_t value; + int opt; + + raidz_test_opts_t *o = &rto_opts; + + bcopy(&rto_opts_defaults, o, sizeof (*o)); + + while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) { + value = 0; + + switch (opt) { + case 'a': + value = strtoull(optarg, NULL, 0); + o->rto_ashift = MIN(13, MAX(9, value)); + break; + case 'o': + value = strtoull(optarg, NULL, 0); + o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9; + break; + case 'd': + value = strtoull(optarg, NULL, 0); + o->rto_dcols = MIN(255, MAX(1, value)); + break; + case 's': + value = strtoull(optarg, NULL, 0); + o->rto_dsize = 1ULL << MIN(SPA_MAXBLOCKSHIFT, + MAX(SPA_MINBLOCKSHIFT, value)); + break; + case 't': + value = strtoull(optarg, NULL, 0); + o->rto_sweep_timeout = value; + break; + case 'v': + o->rto_v++; + break; + case 'S': + o->rto_sweep = 1; + break; + case 'B': + o->rto_benchmark = 1; + break; + case 'D': + o->rto_gdb = 1; + break; + case 'T': + o->rto_sanity = 1; + break; + case 'h': + 
usage(B_TRUE); + break; + case '?': + default: + usage(B_FALSE); + break; + } + } +} + +#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd) +#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size) + +#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd) +#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size) + +static int +cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) +{ + int i, ret = 0; + + VERIFY(parity >= 1 && parity <= 3); + + for (i = 0; i < parity; i++) { + if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i)) + != 0) { + ret++; + LOG_OPT(D_DEBUG, opts, + "\nParity block [%d] different!\n", i); + } + } + return (ret); +} + +static int +cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) +{ + int i, ret = 0; + int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden); + + for (i = 0; i < dcols; i++) { + if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i)) + != 0) { + ret++; + + LOG_OPT(D_DEBUG, opts, + "\nData block [%d] different!\n", i); + } + } + return (ret); +} + +static int +init_rand(void *data, size_t size, void *private) +{ + int i; + int *dst = (int *)data; + + for (i = 0; i < size / sizeof (int); i++) + dst[i] = rand_data[i]; + + return (0); +} + +static void +corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) +{ + int i; + raidz_col_t *col; + + for (i = 0; i < cnt; i++) { + col = &rm->rm_col[tgts[i]]; + abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL); + } +} + +void +init_zio_abd(zio_t *zio) +{ + abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL); +} + +static void +fini_raidz_map(zio_t **zio, raidz_map_t **rm) +{ + vdev_raidz_map_free(*rm); + raidz_free((*zio)->io_abd, (*zio)->io_size); + umem_free(*zio, sizeof (zio_t)); + + *zio = NULL; + *rm = NULL; +} + +static int +init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) +{ + int err = 0; + zio_t *zio_test; + raidz_map_t *rm_test; + const size_t total_ncols 
= opts->rto_dcols + parity; + + if (opts->rm_golden) { + fini_raidz_map(&opts->zio_golden, &opts->rm_golden); + } + + opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); + zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); + + opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset; + opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize; + + opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize); + zio_test->io_abd = raidz_alloc(opts->rto_dsize); + + init_zio_abd(opts->zio_golden); + init_zio_abd(zio_test); + + VERIFY0(vdev_raidz_impl_set("original")); + + opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, + opts->rto_ashift, total_ncols, parity); + rm_test = vdev_raidz_map_alloc(zio_test, + opts->rto_ashift, total_ncols, parity); + + VERIFY(opts->zio_golden); + VERIFY(opts->rm_golden); + + vdev_raidz_generate_parity(opts->rm_golden); + vdev_raidz_generate_parity(rm_test); + + /* sanity check */ + err |= cmp_data(opts, rm_test); + err |= cmp_code(opts, rm_test, parity); + + if (err) + ERR("initializing the golden copy ... 
[FAIL]!\n"); + + /* tear down raidz_map of test zio */ + fini_raidz_map(&zio_test, &rm_test); + + return (err); +} + +static raidz_map_t * +init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) +{ + raidz_map_t *rm = NULL; + const size_t alloc_dsize = opts->rto_dsize; + const size_t total_ncols = opts->rto_dcols + parity; + const int ccols[] = { 0, 1, 2 }; + + VERIFY(zio); + VERIFY(parity <= 3 && parity >= 1); + + *zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); + + (*zio)->io_offset = 0; + (*zio)->io_size = alloc_dsize; + (*zio)->io_abd = raidz_alloc(alloc_dsize); + init_zio_abd(*zio); + + rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, + total_ncols, parity); + VERIFY(rm); + + /* Make sure code columns are destroyed */ + corrupt_colums(rm, ccols, parity); + + return (rm); +} + +static int +run_gen_check(raidz_test_opts_t *opts) +{ + char **impl_name; + int fn, err = 0; + zio_t *zio_test; + raidz_map_t *rm_test; + + err = init_raidz_golden_map(opts, PARITY_PQR); + if (0 != err) + return (err); + + LOG(D_INFO, DBLSEP); + LOG(D_INFO, "Testing parity generation...\n"); + + for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; + impl_name++) { + + LOG(D_INFO, SEP); + LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); + + if (0 != vdev_raidz_impl_set(*impl_name)) { + LOG(D_INFO, "[SKIP]\n"); + continue; + } else { + LOG(D_INFO, "[SUPPORTED]\n"); + } + + for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + /* create suitable raidz_map */ + rm_test = init_raidz_map(opts, &zio_test, fn+1); + VERIFY(rm_test); + + LOG(D_INFO, "\t\tTesting method [%s] ...", + raidz_gen_name[fn]); + + if (!opts->rto_sanity) + vdev_raidz_generate_parity(rm_test); + + if (cmp_code(opts, rm_test, fn+1) != 0) { + LOG(D_INFO, "[FAIL]\n"); + err++; + } else + LOG(D_INFO, "[PASS]\n"); + + fini_raidz_map(&zio_test, &rm_test); + } + } + + fini_raidz_map(&opts->zio_golden, &opts->rm_golden); 
+ + return (err); +} + +static int +run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) +{ + int x0, x1, x2; + int tgtidx[3]; + int err = 0; + static const int rec_tgts[7][3] = { + {1, 2, 3}, /* rec_p: bad QR & D[0] */ + {0, 2, 3}, /* rec_q: bad PR & D[0] */ + {0, 1, 3}, /* rec_r: bad PQ & D[0] */ + {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ + {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ + {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ + {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ + }; + + memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx)); + + if (fn < RAIDZ_REC_PQ) { + /* can reconstruct 1 failed data disk */ + for (x0 = 0; x0 < opts->rto_dcols; x0++) { + if (x0 >= rm->rm_cols - raidz_parity(rm)) + continue; + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + LOG(D_DEBUG, "[%d] ", x0); + + tgtidx[2] = x0 + raidz_parity(rm); + + corrupt_colums(rm, tgtidx+2, 1); + + if (!opts->rto_sanity) + vdev_raidz_reconstruct(rm, tgtidx, 3); + + if (cmp_data(opts, rm) != 0) { + err++; + LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0); + } + } + + } else if (fn < RAIDZ_REC_PQR) { + /* can reconstruct 2 failed data disk */ + for (x0 = 0; x0 < opts->rto_dcols; x0++) { + if (x0 >= rm->rm_cols - raidz_parity(rm)) + continue; + for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { + if (x1 >= rm->rm_cols - raidz_parity(rm)) + continue; + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + LOG(D_DEBUG, "[%d %d] ", x0, x1); + + tgtidx[1] = x0 + raidz_parity(rm); + tgtidx[2] = x1 + raidz_parity(rm); + + corrupt_colums(rm, tgtidx+1, 2); + + if (!opts->rto_sanity) + vdev_raidz_reconstruct(rm, tgtidx, 3); + + if (cmp_data(opts, rm) != 0) { + err++; + LOG(D_DEBUG, "\nREC D[%d %d]... 
" + "[FAIL]\n", x0, x1); + } + } + } + } else { + /* can reconstruct 3 failed data disk */ + for (x0 = 0; x0 < opts->rto_dcols; x0++) { + if (x0 >= rm->rm_cols - raidz_parity(rm)) + continue; + for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { + if (x1 >= rm->rm_cols - raidz_parity(rm)) + continue; + for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) { + if (x2 >= + rm->rm_cols - raidz_parity(rm)) + continue; + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2); + + tgtidx[0] = x0 + raidz_parity(rm); + tgtidx[1] = x1 + raidz_parity(rm); + tgtidx[2] = x2 + raidz_parity(rm); + + corrupt_colums(rm, tgtidx, 3); + + if (!opts->rto_sanity) + vdev_raidz_reconstruct(rm, + tgtidx, 3); + + if (cmp_data(opts, rm) != 0) { + err++; + LOG(D_DEBUG, + "\nREC D[%d %d %d]... " + "[FAIL]\n", x0, x1, x2); + } + } + } + } + } + return (err); +} + +static int +run_rec_check(raidz_test_opts_t *opts) +{ + char **impl_name; + unsigned fn, err = 0; + zio_t *zio_test; + raidz_map_t *rm_test; + + err = init_raidz_golden_map(opts, PARITY_PQR); + if (0 != err) + return (err); + + LOG(D_INFO, DBLSEP); + LOG(D_INFO, "Testing data reconstruction...\n"); + + for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; + impl_name++) { + + LOG(D_INFO, SEP); + LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); + + if (vdev_raidz_impl_set(*impl_name) != 0) { + LOG(D_INFO, "[SKIP]\n"); + continue; + } else + LOG(D_INFO, "[SUPPORTED]\n"); + + + /* create suitable raidz_map */ + rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR); + /* generate parity */ + vdev_raidz_generate_parity(rm_test); + + for (fn = 0; fn < RAIDZ_REC_NUM; fn++) { + + LOG(D_INFO, "\t\tTesting method [%s] ...", + raidz_rec_name[fn]); + + if (run_rec_check_impl(opts, rm_test, fn) != 0) { + LOG(D_INFO, "[FAIL]\n"); + err++; + + } else + LOG(D_INFO, "[PASS]\n"); + + } + /* tear down test raidz_map */ + fini_raidz_map(&zio_test, &rm_test); + } + + 
fini_raidz_map(&opts->zio_golden, &opts->rm_golden); + + return (err); +} + +static int +run_test(raidz_test_opts_t *opts) +{ + int err = 0; + + if (opts == NULL) + opts = &rto_opts; + + print_opts(opts, B_FALSE); + + err |= run_gen_check(opts); + err |= run_rec_check(opts); + + return (err); +} + +#define SWEEP_RUNNING 0 +#define SWEEP_FINISHED 1 +#define SWEEP_ERROR 2 +#define SWEEP_TIMEOUT 3 + +static int sweep_state = 0; +static raidz_test_opts_t failed_opts; + +static kmutex_t sem_mtx; +static kcondvar_t sem_cv; +static int max_free_slots; +static int free_slots; + +static void +sweep_thread(void *arg) +{ + int err = 0; + raidz_test_opts_t *opts = (raidz_test_opts_t *)arg; + VERIFY(opts != NULL); + + err = run_test(opts); + + if (rto_opts.rto_sanity) { + /* 25% chance that a sweep test fails */ + if (rand() < (RAND_MAX/4)) + err = 1; + } + + if (0 != err) { + mutex_enter(&sem_mtx); + memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t)); + sweep_state = SWEEP_ERROR; + mutex_exit(&sem_mtx); + } + + umem_free(opts, sizeof (raidz_test_opts_t)); + + /* signal the next thread */ + mutex_enter(&sem_mtx); + free_slots++; + cv_signal(&sem_cv); + mutex_exit(&sem_mtx); + + thread_exit(); +} + +static int +run_sweep(void) +{ + static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 }; + static const size_t ashift_v[] = { 9, 12, 14 }; + static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12), + 1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE }; + + (void) setvbuf(stdout, NULL, _IONBF, 0); + + ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) * + ARRAY_SIZE(dcols_v); + ulong_t tried_comb = 0; + hrtime_t time_diff, start_time = gethrtime(); + raidz_test_opts_t *opts; + int a, d, s; + + max_free_slots = free_slots = MAX(2, boot_ncpus); + + mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&sem_cv, NULL, CV_DEFAULT, NULL); + + for (s = 0; s < ARRAY_SIZE(size_v); s++) + for (a = 0; a < ARRAY_SIZE(ashift_v); a++) + for (d = 0; 
d < ARRAY_SIZE(dcols_v); d++) { + + if (size_v[s] < (1 << ashift_v[a])) { + total_comb--; + continue; + } + + if (++tried_comb % 20 == 0) + LOG(D_ALL, "%lu/%lu... ", tried_comb, total_comb); + + /* wait for signal to start new thread */ + mutex_enter(&sem_mtx); + while (cv_timedwait_sig(&sem_cv, &sem_mtx, + ddi_get_lbolt() + hz)) { + + /* check if should stop the test (timeout) */ + time_diff = (gethrtime() - start_time) / NANOSEC; + if (rto_opts.rto_sweep_timeout > 0 && + time_diff >= rto_opts.rto_sweep_timeout) { + sweep_state = SWEEP_TIMEOUT; + rto_opts.rto_should_stop = B_TRUE; + mutex_exit(&sem_mtx); + goto exit; + } + + /* check if should stop the test (error) */ + if (sweep_state != SWEEP_RUNNING) { + mutex_exit(&sem_mtx); + goto exit; + } + + /* exit loop if a slot is available */ + if (free_slots > 0) { + break; + } + } + + free_slots--; + mutex_exit(&sem_mtx); + + opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL); + opts->rto_ashift = ashift_v[a]; + opts->rto_dcols = dcols_v[d]; + opts->rto_offset = (1 << ashift_v[a]) * rand(); + opts->rto_dsize = size_v[s]; + opts->rto_v = 0; /* be quiet */ + + VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts, + 0, NULL, TS_RUN, defclsyspri), !=, NULL); + } + +exit: + LOG(D_ALL, "\nWaiting for test threads to finish...\n"); + mutex_enter(&sem_mtx); + VERIFY(free_slots <= max_free_slots); + while (free_slots < max_free_slots) { + (void) cv_wait(&sem_cv, &sem_mtx); + } + mutex_exit(&sem_mtx); + + if (sweep_state == SWEEP_ERROR) { + ERR("Sweep test failed! Failed option: \n"); + print_opts(&failed_opts, B_TRUE); + } else { + if (sweep_state == SWEEP_TIMEOUT) + LOG(D_ALL, "Test timeout (%lus). Stopping...\n", + (ulong_t)rto_opts.rto_sweep_timeout); + + LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n", + (ulong_t)tried_comb); + } + + mutex_destroy(&sem_mtx); + + return (sweep_state == SWEEP_ERROR ? 
SWEEP_ERROR : 0); +} + +int +main(int argc, char **argv) +{ + size_t i; + struct sigaction action; + int err = 0; + + /* init gdb string early */ + (void) sprintf(gdb, gdb_tmpl, getpid()); + + action.sa_handler = sig_handler; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + + if (sigaction(SIGSEGV, &action, NULL) < 0) { + ERR("raidz_test: cannot catch SIGSEGV: %s.\n", strerror(errno)); + exit(EXIT_FAILURE); + } + + (void) setvbuf(stdout, NULL, _IOLBF, 0); + + dprintf_setup(&argc, argv); + + process_options(argc, argv); + + kernel_init(SPA_MODE_READ); + + /* setup random data because rand() is not reentrant */ + rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + srand((unsigned)time(NULL) * getpid()); + for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++) + rand_data[i] = rand(); + + mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ); + + if (rto_opts.rto_benchmark) { + run_raidz_benchmark(); + } else if (rto_opts.rto_sweep) { + err = run_sweep(); + } else { + err = run_test(NULL); + } + + umem_free(rand_data, SPA_MAXBLOCKSIZE); + kernel_fini(); + + return (err); +} diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h new file mode 100644 index 000000000000..09c825ae43c7 --- /dev/null +++ b/cmd/raidz_test/raidz_test.h @@ -0,0 +1,116 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#ifndef RAIDZ_TEST_H
+#define	RAIDZ_TEST_H
+
+#include <sys/spa.h>
+
+static const char *raidz_impl_names[] = {
+	"original",
+	"scalar",
+	"sse2",
+	"ssse3",
+	"avx2",
+	"avx512f",
+	"avx512bw",
+	"aarch64_neon",
+	"aarch64_neonx2",
+	"powerpc_altivec",
+	NULL
+};
+
+typedef struct raidz_test_opts {
+	size_t rto_ashift;
+	size_t rto_offset;
+	size_t rto_dcols;
+	size_t rto_dsize;
+	size_t rto_v;
+	size_t rto_sweep;
+	size_t rto_sweep_timeout;
+	size_t rto_benchmark;
+	size_t rto_sanity;
+	size_t rto_gdb;
+
+	/* non-user options */
+	boolean_t rto_should_stop;
+
+	zio_t *zio_golden;
+	raidz_map_t *rm_golden;
+} raidz_test_opts_t;
+
+static const raidz_test_opts_t rto_opts_defaults = {
+	.rto_ashift = 9,
+	.rto_offset = 1ULL << 0,
+	.rto_dcols = 8,
+	.rto_dsize = 1<<19,
+	.rto_v = 0,
+	.rto_sweep = 0,
+	.rto_benchmark = 0,
+	.rto_sanity = 0,
+	.rto_gdb = 0,
+	.rto_should_stop = B_FALSE
+};
+
+extern raidz_test_opts_t rto_opts;
+
+static inline size_t ilog2(size_t a)
+{
+	return (a > 1 ? 1 + ilog2(a >> 1) : 0);
+}
+
+
+#define	D_ALL	0
+#define	D_INFO	1
+#define	D_DEBUG	2
+
+#define	LOG(lvl, a...) \
+{ \
+	if (rto_opts.rto_v >= lvl) \
+		(void) fprintf(stdout, a); \
+} \
+
+#define	LOG_OPT(lvl, opt, a...) \
+{ \
+	if (opt->rto_v >= lvl) \
+		(void) fprintf(stdout, a); \
+} \
+
+#define	ERR(a...)
(void) fprintf(stderr, a) + + +#define DBLSEP "================\n" +#define SEP "----------------\n" + + +#define raidz_alloc(size) abd_alloc(size, B_FALSE) +#define raidz_free(p, size) abd_free(p) + + +void init_zio_abd(zio_t *zio); + +void run_raidz_benchmark(void); + +#endif /* RAIDZ_TEST_H */ diff --git a/cmd/vdev_id/Makefile.am b/cmd/vdev_id/Makefile.am new file mode 100644 index 000000000000..fb815faad084 --- /dev/null +++ b/cmd/vdev_id/Makefile.am @@ -0,0 +1 @@ +dist_udev_SCRIPTS = vdev_id diff --git a/cmd/vdev_id/vdev_id b/cmd/vdev_id/vdev_id new file mode 100755 index 000000000000..8a75e638b67e --- /dev/null +++ b/cmd/vdev_id/vdev_id @@ -0,0 +1,605 @@ +#!/bin/sh +# +# vdev_id: udev helper to generate user-friendly names for JBOD disks +# +# This script parses the file /etc/zfs/vdev_id.conf to map a +# physical path in a storage topology to a channel name. The +# channel name is combined with a disk enclosure slot number to +# create an alias that reflects the physical location of the drive. +# This is particularly helpful when it comes to tasks like replacing +# failed drives. Slot numbers may also be re-mapped in case the +# default numbering is unsatisfactory. The drive aliases will be +# created as symbolic links in /dev/disk/by-vdev. +# +# The currently supported topologies are sas_direct and sas_switch. +# A multipath mode is supported in which dm-mpath devices are +# handled by examining the first-listed running component disk. In +# multipath mode the configuration file should contain a channel +# definition with the same name for each path to a given enclosure. +# +# The alias keyword provides a simple way to map already-existing +# device symlinks to more convenient names. It is suitable for +# small, static configurations or for sites that have some automated +# way to generate the mapping file. +# +# +# Some example configuration files are given below. + +# # +# # Example vdev_id.conf - sas_direct. 
+# # +# +# multipath no +# topology sas_direct +# phys_per_port 4 +# slot bay +# +# # PCI_ID HBA PORT CHANNEL NAME +# channel 85:00.0 1 A +# channel 85:00.0 0 B +# channel 86:00.0 1 C +# channel 86:00.0 0 D +# +# # Custom mapping for Channel A +# +# # Linux Mapped +# # Slot Slot Channel +# slot 1 7 A +# slot 2 10 A +# slot 3 3 A +# slot 4 6 A +# +# # Default mapping for B, C, and D +# slot 1 4 +# slot 2 2 +# slot 3 1 +# slot 4 3 + +# # +# # Example vdev_id.conf - sas_switch +# # +# +# topology sas_switch +# +# # SWITCH PORT CHANNEL NAME +# channel 1 A +# channel 2 B +# channel 3 C +# channel 4 D + +# # +# # Example vdev_id.conf - multipath +# # +# +# multipath yes +# +# # PCI_ID HBA PORT CHANNEL NAME +# channel 85:00.0 1 A +# channel 85:00.0 0 B +# channel 86:00.0 1 A +# channel 86:00.0 0 B + +# # +# # Example vdev_id.conf - alias +# # +# +# # by-vdev +# # name fully qualified or base name of device link +# alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca +# alias d2 wwn-0x5000c5002def789e + +PATH=/bin:/sbin:/usr/bin:/usr/sbin +CONFIG=/etc/zfs/vdev_id.conf +PHYS_PER_PORT= +DEV= +MULTIPATH= +TOPOLOGY= +BAY= + +usage() { + cat << EOF +Usage: vdev_id [-h] + vdev_id <-d device> [-c config_file] [-p phys_per_port] + [-g sas_direct|sas_switch|scsi] [-m] + + -c specify name of an alternative config file [default=$CONFIG] + -d specify basename of device (i.e. 
sda) + -e Create enclose device symlinks only (/dev/by-enclosure) + -g Storage network topology [default="$TOPOLOGY"] + -m Run in multipath mode + -p number of phy's per switch port [default=$PHYS_PER_PORT] + -h show this summary +EOF + exit 0 +} + +map_slot() { + LINUX_SLOT=$1 + CHANNEL=$2 + + MAPPED_SLOT=`awk "\\$1 == \"slot\" && \\$2 == ${LINUX_SLOT} && \ + \\$4 ~ /^${CHANNEL}$|^$/ { print \\$3; exit }" $CONFIG` + if [ -z "$MAPPED_SLOT" ] ; then + MAPPED_SLOT=$LINUX_SLOT + fi + printf "%d" ${MAPPED_SLOT} +} + +map_channel() { + MAPPED_CHAN= + PCI_ID=$1 + PORT=$2 + + case $TOPOLOGY in + "sas_switch") + MAPPED_CHAN=`awk "\\$1 == \"channel\" && \\$2 == ${PORT} \ + { print \\$3; exit }" $CONFIG` + ;; + "sas_direct"|"scsi") + MAPPED_CHAN=`awk "\\$1 == \"channel\" && \ + \\$2 == \"${PCI_ID}\" && \\$3 == ${PORT} \ + { print \\$4; exit }" $CONFIG` + ;; + esac + printf "%s" ${MAPPED_CHAN} +} + +sas_handler() { + if [ -z "$PHYS_PER_PORT" ] ; then + PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \ + {print \\$2; exit}" $CONFIG` + fi + PHYS_PER_PORT=${PHYS_PER_PORT:-4} + if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then + echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" + exit 1 + fi + + if [ -z "$MULTIPATH_MODE" ] ; then + MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \ + {print \\$2; exit}" $CONFIG` + fi + + # Use first running component device if we're handling a dm-mpath device + if [ "$MULTIPATH_MODE" = "yes" ] ; then + # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper + if [ -z "$DM_NAME" ] ; then + DM_NAME=`ls -l --full-time /dev/mapper | + awk "/\/$DEV$/{print \\$9}"` + fi + + # For raw disks udev exports DEVTYPE=partition when + # handling partitions, and the rules can be written to + # take advantage of this to append a -part suffix. For + # dm devices we get DEVTYPE=disk even for partitions so + # we have to append the -part suffix directly in the + # helper. 
+ if [ "$DEVTYPE" != "partition" ] ; then + PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + fi + + # Strip off partition information. + DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'` + if [ -z "$DM_NAME" ] ; then + return + fi + + # Get the raw scsi device name from multipath -ll. Strip off + # leading pipe symbols to make field numbering consistent. + DEV=`multipath -ll $DM_NAME | + awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` + if [ -z "$DEV" ] ; then + return + fi + fi + + if echo $DEV | grep -q ^/devices/ ; then + sys_path=$DEV + else + sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null` + fi + + # Use positional parameters as an ad-hoc array + set -- $(echo "$sys_path" | tr / ' ') + num_dirs=$# + scsi_host_dir="/sys" + + # Get path up to /sys/.../hostX + i=1 + while [ $i -le $num_dirs ] ; do + d=$(eval echo \${$i}) + scsi_host_dir="$scsi_host_dir/$d" + echo $d | grep -q -E '^host[0-9]+$' && break + i=$(($i + 1)) + done + + if [ $i = $num_dirs ] ; then + return + fi + + PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}') + + # In sas_switch mode, the directory four levels beneath + # /sys/.../hostX contains symlinks to phy devices that reveal + # the switch port number. In sas_direct mode, the phy links one + # directory down reveal the HBA port. + port_dir=$scsi_host_dir + case $TOPOLOGY in + "sas_switch") j=$(($i + 4)) ;; + "sas_direct") j=$(($i + 1)) ;; + esac + + i=$(($i + 1)) + while [ $i -le $j ] ; do + port_dir="$port_dir/$(eval echo \${$i})" + i=$(($i + 1)) + done + + PHY=`ls -d $port_dir/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}'` + if [ -z "$PHY" ] ; then + PHY=0 + fi + PORT=$(( $PHY / $PHYS_PER_PORT )) + + # Look in /sys/.../sas_device/end_device-X for the bay_identifier + # attribute. 
+ end_device_dir=$port_dir + while [ $i -lt $num_dirs ] ; do + d=$(eval echo \${$i}) + end_device_dir="$end_device_dir/$d" + if echo $d | grep -q '^end_device' ; then + end_device_dir="$end_device_dir/sas_device/$d" + break + fi + i=$(($i + 1)) + done + + SLOT= + case $BAY in + "bay") + SLOT=`cat $end_device_dir/bay_identifier 2>/dev/null` + ;; + "phy") + SLOT=`cat $end_device_dir/phy_identifier 2>/dev/null` + ;; + "port") + d=$(eval echo \${$i}) + SLOT=`echo $d | sed -e 's/^.*://'` + ;; + "id") + i=$(($i + 1)) + d=$(eval echo \${$i}) + SLOT=`echo $d | sed -e 's/^.*://'` + ;; + "lun") + i=$(($i + 2)) + d=$(eval echo \${$i}) + SLOT=`echo $d | sed -e 's/^.*://'` + ;; + "ses") + # look for this SAS path in all SCSI Enclosure Services + # (SES) enclosures + sas_address=`cat $end_device_dir/sas_address 2>/dev/null` + enclosures=`lsscsi -g | \ + sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p'` + for enclosure in $enclosures; do + set -- $(sg_ses -p aes $enclosure | \ + awk "/device slot number:/{slot=\$12} \ + /SAS address: $sas_address/\ + {print slot}") + SLOT=$1 + if [ -n "$SLOT" ] ; then + break + fi + done + ;; + esac + if [ -z "$SLOT" ] ; then + return + fi + + CHAN=`map_channel $PCI_ID $PORT` + SLOT=`map_slot $SLOT $CHAN` + if [ -z "$CHAN" ] ; then + return + fi + echo ${CHAN}${SLOT}${PART} +} + +scsi_handler() { + if [ -z "$FIRST_BAY_NUMBER" ] ; then + FIRST_BAY_NUMBER=`awk "\\$1 == \"first_bay_number\" \ + {print \\$2; exit}" $CONFIG` + fi + FIRST_BAY_NUMBER=${FIRST_BAY_NUMBER:-0} + + if [ -z "$PHYS_PER_PORT" ] ; then + PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \ + {print \\$2; exit}" $CONFIG` + fi + PHYS_PER_PORT=${PHYS_PER_PORT:-4} + if ! 
echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then + echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" + exit 1 + fi + + if [ -z "$MULTIPATH_MODE" ] ; then + MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \ + {print \\$2; exit}" $CONFIG` + fi + + # Use first running component device if we're handling a dm-mpath device + if [ "$MULTIPATH_MODE" = "yes" ] ; then + # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper + if [ -z "$DM_NAME" ] ; then + DM_NAME=`ls -l --full-time /dev/mapper | + awk "/\/$DEV$/{print \\$9}"` + fi + + # For raw disks udev exports DEVTYPE=partition when + # handling partitions, and the rules can be written to + # take advantage of this to append a -part suffix. For + # dm devices we get DEVTYPE=disk even for partitions so + # we have to append the -part suffix directly in the + # helper. + if [ "$DEVTYPE" != "partition" ] ; then + PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + fi + + # Strip off partition information. + DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'` + if [ -z "$DM_NAME" ] ; then + return + fi + + # Get the raw scsi device name from multipath -ll. Strip off + # leading pipe symbols to make field numbering consistent. 
+ DEV=`multipath -ll $DM_NAME | + awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` + if [ -z "$DEV" ] ; then + return + fi + fi + + if echo $DEV | grep -q ^/devices/ ; then + sys_path=$DEV + else + sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null` + fi + + # expect sys_path like this, for example: + # /devices/pci0000:00/0000:00:0b.0/0000:09:00.0/0000:0a:05.0/0000:0c:00.0/host3/target3:1:0/3:1:0:21/block/sdv + + # Use positional parameters as an ad-hoc array + set -- $(echo "$sys_path" | tr / ' ') + num_dirs=$# + scsi_host_dir="/sys" + + # Get path up to /sys/.../hostX + i=1 + while [ $i -le $num_dirs ] ; do + d=$(eval echo \${$i}) + scsi_host_dir="$scsi_host_dir/$d" + echo $d | grep -q -E '^host[0-9]+$' && break + i=$(($i + 1)) + done + + if [ $i = $num_dirs ] ; then + return + fi + + PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}') + + # In scsi mode, the directory two levels beneath + # /sys/.../hostX reveals the port and slot. + port_dir=$scsi_host_dir + j=$(($i + 2)) + + i=$(($i + 1)) + while [ $i -le $j ] ; do + port_dir="$port_dir/$(eval echo \${$i})" + i=$(($i + 1)) + done + + set -- $(echo $port_dir | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/') + PORT=$1 + SLOT=$(($2 + $FIRST_BAY_NUMBER)) + + if [ -z "$SLOT" ] ; then + return + fi + + CHAN=`map_channel $PCI_ID $PORT` + SLOT=`map_slot $SLOT $CHAN` + if [ -z "$CHAN" ] ; then + return + fi + echo ${CHAN}${SLOT}${PART} +} + +# Figure out the name for the enclosure symlink +enclosure_handler () { + # We get all the info we need from udev's DEVPATH variable: + # + # DEVPATH=/sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/subsystem/devices/0:0:0:0/scsi_generic/sg0 + + # Get the enclosure ID ("0:0:0:0") + ENC=$(basename $(readlink -m "/sys/$DEVPATH/../..")) + if [ ! -d /sys/class/enclosure/$ENC ] ; then + # Not an enclosure, bail out + return + fi + + # Get the long sysfs device path to our enclosure. 
Looks like: + # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0/ ... /enclosure/0:0:0:0 + + ENC_DEVICE=$(readlink /sys/class/enclosure/$ENC) + + # Grab the full path to the hosts port dir: + # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0 + PORT_DIR=$(echo $ENC_DEVICE | grep -Eo '.+host[0-9]+/port-[0-9]+:[0-9]+') + + # Get the port number + PORT_ID=$(echo $PORT_DIR | grep -Eo "[0-9]+$") + + # The PCI directory is two directories up from the port directory + # /sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0 + PCI_ID_LONG=$(basename $(readlink -m "/sys/$PORT_DIR/../..")) + + # Strip down the PCI address from 0000:05:00.0 to 05:00.0 + PCI_ID=$(echo "$PCI_ID_LONG" | sed -r 's/^[0-9]+://g') + + # Name our device according to vdev_id.conf (like "L0" or "U1"). + NAME=$(awk "/channel/{if (\$1 == \"channel\" && \$2 == \"$PCI_ID\" && \ + \$3 == \"$PORT_ID\") {print \$4int(count[\$4])}; count[\$4]++}" $CONFIG) + + echo "${NAME}" +} + +alias_handler () { + # Special handling is needed to correctly append a -part suffix + # to partitions of device mapper devices. The DEVTYPE attribute + # is normally set to "disk" instead of "partition" in this case, + # so the udev rules won't handle that for us as they do for + # "plain" block devices. + # + # For example, we may have the following links for a device and its + # partitions, + # + # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0 -> ../../dm-0 + # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p1 -> ../../dm-1 + # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p2 -> ../../dm-3 + # + # and the following alias in vdev_id.conf. + # + # alias A0 dm-name-isw_dibgbfcije_ARRAY0 + # + # The desired outcome is for the following links to be created + # without having explicitly defined aliases for the partitions. 
+ # + # /dev/disk/by-vdev/A0 -> ../../dm-0 + # /dev/disk/by-vdev/A0-part1 -> ../../dm-1 + # /dev/disk/by-vdev/A0-part2 -> ../../dm-3 + # + # Warning: The following grep pattern will misidentify whole-disk + # devices whose names end with 'p' followed by a string of + # digits as partitions, causing alias creation to fail. This + # ambiguity seems unavoidable, so devices using this facility + # must not use such names. + DM_PART= + if echo $DM_NAME | grep -q -E 'p[0-9][0-9]*$' ; then + if [ "$DEVTYPE" != "partition" ] ; then + DM_PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + fi + fi + + # DEVLINKS attribute must have been populated by already-run udev rules. + for link in $DEVLINKS ; do + # Remove partition information to match key of top-level device. + if [ -n "$DM_PART" ] ; then + link=`echo $link | sed 's/p[0-9][0-9]*$//'` + fi + # Check both the fully qualified and the base name of link. + for l in $link `basename $link` ; do + alias=`awk "\\$1 == \"alias\" && \\$3 == \"${l}\" \ + { print \\$2; exit }" $CONFIG` + if [ -n "$alias" ] ; then + echo ${alias}${DM_PART} + return + fi + done + done +} + +while getopts 'c:d:eg:mp:h' OPTION; do + case ${OPTION} in + c) + CONFIG=${OPTARG} + ;; + d) + DEV=${OPTARG} + ;; + e) + # When udev sees a scsi_generic device, it calls this script with -e to + # create the enclosure device symlinks only. We also need + # "enclosure_symlinks yes" set in vdev_id.config to actually create the + # symlink. + ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") print $2}' $CONFIG) + if [ "$ENCLOSURE_MODE" != "yes" ] ; then + exit 0 + fi + ;; + g) + TOPOLOGY=$OPTARG + ;; + p) + PHYS_PER_PORT=${OPTARG} + ;; + m) + MULTIPATH_MODE=yes + ;; + h) + usage + ;; + esac +done + +if [ ! 
-r $CONFIG ] ; then + exit 0 +fi + +if [ -z "$DEV" ] && [ -z "$ENCLOSURE_MODE" ] ; then + echo "Error: missing required option -d" + exit 1 +fi + +if [ -z "$TOPOLOGY" ] ; then + TOPOLOGY=`awk "\\$1 == \"topology\" {print \\$2; exit}" $CONFIG` +fi + +if [ -z "$BAY" ] ; then + BAY=`awk "\\$1 == \"slot\" {print \\$2; exit}" $CONFIG` +fi + +TOPOLOGY=${TOPOLOGY:-sas_direct} + +# Should we create /dev/by-enclosure symlinks? +if [ "$ENCLOSURE_MODE" = "yes" ] && [ "$TOPOLOGY" = "sas_direct" ] ; then + ID_ENCLOSURE=$(enclosure_handler) + if [ -z "$ID_ENCLOSURE" ] ; then + exit 0 + fi + + # Just create the symlinks to the enclosure devices and then exit. + ENCLOSURE_PREFIX=$(awk '/enclosure_symlinks_prefix/{print $2}' $CONFIG) + if [ -z "$ENCLOSURE_PREFIX" ] ; then + ENCLOSURE_PREFIX="enc" + fi + echo "ID_ENCLOSURE=$ID_ENCLOSURE" + echo "ID_ENCLOSURE_PATH=by-enclosure/$ENCLOSURE_PREFIX-$ID_ENCLOSURE" + exit 0 +fi + +# First check if an alias was defined for this device. +ID_VDEV=`alias_handler` + +if [ -z "$ID_VDEV" ] ; then + BAY=${BAY:-bay} + case $TOPOLOGY in + sas_direct|sas_switch) + ID_VDEV=`sas_handler` + ;; + scsi) + ID_VDEV=`scsi_handler` + ;; + *) + echo "Error: unknown topology $TOPOLOGY" + exit 1 + ;; + esac +fi + +if [ -n "$ID_VDEV" ] ; then + echo "ID_VDEV=${ID_VDEV}" + echo "ID_VDEV_PATH=disk/by-vdev/${ID_VDEV}" +fi diff --git a/cmd/zdb/.gitignore b/cmd/zdb/.gitignore new file mode 100644 index 000000000000..f64a3fc5a160 --- /dev/null +++ b/cmd/zdb/.gitignore @@ -0,0 +1 @@ +/zdb diff --git a/cmd/zdb/Makefile.am b/cmd/zdb/Makefile.am new file mode 100644 index 000000000000..b325cb060bd2 --- /dev/null +++ b/cmd/zdb/Makefile.am @@ -0,0 +1,16 @@ +include $(top_srcdir)/config/Rules.am + +# Unconditionally enable debugging for zdb +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG + +sbin_PROGRAMS = zdb + +zdb_SOURCES = \ + zdb.c \ + zdb_il.c \ + zdb.h + +zdb_LDADD = \ + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ 
+ $(abs_top_builddir)/lib/libnvpair/libnvpair.la diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c new file mode 100644 index 000000000000..e7211711a41c --- /dev/null +++ b/cmd/zdb/zdb.c @@ -0,0 +1,8606 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. + * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. + * Copyright (c) 2015, 2017, Intel Corporation. + * Copyright (c) 2020 Datto Inc. + * Copyright (c) 2020, The FreeBSD Foundation [1] + * + * [1] Portions of this software were developed by Allan Jude + * under sponsorship from the FreeBSD Foundation. 
+ */ + +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <ctype.h> +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/zap.h> +#include <sys/fs/zfs.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_sa.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab_impl.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_bookmark.h> +#include <sys/dbuf.h> +#include <sys/zil.h> +#include <sys/zil_impl.h> +#include <sys/stat.h> +#include <sys/resource.h> +#include <sys/dmu_send.h> +#include <sys/dmu_traverse.h> +#include <sys/zio_checksum.h> +#include <sys/zio_compress.h> +#include <sys/zfs_fuid.h> +#include <sys/arc.h> +#include <sys/arc_impl.h> +#include <sys/ddt.h> +#include <sys/zfeature.h> +#include <sys/abd.h> +#include <sys/blkptr.h> +#include <sys/dsl_crypt.h> +#include <sys/dsl_scan.h> +#include <sys/btree.h> +#include <zfs_comutil.h> +#include <sys/zstd/zstd.h> + +#include <libnvpair.h> +#include <libzutil.h> + +#include "zdb.h" + +#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ + zio_compress_table[(idx)].ci_name : "UNKNOWN") +#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ + zio_checksum_table[(idx)].ci_name : "UNKNOWN") +#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ + (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ + DMU_OT_ZAP_OTHER : \ + (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? 
\ + DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) + +static char * +zdb_ot_name(dmu_object_type_t type) +{ + if (type < DMU_OT_NUMTYPES) + return (dmu_ot[type].ot_name); + else if ((type & DMU_OT_NEWTYPE) && + ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS)) + return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name); + else + return ("UNKNOWN"); +} + +extern int reference_tracking_enable; +extern int zfs_recover; +extern unsigned long zfs_arc_meta_min, zfs_arc_meta_limit; +extern int zfs_vdev_async_read_max_active; +extern boolean_t spa_load_verify_dryrun; +extern int zfs_reconstruct_indirect_combinations_max; +extern int zfs_btree_verify_intensity; + +static const char cmdname[] = "zdb"; +uint8_t dump_opt[256]; + +typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); + +uint64_t *zopt_metaslab = NULL; +static unsigned zopt_metaslab_args = 0; + +typedef struct zopt_object_range { + uint64_t zor_obj_start; + uint64_t zor_obj_end; + uint64_t zor_flags; +} zopt_object_range_t; +zopt_object_range_t *zopt_object_ranges = NULL; +static unsigned zopt_object_args = 0; + +static int flagbits[256]; + +#define ZOR_FLAG_PLAIN_FILE 0x0001 +#define ZOR_FLAG_DIRECTORY 0x0002 +#define ZOR_FLAG_SPACE_MAP 0x0004 +#define ZOR_FLAG_ZAP 0x0008 +#define ZOR_FLAG_ALL_TYPES -1 +#define ZOR_SUPPORTED_FLAGS (ZOR_FLAG_PLAIN_FILE | \ + ZOR_FLAG_DIRECTORY | \ + ZOR_FLAG_SPACE_MAP | \ + ZOR_FLAG_ZAP) + +#define ZDB_FLAG_CHECKSUM 0x0001 +#define ZDB_FLAG_DECOMPRESS 0x0002 +#define ZDB_FLAG_BSWAP 0x0004 +#define ZDB_FLAG_GBH 0x0008 +#define ZDB_FLAG_INDIRECT 0x0010 +#define ZDB_FLAG_RAW 0x0020 +#define ZDB_FLAG_PRINT_BLKPTR 0x0040 +#define ZDB_FLAG_VERBOSE 0x0080 + +uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ +static int leaked_objects = 0; +static range_tree_t *mos_refd_objs; + +static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, + boolean_t); +static void mos_obj_refd(uint64_t); +static void mos_obj_refd_multiple(uint64_t); +static 
int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, + dmu_tx_t *tx); + +typedef struct sublivelist_verify { + /* all ALLOC'd blkptr_t in one sub-livelist */ + zfs_btree_t sv_all_allocs; + + /* all FREE'd blkptr_t in one sub-livelist */ + zfs_btree_t sv_all_frees; + + /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */ + zfs_btree_t sv_pair; + + /* ALLOC's without a matching FREE, accumulates across sub-livelists */ + zfs_btree_t sv_leftover; +} sublivelist_verify_t; + +static int +livelist_compare(const void *larg, const void *rarg) +{ + const blkptr_t *l = larg; + const blkptr_t *r = rarg; + + /* Sort them according to dva[0] */ + uint64_t l_dva0_vdev, r_dva0_vdev; + l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); + r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); + if (l_dva0_vdev < r_dva0_vdev) + return (-1); + else if (l_dva0_vdev > r_dva0_vdev) + return (+1); + + /* if vdevs are equal, sort by offsets. */ + uint64_t l_dva0_offset; + uint64_t r_dva0_offset; + l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); + r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); + if (l_dva0_offset < r_dva0_offset) { + return (-1); + } else if (l_dva0_offset > r_dva0_offset) { + return (+1); + } + + /* + * Since we're storing blkptrs without cancelling FREE/ALLOC pairs, + * it's possible the offsets are equal. In that case, sort by txg + */ + if (l->blk_birth < r->blk_birth) { + return (-1); + } else if (l->blk_birth > r->blk_birth) { + return (+1); + } + return (0); +} + +typedef struct sublivelist_verify_block { + dva_t svb_dva; + + /* + * We need this to check if the block marked as allocated + * in the livelist was freed (and potentially reallocated) + * in the metaslab spacemaps at a later TXG. 
+ */ + uint64_t svb_allocated_txg; +} sublivelist_verify_block_t; + +static void zdb_print_blkptr(const blkptr_t *bp, int flags); + +static int +sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, + dmu_tx_t *tx) +{ + ASSERT3P(tx, ==, NULL); + struct sublivelist_verify *sv = arg; + char blkbuf[BP_SPRINTF_LEN]; + zfs_btree_index_t where; + if (free) { + zfs_btree_add(&sv->sv_pair, bp); + /* Check if the FREE is a duplicate */ + if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) { + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, + free); + (void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf); + } else { + zfs_btree_add_idx(&sv->sv_all_frees, bp, &where); + } + } else { + /* Check if the ALLOC has been freed */ + if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) { + zfs_btree_remove_idx(&sv->sv_pair, &where); + } else { + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + if (DVA_IS_EMPTY(&bp->blk_dva[i])) + break; + sublivelist_verify_block_t svb = { + .svb_dva = bp->blk_dva[i], + .svb_allocated_txg = bp->blk_birth + }; + + if (zfs_btree_find(&sv->sv_leftover, &svb, + &where) == NULL) { + zfs_btree_add_idx(&sv->sv_leftover, + &svb, &where); + } + } + } + /* Check if the ALLOC is a duplicate */ + if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) { + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, + free); + (void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf); + } else { + zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where); + } + } + return (0); +} + +static int +sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) +{ + int err; + char blkbuf[BP_SPRINTF_LEN]; + struct sublivelist_verify *sv = args; + + zfs_btree_create(&sv->sv_all_allocs, livelist_compare, + sizeof (blkptr_t)); + + zfs_btree_create(&sv->sv_all_frees, livelist_compare, + sizeof (blkptr_t)); + + zfs_btree_create(&sv->sv_pair, livelist_compare, + sizeof (blkptr_t)); + + err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, + 
sv, NULL); + + zfs_btree_clear(&sv->sv_all_allocs); + zfs_btree_destroy(&sv->sv_all_allocs); + + zfs_btree_clear(&sv->sv_all_frees); + zfs_btree_destroy(&sv->sv_all_frees); + + blkptr_t *e; + zfs_btree_index_t *cookie = NULL; + while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) { + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE); + (void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf); + } + zfs_btree_destroy(&sv->sv_pair); + + return (err); +} + +static int +livelist_block_compare(const void *larg, const void *rarg) +{ + const sublivelist_verify_block_t *l = larg; + const sublivelist_verify_block_t *r = rarg; + + if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva)) + return (-1); + else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva)) + return (+1); + + if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva)) + return (-1); + else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva)) + return (+1); + + if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva)) + return (-1); + else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva)) + return (+1); + + return (0); +} + +/* + * Check for errors in a livelist while tracking all unfreed ALLOCs in the + * sublivelist_verify_t: sv->sv_leftover + */ +static void +livelist_verify(dsl_deadlist_t *dl, void *arg) +{ + sublivelist_verify_t *sv = arg; + dsl_deadlist_iterate(dl, sublivelist_verify_func, sv); +} + +/* + * Check for errors in the livelist entry and discard the intermediary + * data structures + */ +/* ARGSUSED */ +static int +sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle) +{ + sublivelist_verify_t sv; + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + int err = sublivelist_verify_func(&sv, dle); + zfs_btree_clear(&sv.sv_leftover); + zfs_btree_destroy(&sv.sv_leftover); + return (err); +} + +typedef struct metaslab_verify { + /* + * Tree containing all the leftover 
ALLOCs from the livelists + * that are part of this metaslab. + */ + zfs_btree_t mv_livelist_allocs; + + /* + * Metaslab information. + */ + uint64_t mv_vdid; + uint64_t mv_msid; + uint64_t mv_start; + uint64_t mv_end; + + /* + * What's currently allocated for this metaslab. + */ + range_tree_t *mv_allocated; +} metaslab_verify_t; + +typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); + +typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg, + void *arg); + +typedef struct unflushed_iter_cb_arg { + spa_t *uic_spa; + uint64_t uic_txg; + void *uic_arg; + zdb_log_sm_cb_t uic_cb; +} unflushed_iter_cb_arg_t; + +static int +iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) +{ + unflushed_iter_cb_arg_t *uic = arg; + return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); +} + +static void +iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + VERIFY0(space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + unflushed_iter_cb_arg_t uic = { + .uic_spa = spa, + .uic_txg = sls->sls_txg, + .uic_arg = arg, + .uic_cb = cb + }; + VERIFY0(space_map_iterate(sm, space_map_length(sm), + iterate_through_spacemap_logs_cb, &uic)); + space_map_close(sm); + } + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +static void +verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg, + uint64_t offset, uint64_t size) +{ + sublivelist_verify_block_t svb; + DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid); + DVA_SET_OFFSET(&svb.svb_dva, offset); + DVA_SET_ASIZE(&svb.svb_dva, size); + zfs_btree_index_t where; + uint64_t end_offset = offset + size; + + /* + * Look for an exact match for spacemap entry in the 
livelist entries. + * Then, look for other livelist entries that fall within the range + * of the spacemap entry as it may have been condensed + */ + sublivelist_verify_block_t *found = + zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where); + if (found == NULL) { + found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where); + } + for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid && + DVA_GET_OFFSET(&found->svb_dva) < end_offset; + found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { + if (found->svb_allocated_txg <= txg) { + (void) printf("ERROR: Livelist ALLOC [%llx:%llx] " + "from TXG %llx FREED at TXG %llx\n", + (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva), + (u_longlong_t)found->svb_allocated_txg, + (u_longlong_t)txg); + } + } +} + +static int +metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) +{ + metaslab_verify_t *mv = arg; + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint64_t txg = sme->sme_txg; + + if (sme->sme_type == SM_ALLOC) { + if (range_tree_contains(mv->mv_allocated, + offset, size)) { + (void) printf("ERROR: DOUBLE ALLOC: " + "%llu [%llx:%llx] " + "%llu:%llu LOG_SM\n", + (u_longlong_t)txg, (u_longlong_t)offset, + (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, + (u_longlong_t)mv->mv_msid); + } else { + range_tree_add(mv->mv_allocated, + offset, size); + } + } else { + if (!range_tree_contains(mv->mv_allocated, + offset, size)) { + (void) printf("ERROR: DOUBLE FREE: " + "%llu [%llx:%llx] " + "%llu:%llu LOG_SM\n", + (u_longlong_t)txg, (u_longlong_t)offset, + (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, + (u_longlong_t)mv->mv_msid); + } else { + range_tree_remove(mv->mv_allocated, + offset, size); + } + } + + if (sme->sme_type != SM_ALLOC) { + /* + * If something is freed in the spacemap, verify that + * it is not listed as allocated in the livelist. 
+ */ + verify_livelist_allocs(mv, txg, offset, size); + } + return (0); +} + +static int +spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + metaslab_verify_t *mv = arg; + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* skip indirect vdevs */ + if (!vdev_is_concrete(vd)) + return (0); + + if (vdev_id != mv->mv_vdid) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + if (ms->ms_id != mv->mv_msid) + return (0); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + + ASSERT3U(txg, ==, sme->sme_txg); + return (metaslab_spacemap_validation_cb(sme, mv)); +} + +static void +spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv) +{ + iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv); +} + +static void +spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv) +{ + if (sm == NULL) + return; + + VERIFY0(space_map_iterate(sm, space_map_length(sm), + metaslab_spacemap_validation_cb, mv)); +} + +static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg); + +/* + * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if + * they are part of that metaslab (mv_msid). 
+ */ +static void +mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv) +{ + zfs_btree_index_t where; + sublivelist_verify_block_t *svb; + ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0); + for (svb = zfs_btree_first(&sv->sv_leftover, &where); + svb != NULL; + svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) { + if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid) + continue; + + if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start && + (DVA_GET_OFFSET(&svb->svb_dva) + + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) { + (void) printf("ERROR: Found block that crosses " + "metaslab boundary: <%llu:%llx:%llx>\n", + (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), + (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); + continue; + } + + if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start) + continue; + + if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end) + continue; + + if ((DVA_GET_OFFSET(&svb->svb_dva) + + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) { + (void) printf("ERROR: Found block that crosses " + "metaslab boundary: <%llu:%llx:%llx>\n", + (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), + (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); + continue; + } + + zfs_btree_add(&mv->mv_livelist_allocs, svb); + } + + for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where); + svb != NULL; + svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { + zfs_btree_remove(&sv->sv_leftover, svb); + } +} + +/* + * [Livelist Check] + * Iterate through all the sublivelists and: + * - report leftover frees + * - report double ALLOCs/FREEs + * - record leftover ALLOCs together with their TXG [see Cross Check] + * + * [Spacemap Check] + * for each metaslab: + * - iterate over spacemap and then the metaslab's entries in the + * spacemap log, then report any double FREEs and ALLOCs (do not + * blow up). 
+ * + * [Cross Check] + * After finishing the Livelist Check phase and while being in the + * Spacemap Check phase, we find all the recorded leftover ALLOCs + * of the livelist check that are part of the metaslab that we are + * currently looking at in the Spacemap Check. We report any entries + * that are marked as ALLOCs in the livelists but have been actually + * freed (and potentially allocated again) after their TXG stamp in + * the spacemaps. Also report any ALLOCs from the livelists that + * belong to indirect vdevs (e.g. their vdev completed removal). + * + * Note that this will miss Log Spacemap entries that cancelled each other + * out before being flushed to the metaslab, so we are not guaranteed + * to match all erroneous ALLOCs. + */ +static void +livelist_metaslab_validate(spa_t *spa) +{ + (void) printf("Verifying deleted livelist entries\n"); + + sublivelist_verify_t sv; + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + iterate_deleted_livelists(spa, livelist_verify, &sv); + + (void) printf("Verifying metaslab entries\n"); + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + if (!vdev_is_concrete(vd)) + continue; + + for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) { + metaslab_t *m = vd->vdev_ms[mid]; + + (void) fprintf(stderr, + "\rverifying concrete vdev %llu, " + "metaslab %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)mid, + (longlong_t)vd->vdev_ms_count); + + uint64_t shift, start; + range_seg_type_t type = + metaslab_calculate_range_tree_type(vd, m, + &start, &shift); + metaslab_verify_t mv; + mv.mv_allocated = range_tree_create(NULL, + type, NULL, start, shift); + mv.mv_vdid = vd->vdev_id; + mv.mv_msid = m->ms_id; + mv.mv_start = m->ms_start; + mv.mv_end = m->ms_start + m->ms_size; + zfs_btree_create(&mv.mv_livelist_allocs, + livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + + 
mv_populate_livelist_allocs(&mv, &sv); + + spacemap_check_ms_sm(m->ms_sm, &mv); + spacemap_check_sm_log(spa, &mv); + + range_tree_vacate(mv.mv_allocated, NULL, NULL); + range_tree_destroy(mv.mv_allocated); + zfs_btree_clear(&mv.mv_livelist_allocs); + zfs_btree_destroy(&mv.mv_livelist_allocs); + } + } + (void) fprintf(stderr, "\n"); + + /* + * If there are any segments in the leftover tree after we walked + * through all the metaslabs in the concrete vdevs then this means + * that we have segments in the livelists that belong to indirect + * vdevs and are marked as allocated. + */ + if (zfs_btree_numnodes(&sv.sv_leftover) == 0) { + zfs_btree_destroy(&sv.sv_leftover); + return; + } + (void) printf("ERROR: Found livelist blocks marked as allocated " + "for indirect vdevs:\n"); + + zfs_btree_index_t *where = NULL; + sublivelist_verify_block_t *svb; + while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) != + NULL) { + int vdev_id = DVA_GET_VDEV(&svb->svb_dva); + ASSERT3U(vdev_id, <, rvd->vdev_children); + vdev_t *vd = rvd->vdev_child[vdev_id]; + ASSERT(!vdev_is_concrete(vd)); + (void) printf("<%d:%llx:%llx> TXG %llx\n", + vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva), + (u_longlong_t)svb->svb_allocated_txg); + } + (void) printf("\n"); + zfs_btree_destroy(&sv.sv_leftover); +} + +/* + * These libumem hooks provide a reasonable set of defaults for the allocator's + * debugging facilities. + */ +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} + +static void +usage(void) +{ + (void) fprintf(stderr, + "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] " + "[-I <inflight I/Os>]\n" + "\t\t[-o <var>=<value>]... 
[-t <txg>] [-U <cache>] [-x <dumpdir>]\n" + "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n" + "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>]\n" + "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]\n" + "\t%s [-v] <bookmark>\n" + "\t%s -C [-A] [-U <cache>]\n" + "\t%s -l [-Aqu] <device>\n" + "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] " + "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n" + "\t%s -O <dataset> <path>\n" + "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n" + "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n" + "\t%s -E [-A] word0:word1:...:word15\n" + "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] " + "<poolname>\n\n", + cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, + cmdname, cmdname, cmdname); + + (void) fprintf(stderr, " Dataset name must include at least one " + "separator character '/' or '@'\n"); + (void) fprintf(stderr, " If dataset name is specified, only that " + "dataset is dumped\n"); + (void) fprintf(stderr, " If object numbers or object number " + "ranges are specified, only those\n" + " objects or ranges are dumped.\n\n"); + (void) fprintf(stderr, + " Object ranges take the form <start>:<end>[:<flags>]\n" + " start Starting object number\n" + " end Ending object number, or -1 for no upper bound\n" + " flags Optional flags to select object types:\n" + " A All objects (this is the default)\n" + " d ZFS directories\n" + " f ZFS files \n" + " m SPA space maps\n" + " z ZAPs\n" + " - Negate effect of next flag\n\n"); + (void) fprintf(stderr, " Options to control amount of output:\n"); + (void) fprintf(stderr, " -b block statistics\n"); + (void) fprintf(stderr, " -c checksum all metadata (twice for " + "all data) blocks\n"); + (void) fprintf(stderr, " -C config (or cachefile if alone)\n"); + (void) fprintf(stderr, " -d dataset(s)\n"); + (void) fprintf(stderr, " -D dedup statistics\n"); + (void) fprintf(stderr, " -E decode and display block from an " + 
"embedded block pointer\n"); + (void) fprintf(stderr, " -h pool history\n"); + (void) fprintf(stderr, " -i intent logs\n"); + (void) fprintf(stderr, " -l read label contents\n"); + (void) fprintf(stderr, " -k examine the checkpointed state " + "of the pool\n"); + (void) fprintf(stderr, " -L disable leak tracking (do not " + "load spacemaps)\n"); + (void) fprintf(stderr, " -m metaslabs\n"); + (void) fprintf(stderr, " -M metaslab groups\n"); + (void) fprintf(stderr, " -O perform object lookups by path\n"); + (void) fprintf(stderr, " -R read and display block from a " + "device\n"); + (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); + (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); + (void) fprintf(stderr, " -v verbose (applies to all " + "others)\n"); + (void) fprintf(stderr, " -y perform livelist and metaslab " + "validation on any livelists being deleted\n\n"); + (void) fprintf(stderr, " Below options are intended for use " + "with other options:\n"); + (void) fprintf(stderr, " -A ignore assertions (-A), enable " + "panic recovery (-AA) or both (-AAA)\n"); + (void) fprintf(stderr, " -e pool is exported/destroyed/" + "has altroot/not in a cachefile\n"); + (void) fprintf(stderr, " -F attempt automatic rewind within " + "safe range of transaction groups\n"); + (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " + "exiting\n"); + (void) fprintf(stderr, " -I <number of inflight I/Os> -- " + "specify the maximum number of\n " + "checksumming I/Os [default is 200]\n"); + (void) fprintf(stderr, " -o <variable>=<value> set global " + "variable to an unsigned 32-bit integer\n"); + (void) fprintf(stderr, " -p <path> -- use one or more with " + "-e to specify path to vdev dir\n"); + (void) fprintf(stderr, " -P print numbers in parseable form\n"); + (void) fprintf(stderr, " -q don't print label contents\n"); + (void) fprintf(stderr, " -t <txg> -- highest txg to use when " + "searching for uberblocks\n"); + (void) fprintf(stderr, " -u 
uberblock\n"); + (void) fprintf(stderr, " -U <cachefile_path> -- use alternate " + "cachefile\n"); + (void) fprintf(stderr, " -V do verbatim import\n"); + (void) fprintf(stderr, " -x <dumpdir> -- " + "dump all read blocks into specified directory\n"); + (void) fprintf(stderr, " -X attempt extreme rewind (does not " + "work with dataset)\n"); + (void) fprintf(stderr, " -Y attempt all reconstruction " + "combinations for split blocks\n"); + (void) fprintf(stderr, " -Z show ZSTD headers \n"); + (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " + "to make only that option verbose\n"); + (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); + exit(1); +} + +static void +dump_debug_buffer(void) +{ + if (dump_opt['G']) { + (void) printf("\n"); + (void) fflush(stdout); + zfs_dbgmsg_print("zdb"); + } +} + +/* + * Called for usage errors that are discovered after a call to spa_open(), + * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. + */ + +static void +fatal(const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + (void) fprintf(stderr, "%s: ", cmdname); + (void) vfprintf(stderr, fmt, ap); + va_end(ap); + (void) fprintf(stderr, "\n"); + + dump_debug_buffer(); + + exit(1); +} + +/* ARGSUSED */ +static void +dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) +{ + nvlist_t *nv; + size_t nvsize = *(uint64_t *)data; + char *packed = umem_alloc(nvsize, UMEM_NOFAIL); + + VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); + + VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); + + umem_free(packed, nvsize); + + dump_nvlist(nv, 8); + + nvlist_free(nv); +} + +/* ARGSUSED */ +static void +dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) +{ + spa_history_phys_t *shp = data; + + if (shp == NULL) + return; + + (void) printf("\t\tpool_create_len = %llu\n", + (u_longlong_t)shp->sh_pool_create_len); + (void) printf("\t\tphys_max_off = %llu\n", + (u_longlong_t)shp->sh_phys_max_off); + (void) printf("\t\tbof = %llu\n", + (u_longlong_t)shp->sh_bof); + (void) printf("\t\teof = %llu\n", + (u_longlong_t)shp->sh_eof); + (void) printf("\t\trecords_lost = %llu\n", + (u_longlong_t)shp->sh_records_lost); +} + +static void +zdb_nicenum(uint64_t num, char *buf, size_t buflen) +{ + if (dump_opt['P']) + (void) snprintf(buf, buflen, "%llu", (longlong_t)num); + else + nicenum(num, buf, sizeof (buf)); +} + +static const char histo_stars[] = "****************************************"; +static const uint64_t histo_width = sizeof (histo_stars) - 1; + +static void +dump_histogram(const uint64_t *histo, int size, int offset) +{ + int i; + int minidx = size - 1; + int maxidx = 0; + uint64_t max = 0; + + for (i = 0; i < size; i++) { + if (histo[i] > max) + max = histo[i]; + if (histo[i] > 0 && i > maxidx) + maxidx = i; + if (histo[i] > 0 && i < minidx) + minidx = i; + } + + if (max < histo_width) + max = histo_width; + + for (i = minidx; i <= maxidx; i++) { + (void) printf("\t\t\t%3u: %6llu %s\n", + i 
+ offset, (u_longlong_t)histo[i], + &histo_stars[(max - histo[i]) * histo_width / max]); + } +} + +static void +dump_zap_stats(objset_t *os, uint64_t object) +{ + int error; + zap_stats_t zs; + + error = zap_get_stats(os, object, &zs); + if (error) + return; + + if (zs.zs_ptrtbl_len == 0) { + ASSERT(zs.zs_num_blocks == 1); + (void) printf("\tmicrozap: %llu bytes, %llu entries\n", + (u_longlong_t)zs.zs_blocksize, + (u_longlong_t)zs.zs_num_entries); + return; + } + + (void) printf("\tFat ZAP stats:\n"); + + (void) printf("\t\tPointer table:\n"); + (void) printf("\t\t\t%llu elements\n", + (u_longlong_t)zs.zs_ptrtbl_len); + (void) printf("\t\t\tzt_blk: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_zt_blk); + (void) printf("\t\t\tzt_numblks: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_zt_numblks); + (void) printf("\t\t\tzt_shift: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_zt_shift); + (void) printf("\t\t\tzt_blks_copied: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_blks_copied); + (void) printf("\t\t\tzt_nextblk: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_nextblk); + + (void) printf("\t\tZAP entries: %llu\n", + (u_longlong_t)zs.zs_num_entries); + (void) printf("\t\tLeaf blocks: %llu\n", + (u_longlong_t)zs.zs_num_leafs); + (void) printf("\t\tTotal blocks: %llu\n", + (u_longlong_t)zs.zs_num_blocks); + (void) printf("\t\tzap_block_type: 0x%llx\n", + (u_longlong_t)zs.zs_block_type); + (void) printf("\t\tzap_magic: 0x%llx\n", + (u_longlong_t)zs.zs_magic); + (void) printf("\t\tzap_salt: 0x%llx\n", + (u_longlong_t)zs.zs_salt); + + (void) printf("\t\tLeafs with 2^n pointers:\n"); + dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); + + (void) printf("\t\tBlocks with n*5 entries:\n"); + dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); + + (void) printf("\t\tBlocks n/10 full:\n"); + dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); + + (void) printf("\t\tEntries with n chunks:\n"); + dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); + + 
(void) printf("\t\tBuckets with n entries:\n"); + dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); +} + +/*ARGSUSED*/ +static void +dump_none(objset_t *os, uint64_t object, void *data, size_t size) +{ +} + +/*ARGSUSED*/ +static void +dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) +{ + (void) printf("\tUNKNOWN OBJECT TYPE\n"); +} + +/*ARGSUSED*/ +static void +dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) +{ +} + +/*ARGSUSED*/ +static void +dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) +{ + uint64_t *arr; + uint64_t oursize; + if (dump_opt['d'] < 6) + return; + + if (data == NULL) { + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(os, object, &doi)); + size = doi.doi_max_offset; + /* + * We cap the size at 1 mebibyte here to prevent + * allocation failures and nigh-infinite printing if the + * object is extremely large. + */ + oursize = MIN(size, 1 << 20); + arr = kmem_alloc(oursize, KM_SLEEP); + + int err = dmu_read(os, object, 0, oursize, arr, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + kmem_free(arr, oursize); + return; + } + } else { + /* + * Even though the allocation is already done in this code path, + * we still cap the size to prevent excessive printing. + */ + oursize = MIN(size, 1 << 20); + arr = data; + } + + if (size == 0) { + (void) printf("\t\t[]\n"); + return; + } + + (void) printf("\t\t[%0llx", (u_longlong_t)arr[0]); + for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) { + if (i % 4 != 0) + (void) printf(", %0llx", (u_longlong_t)arr[i]); + else + (void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]); + } + if (oursize != size) + (void) printf(", ... 
"); + (void) printf("]\n"); + + if (data == NULL) + kmem_free(arr, oursize); +} + +/*ARGSUSED*/ +static void +dump_zap(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + void *prop; + unsigned i; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = ", attr.za_name); + if (attr.za_num_integers == 0) { + (void) printf("\n"); + continue; + } + prop = umem_zalloc(attr.za_num_integers * + attr.za_integer_length, UMEM_NOFAIL); + (void) zap_lookup(os, object, attr.za_name, + attr.za_integer_length, attr.za_num_integers, prop); + if (attr.za_integer_length == 1) { + (void) printf("%s", (char *)prop); + } else { + for (i = 0; i < attr.za_num_integers; i++) { + switch (attr.za_integer_length) { + case 2: + (void) printf("%u ", + ((uint16_t *)prop)[i]); + break; + case 4: + (void) printf("%u ", + ((uint32_t *)prop)[i]); + break; + case 8: + (void) printf("%lld ", + (u_longlong_t)((int64_t *)prop)[i]); + break; + } + } + } + (void) printf("\n"); + umem_free(prop, attr.za_num_integers * attr.za_integer_length); + } + zap_cursor_fini(&zc); +} + +static void +dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) +{ + bpobj_phys_t *bpop = data; + uint64_t i; + char bytes[32], comp[32], uncomp[32]; + + /* make sure the output won't get truncated */ + CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); + + if (bpop == NULL) + return; + + zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes)); + zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp)); + zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp)); + + (void) printf("\t\tnum_blkptrs = %llu\n", + (u_longlong_t)bpop->bpo_num_blkptrs); + (void) printf("\t\tbytes = %s\n", bytes); + if (size >= BPOBJ_SIZE_V1) { + (void) printf("\t\tcomp = %s\n", comp); + 
(void) printf("\t\tuncomp = %s\n", uncomp); + } + if (size >= BPOBJ_SIZE_V2) { + (void) printf("\t\tsubobjs = %llu\n", + (u_longlong_t)bpop->bpo_subobjs); + (void) printf("\t\tnum_subobjs = %llu\n", + (u_longlong_t)bpop->bpo_num_subobjs); + } + if (size >= sizeof (*bpop)) { + (void) printf("\t\tnum_freed = %llu\n", + (u_longlong_t)bpop->bpo_num_freed); + } + + if (dump_opt['d'] < 5) + return; + + for (i = 0; i < bpop->bpo_num_blkptrs; i++) { + char blkbuf[BP_SPRINTF_LEN]; + blkptr_t bp; + + int err = dmu_read(os, object, + i * sizeof (bp), sizeof (bp), &bp, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + break; + } + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp, + BP_GET_FREE(&bp)); + (void) printf("\t%s\n", blkbuf); + } +} + +/* ARGSUSED */ +static void +dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size) +{ + dmu_object_info_t doi; + int64_t i; + + VERIFY0(dmu_object_info(os, object, &doi)); + uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP); + + int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + kmem_free(subobjs, doi.doi_max_offset); + return; + } + + int64_t last_nonzero = -1; + for (i = 0; i < doi.doi_max_offset / 8; i++) { + if (subobjs[i] != 0) + last_nonzero = i; + } + + for (i = 0; i <= last_nonzero; i++) { + (void) printf("\t%llu\n", (u_longlong_t)subobjs[i]); + } + kmem_free(subobjs, doi.doi_max_offset); +} + +/*ARGSUSED*/ +static void +dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) +{ + dump_zap_stats(os, object); + /* contents are printed elsewhere, properly decoded */ +} + +/*ARGSUSED*/ +static void +dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + 
zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = ", attr.za_name); + if (attr.za_num_integers == 0) { + (void) printf("\n"); + continue; + } + (void) printf(" %llx : [%d:%d:%d]\n", + (u_longlong_t)attr.za_first_integer, + (int)ATTR_LENGTH(attr.za_first_integer), + (int)ATTR_BSWAP(attr.za_first_integer), + (int)ATTR_NUM(attr.za_first_integer)); + } + zap_cursor_fini(&zc); +} + +/*ARGSUSED*/ +static void +dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + uint16_t *layout_attrs; + unsigned i; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = [", attr.za_name); + if (attr.za_num_integers == 0) { + (void) printf("\n"); + continue; + } + + VERIFY(attr.za_integer_length == 2); + layout_attrs = umem_zalloc(attr.za_num_integers * + attr.za_integer_length, UMEM_NOFAIL); + + VERIFY(zap_lookup(os, object, attr.za_name, + attr.za_integer_length, + attr.za_num_integers, layout_attrs) == 0); + + for (i = 0; i != attr.za_num_integers; i++) + (void) printf(" %d ", (int)layout_attrs[i]); + (void) printf("]\n"); + umem_free(layout_attrs, + attr.za_num_integers * attr.za_integer_length); + } + zap_cursor_fini(&zc); +} + +/*ARGSUSED*/ +static void +dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + const char *typenames[] = { + /* 0 */ "not specified", + /* 1 */ "FIFO", + /* 2 */ "Character Device", + /* 3 */ "3 (invalid)", + /* 4 */ "Directory", + /* 5 */ "5 (invalid)", + /* 6 */ "Block Device", + /* 7 */ "7 (invalid)", + /* 8 */ "Regular File", + /* 9 */ "9 (invalid)", + /* 10 */ "Symbolic Link", + /* 11 */ "11 (invalid)", + /* 12 */ "Socket", + /* 13 */ "Door", + /* 14 */ "Event Port", + /* 15 */ "15 (invalid)", + }; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, 
object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = %lld (type: %s)\n", + attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer), + typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]); + } + zap_cursor_fini(&zc); +} + +static int +get_dtl_refcount(vdev_t *vd) +{ + int refcount = 0; + + if (vd->vdev_ops->vdev_op_leaf) { + space_map_t *sm = vd->vdev_dtl_sm; + + if (sm != NULL && + sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) + return (1); + return (0); + } + + for (unsigned c = 0; c < vd->vdev_children; c++) + refcount += get_dtl_refcount(vd->vdev_child[c]); + return (refcount); +} + +static int +get_metaslab_refcount(vdev_t *vd) +{ + int refcount = 0; + + if (vd->vdev_top == vd) { + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + space_map_t *sm = vd->vdev_ms[m]->ms_sm; + + if (sm != NULL && + sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) + refcount++; + } + } + for (unsigned c = 0; c < vd->vdev_children; c++) + refcount += get_metaslab_refcount(vd->vdev_child[c]); + + return (refcount); +} + +static int +get_obsolete_refcount(vdev_t *vd) +{ + uint64_t obsolete_sm_object; + int refcount = 0; + + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + if (vd->vdev_top == vd && obsolete_sm_object != 0) { + dmu_object_info_t doi; + VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset, + obsolete_sm_object, &doi)); + if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { + refcount++; + } + } else { + ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); + ASSERT3U(obsolete_sm_object, ==, 0); + } + for (unsigned c = 0; c < vd->vdev_children; c++) { + refcount += get_obsolete_refcount(vd->vdev_child[c]); + } + + return (refcount); +} + +static int +get_prev_obsolete_spacemap_refcount(spa_t *spa) +{ + uint64_t prev_obj = + spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object; + if (prev_obj != 0) { + dmu_object_info_t doi; + VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi)); + if 
(doi.doi_bonus_size == sizeof (space_map_phys_t)) { + return (1); + } + } + return (0); +} + +static int +get_checkpoint_refcount(vdev_t *vd) +{ + int refcount = 0; + + if (vd->vdev_top == vd && vd->vdev_top_zap != 0 && + zap_contains(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0) + refcount++; + + for (uint64_t c = 0; c < vd->vdev_children; c++) + refcount += get_checkpoint_refcount(vd->vdev_child[c]); + + return (refcount); +} + +static int +get_log_spacemap_refcount(spa_t *spa) +{ + return (avl_numnodes(&spa->spa_sm_logs_by_txg)); +} + +static int +verify_spacemap_refcounts(spa_t *spa) +{ + uint64_t expected_refcount = 0; + uint64_t actual_refcount; + + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], + &expected_refcount); + actual_refcount = get_dtl_refcount(spa->spa_root_vdev); + actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); + actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); + actual_refcount += get_prev_obsolete_spacemap_refcount(spa); + actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); + actual_refcount += get_log_spacemap_refcount(spa); + + if (expected_refcount != actual_refcount) { + (void) printf("space map refcount mismatch: expected %lld != " + "actual %lld\n", + (longlong_t)expected_refcount, + (longlong_t)actual_refcount); + return (2); + } + return (0); +} + +static void +dump_spacemap(objset_t *os, space_map_t *sm) +{ + const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", + "INVALID", "INVALID", "INVALID", "INVALID" }; + + if (sm == NULL) + return; + + (void) printf("space map object %llu:\n", + (longlong_t)sm->sm_object); + (void) printf(" smp_length = 0x%llx\n", + (longlong_t)sm->sm_phys->smp_length); + (void) printf(" smp_alloc = 0x%llx\n", + (longlong_t)sm->sm_phys->smp_alloc); + + if (dump_opt['d'] < 6 && dump_opt['m'] < 4) + return; + + /* + * Print out the freelist entries in both encoded and decoded form. 
+ */ + uint8_t mapshift = sm->sm_shift; + int64_t alloc = 0; + uint64_t word, entry_id = 0; + for (uint64_t offset = 0; offset < space_map_length(sm); + offset += sizeof (word)) { + + VERIFY0(dmu_read(os, space_map_object(sm), offset, + sizeof (word), &word, DMU_READ_PREFETCH)); + + if (sm_entry_is_debug(word)) { + uint64_t de_txg = SM_DEBUG_TXG_DECODE(word); + uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word); + if (de_txg == 0) { + (void) printf( + "\t [%6llu] PADDING\n", + (u_longlong_t)entry_id); + } else { + (void) printf( + "\t [%6llu] %s: txg %llu pass %llu\n", + (u_longlong_t)entry_id, + ddata[SM_DEBUG_ACTION_DECODE(word)], + (u_longlong_t)de_txg, + (u_longlong_t)de_sync_pass); + } + entry_id++; + continue; + } + + uint8_t words; + char entry_type; + uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID; + + if (sm_entry_is_single_word(word)) { + entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? + 'A' : 'F'; + entry_off = (SM_OFFSET_DECODE(word) << mapshift) + + sm->sm_start; + entry_run = SM_RUN_DECODE(word) << mapshift; + words = 1; + } else { + /* it is a two-word entry so we read another word */ + ASSERT(sm_entry_is_double_word(word)); + + uint64_t extra_word; + offset += sizeof (extra_word); + VERIFY0(dmu_read(os, space_map_object(sm), offset, + sizeof (extra_word), &extra_word, + DMU_READ_PREFETCH)); + + ASSERT3U(offset, <=, space_map_length(sm)); + + entry_run = SM2_RUN_DECODE(word) << mapshift; + entry_vdev = SM2_VDEV_DECODE(word); + entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ? 
+ 'A' : 'F'; + entry_off = (SM2_OFFSET_DECODE(extra_word) << + mapshift) + sm->sm_start; + words = 2; + } + + (void) printf("\t [%6llu] %c range:" + " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", + (u_longlong_t)entry_id, + entry_type, (u_longlong_t)entry_off, + (u_longlong_t)(entry_off + entry_run), + (u_longlong_t)entry_run, + (u_longlong_t)entry_vdev, words); + + if (entry_type == 'A') + alloc += entry_run; + else + alloc -= entry_run; + entry_id++; + } + if (alloc != space_map_allocated(sm)) { + (void) printf("space_map_object alloc (%lld) INCONSISTENT " + "with space map summary (%lld)\n", + (longlong_t)space_map_allocated(sm), (longlong_t)alloc); + } +} + +static void +dump_metaslab_stats(metaslab_t *msp) +{ + char maxbuf[32]; + range_tree_t *rt = msp->ms_allocatable; + zfs_btree_t *t = &msp->ms_allocatable_by_size; + int free_pct = range_tree_space(rt) * 100 / msp->ms_size; + + /* max sure nicenum has enough space */ + CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); + + zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); + + (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", + "segments", zfs_btree_numnodes(t), "maxsize", maxbuf, + "freepct", free_pct); + (void) printf("\tIn-memory histogram:\n"); + dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); +} + +static void +dump_metaslab(metaslab_t *msp) +{ + vdev_t *vd = msp->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + space_map_t *sm = msp->ms_sm; + char freebuf[32]; + + zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf, + sizeof (freebuf)); + + (void) printf( + "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", + (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, + (u_longlong_t)space_map_object(sm), freebuf); + + if (dump_opt['m'] > 2 && !dump_opt['L']) { + mutex_enter(&msp->ms_lock); + VERIFY0(metaslab_load(msp)); + range_tree_stat_verify(msp->ms_allocatable); + dump_metaslab_stats(msp); + metaslab_unload(msp); + mutex_exit(&msp->ms_lock); + } 
+ + if (dump_opt['m'] > 1 && sm != NULL && + spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { + /* + * The space map histogram represents free space in chunks + * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). + */ + (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", + (u_longlong_t)msp->ms_fragmentation); + dump_histogram(sm->sm_phys->smp_histogram, + SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); + } + + ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + dump_spacemap(spa->spa_meta_objset, msp->ms_sm); + + if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n", + (u_longlong_t)metaslab_unflushed_txg(msp)); + } +} + +static void +print_vdev_metaslab_header(vdev_t *vd) +{ + vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; + const char *bias_str = ""; + if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) { + bias_str = VDEV_ALLOC_BIAS_LOG; + } else if (alloc_bias == VDEV_BIAS_SPECIAL) { + bias_str = VDEV_ALLOC_BIAS_SPECIAL; + } else if (alloc_bias == VDEV_BIAS_DEDUP) { + bias_str = VDEV_ALLOC_BIAS_DEDUP; + } + + uint64_t ms_flush_data_obj = 0; + if (vd->vdev_top_zap != 0) { + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (uint64_t), 1, &ms_flush_data_obj); + if (error != ENOENT) { + ASSERT0(error); + } + } + + (void) printf("\tvdev %10llu %s", + (u_longlong_t)vd->vdev_id, bias_str); + + if (ms_flush_data_obj != 0) { + (void) printf(" ms_unflushed_phys object %llu", + (u_longlong_t)ms_flush_data_obj); + } + + (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n", + "metaslabs", (u_longlong_t)vd->vdev_ms_count, + "offset", "spacemap", "free"); + (void) printf("\t%15s %19s %15s %12s\n", + "---------------", "-------------------", + "---------------", "------------"); +} + +static void +dump_metaslab_groups(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + metaslab_class_t *mc = spa_normal_class(spa); + 
uint64_t fragmentation; + + metaslab_class_histogram_verify(mc); + + for (unsigned c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (mg == NULL || mg->mg_class != mc) + continue; + + metaslab_group_histogram_verify(mg); + mg->mg_fragmentation = metaslab_group_fragmentation(mg); + + (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" + "fragmentation", + (u_longlong_t)tvd->vdev_id, + (u_longlong_t)tvd->vdev_ms_count); + if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { + (void) printf("%3s\n", "-"); + } else { + (void) printf("%3llu%%\n", + (u_longlong_t)mg->mg_fragmentation); + } + dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); + } + + (void) printf("\tpool %s\tfragmentation", spa_name(spa)); + fragmentation = metaslab_class_fragmentation(mc); + if (fragmentation == ZFS_FRAG_INVALID) + (void) printf("\t%3s\n", "-"); + else + (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); + dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); +} + +static void +print_vdev_indirect(vdev_t *vd) +{ + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + vdev_indirect_births_t *vib = vd->vdev_indirect_births; + + if (vim == NULL) { + ASSERT3P(vib, ==, NULL); + return; + } + + ASSERT3U(vdev_indirect_mapping_object(vim), ==, + vic->vic_mapping_object); + ASSERT3U(vdev_indirect_births_object(vib), ==, + vic->vic_births_object); + + (void) printf("indirect births obj %llu:\n", + (longlong_t)vic->vic_births_object); + (void) printf(" vib_count = %llu\n", + (longlong_t)vdev_indirect_births_count(vib)); + for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { + vdev_indirect_birth_entry_phys_t *cur_vibe = + &vib->vib_entries[i]; + (void) printf("\toffset %llx -> txg %llu\n", + (longlong_t)cur_vibe->vibe_offset, + (longlong_t)cur_vibe->vibe_phys_birth_txg); + } + (void) printf("\n"); + + (void) printf("indirect 
mapping obj %llu:\n", + (longlong_t)vic->vic_mapping_object); + (void) printf(" vim_max_offset = 0x%llx\n", + (longlong_t)vdev_indirect_mapping_max_offset(vim)); + (void) printf(" vim_bytes_mapped = 0x%llx\n", + (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); + (void) printf(" vim_count = %llu\n", + (longlong_t)vdev_indirect_mapping_num_entries(vim)); + + if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) + return; + + uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); + + for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { + vdev_indirect_mapping_entry_phys_t *vimep = + &vim->vim_entries[i]; + (void) printf("\t<%llx:%llx:%llx> -> " + "<%llx:%llx:%llx> (%x obsolete)\n", + (longlong_t)vd->vdev_id, + (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), + (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), + (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), + (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), + (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), + counts[i]); + } + (void) printf("\n"); + + uint64_t obsolete_sm_object; + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + if (obsolete_sm_object != 0) { + objset_t *mos = vd->vdev_spa->spa_meta_objset; + (void) printf("obsolete space map object %llu:\n", + (u_longlong_t)obsolete_sm_object); + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, + obsolete_sm_object); + dump_spacemap(mos, vd->vdev_obsolete_sm); + (void) printf("\n"); + } +} + +static void +dump_metaslabs(spa_t *spa) +{ + vdev_t *vd, *rvd = spa->spa_root_vdev; + uint64_t m, c = 0, children = rvd->vdev_children; + + (void) printf("\nMetaslabs:\n"); + + if (!dump_opt['d'] && zopt_metaslab_args > 0) { + c = zopt_metaslab[0]; + + if (c >= children) + (void) fatal("bad vdev id: %llu", (u_longlong_t)c); + + if (zopt_metaslab_args > 1) { + vd = rvd->vdev_child[c]; + print_vdev_metaslab_header(vd); + + for (m = 1; m < zopt_metaslab_args; m++) { + if (zopt_metaslab[m] < vd->vdev_ms_count) 
+ dump_metaslab( + vd->vdev_ms[zopt_metaslab[m]]); + else + (void) fprintf(stderr, "bad metaslab " + "number %llu\n", + (u_longlong_t)zopt_metaslab[m]); + } + (void) printf("\n"); + return; + } + children = c + 1; + } + for (; c < children; c++) { + vd = rvd->vdev_child[c]; + print_vdev_metaslab_header(vd); + + print_vdev_indirect(vd); + + for (m = 0; m < vd->vdev_ms_count; m++) + dump_metaslab(vd->vdev_ms[m]); + (void) printf("\n"); + } +} + +static void +dump_log_spacemaps(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + (void) printf("\nLog Space Maps in Pool:\n"); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + VERIFY0(space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + (void) printf("Log Spacemap object %llu txg %llu\n", + (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg); + dump_spacemap(spa->spa_meta_objset, sm); + space_map_close(sm); + } + (void) printf("\n"); +} + +static void +dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) +{ + const ddt_phys_t *ddp = dde->dde_phys; + const ddt_key_t *ddk = &dde->dde_key; + const char *types[4] = { "ditto", "single", "double", "triple" }; + char blkbuf[BP_SPRINTF_LEN]; + blkptr_t blk; + int p; + + for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); + (void) printf("index %llx refcnt %llu %s %s\n", + (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, + types[p], blkbuf); + } +} + +static void +dump_dedup_ratio(const ddt_stat_t *dds) +{ + double rL, rP, rD, D, dedup, compress, copies; + + if (dds->dds_blocks == 0) + return; + + rL = (double)dds->dds_ref_lsize; + rP = (double)dds->dds_ref_psize; + rD = (double)dds->dds_ref_dsize; + D = (double)dds->dds_dsize; + + dedup = 
rD / D; + compress = rL / rP; + copies = rD / rP; + + (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " + "dedup * compress / copies = %.2f\n\n", + dedup, compress, copies, dedup * compress / copies); +} + +static void +dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + char name[DDT_NAMELEN]; + ddt_entry_t dde; + uint64_t walk = 0; + dmu_object_info_t doi; + uint64_t count, dspace, mspace; + int error; + + error = ddt_object_info(ddt, type, class, &doi); + + if (error == ENOENT) + return; + ASSERT(error == 0); + + error = ddt_object_count(ddt, type, class, &count); + ASSERT(error == 0); + if (count == 0) + return; + + dspace = doi.doi_physical_blocks_512 << 9; + mspace = doi.doi_fill_count * doi.doi_data_block_size; + + ddt_object_name(ddt, type, class, name); + + (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", + name, + (u_longlong_t)count, + (u_longlong_t)(dspace / count), + (u_longlong_t)(mspace / count)); + + if (dump_opt['D'] < 3) + return; + + zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); + + if (dump_opt['D'] < 4) + return; + + if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) + return; + + (void) printf("%s contents:\n\n", name); + + while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) + dump_dde(ddt, &dde, walk); + + ASSERT3U(error, ==, ENOENT); + + (void) printf("\n"); +} + +static void +dump_all_ddts(spa_t *spa) +{ + ddt_histogram_t ddh_total; + ddt_stat_t dds_total; + + bzero(&ddh_total, sizeof (ddh_total)); + bzero(&dds_total, sizeof (dds_total)); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + dump_ddt(ddt, type, class); + } + } + } + + ddt_get_dedup_stats(spa, &dds_total); + + if (dds_total.dds_blocks == 0) { + (void) printf("All DDTs are empty\n"); + return; + } + + (void) printf("\n"); + + 
if (dump_opt['D'] > 1) { + (void) printf("DDT histogram (aggregated over all DDTs):\n"); + ddt_get_dedup_histogram(spa, &ddh_total); + zpool_dump_ddt(&dds_total, &ddh_total); + } + + dump_dedup_ratio(&dds_total); +} + +static void +dump_dtl_seg(void *arg, uint64_t start, uint64_t size) +{ + char *prefix = arg; + + (void) printf("%s [%llu,%llu) length %llu\n", + prefix, + (u_longlong_t)start, + (u_longlong_t)(start + size), + (u_longlong_t)(size)); +} + +static void +dump_dtl(vdev_t *vd, int indent) +{ + spa_t *spa = vd->vdev_spa; + boolean_t required; + const char *name[DTL_TYPES] = { "missing", "partial", "scrub", + "outage" }; + char prefix[256]; + + spa_vdev_state_enter(spa, SCL_NONE); + required = vdev_dtl_required(vd); + (void) spa_vdev_state_exit(spa, NULL, 0); + + if (indent == 0) + (void) printf("\nDirty time logs:\n\n"); + + (void) printf("\t%*s%s [%s]\n", indent, "", + vd->vdev_path ? vd->vdev_path : + vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), + required ? "DTL-required" : "DTL-expendable"); + + for (int t = 0; t < DTL_TYPES; t++) { + range_tree_t *rt = vd->vdev_dtl[t]; + if (range_tree_space(rt) == 0) + continue; + (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", + indent + 2, "", name[t]); + range_tree_walk(rt, dump_dtl_seg, prefix); + if (dump_opt['d'] > 5 && vd->vdev_children == 0) + dump_spacemap(spa->spa_meta_objset, + vd->vdev_dtl_sm); + } + + for (unsigned c = 0; c < vd->vdev_children; c++) + dump_dtl(vd->vdev_child[c], indent + 4); +} + +static void +dump_history(spa_t *spa) +{ + nvlist_t **events = NULL; + char *buf; + uint64_t resid, len, off = 0; + uint_t num = 0; + int error; + time_t tsec; + struct tm t; + char tbuf[30]; + char internalstr[MAXPATHLEN]; + + if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) { + (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n", + __func__); + return; + } + + do { + len = SPA_OLD_MAXBLOCKSIZE; + + if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { + (void) 
fprintf(stderr, "Unable to read history: " + "error %d\n", error); + free(buf); + return; + } + + if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) + break; + + off -= resid; + } while (len != 0); + + (void) printf("\nHistory:\n"); + for (unsigned i = 0; i < num; i++) { + uint64_t time, txg, ievent; + char *cmd, *intstr; + boolean_t printed = B_FALSE; + + if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, + &time) != 0) + goto next; + if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, + &cmd) != 0) { + if (nvlist_lookup_uint64(events[i], + ZPOOL_HIST_INT_EVENT, &ievent) != 0) + goto next; + verify(nvlist_lookup_uint64(events[i], + ZPOOL_HIST_TXG, &txg) == 0); + verify(nvlist_lookup_string(events[i], + ZPOOL_HIST_INT_STR, &intstr) == 0); + if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) + goto next; + + (void) snprintf(internalstr, + sizeof (internalstr), + "[internal %s txg:%lld] %s", + zfs_history_event_names[ievent], + (longlong_t)txg, intstr); + cmd = internalstr; + } + tsec = time; + (void) localtime_r(&tsec, &t); + (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); + (void) printf("%s %s\n", tbuf, cmd); + printed = B_TRUE; + +next: + if (dump_opt['h'] > 1) { + if (!printed) + (void) printf("unrecognized record:\n"); + dump_nvlist(events[i], 2); + } + } + free(buf); +} + +/*ARGSUSED*/ +static void +dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) +{ +} + +static uint64_t +blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, + const zbookmark_phys_t *zb) +{ + if (dnp == NULL) { + ASSERT(zb->zb_level < 0); + if (zb->zb_object == 0) + return (zb->zb_blkid); + return (zb->zb_blkid * BP_GET_LSIZE(bp)); + } + + ASSERT(zb->zb_level >= 0); + + return ((zb->zb_blkid << + (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * + dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); +} + +static void +snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, + const blkptr_t *bp) +{ + abd_t *pabd; + void *buf; + zio_t *zio; + 
zfs_zstdhdr_t zstd_hdr; + int error; + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD) + return; + + if (BP_IS_HOLE(bp)) + return; + + if (BP_IS_EMBEDDED(bp)) { + buf = malloc(SPA_MAXBLOCKSIZE); + if (buf == NULL) { + (void) fprintf(stderr, "out of memory\n"); + exit(1); + } + decode_embedded_bp_compressed(bp, buf); + memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); + free(buf); + zstd_hdr.c_len = BE_32(zstd_hdr.c_len); + zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + " ZSTD:size=%u:version=%u:level=%u:EMBEDDED", + zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level); + return; + } + + pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); + zio = zio_root(spa, NULL, NULL, 0); + + /* Decrypt but don't decompress so we can read the compression header */ + zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS, + NULL)); + error = zio_wait(zio); + if (error) { + (void) fprintf(stderr, "read failed: %d\n", error); + return; + } + buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp)); + memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); + zstd_hdr.c_len = BE_32(zstd_hdr.c_len); + zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); + + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + " ZSTD:size=%u:version=%u:level=%u:NORMAL", + zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level); + + abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp)); +} + +static void +snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, + boolean_t bp_freed) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = dump_opt['d'] > 5 ? 
BP_GET_NDVAS(bp) : 1; + int i; + + if (dump_opt['b'] >= 6) { + snprintf_blkptr(blkbuf, buflen, bp); + if (bp_freed) { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " %s", "FREE"); + } + return; + } + + if (BP_IS_EMBEDDED(bp)) { + (void) sprintf(blkbuf, + "EMBEDDED et=%u %llxL/%llxP B=%llu", + (int)BPE_GET_ETYPE(bp), + (u_longlong_t)BPE_GET_LSIZE(bp), + (u_longlong_t)BPE_GET_PSIZE(bp), + (u_longlong_t)bp->blk_birth); + return; + } + + blkbuf[0] = '\0'; + + for (i = 0; i < ndvas; i++) + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), "%llu:%llx:%llx ", + (u_longlong_t)DVA_GET_VDEV(&dva[i]), + (u_longlong_t)DVA_GET_OFFSET(&dva[i]), + (u_longlong_t)DVA_GET_ASIZE(&dva[i])); + + if (BP_IS_HOLE(bp)) { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + "%llxL B=%llu", + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)bp->blk_birth); + } else { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + "%llxL/%llxP F=%llu B=%llu/%llu", + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)BP_GET_PSIZE(bp), + (u_longlong_t)BP_GET_FILL(bp), + (u_longlong_t)bp->blk_birth, + (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); + if (bp_freed) + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " %s", "FREE"); + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " cksum=%llx:%llx:%llx:%llx", + (u_longlong_t)bp->blk_cksum.zc_word[0], + (u_longlong_t)bp->blk_cksum.zc_word[1], + (u_longlong_t)bp->blk_cksum.zc_word[2], + (u_longlong_t)bp->blk_cksum.zc_word[3]); + } +} + +static void +print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb, + const dnode_phys_t *dnp) +{ + char blkbuf[BP_SPRINTF_LEN]; + int l; + + if (!BP_IS_EMBEDDED(bp)) { + ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); + ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); + } + + (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); + + ASSERT(zb->zb_level >= 0); + + for (l = dnp->dn_nlevels - 1; l >= -1; 
l--) { + if (l == zb->zb_level) { + (void) printf("L%llx", (u_longlong_t)zb->zb_level); + } else { + (void) printf(" "); + } + } + + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE); + if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD) + snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp); + (void) printf("%s\n", blkbuf); +} + +static int +visit_indirect(spa_t *spa, const dnode_phys_t *dnp, + blkptr_t *bp, const zbookmark_phys_t *zb) +{ + int err = 0; + + if (bp->blk_birth == 0) + return (0); + + print_indirect(spa, bp, zb, dnp); + + if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { + arc_flags_t flags = ARC_FLAG_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + arc_buf_t *buf; + uint64_t fill = 0; + ASSERT(!BP_IS_REDACTED(bp)); + + err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + ASSERT(buf->b_data); + + /* recursively visit blocks below this */ + cbp = buf->b_data; + for (i = 0; i < epb; i++, cbp++) { + zbookmark_phys_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + err = visit_indirect(spa, dnp, cbp, &czb); + if (err) + break; + fill += BP_GET_FILL(cbp); + } + if (!err) + ASSERT3U(fill, ==, BP_GET_FILL(bp)); + arc_buf_destroy(buf, &buf); + } + + return (err); +} + +/*ARGSUSED*/ +static void +dump_indirect(dnode_t *dn) +{ + dnode_phys_t *dnp = dn->dn_phys; + int j; + zbookmark_phys_t czb; + + (void) printf("Indirect blocks:\n"); + + SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), + dn->dn_object, dnp->dn_nlevels - 1, 0); + for (j = 0; j < dnp->dn_nblkptr; j++) { + czb.zb_blkid = j; + (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, + &dnp->dn_blkptr[j], &czb); + } + + (void) printf("\n"); +} + +/*ARGSUSED*/ +static void +dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) +{ + dsl_dir_phys_t *dd = data; + time_t crtime; + char 
nice[32]; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ); + + if (dd == NULL) + return; + + ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); + + crtime = dd->dd_creation_time; + (void) printf("\t\tcreation_time = %s", ctime(&crtime)); + (void) printf("\t\thead_dataset_obj = %llu\n", + (u_longlong_t)dd->dd_head_dataset_obj); + (void) printf("\t\tparent_dir_obj = %llu\n", + (u_longlong_t)dd->dd_parent_obj); + (void) printf("\t\torigin_obj = %llu\n", + (u_longlong_t)dd->dd_origin_obj); + (void) printf("\t\tchild_dir_zapobj = %llu\n", + (u_longlong_t)dd->dd_child_dir_zapobj); + zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); + (void) printf("\t\tused_bytes = %s\n", nice); + zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); + (void) printf("\t\tcompressed_bytes = %s\n", nice); + zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); + (void) printf("\t\tuncompressed_bytes = %s\n", nice); + zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); + (void) printf("\t\tquota = %s\n", nice); + zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); + (void) printf("\t\treserved = %s\n", nice); + (void) printf("\t\tprops_zapobj = %llu\n", + (u_longlong_t)dd->dd_props_zapobj); + (void) printf("\t\tdeleg_zapobj = %llu\n", + (u_longlong_t)dd->dd_deleg_zapobj); + (void) printf("\t\tflags = %llx\n", + (u_longlong_t)dd->dd_flags); + +#define DO(which) \ + zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ + sizeof (nice)); \ + (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) + DO(HEAD); + DO(SNAP); + DO(CHILD); + DO(CHILD_RSRV); + DO(REFRSRV); +#undef DO + (void) printf("\t\tclones = %llu\n", + (u_longlong_t)dd->dd_clones); +} + +/*ARGSUSED*/ +static void +dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) +{ + dsl_dataset_phys_t *ds = data; + time_t crtime; + char used[32], compressed[32], uncompressed[32], unique[32]; + char blkbuf[BP_SPRINTF_LEN]; + + /* make sure nicenum has enough space */ 
+ CTASSERT(sizeof (used) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ); + + if (ds == NULL) + return; + + ASSERT(size == sizeof (*ds)); + crtime = ds->ds_creation_time; + zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); + zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); + zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, + sizeof (uncompressed)); + zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); + + (void) printf("\t\tdir_obj = %llu\n", + (u_longlong_t)ds->ds_dir_obj); + (void) printf("\t\tprev_snap_obj = %llu\n", + (u_longlong_t)ds->ds_prev_snap_obj); + (void) printf("\t\tprev_snap_txg = %llu\n", + (u_longlong_t)ds->ds_prev_snap_txg); + (void) printf("\t\tnext_snap_obj = %llu\n", + (u_longlong_t)ds->ds_next_snap_obj); + (void) printf("\t\tsnapnames_zapobj = %llu\n", + (u_longlong_t)ds->ds_snapnames_zapobj); + (void) printf("\t\tnum_children = %llu\n", + (u_longlong_t)ds->ds_num_children); + (void) printf("\t\tuserrefs_obj = %llu\n", + (u_longlong_t)ds->ds_userrefs_obj); + (void) printf("\t\tcreation_time = %s", ctime(&crtime)); + (void) printf("\t\tcreation_txg = %llu\n", + (u_longlong_t)ds->ds_creation_txg); + (void) printf("\t\tdeadlist_obj = %llu\n", + (u_longlong_t)ds->ds_deadlist_obj); + (void) printf("\t\tused_bytes = %s\n", used); + (void) printf("\t\tcompressed_bytes = %s\n", compressed); + (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); + (void) printf("\t\tunique = %s\n", unique); + (void) printf("\t\tfsid_guid = %llu\n", + (u_longlong_t)ds->ds_fsid_guid); + (void) printf("\t\tguid = %llu\n", + (u_longlong_t)ds->ds_guid); + (void) printf("\t\tflags = %llx\n", + (u_longlong_t)ds->ds_flags); + (void) printf("\t\tnext_clones_obj = %llu\n", + (u_longlong_t)ds->ds_next_clones_obj); + (void) printf("\t\tprops_obj = %llu\n", + 
(u_longlong_t)ds->ds_props_obj); + (void) printf("\t\tbp = %s\n", blkbuf); +} + +/* ARGSUSED */ +static int +dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + char blkbuf[BP_SPRINTF_LEN]; + + if (bp->blk_birth != 0) { + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("\t%s\n", blkbuf); + } + return (0); +} + +static void +dump_bptree(objset_t *os, uint64_t obj, const char *name) +{ + char bytes[32]; + bptree_phys_t *bt; + dmu_buf_t *db; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); + + if (dump_opt['d'] < 3) + return; + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); + (void) printf("\n %s: %llu datasets, %s\n", + name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); + dmu_buf_rele(db, FTAG); + + if (dump_opt['d'] < 5) + return; + + (void) printf("\n"); + + (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); +} + +/* ARGSUSED */ +static int +dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) +{ + char blkbuf[BP_SPRINTF_LEN]; + + ASSERT(bp->blk_birth != 0); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); + (void) printf("\t%s\n", blkbuf); + return (0); +} + +static void +dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) +{ + char bytes[32]; + char comp[32]; + char uncomp[32]; + uint64_t i; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); + + if (dump_opt['d'] < 3) + return; + + zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); + if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { + zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); + zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); + if (bpo->bpo_havefreed) { + (void) printf(" %*s: object %llu, %llu local " + 
"blkptrs, %llu freed, %llu subobjs in object %llu, " + "%s (%s/%s comp)\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_freed, + (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, + (u_longlong_t)bpo->bpo_phys->bpo_subobjs, + bytes, comp, uncomp); + } else { + (void) printf(" %*s: object %llu, %llu local " + "blkptrs, %llu subobjs in object %llu, " + "%s (%s/%s comp)\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, + (u_longlong_t)bpo->bpo_phys->bpo_subobjs, + bytes, comp, uncomp); + } + + for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { + uint64_t subobj; + bpobj_t subbpo; + int error; + VERIFY0(dmu_read(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + i * sizeof (subobj), sizeof (subobj), &subobj, 0)); + error = bpobj_open(&subbpo, bpo->bpo_os, subobj); + if (error != 0) { + (void) printf("ERROR %u while trying to open " + "subobj id %llu\n", + error, (u_longlong_t)subobj); + continue; + } + dump_full_bpobj(&subbpo, "subobj", indent + 1); + bpobj_close(&subbpo); + } + } else { + if (bpo->bpo_havefreed) { + (void) printf(" %*s: object %llu, %llu blkptrs, " + "%llu freed, %s\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_freed, + bytes); + } else { + (void) printf(" %*s: object %llu, %llu blkptrs, " + "%s\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + bytes); + } + } + + if (dump_opt['d'] < 5) + return; + + + if (indent == 0) { + (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); + (void) printf("\n"); + } +} + +static int +dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact, + boolean_t print_list) +{ + int err = 0; + zfs_bookmark_phys_t prop; + objset_t *mos = dp->dp_spa->spa_meta_objset; + 
err = dsl_bookmark_lookup(dp, name, NULL, &prop); + + if (err != 0) { + return (err); + } + + (void) printf("\t#%s: ", strchr(name, '#') + 1); + (void) printf("{guid: %llx creation_txg: %llu creation_time: " + "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid, + (u_longlong_t)prop.zbm_creation_txg, + (u_longlong_t)prop.zbm_creation_time, + (u_longlong_t)prop.zbm_redaction_obj); + + IMPLY(print_list, print_redact); + if (!print_redact || prop.zbm_redaction_obj == 0) + return (0); + + redaction_list_t *rl; + VERIFY0(dsl_redaction_list_hold_obj(dp, + prop.zbm_redaction_obj, FTAG, &rl)); + + redaction_list_phys_t *rlp = rl->rl_phys; + (void) printf("\tRedacted:\n\t\tProgress: "); + if (rlp->rlp_last_object != UINT64_MAX || + rlp->rlp_last_blkid != UINT64_MAX) { + (void) printf("%llu %llu (incomplete)\n", + (u_longlong_t)rlp->rlp_last_object, + (u_longlong_t)rlp->rlp_last_blkid); + } else { + (void) printf("complete\n"); + } + (void) printf("\t\tSnapshots: ["); + for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) { + if (i > 0) + (void) printf(", "); + (void) printf("%0llu", + (u_longlong_t)rlp->rlp_snaps[i]); + } + (void) printf("]\n\t\tLength: %llu\n", + (u_longlong_t)rlp->rlp_num_entries); + + if (!print_list) { + dsl_redaction_list_rele(rl, FTAG); + return (0); + } + + if (rlp->rlp_num_entries == 0) { + dsl_redaction_list_rele(rl, FTAG); + (void) printf("\t\tRedaction List: []\n\n"); + return (0); + } + + redact_block_phys_t *rbp_buf; + uint64_t size; + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi)); + size = doi.doi_max_offset; + rbp_buf = kmem_alloc(size, KM_SLEEP); + + err = dmu_read(mos, prop.zbm_redaction_obj, 0, size, + rbp_buf, 0); + if (err != 0) { + dsl_redaction_list_rele(rl, FTAG); + kmem_free(rbp_buf, size); + return (err); + } + + (void) printf("\t\tRedaction List: [{object: %llx, offset: " + "%llx, blksz: %x, count: %llx}", + (u_longlong_t)rbp_buf[0].rbp_object, + (u_longlong_t)rbp_buf[0].rbp_blkid, + 
(uint_t)(redact_block_get_size(&rbp_buf[0])), + (u_longlong_t)redact_block_get_count(&rbp_buf[0])); + + for (size_t i = 1; i < rlp->rlp_num_entries; i++) { + (void) printf(",\n\t\t{object: %llx, offset: %llx, " + "blksz: %x, count: %llx}", + (u_longlong_t)rbp_buf[i].rbp_object, + (u_longlong_t)rbp_buf[i].rbp_blkid, + (uint_t)(redact_block_get_size(&rbp_buf[i])), + (u_longlong_t)redact_block_get_count(&rbp_buf[i])); + } + dsl_redaction_list_rele(rl, FTAG); + kmem_free(rbp_buf, size); + (void) printf("]\n\n"); + return (0); +} + +static void +dump_bookmarks(objset_t *os, int verbosity) +{ + zap_cursor_t zc; + zap_attribute_t attr; + dsl_dataset_t *ds = dmu_objset_ds(os); + dsl_pool_t *dp = spa_get_dsl(os->os_spa); + objset_t *mos = os->os_spa->spa_meta_objset; + if (verbosity < 4) + return; + dsl_pool_config_enter(dp, FTAG); + + for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + char osname[ZFS_MAX_DATASET_NAME_LEN]; + char buf[ZFS_MAX_DATASET_NAME_LEN]; + dmu_objset_name(os, osname); + VERIFY3S(0, <=, snprintf(buf, sizeof (buf), "%s#%s", osname, + attr.za_name)); + (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6); + } + zap_cursor_fini(&zc); + dsl_pool_config_exit(dp, FTAG); +} + +static void +bpobj_count_refd(bpobj_t *bpo) +{ + mos_obj_refd(bpo->bpo_object); + + if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { + mos_obj_refd(bpo->bpo_phys->bpo_subobjs); + for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { + uint64_t subobj; + bpobj_t subbpo; + int error; + VERIFY0(dmu_read(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + i * sizeof (subobj), sizeof (subobj), &subobj, 0)); + error = bpobj_open(&subbpo, bpo->bpo_os, subobj); + if (error != 0) { + (void) printf("ERROR %u while trying to open " + "subobj id %llu\n", + error, (u_longlong_t)subobj); + continue; + } + bpobj_count_refd(&subbpo); + bpobj_close(&subbpo); + } + } +} + +static int 
/*
 * dsl_deadlist_iterate() callback: account the MOS objects referenced by a
 * single deadlist entry's bpobj.  The pool-wide empty bpobj is shared by
 * many deadlists, so it is deliberately not counted here.
 */
dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
{
	spa_t *spa = arg;
	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
	if (dle->dle_bpobj.bpo_object != empty_bpobj)
		bpobj_count_refd(&dle->dle_bpobj);
	return (0);
}

/*
 * dsl_deadlist_iterate() callback: print one deadlist entry.  At
 * verbosity -ddddd and above the entry's whole bpobj is dumped; below
 * that only the mintxg -> object mapping is printed.
 */
static int
dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
{
	ASSERT(arg == NULL);
	if (dump_opt['d'] >= 5) {
		char buf[128];
		(void) snprintf(buf, sizeof (buf),
		    "mintxg %llu -> obj %llu",
		    (longlong_t)dle->dle_mintxg,
		    (longlong_t)dle->dle_bpobj.bpo_object);

		dump_full_bpobj(&dle->dle_bpobj, buf, 0);
	} else {
		(void) printf("mintxg %llu -> obj %llu\n",
		    (longlong_t)dle->dle_mintxg,
		    (longlong_t)dle->dle_bpobj.bpo_object);
	}
	return (0);
}

/*
 * Print a summary (and, at higher -d verbosity, the entries) of a
 * deadlist or livelist.  Also feeds every referenced object into the
 * MOS leak-tracking machinery via bpobj_count_refd()/mos_obj_refd(),
 * which happens unconditionally, before any verbosity checks.
 */
static void
dump_blkptr_list(dsl_deadlist_t *dl, char *name)
{
	char bytes[32];
	char comp[32];
	char uncomp[32];
	char entries[32];
	spa_t *spa = dmu_objset_spa(dl->dl_os);
	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;

	/* Reference accounting: old format has a single bpobj. */
	if (dl->dl_oldfmt) {
		if (dl->dl_bpobj.bpo_object != empty_bpobj)
			bpobj_count_refd(&dl->dl_bpobj);
	} else {
		mos_obj_refd(dl->dl_object);
		dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
	}

	/* make sure nicenum has enough space */
	CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ);
	CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ);

	if (dump_opt['d'] < 3)
		return;

	if (dl->dl_oldfmt) {
		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
		return;
	}

	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
	zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
	(void) printf("\n    %s: %s (%s/%s comp), %s entries\n",
	    name, bytes, comp, uncomp, entries);

	if (dump_opt['d'] < 4)
		return;

	(void) printf("\n");

	dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
}

/*
 * Cross-check a dsl_dir's livelist space accounting against
 * dsl_dataset_space_written() computed from the clone's origin.
 * Returns 0 when consistent, 1 (after printing the discrepancy)
 * when the used/comp values disagree or the livelist's uncomp
 * exceeds the dataset's.
 */
static int
verify_dd_livelist(objset_t *os)
{
	uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;

	ASSERT(!dmu_objset_is_snapshot(os));
	if (!dsl_deadlist_is_open(&dd->dd_livelist))
		return (0);

	/* Iterate through the livelist to check for duplicates */
	dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
	    NULL);

	dsl_pool_config_enter(dp, FTAG);
	dsl_deadlist_space(&dd->dd_livelist, &ll_used,
	    &ll_comp, &ll_uncomp);

	dsl_dataset_t *origin_ds;
	ASSERT(dsl_pool_config_held(dp));
	VERIFY0(dsl_dataset_hold_obj(dp,
	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
	VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
	    &used, &comp, &uncomp));
	dsl_dataset_rele(origin_ds, FTAG);
	dsl_pool_config_exit(dp, FTAG);
	/*
	 * It's possible that the dataset's uncomp space is larger than the
	 * livelist's because livelists do not track embedded block pointers
	 */
	if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) {
		char nice_used[32], nice_comp[32], nice_uncomp[32];
		(void) printf("Discrepancy in space accounting:\n");
		zdb_nicenum(used, nice_used, sizeof (nice_used));
		zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
		zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
		(void) printf("dir: used %s, comp %s, uncomp %s\n",
		    nice_used, nice_comp, nice_uncomp);
		zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
		zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
		zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
		(void) printf("livelist: used %s, comp %s, uncomp %s\n",
		    nice_used, nice_comp, nice_uncomp);
		return (1);
	}
	return (0);
}

/* FUID domain/index lookup state, populated lazily by dump_uidgid(). */
static avl_tree_t idx_tree;
static avl_tree_t domain_tree;
static boolean_t fuid_table_loaded;
/* The single objset currently opened via open_objset(), and its SA layout. */
static objset_t *sa_os = NULL;
static sa_attr_type_t *sa_attr_table = NULL;

static int
+open_objset(const char *path, void *tag, objset_t **osp) +{ + int err; + uint64_t sa_attrs = 0; + uint64_t version = 0; + + VERIFY3P(sa_os, ==, NULL); + /* + * We can't own an objset if it's redacted. Therefore, we do this + * dance: hold the objset, then acquire a long hold on its dataset, then + * release the pool (which is held as part of holding the objset). + */ + err = dmu_objset_hold(path, tag, osp); + if (err != 0) { + (void) fprintf(stderr, "failed to hold dataset '%s': %s\n", + path, strerror(err)); + return (err); + } + dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); + dsl_pool_rele(dmu_objset_pool(*osp), tag); + + if (dmu_objset_type(*osp) == DMU_OST_ZFS && !(*osp)->os_encrypted) { + (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &version); + if (version >= ZPL_VERSION_SA) { + (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, + 8, 1, &sa_attrs); + } + err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, + &sa_attr_table); + if (err != 0) { + (void) fprintf(stderr, "sa_setup failed: %s\n", + strerror(err)); + dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); + dsl_dataset_rele(dmu_objset_ds(*osp), tag); + *osp = NULL; + } + } + sa_os = *osp; + + return (0); +} + +static void +close_objset(objset_t *os, void *tag) +{ + VERIFY3P(os, ==, sa_os); + if (os->os_sa != NULL) + sa_tear_down(os); + dsl_dataset_long_rele(dmu_objset_ds(os), tag); + dsl_dataset_rele(dmu_objset_ds(os), tag); + sa_attr_table = NULL; + sa_os = NULL; +} + +static void +fuid_table_destroy(void) +{ + if (fuid_table_loaded) { + zfs_fuid_table_destroy(&idx_tree, &domain_tree); + fuid_table_loaded = B_FALSE; + } +} + +/* + * print uid or gid information. + * For normal POSIX id just the id is printed in decimal format. + * For CIFS files with FUID the fuid is printed in hex followed by + * the domain-rid string. 
 */
static void
print_idstr(uint64_t id, const char *id_type)
{
	/* A nonzero FUID index marks a CIFS-style id with a domain table. */
	if (FUID_INDEX(id)) {
		char *domain;

		domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
		(void) printf("\t%s     %llx [%s-%d]\n", id_type,
		    (u_longlong_t)id, domain, (int)FUID_RID(id));
	} else {
		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
	}

}

/*
 * Print the uid and gid of a znode, loading the FUID domain tables from
 * the master node on first use if either id carries a FUID index.
 */
static void
dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
{
	uint32_t uid_idx, gid_idx;

	uid_idx = FUID_INDEX(uid);
	gid_idx = FUID_INDEX(gid);

	/* Load domain table, if not already loaded */
	if (!fuid_table_loaded && (uid_idx || gid_idx)) {
		uint64_t fuid_obj;

		/* first find the fuid object.  It lives in the master node */
		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
		    8, 1, &fuid_obj) == 0);
		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
		(void) zfs_fuid_table_load(os, fuid_obj,
		    &idx_tree, &domain_tree);
		fuid_table_loaded = B_TRUE;
	}

	print_idstr(uid, "uid");
	print_idstr(gid, "gid");
}

/*
 * Print the SA-packed xattrs of a znode: unpack the DXATTR nvlist from
 * the SA handle and print each name with its value bytes (printable
 * characters literally, others as octal escapes).  All failures are
 * silently treated as "no xattrs to show".
 */
static void
dump_znode_sa_xattr(sa_handle_t *hdl)
{
	nvlist_t *sa_xattr;
	nvpair_t *elem = NULL;
	int sa_xattr_size = 0;
	int sa_xattr_entries = 0;
	int error;
	char *sa_xattr_packed;

	error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
	if (error || sa_xattr_size == 0)
		return;

	sa_xattr_packed = malloc(sa_xattr_size);
	if (sa_xattr_packed == NULL)
		return;

	error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
	    sa_xattr_packed, sa_xattr_size);
	if (error) {
		free(sa_xattr_packed);
		return;
	}

	error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
	if (error) {
		free(sa_xattr_packed);
		return;
	}

	/* First pass: count entries (the loop leaves elem == NULL again). */
	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
		sa_xattr_entries++;

	(void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
	    sa_xattr_size, sa_xattr_entries);
	/* Second pass: print each xattr name and its raw value bytes. */
	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
		uchar_t *value;
		uint_t cnt, idx;

		(void) printf("\t\t%s = ", nvpair_name(elem));
nvpair_value_byte_array(elem, &value, &cnt); + for (idx = 0; idx < cnt; ++idx) { + if (isprint(value[idx])) + (void) putchar(value[idx]); + else + (void) printf("\\%3.3o", value[idx]); + } + (void) putchar('\n'); + } + + nvlist_free(sa_xattr); + free(sa_xattr_packed); +} + +static void +dump_znode_symlink(sa_handle_t *hdl) +{ + int sa_symlink_size = 0; + char linktarget[MAXPATHLEN]; + linktarget[0] = '\0'; + int error; + + error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size); + if (error || sa_symlink_size == 0) { + return; + } + if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK], + &linktarget, sa_symlink_size) == 0) + (void) printf("\ttarget %s\n", linktarget); +} + +/*ARGSUSED*/ +static void +dump_znode(objset_t *os, uint64_t object, void *data, size_t size) +{ + char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ + sa_handle_t *hdl; + uint64_t xattr, rdev, gen; + uint64_t uid, gid, mode, fsize, parent, links; + uint64_t pflags; + uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; + time_t z_crtime, z_atime, z_mtime, z_ctime; + sa_bulk_attr_t bulk[12]; + int idx = 0; + int error; + + VERIFY3P(os, ==, sa_os); + if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { + (void) printf("Failed to get handle for SA znode\n"); + return; + } + + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, + &links, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, + &mode, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], + NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, + &fsize, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, + acctm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, + modtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, 
sa_attr_table[ZPL_CRTIME], NULL, + crtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, + chgtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, + &pflags, 8); + + if (sa_bulk_lookup(hdl, bulk, idx)) { + (void) sa_handle_destroy(hdl); + return; + } + + z_crtime = (time_t)crtm[0]; + z_atime = (time_t)acctm[0]; + z_mtime = (time_t)modtm[0]; + z_ctime = (time_t)chgtm[0]; + + if (dump_opt['d'] > 4) { + error = zfs_obj_to_path(os, object, path, sizeof (path)); + if (error == ESTALE) { + (void) snprintf(path, sizeof (path), "on delete queue"); + } else if (error != 0) { + leaked_objects++; + (void) snprintf(path, sizeof (path), + "path not found, possibly leaked"); + } + (void) printf("\tpath %s\n", path); + } + + if (S_ISLNK(mode)) + dump_znode_symlink(hdl); + dump_uidgid(os, uid, gid); + (void) printf("\tatime %s", ctime(&z_atime)); + (void) printf("\tmtime %s", ctime(&z_mtime)); + (void) printf("\tctime %s", ctime(&z_ctime)); + (void) printf("\tcrtime %s", ctime(&z_crtime)); + (void) printf("\tgen %llu\n", (u_longlong_t)gen); + (void) printf("\tmode %llo\n", (u_longlong_t)mode); + (void) printf("\tsize %llu\n", (u_longlong_t)fsize); + (void) printf("\tparent %llu\n", (u_longlong_t)parent); + (void) printf("\tlinks %llu\n", (u_longlong_t)links); + (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); + if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) { + uint64_t projid; + + if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid, + sizeof (uint64_t)) == 0) + (void) printf("\tprojid %llu\n", (u_longlong_t)projid); + } + if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, + sizeof (uint64_t)) == 0) + (void) printf("\txattr %llu\n", (u_longlong_t)xattr); + if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, + sizeof (uint64_t)) == 0) + (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); + dump_znode_sa_xattr(hdl); + sa_handle_destroy(hdl); +} + +/*ARGSUSED*/ +static void +dump_acl(objset_t *os, uint64_t object, 
void *data, size_t size) +{ +} + +/*ARGSUSED*/ +static void +dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) +{ +} + +static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { + dump_none, /* unallocated */ + dump_zap, /* object directory */ + dump_uint64, /* object array */ + dump_none, /* packed nvlist */ + dump_packed_nvlist, /* packed nvlist size */ + dump_none, /* bpobj */ + dump_bpobj, /* bpobj header */ + dump_none, /* SPA space map header */ + dump_none, /* SPA space map */ + dump_none, /* ZIL intent log */ + dump_dnode, /* DMU dnode */ + dump_dmu_objset, /* DMU objset */ + dump_dsl_dir, /* DSL directory */ + dump_zap, /* DSL directory child map */ + dump_zap, /* DSL dataset snap map */ + dump_zap, /* DSL props */ + dump_dsl_dataset, /* DSL dataset */ + dump_znode, /* ZFS znode */ + dump_acl, /* ZFS V0 ACL */ + dump_uint8, /* ZFS plain file */ + dump_zpldir, /* ZFS directory */ + dump_zap, /* ZFS master node */ + dump_zap, /* ZFS delete queue */ + dump_uint8, /* zvol object */ + dump_zap, /* zvol prop */ + dump_uint8, /* other uint8[] */ + dump_uint64, /* other uint64[] */ + dump_zap, /* other ZAP */ + dump_zap, /* persistent error log */ + dump_uint8, /* SPA history */ + dump_history_offsets, /* SPA history offsets */ + dump_zap, /* Pool properties */ + dump_zap, /* DSL permissions */ + dump_acl, /* ZFS ACL */ + dump_uint8, /* ZFS SYSACL */ + dump_none, /* FUID nvlist */ + dump_packed_nvlist, /* FUID nvlist size */ + dump_zap, /* DSL dataset next clones */ + dump_zap, /* DSL scrub queue */ + dump_zap, /* ZFS user/group/project used */ + dump_zap, /* ZFS user/group/project quota */ + dump_zap, /* snapshot refcount tags */ + dump_ddt_zap, /* DDT ZAP object */ + dump_zap, /* DDT statistics */ + dump_znode, /* SA object */ + dump_zap, /* SA Master Node */ + dump_sa_attrs, /* SA attribute registration */ + dump_sa_layouts, /* SA attribute layouts */ + dump_zap, /* DSL scrub translations */ + dump_none, /* fake dedup BP */ + 
dump_zap, /* deadlist */ + dump_none, /* deadlist hdr */ + dump_zap, /* dsl clones */ + dump_bpobj_subobjs, /* bpobj subobjs */ + dump_unknown, /* Unknown type, must be last */ +}; + +static boolean_t +match_object_type(dmu_object_type_t obj_type, uint64_t flags) +{ + boolean_t match = B_TRUE; + + switch (obj_type) { + case DMU_OT_DIRECTORY_CONTENTS: + if (!(flags & ZOR_FLAG_DIRECTORY)) + match = B_FALSE; + break; + case DMU_OT_PLAIN_FILE_CONTENTS: + if (!(flags & ZOR_FLAG_PLAIN_FILE)) + match = B_FALSE; + break; + case DMU_OT_SPACE_MAP: + if (!(flags & ZOR_FLAG_SPACE_MAP)) + match = B_FALSE; + break; + default: + if (strcmp(zdb_ot_name(obj_type), "zap") == 0) { + if (!(flags & ZOR_FLAG_ZAP)) + match = B_FALSE; + break; + } + + /* + * If all bits except some of the supported flags are + * set, the user combined the all-types flag (A) with + * a negated flag to exclude some types (e.g. A-f to + * show all object types except plain files). + */ + if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES) + match = B_FALSE; + + break; + } + + return (match); +} + +static void +dump_object(objset_t *os, uint64_t object, int verbosity, + boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags) +{ + dmu_buf_t *db = NULL; + dmu_object_info_t doi; + dnode_t *dn; + boolean_t dnode_held = B_FALSE; + void *bonus = NULL; + size_t bsize = 0; + char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; + char bonus_size[32]; + char aux[50]; + int error; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ); + + if (*print_header) { + (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", + "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", + "lsize", "%full", "type"); + *print_header = 0; + } + + if (object == 0) { + dn = DMU_META_DNODE(os); + 
dmu_object_info_from_dnode(dn, &doi); + } else { + /* + * Encrypted datasets will have sensitive bonus buffers + * encrypted. Therefore we cannot hold the bonus buffer and + * must hold the dnode itself instead. + */ + error = dmu_object_info(os, object, &doi); + if (error) + fatal("dmu_object_info() failed, errno %u", error); + + if (os->os_encrypted && + DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) { + error = dnode_hold(os, object, FTAG, &dn); + if (error) + fatal("dnode_hold() failed, errno %u", error); + dnode_held = B_TRUE; + } else { + error = dmu_bonus_hold(os, object, FTAG, &db); + if (error) + fatal("dmu_bonus_hold(%llu) failed, errno %u", + object, error); + bonus = db->db_data; + bsize = db->db_size; + dn = DB_DNODE((dmu_buf_impl_t *)db); + } + } + + /* + * Default to showing all object types if no flags were specified. + */ + if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES && + !match_object_type(doi.doi_type, flags)) + goto out; + + if (dnode_slots_used) + *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; + + zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); + zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); + zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); + zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); + zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); + zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); + (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * + doi.doi_data_block_size / (object == 0 ? 
DNODES_PER_BLOCK : 1) / + doi.doi_max_offset); + + aux[0] = '\0'; + + if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { + (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), + " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); + } + + if (doi.doi_compress == ZIO_COMPRESS_INHERIT && + ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) { + const char *compname = NULL; + if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION, + ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel), + &compname) == 0) { + (void) snprintf(aux + strlen(aux), + sizeof (aux) - strlen(aux), " (Z=inherit=%s)", + compname); + } else { + (void) snprintf(aux + strlen(aux), + sizeof (aux) - strlen(aux), + " (Z=inherit=%s-unknown)", + ZDB_COMPRESS_NAME(os->os_compress)); + } + } else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) { + (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), + " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress)); + } else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { + (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), + " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); + } + + (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n", + (u_longlong_t)object, doi.doi_indirection, iblk, dblk, + asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux); + + if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { + (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", + "", "", "", "", "", "", bonus_size, "bonus", + zdb_ot_name(doi.doi_bonus_type)); + } + + if (verbosity >= 4) { + (void) printf("\tdnode flags: %s%s%s%s\n", + (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? + "USED_BYTES " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? + "USERUSED_ACCOUNTED " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ? + "USEROBJUSED_ACCOUNTED " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? 
+ "SPILL_BLKPTR" : ""); + (void) printf("\tdnode maxblkid: %llu\n", + (longlong_t)dn->dn_phys->dn_maxblkid); + + if (!dnode_held) { + object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, + object, bonus, bsize); + } else { + (void) printf("\t\t(bonus encrypted)\n"); + } + + if (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type)) { + object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, + NULL, 0); + } else { + (void) printf("\t\t(object encrypted)\n"); + } + + *print_header = B_TRUE; + } + + if (verbosity >= 5) + dump_indirect(dn); + + if (verbosity >= 5) { + /* + * Report the list of segments that comprise the object. + */ + uint64_t start = 0; + uint64_t end; + uint64_t blkfill = 1; + int minlvl = 1; + + if (dn->dn_type == DMU_OT_DNODE) { + minlvl = 0; + blkfill = DNODES_PER_BLOCK; + } + + for (;;) { + char segsize[32]; + /* make sure nicenum has enough space */ + CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ); + error = dnode_next_offset(dn, + 0, &start, minlvl, blkfill, 0); + if (error) + break; + end = start; + error = dnode_next_offset(dn, + DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); + zdb_nicenum(end - start, segsize, sizeof (segsize)); + (void) printf("\t\tsegment [%016llx, %016llx)" + " size %5s\n", (u_longlong_t)start, + (u_longlong_t)end, segsize); + if (error) + break; + start = end; + } + } + +out: + if (db != NULL) + dmu_buf_rele(db, FTAG); + if (dnode_held) + dnode_rele(dn, FTAG); +} + +static void +count_dir_mos_objects(dsl_dir_t *dd) +{ + mos_obj_refd(dd->dd_object); + mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); + mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); + mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); + mos_obj_refd(dsl_dir_phys(dd)->dd_clones); + + /* + * The dd_crypto_obj can be referenced by multiple dsl_dir's. + * Ignore the references after the first one. 
+ */ + mos_obj_refd_multiple(dd->dd_crypto_obj); +} + +static void +count_ds_mos_objects(dsl_dataset_t *ds) +{ + mos_obj_refd(ds->ds_object); + mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); + mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); + mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); + mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); + mos_obj_refd(ds->ds_bookmarks_obj); + + if (!dsl_dataset_is_snapshot(ds)) { + count_dir_mos_objects(ds->ds_dir); + } +} + +static const char *objset_types[DMU_OST_NUMTYPES] = { + "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; + +/* + * Parse a string denoting a range of object IDs of the form + * <start>[:<end>[:flags]], and store the results in zor. + * Return 0 on success. On error, return 1 and update the msg + * pointer to point to a descriptive error message. + */ +static int +parse_object_range(char *range, zopt_object_range_t *zor, char **msg) +{ + uint64_t flags = 0; + char *p, *s, *dup, *flagstr; + size_t len; + int i; + int rc = 0; + + if (strchr(range, ':') == NULL) { + zor->zor_obj_start = strtoull(range, &p, 0); + if (*p != '\0') { + *msg = "Invalid characters in object ID"; + rc = 1; + } + zor->zor_obj_end = zor->zor_obj_start; + return (rc); + } + + if (strchr(range, ':') == range) { + *msg = "Invalid leading colon"; + rc = 1; + return (rc); + } + + len = strlen(range); + if (range[len - 1] == ':') { + *msg = "Invalid trailing colon"; + rc = 1; + return (rc); + } + + dup = strdup(range); + s = strtok(dup, ":"); + zor->zor_obj_start = strtoull(s, &p, 0); + + if (*p != '\0') { + *msg = "Invalid characters in start object ID"; + rc = 1; + goto out; + } + + s = strtok(NULL, ":"); + zor->zor_obj_end = strtoull(s, &p, 0); + + if (*p != '\0') { + *msg = "Invalid characters in end object ID"; + rc = 1; + goto out; + } + + if (zor->zor_obj_start > zor->zor_obj_end) { + *msg = "Start object ID may not exceed end object ID"; + rc = 1; + goto out; + } + + s = strtok(NULL, ":"); + if (s == NULL) { + 
zor->zor_flags = ZOR_FLAG_ALL_TYPES; + goto out; + } else if (strtok(NULL, ":") != NULL) { + *msg = "Invalid colon-delimited field after flags"; + rc = 1; + goto out; + } + + flagstr = s; + for (i = 0; flagstr[i]; i++) { + int bit; + boolean_t negation = (flagstr[i] == '-'); + + if (negation) { + i++; + if (flagstr[i] == '\0') { + *msg = "Invalid trailing negation operator"; + rc = 1; + goto out; + } + } + bit = flagbits[(uchar_t)flagstr[i]]; + if (bit == 0) { + *msg = "Invalid flag"; + rc = 1; + goto out; + } + if (negation) + flags &= ~bit; + else + flags |= bit; + } + zor->zor_flags = flags; + +out: + free(dup); + return (rc); +} + +static void +dump_objset(objset_t *os) +{ + dmu_objset_stats_t dds = { 0 }; + uint64_t object, object_count; + uint64_t refdbytes, usedobjs, scratch; + char numbuf[32]; + char blkbuf[BP_SPRINTF_LEN + 20]; + char osname[ZFS_MAX_DATASET_NAME_LEN]; + const char *type = "UNKNOWN"; + int verbosity = dump_opt['d']; + boolean_t print_header; + unsigned i; + int error; + uint64_t total_slots_used = 0; + uint64_t max_slot_used = 0; + uint64_t dnode_slots; + uint64_t obj_start; + uint64_t obj_end; + uint64_t flags; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ); + + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + dmu_objset_fast_stat(os, &dds); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + + print_header = B_TRUE; + + if (dds.dds_type < DMU_OST_NUMTYPES) + type = objset_types[dds.dds_type]; + + if (dds.dds_type == DMU_OST_META) { + dds.dds_creation_txg = TXG_INITIAL; + usedobjs = BP_GET_FILL(os->os_rootbp); + refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> + dd_used_bytes; + } else { + dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); + } + + ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); + + zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); + + if (verbosity >= 4) { + (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); + (void) snprintf_blkptr(blkbuf + 
strlen(blkbuf), + sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); + } else { + blkbuf[0] = '\0'; + } + + dmu_objset_name(os, osname); + + (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " + "%s, %llu objects%s%s\n", + osname, type, (u_longlong_t)dmu_objset_id(os), + (u_longlong_t)dds.dds_creation_txg, + numbuf, (u_longlong_t)usedobjs, blkbuf, + (dds.dds_inconsistent) ? " (inconsistent)" : ""); + + for (i = 0; i < zopt_object_args; i++) { + obj_start = zopt_object_ranges[i].zor_obj_start; + obj_end = zopt_object_ranges[i].zor_obj_end; + flags = zopt_object_ranges[i].zor_flags; + + object = obj_start; + if (object == 0 || obj_start == obj_end) + dump_object(os, object, verbosity, &print_header, NULL, + flags); + else + object--; + + while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) && + object <= obj_end) { + dump_object(os, object, verbosity, &print_header, NULL, + flags); + } + } + + if (zopt_object_args > 0) { + (void) printf("\n"); + return; + } + + if (dump_opt['i'] != 0 || verbosity >= 2) + dump_intent_log(dmu_objset_zil(os)); + + if (dmu_objset_ds(os) != NULL) { + dsl_dataset_t *ds = dmu_objset_ds(os); + dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + !dmu_objset_is_snapshot(os)) { + dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); + if (verify_dd_livelist(os) != 0) + fatal("livelist is incorrect"); + } + + if (dsl_dataset_remap_deadlist_exists(ds)) { + (void) printf("ds_remap_deadlist:\n"); + dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); + } + count_ds_mos_objects(ds); + } + + if (dmu_objset_ds(os) != NULL) + dump_bookmarks(os, verbosity); + + if (verbosity < 2) + return; + + if (BP_IS_HOLE(os->os_rootbp)) + return; + + dump_object(os, 0, verbosity, &print_header, NULL, 0); + object_count = 0; + if (DMU_USERUSED_DNODE(os) != NULL && + DMU_USERUSED_DNODE(os)->dn_type != 0) { + dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, + NULL, 0); + dump_object(os, 
DMU_GROUPUSED_OBJECT, verbosity, &print_header, + NULL, 0); + } + + if (DMU_PROJECTUSED_DNODE(os) != NULL && + DMU_PROJECTUSED_DNODE(os)->dn_type != 0) + dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity, + &print_header, NULL, 0); + + object = 0; + while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { + dump_object(os, object, verbosity, &print_header, &dnode_slots, + 0); + object_count++; + total_slots_used += dnode_slots; + max_slot_used = object + dnode_slots - 1; + } + + (void) printf("\n"); + + (void) printf(" Dnode slots:\n"); + (void) printf("\tTotal used: %10llu\n", + (u_longlong_t)total_slots_used); + (void) printf("\tMax used: %10llu\n", + (u_longlong_t)max_slot_used); + (void) printf("\tPercent empty: %10lf\n", + (double)(max_slot_used - total_slots_used)*100 / + (double)max_slot_used); + (void) printf("\n"); + + if (error != ESRCH) { + (void) fprintf(stderr, "dmu_object_next() = %d\n", error); + abort(); + } + + ASSERT3U(object_count, ==, usedobjs); + + if (leaked_objects != 0) { + (void) printf("%d potentially leaked objects detected\n", + leaked_objects); + leaked_objects = 0; + } +} + +static void +dump_uberblock(uberblock_t *ub, const char *header, const char *footer) +{ + time_t timestamp = ub->ub_timestamp; + + (void) printf("%s", header ? 
header : ""); + (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); + (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); + (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); + (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); + (void) printf("\ttimestamp = %llu UTC = %s", + (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); + + (void) printf("\tmmp_magic = %016llx\n", + (u_longlong_t)ub->ub_mmp_magic); + if (MMP_VALID(ub)) { + (void) printf("\tmmp_delay = %0llu\n", + (u_longlong_t)ub->ub_mmp_delay); + if (MMP_SEQ_VALID(ub)) + (void) printf("\tmmp_seq = %u\n", + (unsigned int) MMP_SEQ(ub)); + if (MMP_FAIL_INT_VALID(ub)) + (void) printf("\tmmp_fail = %u\n", + (unsigned int) MMP_FAIL_INT(ub)); + if (MMP_INTERVAL_VALID(ub)) + (void) printf("\tmmp_write = %u\n", + (unsigned int) MMP_INTERVAL(ub)); + /* After MMP_* to make summarize_uberblock_mmp cleaner */ + (void) printf("\tmmp_valid = %x\n", + (unsigned int) ub->ub_mmp_config & 0xFF); + } + + if (dump_opt['u'] >= 4) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); + (void) printf("\trootbp = %s\n", blkbuf); + } + (void) printf("\tcheckpoint_txg = %llu\n", + (u_longlong_t)ub->ub_checkpoint_txg); + (void) printf("%s", footer ? 
footer : ""); +} + +static void +dump_config(spa_t *spa) +{ + dmu_buf_t *db; + size_t nvsize = 0; + int error = 0; + + + error = dmu_bonus_hold(spa->spa_meta_objset, + spa->spa_config_object, FTAG, &db); + + if (error == 0) { + nvsize = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + (void) printf("\nMOS Configuration:\n"); + dump_packed_nvlist(spa->spa_meta_objset, + spa->spa_config_object, (void *)&nvsize, 1); + } else { + (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", + (u_longlong_t)spa->spa_config_object, error); + } +} + +static void +dump_cachefile(const char *cachefile) +{ + int fd; + struct stat64 statbuf; + char *buf; + nvlist_t *config; + + if ((fd = open64(cachefile, O_RDONLY)) < 0) { + (void) printf("cannot open '%s': %s\n", cachefile, + strerror(errno)); + exit(1); + } + + if (fstat64(fd, &statbuf) != 0) { + (void) printf("failed to stat '%s': %s\n", cachefile, + strerror(errno)); + exit(1); + } + + if ((buf = malloc(statbuf.st_size)) == NULL) { + (void) fprintf(stderr, "failed to allocate %llu bytes\n", + (u_longlong_t)statbuf.st_size); + exit(1); + } + + if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { + (void) fprintf(stderr, "failed to read %llu bytes\n", + (u_longlong_t)statbuf.st_size); + exit(1); + } + + (void) close(fd); + + if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { + (void) fprintf(stderr, "failed to unpack nvlist\n"); + exit(1); + } + + free(buf); + + dump_nvlist(config, 0); + + nvlist_free(config); +} + +/* + * ZFS label nvlist stats + */ +typedef struct zdb_nvl_stats { + int zns_list_count; + int zns_leaf_count; + size_t zns_leaf_largest; + size_t zns_leaf_total; + nvlist_t *zns_string; + nvlist_t *zns_uint64; + nvlist_t *zns_boolean; +} zdb_nvl_stats_t; + +static void +collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats) +{ + nvlist_t *list, **array; + nvpair_t *nvp = NULL; + char *name; + uint_t i, items; + + stats->zns_list_count++; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) 
!= NULL) { + name = nvpair_name(nvp); + + switch (nvpair_type(nvp)) { + case DATA_TYPE_STRING: + fnvlist_add_string(stats->zns_string, name, + fnvpair_value_string(nvp)); + break; + case DATA_TYPE_UINT64: + fnvlist_add_uint64(stats->zns_uint64, name, + fnvpair_value_uint64(nvp)); + break; + case DATA_TYPE_BOOLEAN: + fnvlist_add_boolean(stats->zns_boolean, name); + break; + case DATA_TYPE_NVLIST: + if (nvpair_value_nvlist(nvp, &list) == 0) + collect_nvlist_stats(list, stats); + break; + case DATA_TYPE_NVLIST_ARRAY: + if (nvpair_value_nvlist_array(nvp, &array, &items) != 0) + break; + + for (i = 0; i < items; i++) { + collect_nvlist_stats(array[i], stats); + + /* collect stats on leaf vdev */ + if (strcmp(name, "children") == 0) { + size_t size; + + (void) nvlist_size(array[i], &size, + NV_ENCODE_XDR); + stats->zns_leaf_total += size; + if (size > stats->zns_leaf_largest) + stats->zns_leaf_largest = size; + stats->zns_leaf_count++; + } + } + break; + default: + (void) printf("skip type %d!\n", (int)nvpair_type(nvp)); + } + } +} + +static void +dump_nvlist_stats(nvlist_t *nvl, size_t cap) +{ + zdb_nvl_stats_t stats = { 0 }; + size_t size, sum = 0, total; + size_t noise; + + /* requires nvlist with non-unique names for stat collection */ + VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0)); + VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0)); + VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0)); + VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR)); + + (void) printf("\n\nZFS Label NVList Config Stats:\n"); + + VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR)); + (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n", + (int)total, (int)(cap - total), 100.0 * total / cap); + + collect_nvlist_stats(nvl, &stats); + + VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR)); + size -= noise; + sum += size; + (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:", + (int)fnvlist_num_pairs(stats.zns_uint64), + (int)size, 100.0 * size / total); + + 
VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR)); + size -= noise; + sum += size; + (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:", + (int)fnvlist_num_pairs(stats.zns_string), + (int)size, 100.0 * size / total); + + VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR)); + size -= noise; + sum += size; + (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:", + (int)fnvlist_num_pairs(stats.zns_boolean), + (int)size, 100.0 * size / total); + + size = total - sum; /* treat remainder as nvlist overhead */ + (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:", + stats.zns_list_count, (int)size, 100.0 * size / total); + + if (stats.zns_leaf_count > 0) { + size_t average = stats.zns_leaf_total / stats.zns_leaf_count; + + (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:", + stats.zns_leaf_count, (int)average); + (void) printf("%24d bytes largest\n", + (int)stats.zns_leaf_largest); + + if (dump_opt['l'] >= 3 && average > 0) + (void) printf(" space for %d additional leaf vdevs\n", + (int)((cap - total) / average)); + } + (void) printf("\n"); + + nvlist_free(stats.zns_string); + nvlist_free(stats.zns_uint64); + nvlist_free(stats.zns_boolean); +} + +typedef struct cksum_record { + zio_cksum_t cksum; + boolean_t labels[VDEV_LABELS]; + avl_node_t link; +} cksum_record_t; + +static int +cksum_record_compare(const void *x1, const void *x2) +{ + const cksum_record_t *l = (cksum_record_t *)x1; + const cksum_record_t *r = (cksum_record_t *)x2; + int arraysize = ARRAY_SIZE(l->cksum.zc_word); + int difference; + + for (int i = 0; i < arraysize; i++) { + difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); + if (difference) + break; + } + + return (difference); +} + +static cksum_record_t * +cksum_record_alloc(zio_cksum_t *cksum, int l) +{ + cksum_record_t *rec; + + rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL); + rec->cksum = *cksum; + rec->labels[l] = B_TRUE; + + return (rec); +} + +static cksum_record_t * 
cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum)
{
	cksum_record_t lookup = { .cksum = *cksum };
	avl_index_t where;

	return (avl_find(tree, &lookup, &where));
}

/*
 * Insert (or update) a checksum record for label slot 'l': if a record
 * with this checksum already exists, simply mark this label as a holder;
 * otherwise allocate a fresh record and add it to the tree.
 */
static cksum_record_t *
cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l)
{
	cksum_record_t *rec;

	rec = cksum_record_lookup(tree, cksum);
	if (rec) {
		rec->labels[l] = B_TRUE;
	} else {
		rec = cksum_record_alloc(cksum, l);
		avl_add(tree, rec);
	}

	return (rec);
}

/*
 * Return the lowest label number that holds this record, or -1 if
 * no label slot is marked (should not happen for an inserted record).
 */
static int
first_label(cksum_record_t *rec)
{
	for (int i = 0; i < VDEV_LABELS; i++)
		if (rec->labels[i])
			return (i);

	return (-1);
}

/* Print 'prefix' followed by the numbers of all labels holding 'rec'. */
static void
print_label_numbers(char *prefix, cksum_record_t *rec)
{
	printf("%s", prefix);
	for (int i = 0; i < VDEV_LABELS; i++)
		if (rec->labels[i] == B_TRUE)
			printf("%d ", i);
	printf("\n");
}

#define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT)

/*
 * Per-label state gathered by dump_label(): the raw on-disk label, its
 * unpacked config nvlist, and the dedup records for config/uberblocks.
 */
typedef struct zdb_label {
	vdev_label_t label;
	nvlist_t *config_nv;
	cksum_record_t *config;
	cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT];
	boolean_t header_printed;	/* banner already emitted */
	boolean_t read_failed;		/* pread of this label failed */
} zdb_label_t;

/*
 * Print the "LABEL n" banner once per label; suppressed by -q and on
 * repeat calls (header_printed latches).
 */
static void
print_label_header(zdb_label_t *label, int l)
{

	if (dump_opt['q'])
		return;

	if (label->header_printed == B_TRUE)
		return;

	(void) printf("------------------------------------\n");
	(void) printf("LABEL %d\n", l);
	(void) printf("------------------------------------\n");

	label->header_printed = B_TRUE;
}

/* Banner for the L2ARC device header section. */
static void
print_l2arc_header(void)
{
	(void) printf("------------------------------------\n");
	(void) printf("L2ARC device header\n");
	(void) printf("------------------------------------\n");
}

/* Banner for the L2ARC log blocks section. */
static void
print_l2arc_log_blocks(void)
{
	(void) printf("------------------------------------\n");
	(void) printf("L2ARC device log blocks\n");
	(void) printf("------------------------------------\n");
}

/*
 * Dump the individual entries of L2ARC log block 'i' (1-based in output);
 * 'log_entries' is the entry count from the device header.
 */
static void
dump_l2arc_log_entries(uint64_t log_entries,
    l2arc_log_ent_phys_t *le, uint64_t i)
{

	/* One stanza per log entry: DVA fields then the packed le_prop bits. */
	for (int j = 0; j < log_entries; j++) {
		dva_t dva = le[j].le_dva;
		(void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, "
		    "vdev: %llu, offset: %llu\n",
		    (u_longlong_t)i, j + 1,
		    (u_longlong_t)DVA_GET_ASIZE(&dva),
		    (u_longlong_t)DVA_GET_VDEV(&dva),
		    (u_longlong_t)DVA_GET_OFFSET(&dva));
		(void) printf("|\t\t\t\tbirth: %llu\n",
		    (u_longlong_t)le[j].le_birth);
		(void) printf("|\t\t\t\tlsize: %llu\n",
		    (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop));
		(void) printf("|\t\t\t\tpsize: %llu\n",
		    (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop));
		(void) printf("|\t\t\t\tcompr: %llu\n",
		    (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop));
		(void) printf("|\t\t\t\tcomplevel: %llu\n",
		    (u_longlong_t)(&le[j])->le_complevel);
		(void) printf("|\t\t\t\ttype: %llu\n",
		    (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop));
		(void) printf("|\t\t\t\tprotected: %llu\n",
		    (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop));
		(void) printf("|\t\t\t\tprefetch: %llu\n",
		    (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop));
		(void) printf("|\t\t\t\taddress: %llu\n",
		    (u_longlong_t)le[j].le_daddr);
		(void) printf("|\n");
	}
	(void) printf("\n");
}

/*
 * Dump one L2ARC log block pointer: its device address, payload
 * accounting, and the size/compression/checksum bits packed in lbp_prop.
 */
static void
dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps)
{
	(void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps.lbp_daddr);
	(void) printf("|\t\tpayload_asize: %llu\n",
	    (u_longlong_t)lbps.lbp_payload_asize);
	(void) printf("|\t\tpayload_start: %llu\n",
	    (u_longlong_t)lbps.lbp_payload_start);
	(void) printf("|\t\tlsize: %llu\n",
	    (u_longlong_t)L2BLK_GET_LSIZE((&lbps)->lbp_prop));
	(void) printf("|\t\tasize: %llu\n",
	    (u_longlong_t)L2BLK_GET_PSIZE((&lbps)->lbp_prop));
	(void) printf("|\t\tcompralgo: %llu\n",
	    (u_longlong_t)L2BLK_GET_COMPRESS((&lbps)->lbp_prop));
	(void) printf("|\t\tcksumalgo: %llu\n",
	    (u_longlong_t)L2BLK_GET_CHECKSUM((&lbps)->lbp_prop));
	(void) printf("|\n\n");
}

/*
 * Walk the chain of L2ARC log blocks on device 'fd', verifying each
 * block's checksum and magic, and accumulate the count/asize of valid
 * blocks into 'rebuild' (mirrors what l2arc_rebuild() would recover).
 */
static void
dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr,
    l2arc_dev_hdr_phys_t *rebuild)
{
	l2arc_log_blk_phys_t this_lb;
	uint64_t asize;
	l2arc_log_blkptr_t lbps[2];	/* current and previous block ptrs */
	abd_t *abd;
	zio_cksum_t cksum;
	int failed = 0;			/* blocks with bad checksum */
	l2arc_dev_t dev;		/* minimal dev state for validity checks */

	if (!dump_opt['q'])
		print_l2arc_log_blocks();
	bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps));

	dev.l2ad_evict = l2dhdr.dh_evict;
	dev.l2ad_start = l2dhdr.dh_start;
	dev.l2ad_end = l2dhdr.dh_end;

	if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) {
		/* no log blocks to read */
		if (!dump_opt['q']) {
			(void) printf("No log blocks to read\n");
			(void) printf("\n");
		}
		return;
	} else {
		dev.l2ad_hand = lbps[0].lbp_daddr +
		    L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
	}

	dev.l2ad_first = !!(l2dhdr.dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);

	/* Follow lb_prev_lbp links backwards until an invalid pointer. */
	for (;;) {
		if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))
			break;

		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
		asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
		if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) {
			if (!dump_opt['q']) {
				(void) printf("Error while reading next log "
				    "block\n\n");
			}
			break;
		}

		/* Verify the on-disk checksum before trusting the contents. */
		fletcher_4_native_varsize(&this_lb, asize, &cksum);
		if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {
			failed++;
			if (!dump_opt['q']) {
				(void) printf("Invalid cksum\n");
				dump_l2arc_log_blkptr(lbps[0]);
			}
			break;
		}

		/* Decompress in place unless the block is stored raw. */
		switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
		case ZIO_COMPRESS_OFF:
			break;
		default:
			abd = abd_alloc_for_io(asize, B_TRUE);
			abd_copy_from_buf_off(abd, &this_lb, 0, asize);
			zio_decompress_data(L2BLK_GET_COMPRESS(
			    (&lbps[0])->lbp_prop), abd, &this_lb,
			    asize, sizeof (this_lb), NULL);
			abd_free(abd);
			break;
		}

		/* Handle a block written with the opposite byte order. */
		if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
			byteswap_uint64_array(&this_lb, sizeof (this_lb));
		if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {
			if (!dump_opt['q'])
				(void) printf("Invalid log block magic\n\n");
			break;
		}

		rebuild->dh_lb_count++;
		rebuild->dh_lb_asize += asize;
		if (dump_opt['l'] > 1 && !dump_opt['q']) {
			(void) printf("lb[%4llu]\tmagic: %llu\n",
			    (u_longlong_t)rebuild->dh_lb_count,
			    (u_longlong_t)this_lb.lb_magic);
			dump_l2arc_log_blkptr(lbps[0]);
		}

		if (dump_opt['l'] > 2 && !dump_opt['q'])
			dump_l2arc_log_entries(l2dhdr.dh_log_entries,
			    this_lb.lb_entries,
			    rebuild->dh_lb_count);

		/* Stop once we would cross into the evicted region. */
		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
		    lbps[0].lbp_payload_start, dev.l2ad_evict) &&
		    !dev.l2ad_first)
			break;

		lbps[0] = lbps[1];
		lbps[1] = this_lb.lb_prev_lbp;
	}

	if (!dump_opt['q']) {
		(void) printf("log_blk_count:\t %llu with valid cksum\n",
		    (u_longlong_t)rebuild->dh_lb_count);
		(void) printf("\t\t %d with invalid cksum\n", failed);
		(void) printf("log_blk_asize:\t %llu\n\n",
		    (u_longlong_t)rebuild->dh_lb_asize);
	}
}

/*
 * Read and dump the L2ARC device header located after the vdev labels,
 * then walk its log blocks.  Returns non-zero only if the header's
 * accounting exceeds what the log-block walk could verify (a leak).
 */
static int
dump_l2arc_header(int fd)
{
	l2arc_dev_hdr_phys_t l2dhdr, rebuild;
	int error = B_FALSE;

	bzero(&l2dhdr, sizeof (l2dhdr));
	bzero(&rebuild, sizeof (rebuild));

	if (pread64(fd, &l2dhdr, sizeof (l2dhdr),
	    VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) {
		error = B_TRUE;
	} else {
		/* Accept a byte-swapped header from a foreign-endian writer. */
		if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
			byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr));

		if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC)
			error = B_TRUE;
	}

	if (error) {
		(void) printf("L2ARC device header not found\n\n");
		/* Do not return an error here for backward compatibility */
		return (0);
	} else if (!dump_opt['q']) {
		print_l2arc_header();

		(void) printf(" magic: %llu\n",
		    (u_longlong_t)l2dhdr.dh_magic);
		(void) printf(" version: %llu\n",
		    (u_longlong_t)l2dhdr.dh_version);
		(void) printf(" pool_guid: %llu\n",
		    (u_longlong_t)l2dhdr.dh_spa_guid);
		(void) printf(" flags: %llu\n",
		    (u_longlong_t)l2dhdr.dh_flags);
		(void) printf(" start_lbps[0]: %llu\n",
		    (u_longlong_t)
		    l2dhdr.dh_start_lbps[0].lbp_daddr);
		(void) printf(" start_lbps[1]: %llu\n",
		    (u_longlong_t)
		    l2dhdr.dh_start_lbps[1].lbp_daddr);
		(void) printf(" log_blk_ent: %llu\n",
		    (u_longlong_t)l2dhdr.dh_log_entries);
		(void) printf(" start: %llu\n",
		    (u_longlong_t)l2dhdr.dh_start);
		(void) printf(" end: %llu\n",
		    (u_longlong_t)l2dhdr.dh_end);
		(void) printf(" evict: %llu\n",
		    (u_longlong_t)l2dhdr.dh_evict);
		(void) printf(" lb_asize_refcount: %llu\n",
		    (u_longlong_t)l2dhdr.dh_lb_asize);
		(void) printf(" lb_count_refcount: %llu\n",
		    (u_longlong_t)l2dhdr.dh_lb_count);
		(void) printf(" trim_action_time: %llu\n",
		    (u_longlong_t)l2dhdr.dh_trim_action_time);
		(void) printf(" trim_state: %llu\n\n",
		    (u_longlong_t)l2dhdr.dh_trim_state);
	}

	dump_l2arc_log_blocks(fd, l2dhdr, &rebuild);
	/*
	 * The total aligned size of log blocks and the number of log blocks
	 * reported in the header of the device may be less than what zdb
	 * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild().
	 * This happens because dump_l2arc_log_blocks() lacks the memory
	 * pressure valve that l2arc_rebuild() has. Thus, if we are on a system
	 * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize
	 * and dh_lb_count will be lower to begin with than what exists on the
	 * device. This is normal and zdb should not exit with an error. The
	 * opposite case should never happen though, the values reported in the
	 * header should never be higher than what dump_l2arc_log_blocks() and
	 * l2arc_rebuild() report. If this happens there is a leak in the
	 * accounting of log blocks.
+ */ + if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || + l2dhdr.dh_lb_count > rebuild.dh_lb_count) + return (1); + + return (0); +} + +static void +dump_config_from_label(zdb_label_t *label, size_t buflen, int l) +{ + if (dump_opt['q']) + return; + + if ((dump_opt['l'] < 3) && (first_label(label->config) != l)) + return; + + print_label_header(label, l); + dump_nvlist(label->config_nv, 4); + print_label_numbers(" labels = ", label->config); + + if (dump_opt['l'] >= 2) + dump_nvlist_stats(label->config_nv, buflen); +} + +#define ZDB_MAX_UB_HEADER_SIZE 32 + +static void +dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) +{ + + vdev_t vd; + char header[ZDB_MAX_UB_HEADER_SIZE]; + + vd.vdev_ashift = ashift; + vd.vdev_top = &vd; + + for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { + uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); + uberblock_t *ub = (void *)((char *)&label->label + uoff); + cksum_record_t *rec = label->uberblocks[i]; + + if (rec == NULL) { + if (dump_opt['u'] >= 2) { + print_label_header(label, label_num); + (void) printf(" Uberblock[%d] invalid\n", i); + } + continue; + } + + if ((dump_opt['u'] < 3) && (first_label(rec) != label_num)) + continue; + + if ((dump_opt['u'] < 4) && + (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && + (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) + continue; + + print_label_header(label, label_num); + (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, + " Uberblock[%d]\n", i); + dump_uberblock(ub, header, ""); + print_label_numbers(" labels = ", rec); + } +} + +static char curpath[PATH_MAX]; + +/* + * Iterate through the path components, recursively passing + * current one's obj and remaining path until we find the obj + * for the last one. 
+ */ +static int +dump_path_impl(objset_t *os, uint64_t obj, char *name) +{ + int err; + boolean_t header = B_TRUE; + uint64_t child_obj; + char *s; + dmu_buf_t *db; + dmu_object_info_t doi; + + if ((s = strchr(name, '/')) != NULL) + *s = '\0'; + err = zap_lookup(os, obj, name, 8, 1, &child_obj); + + (void) strlcat(curpath, name, sizeof (curpath)); + + if (err != 0) { + (void) fprintf(stderr, "failed to lookup %s: %s\n", + curpath, strerror(err)); + return (err); + } + + child_obj = ZFS_DIRENT_OBJ(child_obj); + err = sa_buf_hold(os, child_obj, FTAG, &db); + if (err != 0) { + (void) fprintf(stderr, + "failed to get SA dbuf for obj %llu: %s\n", + (u_longlong_t)child_obj, strerror(err)); + return (EINVAL); + } + dmu_object_info_from_db(db, &doi); + sa_buf_rele(db, FTAG); + + if (doi.doi_bonus_type != DMU_OT_SA && + doi.doi_bonus_type != DMU_OT_ZNODE) { + (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", + doi.doi_bonus_type, (u_longlong_t)child_obj); + return (EINVAL); + } + + if (dump_opt['v'] > 6) { + (void) printf("obj=%llu %s type=%d bonustype=%d\n", + (u_longlong_t)child_obj, curpath, doi.doi_type, + doi.doi_bonus_type); + } + + (void) strlcat(curpath, "/", sizeof (curpath)); + + switch (doi.doi_type) { + case DMU_OT_DIRECTORY_CONTENTS: + if (s != NULL && *(s + 1) != '\0') + return (dump_path_impl(os, child_obj, s + 1)); + /*FALLTHROUGH*/ + case DMU_OT_PLAIN_FILE_CONTENTS: + dump_object(os, child_obj, dump_opt['v'], &header, NULL, 0); + return (0); + default: + (void) fprintf(stderr, "object %llu has non-file/directory " + "type %d\n", (u_longlong_t)obj, doi.doi_type); + break; + } + + return (EINVAL); +} + +/* + * Dump the blocks for the object specified by path inside the dataset. 
+ */ +static int +dump_path(char *ds, char *path) +{ + int err; + objset_t *os; + uint64_t root_obj; + + err = open_objset(ds, FTAG, &os); + if (err != 0) + return (err); + + err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); + if (err != 0) { + (void) fprintf(stderr, "can't lookup root znode: %s\n", + strerror(err)); + close_objset(os, FTAG); + return (EINVAL); + } + + (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); + + err = dump_path_impl(os, root_obj, path); + + close_objset(os, FTAG); + return (err); +} + +static int +dump_label(const char *dev) +{ + char path[MAXPATHLEN]; + zdb_label_t labels[VDEV_LABELS]; + uint64_t psize, ashift, l2cache; + struct stat64 statbuf; + boolean_t config_found = B_FALSE; + boolean_t error = B_FALSE; + boolean_t read_l2arc_header = B_FALSE; + avl_tree_t config_tree; + avl_tree_t uberblock_tree; + void *node, *cookie; + int fd; + + bzero(labels, sizeof (labels)); + + /* + * Check if we were given absolute path and use it as is. + * Otherwise if the provided vdev name doesn't point to a file, + * try prepending expected disk paths and partition numbers. 
+ */ + (void) strlcpy(path, dev, sizeof (path)); + if (dev[0] != '/' && stat64(path, &statbuf) != 0) { + int error; + + error = zfs_resolve_shortname(dev, path, MAXPATHLEN); + if (error == 0 && zfs_dev_is_whole_disk(path)) { + if (zfs_append_partition(path, MAXPATHLEN) == -1) + error = ENOENT; + } + + if (error || (stat64(path, &statbuf) != 0)) { + (void) printf("failed to find device %s, try " + "specifying absolute path instead\n", dev); + return (1); + } + } + + if ((fd = open64(path, O_RDONLY)) < 0) { + (void) printf("cannot open '%s': %s\n", path, strerror(errno)); + exit(1); + } + + if (fstat64_blk(fd, &statbuf) != 0) { + (void) printf("failed to stat '%s': %s\n", path, + strerror(errno)); + (void) close(fd); + exit(1); + } + + if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0) + (void) printf("failed to invalidate cache '%s' : %s\n", path, + strerror(errno)); + + avl_create(&config_tree, cksum_record_compare, + sizeof (cksum_record_t), offsetof(cksum_record_t, link)); + avl_create(&uberblock_tree, cksum_record_compare, + sizeof (cksum_record_t), offsetof(cksum_record_t, link)); + + psize = statbuf.st_size; + psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + ashift = SPA_MINBLOCKSHIFT; + + /* + * 1. Read the label from disk + * 2. Unpack the configuration and insert in config tree. + * 3. Traverse all uberblocks and insert in uberblock tree. 
+ */ + for (int l = 0; l < VDEV_LABELS; l++) { + zdb_label_t *label = &labels[l]; + char *buf = label->label.vl_vdev_phys.vp_nvlist; + size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); + nvlist_t *config; + cksum_record_t *rec; + zio_cksum_t cksum; + vdev_t vd; + + if (pread64(fd, &label->label, sizeof (label->label), + vdev_label_offset(psize, l, 0)) != sizeof (label->label)) { + if (!dump_opt['q']) + (void) printf("failed to read label %d\n", l); + label->read_failed = B_TRUE; + error = B_TRUE; + continue; + } + + label->read_failed = B_FALSE; + + if (nvlist_unpack(buf, buflen, &config, 0) == 0) { + nvlist_t *vdev_tree = NULL; + size_t size; + + if ((nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || + (nvlist_lookup_uint64(vdev_tree, + ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) + ashift = SPA_MINBLOCKSHIFT; + + if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) + size = buflen; + + /* If the device is a cache device clear the header. */ + if (!read_l2arc_header) { + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && + l2cache == POOL_STATE_L2CACHE) { + read_l2arc_header = B_TRUE; + } + } + + fletcher_4_native_varsize(buf, size, &cksum); + rec = cksum_record_insert(&config_tree, &cksum, l); + + label->config = rec; + label->config_nv = config; + config_found = B_TRUE; + } else { + error = B_TRUE; + } + + vd.vdev_ashift = ashift; + vd.vdev_top = &vd; + + for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { + uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); + uberblock_t *ub = (void *)((char *)label + uoff); + + if (uberblock_verify(ub)) + continue; + + fletcher_4_native_varsize(ub, sizeof (*ub), &cksum); + rec = cksum_record_insert(&uberblock_tree, &cksum, l); + + label->uberblocks[i] = rec; + } + } + + /* + * Dump the label and uberblocks. 
+ */ + for (int l = 0; l < VDEV_LABELS; l++) { + zdb_label_t *label = &labels[l]; + size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); + + if (label->read_failed == B_TRUE) + continue; + + if (label->config_nv) { + dump_config_from_label(label, buflen, l); + } else { + if (!dump_opt['q']) + (void) printf("failed to unpack label %d\n", l); + } + + if (dump_opt['u']) + dump_label_uberblocks(label, ashift, l); + + nvlist_free(label->config_nv); + } + + /* + * Dump the L2ARC header, if existent. + */ + if (read_l2arc_header) + error |= dump_l2arc_header(fd); + + cookie = NULL; + while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) + umem_free(node, sizeof (cksum_record_t)); + + cookie = NULL; + while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL) + umem_free(node, sizeof (cksum_record_t)); + + avl_destroy(&config_tree); + avl_destroy(&uberblock_tree); + + (void) close(fd); + + return (config_found == B_FALSE ? 2 : + (error == B_TRUE ? 1 : 0)); +} + +static uint64_t dataset_feature_count[SPA_FEATURES]; +static uint64_t global_feature_count[SPA_FEATURES]; +static uint64_t remap_deadlist_count = 0; + +/*ARGSUSED*/ +static int +dump_one_objset(const char *dsname, void *arg) +{ + int error; + objset_t *os; + spa_feature_t f; + + error = open_objset(dsname, FTAG, &os); + if (error != 0) + return (0); + + for (f = 0; f < SPA_FEATURES; f++) { + if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f)) + continue; + ASSERT(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET); + dataset_feature_count[f]++; + } + + if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { + remap_deadlist_count++; + } + + for (dsl_bookmark_node_t *dbn = + avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; + dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { + mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); + if (dbn->dbn_phys.zbm_redaction_obj != 0) + global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++; + if 
(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) + global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; + } + + if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && + !dmu_objset_is_snapshot(os)) { + global_feature_count[SPA_FEATURE_LIVELIST]++; + } + + dump_objset(os); + close_objset(os, FTAG); + fuid_table_destroy(); + return (0); +} + +/* + * Block statistics. + */ +#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) +typedef struct zdb_blkstats { + uint64_t zb_asize; + uint64_t zb_lsize; + uint64_t zb_psize; + uint64_t zb_count; + uint64_t zb_gangs; + uint64_t zb_ditto_samevdev; + uint64_t zb_ditto_same_ms; + uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; +} zdb_blkstats_t; + +/* + * Extended object types to report deferred frees and dedup auto-ditto blocks. + */ +#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) +#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) +#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) +#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) + +static const char *zdb_ot_extname[] = { + "deferred free", + "dedup ditto", + "other", + "Total", +}; + +#define ZB_TOTAL DN_MAX_LEVELS +#define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) + +typedef struct zdb_cb { + zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; + uint64_t zcb_removing_size; + uint64_t zcb_checkpoint_size; + uint64_t zcb_dedup_asize; + uint64_t zcb_dedup_blocks; + uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_psize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_lsize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_asize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_psize_total; + uint64_t zcb_lsize_total; + uint64_t zcb_asize_total; + uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; + uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] + [BPE_PAYLOAD_SIZE + 1]; + uint64_t zcb_start; + hrtime_t zcb_lastprint; + uint64_t zcb_totalasize; + uint64_t zcb_errors[256]; + int zcb_readfails; + int 
zcb_haderrors; + spa_t *zcb_spa; + uint32_t **zcb_vd_obsolete_counts; +} zdb_cb_t; + +/* test if two DVA offsets from same vdev are within the same metaslab */ +static boolean_t +same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) +{ + vdev_t *vd = vdev_lookup_top(spa, vdev); + uint64_t ms_shift = vd->vdev_ms_shift; + + return ((off1 >> ms_shift) == (off2 >> ms_shift)); +} + +/* + * Used to simplify reporting of the histogram data. + */ +typedef struct one_histo { + char *name; + uint64_t *count; + uint64_t *len; + uint64_t cumulative; +} one_histo_t; + +/* + * The number of separate histograms processed for psize, lsize and asize. + */ +#define NUM_HISTO 3 + +/* + * This routine will create a fixed column size output of three different + * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M + * the count, length and cumulative length of the psize, lsize and + * asize blocks. + * + * All three types of blocks are listed on a single line + * + * By default the table is printed in nicenumber format (e.g. 123K) but + * if the '-P' parameter is specified then the full raw number (parseable) + * is printed out. + */ +static void +dump_size_histograms(zdb_cb_t *zcb) +{ + /* + * A temporary buffer that allows us to convert a number into + * a string using zdb_nicenumber to allow either raw or human + * readable numbers to be output. + */ + char numbuf[32]; + + /* + * Define titles which are used in the headers of the tables + * printed by this routine. + */ + const char blocksize_title1[] = "block"; + const char blocksize_title2[] = "size"; + const char count_title[] = "Count"; + const char length_title[] = "Size"; + const char cumulative_title[] = "Cum."; + + /* + * Setup the histogram arrays (psize, lsize, and asize). 
+ */ + one_histo_t parm_histo[NUM_HISTO]; + + parm_histo[0].name = "psize"; + parm_histo[0].count = zcb->zcb_psize_count; + parm_histo[0].len = zcb->zcb_psize_len; + parm_histo[0].cumulative = 0; + + parm_histo[1].name = "lsize"; + parm_histo[1].count = zcb->zcb_lsize_count; + parm_histo[1].len = zcb->zcb_lsize_len; + parm_histo[1].cumulative = 0; + + parm_histo[2].name = "asize"; + parm_histo[2].count = zcb->zcb_asize_count; + parm_histo[2].len = zcb->zcb_asize_len; + parm_histo[2].cumulative = 0; + + + (void) printf("\nBlock Size Histogram\n"); + /* + * Print the first line titles + */ + if (dump_opt['P']) + (void) printf("\n%s\t", blocksize_title1); + else + (void) printf("\n%7s ", blocksize_title1); + + for (int j = 0; j < NUM_HISTO; j++) { + if (dump_opt['P']) { + if (j < NUM_HISTO - 1) { + (void) printf("%s\t\t\t", parm_histo[j].name); + } else { + /* Don't print trailing spaces */ + (void) printf(" %s", parm_histo[j].name); + } + } else { + if (j < NUM_HISTO - 1) { + /* Left aligned strings in the output */ + (void) printf("%-7s ", + parm_histo[j].name); + } else { + /* Don't print trailing spaces */ + (void) printf("%s", parm_histo[j].name); + } + } + } + (void) printf("\n"); + + /* + * Print the second line titles + */ + if (dump_opt['P']) { + (void) printf("%s\t", blocksize_title2); + } else { + (void) printf("%7s ", blocksize_title2); + } + + for (int i = 0; i < NUM_HISTO; i++) { + if (dump_opt['P']) { + (void) printf("%s\t%s\t%s\t", + count_title, length_title, cumulative_title); + } else { + (void) printf("%7s%7s%7s", + count_title, length_title, cumulative_title); + } + } + (void) printf("\n"); + + /* + * Print the rows + */ + for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) { + + /* + * Print the first column showing the blocksize + */ + zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf)); + + if (dump_opt['P']) { + printf("%s", numbuf); + } else { + printf("%7s:", numbuf); + } + + /* + * Print the remaining set of 3 columns per size: + * for 
psize, lsize and asize + */ + for (int j = 0; j < NUM_HISTO; j++) { + parm_histo[j].cumulative += parm_histo[j].len[i]; + + zdb_nicenum(parm_histo[j].count[i], + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + + zdb_nicenum(parm_histo[j].len[i], + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + + zdb_nicenum(parm_histo[j].cumulative, + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + } + (void) printf("\n"); + } +} + +static void +zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, + dmu_object_type_t type) +{ + uint64_t refcnt = 0; + int i; + + ASSERT(type < ZDB_OT_TOTAL); + + if (zilog && zil_bp_tree_add(zilog, bp) != 0) + return; + + spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); + + for (i = 0; i < 4; i++) { + int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; + int t = (i & 1) ? type : ZDB_OT_TOTAL; + int equal; + zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; + + zb->zb_asize += BP_GET_ASIZE(bp); + zb->zb_lsize += BP_GET_LSIZE(bp); + zb->zb_psize += BP_GET_PSIZE(bp); + zb->zb_count++; + + /* + * The histogram is only big enough to record blocks up to + * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, + * "other", bucket. 
+ */ + unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; + idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); + zb->zb_psize_histogram[idx]++; + + zb->zb_gangs += BP_COUNT_GANG(bp); + + switch (BP_GET_NDVAS(bp)) { + case 2: + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) { + zb->zb_ditto_samevdev++; + + if (same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[1]))) + zb->zb_ditto_same_ms++; + } + break; + case 3: + equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + + (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2])) + + (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2])); + if (equal != 0) { + zb->zb_ditto_samevdev++; + + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[1]))) + zb->zb_ditto_same_ms++; + else if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[2]))) + zb->zb_ditto_same_ms++; + else if (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[1]), + DVA_GET_OFFSET(&bp->blk_dva[1]), + DVA_GET_OFFSET(&bp->blk_dva[2]))) + zb->zb_ditto_same_ms++; + } + break; + } + } + + spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); + + if (BP_IS_EMBEDDED(bp)) { + zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; + zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] + [BPE_GET_PSIZE(bp)]++; + return; + } + /* + * The binning histogram bins by powers of two up to + * SPA_MAXBLOCKSIZE rather than creating bins for + * every possible blocksize found in the pool. 
+ */ + int bin = highbit64(BP_GET_PSIZE(bp)) - 1; + + zcb->zcb_psize_count[bin]++; + zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp); + zcb->zcb_psize_total += BP_GET_PSIZE(bp); + + bin = highbit64(BP_GET_LSIZE(bp)) - 1; + + zcb->zcb_lsize_count[bin]++; + zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp); + zcb->zcb_lsize_total += BP_GET_LSIZE(bp); + + bin = highbit64(BP_GET_ASIZE(bp)) - 1; + + zcb->zcb_asize_count[bin]++; + zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); + zcb->zcb_asize_total += BP_GET_ASIZE(bp); + + if (dump_opt['L']) + return; + + if (BP_GET_DEDUP(bp)) { + ddt_t *ddt; + ddt_entry_t *dde; + + ddt = ddt_select(zcb->zcb_spa, bp); + ddt_enter(ddt); + dde = ddt_lookup(ddt, bp, B_FALSE); + + if (dde == NULL) { + refcnt = 0; + } else { + ddt_phys_t *ddp = ddt_phys_select(dde, bp); + ddt_phys_decref(ddp); + refcnt = ddp->ddp_refcnt; + if (ddt_phys_total_refcnt(dde) == 0) + ddt_remove(ddt, dde); + } + ddt_exit(ddt); + } + + VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, + refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa), + bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); +} + +static void +zdb_blkptr_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + int ioerr = zio->io_error; + zdb_cb_t *zcb = zio->io_private; + zbookmark_phys_t *zb = &zio->io_bookmark; + + abd_free(zio->io_abd); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); + cv_broadcast(&spa->spa_scrub_io_cv); + + if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + char blkbuf[BP_SPRINTF_LEN]; + + zcb->zcb_haderrors = 1; + zcb->zcb_errors[ioerr]++; + + if (dump_opt['b'] >= 2) + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + else + blkbuf[0] = '\0'; + + (void) printf("zdb_blkptr_cb: " + "Got error %d reading " + "<%llu, %llu, %lld, %llx> %s -- skipping\n", + ioerr, + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, + (u_longlong_t)zb->zb_blkid, + blkbuf); + } + mutex_exit(&spa->spa_scrub_lock); +} + 
+static int +zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + zdb_cb_t *zcb = arg; + dmu_object_type_t type; + boolean_t is_metadata; + + if (zb->zb_level == ZB_DNODE_LEVEL) + return (0); + + if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("objset %llu object %llu " + "level %lld offset 0x%llx %s\n", + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (u_longlong_t)blkid2offset(dnp, bp, zb), + blkbuf); + } + + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) + return (0); + + type = BP_GET_TYPE(bp); + + zdb_count_block(zcb, zilog, bp, + (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); + + is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); + + if (!BP_IS_EMBEDDED(bp) && + (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { + size_t size = BP_GET_PSIZE(bp); + abd_t *abd = abd_alloc(size, B_FALSE); + int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; + + /* If it's an intent log block, failure is expected. 
*/ + if (zb->zb_level == ZB_ZIL_LEVEL) + flags |= ZIO_FLAG_SPECULATIVE; + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_load_verify_bytes > max_inflight_bytes) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_load_verify_bytes += size; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(NULL, spa, bp, abd, size, + zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); + } + + zcb->zcb_readfails = 0; + + /* only call gethrtime() every 100 blocks */ + static int iters; + if (++iters > 100) + iters = 0; + else + return (0); + + if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { + uint64_t now = gethrtime(); + char buf[10]; + uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; + int kb_per_sec = + 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); + int sec_remaining = + (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ); + + zfs_nicebytes(bytes, buf, sizeof (buf)); + (void) fprintf(stderr, + "\r%5s completed (%4dMB/s) " + "estimated time remaining: %uhr %02umin %02usec ", + buf, kb_per_sec / 1024, + sec_remaining / 60 / 60, + sec_remaining / 60 % 60, + sec_remaining % 60); + + zcb->zcb_lastprint = now; + } + + return (0); +} + +static void +zdb_leak(void *arg, uint64_t start, uint64_t size) +{ + vdev_t *vd = arg; + + (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", + (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); +} + +static metaslab_ops_t zdb_metaslab_ops = { + NULL /* alloc */ +}; + +/* ARGSUSED */ +static int +load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + spa_vdev_removal_t *svr = arg; + + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + + /* skip vdevs we don't care about */ + if (sme->sme_vdev != svr->svr_vdev_id) + return (0); + + vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); + 
	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);

	/* entries flushed to the ms_sm were already loaded from it */
	if (txg < metaslab_unflushed_txg(ms))
		return (0);

	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	ASSERT(vim != NULL);
	/* only offsets covered by the (partial) indirect mapping matter */
	if (offset >= vdev_indirect_mapping_max_offset(vim))
		return (0);

	if (sme->sme_type == SM_ALLOC)
		range_tree_add(svr->svr_allocd_segs, offset, size);
	else
		range_tree_remove(svr->svr_allocd_segs, offset, size);

	return (0);
}

/*
 * Remap target callback: claim one concrete destination segment of an
 * indirect mapping so it is accounted as referenced.
 */
/* ARGSUSED */
static void
claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	/*
	 * This callback was called through a remap from
	 * a device being removed. Therefore, the vdev that
	 * this callback is applied to is a concrete
	 * vdev.
	 */
	ASSERT(vdev_is_concrete(vd));

	VERIFY0(metaslab_claim_impl(vd, offset, size,
	    spa_min_claim_txg(vd->vdev_spa)));
}

/*
 * range_tree vacate callback: remap a segment of the removing vdev onto
 * its concrete destination(s) and claim each of them.
 */
static void
claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
{
	vdev_t *vd = arg;

	vdev_indirect_ops.vdev_op_remap(vd, offset, size,
	    claim_segment_impl_cb, NULL);
}

/*
 * After accounting for all allocated blocks that are directly referenced,
 * we might have missed a reference to a block from a partially complete
 * (and thus unused) indirect mapping object. We perform a secondary pass
 * through the metaslabs we have already mapped and claim the destination
 * blocks.
 */
static void
zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
{
	/* with -L (no leak detection) nothing needs to be claimed */
	if (dump_opt['L'])
		return;

	if (spa->spa_vdev_removal == NULL)
		return;

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;

	ASSERT0(range_tree_space(svr->svr_allocd_segs));

	/*
	 * Gather allocated segments per metaslab, but only up to the
	 * portion of the removing vdev that has been mapped so far.
	 */
	range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
		metaslab_t *msp = vd->vdev_ms[msi];

		if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
			break;

		ASSERT0(range_tree_space(allocs));
		if (msp->ms_sm != NULL)
			VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC));
		range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs);
	}
	range_tree_destroy(allocs);

	/* fold in allocations that are only in the spacemap log so far */
	iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr);

	/*
	 * Clear everything past what has been synced,
	 * because we have not allocated mappings for
	 * it yet.
	 */
	range_tree_clear(svr->svr_allocd_segs,
	    vdev_indirect_mapping_max_offset(vim),
	    vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));

	zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs);
	range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);

	spa_config_exit(spa, SCL_CONFIG, FTAG);
}

/*
 * bpobj callback: bump the in-core obsolete count of the indirect mapping
 * entry covering this block pointer's (single) DVA.
 */
/* ARGSUSED */
static int
increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	zdb_cb_t *zcb = arg;
	spa_t *spa = zcb->zcb_spa;
	vdev_t *vd;
	const dva_t *dva = &bp->blk_dva[0];

	ASSERT(!bp_freed);
	ASSERT(!dump_opt['L']);
	ASSERT3U(BP_GET_NDVAS(bp), ==, 1);

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
	ASSERT3P(vd, !=, NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
	ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);

	vdev_indirect_mapping_increment_obsolete_count(
	    vd->vdev_indirect_mapping,
	    DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);

	return (0);
}

/*
 * Load the per-entry obsolete counts of an indirect vdev's mapping,
 * merging in the on-disk obsolete space map and, if a condense is in
 * progress on this vdev, the previous obsolete space map as well.
 * The returned array is freed by the caller
 * (vdev_indirect_mapping_free_obsolete_counts()).
 */
static uint32_t *
zdb_load_obsolete_counts(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint64_t obsolete_sm_object;
	uint32_t *counts;

	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
	counts = vdev_indirect_mapping_load_obsolete_counts(vim);
	if (vd->vdev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
		    vd->vdev_obsolete_sm);
	}
	if (scip->scip_vdev == vd->vdev_id &&
	    scip->scip_prev_obsolete_sm_object != 0) {
		space_map_t *prev_obsolete_sm = NULL;
		VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
		    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
		    prev_obsolete_sm);
		space_map_close(prev_obsolete_sm);
	}
	return (counts);
}

/*
 * Walk the dedup tables and account blocks with refcnt > 1, so the later
 * leak accounting knows how much space traversal will "see" only once.
 * Each entry is also pre-loaded into the in-core DDT so zdb_count_block()
 * can find it during traversal.
 */
static void
zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
	ddt_bookmark_t ddb;
	ddt_entry_t dde;
	int error;
	int p;

	ASSERT(!dump_opt['L']);

	bzero(&ddb, sizeof (ddb));
	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
		blkptr_t blk;
		ddt_phys_t *ddp = dde.dde_phys;

		/* unique entries are never duplicated; nothing more to do */
		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
			return;

		ASSERT(ddt_phys_total_refcnt(&dde) > 1);

		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0)
				continue;
			ddt_bp_create(ddb.ddb_checksum,
			    &dde.dde_key, ddp, &blk);
			if (p == DDT_PHYS_DITTO) {
				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
			} else {
				/* extra references beyond the first */
				zcb->zcb_dedup_asize +=
				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
				zcb->zcb_dedup_blocks++;
			}
		}
		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
		ddt_enter(ddt);
		VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
		ddt_exit(ddt);
	}

	ASSERT(error == ENOENT);
}

/* Argument bundle for checkpoint_sm_exclude_entry_cb(). */
typedef struct checkpoint_sm_exclude_entry_arg {
	vdev_t *cseea_vd;
	uint64_t cseea_checkpoint_size;
} checkpoint_sm_exclude_entry_arg_t;

static int
checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
{
	checkpoint_sm_exclude_entry_arg_t *cseea = arg;
	vdev_t *vd = cseea->cseea_vd;
	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
	uint64_t end = sme->sme_offset + sme->sme_run;

	ASSERT(sme->sme_type == SM_FREE);

	/*
	 * Since the vdev_checkpoint_sm exists in the vdev level
	 * and the ms_sm space maps exist in the metaslab level,
	 * an entry in the checkpoint space map could theoretically
	 * cross the boundaries of the metaslab that it belongs.
	 *
	 * In reality, because of the way that we populate and
	 * manipulate the checkpoint's space maps currently,
	 * there shouldn't be any entries that cross metaslabs.
	 * Hence the assertion below.
	 *
	 * That said, there is no fundamental requirement that
	 * the checkpoint's space map entries should not cross
	 * metaslab boundaries. So if needed we could add code
	 * that handles metaslab-crossing segments in the future.
	 */
	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);

	/*
	 * By removing the entry from the allocated segments we
	 * also verify that the entry is there to begin with.
	 */
	mutex_enter(&ms->ms_lock);
	range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
	mutex_exit(&ms->ms_lock);

	cseea->cseea_checkpoint_size += sme->sme_run;
	return (0);
}

/*
 * Exclude one top-level vdev's checkpointed (freed-but-held) segments from
 * its overloaded ms_allocatable trees and add their total size to
 * zcb_checkpoint_size, so the checkpoint doesn't show up as leaked space.
 */
static void
zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
{
	spa_t *spa = vd->vdev_spa;
	space_map_t *checkpoint_sm = NULL;
	uint64_t checkpoint_sm_obj;

	/*
	 * If there is no vdev_top_zap, we are in a pool whose
	 * version predates the pool checkpoint feature.
	 */
	if (vd->vdev_top_zap == 0)
		return;

	/*
	 * If there is no reference of the vdev_checkpoint_sm in
	 * the vdev_top_zap, then one of the following scenarios
	 * is true:
	 *
	 * 1] There is no checkpoint
	 * 2] There is a checkpoint, but no checkpointed blocks
	 *    have been freed yet
	 * 3] The current vdev is indirect
	 *
	 * In these cases we return immediately.
	 */
	if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
		return;

	VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
	    &checkpoint_sm_obj));

	checkpoint_sm_exclude_entry_arg_t cseea;
	cseea.cseea_vd = vd;
	cseea.cseea_checkpoint_size = 0;

	VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
	    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));

	VERIFY0(space_map_iterate(checkpoint_sm,
	    space_map_length(checkpoint_sm),
	    checkpoint_sm_exclude_entry_cb, &cseea));
	space_map_close(checkpoint_sm);

	zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
}

/* Apply the checkpoint exclusion above to every top-level vdev. */
static void
zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
{
	ASSERT(!dump_opt['L']);

	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
		zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
	}
}

/*
 * Spacemap-log callback: accumulate the net unflushed allocation delta
 * (allocs minus frees) into *arg (an int64_t).
 */
static int
count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
    uint64_t txg, void *arg)
{
	int64_t *ualloc_space = arg;

	uint64_t offset = sme->sme_offset;
	uint64_t vdev_id = sme->sme_vdev;

	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	if (!vdev_is_concrete(vd))
		return (0);

	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);

	/* entries flushed to the ms_sm are already counted there */
	if (txg < metaslab_unflushed_txg(ms))
		return (0);

	if (sme->sme_type == SM_ALLOC)
		*ualloc_space += sme->sme_run;
	else
		*ualloc_space -= sme->sme_run;

	return (0);
}

/*
 * Net amount of allocated space that exists only in the spacemap log
 * (may be negative).  Returns 0 when leak detection is disabled.
 */
static int64_t
get_unflushed_alloc_space(spa_t *spa)
{
	if (dump_opt['L'])
		return (0);

	int64_t ualloc_space = 0;
	iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
	    &ualloc_space);
	return (ualloc_space);
}

/*
 * Spacemap-log callback: apply an unflushed entry to its metaslab's
 * ms_allocatable tree; *arg is the maptype (SM_ALLOC or SM_FREE) whose
 * entries should be added (the opposite type is removed).
 */
static int
load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
{
	maptype_t *uic_maptype = arg;

	uint64_t offset = sme->sme_offset;
	uint64_t size = sme->sme_run;
	uint64_t vdev_id = sme->sme_vdev;

	vdev_t *vd = vdev_lookup_top(spa, vdev_id);

	/* skip indirect vdevs */
	if (!vdev_is_concrete(vd))
		return (0);

	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
	ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE);

	if (txg < metaslab_unflushed_txg(ms))
		return (0);

	if (*uic_maptype == sme->sme_type)
		range_tree_add(ms->ms_allocatable, offset, size);
	else
		range_tree_remove(ms->ms_allocatable, offset, size);

	return (0);
}

static void
load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)
{
	iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype);
}

/*
 * Load every concrete metaslab's ms_allocatable with segments of the given
 * maptype (SM_ALLOC for leak detection), from both the on-disk ms_sm and
 * the unflushed spacemap log.
 */
static void
load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
{
	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
		vdev_t *vd = rvd->vdev_child[i];

		ASSERT3U(i, ==, vd->vdev_id);

		if (vd->vdev_ops == &vdev_indirect_ops)
			continue;

		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			(void) fprintf(stderr,
			    "\rloading concrete vdev %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)vd->vdev_id,
			    (longlong_t)msp->ms_id,
			    (longlong_t)vd->vdev_ms_count);

			mutex_enter(&msp->ms_lock);
			range_tree_vacate(msp->ms_allocatable, NULL, NULL);

			/*
			 * We don't want to spend the CPU manipulating the
			 * size-ordered tree, so clear the range_tree ops.
			 */
			msp->ms_allocatable->rt_ops = NULL;

			if (msp->ms_sm != NULL) {
				VERIFY0(space_map_load(msp->ms_sm,
				    msp->ms_allocatable, maptype));
			}
			if (!msp->ms_loaded)
				msp->ms_loaded = B_TRUE;
			mutex_exit(&msp->ms_lock);
		}
	}

	load_unflushed_to_ms_allocatables(spa, maptype);
}

/*
 * vim_idxp is an in-out parameter which (for indirect vdevs) is the
 * index in vim_entries that has the first entry in this metaslab.
 * On return, it will be set to the first entry after this metaslab.
 */
static void
load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
    uint64_t *vim_idxp)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;

	mutex_enter(&msp->ms_lock);
	range_tree_vacate(msp->ms_allocatable, NULL, NULL);

	/*
	 * We don't want to spend the CPU manipulating the
	 * size-ordered tree, so clear the range_tree ops.
	 */
	msp->ms_allocatable->rt_ops = NULL;

	for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
	    (*vim_idxp)++) {
		vdev_indirect_mapping_entry_phys_t *vimep =
		    &vim->vim_entries[*vim_idxp];
		uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
		uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
		ASSERT3U(ent_offset, >=, msp->ms_start);
		if (ent_offset >= msp->ms_start + msp->ms_size)
			break;

		/*
		 * Mappings do not cross metaslab boundaries,
		 * because we create them by walking the metaslabs.
		 */
		ASSERT3U(ent_offset + ent_len, <=,
		    msp->ms_start + msp->ms_size);
		range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
	}

	if (!msp->ms_loaded)
		msp->ms_loaded = B_TRUE;
	mutex_exit(&msp->ms_lock);
}

/*
 * For each indirect vdev: load its obsolete counts, create metaslabs for
 * it (indirect vdevs normally have none), and fill their ms_allocatable
 * trees with the mapped (referenced) segments.
 */
static void
zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
{
	ASSERT(!dump_opt['L']);

	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		ASSERT3U(c, ==, vd->vdev_id);

		if (vd->vdev_ops != &vdev_indirect_ops)
			continue;

		/*
		 * Note: we don't check for mapping leaks on
		 * removing vdevs because their ms_allocatable's
		 * are used to look for leaks in allocated space.
		 */
		zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);

		/*
		 * Normally, indirect vdevs don't have any
		 * metaslabs. We want to set them up for
		 * zio_claim().
		 */
		VERIFY0(vdev_metaslab_init(vd, 0));

		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
		uint64_t vim_idx = 0;
		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {

			(void) fprintf(stderr,
			    "\rloading indirect vdev %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)vd->vdev_id,
			    (longlong_t)vd->vdev_ms[m]->ms_id,
			    (longlong_t)vd->vdev_ms_count);

			load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
			    &vim_idx);
		}
		/* every mapping entry must belong to some metaslab */
		ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
	}
}

/*
 * Prepare the leak-detection state: overload every ms_allocatable tree
 * with *allocated* segments so traversal can subtract what it references.
 */
static void
zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
	zcb->zcb_spa = spa;

	if (dump_opt['L'])
		return;

	dsl_pool_t *dp = spa->spa_dsl_pool;
	vdev_t *rvd = spa->spa_root_vdev;

	/*
	 * We are going to be changing the meaning of the metaslab's
	 * ms_allocatable. Ensure that the allocator doesn't try to
	 * use the tree.
+ */ + spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; + spa->spa_log_class->mc_ops = &zdb_metaslab_ops; + + zcb->zcb_vd_obsolete_counts = + umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), + UMEM_NOFAIL); + + /* + * For leak detection, we overload the ms_allocatable trees + * to contain allocated segments instead of free segments. + * As a result, we can't use the normal metaslab_load/unload + * interfaces. + */ + zdb_leak_init_prepare_indirect_vdevs(spa, zcb); + load_concrete_ms_allocatable_trees(spa, SM_ALLOC); + + /* + * On load_concrete_ms_allocatable_trees() we loaded all the + * allocated entries from the ms_sm to the ms_allocatable for + * each metaslab. If the pool has a checkpoint or is in the + * middle of discarding a checkpoint, some of these blocks + * may have been freed but their ms_sm may not have been + * updated because they are referenced by the checkpoint. In + * order to avoid false-positives during leak-detection, we + * go through the vdev's checkpoint space map and exclude all + * its entries from their relevant ms_allocatable. + * + * We also aggregate the space held by the checkpoint and add + * it to zcb_checkpoint_size. + * + * Note that at this point we are also verifying that all the + * entries on the checkpoint_sm are marked as allocated in + * the ms_sm of their relevant metaslab. 
+ * [see comment in checkpoint_sm_exclude_entry_cb()] + */ + zdb_leak_init_exclude_checkpoint(spa, zcb); + ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); + + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); + + if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_DEVICE_REMOVAL)); + (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, + increment_indirect_mapping_cb, zcb, NULL); + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + zdb_ddt_leak_init(spa, zcb); + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +static boolean_t +zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) +{ + boolean_t leaks = B_FALSE; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t total_leaked = 0; + boolean_t are_precise = B_FALSE; + + ASSERT(vim != NULL); + + for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { + vdev_indirect_mapping_entry_phys_t *vimep = + &vim->vim_entries[i]; + uint64_t obsolete_bytes = 0; + uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); + metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + /* + * This is not very efficient but it's easy to + * verify correctness. 
+ */ + for (uint64_t inner_offset = 0; + inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); + inner_offset += 1 << vd->vdev_ashift) { + if (range_tree_contains(msp->ms_allocatable, + offset + inner_offset, 1 << vd->vdev_ashift)) { + obsolete_bytes += 1 << vd->vdev_ashift; + } + } + + int64_t bytes_leaked = obsolete_bytes - + zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; + ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, + zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); + + VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); + if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) { + (void) printf("obsolete indirect mapping count " + "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), + (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), + (u_longlong_t)bytes_leaked); + } + total_leaked += ABS(bytes_leaked); + } + + VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); + if (!are_precise && total_leaked > 0) { + int pct_leaked = total_leaked * 100 / + vdev_indirect_mapping_bytes_mapped(vim); + (void) printf("cannot verify obsolete indirect mapping " + "counts of vdev %llu because precise feature was not " + "enabled when it was removed: %d%% (%llx bytes) of mapping" + "unreferenced\n", + (u_longlong_t)vd->vdev_id, pct_leaked, + (u_longlong_t)total_leaked); + } else if (total_leaked > 0) { + (void) printf("obsolete indirect mapping count mismatch " + "for vdev %llu -- %llx total bytes mismatched\n", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)total_leaked); + leaks |= B_TRUE; + } + + vdev_indirect_mapping_free_obsolete_counts(vim, + zcb->zcb_vd_obsolete_counts[vd->vdev_id]); + zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; + + return (leaks); +} + +static boolean_t +zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) +{ + if (dump_opt['L']) + return (B_FALSE); + + boolean_t leaks = B_FALSE; + vdev_t *rvd = spa->spa_root_vdev; + for (unsigned c = 0; c < rvd->vdev_children; c++) { + 
		vdev_t *vd = rvd->vdev_child[c];
		metaslab_group_t *mg __maybe_unused = vd->vdev_mg;

		if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
			leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
		}

		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];
			ASSERT3P(mg, ==, msp->ms_group);

			/*
			 * ms_allocatable has been overloaded
			 * to contain allocated segments. Now that
			 * we finished traversing all blocks, any
			 * block that remains in the ms_allocatable
			 * represents an allocated block that we
			 * did not claim during the traversal.
			 * Claimed blocks would have been removed
			 * from the ms_allocatable. For indirect
			 * vdevs, space remaining in the tree
			 * represents parts of the mapping that are
			 * not referenced, which is not a bug.
			 */
			if (vd->vdev_ops == &vdev_indirect_ops) {
				range_tree_vacate(msp->ms_allocatable,
				    NULL, NULL);
			} else {
				/* print each remaining segment as a leak */
				range_tree_vacate(msp->ms_allocatable,
				    zdb_leak, vd);
			}
			if (msp->ms_loaded) {
				msp->ms_loaded = B_FALSE;
			}
		}
	}

	umem_free(zcb->zcb_vd_obsolete_counts,
	    rvd->vdev_children * sizeof (uint32_t *));
	zcb->zcb_vd_obsolete_counts = NULL;

	return (leaks);
}

/*
 * Count one deferred-free block pointer; with -bbbbb also print it.
 */
/* ARGSUSED */
static int
count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	zdb_cb_t *zcb = arg;

	if (dump_opt['b'] >= 5) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
		(void) printf("[%s] %s\n",
		    "deferred free", blkbuf);
	}
	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
	return (0);
}

/*
 * Iterate over livelists which have been destroyed by the user but
 * are still present in the MOS, waiting to be freed
 */
static void
iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
{
	objset_t *mos = spa->spa_meta_objset;
	uint64_t zap_obj;
	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
	/* ENOENT simply means no deleted clones exist; nothing to iterate */
	if (err == ENOENT)
		return;
	ASSERT0(err);

	zap_cursor_t zc;
	zap_attribute_t attr;
	dsl_deadlist_t ll;
	/* NULL out os prior to dsl_deadlist_open in case it's garbage */
	ll.dl_os = NULL;
	for (zap_cursor_init(&zc, mos, zap_obj);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    (void) zap_cursor_advance(&zc)) {
		dsl_deadlist_open(&ll, mos, attr.za_first_integer);
		func(&ll, arg);
		dsl_deadlist_close(&ll);
	}
	zap_cursor_fini(&zc);
}

/* bpobj-flavored adapter around count_block_cb() (asserts never freed). */
static int
bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	ASSERT(!bp_freed);
	return (count_block_cb(arg, bp, tx));
}

/*
 * Deadlist-entry callback: count the blocks in one livelist entry that
 * are still allocated (alloc'd but never freed).
 */
static int
livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
{
	zdb_cb_t *zbc = args;
	bplist_t blks;
	bplist_create(&blks);
	/* determine which blocks have been alloc'd but not freed */
	VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL));
	/* count those blocks */
	(void) bplist_iterate(&blks, count_block_cb, zbc, NULL);
	bplist_destroy(&blks);
	return (0);
}

static void
livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
{
	dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg);
}

/*
 * Count the blocks in the livelists that have been destroyed by the user
 * but haven't yet been freed.
 */
static void
deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
{
	iterate_deleted_livelists(spa, livelist_count_blocks, zbc);
}

/*
 * Per-livelist dump callback: bump the feature count, print the list,
 * and run the lightweight sub-livelist verification.
 */
static void
dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
{
	ASSERT3P(arg, ==, NULL);
	global_feature_count[SPA_FEATURE_LIVELIST]++;
	dump_blkptr_list(ll, "Deleted Livelist");
	dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);
}

/*
 * Print out, register object references to, and increment feature counts for
 * livelists that have been destroyed by the user but haven't yet been freed.
+ */ +static void +deleted_livelists_dump_mos(spa_t *spa) +{ + uint64_t zap_obj; + objset_t *mos = spa->spa_meta_objset; + int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); + if (err == ENOENT) + return; + mos_obj_refd(zap_obj); + iterate_deleted_livelists(spa, dump_livelist_cb, NULL); +} + +static int +dump_block_stats(spa_t *spa) +{ + zdb_cb_t zcb; + zdb_blkstats_t *zb, *tzb; + uint64_t norm_alloc, norm_space, total_alloc, total_found; + int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | + TRAVERSE_NO_DECRYPT | TRAVERSE_HARD; + boolean_t leaks = B_FALSE; + int e, c, err; + bp_embedded_type_t i; + + bzero(&zcb, sizeof (zcb)); + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", + (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", + (dump_opt['c'] == 1) ? "metadata " : "", + dump_opt['c'] ? "checksums " : "", + (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", + !dump_opt['L'] ? "nothing leaked " : ""); + + /* + * When leak detection is enabled we load all space maps as SM_ALLOC + * maps, then traverse the pool claiming each block we discover. If + * the pool is perfectly consistent, the segment trees will be empty + * when we're done. Anything left over is a leak; any block we can't + * claim (because it's not part of any space map) is a double + * allocation, reference to a freed block, or an unclaimed log block. + * + * When leak detection is disabled (-L option) we still traverse the + * pool claiming each block we discover, but we skip opening any space + * maps. + */ + bzero(&zcb, sizeof (zdb_cb_t)); + zdb_leak_init(spa, &zcb); + + /* + * If there's a deferred-free bplist, process that first. 
+ */ + (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, + bpobj_count_block_cb, &zcb, NULL); + + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, + bpobj_count_block_cb, &zcb, NULL); + } + + zdb_claim_removing(spa, &zcb); + + if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { + VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, + spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, + &zcb, NULL)); + } + + deleted_livelists_count_blocks(spa, &zcb); + + if (dump_opt['c'] > 1) + flags |= TRAVERSE_PREFETCH_DATA; + + zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); + zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); + zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); + zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); + err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); + + /* + * If we've traversed the data blocks then we need to wait for those + * I/Os to complete. We leverage "The Godfather" zio to wait on + * all async I/Os to complete. + */ + if (dump_opt['c']) { + for (c = 0; c < max_ncpus; c++) { + (void) zio_wait(spa->spa_async_zio_root[c]); + spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } + } + ASSERT0(spa->spa_load_verify_bytes); + + /* + * Done after zio_wait() since zcb_haderrors is modified in + * zdb_blkptr_done() + */ + zcb.zcb_haderrors |= err; + + if (zcb.zcb_haderrors) { + (void) printf("\nError counts:\n\n"); + (void) printf("\t%5s %s\n", "errno", "count"); + for (e = 0; e < 256; e++) { + if (zcb.zcb_errors[e] != 0) { + (void) printf("\t%5d %llu\n", + e, (u_longlong_t)zcb.zcb_errors[e]); + } + } + } + + /* + * Report any leaked segments. 
+ */ + leaks |= zdb_leak_fini(spa, &zcb); + + tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; + + norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + norm_space = metaslab_class_get_space(spa_normal_class(spa)); + + total_alloc = norm_alloc + + metaslab_class_get_alloc(spa_log_class(spa)) + + metaslab_class_get_alloc(spa_special_class(spa)) + + metaslab_class_get_alloc(spa_dedup_class(spa)) + + get_unflushed_alloc_space(spa); + total_found = tzb->zb_asize - zcb.zcb_dedup_asize + + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; + + if (total_found == total_alloc && !dump_opt['L']) { + (void) printf("\n\tNo leaks (block sum matches space" + " maps exactly)\n"); + } else if (!dump_opt['L']) { + (void) printf("block traversal size %llu != alloc %llu " + "(%s %lld)\n", + (u_longlong_t)total_found, + (u_longlong_t)total_alloc, + (dump_opt['L']) ? "unreachable" : "leaked", + (longlong_t)(total_alloc - total_found)); + leaks = B_TRUE; + } + + if (tzb->zb_count == 0) + return (2); + + (void) printf("\n"); + (void) printf("\t%-16s %14llu\n", "bp count:", + (u_longlong_t)tzb->zb_count); + (void) printf("\t%-16s %14llu\n", "ganged count:", + (longlong_t)tzb->zb_gangs); + (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", + (u_longlong_t)tzb->zb_lsize, + (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); + (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", + "bp physical:", (u_longlong_t)tzb->zb_psize, + (u_longlong_t)(tzb->zb_psize / tzb->zb_count), + (double)tzb->zb_lsize / tzb->zb_psize); + (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", + "bp allocated:", (u_longlong_t)tzb->zb_asize, + (u_longlong_t)(tzb->zb_asize / tzb->zb_count), + (double)tzb->zb_lsize / tzb->zb_asize); + (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", + "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize, + (u_longlong_t)zcb.zcb_dedup_blocks, + (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); + (void) printf("\t%-16s %14llu used: 
%5.2f%%\n", "Normal class:", + (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); + + if (spa_special_class(spa)->mc_rotor != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_special_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_special_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Special class", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + + if (spa_dedup_class(spa)->mc_rotor != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_dedup_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_dedup_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Dedup class", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + + for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { + if (zcb.zcb_embedded_blocks[i] == 0) + continue; + (void) printf("\n"); + (void) printf("\tadditional, non-pointer bps of type %u: " + "%10llu\n", + i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); + + if (dump_opt['b'] >= 3) { + (void) printf("\t number of (compressed) bytes: " + "number of bps\n"); + dump_histogram(zcb.zcb_embedded_histogram[i], + sizeof (zcb.zcb_embedded_histogram[i]) / + sizeof (zcb.zcb_embedded_histogram[i][0]), 0); + } + } + + if (tzb->zb_ditto_samevdev != 0) { + (void) printf("\tDittoed blocks on same vdev: %llu\n", + (longlong_t)tzb->zb_ditto_samevdev); + } + if (tzb->zb_ditto_same_ms != 0) { + (void) printf("\tDittoed blocks in same metaslab: %llu\n", + (longlong_t)tzb->zb_ditto_same_ms); + } + + for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + if (vim == NULL) { + continue; + } + + char mem[32]; + zdb_nicenum(vdev_indirect_mapping_num_entries(vim), + mem, vdev_indirect_mapping_size(vim)); + + (void) printf("\tindirect vdev id %llu has %llu segments " + "(%s in memory)\n", + (longlong_t)vd->vdev_id, + (longlong_t)vdev_indirect_mapping_num_entries(vim), 
mem); + } + + if (dump_opt['b'] >= 2) { + int l, t, level; + (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" + "\t avg\t comp\t%%Total\tType\n"); + + for (t = 0; t <= ZDB_OT_TOTAL; t++) { + char csize[32], lsize[32], psize[32], asize[32]; + char avg[32], gang[32]; + const char *typename; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ); + + if (t < DMU_OT_NUMTYPES) + typename = dmu_ot[t].ot_name; + else + typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; + + if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { + (void) printf("%6s\t%5s\t%5s\t%5s" + "\t%5s\t%5s\t%6s\t%s\n", + "-", + "-", + "-", + "-", + "-", + "-", + "-", + typename); + continue; + } + + for (l = ZB_TOTAL - 1; l >= -1; l--) { + level = (l == -1 ? ZB_TOTAL : l); + zb = &zcb.zcb_type[level][t]; + + if (zb->zb_asize == 0) + continue; + + if (dump_opt['b'] < 3 && level != ZB_TOTAL) + continue; + + if (level == 0 && zb->zb_asize == + zcb.zcb_type[ZB_TOTAL][t].zb_asize) + continue; + + zdb_nicenum(zb->zb_count, csize, + sizeof (csize)); + zdb_nicenum(zb->zb_lsize, lsize, + sizeof (lsize)); + zdb_nicenum(zb->zb_psize, psize, + sizeof (psize)); + zdb_nicenum(zb->zb_asize, asize, + sizeof (asize)); + zdb_nicenum(zb->zb_asize / zb->zb_count, avg, + sizeof (avg)); + zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); + + (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" + "\t%5.2f\t%6.2f\t", + csize, lsize, psize, asize, avg, + (double)zb->zb_lsize / zb->zb_psize, + 100.0 * zb->zb_asize / tzb->zb_asize); + + if (level == ZB_TOTAL) + (void) printf("%s\n", typename); + else + (void) printf(" L%d %s\n", + level, typename); + + if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { + (void) printf("\t number of ganged " + "blocks: %s\n", gang); + } + + if (dump_opt['b'] >= 4) { + (void) 
printf("psize " + "(in 512-byte sectors): " + "number of blocks\n"); + dump_histogram(zb->zb_psize_histogram, + PSIZE_HISTO_SIZE, 0); + } + } + } + + /* Output a table summarizing block sizes in the pool */ + if (dump_opt['b'] >= 2) { + dump_size_histograms(&zcb); + } + } + + (void) printf("\n"); + + if (leaks) + return (2); + + if (zcb.zcb_haderrors) + return (3); + + return (0); +} + +typedef struct zdb_ddt_entry { + ddt_key_t zdde_key; + uint64_t zdde_ref_blocks; + uint64_t zdde_ref_lsize; + uint64_t zdde_ref_psize; + uint64_t zdde_ref_dsize; + avl_node_t zdde_node; +} zdb_ddt_entry_t; + +/* ARGSUSED */ +static int +zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + avl_tree_t *t = arg; + avl_index_t where; + zdb_ddt_entry_t *zdde, zdde_search; + + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_EMBEDDED(bp)) + return (0); + + if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { + (void) printf("traversing objset %llu, %llu objects, " + "%lu blocks so far\n", + (u_longlong_t)zb->zb_objset, + (u_longlong_t)BP_GET_FILL(bp), + avl_numnodes(t)); + } + + if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || + BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) + return (0); + + ddt_key_fill(&zdde_search.zdde_key, bp); + + zdde = avl_find(t, &zdde_search, &where); + + if (zdde == NULL) { + zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); + zdde->zdde_key = zdde_search.zdde_key; + avl_insert(t, zdde, where); + } + + zdde->zdde_ref_blocks += 1; + zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); + zdde->zdde_ref_psize += BP_GET_PSIZE(bp); + zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); + + return (0); +} + +static void +dump_simulated_ddt(spa_t *spa) +{ + avl_tree_t t; + void *cookie = NULL; + zdb_ddt_entry_t *zdde; + ddt_histogram_t ddh_total; + ddt_stat_t dds_total; + + bzero(&ddh_total, sizeof (ddh_total)); + bzero(&dds_total, sizeof (dds_total)); 
+ avl_create(&t, ddt_entry_compare, + sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | + TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { + ddt_stat_t dds; + uint64_t refcnt = zdde->zdde_ref_blocks; + ASSERT(refcnt != 0); + + dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; + dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; + dds.dds_psize = zdde->zdde_ref_psize / refcnt; + dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; + + dds.dds_ref_blocks = zdde->zdde_ref_blocks; + dds.dds_ref_lsize = zdde->zdde_ref_lsize; + dds.dds_ref_psize = zdde->zdde_ref_psize; + dds.dds_ref_dsize = zdde->zdde_ref_dsize; + + ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], + &dds, 0); + + umem_free(zdde, sizeof (*zdde)); + } + + avl_destroy(&t); + + ddt_histogram_stat(&dds_total, &ddh_total); + + (void) printf("Simulated DDT histogram:\n"); + + zpool_dump_ddt(&dds_total, &ddh_total); + + dump_dedup_ratio(&dds_total); +} + +static int +verify_device_removal_feature_counts(spa_t *spa) +{ + uint64_t dr_feature_refcount = 0; + uint64_t oc_feature_refcount = 0; + uint64_t indirect_vdev_count = 0; + uint64_t precise_vdev_count = 0; + uint64_t obsolete_counts_object_count = 0; + uint64_t obsolete_sm_count = 0; + uint64_t obsolete_counts_count = 0; + uint64_t scip_count = 0; + uint64_t obsolete_bpobj_count = 0; + int ret = 0; + + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + if (scip->scip_next_mapping_object != 0) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; + ASSERT(scip->scip_prev_obsolete_sm_object != 0); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + (void) printf("Condensing indirect vdev %llu: new mapping " + "object %llu, prev obsolete sm %llu\n", + (u_longlong_t)scip->scip_vdev, 
+ (u_longlong_t)scip->scip_next_mapping_object, + (u_longlong_t)scip->scip_prev_obsolete_sm_object); + if (scip->scip_prev_obsolete_sm_object != 0) { + space_map_t *prev_obsolete_sm = NULL; + VERIFY0(space_map_open(&prev_obsolete_sm, + spa->spa_meta_objset, + scip->scip_prev_obsolete_sm_object, + 0, vd->vdev_asize, 0)); + dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); + (void) printf("\n"); + space_map_close(prev_obsolete_sm); + } + + scip_count += 2; + } + + for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + if (vic->vic_mapping_object != 0) { + ASSERT(vd->vdev_ops == &vdev_indirect_ops || + vd->vdev_removing); + indirect_vdev_count++; + + if (vd->vdev_indirect_mapping->vim_havecounts) { + obsolete_counts_count++; + } + } + + boolean_t are_precise; + VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); + if (are_precise) { + ASSERT(vic->vic_mapping_object != 0); + precise_vdev_count++; + } + + uint64_t obsolete_sm_object; + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + if (obsolete_sm_object != 0) { + ASSERT(vic->vic_mapping_object != 0); + obsolete_sm_count++; + } + } + + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], + &dr_feature_refcount); + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], + &oc_feature_refcount); + + if (dr_feature_refcount != indirect_vdev_count) { + ret = 1; + (void) printf("Number of indirect vdevs (%llu) " \ + "does not match feature count (%llu)\n", + (u_longlong_t)indirect_vdev_count, + (u_longlong_t)dr_feature_refcount); + } else { + (void) printf("Verified device_removal feature refcount " \ + "of %llu is correct\n", + (u_longlong_t)dr_feature_refcount); + } + + if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ) == 0) { + obsolete_bpobj_count++; + } + + + 
obsolete_counts_object_count = precise_vdev_count; + obsolete_counts_object_count += obsolete_sm_count; + obsolete_counts_object_count += obsolete_counts_count; + obsolete_counts_object_count += scip_count; + obsolete_counts_object_count += obsolete_bpobj_count; + obsolete_counts_object_count += remap_deadlist_count; + + if (oc_feature_refcount != obsolete_counts_object_count) { + ret = 1; + (void) printf("Number of obsolete counts objects (%llu) " \ + "does not match feature count (%llu)\n", + (u_longlong_t)obsolete_counts_object_count, + (u_longlong_t)oc_feature_refcount); + (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " + "ob:%llu rd:%llu\n", + (u_longlong_t)precise_vdev_count, + (u_longlong_t)obsolete_sm_count, + (u_longlong_t)obsolete_counts_count, + (u_longlong_t)scip_count, + (u_longlong_t)obsolete_bpobj_count, + (u_longlong_t)remap_deadlist_count); + } else { + (void) printf("Verified indirect_refcount feature refcount " \ + "of %llu is correct\n", + (u_longlong_t)oc_feature_refcount); + } + return (ret); +} + +static void +zdb_set_skip_mmp(char *target) +{ + spa_t *spa; + + /* + * Disable the activity check to allow examination of + * active pools. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(target)) != NULL) { + spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; + } + mutex_exit(&spa_namespace_lock); +} + +#define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" +/* + * Import the checkpointed state of the pool specified by the target + * parameter as readonly. The function also accepts a pool config + * as an optional parameter, else it attempts to infer the config by + * the name of the target pool. + * + * Note that the checkpointed state's pool name will be the name of + * the original pool with the above suffix appended to it. In addition, + * if the target is not a pool name (e.g. a path to a dataset) then + * the new_path parameter is populated with the updated path to + * reflect the fact that we are looking into the checkpointed state. 
+ * + * The function returns a newly-allocated copy of the name of the + * pool containing the checkpointed state. When this copy is no + * longer needed it should be freed with free(3C). Same thing + * applies to the new_path parameter if allocated. + */ +static char * +import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) +{ + int error = 0; + char *poolname, *bogus_name = NULL; + + /* If the target is not a pool, the extract the pool name */ + char *path_start = strchr(target, '/'); + if (path_start != NULL) { + size_t poolname_len = path_start - target; + poolname = strndup(target, poolname_len); + } else { + poolname = target; + } + + if (cfg == NULL) { + zdb_set_skip_mmp(poolname); + error = spa_get_stats(poolname, &cfg, NULL, 0); + if (error != 0) { + fatal("Tried to read config of pool \"%s\" but " + "spa_get_stats() failed with error %d\n", + poolname, error); + } + } + + if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) + return (NULL); + fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); + + error = spa_import(bogus_name, cfg, NULL, + ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | + ZFS_IMPORT_SKIP_MMP); + if (error != 0) { + fatal("Tried to import pool \"%s\" but spa_import() failed " + "with error %d\n", bogus_name, error); + } + + if (new_path != NULL && path_start != NULL) { + if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) { + if (path_start != NULL) + free(poolname); + return (NULL); + } + } + + if (target != poolname) + free(poolname); + + return (bogus_name); +} + +typedef struct verify_checkpoint_sm_entry_cb_arg { + vdev_t *vcsec_vd; + + /* the following fields are only used for printing progress */ + uint64_t vcsec_entryid; + uint64_t vcsec_num_entries; +} verify_checkpoint_sm_entry_cb_arg_t; + +#define ENTRIES_PER_PROGRESS_UPDATE 10000 + +static int +verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) +{ + verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; + vdev_t *vd = 
vcsec->vcsec_vd; + metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; + uint64_t end = sme->sme_offset + sme->sme_run; + + ASSERT(sme->sme_type == SM_FREE); + + if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { + (void) fprintf(stderr, + "\rverifying vdev %llu, space map entry %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)vcsec->vcsec_entryid, + (longlong_t)vcsec->vcsec_num_entries); + } + vcsec->vcsec_entryid++; + + /* + * See comment in checkpoint_sm_exclude_entry_cb() + */ + VERIFY3U(sme->sme_offset, >=, ms->ms_start); + VERIFY3U(end, <=, ms->ms_start + ms->ms_size); + + /* + * The entries in the vdev_checkpoint_sm should be marked as + * allocated in the checkpointed state of the pool, therefore + * their respective ms_allocateable trees should not contain them. + */ + mutex_enter(&ms->ms_lock); + range_tree_verify_not_present(ms->ms_allocatable, + sme->sme_offset, sme->sme_run); + mutex_exit(&ms->ms_lock); + + return (0); +} + +/* + * Verify that all segments in the vdev_checkpoint_sm are allocated + * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's + * ms_allocatable). + * + * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of + * each vdev in the current state of the pool to the metaslab space maps + * (ms_sm) of the checkpointed state of the pool. + * + * Note that the function changes the state of the ms_allocatable + * trees of the current spa_t. The entries of these ms_allocatable + * trees are cleared out and then repopulated from with the free + * entries of their respective ms_sm space maps. 
+ */ +static void +verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) +{ + vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; + vdev_t *current_rvd = current->spa_root_vdev; + + load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); + + for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { + vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; + vdev_t *current_vd = current_rvd->vdev_child[c]; + + space_map_t *checkpoint_sm = NULL; + uint64_t checkpoint_sm_obj; + + if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { + /* + * Since we don't allow device removal in a pool + * that has a checkpoint, we expect that all removed + * vdevs were removed from the pool before the + * checkpoint. + */ + ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); + continue; + } + + /* + * If the checkpoint space map doesn't exist, then nothing + * here is checkpointed so there's nothing to verify. + */ + if (current_vd->vdev_top_zap == 0 || + zap_contains(spa_meta_objset(current), + current_vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) + continue; + + VERIFY0(zap_lookup(spa_meta_objset(current), + current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, + sizeof (uint64_t), 1, &checkpoint_sm_obj)); + + VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), + checkpoint_sm_obj, 0, current_vd->vdev_asize, + current_vd->vdev_ashift)); + + verify_checkpoint_sm_entry_cb_arg_t vcsec; + vcsec.vcsec_vd = ckpoint_vd; + vcsec.vcsec_entryid = 0; + vcsec.vcsec_num_entries = + space_map_length(checkpoint_sm) / sizeof (uint64_t); + VERIFY0(space_map_iterate(checkpoint_sm, + space_map_length(checkpoint_sm), + verify_checkpoint_sm_entry_cb, &vcsec)); + if (dump_opt['m'] > 3) + dump_spacemap(current->spa_meta_objset, checkpoint_sm); + space_map_close(checkpoint_sm); + } + + /* + * If we've added vdevs since we took the checkpoint, ensure + * that their checkpoint space maps are empty. 
+ */ + if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { + for (uint64_t c = ckpoint_rvd->vdev_children; + c < current_rvd->vdev_children; c++) { + vdev_t *current_vd = current_rvd->vdev_child[c]; + ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL); + } + } + + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); +} + +/* + * Verifies that all space that's allocated in the checkpoint is + * still allocated in the current version, by checking that everything + * in checkpoint's ms_allocatable (which is actually allocated, not + * allocatable/free) is not present in current's ms_allocatable. + * + * Note that the function changes the state of the ms_allocatable + * trees of both spas when called. The entries of all ms_allocatable + * trees are cleared out and then repopulated from their respective + * ms_sm space maps. In the checkpointed state we load the allocated + * entries, and in the current state we load the free entries. + */ +static void +verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) +{ + vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; + vdev_t *current_rvd = current->spa_root_vdev; + + load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); + load_concrete_ms_allocatable_trees(current, SM_FREE); + + for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { + vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; + vdev_t *current_vd = current_rvd->vdev_child[i]; + + if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { + /* + * See comment in verify_checkpoint_vdev_spacemaps() + */ + ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); + continue; + } + + for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { + metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; + metaslab_t *current_msp = current_vd->vdev_ms[m]; + + (void) fprintf(stderr, + "\rverifying vdev %llu of %llu, " + "metaslab %llu of %llu ...", + (longlong_t)current_vd->vdev_id, + (longlong_t)current_rvd->vdev_children, + 
(longlong_t)current_vd->vdev_ms[m]->ms_id, + (longlong_t)current_vd->vdev_ms_count); + + /* + * We walk through the ms_allocatable trees that + * are loaded with the allocated blocks from the + * ms_sm spacemaps of the checkpoint. For each + * one of these ranges we ensure that none of them + * exists in the ms_allocatable trees of the + * current state which are loaded with the ranges + * that are currently free. + * + * This way we ensure that none of the blocks that + * are part of the checkpoint were freed by mistake. + */ + range_tree_walk(ckpoint_msp->ms_allocatable, + (range_tree_func_t *)range_tree_verify_not_present, + current_msp->ms_allocatable); + } + } + + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); +} + +static void +verify_checkpoint_blocks(spa_t *spa) +{ + ASSERT(!dump_opt['L']); + + spa_t *checkpoint_spa; + char *checkpoint_pool; + nvlist_t *config = NULL; + int error = 0; + + /* + * We import the checkpointed state of the pool (under a different + * name) so we can do verification on it against the current state + * of the pool. + */ + checkpoint_pool = import_checkpointed_state(spa->spa_name, config, + NULL); + ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); + + error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); + if (error != 0) { + fatal("Tried to open pool \"%s\" but spa_open() failed with " + "error %d\n", checkpoint_pool, error); + } + + /* + * Ensure that ranges in the checkpoint space maps of each vdev + * are allocated according to the checkpointed state's metaslab + * space maps. + */ + verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); + + /* + * Ensure that allocated ranges in the checkpoint's metaslab + * space maps remain allocated in the metaslab space maps of + * the current state. + */ + verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); + + /* + * Once we are done, we get rid of the checkpointed state. 
+ */ + spa_close(checkpoint_spa, FTAG); + free(checkpoint_pool); +} + +static void +dump_leftover_checkpoint_blocks(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + + space_map_t *checkpoint_sm = NULL; + uint64_t checkpoint_sm_obj; + + if (vd->vdev_top_zap == 0) + continue; + + if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) + continue; + + VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, + sizeof (uint64_t), 1, &checkpoint_sm_obj)); + + VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), + checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); + dump_spacemap(spa->spa_meta_objset, checkpoint_sm); + space_map_close(checkpoint_sm); + } +} + +static int +verify_checkpoint(spa_t *spa) +{ + uberblock_t checkpoint; + int error; + + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (0); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), + sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); + + if (error == ENOENT && !dump_opt['L']) { + /* + * If the feature is active but the uberblock is missing + * then we must be in the middle of discarding the + * checkpoint. 
+ */ + (void) printf("\nPartially discarded checkpoint " + "state found:\n"); + if (dump_opt['m'] > 3) + dump_leftover_checkpoint_blocks(spa); + return (0); + } else if (error != 0) { + (void) printf("lookup error %d when looking for " + "checkpointed uberblock in MOS\n", error); + return (error); + } + dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n"); + + if (checkpoint.ub_checkpoint_txg == 0) { + (void) printf("\nub_checkpoint_txg not set in checkpointed " + "uberblock\n"); + error = 3; + } + + if (error == 0 && !dump_opt['L']) + verify_checkpoint_blocks(spa); + + return (error); +} + +/* ARGSUSED */ +static void +mos_leaks_cb(void *arg, uint64_t start, uint64_t size) +{ + for (uint64_t i = start; i < size; i++) { + (void) printf("MOS object %llu referenced but not allocated\n", + (u_longlong_t)i); + } +} + +static void +mos_obj_refd(uint64_t obj) +{ + if (obj != 0 && mos_refd_objs != NULL) + range_tree_add(mos_refd_objs, obj, 1); +} + +/* + * Call on a MOS object that may already have been referenced. 
+ */ +static void +mos_obj_refd_multiple(uint64_t obj) +{ + if (obj != 0 && mos_refd_objs != NULL && + !range_tree_contains(mos_refd_objs, obj, 1)) + range_tree_add(mos_refd_objs, obj, 1); +} + +static void +mos_leak_vdev_top_zap(vdev_t *vd) +{ + uint64_t ms_flush_data_obj; + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); + if (error == ENOENT) + return; + ASSERT0(error); + + mos_obj_refd(ms_flush_data_obj); +} + +static void +mos_leak_vdev(vdev_t *vd) +{ + mos_obj_refd(vd->vdev_dtl_object); + mos_obj_refd(vd->vdev_ms_array); + mos_obj_refd(vd->vdev_indirect_config.vic_births_object); + mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); + mos_obj_refd(vd->vdev_leaf_zap); + if (vd->vdev_checkpoint_sm != NULL) + mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); + if (vd->vdev_indirect_mapping != NULL) { + mos_obj_refd(vd->vdev_indirect_mapping-> + vim_phys->vimp_counts_object); + } + if (vd->vdev_obsolete_sm != NULL) + mos_obj_refd(vd->vdev_obsolete_sm->sm_object); + + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *ms = vd->vdev_ms[m]; + mos_obj_refd(space_map_object(ms->ms_sm)); + } + + if (vd->vdev_top_zap != 0) { + mos_obj_refd(vd->vdev_top_zap); + mos_leak_vdev_top_zap(vd); + } + + for (uint64_t c = 0; c < vd->vdev_children; c++) { + mos_leak_vdev(vd->vdev_child[c]); + } +} + +static void +mos_leak_log_spacemaps(spa_t *spa) +{ + uint64_t spacemap_zap; + int error = zap_lookup(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, + sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) + return; + ASSERT0(error); + + mos_obj_refd(spacemap_zap); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) + mos_obj_refd(sls->sls_sm_obj); +} + +static int +dump_mos_leaks(spa_t *spa) +{ + int rv = 0; + objset_t *mos = 
spa->spa_meta_objset; + dsl_pool_t *dp = spa->spa_dsl_pool; + + /* Visit and mark all referenced objects in the MOS */ + + mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); + mos_obj_refd(spa->spa_pool_props_object); + mos_obj_refd(spa->spa_config_object); + mos_obj_refd(spa->spa_ddt_stat_object); + mos_obj_refd(spa->spa_feat_desc_obj); + mos_obj_refd(spa->spa_feat_enabled_txg_obj); + mos_obj_refd(spa->spa_feat_for_read_obj); + mos_obj_refd(spa->spa_feat_for_write_obj); + mos_obj_refd(spa->spa_history); + mos_obj_refd(spa->spa_errlog_last); + mos_obj_refd(spa->spa_errlog_scrub); + mos_obj_refd(spa->spa_all_vdev_zaps); + mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); + mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); + mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); + bpobj_count_refd(&spa->spa_deferred_bpobj); + mos_obj_refd(dp->dp_empty_bpobj); + bpobj_count_refd(&dp->dp_obsolete_bpobj); + bpobj_count_refd(&dp->dp_free_bpobj); + mos_obj_refd(spa->spa_l2cache.sav_object); + mos_obj_refd(spa->spa_spares.sav_object); + + if (spa->spa_syncing_log_sm != NULL) + mos_obj_refd(spa->spa_syncing_log_sm->sm_object); + mos_leak_log_spacemaps(spa); + + mos_obj_refd(spa->spa_condensing_indirect_phys. + scip_next_mapping_object); + mos_obj_refd(spa->spa_condensing_indirect_phys. 
+ scip_prev_obsolete_sm_object); + if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { + vdev_indirect_mapping_t *vim = + vdev_indirect_mapping_open(mos, + spa->spa_condensing_indirect_phys.scip_next_mapping_object); + mos_obj_refd(vim->vim_phys->vimp_counts_object); + vdev_indirect_mapping_close(vim); + } + deleted_livelists_dump_mos(spa); + + if (dp->dp_origin_snap != NULL) { + dsl_dataset_t *ds; + + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, + FTAG, &ds)); + count_ds_mos_objects(ds); + dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + + count_ds_mos_objects(dp->dp_origin_snap); + dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist"); + } + count_dir_mos_objects(dp->dp_mos_dir); + if (dp->dp_free_dir != NULL) + count_dir_mos_objects(dp->dp_free_dir); + if (dp->dp_leak_dir != NULL) + count_dir_mos_objects(dp->dp_leak_dir); + + mos_leak_vdev(spa->spa_root_vdev); + + for (uint64_t class = 0; class < DDT_CLASSES; class++) { + for (uint64_t type = 0; type < DDT_TYPES; type++) { + for (uint64_t cksum = 0; + cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { + ddt_t *ddt = spa->spa_ddt[cksum]; + mos_obj_refd(ddt->ddt_object[type][class]); + } + } + } + + /* + * Visit all allocated objects and make sure they are referenced. 
+ */ + uint64_t object = 0; + while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { + if (range_tree_contains(mos_refd_objs, object, 1)) { + range_tree_remove(mos_refd_objs, object, 1); + } else { + dmu_object_info_t doi; + const char *name; + dmu_object_info(mos, object, &doi); + if (doi.doi_type & DMU_OT_NEWTYPE) { + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(doi.doi_type); + name = dmu_ot_byteswap[bswap].ob_name; + } else { + name = dmu_ot[doi.doi_type].ot_name; + } + + (void) printf("MOS object %llu (%s) leaked\n", + (u_longlong_t)object, name); + rv = 2; + } + } + (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); + if (!range_tree_is_empty(mos_refd_objs)) + rv = 2; + range_tree_vacate(mos_refd_objs, NULL, NULL); + range_tree_destroy(mos_refd_objs); + return (rv); +} + +typedef struct log_sm_obsolete_stats_arg { + uint64_t lsos_current_txg; + + uint64_t lsos_total_entries; + uint64_t lsos_valid_entries; + + uint64_t lsos_sm_entries; + uint64_t lsos_valid_sm_entries; +} log_sm_obsolete_stats_arg_t; + +static int +log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + log_sm_obsolete_stats_arg_t *lsos = arg; + + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + if (lsos->lsos_current_txg == 0) { + /* this is the first log */ + lsos->lsos_current_txg = txg; + } else if (lsos->lsos_current_txg < txg) { + /* we just changed log - print stats and reset */ + (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", + (u_longlong_t)lsos->lsos_valid_sm_entries, + (u_longlong_t)lsos->lsos_sm_entries, + (u_longlong_t)lsos->lsos_current_txg); + lsos->lsos_valid_sm_entries = 0; + lsos->lsos_sm_entries = 0; + lsos->lsos_current_txg = txg; + } + ASSERT3U(lsos->lsos_current_txg, ==, txg); + + lsos->lsos_sm_entries++; + lsos->lsos_total_entries++; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> 
vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + lsos->lsos_valid_sm_entries++; + lsos->lsos_valid_entries++; + return (0); +} + +static void +dump_log_spacemap_obsolete_stats(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + log_sm_obsolete_stats_arg_t lsos; + bzero(&lsos, sizeof (lsos)); + + (void) printf("Log Space Map Obsolete Entry Statistics:\n"); + + iterate_through_spacemap_logs(spa, + log_spacemap_obsolete_stats_cb, &lsos); + + /* print stats for latest log */ + (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", + (u_longlong_t)lsos.lsos_valid_sm_entries, + (u_longlong_t)lsos.lsos_sm_entries, + (u_longlong_t)lsos.lsos_current_txg); + + (void) printf("%-8llu valid entries out of %-8llu - total\n\n", + (u_longlong_t)lsos.lsos_valid_entries, + (u_longlong_t)lsos.lsos_total_entries); +} + +static void +dump_zpool(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + int rc = 0; + + if (dump_opt['y']) { + livelist_metaslab_validate(spa); + } + + if (dump_opt['S']) { + dump_simulated_ddt(spa); + return; + } + + if (!dump_opt['e'] && dump_opt['C'] > 1) { + (void) printf("\nCached configuration:\n"); + dump_nvlist(spa->spa_config, 8); + } + + if (dump_opt['C']) + dump_config(spa); + + if (dump_opt['u']) + dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); + + if (dump_opt['D']) + dump_all_ddts(spa); + + if (dump_opt['d'] > 2 || dump_opt['m']) + dump_metaslabs(spa); + if (dump_opt['M']) + dump_metaslab_groups(spa); + if (dump_opt['d'] > 2 || dump_opt['m']) { + dump_log_spacemaps(spa); + dump_log_spacemap_obsolete_stats(spa); + } + + if (dump_opt['d'] || dump_opt['i']) { + spa_feature_t f; + mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, + 0); + dump_objset(dp->dp_meta_objset); + + if (dump_opt['d'] >= 3) { + dsl_pool_t *dp = spa->spa_dsl_pool; + dump_full_bpobj(&spa->spa_deferred_bpobj, + 
"Deferred frees", 0); + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + dump_full_bpobj(&dp->dp_free_bpobj, + "Pool snapshot frees", 0); + } + if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_DEVICE_REMOVAL)); + dump_full_bpobj(&dp->dp_obsolete_bpobj, + "Pool obsolete blocks", 0); + } + + if (spa_feature_is_active(spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dump_bptree(spa->spa_meta_objset, + dp->dp_bptree_obj, + "Pool dataset frees"); + } + dump_dtl(spa->spa_root_vdev, 0); + } + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) + global_feature_count[f] = UINT64_MAX; + global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; + global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; + global_feature_count[SPA_FEATURE_LIVELIST] = 0; + + (void) dmu_objset_find(spa_name(spa), dump_one_objset, + NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); + + if (rc == 0 && !dump_opt['L']) + rc = dump_mos_leaks(spa); + + for (f = 0; f < SPA_FEATURES; f++) { + uint64_t refcount; + + uint64_t *arr; + if (!(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET)) { + if (global_feature_count[f] == UINT64_MAX) + continue; + if (!spa_feature_is_enabled(spa, f)) { + ASSERT0(global_feature_count[f]); + continue; + } + arr = global_feature_count; + } else { + if (!spa_feature_is_enabled(spa, f)) { + ASSERT0(dataset_feature_count[f]); + continue; + } + arr = dataset_feature_count; + } + if (feature_get_refcount(spa, &spa_feature_table[f], + &refcount) == ENOTSUP) + continue; + if (arr[f] != refcount) { + (void) printf("%s feature refcount mismatch: " + "%lld consumers != %lld refcount\n", + spa_feature_table[f].fi_uname, + (longlong_t)arr[f], (longlong_t)refcount); + rc = 2; + } else { + (void) printf("Verified %s feature refcount " + "of %llu is correct\n", + spa_feature_table[f].fi_uname, + (longlong_t)refcount); + } + } + + if (rc == 0) + rc = verify_device_removal_feature_counts(spa); + } + + if (rc == 0 && (dump_opt['b'] || 
dump_opt['c'])) + rc = dump_block_stats(spa); + + if (rc == 0) + rc = verify_spacemap_refcounts(spa); + + if (dump_opt['s']) + show_pool_stats(spa); + + if (dump_opt['h']) + dump_history(spa); + + if (rc == 0) + rc = verify_checkpoint(spa); + + if (rc != 0) { + dump_debug_buffer(); + exit(rc); + } +} + +#define ZDB_FLAG_CHECKSUM 0x0001 +#define ZDB_FLAG_DECOMPRESS 0x0002 +#define ZDB_FLAG_BSWAP 0x0004 +#define ZDB_FLAG_GBH 0x0008 +#define ZDB_FLAG_INDIRECT 0x0010 +#define ZDB_FLAG_RAW 0x0020 +#define ZDB_FLAG_PRINT_BLKPTR 0x0040 +#define ZDB_FLAG_VERBOSE 0x0080 + +static int flagbits[256]; +static char flagbitstr[16]; + +static void +zdb_print_blkptr(const blkptr_t *bp, int flags) +{ + char blkbuf[BP_SPRINTF_LEN]; + + if (flags & ZDB_FLAG_BSWAP) + byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); + + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("%s\n", blkbuf); +} + +static void +zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) +{ + int i; + + for (i = 0; i < nbps; i++) + zdb_print_blkptr(&bp[i], flags); +} + +static void +zdb_dump_gbh(void *buf, int flags) +{ + zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); +} + +static void +zdb_dump_block_raw(void *buf, uint64_t size, int flags) +{ + if (flags & ZDB_FLAG_BSWAP) + byteswap_uint64_array(buf, size); + VERIFY(write(fileno(stdout), buf, size) == size); +} + +static void +zdb_dump_block(char *label, void *buf, uint64_t size, int flags) +{ + uint64_t *d = (uint64_t *)buf; + unsigned nwords = size / sizeof (uint64_t); + int do_bswap = !!(flags & ZDB_FLAG_BSWAP); + unsigned i, j; + const char *hdr; + char *c; + + + if (do_bswap) + hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; + else + hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; + + (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); + +#ifdef _LITTLE_ENDIAN + /* correct the endianness */ + do_bswap = !do_bswap; +#endif + for (i = 0; i < nwords; i += 2) { + (void) printf("%06llx: %016llx %016llx ", + (u_longlong_t)(i * sizeof 
(uint64_t)), + (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), + (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); + + c = (char *)&d[i]; + for (j = 0; j < 2 * sizeof (uint64_t); j++) + (void) printf("%c", isprint(c[j]) ? c[j] : '.'); + (void) printf("\n"); + } +} + +/* + * There are two acceptable formats: + * leaf_name - For example: c1t0d0 or /tmp/ztest.0a + * child[.child]* - For example: 0.1.1 + * + * The second form can be used to specify arbitrary vdevs anywhere + * in the hierarchy. For example, in a pool with a mirror of + * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . + */ +static vdev_t * +zdb_vdev_lookup(vdev_t *vdev, const char *path) +{ + char *s, *p, *q; + unsigned i; + + if (vdev == NULL) + return (NULL); + + /* First, assume the x.x.x.x format */ + i = strtoul(path, &s, 10); + if (s == path || (s && *s != '.' && *s != '\0')) + goto name; + if (i >= vdev->vdev_children) + return (NULL); + + vdev = vdev->vdev_child[i]; + if (s && *s == '\0') + return (vdev); + return (zdb_vdev_lookup(vdev, s+1)); + +name: + for (i = 0; i < vdev->vdev_children; i++) { + vdev_t *vc = vdev->vdev_child[i]; + + if (vc->vdev_path == NULL) { + vc = zdb_vdev_lookup(vc, path); + if (vc == NULL) + continue; + else + return (vc); + } + + p = strrchr(vc->vdev_path, '/'); + p = p ? 
p + 1 : vc->vdev_path; + q = &vc->vdev_path[strlen(vc->vdev_path) - 2]; + + if (strcmp(vc->vdev_path, path) == 0) + return (vc); + if (strcmp(p, path) == 0) + return (vc); + if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0) + return (vc); + } + + return (NULL); +} + +static int +name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr) +{ + dsl_dataset_t *ds; + + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); + int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id, + NULL, &ds); + if (error != 0) { + (void) fprintf(stderr, "failed to hold objset %llu: %s\n", + (u_longlong_t)objset_id, strerror(error)); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); + return (error); + } + dsl_dataset_name(ds, outstr); + dsl_dataset_rele(ds, NULL); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); + return (0); +} + +static boolean_t +zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize) +{ + char *s0, *s1; + + if (sizes == NULL) + return (B_FALSE); + + s0 = strtok(sizes, "/"); + if (s0 == NULL) + return (B_FALSE); + s1 = strtok(NULL, "/"); + *lsize = strtoull(s0, NULL, 16); + *psize = s1 ? strtoull(s1, NULL, 16) : *lsize; + return (*lsize >= *psize && *psize > 0); +} + +#define ZIO_COMPRESS_MASK(alg) (1ULL << (ZIO_COMPRESS_##alg)) + +static boolean_t +zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, + uint64_t psize, int flags) +{ + boolean_t exceeded = B_FALSE; + /* + * We don't know how the data was compressed, so just try + * every decompress function at every inflated blocksize. + */ + void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 }; + int *cfuncp = cfuncs; + uint64_t maxlsize = SPA_MAXBLOCKSIZE; + uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) | + ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) | + (getenv("ZDB_NO_ZLE") ? 
ZIO_COMPRESS_MASK(ZLE) : 0); + *cfuncp++ = ZIO_COMPRESS_LZ4; + *cfuncp++ = ZIO_COMPRESS_LZJB; + mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB); + for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) + if (((1ULL << c) & mask) == 0) + *cfuncp++ = c; + + /* + * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this + * could take a while and we should let the user know + * we are not stuck. On the other hand, printing progress + * info gets old after a while. User can specify 'v' flag + * to see the progression. + */ + if (lsize == psize) + lsize += SPA_MINBLOCKSIZE; + else + maxlsize = lsize; + for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) { + for (cfuncp = cfuncs; *cfuncp; cfuncp++) { + if (flags & ZDB_FLAG_VERBOSE) { + (void) fprintf(stderr, + "Trying %05llx -> %05llx (%s)\n", + (u_longlong_t)psize, + (u_longlong_t)lsize, + zio_compress_table[*cfuncp].\ + ci_name); + } + + /* + * We randomize lbuf2, and decompress to both + * lbuf and lbuf2. This way, we will know if + * decompression fill exactly to lsize. + */ + VERIFY0(random_get_pseudo_bytes(lbuf2, lsize)); + + if (zio_decompress_data(*cfuncp, pabd, + lbuf, psize, lsize, NULL) == 0 && + zio_decompress_data(*cfuncp, pabd, + lbuf2, psize, lsize, NULL) == 0 && + bcmp(lbuf, lbuf2, lsize) == 0) + break; + } + if (*cfuncp != 0) + break; + } + umem_free(lbuf2, SPA_MAXBLOCKSIZE); + + if (lsize > maxlsize) { + exceeded = B_TRUE; + } + buf = lbuf; + if (*cfuncp == ZIO_COMPRESS_ZLE) { + printf("\nZLE decompression was selected. If you " + "suspect the results are wrong,\ntry avoiding ZLE " + "by setting and exporting ZDB_NO_ZLE=\"true\"\n"); + } + + return (exceeded); +} + +/* + * Read a block from a pool and print it out. 
The syntax of the + * block descriptor is: + * + * pool:vdev_specifier:offset:[lsize/]psize[:flags] + * + * pool - The name of the pool you wish to read from + * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) + * offset - offset, in hex, in bytes + * size - Amount of data to read, in hex, in bytes + * flags - A string of characters specifying options + * b: Decode a blkptr at given offset within block + * c: Calculate and display checksums + * d: Decompress data before dumping + * e: Byteswap data before dumping + * g: Display data as a gang block header + * i: Display as an indirect block + * r: Dump raw data to stdout + * v: Verbose + * + */ +static void +zdb_read_block(char *thing, spa_t *spa) +{ + blkptr_t blk, *bp = &blk; + dva_t *dva = bp->blk_dva; + int flags = 0; + uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0; + zio_t *zio; + vdev_t *vd; + abd_t *pabd; + void *lbuf, *buf; + char *s, *p, *dup, *vdev, *flagstr, *sizes; + int i, error; + boolean_t borrowed = B_FALSE, found = B_FALSE; + + dup = strdup(thing); + s = strtok(dup, ":"); + vdev = s ? s : ""; + s = strtok(NULL, ":"); + offset = strtoull(s ? s : "", NULL, 16); + sizes = strtok(NULL, ":"); + s = strtok(NULL, ":"); + flagstr = strdup(s ? 
s : ""); + + s = NULL; + if (!zdb_parse_block_sizes(sizes, &lsize, &psize)) + s = "invalid size(s)"; + if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE)) + s = "size must be a multiple of sector size"; + if (!IS_P2ALIGNED(offset, DEV_BSIZE)) + s = "offset must be a multiple of sector size"; + if (s) { + (void) printf("Invalid block specifier: %s - %s\n", thing, s); + goto done; + } + + for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) { + for (i = 0; i < strlen(flagstr); i++) { + int bit = flagbits[(uchar_t)flagstr[i]]; + + if (bit == 0) { + (void) printf("***Ignoring flag: %c\n", + (uchar_t)flagstr[i]); + continue; + } + found = B_TRUE; + flags |= bit; + + p = &flagstr[i + 1]; + if (*p != ':' && *p != '\0') { + int j = 0, nextbit = flagbits[(uchar_t)*p]; + char *end, offstr[8] = { 0 }; + if ((bit == ZDB_FLAG_PRINT_BLKPTR) && + (nextbit == 0)) { + /* look ahead to isolate the offset */ + while (nextbit == 0 && + strchr(flagbitstr, *p) == NULL) { + offstr[j] = *p; + j++; + if (i + j > strlen(flagstr)) + break; + p++; + nextbit = flagbits[(uchar_t)*p]; + } + blkptr_offset = strtoull(offstr, &end, + 16); + i += j; + } else if (nextbit == 0) { + (void) printf("***Ignoring flag arg:" + " '%c'\n", (uchar_t)*p); + } + } + } + } + if (blkptr_offset % sizeof (blkptr_t)) { + printf("Block pointer offset 0x%llx " + "must be divisible by 0x%x\n", + (longlong_t)blkptr_offset, (int)sizeof (blkptr_t)); + goto done; + } + if (found == B_FALSE && strlen(flagstr) > 0) { + printf("Invalid flag arg: '%s'\n", flagstr); + goto done; + } + + vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); + if (vd == NULL) { + (void) printf("***Invalid vdev: %s\n", vdev); + free(dup); + return; + } else { + if (vd->vdev_path) + (void) fprintf(stderr, "Found vdev: %s\n", + vd->vdev_path); + else + (void) fprintf(stderr, "Found vdev type: %s\n", + vd->vdev_ops->vdev_op_type); + } + + pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); + lbuf = umem_alloc(SPA_MAXBLOCKSIZE, 
UMEM_NOFAIL); + + BP_ZERO(bp); + + DVA_SET_VDEV(&dva[0], vd->vdev_id); + DVA_SET_OFFSET(&dva[0], offset); + DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); + DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + + BP_SET_LSIZE(bp, lsize); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + zio = zio_root(spa, NULL, NULL, 0); + + if (vd == vd->vdev_top) { + /* + * Treat this as a normal block read. + */ + zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); + } else { + /* + * Treat this as a vdev child I/O. + */ + zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, + psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | + ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + + error = zio_wait(zio); + spa_config_exit(spa, SCL_STATE, FTAG); + + if (error) { + (void) printf("Read of %s failed, error: %d\n", thing, error); + goto out; + } + + uint64_t orig_lsize = lsize; + buf = lbuf; + if (flags & ZDB_FLAG_DECOMPRESS) { + boolean_t failed = zdb_decompress_block(pabd, buf, lbuf, + lsize, psize, flags); + if (failed) { + (void) printf("Decompress of %s failed\n", thing); + goto out; + } + } else { + buf = abd_borrow_buf_copy(pabd, lsize); + borrowed = B_TRUE; + } + /* + * Try to detect invalid block pointer. If invalid, try + * decompressing. 
+ */ + if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) && + !(flags & ZDB_FLAG_DECOMPRESS)) { + const blkptr_t *b = (const blkptr_t *)(void *) + ((uintptr_t)buf + (uintptr_t)blkptr_offset); + if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) == + B_FALSE) { + abd_return_buf_copy(pabd, buf, lsize); + borrowed = B_FALSE; + buf = lbuf; + boolean_t failed = zdb_decompress_block(pabd, buf, + lbuf, lsize, psize, flags); + b = (const blkptr_t *)(void *) + ((uintptr_t)buf + (uintptr_t)blkptr_offset); + if (failed || zfs_blkptr_verify(spa, b, B_FALSE, + BLK_VERIFY_LOG) == B_FALSE) { + printf("invalid block pointer at this DVA\n"); + goto out; + } + } + } + + if (flags & ZDB_FLAG_PRINT_BLKPTR) + zdb_print_blkptr((blkptr_t *)(void *) + ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); + else if (flags & ZDB_FLAG_RAW) + zdb_dump_block_raw(buf, lsize, flags); + else if (flags & ZDB_FLAG_INDIRECT) + zdb_dump_indirect((blkptr_t *)buf, + orig_lsize / sizeof (blkptr_t), flags); + else if (flags & ZDB_FLAG_GBH) + zdb_dump_gbh(buf, flags); + else + zdb_dump_block(thing, buf, lsize, flags); + + /* + * If :c was specified, iterate through the checksum table to + * calculate and display each checksum for our specified + * DVA and length. 
+ */ + if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) && + !(flags & ZDB_FLAG_GBH)) { + zio_t *czio; + (void) printf("\n"); + for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL; + ck < ZIO_CHECKSUM_FUNCTIONS; ck++) { + + if ((zio_checksum_table[ck].ci_flags & + ZCHECKSUM_FLAG_EMBEDDED) || + ck == ZIO_CHECKSUM_NOPARITY) { + continue; + } + BP_SET_CHECKSUM(bp, ck); + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + czio->io_bp = bp; + + if (vd == vd->vdev_top) { + zio_nowait(zio_read(czio, spa, bp, pabd, psize, + NULL, NULL, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | + ZIO_FLAG_DONT_RETRY, NULL)); + } else { + zio_nowait(zio_vdev_child_io(czio, bp, vd, + offset, pabd, psize, ZIO_TYPE_READ, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | + ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + error = zio_wait(czio); + if (error == 0 || error == ECKSUM) { + zio_t *ck_zio = zio_root(spa, NULL, NULL, 0); + ck_zio->io_offset = + DVA_GET_OFFSET(&bp->blk_dva[0]); + ck_zio->io_bp = bp; + zio_checksum_compute(ck_zio, ck, pabd, lsize); + printf("%12s\tcksum=%llx:%llx:%llx:%llx\n", + zio_checksum_table[ck].ci_name, + (u_longlong_t)bp->blk_cksum.zc_word[0], + (u_longlong_t)bp->blk_cksum.zc_word[1], + (u_longlong_t)bp->blk_cksum.zc_word[2], + (u_longlong_t)bp->blk_cksum.zc_word[3]); + zio_wait(ck_zio); + } else { + printf("error %d reading block\n", error); + } + spa_config_exit(spa, SCL_STATE, FTAG); + } + } + + if (borrowed) + abd_return_buf_copy(pabd, buf, lsize); + +out: + abd_free(pabd); + umem_free(lbuf, SPA_MAXBLOCKSIZE); +done: + free(flagstr); + free(dup); +} + +static void +zdb_embedded_block(char *thing) +{ + blkptr_t bp; + unsigned long long *words = (void *)&bp; + char *buf; + int err; + + bzero(&bp, sizeof (bp)); + err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" + 
"%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", + words + 0, words + 1, words + 2, words + 3, + words + 4, words + 5, words + 6, words + 7, + words + 8, words + 9, words + 10, words + 11, + words + 12, words + 13, words + 14, words + 15); + if (err != 16) { + (void) fprintf(stderr, "invalid input format\n"); + exit(1); + } + ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); + buf = malloc(SPA_MAXBLOCKSIZE); + if (buf == NULL) { + (void) fprintf(stderr, "out of memory\n"); + exit(1); + } + err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); + if (err != 0) { + (void) fprintf(stderr, "decode failed: %u\n", err); + exit(1); + } + zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); + free(buf); +} + +int +main(int argc, char **argv) +{ + int c; + struct rlimit rl = { 1024, 1024 }; + spa_t *spa = NULL; + objset_t *os = NULL; + int dump_all = 1; + int verbose = 0; + int error = 0; + char **searchdirs = NULL; + int nsearch = 0; + char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *policy = NULL; + uint64_t max_txg = UINT64_MAX; + int64_t objset_id = -1; + int flags = ZFS_IMPORT_MISSING_LOG; + int rewind = ZPOOL_NEVER_REWIND; + char *spa_config_path_env, *objset_str; + boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE; + nvlist_t *cfg = NULL; + + (void) setrlimit(RLIMIT_NOFILE, &rl); + (void) enable_extended_FILE_stdio(-1, -1); + + dprintf_setup(&argc, argv); + + /* + * If there is an environment variable SPA_CONFIG_PATH it overrides + * default spa_config_path setting. If -U flag is specified it will + * override this environment variable settings once again. + */ + spa_config_path_env = getenv("SPA_CONFIG_PATH"); + if (spa_config_path_env != NULL) + spa_config_path = spa_config_path_env; + + /* + * For performance reasons, we set this tunable down. We do so before + * the arg parsing section so that the user can override this value if + * they choose. 
+ */ + zfs_btree_verify_intensity = 3; + + while ((c = getopt(argc, argv, + "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XYyZ")) != -1) { + switch (c) { + case 'b': + case 'c': + case 'C': + case 'd': + case 'D': + case 'E': + case 'G': + case 'h': + case 'i': + case 'l': + case 'm': + case 'M': + case 'O': + case 'R': + case 's': + case 'S': + case 'u': + case 'y': + case 'Z': + dump_opt[c]++; + dump_all = 0; + break; + case 'A': + case 'e': + case 'F': + case 'k': + case 'L': + case 'P': + case 'q': + case 'X': + dump_opt[c]++; + break; + case 'Y': + zfs_reconstruct_indirect_combinations_max = INT_MAX; + zfs_deadman_enabled = 0; + break; + /* NB: Sort single match options below. */ + case 'I': + max_inflight_bytes = strtoull(optarg, NULL, 0); + if (max_inflight_bytes == 0) { + (void) fprintf(stderr, "maximum number " + "of inflight bytes must be greater " + "than 0\n"); + usage(); + } + break; + case 'o': + error = set_global_var(optarg); + if (error != 0) + usage(); + break; + case 'p': + if (searchdirs == NULL) { + searchdirs = umem_alloc(sizeof (char *), + UMEM_NOFAIL); + } else { + char **tmp = umem_alloc((nsearch + 1) * + sizeof (char *), UMEM_NOFAIL); + bcopy(searchdirs, tmp, nsearch * + sizeof (char *)); + umem_free(searchdirs, + nsearch * sizeof (char *)); + searchdirs = tmp; + } + searchdirs[nsearch++] = optarg; + break; + case 't': + max_txg = strtoull(optarg, NULL, 0); + if (max_txg < TXG_INITIAL) { + (void) fprintf(stderr, "incorrect txg " + "specified: %s\n", optarg); + usage(); + } + break; + case 'U': + spa_config_path = optarg; + if (spa_config_path[0] != '/') { + (void) fprintf(stderr, + "cachefile must be an absolute path " + "(i.e. 
start with a slash)\n"); + usage(); + } + break; + case 'v': + verbose++; + break; + case 'V': + flags = ZFS_IMPORT_VERBATIM; + break; + case 'x': + vn_dumpdir = optarg; + break; + default: + usage(); + break; + } + } + + if (!dump_opt['e'] && searchdirs != NULL) { + (void) fprintf(stderr, "-p option requires use of -e\n"); + usage(); + } + if (dump_opt['d']) { + /* <pool>[/<dataset | objset id> is accepted */ + if (argv[2] && (objset_str = strchr(argv[2], '/')) != NULL && + objset_str++ != NULL) { + char *endptr; + errno = 0; + objset_id = strtoull(objset_str, &endptr, 0); + /* dataset 0 is the same as opening the pool */ + if (errno == 0 && endptr != objset_str && + objset_id != 0) { + target_is_spa = B_FALSE; + dataset_lookup = B_TRUE; + } else if (objset_id != 0) { + printf("failed to open objset %s " + "%llu %s", objset_str, + (u_longlong_t)objset_id, + strerror(errno)); + exit(1); + } + /* normal dataset name not an objset ID */ + if (endptr == objset_str) { + objset_id = -1; + } + } + } + +#if defined(_LP64) + /* + * ZDB does not typically re-read blocks; therefore limit the ARC + * to 256 MB, which can be used entirely for metadata. + */ + zfs_arc_min = zfs_arc_meta_min = 2ULL << SPA_MAXBLOCKSHIFT; + zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; +#endif + + /* + * "zdb -c" uses checksum-verifying scrub i/os which are async reads. + * "zdb -b" uses traversal prefetch which uses async reads. + * For good performance, let several of them be active at once. + */ + zfs_vdev_async_read_max_active = 10; + + /* + * Disable reference tracking for better performance. + */ + reference_tracking_enable = B_FALSE; + + /* + * Do not fail spa_load when spa_load_verify fails. This is needed + * to load non-idle pools. 
+ */ + spa_load_verify_dryrun = B_TRUE; + + kernel_init(SPA_MODE_READ); + + if (dump_all) + verbose = MAX(verbose, 1); + + for (c = 0; c < 256; c++) { + if (dump_all && strchr("AeEFklLOPRSXy", c) == NULL) + dump_opt[c] = 1; + if (dump_opt[c]) + dump_opt[c] += verbose; + } + + aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); + zfs_recover = (dump_opt['A'] > 1); + + argc -= optind; + argv += optind; + if (argc < 2 && dump_opt['R']) + usage(); + + if (dump_opt['E']) { + if (argc != 1) + usage(); + zdb_embedded_block(argv[0]); + return (0); + } + + if (argc < 1) { + if (!dump_opt['e'] && dump_opt['C']) { + dump_cachefile(spa_config_path); + return (0); + } + usage(); + } + + if (dump_opt['l']) + return (dump_label(argv[0])); + + if (dump_opt['O']) { + if (argc != 2) + usage(); + dump_opt['v'] = verbose + 3; + return (dump_path(argv[0], argv[1])); + } + + if (dump_opt['X'] || dump_opt['F']) + rewind = ZPOOL_DO_REWIND | + (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); + + if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || + nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 || + nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0) + fatal("internal error: %s", strerror(ENOMEM)); + + error = 0; + target = argv[0]; + + if (strpbrk(target, "/@") != NULL) { + size_t targetlen; + + target_pool = strdup(target); + *strpbrk(target_pool, "/@") = '\0'; + + target_is_spa = B_FALSE; + targetlen = strlen(target); + if (targetlen && target[targetlen - 1] == '/') + target[targetlen - 1] = '\0'; + } else { + target_pool = target; + } + + if (dump_opt['e']) { + importargs_t args = { 0 }; + + args.paths = nsearch; + args.path = searchdirs; + args.can_be_active = B_TRUE; + + error = zpool_find_config(NULL, target_pool, &cfg, &args, + &libzpool_config_ops); + + if (error == 0) { + + if (nvlist_add_nvlist(cfg, + ZPOOL_LOAD_POLICY, policy) != 0) { + fatal("can't open '%s': %s", + target, strerror(ENOMEM)); + } + + if (dump_opt['C'] > 1) { + (void) 
printf("\nConfiguration for import:\n"); + dump_nvlist(cfg, 8); + } + + /* + * Disable the activity check to allow examination of + * active pools. + */ + error = spa_import(target_pool, cfg, NULL, + flags | ZFS_IMPORT_SKIP_MMP); + } + } + + /* + * import_checkpointed_state makes the assumption that the + * target pool that we pass it is already part of the spa + * namespace. Because of that we need to make sure to call + * it always after the -e option has been processed, which + * imports the pool to the namespace if it's not in the + * cachefile. + */ + char *checkpoint_pool = NULL; + char *checkpoint_target = NULL; + if (dump_opt['k']) { + checkpoint_pool = import_checkpointed_state(target, cfg, + &checkpoint_target); + + if (checkpoint_target != NULL) + target = checkpoint_target; + } + + if (target_pool != target) + free(target_pool); + + if (error == 0) { + if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { + ASSERT(checkpoint_pool != NULL); + ASSERT(checkpoint_target == NULL); + + error = spa_open(checkpoint_pool, &spa, FTAG); + if (error != 0) { + fatal("Tried to open pool \"%s\" but " + "spa_open() failed with error %d\n", + checkpoint_pool, error); + } + + } else if (target_is_spa || dump_opt['R'] || objset_id == 0) { + zdb_set_skip_mmp(target); + error = spa_open_rewind(target, &spa, FTAG, policy, + NULL); + if (error) { + /* + * If we're missing the log device then + * try opening the pool after clearing the + * log state. 
+ */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(target)) != NULL && + spa->spa_log_state == SPA_LOG_MISSING) { + spa->spa_log_state = SPA_LOG_CLEAR; + error = 0; + } + mutex_exit(&spa_namespace_lock); + + if (!error) { + error = spa_open_rewind(target, &spa, + FTAG, policy, NULL); + } + } + } else if (strpbrk(target, "#") != NULL) { + dsl_pool_t *dp; + error = dsl_pool_hold(target, FTAG, &dp); + if (error != 0) { + fatal("can't dump '%s': %s", target, + strerror(error)); + } + error = dump_bookmark(dp, target, B_TRUE, verbose > 1); + dsl_pool_rele(dp, FTAG); + if (error != 0) { + fatal("can't dump '%s': %s", target, + strerror(error)); + } + return (error); + } else { + zdb_set_skip_mmp(target); + if (dataset_lookup == B_TRUE) { + /* + * Use the supplied id to get the name + * for open_objset. + */ + error = spa_open(target, &spa, FTAG); + if (error == 0) { + error = name_from_objset_id(spa, + objset_id, dsname); + spa_close(spa, FTAG); + if (error == 0) + target = dsname; + } + } + if (error == 0) + error = open_objset(target, FTAG, &os); + if (error == 0) + spa = dmu_objset_spa(os); + } + } + nvlist_free(policy); + + if (error) + fatal("can't open '%s': %s", target, strerror(error)); + + /* + * Set the pool failure mode to panic in order to prevent the pool + * from suspending. A suspended I/O will have no way to resume and + * can prevent the zdb(8) command from terminating as expected. 
+ */ + if (spa != NULL) + spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; + + argv++; + argc--; + if (!dump_opt['R']) { + flagbits['d'] = ZOR_FLAG_DIRECTORY; + flagbits['f'] = ZOR_FLAG_PLAIN_FILE; + flagbits['m'] = ZOR_FLAG_SPACE_MAP; + flagbits['z'] = ZOR_FLAG_ZAP; + flagbits['A'] = ZOR_FLAG_ALL_TYPES; + + if (argc > 0 && dump_opt['d']) { + zopt_object_args = argc; + zopt_object_ranges = calloc(zopt_object_args, + sizeof (zopt_object_range_t)); + for (unsigned i = 0; i < zopt_object_args; i++) { + int err; + char *msg = NULL; + + err = parse_object_range(argv[i], + &zopt_object_ranges[i], &msg); + if (err != 0) + fatal("Bad object or range: '%s': %s\n", + argv[i], msg ? msg : ""); + } + } else if (argc > 0 && dump_opt['m']) { + zopt_metaslab_args = argc; + zopt_metaslab = calloc(zopt_metaslab_args, + sizeof (uint64_t)); + for (unsigned i = 0; i < zopt_metaslab_args; i++) { + errno = 0; + zopt_metaslab[i] = strtoull(argv[i], NULL, 0); + if (zopt_metaslab[i] == 0 && errno != 0) + fatal("bad number %s: %s", argv[i], + strerror(errno)); + } + } + if (os != NULL) { + dump_objset(os); + } else if (zopt_object_args > 0 && !dump_opt['m']) { + dump_objset(spa->spa_meta_objset); + } else { + dump_zpool(spa); + } + } else { + flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; + flagbits['c'] = ZDB_FLAG_CHECKSUM; + flagbits['d'] = ZDB_FLAG_DECOMPRESS; + flagbits['e'] = ZDB_FLAG_BSWAP; + flagbits['g'] = ZDB_FLAG_GBH; + flagbits['i'] = ZDB_FLAG_INDIRECT; + flagbits['r'] = ZDB_FLAG_RAW; + flagbits['v'] = ZDB_FLAG_VERBOSE; + + for (int i = 0; i < argc; i++) + zdb_read_block(argv[i], spa); + } + + if (dump_opt['k']) { + free(checkpoint_pool); + if (!target_is_spa) + free(checkpoint_target); + } + + if (os != NULL) { + close_objset(os, FTAG); + } else { + spa_close(spa, FTAG); + } + + fuid_table_destroy(); + + dump_debug_buffer(); + + kernel_fini(); + + return (error); +} diff --git a/cmd/zdb/zdb.h b/cmd/zdb/zdb.h new file mode 100644 index 000000000000..49579811efbb --- /dev/null +++ 
b/cmd/zdb/zdb.h @@ -0,0 +1,33 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2017 Spectra Logic Corp Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#ifndef _ZDB_H +#define _ZDB_H + +void dump_intent_log(zilog_t *); +extern uint8_t dump_opt[256]; + +#endif /* _ZDB_H */ diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c new file mode 100644 index 000000000000..c12178effae0 --- /dev/null +++ b/cmd/zdb/zdb_il.c @@ -0,0 +1,431 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2012 Cyril Plisko. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + */ + +/* + * Print intent log header and statistics. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/dmu.h> +#include <sys/stat.h> +#include <sys/resource.h> +#include <sys/zil.h> +#include <sys/zil_impl.h> +#include <sys/spa_impl.h> +#include <sys/abd.h> + +#include "zdb.h" + +extern uint8_t dump_opt[256]; + +static char tab_prefix[4] = "\t\t\t"; + +static void +print_log_bp(const blkptr_t *bp, const char *prefix) +{ + char blkbuf[BP_SPRINTF_LEN]; + + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("%s%s\n", prefix, blkbuf); +} + +/* ARGSUSED */ +static void +zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg) +{ + lr_create_t *lr = arg; + time_t crtime = lr->lr_crtime[0]; + char *name, *link; + lr_attr_t *lrattr; + + name = (char *)(lr + 1); + + if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR || + lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) { + lrattr = (lr_attr_t *)(lr + 1); + name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + } + + if (txtype == TX_SYMLINK) { + link = name + strlen(name) + 1; + (void) printf("%s%s -> %s\n", tab_prefix, name, link); + } else if (txtype != TX_MKXATTR) { + (void) printf("%s%s\n", tab_prefix, name); + } + + (void) printf("%s%s", tab_prefix, ctime(&crtime)); + (void) printf("%sdoid %llu, foid %llu, slots %llu, mode %llo\n", + tab_prefix, (u_longlong_t)lr->lr_doid, + (u_longlong_t)LR_FOID_GET_OBJ(lr->lr_foid), + (u_longlong_t)LR_FOID_GET_SLOTS(lr->lr_foid), + 
(longlong_t)lr->lr_mode); + (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", + tab_prefix, + (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid, + (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev); +} + +/* ARGSUSED */ +static void +zil_prt_rec_remove(zilog_t *zilog, int txtype, void *arg) +{ + lr_remove_t *lr = arg; + + (void) printf("%sdoid %llu, name %s\n", tab_prefix, + (u_longlong_t)lr->lr_doid, (char *)(lr + 1)); +} + +/* ARGSUSED */ +static void +zil_prt_rec_link(zilog_t *zilog, int txtype, void *arg) +{ + lr_link_t *lr = arg; + + (void) printf("%sdoid %llu, link_obj %llu, name %s\n", tab_prefix, + (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj, + (char *)(lr + 1)); +} + +/* ARGSUSED */ +static void +zil_prt_rec_rename(zilog_t *zilog, int txtype, void *arg) +{ + lr_rename_t *lr = arg; + char *snm = (char *)(lr + 1); + char *tnm = snm + strlen(snm) + 1; + + (void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix, + (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid); + (void) printf("%ssrc %s tgt %s\n", tab_prefix, snm, tnm); +} + +/* ARGSUSED */ +static int +zil_prt_rec_write_cb(void *data, size_t len, void *unused) +{ + char *cdata = data; + + for (size_t i = 0; i < len; i++) { + if (isprint(*cdata)) + (void) printf("%c ", *cdata); + else + (void) printf("%2X", *cdata); + cdata++; + } + return (0); +} + +/* ARGSUSED */ +static void +zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg) +{ + lr_write_t *lr = arg; + abd_t *data; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_phys_t zb; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + int error; + + (void) printf("%sfoid %llu, offset %llx, length %llx\n", tab_prefix, + (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, + (u_longlong_t)lr->lr_length); + + if (txtype == TX_WRITE2 || verbose < 5) + return; + + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + (void) printf("%shas blkptr, %s\n", tab_prefix, + !BP_IS_HOLE(bp) && + bp->blk_birth >= 
spa_min_claim_txg(zilog->zl_spa) ? + "will claim" : "won't claim"); + print_log_bp(bp, tab_prefix); + + if (BP_IS_HOLE(bp)) { + (void) printf("\t\t\tLSIZE 0x%llx\n", + (u_longlong_t)BP_GET_LSIZE(bp)); + (void) printf("%s<hole>\n", tab_prefix); + return; + } + if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { + (void) printf("%s<block already committed>\n", + tab_prefix); + return; + } + + SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); + + data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE); + error = zio_wait(zio_read(NULL, zilog->zl_spa, + bp, data, BP_GET_LSIZE(bp), NULL, NULL, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); + if (error) + goto out; + } else { + /* data is stored after the end of the lr_write record */ + data = abd_alloc(lr->lr_length, B_FALSE); + abd_copy_from_buf(data, lr + 1, lr->lr_length); + } + + (void) printf("%s", tab_prefix); + (void) abd_iterate_func(data, + 0, MIN(lr->lr_length, (verbose < 6 ? 
20 : SPA_MAXBLOCKSIZE)), + zil_prt_rec_write_cb, NULL); + (void) printf("\n"); + +out: + abd_free(data); +} + +/* ARGSUSED */ +static void +zil_prt_rec_truncate(zilog_t *zilog, int txtype, void *arg) +{ + lr_truncate_t *lr = arg; + + (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", tab_prefix, + (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset, + (u_longlong_t)lr->lr_length); +} + +/* ARGSUSED */ +static void +zil_prt_rec_setattr(zilog_t *zilog, int txtype, void *arg) +{ + lr_setattr_t *lr = arg; + time_t atime = (time_t)lr->lr_atime[0]; + time_t mtime = (time_t)lr->lr_mtime[0]; + + (void) printf("%sfoid %llu, mask 0x%llx\n", tab_prefix, + (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask); + + if (lr->lr_mask & AT_MODE) { + (void) printf("%sAT_MODE %llo\n", tab_prefix, + (longlong_t)lr->lr_mode); + } + + if (lr->lr_mask & AT_UID) { + (void) printf("%sAT_UID %llu\n", tab_prefix, + (u_longlong_t)lr->lr_uid); + } + + if (lr->lr_mask & AT_GID) { + (void) printf("%sAT_GID %llu\n", tab_prefix, + (u_longlong_t)lr->lr_gid); + } + + if (lr->lr_mask & AT_SIZE) { + (void) printf("%sAT_SIZE %llu\n", tab_prefix, + (u_longlong_t)lr->lr_size); + } + + if (lr->lr_mask & AT_ATIME) { + (void) printf("%sAT_ATIME %llu.%09llu %s", tab_prefix, + (u_longlong_t)lr->lr_atime[0], + (u_longlong_t)lr->lr_atime[1], + ctime(&atime)); + } + + if (lr->lr_mask & AT_MTIME) { + (void) printf("%sAT_MTIME %llu.%09llu %s", tab_prefix, + (u_longlong_t)lr->lr_mtime[0], + (u_longlong_t)lr->lr_mtime[1], + ctime(&mtime)); + } +} + +/* ARGSUSED */ +static void +zil_prt_rec_acl(zilog_t *zilog, int txtype, void *arg) +{ + lr_acl_t *lr = arg; + + (void) printf("%sfoid %llu, aclcnt %llu\n", tab_prefix, + (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt); +} + +typedef void (*zil_prt_rec_func_t)(zilog_t *, int, void *); +typedef struct zil_rec_info { + zil_prt_rec_func_t zri_print; + const char *zri_name; + uint64_t zri_count; +} zil_rec_info_t; + +static zil_rec_info_t 
zil_rec_info[TX_MAX_TYPE] = { + {.zri_print = NULL, .zri_name = "Total "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKXATTR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_SYMLINK "}, + {.zri_print = zil_prt_rec_remove, .zri_name = "TX_REMOVE "}, + {.zri_print = zil_prt_rec_remove, .zri_name = "TX_RMDIR "}, + {.zri_print = zil_prt_rec_link, .zri_name = "TX_LINK "}, + {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME "}, + {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE "}, + {.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE "}, + {.zri_print = zil_prt_rec_setattr, .zri_name = "TX_SETATTR "}, + {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_V0 "}, + {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_ACL "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ATTR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL_ATTR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ATTR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL_ATTR "}, + {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE2 "}, +}; + +/* ARGSUSED */ +static int +print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg) +{ + int txtype; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + + /* reduce size of txtype to strip off TX_CI bit */ + txtype = lr->lrc_txtype; + + ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE); + ASSERT(lr->lrc_txg); + + (void) printf("\t\t%s%s len %6llu, txg %llu, seq %llu\n", + (lr->lrc_txtype & TX_CI) ? 
"CI-" : "", + zil_rec_info[txtype].zri_name, + (u_longlong_t)lr->lrc_reclen, + (u_longlong_t)lr->lrc_txg, + (u_longlong_t)lr->lrc_seq); + + if (txtype && verbose >= 3) { + if (!zilog->zl_os->os_encrypted) { + zil_rec_info[txtype].zri_print(zilog, txtype, lr); + } else { + (void) printf("%s(encrypted)\n", tab_prefix); + } + } + + zil_rec_info[txtype].zri_count++; + zil_rec_info[0].zri_count++; + + return (0); +} + +/* ARGSUSED */ +static int +print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + char blkbuf[BP_SPRINTF_LEN + 10]; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + const char *claim; + + if (verbose <= 3) + return (0); + + if (verbose >= 5) { + (void) strcpy(blkbuf, ", "); + snprintf_blkptr(blkbuf + strlen(blkbuf), + sizeof (blkbuf) - strlen(blkbuf), bp); + } else { + blkbuf[0] = '\0'; + } + + if (claim_txg != 0) + claim = "already claimed"; + else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa)) + claim = "will claim"; + else + claim = "won't claim"; + + (void) printf("\tBlock seqno %llu, %s%s\n", + (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf); + + return (0); +} + +static void +print_log_stats(int verbose) +{ + unsigned i, w, p10; + + if (verbose > 3) + (void) printf("\n"); + + if (zil_rec_info[0].zri_count == 0) + return; + + for (w = 1, p10 = 10; zil_rec_info[0].zri_count >= p10; p10 *= 10) + w++; + + for (i = 0; i < TX_MAX_TYPE; i++) + if (zil_rec_info[i].zri_count || verbose >= 3) + (void) printf("\t\t%s %*llu\n", + zil_rec_info[i].zri_name, w, + (u_longlong_t)zil_rec_info[i].zri_count); + (void) printf("\n"); +} + +/* ARGSUSED */ +void +dump_intent_log(zilog_t *zilog) +{ + const zil_header_t *zh = zilog->zl_header; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + int i; + + if (BP_IS_HOLE(&zh->zh_log) || verbose < 1) + return; + + (void) printf("\n ZIL header: claim_txg %llu, " + "claim_blk_seq %llu, claim_lr_seq %llu", + (u_longlong_t)zh->zh_claim_txg, + 
(u_longlong_t)zh->zh_claim_blk_seq, + (u_longlong_t)zh->zh_claim_lr_seq); + (void) printf(" replay_seq %llu, flags 0x%llx\n", + (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags); + + for (i = 0; i < TX_MAX_TYPE; i++) + zil_rec_info[i].zri_count = 0; + + /* see comment in zil_claim() or zil_check_log_chain() */ + if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && + zh->zh_claim_txg == 0) + return; + + if (verbose >= 2) { + (void) printf("\n"); + (void) zil_parse(zilog, print_log_block, print_log_record, NULL, + zh->zh_claim_txg, B_FALSE); + print_log_stats(verbose); + } +} diff --git a/cmd/zed/.gitignore b/cmd/zed/.gitignore new file mode 100644 index 000000000000..76557bb6bb3a --- /dev/null +++ b/cmd/zed/.gitignore @@ -0,0 +1 @@ +/zed diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am new file mode 100644 index 000000000000..4bd8ac4a53e6 --- /dev/null +++ b/cmd/zed/Makefile.am @@ -0,0 +1,49 @@ +include $(top_srcdir)/config/Rules.am + +AM_CFLAGS += $(LIBUDEV_CFLAGS) $(LIBUUID_CFLAGS) + +SUBDIRS = zed.d + +sbin_PROGRAMS = zed + +ZED_SRC = \ + zed.c \ + zed.h \ + zed_conf.c \ + zed_conf.h \ + zed_disk_event.c \ + zed_disk_event.h \ + zed_event.c \ + zed_event.h \ + zed_exec.c \ + zed_exec.h \ + zed_file.c \ + zed_file.h \ + zed_log.c \ + zed_log.h \ + zed_strings.c \ + zed_strings.h + +FMA_SRC = \ + agents/zfs_agents.c \ + agents/zfs_agents.h \ + agents/zfs_diagnosis.c \ + agents/zfs_mod.c \ + agents/zfs_retire.c \ + agents/fmd_api.c \ + agents/fmd_api.h \ + agents/fmd_serd.c \ + agents/fmd_serd.h + +zed_SOURCES = $(ZED_SRC) $(FMA_SRC) + +zed_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libuutil/libuutil.la + +zed_LDADD += -lrt $(LIBUDEV_LIBS) $(LIBUUID_LIBS) +zed_LDFLAGS = -pthread + +EXTRA_DIST = agents/README.md diff --git a/cmd/zed/agents/README.md b/cmd/zed/agents/README.md new file mode 100644 index 
000000000000..e35b97668a9d --- /dev/null +++ b/cmd/zed/agents/README.md @@ -0,0 +1,112 @@ +## Fault Management Logic for ZED ## + +The integration of Fault Management Daemon (FMD) logic from illumos +is being deployed in three phases. This logic is encapsulated in +several software modules inside ZED. + +### ZED+FM Phase 1 ### + +All the phase 1 work is in current Master branch. Phase I work includes: + +* Add new paths to the persistent VDEV label for device matching. +* Add a disk monitor for generating _disk-add_ and _disk-change_ events. +* Add support for automated VDEV auto-online, auto-replace and auto-expand. +* Expand the statechange event to include all VDEV state transitions. + +### ZED+FM Phase 2 (WIP) ### + +The phase 2 work primarily entails the _Diagnosis Engine_ and the +_Retire Agent_ modules. It also includes infrastructure to support a +crude FMD environment to host these modules. For additional +information see the **FMD Components in ZED** and **Implementation +Notes** sections below. + +### ZED+FM Phase 3 ### + +Future work will add additional functionality and will likely include: + +* Add FMD module garbage collection (periodically call `fmd_module_gc()`). +* Add real module property retrieval (currently hard-coded in accessors). +* Additional diagnosis telemetry (like latency outliers and SMART data). +* Export FMD module statistics. +* Zedlet parallel execution and resiliency (add watchdog). + +### ZFS Fault Management Overview ### + +The primary purpose with ZFS fault management is automated diagnosis +and isolation of VDEV faults. A fault is something we can associate +with an impact (e.g. loss of data redundancy) and a corrective action +(e.g. offline or replace a disk). A typical ZFS fault management stack +is comprised of _error detectors_ (e.g. `zfs_ereport_post()`), a _disk +monitor_, a _diagnosis engine_ and _response agents_. 
+ +After detecting a software error, the ZFS kernel module sends error +events to the ZED user daemon which in turn routes the events to its +internal FMA modules based on their event subscriptions. Likewise, if +a disk is added or changed in the system, the disk monitor sends disk +events which are consumed by a response agent. + +### FMD Components in ZED ### + +There are three FMD modules (aka agents) that are now built into ZED. + + 1. A _Diagnosis Engine_ module (`agents/zfs_diagnosis.c`) + 2. A _Retire Agent_ module (`agents/zfs_retire.c`) + 3. A _Disk Add Agent_ module (`agents/zfs_mod.c`) + +To begin with, a **Diagnosis Engine** consumes per-vdev I/O and checksum +ereports and feeds them into a Soft Error Rate Discrimination (SERD) +algorithm which will generate a corresponding fault diagnosis when the +tracked VDEV encounters **N** events in a given **T** time window. The +initial N and T values for the SERD algorithm are estimates inherited +from illumos (10 errors in 10 minutes). + +In turn, a **Retire Agent** responds to diagnosed faults by isolating +the faulty VDEV. It will notify the ZFS kernel module of the new VDEV +state (degraded or faulted). The retire agent is also responsible for +managing hot spares across all pools. When it encounters a device fault +or a device removal it will replace the device with an appropriate +spare if available. + +Finally, a **Disk Add Agent** responds to events from a libudev disk +monitor (`EC_DEV_ADD` or `EC_DEV_STATUS`) and will online, replace or +expand the associated VDEV. This agent is also known as the `zfs_mod` +or Sysevent Loadable Module (SLM) on the illumos platform. The added +disk is matched to a specific VDEV using its device id, physical path +or VDEV GUID. + +Note that the _auto-replace_ feature (aka hot plug) is opt-in and you +must set the pool's `autoreplace` property to enable it. 
The new disk +will be matched to the corresponding leaf VDEV by physical location +and labeled with a GPT partition before replacing the original VDEV +in the pool. + +### Implementation Notes ### + +* The FMD module API required for logic modules is emulated and implemented + in the `fmd_api.c` and `fmd_serd.c` source files. This support includes + module registration, memory allocation, module property accessors, basic + case management, one-shot timers and SERD engines. + For detailed information on the FMD module API, see the document -- + _"Fault Management Daemon Programmer's Reference Manual"_. + +* The event subscriptions for the modules (located in a module specific + configuration file on illumos) are currently hard-coded into the ZED + `zfs_agent_dispatch()` function. + +* The FMD modules are called one at a time from a single thread that + consumes events queued to the modules. These events are sourced from + the normal ZED events and also include events posted from the diagnosis + engine and the libudev disk event monitor. + +* The FMD code modules have minimal changes and were intentionally left + as similar as possible to their upstream source files. + +* The sysevent namespace in ZED differs from illumos. For example: + * illumos uses `"resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove"` + * Linux uses `"sysevent.fs.zfs.vdev_remove"` + +* The FMD Modules port was produced by Intel Federal, LLC under award + number B609815 between the U.S. Department of Energy (DOE) and Intel + Federal, LLC. + diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c new file mode 100644 index 000000000000..607b387ca3a8 --- /dev/null +++ b/cmd/zed/agents/fmd_api.c @@ -0,0 +1,760 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. + */ + +/* + * This file implements the minimal FMD module API required to support the + * fault logic modules in ZED. This support includes module registration, + * memory allocation, module property accessors, basic case management, + * one-shot timers and SERD engines. + * + * In the ZED runtime, the modules are called from a single thread so no + * locking is required in this emulated FMD environment. 
+ */ + +#include <sys/types.h> +#include <sys/fm/protocol.h> +#include <uuid/uuid.h> +#include <signal.h> +#include <strings.h> +#include <time.h> + +#include "fmd_api.h" +#include "fmd_serd.h" + +#include "zfs_agents.h" +#include "../zed_log.h" + +typedef struct fmd_modstat { + fmd_stat_t ms_accepted; /* total events accepted by module */ + fmd_stat_t ms_caseopen; /* cases currently open */ + fmd_stat_t ms_casesolved; /* total cases solved by module */ + fmd_stat_t ms_caseclosed; /* total cases closed by module */ +} fmd_modstat_t; + +typedef struct fmd_module { + const char *mod_name; /* basename of module (ro) */ + const fmd_hdl_info_t *mod_info; /* module info registered with handle */ + void *mod_spec; /* fmd_hdl_get/setspecific data value */ + fmd_stat_t *mod_ustat; /* module specific custom stats */ + uint_t mod_ustat_cnt; /* count of ustat stats */ + fmd_modstat_t mod_stats; /* fmd built-in per-module statistics */ + fmd_serd_hash_t mod_serds; /* hash of serd engs owned by module */ + char *mod_vers; /* a copy of module version string */ +} fmd_module_t; + +/* + * ZED has two FMD hardwired module instances + */ +fmd_module_t zfs_retire_module; +fmd_module_t zfs_diagnosis_module; + +/* + * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. + */ + +#ifdef DEBUG +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} +#endif + +/* + * Register a module with fmd and finish module initialization. + * Returns an integer indicating whether it succeeded (zero) or + * failed (non-zero). 
+ */ +int +fmd_hdl_register(fmd_hdl_t *hdl, int version, const fmd_hdl_info_t *mip) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + mp->mod_info = mip; + mp->mod_name = mip->fmdi_desc + 4; /* drop 'ZFS ' prefix */ + mp->mod_spec = NULL; + + /* bare minimum module stats */ + (void) strcpy(mp->mod_stats.ms_accepted.fmds_name, "fmd.accepted"); + (void) strcpy(mp->mod_stats.ms_caseopen.fmds_name, "fmd.caseopen"); + (void) strcpy(mp->mod_stats.ms_casesolved.fmds_name, "fmd.casesolved"); + (void) strcpy(mp->mod_stats.ms_caseclosed.fmds_name, "fmd.caseclosed"); + + fmd_serd_hash_create(&mp->mod_serds); + + fmd_hdl_debug(hdl, "register module"); + + return (0); +} + +void +fmd_hdl_unregister(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_modstat_t *msp = &mp->mod_stats; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + + /* dump generic module stats */ + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_accepted.fmds_name, + msp->ms_accepted.fmds_value.ui64); + if (ops->fmdo_close != NULL) { + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseopen.fmds_name, + msp->ms_caseopen.fmds_value.ui64); + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_casesolved.fmds_name, + msp->ms_casesolved.fmds_value.ui64); + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseclosed.fmds_name, + msp->ms_caseclosed.fmds_value.ui64); + } + + /* dump module specific stats */ + if (mp->mod_ustat != NULL) { + int i; + + for (i = 0; i < mp->mod_ustat_cnt; i++) { + fmd_hdl_debug(hdl, "%s: %llu", + mp->mod_ustat[i].fmds_name, + mp->mod_ustat[i].fmds_value.ui64); + } + } + + fmd_serd_hash_destroy(&mp->mod_serds); + + fmd_hdl_debug(hdl, "unregister module"); +} + +/* + * fmd_hdl_setspecific() is used to associate a data pointer with + * the specified handle for the duration of the module's lifetime. + * This pointer can be retrieved using fmd_hdl_getspecific(). 
+ */ +void +fmd_hdl_setspecific(fmd_hdl_t *hdl, void *spec) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + mp->mod_spec = spec; +} + +/* + * Return the module-specific data pointer previously associated + * with the handle using fmd_hdl_setspecific(). + */ +void * +fmd_hdl_getspecific(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (mp->mod_spec); +} + +void * +fmd_hdl_alloc(fmd_hdl_t *hdl, size_t size, int flags) +{ + return (umem_alloc(size, flags)); +} + +void * +fmd_hdl_zalloc(fmd_hdl_t *hdl, size_t size, int flags) +{ + return (umem_zalloc(size, flags)); +} + +void +fmd_hdl_free(fmd_hdl_t *hdl, void *data, size_t size) +{ + umem_free(data, size); +} + +/* + * Record a module debug message using the specified format. + */ +void +fmd_hdl_debug(fmd_hdl_t *hdl, const char *format, ...) +{ + char message[256]; + va_list vargs; + fmd_module_t *mp = (fmd_module_t *)hdl; + + va_start(vargs, format); + (void) vsnprintf(message, sizeof (message), format, vargs); + va_end(vargs); + + /* prefix message with module name */ + zed_log_msg(LOG_INFO, "%s: %s", mp->mod_name, message); +} + +/* Property Retrieval */ + +int32_t +fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name) +{ + /* + * These can be looked up in mp->modinfo->fmdi_props + * For now we just hard code for phase 2. In the + * future, there can be a ZED based override. + */ + if (strcmp(name, "spare_on_remove") == 0) + return (1); + + if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0) + return (10); /* N = 10 events */ + + return (0); +} + +int64_t +fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name) +{ + /* + * These can be looked up in mp->modinfo->fmdi_props + * For now we just hard code for phase 2. In the + * future, there can be a ZED based override. 
+ */ + if (strcmp(name, "remove_timeout") == 0) + return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */ + + if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0) + return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */ + + return (0); +} + +/* FMD Statistics */ + +fmd_stat_t * +fmd_stat_create(fmd_hdl_t *hdl, uint_t flags, uint_t nstats, fmd_stat_t *statv) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + if (flags == FMD_STAT_NOALLOC) { + mp->mod_ustat = statv; + mp->mod_ustat_cnt = nstats; + } + + return (statv); +} + +/* Case Management */ + +fmd_case_t * +fmd_case_open(fmd_hdl_t *hdl, void *data) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + uuid_t uuid; + + fmd_case_t *cp; + + cp = fmd_hdl_zalloc(hdl, sizeof (fmd_case_t), FMD_SLEEP); + cp->ci_mod = hdl; + cp->ci_state = FMD_CASE_UNSOLVED; + cp->ci_flags = FMD_CF_DIRTY; + cp->ci_data = data; + cp->ci_bufptr = NULL; + cp->ci_bufsiz = 0; + + uuid_generate(uuid); + uuid_unparse(uuid, cp->ci_uuid); + + fmd_hdl_debug(hdl, "case opened (%s)", cp->ci_uuid); + mp->mod_stats.ms_caseopen.fmds_value.ui64++; + + return (cp); +} + +void +fmd_case_solve(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + /* + * For ZED, the event was already sent from fmd_case_add_suspect() + */ + + if (cp->ci_state >= FMD_CASE_SOLVED) + fmd_hdl_debug(hdl, "case is already solved or closed"); + + cp->ci_state = FMD_CASE_SOLVED; + + fmd_hdl_debug(hdl, "case solved (%s)", cp->ci_uuid); + mp->mod_stats.ms_casesolved.fmds_value.ui64++; +} + +void +fmd_case_close(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + + fmd_hdl_debug(hdl, "case closed (%s)", cp->ci_uuid); + + if (ops->fmdo_close != NULL) + ops->fmdo_close(hdl, cp); + + mp->mod_stats.ms_caseopen.fmds_value.ui64--; + mp->mod_stats.ms_caseclosed.fmds_value.ui64++; + + if (cp->ci_bufptr != NULL && cp->ci_bufsiz > 0) + fmd_hdl_free(hdl, cp->ci_bufptr, 
cp->ci_bufsiz); + + fmd_hdl_free(hdl, cp, sizeof (fmd_case_t)); +} + +void +fmd_case_uuresolved(fmd_hdl_t *hdl, const char *uuid) +{ + fmd_hdl_debug(hdl, "case resolved by uuid (%s)", uuid); +} + +int +fmd_case_solved(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + return ((cp->ci_state >= FMD_CASE_SOLVED) ? FMD_B_TRUE : FMD_B_FALSE); +} + +void +fmd_case_add_ereport(fmd_hdl_t *hdl, fmd_case_t *cp, fmd_event_t *ep) +{ +} + +static void +zed_log_fault(nvlist_t *nvl, const char *uuid, const char *code) +{ + nvlist_t *rsrc; + char *strval; + uint64_t guid; + uint8_t byte; + + zed_log_msg(LOG_INFO, "\nzed_fault_event:"); + + if (uuid != NULL) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_UUID, uuid); + if (nvlist_lookup_string(nvl, FM_CLASS, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_CLASS, strval); + if (code != NULL) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_DIAG_CODE, code); + if (nvlist_lookup_uint8(nvl, FM_FAULT_CERTAINTY, &byte) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FAULT_CERTAINTY, byte); + if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) { + if (nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_FMRI_SCHEME, + strval); + if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_POOL, &guid) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FMRI_ZFS_POOL, + guid); + if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_VDEV, &guid) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu \n", FM_FMRI_ZFS_VDEV, + guid); + } +} + +static const char * +fmd_fault_mkcode(nvlist_t *fault) +{ + char *class, *code = "-"; + + /* + * Note: message codes come from: openzfs/usr/src/cmd/fm/dicts/ZFS.po + */ + if (nvlist_lookup_string(fault, FM_CLASS, &class) == 0) { + if (strcmp(class, "fault.fs.zfs.vdev.io") == 0) + code = "ZFS-8000-FD"; + else if (strcmp(class, "fault.fs.zfs.vdev.checksum") == 0) + code = "ZFS-8000-GH"; + else if (strcmp(class, "fault.fs.zfs.io_failure_wait") == 0) + code = "ZFS-8000-HC"; + else if (strcmp(class, 
"fault.fs.zfs.io_failure_continue") == 0) + code = "ZFS-8000-JQ"; + else if (strcmp(class, "fault.fs.zfs.log_replay") == 0) + code = "ZFS-8000-K4"; + else if (strcmp(class, "fault.fs.zfs.pool") == 0) + code = "ZFS-8000-CS"; + else if (strcmp(class, "fault.fs.zfs.device") == 0) + code = "ZFS-8000-D3"; + + } + return (code); +} + +void +fmd_case_add_suspect(fmd_hdl_t *hdl, fmd_case_t *cp, nvlist_t *fault) +{ + nvlist_t *nvl; + const char *code = fmd_fault_mkcode(fault); + int64_t tod[2]; + int err = 0; + + /* + * payload derived from fmd_protocol_list() + */ + + (void) gettimeofday(&cp->ci_tv, NULL); + tod[0] = cp->ci_tv.tv_sec; + tod[1] = cp->ci_tv.tv_usec; + + nvl = fmd_nvl_alloc(hdl, FMD_SLEEP); + + err |= nvlist_add_uint8(nvl, FM_VERSION, FM_SUSPECT_VERSION); + err |= nvlist_add_string(nvl, FM_CLASS, FM_LIST_SUSPECT_CLASS); + err |= nvlist_add_string(nvl, FM_SUSPECT_UUID, cp->ci_uuid); + err |= nvlist_add_string(nvl, FM_SUSPECT_DIAG_CODE, code); + err |= nvlist_add_int64_array(nvl, FM_SUSPECT_DIAG_TIME, tod, 2); + err |= nvlist_add_uint32(nvl, FM_SUSPECT_FAULT_SZ, 1); + err |= nvlist_add_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, &fault, 1); + + if (err) + zed_log_die("failed to populate nvlist"); + + zed_log_fault(fault, cp->ci_uuid, code); + zfs_agent_post_event(FM_LIST_SUSPECT_CLASS, NULL, nvl); + + nvlist_free(nvl); + nvlist_free(fault); +} + +void +fmd_case_setspecific(fmd_hdl_t *hdl, fmd_case_t *cp, void *data) +{ + cp->ci_data = data; +} + +void * +fmd_case_getspecific(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + return (cp->ci_data); +} + +void +fmd_buf_create(fmd_hdl_t *hdl, fmd_case_t *cp, const char *name, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr == NULL); + assert(size < (1024 * 1024)); + + cp->ci_bufptr = fmd_hdl_alloc(hdl, size, FMD_SLEEP); + cp->ci_bufsiz = size; +} + +void +fmd_buf_read(fmd_hdl_t *hdl, fmd_case_t *cp, + const char *name, void *buf, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr 
!= NULL); + assert(size <= cp->ci_bufsiz); + + bcopy(cp->ci_bufptr, buf, size); +} + +void +fmd_buf_write(fmd_hdl_t *hdl, fmd_case_t *cp, + const char *name, const void *buf, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr != NULL); + assert(cp->ci_bufsiz >= size); + + bcopy(buf, cp->ci_bufptr, size); +} + +/* SERD Engines */ + +void +fmd_serd_create(fmd_hdl_t *hdl, const char *name, uint_t n, hrtime_t t) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + if (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL) { + zed_log_msg(LOG_ERR, "failed to create SERD engine '%s': " + " name already exists", name); + return; + } + + (void) fmd_serd_eng_insert(&mp->mod_serds, name, n, t); +} + +void +fmd_serd_destroy(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + fmd_serd_eng_delete(&mp->mod_serds, name); + + fmd_hdl_debug(hdl, "serd_destroy %s", name); +} + +int +fmd_serd_exists(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL); +} + +void +fmd_serd_reset(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; + + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); + return; + } + + fmd_serd_eng_reset(sgp); + + fmd_hdl_debug(hdl, "serd_reset %s", name); +} + +int +fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; + int err; + + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'", + name); + return (FMD_B_FALSE); + } + err = fmd_serd_eng_record(sgp, ep->ev_hrt); + + return (err); +} + +/* FMD Timers */ + +static void +_timer_notify(union sigval sv) +{ + fmd_timer_t *ftp = sv.sival_ptr; + fmd_hdl_t *hdl = ftp->ft_hdl; + fmd_module_t *mp = 
(fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + struct itimerspec its; + + fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid); + + /* disarm the timer */ + bzero(&its, sizeof (struct itimerspec)); + timer_settime(ftp->ft_tid, 0, &its, NULL); + + /* Note that the fmdo_timeout can remove this timer */ + if (ops->fmdo_timeout != NULL) + ops->fmdo_timeout(hdl, ftp, ftp->ft_arg); +} + +/* + * Install a new timer which will fire at least delta nanoseconds after the + * current time. After the timeout has expired, the module's fmdo_timeout + * entry point is called. + */ +fmd_timer_t * +fmd_timer_install(fmd_hdl_t *hdl, void *arg, fmd_event_t *ep, hrtime_t delta) +{ + struct sigevent sev; + struct itimerspec its; + fmd_timer_t *ftp; + + ftp = fmd_hdl_alloc(hdl, sizeof (fmd_timer_t), FMD_SLEEP); + ftp->ft_arg = arg; + ftp->ft_hdl = hdl; + + its.it_value.tv_sec = delta / 1000000000; + its.it_value.tv_nsec = delta % 1000000000; + its.it_interval.tv_sec = its.it_value.tv_sec; + its.it_interval.tv_nsec = its.it_value.tv_nsec; + + sev.sigev_notify = SIGEV_THREAD; + sev.sigev_notify_function = _timer_notify; + sev.sigev_notify_attributes = NULL; + sev.sigev_value.sival_ptr = ftp; + + timer_create(CLOCK_REALTIME, &sev, &ftp->ft_tid); + timer_settime(ftp->ft_tid, 0, &its, NULL); + + fmd_hdl_debug(hdl, "installing timer for %d secs (%p)", + (int)its.it_value.tv_sec, ftp->ft_tid); + + return (ftp); +} + +void +fmd_timer_remove(fmd_hdl_t *hdl, fmd_timer_t *ftp) +{ + fmd_hdl_debug(hdl, "removing timer (%p)", ftp->ft_tid); + + timer_delete(ftp->ft_tid); + + fmd_hdl_free(hdl, ftp, sizeof (fmd_timer_t)); +} + +/* Name-Value Pair Lists */ + +nvlist_t * +fmd_nvl_create_fault(fmd_hdl_t *hdl, const char *class, uint8_t certainty, + nvlist_t *asru, nvlist_t *fru, nvlist_t *resource) +{ + nvlist_t *nvl; + int err = 0; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + zed_log_die("failed to xalloc fault nvlist"); + + err |= nvlist_add_uint8(nvl, FM_VERSION, 
FM_FAULT_VERSION); + err |= nvlist_add_string(nvl, FM_CLASS, class); + err |= nvlist_add_uint8(nvl, FM_FAULT_CERTAINTY, certainty); + + if (asru != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_ASRU, asru); + if (fru != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_FRU, fru); + if (resource != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_RESOURCE, resource); + + if (err) + zed_log_die("failed to populate nvlist: %s\n", strerror(err)); + + return (nvl); +} + +/* + * sourced from fmd_string.c + */ +static int +fmd_strmatch(const char *s, const char *p) +{ + char c; + + if (p == NULL) + return (0); + + if (s == NULL) + s = ""; /* treat NULL string as the empty string */ + + do { + if ((c = *p++) == '\0') + return (*s == '\0'); + + if (c == '*') { + while (*p == '*') + p++; /* consecutive *'s can be collapsed */ + + if (*p == '\0') + return (1); + + while (*s != '\0') { + if (fmd_strmatch(s++, p) != 0) + return (1); + } + + return (0); + } + } while (c == *s++); + + return (0); +} + +int +fmd_nvl_class_match(fmd_hdl_t *hdl, nvlist_t *nvl, const char *pattern) +{ + char *class; + + return (nvl != NULL && + nvlist_lookup_string(nvl, FM_CLASS, &class) == 0 && + fmd_strmatch(class, pattern)); +} + +nvlist_t * +fmd_nvl_alloc(fmd_hdl_t *hdl, int flags) +{ + nvlist_t *nvl = NULL; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + + return (nvl); +} + + +/* + * ZED Agent specific APIs + */ + +fmd_hdl_t * +fmd_module_hdl(const char *name) +{ + if (strcmp(name, "zfs-retire") == 0) + return ((fmd_hdl_t *)&zfs_retire_module); + if (strcmp(name, "zfs-diagnosis") == 0) + return ((fmd_hdl_t *)&zfs_diagnosis_module); + + return (NULL); +} + +boolean_t +fmd_module_initialized(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (mp->mod_info != NULL); +} + +/* + * fmd_module_recv is called for each event that is received by + * the fault manager that has a class that matches one of the + * module's subscriptions. 
+ */ +void +fmd_module_recv(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + fmd_event_t faux_event = {0}; + int64_t *tv; + uint_t n; + + /* + * Will need to normalized this if we persistently store the case data + */ + if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0) + faux_event.ev_hrt = tv[0] * NANOSEC + tv[1]; + else + faux_event.ev_hrt = 0; + + ops->fmdo_recv(hdl, &faux_event, nvl, class); + + mp->mod_stats.ms_accepted.fmds_value.ui64++; + + /* TBD - should we initiate fm_module_gc() periodically? */ +} diff --git a/cmd/zed/agents/fmd_api.h b/cmd/zed/agents/fmd_api.h new file mode 100644 index 000000000000..4f06fb244b7b --- /dev/null +++ b/cmd/zed/agents/fmd_api.h @@ -0,0 +1,246 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#ifndef _FMD_API_H +#define _FMD_API_H + +#include <sys/types.h> +#include <sys/time.h> +#include <time.h> +#include <libnvpair.h> +#include <stdarg.h> +#include <umem.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Fault Management Daemon Client Interfaces + */ + +#define FMD_API_VERSION 5 + +typedef struct fmd_hdl fmd_hdl_t; + +typedef struct fmd_timer { + timer_t ft_tid; + void *ft_arg; + fmd_hdl_t *ft_hdl; +} fmd_timer_t; + +#define id_t fmd_timer_t * + + +typedef struct fmd_event { + hrtime_t ev_hrt; /* event time used by SERD engines */ +} fmd_event_t; + +typedef struct fmd_case { + char ci_uuid[48]; /* uuid string for this case */ + fmd_hdl_t *ci_mod; /* module that owns this case */ + void *ci_data; /* data from fmd_case_setspecific() */ + ushort_t ci_state; /* case state (see below) */ + ushort_t ci_flags; /* case flags (see below) */ + struct timeval ci_tv; /* time of original diagnosis */ + void *ci_bufptr; /* case data serialization buffer */ + size_t ci_bufsiz; +} fmd_case_t; + + +#define FMD_B_FALSE 0 /* false value for booleans as int */ +#define FMD_B_TRUE 1 /* true value for booleans as int */ + + +#define FMD_CASE_UNSOLVED 0 /* case is not yet solved (waiting) */ +#define FMD_CASE_SOLVED 1 /* case is solved (suspects added) */ +#define FMD_CASE_CLOSE_WAIT 2 /* case is executing fmdo_close() */ +#define FMD_CASE_CLOSED 3 /* case is closed (reconfig done) */ +#define FMD_CASE_REPAIRED 4 /* case is repaired */ +#define FMD_CASE_RESOLVED 5 /* case is resolved (can be freed) */ + +#define FMD_CF_DIRTY 0x01 /* case is in need of checkpoint */ +#define FMD_CF_SOLVED 0x02 /* case has been solved */ +#define FMD_CF_ISOLATED 0x04 /* case has been isolated */ +#define FMD_CF_REPAIRED 0x08 /* case has been repaired */ +#define FMD_CF_RESOLVED 0x10 /* case has been resolved */ + + +#define FMD_TYPE_BOOL 0 /* int */ +#define FMD_TYPE_INT32 1 /* int32_t */ +#define FMD_TYPE_UINT32 2 /* uint32_t */ +#define FMD_TYPE_INT64 3 /* int64_t */ +#define 
FMD_TYPE_UINT64 4 /* uint64_t */ +#define FMD_TYPE_TIME 5 /* uint64_t */ +#define FMD_TYPE_SIZE 6 /* uint64_t */ + +typedef struct fmd_prop { + const char *fmdp_name; /* property name */ + uint_t fmdp_type; /* property type (see above) */ + const char *fmdp_defv; /* default value */ +} fmd_prop_t; + +typedef struct fmd_stat { + char fmds_name[32]; /* statistic name */ + uint_t fmds_type; /* statistic type (see above) */ + char fmds_desc[64]; /* statistic description */ + union { + int bool; /* FMD_TYPE_BOOL */ + int32_t i32; /* FMD_TYPE_INT32 */ + uint32_t ui32; /* FMD_TYPE_UINT32 */ + int64_t i64; /* FMD_TYPE_INT64 */ + uint64_t ui64; /* FMD_TYPE_UINT64 */ + } fmds_value; +} fmd_stat_t; + +typedef struct fmd_hdl_ops { + void (*fmdo_recv)(fmd_hdl_t *, fmd_event_t *, nvlist_t *, const char *); + void (*fmdo_timeout)(fmd_hdl_t *, id_t, void *); + void (*fmdo_close)(fmd_hdl_t *, fmd_case_t *); + void (*fmdo_stats)(fmd_hdl_t *); + void (*fmdo_gc)(fmd_hdl_t *); +} fmd_hdl_ops_t; + +#define FMD_SEND_SUCCESS 0 /* fmdo_send queued event */ +#define FMD_SEND_FAILED 1 /* fmdo_send unrecoverable error */ +#define FMD_SEND_RETRY 2 /* fmdo_send requests retry */ + +typedef struct fmd_hdl_info { + const char *fmdi_desc; /* fmd client description string */ + const char *fmdi_vers; /* fmd client version string */ + const fmd_hdl_ops_t *fmdi_ops; /* ops vector for client */ + const fmd_prop_t *fmdi_props; /* array of configuration props */ +} fmd_hdl_info_t; + +extern int fmd_hdl_register(fmd_hdl_t *, int, const fmd_hdl_info_t *); +extern void fmd_hdl_unregister(fmd_hdl_t *); + +extern void fmd_hdl_setspecific(fmd_hdl_t *, void *); +extern void *fmd_hdl_getspecific(fmd_hdl_t *); + +#define FMD_SLEEP UMEM_NOFAIL + +extern void *fmd_hdl_alloc(fmd_hdl_t *, size_t, int); +extern void *fmd_hdl_zalloc(fmd_hdl_t *, size_t, int); +extern void fmd_hdl_free(fmd_hdl_t *, void *, size_t); + +extern char *fmd_hdl_strdup(fmd_hdl_t *, const char *, int); +extern void fmd_hdl_strfree(fmd_hdl_t *, 
char *); + +extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list); +extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...); + +extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *); +extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *); + +#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */ +#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */ + +extern fmd_stat_t *fmd_stat_create(fmd_hdl_t *, uint_t, uint_t, fmd_stat_t *); +extern void fmd_stat_destroy(fmd_hdl_t *, uint_t, fmd_stat_t *); +extern void fmd_stat_setstr(fmd_hdl_t *, fmd_stat_t *, const char *); + +extern fmd_case_t *fmd_case_open(fmd_hdl_t *, void *); +extern void fmd_case_reset(fmd_hdl_t *, fmd_case_t *); +extern void fmd_case_solve(fmd_hdl_t *, fmd_case_t *); +extern void fmd_case_close(fmd_hdl_t *, fmd_case_t *); + +extern const char *fmd_case_uuid(fmd_hdl_t *, fmd_case_t *); +extern fmd_case_t *fmd_case_uulookup(fmd_hdl_t *, const char *); +extern void fmd_case_uuclose(fmd_hdl_t *, const char *); +extern int fmd_case_uuclosed(fmd_hdl_t *, const char *); +extern int fmd_case_uuisresolved(fmd_hdl_t *, const char *); +extern void fmd_case_uuresolved(fmd_hdl_t *, const char *); + +extern int fmd_case_solved(fmd_hdl_t *, fmd_case_t *); +extern int fmd_case_closed(fmd_hdl_t *, fmd_case_t *); + +extern void fmd_case_add_ereport(fmd_hdl_t *, fmd_case_t *, fmd_event_t *); +extern void fmd_case_add_serd(fmd_hdl_t *, fmd_case_t *, const char *); +extern void fmd_case_add_suspect(fmd_hdl_t *, fmd_case_t *, nvlist_t *); + +extern void fmd_case_setspecific(fmd_hdl_t *, fmd_case_t *, void *); +extern void *fmd_case_getspecific(fmd_hdl_t *, fmd_case_t *); + +extern fmd_case_t *fmd_case_next(fmd_hdl_t *, fmd_case_t *); +extern fmd_case_t *fmd_case_prev(fmd_hdl_t *, fmd_case_t *); + +extern void fmd_buf_create(fmd_hdl_t *, fmd_case_t *, const char *, size_t); +extern void fmd_buf_destroy(fmd_hdl_t *, fmd_case_t *, const char *); +extern void 
fmd_buf_read(fmd_hdl_t *, fmd_case_t *, + const char *, void *, size_t); +extern void fmd_buf_write(fmd_hdl_t *, fmd_case_t *, + const char *, const void *, size_t); +extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *); + +extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t); +extern void fmd_serd_destroy(fmd_hdl_t *, const char *); +extern int fmd_serd_exists(fmd_hdl_t *, const char *); +extern void fmd_serd_reset(fmd_hdl_t *, const char *); +extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *); +extern int fmd_serd_fired(fmd_hdl_t *, const char *); +extern int fmd_serd_empty(fmd_hdl_t *, const char *); + +extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t); +extern void fmd_timer_remove(fmd_hdl_t *, id_t); + +extern nvlist_t *fmd_nvl_create_fault(fmd_hdl_t *, + const char *, uint8_t, nvlist_t *, nvlist_t *, nvlist_t *); + +extern int fmd_nvl_class_match(fmd_hdl_t *, nvlist_t *, const char *); + +#define FMD_HAS_FAULT_FRU 0 +#define FMD_HAS_FAULT_ASRU 1 +#define FMD_HAS_FAULT_RESOURCE 2 + +extern void fmd_repair_fru(fmd_hdl_t *, const char *); +extern int fmd_repair_asru(fmd_hdl_t *, const char *); + +extern nvlist_t *fmd_nvl_alloc(fmd_hdl_t *, int); +extern nvlist_t *fmd_nvl_dup(fmd_hdl_t *, nvlist_t *, int); + +/* + * ZED Specific Interfaces + */ + +extern fmd_hdl_t *fmd_module_hdl(const char *); +extern boolean_t fmd_module_initialized(fmd_hdl_t *); +extern void fmd_module_recv(fmd_hdl_t *, nvlist_t *, const char *); + +/* ZFS FMA Retire Agent */ +extern void _zfs_retire_init(fmd_hdl_t *); +extern void _zfs_retire_fini(fmd_hdl_t *); + +/* ZFS FMA Diagnosis Engine */ +extern void _zfs_diagnosis_init(fmd_hdl_t *); +extern void _zfs_diagnosis_fini(fmd_hdl_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _FMD_API_H */ diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c new file mode 100644 index 000000000000..d4ec37fb7691 --- /dev/null +++ b/cmd/zed/agents/fmd_serd.c @@ 
-0,0 +1,316 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. + */ + +#include <assert.h> +#include <stddef.h> +#include <stdlib.h> +#include <strings.h> +#include <sys/list.h> +#include <sys/time.h> + +#include "fmd_api.h" +#include "fmd_serd.h" +#include "../zed_log.h" + + +#define FMD_STR_BUCKETS 211 + + +#ifdef SERD_ENG_DEBUG +#define serd_log_msg(fmt, ...) \ + zed_log_msg(LOG_INFO, fmt, __VA_ARGS__) +#else +#define serd_log_msg(fmt, ...) +#endif + + +/* + * SERD Engine Backend + */ + +/* + * Compute the delta between events in nanoseconds. To account for very old + * events which are replayed, we must handle the case where time is negative. + * We convert the hrtime_t's to unsigned 64-bit integers and then handle the + * case where 'old' is greater than 'new' (i.e. high-res time has wrapped). + */ +static hrtime_t +fmd_event_delta(hrtime_t t1, hrtime_t t2) +{ + uint64_t old = t1; + uint64_t new = t2; + + return (new >= old ? 
new - old : (UINT64_MAX - old) + new + 1); +} + +static fmd_serd_eng_t * +fmd_serd_eng_alloc(const char *name, uint64_t n, hrtime_t t) +{ + fmd_serd_eng_t *sgp; + + sgp = malloc(sizeof (fmd_serd_eng_t)); + bzero(sgp, sizeof (fmd_serd_eng_t)); + + sgp->sg_name = strdup(name); + sgp->sg_flags = FMD_SERD_DIRTY; + sgp->sg_n = n; + sgp->sg_t = t; + + list_create(&sgp->sg_list, sizeof (fmd_serd_elem_t), + offsetof(fmd_serd_elem_t, se_list)); + + return (sgp); +} + +static void +fmd_serd_eng_free(fmd_serd_eng_t *sgp) +{ + fmd_serd_eng_reset(sgp); + free(sgp->sg_name); + list_destroy(&sgp->sg_list); + free(sgp); +} + +/* + * sourced from fmd_string.c + */ +static ulong_t +fmd_strhash(const char *key) +{ + ulong_t g, h = 0; + const char *p; + + for (p = key; *p != '\0'; p++) { + h = (h << 4) + *p; + + if ((g = (h & 0xf0000000)) != 0) { + h ^= (g >> 24); + h ^= g; + } + } + + return (h); +} + +void +fmd_serd_hash_create(fmd_serd_hash_t *shp) +{ + shp->sh_hashlen = FMD_STR_BUCKETS; + shp->sh_hash = calloc(shp->sh_hashlen, sizeof (void *)); + shp->sh_count = 0; +} + +void +fmd_serd_hash_destroy(fmd_serd_hash_t *shp) +{ + fmd_serd_eng_t *sgp, *ngp; + uint_t i; + + for (i = 0; i < shp->sh_hashlen; i++) { + for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = ngp) { + ngp = sgp->sg_next; + fmd_serd_eng_free(sgp); + } + } + + free(shp->sh_hash); + bzero(shp, sizeof (fmd_serd_hash_t)); +} + +void +fmd_serd_hash_apply(fmd_serd_hash_t *shp, fmd_serd_eng_f *func, void *arg) +{ + fmd_serd_eng_t *sgp; + uint_t i; + + for (i = 0; i < shp->sh_hashlen; i++) { + for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = sgp->sg_next) + func(sgp, arg); + } +} + +fmd_serd_eng_t * +fmd_serd_eng_insert(fmd_serd_hash_t *shp, const char *name, + uint_t n, hrtime_t t) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp = fmd_serd_eng_alloc(name, n, t); + + serd_log_msg(" SERD Engine: inserting %s N %d T %llu", + name, (int)n, (long long unsigned)t); + + sgp->sg_next = shp->sh_hash[h]; + 
shp->sh_hash[h] = sgp; + shp->sh_count++; + + return (sgp); +} + +fmd_serd_eng_t * +fmd_serd_eng_lookup(fmd_serd_hash_t *shp, const char *name) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp; + + for (sgp = shp->sh_hash[h]; sgp != NULL; sgp = sgp->sg_next) { + if (strcmp(name, sgp->sg_name) == 0) + return (sgp); + } + + return (NULL); +} + +void +fmd_serd_eng_delete(fmd_serd_hash_t *shp, const char *name) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp, **pp = &shp->sh_hash[h]; + + serd_log_msg(" SERD Engine: deleting %s", name); + + for (sgp = *pp; sgp != NULL; sgp = sgp->sg_next) { + if (strcmp(sgp->sg_name, name) != 0) + pp = &sgp->sg_next; + else + break; + } + + if (sgp != NULL) { + *pp = sgp->sg_next; + fmd_serd_eng_free(sgp); + assert(shp->sh_count != 0); + shp->sh_count--; + } +} + +static void +fmd_serd_eng_discard(fmd_serd_eng_t *sgp, fmd_serd_elem_t *sep) +{ + list_remove(&sgp->sg_list, sep); + sgp->sg_count--; + + serd_log_msg(" SERD Engine: discarding %s, %d remaining", + sgp->sg_name, (int)sgp->sg_count); + + free(sep); +} + +int +fmd_serd_eng_record(fmd_serd_eng_t *sgp, hrtime_t hrt) +{ + fmd_serd_elem_t *sep, *oep; + + /* + * If the fired flag is already set, return false and discard the + * event. This means that the caller will only see the engine "fire" + * once until fmd_serd_eng_reset() is called. The fmd_serd_eng_fired() + * function can also be used in combination with fmd_serd_eng_record(). 
+ */ + if (sgp->sg_flags & FMD_SERD_FIRED) { + serd_log_msg(" SERD Engine: record %s already fired!", + sgp->sg_name); + return (FMD_B_FALSE); + } + + while (sgp->sg_count >= sgp->sg_n) + fmd_serd_eng_discard(sgp, list_tail(&sgp->sg_list)); + + sep = malloc(sizeof (fmd_serd_elem_t)); + sep->se_hrt = hrt; + + list_insert_head(&sgp->sg_list, sep); + sgp->sg_count++; + + serd_log_msg(" SERD Engine: recording %s of %d (%llu)", + sgp->sg_name, (int)sgp->sg_count, (long long unsigned)hrt); + + /* + * Pick up the oldest element pointer for comparison to 'sep'. We must + * do this after adding 'sep' because 'oep' and 'sep' can be the same. + */ + oep = list_tail(&sgp->sg_list); + + if (sgp->sg_count >= sgp->sg_n && + fmd_event_delta(oep->se_hrt, sep->se_hrt) <= sgp->sg_t) { + sgp->sg_flags |= FMD_SERD_FIRED | FMD_SERD_DIRTY; + serd_log_msg(" SERD Engine: fired %s", sgp->sg_name); + return (FMD_B_TRUE); + } + + sgp->sg_flags |= FMD_SERD_DIRTY; + return (FMD_B_FALSE); +} + +int +fmd_serd_eng_fired(fmd_serd_eng_t *sgp) +{ + return (sgp->sg_flags & FMD_SERD_FIRED); +} + +int +fmd_serd_eng_empty(fmd_serd_eng_t *sgp) +{ + return (sgp->sg_count == 0); +} + +void +fmd_serd_eng_reset(fmd_serd_eng_t *sgp) +{ + serd_log_msg(" SERD Engine: resetting %s", sgp->sg_name); + + while (sgp->sg_count != 0) + fmd_serd_eng_discard(sgp, list_head(&sgp->sg_list)); + + sgp->sg_flags &= ~FMD_SERD_FIRED; + sgp->sg_flags |= FMD_SERD_DIRTY; +} + +void +fmd_serd_eng_gc(fmd_serd_eng_t *sgp) +{ + fmd_serd_elem_t *sep, *nep; + hrtime_t hrt; + + if (sgp->sg_count == 0 || (sgp->sg_flags & FMD_SERD_FIRED)) + return; /* no garbage collection needed if empty or fired */ + + sep = list_head(&sgp->sg_list); + if (sep == NULL) + return; + + hrt = sep->se_hrt - sgp->sg_t; + + for (sep = list_head(&sgp->sg_list); sep != NULL; sep = nep) { + if (sep->se_hrt >= hrt) + break; /* sep and subsequent events are all within T */ + + nep = list_next(&sgp->sg_list, sep); + fmd_serd_eng_discard(sgp, sep); + sgp->sg_flags |= 
FMD_SERD_DIRTY; + } +} diff --git a/cmd/zed/agents/fmd_serd.h b/cmd/zed/agents/fmd_serd.h new file mode 100644 index 000000000000..c35c9acc7785 --- /dev/null +++ b/cmd/zed/agents/fmd_serd.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#ifndef _FMD_SERD_H +#define _FMD_SERD_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/list.h> +#include <sys/time.h> + +typedef struct fmd_serd_elem { + list_node_t se_list; /* linked list forward/back pointers */ + hrtime_t se_hrt; /* upper bound on event hrtime */ +} fmd_serd_elem_t; + +typedef struct fmd_serd_eng { + char *sg_name; /* string name for this engine */ + struct fmd_serd_eng *sg_next; /* next engine on hash chain */ + list_t sg_list; /* list of fmd_serd_elem_t's */ + uint_t sg_count; /* count of events in sg_list */ + uint_t sg_flags; /* engine flags (see below) */ + uint_t sg_n; /* engine N parameter (event count) */ + hrtime_t sg_t; /* engine T parameter (nanoseconds) */ +} fmd_serd_eng_t; + +#define FMD_SERD_FIRED 0x1 /* error rate has exceeded threshold */ +#define FMD_SERD_DIRTY 0x2 /* engine needs to be checkpointed */ + +typedef void fmd_serd_eng_f(fmd_serd_eng_t *, void *); + +typedef struct fmd_serd_hash { + fmd_serd_eng_t **sh_hash; /* hash bucket array for buffers */ + uint_t sh_hashlen; /* length of hash bucket array */ + uint_t sh_count; /* count of engines in hash */ +} fmd_serd_hash_t; + +extern void fmd_serd_hash_create(fmd_serd_hash_t *); +extern void fmd_serd_hash_destroy(fmd_serd_hash_t *); +extern void fmd_serd_hash_apply(fmd_serd_hash_t *, fmd_serd_eng_f *, void *); + +extern fmd_serd_eng_t *fmd_serd_eng_insert(fmd_serd_hash_t *, + const char *, uint32_t, hrtime_t); + +extern fmd_serd_eng_t *fmd_serd_eng_lookup(fmd_serd_hash_t *, const char *); +extern void fmd_serd_eng_delete(fmd_serd_hash_t *, const char *); + +extern int fmd_serd_eng_record(fmd_serd_eng_t *, hrtime_t); +extern int fmd_serd_eng_fired(fmd_serd_eng_t *); +extern int fmd_serd_eng_empty(fmd_serd_eng_t *); + +extern void fmd_serd_eng_reset(fmd_serd_eng_t *); +extern void fmd_serd_eng_gc(fmd_serd_eng_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _FMD_SERD_H */ diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c new 
file mode 100644 index 000000000000..006e0ab99f47 --- /dev/null +++ b/cmd/zed/agents/zfs_agents.c @@ -0,0 +1,422 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com> + */ + +#include <libnvpair.h> +#include <libzfs.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <sys/list.h> +#include <sys/time.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> +#include <sys/fm/protocol.h> +#include <sys/fm/fs/zfs.h> +#include <pthread.h> +#include <unistd.h> + +#include "zfs_agents.h" +#include "fmd_api.h" +#include "../zed_log.h" + +/* + * agent dispatch code + */ + +static pthread_mutex_t agent_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t agent_cond = PTHREAD_COND_INITIALIZER; +static list_t agent_events; /* list of pending events */ +static int agent_exiting; + +typedef struct agent_event { + char ae_class[64]; + char ae_subclass[32]; + nvlist_t *ae_nvl; + list_node_t ae_node; +} agent_event_t; + +pthread_t g_agents_tid; + +libzfs_handle_t *g_zfs_hdl; + +/* guid search data */ +typedef enum device_type { + DEVICE_TYPE_L2ARC, /* l2arc device */ + DEVICE_TYPE_SPARE, /* spare device */ + DEVICE_TYPE_PRIMARY /* any primary pool storage device */ +} device_type_t; + +typedef struct guid_search { + uint64_t gs_pool_guid; + uint64_t gs_vdev_guid; + char *gs_devid; + device_type_t gs_vdev_type; + uint64_t gs_vdev_expandtime; /* vdev expansion time */ +} guid_search_t; + +/* + * Walks the vdev tree recursively looking for a matching devid. 
+ * Returns B_TRUE as soon as a matching device is found, B_FALSE otherwise. + */ +static boolean_t +zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) +{ + guid_search_t *gsp = arg; + char *path = NULL; + uint_t c, children; + nvlist_t **child; + + /* + * First iterate over any children. + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { + gsp->gs_vdev_type = DEVICE_TYPE_PRIMARY; + return (B_TRUE); + } + } + } + /* + * Iterate over any spares and cache devices + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { + gsp->gs_vdev_type = DEVICE_TYPE_L2ARC; + return (B_TRUE); + } + } + } + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { + gsp->gs_vdev_type = DEVICE_TYPE_SPARE; + return (B_TRUE); + } + } + } + /* + * On a devid match, grab the vdev guid and expansion time, if any. 
+ */ + if (gsp->gs_devid != NULL && + (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) && + (strcmp(gsp->gs_devid, path) == 0)) { + (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, + &gsp->gs_vdev_guid); + (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME, + &gsp->gs_vdev_expandtime); + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg) +{ + guid_search_t *gsp = arg; + nvlist_t *config, *nvl; + + /* + * For each vdev in this pool, look for a match by devid + */ + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvl) == 0) { + (void) zfs_agent_iter_vdev(zhp, nvl, gsp); + } + } + /* + * if a match was found then grab the pool guid + */ + if (gsp->gs_vdev_guid) { + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &gsp->gs_pool_guid); + } + + zpool_close(zhp); + return (gsp->gs_vdev_guid != 0); +} + +void +zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + agent_event_t *event; + + if (subclass == NULL) + subclass = ""; + + event = malloc(sizeof (agent_event_t)); + if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) { + if (event) + free(event); + return; + } + + if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) { + class = EC_ZFS; + subclass = ESC_ZFS_VDEV_CHECK; + } + + /* + * On ZFS on Linux, we don't get the expected FM_RESOURCE_REMOVED + * ereport from vdev_disk layer after a hot unplug. Fortunately we + * get a EC_DEV_REMOVE from our disk monitor and it is a suitable + * proxy so we remap it here for the benefit of the diagnosis engine. 
+ */ + if ((strcmp(class, EC_DEV_REMOVE) == 0) && + (strcmp(subclass, ESC_DISK) == 0) && + (nvlist_exists(nvl, ZFS_EV_VDEV_GUID) || + nvlist_exists(nvl, DEV_IDENTIFIER))) { + nvlist_t *payload = event->ae_nvl; + struct timeval tv; + int64_t tod[2]; + uint64_t pool_guid = 0, vdev_guid = 0; + guid_search_t search = { 0 }; + device_type_t devtype = DEVICE_TYPE_PRIMARY; + + class = "resource.fs.zfs.removed"; + subclass = ""; + + (void) nvlist_add_string(payload, FM_CLASS, class); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid); + + (void) gettimeofday(&tv, NULL); + tod[0] = tv.tv_sec; + tod[1] = tv.tv_usec; + (void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2); + + /* + * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or + * ZFS_EV_POOL_GUID may be missing so find them. + */ + (void) nvlist_lookup_string(nvl, DEV_IDENTIFIER, + &search.gs_devid); + (void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search); + pool_guid = search.gs_pool_guid; + vdev_guid = search.gs_vdev_guid; + devtype = search.gs_vdev_type; + + /* + * We want to avoid reporting "remove" events coming from + * libudev for VDEVs which were expanded recently (10s) and + * avoid activating spares in response to partitions being + * deleted and created in rapid succession. 
+ */ + if (search.gs_vdev_expandtime != 0 && + search.gs_vdev_expandtime + 10 > tv.tv_sec) { + zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' " + "for recently expanded device '%s'", EC_DEV_REMOVE, + search.gs_devid); + goto out; + } + + (void) nvlist_add_uint64(payload, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid); + (void) nvlist_add_uint64(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid); + switch (devtype) { + case DEVICE_TYPE_L2ARC: + (void) nvlist_add_string(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + VDEV_TYPE_L2CACHE); + break; + case DEVICE_TYPE_SPARE: + (void) nvlist_add_string(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_SPARE); + break; + case DEVICE_TYPE_PRIMARY: + (void) nvlist_add_string(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_DISK); + break; + } + + zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'", + EC_DEV_REMOVE, class); + } + + (void) strlcpy(event->ae_class, class, sizeof (event->ae_class)); + (void) strlcpy(event->ae_subclass, subclass, + sizeof (event->ae_subclass)); + + (void) pthread_mutex_lock(&agent_lock); + list_insert_tail(&agent_events, event); + (void) pthread_mutex_unlock(&agent_lock); + +out: + (void) pthread_cond_signal(&agent_cond); +} + +static void +zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl) +{ + /* + * The diagnosis engine subscribes to the following events. + * On illumos these subscriptions reside in: + * /usr/lib/fm/fmd/plugins/zfs-diagnosis.conf + */ + if (strstr(class, "ereport.fs.zfs.") != NULL || + strstr(class, "resource.fs.zfs.") != NULL || + strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 || + strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 || + strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) { + fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class); + } + + /* + * The retire agent subscribes to the following events. 
+ * On illumos these subscriptions reside in: + * /usr/lib/fm/fmd/plugins/zfs-retire.conf + * + * NOTE: faults events come directly from our diagnosis engine + * and will not pass through the zfs kernel module. + */ + if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 || + strcmp(class, "resource.fs.zfs.removed") == 0 || + strcmp(class, "resource.fs.zfs.statechange") == 0 || + strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { + fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class); + } + + /* + * The SLM module only consumes disk events and vdev check events + * + * NOTE: disk events come directly from disk monitor and will + * not pass through the zfs kernel module. + */ + if (strstr(class, "EC_dev_") != NULL || + strcmp(class, EC_ZFS) == 0) { + (void) zfs_slm_event(class, subclass, nvl); + } +} + +/* + * Events are consumed and dispatched from this thread + * An agent can also post an event so event list lock + * is not held when calling an agent. + * One event is consumed at a time. 
/*
 * Events are consumed and dispatched from this thread.  An agent can
 * also post an event, so the event-list lock is deliberately dropped
 * before calling into an agent.  One event is consumed at a time.
 * The thread exits (returning NULL) once zfs_agent_fini() sets
 * agent_exiting and signals the condition variable.
 */
static void *
zfs_agent_consumer_thread(void *arg)
{
	for (;;) {
		agent_event_t *event;

		(void) pthread_mutex_lock(&agent_lock);

		/* wait for an event to show up (or for a shutdown request) */
		while (!agent_exiting && list_is_empty(&agent_events))
			(void) pthread_cond_wait(&agent_cond, &agent_lock);

		if (agent_exiting) {
			(void) pthread_mutex_unlock(&agent_lock);
			zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: "
			    "exiting");
			return (NULL);
		}

		if ((event = (list_head(&agent_events))) != NULL) {
			list_remove(&agent_events, event);

			/* drop the lock: subscribers may post new events */
			(void) pthread_mutex_unlock(&agent_lock);

			/* dispatch to all event subscribers */
			zfs_agent_dispatch(event->ae_class, event->ae_subclass,
			    event->ae_nvl);

			nvlist_free(event->ae_nvl);
			free(event);
			continue;
		}

		(void) pthread_mutex_unlock(&agent_lock);
	}

	/* not reached: both loop exits return above */
	return (NULL);
}

/*
 * Start the agent subsystem: initialize the SLM module, register the
 * zfs-diagnosis and zfs-retire FMD-style modules, and spawn the event
 * consumer thread.  Any initialization failure is fatal (zed_log_die).
 */
void
zfs_agent_init(libzfs_handle_t *zfs_hdl)
{
	fmd_hdl_t *hdl;

	g_zfs_hdl = zfs_hdl;

	if (zfs_slm_init() != 0)
		zed_log_die("Failed to initialize zfs slm");
	zed_log_msg(LOG_INFO, "Add Agent: init");

	hdl = fmd_module_hdl("zfs-diagnosis");
	_zfs_diagnosis_init(hdl);
	if (!fmd_module_initialized(hdl))
		zed_log_die("Failed to initialize zfs diagnosis");

	hdl = fmd_module_hdl("zfs-retire");
	_zfs_retire_init(hdl);
	if (!fmd_module_initialized(hdl))
		zed_log_die("Failed to initialize zfs retire");

	list_create(&agent_events, sizeof (agent_event_t),
	    offsetof(struct agent_event, ae_node));

	if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread,
	    NULL) != 0) {
		list_destroy(&agent_events);
		zed_log_die("Failed to initialize agents");
	}
}
event); + nvlist_free(event->ae_nvl); + free(event); + } + + list_destroy(&agent_events); + + if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) { + _zfs_retire_fini(hdl); + fmd_hdl_unregister(hdl); + } + if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) { + _zfs_diagnosis_fini(hdl); + fmd_hdl_unregister(hdl); + } + + zed_log_msg(LOG_INFO, "Add Agent: fini"); + zfs_slm_fini(); + + g_zfs_hdl = NULL; +} diff --git a/cmd/zed/agents/zfs_agents.h b/cmd/zed/agents/zfs_agents.h new file mode 100644 index 000000000000..d1a459139b1e --- /dev/null +++ b/cmd/zed/agents/zfs_agents.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#ifndef ZFS_AGENTS_H +#define ZFS_AGENTS_H + +#include <libzfs.h> +#include <libnvpair.h> + + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Agent abstraction presented to ZED + */ +extern void zfs_agent_init(libzfs_handle_t *); +extern void zfs_agent_fini(void); +extern void zfs_agent_post_event(const char *, const char *, nvlist_t *); + +/* + * ZFS Sysevent Linkable Module (SLM) + */ +extern int zfs_slm_init(void); +extern void zfs_slm_fini(void); +extern void zfs_slm_event(const char *, const char *, nvlist_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* !ZFS_AGENTS_H */ diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c new file mode 100644 index 000000000000..0b27f6702ee8 --- /dev/null +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -0,0 +1,981 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#include <stddef.h> +#include <string.h> +#include <strings.h> +#include <libuutil.h> +#include <libzfs.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/fs/zfs.h> +#include <sys/fm/protocol.h> +#include <sys/fm/fs/zfs.h> + +#include "zfs_agents.h" +#include "fmd_api.h" + +/* + * Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This + * #define reserves enough space for two 64-bit hex values plus the length of + * the longest string. + */ +#define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum")) + +/* + * On-disk case structure. This must maintain backwards compatibility with + * previous versions of the DE. By default, any members appended to the end + * will be filled with zeros if they don't exist in a previous version. + */ +typedef struct zfs_case_data { + uint64_t zc_version; + uint64_t zc_ena; + uint64_t zc_pool_guid; + uint64_t zc_vdev_guid; + int zc_pool_state; + char zc_serd_checksum[MAX_SERDLEN]; + char zc_serd_io[MAX_SERDLEN]; + int zc_has_remove_timer; +} zfs_case_data_t; + +/* + * Time-of-day + */ +typedef struct er_timeval { + uint64_t ertv_sec; + uint64_t ertv_nsec; +} er_timeval_t; + +/* + * In-core case structure. 
+ */ +typedef struct zfs_case { + boolean_t zc_present; + uint32_t zc_version; + zfs_case_data_t zc_data; + fmd_case_t *zc_case; + uu_list_node_t zc_node; + id_t zc_remove_timer; + char *zc_fru; + er_timeval_t zc_when; +} zfs_case_t; + +#define CASE_DATA "data" +#define CASE_FRU "fru" +#define CASE_DATA_VERSION_INITIAL 1 +#define CASE_DATA_VERSION_SERD 2 + +typedef struct zfs_de_stats { + fmd_stat_t old_drops; + fmd_stat_t dev_drops; + fmd_stat_t vdev_drops; + fmd_stat_t import_drops; + fmd_stat_t resource_drops; +} zfs_de_stats_t; + +zfs_de_stats_t zfs_stats = { + { "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" }, + { "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"}, + { "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"}, + { "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" }, + { "resource_drops", FMD_TYPE_UINT64, "resource related ereports" } +}; + +static hrtime_t zfs_remove_timeout; + +uu_list_pool_t *zfs_case_pool; +uu_list_t *zfs_cases; + +#define ZFS_MAKE_RSRC(type) \ + FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type +#define ZFS_MAKE_EREPORT(type) \ + FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type + +/* + * Write out the persistent representation of an active case. + */ +static void +zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp) +{ + zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD; +} + +/* + * Read back the persistent representation of an active case. 
+ */ +static zfs_case_t * +zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + zfs_case_t *zcp; + + zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP); + zcp->zc_case = cp; + + fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data, + sizeof (zcp->zc_data)); + + if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) { + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); + return (NULL); + } + + /* + * fmd_buf_read() will have already zeroed out the remainder of the + * buffer, so we don't have to do anything special if the version + * doesn't include the SERD engine name. + */ + + if (zcp->zc_data.zc_has_remove_timer) + zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, + NULL, zfs_remove_timeout); + + uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool); + (void) uu_list_insert_before(zfs_cases, NULL, zcp); + + fmd_case_setspecific(hdl, cp, zcp); + + return (zcp); +} + +/* + * Iterate over any active cases. If any cases are associated with a pool or + * vdev which is no longer present on the system, close the associated case. + */ +static void +zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded) +{ + uint64_t vdev_guid = 0; + uint_t c, children; + nvlist_t **child; + zfs_case_t *zcp; + + (void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid); + + /* + * Mark any cases associated with this (pool, vdev) pair. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == vdev_guid) { + zcp->zc_present = B_TRUE; + zcp->zc_when = *loaded; + } + } + + /* + * Iterate over all children. 
+ */ + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } + + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } + + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } +} + +/*ARGSUSED*/ +static int +zfs_mark_pool(zpool_handle_t *zhp, void *unused) +{ + zfs_case_t *zcp; + uint64_t pool_guid; + uint64_t *tod; + er_timeval_t loaded = { 0 }; + nvlist_t *config, *vd; + uint_t nelem = 0; + int ret; + + pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL); + /* + * Mark any cases associated with just this pool. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == 0) + zcp->zc_present = B_TRUE; + } + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + zpool_close(zhp); + return (-1); + } + + (void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME, + &tod, &nelem); + if (nelem == 2) { + loaded.ertv_sec = tod[0]; + loaded.ertv_nsec = tod[1]; + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == 0) { + zcp->zc_when = loaded; + } + } + } + + ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd); + if (ret) { + zpool_close(zhp); + return (-1); + } + + zfs_mark_vdev(pool_guid, vd, &loaded); + + zpool_close(zhp); + + return (0); +} + +struct load_time_arg { + uint64_t lt_guid; + er_timeval_t *lt_time; + boolean_t lt_found; +}; + +static int +zpool_find_load_time(zpool_handle_t *zhp, void *arg) +{ + struct load_time_arg *lta = arg; + uint64_t pool_guid; + uint64_t 
*tod; + nvlist_t *config; + uint_t nelem; + + if (lta->lt_found) { + zpool_close(zhp); + return (0); + } + + pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL); + if (pool_guid != lta->lt_guid) { + zpool_close(zhp); + return (0); + } + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + zpool_close(zhp); + return (-1); + } + + if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME, + &tod, &nelem) == 0 && nelem == 2) { + lta->lt_found = B_TRUE; + lta->lt_time->ertv_sec = tod[0]; + lta->lt_time->ertv_nsec = tod[1]; + } + + zpool_close(zhp); + + return (0); +} + +static void +zfs_purge_cases(fmd_hdl_t *hdl) +{ + zfs_case_t *zcp; + uu_list_walk_t *walk; + libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + + /* + * There is no way to open a pool by GUID, or lookup a vdev by GUID. No + * matter what we do, we're going to have to stomach an O(vdevs * cases) + * algorithm. In reality, both quantities are likely so small that + * neither will matter. Given that iterating over pools is more + * expensive than iterating over the in-memory case list, we opt for a + * 'present' flag in each case that starts off cleared. We then iterate + * over all pools, marking those that are still present, and removing + * those that aren't found. + * + * Note that we could also construct an FMRI and rely on + * fmd_nvl_fmri_present(), but this would end up doing the same search. + */ + + /* + * Mark the cases as not present. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) + zcp->zc_present = B_FALSE; + + /* + * Iterate over all pools and mark the pools and vdevs found. If this + * fails (most probably because we're out of memory), then don't close + * any of the cases and we cannot be sure they are accurate. + */ + if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0) + return; + + /* + * Remove those cases which were not found. 
+ */ + walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); + while ((zcp = uu_list_walk_next(walk)) != NULL) { + if (!zcp->zc_present) + fmd_case_close(hdl, zcp->zc_case); + } + uu_list_walk_end(walk); +} + +/* + * Construct the name of a serd engine given the pool/vdev GUID and type (io or + * checksum). + */ +static void +zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid, + const char *type) +{ + (void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s", + (long long unsigned int)pool_guid, + (long long unsigned int)vdev_guid, type); +} + +/* + * Solve a given ZFS case. This first checks to make sure the diagnosis is + * still valid, as well as cleaning up any pending timer associated with the + * case. + */ +static void +zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname, + boolean_t checkunusable) +{ + nvlist_t *detector, *fault; + boolean_t serialize; + nvlist_t *fru = NULL; + fmd_hdl_debug(hdl, "solving fault '%s'", faultname); + + /* + * Construct the detector from the case data. The detector is in the + * ZFS scheme, and is either the pool or the vdev, depending on whether + * this is a vdev or pool fault. 
+ */ + detector = fmd_nvl_alloc(hdl, FMD_SLEEP); + + (void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0); + (void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS); + (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL, + zcp->zc_data.zc_pool_guid); + if (zcp->zc_data.zc_vdev_guid != 0) { + (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV, + zcp->zc_data.zc_vdev_guid); + } + + fault = fmd_nvl_create_fault(hdl, faultname, 100, detector, + fru, detector); + fmd_case_add_suspect(hdl, zcp->zc_case, fault); + + nvlist_free(fru); + + fmd_case_solve(hdl, zcp->zc_case); + + serialize = B_FALSE; + if (zcp->zc_data.zc_has_remove_timer) { + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_data.zc_has_remove_timer = 0; + serialize = B_TRUE; + } + if (serialize) + zfs_case_serialize(hdl, zcp); + + nvlist_free(detector); +} + +static boolean_t +timeval_earlier(er_timeval_t *a, er_timeval_t *b) +{ + return (a->ertv_sec < b->ertv_sec || + (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec)); +} + +/*ARGSUSED*/ +static void +zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when) +{ + int64_t *tod; + uint_t nelem; + + if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod, + &nelem) == 0 && nelem == 2) { + when->ertv_sec = tod[0]; + when->ertv_nsec = tod[1]; + } else { + when->ertv_sec = when->ertv_nsec = UINT64_MAX; + } +} + +/* + * Main fmd entry point. + */ +/*ARGSUSED*/ +static void +zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) +{ + zfs_case_t *zcp, *dcp; + int32_t pool_state; + uint64_t ena, pool_guid, vdev_guid; + er_timeval_t pool_load; + er_timeval_t er_when; + nvlist_t *detector; + boolean_t pool_found = B_FALSE; + boolean_t isresource; + char *type; + + /* + * We subscribe to notifications for vdev or pool removal. In these + * cases, there may be cases that no longer apply. Purge any cases + * that no longer apply. 
+ */ + if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) { + fmd_hdl_debug(hdl, "purging orphaned cases from %s", + strrchr(class, '.') + 1); + zfs_purge_cases(hdl); + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*"); + + if (isresource) { + /* + * For resources, we don't have a normal payload. + */ + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + &vdev_guid) != 0) + pool_state = SPA_LOAD_OPEN; + else + pool_state = SPA_LOAD_NONE; + detector = NULL; + } else { + (void) nvlist_lookup_nvlist(nvl, + FM_EREPORT_DETECTOR, &detector); + (void) nvlist_lookup_int32(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state); + } + + /* + * We also ignore all ereports generated during an import of a pool, + * since the only possible fault (.pool) would result in import failure, + * and hence no persistent fault. Some day we may want to do something + * with these ereports, so we continue generating them internally. + */ + if (pool_state == SPA_LOAD_IMPORT) { + zfs_stats.import_drops.fmds_value.ui64++; + fmd_hdl_debug(hdl, "ignoring '%s' during import", class); + return; + } + + /* + * Device I/O errors are ignored during pool open. + */ + if (pool_state == SPA_LOAD_OPEN && + (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) { + fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class); + zfs_stats.dev_drops.fmds_value.ui64++; + return; + } + + /* + * We ignore ereports for anything except disks and files. 
+ */ + if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + &type) == 0) { + if (strcmp(type, VDEV_TYPE_DISK) != 0 && + strcmp(type, VDEV_TYPE_FILE) != 0) { + zfs_stats.vdev_drops.fmds_value.ui64++; + return; + } + } + + /* + * Determine if this ereport corresponds to an open case. + * Each vdev or pool can have a single case. + */ + (void) nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid); + if (nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) + vdev_guid = 0; + if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0) + ena = 0; + + zfs_ereport_when(hdl, nvl, &er_when); + + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid) { + pool_found = B_TRUE; + pool_load = zcp->zc_when; + } + if (zcp->zc_data.zc_vdev_guid == vdev_guid) + break; + } + + /* + * Avoid falsely accusing a pool of being faulty. Do so by + * not replaying ereports that were generated prior to the + * current import. If the failure that generated them was + * transient because the device was actually removed but we + * didn't receive the normal asynchronous notification, we + * don't want to mark it as faulted and potentially panic. If + * there is still a problem we'd expect not to be able to + * import the pool, or that new ereports will be generated + * once the pool is used. + */ + if (pool_found && timeval_earlier(&er_when, &pool_load)) { + fmd_hdl_debug(hdl, "ignoring pool %llx, " + "ereport time %lld.%lld, pool load time = %lld.%lld", + pool_guid, er_when.ertv_sec, er_when.ertv_nsec, + pool_load.ertv_sec, pool_load.ertv_nsec); + zfs_stats.old_drops.fmds_value.ui64++; + return; + } + + if (!pool_found) { + /* + * Haven't yet seen this pool, but same situation + * may apply. 
+ */ + libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + struct load_time_arg la; + + la.lt_guid = pool_guid; + la.lt_time = &pool_load; + la.lt_found = B_FALSE; + + if (zhdl != NULL && + zpool_iter(zhdl, zpool_find_load_time, &la) == 0 && + la.lt_found == B_TRUE) { + pool_found = B_TRUE; + + if (timeval_earlier(&er_when, &pool_load)) { + fmd_hdl_debug(hdl, "ignoring pool %llx, " + "ereport time %lld.%lld, " + "pool load time = %lld.%lld", + pool_guid, er_when.ertv_sec, + er_when.ertv_nsec, pool_load.ertv_sec, + pool_load.ertv_nsec); + zfs_stats.old_drops.fmds_value.ui64++; + return; + } + } + } + + if (zcp == NULL) { + fmd_case_t *cs; + zfs_case_data_t data = { 0 }; + + /* + * If this is one of our 'fake' resource ereports, and there is + * no case open, simply discard it. + */ + if (isresource) { + zfs_stats.resource_drops.fmds_value.ui64++; + fmd_hdl_debug(hdl, "discarding '%s for vdev %llu", + class, vdev_guid); + return; + } + + /* + * Skip tracking some ereports + */ + if (strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 || + strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 || + strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) { + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + /* + * Open a new case. + */ + cs = fmd_case_open(hdl, NULL); + + fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'", + vdev_guid, class); + + /* + * Initialize the case buffer. To commonize code, we actually + * create the buffer with existing data, and then call + * zfs_case_unserialize() to instantiate the in-core structure. 
+ */ + fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t)); + + data.zc_version = CASE_DATA_VERSION_SERD; + data.zc_ena = ena; + data.zc_pool_guid = pool_guid; + data.zc_vdev_guid = vdev_guid; + data.zc_pool_state = (int)pool_state; + + fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data)); + + zcp = zfs_case_unserialize(hdl, cs); + assert(zcp != NULL); + if (pool_found) + zcp->zc_when = pool_load; + } + + if (isresource) { + fmd_hdl_debug(hdl, "resource event '%s'", class); + + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) { + /* + * The 'resource.fs.zfs.autoreplace' event indicates + * that the pool was loaded with the 'autoreplace' + * property set. In this case, any pending device + * failures should be ignored, as the asynchronous + * autoreplace handling will take care of them. + */ + fmd_case_close(hdl, zcp->zc_case); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) { + /* + * The 'resource.fs.zfs.removed' event indicates that + * device removal was detected, and the device was + * closed asynchronously. If this is the case, we + * assume that any recent I/O errors were due to the + * device removal, not any fault of the device itself. + * We reset the SERD engine, and cancel any pending + * timers. 
+ */ + if (zcp->zc_data.zc_has_remove_timer) { + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_data.zc_has_remove_timer = 0; + zfs_case_serialize(hdl, zcp); + } + if (zcp->zc_data.zc_serd_io[0] != '\0') + fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_serd_checksum[0] != '\0') + fmd_serd_reset(hdl, + zcp->zc_data.zc_serd_checksum); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) { + uint64_t state = 0; + + if (zcp != NULL && + nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 && + state == VDEV_STATE_HEALTHY) { + fmd_hdl_debug(hdl, "closing case after a " + "device statechange to healthy"); + fmd_case_close(hdl, zcp->zc_case); + } + } + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + /* + * Associate the ereport with this case. + */ + fmd_case_add_ereport(hdl, zcp->zc_case, ep); + + /* + * Don't do anything else if this case is already solved. + */ + if (fmd_case_solved(hdl, zcp->zc_case)) + return; + + fmd_hdl_debug(hdl, "error event '%s'", class); + + /* + * Determine if we should solve the case and generate a fault. We solve + * a case if: + * + * a. A pool failed to open (ereport.fs.zfs.pool) + * b. A device failed to open (ereport.fs.zfs.pool) while a pool + * was up and running. + * + * We may see a series of ereports associated with a pool open, all + * chained together by the same ENA. If the pool open succeeds, then + * we'll see no further ereports. To detect when a pool open has + * succeeded, we associate a timer with the event. When it expires, we + * close the case. + */ + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) { + /* + * Pool level fault. Before solving the case, go through and + * close any open device cases that may be pending. 
+ */ + for (dcp = uu_list_first(zfs_cases); dcp != NULL; + dcp = uu_list_next(zfs_cases, dcp)) { + if (dcp->zc_data.zc_pool_guid == + zcp->zc_data.zc_pool_guid && + dcp->zc_data.zc_vdev_guid != 0) + fmd_case_close(hdl, dcp->zc_case); + } + + zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) { + /* + * Pool level fault for reading the intent logs. + */ + zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) { + /* + * Device fault. + */ + zfs_case_solve(hdl, zcp, "fault.fs.zfs.device", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { + char *failmode = NULL; + boolean_t checkremove = B_FALSE; + + /* + * If this is a checksum or I/O error, then toss it into the + * appropriate SERD engine and check to see if it has fired. + * Ideally, we want to do something more sophisticated, + * (persistent errors for a single data block, etc). For now, + * a single SERD engine is sufficient. 
+ */ + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) { + if (zcp->zc_data.zc_serd_io[0] == '\0') { + zfs_serd_name(zcp->zc_data.zc_serd_io, + pool_guid, vdev_guid, "io"); + fmd_serd_create(hdl, zcp->zc_data.zc_serd_io, + fmd_prop_get_int32(hdl, "io_N"), + fmd_prop_get_int64(hdl, "io_T")); + zfs_case_serialize(hdl, zcp); + } + if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep)) + checkremove = B_TRUE; + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { + if (zcp->zc_data.zc_serd_checksum[0] == '\0') { + zfs_serd_name(zcp->zc_data.zc_serd_checksum, + pool_guid, vdev_guid, "checksum"); + fmd_serd_create(hdl, + zcp->zc_data.zc_serd_checksum, + fmd_prop_get_int32(hdl, "checksum_N"), + fmd_prop_get_int64(hdl, "checksum_T")); + zfs_case_serialize(hdl, zcp); + } + if (fmd_serd_record(hdl, + zcp->zc_data.zc_serd_checksum, ep)) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.vdev.checksum", B_FALSE); + } + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) && + (nvlist_lookup_string(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) && + failmode != NULL) { + if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE, + strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.io_failure_continue", + B_FALSE); + } else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT, + strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.io_failure_wait", B_FALSE); + } + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { +#ifndef __linux__ + /* This causes an unexpected fault diagnosis on linux */ + checkremove = B_TRUE; +#endif + } + + /* + * Because I/O errors may be due to device removal, we postpone + * any diagnosis until we're sure that we aren't about to + * receive a 'resource.fs.zfs.removed' event. 
+ */ + if (checkremove) { + if (zcp->zc_data.zc_has_remove_timer) + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL, + zfs_remove_timeout); + if (!zcp->zc_data.zc_has_remove_timer) { + zcp->zc_data.zc_has_remove_timer = 1; + zfs_case_serialize(hdl, zcp); + } + } + } +} + +/* + * The timeout is fired when we diagnosed an I/O error, and it was not due to + * device removal (which would cause the timeout to be cancelled). + */ +/* ARGSUSED */ +static void +zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data) +{ + zfs_case_t *zcp = data; + + if (id == zcp->zc_remove_timer) + zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io", B_FALSE); +} + +/* + * The specified case has been closed and any case-specific + * data structures should be deallocated. + */ +static void +zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) +{ + zfs_case_t *zcp = fmd_case_getspecific(hdl, cs); + + if (zcp->zc_data.zc_serd_checksum[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum); + if (zcp->zc_data.zc_serd_io[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_has_remove_timer) + fmd_timer_remove(hdl, zcp->zc_remove_timer); + + uu_list_remove(zfs_cases, zcp); + uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool); + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); +} + +/* + * We use the fmd gc entry point to look for old cases that no longer apply. + * This allows us to keep our set of case data small in a long running system. 
+ */ +static void +zfs_fm_gc(fmd_hdl_t *hdl) +{ + zfs_purge_cases(hdl); +} + +static const fmd_hdl_ops_t fmd_ops = { + zfs_fm_recv, /* fmdo_recv */ + zfs_fm_timeout, /* fmdo_timeout */ + zfs_fm_close, /* fmdo_close */ + NULL, /* fmdo_stats */ + zfs_fm_gc, /* fmdo_gc */ +}; + +static const fmd_prop_t fmd_props[] = { + { "checksum_N", FMD_TYPE_UINT32, "10" }, + { "checksum_T", FMD_TYPE_TIME, "10min" }, + { "io_N", FMD_TYPE_UINT32, "10" }, + { "io_T", FMD_TYPE_TIME, "10min" }, + { "remove_timeout", FMD_TYPE_TIME, "15sec" }, + { NULL, 0, NULL } +}; + +static const fmd_hdl_info_t fmd_info = { + "ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props +}; + +void +_zfs_diagnosis_init(fmd_hdl_t *hdl) +{ + libzfs_handle_t *zhdl; + + if ((zhdl = libzfs_init()) == NULL) + return; + + if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool", + sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node), + NULL, UU_LIST_POOL_DEBUG)) == NULL) { + libzfs_fini(zhdl); + return; + } + + if ((zfs_cases = uu_list_create(zfs_case_pool, NULL, + UU_LIST_DEBUG)) == NULL) { + uu_list_pool_destroy(zfs_case_pool); + libzfs_fini(zhdl); + return; + } + + if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { + uu_list_destroy(zfs_cases); + uu_list_pool_destroy(zfs_case_pool); + libzfs_fini(zhdl); + return; + } + + fmd_hdl_setspecific(hdl, zhdl); + + (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) / + sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats); + + zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout"); +} + +void +_zfs_diagnosis_fini(fmd_hdl_t *hdl) +{ + zfs_case_t *zcp; + uu_list_walk_t *walk; + libzfs_handle_t *zhdl; + + /* + * Remove all active cases. 
+ */ + walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); + while ((zcp = uu_list_walk_next(walk)) != NULL) { + fmd_hdl_debug(hdl, "removing case ena %llu", + (long long unsigned)zcp->zc_data.zc_ena); + uu_list_remove(zfs_cases, zcp); + uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool); + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); + } + uu_list_walk_end(walk); + + uu_list_destroy(zfs_cases); + uu_list_pool_destroy(zfs_case_pool); + + zhdl = fmd_hdl_getspecific(hdl); + libzfs_fini(zhdl); +} diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c new file mode 100644 index 000000000000..8d0a3b420086 --- /dev/null +++ b/cmd/zed/agents/zfs_mod.c @@ -0,0 +1,956 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2016, 2017, Intel Corporation. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + */ + +/* + * ZFS syseventd module. 
+ * + * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c + * + * The purpose of this module is to identify when devices are added to the + * system, and appropriately online or replace the affected vdevs. + * + * When a device is added to the system: + * + * 1. Search for any vdevs whose devid matches that of the newly added + * device. + * + * 2. If no vdevs are found, then search for any vdevs whose udev path + * matches that of the new device. + * + * 3. If no vdevs match by either method, then ignore the event. + * + * 4. Attempt to online the device with a flag to indicate that it should + * be unspared when resilvering completes. If this succeeds, then the + * same device was inserted and we should continue normally. + * + * 5. If the pool does not have the 'autoreplace' property set, attempt to + * online the device again without the unspare flag, which will + * generate a FMA fault. + * + * 6. If the pool has the 'autoreplace' property set, and the matching vdev + * is a whole disk, then label the new disk and attempt a 'zpool + * replace'. + * + * The module responds to EC_DEV_ADD events. The special ESC_ZFS_VDEV_CHECK + * event indicates that a device failed to open during pool load, but the + * autoreplace property was set. In this case, we deferred the associated + * FMA fault until our module had a chance to process the autoreplace logic. + * If the device could not be replaced, then the second online attempt will + * trigger the FMA fault that we skipped earlier. 
+ * + * ZFS on Linux porting notes: + * Linux udev provides a disk insert for both the disk and the partition + * + */ + +#include <ctype.h> +#include <fcntl.h> +#include <libnvpair.h> +#include <libzfs.h> +#include <libzutil.h> +#include <limits.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> +#include <sys/list.h> +#include <sys/sunddi.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> +#include <thread_pool.h> +#include <pthread.h> +#include <unistd.h> +#include <errno.h> +#include "zfs_agents.h" +#include "../zed_log.h" + +#define DEV_BYID_PATH "/dev/disk/by-id/" +#define DEV_BYPATH_PATH "/dev/disk/by-path/" +#define DEV_BYVDEV_PATH "/dev/disk/by-vdev/" + +typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t); + +libzfs_handle_t *g_zfshdl; +list_t g_pool_list; /* list of unavailable pools at initialization */ +list_t g_device_list; /* list of disks with asynchronous label request */ +tpool_t *g_tpool; +boolean_t g_enumeration_done; +pthread_t g_zfs_tid; /* zfs_enum_pools() thread */ + +typedef struct unavailpool { + zpool_handle_t *uap_zhp; + list_node_t uap_node; +} unavailpool_t; + +typedef struct pendingdev { + char pd_physpath[128]; + list_node_t pd_node; +} pendingdev_t; + +static int +zfs_toplevel_state(zpool_handle_t *zhp) +{ + nvlist_t *nvroot; + vdev_stat_t *vs; + unsigned int c; + + verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + return (vs->vs_state); +} + +static int +zfs_unavail_pool(zpool_handle_t *zhp, void *data) +{ + zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)", + zpool_get_name(zhp), (int)zfs_toplevel_state(zhp)); + + if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) { + unavailpool_t *uap; + uap = malloc(sizeof (unavailpool_t)); + uap->uap_zhp = zhp; + list_insert_tail((list_t *)data, 
uap); + } else { + zpool_close(zhp); + } + return (0); +} + +/* + * Two stage replace on Linux + * since we get disk notifications + * we can wait for partitioned disk slice to show up! + * + * First stage tags the disk, initiates async partitioning, and returns + * Second stage finds the tag and proceeds to ZFS labeling/replace + * + * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach + * + * 1. physical match with no fs, no partition + * tag it top, partition disk + * + * 2. physical match again, see partition and tag + * + */ + +/* + * The device associated with the given vdev (either by devid or physical path) + * has been added to the system. If 'isdisk' is set, then we only attempt a + * replacement if it's a whole disk. This also implies that we should label the + * disk first. + * + * First, we attempt to online the device (making sure to undo any spare + * operation when finished). If this succeeds, then we're done. If it fails, + * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, + * but that the label was not what we expected. If the 'autoreplace' property + * is enabled, then we relabel the disk (if specified), and attempt a 'zpool + * replace'. If the online is successful, but the new state is something else + * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of + * race, and we should avoid attempting to relabel the disk. 
 *
 * Also can arrive here from a ESC_ZFS_VDEV_CHECK event
 */
static void
zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
{
	char *path;
	vdev_state_t newstate;
	nvlist_t *nvroot, *newvd;
	pendingdev_t *device;
	uint64_t wholedisk = 0ULL;
	uint64_t offline = 0ULL;
	uint64_t guid = 0ULL;
	char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
	char rawpath[PATH_MAX], fullpath[PATH_MAX];
	char devpath[PATH_MAX];
	int ret;
	boolean_t is_dm = B_FALSE;
	boolean_t is_sd = B_FALSE;
	uint_t c;
	vdev_stat_t *vs;

	/* A vdev without a path can't be matched to a device node */
	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
		return;

	/* Skip healthy disks */
	verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
	    (uint64_t **)&vs, &c) == 0);
	if (vs->vs_state == VDEV_STATE_HEALTHY) {
		zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.",
		    __func__, path);
		return;
	}

	/* Optional attributes: each stays at its default if not present */
	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
	    &enc_sysfs_path);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);

	if (offline)
		return; /* don't intervene if it was taken offline */

	is_dm = zfs_dev_is_dm(path);
	zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
	    " wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path,
	    physpath ? physpath : "NULL", wholedisk, is_dm ? "is" : "not",
	    (long long unsigned int)guid);

	/*
	 * The VDEV guid is preferred for identification (gets passed in path)
	 */
	if (guid != 0) {
		(void) snprintf(fullpath, sizeof (fullpath), "%llu",
		    (long long unsigned int)guid);
	} else {
		/*
		 * otherwise use path sans partition suffix for whole disks
		 */
		(void) strlcpy(fullpath, path, sizeof (fullpath));
		if (wholedisk) {
			char *spath = zfs_strip_partition(fullpath);
			if (!spath) {
				zed_log_msg(LOG_INFO, "%s: Can't alloc",
				    __func__);
				return;
			}

			(void) strlcpy(fullpath, spath, sizeof (fullpath));
			free(spath);
		}
	}

	/*
	 * Attempt to online the device.  If it comes back HEALTHY or
	 * DEGRADED we are done; anything else falls through to the
	 * autoreplace path below.
	 */
	if (zpool_vdev_online(zhp, fullpath,
	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
	    (newstate == VDEV_STATE_HEALTHY ||
	    newstate == VDEV_STATE_DEGRADED)) {
		zed_log_msg(LOG_INFO, "  zpool_vdev_online: vdev %s is %s",
		    fullpath, (newstate == VDEV_STATE_HEALTHY) ?
		    "HEALTHY" : "DEGRADED");
		return;
	}

	/*
	 * vdev_id alias rule for using scsi_debug devices (FMA automated
	 * testing)
	 */
	if (physpath != NULL && strcmp("scsidebug", physpath) == 0)
		is_sd = B_TRUE;

	/*
	 * If the pool doesn't have the autoreplace property set, then use
	 * vdev online to trigger a FMA fault by posting an ereport.
	 */
	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
	    !(wholedisk || is_dm) || (physpath == NULL)) {
		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
		    &newstate);
		zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
		    "not a whole disk for '%s'", fullpath);
		return;
	}

	/*
	 * Convert physical path into its current device node.  Rawpath
	 * needs to be /dev/disk/by-vdev for a scsi_debug device since
	 * /dev/disk/by-path will not be present.
	 */
	(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
	    is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);

	if (realpath(rawpath, devpath) == NULL && !is_dm) {
		zed_log_msg(LOG_INFO, "  realpath: %s failed (%s)",
		    rawpath, strerror(errno));

		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
		    &newstate);

		zed_log_msg(LOG_INFO, "  zpool_vdev_online: %s FORCEFAULT (%s)",
		    fullpath, libzfs_error_description(g_zfshdl));
		return;
	}

	/* Only autoreplace bad disks */
	if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
	    (vs->vs_state != VDEV_STATE_FAULTED) &&
	    (vs->vs_state != VDEV_STATE_CANT_OPEN)) {
		return;
	}

	/*
	 * NOTE(review): lookup result deliberately ignored; new_devid stays
	 * NULL if zfs_iter_vdev did not stash it.  Presumably it is always
	 * present for add events — verify, since NULL would later be passed
	 * to snprintf/nvlist_add_string below.
	 */
	nvlist_lookup_string(vdev, "new_devid", &new_devid);

	if (is_dm) {
		/* Don't label device mapper or multipath disks. */
	} else if (!labeled) {
		/*
		 * we're auto-replacing a raw disk, so label it first
		 */
		char *leafname;

		/*
		 * If this is a request to label a whole disk, then attempt to
		 * write out the label.  Before we can label the disk, we need
		 * to map the physical string that was matched on to the under
		 * lying device node.
		 *
		 * If any part of this process fails, then do a force online
		 * to trigger a ZFS fault for the device (and any hot spare
		 * replacement).
		 */
		leafname = strrchr(devpath, '/') + 1;

		/*
		 * If this is a request to label a whole disk, then attempt to
		 * write out the label.
		 */
		if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) {
			zed_log_msg(LOG_INFO, "  zpool_label_disk: could not "
			    "label '%s' (%s)", leafname,
			    libzfs_error_description(g_zfshdl));

			(void) zpool_vdev_online(zhp, fullpath,
			    ZFS_ONLINE_FORCEFAULT, &newstate);
			return;
		}

		/*
		 * The disk labeling is asynchronous on Linux.  Just record
		 * this label request and return as there will be another
		 * disk add event for the partition after the labeling is
		 * completed.
		 */
		/* NOTE(review): malloc result unchecked; OOM would crash */
		device = malloc(sizeof (pendingdev_t));
		(void) strlcpy(device->pd_physpath, physpath,
		    sizeof (device->pd_physpath));
		list_insert_tail(&g_device_list, device);

		zed_log_msg(LOG_INFO, "  zpool_label_disk: async '%s' (%llu)",
		    leafname, (u_longlong_t)guid);

		return;	/* resumes at EC_DEV_ADD.ESC_DISK for partition */

	} else /* labeled */ {
		boolean_t found = B_FALSE;
		/*
		 * match up with request above to label the disk
		 */
		for (device = list_head(&g_device_list); device != NULL;
		    device = list_next(&g_device_list, device)) {
			if (strcmp(physpath, device->pd_physpath) == 0) {
				list_remove(&g_device_list, device);
				free(device);
				found = B_TRUE;
				break;
			}
			zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s",
			    physpath, device->pd_physpath);
		}
		if (!found) {
			/* unexpected partition slice encountered */
			zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
			    fullpath);
			(void) zpool_vdev_online(zhp, fullpath,
			    ZFS_ONLINE_FORCEFAULT, &newstate);
			return;
		}

		zed_log_msg(LOG_INFO, "  zpool_label_disk: resume '%s' (%llu)",
		    physpath, (u_longlong_t)guid);

		(void) snprintf(devpath, sizeof (devpath), "%s%s",
		    DEV_BYID_PATH, new_devid);
	}

	/*
	 * Construct the root vdev to pass to zpool_vdev_attach().  While adding
	 * the entire vdev structure is harmless, we construct a reduced set of
	 * path/physpath/wholedisk to keep it simple.
	 */
	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
		return;
	}
	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory");
		nvlist_free(nvroot);
		return;
	}

	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
	    nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 ||
	    (physpath != NULL && nvlist_add_string(newvd,
	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
	    (enc_sysfs_path != NULL && nvlist_add_string(newvd,
	    ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) ||
	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
	    1) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs");
		nvlist_free(newvd);
		nvlist_free(nvroot);
		return;
	}

	/* nvroot now holds its own reference to the child nvlist */
	nvlist_free(newvd);

	/*
	 * Wait for udev to verify the links exist, then auto-replace
	 * the leaf disk at same physical location.
	 */
	if (zpool_label_disk_wait(path, 3000) != 0) {
		zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
		    "disk %s is missing", path);
		nvlist_free(nvroot);
		return;
	}

	ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE);

	zed_log_msg(LOG_INFO, "  zpool_vdev_replace: %s with %s (%s)",
	    fullpath, path, (ret == 0) ? "no errors" :
	    libzfs_error_description(g_zfshdl));

	nvlist_free(nvroot);
}

/*
 * Utility functions to find a vdev matching given criteria.
 */
typedef struct dev_data {
	const char		*dd_compare;	/* value to match (NULL = match all) */
	const char		*dd_prop;	/* nvlist property to compare against */
	zfs_process_func_t	dd_func;	/* applied to each matching vdev */
	boolean_t		dd_found;	/* set once a vdev has matched */
	boolean_t		dd_islabeled;	/* passed through to dd_func */
	uint64_t		dd_pool_guid;	/* restrict search to this pool (0 = any) */
	uint64_t		dd_vdev_guid;	/* match by guid when nonzero */
	const char		*dd_new_devid;	/* stashed on the matched vdev nvlist */
} dev_data_t;

/*
 * Recursively walk a vdev tree (children, spares, l2cache) looking for the
 * vdev described by 'data' (a dev_data_t); on a match, invoke dd_func on it.
 * With no guid and no dd_compare, dd_func is applied to every vdev visited.
 */
static void
zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
{
	dev_data_t *dp = data;
	char *path = NULL;
	uint_t c, children;
	nvlist_t **child;

	/*
	 * First iterate over any children.
	 */
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_iter_vdev(zhp, child[c], data);
	}

	/*
	 * Iterate over any spares and cache devices
	 */
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_iter_vdev(zhp, child[c], data);
	}
	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++)
			zfs_iter_vdev(zhp, child[c], data);
	}

	/* once a vdev was matched and processed there is nothing left to do */
	if (dp->dd_found)
		return;

	/*
	 * Match by GUID if available otherwise fallback to devid or physical
	 */
	if (dp->dd_vdev_guid != 0) {
		uint64_t guid;

		if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
		    &guid) != 0 || guid != dp->dd_vdev_guid) {
			return;
		}
		zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched on %llu", guid);
		dp->dd_found = B_TRUE;

	} else if (dp->dd_compare != NULL) {
		/*
		 * NOTE: On Linux there is an event for partition, so unlike
		 * illumos, substring matching is not required to accommodate
		 * the partition suffix. An exact match will be present in
		 * the dp->dd_compare value.
		 */
		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
		    strcmp(dp->dd_compare, path) != 0)
			return;

		zed_log_msg(LOG_INFO, "  zfs_iter_vdev: matched %s on %s",
		    dp->dd_prop, path);
		dp->dd_found = B_TRUE;

		/* pass the new devid for use by replacing code */
		if (dp->dd_new_devid != NULL) {
			(void) nvlist_add_string(nvl, "new_devid",
			    dp->dd_new_devid);
		}
	}

	/* reached on a match, or unconditionally in match-all mode */
	(dp->dd_func)(zhp, nvl, dp->dd_islabeled);
}

/*
 * Thread-pool task: mount/share the datasets of a pool that was unavailable
 * at startup, then release the handle and the queued unavailpool_t entry.
 */
static void
zfs_enable_ds(void *arg)
{
	unavailpool_t *pool = (unavailpool_t *)arg;

	/* best-effort: mount errors are intentionally ignored here */
	(void) zpool_enable_datasets(pool->uap_zhp, NULL, 0);
	zpool_close(pool->uap_zhp);
	free(pool);
}

static int
zfs_iter_pool(zpool_handle_t *zhp, void *data)
{
	nvlist_t *config, *nvl;
	dev_data_t *dp = data;
	uint64_t pool_guid;
	unavailpool_t *pool;

	zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)",
	    zpool_get_name(zhp), dp->dd_vdev_guid ? "GUID" : dp->dd_prop);

	/*
	 * For each vdev in this pool, look for a match to apply dd_func
	 */
	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
		if (dp->dd_pool_guid == 0 ||
		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
			(void) nvlist_lookup_nvlist(config,
			    ZPOOL_CONFIG_VDEV_TREE, &nvl);
			zfs_iter_vdev(zhp, nvl, data);
		}
	}

	/*
	 * if this pool was originally unavailable,
	 * then enable its datasets asynchronously
	 */
	if (g_enumeration_done) {
		for (pool = list_head(&g_pool_list